{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9274431057563588, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00428380187416332, "grad_norm": 1.7204455338427447, "learning_rate": 9.635974304068523e-08, "loss": 0.310353684425354, "step": 10 }, { "epoch": 0.00856760374832664, "grad_norm": 1.5542088461890025, "learning_rate": 2.0342612419700217e-07, "loss": 0.3165663003921509, "step": 20 }, { "epoch": 0.01285140562248996, "grad_norm": 1.2986368981078442, "learning_rate": 3.1049250535331905e-07, "loss": 0.2931360721588135, "step": 30 }, { "epoch": 0.01713520749665328, "grad_norm": 1.3372226018458075, "learning_rate": 4.1755888650963603e-07, "loss": 0.29114551544189454, "step": 40 }, { "epoch": 0.0214190093708166, "grad_norm": 1.065879980657948, "learning_rate": 5.24625267665953e-07, "loss": 0.2506369352340698, "step": 50 }, { "epoch": 0.02570281124497992, "grad_norm": 0.6781359530445974, "learning_rate": 6.3169164882227e-07, "loss": 0.22955031394958497, "step": 60 }, { "epoch": 0.02998661311914324, "grad_norm": 0.5249895834183264, "learning_rate": 7.387580299785868e-07, "loss": 0.22067618370056152, "step": 70 }, { "epoch": 0.03427041499330656, "grad_norm": 0.5375825140839746, "learning_rate": 8.458244111349037e-07, "loss": 0.21640052795410156, "step": 80 }, { "epoch": 0.03855421686746988, "grad_norm": 0.5549702260654302, "learning_rate": 9.528907922912206e-07, "loss": 0.22633485794067382, "step": 90 }, { "epoch": 0.0428380187416332, "grad_norm": 0.43849850542690183, "learning_rate": 1.0599571734475375e-06, "loss": 0.20759968757629393, "step": 100 }, { "epoch": 0.04712182061579652, "grad_norm": 0.46359825184269493, "learning_rate": 1.1670235546038546e-06, "loss": 0.1973546862602234, "step": 110 }, { "epoch": 0.05140562248995984, "grad_norm": 0.41875998061321557, "learning_rate": 1.2740899357601712e-06, "loss": 0.19023516178131103, "step": 120 }, { "epoch": 0.055689424364123156, "grad_norm": 0.5127942743282548, "learning_rate": 1.3811563169164883e-06, "loss": 0.2127223491668701, "step": 130 }, { "epoch": 0.05997322623828648, "grad_norm": 0.46830156678706125, "learning_rate": 1.4882226980728054e-06, "loss": 0.1934453845024109, "step": 140 }, { "epoch": 0.0642570281124498, "grad_norm": 0.508710707179685, "learning_rate": 1.5952890792291223e-06, "loss": 0.2092526912689209, "step": 150 }, { "epoch": 0.06854082998661312, "grad_norm": 0.5421780654693655, "learning_rate": 1.7023554603854392e-06, "loss": 0.2070756435394287, "step": 160 }, { "epoch": 0.07282463186077644, "grad_norm": 0.493152534196984, "learning_rate": 1.809421841541756e-06, "loss": 0.19875586032867432, "step": 170 }, { "epoch": 0.07710843373493977, "grad_norm": 0.4980754232181657, "learning_rate": 1.916488222698073e-06, "loss": 0.18669115304946898, "step": 180 }, { "epoch": 0.08139223560910308, "grad_norm": 0.5304243381141518, "learning_rate": 2.02355460385439e-06, "loss": 0.2146810531616211, "step": 190 }, { "epoch": 0.0856760374832664, "grad_norm": 0.5500032540817943, "learning_rate": 2.1306209850107067e-06, "loss": 0.19487186670303344, "step": 200 }, { "epoch": 0.08995983935742972, "grad_norm": 0.42930678514550324, "learning_rate": 2.2376873661670238e-06, "loss": 0.18126411437988282, "step": 210 }, { "epoch": 0.09424364123159304, "grad_norm": 0.5711883922291429, "learning_rate": 2.3447537473233404e-06, "loss": 0.2076016664505005, "step": 220 }, { "epoch": 0.09852744310575635, "grad_norm": 0.6678390082859929, "learning_rate": 2.4518201284796575e-06, "loss": 0.20209894180297852, "step": 230 }, { "epoch": 0.10281124497991968, "grad_norm": 0.4835994184206877, "learning_rate": 2.558886509635974e-06, "loss": 0.16395035982131959, "step": 240 }, { "epoch": 0.107095046854083, "grad_norm": 0.42091478620818606, "learning_rate": 2.6659528907922917e-06, "loss": 0.17233937978744507, "step": 250 }, { "epoch": 0.11137884872824631, "grad_norm": 0.5320893808200788, "learning_rate": 2.7730192719486084e-06, "loss": 0.18311020135879516, "step": 260 }, { "epoch": 0.11566265060240964, "grad_norm": 0.5257058584608517, "learning_rate": 2.8800856531049255e-06, "loss": 0.20057764053344726, "step": 270 }, { "epoch": 0.11994645247657296, "grad_norm": 0.5293308935757324, "learning_rate": 2.987152034261242e-06, "loss": 0.1837336540222168, "step": 280 }, { "epoch": 0.12423025435073627, "grad_norm": 0.46642029540934604, "learning_rate": 3.0942184154175592e-06, "loss": 0.19081385135650636, "step": 290 }, { "epoch": 0.1285140562248996, "grad_norm": 0.5628183880631954, "learning_rate": 3.201284796573876e-06, "loss": 0.17274467945098876, "step": 300 }, { "epoch": 0.13279785809906292, "grad_norm": 0.48916829855974603, "learning_rate": 3.308351177730193e-06, "loss": 0.18039458990097046, "step": 310 }, { "epoch": 0.13708165997322624, "grad_norm": 0.5298792138248726, "learning_rate": 3.41541755888651e-06, "loss": 0.1898115634918213, "step": 320 }, { "epoch": 0.14136546184738955, "grad_norm": 0.439555530924186, "learning_rate": 3.5224839400428268e-06, "loss": 0.17530070543289183, "step": 330 }, { "epoch": 0.14564926372155287, "grad_norm": 0.5117824224937999, "learning_rate": 3.629550321199144e-06, "loss": 0.17437742948532103, "step": 340 }, { "epoch": 0.1499330655957162, "grad_norm": 0.4753694102031763, "learning_rate": 3.7366167023554605e-06, "loss": 0.17850689888000487, "step": 350 }, { "epoch": 0.15421686746987953, "grad_norm": 0.6306563088828507, "learning_rate": 3.843683083511778e-06, "loss": 0.18741222620010375, "step": 360 }, { "epoch": 0.15850066934404283, "grad_norm": 0.4694179327929818, "learning_rate": 3.950749464668095e-06, "loss": 0.17026090621948242, "step": 370 }, { "epoch": 0.16278447121820616, "grad_norm": 0.5048197989896139, "learning_rate": 4.057815845824411e-06, "loss": 0.1726588487625122, "step": 380 }, { "epoch": 0.1670682730923695, "grad_norm": 0.524700358081214, "learning_rate": 4.164882226980728e-06, "loss": 0.18944069147109985, "step": 390 }, { "epoch": 0.1713520749665328, "grad_norm": 0.4571670229694066, "learning_rate": 4.2719486081370455e-06, "loss": 0.16420159339904786, "step": 400 }, { "epoch": 0.17563587684069612, "grad_norm": 0.46820018814554304, "learning_rate": 4.379014989293362e-06, "loss": 0.19183117151260376, "step": 410 }, { "epoch": 0.17991967871485945, "grad_norm": 0.46894253012471776, "learning_rate": 4.486081370449679e-06, "loss": 0.17184211015701295, "step": 420 }, { "epoch": 0.18420348058902275, "grad_norm": 0.46858595995063135, "learning_rate": 4.593147751605996e-06, "loss": 0.17618422508239745, "step": 430 }, { "epoch": 0.18848728246318608, "grad_norm": 0.5091947698167847, "learning_rate": 4.700214132762313e-06, "loss": 0.18246437311172486, "step": 440 }, { "epoch": 0.1927710843373494, "grad_norm": 0.5203679422298269, "learning_rate": 4.807280513918631e-06, "loss": 0.16799516677856446, "step": 450 }, { "epoch": 0.1970548862115127, "grad_norm": 0.4306784639956151, "learning_rate": 4.914346895074946e-06, "loss": 0.1661084771156311, "step": 460 }, { "epoch": 0.20133868808567604, "grad_norm": 0.4604689622728843, "learning_rate": 5.021413276231264e-06, "loss": 0.17491416931152343, "step": 470 }, { "epoch": 0.20562248995983937, "grad_norm": 0.47273083263588245, "learning_rate": 5.128479657387581e-06, "loss": 0.16252427101135253, "step": 480 }, { "epoch": 0.20990629183400267, "grad_norm": 0.4611929063195057, "learning_rate": 5.235546038543897e-06, "loss": 0.1942029356956482, "step": 490 }, { "epoch": 0.214190093708166, "grad_norm": 0.4640899103515948, "learning_rate": 5.342612419700215e-06, "loss": 0.1781969666481018, "step": 500 }, { "epoch": 0.214190093708166, "eval_loss": 0.17298774421215057, "eval_runtime": 813.3235, "eval_samples_per_second": 20.41, "eval_steps_per_second": 5.103, "step": 500 }, { "epoch": 0.21847389558232932, "grad_norm": 0.4892486598590822, "learning_rate": 5.4496788008565314e-06, "loss": 0.17805953025817872, "step": 510 }, { "epoch": 0.22275769745649263, "grad_norm": 0.45908639825034264, "learning_rate": 5.556745182012848e-06, "loss": 0.1704517126083374, "step": 520 }, { "epoch": 0.22704149933065595, "grad_norm": 0.4606868972349124, "learning_rate": 5.663811563169165e-06, "loss": 0.17605620622634888, "step": 530 }, { "epoch": 0.23132530120481928, "grad_norm": 0.4883203630934758, "learning_rate": 5.770877944325482e-06, "loss": 0.185607647895813, "step": 540 }, { "epoch": 0.23560910307898258, "grad_norm": 0.480706769968442, "learning_rate": 5.877944325481799e-06, "loss": 0.1776334285736084, "step": 550 }, { "epoch": 0.2398929049531459, "grad_norm": 0.43013827677127364, "learning_rate": 5.985010706638116e-06, "loss": 0.17925962209701538, "step": 560 }, { "epoch": 0.24417670682730924, "grad_norm": 0.43681041122775155, "learning_rate": 6.092077087794433e-06, "loss": 0.15904269218444825, "step": 570 }, { "epoch": 0.24846050870147254, "grad_norm": 0.4057531376060292, "learning_rate": 6.19914346895075e-06, "loss": 0.17201122045516967, "step": 580 }, { "epoch": 0.2527443105756359, "grad_norm": 0.5693511659878766, "learning_rate": 6.3062098501070665e-06, "loss": 0.1783498764038086, "step": 590 }, { "epoch": 0.2570281124497992, "grad_norm": 0.5038894023292907, "learning_rate": 6.413276231263383e-06, "loss": 0.16208181381225586, "step": 600 }, { "epoch": 0.2613119143239625, "grad_norm": 0.4255056407918071, "learning_rate": 6.5203426124197015e-06, "loss": 0.1778697967529297, "step": 610 }, { "epoch": 0.26559571619812583, "grad_norm": 0.42463834883952506, "learning_rate": 6.627408993576018e-06, "loss": 0.17847087383270263, "step": 620 }, { "epoch": 0.26987951807228916, "grad_norm": 0.4280904585197745, "learning_rate": 6.734475374732334e-06, "loss": 0.16192808151245117, "step": 630 }, { "epoch": 0.2741633199464525, "grad_norm": 0.4032310396751306, "learning_rate": 6.841541755888651e-06, "loss": 0.1500581383705139, "step": 640 }, { "epoch": 0.2784471218206158, "grad_norm": 0.36936808807497884, "learning_rate": 6.948608137044969e-06, "loss": 0.1805708885192871, "step": 650 }, { "epoch": 0.2827309236947791, "grad_norm": 0.41279770820447376, "learning_rate": 7.055674518201286e-06, "loss": 0.15682549476623536, "step": 660 }, { "epoch": 0.2870147255689424, "grad_norm": 0.48113068018089383, "learning_rate": 7.162740899357602e-06, "loss": 0.17637710571289061, "step": 670 }, { "epoch": 0.29129852744310575, "grad_norm": 0.45019312769869485, "learning_rate": 7.26980728051392e-06, "loss": 0.16801434755325317, "step": 680 }, { "epoch": 0.2955823293172691, "grad_norm": 0.4323771559896418, "learning_rate": 7.3768736616702365e-06, "loss": 0.1738981246948242, "step": 690 }, { "epoch": 0.2998661311914324, "grad_norm": 0.4445466528485117, "learning_rate": 7.483940042826553e-06, "loss": 0.17883800268173217, "step": 700 }, { "epoch": 0.30414993306559573, "grad_norm": 0.4169235110055358, "learning_rate": 7.59100642398287e-06, "loss": 0.1757150650024414, "step": 710 }, { "epoch": 0.30843373493975906, "grad_norm": 0.46124417838321063, "learning_rate": 7.698072805139187e-06, "loss": 0.16563992500305175, "step": 720 }, { "epoch": 0.31271753681392234, "grad_norm": 0.4455214464656937, "learning_rate": 7.805139186295504e-06, "loss": 0.15891735553741454, "step": 730 }, { "epoch": 0.31700133868808567, "grad_norm": 0.48435793526108334, "learning_rate": 7.91220556745182e-06, "loss": 0.16565344333648682, "step": 740 }, { "epoch": 0.321285140562249, "grad_norm": 0.425099998591317, "learning_rate": 8.019271948608137e-06, "loss": 0.16711184978485108, "step": 750 }, { "epoch": 0.3255689424364123, "grad_norm": 0.4137507644842352, "learning_rate": 8.126338329764456e-06, "loss": 0.17436256408691406, "step": 760 }, { "epoch": 0.32985274431057565, "grad_norm": 0.5261718559693129, "learning_rate": 8.23340471092077e-06, "loss": 0.17338960170745848, "step": 770 }, { "epoch": 0.334136546184739, "grad_norm": 0.44161850092055, "learning_rate": 8.340471092077087e-06, "loss": 0.15373395681381224, "step": 780 }, { "epoch": 0.33842034805890225, "grad_norm": 0.42667362111196244, "learning_rate": 8.447537473233406e-06, "loss": 0.170109760761261, "step": 790 }, { "epoch": 0.3427041499330656, "grad_norm": 0.4289414936466275, "learning_rate": 8.554603854389722e-06, "loss": 0.16255849599838257, "step": 800 }, { "epoch": 0.3469879518072289, "grad_norm": 0.4462302049947027, "learning_rate": 8.661670235546039e-06, "loss": 0.1558121919631958, "step": 810 }, { "epoch": 0.35127175368139224, "grad_norm": 0.39502484462695925, "learning_rate": 8.768736616702356e-06, "loss": 0.14783246517181398, "step": 820 }, { "epoch": 0.35555555555555557, "grad_norm": 0.4872082027579418, "learning_rate": 8.875802997858674e-06, "loss": 0.162847638130188, "step": 830 }, { "epoch": 0.3598393574297189, "grad_norm": 0.427450758730554, "learning_rate": 8.98286937901499e-06, "loss": 0.16352038383483886, "step": 840 }, { "epoch": 0.36412315930388217, "grad_norm": 0.4528788471261664, "learning_rate": 9.089935760171307e-06, "loss": 0.16523147821426393, "step": 850 }, { "epoch": 0.3684069611780455, "grad_norm": 0.37599550924220604, "learning_rate": 9.197002141327624e-06, "loss": 0.15126256942749022, "step": 860 }, { "epoch": 0.37269076305220883, "grad_norm": 0.40592589779270666, "learning_rate": 9.30406852248394e-06, "loss": 0.15496088266372682, "step": 870 }, { "epoch": 0.37697456492637216, "grad_norm": 0.39024589028386475, "learning_rate": 9.411134903640257e-06, "loss": 0.16612087488174437, "step": 880 }, { "epoch": 0.3812583668005355, "grad_norm": 0.46291145028584035, "learning_rate": 9.518201284796574e-06, "loss": 0.16229329109191895, "step": 890 }, { "epoch": 0.3855421686746988, "grad_norm": 0.4622950426469592, "learning_rate": 9.625267665952892e-06, "loss": 0.16289321184158326, "step": 900 }, { "epoch": 0.3898259705488621, "grad_norm": 0.4404561037311073, "learning_rate": 9.732334047109209e-06, "loss": 0.16939005851745606, "step": 910 }, { "epoch": 0.3941097724230254, "grad_norm": 0.4522735218377503, "learning_rate": 9.839400428265526e-06, "loss": 0.1664318323135376, "step": 920 }, { "epoch": 0.39839357429718875, "grad_norm": 0.39873486371619626, "learning_rate": 9.946466809421842e-06, "loss": 0.17302082777023314, "step": 930 }, { "epoch": 0.4026773761713521, "grad_norm": 0.4403816711799427, "learning_rate": 9.99999127026893e-06, "loss": 0.1635822534561157, "step": 940 }, { "epoch": 0.4069611780455154, "grad_norm": 0.42736129250630583, "learning_rate": 9.999921432603256e-06, "loss": 0.16364479064941406, "step": 950 }, { "epoch": 0.41124497991967873, "grad_norm": 0.48227838403112244, "learning_rate": 9.999781758247374e-06, "loss": 0.1692502498626709, "step": 960 }, { "epoch": 0.41552878179384206, "grad_norm": 0.4325606288398738, "learning_rate": 9.999572249152187e-06, "loss": 0.1753953218460083, "step": 970 }, { "epoch": 0.41981258366800533, "grad_norm": 0.4184812393572346, "learning_rate": 9.999292908244031e-06, "loss": 0.15361449718475342, "step": 980 }, { "epoch": 0.42409638554216866, "grad_norm": 0.3536766183699388, "learning_rate": 9.998943739424614e-06, "loss": 0.16968698501586915, "step": 990 }, { "epoch": 0.428380187416332, "grad_norm": 0.4421427075174403, "learning_rate": 9.99852474757097e-06, "loss": 0.17062946557998657, "step": 1000 }, { "epoch": 0.428380187416332, "eval_loss": 0.1632310301065445, "eval_runtime": 809.0798, "eval_samples_per_second": 20.517, "eval_steps_per_second": 5.129, "step": 1000 }, { "epoch": 0.4326639892904953, "grad_norm": 0.4501749565827156, "learning_rate": 9.998035938535395e-06, "loss": 0.17221925258636475, "step": 1010 }, { "epoch": 0.43694779116465865, "grad_norm": 0.35159158648894256, "learning_rate": 9.997477319145354e-06, "loss": 0.18630390167236327, "step": 1020 }, { "epoch": 0.441231593038822, "grad_norm": 0.38517475012295227, "learning_rate": 9.9968488972034e-06, "loss": 0.17598154544830322, "step": 1030 }, { "epoch": 0.44551539491298525, "grad_norm": 0.3612688847646603, "learning_rate": 9.996150681487047e-06, "loss": 0.1822005033493042, "step": 1040 }, { "epoch": 0.4497991967871486, "grad_norm": 0.3577703505886406, "learning_rate": 9.995382681748667e-06, "loss": 0.16494649648666382, "step": 1050 }, { "epoch": 0.4540829986613119, "grad_norm": 0.4006846696021192, "learning_rate": 9.99454490871534e-06, "loss": 0.1681265115737915, "step": 1060 }, { "epoch": 0.45836680053547524, "grad_norm": 0.41680986168641504, "learning_rate": 9.99363737408871e-06, "loss": 0.15723063945770263, "step": 1070 }, { "epoch": 0.46265060240963857, "grad_norm": 0.3955828911870276, "learning_rate": 9.992660090544814e-06, "loss": 0.17240710258483888, "step": 1080 }, { "epoch": 0.4669344042838019, "grad_norm": 0.40819367835971887, "learning_rate": 9.991613071733923e-06, "loss": 0.1590951204299927, "step": 1090 }, { "epoch": 0.47121820615796517, "grad_norm": 0.4961313776161533, "learning_rate": 9.990496332280327e-06, "loss": 0.16744821071624755, "step": 1100 }, { "epoch": 0.4755020080321285, "grad_norm": 0.40118583702904315, "learning_rate": 9.989309887782153e-06, "loss": 0.16566444635391236, "step": 1110 }, { "epoch": 0.4797858099062918, "grad_norm": 0.38801799234687073, "learning_rate": 9.988053754811129e-06, "loss": 0.16186387538909913, "step": 1120 }, { "epoch": 0.48406961178045516, "grad_norm": 0.40747871131177194, "learning_rate": 9.986727950912364e-06, "loss": 0.162397563457489, "step": 1130 }, { "epoch": 0.4883534136546185, "grad_norm": 0.3728066796444714, "learning_rate": 9.985332494604107e-06, "loss": 0.1676606059074402, "step": 1140 }, { "epoch": 0.4926372155287818, "grad_norm": 0.40985672457156785, "learning_rate": 9.983867405377467e-06, "loss": 0.1700581431388855, "step": 1150 }, { "epoch": 0.4969210174029451, "grad_norm": 0.4419642574041659, "learning_rate": 9.982332703696165e-06, "loss": 0.16604260206222535, "step": 1160 }, { "epoch": 0.5012048192771085, "grad_norm": 0.4019173064441985, "learning_rate": 9.980728410996235e-06, "loss": 0.16702601909637452, "step": 1170 }, { "epoch": 0.5054886211512718, "grad_norm": 0.37849315597886735, "learning_rate": 9.979054549685726e-06, "loss": 0.17048054933547974, "step": 1180 }, { "epoch": 0.5097724230254351, "grad_norm": 0.41757098420175776, "learning_rate": 9.977311143144392e-06, "loss": 0.1623483419418335, "step": 1190 }, { "epoch": 0.5140562248995983, "grad_norm": 0.39624453257545467, "learning_rate": 9.97549821572337e-06, "loss": 0.18060542345046998, "step": 1200 }, { "epoch": 0.5183400267737617, "grad_norm": 0.3790478315082819, "learning_rate": 9.97361579274482e-06, "loss": 0.15714950561523439, "step": 1210 }, { "epoch": 0.522623828647925, "grad_norm": 0.47455877319994494, "learning_rate": 9.971663900501597e-06, "loss": 0.1706780195236206, "step": 1220 }, { "epoch": 0.5269076305220883, "grad_norm": 0.41051182237414957, "learning_rate": 9.969642566256869e-06, "loss": 0.17303004264831542, "step": 1230 }, { "epoch": 0.5311914323962517, "grad_norm": 0.3664624770254722, "learning_rate": 9.967551818243738e-06, "loss": 0.16188311576843262, "step": 1240 }, { "epoch": 0.535475234270415, "grad_norm": 0.37091548258017915, "learning_rate": 9.965391685664844e-06, "loss": 0.14944344758987427, "step": 1250 }, { "epoch": 0.5397590361445783, "grad_norm": 0.36490771306848957, "learning_rate": 9.963162198691967e-06, "loss": 0.17565066814422609, "step": 1260 }, { "epoch": 0.5440428380187416, "grad_norm": 0.36889359791667947, "learning_rate": 9.960863388465592e-06, "loss": 0.14779505729675294, "step": 1270 }, { "epoch": 0.548326639892905, "grad_norm": 0.437009279584505, "learning_rate": 9.958495287094485e-06, "loss": 0.16427998542785643, "step": 1280 }, { "epoch": 0.5526104417670683, "grad_norm": 0.39554167977080396, "learning_rate": 9.956057927655236e-06, "loss": 0.15541106462478638, "step": 1290 }, { "epoch": 0.5568942436412316, "grad_norm": 0.45029869907045383, "learning_rate": 9.953551344191806e-06, "loss": 0.16692056655883789, "step": 1300 }, { "epoch": 0.561178045515395, "grad_norm": 0.4199591876603144, "learning_rate": 9.95097557171504e-06, "loss": 0.14758901596069335, "step": 1310 }, { "epoch": 0.5654618473895582, "grad_norm": 0.4075053894893693, "learning_rate": 9.948330646202192e-06, "loss": 0.14503839015960693, "step": 1320 }, { "epoch": 0.5697456492637215, "grad_norm": 0.43881797258551375, "learning_rate": 9.94561660459641e-06, "loss": 0.16932222843170167, "step": 1330 }, { "epoch": 0.5740294511378848, "grad_norm": 0.3663150123238361, "learning_rate": 9.942833484806224e-06, "loss": 0.1607386827468872, "step": 1340 }, { "epoch": 0.5783132530120482, "grad_norm": 0.3407640340916305, "learning_rate": 9.939981325705022e-06, "loss": 0.1527782440185547, "step": 1350 }, { "epoch": 0.5825970548862115, "grad_norm": 0.37405131401648734, "learning_rate": 9.937060167130499e-06, "loss": 0.171100389957428, "step": 1360 }, { "epoch": 0.5868808567603748, "grad_norm": 0.37761828710703715, "learning_rate": 9.934070049884108e-06, "loss": 0.15846436023712157, "step": 1370 }, { "epoch": 0.5911646586345382, "grad_norm": 0.3693798885089601, "learning_rate": 9.931011015730481e-06, "loss": 0.16067838668823242, "step": 1380 }, { "epoch": 0.5954484605087015, "grad_norm": 0.3911912390175172, "learning_rate": 9.927883107396855e-06, "loss": 0.17477972507476808, "step": 1390 }, { "epoch": 0.5997322623828648, "grad_norm": 0.3338272015147582, "learning_rate": 9.924686368572467e-06, "loss": 0.15092020034790038, "step": 1400 }, { "epoch": 0.6040160642570281, "grad_norm": 0.4078089060822029, "learning_rate": 9.921420843907954e-06, "loss": 0.1569045066833496, "step": 1410 }, { "epoch": 0.6082998661311915, "grad_norm": 0.4794604605869132, "learning_rate": 9.918086579014719e-06, "loss": 0.15916914939880372, "step": 1420 }, { "epoch": 0.6125836680053548, "grad_norm": 0.3657509810107675, "learning_rate": 9.914683620464296e-06, "loss": 0.1613703727722168, "step": 1430 }, { "epoch": 0.6168674698795181, "grad_norm": 0.3781037238260815, "learning_rate": 9.911212015787705e-06, "loss": 0.1711595058441162, "step": 1440 }, { "epoch": 0.6211512717536813, "grad_norm": 0.4177181930728637, "learning_rate": 9.907671813474787e-06, "loss": 0.1607887864112854, "step": 1450 }, { "epoch": 0.6254350736278447, "grad_norm": 0.43202846060475897, "learning_rate": 9.904063062973518e-06, "loss": 0.1596811056137085, "step": 1460 }, { "epoch": 0.629718875502008, "grad_norm": 0.37967408761934135, "learning_rate": 9.90038581468933e-06, "loss": 0.15738776922225953, "step": 1470 }, { "epoch": 0.6340026773761713, "grad_norm": 0.35431221490178816, "learning_rate": 9.8966401199844e-06, "loss": 0.16409718990325928, "step": 1480 }, { "epoch": 0.6382864792503347, "grad_norm": 0.5424318304534815, "learning_rate": 9.892826031176932e-06, "loss": 0.1624216079711914, "step": 1490 }, { "epoch": 0.642570281124498, "grad_norm": 0.4154998221845867, "learning_rate": 9.888943601540435e-06, "loss": 0.16612136363983154, "step": 1500 }, { "epoch": 0.642570281124498, "eval_loss": 0.15596744418144226, "eval_runtime": 5287.3746, "eval_samples_per_second": 3.14, "eval_steps_per_second": 0.785, "step": 1500 }, { "epoch": 0.6468540829986613, "grad_norm": 0.31788237500758254, "learning_rate": 9.884992885302964e-06, "loss": 0.16352603435516358, "step": 1510 }, { "epoch": 0.6511378848728246, "grad_norm": 0.3933875722388967, "learning_rate": 9.880973937646376e-06, "loss": 0.16239913702011108, "step": 1520 }, { "epoch": 0.655421686746988, "grad_norm": 0.3911043138186677, "learning_rate": 9.876886814705557e-06, "loss": 0.15573612451553345, "step": 1530 }, { "epoch": 0.6597054886211513, "grad_norm": 0.4165342919082731, "learning_rate": 9.87273157356763e-06, "loss": 0.15565356016159057, "step": 1540 }, { "epoch": 0.6639892904953146, "grad_norm": 0.4041990333202639, "learning_rate": 9.868508272271162e-06, "loss": 0.15832991600036622, "step": 1550 }, { "epoch": 0.668273092369478, "grad_norm": 0.42249142494241126, "learning_rate": 9.86421696980536e-06, "loss": 0.15069495439529418, "step": 1560 }, { "epoch": 0.6725568942436412, "grad_norm": 0.3703727697545347, "learning_rate": 9.859857726109237e-06, "loss": 0.1529747486114502, "step": 1570 }, { "epoch": 0.6768406961178045, "grad_norm": 0.340178722202618, "learning_rate": 9.85543060207078e-06, "loss": 0.15539826154708863, "step": 1580 }, { "epoch": 0.6811244979919678, "grad_norm": 0.3731049216784043, "learning_rate": 9.850935659526097e-06, "loss": 0.15447347164154052, "step": 1590 }, { "epoch": 0.6854082998661312, "grad_norm": 0.4042949825278044, "learning_rate": 9.84637296125856e-06, "loss": 0.17724437713623048, "step": 1600 }, { "epoch": 0.6896921017402945, "grad_norm": 0.3660587911460726, "learning_rate": 9.841742570997916e-06, "loss": 0.16080789566040038, "step": 1610 }, { "epoch": 0.6939759036144578, "grad_norm": 0.41736075936721456, "learning_rate": 9.837044553419411e-06, "loss": 0.16406190395355225, "step": 1620 }, { "epoch": 0.6982597054886212, "grad_norm": 0.36046398358975057, "learning_rate": 9.832278974142872e-06, "loss": 0.15605542659759522, "step": 1630 }, { "epoch": 0.7025435073627845, "grad_norm": 0.3954125125143182, "learning_rate": 9.827445899731805e-06, "loss": 0.16570944786071778, "step": 1640 }, { "epoch": 0.7068273092369478, "grad_norm": 0.40637254190631067, "learning_rate": 9.822545397692453e-06, "loss": 0.16883254051208496, "step": 1650 }, { "epoch": 0.7111111111111111, "grad_norm": 0.41139029483344075, "learning_rate": 9.81757753647286e-06, "loss": 0.16364901065826415, "step": 1660 }, { "epoch": 0.7153949129852745, "grad_norm": 0.4536938064672351, "learning_rate": 9.812542385461912e-06, "loss": 0.1675459623336792, "step": 1670 }, { "epoch": 0.7196787148594378, "grad_norm": 0.3585184083438791, "learning_rate": 9.807440014988375e-06, "loss": 0.16231054067611694, "step": 1680 }, { "epoch": 0.7239625167336011, "grad_norm": 0.3167942544933684, "learning_rate": 9.802270496319896e-06, "loss": 0.14959096908569336, "step": 1690 }, { "epoch": 0.7282463186077643, "grad_norm": 0.3699602110663905, "learning_rate": 9.79703390166203e-06, "loss": 0.14526536464691162, "step": 1700 }, { "epoch": 0.7325301204819277, "grad_norm": 0.3613102627272191, "learning_rate": 9.791730304157212e-06, "loss": 0.15053074359893798, "step": 1710 }, { "epoch": 0.736813922356091, "grad_norm": 0.36315339995103474, "learning_rate": 9.786359777883743e-06, "loss": 0.1579727292060852, "step": 1720 }, { "epoch": 0.7410977242302543, "grad_norm": 0.35352051713516114, "learning_rate": 9.78092239785476e-06, "loss": 0.16381702423095704, "step": 1730 }, { "epoch": 0.7453815261044177, "grad_norm": 0.41420218762506095, "learning_rate": 9.775418240017183e-06, "loss": 0.16737335920333862, "step": 1740 }, { "epoch": 0.749665327978581, "grad_norm": 0.32647328326287134, "learning_rate": 9.769847381250647e-06, "loss": 0.16527401208877562, "step": 1750 }, { "epoch": 0.7539491298527443, "grad_norm": 0.35594572768523836, "learning_rate": 9.764209899366451e-06, "loss": 0.17207796573638917, "step": 1760 }, { "epoch": 0.7582329317269076, "grad_norm": 0.3529492671194906, "learning_rate": 9.75850587310644e-06, "loss": 0.15534259080886842, "step": 1770 }, { "epoch": 0.762516733601071, "grad_norm": 0.41518414925000824, "learning_rate": 9.752735382141931e-06, "loss": 0.17126250267028809, "step": 1780 }, { "epoch": 0.7668005354752343, "grad_norm": 0.3416212552791915, "learning_rate": 9.74689850707259e-06, "loss": 0.17300653457641602, "step": 1790 }, { "epoch": 0.7710843373493976, "grad_norm": 0.4506868511706448, "learning_rate": 9.740995329425304e-06, "loss": 0.16119366884231567, "step": 1800 }, { "epoch": 0.775368139223561, "grad_norm": 0.4127032617250803, "learning_rate": 9.735025931653047e-06, "loss": 0.1660417675971985, "step": 1810 }, { "epoch": 0.7796519410977242, "grad_norm": 0.3711426866374276, "learning_rate": 9.728990397133725e-06, "loss": 0.15557256937026978, "step": 1820 }, { "epoch": 0.7839357429718875, "grad_norm": 0.4403684798533838, "learning_rate": 9.722888810169015e-06, "loss": 0.14504989385604858, "step": 1830 }, { "epoch": 0.7882195448460508, "grad_norm": 0.43523370881285106, "learning_rate": 9.716721255983184e-06, "loss": 0.18080484867095947, "step": 1840 }, { "epoch": 0.7925033467202142, "grad_norm": 0.39796977768343905, "learning_rate": 9.710487820721897e-06, "loss": 0.16169551610946656, "step": 1850 }, { "epoch": 0.7967871485943775, "grad_norm": 0.3696365244924919, "learning_rate": 9.704188591451021e-06, "loss": 0.1710440158843994, "step": 1860 }, { "epoch": 0.8010709504685408, "grad_norm": 0.3501836475183059, "learning_rate": 9.697823656155404e-06, "loss": 0.14459784030914308, "step": 1870 }, { "epoch": 0.8053547523427041, "grad_norm": 0.4008873451421875, "learning_rate": 9.691393103737646e-06, "loss": 0.15653254985809326, "step": 1880 }, { "epoch": 0.8096385542168675, "grad_norm": 0.3726778794979056, "learning_rate": 9.684897024016856e-06, "loss": 0.15802738666534424, "step": 1890 }, { "epoch": 0.8139223560910308, "grad_norm": 0.3602622222902254, "learning_rate": 9.678335507727406e-06, "loss": 0.15577685832977295, "step": 1900 }, { "epoch": 0.8182061579651941, "grad_norm": 0.36180831789633733, "learning_rate": 9.671708646517644e-06, "loss": 0.1501927375793457, "step": 1910 }, { "epoch": 0.8224899598393575, "grad_norm": 0.4291946610668789, "learning_rate": 9.665016532948643e-06, "loss": 0.1524329662322998, "step": 1920 }, { "epoch": 0.8267737617135208, "grad_norm": 0.36439021529215626, "learning_rate": 9.658259260492879e-06, "loss": 0.1579957962036133, "step": 1930 }, { "epoch": 0.8310575635876841, "grad_norm": 0.36185634405902617, "learning_rate": 9.651436923532947e-06, "loss": 0.1648595690727234, "step": 1940 }, { "epoch": 0.8353413654618473, "grad_norm": 0.3896804732201538, "learning_rate": 9.644549617360227e-06, "loss": 0.14703061580657958, "step": 1950 }, { "epoch": 0.8396251673360107, "grad_norm": 0.3270830246578632, "learning_rate": 9.63759743817357e-06, "loss": 0.14643968343734742, "step": 1960 }, { "epoch": 0.843908969210174, "grad_norm": 0.36683351098847644, "learning_rate": 9.630580483077934e-06, "loss": 0.15101373195648193, "step": 1970 }, { "epoch": 0.8481927710843373, "grad_norm": 0.2637127315901447, "learning_rate": 9.623498850083043e-06, "loss": 0.1591057300567627, "step": 1980 }, { "epoch": 0.8524765729585007, "grad_norm": 0.3681053572408943, "learning_rate": 9.616352638102017e-06, "loss": 0.1697171926498413, "step": 1990 }, { "epoch": 0.856760374832664, "grad_norm": 0.40805430553066435, "learning_rate": 9.609141946949978e-06, "loss": 0.1591539740562439, "step": 2000 }, { "epoch": 0.856760374832664, "eval_loss": 0.15128476917743683, "eval_runtime": 813.5807, "eval_samples_per_second": 20.404, "eval_steps_per_second": 5.101, "step": 2000 }, { "epoch": 0.8610441767068273, "grad_norm": 0.3914592710894462, "learning_rate": 9.601866877342673e-06, "loss": 0.15913846492767333, "step": 2010 }, { "epoch": 0.8653279785809906, "grad_norm": 0.34232621179600625, "learning_rate": 9.594527530895055e-06, "loss": 0.1589035987854004, "step": 2020 }, { "epoch": 0.869611780455154, "grad_norm": 0.35138032967412824, "learning_rate": 9.587124010119866e-06, "loss": 0.15038516521453857, "step": 2030 }, { "epoch": 0.8738955823293173, "grad_norm": 0.38790494555500904, "learning_rate": 9.579656418426208e-06, "loss": 0.14970223903656005, "step": 2040 }, { "epoch": 0.8781793842034806, "grad_norm": 0.453347749337455, "learning_rate": 9.572124860118099e-06, "loss": 0.15592522621154786, "step": 2050 }, { "epoch": 0.882463186077644, "grad_norm": 0.36254040692639466, "learning_rate": 9.564529440393013e-06, "loss": 0.14756847620010377, "step": 2060 }, { "epoch": 0.8867469879518072, "grad_norm": 0.28500745218910845, "learning_rate": 9.55687026534041e-06, "loss": 0.15284668207168578, "step": 2070 }, { "epoch": 0.8910307898259705, "grad_norm": 0.33059182026983963, "learning_rate": 9.54914744194026e-06, "loss": 0.13931398391723632, "step": 2080 }, { "epoch": 0.8953145917001338, "grad_norm": 0.3308533363527482, "learning_rate": 9.541361078061543e-06, "loss": 0.152490496635437, "step": 2090 }, { "epoch": 0.8995983935742972, "grad_norm": 0.39410920160803786, "learning_rate": 9.533511282460744e-06, "loss": 0.15455267429351807, "step": 2100 }, { "epoch": 0.9038821954484605, "grad_norm": 0.3510760250872994, "learning_rate": 9.525598164780335e-06, "loss": 0.15271444320678712, "step": 2110 }, { "epoch": 0.9081659973226238, "grad_norm": 0.34879574406946134, "learning_rate": 9.51762183554724e-06, "loss": 0.145074462890625, "step": 2120 }, { "epoch": 0.9124497991967871, "grad_norm": 0.4102513842794922, "learning_rate": 9.5095824061713e-06, "loss": 0.1671789288520813, "step": 2130 }, { "epoch": 0.9167336010709505, "grad_norm": 0.32100215647635666, "learning_rate": 9.501479988943705e-06, "loss": 0.14845454692840576, "step": 2140 }, { "epoch": 0.9210174029451138, "grad_norm": 0.35303111230486783, "learning_rate": 9.493314697035433e-06, "loss": 0.14766921997070312, "step": 2150 }, { "epoch": 0.9253012048192771, "grad_norm": 0.3595530843531657, "learning_rate": 9.48508664449567e-06, "loss": 0.1577920436859131, "step": 2160 }, { "epoch": 0.9295850066934405, "grad_norm": 0.3500784633268657, "learning_rate": 9.476795946250213e-06, "loss": 0.15419769287109375, "step": 2170 }, { "epoch": 0.9338688085676038, "grad_norm": 0.5035759293187142, "learning_rate": 9.468442718099866e-06, "loss": 0.15254662036895753, "step": 2180 }, { "epoch": 0.9381526104417671, "grad_norm": 0.3597669443798906, "learning_rate": 9.460027076718825e-06, "loss": 0.15965031385421752, "step": 2190 }, { "epoch": 0.9424364123159303, "grad_norm": 0.32302117680971176, "learning_rate": 9.451549139653043e-06, "loss": 0.15642788410186767, "step": 2200 }, { "epoch": 0.9467202141900937, "grad_norm": 0.37709479129796397, "learning_rate": 9.443009025318595e-06, "loss": 0.16215311288833617, "step": 2210 }, { "epoch": 0.951004016064257, "grad_norm": 0.41863991954422164, "learning_rate": 9.434406853000017e-06, "loss": 0.16595734357833863, "step": 2220 }, { "epoch": 0.9552878179384203, "grad_norm": 0.3895832137317719, "learning_rate": 9.425742742848652e-06, "loss": 0.1542948842048645, "step": 2230 }, { "epoch": 0.9595716198125837, "grad_norm": 0.3383760951721925, "learning_rate": 9.417016815880948e-06, "loss": 0.1523042917251587, "step": 2240 }, { "epoch": 0.963855421686747, "grad_norm": 0.4388306567649398, "learning_rate": 9.4082291939768e-06, "loss": 0.13539564609527588, "step": 2250 }, { "epoch": 0.9681392235609103, "grad_norm": 0.392487987824093, "learning_rate": 9.399379999877816e-06, "loss": 0.16397664546966553, "step": 2260 }, { "epoch": 0.9724230254350736, "grad_norm": 0.37843944567360804, "learning_rate": 9.390469357185626e-06, "loss": 0.1599686861038208, "step": 2270 }, { "epoch": 0.976706827309237, "grad_norm": 0.36495911845917256, "learning_rate": 9.381497390360146e-06, "loss": 0.16280412673950195, "step": 2280 }, { "epoch": 0.9809906291834003, "grad_norm": 0.3098293192725145, "learning_rate": 9.372464224717836e-06, "loss": 0.16709411144256592, "step": 2290 }, { "epoch": 0.9852744310575636, "grad_norm": 0.36503501082057177, "learning_rate": 9.36336998642996e-06, "loss": 0.14577250480651854, "step": 2300 }, { "epoch": 0.989558232931727, "grad_norm": 0.369748777319339, "learning_rate": 9.354214802520813e-06, "loss": 0.15008455514907837, "step": 2310 }, { "epoch": 0.9938420348058902, "grad_norm": 0.38954595915895235, "learning_rate": 9.344998800865949e-06, "loss": 0.16494543552398683, "step": 2320 }, { "epoch": 0.9981258366800535, "grad_norm": 0.3263933545214738, "learning_rate": 9.335722110190409e-06, "loss": 0.1547703266143799, "step": 2330 }, { "epoch": 1.0021419009370816, "grad_norm": 0.3240736359093112, "learning_rate": 9.326384860066894e-06, "loss": 0.1678773880004883, "step": 2340 }, { "epoch": 1.0064257028112449, "grad_norm": 0.4121951074794008, "learning_rate": 9.316987180913993e-06, "loss": 0.13320955038070678, "step": 2350 }, { "epoch": 1.0107095046854082, "grad_norm": 0.35703547426799104, "learning_rate": 9.30752920399432e-06, "loss": 0.12546956539154053, "step": 2360 }, { "epoch": 1.0149933065595715, "grad_norm": 0.40623072991807463, "learning_rate": 9.298011061412718e-06, "loss": 0.13189778327941895, "step": 2370 }, { "epoch": 1.0192771084337349, "grad_norm": 0.44222195266756315, "learning_rate": 9.288432886114388e-06, "loss": 0.12098613977432252, "step": 2380 }, { "epoch": 1.0235609103078982, "grad_norm": 0.367684966832025, "learning_rate": 9.278794811883047e-06, "loss": 0.11746659278869628, "step": 2390 }, { "epoch": 1.0278447121820615, "grad_norm": 0.33284392755056474, "learning_rate": 9.26909697333905e-06, "loss": 0.12567752599716187, "step": 2400 }, { "epoch": 1.0321285140562249, "grad_norm": 0.3455297587313404, "learning_rate": 9.259339505937514e-06, "loss": 0.12083170413970948, "step": 2410 }, { "epoch": 1.0364123159303882, "grad_norm": 0.3389705981902334, "learning_rate": 9.249522545966427e-06, "loss": 0.12095551490783692, "step": 2420 }, { "epoch": 1.0406961178045515, "grad_norm": 0.40601150502203404, "learning_rate": 9.239646230544741e-06, "loss": 0.14402755498886108, "step": 2430 }, { "epoch": 1.0449799196787148, "grad_norm": 0.3559777449007349, "learning_rate": 9.229710697620462e-06, "loss": 0.1495804786682129, "step": 2440 }, { "epoch": 1.0492637215528782, "grad_norm": 0.36896684434500243, "learning_rate": 9.219716085968716e-06, "loss": 0.12875673770904542, "step": 2450 }, { "epoch": 1.0535475234270415, "grad_norm": 0.39146972255890167, "learning_rate": 9.209662535189814e-06, "loss": 0.13340590000152588, "step": 2460 }, { "epoch": 1.0578313253012048, "grad_norm": 0.40291541972442413, "learning_rate": 9.199550185707309e-06, "loss": 0.1337528109550476, "step": 2470 }, { "epoch": 1.0621151271753682, "grad_norm": 0.37956437532491505, "learning_rate": 9.189379178766022e-06, "loss": 0.12576285600662232, "step": 2480 }, { "epoch": 1.0663989290495315, "grad_norm": 0.41298961387679495, "learning_rate": 9.179149656430077e-06, "loss": 0.1333579182624817, "step": 2490 }, { "epoch": 1.0706827309236948, "grad_norm": 0.36210102393181387, "learning_rate": 9.168861761580916e-06, "loss": 0.13212097883224488, "step": 2500 }, { "epoch": 1.0706827309236948, "eval_loss": 0.15047596395015717, "eval_runtime": 816.7373, "eval_samples_per_second": 20.325, "eval_steps_per_second": 5.081, "step": 2500 }, { "epoch": 1.0749665327978581, "grad_norm": 0.3726254379576281, "learning_rate": 9.158515637915303e-06, "loss": 0.12463078498840333, "step": 2510 }, { "epoch": 1.0792503346720215, "grad_norm": 0.37712160221949104, "learning_rate": 9.148111429943316e-06, "loss": 0.12076478004455567, "step": 2520 }, { "epoch": 1.0835341365461848, "grad_norm": 0.34263415579260603, "learning_rate": 9.137649282986326e-06, "loss": 0.11901497840881348, "step": 2530 }, { "epoch": 1.0878179384203481, "grad_norm": 0.43199587554265134, "learning_rate": 9.127129343174974e-06, "loss": 0.1473910093307495, "step": 2540 }, { "epoch": 1.0921017402945115, "grad_norm": 0.3888436375726906, "learning_rate": 9.116551757447124e-06, "loss": 0.12526917457580566, "step": 2550 }, { "epoch": 1.0963855421686748, "grad_norm": 0.3720699240255782, "learning_rate": 9.105916673545811e-06, "loss": 0.12781134843826295, "step": 2560 }, { "epoch": 1.1006693440428381, "grad_norm": 0.3528738970780735, "learning_rate": 9.095224240017187e-06, "loss": 0.12412866353988647, "step": 2570 }, { "epoch": 1.1049531459170012, "grad_norm": 0.4631292939251323, "learning_rate": 9.084474606208426e-06, "loss": 0.11998128890991211, "step": 2580 }, { "epoch": 1.1092369477911648, "grad_norm": 0.3596397164936987, "learning_rate": 9.073667922265659e-06, "loss": 0.13821544647216796, "step": 2590 }, { "epoch": 1.1135207496653279, "grad_norm": 0.37491061752134996, "learning_rate": 9.062804339131865e-06, "loss": 0.12905315160751343, "step": 2600 }, { "epoch": 1.1178045515394912, "grad_norm": 0.33236296782840824, "learning_rate": 9.051884008544769e-06, "loss": 0.11152592897415162, "step": 2610 }, { "epoch": 1.1220883534136545, "grad_norm": 0.41099647558351027, "learning_rate": 9.040907083034714e-06, "loss": 0.120727276802063, "step": 2620 }, { "epoch": 1.1263721552878179, "grad_norm": 0.3859893211528485, "learning_rate": 9.02987371592254e-06, "loss": 0.14195597171783447, "step": 2630 }, { "epoch": 1.1306559571619812, "grad_norm": 0.36839784644083184, "learning_rate": 9.018784061317434e-06, "loss": 0.12041090726852417, "step": 2640 }, { "epoch": 1.1349397590361445, "grad_norm": 0.41679177088273905, "learning_rate": 9.007638274114787e-06, "loss": 0.13752386569976807, "step": 2650 }, { "epoch": 1.1392235609103079, "grad_norm": 0.38785605712752647, "learning_rate": 8.996436509994022e-06, "loss": 0.12111247777938842, "step": 2660 }, { "epoch": 1.1435073627844712, "grad_norm": 0.37367256419499406, "learning_rate": 8.985178925416424e-06, "loss": 0.13275750875473022, "step": 2670 }, { "epoch": 1.1477911646586345, "grad_norm": 0.3674681161529881, "learning_rate": 8.973865677622954e-06, "loss": 0.13491373062133788, "step": 2680 }, { "epoch": 1.1520749665327978, "grad_norm": 0.34447615774959234, "learning_rate": 8.962496924632051e-06, "loss": 0.13558318614959716, "step": 2690 }, { "epoch": 1.1563587684069612, "grad_norm": 0.3564170987558211, "learning_rate": 8.951072825237426e-06, "loss": 0.1193579912185669, "step": 2700 }, { "epoch": 1.1606425702811245, "grad_norm": 0.35305355840674923, "learning_rate": 8.939593539005842e-06, "loss": 0.13529754877090455, "step": 2710 }, { "epoch": 1.1649263721552878, "grad_norm": 0.35352573268560833, "learning_rate": 8.928059226274894e-06, "loss": 0.12423286437988282, "step": 2720 }, { "epoch": 1.1692101740294512, "grad_norm": 0.36689834147420736, "learning_rate": 8.916470048150756e-06, "loss": 0.13518364429473878, "step": 2730 }, { "epoch": 1.1734939759036145, "grad_norm": 0.38162988673475534, "learning_rate": 8.90482616650594e-06, "loss": 0.13908401727676392, "step": 2740 }, { "epoch": 1.1777777777777778, "grad_norm": 0.40336299975505086, "learning_rate": 8.893127743977036e-06, "loss": 0.13255722522735597, "step": 2750 }, { "epoch": 1.1820615796519411, "grad_norm": 0.3324644606155607, "learning_rate": 8.881374943962426e-06, "loss": 0.1357291579246521, "step": 2760 }, { "epoch": 1.1863453815261045, "grad_norm": 0.3994255780678427, "learning_rate": 8.869567930620027e-06, "loss": 0.12042539119720459, "step": 2770 }, { "epoch": 1.1906291834002678, "grad_norm": 0.35581614764806313, "learning_rate": 8.857706868864977e-06, "loss": 0.13282716274261475, "step": 2780 }, { "epoch": 1.1949129852744311, "grad_norm": 0.3106663113756059, "learning_rate": 8.845791924367334e-06, "loss": 0.12471635341644287, "step": 2790 }, { "epoch": 1.1991967871485945, "grad_norm": 0.4256051698707425, "learning_rate": 8.833823263549775e-06, "loss": 0.11954612731933593, "step": 2800 }, { "epoch": 1.2034805890227578, "grad_norm": 0.41689423223672023, "learning_rate": 8.821801053585254e-06, "loss": 0.12010161876678467, "step": 2810 }, { "epoch": 1.2077643908969211, "grad_norm": 0.3817725350186758, "learning_rate": 8.809725462394684e-06, "loss": 0.11917848587036133, "step": 2820 }, { "epoch": 1.2120481927710842, "grad_norm": 0.35927209564755835, "learning_rate": 8.797596658644581e-06, "loss": 0.12020325660705566, "step": 2830 }, { "epoch": 1.2163319946452478, "grad_norm": 0.4238739504322855, "learning_rate": 8.785414811744703e-06, "loss": 0.13289868831634521, "step": 2840 }, { "epoch": 1.2206157965194109, "grad_norm": 0.39167399451224444, "learning_rate": 8.773180091845701e-06, "loss": 0.12138681411743164, "step": 2850 }, { "epoch": 1.2248995983935742, "grad_norm": 0.3947355797116567, "learning_rate": 8.760892669836729e-06, "loss": 0.14103634357452394, "step": 2860 }, { "epoch": 1.2291834002677375, "grad_norm": 0.39740732729868383, "learning_rate": 8.74855271734306e-06, "loss": 0.13904783725738526, "step": 2870 }, { "epoch": 1.2334672021419009, "grad_norm": 0.41730678044784, "learning_rate": 8.736160406723688e-06, "loss": 0.12443190813064575, "step": 2880 }, { "epoch": 1.2377510040160642, "grad_norm": 0.3765448851707534, "learning_rate": 8.723715911068931e-06, "loss": 0.1321355938911438, "step": 2890 }, { "epoch": 1.2420348058902275, "grad_norm": 0.3634462539369135, "learning_rate": 8.71121940419799e-06, "loss": 0.14078364372253419, "step": 2900 }, { "epoch": 1.2463186077643909, "grad_norm": 0.4094668843551737, "learning_rate": 8.698671060656549e-06, "loss": 0.13006095886230468, "step": 2910 }, { "epoch": 1.2506024096385542, "grad_norm": 0.37644871257316387, "learning_rate": 8.686071055714318e-06, "loss": 0.12324719429016114, "step": 2920 }, { "epoch": 1.2548862115127175, "grad_norm": 0.4032403895979568, "learning_rate": 8.673419565362587e-06, "loss": 0.14000382423400878, "step": 2930 }, { "epoch": 1.2591700133868808, "grad_norm": 0.4398800669174728, "learning_rate": 8.660716766311778e-06, "loss": 0.11818475723266601, "step": 2940 }, { "epoch": 1.2634538152610442, "grad_norm": 0.3615959969952865, "learning_rate": 8.647962835988968e-06, "loss": 0.1338767886161804, "step": 2950 }, { "epoch": 1.2677376171352075, "grad_norm": 0.31737564808536584, "learning_rate": 8.635157952535411e-06, "loss": 0.1270219087600708, "step": 2960 }, { "epoch": 1.2720214190093708, "grad_norm": 0.3582723999510731, "learning_rate": 8.622302294804052e-06, "loss": 0.12293977737426758, "step": 2970 }, { "epoch": 1.2763052208835342, "grad_norm": 0.35972611924117803, "learning_rate": 8.609396042357033e-06, "loss": 0.12699666023254394, "step": 2980 }, { "epoch": 1.2805890227576975, "grad_norm": 0.4025263144287077, "learning_rate": 8.596439375463174e-06, "loss": 0.13745148181915284, "step": 2990 }, { "epoch": 1.2848728246318608, "grad_norm": 0.3227213368538748, "learning_rate": 8.583432475095468e-06, "loss": 0.11785190105438233, "step": 3000 }, { "epoch": 1.2848728246318608, "eval_loss": 0.1485673487186432, "eval_runtime": 812.0391, "eval_samples_per_second": 20.442, "eval_steps_per_second": 5.111, "step": 3000 }, { "epoch": 1.2891566265060241, "grad_norm": 0.35470826251565785, "learning_rate": 8.570375522928543e-06, "loss": 0.12998595237731933, "step": 3010 }, { "epoch": 1.2934404283801875, "grad_norm": 0.3775363159731956, "learning_rate": 8.55726870133613e-06, "loss": 0.11246494054794312, "step": 3020 }, { "epoch": 1.2977242302543508, "grad_norm": 0.36015262199345144, "learning_rate": 8.544112193388513e-06, "loss": 0.1255005955696106, "step": 3030 }, { "epoch": 1.3020080321285141, "grad_norm": 0.32095363304273905, "learning_rate": 8.530906182849971e-06, "loss": 0.14123222827911378, "step": 3040 }, { "epoch": 1.3062918340026775, "grad_norm": 0.35963777187492285, "learning_rate": 8.51765085417622e-06, "loss": 0.12764023542404174, "step": 3050 }, { "epoch": 1.3105756358768406, "grad_norm": 0.3545121600646447, "learning_rate": 8.504346392511824e-06, "loss": 0.12473820447921753, "step": 3060 }, { "epoch": 1.3148594377510041, "grad_norm": 0.3752599966671012, "learning_rate": 8.490992983687617e-06, "loss": 0.12995026111602784, "step": 3070 }, { "epoch": 1.3191432396251672, "grad_norm": 0.3326424253698993, "learning_rate": 8.477590814218104e-06, "loss": 0.13189772367477418, "step": 3080 }, { "epoch": 1.3234270414993308, "grad_norm": 0.33944521013309487, "learning_rate": 8.464140071298858e-06, "loss": 0.12935359477996827, "step": 3090 }, { "epoch": 1.3277108433734939, "grad_norm": 0.37010591984301416, "learning_rate": 8.450640942803904e-06, "loss": 0.13249437808990477, "step": 3100 }, { "epoch": 1.3319946452476574, "grad_norm": 0.3615034420980659, "learning_rate": 8.437093617283099e-06, "loss": 0.12562718391418456, "step": 3110 }, { "epoch": 1.3362784471218205, "grad_norm": 0.3917767190914898, "learning_rate": 8.423498283959487e-06, "loss": 0.13038911819458007, "step": 3120 }, { "epoch": 1.3405622489959839, "grad_norm": 0.38109850464604067, "learning_rate": 8.40985513272667e-06, "loss": 0.13978877067565917, "step": 3130 }, { "epoch": 1.3448460508701472, "grad_norm": 0.4445890595042772, "learning_rate": 8.39616435414615e-06, "loss": 0.13834033012390137, "step": 3140 }, { "epoch": 1.3491298527443105, "grad_norm": 0.31874973345433283, "learning_rate": 8.38242613944466e-06, "loss": 0.1258203625679016, "step": 3150 }, { "epoch": 1.3534136546184738, "grad_norm": 0.33127205404029225, "learning_rate": 8.368640680511507e-06, "loss": 0.12356986999511718, "step": 3160 }, { "epoch": 1.3576974564926372, "grad_norm": 0.416559211705474, "learning_rate": 8.35480816989588e-06, "loss": 0.11982156038284301, "step": 3170 }, { "epoch": 1.3619812583668005, "grad_norm": 0.3660453384090912, "learning_rate": 8.34092880080417e-06, "loss": 0.11788184642791748, "step": 3180 }, { "epoch": 1.3662650602409638, "grad_norm": 0.34339125686903177, "learning_rate": 8.32700276709726e-06, "loss": 0.13102638721466064, "step": 3190 }, { "epoch": 1.3705488621151272, "grad_norm": 0.35489193494077403, "learning_rate": 8.313030263287825e-06, "loss": 0.1122696876525879, "step": 3200 }, { "epoch": 1.3748326639892905, "grad_norm": 0.3746174683003833, "learning_rate": 8.299011484537621e-06, "loss": 0.1276139497756958, "step": 3210 }, { "epoch": 1.3791164658634538, "grad_norm": 0.7123969948931433, "learning_rate": 8.284946626654743e-06, "loss": 0.1328984022140503, "step": 3220 }, { "epoch": 1.3834002677376172, "grad_norm": 0.3822847406441411, "learning_rate": 8.270835886090901e-06, "loss": 0.11024882793426513, "step": 3230 }, { "epoch": 1.3876840696117805, "grad_norm": 0.300720958006405, "learning_rate": 8.256679459938681e-06, "loss": 0.11192436218261718, "step": 3240 }, { "epoch": 1.3919678714859438, "grad_norm": 0.36933913785412426, "learning_rate": 8.242477545928775e-06, "loss": 0.1279488682746887, "step": 3250 }, { "epoch": 1.3962516733601071, "grad_norm": 0.3733629104677544, "learning_rate": 8.228230342427237e-06, "loss": 0.12411469221115112, "step": 3260 }, { "epoch": 1.4005354752342705, "grad_norm": 0.37527660608807045, "learning_rate": 8.213938048432697e-06, "loss": 0.12071568965911865, "step": 3270 }, { "epoch": 1.4048192771084338, "grad_norm": 0.32477552350056993, "learning_rate": 8.199600863573599e-06, "loss": 0.10580611228942871, "step": 3280 }, { "epoch": 1.4091030789825971, "grad_norm": 0.4717886686044222, "learning_rate": 8.185218988105392e-06, "loss": 0.14088404178619385, "step": 3290 }, { "epoch": 1.4133868808567605, "grad_norm": 0.3538053722734618, "learning_rate": 8.170792622907751e-06, "loss": 0.14626517295837402, "step": 3300 }, { "epoch": 1.4176706827309236, "grad_norm": 0.3513040684652719, "learning_rate": 8.156321969481762e-06, "loss": 0.11440718173980713, "step": 3310 }, { "epoch": 1.421954484605087, "grad_norm": 0.3591684736408224, "learning_rate": 8.14180722994711e-06, "loss": 0.13487778902053832, "step": 3320 }, { "epoch": 1.4262382864792502, "grad_norm": 0.3658104845595591, "learning_rate": 8.127248607039254e-06, "loss": 0.11574537754058838, "step": 3330 }, { "epoch": 1.4305220883534138, "grad_norm": 0.3197726257189657, "learning_rate": 8.112646304106593e-06, "loss": 0.12187765836715699, "step": 3340 }, { "epoch": 1.4348058902275769, "grad_norm": 0.3846940368635854, "learning_rate": 8.09800052510764e-06, "loss": 0.11478321552276612, "step": 3350 }, { "epoch": 1.4390896921017404, "grad_norm": 0.4176339612370988, "learning_rate": 8.08331147460815e-06, "loss": 0.1217038869857788, "step": 3360 }, { "epoch": 1.4433734939759035, "grad_norm": 0.37885782340374674, "learning_rate": 8.068579357778284e-06, "loss": 0.12176965475082398, "step": 3370 }, { "epoch": 1.4476572958500669, "grad_norm": 0.41287039590285307, "learning_rate": 8.053804380389728e-06, "loss": 0.12061818838119506, "step": 3380 }, { "epoch": 1.4519410977242302, "grad_norm": 0.3494750223733423, "learning_rate": 8.038986748812832e-06, "loss": 0.131140398979187, "step": 3390 }, { "epoch": 1.4562248995983935, "grad_norm": 0.3952195504175884, "learning_rate": 8.024126670013716e-06, "loss": 0.11915416717529297, "step": 3400 }, { "epoch": 1.4605087014725568, "grad_norm": 0.3737316963804442, "learning_rate": 8.009224351551386e-06, "loss": 0.11794298887252808, "step": 3410 }, { "epoch": 1.4647925033467202, "grad_norm": 0.3446468035702987, "learning_rate": 7.99428000157483e-06, "loss": 0.1277950167655945, "step": 3420 }, { "epoch": 1.4690763052208835, "grad_norm": 0.37272918562452995, "learning_rate": 7.979293828820119e-06, "loss": 0.14721099138259888, "step": 3430 }, { "epoch": 1.4733601070950468, "grad_norm": 0.33085072922732706, "learning_rate": 7.96426604260748e-06, "loss": 0.11756453514099122, "step": 3440 }, { "epoch": 1.4776439089692102, "grad_norm": 0.320178401178284, "learning_rate": 7.949196852838383e-06, "loss": 0.1269507050514221, "step": 3450 }, { "epoch": 1.4819277108433735, "grad_norm": 0.3580459421820677, "learning_rate": 7.934086469992605e-06, "loss": 0.1412634253501892, "step": 3460 }, { "epoch": 1.4862115127175368, "grad_norm": 0.36913989344261383, "learning_rate": 7.918935105125283e-06, "loss": 0.15048539638519287, "step": 3470 }, { "epoch": 1.4904953145917001, "grad_norm": 0.38425139309308326, "learning_rate": 7.903742969863982e-06, "loss": 0.13397784233093263, "step": 3480 }, { "epoch": 1.4947791164658635, "grad_norm": 0.3627531984044689, "learning_rate": 7.88851027640572e-06, "loss": 0.11737120151519775, "step": 3490 }, { "epoch": 1.4990629183400268, "grad_norm": 0.30678086877528343, "learning_rate": 7.873237237514024e-06, "loss": 0.1271947741508484, "step": 3500 }, { "epoch": 1.4990629183400268, "eval_loss": 0.14634032547473907, "eval_runtime": 11508.5453, "eval_samples_per_second": 1.442, "eval_steps_per_second": 0.361, "step": 3500 }, { "epoch": 1.5033467202141901, "grad_norm": 0.3989430501599751, "learning_rate": 7.857924066515941e-06, "loss": 0.1253154993057251, "step": 3510 }, { "epoch": 1.5076305220883535, "grad_norm": 0.3712393247049027, "learning_rate": 7.842570977299067e-06, "loss": 0.13159399032592772, "step": 3520 }, { "epoch": 1.5119143239625168, "grad_norm": 0.39043985321189406, "learning_rate": 7.827178184308559e-06, "loss": 0.12818803787231445, "step": 3530 }, { "epoch": 1.51619812583668, "grad_norm": 0.3783719248133356, "learning_rate": 7.81174590254414e-06, "loss": 0.12482264041900634, "step": 3540 }, { "epoch": 1.5204819277108435, "grad_norm": 0.33627341086836304, "learning_rate": 7.796274347557094e-06, "loss": 0.1259792685508728, "step": 3550 }, { "epoch": 1.5247657295850066, "grad_norm": 0.3307003231873695, "learning_rate": 7.780763735447252e-06, "loss": 0.11816374063491822, "step": 3560 }, { "epoch": 1.52904953145917, "grad_norm": 0.41275730039950287, "learning_rate": 7.765214282859981e-06, "loss": 0.12664893865585328, "step": 3570 }, { "epoch": 1.5333333333333332, "grad_norm": 0.3033638102712773, "learning_rate": 7.749626206983157e-06, "loss": 0.1236607551574707, "step": 3580 }, { "epoch": 1.5376171352074968, "grad_norm": 0.3554700928985279, "learning_rate": 7.733999725544126e-06, "loss": 0.12761454582214354, "step": 3590 }, { "epoch": 1.5419009370816599, "grad_norm": 0.35291611398156203, "learning_rate": 7.718335056806665e-06, "loss": 0.1287233352661133, "step": 3600 }, { "epoch": 1.5461847389558234, "grad_norm": 0.3567309323303257, "learning_rate": 7.702632419567937e-06, "loss": 0.14273253679275513, "step": 3610 }, { "epoch": 1.5504685408299865, "grad_norm": 0.3185688429925057, "learning_rate": 7.68689203315543e-06, "loss": 0.12159850597381591, "step": 3620 }, { "epoch": 1.55475234270415, "grad_norm": 0.36346114221079345, "learning_rate": 7.671114117423896e-06, "loss": 0.12236592769622803, "step": 3630 }, { "epoch": 1.5590361445783132, "grad_norm": 0.28944951523091206, "learning_rate": 7.655298892752281e-06, "loss": 0.1200286865234375, "step": 3640 }, { "epoch": 1.5633199464524767, "grad_norm": 0.3320579302602756, "learning_rate": 7.639446580040647e-06, "loss": 0.13653804063796998, "step": 3650 }, { "epoch": 1.5676037483266398, "grad_norm": 0.3367309725103469, "learning_rate": 7.623557400707081e-06, "loss": 0.12761712074279785, "step": 3660 }, { "epoch": 1.5718875502008032, "grad_norm": 0.36883406849675304, "learning_rate": 7.607631576684611e-06, "loss": 0.12503886222839355, "step": 3670 }, { "epoch": 1.5761713520749665, "grad_norm": 0.35021731907363346, "learning_rate": 7.5916693304181e-06, "loss": 0.11194202899932862, "step": 3680 }, { "epoch": 1.5804551539491298, "grad_norm": 0.3466816731323651, "learning_rate": 7.575670884861142e-06, "loss": 0.11533315181732177, "step": 3690 }, { "epoch": 1.5847389558232932, "grad_norm": 0.3695468619685566, "learning_rate": 7.559636463472941e-06, "loss": 0.12558252811431886, "step": 3700 }, { "epoch": 1.5890227576974565, "grad_norm": 0.33317140225660996, "learning_rate": 7.543566290215205e-06, "loss": 0.11223011016845703, "step": 3710 }, { "epoch": 1.5933065595716198, "grad_norm": 0.3802726049715593, "learning_rate": 7.5274605895490014e-06, "loss": 0.11428353786468506, "step": 3720 }, { "epoch": 1.5975903614457831, "grad_norm": 0.3502543345535625, "learning_rate": 7.511319586431631e-06, "loss": 0.12747797966003419, "step": 3730 }, { "epoch": 1.6018741633199465, "grad_norm": 0.48600388474175416, "learning_rate": 7.495143506313484e-06, "loss": 0.12503063678741455, "step": 3740 }, { "epoch": 1.6061579651941098, "grad_norm": 0.402765639804346, "learning_rate": 7.478932575134887e-06, "loss": 0.1338959217071533, "step": 3750 }, { "epoch": 1.6104417670682731, "grad_norm": 0.3921866593643898, "learning_rate": 7.462687019322957e-06, "loss": 0.11669353246688843, "step": 3760 }, { "epoch": 1.6147255689424365, "grad_norm": 0.33652188082752615, "learning_rate": 7.446407065788428e-06, "loss": 0.12007842063903809, "step": 3770 }, { "epoch": 1.6190093708165998, "grad_norm": 0.4118194067707435, "learning_rate": 7.4300929419224866e-06, "loss": 0.12169758081436158, "step": 3780 }, { "epoch": 1.623293172690763, "grad_norm": 0.36857356877163894, "learning_rate": 7.413744875593597e-06, "loss": 0.12564884424209594, "step": 3790 }, { "epoch": 1.6275769745649264, "grad_norm": 0.350693413841003, "learning_rate": 7.397363095144318e-06, "loss": 0.12418256998062134, "step": 3800 }, { "epoch": 1.6318607764390896, "grad_norm": 0.3712677998496879, "learning_rate": 7.380947829388108e-06, "loss": 0.12151600122451782, "step": 3810 }, { "epoch": 1.636144578313253, "grad_norm": 0.4183039288576934, "learning_rate": 7.364499307606136e-06, "loss": 0.11588020324707031, "step": 3820 }, { "epoch": 1.6404283801874162, "grad_norm": 0.39624838378484395, "learning_rate": 7.348017759544075e-06, "loss": 0.12545753717422486, "step": 3830 }, { "epoch": 1.6447121820615798, "grad_norm": 0.335543915765519, "learning_rate": 7.331503415408899e-06, "loss": 0.11865659952163696, "step": 3840 }, { "epoch": 1.6489959839357429, "grad_norm": 0.30699590589486353, "learning_rate": 7.3149565058656545e-06, "loss": 0.11257133483886719, "step": 3850 }, { "epoch": 1.6532797858099064, "grad_norm": 0.4211864176178027, "learning_rate": 7.298377262034258e-06, "loss": 0.12412948608398437, "step": 3860 }, { "epoch": 1.6575635876840695, "grad_norm": 0.5480668142726313, "learning_rate": 7.281765915486247e-06, "loss": 0.11110868453979492, "step": 3870 }, { "epoch": 1.661847389558233, "grad_norm": 0.38707346036306395, "learning_rate": 7.265122698241562e-06, "loss": 0.1353888154029846, "step": 3880 }, { "epoch": 1.6661311914323962, "grad_norm": 0.3496172916459521, "learning_rate": 7.248447842765298e-06, "loss": 0.12294532060623169, "step": 3890 }, { "epoch": 1.6704149933065597, "grad_norm": 0.3178905513170639, "learning_rate": 7.231741581964455e-06, "loss": 0.11635351181030273, "step": 3900 }, { "epoch": 1.6746987951807228, "grad_norm": 0.37730744546548595, "learning_rate": 7.2150041491846965e-06, "loss": 0.13707247972488404, "step": 3910 }, { "epoch": 1.6789825970548862, "grad_norm": 0.4258774014748926, "learning_rate": 7.198235778207072e-06, "loss": 0.11108559370040894, "step": 3920 }, { "epoch": 1.6832663989290495, "grad_norm": 0.3269689561831232, "learning_rate": 7.181436703244773e-06, "loss": 0.13123619556427002, "step": 3930 }, { "epoch": 1.6875502008032128, "grad_norm": 0.3328432989440898, "learning_rate": 7.1646071589398406e-06, "loss": 0.11167018413543701, "step": 3940 }, { "epoch": 1.6918340026773762, "grad_norm": 0.3844316794696797, "learning_rate": 7.147747380359905e-06, "loss": 0.11800698041915894, "step": 3950 }, { "epoch": 1.6961178045515395, "grad_norm": 0.33099179444642823, "learning_rate": 7.130857602994894e-06, "loss": 0.13457157611846923, "step": 3960 }, { "epoch": 1.7004016064257028, "grad_norm": 0.3147285218500962, "learning_rate": 7.113938062753742e-06, "loss": 0.13172318935394287, "step": 3970 }, { "epoch": 1.7046854082998661, "grad_norm": 0.3592833207498237, "learning_rate": 7.0969889959611045e-06, "loss": 0.1196314811706543, "step": 3980 }, { "epoch": 1.7089692101740295, "grad_norm": 0.29811223409083043, "learning_rate": 7.080010639354045e-06, "loss": 0.11256670951843262, "step": 3990 }, { "epoch": 1.7132530120481928, "grad_norm": 0.4270868815948092, "learning_rate": 7.063003230078734e-06, "loss": 0.12309803962707519, "step": 4000 }, { "epoch": 1.7132530120481928, "eval_loss": 0.144321471452713, "eval_runtime": 817.2721, "eval_samples_per_second": 20.311, "eval_steps_per_second": 5.078, "step": 4000 }, { "epoch": 1.7175368139223561, "grad_norm": 0.3733167797076492, "learning_rate": 7.045967005687141e-06, "loss": 0.11690073013305664, "step": 4010 }, { "epoch": 1.7218206157965195, "grad_norm": 0.32612931848843507, "learning_rate": 7.028902204133711e-06, "loss": 0.1235615611076355, "step": 4020 }, { "epoch": 1.7261044176706828, "grad_norm": 0.3537546537362819, "learning_rate": 7.011809063772038e-06, "loss": 0.1282111883163452, "step": 4030 }, { "epoch": 1.730388219544846, "grad_norm": 0.41855495134878623, "learning_rate": 6.994687823351547e-06, "loss": 0.13276000022888185, "step": 4040 }, { "epoch": 1.7346720214190094, "grad_norm": 0.3640723677373699, "learning_rate": 6.9775387220141465e-06, "loss": 0.12338956594467163, "step": 4050 }, { "epoch": 1.7389558232931726, "grad_norm": 0.348482478201222, "learning_rate": 6.960361999290894e-06, "loss": 0.1142328143119812, "step": 4060 }, { "epoch": 1.743239625167336, "grad_norm": 0.41291989661610773, "learning_rate": 6.943157895098656e-06, "loss": 0.12496788501739502, "step": 4070 }, { "epoch": 1.7475234270414992, "grad_norm": 0.31746340210362767, "learning_rate": 6.925926649736745e-06, "loss": 0.11045465469360352, "step": 4080 }, { "epoch": 1.7518072289156628, "grad_norm": 0.32179304285895316, "learning_rate": 6.9086685038835725e-06, "loss": 0.13367241621017456, "step": 4090 }, { "epoch": 1.7560910307898259, "grad_norm": 0.3467502021616522, "learning_rate": 6.891383698593283e-06, "loss": 0.11450705528259278, "step": 4100 }, { "epoch": 1.7603748326639894, "grad_norm": 0.37824785627911034, "learning_rate": 6.874072475292388e-06, "loss": 0.11085845232009887, "step": 4110 }, { "epoch": 1.7646586345381525, "grad_norm": 0.33242640245264393, "learning_rate": 6.856735075776395e-06, "loss": 0.12101356983184815, "step": 4120 }, { "epoch": 1.768942436412316, "grad_norm": 0.3295693613929198, "learning_rate": 6.839371742206432e-06, "loss": 0.11143279075622559, "step": 4130 }, { "epoch": 1.7732262382864792, "grad_norm": 0.41043258389255455, "learning_rate": 6.821982717105855e-06, "loss": 0.11657199859619141, "step": 4140 }, { "epoch": 1.7775100401606427, "grad_norm": 0.3336241961556357, "learning_rate": 6.804568243356876e-06, "loss": 0.12107970714569091, "step": 4150 }, { "epoch": 1.7817938420348058, "grad_norm": 0.404764797519025, "learning_rate": 6.7871285641971576e-06, "loss": 0.12142288684844971, "step": 4160 }, { "epoch": 1.7860776439089692, "grad_norm": 0.35528280014790076, "learning_rate": 6.769663923216419e-06, "loss": 0.14445422887802123, "step": 4170 }, { "epoch": 1.7903614457831325, "grad_norm": 0.36424811344112645, "learning_rate": 6.75217456435304e-06, "loss": 0.11748452186584472, "step": 4180 }, { "epoch": 1.7946452476572958, "grad_norm": 0.3580564279402089, "learning_rate": 6.734660731890645e-06, "loss": 0.11877243518829346, "step": 4190 }, { "epoch": 1.7989290495314592, "grad_norm": 0.3945693311810663, "learning_rate": 6.717122670454701e-06, "loss": 0.12274388074874878, "step": 4200 }, { "epoch": 1.8032128514056225, "grad_norm": 0.3274495553953029, "learning_rate": 6.699560625009085e-06, "loss": 0.11418673992156983, "step": 4210 }, { "epoch": 1.8074966532797858, "grad_norm": 0.43405948322435506, "learning_rate": 6.6819748408526775e-06, "loss": 0.11989142894744872, "step": 4220 }, { "epoch": 1.8117804551539491, "grad_norm": 0.34302792735595455, "learning_rate": 6.6643655636159325e-06, "loss": 0.10752333402633667, "step": 4230 }, { "epoch": 1.8160642570281125, "grad_norm": 0.38396837748701773, "learning_rate": 6.646733039257442e-06, "loss": 0.12758421897888184, "step": 4240 }, { "epoch": 1.8203480589022758, "grad_norm": 0.35943197748111966, "learning_rate": 6.629077514060501e-06, "loss": 0.11687214374542236, "step": 4250 }, { "epoch": 1.8246318607764391, "grad_norm": 0.33900665564961463, "learning_rate": 6.611399234629679e-06, "loss": 0.1235961675643921, "step": 4260 }, { "epoch": 1.8289156626506025, "grad_norm": 0.36539098779168305, "learning_rate": 6.593698447887357e-06, "loss": 0.12241628170013427, "step": 4270 }, { "epoch": 1.8331994645247658, "grad_norm": 0.38361329899883734, "learning_rate": 6.575975401070291e-06, "loss": 0.12448443174362182, "step": 4280 }, { "epoch": 1.837483266398929, "grad_norm": 0.4014122394041882, "learning_rate": 6.5582303417261605e-06, "loss": 0.1193004846572876, "step": 4290 }, { "epoch": 1.8417670682730924, "grad_norm": 0.3678903848404944, "learning_rate": 6.540463517710099e-06, "loss": 0.1212453842163086, "step": 4300 }, { "epoch": 1.8460508701472556, "grad_norm": 0.3251163301086072, "learning_rate": 6.5226751771812476e-06, "loss": 0.12798908948898316, "step": 4310 }, { "epoch": 1.850334672021419, "grad_norm": 0.3415099254328554, "learning_rate": 6.5048655685992705e-06, "loss": 0.13018690347671508, "step": 4320 }, { "epoch": 1.8546184738955822, "grad_norm": 0.3905905047279772, "learning_rate": 6.487034940720902e-06, "loss": 0.12057719230651856, "step": 4330 }, { "epoch": 1.8589022757697458, "grad_norm": 0.3646836032160996, "learning_rate": 6.469183542596464e-06, "loss": 0.13052282333374024, "step": 4340 }, { "epoch": 1.8631860776439089, "grad_norm": 0.33435104754269, "learning_rate": 6.451311623566386e-06, "loss": 0.11543186902999877, "step": 4350 }, { "epoch": 1.8674698795180724, "grad_norm": 0.3562601136655919, "learning_rate": 6.433419433257726e-06, "loss": 0.12250864505767822, "step": 4360 }, { "epoch": 1.8717536813922355, "grad_norm": 0.3226539154934918, "learning_rate": 6.415507221580678e-06, "loss": 0.12082786560058593, "step": 4370 }, { "epoch": 1.876037483266399, "grad_norm": 0.3635681601652211, "learning_rate": 6.397575238725091e-06, "loss": 0.12619302272796631, "step": 4380 }, { "epoch": 1.8803212851405622, "grad_norm": 0.3607934399845053, "learning_rate": 6.379623735156968e-06, "loss": 0.12855522632598876, "step": 4390 }, { "epoch": 1.8846050870147257, "grad_norm": 0.33220984445822355, "learning_rate": 6.361652961614966e-06, "loss": 0.11576036214828492, "step": 4400 }, { "epoch": 1.8888888888888888, "grad_norm": 0.3957629269071009, "learning_rate": 6.343663169106897e-06, "loss": 0.12123892307281495, "step": 4410 }, { "epoch": 1.8931726907630522, "grad_norm": 0.38648528140436955, "learning_rate": 6.325654608906228e-06, "loss": 0.13391902446746826, "step": 4420 }, { "epoch": 1.8974564926372155, "grad_norm": 0.322831029116286, "learning_rate": 6.307627532548554e-06, "loss": 0.11682146787643433, "step": 4430 }, { "epoch": 1.9017402945113788, "grad_norm": 0.34943896220332243, "learning_rate": 6.289582191828102e-06, "loss": 0.10885384082794189, "step": 4440 }, { "epoch": 1.9060240963855422, "grad_norm": 0.3885943387224764, "learning_rate": 6.2715188387942085e-06, "loss": 0.11223304271697998, "step": 4450 }, { "epoch": 1.9103078982597055, "grad_norm": 0.336737608622642, "learning_rate": 6.253437725747795e-06, "loss": 0.11982736587524415, "step": 4460 }, { "epoch": 1.9145917001338688, "grad_norm": 0.3692087496930761, "learning_rate": 6.235339105237849e-06, "loss": 0.12470091581344604, "step": 4470 }, { "epoch": 1.9188755020080321, "grad_norm": 0.39542747620483304, "learning_rate": 6.217223230057891e-06, "loss": 0.1260706901550293, "step": 4480 }, { "epoch": 1.9231593038821955, "grad_norm": 0.36423974792020714, "learning_rate": 6.199090353242452e-06, "loss": 0.11962894201278687, "step": 4490 }, { "epoch": 1.9274431057563588, "grad_norm": 0.36962654876484385, "learning_rate": 6.18094072806353e-06, "loss": 0.11819722652435302, "step": 4500 }, { "epoch": 1.9274431057563588, "eval_loss": 0.14271628856658936, "eval_runtime": 1019.4039, "eval_samples_per_second": 16.284, "eval_steps_per_second": 4.071, "step": 4500 } ], "logging_steps": 10, "max_steps": 9340, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 481757134651392.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }