{ "best_metric": 0.08512182533740997, "best_model_checkpoint": "./model_dir\\checkpoint-12900", "epoch": 4.0, "eval_steps": 100, "global_step": 19736, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002026753141467369, "grad_norm": 3.2063848972320557, "learning_rate": 0.00019989866234292666, "loss": 0.5934, "step": 10 }, { "epoch": 0.004053506282934738, "grad_norm": 23.631210327148438, "learning_rate": 0.00019979732468585327, "loss": 0.6803, "step": 20 }, { "epoch": 0.006080259424402107, "grad_norm": 0.24470672011375427, "learning_rate": 0.0001996959870287799, "loss": 0.4146, "step": 30 }, { "epoch": 0.008107012565869477, "grad_norm": 0.6175822615623474, "learning_rate": 0.00019959464937170654, "loss": 0.8121, "step": 40 }, { "epoch": 0.010133765707336847, "grad_norm": 0.15328890085220337, "learning_rate": 0.00019949331171463316, "loss": 0.1752, "step": 50 }, { "epoch": 0.012160518848804215, "grad_norm": 13.491938591003418, "learning_rate": 0.0001993919740575598, "loss": 0.7451, "step": 60 }, { "epoch": 0.014187271990271585, "grad_norm": 0.1225113719701767, "learning_rate": 0.00019929063640048645, "loss": 0.2662, "step": 70 }, { "epoch": 0.016214025131738953, "grad_norm": 0.08415812253952026, "learning_rate": 0.00019918929874341304, "loss": 0.4252, "step": 80 }, { "epoch": 0.018240778273206325, "grad_norm": 0.07037778943777084, "learning_rate": 0.0001990879610863397, "loss": 0.1327, "step": 90 }, { "epoch": 0.020267531414673693, "grad_norm": 0.06210090592503548, "learning_rate": 0.00019898662342926634, "loss": 0.8814, "step": 100 }, { "epoch": 0.020267531414673693, "eval_accuracy": 0.76007326007326, "eval_loss": 0.9432225227355957, "eval_runtime": 46.1647, "eval_samples_per_second": 11.827, "eval_steps_per_second": 1.495, "step": 100 }, { "epoch": 0.02229428455614106, "grad_norm": 0.09677636623382568, "learning_rate": 0.00019888528577219296, "loss": 0.048, "step": 110 }, { "epoch": 0.02432103769760843, "grad_norm": 0.06678306311368942, "learning_rate": 0.00019878394811511957, "loss": 0.6436, "step": 120 }, { "epoch": 0.0263477908390758, "grad_norm": 0.0650462880730629, "learning_rate": 0.00019868261045804622, "loss": 0.3952, "step": 130 }, { "epoch": 0.02837454398054317, "grad_norm": 0.06772352010011673, "learning_rate": 0.00019858127280097284, "loss": 0.0091, "step": 140 }, { "epoch": 0.030401297122010538, "grad_norm": 0.12076238542795181, "learning_rate": 0.00019847993514389949, "loss": 1.0189, "step": 150 }, { "epoch": 0.032428050263477906, "grad_norm": 0.06581564247608185, "learning_rate": 0.00019837859748682613, "loss": 0.4685, "step": 160 }, { "epoch": 0.03445480340494528, "grad_norm": 0.09730742871761322, "learning_rate": 0.00019827725982975272, "loss": 0.4774, "step": 170 }, { "epoch": 0.03648155654641265, "grad_norm": 0.09050973504781723, "learning_rate": 0.00019817592217267937, "loss": 0.7342, "step": 180 }, { "epoch": 0.038508309687880014, "grad_norm": 0.0777946189045906, "learning_rate": 0.00019807458451560602, "loss": 0.4465, "step": 190 }, { "epoch": 0.040535062829347386, "grad_norm": 9.031527519226074, "learning_rate": 0.00019797324685853264, "loss": 0.4374, "step": 200 }, { "epoch": 0.040535062829347386, "eval_accuracy": 0.8864468864468864, "eval_loss": 0.4927012622356415, "eval_runtime": 50.3717, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.37, "step": 200 }, { "epoch": 0.04256181597081476, "grad_norm": 0.1537230908870697, "learning_rate": 0.00019787190920145928, "loss": 0.0122, "step": 210 }, { "epoch": 0.04458856911228212, "grad_norm": 0.06793428212404251, "learning_rate": 0.0001977705715443859, "loss": 0.3605, "step": 220 }, { "epoch": 0.046615322253749494, "grad_norm": 0.05865870788693428, "learning_rate": 0.00019766923388731252, "loss": 0.4666, "step": 230 }, { "epoch": 0.04864207539521686, "grad_norm": 0.07028614729642868, "learning_rate": 0.00019756789623023917, "loss": 0.5873, "step": 240 }, { "epoch": 0.05066882853668423, "grad_norm": 13.204813003540039, "learning_rate": 0.0001974665585731658, "loss": 0.1207, "step": 250 }, { "epoch": 0.0526955816781516, "grad_norm": 0.0801391452550888, "learning_rate": 0.00019736522091609243, "loss": 0.0369, "step": 260 }, { "epoch": 0.05472233481961897, "grad_norm": 0.06529464572668076, "learning_rate": 0.00019726388325901905, "loss": 0.0137, "step": 270 }, { "epoch": 0.05674908796108634, "grad_norm": 0.0377487987279892, "learning_rate": 0.0001971625456019457, "loss": 0.4132, "step": 280 }, { "epoch": 0.05877584110255371, "grad_norm": 0.0332893468439579, "learning_rate": 0.00019706120794487232, "loss": 0.0059, "step": 290 }, { "epoch": 0.060802594244021076, "grad_norm": 0.0327690914273262, "learning_rate": 0.00019695987028779896, "loss": 0.0042, "step": 300 }, { "epoch": 0.060802594244021076, "eval_accuracy": 0.9267399267399268, "eval_loss": 0.35336464643478394, "eval_runtime": 49.466, "eval_samples_per_second": 11.038, "eval_steps_per_second": 1.395, "step": 300 }, { "epoch": 0.06282934738548845, "grad_norm": 0.029724499210715294, "learning_rate": 0.00019685853263072558, "loss": 0.004, "step": 310 }, { "epoch": 0.06485610052695581, "grad_norm": 0.05020628869533539, "learning_rate": 0.00019675719497365223, "loss": 0.5528, "step": 320 }, { "epoch": 0.06688285366842318, "grad_norm": 0.03226783871650696, "learning_rate": 0.00019665585731657885, "loss": 0.0087, "step": 330 }, { "epoch": 0.06890960680989056, "grad_norm": 0.03420436754822731, "learning_rate": 0.0001965545196595055, "loss": 0.0036, "step": 340 }, { "epoch": 0.07093635995135793, "grad_norm": 0.045836128294467926, "learning_rate": 0.0001964531820024321, "loss": 1.3375, "step": 350 }, { "epoch": 0.0729631130928253, "grad_norm": 0.8065479397773743, "learning_rate": 0.00019635184434535873, "loss": 0.5225, "step": 360 }, { "epoch": 0.07498986623429266, "grad_norm": 0.0850081741809845, "learning_rate": 0.00019625050668828538, "loss": 0.0174, "step": 370 }, { "epoch": 0.07701661937576003, "grad_norm": 0.10884834825992584, "learning_rate": 0.00019614916903121202, "loss": 0.9653, "step": 380 }, { "epoch": 0.0790433725172274, "grad_norm": 0.07834824174642563, "learning_rate": 0.00019604783137413864, "loss": 0.4623, "step": 390 }, { "epoch": 0.08107012565869477, "grad_norm": 0.09319596737623215, "learning_rate": 0.00019594649371706526, "loss": 0.0093, "step": 400 }, { "epoch": 0.08107012565869477, "eval_accuracy": 0.9413919413919414, "eval_loss": 0.23352867364883423, "eval_runtime": 50.3492, "eval_samples_per_second": 10.844, "eval_steps_per_second": 1.37, "step": 400 }, { "epoch": 0.08309687880016214, "grad_norm": 0.06457238644361496, "learning_rate": 0.0001958451560599919, "loss": 1.0088, "step": 410 }, { "epoch": 0.08512363194162952, "grad_norm": 0.10535389930009842, "learning_rate": 0.00019574381840291853, "loss": 0.0167, "step": 420 }, { "epoch": 0.08715038508309687, "grad_norm": 0.12313750386238098, "learning_rate": 0.00019564248074584517, "loss": 1.3943, "step": 430 }, { "epoch": 0.08917713822456425, "grad_norm": 0.22795221209526062, "learning_rate": 0.0001955411430887718, "loss": 0.4093, "step": 440 }, { "epoch": 0.09120389136603162, "grad_norm": 0.09523962438106537, "learning_rate": 0.0001954398054316984, "loss": 0.0147, "step": 450 }, { "epoch": 0.09323064450749899, "grad_norm": 0.13858100771903992, "learning_rate": 0.00019533846777462506, "loss": 0.8304, "step": 460 }, { "epoch": 0.09525739764896636, "grad_norm": 0.2124408781528473, "learning_rate": 0.0001952371301175517, "loss": 0.011, "step": 470 }, { "epoch": 0.09728415079043372, "grad_norm": 0.04959016665816307, "learning_rate": 0.00019513579246047832, "loss": 0.0067, "step": 480 }, { "epoch": 0.09931090393190109, "grad_norm": 0.511265754699707, "learning_rate": 0.00019503445480340497, "loss": 0.0071, "step": 490 }, { "epoch": 0.10133765707336846, "grad_norm": 0.05785638839006424, "learning_rate": 0.0001949331171463316, "loss": 0.125, "step": 500 }, { "epoch": 0.10133765707336846, "eval_accuracy": 0.9285714285714286, "eval_loss": 0.36297860741615295, "eval_runtime": 50.3719, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.37, "step": 500 }, { "epoch": 0.10336441021483583, "grad_norm": 0.033664364367723465, "learning_rate": 0.0001948317794892582, "loss": 0.0036, "step": 510 }, { "epoch": 0.1053911633563032, "grad_norm": 0.03444075211882591, "learning_rate": 0.00019473044183218485, "loss": 1.1187, "step": 520 }, { "epoch": 0.10741791649777058, "grad_norm": 0.03646405413746834, "learning_rate": 0.0001946291041751115, "loss": 0.0067, "step": 530 }, { "epoch": 0.10944466963923793, "grad_norm": 0.037678416818380356, "learning_rate": 0.0001945277665180381, "loss": 0.5384, "step": 540 }, { "epoch": 0.1114714227807053, "grad_norm": 9.350079536437988, "learning_rate": 0.00019442642886096474, "loss": 1.0734, "step": 550 }, { "epoch": 0.11349817592217268, "grad_norm": 0.11477504670619965, "learning_rate": 0.00019432509120389138, "loss": 1.0244, "step": 560 }, { "epoch": 0.11552492906364005, "grad_norm": 0.09600544720888138, "learning_rate": 0.000194223753546818, "loss": 0.014, "step": 570 }, { "epoch": 0.11755168220510742, "grad_norm": 0.08096398413181305, "learning_rate": 0.00019412241588974465, "loss": 0.0123, "step": 580 }, { "epoch": 0.1195784353465748, "grad_norm": 0.05573810264468193, "learning_rate": 0.00019402107823267127, "loss": 0.0086, "step": 590 }, { "epoch": 0.12160518848804215, "grad_norm": 0.05102921277284622, "learning_rate": 0.0001939197405755979, "loss": 0.4924, "step": 600 }, { "epoch": 0.12160518848804215, "eval_accuracy": 0.9468864468864469, "eval_loss": 0.23736581206321716, "eval_runtime": 47.7703, "eval_samples_per_second": 11.43, "eval_steps_per_second": 1.444, "step": 600 }, { "epoch": 0.12363194162950952, "grad_norm": 95.2990951538086, "learning_rate": 0.00019381840291852453, "loss": 0.2859, "step": 610 }, { "epoch": 0.1256586947709769, "grad_norm": 0.043713510036468506, "learning_rate": 0.00019371706526145118, "loss": 0.4951, "step": 620 }, { "epoch": 0.12768544791244427, "grad_norm": 0.057737067341804504, "learning_rate": 0.0001936157276043778, "loss": 0.9175, "step": 630 }, { "epoch": 0.12971220105391162, "grad_norm": 0.07625377923250198, "learning_rate": 0.00019351438994730442, "loss": 0.287, "step": 640 }, { "epoch": 0.131738954195379, "grad_norm": 0.0649242177605629, "learning_rate": 0.00019341305229023106, "loss": 0.0102, "step": 650 }, { "epoch": 0.13376570733684637, "grad_norm": 0.07740580290555954, "learning_rate": 0.00019331171463315768, "loss": 0.008, "step": 660 }, { "epoch": 0.13579246047831375, "grad_norm": 0.043081168085336685, "learning_rate": 0.00019321037697608433, "loss": 0.0063, "step": 670 }, { "epoch": 0.1378192136197811, "grad_norm": 0.03584828972816467, "learning_rate": 0.00019310903931901098, "loss": 0.0044, "step": 680 }, { "epoch": 0.13984596676124847, "grad_norm": 0.05630917102098465, "learning_rate": 0.00019300770166193757, "loss": 0.5407, "step": 690 }, { "epoch": 0.14187271990271585, "grad_norm": 0.037703562527894974, "learning_rate": 0.00019290636400486421, "loss": 0.0052, "step": 700 }, { "epoch": 0.14187271990271585, "eval_accuracy": 0.9487179487179487, "eval_loss": 0.20145167410373688, "eval_runtime": 46.9732, "eval_samples_per_second": 11.624, "eval_steps_per_second": 1.469, "step": 700 }, { "epoch": 0.1438994730441832, "grad_norm": 0.029574614018201828, "learning_rate": 0.00019280502634779086, "loss": 0.0041, "step": 710 }, { "epoch": 0.1459262261856506, "grad_norm": 0.04035133868455887, "learning_rate": 0.00019270368869071748, "loss": 0.0039, "step": 720 }, { "epoch": 0.14795297932711796, "grad_norm": 0.054472994059324265, "learning_rate": 0.0001926023510336441, "loss": 1.1169, "step": 730 }, { "epoch": 0.14997973246858531, "grad_norm": 8.792170524597168, "learning_rate": 0.00019250101337657074, "loss": 0.5607, "step": 740 }, { "epoch": 0.1520064856100527, "grad_norm": 0.05719525367021561, "learning_rate": 0.00019239967571949736, "loss": 0.9535, "step": 750 }, { "epoch": 0.15403323875152006, "grad_norm": 0.10215990990400314, "learning_rate": 0.000192298338062424, "loss": 0.0119, "step": 760 }, { "epoch": 0.15605999189298744, "grad_norm": 0.049983322620391846, "learning_rate": 0.00019219700040535066, "loss": 0.0235, "step": 770 }, { "epoch": 0.1580867450344548, "grad_norm": 0.044148072600364685, "learning_rate": 0.00019209566274827725, "loss": 0.4816, "step": 780 }, { "epoch": 0.1601134981759222, "grad_norm": 38.11033630371094, "learning_rate": 0.0001919943250912039, "loss": 0.2388, "step": 790 }, { "epoch": 0.16214025131738954, "grad_norm": 0.03907947614789009, "learning_rate": 0.00019189298743413054, "loss": 0.3738, "step": 800 }, { "epoch": 0.16214025131738954, "eval_accuracy": 0.8864468864468864, "eval_loss": 0.4199656844139099, "eval_runtime": 50.4489, "eval_samples_per_second": 10.823, "eval_steps_per_second": 1.368, "step": 800 }, { "epoch": 0.1641670044588569, "grad_norm": 0.08232033997774124, "learning_rate": 0.00019179164977705716, "loss": 0.7998, "step": 810 }, { "epoch": 0.1661937576003243, "grad_norm": 0.04074680432677269, "learning_rate": 0.0001916903121199838, "loss": 0.0489, "step": 820 }, { "epoch": 0.16822051074179165, "grad_norm": 12.359615325927734, "learning_rate": 0.00019158897446291042, "loss": 0.2878, "step": 830 }, { "epoch": 0.17024726388325903, "grad_norm": 7.305814743041992, "learning_rate": 0.00019148763680583704, "loss": 0.3854, "step": 840 }, { "epoch": 0.1722740170247264, "grad_norm": 0.03477887436747551, "learning_rate": 0.0001913862991487637, "loss": 0.0468, "step": 850 }, { "epoch": 0.17430077016619375, "grad_norm": 0.03299262002110481, "learning_rate": 0.00019128496149169034, "loss": 0.0444, "step": 860 }, { "epoch": 0.17632752330766113, "grad_norm": 0.029569240286946297, "learning_rate": 0.00019118362383461696, "loss": 0.0036, "step": 870 }, { "epoch": 0.1783542764491285, "grad_norm": 0.026648886501789093, "learning_rate": 0.00019108228617754357, "loss": 0.0497, "step": 880 }, { "epoch": 0.18038102959059588, "grad_norm": 0.0252667348831892, "learning_rate": 0.00019098094852047022, "loss": 0.4905, "step": 890 }, { "epoch": 0.18240778273206323, "grad_norm": 0.027241753414273262, "learning_rate": 0.00019087961086339684, "loss": 0.4533, "step": 900 }, { "epoch": 0.18240778273206323, "eval_accuracy": 0.9285714285714286, "eval_loss": 0.2573205232620239, "eval_runtime": 48.2075, "eval_samples_per_second": 11.326, "eval_steps_per_second": 1.431, "step": 900 }, { "epoch": 0.1844345358735306, "grad_norm": 0.7260812520980835, "learning_rate": 0.00019077827320632349, "loss": 0.6588, "step": 910 }, { "epoch": 0.18646128901499798, "grad_norm": 9.329017639160156, "learning_rate": 0.0001906769355492501, "loss": 1.0854, "step": 920 }, { "epoch": 0.18848804215646534, "grad_norm": 0.18261322379112244, "learning_rate": 0.00019057559789217675, "loss": 1.0766, "step": 930 }, { "epoch": 0.19051479529793272, "grad_norm": 0.22918297350406647, "learning_rate": 0.00019047426023510337, "loss": 0.2613, "step": 940 }, { "epoch": 0.19254154843940008, "grad_norm": 0.05047022923827171, "learning_rate": 0.00019037292257803002, "loss": 0.0534, "step": 950 }, { "epoch": 0.19456830158086744, "grad_norm": 0.03922040015459061, "learning_rate": 0.00019027158492095664, "loss": 1.0114, "step": 960 }, { "epoch": 0.19659505472233482, "grad_norm": 0.10960974544286728, "learning_rate": 0.00019017024726388325, "loss": 0.8342, "step": 970 }, { "epoch": 0.19862180786380218, "grad_norm": 0.1044323593378067, "learning_rate": 0.0001900689096068099, "loss": 0.8305, "step": 980 }, { "epoch": 0.20064856100526957, "grad_norm": 0.12734976410865784, "learning_rate": 0.00018996757194973655, "loss": 0.4303, "step": 990 }, { "epoch": 0.20267531414673692, "grad_norm": 0.07667859643697739, "learning_rate": 0.00018986623429266317, "loss": 0.027, "step": 1000 }, { "epoch": 0.20267531414673692, "eval_accuracy": 0.9120879120879121, "eval_loss": 0.34083420038223267, "eval_runtime": 50.3862, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.369, "step": 1000 }, { "epoch": 0.2047020672882043, "grad_norm": 0.04669175669550896, "learning_rate": 0.00018976489663558979, "loss": 0.3329, "step": 1010 }, { "epoch": 0.20672882042967167, "grad_norm": 0.0436105839908123, "learning_rate": 0.00018966355897851643, "loss": 0.416, "step": 1020 }, { "epoch": 0.20875557357113902, "grad_norm": 0.03847363963723183, "learning_rate": 0.00018956222132144305, "loss": 0.0052, "step": 1030 }, { "epoch": 0.2107823267126064, "grad_norm": 0.04070694372057915, "learning_rate": 0.0001894608836643697, "loss": 0.6871, "step": 1040 }, { "epoch": 0.21280907985407377, "grad_norm": 0.044938068836927414, "learning_rate": 0.00018935954600729632, "loss": 0.3686, "step": 1050 }, { "epoch": 0.21483583299554115, "grad_norm": 0.3830691874027252, "learning_rate": 0.00018925820835022293, "loss": 0.0704, "step": 1060 }, { "epoch": 0.2168625861370085, "grad_norm": 0.09455056488513947, "learning_rate": 0.00018915687069314958, "loss": 0.9455, "step": 1070 }, { "epoch": 0.21888933927847587, "grad_norm": 0.03371147811412811, "learning_rate": 0.00018905553303607623, "loss": 0.3658, "step": 1080 }, { "epoch": 0.22091609241994326, "grad_norm": 0.4871658980846405, "learning_rate": 0.00018895419537900285, "loss": 0.2985, "step": 1090 }, { "epoch": 0.2229428455614106, "grad_norm": 0.038024235516786575, "learning_rate": 0.0001888528577219295, "loss": 0.6685, "step": 1100 }, { "epoch": 0.2229428455614106, "eval_accuracy": 0.826007326007326, "eval_loss": 0.3140435516834259, "eval_runtime": 47.3231, "eval_samples_per_second": 11.538, "eval_steps_per_second": 1.458, "step": 1100 }, { "epoch": 0.224969598702878, "grad_norm": 0.03393388167023659, "learning_rate": 0.0001887515200648561, "loss": 0.1142, "step": 1110 }, { "epoch": 0.22699635184434536, "grad_norm": 0.030963918194174767, "learning_rate": 0.00018865018240778273, "loss": 0.1054, "step": 1120 }, { "epoch": 0.22902310498581271, "grad_norm": 0.036398351192474365, "learning_rate": 0.00018854884475070938, "loss": 0.6332, "step": 1130 }, { "epoch": 0.2310498581272801, "grad_norm": 0.04975204914808273, "learning_rate": 0.00018844750709363602, "loss": 0.7696, "step": 1140 }, { "epoch": 0.23307661126874746, "grad_norm": 0.04488310590386391, "learning_rate": 0.00018834616943656262, "loss": 0.1347, "step": 1150 }, { "epoch": 0.23510336441021484, "grad_norm": 24.52461051940918, "learning_rate": 0.00018824483177948926, "loss": 0.8782, "step": 1160 }, { "epoch": 0.2371301175516822, "grad_norm": 0.0589730367064476, "learning_rate": 0.0001881434941224159, "loss": 0.4471, "step": 1170 }, { "epoch": 0.2391568706931496, "grad_norm": 8.6546630859375, "learning_rate": 0.00018804215646534253, "loss": 0.5269, "step": 1180 }, { "epoch": 0.24118362383461694, "grad_norm": 81.45098876953125, "learning_rate": 0.00018794081880826917, "loss": 0.4896, "step": 1190 }, { "epoch": 0.2432103769760843, "grad_norm": 0.05012829601764679, "learning_rate": 0.0001878394811511958, "loss": 0.0703, "step": 1200 }, { "epoch": 0.2432103769760843, "eval_accuracy": 0.9322344322344323, "eval_loss": 0.2424902766942978, "eval_runtime": 47.8835, "eval_samples_per_second": 11.403, "eval_steps_per_second": 1.441, "step": 1200 }, { "epoch": 0.2452371301175517, "grad_norm": 0.045358020812273026, "learning_rate": 0.0001877381434941224, "loss": 0.213, "step": 1210 }, { "epoch": 0.24726388325901905, "grad_norm": 8.763317108154297, "learning_rate": 0.00018763680583704906, "loss": 1.51, "step": 1220 }, { "epoch": 0.24929063640048643, "grad_norm": 0.3494032621383667, "learning_rate": 0.0001875354681799757, "loss": 0.7853, "step": 1230 }, { "epoch": 0.2513173895419538, "grad_norm": 0.11207950115203857, "learning_rate": 0.00018743413052290232, "loss": 0.0299, "step": 1240 }, { "epoch": 0.25334414268342115, "grad_norm": 0.05717464163899422, "learning_rate": 0.00018733279286582894, "loss": 0.0085, "step": 1250 }, { "epoch": 0.25537089582488853, "grad_norm": 0.05250593274831772, "learning_rate": 0.0001872314552087556, "loss": 0.5135, "step": 1260 }, { "epoch": 0.2573976489663559, "grad_norm": 0.04397350549697876, "learning_rate": 0.0001871301175516822, "loss": 0.0056, "step": 1270 }, { "epoch": 0.25942440210782325, "grad_norm": 8.682046890258789, "learning_rate": 0.00018702877989460885, "loss": 1.0666, "step": 1280 }, { "epoch": 0.26145115524929063, "grad_norm": 0.07138803601264954, "learning_rate": 0.00018692744223753547, "loss": 0.5221, "step": 1290 }, { "epoch": 0.263477908390758, "grad_norm": 0.1037185862660408, "learning_rate": 0.0001868261045804621, "loss": 0.9411, "step": 1300 }, { "epoch": 0.263477908390758, "eval_accuracy": 0.8223443223443223, "eval_loss": 0.7809145450592041, "eval_runtime": 49.4934, "eval_samples_per_second": 11.032, "eval_steps_per_second": 1.394, "step": 1300 }, { "epoch": 0.26550466153222535, "grad_norm": 8.10766887664795, "learning_rate": 0.00018672476692338874, "loss": 0.8295, "step": 1310 }, { "epoch": 0.26753141467369274, "grad_norm": 0.18092480301856995, "learning_rate": 0.00018662342926631538, "loss": 0.0202, "step": 1320 }, { "epoch": 0.2695581678151601, "grad_norm": 0.15583205223083496, "learning_rate": 0.000186522091609242, "loss": 0.3999, "step": 1330 }, { "epoch": 0.2715849209566275, "grad_norm": 0.09881691634654999, "learning_rate": 0.00018642075395216862, "loss": 0.4089, "step": 1340 }, { "epoch": 0.27361167409809484, "grad_norm": 0.1042148619890213, "learning_rate": 0.00018631941629509527, "loss": 1.257, "step": 1350 }, { "epoch": 0.2756384272395622, "grad_norm": 7.86333703994751, "learning_rate": 0.0001862180786380219, "loss": 0.7033, "step": 1360 }, { "epoch": 0.2776651803810296, "grad_norm": 7.599421977996826, "learning_rate": 0.00018611674098094853, "loss": 0.6313, "step": 1370 }, { "epoch": 0.27969193352249694, "grad_norm": 0.16084067523479462, "learning_rate": 0.00018601540332387518, "loss": 0.0262, "step": 1380 }, { "epoch": 0.2817186866639643, "grad_norm": 0.09960556775331497, "learning_rate": 0.00018591406566680177, "loss": 0.3459, "step": 1390 }, { "epoch": 0.2837454398054317, "grad_norm": 0.13508349657058716, "learning_rate": 0.00018581272800972842, "loss": 0.4378, "step": 1400 }, { "epoch": 0.2837454398054317, "eval_accuracy": 0.8223443223443223, "eval_loss": 0.6968061327934265, "eval_runtime": 50.393, "eval_samples_per_second": 10.835, "eval_steps_per_second": 1.369, "step": 1400 }, { "epoch": 0.2857721929468991, "grad_norm": 0.06493504345417023, "learning_rate": 0.00018571139035265506, "loss": 0.388, "step": 1410 }, { "epoch": 0.2877989460883664, "grad_norm": 0.4153689444065094, "learning_rate": 0.00018561005269558168, "loss": 0.6636, "step": 1420 }, { "epoch": 0.2898256992298338, "grad_norm": 0.0951620489358902, "learning_rate": 0.00018550871503850833, "loss": 0.7434, "step": 1430 }, { "epoch": 0.2918524523713012, "grad_norm": 6.5460944175720215, "learning_rate": 0.00018540737738143495, "loss": 0.2931, "step": 1440 }, { "epoch": 0.2938792055127685, "grad_norm": 0.5932949185371399, "learning_rate": 0.00018530603972436157, "loss": 0.3002, "step": 1450 }, { "epoch": 0.2959059586542359, "grad_norm": 8.747719764709473, "learning_rate": 0.00018520470206728821, "loss": 0.5126, "step": 1460 }, { "epoch": 0.2979327117957033, "grad_norm": 0.05845149606466293, "learning_rate": 0.00018510336441021486, "loss": 0.4434, "step": 1470 }, { "epoch": 0.29995946493717063, "grad_norm": 0.23026475310325623, "learning_rate": 0.00018500202675314148, "loss": 0.6812, "step": 1480 }, { "epoch": 0.301986218078638, "grad_norm": 0.06300631165504456, "learning_rate": 0.0001849006890960681, "loss": 0.5783, "step": 1490 }, { "epoch": 0.3040129712201054, "grad_norm": 1.2849607467651367, "learning_rate": 0.00018479935143899474, "loss": 0.7127, "step": 1500 }, { "epoch": 0.3040129712201054, "eval_accuracy": 0.8241758241758241, "eval_loss": 0.3293890655040741, "eval_runtime": 88.987, "eval_samples_per_second": 6.136, "eval_steps_per_second": 0.775, "step": 1500 }, { "epoch": 0.3060397243615728, "grad_norm": 0.06597457081079483, "learning_rate": 0.00018469801378192136, "loss": 0.4313, "step": 1510 }, { "epoch": 0.3080664775030401, "grad_norm": 0.054177168756723404, "learning_rate": 0.000184596676124848, "loss": 0.1819, "step": 1520 }, { "epoch": 0.3100932306445075, "grad_norm": 0.05142025649547577, "learning_rate": 0.00018449533846777463, "loss": 0.1027, "step": 1530 }, { "epoch": 0.3121199837859749, "grad_norm": 0.595727264881134, "learning_rate": 0.00018439400081070127, "loss": 0.5553, "step": 1540 }, { "epoch": 0.3141467369274422, "grad_norm": 0.044500961899757385, "learning_rate": 0.0001842926631536279, "loss": 0.5383, "step": 1550 }, { "epoch": 0.3161734900689096, "grad_norm": 6.882216453552246, "learning_rate": 0.00018419132549655454, "loss": 0.4053, "step": 1560 }, { "epoch": 0.318200243210377, "grad_norm": 0.06732060760259628, "learning_rate": 0.00018408998783948116, "loss": 0.765, "step": 1570 }, { "epoch": 0.3202269963518444, "grad_norm": 0.07889731228351593, "learning_rate": 0.00018398865018240778, "loss": 0.1044, "step": 1580 }, { "epoch": 0.3222537494933117, "grad_norm": 8.76712703704834, "learning_rate": 0.00018388731252533442, "loss": 0.5054, "step": 1590 }, { "epoch": 0.3242805026347791, "grad_norm": 0.10699472576379776, "learning_rate": 0.00018378597486826104, "loss": 0.9465, "step": 1600 }, { "epoch": 0.3242805026347791, "eval_accuracy": 0.8223443223443223, "eval_loss": 0.4912700653076172, "eval_runtime": 48.2783, "eval_samples_per_second": 11.309, "eval_steps_per_second": 1.429, "step": 1600 }, { "epoch": 0.3263072557762465, "grad_norm": 0.6591342091560364, "learning_rate": 0.0001836846372111877, "loss": 0.5213, "step": 1610 }, { "epoch": 0.3283340089177138, "grad_norm": 6.809113025665283, "learning_rate": 0.0001835832995541143, "loss": 0.2711, "step": 1620 }, { "epoch": 0.3303607620591812, "grad_norm": 0.08120753616094589, "learning_rate": 0.00018348196189704096, "loss": 0.1901, "step": 1630 }, { "epoch": 0.3323875152006486, "grad_norm": 0.1842392235994339, "learning_rate": 0.00018338062423996757, "loss": 0.5417, "step": 1640 }, { "epoch": 0.3344142683421159, "grad_norm": 0.09670919179916382, "learning_rate": 0.00018327928658289422, "loss": 0.3437, "step": 1650 }, { "epoch": 0.3364410214835833, "grad_norm": 0.04945284500718117, "learning_rate": 0.00018317794892582084, "loss": 0.1802, "step": 1660 }, { "epoch": 0.3384677746250507, "grad_norm": 8.956902503967285, "learning_rate": 0.00018307661126874746, "loss": 0.9073, "step": 1670 }, { "epoch": 0.34049452776651806, "grad_norm": 0.11272208392620087, "learning_rate": 0.0001829752736116741, "loss": 0.365, "step": 1680 }, { "epoch": 0.3425212809079854, "grad_norm": 0.13924777507781982, "learning_rate": 0.00018287393595460075, "loss": 0.3967, "step": 1690 }, { "epoch": 0.3445480340494528, "grad_norm": 0.05262771248817444, "learning_rate": 0.00018277259829752737, "loss": 0.3834, "step": 1700 }, { "epoch": 0.3445480340494528, "eval_accuracy": 0.9047619047619048, "eval_loss": 0.25941210985183716, "eval_runtime": 47.82, "eval_samples_per_second": 11.418, "eval_steps_per_second": 1.443, "step": 1700 }, { "epoch": 0.34657478719092016, "grad_norm": 7.016679763793945, "learning_rate": 0.00018267126064045402, "loss": 0.0367, "step": 1710 }, { "epoch": 0.3486015403323875, "grad_norm": 9.35397720336914, "learning_rate": 0.00018256992298338064, "loss": 0.8748, "step": 1720 }, { "epoch": 0.3506282934738549, "grad_norm": 0.036801666021347046, "learning_rate": 0.00018246858532630725, "loss": 0.5593, "step": 1730 }, { "epoch": 0.35265504661532227, "grad_norm": 0.040228940546512604, "learning_rate": 0.0001823672476692339, "loss": 0.0114, "step": 1740 }, { "epoch": 0.35468179975678965, "grad_norm": 0.035426534712314606, "learning_rate": 0.00018226591001216055, "loss": 0.0266, "step": 1750 }, { "epoch": 0.356708552898257, "grad_norm": 0.03302001953125, "learning_rate": 0.00018216457235508714, "loss": 0.0056, "step": 1760 }, { "epoch": 0.35873530603972437, "grad_norm": 0.02835068479180336, "learning_rate": 0.00018206323469801379, "loss": 0.0041, "step": 1770 }, { "epoch": 0.36076205918119175, "grad_norm": 0.03482850268483162, "learning_rate": 0.00018196189704094043, "loss": 0.9573, "step": 1780 }, { "epoch": 0.3627888123226591, "grad_norm": 0.03174089640378952, "learning_rate": 0.00018186055938386705, "loss": 0.5716, "step": 1790 }, { "epoch": 0.36481556546412647, "grad_norm": 0.04105382412672043, "learning_rate": 0.0001817592217267937, "loss": 0.6691, "step": 1800 }, { "epoch": 0.36481556546412647, "eval_accuracy": 0.8992673992673993, "eval_loss": 0.35374078154563904, "eval_runtime": 50.5022, "eval_samples_per_second": 10.811, "eval_steps_per_second": 1.366, "step": 1800 }, { "epoch": 0.36684231860559385, "grad_norm": 0.05565650016069412, "learning_rate": 0.00018165788406972032, "loss": 0.2785, "step": 1810 }, { "epoch": 0.3688690717470612, "grad_norm": 0.038423530757427216, "learning_rate": 0.00018155654641264693, "loss": 0.0051, "step": 1820 }, { "epoch": 0.37089582488852857, "grad_norm": 0.034793466329574585, "learning_rate": 0.00018145520875557358, "loss": 0.034, "step": 1830 }, { "epoch": 0.37292257802999595, "grad_norm": 0.029749181121587753, "learning_rate": 0.00018135387109850023, "loss": 0.0037, "step": 1840 }, { "epoch": 0.37494933117146334, "grad_norm": 0.02845478057861328, "learning_rate": 0.00018125253344142685, "loss": 0.3519, "step": 1850 }, { "epoch": 0.37697608431293067, "grad_norm": 0.026904482394456863, "learning_rate": 0.00018115119578435347, "loss": 0.0096, "step": 1860 }, { "epoch": 0.37900283745439806, "grad_norm": 0.2545231580734253, "learning_rate": 0.0001810498581272801, "loss": 0.0083, "step": 1870 }, { "epoch": 0.38102959059586544, "grad_norm": 0.028374874964356422, "learning_rate": 0.00018094852047020673, "loss": 0.5834, "step": 1880 }, { "epoch": 0.38305634373733277, "grad_norm": 0.03022819012403488, "learning_rate": 0.00018084718281313338, "loss": 0.5286, "step": 1890 }, { "epoch": 0.38508309687880016, "grad_norm": 0.034386951476335526, "learning_rate": 0.00018074584515606, "loss": 0.3002, "step": 1900 }, { "epoch": 0.38508309687880016, "eval_accuracy": 0.9285714285714286, "eval_loss": 0.2502322494983673, "eval_runtime": 48.7593, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.415, "step": 1900 }, { "epoch": 0.38710985002026754, "grad_norm": 8.355843544006348, "learning_rate": 0.00018064450749898662, "loss": 1.2353, "step": 1910 }, { "epoch": 0.3891366031617349, "grad_norm": 0.22805503010749817, "learning_rate": 0.00018054316984191326, "loss": 0.0135, "step": 1920 }, { "epoch": 0.39116335630320226, "grad_norm": 9.000629425048828, "learning_rate": 0.0001804418321848399, "loss": 0.8987, "step": 1930 }, { "epoch": 0.39319010944466964, "grad_norm": 0.10103480517864227, "learning_rate": 0.00018034049452776653, "loss": 0.6317, "step": 1940 }, { "epoch": 0.39521686258613703, "grad_norm": 0.053478993475437164, "learning_rate": 0.00018023915687069315, "loss": 0.0372, "step": 1950 }, { "epoch": 0.39724361572760436, "grad_norm": 0.034072987735271454, "learning_rate": 0.0001801378192136198, "loss": 0.3131, "step": 1960 }, { "epoch": 0.39927036886907175, "grad_norm": 0.046674974262714386, "learning_rate": 0.0001800364815565464, "loss": 0.6352, "step": 1970 }, { "epoch": 0.40129712201053913, "grad_norm": 0.03819415718317032, "learning_rate": 0.00017993514389947306, "loss": 0.0087, "step": 1980 }, { "epoch": 0.40332387515200646, "grad_norm": 0.9910769462585449, "learning_rate": 0.0001798338062423997, "loss": 0.3056, "step": 1990 }, { "epoch": 0.40535062829347385, "grad_norm": 0.04858968034386635, "learning_rate": 0.0001797324685853263, "loss": 0.0473, "step": 2000 }, { "epoch": 0.40535062829347385, "eval_accuracy": 0.9322344322344323, "eval_loss": 0.23118893802165985, "eval_runtime": 49.9277, "eval_samples_per_second": 10.936, "eval_steps_per_second": 1.382, "step": 2000 }, { "epoch": 0.40737738143494123, "grad_norm": 0.025557000190019608, "learning_rate": 0.00017963113092825294, "loss": 0.2941, "step": 2010 }, { "epoch": 0.4094041345764086, "grad_norm": 0.02778341993689537, "learning_rate": 0.0001795297932711796, "loss": 0.8051, "step": 2020 }, { "epoch": 0.41143088771787595, "grad_norm": 0.03390109911561012, "learning_rate": 0.0001794284556141062, "loss": 0.2695, "step": 2030 }, { "epoch": 0.41345764085934333, "grad_norm": 0.026163168251514435, "learning_rate": 0.00017932711795703285, "loss": 0.2401, "step": 2040 }, { "epoch": 0.4154843940008107, "grad_norm": 0.023556355386972427, "learning_rate": 0.00017922578029995947, "loss": 0.0089, "step": 2050 }, { "epoch": 0.41751114714227805, "grad_norm": 0.028165694326162338, "learning_rate": 0.0001791244426428861, "loss": 0.0059, "step": 2060 }, { "epoch": 0.41953790028374544, "grad_norm": 0.036073777824640274, "learning_rate": 0.00017902310498581274, "loss": 0.5876, "step": 2070 }, { "epoch": 0.4215646534252128, "grad_norm": 0.02447659894824028, "learning_rate": 0.00017892176732873938, "loss": 0.0045, "step": 2080 }, { "epoch": 0.42359140656668015, "grad_norm": 0.030884768813848495, "learning_rate": 0.000178820429671666, "loss": 0.5126, "step": 2090 }, { "epoch": 0.42561815970814754, "grad_norm": 0.032383281737565994, "learning_rate": 0.00017871909201459262, "loss": 0.634, "step": 2100 }, { "epoch": 0.42561815970814754, "eval_accuracy": 0.9358974358974359, "eval_loss": 0.2406046986579895, "eval_runtime": 46.5595, "eval_samples_per_second": 11.727, "eval_steps_per_second": 1.482, "step": 2100 }, { "epoch": 0.4276449128496149, "grad_norm": 0.030924880877137184, "learning_rate": 0.00017861775435751927, "loss": 0.129, "step": 2110 }, { "epoch": 0.4296716659910823, "grad_norm": 0.7038991451263428, "learning_rate": 0.0001785164167004459, "loss": 0.4845, "step": 2120 }, { "epoch": 0.43169841913254964, "grad_norm": 0.02341308444738388, "learning_rate": 0.00017841507904337253, "loss": 0.0078, "step": 2130 }, { "epoch": 0.433725172274017, "grad_norm": 0.024457694962620735, "learning_rate": 0.00017831374138629915, "loss": 0.7938, "step": 2140 }, { "epoch": 0.4357519254154844, "grad_norm": 0.024586468935012817, "learning_rate": 0.0001782124037292258, "loss": 0.3548, "step": 2150 }, { "epoch": 0.43777867855695174, "grad_norm": 0.5639256834983826, "learning_rate": 0.00017811106607215242, "loss": 0.3112, "step": 2160 }, { "epoch": 0.4398054316984191, "grad_norm": 0.026940103620290756, "learning_rate": 0.00017800972841507906, "loss": 0.5675, "step": 2170 }, { "epoch": 0.4418321848398865, "grad_norm": 0.027325104922056198, "learning_rate": 0.00017790839075800568, "loss": 0.014, "step": 2180 }, { "epoch": 0.4438589379813539, "grad_norm": 0.024721914902329445, "learning_rate": 0.0001778070531009323, "loss": 0.5824, "step": 2190 }, { "epoch": 0.4458856911228212, "grad_norm": 0.03028850629925728, "learning_rate": 0.00017770571544385895, "loss": 0.4471, "step": 2200 }, { "epoch": 0.4458856911228212, "eval_accuracy": 0.9377289377289377, "eval_loss": 0.29827624559402466, "eval_runtime": 50.2155, "eval_samples_per_second": 10.873, "eval_steps_per_second": 1.374, "step": 2200 }, { "epoch": 0.4479124442642886, "grad_norm": 44.90025329589844, "learning_rate": 0.00017760437778678557, "loss": 0.5802, "step": 2210 }, { "epoch": 0.449939197405756, "grad_norm": 0.1824452131986618, "learning_rate": 0.00017750304012971221, "loss": 0.006, "step": 2220 }, { "epoch": 0.4519659505472233, "grad_norm": 0.03286247327923775, "learning_rate": 0.00017740170247263883, "loss": 0.4521, "step": 2230 }, { "epoch": 0.4539927036886907, "grad_norm": 0.030796563252806664, "learning_rate": 0.00017730036481556548, "loss": 0.0075, "step": 2240 }, { "epoch": 0.4560194568301581, "grad_norm": 0.025900913402438164, "learning_rate": 0.0001771990271584921, "loss": 0.0054, "step": 2250 }, { "epoch": 0.45804620997162543, "grad_norm": 0.02531251683831215, "learning_rate": 0.00017709768950141874, "loss": 0.4758, "step": 2260 }, { "epoch": 0.4600729631130928, "grad_norm": 2.9233317375183105, "learning_rate": 0.00017699635184434536, "loss": 0.4245, "step": 2270 }, { "epoch": 0.4620997162545602, "grad_norm": 8.508463859558105, "learning_rate": 0.00017689501418727198, "loss": 0.4203, "step": 2280 }, { "epoch": 0.4641264693960276, "grad_norm": 8.093629837036133, "learning_rate": 0.00017679367653019863, "loss": 1.4054, "step": 2290 }, { "epoch": 0.4661532225374949, "grad_norm": 1.5853863954544067, "learning_rate": 0.00017669233887312527, "loss": 0.3229, "step": 2300 }, { "epoch": 0.4661532225374949, "eval_accuracy": 0.9212454212454212, "eval_loss": 0.3600928783416748, "eval_runtime": 49.7309, "eval_samples_per_second": 10.979, "eval_steps_per_second": 1.387, "step": 2300 }, { "epoch": 0.4681799756789623, "grad_norm": 0.03744541481137276, "learning_rate": 0.0001765910012160519, "loss": 1.5756, "step": 2310 }, { "epoch": 0.4702067288204297, "grad_norm": 8.397408485412598, "learning_rate": 0.00017648966355897854, "loss": 0.5392, "step": 2320 }, { "epoch": 0.472233481961897, "grad_norm": 0.07392023503780365, "learning_rate": 0.00017638832590190516, "loss": 0.4868, "step": 2330 }, { "epoch": 0.4742602351033644, "grad_norm": 0.07620003819465637, "learning_rate": 0.00017628698824483178, "loss": 0.2934, "step": 2340 }, { "epoch": 0.4762869882448318, "grad_norm": 0.05634753778576851, "learning_rate": 0.00017618565058775842, "loss": 1.0435, "step": 2350 }, { "epoch": 0.4783137413862992, "grad_norm": 0.06447988003492355, "learning_rate": 0.00017608431293068507, "loss": 0.2949, "step": 2360 }, { "epoch": 0.4803404945277665, "grad_norm": 0.12075554579496384, "learning_rate": 0.00017598297527361166, "loss": 0.5441, "step": 2370 }, { "epoch": 0.4823672476692339, "grad_norm": 0.04562091454863548, "learning_rate": 0.0001758816376165383, "loss": 0.2559, "step": 2380 }, { "epoch": 0.4843940008107013, "grad_norm": 0.03965690732002258, "learning_rate": 0.00017578029995946496, "loss": 0.4775, "step": 2390 }, { "epoch": 0.4864207539521686, "grad_norm": 0.038196004927158356, "learning_rate": 0.00017567896230239157, "loss": 0.4769, "step": 2400 }, { "epoch": 0.4864207539521686, "eval_accuracy": 0.9010989010989011, "eval_loss": 0.2990172803401947, "eval_runtime": 48.3946, "eval_samples_per_second": 11.282, "eval_steps_per_second": 1.426, "step": 2400 }, { "epoch": 0.488447507093636, "grad_norm": 0.8857225775718689, "learning_rate": 0.00017557762464531822, "loss": 0.508, "step": 2410 }, { "epoch": 0.4904742602351034, "grad_norm": 0.06662506610155106, "learning_rate": 0.00017547628698824484, "loss": 0.0342, "step": 2420 }, { "epoch": 0.4925010133765707, "grad_norm": 0.06565208733081818, "learning_rate": 0.00017537494933117146, "loss": 0.2875, "step": 2430 }, { "epoch": 0.4945277665180381, "grad_norm": 0.056362636387348175, "learning_rate": 0.0001752736116740981, "loss": 0.7128, "step": 2440 }, { "epoch": 0.4965545196595055, "grad_norm": 0.042020898312330246, "learning_rate": 0.00017517227401702475, "loss": 0.4941, "step": 2450 }, { "epoch": 0.49858127280097286, "grad_norm": 0.11986752599477768, "learning_rate": 0.00017507093635995137, "loss": 0.3659, "step": 2460 }, { "epoch": 0.5006080259424402, "grad_norm": 0.03309104964137077, "learning_rate": 0.000174969598702878, "loss": 0.0116, "step": 2470 }, { "epoch": 0.5026347790839076, "grad_norm": 0.031792882829904556, "learning_rate": 0.00017486826104580464, "loss": 0.017, "step": 2480 }, { "epoch": 0.5046615322253749, "grad_norm": 0.029629742726683617, "learning_rate": 0.00017476692338873125, "loss": 0.8387, "step": 2490 }, { "epoch": 0.5066882853668423, "grad_norm": 0.029320184141397476, "learning_rate": 0.0001746655857316579, "loss": 0.0135, "step": 2500 }, { "epoch": 0.5066882853668423, "eval_accuracy": 0.9029304029304029, "eval_loss": 0.31343939900398254, "eval_runtime": 46.7566, "eval_samples_per_second": 11.678, "eval_steps_per_second": 1.476, "step": 2500 }, { "epoch": 0.5087150385083097, "grad_norm": 0.03285427764058113, "learning_rate": 0.00017456424807458452, "loss": 0.3224, "step": 2510 }, { "epoch": 0.5107417916497771, "grad_norm": 0.02792639471590519, "learning_rate": 0.00017446291041751114, "loss": 0.0042, "step": 2520 }, { "epoch": 0.5127685447912445, "grad_norm": 0.047437168657779694, "learning_rate": 0.00017436157276043779, "loss": 0.0038, "step": 2530 }, { "epoch": 0.5147952979327118, "grad_norm": 0.025487160310149193, "learning_rate": 0.00017426023510336443, "loss": 0.0088, "step": 2540 }, { "epoch": 0.5168220510741791, "grad_norm": 0.02262968011200428, "learning_rate": 0.00017415889744629105, "loss": 0.0162, "step": 2550 }, { "epoch": 0.5188488042156465, "grad_norm": 0.021806327626109123, "learning_rate": 0.00017405755978921767, "loss": 0.34, "step": 2560 }, { "epoch": 0.5208755573571139, "grad_norm": 0.022118626162409782, "learning_rate": 0.00017395622213214432, "loss": 0.3285, "step": 2570 }, { "epoch": 0.5229023104985813, "grad_norm": 0.021631283685564995, "learning_rate": 0.00017385488447507093, "loss": 0.0077, "step": 2580 }, { "epoch": 0.5249290636400487, "grad_norm": 0.020419137552380562, "learning_rate": 0.00017375354681799758, "loss": 0.3255, "step": 2590 }, { "epoch": 0.526955816781516, "grad_norm": 0.020009351894259453, "learning_rate": 0.00017365220916092423, "loss": 0.3025, "step": 2600 }, { "epoch": 0.526955816781516, "eval_accuracy": 0.9505494505494505, "eval_loss": 0.17476053535938263, "eval_runtime": 50.4498, "eval_samples_per_second": 10.823, "eval_steps_per_second": 1.368, "step": 2600 }, { "epoch": 0.5289825699229834, "grad_norm": 0.022722359746694565, "learning_rate": 0.00017355087150385082, "loss": 0.2968, "step": 2610 }, { "epoch": 0.5310093230644507, "grad_norm": 0.783506453037262, "learning_rate": 0.00017344953384677747, "loss": 0.5455, "step": 2620 }, { "epoch": 0.5330360762059181, "grad_norm": 0.024192122742533684, "learning_rate": 0.0001733481961897041, "loss": 0.2437, "step": 2630 }, { "epoch": 0.5350628293473855, "grad_norm": 0.01950955204665661, "learning_rate": 0.00017324685853263073, "loss": 0.2505, "step": 2640 }, { "epoch": 0.5370895824888529, "grad_norm": 0.018212512135505676, "learning_rate": 0.00017314552087555735, "loss": 0.0021, "step": 2650 }, { "epoch": 0.5391163356303202, "grad_norm": 0.0288174357265234, "learning_rate": 0.000173044183218484, "loss": 0.8558, "step": 2660 }, { "epoch": 0.5411430887717876, "grad_norm": 0.026978524401783943, "learning_rate": 0.00017294284556141062, "loss": 0.6232, "step": 2670 }, { "epoch": 0.543169841913255, "grad_norm": 7.463168144226074, "learning_rate": 0.00017284150790433726, "loss": 0.2648, "step": 2680 }, { "epoch": 0.5451965950547223, "grad_norm": 0.6724880337715149, "learning_rate": 0.0001727401702472639, "loss": 0.0268, "step": 2690 }, { "epoch": 0.5472233481961897, "grad_norm": 0.2459801882505417, "learning_rate": 0.00017263883259019053, "loss": 0.0114, "step": 2700 }, { "epoch": 0.5472233481961897, "eval_accuracy": 0.9212454212454212, "eval_loss": 0.2898261249065399, "eval_runtime": 48.0997, "eval_samples_per_second": 11.351, "eval_steps_per_second": 1.435, "step": 2700 }, { "epoch": 0.5492501013376571, "grad_norm": 0.017503296956419945, "learning_rate": 0.00017253749493311715, "loss": 0.0114, "step": 2710 }, { "epoch": 0.5512768544791244, "grad_norm": 0.14966024458408356, "learning_rate": 0.0001724361572760438, "loss": 0.5774, "step": 2720 }, { "epoch": 0.5533036076205918, "grad_norm": 0.05726105347275734, "learning_rate": 0.0001723348196189704, "loss": 0.0069, "step": 2730 }, { "epoch": 0.5553303607620592, "grad_norm": 0.020008716732263565, "learning_rate": 0.00017223348196189706, "loss": 0.4275, "step": 2740 }, { "epoch": 0.5573571139035266, "grad_norm": 0.019402628764510155, "learning_rate": 0.00017213214430482368, "loss": 0.0041, "step": 2750 }, { "epoch": 0.5593838670449939, "grad_norm": 0.029374495148658752, "learning_rate": 0.0001720308066477503, "loss": 0.0031, "step": 2760 }, { "epoch": 0.5614106201864613, "grad_norm": 0.016268255189061165, "learning_rate": 0.00017192946899067694, "loss": 0.0035, "step": 2770 }, { "epoch": 0.5634373733279286, "grad_norm": 0.03707795962691307, "learning_rate": 0.0001718281313336036, "loss": 1.0901, "step": 2780 }, { "epoch": 0.565464126469396, "grad_norm": 0.12352888286113739, "learning_rate": 0.0001717267936765302, "loss": 0.5252, "step": 2790 }, { "epoch": 0.5674908796108634, "grad_norm": 0.028375711292028427, "learning_rate": 0.00017162545601945683, "loss": 0.1636, "step": 2800 }, { "epoch": 0.5674908796108634, "eval_accuracy": 0.9395604395604396, "eval_loss": 0.22813718020915985, "eval_runtime": 49.0458, "eval_samples_per_second": 11.132, "eval_steps_per_second": 1.407, "step": 2800 }, { "epoch": 0.5695176327523308, "grad_norm": 0.03394703567028046, "learning_rate": 0.00017152411836238347, "loss": 1.4137, "step": 2810 }, { "epoch": 0.5715443858937982, "grad_norm": 0.04633708298206329, "learning_rate": 0.0001714227807053101, "loss": 0.391, "step": 2820 }, { "epoch": 0.5735711390352655, "grad_norm": 0.2281627207994461, "learning_rate": 0.00017132144304823674, "loss": 0.0142, "step": 2830 }, { "epoch": 0.5755978921767329, "grad_norm": 0.034119848161935806, "learning_rate": 0.00017122010539116336, "loss": 0.6163, "step": 2840 }, { "epoch": 0.5776246453182002, "grad_norm": 0.18611551821231842, "learning_rate": 0.00017111876773409, "loss": 0.0072, "step": 2850 }, { "epoch": 0.5796513984596676, "grad_norm": 0.30093681812286377, "learning_rate": 0.00017101743007701662, "loss": 0.4285, "step": 2860 }, { "epoch": 0.581678151601135, "grad_norm": 0.1138378456234932, "learning_rate": 0.00017091609241994327, "loss": 0.3155, "step": 2870 }, { "epoch": 0.5837049047426024, "grad_norm": 0.038984451442956924, "learning_rate": 0.0001708147547628699, "loss": 0.7609, "step": 2880 }, { "epoch": 0.5857316578840697, "grad_norm": 0.031690433621406555, "learning_rate": 0.0001707134171057965, "loss": 0.8971, "step": 2890 }, { "epoch": 0.587758411025537, "grad_norm": 16.161060333251953, "learning_rate": 0.00017061207944872315, "loss": 0.7427, "step": 2900 }, { "epoch": 0.587758411025537, "eval_accuracy": 0.9340659340659341, "eval_loss": 0.2333686351776123, "eval_runtime": 52.3105, "eval_samples_per_second": 10.438, "eval_steps_per_second": 1.319, "step": 2900 }, { "epoch": 0.5897851641670044, "grad_norm": 0.06748467683792114, "learning_rate": 0.0001705107417916498, "loss": 0.6921, "step": 2910 }, { "epoch": 0.5918119173084718, "grad_norm": 0.04822501912713051, "learning_rate": 0.00017040940413457642, "loss": 0.0078, "step": 2920 }, { "epoch": 0.5938386704499392, "grad_norm": 0.6019286513328552, "learning_rate": 0.00017030806647750306, "loss": 0.5709, "step": 2930 }, { "epoch": 0.5958654235914066, "grad_norm": 0.4931797981262207, "learning_rate": 0.00017020672882042968, "loss": 0.0105, "step": 2940 }, { "epoch": 0.597892176732874, "grad_norm": 0.03483499214053154, "learning_rate": 0.0001701053911633563, "loss": 0.1893, "step": 2950 }, { "epoch": 0.5999189298743413, "grad_norm": 0.030498500913381577, "learning_rate": 0.00017000405350628295, "loss": 1.0754, "step": 2960 }, { "epoch": 0.6019456830158086, "grad_norm": 0.03928288072347641, "learning_rate": 0.0001699027158492096, "loss": 0.2254, "step": 2970 }, { "epoch": 0.603972436157276, "grad_norm": 0.04761834815144539, "learning_rate": 0.0001698013781921362, "loss": 0.3954, "step": 2980 }, { "epoch": 0.6059991892987434, "grad_norm": 0.04778355360031128, "learning_rate": 0.00016970004053506283, "loss": 0.4868, "step": 2990 }, { "epoch": 0.6080259424402108, "grad_norm": 0.04448671266436577, "learning_rate": 0.00016959870287798948, "loss": 0.0083, "step": 3000 }, { "epoch": 0.6080259424402108, "eval_accuracy": 0.9358974358974359, "eval_loss": 0.24660906195640564, "eval_runtime": 47.9338, "eval_samples_per_second": 11.391, "eval_steps_per_second": 1.439, "step": 3000 }, { "epoch": 0.6100526955816782, "grad_norm": 0.03465289995074272, "learning_rate": 0.0001694973652209161, "loss": 0.0068, "step": 3010 }, { "epoch": 0.6120794487231456, "grad_norm": 0.03001750074326992, "learning_rate": 0.00016939602756384274, "loss": 0.0083, "step": 3020 }, { "epoch": 0.6141062018646128, "grad_norm": 9.022475242614746, "learning_rate": 0.00016929468990676936, "loss": 0.4394, "step": 3030 }, { "epoch": 0.6161329550060802, "grad_norm": 0.029527999460697174, "learning_rate": 0.00016919335224969598, "loss": 0.0052, "step": 3040 }, { "epoch": 0.6181597081475476, "grad_norm": 0.027496349066495895, "learning_rate": 0.00016909201459262263, "loss": 0.0032, "step": 3050 }, { "epoch": 0.620186461289015, "grad_norm": 0.029770225286483765, "learning_rate": 0.00016899067693554928, "loss": 0.4973, "step": 3060 }, { "epoch": 0.6222132144304824, "grad_norm": 0.03310020640492439, "learning_rate": 0.0001688893392784759, "loss": 0.4599, "step": 3070 }, { "epoch": 0.6242399675719498, "grad_norm": 0.24616017937660217, "learning_rate": 0.0001687880016214025, "loss": 0.009, "step": 3080 }, { "epoch": 0.6262667207134172, "grad_norm": 4.76401424407959, "learning_rate": 0.00016868666396432916, "loss": 0.0128, "step": 3090 }, { "epoch": 0.6282934738548844, "grad_norm": 0.01683022826910019, "learning_rate": 0.00016858532630725578, "loss": 0.0041, "step": 3100 }, { "epoch": 0.6282934738548844, "eval_accuracy": 0.9432234432234432, "eval_loss": 0.2736949026584625, "eval_runtime": 51.3858, "eval_samples_per_second": 10.626, "eval_steps_per_second": 1.343, "step": 3100 }, { "epoch": 0.6303202269963518, "grad_norm": 0.07328706234693527, "learning_rate": 0.00016848398865018242, "loss": 0.0041, "step": 3110 }, { "epoch": 0.6323469801378192, "grad_norm": 0.017396189272403717, "learning_rate": 0.00016838265099310904, "loss": 0.0032, "step": 3120 }, { "epoch": 0.6343737332792866, "grad_norm": 102.0742416381836, "learning_rate": 0.00016828131333603566, "loss": 0.341, "step": 3130 }, { "epoch": 0.636400486420754, "grad_norm": 0.014245778322219849, "learning_rate": 0.0001681799756789623, "loss": 0.6143, "step": 3140 }, { "epoch": 0.6384272395622214, "grad_norm": 0.020878920331597328, "learning_rate": 0.00016807863802188896, "loss": 0.8261, "step": 3150 }, { "epoch": 0.6404539927036887, "grad_norm": 7.475837707519531, "learning_rate": 0.00016797730036481557, "loss": 0.1837, "step": 3160 }, { "epoch": 0.642480745845156, "grad_norm": 0.0842457190155983, "learning_rate": 0.0001678759627077422, "loss": 0.0093, "step": 3170 }, { "epoch": 0.6445074989866234, "grad_norm": 0.018949219956994057, "learning_rate": 0.00016777462505066884, "loss": 0.3746, "step": 3180 }, { "epoch": 0.6465342521280908, "grad_norm": 0.017446573823690414, "learning_rate": 0.00016767328739359546, "loss": 0.017, "step": 3190 }, { "epoch": 0.6485610052695582, "grad_norm": 8.801593780517578, "learning_rate": 0.0001675719497365221, "loss": 1.7268, "step": 3200 }, { "epoch": 0.6485610052695582, "eval_accuracy": 0.9395604395604396, "eval_loss": 0.26258817315101624, "eval_runtime": 47.8977, "eval_samples_per_second": 11.399, "eval_steps_per_second": 1.441, "step": 3200 }, { "epoch": 0.6505877584110256, "grad_norm": 0.040938105434179306, "learning_rate": 0.00016747061207944875, "loss": 0.2597, "step": 3210 }, { "epoch": 0.652614511552493, "grad_norm": 0.030415307730436325, "learning_rate": 0.00016736927442237534, "loss": 0.0417, "step": 3220 }, { "epoch": 0.6546412646939602, "grad_norm": 0.02822272665798664, "learning_rate": 0.000167267936765302, "loss": 0.4415, "step": 3230 }, { "epoch": 0.6566680178354276, "grad_norm": 0.028088340535759926, "learning_rate": 0.00016716659910822864, "loss": 0.0044, "step": 3240 }, { "epoch": 0.658694770976895, "grad_norm": 0.022151542827486992, "learning_rate": 0.00016706526145115525, "loss": 0.0065, "step": 3250 }, { "epoch": 0.6607215241183624, "grad_norm": 0.10828548669815063, "learning_rate": 0.00016696392379408187, "loss": 0.0053, "step": 3260 }, { "epoch": 0.6627482772598298, "grad_norm": 0.019026095047593117, "learning_rate": 0.00016686258613700852, "loss": 0.5957, "step": 3270 }, { "epoch": 0.6647750304012972, "grad_norm": 9.751636505126953, "learning_rate": 0.00016676124847993514, "loss": 0.9879, "step": 3280 }, { "epoch": 0.6668017835427645, "grad_norm": 0.12755624949932098, "learning_rate": 0.00016665991082286179, "loss": 0.5395, "step": 3290 }, { "epoch": 0.6688285366842318, "grad_norm": 0.21898022294044495, "learning_rate": 0.00016655857316578843, "loss": 0.0115, "step": 3300 }, { "epoch": 0.6688285366842318, "eval_accuracy": 0.9304029304029304, "eval_loss": 0.26207664608955383, "eval_runtime": 47.5971, "eval_samples_per_second": 11.471, "eval_steps_per_second": 1.45, "step": 3300 }, { "epoch": 0.6708552898256992, "grad_norm": 0.05415915325284004, "learning_rate": 0.00016645723550871502, "loss": 0.3092, "step": 3310 }, { "epoch": 0.6728820429671666, "grad_norm": 0.04886092618107796, "learning_rate": 0.00016635589785164167, "loss": 0.5327, "step": 3320 }, { "epoch": 0.674908796108634, "grad_norm": 0.03909413889050484, "learning_rate": 0.00016625456019456832, "loss": 0.0087, "step": 3330 }, { "epoch": 0.6769355492501014, "grad_norm": 0.040225472301244736, "learning_rate": 0.00016615322253749493, "loss": 0.7562, "step": 3340 }, { "epoch": 0.6789623023915687, "grad_norm": 0.28949564695358276, "learning_rate": 0.00016605188488042158, "loss": 0.4992, "step": 3350 }, { "epoch": 0.6809890555330361, "grad_norm": 0.02482379972934723, "learning_rate": 0.0001659505472233482, "loss": 0.2469, "step": 3360 }, { "epoch": 0.6830158086745034, "grad_norm": 0.9251216053962708, "learning_rate": 0.00016584920956627482, "loss": 0.8497, "step": 3370 }, { "epoch": 0.6850425618159708, "grad_norm": 0.02334374003112316, "learning_rate": 0.00016574787190920147, "loss": 0.0106, "step": 3380 }, { "epoch": 0.6870693149574382, "grad_norm": 0.02161947637796402, "learning_rate": 0.0001656465342521281, "loss": 0.0092, "step": 3390 }, { "epoch": 0.6890960680989056, "grad_norm": 0.3686254322528839, "learning_rate": 0.00016554519659505473, "loss": 0.6196, "step": 3400 }, { "epoch": 0.6890960680989056, "eval_accuracy": 0.9267399267399268, "eval_loss": 0.3545539081096649, "eval_runtime": 50.3875, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.369, "step": 3400 }, { "epoch": 0.6911228212403729, "grad_norm": 0.0242935661226511, "learning_rate": 0.00016544385893798135, "loss": 0.3237, "step": 3410 }, { "epoch": 0.6931495743818403, "grad_norm": 0.023281089961528778, "learning_rate": 0.000165342521280908, "loss": 0.0075, "step": 3420 }, { "epoch": 0.6951763275233077, "grad_norm": 15.10773754119873, "learning_rate": 0.00016524118362383462, "loss": 1.8529, "step": 3430 }, { "epoch": 0.697203080664775, "grad_norm": 0.3190120756626129, "learning_rate": 0.00016513984596676126, "loss": 0.0278, "step": 3440 }, { "epoch": 0.6992298338062424, "grad_norm": 0.28799039125442505, "learning_rate": 0.00016503850830968788, "loss": 1.241, "step": 3450 }, { "epoch": 0.7012565869477098, "grad_norm": 0.08853207528591156, "learning_rate": 0.00016493717065261453, "loss": 0.0218, "step": 3460 }, { "epoch": 0.7032833400891771, "grad_norm": 0.05192064866423607, "learning_rate": 0.00016483583299554115, "loss": 0.7215, "step": 3470 }, { "epoch": 0.7053100932306445, "grad_norm": 0.35214605927467346, "learning_rate": 0.0001647344953384678, "loss": 0.6614, "step": 3480 }, { "epoch": 0.7073368463721119, "grad_norm": 0.06966587156057358, "learning_rate": 0.0001646331576813944, "loss": 0.0155, "step": 3490 }, { "epoch": 0.7093635995135793, "grad_norm": 0.04265503212809563, "learning_rate": 0.00016453182002432103, "loss": 0.0141, "step": 3500 }, { "epoch": 0.7093635995135793, "eval_accuracy": 0.9505494505494505, "eval_loss": 0.20635519921779633, "eval_runtime": 49.0829, "eval_samples_per_second": 11.124, "eval_steps_per_second": 1.406, "step": 3500 }, { "epoch": 0.7113903526550466, "grad_norm": 0.027219530194997787, "learning_rate": 0.00016443048236724768, "loss": 0.0142, "step": 3510 }, { "epoch": 0.713417105796514, "grad_norm": 0.051878347992897034, "learning_rate": 0.00016432914471017432, "loss": 0.0095, "step": 3520 }, { "epoch": 0.7154438589379813, "grad_norm": 0.025991367176175117, "learning_rate": 0.00016422780705310094, "loss": 0.0053, "step": 3530 }, { "epoch": 0.7174706120794487, "grad_norm": 8.773286819458008, "learning_rate": 0.0001641264693960276, "loss": 1.0328, "step": 3540 }, { "epoch": 0.7194973652209161, "grad_norm": 0.09533801674842834, "learning_rate": 0.0001640251317389542, "loss": 0.0059, "step": 3550 }, { "epoch": 0.7215241183623835, "grad_norm": 0.0500728003680706, "learning_rate": 0.00016392379408188083, "loss": 0.0071, "step": 3560 }, { "epoch": 0.7235508715038508, "grad_norm": 0.025775128975510597, "learning_rate": 0.00016382245642480747, "loss": 0.0076, "step": 3570 }, { "epoch": 0.7255776246453182, "grad_norm": 0.03006925620138645, "learning_rate": 0.00016372111876773412, "loss": 0.0046, "step": 3580 }, { "epoch": 0.7276043777867855, "grad_norm": 0.027190232649445534, "learning_rate": 0.0001636197811106607, "loss": 0.5673, "step": 3590 }, { "epoch": 0.7296311309282529, "grad_norm": 0.025452909991145134, "learning_rate": 0.00016351844345358736, "loss": 0.006, "step": 3600 }, { "epoch": 0.7296311309282529, "eval_accuracy": 0.9487179487179487, "eval_loss": 0.2203851193189621, "eval_runtime": 50.394, "eval_samples_per_second": 10.835, "eval_steps_per_second": 1.369, "step": 3600 }, { "epoch": 0.7316578840697203, "grad_norm": 0.0257217139005661, "learning_rate": 0.000163417105796514, "loss": 0.0044, "step": 3610 }, { "epoch": 0.7336846372111877, "grad_norm": 0.01945357397198677, "learning_rate": 0.00016331576813944062, "loss": 0.0045, "step": 3620 }, { "epoch": 0.7357113903526551, "grad_norm": 0.05564802885055542, "learning_rate": 0.00016321443048236727, "loss": 0.5722, "step": 3630 }, { "epoch": 0.7377381434941224, "grad_norm": 0.06898773461580276, "learning_rate": 0.0001631130928252939, "loss": 0.0048, "step": 3640 }, { "epoch": 0.7397648966355898, "grad_norm": 0.02191758342087269, "learning_rate": 0.0001630117551682205, "loss": 0.0043, "step": 3650 }, { "epoch": 0.7417916497770571, "grad_norm": 0.024975722655653954, "learning_rate": 0.00016291041751114715, "loss": 0.5345, "step": 3660 }, { "epoch": 0.7438184029185245, "grad_norm": 0.057791002094745636, "learning_rate": 0.0001628090798540738, "loss": 1.3099, "step": 3670 }, { "epoch": 0.7458451560599919, "grad_norm": 0.5608110427856445, "learning_rate": 0.00016270774219700042, "loss": 0.5324, "step": 3680 }, { "epoch": 0.7478719092014593, "grad_norm": 0.08051464706659317, "learning_rate": 0.00016260640453992704, "loss": 0.0108, "step": 3690 }, { "epoch": 0.7498986623429267, "grad_norm": 0.022334013134241104, "learning_rate": 0.00016250506688285368, "loss": 0.0226, "step": 3700 }, { "epoch": 0.7498986623429267, "eval_accuracy": 0.945054945054945, "eval_loss": 0.2544254660606384, "eval_runtime": 50.1882, "eval_samples_per_second": 10.879, "eval_steps_per_second": 1.375, "step": 3700 }, { "epoch": 0.751925415484394, "grad_norm": 0.019915521144866943, "learning_rate": 0.0001624037292257803, "loss": 0.005, "step": 3710 }, { "epoch": 0.7539521686258613, "grad_norm": 0.022011524066329002, "learning_rate": 0.00016230239156870695, "loss": 0.0046, "step": 3720 }, { "epoch": 0.7559789217673287, "grad_norm": 0.018707839772105217, "learning_rate": 0.00016220105391163357, "loss": 0.3635, "step": 3730 }, { "epoch": 0.7580056749087961, "grad_norm": 0.020906759425997734, "learning_rate": 0.0001620997162545602, "loss": 0.7634, "step": 3740 }, { "epoch": 0.7600324280502635, "grad_norm": 0.01719515211880207, "learning_rate": 0.00016199837859748683, "loss": 0.4532, "step": 3750 }, { "epoch": 0.7620591811917309, "grad_norm": 0.02170070819556713, "learning_rate": 0.00016189704094041348, "loss": 0.0023, "step": 3760 }, { "epoch": 0.7640859343331983, "grad_norm": 0.02049507014453411, "learning_rate": 0.0001617957032833401, "loss": 0.579, "step": 3770 }, { "epoch": 0.7661126874746655, "grad_norm": 0.0394478403031826, "learning_rate": 0.00016169436562626672, "loss": 0.3844, "step": 3780 }, { "epoch": 0.7681394406161329, "grad_norm": 0.029683848842978477, "learning_rate": 0.00016159302796919336, "loss": 0.2356, "step": 3790 }, { "epoch": 0.7701661937576003, "grad_norm": 0.027253972366452217, "learning_rate": 0.00016149169031211998, "loss": 0.0084, "step": 3800 }, { "epoch": 0.7701661937576003, "eval_accuracy": 0.9542124542124543, "eval_loss": 0.16977500915527344, "eval_runtime": 49.3971, "eval_samples_per_second": 11.053, "eval_steps_per_second": 1.397, "step": 3800 }, { "epoch": 0.7721929468990677, "grad_norm": 0.05158557370305061, "learning_rate": 0.00016139035265504663, "loss": 0.3413, "step": 3810 }, { "epoch": 0.7742197000405351, "grad_norm": 0.030439468100667, "learning_rate": 0.00016128901499797328, "loss": 0.5162, "step": 3820 }, { "epoch": 0.7762464531820025, "grad_norm": 0.12102861702442169, "learning_rate": 0.00016118767734089987, "loss": 0.3457, "step": 3830 }, { "epoch": 0.7782732063234697, "grad_norm": 0.0236958060413599, "learning_rate": 0.0001610863396838265, "loss": 0.0053, "step": 3840 }, { "epoch": 0.7802999594649371, "grad_norm": 0.08496245741844177, "learning_rate": 0.00016098500202675316, "loss": 0.2449, "step": 3850 }, { "epoch": 0.7823267126064045, "grad_norm": 0.03004252351820469, "learning_rate": 0.00016088366436967978, "loss": 0.525, "step": 3860 }, { "epoch": 0.7843534657478719, "grad_norm": 0.12477823346853256, "learning_rate": 0.0001607823267126064, "loss": 0.5057, "step": 3870 }, { "epoch": 0.7863802188893393, "grad_norm": 0.039420656859874725, "learning_rate": 0.00016068098905553304, "loss": 0.0204, "step": 3880 }, { "epoch": 0.7884069720308067, "grad_norm": 0.026279330253601074, "learning_rate": 0.00016057965139845966, "loss": 0.409, "step": 3890 }, { "epoch": 0.7904337251722741, "grad_norm": 0.020288709551095963, "learning_rate": 0.0001604783137413863, "loss": 0.0035, "step": 3900 }, { "epoch": 0.7904337251722741, "eval_accuracy": 0.9304029304029304, "eval_loss": 0.25412970781326294, "eval_runtime": 195.8099, "eval_samples_per_second": 2.788, "eval_steps_per_second": 0.352, "step": 3900 }, { "epoch": 0.7924604783137413, "grad_norm": 0.01888447254896164, "learning_rate": 0.00016037697608431296, "loss": 0.0032, "step": 3910 }, { "epoch": 0.7944872314552087, "grad_norm": 0.017191996797919273, "learning_rate": 0.00016027563842723955, "loss": 0.0196, "step": 3920 }, { "epoch": 0.7965139845966761, "grad_norm": 0.01510152593255043, "learning_rate": 0.0001601743007701662, "loss": 0.0021, "step": 3930 }, { "epoch": 0.7985407377381435, "grad_norm": 0.014952914789319038, "learning_rate": 0.00016007296311309284, "loss": 0.4915, "step": 3940 }, { "epoch": 0.8005674908796109, "grad_norm": 0.014625638723373413, "learning_rate": 0.00015997162545601946, "loss": 0.0787, "step": 3950 }, { "epoch": 0.8025942440210783, "grad_norm": 0.022355277091264725, "learning_rate": 0.0001598702877989461, "loss": 0.9008, "step": 3960 }, { "epoch": 0.8046209971625456, "grad_norm": 9.831883430480957, "learning_rate": 0.00015976895014187272, "loss": 1.1827, "step": 3970 }, { "epoch": 0.8066477503040129, "grad_norm": 0.06537922471761703, "learning_rate": 0.00015966761248479934, "loss": 0.0154, "step": 3980 }, { "epoch": 0.8086745034454803, "grad_norm": 0.025931930169463158, "learning_rate": 0.000159566274827726, "loss": 0.9364, "step": 3990 }, { "epoch": 0.8107012565869477, "grad_norm": 0.022608162835240364, "learning_rate": 0.00015946493717065264, "loss": 0.0137, "step": 4000 }, { "epoch": 0.8107012565869477, "eval_accuracy": 0.967032967032967, "eval_loss": 0.12349825352430344, "eval_runtime": 197.3061, "eval_samples_per_second": 2.767, "eval_steps_per_second": 0.35, "step": 4000 }, { "epoch": 0.8127280097284151, "grad_norm": 0.022211702540516853, "learning_rate": 0.00015936359951357925, "loss": 0.2224, "step": 4010 }, { "epoch": 0.8147547628698825, "grad_norm": 8.373774528503418, "learning_rate": 0.00015926226185650587, "loss": 0.4282, "step": 4020 }, { "epoch": 0.8167815160113499, "grad_norm": 0.02716963365674019, "learning_rate": 0.00015916092419943252, "loss": 0.0043, "step": 4030 }, { "epoch": 0.8188082691528172, "grad_norm": 0.019980791956186295, "learning_rate": 0.00015905958654235914, "loss": 0.048, "step": 4040 }, { "epoch": 0.8208350222942845, "grad_norm": 0.019544512033462524, "learning_rate": 0.00015895824888528579, "loss": 0.4089, "step": 4050 }, { "epoch": 0.8228617754357519, "grad_norm": 0.13491490483283997, "learning_rate": 0.0001588569112282124, "loss": 0.8966, "step": 4060 }, { "epoch": 0.8248885285772193, "grad_norm": 0.03310864046216011, "learning_rate": 0.00015875557357113905, "loss": 0.7088, "step": 4070 }, { "epoch": 0.8269152817186867, "grad_norm": 0.1253669708967209, "learning_rate": 0.00015865423591406567, "loss": 0.0569, "step": 4080 }, { "epoch": 0.828942034860154, "grad_norm": 0.028889838606119156, "learning_rate": 0.00015855289825699232, "loss": 0.008, "step": 4090 }, { "epoch": 0.8309687880016214, "grad_norm": 0.1096082478761673, "learning_rate": 0.00015845156059991894, "loss": 0.9026, "step": 4100 }, { "epoch": 0.8309687880016214, "eval_accuracy": 0.924908424908425, "eval_loss": 0.33190029859542847, "eval_runtime": 191.8064, "eval_samples_per_second": 2.847, "eval_steps_per_second": 0.36, "step": 4100 }, { "epoch": 0.8329955411430888, "grad_norm": 0.022014208137989044, "learning_rate": 0.00015835022294284555, "loss": 0.005, "step": 4110 }, { "epoch": 0.8350222942845561, "grad_norm": 8.200719833374023, "learning_rate": 0.0001582488852857722, "loss": 0.3836, "step": 4120 }, { "epoch": 0.8370490474260235, "grad_norm": 0.26717376708984375, "learning_rate": 0.00015814754762869885, "loss": 0.0054, "step": 4130 }, { "epoch": 0.8390758005674909, "grad_norm": 0.018976308405399323, "learning_rate": 0.00015804620997162547, "loss": 0.3529, "step": 4140 }, { "epoch": 0.8411025537089583, "grad_norm": 0.020883018150925636, "learning_rate": 0.0001579448723145521, "loss": 0.4956, "step": 4150 }, { "epoch": 0.8431293068504256, "grad_norm": 0.0398993194103241, "learning_rate": 0.00015784353465747873, "loss": 1.5973, "step": 4160 }, { "epoch": 0.845156059991893, "grad_norm": 1.495320439338684, "learning_rate": 0.00015774219700040535, "loss": 0.0245, "step": 4170 }, { "epoch": 0.8471828131333603, "grad_norm": 0.8191264867782593, "learning_rate": 0.000157640859343332, "loss": 0.0561, "step": 4180 }, { "epoch": 0.8492095662748277, "grad_norm": 0.01949804276227951, "learning_rate": 0.00015753952168625862, "loss": 0.0103, "step": 4190 }, { "epoch": 0.8512363194162951, "grad_norm": 0.02161702886223793, "learning_rate": 0.00015743818402918523, "loss": 0.4531, "step": 4200 }, { "epoch": 0.8512363194162951, "eval_accuracy": 0.9413919413919414, "eval_loss": 0.2221231460571289, "eval_runtime": 191.3883, "eval_samples_per_second": 2.853, "eval_steps_per_second": 0.361, "step": 4200 }, { "epoch": 0.8532630725577625, "grad_norm": 0.16009657084941864, "learning_rate": 0.00015733684637211188, "loss": 0.0062, "step": 4210 }, { "epoch": 0.8552898256992298, "grad_norm": 0.016462478786706924, "learning_rate": 0.00015723550871503853, "loss": 0.0053, "step": 4220 }, { "epoch": 0.8573165788406972, "grad_norm": 0.11075223982334137, "learning_rate": 0.00015713417105796515, "loss": 0.0055, "step": 4230 }, { "epoch": 0.8593433319821646, "grad_norm": 8.982980728149414, "learning_rate": 0.0001570328334008918, "loss": 0.9514, "step": 4240 }, { "epoch": 0.8613700851236319, "grad_norm": 0.023856502026319504, "learning_rate": 0.0001569314957438184, "loss": 0.2588, "step": 4250 }, { "epoch": 0.8633968382650993, "grad_norm": 0.025910574942827225, "learning_rate": 0.00015683015808674503, "loss": 0.0022, "step": 4260 }, { "epoch": 0.8654235914065667, "grad_norm": 0.017551589757204056, "learning_rate": 0.00015672882042967168, "loss": 0.0036, "step": 4270 }, { "epoch": 0.867450344548034, "grad_norm": 0.016716953366994858, "learning_rate": 0.00015662748277259832, "loss": 0.0411, "step": 4280 }, { "epoch": 0.8694770976895014, "grad_norm": 0.017348669469356537, "learning_rate": 0.00015652614511552494, "loss": 0.4721, "step": 4290 }, { "epoch": 0.8715038508309688, "grad_norm": 0.016933787614107132, "learning_rate": 0.00015642480745845156, "loss": 0.0039, "step": 4300 }, { "epoch": 0.8715038508309688, "eval_accuracy": 0.9560439560439561, "eval_loss": 0.18227943778038025, "eval_runtime": 177.8492, "eval_samples_per_second": 3.07, "eval_steps_per_second": 0.388, "step": 4300 }, { "epoch": 0.8735306039724362, "grad_norm": 0.016385329887270927, "learning_rate": 0.0001563234698013782, "loss": 0.0029, "step": 4310 }, { "epoch": 0.8755573571139035, "grad_norm": 0.015703881159424782, "learning_rate": 0.00015622213214430483, "loss": 0.0041, "step": 4320 }, { "epoch": 0.8775841102553709, "grad_norm": 0.01479947380721569, "learning_rate": 0.00015612079448723147, "loss": 0.0026, "step": 4330 }, { "epoch": 0.8796108633968382, "grad_norm": 0.10049838572740555, "learning_rate": 0.0001560194568301581, "loss": 0.4659, "step": 4340 }, { "epoch": 0.8816376165383056, "grad_norm": 0.014785503037273884, "learning_rate": 0.0001559181191730847, "loss": 0.0038, "step": 4350 }, { "epoch": 0.883664369679773, "grad_norm": 0.017938219010829926, "learning_rate": 0.00015581678151601136, "loss": 0.0041, "step": 4360 }, { "epoch": 0.8856911228212404, "grad_norm": 0.01392845343798399, "learning_rate": 0.000155715443858938, "loss": 0.0016, "step": 4370 }, { "epoch": 0.8877178759627078, "grad_norm": 0.013149906881153584, "learning_rate": 0.00015561410620186462, "loss": 0.0016, "step": 4380 }, { "epoch": 0.8897446291041751, "grad_norm": 0.012668000534176826, "learning_rate": 0.00015551276854479124, "loss": 0.0029, "step": 4390 }, { "epoch": 0.8917713822456425, "grad_norm": 0.013916357420384884, "learning_rate": 0.0001554114308877179, "loss": 1.3298, "step": 4400 }, { "epoch": 0.8917713822456425, "eval_accuracy": 0.9542124542124543, "eval_loss": 0.21251553297042847, "eval_runtime": 44.7327, "eval_samples_per_second": 12.206, "eval_steps_per_second": 1.542, "step": 4400 }, { "epoch": 0.8937981353871098, "grad_norm": 8.101142883300781, "learning_rate": 0.0001553100932306445, "loss": 0.3703, "step": 4410 }, { "epoch": 0.8958248885285772, "grad_norm": 0.024000072851777077, "learning_rate": 0.00015520875557357115, "loss": 0.0048, "step": 4420 }, { "epoch": 0.8978516416700446, "grad_norm": 0.12596918642520905, "learning_rate": 0.0001551074179164978, "loss": 1.2688, "step": 4430 }, { "epoch": 0.899878394811512, "grad_norm": 0.021065017208456993, "learning_rate": 0.0001550060802594244, "loss": 0.6219, "step": 4440 }, { "epoch": 0.9019051479529794, "grad_norm": 0.017869850620627403, "learning_rate": 0.00015490474260235104, "loss": 0.0409, "step": 4450 }, { "epoch": 0.9039319010944467, "grad_norm": 0.016384383663535118, "learning_rate": 0.00015480340494527768, "loss": 0.0156, "step": 4460 }, { "epoch": 0.905958654235914, "grad_norm": 0.0144584272056818, "learning_rate": 0.0001547020672882043, "loss": 0.2851, "step": 4470 }, { "epoch": 0.9079854073773814, "grad_norm": 0.01379258744418621, "learning_rate": 0.00015460072963113092, "loss": 0.0049, "step": 4480 }, { "epoch": 0.9100121605188488, "grad_norm": 0.01782505214214325, "learning_rate": 0.00015449939197405757, "loss": 0.5712, "step": 4490 }, { "epoch": 0.9120389136603162, "grad_norm": 0.2501247525215149, "learning_rate": 0.0001543980543169842, "loss": 0.4403, "step": 4500 }, { "epoch": 0.9120389136603162, "eval_accuracy": 0.8937728937728938, "eval_loss": 0.49004384875297546, "eval_runtime": 49.7342, "eval_samples_per_second": 10.978, "eval_steps_per_second": 1.387, "step": 4500 }, { "epoch": 0.9140656668017836, "grad_norm": 0.017710473388433456, "learning_rate": 0.00015429671665991083, "loss": 0.2776, "step": 4510 }, { "epoch": 0.9160924199432509, "grad_norm": 0.018590105697512627, "learning_rate": 0.00015419537900283748, "loss": 0.3543, "step": 4520 }, { "epoch": 0.9181191730847182, "grad_norm": 0.08836981654167175, "learning_rate": 0.00015409404134576407, "loss": 1.3144, "step": 4530 }, { "epoch": 0.9201459262261856, "grad_norm": 0.7396237850189209, "learning_rate": 0.00015399270368869072, "loss": 0.7759, "step": 4540 }, { "epoch": 0.922172679367653, "grad_norm": 0.04432849586009979, "learning_rate": 0.00015389136603161736, "loss": 0.3832, "step": 4550 }, { "epoch": 0.9241994325091204, "grad_norm": 0.030519770458340645, "learning_rate": 0.00015379002837454398, "loss": 0.3436, "step": 4560 }, { "epoch": 0.9262261856505878, "grad_norm": 0.033803071826696396, "learning_rate": 0.00015368869071747063, "loss": 0.0147, "step": 4570 }, { "epoch": 0.9282529387920552, "grad_norm": 0.022388586774468422, "learning_rate": 0.00015358735306039725, "loss": 0.0106, "step": 4580 }, { "epoch": 0.9302796919335224, "grad_norm": 0.017862534150481224, "learning_rate": 0.00015348601540332387, "loss": 0.0053, "step": 4590 }, { "epoch": 0.9323064450749898, "grad_norm": 0.016625819727778435, "learning_rate": 0.0001533846777462505, "loss": 0.0025, "step": 4600 }, { "epoch": 0.9323064450749898, "eval_accuracy": 0.924908424908425, "eval_loss": 0.3010227382183075, "eval_runtime": 47.4421, "eval_samples_per_second": 11.509, "eval_steps_per_second": 1.454, "step": 4600 }, { "epoch": 0.9343331982164572, "grad_norm": 0.08215595036745071, "learning_rate": 0.00015328334008917716, "loss": 0.0045, "step": 4610 }, { "epoch": 0.9363599513579246, "grad_norm": 0.013064803555607796, "learning_rate": 0.00015318200243210378, "loss": 0.0085, "step": 4620 }, { "epoch": 0.938386704499392, "grad_norm": 0.11635798960924149, "learning_rate": 0.0001530806647750304, "loss": 0.0041, "step": 4630 }, { "epoch": 0.9404134576408594, "grad_norm": 0.012454268522560596, "learning_rate": 0.00015297932711795704, "loss": 0.0062, "step": 4640 }, { "epoch": 0.9424402107823268, "grad_norm": 0.08445250242948532, "learning_rate": 0.00015287798946088366, "loss": 0.0033, "step": 4650 }, { "epoch": 0.944466963923794, "grad_norm": 0.009565513581037521, "learning_rate": 0.0001527766518038103, "loss": 0.0016, "step": 4660 }, { "epoch": 0.9464937170652614, "grad_norm": 0.009623161517083645, "learning_rate": 0.00015267531414673693, "loss": 0.6146, "step": 4670 }, { "epoch": 0.9485204702067288, "grad_norm": 0.046667005866765976, "learning_rate": 0.00015257397648966357, "loss": 0.4766, "step": 4680 }, { "epoch": 0.9505472233481962, "grad_norm": 12.367220878601074, "learning_rate": 0.0001524726388325902, "loss": 0.8669, "step": 4690 }, { "epoch": 0.9525739764896636, "grad_norm": 0.16352809965610504, "learning_rate": 0.00015237130117551684, "loss": 0.0056, "step": 4700 }, { "epoch": 0.9525739764896636, "eval_accuracy": 0.9267399267399268, "eval_loss": 0.29778388142585754, "eval_runtime": 50.4375, "eval_samples_per_second": 10.825, "eval_steps_per_second": 1.368, "step": 4700 }, { "epoch": 0.954600729631131, "grad_norm": 0.012875114567577839, "learning_rate": 0.00015226996351844346, "loss": 1.1458, "step": 4710 }, { "epoch": 0.9566274827725983, "grad_norm": 0.280487060546875, "learning_rate": 0.00015216862586137008, "loss": 0.0048, "step": 4720 }, { "epoch": 0.9586542359140656, "grad_norm": 0.014041159301996231, "learning_rate": 0.00015206728820429672, "loss": 0.0014, "step": 4730 }, { "epoch": 0.960680989055533, "grad_norm": 0.26647835969924927, "learning_rate": 0.00015196595054722337, "loss": 0.5942, "step": 4740 }, { "epoch": 0.9627077421970004, "grad_norm": 0.4411877691745758, "learning_rate": 0.00015186461289015, "loss": 1.0128, "step": 4750 }, { "epoch": 0.9647344953384678, "grad_norm": 0.20091605186462402, "learning_rate": 0.00015176327523307664, "loss": 0.4349, "step": 4760 }, { "epoch": 0.9667612484799352, "grad_norm": 0.026517393067479134, "learning_rate": 0.00015166193757600325, "loss": 0.3691, "step": 4770 }, { "epoch": 0.9687880016214026, "grad_norm": 0.02195628732442856, "learning_rate": 0.00015156059991892987, "loss": 0.4035, "step": 4780 }, { "epoch": 0.9708147547628699, "grad_norm": 0.020767178386449814, "learning_rate": 0.00015145926226185652, "loss": 0.3917, "step": 4790 }, { "epoch": 0.9728415079043372, "grad_norm": 8.028557777404785, "learning_rate": 0.00015135792460478314, "loss": 0.3642, "step": 4800 }, { "epoch": 0.9728415079043372, "eval_accuracy": 0.945054945054945, "eval_loss": 0.21620576083660126, "eval_runtime": 48.6161, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.419, "step": 4800 }, { "epoch": 0.9748682610458046, "grad_norm": 0.03613073378801346, "learning_rate": 0.00015125658694770976, "loss": 0.3875, "step": 4810 }, { "epoch": 0.976895014187272, "grad_norm": 0.391535222530365, "learning_rate": 0.0001511552492906364, "loss": 0.0072, "step": 4820 }, { "epoch": 0.9789217673287394, "grad_norm": 0.020091639831662178, "learning_rate": 0.00015105391163356305, "loss": 0.0022, "step": 4830 }, { "epoch": 0.9809485204702068, "grad_norm": 0.012985313311219215, "learning_rate": 0.00015095257397648967, "loss": 0.0087, "step": 4840 }, { "epoch": 0.9829752736116741, "grad_norm": 0.013414951972663403, "learning_rate": 0.00015085123631941632, "loss": 0.0016, "step": 4850 }, { "epoch": 0.9850020267531414, "grad_norm": 9.259086608886719, "learning_rate": 0.00015074989866234294, "loss": 0.6416, "step": 4860 }, { "epoch": 0.9870287798946088, "grad_norm": 0.02051294408738613, "learning_rate": 0.00015064856100526955, "loss": 0.002, "step": 4870 }, { "epoch": 0.9890555330360762, "grad_norm": 0.015335061587393284, "learning_rate": 0.0001505472233481962, "loss": 0.0025, "step": 4880 }, { "epoch": 0.9910822861775436, "grad_norm": 0.11195827275514603, "learning_rate": 0.00015044588569112285, "loss": 0.0069, "step": 4890 }, { "epoch": 0.993109039319011, "grad_norm": 0.01335314754396677, "learning_rate": 0.00015034454803404947, "loss": 0.5704, "step": 4900 }, { "epoch": 0.993109039319011, "eval_accuracy": 0.9413919413919414, "eval_loss": 0.24586544930934906, "eval_runtime": 50.4198, "eval_samples_per_second": 10.829, "eval_steps_per_second": 1.369, "step": 4900 }, { "epoch": 0.9951357924604783, "grad_norm": 0.034988854080438614, "learning_rate": 0.00015024321037697608, "loss": 0.0083, "step": 4910 }, { "epoch": 0.9971625456019457, "grad_norm": 0.08549095690250397, "learning_rate": 0.00015014187271990273, "loss": 0.4143, "step": 4920 }, { "epoch": 0.999189298743413, "grad_norm": 7.6238579750061035, "learning_rate": 0.00015004053506282935, "loss": 0.6845, "step": 4930 }, { "epoch": 1.0012160518848805, "grad_norm": 0.051581960171461105, "learning_rate": 0.000149939197405756, "loss": 0.0079, "step": 4940 }, { "epoch": 1.0032428050263478, "grad_norm": 9.605396270751953, "learning_rate": 0.00014983785974868262, "loss": 0.3086, "step": 4950 }, { "epoch": 1.0052695581678153, "grad_norm": 0.012973079457879066, "learning_rate": 0.00014973652209160923, "loss": 0.3138, "step": 4960 }, { "epoch": 1.0072963113092825, "grad_norm": 0.012983009219169617, "learning_rate": 0.00014963518443453588, "loss": 0.0084, "step": 4970 }, { "epoch": 1.0093230644507498, "grad_norm": 0.01481639128178358, "learning_rate": 0.00014953384677746253, "loss": 0.0036, "step": 4980 }, { "epoch": 1.0113498175922173, "grad_norm": 0.019080623984336853, "learning_rate": 0.00014943250912038915, "loss": 0.0028, "step": 4990 }, { "epoch": 1.0133765707336846, "grad_norm": 0.011588463559746742, "learning_rate": 0.00014933117146331577, "loss": 0.1761, "step": 5000 }, { "epoch": 1.0133765707336846, "eval_accuracy": 0.9652014652014652, "eval_loss": 0.16739189624786377, "eval_runtime": 48.1195, "eval_samples_per_second": 11.347, "eval_steps_per_second": 1.434, "step": 5000 }, { "epoch": 1.015403323875152, "grad_norm": 0.06948883086442947, "learning_rate": 0.0001492298338062424, "loss": 0.0203, "step": 5010 }, { "epoch": 1.0174300770166194, "grad_norm": 0.010312282480299473, "learning_rate": 0.00014912849614916903, "loss": 0.1966, "step": 5020 }, { "epoch": 1.0194568301580866, "grad_norm": 0.03138704597949982, "learning_rate": 0.00014902715849209568, "loss": 0.6573, "step": 5030 }, { "epoch": 1.0214835832995541, "grad_norm": 0.03172450512647629, "learning_rate": 0.00014892582083502232, "loss": 0.0043, "step": 5040 }, { "epoch": 1.0235103364410214, "grad_norm": 0.06547462940216064, "learning_rate": 0.00014882448317794891, "loss": 0.0034, "step": 5050 }, { "epoch": 1.025537089582489, "grad_norm": 0.023958025500178337, "learning_rate": 0.00014872314552087556, "loss": 0.6314, "step": 5060 }, { "epoch": 1.0275638427239562, "grad_norm": 0.022199515253305435, "learning_rate": 0.0001486218078638022, "loss": 0.0052, "step": 5070 }, { "epoch": 1.0295905958654237, "grad_norm": 0.023740138858556747, "learning_rate": 0.00014852047020672883, "loss": 0.0033, "step": 5080 }, { "epoch": 1.031617349006891, "grad_norm": 0.021390412002801895, "learning_rate": 0.00014841913254965545, "loss": 0.0041, "step": 5090 }, { "epoch": 1.0336441021483582, "grad_norm": 0.02551642619073391, "learning_rate": 0.0001483177948925821, "loss": 0.0023, "step": 5100 }, { "epoch": 1.0336441021483582, "eval_accuracy": 0.9542124542124543, "eval_loss": 0.18546713888645172, "eval_runtime": 50.6035, "eval_samples_per_second": 10.79, "eval_steps_per_second": 1.364, "step": 5100 }, { "epoch": 1.0356708552898257, "grad_norm": 0.011156442575156689, "learning_rate": 0.0001482164572355087, "loss": 0.0023, "step": 5110 }, { "epoch": 1.037697608431293, "grad_norm": 0.051858507096767426, "learning_rate": 0.00014811511957843536, "loss": 0.0026, "step": 5120 }, { "epoch": 1.0397243615727605, "grad_norm": 0.011161372996866703, "learning_rate": 0.000148013781921362, "loss": 0.0013, "step": 5130 }, { "epoch": 1.0417511147142278, "grad_norm": 0.009919494390487671, "learning_rate": 0.0001479124442642886, "loss": 0.0016, "step": 5140 }, { "epoch": 1.0437778678556953, "grad_norm": 0.008916059508919716, "learning_rate": 0.00014781110660721524, "loss": 0.0018, "step": 5150 }, { "epoch": 1.0458046209971625, "grad_norm": 0.009410860016942024, "learning_rate": 0.0001477097689501419, "loss": 0.0017, "step": 5160 }, { "epoch": 1.0478313741386298, "grad_norm": 0.008212919346988201, "learning_rate": 0.0001476084312930685, "loss": 0.001, "step": 5170 }, { "epoch": 1.0498581272800973, "grad_norm": 0.0102662593126297, "learning_rate": 0.00014750709363599515, "loss": 0.0016, "step": 5180 }, { "epoch": 1.0518848804215646, "grad_norm": 0.008377410471439362, "learning_rate": 0.00014740575597892177, "loss": 0.5981, "step": 5190 }, { "epoch": 1.053911633563032, "grad_norm": 0.011341779492795467, "learning_rate": 0.0001473044183218484, "loss": 0.1477, "step": 5200 }, { "epoch": 1.053911633563032, "eval_accuracy": 0.9652014652014652, "eval_loss": 0.15164771676063538, "eval_runtime": 50.7744, "eval_samples_per_second": 10.753, "eval_steps_per_second": 1.359, "step": 5200 }, { "epoch": 1.0559383867044994, "grad_norm": 0.0325639471411705, "learning_rate": 0.00014720308066477504, "loss": 0.1064, "step": 5210 }, { "epoch": 1.0579651398459669, "grad_norm": 0.03364536911249161, "learning_rate": 0.00014710174300770168, "loss": 0.5205, "step": 5220 }, { "epoch": 1.0599918929874341, "grad_norm": 0.034866757690906525, "learning_rate": 0.0001470004053506283, "loss": 0.0129, "step": 5230 }, { "epoch": 1.0620186461289014, "grad_norm": 0.014683687128126621, "learning_rate": 0.00014689906769355492, "loss": 0.3104, "step": 5240 }, { "epoch": 1.064045399270369, "grad_norm": 0.03544352203607559, "learning_rate": 0.00014679773003648157, "loss": 0.9745, "step": 5250 }, { "epoch": 1.0660721524118362, "grad_norm": 0.7537864446640015, "learning_rate": 0.0001466963923794082, "loss": 0.0038, "step": 5260 }, { "epoch": 1.0680989055533037, "grad_norm": 0.013130277395248413, "learning_rate": 0.00014659505472233483, "loss": 0.2515, "step": 5270 }, { "epoch": 1.070125658694771, "grad_norm": 0.013951468281447887, "learning_rate": 0.00014649371706526145, "loss": 0.5514, "step": 5280 }, { "epoch": 1.0721524118362384, "grad_norm": 0.013629689812660217, "learning_rate": 0.0001463923794081881, "loss": 0.3022, "step": 5290 }, { "epoch": 1.0741791649777057, "grad_norm": 0.016600918024778366, "learning_rate": 0.00014629104175111472, "loss": 0.0034, "step": 5300 }, { "epoch": 1.0741791649777057, "eval_accuracy": 0.7326007326007326, "eval_loss": 0.81169593334198, "eval_runtime": 50.4513, "eval_samples_per_second": 10.822, "eval_steps_per_second": 1.368, "step": 5300 }, { "epoch": 1.076205918119173, "grad_norm": 8.864251136779785, "learning_rate": 0.00014618970409404136, "loss": 0.8189, "step": 5310 }, { "epoch": 1.0782326712606405, "grad_norm": 0.012856297194957733, "learning_rate": 0.00014608836643696798, "loss": 0.0027, "step": 5320 }, { "epoch": 1.0802594244021078, "grad_norm": 0.07995825260877609, "learning_rate": 0.0001459870287798946, "loss": 0.0022, "step": 5330 }, { "epoch": 1.0822861775435753, "grad_norm": 0.0793415978550911, "learning_rate": 0.00014588569112282125, "loss": 0.168, "step": 5340 }, { "epoch": 1.0843129306850425, "grad_norm": 0.010782618075609207, "learning_rate": 0.00014578435346574787, "loss": 0.0052, "step": 5350 }, { "epoch": 1.08633968382651, "grad_norm": 0.01051894761621952, "learning_rate": 0.0001456830158086745, "loss": 0.4624, "step": 5360 }, { "epoch": 1.0883664369679773, "grad_norm": 0.013191080652177334, "learning_rate": 0.00014558167815160116, "loss": 1.1017, "step": 5370 }, { "epoch": 1.0903931901094446, "grad_norm": 0.018109530210494995, "learning_rate": 0.00014548034049452778, "loss": 0.0047, "step": 5380 }, { "epoch": 1.092419943250912, "grad_norm": 0.018270637840032578, "learning_rate": 0.0001453790028374544, "loss": 0.0019, "step": 5390 }, { "epoch": 1.0944466963923793, "grad_norm": 0.020311271771788597, "learning_rate": 0.00014527766518038104, "loss": 0.4936, "step": 5400 }, { "epoch": 1.0944466963923793, "eval_accuracy": 0.9377289377289377, "eval_loss": 0.21022464334964752, "eval_runtime": 50.3508, "eval_samples_per_second": 10.844, "eval_steps_per_second": 1.37, "step": 5400 }, { "epoch": 1.0964734495338468, "grad_norm": 0.07982522994279861, "learning_rate": 0.00014517632752330766, "loss": 0.9046, "step": 5410 }, { "epoch": 1.0985002026753141, "grad_norm": 0.1394471377134323, "learning_rate": 0.00014507498986623428, "loss": 0.3592, "step": 5420 }, { "epoch": 1.1005269558167816, "grad_norm": 0.03570636734366417, "learning_rate": 0.00014497365220916093, "loss": 0.3656, "step": 5430 }, { "epoch": 1.102553708958249, "grad_norm": 0.01714455708861351, "learning_rate": 0.00014487231455208757, "loss": 0.0669, "step": 5440 }, { "epoch": 1.1045804620997162, "grad_norm": 0.013243530876934528, "learning_rate": 0.0001447709768950142, "loss": 0.002, "step": 5450 }, { "epoch": 1.1066072152411837, "grad_norm": 0.08380251377820969, "learning_rate": 0.00014466963923794084, "loss": 0.0048, "step": 5460 }, { "epoch": 1.108633968382651, "grad_norm": 0.10041346400976181, "learning_rate": 0.00014456830158086746, "loss": 0.6733, "step": 5470 }, { "epoch": 1.1106607215241184, "grad_norm": 0.018171994015574455, "learning_rate": 0.00014446696392379408, "loss": 0.0029, "step": 5480 }, { "epoch": 1.1126874746655857, "grad_norm": 0.02070717327296734, "learning_rate": 0.00014436562626672072, "loss": 0.3299, "step": 5490 }, { "epoch": 1.1147142278070532, "grad_norm": 0.014606601558625698, "learning_rate": 0.00014426428860964737, "loss": 0.0158, "step": 5500 }, { "epoch": 1.1147142278070532, "eval_accuracy": 0.9523809523809523, "eval_loss": 0.18859973549842834, "eval_runtime": 48.3094, "eval_samples_per_second": 11.302, "eval_steps_per_second": 1.428, "step": 5500 }, { "epoch": 1.1167409809485205, "grad_norm": 0.01225406676530838, "learning_rate": 0.000144162950952574, "loss": 0.0015, "step": 5510 }, { "epoch": 1.1187677340899878, "grad_norm": 0.011000520549714565, "learning_rate": 0.0001440616132955006, "loss": 0.0194, "step": 5520 }, { "epoch": 1.1207944872314552, "grad_norm": 0.011146724224090576, "learning_rate": 0.00014396027563842725, "loss": 0.0028, "step": 5530 }, { "epoch": 1.1228212403729225, "grad_norm": 0.009962116368114948, "learning_rate": 0.00014385893798135387, "loss": 0.0026, "step": 5540 }, { "epoch": 1.12484799351439, "grad_norm": 0.01039500255137682, "learning_rate": 0.00014375760032428052, "loss": 0.4091, "step": 5550 }, { "epoch": 1.1268747466558573, "grad_norm": 0.00947809312492609, "learning_rate": 0.00014365626266720714, "loss": 0.0041, "step": 5560 }, { "epoch": 1.1289014997973248, "grad_norm": 0.01036769337952137, "learning_rate": 0.00014355492501013376, "loss": 0.331, "step": 5570 }, { "epoch": 1.130928252938792, "grad_norm": 18.621246337890625, "learning_rate": 0.0001434535873530604, "loss": 0.83, "step": 5580 }, { "epoch": 1.1329550060802593, "grad_norm": 0.24284754693508148, "learning_rate": 0.00014335224969598705, "loss": 0.019, "step": 5590 }, { "epoch": 1.1349817592217268, "grad_norm": 0.016728529706597328, "learning_rate": 0.00014325091203891367, "loss": 0.0041, "step": 5600 }, { "epoch": 1.1349817592217268, "eval_accuracy": 0.9285714285714286, "eval_loss": 0.25444215536117554, "eval_runtime": 50.4897, "eval_samples_per_second": 10.814, "eval_steps_per_second": 1.367, "step": 5600 }, { "epoch": 1.1370085123631941, "grad_norm": 0.010051474906504154, "learning_rate": 0.0001431495743818403, "loss": 0.0043, "step": 5610 }, { "epoch": 1.1390352655046616, "grad_norm": 0.008353658951818943, "learning_rate": 0.00014304823672476694, "loss": 0.0021, "step": 5620 }, { "epoch": 1.1410620186461289, "grad_norm": 0.027239592745900154, "learning_rate": 0.00014294689906769355, "loss": 0.3325, "step": 5630 }, { "epoch": 1.1430887717875962, "grad_norm": 0.00991065800189972, "learning_rate": 0.0001428455614106202, "loss": 0.0096, "step": 5640 }, { "epoch": 1.1451155249290637, "grad_norm": 0.01124146580696106, "learning_rate": 0.00014274422375354685, "loss": 0.7095, "step": 5650 }, { "epoch": 1.147142278070531, "grad_norm": 0.009897377341985703, "learning_rate": 0.00014264288609647344, "loss": 0.0032, "step": 5660 }, { "epoch": 1.1491690312119984, "grad_norm": 0.009127453900873661, "learning_rate": 0.00014254154843940008, "loss": 0.006, "step": 5670 }, { "epoch": 1.1511957843534657, "grad_norm": 0.009598911739885807, "learning_rate": 0.00014244021078232673, "loss": 0.0014, "step": 5680 }, { "epoch": 1.1532225374949332, "grad_norm": 0.033580176532268524, "learning_rate": 0.00014233887312525335, "loss": 0.0021, "step": 5690 }, { "epoch": 1.1552492906364005, "grad_norm": 0.012398588471114635, "learning_rate": 0.00014223753546817997, "loss": 0.7993, "step": 5700 }, { "epoch": 1.1552492906364005, "eval_accuracy": 0.9304029304029304, "eval_loss": 0.25227564573287964, "eval_runtime": 50.1813, "eval_samples_per_second": 10.881, "eval_steps_per_second": 1.375, "step": 5700 }, { "epoch": 1.157276043777868, "grad_norm": 0.021187039092183113, "learning_rate": 0.00014213619781110662, "loss": 0.0019, "step": 5710 }, { "epoch": 1.1593027969193352, "grad_norm": 0.012165443040430546, "learning_rate": 0.00014203486015403323, "loss": 0.0134, "step": 5720 }, { "epoch": 1.1613295500608025, "grad_norm": 0.009807312861084938, "learning_rate": 0.00014193352249695988, "loss": 0.2478, "step": 5730 }, { "epoch": 1.16335630320227, "grad_norm": 0.009756849147379398, "learning_rate": 0.00014183218483988653, "loss": 0.0014, "step": 5740 }, { "epoch": 1.1653830563437373, "grad_norm": 0.008829077705740929, "learning_rate": 0.00014173084718281312, "loss": 0.0012, "step": 5750 }, { "epoch": 1.1674098094852048, "grad_norm": 0.08207603543996811, "learning_rate": 0.00014162950952573977, "loss": 0.0021, "step": 5760 }, { "epoch": 1.169436562626672, "grad_norm": 10.163952827453613, "learning_rate": 0.0001415281718686664, "loss": 1.0266, "step": 5770 }, { "epoch": 1.1714633157681393, "grad_norm": 0.03201476112008095, "learning_rate": 0.00014142683421159303, "loss": 0.2121, "step": 5780 }, { "epoch": 1.1734900689096068, "grad_norm": 0.032649170607328415, "learning_rate": 0.00014132549655451968, "loss": 0.6479, "step": 5790 }, { "epoch": 1.175516822051074, "grad_norm": 4.468822002410889, "learning_rate": 0.0001412241588974463, "loss": 0.6292, "step": 5800 }, { "epoch": 1.175516822051074, "eval_accuracy": 0.945054945054945, "eval_loss": 0.1681230217218399, "eval_runtime": 47.0611, "eval_samples_per_second": 11.602, "eval_steps_per_second": 1.466, "step": 5800 }, { "epoch": 1.1775435751925416, "grad_norm": 0.01576576940715313, "learning_rate": 0.00014112282124037291, "loss": 0.0021, "step": 5810 }, { "epoch": 1.1795703283340089, "grad_norm": 0.011586645618081093, "learning_rate": 0.00014102148358329956, "loss": 0.5428, "step": 5820 }, { "epoch": 1.1815970814754764, "grad_norm": 0.1455240249633789, "learning_rate": 0.0001409201459262262, "loss": 0.0091, "step": 5830 }, { "epoch": 1.1836238346169436, "grad_norm": 0.014510289765894413, "learning_rate": 0.00014081880826915283, "loss": 0.0022, "step": 5840 }, { "epoch": 1.1856505877584111, "grad_norm": 0.020773572847247124, "learning_rate": 0.00014071747061207945, "loss": 1.1927, "step": 5850 }, { "epoch": 1.1876773408998784, "grad_norm": 10.849150657653809, "learning_rate": 0.0001406161329550061, "loss": 0.448, "step": 5860 }, { "epoch": 1.1897040940413457, "grad_norm": 22.743642807006836, "learning_rate": 0.0001405147952979327, "loss": 0.4768, "step": 5870 }, { "epoch": 1.1917308471828132, "grad_norm": 0.020906543359160423, "learning_rate": 0.00014041345764085936, "loss": 0.0059, "step": 5880 }, { "epoch": 1.1937576003242805, "grad_norm": 0.017099998891353607, "learning_rate": 0.00014031211998378598, "loss": 0.1922, "step": 5890 }, { "epoch": 1.195784353465748, "grad_norm": 0.013171990402042866, "learning_rate": 0.0001402107823267126, "loss": 0.0048, "step": 5900 }, { "epoch": 1.195784353465748, "eval_accuracy": 0.9377289377289377, "eval_loss": 0.2745637595653534, "eval_runtime": 50.3331, "eval_samples_per_second": 10.848, "eval_steps_per_second": 1.371, "step": 5900 }, { "epoch": 1.1978111066072152, "grad_norm": 1.8413903713226318, "learning_rate": 0.00014010944466963924, "loss": 0.0059, "step": 5910 }, { "epoch": 1.1998378597486825, "grad_norm": 13.45578670501709, "learning_rate": 0.0001400081070125659, "loss": 1.215, "step": 5920 }, { "epoch": 1.20186461289015, "grad_norm": 0.026761075481772423, "learning_rate": 0.0001399067693554925, "loss": 0.0028, "step": 5930 }, { "epoch": 1.2038913660316173, "grad_norm": 0.10492666810750961, "learning_rate": 0.00013980543169841913, "loss": 0.4688, "step": 5940 }, { "epoch": 1.2059181191730848, "grad_norm": 0.02013002149760723, "learning_rate": 0.00013970409404134577, "loss": 0.0057, "step": 5950 }, { "epoch": 1.207944872314552, "grad_norm": 0.01581076718866825, "learning_rate": 0.0001396027563842724, "loss": 0.0474, "step": 5960 }, { "epoch": 1.2099716254560196, "grad_norm": 0.019453393295407295, "learning_rate": 0.00013950141872719904, "loss": 0.2017, "step": 5970 }, { "epoch": 1.2119983785974868, "grad_norm": 0.013611565344035625, "learning_rate": 0.00013940008107012566, "loss": 0.1209, "step": 5980 }, { "epoch": 1.214025131738954, "grad_norm": 0.012966940179467201, "learning_rate": 0.0001392987434130523, "loss": 0.005, "step": 5990 }, { "epoch": 1.2160518848804216, "grad_norm": 8.436431884765625, "learning_rate": 0.00013919740575597892, "loss": 0.4908, "step": 6000 }, { "epoch": 1.2160518848804216, "eval_accuracy": 0.9358974358974359, "eval_loss": 0.31936949491500854, "eval_runtime": 50.4877, "eval_samples_per_second": 10.815, "eval_steps_per_second": 1.367, "step": 6000 }, { "epoch": 1.2180786380218889, "grad_norm": 0.012801182456314564, "learning_rate": 0.00013909606809890557, "loss": 0.5063, "step": 6010 }, { "epoch": 1.2201053911633564, "grad_norm": 0.012545096687972546, "learning_rate": 0.0001389947304418322, "loss": 0.0014, "step": 6020 }, { "epoch": 1.2221321443048236, "grad_norm": 0.1367848515510559, "learning_rate": 0.0001388933927847588, "loss": 0.0059, "step": 6030 }, { "epoch": 1.2241588974462911, "grad_norm": 0.011788008734583855, "learning_rate": 0.00013879205512768545, "loss": 0.0029, "step": 6040 }, { "epoch": 1.2261856505877584, "grad_norm": 0.13413147628307343, "learning_rate": 0.0001386907174706121, "loss": 0.0053, "step": 6050 }, { "epoch": 1.2282124037292257, "grad_norm": 0.0873718187212944, "learning_rate": 0.00013858937981353872, "loss": 0.0046, "step": 6060 }, { "epoch": 1.2302391568706932, "grad_norm": 0.0356074720621109, "learning_rate": 0.00013848804215646536, "loss": 0.6459, "step": 6070 }, { "epoch": 1.2322659100121605, "grad_norm": 0.07616440206766129, "learning_rate": 0.00013838670449939198, "loss": 0.0242, "step": 6080 }, { "epoch": 1.234292663153628, "grad_norm": 0.013825983740389347, "learning_rate": 0.0001382853668423186, "loss": 0.2924, "step": 6090 }, { "epoch": 1.2363194162950952, "grad_norm": 0.05042441561818123, "learning_rate": 0.00013818402918524525, "loss": 0.4156, "step": 6100 }, { "epoch": 1.2363194162950952, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.13203047215938568, "eval_runtime": 50.4313, "eval_samples_per_second": 10.827, "eval_steps_per_second": 1.368, "step": 6100 }, { "epoch": 1.2383461694365625, "grad_norm": 0.013852720148861408, "learning_rate": 0.0001380826915281719, "loss": 0.3545, "step": 6110 }, { "epoch": 1.24037292257803, "grad_norm": 0.012885493226349354, "learning_rate": 0.0001379813538710985, "loss": 0.0035, "step": 6120 }, { "epoch": 1.2423996757194973, "grad_norm": 0.029491251334547997, "learning_rate": 0.00013788001621402513, "loss": 0.0025, "step": 6130 }, { "epoch": 1.2444264288609648, "grad_norm": 0.011255360208451748, "learning_rate": 0.00013777867855695178, "loss": 0.0023, "step": 6140 }, { "epoch": 1.246453182002432, "grad_norm": 0.02658005803823471, "learning_rate": 0.0001376773408998784, "loss": 0.0017, "step": 6150 }, { "epoch": 1.2484799351438995, "grad_norm": 0.009282359853386879, "learning_rate": 0.00013757600324280504, "loss": 0.0012, "step": 6160 }, { "epoch": 1.2505066882853668, "grad_norm": 0.00913665909320116, "learning_rate": 0.00013747466558573166, "loss": 0.6614, "step": 6170 }, { "epoch": 1.2525334414268343, "grad_norm": 0.011567773297429085, "learning_rate": 0.00013737332792865828, "loss": 0.6064, "step": 6180 }, { "epoch": 1.2545601945683016, "grad_norm": 0.019871024414896965, "learning_rate": 0.00013727199027158493, "loss": 0.0037, "step": 6190 }, { "epoch": 1.2565869477097689, "grad_norm": 0.013100901618599892, "learning_rate": 0.00013717065261451157, "loss": 0.0056, "step": 6200 }, { "epoch": 1.2565869477097689, "eval_accuracy": 0.8992673992673993, "eval_loss": 0.3195291757583618, "eval_runtime": 49.1699, "eval_samples_per_second": 11.104, "eval_steps_per_second": 1.403, "step": 6200 }, { "epoch": 1.2586137008512364, "grad_norm": 9.371114730834961, "learning_rate": 0.0001370693149574382, "loss": 0.2707, "step": 6210 }, { "epoch": 1.2606404539927036, "grad_norm": 0.009086466394364834, "learning_rate": 0.0001369679773003648, "loss": 0.116, "step": 6220 }, { "epoch": 1.2626672071341711, "grad_norm": 0.3528311252593994, "learning_rate": 0.00013686663964329146, "loss": 0.0044, "step": 6230 }, { "epoch": 1.2646939602756384, "grad_norm": 0.009217889979481697, "learning_rate": 0.00013676530198621808, "loss": 0.0816, "step": 6240 }, { "epoch": 1.2667207134171057, "grad_norm": 0.008680080063641071, "learning_rate": 0.00013666396432914472, "loss": 0.4132, "step": 6250 }, { "epoch": 1.2687474665585732, "grad_norm": 0.008342086337506771, "learning_rate": 0.00013656262667207137, "loss": 0.0014, "step": 6260 }, { "epoch": 1.2707742197000405, "grad_norm": 0.007774224504828453, "learning_rate": 0.00013646128901499796, "loss": 0.0017, "step": 6270 }, { "epoch": 1.272800972841508, "grad_norm": 0.007495061028748751, "learning_rate": 0.0001363599513579246, "loss": 0.0022, "step": 6280 }, { "epoch": 1.2748277259829752, "grad_norm": 0.042415015399456024, "learning_rate": 0.00013625861370085125, "loss": 0.5376, "step": 6290 }, { "epoch": 1.2768544791244427, "grad_norm": 0.008079341612756252, "learning_rate": 0.00013615727604377787, "loss": 0.0013, "step": 6300 }, { "epoch": 1.2768544791244427, "eval_accuracy": 0.9615384615384616, "eval_loss": 0.15808889269828796, "eval_runtime": 50.3952, "eval_samples_per_second": 10.834, "eval_steps_per_second": 1.369, "step": 6300 }, { "epoch": 1.27888123226591, "grad_norm": 0.007808642461895943, "learning_rate": 0.0001360559383867045, "loss": 0.0015, "step": 6310 }, { "epoch": 1.2809079854073775, "grad_norm": 0.0075371447019279, "learning_rate": 0.00013595460072963114, "loss": 0.0026, "step": 6320 }, { "epoch": 1.2829347385488448, "grad_norm": 0.06489552557468414, "learning_rate": 0.00013585326307255776, "loss": 0.4667, "step": 6330 }, { "epoch": 1.284961491690312, "grad_norm": 0.007573794573545456, "learning_rate": 0.0001357519254154844, "loss": 0.4561, "step": 6340 }, { "epoch": 1.2869882448317795, "grad_norm": 0.008704758249223232, "learning_rate": 0.00013565058775841105, "loss": 0.7077, "step": 6350 }, { "epoch": 1.2890149979732468, "grad_norm": 0.12125850468873978, "learning_rate": 0.00013554925010133764, "loss": 0.3011, "step": 6360 }, { "epoch": 1.2910417511147143, "grad_norm": 0.0689581036567688, "learning_rate": 0.0001354479124442643, "loss": 0.0025, "step": 6370 }, { "epoch": 1.2930685042561816, "grad_norm": 2.5777084827423096, "learning_rate": 0.00013534657478719094, "loss": 0.0149, "step": 6380 }, { "epoch": 1.2950952573976489, "grad_norm": 0.009006716310977936, "learning_rate": 0.00013524523713011755, "loss": 0.0026, "step": 6390 }, { "epoch": 1.2971220105391164, "grad_norm": 0.1382710337638855, "learning_rate": 0.0001351438994730442, "loss": 0.0027, "step": 6400 }, { "epoch": 1.2971220105391164, "eval_accuracy": 0.9413919413919414, "eval_loss": 0.2660368084907532, "eval_runtime": 47.3575, "eval_samples_per_second": 11.529, "eval_steps_per_second": 1.457, "step": 6400 }, { "epoch": 1.2991487636805836, "grad_norm": 0.008069846779108047, "learning_rate": 0.00013504256181597082, "loss": 0.0017, "step": 6410 }, { "epoch": 1.3011755168220511, "grad_norm": 0.008503837510943413, "learning_rate": 0.00013494122415889744, "loss": 0.5095, "step": 6420 }, { "epoch": 1.3032022699635184, "grad_norm": 0.008521609008312225, "learning_rate": 0.00013483988650182408, "loss": 0.3073, "step": 6430 }, { "epoch": 1.3052290231049857, "grad_norm": 0.008452016860246658, "learning_rate": 0.00013473854884475073, "loss": 0.0017, "step": 6440 }, { "epoch": 1.3072557762464532, "grad_norm": 8.367830276489258, "learning_rate": 0.00013463721118767735, "loss": 0.4673, "step": 6450 }, { "epoch": 1.3092825293879207, "grad_norm": 0.08890511095523834, "learning_rate": 0.00013453587353060397, "loss": 0.0046, "step": 6460 }, { "epoch": 1.311309282529388, "grad_norm": 0.008890490978956223, "learning_rate": 0.00013443453587353062, "loss": 0.4202, "step": 6470 }, { "epoch": 1.3133360356708552, "grad_norm": 0.009985828772187233, "learning_rate": 0.00013433319821645723, "loss": 0.6943, "step": 6480 }, { "epoch": 1.3153627888123227, "grad_norm": 0.13318373262882233, "learning_rate": 0.00013423186055938388, "loss": 0.0028, "step": 6490 }, { "epoch": 1.31738954195379, "grad_norm": 0.011707228608429432, "learning_rate": 0.0001341305229023105, "loss": 0.1753, "step": 6500 }, { "epoch": 1.31738954195379, "eval_accuracy": 0.9560439560439561, "eval_loss": 0.18581902980804443, "eval_runtime": 62.4093, "eval_samples_per_second": 8.749, "eval_steps_per_second": 1.106, "step": 6500 }, { "epoch": 1.3194162950952575, "grad_norm": 0.022704975679516792, "learning_rate": 0.00013402918524523712, "loss": 0.0043, "step": 6510 }, { "epoch": 1.3214430482367248, "grad_norm": 0.09878182411193848, "learning_rate": 0.00013392784758816377, "loss": 0.005, "step": 6520 }, { "epoch": 1.323469801378192, "grad_norm": 0.010758204385638237, "learning_rate": 0.0001338265099310904, "loss": 0.005, "step": 6530 }, { "epoch": 1.3254965545196595, "grad_norm": 0.08000922948122025, "learning_rate": 0.00013372517227401703, "loss": 0.0027, "step": 6540 }, { "epoch": 1.3275233076611268, "grad_norm": 0.009771626442670822, "learning_rate": 0.00013362383461694365, "loss": 0.4507, "step": 6550 }, { "epoch": 1.3295500608025943, "grad_norm": 0.09330135583877563, "learning_rate": 0.0001335224969598703, "loss": 0.0652, "step": 6560 }, { "epoch": 1.3315768139440616, "grad_norm": 0.06133830174803734, "learning_rate": 0.00013342115930279691, "loss": 0.0024, "step": 6570 }, { "epoch": 1.3336035670855289, "grad_norm": 0.00886420626193285, "learning_rate": 0.00013331982164572356, "loss": 0.0819, "step": 6580 }, { "epoch": 1.3356303202269963, "grad_norm": 0.008621442131698132, "learning_rate": 0.00013321848398865018, "loss": 0.0025, "step": 6590 }, { "epoch": 1.3376570733684638, "grad_norm": 0.06374801695346832, "learning_rate": 0.00013311714633157683, "loss": 0.0013, "step": 6600 }, { "epoch": 1.3376570733684638, "eval_accuracy": 0.9615384615384616, "eval_loss": 0.2017831653356552, "eval_runtime": 50.4548, "eval_samples_per_second": 10.822, "eval_steps_per_second": 1.368, "step": 6600 }, { "epoch": 1.3396838265099311, "grad_norm": 0.0080784372985363, "learning_rate": 0.00013301580867450345, "loss": 0.0009, "step": 6610 }, { "epoch": 1.3417105796513984, "grad_norm": 0.007808597292751074, "learning_rate": 0.0001329144710174301, "loss": 0.0021, "step": 6620 }, { "epoch": 1.343737332792866, "grad_norm": 0.053149402141571045, "learning_rate": 0.0001328131333603567, "loss": 0.0034, "step": 6630 }, { "epoch": 1.3457640859343332, "grad_norm": 0.04929700493812561, "learning_rate": 0.00013271179570328333, "loss": 0.0013, "step": 6640 }, { "epoch": 1.3477908390758007, "grad_norm": 0.00766234565526247, "learning_rate": 0.00013261045804620998, "loss": 0.0013, "step": 6650 }, { "epoch": 1.349817592217268, "grad_norm": 0.007020989898592234, "learning_rate": 0.00013250912038913662, "loss": 0.5308, "step": 6660 }, { "epoch": 1.3518443453587352, "grad_norm": 0.007394090294837952, "learning_rate": 0.00013240778273206324, "loss": 0.0021, "step": 6670 }, { "epoch": 1.3538710985002027, "grad_norm": 0.065384142100811, "learning_rate": 0.0001323064450749899, "loss": 0.6402, "step": 6680 }, { "epoch": 1.35589785164167, "grad_norm": 0.04742232337594032, "learning_rate": 0.0001322051074179165, "loss": 0.0041, "step": 6690 }, { "epoch": 1.3579246047831375, "grad_norm": 0.06092911213636398, "learning_rate": 0.00013210376976084313, "loss": 0.0033, "step": 6700 }, { "epoch": 1.3579246047831375, "eval_accuracy": 0.9706959706959707, "eval_loss": 0.14746183156967163, "eval_runtime": 46.8652, "eval_samples_per_second": 11.65, "eval_steps_per_second": 1.472, "step": 6700 }, { "epoch": 1.3599513579246048, "grad_norm": 0.011696516536176205, "learning_rate": 0.00013200243210376977, "loss": 0.2504, "step": 6710 }, { "epoch": 1.361978111066072, "grad_norm": 0.00928343553096056, "learning_rate": 0.00013190109444669642, "loss": 0.0039, "step": 6720 }, { "epoch": 1.3640048642075395, "grad_norm": 0.007892207242548466, "learning_rate": 0.00013179975678962304, "loss": 0.1108, "step": 6730 }, { "epoch": 1.366031617349007, "grad_norm": 0.0072936443611979485, "learning_rate": 0.00013169841913254966, "loss": 0.0009, "step": 6740 }, { "epoch": 1.3680583704904743, "grad_norm": 0.00823760312050581, "learning_rate": 0.0001315970814754763, "loss": 0.4251, "step": 6750 }, { "epoch": 1.3700851236319416, "grad_norm": 14.980917930603027, "learning_rate": 0.00013149574381840292, "loss": 0.6206, "step": 6760 }, { "epoch": 1.372111876773409, "grad_norm": 0.01617688313126564, "learning_rate": 0.00013139440616132957, "loss": 0.0022, "step": 6770 }, { "epoch": 1.3741386299148763, "grad_norm": 0.04971484839916229, "learning_rate": 0.0001312930685042562, "loss": 0.0049, "step": 6780 }, { "epoch": 1.3761653830563438, "grad_norm": 0.01129236165434122, "learning_rate": 0.0001311917308471828, "loss": 0.0038, "step": 6790 }, { "epoch": 1.3781921361978111, "grad_norm": 0.009702612645924091, "learning_rate": 0.00013109039319010945, "loss": 0.0037, "step": 6800 }, { "epoch": 1.3781921361978111, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.141746386885643, "eval_runtime": 50.2101, "eval_samples_per_second": 10.874, "eval_steps_per_second": 1.374, "step": 6800 }, { "epoch": 1.3802188893392784, "grad_norm": 0.008086517453193665, "learning_rate": 0.0001309890555330361, "loss": 0.5346, "step": 6810 }, { "epoch": 1.3822456424807459, "grad_norm": 0.06050160527229309, "learning_rate": 0.00013088771787596272, "loss": 0.0015, "step": 6820 }, { "epoch": 1.3842723956222132, "grad_norm": 0.0070461998693645, "learning_rate": 0.00013078638021888934, "loss": 0.0022, "step": 6830 }, { "epoch": 1.3862991487636807, "grad_norm": 0.05749473720788956, "learning_rate": 0.00013068504256181598, "loss": 0.002, "step": 6840 }, { "epoch": 1.388325901905148, "grad_norm": 10.301835060119629, "learning_rate": 0.0001305837049047426, "loss": 1.1992, "step": 6850 }, { "epoch": 1.3903526550466152, "grad_norm": 0.009321840479969978, "learning_rate": 0.00013048236724766925, "loss": 0.4717, "step": 6860 }, { "epoch": 1.3923794081880827, "grad_norm": 0.0103806983679533, "learning_rate": 0.0001303810295905959, "loss": 0.0021, "step": 6870 }, { "epoch": 1.3944061613295502, "grad_norm": 0.012516554445028305, "learning_rate": 0.00013027969193352249, "loss": 0.3355, "step": 6880 }, { "epoch": 1.3964329144710175, "grad_norm": 0.011678949929773808, "learning_rate": 0.00013017835427644913, "loss": 0.0031, "step": 6890 }, { "epoch": 1.3984596676124847, "grad_norm": 0.04873785376548767, "learning_rate": 0.00013007701661937578, "loss": 1.2775, "step": 6900 }, { "epoch": 1.3984596676124847, "eval_accuracy": 0.967032967032967, "eval_loss": 0.11005932837724686, "eval_runtime": 52.1773, "eval_samples_per_second": 10.464, "eval_steps_per_second": 1.322, "step": 6900 }, { "epoch": 1.4004864207539522, "grad_norm": 0.34723198413848877, "learning_rate": 0.0001299756789623024, "loss": 0.149, "step": 6910 }, { "epoch": 1.4025131738954195, "grad_norm": 0.059217892587184906, "learning_rate": 0.00012987434130522902, "loss": 0.0068, "step": 6920 }, { "epoch": 1.404539927036887, "grad_norm": 0.025736402720212936, "learning_rate": 0.00012977300364815566, "loss": 0.8352, "step": 6930 }, { "epoch": 1.4065666801783543, "grad_norm": 0.01956155151128769, "learning_rate": 0.00012967166599108228, "loss": 0.4539, "step": 6940 }, { "epoch": 1.4085934333198216, "grad_norm": 0.020265795290470123, "learning_rate": 0.00012957032833400893, "loss": 0.0025, "step": 6950 }, { "epoch": 1.410620186461289, "grad_norm": 0.015172087587416172, "learning_rate": 0.00012946899067693557, "loss": 0.0048, "step": 6960 }, { "epoch": 1.4126469396027563, "grad_norm": 0.044815607368946075, "learning_rate": 0.00012936765301986217, "loss": 0.9364, "step": 6970 }, { "epoch": 1.4146736927442238, "grad_norm": 0.043388959020376205, "learning_rate": 0.0001292663153627888, "loss": 0.0073, "step": 6980 }, { "epoch": 1.416700445885691, "grad_norm": 0.024978777393698692, "learning_rate": 0.00012916497770571546, "loss": 0.0102, "step": 6990 }, { "epoch": 1.4187271990271584, "grad_norm": 0.08704657852649689, "learning_rate": 0.00012906364004864208, "loss": 0.0051, "step": 7000 }, { "epoch": 1.4187271990271584, "eval_accuracy": 0.9706959706959707, "eval_loss": 0.12920933961868286, "eval_runtime": 48.3802, "eval_samples_per_second": 11.286, "eval_steps_per_second": 1.426, "step": 7000 }, { "epoch": 1.4207539521686259, "grad_norm": 0.15106026828289032, "learning_rate": 0.00012896230239156872, "loss": 0.2045, "step": 7010 }, { "epoch": 1.4227807053100932, "grad_norm": 0.07075616717338562, "learning_rate": 0.00012886096473449534, "loss": 0.0048, "step": 7020 }, { "epoch": 1.4248074584515606, "grad_norm": 0.008650687523186207, "learning_rate": 0.00012875962707742196, "loss": 0.4909, "step": 7030 }, { "epoch": 1.426834211593028, "grad_norm": 0.011124524287879467, "learning_rate": 0.0001286582894203486, "loss": 0.0019, "step": 7040 }, { "epoch": 1.4288609647344954, "grad_norm": 0.008949482813477516, "learning_rate": 0.00012855695176327525, "loss": 0.0034, "step": 7050 }, { "epoch": 1.4308877178759627, "grad_norm": 0.0093899667263031, "learning_rate": 0.00012845561410620185, "loss": 0.0011, "step": 7060 }, { "epoch": 1.4329144710174302, "grad_norm": 0.0081386873498559, "learning_rate": 0.0001283542764491285, "loss": 0.0068, "step": 7070 }, { "epoch": 1.4349412241588975, "grad_norm": 0.06856881827116013, "learning_rate": 0.00012825293879205514, "loss": 0.0031, "step": 7080 }, { "epoch": 1.4369679773003647, "grad_norm": 0.007543250452727079, "learning_rate": 0.00012815160113498176, "loss": 0.4818, "step": 7090 }, { "epoch": 1.4389947304418322, "grad_norm": 0.008494116365909576, "learning_rate": 0.0001280502634779084, "loss": 0.4954, "step": 7100 }, { "epoch": 1.4389947304418322, "eval_accuracy": 0.9468864468864469, "eval_loss": 0.24729128181934357, "eval_runtime": 50.3955, "eval_samples_per_second": 10.834, "eval_steps_per_second": 1.369, "step": 7100 }, { "epoch": 1.4410214835832995, "grad_norm": 0.008369691669940948, "learning_rate": 0.00012794892582083502, "loss": 0.4182, "step": 7110 }, { "epoch": 1.443048236724767, "grad_norm": 0.00896457303315401, "learning_rate": 0.00012784758816376164, "loss": 0.0027, "step": 7120 }, { "epoch": 1.4450749898662343, "grad_norm": 0.008388702757656574, "learning_rate": 0.0001277462505066883, "loss": 0.153, "step": 7130 }, { "epoch": 1.4471017430077016, "grad_norm": 0.011490442790091038, "learning_rate": 0.00012764491284961494, "loss": 0.6053, "step": 7140 }, { "epoch": 1.449128496149169, "grad_norm": 0.010425391606986523, "learning_rate": 0.00012754357519254155, "loss": 0.0012, "step": 7150 }, { "epoch": 1.4511552492906363, "grad_norm": 0.01710696704685688, "learning_rate": 0.00012744223753546817, "loss": 0.0036, "step": 7160 }, { "epoch": 1.4531820024321038, "grad_norm": 10.217206001281738, "learning_rate": 0.00012734089987839482, "loss": 0.2024, "step": 7170 }, { "epoch": 1.455208755573571, "grad_norm": 0.10198117792606354, "learning_rate": 0.00012723956222132144, "loss": 0.0135, "step": 7180 }, { "epoch": 1.4572355087150384, "grad_norm": 0.007917650043964386, "learning_rate": 0.00012713822456424808, "loss": 0.0073, "step": 7190 }, { "epoch": 1.4592622618565059, "grad_norm": 0.00752667523920536, "learning_rate": 0.0001270368869071747, "loss": 0.1533, "step": 7200 }, { "epoch": 1.4592622618565059, "eval_accuracy": 0.9706959706959707, "eval_loss": 0.11806527525186539, "eval_runtime": 50.3295, "eval_samples_per_second": 10.849, "eval_steps_per_second": 1.371, "step": 7200 }, { "epoch": 1.4612890149979734, "grad_norm": 0.0070377918891608715, "learning_rate": 0.00012693554925010135, "loss": 0.001, "step": 7210 }, { "epoch": 1.4633157681394406, "grad_norm": 0.007255935110151768, "learning_rate": 0.00012683421159302797, "loss": 0.4828, "step": 7220 }, { "epoch": 1.465342521280908, "grad_norm": 0.00714620528742671, "learning_rate": 0.00012673287393595462, "loss": 0.003, "step": 7230 }, { "epoch": 1.4673692744223754, "grad_norm": 0.006977812387049198, "learning_rate": 0.00012663153627888123, "loss": 0.0008, "step": 7240 }, { "epoch": 1.4693960275638427, "grad_norm": 0.00690856808796525, "learning_rate": 0.00012653019862180785, "loss": 0.4458, "step": 7250 }, { "epoch": 1.4714227807053102, "grad_norm": 0.007223729509860277, "learning_rate": 0.0001264288609647345, "loss": 0.14, "step": 7260 }, { "epoch": 1.4734495338467775, "grad_norm": 0.00870873499661684, "learning_rate": 0.00012632752330766115, "loss": 0.7125, "step": 7270 }, { "epoch": 1.4754762869882447, "grad_norm": 0.00891156680881977, "learning_rate": 0.00012622618565058777, "loss": 0.0041, "step": 7280 }, { "epoch": 1.4775030401297122, "grad_norm": 0.008706452324986458, "learning_rate": 0.0001261248479935144, "loss": 0.0053, "step": 7290 }, { "epoch": 1.4795297932711795, "grad_norm": 0.00838247500360012, "learning_rate": 0.00012602351033644103, "loss": 0.0022, "step": 7300 }, { "epoch": 1.4795297932711795, "eval_accuracy": 0.9706959706959707, "eval_loss": 0.15122626721858978, "eval_runtime": 50.3826, "eval_samples_per_second": 10.837, "eval_steps_per_second": 1.37, "step": 7300 }, { "epoch": 1.481556546412647, "grad_norm": 0.009303655475378036, "learning_rate": 0.00012592217267936765, "loss": 0.004, "step": 7310 }, { "epoch": 1.4835832995541143, "grad_norm": 0.07870160788297653, "learning_rate": 0.0001258208350222943, "loss": 0.7105, "step": 7320 }, { "epoch": 1.4856100526955816, "grad_norm": 0.01071919035166502, "learning_rate": 0.00012571949736522094, "loss": 0.0052, "step": 7330 }, { "epoch": 1.487636805837049, "grad_norm": 0.00973090622574091, "learning_rate": 0.00012561815970814756, "loss": 0.4796, "step": 7340 }, { "epoch": 1.4896635589785165, "grad_norm": 0.01041113119572401, "learning_rate": 0.00012551682205107418, "loss": 0.4586, "step": 7350 }, { "epoch": 1.4916903121199838, "grad_norm": 0.010993756353855133, "learning_rate": 0.00012541548439400083, "loss": 0.4192, "step": 7360 }, { "epoch": 1.493717065261451, "grad_norm": 0.18665625154972076, "learning_rate": 0.00012531414673692745, "loss": 0.0035, "step": 7370 }, { "epoch": 1.4957438184029186, "grad_norm": 0.013825108297169209, "learning_rate": 0.0001252128090798541, "loss": 0.5293, "step": 7380 }, { "epoch": 1.4977705715443859, "grad_norm": 0.1872238665819168, "learning_rate": 0.0001251114714227807, "loss": 0.0109, "step": 7390 }, { "epoch": 1.4997973246858534, "grad_norm": 0.14896440505981445, "learning_rate": 0.00012501013376570733, "loss": 0.005, "step": 7400 }, { "epoch": 1.4997973246858534, "eval_accuracy": 0.967032967032967, "eval_loss": 0.13286960124969482, "eval_runtime": 47.115, "eval_samples_per_second": 11.589, "eval_steps_per_second": 1.465, "step": 7400 }, { "epoch": 1.5018240778273206, "grad_norm": 14.306511878967285, "learning_rate": 0.00012490879610863398, "loss": 0.0343, "step": 7410 }, { "epoch": 1.503850830968788, "grad_norm": 0.09887898713350296, "learning_rate": 0.00012480745845156062, "loss": 0.0047, "step": 7420 }, { "epoch": 1.5058775841102554, "grad_norm": 0.2640922963619232, "learning_rate": 0.00012470612079448724, "loss": 0.0021, "step": 7430 }, { "epoch": 1.507904337251723, "grad_norm": 0.008840842172503471, "learning_rate": 0.00012460478313741386, "loss": 0.0047, "step": 7440 }, { "epoch": 1.5099310903931902, "grad_norm": 0.01149743888527155, "learning_rate": 0.0001245034454803405, "loss": 0.9103, "step": 7450 }, { "epoch": 1.5119578435346575, "grad_norm": 0.12512584030628204, "learning_rate": 0.00012440210782326713, "loss": 0.0066, "step": 7460 }, { "epoch": 1.5139845966761247, "grad_norm": 0.009386054240167141, "learning_rate": 0.00012430077016619377, "loss": 0.0011, "step": 7470 }, { "epoch": 1.5160113498175922, "grad_norm": 0.0302512738853693, "learning_rate": 0.00012419943250912042, "loss": 0.0039, "step": 7480 }, { "epoch": 1.5180381029590597, "grad_norm": 0.008296155370771885, "learning_rate": 0.000124098094852047, "loss": 0.0021, "step": 7490 }, { "epoch": 1.520064856100527, "grad_norm": 0.00832971278578043, "learning_rate": 0.00012399675719497366, "loss": 0.4396, "step": 7500 }, { "epoch": 1.520064856100527, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.12192816287279129, "eval_runtime": 49.4588, "eval_samples_per_second": 11.039, "eval_steps_per_second": 1.395, "step": 7500 }, { "epoch": 1.5220916092419943, "grad_norm": 0.008520863950252533, "learning_rate": 0.0001238954195379003, "loss": 0.0027, "step": 7510 }, { "epoch": 1.5241183623834615, "grad_norm": 0.008221836760640144, "learning_rate": 0.00012379408188082692, "loss": 0.0025, "step": 7520 }, { "epoch": 1.526145115524929, "grad_norm": 0.00811388622969389, "learning_rate": 0.00012369274422375354, "loss": 0.7087, "step": 7530 }, { "epoch": 1.5281718686663965, "grad_norm": 0.00921147596091032, "learning_rate": 0.0001235914065666802, "loss": 0.001, "step": 7540 }, { "epoch": 1.5301986218078638, "grad_norm": 0.010357416234910488, "learning_rate": 0.0001234900689096068, "loss": 0.3959, "step": 7550 }, { "epoch": 1.532225374949331, "grad_norm": 0.011035448871552944, "learning_rate": 0.00012338873125253345, "loss": 0.0078, "step": 7560 }, { "epoch": 1.5342521280907986, "grad_norm": 0.04455064609646797, "learning_rate": 0.0001232873935954601, "loss": 0.0059, "step": 7570 }, { "epoch": 1.5362788812322659, "grad_norm": 0.009438133798539639, "learning_rate": 0.0001231860559383867, "loss": 0.0043, "step": 7580 }, { "epoch": 1.5383056343737334, "grad_norm": 0.008560802787542343, "learning_rate": 0.00012308471828131334, "loss": 0.0024, "step": 7590 }, { "epoch": 1.5403323875152006, "grad_norm": 0.008284498006105423, "learning_rate": 0.00012298338062423998, "loss": 0.0044, "step": 7600 }, { "epoch": 1.5403323875152006, "eval_accuracy": 0.967032967032967, "eval_loss": 0.16649296879768372, "eval_runtime": 50.3329, "eval_samples_per_second": 10.848, "eval_steps_per_second": 1.371, "step": 7600 }, { "epoch": 1.542359140656668, "grad_norm": 0.00793454609811306, "learning_rate": 0.0001228820429671666, "loss": 0.0009, "step": 7610 }, { "epoch": 1.5443858937981354, "grad_norm": 0.09009437263011932, "learning_rate": 0.00012278070531009325, "loss": 0.0029, "step": 7620 }, { "epoch": 1.546412646939603, "grad_norm": 0.008107963018119335, "learning_rate": 0.00012267936765301987, "loss": 0.0008, "step": 7630 }, { "epoch": 1.5484394000810702, "grad_norm": 8.214654922485352, "learning_rate": 0.00012257802999594649, "loss": 0.8614, "step": 7640 }, { "epoch": 1.5504661532225374, "grad_norm": 0.008868742734193802, "learning_rate": 0.00012247669233887313, "loss": 0.4323, "step": 7650 }, { "epoch": 1.5524929063640047, "grad_norm": 0.009688987396657467, "learning_rate": 0.00012237535468179978, "loss": 0.362, "step": 7660 }, { "epoch": 1.5545196595054722, "grad_norm": 0.010521828196942806, "learning_rate": 0.00012227401702472637, "loss": 0.0141, "step": 7670 }, { "epoch": 1.5565464126469397, "grad_norm": 0.22994591295719147, "learning_rate": 0.00012217267936765302, "loss": 0.0142, "step": 7680 }, { "epoch": 1.558573165788407, "grad_norm": 0.008071416057646275, "learning_rate": 0.00012207134171057966, "loss": 0.0036, "step": 7690 }, { "epoch": 1.5605999189298743, "grad_norm": 9.246078491210938, "learning_rate": 0.0001219700040535063, "loss": 0.7054, "step": 7700 }, { "epoch": 1.5605999189298743, "eval_accuracy": 0.967032967032967, "eval_loss": 0.16522414982318878, "eval_runtime": 46.6457, "eval_samples_per_second": 11.705, "eval_steps_per_second": 1.479, "step": 7700 }, { "epoch": 1.5626266720713418, "grad_norm": 39.2440185546875, "learning_rate": 0.00012186866639643293, "loss": 0.8085, "step": 7710 }, { "epoch": 1.564653425212809, "grad_norm": 0.1413354128599167, "learning_rate": 0.00012176732873935955, "loss": 0.0097, "step": 7720 }, { "epoch": 1.5666801783542765, "grad_norm": 0.018545445054769516, "learning_rate": 0.00012166599108228618, "loss": 0.0032, "step": 7730 }, { "epoch": 1.5687069314957438, "grad_norm": 0.02521946094930172, "learning_rate": 0.00012156465342521281, "loss": 0.0039, "step": 7740 }, { "epoch": 1.570733684637211, "grad_norm": 0.019712034612894058, "learning_rate": 0.00012146331576813945, "loss": 0.4125, "step": 7750 }, { "epoch": 1.5727604377786786, "grad_norm": 0.012678617611527443, "learning_rate": 0.00012136197811106609, "loss": 0.437, "step": 7760 }, { "epoch": 1.574787190920146, "grad_norm": 0.014548375271260738, "learning_rate": 0.0001212606404539927, "loss": 0.6042, "step": 7770 }, { "epoch": 1.5768139440616133, "grad_norm": 0.014556125737726688, "learning_rate": 0.00012115930279691934, "loss": 0.0912, "step": 7780 }, { "epoch": 1.5788406972030806, "grad_norm": 0.014982403255999088, "learning_rate": 0.00012105796513984598, "loss": 0.4238, "step": 7790 }, { "epoch": 1.580867450344548, "grad_norm": 0.014122478663921356, "learning_rate": 0.00012095662748277261, "loss": 0.4057, "step": 7800 }, { "epoch": 1.580867450344548, "eval_accuracy": 0.9542124542124543, "eval_loss": 0.16829834878444672, "eval_runtime": 50.1526, "eval_samples_per_second": 10.887, "eval_steps_per_second": 1.376, "step": 7800 }, { "epoch": 1.5828942034860154, "grad_norm": 0.042958226054906845, "learning_rate": 0.00012085528982569923, "loss": 0.3522, "step": 7810 }, { "epoch": 1.584920956627483, "grad_norm": 0.014224754646420479, "learning_rate": 0.00012075395216862586, "loss": 0.6036, "step": 7820 }, { "epoch": 1.5869477097689502, "grad_norm": 0.012702335603535175, "learning_rate": 0.00012065261451155249, "loss": 0.0226, "step": 7830 }, { "epoch": 1.5889744629104174, "grad_norm": 0.012445378117263317, "learning_rate": 0.00012055127685447914, "loss": 0.0063, "step": 7840 }, { "epoch": 1.5910012160518847, "grad_norm": 0.011407758109271526, "learning_rate": 0.00012044993919740577, "loss": 0.0057, "step": 7850 }, { "epoch": 1.5930279691933522, "grad_norm": 0.3679606318473816, "learning_rate": 0.00012034860154033238, "loss": 1.0166, "step": 7860 }, { "epoch": 1.5950547223348197, "grad_norm": 0.013010966591536999, "learning_rate": 0.00012024726388325902, "loss": 0.0016, "step": 7870 }, { "epoch": 1.597081475476287, "grad_norm": 0.013456660322844982, "learning_rate": 0.00012014592622618566, "loss": 0.0016, "step": 7880 }, { "epoch": 1.5991082286177543, "grad_norm": 0.012532263062894344, "learning_rate": 0.00012004458856911229, "loss": 0.009, "step": 7890 }, { "epoch": 1.6011349817592218, "grad_norm": 0.01159537211060524, "learning_rate": 0.00011994325091203894, "loss": 0.011, "step": 7900 }, { "epoch": 1.6011349817592218, "eval_accuracy": 0.9285714285714286, "eval_loss": 0.3927004933357239, "eval_runtime": 50.1818, "eval_samples_per_second": 10.88, "eval_steps_per_second": 1.375, "step": 7900 }, { "epoch": 1.6031617349006893, "grad_norm": 0.010766562074422836, "learning_rate": 0.00011984191325496554, "loss": 0.0061, "step": 7910 }, { "epoch": 1.6051884880421565, "grad_norm": 0.015047757886350155, "learning_rate": 0.00011974057559789217, "loss": 1.1934, "step": 7920 }, { "epoch": 1.6072152411836238, "grad_norm": 0.025979992002248764, "learning_rate": 0.00011963923794081882, "loss": 0.0091, "step": 7930 }, { "epoch": 1.609241994325091, "grad_norm": 0.025664396584033966, "learning_rate": 0.00011953790028374545, "loss": 0.0938, "step": 7940 }, { "epoch": 1.6112687474665586, "grad_norm": 0.026306096464395523, "learning_rate": 0.00011943656262667207, "loss": 0.8206, "step": 7950 }, { "epoch": 1.613295500608026, "grad_norm": 0.01710491254925728, "learning_rate": 0.0001193352249695987, "loss": 0.4074, "step": 7960 }, { "epoch": 1.6153222537494933, "grad_norm": 0.20820653438568115, "learning_rate": 0.00011923388731252534, "loss": 0.007, "step": 7970 }, { "epoch": 1.6173490068909606, "grad_norm": 0.22726105153560638, "learning_rate": 0.00011913254965545197, "loss": 0.8665, "step": 7980 }, { "epoch": 1.619375760032428, "grad_norm": 0.017417067661881447, "learning_rate": 0.00011903121199837862, "loss": 0.0103, "step": 7990 }, { "epoch": 1.6214025131738954, "grad_norm": 0.022518666461110115, "learning_rate": 0.00011892987434130522, "loss": 0.7, "step": 8000 }, { "epoch": 1.6214025131738954, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.09989867359399796, "eval_runtime": 47.4891, "eval_samples_per_second": 11.497, "eval_steps_per_second": 1.453, "step": 8000 }, { "epoch": 1.6234292663153629, "grad_norm": 0.037107568234205246, "learning_rate": 0.00011882853668423187, "loss": 0.0115, "step": 8010 }, { "epoch": 1.6254560194568302, "grad_norm": 0.2722521722316742, "learning_rate": 0.0001187271990271585, "loss": 0.0076, "step": 8020 }, { "epoch": 1.6274827725982974, "grad_norm": 0.020260510966181755, "learning_rate": 0.00011862586137008513, "loss": 0.003, "step": 8030 }, { "epoch": 1.629509525739765, "grad_norm": 11.66630744934082, "learning_rate": 0.00011852452371301177, "loss": 1.0787, "step": 8040 }, { "epoch": 1.6315362788812324, "grad_norm": 0.03827909007668495, "learning_rate": 0.00011842318605593838, "loss": 0.0067, "step": 8050 }, { "epoch": 1.6335630320226997, "grad_norm": 0.05085739120841026, "learning_rate": 0.00011832184839886502, "loss": 0.0078, "step": 8060 }, { "epoch": 1.635589785164167, "grad_norm": 0.03725253790616989, "learning_rate": 0.00011822051074179166, "loss": 0.2087, "step": 8070 }, { "epoch": 1.6376165383056343, "grad_norm": 0.016044335439801216, "learning_rate": 0.0001181191730847183, "loss": 0.0063, "step": 8080 }, { "epoch": 1.6396432914471017, "grad_norm": 0.013224687427282333, "learning_rate": 0.00011801783542764493, "loss": 0.0591, "step": 8090 }, { "epoch": 1.6416700445885692, "grad_norm": 0.012949744239449501, "learning_rate": 0.00011791649777057155, "loss": 0.0026, "step": 8100 }, { "epoch": 1.6416700445885692, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.12492057681083679, "eval_runtime": 47.5509, "eval_samples_per_second": 11.482, "eval_steps_per_second": 1.451, "step": 8100 }, { "epoch": 1.6436967977300365, "grad_norm": 0.011045355349779129, "learning_rate": 0.00011781516011349818, "loss": 0.0017, "step": 8110 }, { "epoch": 1.6457235508715038, "grad_norm": 0.010615518316626549, "learning_rate": 0.00011771382245642481, "loss": 0.0013, "step": 8120 }, { "epoch": 1.647750304012971, "grad_norm": 10.780515670776367, "learning_rate": 0.00011761248479935145, "loss": 1.5084, "step": 8130 }, { "epoch": 1.6497770571544386, "grad_norm": 0.023448778316378593, "learning_rate": 0.00011751114714227806, "loss": 0.0028, "step": 8140 }, { "epoch": 1.651803810295906, "grad_norm": 0.014660656452178955, "learning_rate": 0.0001174098094852047, "loss": 0.0034, "step": 8150 }, { "epoch": 1.6538305634373733, "grad_norm": 0.16463139653205872, "learning_rate": 0.00011730847182813134, "loss": 0.7985, "step": 8160 }, { "epoch": 1.6558573165788406, "grad_norm": 0.020764421671628952, "learning_rate": 0.00011720713417105798, "loss": 0.5618, "step": 8170 }, { "epoch": 1.657884069720308, "grad_norm": 7.786919116973877, "learning_rate": 0.00011710579651398461, "loss": 0.4688, "step": 8180 }, { "epoch": 1.6599108228617754, "grad_norm": 0.15201693773269653, "learning_rate": 0.00011700445885691123, "loss": 0.3611, "step": 8190 }, { "epoch": 1.6619375760032429, "grad_norm": 0.01921539194881916, "learning_rate": 0.00011690312119983786, "loss": 0.002, "step": 8200 }, { "epoch": 1.6619375760032429, "eval_accuracy": 0.9615384615384616, "eval_loss": 0.13859277963638306, "eval_runtime": 50.3004, "eval_samples_per_second": 10.855, "eval_steps_per_second": 1.372, "step": 8200 }, { "epoch": 1.6639643291447102, "grad_norm": 0.015072059817612171, "learning_rate": 0.00011680178354276449, "loss": 0.5388, "step": 8210 }, { "epoch": 1.6659910822861774, "grad_norm": 0.018651235848665237, "learning_rate": 0.00011670044588569114, "loss": 0.3912, "step": 8220 }, { "epoch": 1.668017835427645, "grad_norm": 0.029343340545892715, "learning_rate": 0.00011659910822861777, "loss": 0.0036, "step": 8230 }, { "epoch": 1.6700445885691124, "grad_norm": 0.017087263986468315, "learning_rate": 0.00011649777057154439, "loss": 0.0055, "step": 8240 }, { "epoch": 1.6720713417105797, "grad_norm": 0.22589299082756042, "learning_rate": 0.00011639643291447102, "loss": 0.0078, "step": 8250 }, { "epoch": 1.674098094852047, "grad_norm": 0.012997474521398544, "learning_rate": 0.00011629509525739766, "loss": 0.0047, "step": 8260 }, { "epoch": 1.6761248479935142, "grad_norm": 0.011784421280026436, "learning_rate": 0.00011619375760032429, "loss": 0.0058, "step": 8270 }, { "epoch": 1.6781516011349817, "grad_norm": 0.014172839932143688, "learning_rate": 0.00011609241994325091, "loss": 0.0069, "step": 8280 }, { "epoch": 1.6801783542764492, "grad_norm": 0.010487750172615051, "learning_rate": 0.00011599108228617754, "loss": 0.0028, "step": 8290 }, { "epoch": 1.6822051074179165, "grad_norm": 0.010051158256828785, "learning_rate": 0.00011588974462910417, "loss": 0.0041, "step": 8300 }, { "epoch": 1.6822051074179165, "eval_accuracy": 0.967032967032967, "eval_loss": 0.11753609031438828, "eval_runtime": 50.388, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.369, "step": 8300 }, { "epoch": 1.6842318605593838, "grad_norm": 0.10530339926481247, "learning_rate": 0.00011578840697203082, "loss": 0.0048, "step": 8310 }, { "epoch": 1.6862586137008513, "grad_norm": 0.009275587275624275, "learning_rate": 0.00011568706931495745, "loss": 0.6973, "step": 8320 }, { "epoch": 1.6882853668423186, "grad_norm": 0.012001892551779747, "learning_rate": 0.00011558573165788407, "loss": 0.0023, "step": 8330 }, { "epoch": 1.690312119983786, "grad_norm": 12.442451477050781, "learning_rate": 0.0001154843940008107, "loss": 0.5785, "step": 8340 }, { "epoch": 1.6923388731252533, "grad_norm": 0.012126659043133259, "learning_rate": 0.00011538305634373734, "loss": 0.4018, "step": 8350 }, { "epoch": 1.6943656262667206, "grad_norm": 0.011285288259387016, "learning_rate": 0.00011528171868666397, "loss": 0.0054, "step": 8360 }, { "epoch": 1.696392379408188, "grad_norm": 0.011082268320024014, "learning_rate": 0.00011518038102959062, "loss": 0.005, "step": 8370 }, { "epoch": 1.6984191325496556, "grad_norm": 0.04278941825032234, "learning_rate": 0.00011507904337251722, "loss": 0.0043, "step": 8380 }, { "epoch": 1.7004458856911229, "grad_norm": 0.10877048969268799, "learning_rate": 0.00011497770571544387, "loss": 0.0023, "step": 8390 }, { "epoch": 1.7024726388325901, "grad_norm": 0.08552572131156921, "learning_rate": 0.0001148763680583705, "loss": 0.0034, "step": 8400 }, { "epoch": 1.7024726388325901, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.11600035429000854, "eval_runtime": 49.2969, "eval_samples_per_second": 11.076, "eval_steps_per_second": 1.4, "step": 8400 }, { "epoch": 1.7044993919740574, "grad_norm": 0.0091655058786273, "learning_rate": 0.00011477503040129713, "loss": 0.001, "step": 8410 }, { "epoch": 1.706526145115525, "grad_norm": 0.09666013717651367, "learning_rate": 0.00011467369274422375, "loss": 0.1692, "step": 8420 }, { "epoch": 1.7085528982569924, "grad_norm": 0.00899573601782322, "learning_rate": 0.00011457235508715038, "loss": 0.003, "step": 8430 }, { "epoch": 1.7105796513984597, "grad_norm": 9.241072654724121, "learning_rate": 0.00011447101743007702, "loss": 0.7043, "step": 8440 }, { "epoch": 1.712606404539927, "grad_norm": 0.010043786838650703, "learning_rate": 0.00011436967977300366, "loss": 0.0019, "step": 8450 }, { "epoch": 1.7146331576813945, "grad_norm": 0.01046903245151043, "learning_rate": 0.0001142683421159303, "loss": 0.4672, "step": 8460 }, { "epoch": 1.7166599108228617, "grad_norm": 0.011509249918162823, "learning_rate": 0.0001141670044588569, "loss": 0.0028, "step": 8470 }, { "epoch": 1.7186866639643292, "grad_norm": 0.012271828949451447, "learning_rate": 0.00011406566680178355, "loss": 0.4348, "step": 8480 }, { "epoch": 1.7207134171057965, "grad_norm": 0.010808630846440792, "learning_rate": 0.00011396432914471018, "loss": 0.0422, "step": 8490 }, { "epoch": 1.7227401702472638, "grad_norm": 0.1264539510011673, "learning_rate": 0.00011386299148763681, "loss": 0.0041, "step": 8500 }, { "epoch": 1.7227401702472638, "eval_accuracy": 0.9542124542124543, "eval_loss": 0.20973940193653107, "eval_runtime": 45.8027, "eval_samples_per_second": 11.921, "eval_steps_per_second": 1.506, "step": 8500 }, { "epoch": 1.7247669233887313, "grad_norm": 0.11836584657430649, "learning_rate": 0.00011376165383056346, "loss": 0.5827, "step": 8510 }, { "epoch": 1.7267936765301988, "grad_norm": 0.1095990464091301, "learning_rate": 0.00011366031617349006, "loss": 0.0078, "step": 8520 }, { "epoch": 1.728820429671666, "grad_norm": 0.05297279730439186, "learning_rate": 0.0001135589785164167, "loss": 0.5844, "step": 8530 }, { "epoch": 1.7308471828131333, "grad_norm": 0.10899315029382706, "learning_rate": 0.00011345764085934334, "loss": 0.6763, "step": 8540 }, { "epoch": 1.7328739359546006, "grad_norm": 0.023719044402241707, "learning_rate": 0.00011335630320226998, "loss": 0.4364, "step": 8550 }, { "epoch": 1.734900689096068, "grad_norm": 0.014032549224793911, "learning_rate": 0.0001132549655451966, "loss": 0.0061, "step": 8560 }, { "epoch": 1.7369274422375356, "grad_norm": 0.010516401380300522, "learning_rate": 0.00011315362788812323, "loss": 0.0762, "step": 8570 }, { "epoch": 1.7389541953790029, "grad_norm": 0.13084004819393158, "learning_rate": 0.00011305229023104986, "loss": 0.0242, "step": 8580 }, { "epoch": 1.7409809485204701, "grad_norm": 0.010349682532250881, "learning_rate": 0.00011295095257397649, "loss": 0.2937, "step": 8590 }, { "epoch": 1.7430077016619374, "grad_norm": 0.009873668663203716, "learning_rate": 0.00011284961491690314, "loss": 0.3303, "step": 8600 }, { "epoch": 1.7430077016619374, "eval_accuracy": 0.9597069597069597, "eval_loss": 0.15265528857707977, "eval_runtime": 49.6316, "eval_samples_per_second": 11.001, "eval_steps_per_second": 1.39, "step": 8600 }, { "epoch": 1.745034454803405, "grad_norm": 0.5044683218002319, "learning_rate": 0.00011274827725982974, "loss": 1.0479, "step": 8610 }, { "epoch": 1.7470612079448724, "grad_norm": 0.17945395410060883, "learning_rate": 0.00011264693960275639, "loss": 0.4109, "step": 8620 }, { "epoch": 1.7490879610863397, "grad_norm": 8.084101676940918, "learning_rate": 0.00011254560194568302, "loss": 0.3745, "step": 8630 }, { "epoch": 1.751114714227807, "grad_norm": 0.015885835513472557, "learning_rate": 0.00011244426428860966, "loss": 0.0023, "step": 8640 }, { "epoch": 1.7531414673692745, "grad_norm": 0.035967472940683365, "learning_rate": 0.00011234292663153629, "loss": 0.3532, "step": 8650 }, { "epoch": 1.755168220510742, "grad_norm": 0.01129199843853712, "learning_rate": 0.00011224158897446291, "loss": 0.006, "step": 8660 }, { "epoch": 1.7571949736522092, "grad_norm": 0.294033408164978, "learning_rate": 0.00011214025131738954, "loss": 0.0176, "step": 8670 }, { "epoch": 1.7592217267936765, "grad_norm": 0.00972708035260439, "learning_rate": 0.00011203891366031619, "loss": 0.0105, "step": 8680 }, { "epoch": 1.7612484799351438, "grad_norm": 9.236714363098145, "learning_rate": 0.00011193757600324282, "loss": 0.7035, "step": 8690 }, { "epoch": 1.7632752330766113, "grad_norm": 0.010941145941615105, "learning_rate": 0.00011183623834616945, "loss": 0.006, "step": 8700 }, { "epoch": 1.7632752330766113, "eval_accuracy": 0.967032967032967, "eval_loss": 0.13893744349479675, "eval_runtime": 49.6277, "eval_samples_per_second": 11.002, "eval_steps_per_second": 1.39, "step": 8700 }, { "epoch": 1.7653019862180788, "grad_norm": 0.016714446246623993, "learning_rate": 0.00011173490068909607, "loss": 0.9278, "step": 8710 }, { "epoch": 1.767328739359546, "grad_norm": 0.01921900361776352, "learning_rate": 0.0001116335630320227, "loss": 0.0035, "step": 8720 }, { "epoch": 1.7693554925010133, "grad_norm": 0.02214176394045353, "learning_rate": 0.00011153222537494934, "loss": 0.0057, "step": 8730 }, { "epoch": 1.7713822456424806, "grad_norm": 0.024326831102371216, "learning_rate": 0.00011143088771787597, "loss": 1.0052, "step": 8740 }, { "epoch": 1.773408998783948, "grad_norm": 0.20547142624855042, "learning_rate": 0.00011132955006080259, "loss": 0.3104, "step": 8750 }, { "epoch": 1.7754357519254156, "grad_norm": 0.15221790969371796, "learning_rate": 0.00011122821240372922, "loss": 0.0205, "step": 8760 }, { "epoch": 1.7774625050668829, "grad_norm": 0.014965363778173923, "learning_rate": 0.00011112687474665587, "loss": 0.005, "step": 8770 }, { "epoch": 1.7794892582083501, "grad_norm": 0.01209799014031887, "learning_rate": 0.0001110255370895825, "loss": 0.005, "step": 8780 }, { "epoch": 1.7815160113498176, "grad_norm": 0.011696490459144115, "learning_rate": 0.00011092419943250913, "loss": 0.0054, "step": 8790 }, { "epoch": 1.7835427644912851, "grad_norm": 0.010011350736021996, "learning_rate": 0.00011082286177543575, "loss": 0.0012, "step": 8800 }, { "epoch": 1.7835427644912851, "eval_accuracy": 0.9597069597069597, "eval_loss": 0.17990480363368988, "eval_runtime": 48.1871, "eval_samples_per_second": 11.331, "eval_steps_per_second": 1.432, "step": 8800 }, { "epoch": 1.7855695176327524, "grad_norm": 0.0209858026355505, "learning_rate": 0.00011072152411836238, "loss": 0.5705, "step": 8810 }, { "epoch": 1.7875962707742197, "grad_norm": 0.010832561179995537, "learning_rate": 0.00011062018646128902, "loss": 0.0075, "step": 8820 }, { "epoch": 1.789623023915687, "grad_norm": 0.009939854964613914, "learning_rate": 0.00011051884880421566, "loss": 0.0049, "step": 8830 }, { "epoch": 1.7916497770571544, "grad_norm": 0.009512858465313911, "learning_rate": 0.0001104175111471423, "loss": 0.4469, "step": 8840 }, { "epoch": 1.793676530198622, "grad_norm": 0.009505028836429119, "learning_rate": 0.0001103161734900689, "loss": 0.0025, "step": 8850 }, { "epoch": 1.7957032833400892, "grad_norm": 0.011000646278262138, "learning_rate": 0.00011021483583299555, "loss": 0.6492, "step": 8860 }, { "epoch": 1.7977300364815565, "grad_norm": 0.011823964305222034, "learning_rate": 0.00011011349817592218, "loss": 0.0043, "step": 8870 }, { "epoch": 1.7997567896230238, "grad_norm": 0.028389113023877144, "learning_rate": 0.00011001216051884881, "loss": 0.002, "step": 8880 }, { "epoch": 1.8017835427644913, "grad_norm": 0.0190489050000906, "learning_rate": 0.00010991082286177543, "loss": 0.0061, "step": 8890 }, { "epoch": 1.8038102959059588, "grad_norm": 0.027871625497937202, "learning_rate": 0.00010980948520470206, "loss": 0.0027, "step": 8900 }, { "epoch": 1.8038102959059588, "eval_accuracy": 0.9615384615384616, "eval_loss": 0.17173990607261658, "eval_runtime": 210.3245, "eval_samples_per_second": 2.596, "eval_steps_per_second": 0.328, "step": 8900 }, { "epoch": 1.805837049047426, "grad_norm": 0.00957685150206089, "learning_rate": 0.0001097081475476287, "loss": 0.0012, "step": 8910 }, { "epoch": 1.8078638021888933, "grad_norm": 0.013015580363571644, "learning_rate": 0.00010960680989055534, "loss": 0.0042, "step": 8920 }, { "epoch": 1.8098905553303608, "grad_norm": 0.008935632184147835, "learning_rate": 0.00010950547223348198, "loss": 0.0018, "step": 8930 }, { "epoch": 1.811917308471828, "grad_norm": 0.011125830933451653, "learning_rate": 0.0001094041345764086, "loss": 0.0022, "step": 8940 }, { "epoch": 1.8139440616132956, "grad_norm": 0.008295311592519283, "learning_rate": 0.00010930279691933523, "loss": 0.409, "step": 8950 }, { "epoch": 1.8159708147547629, "grad_norm": 0.007981337606906891, "learning_rate": 0.00010920145926226186, "loss": 0.0026, "step": 8960 }, { "epoch": 1.8179975678962301, "grad_norm": 0.007811954244971275, "learning_rate": 0.00010910012160518849, "loss": 0.0025, "step": 8970 }, { "epoch": 1.8200243210376976, "grad_norm": 0.011638646014034748, "learning_rate": 0.00010899878394811514, "loss": 0.3995, "step": 8980 }, { "epoch": 1.8220510741791651, "grad_norm": 0.06227179989218712, "learning_rate": 0.00010889744629104174, "loss": 0.0019, "step": 8990 }, { "epoch": 1.8240778273206324, "grad_norm": 8.30929183959961, "learning_rate": 0.00010879610863396839, "loss": 0.4926, "step": 9000 }, { "epoch": 1.8240778273206324, "eval_accuracy": 0.967032967032967, "eval_loss": 0.15174922347068787, "eval_runtime": 114.8648, "eval_samples_per_second": 4.753, "eval_steps_per_second": 0.601, "step": 9000 }, { "epoch": 1.8261045804620997, "grad_norm": 0.17263716459274292, "learning_rate": 0.00010869477097689502, "loss": 0.485, "step": 9010 }, { "epoch": 1.828131333603567, "grad_norm": 0.008078324608504772, "learning_rate": 0.00010859343331982166, "loss": 0.0021, "step": 9020 }, { "epoch": 1.8301580867450344, "grad_norm": 0.008292094804346561, "learning_rate": 0.00010849209566274828, "loss": 0.0048, "step": 9030 }, { "epoch": 1.832184839886502, "grad_norm": 0.007516876794397831, "learning_rate": 0.00010839075800567491, "loss": 0.0019, "step": 9040 }, { "epoch": 1.8342115930279692, "grad_norm": 0.007887504994869232, "learning_rate": 0.00010828942034860154, "loss": 0.0009, "step": 9050 }, { "epoch": 1.8362383461694365, "grad_norm": 0.015964463353157043, "learning_rate": 0.00010818808269152819, "loss": 0.0028, "step": 9060 }, { "epoch": 1.838265099310904, "grad_norm": 0.1481909453868866, "learning_rate": 0.00010808674503445482, "loss": 0.0033, "step": 9070 }, { "epoch": 1.8402918524523713, "grad_norm": 0.006927825044840574, "learning_rate": 0.00010798540737738143, "loss": 0.0031, "step": 9080 }, { "epoch": 1.8423186055938388, "grad_norm": 0.006547117605805397, "learning_rate": 0.00010788406972030807, "loss": 0.0015, "step": 9090 }, { "epoch": 1.844345358735306, "grad_norm": 0.006231117062270641, "learning_rate": 0.0001077827320632347, "loss": 0.0023, "step": 9100 }, { "epoch": 1.844345358735306, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.12722833454608917, "eval_runtime": 71.9284, "eval_samples_per_second": 7.591, "eval_steps_per_second": 0.959, "step": 9100 }, { "epoch": 1.8463721118767733, "grad_norm": 0.006408121902495623, "learning_rate": 0.00010768139440616134, "loss": 0.0019, "step": 9110 }, { "epoch": 1.8483988650182408, "grad_norm": 0.006610712967813015, "learning_rate": 0.00010758005674908798, "loss": 0.7366, "step": 9120 }, { "epoch": 1.8504256181597083, "grad_norm": 0.060888879001140594, "learning_rate": 0.00010747871909201459, "loss": 0.0028, "step": 9130 }, { "epoch": 1.8524523713011756, "grad_norm": 0.0076481616124510765, "learning_rate": 0.00010737738143494122, "loss": 0.0022, "step": 9140 }, { "epoch": 1.8544791244426428, "grad_norm": 0.006980573292821646, "learning_rate": 0.00010727604377786787, "loss": 0.0014, "step": 9150 }, { "epoch": 1.8565058775841101, "grad_norm": 0.006991061381995678, "learning_rate": 0.0001071747061207945, "loss": 0.0021, "step": 9160 }, { "epoch": 1.8585326307255776, "grad_norm": 0.007438084110617638, "learning_rate": 0.00010707336846372112, "loss": 0.0014, "step": 9170 }, { "epoch": 1.8605593838670451, "grad_norm": 0.006888444069772959, "learning_rate": 0.00010697203080664775, "loss": 0.0014, "step": 9180 }, { "epoch": 1.8625861370085124, "grad_norm": 0.006462920922785997, "learning_rate": 0.00010687069314957438, "loss": 0.0013, "step": 9190 }, { "epoch": 1.8646128901499797, "grad_norm": 0.04772758483886719, "learning_rate": 0.00010676935549250102, "loss": 0.5028, "step": 9200 }, { "epoch": 1.8646128901499797, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.1444089710712433, "eval_runtime": 76.7389, "eval_samples_per_second": 7.115, "eval_steps_per_second": 0.899, "step": 9200 }, { "epoch": 1.866639643291447, "grad_norm": 0.007817757315933704, "learning_rate": 0.00010666801783542766, "loss": 0.7305, "step": 9210 }, { "epoch": 1.8686663964329144, "grad_norm": 0.00858191680163145, "learning_rate": 0.00010656668017835427, "loss": 1.0045, "step": 9220 }, { "epoch": 1.870693149574382, "grad_norm": 0.009446145966649055, "learning_rate": 0.00010646534252128091, "loss": 0.002, "step": 9230 }, { "epoch": 1.8727199027158492, "grad_norm": 0.00917446706444025, "learning_rate": 0.00010636400486420755, "loss": 0.0083, "step": 9240 }, { "epoch": 1.8747466558573165, "grad_norm": 0.09278376400470734, "learning_rate": 0.00010626266720713418, "loss": 0.5391, "step": 9250 }, { "epoch": 1.876773408998784, "grad_norm": 0.009064766578376293, "learning_rate": 0.00010616132955006081, "loss": 0.0019, "step": 9260 }, { "epoch": 1.8788001621402515, "grad_norm": 0.009625071659684181, "learning_rate": 0.00010605999189298743, "loss": 0.909, "step": 9270 }, { "epoch": 1.8808269152817187, "grad_norm": 0.011897668242454529, "learning_rate": 0.00010595865423591406, "loss": 0.4662, "step": 9280 }, { "epoch": 1.882853668423186, "grad_norm": 0.10990961641073227, "learning_rate": 0.0001058573165788407, "loss": 0.0138, "step": 9290 }, { "epoch": 1.8848804215646533, "grad_norm": 0.01036153919994831, "learning_rate": 0.00010575597892176734, "loss": 0.0051, "step": 9300 }, { "epoch": 1.8848804215646533, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.1275521218776703, "eval_runtime": 72.9524, "eval_samples_per_second": 7.484, "eval_steps_per_second": 0.946, "step": 9300 }, { "epoch": 1.8869071747061208, "grad_norm": 0.0937432125210762, "learning_rate": 0.00010565464126469395, "loss": 0.0087, "step": 9310 }, { "epoch": 1.8889339278475883, "grad_norm": 0.009895688854157925, "learning_rate": 0.0001055533036076206, "loss": 0.2518, "step": 9320 }, { "epoch": 1.8909606809890556, "grad_norm": 0.008279825560748577, "learning_rate": 0.00010545196595054723, "loss": 0.0021, "step": 9330 }, { "epoch": 1.8929874341305228, "grad_norm": 0.0076844836585223675, "learning_rate": 0.00010535062829347386, "loss": 0.0019, "step": 9340 }, { "epoch": 1.8950141872719901, "grad_norm": 0.07048050314188004, "learning_rate": 0.00010524929063640049, "loss": 0.0024, "step": 9350 }, { "epoch": 1.8970409404134576, "grad_norm": 8.189104080200195, "learning_rate": 0.00010514795297932711, "loss": 0.9113, "step": 9360 }, { "epoch": 1.899067693554925, "grad_norm": 0.008056996390223503, "learning_rate": 0.00010504661532225374, "loss": 0.0046, "step": 9370 }, { "epoch": 1.9010944466963924, "grad_norm": 0.007996568456292152, "learning_rate": 0.00010494527766518039, "loss": 0.005, "step": 9380 }, { "epoch": 1.9031211998378597, "grad_norm": 0.00770143186673522, "learning_rate": 0.00010484394000810702, "loss": 0.0045, "step": 9390 }, { "epoch": 1.9051479529793272, "grad_norm": 0.007663055323064327, "learning_rate": 0.00010474260235103366, "loss": 0.0019, "step": 9400 }, { "epoch": 1.9051479529793272, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.15504136681556702, "eval_runtime": 78.4367, "eval_samples_per_second": 6.961, "eval_steps_per_second": 0.88, "step": 9400 }, { "epoch": 1.9071747061207946, "grad_norm": 0.00760328583419323, "learning_rate": 0.00010464126469396028, "loss": 0.0018, "step": 9410 }, { "epoch": 1.909201459262262, "grad_norm": 21.01438331604004, "learning_rate": 0.00010453992703688691, "loss": 0.3983, "step": 9420 }, { "epoch": 1.9112282124037292, "grad_norm": 0.007572650909423828, "learning_rate": 0.00010443858937981354, "loss": 0.4565, "step": 9430 }, { "epoch": 1.9132549655451965, "grad_norm": 0.007722463458776474, "learning_rate": 0.00010433725172274019, "loss": 0.0009, "step": 9440 }, { "epoch": 1.915281718686664, "grad_norm": 0.007536469493061304, "learning_rate": 0.00010423591406566682, "loss": 0.0027, "step": 9450 }, { "epoch": 1.9173084718281315, "grad_norm": 0.007839085534214973, "learning_rate": 0.00010413457640859343, "loss": 0.4254, "step": 9460 }, { "epoch": 1.9193352249695987, "grad_norm": 0.12683309614658356, "learning_rate": 0.00010403323875152007, "loss": 0.0037, "step": 9470 }, { "epoch": 1.921361978111066, "grad_norm": 0.007982700131833553, "learning_rate": 0.0001039319010944467, "loss": 0.0038, "step": 9480 }, { "epoch": 1.9233887312525333, "grad_norm": 0.007812978699803352, "learning_rate": 0.00010383056343737334, "loss": 0.3642, "step": 9490 }, { "epoch": 1.9254154843940008, "grad_norm": 0.0074735949747264385, "learning_rate": 0.00010372922578029996, "loss": 0.0052, "step": 9500 }, { "epoch": 1.9254154843940008, "eval_accuracy": 0.9633699633699634, "eval_loss": 0.19582439959049225, "eval_runtime": 76.1295, "eval_samples_per_second": 7.172, "eval_steps_per_second": 0.906, "step": 9500 }, { "epoch": 1.9274422375354683, "grad_norm": 0.007177373860031366, "learning_rate": 0.00010362788812322659, "loss": 0.005, "step": 9510 }, { "epoch": 1.9294689906769356, "grad_norm": 0.007034731563180685, "learning_rate": 0.00010352655046615322, "loss": 0.428, "step": 9520 }, { "epoch": 1.9314957438184028, "grad_norm": 0.00731532322242856, "learning_rate": 0.00010342521280907987, "loss": 0.0059, "step": 9530 }, { "epoch": 1.9335224969598703, "grad_norm": 0.007122396025806665, "learning_rate": 0.0001033238751520065, "loss": 0.6688, "step": 9540 }, { "epoch": 1.9355492501013376, "grad_norm": 0.009541909210383892, "learning_rate": 0.00010322253749493312, "loss": 0.0027, "step": 9550 }, { "epoch": 1.937576003242805, "grad_norm": 0.1792389452457428, "learning_rate": 0.00010312119983785975, "loss": 0.0067, "step": 9560 }, { "epoch": 1.9396027563842724, "grad_norm": 0.010249803774058819, "learning_rate": 0.00010301986218078638, "loss": 0.3941, "step": 9570 }, { "epoch": 1.9416295095257396, "grad_norm": 0.11628035455942154, "learning_rate": 0.00010291852452371302, "loss": 0.4732, "step": 9580 }, { "epoch": 1.9436562626672071, "grad_norm": 0.10727599263191223, "learning_rate": 0.00010281718686663966, "loss": 0.7003, "step": 9590 }, { "epoch": 1.9456830158086746, "grad_norm": 0.168293759226799, "learning_rate": 0.00010271584920956627, "loss": 0.0099, "step": 9600 }, { "epoch": 1.9456830158086746, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.13594172894954681, "eval_runtime": 78.0607, "eval_samples_per_second": 6.995, "eval_steps_per_second": 0.884, "step": 9600 }, { "epoch": 1.947709768950142, "grad_norm": 0.014889145269989967, "learning_rate": 0.00010261451155249291, "loss": 0.0028, "step": 9610 }, { "epoch": 1.9497365220916092, "grad_norm": 0.1032806932926178, "learning_rate": 0.00010251317389541955, "loss": 0.4153, "step": 9620 }, { "epoch": 1.9517632752330765, "grad_norm": 0.0114668607711792, "learning_rate": 0.00010241183623834618, "loss": 0.0057, "step": 9630 }, { "epoch": 1.953790028374544, "grad_norm": 0.18307086825370789, "learning_rate": 0.0001023104985812728, "loss": 0.0052, "step": 9640 }, { "epoch": 1.9558167815160115, "grad_norm": 0.009420246817171574, "learning_rate": 0.00010220916092419943, "loss": 0.0061, "step": 9650 }, { "epoch": 1.9578435346574787, "grad_norm": 0.01083473488688469, "learning_rate": 0.00010210782326712606, "loss": 0.833, "step": 9660 }, { "epoch": 1.959870287798946, "grad_norm": 0.17974765598773956, "learning_rate": 0.00010200648561005271, "loss": 0.0044, "step": 9670 }, { "epoch": 1.9618970409404135, "grad_norm": 0.10389303416013718, "learning_rate": 0.00010190514795297934, "loss": 0.0036, "step": 9680 }, { "epoch": 1.9639237940818808, "grad_norm": 0.010213226079940796, "learning_rate": 0.00010180381029590595, "loss": 0.0028, "step": 9690 }, { "epoch": 1.9659505472233483, "grad_norm": 0.009096471592783928, "learning_rate": 0.0001017024726388326, "loss": 0.3494, "step": 9700 }, { "epoch": 1.9659505472233483, "eval_accuracy": 0.9542124542124543, "eval_loss": 0.19689708948135376, "eval_runtime": 77.5829, "eval_samples_per_second": 7.038, "eval_steps_per_second": 0.889, "step": 9700 }, { "epoch": 1.9679773003648156, "grad_norm": 0.007500569336116314, "learning_rate": 0.00010160113498175923, "loss": 0.5272, "step": 9710 }, { "epoch": 1.9700040535062828, "grad_norm": 0.007903238758444786, "learning_rate": 0.00010149979732468586, "loss": 0.0036, "step": 9720 }, { "epoch": 1.9720308066477503, "grad_norm": 0.022083204239606857, "learning_rate": 0.00010139845966761249, "loss": 0.0023, "step": 9730 }, { "epoch": 1.9740575597892178, "grad_norm": 0.00797079224139452, "learning_rate": 0.00010129712201053911, "loss": 0.884, "step": 9740 }, { "epoch": 1.976084312930685, "grad_norm": 0.010292446240782738, "learning_rate": 0.00010119578435346574, "loss": 0.0073, "step": 9750 }, { "epoch": 1.9781110660721524, "grad_norm": 0.00823873933404684, "learning_rate": 0.00010109444669639239, "loss": 0.3514, "step": 9760 }, { "epoch": 1.9801378192136196, "grad_norm": 0.10817509144544601, "learning_rate": 0.00010099310903931902, "loss": 0.0078, "step": 9770 }, { "epoch": 1.9821645723550871, "grad_norm": 0.007184948306530714, "learning_rate": 0.00010089177138224564, "loss": 0.0009, "step": 9780 }, { "epoch": 1.9841913254965546, "grad_norm": 0.13583801686763763, "learning_rate": 0.00010079043372517228, "loss": 0.4471, "step": 9790 }, { "epoch": 1.986218078638022, "grad_norm": 0.11831188201904297, "learning_rate": 0.00010068909606809891, "loss": 0.0035, "step": 9800 }, { "epoch": 1.986218078638022, "eval_accuracy": 0.9578754578754579, "eval_loss": 0.16712024807929993, "eval_runtime": 77.7527, "eval_samples_per_second": 7.022, "eval_steps_per_second": 0.887, "step": 9800 }, { "epoch": 1.9882448317794892, "grad_norm": 0.17543888092041016, "learning_rate": 0.00010058775841102554, "loss": 1.7896, "step": 9810 }, { "epoch": 1.9902715849209567, "grad_norm": 0.01244261208921671, "learning_rate": 0.00010048642075395219, "loss": 0.0038, "step": 9820 }, { "epoch": 1.992298338062424, "grad_norm": 0.010985905304551125, "learning_rate": 0.00010038508309687879, "loss": 0.0011, "step": 9830 }, { "epoch": 1.9943250912038915, "grad_norm": 0.0096734669059515, "learning_rate": 0.00010028374543980544, "loss": 0.0092, "step": 9840 }, { "epoch": 1.9963518443453587, "grad_norm": 0.009174306876957417, "learning_rate": 0.00010018240778273207, "loss": 0.0039, "step": 9850 }, { "epoch": 1.998378597486826, "grad_norm": 0.010896787978708744, "learning_rate": 0.0001000810701256587, "loss": 0.0142, "step": 9860 }, { "epoch": 2.0004053506282933, "grad_norm": 0.008190103806555271, "learning_rate": 9.997973246858532e-05, "loss": 0.0048, "step": 9870 }, { "epoch": 2.002432103769761, "grad_norm": 0.007770847994834185, "learning_rate": 9.987839481151197e-05, "loss": 0.0025, "step": 9880 }, { "epoch": 2.0044588569112283, "grad_norm": 0.009759979322552681, "learning_rate": 9.977705715443859e-05, "loss": 0.3902, "step": 9890 }, { "epoch": 2.0064856100526955, "grad_norm": 0.007514787372201681, "learning_rate": 9.967571949736522e-05, "loss": 0.0025, "step": 9900 }, { "epoch": 2.0064856100526955, "eval_accuracy": 0.9706959706959707, "eval_loss": 0.14347095787525177, "eval_runtime": 72.6553, "eval_samples_per_second": 7.515, "eval_steps_per_second": 0.95, "step": 9900 }, { "epoch": 2.008512363194163, "grad_norm": 0.007548211142420769, "learning_rate": 9.957438184029185e-05, "loss": 0.0066, "step": 9910 }, { "epoch": 2.0105391163356305, "grad_norm": 0.11043214052915573, "learning_rate": 9.947304418321849e-05, "loss": 0.0049, "step": 9920 }, { "epoch": 2.012565869477098, "grad_norm": 0.006975914351642132, "learning_rate": 9.937170652614512e-05, "loss": 0.5866, "step": 9930 }, { "epoch": 2.014592622618565, "grad_norm": 0.015638014301657677, "learning_rate": 9.927036886907175e-05, "loss": 0.4344, "step": 9940 }, { "epoch": 2.0166193757600324, "grad_norm": 0.006982761435210705, "learning_rate": 9.916903121199838e-05, "loss": 0.0098, "step": 9950 }, { "epoch": 2.0186461289014996, "grad_norm": 0.006579644978046417, "learning_rate": 9.906769355492502e-05, "loss": 0.0094, "step": 9960 }, { "epoch": 2.0206728820429674, "grad_norm": 0.006210808642208576, "learning_rate": 9.896635589785165e-05, "loss": 0.0073, "step": 9970 }, { "epoch": 2.0226996351844346, "grad_norm": 0.030406000092625618, "learning_rate": 9.886501824077827e-05, "loss": 0.0041, "step": 9980 }, { "epoch": 2.024726388325902, "grad_norm": 0.011690155602991581, "learning_rate": 9.876368058370491e-05, "loss": 0.0042, "step": 9990 }, { "epoch": 2.026753141467369, "grad_norm": 0.005466008093208075, "learning_rate": 9.866234292663155e-05, "loss": 0.0006, "step": 10000 }, { "epoch": 2.026753141467369, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.11869247257709503, "eval_runtime": 72.4776, "eval_samples_per_second": 7.533, "eval_steps_per_second": 0.952, "step": 10000 }, { "epoch": 2.0287798946088365, "grad_norm": 0.00618654815480113, "learning_rate": 9.856100526955817e-05, "loss": 0.0014, "step": 10010 }, { "epoch": 2.030806647750304, "grad_norm": 0.005324477329850197, "learning_rate": 9.845966761248481e-05, "loss": 0.0024, "step": 10020 }, { "epoch": 2.0328334008917714, "grad_norm": 0.005492259282618761, "learning_rate": 9.835832995541143e-05, "loss": 0.0006, "step": 10030 }, { "epoch": 2.0348601540332387, "grad_norm": 0.005066926125437021, "learning_rate": 9.825699229833806e-05, "loss": 0.0006, "step": 10040 }, { "epoch": 2.036886907174706, "grad_norm": 0.004997451324015856, "learning_rate": 9.81556546412647e-05, "loss": 0.0005, "step": 10050 }, { "epoch": 2.0389136603161733, "grad_norm": 0.004906692076474428, "learning_rate": 9.805431698419133e-05, "loss": 0.0031, "step": 10060 }, { "epoch": 2.040940413457641, "grad_norm": 0.005268965382128954, "learning_rate": 9.795297932711796e-05, "loss": 0.417, "step": 10070 }, { "epoch": 2.0429671665991083, "grad_norm": 0.13274605572223663, "learning_rate": 9.78516416700446e-05, "loss": 0.002, "step": 10080 }, { "epoch": 2.0449939197405755, "grad_norm": 0.13358217477798462, "learning_rate": 9.775030401297123e-05, "loss": 0.4915, "step": 10090 }, { "epoch": 2.047020672882043, "grad_norm": 0.0064069111831486225, "learning_rate": 9.764896635589785e-05, "loss": 0.0035, "step": 10100 }, { "epoch": 2.047020672882043, "eval_accuracy": 0.978021978021978, "eval_loss": 0.13034148514270782, "eval_runtime": 75.3202, "eval_samples_per_second": 7.249, "eval_steps_per_second": 0.916, "step": 10100 }, { "epoch": 2.0490474260235105, "grad_norm": 0.09453519433736801, "learning_rate": 9.754762869882449e-05, "loss": 0.4684, "step": 10110 }, { "epoch": 2.051074179164978, "grad_norm": 0.005555831361562014, "learning_rate": 9.744629104175111e-05, "loss": 0.0018, "step": 10120 }, { "epoch": 2.053100932306445, "grad_norm": 0.005330225918442011, "learning_rate": 9.734495338467774e-05, "loss": 0.0028, "step": 10130 }, { "epoch": 2.0551276854479124, "grad_norm": 0.14626197516918182, "learning_rate": 9.724361572760439e-05, "loss": 0.0023, "step": 10140 }, { "epoch": 2.0571544385893796, "grad_norm": 0.005088937468826771, "learning_rate": 9.714227807053101e-05, "loss": 0.0016, "step": 10150 }, { "epoch": 2.0591811917308473, "grad_norm": 0.005233987234532833, "learning_rate": 9.704094041345764e-05, "loss": 0.0054, "step": 10160 }, { "epoch": 2.0612079448723146, "grad_norm": 0.00554394256323576, "learning_rate": 9.693960275638428e-05, "loss": 0.4676, "step": 10170 }, { "epoch": 2.063234698013782, "grad_norm": 0.00520316231995821, "learning_rate": 9.683826509931091e-05, "loss": 0.0026, "step": 10180 }, { "epoch": 2.065261451155249, "grad_norm": 0.005012698005884886, "learning_rate": 9.673692744223754e-05, "loss": 0.0019, "step": 10190 }, { "epoch": 2.0672882042967164, "grad_norm": 0.005969386547803879, "learning_rate": 9.663558978516417e-05, "loss": 0.7492, "step": 10200 }, { "epoch": 2.0672882042967164, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.12935897707939148, "eval_runtime": 78.1995, "eval_samples_per_second": 6.982, "eval_steps_per_second": 0.882, "step": 10200 }, { "epoch": 2.069314957438184, "grad_norm": 0.006063271313905716, "learning_rate": 9.65342521280908e-05, "loss": 0.002, "step": 10210 }, { "epoch": 2.0713417105796514, "grad_norm": 0.15411558747291565, "learning_rate": 9.643291447101744e-05, "loss": 0.0025, "step": 10220 }, { "epoch": 2.0733684637211187, "grad_norm": 0.006028542760759592, "learning_rate": 9.633157681394407e-05, "loss": 0.0028, "step": 10230 }, { "epoch": 2.075395216862586, "grad_norm": 0.005989754106849432, "learning_rate": 9.623023915687069e-05, "loss": 0.0017, "step": 10240 }, { "epoch": 2.0774219700040537, "grad_norm": 0.006351362448185682, "learning_rate": 9.612890149979734e-05, "loss": 0.0033, "step": 10250 }, { "epoch": 2.079448723145521, "grad_norm": 0.7296304702758789, "learning_rate": 9.602756384272396e-05, "loss": 0.7193, "step": 10260 }, { "epoch": 2.0814754762869883, "grad_norm": 0.010890481062233448, "learning_rate": 9.592622618565059e-05, "loss": 1.4063, "step": 10270 }, { "epoch": 2.0835022294284555, "grad_norm": 0.014627876691520214, "learning_rate": 9.582488852857723e-05, "loss": 1.5828, "step": 10280 }, { "epoch": 2.085528982569923, "grad_norm": 0.012853405438363552, "learning_rate": 9.572355087150385e-05, "loss": 0.0077, "step": 10290 }, { "epoch": 2.0875557357113905, "grad_norm": 0.06252937763929367, "learning_rate": 9.562221321443049e-05, "loss": 0.0154, "step": 10300 }, { "epoch": 2.0875557357113905, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11084937304258347, "eval_runtime": 74.4902, "eval_samples_per_second": 7.33, "eval_steps_per_second": 0.926, "step": 10300 }, { "epoch": 2.089582488852858, "grad_norm": 0.012860951945185661, "learning_rate": 9.552087555735712e-05, "loss": 0.005, "step": 10310 }, { "epoch": 2.091609241994325, "grad_norm": 0.01152266189455986, "learning_rate": 9.541953790028375e-05, "loss": 0.3453, "step": 10320 }, { "epoch": 2.0936359951357923, "grad_norm": 0.00889595691114664, "learning_rate": 9.531820024321037e-05, "loss": 0.0035, "step": 10330 }, { "epoch": 2.0956627482772596, "grad_norm": 0.04616453871130943, "learning_rate": 9.521686258613702e-05, "loss": 0.0013, "step": 10340 }, { "epoch": 2.0976895014187273, "grad_norm": 0.010016735643148422, "learning_rate": 9.511552492906365e-05, "loss": 0.0084, "step": 10350 }, { "epoch": 2.0997162545601946, "grad_norm": 0.17551860213279724, "learning_rate": 9.501418727199027e-05, "loss": 0.0062, "step": 10360 }, { "epoch": 2.101743007701662, "grad_norm": 0.02717851661145687, "learning_rate": 9.491284961491692e-05, "loss": 0.2781, "step": 10370 }, { "epoch": 2.103769760843129, "grad_norm": 0.006723467726260424, "learning_rate": 9.481151195784353e-05, "loss": 0.0025, "step": 10380 }, { "epoch": 2.105796513984597, "grad_norm": 0.006483188830316067, "learning_rate": 9.471017430077017e-05, "loss": 0.0064, "step": 10390 }, { "epoch": 2.107823267126064, "grad_norm": 0.006499973591417074, "learning_rate": 9.460883664369681e-05, "loss": 0.0007, "step": 10400 }, { "epoch": 2.107823267126064, "eval_accuracy": 0.9487179487179487, "eval_loss": 0.26751798391342163, "eval_runtime": 72.3009, "eval_samples_per_second": 7.552, "eval_steps_per_second": 0.954, "step": 10400 }, { "epoch": 2.1098500202675314, "grad_norm": 0.006514677777886391, "learning_rate": 9.450749898662343e-05, "loss": 0.0027, "step": 10410 }, { "epoch": 2.1118767734089987, "grad_norm": 0.0059182727709412575, "learning_rate": 9.440616132955006e-05, "loss": 0.0032, "step": 10420 }, { "epoch": 2.113903526550466, "grad_norm": 0.09711989760398865, "learning_rate": 9.43048236724767e-05, "loss": 0.0029, "step": 10430 }, { "epoch": 2.1159302796919337, "grad_norm": 0.006037118844687939, "learning_rate": 9.420348601540333e-05, "loss": 0.0006, "step": 10440 }, { "epoch": 2.117957032833401, "grad_norm": 0.005589130334556103, "learning_rate": 9.410214835832995e-05, "loss": 0.2581, "step": 10450 }, { "epoch": 2.1199837859748683, "grad_norm": 0.08527911454439163, "learning_rate": 9.40008107012566e-05, "loss": 0.0016, "step": 10460 }, { "epoch": 2.1220105391163355, "grad_norm": 0.07996934652328491, "learning_rate": 9.389947304418323e-05, "loss": 0.4869, "step": 10470 }, { "epoch": 2.124037292257803, "grad_norm": 0.007227790541946888, "learning_rate": 9.379813538710985e-05, "loss": 0.4646, "step": 10480 }, { "epoch": 2.1260640453992705, "grad_norm": 0.0073805213905870914, "learning_rate": 9.36967977300365e-05, "loss": 0.0031, "step": 10490 }, { "epoch": 2.128090798540738, "grad_norm": 0.00691114692017436, "learning_rate": 9.359546007296311e-05, "loss": 0.0008, "step": 10500 }, { "epoch": 2.128090798540738, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.1334153413772583, "eval_runtime": 77.7167, "eval_samples_per_second": 7.026, "eval_steps_per_second": 0.888, "step": 10500 }, { "epoch": 2.130117551682205, "grad_norm": 0.00719085056334734, "learning_rate": 9.349412241588974e-05, "loss": 0.0061, "step": 10510 }, { "epoch": 2.1321443048236723, "grad_norm": 0.005878064781427383, "learning_rate": 9.339278475881638e-05, "loss": 0.0079, "step": 10520 }, { "epoch": 2.1341710579651396, "grad_norm": 0.005655771121382713, "learning_rate": 9.329144710174301e-05, "loss": 0.003, "step": 10530 }, { "epoch": 2.1361978111066073, "grad_norm": 0.07498996704816818, "learning_rate": 9.319010944466964e-05, "loss": 0.0046, "step": 10540 }, { "epoch": 2.1382245642480746, "grad_norm": 0.005195173434913158, "learning_rate": 9.308877178759628e-05, "loss": 0.0014, "step": 10550 }, { "epoch": 2.140251317389542, "grad_norm": 0.1367482841014862, "learning_rate": 9.298743413052291e-05, "loss": 0.0019, "step": 10560 }, { "epoch": 2.142278070531009, "grad_norm": 0.005297408904880285, "learning_rate": 9.288609647344954e-05, "loss": 0.0027, "step": 10570 }, { "epoch": 2.144304823672477, "grad_norm": 0.006349307019263506, "learning_rate": 9.278475881637617e-05, "loss": 0.0016, "step": 10580 }, { "epoch": 2.146331576813944, "grad_norm": 0.005633942317217588, "learning_rate": 9.268342115930279e-05, "loss": 0.2652, "step": 10590 }, { "epoch": 2.1483583299554114, "grad_norm": 0.05479590594768524, "learning_rate": 9.258208350222944e-05, "loss": 0.003, "step": 10600 }, { "epoch": 2.1483583299554114, "eval_accuracy": 0.967032967032967, "eval_loss": 0.15828333795070648, "eval_runtime": 76.828, "eval_samples_per_second": 7.107, "eval_steps_per_second": 0.898, "step": 10600 }, { "epoch": 2.1503850830968787, "grad_norm": 0.00457912078127265, "learning_rate": 9.248074584515607e-05, "loss": 0.003, "step": 10610 }, { "epoch": 2.152411836238346, "grad_norm": 0.005090583115816116, "learning_rate": 9.237940818808269e-05, "loss": 0.7639, "step": 10620 }, { "epoch": 2.1544385893798137, "grad_norm": 0.0057753403671085835, "learning_rate": 9.227807053100934e-05, "loss": 0.0011, "step": 10630 }, { "epoch": 2.156465342521281, "grad_norm": 0.006544235162436962, "learning_rate": 9.217673287393596e-05, "loss": 0.0009, "step": 10640 }, { "epoch": 2.1584920956627482, "grad_norm": 0.005472972523421049, "learning_rate": 9.207539521686259e-05, "loss": 0.0006, "step": 10650 }, { "epoch": 2.1605188488042155, "grad_norm": 0.005556443706154823, "learning_rate": 9.197405755978922e-05, "loss": 0.7453, "step": 10660 }, { "epoch": 2.1625456019456832, "grad_norm": 0.007693535648286343, "learning_rate": 9.187271990271585e-05, "loss": 0.0014, "step": 10670 }, { "epoch": 2.1645723550871505, "grad_norm": 0.007847932167351246, "learning_rate": 9.177138224564249e-05, "loss": 0.0015, "step": 10680 }, { "epoch": 2.166599108228618, "grad_norm": 0.016400208696722984, "learning_rate": 9.167004458856912e-05, "loss": 0.0014, "step": 10690 }, { "epoch": 2.168625861370085, "grad_norm": 0.010732585564255714, "learning_rate": 9.156870693149575e-05, "loss": 0.4043, "step": 10700 }, { "epoch": 2.168625861370085, "eval_accuracy": 0.978021978021978, "eval_loss": 0.11980610340833664, "eval_runtime": 76.3749, "eval_samples_per_second": 7.149, "eval_steps_per_second": 0.903, "step": 10700 }, { "epoch": 2.1706526145115523, "grad_norm": 0.04904749244451523, "learning_rate": 9.146736927442237e-05, "loss": 0.0034, "step": 10710 }, { "epoch": 2.17267936765302, "grad_norm": 873.9728393554688, "learning_rate": 9.136603161734902e-05, "loss": 0.157, "step": 10720 }, { "epoch": 2.1747061207944873, "grad_norm": 0.006011638790369034, "learning_rate": 9.126469396027564e-05, "loss": 0.0012, "step": 10730 }, { "epoch": 2.1767328739359546, "grad_norm": 0.040896765887737274, "learning_rate": 9.116335630320227e-05, "loss": 0.0016, "step": 10740 }, { "epoch": 2.178759627077422, "grad_norm": 0.005535133183002472, "learning_rate": 9.106201864612892e-05, "loss": 0.0016, "step": 10750 }, { "epoch": 2.180786380218889, "grad_norm": 0.0057787708938121796, "learning_rate": 9.096068098905553e-05, "loss": 0.0018, "step": 10760 }, { "epoch": 2.182813133360357, "grad_norm": 0.007440904155373573, "learning_rate": 9.085934333198217e-05, "loss": 1.3857, "step": 10770 }, { "epoch": 2.184839886501824, "grad_norm": 0.008765467442572117, "learning_rate": 9.07580056749088e-05, "loss": 0.5295, "step": 10780 }, { "epoch": 2.1868666396432914, "grad_norm": 0.009373875334858894, "learning_rate": 9.065666801783543e-05, "loss": 0.005, "step": 10790 }, { "epoch": 2.1888933927847587, "grad_norm": 0.007794825825840235, "learning_rate": 9.055533036076206e-05, "loss": 0.0016, "step": 10800 }, { "epoch": 2.1888933927847587, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.11303602159023285, "eval_runtime": 71.211, "eval_samples_per_second": 7.667, "eval_steps_per_second": 0.969, "step": 10800 }, { "epoch": 2.190920145926226, "grad_norm": 0.0074144043028354645, "learning_rate": 9.04539927036887e-05, "loss": 0.0015, "step": 10810 }, { "epoch": 2.1929468990676937, "grad_norm": 0.00861959159374237, "learning_rate": 9.035265504661533e-05, "loss": 0.0017, "step": 10820 }, { "epoch": 2.194973652209161, "grad_norm": 0.008004290983080864, "learning_rate": 9.025131738954196e-05, "loss": 0.6036, "step": 10830 }, { "epoch": 2.1970004053506282, "grad_norm": 0.05160319060087204, "learning_rate": 9.01499797324686e-05, "loss": 0.0024, "step": 10840 }, { "epoch": 2.1990271584920955, "grad_norm": 0.03136659041047096, "learning_rate": 9.004864207539521e-05, "loss": 0.511, "step": 10850 }, { "epoch": 2.2010539116335632, "grad_norm": 0.14971128106117249, "learning_rate": 8.994730441832186e-05, "loss": 0.9823, "step": 10860 }, { "epoch": 2.2030806647750305, "grad_norm": 0.009087011218070984, "learning_rate": 8.984596676124848e-05, "loss": 0.0022, "step": 10870 }, { "epoch": 2.205107417916498, "grad_norm": 0.010233234614133835, "learning_rate": 8.974462910417511e-05, "loss": 0.005, "step": 10880 }, { "epoch": 2.207134171057965, "grad_norm": 0.008126436732709408, "learning_rate": 8.964329144710175e-05, "loss": 0.0085, "step": 10890 }, { "epoch": 2.2091609241994323, "grad_norm": 0.010633064433932304, "learning_rate": 8.954195379002838e-05, "loss": 0.0033, "step": 10900 }, { "epoch": 2.2091609241994323, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11015180498361588, "eval_runtime": 75.0803, "eval_samples_per_second": 7.272, "eval_steps_per_second": 0.919, "step": 10900 }, { "epoch": 2.2111876773409, "grad_norm": 0.008036792278289795, "learning_rate": 8.944061613295501e-05, "loss": 0.3998, "step": 10910 }, { "epoch": 2.2132144304823673, "grad_norm": 0.17050723731517792, "learning_rate": 8.933927847588164e-05, "loss": 1.1443, "step": 10920 }, { "epoch": 2.2152411836238346, "grad_norm": 0.030761348083615303, "learning_rate": 8.923794081880828e-05, "loss": 0.5031, "step": 10930 }, { "epoch": 2.217267936765302, "grad_norm": 0.01277527492493391, "learning_rate": 8.91366031617349e-05, "loss": 0.0014, "step": 10940 }, { "epoch": 2.2192946899067696, "grad_norm": 0.01332076545804739, "learning_rate": 8.903526550466154e-05, "loss": 0.0056, "step": 10950 }, { "epoch": 2.221321443048237, "grad_norm": 0.0114201745018363, "learning_rate": 8.893392784758817e-05, "loss": 0.0034, "step": 10960 }, { "epoch": 2.223348196189704, "grad_norm": 0.01028998102992773, "learning_rate": 8.883259019051479e-05, "loss": 0.0012, "step": 10970 }, { "epoch": 2.2253749493311714, "grad_norm": 0.011788029223680496, "learning_rate": 8.873125253344144e-05, "loss": 0.8934, "step": 10980 }, { "epoch": 2.2274017024726387, "grad_norm": 0.012349162250757217, "learning_rate": 8.862991487636806e-05, "loss": 0.0175, "step": 10990 }, { "epoch": 2.2294284556141064, "grad_norm": 0.026122385635972023, "learning_rate": 8.852857721929469e-05, "loss": 1.0287, "step": 11000 }, { "epoch": 2.2294284556141064, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.10534922033548355, "eval_runtime": 70.5584, "eval_samples_per_second": 7.738, "eval_steps_per_second": 0.978, "step": 11000 }, { "epoch": 2.2314552087555737, "grad_norm": 0.027139287441968918, "learning_rate": 8.842723956222134e-05, "loss": 0.0108, "step": 11010 }, { "epoch": 2.233481961897041, "grad_norm": 0.0179609265178442, "learning_rate": 8.832590190514796e-05, "loss": 0.3606, "step": 11020 }, { "epoch": 2.2355087150385082, "grad_norm": 0.015641842037439346, "learning_rate": 8.822456424807459e-05, "loss": 0.0066, "step": 11030 }, { "epoch": 2.2375354681799755, "grad_norm": 0.013537859544157982, "learning_rate": 8.812322659100122e-05, "loss": 0.0059, "step": 11040 }, { "epoch": 2.2395622213214432, "grad_norm": 0.012414119206368923, "learning_rate": 8.802188893392785e-05, "loss": 0.0056, "step": 11050 }, { "epoch": 2.2415889744629105, "grad_norm": 0.015200314112007618, "learning_rate": 8.792055127685447e-05, "loss": 0.9405, "step": 11060 }, { "epoch": 2.2436157276043778, "grad_norm": 0.020387277007102966, "learning_rate": 8.781921361978112e-05, "loss": 0.0144, "step": 11070 }, { "epoch": 2.245642480745845, "grad_norm": 0.016295185312628746, "learning_rate": 8.771787596270775e-05, "loss": 0.4575, "step": 11080 }, { "epoch": 2.2476692338873123, "grad_norm": 0.405525267124176, "learning_rate": 8.761653830563437e-05, "loss": 0.0092, "step": 11090 }, { "epoch": 2.24969598702878, "grad_norm": 0.016881616786122322, "learning_rate": 8.751520064856102e-05, "loss": 0.3159, "step": 11100 }, { "epoch": 2.24969598702878, "eval_accuracy": 0.978021978021978, "eval_loss": 0.10043869912624359, "eval_runtime": 72.9585, "eval_samples_per_second": 7.484, "eval_steps_per_second": 0.946, "step": 11100 }, { "epoch": 2.2517227401702473, "grad_norm": 0.01376478374004364, "learning_rate": 8.741386299148764e-05, "loss": 0.0095, "step": 11110 }, { "epoch": 2.2537494933117146, "grad_norm": 0.030010223388671875, "learning_rate": 8.731252533441427e-05, "loss": 0.0126, "step": 11120 }, { "epoch": 2.255776246453182, "grad_norm": 0.011137278750538826, "learning_rate": 8.72111876773409e-05, "loss": 0.0038, "step": 11130 }, { "epoch": 2.2578029995946496, "grad_norm": 0.015156904235482216, "learning_rate": 8.710985002026753e-05, "loss": 0.7007, "step": 11140 }, { "epoch": 2.259829752736117, "grad_norm": 0.010496138595044613, "learning_rate": 8.700851236319417e-05, "loss": 0.003, "step": 11150 }, { "epoch": 2.261856505877584, "grad_norm": 0.0970601812005043, "learning_rate": 8.69071747061208e-05, "loss": 0.0035, "step": 11160 }, { "epoch": 2.2638832590190514, "grad_norm": 0.009098607115447521, "learning_rate": 8.680583704904743e-05, "loss": 0.0043, "step": 11170 }, { "epoch": 2.2659100121605187, "grad_norm": 0.008969028480350971, "learning_rate": 8.670449939197406e-05, "loss": 0.0059, "step": 11180 }, { "epoch": 2.2679367653019864, "grad_norm": 0.009530116803944111, "learning_rate": 8.66031617349007e-05, "loss": 0.0047, "step": 11190 }, { "epoch": 2.2699635184434537, "grad_norm": 0.008103832602500916, "learning_rate": 8.650182407782732e-05, "loss": 0.0464, "step": 11200 }, { "epoch": 2.2699635184434537, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11814778298139572, "eval_runtime": 74.7548, "eval_samples_per_second": 7.304, "eval_steps_per_second": 0.923, "step": 11200 }, { "epoch": 2.271990271584921, "grad_norm": 0.008212698623538017, "learning_rate": 8.640048642075396e-05, "loss": 0.0033, "step": 11210 }, { "epoch": 2.2740170247263882, "grad_norm": 0.007732476573437452, "learning_rate": 8.62991487636806e-05, "loss": 0.7119, "step": 11220 }, { "epoch": 2.276043777867856, "grad_norm": 0.009567365050315857, "learning_rate": 8.619781110660721e-05, "loss": 0.1782, "step": 11230 }, { "epoch": 2.278070531009323, "grad_norm": 0.05973416939377785, "learning_rate": 8.609647344953386e-05, "loss": 0.0023, "step": 11240 }, { "epoch": 2.2800972841507905, "grad_norm": 0.009354268200695515, "learning_rate": 8.599513579246048e-05, "loss": 0.0025, "step": 11250 }, { "epoch": 2.2821240372922578, "grad_norm": 0.00866016000509262, "learning_rate": 8.589379813538711e-05, "loss": 0.001, "step": 11260 }, { "epoch": 2.284150790433725, "grad_norm": 0.008696874603629112, "learning_rate": 8.579246047831375e-05, "loss": 0.0033, "step": 11270 }, { "epoch": 2.2861775435751923, "grad_norm": 0.00813909713178873, "learning_rate": 8.569112282124038e-05, "loss": 0.002, "step": 11280 }, { "epoch": 2.28820429671666, "grad_norm": 0.00973751861602068, "learning_rate": 8.558978516416701e-05, "loss": 0.0014, "step": 11290 }, { "epoch": 2.2902310498581273, "grad_norm": 0.007876581512391567, "learning_rate": 8.548844750709364e-05, "loss": 0.002, "step": 11300 }, { "epoch": 2.2902310498581273, "eval_accuracy": 0.9560439560439561, "eval_loss": 0.2652477025985718, "eval_runtime": 76.3473, "eval_samples_per_second": 7.152, "eval_steps_per_second": 0.904, "step": 11300 }, { "epoch": 2.2922578029995946, "grad_norm": 0.00753940362483263, "learning_rate": 8.538710985002028e-05, "loss": 0.0013, "step": 11310 }, { "epoch": 2.294284556141062, "grad_norm": 0.007245345041155815, "learning_rate": 8.52857721929469e-05, "loss": 0.0008, "step": 11320 }, { "epoch": 2.2963113092825296, "grad_norm": 0.00759044848382473, "learning_rate": 8.518443453587354e-05, "loss": 0.5261, "step": 11330 }, { "epoch": 2.298338062423997, "grad_norm": 0.0077063278295099735, "learning_rate": 8.508309687880016e-05, "loss": 0.0019, "step": 11340 }, { "epoch": 2.300364815565464, "grad_norm": 0.007679214235395193, "learning_rate": 8.498175922172679e-05, "loss": 0.001, "step": 11350 }, { "epoch": 2.3023915687069314, "grad_norm": 0.0077296835370361805, "learning_rate": 8.488042156465344e-05, "loss": 0.0015, "step": 11360 }, { "epoch": 2.3044183218483987, "grad_norm": 0.007177960593253374, "learning_rate": 8.477908390758006e-05, "loss": 0.0019, "step": 11370 }, { "epoch": 2.3064450749898664, "grad_norm": 0.006858570966869593, "learning_rate": 8.467774625050669e-05, "loss": 0.002, "step": 11380 }, { "epoch": 2.3084718281313337, "grad_norm": 0.03824995085597038, "learning_rate": 8.457640859343332e-05, "loss": 0.0011, "step": 11390 }, { "epoch": 2.310498581272801, "grad_norm": 0.007602210622280836, "learning_rate": 8.447507093635996e-05, "loss": 0.0758, "step": 11400 }, { "epoch": 2.310498581272801, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.141328364610672, "eval_runtime": 73.9667, "eval_samples_per_second": 7.382, "eval_steps_per_second": 0.933, "step": 11400 }, { "epoch": 2.312525334414268, "grad_norm": 0.008061299100518227, "learning_rate": 8.437373327928659e-05, "loss": 0.0011, "step": 11410 }, { "epoch": 2.314552087555736, "grad_norm": 0.24199163913726807, "learning_rate": 8.427239562221322e-05, "loss": 0.0025, "step": 11420 }, { "epoch": 2.316578840697203, "grad_norm": 13.275392532348633, "learning_rate": 8.417105796513985e-05, "loss": 1.0208, "step": 11430 }, { "epoch": 2.3186055938386705, "grad_norm": 0.006914615165442228, "learning_rate": 8.406972030806647e-05, "loss": 0.0017, "step": 11440 }, { "epoch": 2.3206323469801378, "grad_norm": 0.009104193188250065, "learning_rate": 8.396838265099312e-05, "loss": 0.2027, "step": 11450 }, { "epoch": 2.322659100121605, "grad_norm": 0.007759319618344307, "learning_rate": 8.386704499391974e-05, "loss": 0.2647, "step": 11460 }, { "epoch": 2.3246858532630723, "grad_norm": 0.04744374752044678, "learning_rate": 8.376570733684637e-05, "loss": 0.0013, "step": 11470 }, { "epoch": 2.32671260640454, "grad_norm": 0.08690156787633896, "learning_rate": 8.3664369679773e-05, "loss": 0.476, "step": 11480 }, { "epoch": 2.3287393595460073, "grad_norm": 0.007856318727135658, "learning_rate": 8.356303202269964e-05, "loss": 0.5006, "step": 11490 }, { "epoch": 2.3307661126874746, "grad_norm": 0.008044347167015076, "learning_rate": 8.346169436562627e-05, "loss": 0.0027, "step": 11500 }, { "epoch": 2.3307661126874746, "eval_accuracy": 0.945054945054945, "eval_loss": 0.2024707794189453, "eval_runtime": 213.0607, "eval_samples_per_second": 2.563, "eval_steps_per_second": 0.324, "step": 11500 }, { "epoch": 2.332792865828942, "grad_norm": 0.011505112051963806, "learning_rate": 8.33603567085529e-05, "loss": 0.1192, "step": 11510 }, { "epoch": 2.3348196189704096, "grad_norm": 0.007121453061699867, "learning_rate": 8.325901905147953e-05, "loss": 0.0023, "step": 11520 }, { "epoch": 2.336846372111877, "grad_norm": 0.007184705231338739, "learning_rate": 8.315768139440617e-05, "loss": 0.0047, "step": 11530 }, { "epoch": 2.338873125253344, "grad_norm": 0.007320650387555361, "learning_rate": 8.30563437373328e-05, "loss": 0.4688, "step": 11540 }, { "epoch": 2.3408998783948114, "grad_norm": 0.0069977049715816975, "learning_rate": 8.295500608025942e-05, "loss": 0.0026, "step": 11550 }, { "epoch": 2.3429266315362787, "grad_norm": 0.006968265399336815, "learning_rate": 8.285366842318606e-05, "loss": 0.0027, "step": 11560 }, { "epoch": 2.3449533846777464, "grad_norm": 0.007060995325446129, "learning_rate": 8.27523307661127e-05, "loss": 0.0021, "step": 11570 }, { "epoch": 2.3469801378192137, "grad_norm": 0.006562945432960987, "learning_rate": 8.265099310903932e-05, "loss": 0.0038, "step": 11580 }, { "epoch": 2.349006890960681, "grad_norm": 0.00626459950581193, "learning_rate": 8.254965545196596e-05, "loss": 0.0037, "step": 11590 }, { "epoch": 2.351033644102148, "grad_norm": 0.006127441301941872, "learning_rate": 8.244831779489258e-05, "loss": 0.0011, "step": 11600 }, { "epoch": 2.351033644102148, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.13722333312034607, "eval_runtime": 211.759, "eval_samples_per_second": 2.578, "eval_steps_per_second": 0.326, "step": 11600 }, { "epoch": 2.353060397243616, "grad_norm": 6.2088303565979, "learning_rate": 8.234698013781921e-05, "loss": 0.0371, "step": 11610 }, { "epoch": 2.355087150385083, "grad_norm": 0.005822502542287111, "learning_rate": 8.224564248074585e-05, "loss": 0.0017, "step": 11620 }, { "epoch": 2.3571139035265505, "grad_norm": 0.005627756007015705, "learning_rate": 8.214430482367248e-05, "loss": 0.3425, "step": 11630 }, { "epoch": 2.3591406566680178, "grad_norm": 0.005736709106713533, "learning_rate": 8.204296716659911e-05, "loss": 0.0006, "step": 11640 }, { "epoch": 2.361167409809485, "grad_norm": 0.04625217244029045, "learning_rate": 8.194162950952575e-05, "loss": 0.0028, "step": 11650 }, { "epoch": 2.3631941629509527, "grad_norm": 0.005438953638076782, "learning_rate": 8.184029185245238e-05, "loss": 0.0018, "step": 11660 }, { "epoch": 2.36522091609242, "grad_norm": 0.005321810021996498, "learning_rate": 8.1738954195379e-05, "loss": 0.001, "step": 11670 }, { "epoch": 2.3672476692338873, "grad_norm": 0.0051677110604941845, "learning_rate": 8.163761653830564e-05, "loss": 0.001, "step": 11680 }, { "epoch": 2.3692744223753546, "grad_norm": 0.03902814909815788, "learning_rate": 8.153627888123228e-05, "loss": 0.3135, "step": 11690 }, { "epoch": 2.3713011755168223, "grad_norm": 0.005199107341468334, "learning_rate": 8.14349412241589e-05, "loss": 0.0009, "step": 11700 }, { "epoch": 2.3713011755168223, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.1458211988210678, "eval_runtime": 213.7889, "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.323, "step": 11700 }, { "epoch": 2.3733279286582896, "grad_norm": 0.005133099388331175, "learning_rate": 8.133360356708554e-05, "loss": 0.0018, "step": 11710 }, { "epoch": 2.375354681799757, "grad_norm": 0.03676588460803032, "learning_rate": 8.123226591001216e-05, "loss": 0.0013, "step": 11720 }, { "epoch": 2.377381434941224, "grad_norm": 0.03504160791635513, "learning_rate": 8.113092825293879e-05, "loss": 0.002, "step": 11730 }, { "epoch": 2.3794081880826914, "grad_norm": 0.005366514436900616, "learning_rate": 8.102959059586543e-05, "loss": 0.7592, "step": 11740 }, { "epoch": 2.3814349412241587, "grad_norm": 0.03350529819726944, "learning_rate": 8.092825293879206e-05, "loss": 0.0017, "step": 11750 }, { "epoch": 2.3834616943656264, "grad_norm": 0.006684424821287394, "learning_rate": 8.082691528171869e-05, "loss": 0.5015, "step": 11760 }, { "epoch": 2.3854884475070937, "grad_norm": 0.006355280987918377, "learning_rate": 8.072557762464532e-05, "loss": 0.0642, "step": 11770 }, { "epoch": 2.387515200648561, "grad_norm": 0.006372395902872086, "learning_rate": 8.062423996757196e-05, "loss": 0.9822, "step": 11780 }, { "epoch": 2.389541953790028, "grad_norm": 0.006969653069972992, "learning_rate": 8.052290231049859e-05, "loss": 0.0021, "step": 11790 }, { "epoch": 2.391568706931496, "grad_norm": 0.006968590896576643, "learning_rate": 8.042156465342522e-05, "loss": 0.4178, "step": 11800 }, { "epoch": 2.391568706931496, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.1403357833623886, "eval_runtime": 44.9689, "eval_samples_per_second": 12.142, "eval_steps_per_second": 1.534, "step": 11800 }, { "epoch": 2.393595460072963, "grad_norm": 0.00706483842805028, "learning_rate": 8.032022699635184e-05, "loss": 0.0016, "step": 11810 }, { "epoch": 2.3956222132144305, "grad_norm": 0.09024324268102646, "learning_rate": 8.021888933927849e-05, "loss": 0.4716, "step": 11820 }, { "epoch": 2.3976489663558977, "grad_norm": 0.007200425956398249, "learning_rate": 8.011755168220512e-05, "loss": 0.0019, "step": 11830 }, { "epoch": 2.399675719497365, "grad_norm": 0.007590134162455797, "learning_rate": 8.001621402513174e-05, "loss": 0.2847, "step": 11840 }, { "epoch": 2.4017024726388327, "grad_norm": 0.007439048029482365, "learning_rate": 7.991487636805838e-05, "loss": 0.0008, "step": 11850 }, { "epoch": 2.4037292257803, "grad_norm": 0.1115642786026001, "learning_rate": 7.9813538710985e-05, "loss": 0.0023, "step": 11860 }, { "epoch": 2.4057559789217673, "grad_norm": 0.007191861514002085, "learning_rate": 7.971220105391164e-05, "loss": 0.0034, "step": 11870 }, { "epoch": 2.4077827320632346, "grad_norm": 0.006929817609488964, "learning_rate": 7.961086339683827e-05, "loss": 0.0021, "step": 11880 }, { "epoch": 2.4098094852047023, "grad_norm": 0.006752875633537769, "learning_rate": 7.95095257397649e-05, "loss": 0.0239, "step": 11890 }, { "epoch": 2.4118362383461696, "grad_norm": 0.006513989996165037, "learning_rate": 7.940818808269153e-05, "loss": 0.0028, "step": 11900 }, { "epoch": 2.4118362383461696, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.14057356119155884, "eval_runtime": 48.5905, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.42, "step": 11900 }, { "epoch": 2.413862991487637, "grad_norm": 0.006521328818053007, "learning_rate": 7.930685042561817e-05, "loss": 0.0024, "step": 11910 }, { "epoch": 2.415889744629104, "grad_norm": 0.006255956832319498, "learning_rate": 7.92055127685448e-05, "loss": 0.0007, "step": 11920 }, { "epoch": 2.4179164977705714, "grad_norm": 0.08518156409263611, "learning_rate": 7.910417511147142e-05, "loss": 0.4114, "step": 11930 }, { "epoch": 2.419943250912039, "grad_norm": 0.006490825209766626, "learning_rate": 7.900283745439806e-05, "loss": 0.0018, "step": 11940 }, { "epoch": 2.4219700040535064, "grad_norm": 0.09310487657785416, "learning_rate": 7.890149979732468e-05, "loss": 0.0018, "step": 11950 }, { "epoch": 2.4239967571949737, "grad_norm": 0.006273780949413776, "learning_rate": 7.880016214025132e-05, "loss": 0.2179, "step": 11960 }, { "epoch": 2.426023510336441, "grad_norm": 0.00689974520355463, "learning_rate": 7.869882448317796e-05, "loss": 0.7144, "step": 11970 }, { "epoch": 2.428050263477908, "grad_norm": 0.22196179628372192, "learning_rate": 7.859748682610458e-05, "loss": 0.0045, "step": 11980 }, { "epoch": 2.430077016619376, "grad_norm": 0.007610929664224386, "learning_rate": 7.849614916903121e-05, "loss": 0.0233, "step": 11990 }, { "epoch": 2.432103769760843, "grad_norm": 0.006953901145607233, "learning_rate": 7.839481151195785e-05, "loss": 0.0009, "step": 12000 }, { "epoch": 2.432103769760843, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.12951794266700745, "eval_runtime": 47.4956, "eval_samples_per_second": 11.496, "eval_steps_per_second": 1.453, "step": 12000 }, { "epoch": 2.4341305229023105, "grad_norm": 0.006798309274017811, "learning_rate": 7.829347385488448e-05, "loss": 0.0008, "step": 12010 }, { "epoch": 2.4361572760437777, "grad_norm": 0.006593131460249424, "learning_rate": 7.81921361978111e-05, "loss": 0.0009, "step": 12020 }, { "epoch": 2.438184029185245, "grad_norm": 0.008275761269032955, "learning_rate": 7.809079854073775e-05, "loss": 0.0022, "step": 12030 }, { "epoch": 2.4402107823267127, "grad_norm": 0.0106047997251153, "learning_rate": 7.798946088366438e-05, "loss": 0.0701, "step": 12040 }, { "epoch": 2.44223753546818, "grad_norm": 0.006123432423919439, "learning_rate": 7.7888123226591e-05, "loss": 0.7337, "step": 12050 }, { "epoch": 2.4442642886096473, "grad_norm": 0.01519776787608862, "learning_rate": 7.778678556951764e-05, "loss": 0.0023, "step": 12060 }, { "epoch": 2.4462910417511146, "grad_norm": 0.007234047167003155, "learning_rate": 7.768544791244426e-05, "loss": 0.0016, "step": 12070 }, { "epoch": 2.4483177948925823, "grad_norm": 0.00756498659029603, "learning_rate": 7.75841102553709e-05, "loss": 0.0022, "step": 12080 }, { "epoch": 2.4503445480340496, "grad_norm": 0.006812751293182373, "learning_rate": 7.748277259829753e-05, "loss": 0.0015, "step": 12090 }, { "epoch": 2.452371301175517, "grad_norm": 0.06002921611070633, "learning_rate": 7.738143494122416e-05, "loss": 0.002, "step": 12100 }, { "epoch": 2.452371301175517, "eval_accuracy": 0.967032967032967, "eval_loss": 0.16850095987319946, "eval_runtime": 48.7693, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.415, "step": 12100 }, { "epoch": 2.454398054316984, "grad_norm": 0.0065709855407476425, "learning_rate": 7.728009728415079e-05, "loss": 0.506, "step": 12110 }, { "epoch": 2.4564248074584514, "grad_norm": 0.06784232705831528, "learning_rate": 7.717875962707743e-05, "loss": 0.6716, "step": 12120 }, { "epoch": 2.458451560599919, "grad_norm": 0.05798913538455963, "learning_rate": 7.707742197000406e-05, "loss": 0.0025, "step": 12130 }, { "epoch": 2.4604783137413864, "grad_norm": 0.01616346277296543, "learning_rate": 7.697608431293069e-05, "loss": 0.0051, "step": 12140 }, { "epoch": 2.4625050668828536, "grad_norm": 0.01163223572075367, "learning_rate": 7.687474665585732e-05, "loss": 0.6955, "step": 12150 }, { "epoch": 2.464531820024321, "grad_norm": 0.1036323606967926, "learning_rate": 7.677340899878394e-05, "loss": 0.5214, "step": 12160 }, { "epoch": 2.4665585731657886, "grad_norm": 8.167608261108398, "learning_rate": 7.667207134171059e-05, "loss": 0.4512, "step": 12170 }, { "epoch": 2.468585326307256, "grad_norm": 0.02810136415064335, "learning_rate": 7.657073368463722e-05, "loss": 0.0084, "step": 12180 }, { "epoch": 2.470612079448723, "grad_norm": 0.015877937898039818, "learning_rate": 7.646939602756384e-05, "loss": 0.1126, "step": 12190 }, { "epoch": 2.4726388325901905, "grad_norm": 0.01663248986005783, "learning_rate": 7.636805837049049e-05, "loss": 0.0022, "step": 12200 }, { "epoch": 2.4726388325901905, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.11507423967123032, "eval_runtime": 50.3537, "eval_samples_per_second": 10.843, "eval_steps_per_second": 1.37, "step": 12200 }, { "epoch": 2.4746655857316577, "grad_norm": 0.011700361035764217, "learning_rate": 7.62667207134171e-05, "loss": 0.0029, "step": 12210 }, { "epoch": 2.476692338873125, "grad_norm": 0.011930271051824093, "learning_rate": 7.616538305634374e-05, "loss": 0.0015, "step": 12220 }, { "epoch": 2.4787190920145927, "grad_norm": 0.009713798761367798, "learning_rate": 7.606404539927037e-05, "loss": 0.002, "step": 12230 }, { "epoch": 2.48074584515606, "grad_norm": 0.05965464562177658, "learning_rate": 7.5962707742197e-05, "loss": 0.0048, "step": 12240 }, { "epoch": 2.4827725982975273, "grad_norm": 0.007966786623001099, "learning_rate": 7.586137008512364e-05, "loss": 0.0019, "step": 12250 }, { "epoch": 2.4847993514389946, "grad_norm": 0.055097125470638275, "learning_rate": 7.576003242805027e-05, "loss": 0.0015, "step": 12260 }, { "epoch": 2.4868261045804623, "grad_norm": 0.012019754387438297, "learning_rate": 7.56586947709769e-05, "loss": 0.0015, "step": 12270 }, { "epoch": 2.4888528577219295, "grad_norm": 0.051623955368995667, "learning_rate": 7.555735711390352e-05, "loss": 0.0032, "step": 12280 }, { "epoch": 2.490879610863397, "grad_norm": 0.0482669472694397, "learning_rate": 7.545601945683017e-05, "loss": 0.0018, "step": 12290 }, { "epoch": 2.492906364004864, "grad_norm": 0.0074604470282793045, "learning_rate": 7.535468179975679e-05, "loss": 0.0008, "step": 12300 }, { "epoch": 2.492906364004864, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.16353316605091095, "eval_runtime": 47.0606, "eval_samples_per_second": 11.602, "eval_steps_per_second": 1.466, "step": 12300 }, { "epoch": 2.4949331171463314, "grad_norm": 0.007159892935305834, "learning_rate": 7.525334414268342e-05, "loss": 0.724, "step": 12310 }, { "epoch": 2.496959870287799, "grad_norm": 0.007703261915594339, "learning_rate": 7.515200648561006e-05, "loss": 0.0439, "step": 12320 }, { "epoch": 2.4989866234292664, "grad_norm": 0.007789842318743467, "learning_rate": 7.505066882853668e-05, "loss": 0.0026, "step": 12330 }, { "epoch": 2.5010133765707336, "grad_norm": 0.008039126172661781, "learning_rate": 7.494933117146332e-05, "loss": 0.0014, "step": 12340 }, { "epoch": 2.503040129712201, "grad_norm": 0.014271967113018036, "learning_rate": 7.484799351438995e-05, "loss": 0.7042, "step": 12350 }, { "epoch": 2.5050668828536686, "grad_norm": 0.01031290553510189, "learning_rate": 7.474665585731658e-05, "loss": 0.0018, "step": 12360 }, { "epoch": 2.507093635995136, "grad_norm": 0.013794993981719017, "learning_rate": 7.464531820024321e-05, "loss": 0.6638, "step": 12370 }, { "epoch": 2.509120389136603, "grad_norm": 0.013239707797765732, "learning_rate": 7.454398054316985e-05, "loss": 0.0018, "step": 12380 }, { "epoch": 2.5111471422780705, "grad_norm": 0.04807324707508087, "learning_rate": 7.444264288609648e-05, "loss": 0.4963, "step": 12390 }, { "epoch": 2.5131738954195377, "grad_norm": 0.016311677172780037, "learning_rate": 7.434130522902311e-05, "loss": 0.0035, "step": 12400 }, { "epoch": 2.5131738954195377, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.12831450998783112, "eval_runtime": 50.0658, "eval_samples_per_second": 10.906, "eval_steps_per_second": 1.378, "step": 12400 }, { "epoch": 2.515200648561005, "grad_norm": 0.025094080716371536, "learning_rate": 7.423996757194975e-05, "loss": 0.4348, "step": 12410 }, { "epoch": 2.5172274017024727, "grad_norm": 0.013695972040295601, "learning_rate": 7.413862991487636e-05, "loss": 0.0023, "step": 12420 }, { "epoch": 2.51925415484394, "grad_norm": 0.011631296947598457, "learning_rate": 7.403729225780301e-05, "loss": 0.0025, "step": 12430 }, { "epoch": 2.5212809079854073, "grad_norm": 0.04988066852092743, "learning_rate": 7.393595460072964e-05, "loss": 0.0041, "step": 12440 }, { "epoch": 2.523307661126875, "grad_norm": 0.010438222438097, "learning_rate": 7.383461694365626e-05, "loss": 0.0022, "step": 12450 }, { "epoch": 2.5253344142683423, "grad_norm": 0.06062883883714676, "learning_rate": 7.37332792865829e-05, "loss": 0.0017, "step": 12460 }, { "epoch": 2.5273611674098095, "grad_norm": 0.011333596892654896, "learning_rate": 7.363194162950953e-05, "loss": 0.6328, "step": 12470 }, { "epoch": 2.529387920551277, "grad_norm": 0.012408534996211529, "learning_rate": 7.353060397243616e-05, "loss": 0.0014, "step": 12480 }, { "epoch": 2.531414673692744, "grad_norm": 0.013357922434806824, "learning_rate": 7.342926631536279e-05, "loss": 0.5256, "step": 12490 }, { "epoch": 2.5334414268342114, "grad_norm": 0.0201270692050457, "learning_rate": 7.332792865828943e-05, "loss": 0.7689, "step": 12500 }, { "epoch": 2.5334414268342114, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.15511265397071838, "eval_runtime": 52.8918, "eval_samples_per_second": 10.323, "eval_steps_per_second": 1.305, "step": 12500 }, { "epoch": 2.535468179975679, "grad_norm": 8.252181053161621, "learning_rate": 7.322659100121606e-05, "loss": 0.5807, "step": 12510 }, { "epoch": 2.5374949331171464, "grad_norm": 0.08722558617591858, "learning_rate": 7.312525334414269e-05, "loss": 0.0678, "step": 12520 }, { "epoch": 2.5395216862586136, "grad_norm": 0.024868663400411606, "learning_rate": 7.302391568706932e-05, "loss": 0.0043, "step": 12530 }, { "epoch": 2.541548439400081, "grad_norm": 0.07840926200151443, "learning_rate": 7.292257802999594e-05, "loss": 0.0038, "step": 12540 }, { "epoch": 2.5435751925415486, "grad_norm": 0.017033569514751434, "learning_rate": 7.282124037292259e-05, "loss": 0.0487, "step": 12550 }, { "epoch": 2.545601945683016, "grad_norm": 0.01854672282934189, "learning_rate": 7.271990271584921e-05, "loss": 0.0032, "step": 12560 }, { "epoch": 2.547628698824483, "grad_norm": 0.011858698911964893, "learning_rate": 7.261856505877584e-05, "loss": 0.2406, "step": 12570 }, { "epoch": 2.5496554519659504, "grad_norm": 0.016108671203255653, "learning_rate": 7.251722740170249e-05, "loss": 0.0027, "step": 12580 }, { "epoch": 2.5516822051074177, "grad_norm": 8.133279800415039, "learning_rate": 7.24158897446291e-05, "loss": 0.4846, "step": 12590 }, { "epoch": 2.5537089582488854, "grad_norm": 0.010792690329253674, "learning_rate": 7.231455208755574e-05, "loss": 0.0126, "step": 12600 }, { "epoch": 2.5537089582488854, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11439267545938492, "eval_runtime": 137.7708, "eval_samples_per_second": 3.963, "eval_steps_per_second": 0.501, "step": 12600 }, { "epoch": 2.5557357113903527, "grad_norm": 0.009957440197467804, "learning_rate": 7.221321443048237e-05, "loss": 0.0145, "step": 12610 }, { "epoch": 2.55776246453182, "grad_norm": 0.009772456251084805, "learning_rate": 7.2111876773409e-05, "loss": 0.4205, "step": 12620 }, { "epoch": 2.5597892176732873, "grad_norm": 0.011758706532418728, "learning_rate": 7.201053911633562e-05, "loss": 0.0012, "step": 12630 }, { "epoch": 2.561815970814755, "grad_norm": 0.00956796109676361, "learning_rate": 7.190920145926227e-05, "loss": 0.0021, "step": 12640 }, { "epoch": 2.5638427239562223, "grad_norm": 0.009638584218919277, "learning_rate": 7.18078638021889e-05, "loss": 0.0011, "step": 12650 }, { "epoch": 2.5658694770976895, "grad_norm": 0.009001140482723713, "learning_rate": 7.170652614511552e-05, "loss": 0.0036, "step": 12660 }, { "epoch": 2.567896230239157, "grad_norm": 0.009449226781725883, "learning_rate": 7.160518848804217e-05, "loss": 0.0062, "step": 12670 }, { "epoch": 2.569922983380624, "grad_norm": 0.008211339823901653, "learning_rate": 7.150385083096879e-05, "loss": 0.0026, "step": 12680 }, { "epoch": 2.5719497365220914, "grad_norm": 0.008610163815319538, "learning_rate": 7.140251317389542e-05, "loss": 0.0018, "step": 12690 }, { "epoch": 2.573976489663559, "grad_norm": 0.00778192188590765, "learning_rate": 7.130117551682205e-05, "loss": 0.0028, "step": 12700 }, { "epoch": 2.573976489663559, "eval_accuracy": 0.9835164835164835, "eval_loss": 0.09194227308034897, "eval_runtime": 198.1506, "eval_samples_per_second": 2.755, "eval_steps_per_second": 0.348, "step": 12700 }, { "epoch": 2.5760032428050263, "grad_norm": 0.055679794400930405, "learning_rate": 7.119983785974868e-05, "loss": 0.0034, "step": 12710 }, { "epoch": 2.5780299959464936, "grad_norm": 0.015648189932107925, "learning_rate": 7.109850020267532e-05, "loss": 1.0479, "step": 12720 }, { "epoch": 2.5800567490879613, "grad_norm": 0.012088973075151443, "learning_rate": 7.099716254560195e-05, "loss": 0.4879, "step": 12730 }, { "epoch": 2.5820835022294286, "grad_norm": 0.00957775954157114, "learning_rate": 7.089582488852858e-05, "loss": 0.0027, "step": 12740 }, { "epoch": 2.584110255370896, "grad_norm": 0.011500762775540352, "learning_rate": 7.079448723145521e-05, "loss": 0.2668, "step": 12750 }, { "epoch": 2.586137008512363, "grad_norm": 0.014193236827850342, "learning_rate": 7.069314957438185e-05, "loss": 0.0033, "step": 12760 }, { "epoch": 2.5881637616538304, "grad_norm": 0.010394740849733353, "learning_rate": 7.059181191730847e-05, "loss": 0.004, "step": 12770 }, { "epoch": 2.5901905147952977, "grad_norm": 0.013128154911100864, "learning_rate": 7.049047426023511e-05, "loss": 0.0064, "step": 12780 }, { "epoch": 2.5922172679367654, "grad_norm": 0.009156018495559692, "learning_rate": 7.038913660316175e-05, "loss": 0.4746, "step": 12790 }, { "epoch": 2.5942440210782327, "grad_norm": 1.7480326890945435, "learning_rate": 7.028779894608836e-05, "loss": 0.0053, "step": 12800 }, { "epoch": 2.5942440210782327, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11324365437030792, "eval_runtime": 165.9605, "eval_samples_per_second": 3.29, "eval_steps_per_second": 0.416, "step": 12800 }, { "epoch": 2.5962707742197, "grad_norm": 0.010218513198196888, "learning_rate": 7.018646128901501e-05, "loss": 0.005, "step": 12810 }, { "epoch": 2.5982975273611673, "grad_norm": 0.008802374824881554, "learning_rate": 7.008512363194163e-05, "loss": 0.0019, "step": 12820 }, { "epoch": 2.600324280502635, "grad_norm": 0.07323005795478821, "learning_rate": 6.998378597486826e-05, "loss": 0.002, "step": 12830 }, { "epoch": 2.6023510336441023, "grad_norm": 0.010269319638609886, "learning_rate": 6.98824483177949e-05, "loss": 0.0011, "step": 12840 }, { "epoch": 2.6043777867855695, "grad_norm": 0.08123511075973511, "learning_rate": 6.978111066072153e-05, "loss": 0.4715, "step": 12850 }, { "epoch": 2.606404539927037, "grad_norm": 0.010554922744631767, "learning_rate": 6.967977300364816e-05, "loss": 0.003, "step": 12860 }, { "epoch": 2.608431293068504, "grad_norm": 0.008621037937700748, "learning_rate": 6.957843534657479e-05, "loss": 0.0038, "step": 12870 }, { "epoch": 2.6104580462099714, "grad_norm": 0.07454310357570648, "learning_rate": 6.947709768950143e-05, "loss": 0.0046, "step": 12880 }, { "epoch": 2.612484799351439, "grad_norm": 0.0076058050617575645, "learning_rate": 6.937576003242804e-05, "loss": 0.0055, "step": 12890 }, { "epoch": 2.6145115524929063, "grad_norm": 0.008823697455227375, "learning_rate": 6.927442237535469e-05, "loss": 0.0018, "step": 12900 }, { "epoch": 2.6145115524929063, "eval_accuracy": 0.9853479853479854, "eval_loss": 0.08512182533740997, "eval_runtime": 249.2668, "eval_samples_per_second": 2.19, "eval_steps_per_second": 0.277, "step": 12900 }, { "epoch": 2.6165383056343736, "grad_norm": 0.00716985110193491, "learning_rate": 6.917308471828131e-05, "loss": 0.0009, "step": 12910 }, { "epoch": 2.6185650587758413, "grad_norm": 0.007007387932389975, "learning_rate": 6.907174706120794e-05, "loss": 0.0029, "step": 12920 }, { "epoch": 2.6205918119173086, "grad_norm": 0.05765289068222046, "learning_rate": 6.897040940413459e-05, "loss": 0.0021, "step": 12930 }, { "epoch": 2.622618565058776, "grad_norm": 0.007231696508824825, "learning_rate": 6.886907174706121e-05, "loss": 0.0016, "step": 12940 }, { "epoch": 2.624645318200243, "grad_norm": 0.007360757794231176, "learning_rate": 6.876773408998784e-05, "loss": 0.4605, "step": 12950 }, { "epoch": 2.6266720713417104, "grad_norm": 0.017173467203974724, "learning_rate": 6.866639643291447e-05, "loss": 0.0015, "step": 12960 }, { "epoch": 2.6286988244831777, "grad_norm": 0.11444433033466339, "learning_rate": 6.85650587758411e-05, "loss": 0.67, "step": 12970 }, { "epoch": 2.6307255776246454, "grad_norm": 0.011185628361999989, "learning_rate": 6.846372111876774e-05, "loss": 0.4956, "step": 12980 }, { "epoch": 2.6327523307661127, "grad_norm": 8.056800842285156, "learning_rate": 6.836238346169437e-05, "loss": 0.4779, "step": 12990 }, { "epoch": 2.63477908390758, "grad_norm": 0.008451711386442184, "learning_rate": 6.8261045804621e-05, "loss": 0.0014, "step": 13000 }, { "epoch": 2.63477908390758, "eval_accuracy": 0.978021978021978, "eval_loss": 0.10948704928159714, "eval_runtime": 371.0896, "eval_samples_per_second": 1.471, "eval_steps_per_second": 0.186, "step": 13000 }, { "epoch": 2.6368058370490477, "grad_norm": 0.013107103295624256, "learning_rate": 6.815970814754764e-05, "loss": 0.0042, "step": 13010 }, { "epoch": 2.638832590190515, "grad_norm": 0.008205901831388474, "learning_rate": 6.805837049047427e-05, "loss": 0.5378, "step": 13020 }, { "epoch": 2.6408593433319822, "grad_norm": 0.13924755156040192, "learning_rate": 6.795703283340089e-05, "loss": 0.0067, "step": 13030 }, { "epoch": 2.6428860964734495, "grad_norm": 0.07655428349971771, "learning_rate": 6.785569517632752e-05, "loss": 0.0051, "step": 13040 }, { "epoch": 2.644912849614917, "grad_norm": 0.011031039990484715, "learning_rate": 6.775435751925417e-05, "loss": 0.002, "step": 13050 }, { "epoch": 2.646939602756384, "grad_norm": 0.007966967299580574, "learning_rate": 6.765301986218079e-05, "loss": 0.0025, "step": 13060 }, { "epoch": 2.648966355897852, "grad_norm": 0.062473565340042114, "learning_rate": 6.755168220510742e-05, "loss": 0.0038, "step": 13070 }, { "epoch": 2.650993109039319, "grad_norm": 0.006963303312659264, "learning_rate": 6.745034454803405e-05, "loss": 0.0017, "step": 13080 }, { "epoch": 2.6530198621807863, "grad_norm": 0.009901291690766811, "learning_rate": 6.734900689096068e-05, "loss": 0.003, "step": 13090 }, { "epoch": 2.6550466153222536, "grad_norm": 0.05340264365077019, "learning_rate": 6.724766923388732e-05, "loss": 0.0017, "step": 13100 }, { "epoch": 2.6550466153222536, "eval_accuracy": 0.9816849816849816, "eval_loss": 0.08778306096792221, "eval_runtime": 355.0894, "eval_samples_per_second": 1.538, "eval_steps_per_second": 0.194, "step": 13100 }, { "epoch": 2.6570733684637213, "grad_norm": 0.006709767039865255, "learning_rate": 6.714633157681395e-05, "loss": 0.0019, "step": 13110 }, { "epoch": 2.6591001216051886, "grad_norm": 0.008395565673708916, "learning_rate": 6.704499391974058e-05, "loss": 0.0057, "step": 13120 }, { "epoch": 2.661126874746656, "grad_norm": 0.00617511011660099, "learning_rate": 6.694365626266721e-05, "loss": 0.0012, "step": 13130 }, { "epoch": 2.663153627888123, "grad_norm": 0.00602354621514678, "learning_rate": 6.684231860559385e-05, "loss": 0.0043, "step": 13140 }, { "epoch": 2.6651803810295904, "grad_norm": 0.007556139025837183, "learning_rate": 6.674098094852047e-05, "loss": 0.0013, "step": 13150 }, { "epoch": 2.6672071341710577, "grad_norm": 0.007209232077002525, "learning_rate": 6.663964329144711e-05, "loss": 0.0022, "step": 13160 }, { "epoch": 2.6692338873125254, "grad_norm": 0.005784170236438513, "learning_rate": 6.653830563437373e-05, "loss": 0.0016, "step": 13170 }, { "epoch": 2.6712606404539927, "grad_norm": 0.005957774352282286, "learning_rate": 6.643696797730036e-05, "loss": 0.5298, "step": 13180 }, { "epoch": 2.67328739359546, "grad_norm": 0.005831047426909208, "learning_rate": 6.633563032022701e-05, "loss": 0.0017, "step": 13190 }, { "epoch": 2.6753141467369277, "grad_norm": 0.005900830961763859, "learning_rate": 6.623429266315363e-05, "loss": 0.0014, "step": 13200 }, { "epoch": 2.6753141467369277, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.13224680721759796, "eval_runtime": 340.8015, "eval_samples_per_second": 1.602, "eval_steps_per_second": 0.202, "step": 13200 }, { "epoch": 2.677340899878395, "grad_norm": 0.006084654945880175, "learning_rate": 6.613295500608026e-05, "loss": 1.0179, "step": 13210 }, { "epoch": 2.6793676530198622, "grad_norm": 0.009283014573156834, "learning_rate": 6.60316173490069e-05, "loss": 0.0455, "step": 13220 }, { "epoch": 2.6813944061613295, "grad_norm": 0.008484777063131332, "learning_rate": 6.593027969193353e-05, "loss": 0.4282, "step": 13230 }, { "epoch": 2.683421159302797, "grad_norm": 0.007272159215062857, "learning_rate": 6.582894203486015e-05, "loss": 0.0012, "step": 13240 }, { "epoch": 2.685447912444264, "grad_norm": 0.009423608891665936, "learning_rate": 6.572760437778679e-05, "loss": 0.0036, "step": 13250 }, { "epoch": 2.687474665585732, "grad_norm": 0.022705376148223877, "learning_rate": 6.562626672071343e-05, "loss": 0.0117, "step": 13260 }, { "epoch": 2.689501418727199, "grad_norm": 0.006629382260143757, "learning_rate": 6.552492906364004e-05, "loss": 0.0037, "step": 13270 }, { "epoch": 2.6915281718686663, "grad_norm": 0.007442939560860395, "learning_rate": 6.542359140656669e-05, "loss": 0.0023, "step": 13280 }, { "epoch": 2.6935549250101336, "grad_norm": 0.007592672016471624, "learning_rate": 6.532225374949331e-05, "loss": 0.1412, "step": 13290 }, { "epoch": 2.6955816781516013, "grad_norm": 0.006130645051598549, "learning_rate": 6.522091609241994e-05, "loss": 0.0015, "step": 13300 }, { "epoch": 2.6955816781516013, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10588613152503967, "eval_runtime": 458.6185, "eval_samples_per_second": 1.191, "eval_steps_per_second": 0.15, "step": 13300 }, { "epoch": 2.6976084312930686, "grad_norm": 0.007999964989721775, "learning_rate": 6.511957843534658e-05, "loss": 0.0015, "step": 13310 }, { "epoch": 2.699635184434536, "grad_norm": 0.007326509803533554, "learning_rate": 6.501824077827321e-05, "loss": 0.4529, "step": 13320 }, { "epoch": 2.701661937576003, "grad_norm": 0.005822496023029089, "learning_rate": 6.491690312119984e-05, "loss": 0.0014, "step": 13330 }, { "epoch": 2.7036886907174704, "grad_norm": 8.135007858276367, "learning_rate": 6.481556546412647e-05, "loss": 0.5238, "step": 13340 }, { "epoch": 2.705715443858938, "grad_norm": 0.06068016588687897, "learning_rate": 6.47142278070531e-05, "loss": 0.0021, "step": 13350 }, { "epoch": 2.7077421970004054, "grad_norm": 0.06504343450069427, "learning_rate": 6.461289014997974e-05, "loss": 0.0025, "step": 13360 }, { "epoch": 2.7097689501418727, "grad_norm": 0.005796231795102358, "learning_rate": 6.451155249290637e-05, "loss": 0.0015, "step": 13370 }, { "epoch": 2.71179570328334, "grad_norm": 17.916643142700195, "learning_rate": 6.441021483583299e-05, "loss": 0.5551, "step": 13380 }, { "epoch": 2.7138224564248077, "grad_norm": 0.006149032153189182, "learning_rate": 6.430887717875964e-05, "loss": 0.4887, "step": 13390 }, { "epoch": 2.715849209566275, "grad_norm": 0.019423088058829308, "learning_rate": 6.420753952168627e-05, "loss": 0.0036, "step": 13400 }, { "epoch": 2.715849209566275, "eval_accuracy": 0.9816849816849816, "eval_loss": 0.09267744421958923, "eval_runtime": 347.4343, "eval_samples_per_second": 1.572, "eval_steps_per_second": 0.199, "step": 13400 }, { "epoch": 2.7178759627077422, "grad_norm": 0.006651645991951227, "learning_rate": 6.410620186461289e-05, "loss": 0.0018, "step": 13410 }, { "epoch": 2.7199027158492095, "grad_norm": 0.006167796906083822, "learning_rate": 6.400486420753953e-05, "loss": 0.0033, "step": 13420 }, { "epoch": 2.721929468990677, "grad_norm": 0.009791787713766098, "learning_rate": 6.390352655046615e-05, "loss": 0.0017, "step": 13430 }, { "epoch": 2.723956222132144, "grad_norm": 0.009656204842031002, "learning_rate": 6.380218889339279e-05, "loss": 0.7369, "step": 13440 }, { "epoch": 2.7259829752736118, "grad_norm": 0.010334176942706108, "learning_rate": 6.370085123631942e-05, "loss": 0.5731, "step": 13450 }, { "epoch": 2.728009728415079, "grad_norm": 0.007303066086024046, "learning_rate": 6.359951357924605e-05, "loss": 0.0024, "step": 13460 }, { "epoch": 2.7300364815565463, "grad_norm": 0.007056308910250664, "learning_rate": 6.349817592217268e-05, "loss": 0.0022, "step": 13470 }, { "epoch": 2.732063234698014, "grad_norm": 0.00947339553385973, "learning_rate": 6.339683826509932e-05, "loss": 0.0035, "step": 13480 }, { "epoch": 2.7340899878394813, "grad_norm": 0.011178678832948208, "learning_rate": 6.329550060802595e-05, "loss": 0.0029, "step": 13490 }, { "epoch": 2.7361167409809486, "grad_norm": 0.006489630322903395, "learning_rate": 6.319416295095257e-05, "loss": 0.0051, "step": 13500 }, { "epoch": 2.7361167409809486, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10085071623325348, "eval_runtime": 58.9004, "eval_samples_per_second": 9.27, "eval_steps_per_second": 1.171, "step": 13500 }, { "epoch": 2.738143494122416, "grad_norm": 0.055221959948539734, "learning_rate": 6.309282529387921e-05, "loss": 0.0041, "step": 13510 }, { "epoch": 2.740170247263883, "grad_norm": 0.006161363795399666, "learning_rate": 6.299148763680583e-05, "loss": 0.002, "step": 13520 }, { "epoch": 2.7421970004053504, "grad_norm": 0.006010277196764946, "learning_rate": 6.289014997973247e-05, "loss": 0.0019, "step": 13530 }, { "epoch": 2.744223753546818, "grad_norm": 0.007737209089100361, "learning_rate": 6.278881232265911e-05, "loss": 0.0026, "step": 13540 }, { "epoch": 2.7462505066882854, "grad_norm": 0.04785401001572609, "learning_rate": 6.268747466558573e-05, "loss": 0.0029, "step": 13550 }, { "epoch": 2.7482772598297527, "grad_norm": 0.0055003599263727665, "learning_rate": 6.258613700851236e-05, "loss": 0.002, "step": 13560 }, { "epoch": 2.75030401297122, "grad_norm": 0.044067516922950745, "learning_rate": 6.2484799351439e-05, "loss": 0.0024, "step": 13570 }, { "epoch": 2.7523307661126877, "grad_norm": 0.005313580390065908, "learning_rate": 6.238346169436563e-05, "loss": 0.4124, "step": 13580 }, { "epoch": 2.754357519254155, "grad_norm": 0.005581700708717108, "learning_rate": 6.228212403729226e-05, "loss": 0.4932, "step": 13590 }, { "epoch": 2.7563842723956222, "grad_norm": 0.005542232189327478, "learning_rate": 6.21807863802189e-05, "loss": 0.0028, "step": 13600 }, { "epoch": 2.7563842723956222, "eval_accuracy": 0.967032967032967, "eval_loss": 0.16798269748687744, "eval_runtime": 53.1296, "eval_samples_per_second": 10.277, "eval_steps_per_second": 1.299, "step": 13600 }, { "epoch": 2.7584110255370895, "grad_norm": 0.046516843140125275, "learning_rate": 6.207944872314553e-05, "loss": 0.0017, "step": 13610 }, { "epoch": 2.7604377786785568, "grad_norm": 0.005435958504676819, "learning_rate": 6.197811106607215e-05, "loss": 0.3577, "step": 13620 }, { "epoch": 2.762464531820024, "grad_norm": 0.006663784850388765, "learning_rate": 6.187677340899879e-05, "loss": 0.0013, "step": 13630 }, { "epoch": 2.7644912849614918, "grad_norm": 0.005261593498289585, "learning_rate": 6.177543575192541e-05, "loss": 0.0023, "step": 13640 }, { "epoch": 2.766518038102959, "grad_norm": 0.04430309310555458, "learning_rate": 6.167409809485204e-05, "loss": 0.0013, "step": 13650 }, { "epoch": 2.7685447912444263, "grad_norm": 0.015064161270856857, "learning_rate": 6.157276043777868e-05, "loss": 0.0052, "step": 13660 }, { "epoch": 2.770571544385894, "grad_norm": 0.006480317562818527, "learning_rate": 6.147142278070531e-05, "loss": 0.7568, "step": 13670 }, { "epoch": 2.7725982975273613, "grad_norm": 0.007233656011521816, "learning_rate": 6.137008512363194e-05, "loss": 0.001, "step": 13680 }, { "epoch": 2.7746250506688286, "grad_norm": 0.005915817804634571, "learning_rate": 6.126874746655858e-05, "loss": 0.4029, "step": 13690 }, { "epoch": 2.776651803810296, "grad_norm": 0.006141826510429382, "learning_rate": 6.116740980948521e-05, "loss": 0.6951, "step": 13700 }, { "epoch": 2.776651803810296, "eval_accuracy": 0.9487179487179487, "eval_loss": 0.24969354271888733, "eval_runtime": 53.1917, "eval_samples_per_second": 10.265, "eval_steps_per_second": 1.297, "step": 13700 }, { "epoch": 2.778678556951763, "grad_norm": 0.006940559484064579, "learning_rate": 6.106607215241184e-05, "loss": 0.0013, "step": 13710 }, { "epoch": 2.7807053100932304, "grad_norm": 0.007143535651266575, "learning_rate": 6.096473449533847e-05, "loss": 0.0016, "step": 13720 }, { "epoch": 2.782732063234698, "grad_norm": 0.007185479626059532, "learning_rate": 6.0863396838265106e-05, "loss": 0.487, "step": 13730 }, { "epoch": 2.7847588163761654, "grad_norm": 0.5434855818748474, "learning_rate": 6.076205918119173e-05, "loss": 0.0092, "step": 13740 }, { "epoch": 2.7867855695176327, "grad_norm": 0.007091503124684095, "learning_rate": 6.066072152411837e-05, "loss": 0.002, "step": 13750 }, { "epoch": 2.7888123226591004, "grad_norm": 0.04187440127134323, "learning_rate": 6.0559383867045e-05, "loss": 0.4058, "step": 13760 }, { "epoch": 2.7908390758005677, "grad_norm": 0.009630071930587292, "learning_rate": 6.045804620997163e-05, "loss": 0.0018, "step": 13770 }, { "epoch": 2.792865828942035, "grad_norm": 0.2932543456554413, "learning_rate": 6.0356708552898255e-05, "loss": 0.0048, "step": 13780 }, { "epoch": 2.794892582083502, "grad_norm": 0.00625465577468276, "learning_rate": 6.0255370895824895e-05, "loss": 0.0029, "step": 13790 }, { "epoch": 2.7969193352249695, "grad_norm": 0.22124755382537842, "learning_rate": 6.015403323875153e-05, "loss": 0.0096, "step": 13800 }, { "epoch": 2.7969193352249695, "eval_accuracy": 0.978021978021978, "eval_loss": 0.1137545183300972, "eval_runtime": 50.9606, "eval_samples_per_second": 10.714, "eval_steps_per_second": 1.354, "step": 13800 }, { "epoch": 2.7989460883664368, "grad_norm": 0.009009423665702343, "learning_rate": 6.005269558167815e-05, "loss": 0.531, "step": 13810 }, { "epoch": 2.8009728415079045, "grad_norm": 0.009486384689807892, "learning_rate": 5.995135792460479e-05, "loss": 0.4189, "step": 13820 }, { "epoch": 2.8029995946493718, "grad_norm": 0.015886301174759865, "learning_rate": 5.985002026753141e-05, "loss": 0.3436, "step": 13830 }, { "epoch": 2.805026347790839, "grad_norm": 0.006443299353122711, "learning_rate": 5.974868261045805e-05, "loss": 0.0014, "step": 13840 }, { "epoch": 2.8070531009323063, "grad_norm": 0.04074406623840332, "learning_rate": 5.964734495338468e-05, "loss": 0.0028, "step": 13850 }, { "epoch": 2.809079854073774, "grad_norm": 0.011449450626969337, "learning_rate": 5.954600729631131e-05, "loss": 0.0035, "step": 13860 }, { "epoch": 2.8111066072152413, "grad_norm": 0.006783784367144108, "learning_rate": 5.944466963923795e-05, "loss": 0.5468, "step": 13870 }, { "epoch": 2.8131333603567086, "grad_norm": 0.02318020910024643, "learning_rate": 5.9343331982164575e-05, "loss": 0.2205, "step": 13880 }, { "epoch": 2.815160113498176, "grad_norm": 0.006445059087127447, "learning_rate": 5.924199432509121e-05, "loss": 0.0045, "step": 13890 }, { "epoch": 2.817186866639643, "grad_norm": 0.006976704113185406, "learning_rate": 5.9140656668017834e-05, "loss": 0.5063, "step": 13900 }, { "epoch": 2.817186866639643, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.1150827407836914, "eval_runtime": 54.029, "eval_samples_per_second": 10.106, "eval_steps_per_second": 1.277, "step": 13900 }, { "epoch": 2.8192136197811104, "grad_norm": 0.09443381428718567, "learning_rate": 5.903931901094447e-05, "loss": 0.0142, "step": 13910 }, { "epoch": 2.821240372922578, "grad_norm": 0.013332713395357132, "learning_rate": 5.89379813538711e-05, "loss": 0.1031, "step": 13920 }, { "epoch": 2.8232671260640454, "grad_norm": 0.006388837471604347, "learning_rate": 5.883664369679773e-05, "loss": 0.0024, "step": 13930 }, { "epoch": 2.8252938792055127, "grad_norm": 0.006204861216247082, "learning_rate": 5.873530603972437e-05, "loss": 0.0024, "step": 13940 }, { "epoch": 2.8273206323469804, "grad_norm": 0.007994485087692738, "learning_rate": 5.8633968382651e-05, "loss": 0.0014, "step": 13950 }, { "epoch": 2.8293473854884477, "grad_norm": 0.0065392907708883286, "learning_rate": 5.853263072557763e-05, "loss": 0.7336, "step": 13960 }, { "epoch": 2.831374138629915, "grad_norm": 0.009553118608891964, "learning_rate": 5.8431293068504255e-05, "loss": 0.0073, "step": 13970 }, { "epoch": 2.833400891771382, "grad_norm": 0.06365495920181274, "learning_rate": 5.8329955411430895e-05, "loss": 0.0045, "step": 13980 }, { "epoch": 2.8354276449128495, "grad_norm": 0.006651031784713268, "learning_rate": 5.8228617754357514e-05, "loss": 0.0029, "step": 13990 }, { "epoch": 2.8374543980543168, "grad_norm": 0.0455886535346508, "learning_rate": 5.812728009728415e-05, "loss": 0.0026, "step": 14000 }, { "epoch": 2.8374543980543168, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11792446672916412, "eval_runtime": 53.4448, "eval_samples_per_second": 10.216, "eval_steps_per_second": 1.291, "step": 14000 }, { "epoch": 2.8394811511957845, "grad_norm": 0.006337382830679417, "learning_rate": 5.802594244021079e-05, "loss": 0.0024, "step": 14010 }, { "epoch": 2.8415079043372518, "grad_norm": 0.006261122412979603, "learning_rate": 5.792460478313741e-05, "loss": 0.0025, "step": 14020 }, { "epoch": 2.843534657478719, "grad_norm": 0.00638343533501029, "learning_rate": 5.782326712606405e-05, "loss": 0.5445, "step": 14030 }, { "epoch": 2.8455614106201863, "grad_norm": 9.283822059631348, "learning_rate": 5.772192946899068e-05, "loss": 1.2487, "step": 14040 }, { "epoch": 2.847588163761654, "grad_norm": 0.09864260256290436, "learning_rate": 5.762059181191731e-05, "loss": 0.0025, "step": 14050 }, { "epoch": 2.8496149169031213, "grad_norm": 0.06132243201136589, "learning_rate": 5.7519254154843936e-05, "loss": 0.8716, "step": 14060 }, { "epoch": 2.8516416700445886, "grad_norm": 0.009238564409315586, "learning_rate": 5.7417916497770575e-05, "loss": 0.0028, "step": 14070 }, { "epoch": 2.853668423186056, "grad_norm": 0.008571449667215347, "learning_rate": 5.731657884069721e-05, "loss": 0.4832, "step": 14080 }, { "epoch": 2.855695176327523, "grad_norm": 0.00848014559596777, "learning_rate": 5.7215241183623834e-05, "loss": 0.0027, "step": 14090 }, { "epoch": 2.857721929468991, "grad_norm": 0.008391969837248325, "learning_rate": 5.711390352655047e-05, "loss": 0.0041, "step": 14100 }, { "epoch": 2.857721929468991, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.12661290168762207, "eval_runtime": 50.5324, "eval_samples_per_second": 10.805, "eval_steps_per_second": 1.365, "step": 14100 }, { "epoch": 2.859748682610458, "grad_norm": 0.011006497777998447, "learning_rate": 5.70125658694771e-05, "loss": 0.0089, "step": 14110 }, { "epoch": 2.8617754357519254, "grad_norm": 0.065226711332798, "learning_rate": 5.691122821240373e-05, "loss": 1.1768, "step": 14120 }, { "epoch": 2.8638021888933927, "grad_norm": 0.009555099532008171, "learning_rate": 5.680989055533036e-05, "loss": 0.0033, "step": 14130 }, { "epoch": 2.8658289420348604, "grad_norm": 0.01057603769004345, "learning_rate": 5.6708552898257e-05, "loss": 0.4643, "step": 14140 }, { "epoch": 2.8678556951763277, "grad_norm": 0.2201487123966217, "learning_rate": 5.660721524118363e-05, "loss": 0.342, "step": 14150 }, { "epoch": 2.869882448317795, "grad_norm": 0.012544561177492142, "learning_rate": 5.6505877584110255e-05, "loss": 0.0799, "step": 14160 }, { "epoch": 2.871909201459262, "grad_norm": 0.009598630480468273, "learning_rate": 5.6404539927036895e-05, "loss": 0.0052, "step": 14170 }, { "epoch": 2.8739359546007295, "grad_norm": 0.019055837765336037, "learning_rate": 5.630320226996352e-05, "loss": 0.0024, "step": 14180 }, { "epoch": 2.8759627077421968, "grad_norm": 0.009083470329642296, "learning_rate": 5.620186461289015e-05, "loss": 0.0024, "step": 14190 }, { "epoch": 2.8779894608836645, "grad_norm": 0.00886489450931549, "learning_rate": 5.610052695581678e-05, "loss": 0.0019, "step": 14200 }, { "epoch": 2.8779894608836645, "eval_accuracy": 0.978021978021978, "eval_loss": 0.09981222450733185, "eval_runtime": 50.1764, "eval_samples_per_second": 10.882, "eval_steps_per_second": 1.375, "step": 14200 }, { "epoch": 2.8800162140251317, "grad_norm": 0.008524172939360142, "learning_rate": 5.599918929874341e-05, "loss": 0.0062, "step": 14210 }, { "epoch": 2.882042967166599, "grad_norm": 0.01295129768550396, "learning_rate": 5.589785164167005e-05, "loss": 0.0023, "step": 14220 }, { "epoch": 2.8840697203080667, "grad_norm": 0.008135986514389515, "learning_rate": 5.579651398459668e-05, "loss": 0.0021, "step": 14230 }, { "epoch": 2.886096473449534, "grad_norm": 0.007830134592950344, "learning_rate": 5.569517632752331e-05, "loss": 0.0028, "step": 14240 }, { "epoch": 2.8881232265910013, "grad_norm": 0.018854841589927673, "learning_rate": 5.5593838670449936e-05, "loss": 0.0023, "step": 14250 }, { "epoch": 2.8901499797324686, "grad_norm": 0.009230035357177258, "learning_rate": 5.5492501013376575e-05, "loss": 0.445, "step": 14260 }, { "epoch": 2.892176732873936, "grad_norm": 0.007842128165066242, "learning_rate": 5.53911633563032e-05, "loss": 0.4464, "step": 14270 }, { "epoch": 2.894203486015403, "grad_norm": 0.7170781493186951, "learning_rate": 5.5289825699229834e-05, "loss": 0.0045, "step": 14280 }, { "epoch": 2.896230239156871, "grad_norm": 38.8918571472168, "learning_rate": 5.518848804215647e-05, "loss": 0.3268, "step": 14290 }, { "epoch": 2.898256992298338, "grad_norm": 0.007682281080633402, "learning_rate": 5.50871503850831e-05, "loss": 0.0038, "step": 14300 }, { "epoch": 2.898256992298338, "eval_accuracy": 0.9652014652014652, "eval_loss": 0.12904779613018036, "eval_runtime": 50.6457, "eval_samples_per_second": 10.781, "eval_steps_per_second": 1.362, "step": 14300 }, { "epoch": 2.9002837454398054, "grad_norm": 0.007591232657432556, "learning_rate": 5.498581272800973e-05, "loss": 0.4361, "step": 14310 }, { "epoch": 2.9023104985812727, "grad_norm": 0.007921284064650536, "learning_rate": 5.488447507093636e-05, "loss": 0.0033, "step": 14320 }, { "epoch": 2.9043372517227404, "grad_norm": 0.008791577070951462, "learning_rate": 5.4783137413863e-05, "loss": 0.4804, "step": 14330 }, { "epoch": 2.9063640048642077, "grad_norm": 0.008087759837508202, "learning_rate": 5.468179975678963e-05, "loss": 0.2424, "step": 14340 }, { "epoch": 2.908390758005675, "grad_norm": 0.007720464374870062, "learning_rate": 5.4580462099716255e-05, "loss": 0.0122, "step": 14350 }, { "epoch": 2.910417511147142, "grad_norm": 0.05355573073029518, "learning_rate": 5.4479124442642895e-05, "loss": 0.0032, "step": 14360 }, { "epoch": 2.9124442642886095, "grad_norm": 0.007251413073390722, "learning_rate": 5.437778678556952e-05, "loss": 0.0034, "step": 14370 }, { "epoch": 2.9144710174300767, "grad_norm": 0.0070653860457241535, "learning_rate": 5.427644912849615e-05, "loss": 0.0156, "step": 14380 }, { "epoch": 2.9164977705715445, "grad_norm": 5.426998138427734, "learning_rate": 5.417511147142278e-05, "loss": 0.5408, "step": 14390 }, { "epoch": 2.9185245237130117, "grad_norm": 0.11581844836473465, "learning_rate": 5.407377381434942e-05, "loss": 0.0131, "step": 14400 }, { "epoch": 2.9185245237130117, "eval_accuracy": 0.9413919413919414, "eval_loss": 0.19978734850883484, "eval_runtime": 49.0837, "eval_samples_per_second": 11.124, "eval_steps_per_second": 1.406, "step": 14400 }, { "epoch": 2.920551276854479, "grad_norm": 0.08270927518606186, "learning_rate": 5.397243615727605e-05, "loss": 0.4666, "step": 14410 }, { "epoch": 2.9225780299959467, "grad_norm": 0.008222842589020729, "learning_rate": 5.387109850020268e-05, "loss": 0.4219, "step": 14420 }, { "epoch": 2.924604783137414, "grad_norm": 0.007847205735743046, "learning_rate": 5.376976084312931e-05, "loss": 0.0029, "step": 14430 }, { "epoch": 2.9266315362788813, "grad_norm": 0.007569338195025921, "learning_rate": 5.3668423186055936e-05, "loss": 0.0156, "step": 14440 }, { "epoch": 2.9286582894203486, "grad_norm": 0.045873142778873444, "learning_rate": 5.3567085528982575e-05, "loss": 0.0032, "step": 14450 }, { "epoch": 2.930685042561816, "grad_norm": 1.2365529537200928, "learning_rate": 5.34657478719092e-05, "loss": 0.0261, "step": 14460 }, { "epoch": 2.932711795703283, "grad_norm": 0.007316601928323507, "learning_rate": 5.3364410214835834e-05, "loss": 0.4418, "step": 14470 }, { "epoch": 2.934738548844751, "grad_norm": 0.007185041438788176, "learning_rate": 5.326307255776247e-05, "loss": 0.0022, "step": 14480 }, { "epoch": 2.936765301986218, "grad_norm": 0.007051927037537098, "learning_rate": 5.31617349006891e-05, "loss": 0.2142, "step": 14490 }, { "epoch": 2.9387920551276854, "grad_norm": 0.007010738831013441, "learning_rate": 5.306039724361573e-05, "loss": 0.0037, "step": 14500 }, { "epoch": 2.9387920551276854, "eval_accuracy": 0.9633699633699634, "eval_loss": 0.1214241087436676, "eval_runtime": 50.1191, "eval_samples_per_second": 10.894, "eval_steps_per_second": 1.377, "step": 14500 }, { "epoch": 2.9408188082691527, "grad_norm": 0.006798572838306427, "learning_rate": 5.295905958654236e-05, "loss": 0.1482, "step": 14510 }, { "epoch": 2.9428455614106204, "grad_norm": 0.08644963055849075, "learning_rate": 5.2857721929469e-05, "loss": 0.002, "step": 14520 }, { "epoch": 2.9448723145520876, "grad_norm": 0.006649952381849289, "learning_rate": 5.275638427239562e-05, "loss": 0.0037, "step": 14530 }, { "epoch": 2.946899067693555, "grad_norm": 1.2725645303726196, "learning_rate": 5.2655046615322255e-05, "loss": 0.0225, "step": 14540 }, { "epoch": 2.948925820835022, "grad_norm": 0.07782497256994247, "learning_rate": 5.2553708958248895e-05, "loss": 0.2055, "step": 14550 }, { "epoch": 2.9509525739764895, "grad_norm": 0.006569635588675737, "learning_rate": 5.245237130117552e-05, "loss": 0.4006, "step": 14560 }, { "epoch": 2.952979327117957, "grad_norm": 0.00675079133361578, "learning_rate": 5.2351033644102153e-05, "loss": 0.0029, "step": 14570 }, { "epoch": 2.9550060802594245, "grad_norm": 0.006548064760863781, "learning_rate": 5.224969598702878e-05, "loss": 0.0007, "step": 14580 }, { "epoch": 2.9570328334008917, "grad_norm": 0.006818473804742098, "learning_rate": 5.214835832995542e-05, "loss": 0.0018, "step": 14590 }, { "epoch": 2.959059586542359, "grad_norm": 0.0069591510109603405, "learning_rate": 5.204702067288204e-05, "loss": 0.2382, "step": 14600 }, { "epoch": 2.959059586542359, "eval_accuracy": 0.978021978021978, "eval_loss": 0.10965513437986374, "eval_runtime": 50.308, "eval_samples_per_second": 10.853, "eval_steps_per_second": 1.372, "step": 14600 }, { "epoch": 2.9610863396838267, "grad_norm": 0.00643087737262249, "learning_rate": 5.194568301580868e-05, "loss": 0.0027, "step": 14610 }, { "epoch": 2.963113092825294, "grad_norm": 0.006223716307431459, "learning_rate": 5.184434535873532e-05, "loss": 0.0055, "step": 14620 }, { "epoch": 2.9651398459667613, "grad_norm": 0.005989754106849432, "learning_rate": 5.1743007701661936e-05, "loss": 0.0033, "step": 14630 }, { "epoch": 2.9671665991082286, "grad_norm": 0.00613937946036458, "learning_rate": 5.1641670044588575e-05, "loss": 0.0039, "step": 14640 }, { "epoch": 2.969193352249696, "grad_norm": 0.006323641631752253, "learning_rate": 5.15403323875152e-05, "loss": 0.0007, "step": 14650 }, { "epoch": 2.971220105391163, "grad_norm": 0.005707310978323221, "learning_rate": 5.1438994730441834e-05, "loss": 0.0006, "step": 14660 }, { "epoch": 2.973246858532631, "grad_norm": 0.06553935259580612, "learning_rate": 5.133765707336846e-05, "loss": 0.0021, "step": 14670 }, { "epoch": 2.975273611674098, "grad_norm": 0.06091271713376045, "learning_rate": 5.12363194162951e-05, "loss": 0.0032, "step": 14680 }, { "epoch": 2.9773003648155654, "grad_norm": 0.0053886366076767445, "learning_rate": 5.113498175922173e-05, "loss": 0.0006, "step": 14690 }, { "epoch": 2.979327117957033, "grad_norm": 0.005328685510903597, "learning_rate": 5.103364410214836e-05, "loss": 0.0021, "step": 14700 }, { "epoch": 2.979327117957033, "eval_accuracy": 0.978021978021978, "eval_loss": 0.11521938443183899, "eval_runtime": 50.3755, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.37, "step": 14700 }, { "epoch": 2.9813538710985004, "grad_norm": 2.63281512260437, "learning_rate": 5.0932306445075e-05, "loss": 0.5353, "step": 14710 }, { "epoch": 2.9833806242399676, "grad_norm": 0.006335753481835127, "learning_rate": 5.083096878800162e-05, "loss": 0.0014, "step": 14720 }, { "epoch": 2.985407377381435, "grad_norm": 0.005788266658782959, "learning_rate": 5.0729631130928255e-05, "loss": 0.4826, "step": 14730 }, { "epoch": 2.987434130522902, "grad_norm": 0.006027384661138058, "learning_rate": 5.062829347385488e-05, "loss": 0.0022, "step": 14740 }, { "epoch": 2.9894608836643695, "grad_norm": 0.0832933858036995, "learning_rate": 5.052695581678152e-05, "loss": 0.4548, "step": 14750 }, { "epoch": 2.991487636805837, "grad_norm": 7.829352855682373, "learning_rate": 5.0425618159708153e-05, "loss": 0.445, "step": 14760 }, { "epoch": 2.9935143899473045, "grad_norm": 0.006316567305475473, "learning_rate": 5.032428050263478e-05, "loss": 0.0139, "step": 14770 }, { "epoch": 2.9955411430887717, "grad_norm": 0.006239940412342548, "learning_rate": 5.022294284556142e-05, "loss": 0.0034, "step": 14780 }, { "epoch": 2.997567896230239, "grad_norm": 0.006074379198253155, "learning_rate": 5.012160518848804e-05, "loss": 0.0007, "step": 14790 }, { "epoch": 2.9995946493717067, "grad_norm": 0.005999103654175997, "learning_rate": 5.002026753141468e-05, "loss": 0.002, "step": 14800 }, { "epoch": 2.9995946493717067, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10014810413122177, "eval_runtime": 47.3674, "eval_samples_per_second": 11.527, "eval_steps_per_second": 1.457, "step": 14800 }, { "epoch": 3.001621402513174, "grad_norm": 0.007480887696146965, "learning_rate": 4.991892987434131e-05, "loss": 0.7399, "step": 14810 }, { "epoch": 3.0036481556546413, "grad_norm": 0.006860152818262577, "learning_rate": 4.9817592217267936e-05, "loss": 0.0018, "step": 14820 }, { "epoch": 3.0056749087961085, "grad_norm": 0.007421591319143772, "learning_rate": 4.971625456019457e-05, "loss": 0.0041, "step": 14830 }, { "epoch": 3.007701661937576, "grad_norm": 0.08642849326133728, "learning_rate": 4.96149169031212e-05, "loss": 0.0028, "step": 14840 }, { "epoch": 3.0097284150790435, "grad_norm": 0.08337121456861496, "learning_rate": 4.9513579246047834e-05, "loss": 0.0038, "step": 14850 }, { "epoch": 3.011755168220511, "grad_norm": 0.0063841864466667175, "learning_rate": 4.9412241588974466e-05, "loss": 0.0129, "step": 14860 }, { "epoch": 3.013781921361978, "grad_norm": 0.005981700029224157, "learning_rate": 4.93109039319011e-05, "loss": 0.0016, "step": 14870 }, { "epoch": 3.0158086745034454, "grad_norm": 0.005900870077311993, "learning_rate": 4.920956627482773e-05, "loss": 0.0065, "step": 14880 }, { "epoch": 3.0178354276449126, "grad_norm": 0.005930820945650339, "learning_rate": 4.910822861775436e-05, "loss": 0.2251, "step": 14890 }, { "epoch": 3.0198621807863804, "grad_norm": 0.07632249593734741, "learning_rate": 4.900689096068099e-05, "loss": 0.0027, "step": 14900 }, { "epoch": 3.0198621807863804, "eval_accuracy": 0.978021978021978, "eval_loss": 0.12906181812286377, "eval_runtime": 50.1474, "eval_samples_per_second": 10.888, "eval_steps_per_second": 1.376, "step": 14900 }, { "epoch": 3.0218889339278476, "grad_norm": 0.00579514866694808, "learning_rate": 4.890555330360762e-05, "loss": 0.0148, "step": 14910 }, { "epoch": 3.023915687069315, "grad_norm": 9.295763969421387, "learning_rate": 4.8804215646534255e-05, "loss": 0.4507, "step": 14920 }, { "epoch": 3.025942440210782, "grad_norm": 0.07488545775413513, "learning_rate": 4.870287798946089e-05, "loss": 0.0024, "step": 14930 }, { "epoch": 3.02796919335225, "grad_norm": 0.005740451160818338, "learning_rate": 4.860154033238752e-05, "loss": 0.0026, "step": 14940 }, { "epoch": 3.029995946493717, "grad_norm": 0.07854526489973068, "learning_rate": 4.850020267531415e-05, "loss": 0.0015, "step": 14950 }, { "epoch": 3.0320226996351844, "grad_norm": 0.07284305989742279, "learning_rate": 4.839886501824078e-05, "loss": 0.0024, "step": 14960 }, { "epoch": 3.0340494527766517, "grad_norm": 0.0056227208115160465, "learning_rate": 4.829752736116741e-05, "loss": 0.4737, "step": 14970 }, { "epoch": 3.036076205918119, "grad_norm": 0.006293569691479206, "learning_rate": 4.8196189704094045e-05, "loss": 0.4649, "step": 14980 }, { "epoch": 3.0381029590595867, "grad_norm": 0.006045538932085037, "learning_rate": 4.809485204702068e-05, "loss": 0.0006, "step": 14990 }, { "epoch": 3.040129712201054, "grad_norm": 0.006128450855612755, "learning_rate": 4.799351438994731e-05, "loss": 0.971, "step": 15000 }, { "epoch": 3.040129712201054, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.16169553995132446, "eval_runtime": 47.2487, "eval_samples_per_second": 11.556, "eval_steps_per_second": 1.46, "step": 15000 }, { "epoch": 3.0421564653425213, "grad_norm": 0.007412407547235489, "learning_rate": 4.7892176732873936e-05, "loss": 0.0007, "step": 15010 }, { "epoch": 3.0441832184839885, "grad_norm": 0.006472380366176367, "learning_rate": 4.779083907580057e-05, "loss": 0.0035, "step": 15020 }, { "epoch": 3.046209971625456, "grad_norm": 0.2207396775484085, "learning_rate": 4.76895014187272e-05, "loss": 0.0244, "step": 15030 }, { "epoch": 3.0482367247669235, "grad_norm": 0.006490557920187712, "learning_rate": 4.7588163761653834e-05, "loss": 0.002, "step": 15040 }, { "epoch": 3.050263477908391, "grad_norm": 0.00590917281806469, "learning_rate": 4.748682610458046e-05, "loss": 0.0016, "step": 15050 }, { "epoch": 3.052290231049858, "grad_norm": 0.09242843836545944, "learning_rate": 4.73854884475071e-05, "loss": 0.0071, "step": 15060 }, { "epoch": 3.0543169841913254, "grad_norm": 0.005696085747331381, "learning_rate": 4.728415079043373e-05, "loss": 0.003, "step": 15070 }, { "epoch": 3.056343737332793, "grad_norm": 0.005512133240699768, "learning_rate": 4.718281313336036e-05, "loss": 0.0008, "step": 15080 }, { "epoch": 3.0583704904742604, "grad_norm": 0.0054388162679970264, "learning_rate": 4.708147547628699e-05, "loss": 0.0037, "step": 15090 }, { "epoch": 3.0603972436157276, "grad_norm": 0.005437575746327639, "learning_rate": 4.698013781921362e-05, "loss": 0.0024, "step": 15100 }, { "epoch": 3.0603972436157276, "eval_accuracy": 0.9706959706959707, "eval_loss": 0.12453050911426544, "eval_runtime": 50.2348, "eval_samples_per_second": 10.869, "eval_steps_per_second": 1.374, "step": 15100 }, { "epoch": 3.062423996757195, "grad_norm": 0.07207438349723816, "learning_rate": 4.687880016214025e-05, "loss": 0.0023, "step": 15110 }, { "epoch": 3.064450749898662, "grad_norm": 0.07433387637138367, "learning_rate": 4.677746250506688e-05, "loss": 0.0024, "step": 15120 }, { "epoch": 3.06647750304013, "grad_norm": 0.06784066557884216, "learning_rate": 4.667612484799352e-05, "loss": 0.0025, "step": 15130 }, { "epoch": 3.068504256181597, "grad_norm": 0.005310414358973503, "learning_rate": 4.657478719092015e-05, "loss": 0.0006, "step": 15140 }, { "epoch": 3.0705310093230644, "grad_norm": 0.06503447145223618, "learning_rate": 4.647344953384678e-05, "loss": 0.0036, "step": 15150 }, { "epoch": 3.0725577624645317, "grad_norm": 0.0050943815149366856, "learning_rate": 4.637211187677341e-05, "loss": 0.002, "step": 15160 }, { "epoch": 3.074584515605999, "grad_norm": 0.0048345597460865974, "learning_rate": 4.6270774219700045e-05, "loss": 0.0027, "step": 15170 }, { "epoch": 3.0766112687474667, "grad_norm": 0.004860189743340015, "learning_rate": 4.616943656262667e-05, "loss": 0.0005, "step": 15180 }, { "epoch": 3.078638021888934, "grad_norm": 0.0678575411438942, "learning_rate": 4.606809890555331e-05, "loss": 0.002, "step": 15190 }, { "epoch": 3.0806647750304013, "grad_norm": 0.004794835112988949, "learning_rate": 4.596676124847994e-05, "loss": 0.0172, "step": 15200 }, { "epoch": 3.0806647750304013, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.12464291602373123, "eval_runtime": 51.9311, "eval_samples_per_second": 10.514, "eval_steps_per_second": 1.329, "step": 15200 }, { "epoch": 3.0826915281718685, "grad_norm": 0.0046084243804216385, "learning_rate": 4.586542359140657e-05, "loss": 0.0013, "step": 15210 }, { "epoch": 3.084718281313336, "grad_norm": 0.004596021492034197, "learning_rate": 4.57640859343332e-05, "loss": 0.4799, "step": 15220 }, { "epoch": 3.0867450344548035, "grad_norm": 0.004740960896015167, "learning_rate": 4.5662748277259834e-05, "loss": 0.002, "step": 15230 }, { "epoch": 3.088771787596271, "grad_norm": 0.004619854502379894, "learning_rate": 4.556141062018646e-05, "loss": 0.0076, "step": 15240 }, { "epoch": 3.090798540737738, "grad_norm": 0.00916003342717886, "learning_rate": 4.546007296311309e-05, "loss": 0.0012, "step": 15250 }, { "epoch": 3.0928252938792054, "grad_norm": 0.004485053010284901, "learning_rate": 4.535873530603973e-05, "loss": 0.0012, "step": 15260 }, { "epoch": 3.094852047020673, "grad_norm": 0.004415922798216343, "learning_rate": 4.525739764896636e-05, "loss": 0.0073, "step": 15270 }, { "epoch": 3.0968788001621403, "grad_norm": 0.054408419877290726, "learning_rate": 4.515605999189299e-05, "loss": 0.002, "step": 15280 }, { "epoch": 3.0989055533036076, "grad_norm": 0.02117004431784153, "learning_rate": 4.505472233481962e-05, "loss": 0.0007, "step": 15290 }, { "epoch": 3.100932306445075, "grad_norm": 0.004247212782502174, "learning_rate": 4.495338467774625e-05, "loss": 0.0016, "step": 15300 }, { "epoch": 3.100932306445075, "eval_accuracy": 0.9633699633699634, "eval_loss": 0.16279546916484833, "eval_runtime": 48.6438, "eval_samples_per_second": 11.224, "eval_steps_per_second": 1.418, "step": 15300 }, { "epoch": 3.102959059586542, "grad_norm": 0.04946291446685791, "learning_rate": 4.485204702067288e-05, "loss": 0.0022, "step": 15310 }, { "epoch": 3.10498581272801, "grad_norm": 0.0042871409095823765, "learning_rate": 4.4750709363599514e-05, "loss": 0.0016, "step": 15320 }, { "epoch": 3.107012565869477, "grad_norm": 0.00467030331492424, "learning_rate": 4.464937170652615e-05, "loss": 0.0015, "step": 15330 }, { "epoch": 3.1090393190109444, "grad_norm": 0.007411759812384844, "learning_rate": 4.454803404945278e-05, "loss": 0.0025, "step": 15340 }, { "epoch": 3.1110660721524117, "grad_norm": 0.004603350069373846, "learning_rate": 4.444669639237941e-05, "loss": 0.0011, "step": 15350 }, { "epoch": 3.1130928252938794, "grad_norm": 0.0041840276680886745, "learning_rate": 4.4345358735306045e-05, "loss": 0.0014, "step": 15360 }, { "epoch": 3.1151195784353467, "grad_norm": 0.004000730812549591, "learning_rate": 4.424402107823267e-05, "loss": 0.529, "step": 15370 }, { "epoch": 3.117146331576814, "grad_norm": 0.004089189227670431, "learning_rate": 4.41426834211593e-05, "loss": 0.001, "step": 15380 }, { "epoch": 3.1191730847182813, "grad_norm": 0.0041065397672355175, "learning_rate": 4.4041345764085936e-05, "loss": 0.001, "step": 15390 }, { "epoch": 3.1211998378597485, "grad_norm": 0.004117593169212341, "learning_rate": 4.394000810701257e-05, "loss": 0.0016, "step": 15400 }, { "epoch": 3.1211998378597485, "eval_accuracy": 0.9633699633699634, "eval_loss": 0.16209031641483307, "eval_runtime": 50.0913, "eval_samples_per_second": 10.9, "eval_steps_per_second": 1.377, "step": 15400 }, { "epoch": 3.1232265910012162, "grad_norm": 0.004019442014396191, "learning_rate": 4.38386704499392e-05, "loss": 0.0012, "step": 15410 }, { "epoch": 3.1252533441426835, "grad_norm": 0.0039987810887396336, "learning_rate": 4.3737332792865834e-05, "loss": 0.0015, "step": 15420 }, { "epoch": 3.127280097284151, "grad_norm": 0.004105426371097565, "learning_rate": 4.363599513579246e-05, "loss": 0.5071, "step": 15430 }, { "epoch": 3.129306850425618, "grad_norm": 0.004296373575925827, "learning_rate": 4.353465747871909e-05, "loss": 0.4613, "step": 15440 }, { "epoch": 3.1313336035670853, "grad_norm": 0.004371760878711939, "learning_rate": 4.3433319821645725e-05, "loss": 0.002, "step": 15450 }, { "epoch": 3.133360356708553, "grad_norm": 0.004319510888308287, "learning_rate": 4.333198216457236e-05, "loss": 0.0012, "step": 15460 }, { "epoch": 3.1353871098500203, "grad_norm": 0.004228163044899702, "learning_rate": 4.323064450749899e-05, "loss": 0.0023, "step": 15470 }, { "epoch": 3.1374138629914876, "grad_norm": 0.004289459902793169, "learning_rate": 4.312930685042562e-05, "loss": 0.2547, "step": 15480 }, { "epoch": 3.139440616132955, "grad_norm": 0.004262127447873354, "learning_rate": 4.3027969193352255e-05, "loss": 0.4993, "step": 15490 }, { "epoch": 3.141467369274422, "grad_norm": 0.004445136059075594, "learning_rate": 4.292663153627888e-05, "loss": 0.0005, "step": 15500 }, { "epoch": 3.141467369274422, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11043673753738403, "eval_runtime": 50.316, "eval_samples_per_second": 10.851, "eval_steps_per_second": 1.371, "step": 15500 }, { "epoch": 3.14349412241589, "grad_norm": 0.004357473459094763, "learning_rate": 4.2825293879205514e-05, "loss": 0.0013, "step": 15510 }, { "epoch": 3.145520875557357, "grad_norm": 0.004336953163146973, "learning_rate": 4.272395622213215e-05, "loss": 0.0045, "step": 15520 }, { "epoch": 3.1475476286988244, "grad_norm": 0.0772530660033226, "learning_rate": 4.262261856505878e-05, "loss": 0.003, "step": 15530 }, { "epoch": 3.1495743818402917, "grad_norm": 0.06546340137720108, "learning_rate": 4.252128090798541e-05, "loss": 0.0028, "step": 15540 }, { "epoch": 3.1516011349817594, "grad_norm": 0.004146865103393793, "learning_rate": 4.2419943250912045e-05, "loss": 0.0021, "step": 15550 }, { "epoch": 3.1536278881232267, "grad_norm": 0.0040993038564920425, "learning_rate": 4.231860559383867e-05, "loss": 0.0011, "step": 15560 }, { "epoch": 3.155654641264694, "grad_norm": 0.004080483224242926, "learning_rate": 4.22172679367653e-05, "loss": 0.0004, "step": 15570 }, { "epoch": 3.1576813944061612, "grad_norm": 0.004077650140970945, "learning_rate": 4.2115930279691936e-05, "loss": 0.0005, "step": 15580 }, { "epoch": 3.1597081475476285, "grad_norm": 0.003945386037230492, "learning_rate": 4.201459262261856e-05, "loss": 0.3534, "step": 15590 }, { "epoch": 3.1617349006890962, "grad_norm": 0.004126140847802162, "learning_rate": 4.19132549655452e-05, "loss": 0.3195, "step": 15600 }, { "epoch": 3.1617349006890962, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.1447170376777649, "eval_runtime": 48.1002, "eval_samples_per_second": 11.351, "eval_steps_per_second": 1.435, "step": 15600 }, { "epoch": 3.1637616538305635, "grad_norm": 0.06581421196460724, "learning_rate": 4.1811917308471834e-05, "loss": 0.0053, "step": 15610 }, { "epoch": 3.165788406972031, "grad_norm": 0.004257733467966318, "learning_rate": 4.171057965139846e-05, "loss": 0.0005, "step": 15620 }, { "epoch": 3.167815160113498, "grad_norm": 0.0041100881062448025, "learning_rate": 4.160924199432509e-05, "loss": 0.0029, "step": 15630 }, { "epoch": 3.1698419132549653, "grad_norm": 0.004436058457940817, "learning_rate": 4.1507904337251725e-05, "loss": 0.7637, "step": 15640 }, { "epoch": 3.171868666396433, "grad_norm": 0.061650391668081284, "learning_rate": 4.140656668017836e-05, "loss": 0.0012, "step": 15650 }, { "epoch": 3.1738954195379003, "grad_norm": 0.004761091433465481, "learning_rate": 4.1305229023104983e-05, "loss": 0.0091, "step": 15660 }, { "epoch": 3.1759221726793676, "grad_norm": 0.07118819653987885, "learning_rate": 4.120389136603162e-05, "loss": 0.0028, "step": 15670 }, { "epoch": 3.177948925820835, "grad_norm": 0.004552809055894613, "learning_rate": 4.1102553708958255e-05, "loss": 0.4552, "step": 15680 }, { "epoch": 3.1799756789623026, "grad_norm": 0.0048391795717179775, "learning_rate": 4.100121605188488e-05, "loss": 0.0026, "step": 15690 }, { "epoch": 3.18200243210377, "grad_norm": 0.005941119510680437, "learning_rate": 4.0899878394811514e-05, "loss": 2.3502, "step": 15700 }, { "epoch": 3.18200243210377, "eval_accuracy": 0.9652014652014652, "eval_loss": 0.1826866865158081, "eval_runtime": 50.1588, "eval_samples_per_second": 10.885, "eval_steps_per_second": 1.376, "step": 15700 }, { "epoch": 3.184029185245237, "grad_norm": 0.006943140644580126, "learning_rate": 4.079854073773815e-05, "loss": 0.0019, "step": 15710 }, { "epoch": 3.1860559383867044, "grad_norm": 0.00895757507532835, "learning_rate": 4.069720308066477e-05, "loss": 0.002, "step": 15720 }, { "epoch": 3.1880826915281717, "grad_norm": 0.0076103429310023785, "learning_rate": 4.0595865423591405e-05, "loss": 0.0024, "step": 15730 }, { "epoch": 3.1901094446696394, "grad_norm": 0.009083851240575314, "learning_rate": 4.0494527766518045e-05, "loss": 0.0073, "step": 15740 }, { "epoch": 3.1921361978111067, "grad_norm": 0.009438318200409412, "learning_rate": 4.039319010944467e-05, "loss": 0.4983, "step": 15750 }, { "epoch": 3.194162950952574, "grad_norm": 0.1173553317785263, "learning_rate": 4.02918524523713e-05, "loss": 0.003, "step": 15760 }, { "epoch": 3.1961897040940412, "grad_norm": 0.006215721368789673, "learning_rate": 4.0190514795297936e-05, "loss": 0.0022, "step": 15770 }, { "epoch": 3.1982164572355085, "grad_norm": 0.0618855357170105, "learning_rate": 4.008917713822457e-05, "loss": 0.0015, "step": 15780 }, { "epoch": 3.2002432103769762, "grad_norm": 0.06262574344873428, "learning_rate": 3.9987839481151194e-05, "loss": 0.0015, "step": 15790 }, { "epoch": 3.2022699635184435, "grad_norm": 0.006081962957978249, "learning_rate": 3.988650182407783e-05, "loss": 0.4252, "step": 15800 }, { "epoch": 3.2022699635184435, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.10769849270582199, "eval_runtime": 47.2943, "eval_samples_per_second": 11.545, "eval_steps_per_second": 1.459, "step": 15800 }, { "epoch": 3.204296716659911, "grad_norm": 0.010130221024155617, "learning_rate": 3.978516416700446e-05, "loss": 0.002, "step": 15810 }, { "epoch": 3.206323469801378, "grad_norm": 0.005564684513956308, "learning_rate": 3.968382650993109e-05, "loss": 0.0064, "step": 15820 }, { "epoch": 3.2083502229428458, "grad_norm": 0.005339638330042362, "learning_rate": 3.9582488852857725e-05, "loss": 0.4571, "step": 15830 }, { "epoch": 3.210376976084313, "grad_norm": 0.005142607726156712, "learning_rate": 3.948115119578436e-05, "loss": 0.0022, "step": 15840 }, { "epoch": 3.2124037292257803, "grad_norm": 0.005214919801801443, "learning_rate": 3.9379813538710983e-05, "loss": 0.002, "step": 15850 }, { "epoch": 3.2144304823672476, "grad_norm": 0.005264309234917164, "learning_rate": 3.9278475881637616e-05, "loss": 0.0036, "step": 15860 }, { "epoch": 3.216457235508715, "grad_norm": 0.0050828661769628525, "learning_rate": 3.9177138224564256e-05, "loss": 0.0007, "step": 15870 }, { "epoch": 3.2184839886501826, "grad_norm": 10.684004783630371, "learning_rate": 3.907580056749088e-05, "loss": 0.318, "step": 15880 }, { "epoch": 3.22051074179165, "grad_norm": 0.004819140769541264, "learning_rate": 3.8974462910417514e-05, "loss": 0.0013, "step": 15890 }, { "epoch": 3.222537494933117, "grad_norm": 0.07204253226518631, "learning_rate": 3.887312525334415e-05, "loss": 0.0042, "step": 15900 }, { "epoch": 3.222537494933117, "eval_accuracy": 0.9706959706959707, "eval_loss": 0.1431271880865097, "eval_runtime": 50.1685, "eval_samples_per_second": 10.883, "eval_steps_per_second": 1.375, "step": 15900 }, { "epoch": 3.2245642480745844, "grad_norm": 0.004766241647303104, "learning_rate": 3.877178759627077e-05, "loss": 0.0006, "step": 15910 }, { "epoch": 3.2265910012160517, "grad_norm": 0.004663664381951094, "learning_rate": 3.8670449939197405e-05, "loss": 0.0055, "step": 15920 }, { "epoch": 3.2286177543575194, "grad_norm": 0.004675132222473621, "learning_rate": 3.856911228212404e-05, "loss": 0.0017, "step": 15930 }, { "epoch": 3.2306445074989867, "grad_norm": 0.007203600835055113, "learning_rate": 3.846777462505067e-05, "loss": 0.0018, "step": 15940 }, { "epoch": 3.232671260640454, "grad_norm": 0.005374888889491558, "learning_rate": 3.83664369679773e-05, "loss": 0.0022, "step": 15950 }, { "epoch": 3.2346980137819212, "grad_norm": 0.049246896058321, "learning_rate": 3.8265099310903936e-05, "loss": 0.0015, "step": 15960 }, { "epoch": 3.2367247669233885, "grad_norm": 0.004231675993651152, "learning_rate": 3.816376165383057e-05, "loss": 0.0086, "step": 15970 }, { "epoch": 3.2387515200648562, "grad_norm": 0.004210128914564848, "learning_rate": 3.8062423996757194e-05, "loss": 0.0005, "step": 15980 }, { "epoch": 3.2407782732063235, "grad_norm": 0.04477294534444809, "learning_rate": 3.796108633968383e-05, "loss": 0.0027, "step": 15990 }, { "epoch": 3.2428050263477908, "grad_norm": 0.00412526773288846, "learning_rate": 3.785974868261046e-05, "loss": 1.0207, "step": 16000 }, { "epoch": 3.2428050263477908, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.1287493109703064, "eval_runtime": 46.4503, "eval_samples_per_second": 11.754, "eval_steps_per_second": 1.485, "step": 16000 }, { "epoch": 3.244831779489258, "grad_norm": 0.009321167133748531, "learning_rate": 3.775841102553709e-05, "loss": 0.0041, "step": 16010 }, { "epoch": 3.2468585326307258, "grad_norm": 0.004468958359211683, "learning_rate": 3.7657073368463725e-05, "loss": 0.0038, "step": 16020 }, { "epoch": 3.248885285772193, "grad_norm": 0.004842129535973072, "learning_rate": 3.755573571139036e-05, "loss": 0.0043, "step": 16030 }, { "epoch": 3.2509120389136603, "grad_norm": 0.004659067839384079, "learning_rate": 3.7454398054316983e-05, "loss": 0.7708, "step": 16040 }, { "epoch": 3.2529387920551276, "grad_norm": 0.005155695136636496, "learning_rate": 3.7353060397243616e-05, "loss": 0.0012, "step": 16050 }, { "epoch": 3.254965545196595, "grad_norm": 0.0048374454490840435, "learning_rate": 3.725172274017025e-05, "loss": 0.0019, "step": 16060 }, { "epoch": 3.2569922983380626, "grad_norm": 0.004809567239135504, "learning_rate": 3.715038508309688e-05, "loss": 0.0614, "step": 16070 }, { "epoch": 3.25901905147953, "grad_norm": 0.05183807387948036, "learning_rate": 3.7049047426023514e-05, "loss": 0.003, "step": 16080 }, { "epoch": 3.261045804620997, "grad_norm": 0.004665842745453119, "learning_rate": 3.694770976895015e-05, "loss": 0.0012, "step": 16090 }, { "epoch": 3.2630725577624644, "grad_norm": 0.0047615994699299335, "learning_rate": 3.684637211187677e-05, "loss": 0.5064, "step": 16100 }, { "epoch": 3.2630725577624644, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.16628408432006836, "eval_runtime": 49.9971, "eval_samples_per_second": 10.921, "eval_steps_per_second": 1.38, "step": 16100 }, { "epoch": 3.265099310903932, "grad_norm": 0.004774071741849184, "learning_rate": 3.6745034454803405e-05, "loss": 0.0025, "step": 16110 }, { "epoch": 3.2671260640453994, "grad_norm": 0.004758866038173437, "learning_rate": 3.664369679773004e-05, "loss": 0.0019, "step": 16120 }, { "epoch": 3.2691528171868667, "grad_norm": 0.0046609085984528065, "learning_rate": 3.654235914065667e-05, "loss": 0.0012, "step": 16130 }, { "epoch": 3.271179570328334, "grad_norm": 0.004649462644010782, "learning_rate": 3.6441021483583296e-05, "loss": 0.0019, "step": 16140 }, { "epoch": 3.2732063234698012, "grad_norm": 0.004584324546158314, "learning_rate": 3.6339683826509936e-05, "loss": 0.0012, "step": 16150 }, { "epoch": 3.2752330766112685, "grad_norm": 0.004533680621534586, "learning_rate": 3.623834616943657e-05, "loss": 0.0011, "step": 16160 }, { "epoch": 3.277259829752736, "grad_norm": 0.004560311324894428, "learning_rate": 3.6137008512363194e-05, "loss": 0.0011, "step": 16170 }, { "epoch": 3.2792865828942035, "grad_norm": 0.0044805011712014675, "learning_rate": 3.603567085528983e-05, "loss": 0.5046, "step": 16180 }, { "epoch": 3.2813133360356708, "grad_norm": 0.05615738034248352, "learning_rate": 3.593433319821646e-05, "loss": 0.0024, "step": 16190 }, { "epoch": 3.283340089177138, "grad_norm": 0.004519937094300985, "learning_rate": 3.5832995541143086e-05, "loss": 0.0018, "step": 16200 }, { "epoch": 3.283340089177138, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.13267415761947632, "eval_runtime": 49.914, "eval_samples_per_second": 10.939, "eval_steps_per_second": 1.382, "step": 16200 }, { "epoch": 3.2853668423186058, "grad_norm": 0.05049968510866165, "learning_rate": 3.5731657884069725e-05, "loss": 0.0021, "step": 16210 }, { "epoch": 3.287393595460073, "grad_norm": 0.048404913395643234, "learning_rate": 3.563032022699636e-05, "loss": 0.0043, "step": 16220 }, { "epoch": 3.2894203486015403, "grad_norm": 0.004592543467879295, "learning_rate": 3.5528982569922983e-05, "loss": 0.6482, "step": 16230 }, { "epoch": 3.2914471017430076, "grad_norm": 0.004584785085171461, "learning_rate": 3.5427644912849616e-05, "loss": 0.0017, "step": 16240 }, { "epoch": 3.293473854884475, "grad_norm": 0.004494310822337866, "learning_rate": 3.532630725577625e-05, "loss": 0.0012, "step": 16250 }, { "epoch": 3.2955006080259426, "grad_norm": 0.10379710048437119, "learning_rate": 3.522496959870288e-05, "loss": 0.0021, "step": 16260 }, { "epoch": 3.29752736116741, "grad_norm": 0.047005441039800644, "learning_rate": 3.512363194162951e-05, "loss": 0.002, "step": 16270 }, { "epoch": 3.299554114308877, "grad_norm": 0.04951295629143715, "learning_rate": 3.502229428455615e-05, "loss": 0.3844, "step": 16280 }, { "epoch": 3.3015808674503444, "grad_norm": 0.004487763158977032, "learning_rate": 3.492095662748278e-05, "loss": 0.0017, "step": 16290 }, { "epoch": 3.303607620591812, "grad_norm": 0.0049188267439603806, "learning_rate": 3.4819618970409405e-05, "loss": 0.0006, "step": 16300 }, { "epoch": 3.303607620591812, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11632156372070312, "eval_runtime": 47.9658, "eval_samples_per_second": 11.383, "eval_steps_per_second": 1.439, "step": 16300 }, { "epoch": 3.3056343737332794, "grad_norm": 0.004525529686361551, "learning_rate": 3.471828131333604e-05, "loss": 0.002, "step": 16310 }, { "epoch": 3.3076611268747467, "grad_norm": 0.02013551816344261, "learning_rate": 3.461694365626267e-05, "loss": 0.0012, "step": 16320 }, { "epoch": 3.309687880016214, "grad_norm": 0.052169717848300934, "learning_rate": 3.4515605999189296e-05, "loss": 0.514, "step": 16330 }, { "epoch": 3.311714633157681, "grad_norm": 0.007503776345402002, "learning_rate": 3.441426834211593e-05, "loss": 0.0013, "step": 16340 }, { "epoch": 3.313741386299149, "grad_norm": 0.022679857909679413, "learning_rate": 3.431293068504257e-05, "loss": 0.4985, "step": 16350 }, { "epoch": 3.315768139440616, "grad_norm": 0.0106239328160882, "learning_rate": 3.4211593027969194e-05, "loss": 0.7458, "step": 16360 }, { "epoch": 3.3177948925820835, "grad_norm": 0.005137187894433737, "learning_rate": 3.411025537089583e-05, "loss": 0.0036, "step": 16370 }, { "epoch": 3.3198216457235508, "grad_norm": 0.06359566748142242, "learning_rate": 3.400891771382246e-05, "loss": 0.3599, "step": 16380 }, { "epoch": 3.3218483988650185, "grad_norm": 0.0068815951235592365, "learning_rate": 3.3907580056749086e-05, "loss": 0.0072, "step": 16390 }, { "epoch": 3.3238751520064858, "grad_norm": 0.0051819924265146255, "learning_rate": 3.380624239967572e-05, "loss": 0.0039, "step": 16400 }, { "epoch": 3.3238751520064858, "eval_accuracy": 0.9725274725274725, "eval_loss": 0.141290545463562, "eval_runtime": 50.1791, "eval_samples_per_second": 10.881, "eval_steps_per_second": 1.375, "step": 16400 }, { "epoch": 3.325901905147953, "grad_norm": 0.005205860361456871, "learning_rate": 3.370490474260235e-05, "loss": 0.0076, "step": 16410 }, { "epoch": 3.3279286582894203, "grad_norm": 0.08553090691566467, "learning_rate": 3.3603567085528983e-05, "loss": 0.4323, "step": 16420 }, { "epoch": 3.3299554114308876, "grad_norm": 0.06025693565607071, "learning_rate": 3.3502229428455616e-05, "loss": 0.0022, "step": 16430 }, { "epoch": 3.331982164572355, "grad_norm": 0.056943122297525406, "learning_rate": 3.340089177138225e-05, "loss": 0.0013, "step": 16440 }, { "epoch": 3.3340089177138226, "grad_norm": 0.007948963902890682, "learning_rate": 3.329955411430888e-05, "loss": 0.0039, "step": 16450 }, { "epoch": 3.33603567085529, "grad_norm": 0.004892650060355663, "learning_rate": 3.319821645723551e-05, "loss": 0.0013, "step": 16460 }, { "epoch": 3.338062423996757, "grad_norm": 0.004903930705040693, "learning_rate": 3.309687880016214e-05, "loss": 0.3424, "step": 16470 }, { "epoch": 3.3400891771382244, "grad_norm": 0.007834755815565586, "learning_rate": 3.299554114308877e-05, "loss": 0.3003, "step": 16480 }, { "epoch": 3.342115930279692, "grad_norm": 0.0051131355576217175, "learning_rate": 3.2894203486015405e-05, "loss": 0.0013, "step": 16490 }, { "epoch": 3.3441426834211594, "grad_norm": 0.005035320296883583, "learning_rate": 3.279286582894204e-05, "loss": 0.5045, "step": 16500 }, { "epoch": 3.3441426834211594, "eval_accuracy": 0.9688644688644689, "eval_loss": 0.1572037637233734, "eval_runtime": 47.102, "eval_samples_per_second": 11.592, "eval_steps_per_second": 1.465, "step": 16500 }, { "epoch": 3.3461694365626267, "grad_norm": 0.07073163986206055, "learning_rate": 3.269152817186867e-05, "loss": 0.752, "step": 16510 }, { "epoch": 3.348196189704094, "grad_norm": 0.007176402490586042, "learning_rate": 3.2590190514795296e-05, "loss": 0.0015, "step": 16520 }, { "epoch": 3.350222942845561, "grad_norm": 0.007812405005097389, "learning_rate": 3.248885285772193e-05, "loss": 0.0084, "step": 16530 }, { "epoch": 3.352249695987029, "grad_norm": 0.008438252843916416, "learning_rate": 3.238751520064856e-05, "loss": 0.0031, "step": 16540 }, { "epoch": 3.354276449128496, "grad_norm": 0.007445335853844881, "learning_rate": 3.2286177543575194e-05, "loss": 0.0021, "step": 16550 }, { "epoch": 3.3563032022699635, "grad_norm": 0.005647340323776007, "learning_rate": 3.218483988650183e-05, "loss": 0.2166, "step": 16560 }, { "epoch": 3.3583299554114308, "grad_norm": 0.007124828174710274, "learning_rate": 3.208350222942846e-05, "loss": 0.7467, "step": 16570 }, { "epoch": 3.3603567085528985, "grad_norm": 0.007737248204648495, "learning_rate": 3.1982164572355086e-05, "loss": 0.0014, "step": 16580 }, { "epoch": 3.3623834616943657, "grad_norm": 0.005917510949075222, "learning_rate": 3.188082691528172e-05, "loss": 0.0061, "step": 16590 }, { "epoch": 3.364410214835833, "grad_norm": 0.006020793225616217, "learning_rate": 3.177948925820835e-05, "loss": 0.0069, "step": 16600 }, { "epoch": 3.364410214835833, "eval_accuracy": 0.967032967032967, "eval_loss": 0.1553422510623932, "eval_runtime": 50.2992, "eval_samples_per_second": 10.855, "eval_steps_per_second": 1.372, "step": 16600 }, { "epoch": 3.3664369679773003, "grad_norm": 0.005671895109117031, "learning_rate": 3.1678151601134983e-05, "loss": 0.0403, "step": 16610 }, { "epoch": 3.3684637211187676, "grad_norm": 0.005752407480031252, "learning_rate": 3.1576813944061616e-05, "loss": 0.4096, "step": 16620 }, { "epoch": 3.3704904742602353, "grad_norm": 0.005818086210638285, "learning_rate": 3.147547628698825e-05, "loss": 1.4527, "step": 16630 }, { "epoch": 3.3725172274017026, "grad_norm": 0.007470723241567612, "learning_rate": 3.137413862991488e-05, "loss": 0.0081, "step": 16640 }, { "epoch": 3.37454398054317, "grad_norm": 0.007930467836558819, "learning_rate": 3.127280097284151e-05, "loss": 0.0048, "step": 16650 }, { "epoch": 3.376570733684637, "grad_norm": 0.008607808500528336, "learning_rate": 3.117146331576814e-05, "loss": 0.0042, "step": 16660 }, { "epoch": 3.3785974868261044, "grad_norm": 0.006221098359674215, "learning_rate": 3.107012565869477e-05, "loss": 0.0011, "step": 16670 }, { "epoch": 3.380624239967572, "grad_norm": 0.010589617304503918, "learning_rate": 3.09687880016214e-05, "loss": 0.0032, "step": 16680 }, { "epoch": 3.3826509931090394, "grad_norm": 0.005919576622545719, "learning_rate": 3.086745034454804e-05, "loss": 0.3659, "step": 16690 }, { "epoch": 3.3846777462505067, "grad_norm": 0.010437965393066406, "learning_rate": 3.076611268747467e-05, "loss": 0.0058, "step": 16700 }, { "epoch": 3.3846777462505067, "eval_accuracy": 0.978021978021978, "eval_loss": 0.10221080482006073, "eval_runtime": 47.2291, "eval_samples_per_second": 11.561, "eval_steps_per_second": 1.461, "step": 16700 }, { "epoch": 3.386704499391974, "grad_norm": 0.005819872487336397, "learning_rate": 3.0664775030401296e-05, "loss": 0.0029, "step": 16710 }, { "epoch": 3.388731252533441, "grad_norm": 0.059754569083452225, "learning_rate": 3.056343737332793e-05, "loss": 0.0032, "step": 16720 }, { "epoch": 3.390758005674909, "grad_norm": 0.007293600123375654, "learning_rate": 3.0462099716254562e-05, "loss": 0.0026, "step": 16730 }, { "epoch": 3.392784758816376, "grad_norm": 0.0075224339962005615, "learning_rate": 3.036076205918119e-05, "loss": 0.0026, "step": 16740 }, { "epoch": 3.3948115119578435, "grad_norm": 0.006849432829767466, "learning_rate": 3.0259424402107824e-05, "loss": 0.7453, "step": 16750 }, { "epoch": 3.3968382650993107, "grad_norm": 0.00680953124538064, "learning_rate": 3.015808674503446e-05, "loss": 0.0026, "step": 16760 }, { "epoch": 3.3988650182407785, "grad_norm": 0.006028030067682266, "learning_rate": 3.005674908796109e-05, "loss": 0.0014, "step": 16770 }, { "epoch": 3.4008917713822457, "grad_norm": 0.00616206182166934, "learning_rate": 2.9955411430887718e-05, "loss": 0.0038, "step": 16780 }, { "epoch": 3.402918524523713, "grad_norm": 0.006383887026458979, "learning_rate": 2.985407377381435e-05, "loss": 0.0016, "step": 16790 }, { "epoch": 3.4049452776651803, "grad_norm": 0.006572426296770573, "learning_rate": 2.975273611674098e-05, "loss": 0.006, "step": 16800 }, { "epoch": 3.4049452776651803, "eval_accuracy": 0.978021978021978, "eval_loss": 0.09933307021856308, "eval_runtime": 48.804, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.414, "step": 16800 }, { "epoch": 3.4069720308066476, "grad_norm": 0.005607839208096266, "learning_rate": 2.9651398459667613e-05, "loss": 0.0017, "step": 16810 }, { "epoch": 3.4089987839481153, "grad_norm": 0.005661617498844862, "learning_rate": 2.9550060802594242e-05, "loss": 0.0034, "step": 16820 }, { "epoch": 3.4110255370895826, "grad_norm": 0.03819294273853302, "learning_rate": 2.9448723145520878e-05, "loss": 0.0017, "step": 16830 }, { "epoch": 3.41305229023105, "grad_norm": 0.005522882100194693, "learning_rate": 2.934738548844751e-05, "loss": 0.0016, "step": 16840 }, { "epoch": 3.415079043372517, "grad_norm": 0.005343144293874502, "learning_rate": 2.924604783137414e-05, "loss": 0.5167, "step": 16850 }, { "epoch": 3.417105796513985, "grad_norm": 0.055113568902015686, "learning_rate": 2.9144710174300773e-05, "loss": 0.5119, "step": 16860 }, { "epoch": 3.419132549655452, "grad_norm": 0.06563153862953186, "learning_rate": 2.9043372517227402e-05, "loss": 0.9829, "step": 16870 }, { "epoch": 3.4211593027969194, "grad_norm": 0.02681180089712143, "learning_rate": 2.894203486015403e-05, "loss": 0.0017, "step": 16880 }, { "epoch": 3.4231860559383867, "grad_norm": 0.00725955655798316, "learning_rate": 2.8840697203080667e-05, "loss": 0.587, "step": 16890 }, { "epoch": 3.425212809079854, "grad_norm": 0.007704930379986763, "learning_rate": 2.87393595460073e-05, "loss": 0.002, "step": 16900 }, { "epoch": 3.425212809079854, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.09541966766119003, "eval_runtime": 50.1127, "eval_samples_per_second": 10.895, "eval_steps_per_second": 1.377, "step": 16900 }, { "epoch": 3.427239562221321, "grad_norm": 0.06465483456850052, "learning_rate": 2.863802188893393e-05, "loss": 0.0066, "step": 16910 }, { "epoch": 3.429266315362789, "grad_norm": 8.011322021484375, "learning_rate": 2.8536684231860562e-05, "loss": 0.4889, "step": 16920 }, { "epoch": 3.431293068504256, "grad_norm": 0.061154309660196304, "learning_rate": 2.843534657478719e-05, "loss": 0.0023, "step": 16930 }, { "epoch": 3.4333198216457235, "grad_norm": 0.006984674371778965, "learning_rate": 2.8334008917713824e-05, "loss": 0.0016, "step": 16940 }, { "epoch": 3.4353465747871907, "grad_norm": 0.013019927777349949, "learning_rate": 2.8232671260640453e-05, "loss": 0.4919, "step": 16950 }, { "epoch": 3.4373733279286585, "grad_norm": 0.006715769879519939, "learning_rate": 2.813133360356709e-05, "loss": 0.003, "step": 16960 }, { "epoch": 3.4394000810701257, "grad_norm": 0.0073410640470683575, "learning_rate": 2.802999594649372e-05, "loss": 0.9557, "step": 16970 }, { "epoch": 3.441426834211593, "grad_norm": 0.007587122730910778, "learning_rate": 2.792865828942035e-05, "loss": 0.0015, "step": 16980 }, { "epoch": 3.4434535873530603, "grad_norm": 0.007860773243010044, "learning_rate": 2.782732063234698e-05, "loss": 0.0043, "step": 16990 }, { "epoch": 3.4454803404945276, "grad_norm": 0.030218765139579773, "learning_rate": 2.7725982975273613e-05, "loss": 0.0082, "step": 17000 }, { "epoch": 3.4454803404945276, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.09758459776639938, "eval_runtime": 47.6392, "eval_samples_per_second": 11.461, "eval_steps_per_second": 1.448, "step": 17000 }, { "epoch": 3.4475070936359953, "grad_norm": 0.007678163703531027, "learning_rate": 2.7624645318200242e-05, "loss": 0.0026, "step": 17010 }, { "epoch": 3.4495338467774626, "grad_norm": 0.006676503922790289, "learning_rate": 2.7523307661126875e-05, "loss": 0.0029, "step": 17020 }, { "epoch": 3.45156059991893, "grad_norm": 0.011629360727965832, "learning_rate": 2.742197000405351e-05, "loss": 0.0026, "step": 17030 }, { "epoch": 3.453587353060397, "grad_norm": 0.0064224074594676495, "learning_rate": 2.732063234698014e-05, "loss": 0.0011, "step": 17040 }, { "epoch": 3.455614106201865, "grad_norm": 0.009052238427102566, "learning_rate": 2.7219294689906773e-05, "loss": 0.0019, "step": 17050 }, { "epoch": 3.457640859343332, "grad_norm": 0.006074480712413788, "learning_rate": 2.7117957032833402e-05, "loss": 0.0119, "step": 17060 }, { "epoch": 3.4596676124847994, "grad_norm": 0.00601073307916522, "learning_rate": 2.701661937576003e-05, "loss": 0.0026, "step": 17070 }, { "epoch": 3.4616943656262666, "grad_norm": 0.006008122581988573, "learning_rate": 2.6915281718686664e-05, "loss": 0.003, "step": 17080 }, { "epoch": 3.463721118767734, "grad_norm": 0.006180489901453257, "learning_rate": 2.6813944061613293e-05, "loss": 0.0022, "step": 17090 }, { "epoch": 3.4657478719092016, "grad_norm": 0.005895602982491255, "learning_rate": 2.671260640453993e-05, "loss": 0.0029, "step": 17100 }, { "epoch": 3.4657478719092016, "eval_accuracy": 0.978021978021978, "eval_loss": 0.0977577343583107, "eval_runtime": 48.4339, "eval_samples_per_second": 11.273, "eval_steps_per_second": 1.425, "step": 17100 }, { "epoch": 3.467774625050669, "grad_norm": 0.06110497564077377, "learning_rate": 2.6611268747466562e-05, "loss": 0.0112, "step": 17110 }, { "epoch": 3.469801378192136, "grad_norm": 0.0056753759272396564, "learning_rate": 2.650993109039319e-05, "loss": 0.0034, "step": 17120 }, { "epoch": 3.4718281313336035, "grad_norm": 0.006453051697462797, "learning_rate": 2.6408593433319824e-05, "loss": 0.2768, "step": 17130 }, { "epoch": 3.473854884475071, "grad_norm": 0.005545321386307478, "learning_rate": 2.6307255776246453e-05, "loss": 0.0034, "step": 17140 }, { "epoch": 3.4758816376165385, "grad_norm": 0.005507095716893673, "learning_rate": 2.6205918119173086e-05, "loss": 0.0008, "step": 17150 }, { "epoch": 3.4779083907580057, "grad_norm": 0.005461165215820074, "learning_rate": 2.6104580462099715e-05, "loss": 0.0039, "step": 17160 }, { "epoch": 3.479935143899473, "grad_norm": 0.07790371030569077, "learning_rate": 2.600324280502635e-05, "loss": 0.0028, "step": 17170 }, { "epoch": 3.4819618970409403, "grad_norm": 0.005248835310339928, "learning_rate": 2.590190514795298e-05, "loss": 0.0019, "step": 17180 }, { "epoch": 3.4839886501824076, "grad_norm": 0.006307022646069527, "learning_rate": 2.5800567490879613e-05, "loss": 0.5085, "step": 17190 }, { "epoch": 3.4860154033238753, "grad_norm": 0.005371470469981432, "learning_rate": 2.5699229833806242e-05, "loss": 0.0008, "step": 17200 }, { "epoch": 3.4860154033238753, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.09734708815813065, "eval_runtime": 50.2153, "eval_samples_per_second": 10.873, "eval_steps_per_second": 1.374, "step": 17200 }, { "epoch": 3.4880421564653425, "grad_norm": 0.05594426393508911, "learning_rate": 2.5597892176732875e-05, "loss": 0.002, "step": 17210 }, { "epoch": 3.49006890960681, "grad_norm": 0.05673331022262573, "learning_rate": 2.5496554519659504e-05, "loss": 0.0019, "step": 17220 }, { "epoch": 3.492095662748277, "grad_norm": 0.005430069286376238, "learning_rate": 2.539521686258614e-05, "loss": 0.0031, "step": 17230 }, { "epoch": 3.494122415889745, "grad_norm": 8.014272689819336, "learning_rate": 2.5293879205512773e-05, "loss": 0.5023, "step": 17240 }, { "epoch": 3.496149169031212, "grad_norm": 0.006552092730998993, "learning_rate": 2.5192541548439402e-05, "loss": 0.0026, "step": 17250 }, { "epoch": 3.4981759221726794, "grad_norm": 0.0060019418597221375, "learning_rate": 2.5091203891366035e-05, "loss": 0.0018, "step": 17260 }, { "epoch": 3.5002026753141466, "grad_norm": 0.0064520156010985374, "learning_rate": 2.4989866234292664e-05, "loss": 0.7485, "step": 17270 }, { "epoch": 3.502229428455614, "grad_norm": 0.059147171676158905, "learning_rate": 2.4888528577219296e-05, "loss": 0.0029, "step": 17280 }, { "epoch": 3.5042561815970816, "grad_norm": 0.005694244988262653, "learning_rate": 2.478719092014593e-05, "loss": 0.0013, "step": 17290 }, { "epoch": 3.506282934738549, "grad_norm": 0.005751201882958412, "learning_rate": 2.468585326307256e-05, "loss": 0.0014, "step": 17300 }, { "epoch": 3.506282934738549, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.09791267663240433, "eval_runtime": 46.9778, "eval_samples_per_second": 11.623, "eval_steps_per_second": 1.469, "step": 17300 }, { "epoch": 3.508309687880016, "grad_norm": 0.05871247500181198, "learning_rate": 2.458451560599919e-05, "loss": 0.457, "step": 17310 }, { "epoch": 3.5103364410214835, "grad_norm": 0.006226697936654091, "learning_rate": 2.4483177948925824e-05, "loss": 0.0022, "step": 17320 }, { "epoch": 3.512363194162951, "grad_norm": 0.00761007983237505, "learning_rate": 2.4381840291852453e-05, "loss": 0.0013, "step": 17330 }, { "epoch": 3.5143899473044184, "grad_norm": 0.00598880834877491, "learning_rate": 2.4280502634779086e-05, "loss": 0.4966, "step": 17340 }, { "epoch": 3.5164167004458857, "grad_norm": 0.006399375386536121, "learning_rate": 2.4179164977705718e-05, "loss": 0.0006, "step": 17350 }, { "epoch": 3.518443453587353, "grad_norm": 0.0067553287371993065, "learning_rate": 2.4077827320632347e-05, "loss": 0.0017, "step": 17360 }, { "epoch": 3.5204702067288203, "grad_norm": 0.006269978359341621, "learning_rate": 2.397648966355898e-05, "loss": 0.0846, "step": 17370 }, { "epoch": 3.5224969598702875, "grad_norm": 0.005716945044696331, "learning_rate": 2.387515200648561e-05, "loss": 0.1505, "step": 17380 }, { "epoch": 3.5245237130117553, "grad_norm": 0.005705594085156918, "learning_rate": 2.3773814349412242e-05, "loss": 0.0044, "step": 17390 }, { "epoch": 3.5265504661532225, "grad_norm": 0.01609107106924057, "learning_rate": 2.3672476692338875e-05, "loss": 0.0008, "step": 17400 }, { "epoch": 3.5265504661532225, "eval_accuracy": 0.9743589743589743, "eval_loss": 0.11509870737791061, "eval_runtime": 50.238, "eval_samples_per_second": 10.868, "eval_steps_per_second": 1.373, "step": 17400 }, { "epoch": 3.52857721929469, "grad_norm": 0.005944013595581055, "learning_rate": 2.3571139035265504e-05, "loss": 0.0013, "step": 17410 }, { "epoch": 3.5306039724361575, "grad_norm": 0.3377014398574829, "learning_rate": 2.346980137819214e-05, "loss": 0.0054, "step": 17420 }, { "epoch": 3.532630725577625, "grad_norm": 0.005403846502304077, "learning_rate": 2.336846372111877e-05, "loss": 0.0033, "step": 17430 }, { "epoch": 3.534657478719092, "grad_norm": 0.0827544778585434, "learning_rate": 2.32671260640454e-05, "loss": 0.0028, "step": 17440 }, { "epoch": 3.5366842318605594, "grad_norm": 0.006947341840714216, "learning_rate": 2.3165788406972035e-05, "loss": 0.493, "step": 17450 }, { "epoch": 3.5387109850020266, "grad_norm": 0.005381606053560972, "learning_rate": 2.3064450749898664e-05, "loss": 0.0018, "step": 17460 }, { "epoch": 3.540737738143494, "grad_norm": 0.00517958914861083, "learning_rate": 2.2963113092825293e-05, "loss": 0.0037, "step": 17470 }, { "epoch": 3.5427644912849616, "grad_norm": 0.0052943965420126915, "learning_rate": 2.2861775435751926e-05, "loss": 0.0015, "step": 17480 }, { "epoch": 3.544791244426429, "grad_norm": 0.0053497194312512875, "learning_rate": 2.276043777867856e-05, "loss": 0.0013, "step": 17490 }, { "epoch": 3.546817997567896, "grad_norm": 0.005343489348888397, "learning_rate": 2.265910012160519e-05, "loss": 0.0023, "step": 17500 }, { "epoch": 3.546817997567896, "eval_accuracy": 0.978021978021978, "eval_loss": 0.10926567018032074, "eval_runtime": 50.1658, "eval_samples_per_second": 10.884, "eval_steps_per_second": 1.375, "step": 17500 }, { "epoch": 3.5488447507093634, "grad_norm": 0.05171739682555199, "learning_rate": 2.255776246453182e-05, "loss": 0.0032, "step": 17510 }, { "epoch": 3.550871503850831, "grad_norm": 0.004982843995094299, "learning_rate": 2.2456424807458453e-05, "loss": 0.2611, "step": 17520 }, { "epoch": 3.5528982569922984, "grad_norm": 0.4834737181663513, "learning_rate": 2.2355087150385086e-05, "loss": 0.0103, "step": 17530 }, { "epoch": 3.5549250101337657, "grad_norm": 0.05277622491121292, "learning_rate": 2.2253749493311715e-05, "loss": 0.5101, "step": 17540 }, { "epoch": 3.556951763275233, "grad_norm": 0.005033069755882025, "learning_rate": 2.2152411836238347e-05, "loss": 0.0018, "step": 17550 }, { "epoch": 3.5589785164167003, "grad_norm": 0.005153517704457045, "learning_rate": 2.205107417916498e-05, "loss": 0.0039, "step": 17560 }, { "epoch": 3.5610052695581675, "grad_norm": 0.005008082836866379, "learning_rate": 2.194973652209161e-05, "loss": 0.0012, "step": 17570 }, { "epoch": 3.5630320226996353, "grad_norm": 0.005480339750647545, "learning_rate": 2.1848398865018242e-05, "loss": 0.0027, "step": 17580 }, { "epoch": 3.5650587758411025, "grad_norm": 0.004882423207163811, "learning_rate": 2.1747061207944875e-05, "loss": 0.0011, "step": 17590 }, { "epoch": 3.56708552898257, "grad_norm": 0.006747744046151638, "learning_rate": 2.1645723550871504e-05, "loss": 0.0012, "step": 17600 }, { "epoch": 3.56708552898257, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.0995689406991005, "eval_runtime": 46.2745, "eval_samples_per_second": 11.799, "eval_steps_per_second": 1.491, "step": 17600 }, { "epoch": 3.5691122821240375, "grad_norm": 0.0049659782089293, "learning_rate": 2.1544385893798137e-05, "loss": 0.0018, "step": 17610 }, { "epoch": 3.571139035265505, "grad_norm": 0.0050674243830144405, "learning_rate": 2.144304823672477e-05, "loss": 0.0011, "step": 17620 }, { "epoch": 3.573165788406972, "grad_norm": 0.048832111060619354, "learning_rate": 2.13417105796514e-05, "loss": 0.0035, "step": 17630 }, { "epoch": 3.5751925415484394, "grad_norm": 0.008065484464168549, "learning_rate": 2.124037292257803e-05, "loss": 0.0022, "step": 17640 }, { "epoch": 3.5772192946899066, "grad_norm": 0.004766256082803011, "learning_rate": 2.113903526550466e-05, "loss": 0.0009, "step": 17650 }, { "epoch": 3.579246047831374, "grad_norm": 0.005500113591551781, "learning_rate": 2.1037697608431296e-05, "loss": 0.0011, "step": 17660 }, { "epoch": 3.5812728009728416, "grad_norm": 0.004638390615582466, "learning_rate": 2.0936359951357926e-05, "loss": 0.0018, "step": 17670 }, { "epoch": 3.583299554114309, "grad_norm": 0.0049532013945281506, "learning_rate": 2.0835022294284555e-05, "loss": 0.7605, "step": 17680 }, { "epoch": 3.585326307255776, "grad_norm": 0.005197311285883188, "learning_rate": 2.073368463721119e-05, "loss": 0.0016, "step": 17690 }, { "epoch": 3.587353060397244, "grad_norm": 0.0053718313574790955, "learning_rate": 2.063234698013782e-05, "loss": 0.0016, "step": 17700 }, { "epoch": 3.587353060397244, "eval_accuracy": 0.9816849816849816, "eval_loss": 0.09795909374952316, "eval_runtime": 50.1492, "eval_samples_per_second": 10.888, "eval_steps_per_second": 1.376, "step": 17700 }, { "epoch": 3.589379813538711, "grad_norm": 0.0051916828379035, "learning_rate": 2.0531009323064453e-05, "loss": 0.0016, "step": 17710 }, { "epoch": 3.5914065666801784, "grad_norm": 0.005293379537761211, "learning_rate": 2.0429671665991082e-05, "loss": 0.7548, "step": 17720 }, { "epoch": 3.5934333198216457, "grad_norm": 0.00580208096653223, "learning_rate": 2.0328334008917715e-05, "loss": 0.0006, "step": 17730 }, { "epoch": 3.595460072963113, "grad_norm": 0.005405276548117399, "learning_rate": 2.0226996351844347e-05, "loss": 0.0023, "step": 17740 }, { "epoch": 3.5974868261045803, "grad_norm": 0.0055334181524813175, "learning_rate": 2.0125658694770977e-05, "loss": 0.2852, "step": 17750 }, { "epoch": 3.599513579246048, "grad_norm": 0.00532138254493475, "learning_rate": 2.002432103769761e-05, "loss": 0.0022, "step": 17760 }, { "epoch": 3.6015403323875153, "grad_norm": 0.04558062180876732, "learning_rate": 1.9922983380624242e-05, "loss": 0.0011, "step": 17770 }, { "epoch": 3.6035670855289825, "grad_norm": 0.005479319021105766, "learning_rate": 1.982164572355087e-05, "loss": 0.0017, "step": 17780 }, { "epoch": 3.60559383867045, "grad_norm": 0.005388789344578981, "learning_rate": 1.9720308066477504e-05, "loss": 0.0012, "step": 17790 }, { "epoch": 3.6076205918119175, "grad_norm": 0.00526474229991436, "learning_rate": 1.9618970409404137e-05, "loss": 0.0015, "step": 17800 }, { "epoch": 3.6076205918119175, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10518113523721695, "eval_runtime": 50.1299, "eval_samples_per_second": 10.892, "eval_steps_per_second": 1.376, "step": 17800 }, { "epoch": 3.609647344953385, "grad_norm": 0.005050942301750183, "learning_rate": 1.9517632752330766e-05, "loss": 0.0039, "step": 17810 }, { "epoch": 3.611674098094852, "grad_norm": 0.005604945123195648, "learning_rate": 1.94162950952574e-05, "loss": 0.0021, "step": 17820 }, { "epoch": 3.6137008512363193, "grad_norm": 0.005479446146637201, "learning_rate": 1.931495743818403e-05, "loss": 0.0015, "step": 17830 }, { "epoch": 3.6157276043777866, "grad_norm": 0.004991782829165459, "learning_rate": 1.921361978111066e-05, "loss": 0.001, "step": 17840 }, { "epoch": 3.617754357519254, "grad_norm": 0.005143380258232355, "learning_rate": 1.9112282124037293e-05, "loss": 0.5258, "step": 17850 }, { "epoch": 3.6197811106607216, "grad_norm": 0.005482929293066263, "learning_rate": 1.9010944466963926e-05, "loss": 0.005, "step": 17860 }, { "epoch": 3.621807863802189, "grad_norm": 0.005034882575273514, "learning_rate": 1.8909606809890555e-05, "loss": 0.0011, "step": 17870 }, { "epoch": 3.623834616943656, "grad_norm": 0.005108018405735493, "learning_rate": 1.8808269152817188e-05, "loss": 0.0011, "step": 17880 }, { "epoch": 3.625861370085124, "grad_norm": 0.004975921008735895, "learning_rate": 1.8706931495743817e-05, "loss": 0.0015, "step": 17890 }, { "epoch": 3.627888123226591, "grad_norm": 0.005141813308000565, "learning_rate": 1.8605593838670453e-05, "loss": 0.0018, "step": 17900 }, { "epoch": 3.627888123226591, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10540201514959335, "eval_runtime": 46.0356, "eval_samples_per_second": 11.86, "eval_steps_per_second": 1.499, "step": 17900 }, { "epoch": 3.6299148763680584, "grad_norm": 0.04432378336787224, "learning_rate": 1.8504256181597082e-05, "loss": 0.0026, "step": 17910 }, { "epoch": 3.6319416295095257, "grad_norm": 0.005204243119806051, "learning_rate": 1.840291852452371e-05, "loss": 0.0021, "step": 17920 }, { "epoch": 3.633968382650993, "grad_norm": 0.005375333596020937, "learning_rate": 1.8301580867450347e-05, "loss": 0.7511, "step": 17930 }, { "epoch": 3.6359951357924603, "grad_norm": 0.042768221348524094, "learning_rate": 1.8200243210376977e-05, "loss": 0.0015, "step": 17940 }, { "epoch": 3.638021888933928, "grad_norm": 0.008297201246023178, "learning_rate": 1.809890555330361e-05, "loss": 0.0006, "step": 17950 }, { "epoch": 3.6400486420753952, "grad_norm": 0.005426391493529081, "learning_rate": 1.7997567896230242e-05, "loss": 0.7386, "step": 17960 }, { "epoch": 3.6420753952168625, "grad_norm": 0.005903474520891905, "learning_rate": 1.789623023915687e-05, "loss": 0.0011, "step": 17970 }, { "epoch": 3.64410214835833, "grad_norm": 0.006364389322698116, "learning_rate": 1.7794892582083504e-05, "loss": 0.0018, "step": 17980 }, { "epoch": 3.6461289014997975, "grad_norm": 0.04269855469465256, "learning_rate": 1.7693554925010133e-05, "loss": 0.0057, "step": 17990 }, { "epoch": 3.648155654641265, "grad_norm": 0.04199114069342613, "learning_rate": 1.7592217267936766e-05, "loss": 0.003, "step": 18000 }, { "epoch": 3.648155654641265, "eval_accuracy": 0.978021978021978, "eval_loss": 0.10523013025522232, "eval_runtime": 51.7239, "eval_samples_per_second": 10.556, "eval_steps_per_second": 1.334, "step": 18000 }, { "epoch": 3.650182407782732, "grad_norm": 0.006271640304476023, "learning_rate": 1.74908796108634e-05, "loss": 0.0032, "step": 18010 }, { "epoch": 3.6522091609241993, "grad_norm": 0.005405428819358349, "learning_rate": 1.7389541953790028e-05, "loss": 0.0016, "step": 18020 }, { "epoch": 3.6542359140656666, "grad_norm": 0.005375401582568884, "learning_rate": 1.728820429671666e-05, "loss": 0.001, "step": 18030 }, { "epoch": 3.6562626672071343, "grad_norm": 0.00598566560074687, "learning_rate": 1.7186866639643293e-05, "loss": 0.0015, "step": 18040 }, { "epoch": 3.6582894203486016, "grad_norm": 0.005265843588858843, "learning_rate": 1.7085528982569922e-05, "loss": 0.0011, "step": 18050 }, { "epoch": 3.660316173490069, "grad_norm": 0.040170665830373764, "learning_rate": 1.6984191325496555e-05, "loss": 0.3142, "step": 18060 }, { "epoch": 3.662342926631536, "grad_norm": 0.040453970432281494, "learning_rate": 1.6882853668423188e-05, "loss": 0.0015, "step": 18070 }, { "epoch": 3.664369679773004, "grad_norm": 0.09230988472700119, "learning_rate": 1.6781516011349817e-05, "loss": 0.0015, "step": 18080 }, { "epoch": 3.666396432914471, "grad_norm": 0.006269339006394148, "learning_rate": 1.668017835427645e-05, "loss": 0.0015, "step": 18090 }, { "epoch": 3.6684231860559384, "grad_norm": 0.005188298877328634, "learning_rate": 1.6578840697203082e-05, "loss": 0.002, "step": 18100 }, { "epoch": 3.6684231860559384, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10631264001131058, "eval_runtime": 52.7075, "eval_samples_per_second": 10.359, "eval_steps_per_second": 1.309, "step": 18100 }, { "epoch": 3.6704499391974057, "grad_norm": 0.005731545854359865, "learning_rate": 1.647750304012971e-05, "loss": 0.0024, "step": 18110 }, { "epoch": 3.672476692338873, "grad_norm": 0.005191850941628218, "learning_rate": 1.6376165383056344e-05, "loss": 0.0013, "step": 18120 }, { "epoch": 3.6745034454803402, "grad_norm": 0.005665656179189682, "learning_rate": 1.6274827725982977e-05, "loss": 0.0011, "step": 18130 }, { "epoch": 3.676530198621808, "grad_norm": 0.03839230164885521, "learning_rate": 1.617349006890961e-05, "loss": 0.0014, "step": 18140 }, { "epoch": 3.6785569517632752, "grad_norm": 0.005275613162666559, "learning_rate": 1.607215241183624e-05, "loss": 0.7439, "step": 18150 }, { "epoch": 3.6805837049047425, "grad_norm": 0.005713291000574827, "learning_rate": 1.5970814754762868e-05, "loss": 0.3076, "step": 18160 }, { "epoch": 3.6826104580462102, "grad_norm": 0.005449062213301659, "learning_rate": 1.5869477097689504e-05, "loss": 0.0014, "step": 18170 }, { "epoch": 3.6846372111876775, "grad_norm": 0.0056668976321816444, "learning_rate": 1.5768139440616133e-05, "loss": 0.0006, "step": 18180 }, { "epoch": 3.686663964329145, "grad_norm": 0.006036930251866579, "learning_rate": 1.5666801783542766e-05, "loss": 0.0022, "step": 18190 }, { "epoch": 3.688690717470612, "grad_norm": 0.005712085869163275, "learning_rate": 1.55654641264694e-05, "loss": 0.0011, "step": 18200 }, { "epoch": 3.688690717470612, "eval_accuracy": 0.9761904761904762, "eval_loss": 0.11952827125787735, "eval_runtime": 46.1217, "eval_samples_per_second": 11.838, "eval_steps_per_second": 1.496, "step": 18200 }, { "epoch": 3.6907174706120793, "grad_norm": 0.037588682025671005, "learning_rate": 1.5464126469396028e-05, "loss": 0.0047, "step": 18210 }, { "epoch": 3.6927442237535466, "grad_norm": 0.0060033174231648445, "learning_rate": 1.536278881232266e-05, "loss": 0.2324, "step": 18220 }, { "epoch": 3.6947709768950143, "grad_norm": 0.005877983290702105, "learning_rate": 1.526145115524929e-05, "loss": 0.0008, "step": 18230 }, { "epoch": 3.6967977300364816, "grad_norm": 0.06802316009998322, "learning_rate": 1.5160113498175924e-05, "loss": 0.0017, "step": 18240 }, { "epoch": 3.698824483177949, "grad_norm": 0.0809393897652626, "learning_rate": 1.5058775841102555e-05, "loss": 0.0014, "step": 18250 }, { "epoch": 3.700851236319416, "grad_norm": 0.0054187779314816, "learning_rate": 1.4957438184029184e-05, "loss": 0.001, "step": 18260 }, { "epoch": 3.702877989460884, "grad_norm": 0.005456519313156605, "learning_rate": 1.4856100526955819e-05, "loss": 0.0016, "step": 18270 }, { "epoch": 3.704904742602351, "grad_norm": 0.005753775592893362, "learning_rate": 1.475476286988245e-05, "loss": 0.0015, "step": 18280 }, { "epoch": 3.7069314957438184, "grad_norm": 0.006126856431365013, "learning_rate": 1.465342521280908e-05, "loss": 0.7524, "step": 18290 }, { "epoch": 3.7089582488852857, "grad_norm": 0.03605639562010765, "learning_rate": 1.4552087555735713e-05, "loss": 0.4766, "step": 18300 }, { "epoch": 3.7089582488852857, "eval_accuracy": 0.9835164835164835, "eval_loss": 0.08734480291604996, "eval_runtime": 50.1542, "eval_samples_per_second": 10.886, "eval_steps_per_second": 1.376, "step": 18300 }, { "epoch": 3.710985002026753, "grad_norm": 0.0060221729800105095, "learning_rate": 1.4450749898662344e-05, "loss": 0.001, "step": 18310 }, { "epoch": 3.7130117551682202, "grad_norm": 0.08299941569566727, "learning_rate": 1.4349412241588975e-05, "loss": 0.0013, "step": 18320 }, { "epoch": 3.715038508309688, "grad_norm": 0.006154602859169245, "learning_rate": 1.4248074584515606e-05, "loss": 0.0011, "step": 18330 }, { "epoch": 3.7170652614511552, "grad_norm": 0.03808287903666496, "learning_rate": 1.4146736927442239e-05, "loss": 0.0023, "step": 18340 }, { "epoch": 3.7190920145926225, "grad_norm": 0.005551365204155445, "learning_rate": 1.404539927036887e-05, "loss": 0.0014, "step": 18350 }, { "epoch": 3.7211187677340902, "grad_norm": 0.005745531991124153, "learning_rate": 1.39440616132955e-05, "loss": 0.0014, "step": 18360 }, { "epoch": 3.7231455208755575, "grad_norm": 0.006904330104589462, "learning_rate": 1.3842723956222133e-05, "loss": 0.001, "step": 18370 }, { "epoch": 3.7251722740170248, "grad_norm": 0.006295742932707071, "learning_rate": 1.3741386299148764e-05, "loss": 0.001, "step": 18380 }, { "epoch": 3.727199027158492, "grad_norm": 0.024064790457487106, "learning_rate": 1.3640048642075395e-05, "loss": 0.0018, "step": 18390 }, { "epoch": 3.7292257802999593, "grad_norm": 0.005999110173434019, "learning_rate": 1.3538710985002026e-05, "loss": 0.0026, "step": 18400 }, { "epoch": 3.7292257802999593, "eval_accuracy": 0.9835164835164835, "eval_loss": 0.08764051645994186, "eval_runtime": 49.6519, "eval_samples_per_second": 10.997, "eval_steps_per_second": 1.39, "step": 18400 }, { "epoch": 3.7312525334414266, "grad_norm": 0.005963246803730726, "learning_rate": 1.3437373327928659e-05, "loss": 0.0006, "step": 18410 }, { "epoch": 3.7332792865828943, "grad_norm": 0.00517231086269021, "learning_rate": 1.333603567085529e-05, "loss": 0.001, "step": 18420 }, { "epoch": 3.7353060397243616, "grad_norm": 0.034096866846084595, "learning_rate": 1.323469801378192e-05, "loss": 0.0017, "step": 18430 }, { "epoch": 3.737332792865829, "grad_norm": 0.005253325682133436, "learning_rate": 1.3133360356708555e-05, "loss": 0.001, "step": 18440 }, { "epoch": 3.7393595460072966, "grad_norm": 0.034667935222387314, "learning_rate": 1.3032022699635186e-05, "loss": 0.0009, "step": 18450 }, { "epoch": 3.741386299148764, "grad_norm": 0.0051004644483327866, "learning_rate": 1.2930685042561815e-05, "loss": 0.0006, "step": 18460 }, { "epoch": 3.743413052290231, "grad_norm": 0.005032215733081102, "learning_rate": 1.282934738548845e-05, "loss": 0.0015, "step": 18470 }, { "epoch": 3.7454398054316984, "grad_norm": 0.005197310354560614, "learning_rate": 1.272800972841508e-05, "loss": 0.5533, "step": 18480 }, { "epoch": 3.7474665585731657, "grad_norm": 0.005205828696489334, "learning_rate": 1.2626672071341711e-05, "loss": 0.0021, "step": 18490 }, { "epoch": 3.749493311714633, "grad_norm": 0.005383576266467571, "learning_rate": 1.252533441426834e-05, "loss": 0.0006, "step": 18500 }, { "epoch": 3.749493311714633, "eval_accuracy": 0.9835164835164835, "eval_loss": 0.09415023028850555, "eval_runtime": 47.0485, "eval_samples_per_second": 11.605, "eval_steps_per_second": 1.467, "step": 18500 }, { "epoch": 3.7515200648561007, "grad_norm": 0.034614093601703644, "learning_rate": 1.2423996757194973e-05, "loss": 0.0014, "step": 18510 }, { "epoch": 3.753546817997568, "grad_norm": 0.004989073611795902, "learning_rate": 1.2322659100121606e-05, "loss": 0.0012, "step": 18520 }, { "epoch": 3.7555735711390352, "grad_norm": 0.005143389571458101, "learning_rate": 1.2221321443048237e-05, "loss": 0.0007, "step": 18530 }, { "epoch": 3.7576003242805025, "grad_norm": 0.005387092940509319, "learning_rate": 1.2119983785974868e-05, "loss": 0.0012, "step": 18540 }, { "epoch": 3.75962707742197, "grad_norm": 0.005190431606024504, "learning_rate": 1.20186461289015e-05, "loss": 0.006, "step": 18550 }, { "epoch": 3.7616538305634375, "grad_norm": 0.005023384466767311, "learning_rate": 1.1917308471828132e-05, "loss": 0.001, "step": 18560 }, { "epoch": 3.7636805837049048, "grad_norm": 0.005527614150196314, "learning_rate": 1.1815970814754764e-05, "loss": 0.545, "step": 18570 }, { "epoch": 3.765707336846372, "grad_norm": 0.005237712990492582, "learning_rate": 1.1714633157681395e-05, "loss": 0.0006, "step": 18580 }, { "epoch": 3.7677340899878393, "grad_norm": 0.005363814067095518, "learning_rate": 1.1613295500608026e-05, "loss": 0.0013, "step": 18590 }, { "epoch": 3.7697608431293066, "grad_norm": 0.036191727966070175, "learning_rate": 1.1511957843534659e-05, "loss": 0.0014, "step": 18600 }, { "epoch": 3.7697608431293066, "eval_accuracy": 0.9835164835164835, "eval_loss": 0.09435998648405075, "eval_runtime": 49.8122, "eval_samples_per_second": 10.961, "eval_steps_per_second": 1.385, "step": 18600 }, { "epoch": 3.7717875962707743, "grad_norm": 0.0053384387865662575, "learning_rate": 1.141062018646129e-05, "loss": 0.001, "step": 18610 }, { "epoch": 3.7738143494122416, "grad_norm": 0.0050865658558905125, "learning_rate": 1.130928252938792e-05, "loss": 0.0073, "step": 18620 }, { "epoch": 3.775841102553709, "grad_norm": 0.004879043437540531, "learning_rate": 1.1207944872314553e-05, "loss": 0.0009, "step": 18630 }, { "epoch": 3.7778678556951766, "grad_norm": 0.00479856925085187, "learning_rate": 1.1106607215241184e-05, "loss": 0.0017, "step": 18640 }, { "epoch": 3.779894608836644, "grad_norm": 0.004984683822840452, "learning_rate": 1.1005269558167817e-05, "loss": 0.0013, "step": 18650 }, { "epoch": 3.781921361978111, "grad_norm": 0.004968602210283279, "learning_rate": 1.0903931901094446e-05, "loss": 0.001, "step": 18660 }, { "epoch": 3.7839481151195784, "grad_norm": 0.005061035510152578, "learning_rate": 1.0802594244021079e-05, "loss": 0.0009, "step": 18670 }, { "epoch": 3.7859748682610457, "grad_norm": 0.00481655215844512, "learning_rate": 1.070125658694771e-05, "loss": 0.0017, "step": 18680 }, { "epoch": 3.788001621402513, "grad_norm": 0.005656828172504902, "learning_rate": 1.0599918929874342e-05, "loss": 0.0013, "step": 18690 }, { "epoch": 3.7900283745439807, "grad_norm": 0.004866285249590874, "learning_rate": 1.0498581272800973e-05, "loss": 0.0013, "step": 18700 }, { "epoch": 3.7900283745439807, "eval_accuracy": 0.9816849816849816, "eval_loss": 0.09718893468379974, "eval_runtime": 50.1654, "eval_samples_per_second": 10.884, "eval_steps_per_second": 1.375, "step": 18700 }, { "epoch": 3.792055127685448, "grad_norm": 0.010259117931127548, "learning_rate": 1.0397243615727604e-05, "loss": 0.0006, "step": 18710 }, { "epoch": 3.794081880826915, "grad_norm": 0.004835317377001047, "learning_rate": 1.0295905958654237e-05, "loss": 0.0009, "step": 18720 }, { "epoch": 3.7961086339683825, "grad_norm": 0.005045641213655472, "learning_rate": 1.0194568301580868e-05, "loss": 0.0011, "step": 18730 }, { "epoch": 3.79813538710985, "grad_norm": 0.004773715045303106, "learning_rate": 1.0093230644507499e-05, "loss": 0.001, "step": 18740 }, { "epoch": 3.8001621402513175, "grad_norm": 0.03401784598827362, "learning_rate": 9.991892987434132e-06, "loss": 0.002, "step": 18750 }, { "epoch": 3.8021888933927848, "grad_norm": 0.004623086657375097, "learning_rate": 9.890555330360763e-06, "loss": 0.0017, "step": 18760 }, { "epoch": 3.804215646534252, "grad_norm": 0.004645565524697304, "learning_rate": 9.789217673287395e-06, "loss": 0.001, "step": 18770 }, { "epoch": 3.8062423996757193, "grad_norm": 0.03281537815928459, "learning_rate": 9.687880016214024e-06, "loss": 0.0147, "step": 18780 }, { "epoch": 3.808269152817187, "grad_norm": 0.03286786004900932, "learning_rate": 9.586542359140657e-06, "loss": 0.005, "step": 18790 }, { "epoch": 3.8102959059586543, "grad_norm": 0.02435663342475891, "learning_rate": 9.48520470206729e-06, "loss": 0.0016, "step": 18800 }, { "epoch": 3.8102959059586543, "eval_accuracy": 0.9816849816849816, "eval_loss": 0.10444357991218567, "eval_runtime": 48.0273, "eval_samples_per_second": 11.369, "eval_steps_per_second": 1.437, "step": 18800 }, { "epoch": 3.8123226591001216, "grad_norm": 0.004839787259697914, "learning_rate": 9.38386704499392e-06, "loss": 0.0024, "step": 18810 }, { "epoch": 3.814349412241589, "grad_norm": 0.20349133014678955, "learning_rate": 9.282529387920552e-06, "loss": 0.0021, "step": 18820 }, { "epoch": 3.8163761653830566, "grad_norm": 0.004515075124800205, "learning_rate": 9.181191730847183e-06, "loss": 0.0034, "step": 18830 }, { "epoch": 3.818402918524524, "grad_norm": 0.03329044207930565, "learning_rate": 9.079854073773815e-06, "loss": 0.0012, "step": 18840 }, { "epoch": 3.820429671665991, "grad_norm": 0.004499832168221474, "learning_rate": 8.978516416700446e-06, "loss": 0.0005, "step": 18850 }, { "epoch": 3.8224564248074584, "grad_norm": 0.004518959205597639, "learning_rate": 8.877178759627077e-06, "loss": 0.0018, "step": 18860 }, { "epoch": 3.8244831779489257, "grad_norm": 0.004625269211828709, "learning_rate": 8.77584110255371e-06, "loss": 0.0009, "step": 18870 }, { "epoch": 3.826509931090393, "grad_norm": 0.00611333129927516, "learning_rate": 8.67450344548034e-06, "loss": 0.0012, "step": 18880 }, { "epoch": 3.8285366842318607, "grad_norm": 0.004493276588618755, "learning_rate": 8.573165788406973e-06, "loss": 1.116, "step": 18890 }, { "epoch": 3.830563437373328, "grad_norm": 0.004905337002128363, "learning_rate": 8.471828131333603e-06, "loss": 0.0009, "step": 18900 }, { "epoch": 3.830563437373328, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10386329889297485, "eval_runtime": 48.9261, "eval_samples_per_second": 11.16, "eval_steps_per_second": 1.41, "step": 18900 }, { "epoch": 3.832590190514795, "grad_norm": 0.004614606965333223, "learning_rate": 8.370490474260235e-06, "loss": 0.0012, "step": 18910 }, { "epoch": 3.834616943656263, "grad_norm": 0.006155083421617746, "learning_rate": 8.269152817186868e-06, "loss": 0.0009, "step": 18920 }, { "epoch": 3.83664369679773, "grad_norm": 0.03494027629494667, "learning_rate": 8.167815160113499e-06, "loss": 0.0013, "step": 18930 }, { "epoch": 3.8386704499391975, "grad_norm": 0.004626502748578787, "learning_rate": 8.06647750304013e-06, "loss": 0.0005, "step": 18940 }, { "epoch": 3.8406972030806648, "grad_norm": 0.004527637269347906, "learning_rate": 7.96513984596676e-06, "loss": 0.0009, "step": 18950 }, { "epoch": 3.842723956222132, "grad_norm": 0.03365058824419975, "learning_rate": 7.863802188893393e-06, "loss": 0.003, "step": 18960 }, { "epoch": 3.8447507093635993, "grad_norm": 0.004913891199976206, "learning_rate": 7.762464531820026e-06, "loss": 0.0012, "step": 18970 }, { "epoch": 3.846777462505067, "grad_norm": 0.004651523195207119, "learning_rate": 7.661126874746655e-06, "loss": 0.0012, "step": 18980 }, { "epoch": 3.8488042156465343, "grad_norm": 0.004609841853380203, "learning_rate": 7.559789217673288e-06, "loss": 0.0005, "step": 18990 }, { "epoch": 3.8508309687880016, "grad_norm": 0.004604669287800789, "learning_rate": 7.458451560599919e-06, "loss": 0.0008, "step": 19000 }, { "epoch": 3.8508309687880016, "eval_accuracy": 0.9816849816849816, "eval_loss": 0.09755395352840424, "eval_runtime": 50.2984, "eval_samples_per_second": 10.855, "eval_steps_per_second": 1.372, "step": 19000 }, { "epoch": 3.852857721929469, "grad_norm": 0.004643167834728956, "learning_rate": 7.357113903526551e-06, "loss": 0.0008, "step": 19010 }, { "epoch": 3.8548844750709366, "grad_norm": 0.0045625087805092335, "learning_rate": 7.255776246453182e-06, "loss": 0.0008, "step": 19020 }, { "epoch": 3.856911228212404, "grad_norm": 0.009439327754080296, "learning_rate": 7.1544385893798136e-06, "loss": 0.0012, "step": 19030 }, { "epoch": 3.858937981353871, "grad_norm": 0.11327062547206879, "learning_rate": 7.053100932306446e-06, "loss": 0.0021, "step": 19040 }, { "epoch": 3.8609647344953384, "grad_norm": 0.004486795514822006, "learning_rate": 6.951763275233076e-06, "loss": 0.0012, "step": 19050 }, { "epoch": 3.8629914876368057, "grad_norm": 0.004348400980234146, "learning_rate": 6.850425618159709e-06, "loss": 0.0009, "step": 19060 }, { "epoch": 3.865018240778273, "grad_norm": 0.004538117442280054, "learning_rate": 6.749087961086339e-06, "loss": 0.5545, "step": 19070 }, { "epoch": 3.8670449939197407, "grad_norm": 0.004487019032239914, "learning_rate": 6.647750304012972e-06, "loss": 0.0023, "step": 19080 }, { "epoch": 3.869071747061208, "grad_norm": 0.004544503055512905, "learning_rate": 6.5464126469396035e-06, "loss": 0.0005, "step": 19090 }, { "epoch": 3.871098500202675, "grad_norm": 0.004624968860298395, "learning_rate": 6.4450749898662345e-06, "loss": 0.0005, "step": 19100 }, { "epoch": 3.871098500202675, "eval_accuracy": 0.9835164835164835, "eval_loss": 0.09687044471502304, "eval_runtime": 49.0878, "eval_samples_per_second": 11.123, "eval_steps_per_second": 1.406, "step": 19100 }, { "epoch": 3.873125253344143, "grad_norm": 0.004524781834334135, "learning_rate": 6.343737332792866e-06, "loss": 0.001, "step": 19110 }, { "epoch": 3.87515200648561, "grad_norm": 0.004462063778191805, "learning_rate": 6.242399675719498e-06, "loss": 0.0012, "step": 19120 }, { "epoch": 3.8771787596270775, "grad_norm": 0.10787799954414368, "learning_rate": 6.141062018646129e-06, "loss": 0.0019, "step": 19130 }, { "epoch": 3.8792055127685448, "grad_norm": 0.004407837055623531, "learning_rate": 6.039724361572761e-06, "loss": 0.5658, "step": 19140 }, { "epoch": 3.881232265910012, "grad_norm": 0.006913017015904188, "learning_rate": 5.938386704499392e-06, "loss": 0.0009, "step": 19150 }, { "epoch": 3.8832590190514793, "grad_norm": 0.0045287227258086205, "learning_rate": 5.837049047426024e-06, "loss": 0.534, "step": 19160 }, { "epoch": 3.885285772192947, "grad_norm": 0.0046244170516729355, "learning_rate": 5.735711390352655e-06, "loss": 0.0054, "step": 19170 }, { "epoch": 3.8873125253344143, "grad_norm": 0.004677692428231239, "learning_rate": 5.634373733279287e-06, "loss": 0.0017, "step": 19180 }, { "epoch": 3.8893392784758816, "grad_norm": 0.03539438173174858, "learning_rate": 5.533036076205918e-06, "loss": 0.7679, "step": 19190 }, { "epoch": 3.891366031617349, "grad_norm": 0.004666884895414114, "learning_rate": 5.43169841913255e-06, "loss": 0.0009, "step": 19200 }, { "epoch": 3.891366031617349, "eval_accuracy": 0.9835164835164835, "eval_loss": 0.09635677188634872, "eval_runtime": 46.483, "eval_samples_per_second": 11.746, "eval_steps_per_second": 1.484, "step": 19200 }, { "epoch": 3.8933927847588166, "grad_norm": 0.004748017527163029, "learning_rate": 5.330360762059181e-06, "loss": 0.0012, "step": 19210 }, { "epoch": 3.895419537900284, "grad_norm": 0.15614542365074158, "learning_rate": 5.229023104985813e-06, "loss": 0.0016, "step": 19220 }, { "epoch": 3.897446291041751, "grad_norm": 0.00450799660757184, "learning_rate": 5.1276854479124445e-06, "loss": 0.2846, "step": 19230 }, { "epoch": 3.8994730441832184, "grad_norm": 0.004577334504574537, "learning_rate": 5.026347790839076e-06, "loss": 0.0015, "step": 19240 }, { "epoch": 3.9014997973246857, "grad_norm": 0.004710934590548277, "learning_rate": 4.925010133765707e-06, "loss": 0.0013, "step": 19250 }, { "epoch": 3.9035265504661534, "grad_norm": 0.03456436097621918, "learning_rate": 4.823672476692339e-06, "loss": 0.0014, "step": 19260 }, { "epoch": 3.9055533036076207, "grad_norm": 0.01188341248780489, "learning_rate": 4.72233481961897e-06, "loss": 0.2853, "step": 19270 }, { "epoch": 3.907580056749088, "grad_norm": 0.03473988547921181, "learning_rate": 4.620997162545603e-06, "loss": 0.5524, "step": 19280 }, { "epoch": 3.909606809890555, "grad_norm": 0.0046269637532532215, "learning_rate": 4.519659505472234e-06, "loss": 0.0013, "step": 19290 }, { "epoch": 3.911633563032023, "grad_norm": 0.0044867959804832935, "learning_rate": 4.4183218483988654e-06, "loss": 0.0005, "step": 19300 }, { "epoch": 3.911633563032023, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10195985436439514, "eval_runtime": 50.1764, "eval_samples_per_second": 10.882, "eval_steps_per_second": 1.375, "step": 19300 }, { "epoch": 3.91366031617349, "grad_norm": 0.004721532575786114, "learning_rate": 4.316984191325496e-06, "loss": 0.0005, "step": 19310 }, { "epoch": 3.9156870693149575, "grad_norm": 0.0053703757002949715, "learning_rate": 4.215646534252128e-06, "loss": 0.0009, "step": 19320 }, { "epoch": 3.9177138224564247, "grad_norm": 0.03514038398861885, "learning_rate": 4.114308877178759e-06, "loss": 0.4019, "step": 19330 }, { "epoch": 3.919740575597892, "grad_norm": 0.00470903841778636, "learning_rate": 4.012971220105392e-06, "loss": 0.0019, "step": 19340 }, { "epoch": 3.9217673287393593, "grad_norm": 0.005483035929501057, "learning_rate": 3.911633563032023e-06, "loss": 0.0005, "step": 19350 }, { "epoch": 3.923794081880827, "grad_norm": 0.0046259555965662, "learning_rate": 3.8102959059586546e-06, "loss": 0.002, "step": 19360 }, { "epoch": 3.9258208350222943, "grad_norm": 0.0044891200959682465, "learning_rate": 3.708958248885286e-06, "loss": 0.0005, "step": 19370 }, { "epoch": 3.9278475881637616, "grad_norm": 0.004647380672395229, "learning_rate": 3.6076205918119173e-06, "loss": 0.0009, "step": 19380 }, { "epoch": 3.9298743413052293, "grad_norm": 0.005284647457301617, "learning_rate": 3.5062829347385487e-06, "loss": 0.0014, "step": 19390 }, { "epoch": 3.9319010944466966, "grad_norm": 0.46924781799316406, "learning_rate": 3.404945277665181e-06, "loss": 0.5488, "step": 19400 }, { "epoch": 3.9319010944466966, "eval_accuracy": 0.9816849816849816, "eval_loss": 0.09855089336633682, "eval_runtime": 49.0745, "eval_samples_per_second": 11.126, "eval_steps_per_second": 1.406, "step": 19400 }, { "epoch": 3.933927847588164, "grad_norm": 0.004715560935437679, "learning_rate": 3.3036076205918123e-06, "loss": 0.002, "step": 19410 }, { "epoch": 3.935954600729631, "grad_norm": 0.03552348166704178, "learning_rate": 3.2022699635184437e-06, "loss": 0.5464, "step": 19420 }, { "epoch": 3.9379813538710984, "grad_norm": 0.6693766713142395, "learning_rate": 3.100932306445075e-06, "loss": 0.0054, "step": 19430 }, { "epoch": 3.9400081070125657, "grad_norm": 0.03712967783212662, "learning_rate": 2.999594649371707e-06, "loss": 0.2938, "step": 19440 }, { "epoch": 3.9420348601540334, "grad_norm": 0.004706588573753834, "learning_rate": 2.8982569922983382e-06, "loss": 0.0013, "step": 19450 }, { "epoch": 3.9440616132955006, "grad_norm": 0.004487375728785992, "learning_rate": 2.7969193352249696e-06, "loss": 0.0005, "step": 19460 }, { "epoch": 3.946088366436968, "grad_norm": 0.004523433744907379, "learning_rate": 2.6955816781516014e-06, "loss": 0.002, "step": 19470 }, { "epoch": 3.948115119578435, "grad_norm": 0.004606922157108784, "learning_rate": 2.594244021078233e-06, "loss": 0.0006, "step": 19480 }, { "epoch": 3.950141872719903, "grad_norm": 0.006367456633597612, "learning_rate": 2.492906364004864e-06, "loss": 0.0006, "step": 19490 }, { "epoch": 3.95216862586137, "grad_norm": 0.004643953405320644, "learning_rate": 2.391568706931496e-06, "loss": 0.0014, "step": 19500 }, { "epoch": 3.95216862586137, "eval_accuracy": 0.9835164835164835, "eval_loss": 0.0963364690542221, "eval_runtime": 46.4967, "eval_samples_per_second": 11.743, "eval_steps_per_second": 1.484, "step": 19500 }, { "epoch": 3.9541953790028375, "grad_norm": 0.00468785222619772, "learning_rate": 2.2902310498581274e-06, "loss": 0.0009, "step": 19510 }, { "epoch": 3.9562221321443047, "grad_norm": 2.83689546585083, "learning_rate": 2.188893392784759e-06, "loss": 0.0027, "step": 19520 }, { "epoch": 3.958248885285772, "grad_norm": 0.0045558735728263855, "learning_rate": 2.0875557357113905e-06, "loss": 0.0009, "step": 19530 }, { "epoch": 3.9602756384272393, "grad_norm": 0.004627866670489311, "learning_rate": 1.986218078638022e-06, "loss": 0.0009, "step": 19540 }, { "epoch": 3.962302391568707, "grad_norm": 0.035609010607004166, "learning_rate": 1.8848804215646537e-06, "loss": 0.0013, "step": 19550 }, { "epoch": 3.9643291447101743, "grad_norm": 0.004579546395689249, "learning_rate": 1.7835427644912851e-06, "loss": 0.0014, "step": 19560 }, { "epoch": 3.9663558978516416, "grad_norm": 0.03501616045832634, "learning_rate": 1.6822051074179165e-06, "loss": 0.0016, "step": 19570 }, { "epoch": 3.9683826509931093, "grad_norm": 0.00457020616158843, "learning_rate": 1.5808674503445483e-06, "loss": 0.001, "step": 19580 }, { "epoch": 3.9704094041345765, "grad_norm": 0.004598743282258511, "learning_rate": 1.4795297932711797e-06, "loss": 0.0011, "step": 19590 }, { "epoch": 3.972436157276044, "grad_norm": 0.004656678065657616, "learning_rate": 1.3781921361978113e-06, "loss": 0.001, "step": 19600 }, { "epoch": 3.972436157276044, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10372201353311539, "eval_runtime": 49.7746, "eval_samples_per_second": 10.969, "eval_steps_per_second": 1.386, "step": 19600 }, { "epoch": 3.974462910417511, "grad_norm": 0.03887145593762398, "learning_rate": 1.2768544791244426e-06, "loss": 0.001, "step": 19610 }, { "epoch": 3.9764896635589784, "grad_norm": 0.03490873798727989, "learning_rate": 1.1755168220510742e-06, "loss": 0.0013, "step": 19620 }, { "epoch": 3.9785164167004456, "grad_norm": 0.00451950216665864, "learning_rate": 1.0741791649777058e-06, "loss": 0.002, "step": 19630 }, { "epoch": 3.9805431698419134, "grad_norm": 0.00451238127425313, "learning_rate": 9.728415079043374e-07, "loss": 0.0019, "step": 19640 }, { "epoch": 3.9825699229833806, "grad_norm": 0.004648827016353607, "learning_rate": 8.715038508309688e-07, "loss": 0.0005, "step": 19650 }, { "epoch": 3.984596676124848, "grad_norm": 0.004489494021981955, "learning_rate": 7.701661937576004e-07, "loss": 0.0013, "step": 19660 }, { "epoch": 3.9866234292663156, "grad_norm": 0.004483028780668974, "learning_rate": 6.688285366842319e-07, "loss": 0.5184, "step": 19670 }, { "epoch": 3.988650182407783, "grad_norm": 0.004603186156600714, "learning_rate": 5.674908796108635e-07, "loss": 0.0027, "step": 19680 }, { "epoch": 3.99067693554925, "grad_norm": 0.00456126919016242, "learning_rate": 4.6615322253749494e-07, "loss": 0.3085, "step": 19690 }, { "epoch": 3.9927036886907175, "grad_norm": 0.004487769678235054, "learning_rate": 3.648155654641265e-07, "loss": 0.0009, "step": 19700 }, { "epoch": 3.9927036886907175, "eval_accuracy": 0.9798534798534798, "eval_loss": 0.10453352332115173, "eval_runtime": 50.4335, "eval_samples_per_second": 10.826, "eval_steps_per_second": 1.368, "step": 19700 }, { "epoch": 3.9947304418321847, "grad_norm": 0.004525844007730484, "learning_rate": 2.63477908390758e-07, "loss": 0.0013, "step": 19710 }, { "epoch": 3.996757194973652, "grad_norm": 0.004443651530891657, "learning_rate": 1.6214025131738955e-07, "loss": 0.0009, "step": 19720 }, { "epoch": 3.9987839481151197, "grad_norm": 0.004505722783505917, "learning_rate": 6.080259424402109e-08, "loss": 0.0013, "step": 19730 }, { "epoch": 4.0, "step": 19736, "total_flos": 1.5293819070335877e+18, "train_loss": 0.20495193709398876, "train_runtime": 28441.8571, "train_samples_per_second": 0.694, "train_steps_per_second": 0.694 } ], "logging_steps": 10, "max_steps": 19736, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5293819070335877e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }