{ "best_metric": 0.8853943711763073, "best_model_checkpoint": "/workspace/previous_works/M3D/LaMed/output/LaMed-Llama3-8B-finetune-0000/checkpoint-12888", "epoch": 3.0, "eval_steps": 4296, "global_step": 14319, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0031426775612822125, "grad_norm": 6.532504558563232, "learning_rate": 1.744186046511628e-06, "loss": 1.9456, "step": 15 }, { "epoch": 0.006285355122564425, "grad_norm": 4.389227867126465, "learning_rate": 3.488372093023256e-06, "loss": 1.8427, "step": 30 }, { "epoch": 0.009428032683846637, "grad_norm": 3.4557132720947266, "learning_rate": 5.232558139534884e-06, "loss": 1.6337, "step": 45 }, { "epoch": 0.01257071024512885, "grad_norm": 3.462625503540039, "learning_rate": 6.976744186046512e-06, "loss": 1.3449, "step": 60 }, { "epoch": 0.01571338780641106, "grad_norm": 3.7610018253326416, "learning_rate": 8.72093023255814e-06, "loss": 1.1347, "step": 75 }, { "epoch": 0.018856065367693273, "grad_norm": 3.2558743953704834, "learning_rate": 1.0465116279069768e-05, "loss": 0.9932, "step": 90 }, { "epoch": 0.02199874292897549, "grad_norm": 4.160295486450195, "learning_rate": 1.2209302325581395e-05, "loss": 0.9954, "step": 105 }, { "epoch": 0.0251414204902577, "grad_norm": 3.3803467750549316, "learning_rate": 1.3953488372093024e-05, "loss": 0.8322, "step": 120 }, { "epoch": 0.028284098051539912, "grad_norm": 3.2412078380584717, "learning_rate": 1.569767441860465e-05, "loss": 0.8286, "step": 135 }, { "epoch": 0.03142677561282212, "grad_norm": 3.4582881927490234, "learning_rate": 1.744186046511628e-05, "loss": 0.7777, "step": 150 }, { "epoch": 0.034569453174104335, "grad_norm": 3.038137435913086, "learning_rate": 1.918604651162791e-05, "loss": 0.7253, "step": 165 }, { "epoch": 0.03771213073538655, "grad_norm": 3.4821434020996094, "learning_rate": 2.0930232558139536e-05, "loss": 0.7581, "step": 180 }, { "epoch": 0.04085480829666876, "grad_norm": 4.621170520782471, "learning_rate": 2.2674418604651163e-05, "loss": 0.7054, "step": 195 }, { "epoch": 0.04399748585795098, "grad_norm": 2.803231716156006, "learning_rate": 2.441860465116279e-05, "loss": 0.7732, "step": 210 }, { "epoch": 0.04714016341923319, "grad_norm": 3.1358466148376465, "learning_rate": 2.616279069767442e-05, "loss": 0.6582, "step": 225 }, { "epoch": 0.0502828409805154, "grad_norm": 2.628765106201172, "learning_rate": 2.7906976744186048e-05, "loss": 0.6487, "step": 240 }, { "epoch": 0.05342551854179761, "grad_norm": 3.6059532165527344, "learning_rate": 2.9651162790697678e-05, "loss": 0.589, "step": 255 }, { "epoch": 0.056568196103079824, "grad_norm": 2.951493263244629, "learning_rate": 3.13953488372093e-05, "loss": 0.6081, "step": 270 }, { "epoch": 0.059710873664362035, "grad_norm": 2.9226279258728027, "learning_rate": 3.313953488372093e-05, "loss": 0.6117, "step": 285 }, { "epoch": 0.06285355122564425, "grad_norm": 3.403846263885498, "learning_rate": 3.488372093023256e-05, "loss": 0.6731, "step": 300 }, { "epoch": 0.06599622878692646, "grad_norm": 2.577772617340088, "learning_rate": 3.662790697674418e-05, "loss": 0.6461, "step": 315 }, { "epoch": 0.06913890634820867, "grad_norm": 3.0141305923461914, "learning_rate": 3.837209302325582e-05, "loss": 0.6386, "step": 330 }, { "epoch": 0.07228158390949088, "grad_norm": 2.3152832984924316, "learning_rate": 4.0116279069767444e-05, "loss": 0.5524, "step": 345 }, { "epoch": 0.0754242614707731, "grad_norm": 2.8160572052001953, "learning_rate": 4.186046511627907e-05, "loss": 0.6205, "step": 360 }, { "epoch": 0.0785669390320553, "grad_norm": 2.3307974338531494, "learning_rate": 4.36046511627907e-05, "loss": 0.6004, "step": 375 }, { "epoch": 0.08170961659333752, "grad_norm": 2.2888669967651367, "learning_rate": 4.5348837209302326e-05, "loss": 0.5461, "step": 390 }, { "epoch": 0.08485229415461974, "grad_norm": 2.36181378364563, "learning_rate": 4.709302325581396e-05, "loss": 0.5971, "step": 405 }, { "epoch": 0.08799497171590195, "grad_norm": 2.1626923084259033, "learning_rate": 4.883720930232558e-05, "loss": 0.5446, "step": 420 }, { "epoch": 0.09113764927718417, "grad_norm": 2.3800854682922363, "learning_rate": 4.999998401149839e-05, "loss": 0.6413, "step": 435 }, { "epoch": 0.09428032683846638, "grad_norm": 2.2933521270751953, "learning_rate": 4.999974418438328e-05, "loss": 0.5955, "step": 450 }, { "epoch": 0.09742300439974859, "grad_norm": 2.338463306427002, "learning_rate": 4.999921656742949e-05, "loss": 0.5819, "step": 465 }, { "epoch": 0.1005656819610308, "grad_norm": 2.9759883880615234, "learning_rate": 4.9998401166710804e-05, "loss": 0.5898, "step": 480 }, { "epoch": 0.10370835952231301, "grad_norm": 2.243450880050659, "learning_rate": 4.999729799161389e-05, "loss": 0.623, "step": 495 }, { "epoch": 0.10685103708359522, "grad_norm": 2.647433280944824, "learning_rate": 4.9995907054838166e-05, "loss": 0.5426, "step": 510 }, { "epoch": 0.10999371464487744, "grad_norm": 2.0400497913360596, "learning_rate": 4.99942283723957e-05, "loss": 0.6028, "step": 525 }, { "epoch": 0.11313639220615965, "grad_norm": 2.918405771255493, "learning_rate": 4.999226196361099e-05, "loss": 0.5556, "step": 540 }, { "epoch": 0.11627906976744186, "grad_norm": 2.571192741394043, "learning_rate": 4.999000785112079e-05, "loss": 0.5625, "step": 555 }, { "epoch": 0.11942174732872407, "grad_norm": 2.483920097351074, "learning_rate": 4.998746606087377e-05, "loss": 0.6185, "step": 570 }, { "epoch": 0.12256442489000628, "grad_norm": 2.963257312774658, "learning_rate": 4.9984636622130285e-05, "loss": 0.5841, "step": 585 }, { "epoch": 0.1257071024512885, "grad_norm": 2.1929099559783936, "learning_rate": 4.998151956746204e-05, "loss": 0.5831, "step": 600 }, { "epoch": 0.12884978001257072, "grad_norm": 1.990614891052246, "learning_rate": 4.997811493275165e-05, "loss": 0.5116, "step": 615 }, { "epoch": 0.13199245757385292, "grad_norm": 2.227179527282715, "learning_rate": 4.997442275719229e-05, "loss": 0.59, "step": 630 }, { "epoch": 0.13513513513513514, "grad_norm": 1.7978647947311401, "learning_rate": 4.997044308328722e-05, "loss": 0.4995, "step": 645 }, { "epoch": 0.13827781269641734, "grad_norm": 2.2707254886627197, "learning_rate": 4.9966175956849306e-05, "loss": 0.5299, "step": 660 }, { "epoch": 0.14142049025769957, "grad_norm": 2.358933687210083, "learning_rate": 4.996162142700045e-05, "loss": 0.597, "step": 675 }, { "epoch": 0.14456316781898176, "grad_norm": 2.036271333694458, "learning_rate": 4.995677954617112e-05, "loss": 0.5392, "step": 690 }, { "epoch": 0.147705845380264, "grad_norm": 2.3753066062927246, "learning_rate": 4.995165037009962e-05, "loss": 0.5778, "step": 705 }, { "epoch": 0.1508485229415462, "grad_norm": 1.849295973777771, "learning_rate": 4.994623395783157e-05, "loss": 0.6238, "step": 720 }, { "epoch": 0.1539912005028284, "grad_norm": 2.010460376739502, "learning_rate": 4.994053037171912e-05, "loss": 0.4691, "step": 735 }, { "epoch": 0.1571338780641106, "grad_norm": 2.023106575012207, "learning_rate": 4.993453967742032e-05, "loss": 0.5377, "step": 750 }, { "epoch": 0.16027655562539284, "grad_norm": 2.195887804031372, "learning_rate": 4.9928261943898315e-05, "loss": 0.5639, "step": 765 }, { "epoch": 0.16341923318667503, "grad_norm": 1.9283181428909302, "learning_rate": 4.9921697243420564e-05, "loss": 0.5141, "step": 780 }, { "epoch": 0.16656191074795726, "grad_norm": 1.7017083168029785, "learning_rate": 4.9914845651557985e-05, "loss": 0.5132, "step": 795 }, { "epoch": 0.16970458830923948, "grad_norm": 2.1977009773254395, "learning_rate": 4.990770724718415e-05, "loss": 0.5415, "step": 810 }, { "epoch": 0.17284726587052168, "grad_norm": 1.9427462816238403, "learning_rate": 4.99002821124743e-05, "loss": 0.5381, "step": 825 }, { "epoch": 0.1759899434318039, "grad_norm": 2.5321216583251953, "learning_rate": 4.989257033290443e-05, "loss": 0.5512, "step": 840 }, { "epoch": 0.1791326209930861, "grad_norm": 1.7843250036239624, "learning_rate": 4.988457199725034e-05, "loss": 0.5028, "step": 855 }, { "epoch": 0.18227529855436833, "grad_norm": 2.1043522357940674, "learning_rate": 4.987628719758655e-05, "loss": 0.5928, "step": 870 }, { "epoch": 0.18541797611565053, "grad_norm": 2.0235021114349365, "learning_rate": 4.9867716029285284e-05, "loss": 0.5651, "step": 885 }, { "epoch": 0.18856065367693275, "grad_norm": 1.885472059249878, "learning_rate": 4.985885859101536e-05, "loss": 0.4879, "step": 900 }, { "epoch": 0.19170333123821495, "grad_norm": 1.9070786237716675, "learning_rate": 4.9849714984741046e-05, "loss": 0.4901, "step": 915 }, { "epoch": 0.19484600879949718, "grad_norm": 2.001380681991577, "learning_rate": 4.984028531572091e-05, "loss": 0.574, "step": 930 }, { "epoch": 0.19798868636077938, "grad_norm": 1.9602166414260864, "learning_rate": 4.9830569692506564e-05, "loss": 0.5307, "step": 945 }, { "epoch": 0.2011313639220616, "grad_norm": 2.094599485397339, "learning_rate": 4.9820568226941466e-05, "loss": 0.5821, "step": 960 }, { "epoch": 0.2042740414833438, "grad_norm": 2.0091841220855713, "learning_rate": 4.98102810341596e-05, "loss": 0.5969, "step": 975 }, { "epoch": 0.20741671904462602, "grad_norm": 2.306108236312866, "learning_rate": 4.979970823258415e-05, "loss": 0.5745, "step": 990 }, { "epoch": 0.21055939660590822, "grad_norm": 1.636775255203247, "learning_rate": 4.978884994392618e-05, "loss": 0.6422, "step": 1005 }, { "epoch": 0.21370207416719045, "grad_norm": 2.4798927307128906, "learning_rate": 4.9777706293183154e-05, "loss": 0.5046, "step": 1020 }, { "epoch": 0.21684475172847265, "grad_norm": 1.804826259613037, "learning_rate": 4.976627740863756e-05, "loss": 0.5399, "step": 1035 }, { "epoch": 0.21998742928975487, "grad_norm": 2.0178399085998535, "learning_rate": 4.975456342185544e-05, "loss": 0.5123, "step": 1050 }, { "epoch": 0.2231301068510371, "grad_norm": 2.50925350189209, "learning_rate": 4.9742564467684805e-05, "loss": 0.4928, "step": 1065 }, { "epoch": 0.2262727844123193, "grad_norm": 1.973009705543518, "learning_rate": 4.9730280684254166e-05, "loss": 0.5736, "step": 1080 }, { "epoch": 0.22941546197360152, "grad_norm": 1.8204375505447388, "learning_rate": 4.971771221297088e-05, "loss": 0.4693, "step": 1095 }, { "epoch": 0.23255813953488372, "grad_norm": 2.157780647277832, "learning_rate": 4.970485919851958e-05, "loss": 0.5993, "step": 1110 }, { "epoch": 0.23570081709616594, "grad_norm": 2.113952398300171, "learning_rate": 4.9691721788860433e-05, "loss": 0.5987, "step": 1125 }, { "epoch": 0.23884349465744814, "grad_norm": 2.577479124069214, "learning_rate": 4.967830013522753e-05, "loss": 0.5443, "step": 1140 }, { "epoch": 0.24198617221873037, "grad_norm": 1.7032134532928467, "learning_rate": 4.966459439212706e-05, "loss": 0.5301, "step": 1155 }, { "epoch": 0.24512884978001256, "grad_norm": 1.8560705184936523, "learning_rate": 4.965060471733559e-05, "loss": 0.5027, "step": 1170 }, { "epoch": 0.2482715273412948, "grad_norm": 1.7248977422714233, "learning_rate": 4.963633127189821e-05, "loss": 0.5522, "step": 1185 }, { "epoch": 0.251414204902577, "grad_norm": 1.6348320245742798, "learning_rate": 4.9621774220126694e-05, "loss": 0.48, "step": 1200 }, { "epoch": 0.2545568824638592, "grad_norm": 1.7352231740951538, "learning_rate": 4.960693372959764e-05, "loss": 0.5886, "step": 1215 }, { "epoch": 0.25769956002514144, "grad_norm": 2.1465370655059814, "learning_rate": 4.959180997115049e-05, "loss": 0.5238, "step": 1230 }, { "epoch": 0.2608422375864236, "grad_norm": 1.7073941230773926, "learning_rate": 4.957640311888557e-05, "loss": 0.487, "step": 1245 }, { "epoch": 0.26398491514770583, "grad_norm": 1.8688887357711792, "learning_rate": 4.9560713350162137e-05, "loss": 0.5792, "step": 1260 }, { "epoch": 0.26712759270898806, "grad_norm": 2.24149227142334, "learning_rate": 4.9544740845596254e-05, "loss": 0.4613, "step": 1275 }, { "epoch": 0.2702702702702703, "grad_norm": 1.6652510166168213, "learning_rate": 4.9528485789058805e-05, "loss": 0.4311, "step": 1290 }, { "epoch": 0.27341294783155246, "grad_norm": 1.6432390213012695, "learning_rate": 4.951194836767329e-05, "loss": 0.5199, "step": 1305 }, { "epoch": 0.2765556253928347, "grad_norm": 1.566832184791565, "learning_rate": 4.9495128771813755e-05, "loss": 0.4897, "step": 1320 }, { "epoch": 0.2796983029541169, "grad_norm": 1.6974416971206665, "learning_rate": 4.94780271951025e-05, "loss": 0.5192, "step": 1335 }, { "epoch": 0.28284098051539913, "grad_norm": 1.9494693279266357, "learning_rate": 4.946064383440798e-05, "loss": 0.4957, "step": 1350 }, { "epoch": 0.28598365807668136, "grad_norm": 2.093959331512451, "learning_rate": 4.944297888984239e-05, "loss": 0.5164, "step": 1365 }, { "epoch": 0.2891263356379635, "grad_norm": 1.9262990951538086, "learning_rate": 4.9425032564759485e-05, "loss": 0.504, "step": 1380 }, { "epoch": 0.29226901319924575, "grad_norm": 1.8158432245254517, "learning_rate": 4.940680506575218e-05, "loss": 0.4649, "step": 1395 }, { "epoch": 0.295411690760528, "grad_norm": 1.7862390279769897, "learning_rate": 4.9388296602650185e-05, "loss": 0.5356, "step": 1410 }, { "epoch": 0.2985543683218102, "grad_norm": 2.2066242694854736, "learning_rate": 4.936950738851758e-05, "loss": 0.5076, "step": 1425 }, { "epoch": 0.3016970458830924, "grad_norm": 2.2866694927215576, "learning_rate": 4.935043763965038e-05, "loss": 0.4621, "step": 1440 }, { "epoch": 0.3048397234443746, "grad_norm": 1.6391174793243408, "learning_rate": 4.933108757557402e-05, "loss": 0.4651, "step": 1455 }, { "epoch": 0.3079824010056568, "grad_norm": 2.0994527339935303, "learning_rate": 4.9311457419040866e-05, "loss": 0.5533, "step": 1470 }, { "epoch": 0.31112507856693905, "grad_norm": 1.7273298501968384, "learning_rate": 4.9291547396027594e-05, "loss": 0.5621, "step": 1485 }, { "epoch": 0.3142677561282212, "grad_norm": 2.017411470413208, "learning_rate": 4.9271357735732655e-05, "loss": 0.4768, "step": 1500 }, { "epoch": 0.31741043368950345, "grad_norm": 1.7073991298675537, "learning_rate": 4.925088867057359e-05, "loss": 0.4989, "step": 1515 }, { "epoch": 0.32055311125078567, "grad_norm": 2.071885585784912, "learning_rate": 4.9230140436184364e-05, "loss": 0.4984, "step": 1530 }, { "epoch": 0.3236957888120679, "grad_norm": 2.1429100036621094, "learning_rate": 4.9209113271412665e-05, "loss": 0.5494, "step": 1545 }, { "epoch": 0.32683846637335007, "grad_norm": 1.709663987159729, "learning_rate": 4.9187807418317144e-05, "loss": 0.5701, "step": 1560 }, { "epoch": 0.3299811439346323, "grad_norm": 1.9613614082336426, "learning_rate": 4.9166223122164635e-05, "loss": 0.4878, "step": 1575 }, { "epoch": 0.3331238214959145, "grad_norm": 1.7875553369522095, "learning_rate": 4.9144360631427325e-05, "loss": 0.4705, "step": 1590 }, { "epoch": 0.33626649905719674, "grad_norm": 1.9654724597930908, "learning_rate": 4.9122220197779886e-05, "loss": 0.4385, "step": 1605 }, { "epoch": 0.33940917661847897, "grad_norm": 1.4906249046325684, "learning_rate": 4.90998020760966e-05, "loss": 0.4427, "step": 1620 }, { "epoch": 0.34255185417976114, "grad_norm": 1.86861252784729, "learning_rate": 4.907710652444843e-05, "loss": 0.4817, "step": 1635 }, { "epoch": 0.34569453174104336, "grad_norm": 1.9250684976577759, "learning_rate": 4.90541338041e-05, "loss": 0.5351, "step": 1650 }, { "epoch": 0.3488372093023256, "grad_norm": 1.8099184036254883, "learning_rate": 4.903088417950664e-05, "loss": 0.5238, "step": 1665 }, { "epoch": 0.3519798868636078, "grad_norm": 1.4055452346801758, "learning_rate": 4.9007357918311315e-05, "loss": 0.5157, "step": 1680 }, { "epoch": 0.35512256442489, "grad_norm": 1.7121083736419678, "learning_rate": 4.898355529134156e-05, "loss": 0.5087, "step": 1695 }, { "epoch": 0.3582652419861722, "grad_norm": 1.7254718542099, "learning_rate": 4.895947657260633e-05, "loss": 0.482, "step": 1710 }, { "epoch": 0.36140791954745444, "grad_norm": 1.7115743160247803, "learning_rate": 4.893512203929291e-05, "loss": 0.5415, "step": 1725 }, { "epoch": 0.36455059710873666, "grad_norm": 1.5224454402923584, "learning_rate": 4.8910491971763625e-05, "loss": 0.5531, "step": 1740 }, { "epoch": 0.36769327467001883, "grad_norm": 1.4693105220794678, "learning_rate": 4.888558665355273e-05, "loss": 0.5007, "step": 1755 }, { "epoch": 0.37083595223130106, "grad_norm": 1.823201298713684, "learning_rate": 4.8860406371363056e-05, "loss": 0.4568, "step": 1770 }, { "epoch": 0.3739786297925833, "grad_norm": 1.6682394742965698, "learning_rate": 4.883495141506272e-05, "loss": 0.5111, "step": 1785 }, { "epoch": 0.3771213073538655, "grad_norm": 1.9045063257217407, "learning_rate": 4.880922207768186e-05, "loss": 0.5081, "step": 1800 }, { "epoch": 0.3802639849151477, "grad_norm": 1.9026966094970703, "learning_rate": 4.8783218655409165e-05, "loss": 0.5094, "step": 1815 }, { "epoch": 0.3834066624764299, "grad_norm": 2.230048418045044, "learning_rate": 4.875694144758852e-05, "loss": 0.4501, "step": 1830 }, { "epoch": 0.38654934003771213, "grad_norm": 1.8619111776351929, "learning_rate": 4.873039075671558e-05, "loss": 0.5595, "step": 1845 }, { "epoch": 0.38969201759899436, "grad_norm": 1.0510592460632324, "learning_rate": 4.8703566888434216e-05, "loss": 0.4494, "step": 1860 }, { "epoch": 0.3928346951602766, "grad_norm": 1.61916983127594, "learning_rate": 4.8676470151533054e-05, "loss": 0.5619, "step": 1875 }, { "epoch": 0.39597737272155875, "grad_norm": 2.1640028953552246, "learning_rate": 4.864910085794192e-05, "loss": 0.4624, "step": 1890 }, { "epoch": 0.399120050282841, "grad_norm": 1.8915683031082153, "learning_rate": 4.8621459322728216e-05, "loss": 0.4953, "step": 1905 }, { "epoch": 0.4022627278441232, "grad_norm": 1.5854873657226562, "learning_rate": 4.859354586409331e-05, "loss": 0.4952, "step": 1920 }, { "epoch": 0.40540540540540543, "grad_norm": 1.8864436149597168, "learning_rate": 4.8565360803368885e-05, "loss": 0.4643, "step": 1935 }, { "epoch": 0.4085480829666876, "grad_norm": 1.7292683124542236, "learning_rate": 4.853690446501323e-05, "loss": 0.4995, "step": 1950 }, { "epoch": 0.4116907605279698, "grad_norm": 1.1200498342514038, "learning_rate": 4.85081771766075e-05, "loss": 0.4397, "step": 1965 }, { "epoch": 0.41483343808925205, "grad_norm": 1.6311380863189697, "learning_rate": 4.8479179268851934e-05, "loss": 0.5041, "step": 1980 }, { "epoch": 0.4179761156505343, "grad_norm": 1.5585182905197144, "learning_rate": 4.844991107556208e-05, "loss": 0.4968, "step": 1995 }, { "epoch": 0.42111879321181644, "grad_norm": 1.9798181056976318, "learning_rate": 4.8420372933664934e-05, "loss": 0.5101, "step": 2010 }, { "epoch": 0.42426147077309867, "grad_norm": 1.5805935859680176, "learning_rate": 4.839056518319507e-05, "loss": 0.5093, "step": 2025 }, { "epoch": 0.4274041483343809, "grad_norm": 1.8099379539489746, "learning_rate": 4.836048816729068e-05, "loss": 0.4841, "step": 2040 }, { "epoch": 0.4305468258956631, "grad_norm": 1.294607400894165, "learning_rate": 4.833014223218971e-05, "loss": 0.5417, "step": 2055 }, { "epoch": 0.4336895034569453, "grad_norm": 1.446961760520935, "learning_rate": 4.8299527727225796e-05, "loss": 0.4639, "step": 2070 }, { "epoch": 0.4368321810182275, "grad_norm": 1.460518479347229, "learning_rate": 4.826864500482428e-05, "loss": 0.4648, "step": 2085 }, { "epoch": 0.43997485857950974, "grad_norm": 1.3880281448364258, "learning_rate": 4.823749442049817e-05, "loss": 0.4185, "step": 2100 }, { "epoch": 0.44311753614079197, "grad_norm": 1.6404091119766235, "learning_rate": 4.820607633284397e-05, "loss": 0.4007, "step": 2115 }, { "epoch": 0.4462602137020742, "grad_norm": 1.201521873474121, "learning_rate": 4.8174391103537655e-05, "loss": 0.4781, "step": 2130 }, { "epoch": 0.44940289126335636, "grad_norm": 1.4873559474945068, "learning_rate": 4.814243909733043e-05, "loss": 0.4317, "step": 2145 }, { "epoch": 0.4525455688246386, "grad_norm": 1.9189249277114868, "learning_rate": 4.811022068204457e-05, "loss": 0.5085, "step": 2160 }, { "epoch": 0.4556882463859208, "grad_norm": 1.4758615493774414, "learning_rate": 4.807773622856918e-05, "loss": 0.4815, "step": 2175 }, { "epoch": 0.45883092394720304, "grad_norm": 1.6353334188461304, "learning_rate": 4.804498611085589e-05, "loss": 0.4794, "step": 2190 }, { "epoch": 0.4619736015084852, "grad_norm": 1.4237501621246338, "learning_rate": 4.8011970705914634e-05, "loss": 0.4593, "step": 2205 }, { "epoch": 0.46511627906976744, "grad_norm": 1.6772956848144531, "learning_rate": 4.7978690393809186e-05, "loss": 0.486, "step": 2220 }, { "epoch": 0.46825895663104966, "grad_norm": 1.553051233291626, "learning_rate": 4.794514555765293e-05, "loss": 0.4658, "step": 2235 }, { "epoch": 0.4714016341923319, "grad_norm": 1.8338069915771484, "learning_rate": 4.7911336583604306e-05, "loss": 0.4953, "step": 2250 }, { "epoch": 0.47454431175361406, "grad_norm": 1.431541919708252, "learning_rate": 4.7877263860862477e-05, "loss": 0.4442, "step": 2265 }, { "epoch": 0.4776869893148963, "grad_norm": 1.120583415031433, "learning_rate": 4.7842927781662796e-05, "loss": 0.4537, "step": 2280 }, { "epoch": 0.4808296668761785, "grad_norm": 1.380642056465149, "learning_rate": 4.780832874127228e-05, "loss": 0.4621, "step": 2295 }, { "epoch": 0.48397234443746073, "grad_norm": 1.1469544172286987, "learning_rate": 4.777346713798512e-05, "loss": 0.5226, "step": 2310 }, { "epoch": 0.4871150219987429, "grad_norm": 1.483512043952942, "learning_rate": 4.7738343373118e-05, "loss": 0.5479, "step": 2325 }, { "epoch": 0.49025769956002513, "grad_norm": 1.610948920249939, "learning_rate": 4.770295785100558e-05, "loss": 0.5046, "step": 2340 }, { "epoch": 0.49340037712130735, "grad_norm": 1.3163951635360718, "learning_rate": 4.7667310978995785e-05, "loss": 0.4603, "step": 2355 }, { "epoch": 0.4965430546825896, "grad_norm": 1.4908734560012817, "learning_rate": 4.763140316744509e-05, "loss": 0.4806, "step": 2370 }, { "epoch": 0.4996857322438718, "grad_norm": 1.3357776403427124, "learning_rate": 4.759523482971388e-05, "loss": 0.471, "step": 2385 }, { "epoch": 0.502828409805154, "grad_norm": 1.4438153505325317, "learning_rate": 4.755880638216161e-05, "loss": 0.443, "step": 2400 }, { "epoch": 0.5059710873664363, "grad_norm": 1.4169646501541138, "learning_rate": 4.752211824414205e-05, "loss": 0.4842, "step": 2415 }, { "epoch": 0.5091137649277184, "grad_norm": 1.4930610656738281, "learning_rate": 4.7485170837998455e-05, "loss": 0.4815, "step": 2430 }, { "epoch": 0.5122564424890006, "grad_norm": 1.5918561220169067, "learning_rate": 4.74479645890587e-05, "loss": 0.4372, "step": 2445 }, { "epoch": 0.5153991200502829, "grad_norm": 1.6254751682281494, "learning_rate": 4.7410499925630395e-05, "loss": 0.4187, "step": 2460 }, { "epoch": 0.518541797611565, "grad_norm": 1.5545734167099, "learning_rate": 4.737277727899591e-05, "loss": 0.4743, "step": 2475 }, { "epoch": 0.5216844751728472, "grad_norm": 1.727158546447754, "learning_rate": 4.7334797083407475e-05, "loss": 0.4294, "step": 2490 }, { "epoch": 0.5248271527341295, "grad_norm": 1.7546805143356323, "learning_rate": 4.729655977608214e-05, "loss": 0.5043, "step": 2505 }, { "epoch": 0.5279698302954117, "grad_norm": 1.4232885837554932, "learning_rate": 4.7258065797196746e-05, "loss": 0.4729, "step": 2520 }, { "epoch": 0.531112507856694, "grad_norm": 1.391065239906311, "learning_rate": 4.721931558988286e-05, "loss": 0.4915, "step": 2535 }, { "epoch": 0.5342551854179761, "grad_norm": 1.7134276628494263, "learning_rate": 4.7180309600221706e-05, "loss": 0.5102, "step": 2550 }, { "epoch": 0.5373978629792583, "grad_norm": 1.5847156047821045, "learning_rate": 4.714104827723895e-05, "loss": 0.4785, "step": 2565 }, { "epoch": 0.5405405405405406, "grad_norm": 1.3267030715942383, "learning_rate": 4.7101532072899623e-05, "loss": 0.5135, "step": 2580 }, { "epoch": 0.5436832181018227, "grad_norm": 1.5763999223709106, "learning_rate": 4.706176144210286e-05, "loss": 0.4916, "step": 2595 }, { "epoch": 0.5468258956631049, "grad_norm": 1.4937148094177246, "learning_rate": 4.7021736842676687e-05, "loss": 0.4561, "step": 2610 }, { "epoch": 0.5499685732243872, "grad_norm": 1.6091326475143433, "learning_rate": 4.698145873537274e-05, "loss": 0.482, "step": 2625 }, { "epoch": 0.5531112507856694, "grad_norm": 1.5875076055526733, "learning_rate": 4.694092758386095e-05, "loss": 0.4104, "step": 2640 }, { "epoch": 0.5562539283469516, "grad_norm": 1.3293397426605225, "learning_rate": 4.690014385472424e-05, "loss": 0.4143, "step": 2655 }, { "epoch": 0.5593966059082338, "grad_norm": 1.1707426309585571, "learning_rate": 4.6859108017453136e-05, "loss": 0.4726, "step": 2670 }, { "epoch": 0.562539283469516, "grad_norm": 1.3706302642822266, "learning_rate": 4.6817820544440346e-05, "loss": 0.461, "step": 2685 }, { "epoch": 0.5656819610307983, "grad_norm": 1.7703521251678467, "learning_rate": 4.677628191097534e-05, "loss": 0.5042, "step": 2700 }, { "epoch": 0.5688246385920804, "grad_norm": 1.5359523296356201, "learning_rate": 4.6734492595238874e-05, "loss": 0.4192, "step": 2715 }, { "epoch": 0.5719673161533627, "grad_norm": 1.700126051902771, "learning_rate": 4.6692453078297495e-05, "loss": 0.5095, "step": 2730 }, { "epoch": 0.5751099937146449, "grad_norm": 1.4070463180541992, "learning_rate": 4.665016384409798e-05, "loss": 0.4779, "step": 2745 }, { "epoch": 0.578252671275927, "grad_norm": 1.2797980308532715, "learning_rate": 4.660762537946178e-05, "loss": 0.4351, "step": 2760 }, { "epoch": 0.5813953488372093, "grad_norm": 1.4518544673919678, "learning_rate": 4.656483817407944e-05, "loss": 0.448, "step": 2775 }, { "epoch": 0.5845380263984915, "grad_norm": 1.300370216369629, "learning_rate": 4.652180272050491e-05, "loss": 0.44, "step": 2790 }, { "epoch": 0.5876807039597737, "grad_norm": 1.4460704326629639, "learning_rate": 4.64785195141499e-05, "loss": 0.4565, "step": 2805 }, { "epoch": 0.590823381521056, "grad_norm": 1.5882294178009033, "learning_rate": 4.643498905327819e-05, "loss": 0.5078, "step": 2820 }, { "epoch": 0.5939660590823381, "grad_norm": 1.3055689334869385, "learning_rate": 4.639121183899989e-05, "loss": 0.5, "step": 2835 }, { "epoch": 0.5971087366436204, "grad_norm": 1.4545074701309204, "learning_rate": 4.6347188375265645e-05, "loss": 0.4767, "step": 2850 }, { "epoch": 0.6002514142049026, "grad_norm": 1.0975799560546875, "learning_rate": 4.630291916886086e-05, "loss": 0.4384, "step": 2865 }, { "epoch": 0.6033940917661847, "grad_norm": 1.6817741394042969, "learning_rate": 4.625840472939987e-05, "loss": 0.5, "step": 2880 }, { "epoch": 0.606536769327467, "grad_norm": 1.0438511371612549, "learning_rate": 4.621364556932005e-05, "loss": 0.4671, "step": 2895 }, { "epoch": 0.6096794468887492, "grad_norm": 1.1330349445343018, "learning_rate": 4.616864220387592e-05, "loss": 0.4275, "step": 2910 }, { "epoch": 0.6128221244500315, "grad_norm": 1.6542346477508545, "learning_rate": 4.612339515113324e-05, "loss": 0.4801, "step": 2925 }, { "epoch": 0.6159648020113137, "grad_norm": 1.1006687879562378, "learning_rate": 4.6077904931963036e-05, "loss": 0.4756, "step": 2940 }, { "epoch": 0.6191074795725958, "grad_norm": 1.3067682981491089, "learning_rate": 4.603217207003555e-05, "loss": 0.4416, "step": 2955 }, { "epoch": 0.6222501571338781, "grad_norm": 1.2261842489242554, "learning_rate": 4.598619709181431e-05, "loss": 0.4276, "step": 2970 }, { "epoch": 0.6253928346951603, "grad_norm": 1.4903597831726074, "learning_rate": 4.593998052654998e-05, "loss": 0.4972, "step": 2985 }, { "epoch": 0.6285355122564424, "grad_norm": 1.4376386404037476, "learning_rate": 4.589352290627433e-05, "loss": 0.4568, "step": 3000 }, { "epoch": 0.6316781898177247, "grad_norm": 1.351223111152649, "learning_rate": 4.584682476579406e-05, "loss": 0.4858, "step": 3015 }, { "epoch": 0.6348208673790069, "grad_norm": 1.364617943763733, "learning_rate": 4.57998866426847e-05, "loss": 0.4876, "step": 3030 }, { "epoch": 0.6379635449402892, "grad_norm": 1.459356665611267, "learning_rate": 4.575270907728437e-05, "loss": 0.478, "step": 3045 }, { "epoch": 0.6411062225015713, "grad_norm": 1.6396265029907227, "learning_rate": 4.5705292612687576e-05, "loss": 0.529, "step": 3060 }, { "epoch": 0.6442489000628535, "grad_norm": 0.960100531578064, "learning_rate": 4.565763779473898e-05, "loss": 0.4391, "step": 3075 }, { "epoch": 0.6473915776241358, "grad_norm": 1.315019130706787, "learning_rate": 4.560974517202709e-05, "loss": 0.4917, "step": 3090 }, { "epoch": 0.650534255185418, "grad_norm": 1.5295921564102173, "learning_rate": 4.556161529587794e-05, "loss": 0.4924, "step": 3105 }, { "epoch": 0.6536769327467001, "grad_norm": 1.1837646961212158, "learning_rate": 4.551324872034879e-05, "loss": 0.4493, "step": 3120 }, { "epoch": 0.6568196103079824, "grad_norm": 1.4307267665863037, "learning_rate": 4.5464646002221684e-05, "loss": 0.468, "step": 3135 }, { "epoch": 0.6599622878692646, "grad_norm": 1.155652403831482, "learning_rate": 4.541580770099709e-05, "loss": 0.4243, "step": 3150 }, { "epoch": 0.6631049654305469, "grad_norm": 1.3834953308105469, "learning_rate": 4.536673437888743e-05, "loss": 0.5501, "step": 3165 }, { "epoch": 0.666247642991829, "grad_norm": 1.0636712312698364, "learning_rate": 4.531742660081063e-05, "loss": 0.4274, "step": 3180 }, { "epoch": 0.6693903205531112, "grad_norm": 0.8389808535575867, "learning_rate": 4.526788493438359e-05, "loss": 0.4489, "step": 3195 }, { "epoch": 0.6725329981143935, "grad_norm": 1.242849349975586, "learning_rate": 4.5218109949915674e-05, "loss": 0.5231, "step": 3210 }, { "epoch": 0.6756756756756757, "grad_norm": 1.4097121953964233, "learning_rate": 4.516810222040214e-05, "loss": 0.4373, "step": 3225 }, { "epoch": 0.6788183532369579, "grad_norm": 1.4146395921707153, "learning_rate": 4.511786232151753e-05, "loss": 0.4185, "step": 3240 }, { "epoch": 0.6819610307982401, "grad_norm": 1.1632105112075806, "learning_rate": 4.506739083160906e-05, "loss": 0.4387, "step": 3255 }, { "epoch": 0.6851037083595223, "grad_norm": 1.1534103155136108, "learning_rate": 4.501668833168995e-05, "loss": 0.4387, "step": 3270 }, { "epoch": 0.6882463859208046, "grad_norm": 1.355643391609192, "learning_rate": 4.496575540543275e-05, "loss": 0.4568, "step": 3285 }, { "epoch": 0.6913890634820867, "grad_norm": 1.2842720746994019, "learning_rate": 4.49145926391626e-05, "loss": 0.4486, "step": 3300 }, { "epoch": 0.6945317410433689, "grad_norm": 0.981799840927124, "learning_rate": 4.48632006218505e-05, "loss": 0.4268, "step": 3315 }, { "epoch": 0.6976744186046512, "grad_norm": 1.5337742567062378, "learning_rate": 4.481157994510652e-05, "loss": 0.5001, "step": 3330 }, { "epoch": 0.7008170961659334, "grad_norm": 1.4315093755722046, "learning_rate": 4.475973120317298e-05, "loss": 0.4779, "step": 3345 }, { "epoch": 0.7039597737272156, "grad_norm": 1.181176781654358, "learning_rate": 4.4707654992917635e-05, "loss": 0.4312, "step": 3360 }, { "epoch": 0.7071024512884978, "grad_norm": 1.5547527074813843, "learning_rate": 4.465535191382679e-05, "loss": 0.5246, "step": 3375 }, { "epoch": 0.71024512884978, "grad_norm": 1.2100272178649902, "learning_rate": 4.460282256799839e-05, "loss": 0.4601, "step": 3390 }, { "epoch": 0.7133878064110623, "grad_norm": 1.2901486158370972, "learning_rate": 4.455006756013511e-05, "loss": 0.4294, "step": 3405 }, { "epoch": 0.7165304839723444, "grad_norm": 1.2931948900222778, "learning_rate": 4.449708749753736e-05, "loss": 0.4618, "step": 3420 }, { "epoch": 0.7196731615336267, "grad_norm": 1.1794995069503784, "learning_rate": 4.444388299009633e-05, "loss": 0.4513, "step": 3435 }, { "epoch": 0.7228158390949089, "grad_norm": 0.9884097576141357, "learning_rate": 4.439045465028695e-05, "loss": 0.4033, "step": 3450 }, { "epoch": 0.725958516656191, "grad_norm": 1.3767797946929932, "learning_rate": 4.433680309316086e-05, "loss": 0.5132, "step": 3465 }, { "epoch": 0.7291011942174733, "grad_norm": 1.2242072820663452, "learning_rate": 4.428292893633928e-05, "loss": 0.4564, "step": 3480 }, { "epoch": 0.7322438717787555, "grad_norm": 1.416617512702942, "learning_rate": 4.422883280000596e-05, "loss": 0.4765, "step": 3495 }, { "epoch": 0.7353865493400377, "grad_norm": 1.5963226556777954, "learning_rate": 4.417451530690001e-05, "loss": 0.4593, "step": 3510 }, { "epoch": 0.73852922690132, "grad_norm": 1.3153035640716553, "learning_rate": 4.411997708230872e-05, "loss": 0.4175, "step": 3525 }, { "epoch": 0.7416719044626021, "grad_norm": 1.202329158782959, "learning_rate": 4.40652187540604e-05, "loss": 0.4668, "step": 3540 }, { "epoch": 0.7448145820238844, "grad_norm": 1.2087334394454956, "learning_rate": 4.4010240952517115e-05, "loss": 0.469, "step": 3555 }, { "epoch": 0.7479572595851666, "grad_norm": 1.1056499481201172, "learning_rate": 4.395504431056745e-05, "loss": 0.4764, "step": 3570 }, { "epoch": 0.7510999371464487, "grad_norm": 1.2779186964035034, "learning_rate": 4.389962946361921e-05, "loss": 0.3649, "step": 3585 }, { "epoch": 0.754242614707731, "grad_norm": 1.545474886894226, "learning_rate": 4.384399704959211e-05, "loss": 0.4498, "step": 3600 }, { "epoch": 0.7573852922690132, "grad_norm": 1.0024960041046143, "learning_rate": 4.378814770891045e-05, "loss": 0.4717, "step": 3615 }, { "epoch": 0.7605279698302954, "grad_norm": 1.3661173582077026, "learning_rate": 4.373208208449572e-05, "loss": 0.4662, "step": 3630 }, { "epoch": 0.7636706473915776, "grad_norm": 1.1410945653915405, "learning_rate": 4.3675800821759205e-05, "loss": 0.5376, "step": 3645 }, { "epoch": 0.7668133249528598, "grad_norm": 1.1424890756607056, "learning_rate": 4.361930456859455e-05, "loss": 0.4682, "step": 3660 }, { "epoch": 0.7699560025141421, "grad_norm": 1.373201847076416, "learning_rate": 4.3562593975370314e-05, "loss": 0.4454, "step": 3675 }, { "epoch": 0.7730986800754243, "grad_norm": 1.1460034847259521, "learning_rate": 4.350566969492248e-05, "loss": 0.4749, "step": 3690 }, { "epoch": 0.7762413576367064, "grad_norm": 1.2430229187011719, "learning_rate": 4.344853238254692e-05, "loss": 0.4535, "step": 3705 }, { "epoch": 0.7793840351979887, "grad_norm": 1.3757741451263428, "learning_rate": 4.339118269599191e-05, "loss": 0.41, "step": 3720 }, { "epoch": 0.7825267127592709, "grad_norm": 0.9454161524772644, "learning_rate": 4.333362129545046e-05, "loss": 0.4454, "step": 3735 }, { "epoch": 0.7856693903205532, "grad_norm": 0.9156450033187866, "learning_rate": 4.327584884355281e-05, "loss": 0.4719, "step": 3750 }, { "epoch": 0.7888120678818353, "grad_norm": 1.2694880962371826, "learning_rate": 4.321786600535874e-05, "loss": 0.4304, "step": 3765 }, { "epoch": 0.7919547454431175, "grad_norm": 1.2514046430587769, "learning_rate": 4.315967344834996e-05, "loss": 0.409, "step": 3780 }, { "epoch": 0.7950974230043998, "grad_norm": 1.184391736984253, "learning_rate": 4.310127184242237e-05, "loss": 0.4198, "step": 3795 }, { "epoch": 0.798240100565682, "grad_norm": 1.2372093200683594, "learning_rate": 4.304266185987842e-05, "loss": 0.5023, "step": 3810 }, { "epoch": 0.8013827781269641, "grad_norm": 1.340918779373169, "learning_rate": 4.29838441754193e-05, "loss": 0.4776, "step": 3825 }, { "epoch": 0.8045254556882464, "grad_norm": 1.2824565172195435, "learning_rate": 4.292481946613721e-05, "loss": 0.4951, "step": 3840 }, { "epoch": 0.8076681332495286, "grad_norm": 1.2031137943267822, "learning_rate": 4.286558841150757e-05, "loss": 0.5001, "step": 3855 }, { "epoch": 0.8108108108108109, "grad_norm": 1.3976994752883911, "learning_rate": 4.2806151693381194e-05, "loss": 0.459, "step": 3870 }, { "epoch": 0.813953488372093, "grad_norm": 1.8632055521011353, "learning_rate": 4.274650999597641e-05, "loss": 0.4622, "step": 3885 }, { "epoch": 0.8170961659333752, "grad_norm": 1.4277501106262207, "learning_rate": 4.2686664005871226e-05, "loss": 0.4629, "step": 3900 }, { "epoch": 0.8202388434946575, "grad_norm": 1.189048409461975, "learning_rate": 4.262661441199541e-05, "loss": 0.4408, "step": 3915 }, { "epoch": 0.8233815210559396, "grad_norm": 1.2833003997802734, "learning_rate": 4.2566361905622555e-05, "loss": 0.4064, "step": 3930 }, { "epoch": 0.8265241986172219, "grad_norm": 1.1060303449630737, "learning_rate": 4.250590718036211e-05, "loss": 0.3962, "step": 3945 }, { "epoch": 0.8296668761785041, "grad_norm": 1.0350922346115112, "learning_rate": 4.2445250932151425e-05, "loss": 0.4252, "step": 3960 }, { "epoch": 0.8328095537397863, "grad_norm": 1.3250532150268555, "learning_rate": 4.2384393859247726e-05, "loss": 0.4291, "step": 3975 }, { "epoch": 0.8359522313010685, "grad_norm": 1.2099930047988892, "learning_rate": 4.232333666222006e-05, "loss": 0.4341, "step": 3990 }, { "epoch": 0.8390949088623507, "grad_norm": 1.3332287073135376, "learning_rate": 4.226208004394127e-05, "loss": 0.466, "step": 4005 }, { "epoch": 0.8422375864236329, "grad_norm": 1.3363186120986938, "learning_rate": 4.220062470957986e-05, "loss": 0.4196, "step": 4020 }, { "epoch": 0.8453802639849152, "grad_norm": 0.9614083170890808, "learning_rate": 4.213897136659189e-05, "loss": 0.4183, "step": 4035 }, { "epoch": 0.8485229415461973, "grad_norm": 1.7605079412460327, "learning_rate": 4.2077120724712844e-05, "loss": 0.4756, "step": 4050 }, { "epoch": 0.8516656191074796, "grad_norm": 1.3952196836471558, "learning_rate": 4.201507349594946e-05, "loss": 0.433, "step": 4065 }, { "epoch": 0.8548082966687618, "grad_norm": 1.1092714071273804, "learning_rate": 4.195283039457155e-05, "loss": 0.4721, "step": 4080 }, { "epoch": 0.857950974230044, "grad_norm": 0.9377354979515076, "learning_rate": 4.189039213710369e-05, "loss": 0.4666, "step": 4095 }, { "epoch": 0.8610936517913262, "grad_norm": 1.2234201431274414, "learning_rate": 4.1827759442317116e-05, "loss": 0.4582, "step": 4110 }, { "epoch": 0.8642363293526084, "grad_norm": 1.2329143285751343, "learning_rate": 4.176493303122131e-05, "loss": 0.4581, "step": 4125 }, { "epoch": 0.8673790069138906, "grad_norm": 1.2294172048568726, "learning_rate": 4.170191362705578e-05, "loss": 0.4688, "step": 4140 }, { "epoch": 0.8705216844751729, "grad_norm": 0.8059648871421814, "learning_rate": 4.163870195528171e-05, "loss": 0.3847, "step": 4155 }, { "epoch": 0.873664362036455, "grad_norm": 1.3568918704986572, "learning_rate": 4.157529874357364e-05, "loss": 0.4839, "step": 4170 }, { "epoch": 0.8768070395977373, "grad_norm": 1.33687424659729, "learning_rate": 4.151170472181103e-05, "loss": 0.469, "step": 4185 }, { "epoch": 0.8799497171590195, "grad_norm": 1.1635092496871948, "learning_rate": 4.144792062206989e-05, "loss": 0.4117, "step": 4200 }, { "epoch": 0.8830923947203017, "grad_norm": 0.4810682237148285, "learning_rate": 4.138394717861438e-05, "loss": 0.3328, "step": 4215 }, { "epoch": 0.8862350722815839, "grad_norm": 1.170903205871582, "learning_rate": 4.131978512788832e-05, "loss": 0.5026, "step": 4230 }, { "epoch": 0.8893777498428661, "grad_norm": 0.9785465598106384, "learning_rate": 4.1255435208506695e-05, "loss": 0.4031, "step": 4245 }, { "epoch": 0.8925204274041484, "grad_norm": 1.0040161609649658, "learning_rate": 4.1190898161247216e-05, "loss": 0.3992, "step": 4260 }, { "epoch": 0.8956631049654306, "grad_norm": 1.2257813215255737, "learning_rate": 4.112617472904175e-05, "loss": 0.4431, "step": 4275 }, { "epoch": 0.8988057825267127, "grad_norm": 0.9779378771781921, "learning_rate": 4.106126565696774e-05, "loss": 0.4387, "step": 4290 }, { "epoch": 0.9000628535512256, "eval_accuracy": 0.8749659063444953, "eval_loss": 0.4478217661380768, "eval_runtime": 801.5583, "eval_samples_per_second": 5.97, "eval_steps_per_second": 1.493, "step": 4296 }, { "epoch": 0.901948460087995, "grad_norm": 1.0927642583847046, "learning_rate": 4.099617169223971e-05, "loss": 0.4717, "step": 4305 }, { "epoch": 0.9050911376492772, "grad_norm": 1.3863451480865479, "learning_rate": 4.093089358420059e-05, "loss": 0.4482, "step": 4320 }, { "epoch": 0.9082338152105593, "grad_norm": 0.8744410276412964, "learning_rate": 4.08654320843131e-05, "loss": 0.4739, "step": 4335 }, { "epoch": 0.9113764927718416, "grad_norm": 1.1781022548675537, "learning_rate": 4.079978794615115e-05, "loss": 0.408, "step": 4350 }, { "epoch": 0.9145191703331238, "grad_norm": 1.225847840309143, "learning_rate": 4.07339619253911e-05, "loss": 0.4624, "step": 4365 }, { "epoch": 0.9176618478944061, "grad_norm": 1.2807953357696533, "learning_rate": 4.0667954779803094e-05, "loss": 0.4506, "step": 4380 }, { "epoch": 0.9208045254556882, "grad_norm": 1.3124723434448242, "learning_rate": 4.0601767269242356e-05, "loss": 0.4253, "step": 4395 }, { "epoch": 0.9239472030169704, "grad_norm": 1.10555899143219, "learning_rate": 4.053540015564039e-05, "loss": 0.4078, "step": 4410 }, { "epoch": 0.9270898805782527, "grad_norm": 1.0445165634155273, "learning_rate": 4.046885420299625e-05, "loss": 0.4157, "step": 4425 }, { "epoch": 0.9302325581395349, "grad_norm": 1.0756609439849854, "learning_rate": 4.040213017736774e-05, "loss": 0.4494, "step": 4440 }, { "epoch": 0.933375235700817, "grad_norm": 1.2414379119873047, "learning_rate": 4.0335228846862575e-05, "loss": 0.4544, "step": 4455 }, { "epoch": 0.9365179132620993, "grad_norm": 1.2390245199203491, "learning_rate": 4.026815098162957e-05, "loss": 0.4086, "step": 4470 }, { "epoch": 0.9396605908233815, "grad_norm": 1.250126600265503, "learning_rate": 4.020089735384973e-05, "loss": 0.4206, "step": 4485 }, { "epoch": 0.9428032683846638, "grad_norm": 1.0727368593215942, "learning_rate": 4.013346873772743e-05, "loss": 0.4265, "step": 4500 }, { "epoch": 0.9459459459459459, "grad_norm": 1.2256518602371216, "learning_rate": 4.0065865909481417e-05, "loss": 0.4437, "step": 4515 }, { "epoch": 0.9490886235072281, "grad_norm": 1.4009459018707275, "learning_rate": 3.9998089647335933e-05, "loss": 0.4203, "step": 4530 }, { "epoch": 0.9522313010685104, "grad_norm": 1.1759395599365234, "learning_rate": 3.993014073151175e-05, "loss": 0.4978, "step": 4545 }, { "epoch": 0.9553739786297926, "grad_norm": 1.0505579710006714, "learning_rate": 3.9862019944217175e-05, "loss": 0.4191, "step": 4560 }, { "epoch": 0.9585166561910748, "grad_norm": 1.3067837953567505, "learning_rate": 3.9793728069639046e-05, "loss": 0.4671, "step": 4575 }, { "epoch": 0.961659333752357, "grad_norm": 1.2706676721572876, "learning_rate": 3.972526589393372e-05, "loss": 0.4288, "step": 4590 }, { "epoch": 0.9648020113136392, "grad_norm": 1.1527299880981445, "learning_rate": 3.965663420521798e-05, "loss": 0.4697, "step": 4605 }, { "epoch": 0.9679446888749215, "grad_norm": 0.8752300143241882, "learning_rate": 3.9587833793560026e-05, "loss": 0.4522, "step": 4620 }, { "epoch": 0.9710873664362036, "grad_norm": 1.0137310028076172, "learning_rate": 3.9518865450970346e-05, "loss": 0.4606, "step": 4635 }, { "epoch": 0.9742300439974858, "grad_norm": 1.1071418523788452, "learning_rate": 3.944972997139257e-05, "loss": 0.4403, "step": 4650 }, { "epoch": 0.9773727215587681, "grad_norm": 1.193814754486084, "learning_rate": 3.93804281506944e-05, "loss": 0.4046, "step": 4665 }, { "epoch": 0.9805153991200503, "grad_norm": 1.1703835725784302, "learning_rate": 3.93109607866584e-05, "loss": 0.3727, "step": 4680 }, { "epoch": 0.9836580766813325, "grad_norm": 1.2460951805114746, "learning_rate": 3.924132867897279e-05, "loss": 0.4457, "step": 4695 }, { "epoch": 0.9868007542426147, "grad_norm": 1.162644624710083, "learning_rate": 3.9171532629222304e-05, "loss": 0.4532, "step": 4710 }, { "epoch": 0.9899434318038969, "grad_norm": 1.1026623249053955, "learning_rate": 3.910157344087892e-05, "loss": 0.4886, "step": 4725 }, { "epoch": 0.9930861093651792, "grad_norm": 1.3245232105255127, "learning_rate": 3.9031451919292616e-05, "loss": 0.474, "step": 4740 }, { "epoch": 0.9962287869264613, "grad_norm": 1.5628905296325684, "learning_rate": 3.8961168871682116e-05, "loss": 0.5021, "step": 4755 }, { "epoch": 0.9993714644877436, "grad_norm": 1.0988940000534058, "learning_rate": 3.889072510712557e-05, "loss": 0.4488, "step": 4770 }, { "epoch": 1.0025141420490258, "grad_norm": 1.1718677282333374, "learning_rate": 3.882012143655126e-05, "loss": 0.4284, "step": 4785 }, { "epoch": 1.005656819610308, "grad_norm": 1.3951458930969238, "learning_rate": 3.874935867272826e-05, "loss": 0.4057, "step": 4800 }, { "epoch": 1.0087994971715901, "grad_norm": 1.1581798791885376, "learning_rate": 3.867843763025709e-05, "loss": 0.4073, "step": 4815 }, { "epoch": 1.0119421747328725, "grad_norm": 1.4225468635559082, "learning_rate": 3.860735912556031e-05, "loss": 0.4437, "step": 4830 }, { "epoch": 1.0150848522941547, "grad_norm": 0.9562087059020996, "learning_rate": 3.853612397687315e-05, "loss": 0.4008, "step": 4845 }, { "epoch": 1.0182275298554369, "grad_norm": 1.3174970149993896, "learning_rate": 3.846473300423409e-05, "loss": 0.4135, "step": 4860 }, { "epoch": 1.021370207416719, "grad_norm": 1.4198646545410156, "learning_rate": 3.839318702947538e-05, "loss": 0.434, "step": 4875 }, { "epoch": 1.0245128849780012, "grad_norm": 1.2705206871032715, "learning_rate": 3.832148687621365e-05, "loss": 0.4136, "step": 4890 }, { "epoch": 1.0276555625392834, "grad_norm": 1.254346489906311, "learning_rate": 3.8249633369840346e-05, "loss": 0.3875, "step": 4905 }, { "epoch": 1.0307982401005658, "grad_norm": 1.2936162948608398, "learning_rate": 3.817762733751231e-05, "loss": 0.3966, "step": 4920 }, { "epoch": 1.033940917661848, "grad_norm": 1.0256013870239258, "learning_rate": 3.81054696081422e-05, "loss": 0.4171, "step": 4935 }, { "epoch": 1.03708359522313, "grad_norm": 1.2666840553283691, "learning_rate": 3.803316101238895e-05, "loss": 0.4003, "step": 4950 }, { "epoch": 1.0402262727844123, "grad_norm": 1.2721953392028809, "learning_rate": 3.796070238264826e-05, "loss": 0.4034, "step": 4965 }, { "epoch": 1.0433689503456944, "grad_norm": 1.24618661403656, "learning_rate": 3.7888094553042954e-05, "loss": 0.4406, "step": 4980 }, { "epoch": 1.0465116279069768, "grad_norm": 0.923187255859375, "learning_rate": 3.78153383594134e-05, "loss": 0.4689, "step": 4995 }, { "epoch": 1.049654305468259, "grad_norm": 1.0710513591766357, "learning_rate": 3.774243463930791e-05, "loss": 0.3844, "step": 5010 }, { "epoch": 1.0527969830295412, "grad_norm": 1.2138617038726807, "learning_rate": 3.766938423197306e-05, "loss": 0.3412, "step": 5025 }, { "epoch": 1.0559396605908233, "grad_norm": 1.3552145957946777, "learning_rate": 3.7596187978344056e-05, "loss": 0.4033, "step": 5040 }, { "epoch": 1.0590823381521055, "grad_norm": 1.2156639099121094, "learning_rate": 3.752284672103503e-05, "loss": 0.4309, "step": 5055 }, { "epoch": 1.062225015713388, "grad_norm": 1.4516615867614746, "learning_rate": 3.7449361304329384e-05, "loss": 0.42, "step": 5070 }, { "epoch": 1.06536769327467, "grad_norm": 1.2875463962554932, "learning_rate": 3.737573257417001e-05, "loss": 0.3772, "step": 5085 }, { "epoch": 1.0685103708359522, "grad_norm": 1.2341505289077759, "learning_rate": 3.730196137814959e-05, "loss": 0.4058, "step": 5100 }, { "epoch": 1.0716530483972344, "grad_norm": 1.193441390991211, "learning_rate": 3.7228048565500854e-05, "loss": 0.4121, "step": 5115 }, { "epoch": 1.0747957259585166, "grad_norm": 1.274909496307373, "learning_rate": 3.715399498708676e-05, "loss": 0.4187, "step": 5130 }, { "epoch": 1.077938403519799, "grad_norm": 1.2880769968032837, "learning_rate": 3.7079801495390715e-05, "loss": 0.4071, "step": 5145 }, { "epoch": 1.0810810810810811, "grad_norm": 0.7923028469085693, "learning_rate": 3.70054689445068e-05, "loss": 0.3541, "step": 5160 }, { "epoch": 1.0842237586423633, "grad_norm": 1.3296815156936646, "learning_rate": 3.6930998190129864e-05, "loss": 0.3166, "step": 5175 }, { "epoch": 1.0873664362036455, "grad_norm": 1.1654574871063232, "learning_rate": 3.685639008954574e-05, "loss": 0.484, "step": 5190 }, { "epoch": 1.0905091137649277, "grad_norm": 1.2645684480667114, "learning_rate": 3.6781645501621365e-05, "loss": 0.416, "step": 5205 }, { "epoch": 1.0936517913262098, "grad_norm": 1.2940104007720947, "learning_rate": 3.670676528679483e-05, "loss": 0.3892, "step": 5220 }, { "epoch": 1.0967944688874922, "grad_norm": 1.003873586654663, "learning_rate": 3.663175030706557e-05, "loss": 0.4249, "step": 5235 }, { "epoch": 1.0999371464487744, "grad_norm": 1.3847322463989258, "learning_rate": 3.655660142598437e-05, "loss": 0.3728, "step": 5250 }, { "epoch": 1.1030798240100566, "grad_norm": 0.9578964710235596, "learning_rate": 3.648131950864347e-05, "loss": 0.3692, "step": 5265 }, { "epoch": 1.1062225015713387, "grad_norm": 1.3054499626159668, "learning_rate": 3.640590542166656e-05, "loss": 0.3691, "step": 5280 }, { "epoch": 1.109365179132621, "grad_norm": 1.1627558469772339, "learning_rate": 3.633036003319885e-05, "loss": 0.4018, "step": 5295 }, { "epoch": 1.1125078566939033, "grad_norm": 1.445669174194336, "learning_rate": 3.6254684212897035e-05, "loss": 0.4158, "step": 5310 }, { "epoch": 1.1156505342551855, "grad_norm": 0.9246712327003479, "learning_rate": 3.617887883191931e-05, "loss": 0.3393, "step": 5325 }, { "epoch": 1.1187932118164676, "grad_norm": 1.249263882637024, "learning_rate": 3.6102944762915355e-05, "loss": 0.3863, "step": 5340 }, { "epoch": 1.1219358893777498, "grad_norm": 1.1501426696777344, "learning_rate": 3.602688288001624e-05, "loss": 0.403, "step": 5355 }, { "epoch": 1.125078566939032, "grad_norm": 1.2710976600646973, "learning_rate": 3.595069405882441e-05, "loss": 0.4146, "step": 5370 }, { "epoch": 1.1282212445003144, "grad_norm": 1.4132471084594727, "learning_rate": 3.587437917640358e-05, "loss": 0.3891, "step": 5385 }, { "epoch": 1.1313639220615965, "grad_norm": 1.3578236103057861, "learning_rate": 3.5797939111268665e-05, "loss": 0.378, "step": 5400 }, { "epoch": 1.1345065996228787, "grad_norm": 1.1907520294189453, "learning_rate": 3.57213747433756e-05, "loss": 0.379, "step": 5415 }, { "epoch": 1.1376492771841609, "grad_norm": 1.0988811254501343, "learning_rate": 3.5644686954111305e-05, "loss": 0.3431, "step": 5430 }, { "epoch": 1.140791954745443, "grad_norm": 1.3456612825393677, "learning_rate": 3.556787662628347e-05, "loss": 0.3863, "step": 5445 }, { "epoch": 1.1439346323067254, "grad_norm": 1.257224678993225, "learning_rate": 3.549094464411042e-05, "loss": 0.4368, "step": 5460 }, { "epoch": 1.1470773098680076, "grad_norm": 1.4249401092529297, "learning_rate": 3.541389189321092e-05, "loss": 0.4006, "step": 5475 }, { "epoch": 1.1502199874292898, "grad_norm": 1.2512503862380981, "learning_rate": 3.5336719260594e-05, "loss": 0.4137, "step": 5490 }, { "epoch": 1.153362664990572, "grad_norm": 1.3531768321990967, "learning_rate": 3.5259427634648737e-05, "loss": 0.4046, "step": 5505 }, { "epoch": 1.156505342551854, "grad_norm": 0.8420467972755432, "learning_rate": 3.5182017905134e-05, "loss": 0.3743, "step": 5520 }, { "epoch": 1.1596480201131363, "grad_norm": 1.3925787210464478, "learning_rate": 3.5104490963168274e-05, "loss": 0.4171, "step": 5535 }, { "epoch": 1.1627906976744187, "grad_norm": 1.1061654090881348, "learning_rate": 3.502684770121932e-05, "loss": 0.3032, "step": 5550 }, { "epoch": 1.1659333752357008, "grad_norm": 1.4722493886947632, "learning_rate": 3.494908901309396e-05, "loss": 0.3401, "step": 5565 }, { "epoch": 1.169076052796983, "grad_norm": 1.3742226362228394, "learning_rate": 3.487121579392777e-05, "loss": 0.394, "step": 5580 }, { "epoch": 1.1722187303582652, "grad_norm": 0.6497241258621216, "learning_rate": 3.479322894017476e-05, "loss": 0.362, "step": 5595 }, { "epoch": 1.1753614079195476, "grad_norm": 1.2617154121398926, "learning_rate": 3.471512934959709e-05, "loss": 0.3857, "step": 5610 }, { "epoch": 1.1785040854808297, "grad_norm": 1.2584044933319092, "learning_rate": 3.46369179212547e-05, "loss": 0.4159, "step": 5625 }, { "epoch": 1.181646763042112, "grad_norm": 0.9578741788864136, "learning_rate": 3.455859555549498e-05, "loss": 0.4259, "step": 5640 }, { "epoch": 1.184789440603394, "grad_norm": 1.0911635160446167, "learning_rate": 3.448016315394238e-05, "loss": 0.3585, "step": 5655 }, { "epoch": 1.1879321181646763, "grad_norm": 1.2654902935028076, "learning_rate": 3.440162161948809e-05, "loss": 0.3954, "step": 5670 }, { "epoch": 1.1910747957259584, "grad_norm": 1.2683358192443848, "learning_rate": 3.432297185627956e-05, "loss": 0.3946, "step": 5685 }, { "epoch": 1.1942174732872408, "grad_norm": 1.0978072881698608, "learning_rate": 3.424421476971018e-05, "loss": 0.3866, "step": 5700 }, { "epoch": 1.197360150848523, "grad_norm": 1.1124176979064941, "learning_rate": 3.41653512664088e-05, "loss": 0.3547, "step": 5715 }, { "epoch": 1.2005028284098052, "grad_norm": 1.274763584136963, "learning_rate": 3.408638225422928e-05, "loss": 0.3512, "step": 5730 }, { "epoch": 1.2036455059710873, "grad_norm": 1.1088907718658447, "learning_rate": 3.400730864224011e-05, "loss": 0.3982, "step": 5745 }, { "epoch": 1.2067881835323695, "grad_norm": 1.464532494544983, "learning_rate": 3.392813134071388e-05, "loss": 0.3889, "step": 5760 }, { "epoch": 1.2099308610936519, "grad_norm": 1.2237341403961182, "learning_rate": 3.3848851261116845e-05, "loss": 0.433, "step": 5775 }, { "epoch": 1.213073538654934, "grad_norm": 1.3050017356872559, "learning_rate": 3.3769469316098375e-05, "loss": 0.3904, "step": 5790 }, { "epoch": 1.2162162162162162, "grad_norm": 1.3422915935516357, "learning_rate": 3.368998641948052e-05, "loss": 0.3807, "step": 5805 }, { "epoch": 1.2193588937774984, "grad_norm": 1.2591235637664795, "learning_rate": 3.3610403486247436e-05, "loss": 0.3875, "step": 5820 }, { "epoch": 1.2225015713387806, "grad_norm": 1.665328860282898, "learning_rate": 3.353072143253489e-05, "loss": 0.3621, "step": 5835 }, { "epoch": 1.2256442489000627, "grad_norm": 1.1227225065231323, "learning_rate": 3.345094117561967e-05, "loss": 0.4314, "step": 5850 }, { "epoch": 1.2287869264613451, "grad_norm": 1.421695351600647, "learning_rate": 3.337106363390907e-05, "loss": 0.3899, "step": 5865 }, { "epoch": 1.2319296040226273, "grad_norm": 1.3472914695739746, "learning_rate": 3.32910897269303e-05, "loss": 0.4728, "step": 5880 }, { "epoch": 1.2350722815839095, "grad_norm": 1.234174132347107, "learning_rate": 3.321102037531987e-05, "loss": 0.4298, "step": 5895 }, { "epoch": 1.2382149591451916, "grad_norm": 1.3448835611343384, "learning_rate": 3.313085650081307e-05, "loss": 0.3667, "step": 5910 }, { "epoch": 1.241357636706474, "grad_norm": 1.5955106019973755, "learning_rate": 3.305059902623326e-05, "loss": 0.3968, "step": 5925 }, { "epoch": 1.2445003142677562, "grad_norm": 0.8962088823318481, "learning_rate": 3.297024887548134e-05, "loss": 0.3656, "step": 5940 }, { "epoch": 1.2476429918290384, "grad_norm": 1.0347754955291748, "learning_rate": 3.288980697352504e-05, "loss": 0.3872, "step": 5955 }, { "epoch": 1.2507856693903205, "grad_norm": 1.20237135887146, "learning_rate": 3.280927424638832e-05, "loss": 0.338, "step": 5970 }, { "epoch": 1.2539283469516027, "grad_norm": 1.0156171321868896, "learning_rate": 3.272865162114068e-05, "loss": 0.3318, "step": 5985 }, { "epoch": 1.2570710245128849, "grad_norm": 1.4129784107208252, "learning_rate": 3.2647940025886525e-05, "loss": 0.4283, "step": 6000 }, { "epoch": 1.260213702074167, "grad_norm": 1.121748924255371, "learning_rate": 3.256714038975443e-05, "loss": 0.4193, "step": 6015 }, { "epoch": 1.2633563796354494, "grad_norm": 1.0323454141616821, "learning_rate": 3.248625364288648e-05, "loss": 0.4382, "step": 6030 }, { "epoch": 1.2664990571967316, "grad_norm": 1.118606686592102, "learning_rate": 3.240528071642756e-05, "loss": 0.3337, "step": 6045 }, { "epoch": 1.2696417347580138, "grad_norm": 1.1677335500717163, "learning_rate": 3.232422254251463e-05, "loss": 0.4412, "step": 6060 }, { "epoch": 1.2727844123192962, "grad_norm": 1.3037948608398438, "learning_rate": 3.2243080054265994e-05, "loss": 0.4399, "step": 6075 }, { "epoch": 1.2759270898805783, "grad_norm": 1.1724669933319092, "learning_rate": 3.216185418577054e-05, "loss": 0.3618, "step": 6090 }, { "epoch": 1.2790697674418605, "grad_norm": 1.173636794090271, "learning_rate": 3.208054587207703e-05, "loss": 0.3273, "step": 6105 }, { "epoch": 1.2822124450031427, "grad_norm": 1.416745901107788, "learning_rate": 3.1999156049183297e-05, "loss": 0.4196, "step": 6120 }, { "epoch": 1.2853551225644249, "grad_norm": 1.1313838958740234, "learning_rate": 3.191768565402549e-05, "loss": 0.3977, "step": 6135 }, { "epoch": 1.288497800125707, "grad_norm": 1.193344235420227, "learning_rate": 3.1836135624467276e-05, "loss": 0.4304, "step": 6150 }, { "epoch": 1.2916404776869892, "grad_norm": 1.3981118202209473, "learning_rate": 3.175450689928907e-05, "loss": 0.3614, "step": 6165 }, { "epoch": 1.2947831552482716, "grad_norm": 1.1428194046020508, "learning_rate": 3.167280041817717e-05, "loss": 0.4059, "step": 6180 }, { "epoch": 1.2979258328095538, "grad_norm": 1.2573941946029663, "learning_rate": 3.1591017121713027e-05, "loss": 0.3004, "step": 6195 }, { "epoch": 1.301068510370836, "grad_norm": 1.4468852281570435, "learning_rate": 3.150915795136232e-05, "loss": 0.43, "step": 6210 }, { "epoch": 1.304211187932118, "grad_norm": 1.2576549053192139, "learning_rate": 3.14272238494642e-05, "loss": 0.4297, "step": 6225 }, { "epoch": 1.3073538654934005, "grad_norm": 1.1931512355804443, "learning_rate": 3.1345215759220405e-05, "loss": 0.4177, "step": 6240 }, { "epoch": 1.3104965430546827, "grad_norm": 1.3183330297470093, "learning_rate": 3.126313462468438e-05, "loss": 0.3405, "step": 6255 }, { "epoch": 1.3136392206159648, "grad_norm": 1.4701759815216064, "learning_rate": 3.118098139075046e-05, "loss": 0.4108, "step": 6270 }, { "epoch": 1.316781898177247, "grad_norm": 1.1573525667190552, "learning_rate": 3.109875700314296e-05, "loss": 0.3971, "step": 6285 }, { "epoch": 1.3199245757385292, "grad_norm": 1.167579174041748, "learning_rate": 3.1016462408405304e-05, "loss": 0.2966, "step": 6300 }, { "epoch": 1.3230672532998113, "grad_norm": 1.184237003326416, "learning_rate": 3.0934098553889095e-05, "loss": 0.4177, "step": 6315 }, { "epoch": 1.3262099308610937, "grad_norm": 1.4354579448699951, "learning_rate": 3.0851666387743265e-05, "loss": 0.3421, "step": 6330 }, { "epoch": 1.329352608422376, "grad_norm": 1.3448097705841064, "learning_rate": 3.076916685890311e-05, "loss": 0.3851, "step": 6345 }, { "epoch": 1.332495285983658, "grad_norm": 1.4120362997055054, "learning_rate": 3.0686600917079386e-05, "loss": 0.3758, "step": 6360 }, { "epoch": 1.3356379635449402, "grad_norm": 1.4061853885650635, "learning_rate": 3.060396951274739e-05, "loss": 0.4013, "step": 6375 }, { "epoch": 1.3387806411062226, "grad_norm": 0.6553401947021484, "learning_rate": 3.0521273597136e-05, "loss": 0.3807, "step": 6390 }, { "epoch": 1.3419233186675048, "grad_norm": 1.2400474548339844, "learning_rate": 3.0438514122216722e-05, "loss": 0.3544, "step": 6405 }, { "epoch": 1.345065996228787, "grad_norm": 1.2030977010726929, "learning_rate": 3.0355692040692736e-05, "loss": 0.3586, "step": 6420 }, { "epoch": 1.3482086737900691, "grad_norm": 1.2839069366455078, "learning_rate": 3.0272808305987943e-05, "loss": 0.3798, "step": 6435 }, { "epoch": 1.3513513513513513, "grad_norm": 1.0002667903900146, "learning_rate": 3.0189863872235968e-05, "loss": 0.386, "step": 6450 }, { "epoch": 1.3544940289126335, "grad_norm": 1.1636244058609009, "learning_rate": 3.0106859694269196e-05, "loss": 0.4351, "step": 6465 }, { "epoch": 1.3576367064739157, "grad_norm": 0.9394842982292175, "learning_rate": 3.002379672760776e-05, "loss": 0.3461, "step": 6480 }, { "epoch": 1.360779384035198, "grad_norm": 1.2645450830459595, "learning_rate": 2.994067592844856e-05, "loss": 0.3852, "step": 6495 }, { "epoch": 1.3639220615964802, "grad_norm": 1.3446435928344727, "learning_rate": 2.9857498253654232e-05, "loss": 0.3481, "step": 6510 }, { "epoch": 1.3670647391577624, "grad_norm": 1.2624894380569458, "learning_rate": 2.9774264660742164e-05, "loss": 0.3987, "step": 6525 }, { "epoch": 1.3702074167190446, "grad_norm": 1.2067941427230835, "learning_rate": 2.9690976107873453e-05, "loss": 0.3639, "step": 6540 }, { "epoch": 1.373350094280327, "grad_norm": 1.1371479034423828, "learning_rate": 2.960763355384188e-05, "loss": 0.3925, "step": 6555 }, { "epoch": 1.3764927718416091, "grad_norm": 1.0012383460998535, "learning_rate": 2.9524237958062862e-05, "loss": 0.4186, "step": 6570 }, { "epoch": 1.3796354494028913, "grad_norm": 1.0432685613632202, "learning_rate": 2.944079028056243e-05, "loss": 0.3869, "step": 6585 }, { "epoch": 1.3827781269641735, "grad_norm": 1.4123237133026123, "learning_rate": 2.9357291481966155e-05, "loss": 0.4134, "step": 6600 }, { "epoch": 1.3859208045254556, "grad_norm": 1.1969938278198242, "learning_rate": 2.927374252348812e-05, "loss": 0.3821, "step": 6615 }, { "epoch": 1.3890634820867378, "grad_norm": 1.2030854225158691, "learning_rate": 2.9190144366919793e-05, "loss": 0.3853, "step": 6630 }, { "epoch": 1.3922061596480202, "grad_norm": 1.1836553812026978, "learning_rate": 2.9106497974619042e-05, "loss": 0.3595, "step": 6645 }, { "epoch": 1.3953488372093024, "grad_norm": 1.6539838314056396, "learning_rate": 2.9022804309498975e-05, "loss": 0.4392, "step": 6660 }, { "epoch": 1.3984915147705845, "grad_norm": 1.295224666595459, "learning_rate": 2.8939064335016913e-05, "loss": 0.4172, "step": 6675 }, { "epoch": 1.4016341923318667, "grad_norm": 1.1444505453109741, "learning_rate": 2.8855279015163273e-05, "loss": 0.3857, "step": 6690 }, { "epoch": 1.404776869893149, "grad_norm": 1.4091520309448242, "learning_rate": 2.8771449314450466e-05, "loss": 0.4384, "step": 6705 }, { "epoch": 1.4079195474544313, "grad_norm": 0.9858888983726501, "learning_rate": 2.8687576197901812e-05, "loss": 0.342, "step": 6720 }, { "epoch": 1.4110622250157134, "grad_norm": 1.2735475301742554, "learning_rate": 2.860366063104041e-05, "loss": 0.462, "step": 6735 }, { "epoch": 1.4142049025769956, "grad_norm": 1.1398062705993652, "learning_rate": 2.8519703579878053e-05, "loss": 0.4295, "step": 6750 }, { "epoch": 1.4173475801382778, "grad_norm": 1.4460091590881348, "learning_rate": 2.8435706010904085e-05, "loss": 0.3801, "step": 6765 }, { "epoch": 1.42049025769956, "grad_norm": 1.573014736175537, "learning_rate": 2.835166889107425e-05, "loss": 0.4661, "step": 6780 }, { "epoch": 1.4236329352608421, "grad_norm": 1.5855605602264404, "learning_rate": 2.8267593187799633e-05, "loss": 0.3628, "step": 6795 }, { "epoch": 1.4267756128221245, "grad_norm": 1.3220208883285522, "learning_rate": 2.8183479868935466e-05, "loss": 0.3755, "step": 6810 }, { "epoch": 1.4299182903834067, "grad_norm": 1.4992631673812866, "learning_rate": 2.809932990276997e-05, "loss": 0.4043, "step": 6825 }, { "epoch": 1.4330609679446888, "grad_norm": 1.355560302734375, "learning_rate": 2.8015144258013282e-05, "loss": 0.412, "step": 6840 }, { "epoch": 1.436203645505971, "grad_norm": 1.146181583404541, "learning_rate": 2.7930923903786255e-05, "loss": 0.3505, "step": 6855 }, { "epoch": 1.4393463230672534, "grad_norm": 1.8377063274383545, "learning_rate": 2.7846669809609267e-05, "loss": 0.4537, "step": 6870 }, { "epoch": 1.4424890006285356, "grad_norm": 1.4548070430755615, "learning_rate": 2.7762382945391156e-05, "loss": 0.4113, "step": 6885 }, { "epoch": 1.4456316781898177, "grad_norm": 1.3672486543655396, "learning_rate": 2.7678064281417952e-05, "loss": 0.3917, "step": 6900 }, { "epoch": 1.4487743557511, "grad_norm": 1.1587488651275635, "learning_rate": 2.7593714788341795e-05, "loss": 0.3334, "step": 6915 }, { "epoch": 1.451917033312382, "grad_norm": 1.2732610702514648, "learning_rate": 2.7509335437169693e-05, "loss": 0.373, "step": 6930 }, { "epoch": 1.4550597108736643, "grad_norm": 1.458500862121582, "learning_rate": 2.7424927199252364e-05, "loss": 0.3409, "step": 6945 }, { "epoch": 1.4582023884349467, "grad_norm": 1.3266096115112305, "learning_rate": 2.734049104627311e-05, "loss": 0.443, "step": 6960 }, { "epoch": 1.4613450659962288, "grad_norm": 1.0348279476165771, "learning_rate": 2.7256027950236517e-05, "loss": 0.3772, "step": 6975 }, { "epoch": 1.464487743557511, "grad_norm": 1.2738145589828491, "learning_rate": 2.7171538883457396e-05, "loss": 0.364, "step": 6990 }, { "epoch": 1.4676304211187932, "grad_norm": 1.184635877609253, "learning_rate": 2.708702481854947e-05, "loss": 0.3866, "step": 7005 }, { "epoch": 1.4707730986800756, "grad_norm": 1.2299425601959229, "learning_rate": 2.7002486728414283e-05, "loss": 0.3716, "step": 7020 }, { "epoch": 1.4739157762413577, "grad_norm": 1.3776116371154785, "learning_rate": 2.6917925586229897e-05, "loss": 0.402, "step": 7035 }, { "epoch": 1.47705845380264, "grad_norm": 1.3003356456756592, "learning_rate": 2.68333423654398e-05, "loss": 0.3722, "step": 7050 }, { "epoch": 1.480201131363922, "grad_norm": 1.2862930297851562, "learning_rate": 2.67487380397416e-05, "loss": 0.4417, "step": 7065 }, { "epoch": 1.4833438089252042, "grad_norm": 1.116700530052185, "learning_rate": 2.666411358307586e-05, "loss": 0.3577, "step": 7080 }, { "epoch": 1.4864864864864864, "grad_norm": 1.3424625396728516, "learning_rate": 2.657946996961493e-05, "loss": 0.3389, "step": 7095 }, { "epoch": 1.4896291640477686, "grad_norm": 1.3122916221618652, "learning_rate": 2.6494808173751622e-05, "loss": 0.4148, "step": 7110 }, { "epoch": 1.492771841609051, "grad_norm": 0.8987470865249634, "learning_rate": 2.6410129170088115e-05, "loss": 0.387, "step": 7125 }, { "epoch": 1.4959145191703331, "grad_norm": 1.0086872577667236, "learning_rate": 2.6325433933424644e-05, "loss": 0.3495, "step": 7140 }, { "epoch": 1.4990571967316153, "grad_norm": 1.3022773265838623, "learning_rate": 2.6240723438748332e-05, "loss": 0.366, "step": 7155 }, { "epoch": 1.5021998742928977, "grad_norm": 1.324033260345459, "learning_rate": 2.615599866122193e-05, "loss": 0.3845, "step": 7170 }, { "epoch": 1.5053425518541799, "grad_norm": 0.7969958782196045, "learning_rate": 2.6071260576172634e-05, "loss": 0.3597, "step": 7185 }, { "epoch": 1.508485229415462, "grad_norm": 1.2666351795196533, "learning_rate": 2.5986510159080824e-05, "loss": 0.3573, "step": 7200 }, { "epoch": 1.5116279069767442, "grad_norm": 1.4982563257217407, "learning_rate": 2.590174838556881e-05, "loss": 0.3576, "step": 7215 }, { "epoch": 1.5147705845380264, "grad_norm": 1.5081130266189575, "learning_rate": 2.581697623138969e-05, "loss": 0.2803, "step": 7230 }, { "epoch": 1.5179132620993085, "grad_norm": 1.267719030380249, "learning_rate": 2.5732194672416012e-05, "loss": 0.3586, "step": 7245 }, { "epoch": 1.5210559396605907, "grad_norm": 1.1292250156402588, "learning_rate": 2.5647404684628622e-05, "loss": 0.3974, "step": 7260 }, { "epoch": 1.5241986172218729, "grad_norm": 1.3279204368591309, "learning_rate": 2.556260724410538e-05, "loss": 0.3828, "step": 7275 }, { "epoch": 1.5273412947831553, "grad_norm": 1.337803602218628, "learning_rate": 2.5477803327009948e-05, "loss": 0.3692, "step": 7290 }, { "epoch": 1.5304839723444374, "grad_norm": 1.159134030342102, "learning_rate": 2.5392993909580537e-05, "loss": 0.354, "step": 7305 }, { "epoch": 1.5336266499057196, "grad_norm": 1.2121402025222778, "learning_rate": 2.5308179968118677e-05, "loss": 0.4087, "step": 7320 }, { "epoch": 1.536769327467002, "grad_norm": 1.2714091539382935, "learning_rate": 2.522336247897799e-05, "loss": 0.4065, "step": 7335 }, { "epoch": 1.5399120050282842, "grad_norm": 1.128733515739441, "learning_rate": 2.5138542418552913e-05, "loss": 0.3605, "step": 7350 }, { "epoch": 1.5430546825895664, "grad_norm": 1.140023946762085, "learning_rate": 2.5053720763267506e-05, "loss": 0.3573, "step": 7365 }, { "epoch": 1.5461973601508485, "grad_norm": 1.3230198621749878, "learning_rate": 2.4968898489564185e-05, "loss": 0.3182, "step": 7380 }, { "epoch": 1.5493400377121307, "grad_norm": 1.0801093578338623, "learning_rate": 2.4884076573892464e-05, "loss": 0.3523, "step": 7395 }, { "epoch": 1.5524827152734129, "grad_norm": 1.204451084136963, "learning_rate": 2.4799255992697767e-05, "loss": 0.3502, "step": 7410 }, { "epoch": 1.555625392834695, "grad_norm": 1.164306640625, "learning_rate": 2.4714437722410145e-05, "loss": 0.3451, "step": 7425 }, { "epoch": 1.5587680703959774, "grad_norm": 0.8542248606681824, "learning_rate": 2.4629622739433016e-05, "loss": 0.3803, "step": 7440 }, { "epoch": 1.5619107479572596, "grad_norm": 1.2533782720565796, "learning_rate": 2.4544812020132007e-05, "loss": 0.3561, "step": 7455 }, { "epoch": 1.5650534255185418, "grad_norm": 1.3054505586624146, "learning_rate": 2.4460006540823635e-05, "loss": 0.4579, "step": 7470 }, { "epoch": 1.5681961030798242, "grad_norm": 1.4427162408828735, "learning_rate": 2.4375207277764085e-05, "loss": 0.3762, "step": 7485 }, { "epoch": 1.5713387806411063, "grad_norm": 1.1473865509033203, "learning_rate": 2.4290415207137995e-05, "loss": 0.4135, "step": 7500 }, { "epoch": 1.5744814582023885, "grad_norm": 1.0101532936096191, "learning_rate": 2.4205631305047222e-05, "loss": 0.3653, "step": 7515 }, { "epoch": 1.5776241357636707, "grad_norm": 1.428271770477295, "learning_rate": 2.4120856547499564e-05, "loss": 0.386, "step": 7530 }, { "epoch": 1.5807668133249528, "grad_norm": 1.0353528261184692, "learning_rate": 2.4036091910397555e-05, "loss": 0.3912, "step": 7545 }, { "epoch": 1.583909490886235, "grad_norm": 1.2192641496658325, "learning_rate": 2.3951338369527233e-05, "loss": 0.3303, "step": 7560 }, { "epoch": 1.5870521684475172, "grad_norm": 1.2922149896621704, "learning_rate": 2.3866596900546902e-05, "loss": 0.3768, "step": 7575 }, { "epoch": 1.5901948460087993, "grad_norm": 1.3581557273864746, "learning_rate": 2.3781868478975884e-05, "loss": 0.393, "step": 7590 }, { "epoch": 1.5933375235700817, "grad_norm": 1.2488782405853271, "learning_rate": 2.3697154080183308e-05, "loss": 0.3889, "step": 7605 }, { "epoch": 1.596480201131364, "grad_norm": 1.0586172342300415, "learning_rate": 2.3612454679376886e-05, "loss": 0.3639, "step": 7620 }, { "epoch": 1.5996228786926463, "grad_norm": 1.226731300354004, "learning_rate": 2.3527771251591675e-05, "loss": 0.3783, "step": 7635 }, { "epoch": 1.6027655562539285, "grad_norm": 1.4184266328811646, "learning_rate": 2.344310477167883e-05, "loss": 0.4132, "step": 7650 }, { "epoch": 1.6059082338152106, "grad_norm": 1.2709243297576904, "learning_rate": 2.3358456214294456e-05, "loss": 0.3314, "step": 7665 }, { "epoch": 1.6090509113764928, "grad_norm": 1.1103581190109253, "learning_rate": 2.3273826553888294e-05, "loss": 0.3735, "step": 7680 }, { "epoch": 1.612193588937775, "grad_norm": 1.1599838733673096, "learning_rate": 2.3189216764692578e-05, "loss": 0.3968, "step": 7695 }, { "epoch": 1.6153362664990571, "grad_norm": 1.1679604053497314, "learning_rate": 2.3104627820710754e-05, "loss": 0.3501, "step": 7710 }, { "epoch": 1.6184789440603393, "grad_norm": 1.0258073806762695, "learning_rate": 2.302006069570635e-05, "loss": 0.3992, "step": 7725 }, { "epoch": 1.6216216216216215, "grad_norm": 1.1728984117507935, "learning_rate": 2.2935516363191693e-05, "loss": 0.3625, "step": 7740 }, { "epoch": 1.6247642991829039, "grad_norm": 1.3930670022964478, "learning_rate": 2.2850995796416726e-05, "loss": 0.3898, "step": 7755 }, { "epoch": 1.627906976744186, "grad_norm": 0.9263485074043274, "learning_rate": 2.2766499968357834e-05, "loss": 0.3145, "step": 7770 }, { "epoch": 1.6310496543054682, "grad_norm": 1.388420581817627, "learning_rate": 2.2682029851706584e-05, "loss": 0.3849, "step": 7785 }, { "epoch": 1.6341923318667506, "grad_norm": 1.2891064882278442, "learning_rate": 2.2597586418858586e-05, "loss": 0.3998, "step": 7800 }, { "epoch": 1.6373350094280328, "grad_norm": 1.1814244985580444, "learning_rate": 2.251317064190224e-05, "loss": 0.3652, "step": 7815 }, { "epoch": 1.640477686989315, "grad_norm": 1.1944345235824585, "learning_rate": 2.2428783492607638e-05, "loss": 0.3612, "step": 7830 }, { "epoch": 1.6436203645505971, "grad_norm": 0.9002747535705566, "learning_rate": 2.2344425942415258e-05, "loss": 0.3131, "step": 7845 }, { "epoch": 1.6467630421118793, "grad_norm": 1.203361988067627, "learning_rate": 2.2260098962424874e-05, "loss": 0.3476, "step": 7860 }, { "epoch": 1.6499057196731615, "grad_norm": 1.0701284408569336, "learning_rate": 2.2175803523384352e-05, "loss": 0.3972, "step": 7875 }, { "epoch": 1.6530483972344436, "grad_norm": 1.255242943763733, "learning_rate": 2.209154059567843e-05, "loss": 0.4292, "step": 7890 }, { "epoch": 1.6561910747957258, "grad_norm": 1.1037348508834839, "learning_rate": 2.200731114931763e-05, "loss": 0.3782, "step": 7905 }, { "epoch": 1.6593337523570082, "grad_norm": 1.404234528541565, "learning_rate": 2.1923116153927e-05, "loss": 0.3984, "step": 7920 }, { "epoch": 1.6624764299182904, "grad_norm": 1.2808343172073364, "learning_rate": 2.183895657873505e-05, "loss": 0.3551, "step": 7935 }, { "epoch": 1.6656191074795728, "grad_norm": 1.4898031949996948, "learning_rate": 2.1754833392562502e-05, "loss": 0.3651, "step": 7950 }, { "epoch": 1.668761785040855, "grad_norm": 1.1187386512756348, "learning_rate": 2.167074756381119e-05, "loss": 0.3626, "step": 7965 }, { "epoch": 1.671904462602137, "grad_norm": 0.9661749005317688, "learning_rate": 2.1586700060452912e-05, "loss": 0.3337, "step": 7980 }, { "epoch": 1.6750471401634193, "grad_norm": 1.339406967163086, "learning_rate": 2.1502691850018263e-05, "loss": 0.3907, "step": 7995 }, { "epoch": 1.6781898177247014, "grad_norm": 1.0702762603759766, "learning_rate": 2.141872389958551e-05, "loss": 0.3788, "step": 8010 }, { "epoch": 1.6813324952859836, "grad_norm": 1.4297361373901367, "learning_rate": 2.133479717576945e-05, "loss": 0.4034, "step": 8025 }, { "epoch": 1.6844751728472658, "grad_norm": 0.8980254530906677, "learning_rate": 2.1250912644710325e-05, "loss": 0.3243, "step": 8040 }, { "epoch": 1.687617850408548, "grad_norm": 1.4087092876434326, "learning_rate": 2.1167071272062626e-05, "loss": 0.4123, "step": 8055 }, { "epoch": 1.6907605279698303, "grad_norm": 1.134097933769226, "learning_rate": 2.108327402298404e-05, "loss": 0.3734, "step": 8070 }, { "epoch": 1.6939032055311125, "grad_norm": 1.1244763135910034, "learning_rate": 2.099952186212429e-05, "loss": 0.3626, "step": 8085 }, { "epoch": 1.6970458830923947, "grad_norm": 1.1340084075927734, "learning_rate": 2.091581575361411e-05, "loss": 0.3261, "step": 8100 }, { "epoch": 1.700188560653677, "grad_norm": 1.2386656999588013, "learning_rate": 2.0832156661054036e-05, "loss": 0.3485, "step": 8115 }, { "epoch": 1.7033312382149592, "grad_norm": 1.6566152572631836, "learning_rate": 2.074854554750339e-05, "loss": 0.3902, "step": 8130 }, { "epoch": 1.7064739157762414, "grad_norm": 1.209065556526184, "learning_rate": 2.06649833754692e-05, "loss": 0.4162, "step": 8145 }, { "epoch": 1.7096165933375236, "grad_norm": 1.2372878789901733, "learning_rate": 2.0581471106895043e-05, "loss": 0.3521, "step": 8160 }, { "epoch": 1.7127592708988058, "grad_norm": 1.2591501474380493, "learning_rate": 2.0498009703150063e-05, "loss": 0.3496, "step": 8175 }, { "epoch": 1.715901948460088, "grad_norm": 1.1610863208770752, "learning_rate": 2.0414600125017834e-05, "loss": 0.407, "step": 8190 }, { "epoch": 1.71904462602137, "grad_norm": 1.165305495262146, "learning_rate": 2.0331243332685367e-05, "loss": 0.4154, "step": 8205 }, { "epoch": 1.7221873035826523, "grad_norm": 0.9598828554153442, "learning_rate": 2.024794028573197e-05, "loss": 0.3947, "step": 8220 }, { "epoch": 1.7253299811439347, "grad_norm": 1.2426929473876953, "learning_rate": 2.0164691943118283e-05, "loss": 0.3481, "step": 8235 }, { "epoch": 1.7284726587052168, "grad_norm": 0.9565463066101074, "learning_rate": 2.00814992631752e-05, "loss": 0.3251, "step": 8250 }, { "epoch": 1.7316153362664992, "grad_norm": 1.1574795246124268, "learning_rate": 1.9998363203592836e-05, "loss": 0.374, "step": 8265 }, { "epoch": 1.7347580138277814, "grad_norm": 1.3719727993011475, "learning_rate": 1.9915284721409506e-05, "loss": 0.4395, "step": 8280 }, { "epoch": 1.7379006913890636, "grad_norm": 1.21462082862854, "learning_rate": 1.983226477300071e-05, "loss": 0.3879, "step": 8295 }, { "epoch": 1.7410433689503457, "grad_norm": 1.2950128316879272, "learning_rate": 1.974930431406815e-05, "loss": 0.3903, "step": 8310 }, { "epoch": 1.744186046511628, "grad_norm": 0.568601131439209, "learning_rate": 1.966640429962867e-05, "loss": 0.3608, "step": 8325 }, { "epoch": 1.74732872407291, "grad_norm": 1.234540343284607, "learning_rate": 1.9583565684003294e-05, "loss": 0.3574, "step": 8340 }, { "epoch": 1.7504714016341922, "grad_norm": 1.170241355895996, "learning_rate": 1.9500789420806274e-05, "loss": 0.3476, "step": 8355 }, { "epoch": 1.7536140791954744, "grad_norm": 1.1727917194366455, "learning_rate": 1.9418076462934057e-05, "loss": 0.3825, "step": 8370 }, { "epoch": 1.7567567567567568, "grad_norm": 1.1901155710220337, "learning_rate": 1.933542776255432e-05, "loss": 0.3182, "step": 8385 }, { "epoch": 1.759899434318039, "grad_norm": 1.3078737258911133, "learning_rate": 1.9252844271095056e-05, "loss": 0.3766, "step": 8400 }, { "epoch": 1.7630421118793211, "grad_norm": 1.255685567855835, "learning_rate": 1.917032693923359e-05, "loss": 0.4278, "step": 8415 }, { "epoch": 1.7661847894406035, "grad_norm": 1.2631891965866089, "learning_rate": 1.908787671688561e-05, "loss": 0.3988, "step": 8430 }, { "epoch": 1.7693274670018857, "grad_norm": 1.0149579048156738, "learning_rate": 1.9005494553194277e-05, "loss": 0.3164, "step": 8445 }, { "epoch": 1.7724701445631679, "grad_norm": 1.2755389213562012, "learning_rate": 1.892318139651929e-05, "loss": 0.3699, "step": 8460 }, { "epoch": 1.77561282212445, "grad_norm": 1.3909375667572021, "learning_rate": 1.884093819442595e-05, "loss": 0.3975, "step": 8475 }, { "epoch": 1.7787554996857322, "grad_norm": 1.3214746713638306, "learning_rate": 1.8758765893674242e-05, "loss": 0.385, "step": 8490 }, { "epoch": 1.7818981772470144, "grad_norm": 1.1242390871047974, "learning_rate": 1.867666544020798e-05, "loss": 0.3882, "step": 8505 }, { "epoch": 1.7850408548082966, "grad_norm": 1.41203773021698, "learning_rate": 1.8594637779143895e-05, "loss": 0.4134, "step": 8520 }, { "epoch": 1.7881835323695787, "grad_norm": 1.1696633100509644, "learning_rate": 1.851268385476074e-05, "loss": 0.3835, "step": 8535 }, { "epoch": 1.7913262099308611, "grad_norm": 1.27289879322052, "learning_rate": 1.8430804610488423e-05, "loss": 0.3411, "step": 8550 }, { "epoch": 1.7944688874921433, "grad_norm": 1.1815760135650635, "learning_rate": 1.8349000988897183e-05, "loss": 0.3953, "step": 8565 }, { "epoch": 1.7976115650534257, "grad_norm": 0.9872913956642151, "learning_rate": 1.8267273931686697e-05, "loss": 0.3807, "step": 8580 }, { "epoch": 1.8001257071024512, "eval_accuracy": 0.8832372290913474, "eval_loss": 0.4148283004760742, "eval_runtime": 1191.4012, "eval_samples_per_second": 4.016, "eval_steps_per_second": 1.005, "step": 8592 }, { "epoch": 1.8007542426147078, "grad_norm": 1.2675862312316895, "learning_rate": 1.818562437967525e-05, "loss": 0.4136, "step": 8595 }, { "epoch": 1.80389692017599, "grad_norm": 1.2914496660232544, "learning_rate": 1.8104053272788912e-05, "loss": 0.3426, "step": 8610 }, { "epoch": 1.8070395977372722, "grad_norm": 0.8845340609550476, "learning_rate": 1.802256155005073e-05, "loss": 0.3796, "step": 8625 }, { "epoch": 1.8101822752985544, "grad_norm": 1.2812376022338867, "learning_rate": 1.79411501495699e-05, "loss": 0.3813, "step": 8640 }, { "epoch": 1.8133249528598365, "grad_norm": 1.479176640510559, "learning_rate": 1.7859820008530943e-05, "loss": 0.347, "step": 8655 }, { "epoch": 1.8164676304211187, "grad_norm": 1.5261151790618896, "learning_rate": 1.7778572063182976e-05, "loss": 0.3942, "step": 8670 }, { "epoch": 1.8196103079824009, "grad_norm": 1.0050832033157349, "learning_rate": 1.76974072488289e-05, "loss": 0.3831, "step": 8685 }, { "epoch": 1.8227529855436833, "grad_norm": 0.8978458046913147, "learning_rate": 1.761632649981462e-05, "loss": 0.4253, "step": 8700 }, { "epoch": 1.8258956631049654, "grad_norm": 1.3533804416656494, "learning_rate": 1.753533074951831e-05, "loss": 0.4012, "step": 8715 }, { "epoch": 1.8290383406662476, "grad_norm": 1.2724169492721558, "learning_rate": 1.7454420930339676e-05, "loss": 0.4422, "step": 8730 }, { "epoch": 1.83218101822753, "grad_norm": 1.2476907968521118, "learning_rate": 1.737359797368921e-05, "loss": 0.3421, "step": 8745 }, { "epoch": 1.8353236957888122, "grad_norm": 1.1641727685928345, "learning_rate": 1.7292862809977432e-05, "loss": 0.3912, "step": 8760 }, { "epoch": 1.8384663733500943, "grad_norm": 1.0571367740631104, "learning_rate": 1.7212216368604264e-05, "loss": 0.3262, "step": 8775 }, { "epoch": 1.8416090509113765, "grad_norm": 1.1409281492233276, "learning_rate": 1.7131659577948254e-05, "loss": 0.4101, "step": 8790 }, { "epoch": 1.8447517284726587, "grad_norm": 1.1299269199371338, "learning_rate": 1.7051193365355926e-05, "loss": 0.4095, "step": 8805 }, { "epoch": 1.8478944060339408, "grad_norm": 1.0926958322525024, "learning_rate": 1.697081865713108e-05, "loss": 0.3668, "step": 8820 }, { "epoch": 1.851037083595223, "grad_norm": 1.262511968612671, "learning_rate": 1.689053637852417e-05, "loss": 0.3699, "step": 8835 }, { "epoch": 1.8541797611565052, "grad_norm": 0.9396837949752808, "learning_rate": 1.681034745372161e-05, "loss": 0.3793, "step": 8850 }, { "epoch": 1.8573224387177876, "grad_norm": 1.3683308362960815, "learning_rate": 1.6730252805835145e-05, "loss": 0.3633, "step": 8865 }, { "epoch": 1.8604651162790697, "grad_norm": 1.2032579183578491, "learning_rate": 1.6650253356891247e-05, "loss": 0.3644, "step": 8880 }, { "epoch": 1.8636077938403521, "grad_norm": 1.1967633962631226, "learning_rate": 1.6570350027820485e-05, "loss": 0.3737, "step": 8895 }, { "epoch": 1.8667504714016343, "grad_norm": 1.4144322872161865, "learning_rate": 1.6490543738446927e-05, "loss": 0.3816, "step": 8910 }, { "epoch": 1.8698931489629165, "grad_norm": 1.4581791162490845, "learning_rate": 1.6410835407477513e-05, "loss": 0.3189, "step": 8925 }, { "epoch": 1.8730358265241986, "grad_norm": 1.2554900646209717, "learning_rate": 1.6331225952491557e-05, "loss": 0.3555, "step": 8940 }, { "epoch": 1.8761785040854808, "grad_norm": 1.4458445310592651, "learning_rate": 1.6251716289930134e-05, "loss": 0.4001, "step": 8955 }, { "epoch": 1.879321181646763, "grad_norm": 1.4509528875350952, "learning_rate": 1.6172307335085512e-05, "loss": 0.4032, "step": 8970 }, { "epoch": 1.8824638592080452, "grad_norm": 1.3516335487365723, "learning_rate": 1.6093000002090657e-05, "loss": 0.4087, "step": 8985 }, { "epoch": 1.8856065367693273, "grad_norm": 1.1090672016143799, "learning_rate": 1.6013795203908703e-05, "loss": 0.3573, "step": 9000 }, { "epoch": 1.8887492143306097, "grad_norm": 1.2857966423034668, "learning_rate": 1.593469385232243e-05, "loss": 0.4204, "step": 9015 }, { "epoch": 1.8918918918918919, "grad_norm": 1.1753884553909302, "learning_rate": 1.5855696857923738e-05, "loss": 0.4041, "step": 9030 }, { "epoch": 1.895034569453174, "grad_norm": 1.3764643669128418, "learning_rate": 1.577680513010325e-05, "loss": 0.3901, "step": 9045 }, { "epoch": 1.8981772470144564, "grad_norm": 1.2634403705596924, "learning_rate": 1.569801957703975e-05, "loss": 0.3669, "step": 9060 }, { "epoch": 1.9013199245757386, "grad_norm": 1.501197338104248, "learning_rate": 1.5619341105689793e-05, "loss": 0.3875, "step": 9075 }, { "epoch": 1.9044626021370208, "grad_norm": 1.1498409509658813, "learning_rate": 1.5540770621777213e-05, "loss": 0.3769, "step": 9090 }, { "epoch": 1.907605279698303, "grad_norm": 1.2901723384857178, "learning_rate": 1.5462309029782756e-05, "loss": 0.4069, "step": 9105 }, { "epoch": 1.9107479572595851, "grad_norm": 1.2987323999404907, "learning_rate": 1.5383957232933623e-05, "loss": 0.3264, "step": 9120 }, { "epoch": 1.9138906348208673, "grad_norm": 1.0844594240188599, "learning_rate": 1.5305716133193056e-05, "loss": 0.352, "step": 9135 }, { "epoch": 1.9170333123821495, "grad_norm": 1.4493502378463745, "learning_rate": 1.5227586631250047e-05, "loss": 0.4362, "step": 9150 }, { "epoch": 1.9201759899434316, "grad_norm": 1.2252168655395508, "learning_rate": 1.5149569626508848e-05, "loss": 0.3463, "step": 9165 }, { "epoch": 1.923318667504714, "grad_norm": 1.2073407173156738, "learning_rate": 1.5071666017078705e-05, "loss": 0.3452, "step": 9180 }, { "epoch": 1.9264613450659962, "grad_norm": 0.9203445315361023, "learning_rate": 1.4993876699763467e-05, "loss": 0.3588, "step": 9195 }, { "epoch": 1.9296040226272786, "grad_norm": 1.270068645477295, "learning_rate": 1.4916202570051319e-05, "loss": 0.3777, "step": 9210 }, { "epoch": 1.9327467001885608, "grad_norm": 1.1798357963562012, "learning_rate": 1.4838644522104416e-05, "loss": 0.3975, "step": 9225 }, { "epoch": 1.935889377749843, "grad_norm": 1.4530518054962158, "learning_rate": 1.476120344874861e-05, "loss": 0.4299, "step": 9240 }, { "epoch": 1.939032055311125, "grad_norm": 1.449532151222229, "learning_rate": 1.4683880241463197e-05, "loss": 0.4051, "step": 9255 }, { "epoch": 1.9421747328724073, "grad_norm": 1.4117298126220703, "learning_rate": 1.460667579037061e-05, "loss": 0.3639, "step": 9270 }, { "epoch": 1.9453174104336894, "grad_norm": 1.2169469594955444, "learning_rate": 1.452959098422621e-05, "loss": 0.357, "step": 9285 }, { "epoch": 1.9484600879949716, "grad_norm": 1.243122935295105, "learning_rate": 1.4452626710408017e-05, "loss": 0.3618, "step": 9300 }, { "epoch": 1.9516027655562538, "grad_norm": 1.175661563873291, "learning_rate": 1.4375783854906555e-05, "loss": 0.3524, "step": 9315 }, { "epoch": 1.9547454431175362, "grad_norm": 1.468005895614624, "learning_rate": 1.4299063302314597e-05, "loss": 0.3667, "step": 9330 }, { "epoch": 1.9578881206788183, "grad_norm": 1.145400047302246, "learning_rate": 1.4222465935816975e-05, "loss": 0.4047, "step": 9345 }, { "epoch": 1.9610307982401005, "grad_norm": 1.3986377716064453, "learning_rate": 1.4145992637180492e-05, "loss": 0.3254, "step": 9360 }, { "epoch": 1.964173475801383, "grad_norm": 1.3191365003585815, "learning_rate": 1.4069644286743669e-05, "loss": 0.3564, "step": 9375 }, { "epoch": 1.967316153362665, "grad_norm": 1.48728346824646, "learning_rate": 1.3993421763406672e-05, "loss": 0.3196, "step": 9390 }, { "epoch": 1.9704588309239472, "grad_norm": 1.3215950727462769, "learning_rate": 1.3917325944621195e-05, "loss": 0.3826, "step": 9405 }, { "epoch": 1.9736015084852294, "grad_norm": 1.3539785146713257, "learning_rate": 1.3841357706380348e-05, "loss": 0.392, "step": 9420 }, { "epoch": 1.9767441860465116, "grad_norm": 1.0365345478057861, "learning_rate": 1.3765517923208554e-05, "loss": 0.3862, "step": 9435 }, { "epoch": 1.9798868636077938, "grad_norm": 1.2735167741775513, "learning_rate": 1.3689807468151491e-05, "loss": 0.372, "step": 9450 }, { "epoch": 1.983029541169076, "grad_norm": 1.4106998443603516, "learning_rate": 1.3614227212766079e-05, "loss": 0.3768, "step": 9465 }, { "epoch": 1.9861722187303583, "grad_norm": 1.568157434463501, "learning_rate": 1.3538778027110402e-05, "loss": 0.3453, "step": 9480 }, { "epoch": 1.9893148962916405, "grad_norm": 1.4247443675994873, "learning_rate": 1.3463460779733706e-05, "loss": 0.407, "step": 9495 }, { "epoch": 1.9924575738529227, "grad_norm": 1.2098503112792969, "learning_rate": 1.3388276337666384e-05, "loss": 0.3444, "step": 9510 }, { "epoch": 1.995600251414205, "grad_norm": 1.054401159286499, "learning_rate": 1.3313225566410042e-05, "loss": 0.3342, "step": 9525 }, { "epoch": 1.9987429289754872, "grad_norm": 1.186824917793274, "learning_rate": 1.3238309329927511e-05, "loss": 0.3322, "step": 9540 }, { "epoch": 2.0018856065367694, "grad_norm": 1.0764572620391846, "learning_rate": 1.3163528490632854e-05, "loss": 0.3444, "step": 9555 }, { "epoch": 2.0050282840980516, "grad_norm": 1.051069974899292, "learning_rate": 1.3088883909381531e-05, "loss": 0.2928, "step": 9570 }, { "epoch": 2.0081709616593337, "grad_norm": 1.2765467166900635, "learning_rate": 1.3014376445460391e-05, "loss": 0.303, "step": 9585 }, { "epoch": 2.011313639220616, "grad_norm": 0.9927627444267273, "learning_rate": 1.2940006956577871e-05, "loss": 0.2736, "step": 9600 }, { "epoch": 2.014456316781898, "grad_norm": 1.6037464141845703, "learning_rate": 1.2865776298854043e-05, "loss": 0.2862, "step": 9615 }, { "epoch": 2.0175989943431802, "grad_norm": 1.486846923828125, "learning_rate": 1.2791685326810826e-05, "loss": 0.3303, "step": 9630 }, { "epoch": 2.0207416719044624, "grad_norm": 1.5033382177352905, "learning_rate": 1.2717734893362102e-05, "loss": 0.273, "step": 9645 }, { "epoch": 2.023884349465745, "grad_norm": 1.7398715019226074, "learning_rate": 1.2643925849803895e-05, "loss": 0.3412, "step": 9660 }, { "epoch": 2.027027027027027, "grad_norm": 1.2956515550613403, "learning_rate": 1.2570259045804628e-05, "loss": 0.371, "step": 9675 }, { "epoch": 2.0301697045883094, "grad_norm": 1.6283161640167236, "learning_rate": 1.2496735329395286e-05, "loss": 0.3437, "step": 9690 }, { "epoch": 2.0333123821495915, "grad_norm": 1.208808183670044, "learning_rate": 1.2423355546959664e-05, "loss": 0.3402, "step": 9705 }, { "epoch": 2.0364550597108737, "grad_norm": 1.0130226612091064, "learning_rate": 1.2350120543224625e-05, "loss": 0.3091, "step": 9720 }, { "epoch": 2.039597737272156, "grad_norm": 1.4891202449798584, "learning_rate": 1.2277031161250398e-05, "loss": 0.3595, "step": 9735 }, { "epoch": 2.042740414833438, "grad_norm": 1.399242877960205, "learning_rate": 1.2204088242420866e-05, "loss": 0.2866, "step": 9750 }, { "epoch": 2.04588309239472, "grad_norm": 1.6362804174423218, "learning_rate": 1.2131292626433843e-05, "loss": 0.3116, "step": 9765 }, { "epoch": 2.0490257699560024, "grad_norm": 1.3457330465316772, "learning_rate": 1.2058645151291436e-05, "loss": 0.3473, "step": 9780 }, { "epoch": 2.0521684475172846, "grad_norm": 1.0016905069351196, "learning_rate": 1.198614665329042e-05, "loss": 0.3299, "step": 9795 }, { "epoch": 2.0553111250785667, "grad_norm": 1.6363437175750732, "learning_rate": 1.1913797967012585e-05, "loss": 0.2997, "step": 9810 }, { "epoch": 2.0584538026398493, "grad_norm": 1.3227770328521729, "learning_rate": 1.1841599925315106e-05, "loss": 0.312, "step": 9825 }, { "epoch": 2.0615964802011315, "grad_norm": 1.6865644454956055, "learning_rate": 1.1769553359321017e-05, "loss": 0.2977, "step": 9840 }, { "epoch": 2.0647391577624137, "grad_norm": 1.7184381484985352, "learning_rate": 1.169765909840957e-05, "loss": 0.2997, "step": 9855 }, { "epoch": 2.067881835323696, "grad_norm": 1.0318830013275146, "learning_rate": 1.1625917970206759e-05, "loss": 0.3017, "step": 9870 }, { "epoch": 2.071024512884978, "grad_norm": 1.549784779548645, "learning_rate": 1.155433080057573e-05, "loss": 0.3203, "step": 9885 }, { "epoch": 2.07416719044626, "grad_norm": 1.5676542520523071, "learning_rate": 1.1482898413607333e-05, "loss": 0.3512, "step": 9900 }, { "epoch": 2.0773098680075424, "grad_norm": 1.68881356716156, "learning_rate": 1.1411621631610575e-05, "loss": 0.3201, "step": 9915 }, { "epoch": 2.0804525455688245, "grad_norm": 1.3327656984329224, "learning_rate": 1.1340501275103178e-05, "loss": 0.3129, "step": 9930 }, { "epoch": 2.0835952231301067, "grad_norm": 1.5713459253311157, "learning_rate": 1.1269538162802196e-05, "loss": 0.3212, "step": 9945 }, { "epoch": 2.086737900691389, "grad_norm": 1.3707289695739746, "learning_rate": 1.1198733111614474e-05, "loss": 0.2978, "step": 9960 }, { "epoch": 2.0898805782526715, "grad_norm": 1.3866550922393799, "learning_rate": 1.1128086936627321e-05, "loss": 0.353, "step": 9975 }, { "epoch": 2.0930232558139537, "grad_norm": 1.3355560302734375, "learning_rate": 1.1057600451099104e-05, "loss": 0.2947, "step": 9990 }, { "epoch": 2.096165933375236, "grad_norm": 1.3299508094787598, "learning_rate": 1.0987274466449907e-05, "loss": 0.2719, "step": 10005 }, { "epoch": 2.099308610936518, "grad_norm": 1.4944045543670654, "learning_rate": 1.0917109792252173e-05, "loss": 0.3074, "step": 10020 }, { "epoch": 2.1024512884978, "grad_norm": 1.238981008529663, "learning_rate": 1.084710723622136e-05, "loss": 0.3253, "step": 10035 }, { "epoch": 2.1055939660590823, "grad_norm": 1.7395031452178955, "learning_rate": 1.0777267604206703e-05, "loss": 0.3404, "step": 10050 }, { "epoch": 2.1087366436203645, "grad_norm": 1.597024917602539, "learning_rate": 1.0707591700181874e-05, "loss": 0.3362, "step": 10065 }, { "epoch": 2.1118793211816467, "grad_norm": 1.5733188390731812, "learning_rate": 1.0638080326235777e-05, "loss": 0.3694, "step": 10080 }, { "epoch": 2.115021998742929, "grad_norm": 1.2697248458862305, "learning_rate": 1.0568734282563272e-05, "loss": 0.3231, "step": 10095 }, { "epoch": 2.118164676304211, "grad_norm": 1.410846471786499, "learning_rate": 1.049955436745601e-05, "loss": 0.3175, "step": 10110 }, { "epoch": 2.121307353865493, "grad_norm": 1.4120702743530273, "learning_rate": 1.0430541377293191e-05, "loss": 0.3534, "step": 10125 }, { "epoch": 2.124450031426776, "grad_norm": 1.8276065587997437, "learning_rate": 1.0361696106532442e-05, "loss": 0.3332, "step": 10140 }, { "epoch": 2.127592708988058, "grad_norm": 1.6806981563568115, "learning_rate": 1.0293019347700658e-05, "loss": 0.2967, "step": 10155 }, { "epoch": 2.13073538654934, "grad_norm": 2.0087246894836426, "learning_rate": 1.0224511891384853e-05, "loss": 0.3439, "step": 10170 }, { "epoch": 2.1338780641106223, "grad_norm": 1.5151036977767944, "learning_rate": 1.015617452622309e-05, "loss": 0.3344, "step": 10185 }, { "epoch": 2.1370207416719045, "grad_norm": 1.1880221366882324, "learning_rate": 1.008800803889537e-05, "loss": 0.2934, "step": 10200 }, { "epoch": 2.1401634192331866, "grad_norm": 1.1785838603973389, "learning_rate": 1.0020013214114657e-05, "loss": 0.3163, "step": 10215 }, { "epoch": 2.143306096794469, "grad_norm": 1.2505255937576294, "learning_rate": 9.952190834617728e-06, "loss": 0.3166, "step": 10230 }, { "epoch": 2.146448774355751, "grad_norm": 2.049252510070801, "learning_rate": 9.884541681156226e-06, "loss": 0.3077, "step": 10245 }, { "epoch": 2.149591451917033, "grad_norm": 1.616794466972351, "learning_rate": 9.817066532487701e-06, "loss": 0.3077, "step": 10260 }, { "epoch": 2.1527341294783153, "grad_norm": 1.339815378189087, "learning_rate": 9.749766165366567e-06, "loss": 0.3528, "step": 10275 }, { "epoch": 2.155876807039598, "grad_norm": 1.4637688398361206, "learning_rate": 9.682641354535244e-06, "loss": 0.3619, "step": 10290 }, { "epoch": 2.15901948460088, "grad_norm": 1.2227802276611328, "learning_rate": 9.615692872715154e-06, "loss": 0.3413, "step": 10305 }, { "epoch": 2.1621621621621623, "grad_norm": 1.7328176498413086, "learning_rate": 9.548921490597917e-06, "loss": 0.3127, "step": 10320 }, { "epoch": 2.1653048397234445, "grad_norm": 1.122909665107727, "learning_rate": 9.482327976836392e-06, "loss": 0.2989, "step": 10335 }, { "epoch": 2.1684475172847266, "grad_norm": 1.163944959640503, "learning_rate": 9.415913098035895e-06, "loss": 0.3264, "step": 10350 }, { "epoch": 2.171590194846009, "grad_norm": 1.4139958620071411, "learning_rate": 9.349677618745347e-06, "loss": 0.2845, "step": 10365 }, { "epoch": 2.174732872407291, "grad_norm": 1.749042272567749, "learning_rate": 9.28362230144846e-06, "loss": 0.3336, "step": 10380 }, { "epoch": 2.177875549968573, "grad_norm": 1.489220142364502, "learning_rate": 9.217747906554969e-06, "loss": 0.299, "step": 10395 }, { "epoch": 2.1810182275298553, "grad_norm": 1.2497318983078003, "learning_rate": 9.152055192391903e-06, "loss": 0.2956, "step": 10410 }, { "epoch": 2.1841609050911375, "grad_norm": 1.4486489295959473, "learning_rate": 9.086544915194831e-06, "loss": 0.3065, "step": 10425 }, { "epoch": 2.1873035826524196, "grad_norm": 1.4671967029571533, "learning_rate": 9.021217829099143e-06, "loss": 0.3275, "step": 10440 }, { "epoch": 2.1904462602137023, "grad_norm": 1.387172818183899, "learning_rate": 8.956074686131396e-06, "loss": 0.2766, "step": 10455 }, { "epoch": 2.1935889377749844, "grad_norm": 1.0154411792755127, "learning_rate": 8.89111623620065e-06, "loss": 0.3188, "step": 10470 }, { "epoch": 2.1967316153362666, "grad_norm": 1.452532172203064, "learning_rate": 8.826343227089843e-06, "loss": 0.3148, "step": 10485 }, { "epoch": 2.1998742928975488, "grad_norm": 1.309695839881897, "learning_rate": 8.761756404447144e-06, "loss": 0.2735, "step": 10500 }, { "epoch": 2.203016970458831, "grad_norm": 1.652197003364563, "learning_rate": 8.69735651177741e-06, "loss": 0.3238, "step": 10515 }, { "epoch": 2.206159648020113, "grad_norm": 1.330776572227478, "learning_rate": 8.633144290433629e-06, "loss": 0.3433, "step": 10530 }, { "epoch": 2.2093023255813953, "grad_norm": 1.5660831928253174, "learning_rate": 8.56912047960834e-06, "loss": 0.3275, "step": 10545 }, { "epoch": 2.2124450031426774, "grad_norm": 1.1177830696105957, "learning_rate": 8.50528581632519e-06, "loss": 0.3697, "step": 10560 }, { "epoch": 2.2155876807039596, "grad_norm": 1.4742639064788818, "learning_rate": 8.441641035430381e-06, "loss": 0.3099, "step": 10575 }, { "epoch": 2.218730358265242, "grad_norm": 1.505416750907898, "learning_rate": 8.378186869584275e-06, "loss": 0.33, "step": 10590 }, { "epoch": 2.2218730358265244, "grad_norm": 1.5553947687149048, "learning_rate": 8.314924049252895e-06, "loss": 0.3302, "step": 10605 }, { "epoch": 2.2250157133878066, "grad_norm": 1.5330064296722412, "learning_rate": 8.251853302699578e-06, "loss": 0.3387, "step": 10620 }, { "epoch": 2.2281583909490887, "grad_norm": 1.2511600255966187, "learning_rate": 8.188975355976557e-06, "loss": 0.2764, "step": 10635 }, { "epoch": 2.231301068510371, "grad_norm": 1.3672597408294678, "learning_rate": 8.126290932916599e-06, "loss": 0.3554, "step": 10650 }, { "epoch": 2.234443746071653, "grad_norm": 1.28493332862854, "learning_rate": 8.06380075512468e-06, "loss": 0.3377, "step": 10665 }, { "epoch": 2.2375864236329353, "grad_norm": 1.5767827033996582, "learning_rate": 8.001505541969698e-06, "loss": 0.328, "step": 10680 }, { "epoch": 2.2407291011942174, "grad_norm": 1.3858174085617065, "learning_rate": 7.939406010576167e-06, "loss": 0.2975, "step": 10695 }, { "epoch": 2.2438717787554996, "grad_norm": 1.6385616064071655, "learning_rate": 7.877502875815961e-06, "loss": 0.3297, "step": 10710 }, { "epoch": 2.2470144563167818, "grad_norm": 1.4886940717697144, "learning_rate": 7.815796850300095e-06, "loss": 0.3159, "step": 10725 }, { "epoch": 2.250157133878064, "grad_norm": 1.1138700246810913, "learning_rate": 7.754288644370528e-06, "loss": 0.336, "step": 10740 }, { "epoch": 2.253299811439346, "grad_norm": 1.5991181135177612, "learning_rate": 7.692978966091977e-06, "loss": 0.3252, "step": 10755 }, { "epoch": 2.2564424890006287, "grad_norm": 1.1452405452728271, "learning_rate": 7.631868521243757e-06, "loss": 0.316, "step": 10770 }, { "epoch": 2.259585166561911, "grad_norm": 1.069392204284668, "learning_rate": 7.57095801331166e-06, "loss": 0.3167, "step": 10785 }, { "epoch": 2.262727844123193, "grad_norm": 1.717702865600586, "learning_rate": 7.510248143479876e-06, "loss": 0.3426, "step": 10800 }, { "epoch": 2.2658705216844752, "grad_norm": 1.7524367570877075, "learning_rate": 7.4497396106229134e-06, "loss": 0.3732, "step": 10815 }, { "epoch": 2.2690131992457574, "grad_norm": 1.937584638595581, "learning_rate": 7.38943311129752e-06, "loss": 0.3333, "step": 10830 }, { "epoch": 2.2721558768070396, "grad_norm": 1.3948473930358887, "learning_rate": 7.329329339734722e-06, "loss": 0.3149, "step": 10845 }, { "epoch": 2.2752985543683217, "grad_norm": 1.588791012763977, "learning_rate": 7.269428987831783e-06, "loss": 0.3433, "step": 10860 }, { "epoch": 2.278441231929604, "grad_norm": 1.2459790706634521, "learning_rate": 7.209732745144254e-06, "loss": 0.2659, "step": 10875 }, { "epoch": 2.281583909490886, "grad_norm": 1.0872770547866821, "learning_rate": 7.150241298878055e-06, "loss": 0.2956, "step": 10890 }, { "epoch": 2.2847265870521687, "grad_norm": 1.6503065824508667, "learning_rate": 7.090955333881555e-06, "loss": 0.3258, "step": 10905 }, { "epoch": 2.287869264613451, "grad_norm": 1.3506873846054077, "learning_rate": 7.0318755326376576e-06, "loss": 0.2789, "step": 10920 }, { "epoch": 2.291011942174733, "grad_norm": 1.3215200901031494, "learning_rate": 6.973002575255974e-06, "loss": 0.3325, "step": 10935 }, { "epoch": 2.294154619736015, "grad_norm": 1.4247123003005981, "learning_rate": 6.914337139465004e-06, "loss": 0.3329, "step": 10950 }, { "epoch": 2.2972972972972974, "grad_norm": 1.0532140731811523, "learning_rate": 6.85587990060432e-06, "loss": 0.2541, "step": 10965 }, { "epoch": 2.3004399748585795, "grad_norm": 1.6737048625946045, "learning_rate": 6.797631531616769e-06, "loss": 0.3642, "step": 10980 }, { "epoch": 2.3035826524198617, "grad_norm": 1.2676361799240112, "learning_rate": 6.739592703040759e-06, "loss": 0.2897, "step": 10995 }, { "epoch": 2.306725329981144, "grad_norm": 1.5627233982086182, "learning_rate": 6.681764083002534e-06, "loss": 0.3278, "step": 11010 }, { "epoch": 2.309868007542426, "grad_norm": 1.7141146659851074, "learning_rate": 6.624146337208484e-06, "loss": 0.3139, "step": 11025 }, { "epoch": 2.313010685103708, "grad_norm": 1.1188994646072388, "learning_rate": 6.566740128937451e-06, "loss": 0.295, "step": 11040 }, { "epoch": 2.3161533626649904, "grad_norm": 1.5478028059005737, "learning_rate": 6.509546119033152e-06, "loss": 0.3149, "step": 11055 }, { "epoch": 2.3192960402262726, "grad_norm": 1.1058639287948608, "learning_rate": 6.4525649658965045e-06, "loss": 0.274, "step": 11070 }, { "epoch": 2.322438717787555, "grad_norm": 1.5267043113708496, "learning_rate": 6.395797325478106e-06, "loss": 0.3099, "step": 11085 }, { "epoch": 2.3255813953488373, "grad_norm": 1.4159321784973145, "learning_rate": 6.339243851270635e-06, "loss": 0.3495, "step": 11100 }, { "epoch": 2.3287240729101195, "grad_norm": 1.2933320999145508, "learning_rate": 6.282905194301375e-06, "loss": 0.2708, "step": 11115 }, { "epoch": 2.3318667504714017, "grad_norm": 1.9966567754745483, "learning_rate": 6.226782003124676e-06, "loss": 0.2899, "step": 11130 }, { "epoch": 2.335009428032684, "grad_norm": 1.3963077068328857, "learning_rate": 6.170874923814499e-06, "loss": 0.3259, "step": 11145 }, { "epoch": 2.338152105593966, "grad_norm": 1.3655591011047363, "learning_rate": 6.115184599957033e-06, "loss": 0.289, "step": 11160 }, { "epoch": 2.341294783155248, "grad_norm": 1.4125938415527344, "learning_rate": 6.059711672643195e-06, "loss": 0.291, "step": 11175 }, { "epoch": 2.3444374607165304, "grad_norm": 2.017850875854492, "learning_rate": 6.004456780461315e-06, "loss": 0.3044, "step": 11190 }, { "epoch": 2.3475801382778125, "grad_norm": 1.441328525543213, "learning_rate": 5.949420559489752e-06, "loss": 0.3245, "step": 11205 }, { "epoch": 2.350722815839095, "grad_norm": 1.799134373664856, "learning_rate": 5.894603643289601e-06, "loss": 0.3593, "step": 11220 }, { "epoch": 2.3538654934003773, "grad_norm": 1.8016554117202759, "learning_rate": 5.840006662897388e-06, "loss": 0.2787, "step": 11235 }, { "epoch": 2.3570081709616595, "grad_norm": 1.4649808406829834, "learning_rate": 5.785630246817781e-06, "loss": 0.3168, "step": 11250 }, { "epoch": 2.3601508485229417, "grad_norm": 1.3161333799362183, "learning_rate": 5.731475021016383e-06, "loss": 0.3732, "step": 11265 }, { "epoch": 2.363293526084224, "grad_norm": 1.663887858390808, "learning_rate": 5.677541608912526e-06, "loss": 0.2998, "step": 11280 }, { "epoch": 2.366436203645506, "grad_norm": 1.439397931098938, "learning_rate": 5.623830631372087e-06, "loss": 0.3206, "step": 11295 }, { "epoch": 2.369578881206788, "grad_norm": 1.6403486728668213, "learning_rate": 5.570342706700324e-06, "loss": 0.3565, "step": 11310 }, { "epoch": 2.3727215587680703, "grad_norm": 1.6395245790481567, "learning_rate": 5.517078450634799e-06, "loss": 0.294, "step": 11325 }, { "epoch": 2.3758642363293525, "grad_norm": 1.496952772140503, "learning_rate": 5.464038476338237e-06, "loss": 0.2963, "step": 11340 }, { "epoch": 2.3790069138906347, "grad_norm": 1.9148141145706177, "learning_rate": 5.411223394391529e-06, "loss": 0.3353, "step": 11355 }, { "epoch": 2.382149591451917, "grad_norm": 1.4077427387237549, "learning_rate": 5.3586338127866396e-06, "loss": 0.3174, "step": 11370 }, { "epoch": 2.385292269013199, "grad_norm": 1.5252655744552612, "learning_rate": 5.306270336919661e-06, "loss": 0.3134, "step": 11385 }, { "epoch": 2.3884349465744816, "grad_norm": 1.5777688026428223, "learning_rate": 5.254133569583808e-06, "loss": 0.3309, "step": 11400 }, { "epoch": 2.391577624135764, "grad_norm": 1.7088990211486816, "learning_rate": 5.2022241109624805e-06, "loss": 0.2441, "step": 11415 }, { "epoch": 2.394720301697046, "grad_norm": 1.7140231132507324, "learning_rate": 5.150542558622415e-06, "loss": 0.3053, "step": 11430 }, { "epoch": 2.397862979258328, "grad_norm": 3.6586174964904785, "learning_rate": 5.099089507506705e-06, "loss": 0.3079, "step": 11445 }, { "epoch": 2.4010056568196103, "grad_norm": 1.752259612083435, "learning_rate": 5.047865549928024e-06, "loss": 0.324, "step": 11460 }, { "epoch": 2.4041483343808925, "grad_norm": 1.5753651857376099, "learning_rate": 4.996871275561779e-06, "loss": 0.3128, "step": 11475 }, { "epoch": 2.4072910119421747, "grad_norm": 1.9012105464935303, "learning_rate": 4.946107271439343e-06, "loss": 0.3764, "step": 11490 }, { "epoch": 2.410433689503457, "grad_norm": 1.4729382991790771, "learning_rate": 4.895574121941285e-06, "loss": 0.2755, "step": 11505 }, { "epoch": 2.413576367064739, "grad_norm": 1.4175302982330322, "learning_rate": 4.845272408790621e-06, "loss": 0.3121, "step": 11520 }, { "epoch": 2.4167190446260216, "grad_norm": 1.7722225189208984, "learning_rate": 4.795202711046168e-06, "loss": 0.2744, "step": 11535 }, { "epoch": 2.4198617221873038, "grad_norm": 1.4909186363220215, "learning_rate": 4.74536560509582e-06, "loss": 0.3025, "step": 11550 }, { "epoch": 2.423004399748586, "grad_norm": 1.8246691226959229, "learning_rate": 4.695761664649964e-06, "loss": 0.3324, "step": 11565 }, { "epoch": 2.426147077309868, "grad_norm": 1.7963186502456665, "learning_rate": 4.646391460734837e-06, "loss": 0.3575, "step": 11580 }, { "epoch": 2.4292897548711503, "grad_norm": 1.5770527124404907, "learning_rate": 4.5972555616859816e-06, "loss": 0.2908, "step": 11595 }, { "epoch": 2.4324324324324325, "grad_norm": 1.617647409439087, "learning_rate": 4.548354533141677e-06, "loss": 0.2994, "step": 11610 }, { "epoch": 2.4355751099937146, "grad_norm": 1.745650291442871, "learning_rate": 4.49968893803645e-06, "loss": 0.3361, "step": 11625 }, { "epoch": 2.438717787554997, "grad_norm": 1.0638154745101929, "learning_rate": 4.451259336594596e-06, "loss": 0.3368, "step": 11640 }, { "epoch": 2.441860465116279, "grad_norm": 1.482951045036316, "learning_rate": 4.403066286323693e-06, "loss": 0.3004, "step": 11655 }, { "epoch": 2.445003142677561, "grad_norm": 1.4275717735290527, "learning_rate": 4.355110342008231e-06, "loss": 0.2826, "step": 11670 }, { "epoch": 2.4481458202388433, "grad_norm": 1.4426920413970947, "learning_rate": 4.307392055703182e-06, "loss": 0.2944, "step": 11685 }, { "epoch": 2.4512884978001255, "grad_norm": 1.5074379444122314, "learning_rate": 4.259911976727712e-06, "loss": 0.3222, "step": 11700 }, { "epoch": 2.454431175361408, "grad_norm": 1.3746048212051392, "learning_rate": 4.212670651658768e-06, "loss": 0.317, "step": 11715 }, { "epoch": 2.4575738529226903, "grad_norm": 1.6050078868865967, "learning_rate": 4.165668624324845e-06, "loss": 0.3172, "step": 11730 }, { "epoch": 2.4607165304839724, "grad_norm": 1.2552024126052856, "learning_rate": 4.118906435799724e-06, "loss": 0.2816, "step": 11745 }, { "epoch": 2.4638592080452546, "grad_norm": 1.3392716646194458, "learning_rate": 4.0723846243962084e-06, "loss": 0.3155, "step": 11760 }, { "epoch": 2.4670018856065368, "grad_norm": 1.5874278545379639, "learning_rate": 4.026103725659977e-06, "loss": 0.2603, "step": 11775 }, { "epoch": 2.470144563167819, "grad_norm": 1.235484004020691, "learning_rate": 3.980064272363362e-06, "loss": 0.2499, "step": 11790 }, { "epoch": 2.473287240729101, "grad_norm": 1.6743351221084595, "learning_rate": 3.934266794499275e-06, "loss": 0.3402, "step": 11805 }, { "epoch": 2.4764299182903833, "grad_norm": 1.4384301900863647, "learning_rate": 3.888711819275048e-06, "loss": 0.3176, "step": 11820 }, { "epoch": 2.4795725958516655, "grad_norm": 1.4185879230499268, "learning_rate": 3.84339987110641e-06, "loss": 0.3183, "step": 11835 }, { "epoch": 2.482715273412948, "grad_norm": 1.382876992225647, "learning_rate": 3.7983314716114384e-06, "loss": 0.3044, "step": 11850 }, { "epoch": 2.4858579509742302, "grad_norm": 1.7051907777786255, "learning_rate": 3.7535071396045286e-06, "loss": 0.3701, "step": 11865 }, { "epoch": 2.4890006285355124, "grad_norm": 1.6134312152862549, "learning_rate": 3.708927391090447e-06, "loss": 0.2941, "step": 11880 }, { "epoch": 2.4921433060967946, "grad_norm": 1.5831973552703857, "learning_rate": 3.664592739258399e-06, "loss": 0.33, "step": 11895 }, { "epoch": 2.4952859836580767, "grad_norm": 1.5520756244659424, "learning_rate": 3.6205036944761045e-06, "loss": 0.3087, "step": 11910 }, { "epoch": 2.498428661219359, "grad_norm": 1.497530460357666, "learning_rate": 3.5766607642839093e-06, "loss": 0.3003, "step": 11925 }, { "epoch": 2.501571338780641, "grad_norm": 1.3204107284545898, "learning_rate": 3.5330644533889705e-06, "loss": 0.284, "step": 11940 }, { "epoch": 2.5047140163419233, "grad_norm": 1.4598573446273804, "learning_rate": 3.489715263659435e-06, "loss": 0.2783, "step": 11955 }, { "epoch": 2.5078566939032054, "grad_norm": 1.5349574089050293, "learning_rate": 3.4466136941186724e-06, "loss": 0.2826, "step": 11970 }, { "epoch": 2.5109993714644876, "grad_norm": 1.3122080564498901, "learning_rate": 3.403760240939502e-06, "loss": 0.2675, "step": 11985 }, { "epoch": 2.5141420490257698, "grad_norm": 1.218714714050293, "learning_rate": 3.361155397438501e-06, "loss": 0.3582, "step": 12000 }, { "epoch": 2.517284726587052, "grad_norm": 1.8126921653747559, "learning_rate": 3.3187996540703424e-06, "loss": 0.2697, "step": 12015 }, { "epoch": 2.520427404148334, "grad_norm": 1.4559165239334106, "learning_rate": 3.276693498422104e-06, "loss": 0.3061, "step": 12030 }, { "epoch": 2.5235700817096167, "grad_norm": 1.0276938676834106, "learning_rate": 3.234837415207706e-06, "loss": 0.3437, "step": 12045 }, { "epoch": 2.526712759270899, "grad_norm": 1.4260108470916748, "learning_rate": 3.193231886262288e-06, "loss": 0.282, "step": 12060 }, { "epoch": 2.529855436832181, "grad_norm": 1.7475075721740723, "learning_rate": 3.1518773905366976e-06, "loss": 0.3306, "step": 12075 }, { "epoch": 2.5329981143934632, "grad_norm": 1.1481621265411377, "learning_rate": 3.1107744040919427e-06, "loss": 0.2692, "step": 12090 }, { "epoch": 2.5361407919547454, "grad_norm": 1.8862768411636353, "learning_rate": 3.0699234000937464e-06, "loss": 0.332, "step": 12105 }, { "epoch": 2.5392834695160276, "grad_norm": 1.4870737791061401, "learning_rate": 3.0293248488070745e-06, "loss": 0.3344, "step": 12120 }, { "epoch": 2.5424261470773097, "grad_norm": 1.7676063776016235, "learning_rate": 2.9889792175907318e-06, "loss": 0.3323, "step": 12135 }, { "epoch": 2.5455688246385924, "grad_norm": 1.3961862325668335, "learning_rate": 2.9488869708919674e-06, "loss": 0.3279, "step": 12150 }, { "epoch": 2.5487115021998745, "grad_norm": 1.2494407892227173, "learning_rate": 2.9090485702411603e-06, "loss": 0.3043, "step": 12165 }, { "epoch": 2.5518541797611567, "grad_norm": 2.1194069385528564, "learning_rate": 2.869464474246483e-06, "loss": 0.3251, "step": 12180 }, { "epoch": 2.554996857322439, "grad_norm": 1.5678242444992065, "learning_rate": 2.8301351385886214e-06, "loss": 0.3134, "step": 12195 }, { "epoch": 2.558139534883721, "grad_norm": 1.7995771169662476, "learning_rate": 2.7910610160155256e-06, "loss": 0.3218, "step": 12210 }, { "epoch": 2.561282212445003, "grad_norm": 1.077495813369751, "learning_rate": 2.7522425563372202e-06, "loss": 0.2961, "step": 12225 }, { "epoch": 2.5644248900062854, "grad_norm": 1.7993483543395996, "learning_rate": 2.7136802064206157e-06, "loss": 0.3097, "step": 12240 }, { "epoch": 2.5675675675675675, "grad_norm": 1.5372523069381714, "learning_rate": 2.675374410184345e-06, "loss": 0.2836, "step": 12255 }, { "epoch": 2.5707102451288497, "grad_norm": 1.4500757455825806, "learning_rate": 2.6373256085936742e-06, "loss": 0.3154, "step": 12270 }, { "epoch": 2.573852922690132, "grad_norm": 1.4548457860946655, "learning_rate": 2.5995342396554325e-06, "loss": 0.3113, "step": 12285 }, { "epoch": 2.576995600251414, "grad_norm": 1.9645068645477295, "learning_rate": 2.562000738412945e-06, "loss": 0.3444, "step": 12300 }, { "epoch": 2.5801382778126962, "grad_norm": 1.7881463766098022, "learning_rate": 2.5247255369410418e-06, "loss": 0.2974, "step": 12315 }, { "epoch": 2.5832809553739784, "grad_norm": 1.7925788164138794, "learning_rate": 2.4877090643410927e-06, "loss": 0.2944, "step": 12330 }, { "epoch": 2.586423632935261, "grad_norm": 1.5786759853363037, "learning_rate": 2.4509517467360356e-06, "loss": 0.3785, "step": 12345 }, { "epoch": 2.589566310496543, "grad_norm": 1.4962717294692993, "learning_rate": 2.4144540072654987e-06, "loss": 0.3267, "step": 12360 }, { "epoch": 2.5927089880578253, "grad_norm": 1.163743257522583, "learning_rate": 2.378216266080929e-06, "loss": 0.2757, "step": 12375 }, { "epoch": 2.5958516656191075, "grad_norm": 1.7964270114898682, "learning_rate": 2.342238940340746e-06, "loss": 0.2904, "step": 12390 }, { "epoch": 2.5989943431803897, "grad_norm": 1.7889028787612915, "learning_rate": 2.3065224442055333e-06, "loss": 0.3064, "step": 12405 }, { "epoch": 2.602137020741672, "grad_norm": 1.5097829103469849, "learning_rate": 2.271067188833281e-06, "loss": 0.3401, "step": 12420 }, { "epoch": 2.605279698302954, "grad_norm": 1.4333211183547974, "learning_rate": 2.235873582374659e-06, "loss": 0.2794, "step": 12435 }, { "epoch": 2.608422375864236, "grad_norm": 1.2477611303329468, "learning_rate": 2.200942029968309e-06, "loss": 0.2935, "step": 12450 }, { "epoch": 2.611565053425519, "grad_norm": 1.7559458017349243, "learning_rate": 2.166272933736177e-06, "loss": 0.3258, "step": 12465 }, { "epoch": 2.614707730986801, "grad_norm": 1.6621719598770142, "learning_rate": 2.1318666927788834e-06, "loss": 0.3111, "step": 12480 }, { "epoch": 2.617850408548083, "grad_norm": 1.6579554080963135, "learning_rate": 2.0977237031711506e-06, "loss": 0.2611, "step": 12495 }, { "epoch": 2.6209930861093653, "grad_norm": 1.7369964122772217, "learning_rate": 2.063844357957223e-06, "loss": 0.3577, "step": 12510 }, { "epoch": 2.6241357636706475, "grad_norm": 1.6332292556762695, "learning_rate": 2.0302290471463314e-06, "loss": 0.2942, "step": 12525 }, { "epoch": 2.6272784412319297, "grad_norm": 1.5578200817108154, "learning_rate": 1.996878157708243e-06, "loss": 0.2695, "step": 12540 }, { "epoch": 2.630421118793212, "grad_norm": 1.5188201665878296, "learning_rate": 1.963792073568757e-06, "loss": 0.3078, "step": 12555 }, { "epoch": 2.633563796354494, "grad_norm": 1.8250635862350464, "learning_rate": 1.9309711756053367e-06, "loss": 0.3146, "step": 12570 }, { "epoch": 2.636706473915776, "grad_norm": 1.7131030559539795, "learning_rate": 1.8984158416426728e-06, "loss": 0.3182, "step": 12585 }, { "epoch": 2.6398491514770583, "grad_norm": 1.473404884338379, "learning_rate": 1.8661264464483852e-06, "loss": 0.2727, "step": 12600 }, { "epoch": 2.6429918290383405, "grad_norm": 1.508779764175415, "learning_rate": 1.8341033617286645e-06, "loss": 0.3448, "step": 12615 }, { "epoch": 2.6461345065996227, "grad_norm": 1.147560477256775, "learning_rate": 1.8023469561240126e-06, "loss": 0.2783, "step": 12630 }, { "epoch": 2.649277184160905, "grad_norm": 1.760060429573059, "learning_rate": 1.770857595205011e-06, "loss": 0.3152, "step": 12645 }, { "epoch": 2.6524198617221875, "grad_norm": 1.4739596843719482, "learning_rate": 1.7396356414680959e-06, "loss": 0.29, "step": 12660 }, { "epoch": 2.6555625392834696, "grad_norm": 1.567877173423767, "learning_rate": 1.7086814543313816e-06, "loss": 0.2672, "step": 12675 }, { "epoch": 2.658705216844752, "grad_norm": 1.3326002359390259, "learning_rate": 1.6779953901305295e-06, "loss": 0.251, "step": 12690 }, { "epoch": 2.661847894406034, "grad_norm": 1.3788151741027832, "learning_rate": 1.647577802114661e-06, "loss": 0.3416, "step": 12705 }, { "epoch": 2.664990571967316, "grad_norm": 1.7790052890777588, "learning_rate": 1.6174290404422726e-06, "loss": 0.2999, "step": 12720 }, { "epoch": 2.6681332495285983, "grad_norm": 1.4312305450439453, "learning_rate": 1.5875494521771922e-06, "loss": 0.3305, "step": 12735 }, { "epoch": 2.6712759270898805, "grad_norm": 1.6938543319702148, "learning_rate": 1.5579393812846316e-06, "loss": 0.3117, "step": 12750 }, { "epoch": 2.6744186046511627, "grad_norm": 1.5854291915893555, "learning_rate": 1.528599168627165e-06, "loss": 0.3289, "step": 12765 }, { "epoch": 2.6775612822124453, "grad_norm": 1.1590096950531006, "learning_rate": 1.4995291519608602e-06, "loss": 0.283, "step": 12780 }, { "epoch": 2.6807039597737274, "grad_norm": 1.068301796913147, "learning_rate": 1.470729665931353e-06, "loss": 0.331, "step": 12795 }, { "epoch": 2.6838466373350096, "grad_norm": 1.2185308933258057, "learning_rate": 1.4422010420700182e-06, "loss": 0.3014, "step": 12810 }, { "epoch": 2.686989314896292, "grad_norm": 1.4308061599731445, "learning_rate": 1.413943608790133e-06, "loss": 0.2939, "step": 12825 }, { "epoch": 2.690131992457574, "grad_norm": 1.1259864568710327, "learning_rate": 1.385957691383119e-06, "loss": 0.2669, "step": 12840 }, { "epoch": 2.693274670018856, "grad_norm": 1.5093046426773071, "learning_rate": 1.3582436120147729e-06, "loss": 0.3374, "step": 12855 }, { "epoch": 2.6964173475801383, "grad_norm": 1.3771803379058838, "learning_rate": 1.3308016897215807e-06, "loss": 0.2783, "step": 12870 }, { "epoch": 2.6995600251414205, "grad_norm": 2.384852409362793, "learning_rate": 1.3036322404070296e-06, "loss": 0.3162, "step": 12885 }, { "epoch": 2.700188560653677, "eval_accuracy": 0.8853943711763073, "eval_loss": 0.4137997329235077, "eval_runtime": 1196.9935, "eval_samples_per_second": 3.998, "eval_steps_per_second": 1.0, "step": 12888 }, { "epoch": 2.7027027027027026, "grad_norm": 1.673790693283081, "learning_rate": 1.2767355768379702e-06, "loss": 0.2855, "step": 12900 }, { "epoch": 2.705845380263985, "grad_norm": 1.8752899169921875, "learning_rate": 1.2501120086410411e-06, "loss": 0.3085, "step": 12915 }, { "epoch": 2.708988057825267, "grad_norm": 1.8645318746566772, "learning_rate": 1.2237618422990733e-06, "loss": 0.3068, "step": 12930 }, { "epoch": 2.712130735386549, "grad_norm": 1.9585272073745728, "learning_rate": 1.1976853811475675e-06, "loss": 0.3283, "step": 12945 }, { "epoch": 2.7152734129478313, "grad_norm": 1.7527602910995483, "learning_rate": 1.1718829253712204e-06, "loss": 0.3222, "step": 12960 }, { "epoch": 2.718416090509114, "grad_norm": 1.3966923952102661, "learning_rate": 1.1463547720004546e-06, "loss": 0.3092, "step": 12975 }, { "epoch": 2.721558768070396, "grad_norm": 1.3295458555221558, "learning_rate": 1.1211012149080074e-06, "loss": 0.3237, "step": 12990 }, { "epoch": 2.7247014456316783, "grad_norm": 0.9988710284233093, "learning_rate": 1.0961225448055307e-06, "loss": 0.3216, "step": 13005 }, { "epoch": 2.7278441231929604, "grad_norm": 1.6158466339111328, "learning_rate": 1.0714190492402715e-06, "loss": 0.3017, "step": 13020 }, { "epoch": 2.7309868007542426, "grad_norm": 1.4756746292114258, "learning_rate": 1.0469910125917358e-06, "loss": 0.3169, "step": 13035 }, { "epoch": 2.7341294783155248, "grad_norm": 1.3889656066894531, "learning_rate": 1.0228387160684333e-06, "loss": 0.3754, "step": 13050 }, { "epoch": 2.737272155876807, "grad_norm": 1.2530293464660645, "learning_rate": 9.989624377046258e-07, "loss": 0.2958, "step": 13065 }, { "epoch": 2.740414833438089, "grad_norm": 1.8963161706924438, "learning_rate": 9.753624523571425e-07, "loss": 0.3641, "step": 13080 }, { "epoch": 2.7435575109993717, "grad_norm": 1.4623044729232788, "learning_rate": 9.520390317021955e-07, "loss": 0.3061, "step": 13095 }, { "epoch": 2.746700188560654, "grad_norm": 1.604202151298523, "learning_rate": 9.289924442322767e-07, "loss": 0.2785, "step": 13110 }, { "epoch": 2.749842866121936, "grad_norm": 1.8192863464355469, "learning_rate": 9.062229552530471e-07, "loss": 0.3169, "step": 13125 }, { "epoch": 2.7529855436832182, "grad_norm": 1.419291377067566, "learning_rate": 8.83730826880294e-07, "loss": 0.3015, "step": 13140 }, { "epoch": 2.7561282212445004, "grad_norm": 1.5753535032272339, "learning_rate": 8.615163180369035e-07, "loss": 0.284, "step": 13155 }, { "epoch": 2.7592708988057826, "grad_norm": 1.789189338684082, "learning_rate": 8.395796844498815e-07, "loss": 0.3423, "step": 13170 }, { "epoch": 2.7624135763670647, "grad_norm": 1.343781590461731, "learning_rate": 8.17921178647435e-07, "loss": 0.3119, "step": 13185 }, { "epoch": 2.765556253928347, "grad_norm": 1.652388572692871, "learning_rate": 7.96541049956026e-07, "loss": 0.3219, "step": 13200 }, { "epoch": 2.768698931489629, "grad_norm": 1.597399353981018, "learning_rate": 7.754395444975221e-07, "loss": 0.2873, "step": 13215 }, { "epoch": 2.7718416090509113, "grad_norm": 1.3452566862106323, "learning_rate": 7.546169051863672e-07, "loss": 0.3125, "step": 13230 }, { "epoch": 2.7749842866121934, "grad_norm": 1.605913758277893, "learning_rate": 7.340733717267678e-07, "loss": 0.278, "step": 13245 }, { "epoch": 2.7781269641734756, "grad_norm": 1.465397596359253, "learning_rate": 7.138091806099589e-07, "loss": 0.3208, "step": 13260 }, { "epoch": 2.7812696417347578, "grad_norm": 1.7374017238616943, "learning_rate": 6.938245651114506e-07, "loss": 0.2933, "step": 13275 }, { "epoch": 2.7844123192960404, "grad_norm": 1.9815653562545776, "learning_rate": 6.741197552883771e-07, "loss": 0.3335, "step": 13290 }, { "epoch": 2.7875549968573226, "grad_norm": 1.4085747003555298, "learning_rate": 6.546949779768136e-07, "loss": 0.2711, "step": 13305 }, { "epoch": 2.7906976744186047, "grad_norm": 1.6339495182037354, "learning_rate": 6.355504567891912e-07, "loss": 0.3331, "step": 13320 }, { "epoch": 2.793840351979887, "grad_norm": 1.441635251045227, "learning_rate": 6.166864121117167e-07, "loss": 0.3628, "step": 13335 }, { "epoch": 2.796983029541169, "grad_norm": 1.4819507598876953, "learning_rate": 5.981030611018234e-07, "loss": 0.2825, "step": 13350 }, { "epoch": 2.8001257071024512, "grad_norm": 1.5747650861740112, "learning_rate": 5.798006176856802e-07, "loss": 0.3144, "step": 13365 }, { "epoch": 2.8032683846637334, "grad_norm": 1.4870857000350952, "learning_rate": 5.617792925557363e-07, "loss": 0.3289, "step": 13380 }, { "epoch": 2.8064110622250156, "grad_norm": 1.7161614894866943, "learning_rate": 5.440392931682859e-07, "loss": 0.3379, "step": 13395 }, { "epoch": 2.809553739786298, "grad_norm": 0.8529698848724365, "learning_rate": 5.265808237410824e-07, "loss": 0.3143, "step": 13410 }, { "epoch": 2.8126964173475804, "grad_norm": 1.6342661380767822, "learning_rate": 5.094040852509779e-07, "loss": 0.3144, "step": 13425 }, { "epoch": 2.8158390949088625, "grad_norm": 1.4123117923736572, "learning_rate": 4.925092754316352e-07, "loss": 0.3407, "step": 13440 }, { "epoch": 2.8189817724701447, "grad_norm": 1.3898142576217651, "learning_rate": 4.7589658877122967e-07, "loss": 0.3385, "step": 13455 }, { "epoch": 2.822124450031427, "grad_norm": 1.6428829431533813, "learning_rate": 4.5956621651020994e-07, "loss": 0.2963, "step": 13470 }, { "epoch": 2.825267127592709, "grad_norm": 1.465915322303772, "learning_rate": 4.4351834663910465e-07, "loss": 0.3302, "step": 13485 }, { "epoch": 2.828409805153991, "grad_norm": 1.8282034397125244, "learning_rate": 4.277531638963689e-07, "loss": 0.3171, "step": 13500 }, { "epoch": 2.8315524827152734, "grad_norm": 2.015639305114746, "learning_rate": 4.122708497662275e-07, "loss": 0.3633, "step": 13515 }, { "epoch": 2.8346951602765555, "grad_norm": 1.0915390253067017, "learning_rate": 3.97071582476613e-07, "loss": 0.3, "step": 13530 }, { "epoch": 2.8378378378378377, "grad_norm": 0.9291322827339172, "learning_rate": 3.821555369971086e-07, "loss": 0.3471, "step": 13545 }, { "epoch": 2.84098051539912, "grad_norm": 1.6048222780227661, "learning_rate": 3.6752288503691945e-07, "loss": 0.3209, "step": 13560 }, { "epoch": 2.844123192960402, "grad_norm": 1.6999403238296509, "learning_rate": 3.5317379504291316e-07, "loss": 0.3446, "step": 13575 }, { "epoch": 2.8472658705216842, "grad_norm": 2.1094415187835693, "learning_rate": 3.391084321976656e-07, "loss": 0.3502, "step": 13590 }, { "epoch": 2.850408548082967, "grad_norm": 1.3436388969421387, "learning_rate": 3.2532695841758496e-07, "loss": 0.3167, "step": 13605 }, { "epoch": 2.853551225644249, "grad_norm": 1.470632553100586, "learning_rate": 3.118295323510101e-07, "loss": 0.3063, "step": 13620 }, { "epoch": 2.856693903205531, "grad_norm": 1.0371286869049072, "learning_rate": 2.9861630937641494e-07, "loss": 0.3034, "step": 13635 }, { "epoch": 2.8598365807668134, "grad_norm": 1.7494783401489258, "learning_rate": 2.8568744160061e-07, "loss": 0.2834, "step": 13650 }, { "epoch": 2.8629792583280955, "grad_norm": 1.5144836902618408, "learning_rate": 2.730430778569909e-07, "loss": 0.3142, "step": 13665 }, { "epoch": 2.8661219358893777, "grad_norm": 1.8125107288360596, "learning_rate": 2.606833637038231e-07, "loss": 0.3513, "step": 13680 }, { "epoch": 2.86926461345066, "grad_norm": 1.099411129951477, "learning_rate": 2.4860844142256257e-07, "loss": 0.3025, "step": 13695 }, { "epoch": 2.872407291011942, "grad_norm": 1.8955268859863281, "learning_rate": 2.3681845001623515e-07, "loss": 0.3418, "step": 13710 }, { "epoch": 2.8755499685732246, "grad_norm": 1.2657068967819214, "learning_rate": 2.2531352520781535e-07, "loss": 0.2709, "step": 13725 }, { "epoch": 2.878692646134507, "grad_norm": 1.8179534673690796, "learning_rate": 2.140937994386777e-07, "loss": 0.3291, "step": 13740 }, { "epoch": 2.881835323695789, "grad_norm": 1.7901382446289062, "learning_rate": 2.031594018670674e-07, "loss": 0.3132, "step": 13755 }, { "epoch": 2.884978001257071, "grad_norm": 1.1521648168563843, "learning_rate": 1.9251045836661263e-07, "loss": 0.2764, "step": 13770 }, { "epoch": 2.8881206788183533, "grad_norm": 1.2185838222503662, "learning_rate": 1.8214709152487575e-07, "loss": 0.3465, "step": 13785 }, { "epoch": 2.8912633563796355, "grad_norm": 1.640515685081482, "learning_rate": 1.720694206419432e-07, "loss": 0.315, "step": 13800 }, { "epoch": 2.8944060339409177, "grad_norm": 1.314355731010437, "learning_rate": 1.6227756172905729e-07, "loss": 0.2685, "step": 13815 }, { "epoch": 2.8975487115022, "grad_norm": 1.2538273334503174, "learning_rate": 1.527716275072699e-07, "loss": 0.3432, "step": 13830 }, { "epoch": 2.900691389063482, "grad_norm": 1.3175392150878906, "learning_rate": 1.435517274061493e-07, "loss": 0.2969, "step": 13845 }, { "epoch": 2.903834066624764, "grad_norm": 1.512818694114685, "learning_rate": 1.346179675625253e-07, "loss": 0.2804, "step": 13860 }, { "epoch": 2.9069767441860463, "grad_norm": 1.2288899421691895, "learning_rate": 1.2597045081926551e-07, "loss": 0.3092, "step": 13875 }, { "epoch": 2.9101194217473285, "grad_norm": 1.157689094543457, "learning_rate": 1.1760927672408161e-07, "loss": 0.3075, "step": 13890 }, { "epoch": 2.9132620993086107, "grad_norm": 1.6113057136535645, "learning_rate": 1.0953454152839993e-07, "loss": 0.3319, "step": 13905 }, { "epoch": 2.9164047768698933, "grad_norm": 1.4615386724472046, "learning_rate": 1.0174633818623991e-07, "loss": 0.306, "step": 13920 }, { "epoch": 2.9195474544311755, "grad_norm": 1.0442296266555786, "learning_rate": 9.424475635315122e-08, "loss": 0.3057, "step": 13935 }, { "epoch": 2.9226901319924576, "grad_norm": 1.2906923294067383, "learning_rate": 8.702988238517562e-08, "loss": 0.2989, "step": 13950 }, { "epoch": 2.92583280955374, "grad_norm": 1.6215356588363647, "learning_rate": 8.010179933786167e-08, "loss": 0.324, "step": 13965 }, { "epoch": 2.928975487115022, "grad_norm": 1.602383017539978, "learning_rate": 7.346058696530156e-08, "loss": 0.381, "step": 13980 }, { "epoch": 2.932118164676304, "grad_norm": 1.5103670358657837, "learning_rate": 6.710632171921527e-08, "loss": 0.3379, "step": 13995 }, { "epoch": 2.9352608422375863, "grad_norm": 1.6660419702529907, "learning_rate": 6.103907674807064e-08, "loss": 0.312, "step": 14010 }, { "epoch": 2.9384035197988685, "grad_norm": 1.0635946989059448, "learning_rate": 5.52589218962396e-08, "loss": 0.2964, "step": 14025 }, { "epoch": 2.941546197360151, "grad_norm": 1.247497797012329, "learning_rate": 4.976592370319611e-08, "loss": 0.2952, "step": 14040 }, { "epoch": 2.9446888749214333, "grad_norm": 1.4133594036102295, "learning_rate": 4.456014540275e-08, "loss": 0.2696, "step": 14055 }, { "epoch": 2.9478315524827154, "grad_norm": 1.5689040422439575, "learning_rate": 3.964164692231709e-08, "loss": 0.341, "step": 14070 }, { "epoch": 2.9509742300439976, "grad_norm": 1.2708498239517212, "learning_rate": 3.5010484882233574e-08, "loss": 0.3055, "step": 14085 }, { "epoch": 2.95411690760528, "grad_norm": 1.7094337940216064, "learning_rate": 3.066671259510101e-08, "loss": 0.3289, "step": 14100 }, { "epoch": 2.957259585166562, "grad_norm": 1.60092294216156, "learning_rate": 2.6610380065170136e-08, "loss": 0.2657, "step": 14115 }, { "epoch": 2.960402262727844, "grad_norm": 1.0856350660324097, "learning_rate": 2.284153398777189e-08, "loss": 0.3139, "step": 14130 }, { "epoch": 2.9635449402891263, "grad_norm": 1.8443694114685059, "learning_rate": 1.936021774877339e-08, "loss": 0.2993, "step": 14145 }, { "epoch": 2.9666876178504085, "grad_norm": 1.4500629901885986, "learning_rate": 1.616647142408112e-08, "loss": 0.2914, "step": 14160 }, { "epoch": 2.9698302954116906, "grad_norm": 1.634055256843567, "learning_rate": 1.3260331779182955e-08, "loss": 0.3251, "step": 14175 }, { "epoch": 2.972972972972973, "grad_norm": 1.6882349252700806, "learning_rate": 1.0641832268717955e-08, "loss": 0.2889, "step": 14190 }, { "epoch": 2.976115650534255, "grad_norm": 1.6775078773498535, "learning_rate": 8.311003036098885e-09, "loss": 0.2957, "step": 14205 }, { "epoch": 2.979258328095537, "grad_norm": 2.209030866622925, "learning_rate": 6.267870913156948e-09, "loss": 0.3114, "step": 14220 }, { "epoch": 2.9824010056568198, "grad_norm": 1.3158173561096191, "learning_rate": 4.512459419839243e-09, "loss": 0.293, "step": 14235 }, { "epoch": 2.985543683218102, "grad_norm": 1.2444883584976196, "learning_rate": 3.0447887639367676e-09, "loss": 0.2313, "step": 14250 }, { "epoch": 2.988686360779384, "grad_norm": 1.1739709377288818, "learning_rate": 1.8648758408512656e-09, "loss": 0.3228, "step": 14265 }, { "epoch": 2.9918290383406663, "grad_norm": 1.4359891414642334, "learning_rate": 9.72734233398165e-10, "loss": 0.2946, "step": 14280 }, { "epoch": 2.9949717159019484, "grad_norm": 1.3152233362197876, "learning_rate": 3.6837421165669685e-10, "loss": 0.2678, "step": 14295 }, { "epoch": 2.9981143934632306, "grad_norm": 1.9656248092651367, "learning_rate": 5.1802732842221036e-11, "loss": 0.2903, "step": 14310 }, { "epoch": 3.0, "step": 14319, "total_flos": 5.387086302585815e+18, "train_loss": 0.4039875044737109, "train_runtime": 21568.6928, "train_samples_per_second": 2.655, "train_steps_per_second": 0.664 } ], "logging_steps": 15, "max_steps": 14319, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 4296, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.387086302585815e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }