diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6747 @@ +{ + "best_metric": 0.8853943711763073, + "best_model_checkpoint": "/workspace/previous_works/M3D/LaMed/output/LaMed-Llama3-8B-finetune-0000/checkpoint-12888", + "epoch": 3.0, + "eval_steps": 4296, + "global_step": 14319, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0031426775612822125, + "grad_norm": 6.532504558563232, + "learning_rate": 1.744186046511628e-06, + "loss": 1.9456, + "step": 15 + }, + { + "epoch": 0.006285355122564425, + "grad_norm": 4.389227867126465, + "learning_rate": 3.488372093023256e-06, + "loss": 1.8427, + "step": 30 + }, + { + "epoch": 0.009428032683846637, + "grad_norm": 3.4557132720947266, + "learning_rate": 5.232558139534884e-06, + "loss": 1.6337, + "step": 45 + }, + { + "epoch": 0.01257071024512885, + "grad_norm": 3.462625503540039, + "learning_rate": 6.976744186046512e-06, + "loss": 1.3449, + "step": 60 + }, + { + "epoch": 0.01571338780641106, + "grad_norm": 3.7610018253326416, + "learning_rate": 8.72093023255814e-06, + "loss": 1.1347, + "step": 75 + }, + { + "epoch": 0.018856065367693273, + "grad_norm": 3.2558743953704834, + "learning_rate": 1.0465116279069768e-05, + "loss": 0.9932, + "step": 90 + }, + { + "epoch": 0.02199874292897549, + "grad_norm": 4.160295486450195, + "learning_rate": 1.2209302325581395e-05, + "loss": 0.9954, + "step": 105 + }, + { + "epoch": 0.0251414204902577, + "grad_norm": 3.3803467750549316, + "learning_rate": 1.3953488372093024e-05, + "loss": 0.8322, + "step": 120 + }, + { + "epoch": 0.028284098051539912, + "grad_norm": 3.2412078380584717, + "learning_rate": 1.569767441860465e-05, + "loss": 0.8286, + "step": 135 + }, + { + "epoch": 0.03142677561282212, + "grad_norm": 3.4582881927490234, + "learning_rate": 1.744186046511628e-05, + "loss": 0.7777, + "step": 150 + }, + { + "epoch": 0.034569453174104335, + "grad_norm": 3.038137435913086, + "learning_rate": 1.918604651162791e-05, + "loss": 0.7253, + "step": 165 + }, + { + "epoch": 0.03771213073538655, + "grad_norm": 3.4821434020996094, + "learning_rate": 2.0930232558139536e-05, + "loss": 0.7581, + "step": 180 + }, + { + "epoch": 0.04085480829666876, + "grad_norm": 4.621170520782471, + "learning_rate": 2.2674418604651163e-05, + "loss": 0.7054, + "step": 195 + }, + { + "epoch": 0.04399748585795098, + "grad_norm": 2.803231716156006, + "learning_rate": 2.441860465116279e-05, + "loss": 0.7732, + "step": 210 + }, + { + "epoch": 0.04714016341923319, + "grad_norm": 3.1358466148376465, + "learning_rate": 2.616279069767442e-05, + "loss": 0.6582, + "step": 225 + }, + { + "epoch": 0.0502828409805154, + "grad_norm": 2.628765106201172, + "learning_rate": 2.7906976744186048e-05, + "loss": 0.6487, + "step": 240 + }, + { + "epoch": 0.05342551854179761, + "grad_norm": 3.6059532165527344, + "learning_rate": 2.9651162790697678e-05, + "loss": 0.589, + "step": 255 + }, + { + "epoch": 0.056568196103079824, + "grad_norm": 2.951493263244629, + "learning_rate": 3.13953488372093e-05, + "loss": 0.6081, + "step": 270 + }, + { + "epoch": 0.059710873664362035, + "grad_norm": 2.9226279258728027, + "learning_rate": 3.313953488372093e-05, + "loss": 0.6117, + "step": 285 + }, + { + "epoch": 0.06285355122564425, + "grad_norm": 3.403846263885498, + "learning_rate": 3.488372093023256e-05, + "loss": 0.6731, + "step": 300 + }, + { + "epoch": 0.06599622878692646, + "grad_norm": 2.577772617340088, + "learning_rate": 3.662790697674418e-05, + "loss": 0.6461, + "step": 315 + }, + { + "epoch": 0.06913890634820867, + "grad_norm": 3.0141305923461914, + "learning_rate": 3.837209302325582e-05, + "loss": 0.6386, + "step": 330 + }, + { + "epoch": 0.07228158390949088, + "grad_norm": 2.3152832984924316, + "learning_rate": 4.0116279069767444e-05, + "loss": 0.5524, + "step": 345 + }, + { + "epoch": 0.0754242614707731, + "grad_norm": 2.8160572052001953, + "learning_rate": 4.186046511627907e-05, + "loss": 0.6205, + "step": 360 + }, + { + "epoch": 0.0785669390320553, + "grad_norm": 2.3307974338531494, + "learning_rate": 4.36046511627907e-05, + "loss": 0.6004, + "step": 375 + }, + { + "epoch": 0.08170961659333752, + "grad_norm": 2.2888669967651367, + "learning_rate": 4.5348837209302326e-05, + "loss": 0.5461, + "step": 390 + }, + { + "epoch": 0.08485229415461974, + "grad_norm": 2.36181378364563, + "learning_rate": 4.709302325581396e-05, + "loss": 0.5971, + "step": 405 + }, + { + "epoch": 0.08799497171590195, + "grad_norm": 2.1626923084259033, + "learning_rate": 4.883720930232558e-05, + "loss": 0.5446, + "step": 420 + }, + { + "epoch": 0.09113764927718417, + "grad_norm": 2.3800854682922363, + "learning_rate": 4.999998401149839e-05, + "loss": 0.6413, + "step": 435 + }, + { + "epoch": 0.09428032683846638, + "grad_norm": 2.2933521270751953, + "learning_rate": 4.999974418438328e-05, + "loss": 0.5955, + "step": 450 + }, + { + "epoch": 0.09742300439974859, + "grad_norm": 2.338463306427002, + "learning_rate": 4.999921656742949e-05, + "loss": 0.5819, + "step": 465 + }, + { + "epoch": 0.1005656819610308, + "grad_norm": 2.9759883880615234, + "learning_rate": 4.9998401166710804e-05, + "loss": 0.5898, + "step": 480 + }, + { + "epoch": 0.10370835952231301, + "grad_norm": 2.243450880050659, + "learning_rate": 4.999729799161389e-05, + "loss": 0.623, + "step": 495 + }, + { + "epoch": 0.10685103708359522, + "grad_norm": 2.647433280944824, + "learning_rate": 4.9995907054838166e-05, + "loss": 0.5426, + "step": 510 + }, + { + "epoch": 0.10999371464487744, + "grad_norm": 2.0400497913360596, + "learning_rate": 4.99942283723957e-05, + "loss": 0.6028, + "step": 525 + }, + { + "epoch": 0.11313639220615965, + "grad_norm": 2.918405771255493, + "learning_rate": 4.999226196361099e-05, + "loss": 0.5556, + "step": 540 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 2.571192741394043, + "learning_rate": 4.999000785112079e-05, + "loss": 0.5625, + "step": 555 + }, + { + "epoch": 0.11942174732872407, + "grad_norm": 2.483920097351074, + "learning_rate": 4.998746606087377e-05, + "loss": 0.6185, + "step": 570 + }, + { + "epoch": 0.12256442489000628, + "grad_norm": 2.963257312774658, + "learning_rate": 4.9984636622130285e-05, + "loss": 0.5841, + "step": 585 + }, + { + "epoch": 0.1257071024512885, + "grad_norm": 2.1929099559783936, + "learning_rate": 4.998151956746204e-05, + "loss": 0.5831, + "step": 600 + }, + { + "epoch": 0.12884978001257072, + "grad_norm": 1.990614891052246, + "learning_rate": 4.997811493275165e-05, + "loss": 0.5116, + "step": 615 + }, + { + "epoch": 0.13199245757385292, + "grad_norm": 2.227179527282715, + "learning_rate": 4.997442275719229e-05, + "loss": 0.59, + "step": 630 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 1.7978647947311401, + "learning_rate": 4.997044308328722e-05, + "loss": 0.4995, + "step": 645 + }, + { + "epoch": 0.13827781269641734, + "grad_norm": 2.2707254886627197, + "learning_rate": 4.9966175956849306e-05, + "loss": 0.5299, + "step": 660 + }, + { + "epoch": 0.14142049025769957, + "grad_norm": 2.358933687210083, + "learning_rate": 4.996162142700045e-05, + "loss": 0.597, + "step": 675 + }, + { + "epoch": 0.14456316781898176, + "grad_norm": 2.036271333694458, + "learning_rate": 4.995677954617112e-05, + "loss": 0.5392, + "step": 690 + }, + { + "epoch": 0.147705845380264, + "grad_norm": 2.3753066062927246, + "learning_rate": 4.995165037009962e-05, + "loss": 0.5778, + "step": 705 + }, + { + "epoch": 0.1508485229415462, + "grad_norm": 1.849295973777771, + "learning_rate": 4.994623395783157e-05, + "loss": 0.6238, + "step": 720 + }, + { + "epoch": 0.1539912005028284, + "grad_norm": 2.010460376739502, + "learning_rate": 4.994053037171912e-05, + "loss": 0.4691, + "step": 735 + }, + { + "epoch": 0.1571338780641106, + "grad_norm": 2.023106575012207, + "learning_rate": 4.993453967742032e-05, + "loss": 0.5377, + "step": 750 + }, + { + "epoch": 0.16027655562539284, + "grad_norm": 2.195887804031372, + "learning_rate": 4.9928261943898315e-05, + "loss": 0.5639, + "step": 765 + }, + { + "epoch": 0.16341923318667503, + "grad_norm": 1.9283181428909302, + "learning_rate": 4.9921697243420564e-05, + "loss": 0.5141, + "step": 780 + }, + { + "epoch": 0.16656191074795726, + "grad_norm": 1.7017083168029785, + "learning_rate": 4.9914845651557985e-05, + "loss": 0.5132, + "step": 795 + }, + { + "epoch": 0.16970458830923948, + "grad_norm": 2.1977009773254395, + "learning_rate": 4.990770724718415e-05, + "loss": 0.5415, + "step": 810 + }, + { + "epoch": 0.17284726587052168, + "grad_norm": 1.9427462816238403, + "learning_rate": 4.99002821124743e-05, + "loss": 0.5381, + "step": 825 + }, + { + "epoch": 0.1759899434318039, + "grad_norm": 2.5321216583251953, + "learning_rate": 4.989257033290443e-05, + "loss": 0.5512, + "step": 840 + }, + { + "epoch": 0.1791326209930861, + "grad_norm": 1.7843250036239624, + "learning_rate": 4.988457199725034e-05, + "loss": 0.5028, + "step": 855 + }, + { + "epoch": 0.18227529855436833, + "grad_norm": 2.1043522357940674, + "learning_rate": 4.987628719758655e-05, + "loss": 0.5928, + "step": 870 + }, + { + "epoch": 0.18541797611565053, + "grad_norm": 2.0235021114349365, + "learning_rate": 4.9867716029285284e-05, + "loss": 0.5651, + "step": 885 + }, + { + "epoch": 0.18856065367693275, + "grad_norm": 1.885472059249878, + "learning_rate": 4.985885859101536e-05, + "loss": 0.4879, + "step": 900 + }, + { + "epoch": 0.19170333123821495, + "grad_norm": 1.9070786237716675, + "learning_rate": 4.9849714984741046e-05, + "loss": 0.4901, + "step": 915 + }, + { + "epoch": 0.19484600879949718, + "grad_norm": 2.001380681991577, + "learning_rate": 4.984028531572091e-05, + "loss": 0.574, + "step": 930 + }, + { + "epoch": 0.19798868636077938, + "grad_norm": 1.9602166414260864, + "learning_rate": 4.9830569692506564e-05, + "loss": 0.5307, + "step": 945 + }, + { + "epoch": 0.2011313639220616, + "grad_norm": 2.094599485397339, + "learning_rate": 4.9820568226941466e-05, + "loss": 0.5821, + "step": 960 + }, + { + "epoch": 0.2042740414833438, + "grad_norm": 2.0091841220855713, + "learning_rate": 4.98102810341596e-05, + "loss": 0.5969, + "step": 975 + }, + { + "epoch": 0.20741671904462602, + "grad_norm": 2.306108236312866, + "learning_rate": 4.979970823258415e-05, + "loss": 0.5745, + "step": 990 + }, + { + "epoch": 0.21055939660590822, + "grad_norm": 1.636775255203247, + "learning_rate": 4.978884994392618e-05, + "loss": 0.6422, + "step": 1005 + }, + { + "epoch": 0.21370207416719045, + "grad_norm": 2.4798927307128906, + "learning_rate": 4.9777706293183154e-05, + "loss": 0.5046, + "step": 1020 + }, + { + "epoch": 0.21684475172847265, + "grad_norm": 1.804826259613037, + "learning_rate": 4.976627740863756e-05, + "loss": 0.5399, + "step": 1035 + }, + { + "epoch": 0.21998742928975487, + "grad_norm": 2.0178399085998535, + "learning_rate": 4.975456342185544e-05, + "loss": 0.5123, + "step": 1050 + }, + { + "epoch": 0.2231301068510371, + "grad_norm": 2.50925350189209, + "learning_rate": 4.9742564467684805e-05, + "loss": 0.4928, + "step": 1065 + }, + { + "epoch": 0.2262727844123193, + "grad_norm": 1.973009705543518, + "learning_rate": 4.9730280684254166e-05, + "loss": 0.5736, + "step": 1080 + }, + { + "epoch": 0.22941546197360152, + "grad_norm": 1.8204375505447388, + "learning_rate": 4.971771221297088e-05, + "loss": 0.4693, + "step": 1095 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 2.157780647277832, + "learning_rate": 4.970485919851958e-05, + "loss": 0.5993, + "step": 1110 + }, + { + "epoch": 0.23570081709616594, + "grad_norm": 2.113952398300171, + "learning_rate": 4.9691721788860433e-05, + "loss": 0.5987, + "step": 1125 + }, + { + "epoch": 0.23884349465744814, + "grad_norm": 2.577479124069214, + "learning_rate": 4.967830013522753e-05, + "loss": 0.5443, + "step": 1140 + }, + { + "epoch": 0.24198617221873037, + "grad_norm": 1.7032134532928467, + "learning_rate": 4.966459439212706e-05, + "loss": 0.5301, + "step": 1155 + }, + { + "epoch": 0.24512884978001256, + "grad_norm": 1.8560705184936523, + "learning_rate": 4.965060471733559e-05, + "loss": 0.5027, + "step": 1170 + }, + { + "epoch": 0.2482715273412948, + "grad_norm": 1.7248977422714233, + "learning_rate": 4.963633127189821e-05, + "loss": 0.5522, + "step": 1185 + }, + { + "epoch": 0.251414204902577, + "grad_norm": 1.6348320245742798, + "learning_rate": 4.9621774220126694e-05, + "loss": 0.48, + "step": 1200 + }, + { + "epoch": 0.2545568824638592, + "grad_norm": 1.7352231740951538, + "learning_rate": 4.960693372959764e-05, + "loss": 0.5886, + "step": 1215 + }, + { + "epoch": 0.25769956002514144, + "grad_norm": 2.1465370655059814, + "learning_rate": 4.959180997115049e-05, + "loss": 0.5238, + "step": 1230 + }, + { + "epoch": 0.2608422375864236, + "grad_norm": 1.7073941230773926, + "learning_rate": 4.957640311888557e-05, + "loss": 0.487, + "step": 1245 + }, + { + "epoch": 0.26398491514770583, + "grad_norm": 1.8688887357711792, + "learning_rate": 4.9560713350162137e-05, + "loss": 0.5792, + "step": 1260 + }, + { + "epoch": 0.26712759270898806, + "grad_norm": 2.24149227142334, + "learning_rate": 4.9544740845596254e-05, + "loss": 0.4613, + "step": 1275 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.6652510166168213, + "learning_rate": 4.9528485789058805e-05, + "loss": 0.4311, + "step": 1290 + }, + { + "epoch": 0.27341294783155246, + "grad_norm": 1.6432390213012695, + "learning_rate": 4.951194836767329e-05, + "loss": 0.5199, + "step": 1305 + }, + { + "epoch": 0.2765556253928347, + "grad_norm": 1.566832184791565, + "learning_rate": 4.9495128771813755e-05, + "loss": 0.4897, + "step": 1320 + }, + { + "epoch": 0.2796983029541169, + "grad_norm": 1.6974416971206665, + "learning_rate": 4.94780271951025e-05, + "loss": 0.5192, + "step": 1335 + }, + { + "epoch": 0.28284098051539913, + "grad_norm": 1.9494693279266357, + "learning_rate": 4.946064383440798e-05, + "loss": 0.4957, + "step": 1350 + }, + { + "epoch": 0.28598365807668136, + "grad_norm": 2.093959331512451, + "learning_rate": 4.944297888984239e-05, + "loss": 0.5164, + "step": 1365 + }, + { + "epoch": 0.2891263356379635, + "grad_norm": 1.9262990951538086, + "learning_rate": 4.9425032564759485e-05, + "loss": 0.504, + "step": 1380 + }, + { + "epoch": 0.29226901319924575, + "grad_norm": 1.8158432245254517, + "learning_rate": 4.940680506575218e-05, + "loss": 0.4649, + "step": 1395 + }, + { + "epoch": 0.295411690760528, + "grad_norm": 1.7862390279769897, + "learning_rate": 4.9388296602650185e-05, + "loss": 0.5356, + "step": 1410 + }, + { + "epoch": 0.2985543683218102, + "grad_norm": 2.2066242694854736, + "learning_rate": 4.936950738851758e-05, + "loss": 0.5076, + "step": 1425 + }, + { + "epoch": 0.3016970458830924, + "grad_norm": 2.2866694927215576, + "learning_rate": 4.935043763965038e-05, + "loss": 0.4621, + "step": 1440 + }, + { + "epoch": 0.3048397234443746, + "grad_norm": 1.6391174793243408, + "learning_rate": 4.933108757557402e-05, + "loss": 0.4651, + "step": 1455 + }, + { + "epoch": 0.3079824010056568, + "grad_norm": 2.0994527339935303, + "learning_rate": 4.9311457419040866e-05, + "loss": 0.5533, + "step": 1470 + }, + { + "epoch": 0.31112507856693905, + "grad_norm": 1.7273298501968384, + "learning_rate": 4.9291547396027594e-05, + "loss": 0.5621, + "step": 1485 + }, + { + "epoch": 0.3142677561282212, + "grad_norm": 2.017411470413208, + "learning_rate": 4.9271357735732655e-05, + "loss": 0.4768, + "step": 1500 + }, + { + "epoch": 0.31741043368950345, + "grad_norm": 1.7073991298675537, + "learning_rate": 4.925088867057359e-05, + "loss": 0.4989, + "step": 1515 + }, + { + "epoch": 0.32055311125078567, + "grad_norm": 2.071885585784912, + "learning_rate": 4.9230140436184364e-05, + "loss": 0.4984, + "step": 1530 + }, + { + "epoch": 0.3236957888120679, + "grad_norm": 2.1429100036621094, + "learning_rate": 4.9209113271412665e-05, + "loss": 0.5494, + "step": 1545 + }, + { + "epoch": 0.32683846637335007, + "grad_norm": 1.709663987159729, + "learning_rate": 4.9187807418317144e-05, + "loss": 0.5701, + "step": 1560 + }, + { + "epoch": 0.3299811439346323, + "grad_norm": 1.9613614082336426, + "learning_rate": 4.9166223122164635e-05, + "loss": 0.4878, + "step": 1575 + }, + { + "epoch": 0.3331238214959145, + "grad_norm": 1.7875553369522095, + "learning_rate": 4.9144360631427325e-05, + "loss": 0.4705, + "step": 1590 + }, + { + "epoch": 0.33626649905719674, + "grad_norm": 1.9654724597930908, + "learning_rate": 4.9122220197779886e-05, + "loss": 0.4385, + "step": 1605 + }, + { + "epoch": 0.33940917661847897, + "grad_norm": 1.4906249046325684, + "learning_rate": 4.90998020760966e-05, + "loss": 0.4427, + "step": 1620 + }, + { + "epoch": 0.34255185417976114, + "grad_norm": 1.86861252784729, + "learning_rate": 4.907710652444843e-05, + "loss": 0.4817, + "step": 1635 + }, + { + "epoch": 0.34569453174104336, + "grad_norm": 1.9250684976577759, + "learning_rate": 4.90541338041e-05, + "loss": 0.5351, + "step": 1650 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 1.8099184036254883, + "learning_rate": 4.903088417950664e-05, + "loss": 0.5238, + "step": 1665 + }, + { + "epoch": 0.3519798868636078, + "grad_norm": 1.4055452346801758, + "learning_rate": 4.9007357918311315e-05, + "loss": 0.5157, + "step": 1680 + }, + { + "epoch": 0.35512256442489, + "grad_norm": 1.7121083736419678, + "learning_rate": 4.898355529134156e-05, + "loss": 0.5087, + "step": 1695 + }, + { + "epoch": 0.3582652419861722, + "grad_norm": 1.7254718542099, + "learning_rate": 4.895947657260633e-05, + "loss": 0.482, + "step": 1710 + }, + { + "epoch": 0.36140791954745444, + "grad_norm": 1.7115743160247803, + "learning_rate": 4.893512203929291e-05, + "loss": 0.5415, + "step": 1725 + }, + { + "epoch": 0.36455059710873666, + "grad_norm": 1.5224454402923584, + "learning_rate": 4.8910491971763625e-05, + "loss": 0.5531, + "step": 1740 + }, + { + "epoch": 0.36769327467001883, + "grad_norm": 1.4693105220794678, + "learning_rate": 4.888558665355273e-05, + "loss": 0.5007, + "step": 1755 + }, + { + "epoch": 0.37083595223130106, + "grad_norm": 1.823201298713684, + "learning_rate": 4.8860406371363056e-05, + "loss": 0.4568, + "step": 1770 + }, + { + "epoch": 0.3739786297925833, + "grad_norm": 1.6682394742965698, + "learning_rate": 4.883495141506272e-05, + "loss": 0.5111, + "step": 1785 + }, + { + "epoch": 0.3771213073538655, + "grad_norm": 1.9045063257217407, + "learning_rate": 4.880922207768186e-05, + "loss": 0.5081, + "step": 1800 + }, + { + "epoch": 0.3802639849151477, + "grad_norm": 1.9026966094970703, + "learning_rate": 4.8783218655409165e-05, + "loss": 0.5094, + "step": 1815 + }, + { + "epoch": 0.3834066624764299, + "grad_norm": 2.230048418045044, + "learning_rate": 4.875694144758852e-05, + "loss": 0.4501, + "step": 1830 + }, + { + "epoch": 0.38654934003771213, + "grad_norm": 1.8619111776351929, + "learning_rate": 4.873039075671558e-05, + "loss": 0.5595, + "step": 1845 + }, + { + "epoch": 0.38969201759899436, + "grad_norm": 1.0510592460632324, + "learning_rate": 4.8703566888434216e-05, + "loss": 0.4494, + "step": 1860 + }, + { + "epoch": 0.3928346951602766, + "grad_norm": 1.61916983127594, + "learning_rate": 4.8676470151533054e-05, + "loss": 0.5619, + "step": 1875 + }, + { + "epoch": 0.39597737272155875, + "grad_norm": 2.1640028953552246, + "learning_rate": 4.864910085794192e-05, + "loss": 0.4624, + "step": 1890 + }, + { + "epoch": 0.399120050282841, + "grad_norm": 1.8915683031082153, + "learning_rate": 4.8621459322728216e-05, + "loss": 0.4953, + "step": 1905 + }, + { + "epoch": 0.4022627278441232, + "grad_norm": 1.5854873657226562, + "learning_rate": 4.859354586409331e-05, + "loss": 0.4952, + "step": 1920 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 1.8864436149597168, + "learning_rate": 4.8565360803368885e-05, + "loss": 0.4643, + "step": 1935 + }, + { + "epoch": 0.4085480829666876, + "grad_norm": 1.7292683124542236, + "learning_rate": 4.853690446501323e-05, + "loss": 0.4995, + "step": 1950 + }, + { + "epoch": 0.4116907605279698, + "grad_norm": 1.1200498342514038, + "learning_rate": 4.85081771766075e-05, + "loss": 0.4397, + "step": 1965 + }, + { + "epoch": 0.41483343808925205, + "grad_norm": 1.6311380863189697, + "learning_rate": 4.8479179268851934e-05, + "loss": 0.5041, + "step": 1980 + }, + { + "epoch": 0.4179761156505343, + "grad_norm": 1.5585182905197144, + "learning_rate": 4.844991107556208e-05, + "loss": 0.4968, + "step": 1995 + }, + { + "epoch": 0.42111879321181644, + "grad_norm": 1.9798181056976318, + "learning_rate": 4.8420372933664934e-05, + "loss": 0.5101, + "step": 2010 + }, + { + "epoch": 0.42426147077309867, + "grad_norm": 1.5805935859680176, + "learning_rate": 4.839056518319507e-05, + "loss": 0.5093, + "step": 2025 + }, + { + "epoch": 0.4274041483343809, + "grad_norm": 1.8099379539489746, + "learning_rate": 4.836048816729068e-05, + "loss": 0.4841, + "step": 2040 + }, + { + "epoch": 0.4305468258956631, + "grad_norm": 1.294607400894165, + "learning_rate": 4.833014223218971e-05, + "loss": 0.5417, + "step": 2055 + }, + { + "epoch": 0.4336895034569453, + "grad_norm": 1.446961760520935, + "learning_rate": 4.8299527727225796e-05, + "loss": 0.4639, + "step": 2070 + }, + { + "epoch": 0.4368321810182275, + "grad_norm": 1.460518479347229, + "learning_rate": 4.826864500482428e-05, + "loss": 0.4648, + "step": 2085 + }, + { + "epoch": 0.43997485857950974, + "grad_norm": 1.3880281448364258, + "learning_rate": 4.823749442049817e-05, + "loss": 0.4185, + "step": 2100 + }, + { + "epoch": 0.44311753614079197, + "grad_norm": 1.6404091119766235, + "learning_rate": 4.820607633284397e-05, + "loss": 0.4007, + "step": 2115 + }, + { + "epoch": 0.4462602137020742, + "grad_norm": 1.201521873474121, + "learning_rate": 4.8174391103537655e-05, + "loss": 0.4781, + "step": 2130 + }, + { + "epoch": 0.44940289126335636, + "grad_norm": 1.4873559474945068, + "learning_rate": 4.814243909733043e-05, + "loss": 0.4317, + "step": 2145 + }, + { + "epoch": 0.4525455688246386, + "grad_norm": 1.9189249277114868, + "learning_rate": 4.811022068204457e-05, + "loss": 0.5085, + "step": 2160 + }, + { + "epoch": 0.4556882463859208, + "grad_norm": 1.4758615493774414, + "learning_rate": 4.807773622856918e-05, + "loss": 0.4815, + "step": 2175 + }, + { + "epoch": 0.45883092394720304, + "grad_norm": 1.6353334188461304, + "learning_rate": 4.804498611085589e-05, + "loss": 0.4794, + "step": 2190 + }, + { + "epoch": 0.4619736015084852, + "grad_norm": 1.4237501621246338, + "learning_rate": 4.8011970705914634e-05, + "loss": 0.4593, + "step": 2205 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 1.6772956848144531, + "learning_rate": 4.7978690393809186e-05, + "loss": 0.486, + "step": 2220 + }, + { + "epoch": 0.46825895663104966, + "grad_norm": 1.553051233291626, + "learning_rate": 4.794514555765293e-05, + "loss": 0.4658, + "step": 2235 + }, + { + "epoch": 0.4714016341923319, + "grad_norm": 1.8338069915771484, + "learning_rate": 4.7911336583604306e-05, + "loss": 0.4953, + "step": 2250 + }, + { + "epoch": 0.47454431175361406, + "grad_norm": 1.431541919708252, + "learning_rate": 4.7877263860862477e-05, + "loss": 0.4442, + "step": 2265 + }, + { + "epoch": 0.4776869893148963, + "grad_norm": 1.120583415031433, + "learning_rate": 4.7842927781662796e-05, + "loss": 0.4537, + "step": 2280 + }, + { + "epoch": 0.4808296668761785, + "grad_norm": 1.380642056465149, + "learning_rate": 4.780832874127228e-05, + "loss": 0.4621, + "step": 2295 + }, + { + "epoch": 0.48397234443746073, + "grad_norm": 1.1469544172286987, + "learning_rate": 4.777346713798512e-05, + "loss": 0.5226, + "step": 2310 + }, + { + "epoch": 0.4871150219987429, + "grad_norm": 1.483512043952942, + "learning_rate": 4.7738343373118e-05, + "loss": 0.5479, + "step": 2325 + }, + { + "epoch": 0.49025769956002513, + "grad_norm": 1.610948920249939, + "learning_rate": 4.770295785100558e-05, + "loss": 0.5046, + "step": 2340 + }, + { + "epoch": 0.49340037712130735, + "grad_norm": 1.3163951635360718, + "learning_rate": 4.7667310978995785e-05, + "loss": 0.4603, + "step": 2355 + }, + { + "epoch": 0.4965430546825896, + "grad_norm": 1.4908734560012817, + "learning_rate": 4.763140316744509e-05, + "loss": 0.4806, + "step": 2370 + }, + { + "epoch": 0.4996857322438718, + "grad_norm": 1.3357776403427124, + "learning_rate": 4.759523482971388e-05, + "loss": 0.471, + "step": 2385 + }, + { + "epoch": 0.502828409805154, + "grad_norm": 1.4438153505325317, + "learning_rate": 4.755880638216161e-05, + "loss": 0.443, + "step": 2400 + }, + { + "epoch": 0.5059710873664363, + "grad_norm": 1.4169646501541138, + "learning_rate": 4.752211824414205e-05, + "loss": 0.4842, + "step": 2415 + }, + { + "epoch": 0.5091137649277184, + "grad_norm": 1.4930610656738281, + "learning_rate": 4.7485170837998455e-05, + "loss": 0.4815, + "step": 2430 + }, + { + "epoch": 0.5122564424890006, + "grad_norm": 1.5918561220169067, + "learning_rate": 4.74479645890587e-05, + "loss": 0.4372, + "step": 2445 + }, + { + "epoch": 0.5153991200502829, + "grad_norm": 1.6254751682281494, + "learning_rate": 4.7410499925630395e-05, + "loss": 0.4187, + "step": 2460 + }, + { + "epoch": 0.518541797611565, + "grad_norm": 1.5545734167099, + "learning_rate": 4.737277727899591e-05, + "loss": 0.4743, + "step": 2475 + }, + { + "epoch": 0.5216844751728472, + "grad_norm": 1.727158546447754, + "learning_rate": 4.7334797083407475e-05, + "loss": 0.4294, + "step": 2490 + }, + { + "epoch": 0.5248271527341295, + "grad_norm": 1.7546805143356323, + "learning_rate": 4.729655977608214e-05, + "loss": 0.5043, + "step": 2505 + }, + { + "epoch": 0.5279698302954117, + "grad_norm": 1.4232885837554932, + "learning_rate": 4.7258065797196746e-05, + "loss": 0.4729, + "step": 2520 + }, + { + "epoch": 0.531112507856694, + "grad_norm": 1.391065239906311, + "learning_rate": 4.721931558988286e-05, + "loss": 0.4915, + "step": 2535 + }, + { + "epoch": 0.5342551854179761, + "grad_norm": 1.7134276628494263, + "learning_rate": 4.7180309600221706e-05, + "loss": 0.5102, + "step": 2550 + }, + { + "epoch": 0.5373978629792583, + "grad_norm": 1.5847156047821045, + "learning_rate": 4.714104827723895e-05, + "loss": 0.4785, + "step": 2565 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 1.3267030715942383, + "learning_rate": 4.7101532072899623e-05, + "loss": 0.5135, + "step": 2580 + }, + { + "epoch": 0.5436832181018227, + "grad_norm": 1.5763999223709106, + "learning_rate": 4.706176144210286e-05, + "loss": 0.4916, + "step": 2595 + }, + { + "epoch": 0.5468258956631049, + "grad_norm": 1.4937148094177246, + "learning_rate": 4.7021736842676687e-05, + "loss": 0.4561, + "step": 2610 + }, + { + "epoch": 0.5499685732243872, + "grad_norm": 1.6091326475143433, + "learning_rate": 4.698145873537274e-05, + "loss": 0.482, + "step": 2625 + }, + { + "epoch": 0.5531112507856694, + "grad_norm": 1.5875076055526733, + "learning_rate": 4.694092758386095e-05, + "loss": 0.4104, + "step": 2640 + }, + { + "epoch": 0.5562539283469516, + "grad_norm": 1.3293397426605225, + "learning_rate": 4.690014385472424e-05, + "loss": 0.4143, + "step": 2655 + }, + { + "epoch": 0.5593966059082338, + "grad_norm": 1.1707426309585571, + "learning_rate": 4.6859108017453136e-05, + "loss": 0.4726, + "step": 2670 + }, + { + "epoch": 0.562539283469516, + "grad_norm": 1.3706302642822266, + "learning_rate": 4.6817820544440346e-05, + "loss": 0.461, + "step": 2685 + }, + { + "epoch": 0.5656819610307983, + "grad_norm": 1.7703521251678467, + "learning_rate": 4.677628191097534e-05, + "loss": 0.5042, + "step": 2700 + }, + { + "epoch": 0.5688246385920804, + "grad_norm": 1.5359523296356201, + "learning_rate": 4.6734492595238874e-05, + "loss": 0.4192, + "step": 2715 + }, + { + "epoch": 0.5719673161533627, + "grad_norm": 1.700126051902771, + "learning_rate": 4.6692453078297495e-05, + "loss": 0.5095, + "step": 2730 + }, + { + "epoch": 0.5751099937146449, + "grad_norm": 1.4070463180541992, + "learning_rate": 4.665016384409798e-05, + "loss": 0.4779, + "step": 2745 + }, + { + "epoch": 0.578252671275927, + "grad_norm": 1.2797980308532715, + "learning_rate": 4.660762537946178e-05, + "loss": 0.4351, + "step": 2760 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 1.4518544673919678, + "learning_rate": 4.656483817407944e-05, + "loss": 0.448, + "step": 2775 + }, + { + "epoch": 0.5845380263984915, + "grad_norm": 1.300370216369629, + "learning_rate": 4.652180272050491e-05, + "loss": 0.44, + "step": 2790 + }, + { + "epoch": 0.5876807039597737, + "grad_norm": 1.4460704326629639, + "learning_rate": 4.64785195141499e-05, + "loss": 0.4565, + "step": 2805 + }, + { + "epoch": 0.590823381521056, + "grad_norm": 1.5882294178009033, + "learning_rate": 4.643498905327819e-05, + "loss": 0.5078, + "step": 2820 + }, + { + "epoch": 0.5939660590823381, + "grad_norm": 1.3055689334869385, + "learning_rate": 4.639121183899989e-05, + "loss": 0.5, + "step": 2835 + }, + { + "epoch": 0.5971087366436204, + "grad_norm": 1.4545074701309204, + "learning_rate": 4.6347188375265645e-05, + "loss": 0.4767, + "step": 2850 + }, + { + "epoch": 0.6002514142049026, + "grad_norm": 1.0975799560546875, + "learning_rate": 4.630291916886086e-05, + "loss": 0.4384, + "step": 2865 + }, + { + "epoch": 0.6033940917661847, + "grad_norm": 1.6817741394042969, + "learning_rate": 4.625840472939987e-05, + "loss": 0.5, + "step": 2880 + }, + { + "epoch": 0.606536769327467, + "grad_norm": 1.0438511371612549, + "learning_rate": 4.621364556932005e-05, + "loss": 0.4671, + "step": 2895 + }, + { + "epoch": 0.6096794468887492, + "grad_norm": 1.1330349445343018, + "learning_rate": 4.616864220387592e-05, + "loss": 0.4275, + "step": 2910 + }, + { + "epoch": 0.6128221244500315, + "grad_norm": 1.6542346477508545, + "learning_rate": 4.612339515113324e-05, + "loss": 0.4801, + "step": 2925 + }, + { + "epoch": 0.6159648020113137, + "grad_norm": 1.1006687879562378, + "learning_rate": 4.6077904931963036e-05, + "loss": 0.4756, + "step": 2940 + }, + { + "epoch": 0.6191074795725958, + "grad_norm": 1.3067682981491089, + "learning_rate": 4.603217207003555e-05, + "loss": 0.4416, + "step": 2955 + }, + { + "epoch": 0.6222501571338781, + "grad_norm": 1.2261842489242554, + "learning_rate": 4.598619709181431e-05, + "loss": 0.4276, + "step": 2970 + }, + { + "epoch": 0.6253928346951603, + "grad_norm": 1.4903597831726074, + "learning_rate": 4.593998052654998e-05, + "loss": 0.4972, + "step": 2985 + }, + { + "epoch": 0.6285355122564424, + "grad_norm": 1.4376386404037476, + "learning_rate": 4.589352290627433e-05, + "loss": 0.4568, + "step": 3000 + }, + { + "epoch": 0.6316781898177247, + "grad_norm": 1.351223111152649, + "learning_rate": 4.584682476579406e-05, + "loss": 0.4858, + "step": 3015 + }, + { + "epoch": 0.6348208673790069, + "grad_norm": 1.364617943763733, + "learning_rate": 4.57998866426847e-05, + "loss": 0.4876, + "step": 3030 + }, + { + "epoch": 0.6379635449402892, + "grad_norm": 1.459356665611267, + "learning_rate": 4.575270907728437e-05, + "loss": 0.478, + "step": 3045 + }, + { + "epoch": 0.6411062225015713, + "grad_norm": 1.6396265029907227, + "learning_rate": 4.5705292612687576e-05, + "loss": 0.529, + "step": 3060 + }, + { + "epoch": 0.6442489000628535, + "grad_norm": 0.960100531578064, + "learning_rate": 4.565763779473898e-05, + "loss": 0.4391, + "step": 3075 + }, + { + "epoch": 0.6473915776241358, + "grad_norm": 1.315019130706787, + "learning_rate": 4.560974517202709e-05, + "loss": 0.4917, + "step": 3090 + }, + { + "epoch": 0.650534255185418, + "grad_norm": 1.5295921564102173, + "learning_rate": 4.556161529587794e-05, + "loss": 0.4924, + "step": 3105 + }, + { + "epoch": 0.6536769327467001, + "grad_norm": 1.1837646961212158, + "learning_rate": 4.551324872034879e-05, + "loss": 0.4493, + "step": 3120 + }, + { + "epoch": 0.6568196103079824, + "grad_norm": 1.4307267665863037, + "learning_rate": 4.5464646002221684e-05, + "loss": 0.468, + "step": 3135 + }, + { + "epoch": 0.6599622878692646, + "grad_norm": 1.155652403831482, + "learning_rate": 4.541580770099709e-05, + "loss": 0.4243, + "step": 3150 + }, + { + "epoch": 0.6631049654305469, + "grad_norm": 1.3834953308105469, + "learning_rate": 4.536673437888743e-05, + "loss": 0.5501, + "step": 3165 + }, + { + "epoch": 0.666247642991829, + "grad_norm": 1.0636712312698364, + "learning_rate": 4.531742660081063e-05, + "loss": 0.4274, + "step": 3180 + }, + { + "epoch": 0.6693903205531112, + "grad_norm": 0.8389808535575867, + "learning_rate": 4.526788493438359e-05, + "loss": 0.4489, + "step": 3195 + }, + { + "epoch": 0.6725329981143935, + "grad_norm": 1.242849349975586, + "learning_rate": 4.5218109949915674e-05, + "loss": 0.5231, + "step": 3210 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 1.4097121953964233, + "learning_rate": 4.516810222040214e-05, + "loss": 0.4373, + "step": 3225 + }, + { + "epoch": 0.6788183532369579, + "grad_norm": 1.4146395921707153, + "learning_rate": 4.511786232151753e-05, + "loss": 0.4185, + "step": 3240 + }, + { + "epoch": 0.6819610307982401, + "grad_norm": 1.1632105112075806, + "learning_rate": 4.506739083160906e-05, + "loss": 0.4387, + "step": 3255 + }, + { + "epoch": 0.6851037083595223, + "grad_norm": 1.1534103155136108, + "learning_rate": 4.501668833168995e-05, + "loss": 0.4387, + "step": 3270 + }, + { + "epoch": 0.6882463859208046, + "grad_norm": 1.355643391609192, + "learning_rate": 4.496575540543275e-05, + "loss": 0.4568, + "step": 3285 + }, + { + "epoch": 0.6913890634820867, + "grad_norm": 1.2842720746994019, + "learning_rate": 4.49145926391626e-05, + "loss": 0.4486, + "step": 3300 + }, + { + "epoch": 0.6945317410433689, + "grad_norm": 0.981799840927124, + "learning_rate": 4.48632006218505e-05, + "loss": 0.4268, + "step": 3315 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 1.5337742567062378, + "learning_rate": 4.481157994510652e-05, + "loss": 0.5001, + "step": 3330 + }, + { + "epoch": 0.7008170961659334, + "grad_norm": 1.4315093755722046, + "learning_rate": 4.475973120317298e-05, + "loss": 0.4779, + "step": 3345 + }, + { + "epoch": 0.7039597737272156, + "grad_norm": 1.181176781654358, + "learning_rate": 4.4707654992917635e-05, + "loss": 0.4312, + "step": 3360 + }, + { + "epoch": 0.7071024512884978, + "grad_norm": 1.5547527074813843, + "learning_rate": 4.465535191382679e-05, + "loss": 0.5246, + "step": 3375 + }, + { + "epoch": 0.71024512884978, + "grad_norm": 1.2100272178649902, + "learning_rate": 4.460282256799839e-05, + "loss": 0.4601, + "step": 3390 + }, + { + "epoch": 0.7133878064110623, + "grad_norm": 1.2901486158370972, + "learning_rate": 4.455006756013511e-05, + "loss": 0.4294, + "step": 3405 + }, + { + "epoch": 0.7165304839723444, + "grad_norm": 1.2931948900222778, + "learning_rate": 4.449708749753736e-05, + "loss": 0.4618, + "step": 3420 + }, + { + "epoch": 0.7196731615336267, + "grad_norm": 1.1794995069503784, + "learning_rate": 4.444388299009633e-05, + "loss": 0.4513, + "step": 3435 + }, + { + "epoch": 0.7228158390949089, + "grad_norm": 0.9884097576141357, + "learning_rate": 4.439045465028695e-05, + "loss": 0.4033, + "step": 3450 + }, + { + "epoch": 0.725958516656191, + "grad_norm": 1.3767797946929932, + "learning_rate": 4.433680309316086e-05, + "loss": 0.5132, + "step": 3465 + }, + { + "epoch": 0.7291011942174733, + "grad_norm": 1.2242072820663452, + "learning_rate": 4.428292893633928e-05, + "loss": 0.4564, + "step": 3480 + }, + { + "epoch": 0.7322438717787555, + "grad_norm": 1.416617512702942, + "learning_rate": 4.422883280000596e-05, + "loss": 0.4765, + "step": 3495 + }, + { + "epoch": 0.7353865493400377, + "grad_norm": 1.5963226556777954, + "learning_rate": 4.417451530690001e-05, + "loss": 0.4593, + "step": 3510 + }, + { + "epoch": 0.73852922690132, + "grad_norm": 1.3153035640716553, + "learning_rate": 4.411997708230872e-05, + "loss": 0.4175, + "step": 3525 + }, + { + "epoch": 0.7416719044626021, + "grad_norm": 1.202329158782959, + "learning_rate": 4.40652187540604e-05, + "loss": 0.4668, + "step": 3540 + }, + { + "epoch": 0.7448145820238844, + "grad_norm": 1.2087334394454956, + "learning_rate": 4.4010240952517115e-05, + "loss": 0.469, + "step": 3555 + }, + { + "epoch": 0.7479572595851666, + "grad_norm": 1.1056499481201172, + "learning_rate": 4.395504431056745e-05, + "loss": 0.4764, + "step": 3570 + }, + { + "epoch": 0.7510999371464487, + "grad_norm": 1.2779186964035034, + "learning_rate": 4.389962946361921e-05, + "loss": 0.3649, + "step": 3585 + }, + { + "epoch": 0.754242614707731, + "grad_norm": 1.545474886894226, + "learning_rate": 4.384399704959211e-05, + "loss": 0.4498, + "step": 3600 + }, + { + "epoch": 0.7573852922690132, + "grad_norm": 1.0024960041046143, + "learning_rate": 4.378814770891045e-05, + "loss": 0.4717, + "step": 3615 + }, + { + "epoch": 0.7605279698302954, + "grad_norm": 1.3661173582077026, + "learning_rate": 4.373208208449572e-05, + "loss": 0.4662, + "step": 3630 + }, + { + "epoch": 0.7636706473915776, + "grad_norm": 1.1410945653915405, + "learning_rate": 4.3675800821759205e-05, + "loss": 0.5376, + "step": 3645 + }, + { + "epoch": 0.7668133249528598, + "grad_norm": 1.1424890756607056, + "learning_rate": 4.361930456859455e-05, + "loss": 0.4682, + "step": 3660 + }, + { + "epoch": 0.7699560025141421, + "grad_norm": 1.373201847076416, + "learning_rate": 4.3562593975370314e-05, + "loss": 0.4454, + "step": 3675 + }, + { + "epoch": 0.7730986800754243, + "grad_norm": 1.1460034847259521, + "learning_rate": 4.350566969492248e-05, + "loss": 0.4749, + "step": 3690 + }, + { + "epoch": 0.7762413576367064, + "grad_norm": 1.2430229187011719, + "learning_rate": 4.344853238254692e-05, + "loss": 0.4535, + "step": 3705 + }, + { + "epoch": 0.7793840351979887, + "grad_norm": 1.3757741451263428, + "learning_rate": 4.339118269599191e-05, + "loss": 0.41, + "step": 3720 + }, + { + "epoch": 0.7825267127592709, + "grad_norm": 0.9454161524772644, + "learning_rate": 4.333362129545046e-05, + "loss": 0.4454, + "step": 3735 + }, + { + "epoch": 0.7856693903205532, + "grad_norm": 0.9156450033187866, + "learning_rate": 4.327584884355281e-05, + "loss": 0.4719, + "step": 3750 + }, + { + "epoch": 0.7888120678818353, + "grad_norm": 1.2694880962371826, + "learning_rate": 4.321786600535874e-05, + "loss": 0.4304, + "step": 3765 + }, + { + "epoch": 0.7919547454431175, + "grad_norm": 1.2514046430587769, + "learning_rate": 4.315967344834996e-05, + "loss": 0.409, + "step": 3780 + }, + { + "epoch": 0.7950974230043998, + "grad_norm": 1.184391736984253, + "learning_rate": 4.310127184242237e-05, + "loss": 0.4198, + "step": 3795 + }, + { + "epoch": 0.798240100565682, + "grad_norm": 1.2372093200683594, + "learning_rate": 4.304266185987842e-05, + "loss": 0.5023, + "step": 3810 + }, + { + "epoch": 0.8013827781269641, + "grad_norm": 1.340918779373169, + "learning_rate": 4.29838441754193e-05, + "loss": 0.4776, + "step": 3825 + }, + { + "epoch": 0.8045254556882464, + "grad_norm": 1.2824565172195435, + "learning_rate": 4.292481946613721e-05, + "loss": 0.4951, + "step": 3840 + }, + { + "epoch": 0.8076681332495286, + "grad_norm": 1.2031137943267822, + "learning_rate": 4.286558841150757e-05, + "loss": 0.5001, + "step": 3855 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 1.3976994752883911, + "learning_rate": 4.2806151693381194e-05, + "loss": 0.459, + "step": 3870 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 1.8632055521011353, + "learning_rate": 4.274650999597641e-05, + "loss": 0.4622, + "step": 3885 + }, + { + "epoch": 0.8170961659333752, + "grad_norm": 1.4277501106262207, + "learning_rate": 4.2686664005871226e-05, + "loss": 0.4629, + "step": 3900 + }, + { + "epoch": 0.8202388434946575, + "grad_norm": 1.189048409461975, + "learning_rate": 4.262661441199541e-05, + "loss": 0.4408, + "step": 3915 + }, + { + "epoch": 0.8233815210559396, + "grad_norm": 1.2833003997802734, + "learning_rate": 4.2566361905622555e-05, + "loss": 0.4064, + "step": 3930 + }, + { + "epoch": 0.8265241986172219, + "grad_norm": 1.1060303449630737, + "learning_rate": 4.250590718036211e-05, + "loss": 0.3962, + "step": 3945 + }, + { + "epoch": 0.8296668761785041, + "grad_norm": 1.0350922346115112, + "learning_rate": 4.2445250932151425e-05, + "loss": 0.4252, + "step": 3960 + }, + { + "epoch": 0.8328095537397863, + "grad_norm": 1.3250532150268555, + "learning_rate": 4.2384393859247726e-05, + "loss": 0.4291, + "step": 3975 + }, + { + "epoch": 0.8359522313010685, + "grad_norm": 1.2099930047988892, + "learning_rate": 4.232333666222006e-05, + "loss": 0.4341, + "step": 3990 + }, + { + "epoch": 0.8390949088623507, + "grad_norm": 1.3332287073135376, + "learning_rate": 4.226208004394127e-05, + "loss": 0.466, + "step": 4005 + }, + { + "epoch": 0.8422375864236329, + "grad_norm": 1.3363186120986938, + "learning_rate": 4.220062470957986e-05, + "loss": 0.4196, + "step": 4020 + }, + { + "epoch": 0.8453802639849152, + "grad_norm": 0.9614083170890808, + "learning_rate": 4.213897136659189e-05, + "loss": 0.4183, + "step": 4035 + }, + { + "epoch": 0.8485229415461973, + "grad_norm": 1.7605079412460327, + "learning_rate": 4.2077120724712844e-05, + "loss": 0.4756, + "step": 4050 + }, + { + "epoch": 0.8516656191074796, + "grad_norm": 1.3952196836471558, + "learning_rate": 4.201507349594946e-05, + "loss": 0.433, + "step": 4065 + }, + { + "epoch": 0.8548082966687618, + "grad_norm": 1.1092714071273804, + "learning_rate": 4.195283039457155e-05, + "loss": 0.4721, + "step": 4080 + }, + { + "epoch": 0.857950974230044, + "grad_norm": 0.9377354979515076, + "learning_rate": 4.189039213710369e-05, + "loss": 0.4666, + "step": 4095 + }, + { + "epoch": 0.8610936517913262, + "grad_norm": 1.2234201431274414, + "learning_rate": 4.1827759442317116e-05, + "loss": 0.4582, + "step": 4110 + }, + { + "epoch": 0.8642363293526084, + "grad_norm": 1.2329143285751343, + "learning_rate": 4.176493303122131e-05, + "loss": 0.4581, + "step": 4125 + }, + { + "epoch": 0.8673790069138906, + "grad_norm": 1.2294172048568726, + "learning_rate": 4.170191362705578e-05, + "loss": 0.4688, + "step": 4140 + }, + { + "epoch": 0.8705216844751729, + "grad_norm": 0.8059648871421814, + "learning_rate": 4.163870195528171e-05, + "loss": 0.3847, + "step": 4155 + }, + { + "epoch": 0.873664362036455, + "grad_norm": 1.3568918704986572, + "learning_rate": 4.157529874357364e-05, + "loss": 0.4839, + "step": 4170 + }, + { + "epoch": 0.8768070395977373, + "grad_norm": 1.33687424659729, + "learning_rate": 4.151170472181103e-05, + "loss": 0.469, + "step": 4185 + }, + { + "epoch": 0.8799497171590195, + "grad_norm": 1.1635092496871948, + "learning_rate": 4.144792062206989e-05, + "loss": 0.4117, + "step": 4200 + }, + { + "epoch": 0.8830923947203017, + "grad_norm": 0.4810682237148285, + "learning_rate": 4.138394717861438e-05, + "loss": 0.3328, + "step": 4215 + }, + { + "epoch": 0.8862350722815839, + "grad_norm": 1.170903205871582, + "learning_rate": 4.131978512788832e-05, + "loss": 0.5026, + "step": 4230 + }, + { + "epoch": 0.8893777498428661, + "grad_norm": 0.9785465598106384, + "learning_rate": 4.1255435208506695e-05, + "loss": 0.4031, + "step": 4245 + }, + { + "epoch": 0.8925204274041484, + "grad_norm": 1.0040161609649658, + "learning_rate": 4.1190898161247216e-05, + "loss": 0.3992, + "step": 4260 + }, + { + "epoch": 0.8956631049654306, + "grad_norm": 1.2257813215255737, + "learning_rate": 4.112617472904175e-05, + "loss": 0.4431, + "step": 4275 + }, + { + "epoch": 0.8988057825267127, + "grad_norm": 0.9779378771781921, + "learning_rate": 4.106126565696774e-05, + "loss": 0.4387, + "step": 4290 + }, + { + "epoch": 0.9000628535512256, + "eval_accuracy": 0.8749659063444953, + "eval_loss": 0.4478217661380768, + "eval_runtime": 801.5583, + "eval_samples_per_second": 5.97, + "eval_steps_per_second": 1.493, + "step": 4296 + }, + { + "epoch": 0.901948460087995, + "grad_norm": 1.0927642583847046, + "learning_rate": 4.099617169223971e-05, + "loss": 0.4717, + "step": 4305 + }, + { + "epoch": 0.9050911376492772, + "grad_norm": 1.3863451480865479, + "learning_rate": 4.093089358420059e-05, + "loss": 0.4482, + "step": 4320 + }, + { + "epoch": 0.9082338152105593, + "grad_norm": 0.8744410276412964, + "learning_rate": 4.08654320843131e-05, + "loss": 0.4739, + "step": 4335 + }, + { + "epoch": 0.9113764927718416, + "grad_norm": 1.1781022548675537, + "learning_rate": 4.079978794615115e-05, + "loss": 0.408, + "step": 4350 + }, + { + "epoch": 0.9145191703331238, + "grad_norm": 1.225847840309143, + "learning_rate": 4.07339619253911e-05, + "loss": 0.4624, + "step": 4365 + }, + { + "epoch": 0.9176618478944061, + "grad_norm": 1.2807953357696533, + "learning_rate": 4.0667954779803094e-05, + "loss": 0.4506, + "step": 4380 + }, + { + "epoch": 0.9208045254556882, + "grad_norm": 1.3124723434448242, + "learning_rate": 4.0601767269242356e-05, + "loss": 0.4253, + "step": 4395 + }, + { + "epoch": 0.9239472030169704, + "grad_norm": 1.10555899143219, + "learning_rate": 4.053540015564039e-05, + "loss": 0.4078, + "step": 4410 + }, + { + "epoch": 0.9270898805782527, + "grad_norm": 1.0445165634155273, + "learning_rate": 4.046885420299625e-05, + "loss": 0.4157, + "step": 4425 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 1.0756609439849854, + "learning_rate": 4.040213017736774e-05, + "loss": 0.4494, + "step": 4440 + }, + { + "epoch": 0.933375235700817, + "grad_norm": 1.2414379119873047, + "learning_rate": 4.0335228846862575e-05, + "loss": 0.4544, + "step": 4455 + }, + { + "epoch": 0.9365179132620993, + "grad_norm": 1.2390245199203491, + "learning_rate": 4.026815098162957e-05, + "loss": 0.4086, + "step": 4470 + }, + { + "epoch": 0.9396605908233815, + "grad_norm": 1.250126600265503, + "learning_rate": 4.020089735384973e-05, + "loss": 0.4206, + "step": 4485 + }, + { + "epoch": 0.9428032683846638, + "grad_norm": 1.0727368593215942, + "learning_rate": 4.013346873772743e-05, + "loss": 0.4265, + "step": 4500 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 1.2256518602371216, + "learning_rate": 4.0065865909481417e-05, + "loss": 0.4437, + "step": 4515 + }, + { + "epoch": 0.9490886235072281, + "grad_norm": 1.4009459018707275, + "learning_rate": 3.9998089647335933e-05, + "loss": 0.4203, + "step": 4530 + }, + { + "epoch": 0.9522313010685104, + "grad_norm": 1.1759395599365234, + "learning_rate": 3.993014073151175e-05, + "loss": 0.4978, + "step": 4545 + }, + { + "epoch": 0.9553739786297926, + "grad_norm": 1.0505579710006714, + "learning_rate": 3.9862019944217175e-05, + "loss": 0.4191, + "step": 4560 + }, + { + "epoch": 0.9585166561910748, + "grad_norm": 1.3067837953567505, + "learning_rate": 3.9793728069639046e-05, + "loss": 0.4671, + "step": 4575 + }, + { + "epoch": 0.961659333752357, + "grad_norm": 1.2706676721572876, + "learning_rate": 3.972526589393372e-05, + "loss": 0.4288, + "step": 4590 + }, + { + "epoch": 0.9648020113136392, + "grad_norm": 1.1527299880981445, + "learning_rate": 3.965663420521798e-05, + "loss": 0.4697, + "step": 4605 + }, + { + "epoch": 0.9679446888749215, + "grad_norm": 0.8752300143241882, + "learning_rate": 3.9587833793560026e-05, + "loss": 0.4522, + "step": 4620 + }, + { + "epoch": 0.9710873664362036, + "grad_norm": 1.0137310028076172, + "learning_rate": 3.9518865450970346e-05, + "loss": 0.4606, + "step": 4635 + }, + { + "epoch": 0.9742300439974858, + "grad_norm": 1.1071418523788452, + "learning_rate": 3.944972997139257e-05, + "loss": 0.4403, + "step": 4650 + }, + { + "epoch": 0.9773727215587681, + "grad_norm": 1.193814754486084, + "learning_rate": 3.93804281506944e-05, + "loss": 0.4046, + "step": 4665 + }, + { + "epoch": 0.9805153991200503, + "grad_norm": 1.1703835725784302, + "learning_rate": 3.93109607866584e-05, + "loss": 0.3727, + "step": 4680 + }, + { + "epoch": 0.9836580766813325, + "grad_norm": 1.2460951805114746, + "learning_rate": 3.924132867897279e-05, + "loss": 0.4457, + "step": 4695 + }, + { + "epoch": 0.9868007542426147, + "grad_norm": 1.162644624710083, + "learning_rate": 3.9171532629222304e-05, + "loss": 0.4532, + "step": 4710 + }, + { + "epoch": 0.9899434318038969, + "grad_norm": 1.1026623249053955, + "learning_rate": 3.910157344087892e-05, + "loss": 0.4886, + "step": 4725 + }, + { + "epoch": 0.9930861093651792, + "grad_norm": 1.3245232105255127, + "learning_rate": 3.9031451919292616e-05, + "loss": 0.474, + "step": 4740 + }, + { + "epoch": 0.9962287869264613, + "grad_norm": 1.5628905296325684, + "learning_rate": 3.8961168871682116e-05, + "loss": 0.5021, + "step": 4755 + }, + { + "epoch": 0.9993714644877436, + "grad_norm": 1.0988940000534058, + "learning_rate": 3.889072510712557e-05, + "loss": 0.4488, + "step": 4770 + }, + { + "epoch": 1.0025141420490258, + "grad_norm": 1.1718677282333374, + "learning_rate": 3.882012143655126e-05, + "loss": 0.4284, + "step": 4785 + }, + { + "epoch": 1.005656819610308, + "grad_norm": 1.3951458930969238, + "learning_rate": 3.874935867272826e-05, + "loss": 0.4057, + "step": 4800 + }, + { + "epoch": 1.0087994971715901, + "grad_norm": 1.1581798791885376, + "learning_rate": 3.867843763025709e-05, + "loss": 0.4073, + "step": 4815 + }, + { + "epoch": 1.0119421747328725, + "grad_norm": 1.4225468635559082, + "learning_rate": 3.860735912556031e-05, + "loss": 0.4437, + "step": 4830 + }, + { + "epoch": 1.0150848522941547, + "grad_norm": 0.9562087059020996, + "learning_rate": 3.853612397687315e-05, + "loss": 0.4008, + "step": 4845 + }, + { + "epoch": 1.0182275298554369, + "grad_norm": 1.3174970149993896, + "learning_rate": 3.846473300423409e-05, + "loss": 0.4135, + "step": 4860 + }, + { + "epoch": 1.021370207416719, + "grad_norm": 1.4198646545410156, + "learning_rate": 3.839318702947538e-05, + "loss": 0.434, + "step": 4875 + }, + { + "epoch": 1.0245128849780012, + "grad_norm": 1.2705206871032715, + "learning_rate": 3.832148687621365e-05, + "loss": 0.4136, + "step": 4890 + }, + { + "epoch": 1.0276555625392834, + "grad_norm": 1.254346489906311, + "learning_rate": 3.8249633369840346e-05, + "loss": 0.3875, + "step": 4905 + }, + { + "epoch": 1.0307982401005658, + "grad_norm": 1.2936162948608398, + "learning_rate": 3.817762733751231e-05, + "loss": 0.3966, + "step": 4920 + }, + { + "epoch": 1.033940917661848, + "grad_norm": 1.0256013870239258, + "learning_rate": 3.81054696081422e-05, + "loss": 0.4171, + "step": 4935 + }, + { + "epoch": 1.03708359522313, + "grad_norm": 1.2666840553283691, + "learning_rate": 3.803316101238895e-05, + "loss": 0.4003, + "step": 4950 + }, + { + "epoch": 1.0402262727844123, + "grad_norm": 1.2721953392028809, + "learning_rate": 3.796070238264826e-05, + "loss": 0.4034, + "step": 4965 + }, + { + "epoch": 1.0433689503456944, + "grad_norm": 1.24618661403656, + "learning_rate": 3.7888094553042954e-05, + "loss": 0.4406, + "step": 4980 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 0.923187255859375, + "learning_rate": 3.78153383594134e-05, + "loss": 0.4689, + "step": 4995 + }, + { + "epoch": 1.049654305468259, + "grad_norm": 1.0710513591766357, + "learning_rate": 3.774243463930791e-05, + "loss": 0.3844, + "step": 5010 + }, + { + "epoch": 1.0527969830295412, + "grad_norm": 1.2138617038726807, + "learning_rate": 3.766938423197306e-05, + "loss": 0.3412, + "step": 5025 + }, + { + "epoch": 1.0559396605908233, + "grad_norm": 1.3552145957946777, + "learning_rate": 3.7596187978344056e-05, + "loss": 0.4033, + "step": 5040 + }, + { + "epoch": 1.0590823381521055, + "grad_norm": 1.2156639099121094, + "learning_rate": 3.752284672103503e-05, + "loss": 0.4309, + "step": 5055 + }, + { + "epoch": 1.062225015713388, + "grad_norm": 1.4516615867614746, + "learning_rate": 3.7449361304329384e-05, + "loss": 0.42, + "step": 5070 + }, + { + "epoch": 1.06536769327467, + "grad_norm": 1.2875463962554932, + "learning_rate": 3.737573257417001e-05, + "loss": 0.3772, + "step": 5085 + }, + { + "epoch": 1.0685103708359522, + "grad_norm": 1.2341505289077759, + "learning_rate": 3.730196137814959e-05, + "loss": 0.4058, + "step": 5100 + }, + { + "epoch": 1.0716530483972344, + "grad_norm": 1.193441390991211, + "learning_rate": 3.7228048565500854e-05, + "loss": 0.4121, + "step": 5115 + }, + { + "epoch": 1.0747957259585166, + "grad_norm": 1.274909496307373, + "learning_rate": 3.715399498708676e-05, + "loss": 0.4187, + "step": 5130 + }, + { + "epoch": 1.077938403519799, + "grad_norm": 1.2880769968032837, + "learning_rate": 3.7079801495390715e-05, + "loss": 0.4071, + "step": 5145 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.7923028469085693, + "learning_rate": 3.70054689445068e-05, + "loss": 0.3541, + "step": 5160 + }, + { + "epoch": 1.0842237586423633, + "grad_norm": 1.3296815156936646, + "learning_rate": 3.6930998190129864e-05, + "loss": 0.3166, + "step": 5175 + }, + { + "epoch": 1.0873664362036455, + "grad_norm": 1.1654574871063232, + "learning_rate": 3.685639008954574e-05, + "loss": 0.484, + "step": 5190 + }, + { + "epoch": 1.0905091137649277, + "grad_norm": 1.2645684480667114, + "learning_rate": 3.6781645501621365e-05, + "loss": 0.416, + "step": 5205 + }, + { + "epoch": 1.0936517913262098, + "grad_norm": 1.2940104007720947, + "learning_rate": 3.670676528679483e-05, + "loss": 0.3892, + "step": 5220 + }, + { + "epoch": 1.0967944688874922, + "grad_norm": 1.003873586654663, + "learning_rate": 3.663175030706557e-05, + "loss": 0.4249, + "step": 5235 + }, + { + "epoch": 1.0999371464487744, + "grad_norm": 1.3847322463989258, + "learning_rate": 3.655660142598437e-05, + "loss": 0.3728, + "step": 5250 + }, + { + "epoch": 1.1030798240100566, + "grad_norm": 0.9578964710235596, + "learning_rate": 3.648131950864347e-05, + "loss": 0.3692, + "step": 5265 + }, + { + "epoch": 1.1062225015713387, + "grad_norm": 1.3054499626159668, + "learning_rate": 3.640590542166656e-05, + "loss": 0.3691, + "step": 5280 + }, + { + "epoch": 1.109365179132621, + "grad_norm": 1.1627558469772339, + "learning_rate": 3.633036003319885e-05, + "loss": 0.4018, + "step": 5295 + }, + { + "epoch": 1.1125078566939033, + "grad_norm": 1.445669174194336, + "learning_rate": 3.6254684212897035e-05, + "loss": 0.4158, + "step": 5310 + }, + { + "epoch": 1.1156505342551855, + "grad_norm": 0.9246712327003479, + "learning_rate": 3.617887883191931e-05, + "loss": 0.3393, + "step": 5325 + }, + { + "epoch": 1.1187932118164676, + "grad_norm": 1.249263882637024, + "learning_rate": 3.6102944762915355e-05, + "loss": 0.3863, + "step": 5340 + }, + { + "epoch": 1.1219358893777498, + "grad_norm": 1.1501426696777344, + "learning_rate": 3.602688288001624e-05, + "loss": 0.403, + "step": 5355 + }, + { + "epoch": 1.125078566939032, + "grad_norm": 1.2710976600646973, + "learning_rate": 3.595069405882441e-05, + "loss": 0.4146, + "step": 5370 + }, + { + "epoch": 1.1282212445003144, + "grad_norm": 1.4132471084594727, + "learning_rate": 3.587437917640358e-05, + "loss": 0.3891, + "step": 5385 + }, + { + "epoch": 1.1313639220615965, + "grad_norm": 1.3578236103057861, + "learning_rate": 3.5797939111268665e-05, + "loss": 0.378, + "step": 5400 + }, + { + "epoch": 1.1345065996228787, + "grad_norm": 1.1907520294189453, + "learning_rate": 3.57213747433756e-05, + "loss": 0.379, + "step": 5415 + }, + { + "epoch": 1.1376492771841609, + "grad_norm": 1.0988811254501343, + "learning_rate": 3.5644686954111305e-05, + "loss": 0.3431, + "step": 5430 + }, + { + "epoch": 1.140791954745443, + "grad_norm": 1.3456612825393677, + "learning_rate": 3.556787662628347e-05, + "loss": 0.3863, + "step": 5445 + }, + { + "epoch": 1.1439346323067254, + "grad_norm": 1.257224678993225, + "learning_rate": 3.549094464411042e-05, + "loss": 0.4368, + "step": 5460 + }, + { + "epoch": 1.1470773098680076, + "grad_norm": 1.4249401092529297, + "learning_rate": 3.541389189321092e-05, + "loss": 0.4006, + "step": 5475 + }, + { + "epoch": 1.1502199874292898, + "grad_norm": 1.2512503862380981, + "learning_rate": 3.5336719260594e-05, + "loss": 0.4137, + "step": 5490 + }, + { + "epoch": 1.153362664990572, + "grad_norm": 1.3531768321990967, + "learning_rate": 3.5259427634648737e-05, + "loss": 0.4046, + "step": 5505 + }, + { + "epoch": 1.156505342551854, + "grad_norm": 0.8420467972755432, + "learning_rate": 3.5182017905134e-05, + "loss": 0.3743, + "step": 5520 + }, + { + "epoch": 1.1596480201131363, + "grad_norm": 1.3925787210464478, + "learning_rate": 3.5104490963168274e-05, + "loss": 0.4171, + "step": 5535 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 1.1061654090881348, + "learning_rate": 3.502684770121932e-05, + "loss": 0.3032, + "step": 5550 + }, + { + "epoch": 1.1659333752357008, + "grad_norm": 1.4722493886947632, + "learning_rate": 3.494908901309396e-05, + "loss": 0.3401, + "step": 5565 + }, + { + "epoch": 1.169076052796983, + "grad_norm": 1.3742226362228394, + "learning_rate": 3.487121579392777e-05, + "loss": 0.394, + "step": 5580 + }, + { + "epoch": 1.1722187303582652, + "grad_norm": 0.6497241258621216, + "learning_rate": 3.479322894017476e-05, + "loss": 0.362, + "step": 5595 + }, + { + "epoch": 1.1753614079195476, + "grad_norm": 1.2617154121398926, + "learning_rate": 3.471512934959709e-05, + "loss": 0.3857, + "step": 5610 + }, + { + "epoch": 1.1785040854808297, + "grad_norm": 1.2584044933319092, + "learning_rate": 3.46369179212547e-05, + "loss": 0.4159, + "step": 5625 + }, + { + "epoch": 1.181646763042112, + "grad_norm": 0.9578741788864136, + "learning_rate": 3.455859555549498e-05, + "loss": 0.4259, + "step": 5640 + }, + { + "epoch": 1.184789440603394, + "grad_norm": 1.0911635160446167, + "learning_rate": 3.448016315394238e-05, + "loss": 0.3585, + "step": 5655 + }, + { + "epoch": 1.1879321181646763, + "grad_norm": 1.2654902935028076, + "learning_rate": 3.440162161948809e-05, + "loss": 0.3954, + "step": 5670 + }, + { + "epoch": 1.1910747957259584, + "grad_norm": 1.2683358192443848, + "learning_rate": 3.432297185627956e-05, + "loss": 0.3946, + "step": 5685 + }, + { + "epoch": 1.1942174732872408, + "grad_norm": 1.0978072881698608, + "learning_rate": 3.424421476971018e-05, + "loss": 0.3866, + "step": 5700 + }, + { + "epoch": 1.197360150848523, + "grad_norm": 1.1124176979064941, + "learning_rate": 3.41653512664088e-05, + "loss": 0.3547, + "step": 5715 + }, + { + "epoch": 1.2005028284098052, + "grad_norm": 1.274763584136963, + "learning_rate": 3.408638225422928e-05, + "loss": 0.3512, + "step": 5730 + }, + { + "epoch": 1.2036455059710873, + "grad_norm": 1.1088907718658447, + "learning_rate": 3.400730864224011e-05, + "loss": 0.3982, + "step": 5745 + }, + { + "epoch": 1.2067881835323695, + "grad_norm": 1.464532494544983, + "learning_rate": 3.392813134071388e-05, + "loss": 0.3889, + "step": 5760 + }, + { + "epoch": 1.2099308610936519, + "grad_norm": 1.2237341403961182, + "learning_rate": 3.3848851261116845e-05, + "loss": 0.433, + "step": 5775 + }, + { + "epoch": 1.213073538654934, + "grad_norm": 1.3050017356872559, + "learning_rate": 3.3769469316098375e-05, + "loss": 0.3904, + "step": 5790 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 1.3422915935516357, + "learning_rate": 3.368998641948052e-05, + "loss": 0.3807, + "step": 5805 + }, + { + "epoch": 1.2193588937774984, + "grad_norm": 1.2591235637664795, + "learning_rate": 3.3610403486247436e-05, + "loss": 0.3875, + "step": 5820 + }, + { + "epoch": 1.2225015713387806, + "grad_norm": 1.665328860282898, + "learning_rate": 3.353072143253489e-05, + "loss": 0.3621, + "step": 5835 + }, + { + "epoch": 1.2256442489000627, + "grad_norm": 1.1227225065231323, + "learning_rate": 3.345094117561967e-05, + "loss": 0.4314, + "step": 5850 + }, + { + "epoch": 1.2287869264613451, + "grad_norm": 1.421695351600647, + "learning_rate": 3.337106363390907e-05, + "loss": 0.3899, + "step": 5865 + }, + { + "epoch": 1.2319296040226273, + "grad_norm": 1.3472914695739746, + "learning_rate": 3.32910897269303e-05, + "loss": 0.4728, + "step": 5880 + }, + { + "epoch": 1.2350722815839095, + "grad_norm": 1.234174132347107, + "learning_rate": 3.321102037531987e-05, + "loss": 0.4298, + "step": 5895 + }, + { + "epoch": 1.2382149591451916, + "grad_norm": 1.3448835611343384, + "learning_rate": 3.313085650081307e-05, + "loss": 0.3667, + "step": 5910 + }, + { + "epoch": 1.241357636706474, + "grad_norm": 1.5955106019973755, + "learning_rate": 3.305059902623326e-05, + "loss": 0.3968, + "step": 5925 + }, + { + "epoch": 1.2445003142677562, + "grad_norm": 0.8962088823318481, + "learning_rate": 3.297024887548134e-05, + "loss": 0.3656, + "step": 5940 + }, + { + "epoch": 1.2476429918290384, + "grad_norm": 1.0347754955291748, + "learning_rate": 3.288980697352504e-05, + "loss": 0.3872, + "step": 5955 + }, + { + "epoch": 1.2507856693903205, + "grad_norm": 1.20237135887146, + "learning_rate": 3.280927424638832e-05, + "loss": 0.338, + "step": 5970 + }, + { + "epoch": 1.2539283469516027, + "grad_norm": 1.0156171321868896, + "learning_rate": 3.272865162114068e-05, + "loss": 0.3318, + "step": 5985 + }, + { + "epoch": 1.2570710245128849, + "grad_norm": 1.4129784107208252, + "learning_rate": 3.2647940025886525e-05, + "loss": 0.4283, + "step": 6000 + }, + { + "epoch": 1.260213702074167, + "grad_norm": 1.121748924255371, + "learning_rate": 3.256714038975443e-05, + "loss": 0.4193, + "step": 6015 + }, + { + "epoch": 1.2633563796354494, + "grad_norm": 1.0323454141616821, + "learning_rate": 3.248625364288648e-05, + "loss": 0.4382, + "step": 6030 + }, + { + "epoch": 1.2664990571967316, + "grad_norm": 1.118606686592102, + "learning_rate": 3.240528071642756e-05, + "loss": 0.3337, + "step": 6045 + }, + { + "epoch": 1.2696417347580138, + "grad_norm": 1.1677335500717163, + "learning_rate": 3.232422254251463e-05, + "loss": 0.4412, + "step": 6060 + }, + { + "epoch": 1.2727844123192962, + "grad_norm": 1.3037948608398438, + "learning_rate": 3.2243080054265994e-05, + "loss": 0.4399, + "step": 6075 + }, + { + "epoch": 1.2759270898805783, + "grad_norm": 1.1724669933319092, + "learning_rate": 3.216185418577054e-05, + "loss": 0.3618, + "step": 6090 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 1.173636794090271, + "learning_rate": 3.208054587207703e-05, + "loss": 0.3273, + "step": 6105 + }, + { + "epoch": 1.2822124450031427, + "grad_norm": 1.416745901107788, + "learning_rate": 3.1999156049183297e-05, + "loss": 0.4196, + "step": 6120 + }, + { + "epoch": 1.2853551225644249, + "grad_norm": 1.1313838958740234, + "learning_rate": 3.191768565402549e-05, + "loss": 0.3977, + "step": 6135 + }, + { + "epoch": 1.288497800125707, + "grad_norm": 1.193344235420227, + "learning_rate": 3.1836135624467276e-05, + "loss": 0.4304, + "step": 6150 + }, + { + "epoch": 1.2916404776869892, + "grad_norm": 1.3981118202209473, + "learning_rate": 3.175450689928907e-05, + "loss": 0.3614, + "step": 6165 + }, + { + "epoch": 1.2947831552482716, + "grad_norm": 1.1428194046020508, + "learning_rate": 3.167280041817717e-05, + "loss": 0.4059, + "step": 6180 + }, + { + "epoch": 1.2979258328095538, + "grad_norm": 1.2573941946029663, + "learning_rate": 3.1591017121713027e-05, + "loss": 0.3004, + "step": 6195 + }, + { + "epoch": 1.301068510370836, + "grad_norm": 1.4468852281570435, + "learning_rate": 3.150915795136232e-05, + "loss": 0.43, + "step": 6210 + }, + { + "epoch": 1.304211187932118, + "grad_norm": 1.2576549053192139, + "learning_rate": 3.14272238494642e-05, + "loss": 0.4297, + "step": 6225 + }, + { + "epoch": 1.3073538654934005, + "grad_norm": 1.1931512355804443, + "learning_rate": 3.1345215759220405e-05, + "loss": 0.4177, + "step": 6240 + }, + { + "epoch": 1.3104965430546827, + "grad_norm": 1.3183330297470093, + "learning_rate": 3.126313462468438e-05, + "loss": 0.3405, + "step": 6255 + }, + { + "epoch": 1.3136392206159648, + "grad_norm": 1.4701759815216064, + "learning_rate": 3.118098139075046e-05, + "loss": 0.4108, + "step": 6270 + }, + { + "epoch": 1.316781898177247, + "grad_norm": 1.1573525667190552, + "learning_rate": 3.109875700314296e-05, + "loss": 0.3971, + "step": 6285 + }, + { + "epoch": 1.3199245757385292, + "grad_norm": 1.167579174041748, + "learning_rate": 3.1016462408405304e-05, + "loss": 0.2966, + "step": 6300 + }, + { + "epoch": 1.3230672532998113, + "grad_norm": 1.184237003326416, + "learning_rate": 3.0934098553889095e-05, + "loss": 0.4177, + "step": 6315 + }, + { + "epoch": 1.3262099308610937, + "grad_norm": 1.4354579448699951, + "learning_rate": 3.0851666387743265e-05, + "loss": 0.3421, + "step": 6330 + }, + { + "epoch": 1.329352608422376, + "grad_norm": 1.3448097705841064, + "learning_rate": 3.076916685890311e-05, + "loss": 0.3851, + "step": 6345 + }, + { + "epoch": 1.332495285983658, + "grad_norm": 1.4120362997055054, + "learning_rate": 3.0686600917079386e-05, + "loss": 0.3758, + "step": 6360 + }, + { + "epoch": 1.3356379635449402, + "grad_norm": 1.4061853885650635, + "learning_rate": 3.060396951274739e-05, + "loss": 0.4013, + "step": 6375 + }, + { + "epoch": 1.3387806411062226, + "grad_norm": 0.6553401947021484, + "learning_rate": 3.0521273597136e-05, + "loss": 0.3807, + "step": 6390 + }, + { + "epoch": 1.3419233186675048, + "grad_norm": 1.2400474548339844, + "learning_rate": 3.0438514122216722e-05, + "loss": 0.3544, + "step": 6405 + }, + { + "epoch": 1.345065996228787, + "grad_norm": 1.2030977010726929, + "learning_rate": 3.0355692040692736e-05, + "loss": 0.3586, + "step": 6420 + }, + { + "epoch": 1.3482086737900691, + "grad_norm": 1.2839069366455078, + "learning_rate": 3.0272808305987943e-05, + "loss": 0.3798, + "step": 6435 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 1.0002667903900146, + "learning_rate": 3.0189863872235968e-05, + "loss": 0.386, + "step": 6450 + }, + { + "epoch": 1.3544940289126335, + "grad_norm": 1.1636244058609009, + "learning_rate": 3.0106859694269196e-05, + "loss": 0.4351, + "step": 6465 + }, + { + "epoch": 1.3576367064739157, + "grad_norm": 0.9394842982292175, + "learning_rate": 3.002379672760776e-05, + "loss": 0.3461, + "step": 6480 + }, + { + "epoch": 1.360779384035198, + "grad_norm": 1.2645450830459595, + "learning_rate": 2.994067592844856e-05, + "loss": 0.3852, + "step": 6495 + }, + { + "epoch": 1.3639220615964802, + "grad_norm": 1.3446435928344727, + "learning_rate": 2.9857498253654232e-05, + "loss": 0.3481, + "step": 6510 + }, + { + "epoch": 1.3670647391577624, + "grad_norm": 1.2624894380569458, + "learning_rate": 2.9774264660742164e-05, + "loss": 0.3987, + "step": 6525 + }, + { + "epoch": 1.3702074167190446, + "grad_norm": 1.2067941427230835, + "learning_rate": 2.9690976107873453e-05, + "loss": 0.3639, + "step": 6540 + }, + { + "epoch": 1.373350094280327, + "grad_norm": 1.1371479034423828, + "learning_rate": 2.960763355384188e-05, + "loss": 0.3925, + "step": 6555 + }, + { + "epoch": 1.3764927718416091, + "grad_norm": 1.0012383460998535, + "learning_rate": 2.9524237958062862e-05, + "loss": 0.4186, + "step": 6570 + }, + { + "epoch": 1.3796354494028913, + "grad_norm": 1.0432685613632202, + "learning_rate": 2.944079028056243e-05, + "loss": 0.3869, + "step": 6585 + }, + { + "epoch": 1.3827781269641735, + "grad_norm": 1.4123237133026123, + "learning_rate": 2.9357291481966155e-05, + "loss": 0.4134, + "step": 6600 + }, + { + "epoch": 1.3859208045254556, + "grad_norm": 1.1969938278198242, + "learning_rate": 2.927374252348812e-05, + "loss": 0.3821, + "step": 6615 + }, + { + "epoch": 1.3890634820867378, + "grad_norm": 1.2030854225158691, + "learning_rate": 2.9190144366919793e-05, + "loss": 0.3853, + "step": 6630 + }, + { + "epoch": 1.3922061596480202, + "grad_norm": 1.1836553812026978, + "learning_rate": 2.9106497974619042e-05, + "loss": 0.3595, + "step": 6645 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 1.6539838314056396, + "learning_rate": 2.9022804309498975e-05, + "loss": 0.4392, + "step": 6660 + }, + { + "epoch": 1.3984915147705845, + "grad_norm": 1.295224666595459, + "learning_rate": 2.8939064335016913e-05, + "loss": 0.4172, + "step": 6675 + }, + { + "epoch": 1.4016341923318667, + "grad_norm": 1.1444505453109741, + "learning_rate": 2.8855279015163273e-05, + "loss": 0.3857, + "step": 6690 + }, + { + "epoch": 1.404776869893149, + "grad_norm": 1.4091520309448242, + "learning_rate": 2.8771449314450466e-05, + "loss": 0.4384, + "step": 6705 + }, + { + "epoch": 1.4079195474544313, + "grad_norm": 0.9858888983726501, + "learning_rate": 2.8687576197901812e-05, + "loss": 0.342, + "step": 6720 + }, + { + "epoch": 1.4110622250157134, + "grad_norm": 1.2735475301742554, + "learning_rate": 2.860366063104041e-05, + "loss": 0.462, + "step": 6735 + }, + { + "epoch": 1.4142049025769956, + "grad_norm": 1.1398062705993652, + "learning_rate": 2.8519703579878053e-05, + "loss": 0.4295, + "step": 6750 + }, + { + "epoch": 1.4173475801382778, + "grad_norm": 1.4460091590881348, + "learning_rate": 2.8435706010904085e-05, + "loss": 0.3801, + "step": 6765 + }, + { + "epoch": 1.42049025769956, + "grad_norm": 1.573014736175537, + "learning_rate": 2.835166889107425e-05, + "loss": 0.4661, + "step": 6780 + }, + { + "epoch": 1.4236329352608421, + "grad_norm": 1.5855605602264404, + "learning_rate": 2.8267593187799633e-05, + "loss": 0.3628, + "step": 6795 + }, + { + "epoch": 1.4267756128221245, + "grad_norm": 1.3220208883285522, + "learning_rate": 2.8183479868935466e-05, + "loss": 0.3755, + "step": 6810 + }, + { + "epoch": 1.4299182903834067, + "grad_norm": 1.4992631673812866, + "learning_rate": 2.809932990276997e-05, + "loss": 0.4043, + "step": 6825 + }, + { + "epoch": 1.4330609679446888, + "grad_norm": 1.355560302734375, + "learning_rate": 2.8015144258013282e-05, + "loss": 0.412, + "step": 6840 + }, + { + "epoch": 1.436203645505971, + "grad_norm": 1.146181583404541, + "learning_rate": 2.7930923903786255e-05, + "loss": 0.3505, + "step": 6855 + }, + { + "epoch": 1.4393463230672534, + "grad_norm": 1.8377063274383545, + "learning_rate": 2.7846669809609267e-05, + "loss": 0.4537, + "step": 6870 + }, + { + "epoch": 1.4424890006285356, + "grad_norm": 1.4548070430755615, + "learning_rate": 2.7762382945391156e-05, + "loss": 0.4113, + "step": 6885 + }, + { + "epoch": 1.4456316781898177, + "grad_norm": 1.3672486543655396, + "learning_rate": 2.7678064281417952e-05, + "loss": 0.3917, + "step": 6900 + }, + { + "epoch": 1.4487743557511, + "grad_norm": 1.1587488651275635, + "learning_rate": 2.7593714788341795e-05, + "loss": 0.3334, + "step": 6915 + }, + { + "epoch": 1.451917033312382, + "grad_norm": 1.2732610702514648, + "learning_rate": 2.7509335437169693e-05, + "loss": 0.373, + "step": 6930 + }, + { + "epoch": 1.4550597108736643, + "grad_norm": 1.458500862121582, + "learning_rate": 2.7424927199252364e-05, + "loss": 0.3409, + "step": 6945 + }, + { + "epoch": 1.4582023884349467, + "grad_norm": 1.3266096115112305, + "learning_rate": 2.734049104627311e-05, + "loss": 0.443, + "step": 6960 + }, + { + "epoch": 1.4613450659962288, + "grad_norm": 1.0348279476165771, + "learning_rate": 2.7256027950236517e-05, + "loss": 0.3772, + "step": 6975 + }, + { + "epoch": 1.464487743557511, + "grad_norm": 1.2738145589828491, + "learning_rate": 2.7171538883457396e-05, + "loss": 0.364, + "step": 6990 + }, + { + "epoch": 1.4676304211187932, + "grad_norm": 1.184635877609253, + "learning_rate": 2.708702481854947e-05, + "loss": 0.3866, + "step": 7005 + }, + { + "epoch": 1.4707730986800756, + "grad_norm": 1.2299425601959229, + "learning_rate": 2.7002486728414283e-05, + "loss": 0.3716, + "step": 7020 + }, + { + "epoch": 1.4739157762413577, + "grad_norm": 1.3776116371154785, + "learning_rate": 2.6917925586229897e-05, + "loss": 0.402, + "step": 7035 + }, + { + "epoch": 1.47705845380264, + "grad_norm": 1.3003356456756592, + "learning_rate": 2.68333423654398e-05, + "loss": 0.3722, + "step": 7050 + }, + { + "epoch": 1.480201131363922, + "grad_norm": 1.2862930297851562, + "learning_rate": 2.67487380397416e-05, + "loss": 0.4417, + "step": 7065 + }, + { + "epoch": 1.4833438089252042, + "grad_norm": 1.116700530052185, + "learning_rate": 2.666411358307586e-05, + "loss": 0.3577, + "step": 7080 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 1.3424625396728516, + "learning_rate": 2.657946996961493e-05, + "loss": 0.3389, + "step": 7095 + }, + { + "epoch": 1.4896291640477686, + "grad_norm": 1.3122916221618652, + "learning_rate": 2.6494808173751622e-05, + "loss": 0.4148, + "step": 7110 + }, + { + "epoch": 1.492771841609051, + "grad_norm": 0.8987470865249634, + "learning_rate": 2.6410129170088115e-05, + "loss": 0.387, + "step": 7125 + }, + { + "epoch": 1.4959145191703331, + "grad_norm": 1.0086872577667236, + "learning_rate": 2.6325433933424644e-05, + "loss": 0.3495, + "step": 7140 + }, + { + "epoch": 1.4990571967316153, + "grad_norm": 1.3022773265838623, + "learning_rate": 2.6240723438748332e-05, + "loss": 0.366, + "step": 7155 + }, + { + "epoch": 1.5021998742928977, + "grad_norm": 1.324033260345459, + "learning_rate": 2.615599866122193e-05, + "loss": 0.3845, + "step": 7170 + }, + { + "epoch": 1.5053425518541799, + "grad_norm": 0.7969958782196045, + "learning_rate": 2.6071260576172634e-05, + "loss": 0.3597, + "step": 7185 + }, + { + "epoch": 1.508485229415462, + "grad_norm": 1.2666351795196533, + "learning_rate": 2.5986510159080824e-05, + "loss": 0.3573, + "step": 7200 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 1.4982563257217407, + "learning_rate": 2.590174838556881e-05, + "loss": 0.3576, + "step": 7215 + }, + { + "epoch": 1.5147705845380264, + "grad_norm": 1.5081130266189575, + "learning_rate": 2.581697623138969e-05, + "loss": 0.2803, + "step": 7230 + }, + { + "epoch": 1.5179132620993085, + "grad_norm": 1.267719030380249, + "learning_rate": 2.5732194672416012e-05, + "loss": 0.3586, + "step": 7245 + }, + { + "epoch": 1.5210559396605907, + "grad_norm": 1.1292250156402588, + "learning_rate": 2.5647404684628622e-05, + "loss": 0.3974, + "step": 7260 + }, + { + "epoch": 1.5241986172218729, + "grad_norm": 1.3279204368591309, + "learning_rate": 2.556260724410538e-05, + "loss": 0.3828, + "step": 7275 + }, + { + "epoch": 1.5273412947831553, + "grad_norm": 1.337803602218628, + "learning_rate": 2.5477803327009948e-05, + "loss": 0.3692, + "step": 7290 + }, + { + "epoch": 1.5304839723444374, + "grad_norm": 1.159134030342102, + "learning_rate": 2.5392993909580537e-05, + "loss": 0.354, + "step": 7305 + }, + { + "epoch": 1.5336266499057196, + "grad_norm": 1.2121402025222778, + "learning_rate": 2.5308179968118677e-05, + "loss": 0.4087, + "step": 7320 + }, + { + "epoch": 1.536769327467002, + "grad_norm": 1.2714091539382935, + "learning_rate": 2.522336247897799e-05, + "loss": 0.4065, + "step": 7335 + }, + { + "epoch": 1.5399120050282842, + "grad_norm": 1.128733515739441, + "learning_rate": 2.5138542418552913e-05, + "loss": 0.3605, + "step": 7350 + }, + { + "epoch": 1.5430546825895664, + "grad_norm": 1.140023946762085, + "learning_rate": 2.5053720763267506e-05, + "loss": 0.3573, + "step": 7365 + }, + { + "epoch": 1.5461973601508485, + "grad_norm": 1.3230198621749878, + "learning_rate": 2.4968898489564185e-05, + "loss": 0.3182, + "step": 7380 + }, + { + "epoch": 1.5493400377121307, + "grad_norm": 1.0801093578338623, + "learning_rate": 2.4884076573892464e-05, + "loss": 0.3523, + "step": 7395 + }, + { + "epoch": 1.5524827152734129, + "grad_norm": 1.204451084136963, + "learning_rate": 2.4799255992697767e-05, + "loss": 0.3502, + "step": 7410 + }, + { + "epoch": 1.555625392834695, + "grad_norm": 1.164306640625, + "learning_rate": 2.4714437722410145e-05, + "loss": 0.3451, + "step": 7425 + }, + { + "epoch": 1.5587680703959774, + "grad_norm": 0.8542248606681824, + "learning_rate": 2.4629622739433016e-05, + "loss": 0.3803, + "step": 7440 + }, + { + "epoch": 1.5619107479572596, + "grad_norm": 1.2533782720565796, + "learning_rate": 2.4544812020132007e-05, + "loss": 0.3561, + "step": 7455 + }, + { + "epoch": 1.5650534255185418, + "grad_norm": 1.3054505586624146, + "learning_rate": 2.4460006540823635e-05, + "loss": 0.4579, + "step": 7470 + }, + { + "epoch": 1.5681961030798242, + "grad_norm": 1.4427162408828735, + "learning_rate": 2.4375207277764085e-05, + "loss": 0.3762, + "step": 7485 + }, + { + "epoch": 1.5713387806411063, + "grad_norm": 1.1473865509033203, + "learning_rate": 2.4290415207137995e-05, + "loss": 0.4135, + "step": 7500 + }, + { + "epoch": 1.5744814582023885, + "grad_norm": 1.0101532936096191, + "learning_rate": 2.4205631305047222e-05, + "loss": 0.3653, + "step": 7515 + }, + { + "epoch": 1.5776241357636707, + "grad_norm": 1.428271770477295, + "learning_rate": 2.4120856547499564e-05, + "loss": 0.386, + "step": 7530 + }, + { + "epoch": 1.5807668133249528, + "grad_norm": 1.0353528261184692, + "learning_rate": 2.4036091910397555e-05, + "loss": 0.3912, + "step": 7545 + }, + { + "epoch": 1.583909490886235, + "grad_norm": 1.2192641496658325, + "learning_rate": 2.3951338369527233e-05, + "loss": 0.3303, + "step": 7560 + }, + { + "epoch": 1.5870521684475172, + "grad_norm": 1.2922149896621704, + "learning_rate": 2.3866596900546902e-05, + "loss": 0.3768, + "step": 7575 + }, + { + "epoch": 1.5901948460087993, + "grad_norm": 1.3581557273864746, + "learning_rate": 2.3781868478975884e-05, + "loss": 0.393, + "step": 7590 + }, + { + "epoch": 1.5933375235700817, + "grad_norm": 1.2488782405853271, + "learning_rate": 2.3697154080183308e-05, + "loss": 0.3889, + "step": 7605 + }, + { + "epoch": 1.596480201131364, + "grad_norm": 1.0586172342300415, + "learning_rate": 2.3612454679376886e-05, + "loss": 0.3639, + "step": 7620 + }, + { + "epoch": 1.5996228786926463, + "grad_norm": 1.226731300354004, + "learning_rate": 2.3527771251591675e-05, + "loss": 0.3783, + "step": 7635 + }, + { + "epoch": 1.6027655562539285, + "grad_norm": 1.4184266328811646, + "learning_rate": 2.344310477167883e-05, + "loss": 0.4132, + "step": 7650 + }, + { + "epoch": 1.6059082338152106, + "grad_norm": 1.2709243297576904, + "learning_rate": 2.3358456214294456e-05, + "loss": 0.3314, + "step": 7665 + }, + { + "epoch": 1.6090509113764928, + "grad_norm": 1.1103581190109253, + "learning_rate": 2.3273826553888294e-05, + "loss": 0.3735, + "step": 7680 + }, + { + "epoch": 1.612193588937775, + "grad_norm": 1.1599838733673096, + "learning_rate": 2.3189216764692578e-05, + "loss": 0.3968, + "step": 7695 + }, + { + "epoch": 1.6153362664990571, + "grad_norm": 1.1679604053497314, + "learning_rate": 2.3104627820710754e-05, + "loss": 0.3501, + "step": 7710 + }, + { + "epoch": 1.6184789440603393, + "grad_norm": 1.0258073806762695, + "learning_rate": 2.302006069570635e-05, + "loss": 0.3992, + "step": 7725 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 1.1728984117507935, + "learning_rate": 2.2935516363191693e-05, + "loss": 0.3625, + "step": 7740 + }, + { + "epoch": 1.6247642991829039, + "grad_norm": 1.3930670022964478, + "learning_rate": 2.2850995796416726e-05, + "loss": 0.3898, + "step": 7755 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.9263485074043274, + "learning_rate": 2.2766499968357834e-05, + "loss": 0.3145, + "step": 7770 + }, + { + "epoch": 1.6310496543054682, + "grad_norm": 1.388420581817627, + "learning_rate": 2.2682029851706584e-05, + "loss": 0.3849, + "step": 7785 + }, + { + "epoch": 1.6341923318667506, + "grad_norm": 1.2891064882278442, + "learning_rate": 2.2597586418858586e-05, + "loss": 0.3998, + "step": 7800 + }, + { + "epoch": 1.6373350094280328, + "grad_norm": 1.1814244985580444, + "learning_rate": 2.251317064190224e-05, + "loss": 0.3652, + "step": 7815 + }, + { + "epoch": 1.640477686989315, + "grad_norm": 1.1944345235824585, + "learning_rate": 2.2428783492607638e-05, + "loss": 0.3612, + "step": 7830 + }, + { + "epoch": 1.6436203645505971, + "grad_norm": 0.9002747535705566, + "learning_rate": 2.2344425942415258e-05, + "loss": 0.3131, + "step": 7845 + }, + { + "epoch": 1.6467630421118793, + "grad_norm": 1.203361988067627, + "learning_rate": 2.2260098962424874e-05, + "loss": 0.3476, + "step": 7860 + }, + { + "epoch": 1.6499057196731615, + "grad_norm": 1.0701284408569336, + "learning_rate": 2.2175803523384352e-05, + "loss": 0.3972, + "step": 7875 + }, + { + "epoch": 1.6530483972344436, + "grad_norm": 1.255242943763733, + "learning_rate": 2.209154059567843e-05, + "loss": 0.4292, + "step": 7890 + }, + { + "epoch": 1.6561910747957258, + "grad_norm": 1.1037348508834839, + "learning_rate": 2.200731114931763e-05, + "loss": 0.3782, + "step": 7905 + }, + { + "epoch": 1.6593337523570082, + "grad_norm": 1.404234528541565, + "learning_rate": 2.1923116153927e-05, + "loss": 0.3984, + "step": 7920 + }, + { + "epoch": 1.6624764299182904, + "grad_norm": 1.2808343172073364, + "learning_rate": 2.183895657873505e-05, + "loss": 0.3551, + "step": 7935 + }, + { + "epoch": 1.6656191074795728, + "grad_norm": 1.4898031949996948, + "learning_rate": 2.1754833392562502e-05, + "loss": 0.3651, + "step": 7950 + }, + { + "epoch": 1.668761785040855, + "grad_norm": 1.1187386512756348, + "learning_rate": 2.167074756381119e-05, + "loss": 0.3626, + "step": 7965 + }, + { + "epoch": 1.671904462602137, + "grad_norm": 0.9661749005317688, + "learning_rate": 2.1586700060452912e-05, + "loss": 0.3337, + "step": 7980 + }, + { + "epoch": 1.6750471401634193, + "grad_norm": 1.339406967163086, + "learning_rate": 2.1502691850018263e-05, + "loss": 0.3907, + "step": 7995 + }, + { + "epoch": 1.6781898177247014, + "grad_norm": 1.0702762603759766, + "learning_rate": 2.141872389958551e-05, + "loss": 0.3788, + "step": 8010 + }, + { + "epoch": 1.6813324952859836, + "grad_norm": 1.4297361373901367, + "learning_rate": 2.133479717576945e-05, + "loss": 0.4034, + "step": 8025 + }, + { + "epoch": 1.6844751728472658, + "grad_norm": 0.8980254530906677, + "learning_rate": 2.1250912644710325e-05, + "loss": 0.3243, + "step": 8040 + }, + { + "epoch": 1.687617850408548, + "grad_norm": 1.4087092876434326, + "learning_rate": 2.1167071272062626e-05, + "loss": 0.4123, + "step": 8055 + }, + { + "epoch": 1.6907605279698303, + "grad_norm": 1.134097933769226, + "learning_rate": 2.108327402298404e-05, + "loss": 0.3734, + "step": 8070 + }, + { + "epoch": 1.6939032055311125, + "grad_norm": 1.1244763135910034, + "learning_rate": 2.099952186212429e-05, + "loss": 0.3626, + "step": 8085 + }, + { + "epoch": 1.6970458830923947, + "grad_norm": 1.1340084075927734, + "learning_rate": 2.091581575361411e-05, + "loss": 0.3261, + "step": 8100 + }, + { + "epoch": 1.700188560653677, + "grad_norm": 1.2386656999588013, + "learning_rate": 2.0832156661054036e-05, + "loss": 0.3485, + "step": 8115 + }, + { + "epoch": 1.7033312382149592, + "grad_norm": 1.6566152572631836, + "learning_rate": 2.074854554750339e-05, + "loss": 0.3902, + "step": 8130 + }, + { + "epoch": 1.7064739157762414, + "grad_norm": 1.209065556526184, + "learning_rate": 2.06649833754692e-05, + "loss": 0.4162, + "step": 8145 + }, + { + "epoch": 1.7096165933375236, + "grad_norm": 1.2372878789901733, + "learning_rate": 2.0581471106895043e-05, + "loss": 0.3521, + "step": 8160 + }, + { + "epoch": 1.7127592708988058, + "grad_norm": 1.2591501474380493, + "learning_rate": 2.0498009703150063e-05, + "loss": 0.3496, + "step": 8175 + }, + { + "epoch": 1.715901948460088, + "grad_norm": 1.1610863208770752, + "learning_rate": 2.0414600125017834e-05, + "loss": 0.407, + "step": 8190 + }, + { + "epoch": 1.71904462602137, + "grad_norm": 1.165305495262146, + "learning_rate": 2.0331243332685367e-05, + "loss": 0.4154, + "step": 8205 + }, + { + "epoch": 1.7221873035826523, + "grad_norm": 0.9598828554153442, + "learning_rate": 2.024794028573197e-05, + "loss": 0.3947, + "step": 8220 + }, + { + "epoch": 1.7253299811439347, + "grad_norm": 1.2426929473876953, + "learning_rate": 2.0164691943118283e-05, + "loss": 0.3481, + "step": 8235 + }, + { + "epoch": 1.7284726587052168, + "grad_norm": 0.9565463066101074, + "learning_rate": 2.00814992631752e-05, + "loss": 0.3251, + "step": 8250 + }, + { + "epoch": 1.7316153362664992, + "grad_norm": 1.1574795246124268, + "learning_rate": 1.9998363203592836e-05, + "loss": 0.374, + "step": 8265 + }, + { + "epoch": 1.7347580138277814, + "grad_norm": 1.3719727993011475, + "learning_rate": 1.9915284721409506e-05, + "loss": 0.4395, + "step": 8280 + }, + { + "epoch": 1.7379006913890636, + "grad_norm": 1.21462082862854, + "learning_rate": 1.983226477300071e-05, + "loss": 0.3879, + "step": 8295 + }, + { + "epoch": 1.7410433689503457, + "grad_norm": 1.2950128316879272, + "learning_rate": 1.974930431406815e-05, + "loss": 0.3903, + "step": 8310 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 0.568601131439209, + "learning_rate": 1.966640429962867e-05, + "loss": 0.3608, + "step": 8325 + }, + { + "epoch": 1.74732872407291, + "grad_norm": 1.234540343284607, + "learning_rate": 1.9583565684003294e-05, + "loss": 0.3574, + "step": 8340 + }, + { + "epoch": 1.7504714016341922, + "grad_norm": 1.170241355895996, + "learning_rate": 1.9500789420806274e-05, + "loss": 0.3476, + "step": 8355 + }, + { + "epoch": 1.7536140791954744, + "grad_norm": 1.1727917194366455, + "learning_rate": 1.9418076462934057e-05, + "loss": 0.3825, + "step": 8370 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 1.1901155710220337, + "learning_rate": 1.933542776255432e-05, + "loss": 0.3182, + "step": 8385 + }, + { + "epoch": 1.759899434318039, + "grad_norm": 1.3078737258911133, + "learning_rate": 1.9252844271095056e-05, + "loss": 0.3766, + "step": 8400 + }, + { + "epoch": 1.7630421118793211, + "grad_norm": 1.255685567855835, + "learning_rate": 1.917032693923359e-05, + "loss": 0.4278, + "step": 8415 + }, + { + "epoch": 1.7661847894406035, + "grad_norm": 1.2631891965866089, + "learning_rate": 1.908787671688561e-05, + "loss": 0.3988, + "step": 8430 + }, + { + "epoch": 1.7693274670018857, + "grad_norm": 1.0149579048156738, + "learning_rate": 1.9005494553194277e-05, + "loss": 0.3164, + "step": 8445 + }, + { + "epoch": 1.7724701445631679, + "grad_norm": 1.2755389213562012, + "learning_rate": 1.892318139651929e-05, + "loss": 0.3699, + "step": 8460 + }, + { + "epoch": 1.77561282212445, + "grad_norm": 1.3909375667572021, + "learning_rate": 1.884093819442595e-05, + "loss": 0.3975, + "step": 8475 + }, + { + "epoch": 1.7787554996857322, + "grad_norm": 1.3214746713638306, + "learning_rate": 1.8758765893674242e-05, + "loss": 0.385, + "step": 8490 + }, + { + "epoch": 1.7818981772470144, + "grad_norm": 1.1242390871047974, + "learning_rate": 1.867666544020798e-05, + "loss": 0.3882, + "step": 8505 + }, + { + "epoch": 1.7850408548082966, + "grad_norm": 1.41203773021698, + "learning_rate": 1.8594637779143895e-05, + "loss": 0.4134, + "step": 8520 + }, + { + "epoch": 1.7881835323695787, + "grad_norm": 1.1696633100509644, + "learning_rate": 1.851268385476074e-05, + "loss": 0.3835, + "step": 8535 + }, + { + "epoch": 1.7913262099308611, + "grad_norm": 1.27289879322052, + "learning_rate": 1.8430804610488423e-05, + "loss": 0.3411, + "step": 8550 + }, + { + "epoch": 1.7944688874921433, + "grad_norm": 1.1815760135650635, + "learning_rate": 1.8349000988897183e-05, + "loss": 0.3953, + "step": 8565 + }, + { + "epoch": 1.7976115650534257, + "grad_norm": 0.9872913956642151, + "learning_rate": 1.8267273931686697e-05, + "loss": 0.3807, + "step": 8580 + }, + { + "epoch": 1.8001257071024512, + "eval_accuracy": 0.8832372290913474, + "eval_loss": 0.4148283004760742, + "eval_runtime": 1191.4012, + "eval_samples_per_second": 4.016, + "eval_steps_per_second": 1.005, + "step": 8592 + }, + { + "epoch": 1.8007542426147078, + "grad_norm": 1.2675862312316895, + "learning_rate": 1.818562437967525e-05, + "loss": 0.4136, + "step": 8595 + }, + { + "epoch": 1.80389692017599, + "grad_norm": 1.2914496660232544, + "learning_rate": 1.8104053272788912e-05, + "loss": 0.3426, + "step": 8610 + }, + { + "epoch": 1.8070395977372722, + "grad_norm": 0.8845340609550476, + "learning_rate": 1.802256155005073e-05, + "loss": 0.3796, + "step": 8625 + }, + { + "epoch": 1.8101822752985544, + "grad_norm": 1.2812376022338867, + "learning_rate": 1.79411501495699e-05, + "loss": 0.3813, + "step": 8640 + }, + { + "epoch": 1.8133249528598365, + "grad_norm": 1.479176640510559, + "learning_rate": 1.7859820008530943e-05, + "loss": 0.347, + "step": 8655 + }, + { + "epoch": 1.8164676304211187, + "grad_norm": 1.5261151790618896, + "learning_rate": 1.7778572063182976e-05, + "loss": 0.3942, + "step": 8670 + }, + { + "epoch": 1.8196103079824009, + "grad_norm": 1.0050832033157349, + "learning_rate": 1.76974072488289e-05, + "loss": 0.3831, + "step": 8685 + }, + { + "epoch": 1.8227529855436833, + "grad_norm": 0.8978458046913147, + "learning_rate": 1.761632649981462e-05, + "loss": 0.4253, + "step": 8700 + }, + { + "epoch": 1.8258956631049654, + "grad_norm": 1.3533804416656494, + "learning_rate": 1.753533074951831e-05, + "loss": 0.4012, + "step": 8715 + }, + { + "epoch": 1.8290383406662476, + "grad_norm": 1.2724169492721558, + "learning_rate": 1.7454420930339676e-05, + "loss": 0.4422, + "step": 8730 + }, + { + "epoch": 1.83218101822753, + "grad_norm": 1.2476907968521118, + "learning_rate": 1.737359797368921e-05, + "loss": 0.3421, + "step": 8745 + }, + { + "epoch": 1.8353236957888122, + "grad_norm": 1.1641727685928345, + "learning_rate": 1.7292862809977432e-05, + "loss": 0.3912, + "step": 8760 + }, + { + "epoch": 1.8384663733500943, + "grad_norm": 1.0571367740631104, + "learning_rate": 1.7212216368604264e-05, + "loss": 0.3262, + "step": 8775 + }, + { + "epoch": 1.8416090509113765, + "grad_norm": 1.1409281492233276, + "learning_rate": 1.7131659577948254e-05, + "loss": 0.4101, + "step": 8790 + }, + { + "epoch": 1.8447517284726587, + "grad_norm": 1.1299269199371338, + "learning_rate": 1.7051193365355926e-05, + "loss": 0.4095, + "step": 8805 + }, + { + "epoch": 1.8478944060339408, + "grad_norm": 1.0926958322525024, + "learning_rate": 1.697081865713108e-05, + "loss": 0.3668, + "step": 8820 + }, + { + "epoch": 1.851037083595223, + "grad_norm": 1.262511968612671, + "learning_rate": 1.689053637852417e-05, + "loss": 0.3699, + "step": 8835 + }, + { + "epoch": 1.8541797611565052, + "grad_norm": 0.9396837949752808, + "learning_rate": 1.681034745372161e-05, + "loss": 0.3793, + "step": 8850 + }, + { + "epoch": 1.8573224387177876, + "grad_norm": 1.3683308362960815, + "learning_rate": 1.6730252805835145e-05, + "loss": 0.3633, + "step": 8865 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 1.2032579183578491, + "learning_rate": 1.6650253356891247e-05, + "loss": 0.3644, + "step": 8880 + }, + { + "epoch": 1.8636077938403521, + "grad_norm": 1.1967633962631226, + "learning_rate": 1.6570350027820485e-05, + "loss": 0.3737, + "step": 8895 + }, + { + "epoch": 1.8667504714016343, + "grad_norm": 1.4144322872161865, + "learning_rate": 1.6490543738446927e-05, + "loss": 0.3816, + "step": 8910 + }, + { + "epoch": 1.8698931489629165, + "grad_norm": 1.4581791162490845, + "learning_rate": 1.6410835407477513e-05, + "loss": 0.3189, + "step": 8925 + }, + { + "epoch": 1.8730358265241986, + "grad_norm": 1.2554900646209717, + "learning_rate": 1.6331225952491557e-05, + "loss": 0.3555, + "step": 8940 + }, + { + "epoch": 1.8761785040854808, + "grad_norm": 1.4458445310592651, + "learning_rate": 1.6251716289930134e-05, + "loss": 0.4001, + "step": 8955 + }, + { + "epoch": 1.879321181646763, + "grad_norm": 1.4509528875350952, + "learning_rate": 1.6172307335085512e-05, + "loss": 0.4032, + "step": 8970 + }, + { + "epoch": 1.8824638592080452, + "grad_norm": 1.3516335487365723, + "learning_rate": 1.6093000002090657e-05, + "loss": 0.4087, + "step": 8985 + }, + { + "epoch": 1.8856065367693273, + "grad_norm": 1.1090672016143799, + "learning_rate": 1.6013795203908703e-05, + "loss": 0.3573, + "step": 9000 + }, + { + "epoch": 1.8887492143306097, + "grad_norm": 1.2857966423034668, + "learning_rate": 1.593469385232243e-05, + "loss": 0.4204, + "step": 9015 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 1.1753884553909302, + "learning_rate": 1.5855696857923738e-05, + "loss": 0.4041, + "step": 9030 + }, + { + "epoch": 1.895034569453174, + "grad_norm": 1.3764643669128418, + "learning_rate": 1.577680513010325e-05, + "loss": 0.3901, + "step": 9045 + }, + { + "epoch": 1.8981772470144564, + "grad_norm": 1.2634403705596924, + "learning_rate": 1.569801957703975e-05, + "loss": 0.3669, + "step": 9060 + }, + { + "epoch": 1.9013199245757386, + "grad_norm": 1.501197338104248, + "learning_rate": 1.5619341105689793e-05, + "loss": 0.3875, + "step": 9075 + }, + { + "epoch": 1.9044626021370208, + "grad_norm": 1.1498409509658813, + "learning_rate": 1.5540770621777213e-05, + "loss": 0.3769, + "step": 9090 + }, + { + "epoch": 1.907605279698303, + "grad_norm": 1.2901723384857178, + "learning_rate": 1.5462309029782756e-05, + "loss": 0.4069, + "step": 9105 + }, + { + "epoch": 1.9107479572595851, + "grad_norm": 1.2987323999404907, + "learning_rate": 1.5383957232933623e-05, + "loss": 0.3264, + "step": 9120 + }, + { + "epoch": 1.9138906348208673, + "grad_norm": 1.0844594240188599, + "learning_rate": 1.5305716133193056e-05, + "loss": 0.352, + "step": 9135 + }, + { + "epoch": 1.9170333123821495, + "grad_norm": 1.4493502378463745, + "learning_rate": 1.5227586631250047e-05, + "loss": 0.4362, + "step": 9150 + }, + { + "epoch": 1.9201759899434316, + "grad_norm": 1.2252168655395508, + "learning_rate": 1.5149569626508848e-05, + "loss": 0.3463, + "step": 9165 + }, + { + "epoch": 1.923318667504714, + "grad_norm": 1.2073407173156738, + "learning_rate": 1.5071666017078705e-05, + "loss": 0.3452, + "step": 9180 + }, + { + "epoch": 1.9264613450659962, + "grad_norm": 0.9203445315361023, + "learning_rate": 1.4993876699763467e-05, + "loss": 0.3588, + "step": 9195 + }, + { + "epoch": 1.9296040226272786, + "grad_norm": 1.270068645477295, + "learning_rate": 1.4916202570051319e-05, + "loss": 0.3777, + "step": 9210 + }, + { + "epoch": 1.9327467001885608, + "grad_norm": 1.1798357963562012, + "learning_rate": 1.4838644522104416e-05, + "loss": 0.3975, + "step": 9225 + }, + { + "epoch": 1.935889377749843, + "grad_norm": 1.4530518054962158, + "learning_rate": 1.476120344874861e-05, + "loss": 0.4299, + "step": 9240 + }, + { + "epoch": 1.939032055311125, + "grad_norm": 1.449532151222229, + "learning_rate": 1.4683880241463197e-05, + "loss": 0.4051, + "step": 9255 + }, + { + "epoch": 1.9421747328724073, + "grad_norm": 1.4117298126220703, + "learning_rate": 1.460667579037061e-05, + "loss": 0.3639, + "step": 9270 + }, + { + "epoch": 1.9453174104336894, + "grad_norm": 1.2169469594955444, + "learning_rate": 1.452959098422621e-05, + "loss": 0.357, + "step": 9285 + }, + { + "epoch": 1.9484600879949716, + "grad_norm": 1.243122935295105, + "learning_rate": 1.4452626710408017e-05, + "loss": 0.3618, + "step": 9300 + }, + { + "epoch": 1.9516027655562538, + "grad_norm": 1.175661563873291, + "learning_rate": 1.4375783854906555e-05, + "loss": 0.3524, + "step": 9315 + }, + { + "epoch": 1.9547454431175362, + "grad_norm": 1.468005895614624, + "learning_rate": 1.4299063302314597e-05, + "loss": 0.3667, + "step": 9330 + }, + { + "epoch": 1.9578881206788183, + "grad_norm": 1.145400047302246, + "learning_rate": 1.4222465935816975e-05, + "loss": 0.4047, + "step": 9345 + }, + { + "epoch": 1.9610307982401005, + "grad_norm": 1.3986377716064453, + "learning_rate": 1.4145992637180492e-05, + "loss": 0.3254, + "step": 9360 + }, + { + "epoch": 1.964173475801383, + "grad_norm": 1.3191365003585815, + "learning_rate": 1.4069644286743669e-05, + "loss": 0.3564, + "step": 9375 + }, + { + "epoch": 1.967316153362665, + "grad_norm": 1.48728346824646, + "learning_rate": 1.3993421763406672e-05, + "loss": 0.3196, + "step": 9390 + }, + { + "epoch": 1.9704588309239472, + "grad_norm": 1.3215950727462769, + "learning_rate": 1.3917325944621195e-05, + "loss": 0.3826, + "step": 9405 + }, + { + "epoch": 1.9736015084852294, + "grad_norm": 1.3539785146713257, + "learning_rate": 1.3841357706380348e-05, + "loss": 0.392, + "step": 9420 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 1.0365345478057861, + "learning_rate": 1.3765517923208554e-05, + "loss": 0.3862, + "step": 9435 + }, + { + "epoch": 1.9798868636077938, + "grad_norm": 1.2735167741775513, + "learning_rate": 1.3689807468151491e-05, + "loss": 0.372, + "step": 9450 + }, + { + "epoch": 1.983029541169076, + "grad_norm": 1.4106998443603516, + "learning_rate": 1.3614227212766079e-05, + "loss": 0.3768, + "step": 9465 + }, + { + "epoch": 1.9861722187303583, + "grad_norm": 1.568157434463501, + "learning_rate": 1.3538778027110402e-05, + "loss": 0.3453, + "step": 9480 + }, + { + "epoch": 1.9893148962916405, + "grad_norm": 1.4247443675994873, + "learning_rate": 1.3463460779733706e-05, + "loss": 0.407, + "step": 9495 + }, + { + "epoch": 1.9924575738529227, + "grad_norm": 1.2098503112792969, + "learning_rate": 1.3388276337666384e-05, + "loss": 0.3444, + "step": 9510 + }, + { + "epoch": 1.995600251414205, + "grad_norm": 1.054401159286499, + "learning_rate": 1.3313225566410042e-05, + "loss": 0.3342, + "step": 9525 + }, + { + "epoch": 1.9987429289754872, + "grad_norm": 1.186824917793274, + "learning_rate": 1.3238309329927511e-05, + "loss": 0.3322, + "step": 9540 + }, + { + "epoch": 2.0018856065367694, + "grad_norm": 1.0764572620391846, + "learning_rate": 1.3163528490632854e-05, + "loss": 0.3444, + "step": 9555 + }, + { + "epoch": 2.0050282840980516, + "grad_norm": 1.051069974899292, + "learning_rate": 1.3088883909381531e-05, + "loss": 0.2928, + "step": 9570 + }, + { + "epoch": 2.0081709616593337, + "grad_norm": 1.2765467166900635, + "learning_rate": 1.3014376445460391e-05, + "loss": 0.303, + "step": 9585 + }, + { + "epoch": 2.011313639220616, + "grad_norm": 0.9927627444267273, + "learning_rate": 1.2940006956577871e-05, + "loss": 0.2736, + "step": 9600 + }, + { + "epoch": 2.014456316781898, + "grad_norm": 1.6037464141845703, + "learning_rate": 1.2865776298854043e-05, + "loss": 0.2862, + "step": 9615 + }, + { + "epoch": 2.0175989943431802, + "grad_norm": 1.486846923828125, + "learning_rate": 1.2791685326810826e-05, + "loss": 0.3303, + "step": 9630 + }, + { + "epoch": 2.0207416719044624, + "grad_norm": 1.5033382177352905, + "learning_rate": 1.2717734893362102e-05, + "loss": 0.273, + "step": 9645 + }, + { + "epoch": 2.023884349465745, + "grad_norm": 1.7398715019226074, + "learning_rate": 1.2643925849803895e-05, + "loss": 0.3412, + "step": 9660 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 1.2956515550613403, + "learning_rate": 1.2570259045804628e-05, + "loss": 0.371, + "step": 9675 + }, + { + "epoch": 2.0301697045883094, + "grad_norm": 1.6283161640167236, + "learning_rate": 1.2496735329395286e-05, + "loss": 0.3437, + "step": 9690 + }, + { + "epoch": 2.0333123821495915, + "grad_norm": 1.208808183670044, + "learning_rate": 1.2423355546959664e-05, + "loss": 0.3402, + "step": 9705 + }, + { + "epoch": 2.0364550597108737, + "grad_norm": 1.0130226612091064, + "learning_rate": 1.2350120543224625e-05, + "loss": 0.3091, + "step": 9720 + }, + { + "epoch": 2.039597737272156, + "grad_norm": 1.4891202449798584, + "learning_rate": 1.2277031161250398e-05, + "loss": 0.3595, + "step": 9735 + }, + { + "epoch": 2.042740414833438, + "grad_norm": 1.399242877960205, + "learning_rate": 1.2204088242420866e-05, + "loss": 0.2866, + "step": 9750 + }, + { + "epoch": 2.04588309239472, + "grad_norm": 1.6362804174423218, + "learning_rate": 1.2131292626433843e-05, + "loss": 0.3116, + "step": 9765 + }, + { + "epoch": 2.0490257699560024, + "grad_norm": 1.3457330465316772, + "learning_rate": 1.2058645151291436e-05, + "loss": 0.3473, + "step": 9780 + }, + { + "epoch": 2.0521684475172846, + "grad_norm": 1.0016905069351196, + "learning_rate": 1.198614665329042e-05, + "loss": 0.3299, + "step": 9795 + }, + { + "epoch": 2.0553111250785667, + "grad_norm": 1.6363437175750732, + "learning_rate": 1.1913797967012585e-05, + "loss": 0.2997, + "step": 9810 + }, + { + "epoch": 2.0584538026398493, + "grad_norm": 1.3227770328521729, + "learning_rate": 1.1841599925315106e-05, + "loss": 0.312, + "step": 9825 + }, + { + "epoch": 2.0615964802011315, + "grad_norm": 1.6865644454956055, + "learning_rate": 1.1769553359321017e-05, + "loss": 0.2977, + "step": 9840 + }, + { + "epoch": 2.0647391577624137, + "grad_norm": 1.7184381484985352, + "learning_rate": 1.169765909840957e-05, + "loss": 0.2997, + "step": 9855 + }, + { + "epoch": 2.067881835323696, + "grad_norm": 1.0318830013275146, + "learning_rate": 1.1625917970206759e-05, + "loss": 0.3017, + "step": 9870 + }, + { + "epoch": 2.071024512884978, + "grad_norm": 1.549784779548645, + "learning_rate": 1.155433080057573e-05, + "loss": 0.3203, + "step": 9885 + }, + { + "epoch": 2.07416719044626, + "grad_norm": 1.5676542520523071, + "learning_rate": 1.1482898413607333e-05, + "loss": 0.3512, + "step": 9900 + }, + { + "epoch": 2.0773098680075424, + "grad_norm": 1.68881356716156, + "learning_rate": 1.1411621631610575e-05, + "loss": 0.3201, + "step": 9915 + }, + { + "epoch": 2.0804525455688245, + "grad_norm": 1.3327656984329224, + "learning_rate": 1.1340501275103178e-05, + "loss": 0.3129, + "step": 9930 + }, + { + "epoch": 2.0835952231301067, + "grad_norm": 1.5713459253311157, + "learning_rate": 1.1269538162802196e-05, + "loss": 0.3212, + "step": 9945 + }, + { + "epoch": 2.086737900691389, + "grad_norm": 1.3707289695739746, + "learning_rate": 1.1198733111614474e-05, + "loss": 0.2978, + "step": 9960 + }, + { + "epoch": 2.0898805782526715, + "grad_norm": 1.3866550922393799, + "learning_rate": 1.1128086936627321e-05, + "loss": 0.353, + "step": 9975 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 1.3355560302734375, + "learning_rate": 1.1057600451099104e-05, + "loss": 0.2947, + "step": 9990 + }, + { + "epoch": 2.096165933375236, + "grad_norm": 1.3299508094787598, + "learning_rate": 1.0987274466449907e-05, + "loss": 0.2719, + "step": 10005 + }, + { + "epoch": 2.099308610936518, + "grad_norm": 1.4944045543670654, + "learning_rate": 1.0917109792252173e-05, + "loss": 0.3074, + "step": 10020 + }, + { + "epoch": 2.1024512884978, + "grad_norm": 1.238981008529663, + "learning_rate": 1.084710723622136e-05, + "loss": 0.3253, + "step": 10035 + }, + { + "epoch": 2.1055939660590823, + "grad_norm": 1.7395031452178955, + "learning_rate": 1.0777267604206703e-05, + "loss": 0.3404, + "step": 10050 + }, + { + "epoch": 2.1087366436203645, + "grad_norm": 1.597024917602539, + "learning_rate": 1.0707591700181874e-05, + "loss": 0.3362, + "step": 10065 + }, + { + "epoch": 2.1118793211816467, + "grad_norm": 1.5733188390731812, + "learning_rate": 1.0638080326235777e-05, + "loss": 0.3694, + "step": 10080 + }, + { + "epoch": 2.115021998742929, + "grad_norm": 1.2697248458862305, + "learning_rate": 1.0568734282563272e-05, + "loss": 0.3231, + "step": 10095 + }, + { + "epoch": 2.118164676304211, + "grad_norm": 1.410846471786499, + "learning_rate": 1.049955436745601e-05, + "loss": 0.3175, + "step": 10110 + }, + { + "epoch": 2.121307353865493, + "grad_norm": 1.4120702743530273, + "learning_rate": 1.0430541377293191e-05, + "loss": 0.3534, + "step": 10125 + }, + { + "epoch": 2.124450031426776, + "grad_norm": 1.8276065587997437, + "learning_rate": 1.0361696106532442e-05, + "loss": 0.3332, + "step": 10140 + }, + { + "epoch": 2.127592708988058, + "grad_norm": 1.6806981563568115, + "learning_rate": 1.0293019347700658e-05, + "loss": 0.2967, + "step": 10155 + }, + { + "epoch": 2.13073538654934, + "grad_norm": 2.0087246894836426, + "learning_rate": 1.0224511891384853e-05, + "loss": 0.3439, + "step": 10170 + }, + { + "epoch": 2.1338780641106223, + "grad_norm": 1.5151036977767944, + "learning_rate": 1.015617452622309e-05, + "loss": 0.3344, + "step": 10185 + }, + { + "epoch": 2.1370207416719045, + "grad_norm": 1.1880221366882324, + "learning_rate": 1.008800803889537e-05, + "loss": 0.2934, + "step": 10200 + }, + { + "epoch": 2.1401634192331866, + "grad_norm": 1.1785838603973389, + "learning_rate": 1.0020013214114657e-05, + "loss": 0.3163, + "step": 10215 + }, + { + "epoch": 2.143306096794469, + "grad_norm": 1.2505255937576294, + "learning_rate": 9.952190834617728e-06, + "loss": 0.3166, + "step": 10230 + }, + { + "epoch": 2.146448774355751, + "grad_norm": 2.049252510070801, + "learning_rate": 9.884541681156226e-06, + "loss": 0.3077, + "step": 10245 + }, + { + "epoch": 2.149591451917033, + "grad_norm": 1.616794466972351, + "learning_rate": 9.817066532487701e-06, + "loss": 0.3077, + "step": 10260 + }, + { + "epoch": 2.1527341294783153, + "grad_norm": 1.339815378189087, + "learning_rate": 9.749766165366567e-06, + "loss": 0.3528, + "step": 10275 + }, + { + "epoch": 2.155876807039598, + "grad_norm": 1.4637688398361206, + "learning_rate": 9.682641354535244e-06, + "loss": 0.3619, + "step": 10290 + }, + { + "epoch": 2.15901948460088, + "grad_norm": 1.2227802276611328, + "learning_rate": 9.615692872715154e-06, + "loss": 0.3413, + "step": 10305 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 1.7328176498413086, + "learning_rate": 9.548921490597917e-06, + "loss": 0.3127, + "step": 10320 + }, + { + "epoch": 2.1653048397234445, + "grad_norm": 1.122909665107727, + "learning_rate": 9.482327976836392e-06, + "loss": 0.2989, + "step": 10335 + }, + { + "epoch": 2.1684475172847266, + "grad_norm": 1.163944959640503, + "learning_rate": 9.415913098035895e-06, + "loss": 0.3264, + "step": 10350 + }, + { + "epoch": 2.171590194846009, + "grad_norm": 1.4139958620071411, + "learning_rate": 9.349677618745347e-06, + "loss": 0.2845, + "step": 10365 + }, + { + "epoch": 2.174732872407291, + "grad_norm": 1.749042272567749, + "learning_rate": 9.28362230144846e-06, + "loss": 0.3336, + "step": 10380 + }, + { + "epoch": 2.177875549968573, + "grad_norm": 1.489220142364502, + "learning_rate": 9.217747906554969e-06, + "loss": 0.299, + "step": 10395 + }, + { + "epoch": 2.1810182275298553, + "grad_norm": 1.2497318983078003, + "learning_rate": 9.152055192391903e-06, + "loss": 0.2956, + "step": 10410 + }, + { + "epoch": 2.1841609050911375, + "grad_norm": 1.4486489295959473, + "learning_rate": 9.086544915194831e-06, + "loss": 0.3065, + "step": 10425 + }, + { + "epoch": 2.1873035826524196, + "grad_norm": 1.4671967029571533, + "learning_rate": 9.021217829099143e-06, + "loss": 0.3275, + "step": 10440 + }, + { + "epoch": 2.1904462602137023, + "grad_norm": 1.387172818183899, + "learning_rate": 8.956074686131396e-06, + "loss": 0.2766, + "step": 10455 + }, + { + "epoch": 2.1935889377749844, + "grad_norm": 1.0154411792755127, + "learning_rate": 8.89111623620065e-06, + "loss": 0.3188, + "step": 10470 + }, + { + "epoch": 2.1967316153362666, + "grad_norm": 1.452532172203064, + "learning_rate": 8.826343227089843e-06, + "loss": 0.3148, + "step": 10485 + }, + { + "epoch": 2.1998742928975488, + "grad_norm": 1.309695839881897, + "learning_rate": 8.761756404447144e-06, + "loss": 0.2735, + "step": 10500 + }, + { + "epoch": 2.203016970458831, + "grad_norm": 1.652197003364563, + "learning_rate": 8.69735651177741e-06, + "loss": 0.3238, + "step": 10515 + }, + { + "epoch": 2.206159648020113, + "grad_norm": 1.330776572227478, + "learning_rate": 8.633144290433629e-06, + "loss": 0.3433, + "step": 10530 + }, + { + "epoch": 2.2093023255813953, + "grad_norm": 1.5660831928253174, + "learning_rate": 8.56912047960834e-06, + "loss": 0.3275, + "step": 10545 + }, + { + "epoch": 2.2124450031426774, + "grad_norm": 1.1177830696105957, + "learning_rate": 8.50528581632519e-06, + "loss": 0.3697, + "step": 10560 + }, + { + "epoch": 2.2155876807039596, + "grad_norm": 1.4742639064788818, + "learning_rate": 8.441641035430381e-06, + "loss": 0.3099, + "step": 10575 + }, + { + "epoch": 2.218730358265242, + "grad_norm": 1.505416750907898, + "learning_rate": 8.378186869584275e-06, + "loss": 0.33, + "step": 10590 + }, + { + "epoch": 2.2218730358265244, + "grad_norm": 1.5553947687149048, + "learning_rate": 8.314924049252895e-06, + "loss": 0.3302, + "step": 10605 + }, + { + "epoch": 2.2250157133878066, + "grad_norm": 1.5330064296722412, + "learning_rate": 8.251853302699578e-06, + "loss": 0.3387, + "step": 10620 + }, + { + "epoch": 2.2281583909490887, + "grad_norm": 1.2511600255966187, + "learning_rate": 8.188975355976557e-06, + "loss": 0.2764, + "step": 10635 + }, + { + "epoch": 2.231301068510371, + "grad_norm": 1.3672597408294678, + "learning_rate": 8.126290932916599e-06, + "loss": 0.3554, + "step": 10650 + }, + { + "epoch": 2.234443746071653, + "grad_norm": 1.28493332862854, + "learning_rate": 8.06380075512468e-06, + "loss": 0.3377, + "step": 10665 + }, + { + "epoch": 2.2375864236329353, + "grad_norm": 1.5767827033996582, + "learning_rate": 8.001505541969698e-06, + "loss": 0.328, + "step": 10680 + }, + { + "epoch": 2.2407291011942174, + "grad_norm": 1.3858174085617065, + "learning_rate": 7.939406010576167e-06, + "loss": 0.2975, + "step": 10695 + }, + { + "epoch": 2.2438717787554996, + "grad_norm": 1.6385616064071655, + "learning_rate": 7.877502875815961e-06, + "loss": 0.3297, + "step": 10710 + }, + { + "epoch": 2.2470144563167818, + "grad_norm": 1.4886940717697144, + "learning_rate": 7.815796850300095e-06, + "loss": 0.3159, + "step": 10725 + }, + { + "epoch": 2.250157133878064, + "grad_norm": 1.1138700246810913, + "learning_rate": 7.754288644370528e-06, + "loss": 0.336, + "step": 10740 + }, + { + "epoch": 2.253299811439346, + "grad_norm": 1.5991181135177612, + "learning_rate": 7.692978966091977e-06, + "loss": 0.3252, + "step": 10755 + }, + { + "epoch": 2.2564424890006287, + "grad_norm": 1.1452405452728271, + "learning_rate": 7.631868521243757e-06, + "loss": 0.316, + "step": 10770 + }, + { + "epoch": 2.259585166561911, + "grad_norm": 1.069392204284668, + "learning_rate": 7.57095801331166e-06, + "loss": 0.3167, + "step": 10785 + }, + { + "epoch": 2.262727844123193, + "grad_norm": 1.717702865600586, + "learning_rate": 7.510248143479876e-06, + "loss": 0.3426, + "step": 10800 + }, + { + "epoch": 2.2658705216844752, + "grad_norm": 1.7524367570877075, + "learning_rate": 7.4497396106229134e-06, + "loss": 0.3732, + "step": 10815 + }, + { + "epoch": 2.2690131992457574, + "grad_norm": 1.937584638595581, + "learning_rate": 7.38943311129752e-06, + "loss": 0.3333, + "step": 10830 + }, + { + "epoch": 2.2721558768070396, + "grad_norm": 1.3948473930358887, + "learning_rate": 7.329329339734722e-06, + "loss": 0.3149, + "step": 10845 + }, + { + "epoch": 2.2752985543683217, + "grad_norm": 1.588791012763977, + "learning_rate": 7.269428987831783e-06, + "loss": 0.3433, + "step": 10860 + }, + { + "epoch": 2.278441231929604, + "grad_norm": 1.2459790706634521, + "learning_rate": 7.209732745144254e-06, + "loss": 0.2659, + "step": 10875 + }, + { + "epoch": 2.281583909490886, + "grad_norm": 1.0872770547866821, + "learning_rate": 7.150241298878055e-06, + "loss": 0.2956, + "step": 10890 + }, + { + "epoch": 2.2847265870521687, + "grad_norm": 1.6503065824508667, + "learning_rate": 7.090955333881555e-06, + "loss": 0.3258, + "step": 10905 + }, + { + "epoch": 2.287869264613451, + "grad_norm": 1.3506873846054077, + "learning_rate": 7.0318755326376576e-06, + "loss": 0.2789, + "step": 10920 + }, + { + "epoch": 2.291011942174733, + "grad_norm": 1.3215200901031494, + "learning_rate": 6.973002575255974e-06, + "loss": 0.3325, + "step": 10935 + }, + { + "epoch": 2.294154619736015, + "grad_norm": 1.4247123003005981, + "learning_rate": 6.914337139465004e-06, + "loss": 0.3329, + "step": 10950 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 1.0532140731811523, + "learning_rate": 6.85587990060432e-06, + "loss": 0.2541, + "step": 10965 + }, + { + "epoch": 2.3004399748585795, + "grad_norm": 1.6737048625946045, + "learning_rate": 6.797631531616769e-06, + "loss": 0.3642, + "step": 10980 + }, + { + "epoch": 2.3035826524198617, + "grad_norm": 1.2676361799240112, + "learning_rate": 6.739592703040759e-06, + "loss": 0.2897, + "step": 10995 + }, + { + "epoch": 2.306725329981144, + "grad_norm": 1.5627233982086182, + "learning_rate": 6.681764083002534e-06, + "loss": 0.3278, + "step": 11010 + }, + { + "epoch": 2.309868007542426, + "grad_norm": 1.7141146659851074, + "learning_rate": 6.624146337208484e-06, + "loss": 0.3139, + "step": 11025 + }, + { + "epoch": 2.313010685103708, + "grad_norm": 1.1188994646072388, + "learning_rate": 6.566740128937451e-06, + "loss": 0.295, + "step": 11040 + }, + { + "epoch": 2.3161533626649904, + "grad_norm": 1.5478028059005737, + "learning_rate": 6.509546119033152e-06, + "loss": 0.3149, + "step": 11055 + }, + { + "epoch": 2.3192960402262726, + "grad_norm": 1.1058639287948608, + "learning_rate": 6.4525649658965045e-06, + "loss": 0.274, + "step": 11070 + }, + { + "epoch": 2.322438717787555, + "grad_norm": 1.5267043113708496, + "learning_rate": 6.395797325478106e-06, + "loss": 0.3099, + "step": 11085 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 1.4159321784973145, + "learning_rate": 6.339243851270635e-06, + "loss": 0.3495, + "step": 11100 + }, + { + "epoch": 2.3287240729101195, + "grad_norm": 1.2933320999145508, + "learning_rate": 6.282905194301375e-06, + "loss": 0.2708, + "step": 11115 + }, + { + "epoch": 2.3318667504714017, + "grad_norm": 1.9966567754745483, + "learning_rate": 6.226782003124676e-06, + "loss": 0.2899, + "step": 11130 + }, + { + "epoch": 2.335009428032684, + "grad_norm": 1.3963077068328857, + "learning_rate": 6.170874923814499e-06, + "loss": 0.3259, + "step": 11145 + }, + { + "epoch": 2.338152105593966, + "grad_norm": 1.3655591011047363, + "learning_rate": 6.115184599957033e-06, + "loss": 0.289, + "step": 11160 + }, + { + "epoch": 2.341294783155248, + "grad_norm": 1.4125938415527344, + "learning_rate": 6.059711672643195e-06, + "loss": 0.291, + "step": 11175 + }, + { + "epoch": 2.3444374607165304, + "grad_norm": 2.017850875854492, + "learning_rate": 6.004456780461315e-06, + "loss": 0.3044, + "step": 11190 + }, + { + "epoch": 2.3475801382778125, + "grad_norm": 1.441328525543213, + "learning_rate": 5.949420559489752e-06, + "loss": 0.3245, + "step": 11205 + }, + { + "epoch": 2.350722815839095, + "grad_norm": 1.799134373664856, + "learning_rate": 5.894603643289601e-06, + "loss": 0.3593, + "step": 11220 + }, + { + "epoch": 2.3538654934003773, + "grad_norm": 1.8016554117202759, + "learning_rate": 5.840006662897388e-06, + "loss": 0.2787, + "step": 11235 + }, + { + "epoch": 2.3570081709616595, + "grad_norm": 1.4649808406829834, + "learning_rate": 5.785630246817781e-06, + "loss": 0.3168, + "step": 11250 + }, + { + "epoch": 2.3601508485229417, + "grad_norm": 1.3161333799362183, + "learning_rate": 5.731475021016383e-06, + "loss": 0.3732, + "step": 11265 + }, + { + "epoch": 2.363293526084224, + "grad_norm": 1.663887858390808, + "learning_rate": 5.677541608912526e-06, + "loss": 0.2998, + "step": 11280 + }, + { + "epoch": 2.366436203645506, + "grad_norm": 1.439397931098938, + "learning_rate": 5.623830631372087e-06, + "loss": 0.3206, + "step": 11295 + }, + { + "epoch": 2.369578881206788, + "grad_norm": 1.6403486728668213, + "learning_rate": 5.570342706700324e-06, + "loss": 0.3565, + "step": 11310 + }, + { + "epoch": 2.3727215587680703, + "grad_norm": 1.6395245790481567, + "learning_rate": 5.517078450634799e-06, + "loss": 0.294, + "step": 11325 + }, + { + "epoch": 2.3758642363293525, + "grad_norm": 1.496952772140503, + "learning_rate": 5.464038476338237e-06, + "loss": 0.2963, + "step": 11340 + }, + { + "epoch": 2.3790069138906347, + "grad_norm": 1.9148141145706177, + "learning_rate": 5.411223394391529e-06, + "loss": 0.3353, + "step": 11355 + }, + { + "epoch": 2.382149591451917, + "grad_norm": 1.4077427387237549, + "learning_rate": 5.3586338127866396e-06, + "loss": 0.3174, + "step": 11370 + }, + { + "epoch": 2.385292269013199, + "grad_norm": 1.5252655744552612, + "learning_rate": 5.306270336919661e-06, + "loss": 0.3134, + "step": 11385 + }, + { + "epoch": 2.3884349465744816, + "grad_norm": 1.5777688026428223, + "learning_rate": 5.254133569583808e-06, + "loss": 0.3309, + "step": 11400 + }, + { + "epoch": 2.391577624135764, + "grad_norm": 1.7088990211486816, + "learning_rate": 5.2022241109624805e-06, + "loss": 0.2441, + "step": 11415 + }, + { + "epoch": 2.394720301697046, + "grad_norm": 1.7140231132507324, + "learning_rate": 5.150542558622415e-06, + "loss": 0.3053, + "step": 11430 + }, + { + "epoch": 2.397862979258328, + "grad_norm": 3.6586174964904785, + "learning_rate": 5.099089507506705e-06, + "loss": 0.3079, + "step": 11445 + }, + { + "epoch": 2.4010056568196103, + "grad_norm": 1.752259612083435, + "learning_rate": 5.047865549928024e-06, + "loss": 0.324, + "step": 11460 + }, + { + "epoch": 2.4041483343808925, + "grad_norm": 1.5753651857376099, + "learning_rate": 4.996871275561779e-06, + "loss": 0.3128, + "step": 11475 + }, + { + "epoch": 2.4072910119421747, + "grad_norm": 1.9012105464935303, + "learning_rate": 4.946107271439343e-06, + "loss": 0.3764, + "step": 11490 + }, + { + "epoch": 2.410433689503457, + "grad_norm": 1.4729382991790771, + "learning_rate": 4.895574121941285e-06, + "loss": 0.2755, + "step": 11505 + }, + { + "epoch": 2.413576367064739, + "grad_norm": 1.4175302982330322, + "learning_rate": 4.845272408790621e-06, + "loss": 0.3121, + "step": 11520 + }, + { + "epoch": 2.4167190446260216, + "grad_norm": 1.7722225189208984, + "learning_rate": 4.795202711046168e-06, + "loss": 0.2744, + "step": 11535 + }, + { + "epoch": 2.4198617221873038, + "grad_norm": 1.4909186363220215, + "learning_rate": 4.74536560509582e-06, + "loss": 0.3025, + "step": 11550 + }, + { + "epoch": 2.423004399748586, + "grad_norm": 1.8246691226959229, + "learning_rate": 4.695761664649964e-06, + "loss": 0.3324, + "step": 11565 + }, + { + "epoch": 2.426147077309868, + "grad_norm": 1.7963186502456665, + "learning_rate": 4.646391460734837e-06, + "loss": 0.3575, + "step": 11580 + }, + { + "epoch": 2.4292897548711503, + "grad_norm": 1.5770527124404907, + "learning_rate": 4.5972555616859816e-06, + "loss": 0.2908, + "step": 11595 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 1.617647409439087, + "learning_rate": 4.548354533141677e-06, + "loss": 0.2994, + "step": 11610 + }, + { + "epoch": 2.4355751099937146, + "grad_norm": 1.745650291442871, + "learning_rate": 4.49968893803645e-06, + "loss": 0.3361, + "step": 11625 + }, + { + "epoch": 2.438717787554997, + "grad_norm": 1.0638154745101929, + "learning_rate": 4.451259336594596e-06, + "loss": 0.3368, + "step": 11640 + }, + { + "epoch": 2.441860465116279, + "grad_norm": 1.482951045036316, + "learning_rate": 4.403066286323693e-06, + "loss": 0.3004, + "step": 11655 + }, + { + "epoch": 2.445003142677561, + "grad_norm": 1.4275717735290527, + "learning_rate": 4.355110342008231e-06, + "loss": 0.2826, + "step": 11670 + }, + { + "epoch": 2.4481458202388433, + "grad_norm": 1.4426920413970947, + "learning_rate": 4.307392055703182e-06, + "loss": 0.2944, + "step": 11685 + }, + { + "epoch": 2.4512884978001255, + "grad_norm": 1.5074379444122314, + "learning_rate": 4.259911976727712e-06, + "loss": 0.3222, + "step": 11700 + }, + { + "epoch": 2.454431175361408, + "grad_norm": 1.3746048212051392, + "learning_rate": 4.212670651658768e-06, + "loss": 0.317, + "step": 11715 + }, + { + "epoch": 2.4575738529226903, + "grad_norm": 1.6050078868865967, + "learning_rate": 4.165668624324845e-06, + "loss": 0.3172, + "step": 11730 + }, + { + "epoch": 2.4607165304839724, + "grad_norm": 1.2552024126052856, + "learning_rate": 4.118906435799724e-06, + "loss": 0.2816, + "step": 11745 + }, + { + "epoch": 2.4638592080452546, + "grad_norm": 1.3392716646194458, + "learning_rate": 4.0723846243962084e-06, + "loss": 0.3155, + "step": 11760 + }, + { + "epoch": 2.4670018856065368, + "grad_norm": 1.5874278545379639, + "learning_rate": 4.026103725659977e-06, + "loss": 0.2603, + "step": 11775 + }, + { + "epoch": 2.470144563167819, + "grad_norm": 1.235484004020691, + "learning_rate": 3.980064272363362e-06, + "loss": 0.2499, + "step": 11790 + }, + { + "epoch": 2.473287240729101, + "grad_norm": 1.6743351221084595, + "learning_rate": 3.934266794499275e-06, + "loss": 0.3402, + "step": 11805 + }, + { + "epoch": 2.4764299182903833, + "grad_norm": 1.4384301900863647, + "learning_rate": 3.888711819275048e-06, + "loss": 0.3176, + "step": 11820 + }, + { + "epoch": 2.4795725958516655, + "grad_norm": 1.4185879230499268, + "learning_rate": 3.84339987110641e-06, + "loss": 0.3183, + "step": 11835 + }, + { + "epoch": 2.482715273412948, + "grad_norm": 1.382876992225647, + "learning_rate": 3.7983314716114384e-06, + "loss": 0.3044, + "step": 11850 + }, + { + "epoch": 2.4858579509742302, + "grad_norm": 1.7051907777786255, + "learning_rate": 3.7535071396045286e-06, + "loss": 0.3701, + "step": 11865 + }, + { + "epoch": 2.4890006285355124, + "grad_norm": 1.6134312152862549, + "learning_rate": 3.708927391090447e-06, + "loss": 0.2941, + "step": 11880 + }, + { + "epoch": 2.4921433060967946, + "grad_norm": 1.5831973552703857, + "learning_rate": 3.664592739258399e-06, + "loss": 0.33, + "step": 11895 + }, + { + "epoch": 2.4952859836580767, + "grad_norm": 1.5520756244659424, + "learning_rate": 3.6205036944761045e-06, + "loss": 0.3087, + "step": 11910 + }, + { + "epoch": 2.498428661219359, + "grad_norm": 1.497530460357666, + "learning_rate": 3.5766607642839093e-06, + "loss": 0.3003, + "step": 11925 + }, + { + "epoch": 2.501571338780641, + "grad_norm": 1.3204107284545898, + "learning_rate": 3.5330644533889705e-06, + "loss": 0.284, + "step": 11940 + }, + { + "epoch": 2.5047140163419233, + "grad_norm": 1.4598573446273804, + "learning_rate": 3.489715263659435e-06, + "loss": 0.2783, + "step": 11955 + }, + { + "epoch": 2.5078566939032054, + "grad_norm": 1.5349574089050293, + "learning_rate": 3.4466136941186724e-06, + "loss": 0.2826, + "step": 11970 + }, + { + "epoch": 2.5109993714644876, + "grad_norm": 1.3122080564498901, + "learning_rate": 3.403760240939502e-06, + "loss": 0.2675, + "step": 11985 + }, + { + "epoch": 2.5141420490257698, + "grad_norm": 1.218714714050293, + "learning_rate": 3.361155397438501e-06, + "loss": 0.3582, + "step": 12000 + }, + { + "epoch": 2.517284726587052, + "grad_norm": 1.8126921653747559, + "learning_rate": 3.3187996540703424e-06, + "loss": 0.2697, + "step": 12015 + }, + { + "epoch": 2.520427404148334, + "grad_norm": 1.4559165239334106, + "learning_rate": 3.276693498422104e-06, + "loss": 0.3061, + "step": 12030 + }, + { + "epoch": 2.5235700817096167, + "grad_norm": 1.0276938676834106, + "learning_rate": 3.234837415207706e-06, + "loss": 0.3437, + "step": 12045 + }, + { + "epoch": 2.526712759270899, + "grad_norm": 1.4260108470916748, + "learning_rate": 3.193231886262288e-06, + "loss": 0.282, + "step": 12060 + }, + { + "epoch": 2.529855436832181, + "grad_norm": 1.7475075721740723, + "learning_rate": 3.1518773905366976e-06, + "loss": 0.3306, + "step": 12075 + }, + { + "epoch": 2.5329981143934632, + "grad_norm": 1.1481621265411377, + "learning_rate": 3.1107744040919427e-06, + "loss": 0.2692, + "step": 12090 + }, + { + "epoch": 2.5361407919547454, + "grad_norm": 1.8862768411636353, + "learning_rate": 3.0699234000937464e-06, + "loss": 0.332, + "step": 12105 + }, + { + "epoch": 2.5392834695160276, + "grad_norm": 1.4870737791061401, + "learning_rate": 3.0293248488070745e-06, + "loss": 0.3344, + "step": 12120 + }, + { + "epoch": 2.5424261470773097, + "grad_norm": 1.7676063776016235, + "learning_rate": 2.9889792175907318e-06, + "loss": 0.3323, + "step": 12135 + }, + { + "epoch": 2.5455688246385924, + "grad_norm": 1.3961862325668335, + "learning_rate": 2.9488869708919674e-06, + "loss": 0.3279, + "step": 12150 + }, + { + "epoch": 2.5487115021998745, + "grad_norm": 1.2494407892227173, + "learning_rate": 2.9090485702411603e-06, + "loss": 0.3043, + "step": 12165 + }, + { + "epoch": 2.5518541797611567, + "grad_norm": 2.1194069385528564, + "learning_rate": 2.869464474246483e-06, + "loss": 0.3251, + "step": 12180 + }, + { + "epoch": 2.554996857322439, + "grad_norm": 1.5678242444992065, + "learning_rate": 2.8301351385886214e-06, + "loss": 0.3134, + "step": 12195 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 1.7995771169662476, + "learning_rate": 2.7910610160155256e-06, + "loss": 0.3218, + "step": 12210 + }, + { + "epoch": 2.561282212445003, + "grad_norm": 1.077495813369751, + "learning_rate": 2.7522425563372202e-06, + "loss": 0.2961, + "step": 12225 + }, + { + "epoch": 2.5644248900062854, + "grad_norm": 1.7993483543395996, + "learning_rate": 2.7136802064206157e-06, + "loss": 0.3097, + "step": 12240 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 1.5372523069381714, + "learning_rate": 2.675374410184345e-06, + "loss": 0.2836, + "step": 12255 + }, + { + "epoch": 2.5707102451288497, + "grad_norm": 1.4500757455825806, + "learning_rate": 2.6373256085936742e-06, + "loss": 0.3154, + "step": 12270 + }, + { + "epoch": 2.573852922690132, + "grad_norm": 1.4548457860946655, + "learning_rate": 2.5995342396554325e-06, + "loss": 0.3113, + "step": 12285 + }, + { + "epoch": 2.576995600251414, + "grad_norm": 1.9645068645477295, + "learning_rate": 2.562000738412945e-06, + "loss": 0.3444, + "step": 12300 + }, + { + "epoch": 2.5801382778126962, + "grad_norm": 1.7881463766098022, + "learning_rate": 2.5247255369410418e-06, + "loss": 0.2974, + "step": 12315 + }, + { + "epoch": 2.5832809553739784, + "grad_norm": 1.7925788164138794, + "learning_rate": 2.4877090643410927e-06, + "loss": 0.2944, + "step": 12330 + }, + { + "epoch": 2.586423632935261, + "grad_norm": 1.5786759853363037, + "learning_rate": 2.4509517467360356e-06, + "loss": 0.3785, + "step": 12345 + }, + { + "epoch": 2.589566310496543, + "grad_norm": 1.4962717294692993, + "learning_rate": 2.4144540072654987e-06, + "loss": 0.3267, + "step": 12360 + }, + { + "epoch": 2.5927089880578253, + "grad_norm": 1.163743257522583, + "learning_rate": 2.378216266080929e-06, + "loss": 0.2757, + "step": 12375 + }, + { + "epoch": 2.5958516656191075, + "grad_norm": 1.7964270114898682, + "learning_rate": 2.342238940340746e-06, + "loss": 0.2904, + "step": 12390 + }, + { + "epoch": 2.5989943431803897, + "grad_norm": 1.7889028787612915, + "learning_rate": 2.3065224442055333e-06, + "loss": 0.3064, + "step": 12405 + }, + { + "epoch": 2.602137020741672, + "grad_norm": 1.5097829103469849, + "learning_rate": 2.271067188833281e-06, + "loss": 0.3401, + "step": 12420 + }, + { + "epoch": 2.605279698302954, + "grad_norm": 1.4333211183547974, + "learning_rate": 2.235873582374659e-06, + "loss": 0.2794, + "step": 12435 + }, + { + "epoch": 2.608422375864236, + "grad_norm": 1.2477611303329468, + "learning_rate": 2.200942029968309e-06, + "loss": 0.2935, + "step": 12450 + }, + { + "epoch": 2.611565053425519, + "grad_norm": 1.7559458017349243, + "learning_rate": 2.166272933736177e-06, + "loss": 0.3258, + "step": 12465 + }, + { + "epoch": 2.614707730986801, + "grad_norm": 1.6621719598770142, + "learning_rate": 2.1318666927788834e-06, + "loss": 0.3111, + "step": 12480 + }, + { + "epoch": 2.617850408548083, + "grad_norm": 1.6579554080963135, + "learning_rate": 2.0977237031711506e-06, + "loss": 0.2611, + "step": 12495 + }, + { + "epoch": 2.6209930861093653, + "grad_norm": 1.7369964122772217, + "learning_rate": 2.063844357957223e-06, + "loss": 0.3577, + "step": 12510 + }, + { + "epoch": 2.6241357636706475, + "grad_norm": 1.6332292556762695, + "learning_rate": 2.0302290471463314e-06, + "loss": 0.2942, + "step": 12525 + }, + { + "epoch": 2.6272784412319297, + "grad_norm": 1.5578200817108154, + "learning_rate": 1.996878157708243e-06, + "loss": 0.2695, + "step": 12540 + }, + { + "epoch": 2.630421118793212, + "grad_norm": 1.5188201665878296, + "learning_rate": 1.963792073568757e-06, + "loss": 0.3078, + "step": 12555 + }, + { + "epoch": 2.633563796354494, + "grad_norm": 1.8250635862350464, + "learning_rate": 1.9309711756053367e-06, + "loss": 0.3146, + "step": 12570 + }, + { + "epoch": 2.636706473915776, + "grad_norm": 1.7131030559539795, + "learning_rate": 1.8984158416426728e-06, + "loss": 0.3182, + "step": 12585 + }, + { + "epoch": 2.6398491514770583, + "grad_norm": 1.473404884338379, + "learning_rate": 1.8661264464483852e-06, + "loss": 0.2727, + "step": 12600 + }, + { + "epoch": 2.6429918290383405, + "grad_norm": 1.508779764175415, + "learning_rate": 1.8341033617286645e-06, + "loss": 0.3448, + "step": 12615 + }, + { + "epoch": 2.6461345065996227, + "grad_norm": 1.147560477256775, + "learning_rate": 1.8023469561240126e-06, + "loss": 0.2783, + "step": 12630 + }, + { + "epoch": 2.649277184160905, + "grad_norm": 1.760060429573059, + "learning_rate": 1.770857595205011e-06, + "loss": 0.3152, + "step": 12645 + }, + { + "epoch": 2.6524198617221875, + "grad_norm": 1.4739596843719482, + "learning_rate": 1.7396356414680959e-06, + "loss": 0.29, + "step": 12660 + }, + { + "epoch": 2.6555625392834696, + "grad_norm": 1.567877173423767, + "learning_rate": 1.7086814543313816e-06, + "loss": 0.2672, + "step": 12675 + }, + { + "epoch": 2.658705216844752, + "grad_norm": 1.3326002359390259, + "learning_rate": 1.6779953901305295e-06, + "loss": 0.251, + "step": 12690 + }, + { + "epoch": 2.661847894406034, + "grad_norm": 1.3788151741027832, + "learning_rate": 1.647577802114661e-06, + "loss": 0.3416, + "step": 12705 + }, + { + "epoch": 2.664990571967316, + "grad_norm": 1.7790052890777588, + "learning_rate": 1.6174290404422726e-06, + "loss": 0.2999, + "step": 12720 + }, + { + "epoch": 2.6681332495285983, + "grad_norm": 1.4312305450439453, + "learning_rate": 1.5875494521771922e-06, + "loss": 0.3305, + "step": 12735 + }, + { + "epoch": 2.6712759270898805, + "grad_norm": 1.6938543319702148, + "learning_rate": 1.5579393812846316e-06, + "loss": 0.3117, + "step": 12750 + }, + { + "epoch": 2.6744186046511627, + "grad_norm": 1.5854291915893555, + "learning_rate": 1.528599168627165e-06, + "loss": 0.3289, + "step": 12765 + }, + { + "epoch": 2.6775612822124453, + "grad_norm": 1.1590096950531006, + "learning_rate": 1.4995291519608602e-06, + "loss": 0.283, + "step": 12780 + }, + { + "epoch": 2.6807039597737274, + "grad_norm": 1.068301796913147, + "learning_rate": 1.470729665931353e-06, + "loss": 0.331, + "step": 12795 + }, + { + "epoch": 2.6838466373350096, + "grad_norm": 1.2185308933258057, + "learning_rate": 1.4422010420700182e-06, + "loss": 0.3014, + "step": 12810 + }, + { + "epoch": 2.686989314896292, + "grad_norm": 1.4308061599731445, + "learning_rate": 1.413943608790133e-06, + "loss": 0.2939, + "step": 12825 + }, + { + "epoch": 2.690131992457574, + "grad_norm": 1.1259864568710327, + "learning_rate": 1.385957691383119e-06, + "loss": 0.2669, + "step": 12840 + }, + { + "epoch": 2.693274670018856, + "grad_norm": 1.5093046426773071, + "learning_rate": 1.3582436120147729e-06, + "loss": 0.3374, + "step": 12855 + }, + { + "epoch": 2.6964173475801383, + "grad_norm": 1.3771803379058838, + "learning_rate": 1.3308016897215807e-06, + "loss": 0.2783, + "step": 12870 + }, + { + "epoch": 2.6995600251414205, + "grad_norm": 2.384852409362793, + "learning_rate": 1.3036322404070296e-06, + "loss": 0.3162, + "step": 12885 + }, + { + "epoch": 2.700188560653677, + "eval_accuracy": 0.8853943711763073, + "eval_loss": 0.4137997329235077, + "eval_runtime": 1196.9935, + "eval_samples_per_second": 3.998, + "eval_steps_per_second": 1.0, + "step": 12888 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 1.673790693283081, + "learning_rate": 1.2767355768379702e-06, + "loss": 0.2855, + "step": 12900 + }, + { + "epoch": 2.705845380263985, + "grad_norm": 1.8752899169921875, + "learning_rate": 1.2501120086410411e-06, + "loss": 0.3085, + "step": 12915 + }, + { + "epoch": 2.708988057825267, + "grad_norm": 1.8645318746566772, + "learning_rate": 1.2237618422990733e-06, + "loss": 0.3068, + "step": 12930 + }, + { + "epoch": 2.712130735386549, + "grad_norm": 1.9585272073745728, + "learning_rate": 1.1976853811475675e-06, + "loss": 0.3283, + "step": 12945 + }, + { + "epoch": 2.7152734129478313, + "grad_norm": 1.7527602910995483, + "learning_rate": 1.1718829253712204e-06, + "loss": 0.3222, + "step": 12960 + }, + { + "epoch": 2.718416090509114, + "grad_norm": 1.3966923952102661, + "learning_rate": 1.1463547720004546e-06, + "loss": 0.3092, + "step": 12975 + }, + { + "epoch": 2.721558768070396, + "grad_norm": 1.3295458555221558, + "learning_rate": 1.1211012149080074e-06, + "loss": 0.3237, + "step": 12990 + }, + { + "epoch": 2.7247014456316783, + "grad_norm": 0.9988710284233093, + "learning_rate": 1.0961225448055307e-06, + "loss": 0.3216, + "step": 13005 + }, + { + "epoch": 2.7278441231929604, + "grad_norm": 1.6158466339111328, + "learning_rate": 1.0714190492402715e-06, + "loss": 0.3017, + "step": 13020 + }, + { + "epoch": 2.7309868007542426, + "grad_norm": 1.4756746292114258, + "learning_rate": 1.0469910125917358e-06, + "loss": 0.3169, + "step": 13035 + }, + { + "epoch": 2.7341294783155248, + "grad_norm": 1.3889656066894531, + "learning_rate": 1.0228387160684333e-06, + "loss": 0.3754, + "step": 13050 + }, + { + "epoch": 2.737272155876807, + "grad_norm": 1.2530293464660645, + "learning_rate": 9.989624377046258e-07, + "loss": 0.2958, + "step": 13065 + }, + { + "epoch": 2.740414833438089, + "grad_norm": 1.8963161706924438, + "learning_rate": 9.753624523571425e-07, + "loss": 0.3641, + "step": 13080 + }, + { + "epoch": 2.7435575109993717, + "grad_norm": 1.4623044729232788, + "learning_rate": 9.520390317021955e-07, + "loss": 0.3061, + "step": 13095 + }, + { + "epoch": 2.746700188560654, + "grad_norm": 1.604202151298523, + "learning_rate": 9.289924442322767e-07, + "loss": 0.2785, + "step": 13110 + }, + { + "epoch": 2.749842866121936, + "grad_norm": 1.8192863464355469, + "learning_rate": 9.062229552530471e-07, + "loss": 0.3169, + "step": 13125 + }, + { + "epoch": 2.7529855436832182, + "grad_norm": 1.419291377067566, + "learning_rate": 8.83730826880294e-07, + "loss": 0.3015, + "step": 13140 + }, + { + "epoch": 2.7561282212445004, + "grad_norm": 1.5753535032272339, + "learning_rate": 8.615163180369035e-07, + "loss": 0.284, + "step": 13155 + }, + { + "epoch": 2.7592708988057826, + "grad_norm": 1.789189338684082, + "learning_rate": 8.395796844498815e-07, + "loss": 0.3423, + "step": 13170 + }, + { + "epoch": 2.7624135763670647, + "grad_norm": 1.343781590461731, + "learning_rate": 8.17921178647435e-07, + "loss": 0.3119, + "step": 13185 + }, + { + "epoch": 2.765556253928347, + "grad_norm": 1.652388572692871, + "learning_rate": 7.96541049956026e-07, + "loss": 0.3219, + "step": 13200 + }, + { + "epoch": 2.768698931489629, + "grad_norm": 1.597399353981018, + "learning_rate": 7.754395444975221e-07, + "loss": 0.2873, + "step": 13215 + }, + { + "epoch": 2.7718416090509113, + "grad_norm": 1.3452566862106323, + "learning_rate": 7.546169051863672e-07, + "loss": 0.3125, + "step": 13230 + }, + { + "epoch": 2.7749842866121934, + "grad_norm": 1.605913758277893, + "learning_rate": 7.340733717267678e-07, + "loss": 0.278, + "step": 13245 + }, + { + "epoch": 2.7781269641734756, + "grad_norm": 1.465397596359253, + "learning_rate": 7.138091806099589e-07, + "loss": 0.3208, + "step": 13260 + }, + { + "epoch": 2.7812696417347578, + "grad_norm": 1.7374017238616943, + "learning_rate": 6.938245651114506e-07, + "loss": 0.2933, + "step": 13275 + }, + { + "epoch": 2.7844123192960404, + "grad_norm": 1.9815653562545776, + "learning_rate": 6.741197552883771e-07, + "loss": 0.3335, + "step": 13290 + }, + { + "epoch": 2.7875549968573226, + "grad_norm": 1.4085747003555298, + "learning_rate": 6.546949779768136e-07, + "loss": 0.2711, + "step": 13305 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 1.6339495182037354, + "learning_rate": 6.355504567891912e-07, + "loss": 0.3331, + "step": 13320 + }, + { + "epoch": 2.793840351979887, + "grad_norm": 1.441635251045227, + "learning_rate": 6.166864121117167e-07, + "loss": 0.3628, + "step": 13335 + }, + { + "epoch": 2.796983029541169, + "grad_norm": 1.4819507598876953, + "learning_rate": 5.981030611018234e-07, + "loss": 0.2825, + "step": 13350 + }, + { + "epoch": 2.8001257071024512, + "grad_norm": 1.5747650861740112, + "learning_rate": 5.798006176856802e-07, + "loss": 0.3144, + "step": 13365 + }, + { + "epoch": 2.8032683846637334, + "grad_norm": 1.4870857000350952, + "learning_rate": 5.617792925557363e-07, + "loss": 0.3289, + "step": 13380 + }, + { + "epoch": 2.8064110622250156, + "grad_norm": 1.7161614894866943, + "learning_rate": 5.440392931682859e-07, + "loss": 0.3379, + "step": 13395 + }, + { + "epoch": 2.809553739786298, + "grad_norm": 0.8529698848724365, + "learning_rate": 5.265808237410824e-07, + "loss": 0.3143, + "step": 13410 + }, + { + "epoch": 2.8126964173475804, + "grad_norm": 1.6342661380767822, + "learning_rate": 5.094040852509779e-07, + "loss": 0.3144, + "step": 13425 + }, + { + "epoch": 2.8158390949088625, + "grad_norm": 1.4123117923736572, + "learning_rate": 4.925092754316352e-07, + "loss": 0.3407, + "step": 13440 + }, + { + "epoch": 2.8189817724701447, + "grad_norm": 1.3898142576217651, + "learning_rate": 4.7589658877122967e-07, + "loss": 0.3385, + "step": 13455 + }, + { + "epoch": 2.822124450031427, + "grad_norm": 1.6428829431533813, + "learning_rate": 4.5956621651020994e-07, + "loss": 0.2963, + "step": 13470 + }, + { + "epoch": 2.825267127592709, + "grad_norm": 1.465915322303772, + "learning_rate": 4.4351834663910465e-07, + "loss": 0.3302, + "step": 13485 + }, + { + "epoch": 2.828409805153991, + "grad_norm": 1.8282034397125244, + "learning_rate": 4.277531638963689e-07, + "loss": 0.3171, + "step": 13500 + }, + { + "epoch": 2.8315524827152734, + "grad_norm": 2.015639305114746, + "learning_rate": 4.122708497662275e-07, + "loss": 0.3633, + "step": 13515 + }, + { + "epoch": 2.8346951602765555, + "grad_norm": 1.0915390253067017, + "learning_rate": 3.97071582476613e-07, + "loss": 0.3, + "step": 13530 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.9291322827339172, + "learning_rate": 3.821555369971086e-07, + "loss": 0.3471, + "step": 13545 + }, + { + "epoch": 2.84098051539912, + "grad_norm": 1.6048222780227661, + "learning_rate": 3.6752288503691945e-07, + "loss": 0.3209, + "step": 13560 + }, + { + "epoch": 2.844123192960402, + "grad_norm": 1.6999403238296509, + "learning_rate": 3.5317379504291316e-07, + "loss": 0.3446, + "step": 13575 + }, + { + "epoch": 2.8472658705216842, + "grad_norm": 2.1094415187835693, + "learning_rate": 3.391084321976656e-07, + "loss": 0.3502, + "step": 13590 + }, + { + "epoch": 2.850408548082967, + "grad_norm": 1.3436388969421387, + "learning_rate": 3.2532695841758496e-07, + "loss": 0.3167, + "step": 13605 + }, + { + "epoch": 2.853551225644249, + "grad_norm": 1.470632553100586, + "learning_rate": 3.118295323510101e-07, + "loss": 0.3063, + "step": 13620 + }, + { + "epoch": 2.856693903205531, + "grad_norm": 1.0371286869049072, + "learning_rate": 2.9861630937641494e-07, + "loss": 0.3034, + "step": 13635 + }, + { + "epoch": 2.8598365807668134, + "grad_norm": 1.7494783401489258, + "learning_rate": 2.8568744160061e-07, + "loss": 0.2834, + "step": 13650 + }, + { + "epoch": 2.8629792583280955, + "grad_norm": 1.5144836902618408, + "learning_rate": 2.730430778569909e-07, + "loss": 0.3142, + "step": 13665 + }, + { + "epoch": 2.8661219358893777, + "grad_norm": 1.8125107288360596, + "learning_rate": 2.606833637038231e-07, + "loss": 0.3513, + "step": 13680 + }, + { + "epoch": 2.86926461345066, + "grad_norm": 1.099411129951477, + "learning_rate": 2.4860844142256257e-07, + "loss": 0.3025, + "step": 13695 + }, + { + "epoch": 2.872407291011942, + "grad_norm": 1.8955268859863281, + "learning_rate": 2.3681845001623515e-07, + "loss": 0.3418, + "step": 13710 + }, + { + "epoch": 2.8755499685732246, + "grad_norm": 1.2657068967819214, + "learning_rate": 2.2531352520781535e-07, + "loss": 0.2709, + "step": 13725 + }, + { + "epoch": 2.878692646134507, + "grad_norm": 1.8179534673690796, + "learning_rate": 2.140937994386777e-07, + "loss": 0.3291, + "step": 13740 + }, + { + "epoch": 2.881835323695789, + "grad_norm": 1.7901382446289062, + "learning_rate": 2.031594018670674e-07, + "loss": 0.3132, + "step": 13755 + }, + { + "epoch": 2.884978001257071, + "grad_norm": 1.1521648168563843, + "learning_rate": 1.9251045836661263e-07, + "loss": 0.2764, + "step": 13770 + }, + { + "epoch": 2.8881206788183533, + "grad_norm": 1.2185838222503662, + "learning_rate": 1.8214709152487575e-07, + "loss": 0.3465, + "step": 13785 + }, + { + "epoch": 2.8912633563796355, + "grad_norm": 1.640515685081482, + "learning_rate": 1.720694206419432e-07, + "loss": 0.315, + "step": 13800 + }, + { + "epoch": 2.8944060339409177, + "grad_norm": 1.314355731010437, + "learning_rate": 1.6227756172905729e-07, + "loss": 0.2685, + "step": 13815 + }, + { + "epoch": 2.8975487115022, + "grad_norm": 1.2538273334503174, + "learning_rate": 1.527716275072699e-07, + "loss": 0.3432, + "step": 13830 + }, + { + "epoch": 2.900691389063482, + "grad_norm": 1.3175392150878906, + "learning_rate": 1.435517274061493e-07, + "loss": 0.2969, + "step": 13845 + }, + { + "epoch": 2.903834066624764, + "grad_norm": 1.512818694114685, + "learning_rate": 1.346179675625253e-07, + "loss": 0.2804, + "step": 13860 + }, + { + "epoch": 2.9069767441860463, + "grad_norm": 1.2288899421691895, + "learning_rate": 1.2597045081926551e-07, + "loss": 0.3092, + "step": 13875 + }, + { + "epoch": 2.9101194217473285, + "grad_norm": 1.157689094543457, + "learning_rate": 1.1760927672408161e-07, + "loss": 0.3075, + "step": 13890 + }, + { + "epoch": 2.9132620993086107, + "grad_norm": 1.6113057136535645, + "learning_rate": 1.0953454152839993e-07, + "loss": 0.3319, + "step": 13905 + }, + { + "epoch": 2.9164047768698933, + "grad_norm": 1.4615386724472046, + "learning_rate": 1.0174633818623991e-07, + "loss": 0.306, + "step": 13920 + }, + { + "epoch": 2.9195474544311755, + "grad_norm": 1.0442296266555786, + "learning_rate": 9.424475635315122e-08, + "loss": 0.3057, + "step": 13935 + }, + { + "epoch": 2.9226901319924576, + "grad_norm": 1.2906923294067383, + "learning_rate": 8.702988238517562e-08, + "loss": 0.2989, + "step": 13950 + }, + { + "epoch": 2.92583280955374, + "grad_norm": 1.6215356588363647, + "learning_rate": 8.010179933786167e-08, + "loss": 0.324, + "step": 13965 + }, + { + "epoch": 2.928975487115022, + "grad_norm": 1.602383017539978, + "learning_rate": 7.346058696530156e-08, + "loss": 0.381, + "step": 13980 + }, + { + "epoch": 2.932118164676304, + "grad_norm": 1.5103670358657837, + "learning_rate": 6.710632171921527e-08, + "loss": 0.3379, + "step": 13995 + }, + { + "epoch": 2.9352608422375863, + "grad_norm": 1.6660419702529907, + "learning_rate": 6.103907674807064e-08, + "loss": 0.312, + "step": 14010 + }, + { + "epoch": 2.9384035197988685, + "grad_norm": 1.0635946989059448, + "learning_rate": 5.52589218962396e-08, + "loss": 0.2964, + "step": 14025 + }, + { + "epoch": 2.941546197360151, + "grad_norm": 1.247497797012329, + "learning_rate": 4.976592370319611e-08, + "loss": 0.2952, + "step": 14040 + }, + { + "epoch": 2.9446888749214333, + "grad_norm": 1.4133594036102295, + "learning_rate": 4.456014540275e-08, + "loss": 0.2696, + "step": 14055 + }, + { + "epoch": 2.9478315524827154, + "grad_norm": 1.5689040422439575, + "learning_rate": 3.964164692231709e-08, + "loss": 0.341, + "step": 14070 + }, + { + "epoch": 2.9509742300439976, + "grad_norm": 1.2708498239517212, + "learning_rate": 3.5010484882233574e-08, + "loss": 0.3055, + "step": 14085 + }, + { + "epoch": 2.95411690760528, + "grad_norm": 1.7094337940216064, + "learning_rate": 3.066671259510101e-08, + "loss": 0.3289, + "step": 14100 + }, + { + "epoch": 2.957259585166562, + "grad_norm": 1.60092294216156, + "learning_rate": 2.6610380065170136e-08, + "loss": 0.2657, + "step": 14115 + }, + { + "epoch": 2.960402262727844, + "grad_norm": 1.0856350660324097, + "learning_rate": 2.284153398777189e-08, + "loss": 0.3139, + "step": 14130 + }, + { + "epoch": 2.9635449402891263, + "grad_norm": 1.8443694114685059, + "learning_rate": 1.936021774877339e-08, + "loss": 0.2993, + "step": 14145 + }, + { + "epoch": 2.9666876178504085, + "grad_norm": 1.4500629901885986, + "learning_rate": 1.616647142408112e-08, + "loss": 0.2914, + "step": 14160 + }, + { + "epoch": 2.9698302954116906, + "grad_norm": 1.634055256843567, + "learning_rate": 1.3260331779182955e-08, + "loss": 0.3251, + "step": 14175 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 1.6882349252700806, + "learning_rate": 1.0641832268717955e-08, + "loss": 0.2889, + "step": 14190 + }, + { + "epoch": 2.976115650534255, + "grad_norm": 1.6775078773498535, + "learning_rate": 8.311003036098885e-09, + "loss": 0.2957, + "step": 14205 + }, + { + "epoch": 2.979258328095537, + "grad_norm": 2.209030866622925, + "learning_rate": 6.267870913156948e-09, + "loss": 0.3114, + "step": 14220 + }, + { + "epoch": 2.9824010056568198, + "grad_norm": 1.3158173561096191, + "learning_rate": 4.512459419839243e-09, + "loss": 0.293, + "step": 14235 + }, + { + "epoch": 2.985543683218102, + "grad_norm": 1.2444883584976196, + "learning_rate": 3.0447887639367676e-09, + "loss": 0.2313, + "step": 14250 + }, + { + "epoch": 2.988686360779384, + "grad_norm": 1.1739709377288818, + "learning_rate": 1.8648758408512656e-09, + "loss": 0.3228, + "step": 14265 + }, + { + "epoch": 2.9918290383406663, + "grad_norm": 1.4359891414642334, + "learning_rate": 9.72734233398165e-10, + "loss": 0.2946, + "step": 14280 + }, + { + "epoch": 2.9949717159019484, + "grad_norm": 1.3152233362197876, + "learning_rate": 3.6837421165669685e-10, + "loss": 0.2678, + "step": 14295 + }, + { + "epoch": 2.9981143934632306, + "grad_norm": 1.9656248092651367, + "learning_rate": 5.1802732842221036e-11, + "loss": 0.2903, + "step": 14310 + }, + { + "epoch": 3.0, + "step": 14319, + "total_flos": 5.387086302585815e+18, + "train_loss": 0.4039875044737109, + "train_runtime": 21568.6928, + "train_samples_per_second": 2.655, + "train_steps_per_second": 0.664 + } + ], + "logging_steps": 15, + "max_steps": 14319, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 4296, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.387086302585815e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}