diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5501 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 10, + "global_step": 7808, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012807377049180329, + "grad_norm": 13.030157089233398, + "learning_rate": 0.0, + "loss": 0.6785, + "step": 1 + }, + { + "epoch": 0.0012807377049180327, + "grad_norm": 22.160924911499023, + "learning_rate": 3.837953091684436e-07, + "loss": 0.7023, + "step": 10 + }, + { + "epoch": 0.0025614754098360654, + "grad_norm": 28.483007431030273, + "learning_rate": 8.102345415778253e-07, + "loss": 0.5318, + "step": 20 + }, + { + "epoch": 0.0038422131147540983, + "grad_norm": 14.279166221618652, + "learning_rate": 1.236673773987207e-06, + "loss": 0.7028, + "step": 30 + }, + { + "epoch": 0.005122950819672131, + "grad_norm": 19.40032196044922, + "learning_rate": 1.6631130063965886e-06, + "loss": 0.7097, + "step": 40 + }, + { + "epoch": 0.006403688524590164, + "grad_norm": 17.553495407104492, + "learning_rate": 2.08955223880597e-06, + "loss": 0.7789, + "step": 50 + }, + { + "epoch": 0.007684426229508197, + "grad_norm": 5.795664310455322, + "learning_rate": 2.515991471215352e-06, + "loss": 0.5146, + "step": 60 + }, + { + "epoch": 0.008965163934426229, + "grad_norm": 15.086921691894531, + "learning_rate": 2.9424307036247335e-06, + "loss": 0.703, + "step": 70 + }, + { + "epoch": 0.010245901639344262, + "grad_norm": 11.061614990234375, + "learning_rate": 3.3688699360341154e-06, + "loss": 0.5794, + "step": 80 + }, + { + "epoch": 0.011526639344262296, + "grad_norm": 11.43583869934082, + "learning_rate": 3.7953091684434973e-06, + "loss": 0.6144, + "step": 90 + }, + { + "epoch": 0.012807377049180328, + "grad_norm": 5.861094951629639, + "learning_rate": 4.221748400852878e-06, + "loss": 0.5769, + "step": 100 + }, + { + "epoch": 0.01408811475409836, + "grad_norm": 28.796695709228516, + "learning_rate": 4.64818763326226e-06, + "loss": 0.5953, + "step": 110 + }, + { + "epoch": 0.015368852459016393, + "grad_norm": 17.27574348449707, + "learning_rate": 5.074626865671642e-06, + "loss": 0.4116, + "step": 120 + }, + { + "epoch": 0.016649590163934427, + "grad_norm": 20.032840728759766, + "learning_rate": 5.501066098081024e-06, + "loss": 0.7965, + "step": 130 + }, + { + "epoch": 0.017930327868852458, + "grad_norm": 35.11494827270508, + "learning_rate": 5.927505330490405e-06, + "loss": 0.8488, + "step": 140 + }, + { + "epoch": 0.019211065573770492, + "grad_norm": 17.658639907836914, + "learning_rate": 6.353944562899788e-06, + "loss": 0.446, + "step": 150 + }, + { + "epoch": 0.020491803278688523, + "grad_norm": 10.555081367492676, + "learning_rate": 6.780383795309169e-06, + "loss": 0.4964, + "step": 160 + }, + { + "epoch": 0.021772540983606557, + "grad_norm": 30.8939266204834, + "learning_rate": 7.20682302771855e-06, + "loss": 0.5762, + "step": 170 + }, + { + "epoch": 0.02305327868852459, + "grad_norm": 14.771651268005371, + "learning_rate": 7.633262260127933e-06, + "loss": 0.5545, + "step": 180 + }, + { + "epoch": 0.024334016393442622, + "grad_norm": 0.9534880518913269, + "learning_rate": 8.059701492537314e-06, + "loss": 0.3119, + "step": 190 + }, + { + "epoch": 0.025614754098360656, + "grad_norm": 13.96252727508545, + "learning_rate": 8.486140724946695e-06, + "loss": 0.6571, + "step": 200 + }, + { + "epoch": 0.026895491803278687, + "grad_norm": 6.706875801086426, + "learning_rate": 8.912579957356077e-06, + "loss": 0.8117, + "step": 210 + }, + { + "epoch": 0.02817622950819672, + "grad_norm": 57.71232604980469, + "learning_rate": 9.339019189765458e-06, + "loss": 0.4906, + "step": 220 + }, + { + "epoch": 0.029456967213114756, + "grad_norm": 15.123934745788574, + "learning_rate": 9.765458422174841e-06, + "loss": 0.4204, + "step": 230 + }, + { + "epoch": 0.030737704918032786, + "grad_norm": 3.7344789505004883, + "learning_rate": 1.0191897654584222e-05, + "loss": 0.6303, + "step": 240 + }, + { + "epoch": 0.03201844262295082, + "grad_norm": 0.1564660370349884, + "learning_rate": 1.0618336886993603e-05, + "loss": 0.3297, + "step": 250 + }, + { + "epoch": 0.033299180327868855, + "grad_norm": 22.37810516357422, + "learning_rate": 1.1044776119402986e-05, + "loss": 0.3784, + "step": 260 + }, + { + "epoch": 0.034579918032786885, + "grad_norm": 4.665563106536865, + "learning_rate": 1.1471215351812369e-05, + "loss": 0.7696, + "step": 270 + }, + { + "epoch": 0.035860655737704916, + "grad_norm": 9.491741180419922, + "learning_rate": 1.189765458422175e-05, + "loss": 0.5464, + "step": 280 + }, + { + "epoch": 0.037141393442622954, + "grad_norm": 18.859682083129883, + "learning_rate": 1.2324093816631131e-05, + "loss": 0.5767, + "step": 290 + }, + { + "epoch": 0.038422131147540985, + "grad_norm": 7.065849304199219, + "learning_rate": 1.2750533049040512e-05, + "loss": 0.5723, + "step": 300 + }, + { + "epoch": 0.039702868852459015, + "grad_norm": 43.178043365478516, + "learning_rate": 1.3176972281449893e-05, + "loss": 0.6343, + "step": 310 + }, + { + "epoch": 0.040983606557377046, + "grad_norm": 9.827512741088867, + "learning_rate": 1.3603411513859277e-05, + "loss": 0.5718, + "step": 320 + }, + { + "epoch": 0.042264344262295084, + "grad_norm": 2.420236349105835, + "learning_rate": 1.4029850746268658e-05, + "loss": 0.5491, + "step": 330 + }, + { + "epoch": 0.043545081967213115, + "grad_norm": 8.602315902709961, + "learning_rate": 1.445628997867804e-05, + "loss": 0.5566, + "step": 340 + }, + { + "epoch": 0.044825819672131145, + "grad_norm": 19.52743148803711, + "learning_rate": 1.488272921108742e-05, + "loss": 0.4385, + "step": 350 + }, + { + "epoch": 0.04610655737704918, + "grad_norm": 59.86263656616211, + "learning_rate": 1.5309168443496803e-05, + "loss": 0.6635, + "step": 360 + }, + { + "epoch": 0.047387295081967214, + "grad_norm": 35.44069290161133, + "learning_rate": 1.5735607675906184e-05, + "loss": 0.7269, + "step": 370 + }, + { + "epoch": 0.048668032786885244, + "grad_norm": 20.887710571289062, + "learning_rate": 1.616204690831557e-05, + "loss": 0.5448, + "step": 380 + }, + { + "epoch": 0.04994877049180328, + "grad_norm": 22.93721580505371, + "learning_rate": 1.658848614072495e-05, + "loss": 0.7712, + "step": 390 + }, + { + "epoch": 0.05122950819672131, + "grad_norm": 11.434581756591797, + "learning_rate": 1.701492537313433e-05, + "loss": 0.4954, + "step": 400 + }, + { + "epoch": 0.052510245901639344, + "grad_norm": 3.9810707569122314, + "learning_rate": 1.7441364605543712e-05, + "loss": 0.4475, + "step": 410 + }, + { + "epoch": 0.053790983606557374, + "grad_norm": 8.25676155090332, + "learning_rate": 1.7867803837953093e-05, + "loss": 0.5226, + "step": 420 + }, + { + "epoch": 0.05507172131147541, + "grad_norm": 40.57249069213867, + "learning_rate": 1.8294243070362474e-05, + "loss": 0.5507, + "step": 430 + }, + { + "epoch": 0.05635245901639344, + "grad_norm": 0.5660319924354553, + "learning_rate": 1.872068230277186e-05, + "loss": 0.4327, + "step": 440 + }, + { + "epoch": 0.057633196721311473, + "grad_norm": 37.062320709228516, + "learning_rate": 1.914712153518124e-05, + "loss": 0.3592, + "step": 450 + }, + { + "epoch": 0.05891393442622951, + "grad_norm": 22.973651885986328, + "learning_rate": 1.957356076759062e-05, + "loss": 0.4596, + "step": 460 + }, + { + "epoch": 0.06019467213114754, + "grad_norm": 24.05460548400879, + "learning_rate": 2e-05, + "loss": 0.5558, + "step": 470 + }, + { + "epoch": 0.06147540983606557, + "grad_norm": 0.19256171584129333, + "learning_rate": 1.9972748330835264e-05, + "loss": 0.3182, + "step": 480 + }, + { + "epoch": 0.0627561475409836, + "grad_norm": 3.1222236156463623, + "learning_rate": 1.994549666167053e-05, + "loss": 0.6922, + "step": 490 + }, + { + "epoch": 0.06403688524590163, + "grad_norm": 35.97098922729492, + "learning_rate": 1.9918244992505793e-05, + "loss": 0.9556, + "step": 500 + }, + { + "epoch": 0.06531762295081968, + "grad_norm": 0.13992930948734283, + "learning_rate": 1.9890993323341056e-05, + "loss": 0.6617, + "step": 510 + }, + { + "epoch": 0.06659836065573771, + "grad_norm": 5.176881313323975, + "learning_rate": 1.986374165417632e-05, + "loss": 0.8168, + "step": 520 + }, + { + "epoch": 0.06787909836065574, + "grad_norm": 43.677433013916016, + "learning_rate": 1.9836489985011584e-05, + "loss": 0.7252, + "step": 530 + }, + { + "epoch": 0.06915983606557377, + "grad_norm": 15.75368881225586, + "learning_rate": 1.9809238315846847e-05, + "loss": 0.6547, + "step": 540 + }, + { + "epoch": 0.0704405737704918, + "grad_norm": 14.22448444366455, + "learning_rate": 1.9781986646682113e-05, + "loss": 0.4269, + "step": 550 + }, + { + "epoch": 0.07172131147540983, + "grad_norm": 20.48627471923828, + "learning_rate": 1.9754734977517372e-05, + "loss": 1.0193, + "step": 560 + }, + { + "epoch": 0.07300204918032786, + "grad_norm": 51.78612518310547, + "learning_rate": 1.9727483308352638e-05, + "loss": 0.5702, + "step": 570 + }, + { + "epoch": 0.07428278688524591, + "grad_norm": 0.18359607458114624, + "learning_rate": 1.97002316391879e-05, + "loss": 0.3638, + "step": 580 + }, + { + "epoch": 0.07556352459016394, + "grad_norm": 74.03116607666016, + "learning_rate": 1.9672979970023163e-05, + "loss": 0.7977, + "step": 590 + }, + { + "epoch": 0.07684426229508197, + "grad_norm": 12.116443634033203, + "learning_rate": 1.964572830085843e-05, + "loss": 0.8353, + "step": 600 + }, + { + "epoch": 0.078125, + "grad_norm": 36.37770080566406, + "learning_rate": 1.9618476631693692e-05, + "loss": 0.534, + "step": 610 + }, + { + "epoch": 0.07940573770491803, + "grad_norm": 1.0840022563934326, + "learning_rate": 1.9591224962528958e-05, + "loss": 0.6757, + "step": 620 + }, + { + "epoch": 0.08068647540983606, + "grad_norm": 18.524744033813477, + "learning_rate": 1.956397329336422e-05, + "loss": 0.6852, + "step": 630 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.5453315377235413, + "learning_rate": 1.9536721624199483e-05, + "loss": 0.3746, + "step": 640 + }, + { + "epoch": 0.08324795081967214, + "grad_norm": 3.647247076034546, + "learning_rate": 1.950946995503475e-05, + "loss": 0.3404, + "step": 650 + }, + { + "epoch": 0.08452868852459017, + "grad_norm": 2.8789007663726807, + "learning_rate": 1.9482218285870012e-05, + "loss": 0.558, + "step": 660 + }, + { + "epoch": 0.0858094262295082, + "grad_norm": 59.30935287475586, + "learning_rate": 1.9454966616705274e-05, + "loss": 0.2678, + "step": 670 + }, + { + "epoch": 0.08709016393442623, + "grad_norm": 36.327701568603516, + "learning_rate": 1.942771494754054e-05, + "loss": 0.8112, + "step": 680 + }, + { + "epoch": 0.08837090163934426, + "grad_norm": 30.401525497436523, + "learning_rate": 1.9400463278375803e-05, + "loss": 0.8637, + "step": 690 + }, + { + "epoch": 0.08965163934426229, + "grad_norm": 65.09701538085938, + "learning_rate": 1.9373211609211066e-05, + "loss": 0.4575, + "step": 700 + }, + { + "epoch": 0.09093237704918032, + "grad_norm": 0.17984363436698914, + "learning_rate": 1.9345959940046332e-05, + "loss": 0.6336, + "step": 710 + }, + { + "epoch": 0.09221311475409837, + "grad_norm": 8.531198501586914, + "learning_rate": 1.931870827088159e-05, + "loss": 0.6553, + "step": 720 + }, + { + "epoch": 0.0934938524590164, + "grad_norm": 1.3908320665359497, + "learning_rate": 1.9291456601716857e-05, + "loss": 0.5388, + "step": 730 + }, + { + "epoch": 0.09477459016393443, + "grad_norm": 27.024486541748047, + "learning_rate": 1.926420493255212e-05, + "loss": 0.5134, + "step": 740 + }, + { + "epoch": 0.09605532786885246, + "grad_norm": 1.363821268081665, + "learning_rate": 1.9236953263387382e-05, + "loss": 0.4295, + "step": 750 + }, + { + "epoch": 0.09733606557377049, + "grad_norm": 18.301353454589844, + "learning_rate": 1.9209701594222648e-05, + "loss": 0.8292, + "step": 760 + }, + { + "epoch": 0.09861680327868852, + "grad_norm": 7.517091751098633, + "learning_rate": 1.918244992505791e-05, + "loss": 0.5639, + "step": 770 + }, + { + "epoch": 0.09989754098360656, + "grad_norm": 0.8409481048583984, + "learning_rate": 1.9155198255893174e-05, + "loss": 0.4416, + "step": 780 + }, + { + "epoch": 0.1011782786885246, + "grad_norm": 10.660968780517578, + "learning_rate": 1.912794658672844e-05, + "loss": 0.6898, + "step": 790 + }, + { + "epoch": 0.10245901639344263, + "grad_norm": 13.175348281860352, + "learning_rate": 1.9100694917563702e-05, + "loss": 0.4468, + "step": 800 + }, + { + "epoch": 0.10373975409836066, + "grad_norm": 11.351682662963867, + "learning_rate": 1.9073443248398965e-05, + "loss": 1.0168, + "step": 810 + }, + { + "epoch": 0.10502049180327869, + "grad_norm": 2.7584354877471924, + "learning_rate": 1.904619157923423e-05, + "loss": 0.4303, + "step": 820 + }, + { + "epoch": 0.10630122950819672, + "grad_norm": 34.519954681396484, + "learning_rate": 1.9018939910069493e-05, + "loss": 0.1582, + "step": 830 + }, + { + "epoch": 0.10758196721311475, + "grad_norm": 1.5243237018585205, + "learning_rate": 1.8991688240904756e-05, + "loss": 0.5226, + "step": 840 + }, + { + "epoch": 0.1088627049180328, + "grad_norm": 12.513932228088379, + "learning_rate": 1.8964436571740022e-05, + "loss": 0.6845, + "step": 850 + }, + { + "epoch": 0.11014344262295082, + "grad_norm": 32.45783996582031, + "learning_rate": 1.8937184902575285e-05, + "loss": 1.0624, + "step": 860 + }, + { + "epoch": 0.11142418032786885, + "grad_norm": 0.5410599112510681, + "learning_rate": 1.8909933233410547e-05, + "loss": 0.7797, + "step": 870 + }, + { + "epoch": 0.11270491803278689, + "grad_norm": 40.27082443237305, + "learning_rate": 1.888268156424581e-05, + "loss": 0.6134, + "step": 880 + }, + { + "epoch": 0.11398565573770492, + "grad_norm": 14.060335159301758, + "learning_rate": 1.8855429895081076e-05, + "loss": 0.4326, + "step": 890 + }, + { + "epoch": 0.11526639344262295, + "grad_norm": 13.475322723388672, + "learning_rate": 1.882817822591634e-05, + "loss": 0.7133, + "step": 900 + }, + { + "epoch": 0.11654713114754098, + "grad_norm": 3.2171595096588135, + "learning_rate": 1.88009265567516e-05, + "loss": 0.6357, + "step": 910 + }, + { + "epoch": 0.11782786885245902, + "grad_norm": 34.33395767211914, + "learning_rate": 1.8773674887586867e-05, + "loss": 0.6751, + "step": 920 + }, + { + "epoch": 0.11910860655737705, + "grad_norm": 0.17749445140361786, + "learning_rate": 1.874642321842213e-05, + "loss": 0.3697, + "step": 930 + }, + { + "epoch": 0.12038934426229508, + "grad_norm": 49.89470291137695, + "learning_rate": 1.8719171549257392e-05, + "loss": 0.5144, + "step": 940 + }, + { + "epoch": 0.12167008196721311, + "grad_norm": 15.842961311340332, + "learning_rate": 1.869191988009266e-05, + "loss": 0.7409, + "step": 950 + }, + { + "epoch": 0.12295081967213115, + "grad_norm": 5.076769828796387, + "learning_rate": 1.866466821092792e-05, + "loss": 0.2575, + "step": 960 + }, + { + "epoch": 0.12423155737704918, + "grad_norm": 6.906425476074219, + "learning_rate": 1.8637416541763184e-05, + "loss": 0.4676, + "step": 970 + }, + { + "epoch": 0.1255122950819672, + "grad_norm": 0.2420882135629654, + "learning_rate": 1.861016487259845e-05, + "loss": 0.5278, + "step": 980 + }, + { + "epoch": 0.12679303278688525, + "grad_norm": 42.10707473754883, + "learning_rate": 1.8582913203433712e-05, + "loss": 0.2199, + "step": 990 + }, + { + "epoch": 0.12807377049180327, + "grad_norm": 67.881103515625, + "learning_rate": 1.8555661534268975e-05, + "loss": 0.7105, + "step": 1000 + }, + { + "epoch": 0.1293545081967213, + "grad_norm": 0.4502294361591339, + "learning_rate": 1.852840986510424e-05, + "loss": 0.7214, + "step": 1010 + }, + { + "epoch": 0.13063524590163936, + "grad_norm": 0.19563625752925873, + "learning_rate": 1.8501158195939504e-05, + "loss": 0.344, + "step": 1020 + }, + { + "epoch": 0.13191598360655737, + "grad_norm": 70.44747161865234, + "learning_rate": 1.8473906526774766e-05, + "loss": 0.5475, + "step": 1030 + }, + { + "epoch": 0.13319672131147542, + "grad_norm": 68.5734634399414, + "learning_rate": 1.844665485761003e-05, + "loss": 0.866, + "step": 1040 + }, + { + "epoch": 0.13447745901639344, + "grad_norm": 5.665011405944824, + "learning_rate": 1.841940318844529e-05, + "loss": 0.4777, + "step": 1050 + }, + { + "epoch": 0.13575819672131148, + "grad_norm": 34.88306427001953, + "learning_rate": 1.8392151519280557e-05, + "loss": 0.6516, + "step": 1060 + }, + { + "epoch": 0.1370389344262295, + "grad_norm": 5.857304096221924, + "learning_rate": 1.836489985011582e-05, + "loss": 0.3298, + "step": 1070 + }, + { + "epoch": 0.13831967213114754, + "grad_norm": 0.40846720337867737, + "learning_rate": 1.8337648180951083e-05, + "loss": 0.6535, + "step": 1080 + }, + { + "epoch": 0.1396004098360656, + "grad_norm": 26.644474029541016, + "learning_rate": 1.831039651178635e-05, + "loss": 0.5543, + "step": 1090 + }, + { + "epoch": 0.1408811475409836, + "grad_norm": 1.7488807439804077, + "learning_rate": 1.828314484262161e-05, + "loss": 0.6122, + "step": 1100 + }, + { + "epoch": 0.14216188524590165, + "grad_norm": 63.28523254394531, + "learning_rate": 1.8255893173456874e-05, + "loss": 0.805, + "step": 1110 + }, + { + "epoch": 0.14344262295081966, + "grad_norm": 56.30666732788086, + "learning_rate": 1.822864150429214e-05, + "loss": 0.8171, + "step": 1120 + }, + { + "epoch": 0.1447233606557377, + "grad_norm": 52.1702880859375, + "learning_rate": 1.8201389835127403e-05, + "loss": 0.5012, + "step": 1130 + }, + { + "epoch": 0.14600409836065573, + "grad_norm": 6.9870452880859375, + "learning_rate": 1.817413816596267e-05, + "loss": 0.3778, + "step": 1140 + }, + { + "epoch": 0.14728483606557377, + "grad_norm": 48.00603103637695, + "learning_rate": 1.814688649679793e-05, + "loss": 0.4939, + "step": 1150 + }, + { + "epoch": 0.14856557377049182, + "grad_norm": 0.6154949069023132, + "learning_rate": 1.8119634827633194e-05, + "loss": 0.3668, + "step": 1160 + }, + { + "epoch": 0.14984631147540983, + "grad_norm": 1.350846529006958, + "learning_rate": 1.809238315846846e-05, + "loss": 0.8219, + "step": 1170 + }, + { + "epoch": 0.15112704918032788, + "grad_norm": 17.47528648376465, + "learning_rate": 1.8065131489303723e-05, + "loss": 0.643, + "step": 1180 + }, + { + "epoch": 0.1524077868852459, + "grad_norm": 2.0453598499298096, + "learning_rate": 1.8037879820138985e-05, + "loss": 0.6053, + "step": 1190 + }, + { + "epoch": 0.15368852459016394, + "grad_norm": 28.069385528564453, + "learning_rate": 1.8010628150974248e-05, + "loss": 0.7856, + "step": 1200 + }, + { + "epoch": 0.15496926229508196, + "grad_norm": 4.573185920715332, + "learning_rate": 1.798337648180951e-05, + "loss": 0.4821, + "step": 1210 + }, + { + "epoch": 0.15625, + "grad_norm": 49.39838790893555, + "learning_rate": 1.7956124812644776e-05, + "loss": 0.596, + "step": 1220 + }, + { + "epoch": 0.15753073770491804, + "grad_norm": 5.040111064910889, + "learning_rate": 1.792887314348004e-05, + "loss": 0.5817, + "step": 1230 + }, + { + "epoch": 0.15881147540983606, + "grad_norm": 75.80863189697266, + "learning_rate": 1.79016214743153e-05, + "loss": 1.0482, + "step": 1240 + }, + { + "epoch": 0.1600922131147541, + "grad_norm": 8.544283866882324, + "learning_rate": 1.7874369805150568e-05, + "loss": 0.616, + "step": 1250 + }, + { + "epoch": 0.16137295081967212, + "grad_norm": 11.687309265136719, + "learning_rate": 1.784711813598583e-05, + "loss": 0.4421, + "step": 1260 + }, + { + "epoch": 0.16265368852459017, + "grad_norm": 11.043490409851074, + "learning_rate": 1.7819866466821093e-05, + "loss": 0.2868, + "step": 1270 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 3.897243022918701, + "learning_rate": 1.779261479765636e-05, + "loss": 0.4681, + "step": 1280 + }, + { + "epoch": 0.16521516393442623, + "grad_norm": 0.32223525643348694, + "learning_rate": 1.776536312849162e-05, + "loss": 0.3196, + "step": 1290 + }, + { + "epoch": 0.16649590163934427, + "grad_norm": 0.1265946328639984, + "learning_rate": 1.7738111459326884e-05, + "loss": 0.638, + "step": 1300 + }, + { + "epoch": 0.1677766393442623, + "grad_norm": 40.56721115112305, + "learning_rate": 1.771085979016215e-05, + "loss": 0.5043, + "step": 1310 + }, + { + "epoch": 0.16905737704918034, + "grad_norm": 6.785134315490723, + "learning_rate": 1.7683608120997413e-05, + "loss": 0.4148, + "step": 1320 + }, + { + "epoch": 0.17033811475409835, + "grad_norm": 33.5522346496582, + "learning_rate": 1.7656356451832675e-05, + "loss": 1.1591, + "step": 1330 + }, + { + "epoch": 0.1716188524590164, + "grad_norm": 7.858984470367432, + "learning_rate": 1.762910478266794e-05, + "loss": 0.9522, + "step": 1340 + }, + { + "epoch": 0.1728995901639344, + "grad_norm": 32.17461013793945, + "learning_rate": 1.7601853113503204e-05, + "loss": 0.5926, + "step": 1350 + }, + { + "epoch": 0.17418032786885246, + "grad_norm": 11.334968566894531, + "learning_rate": 1.7574601444338467e-05, + "loss": 0.7914, + "step": 1360 + }, + { + "epoch": 0.1754610655737705, + "grad_norm": 13.335744857788086, + "learning_rate": 1.754734977517373e-05, + "loss": 0.8537, + "step": 1370 + }, + { + "epoch": 0.17674180327868852, + "grad_norm": 19.00205421447754, + "learning_rate": 1.7520098106008992e-05, + "loss": 0.4838, + "step": 1380 + }, + { + "epoch": 0.17802254098360656, + "grad_norm": 8.699183464050293, + "learning_rate": 1.7492846436844258e-05, + "loss": 0.666, + "step": 1390 + }, + { + "epoch": 0.17930327868852458, + "grad_norm": 1.6320335865020752, + "learning_rate": 1.746559476767952e-05, + "loss": 0.5107, + "step": 1400 + }, + { + "epoch": 0.18058401639344263, + "grad_norm": 1.2799221277236938, + "learning_rate": 1.7438343098514787e-05, + "loss": 0.4847, + "step": 1410 + }, + { + "epoch": 0.18186475409836064, + "grad_norm": 2.808711528778076, + "learning_rate": 1.741109142935005e-05, + "loss": 0.4733, + "step": 1420 + }, + { + "epoch": 0.1831454918032787, + "grad_norm": 18.037717819213867, + "learning_rate": 1.7383839760185312e-05, + "loss": 0.6985, + "step": 1430 + }, + { + "epoch": 0.18442622950819673, + "grad_norm": 2.3388659954071045, + "learning_rate": 1.7356588091020578e-05, + "loss": 0.2029, + "step": 1440 + }, + { + "epoch": 0.18570696721311475, + "grad_norm": 15.241260528564453, + "learning_rate": 1.732933642185584e-05, + "loss": 0.2651, + "step": 1450 + }, + { + "epoch": 0.1869877049180328, + "grad_norm": 27.643362045288086, + "learning_rate": 1.7302084752691103e-05, + "loss": 0.6641, + "step": 1460 + }, + { + "epoch": 0.1882684426229508, + "grad_norm": 51.026947021484375, + "learning_rate": 1.727483308352637e-05, + "loss": 0.5786, + "step": 1470 + }, + { + "epoch": 0.18954918032786885, + "grad_norm": 62.00007247924805, + "learning_rate": 1.7247581414361632e-05, + "loss": 0.3976, + "step": 1480 + }, + { + "epoch": 0.19082991803278687, + "grad_norm": 80.54548645019531, + "learning_rate": 1.7220329745196894e-05, + "loss": 0.4812, + "step": 1490 + }, + { + "epoch": 0.19211065573770492, + "grad_norm": 108.28478240966797, + "learning_rate": 1.719307807603216e-05, + "loss": 0.3106, + "step": 1500 + }, + { + "epoch": 0.19339139344262296, + "grad_norm": 31.335493087768555, + "learning_rate": 1.7165826406867423e-05, + "loss": 0.4389, + "step": 1510 + }, + { + "epoch": 0.19467213114754098, + "grad_norm": 2.4842689037323, + "learning_rate": 1.7138574737702686e-05, + "loss": 0.4561, + "step": 1520 + }, + { + "epoch": 0.19595286885245902, + "grad_norm": 34.57732391357422, + "learning_rate": 1.7111323068537948e-05, + "loss": 0.6538, + "step": 1530 + }, + { + "epoch": 0.19723360655737704, + "grad_norm": 89.62613677978516, + "learning_rate": 1.708407139937321e-05, + "loss": 0.3808, + "step": 1540 + }, + { + "epoch": 0.19851434426229508, + "grad_norm": 0.6716292500495911, + "learning_rate": 1.7056819730208477e-05, + "loss": 0.5189, + "step": 1550 + }, + { + "epoch": 0.19979508196721313, + "grad_norm": 32.78571319580078, + "learning_rate": 1.702956806104374e-05, + "loss": 1.2536, + "step": 1560 + }, + { + "epoch": 0.20107581967213115, + "grad_norm": 5.39422607421875, + "learning_rate": 1.7002316391879002e-05, + "loss": 0.4674, + "step": 1570 + }, + { + "epoch": 0.2023565573770492, + "grad_norm": 0.295356422662735, + "learning_rate": 1.6975064722714268e-05, + "loss": 0.9923, + "step": 1580 + }, + { + "epoch": 0.2036372950819672, + "grad_norm": 2.7056820392608643, + "learning_rate": 1.694781305354953e-05, + "loss": 0.1946, + "step": 1590 + }, + { + "epoch": 0.20491803278688525, + "grad_norm": 2.453801393508911, + "learning_rate": 1.6920561384384793e-05, + "loss": 0.4442, + "step": 1600 + }, + { + "epoch": 0.20619877049180327, + "grad_norm": 5.696882247924805, + "learning_rate": 1.689330971522006e-05, + "loss": 0.5623, + "step": 1610 + }, + { + "epoch": 0.2074795081967213, + "grad_norm": 17.160661697387695, + "learning_rate": 1.6866058046055322e-05, + "loss": 1.097, + "step": 1620 + }, + { + "epoch": 0.20876024590163936, + "grad_norm": 23.408737182617188, + "learning_rate": 1.6838806376890585e-05, + "loss": 0.5359, + "step": 1630 + }, + { + "epoch": 0.21004098360655737, + "grad_norm": 0.7226897478103638, + "learning_rate": 1.681155470772585e-05, + "loss": 0.3844, + "step": 1640 + }, + { + "epoch": 0.21132172131147542, + "grad_norm": 28.273542404174805, + "learning_rate": 1.6784303038561113e-05, + "loss": 0.6221, + "step": 1650 + }, + { + "epoch": 0.21260245901639344, + "grad_norm": 0.6800060272216797, + "learning_rate": 1.6757051369396376e-05, + "loss": 0.4647, + "step": 1660 + }, + { + "epoch": 0.21388319672131148, + "grad_norm": 7.838409423828125, + "learning_rate": 1.6729799700231642e-05, + "loss": 0.7482, + "step": 1670 + }, + { + "epoch": 0.2151639344262295, + "grad_norm": 68.58909606933594, + "learning_rate": 1.6702548031066905e-05, + "loss": 0.7191, + "step": 1680 + }, + { + "epoch": 0.21644467213114754, + "grad_norm": 10.408316612243652, + "learning_rate": 1.6675296361902167e-05, + "loss": 0.9812, + "step": 1690 + }, + { + "epoch": 0.2177254098360656, + "grad_norm": 45.571781158447266, + "learning_rate": 1.664804469273743e-05, + "loss": 0.4901, + "step": 1700 + }, + { + "epoch": 0.2190061475409836, + "grad_norm": 14.653166770935059, + "learning_rate": 1.6620793023572696e-05, + "loss": 0.4544, + "step": 1710 + }, + { + "epoch": 0.22028688524590165, + "grad_norm": 75.12469482421875, + "learning_rate": 1.659354135440796e-05, + "loss": 0.5393, + "step": 1720 + }, + { + "epoch": 0.22156762295081966, + "grad_norm": 20.70387077331543, + "learning_rate": 1.656628968524322e-05, + "loss": 0.7864, + "step": 1730 + }, + { + "epoch": 0.2228483606557377, + "grad_norm": 2.0562734603881836, + "learning_rate": 1.6539038016078487e-05, + "loss": 0.6331, + "step": 1740 + }, + { + "epoch": 0.22412909836065573, + "grad_norm": 13.042604446411133, + "learning_rate": 1.651178634691375e-05, + "loss": 0.5563, + "step": 1750 + }, + { + "epoch": 0.22540983606557377, + "grad_norm": 33.89776611328125, + "learning_rate": 1.6484534677749012e-05, + "loss": 0.4452, + "step": 1760 + }, + { + "epoch": 0.22669057377049182, + "grad_norm": 16.996103286743164, + "learning_rate": 1.645728300858428e-05, + "loss": 0.6087, + "step": 1770 + }, + { + "epoch": 0.22797131147540983, + "grad_norm": 2.9814796447753906, + "learning_rate": 1.643003133941954e-05, + "loss": 0.3813, + "step": 1780 + }, + { + "epoch": 0.22925204918032788, + "grad_norm": 0.20661257207393646, + "learning_rate": 1.6402779670254804e-05, + "loss": 0.3531, + "step": 1790 + }, + { + "epoch": 0.2305327868852459, + "grad_norm": 0.23248881101608276, + "learning_rate": 1.637552800109007e-05, + "loss": 0.4496, + "step": 1800 + }, + { + "epoch": 0.23181352459016394, + "grad_norm": 55.3471565246582, + "learning_rate": 1.6348276331925332e-05, + "loss": 0.5625, + "step": 1810 + }, + { + "epoch": 0.23309426229508196, + "grad_norm": 11.669384002685547, + "learning_rate": 1.6321024662760595e-05, + "loss": 0.4075, + "step": 1820 + }, + { + "epoch": 0.234375, + "grad_norm": 65.76184844970703, + "learning_rate": 1.629377299359586e-05, + "loss": 0.3711, + "step": 1830 + }, + { + "epoch": 0.23565573770491804, + "grad_norm": 1.0016331672668457, + "learning_rate": 1.6266521324431124e-05, + "loss": 0.1958, + "step": 1840 + }, + { + "epoch": 0.23693647540983606, + "grad_norm": 9.233772277832031, + "learning_rate": 1.6239269655266386e-05, + "loss": 0.5992, + "step": 1850 + }, + { + "epoch": 0.2382172131147541, + "grad_norm": 7.4546732902526855, + "learning_rate": 1.621201798610165e-05, + "loss": 0.775, + "step": 1860 + }, + { + "epoch": 0.23949795081967212, + "grad_norm": 0.771056056022644, + "learning_rate": 1.618476631693691e-05, + "loss": 0.6516, + "step": 1870 + }, + { + "epoch": 0.24077868852459017, + "grad_norm": 13.350895881652832, + "learning_rate": 1.6157514647772177e-05, + "loss": 0.6574, + "step": 1880 + }, + { + "epoch": 0.24205942622950818, + "grad_norm": 1.9616976976394653, + "learning_rate": 1.613026297860744e-05, + "loss": 0.7321, + "step": 1890 + }, + { + "epoch": 0.24334016393442623, + "grad_norm": 0.2918919622898102, + "learning_rate": 1.6103011309442703e-05, + "loss": 1.1059, + "step": 1900 + }, + { + "epoch": 0.24462090163934427, + "grad_norm": 13.870285987854004, + "learning_rate": 1.607575964027797e-05, + "loss": 0.4844, + "step": 1910 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 19.64275360107422, + "learning_rate": 1.604850797111323e-05, + "loss": 0.3566, + "step": 1920 + }, + { + "epoch": 0.24718237704918034, + "grad_norm": 54.27963638305664, + "learning_rate": 1.6021256301948497e-05, + "loss": 0.5747, + "step": 1930 + }, + { + "epoch": 0.24846311475409835, + "grad_norm": 163.11248779296875, + "learning_rate": 1.599400463278376e-05, + "loss": 1.0376, + "step": 1940 + }, + { + "epoch": 0.2497438524590164, + "grad_norm": 3.2400197982788086, + "learning_rate": 1.5966752963619023e-05, + "loss": 0.3435, + "step": 1950 + }, + { + "epoch": 0.2510245901639344, + "grad_norm": 0.17113502323627472, + "learning_rate": 1.593950129445429e-05, + "loss": 0.5393, + "step": 1960 + }, + { + "epoch": 0.25230532786885246, + "grad_norm": 22.859413146972656, + "learning_rate": 1.591224962528955e-05, + "loss": 0.3003, + "step": 1970 + }, + { + "epoch": 0.2535860655737705, + "grad_norm": 1.3010896444320679, + "learning_rate": 1.5884997956124814e-05, + "loss": 0.626, + "step": 1980 + }, + { + "epoch": 0.25486680327868855, + "grad_norm": 2.824781656265259, + "learning_rate": 1.585774628696008e-05, + "loss": 0.5887, + "step": 1990 + }, + { + "epoch": 0.25614754098360654, + "grad_norm": 52.8790397644043, + "learning_rate": 1.5830494617795342e-05, + "loss": 0.5767, + "step": 2000 + }, + { + "epoch": 0.2574282786885246, + "grad_norm": 10.472972869873047, + "learning_rate": 1.5803242948630605e-05, + "loss": 0.5818, + "step": 2010 + }, + { + "epoch": 0.2587090163934426, + "grad_norm": 0.7781365513801575, + "learning_rate": 1.5775991279465868e-05, + "loss": 0.9479, + "step": 2020 + }, + { + "epoch": 0.25998975409836067, + "grad_norm": 5.116518974304199, + "learning_rate": 1.574873961030113e-05, + "loss": 0.6473, + "step": 2030 + }, + { + "epoch": 0.2612704918032787, + "grad_norm": 31.682783126831055, + "learning_rate": 1.5721487941136396e-05, + "loss": 1.0271, + "step": 2040 + }, + { + "epoch": 0.2625512295081967, + "grad_norm": 0.6573253273963928, + "learning_rate": 1.569423627197166e-05, + "loss": 0.3799, + "step": 2050 + }, + { + "epoch": 0.26383196721311475, + "grad_norm": 5.006514072418213, + "learning_rate": 1.566698460280692e-05, + "loss": 0.406, + "step": 2060 + }, + { + "epoch": 0.2651127049180328, + "grad_norm": 30.3986873626709, + "learning_rate": 1.5639732933642188e-05, + "loss": 0.7609, + "step": 2070 + }, + { + "epoch": 0.26639344262295084, + "grad_norm": 8.392414093017578, + "learning_rate": 1.561248126447745e-05, + "loss": 0.3631, + "step": 2080 + }, + { + "epoch": 0.2676741803278688, + "grad_norm": 0.6506038308143616, + "learning_rate": 1.5585229595312713e-05, + "loss": 0.5563, + "step": 2090 + }, + { + "epoch": 0.26895491803278687, + "grad_norm": 34.08297348022461, + "learning_rate": 1.555797792614798e-05, + "loss": 0.3359, + "step": 2100 + }, + { + "epoch": 0.2702356557377049, + "grad_norm": 30.52340316772461, + "learning_rate": 1.553072625698324e-05, + "loss": 0.4175, + "step": 2110 + }, + { + "epoch": 0.27151639344262296, + "grad_norm": 19.159420013427734, + "learning_rate": 1.5503474587818504e-05, + "loss": 0.5232, + "step": 2120 + }, + { + "epoch": 0.272797131147541, + "grad_norm": 1.3067234754562378, + "learning_rate": 1.547622291865377e-05, + "loss": 0.2685, + "step": 2130 + }, + { + "epoch": 0.274077868852459, + "grad_norm": 29.783512115478516, + "learning_rate": 1.5448971249489033e-05, + "loss": 0.7583, + "step": 2140 + }, + { + "epoch": 0.27535860655737704, + "grad_norm": 55.58544921875, + "learning_rate": 1.5421719580324295e-05, + "loss": 0.7714, + "step": 2150 + }, + { + "epoch": 0.2766393442622951, + "grad_norm": 6.930970191955566, + "learning_rate": 1.539446791115956e-05, + "loss": 0.4346, + "step": 2160 + }, + { + "epoch": 0.27792008196721313, + "grad_norm": 7.723865509033203, + "learning_rate": 1.5367216241994824e-05, + "loss": 0.6486, + "step": 2170 + }, + { + "epoch": 0.2792008196721312, + "grad_norm": 0.23200243711471558, + "learning_rate": 1.5339964572830087e-05, + "loss": 0.514, + "step": 2180 + }, + { + "epoch": 0.28048155737704916, + "grad_norm": 29.773784637451172, + "learning_rate": 1.531271290366535e-05, + "loss": 0.9953, + "step": 2190 + }, + { + "epoch": 0.2817622950819672, + "grad_norm": 19.467941284179688, + "learning_rate": 1.5285461234500615e-05, + "loss": 0.6698, + "step": 2200 + }, + { + "epoch": 0.28304303278688525, + "grad_norm": 0.44849446415901184, + "learning_rate": 1.5258209565335878e-05, + "loss": 0.3486, + "step": 2210 + }, + { + "epoch": 0.2843237704918033, + "grad_norm": 3.40317702293396, + "learning_rate": 1.523095789617114e-05, + "loss": 0.3062, + "step": 2220 + }, + { + "epoch": 0.2856045081967213, + "grad_norm": 23.58439826965332, + "learning_rate": 1.5203706227006405e-05, + "loss": 0.4546, + "step": 2230 + }, + { + "epoch": 0.28688524590163933, + "grad_norm": 0.14240220189094543, + "learning_rate": 1.517645455784167e-05, + "loss": 0.466, + "step": 2240 + }, + { + "epoch": 0.2881659836065574, + "grad_norm": 22.152645111083984, + "learning_rate": 1.5149202888676932e-05, + "loss": 0.9443, + "step": 2250 + }, + { + "epoch": 0.2894467213114754, + "grad_norm": 40.078433990478516, + "learning_rate": 1.5121951219512196e-05, + "loss": 0.7763, + "step": 2260 + }, + { + "epoch": 0.29072745901639346, + "grad_norm": 22.58036231994629, + "learning_rate": 1.509469955034746e-05, + "loss": 0.5156, + "step": 2270 + }, + { + "epoch": 0.29200819672131145, + "grad_norm": 9.161020278930664, + "learning_rate": 1.5067447881182725e-05, + "loss": 0.5463, + "step": 2280 + }, + { + "epoch": 0.2932889344262295, + "grad_norm": 8.112720489501953, + "learning_rate": 1.5040196212017987e-05, + "loss": 0.8208, + "step": 2290 + }, + { + "epoch": 0.29456967213114754, + "grad_norm": 2.3632164001464844, + "learning_rate": 1.5012944542853252e-05, + "loss": 0.5992, + "step": 2300 + }, + { + "epoch": 0.2958504098360656, + "grad_norm": 5.630832672119141, + "learning_rate": 1.4985692873688516e-05, + "loss": 0.6949, + "step": 2310 + }, + { + "epoch": 0.29713114754098363, + "grad_norm": 75.62430572509766, + "learning_rate": 1.4958441204523779e-05, + "loss": 0.5873, + "step": 2320 + }, + { + "epoch": 0.2984118852459016, + "grad_norm": 11.58348274230957, + "learning_rate": 1.4931189535359043e-05, + "loss": 0.704, + "step": 2330 + }, + { + "epoch": 0.29969262295081966, + "grad_norm": 20.816808700561523, + "learning_rate": 1.4903937866194304e-05, + "loss": 0.2416, + "step": 2340 + }, + { + "epoch": 0.3009733606557377, + "grad_norm": 0.5709815621376038, + "learning_rate": 1.4876686197029568e-05, + "loss": 0.4159, + "step": 2350 + }, + { + "epoch": 0.30225409836065575, + "grad_norm": 0.5212659239768982, + "learning_rate": 1.4849434527864833e-05, + "loss": 0.4472, + "step": 2360 + }, + { + "epoch": 0.30353483606557374, + "grad_norm": 10.903100967407227, + "learning_rate": 1.4822182858700095e-05, + "loss": 0.5868, + "step": 2370 + }, + { + "epoch": 0.3048155737704918, + "grad_norm": 60.755706787109375, + "learning_rate": 1.479493118953536e-05, + "loss": 0.8967, + "step": 2380 + }, + { + "epoch": 0.30609631147540983, + "grad_norm": 0.22794629633426666, + "learning_rate": 1.4767679520370624e-05, + "loss": 0.7226, + "step": 2390 + }, + { + "epoch": 0.3073770491803279, + "grad_norm": 52.29710006713867, + "learning_rate": 1.4740427851205888e-05, + "loss": 0.7622, + "step": 2400 + }, + { + "epoch": 0.3086577868852459, + "grad_norm": 0.6769666075706482, + "learning_rate": 1.471317618204115e-05, + "loss": 0.6245, + "step": 2410 + }, + { + "epoch": 0.3099385245901639, + "grad_norm": 1.508181095123291, + "learning_rate": 1.4685924512876415e-05, + "loss": 0.3395, + "step": 2420 + }, + { + "epoch": 0.31121926229508196, + "grad_norm": 78.36157989501953, + "learning_rate": 1.465867284371168e-05, + "loss": 0.6791, + "step": 2430 + }, + { + "epoch": 0.3125, + "grad_norm": 0.1883663535118103, + "learning_rate": 1.4631421174546942e-05, + "loss": 0.2169, + "step": 2440 + }, + { + "epoch": 0.31378073770491804, + "grad_norm": 42.14516067504883, + "learning_rate": 1.4604169505382206e-05, + "loss": 1.2172, + "step": 2450 + }, + { + "epoch": 0.3150614754098361, + "grad_norm": 11.31810474395752, + "learning_rate": 1.457691783621747e-05, + "loss": 0.5506, + "step": 2460 + }, + { + "epoch": 0.3163422131147541, + "grad_norm": 5.650265216827393, + "learning_rate": 1.4549666167052733e-05, + "loss": 0.595, + "step": 2470 + }, + { + "epoch": 0.3176229508196721, + "grad_norm": 1.0849229097366333, + "learning_rate": 1.4522414497887998e-05, + "loss": 0.6437, + "step": 2480 + }, + { + "epoch": 0.31890368852459017, + "grad_norm": 25.959819793701172, + "learning_rate": 1.4495162828723262e-05, + "loss": 0.6475, + "step": 2490 + }, + { + "epoch": 0.3201844262295082, + "grad_norm": 0.33578041195869446, + "learning_rate": 1.4467911159558523e-05, + "loss": 0.7596, + "step": 2500 + }, + { + "epoch": 0.32146516393442626, + "grad_norm": 3.292280673980713, + "learning_rate": 1.4440659490393787e-05, + "loss": 0.6943, + "step": 2510 + }, + { + "epoch": 0.32274590163934425, + "grad_norm": 12.404119491577148, + "learning_rate": 1.4413407821229052e-05, + "loss": 0.8402, + "step": 2520 + }, + { + "epoch": 0.3240266393442623, + "grad_norm": 23.83495330810547, + "learning_rate": 1.4386156152064314e-05, + "loss": 0.8554, + "step": 2530 + }, + { + "epoch": 0.32530737704918034, + "grad_norm": 13.377483367919922, + "learning_rate": 1.4358904482899578e-05, + "loss": 0.7793, + "step": 2540 + }, + { + "epoch": 0.3265881147540984, + "grad_norm": 73.42285919189453, + "learning_rate": 1.4331652813734843e-05, + "loss": 0.4669, + "step": 2550 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 40.041080474853516, + "learning_rate": 1.4304401144570105e-05, + "loss": 0.721, + "step": 2560 + }, + { + "epoch": 0.3291495901639344, + "grad_norm": 0.547555148601532, + "learning_rate": 1.427714947540537e-05, + "loss": 0.4564, + "step": 2570 + }, + { + "epoch": 0.33043032786885246, + "grad_norm": 1.5647186040878296, + "learning_rate": 1.4249897806240634e-05, + "loss": 0.4615, + "step": 2580 + }, + { + "epoch": 0.3317110655737705, + "grad_norm": 35.03215789794922, + "learning_rate": 1.4222646137075897e-05, + "loss": 0.8154, + "step": 2590 + }, + { + "epoch": 0.33299180327868855, + "grad_norm": 2.925804615020752, + "learning_rate": 1.4195394467911161e-05, + "loss": 0.4514, + "step": 2600 + }, + { + "epoch": 0.33427254098360654, + "grad_norm": 67.04120635986328, + "learning_rate": 1.4168142798746425e-05, + "loss": 0.3808, + "step": 2610 + }, + { + "epoch": 0.3355532786885246, + "grad_norm": 44.40123748779297, + "learning_rate": 1.4140891129581688e-05, + "loss": 0.4096, + "step": 2620 + }, + { + "epoch": 0.3368340163934426, + "grad_norm": 0.8358442187309265, + "learning_rate": 1.4113639460416952e-05, + "loss": 0.5851, + "step": 2630 + }, + { + "epoch": 0.33811475409836067, + "grad_norm": 0.4117409884929657, + "learning_rate": 1.4086387791252217e-05, + "loss": 0.4103, + "step": 2640 + }, + { + "epoch": 0.3393954918032787, + "grad_norm": 34.275489807128906, + "learning_rate": 1.405913612208748e-05, + "loss": 0.5521, + "step": 2650 + }, + { + "epoch": 0.3406762295081967, + "grad_norm": 0.12396706640720367, + "learning_rate": 1.4031884452922742e-05, + "loss": 0.1471, + "step": 2660 + }, + { + "epoch": 0.34195696721311475, + "grad_norm": 26.100513458251953, + "learning_rate": 1.4004632783758006e-05, + "loss": 0.6545, + "step": 2670 + }, + { + "epoch": 0.3432377049180328, + "grad_norm": 1.4054203033447266, + "learning_rate": 1.3977381114593269e-05, + "loss": 0.3345, + "step": 2680 + }, + { + "epoch": 0.34451844262295084, + "grad_norm": 18.780344009399414, + "learning_rate": 1.3950129445428533e-05, + "loss": 0.5751, + "step": 2690 + }, + { + "epoch": 0.3457991803278688, + "grad_norm": 2.4345474243164062, + "learning_rate": 1.3922877776263797e-05, + "loss": 0.518, + "step": 2700 + }, + { + "epoch": 0.34707991803278687, + "grad_norm": 153.76368713378906, + "learning_rate": 1.389562610709906e-05, + "loss": 0.9605, + "step": 2710 + }, + { + "epoch": 0.3483606557377049, + "grad_norm": 23.214303970336914, + "learning_rate": 1.3868374437934324e-05, + "loss": 0.5311, + "step": 2720 + }, + { + "epoch": 0.34964139344262296, + "grad_norm": 3.1090455055236816, + "learning_rate": 1.3841122768769589e-05, + "loss": 0.7492, + "step": 2730 + }, + { + "epoch": 0.350922131147541, + "grad_norm": 18.95741081237793, + "learning_rate": 1.3813871099604851e-05, + "loss": 0.8132, + "step": 2740 + }, + { + "epoch": 0.352202868852459, + "grad_norm": 35.78852081298828, + "learning_rate": 1.3786619430440116e-05, + "loss": 0.4757, + "step": 2750 + }, + { + "epoch": 0.35348360655737704, + "grad_norm": 0.2885892391204834, + "learning_rate": 1.375936776127538e-05, + "loss": 0.4375, + "step": 2760 + }, + { + "epoch": 0.3547643442622951, + "grad_norm": 32.26221466064453, + "learning_rate": 1.3732116092110643e-05, + "loss": 0.7023, + "step": 2770 + }, + { + "epoch": 0.35604508196721313, + "grad_norm": 23.65122413635254, + "learning_rate": 1.3704864422945907e-05, + "loss": 0.1926, + "step": 2780 + }, + { + "epoch": 0.3573258196721312, + "grad_norm": 22.145179748535156, + "learning_rate": 1.3677612753781171e-05, + "loss": 0.7968, + "step": 2790 + }, + { + "epoch": 0.35860655737704916, + "grad_norm": 15.272971153259277, + "learning_rate": 1.3650361084616435e-05, + "loss": 0.4302, + "step": 2800 + }, + { + "epoch": 0.3598872950819672, + "grad_norm": 69.59125518798828, + "learning_rate": 1.3623109415451698e-05, + "loss": 0.5592, + "step": 2810 + }, + { + "epoch": 0.36116803278688525, + "grad_norm": 0.19557702541351318, + "learning_rate": 1.359585774628696e-05, + "loss": 0.5795, + "step": 2820 + }, + { + "epoch": 0.3624487704918033, + "grad_norm": 28.615272521972656, + "learning_rate": 1.3568606077122223e-05, + "loss": 0.517, + "step": 2830 + }, + { + "epoch": 0.3637295081967213, + "grad_norm": 4.395263195037842, + "learning_rate": 1.3541354407957488e-05, + "loss": 0.6146, + "step": 2840 + }, + { + "epoch": 0.36501024590163933, + "grad_norm": 0.7227006554603577, + "learning_rate": 1.3514102738792752e-05, + "loss": 0.5504, + "step": 2850 + }, + { + "epoch": 0.3662909836065574, + "grad_norm": 15.734036445617676, + "learning_rate": 1.3486851069628015e-05, + "loss": 0.59, + "step": 2860 + }, + { + "epoch": 0.3675717213114754, + "grad_norm": 1.6639937162399292, + "learning_rate": 1.3459599400463279e-05, + "loss": 0.5332, + "step": 2870 + }, + { + "epoch": 0.36885245901639346, + "grad_norm": 0.7330634593963623, + "learning_rate": 1.3432347731298543e-05, + "loss": 0.4509, + "step": 2880 + }, + { + "epoch": 0.37013319672131145, + "grad_norm": 42.200531005859375, + "learning_rate": 1.3405096062133806e-05, + "loss": 0.5252, + "step": 2890 + }, + { + "epoch": 0.3714139344262295, + "grad_norm": 65.34646606445312, + "learning_rate": 1.337784439296907e-05, + "loss": 0.4143, + "step": 2900 + }, + { + "epoch": 0.37269467213114754, + "grad_norm": 0.17863045632839203, + "learning_rate": 1.3350592723804335e-05, + "loss": 0.4167, + "step": 2910 + }, + { + "epoch": 0.3739754098360656, + "grad_norm": 28.605680465698242, + "learning_rate": 1.3323341054639599e-05, + "loss": 0.6769, + "step": 2920 + }, + { + "epoch": 0.37525614754098363, + "grad_norm": 0.0853688195347786, + "learning_rate": 1.3296089385474861e-05, + "loss": 0.5126, + "step": 2930 + }, + { + "epoch": 0.3765368852459016, + "grad_norm": 63.26204299926758, + "learning_rate": 1.3268837716310126e-05, + "loss": 1.1696, + "step": 2940 + }, + { + "epoch": 0.37781762295081966, + "grad_norm": 45.06633377075195, + "learning_rate": 1.324158604714539e-05, + "loss": 0.6522, + "step": 2950 + }, + { + "epoch": 0.3790983606557377, + "grad_norm": 36.450233459472656, + "learning_rate": 1.3214334377980653e-05, + "loss": 0.91, + "step": 2960 + }, + { + "epoch": 0.38037909836065575, + "grad_norm": 58.59020233154297, + "learning_rate": 1.3187082708815917e-05, + "loss": 0.5549, + "step": 2970 + }, + { + "epoch": 0.38165983606557374, + "grad_norm": 13.287269592285156, + "learning_rate": 1.3159831039651181e-05, + "loss": 0.4198, + "step": 2980 + }, + { + "epoch": 0.3829405737704918, + "grad_norm": 20.24810218811035, + "learning_rate": 1.3132579370486442e-05, + "loss": 0.6179, + "step": 2990 + }, + { + "epoch": 0.38422131147540983, + "grad_norm": 18.099557876586914, + "learning_rate": 1.3105327701321707e-05, + "loss": 0.8484, + "step": 3000 + }, + { + "epoch": 0.3855020491803279, + "grad_norm": 41.92770004272461, + "learning_rate": 1.307807603215697e-05, + "loss": 0.929, + "step": 3010 + }, + { + "epoch": 0.3867827868852459, + "grad_norm": 11.101128578186035, + "learning_rate": 1.3050824362992234e-05, + "loss": 0.6886, + "step": 3020 + }, + { + "epoch": 0.3880635245901639, + "grad_norm": 0.5516038537025452, + "learning_rate": 1.3023572693827498e-05, + "loss": 0.232, + "step": 3030 + }, + { + "epoch": 0.38934426229508196, + "grad_norm": 19.20160675048828, + "learning_rate": 1.299632102466276e-05, + "loss": 0.632, + "step": 3040 + }, + { + "epoch": 0.390625, + "grad_norm": 89.39508056640625, + "learning_rate": 1.2969069355498025e-05, + "loss": 0.5, + "step": 3050 + }, + { + "epoch": 0.39190573770491804, + "grad_norm": 3.156262159347534, + "learning_rate": 1.294181768633329e-05, + "loss": 0.5471, + "step": 3060 + }, + { + "epoch": 0.3931864754098361, + "grad_norm": 1.8074102401733398, + "learning_rate": 1.2914566017168553e-05, + "loss": 0.4993, + "step": 3070 + }, + { + "epoch": 0.3944672131147541, + "grad_norm": 10.57691764831543, + "learning_rate": 1.2887314348003816e-05, + "loss": 0.3536, + "step": 3080 + }, + { + "epoch": 0.3957479508196721, + "grad_norm": 31.425968170166016, + "learning_rate": 1.286006267883908e-05, + "loss": 0.4929, + "step": 3090 + }, + { + "epoch": 0.39702868852459017, + "grad_norm": 1.107421636581421, + "learning_rate": 1.2832811009674345e-05, + "loss": 0.5302, + "step": 3100 + }, + { + "epoch": 0.3983094262295082, + "grad_norm": 31.851308822631836, + "learning_rate": 1.2805559340509607e-05, + "loss": 0.6524, + "step": 3110 + }, + { + "epoch": 0.39959016393442626, + "grad_norm": 33.0150146484375, + "learning_rate": 1.2778307671344872e-05, + "loss": 0.5296, + "step": 3120 + }, + { + "epoch": 0.40087090163934425, + "grad_norm": 60.5539665222168, + "learning_rate": 1.2751056002180136e-05, + "loss": 0.7329, + "step": 3130 + }, + { + "epoch": 0.4021516393442623, + "grad_norm": 26.929574966430664, + "learning_rate": 1.2723804333015399e-05, + "loss": 0.5035, + "step": 3140 + }, + { + "epoch": 0.40343237704918034, + "grad_norm": 28.021299362182617, + "learning_rate": 1.2696552663850661e-05, + "loss": 0.603, + "step": 3150 + }, + { + "epoch": 0.4047131147540984, + "grad_norm": 59.49539566040039, + "learning_rate": 1.2669300994685924e-05, + "loss": 0.5007, + "step": 3160 + }, + { + "epoch": 0.40599385245901637, + "grad_norm": 31.815570831298828, + "learning_rate": 1.2642049325521188e-05, + "loss": 0.4406, + "step": 3170 + }, + { + "epoch": 0.4072745901639344, + "grad_norm": 60.27109146118164, + "learning_rate": 1.2614797656356453e-05, + "loss": 0.5205, + "step": 3180 + }, + { + "epoch": 0.40855532786885246, + "grad_norm": 3.3493058681488037, + "learning_rate": 1.2587545987191717e-05, + "loss": 0.6267, + "step": 3190 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 23.72585678100586, + "learning_rate": 1.256029431802698e-05, + "loss": 0.6263, + "step": 3200 + }, + { + "epoch": 0.41111680327868855, + "grad_norm": 24.219833374023438, + "learning_rate": 1.2533042648862244e-05, + "loss": 0.4589, + "step": 3210 + }, + { + "epoch": 0.41239754098360654, + "grad_norm": 0.2840415835380554, + "learning_rate": 1.2505790979697508e-05, + "loss": 0.3652, + "step": 3220 + }, + { + "epoch": 0.4136782786885246, + "grad_norm": 17.429651260375977, + "learning_rate": 1.247853931053277e-05, + "loss": 0.7563, + "step": 3230 + }, + { + "epoch": 0.4149590163934426, + "grad_norm": 0.5108852386474609, + "learning_rate": 1.2451287641368035e-05, + "loss": 0.3132, + "step": 3240 + }, + { + "epoch": 0.41623975409836067, + "grad_norm": 50.98451614379883, + "learning_rate": 1.24240359722033e-05, + "loss": 0.5077, + "step": 3250 + }, + { + "epoch": 0.4175204918032787, + "grad_norm": 1.3974177837371826, + "learning_rate": 1.2396784303038562e-05, + "loss": 0.4276, + "step": 3260 + }, + { + "epoch": 0.4188012295081967, + "grad_norm": 12.84176254272461, + "learning_rate": 1.2369532633873826e-05, + "loss": 0.5647, + "step": 3270 + }, + { + "epoch": 0.42008196721311475, + "grad_norm": 21.05103302001953, + "learning_rate": 1.234228096470909e-05, + "loss": 0.1421, + "step": 3280 + }, + { + "epoch": 0.4213627049180328, + "grad_norm": 0.3647187352180481, + "learning_rate": 1.2315029295544353e-05, + "loss": 0.6179, + "step": 3290 + }, + { + "epoch": 0.42264344262295084, + "grad_norm": 90.5313720703125, + "learning_rate": 1.2287777626379618e-05, + "loss": 0.8233, + "step": 3300 + }, + { + "epoch": 0.4239241803278688, + "grad_norm": 19.75844955444336, + "learning_rate": 1.226052595721488e-05, + "loss": 0.5078, + "step": 3310 + }, + { + "epoch": 0.42520491803278687, + "grad_norm": 0.42248353362083435, + "learning_rate": 1.2233274288050143e-05, + "loss": 0.3436, + "step": 3320 + }, + { + "epoch": 0.4264856557377049, + "grad_norm": 59.313232421875, + "learning_rate": 1.2206022618885407e-05, + "loss": 0.5244, + "step": 3330 + }, + { + "epoch": 0.42776639344262296, + "grad_norm": 14.109567642211914, + "learning_rate": 1.2178770949720671e-05, + "loss": 0.7947, + "step": 3340 + }, + { + "epoch": 0.429047131147541, + "grad_norm": 16.229310989379883, + "learning_rate": 1.2151519280555934e-05, + "loss": 0.5386, + "step": 3350 + }, + { + "epoch": 0.430327868852459, + "grad_norm": 25.23029136657715, + "learning_rate": 1.2124267611391198e-05, + "loss": 0.5719, + "step": 3360 + }, + { + "epoch": 0.43160860655737704, + "grad_norm": 1.4985939264297485, + "learning_rate": 1.2097015942226463e-05, + "loss": 0.3424, + "step": 3370 + }, + { + "epoch": 0.4328893442622951, + "grad_norm": 24.808349609375, + "learning_rate": 1.2069764273061725e-05, + "loss": 0.8804, + "step": 3380 + }, + { + "epoch": 0.43417008196721313, + "grad_norm": 30.150056838989258, + "learning_rate": 1.204251260389699e-05, + "loss": 0.3999, + "step": 3390 + }, + { + "epoch": 0.4354508196721312, + "grad_norm": 59.782325744628906, + "learning_rate": 1.2015260934732254e-05, + "loss": 0.4612, + "step": 3400 + }, + { + "epoch": 0.43673155737704916, + "grad_norm": 55.766117095947266, + "learning_rate": 1.1988009265567517e-05, + "loss": 0.3971, + "step": 3410 + }, + { + "epoch": 0.4380122950819672, + "grad_norm": 69.8100814819336, + "learning_rate": 1.1960757596402781e-05, + "loss": 0.5867, + "step": 3420 + }, + { + "epoch": 0.43929303278688525, + "grad_norm": 24.89929962158203, + "learning_rate": 1.1933505927238045e-05, + "loss": 0.793, + "step": 3430 + }, + { + "epoch": 0.4405737704918033, + "grad_norm": 21.96668243408203, + "learning_rate": 1.1906254258073308e-05, + "loss": 0.8675, + "step": 3440 + }, + { + "epoch": 0.4418545081967213, + "grad_norm": 59.37974548339844, + "learning_rate": 1.1879002588908572e-05, + "loss": 0.5162, + "step": 3450 + }, + { + "epoch": 0.44313524590163933, + "grad_norm": 0.49646639823913574, + "learning_rate": 1.1851750919743837e-05, + "loss": 0.6127, + "step": 3460 + }, + { + "epoch": 0.4444159836065574, + "grad_norm": 8.308236122131348, + "learning_rate": 1.1824499250579097e-05, + "loss": 0.6185, + "step": 3470 + }, + { + "epoch": 0.4456967213114754, + "grad_norm": 2.5998694896698, + "learning_rate": 1.1797247581414362e-05, + "loss": 0.5386, + "step": 3480 + }, + { + "epoch": 0.44697745901639346, + "grad_norm": 39.297706604003906, + "learning_rate": 1.1769995912249626e-05, + "loss": 0.7987, + "step": 3490 + }, + { + "epoch": 0.44825819672131145, + "grad_norm": 7.121617794036865, + "learning_rate": 1.1742744243084889e-05, + "loss": 0.8963, + "step": 3500 + }, + { + "epoch": 0.4495389344262295, + "grad_norm": 1.0637052059173584, + "learning_rate": 1.1715492573920153e-05, + "loss": 0.3344, + "step": 3510 + }, + { + "epoch": 0.45081967213114754, + "grad_norm": 65.03225708007812, + "learning_rate": 1.1688240904755417e-05, + "loss": 0.6484, + "step": 3520 + }, + { + "epoch": 0.4521004098360656, + "grad_norm": 0.4046671986579895, + "learning_rate": 1.166098923559068e-05, + "loss": 0.2954, + "step": 3530 + }, + { + "epoch": 0.45338114754098363, + "grad_norm": 10.253545761108398, + "learning_rate": 1.1633737566425944e-05, + "loss": 0.1934, + "step": 3540 + }, + { + "epoch": 0.4546618852459016, + "grad_norm": 99.9068832397461, + "learning_rate": 1.1606485897261209e-05, + "loss": 0.7702, + "step": 3550 + }, + { + "epoch": 0.45594262295081966, + "grad_norm": 58.01685333251953, + "learning_rate": 1.1579234228096471e-05, + "loss": 0.5098, + "step": 3560 + }, + { + "epoch": 0.4572233606557377, + "grad_norm": 116.0182876586914, + "learning_rate": 1.1551982558931736e-05, + "loss": 0.7526, + "step": 3570 + }, + { + "epoch": 0.45850409836065575, + "grad_norm": 0.7602908611297607, + "learning_rate": 1.1524730889767e-05, + "loss": 0.3513, + "step": 3580 + }, + { + "epoch": 0.45978483606557374, + "grad_norm": 23.507183074951172, + "learning_rate": 1.1497479220602264e-05, + "loss": 0.5627, + "step": 3590 + }, + { + "epoch": 0.4610655737704918, + "grad_norm": 0.25320929288864136, + "learning_rate": 1.1470227551437527e-05, + "loss": 0.7757, + "step": 3600 + }, + { + "epoch": 0.46234631147540983, + "grad_norm": 2.4358434677124023, + "learning_rate": 1.1442975882272791e-05, + "loss": 0.5189, + "step": 3610 + }, + { + "epoch": 0.4636270491803279, + "grad_norm": 3.7247753143310547, + "learning_rate": 1.1415724213108055e-05, + "loss": 0.5093, + "step": 3620 + }, + { + "epoch": 0.4649077868852459, + "grad_norm": 39.57719421386719, + "learning_rate": 1.1388472543943316e-05, + "loss": 0.7375, + "step": 3630 + }, + { + "epoch": 0.4661885245901639, + "grad_norm": 68.47445678710938, + "learning_rate": 1.136122087477858e-05, + "loss": 0.5694, + "step": 3640 + }, + { + "epoch": 0.46746926229508196, + "grad_norm": 18.36240577697754, + "learning_rate": 1.1333969205613843e-05, + "loss": 0.668, + "step": 3650 + }, + { + "epoch": 0.46875, + "grad_norm": 38.88651657104492, + "learning_rate": 1.1306717536449108e-05, + "loss": 0.6662, + "step": 3660 + }, + { + "epoch": 0.47003073770491804, + "grad_norm": 22.401813507080078, + "learning_rate": 1.1279465867284372e-05, + "loss": 0.7211, + "step": 3670 + }, + { + "epoch": 0.4713114754098361, + "grad_norm": 0.3502928912639618, + "learning_rate": 1.1252214198119635e-05, + "loss": 0.4916, + "step": 3680 + }, + { + "epoch": 0.4725922131147541, + "grad_norm": 4.397254467010498, + "learning_rate": 1.1224962528954899e-05, + "loss": 0.7776, + "step": 3690 + }, + { + "epoch": 0.4738729508196721, + "grad_norm": 3.871940851211548, + "learning_rate": 1.1197710859790163e-05, + "loss": 0.707, + "step": 3700 + }, + { + "epoch": 0.47515368852459017, + "grad_norm": 33.0516242980957, + "learning_rate": 1.1170459190625428e-05, + "loss": 0.383, + "step": 3710 + }, + { + "epoch": 0.4764344262295082, + "grad_norm": 26.215961456298828, + "learning_rate": 1.114320752146069e-05, + "loss": 0.4088, + "step": 3720 + }, + { + "epoch": 0.47771516393442626, + "grad_norm": 32.82633972167969, + "learning_rate": 1.1115955852295954e-05, + "loss": 0.4577, + "step": 3730 + }, + { + "epoch": 0.47899590163934425, + "grad_norm": 186.45492553710938, + "learning_rate": 1.1088704183131219e-05, + "loss": 0.4099, + "step": 3740 + }, + { + "epoch": 0.4802766393442623, + "grad_norm": 129.46585083007812, + "learning_rate": 1.1061452513966481e-05, + "loss": 0.6375, + "step": 3750 + }, + { + "epoch": 0.48155737704918034, + "grad_norm": 0.7614141702651978, + "learning_rate": 1.1034200844801746e-05, + "loss": 0.5316, + "step": 3760 + }, + { + "epoch": 0.4828381147540984, + "grad_norm": 34.36369323730469, + "learning_rate": 1.100694917563701e-05, + "loss": 0.7375, + "step": 3770 + }, + { + "epoch": 0.48411885245901637, + "grad_norm": 0.140080064535141, + "learning_rate": 1.0979697506472273e-05, + "loss": 0.6295, + "step": 3780 + }, + { + "epoch": 0.4853995901639344, + "grad_norm": 7.306354999542236, + "learning_rate": 1.0952445837307537e-05, + "loss": 0.9806, + "step": 3790 + }, + { + "epoch": 0.48668032786885246, + "grad_norm": 85.7445068359375, + "learning_rate": 1.0925194168142798e-05, + "loss": 0.7028, + "step": 3800 + }, + { + "epoch": 0.4879610655737705, + "grad_norm": 1.7156010866165161, + "learning_rate": 1.0897942498978062e-05, + "loss": 0.3287, + "step": 3810 + }, + { + "epoch": 0.48924180327868855, + "grad_norm": 4.566237926483154, + "learning_rate": 1.0870690829813327e-05, + "loss": 0.4033, + "step": 3820 + }, + { + "epoch": 0.49052254098360654, + "grad_norm": 46.89541244506836, + "learning_rate": 1.084343916064859e-05, + "loss": 0.823, + "step": 3830 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 12.144411087036133, + "learning_rate": 1.0816187491483854e-05, + "loss": 0.5362, + "step": 3840 + }, + { + "epoch": 0.4930840163934426, + "grad_norm": 18.448686599731445, + "learning_rate": 1.0788935822319118e-05, + "loss": 0.4256, + "step": 3850 + }, + { + "epoch": 0.49436475409836067, + "grad_norm": 0.24063384532928467, + "learning_rate": 1.0761684153154382e-05, + "loss": 0.2343, + "step": 3860 + }, + { + "epoch": 0.4956454918032787, + "grad_norm": 65.46757507324219, + "learning_rate": 1.0734432483989645e-05, + "loss": 0.5689, + "step": 3870 + }, + { + "epoch": 0.4969262295081967, + "grad_norm": 11.055042266845703, + "learning_rate": 1.0707180814824909e-05, + "loss": 0.6077, + "step": 3880 + }, + { + "epoch": 0.49820696721311475, + "grad_norm": 0.7104390263557434, + "learning_rate": 1.0679929145660173e-05, + "loss": 0.4516, + "step": 3890 + }, + { + "epoch": 0.4994877049180328, + "grad_norm": 33.67184066772461, + "learning_rate": 1.0652677476495436e-05, + "loss": 0.895, + "step": 3900 + }, + { + "epoch": 0.5007684426229508, + "grad_norm": 2.971726417541504, + "learning_rate": 1.06254258073307e-05, + "loss": 0.6966, + "step": 3910 + }, + { + "epoch": 0.5020491803278688, + "grad_norm": 0.6927921772003174, + "learning_rate": 1.0598174138165965e-05, + "loss": 0.6475, + "step": 3920 + }, + { + "epoch": 0.5033299180327869, + "grad_norm": 18.608713150024414, + "learning_rate": 1.0570922469001227e-05, + "loss": 0.6699, + "step": 3930 + }, + { + "epoch": 0.5046106557377049, + "grad_norm": 4.135254859924316, + "learning_rate": 1.0543670799836492e-05, + "loss": 0.4654, + "step": 3940 + }, + { + "epoch": 0.5058913934426229, + "grad_norm": 18.821929931640625, + "learning_rate": 1.0516419130671756e-05, + "loss": 0.8085, + "step": 3950 + }, + { + "epoch": 0.507172131147541, + "grad_norm": 45.03554916381836, + "learning_rate": 1.0489167461507017e-05, + "loss": 0.4745, + "step": 3960 + }, + { + "epoch": 0.508452868852459, + "grad_norm": 170.6229705810547, + "learning_rate": 1.0461915792342281e-05, + "loss": 0.6601, + "step": 3970 + }, + { + "epoch": 0.5097336065573771, + "grad_norm": 23.49982261657715, + "learning_rate": 1.0434664123177546e-05, + "loss": 0.379, + "step": 3980 + }, + { + "epoch": 0.5110143442622951, + "grad_norm": 2.7527880668640137, + "learning_rate": 1.0407412454012808e-05, + "loss": 0.427, + "step": 3990 + }, + { + "epoch": 0.5122950819672131, + "grad_norm": 0.854061484336853, + "learning_rate": 1.0380160784848072e-05, + "loss": 0.8099, + "step": 4000 + }, + { + "epoch": 0.5135758196721312, + "grad_norm": 77.43912506103516, + "learning_rate": 1.0352909115683337e-05, + "loss": 0.2461, + "step": 4010 + }, + { + "epoch": 0.5148565573770492, + "grad_norm": 0.2251901924610138, + "learning_rate": 1.03256574465186e-05, + "loss": 0.6852, + "step": 4020 + }, + { + "epoch": 0.5161372950819673, + "grad_norm": 0.30500558018684387, + "learning_rate": 1.0298405777353864e-05, + "loss": 0.2133, + "step": 4030 + }, + { + "epoch": 0.5174180327868853, + "grad_norm": 258.4718017578125, + "learning_rate": 1.0271154108189128e-05, + "loss": 0.6098, + "step": 4040 + }, + { + "epoch": 0.5186987704918032, + "grad_norm": 38.535884857177734, + "learning_rate": 1.024390243902439e-05, + "loss": 1.0855, + "step": 4050 + }, + { + "epoch": 0.5199795081967213, + "grad_norm": 13.258109092712402, + "learning_rate": 1.0216650769859655e-05, + "loss": 0.8256, + "step": 4060 + }, + { + "epoch": 0.5212602459016393, + "grad_norm": 45.26698684692383, + "learning_rate": 1.018939910069492e-05, + "loss": 0.579, + "step": 4070 + }, + { + "epoch": 0.5225409836065574, + "grad_norm": 9.766562461853027, + "learning_rate": 1.0162147431530182e-05, + "loss": 0.3964, + "step": 4080 + }, + { + "epoch": 0.5238217213114754, + "grad_norm": 12.843767166137695, + "learning_rate": 1.0134895762365446e-05, + "loss": 0.5889, + "step": 4090 + }, + { + "epoch": 0.5251024590163934, + "grad_norm": 10.034939765930176, + "learning_rate": 1.010764409320071e-05, + "loss": 0.5689, + "step": 4100 + }, + { + "epoch": 0.5263831967213115, + "grad_norm": 18.635753631591797, + "learning_rate": 1.0080392424035975e-05, + "loss": 0.3298, + "step": 4110 + }, + { + "epoch": 0.5276639344262295, + "grad_norm": 6.539854049682617, + "learning_rate": 1.0053140754871236e-05, + "loss": 0.8252, + "step": 4120 + }, + { + "epoch": 0.5289446721311475, + "grad_norm": 19.9680118560791, + "learning_rate": 1.00258890857065e-05, + "loss": 0.5432, + "step": 4130 + }, + { + "epoch": 0.5302254098360656, + "grad_norm": 38.84269714355469, + "learning_rate": 9.998637416541764e-06, + "loss": 1.0371, + "step": 4140 + }, + { + "epoch": 0.5315061475409836, + "grad_norm": 8.018956184387207, + "learning_rate": 9.971385747377029e-06, + "loss": 1.1152, + "step": 4150 + }, + { + "epoch": 0.5327868852459017, + "grad_norm": 1.0766541957855225, + "learning_rate": 9.944134078212291e-06, + "loss": 0.5306, + "step": 4160 + }, + { + "epoch": 0.5340676229508197, + "grad_norm": 0.5119646787643433, + "learning_rate": 9.916882409047554e-06, + "loss": 0.4113, + "step": 4170 + }, + { + "epoch": 0.5353483606557377, + "grad_norm": 60.749359130859375, + "learning_rate": 9.889630739882818e-06, + "loss": 0.4195, + "step": 4180 + }, + { + "epoch": 0.5366290983606558, + "grad_norm": 65.9601058959961, + "learning_rate": 9.862379070718083e-06, + "loss": 0.3612, + "step": 4190 + }, + { + "epoch": 0.5379098360655737, + "grad_norm": 10.21090316772461, + "learning_rate": 9.835127401553345e-06, + "loss": 0.6783, + "step": 4200 + }, + { + "epoch": 0.5391905737704918, + "grad_norm": 1.9999886751174927, + "learning_rate": 9.80787573238861e-06, + "loss": 0.1912, + "step": 4210 + }, + { + "epoch": 0.5404713114754098, + "grad_norm": 1.5724451541900635, + "learning_rate": 9.780624063223874e-06, + "loss": 0.5334, + "step": 4220 + }, + { + "epoch": 0.5417520491803278, + "grad_norm": 64.44462585449219, + "learning_rate": 9.753372394059137e-06, + "loss": 0.6993, + "step": 4230 + }, + { + "epoch": 0.5430327868852459, + "grad_norm": 61.30992126464844, + "learning_rate": 9.726120724894401e-06, + "loss": 0.4775, + "step": 4240 + }, + { + "epoch": 0.5443135245901639, + "grad_norm": 0.6172360777854919, + "learning_rate": 9.698869055729663e-06, + "loss": 0.829, + "step": 4250 + }, + { + "epoch": 0.545594262295082, + "grad_norm": 73.66020202636719, + "learning_rate": 9.671617386564928e-06, + "loss": 0.6753, + "step": 4260 + }, + { + "epoch": 0.546875, + "grad_norm": 14.051901817321777, + "learning_rate": 9.644365717400192e-06, + "loss": 0.9101, + "step": 4270 + }, + { + "epoch": 0.548155737704918, + "grad_norm": 8.695210456848145, + "learning_rate": 9.617114048235455e-06, + "loss": 0.3771, + "step": 4280 + }, + { + "epoch": 0.5494364754098361, + "grad_norm": 0.41656801104545593, + "learning_rate": 9.589862379070719e-06, + "loss": 0.498, + "step": 4290 + }, + { + "epoch": 0.5507172131147541, + "grad_norm": 0.6697580814361572, + "learning_rate": 9.562610709905983e-06, + "loss": 0.6485, + "step": 4300 + }, + { + "epoch": 0.5519979508196722, + "grad_norm": 7.877650260925293, + "learning_rate": 9.535359040741246e-06, + "loss": 0.6239, + "step": 4310 + }, + { + "epoch": 0.5532786885245902, + "grad_norm": 7.576630115509033, + "learning_rate": 9.508107371576509e-06, + "loss": 0.9483, + "step": 4320 + }, + { + "epoch": 0.5545594262295082, + "grad_norm": 21.719369888305664, + "learning_rate": 9.480855702411773e-06, + "loss": 0.4436, + "step": 4330 + }, + { + "epoch": 0.5558401639344263, + "grad_norm": 21.08763885498047, + "learning_rate": 9.453604033247037e-06, + "loss": 0.4597, + "step": 4340 + }, + { + "epoch": 0.5571209016393442, + "grad_norm": 18.030412673950195, + "learning_rate": 9.4263523640823e-06, + "loss": 0.77, + "step": 4350 + }, + { + "epoch": 0.5584016393442623, + "grad_norm": 18.394670486450195, + "learning_rate": 9.399100694917564e-06, + "loss": 0.2567, + "step": 4360 + }, + { + "epoch": 0.5596823770491803, + "grad_norm": 9.325862884521484, + "learning_rate": 9.371849025752829e-06, + "loss": 0.514, + "step": 4370 + }, + { + "epoch": 0.5609631147540983, + "grad_norm": 0.574291467666626, + "learning_rate": 9.344597356588093e-06, + "loss": 0.4198, + "step": 4380 + }, + { + "epoch": 0.5622438524590164, + "grad_norm": 0.7731497883796692, + "learning_rate": 9.317345687423355e-06, + "loss": 0.5641, + "step": 4390 + }, + { + "epoch": 0.5635245901639344, + "grad_norm": 10.017977714538574, + "learning_rate": 9.290094018258618e-06, + "loss": 0.4151, + "step": 4400 + }, + { + "epoch": 0.5648053278688525, + "grad_norm": 9.897109031677246, + "learning_rate": 9.262842349093882e-06, + "loss": 0.5924, + "step": 4410 + }, + { + "epoch": 0.5660860655737705, + "grad_norm": 0.5375286936759949, + "learning_rate": 9.235590679929147e-06, + "loss": 0.2296, + "step": 4420 + }, + { + "epoch": 0.5673668032786885, + "grad_norm": 15.379401206970215, + "learning_rate": 9.20833901076441e-06, + "loss": 0.3977, + "step": 4430 + }, + { + "epoch": 0.5686475409836066, + "grad_norm": 33.24900436401367, + "learning_rate": 9.181087341599674e-06, + "loss": 0.6875, + "step": 4440 + }, + { + "epoch": 0.5699282786885246, + "grad_norm": 17.754283905029297, + "learning_rate": 9.153835672434938e-06, + "loss": 0.7589, + "step": 4450 + }, + { + "epoch": 0.5712090163934426, + "grad_norm": 9.958178520202637, + "learning_rate": 9.1265840032702e-06, + "loss": 1.0191, + "step": 4460 + }, + { + "epoch": 0.5724897540983607, + "grad_norm": 0.880713701248169, + "learning_rate": 9.099332334105465e-06, + "loss": 0.3109, + "step": 4470 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 19.9377498626709, + "learning_rate": 9.072080664940728e-06, + "loss": 1.0805, + "step": 4480 + }, + { + "epoch": 0.5750512295081968, + "grad_norm": 45.10675811767578, + "learning_rate": 9.044828995775992e-06, + "loss": 0.485, + "step": 4490 + }, + { + "epoch": 0.5763319672131147, + "grad_norm": 2.320873498916626, + "learning_rate": 9.017577326611256e-06, + "loss": 0.7725, + "step": 4500 + }, + { + "epoch": 0.5776127049180327, + "grad_norm": 8.428739547729492, + "learning_rate": 8.990325657446519e-06, + "loss": 0.9644, + "step": 4510 + }, + { + "epoch": 0.5788934426229508, + "grad_norm": 31.189594268798828, + "learning_rate": 8.963073988281783e-06, + "loss": 1.0522, + "step": 4520 + }, + { + "epoch": 0.5801741803278688, + "grad_norm": 3.0397989749908447, + "learning_rate": 8.935822319117047e-06, + "loss": 0.7336, + "step": 4530 + }, + { + "epoch": 0.5814549180327869, + "grad_norm": 28.37086296081543, + "learning_rate": 8.90857064995231e-06, + "loss": 1.0115, + "step": 4540 + }, + { + "epoch": 0.5827356557377049, + "grad_norm": 24.746339797973633, + "learning_rate": 8.881318980787574e-06, + "loss": 0.6436, + "step": 4550 + }, + { + "epoch": 0.5840163934426229, + "grad_norm": 41.54196548461914, + "learning_rate": 8.854067311622839e-06, + "loss": 0.7121, + "step": 4560 + }, + { + "epoch": 0.585297131147541, + "grad_norm": 45.90923309326172, + "learning_rate": 8.826815642458101e-06, + "loss": 0.8686, + "step": 4570 + }, + { + "epoch": 0.586577868852459, + "grad_norm": 16.441612243652344, + "learning_rate": 8.799563973293364e-06, + "loss": 0.8231, + "step": 4580 + }, + { + "epoch": 0.5878586065573771, + "grad_norm": 16.66089630126953, + "learning_rate": 8.772312304128628e-06, + "loss": 0.5949, + "step": 4590 + }, + { + "epoch": 0.5891393442622951, + "grad_norm": 23.114477157592773, + "learning_rate": 8.745060634963893e-06, + "loss": 0.8395, + "step": 4600 + }, + { + "epoch": 0.5904200819672131, + "grad_norm": 22.976099014282227, + "learning_rate": 8.717808965799155e-06, + "loss": 0.8844, + "step": 4610 + }, + { + "epoch": 0.5917008196721312, + "grad_norm": 12.82754898071289, + "learning_rate": 8.69055729663442e-06, + "loss": 0.8147, + "step": 4620 + }, + { + "epoch": 0.5929815573770492, + "grad_norm": 43.78225326538086, + "learning_rate": 8.663305627469684e-06, + "loss": 0.7544, + "step": 4630 + }, + { + "epoch": 0.5942622950819673, + "grad_norm": 19.483823776245117, + "learning_rate": 8.636053958304948e-06, + "loss": 0.6818, + "step": 4640 + }, + { + "epoch": 0.5955430327868853, + "grad_norm": 8.231918334960938, + "learning_rate": 8.60880228914021e-06, + "loss": 0.4572, + "step": 4650 + }, + { + "epoch": 0.5968237704918032, + "grad_norm": 8.501511573791504, + "learning_rate": 8.581550619975473e-06, + "loss": 0.7072, + "step": 4660 + }, + { + "epoch": 0.5981045081967213, + "grad_norm": 28.3646297454834, + "learning_rate": 8.554298950810738e-06, + "loss": 0.7487, + "step": 4670 + }, + { + "epoch": 0.5993852459016393, + "grad_norm": 10.618340492248535, + "learning_rate": 8.527047281646002e-06, + "loss": 0.4582, + "step": 4680 + }, + { + "epoch": 0.6006659836065574, + "grad_norm": 48.34426498413086, + "learning_rate": 8.499795612481265e-06, + "loss": 0.743, + "step": 4690 + }, + { + "epoch": 0.6019467213114754, + "grad_norm": 19.851808547973633, + "learning_rate": 8.472543943316529e-06, + "loss": 1.0467, + "step": 4700 + }, + { + "epoch": 0.6032274590163934, + "grad_norm": 14.339262962341309, + "learning_rate": 8.445292274151793e-06, + "loss": 0.7851, + "step": 4710 + }, + { + "epoch": 0.6045081967213115, + "grad_norm": 28.62281608581543, + "learning_rate": 8.418040604987056e-06, + "loss": 0.7838, + "step": 4720 + }, + { + "epoch": 0.6057889344262295, + "grad_norm": 19.882169723510742, + "learning_rate": 8.390788935822319e-06, + "loss": 0.8792, + "step": 4730 + }, + { + "epoch": 0.6070696721311475, + "grad_norm": 6.609494686126709, + "learning_rate": 8.363537266657583e-06, + "loss": 0.4227, + "step": 4740 + }, + { + "epoch": 0.6083504098360656, + "grad_norm": 15.172801971435547, + "learning_rate": 8.336285597492847e-06, + "loss": 0.8444, + "step": 4750 + }, + { + "epoch": 0.6096311475409836, + "grad_norm": 46.852413177490234, + "learning_rate": 8.309033928328112e-06, + "loss": 0.5798, + "step": 4760 + }, + { + "epoch": 0.6109118852459017, + "grad_norm": 41.42491912841797, + "learning_rate": 8.281782259163374e-06, + "loss": 0.8357, + "step": 4770 + }, + { + "epoch": 0.6121926229508197, + "grad_norm": 19.07272720336914, + "learning_rate": 8.254530589998639e-06, + "loss": 0.7716, + "step": 4780 + }, + { + "epoch": 0.6134733606557377, + "grad_norm": 6.932359218597412, + "learning_rate": 8.227278920833903e-06, + "loss": 1.1022, + "step": 4790 + }, + { + "epoch": 0.6147540983606558, + "grad_norm": 39.23098373413086, + "learning_rate": 8.200027251669165e-06, + "loss": 0.7454, + "step": 4800 + }, + { + "epoch": 0.6160348360655737, + "grad_norm": 25.000465393066406, + "learning_rate": 8.172775582504428e-06, + "loss": 0.6045, + "step": 4810 + }, + { + "epoch": 0.6173155737704918, + "grad_norm": 16.970958709716797, + "learning_rate": 8.145523913339692e-06, + "loss": 0.8267, + "step": 4820 + }, + { + "epoch": 0.6185963114754098, + "grad_norm": 15.70919132232666, + "learning_rate": 8.118272244174957e-06, + "loss": 0.6148, + "step": 4830 + }, + { + "epoch": 0.6198770491803278, + "grad_norm": 25.68458366394043, + "learning_rate": 8.09102057501022e-06, + "loss": 0.6463, + "step": 4840 + }, + { + "epoch": 0.6211577868852459, + "grad_norm": 13.340360641479492, + "learning_rate": 8.063768905845484e-06, + "loss": 0.645, + "step": 4850 + }, + { + "epoch": 0.6224385245901639, + "grad_norm": 124.80747985839844, + "learning_rate": 8.036517236680748e-06, + "loss": 0.5991, + "step": 4860 + }, + { + "epoch": 0.623719262295082, + "grad_norm": 20.25383186340332, + "learning_rate": 8.00926556751601e-06, + "loss": 0.5449, + "step": 4870 + }, + { + "epoch": 0.625, + "grad_norm": 19.14507484436035, + "learning_rate": 7.982013898351275e-06, + "loss": 0.8269, + "step": 4880 + }, + { + "epoch": 0.626280737704918, + "grad_norm": 15.882426261901855, + "learning_rate": 7.954762229186538e-06, + "loss": 0.8284, + "step": 4890 + }, + { + "epoch": 0.6275614754098361, + "grad_norm": 22.384090423583984, + "learning_rate": 7.927510560021802e-06, + "loss": 0.6205, + "step": 4900 + }, + { + "epoch": 0.6288422131147541, + "grad_norm": 32.309017181396484, + "learning_rate": 7.900258890857066e-06, + "loss": 1.0006, + "step": 4910 + }, + { + "epoch": 0.6301229508196722, + "grad_norm": 0.9309699535369873, + "learning_rate": 7.873007221692329e-06, + "loss": 0.8793, + "step": 4920 + }, + { + "epoch": 0.6314036885245902, + "grad_norm": 18.254060745239258, + "learning_rate": 7.845755552527593e-06, + "loss": 0.5832, + "step": 4930 + }, + { + "epoch": 0.6326844262295082, + "grad_norm": 1.1032278537750244, + "learning_rate": 7.818503883362857e-06, + "loss": 0.6085, + "step": 4940 + }, + { + "epoch": 0.6339651639344263, + "grad_norm": 4.0901360511779785, + "learning_rate": 7.79125221419812e-06, + "loss": 0.7917, + "step": 4950 + }, + { + "epoch": 0.6352459016393442, + "grad_norm": 8.3672456741333, + "learning_rate": 7.764000545033384e-06, + "loss": 0.4282, + "step": 4960 + }, + { + "epoch": 0.6365266393442623, + "grad_norm": 25.113962173461914, + "learning_rate": 7.736748875868647e-06, + "loss": 0.719, + "step": 4970 + }, + { + "epoch": 0.6378073770491803, + "grad_norm": 16.38678741455078, + "learning_rate": 7.709497206703911e-06, + "loss": 0.8253, + "step": 4980 + }, + { + "epoch": 0.6390881147540983, + "grad_norm": 38.32978439331055, + "learning_rate": 7.682245537539174e-06, + "loss": 0.7423, + "step": 4990 + }, + { + "epoch": 0.6403688524590164, + "grad_norm": 36.88998794555664, + "learning_rate": 7.654993868374438e-06, + "loss": 0.6787, + "step": 5000 + }, + { + "epoch": 0.6416495901639344, + "grad_norm": 31.3937931060791, + "learning_rate": 7.627742199209703e-06, + "loss": 0.4184, + "step": 5010 + }, + { + "epoch": 0.6429303278688525, + "grad_norm": 43.30199432373047, + "learning_rate": 7.600490530044966e-06, + "loss": 0.7261, + "step": 5020 + }, + { + "epoch": 0.6442110655737705, + "grad_norm": 24.66848373413086, + "learning_rate": 7.5732388608802296e-06, + "loss": 0.782, + "step": 5030 + }, + { + "epoch": 0.6454918032786885, + "grad_norm": 0.7670093774795532, + "learning_rate": 7.545987191715494e-06, + "loss": 0.7826, + "step": 5040 + }, + { + "epoch": 0.6467725409836066, + "grad_norm": 28.53043556213379, + "learning_rate": 7.5187355225507565e-06, + "loss": 0.7738, + "step": 5050 + }, + { + "epoch": 0.6480532786885246, + "grad_norm": 39.68383026123047, + "learning_rate": 7.49148385338602e-06, + "loss": 0.6153, + "step": 5060 + }, + { + "epoch": 0.6493340163934426, + "grad_norm": 1.5401833057403564, + "learning_rate": 7.464232184221284e-06, + "loss": 0.4508, + "step": 5070 + }, + { + "epoch": 0.6506147540983607, + "grad_norm": 18.586135864257812, + "learning_rate": 7.436980515056548e-06, + "loss": 0.7714, + "step": 5080 + }, + { + "epoch": 0.6518954918032787, + "grad_norm": 4.915235996246338, + "learning_rate": 7.409728845891811e-06, + "loss": 0.7213, + "step": 5090 + }, + { + "epoch": 0.6531762295081968, + "grad_norm": 13.506136894226074, + "learning_rate": 7.3824771767270756e-06, + "loss": 0.8002, + "step": 5100 + }, + { + "epoch": 0.6544569672131147, + "grad_norm": 24.696321487426758, + "learning_rate": 7.355225507562339e-06, + "loss": 0.6098, + "step": 5110 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 1.202723503112793, + "learning_rate": 7.3279738383976025e-06, + "loss": 0.5947, + "step": 5120 + }, + { + "epoch": 0.6570184426229508, + "grad_norm": 27.57708168029785, + "learning_rate": 7.300722169232866e-06, + "loss": 0.7283, + "step": 5130 + }, + { + "epoch": 0.6582991803278688, + "grad_norm": 27.763059616088867, + "learning_rate": 7.2734705000681294e-06, + "loss": 0.5104, + "step": 5140 + }, + { + "epoch": 0.6595799180327869, + "grad_norm": 22.97685432434082, + "learning_rate": 7.246218830903393e-06, + "loss": 0.7658, + "step": 5150 + }, + { + "epoch": 0.6608606557377049, + "grad_norm": 20.47222137451172, + "learning_rate": 7.218967161738657e-06, + "loss": 0.3926, + "step": 5160 + }, + { + "epoch": 0.6621413934426229, + "grad_norm": 34.984249114990234, + "learning_rate": 7.191715492573921e-06, + "loss": 0.9199, + "step": 5170 + }, + { + "epoch": 0.663422131147541, + "grad_norm": 32.431888580322266, + "learning_rate": 7.164463823409184e-06, + "loss": 0.4709, + "step": 5180 + }, + { + "epoch": 0.664702868852459, + "grad_norm": 11.426986694335938, + "learning_rate": 7.1372121542444485e-06, + "loss": 0.8031, + "step": 5190 + }, + { + "epoch": 0.6659836065573771, + "grad_norm": 27.146059036254883, + "learning_rate": 7.109960485079712e-06, + "loss": 0.7132, + "step": 5200 + }, + { + "epoch": 0.6672643442622951, + "grad_norm": 11.636030197143555, + "learning_rate": 7.082708815914975e-06, + "loss": 0.5368, + "step": 5210 + }, + { + "epoch": 0.6685450819672131, + "grad_norm": 16.758148193359375, + "learning_rate": 7.055457146750239e-06, + "loss": 0.6143, + "step": 5220 + }, + { + "epoch": 0.6698258196721312, + "grad_norm": 0.33391350507736206, + "learning_rate": 7.028205477585502e-06, + "loss": 0.5793, + "step": 5230 + }, + { + "epoch": 0.6711065573770492, + "grad_norm": 25.285449981689453, + "learning_rate": 7.000953808420766e-06, + "loss": 0.64, + "step": 5240 + }, + { + "epoch": 0.6723872950819673, + "grad_norm": 1.447174072265625, + "learning_rate": 6.97370213925603e-06, + "loss": 0.8713, + "step": 5250 + }, + { + "epoch": 0.6736680327868853, + "grad_norm": 34.83108139038086, + "learning_rate": 6.946450470091294e-06, + "loss": 0.6408, + "step": 5260 + }, + { + "epoch": 0.6749487704918032, + "grad_norm": 13.1771821975708, + "learning_rate": 6.919198800926558e-06, + "loss": 0.6303, + "step": 5270 + }, + { + "epoch": 0.6762295081967213, + "grad_norm": 31.439207077026367, + "learning_rate": 6.8919471317618214e-06, + "loss": 0.6238, + "step": 5280 + }, + { + "epoch": 0.6775102459016393, + "grad_norm": 11.551750183105469, + "learning_rate": 6.864695462597084e-06, + "loss": 0.9247, + "step": 5290 + }, + { + "epoch": 0.6787909836065574, + "grad_norm": 18.42095947265625, + "learning_rate": 6.8374437934323475e-06, + "loss": 1.0127, + "step": 5300 + }, + { + "epoch": 0.6800717213114754, + "grad_norm": 0.4005849361419678, + "learning_rate": 6.810192124267612e-06, + "loss": 0.675, + "step": 5310 + }, + { + "epoch": 0.6813524590163934, + "grad_norm": 13.756119728088379, + "learning_rate": 6.782940455102875e-06, + "loss": 0.5458, + "step": 5320 + }, + { + "epoch": 0.6826331967213115, + "grad_norm": 15.997631072998047, + "learning_rate": 6.75568878593814e-06, + "loss": 0.4342, + "step": 5330 + }, + { + "epoch": 0.6839139344262295, + "grad_norm": 16.906126022338867, + "learning_rate": 6.728437116773403e-06, + "loss": 0.6665, + "step": 5340 + }, + { + "epoch": 0.6851946721311475, + "grad_norm": 12.170743942260742, + "learning_rate": 6.701185447608667e-06, + "loss": 0.9264, + "step": 5350 + }, + { + "epoch": 0.6864754098360656, + "grad_norm": 35.61259841918945, + "learning_rate": 6.673933778443931e-06, + "loss": 0.6735, + "step": 5360 + }, + { + "epoch": 0.6877561475409836, + "grad_norm": 13.542879104614258, + "learning_rate": 6.646682109279194e-06, + "loss": 0.8259, + "step": 5370 + }, + { + "epoch": 0.6890368852459017, + "grad_norm": 39.6423225402832, + "learning_rate": 6.619430440114457e-06, + "loss": 0.7956, + "step": 5380 + }, + { + "epoch": 0.6903176229508197, + "grad_norm": 30.907363891601562, + "learning_rate": 6.592178770949721e-06, + "loss": 0.7366, + "step": 5390 + }, + { + "epoch": 0.6915983606557377, + "grad_norm": 12.479640007019043, + "learning_rate": 6.564927101784985e-06, + "loss": 0.3273, + "step": 5400 + }, + { + "epoch": 0.6928790983606558, + "grad_norm": 19.15838623046875, + "learning_rate": 6.537675432620248e-06, + "loss": 0.4575, + "step": 5410 + }, + { + "epoch": 0.6941598360655737, + "grad_norm": 20.0745792388916, + "learning_rate": 6.510423763455513e-06, + "loss": 0.8541, + "step": 5420 + }, + { + "epoch": 0.6954405737704918, + "grad_norm": 30.12567901611328, + "learning_rate": 6.483172094290776e-06, + "loss": 0.3144, + "step": 5430 + }, + { + "epoch": 0.6967213114754098, + "grad_norm": 8.731266975402832, + "learning_rate": 6.4559204251260395e-06, + "loss": 0.4327, + "step": 5440 + }, + { + "epoch": 0.6980020491803278, + "grad_norm": 1.370941400527954, + "learning_rate": 6.428668755961304e-06, + "loss": 0.3519, + "step": 5450 + }, + { + "epoch": 0.6992827868852459, + "grad_norm": 28.71232795715332, + "learning_rate": 6.4014170867965665e-06, + "loss": 0.6712, + "step": 5460 + }, + { + "epoch": 0.7005635245901639, + "grad_norm": 20.623737335205078, + "learning_rate": 6.37416541763183e-06, + "loss": 0.6607, + "step": 5470 + }, + { + "epoch": 0.701844262295082, + "grad_norm": 7.713385581970215, + "learning_rate": 6.346913748467094e-06, + "loss": 0.5568, + "step": 5480 + }, + { + "epoch": 0.703125, + "grad_norm": 10.449071884155273, + "learning_rate": 6.319662079302358e-06, + "loss": 0.8054, + "step": 5490 + }, + { + "epoch": 0.704405737704918, + "grad_norm": 11.34548568725586, + "learning_rate": 6.292410410137621e-06, + "loss": 0.8166, + "step": 5500 + }, + { + "epoch": 0.7056864754098361, + "grad_norm": 2.661618947982788, + "learning_rate": 6.2651587409728855e-06, + "loss": 0.6567, + "step": 5510 + }, + { + "epoch": 0.7069672131147541, + "grad_norm": 4.278378486633301, + "learning_rate": 6.237907071808149e-06, + "loss": 0.7271, + "step": 5520 + }, + { + "epoch": 0.7082479508196722, + "grad_norm": 56.11579513549805, + "learning_rate": 6.210655402643413e-06, + "loss": 0.8394, + "step": 5530 + }, + { + "epoch": 0.7095286885245902, + "grad_norm": 25.923078536987305, + "learning_rate": 6.183403733478676e-06, + "loss": 0.7055, + "step": 5540 + }, + { + "epoch": 0.7108094262295082, + "grad_norm": 7.200451850891113, + "learning_rate": 6.156152064313939e-06, + "loss": 0.6715, + "step": 5550 + }, + { + "epoch": 0.7120901639344263, + "grad_norm": 25.070093154907227, + "learning_rate": 6.128900395149203e-06, + "loss": 0.6701, + "step": 5560 + }, + { + "epoch": 0.7133709016393442, + "grad_norm": 0.7995045781135559, + "learning_rate": 6.101648725984467e-06, + "loss": 0.7706, + "step": 5570 + }, + { + "epoch": 0.7146516393442623, + "grad_norm": 14.150104522705078, + "learning_rate": 6.074397056819731e-06, + "loss": 0.8404, + "step": 5580 + }, + { + "epoch": 0.7159323770491803, + "grad_norm": 21.669960021972656, + "learning_rate": 6.047145387654995e-06, + "loss": 0.5122, + "step": 5590 + }, + { + "epoch": 0.7172131147540983, + "grad_norm": 10.61308765411377, + "learning_rate": 6.0198937184902585e-06, + "loss": 0.7182, + "step": 5600 + }, + { + "epoch": 0.7184938524590164, + "grad_norm": 12.267438888549805, + "learning_rate": 5.992642049325522e-06, + "loss": 0.7212, + "step": 5610 + }, + { + "epoch": 0.7197745901639344, + "grad_norm": 12.50552749633789, + "learning_rate": 5.9653903801607846e-06, + "loss": 0.6373, + "step": 5620 + }, + { + "epoch": 0.7210553278688525, + "grad_norm": 3.3687191009521484, + "learning_rate": 5.938138710996049e-06, + "loss": 0.7845, + "step": 5630 + }, + { + "epoch": 0.7223360655737705, + "grad_norm": 4.029101848602295, + "learning_rate": 5.910887041831312e-06, + "loss": 0.6061, + "step": 5640 + }, + { + "epoch": 0.7236168032786885, + "grad_norm": 11.404590606689453, + "learning_rate": 5.883635372666576e-06, + "loss": 0.2602, + "step": 5650 + }, + { + "epoch": 0.7248975409836066, + "grad_norm": 14.377605438232422, + "learning_rate": 5.85638370350184e-06, + "loss": 0.6709, + "step": 5660 + }, + { + "epoch": 0.7261782786885246, + "grad_norm": 54.396888732910156, + "learning_rate": 5.829132034337104e-06, + "loss": 0.7768, + "step": 5670 + }, + { + "epoch": 0.7274590163934426, + "grad_norm": 11.300426483154297, + "learning_rate": 5.801880365172368e-06, + "loss": 0.6319, + "step": 5680 + }, + { + "epoch": 0.7287397540983607, + "grad_norm": 25.368356704711914, + "learning_rate": 5.774628696007631e-06, + "loss": 0.6695, + "step": 5690 + }, + { + "epoch": 0.7300204918032787, + "grad_norm": 8.80262279510498, + "learning_rate": 5.747377026842894e-06, + "loss": 0.8748, + "step": 5700 + }, + { + "epoch": 0.7313012295081968, + "grad_norm": 14.3671236038208, + "learning_rate": 5.7201253576781575e-06, + "loss": 0.7312, + "step": 5710 + }, + { + "epoch": 0.7325819672131147, + "grad_norm": 20.28556251525879, + "learning_rate": 5.692873688513422e-06, + "loss": 0.6096, + "step": 5720 + }, + { + "epoch": 0.7338627049180327, + "grad_norm": 22.88327980041504, + "learning_rate": 5.665622019348685e-06, + "loss": 1.0243, + "step": 5730 + }, + { + "epoch": 0.7351434426229508, + "grad_norm": 12.539216041564941, + "learning_rate": 5.63837035018395e-06, + "loss": 0.6279, + "step": 5740 + }, + { + "epoch": 0.7364241803278688, + "grad_norm": 37.97767639160156, + "learning_rate": 5.611118681019213e-06, + "loss": 0.7142, + "step": 5750 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 2.0420548915863037, + "learning_rate": 5.5838670118544766e-06, + "loss": 0.5773, + "step": 5760 + }, + { + "epoch": 0.7389856557377049, + "grad_norm": 12.780746459960938, + "learning_rate": 5.556615342689741e-06, + "loss": 0.6708, + "step": 5770 + }, + { + "epoch": 0.7402663934426229, + "grad_norm": 2.0761895179748535, + "learning_rate": 5.5293636735250035e-06, + "loss": 0.6491, + "step": 5780 + }, + { + "epoch": 0.741547131147541, + "grad_norm": 41.1733512878418, + "learning_rate": 5.502112004360267e-06, + "loss": 0.9564, + "step": 5790 + }, + { + "epoch": 0.742827868852459, + "grad_norm": 25.633703231811523, + "learning_rate": 5.474860335195531e-06, + "loss": 0.7985, + "step": 5800 + }, + { + "epoch": 0.7441086065573771, + "grad_norm": 9.461475372314453, + "learning_rate": 5.447608666030795e-06, + "loss": 0.5234, + "step": 5810 + }, + { + "epoch": 0.7453893442622951, + "grad_norm": 18.90468978881836, + "learning_rate": 5.420356996866058e-06, + "loss": 0.4353, + "step": 5820 + }, + { + "epoch": 0.7466700819672131, + "grad_norm": 8.587220191955566, + "learning_rate": 5.3931053277013226e-06, + "loss": 0.6629, + "step": 5830 + }, + { + "epoch": 0.7479508196721312, + "grad_norm": 15.917558670043945, + "learning_rate": 5.365853658536586e-06, + "loss": 0.7589, + "step": 5840 + }, + { + "epoch": 0.7492315573770492, + "grad_norm": 6.725412368774414, + "learning_rate": 5.3386019893718495e-06, + "loss": 0.5328, + "step": 5850 + }, + { + "epoch": 0.7505122950819673, + "grad_norm": 18.641759872436523, + "learning_rate": 5.311350320207113e-06, + "loss": 0.5993, + "step": 5860 + }, + { + "epoch": 0.7517930327868853, + "grad_norm": 30.297088623046875, + "learning_rate": 5.2840986510423764e-06, + "loss": 0.4351, + "step": 5870 + }, + { + "epoch": 0.7530737704918032, + "grad_norm": 22.469974517822266, + "learning_rate": 5.25684698187764e-06, + "loss": 0.5852, + "step": 5880 + }, + { + "epoch": 0.7543545081967213, + "grad_norm": 5.6571173667907715, + "learning_rate": 5.229595312712904e-06, + "loss": 0.6621, + "step": 5890 + }, + { + "epoch": 0.7556352459016393, + "grad_norm": 32.2354736328125, + "learning_rate": 5.202343643548168e-06, + "loss": 0.8106, + "step": 5900 + }, + { + "epoch": 0.7569159836065574, + "grad_norm": 15.729165077209473, + "learning_rate": 5.175091974383431e-06, + "loss": 0.519, + "step": 5910 + }, + { + "epoch": 0.7581967213114754, + "grad_norm": 20.02010726928711, + "learning_rate": 5.1478403052186955e-06, + "loss": 0.599, + "step": 5920 + }, + { + "epoch": 0.7594774590163934, + "grad_norm": 1.9774470329284668, + "learning_rate": 5.120588636053959e-06, + "loss": 0.3835, + "step": 5930 + }, + { + "epoch": 0.7607581967213115, + "grad_norm": 10.6248779296875, + "learning_rate": 5.093336966889222e-06, + "loss": 0.5819, + "step": 5940 + }, + { + "epoch": 0.7620389344262295, + "grad_norm": 8.844250679016113, + "learning_rate": 5.066085297724486e-06, + "loss": 0.524, + "step": 5950 + }, + { + "epoch": 0.7633196721311475, + "grad_norm": 24.882261276245117, + "learning_rate": 5.038833628559749e-06, + "loss": 0.5727, + "step": 5960 + }, + { + "epoch": 0.7646004098360656, + "grad_norm": 16.70749855041504, + "learning_rate": 5.011581959395013e-06, + "loss": 0.651, + "step": 5970 + }, + { + "epoch": 0.7658811475409836, + "grad_norm": 25.65505027770996, + "learning_rate": 4.984330290230277e-06, + "loss": 0.563, + "step": 5980 + }, + { + "epoch": 0.7671618852459017, + "grad_norm": 27.863927841186523, + "learning_rate": 4.957078621065541e-06, + "loss": 0.4771, + "step": 5990 + }, + { + "epoch": 0.7684426229508197, + "grad_norm": 0.7001621723175049, + "learning_rate": 4.929826951900804e-06, + "loss": 0.5382, + "step": 6000 + }, + { + "epoch": 0.7697233606557377, + "grad_norm": 16.65908432006836, + "learning_rate": 4.902575282736068e-06, + "loss": 0.829, + "step": 6010 + }, + { + "epoch": 0.7710040983606558, + "grad_norm": 6.999290943145752, + "learning_rate": 4.875323613571332e-06, + "loss": 0.3504, + "step": 6020 + }, + { + "epoch": 0.7722848360655737, + "grad_norm": 21.872570037841797, + "learning_rate": 4.848071944406595e-06, + "loss": 0.6681, + "step": 6030 + }, + { + "epoch": 0.7735655737704918, + "grad_norm": 12.923929214477539, + "learning_rate": 4.820820275241859e-06, + "loss": 0.7376, + "step": 6040 + }, + { + "epoch": 0.7748463114754098, + "grad_norm": 24.330562591552734, + "learning_rate": 4.793568606077122e-06, + "loss": 0.6243, + "step": 6050 + }, + { + "epoch": 0.7761270491803278, + "grad_norm": 9.132780075073242, + "learning_rate": 4.766316936912387e-06, + "loss": 0.9117, + "step": 6060 + }, + { + "epoch": 0.7774077868852459, + "grad_norm": 9.875121116638184, + "learning_rate": 4.73906526774765e-06, + "loss": 0.6056, + "step": 6070 + }, + { + "epoch": 0.7786885245901639, + "grad_norm": 14.28087329864502, + "learning_rate": 4.711813598582914e-06, + "loss": 0.8161, + "step": 6080 + }, + { + "epoch": 0.779969262295082, + "grad_norm": 4.369551658630371, + "learning_rate": 4.684561929418177e-06, + "loss": 0.5907, + "step": 6090 + }, + { + "epoch": 0.78125, + "grad_norm": 30.508066177368164, + "learning_rate": 4.6573102602534405e-06, + "loss": 0.5567, + "step": 6100 + }, + { + "epoch": 0.782530737704918, + "grad_norm": 24.87715721130371, + "learning_rate": 4.630058591088705e-06, + "loss": 0.6462, + "step": 6110 + }, + { + "epoch": 0.7838114754098361, + "grad_norm": 15.003620147705078, + "learning_rate": 4.602806921923968e-06, + "loss": 0.5864, + "step": 6120 + }, + { + "epoch": 0.7850922131147541, + "grad_norm": 21.42226219177246, + "learning_rate": 4.575555252759232e-06, + "loss": 0.7504, + "step": 6130 + }, + { + "epoch": 0.7863729508196722, + "grad_norm": 2.328996181488037, + "learning_rate": 4.548303583594495e-06, + "loss": 0.6912, + "step": 6140 + }, + { + "epoch": 0.7876536885245902, + "grad_norm": 10.392210960388184, + "learning_rate": 4.52105191442976e-06, + "loss": 0.8199, + "step": 6150 + }, + { + "epoch": 0.7889344262295082, + "grad_norm": 8.533187866210938, + "learning_rate": 4.493800245265023e-06, + "loss": 0.5175, + "step": 6160 + }, + { + "epoch": 0.7902151639344263, + "grad_norm": 11.740133285522461, + "learning_rate": 4.4665485761002865e-06, + "loss": 0.9367, + "step": 6170 + }, + { + "epoch": 0.7914959016393442, + "grad_norm": 26.58624267578125, + "learning_rate": 4.43929690693555e-06, + "loss": 0.6825, + "step": 6180 + }, + { + "epoch": 0.7927766393442623, + "grad_norm": 1.783715844154358, + "learning_rate": 4.412045237770814e-06, + "loss": 0.5531, + "step": 6190 + }, + { + "epoch": 0.7940573770491803, + "grad_norm": 10.153545379638672, + "learning_rate": 4.384793568606078e-06, + "loss": 0.7688, + "step": 6200 + }, + { + "epoch": 0.7953381147540983, + "grad_norm": 12.468855857849121, + "learning_rate": 4.357541899441341e-06, + "loss": 0.5524, + "step": 6210 + }, + { + "epoch": 0.7966188524590164, + "grad_norm": 3.6368038654327393, + "learning_rate": 4.330290230276605e-06, + "loss": 0.6093, + "step": 6220 + }, + { + "epoch": 0.7978995901639344, + "grad_norm": 12.435820579528809, + "learning_rate": 4.303038561111868e-06, + "loss": 0.6865, + "step": 6230 + }, + { + "epoch": 0.7991803278688525, + "grad_norm": 9.179264068603516, + "learning_rate": 4.2757868919471325e-06, + "loss": 0.6233, + "step": 6240 + }, + { + "epoch": 0.8004610655737705, + "grad_norm": 46.63306427001953, + "learning_rate": 4.248535222782396e-06, + "loss": 0.5358, + "step": 6250 + }, + { + "epoch": 0.8017418032786885, + "grad_norm": 7.405709266662598, + "learning_rate": 4.2212835536176595e-06, + "loss": 0.5872, + "step": 6260 + }, + { + "epoch": 0.8030225409836066, + "grad_norm": 20.083263397216797, + "learning_rate": 4.194031884452923e-06, + "loss": 0.4936, + "step": 6270 + }, + { + "epoch": 0.8043032786885246, + "grad_norm": 2.6786341667175293, + "learning_rate": 4.166780215288187e-06, + "loss": 0.6548, + "step": 6280 + }, + { + "epoch": 0.8055840163934426, + "grad_norm": 13.946334838867188, + "learning_rate": 4.13952854612345e-06, + "loss": 0.4651, + "step": 6290 + }, + { + "epoch": 0.8068647540983607, + "grad_norm": 39.37618637084961, + "learning_rate": 4.112276876958714e-06, + "loss": 0.7712, + "step": 6300 + }, + { + "epoch": 0.8081454918032787, + "grad_norm": 18.16588020324707, + "learning_rate": 4.085025207793978e-06, + "loss": 0.7139, + "step": 6310 + }, + { + "epoch": 0.8094262295081968, + "grad_norm": 12.700222969055176, + "learning_rate": 4.057773538629242e-06, + "loss": 0.5219, + "step": 6320 + }, + { + "epoch": 0.8107069672131147, + "grad_norm": 28.98236656188965, + "learning_rate": 4.030521869464505e-06, + "loss": 0.7143, + "step": 6330 + }, + { + "epoch": 0.8119877049180327, + "grad_norm": 24.590084075927734, + "learning_rate": 4.003270200299769e-06, + "loss": 0.46, + "step": 6340 + }, + { + "epoch": 0.8132684426229508, + "grad_norm": 24.325733184814453, + "learning_rate": 3.976018531135032e-06, + "loss": 0.7554, + "step": 6350 + }, + { + "epoch": 0.8145491803278688, + "grad_norm": 8.794258117675781, + "learning_rate": 3.948766861970296e-06, + "loss": 0.4404, + "step": 6360 + }, + { + "epoch": 0.8158299180327869, + "grad_norm": 0.7277682423591614, + "learning_rate": 3.921515192805559e-06, + "loss": 0.6449, + "step": 6370 + }, + { + "epoch": 0.8171106557377049, + "grad_norm": 22.101137161254883, + "learning_rate": 3.894263523640824e-06, + "loss": 0.7285, + "step": 6380 + }, + { + "epoch": 0.8183913934426229, + "grad_norm": 22.26101303100586, + "learning_rate": 3.867011854476087e-06, + "loss": 0.7699, + "step": 6390 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 17.823871612548828, + "learning_rate": 3.839760185311351e-06, + "loss": 0.6389, + "step": 6400 + }, + { + "epoch": 0.820952868852459, + "grad_norm": 35.937286376953125, + "learning_rate": 3.812508516146614e-06, + "loss": 0.7776, + "step": 6410 + }, + { + "epoch": 0.8222336065573771, + "grad_norm": 1.8482409715652466, + "learning_rate": 3.785256846981878e-06, + "loss": 0.7117, + "step": 6420 + }, + { + "epoch": 0.8235143442622951, + "grad_norm": 1.5273475646972656, + "learning_rate": 3.758005177817142e-06, + "loss": 0.8126, + "step": 6430 + }, + { + "epoch": 0.8247950819672131, + "grad_norm": 17.53533935546875, + "learning_rate": 3.7307535086524054e-06, + "loss": 0.8421, + "step": 6440 + }, + { + "epoch": 0.8260758196721312, + "grad_norm": 9.50154972076416, + "learning_rate": 3.703501839487669e-06, + "loss": 0.5142, + "step": 6450 + }, + { + "epoch": 0.8273565573770492, + "grad_norm": 12.528085708618164, + "learning_rate": 3.6762501703229327e-06, + "loss": 0.7004, + "step": 6460 + }, + { + "epoch": 0.8286372950819673, + "grad_norm": 19.719446182250977, + "learning_rate": 3.648998501158196e-06, + "loss": 0.5631, + "step": 6470 + }, + { + "epoch": 0.8299180327868853, + "grad_norm": 21.097314834594727, + "learning_rate": 3.62174683199346e-06, + "loss": 0.6893, + "step": 6480 + }, + { + "epoch": 0.8311987704918032, + "grad_norm": 9.299731254577637, + "learning_rate": 3.594495162828723e-06, + "loss": 0.389, + "step": 6490 + }, + { + "epoch": 0.8324795081967213, + "grad_norm": 5.358484268188477, + "learning_rate": 3.567243493663987e-06, + "loss": 0.59, + "step": 6500 + }, + { + "epoch": 0.8337602459016393, + "grad_norm": 17.12688446044922, + "learning_rate": 3.539991824499251e-06, + "loss": 0.4049, + "step": 6510 + }, + { + "epoch": 0.8350409836065574, + "grad_norm": 22.643938064575195, + "learning_rate": 3.512740155334515e-06, + "loss": 0.5396, + "step": 6520 + }, + { + "epoch": 0.8363217213114754, + "grad_norm": 15.25439167022705, + "learning_rate": 3.485488486169778e-06, + "loss": 0.7493, + "step": 6530 + }, + { + "epoch": 0.8376024590163934, + "grad_norm": 0.7836318016052246, + "learning_rate": 3.4582368170050418e-06, + "loss": 0.6233, + "step": 6540 + }, + { + "epoch": 0.8388831967213115, + "grad_norm": 7.646884918212891, + "learning_rate": 3.4309851478403057e-06, + "loss": 0.6084, + "step": 6550 + }, + { + "epoch": 0.8401639344262295, + "grad_norm": 0.6499843001365662, + "learning_rate": 3.4037334786755696e-06, + "loss": 0.6373, + "step": 6560 + }, + { + "epoch": 0.8414446721311475, + "grad_norm": 4.813564777374268, + "learning_rate": 3.3764818095108326e-06, + "loss": 0.8403, + "step": 6570 + }, + { + "epoch": 0.8427254098360656, + "grad_norm": 20.393510818481445, + "learning_rate": 3.3492301403460965e-06, + "loss": 0.7535, + "step": 6580 + }, + { + "epoch": 0.8440061475409836, + "grad_norm": 15.966805458068848, + "learning_rate": 3.32197847118136e-06, + "loss": 0.8566, + "step": 6590 + }, + { + "epoch": 0.8452868852459017, + "grad_norm": 4.333749294281006, + "learning_rate": 3.294726802016624e-06, + "loss": 0.742, + "step": 6600 + }, + { + "epoch": 0.8465676229508197, + "grad_norm": 10.89919376373291, + "learning_rate": 3.2674751328518873e-06, + "loss": 0.5804, + "step": 6610 + }, + { + "epoch": 0.8478483606557377, + "grad_norm": 9.388509750366211, + "learning_rate": 3.240223463687151e-06, + "loss": 0.5165, + "step": 6620 + }, + { + "epoch": 0.8491290983606558, + "grad_norm": 4.184054374694824, + "learning_rate": 3.2129717945224147e-06, + "loss": 0.5183, + "step": 6630 + }, + { + "epoch": 0.8504098360655737, + "grad_norm": 7.253784656524658, + "learning_rate": 3.1857201253576786e-06, + "loss": 0.7198, + "step": 6640 + }, + { + "epoch": 0.8516905737704918, + "grad_norm": 11.86843490600586, + "learning_rate": 3.1584684561929417e-06, + "loss": 0.4899, + "step": 6650 + }, + { + "epoch": 0.8529713114754098, + "grad_norm": 10.385624885559082, + "learning_rate": 3.1312167870282056e-06, + "loss": 0.8167, + "step": 6660 + }, + { + "epoch": 0.8542520491803278, + "grad_norm": 21.208568572998047, + "learning_rate": 3.1039651178634695e-06, + "loss": 0.5819, + "step": 6670 + }, + { + "epoch": 0.8555327868852459, + "grad_norm": 0.44370558857917786, + "learning_rate": 3.0767134486987333e-06, + "loss": 0.7301, + "step": 6680 + }, + { + "epoch": 0.8568135245901639, + "grad_norm": 8.252354621887207, + "learning_rate": 3.0494617795339964e-06, + "loss": 0.488, + "step": 6690 + }, + { + "epoch": 0.858094262295082, + "grad_norm": 13.996326446533203, + "learning_rate": 3.0222101103692603e-06, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.859375, + "grad_norm": 35.99543380737305, + "learning_rate": 2.994958441204524e-06, + "loss": 0.5266, + "step": 6710 + }, + { + "epoch": 0.860655737704918, + "grad_norm": 19.631608963012695, + "learning_rate": 2.9677067720397877e-06, + "loss": 0.9918, + "step": 6720 + }, + { + "epoch": 0.8619364754098361, + "grad_norm": 33.22177505493164, + "learning_rate": 2.940455102875051e-06, + "loss": 0.6044, + "step": 6730 + }, + { + "epoch": 0.8632172131147541, + "grad_norm": 20.986621856689453, + "learning_rate": 2.913203433710315e-06, + "loss": 0.3522, + "step": 6740 + }, + { + "epoch": 0.8644979508196722, + "grad_norm": 14.024015426635742, + "learning_rate": 2.8859517645455785e-06, + "loss": 0.6916, + "step": 6750 + }, + { + "epoch": 0.8657786885245902, + "grad_norm": 11.796330451965332, + "learning_rate": 2.8587000953808424e-06, + "loss": 0.704, + "step": 6760 + }, + { + "epoch": 0.8670594262295082, + "grad_norm": 20.83628273010254, + "learning_rate": 2.831448426216106e-06, + "loss": 0.8603, + "step": 6770 + }, + { + "epoch": 0.8683401639344263, + "grad_norm": 18.570674896240234, + "learning_rate": 2.8041967570513693e-06, + "loss": 0.6714, + "step": 6780 + }, + { + "epoch": 0.8696209016393442, + "grad_norm": 8.486098289489746, + "learning_rate": 2.7769450878866332e-06, + "loss": 0.7107, + "step": 6790 + }, + { + "epoch": 0.8709016393442623, + "grad_norm": 2.3732173442840576, + "learning_rate": 2.749693418721897e-06, + "loss": 0.7988, + "step": 6800 + }, + { + "epoch": 0.8721823770491803, + "grad_norm": 2.5915911197662354, + "learning_rate": 2.72244174955716e-06, + "loss": 0.6847, + "step": 6810 + }, + { + "epoch": 0.8734631147540983, + "grad_norm": 30.59233856201172, + "learning_rate": 2.695190080392424e-06, + "loss": 0.6228, + "step": 6820 + }, + { + "epoch": 0.8747438524590164, + "grad_norm": 9.502323150634766, + "learning_rate": 2.667938411227688e-06, + "loss": 0.5615, + "step": 6830 + }, + { + "epoch": 0.8760245901639344, + "grad_norm": 17.929569244384766, + "learning_rate": 2.640686742062952e-06, + "loss": 0.5991, + "step": 6840 + }, + { + "epoch": 0.8773053278688525, + "grad_norm": 14.03685474395752, + "learning_rate": 2.613435072898215e-06, + "loss": 0.7148, + "step": 6850 + }, + { + "epoch": 0.8785860655737705, + "grad_norm": 14.739727020263672, + "learning_rate": 2.586183403733479e-06, + "loss": 0.5939, + "step": 6860 + }, + { + "epoch": 0.8798668032786885, + "grad_norm": 13.458857536315918, + "learning_rate": 2.5589317345687427e-06, + "loss": 0.5892, + "step": 6870 + }, + { + "epoch": 0.8811475409836066, + "grad_norm": 13.960780143737793, + "learning_rate": 2.531680065404006e-06, + "loss": 0.5841, + "step": 6880 + }, + { + "epoch": 0.8824282786885246, + "grad_norm": 13.514850616455078, + "learning_rate": 2.5044283962392696e-06, + "loss": 0.5341, + "step": 6890 + }, + { + "epoch": 0.8837090163934426, + "grad_norm": 12.330262184143066, + "learning_rate": 2.4771767270745335e-06, + "loss": 0.8722, + "step": 6900 + }, + { + "epoch": 0.8849897540983607, + "grad_norm": 14.698627471923828, + "learning_rate": 2.449925057909797e-06, + "loss": 0.3832, + "step": 6910 + }, + { + "epoch": 0.8862704918032787, + "grad_norm": 25.05308723449707, + "learning_rate": 2.4226733887450605e-06, + "loss": 0.9005, + "step": 6920 + }, + { + "epoch": 0.8875512295081968, + "grad_norm": 16.247404098510742, + "learning_rate": 2.3954217195803244e-06, + "loss": 0.4334, + "step": 6930 + }, + { + "epoch": 0.8888319672131147, + "grad_norm": 24.826126098632812, + "learning_rate": 2.3681700504155883e-06, + "loss": 0.4239, + "step": 6940 + }, + { + "epoch": 0.8901127049180327, + "grad_norm": 30.12708282470703, + "learning_rate": 2.3409183812508517e-06, + "loss": 0.9281, + "step": 6950 + }, + { + "epoch": 0.8913934426229508, + "grad_norm": 33.377967834472656, + "learning_rate": 2.3136667120861156e-06, + "loss": 1.0247, + "step": 6960 + }, + { + "epoch": 0.8926741803278688, + "grad_norm": 9.090696334838867, + "learning_rate": 2.286415042921379e-06, + "loss": 0.6998, + "step": 6970 + }, + { + "epoch": 0.8939549180327869, + "grad_norm": 18.32761001586914, + "learning_rate": 2.259163373756643e-06, + "loss": 0.6415, + "step": 6980 + }, + { + "epoch": 0.8952356557377049, + "grad_norm": 3.1769232749938965, + "learning_rate": 2.2319117045919065e-06, + "loss": 0.5628, + "step": 6990 + }, + { + "epoch": 0.8965163934426229, + "grad_norm": 11.886983871459961, + "learning_rate": 2.2046600354271704e-06, + "loss": 0.9674, + "step": 7000 + }, + { + "epoch": 0.897797131147541, + "grad_norm": 2.503143072128296, + "learning_rate": 2.177408366262434e-06, + "loss": 0.5693, + "step": 7010 + }, + { + "epoch": 0.899077868852459, + "grad_norm": 29.00408935546875, + "learning_rate": 2.1501566970976973e-06, + "loss": 0.6684, + "step": 7020 + }, + { + "epoch": 0.9003586065573771, + "grad_norm": 14.518806457519531, + "learning_rate": 2.1229050279329612e-06, + "loss": 0.8301, + "step": 7030 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 46.252830505371094, + "learning_rate": 2.0956533587682247e-06, + "loss": 1.0172, + "step": 7040 + }, + { + "epoch": 0.9029200819672131, + "grad_norm": 19.148435592651367, + "learning_rate": 2.068401689603488e-06, + "loss": 0.7153, + "step": 7050 + }, + { + "epoch": 0.9042008196721312, + "grad_norm": 17.86318588256836, + "learning_rate": 2.041150020438752e-06, + "loss": 0.7907, + "step": 7060 + }, + { + "epoch": 0.9054815573770492, + "grad_norm": 14.341056823730469, + "learning_rate": 2.0138983512740155e-06, + "loss": 0.4611, + "step": 7070 + }, + { + "epoch": 0.9067622950819673, + "grad_norm": 8.442182540893555, + "learning_rate": 1.9866466821092794e-06, + "loss": 0.8596, + "step": 7080 + }, + { + "epoch": 0.9080430327868853, + "grad_norm": 15.53111743927002, + "learning_rate": 1.959395012944543e-06, + "loss": 0.5854, + "step": 7090 + }, + { + "epoch": 0.9093237704918032, + "grad_norm": 8.210785865783691, + "learning_rate": 1.932143343779807e-06, + "loss": 0.855, + "step": 7100 + }, + { + "epoch": 0.9106045081967213, + "grad_norm": 11.097797393798828, + "learning_rate": 1.9048916746150703e-06, + "loss": 0.7989, + "step": 7110 + }, + { + "epoch": 0.9118852459016393, + "grad_norm": 6.103325843811035, + "learning_rate": 1.8776400054503342e-06, + "loss": 0.4565, + "step": 7120 + }, + { + "epoch": 0.9131659836065574, + "grad_norm": 15.080409049987793, + "learning_rate": 1.8503883362855976e-06, + "loss": 0.4197, + "step": 7130 + }, + { + "epoch": 0.9144467213114754, + "grad_norm": 23.386219024658203, + "learning_rate": 1.8231366671208613e-06, + "loss": 0.6161, + "step": 7140 + }, + { + "epoch": 0.9157274590163934, + "grad_norm": 13.018634796142578, + "learning_rate": 1.795884997956125e-06, + "loss": 0.4554, + "step": 7150 + }, + { + "epoch": 0.9170081967213115, + "grad_norm": 9.674510955810547, + "learning_rate": 1.7686333287913887e-06, + "loss": 0.7063, + "step": 7160 + }, + { + "epoch": 0.9182889344262295, + "grad_norm": 13.369217872619629, + "learning_rate": 1.7413816596266522e-06, + "loss": 0.6227, + "step": 7170 + }, + { + "epoch": 0.9195696721311475, + "grad_norm": 19.81302833557129, + "learning_rate": 1.714129990461916e-06, + "loss": 0.5844, + "step": 7180 + }, + { + "epoch": 0.9208504098360656, + "grad_norm": 13.579237937927246, + "learning_rate": 1.6868783212971795e-06, + "loss": 0.632, + "step": 7190 + }, + { + "epoch": 0.9221311475409836, + "grad_norm": 9.165477752685547, + "learning_rate": 1.6596266521324434e-06, + "loss": 0.5509, + "step": 7200 + }, + { + "epoch": 0.9234118852459017, + "grad_norm": 18.232845306396484, + "learning_rate": 1.6323749829677069e-06, + "loss": 0.6403, + "step": 7210 + }, + { + "epoch": 0.9246926229508197, + "grad_norm": 18.56736946105957, + "learning_rate": 1.6051233138029706e-06, + "loss": 0.7943, + "step": 7220 + }, + { + "epoch": 0.9259733606557377, + "grad_norm": 8.743745803833008, + "learning_rate": 1.5778716446382343e-06, + "loss": 0.4279, + "step": 7230 + }, + { + "epoch": 0.9272540983606558, + "grad_norm": 2.6923177242279053, + "learning_rate": 1.550619975473498e-06, + "loss": 0.5608, + "step": 7240 + }, + { + "epoch": 0.9285348360655737, + "grad_norm": 29.790340423583984, + "learning_rate": 1.5233683063087614e-06, + "loss": 0.4361, + "step": 7250 + }, + { + "epoch": 0.9298155737704918, + "grad_norm": 1.7628939151763916, + "learning_rate": 1.4961166371440253e-06, + "loss": 0.6859, + "step": 7260 + }, + { + "epoch": 0.9310963114754098, + "grad_norm": 10.456538200378418, + "learning_rate": 1.4688649679792888e-06, + "loss": 0.7482, + "step": 7270 + }, + { + "epoch": 0.9323770491803278, + "grad_norm": 28.223440170288086, + "learning_rate": 1.4416132988145527e-06, + "loss": 0.675, + "step": 7280 + }, + { + "epoch": 0.9336577868852459, + "grad_norm": 6.400082111358643, + "learning_rate": 1.4143616296498161e-06, + "loss": 0.6709, + "step": 7290 + }, + { + "epoch": 0.9349385245901639, + "grad_norm": 16.48478889465332, + "learning_rate": 1.3871099604850798e-06, + "loss": 0.3667, + "step": 7300 + }, + { + "epoch": 0.936219262295082, + "grad_norm": 14.860025405883789, + "learning_rate": 1.3598582913203435e-06, + "loss": 0.7024, + "step": 7310 + }, + { + "epoch": 0.9375, + "grad_norm": 14.933452606201172, + "learning_rate": 1.3326066221556072e-06, + "loss": 0.6838, + "step": 7320 + }, + { + "epoch": 0.938780737704918, + "grad_norm": 23.65451431274414, + "learning_rate": 1.3053549529908707e-06, + "loss": 0.5882, + "step": 7330 + }, + { + "epoch": 0.9400614754098361, + "grad_norm": 23.98202133178711, + "learning_rate": 1.2781032838261346e-06, + "loss": 0.7442, + "step": 7340 + }, + { + "epoch": 0.9413422131147541, + "grad_norm": 38.25538635253906, + "learning_rate": 1.250851614661398e-06, + "loss": 0.6544, + "step": 7350 + }, + { + "epoch": 0.9426229508196722, + "grad_norm": 1.7557686567306519, + "learning_rate": 1.223599945496662e-06, + "loss": 0.3867, + "step": 7360 + }, + { + "epoch": 0.9439036885245902, + "grad_norm": 10.53632926940918, + "learning_rate": 1.1963482763319254e-06, + "loss": 0.9491, + "step": 7370 + }, + { + "epoch": 0.9451844262295082, + "grad_norm": 8.34455680847168, + "learning_rate": 1.169096607167189e-06, + "loss": 0.4885, + "step": 7380 + }, + { + "epoch": 0.9464651639344263, + "grad_norm": 3.460608720779419, + "learning_rate": 1.1418449380024528e-06, + "loss": 0.4286, + "step": 7390 + }, + { + "epoch": 0.9477459016393442, + "grad_norm": 20.152204513549805, + "learning_rate": 1.1145932688377165e-06, + "loss": 0.7888, + "step": 7400 + }, + { + "epoch": 0.9490266393442623, + "grad_norm": 12.72758960723877, + "learning_rate": 1.0873415996729801e-06, + "loss": 0.6784, + "step": 7410 + }, + { + "epoch": 0.9503073770491803, + "grad_norm": 13.164525985717773, + "learning_rate": 1.0600899305082438e-06, + "loss": 0.3325, + "step": 7420 + }, + { + "epoch": 0.9515881147540983, + "grad_norm": 15.550426483154297, + "learning_rate": 1.0328382613435075e-06, + "loss": 0.5533, + "step": 7430 + }, + { + "epoch": 0.9528688524590164, + "grad_norm": 4.542503356933594, + "learning_rate": 1.005586592178771e-06, + "loss": 0.5264, + "step": 7440 + }, + { + "epoch": 0.9541495901639344, + "grad_norm": 22.304424285888672, + "learning_rate": 9.783349230140347e-07, + "loss": 0.7576, + "step": 7450 + }, + { + "epoch": 0.9554303278688525, + "grad_norm": 24.396604537963867, + "learning_rate": 9.510832538492983e-07, + "loss": 0.6932, + "step": 7460 + }, + { + "epoch": 0.9567110655737705, + "grad_norm": 5.150862216949463, + "learning_rate": 9.23831584684562e-07, + "loss": 0.5392, + "step": 7470 + }, + { + "epoch": 0.9579918032786885, + "grad_norm": 6.6292829513549805, + "learning_rate": 8.965799155198257e-07, + "loss": 0.43, + "step": 7480 + }, + { + "epoch": 0.9592725409836066, + "grad_norm": 35.094058990478516, + "learning_rate": 8.693282463550894e-07, + "loss": 0.4879, + "step": 7490 + }, + { + "epoch": 0.9605532786885246, + "grad_norm": 31.886293411254883, + "learning_rate": 8.42076577190353e-07, + "loss": 0.4554, + "step": 7500 + }, + { + "epoch": 0.9618340163934426, + "grad_norm": 10.12392807006836, + "learning_rate": 8.148249080256167e-07, + "loss": 0.8466, + "step": 7510 + }, + { + "epoch": 0.9631147540983607, + "grad_norm": 23.29629898071289, + "learning_rate": 7.875732388608803e-07, + "loss": 0.6954, + "step": 7520 + }, + { + "epoch": 0.9643954918032787, + "grad_norm": 34.42799758911133, + "learning_rate": 7.60321569696144e-07, + "loss": 0.4265, + "step": 7530 + }, + { + "epoch": 0.9656762295081968, + "grad_norm": 20.460311889648438, + "learning_rate": 7.330699005314076e-07, + "loss": 0.542, + "step": 7540 + }, + { + "epoch": 0.9669569672131147, + "grad_norm": 1.3875937461853027, + "learning_rate": 7.058182313666713e-07, + "loss": 0.6803, + "step": 7550 + }, + { + "epoch": 0.9682377049180327, + "grad_norm": 20.104841232299805, + "learning_rate": 6.78566562201935e-07, + "loss": 0.8385, + "step": 7560 + }, + { + "epoch": 0.9695184426229508, + "grad_norm": 17.50690269470215, + "learning_rate": 6.513148930371987e-07, + "loss": 0.9916, + "step": 7570 + }, + { + "epoch": 0.9707991803278688, + "grad_norm": 72.95804595947266, + "learning_rate": 6.240632238724622e-07, + "loss": 0.8251, + "step": 7580 + }, + { + "epoch": 0.9720799180327869, + "grad_norm": 11.275779724121094, + "learning_rate": 5.968115547077259e-07, + "loss": 0.4506, + "step": 7590 + }, + { + "epoch": 0.9733606557377049, + "grad_norm": 20.942705154418945, + "learning_rate": 5.695598855429896e-07, + "loss": 0.6638, + "step": 7600 + }, + { + "epoch": 0.9746413934426229, + "grad_norm": 8.423453330993652, + "learning_rate": 5.423082163782532e-07, + "loss": 0.6953, + "step": 7610 + }, + { + "epoch": 0.975922131147541, + "grad_norm": 24.83681297302246, + "learning_rate": 5.150565472135169e-07, + "loss": 0.7618, + "step": 7620 + }, + { + "epoch": 0.977202868852459, + "grad_norm": 18.958438873291016, + "learning_rate": 4.878048780487805e-07, + "loss": 0.734, + "step": 7630 + }, + { + "epoch": 0.9784836065573771, + "grad_norm": 12.136439323425293, + "learning_rate": 4.605532088840442e-07, + "loss": 0.5637, + "step": 7640 + }, + { + "epoch": 0.9797643442622951, + "grad_norm": 7.522444725036621, + "learning_rate": 4.3330153971930786e-07, + "loss": 0.8323, + "step": 7650 + }, + { + "epoch": 0.9810450819672131, + "grad_norm": 34.33516311645508, + "learning_rate": 4.060498705545715e-07, + "loss": 0.591, + "step": 7660 + }, + { + "epoch": 0.9823258196721312, + "grad_norm": 6.395289421081543, + "learning_rate": 3.787982013898352e-07, + "loss": 0.5533, + "step": 7670 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 7.777110576629639, + "learning_rate": 3.515465322250988e-07, + "loss": 0.7885, + "step": 7680 + }, + { + "epoch": 0.9848872950819673, + "grad_norm": 18.54967498779297, + "learning_rate": 3.242948630603625e-07, + "loss": 0.5966, + "step": 7690 + }, + { + "epoch": 0.9861680327868853, + "grad_norm": 13.985085487365723, + "learning_rate": 2.970431938956261e-07, + "loss": 0.7105, + "step": 7700 + }, + { + "epoch": 0.9874487704918032, + "grad_norm": 37.31953811645508, + "learning_rate": 2.697915247308898e-07, + "loss": 0.8736, + "step": 7710 + }, + { + "epoch": 0.9887295081967213, + "grad_norm": 9.107115745544434, + "learning_rate": 2.4253985556615344e-07, + "loss": 0.699, + "step": 7720 + }, + { + "epoch": 0.9900102459016393, + "grad_norm": 14.522866249084473, + "learning_rate": 2.152881864014171e-07, + "loss": 0.7096, + "step": 7730 + }, + { + "epoch": 0.9912909836065574, + "grad_norm": 12.966835975646973, + "learning_rate": 1.8803651723668075e-07, + "loss": 0.6627, + "step": 7740 + }, + { + "epoch": 0.9925717213114754, + "grad_norm": 33.506622314453125, + "learning_rate": 1.607848480719444e-07, + "loss": 0.6537, + "step": 7750 + }, + { + "epoch": 0.9938524590163934, + "grad_norm": 14.853964805603027, + "learning_rate": 1.3353317890720807e-07, + "loss": 0.4942, + "step": 7760 + }, + { + "epoch": 0.9951331967213115, + "grad_norm": 5.332017421722412, + "learning_rate": 1.0628150974247172e-07, + "loss": 0.4523, + "step": 7770 + }, + { + "epoch": 0.9964139344262295, + "grad_norm": 24.917579650878906, + "learning_rate": 7.902984057773541e-08, + "loss": 0.706, + "step": 7780 + }, + { + "epoch": 0.9976946721311475, + "grad_norm": 21.65096664428711, + "learning_rate": 5.177817141299905e-08, + "loss": 0.4827, + "step": 7790 + }, + { + "epoch": 0.9989754098360656, + "grad_norm": 3.226344347000122, + "learning_rate": 2.452650224826271e-08, + "loss": 0.8295, + "step": 7800 + } + ], + "logging_steps": 10, + "max_steps": 7808, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8217558262480896.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}