diff --git "a/checkpoint-15700/trainer_state.json" "b/checkpoint-15700/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-15700/trainer_state.json" @@ -0,0 +1,4429 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 51.01892063492063, + "eval_steps": 500, + "global_step": 15700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015873015873015873, + "grad_norm": 9.738715171813965, + "learning_rate": 1.5862944162436548e-07, + "loss": 5.3155, + "step": 25 + }, + { + "epoch": 0.0031746031746031746, + "grad_norm": 32.81618881225586, + "learning_rate": 3.1725888324873095e-07, + "loss": 5.0547, + "step": 50 + }, + { + "epoch": 0.004761904761904762, + "grad_norm": 32.67608642578125, + "learning_rate": 4.7588832487309643e-07, + "loss": 5.1135, + "step": 75 + }, + { + "epoch": 0.006349206349206349, + "grad_norm": 10.283950805664062, + "learning_rate": 6.345177664974619e-07, + "loss": 4.9131, + "step": 100 + }, + { + "epoch": 0.007936507936507936, + "grad_norm": 21.210186004638672, + "learning_rate": 7.931472081218275e-07, + "loss": 4.9171, + "step": 125 + }, + { + "epoch": 0.009523809523809525, + "grad_norm": 48.682411193847656, + "learning_rate": 9.517766497461929e-07, + "loss": 4.993, + "step": 150 + }, + { + "epoch": 0.011111111111111112, + "grad_norm": 12.522156715393066, + "learning_rate": 1.1104060913705584e-06, + "loss": 4.966, + "step": 175 + }, + { + "epoch": 0.012698412698412698, + "grad_norm": 32.584293365478516, + "learning_rate": 1.2690355329949238e-06, + "loss": 4.7858, + "step": 200 + }, + { + "epoch": 0.014285714285714285, + "grad_norm": 22.508533477783203, + "learning_rate": 1.4276649746192894e-06, + "loss": 4.713, + "step": 225 + }, + { + "epoch": 0.015873015873015872, + "grad_norm": 10.486129760742188, + "learning_rate": 1.586294416243655e-06, + "loss": 4.7809, + "step": 250 + }, + { + "epoch": 0.01746031746031746, + "grad_norm": 10.104169845581055, + "learning_rate": 1.7449238578680206e-06, + "loss": 4.8488, + "step": 275 + }, + { + "epoch": 0.01904761904761905, + "grad_norm": 20.557931900024414, + "learning_rate": 1.9035532994923857e-06, + "loss": 4.5276, + "step": 300 + }, + { + "epoch": 1.0014603174603174, + "grad_norm": 14.145257949829102, + "learning_rate": 2.0621827411167515e-06, + "loss": 4.641, + "step": 325 + }, + { + "epoch": 1.003047619047619, + "grad_norm": 11.256460189819336, + "learning_rate": 2.220812182741117e-06, + "loss": 4.4173, + "step": 350 + }, + { + "epoch": 1.0046349206349205, + "grad_norm": 13.009560585021973, + "learning_rate": 2.3794416243654827e-06, + "loss": 4.2029, + "step": 375 + }, + { + "epoch": 1.0062222222222221, + "grad_norm": 10.433947563171387, + "learning_rate": 2.5380710659898476e-06, + "loss": 4.0364, + "step": 400 + }, + { + "epoch": 1.0078095238095237, + "grad_norm": 7.473550319671631, + "learning_rate": 2.6967005076142134e-06, + "loss": 3.8754, + "step": 425 + }, + { + "epoch": 1.0093968253968253, + "grad_norm": 12.957052230834961, + "learning_rate": 2.855329949238579e-06, + "loss": 4.0458, + "step": 450 + }, + { + "epoch": 1.0109841269841269, + "grad_norm": 9.960160255432129, + "learning_rate": 3.0139593908629446e-06, + "loss": 3.7578, + "step": 475 + }, + { + "epoch": 1.0125714285714287, + "grad_norm": 12.287276268005371, + "learning_rate": 3.17258883248731e-06, + "loss": 3.6334, + "step": 500 + }, + { + "epoch": 1.0141587301587303, + "grad_norm": 12.79332160949707, + "learning_rate": 3.3312182741116753e-06, + "loss": 3.4685, + "step": 525 + }, + { + "epoch": 1.0157460317460318, + "grad_norm": 8.90628719329834, + "learning_rate": 3.489847715736041e-06, + "loss": 3.3762, + "step": 550 + }, + { + "epoch": 1.0173333333333334, + "grad_norm": 12.635107040405273, + "learning_rate": 3.6484771573604065e-06, + "loss": 3.305, + "step": 575 + }, + { + "epoch": 1.018920634920635, + "grad_norm": 15.49641227722168, + "learning_rate": 3.8071065989847715e-06, + "loss": 2.9876, + "step": 600 + }, + { + "epoch": 2.001333333333333, + "grad_norm": 10.022441864013672, + "learning_rate": 3.965736040609137e-06, + "loss": 3.0157, + "step": 625 + }, + { + "epoch": 2.0029206349206348, + "grad_norm": 9.137686729431152, + "learning_rate": 4.124365482233503e-06, + "loss": 2.8705, + "step": 650 + }, + { + "epoch": 2.0045079365079363, + "grad_norm": 10.643122673034668, + "learning_rate": 4.282994923857868e-06, + "loss": 2.5582, + "step": 675 + }, + { + "epoch": 2.006095238095238, + "grad_norm": 8.754136085510254, + "learning_rate": 4.441624365482234e-06, + "loss": 2.419, + "step": 700 + }, + { + "epoch": 2.0076825396825395, + "grad_norm": 9.287137031555176, + "learning_rate": 4.6002538071066e-06, + "loss": 2.2705, + "step": 725 + }, + { + "epoch": 2.009269841269841, + "grad_norm": 10.788775444030762, + "learning_rate": 4.758883248730965e-06, + "loss": 2.2656, + "step": 750 + }, + { + "epoch": 2.0108571428571427, + "grad_norm": 9.800201416015625, + "learning_rate": 4.91751269035533e-06, + "loss": 1.9858, + "step": 775 + }, + { + "epoch": 2.0124444444444443, + "grad_norm": 10.444952011108398, + "learning_rate": 4.995989840930358e-06, + "loss": 1.8272, + "step": 800 + }, + { + "epoch": 2.014031746031746, + "grad_norm": 11.801770210266113, + "learning_rate": 4.987635342868601e-06, + "loss": 1.7213, + "step": 825 + }, + { + "epoch": 2.0156190476190474, + "grad_norm": 8.304800033569336, + "learning_rate": 4.979280844806844e-06, + "loss": 1.5774, + "step": 850 + }, + { + "epoch": 2.017206349206349, + "grad_norm": 10.936843872070312, + "learning_rate": 4.970926346745088e-06, + "loss": 1.5306, + "step": 875 + }, + { + "epoch": 2.0187936507936506, + "grad_norm": 12.08633804321289, + "learning_rate": 4.962571848683331e-06, + "loss": 1.4236, + "step": 900 + }, + { + "epoch": 3.001206349206349, + "grad_norm": 12.72133731842041, + "learning_rate": 4.954217350621575e-06, + "loss": 1.3777, + "step": 925 + }, + { + "epoch": 3.002793650793651, + "grad_norm": 7.71216344833374, + "learning_rate": 4.945862852559818e-06, + "loss": 1.3662, + "step": 950 + }, + { + "epoch": 3.0043809523809526, + "grad_norm": 4.924420356750488, + "learning_rate": 4.937508354498062e-06, + "loss": 1.1704, + "step": 975 + }, + { + "epoch": 3.005968253968254, + "grad_norm": 9.430765151977539, + "learning_rate": 4.9291538564363055e-06, + "loss": 1.1824, + "step": 1000 + }, + { + "epoch": 3.0075555555555558, + "grad_norm": 8.154838562011719, + "learning_rate": 4.9207993583745495e-06, + "loss": 1.1026, + "step": 1025 + }, + { + "epoch": 3.0091428571428573, + "grad_norm": 8.593589782714844, + "learning_rate": 4.912444860312793e-06, + "loss": 1.1226, + "step": 1050 + }, + { + "epoch": 3.010730158730159, + "grad_norm": 7.987609386444092, + "learning_rate": 4.904090362251037e-06, + "loss": 1.0897, + "step": 1075 + }, + { + "epoch": 3.0123174603174605, + "grad_norm": 6.220165252685547, + "learning_rate": 4.89573586418928e-06, + "loss": 1.0358, + "step": 1100 + }, + { + "epoch": 3.013904761904762, + "grad_norm": 5.584622383117676, + "learning_rate": 4.887381366127523e-06, + "loss": 1.0586, + "step": 1125 + }, + { + "epoch": 3.0154920634920637, + "grad_norm": 6.9964141845703125, + "learning_rate": 4.879026868065767e-06, + "loss": 1.0425, + "step": 1150 + }, + { + "epoch": 3.0170793650793652, + "grad_norm": 6.9891839027404785, + "learning_rate": 4.87067237000401e-06, + "loss": 1.0801, + "step": 1175 + }, + { + "epoch": 3.018666666666667, + "grad_norm": 5.334001541137695, + "learning_rate": 4.862317871942254e-06, + "loss": 1.071, + "step": 1200 + }, + { + "epoch": 4.001079365079365, + "grad_norm": 6.936366081237793, + "learning_rate": 4.853963373880497e-06, + "loss": 1.0287, + "step": 1225 + }, + { + "epoch": 4.002666666666666, + "grad_norm": 5.803761959075928, + "learning_rate": 4.845608875818741e-06, + "loss": 1.1338, + "step": 1250 + }, + { + "epoch": 4.004253968253968, + "grad_norm": 6.901465892791748, + "learning_rate": 4.837254377756984e-06, + "loss": 0.9522, + "step": 1275 + }, + { + "epoch": 4.0058412698412695, + "grad_norm": 7.466715335845947, + "learning_rate": 4.828899879695228e-06, + "loss": 0.9676, + "step": 1300 + }, + { + "epoch": 4.007428571428571, + "grad_norm": 5.247936248779297, + "learning_rate": 4.820545381633472e-06, + "loss": 0.9219, + "step": 1325 + }, + { + "epoch": 4.009015873015873, + "grad_norm": 9.886089324951172, + "learning_rate": 4.812190883571715e-06, + "loss": 0.9054, + "step": 1350 + }, + { + "epoch": 4.010603174603174, + "grad_norm": 6.104865550994873, + "learning_rate": 4.803836385509959e-06, + "loss": 0.9142, + "step": 1375 + }, + { + "epoch": 4.012190476190476, + "grad_norm": 7.953219413757324, + "learning_rate": 4.7954818874482025e-06, + "loss": 0.899, + "step": 1400 + }, + { + "epoch": 4.0137777777777774, + "grad_norm": 6.037745475769043, + "learning_rate": 4.7871273893864465e-06, + "loss": 0.9195, + "step": 1425 + }, + { + "epoch": 4.015365079365079, + "grad_norm": 5.599011421203613, + "learning_rate": 4.77877289132469e-06, + "loss": 0.9049, + "step": 1450 + }, + { + "epoch": 4.016952380952381, + "grad_norm": 9.971386909484863, + "learning_rate": 4.770418393262934e-06, + "loss": 0.9406, + "step": 1475 + }, + { + "epoch": 4.018539682539682, + "grad_norm": 5.549421310424805, + "learning_rate": 4.762063895201177e-06, + "loss": 0.9306, + "step": 1500 + }, + { + "epoch": 5.000952380952381, + "grad_norm": 8.038061141967773, + "learning_rate": 4.753709397139421e-06, + "loss": 0.9159, + "step": 1525 + }, + { + "epoch": 5.002539682539682, + "grad_norm": 4.798451900482178, + "learning_rate": 4.745354899077664e-06, + "loss": 0.9853, + "step": 1550 + }, + { + "epoch": 5.004126984126984, + "grad_norm": 5.497593402862549, + "learning_rate": 4.737000401015907e-06, + "loss": 0.8753, + "step": 1575 + }, + { + "epoch": 5.005714285714285, + "grad_norm": 8.052043914794922, + "learning_rate": 4.728645902954151e-06, + "loss": 0.8879, + "step": 1600 + }, + { + "epoch": 5.007301587301587, + "grad_norm": 10.569701194763184, + "learning_rate": 4.720291404892394e-06, + "loss": 0.8254, + "step": 1625 + }, + { + "epoch": 5.0088888888888885, + "grad_norm": 7.724920749664307, + "learning_rate": 4.711936906830638e-06, + "loss": 0.7917, + "step": 1650 + }, + { + "epoch": 5.01047619047619, + "grad_norm": 6.411539554595947, + "learning_rate": 4.703582408768881e-06, + "loss": 0.8296, + "step": 1675 + }, + { + "epoch": 5.012063492063492, + "grad_norm": 4.88054084777832, + "learning_rate": 4.695227910707125e-06, + "loss": 0.8315, + "step": 1700 + }, + { + "epoch": 5.013650793650793, + "grad_norm": 8.332351684570312, + "learning_rate": 4.6868734126453685e-06, + "loss": 0.8323, + "step": 1725 + }, + { + "epoch": 5.015238095238095, + "grad_norm": 5.097577095031738, + "learning_rate": 4.6785189145836124e-06, + "loss": 0.8122, + "step": 1750 + }, + { + "epoch": 5.016825396825396, + "grad_norm": 9.592188835144043, + "learning_rate": 4.670164416521856e-06, + "loss": 0.8712, + "step": 1775 + }, + { + "epoch": 5.018412698412698, + "grad_norm": 7.307371616363525, + "learning_rate": 4.6618099184601e-06, + "loss": 0.8648, + "step": 1800 + }, + { + "epoch": 6.000825396825396, + "grad_norm": 7.067652702331543, + "learning_rate": 4.653455420398343e-06, + "loss": 0.7985, + "step": 1825 + }, + { + "epoch": 6.002412698412698, + "grad_norm": 6.129504203796387, + "learning_rate": 4.645100922336586e-06, + "loss": 0.9276, + "step": 1850 + }, + { + "epoch": 6.004, + "grad_norm": 5.0192742347717285, + "learning_rate": 4.63674642427483e-06, + "loss": 0.8118, + "step": 1875 + }, + { + "epoch": 6.005587301587302, + "grad_norm": 9.476181030273438, + "learning_rate": 4.628391926213073e-06, + "loss": 0.8063, + "step": 1900 + }, + { + "epoch": 6.007174603174604, + "grad_norm": 4.5081987380981445, + "learning_rate": 4.620037428151317e-06, + "loss": 0.7605, + "step": 1925 + }, + { + "epoch": 6.008761904761905, + "grad_norm": 7.077738285064697, + "learning_rate": 4.61168293008956e-06, + "loss": 0.7435, + "step": 1950 + }, + { + "epoch": 6.010349206349207, + "grad_norm": 5.30238151550293, + "learning_rate": 4.603328432027804e-06, + "loss": 0.7603, + "step": 1975 + }, + { + "epoch": 6.011936507936508, + "grad_norm": 6.851855754852295, + "learning_rate": 4.594973933966047e-06, + "loss": 0.7606, + "step": 2000 + }, + { + "epoch": 6.01352380952381, + "grad_norm": 7.457930564880371, + "learning_rate": 4.586619435904291e-06, + "loss": 0.7822, + "step": 2025 + }, + { + "epoch": 6.0151111111111115, + "grad_norm": 7.502450466156006, + "learning_rate": 4.578264937842534e-06, + "loss": 0.7487, + "step": 2050 + }, + { + "epoch": 6.016698412698413, + "grad_norm": 6.5313544273376465, + "learning_rate": 4.569910439780778e-06, + "loss": 0.7878, + "step": 2075 + }, + { + "epoch": 6.018285714285715, + "grad_norm": 7.514427661895752, + "learning_rate": 4.5615559417190215e-06, + "loss": 0.812, + "step": 2100 + }, + { + "epoch": 7.000698412698413, + "grad_norm": 5.095201015472412, + "learning_rate": 4.5532014436572655e-06, + "loss": 0.7322, + "step": 2125 + }, + { + "epoch": 7.002285714285715, + "grad_norm": 5.384045600891113, + "learning_rate": 4.5448469455955095e-06, + "loss": 0.8814, + "step": 2150 + }, + { + "epoch": 7.003873015873016, + "grad_norm": 4.990402698516846, + "learning_rate": 4.536492447533753e-06, + "loss": 0.7483, + "step": 2175 + }, + { + "epoch": 7.005460317460318, + "grad_norm": 6.337180137634277, + "learning_rate": 4.528137949471997e-06, + "loss": 0.742, + "step": 2200 + }, + { + "epoch": 7.007047619047619, + "grad_norm": 4.511834621429443, + "learning_rate": 4.51978345141024e-06, + "loss": 0.7482, + "step": 2225 + }, + { + "epoch": 7.008634920634921, + "grad_norm": 10.161249160766602, + "learning_rate": 4.511428953348484e-06, + "loss": 0.6561, + "step": 2250 + }, + { + "epoch": 7.010222222222223, + "grad_norm": 13.714274406433105, + "learning_rate": 4.503074455286727e-06, + "loss": 0.7306, + "step": 2275 + }, + { + "epoch": 7.011809523809524, + "grad_norm": 7.517003536224365, + "learning_rate": 4.494719957224971e-06, + "loss": 0.6942, + "step": 2300 + }, + { + "epoch": 7.013396825396826, + "grad_norm": 5.552587509155273, + "learning_rate": 4.486365459163214e-06, + "loss": 0.7466, + "step": 2325 + }, + { + "epoch": 7.014984126984127, + "grad_norm": 6.234260559082031, + "learning_rate": 4.478010961101457e-06, + "loss": 0.6913, + "step": 2350 + }, + { + "epoch": 7.016571428571429, + "grad_norm": 5.961171627044678, + "learning_rate": 4.469656463039701e-06, + "loss": 0.7444, + "step": 2375 + }, + { + "epoch": 7.0181587301587305, + "grad_norm": 9.553787231445312, + "learning_rate": 4.461301964977944e-06, + "loss": 0.7764, + "step": 2400 + }, + { + "epoch": 8.000571428571428, + "grad_norm": 6.736727237701416, + "learning_rate": 4.452947466916188e-06, + "loss": 0.6902, + "step": 2425 + }, + { + "epoch": 8.00215873015873, + "grad_norm": 6.498841285705566, + "learning_rate": 4.4445929688544314e-06, + "loss": 0.8205, + "step": 2450 + }, + { + "epoch": 8.003746031746031, + "grad_norm": 3.605954647064209, + "learning_rate": 4.4362384707926754e-06, + "loss": 0.7139, + "step": 2475 + }, + { + "epoch": 8.005333333333333, + "grad_norm": 4.930444240570068, + "learning_rate": 4.427883972730919e-06, + "loss": 0.7082, + "step": 2500 + }, + { + "epoch": 8.006920634920634, + "grad_norm": 4.654155731201172, + "learning_rate": 4.4195294746691626e-06, + "loss": 0.734, + "step": 2525 + }, + { + "epoch": 8.008507936507936, + "grad_norm": 4.657145977020264, + "learning_rate": 4.411174976607406e-06, + "loss": 0.6076, + "step": 2550 + }, + { + "epoch": 8.010095238095237, + "grad_norm": 5.5561418533325195, + "learning_rate": 4.402820478545649e-06, + "loss": 0.6882, + "step": 2575 + }, + { + "epoch": 8.011682539682539, + "grad_norm": 4.575371742248535, + "learning_rate": 4.394465980483893e-06, + "loss": 0.6572, + "step": 2600 + }, + { + "epoch": 8.01326984126984, + "grad_norm": 13.730400085449219, + "learning_rate": 4.386111482422136e-06, + "loss": 0.7209, + "step": 2625 + }, + { + "epoch": 8.014857142857142, + "grad_norm": 4.949292182922363, + "learning_rate": 4.37775698436038e-06, + "loss": 0.6433, + "step": 2650 + }, + { + "epoch": 8.016444444444444, + "grad_norm": 9.161581039428711, + "learning_rate": 4.369402486298623e-06, + "loss": 0.7117, + "step": 2675 + }, + { + "epoch": 8.018031746031745, + "grad_norm": 6.098261833190918, + "learning_rate": 4.361047988236867e-06, + "loss": 0.7515, + "step": 2700 + }, + { + "epoch": 9.000444444444444, + "grad_norm": 5.116198539733887, + "learning_rate": 4.35269349017511e-06, + "loss": 0.6484, + "step": 2725 + }, + { + "epoch": 9.002031746031745, + "grad_norm": 10.22689437866211, + "learning_rate": 4.344338992113354e-06, + "loss": 0.7807, + "step": 2750 + }, + { + "epoch": 9.003619047619047, + "grad_norm": 12.227742195129395, + "learning_rate": 4.335984494051597e-06, + "loss": 0.6763, + "step": 2775 + }, + { + "epoch": 9.005206349206349, + "grad_norm": 9.396242141723633, + "learning_rate": 4.327629995989841e-06, + "loss": 0.6722, + "step": 2800 + }, + { + "epoch": 9.00679365079365, + "grad_norm": 5.258615016937256, + "learning_rate": 4.3192754979280845e-06, + "loss": 0.7176, + "step": 2825 + }, + { + "epoch": 9.008380952380952, + "grad_norm": 5.422908306121826, + "learning_rate": 4.3109209998663285e-06, + "loss": 0.5881, + "step": 2850 + }, + { + "epoch": 9.009968253968253, + "grad_norm": 7.922283172607422, + "learning_rate": 4.302566501804572e-06, + "loss": 0.6492, + "step": 2875 + }, + { + "epoch": 9.011555555555555, + "grad_norm": 8.866413116455078, + "learning_rate": 4.294212003742816e-06, + "loss": 0.6277, + "step": 2900 + }, + { + "epoch": 9.013142857142856, + "grad_norm": 9.206356048583984, + "learning_rate": 4.285857505681059e-06, + "loss": 0.6644, + "step": 2925 + }, + { + "epoch": 9.014730158730158, + "grad_norm": 5.49714994430542, + "learning_rate": 4.277503007619303e-06, + "loss": 0.6481, + "step": 2950 + }, + { + "epoch": 9.01631746031746, + "grad_norm": 6.799171447753906, + "learning_rate": 4.269148509557547e-06, + "loss": 0.67, + "step": 2975 + }, + { + "epoch": 9.017904761904761, + "grad_norm": 7.201016902923584, + "learning_rate": 4.26079401149579e-06, + "loss": 0.7103, + "step": 3000 + }, + { + "epoch": 10.00031746031746, + "grad_norm": 6.763806343078613, + "learning_rate": 4.252439513434034e-06, + "loss": 0.6487, + "step": 3025 + }, + { + "epoch": 10.001904761904761, + "grad_norm": 9.263688087463379, + "learning_rate": 4.244085015372277e-06, + "loss": 0.7082, + "step": 3050 + }, + { + "epoch": 10.003492063492063, + "grad_norm": 11.184915542602539, + "learning_rate": 4.23573051731052e-06, + "loss": 0.6759, + "step": 3075 + }, + { + "epoch": 10.005079365079364, + "grad_norm": 4.252372741699219, + "learning_rate": 4.227376019248764e-06, + "loss": 0.66, + "step": 3100 + }, + { + "epoch": 10.006666666666666, + "grad_norm": 11.342703819274902, + "learning_rate": 4.219021521187007e-06, + "loss": 0.6747, + "step": 3125 + }, + { + "epoch": 10.008253968253968, + "grad_norm": 5.704590797424316, + "learning_rate": 4.210667023125251e-06, + "loss": 0.5862, + "step": 3150 + }, + { + "epoch": 10.009841269841269, + "grad_norm": 5.683150291442871, + "learning_rate": 4.2023125250634944e-06, + "loss": 0.6129, + "step": 3175 + }, + { + "epoch": 10.01142857142857, + "grad_norm": 4.855335235595703, + "learning_rate": 4.193958027001738e-06, + "loss": 0.604, + "step": 3200 + }, + { + "epoch": 10.013015873015872, + "grad_norm": 7.8647613525390625, + "learning_rate": 4.1856035289399816e-06, + "loss": 0.6367, + "step": 3225 + }, + { + "epoch": 10.014603174603174, + "grad_norm": 6.591641902923584, + "learning_rate": 4.1772490308782256e-06, + "loss": 0.6371, + "step": 3250 + }, + { + "epoch": 10.016190476190475, + "grad_norm": 7.047679424285889, + "learning_rate": 4.168894532816469e-06, + "loss": 0.6433, + "step": 3275 + }, + { + "epoch": 10.017777777777777, + "grad_norm": 7.674762725830078, + "learning_rate": 4.160540034754712e-06, + "loss": 0.6696, + "step": 3300 + }, + { + "epoch": 11.000190476190475, + "grad_norm": 7.130011081695557, + "learning_rate": 4.152185536692956e-06, + "loss": 0.6527, + "step": 3325 + }, + { + "epoch": 11.001777777777777, + "grad_norm": 7.317767143249512, + "learning_rate": 4.143831038631199e-06, + "loss": 0.6586, + "step": 3350 + }, + { + "epoch": 11.003365079365079, + "grad_norm": 2.615405321121216, + "learning_rate": 4.135476540569443e-06, + "loss": 0.6771, + "step": 3375 + }, + { + "epoch": 11.00495238095238, + "grad_norm": 4.891953468322754, + "learning_rate": 4.127122042507686e-06, + "loss": 0.64, + "step": 3400 + }, + { + "epoch": 11.006539682539682, + "grad_norm": 6.664401531219482, + "learning_rate": 4.11876754444593e-06, + "loss": 0.6411, + "step": 3425 + }, + { + "epoch": 11.008126984126983, + "grad_norm": 6.38748836517334, + "learning_rate": 4.110413046384173e-06, + "loss": 0.5616, + "step": 3450 + }, + { + "epoch": 11.009714285714285, + "grad_norm": 6.957112789154053, + "learning_rate": 4.102058548322417e-06, + "loss": 0.5956, + "step": 3475 + }, + { + "epoch": 11.011301587301586, + "grad_norm": 4.554030895233154, + "learning_rate": 4.09370405026066e-06, + "loss": 0.5998, + "step": 3500 + }, + { + "epoch": 11.012888888888888, + "grad_norm": 5.208797454833984, + "learning_rate": 4.085349552198904e-06, + "loss": 0.616, + "step": 3525 + }, + { + "epoch": 11.01447619047619, + "grad_norm": 6.402866840362549, + "learning_rate": 4.0769950541371475e-06, + "loss": 0.5958, + "step": 3550 + }, + { + "epoch": 11.016063492063491, + "grad_norm": 5.4900946617126465, + "learning_rate": 4.0686405560753915e-06, + "loss": 0.6173, + "step": 3575 + }, + { + "epoch": 11.017650793650793, + "grad_norm": 9.454499244689941, + "learning_rate": 4.060286058013635e-06, + "loss": 0.6567, + "step": 3600 + }, + { + "epoch": 12.000063492063491, + "grad_norm": 9.544127464294434, + "learning_rate": 4.051931559951879e-06, + "loss": 0.6164, + "step": 3625 + }, + { + "epoch": 12.001650793650793, + "grad_norm": 5.444927215576172, + "learning_rate": 4.043577061890122e-06, + "loss": 0.6563, + "step": 3650 + }, + { + "epoch": 12.003238095238094, + "grad_norm": 3.2568845748901367, + "learning_rate": 4.035222563828366e-06, + "loss": 0.6631, + "step": 3675 + }, + { + "epoch": 12.004825396825396, + "grad_norm": 11.233345031738281, + "learning_rate": 4.026868065766609e-06, + "loss": 0.6186, + "step": 3700 + }, + { + "epoch": 12.006412698412698, + "grad_norm": 5.101284027099609, + "learning_rate": 4.018513567704853e-06, + "loss": 0.6307, + "step": 3725 + }, + { + "epoch": 12.008, + "grad_norm": 7.161935329437256, + "learning_rate": 4.010159069643096e-06, + "loss": 0.5363, + "step": 3750 + }, + { + "epoch": 12.0095873015873, + "grad_norm": 8.10970401763916, + "learning_rate": 4.00180457158134e-06, + "loss": 0.5729, + "step": 3775 + }, + { + "epoch": 12.011174603174604, + "grad_norm": 6.0759077072143555, + "learning_rate": 3.993450073519583e-06, + "loss": 0.5785, + "step": 3800 + }, + { + "epoch": 12.012761904761906, + "grad_norm": 11.944267272949219, + "learning_rate": 3.985095575457827e-06, + "loss": 0.5812, + "step": 3825 + }, + { + "epoch": 12.014349206349207, + "grad_norm": 6.059834957122803, + "learning_rate": 3.97674107739607e-06, + "loss": 0.5958, + "step": 3850 + }, + { + "epoch": 12.015936507936509, + "grad_norm": 5.849289417266846, + "learning_rate": 3.968386579334314e-06, + "loss": 0.6121, + "step": 3875 + }, + { + "epoch": 12.01752380952381, + "grad_norm": 5.543300628662109, + "learning_rate": 3.960032081272557e-06, + "loss": 0.6259, + "step": 3900 + }, + { + "epoch": 12.019111111111112, + "grad_norm": 5.864200592041016, + "learning_rate": 3.951677583210801e-06, + "loss": 0.597, + "step": 3925 + }, + { + "epoch": 13.00152380952381, + "grad_norm": 9.676383972167969, + "learning_rate": 3.9433230851490445e-06, + "loss": 0.6586, + "step": 3950 + }, + { + "epoch": 13.003111111111112, + "grad_norm": 11.184745788574219, + "learning_rate": 3.9349685870872885e-06, + "loss": 0.644, + "step": 3975 + }, + { + "epoch": 13.004698412698414, + "grad_norm": 6.4422502517700195, + "learning_rate": 3.926614089025532e-06, + "loss": 0.5813, + "step": 4000 + }, + { + "epoch": 13.006285714285715, + "grad_norm": 5.095337390899658, + "learning_rate": 3.918259590963775e-06, + "loss": 0.6238, + "step": 4025 + }, + { + "epoch": 13.007873015873017, + "grad_norm": 7.0134663581848145, + "learning_rate": 3.909905092902019e-06, + "loss": 0.5323, + "step": 4050 + }, + { + "epoch": 13.009460317460318, + "grad_norm": 11.49138069152832, + "learning_rate": 3.901550594840262e-06, + "loss": 0.5527, + "step": 4075 + }, + { + "epoch": 13.01104761904762, + "grad_norm": 5.017846584320068, + "learning_rate": 3.893196096778506e-06, + "loss": 0.5561, + "step": 4100 + }, + { + "epoch": 13.012634920634921, + "grad_norm": 6.452044486999512, + "learning_rate": 3.884841598716749e-06, + "loss": 0.5796, + "step": 4125 + }, + { + "epoch": 13.014222222222223, + "grad_norm": 4.59455680847168, + "learning_rate": 3.876487100654993e-06, + "loss": 0.5686, + "step": 4150 + }, + { + "epoch": 13.015809523809525, + "grad_norm": 6.177151203155518, + "learning_rate": 3.868132602593236e-06, + "loss": 0.5974, + "step": 4175 + }, + { + "epoch": 13.017396825396826, + "grad_norm": 8.819549560546875, + "learning_rate": 3.85977810453148e-06, + "loss": 0.6186, + "step": 4200 + }, + { + "epoch": 13.018984126984128, + "grad_norm": 4.838363170623779, + "learning_rate": 3.851423606469723e-06, + "loss": 0.5783, + "step": 4225 + }, + { + "epoch": 14.001396825396826, + "grad_norm": 6.992326736450195, + "learning_rate": 3.843069108407967e-06, + "loss": 0.6319, + "step": 4250 + }, + { + "epoch": 14.002984126984128, + "grad_norm": 6.273831367492676, + "learning_rate": 3.8347146103462105e-06, + "loss": 0.6446, + "step": 4275 + }, + { + "epoch": 14.00457142857143, + "grad_norm": 5.901345252990723, + "learning_rate": 3.826360112284454e-06, + "loss": 0.5465, + "step": 4300 + }, + { + "epoch": 14.006158730158731, + "grad_norm": 9.980649948120117, + "learning_rate": 3.818005614222698e-06, + "loss": 0.6371, + "step": 4325 + }, + { + "epoch": 14.007746031746033, + "grad_norm": 3.479801654815674, + "learning_rate": 3.809651116160941e-06, + "loss": 0.5216, + "step": 4350 + }, + { + "epoch": 14.009333333333334, + "grad_norm": 7.416255950927734, + "learning_rate": 3.801296618099185e-06, + "loss": 0.5298, + "step": 4375 + }, + { + "epoch": 14.010920634920636, + "grad_norm": 6.53993034362793, + "learning_rate": 3.7929421200374283e-06, + "loss": 0.5488, + "step": 4400 + }, + { + "epoch": 14.012507936507937, + "grad_norm": 5.593329906463623, + "learning_rate": 3.7845876219756723e-06, + "loss": 0.5506, + "step": 4425 + }, + { + "epoch": 14.014095238095239, + "grad_norm": 5.567030906677246, + "learning_rate": 3.7762331239139154e-06, + "loss": 0.5616, + "step": 4450 + }, + { + "epoch": 14.01568253968254, + "grad_norm": 11.033809661865234, + "learning_rate": 3.7678786258521594e-06, + "loss": 0.5849, + "step": 4475 + }, + { + "epoch": 14.017269841269842, + "grad_norm": 5.325310707092285, + "learning_rate": 3.7595241277904026e-06, + "loss": 0.5911, + "step": 4500 + }, + { + "epoch": 14.018857142857144, + "grad_norm": 10.827190399169922, + "learning_rate": 3.751169629728646e-06, + "loss": 0.5941, + "step": 4525 + }, + { + "epoch": 15.001269841269842, + "grad_norm": 6.941823482513428, + "learning_rate": 3.7428151316668897e-06, + "loss": 0.6097, + "step": 4550 + }, + { + "epoch": 15.002857142857144, + "grad_norm": 11.185466766357422, + "learning_rate": 3.7344606336051333e-06, + "loss": 0.6295, + "step": 4575 + }, + { + "epoch": 15.004444444444445, + "grad_norm": 7.75522518157959, + "learning_rate": 3.726106135543377e-06, + "loss": 0.5308, + "step": 4600 + }, + { + "epoch": 15.006031746031747, + "grad_norm": 5.286986827850342, + "learning_rate": 3.7177516374816204e-06, + "loss": 0.6022, + "step": 4625 + }, + { + "epoch": 15.007619047619048, + "grad_norm": 6.149432182312012, + "learning_rate": 3.709397139419864e-06, + "loss": 0.5457, + "step": 4650 + }, + { + "epoch": 15.00920634920635, + "grad_norm": 7.458939552307129, + "learning_rate": 3.7010426413581075e-06, + "loss": 0.5073, + "step": 4675 + }, + { + "epoch": 15.010793650793651, + "grad_norm": 5.971858024597168, + "learning_rate": 3.692688143296351e-06, + "loss": 0.5418, + "step": 4700 + }, + { + "epoch": 15.012380952380953, + "grad_norm": 5.619646072387695, + "learning_rate": 3.6843336452345947e-06, + "loss": 0.5423, + "step": 4725 + }, + { + "epoch": 15.013968253968255, + "grad_norm": 6.382497787475586, + "learning_rate": 3.675979147172838e-06, + "loss": 0.5446, + "step": 4750 + }, + { + "epoch": 15.015555555555556, + "grad_norm": 6.406772136688232, + "learning_rate": 3.667624649111082e-06, + "loss": 0.5734, + "step": 4775 + }, + { + "epoch": 15.017142857142858, + "grad_norm": 7.28453254699707, + "learning_rate": 3.659270151049325e-06, + "loss": 0.5786, + "step": 4800 + }, + { + "epoch": 15.01873015873016, + "grad_norm": 9.923730850219727, + "learning_rate": 3.650915652987569e-06, + "loss": 0.5791, + "step": 4825 + }, + { + "epoch": 16.001142857142856, + "grad_norm": 8.251771926879883, + "learning_rate": 3.642561154925812e-06, + "loss": 0.5688, + "step": 4850 + }, + { + "epoch": 16.00273015873016, + "grad_norm": 7.045546054840088, + "learning_rate": 3.634206656864056e-06, + "loss": 0.6337, + "step": 4875 + }, + { + "epoch": 16.00431746031746, + "grad_norm": 8.073708534240723, + "learning_rate": 3.625852158802299e-06, + "loss": 0.5459, + "step": 4900 + }, + { + "epoch": 16.005904761904763, + "grad_norm": 4.836098670959473, + "learning_rate": 3.617497660740543e-06, + "loss": 0.5852, + "step": 4925 + }, + { + "epoch": 16.007492063492062, + "grad_norm": 6.777514457702637, + "learning_rate": 3.6091431626787863e-06, + "loss": 0.5408, + "step": 4950 + }, + { + "epoch": 16.009079365079366, + "grad_norm": 4.61210298538208, + "learning_rate": 3.6007886646170303e-06, + "loss": 0.4816, + "step": 4975 + }, + { + "epoch": 16.010666666666665, + "grad_norm": 6.172779083251953, + "learning_rate": 3.5924341665552735e-06, + "loss": 0.528, + "step": 5000 + }, + { + "epoch": 16.01225396825397, + "grad_norm": 5.35014009475708, + "learning_rate": 3.584079668493517e-06, + "loss": 0.5443, + "step": 5025 + }, + { + "epoch": 16.01384126984127, + "grad_norm": 7.521047115325928, + "learning_rate": 3.575725170431761e-06, + "loss": 0.5411, + "step": 5050 + }, + { + "epoch": 16.015428571428572, + "grad_norm": 3.7705650329589844, + "learning_rate": 3.567370672370004e-06, + "loss": 0.555, + "step": 5075 + }, + { + "epoch": 16.017015873015872, + "grad_norm": 4.953466892242432, + "learning_rate": 3.559016174308248e-06, + "loss": 0.5691, + "step": 5100 + }, + { + "epoch": 16.018603174603175, + "grad_norm": 9.339215278625488, + "learning_rate": 3.5506616762464913e-06, + "loss": 0.5754, + "step": 5125 + }, + { + "epoch": 17.001015873015874, + "grad_norm": 6.013854503631592, + "learning_rate": 3.5423071781847353e-06, + "loss": 0.5407, + "step": 5150 + }, + { + "epoch": 17.002603174603173, + "grad_norm": 6.144811630249023, + "learning_rate": 3.5339526801229784e-06, + "loss": 0.6193, + "step": 5175 + }, + { + "epoch": 17.004190476190477, + "grad_norm": 4.639403820037842, + "learning_rate": 3.5255981820612224e-06, + "loss": 0.5446, + "step": 5200 + }, + { + "epoch": 17.005777777777777, + "grad_norm": 3.8471908569335938, + "learning_rate": 3.5172436839994656e-06, + "loss": 0.5891, + "step": 5225 + }, + { + "epoch": 17.00736507936508, + "grad_norm": 4.270881175994873, + "learning_rate": 3.5088891859377087e-06, + "loss": 0.5323, + "step": 5250 + }, + { + "epoch": 17.00895238095238, + "grad_norm": 6.322847366333008, + "learning_rate": 3.5005346878759527e-06, + "loss": 0.4589, + "step": 5275 + }, + { + "epoch": 17.010539682539683, + "grad_norm": 18.737030029296875, + "learning_rate": 3.492180189814196e-06, + "loss": 0.5348, + "step": 5300 + }, + { + "epoch": 17.012126984126983, + "grad_norm": 3.4430785179138184, + "learning_rate": 3.48382569175244e-06, + "loss": 0.5397, + "step": 5325 + }, + { + "epoch": 17.013714285714286, + "grad_norm": 6.301079273223877, + "learning_rate": 3.4754711936906834e-06, + "loss": 0.5254, + "step": 5350 + }, + { + "epoch": 17.015301587301586, + "grad_norm": 10.843756675720215, + "learning_rate": 3.467116695628927e-06, + "loss": 0.5535, + "step": 5375 + }, + { + "epoch": 17.01688888888889, + "grad_norm": 9.099514961242676, + "learning_rate": 3.4587621975671705e-06, + "loss": 0.5458, + "step": 5400 + }, + { + "epoch": 17.01847619047619, + "grad_norm": 5.591258525848389, + "learning_rate": 3.450407699505414e-06, + "loss": 0.5715, + "step": 5425 + }, + { + "epoch": 18.000888888888888, + "grad_norm": 7.137011528015137, + "learning_rate": 3.4420532014436577e-06, + "loss": 0.5219, + "step": 5450 + }, + { + "epoch": 18.00247619047619, + "grad_norm": 9.867881774902344, + "learning_rate": 3.4336987033819012e-06, + "loss": 0.6136, + "step": 5475 + }, + { + "epoch": 18.00406349206349, + "grad_norm": 11.723578453063965, + "learning_rate": 3.4253442053201448e-06, + "loss": 0.5442, + "step": 5500 + }, + { + "epoch": 18.005650793650794, + "grad_norm": 5.498622417449951, + "learning_rate": 3.416989707258388e-06, + "loss": 0.5691, + "step": 5525 + }, + { + "epoch": 18.007238095238094, + "grad_norm": 5.055809020996094, + "learning_rate": 3.408635209196632e-06, + "loss": 0.5105, + "step": 5550 + }, + { + "epoch": 18.008825396825397, + "grad_norm": 3.1195857524871826, + "learning_rate": 3.400280711134875e-06, + "loss": 0.4747, + "step": 5575 + }, + { + "epoch": 18.010412698412697, + "grad_norm": 11.140824317932129, + "learning_rate": 3.391926213073119e-06, + "loss": 0.5209, + "step": 5600 + }, + { + "epoch": 18.012, + "grad_norm": 6.157780647277832, + "learning_rate": 3.383571715011362e-06, + "loss": 0.534, + "step": 5625 + }, + { + "epoch": 18.0135873015873, + "grad_norm": 4.913928031921387, + "learning_rate": 3.375217216949606e-06, + "loss": 0.5126, + "step": 5650 + }, + { + "epoch": 18.015174603174604, + "grad_norm": 7.3261895179748535, + "learning_rate": 3.3668627188878493e-06, + "loss": 0.5411, + "step": 5675 + }, + { + "epoch": 18.016761904761903, + "grad_norm": 4.676502227783203, + "learning_rate": 3.3585082208260933e-06, + "loss": 0.5322, + "step": 5700 + }, + { + "epoch": 18.018349206349207, + "grad_norm": 8.648612976074219, + "learning_rate": 3.3501537227643365e-06, + "loss": 0.5704, + "step": 5725 + }, + { + "epoch": 19.000761904761905, + "grad_norm": 10.17095947265625, + "learning_rate": 3.34179922470258e-06, + "loss": 0.5118, + "step": 5750 + }, + { + "epoch": 19.002349206349205, + "grad_norm": 4.843137264251709, + "learning_rate": 3.3334447266408236e-06, + "loss": 0.6025, + "step": 5775 + }, + { + "epoch": 19.00393650793651, + "grad_norm": 8.146428108215332, + "learning_rate": 3.325090228579067e-06, + "loss": 0.5498, + "step": 5800 + }, + { + "epoch": 19.005523809523808, + "grad_norm": 9.00712776184082, + "learning_rate": 3.3167357305173107e-06, + "loss": 0.5468, + "step": 5825 + }, + { + "epoch": 19.00711111111111, + "grad_norm": 4.855189800262451, + "learning_rate": 3.3083812324555543e-06, + "loss": 0.5228, + "step": 5850 + }, + { + "epoch": 19.00869841269841, + "grad_norm": 4.557061672210693, + "learning_rate": 3.300026734393798e-06, + "loss": 0.4595, + "step": 5875 + }, + { + "epoch": 19.010285714285715, + "grad_norm": 8.567035675048828, + "learning_rate": 3.2916722363320414e-06, + "loss": 0.519, + "step": 5900 + }, + { + "epoch": 19.011873015873014, + "grad_norm": 26.754615783691406, + "learning_rate": 3.2833177382702854e-06, + "loss": 0.5212, + "step": 5925 + }, + { + "epoch": 19.013460317460318, + "grad_norm": 4.910426139831543, + "learning_rate": 3.2749632402085285e-06, + "loss": 0.5025, + "step": 5950 + }, + { + "epoch": 19.015047619047618, + "grad_norm": 11.170868873596191, + "learning_rate": 3.2666087421467717e-06, + "loss": 0.5109, + "step": 5975 + }, + { + "epoch": 19.01663492063492, + "grad_norm": 8.7157564163208, + "learning_rate": 3.2582542440850157e-06, + "loss": 0.5374, + "step": 6000 + }, + { + "epoch": 19.01822222222222, + "grad_norm": 5.3223700523376465, + "learning_rate": 3.249899746023259e-06, + "loss": 0.5608, + "step": 6025 + }, + { + "epoch": 20.00063492063492, + "grad_norm": 15.868850708007812, + "learning_rate": 3.241545247961503e-06, + "loss": 0.5028, + "step": 6050 + }, + { + "epoch": 20.002222222222223, + "grad_norm": 7.932621955871582, + "learning_rate": 3.233190749899746e-06, + "loss": 0.6152, + "step": 6075 + }, + { + "epoch": 20.003809523809522, + "grad_norm": 10.044479370117188, + "learning_rate": 3.22483625183799e-06, + "loss": 0.5187, + "step": 6100 + }, + { + "epoch": 20.005396825396826, + "grad_norm": 7.519008159637451, + "learning_rate": 3.216481753776233e-06, + "loss": 0.5333, + "step": 6125 + }, + { + "epoch": 20.006984126984126, + "grad_norm": 4.2018866539001465, + "learning_rate": 3.208127255714477e-06, + "loss": 0.546, + "step": 6150 + }, + { + "epoch": 20.00857142857143, + "grad_norm": 6.87973690032959, + "learning_rate": 3.1997727576527206e-06, + "loss": 0.4432, + "step": 6175 + }, + { + "epoch": 20.01015873015873, + "grad_norm": 6.215482711791992, + "learning_rate": 3.191418259590964e-06, + "loss": 0.5015, + "step": 6200 + }, + { + "epoch": 20.011746031746032, + "grad_norm": 5.1478753089904785, + "learning_rate": 3.1830637615292078e-06, + "loss": 0.5034, + "step": 6225 + }, + { + "epoch": 20.013333333333332, + "grad_norm": 3.017598867416382, + "learning_rate": 3.174709263467451e-06, + "loss": 0.5259, + "step": 6250 + }, + { + "epoch": 20.014920634920635, + "grad_norm": 9.40729808807373, + "learning_rate": 3.166354765405695e-06, + "loss": 0.4903, + "step": 6275 + }, + { + "epoch": 20.016507936507935, + "grad_norm": 10.465718269348145, + "learning_rate": 3.158000267343938e-06, + "loss": 0.5392, + "step": 6300 + }, + { + "epoch": 20.01809523809524, + "grad_norm": 5.032984733581543, + "learning_rate": 3.149645769282182e-06, + "loss": 0.5571, + "step": 6325 + }, + { + "epoch": 21.000507936507937, + "grad_norm": 9.124772071838379, + "learning_rate": 3.141291271220425e-06, + "loss": 0.4855, + "step": 6350 + }, + { + "epoch": 21.002095238095237, + "grad_norm": 5.090267181396484, + "learning_rate": 3.132936773158669e-06, + "loss": 0.5912, + "step": 6375 + }, + { + "epoch": 21.00368253968254, + "grad_norm": 7.209656238555908, + "learning_rate": 3.1245822750969123e-06, + "loss": 0.5315, + "step": 6400 + }, + { + "epoch": 21.00526984126984, + "grad_norm": 10.766797065734863, + "learning_rate": 3.1162277770351563e-06, + "loss": 0.5364, + "step": 6425 + }, + { + "epoch": 21.006857142857143, + "grad_norm": 9.655423164367676, + "learning_rate": 3.1078732789733994e-06, + "loss": 0.5469, + "step": 6450 + }, + { + "epoch": 21.008444444444443, + "grad_norm": 9.706192970275879, + "learning_rate": 3.099518780911643e-06, + "loss": 0.4377, + "step": 6475 + }, + { + "epoch": 21.010031746031746, + "grad_norm": 5.6594343185424805, + "learning_rate": 3.0911642828498866e-06, + "loss": 0.4867, + "step": 6500 + }, + { + "epoch": 21.011619047619046, + "grad_norm": 9.920619010925293, + "learning_rate": 3.08280978478813e-06, + "loss": 0.497, + "step": 6525 + }, + { + "epoch": 21.01320634920635, + "grad_norm": 4.976430892944336, + "learning_rate": 3.0744552867263737e-06, + "loss": 0.4949, + "step": 6550 + }, + { + "epoch": 21.01479365079365, + "grad_norm": 7.563751697540283, + "learning_rate": 3.0661007886646173e-06, + "loss": 0.5136, + "step": 6575 + }, + { + "epoch": 21.016380952380953, + "grad_norm": 5.668909072875977, + "learning_rate": 3.057746290602861e-06, + "loss": 0.5097, + "step": 6600 + }, + { + "epoch": 21.017968253968252, + "grad_norm": 6.428191661834717, + "learning_rate": 3.0493917925411044e-06, + "loss": 0.5594, + "step": 6625 + }, + { + "epoch": 22.00038095238095, + "grad_norm": 6.726556301116943, + "learning_rate": 3.041037294479348e-06, + "loss": 0.4874, + "step": 6650 + }, + { + "epoch": 22.001968253968254, + "grad_norm": 14.118648529052734, + "learning_rate": 3.0326827964175915e-06, + "loss": 0.5642, + "step": 6675 + }, + { + "epoch": 22.003555555555554, + "grad_norm": 11.719437599182129, + "learning_rate": 3.024328298355835e-06, + "loss": 0.5228, + "step": 6700 + }, + { + "epoch": 22.005142857142857, + "grad_norm": 7.100255489349365, + "learning_rate": 3.0159738002940787e-06, + "loss": 0.5289, + "step": 6725 + }, + { + "epoch": 22.006730158730157, + "grad_norm": 15.27164363861084, + "learning_rate": 3.007619302232322e-06, + "loss": 0.5544, + "step": 6750 + }, + { + "epoch": 22.00831746031746, + "grad_norm": 6.844352722167969, + "learning_rate": 2.999264804170566e-06, + "loss": 0.4507, + "step": 6775 + }, + { + "epoch": 22.00990476190476, + "grad_norm": 5.465493202209473, + "learning_rate": 2.990910306108809e-06, + "loss": 0.4717, + "step": 6800 + }, + { + "epoch": 22.011492063492064, + "grad_norm": 6.300055503845215, + "learning_rate": 2.982555808047053e-06, + "loss": 0.4833, + "step": 6825 + }, + { + "epoch": 22.013079365079363, + "grad_norm": 3.9687702655792236, + "learning_rate": 2.974201309985296e-06, + "loss": 0.4925, + "step": 6850 + }, + { + "epoch": 22.014666666666667, + "grad_norm": 5.950267791748047, + "learning_rate": 2.96584681192354e-06, + "loss": 0.5093, + "step": 6875 + }, + { + "epoch": 22.016253968253967, + "grad_norm": 7.3085618019104, + "learning_rate": 2.957492313861783e-06, + "loss": 0.5095, + "step": 6900 + }, + { + "epoch": 22.01784126984127, + "grad_norm": 10.438004493713379, + "learning_rate": 2.949137815800027e-06, + "loss": 0.5509, + "step": 6925 + }, + { + "epoch": 23.00025396825397, + "grad_norm": 5.851992130279541, + "learning_rate": 2.9407833177382703e-06, + "loss": 0.4989, + "step": 6950 + }, + { + "epoch": 23.001841269841268, + "grad_norm": 5.808182716369629, + "learning_rate": 2.932428819676514e-06, + "loss": 0.531, + "step": 6975 + }, + { + "epoch": 23.00342857142857, + "grad_norm": 4.814669132232666, + "learning_rate": 2.924074321614758e-06, + "loss": 0.5413, + "step": 7000 + }, + { + "epoch": 23.00501587301587, + "grad_norm": 7.406203269958496, + "learning_rate": 2.915719823553001e-06, + "loss": 0.5223, + "step": 7025 + }, + { + "epoch": 23.006603174603175, + "grad_norm": 4.7713942527771, + "learning_rate": 2.907365325491245e-06, + "loss": 0.5251, + "step": 7050 + }, + { + "epoch": 23.008190476190475, + "grad_norm": 4.403865814208984, + "learning_rate": 2.899010827429488e-06, + "loss": 0.46, + "step": 7075 + }, + { + "epoch": 23.009777777777778, + "grad_norm": 5.674661636352539, + "learning_rate": 2.890656329367732e-06, + "loss": 0.4705, + "step": 7100 + }, + { + "epoch": 23.011365079365078, + "grad_norm": 7.83860445022583, + "learning_rate": 2.8823018313059753e-06, + "loss": 0.4906, + "step": 7125 + }, + { + "epoch": 23.01295238095238, + "grad_norm": 3.9756040573120117, + "learning_rate": 2.8739473332442193e-06, + "loss": 0.4931, + "step": 7150 + }, + { + "epoch": 23.01453968253968, + "grad_norm": 4.530709743499756, + "learning_rate": 2.8655928351824624e-06, + "loss": 0.4957, + "step": 7175 + }, + { + "epoch": 23.016126984126984, + "grad_norm": 7.570037364959717, + "learning_rate": 2.8572383371207056e-06, + "loss": 0.5039, + "step": 7200 + }, + { + "epoch": 23.017714285714284, + "grad_norm": 6.422541618347168, + "learning_rate": 2.8488838390589496e-06, + "loss": 0.5312, + "step": 7225 + }, + { + "epoch": 24.000126984126982, + "grad_norm": 7.004579544067383, + "learning_rate": 2.8405293409971927e-06, + "loss": 0.4954, + "step": 7250 + }, + { + "epoch": 24.001714285714286, + "grad_norm": 11.172274589538574, + "learning_rate": 2.8321748429354367e-06, + "loss": 0.5304, + "step": 7275 + }, + { + "epoch": 24.003301587301586, + "grad_norm": 6.250117301940918, + "learning_rate": 2.8238203448736803e-06, + "loss": 0.5508, + "step": 7300 + }, + { + "epoch": 24.00488888888889, + "grad_norm": 5.038013935089111, + "learning_rate": 2.815465846811924e-06, + "loss": 0.5128, + "step": 7325 + }, + { + "epoch": 24.00647619047619, + "grad_norm": 3.5625054836273193, + "learning_rate": 2.8071113487501674e-06, + "loss": 0.5219, + "step": 7350 + }, + { + "epoch": 24.008063492063492, + "grad_norm": 4.982530117034912, + "learning_rate": 2.798756850688411e-06, + "loss": 0.4495, + "step": 7375 + }, + { + "epoch": 24.009650793650792, + "grad_norm": 17.86178207397461, + "learning_rate": 2.7904023526266545e-06, + "loss": 0.4743, + "step": 7400 + }, + { + "epoch": 24.011238095238095, + "grad_norm": 6.184370517730713, + "learning_rate": 2.782047854564898e-06, + "loss": 0.4768, + "step": 7425 + }, + { + "epoch": 24.012825396825395, + "grad_norm": 11.036730766296387, + "learning_rate": 2.7736933565031416e-06, + "loss": 0.493, + "step": 7450 + }, + { + "epoch": 24.0144126984127, + "grad_norm": 7.9786577224731445, + "learning_rate": 2.765338858441385e-06, + "loss": 0.4758, + "step": 7475 + }, + { + "epoch": 24.016, + "grad_norm": 5.915741443634033, + "learning_rate": 2.7569843603796288e-06, + "loss": 0.5088, + "step": 7500 + }, + { + "epoch": 24.0175873015873, + "grad_norm": 10.715784072875977, + "learning_rate": 2.748629862317872e-06, + "loss": 0.5091, + "step": 7525 + }, + { + "epoch": 24.0191746031746, + "grad_norm": 6.6324381828308105, + "learning_rate": 2.740275364256116e-06, + "loss": 0.4805, + "step": 7550 + }, + { + "epoch": 25.001587301587303, + "grad_norm": 4.505024433135986, + "learning_rate": 2.731920866194359e-06, + "loss": 0.55, + "step": 7575 + }, + { + "epoch": 25.003174603174603, + "grad_norm": 6.079422950744629, + "learning_rate": 2.723566368132603e-06, + "loss": 0.5442, + "step": 7600 + }, + { + "epoch": 25.004761904761907, + "grad_norm": 5.729933738708496, + "learning_rate": 2.715211870070846e-06, + "loss": 0.4952, + "step": 7625 + }, + { + "epoch": 25.006349206349206, + "grad_norm": 12.97045612335205, + "learning_rate": 2.70685737200909e-06, + "loss": 0.5312, + "step": 7650 + }, + { + "epoch": 25.00793650793651, + "grad_norm": 6.688389301300049, + "learning_rate": 2.6985028739473333e-06, + "loss": 0.4426, + "step": 7675 + }, + { + "epoch": 25.00952380952381, + "grad_norm": 5.51877498626709, + "learning_rate": 2.690148375885577e-06, + "loss": 0.4597, + "step": 7700 + }, + { + "epoch": 25.011111111111113, + "grad_norm": 6.6266374588012695, + "learning_rate": 2.6817938778238204e-06, + "loss": 0.4767, + "step": 7725 + }, + { + "epoch": 25.012698412698413, + "grad_norm": 4.988967418670654, + "learning_rate": 2.673439379762064e-06, + "loss": 0.4721, + "step": 7750 + }, + { + "epoch": 25.014285714285716, + "grad_norm": 5.249930381774902, + "learning_rate": 2.6650848817003076e-06, + "loss": 0.481, + "step": 7775 + }, + { + "epoch": 25.015873015873016, + "grad_norm": 8.894637107849121, + "learning_rate": 2.656730383638551e-06, + "loss": 0.5092, + "step": 7800 + }, + { + "epoch": 25.01746031746032, + "grad_norm": 6.139794826507568, + "learning_rate": 2.648375885576795e-06, + "loss": 0.5146, + "step": 7825 + }, + { + "epoch": 25.01904761904762, + "grad_norm": 4.836121082305908, + "learning_rate": 2.6400213875150383e-06, + "loss": 0.4853, + "step": 7850 + }, + { + "epoch": 26.001460317460317, + "grad_norm": 4.840237140655518, + "learning_rate": 2.6316668894532823e-06, + "loss": 0.5325, + "step": 7875 + }, + { + "epoch": 26.00304761904762, + "grad_norm": 6.270430088043213, + "learning_rate": 2.6233123913915254e-06, + "loss": 0.5517, + "step": 7900 + }, + { + "epoch": 26.00463492063492, + "grad_norm": 6.732022285461426, + "learning_rate": 2.6149578933297694e-06, + "loss": 0.4771, + "step": 7925 + }, + { + "epoch": 26.006222222222224, + "grad_norm": 4.249831199645996, + "learning_rate": 2.6066033952680125e-06, + "loss": 0.5281, + "step": 7950 + }, + { + "epoch": 26.007809523809524, + "grad_norm": 9.650166511535645, + "learning_rate": 2.5982488972062557e-06, + "loss": 0.4461, + "step": 7975 + }, + { + "epoch": 26.009396825396827, + "grad_norm": 5.7691216468811035, + "learning_rate": 2.5898943991444997e-06, + "loss": 0.4487, + "step": 8000 + }, + { + "epoch": 26.010984126984127, + "grad_norm": 5.991948127746582, + "learning_rate": 2.581539901082743e-06, + "loss": 0.4715, + "step": 8025 + }, + { + "epoch": 26.01257142857143, + "grad_norm": 11.065790176391602, + "learning_rate": 2.573185403020987e-06, + "loss": 0.4742, + "step": 8050 + }, + { + "epoch": 26.01415873015873, + "grad_norm": 11.387042045593262, + "learning_rate": 2.56483090495923e-06, + "loss": 0.4793, + "step": 8075 + }, + { + "epoch": 26.015746031746033, + "grad_norm": 7.323668479919434, + "learning_rate": 2.556476406897474e-06, + "loss": 0.4933, + "step": 8100 + }, + { + "epoch": 26.017333333333333, + "grad_norm": 10.183083534240723, + "learning_rate": 2.5481219088357175e-06, + "loss": 0.5176, + "step": 8125 + }, + { + "epoch": 26.018920634920637, + "grad_norm": 3.41259503364563, + "learning_rate": 2.539767410773961e-06, + "loss": 0.4789, + "step": 8150 + }, + { + "epoch": 27.001333333333335, + "grad_norm": 8.132092475891113, + "learning_rate": 2.5314129127122046e-06, + "loss": 0.5315, + "step": 8175 + }, + { + "epoch": 27.002920634920635, + "grad_norm": 6.488096237182617, + "learning_rate": 2.5230584146504478e-06, + "loss": 0.5424, + "step": 8200 + }, + { + "epoch": 27.004507936507938, + "grad_norm": 7.1543803215026855, + "learning_rate": 2.5147039165886918e-06, + "loss": 0.4628, + "step": 8225 + }, + { + "epoch": 27.006095238095238, + "grad_norm": 6.017189025878906, + "learning_rate": 2.506349418526935e-06, + "loss": 0.5284, + "step": 8250 + }, + { + "epoch": 27.00768253968254, + "grad_norm": 5.09862756729126, + "learning_rate": 2.497994920465179e-06, + "loss": 0.4613, + "step": 8275 + }, + { + "epoch": 27.00926984126984, + "grad_norm": 6.283570766448975, + "learning_rate": 2.489640422403422e-06, + "loss": 0.4374, + "step": 8300 + }, + { + "epoch": 27.010857142857144, + "grad_norm": 5.45609712600708, + "learning_rate": 2.4812859243416656e-06, + "loss": 0.4615, + "step": 8325 + }, + { + "epoch": 27.012444444444444, + "grad_norm": 9.621217727661133, + "learning_rate": 2.472931426279909e-06, + "loss": 0.4665, + "step": 8350 + }, + { + "epoch": 27.014031746031748, + "grad_norm": 10.336989402770996, + "learning_rate": 2.4645769282181527e-06, + "loss": 0.4712, + "step": 8375 + }, + { + "epoch": 27.015619047619047, + "grad_norm": 8.53022289276123, + "learning_rate": 2.4562224301563963e-06, + "loss": 0.4869, + "step": 8400 + }, + { + "epoch": 27.01720634920635, + "grad_norm": 6.758102893829346, + "learning_rate": 2.44786793209464e-06, + "loss": 0.5007, + "step": 8425 + }, + { + "epoch": 27.01879365079365, + "grad_norm": 6.737295627593994, + "learning_rate": 2.4395134340328834e-06, + "loss": 0.5, + "step": 8450 + }, + { + "epoch": 28.00120634920635, + "grad_norm": 8.9446439743042, + "learning_rate": 2.431158935971127e-06, + "loss": 0.5098, + "step": 8475 + }, + { + "epoch": 28.002793650793652, + "grad_norm": 4.6513190269470215, + "learning_rate": 2.4228044379093706e-06, + "loss": 0.5249, + "step": 8500 + }, + { + "epoch": 28.004380952380952, + "grad_norm": 7.930838108062744, + "learning_rate": 2.414449939847614e-06, + "loss": 0.4823, + "step": 8525 + }, + { + "epoch": 28.005968253968256, + "grad_norm": 5.986405372619629, + "learning_rate": 2.4060954417858577e-06, + "loss": 0.5295, + "step": 8550 + }, + { + "epoch": 28.007555555555555, + "grad_norm": 6.348638534545898, + "learning_rate": 2.3977409437241013e-06, + "loss": 0.4608, + "step": 8575 + }, + { + "epoch": 28.00914285714286, + "grad_norm": 5.640425205230713, + "learning_rate": 2.389386445662345e-06, + "loss": 0.4183, + "step": 8600 + }, + { + "epoch": 28.01073015873016, + "grad_norm": 7.974732875823975, + "learning_rate": 2.3810319476005884e-06, + "loss": 0.4716, + "step": 8625 + }, + { + "epoch": 28.012317460317462, + "grad_norm": 4.698752403259277, + "learning_rate": 2.372677449538832e-06, + "loss": 0.4637, + "step": 8650 + }, + { + "epoch": 28.01390476190476, + "grad_norm": 4.2253828048706055, + "learning_rate": 2.3643229514770755e-06, + "loss": 0.4589, + "step": 8675 + }, + { + "epoch": 28.015492063492065, + "grad_norm": 7.007496356964111, + "learning_rate": 2.355968453415319e-06, + "loss": 0.4978, + "step": 8700 + }, + { + "epoch": 28.017079365079365, + "grad_norm": 4.830111026763916, + "learning_rate": 2.3476139553535627e-06, + "loss": 0.4873, + "step": 8725 + }, + { + "epoch": 28.018666666666668, + "grad_norm": 3.9254467487335205, + "learning_rate": 2.3392594572918062e-06, + "loss": 0.4994, + "step": 8750 + }, + { + "epoch": 29.001079365079367, + "grad_norm": 6.090777397155762, + "learning_rate": 2.33090495923005e-06, + "loss": 0.4761, + "step": 8775 + }, + { + "epoch": 29.002666666666666, + "grad_norm": 5.358640670776367, + "learning_rate": 2.322550461168293e-06, + "loss": 0.5431, + "step": 8800 + }, + { + "epoch": 29.00425396825397, + "grad_norm": 9.447075843811035, + "learning_rate": 2.3141959631065365e-06, + "loss": 0.4746, + "step": 8825 + }, + { + "epoch": 29.00584126984127, + "grad_norm": 5.390321731567383, + "learning_rate": 2.30584146504478e-06, + "loss": 0.5236, + "step": 8850 + }, + { + "epoch": 29.007428571428573, + "grad_norm": 4.194957256317139, + "learning_rate": 2.2974869669830236e-06, + "loss": 0.4683, + "step": 8875 + }, + { + "epoch": 29.009015873015873, + "grad_norm": 10.377429008483887, + "learning_rate": 2.289132468921267e-06, + "loss": 0.4122, + "step": 8900 + }, + { + "epoch": 29.010603174603176, + "grad_norm": 4.972590923309326, + "learning_rate": 2.2807779708595108e-06, + "loss": 0.4557, + "step": 8925 + }, + { + "epoch": 29.012190476190476, + "grad_norm": 4.772759437561035, + "learning_rate": 2.2724234727977548e-06, + "loss": 0.4732, + "step": 8950 + }, + { + "epoch": 29.01377777777778, + "grad_norm": 19.494970321655273, + "learning_rate": 2.2640689747359983e-06, + "loss": 0.4653, + "step": 8975 + }, + { + "epoch": 29.01536507936508, + "grad_norm": 6.1877593994140625, + "learning_rate": 2.255714476674242e-06, + "loss": 0.4901, + "step": 9000 + }, + { + "epoch": 29.016952380952382, + "grad_norm": 5.228841781616211, + "learning_rate": 2.2473599786124854e-06, + "loss": 0.4841, + "step": 9025 + }, + { + "epoch": 29.018539682539682, + "grad_norm": 5.32314395904541, + "learning_rate": 2.2390054805507286e-06, + "loss": 0.4902, + "step": 9050 + }, + { + "epoch": 30.00095238095238, + "grad_norm": 6.952610015869141, + "learning_rate": 2.230650982488972e-06, + "loss": 0.4803, + "step": 9075 + }, + { + "epoch": 30.002539682539684, + "grad_norm": 4.230266571044922, + "learning_rate": 2.2222964844272157e-06, + "loss": 0.5235, + "step": 9100 + }, + { + "epoch": 30.004126984126984, + "grad_norm": 7.016523361206055, + "learning_rate": 2.2139419863654593e-06, + "loss": 0.4873, + "step": 9125 + }, + { + "epoch": 30.005714285714287, + "grad_norm": 10.13500690460205, + "learning_rate": 2.205587488303703e-06, + "loss": 0.5262, + "step": 9150 + }, + { + "epoch": 30.007301587301587, + "grad_norm": 7.627212047576904, + "learning_rate": 2.1972329902419464e-06, + "loss": 0.4619, + "step": 9175 + }, + { + "epoch": 30.00888888888889, + "grad_norm": 7.077376365661621, + "learning_rate": 2.18887849218019e-06, + "loss": 0.4011, + "step": 9200 + }, + { + "epoch": 30.01047619047619, + "grad_norm": 7.501957416534424, + "learning_rate": 2.1805239941184336e-06, + "loss": 0.455, + "step": 9225 + }, + { + "epoch": 30.012063492063493, + "grad_norm": 6.617973327636719, + "learning_rate": 2.172169496056677e-06, + "loss": 0.4814, + "step": 9250 + }, + { + "epoch": 30.013650793650793, + "grad_norm": 3.885499954223633, + "learning_rate": 2.1638149979949207e-06, + "loss": 0.4523, + "step": 9275 + }, + { + "epoch": 30.015238095238097, + "grad_norm": 5.5597615242004395, + "learning_rate": 2.1554604999331642e-06, + "loss": 0.4768, + "step": 9300 + }, + { + "epoch": 30.016825396825396, + "grad_norm": 9.792261123657227, + "learning_rate": 2.147106001871408e-06, + "loss": 0.49, + "step": 9325 + }, + { + "epoch": 30.0184126984127, + "grad_norm": 5.992704391479492, + "learning_rate": 2.1387515038096514e-06, + "loss": 0.498, + "step": 9350 + }, + { + "epoch": 31.000825396825398, + "grad_norm": 8.464439392089844, + "learning_rate": 2.130397005747895e-06, + "loss": 0.4518, + "step": 9375 + }, + { + "epoch": 31.002412698412698, + "grad_norm": 3.486860990524292, + "learning_rate": 2.1220425076861385e-06, + "loss": 0.5318, + "step": 9400 + }, + { + "epoch": 31.004, + "grad_norm": 4.426388740539551, + "learning_rate": 2.113688009624382e-06, + "loss": 0.4917, + "step": 9425 + }, + { + "epoch": 31.0055873015873, + "grad_norm": 8.08337116241455, + "learning_rate": 2.1053335115626256e-06, + "loss": 0.5093, + "step": 9450 + }, + { + "epoch": 31.007174603174604, + "grad_norm": 3.963824987411499, + "learning_rate": 2.096979013500869e-06, + "loss": 0.4515, + "step": 9475 + }, + { + "epoch": 31.008761904761904, + "grad_norm": 7.304539203643799, + "learning_rate": 2.0886245154391128e-06, + "loss": 0.4196, + "step": 9500 + }, + { + "epoch": 31.010349206349208, + "grad_norm": 4.731977939605713, + "learning_rate": 2.080270017377356e-06, + "loss": 0.4529, + "step": 9525 + }, + { + "epoch": 31.011936507936507, + "grad_norm": 8.285253524780273, + "learning_rate": 2.0719155193155995e-06, + "loss": 0.4653, + "step": 9550 + }, + { + "epoch": 31.01352380952381, + "grad_norm": 8.305194854736328, + "learning_rate": 2.063561021253843e-06, + "loss": 0.4624, + "step": 9575 + }, + { + "epoch": 31.01511111111111, + "grad_norm": 13.913382530212402, + "learning_rate": 2.0552065231920866e-06, + "loss": 0.4676, + "step": 9600 + }, + { + "epoch": 31.016698412698414, + "grad_norm": 6.448155403137207, + "learning_rate": 2.04685202513033e-06, + "loss": 0.476, + "step": 9625 + }, + { + "epoch": 31.018285714285714, + "grad_norm": 7.706886291503906, + "learning_rate": 2.0384975270685737e-06, + "loss": 0.4967, + "step": 9650 + }, + { + "epoch": 32.00069841269841, + "grad_norm": 4.588306427001953, + "learning_rate": 2.0301430290068173e-06, + "loss": 0.4358, + "step": 9675 + }, + { + "epoch": 32.00228571428571, + "grad_norm": 4.243907451629639, + "learning_rate": 2.021788530945061e-06, + "loss": 0.5546, + "step": 9700 + }, + { + "epoch": 32.00387301587302, + "grad_norm": 6.786617755889893, + "learning_rate": 2.0134340328833044e-06, + "loss": 0.4775, + "step": 9725 + }, + { + "epoch": 32.00546031746032, + "grad_norm": 9.617806434631348, + "learning_rate": 2.005079534821548e-06, + "loss": 0.4857, + "step": 9750 + }, + { + "epoch": 32.00704761904762, + "grad_norm": 4.088709354400635, + "learning_rate": 1.9967250367597916e-06, + "loss": 0.4872, + "step": 9775 + }, + { + "epoch": 32.00863492063492, + "grad_norm": 7.801070690155029, + "learning_rate": 1.988370538698035e-06, + "loss": 0.3978, + "step": 9800 + }, + { + "epoch": 32.010222222222225, + "grad_norm": 12.151320457458496, + "learning_rate": 1.9800160406362787e-06, + "loss": 0.4567, + "step": 9825 + }, + { + "epoch": 32.011809523809525, + "grad_norm": 6.325204372406006, + "learning_rate": 1.9716615425745223e-06, + "loss": 0.4568, + "step": 9850 + }, + { + "epoch": 32.013396825396825, + "grad_norm": 4.849487781524658, + "learning_rate": 1.963307044512766e-06, + "loss": 0.4612, + "step": 9875 + }, + { + "epoch": 32.014984126984125, + "grad_norm": 6.0611090660095215, + "learning_rate": 1.9549525464510094e-06, + "loss": 0.4492, + "step": 9900 + }, + { + "epoch": 32.01657142857143, + "grad_norm": 8.705154418945312, + "learning_rate": 1.946598048389253e-06, + "loss": 0.4823, + "step": 9925 + }, + { + "epoch": 32.01815873015873, + "grad_norm": 6.838711261749268, + "learning_rate": 1.9382435503274965e-06, + "loss": 0.4981, + "step": 9950 + }, + { + "epoch": 33.000571428571426, + "grad_norm": 6.763596057891846, + "learning_rate": 1.92988905226574e-06, + "loss": 0.4369, + "step": 9975 + }, + { + "epoch": 33.00215873015873, + "grad_norm": 5.268820285797119, + "learning_rate": 1.9215345542039837e-06, + "loss": 0.5427, + "step": 10000 + }, + { + "epoch": 33.00374603174603, + "grad_norm": 3.5916361808776855, + "learning_rate": 1.913180056142227e-06, + "loss": 0.4798, + "step": 10025 + }, + { + "epoch": 33.00533333333333, + "grad_norm": 4.288261890411377, + "learning_rate": 1.9048255580804706e-06, + "loss": 0.488, + "step": 10050 + }, + { + "epoch": 33.00692063492063, + "grad_norm": 5.668625354766846, + "learning_rate": 1.8964710600187142e-06, + "loss": 0.5008, + "step": 10075 + }, + { + "epoch": 33.00850793650794, + "grad_norm": 4.973710536956787, + "learning_rate": 1.8881165619569577e-06, + "loss": 0.391, + "step": 10100 + }, + { + "epoch": 33.01009523809524, + "grad_norm": 7.073776721954346, + "learning_rate": 1.8797620638952013e-06, + "loss": 0.444, + "step": 10125 + }, + { + "epoch": 33.01168253968254, + "grad_norm": 4.297868251800537, + "learning_rate": 1.8714075658334449e-06, + "loss": 0.4516, + "step": 10150 + }, + { + "epoch": 33.01326984126984, + "grad_norm": 25.770071029663086, + "learning_rate": 1.8630530677716884e-06, + "loss": 0.4746, + "step": 10175 + }, + { + "epoch": 33.014857142857146, + "grad_norm": 3.6950442790985107, + "learning_rate": 1.854698569709932e-06, + "loss": 0.4346, + "step": 10200 + }, + { + "epoch": 33.016444444444446, + "grad_norm": 8.81914234161377, + "learning_rate": 1.8463440716481756e-06, + "loss": 0.4794, + "step": 10225 + }, + { + "epoch": 33.018031746031745, + "grad_norm": 6.376156806945801, + "learning_rate": 1.837989573586419e-06, + "loss": 0.5085, + "step": 10250 + }, + { + "epoch": 34.00044444444445, + "grad_norm": 4.302385330200195, + "learning_rate": 1.8296350755246625e-06, + "loss": 0.4276, + "step": 10275 + }, + { + "epoch": 34.00203174603175, + "grad_norm": 8.226256370544434, + "learning_rate": 1.821280577462906e-06, + "loss": 0.5405, + "step": 10300 + }, + { + "epoch": 34.00361904761905, + "grad_norm": 12.093836784362793, + "learning_rate": 1.8129260794011496e-06, + "loss": 0.4661, + "step": 10325 + }, + { + "epoch": 34.00520634920635, + "grad_norm": 11.840346336364746, + "learning_rate": 1.8045715813393932e-06, + "loss": 0.4779, + "step": 10350 + }, + { + "epoch": 34.006793650793654, + "grad_norm": 5.186977386474609, + "learning_rate": 1.7962170832776367e-06, + "loss": 0.5152, + "step": 10375 + }, + { + "epoch": 34.00838095238095, + "grad_norm": 4.393645286560059, + "learning_rate": 1.7878625852158805e-06, + "loss": 0.3996, + "step": 10400 + }, + { + "epoch": 34.00996825396825, + "grad_norm": 11.108858108520508, + "learning_rate": 1.779508087154124e-06, + "loss": 0.4351, + "step": 10425 + }, + { + "epoch": 34.01155555555555, + "grad_norm": 6.400074005126953, + "learning_rate": 1.7711535890923676e-06, + "loss": 0.4461, + "step": 10450 + }, + { + "epoch": 34.01314285714286, + "grad_norm": 11.155898094177246, + "learning_rate": 1.7627990910306112e-06, + "loss": 0.4503, + "step": 10475 + }, + { + "epoch": 34.01473015873016, + "grad_norm": 5.40648889541626, + "learning_rate": 1.7544445929688544e-06, + "loss": 0.4552, + "step": 10500 + }, + { + "epoch": 34.01631746031746, + "grad_norm": 4.550622463226318, + "learning_rate": 1.746090094907098e-06, + "loss": 0.4701, + "step": 10525 + }, + { + "epoch": 34.01790476190476, + "grad_norm": 7.3100433349609375, + "learning_rate": 1.7377355968453417e-06, + "loss": 0.4936, + "step": 10550 + }, + { + "epoch": 35.00031746031746, + "grad_norm": 6.450623035430908, + "learning_rate": 1.7293810987835853e-06, + "loss": 0.449, + "step": 10575 + }, + { + "epoch": 35.00190476190476, + "grad_norm": 9.00794792175293, + "learning_rate": 1.7210266007218288e-06, + "loss": 0.5011, + "step": 10600 + }, + { + "epoch": 35.00349206349206, + "grad_norm": 10.713994026184082, + "learning_rate": 1.7126721026600724e-06, + "loss": 0.4829, + "step": 10625 + }, + { + "epoch": 35.00507936507937, + "grad_norm": 4.488622188568115, + "learning_rate": 1.704317604598316e-06, + "loss": 0.4795, + "step": 10650 + }, + { + "epoch": 35.00666666666667, + "grad_norm": 16.104774475097656, + "learning_rate": 1.6959631065365595e-06, + "loss": 0.5035, + "step": 10675 + }, + { + "epoch": 35.00825396825397, + "grad_norm": 4.884101390838623, + "learning_rate": 1.687608608474803e-06, + "loss": 0.4168, + "step": 10700 + }, + { + "epoch": 35.00984126984127, + "grad_norm": 5.478789806365967, + "learning_rate": 1.6792541104130467e-06, + "loss": 0.4258, + "step": 10725 + }, + { + "epoch": 35.011428571428574, + "grad_norm": 6.428930282592773, + "learning_rate": 1.67089961235129e-06, + "loss": 0.4392, + "step": 10750 + }, + { + "epoch": 35.013015873015874, + "grad_norm": 2.7530977725982666, + "learning_rate": 1.6625451142895336e-06, + "loss": 0.451, + "step": 10775 + }, + { + "epoch": 35.014603174603174, + "grad_norm": 5.9829912185668945, + "learning_rate": 1.6541906162277771e-06, + "loss": 0.4586, + "step": 10800 + }, + { + "epoch": 35.016190476190474, + "grad_norm": 6.039813995361328, + "learning_rate": 1.6458361181660207e-06, + "loss": 0.4667, + "step": 10825 + }, + { + "epoch": 35.01777777777778, + "grad_norm": 6.336811065673828, + "learning_rate": 1.6374816201042643e-06, + "loss": 0.482, + "step": 10850 + }, + { + "epoch": 36.000190476190475, + "grad_norm": 6.172911643981934, + "learning_rate": 1.6291271220425078e-06, + "loss": 0.4651, + "step": 10875 + }, + { + "epoch": 36.001777777777775, + "grad_norm": 4.215289115905762, + "learning_rate": 1.6207726239807514e-06, + "loss": 0.4776, + "step": 10900 + }, + { + "epoch": 36.00336507936508, + "grad_norm": 2.862426519393921, + "learning_rate": 1.612418125918995e-06, + "loss": 0.5059, + "step": 10925 + }, + { + "epoch": 36.00495238095238, + "grad_norm": 4.817645072937012, + "learning_rate": 1.6040636278572385e-06, + "loss": 0.4777, + "step": 10950 + }, + { + "epoch": 36.00653968253968, + "grad_norm": 8.073090553283691, + "learning_rate": 1.595709129795482e-06, + "loss": 0.4837, + "step": 10975 + }, + { + "epoch": 36.00812698412698, + "grad_norm": 6.108732223510742, + "learning_rate": 1.5873546317337255e-06, + "loss": 0.4149, + "step": 11000 + }, + { + "epoch": 36.00971428571429, + "grad_norm": 4.37070369720459, + "learning_rate": 1.579000133671969e-06, + "loss": 0.4323, + "step": 11025 + }, + { + "epoch": 36.01130158730159, + "grad_norm": 3.8069772720336914, + "learning_rate": 1.5706456356102126e-06, + "loss": 0.4488, + "step": 11050 + }, + { + "epoch": 36.01288888888889, + "grad_norm": 4.619093894958496, + "learning_rate": 1.5622911375484562e-06, + "loss": 0.4551, + "step": 11075 + }, + { + "epoch": 36.01447619047619, + "grad_norm": 5.150590419769287, + "learning_rate": 1.5539366394866997e-06, + "loss": 0.4401, + "step": 11100 + }, + { + "epoch": 36.016063492063495, + "grad_norm": 6.264833450317383, + "learning_rate": 1.5455821414249433e-06, + "loss": 0.4642, + "step": 11125 + }, + { + "epoch": 36.017650793650795, + "grad_norm": 7.851002216339111, + "learning_rate": 1.5372276433631869e-06, + "loss": 0.4791, + "step": 11150 + }, + { + "epoch": 37.00006349206349, + "grad_norm": 7.941956996917725, + "learning_rate": 1.5288731453014304e-06, + "loss": 0.4488, + "step": 11175 + }, + { + "epoch": 37.001650793650796, + "grad_norm": 6.259990692138672, + "learning_rate": 1.520518647239674e-06, + "loss": 0.4967, + "step": 11200 + }, + { + "epoch": 37.003238095238096, + "grad_norm": 2.6891424655914307, + "learning_rate": 1.5121641491779175e-06, + "loss": 0.501, + "step": 11225 + }, + { + "epoch": 37.004825396825396, + "grad_norm": 7.098905563354492, + "learning_rate": 1.503809651116161e-06, + "loss": 0.4755, + "step": 11250 + }, + { + "epoch": 37.006412698412696, + "grad_norm": 5.689685344696045, + "learning_rate": 1.4954551530544045e-06, + "loss": 0.4868, + "step": 11275 + }, + { + "epoch": 37.008, + "grad_norm": 6.782131195068359, + "learning_rate": 1.487100654992648e-06, + "loss": 0.4057, + "step": 11300 + }, + { + "epoch": 37.0095873015873, + "grad_norm": 7.18269157409668, + "learning_rate": 1.4787461569308916e-06, + "loss": 0.4281, + "step": 11325 + }, + { + "epoch": 37.0111746031746, + "grad_norm": 6.619096755981445, + "learning_rate": 1.4703916588691352e-06, + "loss": 0.4398, + "step": 11350 + }, + { + "epoch": 37.0127619047619, + "grad_norm": 4.206869602203369, + "learning_rate": 1.462037160807379e-06, + "loss": 0.4394, + "step": 11375 + }, + { + "epoch": 37.01434920634921, + "grad_norm": 7.179015636444092, + "learning_rate": 1.4536826627456225e-06, + "loss": 0.4515, + "step": 11400 + }, + { + "epoch": 37.01593650793651, + "grad_norm": 5.137106895446777, + "learning_rate": 1.445328164683866e-06, + "loss": 0.4704, + "step": 11425 + }, + { + "epoch": 37.01752380952381, + "grad_norm": 4.871583461761475, + "learning_rate": 1.4369736666221096e-06, + "loss": 0.4704, + "step": 11450 + }, + { + "epoch": 37.01911111111111, + "grad_norm": 3.7863287925720215, + "learning_rate": 1.4286191685603528e-06, + "loss": 0.4518, + "step": 11475 + }, + { + "epoch": 38.00152380952381, + "grad_norm": 11.65233039855957, + "learning_rate": 1.4202646704985963e-06, + "loss": 0.5093, + "step": 11500 + }, + { + "epoch": 38.00311111111111, + "grad_norm": 9.356032371520996, + "learning_rate": 1.4119101724368401e-06, + "loss": 0.4997, + "step": 11525 + }, + { + "epoch": 38.00469841269841, + "grad_norm": 5.056600093841553, + "learning_rate": 1.4035556743750837e-06, + "loss": 0.4455, + "step": 11550 + }, + { + "epoch": 38.00628571428572, + "grad_norm": 4.728214740753174, + "learning_rate": 1.3952011763133273e-06, + "loss": 0.4935, + "step": 11575 + }, + { + "epoch": 38.00787301587302, + "grad_norm": 6.161364555358887, + "learning_rate": 1.3868466782515708e-06, + "loss": 0.4153, + "step": 11600 + }, + { + "epoch": 38.00946031746032, + "grad_norm": 10.345149993896484, + "learning_rate": 1.3784921801898144e-06, + "loss": 0.4206, + "step": 11625 + }, + { + "epoch": 38.011047619047616, + "grad_norm": 3.4351565837860107, + "learning_rate": 1.370137682128058e-06, + "loss": 0.434, + "step": 11650 + }, + { + "epoch": 38.01263492063492, + "grad_norm": 6.009977340698242, + "learning_rate": 1.3617831840663015e-06, + "loss": 0.448, + "step": 11675 + }, + { + "epoch": 38.01422222222222, + "grad_norm": 5.062911510467529, + "learning_rate": 1.353428686004545e-06, + "loss": 0.4399, + "step": 11700 + }, + { + "epoch": 38.01580952380952, + "grad_norm": 5.664973735809326, + "learning_rate": 1.3450741879427884e-06, + "loss": 0.4674, + "step": 11725 + }, + { + "epoch": 38.01739682539682, + "grad_norm": 7.776061534881592, + "learning_rate": 1.336719689881032e-06, + "loss": 0.481, + "step": 11750 + }, + { + "epoch": 38.01898412698413, + "grad_norm": 4.977148532867432, + "learning_rate": 1.3283651918192756e-06, + "loss": 0.446, + "step": 11775 + }, + { + "epoch": 39.001396825396824, + "grad_norm": 5.45735502243042, + "learning_rate": 1.3200106937575191e-06, + "loss": 0.4955, + "step": 11800 + }, + { + "epoch": 39.002984126984124, + "grad_norm": 5.3033881187438965, + "learning_rate": 1.3116561956957627e-06, + "loss": 0.5065, + "step": 11825 + }, + { + "epoch": 39.00457142857143, + "grad_norm": 5.209838390350342, + "learning_rate": 1.3033016976340063e-06, + "loss": 0.4283, + "step": 11850 + }, + { + "epoch": 39.00615873015873, + "grad_norm": 9.413320541381836, + "learning_rate": 1.2949471995722498e-06, + "loss": 0.5134, + "step": 11875 + }, + { + "epoch": 39.00774603174603, + "grad_norm": 3.614576816558838, + "learning_rate": 1.2865927015104934e-06, + "loss": 0.4132, + "step": 11900 + }, + { + "epoch": 39.00933333333333, + "grad_norm": 5.418633937835693, + "learning_rate": 1.278238203448737e-06, + "loss": 0.411, + "step": 11925 + }, + { + "epoch": 39.01092063492064, + "grad_norm": 7.182598114013672, + "learning_rate": 1.2698837053869805e-06, + "loss": 0.4367, + "step": 11950 + }, + { + "epoch": 39.01250793650794, + "grad_norm": 5.821928024291992, + "learning_rate": 1.2615292073252239e-06, + "loss": 0.4313, + "step": 11975 + }, + { + "epoch": 39.01409523809524, + "grad_norm": 5.386777877807617, + "learning_rate": 1.2531747092634675e-06, + "loss": 0.4423, + "step": 12000 + }, + { + "epoch": 39.01568253968254, + "grad_norm": 7.081798553466797, + "learning_rate": 1.244820211201711e-06, + "loss": 0.4637, + "step": 12025 + }, + { + "epoch": 39.017269841269844, + "grad_norm": 6.970532417297363, + "learning_rate": 1.2364657131399546e-06, + "loss": 0.4641, + "step": 12050 + }, + { + "epoch": 39.018857142857144, + "grad_norm": 10.821274757385254, + "learning_rate": 1.2281112150781982e-06, + "loss": 0.4712, + "step": 12075 + }, + { + "epoch": 40.00126984126984, + "grad_norm": 7.221808910369873, + "learning_rate": 1.2197567170164417e-06, + "loss": 0.4856, + "step": 12100 + }, + { + "epoch": 40.002857142857145, + "grad_norm": 15.445313453674316, + "learning_rate": 1.2114022189546853e-06, + "loss": 0.5051, + "step": 12125 + }, + { + "epoch": 40.004444444444445, + "grad_norm": 5.132606029510498, + "learning_rate": 1.2030477208929288e-06, + "loss": 0.4254, + "step": 12150 + }, + { + "epoch": 40.006031746031745, + "grad_norm": 3.0163042545318604, + "learning_rate": 1.1946932228311724e-06, + "loss": 0.4991, + "step": 12175 + }, + { + "epoch": 40.007619047619045, + "grad_norm": 4.396322250366211, + "learning_rate": 1.186338724769416e-06, + "loss": 0.4394, + "step": 12200 + }, + { + "epoch": 40.00920634920635, + "grad_norm": 9.984151840209961, + "learning_rate": 1.1779842267076595e-06, + "loss": 0.4004, + "step": 12225 + }, + { + "epoch": 40.01079365079365, + "grad_norm": 5.9347825050354, + "learning_rate": 1.1696297286459031e-06, + "loss": 0.4373, + "step": 12250 + }, + { + "epoch": 40.01238095238095, + "grad_norm": 5.0575852394104, + "learning_rate": 1.1612752305841465e-06, + "loss": 0.4321, + "step": 12275 + }, + { + "epoch": 40.01396825396825, + "grad_norm": 6.324869155883789, + "learning_rate": 1.15292073252239e-06, + "loss": 0.4353, + "step": 12300 + }, + { + "epoch": 40.01555555555556, + "grad_norm": 6.5207414627075195, + "learning_rate": 1.1445662344606336e-06, + "loss": 0.4675, + "step": 12325 + }, + { + "epoch": 40.01714285714286, + "grad_norm": 6.220884799957275, + "learning_rate": 1.1362117363988774e-06, + "loss": 0.4611, + "step": 12350 + }, + { + "epoch": 40.01873015873016, + "grad_norm": 10.964550018310547, + "learning_rate": 1.127857238337121e-06, + "loss": 0.4673, + "step": 12375 + }, + { + "epoch": 41.00114285714286, + "grad_norm": 6.545460224151611, + "learning_rate": 1.1195027402753643e-06, + "loss": 0.4593, + "step": 12400 + }, + { + "epoch": 41.00273015873016, + "grad_norm": 6.692239761352539, + "learning_rate": 1.1111482422136079e-06, + "loss": 0.512, + "step": 12425 + }, + { + "epoch": 41.00431746031746, + "grad_norm": 7.213928699493408, + "learning_rate": 1.1027937441518514e-06, + "loss": 0.4477, + "step": 12450 + }, + { + "epoch": 41.00590476190476, + "grad_norm": 4.5662431716918945, + "learning_rate": 1.094439246090095e-06, + "loss": 0.4916, + "step": 12475 + }, + { + "epoch": 41.007492063492066, + "grad_norm": 4.408071041107178, + "learning_rate": 1.0860847480283386e-06, + "loss": 0.4456, + "step": 12500 + }, + { + "epoch": 41.009079365079366, + "grad_norm": 5.65850830078125, + "learning_rate": 1.0777302499665821e-06, + "loss": 0.3866, + "step": 12525 + }, + { + "epoch": 41.010666666666665, + "grad_norm": 5.419500827789307, + "learning_rate": 1.0693757519048257e-06, + "loss": 0.4345, + "step": 12550 + }, + { + "epoch": 41.012253968253965, + "grad_norm": 4.399853229522705, + "learning_rate": 1.0610212538430693e-06, + "loss": 0.4444, + "step": 12575 + }, + { + "epoch": 41.01384126984127, + "grad_norm": 6.50507116317749, + "learning_rate": 1.0526667557813128e-06, + "loss": 0.4385, + "step": 12600 + }, + { + "epoch": 41.01542857142857, + "grad_norm": 2.587691068649292, + "learning_rate": 1.0443122577195564e-06, + "loss": 0.4558, + "step": 12625 + }, + { + "epoch": 41.01701587301587, + "grad_norm": 4.828845977783203, + "learning_rate": 1.0359577596577997e-06, + "loss": 0.4655, + "step": 12650 + }, + { + "epoch": 41.01860317460317, + "grad_norm": 9.805785179138184, + "learning_rate": 1.0276032615960433e-06, + "loss": 0.4688, + "step": 12675 + }, + { + "epoch": 42.001015873015874, + "grad_norm": 4.370798587799072, + "learning_rate": 1.0192487635342869e-06, + "loss": 0.4457, + "step": 12700 + }, + { + "epoch": 42.00260317460317, + "grad_norm": 5.954914093017578, + "learning_rate": 1.0108942654725304e-06, + "loss": 0.5047, + "step": 12725 + }, + { + "epoch": 42.00419047619047, + "grad_norm": 4.680336952209473, + "learning_rate": 1.002539767410774e-06, + "loss": 0.4536, + "step": 12750 + }, + { + "epoch": 42.00577777777778, + "grad_norm": 3.612610101699829, + "learning_rate": 9.941852693490176e-07, + "loss": 0.5006, + "step": 12775 + }, + { + "epoch": 42.00736507936508, + "grad_norm": 4.1838178634643555, + "learning_rate": 9.858307712872611e-07, + "loss": 0.4459, + "step": 12800 + }, + { + "epoch": 42.00895238095238, + "grad_norm": 5.238033771514893, + "learning_rate": 9.774762732255047e-07, + "loss": 0.3692, + "step": 12825 + }, + { + "epoch": 42.01053968253968, + "grad_norm": 5.992905139923096, + "learning_rate": 9.691217751637483e-07, + "loss": 0.4471, + "step": 12850 + }, + { + "epoch": 42.012126984126986, + "grad_norm": 3.9099578857421875, + "learning_rate": 9.607672771019918e-07, + "loss": 0.4505, + "step": 12875 + }, + { + "epoch": 42.013714285714286, + "grad_norm": 5.2203497886657715, + "learning_rate": 9.524127790402353e-07, + "loss": 0.4285, + "step": 12900 + }, + { + "epoch": 42.015301587301586, + "grad_norm": 10.484596252441406, + "learning_rate": 9.440582809784789e-07, + "loss": 0.4644, + "step": 12925 + }, + { + "epoch": 42.016888888888886, + "grad_norm": 9.001386642456055, + "learning_rate": 9.357037829167224e-07, + "loss": 0.4485, + "step": 12950 + }, + { + "epoch": 42.01847619047619, + "grad_norm": 5.222927570343018, + "learning_rate": 9.27349284854966e-07, + "loss": 0.4722, + "step": 12975 + }, + { + "epoch": 43.00088888888889, + "grad_norm": 5.298580169677734, + "learning_rate": 9.189947867932095e-07, + "loss": 0.4334, + "step": 13000 + }, + { + "epoch": 43.00247619047619, + "grad_norm": 7.794508457183838, + "learning_rate": 9.10640288731453e-07, + "loss": 0.5104, + "step": 13025 + }, + { + "epoch": 43.004063492063494, + "grad_norm": 7.346789360046387, + "learning_rate": 9.022857906696966e-07, + "loss": 0.4567, + "step": 13050 + }, + { + "epoch": 43.005650793650794, + "grad_norm": 7.656489372253418, + "learning_rate": 8.939312926079403e-07, + "loss": 0.4923, + "step": 13075 + }, + { + "epoch": 43.007238095238094, + "grad_norm": 4.487580299377441, + "learning_rate": 8.855767945461838e-07, + "loss": 0.4286, + "step": 13100 + }, + { + "epoch": 43.008825396825394, + "grad_norm": 3.4565789699554443, + "learning_rate": 8.772222964844272e-07, + "loss": 0.3914, + "step": 13125 + }, + { + "epoch": 43.0104126984127, + "grad_norm": 6.925444602966309, + "learning_rate": 8.688677984226708e-07, + "loss": 0.4391, + "step": 13150 + }, + { + "epoch": 43.012, + "grad_norm": 5.899662494659424, + "learning_rate": 8.605133003609144e-07, + "loss": 0.4529, + "step": 13175 + }, + { + "epoch": 43.0135873015873, + "grad_norm": 4.932972431182861, + "learning_rate": 8.52158802299158e-07, + "loss": 0.4289, + "step": 13200 + }, + { + "epoch": 43.0151746031746, + "grad_norm": 6.803351402282715, + "learning_rate": 8.438043042374015e-07, + "loss": 0.4562, + "step": 13225 + }, + { + "epoch": 43.01676190476191, + "grad_norm": 4.4438066482543945, + "learning_rate": 8.35449806175645e-07, + "loss": 0.4426, + "step": 13250 + }, + { + "epoch": 43.01834920634921, + "grad_norm": 9.328689575195312, + "learning_rate": 8.270953081138886e-07, + "loss": 0.4814, + "step": 13275 + }, + { + "epoch": 44.0007619047619, + "grad_norm": 10.716085433959961, + "learning_rate": 8.187408100521321e-07, + "loss": 0.4272, + "step": 13300 + }, + { + "epoch": 44.00234920634921, + "grad_norm": 5.453390598297119, + "learning_rate": 8.103863119903757e-07, + "loss": 0.5105, + "step": 13325 + }, + { + "epoch": 44.00393650793651, + "grad_norm": 8.206409454345703, + "learning_rate": 8.020318139286193e-07, + "loss": 0.4678, + "step": 13350 + }, + { + "epoch": 44.00552380952381, + "grad_norm": 9.60810661315918, + "learning_rate": 7.936773158668627e-07, + "loss": 0.4715, + "step": 13375 + }, + { + "epoch": 44.00711111111111, + "grad_norm": 4.23331880569458, + "learning_rate": 7.853228178051063e-07, + "loss": 0.4476, + "step": 13400 + }, + { + "epoch": 44.008698412698415, + "grad_norm": 4.390725135803223, + "learning_rate": 7.769683197433499e-07, + "loss": 0.3836, + "step": 13425 + }, + { + "epoch": 44.010285714285715, + "grad_norm": 8.254178047180176, + "learning_rate": 7.686138216815934e-07, + "loss": 0.441, + "step": 13450 + }, + { + "epoch": 44.011873015873014, + "grad_norm": 9.834754943847656, + "learning_rate": 7.60259323619837e-07, + "loss": 0.4474, + "step": 13475 + }, + { + "epoch": 44.013460317460314, + "grad_norm": 5.189121723175049, + "learning_rate": 7.519048255580805e-07, + "loss": 0.4259, + "step": 13500 + }, + { + "epoch": 44.01504761904762, + "grad_norm": 8.684473037719727, + "learning_rate": 7.43550327496324e-07, + "loss": 0.4342, + "step": 13525 + }, + { + "epoch": 44.01663492063492, + "grad_norm": 9.289813041687012, + "learning_rate": 7.351958294345676e-07, + "loss": 0.4599, + "step": 13550 + }, + { + "epoch": 44.01822222222222, + "grad_norm": 4.835540294647217, + "learning_rate": 7.268413313728113e-07, + "loss": 0.4765, + "step": 13575 + }, + { + "epoch": 45.00063492063492, + "grad_norm": 15.820073127746582, + "learning_rate": 7.184868333110548e-07, + "loss": 0.429, + "step": 13600 + }, + { + "epoch": 45.00222222222222, + "grad_norm": 7.332337856292725, + "learning_rate": 7.101323352492982e-07, + "loss": 0.5291, + "step": 13625 + }, + { + "epoch": 45.00380952380952, + "grad_norm": 9.775607109069824, + "learning_rate": 7.017778371875418e-07, + "loss": 0.4441, + "step": 13650 + }, + { + "epoch": 45.00539682539682, + "grad_norm": 6.92523717880249, + "learning_rate": 6.934233391257854e-07, + "loss": 0.4656, + "step": 13675 + }, + { + "epoch": 45.00698412698413, + "grad_norm": 4.137689590454102, + "learning_rate": 6.85068841064029e-07, + "loss": 0.4757, + "step": 13700 + }, + { + "epoch": 45.00857142857143, + "grad_norm": 6.4185638427734375, + "learning_rate": 6.767143430022725e-07, + "loss": 0.3757, + "step": 13725 + }, + { + "epoch": 45.01015873015873, + "grad_norm": 6.079351425170898, + "learning_rate": 6.68359844940516e-07, + "loss": 0.431, + "step": 13750 + }, + { + "epoch": 45.01174603174603, + "grad_norm": 4.2088165283203125, + "learning_rate": 6.600053468787596e-07, + "loss": 0.4354, + "step": 13775 + }, + { + "epoch": 45.013333333333335, + "grad_norm": 3.3986575603485107, + "learning_rate": 6.516508488170031e-07, + "loss": 0.4486, + "step": 13800 + }, + { + "epoch": 45.014920634920635, + "grad_norm": 5.290012359619141, + "learning_rate": 6.432963507552467e-07, + "loss": 0.4227, + "step": 13825 + }, + { + "epoch": 45.016507936507935, + "grad_norm": 8.828485488891602, + "learning_rate": 6.349418526934903e-07, + "loss": 0.4662, + "step": 13850 + }, + { + "epoch": 45.018095238095235, + "grad_norm": 4.196980953216553, + "learning_rate": 6.265873546317337e-07, + "loss": 0.4792, + "step": 13875 + }, + { + "epoch": 46.00050793650794, + "grad_norm": 8.14965534210205, + "learning_rate": 6.182328565699773e-07, + "loss": 0.4178, + "step": 13900 + }, + { + "epoch": 46.00209523809524, + "grad_norm": 4.221451282501221, + "learning_rate": 6.098783585082209e-07, + "loss": 0.5127, + "step": 13925 + }, + { + "epoch": 46.003682539682536, + "grad_norm": 7.106605052947998, + "learning_rate": 6.015238604464644e-07, + "loss": 0.4599, + "step": 13950 + }, + { + "epoch": 46.00526984126984, + "grad_norm": 11.949932098388672, + "learning_rate": 5.93169362384708e-07, + "loss": 0.4757, + "step": 13975 + }, + { + "epoch": 46.00685714285714, + "grad_norm": 7.468497276306152, + "learning_rate": 5.848148643229516e-07, + "loss": 0.4786, + "step": 14000 + }, + { + "epoch": 46.00844444444444, + "grad_norm": 9.329829216003418, + "learning_rate": 5.76460366261195e-07, + "loss": 0.3776, + "step": 14025 + }, + { + "epoch": 46.01003174603174, + "grad_norm": 5.68237829208374, + "learning_rate": 5.681058681994387e-07, + "loss": 0.419, + "step": 14050 + }, + { + "epoch": 46.01161904761905, + "grad_norm": 9.558659553527832, + "learning_rate": 5.597513701376821e-07, + "loss": 0.4381, + "step": 14075 + }, + { + "epoch": 46.01320634920635, + "grad_norm": 4.898390769958496, + "learning_rate": 5.513968720759257e-07, + "loss": 0.4285, + "step": 14100 + }, + { + "epoch": 46.01479365079365, + "grad_norm": 6.284289360046387, + "learning_rate": 5.430423740141693e-07, + "loss": 0.4426, + "step": 14125 + }, + { + "epoch": 46.01638095238095, + "grad_norm": 7.268927574157715, + "learning_rate": 5.346878759524128e-07, + "loss": 0.4454, + "step": 14150 + }, + { + "epoch": 46.017968253968256, + "grad_norm": 8.74441146850586, + "learning_rate": 5.263333778906564e-07, + "loss": 0.4862, + "step": 14175 + }, + { + "epoch": 47.00038095238095, + "grad_norm": 5.302139759063721, + "learning_rate": 5.179788798288999e-07, + "loss": 0.4241, + "step": 14200 + }, + { + "epoch": 47.00196825396825, + "grad_norm": 9.133415222167969, + "learning_rate": 5.096243817671434e-07, + "loss": 0.4938, + "step": 14225 + }, + { + "epoch": 47.00355555555556, + "grad_norm": 11.927820205688477, + "learning_rate": 5.01269883705387e-07, + "loss": 0.4569, + "step": 14250 + }, + { + "epoch": 47.00514285714286, + "grad_norm": 6.953431606292725, + "learning_rate": 4.929153856436306e-07, + "loss": 0.4669, + "step": 14275 + }, + { + "epoch": 47.00673015873016, + "grad_norm": 15.06204891204834, + "learning_rate": 4.845608875818741e-07, + "loss": 0.4956, + "step": 14300 + }, + { + "epoch": 47.00831746031746, + "grad_norm": 6.762052059173584, + "learning_rate": 4.7620638952011765e-07, + "loss": 0.3959, + "step": 14325 + }, + { + "epoch": 47.009904761904764, + "grad_norm": 5.515415668487549, + "learning_rate": 4.678518914583612e-07, + "loss": 0.4077, + "step": 14350 + }, + { + "epoch": 47.011492063492064, + "grad_norm": 5.323752403259277, + "learning_rate": 4.594973933966047e-07, + "loss": 0.4282, + "step": 14375 + }, + { + "epoch": 47.01307936507936, + "grad_norm": 3.4081521034240723, + "learning_rate": 4.511428953348483e-07, + "loss": 0.4318, + "step": 14400 + }, + { + "epoch": 47.01466666666666, + "grad_norm": 10.325825691223145, + "learning_rate": 4.427883972730919e-07, + "loss": 0.443, + "step": 14425 + }, + { + "epoch": 47.01625396825397, + "grad_norm": 7.097803592681885, + "learning_rate": 4.344338992113354e-07, + "loss": 0.4469, + "step": 14450 + }, + { + "epoch": 47.01784126984127, + "grad_norm": 8.34315013885498, + "learning_rate": 4.26079401149579e-07, + "loss": 0.48, + "step": 14475 + }, + { + "epoch": 48.000253968253965, + "grad_norm": 5.317016124725342, + "learning_rate": 4.177249030878225e-07, + "loss": 0.4391, + "step": 14500 + }, + { + "epoch": 48.00184126984127, + "grad_norm": 6.396937370300293, + "learning_rate": 4.0937040502606607e-07, + "loss": 0.4688, + "step": 14525 + }, + { + "epoch": 48.00342857142857, + "grad_norm": 3.8121814727783203, + "learning_rate": 4.0101590696430963e-07, + "loss": 0.4771, + "step": 14550 + }, + { + "epoch": 48.00501587301587, + "grad_norm": 7.068954944610596, + "learning_rate": 3.9266140890255315e-07, + "loss": 0.4657, + "step": 14575 + }, + { + "epoch": 48.00660317460317, + "grad_norm": 3.8409268856048584, + "learning_rate": 3.843069108407967e-07, + "loss": 0.4651, + "step": 14600 + }, + { + "epoch": 48.00819047619048, + "grad_norm": 4.6952714920043945, + "learning_rate": 3.759524127790402e-07, + "loss": 0.4065, + "step": 14625 + }, + { + "epoch": 48.00977777777778, + "grad_norm": 5.573005199432373, + "learning_rate": 3.675979147172838e-07, + "loss": 0.411, + "step": 14650 + }, + { + "epoch": 48.01136507936508, + "grad_norm": 7.740847587585449, + "learning_rate": 3.592434166555274e-07, + "loss": 0.4348, + "step": 14675 + }, + { + "epoch": 48.01295238095238, + "grad_norm": 3.9830234050750732, + "learning_rate": 3.508889185937709e-07, + "loss": 0.437, + "step": 14700 + }, + { + "epoch": 48.014539682539684, + "grad_norm": 4.826086521148682, + "learning_rate": 3.425344205320145e-07, + "loss": 0.4347, + "step": 14725 + }, + { + "epoch": 48.016126984126984, + "grad_norm": 7.815319538116455, + "learning_rate": 3.34179922470258e-07, + "loss": 0.4485, + "step": 14750 + }, + { + "epoch": 48.017714285714284, + "grad_norm": 7.226869583129883, + "learning_rate": 3.2582542440850157e-07, + "loss": 0.4703, + "step": 14775 + }, + { + "epoch": 49.000126984126986, + "grad_norm": 6.489394187927246, + "learning_rate": 3.1747092634674513e-07, + "loss": 0.4377, + "step": 14800 + }, + { + "epoch": 49.001714285714286, + "grad_norm": 7.8261399269104, + "learning_rate": 3.0911642828498865e-07, + "loss": 0.4772, + "step": 14825 + }, + { + "epoch": 49.003301587301586, + "grad_norm": 5.6710004806518555, + "learning_rate": 3.007619302232322e-07, + "loss": 0.4895, + "step": 14850 + }, + { + "epoch": 49.004888888888885, + "grad_norm": 5.4189252853393555, + "learning_rate": 2.924074321614758e-07, + "loss": 0.4575, + "step": 14875 + }, + { + "epoch": 49.00647619047619, + "grad_norm": 3.988452434539795, + "learning_rate": 2.8405293409971934e-07, + "loss": 0.4701, + "step": 14900 + }, + { + "epoch": 49.00806349206349, + "grad_norm": 6.010385990142822, + "learning_rate": 2.7569843603796286e-07, + "loss": 0.4005, + "step": 14925 + }, + { + "epoch": 49.00965079365079, + "grad_norm": 12.048097610473633, + "learning_rate": 2.673439379762064e-07, + "loss": 0.4209, + "step": 14950 + }, + { + "epoch": 49.01123809523809, + "grad_norm": 5.6577839851379395, + "learning_rate": 2.5898943991444994e-07, + "loss": 0.4263, + "step": 14975 + }, + { + "epoch": 49.0128253968254, + "grad_norm": 9.367551803588867, + "learning_rate": 2.506349418526935e-07, + "loss": 0.4412, + "step": 15000 + }, + { + "epoch": 49.0144126984127, + "grad_norm": 6.728636741638184, + "learning_rate": 2.4228044379093707e-07, + "loss": 0.4234, + "step": 15025 + }, + { + "epoch": 49.016, + "grad_norm": 6.2957234382629395, + "learning_rate": 2.339259457291806e-07, + "loss": 0.4565, + "step": 15050 + }, + { + "epoch": 49.0175873015873, + "grad_norm": 10.448657989501953, + "learning_rate": 2.2557144766742415e-07, + "loss": 0.4544, + "step": 15075 + }, + { + "epoch": 49.019174603174605, + "grad_norm": 6.694546699523926, + "learning_rate": 2.172169496056677e-07, + "loss": 0.4277, + "step": 15100 + }, + { + "epoch": 50.0015873015873, + "grad_norm": 5.7494096755981445, + "learning_rate": 2.0886245154391125e-07, + "loss": 0.4951, + "step": 15125 + }, + { + "epoch": 50.00317460317461, + "grad_norm": 6.14343786239624, + "learning_rate": 2.0050795348215482e-07, + "loss": 0.4884, + "step": 15150 + }, + { + "epoch": 50.00476190476191, + "grad_norm": 5.558969974517822, + "learning_rate": 1.9215345542039836e-07, + "loss": 0.4453, + "step": 15175 + }, + { + "epoch": 50.006349206349206, + "grad_norm": 12.143143653869629, + "learning_rate": 1.837989573586419e-07, + "loss": 0.4809, + "step": 15200 + }, + { + "epoch": 50.007936507936506, + "grad_norm": 6.324706554412842, + "learning_rate": 1.7544445929688546e-07, + "loss": 0.4006, + "step": 15225 + }, + { + "epoch": 50.00952380952381, + "grad_norm": 5.624573230743408, + "learning_rate": 1.67089961235129e-07, + "loss": 0.4116, + "step": 15250 + }, + { + "epoch": 50.01111111111111, + "grad_norm": 6.506253242492676, + "learning_rate": 1.5873546317337257e-07, + "loss": 0.4286, + "step": 15275 + }, + { + "epoch": 50.01269841269841, + "grad_norm": 6.208716869354248, + "learning_rate": 1.503809651116161e-07, + "loss": 0.4255, + "step": 15300 + }, + { + "epoch": 50.01428571428571, + "grad_norm": 3.370025634765625, + "learning_rate": 1.4202646704985967e-07, + "loss": 0.4314, + "step": 15325 + }, + { + "epoch": 50.01587301587302, + "grad_norm": 10.119969367980957, + "learning_rate": 1.336719689881032e-07, + "loss": 0.4615, + "step": 15350 + }, + { + "epoch": 50.01746031746032, + "grad_norm": 5.545236110687256, + "learning_rate": 1.2531747092634675e-07, + "loss": 0.4646, + "step": 15375 + }, + { + "epoch": 50.01904761904762, + "grad_norm": 4.946952819824219, + "learning_rate": 1.169629728645903e-07, + "loss": 0.4344, + "step": 15400 + }, + { + "epoch": 51.00146031746032, + "grad_norm": 5.004914283752441, + "learning_rate": 1.0860847480283386e-07, + "loss": 0.4824, + "step": 15425 + }, + { + "epoch": 51.00304761904762, + "grad_norm": 6.189335823059082, + "learning_rate": 1.0025397674107741e-07, + "loss": 0.4991, + "step": 15450 + }, + { + "epoch": 51.00463492063492, + "grad_norm": 7.598541259765625, + "learning_rate": 9.189947867932095e-08, + "loss": 0.4302, + "step": 15475 + }, + { + "epoch": 51.00622222222222, + "grad_norm": 5.034823417663574, + "learning_rate": 8.35449806175645e-08, + "loss": 0.4842, + "step": 15500 + }, + { + "epoch": 51.00780952380953, + "grad_norm": 10.85693359375, + "learning_rate": 7.519048255580805e-08, + "loss": 0.4062, + "step": 15525 + }, + { + "epoch": 51.00939682539683, + "grad_norm": 5.432575702667236, + "learning_rate": 6.68359844940516e-08, + "loss": 0.4042, + "step": 15550 + }, + { + "epoch": 51.01098412698413, + "grad_norm": 6.148897171020508, + "learning_rate": 5.848148643229515e-08, + "loss": 0.4301, + "step": 15575 + }, + { + "epoch": 51.01257142857143, + "grad_norm": 11.638060569763184, + "learning_rate": 5.0126988370538704e-08, + "loss": 0.4318, + "step": 15600 + }, + { + "epoch": 51.014158730158734, + "grad_norm": 9.534034729003906, + "learning_rate": 4.177249030878225e-08, + "loss": 0.4328, + "step": 15625 + }, + { + "epoch": 51.01574603174603, + "grad_norm": 7.127634048461914, + "learning_rate": 3.34179922470258e-08, + "loss": 0.4515, + "step": 15650 + }, + { + "epoch": 51.01733333333333, + "grad_norm": 11.5350923538208, + "learning_rate": 2.5063494185269352e-08, + "loss": 0.472, + "step": 15675 + }, + { + "epoch": 51.01892063492063, + "grad_norm": 2.837473154067993, + "learning_rate": 1.67089961235129e-08, + "loss": 0.4333, + "step": 15700 + } + ], + "logging_steps": 25, + "max_steps": 15750, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.19361437974528e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}