{ "best_metric": null, "best_model_checkpoint": null, "epoch": 51.01257142857143, "eval_steps": 500, "global_step": 15600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015873015873015873, "grad_norm": 9.738715171813965, "learning_rate": 1.5862944162436548e-07, "loss": 5.3155, "step": 25 }, { "epoch": 0.0031746031746031746, "grad_norm": 32.81618881225586, "learning_rate": 3.1725888324873095e-07, "loss": 5.0547, "step": 50 }, { "epoch": 0.004761904761904762, "grad_norm": 32.67608642578125, "learning_rate": 4.7588832487309643e-07, "loss": 5.1135, "step": 75 }, { "epoch": 0.006349206349206349, "grad_norm": 10.283950805664062, "learning_rate": 6.345177664974619e-07, "loss": 4.9131, "step": 100 }, { "epoch": 0.007936507936507936, "grad_norm": 21.210186004638672, "learning_rate": 7.931472081218275e-07, "loss": 4.9171, "step": 125 }, { "epoch": 0.009523809523809525, "grad_norm": 48.682411193847656, "learning_rate": 9.517766497461929e-07, "loss": 4.993, "step": 150 }, { "epoch": 0.011111111111111112, "grad_norm": 12.522156715393066, "learning_rate": 1.1104060913705584e-06, "loss": 4.966, "step": 175 }, { "epoch": 0.012698412698412698, "grad_norm": 32.584293365478516, "learning_rate": 1.2690355329949238e-06, "loss": 4.7858, "step": 200 }, { "epoch": 0.014285714285714285, "grad_norm": 22.508533477783203, "learning_rate": 1.4276649746192894e-06, "loss": 4.713, "step": 225 }, { "epoch": 0.015873015873015872, "grad_norm": 10.486129760742188, "learning_rate": 1.586294416243655e-06, "loss": 4.7809, "step": 250 }, { "epoch": 0.01746031746031746, "grad_norm": 10.104169845581055, "learning_rate": 1.7449238578680206e-06, "loss": 4.8488, "step": 275 }, { "epoch": 0.01904761904761905, "grad_norm": 20.557931900024414, "learning_rate": 1.9035532994923857e-06, "loss": 4.5276, "step": 300 }, { "epoch": 1.0014603174603174, "grad_norm": 14.145257949829102, "learning_rate": 2.0621827411167515e-06, "loss": 4.641, "step": 325 }, { "epoch": 1.003047619047619, "grad_norm": 11.256460189819336, "learning_rate": 2.220812182741117e-06, "loss": 4.4173, "step": 350 }, { "epoch": 1.0046349206349205, "grad_norm": 13.009560585021973, "learning_rate": 2.3794416243654827e-06, "loss": 4.2029, "step": 375 }, { "epoch": 1.0062222222222221, "grad_norm": 10.433947563171387, "learning_rate": 2.5380710659898476e-06, "loss": 4.0364, "step": 400 }, { "epoch": 1.0078095238095237, "grad_norm": 7.473550319671631, "learning_rate": 2.6967005076142134e-06, "loss": 3.8754, "step": 425 }, { "epoch": 1.0093968253968253, "grad_norm": 12.957052230834961, "learning_rate": 2.855329949238579e-06, "loss": 4.0458, "step": 450 }, { "epoch": 1.0109841269841269, "grad_norm": 9.960160255432129, "learning_rate": 3.0139593908629446e-06, "loss": 3.7578, "step": 475 }, { "epoch": 1.0125714285714287, "grad_norm": 12.287276268005371, "learning_rate": 3.17258883248731e-06, "loss": 3.6334, "step": 500 }, { "epoch": 1.0141587301587303, "grad_norm": 12.79332160949707, "learning_rate": 3.3312182741116753e-06, "loss": 3.4685, "step": 525 }, { "epoch": 1.0157460317460318, "grad_norm": 8.90628719329834, "learning_rate": 3.489847715736041e-06, "loss": 3.3762, "step": 550 }, { "epoch": 1.0173333333333334, "grad_norm": 12.635107040405273, "learning_rate": 3.6484771573604065e-06, "loss": 3.305, "step": 575 }, { "epoch": 1.018920634920635, "grad_norm": 15.49641227722168, "learning_rate": 3.8071065989847715e-06, "loss": 2.9876, "step": 600 }, { "epoch": 2.001333333333333, "grad_norm": 10.022441864013672, "learning_rate": 3.965736040609137e-06, "loss": 3.0157, "step": 625 }, { "epoch": 2.0029206349206348, "grad_norm": 9.137686729431152, "learning_rate": 4.124365482233503e-06, "loss": 2.8705, "step": 650 }, { "epoch": 2.0045079365079363, "grad_norm": 10.643122673034668, "learning_rate": 4.282994923857868e-06, "loss": 2.5582, "step": 675 }, { "epoch": 2.006095238095238, "grad_norm": 8.754136085510254, "learning_rate": 4.441624365482234e-06, "loss": 2.419, "step": 700 }, { "epoch": 2.0076825396825395, "grad_norm": 9.287137031555176, "learning_rate": 4.6002538071066e-06, "loss": 2.2705, "step": 725 }, { "epoch": 2.009269841269841, "grad_norm": 10.788775444030762, "learning_rate": 4.758883248730965e-06, "loss": 2.2656, "step": 750 }, { "epoch": 2.0108571428571427, "grad_norm": 9.800201416015625, "learning_rate": 4.91751269035533e-06, "loss": 1.9858, "step": 775 }, { "epoch": 2.0124444444444443, "grad_norm": 10.444952011108398, "learning_rate": 4.995989840930358e-06, "loss": 1.8272, "step": 800 }, { "epoch": 2.014031746031746, "grad_norm": 11.801770210266113, "learning_rate": 4.987635342868601e-06, "loss": 1.7213, "step": 825 }, { "epoch": 2.0156190476190474, "grad_norm": 8.304800033569336, "learning_rate": 4.979280844806844e-06, "loss": 1.5774, "step": 850 }, { "epoch": 2.017206349206349, "grad_norm": 10.936843872070312, "learning_rate": 4.970926346745088e-06, "loss": 1.5306, "step": 875 }, { "epoch": 2.0187936507936506, "grad_norm": 12.08633804321289, "learning_rate": 4.962571848683331e-06, "loss": 1.4236, "step": 900 }, { "epoch": 3.001206349206349, "grad_norm": 12.72133731842041, "learning_rate": 4.954217350621575e-06, "loss": 1.3777, "step": 925 }, { "epoch": 3.002793650793651, "grad_norm": 7.71216344833374, "learning_rate": 4.945862852559818e-06, "loss": 1.3662, "step": 950 }, { "epoch": 3.0043809523809526, "grad_norm": 4.924420356750488, "learning_rate": 4.937508354498062e-06, "loss": 1.1704, "step": 975 }, { "epoch": 3.005968253968254, "grad_norm": 9.430765151977539, "learning_rate": 4.9291538564363055e-06, "loss": 1.1824, "step": 1000 }, { "epoch": 3.0075555555555558, "grad_norm": 8.154838562011719, "learning_rate": 4.9207993583745495e-06, "loss": 1.1026, "step": 1025 }, { "epoch": 3.0091428571428573, "grad_norm": 8.593589782714844, "learning_rate": 4.912444860312793e-06, "loss": 1.1226, "step": 1050 }, { "epoch": 3.010730158730159, "grad_norm": 7.987609386444092, "learning_rate": 4.904090362251037e-06, "loss": 1.0897, "step": 1075 }, { "epoch": 3.0123174603174605, "grad_norm": 6.220165252685547, "learning_rate": 4.89573586418928e-06, "loss": 1.0358, "step": 1100 }, { "epoch": 3.013904761904762, "grad_norm": 5.584622383117676, "learning_rate": 4.887381366127523e-06, "loss": 1.0586, "step": 1125 }, { "epoch": 3.0154920634920637, "grad_norm": 6.9964141845703125, "learning_rate": 4.879026868065767e-06, "loss": 1.0425, "step": 1150 }, { "epoch": 3.0170793650793652, "grad_norm": 6.9891839027404785, "learning_rate": 4.87067237000401e-06, "loss": 1.0801, "step": 1175 }, { "epoch": 3.018666666666667, "grad_norm": 5.334001541137695, "learning_rate": 4.862317871942254e-06, "loss": 1.071, "step": 1200 }, { "epoch": 4.001079365079365, "grad_norm": 6.936366081237793, "learning_rate": 4.853963373880497e-06, "loss": 1.0287, "step": 1225 }, { "epoch": 4.002666666666666, "grad_norm": 5.803761959075928, "learning_rate": 4.845608875818741e-06, "loss": 1.1338, "step": 1250 }, { "epoch": 4.004253968253968, "grad_norm": 6.901465892791748, "learning_rate": 4.837254377756984e-06, "loss": 0.9522, "step": 1275 }, { "epoch": 4.0058412698412695, "grad_norm": 7.466715335845947, "learning_rate": 4.828899879695228e-06, "loss": 0.9676, "step": 1300 }, { "epoch": 4.007428571428571, "grad_norm": 5.247936248779297, "learning_rate": 4.820545381633472e-06, "loss": 0.9219, "step": 1325 }, { "epoch": 4.009015873015873, "grad_norm": 9.886089324951172, "learning_rate": 4.812190883571715e-06, "loss": 0.9054, "step": 1350 }, { "epoch": 4.010603174603174, "grad_norm": 6.104865550994873, "learning_rate": 4.803836385509959e-06, "loss": 0.9142, "step": 1375 }, { "epoch": 4.012190476190476, "grad_norm": 7.953219413757324, "learning_rate": 4.7954818874482025e-06, "loss": 0.899, "step": 1400 }, { "epoch": 4.0137777777777774, "grad_norm": 6.037745475769043, "learning_rate": 4.7871273893864465e-06, "loss": 0.9195, "step": 1425 }, { "epoch": 4.015365079365079, "grad_norm": 5.599011421203613, "learning_rate": 4.77877289132469e-06, "loss": 0.9049, "step": 1450 }, { "epoch": 4.016952380952381, "grad_norm": 9.971386909484863, "learning_rate": 4.770418393262934e-06, "loss": 0.9406, "step": 1475 }, { "epoch": 4.018539682539682, "grad_norm": 5.549421310424805, "learning_rate": 4.762063895201177e-06, "loss": 0.9306, "step": 1500 }, { "epoch": 5.000952380952381, "grad_norm": 8.038061141967773, "learning_rate": 4.753709397139421e-06, "loss": 0.9159, "step": 1525 }, { "epoch": 5.002539682539682, "grad_norm": 4.798451900482178, "learning_rate": 4.745354899077664e-06, "loss": 0.9853, "step": 1550 }, { "epoch": 5.004126984126984, "grad_norm": 5.497593402862549, "learning_rate": 4.737000401015907e-06, "loss": 0.8753, "step": 1575 }, { "epoch": 5.005714285714285, "grad_norm": 8.052043914794922, "learning_rate": 4.728645902954151e-06, "loss": 0.8879, "step": 1600 }, { "epoch": 5.007301587301587, "grad_norm": 10.569701194763184, "learning_rate": 4.720291404892394e-06, "loss": 0.8254, "step": 1625 }, { "epoch": 5.0088888888888885, "grad_norm": 7.724920749664307, "learning_rate": 4.711936906830638e-06, "loss": 0.7917, "step": 1650 }, { "epoch": 5.01047619047619, "grad_norm": 6.411539554595947, "learning_rate": 4.703582408768881e-06, "loss": 0.8296, "step": 1675 }, { "epoch": 5.012063492063492, "grad_norm": 4.88054084777832, "learning_rate": 4.695227910707125e-06, "loss": 0.8315, "step": 1700 }, { "epoch": 5.013650793650793, "grad_norm": 8.332351684570312, "learning_rate": 4.6868734126453685e-06, "loss": 0.8323, "step": 1725 }, { "epoch": 5.015238095238095, "grad_norm": 5.097577095031738, "learning_rate": 4.6785189145836124e-06, "loss": 0.8122, "step": 1750 }, { "epoch": 5.016825396825396, "grad_norm": 9.592188835144043, "learning_rate": 4.670164416521856e-06, "loss": 0.8712, "step": 1775 }, { "epoch": 5.018412698412698, "grad_norm": 7.307371616363525, "learning_rate": 4.6618099184601e-06, "loss": 0.8648, "step": 1800 }, { "epoch": 6.000825396825396, "grad_norm": 7.067652702331543, "learning_rate": 4.653455420398343e-06, "loss": 0.7985, "step": 1825 }, { "epoch": 6.002412698412698, "grad_norm": 6.129504203796387, "learning_rate": 4.645100922336586e-06, "loss": 0.9276, "step": 1850 }, { "epoch": 6.004, "grad_norm": 5.0192742347717285, "learning_rate": 4.63674642427483e-06, "loss": 0.8118, "step": 1875 }, { "epoch": 6.005587301587302, "grad_norm": 9.476181030273438, "learning_rate": 4.628391926213073e-06, "loss": 0.8063, "step": 1900 }, { "epoch": 6.007174603174604, "grad_norm": 4.5081987380981445, "learning_rate": 4.620037428151317e-06, "loss": 0.7605, "step": 1925 }, { "epoch": 6.008761904761905, "grad_norm": 7.077738285064697, "learning_rate": 4.61168293008956e-06, "loss": 0.7435, "step": 1950 }, { "epoch": 6.010349206349207, "grad_norm": 5.30238151550293, "learning_rate": 4.603328432027804e-06, "loss": 0.7603, "step": 1975 }, { "epoch": 6.011936507936508, "grad_norm": 6.851855754852295, "learning_rate": 4.594973933966047e-06, "loss": 0.7606, "step": 2000 }, { "epoch": 6.01352380952381, "grad_norm": 7.457930564880371, "learning_rate": 4.586619435904291e-06, "loss": 0.7822, "step": 2025 }, { "epoch": 6.0151111111111115, "grad_norm": 7.502450466156006, "learning_rate": 4.578264937842534e-06, "loss": 0.7487, "step": 2050 }, { "epoch": 6.016698412698413, "grad_norm": 6.5313544273376465, "learning_rate": 4.569910439780778e-06, "loss": 0.7878, "step": 2075 }, { "epoch": 6.018285714285715, "grad_norm": 7.514427661895752, "learning_rate": 4.5615559417190215e-06, "loss": 0.812, "step": 2100 }, { "epoch": 7.000698412698413, "grad_norm": 5.095201015472412, "learning_rate": 4.5532014436572655e-06, "loss": 0.7322, "step": 2125 }, { "epoch": 7.002285714285715, "grad_norm": 5.384045600891113, "learning_rate": 4.5448469455955095e-06, "loss": 0.8814, "step": 2150 }, { "epoch": 7.003873015873016, "grad_norm": 4.990402698516846, "learning_rate": 4.536492447533753e-06, "loss": 0.7483, "step": 2175 }, { "epoch": 7.005460317460318, "grad_norm": 6.337180137634277, "learning_rate": 4.528137949471997e-06, "loss": 0.742, "step": 2200 }, { "epoch": 7.007047619047619, "grad_norm": 4.511834621429443, "learning_rate": 4.51978345141024e-06, "loss": 0.7482, "step": 2225 }, { "epoch": 7.008634920634921, "grad_norm": 10.161249160766602, "learning_rate": 4.511428953348484e-06, "loss": 0.6561, "step": 2250 }, { "epoch": 7.010222222222223, "grad_norm": 13.714274406433105, "learning_rate": 4.503074455286727e-06, "loss": 0.7306, "step": 2275 }, { "epoch": 7.011809523809524, "grad_norm": 7.517003536224365, "learning_rate": 4.494719957224971e-06, "loss": 0.6942, "step": 2300 }, { "epoch": 7.013396825396826, "grad_norm": 5.552587509155273, "learning_rate": 4.486365459163214e-06, "loss": 0.7466, "step": 2325 }, { "epoch": 7.014984126984127, "grad_norm": 6.234260559082031, "learning_rate": 4.478010961101457e-06, "loss": 0.6913, "step": 2350 }, { "epoch": 7.016571428571429, "grad_norm": 5.961171627044678, "learning_rate": 4.469656463039701e-06, "loss": 0.7444, "step": 2375 }, { "epoch": 7.0181587301587305, "grad_norm": 9.553787231445312, "learning_rate": 4.461301964977944e-06, "loss": 0.7764, "step": 2400 }, { "epoch": 8.000571428571428, "grad_norm": 6.736727237701416, "learning_rate": 4.452947466916188e-06, "loss": 0.6902, "step": 2425 }, { "epoch": 8.00215873015873, "grad_norm": 6.498841285705566, "learning_rate": 4.4445929688544314e-06, "loss": 0.8205, "step": 2450 }, { "epoch": 8.003746031746031, "grad_norm": 3.605954647064209, "learning_rate": 4.4362384707926754e-06, "loss": 0.7139, "step": 2475 }, { "epoch": 8.005333333333333, "grad_norm": 4.930444240570068, "learning_rate": 4.427883972730919e-06, "loss": 0.7082, "step": 2500 }, { "epoch": 8.006920634920634, "grad_norm": 4.654155731201172, "learning_rate": 4.4195294746691626e-06, "loss": 0.734, "step": 2525 }, { "epoch": 8.008507936507936, "grad_norm": 4.657145977020264, "learning_rate": 4.411174976607406e-06, "loss": 0.6076, "step": 2550 }, { "epoch": 8.010095238095237, "grad_norm": 5.5561418533325195, "learning_rate": 4.402820478545649e-06, "loss": 0.6882, "step": 2575 }, { "epoch": 8.011682539682539, "grad_norm": 4.575371742248535, "learning_rate": 4.394465980483893e-06, "loss": 0.6572, "step": 2600 }, { "epoch": 8.01326984126984, "grad_norm": 13.730400085449219, "learning_rate": 4.386111482422136e-06, "loss": 0.7209, "step": 2625 }, { "epoch": 8.014857142857142, "grad_norm": 4.949292182922363, "learning_rate": 4.37775698436038e-06, "loss": 0.6433, "step": 2650 }, { "epoch": 8.016444444444444, "grad_norm": 9.161581039428711, "learning_rate": 4.369402486298623e-06, "loss": 0.7117, "step": 2675 }, { "epoch": 8.018031746031745, "grad_norm": 6.098261833190918, "learning_rate": 4.361047988236867e-06, "loss": 0.7515, "step": 2700 }, { "epoch": 9.000444444444444, "grad_norm": 5.116198539733887, "learning_rate": 4.35269349017511e-06, "loss": 0.6484, "step": 2725 }, { "epoch": 9.002031746031745, "grad_norm": 10.22689437866211, "learning_rate": 4.344338992113354e-06, "loss": 0.7807, "step": 2750 }, { "epoch": 9.003619047619047, "grad_norm": 12.227742195129395, "learning_rate": 4.335984494051597e-06, "loss": 0.6763, "step": 2775 }, { "epoch": 9.005206349206349, "grad_norm": 9.396242141723633, "learning_rate": 4.327629995989841e-06, "loss": 0.6722, "step": 2800 }, { "epoch": 9.00679365079365, "grad_norm": 5.258615016937256, "learning_rate": 4.3192754979280845e-06, "loss": 0.7176, "step": 2825 }, { "epoch": 9.008380952380952, "grad_norm": 5.422908306121826, "learning_rate": 4.3109209998663285e-06, "loss": 0.5881, "step": 2850 }, { "epoch": 9.009968253968253, "grad_norm": 7.922283172607422, "learning_rate": 4.302566501804572e-06, "loss": 0.6492, "step": 2875 }, { "epoch": 9.011555555555555, "grad_norm": 8.866413116455078, "learning_rate": 4.294212003742816e-06, "loss": 0.6277, "step": 2900 }, { "epoch": 9.013142857142856, "grad_norm": 9.206356048583984, "learning_rate": 4.285857505681059e-06, "loss": 0.6644, "step": 2925 }, { "epoch": 9.014730158730158, "grad_norm": 5.49714994430542, "learning_rate": 4.277503007619303e-06, "loss": 0.6481, "step": 2950 }, { "epoch": 9.01631746031746, "grad_norm": 6.799171447753906, "learning_rate": 4.269148509557547e-06, "loss": 0.67, "step": 2975 }, { "epoch": 9.017904761904761, "grad_norm": 7.201016902923584, "learning_rate": 4.26079401149579e-06, "loss": 0.7103, "step": 3000 }, { "epoch": 10.00031746031746, "grad_norm": 6.763806343078613, "learning_rate": 4.252439513434034e-06, "loss": 0.6487, "step": 3025 }, { "epoch": 10.001904761904761, "grad_norm": 9.263688087463379, "learning_rate": 4.244085015372277e-06, "loss": 0.7082, "step": 3050 }, { "epoch": 10.003492063492063, "grad_norm": 11.184915542602539, "learning_rate": 4.23573051731052e-06, "loss": 0.6759, "step": 3075 }, { "epoch": 10.005079365079364, "grad_norm": 4.252372741699219, "learning_rate": 4.227376019248764e-06, "loss": 0.66, "step": 3100 }, { "epoch": 10.006666666666666, "grad_norm": 11.342703819274902, "learning_rate": 4.219021521187007e-06, "loss": 0.6747, "step": 3125 }, { "epoch": 10.008253968253968, "grad_norm": 5.704590797424316, "learning_rate": 4.210667023125251e-06, "loss": 0.5862, "step": 3150 }, { "epoch": 10.009841269841269, "grad_norm": 5.683150291442871, "learning_rate": 4.2023125250634944e-06, "loss": 0.6129, "step": 3175 }, { "epoch": 10.01142857142857, "grad_norm": 4.855335235595703, "learning_rate": 4.193958027001738e-06, "loss": 0.604, "step": 3200 }, { "epoch": 10.013015873015872, "grad_norm": 7.8647613525390625, "learning_rate": 4.1856035289399816e-06, "loss": 0.6367, "step": 3225 }, { "epoch": 10.014603174603174, "grad_norm": 6.591641902923584, "learning_rate": 4.1772490308782256e-06, "loss": 0.6371, "step": 3250 }, { "epoch": 10.016190476190475, "grad_norm": 7.047679424285889, "learning_rate": 4.168894532816469e-06, "loss": 0.6433, "step": 3275 }, { "epoch": 10.017777777777777, "grad_norm": 7.674762725830078, "learning_rate": 4.160540034754712e-06, "loss": 0.6696, "step": 3300 }, { "epoch": 11.000190476190475, "grad_norm": 7.130011081695557, "learning_rate": 4.152185536692956e-06, "loss": 0.6527, "step": 3325 }, { "epoch": 11.001777777777777, "grad_norm": 7.317767143249512, "learning_rate": 4.143831038631199e-06, "loss": 0.6586, "step": 3350 }, { "epoch": 11.003365079365079, "grad_norm": 2.615405321121216, "learning_rate": 4.135476540569443e-06, "loss": 0.6771, "step": 3375 }, { "epoch": 11.00495238095238, "grad_norm": 4.891953468322754, "learning_rate": 4.127122042507686e-06, "loss": 0.64, "step": 3400 }, { "epoch": 11.006539682539682, "grad_norm": 6.664401531219482, "learning_rate": 4.11876754444593e-06, "loss": 0.6411, "step": 3425 }, { "epoch": 11.008126984126983, "grad_norm": 6.38748836517334, "learning_rate": 4.110413046384173e-06, "loss": 0.5616, "step": 3450 }, { "epoch": 11.009714285714285, "grad_norm": 6.957112789154053, "learning_rate": 4.102058548322417e-06, "loss": 0.5956, "step": 3475 }, { "epoch": 11.011301587301586, "grad_norm": 4.554030895233154, "learning_rate": 4.09370405026066e-06, "loss": 0.5998, "step": 3500 }, { "epoch": 11.012888888888888, "grad_norm": 5.208797454833984, "learning_rate": 4.085349552198904e-06, "loss": 0.616, "step": 3525 }, { "epoch": 11.01447619047619, "grad_norm": 6.402866840362549, "learning_rate": 4.0769950541371475e-06, "loss": 0.5958, "step": 3550 }, { "epoch": 11.016063492063491, "grad_norm": 5.4900946617126465, "learning_rate": 4.0686405560753915e-06, "loss": 0.6173, "step": 3575 }, { "epoch": 11.017650793650793, "grad_norm": 9.454499244689941, "learning_rate": 4.060286058013635e-06, "loss": 0.6567, "step": 3600 }, { "epoch": 12.000063492063491, "grad_norm": 9.544127464294434, "learning_rate": 4.051931559951879e-06, "loss": 0.6164, "step": 3625 }, { "epoch": 12.001650793650793, "grad_norm": 5.444927215576172, "learning_rate": 4.043577061890122e-06, "loss": 0.6563, "step": 3650 }, { "epoch": 12.003238095238094, "grad_norm": 3.2568845748901367, "learning_rate": 4.035222563828366e-06, "loss": 0.6631, "step": 3675 }, { "epoch": 12.004825396825396, "grad_norm": 11.233345031738281, "learning_rate": 4.026868065766609e-06, "loss": 0.6186, "step": 3700 }, { "epoch": 12.006412698412698, "grad_norm": 5.101284027099609, "learning_rate": 4.018513567704853e-06, "loss": 0.6307, "step": 3725 }, { "epoch": 12.008, "grad_norm": 7.161935329437256, "learning_rate": 4.010159069643096e-06, "loss": 0.5363, "step": 3750 }, { "epoch": 12.0095873015873, "grad_norm": 8.10970401763916, "learning_rate": 4.00180457158134e-06, "loss": 0.5729, "step": 3775 }, { "epoch": 12.011174603174604, "grad_norm": 6.0759077072143555, "learning_rate": 3.993450073519583e-06, "loss": 0.5785, "step": 3800 }, { "epoch": 12.012761904761906, "grad_norm": 11.944267272949219, "learning_rate": 3.985095575457827e-06, "loss": 0.5812, "step": 3825 }, { "epoch": 12.014349206349207, "grad_norm": 6.059834957122803, "learning_rate": 3.97674107739607e-06, "loss": 0.5958, "step": 3850 }, { "epoch": 12.015936507936509, "grad_norm": 5.849289417266846, "learning_rate": 3.968386579334314e-06, "loss": 0.6121, "step": 3875 }, { "epoch": 12.01752380952381, "grad_norm": 5.543300628662109, "learning_rate": 3.960032081272557e-06, "loss": 0.6259, "step": 3900 }, { "epoch": 12.019111111111112, "grad_norm": 5.864200592041016, "learning_rate": 3.951677583210801e-06, "loss": 0.597, "step": 3925 }, { "epoch": 13.00152380952381, "grad_norm": 9.676383972167969, "learning_rate": 3.9433230851490445e-06, "loss": 0.6586, "step": 3950 }, { "epoch": 13.003111111111112, "grad_norm": 11.184745788574219, "learning_rate": 3.9349685870872885e-06, "loss": 0.644, "step": 3975 }, { "epoch": 13.004698412698414, "grad_norm": 6.4422502517700195, "learning_rate": 3.926614089025532e-06, "loss": 0.5813, "step": 4000 }, { "epoch": 13.006285714285715, "grad_norm": 5.095337390899658, "learning_rate": 3.918259590963775e-06, "loss": 0.6238, "step": 4025 }, { "epoch": 13.007873015873017, "grad_norm": 7.0134663581848145, "learning_rate": 3.909905092902019e-06, "loss": 0.5323, "step": 4050 }, { "epoch": 13.009460317460318, "grad_norm": 11.49138069152832, "learning_rate": 3.901550594840262e-06, "loss": 0.5527, "step": 4075 }, { "epoch": 13.01104761904762, "grad_norm": 5.017846584320068, "learning_rate": 3.893196096778506e-06, "loss": 0.5561, "step": 4100 }, { "epoch": 13.012634920634921, "grad_norm": 6.452044486999512, "learning_rate": 3.884841598716749e-06, "loss": 0.5796, "step": 4125 }, { "epoch": 13.014222222222223, "grad_norm": 4.59455680847168, "learning_rate": 3.876487100654993e-06, "loss": 0.5686, "step": 4150 }, { "epoch": 13.015809523809525, "grad_norm": 6.177151203155518, "learning_rate": 3.868132602593236e-06, "loss": 0.5974, "step": 4175 }, { "epoch": 13.017396825396826, "grad_norm": 8.819549560546875, "learning_rate": 3.85977810453148e-06, "loss": 0.6186, "step": 4200 }, { "epoch": 13.018984126984128, "grad_norm": 4.838363170623779, "learning_rate": 3.851423606469723e-06, "loss": 0.5783, "step": 4225 }, { "epoch": 14.001396825396826, "grad_norm": 6.992326736450195, "learning_rate": 3.843069108407967e-06, "loss": 0.6319, "step": 4250 }, { "epoch": 14.002984126984128, "grad_norm": 6.273831367492676, "learning_rate": 3.8347146103462105e-06, "loss": 0.6446, "step": 4275 }, { "epoch": 14.00457142857143, "grad_norm": 5.901345252990723, "learning_rate": 3.826360112284454e-06, "loss": 0.5465, "step": 4300 }, { "epoch": 14.006158730158731, "grad_norm": 9.980649948120117, "learning_rate": 3.818005614222698e-06, "loss": 0.6371, "step": 4325 }, { "epoch": 14.007746031746033, "grad_norm": 3.479801654815674, "learning_rate": 3.809651116160941e-06, "loss": 0.5216, "step": 4350 }, { "epoch": 14.009333333333334, "grad_norm": 7.416255950927734, "learning_rate": 3.801296618099185e-06, "loss": 0.5298, "step": 4375 }, { "epoch": 14.010920634920636, "grad_norm": 6.53993034362793, "learning_rate": 3.7929421200374283e-06, "loss": 0.5488, "step": 4400 }, { "epoch": 14.012507936507937, "grad_norm": 5.593329906463623, "learning_rate": 3.7845876219756723e-06, "loss": 0.5506, "step": 4425 }, { "epoch": 14.014095238095239, "grad_norm": 5.567030906677246, "learning_rate": 3.7762331239139154e-06, "loss": 0.5616, "step": 4450 }, { "epoch": 14.01568253968254, "grad_norm": 11.033809661865234, "learning_rate": 3.7678786258521594e-06, "loss": 0.5849, "step": 4475 }, { "epoch": 14.017269841269842, "grad_norm": 5.325310707092285, "learning_rate": 3.7595241277904026e-06, "loss": 0.5911, "step": 4500 }, { "epoch": 14.018857142857144, "grad_norm": 10.827190399169922, "learning_rate": 3.751169629728646e-06, "loss": 0.5941, "step": 4525 }, { "epoch": 15.001269841269842, "grad_norm": 6.941823482513428, "learning_rate": 3.7428151316668897e-06, "loss": 0.6097, "step": 4550 }, { "epoch": 15.002857142857144, "grad_norm": 11.185466766357422, "learning_rate": 3.7344606336051333e-06, "loss": 0.6295, "step": 4575 }, { "epoch": 15.004444444444445, "grad_norm": 7.75522518157959, "learning_rate": 3.726106135543377e-06, "loss": 0.5308, "step": 4600 }, { "epoch": 15.006031746031747, "grad_norm": 5.286986827850342, "learning_rate": 3.7177516374816204e-06, "loss": 0.6022, "step": 4625 }, { "epoch": 15.007619047619048, "grad_norm": 6.149432182312012, "learning_rate": 3.709397139419864e-06, "loss": 0.5457, "step": 4650 }, { "epoch": 15.00920634920635, "grad_norm": 7.458939552307129, "learning_rate": 3.7010426413581075e-06, "loss": 0.5073, "step": 4675 }, { "epoch": 15.010793650793651, "grad_norm": 5.971858024597168, "learning_rate": 3.692688143296351e-06, "loss": 0.5418, "step": 4700 }, { "epoch": 15.012380952380953, "grad_norm": 5.619646072387695, "learning_rate": 3.6843336452345947e-06, "loss": 0.5423, "step": 4725 }, { "epoch": 15.013968253968255, "grad_norm": 6.382497787475586, "learning_rate": 3.675979147172838e-06, "loss": 0.5446, "step": 4750 }, { "epoch": 15.015555555555556, "grad_norm": 6.406772136688232, "learning_rate": 3.667624649111082e-06, "loss": 0.5734, "step": 4775 }, { "epoch": 15.017142857142858, "grad_norm": 7.28453254699707, "learning_rate": 3.659270151049325e-06, "loss": 0.5786, "step": 4800 }, { "epoch": 15.01873015873016, "grad_norm": 9.923730850219727, "learning_rate": 3.650915652987569e-06, "loss": 0.5791, "step": 4825 }, { "epoch": 16.001142857142856, "grad_norm": 8.251771926879883, "learning_rate": 3.642561154925812e-06, "loss": 0.5688, "step": 4850 }, { "epoch": 16.00273015873016, "grad_norm": 7.045546054840088, "learning_rate": 3.634206656864056e-06, "loss": 0.6337, "step": 4875 }, { "epoch": 16.00431746031746, "grad_norm": 8.073708534240723, "learning_rate": 3.625852158802299e-06, "loss": 0.5459, "step": 4900 }, { "epoch": 16.005904761904763, "grad_norm": 4.836098670959473, "learning_rate": 3.617497660740543e-06, "loss": 0.5852, "step": 4925 }, { "epoch": 16.007492063492062, "grad_norm": 6.777514457702637, "learning_rate": 3.6091431626787863e-06, "loss": 0.5408, "step": 4950 }, { "epoch": 16.009079365079366, "grad_norm": 4.61210298538208, "learning_rate": 3.6007886646170303e-06, "loss": 0.4816, "step": 4975 }, { "epoch": 16.010666666666665, "grad_norm": 6.172779083251953, "learning_rate": 3.5924341665552735e-06, "loss": 0.528, "step": 5000 }, { "epoch": 16.01225396825397, "grad_norm": 5.35014009475708, "learning_rate": 3.584079668493517e-06, "loss": 0.5443, "step": 5025 }, { "epoch": 16.01384126984127, "grad_norm": 7.521047115325928, "learning_rate": 3.575725170431761e-06, "loss": 0.5411, "step": 5050 }, { "epoch": 16.015428571428572, "grad_norm": 3.7705650329589844, "learning_rate": 3.567370672370004e-06, "loss": 0.555, "step": 5075 }, { "epoch": 16.017015873015872, "grad_norm": 4.953466892242432, "learning_rate": 3.559016174308248e-06, "loss": 0.5691, "step": 5100 }, { "epoch": 16.018603174603175, "grad_norm": 9.339215278625488, "learning_rate": 3.5506616762464913e-06, "loss": 0.5754, "step": 5125 }, { "epoch": 17.001015873015874, "grad_norm": 6.013854503631592, "learning_rate": 3.5423071781847353e-06, "loss": 0.5407, "step": 5150 }, { "epoch": 17.002603174603173, "grad_norm": 6.144811630249023, "learning_rate": 3.5339526801229784e-06, "loss": 0.6193, "step": 5175 }, { "epoch": 17.004190476190477, "grad_norm": 4.639403820037842, "learning_rate": 3.5255981820612224e-06, "loss": 0.5446, "step": 5200 }, { "epoch": 17.005777777777777, "grad_norm": 3.8471908569335938, "learning_rate": 3.5172436839994656e-06, "loss": 0.5891, "step": 5225 }, { "epoch": 17.00736507936508, "grad_norm": 4.270881175994873, "learning_rate": 3.5088891859377087e-06, "loss": 0.5323, "step": 5250 }, { "epoch": 17.00895238095238, "grad_norm": 6.322847366333008, "learning_rate": 3.5005346878759527e-06, "loss": 0.4589, "step": 5275 }, { "epoch": 17.010539682539683, "grad_norm": 18.737030029296875, "learning_rate": 3.492180189814196e-06, "loss": 0.5348, "step": 5300 }, { "epoch": 17.012126984126983, "grad_norm": 3.4430785179138184, "learning_rate": 3.48382569175244e-06, "loss": 0.5397, "step": 5325 }, { "epoch": 17.013714285714286, "grad_norm": 6.301079273223877, "learning_rate": 3.4754711936906834e-06, "loss": 0.5254, "step": 5350 }, { "epoch": 17.015301587301586, "grad_norm": 10.843756675720215, "learning_rate": 3.467116695628927e-06, "loss": 0.5535, "step": 5375 }, { "epoch": 17.01688888888889, "grad_norm": 9.099514961242676, "learning_rate": 3.4587621975671705e-06, "loss": 0.5458, "step": 5400 }, { "epoch": 17.01847619047619, "grad_norm": 5.591258525848389, "learning_rate": 3.450407699505414e-06, "loss": 0.5715, "step": 5425 }, { "epoch": 18.000888888888888, "grad_norm": 7.137011528015137, "learning_rate": 3.4420532014436577e-06, "loss": 0.5219, "step": 5450 }, { "epoch": 18.00247619047619, "grad_norm": 9.867881774902344, "learning_rate": 3.4336987033819012e-06, "loss": 0.6136, "step": 5475 }, { "epoch": 18.00406349206349, "grad_norm": 11.723578453063965, "learning_rate": 3.4253442053201448e-06, "loss": 0.5442, "step": 5500 }, { "epoch": 18.005650793650794, "grad_norm": 5.498622417449951, "learning_rate": 3.416989707258388e-06, "loss": 0.5691, "step": 5525 }, { "epoch": 18.007238095238094, "grad_norm": 5.055809020996094, "learning_rate": 3.408635209196632e-06, "loss": 0.5105, "step": 5550 }, { "epoch": 18.008825396825397, "grad_norm": 3.1195857524871826, "learning_rate": 3.400280711134875e-06, "loss": 0.4747, "step": 5575 }, { "epoch": 18.010412698412697, "grad_norm": 11.140824317932129, "learning_rate": 3.391926213073119e-06, "loss": 0.5209, "step": 5600 }, { "epoch": 18.012, "grad_norm": 6.157780647277832, "learning_rate": 3.383571715011362e-06, "loss": 0.534, "step": 5625 }, { "epoch": 18.0135873015873, "grad_norm": 4.913928031921387, "learning_rate": 3.375217216949606e-06, "loss": 0.5126, "step": 5650 }, { "epoch": 18.015174603174604, "grad_norm": 7.3261895179748535, "learning_rate": 3.3668627188878493e-06, "loss": 0.5411, "step": 5675 }, { "epoch": 18.016761904761903, "grad_norm": 4.676502227783203, "learning_rate": 3.3585082208260933e-06, "loss": 0.5322, "step": 5700 }, { "epoch": 18.018349206349207, "grad_norm": 8.648612976074219, "learning_rate": 3.3501537227643365e-06, "loss": 0.5704, "step": 5725 }, { "epoch": 19.000761904761905, "grad_norm": 10.17095947265625, "learning_rate": 3.34179922470258e-06, "loss": 0.5118, "step": 5750 }, { "epoch": 19.002349206349205, "grad_norm": 4.843137264251709, "learning_rate": 3.3334447266408236e-06, "loss": 0.6025, "step": 5775 }, { "epoch": 19.00393650793651, "grad_norm": 8.146428108215332, "learning_rate": 3.325090228579067e-06, "loss": 0.5498, "step": 5800 }, { "epoch": 19.005523809523808, "grad_norm": 9.00712776184082, "learning_rate": 3.3167357305173107e-06, "loss": 0.5468, "step": 5825 }, { "epoch": 19.00711111111111, "grad_norm": 4.855189800262451, "learning_rate": 3.3083812324555543e-06, "loss": 0.5228, "step": 5850 }, { "epoch": 19.00869841269841, "grad_norm": 4.557061672210693, "learning_rate": 3.300026734393798e-06, "loss": 0.4595, "step": 5875 }, { "epoch": 19.010285714285715, "grad_norm": 8.567035675048828, "learning_rate": 3.2916722363320414e-06, "loss": 0.519, "step": 5900 }, { "epoch": 19.011873015873014, "grad_norm": 26.754615783691406, "learning_rate": 3.2833177382702854e-06, "loss": 0.5212, "step": 5925 }, { "epoch": 19.013460317460318, "grad_norm": 4.910426139831543, "learning_rate": 3.2749632402085285e-06, "loss": 0.5025, "step": 5950 }, { "epoch": 19.015047619047618, "grad_norm": 11.170868873596191, "learning_rate": 3.2666087421467717e-06, "loss": 0.5109, "step": 5975 }, { "epoch": 19.01663492063492, "grad_norm": 8.7157564163208, "learning_rate": 3.2582542440850157e-06, "loss": 0.5374, "step": 6000 }, { "epoch": 19.01822222222222, "grad_norm": 5.3223700523376465, "learning_rate": 3.249899746023259e-06, "loss": 0.5608, "step": 6025 }, { "epoch": 20.00063492063492, "grad_norm": 15.868850708007812, "learning_rate": 3.241545247961503e-06, "loss": 0.5028, "step": 6050 }, { "epoch": 20.002222222222223, "grad_norm": 7.932621955871582, "learning_rate": 3.233190749899746e-06, "loss": 0.6152, "step": 6075 }, { "epoch": 20.003809523809522, "grad_norm": 10.044479370117188, "learning_rate": 3.22483625183799e-06, "loss": 0.5187, "step": 6100 }, { "epoch": 20.005396825396826, "grad_norm": 7.519008159637451, "learning_rate": 3.216481753776233e-06, "loss": 0.5333, "step": 6125 }, { "epoch": 20.006984126984126, "grad_norm": 4.2018866539001465, "learning_rate": 3.208127255714477e-06, "loss": 0.546, "step": 6150 }, { "epoch": 20.00857142857143, "grad_norm": 6.87973690032959, "learning_rate": 3.1997727576527206e-06, "loss": 0.4432, "step": 6175 }, { "epoch": 20.01015873015873, "grad_norm": 6.215482711791992, "learning_rate": 3.191418259590964e-06, "loss": 0.5015, "step": 6200 }, { "epoch": 20.011746031746032, "grad_norm": 5.1478753089904785, "learning_rate": 3.1830637615292078e-06, "loss": 0.5034, "step": 6225 }, { "epoch": 20.013333333333332, "grad_norm": 3.017598867416382, "learning_rate": 3.174709263467451e-06, "loss": 0.5259, "step": 6250 }, { "epoch": 20.014920634920635, "grad_norm": 9.40729808807373, "learning_rate": 3.166354765405695e-06, "loss": 0.4903, "step": 6275 }, { "epoch": 20.016507936507935, "grad_norm": 10.465718269348145, "learning_rate": 3.158000267343938e-06, "loss": 0.5392, "step": 6300 }, { "epoch": 20.01809523809524, "grad_norm": 5.032984733581543, "learning_rate": 3.149645769282182e-06, "loss": 0.5571, "step": 6325 }, { "epoch": 21.000507936507937, "grad_norm": 9.124772071838379, "learning_rate": 3.141291271220425e-06, "loss": 0.4855, "step": 6350 }, { "epoch": 21.002095238095237, "grad_norm": 5.090267181396484, "learning_rate": 3.132936773158669e-06, "loss": 0.5912, "step": 6375 }, { "epoch": 21.00368253968254, "grad_norm": 7.209656238555908, "learning_rate": 3.1245822750969123e-06, "loss": 0.5315, "step": 6400 }, { "epoch": 21.00526984126984, "grad_norm": 10.766797065734863, "learning_rate": 3.1162277770351563e-06, "loss": 0.5364, "step": 6425 }, { "epoch": 21.006857142857143, "grad_norm": 9.655423164367676, "learning_rate": 3.1078732789733994e-06, "loss": 0.5469, "step": 6450 }, { "epoch": 21.008444444444443, "grad_norm": 9.706192970275879, "learning_rate": 3.099518780911643e-06, "loss": 0.4377, "step": 6475 }, { "epoch": 21.010031746031746, "grad_norm": 5.6594343185424805, "learning_rate": 3.0911642828498866e-06, "loss": 0.4867, "step": 6500 }, { "epoch": 21.011619047619046, "grad_norm": 9.920619010925293, "learning_rate": 3.08280978478813e-06, "loss": 0.497, "step": 6525 }, { "epoch": 21.01320634920635, "grad_norm": 4.976430892944336, "learning_rate": 3.0744552867263737e-06, "loss": 0.4949, "step": 6550 }, { "epoch": 21.01479365079365, "grad_norm": 7.563751697540283, "learning_rate": 3.0661007886646173e-06, "loss": 0.5136, "step": 6575 }, { "epoch": 21.016380952380953, "grad_norm": 5.668909072875977, "learning_rate": 3.057746290602861e-06, "loss": 0.5097, "step": 6600 }, { "epoch": 21.017968253968252, "grad_norm": 6.428191661834717, "learning_rate": 3.0493917925411044e-06, "loss": 0.5594, "step": 6625 }, { "epoch": 22.00038095238095, "grad_norm": 6.726556301116943, "learning_rate": 3.041037294479348e-06, "loss": 0.4874, "step": 6650 }, { "epoch": 22.001968253968254, "grad_norm": 14.118648529052734, "learning_rate": 3.0326827964175915e-06, "loss": 0.5642, "step": 6675 }, { "epoch": 22.003555555555554, "grad_norm": 11.719437599182129, "learning_rate": 3.024328298355835e-06, "loss": 0.5228, "step": 6700 }, { "epoch": 22.005142857142857, "grad_norm": 7.100255489349365, "learning_rate": 3.0159738002940787e-06, "loss": 0.5289, "step": 6725 }, { "epoch": 22.006730158730157, "grad_norm": 15.27164363861084, "learning_rate": 3.007619302232322e-06, "loss": 0.5544, "step": 6750 }, { "epoch": 22.00831746031746, "grad_norm": 6.844352722167969, "learning_rate": 2.999264804170566e-06, "loss": 0.4507, "step": 6775 }, { "epoch": 22.00990476190476, "grad_norm": 5.465493202209473, "learning_rate": 2.990910306108809e-06, "loss": 0.4717, "step": 6800 }, { "epoch": 22.011492063492064, "grad_norm": 6.300055503845215, "learning_rate": 2.982555808047053e-06, "loss": 0.4833, "step": 6825 }, { "epoch": 22.013079365079363, "grad_norm": 3.9687702655792236, "learning_rate": 2.974201309985296e-06, "loss": 0.4925, "step": 6850 }, { "epoch": 22.014666666666667, "grad_norm": 5.950267791748047, "learning_rate": 2.96584681192354e-06, "loss": 0.5093, "step": 6875 }, { "epoch": 22.016253968253967, "grad_norm": 7.3085618019104, "learning_rate": 2.957492313861783e-06, "loss": 0.5095, "step": 6900 }, { "epoch": 22.01784126984127, "grad_norm": 10.438004493713379, "learning_rate": 2.949137815800027e-06, "loss": 0.5509, "step": 6925 }, { "epoch": 23.00025396825397, "grad_norm": 5.851992130279541, "learning_rate": 2.9407833177382703e-06, "loss": 0.4989, "step": 6950 }, { "epoch": 23.001841269841268, "grad_norm": 5.808182716369629, "learning_rate": 2.932428819676514e-06, "loss": 0.531, "step": 6975 }, { "epoch": 23.00342857142857, "grad_norm": 4.814669132232666, "learning_rate": 2.924074321614758e-06, "loss": 0.5413, "step": 7000 }, { "epoch": 23.00501587301587, "grad_norm": 7.406203269958496, "learning_rate": 2.915719823553001e-06, "loss": 0.5223, "step": 7025 }, { "epoch": 23.006603174603175, "grad_norm": 4.7713942527771, "learning_rate": 2.907365325491245e-06, "loss": 0.5251, "step": 7050 }, { "epoch": 23.008190476190475, "grad_norm": 4.403865814208984, "learning_rate": 2.899010827429488e-06, "loss": 0.46, "step": 7075 }, { "epoch": 23.009777777777778, "grad_norm": 5.674661636352539, "learning_rate": 2.890656329367732e-06, "loss": 0.4705, "step": 7100 }, { "epoch": 23.011365079365078, "grad_norm": 7.83860445022583, "learning_rate": 2.8823018313059753e-06, "loss": 0.4906, "step": 7125 }, { "epoch": 23.01295238095238, "grad_norm": 3.9756040573120117, "learning_rate": 2.8739473332442193e-06, "loss": 0.4931, "step": 7150 }, { "epoch": 23.01453968253968, "grad_norm": 4.530709743499756, "learning_rate": 2.8655928351824624e-06, "loss": 0.4957, "step": 7175 }, { "epoch": 23.016126984126984, "grad_norm": 7.570037364959717, "learning_rate": 2.8572383371207056e-06, "loss": 0.5039, "step": 7200 }, { "epoch": 23.017714285714284, "grad_norm": 6.422541618347168, "learning_rate": 2.8488838390589496e-06, "loss": 0.5312, "step": 7225 }, { "epoch": 24.000126984126982, "grad_norm": 7.004579544067383, "learning_rate": 2.8405293409971927e-06, "loss": 0.4954, "step": 7250 }, { "epoch": 24.001714285714286, "grad_norm": 11.172274589538574, "learning_rate": 2.8321748429354367e-06, "loss": 0.5304, "step": 7275 }, { "epoch": 24.003301587301586, "grad_norm": 6.250117301940918, "learning_rate": 2.8238203448736803e-06, "loss": 0.5508, "step": 7300 }, { "epoch": 24.00488888888889, "grad_norm": 5.038013935089111, "learning_rate": 2.815465846811924e-06, "loss": 0.5128, "step": 7325 }, { "epoch": 24.00647619047619, "grad_norm": 3.5625054836273193, "learning_rate": 2.8071113487501674e-06, "loss": 0.5219, "step": 7350 }, { "epoch": 24.008063492063492, "grad_norm": 4.982530117034912, "learning_rate": 2.798756850688411e-06, "loss": 0.4495, "step": 7375 }, { "epoch": 24.009650793650792, "grad_norm": 17.86178207397461, "learning_rate": 2.7904023526266545e-06, "loss": 0.4743, "step": 7400 }, { "epoch": 24.011238095238095, "grad_norm": 6.184370517730713, "learning_rate": 2.782047854564898e-06, "loss": 0.4768, "step": 7425 }, { "epoch": 24.012825396825395, "grad_norm": 11.036730766296387, "learning_rate": 2.7736933565031416e-06, "loss": 0.493, "step": 7450 }, { "epoch": 24.0144126984127, "grad_norm": 7.9786577224731445, "learning_rate": 2.765338858441385e-06, "loss": 0.4758, "step": 7475 }, { "epoch": 24.016, "grad_norm": 5.915741443634033, "learning_rate": 2.7569843603796288e-06, "loss": 0.5088, "step": 7500 }, { "epoch": 24.0175873015873, "grad_norm": 10.715784072875977, "learning_rate": 2.748629862317872e-06, "loss": 0.5091, "step": 7525 }, { "epoch": 24.0191746031746, "grad_norm": 6.6324381828308105, "learning_rate": 2.740275364256116e-06, "loss": 0.4805, "step": 7550 }, { "epoch": 25.001587301587303, "grad_norm": 4.505024433135986, "learning_rate": 2.731920866194359e-06, "loss": 0.55, "step": 7575 }, { "epoch": 25.003174603174603, "grad_norm": 6.079422950744629, "learning_rate": 2.723566368132603e-06, "loss": 0.5442, "step": 7600 }, { "epoch": 25.004761904761907, "grad_norm": 5.729933738708496, "learning_rate": 2.715211870070846e-06, "loss": 0.4952, "step": 7625 }, { "epoch": 25.006349206349206, "grad_norm": 12.97045612335205, "learning_rate": 2.70685737200909e-06, "loss": 0.5312, "step": 7650 }, { "epoch": 25.00793650793651, "grad_norm": 6.688389301300049, "learning_rate": 2.6985028739473333e-06, "loss": 0.4426, "step": 7675 }, { "epoch": 25.00952380952381, "grad_norm": 5.51877498626709, "learning_rate": 2.690148375885577e-06, "loss": 0.4597, "step": 7700 }, { "epoch": 25.011111111111113, "grad_norm": 6.6266374588012695, "learning_rate": 2.6817938778238204e-06, "loss": 0.4767, "step": 7725 }, { "epoch": 25.012698412698413, "grad_norm": 4.988967418670654, "learning_rate": 2.673439379762064e-06, "loss": 0.4721, "step": 7750 }, { "epoch": 25.014285714285716, "grad_norm": 5.249930381774902, "learning_rate": 2.6650848817003076e-06, "loss": 0.481, "step": 7775 }, { "epoch": 25.015873015873016, "grad_norm": 8.894637107849121, "learning_rate": 2.656730383638551e-06, "loss": 0.5092, "step": 7800 }, { "epoch": 25.01746031746032, "grad_norm": 6.139794826507568, "learning_rate": 2.648375885576795e-06, "loss": 0.5146, "step": 7825 }, { "epoch": 25.01904761904762, "grad_norm": 4.836121082305908, "learning_rate": 2.6400213875150383e-06, "loss": 0.4853, "step": 7850 }, { "epoch": 26.001460317460317, "grad_norm": 4.840237140655518, "learning_rate": 2.6316668894532823e-06, "loss": 0.5325, "step": 7875 }, { "epoch": 26.00304761904762, "grad_norm": 6.270430088043213, "learning_rate": 2.6233123913915254e-06, "loss": 0.5517, "step": 7900 }, { "epoch": 26.00463492063492, "grad_norm": 6.732022285461426, "learning_rate": 2.6149578933297694e-06, "loss": 0.4771, "step": 7925 }, { "epoch": 26.006222222222224, "grad_norm": 4.249831199645996, "learning_rate": 2.6066033952680125e-06, "loss": 0.5281, "step": 7950 }, { "epoch": 26.007809523809524, "grad_norm": 9.650166511535645, "learning_rate": 2.5982488972062557e-06, "loss": 0.4461, "step": 7975 }, { "epoch": 26.009396825396827, "grad_norm": 5.7691216468811035, "learning_rate": 2.5898943991444997e-06, "loss": 0.4487, "step": 8000 }, { "epoch": 26.010984126984127, "grad_norm": 5.991948127746582, "learning_rate": 2.581539901082743e-06, "loss": 0.4715, "step": 8025 }, { "epoch": 26.01257142857143, "grad_norm": 11.065790176391602, "learning_rate": 2.573185403020987e-06, "loss": 0.4742, "step": 8050 }, { "epoch": 26.01415873015873, "grad_norm": 11.387042045593262, "learning_rate": 2.56483090495923e-06, "loss": 0.4793, "step": 8075 }, { "epoch": 26.015746031746033, "grad_norm": 7.323668479919434, "learning_rate": 2.556476406897474e-06, "loss": 0.4933, "step": 8100 }, { "epoch": 26.017333333333333, "grad_norm": 10.183083534240723, "learning_rate": 2.5481219088357175e-06, "loss": 0.5176, "step": 8125 }, { "epoch": 26.018920634920637, "grad_norm": 3.41259503364563, "learning_rate": 2.539767410773961e-06, "loss": 0.4789, "step": 8150 }, { "epoch": 27.001333333333335, "grad_norm": 8.132092475891113, "learning_rate": 2.5314129127122046e-06, "loss": 0.5315, "step": 8175 }, { "epoch": 27.002920634920635, "grad_norm": 6.488096237182617, "learning_rate": 2.5230584146504478e-06, "loss": 0.5424, "step": 8200 }, { "epoch": 27.004507936507938, "grad_norm": 7.1543803215026855, "learning_rate": 2.5147039165886918e-06, "loss": 0.4628, "step": 8225 }, { "epoch": 27.006095238095238, "grad_norm": 6.017189025878906, "learning_rate": 2.506349418526935e-06, "loss": 0.5284, "step": 8250 }, { "epoch": 27.00768253968254, "grad_norm": 5.09862756729126, "learning_rate": 2.497994920465179e-06, "loss": 0.4613, "step": 8275 }, { "epoch": 27.00926984126984, "grad_norm": 6.283570766448975, "learning_rate": 2.489640422403422e-06, "loss": 0.4374, "step": 8300 }, { "epoch": 27.010857142857144, "grad_norm": 5.45609712600708, "learning_rate": 2.4812859243416656e-06, "loss": 0.4615, "step": 8325 }, { "epoch": 27.012444444444444, "grad_norm": 9.621217727661133, "learning_rate": 2.472931426279909e-06, "loss": 0.4665, "step": 8350 }, { "epoch": 27.014031746031748, "grad_norm": 10.336989402770996, "learning_rate": 2.4645769282181527e-06, "loss": 0.4712, "step": 8375 }, { "epoch": 27.015619047619047, "grad_norm": 8.53022289276123, "learning_rate": 2.4562224301563963e-06, "loss": 0.4869, "step": 8400 }, { "epoch": 27.01720634920635, "grad_norm": 6.758102893829346, "learning_rate": 2.44786793209464e-06, "loss": 0.5007, "step": 8425 }, { "epoch": 27.01879365079365, "grad_norm": 6.737295627593994, "learning_rate": 2.4395134340328834e-06, "loss": 0.5, "step": 8450 }, { "epoch": 28.00120634920635, "grad_norm": 8.9446439743042, "learning_rate": 2.431158935971127e-06, "loss": 0.5098, "step": 8475 }, { "epoch": 28.002793650793652, "grad_norm": 4.6513190269470215, "learning_rate": 2.4228044379093706e-06, "loss": 0.5249, "step": 8500 }, { "epoch": 28.004380952380952, "grad_norm": 7.930838108062744, "learning_rate": 2.414449939847614e-06, "loss": 0.4823, "step": 8525 }, { "epoch": 28.005968253968256, "grad_norm": 5.986405372619629, "learning_rate": 2.4060954417858577e-06, "loss": 0.5295, "step": 8550 }, { "epoch": 28.007555555555555, "grad_norm": 6.348638534545898, "learning_rate": 2.3977409437241013e-06, "loss": 0.4608, "step": 8575 }, { "epoch": 28.00914285714286, "grad_norm": 5.640425205230713, "learning_rate": 2.389386445662345e-06, "loss": 0.4183, "step": 8600 }, { "epoch": 28.01073015873016, "grad_norm": 7.974732875823975, "learning_rate": 2.3810319476005884e-06, "loss": 0.4716, "step": 8625 }, { "epoch": 28.012317460317462, "grad_norm": 4.698752403259277, "learning_rate": 2.372677449538832e-06, "loss": 0.4637, "step": 8650 }, { "epoch": 28.01390476190476, "grad_norm": 4.2253828048706055, "learning_rate": 2.3643229514770755e-06, "loss": 0.4589, "step": 8675 }, { "epoch": 28.015492063492065, "grad_norm": 7.007496356964111, "learning_rate": 2.355968453415319e-06, "loss": 0.4978, "step": 8700 }, { "epoch": 28.017079365079365, "grad_norm": 4.830111026763916, "learning_rate": 2.3476139553535627e-06, "loss": 0.4873, "step": 8725 }, { "epoch": 28.018666666666668, "grad_norm": 3.9254467487335205, "learning_rate": 2.3392594572918062e-06, "loss": 0.4994, "step": 8750 }, { "epoch": 29.001079365079367, "grad_norm": 6.090777397155762, "learning_rate": 2.33090495923005e-06, "loss": 0.4761, "step": 8775 }, { "epoch": 29.002666666666666, "grad_norm": 5.358640670776367, "learning_rate": 2.322550461168293e-06, "loss": 0.5431, "step": 8800 }, { "epoch": 29.00425396825397, "grad_norm": 9.447075843811035, "learning_rate": 2.3141959631065365e-06, "loss": 0.4746, "step": 8825 }, { "epoch": 29.00584126984127, "grad_norm": 5.390321731567383, "learning_rate": 2.30584146504478e-06, "loss": 0.5236, "step": 8850 }, { "epoch": 29.007428571428573, "grad_norm": 4.194957256317139, "learning_rate": 2.2974869669830236e-06, "loss": 0.4683, "step": 8875 }, { "epoch": 29.009015873015873, "grad_norm": 10.377429008483887, "learning_rate": 2.289132468921267e-06, "loss": 0.4122, "step": 8900 }, { "epoch": 29.010603174603176, "grad_norm": 4.972590923309326, "learning_rate": 2.2807779708595108e-06, "loss": 0.4557, "step": 8925 }, { "epoch": 29.012190476190476, "grad_norm": 4.772759437561035, "learning_rate": 2.2724234727977548e-06, "loss": 0.4732, "step": 8950 }, { "epoch": 29.01377777777778, "grad_norm": 19.494970321655273, "learning_rate": 2.2640689747359983e-06, "loss": 0.4653, "step": 8975 }, { "epoch": 29.01536507936508, "grad_norm": 6.1877593994140625, "learning_rate": 2.255714476674242e-06, "loss": 0.4901, "step": 9000 }, { "epoch": 29.016952380952382, "grad_norm": 5.228841781616211, "learning_rate": 2.2473599786124854e-06, "loss": 0.4841, "step": 9025 }, { "epoch": 29.018539682539682, "grad_norm": 5.32314395904541, "learning_rate": 2.2390054805507286e-06, "loss": 0.4902, "step": 9050 }, { "epoch": 30.00095238095238, "grad_norm": 6.952610015869141, "learning_rate": 2.230650982488972e-06, "loss": 0.4803, "step": 9075 }, { "epoch": 30.002539682539684, "grad_norm": 4.230266571044922, "learning_rate": 2.2222964844272157e-06, "loss": 0.5235, "step": 9100 }, { "epoch": 30.004126984126984, "grad_norm": 7.016523361206055, "learning_rate": 2.2139419863654593e-06, "loss": 0.4873, "step": 9125 }, { "epoch": 30.005714285714287, "grad_norm": 10.13500690460205, "learning_rate": 2.205587488303703e-06, "loss": 0.5262, "step": 9150 }, { "epoch": 30.007301587301587, "grad_norm": 7.627212047576904, "learning_rate": 2.1972329902419464e-06, "loss": 0.4619, "step": 9175 }, { "epoch": 30.00888888888889, "grad_norm": 7.077376365661621, "learning_rate": 2.18887849218019e-06, "loss": 0.4011, "step": 9200 }, { "epoch": 30.01047619047619, "grad_norm": 7.501957416534424, "learning_rate": 2.1805239941184336e-06, "loss": 0.455, "step": 9225 }, { "epoch": 30.012063492063493, "grad_norm": 6.617973327636719, "learning_rate": 2.172169496056677e-06, "loss": 0.4814, "step": 9250 }, { "epoch": 30.013650793650793, "grad_norm": 3.885499954223633, "learning_rate": 2.1638149979949207e-06, "loss": 0.4523, "step": 9275 }, { "epoch": 30.015238095238097, "grad_norm": 5.5597615242004395, "learning_rate": 2.1554604999331642e-06, "loss": 0.4768, "step": 9300 }, { "epoch": 30.016825396825396, "grad_norm": 9.792261123657227, "learning_rate": 2.147106001871408e-06, "loss": 0.49, "step": 9325 }, { "epoch": 30.0184126984127, "grad_norm": 5.992704391479492, "learning_rate": 2.1387515038096514e-06, "loss": 0.498, "step": 9350 }, { "epoch": 31.000825396825398, "grad_norm": 8.464439392089844, "learning_rate": 2.130397005747895e-06, "loss": 0.4518, "step": 9375 }, { "epoch": 31.002412698412698, "grad_norm": 3.486860990524292, "learning_rate": 2.1220425076861385e-06, "loss": 0.5318, "step": 9400 }, { "epoch": 31.004, "grad_norm": 4.426388740539551, "learning_rate": 2.113688009624382e-06, "loss": 0.4917, "step": 9425 }, { "epoch": 31.0055873015873, "grad_norm": 8.08337116241455, "learning_rate": 2.1053335115626256e-06, "loss": 0.5093, "step": 9450 }, { "epoch": 31.007174603174604, "grad_norm": 3.963824987411499, "learning_rate": 2.096979013500869e-06, "loss": 0.4515, "step": 9475 }, { "epoch": 31.008761904761904, "grad_norm": 7.304539203643799, "learning_rate": 2.0886245154391128e-06, "loss": 0.4196, "step": 9500 }, { "epoch": 31.010349206349208, "grad_norm": 4.731977939605713, "learning_rate": 2.080270017377356e-06, "loss": 0.4529, "step": 9525 }, { "epoch": 31.011936507936507, "grad_norm": 8.285253524780273, "learning_rate": 2.0719155193155995e-06, "loss": 0.4653, "step": 9550 }, { "epoch": 31.01352380952381, "grad_norm": 8.305194854736328, "learning_rate": 2.063561021253843e-06, "loss": 0.4624, "step": 9575 }, { "epoch": 31.01511111111111, "grad_norm": 13.913382530212402, "learning_rate": 2.0552065231920866e-06, "loss": 0.4676, "step": 9600 }, { "epoch": 31.016698412698414, "grad_norm": 6.448155403137207, "learning_rate": 2.04685202513033e-06, "loss": 0.476, "step": 9625 }, { "epoch": 31.018285714285714, "grad_norm": 7.706886291503906, "learning_rate": 2.0384975270685737e-06, "loss": 0.4967, "step": 9650 }, { "epoch": 32.00069841269841, "grad_norm": 4.588306427001953, "learning_rate": 2.0301430290068173e-06, "loss": 0.4358, "step": 9675 }, { "epoch": 32.00228571428571, "grad_norm": 4.243907451629639, "learning_rate": 2.021788530945061e-06, "loss": 0.5546, "step": 9700 }, { "epoch": 32.00387301587302, "grad_norm": 6.786617755889893, "learning_rate": 2.0134340328833044e-06, "loss": 0.4775, "step": 9725 }, { "epoch": 32.00546031746032, "grad_norm": 9.617806434631348, "learning_rate": 2.005079534821548e-06, "loss": 0.4857, "step": 9750 }, { "epoch": 32.00704761904762, "grad_norm": 4.088709354400635, "learning_rate": 1.9967250367597916e-06, "loss": 0.4872, "step": 9775 }, { "epoch": 32.00863492063492, "grad_norm": 7.801070690155029, "learning_rate": 1.988370538698035e-06, "loss": 0.3978, "step": 9800 }, { "epoch": 32.010222222222225, "grad_norm": 12.151320457458496, "learning_rate": 1.9800160406362787e-06, "loss": 0.4567, "step": 9825 }, { "epoch": 32.011809523809525, "grad_norm": 6.325204372406006, "learning_rate": 1.9716615425745223e-06, "loss": 0.4568, "step": 9850 }, { "epoch": 32.013396825396825, "grad_norm": 4.849487781524658, "learning_rate": 1.963307044512766e-06, "loss": 0.4612, "step": 9875 }, { "epoch": 32.014984126984125, "grad_norm": 6.0611090660095215, "learning_rate": 1.9549525464510094e-06, "loss": 0.4492, "step": 9900 }, { "epoch": 32.01657142857143, "grad_norm": 8.705154418945312, "learning_rate": 1.946598048389253e-06, "loss": 0.4823, "step": 9925 }, { "epoch": 32.01815873015873, "grad_norm": 6.838711261749268, "learning_rate": 1.9382435503274965e-06, "loss": 0.4981, "step": 9950 }, { "epoch": 33.000571428571426, "grad_norm": 6.763596057891846, "learning_rate": 1.92988905226574e-06, "loss": 0.4369, "step": 9975 }, { "epoch": 33.00215873015873, "grad_norm": 5.268820285797119, "learning_rate": 1.9215345542039837e-06, "loss": 0.5427, "step": 10000 }, { "epoch": 33.00374603174603, "grad_norm": 3.5916361808776855, "learning_rate": 1.913180056142227e-06, "loss": 0.4798, "step": 10025 }, { "epoch": 33.00533333333333, "grad_norm": 4.288261890411377, "learning_rate": 1.9048255580804706e-06, "loss": 0.488, "step": 10050 }, { "epoch": 33.00692063492063, "grad_norm": 5.668625354766846, "learning_rate": 1.8964710600187142e-06, "loss": 0.5008, "step": 10075 }, { "epoch": 33.00850793650794, "grad_norm": 4.973710536956787, "learning_rate": 1.8881165619569577e-06, "loss": 0.391, "step": 10100 }, { "epoch": 33.01009523809524, "grad_norm": 7.073776721954346, "learning_rate": 1.8797620638952013e-06, "loss": 0.444, "step": 10125 }, { "epoch": 33.01168253968254, "grad_norm": 4.297868251800537, "learning_rate": 1.8714075658334449e-06, "loss": 0.4516, "step": 10150 }, { "epoch": 33.01326984126984, "grad_norm": 25.770071029663086, "learning_rate": 1.8630530677716884e-06, "loss": 0.4746, "step": 10175 }, { "epoch": 33.014857142857146, "grad_norm": 3.6950442790985107, "learning_rate": 1.854698569709932e-06, "loss": 0.4346, "step": 10200 }, { "epoch": 33.016444444444446, "grad_norm": 8.81914234161377, "learning_rate": 1.8463440716481756e-06, "loss": 0.4794, "step": 10225 }, { "epoch": 33.018031746031745, "grad_norm": 6.376156806945801, "learning_rate": 1.837989573586419e-06, "loss": 0.5085, "step": 10250 }, { "epoch": 34.00044444444445, "grad_norm": 4.302385330200195, "learning_rate": 1.8296350755246625e-06, "loss": 0.4276, "step": 10275 }, { "epoch": 34.00203174603175, "grad_norm": 8.226256370544434, "learning_rate": 1.821280577462906e-06, "loss": 0.5405, "step": 10300 }, { "epoch": 34.00361904761905, "grad_norm": 12.093836784362793, "learning_rate": 1.8129260794011496e-06, "loss": 0.4661, "step": 10325 }, { "epoch": 34.00520634920635, "grad_norm": 11.840346336364746, "learning_rate": 1.8045715813393932e-06, "loss": 0.4779, "step": 10350 }, { "epoch": 34.006793650793654, "grad_norm": 5.186977386474609, "learning_rate": 1.7962170832776367e-06, "loss": 0.5152, "step": 10375 }, { "epoch": 34.00838095238095, "grad_norm": 4.393645286560059, "learning_rate": 1.7878625852158805e-06, "loss": 0.3996, "step": 10400 }, { "epoch": 34.00996825396825, "grad_norm": 11.108858108520508, "learning_rate": 1.779508087154124e-06, "loss": 0.4351, "step": 10425 }, { "epoch": 34.01155555555555, "grad_norm": 6.400074005126953, "learning_rate": 1.7711535890923676e-06, "loss": 0.4461, "step": 10450 }, { "epoch": 34.01314285714286, "grad_norm": 11.155898094177246, "learning_rate": 1.7627990910306112e-06, "loss": 0.4503, "step": 10475 }, { "epoch": 34.01473015873016, "grad_norm": 5.40648889541626, "learning_rate": 1.7544445929688544e-06, "loss": 0.4552, "step": 10500 }, { "epoch": 34.01631746031746, "grad_norm": 4.550622463226318, "learning_rate": 1.746090094907098e-06, "loss": 0.4701, "step": 10525 }, { "epoch": 34.01790476190476, "grad_norm": 7.3100433349609375, "learning_rate": 1.7377355968453417e-06, "loss": 0.4936, "step": 10550 }, { "epoch": 35.00031746031746, "grad_norm": 6.450623035430908, "learning_rate": 1.7293810987835853e-06, "loss": 0.449, "step": 10575 }, { "epoch": 35.00190476190476, "grad_norm": 9.00794792175293, "learning_rate": 1.7210266007218288e-06, "loss": 0.5011, "step": 10600 }, { "epoch": 35.00349206349206, "grad_norm": 10.713994026184082, "learning_rate": 1.7126721026600724e-06, "loss": 0.4829, "step": 10625 }, { "epoch": 35.00507936507937, "grad_norm": 4.488622188568115, "learning_rate": 1.704317604598316e-06, "loss": 0.4795, "step": 10650 }, { "epoch": 35.00666666666667, "grad_norm": 16.104774475097656, "learning_rate": 1.6959631065365595e-06, "loss": 0.5035, "step": 10675 }, { "epoch": 35.00825396825397, "grad_norm": 4.884101390838623, "learning_rate": 1.687608608474803e-06, "loss": 0.4168, "step": 10700 }, { "epoch": 35.00984126984127, "grad_norm": 5.478789806365967, "learning_rate": 1.6792541104130467e-06, "loss": 0.4258, "step": 10725 }, { "epoch": 35.011428571428574, "grad_norm": 6.428930282592773, "learning_rate": 1.67089961235129e-06, "loss": 0.4392, "step": 10750 }, { "epoch": 35.013015873015874, "grad_norm": 2.7530977725982666, "learning_rate": 1.6625451142895336e-06, "loss": 0.451, "step": 10775 }, { "epoch": 35.014603174603174, "grad_norm": 5.9829912185668945, "learning_rate": 1.6541906162277771e-06, "loss": 0.4586, "step": 10800 }, { "epoch": 35.016190476190474, "grad_norm": 6.039813995361328, "learning_rate": 1.6458361181660207e-06, "loss": 0.4667, "step": 10825 }, { "epoch": 35.01777777777778, "grad_norm": 6.336811065673828, "learning_rate": 1.6374816201042643e-06, "loss": 0.482, "step": 10850 }, { "epoch": 36.000190476190475, "grad_norm": 6.172911643981934, "learning_rate": 1.6291271220425078e-06, "loss": 0.4651, "step": 10875 }, { "epoch": 36.001777777777775, "grad_norm": 4.215289115905762, "learning_rate": 1.6207726239807514e-06, "loss": 0.4776, "step": 10900 }, { "epoch": 36.00336507936508, "grad_norm": 2.862426519393921, "learning_rate": 1.612418125918995e-06, "loss": 0.5059, "step": 10925 }, { "epoch": 36.00495238095238, "grad_norm": 4.817645072937012, "learning_rate": 1.6040636278572385e-06, "loss": 0.4777, "step": 10950 }, { "epoch": 36.00653968253968, "grad_norm": 8.073090553283691, "learning_rate": 1.595709129795482e-06, "loss": 0.4837, "step": 10975 }, { "epoch": 36.00812698412698, "grad_norm": 6.108732223510742, "learning_rate": 1.5873546317337255e-06, "loss": 0.4149, "step": 11000 }, { "epoch": 36.00971428571429, "grad_norm": 4.37070369720459, "learning_rate": 1.579000133671969e-06, "loss": 0.4323, "step": 11025 }, { "epoch": 36.01130158730159, "grad_norm": 3.8069772720336914, "learning_rate": 1.5706456356102126e-06, "loss": 0.4488, "step": 11050 }, { "epoch": 36.01288888888889, "grad_norm": 4.619093894958496, "learning_rate": 1.5622911375484562e-06, "loss": 0.4551, "step": 11075 }, { "epoch": 36.01447619047619, "grad_norm": 5.150590419769287, "learning_rate": 1.5539366394866997e-06, "loss": 0.4401, "step": 11100 }, { "epoch": 36.016063492063495, "grad_norm": 6.264833450317383, "learning_rate": 1.5455821414249433e-06, "loss": 0.4642, "step": 11125 }, { "epoch": 36.017650793650795, "grad_norm": 7.851002216339111, "learning_rate": 1.5372276433631869e-06, "loss": 0.4791, "step": 11150 }, { "epoch": 37.00006349206349, "grad_norm": 7.941956996917725, "learning_rate": 1.5288731453014304e-06, "loss": 0.4488, "step": 11175 }, { "epoch": 37.001650793650796, "grad_norm": 6.259990692138672, "learning_rate": 1.520518647239674e-06, "loss": 0.4967, "step": 11200 }, { "epoch": 37.003238095238096, "grad_norm": 2.6891424655914307, "learning_rate": 1.5121641491779175e-06, "loss": 0.501, "step": 11225 }, { "epoch": 37.004825396825396, "grad_norm": 7.098905563354492, "learning_rate": 1.503809651116161e-06, "loss": 0.4755, "step": 11250 }, { "epoch": 37.006412698412696, "grad_norm": 5.689685344696045, "learning_rate": 1.4954551530544045e-06, "loss": 0.4868, "step": 11275 }, { "epoch": 37.008, "grad_norm": 6.782131195068359, "learning_rate": 1.487100654992648e-06, "loss": 0.4057, "step": 11300 }, { "epoch": 37.0095873015873, "grad_norm": 7.18269157409668, "learning_rate": 1.4787461569308916e-06, "loss": 0.4281, "step": 11325 }, { "epoch": 37.0111746031746, "grad_norm": 6.619096755981445, "learning_rate": 1.4703916588691352e-06, "loss": 0.4398, "step": 11350 }, { "epoch": 37.0127619047619, "grad_norm": 4.206869602203369, "learning_rate": 1.462037160807379e-06, "loss": 0.4394, "step": 11375 }, { "epoch": 37.01434920634921, "grad_norm": 7.179015636444092, "learning_rate": 1.4536826627456225e-06, "loss": 0.4515, "step": 11400 }, { "epoch": 37.01593650793651, "grad_norm": 5.137106895446777, "learning_rate": 1.445328164683866e-06, "loss": 0.4704, "step": 11425 }, { "epoch": 37.01752380952381, "grad_norm": 4.871583461761475, "learning_rate": 1.4369736666221096e-06, "loss": 0.4704, "step": 11450 }, { "epoch": 37.01911111111111, "grad_norm": 3.7863287925720215, "learning_rate": 1.4286191685603528e-06, "loss": 0.4518, "step": 11475 }, { "epoch": 38.00152380952381, "grad_norm": 11.65233039855957, "learning_rate": 1.4202646704985963e-06, "loss": 0.5093, "step": 11500 }, { "epoch": 38.00311111111111, "grad_norm": 9.356032371520996, "learning_rate": 1.4119101724368401e-06, "loss": 0.4997, "step": 11525 }, { "epoch": 38.00469841269841, "grad_norm": 5.056600093841553, "learning_rate": 1.4035556743750837e-06, "loss": 0.4455, "step": 11550 }, { "epoch": 38.00628571428572, "grad_norm": 4.728214740753174, "learning_rate": 1.3952011763133273e-06, "loss": 0.4935, "step": 11575 }, { "epoch": 38.00787301587302, "grad_norm": 6.161364555358887, "learning_rate": 1.3868466782515708e-06, "loss": 0.4153, "step": 11600 }, { "epoch": 38.00946031746032, "grad_norm": 10.345149993896484, "learning_rate": 1.3784921801898144e-06, "loss": 0.4206, "step": 11625 }, { "epoch": 38.011047619047616, "grad_norm": 3.4351565837860107, "learning_rate": 1.370137682128058e-06, "loss": 0.434, "step": 11650 }, { "epoch": 38.01263492063492, "grad_norm": 6.009977340698242, "learning_rate": 1.3617831840663015e-06, "loss": 0.448, "step": 11675 }, { "epoch": 38.01422222222222, "grad_norm": 5.062911510467529, "learning_rate": 1.353428686004545e-06, "loss": 0.4399, "step": 11700 }, { "epoch": 38.01580952380952, "grad_norm": 5.664973735809326, "learning_rate": 1.3450741879427884e-06, "loss": 0.4674, "step": 11725 }, { "epoch": 38.01739682539682, "grad_norm": 7.776061534881592, "learning_rate": 1.336719689881032e-06, "loss": 0.481, "step": 11750 }, { "epoch": 38.01898412698413, "grad_norm": 4.977148532867432, "learning_rate": 1.3283651918192756e-06, "loss": 0.446, "step": 11775 }, { "epoch": 39.001396825396824, "grad_norm": 5.45735502243042, "learning_rate": 1.3200106937575191e-06, "loss": 0.4955, "step": 11800 }, { "epoch": 39.002984126984124, "grad_norm": 5.3033881187438965, "learning_rate": 1.3116561956957627e-06, "loss": 0.5065, "step": 11825 }, { "epoch": 39.00457142857143, "grad_norm": 5.209838390350342, "learning_rate": 1.3033016976340063e-06, "loss": 0.4283, "step": 11850 }, { "epoch": 39.00615873015873, "grad_norm": 9.413320541381836, "learning_rate": 1.2949471995722498e-06, "loss": 0.5134, "step": 11875 }, { "epoch": 39.00774603174603, "grad_norm": 3.614576816558838, "learning_rate": 1.2865927015104934e-06, "loss": 0.4132, "step": 11900 }, { "epoch": 39.00933333333333, "grad_norm": 5.418633937835693, "learning_rate": 1.278238203448737e-06, "loss": 0.411, "step": 11925 }, { "epoch": 39.01092063492064, "grad_norm": 7.182598114013672, "learning_rate": 1.2698837053869805e-06, "loss": 0.4367, "step": 11950 }, { "epoch": 39.01250793650794, "grad_norm": 5.821928024291992, "learning_rate": 1.2615292073252239e-06, "loss": 0.4313, "step": 11975 }, { "epoch": 39.01409523809524, "grad_norm": 5.386777877807617, "learning_rate": 1.2531747092634675e-06, "loss": 0.4423, "step": 12000 }, { "epoch": 39.01568253968254, "grad_norm": 7.081798553466797, "learning_rate": 1.244820211201711e-06, "loss": 0.4637, "step": 12025 }, { "epoch": 39.017269841269844, "grad_norm": 6.970532417297363, "learning_rate": 1.2364657131399546e-06, "loss": 0.4641, "step": 12050 }, { "epoch": 39.018857142857144, "grad_norm": 10.821274757385254, "learning_rate": 1.2281112150781982e-06, "loss": 0.4712, "step": 12075 }, { "epoch": 40.00126984126984, "grad_norm": 7.221808910369873, "learning_rate": 1.2197567170164417e-06, "loss": 0.4856, "step": 12100 }, { "epoch": 40.002857142857145, "grad_norm": 15.445313453674316, "learning_rate": 1.2114022189546853e-06, "loss": 0.5051, "step": 12125 }, { "epoch": 40.004444444444445, "grad_norm": 5.132606029510498, "learning_rate": 1.2030477208929288e-06, "loss": 0.4254, "step": 12150 }, { "epoch": 40.006031746031745, "grad_norm": 3.0163042545318604, "learning_rate": 1.1946932228311724e-06, "loss": 0.4991, "step": 12175 }, { "epoch": 40.007619047619045, "grad_norm": 4.396322250366211, "learning_rate": 1.186338724769416e-06, "loss": 0.4394, "step": 12200 }, { "epoch": 40.00920634920635, "grad_norm": 9.984151840209961, "learning_rate": 1.1779842267076595e-06, "loss": 0.4004, "step": 12225 }, { "epoch": 40.01079365079365, "grad_norm": 5.9347825050354, "learning_rate": 1.1696297286459031e-06, "loss": 0.4373, "step": 12250 }, { "epoch": 40.01238095238095, "grad_norm": 5.0575852394104, "learning_rate": 1.1612752305841465e-06, "loss": 0.4321, "step": 12275 }, { "epoch": 40.01396825396825, "grad_norm": 6.324869155883789, "learning_rate": 1.15292073252239e-06, "loss": 0.4353, "step": 12300 }, { "epoch": 40.01555555555556, "grad_norm": 6.5207414627075195, "learning_rate": 1.1445662344606336e-06, "loss": 0.4675, "step": 12325 }, { "epoch": 40.01714285714286, "grad_norm": 6.220884799957275, "learning_rate": 1.1362117363988774e-06, "loss": 0.4611, "step": 12350 }, { "epoch": 40.01873015873016, "grad_norm": 10.964550018310547, "learning_rate": 1.127857238337121e-06, "loss": 0.4673, "step": 12375 }, { "epoch": 41.00114285714286, "grad_norm": 6.545460224151611, "learning_rate": 1.1195027402753643e-06, "loss": 0.4593, "step": 12400 }, { "epoch": 41.00273015873016, "grad_norm": 6.692239761352539, "learning_rate": 1.1111482422136079e-06, "loss": 0.512, "step": 12425 }, { "epoch": 41.00431746031746, "grad_norm": 7.213928699493408, "learning_rate": 1.1027937441518514e-06, "loss": 0.4477, "step": 12450 }, { "epoch": 41.00590476190476, "grad_norm": 4.5662431716918945, "learning_rate": 1.094439246090095e-06, "loss": 0.4916, "step": 12475 }, { "epoch": 41.007492063492066, "grad_norm": 4.408071041107178, "learning_rate": 1.0860847480283386e-06, "loss": 0.4456, "step": 12500 }, { "epoch": 41.009079365079366, "grad_norm": 5.65850830078125, "learning_rate": 1.0777302499665821e-06, "loss": 0.3866, "step": 12525 }, { "epoch": 41.010666666666665, "grad_norm": 5.419500827789307, "learning_rate": 1.0693757519048257e-06, "loss": 0.4345, "step": 12550 }, { "epoch": 41.012253968253965, "grad_norm": 4.399853229522705, "learning_rate": 1.0610212538430693e-06, "loss": 0.4444, "step": 12575 }, { "epoch": 41.01384126984127, "grad_norm": 6.50507116317749, "learning_rate": 1.0526667557813128e-06, "loss": 0.4385, "step": 12600 }, { "epoch": 41.01542857142857, "grad_norm": 2.587691068649292, "learning_rate": 1.0443122577195564e-06, "loss": 0.4558, "step": 12625 }, { "epoch": 41.01701587301587, "grad_norm": 4.828845977783203, "learning_rate": 1.0359577596577997e-06, "loss": 0.4655, "step": 12650 }, { "epoch": 41.01860317460317, "grad_norm": 9.805785179138184, "learning_rate": 1.0276032615960433e-06, "loss": 0.4688, "step": 12675 }, { "epoch": 42.001015873015874, "grad_norm": 4.370798587799072, "learning_rate": 1.0192487635342869e-06, "loss": 0.4457, "step": 12700 }, { "epoch": 42.00260317460317, "grad_norm": 5.954914093017578, "learning_rate": 1.0108942654725304e-06, "loss": 0.5047, "step": 12725 }, { "epoch": 42.00419047619047, "grad_norm": 4.680336952209473, "learning_rate": 1.002539767410774e-06, "loss": 0.4536, "step": 12750 }, { "epoch": 42.00577777777778, "grad_norm": 3.612610101699829, "learning_rate": 9.941852693490176e-07, "loss": 0.5006, "step": 12775 }, { "epoch": 42.00736507936508, "grad_norm": 4.1838178634643555, "learning_rate": 9.858307712872611e-07, "loss": 0.4459, "step": 12800 }, { "epoch": 42.00895238095238, "grad_norm": 5.238033771514893, "learning_rate": 9.774762732255047e-07, "loss": 0.3692, "step": 12825 }, { "epoch": 42.01053968253968, "grad_norm": 5.992905139923096, "learning_rate": 9.691217751637483e-07, "loss": 0.4471, "step": 12850 }, { "epoch": 42.012126984126986, "grad_norm": 3.9099578857421875, "learning_rate": 9.607672771019918e-07, "loss": 0.4505, "step": 12875 }, { "epoch": 42.013714285714286, "grad_norm": 5.2203497886657715, "learning_rate": 9.524127790402353e-07, "loss": 0.4285, "step": 12900 }, { "epoch": 42.015301587301586, "grad_norm": 10.484596252441406, "learning_rate": 9.440582809784789e-07, "loss": 0.4644, "step": 12925 }, { "epoch": 42.016888888888886, "grad_norm": 9.001386642456055, "learning_rate": 9.357037829167224e-07, "loss": 0.4485, "step": 12950 }, { "epoch": 42.01847619047619, "grad_norm": 5.222927570343018, "learning_rate": 9.27349284854966e-07, "loss": 0.4722, "step": 12975 }, { "epoch": 43.00088888888889, "grad_norm": 5.298580169677734, "learning_rate": 9.189947867932095e-07, "loss": 0.4334, "step": 13000 }, { "epoch": 43.00247619047619, "grad_norm": 7.794508457183838, "learning_rate": 9.10640288731453e-07, "loss": 0.5104, "step": 13025 }, { "epoch": 43.004063492063494, "grad_norm": 7.346789360046387, "learning_rate": 9.022857906696966e-07, "loss": 0.4567, "step": 13050 }, { "epoch": 43.005650793650794, "grad_norm": 7.656489372253418, "learning_rate": 8.939312926079403e-07, "loss": 0.4923, "step": 13075 }, { "epoch": 43.007238095238094, "grad_norm": 4.487580299377441, "learning_rate": 8.855767945461838e-07, "loss": 0.4286, "step": 13100 }, { "epoch": 43.008825396825394, "grad_norm": 3.4565789699554443, "learning_rate": 8.772222964844272e-07, "loss": 0.3914, "step": 13125 }, { "epoch": 43.0104126984127, "grad_norm": 6.925444602966309, "learning_rate": 8.688677984226708e-07, "loss": 0.4391, "step": 13150 }, { "epoch": 43.012, "grad_norm": 5.899662494659424, "learning_rate": 8.605133003609144e-07, "loss": 0.4529, "step": 13175 }, { "epoch": 43.0135873015873, "grad_norm": 4.932972431182861, "learning_rate": 8.52158802299158e-07, "loss": 0.4289, "step": 13200 }, { "epoch": 43.0151746031746, "grad_norm": 6.803351402282715, "learning_rate": 8.438043042374015e-07, "loss": 0.4562, "step": 13225 }, { "epoch": 43.01676190476191, "grad_norm": 4.4438066482543945, "learning_rate": 8.35449806175645e-07, "loss": 0.4426, "step": 13250 }, { "epoch": 43.01834920634921, "grad_norm": 9.328689575195312, "learning_rate": 8.270953081138886e-07, "loss": 0.4814, "step": 13275 }, { "epoch": 44.0007619047619, "grad_norm": 10.716085433959961, "learning_rate": 8.187408100521321e-07, "loss": 0.4272, "step": 13300 }, { "epoch": 44.00234920634921, "grad_norm": 5.453390598297119, "learning_rate": 8.103863119903757e-07, "loss": 0.5105, "step": 13325 }, { "epoch": 44.00393650793651, "grad_norm": 8.206409454345703, "learning_rate": 8.020318139286193e-07, "loss": 0.4678, "step": 13350 }, { "epoch": 44.00552380952381, "grad_norm": 9.60810661315918, "learning_rate": 7.936773158668627e-07, "loss": 0.4715, "step": 13375 }, { "epoch": 44.00711111111111, "grad_norm": 4.23331880569458, "learning_rate": 7.853228178051063e-07, "loss": 0.4476, "step": 13400 }, { "epoch": 44.008698412698415, "grad_norm": 4.390725135803223, "learning_rate": 7.769683197433499e-07, "loss": 0.3836, "step": 13425 }, { "epoch": 44.010285714285715, "grad_norm": 8.254178047180176, "learning_rate": 7.686138216815934e-07, "loss": 0.441, "step": 13450 }, { "epoch": 44.011873015873014, "grad_norm": 9.834754943847656, "learning_rate": 7.60259323619837e-07, "loss": 0.4474, "step": 13475 }, { "epoch": 44.013460317460314, "grad_norm": 5.189121723175049, "learning_rate": 7.519048255580805e-07, "loss": 0.4259, "step": 13500 }, { "epoch": 44.01504761904762, "grad_norm": 8.684473037719727, "learning_rate": 7.43550327496324e-07, "loss": 0.4342, "step": 13525 }, { "epoch": 44.01663492063492, "grad_norm": 9.289813041687012, "learning_rate": 7.351958294345676e-07, "loss": 0.4599, "step": 13550 }, { "epoch": 44.01822222222222, "grad_norm": 4.835540294647217, "learning_rate": 7.268413313728113e-07, "loss": 0.4765, "step": 13575 }, { "epoch": 45.00063492063492, "grad_norm": 15.820073127746582, "learning_rate": 7.184868333110548e-07, "loss": 0.429, "step": 13600 }, { "epoch": 45.00222222222222, "grad_norm": 7.332337856292725, "learning_rate": 7.101323352492982e-07, "loss": 0.5291, "step": 13625 }, { "epoch": 45.00380952380952, "grad_norm": 9.775607109069824, "learning_rate": 7.017778371875418e-07, "loss": 0.4441, "step": 13650 }, { "epoch": 45.00539682539682, "grad_norm": 6.92523717880249, "learning_rate": 6.934233391257854e-07, "loss": 0.4656, "step": 13675 }, { "epoch": 45.00698412698413, "grad_norm": 4.137689590454102, "learning_rate": 6.85068841064029e-07, "loss": 0.4757, "step": 13700 }, { "epoch": 45.00857142857143, "grad_norm": 6.4185638427734375, "learning_rate": 6.767143430022725e-07, "loss": 0.3757, "step": 13725 }, { "epoch": 45.01015873015873, "grad_norm": 6.079351425170898, "learning_rate": 6.68359844940516e-07, "loss": 0.431, "step": 13750 }, { "epoch": 45.01174603174603, "grad_norm": 4.2088165283203125, "learning_rate": 6.600053468787596e-07, "loss": 0.4354, "step": 13775 }, { "epoch": 45.013333333333335, "grad_norm": 3.3986575603485107, "learning_rate": 6.516508488170031e-07, "loss": 0.4486, "step": 13800 }, { "epoch": 45.014920634920635, "grad_norm": 5.290012359619141, "learning_rate": 6.432963507552467e-07, "loss": 0.4227, "step": 13825 }, { "epoch": 45.016507936507935, "grad_norm": 8.828485488891602, "learning_rate": 6.349418526934903e-07, "loss": 0.4662, "step": 13850 }, { "epoch": 45.018095238095235, "grad_norm": 4.196980953216553, "learning_rate": 6.265873546317337e-07, "loss": 0.4792, "step": 13875 }, { "epoch": 46.00050793650794, "grad_norm": 8.14965534210205, "learning_rate": 6.182328565699773e-07, "loss": 0.4178, "step": 13900 }, { "epoch": 46.00209523809524, "grad_norm": 4.221451282501221, "learning_rate": 6.098783585082209e-07, "loss": 0.5127, "step": 13925 }, { "epoch": 46.003682539682536, "grad_norm": 7.106605052947998, "learning_rate": 6.015238604464644e-07, "loss": 0.4599, "step": 13950 }, { "epoch": 46.00526984126984, "grad_norm": 11.949932098388672, "learning_rate": 5.93169362384708e-07, "loss": 0.4757, "step": 13975 }, { "epoch": 46.00685714285714, "grad_norm": 7.468497276306152, "learning_rate": 5.848148643229516e-07, "loss": 0.4786, "step": 14000 }, { "epoch": 46.00844444444444, "grad_norm": 9.329829216003418, "learning_rate": 5.76460366261195e-07, "loss": 0.3776, "step": 14025 }, { "epoch": 46.01003174603174, "grad_norm": 5.68237829208374, "learning_rate": 5.681058681994387e-07, "loss": 0.419, "step": 14050 }, { "epoch": 46.01161904761905, "grad_norm": 9.558659553527832, "learning_rate": 5.597513701376821e-07, "loss": 0.4381, "step": 14075 }, { "epoch": 46.01320634920635, "grad_norm": 4.898390769958496, "learning_rate": 5.513968720759257e-07, "loss": 0.4285, "step": 14100 }, { "epoch": 46.01479365079365, "grad_norm": 6.284289360046387, "learning_rate": 5.430423740141693e-07, "loss": 0.4426, "step": 14125 }, { "epoch": 46.01638095238095, "grad_norm": 7.268927574157715, "learning_rate": 5.346878759524128e-07, "loss": 0.4454, "step": 14150 }, { "epoch": 46.017968253968256, "grad_norm": 8.74441146850586, "learning_rate": 5.263333778906564e-07, "loss": 0.4862, "step": 14175 }, { "epoch": 47.00038095238095, "grad_norm": 5.302139759063721, "learning_rate": 5.179788798288999e-07, "loss": 0.4241, "step": 14200 }, { "epoch": 47.00196825396825, "grad_norm": 9.133415222167969, "learning_rate": 5.096243817671434e-07, "loss": 0.4938, "step": 14225 }, { "epoch": 47.00355555555556, "grad_norm": 11.927820205688477, "learning_rate": 5.01269883705387e-07, "loss": 0.4569, "step": 14250 }, { "epoch": 47.00514285714286, "grad_norm": 6.953431606292725, "learning_rate": 4.929153856436306e-07, "loss": 0.4669, "step": 14275 }, { "epoch": 47.00673015873016, "grad_norm": 15.06204891204834, "learning_rate": 4.845608875818741e-07, "loss": 0.4956, "step": 14300 }, { "epoch": 47.00831746031746, "grad_norm": 6.762052059173584, "learning_rate": 4.7620638952011765e-07, "loss": 0.3959, "step": 14325 }, { "epoch": 47.009904761904764, "grad_norm": 5.515415668487549, "learning_rate": 4.678518914583612e-07, "loss": 0.4077, "step": 14350 }, { "epoch": 47.011492063492064, "grad_norm": 5.323752403259277, "learning_rate": 4.594973933966047e-07, "loss": 0.4282, "step": 14375 }, { "epoch": 47.01307936507936, "grad_norm": 3.4081521034240723, "learning_rate": 4.511428953348483e-07, "loss": 0.4318, "step": 14400 }, { "epoch": 47.01466666666666, "grad_norm": 10.325825691223145, "learning_rate": 4.427883972730919e-07, "loss": 0.443, "step": 14425 }, { "epoch": 47.01625396825397, "grad_norm": 7.097803592681885, "learning_rate": 4.344338992113354e-07, "loss": 0.4469, "step": 14450 }, { "epoch": 47.01784126984127, "grad_norm": 8.34315013885498, "learning_rate": 4.26079401149579e-07, "loss": 0.48, "step": 14475 }, { "epoch": 48.000253968253965, "grad_norm": 5.317016124725342, "learning_rate": 4.177249030878225e-07, "loss": 0.4391, "step": 14500 }, { "epoch": 48.00184126984127, "grad_norm": 6.396937370300293, "learning_rate": 4.0937040502606607e-07, "loss": 0.4688, "step": 14525 }, { "epoch": 48.00342857142857, "grad_norm": 3.8121814727783203, "learning_rate": 4.0101590696430963e-07, "loss": 0.4771, "step": 14550 }, { "epoch": 48.00501587301587, "grad_norm": 7.068954944610596, "learning_rate": 3.9266140890255315e-07, "loss": 0.4657, "step": 14575 }, { "epoch": 48.00660317460317, "grad_norm": 3.8409268856048584, "learning_rate": 3.843069108407967e-07, "loss": 0.4651, "step": 14600 }, { "epoch": 48.00819047619048, "grad_norm": 4.6952714920043945, "learning_rate": 3.759524127790402e-07, "loss": 0.4065, "step": 14625 }, { "epoch": 48.00977777777778, "grad_norm": 5.573005199432373, "learning_rate": 3.675979147172838e-07, "loss": 0.411, "step": 14650 }, { "epoch": 48.01136507936508, "grad_norm": 7.740847587585449, "learning_rate": 3.592434166555274e-07, "loss": 0.4348, "step": 14675 }, { "epoch": 48.01295238095238, "grad_norm": 3.9830234050750732, "learning_rate": 3.508889185937709e-07, "loss": 0.437, "step": 14700 }, { "epoch": 48.014539682539684, "grad_norm": 4.826086521148682, "learning_rate": 3.425344205320145e-07, "loss": 0.4347, "step": 14725 }, { "epoch": 48.016126984126984, "grad_norm": 7.815319538116455, "learning_rate": 3.34179922470258e-07, "loss": 0.4485, "step": 14750 }, { "epoch": 48.017714285714284, "grad_norm": 7.226869583129883, "learning_rate": 3.2582542440850157e-07, "loss": 0.4703, "step": 14775 }, { "epoch": 49.000126984126986, "grad_norm": 6.489394187927246, "learning_rate": 3.1747092634674513e-07, "loss": 0.4377, "step": 14800 }, { "epoch": 49.001714285714286, "grad_norm": 7.8261399269104, "learning_rate": 3.0911642828498865e-07, "loss": 0.4772, "step": 14825 }, { "epoch": 49.003301587301586, "grad_norm": 5.6710004806518555, "learning_rate": 3.007619302232322e-07, "loss": 0.4895, "step": 14850 }, { "epoch": 49.004888888888885, "grad_norm": 5.4189252853393555, "learning_rate": 2.924074321614758e-07, "loss": 0.4575, "step": 14875 }, { "epoch": 49.00647619047619, "grad_norm": 3.988452434539795, "learning_rate": 2.8405293409971934e-07, "loss": 0.4701, "step": 14900 }, { "epoch": 49.00806349206349, "grad_norm": 6.010385990142822, "learning_rate": 2.7569843603796286e-07, "loss": 0.4005, "step": 14925 }, { "epoch": 49.00965079365079, "grad_norm": 12.048097610473633, "learning_rate": 2.673439379762064e-07, "loss": 0.4209, "step": 14950 }, { "epoch": 49.01123809523809, "grad_norm": 5.6577839851379395, "learning_rate": 2.5898943991444994e-07, "loss": 0.4263, "step": 14975 }, { "epoch": 49.0128253968254, "grad_norm": 9.367551803588867, "learning_rate": 2.506349418526935e-07, "loss": 0.4412, "step": 15000 }, { "epoch": 49.0144126984127, "grad_norm": 6.728636741638184, "learning_rate": 2.4228044379093707e-07, "loss": 0.4234, "step": 15025 }, { "epoch": 49.016, "grad_norm": 6.2957234382629395, "learning_rate": 2.339259457291806e-07, "loss": 0.4565, "step": 15050 }, { "epoch": 49.0175873015873, "grad_norm": 10.448657989501953, "learning_rate": 2.2557144766742415e-07, "loss": 0.4544, "step": 15075 }, { "epoch": 49.019174603174605, "grad_norm": 6.694546699523926, "learning_rate": 2.172169496056677e-07, "loss": 0.4277, "step": 15100 }, { "epoch": 50.0015873015873, "grad_norm": 5.7494096755981445, "learning_rate": 2.0886245154391125e-07, "loss": 0.4951, "step": 15125 }, { "epoch": 50.00317460317461, "grad_norm": 6.14343786239624, "learning_rate": 2.0050795348215482e-07, "loss": 0.4884, "step": 15150 }, { "epoch": 50.00476190476191, "grad_norm": 5.558969974517822, "learning_rate": 1.9215345542039836e-07, "loss": 0.4453, "step": 15175 }, { "epoch": 50.006349206349206, "grad_norm": 12.143143653869629, "learning_rate": 1.837989573586419e-07, "loss": 0.4809, "step": 15200 }, { "epoch": 50.007936507936506, "grad_norm": 6.324706554412842, "learning_rate": 1.7544445929688546e-07, "loss": 0.4006, "step": 15225 }, { "epoch": 50.00952380952381, "grad_norm": 5.624573230743408, "learning_rate": 1.67089961235129e-07, "loss": 0.4116, "step": 15250 }, { "epoch": 50.01111111111111, "grad_norm": 6.506253242492676, "learning_rate": 1.5873546317337257e-07, "loss": 0.4286, "step": 15275 }, { "epoch": 50.01269841269841, "grad_norm": 6.208716869354248, "learning_rate": 1.503809651116161e-07, "loss": 0.4255, "step": 15300 }, { "epoch": 50.01428571428571, "grad_norm": 3.370025634765625, "learning_rate": 1.4202646704985967e-07, "loss": 0.4314, "step": 15325 }, { "epoch": 50.01587301587302, "grad_norm": 10.119969367980957, "learning_rate": 1.336719689881032e-07, "loss": 0.4615, "step": 15350 }, { "epoch": 50.01746031746032, "grad_norm": 5.545236110687256, "learning_rate": 1.2531747092634675e-07, "loss": 0.4646, "step": 15375 }, { "epoch": 50.01904761904762, "grad_norm": 4.946952819824219, "learning_rate": 1.169629728645903e-07, "loss": 0.4344, "step": 15400 }, { "epoch": 51.00146031746032, "grad_norm": 5.004914283752441, "learning_rate": 1.0860847480283386e-07, "loss": 0.4824, "step": 15425 }, { "epoch": 51.00304761904762, "grad_norm": 6.189335823059082, "learning_rate": 1.0025397674107741e-07, "loss": 0.4991, "step": 15450 }, { "epoch": 51.00463492063492, "grad_norm": 7.598541259765625, "learning_rate": 9.189947867932095e-08, "loss": 0.4302, "step": 15475 }, { "epoch": 51.00622222222222, "grad_norm": 5.034823417663574, "learning_rate": 8.35449806175645e-08, "loss": 0.4842, "step": 15500 }, { "epoch": 51.00780952380953, "grad_norm": 10.85693359375, "learning_rate": 7.519048255580805e-08, "loss": 0.4062, "step": 15525 }, { "epoch": 51.00939682539683, "grad_norm": 5.432575702667236, "learning_rate": 6.68359844940516e-08, "loss": 0.4042, "step": 15550 }, { "epoch": 51.01098412698413, "grad_norm": 6.148897171020508, "learning_rate": 5.848148643229515e-08, "loss": 0.4301, "step": 15575 }, { "epoch": 51.01257142857143, "grad_norm": 11.638060569763184, "learning_rate": 5.0126988370538704e-08, "loss": 0.4318, "step": 15600 } ], "logging_steps": 25, "max_steps": 15750, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.17323979440128e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }