{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2879, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003473428273706148, "grad_norm": 37.23415549580066, "learning_rate": 1.1494252873563219e-07, "loss": 0.1191, "step": 1 }, { "epoch": 0.0006946856547412296, "grad_norm": 43.976862685963305, "learning_rate": 2.2988505747126437e-07, "loss": 0.1435, "step": 2 }, { "epoch": 0.0010420284821118443, "grad_norm": 32.512151916312156, "learning_rate": 3.4482758620689656e-07, "loss": 0.1155, "step": 3 }, { "epoch": 0.0013893713094824591, "grad_norm": 48.913784050446566, "learning_rate": 4.5977011494252875e-07, "loss": 0.1683, "step": 4 }, { "epoch": 0.0017367141368530739, "grad_norm": 36.577121907693055, "learning_rate": 5.747126436781609e-07, "loss": 0.1352, "step": 5 }, { "epoch": 0.0020840569642236887, "grad_norm": 26.561764879370827, "learning_rate": 6.896551724137931e-07, "loss": 0.0911, "step": 6 }, { "epoch": 0.0024313997915943034, "grad_norm": 26.311858988645387, "learning_rate": 8.045977011494253e-07, "loss": 0.095, "step": 7 }, { "epoch": 0.0027787426189649182, "grad_norm": 4.621058125611507, "learning_rate": 9.195402298850575e-07, "loss": 0.0487, "step": 8 }, { "epoch": 0.003126085446335533, "grad_norm": 5.328829287414011, "learning_rate": 1.0344827586206898e-06, "loss": 0.0529, "step": 9 }, { "epoch": 0.0034734282737061478, "grad_norm": 3.148318981902749, "learning_rate": 1.1494252873563219e-06, "loss": 0.0402, "step": 10 }, { "epoch": 0.0038207711010767626, "grad_norm": 3.908627374831155, "learning_rate": 1.2643678160919542e-06, "loss": 0.0407, "step": 11 }, { "epoch": 0.004168113928447377, "grad_norm": 4.739284231854802, "learning_rate": 1.3793103448275862e-06, "loss": 0.0446, "step": 12 }, { "epoch": 0.0045154567558179926, "grad_norm": 2.3496827004932945, "learning_rate": 1.4942528735632185e-06, "loss": 0.0384, "step": 13 }, { "epoch": 0.004862799583188607, "grad_norm": 5.475271188171554, "learning_rate": 1.6091954022988506e-06, "loss": 0.0485, "step": 14 }, { "epoch": 0.005210142410559222, "grad_norm": 6.547877477275296, "learning_rate": 1.724137931034483e-06, "loss": 0.0568, "step": 15 }, { "epoch": 0.0055574852379298365, "grad_norm": 3.8463210912421704, "learning_rate": 1.839080459770115e-06, "loss": 0.0449, "step": 16 }, { "epoch": 0.005904828065300452, "grad_norm": 5.599877467384883, "learning_rate": 1.9540229885057475e-06, "loss": 0.0491, "step": 17 }, { "epoch": 0.006252170892671066, "grad_norm": 4.194689420093038, "learning_rate": 2.0689655172413796e-06, "loss": 0.0473, "step": 18 }, { "epoch": 0.006599513720041681, "grad_norm": 1.8031692743490404, "learning_rate": 2.1839080459770117e-06, "loss": 0.042, "step": 19 }, { "epoch": 0.0069468565474122956, "grad_norm": 4.4715637058132, "learning_rate": 2.2988505747126437e-06, "loss": 0.0404, "step": 20 }, { "epoch": 0.007294199374782911, "grad_norm": 0.8236040836386482, "learning_rate": 2.4137931034482762e-06, "loss": 0.0382, "step": 21 }, { "epoch": 0.007641542202153525, "grad_norm": 0.82338294259807, "learning_rate": 2.5287356321839083e-06, "loss": 0.0422, "step": 22 }, { "epoch": 0.00798888502952414, "grad_norm": 1.8027969557144028, "learning_rate": 2.6436781609195404e-06, "loss": 0.0363, "step": 23 }, { "epoch": 0.008336227856894755, "grad_norm": 0.727095842359559, "learning_rate": 2.7586206896551725e-06, "loss": 0.0351, "step": 24 }, { "epoch": 0.00868357068426537, "grad_norm": 1.92696781932363, "learning_rate": 2.8735632183908046e-06, "loss": 0.0301, "step": 25 }, { "epoch": 0.009030913511635985, "grad_norm": 0.9300122101440347, "learning_rate": 2.988505747126437e-06, "loss": 0.0326, "step": 26 }, { "epoch": 0.0093782563390066, "grad_norm": 5.1665169890422495, "learning_rate": 3.103448275862069e-06, "loss": 0.0378, "step": 27 }, { "epoch": 0.009725599166377214, "grad_norm": 5.236812002541303, "learning_rate": 3.2183908045977012e-06, "loss": 0.0408, "step": 28 }, { "epoch": 0.010072941993747829, "grad_norm": 2.6321912137329666, "learning_rate": 3.3333333333333333e-06, "loss": 0.0315, "step": 29 }, { "epoch": 0.010420284821118444, "grad_norm": 1.7260726850635042, "learning_rate": 3.448275862068966e-06, "loss": 0.03, "step": 30 }, { "epoch": 0.01076762764848906, "grad_norm": 2.359583277352366, "learning_rate": 3.563218390804598e-06, "loss": 0.0293, "step": 31 }, { "epoch": 0.011114970475859673, "grad_norm": 3.4439336912247382, "learning_rate": 3.67816091954023e-06, "loss": 0.0391, "step": 32 }, { "epoch": 0.011462313303230288, "grad_norm": 2.5718166529004374, "learning_rate": 3.793103448275862e-06, "loss": 0.0405, "step": 33 }, { "epoch": 0.011809656130600903, "grad_norm": 1.8743075423305569, "learning_rate": 3.908045977011495e-06, "loss": 0.0316, "step": 34 }, { "epoch": 0.012156998957971519, "grad_norm": 0.9641265575376259, "learning_rate": 4.022988505747127e-06, "loss": 0.0282, "step": 35 }, { "epoch": 0.012504341785342132, "grad_norm": 4.625856081652024, "learning_rate": 4.137931034482759e-06, "loss": 0.0336, "step": 36 }, { "epoch": 0.012851684612712747, "grad_norm": 4.956158639154869, "learning_rate": 4.252873563218391e-06, "loss": 0.0535, "step": 37 }, { "epoch": 0.013199027440083362, "grad_norm": 3.8237646578914033, "learning_rate": 4.367816091954023e-06, "loss": 0.0321, "step": 38 }, { "epoch": 0.013546370267453978, "grad_norm": 1.6947960633729644, "learning_rate": 4.482758620689656e-06, "loss": 0.0348, "step": 39 }, { "epoch": 0.013893713094824591, "grad_norm": 0.7680647499923681, "learning_rate": 4.5977011494252875e-06, "loss": 0.0318, "step": 40 }, { "epoch": 0.014241055922195206, "grad_norm": 2.1664725717573545, "learning_rate": 4.71264367816092e-06, "loss": 0.0314, "step": 41 }, { "epoch": 0.014588398749565822, "grad_norm": 4.263206749577008, "learning_rate": 4.8275862068965525e-06, "loss": 0.0407, "step": 42 }, { "epoch": 0.014935741576936437, "grad_norm": 4.1724594615524335, "learning_rate": 4.942528735632184e-06, "loss": 0.0424, "step": 43 }, { "epoch": 0.01528308440430705, "grad_norm": 3.7167659760688303, "learning_rate": 5.057471264367817e-06, "loss": 0.0345, "step": 44 }, { "epoch": 0.015630427231677665, "grad_norm": 2.463474287564055, "learning_rate": 5.172413793103449e-06, "loss": 0.0349, "step": 45 }, { "epoch": 0.01597777005904828, "grad_norm": 0.9621922490266032, "learning_rate": 5.287356321839081e-06, "loss": 0.0308, "step": 46 }, { "epoch": 0.016325112886418896, "grad_norm": 1.1678035904473527, "learning_rate": 5.402298850574713e-06, "loss": 0.0255, "step": 47 }, { "epoch": 0.01667245571378951, "grad_norm": 2.1879662040379664, "learning_rate": 5.517241379310345e-06, "loss": 0.0316, "step": 48 }, { "epoch": 0.017019798541160126, "grad_norm": 3.5945900982319015, "learning_rate": 5.6321839080459775e-06, "loss": 0.042, "step": 49 }, { "epoch": 0.01736714136853074, "grad_norm": 1.5756413045669015, "learning_rate": 5.747126436781609e-06, "loss": 0.0255, "step": 50 }, { "epoch": 0.017714484195901353, "grad_norm": 1.2748577272738042, "learning_rate": 5.862068965517242e-06, "loss": 0.03, "step": 51 }, { "epoch": 0.01806182702327197, "grad_norm": 3.009209224361777, "learning_rate": 5.977011494252874e-06, "loss": 0.0341, "step": 52 }, { "epoch": 0.018409169850642584, "grad_norm": 3.489277616266429, "learning_rate": 6.091954022988507e-06, "loss": 0.0317, "step": 53 }, { "epoch": 0.0187565126780132, "grad_norm": 3.962377854439755, "learning_rate": 6.206896551724138e-06, "loss": 0.0383, "step": 54 }, { "epoch": 0.019103855505383814, "grad_norm": 2.162400758859317, "learning_rate": 6.321839080459771e-06, "loss": 0.0327, "step": 55 }, { "epoch": 0.019451198332754428, "grad_norm": 0.9492153536245894, "learning_rate": 6.4367816091954025e-06, "loss": 0.0266, "step": 56 }, { "epoch": 0.019798541160125045, "grad_norm": 1.0187860208671786, "learning_rate": 6.551724137931035e-06, "loss": 0.0354, "step": 57 }, { "epoch": 0.020145883987495658, "grad_norm": 1.1083211600458822, "learning_rate": 6.666666666666667e-06, "loss": 0.0307, "step": 58 }, { "epoch": 0.02049322681486627, "grad_norm": 3.937460604921549, "learning_rate": 6.781609195402299e-06, "loss": 0.0359, "step": 59 }, { "epoch": 0.02084056964223689, "grad_norm": 0.6965391618701994, "learning_rate": 6.896551724137932e-06, "loss": 0.0296, "step": 60 }, { "epoch": 0.021187912469607502, "grad_norm": 1.7430268237608375, "learning_rate": 7.011494252873564e-06, "loss": 0.033, "step": 61 }, { "epoch": 0.02153525529697812, "grad_norm": 1.6253422710766354, "learning_rate": 7.126436781609196e-06, "loss": 0.026, "step": 62 }, { "epoch": 0.021882598124348732, "grad_norm": 3.037222634408878, "learning_rate": 7.241379310344828e-06, "loss": 0.0349, "step": 63 }, { "epoch": 0.022229940951719346, "grad_norm": 2.145372106566845, "learning_rate": 7.35632183908046e-06, "loss": 0.0294, "step": 64 }, { "epoch": 0.022577283779089963, "grad_norm": 2.6342550049280384, "learning_rate": 7.4712643678160925e-06, "loss": 0.038, "step": 65 }, { "epoch": 0.022924626606460576, "grad_norm": 1.336495145846901, "learning_rate": 7.586206896551724e-06, "loss": 0.037, "step": 66 }, { "epoch": 0.02327196943383119, "grad_norm": 1.3878158762500856, "learning_rate": 7.701149425287356e-06, "loss": 0.036, "step": 67 }, { "epoch": 0.023619312261201807, "grad_norm": 0.7502926647886281, "learning_rate": 7.81609195402299e-06, "loss": 0.027, "step": 68 }, { "epoch": 0.02396665508857242, "grad_norm": 2.7789051553384114, "learning_rate": 7.93103448275862e-06, "loss": 0.0385, "step": 69 }, { "epoch": 0.024313997915943037, "grad_norm": 1.510784689203471, "learning_rate": 8.045977011494253e-06, "loss": 0.0434, "step": 70 }, { "epoch": 0.02466134074331365, "grad_norm": 1.795628251322906, "learning_rate": 8.160919540229886e-06, "loss": 0.0308, "step": 71 }, { "epoch": 0.025008683570684264, "grad_norm": 0.8482257639162686, "learning_rate": 8.275862068965518e-06, "loss": 0.0354, "step": 72 }, { "epoch": 0.02535602639805488, "grad_norm": 1.9996412936621009, "learning_rate": 8.390804597701149e-06, "loss": 0.0369, "step": 73 }, { "epoch": 0.025703369225425494, "grad_norm": 0.7654423441044093, "learning_rate": 8.505747126436782e-06, "loss": 0.0353, "step": 74 }, { "epoch": 0.02605071205279611, "grad_norm": 0.6493879425365747, "learning_rate": 8.620689655172414e-06, "loss": 0.0291, "step": 75 }, { "epoch": 0.026398054880166725, "grad_norm": 2.414435804159643, "learning_rate": 8.735632183908047e-06, "loss": 0.0317, "step": 76 }, { "epoch": 0.02674539770753734, "grad_norm": 1.0904156611035012, "learning_rate": 8.85057471264368e-06, "loss": 0.0269, "step": 77 }, { "epoch": 0.027092740534907955, "grad_norm": 1.4189653573098011, "learning_rate": 8.965517241379312e-06, "loss": 0.0236, "step": 78 }, { "epoch": 0.02744008336227857, "grad_norm": 1.040668552643942, "learning_rate": 9.080459770114942e-06, "loss": 0.0243, "step": 79 }, { "epoch": 0.027787426189649182, "grad_norm": 1.3437174808420649, "learning_rate": 9.195402298850575e-06, "loss": 0.0332, "step": 80 }, { "epoch": 0.0281347690170198, "grad_norm": 1.4299130307272268, "learning_rate": 9.310344827586207e-06, "loss": 0.0308, "step": 81 }, { "epoch": 0.028482111844390413, "grad_norm": 1.2629695612963914, "learning_rate": 9.42528735632184e-06, "loss": 0.0203, "step": 82 }, { "epoch": 0.02882945467176103, "grad_norm": 1.3219643843083144, "learning_rate": 9.54022988505747e-06, "loss": 0.0379, "step": 83 }, { "epoch": 0.029176797499131643, "grad_norm": 3.6871466385779783, "learning_rate": 9.655172413793105e-06, "loss": 0.0274, "step": 84 }, { "epoch": 0.029524140326502257, "grad_norm": 3.9969862428228033, "learning_rate": 9.770114942528738e-06, "loss": 0.0323, "step": 85 }, { "epoch": 0.029871483153872874, "grad_norm": 3.065195993982362, "learning_rate": 9.885057471264368e-06, "loss": 0.0309, "step": 86 }, { "epoch": 0.030218825981243487, "grad_norm": 1.468135218974371, "learning_rate": 1e-05, "loss": 0.0263, "step": 87 }, { "epoch": 0.0305661688086141, "grad_norm": 2.1306295097585353, "learning_rate": 9.999996834743712e-06, "loss": 0.0366, "step": 88 }, { "epoch": 0.030913511635984717, "grad_norm": 4.561675020890797, "learning_rate": 9.999987338978852e-06, "loss": 0.0399, "step": 89 }, { "epoch": 0.03126085446335533, "grad_norm": 3.0699739972744355, "learning_rate": 9.999971512717445e-06, "loss": 0.0361, "step": 90 }, { "epoch": 0.03160819729072595, "grad_norm": 1.7204154640977625, "learning_rate": 9.99994935597953e-06, "loss": 0.023, "step": 91 }, { "epoch": 0.03195554011809656, "grad_norm": 0.4121394943188734, "learning_rate": 9.999920868793156e-06, "loss": 0.0151, "step": 92 }, { "epoch": 0.032302882945467175, "grad_norm": 1.8037684571162973, "learning_rate": 9.999886051194392e-06, "loss": 0.041, "step": 93 }, { "epoch": 0.03265022577283779, "grad_norm": 1.5191359679839431, "learning_rate": 9.999844903227323e-06, "loss": 0.0222, "step": 94 }, { "epoch": 0.03299756860020841, "grad_norm": 2.797587288217291, "learning_rate": 9.999797424944041e-06, "loss": 0.0359, "step": 95 }, { "epoch": 0.03334491142757902, "grad_norm": 0.9848390023335929, "learning_rate": 9.999743616404667e-06, "loss": 0.029, "step": 96 }, { "epoch": 0.033692254254949636, "grad_norm": 3.474922410115143, "learning_rate": 9.999683477677319e-06, "loss": 0.0433, "step": 97 }, { "epoch": 0.03403959708232025, "grad_norm": 0.4352894693833356, "learning_rate": 9.999617008838145e-06, "loss": 0.0271, "step": 98 }, { "epoch": 0.03438693990969086, "grad_norm": 1.0612949281619894, "learning_rate": 9.999544209971299e-06, "loss": 0.0262, "step": 99 }, { "epoch": 0.03473428273706148, "grad_norm": 1.5028120141315706, "learning_rate": 9.999465081168954e-06, "loss": 0.023, "step": 100 }, { "epoch": 0.035081625564432097, "grad_norm": 2.485311227250491, "learning_rate": 9.999379622531292e-06, "loss": 0.0353, "step": 101 }, { "epoch": 0.035428968391802707, "grad_norm": 1.1277504855781095, "learning_rate": 9.99928783416651e-06, "loss": 0.0238, "step": 102 }, { "epoch": 0.03577631121917332, "grad_norm": 0.9476524869349918, "learning_rate": 9.99918971619083e-06, "loss": 0.0311, "step": 103 }, { "epoch": 0.03612365404654394, "grad_norm": 4.351712281867559, "learning_rate": 9.999085268728473e-06, "loss": 0.0371, "step": 104 }, { "epoch": 0.03647099687391455, "grad_norm": 2.7348253113812424, "learning_rate": 9.998974491911681e-06, "loss": 0.0374, "step": 105 }, { "epoch": 0.03681833970128517, "grad_norm": 2.5995603090920105, "learning_rate": 9.998857385880712e-06, "loss": 0.0285, "step": 106 }, { "epoch": 0.037165682528655784, "grad_norm": 0.7311687318983651, "learning_rate": 9.99873395078383e-06, "loss": 0.0229, "step": 107 }, { "epoch": 0.0375130253560264, "grad_norm": 1.2353487773193577, "learning_rate": 9.998604186777318e-06, "loss": 0.0322, "step": 108 }, { "epoch": 0.03786036818339701, "grad_norm": 0.9451422134075631, "learning_rate": 9.998468094025473e-06, "loss": 0.0352, "step": 109 }, { "epoch": 0.03820771101076763, "grad_norm": 1.7091841651608024, "learning_rate": 9.9983256727006e-06, "loss": 0.0344, "step": 110 }, { "epoch": 0.038555053838138245, "grad_norm": 2.7065412298038387, "learning_rate": 9.998176922983017e-06, "loss": 0.0356, "step": 111 }, { "epoch": 0.038902396665508855, "grad_norm": 0.7915485766007472, "learning_rate": 9.998021845061059e-06, "loss": 0.0278, "step": 112 }, { "epoch": 0.03924973949287947, "grad_norm": 0.6604452686175176, "learning_rate": 9.99786043913107e-06, "loss": 0.0263, "step": 113 }, { "epoch": 0.03959708232025009, "grad_norm": 2.2075871866436807, "learning_rate": 9.997692705397408e-06, "loss": 0.0293, "step": 114 }, { "epoch": 0.0399444251476207, "grad_norm": 1.0990740259335516, "learning_rate": 9.99751864407244e-06, "loss": 0.0251, "step": 115 }, { "epoch": 0.040291767974991316, "grad_norm": 1.1686977294017693, "learning_rate": 9.997338255376545e-06, "loss": 0.0313, "step": 116 }, { "epoch": 0.04063911080236193, "grad_norm": 1.3383756136679479, "learning_rate": 9.997151539538114e-06, "loss": 0.0226, "step": 117 }, { "epoch": 0.04098645362973254, "grad_norm": 0.7694885404915951, "learning_rate": 9.996958496793547e-06, "loss": 0.0301, "step": 118 }, { "epoch": 0.04133379645710316, "grad_norm": 1.0594137346894543, "learning_rate": 9.996759127387259e-06, "loss": 0.0217, "step": 119 }, { "epoch": 0.04168113928447378, "grad_norm": 0.7335920204997873, "learning_rate": 9.996553431571669e-06, "loss": 0.0225, "step": 120 }, { "epoch": 0.042028482111844394, "grad_norm": 2.255099961830129, "learning_rate": 9.99634140960721e-06, "loss": 0.0248, "step": 121 }, { "epoch": 0.042375824939215004, "grad_norm": 2.42490870668873, "learning_rate": 9.996123061762324e-06, "loss": 0.0424, "step": 122 }, { "epoch": 0.04272316776658562, "grad_norm": 1.1936924708308256, "learning_rate": 9.99589838831346e-06, "loss": 0.036, "step": 123 }, { "epoch": 0.04307051059395624, "grad_norm": 0.9868005673494291, "learning_rate": 9.995667389545082e-06, "loss": 0.0418, "step": 124 }, { "epoch": 0.04341785342132685, "grad_norm": 2.506803561409087, "learning_rate": 9.995430065749653e-06, "loss": 0.0287, "step": 125 }, { "epoch": 0.043765196248697465, "grad_norm": 0.7525519955237748, "learning_rate": 9.995186417227654e-06, "loss": 0.0295, "step": 126 }, { "epoch": 0.04411253907606808, "grad_norm": 0.3504040791938595, "learning_rate": 9.994936444287565e-06, "loss": 0.0317, "step": 127 }, { "epoch": 0.04445988190343869, "grad_norm": 1.0108416374201756, "learning_rate": 9.99468014724588e-06, "loss": 0.0281, "step": 128 }, { "epoch": 0.04480722473080931, "grad_norm": 2.211821690034367, "learning_rate": 9.994417526427094e-06, "loss": 0.0351, "step": 129 }, { "epoch": 0.045154567558179926, "grad_norm": 0.4010376448499792, "learning_rate": 9.994148582163715e-06, "loss": 0.0278, "step": 130 }, { "epoch": 0.045501910385550536, "grad_norm": 1.1919924569133369, "learning_rate": 9.993873314796253e-06, "loss": 0.028, "step": 131 }, { "epoch": 0.04584925321292115, "grad_norm": 1.2102441655899068, "learning_rate": 9.993591724673225e-06, "loss": 0.0272, "step": 132 }, { "epoch": 0.04619659604029177, "grad_norm": 1.3423201041149735, "learning_rate": 9.993303812151153e-06, "loss": 0.0288, "step": 133 }, { "epoch": 0.04654393886766238, "grad_norm": 0.9911420725512745, "learning_rate": 9.993009577594564e-06, "loss": 0.0266, "step": 134 }, { "epoch": 0.046891281695032996, "grad_norm": 2.818665002325742, "learning_rate": 9.992709021375987e-06, "loss": 0.0416, "step": 135 }, { "epoch": 0.04723862452240361, "grad_norm": 0.6400736482581679, "learning_rate": 9.99240214387596e-06, "loss": 0.0224, "step": 136 }, { "epoch": 0.04758596734977423, "grad_norm": 1.564814253959133, "learning_rate": 9.99208894548302e-06, "loss": 0.0326, "step": 137 }, { "epoch": 0.04793331017714484, "grad_norm": 1.1019040221230227, "learning_rate": 9.991769426593707e-06, "loss": 0.0327, "step": 138 }, { "epoch": 0.04828065300451546, "grad_norm": 6.120623936551482, "learning_rate": 9.991443587612568e-06, "loss": 0.047, "step": 139 }, { "epoch": 0.048627995831886074, "grad_norm": 5.451375099863766, "learning_rate": 9.991111428952145e-06, "loss": 0.0503, "step": 140 }, { "epoch": 0.048975338659256684, "grad_norm": 4.7872510231651635, "learning_rate": 9.990772951032987e-06, "loss": 0.0417, "step": 141 }, { "epoch": 0.0493226814866273, "grad_norm": 3.9394554371100337, "learning_rate": 9.990428154283641e-06, "loss": 0.0341, "step": 142 }, { "epoch": 0.04967002431399792, "grad_norm": 0.8850048672337401, "learning_rate": 9.990077039140655e-06, "loss": 0.0319, "step": 143 }, { "epoch": 0.05001736714136853, "grad_norm": 0.735313738269197, "learning_rate": 9.989719606048578e-06, "loss": 0.03, "step": 144 }, { "epoch": 0.050364709968739145, "grad_norm": 2.380846224871553, "learning_rate": 9.989355855459954e-06, "loss": 0.033, "step": 145 }, { "epoch": 0.05071205279610976, "grad_norm": 1.5964417699444167, "learning_rate": 9.988985787835332e-06, "loss": 0.0297, "step": 146 }, { "epoch": 0.05105939562348037, "grad_norm": 2.2693926915330067, "learning_rate": 9.988609403643254e-06, "loss": 0.0355, "step": 147 }, { "epoch": 0.05140673845085099, "grad_norm": 0.8694077145197386, "learning_rate": 9.98822670336026e-06, "loss": 0.0257, "step": 148 }, { "epoch": 0.051754081278221606, "grad_norm": 2.6184498491631687, "learning_rate": 9.987837687470889e-06, "loss": 0.0349, "step": 149 }, { "epoch": 0.05210142410559222, "grad_norm": 1.8812135552729436, "learning_rate": 9.987442356467677e-06, "loss": 0.0205, "step": 150 }, { "epoch": 0.05244876693296283, "grad_norm": 0.757853013334225, "learning_rate": 9.987040710851148e-06, "loss": 0.0361, "step": 151 }, { "epoch": 0.05279610976033345, "grad_norm": 0.9951588342302143, "learning_rate": 9.98663275112983e-06, "loss": 0.0237, "step": 152 }, { "epoch": 0.05314345258770407, "grad_norm": 1.6932805634155623, "learning_rate": 9.986218477820244e-06, "loss": 0.0368, "step": 153 }, { "epoch": 0.05349079541507468, "grad_norm": 1.3139957494156915, "learning_rate": 9.985797891446898e-06, "loss": 0.0229, "step": 154 }, { "epoch": 0.053838138242445294, "grad_norm": 0.5291454593174553, "learning_rate": 9.9853709925423e-06, "loss": 0.0228, "step": 155 }, { "epoch": 0.05418548106981591, "grad_norm": 1.6707077604660217, "learning_rate": 9.984937781646948e-06, "loss": 0.03, "step": 156 }, { "epoch": 0.05453282389718652, "grad_norm": 3.0726902017901927, "learning_rate": 9.984498259309332e-06, "loss": 0.0334, "step": 157 }, { "epoch": 0.05488016672455714, "grad_norm": 0.7820818072933668, "learning_rate": 9.984052426085931e-06, "loss": 0.0205, "step": 158 }, { "epoch": 0.055227509551927755, "grad_norm": 1.5369790153168252, "learning_rate": 9.983600282541213e-06, "loss": 0.033, "step": 159 }, { "epoch": 0.055574852379298365, "grad_norm": 0.6130697531268696, "learning_rate": 9.983141829247644e-06, "loss": 0.0323, "step": 160 }, { "epoch": 0.05592219520666898, "grad_norm": 2.0042679704987245, "learning_rate": 9.982677066785667e-06, "loss": 0.0362, "step": 161 }, { "epoch": 0.0562695380340396, "grad_norm": 0.9373286974868164, "learning_rate": 9.982205995743723e-06, "loss": 0.027, "step": 162 }, { "epoch": 0.05661688086141021, "grad_norm": 0.9593697401680631, "learning_rate": 9.981728616718234e-06, "loss": 0.0221, "step": 163 }, { "epoch": 0.056964223688780825, "grad_norm": 0.9145114104321098, "learning_rate": 9.981244930313613e-06, "loss": 0.0269, "step": 164 }, { "epoch": 0.05731156651615144, "grad_norm": 1.7022933635129043, "learning_rate": 9.980754937142254e-06, "loss": 0.0273, "step": 165 }, { "epoch": 0.05765890934352206, "grad_norm": 0.5802100036143374, "learning_rate": 9.98025863782454e-06, "loss": 0.0197, "step": 166 }, { "epoch": 0.05800625217089267, "grad_norm": 2.06874906530981, "learning_rate": 9.979756032988837e-06, "loss": 0.0292, "step": 167 }, { "epoch": 0.058353594998263286, "grad_norm": 2.9290249045005017, "learning_rate": 9.979247123271494e-06, "loss": 0.0338, "step": 168 }, { "epoch": 0.0587009378256339, "grad_norm": 0.8386866195629475, "learning_rate": 9.978731909316841e-06, "loss": 0.0243, "step": 169 }, { "epoch": 0.05904828065300451, "grad_norm": 0.4502042472741338, "learning_rate": 9.978210391777195e-06, "loss": 0.0255, "step": 170 }, { "epoch": 0.05939562348037513, "grad_norm": 2.248415124487682, "learning_rate": 9.977682571312847e-06, "loss": 0.0336, "step": 171 }, { "epoch": 0.05974296630774575, "grad_norm": 1.0652482256976974, "learning_rate": 9.977148448592077e-06, "loss": 0.0224, "step": 172 }, { "epoch": 0.06009030913511636, "grad_norm": 0.7329238551144143, "learning_rate": 9.976608024291135e-06, "loss": 0.0266, "step": 173 }, { "epoch": 0.060437651962486974, "grad_norm": 0.8741338102260089, "learning_rate": 9.976061299094253e-06, "loss": 0.0219, "step": 174 }, { "epoch": 0.06078499478985759, "grad_norm": 1.6779119248256096, "learning_rate": 9.975508273693643e-06, "loss": 0.0371, "step": 175 }, { "epoch": 0.0611323376172282, "grad_norm": 0.7059586695874602, "learning_rate": 9.974948948789492e-06, "loss": 0.0255, "step": 176 }, { "epoch": 0.06147968044459882, "grad_norm": 2.197705367927432, "learning_rate": 9.974383325089962e-06, "loss": 0.0222, "step": 177 }, { "epoch": 0.061827023271969435, "grad_norm": 1.2087103303661568, "learning_rate": 9.973811403311192e-06, "loss": 0.0336, "step": 178 }, { "epoch": 0.06217436609934005, "grad_norm": 1.831109776996565, "learning_rate": 9.97323318417729e-06, "loss": 0.0292, "step": 179 }, { "epoch": 0.06252170892671066, "grad_norm": 0.6811940346616355, "learning_rate": 9.972648668420346e-06, "loss": 0.0241, "step": 180 }, { "epoch": 0.06286905175408128, "grad_norm": 0.7273972811687258, "learning_rate": 9.972057856780412e-06, "loss": 0.0164, "step": 181 }, { "epoch": 0.0632163945814519, "grad_norm": 1.795643396141142, "learning_rate": 9.97146075000552e-06, "loss": 0.0233, "step": 182 }, { "epoch": 0.06356373740882251, "grad_norm": 0.7725411745774898, "learning_rate": 9.970857348851667e-06, "loss": 0.0352, "step": 183 }, { "epoch": 0.06391108023619312, "grad_norm": 0.7933463105547254, "learning_rate": 9.970247654082816e-06, "loss": 0.0296, "step": 184 }, { "epoch": 0.06425842306356373, "grad_norm": 2.2605994347865774, "learning_rate": 9.96963166647091e-06, "loss": 0.04, "step": 185 }, { "epoch": 0.06460576589093435, "grad_norm": 1.0749382398980514, "learning_rate": 9.969009386795849e-06, "loss": 0.022, "step": 186 }, { "epoch": 0.06495310871830497, "grad_norm": 0.557792334948461, "learning_rate": 9.968380815845504e-06, "loss": 0.0327, "step": 187 }, { "epoch": 0.06530045154567558, "grad_norm": 3.518645514803458, "learning_rate": 9.967745954415711e-06, "loss": 0.0286, "step": 188 }, { "epoch": 0.0656477943730462, "grad_norm": 0.5873301047411636, "learning_rate": 9.967104803310266e-06, "loss": 0.0295, "step": 189 }, { "epoch": 0.06599513720041682, "grad_norm": 0.6753747187607116, "learning_rate": 9.966457363340936e-06, "loss": 0.026, "step": 190 }, { "epoch": 0.06634248002778742, "grad_norm": 0.4603778678263685, "learning_rate": 9.965803635327445e-06, "loss": 0.028, "step": 191 }, { "epoch": 0.06668982285515804, "grad_norm": 0.9556093549811782, "learning_rate": 9.965143620097479e-06, "loss": 0.0195, "step": 192 }, { "epoch": 0.06703716568252865, "grad_norm": 2.1021718467486745, "learning_rate": 9.964477318486687e-06, "loss": 0.0256, "step": 193 }, { "epoch": 0.06738450850989927, "grad_norm": 1.714896004446046, "learning_rate": 9.963804731338674e-06, "loss": 0.0245, "step": 194 }, { "epoch": 0.06773185133726989, "grad_norm": 0.8958701314528463, "learning_rate": 9.963125859505e-06, "loss": 0.0367, "step": 195 }, { "epoch": 0.0680791941646405, "grad_norm": 1.4512351067062361, "learning_rate": 9.962440703845193e-06, "loss": 0.0204, "step": 196 }, { "epoch": 0.06842653699201111, "grad_norm": 0.8793852979074263, "learning_rate": 9.961749265226728e-06, "loss": 0.0182, "step": 197 }, { "epoch": 0.06877387981938173, "grad_norm": 1.8655135067659925, "learning_rate": 9.961051544525037e-06, "loss": 0.0228, "step": 198 }, { "epoch": 0.06912122264675234, "grad_norm": 2.2100382744344698, "learning_rate": 9.960347542623506e-06, "loss": 0.0344, "step": 199 }, { "epoch": 0.06946856547412296, "grad_norm": 0.9843602786620123, "learning_rate": 9.959637260413471e-06, "loss": 0.0298, "step": 200 }, { "epoch": 0.06981590830149358, "grad_norm": 3.5094590361679496, "learning_rate": 9.958920698794226e-06, "loss": 0.037, "step": 201 }, { "epoch": 0.07016325112886419, "grad_norm": 1.5466354807280334, "learning_rate": 9.958197858673009e-06, "loss": 0.0294, "step": 202 }, { "epoch": 0.07051059395623481, "grad_norm": 3.257914253480638, "learning_rate": 9.95746874096501e-06, "loss": 0.0343, "step": 203 }, { "epoch": 0.07085793678360541, "grad_norm": 3.1724146363151333, "learning_rate": 9.95673334659337e-06, "loss": 0.046, "step": 204 }, { "epoch": 0.07120527961097603, "grad_norm": 2.045576669926992, "learning_rate": 9.95599167648917e-06, "loss": 0.0328, "step": 205 }, { "epoch": 0.07155262243834665, "grad_norm": 4.362827099663752, "learning_rate": 9.95524373159144e-06, "loss": 0.0333, "step": 206 }, { "epoch": 0.07189996526571726, "grad_norm": 2.1027185240369244, "learning_rate": 9.954489512847156e-06, "loss": 0.0278, "step": 207 }, { "epoch": 0.07224730809308788, "grad_norm": 1.951261682373136, "learning_rate": 9.953729021211238e-06, "loss": 0.0369, "step": 208 }, { "epoch": 0.0725946509204585, "grad_norm": 2.70690992472503, "learning_rate": 9.952962257646545e-06, "loss": 0.0368, "step": 209 }, { "epoch": 0.0729419937478291, "grad_norm": 1.1045388371026474, "learning_rate": 9.952189223123877e-06, "loss": 0.0198, "step": 210 }, { "epoch": 0.07328933657519972, "grad_norm": 0.37890065356086955, "learning_rate": 9.951409918621977e-06, "loss": 0.0275, "step": 211 }, { "epoch": 0.07363667940257033, "grad_norm": 1.27246581897078, "learning_rate": 9.950624345127523e-06, "loss": 0.0269, "step": 212 }, { "epoch": 0.07398402222994095, "grad_norm": 0.6532636236631134, "learning_rate": 9.949832503635133e-06, "loss": 0.0275, "step": 213 }, { "epoch": 0.07433136505731157, "grad_norm": 2.5733332280836576, "learning_rate": 9.949034395147357e-06, "loss": 0.0274, "step": 214 }, { "epoch": 0.07467870788468219, "grad_norm": 2.652931462756075, "learning_rate": 9.948230020674685e-06, "loss": 0.0321, "step": 215 }, { "epoch": 0.0750260507120528, "grad_norm": 1.2497045154490993, "learning_rate": 9.947419381235538e-06, "loss": 0.0277, "step": 216 }, { "epoch": 0.0753733935394234, "grad_norm": 1.029916760819159, "learning_rate": 9.946602477856262e-06, "loss": 0.0223, "step": 217 }, { "epoch": 0.07572073636679402, "grad_norm": 1.0496525720606742, "learning_rate": 9.94577931157115e-06, "loss": 0.0309, "step": 218 }, { "epoch": 0.07606807919416464, "grad_norm": 0.8813614432848731, "learning_rate": 9.944949883422409e-06, "loss": 0.0223, "step": 219 }, { "epoch": 0.07641542202153526, "grad_norm": 1.048099444449534, "learning_rate": 9.944114194460181e-06, "loss": 0.0358, "step": 220 }, { "epoch": 0.07676276484890587, "grad_norm": 0.5608593650182933, "learning_rate": 9.943272245742534e-06, "loss": 0.0197, "step": 221 }, { "epoch": 0.07711010767627649, "grad_norm": 1.6998010859628299, "learning_rate": 9.942424038335462e-06, "loss": 0.0255, "step": 222 }, { "epoch": 0.0774574505036471, "grad_norm": 2.345709307993445, "learning_rate": 9.941569573312882e-06, "loss": 0.0341, "step": 223 }, { "epoch": 0.07780479333101771, "grad_norm": 0.7888283535068908, "learning_rate": 9.940708851756633e-06, "loss": 0.0204, "step": 224 }, { "epoch": 0.07815213615838833, "grad_norm": 0.5490007316162747, "learning_rate": 9.939841874756481e-06, "loss": 0.0273, "step": 225 }, { "epoch": 0.07849947898575894, "grad_norm": 1.1934836714677952, "learning_rate": 9.938968643410103e-06, "loss": 0.0206, "step": 226 }, { "epoch": 0.07884682181312956, "grad_norm": 0.7241877331482239, "learning_rate": 9.938089158823101e-06, "loss": 0.0221, "step": 227 }, { "epoch": 0.07919416464050018, "grad_norm": 2.4778189410310696, "learning_rate": 9.937203422108995e-06, "loss": 0.0308, "step": 228 }, { "epoch": 0.0795415074678708, "grad_norm": 3.748172451740872, "learning_rate": 9.936311434389216e-06, "loss": 0.0427, "step": 229 }, { "epoch": 0.0798888502952414, "grad_norm": 0.7889067721600285, "learning_rate": 9.935413196793111e-06, "loss": 0.0193, "step": 230 }, { "epoch": 0.08023619312261202, "grad_norm": 0.704831305338057, "learning_rate": 9.934508710457944e-06, "loss": 0.0194, "step": 231 }, { "epoch": 0.08058353594998263, "grad_norm": 0.40717838994648536, "learning_rate": 9.933597976528883e-06, "loss": 0.0254, "step": 232 }, { "epoch": 0.08093087877735325, "grad_norm": 1.964638344320029, "learning_rate": 9.932680996159016e-06, "loss": 0.0283, "step": 233 }, { "epoch": 0.08127822160472387, "grad_norm": 2.4616476396806473, "learning_rate": 9.931757770509332e-06, "loss": 0.0364, "step": 234 }, { "epoch": 0.08162556443209448, "grad_norm": 0.6589471069941858, "learning_rate": 9.930828300748726e-06, "loss": 0.0152, "step": 235 }, { "epoch": 0.08197290725946509, "grad_norm": 1.5170996006478779, "learning_rate": 9.929892588054007e-06, "loss": 0.0456, "step": 236 }, { "epoch": 0.0823202500868357, "grad_norm": 0.49344733685789915, "learning_rate": 9.928950633609878e-06, "loss": 0.0157, "step": 237 }, { "epoch": 0.08266759291420632, "grad_norm": 1.2510013978714254, "learning_rate": 9.928002438608955e-06, "loss": 0.0264, "step": 238 }, { "epoch": 0.08301493574157694, "grad_norm": 0.6672029612706915, "learning_rate": 9.927048004251748e-06, "loss": 0.0232, "step": 239 }, { "epoch": 0.08336227856894755, "grad_norm": 0.7870864791530758, "learning_rate": 9.926087331746668e-06, "loss": 0.0241, "step": 240 }, { "epoch": 0.08370962139631817, "grad_norm": 0.4430411461060668, "learning_rate": 9.925120422310023e-06, "loss": 0.0239, "step": 241 }, { "epoch": 0.08405696422368879, "grad_norm": 0.7393154201127319, "learning_rate": 9.924147277166025e-06, "loss": 0.0221, "step": 242 }, { "epoch": 0.08440430705105939, "grad_norm": 1.5211011444968148, "learning_rate": 9.923167897546773e-06, "loss": 0.0361, "step": 243 }, { "epoch": 0.08475164987843001, "grad_norm": 0.306402457203348, "learning_rate": 9.92218228469226e-06, "loss": 0.0161, "step": 244 }, { "epoch": 0.08509899270580062, "grad_norm": 0.6425545998545745, "learning_rate": 9.921190439850374e-06, "loss": 0.0336, "step": 245 }, { "epoch": 0.08544633553317124, "grad_norm": 1.1621242417732545, "learning_rate": 9.920192364276894e-06, "loss": 0.0346, "step": 246 }, { "epoch": 0.08579367836054186, "grad_norm": 0.8594232560472324, "learning_rate": 9.919188059235483e-06, "loss": 0.0276, "step": 247 }, { "epoch": 0.08614102118791248, "grad_norm": 1.1817899977122954, "learning_rate": 9.918177525997697e-06, "loss": 0.0237, "step": 248 }, { "epoch": 0.08648836401528308, "grad_norm": 0.663618476895076, "learning_rate": 9.917160765842972e-06, "loss": 0.0147, "step": 249 }, { "epoch": 0.0868357068426537, "grad_norm": 2.1119160052926906, "learning_rate": 9.916137780058634e-06, "loss": 0.0372, "step": 250 }, { "epoch": 0.08718304967002431, "grad_norm": 0.5192464513916062, "learning_rate": 9.915108569939884e-06, "loss": 0.0226, "step": 251 }, { "epoch": 0.08753039249739493, "grad_norm": 1.4192863431762925, "learning_rate": 9.914073136789812e-06, "loss": 0.0208, "step": 252 }, { "epoch": 0.08787773532476555, "grad_norm": 1.4952635090772373, "learning_rate": 9.913031481919378e-06, "loss": 0.0256, "step": 253 }, { "epoch": 0.08822507815213616, "grad_norm": 0.7782630656189196, "learning_rate": 9.911983606647426e-06, "loss": 0.0301, "step": 254 }, { "epoch": 0.08857242097950677, "grad_norm": 1.0227442959844868, "learning_rate": 9.910929512300673e-06, "loss": 0.0205, "step": 255 }, { "epoch": 0.08891976380687738, "grad_norm": 1.0984883231423739, "learning_rate": 9.909869200213711e-06, "loss": 0.0272, "step": 256 }, { "epoch": 0.089267106634248, "grad_norm": 2.616121786507333, "learning_rate": 9.908802671729004e-06, "loss": 0.0298, "step": 257 }, { "epoch": 0.08961444946161862, "grad_norm": 1.0424691898538212, "learning_rate": 9.907729928196885e-06, "loss": 0.0221, "step": 258 }, { "epoch": 0.08996179228898923, "grad_norm": 1.2136434228663253, "learning_rate": 9.90665097097556e-06, "loss": 0.0344, "step": 259 }, { "epoch": 0.09030913511635985, "grad_norm": 1.0607165560461278, "learning_rate": 9.905565801431097e-06, "loss": 0.0278, "step": 260 }, { "epoch": 0.09065647794373047, "grad_norm": 0.7592364646233872, "learning_rate": 9.904474420937431e-06, "loss": 0.0241, "step": 261 }, { "epoch": 0.09100382077110107, "grad_norm": 1.142833500760027, "learning_rate": 9.903376830876363e-06, "loss": 0.0194, "step": 262 }, { "epoch": 0.09135116359847169, "grad_norm": 1.5792096178170156, "learning_rate": 9.902273032637558e-06, "loss": 0.0285, "step": 263 }, { "epoch": 0.0916985064258423, "grad_norm": 1.2090200476218058, "learning_rate": 9.901163027618532e-06, "loss": 0.027, "step": 264 }, { "epoch": 0.09204584925321292, "grad_norm": 0.7247661863395669, "learning_rate": 9.90004681722467e-06, "loss": 0.0381, "step": 265 }, { "epoch": 0.09239319208058354, "grad_norm": 1.3359604948328887, "learning_rate": 9.898924402869204e-06, "loss": 0.025, "step": 266 }, { "epoch": 0.09274053490795416, "grad_norm": 1.5054331995013246, "learning_rate": 9.897795785973227e-06, "loss": 0.0176, "step": 267 }, { "epoch": 0.09308787773532476, "grad_norm": 0.3800080517239375, "learning_rate": 9.896660967965688e-06, "loss": 0.0206, "step": 268 }, { "epoch": 0.09343522056269538, "grad_norm": 2.2058762679111212, "learning_rate": 9.895519950283378e-06, "loss": 0.0382, "step": 269 }, { "epoch": 0.09378256339006599, "grad_norm": 1.987671837514159, "learning_rate": 9.894372734370945e-06, "loss": 0.0245, "step": 270 }, { "epoch": 0.09412990621743661, "grad_norm": 1.1525093904407058, "learning_rate": 9.89321932168088e-06, "loss": 0.0294, "step": 271 }, { "epoch": 0.09447724904480723, "grad_norm": 0.5288867918479366, "learning_rate": 9.892059713673521e-06, "loss": 0.0329, "step": 272 }, { "epoch": 0.09482459187217784, "grad_norm": 2.3156241409626537, "learning_rate": 9.890893911817056e-06, "loss": 0.0281, "step": 273 }, { "epoch": 0.09517193469954846, "grad_norm": 1.6457301083303564, "learning_rate": 9.889721917587504e-06, "loss": 0.0246, "step": 274 }, { "epoch": 0.09551927752691906, "grad_norm": 1.3357334045720493, "learning_rate": 9.888543732468732e-06, "loss": 0.0285, "step": 275 }, { "epoch": 0.09586662035428968, "grad_norm": 1.283091226837623, "learning_rate": 9.887359357952441e-06, "loss": 0.0354, "step": 276 }, { "epoch": 0.0962139631816603, "grad_norm": 1.468754392397442, "learning_rate": 9.886168795538175e-06, "loss": 0.0243, "step": 277 }, { "epoch": 0.09656130600903091, "grad_norm": 0.5847171116164384, "learning_rate": 9.884972046733306e-06, "loss": 0.0187, "step": 278 }, { "epoch": 0.09690864883640153, "grad_norm": 0.7176769597295244, "learning_rate": 9.883769113053039e-06, "loss": 0.0242, "step": 279 }, { "epoch": 0.09725599166377215, "grad_norm": 0.4262926228040994, "learning_rate": 9.882559996020414e-06, "loss": 0.0179, "step": 280 }, { "epoch": 0.09760333449114275, "grad_norm": 0.9525497182278096, "learning_rate": 9.881344697166293e-06, "loss": 0.0349, "step": 281 }, { "epoch": 0.09795067731851337, "grad_norm": 0.9791433669149336, "learning_rate": 9.880123218029374e-06, "loss": 0.0308, "step": 282 }, { "epoch": 0.09829802014588399, "grad_norm": 0.7630066342045814, "learning_rate": 9.878895560156172e-06, "loss": 0.0205, "step": 283 }, { "epoch": 0.0986453629732546, "grad_norm": 0.364457070730968, "learning_rate": 9.877661725101028e-06, "loss": 0.0195, "step": 284 }, { "epoch": 0.09899270580062522, "grad_norm": 1.2950920833202644, "learning_rate": 9.876421714426104e-06, "loss": 0.0211, "step": 285 }, { "epoch": 0.09934004862799584, "grad_norm": 3.594479893409194, "learning_rate": 9.87517552970138e-06, "loss": 0.0375, "step": 286 }, { "epoch": 0.09968739145536645, "grad_norm": 1.3559833171841018, "learning_rate": 9.873923172504653e-06, "loss": 0.0306, "step": 287 }, { "epoch": 0.10003473428273706, "grad_norm": 1.0574786377389542, "learning_rate": 9.872664644421539e-06, "loss": 0.0378, "step": 288 }, { "epoch": 0.10038207711010767, "grad_norm": 0.8553158210643477, "learning_rate": 9.87139994704546e-06, "loss": 0.017, "step": 289 }, { "epoch": 0.10072941993747829, "grad_norm": 1.2327340151649626, "learning_rate": 9.870129081977654e-06, "loss": 0.0291, "step": 290 }, { "epoch": 0.10107676276484891, "grad_norm": 0.6120639599947546, "learning_rate": 9.868852050827167e-06, "loss": 0.0248, "step": 291 }, { "epoch": 0.10142410559221952, "grad_norm": 1.3504767761554735, "learning_rate": 9.86756885521085e-06, "loss": 0.0196, "step": 292 }, { "epoch": 0.10177144841959014, "grad_norm": 2.130326569000447, "learning_rate": 9.866279496753361e-06, "loss": 0.0272, "step": 293 }, { "epoch": 0.10211879124696074, "grad_norm": 2.961313545763907, "learning_rate": 9.86498397708716e-06, "loss": 0.0306, "step": 294 }, { "epoch": 0.10246613407433136, "grad_norm": 0.6185598205757522, "learning_rate": 9.863682297852506e-06, "loss": 0.0268, "step": 295 }, { "epoch": 0.10281347690170198, "grad_norm": 2.900646593623201, "learning_rate": 9.862374460697462e-06, "loss": 0.0299, "step": 296 }, { "epoch": 0.1031608197290726, "grad_norm": 4.815048093426214, "learning_rate": 9.86106046727788e-06, "loss": 0.0413, "step": 297 }, { "epoch": 0.10350816255644321, "grad_norm": 3.6854621061195236, "learning_rate": 9.859740319257413e-06, "loss": 0.0315, "step": 298 }, { "epoch": 0.10385550538381383, "grad_norm": 3.1485787317926315, "learning_rate": 9.858414018307503e-06, "loss": 0.0395, "step": 299 }, { "epoch": 0.10420284821118445, "grad_norm": 1.6857361849761061, "learning_rate": 9.857081566107383e-06, "loss": 0.0306, "step": 300 }, { "epoch": 0.10455019103855505, "grad_norm": 1.1203216330091716, "learning_rate": 9.855742964344074e-06, "loss": 0.0243, "step": 301 }, { "epoch": 0.10489753386592567, "grad_norm": 2.214128202481394, "learning_rate": 9.854398214712382e-06, "loss": 0.0234, "step": 302 }, { "epoch": 0.10524487669329628, "grad_norm": 1.6017019194654412, "learning_rate": 9.853047318914898e-06, "loss": 0.0291, "step": 303 }, { "epoch": 0.1055922195206669, "grad_norm": 2.2711519721461917, "learning_rate": 9.851690278661998e-06, "loss": 0.0271, "step": 304 }, { "epoch": 0.10593956234803752, "grad_norm": 0.4684895446152942, "learning_rate": 9.850327095671831e-06, "loss": 0.0177, "step": 305 }, { "epoch": 0.10628690517540813, "grad_norm": 1.8303208610929107, "learning_rate": 9.848957771670326e-06, "loss": 0.0286, "step": 306 }, { "epoch": 0.10663424800277874, "grad_norm": 0.547820372795727, "learning_rate": 9.847582308391189e-06, "loss": 0.0328, "step": 307 }, { "epoch": 0.10698159083014935, "grad_norm": 1.7911990714879877, "learning_rate": 9.846200707575897e-06, "loss": 0.0153, "step": 308 }, { "epoch": 0.10732893365751997, "grad_norm": 2.725196937839101, "learning_rate": 9.844812970973699e-06, "loss": 0.0228, "step": 309 }, { "epoch": 0.10767627648489059, "grad_norm": 2.375094543001517, "learning_rate": 9.843419100341608e-06, "loss": 0.0203, "step": 310 }, { "epoch": 0.1080236193122612, "grad_norm": 0.732852515115049, "learning_rate": 9.842019097444414e-06, "loss": 0.0214, "step": 311 }, { "epoch": 0.10837096213963182, "grad_norm": 0.8565652562532247, "learning_rate": 9.840612964054658e-06, "loss": 0.031, "step": 312 }, { "epoch": 0.10871830496700244, "grad_norm": 1.046089822343109, "learning_rate": 9.839200701952653e-06, "loss": 0.0273, "step": 313 }, { "epoch": 0.10906564779437304, "grad_norm": 1.4037462533263418, "learning_rate": 9.837782312926465e-06, "loss": 0.0247, "step": 314 }, { "epoch": 0.10941299062174366, "grad_norm": 0.724135174812571, "learning_rate": 9.836357798771922e-06, "loss": 0.0197, "step": 315 }, { "epoch": 0.10976033344911428, "grad_norm": 0.8979754709718626, "learning_rate": 9.834927161292604e-06, "loss": 0.0135, "step": 316 }, { "epoch": 0.11010767627648489, "grad_norm": 0.818961402615803, "learning_rate": 9.833490402299844e-06, "loss": 0.024, "step": 317 }, { "epoch": 0.11045501910385551, "grad_norm": 1.8250252957901378, "learning_rate": 9.832047523612726e-06, "loss": 0.0305, "step": 318 }, { "epoch": 0.11080236193122613, "grad_norm": 0.898470564935298, "learning_rate": 9.830598527058083e-06, "loss": 0.0262, "step": 319 }, { "epoch": 0.11114970475859673, "grad_norm": 0.7300373503368474, "learning_rate": 9.829143414470495e-06, "loss": 0.0286, "step": 320 }, { "epoch": 0.11149704758596735, "grad_norm": 1.0198415430023702, "learning_rate": 9.82768218769228e-06, "loss": 0.0245, "step": 321 }, { "epoch": 0.11184439041333796, "grad_norm": 1.5853784707754817, "learning_rate": 9.826214848573503e-06, "loss": 0.0272, "step": 322 }, { "epoch": 0.11219173324070858, "grad_norm": 0.5862824974087557, "learning_rate": 9.824741398971966e-06, "loss": 0.0168, "step": 323 }, { "epoch": 0.1125390760680792, "grad_norm": 1.0344261835540582, "learning_rate": 9.823261840753209e-06, "loss": 0.0186, "step": 324 }, { "epoch": 0.11288641889544981, "grad_norm": 1.3434435771133537, "learning_rate": 9.821776175790501e-06, "loss": 0.0287, "step": 325 }, { "epoch": 0.11323376172282042, "grad_norm": 0.6130576808513916, "learning_rate": 9.820284405964846e-06, "loss": 0.018, "step": 326 }, { "epoch": 0.11358110455019103, "grad_norm": 0.6570948194085574, "learning_rate": 9.81878653316498e-06, "loss": 0.0136, "step": 327 }, { "epoch": 0.11392844737756165, "grad_norm": 1.6328034399729674, "learning_rate": 9.817282559287362e-06, "loss": 0.0277, "step": 328 }, { "epoch": 0.11427579020493227, "grad_norm": 0.3948241510052037, "learning_rate": 9.815772486236179e-06, "loss": 0.0126, "step": 329 }, { "epoch": 0.11462313303230288, "grad_norm": 0.5069144619443747, "learning_rate": 9.814256315923335e-06, "loss": 0.0119, "step": 330 }, { "epoch": 0.1149704758596735, "grad_norm": 1.1749675379829028, "learning_rate": 9.81273405026846e-06, "loss": 0.0401, "step": 331 }, { "epoch": 0.11531781868704412, "grad_norm": 1.38512145823474, "learning_rate": 9.811205691198897e-06, "loss": 0.0255, "step": 332 }, { "epoch": 0.11566516151441472, "grad_norm": 1.6588777415442508, "learning_rate": 9.809671240649705e-06, "loss": 0.0232, "step": 333 }, { "epoch": 0.11601250434178534, "grad_norm": 0.891351380406614, "learning_rate": 9.808130700563658e-06, "loss": 0.0182, "step": 334 }, { "epoch": 0.11635984716915596, "grad_norm": 1.9182917391789063, "learning_rate": 9.806584072891234e-06, "loss": 0.0295, "step": 335 }, { "epoch": 0.11670718999652657, "grad_norm": 1.414292519566524, "learning_rate": 9.805031359590626e-06, "loss": 0.0182, "step": 336 }, { "epoch": 0.11705453282389719, "grad_norm": 1.048723673722958, "learning_rate": 9.803472562627726e-06, "loss": 0.0119, "step": 337 }, { "epoch": 0.1174018756512678, "grad_norm": 1.2747705868686736, "learning_rate": 9.801907683976128e-06, "loss": 0.0272, "step": 338 }, { "epoch": 0.11774921847863841, "grad_norm": 0.9457245602215513, "learning_rate": 9.800336725617136e-06, "loss": 0.0201, "step": 339 }, { "epoch": 0.11809656130600903, "grad_norm": 1.2510122100134684, "learning_rate": 9.798759689539739e-06, "loss": 0.0257, "step": 340 }, { "epoch": 0.11844390413337964, "grad_norm": 0.8319010285486187, "learning_rate": 9.797176577740625e-06, "loss": 0.0254, "step": 341 }, { "epoch": 0.11879124696075026, "grad_norm": 0.7850673506313961, "learning_rate": 9.795587392224182e-06, "loss": 0.0234, "step": 342 }, { "epoch": 0.11913858978812088, "grad_norm": 0.587424666600968, "learning_rate": 9.793992135002476e-06, "loss": 0.0179, "step": 343 }, { "epoch": 0.1194859326154915, "grad_norm": 1.2534232883052836, "learning_rate": 9.792390808095268e-06, "loss": 0.0228, "step": 344 }, { "epoch": 0.11983327544286211, "grad_norm": 1.1177620903458203, "learning_rate": 9.790783413530006e-06, "loss": 0.0283, "step": 345 }, { "epoch": 0.12018061827023271, "grad_norm": 0.7956916247471605, "learning_rate": 9.789169953341809e-06, "loss": 0.026, "step": 346 }, { "epoch": 0.12052796109760333, "grad_norm": 1.4672529265658403, "learning_rate": 9.787550429573487e-06, "loss": 0.0242, "step": 347 }, { "epoch": 0.12087530392497395, "grad_norm": 0.499722630858149, "learning_rate": 9.785924844275523e-06, "loss": 0.0243, "step": 348 }, { "epoch": 0.12122264675234456, "grad_norm": 0.7528884315208382, "learning_rate": 9.784293199506076e-06, "loss": 0.0312, "step": 349 }, { "epoch": 0.12156998957971518, "grad_norm": 0.9565348415093637, "learning_rate": 9.782655497330972e-06, "loss": 0.0149, "step": 350 }, { "epoch": 0.1219173324070858, "grad_norm": 4.700227177143458, "learning_rate": 9.781011739823715e-06, "loss": 0.0459, "step": 351 }, { "epoch": 0.1222646752344564, "grad_norm": 2.0797417224686545, "learning_rate": 9.779361929065462e-06, "loss": 0.0248, "step": 352 }, { "epoch": 0.12261201806182702, "grad_norm": 1.5530326186566639, "learning_rate": 9.777706067145052e-06, "loss": 0.0278, "step": 353 }, { "epoch": 0.12295936088919764, "grad_norm": 0.8614425997016937, "learning_rate": 9.77604415615897e-06, "loss": 0.0171, "step": 354 }, { "epoch": 0.12330670371656825, "grad_norm": 0.798882409804884, "learning_rate": 9.77437619821137e-06, "loss": 0.0221, "step": 355 }, { "epoch": 0.12365404654393887, "grad_norm": 1.309963463600437, "learning_rate": 9.772702195414053e-06, "loss": 0.0227, "step": 356 }, { "epoch": 0.12400138937130949, "grad_norm": 1.0239470706246998, "learning_rate": 9.771022149886482e-06, "loss": 0.0247, "step": 357 }, { "epoch": 0.1243487321986801, "grad_norm": 0.694165293574145, "learning_rate": 9.769336063755763e-06, "loss": 0.0165, "step": 358 }, { "epoch": 0.1246960750260507, "grad_norm": 0.8806399954727403, "learning_rate": 9.767643939156658e-06, "loss": 0.0315, "step": 359 }, { "epoch": 0.12504341785342132, "grad_norm": 1.4519111212390032, "learning_rate": 9.765945778231568e-06, "loss": 0.0206, "step": 360 }, { "epoch": 0.12539076068079194, "grad_norm": 0.7103007161003656, "learning_rate": 9.76424158313054e-06, "loss": 0.0206, "step": 361 }, { "epoch": 0.12573810350816256, "grad_norm": 1.4369518423012835, "learning_rate": 9.762531356011258e-06, "loss": 0.0309, "step": 362 }, { "epoch": 0.12608544633553317, "grad_norm": 1.9088679954477914, "learning_rate": 9.760815099039045e-06, "loss": 0.025, "step": 363 }, { "epoch": 0.1264327891629038, "grad_norm": 1.2218949903799274, "learning_rate": 9.75909281438686e-06, "loss": 0.0241, "step": 364 }, { "epoch": 0.1267801319902744, "grad_norm": 0.8840693103252929, "learning_rate": 9.757364504235292e-06, "loss": 0.0256, "step": 365 }, { "epoch": 0.12712747481764503, "grad_norm": 2.3783985810393853, "learning_rate": 9.755630170772556e-06, "loss": 0.0279, "step": 366 }, { "epoch": 0.12747481764501564, "grad_norm": 2.1206965462444987, "learning_rate": 9.753889816194498e-06, "loss": 0.0371, "step": 367 }, { "epoch": 0.12782216047238623, "grad_norm": 3.735386143479719, "learning_rate": 9.752143442704586e-06, "loss": 0.0474, "step": 368 }, { "epoch": 0.12816950329975685, "grad_norm": 0.6664661005983077, "learning_rate": 9.750391052513906e-06, "loss": 0.026, "step": 369 }, { "epoch": 0.12851684612712747, "grad_norm": 0.3453207995141737, "learning_rate": 9.748632647841165e-06, "loss": 0.0189, "step": 370 }, { "epoch": 0.12886418895449808, "grad_norm": 1.1376711577741057, "learning_rate": 9.746868230912683e-06, "loss": 0.0347, "step": 371 }, { "epoch": 0.1292115317818687, "grad_norm": 1.0621834917092619, "learning_rate": 9.745097803962394e-06, "loss": 0.0212, "step": 372 }, { "epoch": 0.12955887460923932, "grad_norm": 1.4420764224102458, "learning_rate": 9.743321369231837e-06, "loss": 0.0288, "step": 373 }, { "epoch": 0.12990621743660993, "grad_norm": 0.47180137756368545, "learning_rate": 9.741538928970163e-06, "loss": 0.0212, "step": 374 }, { "epoch": 0.13025356026398055, "grad_norm": 1.0223869124759941, "learning_rate": 9.739750485434126e-06, "loss": 0.0275, "step": 375 }, { "epoch": 0.13060090309135117, "grad_norm": 0.3467757694541222, "learning_rate": 9.737956040888073e-06, "loss": 0.0212, "step": 376 }, { "epoch": 0.13094824591872178, "grad_norm": 0.4743085064117592, "learning_rate": 9.736155597603959e-06, "loss": 0.0238, "step": 377 }, { "epoch": 0.1312955887460924, "grad_norm": 0.5939635024690287, "learning_rate": 9.734349157861329e-06, "loss": 0.0187, "step": 378 }, { "epoch": 0.13164293157346302, "grad_norm": 0.8656882396061, "learning_rate": 9.73253672394732e-06, "loss": 0.0267, "step": 379 }, { "epoch": 0.13199027440083363, "grad_norm": 0.9530332353603665, "learning_rate": 9.73071829815666e-06, "loss": 0.0226, "step": 380 }, { "epoch": 0.13233761722820422, "grad_norm": 1.126170471334089, "learning_rate": 9.728893882791663e-06, "loss": 0.0264, "step": 381 }, { "epoch": 0.13268496005557484, "grad_norm": 1.0981019989116185, "learning_rate": 9.727063480162226e-06, "loss": 0.0192, "step": 382 }, { "epoch": 0.13303230288294546, "grad_norm": 0.890921570685852, "learning_rate": 9.725227092585824e-06, "loss": 0.0291, "step": 383 }, { "epoch": 0.13337964571031607, "grad_norm": 0.785030282186115, "learning_rate": 9.723384722387516e-06, "loss": 0.0239, "step": 384 }, { "epoch": 0.1337269885376867, "grad_norm": 2.0060229292652942, "learning_rate": 9.721536371899928e-06, "loss": 0.0291, "step": 385 }, { "epoch": 0.1340743313650573, "grad_norm": 1.0118306100174919, "learning_rate": 9.719682043463261e-06, "loss": 0.0151, "step": 386 }, { "epoch": 0.13442167419242793, "grad_norm": 1.1099679117319492, "learning_rate": 9.717821739425286e-06, "loss": 0.0257, "step": 387 }, { "epoch": 0.13476901701979854, "grad_norm": 0.6925372348746138, "learning_rate": 9.71595546214134e-06, "loss": 0.0302, "step": 388 }, { "epoch": 0.13511635984716916, "grad_norm": 1.1588103017208862, "learning_rate": 9.714083213974322e-06, "loss": 0.0159, "step": 389 }, { "epoch": 0.13546370267453978, "grad_norm": 1.6463962782244659, "learning_rate": 9.712204997294685e-06, "loss": 0.028, "step": 390 }, { "epoch": 0.1358110455019104, "grad_norm": 0.740625572838366, "learning_rate": 9.710320814480448e-06, "loss": 0.0291, "step": 391 }, { "epoch": 0.136158388329281, "grad_norm": 1.461026830177916, "learning_rate": 9.708430667917179e-06, "loss": 0.0266, "step": 392 }, { "epoch": 0.13650573115665163, "grad_norm": 1.173157945166447, "learning_rate": 9.706534559997997e-06, "loss": 0.0183, "step": 393 }, { "epoch": 0.13685307398402222, "grad_norm": 2.7283819002594334, "learning_rate": 9.704632493123569e-06, "loss": 0.031, "step": 394 }, { "epoch": 0.13720041681139283, "grad_norm": 0.804775724073791, "learning_rate": 9.702724469702107e-06, "loss": 0.0209, "step": 395 }, { "epoch": 0.13754775963876345, "grad_norm": 0.7136569776548899, "learning_rate": 9.700810492149364e-06, "loss": 0.0247, "step": 396 }, { "epoch": 0.13789510246613407, "grad_norm": 0.8364128273663843, "learning_rate": 9.698890562888632e-06, "loss": 0.0208, "step": 397 }, { "epoch": 0.13824244529350468, "grad_norm": 1.0792662917432296, "learning_rate": 9.696964684350738e-06, "loss": 0.0283, "step": 398 }, { "epoch": 0.1385897881208753, "grad_norm": 1.0704903922122424, "learning_rate": 9.695032858974042e-06, "loss": 0.0233, "step": 399 }, { "epoch": 0.13893713094824592, "grad_norm": 0.8609262980285204, "learning_rate": 9.693095089204431e-06, "loss": 0.0277, "step": 400 }, { "epoch": 0.13928447377561654, "grad_norm": 0.38166608517210565, "learning_rate": 9.691151377495324e-06, "loss": 0.0198, "step": 401 }, { "epoch": 0.13963181660298715, "grad_norm": 0.5729627674226186, "learning_rate": 9.689201726307655e-06, "loss": 0.0318, "step": 402 }, { "epoch": 0.13997915943035777, "grad_norm": 1.975870726288315, "learning_rate": 9.687246138109888e-06, "loss": 0.0322, "step": 403 }, { "epoch": 0.14032650225772839, "grad_norm": 2.1898213167704648, "learning_rate": 9.68528461537799e-06, "loss": 0.0304, "step": 404 }, { "epoch": 0.140673845085099, "grad_norm": 0.620886516690147, "learning_rate": 9.683317160595457e-06, "loss": 0.0178, "step": 405 }, { "epoch": 0.14102118791246962, "grad_norm": 0.751397853860502, "learning_rate": 9.681343776253284e-06, "loss": 0.0227, "step": 406 }, { "epoch": 0.1413685307398402, "grad_norm": 1.5360568776293804, "learning_rate": 9.679364464849983e-06, "loss": 0.024, "step": 407 }, { "epoch": 0.14171587356721083, "grad_norm": 1.9507404317073442, "learning_rate": 9.67737922889156e-06, "loss": 0.0299, "step": 408 }, { "epoch": 0.14206321639458144, "grad_norm": 0.3885448349712659, "learning_rate": 9.675388070891527e-06, "loss": 0.0169, "step": 409 }, { "epoch": 0.14241055922195206, "grad_norm": 0.7487805008765015, "learning_rate": 9.6733909933709e-06, "loss": 0.0258, "step": 410 }, { "epoch": 0.14275790204932268, "grad_norm": 0.8741306809093075, "learning_rate": 9.671387998858178e-06, "loss": 0.029, "step": 411 }, { "epoch": 0.1431052448766933, "grad_norm": 0.7791251348925693, "learning_rate": 9.669379089889361e-06, "loss": 0.0287, "step": 412 }, { "epoch": 0.1434525877040639, "grad_norm": 1.6127960765398552, "learning_rate": 9.66736426900793e-06, "loss": 0.0207, "step": 413 }, { "epoch": 0.14379993053143453, "grad_norm": 2.360372239374618, "learning_rate": 9.66534353876486e-06, "loss": 0.0257, "step": 414 }, { "epoch": 0.14414727335880514, "grad_norm": 1.0591550444954392, "learning_rate": 9.663316901718599e-06, "loss": 0.0252, "step": 415 }, { "epoch": 0.14449461618617576, "grad_norm": 0.6279163360317281, "learning_rate": 9.661284360435075e-06, "loss": 0.0168, "step": 416 }, { "epoch": 0.14484195901354638, "grad_norm": 0.9205829596021957, "learning_rate": 9.659245917487698e-06, "loss": 0.028, "step": 417 }, { "epoch": 0.145189301840917, "grad_norm": 0.7205918553309815, "learning_rate": 9.657201575457346e-06, "loss": 0.0239, "step": 418 }, { "epoch": 0.1455366446682876, "grad_norm": 0.4050107455693852, "learning_rate": 9.655151336932362e-06, "loss": 0.0152, "step": 419 }, { "epoch": 0.1458839874956582, "grad_norm": 1.5415577326332006, "learning_rate": 9.653095204508562e-06, "loss": 0.0222, "step": 420 }, { "epoch": 0.14623133032302882, "grad_norm": 0.4578806435756731, "learning_rate": 9.651033180789216e-06, "loss": 0.016, "step": 421 }, { "epoch": 0.14657867315039944, "grad_norm": 0.8614555068468486, "learning_rate": 9.648965268385062e-06, "loss": 0.0257, "step": 422 }, { "epoch": 0.14692601597777005, "grad_norm": 1.470299031527, "learning_rate": 9.646891469914285e-06, "loss": 0.0368, "step": 423 }, { "epoch": 0.14727335880514067, "grad_norm": 1.0949073778594718, "learning_rate": 9.644811788002531e-06, "loss": 0.033, "step": 424 }, { "epoch": 0.1476207016325113, "grad_norm": 0.7401125451520727, "learning_rate": 9.642726225282886e-06, "loss": 0.0288, "step": 425 }, { "epoch": 0.1479680444598819, "grad_norm": 2.2131411209283565, "learning_rate": 9.64063478439589e-06, "loss": 0.0175, "step": 426 }, { "epoch": 0.14831538728725252, "grad_norm": 2.06363585907779, "learning_rate": 9.638537467989517e-06, "loss": 0.0276, "step": 427 }, { "epoch": 0.14866273011462314, "grad_norm": 1.467812889206939, "learning_rate": 9.63643427871919e-06, "loss": 0.0291, "step": 428 }, { "epoch": 0.14901007294199375, "grad_norm": 0.4505213888594297, "learning_rate": 9.634325219247758e-06, "loss": 0.0153, "step": 429 }, { "epoch": 0.14935741576936437, "grad_norm": 1.3450924566368194, "learning_rate": 9.632210292245508e-06, "loss": 0.0225, "step": 430 }, { "epoch": 0.149704758596735, "grad_norm": 2.347870224615922, "learning_rate": 9.630089500390154e-06, "loss": 0.028, "step": 431 }, { "epoch": 0.1500521014241056, "grad_norm": 1.6462241884720297, "learning_rate": 9.627962846366838e-06, "loss": 0.024, "step": 432 }, { "epoch": 0.1503994442514762, "grad_norm": 0.39196556018543693, "learning_rate": 9.62583033286812e-06, "loss": 0.0201, "step": 433 }, { "epoch": 0.1507467870788468, "grad_norm": 1.5059296209518138, "learning_rate": 9.62369196259398e-06, "loss": 0.02, "step": 434 }, { "epoch": 0.15109412990621743, "grad_norm": 0.45555178968971705, "learning_rate": 9.621547738251816e-06, "loss": 0.0221, "step": 435 }, { "epoch": 0.15144147273358805, "grad_norm": 0.2882865925387099, "learning_rate": 9.619397662556434e-06, "loss": 0.0128, "step": 436 }, { "epoch": 0.15178881556095866, "grad_norm": 0.6918769536905994, "learning_rate": 9.617241738230051e-06, "loss": 0.0242, "step": 437 }, { "epoch": 0.15213615838832928, "grad_norm": 1.6957528332450178, "learning_rate": 9.61507996800229e-06, "loss": 0.0313, "step": 438 }, { "epoch": 0.1524835012156999, "grad_norm": 1.317146675116354, "learning_rate": 9.61291235461017e-06, "loss": 0.0215, "step": 439 }, { "epoch": 0.1528308440430705, "grad_norm": 0.727706359709951, "learning_rate": 9.610738900798116e-06, "loss": 0.0271, "step": 440 }, { "epoch": 0.15317818687044113, "grad_norm": 3.5123826215661365, "learning_rate": 9.60855960931794e-06, "loss": 0.0383, "step": 441 }, { "epoch": 0.15352552969781175, "grad_norm": 3.728862284149442, "learning_rate": 9.606374482928849e-06, "loss": 0.0318, "step": 442 }, { "epoch": 0.15387287252518236, "grad_norm": 2.760620871544741, "learning_rate": 9.604183524397439e-06, "loss": 0.041, "step": 443 }, { "epoch": 0.15422021535255298, "grad_norm": 3.1069152542667604, "learning_rate": 9.601986736497686e-06, "loss": 0.0325, "step": 444 }, { "epoch": 0.1545675581799236, "grad_norm": 1.8573780055969038, "learning_rate": 9.59978412201095e-06, "loss": 0.0317, "step": 445 }, { "epoch": 0.1549149010072942, "grad_norm": 1.0027964210682174, "learning_rate": 9.597575683725965e-06, "loss": 0.0227, "step": 446 }, { "epoch": 0.1552622438346648, "grad_norm": 1.4276269980988359, "learning_rate": 9.595361424438841e-06, "loss": 0.0362, "step": 447 }, { "epoch": 0.15560958666203542, "grad_norm": 1.317331217220333, "learning_rate": 9.593141346953059e-06, "loss": 0.0226, "step": 448 }, { "epoch": 0.15595692948940604, "grad_norm": 0.5933495170717192, "learning_rate": 9.590915454079463e-06, "loss": 0.0227, "step": 449 }, { "epoch": 0.15630427231677665, "grad_norm": 0.4622800957314312, "learning_rate": 9.588683748636262e-06, "loss": 0.0267, "step": 450 }, { "epoch": 0.15665161514414727, "grad_norm": 0.7911625488227271, "learning_rate": 9.586446233449024e-06, "loss": 0.0246, "step": 451 }, { "epoch": 0.1569989579715179, "grad_norm": 0.9702252686758073, "learning_rate": 9.584202911350672e-06, "loss": 0.0236, "step": 452 }, { "epoch": 0.1573463007988885, "grad_norm": 0.6033941783861664, "learning_rate": 9.581953785181482e-06, "loss": 0.0214, "step": 453 }, { "epoch": 0.15769364362625912, "grad_norm": 0.4655527204441117, "learning_rate": 9.579698857789078e-06, "loss": 0.0283, "step": 454 }, { "epoch": 0.15804098645362974, "grad_norm": 0.5742266010042084, "learning_rate": 9.577438132028431e-06, "loss": 0.0313, "step": 455 }, { "epoch": 0.15838832928100036, "grad_norm": 0.5365426672112383, "learning_rate": 9.575171610761848e-06, "loss": 0.0281, "step": 456 }, { "epoch": 0.15873567210837097, "grad_norm": 0.4233818161136484, "learning_rate": 9.572899296858981e-06, "loss": 0.0315, "step": 457 }, { "epoch": 0.1590830149357416, "grad_norm": 0.5680963519098914, "learning_rate": 9.570621193196811e-06, "loss": 0.0269, "step": 458 }, { "epoch": 0.15943035776311218, "grad_norm": 0.5125465753073855, "learning_rate": 9.568337302659652e-06, "loss": 0.0167, "step": 459 }, { "epoch": 0.1597777005904828, "grad_norm": 0.3119364502263281, "learning_rate": 9.566047628139142e-06, "loss": 0.021, "step": 460 }, { "epoch": 0.1601250434178534, "grad_norm": 0.9906415221821926, "learning_rate": 9.563752172534242e-06, "loss": 0.0235, "step": 461 }, { "epoch": 0.16047238624522403, "grad_norm": 0.8066115586763946, "learning_rate": 9.561450938751238e-06, "loss": 0.0229, "step": 462 }, { "epoch": 0.16081972907259465, "grad_norm": 1.005027835921377, "learning_rate": 9.559143929703724e-06, "loss": 0.0203, "step": 463 }, { "epoch": 0.16116707189996526, "grad_norm": 0.32646704240882035, "learning_rate": 9.556831148312612e-06, "loss": 0.0183, "step": 464 }, { "epoch": 0.16151441472733588, "grad_norm": 0.8970575237008778, "learning_rate": 9.554512597506122e-06, "loss": 0.0237, "step": 465 }, { "epoch": 0.1618617575547065, "grad_norm": 1.9756358587933307, "learning_rate": 9.552188280219773e-06, "loss": 0.0232, "step": 466 }, { "epoch": 0.16220910038207711, "grad_norm": 0.659327280239366, "learning_rate": 9.549858199396394e-06, "loss": 0.0154, "step": 467 }, { "epoch": 0.16255644320944773, "grad_norm": 0.9285348605348003, "learning_rate": 9.547522357986102e-06, "loss": 0.0254, "step": 468 }, { "epoch": 0.16290378603681835, "grad_norm": 0.8854039259165357, "learning_rate": 9.545180758946312e-06, "loss": 0.023, "step": 469 }, { "epoch": 0.16325112886418897, "grad_norm": 0.7572168628873044, "learning_rate": 9.542833405241729e-06, "loss": 0.0229, "step": 470 }, { "epoch": 0.16359847169155958, "grad_norm": 1.1833229445171547, "learning_rate": 9.540480299844345e-06, "loss": 0.0194, "step": 471 }, { "epoch": 0.16394581451893017, "grad_norm": 1.128447523594609, "learning_rate": 9.538121445733431e-06, "loss": 0.0312, "step": 472 }, { "epoch": 0.1642931573463008, "grad_norm": 0.8646345138050862, "learning_rate": 9.53575684589554e-06, "loss": 0.034, "step": 473 }, { "epoch": 0.1646405001736714, "grad_norm": 2.142392363566201, "learning_rate": 9.533386503324495e-06, "loss": 0.0267, "step": 474 }, { "epoch": 0.16498784300104202, "grad_norm": 0.31920633758411143, "learning_rate": 9.531010421021396e-06, "loss": 0.0264, "step": 475 }, { "epoch": 0.16533518582841264, "grad_norm": 1.1720964352470096, "learning_rate": 9.528628601994603e-06, "loss": 0.0203, "step": 476 }, { "epoch": 0.16568252865578326, "grad_norm": 0.3038247546428463, "learning_rate": 9.526241049259746e-06, "loss": 0.0178, "step": 477 }, { "epoch": 0.16602987148315387, "grad_norm": 1.4917694833284774, "learning_rate": 9.523847765839712e-06, "loss": 0.0242, "step": 478 }, { "epoch": 0.1663772143105245, "grad_norm": 0.818082889175988, "learning_rate": 9.52144875476464e-06, "loss": 0.0222, "step": 479 }, { "epoch": 0.1667245571378951, "grad_norm": 0.3877555907825711, "learning_rate": 9.519044019071926e-06, "loss": 0.0243, "step": 480 }, { "epoch": 0.16707189996526572, "grad_norm": 0.4639811409245256, "learning_rate": 9.51663356180621e-06, "loss": 0.0251, "step": 481 }, { "epoch": 0.16741924279263634, "grad_norm": 0.8099085137002117, "learning_rate": 9.514217386019381e-06, "loss": 0.0303, "step": 482 }, { "epoch": 0.16776658562000696, "grad_norm": 0.9835471538813636, "learning_rate": 9.511795494770563e-06, "loss": 0.0259, "step": 483 }, { "epoch": 0.16811392844737758, "grad_norm": 0.6270381501097829, "learning_rate": 9.509367891126122e-06, "loss": 0.0159, "step": 484 }, { "epoch": 0.16846127127474816, "grad_norm": 0.4636485528169751, "learning_rate": 9.506934578159648e-06, "loss": 0.0238, "step": 485 }, { "epoch": 0.16880861410211878, "grad_norm": 0.6141677857665021, "learning_rate": 9.50449555895197e-06, "loss": 0.0204, "step": 486 }, { "epoch": 0.1691559569294894, "grad_norm": 0.9292538654705289, "learning_rate": 9.50205083659113e-06, "loss": 0.0327, "step": 487 }, { "epoch": 0.16950329975686002, "grad_norm": 1.1196044763325756, "learning_rate": 9.499600414172402e-06, "loss": 0.0222, "step": 488 }, { "epoch": 0.16985064258423063, "grad_norm": 1.5976292188516854, "learning_rate": 9.49714429479827e-06, "loss": 0.0324, "step": 489 }, { "epoch": 0.17019798541160125, "grad_norm": 1.2857006123730916, "learning_rate": 9.494682481578436e-06, "loss": 0.0205, "step": 490 }, { "epoch": 0.17054532823897187, "grad_norm": 1.276460947257805, "learning_rate": 9.492214977629804e-06, "loss": 0.016, "step": 491 }, { "epoch": 0.17089267106634248, "grad_norm": 1.252602259656139, "learning_rate": 9.489741786076488e-06, "loss": 0.0237, "step": 492 }, { "epoch": 0.1712400138937131, "grad_norm": 0.5734672626180113, "learning_rate": 9.487262910049804e-06, "loss": 0.017, "step": 493 }, { "epoch": 0.17158735672108372, "grad_norm": 1.2737930527140826, "learning_rate": 9.48477835268826e-06, "loss": 0.0231, "step": 494 }, { "epoch": 0.17193469954845433, "grad_norm": 0.9557221623397816, "learning_rate": 9.482288117137561e-06, "loss": 0.0186, "step": 495 }, { "epoch": 0.17228204237582495, "grad_norm": 1.7629581363677385, "learning_rate": 9.479792206550604e-06, "loss": 0.0333, "step": 496 }, { "epoch": 0.17262938520319557, "grad_norm": 0.4160720670777094, "learning_rate": 9.477290624087464e-06, "loss": 0.0207, "step": 497 }, { "epoch": 0.17297672803056616, "grad_norm": 1.657594987733875, "learning_rate": 9.474783372915401e-06, "loss": 0.0187, "step": 498 }, { "epoch": 0.17332407085793677, "grad_norm": 0.6777171710639321, "learning_rate": 9.472270456208856e-06, "loss": 0.0311, "step": 499 }, { "epoch": 0.1736714136853074, "grad_norm": 0.5649689115711107, "learning_rate": 9.469751877149434e-06, "loss": 0.0319, "step": 500 }, { "epoch": 0.174018756512678, "grad_norm": 1.440887458216018, "learning_rate": 9.467227638925917e-06, "loss": 0.0314, "step": 501 }, { "epoch": 0.17436609934004862, "grad_norm": 1.4716216093022831, "learning_rate": 9.464697744734248e-06, "loss": 0.0228, "step": 502 }, { "epoch": 0.17471344216741924, "grad_norm": 0.8445232330484052, "learning_rate": 9.462162197777533e-06, "loss": 0.0298, "step": 503 }, { "epoch": 0.17506078499478986, "grad_norm": 0.8274867059781084, "learning_rate": 9.459621001266036e-06, "loss": 0.0253, "step": 504 }, { "epoch": 0.17540812782216048, "grad_norm": 1.3093883304254912, "learning_rate": 9.45707415841717e-06, "loss": 0.0284, "step": 505 }, { "epoch": 0.1757554706495311, "grad_norm": 0.4071291730241176, "learning_rate": 9.454521672455501e-06, "loss": 0.0235, "step": 506 }, { "epoch": 0.1761028134769017, "grad_norm": 1.1764547349254069, "learning_rate": 9.451963546612737e-06, "loss": 0.0195, "step": 507 }, { "epoch": 0.17645015630427233, "grad_norm": 0.6569963887274749, "learning_rate": 9.449399784127726e-06, "loss": 0.0264, "step": 508 }, { "epoch": 0.17679749913164294, "grad_norm": 0.5238198578448529, "learning_rate": 9.446830388246457e-06, "loss": 0.0194, "step": 509 }, { "epoch": 0.17714484195901353, "grad_norm": 0.7417799180982843, "learning_rate": 9.444255362222046e-06, "loss": 0.0247, "step": 510 }, { "epoch": 0.17749218478638415, "grad_norm": 1.3703456770005742, "learning_rate": 9.441674709314743e-06, "loss": 0.0274, "step": 511 }, { "epoch": 0.17783952761375477, "grad_norm": 1.4652033192638536, "learning_rate": 9.439088432791916e-06, "loss": 0.0139, "step": 512 }, { "epoch": 0.17818687044112538, "grad_norm": 1.5832446589993328, "learning_rate": 9.436496535928057e-06, "loss": 0.034, "step": 513 }, { "epoch": 0.178534213268496, "grad_norm": 1.9119409177551294, "learning_rate": 9.433899022004774e-06, "loss": 0.0251, "step": 514 }, { "epoch": 0.17888155609586662, "grad_norm": 0.788548038638029, "learning_rate": 9.431295894310786e-06, "loss": 0.0308, "step": 515 }, { "epoch": 0.17922889892323723, "grad_norm": 0.8067843750520071, "learning_rate": 9.428687156141919e-06, "loss": 0.0234, "step": 516 }, { "epoch": 0.17957624175060785, "grad_norm": 1.1709927168009244, "learning_rate": 9.426072810801104e-06, "loss": 0.0319, "step": 517 }, { "epoch": 0.17992358457797847, "grad_norm": 0.6216918250812782, "learning_rate": 9.423452861598367e-06, "loss": 0.0243, "step": 518 }, { "epoch": 0.18027092740534909, "grad_norm": 0.8872099116597142, "learning_rate": 9.420827311850836e-06, "loss": 0.0252, "step": 519 }, { "epoch": 0.1806182702327197, "grad_norm": 0.586557224997833, "learning_rate": 9.418196164882725e-06, "loss": 0.0267, "step": 520 }, { "epoch": 0.18096561306009032, "grad_norm": 0.5527086813309143, "learning_rate": 9.415559424025335e-06, "loss": 0.0212, "step": 521 }, { "epoch": 0.18131295588746094, "grad_norm": 0.6211483589771798, "learning_rate": 9.41291709261705e-06, "loss": 0.018, "step": 522 }, { "epoch": 0.18166029871483153, "grad_norm": 0.5607071996617911, "learning_rate": 9.410269174003333e-06, "loss": 0.0208, "step": 523 }, { "epoch": 0.18200764154220214, "grad_norm": 1.8330990714819673, "learning_rate": 9.407615671536723e-06, "loss": 0.0306, "step": 524 }, { "epoch": 0.18235498436957276, "grad_norm": 0.7432517423995653, "learning_rate": 9.404956588576822e-06, "loss": 0.0266, "step": 525 }, { "epoch": 0.18270232719694338, "grad_norm": 0.5660339296085843, "learning_rate": 9.402291928490302e-06, "loss": 0.0184, "step": 526 }, { "epoch": 0.183049670024314, "grad_norm": 0.7219272711113525, "learning_rate": 9.399621694650898e-06, "loss": 0.0285, "step": 527 }, { "epoch": 0.1833970128516846, "grad_norm": 1.778600324374738, "learning_rate": 9.3969458904394e-06, "loss": 0.0345, "step": 528 }, { "epoch": 0.18374435567905523, "grad_norm": 1.6387273409591696, "learning_rate": 9.394264519243649e-06, "loss": 0.0273, "step": 529 }, { "epoch": 0.18409169850642584, "grad_norm": 0.8842030070616902, "learning_rate": 9.391577584458536e-06, "loss": 0.0203, "step": 530 }, { "epoch": 0.18443904133379646, "grad_norm": 0.9269965797470187, "learning_rate": 9.388885089485995e-06, "loss": 0.018, "step": 531 }, { "epoch": 0.18478638416116708, "grad_norm": 0.47186675698812003, "learning_rate": 9.386187037735004e-06, "loss": 0.0208, "step": 532 }, { "epoch": 0.1851337269885377, "grad_norm": 0.6914305009354391, "learning_rate": 9.383483432621569e-06, "loss": 0.021, "step": 533 }, { "epoch": 0.1854810698159083, "grad_norm": 0.5583142545579645, "learning_rate": 9.380774277568733e-06, "loss": 0.0169, "step": 534 }, { "epoch": 0.18582841264327893, "grad_norm": 0.5064603142442761, "learning_rate": 9.378059576006567e-06, "loss": 0.0263, "step": 535 }, { "epoch": 0.18617575547064952, "grad_norm": 0.5330806328958541, "learning_rate": 9.375339331372155e-06, "loss": 0.0191, "step": 536 }, { "epoch": 0.18652309829802013, "grad_norm": 1.945098806666559, "learning_rate": 9.37261354710961e-06, "loss": 0.0291, "step": 537 }, { "epoch": 0.18687044112539075, "grad_norm": 2.044217190353534, "learning_rate": 9.369882226670054e-06, "loss": 0.0258, "step": 538 }, { "epoch": 0.18721778395276137, "grad_norm": 0.6771224467650478, "learning_rate": 9.36714537351162e-06, "loss": 0.0168, "step": 539 }, { "epoch": 0.18756512678013199, "grad_norm": 0.695744615748055, "learning_rate": 9.36440299109944e-06, "loss": 0.015, "step": 540 }, { "epoch": 0.1879124696075026, "grad_norm": 0.4503395885743224, "learning_rate": 9.361655082905654e-06, "loss": 0.013, "step": 541 }, { "epoch": 0.18825981243487322, "grad_norm": 1.0753578393601884, "learning_rate": 9.358901652409398e-06, "loss": 0.018, "step": 542 }, { "epoch": 0.18860715526224384, "grad_norm": 0.49649547384514064, "learning_rate": 9.356142703096793e-06, "loss": 0.0147, "step": 543 }, { "epoch": 0.18895449808961445, "grad_norm": 1.5646023573061925, "learning_rate": 9.353378238460955e-06, "loss": 0.0237, "step": 544 }, { "epoch": 0.18930184091698507, "grad_norm": 0.7887255494483407, "learning_rate": 9.350608262001978e-06, "loss": 0.0175, "step": 545 }, { "epoch": 0.1896491837443557, "grad_norm": 1.0681506636597586, "learning_rate": 9.347832777226936e-06, "loss": 0.0186, "step": 546 }, { "epoch": 0.1899965265717263, "grad_norm": 0.7785556397165594, "learning_rate": 9.345051787649877e-06, "loss": 0.0184, "step": 547 }, { "epoch": 0.19034386939909692, "grad_norm": 0.509526910733021, "learning_rate": 9.34226529679182e-06, "loss": 0.0225, "step": 548 }, { "epoch": 0.1906912122264675, "grad_norm": 1.319059024617917, "learning_rate": 9.339473308180746e-06, "loss": 0.0214, "step": 549 }, { "epoch": 0.19103855505383813, "grad_norm": 0.5224211698738431, "learning_rate": 9.336675825351602e-06, "loss": 0.0257, "step": 550 }, { "epoch": 0.19138589788120874, "grad_norm": 1.1460327083281117, "learning_rate": 9.333872851846285e-06, "loss": 0.0255, "step": 551 }, { "epoch": 0.19173324070857936, "grad_norm": 0.7498296132522941, "learning_rate": 9.33106439121365e-06, "loss": 0.0262, "step": 552 }, { "epoch": 0.19208058353594998, "grad_norm": 0.38935101782706116, "learning_rate": 9.328250447009493e-06, "loss": 0.0147, "step": 553 }, { "epoch": 0.1924279263633206, "grad_norm": 0.8699070629651166, "learning_rate": 9.325431022796559e-06, "loss": 0.0213, "step": 554 }, { "epoch": 0.1927752691906912, "grad_norm": 0.8664760328709794, "learning_rate": 9.322606122144524e-06, "loss": 0.0236, "step": 555 }, { "epoch": 0.19312261201806183, "grad_norm": 2.250608635056857, "learning_rate": 9.319775748630004e-06, "loss": 0.0283, "step": 556 }, { "epoch": 0.19346995484543245, "grad_norm": 1.7828983126970328, "learning_rate": 9.316939905836543e-06, "loss": 0.0182, "step": 557 }, { "epoch": 0.19381729767280306, "grad_norm": 0.5137189754751468, "learning_rate": 9.314098597354608e-06, "loss": 0.0236, "step": 558 }, { "epoch": 0.19416464050017368, "grad_norm": 0.6266424659785979, "learning_rate": 9.311251826781587e-06, "loss": 0.0228, "step": 559 }, { "epoch": 0.1945119833275443, "grad_norm": 1.662923917212151, "learning_rate": 9.308399597721782e-06, "loss": 0.0232, "step": 560 }, { "epoch": 0.1948593261549149, "grad_norm": 0.7492399109293421, "learning_rate": 9.305541913786409e-06, "loss": 0.0224, "step": 561 }, { "epoch": 0.1952066689822855, "grad_norm": 0.5213910576317119, "learning_rate": 9.302678778593586e-06, "loss": 0.0232, "step": 562 }, { "epoch": 0.19555401180965612, "grad_norm": 1.4562306345231064, "learning_rate": 9.299810195768341e-06, "loss": 0.0282, "step": 563 }, { "epoch": 0.19590135463702674, "grad_norm": 0.9407546811393638, "learning_rate": 9.296936168942589e-06, "loss": 0.0199, "step": 564 }, { "epoch": 0.19624869746439735, "grad_norm": 0.8454850863651415, "learning_rate": 9.294056701755144e-06, "loss": 0.0169, "step": 565 }, { "epoch": 0.19659604029176797, "grad_norm": 0.27061382482871477, "learning_rate": 9.291171797851708e-06, "loss": 0.0131, "step": 566 }, { "epoch": 0.1969433831191386, "grad_norm": 0.49862269994808495, "learning_rate": 9.288281460884864e-06, "loss": 0.0171, "step": 567 }, { "epoch": 0.1972907259465092, "grad_norm": 0.8522107595930313, "learning_rate": 9.285385694514075e-06, "loss": 0.0218, "step": 568 }, { "epoch": 0.19763806877387982, "grad_norm": 0.8148444399323233, "learning_rate": 9.282484502405677e-06, "loss": 0.021, "step": 569 }, { "epoch": 0.19798541160125044, "grad_norm": 1.375134823060513, "learning_rate": 9.27957788823288e-06, "loss": 0.0298, "step": 570 }, { "epoch": 0.19833275442862106, "grad_norm": 1.1166507276668756, "learning_rate": 9.276665855675751e-06, "loss": 0.0186, "step": 571 }, { "epoch": 0.19868009725599167, "grad_norm": 1.3330794003539927, "learning_rate": 9.273748408421224e-06, "loss": 0.0254, "step": 572 }, { "epoch": 0.1990274400833623, "grad_norm": 0.369557234187912, "learning_rate": 9.270825550163088e-06, "loss": 0.0144, "step": 573 }, { "epoch": 0.1993747829107329, "grad_norm": 0.5934833218153897, "learning_rate": 9.267897284601976e-06, "loss": 0.0234, "step": 574 }, { "epoch": 0.1997221257381035, "grad_norm": 0.5376211438611554, "learning_rate": 9.264963615445378e-06, "loss": 0.0159, "step": 575 }, { "epoch": 0.2000694685654741, "grad_norm": 0.5613556695221377, "learning_rate": 9.26202454640762e-06, "loss": 0.0193, "step": 576 }, { "epoch": 0.20041681139284473, "grad_norm": 0.8368239688441027, "learning_rate": 9.259080081209861e-06, "loss": 0.023, "step": 577 }, { "epoch": 0.20076415422021535, "grad_norm": 1.2341577296539838, "learning_rate": 9.256130223580096e-06, "loss": 0.0198, "step": 578 }, { "epoch": 0.20111149704758596, "grad_norm": 0.3531135473026086, "learning_rate": 9.25317497725315e-06, "loss": 0.0081, "step": 579 }, { "epoch": 0.20145883987495658, "grad_norm": 0.7328484748799605, "learning_rate": 9.250214345970665e-06, "loss": 0.0184, "step": 580 }, { "epoch": 0.2018061827023272, "grad_norm": 1.9853290246566473, "learning_rate": 9.247248333481105e-06, "loss": 0.0214, "step": 581 }, { "epoch": 0.20215352552969781, "grad_norm": 1.4811646122579638, "learning_rate": 9.244276943539746e-06, "loss": 0.0241, "step": 582 }, { "epoch": 0.20250086835706843, "grad_norm": 1.272707363559077, "learning_rate": 9.241300179908672e-06, "loss": 0.0339, "step": 583 }, { "epoch": 0.20284821118443905, "grad_norm": 0.6314607624470274, "learning_rate": 9.238318046356772e-06, "loss": 0.0245, "step": 584 }, { "epoch": 0.20319555401180966, "grad_norm": 1.0288562093872682, "learning_rate": 9.235330546659731e-06, "loss": 0.0232, "step": 585 }, { "epoch": 0.20354289683918028, "grad_norm": 0.6216044232979469, "learning_rate": 9.23233768460003e-06, "loss": 0.0163, "step": 586 }, { "epoch": 0.2038902396665509, "grad_norm": 0.7839133361622146, "learning_rate": 9.229339463966942e-06, "loss": 0.017, "step": 587 }, { "epoch": 0.2042375824939215, "grad_norm": 1.065822018842104, "learning_rate": 9.226335888556517e-06, "loss": 0.0195, "step": 588 }, { "epoch": 0.2045849253212921, "grad_norm": 1.0799416510268132, "learning_rate": 9.223326962171594e-06, "loss": 0.0329, "step": 589 }, { "epoch": 0.20493226814866272, "grad_norm": 2.7643010329522597, "learning_rate": 9.22031268862178e-06, "loss": 0.0276, "step": 590 }, { "epoch": 0.20527961097603334, "grad_norm": 2.7311705318356703, "learning_rate": 9.217293071723455e-06, "loss": 0.0267, "step": 591 }, { "epoch": 0.20562695380340396, "grad_norm": 0.5493925419488683, "learning_rate": 9.214268115299761e-06, "loss": 0.0264, "step": 592 }, { "epoch": 0.20597429663077457, "grad_norm": 1.2139756694249682, "learning_rate": 9.211237823180605e-06, "loss": 0.0216, "step": 593 }, { "epoch": 0.2063216394581452, "grad_norm": 0.4237660059277888, "learning_rate": 9.208202199202649e-06, "loss": 0.0141, "step": 594 }, { "epoch": 0.2066689822855158, "grad_norm": 0.8419781686821314, "learning_rate": 9.205161247209303e-06, "loss": 0.0166, "step": 595 }, { "epoch": 0.20701632511288642, "grad_norm": 1.0055329342881736, "learning_rate": 9.202114971050722e-06, "loss": 0.0269, "step": 596 }, { "epoch": 0.20736366794025704, "grad_norm": 0.58123379031455, "learning_rate": 9.199063374583807e-06, "loss": 0.0272, "step": 597 }, { "epoch": 0.20771101076762766, "grad_norm": 1.258373507803076, "learning_rate": 9.19600646167219e-06, "loss": 0.0243, "step": 598 }, { "epoch": 0.20805835359499827, "grad_norm": 0.8337160786397171, "learning_rate": 9.192944236186237e-06, "loss": 0.0175, "step": 599 }, { "epoch": 0.2084056964223689, "grad_norm": 0.5397096011880417, "learning_rate": 9.189876702003037e-06, "loss": 0.0268, "step": 600 }, { "epoch": 0.20875303924973948, "grad_norm": 2.0609037827991252, "learning_rate": 9.186803863006408e-06, "loss": 0.0288, "step": 601 }, { "epoch": 0.2091003820771101, "grad_norm": 1.1561596599740218, "learning_rate": 9.183725723086873e-06, "loss": 0.0149, "step": 602 }, { "epoch": 0.20944772490448071, "grad_norm": 1.7470410681014348, "learning_rate": 9.180642286141678e-06, "loss": 0.0284, "step": 603 }, { "epoch": 0.20979506773185133, "grad_norm": 0.44157047622672, "learning_rate": 9.177553556074766e-06, "loss": 0.0204, "step": 604 }, { "epoch": 0.21014241055922195, "grad_norm": 0.44448775934421736, "learning_rate": 9.17445953679679e-06, "loss": 0.0188, "step": 605 }, { "epoch": 0.21048975338659257, "grad_norm": 1.4869913866249045, "learning_rate": 9.171360232225091e-06, "loss": 0.0216, "step": 606 }, { "epoch": 0.21083709621396318, "grad_norm": 0.7283847092929767, "learning_rate": 9.16825564628371e-06, "loss": 0.0226, "step": 607 }, { "epoch": 0.2111844390413338, "grad_norm": 0.44538635448331376, "learning_rate": 9.165145782903369e-06, "loss": 0.0112, "step": 608 }, { "epoch": 0.21153178186870442, "grad_norm": 0.5848098598044253, "learning_rate": 9.162030646021477e-06, "loss": 0.0272, "step": 609 }, { "epoch": 0.21187912469607503, "grad_norm": 0.6127021429282402, "learning_rate": 9.15891023958211e-06, "loss": 0.0183, "step": 610 }, { "epoch": 0.21222646752344565, "grad_norm": 0.8170743093460934, "learning_rate": 9.15578456753603e-06, "loss": 0.0283, "step": 611 }, { "epoch": 0.21257381035081627, "grad_norm": 1.1249613859894405, "learning_rate": 9.152653633840654e-06, "loss": 0.026, "step": 612 }, { "epoch": 0.21292115317818688, "grad_norm": 0.985266358777264, "learning_rate": 9.149517442460065e-06, "loss": 0.0242, "step": 613 }, { "epoch": 0.21326849600555747, "grad_norm": 0.5401136574602342, "learning_rate": 9.146375997365006e-06, "loss": 0.0236, "step": 614 }, { "epoch": 0.2136158388329281, "grad_norm": 0.8086803852304764, "learning_rate": 9.143229302532866e-06, "loss": 0.0258, "step": 615 }, { "epoch": 0.2139631816602987, "grad_norm": 0.8975085166076582, "learning_rate": 9.140077361947681e-06, "loss": 0.0194, "step": 616 }, { "epoch": 0.21431052448766932, "grad_norm": 1.1788638290843472, "learning_rate": 9.136920179600137e-06, "loss": 0.0204, "step": 617 }, { "epoch": 0.21465786731503994, "grad_norm": 0.5154845254827837, "learning_rate": 9.133757759487545e-06, "loss": 0.021, "step": 618 }, { "epoch": 0.21500521014241056, "grad_norm": 0.8015539062724568, "learning_rate": 9.130590105613854e-06, "loss": 0.0227, "step": 619 }, { "epoch": 0.21535255296978117, "grad_norm": 1.5385432558467305, "learning_rate": 9.127417221989643e-06, "loss": 0.0423, "step": 620 }, { "epoch": 0.2156998957971518, "grad_norm": 0.9984997426393253, "learning_rate": 9.1242391126321e-06, "loss": 0.0217, "step": 621 }, { "epoch": 0.2160472386245224, "grad_norm": 0.28392172266267085, "learning_rate": 9.121055781565044e-06, "loss": 0.0238, "step": 622 }, { "epoch": 0.21639458145189303, "grad_norm": 1.264151293918839, "learning_rate": 9.117867232818897e-06, "loss": 0.0209, "step": 623 }, { "epoch": 0.21674192427926364, "grad_norm": 1.0368184665287132, "learning_rate": 9.114673470430688e-06, "loss": 0.021, "step": 624 }, { "epoch": 0.21708926710663426, "grad_norm": 0.6918438110380035, "learning_rate": 9.111474498444046e-06, "loss": 0.0263, "step": 625 }, { "epoch": 0.21743660993400488, "grad_norm": 2.1367175315409477, "learning_rate": 9.1082703209092e-06, "loss": 0.0295, "step": 626 }, { "epoch": 0.21778395276137547, "grad_norm": 0.7119870045258382, "learning_rate": 9.105060941882966e-06, "loss": 0.023, "step": 627 }, { "epoch": 0.21813129558874608, "grad_norm": 1.14033999216725, "learning_rate": 9.101846365428747e-06, "loss": 0.0164, "step": 628 }, { "epoch": 0.2184786384161167, "grad_norm": 0.4345990776226327, "learning_rate": 9.098626595616527e-06, "loss": 0.0226, "step": 629 }, { "epoch": 0.21882598124348732, "grad_norm": 1.4156487567989067, "learning_rate": 9.095401636522863e-06, "loss": 0.0254, "step": 630 }, { "epoch": 0.21917332407085793, "grad_norm": 1.7788155380270627, "learning_rate": 9.092171492230883e-06, "loss": 0.0287, "step": 631 }, { "epoch": 0.21952066689822855, "grad_norm": 0.9127292614192289, "learning_rate": 9.088936166830285e-06, "loss": 0.0266, "step": 632 }, { "epoch": 0.21986800972559917, "grad_norm": 0.847423011464306, "learning_rate": 9.08569566441732e-06, "loss": 0.0232, "step": 633 }, { "epoch": 0.22021535255296978, "grad_norm": 0.43930165735953547, "learning_rate": 9.082449989094798e-06, "loss": 0.0183, "step": 634 }, { "epoch": 0.2205626953803404, "grad_norm": 0.7809095217627077, "learning_rate": 9.079199144972072e-06, "loss": 0.014, "step": 635 }, { "epoch": 0.22091003820771102, "grad_norm": 1.0253606245776024, "learning_rate": 9.075943136165049e-06, "loss": 0.0156, "step": 636 }, { "epoch": 0.22125738103508164, "grad_norm": 1.5700732329185856, "learning_rate": 9.072681966796169e-06, "loss": 0.0255, "step": 637 }, { "epoch": 0.22160472386245225, "grad_norm": 0.49771019117253024, "learning_rate": 9.069415640994403e-06, "loss": 0.0243, "step": 638 }, { "epoch": 0.22195206668982287, "grad_norm": 0.46212016511074455, "learning_rate": 9.066144162895259e-06, "loss": 0.023, "step": 639 }, { "epoch": 0.22229940951719346, "grad_norm": 0.7782758261073233, "learning_rate": 9.062867536640762e-06, "loss": 0.017, "step": 640 }, { "epoch": 0.22264675234456408, "grad_norm": 0.7838906985841836, "learning_rate": 9.059585766379455e-06, "loss": 0.0165, "step": 641 }, { "epoch": 0.2229940951719347, "grad_norm": 1.011196077425446, "learning_rate": 9.056298856266399e-06, "loss": 0.0218, "step": 642 }, { "epoch": 0.2233414379993053, "grad_norm": 1.518375606641246, "learning_rate": 9.053006810463156e-06, "loss": 0.0254, "step": 643 }, { "epoch": 0.22368878082667593, "grad_norm": 0.5838655335928358, "learning_rate": 9.049709633137796e-06, "loss": 0.0255, "step": 644 }, { "epoch": 0.22403612365404654, "grad_norm": 0.7210470798591194, "learning_rate": 9.04640732846488e-06, "loss": 0.0238, "step": 645 }, { "epoch": 0.22438346648141716, "grad_norm": 0.6437322596951384, "learning_rate": 9.043099900625468e-06, "loss": 0.0237, "step": 646 }, { "epoch": 0.22473080930878778, "grad_norm": 1.0308720116120658, "learning_rate": 9.039787353807101e-06, "loss": 0.0305, "step": 647 }, { "epoch": 0.2250781521361584, "grad_norm": 1.6497385641943867, "learning_rate": 9.036469692203804e-06, "loss": 0.0296, "step": 648 }, { "epoch": 0.225425494963529, "grad_norm": 0.3313837148743059, "learning_rate": 9.033146920016073e-06, "loss": 0.0224, "step": 649 }, { "epoch": 0.22577283779089963, "grad_norm": 0.5293106608754529, "learning_rate": 9.029819041450884e-06, "loss": 0.0203, "step": 650 }, { "epoch": 0.22612018061827024, "grad_norm": 1.1616316561406497, "learning_rate": 9.026486060721668e-06, "loss": 0.0288, "step": 651 }, { "epoch": 0.22646752344564083, "grad_norm": 1.3686370104380872, "learning_rate": 9.023147982048322e-06, "loss": 0.0247, "step": 652 }, { "epoch": 0.22681486627301145, "grad_norm": 0.9530520238325823, "learning_rate": 9.019804809657195e-06, "loss": 0.0222, "step": 653 }, { "epoch": 0.22716220910038207, "grad_norm": 0.4302751935992573, "learning_rate": 9.016456547781088e-06, "loss": 0.0222, "step": 654 }, { "epoch": 0.22750955192775268, "grad_norm": 0.4268832341371163, "learning_rate": 9.01310320065924e-06, "loss": 0.0189, "step": 655 }, { "epoch": 0.2278568947551233, "grad_norm": 0.3637254931806751, "learning_rate": 9.009744772537336e-06, "loss": 0.0208, "step": 656 }, { "epoch": 0.22820423758249392, "grad_norm": 0.9443986440525276, "learning_rate": 9.006381267667489e-06, "loss": 0.0157, "step": 657 }, { "epoch": 0.22855158040986454, "grad_norm": 1.033293442849867, "learning_rate": 9.00301269030824e-06, "loss": 0.0235, "step": 658 }, { "epoch": 0.22889892323723515, "grad_norm": 0.7206031262994533, "learning_rate": 8.999639044724555e-06, "loss": 0.0144, "step": 659 }, { "epoch": 0.22924626606460577, "grad_norm": 0.5569325989347864, "learning_rate": 8.996260335187813e-06, "loss": 0.0273, "step": 660 }, { "epoch": 0.2295936088919764, "grad_norm": 0.6950933663288503, "learning_rate": 8.992876565975809e-06, "loss": 0.0351, "step": 661 }, { "epoch": 0.229940951719347, "grad_norm": 0.961443300736354, "learning_rate": 8.98948774137274e-06, "loss": 0.0249, "step": 662 }, { "epoch": 0.23028829454671762, "grad_norm": 1.9014588737941847, "learning_rate": 8.986093865669205e-06, "loss": 0.0313, "step": 663 }, { "epoch": 0.23063563737408824, "grad_norm": 0.9492079615362812, "learning_rate": 8.9826949431622e-06, "loss": 0.0248, "step": 664 }, { "epoch": 0.23098298020145883, "grad_norm": 0.5262876532553753, "learning_rate": 8.97929097815511e-06, "loss": 0.0271, "step": 665 }, { "epoch": 0.23133032302882944, "grad_norm": 1.4774684801073679, "learning_rate": 8.9758819749577e-06, "loss": 0.0259, "step": 666 }, { "epoch": 0.23167766585620006, "grad_norm": 0.47912155013309676, "learning_rate": 8.972467937886122e-06, "loss": 0.0198, "step": 667 }, { "epoch": 0.23202500868357068, "grad_norm": 0.4857405075161966, "learning_rate": 8.969048871262895e-06, "loss": 0.021, "step": 668 }, { "epoch": 0.2323723515109413, "grad_norm": 0.5501028852401879, "learning_rate": 8.965624779416907e-06, "loss": 0.0229, "step": 669 }, { "epoch": 0.2327196943383119, "grad_norm": 0.37333376724281975, "learning_rate": 8.96219566668341e-06, "loss": 0.0216, "step": 670 }, { "epoch": 0.23306703716568253, "grad_norm": 0.48066801556311134, "learning_rate": 8.958761537404012e-06, "loss": 0.0198, "step": 671 }, { "epoch": 0.23341437999305314, "grad_norm": 0.813514292061828, "learning_rate": 8.955322395926673e-06, "loss": 0.0139, "step": 672 }, { "epoch": 0.23376172282042376, "grad_norm": 0.7125740685535722, "learning_rate": 8.9518782466057e-06, "loss": 0.0158, "step": 673 }, { "epoch": 0.23410906564779438, "grad_norm": 0.3825152111739057, "learning_rate": 8.948429093801738e-06, "loss": 0.0215, "step": 674 }, { "epoch": 0.234456408475165, "grad_norm": 1.3795608714085683, "learning_rate": 8.944974941881766e-06, "loss": 0.0274, "step": 675 }, { "epoch": 0.2348037513025356, "grad_norm": 0.5497870254567753, "learning_rate": 8.941515795219098e-06, "loss": 0.0197, "step": 676 }, { "epoch": 0.23515109412990623, "grad_norm": 0.8354225349175881, "learning_rate": 8.938051658193365e-06, "loss": 0.0227, "step": 677 }, { "epoch": 0.23549843695727682, "grad_norm": 0.5221880740171788, "learning_rate": 8.934582535190522e-06, "loss": 0.0212, "step": 678 }, { "epoch": 0.23584577978464744, "grad_norm": 0.5930924226753035, "learning_rate": 8.931108430602834e-06, "loss": 0.0176, "step": 679 }, { "epoch": 0.23619312261201805, "grad_norm": 1.4128463606001023, "learning_rate": 8.927629348828874e-06, "loss": 0.0171, "step": 680 }, { "epoch": 0.23654046543938867, "grad_norm": 0.8753822994129902, "learning_rate": 8.924145294273515e-06, "loss": 0.0211, "step": 681 }, { "epoch": 0.2368878082667593, "grad_norm": 0.7822190197357625, "learning_rate": 8.920656271347925e-06, "loss": 0.0251, "step": 682 }, { "epoch": 0.2372351510941299, "grad_norm": 1.4919418777559417, "learning_rate": 8.917162284469569e-06, "loss": 0.0224, "step": 683 }, { "epoch": 0.23758249392150052, "grad_norm": 1.5473820670531835, "learning_rate": 8.91366333806219e-06, "loss": 0.0277, "step": 684 }, { "epoch": 0.23792983674887114, "grad_norm": 1.1016478393828795, "learning_rate": 8.910159436555813e-06, "loss": 0.0252, "step": 685 }, { "epoch": 0.23827717957624175, "grad_norm": 1.5287564476717936, "learning_rate": 8.90665058438674e-06, "loss": 0.0188, "step": 686 }, { "epoch": 0.23862452240361237, "grad_norm": 0.7366291157632144, "learning_rate": 8.903136785997533e-06, "loss": 0.0295, "step": 687 }, { "epoch": 0.238971865230983, "grad_norm": 0.7896406244087025, "learning_rate": 8.899618045837025e-06, "loss": 0.0237, "step": 688 }, { "epoch": 0.2393192080583536, "grad_norm": 1.1875606974598063, "learning_rate": 8.896094368360297e-06, "loss": 0.0162, "step": 689 }, { "epoch": 0.23966655088572422, "grad_norm": 0.47042040685656056, "learning_rate": 8.892565758028688e-06, "loss": 0.0282, "step": 690 }, { "epoch": 0.2400138937130948, "grad_norm": 0.5817301391235052, "learning_rate": 8.889032219309781e-06, "loss": 0.0291, "step": 691 }, { "epoch": 0.24036123654046543, "grad_norm": 0.6761086942386368, "learning_rate": 8.885493756677399e-06, "loss": 0.0248, "step": 692 }, { "epoch": 0.24070857936783605, "grad_norm": 0.8209577079998156, "learning_rate": 8.881950374611597e-06, "loss": 0.0196, "step": 693 }, { "epoch": 0.24105592219520666, "grad_norm": 1.314377357321471, "learning_rate": 8.878402077598662e-06, "loss": 0.023, "step": 694 }, { "epoch": 0.24140326502257728, "grad_norm": 1.1012244390968478, "learning_rate": 8.874848870131098e-06, "loss": 0.0255, "step": 695 }, { "epoch": 0.2417506078499479, "grad_norm": 0.7845674765254901, "learning_rate": 8.871290756707634e-06, "loss": 0.0221, "step": 696 }, { "epoch": 0.2420979506773185, "grad_norm": 0.59132264309778, "learning_rate": 8.867727741833204e-06, "loss": 0.0179, "step": 697 }, { "epoch": 0.24244529350468913, "grad_norm": 1.2702082566630686, "learning_rate": 8.86415983001895e-06, "loss": 0.0284, "step": 698 }, { "epoch": 0.24279263633205975, "grad_norm": 0.7795467424110684, "learning_rate": 8.860587025782215e-06, "loss": 0.0171, "step": 699 }, { "epoch": 0.24313997915943036, "grad_norm": 1.84470369765152, "learning_rate": 8.857009333646535e-06, "loss": 0.0214, "step": 700 }, { "epoch": 0.24348732198680098, "grad_norm": 1.9319346606245589, "learning_rate": 8.853426758141635e-06, "loss": 0.0242, "step": 701 }, { "epoch": 0.2438346648141716, "grad_norm": 1.3940656814702743, "learning_rate": 8.849839303803425e-06, "loss": 0.0239, "step": 702 }, { "epoch": 0.24418200764154221, "grad_norm": 0.6685433203227141, "learning_rate": 8.846246975173985e-06, "loss": 0.0224, "step": 703 }, { "epoch": 0.2445293504689128, "grad_norm": 0.8276105742660492, "learning_rate": 8.842649776801576e-06, "loss": 0.031, "step": 704 }, { "epoch": 0.24487669329628342, "grad_norm": 0.9152658175517937, "learning_rate": 8.839047713240619e-06, "loss": 0.0157, "step": 705 }, { "epoch": 0.24522403612365404, "grad_norm": 1.0704740818766338, "learning_rate": 8.835440789051692e-06, "loss": 0.0143, "step": 706 }, { "epoch": 0.24557137895102465, "grad_norm": 1.0779902900776792, "learning_rate": 8.831829008801536e-06, "loss": 0.0175, "step": 707 }, { "epoch": 0.24591872177839527, "grad_norm": 0.6030988446891595, "learning_rate": 8.828212377063033e-06, "loss": 0.0194, "step": 708 }, { "epoch": 0.2462660646057659, "grad_norm": 0.8610359328267158, "learning_rate": 8.824590898415209e-06, "loss": 0.0258, "step": 709 }, { "epoch": 0.2466134074331365, "grad_norm": 0.5357367765825686, "learning_rate": 8.820964577443227e-06, "loss": 0.0278, "step": 710 }, { "epoch": 0.24696075026050712, "grad_norm": 1.5065093568870387, "learning_rate": 8.817333418738382e-06, "loss": 0.0195, "step": 711 }, { "epoch": 0.24730809308787774, "grad_norm": 1.4013893590690267, "learning_rate": 8.813697426898094e-06, "loss": 0.0276, "step": 712 }, { "epoch": 0.24765543591524836, "grad_norm": 1.4557088461414078, "learning_rate": 8.810056606525899e-06, "loss": 0.0232, "step": 713 }, { "epoch": 0.24800277874261897, "grad_norm": 1.1924673869323386, "learning_rate": 8.80641096223145e-06, "loss": 0.0163, "step": 714 }, { "epoch": 0.2483501215699896, "grad_norm": 0.42056499190089386, "learning_rate": 8.802760498630507e-06, "loss": 0.0249, "step": 715 }, { "epoch": 0.2486974643973602, "grad_norm": 0.3585210385651358, "learning_rate": 8.79910522034493e-06, "loss": 0.019, "step": 716 }, { "epoch": 0.2490448072247308, "grad_norm": 0.46610691242048347, "learning_rate": 8.795445132002679e-06, "loss": 0.0154, "step": 717 }, { "epoch": 0.2493921500521014, "grad_norm": 1.3759596235421945, "learning_rate": 8.791780238237794e-06, "loss": 0.0249, "step": 718 }, { "epoch": 0.24973949287947203, "grad_norm": 10.041742445276327, "learning_rate": 8.788110543690415e-06, "loss": 0.0274, "step": 719 }, { "epoch": 0.25008683570684265, "grad_norm": 0.9018639927916149, "learning_rate": 8.784436053006746e-06, "loss": 0.026, "step": 720 }, { "epoch": 0.2504341785342133, "grad_norm": 0.6316727580755663, "learning_rate": 8.780756770839071e-06, "loss": 0.0117, "step": 721 }, { "epoch": 0.2507815213615839, "grad_norm": 0.8380724086190908, "learning_rate": 8.777072701845738e-06, "loss": 0.0286, "step": 722 }, { "epoch": 0.25112886418895447, "grad_norm": 0.7088681744129693, "learning_rate": 8.773383850691155e-06, "loss": 0.0264, "step": 723 }, { "epoch": 0.2514762070163251, "grad_norm": 1.6040597806062724, "learning_rate": 8.769690222045787e-06, "loss": 0.0213, "step": 724 }, { "epoch": 0.2518235498436957, "grad_norm": 0.59234889125779, "learning_rate": 8.765991820586147e-06, "loss": 0.0193, "step": 725 }, { "epoch": 0.25217089267106635, "grad_norm": 1.3987574754257743, "learning_rate": 8.762288650994786e-06, "loss": 0.0302, "step": 726 }, { "epoch": 0.25251823549843694, "grad_norm": 0.7554420842981747, "learning_rate": 8.758580717960303e-06, "loss": 0.02, "step": 727 }, { "epoch": 0.2528655783258076, "grad_norm": 0.5299359962965386, "learning_rate": 8.754868026177317e-06, "loss": 0.0228, "step": 728 }, { "epoch": 0.25321292115317817, "grad_norm": 0.5066652476216912, "learning_rate": 8.751150580346477e-06, "loss": 0.0152, "step": 729 }, { "epoch": 0.2535602639805488, "grad_norm": 0.8528730067781101, "learning_rate": 8.747428385174452e-06, "loss": 0.0207, "step": 730 }, { "epoch": 0.2539076068079194, "grad_norm": 0.6236339038844014, "learning_rate": 8.743701445373922e-06, "loss": 0.0217, "step": 731 }, { "epoch": 0.25425494963529005, "grad_norm": 1.1276577230096638, "learning_rate": 8.739969765663574e-06, "loss": 0.018, "step": 732 }, { "epoch": 0.25460229246266064, "grad_norm": 1.0286105613817527, "learning_rate": 8.736233350768097e-06, "loss": 0.0166, "step": 733 }, { "epoch": 0.2549496352900313, "grad_norm": 1.4581045926440899, "learning_rate": 8.732492205418176e-06, "loss": 0.031, "step": 734 }, { "epoch": 0.2552969781174019, "grad_norm": 1.8876644886015665, "learning_rate": 8.728746334350483e-06, "loss": 0.027, "step": 735 }, { "epoch": 0.25564432094477246, "grad_norm": 1.0450409512942507, "learning_rate": 8.72499574230768e-06, "loss": 0.0207, "step": 736 }, { "epoch": 0.2559916637721431, "grad_norm": 0.6743560512268864, "learning_rate": 8.721240434038395e-06, "loss": 0.0244, "step": 737 }, { "epoch": 0.2563390065995137, "grad_norm": 0.8447479142593041, "learning_rate": 8.717480414297236e-06, "loss": 0.0288, "step": 738 }, { "epoch": 0.25668634942688434, "grad_norm": 0.9145315370795962, "learning_rate": 8.713715687844772e-06, "loss": 0.0185, "step": 739 }, { "epoch": 0.25703369225425493, "grad_norm": 1.5301247728960465, "learning_rate": 8.709946259447535e-06, "loss": 0.0275, "step": 740 }, { "epoch": 0.2573810350816256, "grad_norm": 1.901274677450291, "learning_rate": 8.706172133878006e-06, "loss": 0.015, "step": 741 }, { "epoch": 0.25772837790899616, "grad_norm": 0.49087360665043905, "learning_rate": 8.702393315914615e-06, "loss": 0.022, "step": 742 }, { "epoch": 0.2580757207363668, "grad_norm": 1.0922750212562884, "learning_rate": 8.698609810341733e-06, "loss": 0.0218, "step": 743 }, { "epoch": 0.2584230635637374, "grad_norm": 0.7102928208875048, "learning_rate": 8.694821621949667e-06, "loss": 0.0322, "step": 744 }, { "epoch": 0.25877040639110804, "grad_norm": 0.886964613687143, "learning_rate": 8.69102875553465e-06, "loss": 0.0159, "step": 745 }, { "epoch": 0.25911774921847863, "grad_norm": 0.7318076441241718, "learning_rate": 8.68723121589884e-06, "loss": 0.0233, "step": 746 }, { "epoch": 0.2594650920458493, "grad_norm": 0.6058986578436707, "learning_rate": 8.683429007850313e-06, "loss": 0.0251, "step": 747 }, { "epoch": 0.25981243487321987, "grad_norm": 0.9052930522075193, "learning_rate": 8.679622136203055e-06, "loss": 0.0149, "step": 748 }, { "epoch": 0.26015977770059046, "grad_norm": 1.2032045295627478, "learning_rate": 8.67581060577695e-06, "loss": 0.0224, "step": 749 }, { "epoch": 0.2605071205279611, "grad_norm": 0.539015089299102, "learning_rate": 8.671994421397793e-06, "loss": 0.0246, "step": 750 }, { "epoch": 0.2608544633553317, "grad_norm": 1.5525307354093696, "learning_rate": 8.668173587897261e-06, "loss": 0.0191, "step": 751 }, { "epoch": 0.26120180618270233, "grad_norm": 0.8769437334779882, "learning_rate": 8.664348110112923e-06, "loss": 0.0283, "step": 752 }, { "epoch": 0.2615491490100729, "grad_norm": 0.8682417310780673, "learning_rate": 8.660517992888225e-06, "loss": 0.0158, "step": 753 }, { "epoch": 0.26189649183744357, "grad_norm": 0.6663404002194353, "learning_rate": 8.656683241072488e-06, "loss": 0.0267, "step": 754 }, { "epoch": 0.26224383466481416, "grad_norm": 0.5892947695489575, "learning_rate": 8.6528438595209e-06, "loss": 0.0278, "step": 755 }, { "epoch": 0.2625911774921848, "grad_norm": 1.202175698552294, "learning_rate": 8.648999853094514e-06, "loss": 0.0266, "step": 756 }, { "epoch": 0.2629385203195554, "grad_norm": 0.7025664951173237, "learning_rate": 8.645151226660234e-06, "loss": 0.0204, "step": 757 }, { "epoch": 0.26328586314692604, "grad_norm": 0.4207213062287094, "learning_rate": 8.641297985090815e-06, "loss": 0.0231, "step": 758 }, { "epoch": 0.2636332059742966, "grad_norm": 0.5565332142739162, "learning_rate": 8.637440133264858e-06, "loss": 0.0201, "step": 759 }, { "epoch": 0.26398054880166727, "grad_norm": 0.98247646350947, "learning_rate": 8.6335776760668e-06, "loss": 0.03, "step": 760 }, { "epoch": 0.26432789162903786, "grad_norm": 1.331753783533657, "learning_rate": 8.629710618386903e-06, "loss": 0.0237, "step": 761 }, { "epoch": 0.26467523445640845, "grad_norm": 0.8360917594131502, "learning_rate": 8.625838965121263e-06, "loss": 0.0214, "step": 762 }, { "epoch": 0.2650225772837791, "grad_norm": 0.6406382478000888, "learning_rate": 8.621962721171789e-06, "loss": 0.0252, "step": 763 }, { "epoch": 0.2653699201111497, "grad_norm": 0.9906478193068576, "learning_rate": 8.618081891446201e-06, "loss": 0.0167, "step": 764 }, { "epoch": 0.2657172629385203, "grad_norm": 0.5398729124754951, "learning_rate": 8.61419648085803e-06, "loss": 0.02, "step": 765 }, { "epoch": 0.2660646057658909, "grad_norm": 1.1959531243334165, "learning_rate": 8.610306494326601e-06, "loss": 0.0253, "step": 766 }, { "epoch": 0.26641194859326156, "grad_norm": 1.1692706174579726, "learning_rate": 8.60641193677704e-06, "loss": 0.0297, "step": 767 }, { "epoch": 0.26675929142063215, "grad_norm": 1.0701239722359508, "learning_rate": 8.602512813140251e-06, "loss": 0.0214, "step": 768 }, { "epoch": 0.2671066342480028, "grad_norm": 0.4828984533321399, "learning_rate": 8.59860912835293e-06, "loss": 0.0253, "step": 769 }, { "epoch": 0.2674539770753734, "grad_norm": 1.1545942236415125, "learning_rate": 8.594700887357537e-06, "loss": 0.022, "step": 770 }, { "epoch": 0.26780131990274403, "grad_norm": 1.5002177682572735, "learning_rate": 8.59078809510231e-06, "loss": 0.0247, "step": 771 }, { "epoch": 0.2681486627301146, "grad_norm": 2.1953312403285024, "learning_rate": 8.58687075654124e-06, "loss": 0.0263, "step": 772 }, { "epoch": 0.26849600555748526, "grad_norm": 1.2121243717081425, "learning_rate": 8.582948876634084e-06, "loss": 0.0167, "step": 773 }, { "epoch": 0.26884334838485585, "grad_norm": 0.7901330703897526, "learning_rate": 8.579022460346343e-06, "loss": 0.0207, "step": 774 }, { "epoch": 0.26919069121222644, "grad_norm": 0.44307666800843953, "learning_rate": 8.57509151264926e-06, "loss": 0.0142, "step": 775 }, { "epoch": 0.2695380340395971, "grad_norm": 1.1332463643190542, "learning_rate": 8.57115603851982e-06, "loss": 0.0248, "step": 776 }, { "epoch": 0.2698853768669677, "grad_norm": 1.2837269229641983, "learning_rate": 8.567216042940735e-06, "loss": 0.0159, "step": 777 }, { "epoch": 0.2702327196943383, "grad_norm": 2.1182651419994714, "learning_rate": 8.563271530900448e-06, "loss": 0.0345, "step": 778 }, { "epoch": 0.2705800625217089, "grad_norm": 1.0165246753047423, "learning_rate": 8.55932250739311e-06, "loss": 0.0254, "step": 779 }, { "epoch": 0.27092740534907955, "grad_norm": 2.105285605392031, "learning_rate": 8.555368977418593e-06, "loss": 0.0306, "step": 780 }, { "epoch": 0.27127474817645014, "grad_norm": 1.9706982563429236, "learning_rate": 8.551410945982469e-06, "loss": 0.0388, "step": 781 }, { "epoch": 0.2716220910038208, "grad_norm": 1.2642900443762466, "learning_rate": 8.547448418096012e-06, "loss": 0.0169, "step": 782 }, { "epoch": 0.2719694338311914, "grad_norm": 0.6266295783177325, "learning_rate": 8.543481398776188e-06, "loss": 0.0218, "step": 783 }, { "epoch": 0.272316776658562, "grad_norm": 0.5973986334152628, "learning_rate": 8.539509893045654e-06, "loss": 0.0338, "step": 784 }, { "epoch": 0.2726641194859326, "grad_norm": 2.4245373478500687, "learning_rate": 8.535533905932739e-06, "loss": 0.0231, "step": 785 }, { "epoch": 0.27301146231330325, "grad_norm": 2.516583165841234, "learning_rate": 8.531553442471453e-06, "loss": 0.0262, "step": 786 }, { "epoch": 0.27335880514067384, "grad_norm": 3.32894137710812, "learning_rate": 8.527568507701467e-06, "loss": 0.0319, "step": 787 }, { "epoch": 0.27370614796804443, "grad_norm": 2.3072141104017625, "learning_rate": 8.523579106668121e-06, "loss": 0.0308, "step": 788 }, { "epoch": 0.2740534907954151, "grad_norm": 1.1713281727561171, "learning_rate": 8.519585244422405e-06, "loss": 0.0184, "step": 789 }, { "epoch": 0.27440083362278567, "grad_norm": 1.0031858926420991, "learning_rate": 8.515586926020959e-06, "loss": 0.0235, "step": 790 }, { "epoch": 0.2747481764501563, "grad_norm": 0.36533679027936755, "learning_rate": 8.511584156526059e-06, "loss": 0.0183, "step": 791 }, { "epoch": 0.2750955192775269, "grad_norm": 1.1097620564934818, "learning_rate": 8.507576941005626e-06, "loss": 0.0265, "step": 792 }, { "epoch": 0.27544286210489755, "grad_norm": 0.8446308489408966, "learning_rate": 8.503565284533206e-06, "loss": 0.0201, "step": 793 }, { "epoch": 0.27579020493226813, "grad_norm": 1.5612638631332916, "learning_rate": 8.499549192187965e-06, "loss": 0.022, "step": 794 }, { "epoch": 0.2761375477596388, "grad_norm": 1.0436423254930283, "learning_rate": 8.495528669054688e-06, "loss": 0.0188, "step": 795 }, { "epoch": 0.27648489058700937, "grad_norm": 0.7033478660193114, "learning_rate": 8.49150372022377e-06, "loss": 0.016, "step": 796 }, { "epoch": 0.27683223341438, "grad_norm": 0.7998512004622108, "learning_rate": 8.48747435079121e-06, "loss": 0.0297, "step": 797 }, { "epoch": 0.2771795762417506, "grad_norm": 0.5199486841483459, "learning_rate": 8.483440565858599e-06, "loss": 0.0216, "step": 798 }, { "epoch": 0.27752691906912125, "grad_norm": 1.2032922365716692, "learning_rate": 8.479402370533127e-06, "loss": 0.03, "step": 799 }, { "epoch": 0.27787426189649184, "grad_norm": 1.9065992211352163, "learning_rate": 8.47535976992756e-06, "loss": 0.0242, "step": 800 }, { "epoch": 0.2782216047238624, "grad_norm": 0.7910306233638731, "learning_rate": 8.471312769160247e-06, "loss": 0.014, "step": 801 }, { "epoch": 0.27856894755123307, "grad_norm": 0.9658546663370163, "learning_rate": 8.467261373355104e-06, "loss": 0.0234, "step": 802 }, { "epoch": 0.27891629037860366, "grad_norm": 0.6546478423971199, "learning_rate": 8.463205587641614e-06, "loss": 0.0189, "step": 803 }, { "epoch": 0.2792636332059743, "grad_norm": 0.6930129832525015, "learning_rate": 8.459145417154817e-06, "loss": 0.0258, "step": 804 }, { "epoch": 0.2796109760333449, "grad_norm": 0.4441678217661302, "learning_rate": 8.455080867035307e-06, "loss": 0.0125, "step": 805 }, { "epoch": 0.27995831886071554, "grad_norm": 1.1591532386678776, "learning_rate": 8.451011942429219e-06, "loss": 0.0283, "step": 806 }, { "epoch": 0.2803056616880861, "grad_norm": 1.2540462658684661, "learning_rate": 8.44693864848823e-06, "loss": 0.0144, "step": 807 }, { "epoch": 0.28065300451545677, "grad_norm": 0.8407468838106213, "learning_rate": 8.442860990369545e-06, "loss": 0.0194, "step": 808 }, { "epoch": 0.28100034734282736, "grad_norm": 1.02240345065518, "learning_rate": 8.438778973235904e-06, "loss": 0.0372, "step": 809 }, { "epoch": 0.281347690170198, "grad_norm": 0.711512705654793, "learning_rate": 8.43469260225555e-06, "loss": 0.0136, "step": 810 }, { "epoch": 0.2816950329975686, "grad_norm": 0.5328105749063056, "learning_rate": 8.430601882602256e-06, "loss": 0.0228, "step": 811 }, { "epoch": 0.28204237582493924, "grad_norm": 0.4977620396488783, "learning_rate": 8.426506819455285e-06, "loss": 0.0173, "step": 812 }, { "epoch": 0.28238971865230983, "grad_norm": 0.7517007103690861, "learning_rate": 8.422407417999413e-06, "loss": 0.0215, "step": 813 }, { "epoch": 0.2827370614796804, "grad_norm": 1.988030057566032, "learning_rate": 8.418303683424898e-06, "loss": 0.0236, "step": 814 }, { "epoch": 0.28308440430705106, "grad_norm": 0.564222331342316, "learning_rate": 8.414195620927491e-06, "loss": 0.0168, "step": 815 }, { "epoch": 0.28343174713442165, "grad_norm": 0.6853557880647918, "learning_rate": 8.410083235708422e-06, "loss": 0.0154, "step": 816 }, { "epoch": 0.2837790899617923, "grad_norm": 2.11338878622302, "learning_rate": 8.405966532974388e-06, "loss": 0.0264, "step": 817 }, { "epoch": 0.2841264327891629, "grad_norm": 0.8210234110109191, "learning_rate": 8.401845517937558e-06, "loss": 0.0291, "step": 818 }, { "epoch": 0.28447377561653353, "grad_norm": 0.889999073550466, "learning_rate": 8.397720195815561e-06, "loss": 0.0295, "step": 819 }, { "epoch": 0.2848211184439041, "grad_norm": 0.631173907871462, "learning_rate": 8.393590571831478e-06, "loss": 0.0217, "step": 820 }, { "epoch": 0.28516846127127476, "grad_norm": 0.7731774878641796, "learning_rate": 8.389456651213834e-06, "loss": 0.0155, "step": 821 }, { "epoch": 0.28551580409864535, "grad_norm": 1.5606323479981803, "learning_rate": 8.385318439196597e-06, "loss": 0.0249, "step": 822 }, { "epoch": 0.285863146926016, "grad_norm": 0.596618016585276, "learning_rate": 8.381175941019171e-06, "loss": 0.0325, "step": 823 }, { "epoch": 0.2862104897533866, "grad_norm": 1.0647983506484207, "learning_rate": 8.377029161926378e-06, "loss": 0.0268, "step": 824 }, { "epoch": 0.28655783258075723, "grad_norm": 0.5119216226794532, "learning_rate": 8.372878107168469e-06, "loss": 0.0238, "step": 825 }, { "epoch": 0.2869051754081278, "grad_norm": 0.46410901637663077, "learning_rate": 8.368722782001104e-06, "loss": 0.0169, "step": 826 }, { "epoch": 0.2872525182354984, "grad_norm": 1.3248050562513085, "learning_rate": 8.364563191685348e-06, "loss": 0.0224, "step": 827 }, { "epoch": 0.28759986106286906, "grad_norm": 0.9361071547212366, "learning_rate": 8.360399341487675e-06, "loss": 0.0259, "step": 828 }, { "epoch": 0.28794720389023964, "grad_norm": 1.1870448572485746, "learning_rate": 8.35623123667994e-06, "loss": 0.0193, "step": 829 }, { "epoch": 0.2882945467176103, "grad_norm": 0.9586109702853123, "learning_rate": 8.352058882539394e-06, "loss": 0.0206, "step": 830 }, { "epoch": 0.2886418895449809, "grad_norm": 0.5309149834858535, "learning_rate": 8.347882284348665e-06, "loss": 0.0208, "step": 831 }, { "epoch": 0.2889892323723515, "grad_norm": 1.3733779916187276, "learning_rate": 8.343701447395754e-06, "loss": 0.0183, "step": 832 }, { "epoch": 0.2893365751997221, "grad_norm": 1.3507083935119284, "learning_rate": 8.339516376974028e-06, "loss": 0.0228, "step": 833 }, { "epoch": 0.28968391802709276, "grad_norm": 0.9249455391192148, "learning_rate": 8.33532707838222e-06, "loss": 0.02, "step": 834 }, { "epoch": 0.29003126085446335, "grad_norm": 1.6816616489709881, "learning_rate": 8.331133556924404e-06, "loss": 0.0134, "step": 835 }, { "epoch": 0.290378603681834, "grad_norm": 1.0063521448061856, "learning_rate": 8.326935817910014e-06, "loss": 0.0188, "step": 836 }, { "epoch": 0.2907259465092046, "grad_norm": 0.8258758559049553, "learning_rate": 8.322733866653814e-06, "loss": 0.0263, "step": 837 }, { "epoch": 0.2910732893365752, "grad_norm": 1.173146724262853, "learning_rate": 8.31852770847591e-06, "loss": 0.0249, "step": 838 }, { "epoch": 0.2914206321639458, "grad_norm": 6.319556253427923, "learning_rate": 8.314317348701724e-06, "loss": 0.0334, "step": 839 }, { "epoch": 0.2917679749913164, "grad_norm": 3.142508712422825, "learning_rate": 8.310102792662006e-06, "loss": 0.0402, "step": 840 }, { "epoch": 0.29211531781868705, "grad_norm": 2.293229808909149, "learning_rate": 8.305884045692815e-06, "loss": 0.0401, "step": 841 }, { "epoch": 0.29246266064605764, "grad_norm": 1.021991216996574, "learning_rate": 8.30166111313552e-06, "loss": 0.0162, "step": 842 }, { "epoch": 0.2928100034734283, "grad_norm": 1.1326524259624917, "learning_rate": 8.297434000336781e-06, "loss": 0.0184, "step": 843 }, { "epoch": 0.29315734630079887, "grad_norm": 0.6967485472695889, "learning_rate": 8.29320271264856e-06, "loss": 0.0228, "step": 844 }, { "epoch": 0.2935046891281695, "grad_norm": 1.5690012404474116, "learning_rate": 8.288967255428102e-06, "loss": 0.0356, "step": 845 }, { "epoch": 0.2938520319555401, "grad_norm": 1.6508824584996764, "learning_rate": 8.284727634037928e-06, "loss": 0.022, "step": 846 }, { "epoch": 0.29419937478291075, "grad_norm": 1.7189879458010435, "learning_rate": 8.280483853845831e-06, "loss": 0.0176, "step": 847 }, { "epoch": 0.29454671761028134, "grad_norm": 1.3338043589868147, "learning_rate": 8.276235920224877e-06, "loss": 0.0139, "step": 848 }, { "epoch": 0.294894060437652, "grad_norm": 0.48731311842832525, "learning_rate": 8.271983838553383e-06, "loss": 0.0344, "step": 849 }, { "epoch": 0.2952414032650226, "grad_norm": 0.6636965122047247, "learning_rate": 8.26772761421492e-06, "loss": 0.0213, "step": 850 }, { "epoch": 0.2955887460923932, "grad_norm": 0.7449395913052544, "learning_rate": 8.263467252598303e-06, "loss": 0.0283, "step": 851 }, { "epoch": 0.2959360889197638, "grad_norm": 0.5866063388294399, "learning_rate": 8.25920275909759e-06, "loss": 0.0237, "step": 852 }, { "epoch": 0.2962834317471344, "grad_norm": 0.5762702547689298, "learning_rate": 8.254934139112062e-06, "loss": 0.0197, "step": 853 }, { "epoch": 0.29663077457450504, "grad_norm": 0.4188655050848168, "learning_rate": 8.250661398046236e-06, "loss": 0.0201, "step": 854 }, { "epoch": 0.29697811740187563, "grad_norm": 0.4948572771836202, "learning_rate": 8.246384541309835e-06, "loss": 0.0228, "step": 855 }, { "epoch": 0.2973254602292463, "grad_norm": 0.8130265751045145, "learning_rate": 8.242103574317802e-06, "loss": 0.0192, "step": 856 }, { "epoch": 0.29767280305661686, "grad_norm": 0.7538997021511098, "learning_rate": 8.237818502490273e-06, "loss": 0.0207, "step": 857 }, { "epoch": 0.2980201458839875, "grad_norm": 0.7989130279170902, "learning_rate": 8.233529331252598e-06, "loss": 0.0203, "step": 858 }, { "epoch": 0.2983674887113581, "grad_norm": 1.7902955280949013, "learning_rate": 8.2292360660353e-06, "loss": 0.0287, "step": 859 }, { "epoch": 0.29871483153872874, "grad_norm": 0.6270804680604328, "learning_rate": 8.224938712274097e-06, "loss": 0.0241, "step": 860 }, { "epoch": 0.29906217436609933, "grad_norm": 0.8032825382756686, "learning_rate": 8.220637275409878e-06, "loss": 0.0194, "step": 861 }, { "epoch": 0.29940951719347, "grad_norm": 1.1586121906804554, "learning_rate": 8.2163317608887e-06, "loss": 0.0207, "step": 862 }, { "epoch": 0.29975686002084057, "grad_norm": 0.9100243599349078, "learning_rate": 8.21202217416179e-06, "loss": 0.0156, "step": 863 }, { "epoch": 0.3001042028482112, "grad_norm": 0.6532828480664288, "learning_rate": 8.207708520685526e-06, "loss": 0.0187, "step": 864 }, { "epoch": 0.3004515456755818, "grad_norm": 0.5964555383406674, "learning_rate": 8.203390805921437e-06, "loss": 0.0176, "step": 865 }, { "epoch": 0.3007988885029524, "grad_norm": 0.6421563749310066, "learning_rate": 8.199069035336186e-06, "loss": 0.017, "step": 866 }, { "epoch": 0.30114623133032303, "grad_norm": 0.5207710470189775, "learning_rate": 8.194743214401587e-06, "loss": 0.0169, "step": 867 }, { "epoch": 0.3014935741576936, "grad_norm": 0.6923218916041749, "learning_rate": 8.190413348594564e-06, "loss": 0.027, "step": 868 }, { "epoch": 0.30184091698506427, "grad_norm": 0.5670711050522923, "learning_rate": 8.186079443397174e-06, "loss": 0.0278, "step": 869 }, { "epoch": 0.30218825981243486, "grad_norm": 0.7702120413403443, "learning_rate": 8.181741504296588e-06, "loss": 0.0251, "step": 870 }, { "epoch": 0.3025356026398055, "grad_norm": 0.6513641907856846, "learning_rate": 8.17739953678508e-06, "loss": 0.0193, "step": 871 }, { "epoch": 0.3028829454671761, "grad_norm": 1.2819392100935951, "learning_rate": 8.173053546360025e-06, "loss": 0.0239, "step": 872 }, { "epoch": 0.30323028829454673, "grad_norm": 0.8392899596493151, "learning_rate": 8.168703538523892e-06, "loss": 0.0281, "step": 873 }, { "epoch": 0.3035776311219173, "grad_norm": 0.5848723579885045, "learning_rate": 8.16434951878424e-06, "loss": 0.0191, "step": 874 }, { "epoch": 0.30392497394928797, "grad_norm": 1.1086246012363608, "learning_rate": 8.1599914926537e-06, "loss": 0.0167, "step": 875 }, { "epoch": 0.30427231677665856, "grad_norm": 0.7106156523093721, "learning_rate": 8.155629465649983e-06, "loss": 0.0231, "step": 876 }, { "epoch": 0.3046196596040292, "grad_norm": 0.770138606469383, "learning_rate": 8.15126344329586e-06, "loss": 0.0157, "step": 877 }, { "epoch": 0.3049670024313998, "grad_norm": 0.47477270596349086, "learning_rate": 8.146893431119166e-06, "loss": 0.0281, "step": 878 }, { "epoch": 0.3053143452587704, "grad_norm": 0.6077495292480687, "learning_rate": 8.142519434652782e-06, "loss": 0.0203, "step": 879 }, { "epoch": 0.305661688086141, "grad_norm": 0.8375044162407927, "learning_rate": 8.138141459434638e-06, "loss": 0.0207, "step": 880 }, { "epoch": 0.3060090309135116, "grad_norm": 0.7961579743921388, "learning_rate": 8.133759511007697e-06, "loss": 0.018, "step": 881 }, { "epoch": 0.30635637374088226, "grad_norm": 0.9954610793959501, "learning_rate": 8.129373594919957e-06, "loss": 0.0177, "step": 882 }, { "epoch": 0.30670371656825285, "grad_norm": 0.2668325158805243, "learning_rate": 8.124983716724434e-06, "loss": 0.0165, "step": 883 }, { "epoch": 0.3070510593956235, "grad_norm": 0.7195212219421603, "learning_rate": 8.120589881979167e-06, "loss": 0.0174, "step": 884 }, { "epoch": 0.3073984022229941, "grad_norm": 0.5395482390763875, "learning_rate": 8.116192096247202e-06, "loss": 0.0271, "step": 885 }, { "epoch": 0.3077457450503647, "grad_norm": 1.297519296316108, "learning_rate": 8.111790365096584e-06, "loss": 0.0211, "step": 886 }, { "epoch": 0.3080930878777353, "grad_norm": 1.110719079206034, "learning_rate": 8.107384694100355e-06, "loss": 0.0305, "step": 887 }, { "epoch": 0.30844043070510596, "grad_norm": 0.4497669645124215, "learning_rate": 8.102975088836551e-06, "loss": 0.0152, "step": 888 }, { "epoch": 0.30878777353247655, "grad_norm": 0.6803690138422418, "learning_rate": 8.098561554888181e-06, "loss": 0.036, "step": 889 }, { "epoch": 0.3091351163598472, "grad_norm": 0.5046240454353891, "learning_rate": 8.09414409784323e-06, "loss": 0.019, "step": 890 }, { "epoch": 0.3094824591872178, "grad_norm": 0.7226803158978641, "learning_rate": 8.089722723294654e-06, "loss": 0.0221, "step": 891 }, { "epoch": 0.3098298020145884, "grad_norm": 1.0803635312237747, "learning_rate": 8.085297436840365e-06, "loss": 0.0224, "step": 892 }, { "epoch": 0.310177144841959, "grad_norm": 0.41686248676010906, "learning_rate": 8.080868244083232e-06, "loss": 0.0257, "step": 893 }, { "epoch": 0.3105244876693296, "grad_norm": 1.0314297720832435, "learning_rate": 8.076435150631064e-06, "loss": 0.0182, "step": 894 }, { "epoch": 0.31087183049670025, "grad_norm": 2.59167850646609, "learning_rate": 8.071998162096613e-06, "loss": 0.0291, "step": 895 }, { "epoch": 0.31121917332407084, "grad_norm": 1.7037505969911395, "learning_rate": 8.06755728409756e-06, "loss": 0.0255, "step": 896 }, { "epoch": 0.3115665161514415, "grad_norm": 0.5736683932010955, "learning_rate": 8.063112522256516e-06, "loss": 0.0352, "step": 897 }, { "epoch": 0.3119138589788121, "grad_norm": 0.6074990637984911, "learning_rate": 8.058663882200998e-06, "loss": 0.0221, "step": 898 }, { "epoch": 0.3122612018061827, "grad_norm": 1.047458516096098, "learning_rate": 8.054211369563448e-06, "loss": 0.0239, "step": 899 }, { "epoch": 0.3126085446335533, "grad_norm": 1.3336377703933007, "learning_rate": 8.049754989981198e-06, "loss": 0.0258, "step": 900 }, { "epoch": 0.31295588746092395, "grad_norm": 0.5743775466749255, "learning_rate": 8.045294749096485e-06, "loss": 0.031, "step": 901 }, { "epoch": 0.31330323028829454, "grad_norm": 0.6348423077667337, "learning_rate": 8.040830652556429e-06, "loss": 0.0225, "step": 902 }, { "epoch": 0.3136505731156652, "grad_norm": 0.4690059985847731, "learning_rate": 8.036362706013033e-06, "loss": 0.0219, "step": 903 }, { "epoch": 0.3139979159430358, "grad_norm": 0.6063025129771344, "learning_rate": 8.031890915123178e-06, "loss": 0.0306, "step": 904 }, { "epoch": 0.31434525877040637, "grad_norm": 0.8197671720949243, "learning_rate": 8.02741528554861e-06, "loss": 0.023, "step": 905 }, { "epoch": 0.314692601597777, "grad_norm": 0.6251152385832162, "learning_rate": 8.02293582295593e-06, "loss": 0.0215, "step": 906 }, { "epoch": 0.3150399444251476, "grad_norm": 0.6072902912466627, "learning_rate": 8.018452533016604e-06, "loss": 0.0231, "step": 907 }, { "epoch": 0.31538728725251824, "grad_norm": 0.31262167331309026, "learning_rate": 8.01396542140693e-06, "loss": 0.0179, "step": 908 }, { "epoch": 0.31573463007988883, "grad_norm": 1.1201275890016398, "learning_rate": 8.009474493808054e-06, "loss": 0.0173, "step": 909 }, { "epoch": 0.3160819729072595, "grad_norm": 0.614103933391006, "learning_rate": 8.004979755905953e-06, "loss": 0.0204, "step": 910 }, { "epoch": 0.31642931573463007, "grad_norm": 1.2968687606304459, "learning_rate": 8.000481213391422e-06, "loss": 0.0304, "step": 911 }, { "epoch": 0.3167766585620007, "grad_norm": 0.5937663477943244, "learning_rate": 7.995978871960079e-06, "loss": 0.0205, "step": 912 }, { "epoch": 0.3171240013893713, "grad_norm": 0.5499626397370164, "learning_rate": 7.991472737312351e-06, "loss": 0.0185, "step": 913 }, { "epoch": 0.31747134421674195, "grad_norm": 1.101916609969889, "learning_rate": 7.986962815153466e-06, "loss": 0.0301, "step": 914 }, { "epoch": 0.31781868704411254, "grad_norm": 1.091262784145427, "learning_rate": 7.982449111193445e-06, "loss": 0.0208, "step": 915 }, { "epoch": 0.3181660298714832, "grad_norm": 0.6932386123391473, "learning_rate": 7.977931631147102e-06, "loss": 0.019, "step": 916 }, { "epoch": 0.31851337269885377, "grad_norm": 1.697367211349032, "learning_rate": 7.97341038073403e-06, "loss": 0.0225, "step": 917 }, { "epoch": 0.31886071552622436, "grad_norm": 0.48030348066887196, "learning_rate": 7.968885365678596e-06, "loss": 0.0198, "step": 918 }, { "epoch": 0.319208058353595, "grad_norm": 0.7911731814497744, "learning_rate": 7.96435659170993e-06, "loss": 0.0179, "step": 919 }, { "epoch": 0.3195554011809656, "grad_norm": 0.507749020308773, "learning_rate": 7.959824064561927e-06, "loss": 0.01, "step": 920 }, { "epoch": 0.31990274400833624, "grad_norm": 1.4662590811027465, "learning_rate": 7.955287789973231e-06, "loss": 0.0183, "step": 921 }, { "epoch": 0.3202500868357068, "grad_norm": 1.2953903150942983, "learning_rate": 7.950747773687231e-06, "loss": 0.0191, "step": 922 }, { "epoch": 0.32059742966307747, "grad_norm": 0.5523865660394646, "learning_rate": 7.946204021452049e-06, "loss": 0.0194, "step": 923 }, { "epoch": 0.32094477249044806, "grad_norm": 0.6034008798581597, "learning_rate": 7.941656539020546e-06, "loss": 0.0207, "step": 924 }, { "epoch": 0.3212921153178187, "grad_norm": 0.5589263526644, "learning_rate": 7.9371053321503e-06, "loss": 0.0185, "step": 925 }, { "epoch": 0.3216394581451893, "grad_norm": 0.5269322519646701, "learning_rate": 7.932550406603603e-06, "loss": 0.0196, "step": 926 }, { "epoch": 0.32198680097255994, "grad_norm": 1.2367553916721554, "learning_rate": 7.92799176814746e-06, "loss": 0.0224, "step": 927 }, { "epoch": 0.32233414379993053, "grad_norm": 0.5712602991453356, "learning_rate": 7.923429422553574e-06, "loss": 0.0239, "step": 928 }, { "epoch": 0.3226814866273012, "grad_norm": 0.7155877194834468, "learning_rate": 7.91886337559834e-06, "loss": 0.0178, "step": 929 }, { "epoch": 0.32302882945467176, "grad_norm": 0.9800497349713431, "learning_rate": 7.914293633062845e-06, "loss": 0.0189, "step": 930 }, { "epoch": 0.32337617228204235, "grad_norm": 1.6132143422048213, "learning_rate": 7.90972020073285e-06, "loss": 0.0229, "step": 931 }, { "epoch": 0.323723515109413, "grad_norm": 0.7257100719458756, "learning_rate": 7.905143084398792e-06, "loss": 0.0172, "step": 932 }, { "epoch": 0.3240708579367836, "grad_norm": 1.2683287542082773, "learning_rate": 7.900562289855763e-06, "loss": 0.017, "step": 933 }, { "epoch": 0.32441820076415423, "grad_norm": 0.7872928535413164, "learning_rate": 7.895977822903524e-06, "loss": 0.0218, "step": 934 }, { "epoch": 0.3247655435915248, "grad_norm": 0.9240714912057278, "learning_rate": 7.891389689346479e-06, "loss": 0.03, "step": 935 }, { "epoch": 0.32511288641889546, "grad_norm": 1.2754448071139646, "learning_rate": 7.886797894993674e-06, "loss": 0.0295, "step": 936 }, { "epoch": 0.32546022924626605, "grad_norm": 0.7875701058115853, "learning_rate": 7.882202445658792e-06, "loss": 0.0191, "step": 937 }, { "epoch": 0.3258075720736367, "grad_norm": 1.1225423591432484, "learning_rate": 7.877603347160144e-06, "loss": 0.0174, "step": 938 }, { "epoch": 0.3261549149010073, "grad_norm": 0.7416127576619395, "learning_rate": 7.873000605320658e-06, "loss": 0.0163, "step": 939 }, { "epoch": 0.32650225772837793, "grad_norm": 0.8331153940172414, "learning_rate": 7.868394225967881e-06, "loss": 0.0247, "step": 940 }, { "epoch": 0.3268496005557485, "grad_norm": 2.008150547125355, "learning_rate": 7.863784214933957e-06, "loss": 0.031, "step": 941 }, { "epoch": 0.32719694338311917, "grad_norm": 0.7135068543742416, "learning_rate": 7.859170578055633e-06, "loss": 0.0239, "step": 942 }, { "epoch": 0.32754428621048975, "grad_norm": 0.7525548520537905, "learning_rate": 7.85455332117425e-06, "loss": 0.0157, "step": 943 }, { "epoch": 0.32789162903786034, "grad_norm": 0.7034077910253422, "learning_rate": 7.849932450135726e-06, "loss": 0.027, "step": 944 }, { "epoch": 0.328238971865231, "grad_norm": 0.5421480780633295, "learning_rate": 7.84530797079056e-06, "loss": 0.0206, "step": 945 }, { "epoch": 0.3285863146926016, "grad_norm": 0.5588408113827876, "learning_rate": 7.84067988899381e-06, "loss": 0.0274, "step": 946 }, { "epoch": 0.3289336575199722, "grad_norm": 0.43480112336904514, "learning_rate": 7.836048210605109e-06, "loss": 0.0226, "step": 947 }, { "epoch": 0.3292810003473428, "grad_norm": 0.769442840903224, "learning_rate": 7.831412941488634e-06, "loss": 0.0148, "step": 948 }, { "epoch": 0.32962834317471346, "grad_norm": 1.3172824990527807, "learning_rate": 7.826774087513113e-06, "loss": 0.0276, "step": 949 }, { "epoch": 0.32997568600208405, "grad_norm": 2.0103119905473577, "learning_rate": 7.822131654551807e-06, "loss": 0.0295, "step": 950 }, { "epoch": 0.3303230288294547, "grad_norm": 0.41890011549619927, "learning_rate": 7.817485648482514e-06, "loss": 0.0203, "step": 951 }, { "epoch": 0.3306703716568253, "grad_norm": 0.34546681427916026, "learning_rate": 7.812836075187555e-06, "loss": 0.0159, "step": 952 }, { "epoch": 0.3310177144841959, "grad_norm": 1.3371854680462747, "learning_rate": 7.808182940553765e-06, "loss": 0.0306, "step": 953 }, { "epoch": 0.3313650573115665, "grad_norm": 1.8044729130465431, "learning_rate": 7.803526250472488e-06, "loss": 0.0338, "step": 954 }, { "epoch": 0.33171240013893716, "grad_norm": 1.0737141556403031, "learning_rate": 7.798866010839577e-06, "loss": 0.016, "step": 955 }, { "epoch": 0.33205974296630775, "grad_norm": 0.9299379110949423, "learning_rate": 7.794202227555365e-06, "loss": 0.0242, "step": 956 }, { "epoch": 0.33240708579367834, "grad_norm": 0.6512501538111303, "learning_rate": 7.789534906524684e-06, "loss": 0.0259, "step": 957 }, { "epoch": 0.332754428621049, "grad_norm": 0.7546601828240426, "learning_rate": 7.784864053656842e-06, "loss": 0.0251, "step": 958 }, { "epoch": 0.33310177144841957, "grad_norm": 0.4817810206017834, "learning_rate": 7.780189674865617e-06, "loss": 0.0198, "step": 959 }, { "epoch": 0.3334491142757902, "grad_norm": 1.3719190919474487, "learning_rate": 7.77551177606925e-06, "loss": 0.0219, "step": 960 }, { "epoch": 0.3337964571031608, "grad_norm": 1.3853549674094732, "learning_rate": 7.770830363190442e-06, "loss": 0.0184, "step": 961 }, { "epoch": 0.33414379993053145, "grad_norm": 0.9212792032039464, "learning_rate": 7.76614544215634e-06, "loss": 0.0238, "step": 962 }, { "epoch": 0.33449114275790204, "grad_norm": 0.5577408235128498, "learning_rate": 7.761457018898536e-06, "loss": 0.0222, "step": 963 }, { "epoch": 0.3348384855852727, "grad_norm": 1.9775404910250387, "learning_rate": 7.756765099353052e-06, "loss": 0.0236, "step": 964 }, { "epoch": 0.33518582841264327, "grad_norm": 1.3737059966757152, "learning_rate": 7.752069689460345e-06, "loss": 0.025, "step": 965 }, { "epoch": 0.3355331712400139, "grad_norm": 0.5574547827963353, "learning_rate": 7.747370795165277e-06, "loss": 0.0161, "step": 966 }, { "epoch": 0.3358805140673845, "grad_norm": 0.9820016637326211, "learning_rate": 7.742668422417137e-06, "loss": 0.0133, "step": 967 }, { "epoch": 0.33622785689475515, "grad_norm": 0.4953092620978607, "learning_rate": 7.737962577169606e-06, "loss": 0.0134, "step": 968 }, { "epoch": 0.33657519972212574, "grad_norm": 0.8358656568579974, "learning_rate": 7.73325326538077e-06, "loss": 0.0209, "step": 969 }, { "epoch": 0.33692254254949633, "grad_norm": 1.3762297024347394, "learning_rate": 7.728540493013098e-06, "loss": 0.0221, "step": 970 }, { "epoch": 0.337269885376867, "grad_norm": 1.8553914564827385, "learning_rate": 7.723824266033444e-06, "loss": 0.0247, "step": 971 }, { "epoch": 0.33761722820423756, "grad_norm": 1.3117680989693183, "learning_rate": 7.719104590413036e-06, "loss": 0.0241, "step": 972 }, { "epoch": 0.3379645710316082, "grad_norm": 1.1924550600193087, "learning_rate": 7.714381472127466e-06, "loss": 0.0251, "step": 973 }, { "epoch": 0.3383119138589788, "grad_norm": 0.7787134219349535, "learning_rate": 7.709654917156683e-06, "loss": 0.0213, "step": 974 }, { "epoch": 0.33865925668634944, "grad_norm": 0.5251026227118649, "learning_rate": 7.704924931484997e-06, "loss": 0.0135, "step": 975 }, { "epoch": 0.33900659951372003, "grad_norm": 0.6878735825950388, "learning_rate": 7.700191521101047e-06, "loss": 0.0193, "step": 976 }, { "epoch": 0.3393539423410907, "grad_norm": 1.2530620449693388, "learning_rate": 7.695454691997824e-06, "loss": 0.0234, "step": 977 }, { "epoch": 0.33970128516846126, "grad_norm": 1.9903113138666264, "learning_rate": 7.690714450172633e-06, "loss": 0.027, "step": 978 }, { "epoch": 0.3400486279958319, "grad_norm": 0.9816472108674068, "learning_rate": 7.685970801627108e-06, "loss": 0.0153, "step": 979 }, { "epoch": 0.3403959708232025, "grad_norm": 1.218057722488078, "learning_rate": 7.681223752367195e-06, "loss": 0.0261, "step": 980 }, { "epoch": 0.34074331365057314, "grad_norm": 0.5081769560208316, "learning_rate": 7.676473308403142e-06, "loss": 0.0149, "step": 981 }, { "epoch": 0.34109065647794373, "grad_norm": 0.541037545259369, "learning_rate": 7.671719475749502e-06, "loss": 0.0179, "step": 982 }, { "epoch": 0.3414379993053143, "grad_norm": 1.0709500749793777, "learning_rate": 7.666962260425113e-06, "loss": 0.0296, "step": 983 }, { "epoch": 0.34178534213268497, "grad_norm": 1.0183627382474816, "learning_rate": 7.662201668453098e-06, "loss": 0.0241, "step": 984 }, { "epoch": 0.34213268496005556, "grad_norm": 0.490710306987966, "learning_rate": 7.657437705860853e-06, "loss": 0.0083, "step": 985 }, { "epoch": 0.3424800277874262, "grad_norm": 0.9120936689637056, "learning_rate": 7.652670378680043e-06, "loss": 0.0183, "step": 986 }, { "epoch": 0.3428273706147968, "grad_norm": 0.8239513692459495, "learning_rate": 7.647899692946594e-06, "loss": 0.0199, "step": 987 }, { "epoch": 0.34317471344216743, "grad_norm": 0.6785851737249843, "learning_rate": 7.643125654700684e-06, "loss": 0.0203, "step": 988 }, { "epoch": 0.343522056269538, "grad_norm": 0.9952364860290294, "learning_rate": 7.638348269986733e-06, "loss": 0.0233, "step": 989 }, { "epoch": 0.34386939909690867, "grad_norm": 0.7687287712488833, "learning_rate": 7.6335675448534e-06, "loss": 0.0143, "step": 990 }, { "epoch": 0.34421674192427926, "grad_norm": 1.0499353749760423, "learning_rate": 7.628783485353573e-06, "loss": 0.0259, "step": 991 }, { "epoch": 0.3445640847516499, "grad_norm": 1.474581388095261, "learning_rate": 7.623996097544364e-06, "loss": 0.0258, "step": 992 }, { "epoch": 0.3449114275790205, "grad_norm": 1.8524514914939911, "learning_rate": 7.619205387487094e-06, "loss": 0.0263, "step": 993 }, { "epoch": 0.34525877040639114, "grad_norm": 1.9787996414155324, "learning_rate": 7.614411361247296e-06, "loss": 0.0241, "step": 994 }, { "epoch": 0.3456061132337617, "grad_norm": 0.9020458030038447, "learning_rate": 7.609614024894694e-06, "loss": 0.0282, "step": 995 }, { "epoch": 0.3459534560611323, "grad_norm": 1.6705983295073557, "learning_rate": 7.604813384503212e-06, "loss": 0.0297, "step": 996 }, { "epoch": 0.34630079888850296, "grad_norm": 0.6271112596802708, "learning_rate": 7.600009446150951e-06, "loss": 0.0202, "step": 997 }, { "epoch": 0.34664814171587355, "grad_norm": 0.43730341616505064, "learning_rate": 7.59520221592019e-06, "loss": 0.0165, "step": 998 }, { "epoch": 0.3469954845432442, "grad_norm": 0.6300030347921803, "learning_rate": 7.5903916998973745e-06, "loss": 0.0226, "step": 999 }, { "epoch": 0.3473428273706148, "grad_norm": 0.6996825232281534, "learning_rate": 7.585577904173113e-06, "loss": 0.0172, "step": 1000 }, { "epoch": 0.3476901701979854, "grad_norm": 0.6380166925486069, "learning_rate": 7.580760834842162e-06, "loss": 0.0201, "step": 1001 }, { "epoch": 0.348037513025356, "grad_norm": 1.3568481449306653, "learning_rate": 7.575940498003425e-06, "loss": 0.0212, "step": 1002 }, { "epoch": 0.34838485585272666, "grad_norm": 1.2549578263294605, "learning_rate": 7.571116899759945e-06, "loss": 0.0238, "step": 1003 }, { "epoch": 0.34873219868009725, "grad_norm": 1.0886856632970126, "learning_rate": 7.566290046218889e-06, "loss": 0.0243, "step": 1004 }, { "epoch": 0.3490795415074679, "grad_norm": 0.4043124260071018, "learning_rate": 7.5614599434915514e-06, "loss": 0.0149, "step": 1005 }, { "epoch": 0.3494268843348385, "grad_norm": 0.4143945807665345, "learning_rate": 7.556626597693335e-06, "loss": 0.0172, "step": 1006 }, { "epoch": 0.34977422716220913, "grad_norm": 0.47662985164661864, "learning_rate": 7.551790014943752e-06, "loss": 0.0194, "step": 1007 }, { "epoch": 0.3501215699895797, "grad_norm": 0.9261090190118699, "learning_rate": 7.546950201366412e-06, "loss": 0.0261, "step": 1008 }, { "epoch": 0.3504689128169503, "grad_norm": 1.2825084254203554, "learning_rate": 7.542107163089016e-06, "loss": 0.0151, "step": 1009 }, { "epoch": 0.35081625564432095, "grad_norm": 0.8371175718056099, "learning_rate": 7.537260906243344e-06, "loss": 0.0267, "step": 1010 }, { "epoch": 0.35116359847169154, "grad_norm": 0.6892235320884719, "learning_rate": 7.532411436965258e-06, "loss": 0.0253, "step": 1011 }, { "epoch": 0.3515109412990622, "grad_norm": 0.731192690685414, "learning_rate": 7.52755876139468e-06, "loss": 0.0139, "step": 1012 }, { "epoch": 0.3518582841264328, "grad_norm": 0.6312927924236015, "learning_rate": 7.522702885675597e-06, "loss": 0.0228, "step": 1013 }, { "epoch": 0.3522056269538034, "grad_norm": 0.9551305586895227, "learning_rate": 7.517843815956045e-06, "loss": 0.0178, "step": 1014 }, { "epoch": 0.352552969781174, "grad_norm": 0.5677401120338555, "learning_rate": 7.512981558388101e-06, "loss": 0.0225, "step": 1015 }, { "epoch": 0.35290031260854465, "grad_norm": 0.7040980402544591, "learning_rate": 7.5081161191278874e-06, "loss": 0.0176, "step": 1016 }, { "epoch": 0.35324765543591524, "grad_norm": 0.9674600691403624, "learning_rate": 7.5032475043355444e-06, "loss": 0.0243, "step": 1017 }, { "epoch": 0.3535949982632859, "grad_norm": 0.6023226292917273, "learning_rate": 7.498375720175239e-06, "loss": 0.0183, "step": 1018 }, { "epoch": 0.3539423410906565, "grad_norm": 0.9673022657430923, "learning_rate": 7.49350077281515e-06, "loss": 0.0287, "step": 1019 }, { "epoch": 0.35428968391802707, "grad_norm": 0.606921696103078, "learning_rate": 7.48862266842746e-06, "loss": 0.0226, "step": 1020 }, { "epoch": 0.3546370267453977, "grad_norm": 0.9175466939892832, "learning_rate": 7.483741413188349e-06, "loss": 0.0205, "step": 1021 }, { "epoch": 0.3549843695727683, "grad_norm": 0.5474768689731467, "learning_rate": 7.478857013277987e-06, "loss": 0.0253, "step": 1022 }, { "epoch": 0.35533171240013894, "grad_norm": 0.7005547254579504, "learning_rate": 7.473969474880527e-06, "loss": 0.0239, "step": 1023 }, { "epoch": 0.35567905522750953, "grad_norm": 0.6566673110253187, "learning_rate": 7.469078804184088e-06, "loss": 0.0153, "step": 1024 }, { "epoch": 0.3560263980548802, "grad_norm": 0.44731824465158737, "learning_rate": 7.464185007380767e-06, "loss": 0.0226, "step": 1025 }, { "epoch": 0.35637374088225077, "grad_norm": 0.929371757186016, "learning_rate": 7.459288090666605e-06, "loss": 0.0215, "step": 1026 }, { "epoch": 0.3567210837096214, "grad_norm": 1.754520347098642, "learning_rate": 7.45438806024161e-06, "loss": 0.0245, "step": 1027 }, { "epoch": 0.357068426536992, "grad_norm": 1.154684337826048, "learning_rate": 7.449484922309713e-06, "loss": 0.0183, "step": 1028 }, { "epoch": 0.35741576936436265, "grad_norm": 0.7743738340491029, "learning_rate": 7.444578683078798e-06, "loss": 0.0265, "step": 1029 }, { "epoch": 0.35776311219173323, "grad_norm": 0.39037178440555265, "learning_rate": 7.4396693487606605e-06, "loss": 0.0183, "step": 1030 }, { "epoch": 0.3581104550191039, "grad_norm": 0.6375860488882318, "learning_rate": 7.4347569255710254e-06, "loss": 0.0309, "step": 1031 }, { "epoch": 0.35845779784647447, "grad_norm": 0.6020351665546125, "learning_rate": 7.429841419729521e-06, "loss": 0.025, "step": 1032 }, { "epoch": 0.35880514067384506, "grad_norm": 1.3286872104930505, "learning_rate": 7.424922837459683e-06, "loss": 0.0224, "step": 1033 }, { "epoch": 0.3591524835012157, "grad_norm": 0.5744413112024466, "learning_rate": 7.42000118498894e-06, "loss": 0.0184, "step": 1034 }, { "epoch": 0.3594998263285863, "grad_norm": 1.1943187896682033, "learning_rate": 7.41507646854861e-06, "loss": 0.0262, "step": 1035 }, { "epoch": 0.35984716915595694, "grad_norm": 0.5497582747698295, "learning_rate": 7.4101486943738865e-06, "loss": 0.0211, "step": 1036 }, { "epoch": 0.3601945119833275, "grad_norm": 0.4778102052106275, "learning_rate": 7.405217868703839e-06, "loss": 0.0165, "step": 1037 }, { "epoch": 0.36054185481069817, "grad_norm": 0.7214250751175056, "learning_rate": 7.400283997781397e-06, "loss": 0.0286, "step": 1038 }, { "epoch": 0.36088919763806876, "grad_norm": 0.7687435033252493, "learning_rate": 7.395347087853349e-06, "loss": 0.0214, "step": 1039 }, { "epoch": 0.3612365404654394, "grad_norm": 1.0790158493374187, "learning_rate": 7.390407145170325e-06, "loss": 0.027, "step": 1040 }, { "epoch": 0.36158388329281, "grad_norm": 0.37118096549279656, "learning_rate": 7.385464175986803e-06, "loss": 0.0124, "step": 1041 }, { "epoch": 0.36193122612018064, "grad_norm": 1.4499606402413763, "learning_rate": 7.380518186561086e-06, "loss": 0.0224, "step": 1042 }, { "epoch": 0.3622785689475512, "grad_norm": 1.7916464514957589, "learning_rate": 7.375569183155306e-06, "loss": 0.0269, "step": 1043 }, { "epoch": 0.36262591177492187, "grad_norm": 0.6417320309447412, "learning_rate": 7.370617172035406e-06, "loss": 0.0187, "step": 1044 }, { "epoch": 0.36297325460229246, "grad_norm": 0.9602465969222208, "learning_rate": 7.365662159471142e-06, "loss": 0.0171, "step": 1045 }, { "epoch": 0.36332059742966305, "grad_norm": 0.5700015399771756, "learning_rate": 7.3607041517360666e-06, "loss": 0.023, "step": 1046 }, { "epoch": 0.3636679402570337, "grad_norm": 0.42616867425402144, "learning_rate": 7.355743155107526e-06, "loss": 0.0185, "step": 1047 }, { "epoch": 0.3640152830844043, "grad_norm": 0.5254170573018199, "learning_rate": 7.3507791758666514e-06, "loss": 0.0125, "step": 1048 }, { "epoch": 0.36436262591177493, "grad_norm": 0.8659700459185796, "learning_rate": 7.3458122202983495e-06, "loss": 0.0184, "step": 1049 }, { "epoch": 0.3647099687391455, "grad_norm": 0.78273420283934, "learning_rate": 7.340842294691292e-06, "loss": 0.0134, "step": 1050 }, { "epoch": 0.36505731156651616, "grad_norm": 1.2893737814650572, "learning_rate": 7.335869405337919e-06, "loss": 0.0188, "step": 1051 }, { "epoch": 0.36540465439388675, "grad_norm": 0.901363581151114, "learning_rate": 7.3308935585344135e-06, "loss": 0.0217, "step": 1052 }, { "epoch": 0.3657519972212574, "grad_norm": 0.481669992576634, "learning_rate": 7.325914760580712e-06, "loss": 0.0102, "step": 1053 }, { "epoch": 0.366099340048628, "grad_norm": 1.0260877050410049, "learning_rate": 7.32093301778048e-06, "loss": 0.0263, "step": 1054 }, { "epoch": 0.36644668287599863, "grad_norm": 0.8353120616059849, "learning_rate": 7.3159483364411175e-06, "loss": 0.022, "step": 1055 }, { "epoch": 0.3667940257033692, "grad_norm": 0.7273624057486092, "learning_rate": 7.310960722873739e-06, "loss": 0.0308, "step": 1056 }, { "epoch": 0.36714136853073986, "grad_norm": 1.13748674597797, "learning_rate": 7.3059701833931766e-06, "loss": 0.0135, "step": 1057 }, { "epoch": 0.36748871135811045, "grad_norm": 0.6859340482408613, "learning_rate": 7.300976724317964e-06, "loss": 0.016, "step": 1058 }, { "epoch": 0.36783605418548104, "grad_norm": 1.2719149842779836, "learning_rate": 7.295980351970331e-06, "loss": 0.0178, "step": 1059 }, { "epoch": 0.3681833970128517, "grad_norm": 1.0679079371197404, "learning_rate": 7.290981072676202e-06, "loss": 0.0333, "step": 1060 }, { "epoch": 0.3685307398402223, "grad_norm": 0.5473217513456246, "learning_rate": 7.285978892765171e-06, "loss": 0.0196, "step": 1061 }, { "epoch": 0.3688780826675929, "grad_norm": 0.6477963035607689, "learning_rate": 7.280973818570515e-06, "loss": 0.0229, "step": 1062 }, { "epoch": 0.3692254254949635, "grad_norm": 0.9779272803664357, "learning_rate": 7.275965856429167e-06, "loss": 0.0148, "step": 1063 }, { "epoch": 0.36957276832233416, "grad_norm": 0.6549296775726913, "learning_rate": 7.270955012681726e-06, "loss": 0.0219, "step": 1064 }, { "epoch": 0.36992011114970474, "grad_norm": 0.8692466378138357, "learning_rate": 7.26594129367243e-06, "loss": 0.0202, "step": 1065 }, { "epoch": 0.3702674539770754, "grad_norm": 0.6315864672971158, "learning_rate": 7.26092470574916e-06, "loss": 0.0181, "step": 1066 }, { "epoch": 0.370614796804446, "grad_norm": 0.8066486900818035, "learning_rate": 7.255905255263434e-06, "loss": 0.0234, "step": 1067 }, { "epoch": 0.3709621396318166, "grad_norm": 0.8450924908715876, "learning_rate": 7.25088294857039e-06, "loss": 0.0212, "step": 1068 }, { "epoch": 0.3713094824591872, "grad_norm": 0.7148838980127935, "learning_rate": 7.245857792028781e-06, "loss": 0.0204, "step": 1069 }, { "epoch": 0.37165682528655786, "grad_norm": 0.7769253902283783, "learning_rate": 7.240829792000974e-06, "loss": 0.0184, "step": 1070 }, { "epoch": 0.37200416811392845, "grad_norm": 0.5818321434788865, "learning_rate": 7.235798954852929e-06, "loss": 0.0218, "step": 1071 }, { "epoch": 0.37235151094129904, "grad_norm": 0.7181155976463919, "learning_rate": 7.230765286954204e-06, "loss": 0.0229, "step": 1072 }, { "epoch": 0.3726988537686697, "grad_norm": 0.5311889445893856, "learning_rate": 7.2257287946779365e-06, "loss": 0.0208, "step": 1073 }, { "epoch": 0.37304619659604027, "grad_norm": 0.6470157370215145, "learning_rate": 7.220689484400844e-06, "loss": 0.0286, "step": 1074 }, { "epoch": 0.3733935394234109, "grad_norm": 0.5839896666697149, "learning_rate": 7.2156473625032075e-06, "loss": 0.0208, "step": 1075 }, { "epoch": 0.3737408822507815, "grad_norm": 0.5003859905535379, "learning_rate": 7.210602435368873e-06, "loss": 0.0277, "step": 1076 }, { "epoch": 0.37408822507815215, "grad_norm": 0.938126994232941, "learning_rate": 7.205554709385234e-06, "loss": 0.0193, "step": 1077 }, { "epoch": 0.37443556790552274, "grad_norm": 0.9825447432197518, "learning_rate": 7.20050419094323e-06, "loss": 0.0327, "step": 1078 }, { "epoch": 0.3747829107328934, "grad_norm": 0.5808094753024735, "learning_rate": 7.195450886437334e-06, "loss": 0.0233, "step": 1079 }, { "epoch": 0.37513025356026397, "grad_norm": 0.729519930229865, "learning_rate": 7.190394802265548e-06, "loss": 0.0254, "step": 1080 }, { "epoch": 0.3754775963876346, "grad_norm": 0.32897954890179926, "learning_rate": 7.185335944829391e-06, "loss": 0.0193, "step": 1081 }, { "epoch": 0.3758249392150052, "grad_norm": 0.4730691255515786, "learning_rate": 7.1802743205339e-06, "loss": 0.0255, "step": 1082 }, { "epoch": 0.37617228204237585, "grad_norm": 0.8306626500245984, "learning_rate": 7.175209935787605e-06, "loss": 0.0261, "step": 1083 }, { "epoch": 0.37651962486974644, "grad_norm": 0.5382124730055174, "learning_rate": 7.17014279700254e-06, "loss": 0.025, "step": 1084 }, { "epoch": 0.37686696769711703, "grad_norm": 0.6717725725696359, "learning_rate": 7.165072910594219e-06, "loss": 0.0271, "step": 1085 }, { "epoch": 0.3772143105244877, "grad_norm": 0.7370235521381453, "learning_rate": 7.160000282981641e-06, "loss": 0.0273, "step": 1086 }, { "epoch": 0.37756165335185826, "grad_norm": 0.4891684754047165, "learning_rate": 7.154924920587269e-06, "loss": 0.0264, "step": 1087 }, { "epoch": 0.3779089961792289, "grad_norm": 0.3231257861866142, "learning_rate": 7.149846829837036e-06, "loss": 0.0206, "step": 1088 }, { "epoch": 0.3782563390065995, "grad_norm": 0.4101311080573536, "learning_rate": 7.144766017160324e-06, "loss": 0.0205, "step": 1089 }, { "epoch": 0.37860368183397014, "grad_norm": 0.8294757845956588, "learning_rate": 7.139682488989961e-06, "loss": 0.0265, "step": 1090 }, { "epoch": 0.37895102466134073, "grad_norm": 0.5156785295009514, "learning_rate": 7.134596251762217e-06, "loss": 0.0209, "step": 1091 }, { "epoch": 0.3792983674887114, "grad_norm": 0.6068386255398981, "learning_rate": 7.129507311916789e-06, "loss": 0.0191, "step": 1092 }, { "epoch": 0.37964571031608196, "grad_norm": 0.42053223931776523, "learning_rate": 7.124415675896796e-06, "loss": 0.018, "step": 1093 }, { "epoch": 0.3799930531434526, "grad_norm": 1.529702363413615, "learning_rate": 7.119321350148772e-06, "loss": 0.0305, "step": 1094 }, { "epoch": 0.3803403959708232, "grad_norm": 0.9603413062647468, "learning_rate": 7.114224341122655e-06, "loss": 0.0211, "step": 1095 }, { "epoch": 0.38068773879819384, "grad_norm": 0.46028100214920925, "learning_rate": 7.109124655271782e-06, "loss": 0.0121, "step": 1096 }, { "epoch": 0.38103508162556443, "grad_norm": 0.38425802665843994, "learning_rate": 7.1040222990528775e-06, "loss": 0.0165, "step": 1097 }, { "epoch": 0.381382424452935, "grad_norm": 0.34490477943437864, "learning_rate": 7.098917278926046e-06, "loss": 0.0215, "step": 1098 }, { "epoch": 0.38172976728030567, "grad_norm": 0.4917657059195774, "learning_rate": 7.093809601354769e-06, "loss": 0.0193, "step": 1099 }, { "epoch": 0.38207711010767625, "grad_norm": 0.3304192390082568, "learning_rate": 7.088699272805888e-06, "loss": 0.0161, "step": 1100 }, { "epoch": 0.3824244529350469, "grad_norm": 0.8538379691183342, "learning_rate": 7.0835862997496045e-06, "loss": 0.0174, "step": 1101 }, { "epoch": 0.3827717957624175, "grad_norm": 1.066827953382512, "learning_rate": 7.078470688659465e-06, "loss": 0.0257, "step": 1102 }, { "epoch": 0.38311913858978813, "grad_norm": 0.7479008835369523, "learning_rate": 7.073352446012357e-06, "loss": 0.0213, "step": 1103 }, { "epoch": 0.3834664814171587, "grad_norm": 1.0510022725835733, "learning_rate": 7.068231578288502e-06, "loss": 0.0259, "step": 1104 }, { "epoch": 0.38381382424452937, "grad_norm": 0.6431698987590786, "learning_rate": 7.063108091971444e-06, "loss": 0.0161, "step": 1105 }, { "epoch": 0.38416116707189996, "grad_norm": 0.626180547232632, "learning_rate": 7.05798199354804e-06, "loss": 0.015, "step": 1106 }, { "epoch": 0.3845085098992706, "grad_norm": 0.7399567466553991, "learning_rate": 7.052853289508458e-06, "loss": 0.0229, "step": 1107 }, { "epoch": 0.3848558527266412, "grad_norm": 0.5500014449728272, "learning_rate": 7.047721986346161e-06, "loss": 0.0232, "step": 1108 }, { "epoch": 0.38520319555401183, "grad_norm": 1.4534975737472637, "learning_rate": 7.042588090557906e-06, "loss": 0.0228, "step": 1109 }, { "epoch": 0.3855505383813824, "grad_norm": 1.3334339962937882, "learning_rate": 7.037451608643732e-06, "loss": 0.0201, "step": 1110 }, { "epoch": 0.385897881208753, "grad_norm": 1.0089775574518236, "learning_rate": 7.03231254710695e-06, "loss": 0.0184, "step": 1111 }, { "epoch": 0.38624522403612366, "grad_norm": 0.439074094269593, "learning_rate": 7.027170912454141e-06, "loss": 0.0162, "step": 1112 }, { "epoch": 0.38659256686349425, "grad_norm": 0.8348064008657952, "learning_rate": 7.02202671119514e-06, "loss": 0.017, "step": 1113 }, { "epoch": 0.3869399096908649, "grad_norm": 0.7951186556024389, "learning_rate": 7.016879949843032e-06, "loss": 0.0163, "step": 1114 }, { "epoch": 0.3872872525182355, "grad_norm": 0.7523417760723944, "learning_rate": 7.0117306349141485e-06, "loss": 0.0236, "step": 1115 }, { "epoch": 0.3876345953456061, "grad_norm": 1.3735341548254056, "learning_rate": 7.006578772928045e-06, "loss": 0.0218, "step": 1116 }, { "epoch": 0.3879819381729767, "grad_norm": 0.5385452791468442, "learning_rate": 7.0014243704075115e-06, "loss": 0.0205, "step": 1117 }, { "epoch": 0.38832928100034736, "grad_norm": 0.8605606189346207, "learning_rate": 6.996267433878545e-06, "loss": 0.0222, "step": 1118 }, { "epoch": 0.38867662382771795, "grad_norm": 0.823981661806264, "learning_rate": 6.991107969870363e-06, "loss": 0.0264, "step": 1119 }, { "epoch": 0.3890239666550886, "grad_norm": 0.7575159321093746, "learning_rate": 6.985945984915368e-06, "loss": 0.0229, "step": 1120 }, { "epoch": 0.3893713094824592, "grad_norm": 0.45426836006848514, "learning_rate": 6.9807814855491685e-06, "loss": 0.0162, "step": 1121 }, { "epoch": 0.3897186523098298, "grad_norm": 1.2049213408577124, "learning_rate": 6.975614478310546e-06, "loss": 0.0322, "step": 1122 }, { "epoch": 0.3900659951372004, "grad_norm": 0.7306802680657365, "learning_rate": 6.970444969741462e-06, "loss": 0.0198, "step": 1123 }, { "epoch": 0.390413337964571, "grad_norm": 0.7391729308842311, "learning_rate": 6.965272966387046e-06, "loss": 0.0184, "step": 1124 }, { "epoch": 0.39076068079194165, "grad_norm": 0.6086940453113556, "learning_rate": 6.960098474795583e-06, "loss": 0.0243, "step": 1125 }, { "epoch": 0.39110802361931224, "grad_norm": 0.4851910309801865, "learning_rate": 6.954921501518511e-06, "loss": 0.012, "step": 1126 }, { "epoch": 0.3914553664466829, "grad_norm": 0.45385633767002753, "learning_rate": 6.949742053110408e-06, "loss": 0.0198, "step": 1127 }, { "epoch": 0.3918027092740535, "grad_norm": 1.1129782341928363, "learning_rate": 6.944560136128986e-06, "loss": 0.0287, "step": 1128 }, { "epoch": 0.3921500521014241, "grad_norm": 0.7720433053769181, "learning_rate": 6.939375757135085e-06, "loss": 0.0238, "step": 1129 }, { "epoch": 0.3924973949287947, "grad_norm": 1.5172156671202623, "learning_rate": 6.934188922692659e-06, "loss": 0.0254, "step": 1130 }, { "epoch": 0.39284473775616535, "grad_norm": 0.5132675364055385, "learning_rate": 6.928999639368773e-06, "loss": 0.0176, "step": 1131 }, { "epoch": 0.39319208058353594, "grad_norm": 0.9917189063692697, "learning_rate": 6.923807913733591e-06, "loss": 0.0212, "step": 1132 }, { "epoch": 0.3935394234109066, "grad_norm": 1.0346614661684557, "learning_rate": 6.918613752360369e-06, "loss": 0.0237, "step": 1133 }, { "epoch": 0.3938867662382772, "grad_norm": 0.7774794514027609, "learning_rate": 6.913417161825449e-06, "loss": 0.021, "step": 1134 }, { "epoch": 0.3942341090656478, "grad_norm": 0.8284608514835325, "learning_rate": 6.908218148708248e-06, "loss": 0.0196, "step": 1135 }, { "epoch": 0.3945814518930184, "grad_norm": 0.5659423206737776, "learning_rate": 6.903016719591249e-06, "loss": 0.0221, "step": 1136 }, { "epoch": 0.394928794720389, "grad_norm": 1.0709171868083134, "learning_rate": 6.8978128810599935e-06, "loss": 0.035, "step": 1137 }, { "epoch": 0.39527613754775964, "grad_norm": 0.6527457095839493, "learning_rate": 6.8926066397030745e-06, "loss": 0.0204, "step": 1138 }, { "epoch": 0.39562348037513023, "grad_norm": 0.5573043903274787, "learning_rate": 6.887398002112129e-06, "loss": 0.0211, "step": 1139 }, { "epoch": 0.3959708232025009, "grad_norm": 0.8397780950700933, "learning_rate": 6.8821869748818235e-06, "loss": 0.018, "step": 1140 }, { "epoch": 0.39631816602987147, "grad_norm": 0.39026678264711556, "learning_rate": 6.876973564609857e-06, "loss": 0.0175, "step": 1141 }, { "epoch": 0.3966655088572421, "grad_norm": 0.5775231347276166, "learning_rate": 6.871757777896937e-06, "loss": 0.0176, "step": 1142 }, { "epoch": 0.3970128516846127, "grad_norm": 0.3504293355564611, "learning_rate": 6.866539621346786e-06, "loss": 0.0136, "step": 1143 }, { "epoch": 0.39736019451198334, "grad_norm": 0.3267695926847211, "learning_rate": 6.861319101566126e-06, "loss": 0.0152, "step": 1144 }, { "epoch": 0.39770753733935393, "grad_norm": 1.0251884908691054, "learning_rate": 6.856096225164669e-06, "loss": 0.017, "step": 1145 }, { "epoch": 0.3980548801667246, "grad_norm": 0.484956168822129, "learning_rate": 6.850870998755111e-06, "loss": 0.0192, "step": 1146 }, { "epoch": 0.39840222299409517, "grad_norm": 0.6237901046105527, "learning_rate": 6.845643428953127e-06, "loss": 0.0322, "step": 1147 }, { "epoch": 0.3987495658214658, "grad_norm": 1.2212183860294297, "learning_rate": 6.840413522377355e-06, "loss": 0.0211, "step": 1148 }, { "epoch": 0.3990969086488364, "grad_norm": 0.5405735574477039, "learning_rate": 6.8351812856493905e-06, "loss": 0.0227, "step": 1149 }, { "epoch": 0.399444251476207, "grad_norm": 1.7602429727127982, "learning_rate": 6.829946725393787e-06, "loss": 0.0264, "step": 1150 }, { "epoch": 0.39979159430357764, "grad_norm": 0.9253181842069885, "learning_rate": 6.824709848238028e-06, "loss": 0.0167, "step": 1151 }, { "epoch": 0.4001389371309482, "grad_norm": 0.39829378649964736, "learning_rate": 6.819470660812543e-06, "loss": 0.0153, "step": 1152 }, { "epoch": 0.40048627995831887, "grad_norm": 1.4005769801829058, "learning_rate": 6.814229169750675e-06, "loss": 0.0239, "step": 1153 }, { "epoch": 0.40083362278568946, "grad_norm": 0.599123382279925, "learning_rate": 6.808985381688692e-06, "loss": 0.0239, "step": 1154 }, { "epoch": 0.4011809656130601, "grad_norm": 0.502770976659655, "learning_rate": 6.8037393032657665e-06, "loss": 0.0161, "step": 1155 }, { "epoch": 0.4015283084404307, "grad_norm": 0.715985711190083, "learning_rate": 6.798490941123972e-06, "loss": 0.0321, "step": 1156 }, { "epoch": 0.40187565126780134, "grad_norm": 1.3666815307149158, "learning_rate": 6.793240301908273e-06, "loss": 0.0217, "step": 1157 }, { "epoch": 0.4022229940951719, "grad_norm": 1.3893597610770267, "learning_rate": 6.7879873922665164e-06, "loss": 0.0255, "step": 1158 }, { "epoch": 0.40257033692254257, "grad_norm": 1.699863532332705, "learning_rate": 6.782732218849425e-06, "loss": 0.0191, "step": 1159 }, { "epoch": 0.40291767974991316, "grad_norm": 0.5862064856816661, "learning_rate": 6.777474788310586e-06, "loss": 0.0278, "step": 1160 }, { "epoch": 0.4032650225772838, "grad_norm": 0.5792277534533721, "learning_rate": 6.772215107306448e-06, "loss": 0.0244, "step": 1161 }, { "epoch": 0.4036123654046544, "grad_norm": 0.7441381287676478, "learning_rate": 6.766953182496303e-06, "loss": 0.0164, "step": 1162 }, { "epoch": 0.403959708232025, "grad_norm": 0.874933616008428, "learning_rate": 6.761689020542288e-06, "loss": 0.024, "step": 1163 }, { "epoch": 0.40430705105939563, "grad_norm": 0.7249602510280612, "learning_rate": 6.756422628109374e-06, "loss": 0.0291, "step": 1164 }, { "epoch": 0.4046543938867662, "grad_norm": 0.38558307647762535, "learning_rate": 6.751154011865352e-06, "loss": 0.0193, "step": 1165 }, { "epoch": 0.40500173671413686, "grad_norm": 0.3569257075132759, "learning_rate": 6.74588317848083e-06, "loss": 0.0224, "step": 1166 }, { "epoch": 0.40534907954150745, "grad_norm": 0.7027161271352107, "learning_rate": 6.740610134629224e-06, "loss": 0.0236, "step": 1167 }, { "epoch": 0.4056964223688781, "grad_norm": 0.5055672476223064, "learning_rate": 6.735334886986749e-06, "loss": 0.0169, "step": 1168 }, { "epoch": 0.4060437651962487, "grad_norm": 1.1794513716622257, "learning_rate": 6.730057442232407e-06, "loss": 0.0156, "step": 1169 }, { "epoch": 0.40639110802361933, "grad_norm": 1.210535705797956, "learning_rate": 6.724777807047985e-06, "loss": 0.0195, "step": 1170 }, { "epoch": 0.4067384508509899, "grad_norm": 0.48600002401232995, "learning_rate": 6.719495988118043e-06, "loss": 0.0256, "step": 1171 }, { "epoch": 0.40708579367836056, "grad_norm": 0.30168099824930594, "learning_rate": 6.714211992129906e-06, "loss": 0.0151, "step": 1172 }, { "epoch": 0.40743313650573115, "grad_norm": 0.41890959033819075, "learning_rate": 6.708925825773653e-06, "loss": 0.0214, "step": 1173 }, { "epoch": 0.4077804793331018, "grad_norm": 0.4456883093941609, "learning_rate": 6.7036374957421125e-06, "loss": 0.0164, "step": 1174 }, { "epoch": 0.4081278221604724, "grad_norm": 0.8050022922465697, "learning_rate": 6.698347008730854e-06, "loss": 0.0177, "step": 1175 }, { "epoch": 0.408475164987843, "grad_norm": 0.7402939338287975, "learning_rate": 6.6930543714381745e-06, "loss": 0.0217, "step": 1176 }, { "epoch": 0.4088225078152136, "grad_norm": 0.7580203605323533, "learning_rate": 6.687759590565097e-06, "loss": 0.0217, "step": 1177 }, { "epoch": 0.4091698506425842, "grad_norm": 0.5276697193580777, "learning_rate": 6.6824626728153565e-06, "loss": 0.0177, "step": 1178 }, { "epoch": 0.40951719346995485, "grad_norm": 1.3525910498294889, "learning_rate": 6.677163624895393e-06, "loss": 0.0262, "step": 1179 }, { "epoch": 0.40986453629732544, "grad_norm": 0.7784159092511207, "learning_rate": 6.671862453514346e-06, "loss": 0.0224, "step": 1180 }, { "epoch": 0.4102118791246961, "grad_norm": 0.5263360435847655, "learning_rate": 6.666559165384041e-06, "loss": 0.018, "step": 1181 }, { "epoch": 0.4105592219520667, "grad_norm": 0.9366706697301209, "learning_rate": 6.661253767218982e-06, "loss": 0.0232, "step": 1182 }, { "epoch": 0.4109065647794373, "grad_norm": 0.38867181577970566, "learning_rate": 6.6559462657363525e-06, "loss": 0.0182, "step": 1183 }, { "epoch": 0.4112539076068079, "grad_norm": 1.6435992793264294, "learning_rate": 6.6506366676559885e-06, "loss": 0.0253, "step": 1184 }, { "epoch": 0.41160125043417856, "grad_norm": 1.0752706014174525, "learning_rate": 6.6453249797003885e-06, "loss": 0.0266, "step": 1185 }, { "epoch": 0.41194859326154915, "grad_norm": 0.8205362924547063, "learning_rate": 6.640011208594691e-06, "loss": 0.0228, "step": 1186 }, { "epoch": 0.4122959360889198, "grad_norm": 0.7528049121740515, "learning_rate": 6.634695361066679e-06, "loss": 0.0205, "step": 1187 }, { "epoch": 0.4126432789162904, "grad_norm": 0.5476931400142733, "learning_rate": 6.629377443846756e-06, "loss": 0.0256, "step": 1188 }, { "epoch": 0.41299062174366097, "grad_norm": 1.964533528727283, "learning_rate": 6.624057463667954e-06, "loss": 0.0249, "step": 1189 }, { "epoch": 0.4133379645710316, "grad_norm": 0.9431680832361578, "learning_rate": 6.618735427265912e-06, "loss": 0.0127, "step": 1190 }, { "epoch": 0.4136853073984022, "grad_norm": 0.31081854506575984, "learning_rate": 6.613411341378872e-06, "loss": 0.0168, "step": 1191 }, { "epoch": 0.41403265022577285, "grad_norm": 0.7037510968761942, "learning_rate": 6.608085212747676e-06, "loss": 0.0158, "step": 1192 }, { "epoch": 0.41437999305314344, "grad_norm": 0.7279786999120493, "learning_rate": 6.602757048115745e-06, "loss": 0.0208, "step": 1193 }, { "epoch": 0.4147273358805141, "grad_norm": 1.5397066182596297, "learning_rate": 6.597426854229085e-06, "loss": 0.0178, "step": 1194 }, { "epoch": 0.41507467870788467, "grad_norm": 0.8672894314292771, "learning_rate": 6.592094637836266e-06, "loss": 0.0207, "step": 1195 }, { "epoch": 0.4154220215352553, "grad_norm": 0.5979467691977439, "learning_rate": 6.586760405688421e-06, "loss": 0.0184, "step": 1196 }, { "epoch": 0.4157693643626259, "grad_norm": 1.304178348576468, "learning_rate": 6.581424164539235e-06, "loss": 0.0222, "step": 1197 }, { "epoch": 0.41611670718999655, "grad_norm": 0.744170926834296, "learning_rate": 6.5760859211449355e-06, "loss": 0.0186, "step": 1198 }, { "epoch": 0.41646405001736714, "grad_norm": 0.43247222642972794, "learning_rate": 6.570745682264288e-06, "loss": 0.0241, "step": 1199 }, { "epoch": 0.4168113928447378, "grad_norm": 0.7241231215446736, "learning_rate": 6.565403454658579e-06, "loss": 0.0115, "step": 1200 }, { "epoch": 0.41715873567210837, "grad_norm": 1.5981944791092622, "learning_rate": 6.560059245091619e-06, "loss": 0.0215, "step": 1201 }, { "epoch": 0.41750607849947896, "grad_norm": 0.6073741226445987, "learning_rate": 6.554713060329725e-06, "loss": 0.0159, "step": 1202 }, { "epoch": 0.4178534213268496, "grad_norm": 0.35103622430566045, "learning_rate": 6.549364907141713e-06, "loss": 0.0165, "step": 1203 }, { "epoch": 0.4182007641542202, "grad_norm": 0.3060822356877603, "learning_rate": 6.544014792298896e-06, "loss": 0.0136, "step": 1204 }, { "epoch": 0.41854810698159084, "grad_norm": 1.2509560719304376, "learning_rate": 6.538662722575067e-06, "loss": 0.0232, "step": 1205 }, { "epoch": 0.41889544980896143, "grad_norm": 1.1252212083506805, "learning_rate": 6.533308704746492e-06, "loss": 0.0261, "step": 1206 }, { "epoch": 0.4192427926363321, "grad_norm": 1.0051855699617804, "learning_rate": 6.527952745591911e-06, "loss": 0.0176, "step": 1207 }, { "epoch": 0.41959013546370266, "grad_norm": 0.4225505750014969, "learning_rate": 6.522594851892513e-06, "loss": 0.0157, "step": 1208 }, { "epoch": 0.4199374782910733, "grad_norm": 0.675487063845996, "learning_rate": 6.5172350304319456e-06, "loss": 0.0187, "step": 1209 }, { "epoch": 0.4202848211184439, "grad_norm": 0.5556340570184015, "learning_rate": 6.5118732879962866e-06, "loss": 0.021, "step": 1210 }, { "epoch": 0.42063216394581454, "grad_norm": 1.1461752497953057, "learning_rate": 6.506509631374056e-06, "loss": 0.024, "step": 1211 }, { "epoch": 0.42097950677318513, "grad_norm": 0.8587515989365709, "learning_rate": 6.501144067356191e-06, "loss": 0.0171, "step": 1212 }, { "epoch": 0.4213268496005558, "grad_norm": 0.6554268552282353, "learning_rate": 6.4957766027360455e-06, "loss": 0.0162, "step": 1213 }, { "epoch": 0.42167419242792636, "grad_norm": 0.9717352789041409, "learning_rate": 6.490407244309382e-06, "loss": 0.0177, "step": 1214 }, { "epoch": 0.42202153525529695, "grad_norm": 0.4666185404754438, "learning_rate": 6.485035998874356e-06, "loss": 0.0179, "step": 1215 }, { "epoch": 0.4223688780826676, "grad_norm": 0.6687573112666673, "learning_rate": 6.479662873231518e-06, "loss": 0.0278, "step": 1216 }, { "epoch": 0.4227162209100382, "grad_norm": 0.39201919773447125, "learning_rate": 6.4742878741837924e-06, "loss": 0.021, "step": 1217 }, { "epoch": 0.42306356373740883, "grad_norm": 1.9376472430652754, "learning_rate": 6.468911008536483e-06, "loss": 0.0315, "step": 1218 }, { "epoch": 0.4234109065647794, "grad_norm": 0.5349195261746512, "learning_rate": 6.4635322830972465e-06, "loss": 0.022, "step": 1219 }, { "epoch": 0.42375824939215007, "grad_norm": 0.5994260198762377, "learning_rate": 6.458151704676108e-06, "loss": 0.0249, "step": 1220 }, { "epoch": 0.42410559221952066, "grad_norm": 0.45829461719718667, "learning_rate": 6.452769280085427e-06, "loss": 0.0172, "step": 1221 }, { "epoch": 0.4244529350468913, "grad_norm": 0.4015948567021689, "learning_rate": 6.447385016139906e-06, "loss": 0.0159, "step": 1222 }, { "epoch": 0.4248002778742619, "grad_norm": 0.41359006368663803, "learning_rate": 6.441998919656575e-06, "loss": 0.0217, "step": 1223 }, { "epoch": 0.42514762070163253, "grad_norm": 0.784104999649454, "learning_rate": 6.436610997454785e-06, "loss": 0.0153, "step": 1224 }, { "epoch": 0.4254949635290031, "grad_norm": 0.4486969339475269, "learning_rate": 6.431221256356197e-06, "loss": 0.0191, "step": 1225 }, { "epoch": 0.42584230635637377, "grad_norm": 0.807568173718478, "learning_rate": 6.425829703184776e-06, "loss": 0.0191, "step": 1226 }, { "epoch": 0.42618964918374436, "grad_norm": 1.214899649576311, "learning_rate": 6.420436344766781e-06, "loss": 0.0242, "step": 1227 }, { "epoch": 0.42653699201111495, "grad_norm": 0.41836170664521255, "learning_rate": 6.415041187930757e-06, "loss": 0.0195, "step": 1228 }, { "epoch": 0.4268843348384856, "grad_norm": 0.8701313451372041, "learning_rate": 6.409644239507524e-06, "loss": 0.0199, "step": 1229 }, { "epoch": 0.4272316776658562, "grad_norm": 0.46742950813151507, "learning_rate": 6.404245506330175e-06, "loss": 0.013, "step": 1230 }, { "epoch": 0.4275790204932268, "grad_norm": 0.5597197002970087, "learning_rate": 6.398844995234057e-06, "loss": 0.0246, "step": 1231 }, { "epoch": 0.4279263633205974, "grad_norm": 1.3414468877460513, "learning_rate": 6.393442713056772e-06, "loss": 0.021, "step": 1232 }, { "epoch": 0.42827370614796806, "grad_norm": 0.6723941756786661, "learning_rate": 6.388038666638163e-06, "loss": 0.0228, "step": 1233 }, { "epoch": 0.42862104897533865, "grad_norm": 0.5282980798460091, "learning_rate": 6.382632862820306e-06, "loss": 0.019, "step": 1234 }, { "epoch": 0.4289683918027093, "grad_norm": 0.8434630536972346, "learning_rate": 6.377225308447503e-06, "loss": 0.0279, "step": 1235 }, { "epoch": 0.4293157346300799, "grad_norm": 1.2873804248352099, "learning_rate": 6.371816010366274e-06, "loss": 0.0242, "step": 1236 }, { "epoch": 0.4296630774574505, "grad_norm": 1.0288152920759024, "learning_rate": 6.366404975425342e-06, "loss": 0.0136, "step": 1237 }, { "epoch": 0.4300104202848211, "grad_norm": 0.9931125336676734, "learning_rate": 6.360992210475635e-06, "loss": 0.0238, "step": 1238 }, { "epoch": 0.43035776311219176, "grad_norm": 1.136404288127991, "learning_rate": 6.355577722370264e-06, "loss": 0.0171, "step": 1239 }, { "epoch": 0.43070510593956235, "grad_norm": 0.6828220416786258, "learning_rate": 6.3501615179645315e-06, "loss": 0.0195, "step": 1240 }, { "epoch": 0.43105244876693294, "grad_norm": 1.3364697049657948, "learning_rate": 6.344743604115903e-06, "loss": 0.0293, "step": 1241 }, { "epoch": 0.4313997915943036, "grad_norm": 1.2606216983253666, "learning_rate": 6.339323987684015e-06, "loss": 0.0197, "step": 1242 }, { "epoch": 0.4317471344216742, "grad_norm": 0.5059234102433385, "learning_rate": 6.333902675530657e-06, "loss": 0.0216, "step": 1243 }, { "epoch": 0.4320944772490448, "grad_norm": 0.6807195324342239, "learning_rate": 6.328479674519766e-06, "loss": 0.0225, "step": 1244 }, { "epoch": 0.4324418200764154, "grad_norm": 0.5504876720573679, "learning_rate": 6.323054991517416e-06, "loss": 0.0128, "step": 1245 }, { "epoch": 0.43278916290378605, "grad_norm": 0.8748832096236718, "learning_rate": 6.317628633391816e-06, "loss": 0.0208, "step": 1246 }, { "epoch": 0.43313650573115664, "grad_norm": 1.3807423200829632, "learning_rate": 6.312200607013287e-06, "loss": 0.0192, "step": 1247 }, { "epoch": 0.4334838485585273, "grad_norm": 0.4591330354830137, "learning_rate": 6.306770919254268e-06, "loss": 0.0182, "step": 1248 }, { "epoch": 0.4338311913858979, "grad_norm": 1.1115703829370096, "learning_rate": 6.301339576989301e-06, "loss": 0.0257, "step": 1249 }, { "epoch": 0.4341785342132685, "grad_norm": 0.9601555993164028, "learning_rate": 6.295906587095023e-06, "loss": 0.0293, "step": 1250 }, { "epoch": 0.4345258770406391, "grad_norm": 0.3602112221728658, "learning_rate": 6.2904719564501545e-06, "loss": 0.0209, "step": 1251 }, { "epoch": 0.43487321986800975, "grad_norm": 0.65050746016075, "learning_rate": 6.285035691935495e-06, "loss": 0.0201, "step": 1252 }, { "epoch": 0.43522056269538034, "grad_norm": 0.43904022220763456, "learning_rate": 6.279597800433915e-06, "loss": 0.0244, "step": 1253 }, { "epoch": 0.43556790552275093, "grad_norm": 1.0501132280980383, "learning_rate": 6.274158288830339e-06, "loss": 0.0152, "step": 1254 }, { "epoch": 0.4359152483501216, "grad_norm": 1.368313597379767, "learning_rate": 6.268717164011751e-06, "loss": 0.0299, "step": 1255 }, { "epoch": 0.43626259117749216, "grad_norm": 0.29948345258209563, "learning_rate": 6.263274432867168e-06, "loss": 0.0175, "step": 1256 }, { "epoch": 0.4366099340048628, "grad_norm": 0.5520058587984689, "learning_rate": 6.257830102287649e-06, "loss": 0.0131, "step": 1257 }, { "epoch": 0.4369572768322334, "grad_norm": 0.703456209843452, "learning_rate": 6.252384179166272e-06, "loss": 0.026, "step": 1258 }, { "epoch": 0.43730461965960404, "grad_norm": 0.31650775648430113, "learning_rate": 6.246936670398136e-06, "loss": 0.0119, "step": 1259 }, { "epoch": 0.43765196248697463, "grad_norm": 0.6575613262249127, "learning_rate": 6.2414875828803446e-06, "loss": 0.018, "step": 1260 }, { "epoch": 0.4379993053143453, "grad_norm": 0.7650908913989808, "learning_rate": 6.236036923512002e-06, "loss": 0.0193, "step": 1261 }, { "epoch": 0.43834664814171587, "grad_norm": 0.6645282671067363, "learning_rate": 6.230584699194201e-06, "loss": 0.0191, "step": 1262 }, { "epoch": 0.4386939909690865, "grad_norm": 0.89119449808857, "learning_rate": 6.225130916830017e-06, "loss": 0.0165, "step": 1263 }, { "epoch": 0.4390413337964571, "grad_norm": 1.4336588300538304, "learning_rate": 6.2196755833244975e-06, "loss": 0.0172, "step": 1264 }, { "epoch": 0.43938867662382775, "grad_norm": 0.5141694363351154, "learning_rate": 6.214218705584653e-06, "loss": 0.017, "step": 1265 }, { "epoch": 0.43973601945119833, "grad_norm": 0.7024489932275159, "learning_rate": 6.208760290519451e-06, "loss": 0.0266, "step": 1266 }, { "epoch": 0.4400833622785689, "grad_norm": 0.5137267630010462, "learning_rate": 6.203300345039804e-06, "loss": 0.0208, "step": 1267 }, { "epoch": 0.44043070510593957, "grad_norm": 0.9545655799932716, "learning_rate": 6.197838876058564e-06, "loss": 0.0212, "step": 1268 }, { "epoch": 0.44077804793331016, "grad_norm": 1.274602132775298, "learning_rate": 6.19237589049051e-06, "loss": 0.02, "step": 1269 }, { "epoch": 0.4411253907606808, "grad_norm": 1.1485868760171818, "learning_rate": 6.186911395252342e-06, "loss": 0.0325, "step": 1270 }, { "epoch": 0.4414727335880514, "grad_norm": 1.0719692070927125, "learning_rate": 6.181445397262671e-06, "loss": 0.0246, "step": 1271 }, { "epoch": 0.44182007641542204, "grad_norm": 0.5336479484346233, "learning_rate": 6.175977903442008e-06, "loss": 0.0154, "step": 1272 }, { "epoch": 0.4421674192427926, "grad_norm": 0.6172067020914168, "learning_rate": 6.170508920712765e-06, "loss": 0.0188, "step": 1273 }, { "epoch": 0.44251476207016327, "grad_norm": 0.9412324301612399, "learning_rate": 6.165038455999233e-06, "loss": 0.0216, "step": 1274 }, { "epoch": 0.44286210489753386, "grad_norm": 0.5832155616020471, "learning_rate": 6.159566516227582e-06, "loss": 0.0214, "step": 1275 }, { "epoch": 0.4432094477249045, "grad_norm": 0.5486296573893363, "learning_rate": 6.154093108325846e-06, "loss": 0.0163, "step": 1276 }, { "epoch": 0.4435567905522751, "grad_norm": 0.6013895518841912, "learning_rate": 6.148618239223924e-06, "loss": 0.0146, "step": 1277 }, { "epoch": 0.44390413337964574, "grad_norm": 0.886747002104457, "learning_rate": 6.143141915853558e-06, "loss": 0.0141, "step": 1278 }, { "epoch": 0.4442514762070163, "grad_norm": 0.7983414083814292, "learning_rate": 6.137664145148339e-06, "loss": 0.0229, "step": 1279 }, { "epoch": 0.4445988190343869, "grad_norm": 0.4982201099217589, "learning_rate": 6.1321849340436824e-06, "loss": 0.0226, "step": 1280 }, { "epoch": 0.44494616186175756, "grad_norm": 0.6104658236755754, "learning_rate": 6.126704289476834e-06, "loss": 0.0176, "step": 1281 }, { "epoch": 0.44529350468912815, "grad_norm": 0.4850325700357511, "learning_rate": 6.121222218386848e-06, "loss": 0.0275, "step": 1282 }, { "epoch": 0.4456408475164988, "grad_norm": 1.1453236673507199, "learning_rate": 6.115738727714593e-06, "loss": 0.0157, "step": 1283 }, { "epoch": 0.4459881903438694, "grad_norm": 1.1501958397477723, "learning_rate": 6.110253824402728e-06, "loss": 0.0233, "step": 1284 }, { "epoch": 0.44633553317124003, "grad_norm": 1.2142627028854323, "learning_rate": 6.104767515395702e-06, "loss": 0.0217, "step": 1285 }, { "epoch": 0.4466828759986106, "grad_norm": 0.8286577032771891, "learning_rate": 6.0992798076397465e-06, "loss": 0.0281, "step": 1286 }, { "epoch": 0.44703021882598126, "grad_norm": 1.1668550772372472, "learning_rate": 6.093790708082861e-06, "loss": 0.0122, "step": 1287 }, { "epoch": 0.44737756165335185, "grad_norm": 0.8595111659192216, "learning_rate": 6.088300223674808e-06, "loss": 0.013, "step": 1288 }, { "epoch": 0.4477249044807225, "grad_norm": 1.202956918343463, "learning_rate": 6.0828083613671055e-06, "loss": 0.0155, "step": 1289 }, { "epoch": 0.4480722473080931, "grad_norm": 1.144767269040697, "learning_rate": 6.077315128113011e-06, "loss": 0.0234, "step": 1290 }, { "epoch": 0.44841959013546373, "grad_norm": 0.3829734675474034, "learning_rate": 6.071820530867524e-06, "loss": 0.017, "step": 1291 }, { "epoch": 0.4487669329628343, "grad_norm": 0.705925698459192, "learning_rate": 6.066324576587367e-06, "loss": 0.0207, "step": 1292 }, { "epoch": 0.4491142757902049, "grad_norm": 0.9927898285052602, "learning_rate": 6.06082727223098e-06, "loss": 0.0227, "step": 1293 }, { "epoch": 0.44946161861757555, "grad_norm": 0.5058096386894589, "learning_rate": 6.055328624758515e-06, "loss": 0.025, "step": 1294 }, { "epoch": 0.44980896144494614, "grad_norm": 0.9125209454948322, "learning_rate": 6.0498286411318255e-06, "loss": 0.0224, "step": 1295 }, { "epoch": 0.4501563042723168, "grad_norm": 1.1242298901659808, "learning_rate": 6.04432732831445e-06, "loss": 0.0191, "step": 1296 }, { "epoch": 0.4505036470996874, "grad_norm": 0.8939518763202791, "learning_rate": 6.038824693271619e-06, "loss": 0.0249, "step": 1297 }, { "epoch": 0.450850989927058, "grad_norm": 1.0950080836356835, "learning_rate": 6.033320742970229e-06, "loss": 0.0248, "step": 1298 }, { "epoch": 0.4511983327544286, "grad_norm": 0.8937794294216942, "learning_rate": 6.027815484378848e-06, "loss": 0.0247, "step": 1299 }, { "epoch": 0.45154567558179926, "grad_norm": 1.211557408088087, "learning_rate": 6.0223089244676965e-06, "loss": 0.0257, "step": 1300 }, { "epoch": 0.45189301840916984, "grad_norm": 0.7913795101404235, "learning_rate": 6.016801070208644e-06, "loss": 0.0201, "step": 1301 }, { "epoch": 0.4522403612365405, "grad_norm": 1.022217982353667, "learning_rate": 6.011291928575199e-06, "loss": 0.0177, "step": 1302 }, { "epoch": 0.4525877040639111, "grad_norm": 0.5643877352773056, "learning_rate": 6.005781506542498e-06, "loss": 0.018, "step": 1303 }, { "epoch": 0.45293504689128167, "grad_norm": 0.46969855875618594, "learning_rate": 6.000269811087304e-06, "loss": 0.0164, "step": 1304 }, { "epoch": 0.4532823897186523, "grad_norm": 0.64817857112924, "learning_rate": 5.994756849187984e-06, "loss": 0.0224, "step": 1305 }, { "epoch": 0.4536297325460229, "grad_norm": 1.0142085570957111, "learning_rate": 5.989242627824516e-06, "loss": 0.0293, "step": 1306 }, { "epoch": 0.45397707537339355, "grad_norm": 0.44811883050529416, "learning_rate": 5.983727153978467e-06, "loss": 0.0188, "step": 1307 }, { "epoch": 0.45432441820076414, "grad_norm": 0.8638051888219721, "learning_rate": 5.978210434632996e-06, "loss": 0.0233, "step": 1308 }, { "epoch": 0.4546717610281348, "grad_norm": 0.7100938005096301, "learning_rate": 5.97269247677283e-06, "loss": 0.0197, "step": 1309 }, { "epoch": 0.45501910385550537, "grad_norm": 0.5543422204245909, "learning_rate": 5.967173287384275e-06, "loss": 0.0163, "step": 1310 }, { "epoch": 0.455366446682876, "grad_norm": 0.8347623418009196, "learning_rate": 5.961652873455186e-06, "loss": 0.0142, "step": 1311 }, { "epoch": 0.4557137895102466, "grad_norm": 0.2539498893920443, "learning_rate": 5.956131241974976e-06, "loss": 0.0132, "step": 1312 }, { "epoch": 0.45606113233761725, "grad_norm": 1.1630972005065157, "learning_rate": 5.950608399934594e-06, "loss": 0.0242, "step": 1313 }, { "epoch": 0.45640847516498784, "grad_norm": 1.0539497447728725, "learning_rate": 5.945084354326527e-06, "loss": 0.0179, "step": 1314 }, { "epoch": 0.4567558179923585, "grad_norm": 0.838017365343182, "learning_rate": 5.939559112144781e-06, "loss": 0.0265, "step": 1315 }, { "epoch": 0.45710316081972907, "grad_norm": 0.5217217424418361, "learning_rate": 5.93403268038488e-06, "loss": 0.0173, "step": 1316 }, { "epoch": 0.45745050364709966, "grad_norm": 1.049739897466213, "learning_rate": 5.928505066043852e-06, "loss": 0.0154, "step": 1317 }, { "epoch": 0.4577978464744703, "grad_norm": 0.514558882908952, "learning_rate": 5.922976276120225e-06, "loss": 0.0236, "step": 1318 }, { "epoch": 0.4581451893018409, "grad_norm": 0.8103740247236859, "learning_rate": 5.917446317614012e-06, "loss": 0.0115, "step": 1319 }, { "epoch": 0.45849253212921154, "grad_norm": 1.0239011468521235, "learning_rate": 5.911915197526709e-06, "loss": 0.0194, "step": 1320 }, { "epoch": 0.4588398749565821, "grad_norm": 0.45055190346246493, "learning_rate": 5.9063829228612805e-06, "loss": 0.0244, "step": 1321 }, { "epoch": 0.4591872177839528, "grad_norm": 0.4096700783073768, "learning_rate": 5.900849500622153e-06, "loss": 0.0192, "step": 1322 }, { "epoch": 0.45953456061132336, "grad_norm": 5.0746307124667585, "learning_rate": 5.895314937815206e-06, "loss": 0.0249, "step": 1323 }, { "epoch": 0.459881903438694, "grad_norm": 0.6287936978756935, "learning_rate": 5.889779241447765e-06, "loss": 0.0192, "step": 1324 }, { "epoch": 0.4602292462660646, "grad_norm": 0.5423899884873425, "learning_rate": 5.884242418528588e-06, "loss": 0.0219, "step": 1325 }, { "epoch": 0.46057658909343524, "grad_norm": 0.5268649486665123, "learning_rate": 5.878704476067862e-06, "loss": 0.0166, "step": 1326 }, { "epoch": 0.46092393192080583, "grad_norm": 0.922315522446586, "learning_rate": 5.873165421077186e-06, "loss": 0.0244, "step": 1327 }, { "epoch": 0.4612712747481765, "grad_norm": 0.6105099198285099, "learning_rate": 5.867625260569575e-06, "loss": 0.0278, "step": 1328 }, { "epoch": 0.46161861757554706, "grad_norm": 1.472196223378207, "learning_rate": 5.862084001559438e-06, "loss": 0.0233, "step": 1329 }, { "epoch": 0.46196596040291765, "grad_norm": 0.6221418880436498, "learning_rate": 5.85654165106258e-06, "loss": 0.0113, "step": 1330 }, { "epoch": 0.4623133032302883, "grad_norm": 0.6121127340609553, "learning_rate": 5.850998216096181e-06, "loss": 0.0249, "step": 1331 }, { "epoch": 0.4626606460576589, "grad_norm": 0.7691635062367261, "learning_rate": 5.845453703678801e-06, "loss": 0.0263, "step": 1332 }, { "epoch": 0.46300798888502953, "grad_norm": 0.4516319265859705, "learning_rate": 5.8399081208303595e-06, "loss": 0.0217, "step": 1333 }, { "epoch": 0.4633553317124001, "grad_norm": 1.0317569643624573, "learning_rate": 5.834361474572134e-06, "loss": 0.0174, "step": 1334 }, { "epoch": 0.46370267453977076, "grad_norm": 0.6672608515849126, "learning_rate": 5.828813771926746e-06, "loss": 0.0202, "step": 1335 }, { "epoch": 0.46405001736714135, "grad_norm": 2.1225607990268083, "learning_rate": 5.823265019918156e-06, "loss": 0.0202, "step": 1336 }, { "epoch": 0.464397360194512, "grad_norm": 0.5824621371505889, "learning_rate": 5.817715225571654e-06, "loss": 0.0189, "step": 1337 }, { "epoch": 0.4647447030218826, "grad_norm": 0.5556260631755074, "learning_rate": 5.812164395913848e-06, "loss": 0.02, "step": 1338 }, { "epoch": 0.46509204584925323, "grad_norm": 0.5236250786169708, "learning_rate": 5.806612537972658e-06, "loss": 0.0295, "step": 1339 }, { "epoch": 0.4654393886766238, "grad_norm": 0.915218352922461, "learning_rate": 5.801059658777303e-06, "loss": 0.0247, "step": 1340 }, { "epoch": 0.46578673150399447, "grad_norm": 0.8125906729186393, "learning_rate": 5.7955057653583e-06, "loss": 0.0168, "step": 1341 }, { "epoch": 0.46613407433136506, "grad_norm": 0.500901462927243, "learning_rate": 5.789950864747446e-06, "loss": 0.0163, "step": 1342 }, { "epoch": 0.46648141715873565, "grad_norm": 0.8445734370127098, "learning_rate": 5.784394963977815e-06, "loss": 0.0156, "step": 1343 }, { "epoch": 0.4668287599861063, "grad_norm": 0.8202499613757581, "learning_rate": 5.778838070083747e-06, "loss": 0.0165, "step": 1344 }, { "epoch": 0.4671761028134769, "grad_norm": 0.9521555125223035, "learning_rate": 5.77328019010084e-06, "loss": 0.0236, "step": 1345 }, { "epoch": 0.4675234456408475, "grad_norm": 0.3706684952737426, "learning_rate": 5.7677213310659375e-06, "loss": 0.0134, "step": 1346 }, { "epoch": 0.4678707884682181, "grad_norm": 0.4599731154160651, "learning_rate": 5.762161500017128e-06, "loss": 0.0155, "step": 1347 }, { "epoch": 0.46821813129558876, "grad_norm": 0.2427917500418682, "learning_rate": 5.756600703993725e-06, "loss": 0.0089, "step": 1348 }, { "epoch": 0.46856547412295935, "grad_norm": 0.7157300676183229, "learning_rate": 5.751038950036267e-06, "loss": 0.026, "step": 1349 }, { "epoch": 0.46891281695033, "grad_norm": 0.5005873987026797, "learning_rate": 5.745476245186506e-06, "loss": 0.0213, "step": 1350 }, { "epoch": 0.4692601597777006, "grad_norm": 0.9673326052882976, "learning_rate": 5.739912596487396e-06, "loss": 0.0269, "step": 1351 }, { "epoch": 0.4696075026050712, "grad_norm": 0.457111905319674, "learning_rate": 5.7343480109830865e-06, "loss": 0.0177, "step": 1352 }, { "epoch": 0.4699548454324418, "grad_norm": 0.8619795743501616, "learning_rate": 5.728782495718912e-06, "loss": 0.0156, "step": 1353 }, { "epoch": 0.47030218825981246, "grad_norm": 0.40333546480267507, "learning_rate": 5.7232160577413866e-06, "loss": 0.0186, "step": 1354 }, { "epoch": 0.47064953108718305, "grad_norm": 0.5088273149893132, "learning_rate": 5.717648704098191e-06, "loss": 0.0121, "step": 1355 }, { "epoch": 0.47099687391455364, "grad_norm": 0.5242704677164807, "learning_rate": 5.712080441838167e-06, "loss": 0.0162, "step": 1356 }, { "epoch": 0.4713442167419243, "grad_norm": 1.2256987250615423, "learning_rate": 5.706511278011303e-06, "loss": 0.0207, "step": 1357 }, { "epoch": 0.47169155956929487, "grad_norm": 0.9464524842305495, "learning_rate": 5.700941219668733e-06, "loss": 0.0191, "step": 1358 }, { "epoch": 0.4720389023966655, "grad_norm": 0.8126675490446509, "learning_rate": 5.6953702738627215e-06, "loss": 0.0195, "step": 1359 }, { "epoch": 0.4723862452240361, "grad_norm": 1.0495606290514148, "learning_rate": 5.689798447646657e-06, "loss": 0.0216, "step": 1360 }, { "epoch": 0.47273358805140675, "grad_norm": 0.5981699030064619, "learning_rate": 5.684225748075044e-06, "loss": 0.0143, "step": 1361 }, { "epoch": 0.47308093087877734, "grad_norm": 0.478440133230351, "learning_rate": 5.678652182203489e-06, "loss": 0.0215, "step": 1362 }, { "epoch": 0.473428273706148, "grad_norm": 0.9677500646745865, "learning_rate": 5.6730777570887e-06, "loss": 0.0195, "step": 1363 }, { "epoch": 0.4737756165335186, "grad_norm": 0.7312470814458618, "learning_rate": 5.667502479788467e-06, "loss": 0.0109, "step": 1364 }, { "epoch": 0.4741229593608892, "grad_norm": 0.36719507217506836, "learning_rate": 5.6619263573616676e-06, "loss": 0.0152, "step": 1365 }, { "epoch": 0.4744703021882598, "grad_norm": 0.503824691086867, "learning_rate": 5.6563493968682405e-06, "loss": 0.021, "step": 1366 }, { "epoch": 0.47481764501563045, "grad_norm": 1.1250583221317176, "learning_rate": 5.6507716053691916e-06, "loss": 0.0259, "step": 1367 }, { "epoch": 0.47516498784300104, "grad_norm": 0.574390097728659, "learning_rate": 5.645192989926577e-06, "loss": 0.0237, "step": 1368 }, { "epoch": 0.47551233067037163, "grad_norm": 1.5251526377041664, "learning_rate": 5.639613557603494e-06, "loss": 0.0281, "step": 1369 }, { "epoch": 0.4758596734977423, "grad_norm": 0.6785324566307315, "learning_rate": 5.634033315464076e-06, "loss": 0.0124, "step": 1370 }, { "epoch": 0.47620701632511286, "grad_norm": 0.5227461607632101, "learning_rate": 5.628452270573483e-06, "loss": 0.0177, "step": 1371 }, { "epoch": 0.4765543591524835, "grad_norm": 1.1646192631978813, "learning_rate": 5.6228704299978905e-06, "loss": 0.0241, "step": 1372 }, { "epoch": 0.4769017019798541, "grad_norm": 0.5872534337661728, "learning_rate": 5.617287800804478e-06, "loss": 0.0218, "step": 1373 }, { "epoch": 0.47724904480722474, "grad_norm": 0.5225710058655455, "learning_rate": 5.61170439006143e-06, "loss": 0.0223, "step": 1374 }, { "epoch": 0.47759638763459533, "grad_norm": 1.1334161288347557, "learning_rate": 5.6061202048379125e-06, "loss": 0.0254, "step": 1375 }, { "epoch": 0.477943730461966, "grad_norm": 1.0865327629480914, "learning_rate": 5.600535252204081e-06, "loss": 0.0237, "step": 1376 }, { "epoch": 0.47829107328933657, "grad_norm": 1.279397014560853, "learning_rate": 5.5949495392310535e-06, "loss": 0.0286, "step": 1377 }, { "epoch": 0.4786384161167072, "grad_norm": 0.32759908340107924, "learning_rate": 5.589363072990921e-06, "loss": 0.0149, "step": 1378 }, { "epoch": 0.4789857589440778, "grad_norm": 0.5810913478185202, "learning_rate": 5.583775860556717e-06, "loss": 0.0225, "step": 1379 }, { "epoch": 0.47933310177144844, "grad_norm": 0.3907616220341205, "learning_rate": 5.578187909002428e-06, "loss": 0.0246, "step": 1380 }, { "epoch": 0.47968044459881903, "grad_norm": 0.5016926806740227, "learning_rate": 5.572599225402974e-06, "loss": 0.0196, "step": 1381 }, { "epoch": 0.4800277874261896, "grad_norm": 0.5103118178433353, "learning_rate": 5.567009816834199e-06, "loss": 0.0155, "step": 1382 }, { "epoch": 0.48037513025356027, "grad_norm": 0.9141679829913468, "learning_rate": 5.561419690372869e-06, "loss": 0.025, "step": 1383 }, { "epoch": 0.48072247308093086, "grad_norm": 0.5420971166376601, "learning_rate": 5.555828853096656e-06, "loss": 0.0164, "step": 1384 }, { "epoch": 0.4810698159083015, "grad_norm": 0.6355535194466504, "learning_rate": 5.5502373120841346e-06, "loss": 0.0214, "step": 1385 }, { "epoch": 0.4814171587356721, "grad_norm": 0.3671380997420511, "learning_rate": 5.544645074414768e-06, "loss": 0.0149, "step": 1386 }, { "epoch": 0.48176450156304274, "grad_norm": 0.8978543017268891, "learning_rate": 5.539052147168903e-06, "loss": 0.0133, "step": 1387 }, { "epoch": 0.4821118443904133, "grad_norm": 0.9548917687698516, "learning_rate": 5.533458537427758e-06, "loss": 0.0214, "step": 1388 }, { "epoch": 0.48245918721778397, "grad_norm": 0.8756025165481638, "learning_rate": 5.5278642522734175e-06, "loss": 0.0225, "step": 1389 }, { "epoch": 0.48280653004515456, "grad_norm": 0.9092163623639533, "learning_rate": 5.52226929878882e-06, "loss": 0.0146, "step": 1390 }, { "epoch": 0.4831538728725252, "grad_norm": 0.39815894527941675, "learning_rate": 5.516673684057747e-06, "loss": 0.0166, "step": 1391 }, { "epoch": 0.4835012156998958, "grad_norm": 0.2966587066038902, "learning_rate": 5.511077415164825e-06, "loss": 0.0159, "step": 1392 }, { "epoch": 0.48384855852726644, "grad_norm": 0.4884869491519517, "learning_rate": 5.505480499195502e-06, "loss": 0.0166, "step": 1393 }, { "epoch": 0.484195901354637, "grad_norm": 0.35698741794966127, "learning_rate": 5.499882943236045e-06, "loss": 0.0114, "step": 1394 }, { "epoch": 0.4845432441820076, "grad_norm": 0.7090902718663725, "learning_rate": 5.494284754373538e-06, "loss": 0.0146, "step": 1395 }, { "epoch": 0.48489058700937826, "grad_norm": 0.5026358347696848, "learning_rate": 5.488685939695862e-06, "loss": 0.0148, "step": 1396 }, { "epoch": 0.48523792983674885, "grad_norm": 0.7545042851491367, "learning_rate": 5.4830865062916835e-06, "loss": 0.022, "step": 1397 }, { "epoch": 0.4855852726641195, "grad_norm": 0.5369844064837273, "learning_rate": 5.477486461250469e-06, "loss": 0.0209, "step": 1398 }, { "epoch": 0.4859326154914901, "grad_norm": 0.6861158062273787, "learning_rate": 5.471885811662442e-06, "loss": 0.0271, "step": 1399 }, { "epoch": 0.4862799583188607, "grad_norm": 0.7673744461691656, "learning_rate": 5.466284564618603e-06, "loss": 0.0296, "step": 1400 }, { "epoch": 0.4866273011462313, "grad_norm": 0.5475029370005332, "learning_rate": 5.460682727210702e-06, "loss": 0.0248, "step": 1401 }, { "epoch": 0.48697464397360196, "grad_norm": 0.9512795806158605, "learning_rate": 5.455080306531244e-06, "loss": 0.0218, "step": 1402 }, { "epoch": 0.48732198680097255, "grad_norm": 0.5444903179080952, "learning_rate": 5.449477309673462e-06, "loss": 0.0191, "step": 1403 }, { "epoch": 0.4876693296283432, "grad_norm": 0.8861826575673567, "learning_rate": 5.443873743731331e-06, "loss": 0.0109, "step": 1404 }, { "epoch": 0.4880166724557138, "grad_norm": 0.9771566586732063, "learning_rate": 5.438269615799534e-06, "loss": 0.0197, "step": 1405 }, { "epoch": 0.48836401528308443, "grad_norm": 1.9427129264091787, "learning_rate": 5.432664932973474e-06, "loss": 0.0282, "step": 1406 }, { "epoch": 0.488711358110455, "grad_norm": 1.7104554294560776, "learning_rate": 5.427059702349255e-06, "loss": 0.0204, "step": 1407 }, { "epoch": 0.4890587009378256, "grad_norm": 1.526360473399325, "learning_rate": 5.4214539310236716e-06, "loss": 0.0235, "step": 1408 }, { "epoch": 0.48940604376519625, "grad_norm": 0.4997923973605721, "learning_rate": 5.4158476260942075e-06, "loss": 0.0174, "step": 1409 }, { "epoch": 0.48975338659256684, "grad_norm": 0.4525414144834577, "learning_rate": 5.410240794659016e-06, "loss": 0.0195, "step": 1410 }, { "epoch": 0.4901007294199375, "grad_norm": 0.410000828468513, "learning_rate": 5.4046334438169245e-06, "loss": 0.018, "step": 1411 }, { "epoch": 0.4904480722473081, "grad_norm": 0.8060585957623981, "learning_rate": 5.39902558066741e-06, "loss": 0.0186, "step": 1412 }, { "epoch": 0.4907954150746787, "grad_norm": 2.1170543916864037, "learning_rate": 5.393417212310605e-06, "loss": 0.0307, "step": 1413 }, { "epoch": 0.4911427579020493, "grad_norm": 0.8364632533994034, "learning_rate": 5.387808345847277e-06, "loss": 0.0214, "step": 1414 }, { "epoch": 0.49149010072941995, "grad_norm": 1.3428878485248303, "learning_rate": 5.382198988378829e-06, "loss": 0.0218, "step": 1415 }, { "epoch": 0.49183744355679054, "grad_norm": 0.6747820600344949, "learning_rate": 5.376589147007279e-06, "loss": 0.0184, "step": 1416 }, { "epoch": 0.4921847863841612, "grad_norm": 0.8594425526853627, "learning_rate": 5.3709788288352615e-06, "loss": 0.0182, "step": 1417 }, { "epoch": 0.4925321292115318, "grad_norm": 0.5487772743825399, "learning_rate": 5.365368040966016e-06, "loss": 0.0221, "step": 1418 }, { "epoch": 0.4928794720389024, "grad_norm": 0.8978739980804368, "learning_rate": 5.359756790503376e-06, "loss": 0.0161, "step": 1419 }, { "epoch": 0.493226814866273, "grad_norm": 1.165143564643093, "learning_rate": 5.354145084551757e-06, "loss": 0.0215, "step": 1420 }, { "epoch": 0.4935741576936436, "grad_norm": 0.8557699204572717, "learning_rate": 5.348532930216157e-06, "loss": 0.0239, "step": 1421 }, { "epoch": 0.49392150052101425, "grad_norm": 1.203420881332666, "learning_rate": 5.342920334602137e-06, "loss": 0.0199, "step": 1422 }, { "epoch": 0.49426884334838483, "grad_norm": 0.49131730078379343, "learning_rate": 5.337307304815817e-06, "loss": 0.0211, "step": 1423 }, { "epoch": 0.4946161861757555, "grad_norm": 0.7833574610940411, "learning_rate": 5.331693847963871e-06, "loss": 0.0167, "step": 1424 }, { "epoch": 0.49496352900312607, "grad_norm": 0.4454425944376909, "learning_rate": 5.32607997115351e-06, "loss": 0.0218, "step": 1425 }, { "epoch": 0.4953108718304967, "grad_norm": 0.4471730702798169, "learning_rate": 5.320465681492478e-06, "loss": 0.016, "step": 1426 }, { "epoch": 0.4956582146578673, "grad_norm": 0.9677891658268274, "learning_rate": 5.31485098608904e-06, "loss": 0.0142, "step": 1427 }, { "epoch": 0.49600555748523795, "grad_norm": 0.5375414719758962, "learning_rate": 5.309235892051976e-06, "loss": 0.0147, "step": 1428 }, { "epoch": 0.49635290031260854, "grad_norm": 0.5293248840266622, "learning_rate": 5.303620406490573e-06, "loss": 0.0155, "step": 1429 }, { "epoch": 0.4967002431399792, "grad_norm": 0.7923702737792816, "learning_rate": 5.298004536514606e-06, "loss": 0.0137, "step": 1430 }, { "epoch": 0.49704758596734977, "grad_norm": 1.2096835572281968, "learning_rate": 5.292388289234349e-06, "loss": 0.0227, "step": 1431 }, { "epoch": 0.4973949287947204, "grad_norm": 0.9552234758771334, "learning_rate": 5.286771671760541e-06, "loss": 0.0318, "step": 1432 }, { "epoch": 0.497742271622091, "grad_norm": 0.9769572098077361, "learning_rate": 5.2811546912044e-06, "loss": 0.0211, "step": 1433 }, { "epoch": 0.4980896144494616, "grad_norm": 1.3870302191088562, "learning_rate": 5.275537354677595e-06, "loss": 0.0262, "step": 1434 }, { "epoch": 0.49843695727683224, "grad_norm": 0.455287552427971, "learning_rate": 5.2699196692922546e-06, "loss": 0.0153, "step": 1435 }, { "epoch": 0.4987843001042028, "grad_norm": 0.45542023402718185, "learning_rate": 5.264301642160939e-06, "loss": 0.0133, "step": 1436 }, { "epoch": 0.49913164293157347, "grad_norm": 0.5647376621549762, "learning_rate": 5.2586832803966525e-06, "loss": 0.0183, "step": 1437 }, { "epoch": 0.49947898575894406, "grad_norm": 0.4851885791856978, "learning_rate": 5.2530645911128135e-06, "loss": 0.0159, "step": 1438 }, { "epoch": 0.4998263285863147, "grad_norm": 0.8448863799291079, "learning_rate": 5.247445581423257e-06, "loss": 0.015, "step": 1439 }, { "epoch": 0.5001736714136853, "grad_norm": 0.9930674011154853, "learning_rate": 5.24182625844223e-06, "loss": 0.0191, "step": 1440 }, { "epoch": 0.5005210142410559, "grad_norm": 0.9939816699848936, "learning_rate": 5.236206629284367e-06, "loss": 0.0147, "step": 1441 }, { "epoch": 0.5008683570684266, "grad_norm": 0.8530137576376285, "learning_rate": 5.2305867010646975e-06, "loss": 0.0131, "step": 1442 }, { "epoch": 0.5012156998957972, "grad_norm": 0.6190001699293848, "learning_rate": 5.224966480898624e-06, "loss": 0.0205, "step": 1443 }, { "epoch": 0.5015630427231678, "grad_norm": 0.5694946459177951, "learning_rate": 5.219345975901925e-06, "loss": 0.0125, "step": 1444 }, { "epoch": 0.5019103855505384, "grad_norm": 0.6351471531109476, "learning_rate": 5.2137251931907315e-06, "loss": 0.0133, "step": 1445 }, { "epoch": 0.5022577283779089, "grad_norm": 0.9258615858237176, "learning_rate": 5.208104139881537e-06, "loss": 0.0228, "step": 1446 }, { "epoch": 0.5026050712052796, "grad_norm": 1.1728546136592717, "learning_rate": 5.202482823091165e-06, "loss": 0.027, "step": 1447 }, { "epoch": 0.5029524140326502, "grad_norm": 0.4906390164717808, "learning_rate": 5.196861249936782e-06, "loss": 0.0164, "step": 1448 }, { "epoch": 0.5032997568600208, "grad_norm": 0.8785844081111323, "learning_rate": 5.191239427535876e-06, "loss": 0.03, "step": 1449 }, { "epoch": 0.5036470996873914, "grad_norm": 0.47561204331956825, "learning_rate": 5.185617363006249e-06, "loss": 0.0164, "step": 1450 }, { "epoch": 0.5039944425147621, "grad_norm": 1.0364600327319036, "learning_rate": 5.179995063466011e-06, "loss": 0.0224, "step": 1451 }, { "epoch": 0.5043417853421327, "grad_norm": 0.4607390170994428, "learning_rate": 5.174372536033572e-06, "loss": 0.0186, "step": 1452 }, { "epoch": 0.5046891281695033, "grad_norm": 1.258104528985896, "learning_rate": 5.168749787827625e-06, "loss": 0.0207, "step": 1453 }, { "epoch": 0.5050364709968739, "grad_norm": 0.4431006510218478, "learning_rate": 5.163126825967147e-06, "loss": 0.0248, "step": 1454 }, { "epoch": 0.5053838138242446, "grad_norm": 1.0796514546664886, "learning_rate": 5.157503657571386e-06, "loss": 0.0226, "step": 1455 }, { "epoch": 0.5057311566516152, "grad_norm": 0.7305079505359174, "learning_rate": 5.151880289759847e-06, "loss": 0.0162, "step": 1456 }, { "epoch": 0.5060784994789858, "grad_norm": 0.5142991616801853, "learning_rate": 5.14625672965229e-06, "loss": 0.0231, "step": 1457 }, { "epoch": 0.5064258423063563, "grad_norm": 0.3053355660863188, "learning_rate": 5.140632984368721e-06, "loss": 0.0169, "step": 1458 }, { "epoch": 0.5067731851337269, "grad_norm": 0.6123341558370508, "learning_rate": 5.1350090610293765e-06, "loss": 0.0229, "step": 1459 }, { "epoch": 0.5071205279610976, "grad_norm": 0.4030206492046645, "learning_rate": 5.12938496675472e-06, "loss": 0.0175, "step": 1460 }, { "epoch": 0.5074678707884682, "grad_norm": 1.0886198281053343, "learning_rate": 5.123760708665432e-06, "loss": 0.0186, "step": 1461 }, { "epoch": 0.5078152136158388, "grad_norm": 0.48403013770913567, "learning_rate": 5.1181362938823995e-06, "loss": 0.0186, "step": 1462 }, { "epoch": 0.5081625564432094, "grad_norm": 0.740876359392756, "learning_rate": 5.112511729526708e-06, "loss": 0.0202, "step": 1463 }, { "epoch": 0.5085098992705801, "grad_norm": 1.3075365516854722, "learning_rate": 5.106887022719633e-06, "loss": 0.0182, "step": 1464 }, { "epoch": 0.5088572420979507, "grad_norm": 0.5392655690852264, "learning_rate": 5.101262180582628e-06, "loss": 0.0274, "step": 1465 }, { "epoch": 0.5092045849253213, "grad_norm": 0.7466155039978699, "learning_rate": 5.095637210237324e-06, "loss": 0.0198, "step": 1466 }, { "epoch": 0.5095519277526919, "grad_norm": 0.4642989380683977, "learning_rate": 5.090012118805505e-06, "loss": 0.0235, "step": 1467 }, { "epoch": 0.5098992705800626, "grad_norm": 0.5150423584413911, "learning_rate": 5.084386913409118e-06, "loss": 0.0256, "step": 1468 }, { "epoch": 0.5102466134074332, "grad_norm": 0.5093614850962735, "learning_rate": 5.0787616011702455e-06, "loss": 0.0195, "step": 1469 }, { "epoch": 0.5105939562348037, "grad_norm": 0.7660637501450626, "learning_rate": 5.073136189211114e-06, "loss": 0.0176, "step": 1470 }, { "epoch": 0.5109412990621743, "grad_norm": 1.0473364217411674, "learning_rate": 5.067510684654069e-06, "loss": 0.0261, "step": 1471 }, { "epoch": 0.5112886418895449, "grad_norm": 0.8782397765062985, "learning_rate": 5.061885094621575e-06, "loss": 0.0199, "step": 1472 }, { "epoch": 0.5116359847169156, "grad_norm": 0.4607201639452861, "learning_rate": 5.056259426236207e-06, "loss": 0.0146, "step": 1473 }, { "epoch": 0.5119833275442862, "grad_norm": 0.6661850564864881, "learning_rate": 5.05063368662064e-06, "loss": 0.0179, "step": 1474 }, { "epoch": 0.5123306703716568, "grad_norm": 0.4772014886025317, "learning_rate": 5.0450078828976326e-06, "loss": 0.0198, "step": 1475 }, { "epoch": 0.5126780131990274, "grad_norm": 1.305023780400015, "learning_rate": 5.0393820221900325e-06, "loss": 0.0219, "step": 1476 }, { "epoch": 0.5130253560263981, "grad_norm": 0.8537297309135922, "learning_rate": 5.0337561116207546e-06, "loss": 0.0191, "step": 1477 }, { "epoch": 0.5133726988537687, "grad_norm": 0.5326568118271483, "learning_rate": 5.028130158312779e-06, "loss": 0.0246, "step": 1478 }, { "epoch": 0.5137200416811393, "grad_norm": 0.3866710810781936, "learning_rate": 5.02250416938914e-06, "loss": 0.0178, "step": 1479 }, { "epoch": 0.5140673845085099, "grad_norm": 0.22396563755414556, "learning_rate": 5.016878151972915e-06, "loss": 0.0123, "step": 1480 }, { "epoch": 0.5144147273358806, "grad_norm": 0.4451777326858171, "learning_rate": 5.01125211318722e-06, "loss": 0.0214, "step": 1481 }, { "epoch": 0.5147620701632512, "grad_norm": 0.6070766959527533, "learning_rate": 5.005626060155194e-06, "loss": 0.0252, "step": 1482 }, { "epoch": 0.5151094129906217, "grad_norm": 1.0407371235155332, "learning_rate": 5e-06, "loss": 0.0221, "step": 1483 }, { "epoch": 0.5154567558179923, "grad_norm": 0.9630652449889104, "learning_rate": 4.994373939844807e-06, "loss": 0.0292, "step": 1484 }, { "epoch": 0.5158040986453629, "grad_norm": 0.5372033115683907, "learning_rate": 4.988747886812781e-06, "loss": 0.0332, "step": 1485 }, { "epoch": 0.5161514414727336, "grad_norm": 0.42093625107990457, "learning_rate": 4.983121848027088e-06, "loss": 0.0154, "step": 1486 }, { "epoch": 0.5164987843001042, "grad_norm": 0.5437961266541049, "learning_rate": 4.977495830610862e-06, "loss": 0.0162, "step": 1487 }, { "epoch": 0.5168461271274748, "grad_norm": 0.6036534712036593, "learning_rate": 4.9718698416872215e-06, "loss": 0.0267, "step": 1488 }, { "epoch": 0.5171934699548454, "grad_norm": 0.531411114394444, "learning_rate": 4.966243888379245e-06, "loss": 0.0273, "step": 1489 }, { "epoch": 0.5175408127822161, "grad_norm": 0.907790815597591, "learning_rate": 4.96061797780997e-06, "loss": 0.0143, "step": 1490 }, { "epoch": 0.5178881556095867, "grad_norm": 1.0791005798691276, "learning_rate": 4.954992117102369e-06, "loss": 0.0203, "step": 1491 }, { "epoch": 0.5182354984369573, "grad_norm": 1.0050228768407403, "learning_rate": 4.949366313379362e-06, "loss": 0.0159, "step": 1492 }, { "epoch": 0.5185828412643279, "grad_norm": 0.31620179318811703, "learning_rate": 4.943740573763794e-06, "loss": 0.0181, "step": 1493 }, { "epoch": 0.5189301840916986, "grad_norm": 0.693985447212686, "learning_rate": 4.938114905378428e-06, "loss": 0.0143, "step": 1494 }, { "epoch": 0.5192775269190691, "grad_norm": 0.9981712152035553, "learning_rate": 4.932489315345933e-06, "loss": 0.022, "step": 1495 }, { "epoch": 0.5196248697464397, "grad_norm": 0.47907771487206496, "learning_rate": 4.9268638107888875e-06, "loss": 0.0158, "step": 1496 }, { "epoch": 0.5199722125738103, "grad_norm": 0.6623027870764544, "learning_rate": 4.9212383988297545e-06, "loss": 0.0134, "step": 1497 }, { "epoch": 0.5203195554011809, "grad_norm": 1.0794034174253464, "learning_rate": 4.9156130865908845e-06, "loss": 0.0292, "step": 1498 }, { "epoch": 0.5206668982285516, "grad_norm": 0.3138682448100754, "learning_rate": 4.9099878811944965e-06, "loss": 0.0155, "step": 1499 }, { "epoch": 0.5210142410559222, "grad_norm": 0.7583850057995437, "learning_rate": 4.904362789762677e-06, "loss": 0.0147, "step": 1500 }, { "epoch": 0.5213615838832928, "grad_norm": 0.7155832485484563, "learning_rate": 4.898737819417372e-06, "loss": 0.0203, "step": 1501 }, { "epoch": 0.5217089267106634, "grad_norm": 0.35672739687797744, "learning_rate": 4.893112977280369e-06, "loss": 0.0069, "step": 1502 }, { "epoch": 0.5220562695380341, "grad_norm": 0.987017131142079, "learning_rate": 4.887488270473294e-06, "loss": 0.0232, "step": 1503 }, { "epoch": 0.5224036123654047, "grad_norm": 0.34804891464014953, "learning_rate": 4.881863706117601e-06, "loss": 0.0134, "step": 1504 }, { "epoch": 0.5227509551927753, "grad_norm": 0.6251911038205994, "learning_rate": 4.876239291334568e-06, "loss": 0.0202, "step": 1505 }, { "epoch": 0.5230982980201458, "grad_norm": 1.3491600406620532, "learning_rate": 4.8706150332452815e-06, "loss": 0.0237, "step": 1506 }, { "epoch": 0.5234456408475165, "grad_norm": 0.6296332865053156, "learning_rate": 4.864990938970624e-06, "loss": 0.0162, "step": 1507 }, { "epoch": 0.5237929836748871, "grad_norm": 0.6175960968496856, "learning_rate": 4.85936701563128e-06, "loss": 0.0223, "step": 1508 }, { "epoch": 0.5241403265022577, "grad_norm": 0.6135026142466486, "learning_rate": 4.85374327034771e-06, "loss": 0.0192, "step": 1509 }, { "epoch": 0.5244876693296283, "grad_norm": 0.8443887642336403, "learning_rate": 4.848119710240156e-06, "loss": 0.0244, "step": 1510 }, { "epoch": 0.5248350121569989, "grad_norm": 0.7215133934035978, "learning_rate": 4.842496342428616e-06, "loss": 0.0306, "step": 1511 }, { "epoch": 0.5251823549843696, "grad_norm": 0.3768241222787331, "learning_rate": 4.8368731740328536e-06, "loss": 0.0139, "step": 1512 }, { "epoch": 0.5255296978117402, "grad_norm": 0.3510966580810152, "learning_rate": 4.8312502121723755e-06, "loss": 0.0137, "step": 1513 }, { "epoch": 0.5258770406391108, "grad_norm": 0.9763840468277634, "learning_rate": 4.825627463966431e-06, "loss": 0.0218, "step": 1514 }, { "epoch": 0.5262243834664814, "grad_norm": 0.9974349698601298, "learning_rate": 4.8200049365339905e-06, "loss": 0.0229, "step": 1515 }, { "epoch": 0.5265717262938521, "grad_norm": 0.395840326358885, "learning_rate": 4.814382636993753e-06, "loss": 0.0224, "step": 1516 }, { "epoch": 0.5269190691212227, "grad_norm": 0.7070158097071757, "learning_rate": 4.808760572464126e-06, "loss": 0.0237, "step": 1517 }, { "epoch": 0.5272664119485932, "grad_norm": 0.34439167819910455, "learning_rate": 4.80313875006322e-06, "loss": 0.0209, "step": 1518 }, { "epoch": 0.5276137547759638, "grad_norm": 0.333265253316496, "learning_rate": 4.7975171769088366e-06, "loss": 0.0203, "step": 1519 }, { "epoch": 0.5279610976033345, "grad_norm": 0.7184684037655478, "learning_rate": 4.791895860118465e-06, "loss": 0.0182, "step": 1520 }, { "epoch": 0.5283084404307051, "grad_norm": 0.8473043460095674, "learning_rate": 4.7862748068092685e-06, "loss": 0.0174, "step": 1521 }, { "epoch": 0.5286557832580757, "grad_norm": 0.376539100887896, "learning_rate": 4.780654024098076e-06, "loss": 0.0167, "step": 1522 }, { "epoch": 0.5290031260854463, "grad_norm": 1.1649611690839885, "learning_rate": 4.775033519101378e-06, "loss": 0.0217, "step": 1523 }, { "epoch": 0.5293504689128169, "grad_norm": 0.62058919496297, "learning_rate": 4.769413298935305e-06, "loss": 0.0136, "step": 1524 }, { "epoch": 0.5296978117401876, "grad_norm": 0.5251282587265036, "learning_rate": 4.763793370715635e-06, "loss": 0.0219, "step": 1525 }, { "epoch": 0.5300451545675582, "grad_norm": 0.6236545128248215, "learning_rate": 4.758173741557772e-06, "loss": 0.0181, "step": 1526 }, { "epoch": 0.5303924973949288, "grad_norm": 0.9945963688156065, "learning_rate": 4.752554418576744e-06, "loss": 0.0203, "step": 1527 }, { "epoch": 0.5307398402222994, "grad_norm": 0.45683681677249527, "learning_rate": 4.746935408887188e-06, "loss": 0.0094, "step": 1528 }, { "epoch": 0.5310871830496701, "grad_norm": 0.2846095731280691, "learning_rate": 4.741316719603348e-06, "loss": 0.0102, "step": 1529 }, { "epoch": 0.5314345258770407, "grad_norm": 0.3347130074226901, "learning_rate": 4.735698357839061e-06, "loss": 0.016, "step": 1530 }, { "epoch": 0.5317818687044112, "grad_norm": 0.28383945406870187, "learning_rate": 4.730080330707748e-06, "loss": 0.018, "step": 1531 }, { "epoch": 0.5321292115317818, "grad_norm": 0.39618404004198604, "learning_rate": 4.724462645322406e-06, "loss": 0.0146, "step": 1532 }, { "epoch": 0.5324765543591525, "grad_norm": 0.48107525602946016, "learning_rate": 4.718845308795601e-06, "loss": 0.0084, "step": 1533 }, { "epoch": 0.5328238971865231, "grad_norm": 0.7775999794726074, "learning_rate": 4.71322832823946e-06, "loss": 0.017, "step": 1534 }, { "epoch": 0.5331712400138937, "grad_norm": 1.069007509619891, "learning_rate": 4.707611710765654e-06, "loss": 0.0288, "step": 1535 }, { "epoch": 0.5335185828412643, "grad_norm": 0.39849903984305235, "learning_rate": 4.701995463485395e-06, "loss": 0.0117, "step": 1536 }, { "epoch": 0.5338659256686349, "grad_norm": 0.677236043933078, "learning_rate": 4.696379593509429e-06, "loss": 0.015, "step": 1537 }, { "epoch": 0.5342132684960056, "grad_norm": 0.4265486094005142, "learning_rate": 4.690764107948025e-06, "loss": 0.0122, "step": 1538 }, { "epoch": 0.5345606113233762, "grad_norm": 0.4715494638944048, "learning_rate": 4.685149013910962e-06, "loss": 0.0156, "step": 1539 }, { "epoch": 0.5349079541507468, "grad_norm": 0.7150624971216185, "learning_rate": 4.6795343185075235e-06, "loss": 0.0169, "step": 1540 }, { "epoch": 0.5352552969781174, "grad_norm": 0.6525023335363956, "learning_rate": 4.6739200288464905e-06, "loss": 0.0291, "step": 1541 }, { "epoch": 0.5356026398054881, "grad_norm": 0.47839611155737516, "learning_rate": 4.668306152036129e-06, "loss": 0.0162, "step": 1542 }, { "epoch": 0.5359499826328586, "grad_norm": 0.7470286100606716, "learning_rate": 4.662692695184184e-06, "loss": 0.026, "step": 1543 }, { "epoch": 0.5362973254602292, "grad_norm": 0.4794922661466092, "learning_rate": 4.657079665397865e-06, "loss": 0.0233, "step": 1544 }, { "epoch": 0.5366446682875998, "grad_norm": 0.7903918813270774, "learning_rate": 4.651467069783845e-06, "loss": 0.0232, "step": 1545 }, { "epoch": 0.5369920111149705, "grad_norm": 1.610986156977707, "learning_rate": 4.645854915448243e-06, "loss": 0.0298, "step": 1546 }, { "epoch": 0.5373393539423411, "grad_norm": 0.6560078832036462, "learning_rate": 4.640243209496627e-06, "loss": 0.0186, "step": 1547 }, { "epoch": 0.5376866967697117, "grad_norm": 1.6026250043381345, "learning_rate": 4.634631959033985e-06, "loss": 0.0218, "step": 1548 }, { "epoch": 0.5380340395970823, "grad_norm": 0.6898766822043673, "learning_rate": 4.62902117116474e-06, "loss": 0.017, "step": 1549 }, { "epoch": 0.5383813824244529, "grad_norm": 0.5467408900610035, "learning_rate": 4.623410852992724e-06, "loss": 0.0233, "step": 1550 }, { "epoch": 0.5387287252518236, "grad_norm": 1.0551467144246778, "learning_rate": 4.617801011621175e-06, "loss": 0.0208, "step": 1551 }, { "epoch": 0.5390760680791942, "grad_norm": 1.162941958095386, "learning_rate": 4.6121916541527235e-06, "loss": 0.0259, "step": 1552 }, { "epoch": 0.5394234109065648, "grad_norm": 0.3761300022318631, "learning_rate": 4.606582787689396e-06, "loss": 0.0185, "step": 1553 }, { "epoch": 0.5397707537339353, "grad_norm": 0.6376765879619961, "learning_rate": 4.600974419332591e-06, "loss": 0.0172, "step": 1554 }, { "epoch": 0.540118096561306, "grad_norm": 0.8179982640637906, "learning_rate": 4.595366556183079e-06, "loss": 0.0192, "step": 1555 }, { "epoch": 0.5404654393886766, "grad_norm": 0.6660409394522981, "learning_rate": 4.589759205340986e-06, "loss": 0.0201, "step": 1556 }, { "epoch": 0.5408127822160472, "grad_norm": 0.37364095824936294, "learning_rate": 4.584152373905794e-06, "loss": 0.0242, "step": 1557 }, { "epoch": 0.5411601250434178, "grad_norm": 0.6214912023671911, "learning_rate": 4.578546068976329e-06, "loss": 0.0186, "step": 1558 }, { "epoch": 0.5415074678707885, "grad_norm": 0.3570650197481583, "learning_rate": 4.572940297650747e-06, "loss": 0.0137, "step": 1559 }, { "epoch": 0.5418548106981591, "grad_norm": 0.5029530205971685, "learning_rate": 4.567335067026528e-06, "loss": 0.0144, "step": 1560 }, { "epoch": 0.5422021535255297, "grad_norm": 0.8095089739524693, "learning_rate": 4.561730384200467e-06, "loss": 0.021, "step": 1561 }, { "epoch": 0.5425494963529003, "grad_norm": 0.45079050575045515, "learning_rate": 4.556126256268671e-06, "loss": 0.0199, "step": 1562 }, { "epoch": 0.5428968391802709, "grad_norm": 0.544012994922186, "learning_rate": 4.550522690326538e-06, "loss": 0.0178, "step": 1563 }, { "epoch": 0.5432441820076416, "grad_norm": 0.4351027380527296, "learning_rate": 4.544919693468759e-06, "loss": 0.0187, "step": 1564 }, { "epoch": 0.5435915248350122, "grad_norm": 0.735577031685772, "learning_rate": 4.539317272789299e-06, "loss": 0.0176, "step": 1565 }, { "epoch": 0.5439388676623828, "grad_norm": 0.7448332665427014, "learning_rate": 4.533715435381398e-06, "loss": 0.0203, "step": 1566 }, { "epoch": 0.5442862104897533, "grad_norm": 0.8816422119244176, "learning_rate": 4.528114188337559e-06, "loss": 0.0205, "step": 1567 }, { "epoch": 0.544633553317124, "grad_norm": 0.7125240794309594, "learning_rate": 4.522513538749534e-06, "loss": 0.019, "step": 1568 }, { "epoch": 0.5449808961444946, "grad_norm": 0.6691735284658429, "learning_rate": 4.516913493708317e-06, "loss": 0.021, "step": 1569 }, { "epoch": 0.5453282389718652, "grad_norm": 0.8271292240524128, "learning_rate": 4.511314060304141e-06, "loss": 0.0189, "step": 1570 }, { "epoch": 0.5456755817992358, "grad_norm": 0.9574204304906477, "learning_rate": 4.505715245626462e-06, "loss": 0.023, "step": 1571 }, { "epoch": 0.5460229246266065, "grad_norm": 0.6547097514027208, "learning_rate": 4.500117056763956e-06, "loss": 0.0225, "step": 1572 }, { "epoch": 0.5463702674539771, "grad_norm": 1.0813224646564668, "learning_rate": 4.494519500804501e-06, "loss": 0.0173, "step": 1573 }, { "epoch": 0.5467176102813477, "grad_norm": 0.9496654452606548, "learning_rate": 4.488922584835177e-06, "loss": 0.0146, "step": 1574 }, { "epoch": 0.5470649531087183, "grad_norm": 0.49774440254371105, "learning_rate": 4.483326315942253e-06, "loss": 0.0157, "step": 1575 }, { "epoch": 0.5474122959360889, "grad_norm": 0.4088463704083283, "learning_rate": 4.477730701211183e-06, "loss": 0.017, "step": 1576 }, { "epoch": 0.5477596387634596, "grad_norm": 0.6482438150567166, "learning_rate": 4.472135747726583e-06, "loss": 0.0265, "step": 1577 }, { "epoch": 0.5481069815908302, "grad_norm": 1.1293197974606506, "learning_rate": 4.466541462572243e-06, "loss": 0.0189, "step": 1578 }, { "epoch": 0.5484543244182007, "grad_norm": 0.5822884700457213, "learning_rate": 4.460947852831097e-06, "loss": 0.0183, "step": 1579 }, { "epoch": 0.5488016672455713, "grad_norm": 0.5604041072365497, "learning_rate": 4.455354925585234e-06, "loss": 0.0283, "step": 1580 }, { "epoch": 0.549149010072942, "grad_norm": 0.44375445497432964, "learning_rate": 4.449762687915866e-06, "loss": 0.0131, "step": 1581 }, { "epoch": 0.5494963529003126, "grad_norm": 1.0388835523370303, "learning_rate": 4.444171146903345e-06, "loss": 0.016, "step": 1582 }, { "epoch": 0.5498436957276832, "grad_norm": 0.6674378343975669, "learning_rate": 4.438580309627132e-06, "loss": 0.0198, "step": 1583 }, { "epoch": 0.5501910385550538, "grad_norm": 0.39270508786634684, "learning_rate": 4.4329901831658035e-06, "loss": 0.0128, "step": 1584 }, { "epoch": 0.5505383813824245, "grad_norm": 0.7015995821207273, "learning_rate": 4.427400774597028e-06, "loss": 0.0246, "step": 1585 }, { "epoch": 0.5508857242097951, "grad_norm": 0.4993737094804665, "learning_rate": 4.421812090997573e-06, "loss": 0.0231, "step": 1586 }, { "epoch": 0.5512330670371657, "grad_norm": 0.549124903537338, "learning_rate": 4.4162241394432834e-06, "loss": 0.0219, "step": 1587 }, { "epoch": 0.5515804098645363, "grad_norm": 0.9191413987980975, "learning_rate": 4.4106369270090814e-06, "loss": 0.0128, "step": 1588 }, { "epoch": 0.5519277526919069, "grad_norm": 0.47115453283840025, "learning_rate": 4.405050460768947e-06, "loss": 0.0151, "step": 1589 }, { "epoch": 0.5522750955192776, "grad_norm": 0.5360090594122309, "learning_rate": 4.3994647477959205e-06, "loss": 0.0194, "step": 1590 }, { "epoch": 0.5526224383466481, "grad_norm": 0.6178093745994127, "learning_rate": 4.393879795162088e-06, "loss": 0.0185, "step": 1591 }, { "epoch": 0.5529697811740187, "grad_norm": 0.5705601551799697, "learning_rate": 4.388295609938572e-06, "loss": 0.0194, "step": 1592 }, { "epoch": 0.5533171240013893, "grad_norm": 0.6254120310087067, "learning_rate": 4.3827121991955235e-06, "loss": 0.0181, "step": 1593 }, { "epoch": 0.55366446682876, "grad_norm": 0.3870391915004128, "learning_rate": 4.37712957000211e-06, "loss": 0.0148, "step": 1594 }, { "epoch": 0.5540118096561306, "grad_norm": 0.9142400891756279, "learning_rate": 4.371547729426517e-06, "loss": 0.0171, "step": 1595 }, { "epoch": 0.5543591524835012, "grad_norm": 0.49985847505258046, "learning_rate": 4.365966684535925e-06, "loss": 0.0201, "step": 1596 }, { "epoch": 0.5547064953108718, "grad_norm": 1.3223267741569067, "learning_rate": 4.360386442396508e-06, "loss": 0.0191, "step": 1597 }, { "epoch": 0.5550538381382425, "grad_norm": 0.3660339615210978, "learning_rate": 4.354807010073425e-06, "loss": 0.0116, "step": 1598 }, { "epoch": 0.5554011809656131, "grad_norm": 1.0210647191568252, "learning_rate": 4.349228394630808e-06, "loss": 0.0294, "step": 1599 }, { "epoch": 0.5557485237929837, "grad_norm": 0.5885533492032752, "learning_rate": 4.3436506031317594e-06, "loss": 0.0175, "step": 1600 }, { "epoch": 0.5560958666203543, "grad_norm": 0.6342172768582458, "learning_rate": 4.338073642638334e-06, "loss": 0.014, "step": 1601 }, { "epoch": 0.5564432094477249, "grad_norm": 0.5569542724926432, "learning_rate": 4.3324975202115345e-06, "loss": 0.0195, "step": 1602 }, { "epoch": 0.5567905522750956, "grad_norm": 0.7223546169720154, "learning_rate": 4.326922242911302e-06, "loss": 0.0142, "step": 1603 }, { "epoch": 0.5571378951024661, "grad_norm": 0.3683037424124626, "learning_rate": 4.321347817796511e-06, "loss": 0.0196, "step": 1604 }, { "epoch": 0.5574852379298367, "grad_norm": 1.2092694294770592, "learning_rate": 4.3157742519249576e-06, "loss": 0.0251, "step": 1605 }, { "epoch": 0.5578325807572073, "grad_norm": 0.4910970055629151, "learning_rate": 4.3102015523533436e-06, "loss": 0.0164, "step": 1606 }, { "epoch": 0.558179923584578, "grad_norm": 0.36288578186019704, "learning_rate": 4.304629726137279e-06, "loss": 0.0142, "step": 1607 }, { "epoch": 0.5585272664119486, "grad_norm": 0.5776094098850557, "learning_rate": 4.299058780331267e-06, "loss": 0.0147, "step": 1608 }, { "epoch": 0.5588746092393192, "grad_norm": 0.4606276309503416, "learning_rate": 4.293488721988698e-06, "loss": 0.0171, "step": 1609 }, { "epoch": 0.5592219520666898, "grad_norm": 1.9663566917991069, "learning_rate": 4.287919558161835e-06, "loss": 0.0173, "step": 1610 }, { "epoch": 0.5595692948940605, "grad_norm": 0.9161917744112766, "learning_rate": 4.28235129590181e-06, "loss": 0.0164, "step": 1611 }, { "epoch": 0.5599166377214311, "grad_norm": 0.3841077769256294, "learning_rate": 4.276783942258613e-06, "loss": 0.0136, "step": 1612 }, { "epoch": 0.5602639805488017, "grad_norm": 0.40125517813129213, "learning_rate": 4.27121750428109e-06, "loss": 0.0181, "step": 1613 }, { "epoch": 0.5606113233761723, "grad_norm": 0.8054623880288195, "learning_rate": 4.265651989016915e-06, "loss": 0.019, "step": 1614 }, { "epoch": 0.5609586662035428, "grad_norm": 0.9023479920127903, "learning_rate": 4.260087403512605e-06, "loss": 0.0185, "step": 1615 }, { "epoch": 0.5613060090309135, "grad_norm": 0.4648304244403317, "learning_rate": 4.254523754813495e-06, "loss": 0.0193, "step": 1616 }, { "epoch": 0.5616533518582841, "grad_norm": 0.9102616101626732, "learning_rate": 4.2489610499637346e-06, "loss": 0.0173, "step": 1617 }, { "epoch": 0.5620006946856547, "grad_norm": 0.5742925429597601, "learning_rate": 4.243399296006276e-06, "loss": 0.0224, "step": 1618 }, { "epoch": 0.5623480375130253, "grad_norm": 1.1494398692641448, "learning_rate": 4.237838499982874e-06, "loss": 0.0195, "step": 1619 }, { "epoch": 0.562695380340396, "grad_norm": 0.8792697527207582, "learning_rate": 4.232278668934063e-06, "loss": 0.0135, "step": 1620 }, { "epoch": 0.5630427231677666, "grad_norm": 0.46740643343738697, "learning_rate": 4.226719809899163e-06, "loss": 0.0199, "step": 1621 }, { "epoch": 0.5633900659951372, "grad_norm": 1.1632556584909721, "learning_rate": 4.221161929916255e-06, "loss": 0.0304, "step": 1622 }, { "epoch": 0.5637374088225078, "grad_norm": 0.5407257650582535, "learning_rate": 4.2156050360221855e-06, "loss": 0.0109, "step": 1623 }, { "epoch": 0.5640847516498785, "grad_norm": 0.6586876902269023, "learning_rate": 4.210049135252554e-06, "loss": 0.0149, "step": 1624 }, { "epoch": 0.5644320944772491, "grad_norm": 0.7069930675788888, "learning_rate": 4.204494234641701e-06, "loss": 0.0099, "step": 1625 }, { "epoch": 0.5647794373046197, "grad_norm": 0.8706649712202481, "learning_rate": 4.198940341222699e-06, "loss": 0.0259, "step": 1626 }, { "epoch": 0.5651267801319902, "grad_norm": 0.4998724109402863, "learning_rate": 4.193387462027343e-06, "loss": 0.019, "step": 1627 }, { "epoch": 0.5654741229593608, "grad_norm": 0.6031533765692226, "learning_rate": 4.1878356040861525e-06, "loss": 0.0192, "step": 1628 }, { "epoch": 0.5658214657867315, "grad_norm": 0.7768746063951399, "learning_rate": 4.182284774428348e-06, "loss": 0.0232, "step": 1629 }, { "epoch": 0.5661688086141021, "grad_norm": 0.46684861318924753, "learning_rate": 4.176734980081845e-06, "loss": 0.0174, "step": 1630 }, { "epoch": 0.5665161514414727, "grad_norm": 1.2406437022629773, "learning_rate": 4.171186228073256e-06, "loss": 0.026, "step": 1631 }, { "epoch": 0.5668634942688433, "grad_norm": 0.42437690132062417, "learning_rate": 4.165638525427867e-06, "loss": 0.0181, "step": 1632 }, { "epoch": 0.567210837096214, "grad_norm": 0.4618761042287134, "learning_rate": 4.160091879169642e-06, "loss": 0.0179, "step": 1633 }, { "epoch": 0.5675581799235846, "grad_norm": 1.1659549434015535, "learning_rate": 4.154546296321201e-06, "loss": 0.016, "step": 1634 }, { "epoch": 0.5679055227509552, "grad_norm": 0.8666502913074675, "learning_rate": 4.14900178390382e-06, "loss": 0.0216, "step": 1635 }, { "epoch": 0.5682528655783258, "grad_norm": 1.5564805764903915, "learning_rate": 4.143458348937421e-06, "loss": 0.021, "step": 1636 }, { "epoch": 0.5686002084056965, "grad_norm": 1.1023216366479238, "learning_rate": 4.137915998440564e-06, "loss": 0.0182, "step": 1637 }, { "epoch": 0.5689475512330671, "grad_norm": 0.6297911166518874, "learning_rate": 4.132374739430427e-06, "loss": 0.0238, "step": 1638 }, { "epoch": 0.5692948940604377, "grad_norm": 1.2566214150953217, "learning_rate": 4.126834578922816e-06, "loss": 0.0165, "step": 1639 }, { "epoch": 0.5696422368878082, "grad_norm": 0.49215545540415767, "learning_rate": 4.121295523932141e-06, "loss": 0.013, "step": 1640 }, { "epoch": 0.5699895797151788, "grad_norm": 0.4038473277561727, "learning_rate": 4.115757581471412e-06, "loss": 0.0156, "step": 1641 }, { "epoch": 0.5703369225425495, "grad_norm": 0.5382862896323736, "learning_rate": 4.110220758552236e-06, "loss": 0.0329, "step": 1642 }, { "epoch": 0.5706842653699201, "grad_norm": 1.1409766712611276, "learning_rate": 4.104685062184795e-06, "loss": 0.0222, "step": 1643 }, { "epoch": 0.5710316081972907, "grad_norm": 1.0556759974525016, "learning_rate": 4.0991504993778485e-06, "loss": 0.0267, "step": 1644 }, { "epoch": 0.5713789510246613, "grad_norm": 0.3881030370366468, "learning_rate": 4.09361707713872e-06, "loss": 0.0194, "step": 1645 }, { "epoch": 0.571726293852032, "grad_norm": 0.4925901427231959, "learning_rate": 4.088084802473294e-06, "loss": 0.0195, "step": 1646 }, { "epoch": 0.5720736366794026, "grad_norm": 0.5519961509625639, "learning_rate": 4.0825536823859895e-06, "loss": 0.0244, "step": 1647 }, { "epoch": 0.5724209795067732, "grad_norm": 0.30715357160720397, "learning_rate": 4.077023723879777e-06, "loss": 0.0107, "step": 1648 }, { "epoch": 0.5727683223341438, "grad_norm": 0.5494519120842487, "learning_rate": 4.0714949339561495e-06, "loss": 0.0167, "step": 1649 }, { "epoch": 0.5731156651615145, "grad_norm": 0.3374000317637518, "learning_rate": 4.065967319615123e-06, "loss": 0.0126, "step": 1650 }, { "epoch": 0.573463007988885, "grad_norm": 0.6301590836558902, "learning_rate": 4.06044088785522e-06, "loss": 0.0122, "step": 1651 }, { "epoch": 0.5738103508162556, "grad_norm": 1.309711699788591, "learning_rate": 4.054915645673475e-06, "loss": 0.0225, "step": 1652 }, { "epoch": 0.5741576936436262, "grad_norm": 0.8057778580750404, "learning_rate": 4.049391600065407e-06, "loss": 0.0324, "step": 1653 }, { "epoch": 0.5745050364709968, "grad_norm": 0.6846173645835267, "learning_rate": 4.043868758025027e-06, "loss": 0.0149, "step": 1654 }, { "epoch": 0.5748523792983675, "grad_norm": 0.983006587398994, "learning_rate": 4.038347126544816e-06, "loss": 0.0202, "step": 1655 }, { "epoch": 0.5751997221257381, "grad_norm": 0.4000451176994636, "learning_rate": 4.032826712615727e-06, "loss": 0.0146, "step": 1656 }, { "epoch": 0.5755470649531087, "grad_norm": 0.6557928230605302, "learning_rate": 4.02730752322717e-06, "loss": 0.0185, "step": 1657 }, { "epoch": 0.5758944077804793, "grad_norm": 0.39754184151942606, "learning_rate": 4.021789565367007e-06, "loss": 0.0137, "step": 1658 }, { "epoch": 0.57624175060785, "grad_norm": 0.4884113890090465, "learning_rate": 4.016272846021534e-06, "loss": 0.0227, "step": 1659 }, { "epoch": 0.5765890934352206, "grad_norm": 0.7745976315929322, "learning_rate": 4.010757372175485e-06, "loss": 0.0208, "step": 1660 }, { "epoch": 0.5769364362625912, "grad_norm": 0.4156533149774267, "learning_rate": 4.005243150812017e-06, "loss": 0.0202, "step": 1661 }, { "epoch": 0.5772837790899618, "grad_norm": 0.5057128862814502, "learning_rate": 3.999730188912698e-06, "loss": 0.0164, "step": 1662 }, { "epoch": 0.5776311219173325, "grad_norm": 0.3684895934447478, "learning_rate": 3.994218493457503e-06, "loss": 0.0181, "step": 1663 }, { "epoch": 0.577978464744703, "grad_norm": 0.3642952572602709, "learning_rate": 3.988708071424803e-06, "loss": 0.013, "step": 1664 }, { "epoch": 0.5783258075720736, "grad_norm": 0.9994996067257631, "learning_rate": 3.983198929791357e-06, "loss": 0.0249, "step": 1665 }, { "epoch": 0.5786731503994442, "grad_norm": 0.3528391599891256, "learning_rate": 3.977691075532305e-06, "loss": 0.0134, "step": 1666 }, { "epoch": 0.5790204932268148, "grad_norm": 0.4584118521555668, "learning_rate": 3.9721845156211535e-06, "loss": 0.0232, "step": 1667 }, { "epoch": 0.5793678360541855, "grad_norm": 0.6681777982150439, "learning_rate": 3.966679257029772e-06, "loss": 0.0171, "step": 1668 }, { "epoch": 0.5797151788815561, "grad_norm": 0.9132998487091103, "learning_rate": 3.961175306728382e-06, "loss": 0.0152, "step": 1669 }, { "epoch": 0.5800625217089267, "grad_norm": 0.6259597245750973, "learning_rate": 3.955672671685552e-06, "loss": 0.0131, "step": 1670 }, { "epoch": 0.5804098645362973, "grad_norm": 0.4912882189301569, "learning_rate": 3.950171358868177e-06, "loss": 0.0177, "step": 1671 }, { "epoch": 0.580757207363668, "grad_norm": 0.9807953706195164, "learning_rate": 3.944671375241485e-06, "loss": 0.0187, "step": 1672 }, { "epoch": 0.5811045501910386, "grad_norm": 0.9981909862170388, "learning_rate": 3.939172727769021e-06, "loss": 0.0179, "step": 1673 }, { "epoch": 0.5814518930184092, "grad_norm": 0.6504993573564067, "learning_rate": 3.933675423412636e-06, "loss": 0.0179, "step": 1674 }, { "epoch": 0.5817992358457797, "grad_norm": 0.513719596446845, "learning_rate": 3.928179469132477e-06, "loss": 0.0189, "step": 1675 }, { "epoch": 0.5821465786731504, "grad_norm": 0.5682561853770528, "learning_rate": 3.9226848718869905e-06, "loss": 0.0162, "step": 1676 }, { "epoch": 0.582493921500521, "grad_norm": 0.5510658649823071, "learning_rate": 3.917191638632897e-06, "loss": 0.0182, "step": 1677 }, { "epoch": 0.5828412643278916, "grad_norm": 0.7127772237528726, "learning_rate": 3.911699776325191e-06, "loss": 0.0204, "step": 1678 }, { "epoch": 0.5831886071552622, "grad_norm": 0.4103249102772578, "learning_rate": 3.906209291917141e-06, "loss": 0.0121, "step": 1679 }, { "epoch": 0.5835359499826328, "grad_norm": 0.5473544971789079, "learning_rate": 3.900720192360255e-06, "loss": 0.027, "step": 1680 }, { "epoch": 0.5838832928100035, "grad_norm": 0.39319317320120634, "learning_rate": 3.895232484604299e-06, "loss": 0.0142, "step": 1681 }, { "epoch": 0.5842306356373741, "grad_norm": 0.5760042859983425, "learning_rate": 3.889746175597274e-06, "loss": 0.0075, "step": 1682 }, { "epoch": 0.5845779784647447, "grad_norm": 0.5384592196281841, "learning_rate": 3.884261272285409e-06, "loss": 0.0162, "step": 1683 }, { "epoch": 0.5849253212921153, "grad_norm": 0.5430426343042051, "learning_rate": 3.8787777816131525e-06, "loss": 0.0132, "step": 1684 }, { "epoch": 0.585272664119486, "grad_norm": 0.49939162433255296, "learning_rate": 3.873295710523168e-06, "loss": 0.017, "step": 1685 }, { "epoch": 0.5856200069468566, "grad_norm": 0.6649735504087517, "learning_rate": 3.867815065956319e-06, "loss": 0.0173, "step": 1686 }, { "epoch": 0.5859673497742272, "grad_norm": 0.878731758013457, "learning_rate": 3.862335854851664e-06, "loss": 0.0225, "step": 1687 }, { "epoch": 0.5863146926015977, "grad_norm": 0.6320050052238549, "learning_rate": 3.856858084146444e-06, "loss": 0.0216, "step": 1688 }, { "epoch": 0.5866620354289684, "grad_norm": 0.9414138865126228, "learning_rate": 3.851381760776077e-06, "loss": 0.0211, "step": 1689 }, { "epoch": 0.587009378256339, "grad_norm": 0.7355116706005304, "learning_rate": 3.845906891674155e-06, "loss": 0.0193, "step": 1690 }, { "epoch": 0.5873567210837096, "grad_norm": 0.5623242566677557, "learning_rate": 3.8404334837724205e-06, "loss": 0.0201, "step": 1691 }, { "epoch": 0.5877040639110802, "grad_norm": 0.5982393659307643, "learning_rate": 3.834961544000769e-06, "loss": 0.024, "step": 1692 }, { "epoch": 0.5880514067384508, "grad_norm": 0.5212721299546837, "learning_rate": 3.8294910792872355e-06, "loss": 0.0194, "step": 1693 }, { "epoch": 0.5883987495658215, "grad_norm": 0.9699697508233305, "learning_rate": 3.824022096557992e-06, "loss": 0.0246, "step": 1694 }, { "epoch": 0.5887460923931921, "grad_norm": 0.6506075968369963, "learning_rate": 3.8185546027373325e-06, "loss": 0.0181, "step": 1695 }, { "epoch": 0.5890934352205627, "grad_norm": 1.1944480293880813, "learning_rate": 3.81308860474766e-06, "loss": 0.0181, "step": 1696 }, { "epoch": 0.5894407780479333, "grad_norm": 0.8174985452252524, "learning_rate": 3.807624109509491e-06, "loss": 0.0205, "step": 1697 }, { "epoch": 0.589788120875304, "grad_norm": 0.9297467401132783, "learning_rate": 3.802161123941436e-06, "loss": 0.0181, "step": 1698 }, { "epoch": 0.5901354637026746, "grad_norm": 1.1350479422662159, "learning_rate": 3.7966996549601968e-06, "loss": 0.022, "step": 1699 }, { "epoch": 0.5904828065300451, "grad_norm": 1.0512262693137089, "learning_rate": 3.7912397094805508e-06, "loss": 0.0132, "step": 1700 }, { "epoch": 0.5908301493574157, "grad_norm": 1.2991204215566619, "learning_rate": 3.785781294415349e-06, "loss": 0.0254, "step": 1701 }, { "epoch": 0.5911774921847864, "grad_norm": 0.5226731278223045, "learning_rate": 3.780324416675504e-06, "loss": 0.0236, "step": 1702 }, { "epoch": 0.591524835012157, "grad_norm": 0.7465469795767791, "learning_rate": 3.7748690831699858e-06, "loss": 0.0199, "step": 1703 }, { "epoch": 0.5918721778395276, "grad_norm": 0.39840312715848447, "learning_rate": 3.7694153008058005e-06, "loss": 0.0144, "step": 1704 }, { "epoch": 0.5922195206668982, "grad_norm": 0.62786992782903, "learning_rate": 3.7639630764879996e-06, "loss": 0.0152, "step": 1705 }, { "epoch": 0.5925668634942688, "grad_norm": 0.6340042279876545, "learning_rate": 3.7585124171196563e-06, "loss": 0.0208, "step": 1706 }, { "epoch": 0.5929142063216395, "grad_norm": 0.3158368812851925, "learning_rate": 3.7530633296018664e-06, "loss": 0.0158, "step": 1707 }, { "epoch": 0.5932615491490101, "grad_norm": 0.6469044082386353, "learning_rate": 3.747615820833729e-06, "loss": 0.0155, "step": 1708 }, { "epoch": 0.5936088919763807, "grad_norm": 0.5138966576020948, "learning_rate": 3.7421698977123533e-06, "loss": 0.0141, "step": 1709 }, { "epoch": 0.5939562348037513, "grad_norm": 1.0369760306163973, "learning_rate": 3.736725567132833e-06, "loss": 0.0182, "step": 1710 }, { "epoch": 0.594303577631122, "grad_norm": 0.8561393743072047, "learning_rate": 3.731282835988252e-06, "loss": 0.0195, "step": 1711 }, { "epoch": 0.5946509204584925, "grad_norm": 0.3744227619737715, "learning_rate": 3.725841711169662e-06, "loss": 0.0125, "step": 1712 }, { "epoch": 0.5949982632858631, "grad_norm": 0.44797174308593246, "learning_rate": 3.7204021995660865e-06, "loss": 0.023, "step": 1713 }, { "epoch": 0.5953456061132337, "grad_norm": 0.5892043090184133, "learning_rate": 3.7149643080645055e-06, "loss": 0.0237, "step": 1714 }, { "epoch": 0.5956929489406044, "grad_norm": 1.2898844568707646, "learning_rate": 3.7095280435498476e-06, "loss": 0.0208, "step": 1715 }, { "epoch": 0.596040291767975, "grad_norm": 0.32721676156286766, "learning_rate": 3.7040934129049794e-06, "loss": 0.0121, "step": 1716 }, { "epoch": 0.5963876345953456, "grad_norm": 0.8377490237817452, "learning_rate": 3.6986604230106993e-06, "loss": 0.0202, "step": 1717 }, { "epoch": 0.5967349774227162, "grad_norm": 1.3845327772962783, "learning_rate": 3.6932290807457326e-06, "loss": 0.0232, "step": 1718 }, { "epoch": 0.5970823202500868, "grad_norm": 0.5700167886253666, "learning_rate": 3.6877993929867146e-06, "loss": 0.0121, "step": 1719 }, { "epoch": 0.5974296630774575, "grad_norm": 0.6239261997221, "learning_rate": 3.6823713666081864e-06, "loss": 0.0122, "step": 1720 }, { "epoch": 0.5977770059048281, "grad_norm": 0.486573773418933, "learning_rate": 3.676945008482585e-06, "loss": 0.0163, "step": 1721 }, { "epoch": 0.5981243487321987, "grad_norm": 0.8899160417072958, "learning_rate": 3.671520325480235e-06, "loss": 0.0152, "step": 1722 }, { "epoch": 0.5984716915595693, "grad_norm": 0.6204522930553988, "learning_rate": 3.6660973244693443e-06, "loss": 0.0136, "step": 1723 }, { "epoch": 0.59881903438694, "grad_norm": 0.6351473439134602, "learning_rate": 3.6606760123159867e-06, "loss": 0.0124, "step": 1724 }, { "epoch": 0.5991663772143105, "grad_norm": 0.43393477090853044, "learning_rate": 3.6552563958840994e-06, "loss": 0.011, "step": 1725 }, { "epoch": 0.5995137200416811, "grad_norm": 0.5715600265285414, "learning_rate": 3.6498384820354693e-06, "loss": 0.0182, "step": 1726 }, { "epoch": 0.5998610628690517, "grad_norm": 0.8520750202499213, "learning_rate": 3.6444222776297356e-06, "loss": 0.0156, "step": 1727 }, { "epoch": 0.6002084056964224, "grad_norm": 0.919378604709449, "learning_rate": 3.6390077895243676e-06, "loss": 0.0124, "step": 1728 }, { "epoch": 0.600555748523793, "grad_norm": 0.5905220548628203, "learning_rate": 3.6335950245746593e-06, "loss": 0.0148, "step": 1729 }, { "epoch": 0.6009030913511636, "grad_norm": 0.45040394474125944, "learning_rate": 3.6281839896337277e-06, "loss": 0.0191, "step": 1730 }, { "epoch": 0.6012504341785342, "grad_norm": 0.5316669983658013, "learning_rate": 3.6227746915524964e-06, "loss": 0.016, "step": 1731 }, { "epoch": 0.6015977770059048, "grad_norm": 0.4609606149512745, "learning_rate": 3.6173671371796946e-06, "loss": 0.0148, "step": 1732 }, { "epoch": 0.6019451198332755, "grad_norm": 0.41149254163013027, "learning_rate": 3.6119613333618386e-06, "loss": 0.0133, "step": 1733 }, { "epoch": 0.6022924626606461, "grad_norm": 1.325138871791973, "learning_rate": 3.606557286943229e-06, "loss": 0.0172, "step": 1734 }, { "epoch": 0.6026398054880167, "grad_norm": 0.44045335177247696, "learning_rate": 3.601155004765943e-06, "loss": 0.009, "step": 1735 }, { "epoch": 0.6029871483153872, "grad_norm": 0.6696722263208196, "learning_rate": 3.5957544936698272e-06, "loss": 0.018, "step": 1736 }, { "epoch": 0.603334491142758, "grad_norm": 1.0184366709017578, "learning_rate": 3.5903557604924764e-06, "loss": 0.0137, "step": 1737 }, { "epoch": 0.6036818339701285, "grad_norm": 0.43044179281096573, "learning_rate": 3.5849588120692446e-06, "loss": 0.0104, "step": 1738 }, { "epoch": 0.6040291767974991, "grad_norm": 0.3641616392590476, "learning_rate": 3.5795636552332203e-06, "loss": 0.0106, "step": 1739 }, { "epoch": 0.6043765196248697, "grad_norm": 0.7270321800788714, "learning_rate": 3.5741702968152263e-06, "loss": 0.0262, "step": 1740 }, { "epoch": 0.6047238624522404, "grad_norm": 0.9048183481948355, "learning_rate": 3.5687787436438044e-06, "loss": 0.0179, "step": 1741 }, { "epoch": 0.605071205279611, "grad_norm": 1.0702279052136698, "learning_rate": 3.5633890025452162e-06, "loss": 0.022, "step": 1742 }, { "epoch": 0.6054185481069816, "grad_norm": 2.7184991390092743, "learning_rate": 3.5580010803434254e-06, "loss": 0.0263, "step": 1743 }, { "epoch": 0.6057658909343522, "grad_norm": 0.5480717716362404, "learning_rate": 3.552614983860096e-06, "loss": 0.025, "step": 1744 }, { "epoch": 0.6061132337617228, "grad_norm": 0.5845582912450411, "learning_rate": 3.547230719914575e-06, "loss": 0.0121, "step": 1745 }, { "epoch": 0.6064605765890935, "grad_norm": 0.706154766729172, "learning_rate": 3.541848295323893e-06, "loss": 0.0117, "step": 1746 }, { "epoch": 0.6068079194164641, "grad_norm": 0.6491617666226633, "learning_rate": 3.536467716902754e-06, "loss": 0.0227, "step": 1747 }, { "epoch": 0.6071552622438346, "grad_norm": 0.5492141027171614, "learning_rate": 3.5310889914635205e-06, "loss": 0.0269, "step": 1748 }, { "epoch": 0.6075026050712052, "grad_norm": 0.5439963844863106, "learning_rate": 3.5257121258162092e-06, "loss": 0.0148, "step": 1749 }, { "epoch": 0.6078499478985759, "grad_norm": 1.3037710079779645, "learning_rate": 3.5203371267684827e-06, "loss": 0.0235, "step": 1750 }, { "epoch": 0.6081972907259465, "grad_norm": 0.5104769061119815, "learning_rate": 3.5149640011256438e-06, "loss": 0.0198, "step": 1751 }, { "epoch": 0.6085446335533171, "grad_norm": 0.3616939548509015, "learning_rate": 3.5095927556906193e-06, "loss": 0.0132, "step": 1752 }, { "epoch": 0.6088919763806877, "grad_norm": 1.4959841416039057, "learning_rate": 3.504223397263955e-06, "loss": 0.0291, "step": 1753 }, { "epoch": 0.6092393192080584, "grad_norm": 1.3319542700316978, "learning_rate": 3.498855932643811e-06, "loss": 0.0282, "step": 1754 }, { "epoch": 0.609586662035429, "grad_norm": 0.3634235019022115, "learning_rate": 3.4934903686259445e-06, "loss": 0.0092, "step": 1755 }, { "epoch": 0.6099340048627996, "grad_norm": 0.26043584442925766, "learning_rate": 3.4881267120037143e-06, "loss": 0.0109, "step": 1756 }, { "epoch": 0.6102813476901702, "grad_norm": 0.6366073358882772, "learning_rate": 3.4827649695680578e-06, "loss": 0.0285, "step": 1757 }, { "epoch": 0.6106286905175408, "grad_norm": 1.3257848109030876, "learning_rate": 3.4774051481074885e-06, "loss": 0.0223, "step": 1758 }, { "epoch": 0.6109760333449115, "grad_norm": 0.4347554670924022, "learning_rate": 3.472047254408091e-06, "loss": 0.0156, "step": 1759 }, { "epoch": 0.611323376172282, "grad_norm": 0.5238569207098802, "learning_rate": 3.466691295253508e-06, "loss": 0.0184, "step": 1760 }, { "epoch": 0.6116707189996526, "grad_norm": 0.5864988138134234, "learning_rate": 3.4613372774249355e-06, "loss": 0.0192, "step": 1761 }, { "epoch": 0.6120180618270232, "grad_norm": 0.4574609279537837, "learning_rate": 3.455985207701105e-06, "loss": 0.0208, "step": 1762 }, { "epoch": 0.6123654046543939, "grad_norm": 0.6793197760170264, "learning_rate": 3.4506350928582878e-06, "loss": 0.0168, "step": 1763 }, { "epoch": 0.6127127474817645, "grad_norm": 0.8979077135480431, "learning_rate": 3.4452869396702754e-06, "loss": 0.0193, "step": 1764 }, { "epoch": 0.6130600903091351, "grad_norm": 1.5966359913446138, "learning_rate": 3.439940754908382e-06, "loss": 0.018, "step": 1765 }, { "epoch": 0.6134074331365057, "grad_norm": 0.3362644354278385, "learning_rate": 3.4345965453414222e-06, "loss": 0.011, "step": 1766 }, { "epoch": 0.6137547759638764, "grad_norm": 0.9478737143318928, "learning_rate": 3.429254317735714e-06, "loss": 0.0229, "step": 1767 }, { "epoch": 0.614102118791247, "grad_norm": 0.5981398739431912, "learning_rate": 3.423914078855064e-06, "loss": 0.014, "step": 1768 }, { "epoch": 0.6144494616186176, "grad_norm": 1.2492792940989146, "learning_rate": 3.418575835460767e-06, "loss": 0.0159, "step": 1769 }, { "epoch": 0.6147968044459882, "grad_norm": 0.5085854726417735, "learning_rate": 3.4132395943115803e-06, "loss": 0.0206, "step": 1770 }, { "epoch": 0.6151441472733588, "grad_norm": 0.3446709459428576, "learning_rate": 3.4079053621637346e-06, "loss": 0.0164, "step": 1771 }, { "epoch": 0.6154914901007295, "grad_norm": 0.5545946109354595, "learning_rate": 3.402573145770916e-06, "loss": 0.0157, "step": 1772 }, { "epoch": 0.6158388329281, "grad_norm": 0.9460892604144455, "learning_rate": 3.3972429518842566e-06, "loss": 0.0383, "step": 1773 }, { "epoch": 0.6161861757554706, "grad_norm": 0.4367290077518178, "learning_rate": 3.3919147872523257e-06, "loss": 0.0159, "step": 1774 }, { "epoch": 0.6165335185828412, "grad_norm": 0.45854417375935685, "learning_rate": 3.3865886586211285e-06, "loss": 0.0209, "step": 1775 }, { "epoch": 0.6168808614102119, "grad_norm": 0.6207845739989777, "learning_rate": 3.38126457273409e-06, "loss": 0.0273, "step": 1776 }, { "epoch": 0.6172282042375825, "grad_norm": 0.8158895384794037, "learning_rate": 3.3759425363320482e-06, "loss": 0.0205, "step": 1777 }, { "epoch": 0.6175755470649531, "grad_norm": 0.4183124541564, "learning_rate": 3.3706225561532457e-06, "loss": 0.017, "step": 1778 }, { "epoch": 0.6179228898923237, "grad_norm": 0.5875812018496149, "learning_rate": 3.365304638933322e-06, "loss": 0.0226, "step": 1779 }, { "epoch": 0.6182702327196944, "grad_norm": 0.4777993091444752, "learning_rate": 3.359988791405309e-06, "loss": 0.0187, "step": 1780 }, { "epoch": 0.618617575547065, "grad_norm": 0.48444108861589447, "learning_rate": 3.3546750202996136e-06, "loss": 0.0185, "step": 1781 }, { "epoch": 0.6189649183744356, "grad_norm": 0.4370386372661205, "learning_rate": 3.349363332344013e-06, "loss": 0.0114, "step": 1782 }, { "epoch": 0.6193122612018062, "grad_norm": 0.3131795277127707, "learning_rate": 3.3440537342636483e-06, "loss": 0.0162, "step": 1783 }, { "epoch": 0.6196596040291767, "grad_norm": 0.5666886716999282, "learning_rate": 3.338746232781017e-06, "loss": 0.013, "step": 1784 }, { "epoch": 0.6200069468565474, "grad_norm": 0.30906398669450097, "learning_rate": 3.333440834615961e-06, "loss": 0.019, "step": 1785 }, { "epoch": 0.620354289683918, "grad_norm": 0.26856637246235177, "learning_rate": 3.3281375464856556e-06, "loss": 0.0118, "step": 1786 }, { "epoch": 0.6207016325112886, "grad_norm": 0.449006224074791, "learning_rate": 3.322836375104608e-06, "loss": 0.0157, "step": 1787 }, { "epoch": 0.6210489753386592, "grad_norm": 0.9291779318103394, "learning_rate": 3.3175373271846434e-06, "loss": 0.0165, "step": 1788 }, { "epoch": 0.6213963181660299, "grad_norm": 0.8633956849713047, "learning_rate": 3.3122404094349037e-06, "loss": 0.0159, "step": 1789 }, { "epoch": 0.6217436609934005, "grad_norm": 0.43556926498499643, "learning_rate": 3.3069456285618263e-06, "loss": 0.0205, "step": 1790 }, { "epoch": 0.6220910038207711, "grad_norm": 0.4644429061948247, "learning_rate": 3.3016529912691476e-06, "loss": 0.0174, "step": 1791 }, { "epoch": 0.6224383466481417, "grad_norm": 0.4928017975644182, "learning_rate": 3.2963625042578875e-06, "loss": 0.0195, "step": 1792 }, { "epoch": 0.6227856894755124, "grad_norm": 0.39183290751040956, "learning_rate": 3.2910741742263495e-06, "loss": 0.0119, "step": 1793 }, { "epoch": 0.623133032302883, "grad_norm": 0.5461515569612917, "learning_rate": 3.2857880078700953e-06, "loss": 0.018, "step": 1794 }, { "epoch": 0.6234803751302536, "grad_norm": 0.7497392589922698, "learning_rate": 3.2805040118819574e-06, "loss": 0.0261, "step": 1795 }, { "epoch": 0.6238277179576242, "grad_norm": 0.802555297478242, "learning_rate": 3.2752221929520164e-06, "loss": 0.0158, "step": 1796 }, { "epoch": 0.6241750607849947, "grad_norm": 0.576305432582867, "learning_rate": 3.2699425577675935e-06, "loss": 0.0158, "step": 1797 }, { "epoch": 0.6245224036123654, "grad_norm": 0.5846916212785859, "learning_rate": 3.2646651130132533e-06, "loss": 0.0201, "step": 1798 }, { "epoch": 0.624869746439736, "grad_norm": 0.4703989112906414, "learning_rate": 3.2593898653707773e-06, "loss": 0.0153, "step": 1799 }, { "epoch": 0.6252170892671066, "grad_norm": 0.5825293697611765, "learning_rate": 3.254116821519171e-06, "loss": 0.0127, "step": 1800 }, { "epoch": 0.6255644320944772, "grad_norm": 0.4387644946692422, "learning_rate": 3.2488459881346483e-06, "loss": 0.02, "step": 1801 }, { "epoch": 0.6259117749218479, "grad_norm": 0.4193154456369623, "learning_rate": 3.2435773718906284e-06, "loss": 0.0244, "step": 1802 }, { "epoch": 0.6262591177492185, "grad_norm": 0.39690908782734347, "learning_rate": 3.238310979457713e-06, "loss": 0.0077, "step": 1803 }, { "epoch": 0.6266064605765891, "grad_norm": 1.233442099164475, "learning_rate": 3.233046817503699e-06, "loss": 0.0186, "step": 1804 }, { "epoch": 0.6269538034039597, "grad_norm": 1.6141038525452736, "learning_rate": 3.2277848926935528e-06, "loss": 0.0281, "step": 1805 }, { "epoch": 0.6273011462313304, "grad_norm": 0.7139609495476705, "learning_rate": 3.2225252116894155e-06, "loss": 0.0258, "step": 1806 }, { "epoch": 0.627648489058701, "grad_norm": 0.7513223031684714, "learning_rate": 3.2172677811505766e-06, "loss": 0.0225, "step": 1807 }, { "epoch": 0.6279958318860716, "grad_norm": 0.8056062866255712, "learning_rate": 3.2120126077334844e-06, "loss": 0.0158, "step": 1808 }, { "epoch": 0.6283431747134421, "grad_norm": 0.523846225092398, "learning_rate": 3.2067596980917282e-06, "loss": 0.0189, "step": 1809 }, { "epoch": 0.6286905175408127, "grad_norm": 0.6441750993488121, "learning_rate": 3.20150905887603e-06, "loss": 0.0282, "step": 1810 }, { "epoch": 0.6290378603681834, "grad_norm": 1.0001326879988315, "learning_rate": 3.1962606967342356e-06, "loss": 0.0201, "step": 1811 }, { "epoch": 0.629385203195554, "grad_norm": 0.5018682485066364, "learning_rate": 3.191014618311309e-06, "loss": 0.0192, "step": 1812 }, { "epoch": 0.6297325460229246, "grad_norm": 0.8969102194509483, "learning_rate": 3.185770830249326e-06, "loss": 0.0176, "step": 1813 }, { "epoch": 0.6300798888502952, "grad_norm": 0.9231788627357475, "learning_rate": 3.1805293391874604e-06, "loss": 0.0239, "step": 1814 }, { "epoch": 0.6304272316776659, "grad_norm": 0.4778698280139559, "learning_rate": 3.1752901517619733e-06, "loss": 0.0174, "step": 1815 }, { "epoch": 0.6307745745050365, "grad_norm": 0.7761414647990466, "learning_rate": 3.1700532746062148e-06, "loss": 0.0237, "step": 1816 }, { "epoch": 0.6311219173324071, "grad_norm": 0.49952446026830283, "learning_rate": 3.1648187143506095e-06, "loss": 0.0145, "step": 1817 }, { "epoch": 0.6314692601597777, "grad_norm": 0.6271738528394313, "learning_rate": 3.159586477622647e-06, "loss": 0.0176, "step": 1818 }, { "epoch": 0.6318166029871484, "grad_norm": 0.2541241193835112, "learning_rate": 3.1543565710468743e-06, "loss": 0.0113, "step": 1819 }, { "epoch": 0.632163945814519, "grad_norm": 0.6685782936301835, "learning_rate": 3.14912900124489e-06, "loss": 0.0191, "step": 1820 }, { "epoch": 0.6325112886418895, "grad_norm": 0.376455933499608, "learning_rate": 3.1439037748353316e-06, "loss": 0.0233, "step": 1821 }, { "epoch": 0.6328586314692601, "grad_norm": 0.42194631315752845, "learning_rate": 3.1386808984338758e-06, "loss": 0.0192, "step": 1822 }, { "epoch": 0.6332059742966307, "grad_norm": 0.5449588242931194, "learning_rate": 3.1334603786532147e-06, "loss": 0.0254, "step": 1823 }, { "epoch": 0.6335533171240014, "grad_norm": 0.22995057823733359, "learning_rate": 3.128242222103064e-06, "loss": 0.0109, "step": 1824 }, { "epoch": 0.633900659951372, "grad_norm": 0.48777742225584697, "learning_rate": 3.123026435390144e-06, "loss": 0.0231, "step": 1825 }, { "epoch": 0.6342480027787426, "grad_norm": 0.3576773382846993, "learning_rate": 3.117813025118178e-06, "loss": 0.0152, "step": 1826 }, { "epoch": 0.6345953456061132, "grad_norm": 0.2928349054625997, "learning_rate": 3.112601997887873e-06, "loss": 0.017, "step": 1827 }, { "epoch": 0.6349426884334839, "grad_norm": 0.5006421572877223, "learning_rate": 3.107393360296927e-06, "loss": 0.0167, "step": 1828 }, { "epoch": 0.6352900312608545, "grad_norm": 0.4143562291378625, "learning_rate": 3.1021871189400077e-06, "loss": 0.013, "step": 1829 }, { "epoch": 0.6356373740882251, "grad_norm": 1.0102185381541648, "learning_rate": 3.096983280408754e-06, "loss": 0.032, "step": 1830 }, { "epoch": 0.6359847169155957, "grad_norm": 0.5038454928470375, "learning_rate": 3.091781851291753e-06, "loss": 0.0195, "step": 1831 }, { "epoch": 0.6363320597429664, "grad_norm": 0.9531592705539882, "learning_rate": 3.0865828381745515e-06, "loss": 0.0257, "step": 1832 }, { "epoch": 0.636679402570337, "grad_norm": 0.7522264848981152, "learning_rate": 3.0813862476396323e-06, "loss": 0.0193, "step": 1833 }, { "epoch": 0.6370267453977075, "grad_norm": 0.4979623790577095, "learning_rate": 3.07619208626641e-06, "loss": 0.018, "step": 1834 }, { "epoch": 0.6373740882250781, "grad_norm": 1.156688985936038, "learning_rate": 3.0710003606312292e-06, "loss": 0.023, "step": 1835 }, { "epoch": 0.6377214310524487, "grad_norm": 0.7241396081280072, "learning_rate": 3.065811077307342e-06, "loss": 0.0221, "step": 1836 }, { "epoch": 0.6380687738798194, "grad_norm": 0.4866234304079524, "learning_rate": 3.060624242864916e-06, "loss": 0.0123, "step": 1837 }, { "epoch": 0.63841611670719, "grad_norm": 0.4496824373815353, "learning_rate": 3.0554398638710136e-06, "loss": 0.011, "step": 1838 }, { "epoch": 0.6387634595345606, "grad_norm": 0.7591414273681181, "learning_rate": 3.050257946889594e-06, "loss": 0.0151, "step": 1839 }, { "epoch": 0.6391108023619312, "grad_norm": 0.37588557206384826, "learning_rate": 3.045078498481491e-06, "loss": 0.0172, "step": 1840 }, { "epoch": 0.6394581451893019, "grad_norm": 0.5904069516982642, "learning_rate": 3.0399015252044185e-06, "loss": 0.0209, "step": 1841 }, { "epoch": 0.6398054880166725, "grad_norm": 0.5503945994322635, "learning_rate": 3.0347270336129554e-06, "loss": 0.0161, "step": 1842 }, { "epoch": 0.6401528308440431, "grad_norm": 0.8287427058376795, "learning_rate": 3.02955503025854e-06, "loss": 0.0231, "step": 1843 }, { "epoch": 0.6405001736714137, "grad_norm": 0.8132926128652753, "learning_rate": 3.0243855216894557e-06, "loss": 0.0222, "step": 1844 }, { "epoch": 0.6408475164987844, "grad_norm": 0.34673836499554306, "learning_rate": 3.0192185144508336e-06, "loss": 0.0096, "step": 1845 }, { "epoch": 0.6411948593261549, "grad_norm": 0.6070961612743545, "learning_rate": 3.0140540150846324e-06, "loss": 0.0255, "step": 1846 }, { "epoch": 0.6415422021535255, "grad_norm": 0.37708682146770417, "learning_rate": 3.00889203012964e-06, "loss": 0.0142, "step": 1847 }, { "epoch": 0.6418895449808961, "grad_norm": 0.4761721285878218, "learning_rate": 3.0037325661214555e-06, "loss": 0.0172, "step": 1848 }, { "epoch": 0.6422368878082667, "grad_norm": 0.43162467961906015, "learning_rate": 2.99857562959249e-06, "loss": 0.014, "step": 1849 }, { "epoch": 0.6425842306356374, "grad_norm": 0.4418457043261697, "learning_rate": 2.9934212270719555e-06, "loss": 0.0178, "step": 1850 }, { "epoch": 0.642931573463008, "grad_norm": 0.736952954263188, "learning_rate": 2.988269365085854e-06, "loss": 0.022, "step": 1851 }, { "epoch": 0.6432789162903786, "grad_norm": 0.5793129124364786, "learning_rate": 2.983120050156969e-06, "loss": 0.0166, "step": 1852 }, { "epoch": 0.6436262591177492, "grad_norm": 0.4059013512127999, "learning_rate": 2.9779732888048607e-06, "loss": 0.0134, "step": 1853 }, { "epoch": 0.6439736019451199, "grad_norm": 0.5434418757990271, "learning_rate": 2.9728290875458597e-06, "loss": 0.0184, "step": 1854 }, { "epoch": 0.6443209447724905, "grad_norm": 0.6080536174329941, "learning_rate": 2.967687452893051e-06, "loss": 0.0151, "step": 1855 }, { "epoch": 0.6446682875998611, "grad_norm": 0.3794734225409095, "learning_rate": 2.9625483913562696e-06, "loss": 0.0051, "step": 1856 }, { "epoch": 0.6450156304272316, "grad_norm": 0.4025726954879858, "learning_rate": 2.957411909442095e-06, "loss": 0.0138, "step": 1857 }, { "epoch": 0.6453629732546023, "grad_norm": 0.9717609808679778, "learning_rate": 2.95227801365384e-06, "loss": 0.0234, "step": 1858 }, { "epoch": 0.6457103160819729, "grad_norm": 0.9793666448755983, "learning_rate": 2.947146710491545e-06, "loss": 0.0243, "step": 1859 }, { "epoch": 0.6460576589093435, "grad_norm": 0.6545294963642488, "learning_rate": 2.942018006451961e-06, "loss": 0.0198, "step": 1860 }, { "epoch": 0.6464050017367141, "grad_norm": 0.543720085501545, "learning_rate": 2.9368919080285574e-06, "loss": 0.0148, "step": 1861 }, { "epoch": 0.6467523445640847, "grad_norm": 0.5001478214174131, "learning_rate": 2.9317684217114977e-06, "loss": 0.0153, "step": 1862 }, { "epoch": 0.6470996873914554, "grad_norm": 0.8634338411303856, "learning_rate": 2.9266475539876447e-06, "loss": 0.0153, "step": 1863 }, { "epoch": 0.647447030218826, "grad_norm": 0.3794776934466785, "learning_rate": 2.921529311340537e-06, "loss": 0.0145, "step": 1864 }, { "epoch": 0.6477943730461966, "grad_norm": 0.5456181574813935, "learning_rate": 2.916413700250397e-06, "loss": 0.0223, "step": 1865 }, { "epoch": 0.6481417158735672, "grad_norm": 0.7344763458259643, "learning_rate": 2.9113007271941118e-06, "loss": 0.0263, "step": 1866 }, { "epoch": 0.6484890587009379, "grad_norm": 0.42076158970673955, "learning_rate": 2.9061903986452323e-06, "loss": 0.0171, "step": 1867 }, { "epoch": 0.6488364015283085, "grad_norm": 0.5699294311522565, "learning_rate": 2.9010827210739557e-06, "loss": 0.0223, "step": 1868 }, { "epoch": 0.649183744355679, "grad_norm": 0.7798211226417748, "learning_rate": 2.895977700947124e-06, "loss": 0.0223, "step": 1869 }, { "epoch": 0.6495310871830496, "grad_norm": 0.6520811194584589, "learning_rate": 2.890875344728218e-06, "loss": 0.0253, "step": 1870 }, { "epoch": 0.6498784300104203, "grad_norm": 0.5155130215134897, "learning_rate": 2.8857756588773457e-06, "loss": 0.012, "step": 1871 }, { "epoch": 0.6502257728377909, "grad_norm": 0.6695698436955599, "learning_rate": 2.88067864985123e-06, "loss": 0.0189, "step": 1872 }, { "epoch": 0.6505731156651615, "grad_norm": 0.5691514228238037, "learning_rate": 2.875584324103205e-06, "loss": 0.022, "step": 1873 }, { "epoch": 0.6509204584925321, "grad_norm": 0.7629069218607217, "learning_rate": 2.8704926880832117e-06, "loss": 0.0201, "step": 1874 }, { "epoch": 0.6512678013199027, "grad_norm": 0.4637233958651768, "learning_rate": 2.865403748237784e-06, "loss": 0.0207, "step": 1875 }, { "epoch": 0.6516151441472734, "grad_norm": 0.702869419072292, "learning_rate": 2.860317511010041e-06, "loss": 0.0173, "step": 1876 }, { "epoch": 0.651962486974644, "grad_norm": 0.9400016037194756, "learning_rate": 2.855233982839678e-06, "loss": 0.0148, "step": 1877 }, { "epoch": 0.6523098298020146, "grad_norm": 0.4312318941463103, "learning_rate": 2.8501531701629658e-06, "loss": 0.0156, "step": 1878 }, { "epoch": 0.6526571726293852, "grad_norm": 0.8618641382406451, "learning_rate": 2.845075079412731e-06, "loss": 0.0163, "step": 1879 }, { "epoch": 0.6530045154567559, "grad_norm": 0.7664712979039602, "learning_rate": 2.8399997170183625e-06, "loss": 0.0156, "step": 1880 }, { "epoch": 0.6533518582841265, "grad_norm": 0.9200611024228682, "learning_rate": 2.8349270894057822e-06, "loss": 0.0202, "step": 1881 }, { "epoch": 0.653699201111497, "grad_norm": 0.509279359509403, "learning_rate": 2.8298572029974624e-06, "loss": 0.0182, "step": 1882 }, { "epoch": 0.6540465439388676, "grad_norm": 0.6113621614337158, "learning_rate": 2.824790064212396e-06, "loss": 0.019, "step": 1883 }, { "epoch": 0.6543938867662383, "grad_norm": 0.6185846522906022, "learning_rate": 2.8197256794661023e-06, "loss": 0.0278, "step": 1884 }, { "epoch": 0.6547412295936089, "grad_norm": 0.36054081111019864, "learning_rate": 2.814664055170609e-06, "loss": 0.0148, "step": 1885 }, { "epoch": 0.6550885724209795, "grad_norm": 0.46357034653680274, "learning_rate": 2.809605197734454e-06, "loss": 0.0174, "step": 1886 }, { "epoch": 0.6554359152483501, "grad_norm": 1.1073051215502399, "learning_rate": 2.804549113562667e-06, "loss": 0.0198, "step": 1887 }, { "epoch": 0.6557832580757207, "grad_norm": 0.2971627242142241, "learning_rate": 2.7994958090567715e-06, "loss": 0.0175, "step": 1888 }, { "epoch": 0.6561306009030914, "grad_norm": 0.36560530569067073, "learning_rate": 2.7944452906147656e-06, "loss": 0.0134, "step": 1889 }, { "epoch": 0.656477943730462, "grad_norm": 0.3549139995169762, "learning_rate": 2.7893975646311276e-06, "loss": 0.0132, "step": 1890 }, { "epoch": 0.6568252865578326, "grad_norm": 0.4732148819461115, "learning_rate": 2.784352637496792e-06, "loss": 0.0148, "step": 1891 }, { "epoch": 0.6571726293852032, "grad_norm": 0.40360522752171646, "learning_rate": 2.7793105155991584e-06, "loss": 0.0158, "step": 1892 }, { "epoch": 0.6575199722125739, "grad_norm": 0.7623569339524721, "learning_rate": 2.774271205322066e-06, "loss": 0.0177, "step": 1893 }, { "epoch": 0.6578673150399444, "grad_norm": 1.367237702513199, "learning_rate": 2.769234713045798e-06, "loss": 0.0228, "step": 1894 }, { "epoch": 0.658214657867315, "grad_norm": 0.675243175984947, "learning_rate": 2.764201045147071e-06, "loss": 0.0105, "step": 1895 }, { "epoch": 0.6585620006946856, "grad_norm": 0.9439336995618546, "learning_rate": 2.7591702079990277e-06, "loss": 0.0181, "step": 1896 }, { "epoch": 0.6589093435220563, "grad_norm": 0.5276873160106954, "learning_rate": 2.754142207971221e-06, "loss": 0.0099, "step": 1897 }, { "epoch": 0.6592566863494269, "grad_norm": 0.569158156227268, "learning_rate": 2.749117051429612e-06, "loss": 0.0129, "step": 1898 }, { "epoch": 0.6596040291767975, "grad_norm": 0.5137501939519034, "learning_rate": 2.7440947447365664e-06, "loss": 0.0141, "step": 1899 }, { "epoch": 0.6599513720041681, "grad_norm": 0.7005662168512808, "learning_rate": 2.739075294250841e-06, "loss": 0.0201, "step": 1900 }, { "epoch": 0.6602987148315387, "grad_norm": 0.7235707108221842, "learning_rate": 2.7340587063275736e-06, "loss": 0.0208, "step": 1901 }, { "epoch": 0.6606460576589094, "grad_norm": 0.710616178049963, "learning_rate": 2.7290449873182755e-06, "loss": 0.0149, "step": 1902 }, { "epoch": 0.66099340048628, "grad_norm": 0.48224009794578837, "learning_rate": 2.7240341435708316e-06, "loss": 0.0099, "step": 1903 }, { "epoch": 0.6613407433136506, "grad_norm": 0.7252312473237166, "learning_rate": 2.7190261814294873e-06, "loss": 0.0197, "step": 1904 }, { "epoch": 0.6616880861410211, "grad_norm": 0.4216043473064591, "learning_rate": 2.714021107234831e-06, "loss": 0.012, "step": 1905 }, { "epoch": 0.6620354289683918, "grad_norm": 0.556375549796574, "learning_rate": 2.7090189273238e-06, "loss": 0.0203, "step": 1906 }, { "epoch": 0.6623827717957624, "grad_norm": 1.4931977841749278, "learning_rate": 2.7040196480296677e-06, "loss": 0.031, "step": 1907 }, { "epoch": 0.662730114623133, "grad_norm": 0.6694116899417739, "learning_rate": 2.6990232756820396e-06, "loss": 0.015, "step": 1908 }, { "epoch": 0.6630774574505036, "grad_norm": 0.7127860652752314, "learning_rate": 2.6940298166068255e-06, "loss": 0.0151, "step": 1909 }, { "epoch": 0.6634248002778743, "grad_norm": 0.5936061309196379, "learning_rate": 2.6890392771262618e-06, "loss": 0.0169, "step": 1910 }, { "epoch": 0.6637721431052449, "grad_norm": 0.7367028197758674, "learning_rate": 2.684051663558884e-06, "loss": 0.0209, "step": 1911 }, { "epoch": 0.6641194859326155, "grad_norm": 0.6177550979223532, "learning_rate": 2.6790669822195202e-06, "loss": 0.0177, "step": 1912 }, { "epoch": 0.6644668287599861, "grad_norm": 1.1958625864238086, "learning_rate": 2.6740852394192896e-06, "loss": 0.0196, "step": 1913 }, { "epoch": 0.6648141715873567, "grad_norm": 0.46507928549224564, "learning_rate": 2.6691064414655864e-06, "loss": 0.0123, "step": 1914 }, { "epoch": 0.6651615144147274, "grad_norm": 0.5722070188703421, "learning_rate": 2.664130594662083e-06, "loss": 0.0224, "step": 1915 }, { "epoch": 0.665508857242098, "grad_norm": 0.9009029726789275, "learning_rate": 2.6591577053087084e-06, "loss": 0.015, "step": 1916 }, { "epoch": 0.6658562000694686, "grad_norm": 0.5063091771412535, "learning_rate": 2.6541877797016535e-06, "loss": 0.0138, "step": 1917 }, { "epoch": 0.6662035428968391, "grad_norm": 0.45072043809201573, "learning_rate": 2.6492208241333494e-06, "loss": 0.016, "step": 1918 }, { "epoch": 0.6665508857242098, "grad_norm": 0.39804190140708595, "learning_rate": 2.6442568448924754e-06, "loss": 0.0187, "step": 1919 }, { "epoch": 0.6668982285515804, "grad_norm": 0.3519857413757391, "learning_rate": 2.6392958482639343e-06, "loss": 0.0153, "step": 1920 }, { "epoch": 0.667245571378951, "grad_norm": 0.7945291965330532, "learning_rate": 2.63433784052886e-06, "loss": 0.0156, "step": 1921 }, { "epoch": 0.6675929142063216, "grad_norm": 0.49910796071011826, "learning_rate": 2.6293828279645938e-06, "loss": 0.015, "step": 1922 }, { "epoch": 0.6679402570336923, "grad_norm": 0.6606502012921845, "learning_rate": 2.6244308168446958e-06, "loss": 0.0329, "step": 1923 }, { "epoch": 0.6682875998610629, "grad_norm": 0.5820565201288449, "learning_rate": 2.6194818134389143e-06, "loss": 0.0199, "step": 1924 }, { "epoch": 0.6686349426884335, "grad_norm": 0.6077071562564756, "learning_rate": 2.614535824013199e-06, "loss": 0.0212, "step": 1925 }, { "epoch": 0.6689822855158041, "grad_norm": 0.9648264400971077, "learning_rate": 2.6095928548296773e-06, "loss": 0.0308, "step": 1926 }, { "epoch": 0.6693296283431747, "grad_norm": 0.6590307822381734, "learning_rate": 2.6046529121466537e-06, "loss": 0.016, "step": 1927 }, { "epoch": 0.6696769711705454, "grad_norm": 0.647299441184019, "learning_rate": 2.5997160022186028e-06, "loss": 0.0259, "step": 1928 }, { "epoch": 0.670024313997916, "grad_norm": 0.6135071672946737, "learning_rate": 2.594782131296163e-06, "loss": 0.0087, "step": 1929 }, { "epoch": 0.6703716568252865, "grad_norm": 1.2610496636585344, "learning_rate": 2.589851305626116e-06, "loss": 0.0239, "step": 1930 }, { "epoch": 0.6707189996526571, "grad_norm": 0.3954009222925035, "learning_rate": 2.5849235314513923e-06, "loss": 0.0122, "step": 1931 }, { "epoch": 0.6710663424800278, "grad_norm": 0.5156724723554922, "learning_rate": 2.57999881501106e-06, "loss": 0.0241, "step": 1932 }, { "epoch": 0.6714136853073984, "grad_norm": 0.8288162912643778, "learning_rate": 2.575077162540318e-06, "loss": 0.0258, "step": 1933 }, { "epoch": 0.671761028134769, "grad_norm": 0.3963519210942034, "learning_rate": 2.570158580270481e-06, "loss": 0.0156, "step": 1934 }, { "epoch": 0.6721083709621396, "grad_norm": 0.23269559096942974, "learning_rate": 2.565243074428976e-06, "loss": 0.0104, "step": 1935 }, { "epoch": 0.6724557137895103, "grad_norm": 0.5076358202743023, "learning_rate": 2.5603306512393387e-06, "loss": 0.0165, "step": 1936 }, { "epoch": 0.6728030566168809, "grad_norm": 0.4803777245917521, "learning_rate": 2.555421316921203e-06, "loss": 0.0161, "step": 1937 }, { "epoch": 0.6731503994442515, "grad_norm": 1.7249480773077772, "learning_rate": 2.5505150776902877e-06, "loss": 0.0299, "step": 1938 }, { "epoch": 0.6734977422716221, "grad_norm": 0.5695623395281586, "learning_rate": 2.5456119397583923e-06, "loss": 0.0226, "step": 1939 }, { "epoch": 0.6738450850989927, "grad_norm": 0.6090762926972669, "learning_rate": 2.540711909333394e-06, "loss": 0.0138, "step": 1940 }, { "epoch": 0.6741924279263634, "grad_norm": 0.8830972943432503, "learning_rate": 2.535814992619237e-06, "loss": 0.024, "step": 1941 }, { "epoch": 0.674539770753734, "grad_norm": 0.672673665772259, "learning_rate": 2.5309211958159135e-06, "loss": 0.0143, "step": 1942 }, { "epoch": 0.6748871135811045, "grad_norm": 0.44731362978442346, "learning_rate": 2.526030525119475e-06, "loss": 0.0204, "step": 1943 }, { "epoch": 0.6752344564084751, "grad_norm": 0.3413850862351891, "learning_rate": 2.521142986722014e-06, "loss": 0.0168, "step": 1944 }, { "epoch": 0.6755817992358458, "grad_norm": 1.356486822592823, "learning_rate": 2.516258586811653e-06, "loss": 0.0181, "step": 1945 }, { "epoch": 0.6759291420632164, "grad_norm": 0.5590617678902554, "learning_rate": 2.5113773315725407e-06, "loss": 0.0167, "step": 1946 }, { "epoch": 0.676276484890587, "grad_norm": 1.1480811352959883, "learning_rate": 2.5064992271848504e-06, "loss": 0.014, "step": 1947 }, { "epoch": 0.6766238277179576, "grad_norm": 1.1960798962121544, "learning_rate": 2.5016242798247623e-06, "loss": 0.0217, "step": 1948 }, { "epoch": 0.6769711705453283, "grad_norm": 1.4243039228724932, "learning_rate": 2.496752495664457e-06, "loss": 0.0233, "step": 1949 }, { "epoch": 0.6773185133726989, "grad_norm": 1.0255082402145865, "learning_rate": 2.491883880872115e-06, "loss": 0.0272, "step": 1950 }, { "epoch": 0.6776658562000695, "grad_norm": 0.7762367778334762, "learning_rate": 2.487018441611899e-06, "loss": 0.019, "step": 1951 }, { "epoch": 0.6780131990274401, "grad_norm": 0.6073071568906027, "learning_rate": 2.482156184043958e-06, "loss": 0.0259, "step": 1952 }, { "epoch": 0.6783605418548107, "grad_norm": 0.7255049359634366, "learning_rate": 2.4772971143244033e-06, "loss": 0.0257, "step": 1953 }, { "epoch": 0.6787078846821814, "grad_norm": 0.8117217423490592, "learning_rate": 2.4724412386053208e-06, "loss": 0.0133, "step": 1954 }, { "epoch": 0.6790552275095519, "grad_norm": 0.43153608387126113, "learning_rate": 2.4675885630347423e-06, "loss": 0.0144, "step": 1955 }, { "epoch": 0.6794025703369225, "grad_norm": 0.31393892010490915, "learning_rate": 2.4627390937566566e-06, "loss": 0.0122, "step": 1956 }, { "epoch": 0.6797499131642931, "grad_norm": 0.49140451557972653, "learning_rate": 2.457892836910985e-06, "loss": 0.0192, "step": 1957 }, { "epoch": 0.6800972559916638, "grad_norm": 0.32574396229410224, "learning_rate": 2.4530497986335888e-06, "loss": 0.0111, "step": 1958 }, { "epoch": 0.6804445988190344, "grad_norm": 0.5568119405077808, "learning_rate": 2.4482099850562496e-06, "loss": 0.0199, "step": 1959 }, { "epoch": 0.680791941646405, "grad_norm": 1.3668016268563767, "learning_rate": 2.4433734023066662e-06, "loss": 0.018, "step": 1960 }, { "epoch": 0.6811392844737756, "grad_norm": 0.375357049604211, "learning_rate": 2.438540056508449e-06, "loss": 0.009, "step": 1961 }, { "epoch": 0.6814866273011463, "grad_norm": 0.7359559626951213, "learning_rate": 2.4337099537811114e-06, "loss": 0.0124, "step": 1962 }, { "epoch": 0.6818339701285169, "grad_norm": 0.6182352327681956, "learning_rate": 2.4288831002400574e-06, "loss": 0.0159, "step": 1963 }, { "epoch": 0.6821813129558875, "grad_norm": 0.9957988289105886, "learning_rate": 2.4240595019965755e-06, "loss": 0.0226, "step": 1964 }, { "epoch": 0.682528655783258, "grad_norm": 0.5234908070793383, "learning_rate": 2.4192391651578384e-06, "loss": 0.0176, "step": 1965 }, { "epoch": 0.6828759986106286, "grad_norm": 0.8440220877656187, "learning_rate": 2.4144220958268883e-06, "loss": 0.0202, "step": 1966 }, { "epoch": 0.6832233414379993, "grad_norm": 0.5472941032734703, "learning_rate": 2.409608300102627e-06, "loss": 0.0197, "step": 1967 }, { "epoch": 0.6835706842653699, "grad_norm": 0.37475787591312015, "learning_rate": 2.404797784079811e-06, "loss": 0.0159, "step": 1968 }, { "epoch": 0.6839180270927405, "grad_norm": 0.41355656518558104, "learning_rate": 2.3999905538490487e-06, "loss": 0.0069, "step": 1969 }, { "epoch": 0.6842653699201111, "grad_norm": 1.433554613823539, "learning_rate": 2.395186615496789e-06, "loss": 0.0268, "step": 1970 }, { "epoch": 0.6846127127474818, "grad_norm": 1.1620950677445072, "learning_rate": 2.390385975105308e-06, "loss": 0.0218, "step": 1971 }, { "epoch": 0.6849600555748524, "grad_norm": 0.37254931419419035, "learning_rate": 2.3855886387527062e-06, "loss": 0.0112, "step": 1972 }, { "epoch": 0.685307398402223, "grad_norm": 0.4943387705379173, "learning_rate": 2.3807946125129056e-06, "loss": 0.0205, "step": 1973 }, { "epoch": 0.6856547412295936, "grad_norm": 0.7882856389300709, "learning_rate": 2.3760039024556387e-06, "loss": 0.0198, "step": 1974 }, { "epoch": 0.6860020840569643, "grad_norm": 1.2217971154250373, "learning_rate": 2.371216514646428e-06, "loss": 0.0135, "step": 1975 }, { "epoch": 0.6863494268843349, "grad_norm": 0.6619688439348922, "learning_rate": 2.3664324551466007e-06, "loss": 0.0163, "step": 1976 }, { "epoch": 0.6866967697117055, "grad_norm": 0.4763487225723533, "learning_rate": 2.361651730013269e-06, "loss": 0.0148, "step": 1977 }, { "epoch": 0.687044112539076, "grad_norm": 0.38275391078177107, "learning_rate": 2.356874345299319e-06, "loss": 0.013, "step": 1978 }, { "epoch": 0.6873914553664466, "grad_norm": 0.7080298751661178, "learning_rate": 2.3521003070534065e-06, "loss": 0.0205, "step": 1979 }, { "epoch": 0.6877387981938173, "grad_norm": 0.9866082614707886, "learning_rate": 2.347329621319957e-06, "loss": 0.0203, "step": 1980 }, { "epoch": 0.6880861410211879, "grad_norm": 0.5240609641902144, "learning_rate": 2.3425622941391485e-06, "loss": 0.0144, "step": 1981 }, { "epoch": 0.6884334838485585, "grad_norm": 0.5375077106046096, "learning_rate": 2.3377983315469045e-06, "loss": 0.0151, "step": 1982 }, { "epoch": 0.6887808266759291, "grad_norm": 0.9763971466094741, "learning_rate": 2.3330377395748878e-06, "loss": 0.0168, "step": 1983 }, { "epoch": 0.6891281695032998, "grad_norm": 0.6777867916323312, "learning_rate": 2.328280524250498e-06, "loss": 0.0196, "step": 1984 }, { "epoch": 0.6894755123306704, "grad_norm": 0.6717483439928033, "learning_rate": 2.3235266915968586e-06, "loss": 0.0297, "step": 1985 }, { "epoch": 0.689822855158041, "grad_norm": 0.659428567653895, "learning_rate": 2.3187762476328086e-06, "loss": 0.0154, "step": 1986 }, { "epoch": 0.6901701979854116, "grad_norm": 0.48768769720756106, "learning_rate": 2.3140291983728936e-06, "loss": 0.0175, "step": 1987 }, { "epoch": 0.6905175408127823, "grad_norm": 0.6148456012864478, "learning_rate": 2.3092855498273674e-06, "loss": 0.0177, "step": 1988 }, { "epoch": 0.6908648836401529, "grad_norm": 0.5297818902205734, "learning_rate": 2.3045453080021775e-06, "loss": 0.0134, "step": 1989 }, { "epoch": 0.6912122264675234, "grad_norm": 0.6794761025303996, "learning_rate": 2.2998084788989514e-06, "loss": 0.0148, "step": 1990 }, { "epoch": 0.691559569294894, "grad_norm": 0.4063679508309279, "learning_rate": 2.2950750685150045e-06, "loss": 0.0087, "step": 1991 }, { "epoch": 0.6919069121222646, "grad_norm": 0.6095392803855911, "learning_rate": 2.290345082843318e-06, "loss": 0.0164, "step": 1992 }, { "epoch": 0.6922542549496353, "grad_norm": 0.7441531444252605, "learning_rate": 2.285618527872537e-06, "loss": 0.0269, "step": 1993 }, { "epoch": 0.6926015977770059, "grad_norm": 0.5717566310228904, "learning_rate": 2.2808954095869653e-06, "loss": 0.0131, "step": 1994 }, { "epoch": 0.6929489406043765, "grad_norm": 0.6596567441963315, "learning_rate": 2.2761757339665576e-06, "loss": 0.0204, "step": 1995 }, { "epoch": 0.6932962834317471, "grad_norm": 0.608634510397694, "learning_rate": 2.2714595069869044e-06, "loss": 0.0201, "step": 1996 }, { "epoch": 0.6936436262591178, "grad_norm": 0.8865789858612154, "learning_rate": 2.2667467346192325e-06, "loss": 0.012, "step": 1997 }, { "epoch": 0.6939909690864884, "grad_norm": 0.7362361397592723, "learning_rate": 2.2620374228303944e-06, "loss": 0.0193, "step": 1998 }, { "epoch": 0.694338311913859, "grad_norm": 0.42850603654943825, "learning_rate": 2.2573315775828655e-06, "loss": 0.016, "step": 1999 }, { "epoch": 0.6946856547412296, "grad_norm": 0.46854751088934754, "learning_rate": 2.2526292048347246e-06, "loss": 0.0184, "step": 2000 }, { "epoch": 0.6950329975686003, "grad_norm": 0.6941328111513891, "learning_rate": 2.2479303105396576e-06, "loss": 0.0252, "step": 2001 }, { "epoch": 0.6953803403959709, "grad_norm": 0.5761073352376276, "learning_rate": 2.2432349006469468e-06, "loss": 0.0282, "step": 2002 }, { "epoch": 0.6957276832233414, "grad_norm": 0.5163738104718749, "learning_rate": 2.2385429811014654e-06, "loss": 0.0175, "step": 2003 }, { "epoch": 0.696075026050712, "grad_norm": 0.5317310005945695, "learning_rate": 2.2338545578436623e-06, "loss": 0.0251, "step": 2004 }, { "epoch": 0.6964223688780826, "grad_norm": 0.407307788912211, "learning_rate": 2.2291696368095595e-06, "loss": 0.0191, "step": 2005 }, { "epoch": 0.6967697117054533, "grad_norm": 0.9234682659843976, "learning_rate": 2.2244882239307497e-06, "loss": 0.0277, "step": 2006 }, { "epoch": 0.6971170545328239, "grad_norm": 0.4498944617872236, "learning_rate": 2.2198103251343856e-06, "loss": 0.0153, "step": 2007 }, { "epoch": 0.6974643973601945, "grad_norm": 0.3445137017847305, "learning_rate": 2.215135946343159e-06, "loss": 0.0151, "step": 2008 }, { "epoch": 0.6978117401875651, "grad_norm": 0.4550738434401644, "learning_rate": 2.2104650934753157e-06, "loss": 0.0198, "step": 2009 }, { "epoch": 0.6981590830149358, "grad_norm": 0.4009917053292384, "learning_rate": 2.2057977724446365e-06, "loss": 0.0181, "step": 2010 }, { "epoch": 0.6985064258423064, "grad_norm": 0.7002896810570791, "learning_rate": 2.201133989160427e-06, "loss": 0.0177, "step": 2011 }, { "epoch": 0.698853768669677, "grad_norm": 0.6950029004017243, "learning_rate": 2.1964737495275122e-06, "loss": 0.0182, "step": 2012 }, { "epoch": 0.6992011114970476, "grad_norm": 0.4199036421278682, "learning_rate": 2.191817059446236e-06, "loss": 0.0165, "step": 2013 }, { "epoch": 0.6995484543244183, "grad_norm": 0.5045888082776578, "learning_rate": 2.1871639248124465e-06, "loss": 0.018, "step": 2014 }, { "epoch": 0.6998957971517888, "grad_norm": 0.6958614459354281, "learning_rate": 2.182514351517488e-06, "loss": 0.0176, "step": 2015 }, { "epoch": 0.7002431399791594, "grad_norm": 0.46312645884075154, "learning_rate": 2.1778683454481946e-06, "loss": 0.0202, "step": 2016 }, { "epoch": 0.70059048280653, "grad_norm": 0.4799742697736775, "learning_rate": 2.1732259124868883e-06, "loss": 0.015, "step": 2017 }, { "epoch": 0.7009378256339006, "grad_norm": 0.33079209530907566, "learning_rate": 2.1685870585113666e-06, "loss": 0.0107, "step": 2018 }, { "epoch": 0.7012851684612713, "grad_norm": 0.5741815050233295, "learning_rate": 2.1639517893948926e-06, "loss": 0.0168, "step": 2019 }, { "epoch": 0.7016325112886419, "grad_norm": 0.8688246670359403, "learning_rate": 2.1593201110061906e-06, "loss": 0.0179, "step": 2020 }, { "epoch": 0.7019798541160125, "grad_norm": 0.47477951851972594, "learning_rate": 2.154692029209442e-06, "loss": 0.0131, "step": 2021 }, { "epoch": 0.7023271969433831, "grad_norm": 1.249034719322415, "learning_rate": 2.1500675498642746e-06, "loss": 0.0237, "step": 2022 }, { "epoch": 0.7026745397707538, "grad_norm": 1.4687157493254421, "learning_rate": 2.145446678825751e-06, "loss": 0.0217, "step": 2023 }, { "epoch": 0.7030218825981244, "grad_norm": 0.38679253812681325, "learning_rate": 2.140829421944367e-06, "loss": 0.0181, "step": 2024 }, { "epoch": 0.703369225425495, "grad_norm": 0.4319877902351082, "learning_rate": 2.136215785066046e-06, "loss": 0.0077, "step": 2025 }, { "epoch": 0.7037165682528655, "grad_norm": 1.2606438198673813, "learning_rate": 2.1316057740321212e-06, "loss": 0.0298, "step": 2026 }, { "epoch": 0.7040639110802361, "grad_norm": 0.7241622040775874, "learning_rate": 2.1269993946793414e-06, "loss": 0.0127, "step": 2027 }, { "epoch": 0.7044112539076068, "grad_norm": 0.7645077476228309, "learning_rate": 2.1223966528398577e-06, "loss": 0.0249, "step": 2028 }, { "epoch": 0.7047585967349774, "grad_norm": 0.6348186548588091, "learning_rate": 2.11779755434121e-06, "loss": 0.0142, "step": 2029 }, { "epoch": 0.705105939562348, "grad_norm": 0.5434630709530021, "learning_rate": 2.113202105006327e-06, "loss": 0.0148, "step": 2030 }, { "epoch": 0.7054532823897186, "grad_norm": 0.7784838597282087, "learning_rate": 2.1086103106535214e-06, "loss": 0.0197, "step": 2031 }, { "epoch": 0.7058006252170893, "grad_norm": 1.0412089267934344, "learning_rate": 2.104022177096477e-06, "loss": 0.0207, "step": 2032 }, { "epoch": 0.7061479680444599, "grad_norm": 0.5527143676070977, "learning_rate": 2.0994377101442387e-06, "loss": 0.0183, "step": 2033 }, { "epoch": 0.7064953108718305, "grad_norm": 1.1085673643586902, "learning_rate": 2.09485691560121e-06, "loss": 0.0153, "step": 2034 }, { "epoch": 0.7068426536992011, "grad_norm": 0.3690027590773899, "learning_rate": 2.0902797992671485e-06, "loss": 0.009, "step": 2035 }, { "epoch": 0.7071899965265718, "grad_norm": 0.4620521220600708, "learning_rate": 2.0857063669371545e-06, "loss": 0.0117, "step": 2036 }, { "epoch": 0.7075373393539424, "grad_norm": 0.7530837284547335, "learning_rate": 2.081136624401661e-06, "loss": 0.0109, "step": 2037 }, { "epoch": 0.707884682181313, "grad_norm": 1.090635412666822, "learning_rate": 2.076570577446428e-06, "loss": 0.0197, "step": 2038 }, { "epoch": 0.7082320250086835, "grad_norm": 0.5421154350298361, "learning_rate": 2.0720082318525405e-06, "loss": 0.021, "step": 2039 }, { "epoch": 0.7085793678360541, "grad_norm": 0.675053408269602, "learning_rate": 2.0674495933963997e-06, "loss": 0.021, "step": 2040 }, { "epoch": 0.7089267106634248, "grad_norm": 0.6491462598285759, "learning_rate": 2.062894667849702e-06, "loss": 0.0332, "step": 2041 }, { "epoch": 0.7092740534907954, "grad_norm": 0.4073609530744494, "learning_rate": 2.058343460979454e-06, "loss": 0.0158, "step": 2042 }, { "epoch": 0.709621396318166, "grad_norm": 0.6131847890148999, "learning_rate": 2.0537959785479517e-06, "loss": 0.0179, "step": 2043 }, { "epoch": 0.7099687391455366, "grad_norm": 0.7676568413480472, "learning_rate": 2.049252226312772e-06, "loss": 0.0231, "step": 2044 }, { "epoch": 0.7103160819729073, "grad_norm": 0.6639394977446624, "learning_rate": 2.04471221002677e-06, "loss": 0.0255, "step": 2045 }, { "epoch": 0.7106634248002779, "grad_norm": 0.6596012575548285, "learning_rate": 2.0401759354380728e-06, "loss": 0.0197, "step": 2046 }, { "epoch": 0.7110107676276485, "grad_norm": 0.7895955447252033, "learning_rate": 2.035643408290071e-06, "loss": 0.0258, "step": 2047 }, { "epoch": 0.7113581104550191, "grad_norm": 0.6477495390019533, "learning_rate": 2.0311146343214073e-06, "loss": 0.0102, "step": 2048 }, { "epoch": 0.7117054532823898, "grad_norm": 1.064483048484202, "learning_rate": 2.0265896192659717e-06, "loss": 0.0227, "step": 2049 }, { "epoch": 0.7120527961097604, "grad_norm": 0.6008333641221011, "learning_rate": 2.0220683688528988e-06, "loss": 0.0128, "step": 2050 }, { "epoch": 0.712400138937131, "grad_norm": 0.7586223473422381, "learning_rate": 2.0175508888065563e-06, "loss": 0.0207, "step": 2051 }, { "epoch": 0.7127474817645015, "grad_norm": 0.6734658920557806, "learning_rate": 2.013037184846537e-06, "loss": 0.0146, "step": 2052 }, { "epoch": 0.7130948245918721, "grad_norm": 0.43786239417939005, "learning_rate": 2.0085272626876496e-06, "loss": 0.025, "step": 2053 }, { "epoch": 0.7134421674192428, "grad_norm": 0.43477244650995783, "learning_rate": 2.00402112803992e-06, "loss": 0.0224, "step": 2054 }, { "epoch": 0.7137895102466134, "grad_norm": 0.738487309514483, "learning_rate": 1.9995187866085786e-06, "loss": 0.0203, "step": 2055 }, { "epoch": 0.714136853073984, "grad_norm": 0.370434091066062, "learning_rate": 1.9950202440940496e-06, "loss": 0.0159, "step": 2056 }, { "epoch": 0.7144841959013546, "grad_norm": 0.6924851015465208, "learning_rate": 1.9905255061919464e-06, "loss": 0.0131, "step": 2057 }, { "epoch": 0.7148315387287253, "grad_norm": 0.5673188746010913, "learning_rate": 1.9860345785930726e-06, "loss": 0.0172, "step": 2058 }, { "epoch": 0.7151788815560959, "grad_norm": 0.3438440626602846, "learning_rate": 1.9815474669833985e-06, "loss": 0.0145, "step": 2059 }, { "epoch": 0.7155262243834665, "grad_norm": 0.45539417984892605, "learning_rate": 1.977064177044071e-06, "loss": 0.0183, "step": 2060 }, { "epoch": 0.7158735672108371, "grad_norm": 1.484472402316562, "learning_rate": 1.972584714451392e-06, "loss": 0.0221, "step": 2061 }, { "epoch": 0.7162209100382078, "grad_norm": 0.5404845318844554, "learning_rate": 1.9681090848768237e-06, "loss": 0.0195, "step": 2062 }, { "epoch": 0.7165682528655783, "grad_norm": 0.47909379000212127, "learning_rate": 1.9636372939869677e-06, "loss": 0.0169, "step": 2063 }, { "epoch": 0.7169155956929489, "grad_norm": 0.5622427498011785, "learning_rate": 1.9591693474435735e-06, "loss": 0.0304, "step": 2064 }, { "epoch": 0.7172629385203195, "grad_norm": 0.7467315115677308, "learning_rate": 1.9547052509035164e-06, "loss": 0.0165, "step": 2065 }, { "epoch": 0.7176102813476901, "grad_norm": 0.5056808774501013, "learning_rate": 1.9502450100188037e-06, "loss": 0.0215, "step": 2066 }, { "epoch": 0.7179576241750608, "grad_norm": 1.4391119548200895, "learning_rate": 1.9457886304365533e-06, "loss": 0.0231, "step": 2067 }, { "epoch": 0.7183049670024314, "grad_norm": 0.6476533862493838, "learning_rate": 1.9413361177990015e-06, "loss": 0.0241, "step": 2068 }, { "epoch": 0.718652309829802, "grad_norm": 0.530719837422982, "learning_rate": 1.9368874777434864e-06, "loss": 0.0199, "step": 2069 }, { "epoch": 0.7189996526571726, "grad_norm": 0.519016068544451, "learning_rate": 1.932442715902441e-06, "loss": 0.0201, "step": 2070 }, { "epoch": 0.7193469954845433, "grad_norm": 0.704754836241455, "learning_rate": 1.9280018379033884e-06, "loss": 0.0292, "step": 2071 }, { "epoch": 0.7196943383119139, "grad_norm": 0.5687572422985191, "learning_rate": 1.923564849368936e-06, "loss": 0.0214, "step": 2072 }, { "epoch": 0.7200416811392845, "grad_norm": 0.3734820838812652, "learning_rate": 1.919131755916771e-06, "loss": 0.0187, "step": 2073 }, { "epoch": 0.720389023966655, "grad_norm": 0.3791262946433928, "learning_rate": 1.9147025631596362e-06, "loss": 0.02, "step": 2074 }, { "epoch": 0.7207363667940258, "grad_norm": 0.4220291272178966, "learning_rate": 1.9102772767053467e-06, "loss": 0.0249, "step": 2075 }, { "epoch": 0.7210837096213963, "grad_norm": 0.5391949275021815, "learning_rate": 1.9058559021567718e-06, "loss": 0.0192, "step": 2076 }, { "epoch": 0.7214310524487669, "grad_norm": 1.4908074892631773, "learning_rate": 1.9014384451118229e-06, "loss": 0.0206, "step": 2077 }, { "epoch": 0.7217783952761375, "grad_norm": 0.711125711354139, "learning_rate": 1.897024911163451e-06, "loss": 0.0247, "step": 2078 }, { "epoch": 0.7221257381035081, "grad_norm": 1.0280025482928667, "learning_rate": 1.892615305899645e-06, "loss": 0.027, "step": 2079 }, { "epoch": 0.7224730809308788, "grad_norm": 0.6833450502355668, "learning_rate": 1.8882096349034184e-06, "loss": 0.0264, "step": 2080 }, { "epoch": 0.7228204237582494, "grad_norm": 0.7449238137674074, "learning_rate": 1.8838079037528012e-06, "loss": 0.0226, "step": 2081 }, { "epoch": 0.72316776658562, "grad_norm": 0.745751826998084, "learning_rate": 1.879410118020834e-06, "loss": 0.0306, "step": 2082 }, { "epoch": 0.7235151094129906, "grad_norm": 0.3996052258819095, "learning_rate": 1.8750162832755669e-06, "loss": 0.0172, "step": 2083 }, { "epoch": 0.7238624522403613, "grad_norm": 0.9447949016982862, "learning_rate": 1.870626405080046e-06, "loss": 0.02, "step": 2084 }, { "epoch": 0.7242097950677319, "grad_norm": 0.7392285624719003, "learning_rate": 1.8662404889923058e-06, "loss": 0.0147, "step": 2085 }, { "epoch": 0.7245571378951025, "grad_norm": 0.506159285212194, "learning_rate": 1.8618585405653639e-06, "loss": 0.0188, "step": 2086 }, { "epoch": 0.724904480722473, "grad_norm": 0.5116325753968864, "learning_rate": 1.8574805653472178e-06, "loss": 0.0143, "step": 2087 }, { "epoch": 0.7252518235498437, "grad_norm": 0.7618959377624293, "learning_rate": 1.8531065688808346e-06, "loss": 0.0205, "step": 2088 }, { "epoch": 0.7255991663772143, "grad_norm": 0.6293199274360505, "learning_rate": 1.848736556704141e-06, "loss": 0.0122, "step": 2089 }, { "epoch": 0.7259465092045849, "grad_norm": 0.9005126434840245, "learning_rate": 1.8443705343500185e-06, "loss": 0.0146, "step": 2090 }, { "epoch": 0.7262938520319555, "grad_norm": 1.1648440673296412, "learning_rate": 1.840008507346302e-06, "loss": 0.0241, "step": 2091 }, { "epoch": 0.7266411948593261, "grad_norm": 0.5855114659510036, "learning_rate": 1.8356504812157623e-06, "loss": 0.0207, "step": 2092 }, { "epoch": 0.7269885376866968, "grad_norm": 0.5068661065981903, "learning_rate": 1.831296461476109e-06, "loss": 0.0146, "step": 2093 }, { "epoch": 0.7273358805140674, "grad_norm": 0.5837482870866362, "learning_rate": 1.826946453639976e-06, "loss": 0.0163, "step": 2094 }, { "epoch": 0.727683223341438, "grad_norm": 0.6908638053788853, "learning_rate": 1.822600463214922e-06, "loss": 0.0189, "step": 2095 }, { "epoch": 0.7280305661688086, "grad_norm": 0.6943496634239659, "learning_rate": 1.818258495703412e-06, "loss": 0.0207, "step": 2096 }, { "epoch": 0.7283779089961793, "grad_norm": 0.8592869838174363, "learning_rate": 1.813920556602826e-06, "loss": 0.0214, "step": 2097 }, { "epoch": 0.7287252518235499, "grad_norm": 0.9767864211568215, "learning_rate": 1.8095866514054372e-06, "loss": 0.0151, "step": 2098 }, { "epoch": 0.7290725946509204, "grad_norm": 0.24915866573453857, "learning_rate": 1.805256785598416e-06, "loss": 0.0104, "step": 2099 }, { "epoch": 0.729419937478291, "grad_norm": 0.34781878859312965, "learning_rate": 1.8009309646638128e-06, "loss": 0.0116, "step": 2100 }, { "epoch": 0.7297672803056617, "grad_norm": 0.6277095397756866, "learning_rate": 1.7966091940785653e-06, "loss": 0.016, "step": 2101 }, { "epoch": 0.7301146231330323, "grad_norm": 0.7019890305202732, "learning_rate": 1.792291479314473e-06, "loss": 0.0188, "step": 2102 }, { "epoch": 0.7304619659604029, "grad_norm": 0.4558680260256953, "learning_rate": 1.7879778258382103e-06, "loss": 0.0153, "step": 2103 }, { "epoch": 0.7308093087877735, "grad_norm": 0.5209152944828331, "learning_rate": 1.7836682391113002e-06, "loss": 0.0205, "step": 2104 }, { "epoch": 0.7311566516151441, "grad_norm": 0.6854168427153546, "learning_rate": 1.7793627245901236e-06, "loss": 0.0175, "step": 2105 }, { "epoch": 0.7315039944425148, "grad_norm": 0.594503372287282, "learning_rate": 1.775061287725906e-06, "loss": 0.0228, "step": 2106 }, { "epoch": 0.7318513372698854, "grad_norm": 1.3778446243347249, "learning_rate": 1.7707639339647015e-06, "loss": 0.0168, "step": 2107 }, { "epoch": 0.732198680097256, "grad_norm": 0.5276235706829822, "learning_rate": 1.766470668747403e-06, "loss": 0.0175, "step": 2108 }, { "epoch": 0.7325460229246266, "grad_norm": 0.45943667910861274, "learning_rate": 1.7621814975097274e-06, "loss": 0.023, "step": 2109 }, { "epoch": 0.7328933657519973, "grad_norm": 0.3538644823362223, "learning_rate": 1.7578964256822018e-06, "loss": 0.0146, "step": 2110 }, { "epoch": 0.7332407085793679, "grad_norm": 0.7312777836477884, "learning_rate": 1.753615458690166e-06, "loss": 0.0172, "step": 2111 }, { "epoch": 0.7335880514067384, "grad_norm": 0.9654485421097003, "learning_rate": 1.7493386019537645e-06, "loss": 0.0279, "step": 2112 }, { "epoch": 0.733935394234109, "grad_norm": 0.4641407414319063, "learning_rate": 1.7450658608879384e-06, "loss": 0.0216, "step": 2113 }, { "epoch": 0.7342827370614797, "grad_norm": 0.6553590581631387, "learning_rate": 1.7407972409024133e-06, "loss": 0.0106, "step": 2114 }, { "epoch": 0.7346300798888503, "grad_norm": 0.7171981611687974, "learning_rate": 1.7365327474016979e-06, "loss": 0.0132, "step": 2115 }, { "epoch": 0.7349774227162209, "grad_norm": 0.6691806372000761, "learning_rate": 1.7322723857850816e-06, "loss": 0.0125, "step": 2116 }, { "epoch": 0.7353247655435915, "grad_norm": 0.8491625521965726, "learning_rate": 1.7280161614466185e-06, "loss": 0.0237, "step": 2117 }, { "epoch": 0.7356721083709621, "grad_norm": 0.7559623864731797, "learning_rate": 1.7237640797751249e-06, "loss": 0.0169, "step": 2118 }, { "epoch": 0.7360194511983328, "grad_norm": 0.9441548462460033, "learning_rate": 1.7195161461541692e-06, "loss": 0.0278, "step": 2119 }, { "epoch": 0.7363667940257034, "grad_norm": 0.5416094909605166, "learning_rate": 1.7152723659620735e-06, "loss": 0.0143, "step": 2120 }, { "epoch": 0.736714136853074, "grad_norm": 0.5132478425979125, "learning_rate": 1.7110327445718995e-06, "loss": 0.0145, "step": 2121 }, { "epoch": 0.7370614796804446, "grad_norm": 0.9110313850628509, "learning_rate": 1.706797287351441e-06, "loss": 0.0255, "step": 2122 }, { "epoch": 0.7374088225078153, "grad_norm": 0.9146610099417635, "learning_rate": 1.7025659996632198e-06, "loss": 0.0152, "step": 2123 }, { "epoch": 0.7377561653351858, "grad_norm": 0.6272856149050274, "learning_rate": 1.6983388868644834e-06, "loss": 0.0161, "step": 2124 }, { "epoch": 0.7381035081625564, "grad_norm": 0.4677956955658168, "learning_rate": 1.6941159543071855e-06, "loss": 0.0159, "step": 2125 }, { "epoch": 0.738450850989927, "grad_norm": 0.4336551332394349, "learning_rate": 1.689897207337996e-06, "loss": 0.0199, "step": 2126 }, { "epoch": 0.7387981938172977, "grad_norm": 0.5590648865175777, "learning_rate": 1.6856826512982772e-06, "loss": 0.0224, "step": 2127 }, { "epoch": 0.7391455366446683, "grad_norm": 0.6332517012226541, "learning_rate": 1.6814722915240922e-06, "loss": 0.0177, "step": 2128 }, { "epoch": 0.7394928794720389, "grad_norm": 0.5184386784352486, "learning_rate": 1.6772661333461858e-06, "loss": 0.0118, "step": 2129 }, { "epoch": 0.7398402222994095, "grad_norm": 0.8189234197423481, "learning_rate": 1.673064182089988e-06, "loss": 0.0147, "step": 2130 }, { "epoch": 0.7401875651267801, "grad_norm": 0.6863762658701025, "learning_rate": 1.6688664430755964e-06, "loss": 0.0227, "step": 2131 }, { "epoch": 0.7405349079541508, "grad_norm": 0.5772056540395054, "learning_rate": 1.6646729216177827e-06, "loss": 0.0181, "step": 2132 }, { "epoch": 0.7408822507815214, "grad_norm": 0.5758071456756751, "learning_rate": 1.6604836230259713e-06, "loss": 0.0143, "step": 2133 }, { "epoch": 0.741229593608892, "grad_norm": 0.5432719143241201, "learning_rate": 1.6562985526042474e-06, "loss": 0.0192, "step": 2134 }, { "epoch": 0.7415769364362625, "grad_norm": 0.4669709741954846, "learning_rate": 1.6521177156513351e-06, "loss": 0.0186, "step": 2135 }, { "epoch": 0.7419242792636332, "grad_norm": 0.6288408754669941, "learning_rate": 1.6479411174606069e-06, "loss": 0.0164, "step": 2136 }, { "epoch": 0.7422716220910038, "grad_norm": 0.5089242192772048, "learning_rate": 1.6437687633200604e-06, "loss": 0.0207, "step": 2137 }, { "epoch": 0.7426189649183744, "grad_norm": 0.6965392337483026, "learning_rate": 1.639600658512327e-06, "loss": 0.0147, "step": 2138 }, { "epoch": 0.742966307745745, "grad_norm": 0.2523207249854437, "learning_rate": 1.6354368083146532e-06, "loss": 0.0099, "step": 2139 }, { "epoch": 0.7433136505731157, "grad_norm": 0.9510970817995196, "learning_rate": 1.6312772179988983e-06, "loss": 0.0257, "step": 2140 }, { "epoch": 0.7436609934004863, "grad_norm": 0.7060524417767109, "learning_rate": 1.6271218928315325e-06, "loss": 0.0199, "step": 2141 }, { "epoch": 0.7440083362278569, "grad_norm": 0.3884044654632346, "learning_rate": 1.6229708380736237e-06, "loss": 0.0153, "step": 2142 }, { "epoch": 0.7443556790552275, "grad_norm": 0.5770229127268146, "learning_rate": 1.6188240589808325e-06, "loss": 0.0227, "step": 2143 }, { "epoch": 0.7447030218825981, "grad_norm": 0.5435182445456208, "learning_rate": 1.6146815608034033e-06, "loss": 0.0142, "step": 2144 }, { "epoch": 0.7450503647099688, "grad_norm": 0.40026444675826955, "learning_rate": 1.6105433487861666e-06, "loss": 0.0172, "step": 2145 }, { "epoch": 0.7453977075373394, "grad_norm": 0.4362346680833222, "learning_rate": 1.6064094281685239e-06, "loss": 0.0113, "step": 2146 }, { "epoch": 0.74574505036471, "grad_norm": 1.0318993306154467, "learning_rate": 1.6022798041844407e-06, "loss": 0.0174, "step": 2147 }, { "epoch": 0.7460923931920805, "grad_norm": 0.5504252591968342, "learning_rate": 1.598154482062443e-06, "loss": 0.0142, "step": 2148 }, { "epoch": 0.7464397360194512, "grad_norm": 0.5110175837775568, "learning_rate": 1.594033467025613e-06, "loss": 0.0114, "step": 2149 }, { "epoch": 0.7467870788468218, "grad_norm": 0.9348497562735546, "learning_rate": 1.5899167642915803e-06, "loss": 0.0244, "step": 2150 }, { "epoch": 0.7471344216741924, "grad_norm": 0.7812495568758424, "learning_rate": 1.5858043790725096e-06, "loss": 0.0246, "step": 2151 }, { "epoch": 0.747481764501563, "grad_norm": 0.9116632509191444, "learning_rate": 1.5816963165751026e-06, "loss": 0.0135, "step": 2152 }, { "epoch": 0.7478291073289337, "grad_norm": 0.5280524057084818, "learning_rate": 1.5775925820005878e-06, "loss": 0.0111, "step": 2153 }, { "epoch": 0.7481764501563043, "grad_norm": 1.0020440686822643, "learning_rate": 1.5734931805447151e-06, "loss": 0.0312, "step": 2154 }, { "epoch": 0.7485237929836749, "grad_norm": 0.5849497270719394, "learning_rate": 1.5693981173977468e-06, "loss": 0.0202, "step": 2155 }, { "epoch": 0.7488711358110455, "grad_norm": 0.4259160011756788, "learning_rate": 1.56530739774445e-06, "loss": 0.0211, "step": 2156 }, { "epoch": 0.7492184786384161, "grad_norm": 0.7638640166062615, "learning_rate": 1.5612210267640987e-06, "loss": 0.0113, "step": 2157 }, { "epoch": 0.7495658214657868, "grad_norm": 0.43127591153772443, "learning_rate": 1.5571390096304545e-06, "loss": 0.0171, "step": 2158 }, { "epoch": 0.7499131642931574, "grad_norm": 0.7943011759564156, "learning_rate": 1.5530613515117721e-06, "loss": 0.0187, "step": 2159 }, { "epoch": 0.7502605071205279, "grad_norm": 1.4178672936850725, "learning_rate": 1.5489880575707821e-06, "loss": 0.0226, "step": 2160 }, { "epoch": 0.7506078499478985, "grad_norm": 0.42032498283687775, "learning_rate": 1.5449191329646951e-06, "loss": 0.0152, "step": 2161 }, { "epoch": 0.7509551927752692, "grad_norm": 0.4736721695112062, "learning_rate": 1.5408545828451838e-06, "loss": 0.0163, "step": 2162 }, { "epoch": 0.7513025356026398, "grad_norm": 0.44347628963881275, "learning_rate": 1.5367944123583884e-06, "loss": 0.0177, "step": 2163 }, { "epoch": 0.7516498784300104, "grad_norm": 0.4305512192092599, "learning_rate": 1.5327386266448973e-06, "loss": 0.0197, "step": 2164 }, { "epoch": 0.751997221257381, "grad_norm": 0.304280894510165, "learning_rate": 1.528687230839755e-06, "loss": 0.0144, "step": 2165 }, { "epoch": 0.7523445640847517, "grad_norm": 0.3806340910734509, "learning_rate": 1.5246402300724406e-06, "loss": 0.0186, "step": 2166 }, { "epoch": 0.7526919069121223, "grad_norm": 0.49033443578842467, "learning_rate": 1.5205976294668745e-06, "loss": 0.0112, "step": 2167 }, { "epoch": 0.7530392497394929, "grad_norm": 0.6297931905210379, "learning_rate": 1.5165594341414014e-06, "loss": 0.0139, "step": 2168 }, { "epoch": 0.7533865925668635, "grad_norm": 0.3703644340810331, "learning_rate": 1.5125256492087925e-06, "loss": 0.0108, "step": 2169 }, { "epoch": 0.7537339353942341, "grad_norm": 0.9633658417879537, "learning_rate": 1.5084962797762303e-06, "loss": 0.0187, "step": 2170 }, { "epoch": 0.7540812782216048, "grad_norm": 0.3787687052461065, "learning_rate": 1.5044713309453135e-06, "loss": 0.0139, "step": 2171 }, { "epoch": 0.7544286210489753, "grad_norm": 1.1842179559226889, "learning_rate": 1.5004508078120378e-06, "loss": 0.0211, "step": 2172 }, { "epoch": 0.7547759638763459, "grad_norm": 0.617894991221659, "learning_rate": 1.4964347154667959e-06, "loss": 0.0235, "step": 2173 }, { "epoch": 0.7551233067037165, "grad_norm": 1.2108002626639653, "learning_rate": 1.4924230589943738e-06, "loss": 0.025, "step": 2174 }, { "epoch": 0.7554706495310872, "grad_norm": 0.5389764197428066, "learning_rate": 1.488415843473942e-06, "loss": 0.0222, "step": 2175 }, { "epoch": 0.7558179923584578, "grad_norm": 0.5678337368948643, "learning_rate": 1.4844130739790441e-06, "loss": 0.0121, "step": 2176 }, { "epoch": 0.7561653351858284, "grad_norm": 0.5096074221105189, "learning_rate": 1.4804147555775955e-06, "loss": 0.0162, "step": 2177 }, { "epoch": 0.756512678013199, "grad_norm": 0.3945803267973934, "learning_rate": 1.4764208933318786e-06, "loss": 0.0148, "step": 2178 }, { "epoch": 0.7568600208405697, "grad_norm": 0.7056464507505118, "learning_rate": 1.472431492298534e-06, "loss": 0.0277, "step": 2179 }, { "epoch": 0.7572073636679403, "grad_norm": 0.4702850476473721, "learning_rate": 1.4684465575285507e-06, "loss": 0.0227, "step": 2180 }, { "epoch": 0.7575547064953109, "grad_norm": 0.7774483178392437, "learning_rate": 1.4644660940672628e-06, "loss": 0.0165, "step": 2181 }, { "epoch": 0.7579020493226815, "grad_norm": 0.63743536927089, "learning_rate": 1.4604901069543475e-06, "loss": 0.0147, "step": 2182 }, { "epoch": 0.758249392150052, "grad_norm": 1.2057859393359813, "learning_rate": 1.4565186012238126e-06, "loss": 0.02, "step": 2183 }, { "epoch": 0.7585967349774227, "grad_norm": 0.6058695506497026, "learning_rate": 1.452551581903991e-06, "loss": 0.0171, "step": 2184 }, { "epoch": 0.7589440778047933, "grad_norm": 1.0470651557089286, "learning_rate": 1.4485890540175335e-06, "loss": 0.0211, "step": 2185 }, { "epoch": 0.7592914206321639, "grad_norm": 0.5419134679450275, "learning_rate": 1.4446310225814087e-06, "loss": 0.017, "step": 2186 }, { "epoch": 0.7596387634595345, "grad_norm": 0.6607230911522339, "learning_rate": 1.4406774926068912e-06, "loss": 0.0097, "step": 2187 }, { "epoch": 0.7599861062869052, "grad_norm": 0.3575619358712941, "learning_rate": 1.4367284690995543e-06, "loss": 0.0156, "step": 2188 }, { "epoch": 0.7603334491142758, "grad_norm": 0.36101367127865347, "learning_rate": 1.4327839570592644e-06, "loss": 0.0132, "step": 2189 }, { "epoch": 0.7606807919416464, "grad_norm": 0.5040284990352497, "learning_rate": 1.4288439614801803e-06, "loss": 0.0162, "step": 2190 }, { "epoch": 0.761028134769017, "grad_norm": 0.9760018252254876, "learning_rate": 1.4249084873507412e-06, "loss": 0.0233, "step": 2191 }, { "epoch": 0.7613754775963877, "grad_norm": 0.6260035980310652, "learning_rate": 1.4209775396536595e-06, "loss": 0.0247, "step": 2192 }, { "epoch": 0.7617228204237583, "grad_norm": 0.723938835995245, "learning_rate": 1.4170511233659167e-06, "loss": 0.0232, "step": 2193 }, { "epoch": 0.7620701632511289, "grad_norm": 0.5719862112874641, "learning_rate": 1.4131292434587613e-06, "loss": 0.0187, "step": 2194 }, { "epoch": 0.7624175060784995, "grad_norm": 0.6344034231978065, "learning_rate": 1.409211904897692e-06, "loss": 0.0128, "step": 2195 }, { "epoch": 0.76276484890587, "grad_norm": 1.0837290646085158, "learning_rate": 1.4052991126424642e-06, "loss": 0.03, "step": 2196 }, { "epoch": 0.7631121917332407, "grad_norm": 0.5566631869774307, "learning_rate": 1.4013908716470714e-06, "loss": 0.0188, "step": 2197 }, { "epoch": 0.7634595345606113, "grad_norm": 0.9291107450950402, "learning_rate": 1.3974871868597495e-06, "loss": 0.0212, "step": 2198 }, { "epoch": 0.7638068773879819, "grad_norm": 0.6433035528020483, "learning_rate": 1.3935880632229614e-06, "loss": 0.0148, "step": 2199 }, { "epoch": 0.7641542202153525, "grad_norm": 0.5318264623442518, "learning_rate": 1.3896935056734001e-06, "loss": 0.0143, "step": 2200 }, { "epoch": 0.7645015630427232, "grad_norm": 0.3392416546423096, "learning_rate": 1.385803519141971e-06, "loss": 0.01, "step": 2201 }, { "epoch": 0.7648489058700938, "grad_norm": 0.4357906997308162, "learning_rate": 1.3819181085538002e-06, "loss": 0.0158, "step": 2202 }, { "epoch": 0.7651962486974644, "grad_norm": 0.8938906147298566, "learning_rate": 1.378037278828212e-06, "loss": 0.0157, "step": 2203 }, { "epoch": 0.765543591524835, "grad_norm": 0.812151390132714, "learning_rate": 1.3741610348787382e-06, "loss": 0.0201, "step": 2204 }, { "epoch": 0.7658909343522057, "grad_norm": 1.046728570840569, "learning_rate": 1.3702893816130968e-06, "loss": 0.0138, "step": 2205 }, { "epoch": 0.7662382771795763, "grad_norm": 0.8470882919333848, "learning_rate": 1.366422323933202e-06, "loss": 0.0217, "step": 2206 }, { "epoch": 0.7665856200069469, "grad_norm": 0.6419335535775957, "learning_rate": 1.362559866735142e-06, "loss": 0.0188, "step": 2207 }, { "epoch": 0.7669329628343174, "grad_norm": 1.0195274448965153, "learning_rate": 1.3587020149091856e-06, "loss": 0.0188, "step": 2208 }, { "epoch": 0.767280305661688, "grad_norm": 0.8491806303868048, "learning_rate": 1.3548487733397686e-06, "loss": 0.0136, "step": 2209 }, { "epoch": 0.7676276484890587, "grad_norm": 0.496099824248827, "learning_rate": 1.351000146905488e-06, "loss": 0.0159, "step": 2210 }, { "epoch": 0.7679749913164293, "grad_norm": 0.36784724782559614, "learning_rate": 1.3471561404791e-06, "loss": 0.0099, "step": 2211 }, { "epoch": 0.7683223341437999, "grad_norm": 0.853519257078698, "learning_rate": 1.343316758927513e-06, "loss": 0.0152, "step": 2212 }, { "epoch": 0.7686696769711705, "grad_norm": 0.4794009903353391, "learning_rate": 1.3394820071117765e-06, "loss": 0.0178, "step": 2213 }, { "epoch": 0.7690170197985412, "grad_norm": 0.5638174933594083, "learning_rate": 1.3356518898870773e-06, "loss": 0.0226, "step": 2214 }, { "epoch": 0.7693643626259118, "grad_norm": 0.7867218167416455, "learning_rate": 1.331826412102738e-06, "loss": 0.02, "step": 2215 }, { "epoch": 0.7697117054532824, "grad_norm": 1.2101224505178252, "learning_rate": 1.3280055786022078e-06, "loss": 0.0339, "step": 2216 }, { "epoch": 0.770059048280653, "grad_norm": 0.4560784481807581, "learning_rate": 1.3241893942230511e-06, "loss": 0.0153, "step": 2217 }, { "epoch": 0.7704063911080237, "grad_norm": 1.089455223796198, "learning_rate": 1.3203778637969478e-06, "loss": 0.02, "step": 2218 }, { "epoch": 0.7707537339353943, "grad_norm": 0.8102592696114593, "learning_rate": 1.3165709921496873e-06, "loss": 0.0146, "step": 2219 }, { "epoch": 0.7711010767627648, "grad_norm": 0.2248352987503352, "learning_rate": 1.312768784101161e-06, "loss": 0.0096, "step": 2220 }, { "epoch": 0.7714484195901354, "grad_norm": 0.6361836378952002, "learning_rate": 1.3089712444653525e-06, "loss": 0.0282, "step": 2221 }, { "epoch": 0.771795762417506, "grad_norm": 0.6975087143669682, "learning_rate": 1.3051783780503353e-06, "loss": 0.0123, "step": 2222 }, { "epoch": 0.7721431052448767, "grad_norm": 1.2464836526873286, "learning_rate": 1.3013901896582677e-06, "loss": 0.027, "step": 2223 }, { "epoch": 0.7724904480722473, "grad_norm": 0.4282700872207025, "learning_rate": 1.2976066840853862e-06, "loss": 0.0198, "step": 2224 }, { "epoch": 0.7728377908996179, "grad_norm": 0.4954085912972942, "learning_rate": 1.2938278661219961e-06, "loss": 0.011, "step": 2225 }, { "epoch": 0.7731851337269885, "grad_norm": 0.64840741080104, "learning_rate": 1.290053740552466e-06, "loss": 0.0198, "step": 2226 }, { "epoch": 0.7735324765543592, "grad_norm": 0.582268992296493, "learning_rate": 1.2862843121552293e-06, "loss": 0.0121, "step": 2227 }, { "epoch": 0.7738798193817298, "grad_norm": 0.5788123624597133, "learning_rate": 1.282519585702765e-06, "loss": 0.0181, "step": 2228 }, { "epoch": 0.7742271622091004, "grad_norm": 0.42036499722806164, "learning_rate": 1.2787595659616063e-06, "loss": 0.019, "step": 2229 }, { "epoch": 0.774574505036471, "grad_norm": 1.2403906461674592, "learning_rate": 1.275004257692321e-06, "loss": 0.0192, "step": 2230 }, { "epoch": 0.7749218478638417, "grad_norm": 0.33842093190437655, "learning_rate": 1.2712536656495167e-06, "loss": 0.0132, "step": 2231 }, { "epoch": 0.7752691906912123, "grad_norm": 0.9723522981189887, "learning_rate": 1.2675077945818249e-06, "loss": 0.0194, "step": 2232 }, { "epoch": 0.7756165335185828, "grad_norm": 0.6452976079476872, "learning_rate": 1.263766649231905e-06, "loss": 0.0235, "step": 2233 }, { "epoch": 0.7759638763459534, "grad_norm": 0.5932675671658317, "learning_rate": 1.260030234336428e-06, "loss": 0.016, "step": 2234 }, { "epoch": 0.776311219173324, "grad_norm": 0.76806867313472, "learning_rate": 1.2562985546260804e-06, "loss": 0.0187, "step": 2235 }, { "epoch": 0.7766585620006947, "grad_norm": 0.5650815303570309, "learning_rate": 1.252571614825549e-06, "loss": 0.0169, "step": 2236 }, { "epoch": 0.7770059048280653, "grad_norm": 0.48992636927446587, "learning_rate": 1.2488494196535238e-06, "loss": 0.0198, "step": 2237 }, { "epoch": 0.7773532476554359, "grad_norm": 0.47434548335024923, "learning_rate": 1.2451319738226835e-06, "loss": 0.0148, "step": 2238 }, { "epoch": 0.7777005904828065, "grad_norm": 1.587581496347539, "learning_rate": 1.2414192820396987e-06, "loss": 0.016, "step": 2239 }, { "epoch": 0.7780479333101772, "grad_norm": 0.5941570896375952, "learning_rate": 1.237711349005214e-06, "loss": 0.0224, "step": 2240 }, { "epoch": 0.7783952761375478, "grad_norm": 0.45192769841614433, "learning_rate": 1.234008179413856e-06, "loss": 0.018, "step": 2241 }, { "epoch": 0.7787426189649184, "grad_norm": 0.5359045238517698, "learning_rate": 1.2303097779542151e-06, "loss": 0.015, "step": 2242 }, { "epoch": 0.779089961792289, "grad_norm": 0.3170868716065785, "learning_rate": 1.2266161493088463e-06, "loss": 0.0157, "step": 2243 }, { "epoch": 0.7794373046196597, "grad_norm": 0.5339325400301773, "learning_rate": 1.2229272981542628e-06, "loss": 0.017, "step": 2244 }, { "epoch": 0.7797846474470302, "grad_norm": 0.5749594088375014, "learning_rate": 1.2192432291609296e-06, "loss": 0.0191, "step": 2245 }, { "epoch": 0.7801319902744008, "grad_norm": 0.8657654364305364, "learning_rate": 1.2155639469932551e-06, "loss": 0.0157, "step": 2246 }, { "epoch": 0.7804793331017714, "grad_norm": 0.6662929638257185, "learning_rate": 1.2118894563095857e-06, "loss": 0.0103, "step": 2247 }, { "epoch": 0.780826675929142, "grad_norm": 0.6185784496325207, "learning_rate": 1.2082197617622049e-06, "loss": 0.0271, "step": 2248 }, { "epoch": 0.7811740187565127, "grad_norm": 0.6728910637327504, "learning_rate": 1.2045548679973234e-06, "loss": 0.0204, "step": 2249 }, { "epoch": 0.7815213615838833, "grad_norm": 0.503622779557992, "learning_rate": 1.2008947796550714e-06, "loss": 0.0168, "step": 2250 }, { "epoch": 0.7818687044112539, "grad_norm": 0.7057212026990866, "learning_rate": 1.1972395013694944e-06, "loss": 0.0178, "step": 2251 }, { "epoch": 0.7822160472386245, "grad_norm": 0.9263129101479496, "learning_rate": 1.1935890377685499e-06, "loss": 0.0177, "step": 2252 }, { "epoch": 0.7825633900659952, "grad_norm": 0.5928517673288792, "learning_rate": 1.1899433934741023e-06, "loss": 0.0242, "step": 2253 }, { "epoch": 0.7829107328933658, "grad_norm": 0.286499819754983, "learning_rate": 1.186302573101908e-06, "loss": 0.011, "step": 2254 }, { "epoch": 0.7832580757207364, "grad_norm": 1.0908468375007774, "learning_rate": 1.1826665812616183e-06, "loss": 0.0343, "step": 2255 }, { "epoch": 0.783605418548107, "grad_norm": 0.3623784367226302, "learning_rate": 1.1790354225567724e-06, "loss": 0.0112, "step": 2256 }, { "epoch": 0.7839527613754776, "grad_norm": 0.3445252725726735, "learning_rate": 1.175409101584793e-06, "loss": 0.0093, "step": 2257 }, { "epoch": 0.7843001042028482, "grad_norm": 0.5433146108276392, "learning_rate": 1.1717876229369679e-06, "loss": 0.0151, "step": 2258 }, { "epoch": 0.7846474470302188, "grad_norm": 0.63386269312144, "learning_rate": 1.168170991198464e-06, "loss": 0.0211, "step": 2259 }, { "epoch": 0.7849947898575894, "grad_norm": 0.5237366655341864, "learning_rate": 1.1645592109483083e-06, "loss": 0.0106, "step": 2260 }, { "epoch": 0.78534213268496, "grad_norm": 0.4926580735814283, "learning_rate": 1.1609522867593825e-06, "loss": 0.0164, "step": 2261 }, { "epoch": 0.7856894755123307, "grad_norm": 0.618563652888975, "learning_rate": 1.1573502231984252e-06, "loss": 0.0171, "step": 2262 }, { "epoch": 0.7860368183397013, "grad_norm": 0.42754249596279326, "learning_rate": 1.1537530248260154e-06, "loss": 0.0094, "step": 2263 }, { "epoch": 0.7863841611670719, "grad_norm": 0.4479479051275309, "learning_rate": 1.1501606961965772e-06, "loss": 0.0164, "step": 2264 }, { "epoch": 0.7867315039944425, "grad_norm": 1.0791717141203099, "learning_rate": 1.1465732418583652e-06, "loss": 0.0216, "step": 2265 }, { "epoch": 0.7870788468218132, "grad_norm": 0.7585334147960321, "learning_rate": 1.1429906663534661e-06, "loss": 0.0199, "step": 2266 }, { "epoch": 0.7874261896491838, "grad_norm": 0.5631229523842227, "learning_rate": 1.1394129742177856e-06, "loss": 0.0155, "step": 2267 }, { "epoch": 0.7877735324765544, "grad_norm": 0.7372418292459169, "learning_rate": 1.1358401699810513e-06, "loss": 0.0162, "step": 2268 }, { "epoch": 0.7881208753039249, "grad_norm": 0.3814232843076739, "learning_rate": 1.1322722581667972e-06, "loss": 0.012, "step": 2269 }, { "epoch": 0.7884682181312956, "grad_norm": 0.3900814023948877, "learning_rate": 1.1287092432923675e-06, "loss": 0.0165, "step": 2270 }, { "epoch": 0.7888155609586662, "grad_norm": 0.3684093471163428, "learning_rate": 1.1251511298689015e-06, "loss": 0.0129, "step": 2271 }, { "epoch": 0.7891629037860368, "grad_norm": 0.41541558092287234, "learning_rate": 1.1215979224013395e-06, "loss": 0.0073, "step": 2272 }, { "epoch": 0.7895102466134074, "grad_norm": 0.45431105796049687, "learning_rate": 1.1180496253884028e-06, "loss": 0.0135, "step": 2273 }, { "epoch": 0.789857589440778, "grad_norm": 0.8135773753424956, "learning_rate": 1.1145062433226018e-06, "loss": 0.0125, "step": 2274 }, { "epoch": 0.7902049322681487, "grad_norm": 0.6545808136769573, "learning_rate": 1.1109677806902203e-06, "loss": 0.0136, "step": 2275 }, { "epoch": 0.7905522750955193, "grad_norm": 0.5330476620089217, "learning_rate": 1.107434241971313e-06, "loss": 0.0079, "step": 2276 }, { "epoch": 0.7908996179228899, "grad_norm": 0.6003919050746848, "learning_rate": 1.1039056316397046e-06, "loss": 0.014, "step": 2277 }, { "epoch": 0.7912469607502605, "grad_norm": 0.4179604464910238, "learning_rate": 1.1003819541629772e-06, "loss": 0.01, "step": 2278 }, { "epoch": 0.7915943035776312, "grad_norm": 0.7808363334276369, "learning_rate": 1.0968632140024683e-06, "loss": 0.0152, "step": 2279 }, { "epoch": 0.7919416464050018, "grad_norm": 0.4167603863620862, "learning_rate": 1.0933494156132607e-06, "loss": 0.0169, "step": 2280 }, { "epoch": 0.7922889892323723, "grad_norm": 1.012723748239046, "learning_rate": 1.0898405634441856e-06, "loss": 0.0185, "step": 2281 }, { "epoch": 0.7926363320597429, "grad_norm": 0.6056013118255872, "learning_rate": 1.0863366619378107e-06, "loss": 0.016, "step": 2282 }, { "epoch": 0.7929836748871136, "grad_norm": 0.7450736652041653, "learning_rate": 1.0828377155304332e-06, "loss": 0.0224, "step": 2283 }, { "epoch": 0.7933310177144842, "grad_norm": 1.0221605650577856, "learning_rate": 1.0793437286520765e-06, "loss": 0.0195, "step": 2284 }, { "epoch": 0.7936783605418548, "grad_norm": 0.6636410309989438, "learning_rate": 1.0758547057264873e-06, "loss": 0.0199, "step": 2285 }, { "epoch": 0.7940257033692254, "grad_norm": 1.1325176009144873, "learning_rate": 1.072370651171128e-06, "loss": 0.0233, "step": 2286 }, { "epoch": 0.794373046196596, "grad_norm": 0.5341830366952061, "learning_rate": 1.0688915693971675e-06, "loss": 0.012, "step": 2287 }, { "epoch": 0.7947203890239667, "grad_norm": 0.6153155573713803, "learning_rate": 1.0654174648094783e-06, "loss": 0.0198, "step": 2288 }, { "epoch": 0.7950677318513373, "grad_norm": 0.9623001317484534, "learning_rate": 1.0619483418066346e-06, "loss": 0.0169, "step": 2289 }, { "epoch": 0.7954150746787079, "grad_norm": 0.7223221290889841, "learning_rate": 1.0584842047809047e-06, "loss": 0.0155, "step": 2290 }, { "epoch": 0.7957624175060785, "grad_norm": 0.6504553961602982, "learning_rate": 1.0550250581182353e-06, "loss": 0.0224, "step": 2291 }, { "epoch": 0.7961097603334492, "grad_norm": 1.0998554308853323, "learning_rate": 1.0515709061982632e-06, "loss": 0.0255, "step": 2292 }, { "epoch": 0.7964571031608197, "grad_norm": 0.4931210569774452, "learning_rate": 1.048121753394301e-06, "loss": 0.0246, "step": 2293 }, { "epoch": 0.7968044459881903, "grad_norm": 0.8526641555443321, "learning_rate": 1.044677604073328e-06, "loss": 0.028, "step": 2294 }, { "epoch": 0.7971517888155609, "grad_norm": 0.6692895381848059, "learning_rate": 1.0412384625959887e-06, "loss": 0.02, "step": 2295 }, { "epoch": 0.7974991316429316, "grad_norm": 0.419733606925217, "learning_rate": 1.037804333316591e-06, "loss": 0.0143, "step": 2296 }, { "epoch": 0.7978464744703022, "grad_norm": 0.4921795286660977, "learning_rate": 1.0343752205830948e-06, "loss": 0.0166, "step": 2297 }, { "epoch": 0.7981938172976728, "grad_norm": 0.7772755035303158, "learning_rate": 1.030951128737106e-06, "loss": 0.0218, "step": 2298 }, { "epoch": 0.7985411601250434, "grad_norm": 0.5358777594915758, "learning_rate": 1.027532062113879e-06, "loss": 0.0237, "step": 2299 }, { "epoch": 0.798888502952414, "grad_norm": 0.5660888423523717, "learning_rate": 1.0241180250423e-06, "loss": 0.0152, "step": 2300 }, { "epoch": 0.7992358457797847, "grad_norm": 1.1542651595626319, "learning_rate": 1.0207090218448923e-06, "loss": 0.0125, "step": 2301 }, { "epoch": 0.7995831886071553, "grad_norm": 1.2171482202491108, "learning_rate": 1.0173050568378002e-06, "loss": 0.0215, "step": 2302 }, { "epoch": 0.7999305314345259, "grad_norm": 1.1045425181167858, "learning_rate": 1.013906134330796e-06, "loss": 0.0132, "step": 2303 }, { "epoch": 0.8002778742618964, "grad_norm": 0.8205260763175557, "learning_rate": 1.0105122586272615e-06, "loss": 0.0168, "step": 2304 }, { "epoch": 0.8006252170892671, "grad_norm": 0.6460529317376611, "learning_rate": 1.0071234340241925e-06, "loss": 0.0146, "step": 2305 }, { "epoch": 0.8009725599166377, "grad_norm": 1.1028425848766075, "learning_rate": 1.0037396648121872e-06, "loss": 0.0203, "step": 2306 }, { "epoch": 0.8013199027440083, "grad_norm": 0.6040772659608722, "learning_rate": 1.0003609552754468e-06, "loss": 0.0131, "step": 2307 }, { "epoch": 0.8016672455713789, "grad_norm": 0.42232669891432706, "learning_rate": 9.969873096917614e-07, "loss": 0.0113, "step": 2308 }, { "epoch": 0.8020145883987496, "grad_norm": 0.700969940979702, "learning_rate": 9.93618732332512e-07, "loss": 0.0191, "step": 2309 }, { "epoch": 0.8023619312261202, "grad_norm": 0.8526190683813358, "learning_rate": 9.902552274626638e-07, "loss": 0.0206, "step": 2310 }, { "epoch": 0.8027092740534908, "grad_norm": 0.5131311092706733, "learning_rate": 9.868967993407603e-07, "loss": 0.0139, "step": 2311 }, { "epoch": 0.8030566168808614, "grad_norm": 0.5377387868045655, "learning_rate": 9.83543452218914e-07, "loss": 0.0208, "step": 2312 }, { "epoch": 0.803403959708232, "grad_norm": 0.6207978174819988, "learning_rate": 9.801951903428053e-07, "loss": 0.0169, "step": 2313 }, { "epoch": 0.8037513025356027, "grad_norm": 1.0623394552894256, "learning_rate": 9.768520179516782e-07, "loss": 0.033, "step": 2314 }, { "epoch": 0.8040986453629733, "grad_norm": 0.29785651145166425, "learning_rate": 9.735139392783326e-07, "loss": 0.0109, "step": 2315 }, { "epoch": 0.8044459881903439, "grad_norm": 2.62736664673775, "learning_rate": 9.70180958549118e-07, "loss": 0.025, "step": 2316 }, { "epoch": 0.8047933310177144, "grad_norm": 0.5896374439097164, "learning_rate": 9.66853079983927e-07, "loss": 0.0225, "step": 2317 }, { "epoch": 0.8051406738450851, "grad_norm": 0.7960436391597935, "learning_rate": 9.63530307796197e-07, "loss": 0.0112, "step": 2318 }, { "epoch": 0.8054880166724557, "grad_norm": 0.615407883572327, "learning_rate": 9.602126461929002e-07, "loss": 0.0224, "step": 2319 }, { "epoch": 0.8058353594998263, "grad_norm": 0.48130754370985307, "learning_rate": 9.569000993745336e-07, "loss": 0.0161, "step": 2320 }, { "epoch": 0.8061827023271969, "grad_norm": 0.41555455314622086, "learning_rate": 9.535926715351207e-07, "loss": 0.0124, "step": 2321 }, { "epoch": 0.8065300451545676, "grad_norm": 0.8444184620512747, "learning_rate": 9.502903668622055e-07, "loss": 0.0166, "step": 2322 }, { "epoch": 0.8068773879819382, "grad_norm": 0.6578650196091546, "learning_rate": 9.469931895368462e-07, "loss": 0.0193, "step": 2323 }, { "epoch": 0.8072247308093088, "grad_norm": 0.8813060676876311, "learning_rate": 9.43701143733603e-07, "loss": 0.017, "step": 2324 }, { "epoch": 0.8075720736366794, "grad_norm": 0.48427747281839323, "learning_rate": 9.404142336205452e-07, "loss": 0.018, "step": 2325 }, { "epoch": 0.80791941646405, "grad_norm": 0.5218223865429743, "learning_rate": 9.371324633592399e-07, "loss": 0.0165, "step": 2326 }, { "epoch": 0.8082667592914207, "grad_norm": 0.5497878546670253, "learning_rate": 9.338558371047429e-07, "loss": 0.0155, "step": 2327 }, { "epoch": 0.8086141021187913, "grad_norm": 0.5828845938065368, "learning_rate": 9.30584359005598e-07, "loss": 0.0226, "step": 2328 }, { "epoch": 0.8089614449461618, "grad_norm": 0.45607595995290867, "learning_rate": 9.273180332038328e-07, "loss": 0.012, "step": 2329 }, { "epoch": 0.8093087877735324, "grad_norm": 0.37620117048721097, "learning_rate": 9.240568638349523e-07, "loss": 0.0131, "step": 2330 }, { "epoch": 0.8096561306009031, "grad_norm": 0.3290671929012356, "learning_rate": 9.208008550279296e-07, "loss": 0.0103, "step": 2331 }, { "epoch": 0.8100034734282737, "grad_norm": 0.623859993612422, "learning_rate": 9.175500109052044e-07, "loss": 0.022, "step": 2332 }, { "epoch": 0.8103508162556443, "grad_norm": 0.38503253698459383, "learning_rate": 9.143043355826802e-07, "loss": 0.0169, "step": 2333 }, { "epoch": 0.8106981590830149, "grad_norm": 0.5608051701389355, "learning_rate": 9.110638331697158e-07, "loss": 0.0145, "step": 2334 }, { "epoch": 0.8110455019103856, "grad_norm": 0.8102603807769083, "learning_rate": 9.078285077691179e-07, "loss": 0.0205, "step": 2335 }, { "epoch": 0.8113928447377562, "grad_norm": 0.8701051653222716, "learning_rate": 9.045983634771388e-07, "loss": 0.0144, "step": 2336 }, { "epoch": 0.8117401875651268, "grad_norm": 0.6909556680271054, "learning_rate": 9.013734043834743e-07, "loss": 0.0159, "step": 2337 }, { "epoch": 0.8120875303924974, "grad_norm": 1.0316061837686277, "learning_rate": 8.981536345712544e-07, "loss": 0.0184, "step": 2338 }, { "epoch": 0.812434873219868, "grad_norm": 0.7754820020926894, "learning_rate": 8.949390581170341e-07, "loss": 0.0241, "step": 2339 }, { "epoch": 0.8127822160472387, "grad_norm": 0.39205221031248105, "learning_rate": 8.917296790908009e-07, "loss": 0.0134, "step": 2340 }, { "epoch": 0.8131295588746092, "grad_norm": 0.7321244654973654, "learning_rate": 8.885255015559552e-07, "loss": 0.0192, "step": 2341 }, { "epoch": 0.8134769017019798, "grad_norm": 1.6237885505587426, "learning_rate": 8.853265295693131e-07, "loss": 0.0182, "step": 2342 }, { "epoch": 0.8138242445293504, "grad_norm": 0.40790172026137156, "learning_rate": 8.821327671811025e-07, "loss": 0.0158, "step": 2343 }, { "epoch": 0.8141715873567211, "grad_norm": 0.6612071453798002, "learning_rate": 8.789442184349556e-07, "loss": 0.0232, "step": 2344 }, { "epoch": 0.8145189301840917, "grad_norm": 0.5315701172252235, "learning_rate": 8.757608873679008e-07, "loss": 0.0226, "step": 2345 }, { "epoch": 0.8148662730114623, "grad_norm": 0.47678792802998987, "learning_rate": 8.72582778010359e-07, "loss": 0.0095, "step": 2346 }, { "epoch": 0.8152136158388329, "grad_norm": 0.4744887984080288, "learning_rate": 8.694098943861457e-07, "loss": 0.0134, "step": 2347 }, { "epoch": 0.8155609586662036, "grad_norm": 0.5992243344701217, "learning_rate": 8.662422405124565e-07, "loss": 0.0147, "step": 2348 }, { "epoch": 0.8159083014935742, "grad_norm": 0.45940543311353116, "learning_rate": 8.630798203998653e-07, "loss": 0.0171, "step": 2349 }, { "epoch": 0.8162556443209448, "grad_norm": 0.5247973512112908, "learning_rate": 8.59922638052319e-07, "loss": 0.0167, "step": 2350 }, { "epoch": 0.8166029871483154, "grad_norm": 0.5656596886450084, "learning_rate": 8.567706974671353e-07, "loss": 0.012, "step": 2351 }, { "epoch": 0.816950329975686, "grad_norm": 0.8482170289088908, "learning_rate": 8.536240026349951e-07, "loss": 0.0219, "step": 2352 }, { "epoch": 0.8172976728030567, "grad_norm": 0.6035908558385402, "learning_rate": 8.504825575399356e-07, "loss": 0.0169, "step": 2353 }, { "epoch": 0.8176450156304272, "grad_norm": 1.3540920045468, "learning_rate": 8.473463661593473e-07, "loss": 0.0197, "step": 2354 }, { "epoch": 0.8179923584577978, "grad_norm": 0.7084599678534361, "learning_rate": 8.442154324639706e-07, "loss": 0.0156, "step": 2355 }, { "epoch": 0.8183397012851684, "grad_norm": 0.40515237224446915, "learning_rate": 8.410897604178913e-07, "loss": 0.0144, "step": 2356 }, { "epoch": 0.8186870441125391, "grad_norm": 1.0892964020587188, "learning_rate": 8.379693539785266e-07, "loss": 0.0292, "step": 2357 }, { "epoch": 0.8190343869399097, "grad_norm": 1.4771693089117002, "learning_rate": 8.348542170966317e-07, "loss": 0.0215, "step": 2358 }, { "epoch": 0.8193817297672803, "grad_norm": 0.4065081151813109, "learning_rate": 8.317443537162922e-07, "loss": 0.0146, "step": 2359 }, { "epoch": 0.8197290725946509, "grad_norm": 0.5743831870975237, "learning_rate": 8.286397677749114e-07, "loss": 0.0201, "step": 2360 }, { "epoch": 0.8200764154220216, "grad_norm": 0.5683746981641697, "learning_rate": 8.255404632032126e-07, "loss": 0.017, "step": 2361 }, { "epoch": 0.8204237582493922, "grad_norm": 0.6228554253741776, "learning_rate": 8.224464439252344e-07, "loss": 0.0233, "step": 2362 }, { "epoch": 0.8207711010767628, "grad_norm": 0.9700168786950992, "learning_rate": 8.193577138583242e-07, "loss": 0.0145, "step": 2363 }, { "epoch": 0.8211184439041334, "grad_norm": 0.5794044470730007, "learning_rate": 8.162742769131282e-07, "loss": 0.0197, "step": 2364 }, { "epoch": 0.8214657867315039, "grad_norm": 0.36573146405518225, "learning_rate": 8.131961369935943e-07, "loss": 0.0165, "step": 2365 }, { "epoch": 0.8218131295588746, "grad_norm": 0.8465877042129827, "learning_rate": 8.101232979969625e-07, "loss": 0.0154, "step": 2366 }, { "epoch": 0.8221604723862452, "grad_norm": 0.40438541583542964, "learning_rate": 8.070557638137649e-07, "loss": 0.0104, "step": 2367 }, { "epoch": 0.8225078152136158, "grad_norm": 0.8024309888658482, "learning_rate": 8.039935383278119e-07, "loss": 0.0194, "step": 2368 }, { "epoch": 0.8228551580409864, "grad_norm": 0.5129057990979784, "learning_rate": 8.009366254161943e-07, "loss": 0.0228, "step": 2369 }, { "epoch": 0.8232025008683571, "grad_norm": 0.3255972401547693, "learning_rate": 7.978850289492779e-07, "loss": 0.013, "step": 2370 }, { "epoch": 0.8235498436957277, "grad_norm": 0.7555922278203009, "learning_rate": 7.948387527906987e-07, "loss": 0.0198, "step": 2371 }, { "epoch": 0.8238971865230983, "grad_norm": 0.7146376306999533, "learning_rate": 7.91797800797352e-07, "loss": 0.0164, "step": 2372 }, { "epoch": 0.8242445293504689, "grad_norm": 0.37704727475605315, "learning_rate": 7.887621768193954e-07, "loss": 0.0195, "step": 2373 }, { "epoch": 0.8245918721778396, "grad_norm": 0.7235950861900677, "learning_rate": 7.85731884700241e-07, "loss": 0.0123, "step": 2374 }, { "epoch": 0.8249392150052102, "grad_norm": 0.5352341742681154, "learning_rate": 7.827069282765475e-07, "loss": 0.0173, "step": 2375 }, { "epoch": 0.8252865578325808, "grad_norm": 0.35228903667411676, "learning_rate": 7.796873113782205e-07, "loss": 0.0138, "step": 2376 }, { "epoch": 0.8256339006599513, "grad_norm": 0.8622702937938816, "learning_rate": 7.766730378284065e-07, "loss": 0.0171, "step": 2377 }, { "epoch": 0.8259812434873219, "grad_norm": 0.9369503338878897, "learning_rate": 7.736641114434834e-07, "loss": 0.0206, "step": 2378 }, { "epoch": 0.8263285863146926, "grad_norm": 0.45846125634992196, "learning_rate": 7.706605360330594e-07, "loss": 0.0174, "step": 2379 }, { "epoch": 0.8266759291420632, "grad_norm": 0.3955188331989464, "learning_rate": 7.676623153999696e-07, "loss": 0.016, "step": 2380 }, { "epoch": 0.8270232719694338, "grad_norm": 0.7836478004454387, "learning_rate": 7.646694533402699e-07, "loss": 0.0261, "step": 2381 }, { "epoch": 0.8273706147968044, "grad_norm": 0.33544601587638473, "learning_rate": 7.616819536432296e-07, "loss": 0.0162, "step": 2382 }, { "epoch": 0.8277179576241751, "grad_norm": 0.7414613842115939, "learning_rate": 7.586998200913282e-07, "loss": 0.0209, "step": 2383 }, { "epoch": 0.8280653004515457, "grad_norm": 0.4693067215853339, "learning_rate": 7.557230564602541e-07, "loss": 0.0238, "step": 2384 }, { "epoch": 0.8284126432789163, "grad_norm": 0.43643356007744005, "learning_rate": 7.527516665188956e-07, "loss": 0.0112, "step": 2385 }, { "epoch": 0.8287599861062869, "grad_norm": 0.7401366594367462, "learning_rate": 7.497856540293369e-07, "loss": 0.0249, "step": 2386 }, { "epoch": 0.8291073289336576, "grad_norm": 0.6044238634021094, "learning_rate": 7.468250227468515e-07, "loss": 0.0167, "step": 2387 }, { "epoch": 0.8294546717610282, "grad_norm": 0.42786196774482066, "learning_rate": 7.438697764199043e-07, "loss": 0.0167, "step": 2388 }, { "epoch": 0.8298020145883988, "grad_norm": 0.2809219680220145, "learning_rate": 7.409199187901417e-07, "loss": 0.0103, "step": 2389 }, { "epoch": 0.8301493574157693, "grad_norm": 0.3850053448956444, "learning_rate": 7.379754535923817e-07, "loss": 0.011, "step": 2390 }, { "epoch": 0.8304967002431399, "grad_norm": 0.5129627163049738, "learning_rate": 7.35036384554621e-07, "loss": 0.0144, "step": 2391 }, { "epoch": 0.8308440430705106, "grad_norm": 0.7155517690937496, "learning_rate": 7.321027153980237e-07, "loss": 0.0183, "step": 2392 }, { "epoch": 0.8311913858978812, "grad_norm": 0.7899372791574505, "learning_rate": 7.291744498369146e-07, "loss": 0.0179, "step": 2393 }, { "epoch": 0.8315387287252518, "grad_norm": 0.6650626503017965, "learning_rate": 7.262515915787771e-07, "loss": 0.0193, "step": 2394 }, { "epoch": 0.8318860715526224, "grad_norm": 0.5901749401101161, "learning_rate": 7.233341443242504e-07, "loss": 0.0212, "step": 2395 }, { "epoch": 0.8322334143799931, "grad_norm": 0.4629616119625876, "learning_rate": 7.204221117671229e-07, "loss": 0.0154, "step": 2396 }, { "epoch": 0.8325807572073637, "grad_norm": 0.5047026630331894, "learning_rate": 7.175154975943244e-07, "loss": 0.0177, "step": 2397 }, { "epoch": 0.8329281000347343, "grad_norm": 0.9117791956297813, "learning_rate": 7.146143054859267e-07, "loss": 0.025, "step": 2398 }, { "epoch": 0.8332754428621049, "grad_norm": 0.9994693023274762, "learning_rate": 7.117185391151371e-07, "loss": 0.0238, "step": 2399 }, { "epoch": 0.8336227856894756, "grad_norm": 0.2469147876407477, "learning_rate": 7.088282021482934e-07, "loss": 0.0069, "step": 2400 }, { "epoch": 0.8339701285168462, "grad_norm": 0.4503780485766356, "learning_rate": 7.059432982448571e-07, "loss": 0.0235, "step": 2401 }, { "epoch": 0.8343174713442167, "grad_norm": 0.39895909831018983, "learning_rate": 7.030638310574123e-07, "loss": 0.0139, "step": 2402 }, { "epoch": 0.8346648141715873, "grad_norm": 0.8121294300478942, "learning_rate": 7.001898042316602e-07, "loss": 0.0143, "step": 2403 }, { "epoch": 0.8350121569989579, "grad_norm": 0.5682479563314492, "learning_rate": 6.97321221406414e-07, "loss": 0.0254, "step": 2404 }, { "epoch": 0.8353594998263286, "grad_norm": 0.5858957903067712, "learning_rate": 6.944580862135935e-07, "loss": 0.0164, "step": 2405 }, { "epoch": 0.8357068426536992, "grad_norm": 0.8591367268668065, "learning_rate": 6.916004022782191e-07, "loss": 0.012, "step": 2406 }, { "epoch": 0.8360541854810698, "grad_norm": 0.3931982527826898, "learning_rate": 6.887481732184148e-07, "loss": 0.0185, "step": 2407 }, { "epoch": 0.8364015283084404, "grad_norm": 0.7603616933042572, "learning_rate": 6.859014026453925e-07, "loss": 0.0149, "step": 2408 }, { "epoch": 0.8367488711358111, "grad_norm": 0.40064016500672056, "learning_rate": 6.830600941634579e-07, "loss": 0.0212, "step": 2409 }, { "epoch": 0.8370962139631817, "grad_norm": 0.7197529488413131, "learning_rate": 6.802242513699963e-07, "loss": 0.0187, "step": 2410 }, { "epoch": 0.8374435567905523, "grad_norm": 1.1480554116820771, "learning_rate": 6.773938778554773e-07, "loss": 0.015, "step": 2411 }, { "epoch": 0.8377908996179229, "grad_norm": 0.45140844123391066, "learning_rate": 6.745689772034425e-07, "loss": 0.02, "step": 2412 }, { "epoch": 0.8381382424452936, "grad_norm": 0.26189833795464385, "learning_rate": 6.717495529905077e-07, "loss": 0.0118, "step": 2413 }, { "epoch": 0.8384855852726641, "grad_norm": 0.42657880248327595, "learning_rate": 6.689356087863508e-07, "loss": 0.0166, "step": 2414 }, { "epoch": 0.8388329281000347, "grad_norm": 0.6022359413113759, "learning_rate": 6.661271481537157e-07, "loss": 0.0171, "step": 2415 }, { "epoch": 0.8391802709274053, "grad_norm": 0.4458461340423978, "learning_rate": 6.633241746483993e-07, "loss": 0.0193, "step": 2416 }, { "epoch": 0.8395276137547759, "grad_norm": 0.4109499161581392, "learning_rate": 6.605266918192543e-07, "loss": 0.0158, "step": 2417 }, { "epoch": 0.8398749565821466, "grad_norm": 0.8284740415704146, "learning_rate": 6.577347032081816e-07, "loss": 0.0196, "step": 2418 }, { "epoch": 0.8402222994095172, "grad_norm": 0.5823268709121688, "learning_rate": 6.549482123501249e-07, "loss": 0.0259, "step": 2419 }, { "epoch": 0.8405696422368878, "grad_norm": 0.4351391472723503, "learning_rate": 6.521672227730658e-07, "loss": 0.0119, "step": 2420 }, { "epoch": 0.8409169850642584, "grad_norm": 0.7478210142980828, "learning_rate": 6.49391737998023e-07, "loss": 0.0179, "step": 2421 }, { "epoch": 0.8412643278916291, "grad_norm": 0.714684958708105, "learning_rate": 6.466217615390468e-07, "loss": 0.0152, "step": 2422 }, { "epoch": 0.8416116707189997, "grad_norm": 0.47658348097908554, "learning_rate": 6.438572969032075e-07, "loss": 0.0247, "step": 2423 }, { "epoch": 0.8419590135463703, "grad_norm": 0.5726724916032655, "learning_rate": 6.410983475906024e-07, "loss": 0.0213, "step": 2424 }, { "epoch": 0.8423063563737408, "grad_norm": 0.4699141551972462, "learning_rate": 6.383449170943457e-07, "loss": 0.0073, "step": 2425 }, { "epoch": 0.8426536992011116, "grad_norm": 0.48804476100038013, "learning_rate": 6.355970089005615e-07, "loss": 0.0189, "step": 2426 }, { "epoch": 0.8430010420284821, "grad_norm": 0.5447605632942257, "learning_rate": 6.328546264883822e-07, "loss": 0.0241, "step": 2427 }, { "epoch": 0.8433483848558527, "grad_norm": 1.1943754155338666, "learning_rate": 6.301177733299457e-07, "loss": 0.0231, "step": 2428 }, { "epoch": 0.8436957276832233, "grad_norm": 0.4781400798615543, "learning_rate": 6.273864528903906e-07, "loss": 0.0119, "step": 2429 }, { "epoch": 0.8440430705105939, "grad_norm": 0.6541732578812876, "learning_rate": 6.246606686278467e-07, "loss": 0.0275, "step": 2430 }, { "epoch": 0.8443904133379646, "grad_norm": 0.40455279688964085, "learning_rate": 6.219404239934357e-07, "loss": 0.0115, "step": 2431 }, { "epoch": 0.8447377561653352, "grad_norm": 0.6875625753644061, "learning_rate": 6.19225722431267e-07, "loss": 0.0187, "step": 2432 }, { "epoch": 0.8450850989927058, "grad_norm": 0.7921616555433921, "learning_rate": 6.165165673784318e-07, "loss": 0.0171, "step": 2433 }, { "epoch": 0.8454324418200764, "grad_norm": 0.8867189304336157, "learning_rate": 6.13812962264998e-07, "loss": 0.0236, "step": 2434 }, { "epoch": 0.8457797846474471, "grad_norm": 1.115479157352609, "learning_rate": 6.111149105140052e-07, "loss": 0.0177, "step": 2435 }, { "epoch": 0.8461271274748177, "grad_norm": 0.5543931361704746, "learning_rate": 6.084224155414647e-07, "loss": 0.0105, "step": 2436 }, { "epoch": 0.8464744703021883, "grad_norm": 0.48262329533304676, "learning_rate": 6.057354807563526e-07, "loss": 0.0109, "step": 2437 }, { "epoch": 0.8468218131295588, "grad_norm": 0.41747137999997436, "learning_rate": 6.030541095606018e-07, "loss": 0.011, "step": 2438 }, { "epoch": 0.8471691559569295, "grad_norm": 0.49664059634379965, "learning_rate": 6.003783053491025e-07, "loss": 0.013, "step": 2439 }, { "epoch": 0.8475164987843001, "grad_norm": 0.4877777397413885, "learning_rate": 5.977080715096995e-07, "loss": 0.0232, "step": 2440 }, { "epoch": 0.8478638416116707, "grad_norm": 1.2489307888455528, "learning_rate": 5.950434114231801e-07, "loss": 0.0314, "step": 2441 }, { "epoch": 0.8482111844390413, "grad_norm": 0.4900829237700161, "learning_rate": 5.923843284632796e-07, "loss": 0.0184, "step": 2442 }, { "epoch": 0.8485585272664119, "grad_norm": 0.4575837010051548, "learning_rate": 5.897308259966672e-07, "loss": 0.0176, "step": 2443 }, { "epoch": 0.8489058700937826, "grad_norm": 0.38584962250720306, "learning_rate": 5.870829073829515e-07, "loss": 0.0179, "step": 2444 }, { "epoch": 0.8492532129211532, "grad_norm": 0.4177353453219893, "learning_rate": 5.844405759746663e-07, "loss": 0.013, "step": 2445 }, { "epoch": 0.8496005557485238, "grad_norm": 0.4869716570678539, "learning_rate": 5.818038351172767e-07, "loss": 0.0191, "step": 2446 }, { "epoch": 0.8499478985758944, "grad_norm": 0.6783571708436903, "learning_rate": 5.791726881491644e-07, "loss": 0.0213, "step": 2447 }, { "epoch": 0.8502952414032651, "grad_norm": 0.6897537136000237, "learning_rate": 5.765471384016341e-07, "loss": 0.0162, "step": 2448 }, { "epoch": 0.8506425842306357, "grad_norm": 0.44891761410994796, "learning_rate": 5.739271891988974e-07, "loss": 0.0214, "step": 2449 }, { "epoch": 0.8509899270580062, "grad_norm": 1.0985759896890122, "learning_rate": 5.713128438580823e-07, "loss": 0.0229, "step": 2450 }, { "epoch": 0.8513372698853768, "grad_norm": 0.4352898494688454, "learning_rate": 5.687041056892145e-07, "loss": 0.0143, "step": 2451 }, { "epoch": 0.8516846127127475, "grad_norm": 0.5789110156147614, "learning_rate": 5.66100977995227e-07, "loss": 0.0201, "step": 2452 }, { "epoch": 0.8520319555401181, "grad_norm": 0.6518621410478981, "learning_rate": 5.635034640719433e-07, "loss": 0.0216, "step": 2453 }, { "epoch": 0.8523792983674887, "grad_norm": 0.9846895298135566, "learning_rate": 5.609115672080845e-07, "loss": 0.018, "step": 2454 }, { "epoch": 0.8527266411948593, "grad_norm": 0.8407928319843598, "learning_rate": 5.583252906852594e-07, "loss": 0.0144, "step": 2455 }, { "epoch": 0.8530739840222299, "grad_norm": 0.5601226706008987, "learning_rate": 5.557446377779546e-07, "loss": 0.0215, "step": 2456 }, { "epoch": 0.8534213268496006, "grad_norm": 0.5001353143572673, "learning_rate": 5.53169611753544e-07, "loss": 0.0183, "step": 2457 }, { "epoch": 0.8537686696769712, "grad_norm": 0.5236537168132929, "learning_rate": 5.506002158722751e-07, "loss": 0.0218, "step": 2458 }, { "epoch": 0.8541160125043418, "grad_norm": 0.36795187111448785, "learning_rate": 5.48036453387265e-07, "loss": 0.0129, "step": 2459 }, { "epoch": 0.8544633553317124, "grad_norm": 0.7797517640305994, "learning_rate": 5.454783275445003e-07, "loss": 0.0167, "step": 2460 }, { "epoch": 0.8548106981590831, "grad_norm": 0.40671604163293146, "learning_rate": 5.429258415828298e-07, "loss": 0.0179, "step": 2461 }, { "epoch": 0.8551580409864536, "grad_norm": 0.8392548489754016, "learning_rate": 5.403789987339647e-07, "loss": 0.0164, "step": 2462 }, { "epoch": 0.8555053838138242, "grad_norm": 1.8042310161892017, "learning_rate": 5.378378022224679e-07, "loss": 0.0182, "step": 2463 }, { "epoch": 0.8558527266411948, "grad_norm": 0.9597712322684139, "learning_rate": 5.353022552657533e-07, "loss": 0.0242, "step": 2464 }, { "epoch": 0.8562000694685655, "grad_norm": 1.2902699361504208, "learning_rate": 5.327723610740843e-07, "loss": 0.0194, "step": 2465 }, { "epoch": 0.8565474122959361, "grad_norm": 0.9025915486754845, "learning_rate": 5.302481228505674e-07, "loss": 0.0159, "step": 2466 }, { "epoch": 0.8568947551233067, "grad_norm": 0.8935041671025581, "learning_rate": 5.277295437911462e-07, "loss": 0.0206, "step": 2467 }, { "epoch": 0.8572420979506773, "grad_norm": 0.8398011042782978, "learning_rate": 5.252166270845994e-07, "loss": 0.0175, "step": 2468 }, { "epoch": 0.8575894407780479, "grad_norm": 0.6273784672353577, "learning_rate": 5.227093759125368e-07, "loss": 0.013, "step": 2469 }, { "epoch": 0.8579367836054186, "grad_norm": 0.7599839379620519, "learning_rate": 5.20207793449397e-07, "loss": 0.0148, "step": 2470 }, { "epoch": 0.8582841264327892, "grad_norm": 0.9137884963550477, "learning_rate": 5.177118828624395e-07, "loss": 0.0177, "step": 2471 }, { "epoch": 0.8586314692601598, "grad_norm": 0.4027490941272288, "learning_rate": 5.152216473117416e-07, "loss": 0.0115, "step": 2472 }, { "epoch": 0.8589788120875304, "grad_norm": 0.4067939554578114, "learning_rate": 5.127370899501988e-07, "loss": 0.0132, "step": 2473 }, { "epoch": 0.859326154914901, "grad_norm": 0.39233639414180854, "learning_rate": 5.10258213923513e-07, "loss": 0.0134, "step": 2474 }, { "epoch": 0.8596734977422716, "grad_norm": 0.5636819438029829, "learning_rate": 5.07785022370198e-07, "loss": 0.0162, "step": 2475 }, { "epoch": 0.8600208405696422, "grad_norm": 0.45051474131353364, "learning_rate": 5.053175184215653e-07, "loss": 0.019, "step": 2476 }, { "epoch": 0.8603681833970128, "grad_norm": 0.504428208604678, "learning_rate": 5.028557052017302e-07, "loss": 0.0128, "step": 2477 }, { "epoch": 0.8607155262243835, "grad_norm": 0.5002145933355393, "learning_rate": 5.003995858275984e-07, "loss": 0.0148, "step": 2478 }, { "epoch": 0.8610628690517541, "grad_norm": 0.7066450698383614, "learning_rate": 4.979491634088712e-07, "loss": 0.0216, "step": 2479 }, { "epoch": 0.8614102118791247, "grad_norm": 0.7209554287180902, "learning_rate": 4.955044410480326e-07, "loss": 0.0102, "step": 2480 }, { "epoch": 0.8617575547064953, "grad_norm": 0.37457104151587356, "learning_rate": 4.93065421840353e-07, "loss": 0.0129, "step": 2481 }, { "epoch": 0.8621048975338659, "grad_norm": 0.35405427472628026, "learning_rate": 4.906321088738791e-07, "loss": 0.0145, "step": 2482 }, { "epoch": 0.8624522403612366, "grad_norm": 0.9145616386018184, "learning_rate": 4.882045052294371e-07, "loss": 0.0154, "step": 2483 }, { "epoch": 0.8627995831886072, "grad_norm": 0.45595222296109655, "learning_rate": 4.857826139806194e-07, "loss": 0.0159, "step": 2484 }, { "epoch": 0.8631469260159778, "grad_norm": 0.6853468761771402, "learning_rate": 4.833664381937908e-07, "loss": 0.015, "step": 2485 }, { "epoch": 0.8634942688433483, "grad_norm": 0.9539717763403811, "learning_rate": 4.809559809280756e-07, "loss": 0.0173, "step": 2486 }, { "epoch": 0.863841611670719, "grad_norm": 0.8437730803188537, "learning_rate": 4.785512452353619e-07, "loss": 0.0244, "step": 2487 }, { "epoch": 0.8641889544980896, "grad_norm": 0.7966421861571771, "learning_rate": 4.7615223416029086e-07, "loss": 0.0232, "step": 2488 }, { "epoch": 0.8645362973254602, "grad_norm": 0.5829026692062281, "learning_rate": 4.737589507402546e-07, "loss": 0.0191, "step": 2489 }, { "epoch": 0.8648836401528308, "grad_norm": 0.6971309820400614, "learning_rate": 4.7137139800539746e-07, "loss": 0.0147, "step": 2490 }, { "epoch": 0.8652309829802015, "grad_norm": 0.9730338109684425, "learning_rate": 4.689895789786059e-07, "loss": 0.027, "step": 2491 }, { "epoch": 0.8655783258075721, "grad_norm": 0.46701122458001026, "learning_rate": 4.666134966755059e-07, "loss": 0.0179, "step": 2492 }, { "epoch": 0.8659256686349427, "grad_norm": 0.7261076983049326, "learning_rate": 4.6424315410446117e-07, "loss": 0.0231, "step": 2493 }, { "epoch": 0.8662730114623133, "grad_norm": 0.4333800390573832, "learning_rate": 4.618785542665688e-07, "loss": 0.0171, "step": 2494 }, { "epoch": 0.8666203542896839, "grad_norm": 0.7645732325430994, "learning_rate": 4.5951970015565617e-07, "loss": 0.0152, "step": 2495 }, { "epoch": 0.8669676971170546, "grad_norm": 1.7725027824937925, "learning_rate": 4.571665947582726e-07, "loss": 0.0164, "step": 2496 }, { "epoch": 0.8673150399444252, "grad_norm": 0.5820223340143823, "learning_rate": 4.5481924105369e-07, "loss": 0.0147, "step": 2497 }, { "epoch": 0.8676623827717957, "grad_norm": 0.9426845874078936, "learning_rate": 4.5247764201390045e-07, "loss": 0.0181, "step": 2498 }, { "epoch": 0.8680097255991663, "grad_norm": 0.40736158785553966, "learning_rate": 4.5014180060360843e-07, "loss": 0.0161, "step": 2499 }, { "epoch": 0.868357068426537, "grad_norm": 0.35291016663434527, "learning_rate": 4.4781171978022786e-07, "loss": 0.0136, "step": 2500 }, { "epoch": 0.8687044112539076, "grad_norm": 0.7083995120920576, "learning_rate": 4.4548740249387934e-07, "loss": 0.0287, "step": 2501 }, { "epoch": 0.8690517540812782, "grad_norm": 0.6893058334285791, "learning_rate": 4.4316885168738776e-07, "loss": 0.0191, "step": 2502 }, { "epoch": 0.8693990969086488, "grad_norm": 0.4921717937501951, "learning_rate": 4.4085607029627717e-07, "loss": 0.0112, "step": 2503 }, { "epoch": 0.8697464397360195, "grad_norm": 0.5902281066197675, "learning_rate": 4.3854906124876415e-07, "loss": 0.0157, "step": 2504 }, { "epoch": 0.8700937825633901, "grad_norm": 0.5650017177846125, "learning_rate": 4.3624782746575886e-07, "loss": 0.0221, "step": 2505 }, { "epoch": 0.8704411253907607, "grad_norm": 0.5938641449620281, "learning_rate": 4.3395237186086014e-07, "loss": 0.0119, "step": 2506 }, { "epoch": 0.8707884682181313, "grad_norm": 0.44786371675122244, "learning_rate": 4.316626973403487e-07, "loss": 0.0138, "step": 2507 }, { "epoch": 0.8711358110455019, "grad_norm": 0.529485412783005, "learning_rate": 4.2937880680318846e-07, "loss": 0.0203, "step": 2508 }, { "epoch": 0.8714831538728726, "grad_norm": 0.4695178384130346, "learning_rate": 4.2710070314101845e-07, "loss": 0.0171, "step": 2509 }, { "epoch": 0.8718304967002432, "grad_norm": 0.6865782270212439, "learning_rate": 4.2482838923815163e-07, "loss": 0.0221, "step": 2510 }, { "epoch": 0.8721778395276137, "grad_norm": 1.008599802575518, "learning_rate": 4.2256186797156986e-07, "loss": 0.0163, "step": 2511 }, { "epoch": 0.8725251823549843, "grad_norm": 0.8277810535713074, "learning_rate": 4.203011422109227e-07, "loss": 0.0131, "step": 2512 }, { "epoch": 0.872872525182355, "grad_norm": 0.47209426725218306, "learning_rate": 4.180462148185188e-07, "loss": 0.0115, "step": 2513 }, { "epoch": 0.8732198680097256, "grad_norm": 1.0335741947460342, "learning_rate": 4.1579708864932956e-07, "loss": 0.0202, "step": 2514 }, { "epoch": 0.8735672108370962, "grad_norm": 0.4371396632860345, "learning_rate": 4.1355376655097704e-07, "loss": 0.0187, "step": 2515 }, { "epoch": 0.8739145536644668, "grad_norm": 0.4841174212281427, "learning_rate": 4.113162513637392e-07, "loss": 0.0146, "step": 2516 }, { "epoch": 0.8742618964918375, "grad_norm": 0.5418422212985032, "learning_rate": 4.090845459205378e-07, "loss": 0.0205, "step": 2517 }, { "epoch": 0.8746092393192081, "grad_norm": 0.6175530773760641, "learning_rate": 4.0685865304694205e-07, "loss": 0.0257, "step": 2518 }, { "epoch": 0.8749565821465787, "grad_norm": 0.6600694577668837, "learning_rate": 4.0463857556115924e-07, "loss": 0.0126, "step": 2519 }, { "epoch": 0.8753039249739493, "grad_norm": 0.45151100408746436, "learning_rate": 4.0242431627403656e-07, "loss": 0.0108, "step": 2520 }, { "epoch": 0.8756512678013199, "grad_norm": 0.7267112922014047, "learning_rate": 4.0021587798905247e-07, "loss": 0.0165, "step": 2521 }, { "epoch": 0.8759986106286906, "grad_norm": 0.7989326256489759, "learning_rate": 3.980132635023154e-07, "loss": 0.022, "step": 2522 }, { "epoch": 0.8763459534560611, "grad_norm": 1.297364986774049, "learning_rate": 3.9581647560256175e-07, "loss": 0.0218, "step": 2523 }, { "epoch": 0.8766932962834317, "grad_norm": 0.5435363177683816, "learning_rate": 3.9362551707115114e-07, "loss": 0.0247, "step": 2524 }, { "epoch": 0.8770406391108023, "grad_norm": 0.38578321310369523, "learning_rate": 3.914403906820613e-07, "loss": 0.0133, "step": 2525 }, { "epoch": 0.877387981938173, "grad_norm": 0.4932282625829498, "learning_rate": 3.892610992018847e-07, "loss": 0.016, "step": 2526 }, { "epoch": 0.8777353247655436, "grad_norm": 0.3669442115959157, "learning_rate": 3.870876453898292e-07, "loss": 0.0166, "step": 2527 }, { "epoch": 0.8780826675929142, "grad_norm": 0.9342073340109053, "learning_rate": 3.849200319977109e-07, "loss": 0.0228, "step": 2528 }, { "epoch": 0.8784300104202848, "grad_norm": 0.25770986108617316, "learning_rate": 3.8275826176994936e-07, "loss": 0.0092, "step": 2529 }, { "epoch": 0.8787773532476555, "grad_norm": 0.4339581883005602, "learning_rate": 3.8060233744356634e-07, "loss": 0.0168, "step": 2530 }, { "epoch": 0.8791246960750261, "grad_norm": 0.9289029914869522, "learning_rate": 3.784522617481845e-07, "loss": 0.0215, "step": 2531 }, { "epoch": 0.8794720389023967, "grad_norm": 0.5849896866294249, "learning_rate": 3.7630803740602073e-07, "loss": 0.0224, "step": 2532 }, { "epoch": 0.8798193817297673, "grad_norm": 0.7327664241177156, "learning_rate": 3.7416966713188174e-07, "loss": 0.0106, "step": 2533 }, { "epoch": 0.8801667245571378, "grad_norm": 0.3696886941163735, "learning_rate": 3.7203715363316294e-07, "loss": 0.0089, "step": 2534 }, { "epoch": 0.8805140673845085, "grad_norm": 0.49024463460685797, "learning_rate": 3.699104996098457e-07, "loss": 0.0151, "step": 2535 }, { "epoch": 0.8808614102118791, "grad_norm": 0.7947374546207403, "learning_rate": 3.6778970775449283e-07, "loss": 0.0142, "step": 2536 }, { "epoch": 0.8812087530392497, "grad_norm": 0.4625649202944896, "learning_rate": 3.656747807522437e-07, "loss": 0.0109, "step": 2537 }, { "epoch": 0.8815560958666203, "grad_norm": 0.5558918378768948, "learning_rate": 3.6356572128081134e-07, "loss": 0.0238, "step": 2538 }, { "epoch": 0.881903438693991, "grad_norm": 0.42918432489972863, "learning_rate": 3.614625320104831e-07, "loss": 0.0153, "step": 2539 }, { "epoch": 0.8822507815213616, "grad_norm": 1.027886704845079, "learning_rate": 3.593652156041122e-07, "loss": 0.0195, "step": 2540 }, { "epoch": 0.8825981243487322, "grad_norm": 0.33562683798972165, "learning_rate": 3.572737747171151e-07, "loss": 0.0125, "step": 2541 }, { "epoch": 0.8829454671761028, "grad_norm": 0.5951105843622697, "learning_rate": 3.5518821199747035e-07, "loss": 0.0175, "step": 2542 }, { "epoch": 0.8832928100034735, "grad_norm": 0.6577768489226572, "learning_rate": 3.531085300857151e-07, "loss": 0.0252, "step": 2543 }, { "epoch": 0.8836401528308441, "grad_norm": 1.1447374774363468, "learning_rate": 3.510347316149393e-07, "loss": 0.0253, "step": 2544 }, { "epoch": 0.8839874956582147, "grad_norm": 1.075748573924704, "learning_rate": 3.4896681921078477e-07, "loss": 0.0138, "step": 2545 }, { "epoch": 0.8843348384855853, "grad_norm": 0.5241854815133766, "learning_rate": 3.469047954914395e-07, "loss": 0.0178, "step": 2546 }, { "epoch": 0.8846821813129558, "grad_norm": 0.48865176944956945, "learning_rate": 3.4484866306763896e-07, "loss": 0.0123, "step": 2547 }, { "epoch": 0.8850295241403265, "grad_norm": 0.48077206825883056, "learning_rate": 3.4279842454265523e-07, "loss": 0.0114, "step": 2548 }, { "epoch": 0.8853768669676971, "grad_norm": 0.7919381272023467, "learning_rate": 3.407540825123024e-07, "loss": 0.015, "step": 2549 }, { "epoch": 0.8857242097950677, "grad_norm": 0.9115719414888127, "learning_rate": 3.3871563956492546e-07, "loss": 0.0215, "step": 2550 }, { "epoch": 0.8860715526224383, "grad_norm": 0.9428448234794181, "learning_rate": 3.36683098281404e-07, "loss": 0.019, "step": 2551 }, { "epoch": 0.886418895449809, "grad_norm": 0.5725716009079287, "learning_rate": 3.346564612351416e-07, "loss": 0.0221, "step": 2552 }, { "epoch": 0.8867662382771796, "grad_norm": 0.5339812873256146, "learning_rate": 3.3263573099207025e-07, "loss": 0.0156, "step": 2553 }, { "epoch": 0.8871135811045502, "grad_norm": 0.7200318917306987, "learning_rate": 3.3062091011064e-07, "loss": 0.0189, "step": 2554 }, { "epoch": 0.8874609239319208, "grad_norm": 0.8289865754088781, "learning_rate": 3.2861200114182257e-07, "loss": 0.0191, "step": 2555 }, { "epoch": 0.8878082667592915, "grad_norm": 0.41087275655115824, "learning_rate": 3.2660900662910056e-07, "loss": 0.014, "step": 2556 }, { "epoch": 0.8881556095866621, "grad_norm": 0.4168171217347719, "learning_rate": 3.2461192910847263e-07, "loss": 0.0127, "step": 2557 }, { "epoch": 0.8885029524140327, "grad_norm": 0.3718290081453453, "learning_rate": 3.2262077110844224e-07, "loss": 0.0094, "step": 2558 }, { "epoch": 0.8888502952414032, "grad_norm": 0.32435438005787876, "learning_rate": 3.206355351500184e-07, "loss": 0.0123, "step": 2559 }, { "epoch": 0.8891976380687738, "grad_norm": 0.4727940603046291, "learning_rate": 3.186562237467156e-07, "loss": 0.014, "step": 2560 }, { "epoch": 0.8895449808961445, "grad_norm": 0.3199322989762899, "learning_rate": 3.16682839404544e-07, "loss": 0.0156, "step": 2561 }, { "epoch": 0.8898923237235151, "grad_norm": 0.5052862572229199, "learning_rate": 3.147153846220108e-07, "loss": 0.0162, "step": 2562 }, { "epoch": 0.8902396665508857, "grad_norm": 0.4935045434697062, "learning_rate": 3.127538618901144e-07, "loss": 0.0201, "step": 2563 }, { "epoch": 0.8905870093782563, "grad_norm": 0.45484641731771275, "learning_rate": 3.107982736923448e-07, "loss": 0.012, "step": 2564 }, { "epoch": 0.890934352205627, "grad_norm": 0.4589036891401816, "learning_rate": 3.0884862250467715e-07, "loss": 0.0185, "step": 2565 }, { "epoch": 0.8912816950329976, "grad_norm": 0.4845147421338059, "learning_rate": 3.069049107955696e-07, "loss": 0.0122, "step": 2566 }, { "epoch": 0.8916290378603682, "grad_norm": 0.4953532759104348, "learning_rate": 3.0496714102595914e-07, "loss": 0.0105, "step": 2567 }, { "epoch": 0.8919763806877388, "grad_norm": 1.1581053470310272, "learning_rate": 3.030353156492627e-07, "loss": 0.0217, "step": 2568 }, { "epoch": 0.8923237235151095, "grad_norm": 0.5167371489599756, "learning_rate": 3.0110943711136874e-07, "loss": 0.0119, "step": 2569 }, { "epoch": 0.8926710663424801, "grad_norm": 0.6648490939529547, "learning_rate": 2.9918950785063684e-07, "loss": 0.015, "step": 2570 }, { "epoch": 0.8930184091698506, "grad_norm": 0.7019645201421016, "learning_rate": 2.9727553029789303e-07, "loss": 0.019, "step": 2571 }, { "epoch": 0.8933657519972212, "grad_norm": 0.3570823917219566, "learning_rate": 2.953675068764311e-07, "loss": 0.0123, "step": 2572 }, { "epoch": 0.8937130948245918, "grad_norm": 0.44958846952146136, "learning_rate": 2.9346544000200373e-07, "loss": 0.0175, "step": 2573 }, { "epoch": 0.8940604376519625, "grad_norm": 0.8033961501217101, "learning_rate": 2.915693320828222e-07, "loss": 0.0245, "step": 2574 }, { "epoch": 0.8944077804793331, "grad_norm": 0.6221815609588339, "learning_rate": 2.89679185519553e-07, "loss": 0.0164, "step": 2575 }, { "epoch": 0.8947551233067037, "grad_norm": 0.8670700629752854, "learning_rate": 2.877950027053167e-07, "loss": 0.0251, "step": 2576 }, { "epoch": 0.8951024661340743, "grad_norm": 0.33526691551846566, "learning_rate": 2.859167860256801e-07, "loss": 0.0145, "step": 2577 }, { "epoch": 0.895449808961445, "grad_norm": 0.3738576749343047, "learning_rate": 2.8404453785866037e-07, "loss": 0.0126, "step": 2578 }, { "epoch": 0.8957971517888156, "grad_norm": 0.6834813763355805, "learning_rate": 2.8217826057471423e-07, "loss": 0.0273, "step": 2579 }, { "epoch": 0.8961444946161862, "grad_norm": 0.47554504790974056, "learning_rate": 2.8031795653674033e-07, "loss": 0.0161, "step": 2580 }, { "epoch": 0.8964918374435568, "grad_norm": 0.3690282253450739, "learning_rate": 2.7846362810007355e-07, "loss": 0.0151, "step": 2581 }, { "epoch": 0.8968391802709275, "grad_norm": 0.7307835358362869, "learning_rate": 2.766152776124853e-07, "loss": 0.0207, "step": 2582 }, { "epoch": 0.897186523098298, "grad_norm": 0.755714402406654, "learning_rate": 2.7477290741417526e-07, "loss": 0.0172, "step": 2583 }, { "epoch": 0.8975338659256686, "grad_norm": 0.43973806244842545, "learning_rate": 2.729365198377748e-07, "loss": 0.0147, "step": 2584 }, { "epoch": 0.8978812087530392, "grad_norm": 0.9009144598568006, "learning_rate": 2.711061172083368e-07, "loss": 0.0192, "step": 2585 }, { "epoch": 0.8982285515804098, "grad_norm": 0.8203741129744253, "learning_rate": 2.692817018433397e-07, "loss": 0.0147, "step": 2586 }, { "epoch": 0.8985758944077805, "grad_norm": 0.8664237660996313, "learning_rate": 2.6746327605268017e-07, "loss": 0.018, "step": 2587 }, { "epoch": 0.8989232372351511, "grad_norm": 0.49174214473254213, "learning_rate": 2.656508421386722e-07, "loss": 0.019, "step": 2588 }, { "epoch": 0.8992705800625217, "grad_norm": 0.41984596348114944, "learning_rate": 2.638444023960418e-07, "loss": 0.0147, "step": 2589 }, { "epoch": 0.8996179228898923, "grad_norm": 0.6425656015149634, "learning_rate": 2.6204395911192836e-07, "loss": 0.0142, "step": 2590 }, { "epoch": 0.899965265717263, "grad_norm": 0.6500871093127806, "learning_rate": 2.6024951456587677e-07, "loss": 0.0146, "step": 2591 }, { "epoch": 0.9003126085446336, "grad_norm": 0.7086875286298896, "learning_rate": 2.5846107102983744e-07, "loss": 0.0243, "step": 2592 }, { "epoch": 0.9006599513720042, "grad_norm": 0.38145659806209425, "learning_rate": 2.566786307681635e-07, "loss": 0.0131, "step": 2593 }, { "epoch": 0.9010072941993748, "grad_norm": 0.6205010960638877, "learning_rate": 2.549021960376075e-07, "loss": 0.015, "step": 2594 }, { "epoch": 0.9013546370267453, "grad_norm": 0.5706136061121067, "learning_rate": 2.531317690873181e-07, "loss": 0.0197, "step": 2595 }, { "epoch": 0.901701979854116, "grad_norm": 0.3901107728118566, "learning_rate": 2.5136735215883613e-07, "loss": 0.0095, "step": 2596 }, { "epoch": 0.9020493226814866, "grad_norm": 0.5881915928887412, "learning_rate": 2.4960894748609465e-07, "loss": 0.0202, "step": 2597 }, { "epoch": 0.9023966655088572, "grad_norm": 0.42911566159079534, "learning_rate": 2.4785655729541555e-07, "loss": 0.0159, "step": 2598 }, { "epoch": 0.9027440083362278, "grad_norm": 0.27083599132726704, "learning_rate": 2.46110183805503e-07, "loss": 0.0074, "step": 2599 }, { "epoch": 0.9030913511635985, "grad_norm": 0.6530377955588946, "learning_rate": 2.4436982922744547e-07, "loss": 0.0168, "step": 2600 }, { "epoch": 0.9034386939909691, "grad_norm": 0.8639097600309369, "learning_rate": 2.426354957647098e-07, "loss": 0.0139, "step": 2601 }, { "epoch": 0.9037860368183397, "grad_norm": 0.5654469170463523, "learning_rate": 2.4090718561314064e-07, "loss": 0.0218, "step": 2602 }, { "epoch": 0.9041333796457103, "grad_norm": 0.8583056813584015, "learning_rate": 2.391849009609559e-07, "loss": 0.0109, "step": 2603 }, { "epoch": 0.904480722473081, "grad_norm": 0.6257033656528606, "learning_rate": 2.374686439887436e-07, "loss": 0.0149, "step": 2604 }, { "epoch": 0.9048280653004516, "grad_norm": 0.5398376002785664, "learning_rate": 2.3575841686946155e-07, "loss": 0.0164, "step": 2605 }, { "epoch": 0.9051754081278222, "grad_norm": 0.6221925809468902, "learning_rate": 2.3405422176843329e-07, "loss": 0.0256, "step": 2606 }, { "epoch": 0.9055227509551927, "grad_norm": 0.5713859427829016, "learning_rate": 2.3235606084334285e-07, "loss": 0.0226, "step": 2607 }, { "epoch": 0.9058700937825633, "grad_norm": 1.3040906101475984, "learning_rate": 2.3066393624423754e-07, "loss": 0.0166, "step": 2608 }, { "epoch": 0.906217436609934, "grad_norm": 0.7255451058697687, "learning_rate": 2.2897785011351982e-07, "loss": 0.0209, "step": 2609 }, { "epoch": 0.9065647794373046, "grad_norm": 0.6422064768024528, "learning_rate": 2.2729780458594751e-07, "loss": 0.0109, "step": 2610 }, { "epoch": 0.9069121222646752, "grad_norm": 0.3372897791705386, "learning_rate": 2.256238017886314e-07, "loss": 0.0187, "step": 2611 }, { "epoch": 0.9072594650920458, "grad_norm": 0.6097453676252612, "learning_rate": 2.2395584384102943e-07, "loss": 0.0207, "step": 2612 }, { "epoch": 0.9076068079194165, "grad_norm": 0.6884196263626761, "learning_rate": 2.2229393285494893e-07, "loss": 0.0144, "step": 2613 }, { "epoch": 0.9079541507467871, "grad_norm": 1.0714957544176478, "learning_rate": 2.2063807093453736e-07, "loss": 0.0182, "step": 2614 }, { "epoch": 0.9083014935741577, "grad_norm": 0.8267148959880947, "learning_rate": 2.1898826017628772e-07, "loss": 0.0239, "step": 2615 }, { "epoch": 0.9086488364015283, "grad_norm": 0.7458685484740719, "learning_rate": 2.173445026690285e-07, "loss": 0.0211, "step": 2616 }, { "epoch": 0.908996179228899, "grad_norm": 0.5686980841248143, "learning_rate": 2.1570680049392556e-07, "loss": 0.0125, "step": 2617 }, { "epoch": 0.9093435220562696, "grad_norm": 0.6449900442523265, "learning_rate": 2.1407515572447747e-07, "loss": 0.0241, "step": 2618 }, { "epoch": 0.9096908648836401, "grad_norm": 0.4341128255677446, "learning_rate": 2.1244957042651394e-07, "loss": 0.0169, "step": 2619 }, { "epoch": 0.9100382077110107, "grad_norm": 0.6600049909611397, "learning_rate": 2.108300466581925e-07, "loss": 0.0152, "step": 2620 }, { "epoch": 0.9103855505383813, "grad_norm": 0.6245210505743247, "learning_rate": 2.0921658646999687e-07, "loss": 0.025, "step": 2621 }, { "epoch": 0.910732893365752, "grad_norm": 0.5922289848107282, "learning_rate": 2.0760919190473182e-07, "loss": 0.0109, "step": 2622 }, { "epoch": 0.9110802361931226, "grad_norm": 0.39775195287639503, "learning_rate": 2.06007864997525e-07, "loss": 0.015, "step": 2623 }, { "epoch": 0.9114275790204932, "grad_norm": 0.7009054983235024, "learning_rate": 2.0441260777582018e-07, "loss": 0.0231, "step": 2624 }, { "epoch": 0.9117749218478638, "grad_norm": 0.6973007401473146, "learning_rate": 2.0282342225937503e-07, "loss": 0.021, "step": 2625 }, { "epoch": 0.9121222646752345, "grad_norm": 0.5883004994975823, "learning_rate": 2.0124031046026283e-07, "loss": 0.0159, "step": 2626 }, { "epoch": 0.9124696075026051, "grad_norm": 0.454183134142274, "learning_rate": 1.9966327438286582e-07, "loss": 0.0169, "step": 2627 }, { "epoch": 0.9128169503299757, "grad_norm": 0.384092493102016, "learning_rate": 1.9809231602387236e-07, "loss": 0.0132, "step": 2628 }, { "epoch": 0.9131642931573463, "grad_norm": 0.4491174239681359, "learning_rate": 1.9652743737227643e-07, "loss": 0.0199, "step": 2629 }, { "epoch": 0.913511635984717, "grad_norm": 0.59277378736531, "learning_rate": 1.9496864040937536e-07, "loss": 0.0196, "step": 2630 }, { "epoch": 0.9138589788120876, "grad_norm": 0.7391457567820585, "learning_rate": 1.9341592710876656e-07, "loss": 0.0164, "step": 2631 }, { "epoch": 0.9142063216394581, "grad_norm": 0.667688347659246, "learning_rate": 1.9186929943634358e-07, "loss": 0.026, "step": 2632 }, { "epoch": 0.9145536644668287, "grad_norm": 0.4364968802938354, "learning_rate": 1.9032875935029504e-07, "loss": 0.0128, "step": 2633 }, { "epoch": 0.9149010072941993, "grad_norm": 0.6408274013619847, "learning_rate": 1.887943088011035e-07, "loss": 0.0157, "step": 2634 }, { "epoch": 0.91524835012157, "grad_norm": 0.5188280320693144, "learning_rate": 1.87265949731541e-07, "loss": 0.0108, "step": 2635 }, { "epoch": 0.9155956929489406, "grad_norm": 0.8443286605185657, "learning_rate": 1.8574368407666576e-07, "loss": 0.0234, "step": 2636 }, { "epoch": 0.9159430357763112, "grad_norm": 0.4426808473145947, "learning_rate": 1.8422751376382274e-07, "loss": 0.0067, "step": 2637 }, { "epoch": 0.9162903786036818, "grad_norm": 0.47113894581099847, "learning_rate": 1.8271744071263808e-07, "loss": 0.0206, "step": 2638 }, { "epoch": 0.9166377214310525, "grad_norm": 0.8457616638367563, "learning_rate": 1.8121346683502183e-07, "loss": 0.021, "step": 2639 }, { "epoch": 0.9169850642584231, "grad_norm": 0.6588573901077968, "learning_rate": 1.7971559403515526e-07, "loss": 0.0191, "step": 2640 }, { "epoch": 0.9173324070857937, "grad_norm": 0.45622450404307274, "learning_rate": 1.782238242095008e-07, "loss": 0.014, "step": 2641 }, { "epoch": 0.9176797499131643, "grad_norm": 0.4489208891532178, "learning_rate": 1.767381592467926e-07, "loss": 0.0163, "step": 2642 }, { "epoch": 0.918027092740535, "grad_norm": 0.52794845468743, "learning_rate": 1.7525860102803438e-07, "loss": 0.0214, "step": 2643 }, { "epoch": 0.9183744355679055, "grad_norm": 0.33511808267174975, "learning_rate": 1.7378515142649767e-07, "loss": 0.0113, "step": 2644 }, { "epoch": 0.9187217783952761, "grad_norm": 0.8011767384068549, "learning_rate": 1.7231781230772127e-07, "loss": 0.0294, "step": 2645 }, { "epoch": 0.9190691212226467, "grad_norm": 0.3396441784353466, "learning_rate": 1.7085658552950746e-07, "loss": 0.0134, "step": 2646 }, { "epoch": 0.9194164640500173, "grad_norm": 0.30267544503409916, "learning_rate": 1.69401472941918e-07, "loss": 0.0108, "step": 2647 }, { "epoch": 0.919763806877388, "grad_norm": 0.7104785989934016, "learning_rate": 1.6795247638727585e-07, "loss": 0.0232, "step": 2648 }, { "epoch": 0.9201111497047586, "grad_norm": 0.27352163599926305, "learning_rate": 1.6650959770015796e-07, "loss": 0.0135, "step": 2649 }, { "epoch": 0.9204584925321292, "grad_norm": 0.494764482965323, "learning_rate": 1.6507283870739798e-07, "loss": 0.0164, "step": 2650 }, { "epoch": 0.9208058353594998, "grad_norm": 0.6063381987250709, "learning_rate": 1.6364220122807862e-07, "loss": 0.014, "step": 2651 }, { "epoch": 0.9211531781868705, "grad_norm": 0.3484396467979152, "learning_rate": 1.6221768707353536e-07, "loss": 0.0146, "step": 2652 }, { "epoch": 0.9215005210142411, "grad_norm": 0.5811657495793892, "learning_rate": 1.6079929804734716e-07, "loss": 0.0186, "step": 2653 }, { "epoch": 0.9218478638416117, "grad_norm": 0.38993789351226965, "learning_rate": 1.59387035945342e-07, "loss": 0.0104, "step": 2654 }, { "epoch": 0.9221952066689822, "grad_norm": 0.2970396986761084, "learning_rate": 1.5798090255558617e-07, "loss": 0.0094, "step": 2655 }, { "epoch": 0.922542549496353, "grad_norm": 0.7453340514328095, "learning_rate": 1.565808996583912e-07, "loss": 0.0236, "step": 2656 }, { "epoch": 0.9228898923237235, "grad_norm": 0.6111888530371641, "learning_rate": 1.55187029026303e-07, "loss": 0.0112, "step": 2657 }, { "epoch": 0.9232372351510941, "grad_norm": 0.8674798788983026, "learning_rate": 1.5379929242410385e-07, "loss": 0.0263, "step": 2658 }, { "epoch": 0.9235845779784647, "grad_norm": 0.527019561763946, "learning_rate": 1.5241769160881104e-07, "loss": 0.0142, "step": 2659 }, { "epoch": 0.9239319208058353, "grad_norm": 0.7830231346293527, "learning_rate": 1.5104222832967419e-07, "loss": 0.0209, "step": 2660 }, { "epoch": 0.924279263633206, "grad_norm": 0.5153735628692967, "learning_rate": 1.4967290432817028e-07, "loss": 0.0184, "step": 2661 }, { "epoch": 0.9246266064605766, "grad_norm": 0.2844914040623888, "learning_rate": 1.4830972133800247e-07, "loss": 0.0093, "step": 2662 }, { "epoch": 0.9249739492879472, "grad_norm": 0.43842457587592126, "learning_rate": 1.4695268108510075e-07, "loss": 0.0146, "step": 2663 }, { "epoch": 0.9253212921153178, "grad_norm": 0.7398002758095784, "learning_rate": 1.4560178528761848e-07, "loss": 0.0177, "step": 2664 }, { "epoch": 0.9256686349426885, "grad_norm": 0.35972295634537466, "learning_rate": 1.4425703565592753e-07, "loss": 0.0108, "step": 2665 }, { "epoch": 0.9260159777700591, "grad_norm": 0.7120797307812906, "learning_rate": 1.429184338926176e-07, "loss": 0.0157, "step": 2666 }, { "epoch": 0.9263633205974297, "grad_norm": 0.45611345610981685, "learning_rate": 1.415859816924975e-07, "loss": 0.0236, "step": 2667 }, { "epoch": 0.9267106634248002, "grad_norm": 0.6910514224473907, "learning_rate": 1.402596807425871e-07, "loss": 0.0259, "step": 2668 }, { "epoch": 0.9270580062521709, "grad_norm": 0.3565369326923046, "learning_rate": 1.3893953272212046e-07, "loss": 0.0147, "step": 2669 }, { "epoch": 0.9274053490795415, "grad_norm": 0.4838529238212452, "learning_rate": 1.3762553930253885e-07, "loss": 0.0196, "step": 2670 }, { "epoch": 0.9277526919069121, "grad_norm": 0.32075955939895817, "learning_rate": 1.3631770214749374e-07, "loss": 0.016, "step": 2671 }, { "epoch": 0.9281000347342827, "grad_norm": 0.5031442718892309, "learning_rate": 1.3501602291284166e-07, "loss": 0.0201, "step": 2672 }, { "epoch": 0.9284473775616533, "grad_norm": 0.3459439042987737, "learning_rate": 1.3372050324663988e-07, "loss": 0.0214, "step": 2673 }, { "epoch": 0.928794720389024, "grad_norm": 0.24515986422727065, "learning_rate": 1.3243114478915076e-07, "loss": 0.0098, "step": 2674 }, { "epoch": 0.9291420632163946, "grad_norm": 0.5721382453019332, "learning_rate": 1.3114794917283403e-07, "loss": 0.0195, "step": 2675 }, { "epoch": 0.9294894060437652, "grad_norm": 1.008562948955966, "learning_rate": 1.2987091802234675e-07, "loss": 0.0146, "step": 2676 }, { "epoch": 0.9298367488711358, "grad_norm": 0.3165525870137379, "learning_rate": 1.286000529545406e-07, "loss": 0.0125, "step": 2677 }, { "epoch": 0.9301840916985065, "grad_norm": 0.3519330637522624, "learning_rate": 1.2733535557846176e-07, "loss": 0.0132, "step": 2678 }, { "epoch": 0.930531434525877, "grad_norm": 0.609989019077514, "learning_rate": 1.2607682749534723e-07, "loss": 0.0227, "step": 2679 }, { "epoch": 0.9308787773532476, "grad_norm": 0.9719297935968579, "learning_rate": 1.2482447029862177e-07, "loss": 0.0175, "step": 2680 }, { "epoch": 0.9312261201806182, "grad_norm": 0.5004324439145527, "learning_rate": 1.2357828557389762e-07, "loss": 0.0198, "step": 2681 }, { "epoch": 0.9315734630079889, "grad_norm": 0.6042127292832795, "learning_rate": 1.223382748989732e-07, "loss": 0.0156, "step": 2682 }, { "epoch": 0.9319208058353595, "grad_norm": 0.6442994121709944, "learning_rate": 1.2110443984382936e-07, "loss": 0.0178, "step": 2683 }, { "epoch": 0.9322681486627301, "grad_norm": 0.4373568494058291, "learning_rate": 1.1987678197062758e-07, "loss": 0.0116, "step": 2684 }, { "epoch": 0.9326154914901007, "grad_norm": 0.4862790006331638, "learning_rate": 1.186553028337073e-07, "loss": 0.0133, "step": 2685 }, { "epoch": 0.9329628343174713, "grad_norm": 0.4434022690215712, "learning_rate": 1.1744000397958755e-07, "loss": 0.0176, "step": 2686 }, { "epoch": 0.933310177144842, "grad_norm": 0.8580640939743742, "learning_rate": 1.1623088694696194e-07, "loss": 0.0198, "step": 2687 }, { "epoch": 0.9336575199722126, "grad_norm": 0.4912841363664674, "learning_rate": 1.150279532666948e-07, "loss": 0.0137, "step": 2688 }, { "epoch": 0.9340048627995832, "grad_norm": 0.6768885959717444, "learning_rate": 1.1383120446182505e-07, "loss": 0.023, "step": 2689 }, { "epoch": 0.9343522056269538, "grad_norm": 0.4122917358139824, "learning_rate": 1.1264064204755898e-07, "loss": 0.0146, "step": 2690 }, { "epoch": 0.9346995484543245, "grad_norm": 0.2906332176742442, "learning_rate": 1.114562675312697e-07, "loss": 0.0091, "step": 2691 }, { "epoch": 0.935046891281695, "grad_norm": 0.788414928922905, "learning_rate": 1.1027808241249715e-07, "loss": 0.0234, "step": 2692 }, { "epoch": 0.9353942341090656, "grad_norm": 0.7000778534595465, "learning_rate": 1.0910608818294588e-07, "loss": 0.0171, "step": 2693 }, { "epoch": 0.9357415769364362, "grad_norm": 0.3365753374263883, "learning_rate": 1.079402863264789e-07, "loss": 0.011, "step": 2694 }, { "epoch": 0.9360889197638069, "grad_norm": 0.500493153669285, "learning_rate": 1.0678067831912164e-07, "loss": 0.0246, "step": 2695 }, { "epoch": 0.9364362625911775, "grad_norm": 0.49634500860470243, "learning_rate": 1.056272656290569e-07, "loss": 0.0151, "step": 2696 }, { "epoch": 0.9367836054185481, "grad_norm": 0.2959869869958872, "learning_rate": 1.0448004971662317e-07, "loss": 0.0144, "step": 2697 }, { "epoch": 0.9371309482459187, "grad_norm": 0.5721150903591803, "learning_rate": 1.0333903203431362e-07, "loss": 0.0158, "step": 2698 }, { "epoch": 0.9374782910732893, "grad_norm": 0.5174080846934402, "learning_rate": 1.0220421402677261e-07, "loss": 0.0172, "step": 2699 }, { "epoch": 0.93782563390066, "grad_norm": 0.6111422252957059, "learning_rate": 1.0107559713079751e-07, "loss": 0.0159, "step": 2700 }, { "epoch": 0.9381729767280306, "grad_norm": 0.3308102163778448, "learning_rate": 9.99531827753325e-08, "loss": 0.0116, "step": 2701 }, { "epoch": 0.9385203195554012, "grad_norm": 0.4117220886730794, "learning_rate": 9.883697238146917e-08, "loss": 0.0159, "step": 2702 }, { "epoch": 0.9388676623827718, "grad_norm": 0.6253135827117662, "learning_rate": 9.772696736244369e-08, "loss": 0.0161, "step": 2703 }, { "epoch": 0.9392150052101425, "grad_norm": 0.7918100091195154, "learning_rate": 9.662316912363634e-08, "loss": 0.0202, "step": 2704 }, { "epoch": 0.939562348037513, "grad_norm": 0.8027328036502868, "learning_rate": 9.552557906257032e-08, "loss": 0.018, "step": 2705 }, { "epoch": 0.9399096908648836, "grad_norm": 0.399436967336629, "learning_rate": 9.443419856890568e-08, "loss": 0.0154, "step": 2706 }, { "epoch": 0.9402570336922542, "grad_norm": 0.5291507460704015, "learning_rate": 9.33490290244421e-08, "loss": 0.0147, "step": 2707 }, { "epoch": 0.9406043765196249, "grad_norm": 0.9303566715405308, "learning_rate": 9.227007180311609e-08, "loss": 0.013, "step": 2708 }, { "epoch": 0.9409517193469955, "grad_norm": 0.48135730059288107, "learning_rate": 9.119732827099826e-08, "loss": 0.0178, "step": 2709 }, { "epoch": 0.9412990621743661, "grad_norm": 0.289263017584217, "learning_rate": 9.013079978629047e-08, "loss": 0.0139, "step": 2710 }, { "epoch": 0.9416464050017367, "grad_norm": 1.2281515688429252, "learning_rate": 8.907048769932813e-08, "loss": 0.0232, "step": 2711 }, { "epoch": 0.9419937478291073, "grad_norm": 0.45867978328141407, "learning_rate": 8.801639335257573e-08, "loss": 0.018, "step": 2712 }, { "epoch": 0.942341090656478, "grad_norm": 0.8116884665077297, "learning_rate": 8.696851808062401e-08, "loss": 0.0166, "step": 2713 }, { "epoch": 0.9426884334838486, "grad_norm": 0.629444316419227, "learning_rate": 8.592686321019005e-08, "loss": 0.0187, "step": 2714 }, { "epoch": 0.9430357763112192, "grad_norm": 0.467448231376131, "learning_rate": 8.489143006011613e-08, "loss": 0.0191, "step": 2715 }, { "epoch": 0.9433831191385897, "grad_norm": 1.540746884539893, "learning_rate": 8.38622199413669e-08, "loss": 0.0276, "step": 2716 }, { "epoch": 0.9437304619659604, "grad_norm": 0.9960497105684367, "learning_rate": 8.283923415702832e-08, "loss": 0.0185, "step": 2717 }, { "epoch": 0.944077804793331, "grad_norm": 0.6697810432465229, "learning_rate": 8.182247400230381e-08, "loss": 0.0239, "step": 2718 }, { "epoch": 0.9444251476207016, "grad_norm": 0.5928459085259657, "learning_rate": 8.081194076451749e-08, "loss": 0.0155, "step": 2719 }, { "epoch": 0.9447724904480722, "grad_norm": 0.9024042874062206, "learning_rate": 7.980763572310702e-08, "loss": 0.0175, "step": 2720 }, { "epoch": 0.9451198332754429, "grad_norm": 0.9579838145780024, "learning_rate": 7.880956014962694e-08, "loss": 0.0271, "step": 2721 }, { "epoch": 0.9454671761028135, "grad_norm": 0.49204147819791966, "learning_rate": 7.781771530774085e-08, "loss": 0.0233, "step": 2722 }, { "epoch": 0.9458145189301841, "grad_norm": 0.6787970584155775, "learning_rate": 7.683210245322869e-08, "loss": 0.025, "step": 2723 }, { "epoch": 0.9461618617575547, "grad_norm": 0.32439298949722206, "learning_rate": 7.585272283397504e-08, "loss": 0.0123, "step": 2724 }, { "epoch": 0.9465092045849253, "grad_norm": 0.8141650539318078, "learning_rate": 7.487957768997633e-08, "loss": 0.0142, "step": 2725 }, { "epoch": 0.946856547412296, "grad_norm": 0.8468827577767781, "learning_rate": 7.391266825333365e-08, "loss": 0.0111, "step": 2726 }, { "epoch": 0.9472038902396666, "grad_norm": 0.5687781476564356, "learning_rate": 7.295199574825384e-08, "loss": 0.0198, "step": 2727 }, { "epoch": 0.9475512330670371, "grad_norm": 0.4074296259542572, "learning_rate": 7.199756139104563e-08, "loss": 0.0144, "step": 2728 }, { "epoch": 0.9478985758944077, "grad_norm": 0.7996569231526766, "learning_rate": 7.104936639012239e-08, "loss": 0.0304, "step": 2729 }, { "epoch": 0.9482459187217784, "grad_norm": 0.45337039410172214, "learning_rate": 7.01074119459949e-08, "loss": 0.0146, "step": 2730 }, { "epoch": 0.948593261549149, "grad_norm": 0.7844203973946874, "learning_rate": 6.917169925127476e-08, "loss": 0.0272, "step": 2731 }, { "epoch": 0.9489406043765196, "grad_norm": 0.4489625006480948, "learning_rate": 6.824222949066983e-08, "loss": 0.0195, "step": 2732 }, { "epoch": 0.9492879472038902, "grad_norm": 1.1267855989046178, "learning_rate": 6.731900384098433e-08, "loss": 0.0137, "step": 2733 }, { "epoch": 0.9496352900312609, "grad_norm": 1.0751845620346967, "learning_rate": 6.640202347111657e-08, "loss": 0.019, "step": 2734 }, { "epoch": 0.9499826328586315, "grad_norm": 0.5627865416818346, "learning_rate": 6.54912895420573e-08, "loss": 0.0185, "step": 2735 }, { "epoch": 0.9503299756860021, "grad_norm": 0.49730875779627454, "learning_rate": 6.458680320688914e-08, "loss": 0.0198, "step": 2736 }, { "epoch": 0.9506773185133727, "grad_norm": 0.5114294099131146, "learning_rate": 6.368856561078496e-08, "loss": 0.0115, "step": 2737 }, { "epoch": 0.9510246613407433, "grad_norm": 0.8871643487076939, "learning_rate": 6.279657789100612e-08, "loss": 0.016, "step": 2738 }, { "epoch": 0.951372004168114, "grad_norm": 0.6824181407432921, "learning_rate": 6.191084117689871e-08, "loss": 0.0225, "step": 2739 }, { "epoch": 0.9517193469954845, "grad_norm": 0.3788527139900645, "learning_rate": 6.103135658989789e-08, "loss": 0.0135, "step": 2740 }, { "epoch": 0.9520666898228551, "grad_norm": 2.27805723002845, "learning_rate": 6.015812524352072e-08, "loss": 0.0262, "step": 2741 }, { "epoch": 0.9524140326502257, "grad_norm": 0.5250351000200569, "learning_rate": 5.9291148243367235e-08, "loss": 0.017, "step": 2742 }, { "epoch": 0.9527613754775964, "grad_norm": 0.6854351694783098, "learning_rate": 5.8430426687119954e-08, "loss": 0.0209, "step": 2743 }, { "epoch": 0.953108718304967, "grad_norm": 0.8186234378831619, "learning_rate": 5.7575961664539384e-08, "loss": 0.0262, "step": 2744 }, { "epoch": 0.9534560611323376, "grad_norm": 0.5783055322077533, "learning_rate": 5.672775425746735e-08, "loss": 0.0164, "step": 2745 }, { "epoch": 0.9538034039597082, "grad_norm": 1.1613038468708938, "learning_rate": 5.588580553982092e-08, "loss": 0.0306, "step": 2746 }, { "epoch": 0.9541507467870789, "grad_norm": 0.8318319702699184, "learning_rate": 5.505011657759296e-08, "loss": 0.0129, "step": 2747 }, { "epoch": 0.9544980896144495, "grad_norm": 0.7343407926498003, "learning_rate": 5.4220688428850974e-08, "loss": 0.0125, "step": 2748 }, { "epoch": 0.9548454324418201, "grad_norm": 0.5822397422630301, "learning_rate": 5.3397522143737725e-08, "loss": 0.0187, "step": 2749 }, { "epoch": 0.9551927752691907, "grad_norm": 0.36611892790873857, "learning_rate": 5.258061876446507e-08, "loss": 0.0129, "step": 2750 }, { "epoch": 0.9555401180965613, "grad_norm": 0.506143736687749, "learning_rate": 5.176997932531569e-08, "loss": 0.0111, "step": 2751 }, { "epoch": 0.955887460923932, "grad_norm": 0.37104961585297513, "learning_rate": 5.096560485264301e-08, "loss": 0.0178, "step": 2752 }, { "epoch": 0.9562348037513025, "grad_norm": 0.39087699228743555, "learning_rate": 5.016749636486851e-08, "loss": 0.0219, "step": 2753 }, { "epoch": 0.9565821465786731, "grad_norm": 0.3940695373758418, "learning_rate": 4.937565487247775e-08, "loss": 0.0217, "step": 2754 }, { "epoch": 0.9569294894060437, "grad_norm": 0.8139685846877176, "learning_rate": 4.859008137802379e-08, "loss": 0.0188, "step": 2755 }, { "epoch": 0.9572768322334144, "grad_norm": 0.3884984470065128, "learning_rate": 4.781077687612379e-08, "loss": 0.0118, "step": 2756 }, { "epoch": 0.957624175060785, "grad_norm": 0.9041840556875058, "learning_rate": 4.703774235345626e-08, "loss": 0.0204, "step": 2757 }, { "epoch": 0.9579715178881556, "grad_norm": 0.46537968867455554, "learning_rate": 4.627097878876274e-08, "loss": 0.0156, "step": 2758 }, { "epoch": 0.9583188607155262, "grad_norm": 0.41094808662250093, "learning_rate": 4.551048715284445e-08, "loss": 0.0174, "step": 2759 }, { "epoch": 0.9586662035428969, "grad_norm": 0.3654417168577741, "learning_rate": 4.4756268408561174e-08, "loss": 0.0193, "step": 2760 }, { "epoch": 0.9590135463702675, "grad_norm": 0.6891593746161022, "learning_rate": 4.400832351083184e-08, "loss": 0.0185, "step": 2761 }, { "epoch": 0.9593608891976381, "grad_norm": 0.9417802736827884, "learning_rate": 4.326665340663117e-08, "loss": 0.0176, "step": 2762 }, { "epoch": 0.9597082320250087, "grad_norm": 0.6092562454977165, "learning_rate": 4.253125903498967e-08, "loss": 0.0113, "step": 2763 }, { "epoch": 0.9600555748523792, "grad_norm": 0.4780402820280536, "learning_rate": 4.180214132699201e-08, "loss": 0.0132, "step": 2764 }, { "epoch": 0.96040291767975, "grad_norm": 0.7203305496862711, "learning_rate": 4.10793012057753e-08, "loss": 0.0201, "step": 2765 }, { "epoch": 0.9607502605071205, "grad_norm": 0.78632337470775, "learning_rate": 4.03627395865297e-08, "loss": 0.0161, "step": 2766 }, { "epoch": 0.9610976033344911, "grad_norm": 0.7800907681147895, "learning_rate": 3.9652457376496146e-08, "loss": 0.0226, "step": 2767 }, { "epoch": 0.9614449461618617, "grad_norm": 0.7579603624294482, "learning_rate": 3.894845547496418e-08, "loss": 0.018, "step": 2768 }, { "epoch": 0.9617922889892324, "grad_norm": 0.4992736774720632, "learning_rate": 3.8250734773272455e-08, "loss": 0.0202, "step": 2769 }, { "epoch": 0.962139631816603, "grad_norm": 0.5447806191032949, "learning_rate": 3.755929615480658e-08, "loss": 0.0158, "step": 2770 }, { "epoch": 0.9624869746439736, "grad_norm": 0.4587220635778511, "learning_rate": 3.687414049500015e-08, "loss": 0.0174, "step": 2771 }, { "epoch": 0.9628343174713442, "grad_norm": 0.6017122263910978, "learning_rate": 3.619526866132872e-08, "loss": 0.0212, "step": 2772 }, { "epoch": 0.9631816602987149, "grad_norm": 0.45153146666560434, "learning_rate": 3.552268151331417e-08, "loss": 0.0129, "step": 2773 }, { "epoch": 0.9635290031260855, "grad_norm": 0.8050815601154685, "learning_rate": 3.485637990252089e-08, "loss": 0.0203, "step": 2774 }, { "epoch": 0.9638763459534561, "grad_norm": 0.5550194035824456, "learning_rate": 3.4196364672555715e-08, "loss": 0.0149, "step": 2775 }, { "epoch": 0.9642236887808266, "grad_norm": 0.6061372955316445, "learning_rate": 3.3542636659064095e-08, "loss": 0.017, "step": 2776 }, { "epoch": 0.9645710316081972, "grad_norm": 0.7681878631554084, "learning_rate": 3.2895196689733953e-08, "loss": 0.0143, "step": 2777 }, { "epoch": 0.9649183744355679, "grad_norm": 0.7249193630109478, "learning_rate": 3.225404558429068e-08, "loss": 0.0175, "step": 2778 }, { "epoch": 0.9652657172629385, "grad_norm": 0.31018295263385637, "learning_rate": 3.1619184154496605e-08, "loss": 0.0096, "step": 2779 }, { "epoch": 0.9656130600903091, "grad_norm": 0.5323520395260942, "learning_rate": 3.099061320415153e-08, "loss": 0.02, "step": 2780 }, { "epoch": 0.9659604029176797, "grad_norm": 0.5460021526097322, "learning_rate": 3.036833352909052e-08, "loss": 0.0138, "step": 2781 }, { "epoch": 0.9663077457450504, "grad_norm": 0.4231306425496086, "learning_rate": 2.9752345917184456e-08, "loss": 0.0128, "step": 2782 }, { "epoch": 0.966655088572421, "grad_norm": 0.5104133537482568, "learning_rate": 2.914265114833614e-08, "loss": 0.0181, "step": 2783 }, { "epoch": 0.9670024313997916, "grad_norm": 0.32680316277058136, "learning_rate": 2.8539249994480878e-08, "loss": 0.0142, "step": 2784 }, { "epoch": 0.9673497742271622, "grad_norm": 0.6787838217925692, "learning_rate": 2.7942143219587547e-08, "loss": 0.0227, "step": 2785 }, { "epoch": 0.9676971170545329, "grad_norm": 0.5393149296688037, "learning_rate": 2.7351331579654194e-08, "loss": 0.0199, "step": 2786 }, { "epoch": 0.9680444598819035, "grad_norm": 0.3709030328592483, "learning_rate": 2.6766815822709124e-08, "loss": 0.0189, "step": 2787 }, { "epoch": 0.968391802709274, "grad_norm": 2.1813300319236433, "learning_rate": 2.618859668880869e-08, "loss": 0.0233, "step": 2788 }, { "epoch": 0.9687391455366446, "grad_norm": 0.6115166749222286, "learning_rate": 2.561667491003783e-08, "loss": 0.0151, "step": 2789 }, { "epoch": 0.9690864883640152, "grad_norm": 0.7565981079039917, "learning_rate": 2.5051051210508437e-08, "loss": 0.0133, "step": 2790 }, { "epoch": 0.9694338311913859, "grad_norm": 0.6494118382951497, "learning_rate": 2.4491726306357656e-08, "loss": 0.0129, "step": 2791 }, { "epoch": 0.9697811740187565, "grad_norm": 0.8914095771286661, "learning_rate": 2.3938700905747902e-08, "loss": 0.0165, "step": 2792 }, { "epoch": 0.9701285168461271, "grad_norm": 0.3651793620369032, "learning_rate": 2.3391975708866866e-08, "loss": 0.014, "step": 2793 }, { "epoch": 0.9704758596734977, "grad_norm": 0.6009630665505777, "learning_rate": 2.2851551407924166e-08, "loss": 0.0126, "step": 2794 }, { "epoch": 0.9708232025008684, "grad_norm": 0.6517704643511893, "learning_rate": 2.231742868715303e-08, "loss": 0.0215, "step": 2795 }, { "epoch": 0.971170545328239, "grad_norm": 0.5374743831920767, "learning_rate": 2.17896082228064e-08, "loss": 0.0109, "step": 2796 }, { "epoch": 0.9715178881556096, "grad_norm": 0.43751454745868124, "learning_rate": 2.1268090683159714e-08, "loss": 0.0151, "step": 2797 }, { "epoch": 0.9718652309829802, "grad_norm": 0.9225025519084343, "learning_rate": 2.075287672850812e-08, "loss": 0.0185, "step": 2798 }, { "epoch": 0.9722125738103509, "grad_norm": 0.6499562483522786, "learning_rate": 2.0243967011164267e-08, "loss": 0.0143, "step": 2799 }, { "epoch": 0.9725599166377215, "grad_norm": 0.4078620285370667, "learning_rate": 1.9741362175461076e-08, "loss": 0.0095, "step": 2800 }, { "epoch": 0.972907259465092, "grad_norm": 0.649018054650212, "learning_rate": 1.9245062857746744e-08, "loss": 0.015, "step": 2801 }, { "epoch": 0.9732546022924626, "grad_norm": 0.8446552989012637, "learning_rate": 1.8755069686388074e-08, "loss": 0.0265, "step": 2802 }, { "epoch": 0.9736019451198332, "grad_norm": 0.8836022726540815, "learning_rate": 1.827138328176603e-08, "loss": 0.0197, "step": 2803 }, { "epoch": 0.9739492879472039, "grad_norm": 0.39841982862426034, "learning_rate": 1.7794004256277976e-08, "loss": 0.0112, "step": 2804 }, { "epoch": 0.9742966307745745, "grad_norm": 0.6590609628914503, "learning_rate": 1.732293321433376e-08, "loss": 0.022, "step": 2805 }, { "epoch": 0.9746439736019451, "grad_norm": 0.5300192127082276, "learning_rate": 1.6858170752357407e-08, "loss": 0.0164, "step": 2806 }, { "epoch": 0.9749913164293157, "grad_norm": 1.0204073151035766, "learning_rate": 1.63997174587871e-08, "loss": 0.0123, "step": 2807 }, { "epoch": 0.9753386592566864, "grad_norm": 0.571323310061111, "learning_rate": 1.594757391407076e-08, "loss": 0.0157, "step": 2808 }, { "epoch": 0.975686002084057, "grad_norm": 0.3778707394308839, "learning_rate": 1.5501740690668788e-08, "loss": 0.0235, "step": 2809 }, { "epoch": 0.9760333449114276, "grad_norm": 0.44292675404107196, "learning_rate": 1.506221835305133e-08, "loss": 0.0173, "step": 2810 }, { "epoch": 0.9763806877387982, "grad_norm": 0.2763995858299458, "learning_rate": 1.4629007457699906e-08, "loss": 0.0093, "step": 2811 }, { "epoch": 0.9767280305661689, "grad_norm": 0.8458253024017103, "learning_rate": 1.4202108553102445e-08, "loss": 0.0216, "step": 2812 }, { "epoch": 0.9770753733935394, "grad_norm": 0.47122826092790115, "learning_rate": 1.3781522179757146e-08, "loss": 0.0211, "step": 2813 }, { "epoch": 0.97742271622091, "grad_norm": 0.6437597372265773, "learning_rate": 1.3367248870170269e-08, "loss": 0.0253, "step": 2814 }, { "epoch": 0.9777700590482806, "grad_norm": 0.7953386444191836, "learning_rate": 1.295928914885336e-08, "loss": 0.0216, "step": 2815 }, { "epoch": 0.9781174018756512, "grad_norm": 0.7510223662917004, "learning_rate": 1.255764353232547e-08, "loss": 0.0162, "step": 2816 }, { "epoch": 0.9784647447030219, "grad_norm": 0.38965761008763483, "learning_rate": 1.2162312529111487e-08, "loss": 0.0089, "step": 2817 }, { "epoch": 0.9788120875303925, "grad_norm": 0.8708571004518411, "learning_rate": 1.1773296639741028e-08, "loss": 0.0186, "step": 2818 }, { "epoch": 0.9791594303577631, "grad_norm": 0.4280076925155896, "learning_rate": 1.139059635674733e-08, "loss": 0.0125, "step": 2819 }, { "epoch": 0.9795067731851337, "grad_norm": 0.5982212585888964, "learning_rate": 1.1014212164668914e-08, "loss": 0.0128, "step": 2820 }, { "epoch": 0.9798541160125044, "grad_norm": 0.4802236337534653, "learning_rate": 1.0644144540046253e-08, "loss": 0.0154, "step": 2821 }, { "epoch": 0.980201458839875, "grad_norm": 0.6709260979923914, "learning_rate": 1.028039395142344e-08, "loss": 0.0162, "step": 2822 }, { "epoch": 0.9805488016672456, "grad_norm": 0.3994630255273555, "learning_rate": 9.92296085934541e-09, "loss": 0.0136, "step": 2823 }, { "epoch": 0.9808961444946162, "grad_norm": 0.8512917771148851, "learning_rate": 9.571845716360162e-09, "loss": 0.0137, "step": 2824 }, { "epoch": 0.9812434873219869, "grad_norm": 0.7023478964706273, "learning_rate": 9.227048967013762e-09, "loss": 0.0199, "step": 2825 }, { "epoch": 0.9815908301493574, "grad_norm": 0.5923172701448587, "learning_rate": 8.888571047855899e-09, "loss": 0.0149, "step": 2826 }, { "epoch": 0.981938172976728, "grad_norm": 0.18336419674101587, "learning_rate": 8.55641238743321e-09, "loss": 0.0045, "step": 2827 }, { "epoch": 0.9822855158040986, "grad_norm": 0.7801052776351165, "learning_rate": 8.230573406293186e-09, "loss": 0.0185, "step": 2828 }, { "epoch": 0.9826328586314692, "grad_norm": 0.4666029152584524, "learning_rate": 7.911054516981376e-09, "loss": 0.0092, "step": 2829 }, { "epoch": 0.9829802014588399, "grad_norm": 0.5508063511757076, "learning_rate": 7.597856124040848e-09, "loss": 0.0278, "step": 2830 }, { "epoch": 0.9833275442862105, "grad_norm": 0.47041522453276524, "learning_rate": 7.290978624013289e-09, "loss": 0.0131, "step": 2831 }, { "epoch": 0.9836748871135811, "grad_norm": 0.5117867904313976, "learning_rate": 6.990422405437903e-09, "loss": 0.0169, "step": 2832 }, { "epoch": 0.9840222299409517, "grad_norm": 0.4928579747341889, "learning_rate": 6.696187848848068e-09, "loss": 0.014, "step": 2833 }, { "epoch": 0.9843695727683224, "grad_norm": 0.3862793465920075, "learning_rate": 6.408275326775792e-09, "loss": 0.0194, "step": 2834 }, { "epoch": 0.984716915595693, "grad_norm": 0.38744597996896796, "learning_rate": 6.126685203747818e-09, "loss": 0.0142, "step": 2835 }, { "epoch": 0.9850642584230636, "grad_norm": 0.36149263951244837, "learning_rate": 5.851417836286177e-09, "loss": 0.0152, "step": 2836 }, { "epoch": 0.9854116012504341, "grad_norm": 0.6059984628372787, "learning_rate": 5.582473572907643e-09, "loss": 0.0125, "step": 2837 }, { "epoch": 0.9857589440778048, "grad_norm": 1.0805004360514219, "learning_rate": 5.319852754122612e-09, "loss": 0.0188, "step": 2838 }, { "epoch": 0.9861062869051754, "grad_norm": 0.7645678869514635, "learning_rate": 5.0635557124362185e-09, "loss": 0.0189, "step": 2839 }, { "epoch": 0.986453629732546, "grad_norm": 1.4270577859101476, "learning_rate": 4.813582772347225e-09, "loss": 0.027, "step": 2840 }, { "epoch": 0.9868009725599166, "grad_norm": 0.6651264047671889, "learning_rate": 4.569934250346908e-09, "loss": 0.0171, "step": 2841 }, { "epoch": 0.9871483153872872, "grad_norm": 0.562975977952286, "learning_rate": 4.332610454919062e-09, "loss": 0.0155, "step": 2842 }, { "epoch": 0.9874956582146579, "grad_norm": 0.5118103669731423, "learning_rate": 4.101611686539442e-09, "loss": 0.013, "step": 2843 }, { "epoch": 0.9878430010420285, "grad_norm": 0.7783130801930213, "learning_rate": 3.876938237676875e-09, "loss": 0.0253, "step": 2844 }, { "epoch": 0.9881903438693991, "grad_norm": 0.34801718183552194, "learning_rate": 3.6585903927910395e-09, "loss": 0.0118, "step": 2845 }, { "epoch": 0.9885376866967697, "grad_norm": 0.535293498292763, "learning_rate": 3.446568428332464e-09, "loss": 0.014, "step": 2846 }, { "epoch": 0.9888850295241404, "grad_norm": 0.504081454212183, "learning_rate": 3.2408726127425294e-09, "loss": 0.0184, "step": 2847 }, { "epoch": 0.989232372351511, "grad_norm": 0.5720403882389867, "learning_rate": 3.0415032064534668e-09, "loss": 0.0243, "step": 2848 }, { "epoch": 0.9895797151788815, "grad_norm": 0.5342584944130937, "learning_rate": 2.848460461887248e-09, "loss": 0.0212, "step": 2849 }, { "epoch": 0.9899270580062521, "grad_norm": 0.37088583044080237, "learning_rate": 2.6617446234555866e-09, "loss": 0.0199, "step": 2850 }, { "epoch": 0.9902744008336228, "grad_norm": 0.36631835755907144, "learning_rate": 2.4813559275604914e-09, "loss": 0.0147, "step": 2851 }, { "epoch": 0.9906217436609934, "grad_norm": 0.7717204298531588, "learning_rate": 2.3072946025920474e-09, "loss": 0.0133, "step": 2852 }, { "epoch": 0.990969086488364, "grad_norm": 0.4286410759035607, "learning_rate": 2.1395608689295244e-09, "loss": 0.0172, "step": 2853 }, { "epoch": 0.9913164293157346, "grad_norm": 0.7876162638170817, "learning_rate": 1.978154938941379e-09, "loss": 0.0192, "step": 2854 }, { "epoch": 0.9916637721431052, "grad_norm": 0.9045977670509248, "learning_rate": 1.8230770169841427e-09, "loss": 0.0145, "step": 2855 }, { "epoch": 0.9920111149704759, "grad_norm": 0.34387118239672126, "learning_rate": 1.674327299402423e-09, "loss": 0.0089, "step": 2856 }, { "epoch": 0.9923584577978465, "grad_norm": 0.37959493138909545, "learning_rate": 1.531905974528347e-09, "loss": 0.0163, "step": 2857 }, { "epoch": 0.9927058006252171, "grad_norm": 0.7818912632587496, "learning_rate": 1.3958132226821187e-09, "loss": 0.02, "step": 2858 }, { "epoch": 0.9930531434525877, "grad_norm": 0.8616967656128703, "learning_rate": 1.266049216170906e-09, "loss": 0.0235, "step": 2859 }, { "epoch": 0.9934004862799584, "grad_norm": 0.8090514774003552, "learning_rate": 1.142614119289398e-09, "loss": 0.0208, "step": 2860 }, { "epoch": 0.993747829107329, "grad_norm": 0.40981264825919256, "learning_rate": 1.025508088318694e-09, "loss": 0.0119, "step": 2861 }, { "epoch": 0.9940951719346995, "grad_norm": 0.9853414305979534, "learning_rate": 9.147312715279688e-10, "loss": 0.0194, "step": 2862 }, { "epoch": 0.9944425147620701, "grad_norm": 0.6823065427028371, "learning_rate": 8.102838091705867e-10, "loss": 0.0171, "step": 2863 }, { "epoch": 0.9947898575894408, "grad_norm": 0.49217144790501227, "learning_rate": 7.121658334890979e-10, "loss": 0.0127, "step": 2864 }, { "epoch": 0.9951372004168114, "grad_norm": 0.7961270955956514, "learning_rate": 6.203774687102426e-10, "loss": 0.0213, "step": 2865 }, { "epoch": 0.995484543244182, "grad_norm": 0.5901951990218518, "learning_rate": 5.34918831047726e-10, "loss": 0.0181, "step": 2866 }, { "epoch": 0.9958318860715526, "grad_norm": 1.0523471488146157, "learning_rate": 4.5579002870110854e-10, "loss": 0.0231, "step": 2867 }, { "epoch": 0.9961792288989232, "grad_norm": 0.5550763297657431, "learning_rate": 3.8299116185525066e-10, "loss": 0.011, "step": 2868 }, { "epoch": 0.9965265717262939, "grad_norm": 0.5408286201029091, "learning_rate": 3.165223226808678e-10, "loss": 0.0161, "step": 2869 }, { "epoch": 0.9968739145536645, "grad_norm": 0.4991725997802592, "learning_rate": 2.563835953345306e-10, "loss": 0.0175, "step": 2870 }, { "epoch": 0.9972212573810351, "grad_norm": 0.5996623257508237, "learning_rate": 2.0257505595810966e-10, "loss": 0.0183, "step": 2871 }, { "epoch": 0.9975686002084057, "grad_norm": 0.7177885884770104, "learning_rate": 1.5509677267877555e-10, "loss": 0.0194, "step": 2872 }, { "epoch": 0.9979159430357764, "grad_norm": 0.639091556482265, "learning_rate": 1.1394880560844368e-10, "loss": 0.0201, "step": 2873 }, { "epoch": 0.9982632858631469, "grad_norm": 0.5680423116235028, "learning_rate": 7.913120684488462e-11, "loss": 0.0189, "step": 2874 }, { "epoch": 0.9986106286905175, "grad_norm": 0.23508445434740197, "learning_rate": 5.0644020471168894e-11, "loss": 0.0098, "step": 2875 }, { "epoch": 0.9989579715178881, "grad_norm": 0.3673594266436054, "learning_rate": 2.8487282554556796e-11, "loss": 0.0109, "step": 2876 }, { "epoch": 0.9993053143452588, "grad_norm": 0.4480479527832172, "learning_rate": 1.2661021148163699e-11, "loss": 0.0206, "step": 2877 }, { "epoch": 0.9996526571726294, "grad_norm": 0.48054013159980413, "learning_rate": 3.1652562887396486e-12, "loss": 0.0155, "step": 2878 }, { "epoch": 1.0, "grad_norm": 0.7081858993617053, "learning_rate": 0.0, "loss": 0.0175, "step": 2879 }, { "epoch": 1.0, "step": 2879, "total_flos": 49962393949184.0, "train_loss": 0.021164150436296568, "train_runtime": 16194.0514, "train_samples_per_second": 11.381, "train_steps_per_second": 0.178 } ], "logging_steps": 1.0, "max_steps": 2879, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 49962393949184.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }