diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20195 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2879, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003473428273706148, + "grad_norm": 37.23415549580066, + "learning_rate": 1.1494252873563219e-07, + "loss": 0.1191, + "step": 1 + }, + { + "epoch": 0.0006946856547412296, + "grad_norm": 43.976862685963305, + "learning_rate": 2.2988505747126437e-07, + "loss": 0.1435, + "step": 2 + }, + { + "epoch": 0.0010420284821118443, + "grad_norm": 32.512151916312156, + "learning_rate": 3.4482758620689656e-07, + "loss": 0.1155, + "step": 3 + }, + { + "epoch": 0.0013893713094824591, + "grad_norm": 48.913784050446566, + "learning_rate": 4.5977011494252875e-07, + "loss": 0.1683, + "step": 4 + }, + { + "epoch": 0.0017367141368530739, + "grad_norm": 36.577121907693055, + "learning_rate": 5.747126436781609e-07, + "loss": 0.1352, + "step": 5 + }, + { + "epoch": 0.0020840569642236887, + "grad_norm": 26.561764879370827, + "learning_rate": 6.896551724137931e-07, + "loss": 0.0911, + "step": 6 + }, + { + "epoch": 0.0024313997915943034, + "grad_norm": 26.311858988645387, + "learning_rate": 8.045977011494253e-07, + "loss": 0.095, + "step": 7 + }, + { + "epoch": 0.0027787426189649182, + "grad_norm": 4.621058125611507, + "learning_rate": 9.195402298850575e-07, + "loss": 0.0487, + "step": 8 + }, + { + "epoch": 0.003126085446335533, + "grad_norm": 5.328829287414011, + "learning_rate": 1.0344827586206898e-06, + "loss": 0.0529, + "step": 9 + }, + { + "epoch": 0.0034734282737061478, + "grad_norm": 3.148318981902749, + "learning_rate": 1.1494252873563219e-06, + "loss": 0.0402, + "step": 10 + }, + { + "epoch": 0.0038207711010767626, + "grad_norm": 3.908627374831155, + "learning_rate": 1.2643678160919542e-06, + "loss": 0.0407, + "step": 11 + }, + { + "epoch": 0.004168113928447377, + "grad_norm": 4.739284231854802, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.0446, + "step": 12 + }, + { + "epoch": 0.0045154567558179926, + "grad_norm": 2.3496827004932945, + "learning_rate": 1.4942528735632185e-06, + "loss": 0.0384, + "step": 13 + }, + { + "epoch": 0.004862799583188607, + "grad_norm": 5.475271188171554, + "learning_rate": 1.6091954022988506e-06, + "loss": 0.0485, + "step": 14 + }, + { + "epoch": 0.005210142410559222, + "grad_norm": 6.547877477275296, + "learning_rate": 1.724137931034483e-06, + "loss": 0.0568, + "step": 15 + }, + { + "epoch": 0.0055574852379298365, + "grad_norm": 3.8463210912421704, + "learning_rate": 1.839080459770115e-06, + "loss": 0.0449, + "step": 16 + }, + { + "epoch": 0.005904828065300452, + "grad_norm": 5.599877467384883, + "learning_rate": 1.9540229885057475e-06, + "loss": 0.0491, + "step": 17 + }, + { + "epoch": 0.006252170892671066, + "grad_norm": 4.194689420093038, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.0473, + "step": 18 + }, + { + "epoch": 0.006599513720041681, + "grad_norm": 1.8031692743490404, + "learning_rate": 2.1839080459770117e-06, + "loss": 0.042, + "step": 19 + }, + { + "epoch": 0.0069468565474122956, + "grad_norm": 4.4715637058132, + "learning_rate": 2.2988505747126437e-06, + "loss": 0.0404, + "step": 20 + }, + { + "epoch": 0.007294199374782911, + "grad_norm": 0.8236040836386482, + "learning_rate": 2.4137931034482762e-06, + "loss": 0.0382, + "step": 21 + }, + { + "epoch": 0.007641542202153525, + "grad_norm": 0.82338294259807, + "learning_rate": 2.5287356321839083e-06, + "loss": 0.0422, + "step": 22 + }, + { + "epoch": 0.00798888502952414, + "grad_norm": 1.8027969557144028, + "learning_rate": 2.6436781609195404e-06, + "loss": 0.0363, + "step": 23 + }, + { + "epoch": 0.008336227856894755, + "grad_norm": 0.727095842359559, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.0351, + "step": 24 + }, + { + "epoch": 0.00868357068426537, + "grad_norm": 1.92696781932363, + "learning_rate": 2.8735632183908046e-06, + "loss": 0.0301, + "step": 25 + }, + { + "epoch": 0.009030913511635985, + "grad_norm": 0.9300122101440347, + "learning_rate": 2.988505747126437e-06, + "loss": 0.0326, + "step": 26 + }, + { + "epoch": 0.0093782563390066, + "grad_norm": 5.1665169890422495, + "learning_rate": 3.103448275862069e-06, + "loss": 0.0378, + "step": 27 + }, + { + "epoch": 0.009725599166377214, + "grad_norm": 5.236812002541303, + "learning_rate": 3.2183908045977012e-06, + "loss": 0.0408, + "step": 28 + }, + { + "epoch": 0.010072941993747829, + "grad_norm": 2.6321912137329666, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0315, + "step": 29 + }, + { + "epoch": 0.010420284821118444, + "grad_norm": 1.7260726850635042, + "learning_rate": 3.448275862068966e-06, + "loss": 0.03, + "step": 30 + }, + { + "epoch": 0.01076762764848906, + "grad_norm": 2.359583277352366, + "learning_rate": 3.563218390804598e-06, + "loss": 0.0293, + "step": 31 + }, + { + "epoch": 0.011114970475859673, + "grad_norm": 3.4439336912247382, + "learning_rate": 3.67816091954023e-06, + "loss": 0.0391, + "step": 32 + }, + { + "epoch": 0.011462313303230288, + "grad_norm": 2.5718166529004374, + "learning_rate": 3.793103448275862e-06, + "loss": 0.0405, + "step": 33 + }, + { + "epoch": 0.011809656130600903, + "grad_norm": 1.8743075423305569, + "learning_rate": 3.908045977011495e-06, + "loss": 0.0316, + "step": 34 + }, + { + "epoch": 0.012156998957971519, + "grad_norm": 0.9641265575376259, + "learning_rate": 4.022988505747127e-06, + "loss": 0.0282, + "step": 35 + }, + { + "epoch": 0.012504341785342132, + "grad_norm": 4.625856081652024, + "learning_rate": 4.137931034482759e-06, + "loss": 0.0336, + "step": 36 + }, + { + "epoch": 0.012851684612712747, + "grad_norm": 4.956158639154869, + "learning_rate": 4.252873563218391e-06, + "loss": 0.0535, + "step": 37 + }, + { + "epoch": 0.013199027440083362, + "grad_norm": 3.8237646578914033, + "learning_rate": 4.367816091954023e-06, + "loss": 0.0321, + "step": 38 + }, + { + "epoch": 0.013546370267453978, + "grad_norm": 1.6947960633729644, + "learning_rate": 4.482758620689656e-06, + "loss": 0.0348, + "step": 39 + }, + { + "epoch": 0.013893713094824591, + "grad_norm": 0.7680647499923681, + "learning_rate": 4.5977011494252875e-06, + "loss": 0.0318, + "step": 40 + }, + { + "epoch": 0.014241055922195206, + "grad_norm": 2.1664725717573545, + "learning_rate": 4.71264367816092e-06, + "loss": 0.0314, + "step": 41 + }, + { + "epoch": 0.014588398749565822, + "grad_norm": 4.263206749577008, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.0407, + "step": 42 + }, + { + "epoch": 0.014935741576936437, + "grad_norm": 4.1724594615524335, + "learning_rate": 4.942528735632184e-06, + "loss": 0.0424, + "step": 43 + }, + { + "epoch": 0.01528308440430705, + "grad_norm": 3.7167659760688303, + "learning_rate": 5.057471264367817e-06, + "loss": 0.0345, + "step": 44 + }, + { + "epoch": 0.015630427231677665, + "grad_norm": 2.463474287564055, + "learning_rate": 5.172413793103449e-06, + "loss": 0.0349, + "step": 45 + }, + { + "epoch": 0.01597777005904828, + "grad_norm": 0.9621922490266032, + "learning_rate": 5.287356321839081e-06, + "loss": 0.0308, + "step": 46 + }, + { + "epoch": 0.016325112886418896, + "grad_norm": 1.1678035904473527, + "learning_rate": 5.402298850574713e-06, + "loss": 0.0255, + "step": 47 + }, + { + "epoch": 0.01667245571378951, + "grad_norm": 2.1879662040379664, + "learning_rate": 5.517241379310345e-06, + "loss": 0.0316, + "step": 48 + }, + { + "epoch": 0.017019798541160126, + "grad_norm": 3.5945900982319015, + "learning_rate": 5.6321839080459775e-06, + "loss": 0.042, + "step": 49 + }, + { + "epoch": 0.01736714136853074, + "grad_norm": 1.5756413045669015, + "learning_rate": 5.747126436781609e-06, + "loss": 0.0255, + "step": 50 + }, + { + "epoch": 0.017714484195901353, + "grad_norm": 1.2748577272738042, + "learning_rate": 5.862068965517242e-06, + "loss": 0.03, + "step": 51 + }, + { + "epoch": 0.01806182702327197, + "grad_norm": 3.009209224361777, + "learning_rate": 5.977011494252874e-06, + "loss": 0.0341, + "step": 52 + }, + { + "epoch": 0.018409169850642584, + "grad_norm": 3.489277616266429, + "learning_rate": 6.091954022988507e-06, + "loss": 0.0317, + "step": 53 + }, + { + "epoch": 0.0187565126780132, + "grad_norm": 3.962377854439755, + "learning_rate": 6.206896551724138e-06, + "loss": 0.0383, + "step": 54 + }, + { + "epoch": 0.019103855505383814, + "grad_norm": 2.162400758859317, + "learning_rate": 6.321839080459771e-06, + "loss": 0.0327, + "step": 55 + }, + { + "epoch": 0.019451198332754428, + "grad_norm": 0.9492153536245894, + "learning_rate": 6.4367816091954025e-06, + "loss": 0.0266, + "step": 56 + }, + { + "epoch": 0.019798541160125045, + "grad_norm": 1.0187860208671786, + "learning_rate": 6.551724137931035e-06, + "loss": 0.0354, + "step": 57 + }, + { + "epoch": 0.020145883987495658, + "grad_norm": 1.1083211600458822, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0307, + "step": 58 + }, + { + "epoch": 0.02049322681486627, + "grad_norm": 3.937460604921549, + "learning_rate": 6.781609195402299e-06, + "loss": 0.0359, + "step": 59 + }, + { + "epoch": 0.02084056964223689, + "grad_norm": 0.6965391618701994, + "learning_rate": 6.896551724137932e-06, + "loss": 0.0296, + "step": 60 + }, + { + "epoch": 0.021187912469607502, + "grad_norm": 1.7430268237608375, + "learning_rate": 7.011494252873564e-06, + "loss": 0.033, + "step": 61 + }, + { + "epoch": 0.02153525529697812, + "grad_norm": 1.6253422710766354, + "learning_rate": 7.126436781609196e-06, + "loss": 0.026, + "step": 62 + }, + { + "epoch": 0.021882598124348732, + "grad_norm": 3.037222634408878, + "learning_rate": 7.241379310344828e-06, + "loss": 0.0349, + "step": 63 + }, + { + "epoch": 0.022229940951719346, + "grad_norm": 2.145372106566845, + "learning_rate": 7.35632183908046e-06, + "loss": 0.0294, + "step": 64 + }, + { + "epoch": 0.022577283779089963, + "grad_norm": 2.6342550049280384, + "learning_rate": 7.4712643678160925e-06, + "loss": 0.038, + "step": 65 + }, + { + "epoch": 0.022924626606460576, + "grad_norm": 1.336495145846901, + "learning_rate": 7.586206896551724e-06, + "loss": 0.037, + "step": 66 + }, + { + "epoch": 0.02327196943383119, + "grad_norm": 1.3878158762500856, + "learning_rate": 7.701149425287356e-06, + "loss": 0.036, + "step": 67 + }, + { + "epoch": 0.023619312261201807, + "grad_norm": 0.7502926647886281, + "learning_rate": 7.81609195402299e-06, + "loss": 0.027, + "step": 68 + }, + { + "epoch": 0.02396665508857242, + "grad_norm": 2.7789051553384114, + "learning_rate": 7.93103448275862e-06, + "loss": 0.0385, + "step": 69 + }, + { + "epoch": 0.024313997915943037, + "grad_norm": 1.510784689203471, + "learning_rate": 8.045977011494253e-06, + "loss": 0.0434, + "step": 70 + }, + { + "epoch": 0.02466134074331365, + "grad_norm": 1.795628251322906, + "learning_rate": 8.160919540229886e-06, + "loss": 0.0308, + "step": 71 + }, + { + "epoch": 0.025008683570684264, + "grad_norm": 0.8482257639162686, + "learning_rate": 8.275862068965518e-06, + "loss": 0.0354, + "step": 72 + }, + { + "epoch": 0.02535602639805488, + "grad_norm": 1.9996412936621009, + "learning_rate": 8.390804597701149e-06, + "loss": 0.0369, + "step": 73 + }, + { + "epoch": 0.025703369225425494, + "grad_norm": 0.7654423441044093, + "learning_rate": 8.505747126436782e-06, + "loss": 0.0353, + "step": 74 + }, + { + "epoch": 0.02605071205279611, + "grad_norm": 0.6493879425365747, + "learning_rate": 8.620689655172414e-06, + "loss": 0.0291, + "step": 75 + }, + { + "epoch": 0.026398054880166725, + "grad_norm": 2.414435804159643, + "learning_rate": 8.735632183908047e-06, + "loss": 0.0317, + "step": 76 + }, + { + "epoch": 0.02674539770753734, + "grad_norm": 1.0904156611035012, + "learning_rate": 8.85057471264368e-06, + "loss": 0.0269, + "step": 77 + }, + { + "epoch": 0.027092740534907955, + "grad_norm": 1.4189653573098011, + "learning_rate": 8.965517241379312e-06, + "loss": 0.0236, + "step": 78 + }, + { + "epoch": 0.02744008336227857, + "grad_norm": 1.040668552643942, + "learning_rate": 9.080459770114942e-06, + "loss": 0.0243, + "step": 79 + }, + { + "epoch": 0.027787426189649182, + "grad_norm": 1.3437174808420649, + "learning_rate": 9.195402298850575e-06, + "loss": 0.0332, + "step": 80 + }, + { + "epoch": 0.0281347690170198, + "grad_norm": 1.4299130307272268, + "learning_rate": 9.310344827586207e-06, + "loss": 0.0308, + "step": 81 + }, + { + "epoch": 0.028482111844390413, + "grad_norm": 1.2629695612963914, + "learning_rate": 9.42528735632184e-06, + "loss": 0.0203, + "step": 82 + }, + { + "epoch": 0.02882945467176103, + "grad_norm": 1.3219643843083144, + "learning_rate": 9.54022988505747e-06, + "loss": 0.0379, + "step": 83 + }, + { + "epoch": 0.029176797499131643, + "grad_norm": 3.6871466385779783, + "learning_rate": 9.655172413793105e-06, + "loss": 0.0274, + "step": 84 + }, + { + "epoch": 0.029524140326502257, + "grad_norm": 3.9969862428228033, + "learning_rate": 9.770114942528738e-06, + "loss": 0.0323, + "step": 85 + }, + { + "epoch": 0.029871483153872874, + "grad_norm": 3.065195993982362, + "learning_rate": 9.885057471264368e-06, + "loss": 0.0309, + "step": 86 + }, + { + "epoch": 0.030218825981243487, + "grad_norm": 1.468135218974371, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 87 + }, + { + "epoch": 0.0305661688086141, + "grad_norm": 2.1306295097585353, + "learning_rate": 9.999996834743712e-06, + "loss": 0.0366, + "step": 88 + }, + { + "epoch": 0.030913511635984717, + "grad_norm": 4.561675020890797, + "learning_rate": 9.999987338978852e-06, + "loss": 0.0399, + "step": 89 + }, + { + "epoch": 0.03126085446335533, + "grad_norm": 3.0699739972744355, + "learning_rate": 9.999971512717445e-06, + "loss": 0.0361, + "step": 90 + }, + { + "epoch": 0.03160819729072595, + "grad_norm": 1.7204154640977625, + "learning_rate": 9.99994935597953e-06, + "loss": 0.023, + "step": 91 + }, + { + "epoch": 0.03195554011809656, + "grad_norm": 0.4121394943188734, + "learning_rate": 9.999920868793156e-06, + "loss": 0.0151, + "step": 92 + }, + { + "epoch": 0.032302882945467175, + "grad_norm": 1.8037684571162973, + "learning_rate": 9.999886051194392e-06, + "loss": 0.041, + "step": 93 + }, + { + "epoch": 0.03265022577283779, + "grad_norm": 1.5191359679839431, + "learning_rate": 9.999844903227323e-06, + "loss": 0.0222, + "step": 94 + }, + { + "epoch": 0.03299756860020841, + "grad_norm": 2.797587288217291, + "learning_rate": 9.999797424944041e-06, + "loss": 0.0359, + "step": 95 + }, + { + "epoch": 0.03334491142757902, + "grad_norm": 0.9848390023335929, + "learning_rate": 9.999743616404667e-06, + "loss": 0.029, + "step": 96 + }, + { + "epoch": 0.033692254254949636, + "grad_norm": 3.474922410115143, + "learning_rate": 9.999683477677319e-06, + "loss": 0.0433, + "step": 97 + }, + { + "epoch": 0.03403959708232025, + "grad_norm": 0.4352894693833356, + "learning_rate": 9.999617008838145e-06, + "loss": 0.0271, + "step": 98 + }, + { + "epoch": 0.03438693990969086, + "grad_norm": 1.0612949281619894, + "learning_rate": 9.999544209971299e-06, + "loss": 0.0262, + "step": 99 + }, + { + "epoch": 0.03473428273706148, + "grad_norm": 1.5028120141315706, + "learning_rate": 9.999465081168954e-06, + "loss": 0.023, + "step": 100 + }, + { + "epoch": 0.035081625564432097, + "grad_norm": 2.485311227250491, + "learning_rate": 9.999379622531292e-06, + "loss": 0.0353, + "step": 101 + }, + { + "epoch": 0.035428968391802707, + "grad_norm": 1.1277504855781095, + "learning_rate": 9.99928783416651e-06, + "loss": 0.0238, + "step": 102 + }, + { + "epoch": 0.03577631121917332, + "grad_norm": 0.9476524869349918, + "learning_rate": 9.99918971619083e-06, + "loss": 0.0311, + "step": 103 + }, + { + "epoch": 0.03612365404654394, + "grad_norm": 4.351712281867559, + "learning_rate": 9.999085268728473e-06, + "loss": 0.0371, + "step": 104 + }, + { + "epoch": 0.03647099687391455, + "grad_norm": 2.7348253113812424, + "learning_rate": 9.998974491911681e-06, + "loss": 0.0374, + "step": 105 + }, + { + "epoch": 0.03681833970128517, + "grad_norm": 2.5995603090920105, + "learning_rate": 9.998857385880712e-06, + "loss": 0.0285, + "step": 106 + }, + { + "epoch": 0.037165682528655784, + "grad_norm": 0.7311687318983651, + "learning_rate": 9.99873395078383e-06, + "loss": 0.0229, + "step": 107 + }, + { + "epoch": 0.0375130253560264, + "grad_norm": 1.2353487773193577, + "learning_rate": 9.998604186777318e-06, + "loss": 0.0322, + "step": 108 + }, + { + "epoch": 0.03786036818339701, + "grad_norm": 0.9451422134075631, + "learning_rate": 9.998468094025473e-06, + "loss": 0.0352, + "step": 109 + }, + { + "epoch": 0.03820771101076763, + "grad_norm": 1.7091841651608024, + "learning_rate": 9.9983256727006e-06, + "loss": 0.0344, + "step": 110 + }, + { + "epoch": 0.038555053838138245, + "grad_norm": 2.7065412298038387, + "learning_rate": 9.998176922983017e-06, + "loss": 0.0356, + "step": 111 + }, + { + "epoch": 0.038902396665508855, + "grad_norm": 0.7915485766007472, + "learning_rate": 9.998021845061059e-06, + "loss": 0.0278, + "step": 112 + }, + { + "epoch": 0.03924973949287947, + "grad_norm": 0.6604452686175176, + "learning_rate": 9.99786043913107e-06, + "loss": 0.0263, + "step": 113 + }, + { + "epoch": 0.03959708232025009, + "grad_norm": 2.2075871866436807, + "learning_rate": 9.997692705397408e-06, + "loss": 0.0293, + "step": 114 + }, + { + "epoch": 0.0399444251476207, + "grad_norm": 1.0990740259335516, + "learning_rate": 9.99751864407244e-06, + "loss": 0.0251, + "step": 115 + }, + { + "epoch": 0.040291767974991316, + "grad_norm": 1.1686977294017693, + "learning_rate": 9.997338255376545e-06, + "loss": 0.0313, + "step": 116 + }, + { + "epoch": 0.04063911080236193, + "grad_norm": 1.3383756136679479, + "learning_rate": 9.997151539538114e-06, + "loss": 0.0226, + "step": 117 + }, + { + "epoch": 0.04098645362973254, + "grad_norm": 0.7694885404915951, + "learning_rate": 9.996958496793547e-06, + "loss": 0.0301, + "step": 118 + }, + { + "epoch": 0.04133379645710316, + "grad_norm": 1.0594137346894543, + "learning_rate": 9.996759127387259e-06, + "loss": 0.0217, + "step": 119 + }, + { + "epoch": 0.04168113928447378, + "grad_norm": 0.7335920204997873, + "learning_rate": 9.996553431571669e-06, + "loss": 0.0225, + "step": 120 + }, + { + "epoch": 0.042028482111844394, + "grad_norm": 2.255099961830129, + "learning_rate": 9.99634140960721e-06, + "loss": 0.0248, + "step": 121 + }, + { + "epoch": 0.042375824939215004, + "grad_norm": 2.42490870668873, + "learning_rate": 9.996123061762324e-06, + "loss": 0.0424, + "step": 122 + }, + { + "epoch": 0.04272316776658562, + "grad_norm": 1.1936924708308256, + "learning_rate": 9.99589838831346e-06, + "loss": 0.036, + "step": 123 + }, + { + "epoch": 0.04307051059395624, + "grad_norm": 0.9868005673494291, + "learning_rate": 9.995667389545082e-06, + "loss": 0.0418, + "step": 124 + }, + { + "epoch": 0.04341785342132685, + "grad_norm": 2.506803561409087, + "learning_rate": 9.995430065749653e-06, + "loss": 0.0287, + "step": 125 + }, + { + "epoch": 0.043765196248697465, + "grad_norm": 0.7525519955237748, + "learning_rate": 9.995186417227654e-06, + "loss": 0.0295, + "step": 126 + }, + { + "epoch": 0.04411253907606808, + "grad_norm": 0.3504040791938595, + "learning_rate": 9.994936444287565e-06, + "loss": 0.0317, + "step": 127 + }, + { + "epoch": 0.04445988190343869, + "grad_norm": 1.0108416374201756, + "learning_rate": 9.99468014724588e-06, + "loss": 0.0281, + "step": 128 + }, + { + "epoch": 0.04480722473080931, + "grad_norm": 2.211821690034367, + "learning_rate": 9.994417526427094e-06, + "loss": 0.0351, + "step": 129 + }, + { + "epoch": 0.045154567558179926, + "grad_norm": 0.4010376448499792, + "learning_rate": 9.994148582163715e-06, + "loss": 0.0278, + "step": 130 + }, + { + "epoch": 0.045501910385550536, + "grad_norm": 1.1919924569133369, + "learning_rate": 9.993873314796253e-06, + "loss": 0.028, + "step": 131 + }, + { + "epoch": 0.04584925321292115, + "grad_norm": 1.2102441655899068, + "learning_rate": 9.993591724673225e-06, + "loss": 0.0272, + "step": 132 + }, + { + "epoch": 0.04619659604029177, + "grad_norm": 1.3423201041149735, + "learning_rate": 9.993303812151153e-06, + "loss": 0.0288, + "step": 133 + }, + { + "epoch": 0.04654393886766238, + "grad_norm": 0.9911420725512745, + "learning_rate": 9.993009577594564e-06, + "loss": 0.0266, + "step": 134 + }, + { + "epoch": 0.046891281695032996, + "grad_norm": 2.818665002325742, + "learning_rate": 9.992709021375987e-06, + "loss": 0.0416, + "step": 135 + }, + { + "epoch": 0.04723862452240361, + "grad_norm": 0.6400736482581679, + "learning_rate": 9.99240214387596e-06, + "loss": 0.0224, + "step": 136 + }, + { + "epoch": 0.04758596734977423, + "grad_norm": 1.564814253959133, + "learning_rate": 9.99208894548302e-06, + "loss": 0.0326, + "step": 137 + }, + { + "epoch": 0.04793331017714484, + "grad_norm": 1.1019040221230227, + "learning_rate": 9.991769426593707e-06, + "loss": 0.0327, + "step": 138 + }, + { + "epoch": 0.04828065300451546, + "grad_norm": 6.120623936551482, + "learning_rate": 9.991443587612568e-06, + "loss": 0.047, + "step": 139 + }, + { + "epoch": 0.048627995831886074, + "grad_norm": 5.451375099863766, + "learning_rate": 9.991111428952145e-06, + "loss": 0.0503, + "step": 140 + }, + { + "epoch": 0.048975338659256684, + "grad_norm": 4.7872510231651635, + "learning_rate": 9.990772951032987e-06, + "loss": 0.0417, + "step": 141 + }, + { + "epoch": 0.0493226814866273, + "grad_norm": 3.9394554371100337, + "learning_rate": 9.990428154283641e-06, + "loss": 0.0341, + "step": 142 + }, + { + "epoch": 0.04967002431399792, + "grad_norm": 0.8850048672337401, + "learning_rate": 9.990077039140655e-06, + "loss": 0.0319, + "step": 143 + }, + { + "epoch": 0.05001736714136853, + "grad_norm": 0.735313738269197, + "learning_rate": 9.989719606048578e-06, + "loss": 0.03, + "step": 144 + }, + { + "epoch": 0.050364709968739145, + "grad_norm": 2.380846224871553, + "learning_rate": 9.989355855459954e-06, + "loss": 0.033, + "step": 145 + }, + { + "epoch": 0.05071205279610976, + "grad_norm": 1.5964417699444167, + "learning_rate": 9.988985787835332e-06, + "loss": 0.0297, + "step": 146 + }, + { + "epoch": 0.05105939562348037, + "grad_norm": 2.2693926915330067, + "learning_rate": 9.988609403643254e-06, + "loss": 0.0355, + "step": 147 + }, + { + "epoch": 0.05140673845085099, + "grad_norm": 0.8694077145197386, + "learning_rate": 9.98822670336026e-06, + "loss": 0.0257, + "step": 148 + }, + { + "epoch": 0.051754081278221606, + "grad_norm": 2.6184498491631687, + "learning_rate": 9.987837687470889e-06, + "loss": 0.0349, + "step": 149 + }, + { + "epoch": 0.05210142410559222, + "grad_norm": 1.8812135552729436, + "learning_rate": 9.987442356467677e-06, + "loss": 0.0205, + "step": 150 + }, + { + "epoch": 0.05244876693296283, + "grad_norm": 0.757853013334225, + "learning_rate": 9.987040710851148e-06, + "loss": 0.0361, + "step": 151 + }, + { + "epoch": 0.05279610976033345, + "grad_norm": 0.9951588342302143, + "learning_rate": 9.98663275112983e-06, + "loss": 0.0237, + "step": 152 + }, + { + "epoch": 0.05314345258770407, + "grad_norm": 1.6932805634155623, + "learning_rate": 9.986218477820244e-06, + "loss": 0.0368, + "step": 153 + }, + { + "epoch": 0.05349079541507468, + "grad_norm": 1.3139957494156915, + "learning_rate": 9.985797891446898e-06, + "loss": 0.0229, + "step": 154 + }, + { + "epoch": 0.053838138242445294, + "grad_norm": 0.5291454593174553, + "learning_rate": 9.9853709925423e-06, + "loss": 0.0228, + "step": 155 + }, + { + "epoch": 0.05418548106981591, + "grad_norm": 1.6707077604660217, + "learning_rate": 9.984937781646948e-06, + "loss": 0.03, + "step": 156 + }, + { + "epoch": 0.05453282389718652, + "grad_norm": 3.0726902017901927, + "learning_rate": 9.984498259309332e-06, + "loss": 0.0334, + "step": 157 + }, + { + "epoch": 0.05488016672455714, + "grad_norm": 0.7820818072933668, + "learning_rate": 9.984052426085931e-06, + "loss": 0.0205, + "step": 158 + }, + { + "epoch": 0.055227509551927755, + "grad_norm": 1.5369790153168252, + "learning_rate": 9.983600282541213e-06, + "loss": 0.033, + "step": 159 + }, + { + "epoch": 0.055574852379298365, + "grad_norm": 0.6130697531268696, + "learning_rate": 9.983141829247644e-06, + "loss": 0.0323, + "step": 160 + }, + { + "epoch": 0.05592219520666898, + "grad_norm": 2.0042679704987245, + "learning_rate": 9.982677066785667e-06, + "loss": 0.0362, + "step": 161 + }, + { + "epoch": 0.0562695380340396, + "grad_norm": 0.9373286974868164, + "learning_rate": 9.982205995743723e-06, + "loss": 0.027, + "step": 162 + }, + { + "epoch": 0.05661688086141021, + "grad_norm": 0.9593697401680631, + "learning_rate": 9.981728616718234e-06, + "loss": 0.0221, + "step": 163 + }, + { + "epoch": 0.056964223688780825, + "grad_norm": 0.9145114104321098, + "learning_rate": 9.981244930313613e-06, + "loss": 0.0269, + "step": 164 + }, + { + "epoch": 0.05731156651615144, + "grad_norm": 1.7022933635129043, + "learning_rate": 9.980754937142254e-06, + "loss": 0.0273, + "step": 165 + }, + { + "epoch": 0.05765890934352206, + "grad_norm": 0.5802100036143374, + "learning_rate": 9.98025863782454e-06, + "loss": 0.0197, + "step": 166 + }, + { + "epoch": 0.05800625217089267, + "grad_norm": 2.06874906530981, + "learning_rate": 9.979756032988837e-06, + "loss": 0.0292, + "step": 167 + }, + { + "epoch": 0.058353594998263286, + "grad_norm": 2.9290249045005017, + "learning_rate": 9.979247123271494e-06, + "loss": 0.0338, + "step": 168 + }, + { + "epoch": 0.0587009378256339, + "grad_norm": 0.8386866195629475, + "learning_rate": 9.978731909316841e-06, + "loss": 0.0243, + "step": 169 + }, + { + "epoch": 0.05904828065300451, + "grad_norm": 0.4502042472741338, + "learning_rate": 9.978210391777195e-06, + "loss": 0.0255, + "step": 170 + }, + { + "epoch": 0.05939562348037513, + "grad_norm": 2.248415124487682, + "learning_rate": 9.977682571312847e-06, + "loss": 0.0336, + "step": 171 + }, + { + "epoch": 0.05974296630774575, + "grad_norm": 1.0652482256976974, + "learning_rate": 9.977148448592077e-06, + "loss": 0.0224, + "step": 172 + }, + { + "epoch": 0.06009030913511636, + "grad_norm": 0.7329238551144143, + "learning_rate": 9.976608024291135e-06, + "loss": 0.0266, + "step": 173 + }, + { + "epoch": 0.060437651962486974, + "grad_norm": 0.8741338102260089, + "learning_rate": 9.976061299094253e-06, + "loss": 0.0219, + "step": 174 + }, + { + "epoch": 0.06078499478985759, + "grad_norm": 1.6779119248256096, + "learning_rate": 9.975508273693643e-06, + "loss": 0.0371, + "step": 175 + }, + { + "epoch": 0.0611323376172282, + "grad_norm": 0.7059586695874602, + "learning_rate": 9.974948948789492e-06, + "loss": 0.0255, + "step": 176 + }, + { + "epoch": 0.06147968044459882, + "grad_norm": 2.197705367927432, + "learning_rate": 9.974383325089962e-06, + "loss": 0.0222, + "step": 177 + }, + { + "epoch": 0.061827023271969435, + "grad_norm": 1.2087103303661568, + "learning_rate": 9.973811403311192e-06, + "loss": 0.0336, + "step": 178 + }, + { + "epoch": 0.06217436609934005, + "grad_norm": 1.831109776996565, + "learning_rate": 9.97323318417729e-06, + "loss": 0.0292, + "step": 179 + }, + { + "epoch": 0.06252170892671066, + "grad_norm": 0.6811940346616355, + "learning_rate": 9.972648668420346e-06, + "loss": 0.0241, + "step": 180 + }, + { + "epoch": 0.06286905175408128, + "grad_norm": 0.7273972811687258, + "learning_rate": 9.972057856780412e-06, + "loss": 0.0164, + "step": 181 + }, + { + "epoch": 0.0632163945814519, + "grad_norm": 1.795643396141142, + "learning_rate": 9.97146075000552e-06, + "loss": 0.0233, + "step": 182 + }, + { + "epoch": 0.06356373740882251, + "grad_norm": 0.7725411745774898, + "learning_rate": 9.970857348851667e-06, + "loss": 0.0352, + "step": 183 + }, + { + "epoch": 0.06391108023619312, + "grad_norm": 0.7933463105547254, + "learning_rate": 9.970247654082816e-06, + "loss": 0.0296, + "step": 184 + }, + { + "epoch": 0.06425842306356373, + "grad_norm": 2.2605994347865774, + "learning_rate": 9.96963166647091e-06, + "loss": 0.04, + "step": 185 + }, + { + "epoch": 0.06460576589093435, + "grad_norm": 1.0749382398980514, + "learning_rate": 9.969009386795849e-06, + "loss": 0.022, + "step": 186 + }, + { + "epoch": 0.06495310871830497, + "grad_norm": 0.557792334948461, + "learning_rate": 9.968380815845504e-06, + "loss": 0.0327, + "step": 187 + }, + { + "epoch": 0.06530045154567558, + "grad_norm": 3.518645514803458, + "learning_rate": 9.967745954415711e-06, + "loss": 0.0286, + "step": 188 + }, + { + "epoch": 0.0656477943730462, + "grad_norm": 0.5873301047411636, + "learning_rate": 9.967104803310266e-06, + "loss": 0.0295, + "step": 189 + }, + { + "epoch": 0.06599513720041682, + "grad_norm": 0.6753747187607116, + "learning_rate": 9.966457363340936e-06, + "loss": 0.026, + "step": 190 + }, + { + "epoch": 0.06634248002778742, + "grad_norm": 0.4603778678263685, + "learning_rate": 9.965803635327445e-06, + "loss": 0.028, + "step": 191 + }, + { + "epoch": 0.06668982285515804, + "grad_norm": 0.9556093549811782, + "learning_rate": 9.965143620097479e-06, + "loss": 0.0195, + "step": 192 + }, + { + "epoch": 0.06703716568252865, + "grad_norm": 2.1021718467486745, + "learning_rate": 9.964477318486687e-06, + "loss": 0.0256, + "step": 193 + }, + { + "epoch": 0.06738450850989927, + "grad_norm": 1.714896004446046, + "learning_rate": 9.963804731338674e-06, + "loss": 0.0245, + "step": 194 + }, + { + "epoch": 0.06773185133726989, + "grad_norm": 0.8958701314528463, + "learning_rate": 9.963125859505e-06, + "loss": 0.0367, + "step": 195 + }, + { + "epoch": 0.0680791941646405, + "grad_norm": 1.4512351067062361, + "learning_rate": 9.962440703845193e-06, + "loss": 0.0204, + "step": 196 + }, + { + "epoch": 0.06842653699201111, + "grad_norm": 0.8793852979074263, + "learning_rate": 9.961749265226728e-06, + "loss": 0.0182, + "step": 197 + }, + { + "epoch": 0.06877387981938173, + "grad_norm": 1.8655135067659925, + "learning_rate": 9.961051544525037e-06, + "loss": 0.0228, + "step": 198 + }, + { + "epoch": 0.06912122264675234, + "grad_norm": 2.2100382744344698, + "learning_rate": 9.960347542623506e-06, + "loss": 0.0344, + "step": 199 + }, + { + "epoch": 0.06946856547412296, + "grad_norm": 0.9843602786620123, + "learning_rate": 9.959637260413471e-06, + "loss": 0.0298, + "step": 200 + }, + { + "epoch": 0.06981590830149358, + "grad_norm": 3.5094590361679496, + "learning_rate": 9.958920698794226e-06, + "loss": 0.037, + "step": 201 + }, + { + "epoch": 0.07016325112886419, + "grad_norm": 1.5466354807280334, + "learning_rate": 9.958197858673009e-06, + "loss": 0.0294, + "step": 202 + }, + { + "epoch": 0.07051059395623481, + "grad_norm": 3.257914253480638, + "learning_rate": 9.95746874096501e-06, + "loss": 0.0343, + "step": 203 + }, + { + "epoch": 0.07085793678360541, + "grad_norm": 3.1724146363151333, + "learning_rate": 9.95673334659337e-06, + "loss": 0.046, + "step": 204 + }, + { + "epoch": 0.07120527961097603, + "grad_norm": 2.045576669926992, + "learning_rate": 9.95599167648917e-06, + "loss": 0.0328, + "step": 205 + }, + { + "epoch": 0.07155262243834665, + "grad_norm": 4.362827099663752, + "learning_rate": 9.95524373159144e-06, + "loss": 0.0333, + "step": 206 + }, + { + "epoch": 0.07189996526571726, + "grad_norm": 2.1027185240369244, + "learning_rate": 9.954489512847156e-06, + "loss": 0.0278, + "step": 207 + }, + { + "epoch": 0.07224730809308788, + "grad_norm": 1.951261682373136, + "learning_rate": 9.953729021211238e-06, + "loss": 0.0369, + "step": 208 + }, + { + "epoch": 0.0725946509204585, + "grad_norm": 2.70690992472503, + "learning_rate": 9.952962257646545e-06, + "loss": 0.0368, + "step": 209 + }, + { + "epoch": 0.0729419937478291, + "grad_norm": 1.1045388371026474, + "learning_rate": 9.952189223123877e-06, + "loss": 0.0198, + "step": 210 + }, + { + "epoch": 0.07328933657519972, + "grad_norm": 0.37890065356086955, + "learning_rate": 9.951409918621977e-06, + "loss": 0.0275, + "step": 211 + }, + { + "epoch": 0.07363667940257033, + "grad_norm": 1.27246581897078, + "learning_rate": 9.950624345127523e-06, + "loss": 0.0269, + "step": 212 + }, + { + "epoch": 0.07398402222994095, + "grad_norm": 0.6532636236631134, + "learning_rate": 9.949832503635133e-06, + "loss": 0.0275, + "step": 213 + }, + { + "epoch": 0.07433136505731157, + "grad_norm": 2.5733332280836576, + "learning_rate": 9.949034395147357e-06, + "loss": 0.0274, + "step": 214 + }, + { + "epoch": 0.07467870788468219, + "grad_norm": 2.652931462756075, + "learning_rate": 9.948230020674685e-06, + "loss": 0.0321, + "step": 215 + }, + { + "epoch": 0.0750260507120528, + "grad_norm": 1.2497045154490993, + "learning_rate": 9.947419381235538e-06, + "loss": 0.0277, + "step": 216 + }, + { + "epoch": 0.0753733935394234, + "grad_norm": 1.029916760819159, + "learning_rate": 9.946602477856262e-06, + "loss": 0.0223, + "step": 217 + }, + { + "epoch": 0.07572073636679402, + "grad_norm": 1.0496525720606742, + "learning_rate": 9.94577931157115e-06, + "loss": 0.0309, + "step": 218 + }, + { + "epoch": 0.07606807919416464, + "grad_norm": 0.8813614432848731, + "learning_rate": 9.944949883422409e-06, + "loss": 0.0223, + "step": 219 + }, + { + "epoch": 0.07641542202153526, + "grad_norm": 1.048099444449534, + "learning_rate": 9.944114194460181e-06, + "loss": 0.0358, + "step": 220 + }, + { + "epoch": 0.07676276484890587, + "grad_norm": 0.5608593650182933, + "learning_rate": 9.943272245742534e-06, + "loss": 0.0197, + "step": 221 + }, + { + "epoch": 0.07711010767627649, + "grad_norm": 1.6998010859628299, + "learning_rate": 9.942424038335462e-06, + "loss": 0.0255, + "step": 222 + }, + { + "epoch": 0.0774574505036471, + "grad_norm": 2.345709307993445, + "learning_rate": 9.941569573312882e-06, + "loss": 0.0341, + "step": 223 + }, + { + "epoch": 0.07780479333101771, + "grad_norm": 0.7888283535068908, + "learning_rate": 9.940708851756633e-06, + "loss": 0.0204, + "step": 224 + }, + { + "epoch": 0.07815213615838833, + "grad_norm": 0.5490007316162747, + "learning_rate": 9.939841874756481e-06, + "loss": 0.0273, + "step": 225 + }, + { + "epoch": 0.07849947898575894, + "grad_norm": 1.1934836714677952, + "learning_rate": 9.938968643410103e-06, + "loss": 0.0206, + "step": 226 + }, + { + "epoch": 0.07884682181312956, + "grad_norm": 0.7241877331482239, + "learning_rate": 9.938089158823101e-06, + "loss": 0.0221, + "step": 227 + }, + { + "epoch": 0.07919416464050018, + "grad_norm": 2.4778189410310696, + "learning_rate": 9.937203422108995e-06, + "loss": 0.0308, + "step": 228 + }, + { + "epoch": 0.0795415074678708, + "grad_norm": 3.748172451740872, + "learning_rate": 9.936311434389216e-06, + "loss": 0.0427, + "step": 229 + }, + { + "epoch": 0.0798888502952414, + "grad_norm": 0.7889067721600285, + "learning_rate": 9.935413196793111e-06, + "loss": 0.0193, + "step": 230 + }, + { + "epoch": 0.08023619312261202, + "grad_norm": 0.704831305338057, + "learning_rate": 9.934508710457944e-06, + "loss": 0.0194, + "step": 231 + }, + { + "epoch": 0.08058353594998263, + "grad_norm": 0.40717838994648536, + "learning_rate": 9.933597976528883e-06, + "loss": 0.0254, + "step": 232 + }, + { + "epoch": 0.08093087877735325, + "grad_norm": 1.964638344320029, + "learning_rate": 9.932680996159016e-06, + "loss": 0.0283, + "step": 233 + }, + { + "epoch": 0.08127822160472387, + "grad_norm": 2.4616476396806473, + "learning_rate": 9.931757770509332e-06, + "loss": 0.0364, + "step": 234 + }, + { + "epoch": 0.08162556443209448, + "grad_norm": 0.6589471069941858, + "learning_rate": 9.930828300748726e-06, + "loss": 0.0152, + "step": 235 + }, + { + "epoch": 0.08197290725946509, + "grad_norm": 1.5170996006478779, + "learning_rate": 9.929892588054007e-06, + "loss": 0.0456, + "step": 236 + }, + { + "epoch": 0.0823202500868357, + "grad_norm": 0.49344733685789915, + "learning_rate": 9.928950633609878e-06, + "loss": 0.0157, + "step": 237 + }, + { + "epoch": 0.08266759291420632, + "grad_norm": 1.2510013978714254, + "learning_rate": 9.928002438608955e-06, + "loss": 0.0264, + "step": 238 + }, + { + "epoch": 0.08301493574157694, + "grad_norm": 0.6672029612706915, + "learning_rate": 9.927048004251748e-06, + "loss": 0.0232, + "step": 239 + }, + { + "epoch": 0.08336227856894755, + "grad_norm": 0.7870864791530758, + "learning_rate": 9.926087331746668e-06, + "loss": 0.0241, + "step": 240 + }, + { + "epoch": 0.08370962139631817, + "grad_norm": 0.4430411461060668, + "learning_rate": 9.925120422310023e-06, + "loss": 0.0239, + "step": 241 + }, + { + "epoch": 0.08405696422368879, + "grad_norm": 0.7393154201127319, + "learning_rate": 9.924147277166025e-06, + "loss": 0.0221, + "step": 242 + }, + { + "epoch": 0.08440430705105939, + "grad_norm": 1.5211011444968148, + "learning_rate": 9.923167897546773e-06, + "loss": 0.0361, + "step": 243 + }, + { + "epoch": 0.08475164987843001, + "grad_norm": 0.306402457203348, + "learning_rate": 9.92218228469226e-06, + "loss": 0.0161, + "step": 244 + }, + { + "epoch": 0.08509899270580062, + "grad_norm": 0.6425545998545745, + "learning_rate": 9.921190439850374e-06, + "loss": 0.0336, + "step": 245 + }, + { + "epoch": 0.08544633553317124, + "grad_norm": 1.1621242417732545, + "learning_rate": 9.920192364276894e-06, + "loss": 0.0346, + "step": 246 + }, + { + "epoch": 0.08579367836054186, + "grad_norm": 0.8594232560472324, + "learning_rate": 9.919188059235483e-06, + "loss": 0.0276, + "step": 247 + }, + { + "epoch": 0.08614102118791248, + "grad_norm": 1.1817899977122954, + "learning_rate": 9.918177525997697e-06, + "loss": 0.0237, + "step": 248 + }, + { + "epoch": 0.08648836401528308, + "grad_norm": 0.663618476895076, + "learning_rate": 9.917160765842972e-06, + "loss": 0.0147, + "step": 249 + }, + { + "epoch": 0.0868357068426537, + "grad_norm": 2.1119160052926906, + "learning_rate": 9.916137780058634e-06, + "loss": 0.0372, + "step": 250 + }, + { + "epoch": 0.08718304967002431, + "grad_norm": 0.5192464513916062, + "learning_rate": 9.915108569939884e-06, + "loss": 0.0226, + "step": 251 + }, + { + "epoch": 0.08753039249739493, + "grad_norm": 1.4192863431762925, + "learning_rate": 9.914073136789812e-06, + "loss": 0.0208, + "step": 252 + }, + { + "epoch": 0.08787773532476555, + "grad_norm": 1.4952635090772373, + "learning_rate": 9.913031481919378e-06, + "loss": 0.0256, + "step": 253 + }, + { + "epoch": 0.08822507815213616, + "grad_norm": 0.7782630656189196, + "learning_rate": 9.911983606647426e-06, + "loss": 0.0301, + "step": 254 + }, + { + "epoch": 0.08857242097950677, + "grad_norm": 1.0227442959844868, + "learning_rate": 9.910929512300673e-06, + "loss": 0.0205, + "step": 255 + }, + { + "epoch": 0.08891976380687738, + "grad_norm": 1.0984883231423739, + "learning_rate": 9.909869200213711e-06, + "loss": 0.0272, + "step": 256 + }, + { + "epoch": 0.089267106634248, + "grad_norm": 2.616121786507333, + "learning_rate": 9.908802671729004e-06, + "loss": 0.0298, + "step": 257 + }, + { + "epoch": 0.08961444946161862, + "grad_norm": 1.0424691898538212, + "learning_rate": 9.907729928196885e-06, + "loss": 0.0221, + "step": 258 + }, + { + "epoch": 0.08996179228898923, + "grad_norm": 1.2136434228663253, + "learning_rate": 9.90665097097556e-06, + "loss": 0.0344, + "step": 259 + }, + { + "epoch": 0.09030913511635985, + "grad_norm": 1.0607165560461278, + "learning_rate": 9.905565801431097e-06, + "loss": 0.0278, + "step": 260 + }, + { + "epoch": 0.09065647794373047, + "grad_norm": 0.7592364646233872, + "learning_rate": 9.904474420937431e-06, + "loss": 0.0241, + "step": 261 + }, + { + "epoch": 0.09100382077110107, + "grad_norm": 1.142833500760027, + "learning_rate": 9.903376830876363e-06, + "loss": 0.0194, + "step": 262 + }, + { + "epoch": 0.09135116359847169, + "grad_norm": 1.5792096178170156, + "learning_rate": 9.902273032637558e-06, + "loss": 0.0285, + "step": 263 + }, + { + "epoch": 0.0916985064258423, + "grad_norm": 1.2090200476218058, + "learning_rate": 9.901163027618532e-06, + "loss": 0.027, + "step": 264 + }, + { + "epoch": 0.09204584925321292, + "grad_norm": 0.7247661863395669, + "learning_rate": 9.90004681722467e-06, + "loss": 0.0381, + "step": 265 + }, + { + "epoch": 0.09239319208058354, + "grad_norm": 1.3359604948328887, + "learning_rate": 9.898924402869204e-06, + "loss": 0.025, + "step": 266 + }, + { + "epoch": 0.09274053490795416, + "grad_norm": 1.5054331995013246, + "learning_rate": 9.897795785973227e-06, + "loss": 0.0176, + "step": 267 + }, + { + "epoch": 0.09308787773532476, + "grad_norm": 0.3800080517239375, + "learning_rate": 9.896660967965688e-06, + "loss": 0.0206, + "step": 268 + }, + { + "epoch": 0.09343522056269538, + "grad_norm": 2.2058762679111212, + "learning_rate": 9.895519950283378e-06, + "loss": 0.0382, + "step": 269 + }, + { + "epoch": 0.09378256339006599, + "grad_norm": 1.987671837514159, + "learning_rate": 9.894372734370945e-06, + "loss": 0.0245, + "step": 270 + }, + { + "epoch": 0.09412990621743661, + "grad_norm": 1.1525093904407058, + "learning_rate": 9.89321932168088e-06, + "loss": 0.0294, + "step": 271 + }, + { + "epoch": 0.09447724904480723, + "grad_norm": 0.5288867918479366, + "learning_rate": 9.892059713673521e-06, + "loss": 0.0329, + "step": 272 + }, + { + "epoch": 0.09482459187217784, + "grad_norm": 2.3156241409626537, + "learning_rate": 9.890893911817056e-06, + "loss": 0.0281, + "step": 273 + }, + { + "epoch": 0.09517193469954846, + "grad_norm": 1.6457301083303564, + "learning_rate": 9.889721917587504e-06, + "loss": 0.0246, + "step": 274 + }, + { + "epoch": 0.09551927752691906, + "grad_norm": 1.3357334045720493, + "learning_rate": 9.888543732468732e-06, + "loss": 0.0285, + "step": 275 + }, + { + "epoch": 0.09586662035428968, + "grad_norm": 1.283091226837623, + "learning_rate": 9.887359357952441e-06, + "loss": 0.0354, + "step": 276 + }, + { + "epoch": 0.0962139631816603, + "grad_norm": 1.468754392397442, + "learning_rate": 9.886168795538175e-06, + "loss": 0.0243, + "step": 277 + }, + { + "epoch": 0.09656130600903091, + "grad_norm": 0.5847171116164384, + "learning_rate": 9.884972046733306e-06, + "loss": 0.0187, + "step": 278 + }, + { + "epoch": 0.09690864883640153, + "grad_norm": 0.7176769597295244, + "learning_rate": 9.883769113053039e-06, + "loss": 0.0242, + "step": 279 + }, + { + "epoch": 0.09725599166377215, + "grad_norm": 0.4262926228040994, + "learning_rate": 9.882559996020414e-06, + "loss": 0.0179, + "step": 280 + }, + { + "epoch": 0.09760333449114275, + "grad_norm": 0.9525497182278096, + "learning_rate": 9.881344697166293e-06, + "loss": 0.0349, + "step": 281 + }, + { + "epoch": 0.09795067731851337, + "grad_norm": 0.9791433669149336, + "learning_rate": 9.880123218029374e-06, + "loss": 0.0308, + "step": 282 + }, + { + "epoch": 0.09829802014588399, + "grad_norm": 0.7630066342045814, + "learning_rate": 9.878895560156172e-06, + "loss": 0.0205, + "step": 283 + }, + { + "epoch": 0.0986453629732546, + "grad_norm": 0.364457070730968, + "learning_rate": 9.877661725101028e-06, + "loss": 0.0195, + "step": 284 + }, + { + "epoch": 0.09899270580062522, + "grad_norm": 1.2950920833202644, + "learning_rate": 9.876421714426104e-06, + "loss": 0.0211, + "step": 285 + }, + { + "epoch": 0.09934004862799584, + "grad_norm": 3.594479893409194, + "learning_rate": 9.87517552970138e-06, + "loss": 0.0375, + "step": 286 + }, + { + "epoch": 0.09968739145536645, + "grad_norm": 1.3559833171841018, + "learning_rate": 9.873923172504653e-06, + "loss": 0.0306, + "step": 287 + }, + { + "epoch": 0.10003473428273706, + "grad_norm": 1.0574786377389542, + "learning_rate": 9.872664644421539e-06, + "loss": 0.0378, + "step": 288 + }, + { + "epoch": 0.10038207711010767, + "grad_norm": 0.8553158210643477, + "learning_rate": 9.87139994704546e-06, + "loss": 0.017, + "step": 289 + }, + { + "epoch": 0.10072941993747829, + "grad_norm": 1.2327340151649626, + "learning_rate": 9.870129081977654e-06, + "loss": 0.0291, + "step": 290 + }, + { + "epoch": 0.10107676276484891, + "grad_norm": 0.6120639599947546, + "learning_rate": 9.868852050827167e-06, + "loss": 0.0248, + "step": 291 + }, + { + "epoch": 0.10142410559221952, + "grad_norm": 1.3504767761554735, + "learning_rate": 9.86756885521085e-06, + "loss": 0.0196, + "step": 292 + }, + { + "epoch": 0.10177144841959014, + "grad_norm": 2.130326569000447, + "learning_rate": 9.866279496753361e-06, + "loss": 0.0272, + "step": 293 + }, + { + "epoch": 0.10211879124696074, + "grad_norm": 2.961313545763907, + "learning_rate": 9.86498397708716e-06, + "loss": 0.0306, + "step": 294 + }, + { + "epoch": 0.10246613407433136, + "grad_norm": 0.6185598205757522, + "learning_rate": 9.863682297852506e-06, + "loss": 0.0268, + "step": 295 + }, + { + "epoch": 0.10281347690170198, + "grad_norm": 2.900646593623201, + "learning_rate": 9.862374460697462e-06, + "loss": 0.0299, + "step": 296 + }, + { + "epoch": 0.1031608197290726, + "grad_norm": 4.815048093426214, + "learning_rate": 9.86106046727788e-06, + "loss": 0.0413, + "step": 297 + }, + { + "epoch": 0.10350816255644321, + "grad_norm": 3.6854621061195236, + "learning_rate": 9.859740319257413e-06, + "loss": 0.0315, + "step": 298 + }, + { + "epoch": 0.10385550538381383, + "grad_norm": 3.1485787317926315, + "learning_rate": 9.858414018307503e-06, + "loss": 0.0395, + "step": 299 + }, + { + "epoch": 0.10420284821118445, + "grad_norm": 1.6857361849761061, + "learning_rate": 9.857081566107383e-06, + "loss": 0.0306, + "step": 300 + }, + { + "epoch": 0.10455019103855505, + "grad_norm": 1.1203216330091716, + "learning_rate": 9.855742964344074e-06, + "loss": 0.0243, + "step": 301 + }, + { + "epoch": 0.10489753386592567, + "grad_norm": 2.214128202481394, + "learning_rate": 9.854398214712382e-06, + "loss": 0.0234, + "step": 302 + }, + { + "epoch": 0.10524487669329628, + "grad_norm": 1.6017019194654412, + "learning_rate": 9.853047318914898e-06, + "loss": 0.0291, + "step": 303 + }, + { + "epoch": 0.1055922195206669, + "grad_norm": 2.2711519721461917, + "learning_rate": 9.851690278661998e-06, + "loss": 0.0271, + "step": 304 + }, + { + "epoch": 0.10593956234803752, + "grad_norm": 0.4684895446152942, + "learning_rate": 9.850327095671831e-06, + "loss": 0.0177, + "step": 305 + }, + { + "epoch": 0.10628690517540813, + "grad_norm": 1.8303208610929107, + "learning_rate": 9.848957771670326e-06, + "loss": 0.0286, + "step": 306 + }, + { + "epoch": 0.10663424800277874, + "grad_norm": 0.547820372795727, + "learning_rate": 9.847582308391189e-06, + "loss": 0.0328, + "step": 307 + }, + { + "epoch": 0.10698159083014935, + "grad_norm": 1.7911990714879877, + "learning_rate": 9.846200707575897e-06, + "loss": 0.0153, + "step": 308 + }, + { + "epoch": 0.10732893365751997, + "grad_norm": 2.725196937839101, + "learning_rate": 9.844812970973699e-06, + "loss": 0.0228, + "step": 309 + }, + { + "epoch": 0.10767627648489059, + "grad_norm": 2.375094543001517, + "learning_rate": 9.843419100341608e-06, + "loss": 0.0203, + "step": 310 + }, + { + "epoch": 0.1080236193122612, + "grad_norm": 0.732852515115049, + "learning_rate": 9.842019097444414e-06, + "loss": 0.0214, + "step": 311 + }, + { + "epoch": 0.10837096213963182, + "grad_norm": 0.8565652562532247, + "learning_rate": 9.840612964054658e-06, + "loss": 0.031, + "step": 312 + }, + { + "epoch": 0.10871830496700244, + "grad_norm": 1.046089822343109, + "learning_rate": 9.839200701952653e-06, + "loss": 0.0273, + "step": 313 + }, + { + "epoch": 0.10906564779437304, + "grad_norm": 1.4037462533263418, + "learning_rate": 9.837782312926465e-06, + "loss": 0.0247, + "step": 314 + }, + { + "epoch": 0.10941299062174366, + "grad_norm": 0.724135174812571, + "learning_rate": 9.836357798771922e-06, + "loss": 0.0197, + "step": 315 + }, + { + "epoch": 0.10976033344911428, + "grad_norm": 0.8979754709718626, + "learning_rate": 9.834927161292604e-06, + "loss": 0.0135, + "step": 316 + }, + { + "epoch": 0.11010767627648489, + "grad_norm": 0.818961402615803, + "learning_rate": 9.833490402299844e-06, + "loss": 0.024, + "step": 317 + }, + { + "epoch": 0.11045501910385551, + "grad_norm": 1.8250252957901378, + "learning_rate": 9.832047523612726e-06, + "loss": 0.0305, + "step": 318 + }, + { + "epoch": 0.11080236193122613, + "grad_norm": 0.898470564935298, + "learning_rate": 9.830598527058083e-06, + "loss": 0.0262, + "step": 319 + }, + { + "epoch": 0.11114970475859673, + "grad_norm": 0.7300373503368474, + "learning_rate": 9.829143414470495e-06, + "loss": 0.0286, + "step": 320 + }, + { + "epoch": 0.11149704758596735, + "grad_norm": 1.0198415430023702, + "learning_rate": 9.82768218769228e-06, + "loss": 0.0245, + "step": 321 + }, + { + "epoch": 0.11184439041333796, + "grad_norm": 1.5853784707754817, + "learning_rate": 9.826214848573503e-06, + "loss": 0.0272, + "step": 322 + }, + { + "epoch": 0.11219173324070858, + "grad_norm": 0.5862824974087557, + "learning_rate": 9.824741398971966e-06, + "loss": 0.0168, + "step": 323 + }, + { + "epoch": 0.1125390760680792, + "grad_norm": 1.0344261835540582, + "learning_rate": 9.823261840753209e-06, + "loss": 0.0186, + "step": 324 + }, + { + "epoch": 0.11288641889544981, + "grad_norm": 1.3434435771133537, + "learning_rate": 9.821776175790501e-06, + "loss": 0.0287, + "step": 325 + }, + { + "epoch": 0.11323376172282042, + "grad_norm": 0.6130576808513916, + "learning_rate": 9.820284405964846e-06, + "loss": 0.018, + "step": 326 + }, + { + "epoch": 0.11358110455019103, + "grad_norm": 0.6570948194085574, + "learning_rate": 9.81878653316498e-06, + "loss": 0.0136, + "step": 327 + }, + { + "epoch": 0.11392844737756165, + "grad_norm": 1.6328034399729674, + "learning_rate": 9.817282559287362e-06, + "loss": 0.0277, + "step": 328 + }, + { + "epoch": 0.11427579020493227, + "grad_norm": 0.3948241510052037, + "learning_rate": 9.815772486236179e-06, + "loss": 0.0126, + "step": 329 + }, + { + "epoch": 0.11462313303230288, + "grad_norm": 0.5069144619443747, + "learning_rate": 9.814256315923335e-06, + "loss": 0.0119, + "step": 330 + }, + { + "epoch": 0.1149704758596735, + "grad_norm": 1.1749675379829028, + "learning_rate": 9.81273405026846e-06, + "loss": 0.0401, + "step": 331 + }, + { + "epoch": 0.11531781868704412, + "grad_norm": 1.38512145823474, + "learning_rate": 9.811205691198897e-06, + "loss": 0.0255, + "step": 332 + }, + { + "epoch": 0.11566516151441472, + "grad_norm": 1.6588777415442508, + "learning_rate": 9.809671240649705e-06, + "loss": 0.0232, + "step": 333 + }, + { + "epoch": 0.11601250434178534, + "grad_norm": 0.891351380406614, + "learning_rate": 9.808130700563658e-06, + "loss": 0.0182, + "step": 334 + }, + { + "epoch": 0.11635984716915596, + "grad_norm": 1.9182917391789063, + "learning_rate": 9.806584072891234e-06, + "loss": 0.0295, + "step": 335 + }, + { + "epoch": 0.11670718999652657, + "grad_norm": 1.414292519566524, + "learning_rate": 9.805031359590626e-06, + "loss": 0.0182, + "step": 336 + }, + { + "epoch": 0.11705453282389719, + "grad_norm": 1.048723673722958, + "learning_rate": 9.803472562627726e-06, + "loss": 0.0119, + "step": 337 + }, + { + "epoch": 0.1174018756512678, + "grad_norm": 1.2747705868686736, + "learning_rate": 9.801907683976128e-06, + "loss": 0.0272, + "step": 338 + }, + { + "epoch": 0.11774921847863841, + "grad_norm": 0.9457245602215513, + "learning_rate": 9.800336725617136e-06, + "loss": 0.0201, + "step": 339 + }, + { + "epoch": 0.11809656130600903, + "grad_norm": 1.2510122100134684, + "learning_rate": 9.798759689539739e-06, + "loss": 0.0257, + "step": 340 + }, + { + "epoch": 0.11844390413337964, + "grad_norm": 0.8319010285486187, + "learning_rate": 9.797176577740625e-06, + "loss": 0.0254, + "step": 341 + }, + { + "epoch": 0.11879124696075026, + "grad_norm": 0.7850673506313961, + "learning_rate": 9.795587392224182e-06, + "loss": 0.0234, + "step": 342 + }, + { + "epoch": 0.11913858978812088, + "grad_norm": 0.587424666600968, + "learning_rate": 9.793992135002476e-06, + "loss": 0.0179, + "step": 343 + }, + { + "epoch": 0.1194859326154915, + "grad_norm": 1.2534232883052836, + "learning_rate": 9.792390808095268e-06, + "loss": 0.0228, + "step": 344 + }, + { + "epoch": 0.11983327544286211, + "grad_norm": 1.1177620903458203, + "learning_rate": 9.790783413530006e-06, + "loss": 0.0283, + "step": 345 + }, + { + "epoch": 0.12018061827023271, + "grad_norm": 0.7956916247471605, + "learning_rate": 9.789169953341809e-06, + "loss": 0.026, + "step": 346 + }, + { + "epoch": 0.12052796109760333, + "grad_norm": 1.4672529265658403, + "learning_rate": 9.787550429573487e-06, + "loss": 0.0242, + "step": 347 + }, + { + "epoch": 0.12087530392497395, + "grad_norm": 0.499722630858149, + "learning_rate": 9.785924844275523e-06, + "loss": 0.0243, + "step": 348 + }, + { + "epoch": 0.12122264675234456, + "grad_norm": 0.7528884315208382, + "learning_rate": 9.784293199506076e-06, + "loss": 0.0312, + "step": 349 + }, + { + "epoch": 0.12156998957971518, + "grad_norm": 0.9565348415093637, + "learning_rate": 9.782655497330972e-06, + "loss": 0.0149, + "step": 350 + }, + { + "epoch": 0.1219173324070858, + "grad_norm": 4.700227177143458, + "learning_rate": 9.781011739823715e-06, + "loss": 0.0459, + "step": 351 + }, + { + "epoch": 0.1222646752344564, + "grad_norm": 2.0797417224686545, + "learning_rate": 9.779361929065462e-06, + "loss": 0.0248, + "step": 352 + }, + { + "epoch": 0.12261201806182702, + "grad_norm": 1.5530326186566639, + "learning_rate": 9.777706067145052e-06, + "loss": 0.0278, + "step": 353 + }, + { + "epoch": 0.12295936088919764, + "grad_norm": 0.8614425997016937, + "learning_rate": 9.77604415615897e-06, + "loss": 0.0171, + "step": 354 + }, + { + "epoch": 0.12330670371656825, + "grad_norm": 0.798882409804884, + "learning_rate": 9.77437619821137e-06, + "loss": 0.0221, + "step": 355 + }, + { + "epoch": 0.12365404654393887, + "grad_norm": 1.309963463600437, + "learning_rate": 9.772702195414053e-06, + "loss": 0.0227, + "step": 356 + }, + { + "epoch": 0.12400138937130949, + "grad_norm": 1.0239470706246998, + "learning_rate": 9.771022149886482e-06, + "loss": 0.0247, + "step": 357 + }, + { + "epoch": 0.1243487321986801, + "grad_norm": 0.694165293574145, + "learning_rate": 9.769336063755763e-06, + "loss": 0.0165, + "step": 358 + }, + { + "epoch": 0.1246960750260507, + "grad_norm": 0.8806399954727403, + "learning_rate": 9.767643939156658e-06, + "loss": 0.0315, + "step": 359 + }, + { + "epoch": 0.12504341785342132, + "grad_norm": 1.4519111212390032, + "learning_rate": 9.765945778231568e-06, + "loss": 0.0206, + "step": 360 + }, + { + "epoch": 0.12539076068079194, + "grad_norm": 0.7103007161003656, + "learning_rate": 9.76424158313054e-06, + "loss": 0.0206, + "step": 361 + }, + { + "epoch": 0.12573810350816256, + "grad_norm": 1.4369518423012835, + "learning_rate": 9.762531356011258e-06, + "loss": 0.0309, + "step": 362 + }, + { + "epoch": 0.12608544633553317, + "grad_norm": 1.9088679954477914, + "learning_rate": 9.760815099039045e-06, + "loss": 0.025, + "step": 363 + }, + { + "epoch": 0.1264327891629038, + "grad_norm": 1.2218949903799274, + "learning_rate": 9.75909281438686e-06, + "loss": 0.0241, + "step": 364 + }, + { + "epoch": 0.1267801319902744, + "grad_norm": 0.8840693103252929, + "learning_rate": 9.757364504235292e-06, + "loss": 0.0256, + "step": 365 + }, + { + "epoch": 0.12712747481764503, + "grad_norm": 2.3783985810393853, + "learning_rate": 9.755630170772556e-06, + "loss": 0.0279, + "step": 366 + }, + { + "epoch": 0.12747481764501564, + "grad_norm": 2.1206965462444987, + "learning_rate": 9.753889816194498e-06, + "loss": 0.0371, + "step": 367 + }, + { + "epoch": 0.12782216047238623, + "grad_norm": 3.735386143479719, + "learning_rate": 9.752143442704586e-06, + "loss": 0.0474, + "step": 368 + }, + { + "epoch": 0.12816950329975685, + "grad_norm": 0.6664661005983077, + "learning_rate": 9.750391052513906e-06, + "loss": 0.026, + "step": 369 + }, + { + "epoch": 0.12851684612712747, + "grad_norm": 0.3453207995141737, + "learning_rate": 9.748632647841165e-06, + "loss": 0.0189, + "step": 370 + }, + { + "epoch": 0.12886418895449808, + "grad_norm": 1.1376711577741057, + "learning_rate": 9.746868230912683e-06, + "loss": 0.0347, + "step": 371 + }, + { + "epoch": 0.1292115317818687, + "grad_norm": 1.0621834917092619, + "learning_rate": 9.745097803962394e-06, + "loss": 0.0212, + "step": 372 + }, + { + "epoch": 0.12955887460923932, + "grad_norm": 1.4420764224102458, + "learning_rate": 9.743321369231837e-06, + "loss": 0.0288, + "step": 373 + }, + { + "epoch": 0.12990621743660993, + "grad_norm": 0.47180137756368545, + "learning_rate": 9.741538928970163e-06, + "loss": 0.0212, + "step": 374 + }, + { + "epoch": 0.13025356026398055, + "grad_norm": 1.0223869124759941, + "learning_rate": 9.739750485434126e-06, + "loss": 0.0275, + "step": 375 + }, + { + "epoch": 0.13060090309135117, + "grad_norm": 0.3467757694541222, + "learning_rate": 9.737956040888073e-06, + "loss": 0.0212, + "step": 376 + }, + { + "epoch": 0.13094824591872178, + "grad_norm": 0.4743085064117592, + "learning_rate": 9.736155597603959e-06, + "loss": 0.0238, + "step": 377 + }, + { + "epoch": 0.1312955887460924, + "grad_norm": 0.5939635024690287, + "learning_rate": 9.734349157861329e-06, + "loss": 0.0187, + "step": 378 + }, + { + "epoch": 0.13164293157346302, + "grad_norm": 0.8656882396061, + "learning_rate": 9.73253672394732e-06, + "loss": 0.0267, + "step": 379 + }, + { + "epoch": 0.13199027440083363, + "grad_norm": 0.9530332353603665, + "learning_rate": 9.73071829815666e-06, + "loss": 0.0226, + "step": 380 + }, + { + "epoch": 0.13233761722820422, + "grad_norm": 1.126170471334089, + "learning_rate": 9.728893882791663e-06, + "loss": 0.0264, + "step": 381 + }, + { + "epoch": 0.13268496005557484, + "grad_norm": 1.0981019989116185, + "learning_rate": 9.727063480162226e-06, + "loss": 0.0192, + "step": 382 + }, + { + "epoch": 0.13303230288294546, + "grad_norm": 0.890921570685852, + "learning_rate": 9.725227092585824e-06, + "loss": 0.0291, + "step": 383 + }, + { + "epoch": 0.13337964571031607, + "grad_norm": 0.785030282186115, + "learning_rate": 9.723384722387516e-06, + "loss": 0.0239, + "step": 384 + }, + { + "epoch": 0.1337269885376867, + "grad_norm": 2.0060229292652942, + "learning_rate": 9.721536371899928e-06, + "loss": 0.0291, + "step": 385 + }, + { + "epoch": 0.1340743313650573, + "grad_norm": 1.0118306100174919, + "learning_rate": 9.719682043463261e-06, + "loss": 0.0151, + "step": 386 + }, + { + "epoch": 0.13442167419242793, + "grad_norm": 1.1099679117319492, + "learning_rate": 9.717821739425286e-06, + "loss": 0.0257, + "step": 387 + }, + { + "epoch": 0.13476901701979854, + "grad_norm": 0.6925372348746138, + "learning_rate": 9.71595546214134e-06, + "loss": 0.0302, + "step": 388 + }, + { + "epoch": 0.13511635984716916, + "grad_norm": 1.1588103017208862, + "learning_rate": 9.714083213974322e-06, + "loss": 0.0159, + "step": 389 + }, + { + "epoch": 0.13546370267453978, + "grad_norm": 1.6463962782244659, + "learning_rate": 9.712204997294685e-06, + "loss": 0.028, + "step": 390 + }, + { + "epoch": 0.1358110455019104, + "grad_norm": 0.740625572838366, + "learning_rate": 9.710320814480448e-06, + "loss": 0.0291, + "step": 391 + }, + { + "epoch": 0.136158388329281, + "grad_norm": 1.461026830177916, + "learning_rate": 9.708430667917179e-06, + "loss": 0.0266, + "step": 392 + }, + { + "epoch": 0.13650573115665163, + "grad_norm": 1.173157945166447, + "learning_rate": 9.706534559997997e-06, + "loss": 0.0183, + "step": 393 + }, + { + "epoch": 0.13685307398402222, + "grad_norm": 2.7283819002594334, + "learning_rate": 9.704632493123569e-06, + "loss": 0.031, + "step": 394 + }, + { + "epoch": 0.13720041681139283, + "grad_norm": 0.804775724073791, + "learning_rate": 9.702724469702107e-06, + "loss": 0.0209, + "step": 395 + }, + { + "epoch": 0.13754775963876345, + "grad_norm": 0.7136569776548899, + "learning_rate": 9.700810492149364e-06, + "loss": 0.0247, + "step": 396 + }, + { + "epoch": 0.13789510246613407, + "grad_norm": 0.8364128273663843, + "learning_rate": 9.698890562888632e-06, + "loss": 0.0208, + "step": 397 + }, + { + "epoch": 0.13824244529350468, + "grad_norm": 1.0792662917432296, + "learning_rate": 9.696964684350738e-06, + "loss": 0.0283, + "step": 398 + }, + { + "epoch": 0.1385897881208753, + "grad_norm": 1.0704903922122424, + "learning_rate": 9.695032858974042e-06, + "loss": 0.0233, + "step": 399 + }, + { + "epoch": 0.13893713094824592, + "grad_norm": 0.8609262980285204, + "learning_rate": 9.693095089204431e-06, + "loss": 0.0277, + "step": 400 + }, + { + "epoch": 0.13928447377561654, + "grad_norm": 0.38166608517210565, + "learning_rate": 9.691151377495324e-06, + "loss": 0.0198, + "step": 401 + }, + { + "epoch": 0.13963181660298715, + "grad_norm": 0.5729627674226186, + "learning_rate": 9.689201726307655e-06, + "loss": 0.0318, + "step": 402 + }, + { + "epoch": 0.13997915943035777, + "grad_norm": 1.975870726288315, + "learning_rate": 9.687246138109888e-06, + "loss": 0.0322, + "step": 403 + }, + { + "epoch": 0.14032650225772839, + "grad_norm": 2.1898213167704648, + "learning_rate": 9.68528461537799e-06, + "loss": 0.0304, + "step": 404 + }, + { + "epoch": 0.140673845085099, + "grad_norm": 0.620886516690147, + "learning_rate": 9.683317160595457e-06, + "loss": 0.0178, + "step": 405 + }, + { + "epoch": 0.14102118791246962, + "grad_norm": 0.751397853860502, + "learning_rate": 9.681343776253284e-06, + "loss": 0.0227, + "step": 406 + }, + { + "epoch": 0.1413685307398402, + "grad_norm": 1.5360568776293804, + "learning_rate": 9.679364464849983e-06, + "loss": 0.024, + "step": 407 + }, + { + "epoch": 0.14171587356721083, + "grad_norm": 1.9507404317073442, + "learning_rate": 9.67737922889156e-06, + "loss": 0.0299, + "step": 408 + }, + { + "epoch": 0.14206321639458144, + "grad_norm": 0.3885448349712659, + "learning_rate": 9.675388070891527e-06, + "loss": 0.0169, + "step": 409 + }, + { + "epoch": 0.14241055922195206, + "grad_norm": 0.7487805008765015, + "learning_rate": 9.6733909933709e-06, + "loss": 0.0258, + "step": 410 + }, + { + "epoch": 0.14275790204932268, + "grad_norm": 0.8741306809093075, + "learning_rate": 9.671387998858178e-06, + "loss": 0.029, + "step": 411 + }, + { + "epoch": 0.1431052448766933, + "grad_norm": 0.7791251348925693, + "learning_rate": 9.669379089889361e-06, + "loss": 0.0287, + "step": 412 + }, + { + "epoch": 0.1434525877040639, + "grad_norm": 1.6127960765398552, + "learning_rate": 9.66736426900793e-06, + "loss": 0.0207, + "step": 413 + }, + { + "epoch": 0.14379993053143453, + "grad_norm": 2.360372239374618, + "learning_rate": 9.66534353876486e-06, + "loss": 0.0257, + "step": 414 + }, + { + "epoch": 0.14414727335880514, + "grad_norm": 1.0591550444954392, + "learning_rate": 9.663316901718599e-06, + "loss": 0.0252, + "step": 415 + }, + { + "epoch": 0.14449461618617576, + "grad_norm": 0.6279163360317281, + "learning_rate": 9.661284360435075e-06, + "loss": 0.0168, + "step": 416 + }, + { + "epoch": 0.14484195901354638, + "grad_norm": 0.9205829596021957, + "learning_rate": 9.659245917487698e-06, + "loss": 0.028, + "step": 417 + }, + { + "epoch": 0.145189301840917, + "grad_norm": 0.7205918553309815, + "learning_rate": 9.657201575457346e-06, + "loss": 0.0239, + "step": 418 + }, + { + "epoch": 0.1455366446682876, + "grad_norm": 0.4050107455693852, + "learning_rate": 9.655151336932362e-06, + "loss": 0.0152, + "step": 419 + }, + { + "epoch": 0.1458839874956582, + "grad_norm": 1.5415577326332006, + "learning_rate": 9.653095204508562e-06, + "loss": 0.0222, + "step": 420 + }, + { + "epoch": 0.14623133032302882, + "grad_norm": 0.4578806435756731, + "learning_rate": 9.651033180789216e-06, + "loss": 0.016, + "step": 421 + }, + { + "epoch": 0.14657867315039944, + "grad_norm": 0.8614555068468486, + "learning_rate": 9.648965268385062e-06, + "loss": 0.0257, + "step": 422 + }, + { + "epoch": 0.14692601597777005, + "grad_norm": 1.470299031527, + "learning_rate": 9.646891469914285e-06, + "loss": 0.0368, + "step": 423 + }, + { + "epoch": 0.14727335880514067, + "grad_norm": 1.0949073778594718, + "learning_rate": 9.644811788002531e-06, + "loss": 0.033, + "step": 424 + }, + { + "epoch": 0.1476207016325113, + "grad_norm": 0.7401125451520727, + "learning_rate": 9.642726225282886e-06, + "loss": 0.0288, + "step": 425 + }, + { + "epoch": 0.1479680444598819, + "grad_norm": 2.2131411209283565, + "learning_rate": 9.64063478439589e-06, + "loss": 0.0175, + "step": 426 + }, + { + "epoch": 0.14831538728725252, + "grad_norm": 2.06363585907779, + "learning_rate": 9.638537467989517e-06, + "loss": 0.0276, + "step": 427 + }, + { + "epoch": 0.14866273011462314, + "grad_norm": 1.467812889206939, + "learning_rate": 9.63643427871919e-06, + "loss": 0.0291, + "step": 428 + }, + { + "epoch": 0.14901007294199375, + "grad_norm": 0.4505213888594297, + "learning_rate": 9.634325219247758e-06, + "loss": 0.0153, + "step": 429 + }, + { + "epoch": 0.14935741576936437, + "grad_norm": 1.3450924566368194, + "learning_rate": 9.632210292245508e-06, + "loss": 0.0225, + "step": 430 + }, + { + "epoch": 0.149704758596735, + "grad_norm": 2.347870224615922, + "learning_rate": 9.630089500390154e-06, + "loss": 0.028, + "step": 431 + }, + { + "epoch": 0.1500521014241056, + "grad_norm": 1.6462241884720297, + "learning_rate": 9.627962846366838e-06, + "loss": 0.024, + "step": 432 + }, + { + "epoch": 0.1503994442514762, + "grad_norm": 0.39196556018543693, + "learning_rate": 9.62583033286812e-06, + "loss": 0.0201, + "step": 433 + }, + { + "epoch": 0.1507467870788468, + "grad_norm": 1.5059296209518138, + "learning_rate": 9.62369196259398e-06, + "loss": 0.02, + "step": 434 + }, + { + "epoch": 0.15109412990621743, + "grad_norm": 0.45555178968971705, + "learning_rate": 9.621547738251816e-06, + "loss": 0.0221, + "step": 435 + }, + { + "epoch": 0.15144147273358805, + "grad_norm": 0.2882865925387099, + "learning_rate": 9.619397662556434e-06, + "loss": 0.0128, + "step": 436 + }, + { + "epoch": 0.15178881556095866, + "grad_norm": 0.6918769536905994, + "learning_rate": 9.617241738230051e-06, + "loss": 0.0242, + "step": 437 + }, + { + "epoch": 0.15213615838832928, + "grad_norm": 1.6957528332450178, + "learning_rate": 9.61507996800229e-06, + "loss": 0.0313, + "step": 438 + }, + { + "epoch": 0.1524835012156999, + "grad_norm": 1.317146675116354, + "learning_rate": 9.61291235461017e-06, + "loss": 0.0215, + "step": 439 + }, + { + "epoch": 0.1528308440430705, + "grad_norm": 0.727706359709951, + "learning_rate": 9.610738900798116e-06, + "loss": 0.0271, + "step": 440 + }, + { + "epoch": 0.15317818687044113, + "grad_norm": 3.5123826215661365, + "learning_rate": 9.60855960931794e-06, + "loss": 0.0383, + "step": 441 + }, + { + "epoch": 0.15352552969781175, + "grad_norm": 3.728862284149442, + "learning_rate": 9.606374482928849e-06, + "loss": 0.0318, + "step": 442 + }, + { + "epoch": 0.15387287252518236, + "grad_norm": 2.760620871544741, + "learning_rate": 9.604183524397439e-06, + "loss": 0.041, + "step": 443 + }, + { + "epoch": 0.15422021535255298, + "grad_norm": 3.1069152542667604, + "learning_rate": 9.601986736497686e-06, + "loss": 0.0325, + "step": 444 + }, + { + "epoch": 0.1545675581799236, + "grad_norm": 1.8573780055969038, + "learning_rate": 9.59978412201095e-06, + "loss": 0.0317, + "step": 445 + }, + { + "epoch": 0.1549149010072942, + "grad_norm": 1.0027964210682174, + "learning_rate": 9.597575683725965e-06, + "loss": 0.0227, + "step": 446 + }, + { + "epoch": 0.1552622438346648, + "grad_norm": 1.4276269980988359, + "learning_rate": 9.595361424438841e-06, + "loss": 0.0362, + "step": 447 + }, + { + "epoch": 0.15560958666203542, + "grad_norm": 1.317331217220333, + "learning_rate": 9.593141346953059e-06, + "loss": 0.0226, + "step": 448 + }, + { + "epoch": 0.15595692948940604, + "grad_norm": 0.5933495170717192, + "learning_rate": 9.590915454079463e-06, + "loss": 0.0227, + "step": 449 + }, + { + "epoch": 0.15630427231677665, + "grad_norm": 0.4622800957314312, + "learning_rate": 9.588683748636262e-06, + "loss": 0.0267, + "step": 450 + }, + { + "epoch": 0.15665161514414727, + "grad_norm": 0.7911625488227271, + "learning_rate": 9.586446233449024e-06, + "loss": 0.0246, + "step": 451 + }, + { + "epoch": 0.1569989579715179, + "grad_norm": 0.9702252686758073, + "learning_rate": 9.584202911350672e-06, + "loss": 0.0236, + "step": 452 + }, + { + "epoch": 0.1573463007988885, + "grad_norm": 0.6033941783861664, + "learning_rate": 9.581953785181482e-06, + "loss": 0.0214, + "step": 453 + }, + { + "epoch": 0.15769364362625912, + "grad_norm": 0.4655527204441117, + "learning_rate": 9.579698857789078e-06, + "loss": 0.0283, + "step": 454 + }, + { + "epoch": 0.15804098645362974, + "grad_norm": 0.5742266010042084, + "learning_rate": 9.577438132028431e-06, + "loss": 0.0313, + "step": 455 + }, + { + "epoch": 0.15838832928100036, + "grad_norm": 0.5365426672112383, + "learning_rate": 9.575171610761848e-06, + "loss": 0.0281, + "step": 456 + }, + { + "epoch": 0.15873567210837097, + "grad_norm": 0.4233818161136484, + "learning_rate": 9.572899296858981e-06, + "loss": 0.0315, + "step": 457 + }, + { + "epoch": 0.1590830149357416, + "grad_norm": 0.5680963519098914, + "learning_rate": 9.570621193196811e-06, + "loss": 0.0269, + "step": 458 + }, + { + "epoch": 0.15943035776311218, + "grad_norm": 0.5125465753073855, + "learning_rate": 9.568337302659652e-06, + "loss": 0.0167, + "step": 459 + }, + { + "epoch": 0.1597777005904828, + "grad_norm": 0.3119364502263281, + "learning_rate": 9.566047628139142e-06, + "loss": 0.021, + "step": 460 + }, + { + "epoch": 0.1601250434178534, + "grad_norm": 0.9906415221821926, + "learning_rate": 9.563752172534242e-06, + "loss": 0.0235, + "step": 461 + }, + { + "epoch": 0.16047238624522403, + "grad_norm": 0.8066115586763946, + "learning_rate": 9.561450938751238e-06, + "loss": 0.0229, + "step": 462 + }, + { + "epoch": 0.16081972907259465, + "grad_norm": 1.005027835921377, + "learning_rate": 9.559143929703724e-06, + "loss": 0.0203, + "step": 463 + }, + { + "epoch": 0.16116707189996526, + "grad_norm": 0.32646704240882035, + "learning_rate": 9.556831148312612e-06, + "loss": 0.0183, + "step": 464 + }, + { + "epoch": 0.16151441472733588, + "grad_norm": 0.8970575237008778, + "learning_rate": 9.554512597506122e-06, + "loss": 0.0237, + "step": 465 + }, + { + "epoch": 0.1618617575547065, + "grad_norm": 1.9756358587933307, + "learning_rate": 9.552188280219773e-06, + "loss": 0.0232, + "step": 466 + }, + { + "epoch": 0.16220910038207711, + "grad_norm": 0.659327280239366, + "learning_rate": 9.549858199396394e-06, + "loss": 0.0154, + "step": 467 + }, + { + "epoch": 0.16255644320944773, + "grad_norm": 0.9285348605348003, + "learning_rate": 9.547522357986102e-06, + "loss": 0.0254, + "step": 468 + }, + { + "epoch": 0.16290378603681835, + "grad_norm": 0.8854039259165357, + "learning_rate": 9.545180758946312e-06, + "loss": 0.023, + "step": 469 + }, + { + "epoch": 0.16325112886418897, + "grad_norm": 0.7572168628873044, + "learning_rate": 9.542833405241729e-06, + "loss": 0.0229, + "step": 470 + }, + { + "epoch": 0.16359847169155958, + "grad_norm": 1.1833229445171547, + "learning_rate": 9.540480299844345e-06, + "loss": 0.0194, + "step": 471 + }, + { + "epoch": 0.16394581451893017, + "grad_norm": 1.128447523594609, + "learning_rate": 9.538121445733431e-06, + "loss": 0.0312, + "step": 472 + }, + { + "epoch": 0.1642931573463008, + "grad_norm": 0.8646345138050862, + "learning_rate": 9.53575684589554e-06, + "loss": 0.034, + "step": 473 + }, + { + "epoch": 0.1646405001736714, + "grad_norm": 2.142392363566201, + "learning_rate": 9.533386503324495e-06, + "loss": 0.0267, + "step": 474 + }, + { + "epoch": 0.16498784300104202, + "grad_norm": 0.31920633758411143, + "learning_rate": 9.531010421021396e-06, + "loss": 0.0264, + "step": 475 + }, + { + "epoch": 0.16533518582841264, + "grad_norm": 1.1720964352470096, + "learning_rate": 9.528628601994603e-06, + "loss": 0.0203, + "step": 476 + }, + { + "epoch": 0.16568252865578326, + "grad_norm": 0.3038247546428463, + "learning_rate": 9.526241049259746e-06, + "loss": 0.0178, + "step": 477 + }, + { + "epoch": 0.16602987148315387, + "grad_norm": 1.4917694833284774, + "learning_rate": 9.523847765839712e-06, + "loss": 0.0242, + "step": 478 + }, + { + "epoch": 0.1663772143105245, + "grad_norm": 0.818082889175988, + "learning_rate": 9.52144875476464e-06, + "loss": 0.0222, + "step": 479 + }, + { + "epoch": 0.1667245571378951, + "grad_norm": 0.3877555907825711, + "learning_rate": 9.519044019071926e-06, + "loss": 0.0243, + "step": 480 + }, + { + "epoch": 0.16707189996526572, + "grad_norm": 0.4639811409245256, + "learning_rate": 9.51663356180621e-06, + "loss": 0.0251, + "step": 481 + }, + { + "epoch": 0.16741924279263634, + "grad_norm": 0.8099085137002117, + "learning_rate": 9.514217386019381e-06, + "loss": 0.0303, + "step": 482 + }, + { + "epoch": 0.16776658562000696, + "grad_norm": 0.9835471538813636, + "learning_rate": 9.511795494770563e-06, + "loss": 0.0259, + "step": 483 + }, + { + "epoch": 0.16811392844737758, + "grad_norm": 0.6270381501097829, + "learning_rate": 9.509367891126122e-06, + "loss": 0.0159, + "step": 484 + }, + { + "epoch": 0.16846127127474816, + "grad_norm": 0.4636485528169751, + "learning_rate": 9.506934578159648e-06, + "loss": 0.0238, + "step": 485 + }, + { + "epoch": 0.16880861410211878, + "grad_norm": 0.6141677857665021, + "learning_rate": 9.50449555895197e-06, + "loss": 0.0204, + "step": 486 + }, + { + "epoch": 0.1691559569294894, + "grad_norm": 0.9292538654705289, + "learning_rate": 9.50205083659113e-06, + "loss": 0.0327, + "step": 487 + }, + { + "epoch": 0.16950329975686002, + "grad_norm": 1.1196044763325756, + "learning_rate": 9.499600414172402e-06, + "loss": 0.0222, + "step": 488 + }, + { + "epoch": 0.16985064258423063, + "grad_norm": 1.5976292188516854, + "learning_rate": 9.49714429479827e-06, + "loss": 0.0324, + "step": 489 + }, + { + "epoch": 0.17019798541160125, + "grad_norm": 1.2857006123730916, + "learning_rate": 9.494682481578436e-06, + "loss": 0.0205, + "step": 490 + }, + { + "epoch": 0.17054532823897187, + "grad_norm": 1.276460947257805, + "learning_rate": 9.492214977629804e-06, + "loss": 0.016, + "step": 491 + }, + { + "epoch": 0.17089267106634248, + "grad_norm": 1.252602259656139, + "learning_rate": 9.489741786076488e-06, + "loss": 0.0237, + "step": 492 + }, + { + "epoch": 0.1712400138937131, + "grad_norm": 0.5734672626180113, + "learning_rate": 9.487262910049804e-06, + "loss": 0.017, + "step": 493 + }, + { + "epoch": 0.17158735672108372, + "grad_norm": 1.2737930527140826, + "learning_rate": 9.48477835268826e-06, + "loss": 0.0231, + "step": 494 + }, + { + "epoch": 0.17193469954845433, + "grad_norm": 0.9557221623397816, + "learning_rate": 9.482288117137561e-06, + "loss": 0.0186, + "step": 495 + }, + { + "epoch": 0.17228204237582495, + "grad_norm": 1.7629581363677385, + "learning_rate": 9.479792206550604e-06, + "loss": 0.0333, + "step": 496 + }, + { + "epoch": 0.17262938520319557, + "grad_norm": 0.4160720670777094, + "learning_rate": 9.477290624087464e-06, + "loss": 0.0207, + "step": 497 + }, + { + "epoch": 0.17297672803056616, + "grad_norm": 1.657594987733875, + "learning_rate": 9.474783372915401e-06, + "loss": 0.0187, + "step": 498 + }, + { + "epoch": 0.17332407085793677, + "grad_norm": 0.6777171710639321, + "learning_rate": 9.472270456208856e-06, + "loss": 0.0311, + "step": 499 + }, + { + "epoch": 0.1736714136853074, + "grad_norm": 0.5649689115711107, + "learning_rate": 9.469751877149434e-06, + "loss": 0.0319, + "step": 500 + }, + { + "epoch": 0.174018756512678, + "grad_norm": 1.440887458216018, + "learning_rate": 9.467227638925917e-06, + "loss": 0.0314, + "step": 501 + }, + { + "epoch": 0.17436609934004862, + "grad_norm": 1.4716216093022831, + "learning_rate": 9.464697744734248e-06, + "loss": 0.0228, + "step": 502 + }, + { + "epoch": 0.17471344216741924, + "grad_norm": 0.8445232330484052, + "learning_rate": 9.462162197777533e-06, + "loss": 0.0298, + "step": 503 + }, + { + "epoch": 0.17506078499478986, + "grad_norm": 0.8274867059781084, + "learning_rate": 9.459621001266036e-06, + "loss": 0.0253, + "step": 504 + }, + { + "epoch": 0.17540812782216048, + "grad_norm": 1.3093883304254912, + "learning_rate": 9.45707415841717e-06, + "loss": 0.0284, + "step": 505 + }, + { + "epoch": 0.1757554706495311, + "grad_norm": 0.4071291730241176, + "learning_rate": 9.454521672455501e-06, + "loss": 0.0235, + "step": 506 + }, + { + "epoch": 0.1761028134769017, + "grad_norm": 1.1764547349254069, + "learning_rate": 9.451963546612737e-06, + "loss": 0.0195, + "step": 507 + }, + { + "epoch": 0.17645015630427233, + "grad_norm": 0.6569963887274749, + "learning_rate": 9.449399784127726e-06, + "loss": 0.0264, + "step": 508 + }, + { + "epoch": 0.17679749913164294, + "grad_norm": 0.5238198578448529, + "learning_rate": 9.446830388246457e-06, + "loss": 0.0194, + "step": 509 + }, + { + "epoch": 0.17714484195901353, + "grad_norm": 0.7417799180982843, + "learning_rate": 9.444255362222046e-06, + "loss": 0.0247, + "step": 510 + }, + { + "epoch": 0.17749218478638415, + "grad_norm": 1.3703456770005742, + "learning_rate": 9.441674709314743e-06, + "loss": 0.0274, + "step": 511 + }, + { + "epoch": 0.17783952761375477, + "grad_norm": 1.4652033192638536, + "learning_rate": 9.439088432791916e-06, + "loss": 0.0139, + "step": 512 + }, + { + "epoch": 0.17818687044112538, + "grad_norm": 1.5832446589993328, + "learning_rate": 9.436496535928057e-06, + "loss": 0.034, + "step": 513 + }, + { + "epoch": 0.178534213268496, + "grad_norm": 1.9119409177551294, + "learning_rate": 9.433899022004774e-06, + "loss": 0.0251, + "step": 514 + }, + { + "epoch": 0.17888155609586662, + "grad_norm": 0.788548038638029, + "learning_rate": 9.431295894310786e-06, + "loss": 0.0308, + "step": 515 + }, + { + "epoch": 0.17922889892323723, + "grad_norm": 0.8067843750520071, + "learning_rate": 9.428687156141919e-06, + "loss": 0.0234, + "step": 516 + }, + { + "epoch": 0.17957624175060785, + "grad_norm": 1.1709927168009244, + "learning_rate": 9.426072810801104e-06, + "loss": 0.0319, + "step": 517 + }, + { + "epoch": 0.17992358457797847, + "grad_norm": 0.6216918250812782, + "learning_rate": 9.423452861598367e-06, + "loss": 0.0243, + "step": 518 + }, + { + "epoch": 0.18027092740534909, + "grad_norm": 0.8872099116597142, + "learning_rate": 9.420827311850836e-06, + "loss": 0.0252, + "step": 519 + }, + { + "epoch": 0.1806182702327197, + "grad_norm": 0.586557224997833, + "learning_rate": 9.418196164882725e-06, + "loss": 0.0267, + "step": 520 + }, + { + "epoch": 0.18096561306009032, + "grad_norm": 0.5527086813309143, + "learning_rate": 9.415559424025335e-06, + "loss": 0.0212, + "step": 521 + }, + { + "epoch": 0.18131295588746094, + "grad_norm": 0.6211483589771798, + "learning_rate": 9.41291709261705e-06, + "loss": 0.018, + "step": 522 + }, + { + "epoch": 0.18166029871483153, + "grad_norm": 0.5607071996617911, + "learning_rate": 9.410269174003333e-06, + "loss": 0.0208, + "step": 523 + }, + { + "epoch": 0.18200764154220214, + "grad_norm": 1.8330990714819673, + "learning_rate": 9.407615671536723e-06, + "loss": 0.0306, + "step": 524 + }, + { + "epoch": 0.18235498436957276, + "grad_norm": 0.7432517423995653, + "learning_rate": 9.404956588576822e-06, + "loss": 0.0266, + "step": 525 + }, + { + "epoch": 0.18270232719694338, + "grad_norm": 0.5660339296085843, + "learning_rate": 9.402291928490302e-06, + "loss": 0.0184, + "step": 526 + }, + { + "epoch": 0.183049670024314, + "grad_norm": 0.7219272711113525, + "learning_rate": 9.399621694650898e-06, + "loss": 0.0285, + "step": 527 + }, + { + "epoch": 0.1833970128516846, + "grad_norm": 1.778600324374738, + "learning_rate": 9.3969458904394e-06, + "loss": 0.0345, + "step": 528 + }, + { + "epoch": 0.18374435567905523, + "grad_norm": 1.6387273409591696, + "learning_rate": 9.394264519243649e-06, + "loss": 0.0273, + "step": 529 + }, + { + "epoch": 0.18409169850642584, + "grad_norm": 0.8842030070616902, + "learning_rate": 9.391577584458536e-06, + "loss": 0.0203, + "step": 530 + }, + { + "epoch": 0.18443904133379646, + "grad_norm": 0.9269965797470187, + "learning_rate": 9.388885089485995e-06, + "loss": 0.018, + "step": 531 + }, + { + "epoch": 0.18478638416116708, + "grad_norm": 0.47186675698812003, + "learning_rate": 9.386187037735004e-06, + "loss": 0.0208, + "step": 532 + }, + { + "epoch": 0.1851337269885377, + "grad_norm": 0.6914305009354391, + "learning_rate": 9.383483432621569e-06, + "loss": 0.021, + "step": 533 + }, + { + "epoch": 0.1854810698159083, + "grad_norm": 0.5583142545579645, + "learning_rate": 9.380774277568733e-06, + "loss": 0.0169, + "step": 534 + }, + { + "epoch": 0.18582841264327893, + "grad_norm": 0.5064603142442761, + "learning_rate": 9.378059576006567e-06, + "loss": 0.0263, + "step": 535 + }, + { + "epoch": 0.18617575547064952, + "grad_norm": 0.5330806328958541, + "learning_rate": 9.375339331372155e-06, + "loss": 0.0191, + "step": 536 + }, + { + "epoch": 0.18652309829802013, + "grad_norm": 1.945098806666559, + "learning_rate": 9.37261354710961e-06, + "loss": 0.0291, + "step": 537 + }, + { + "epoch": 0.18687044112539075, + "grad_norm": 2.044217190353534, + "learning_rate": 9.369882226670054e-06, + "loss": 0.0258, + "step": 538 + }, + { + "epoch": 0.18721778395276137, + "grad_norm": 0.6771224467650478, + "learning_rate": 9.36714537351162e-06, + "loss": 0.0168, + "step": 539 + }, + { + "epoch": 0.18756512678013199, + "grad_norm": 0.695744615748055, + "learning_rate": 9.36440299109944e-06, + "loss": 0.015, + "step": 540 + }, + { + "epoch": 0.1879124696075026, + "grad_norm": 0.4503395885743224, + "learning_rate": 9.361655082905654e-06, + "loss": 0.013, + "step": 541 + }, + { + "epoch": 0.18825981243487322, + "grad_norm": 1.0753578393601884, + "learning_rate": 9.358901652409398e-06, + "loss": 0.018, + "step": 542 + }, + { + "epoch": 0.18860715526224384, + "grad_norm": 0.49649547384514064, + "learning_rate": 9.356142703096793e-06, + "loss": 0.0147, + "step": 543 + }, + { + "epoch": 0.18895449808961445, + "grad_norm": 1.5646023573061925, + "learning_rate": 9.353378238460955e-06, + "loss": 0.0237, + "step": 544 + }, + { + "epoch": 0.18930184091698507, + "grad_norm": 0.7887255494483407, + "learning_rate": 9.350608262001978e-06, + "loss": 0.0175, + "step": 545 + }, + { + "epoch": 0.1896491837443557, + "grad_norm": 1.0681506636597586, + "learning_rate": 9.347832777226936e-06, + "loss": 0.0186, + "step": 546 + }, + { + "epoch": 0.1899965265717263, + "grad_norm": 0.7785556397165594, + "learning_rate": 9.345051787649877e-06, + "loss": 0.0184, + "step": 547 + }, + { + "epoch": 0.19034386939909692, + "grad_norm": 0.509526910733021, + "learning_rate": 9.34226529679182e-06, + "loss": 0.0225, + "step": 548 + }, + { + "epoch": 0.1906912122264675, + "grad_norm": 1.319059024617917, + "learning_rate": 9.339473308180746e-06, + "loss": 0.0214, + "step": 549 + }, + { + "epoch": 0.19103855505383813, + "grad_norm": 0.5224211698738431, + "learning_rate": 9.336675825351602e-06, + "loss": 0.0257, + "step": 550 + }, + { + "epoch": 0.19138589788120874, + "grad_norm": 1.1460327083281117, + "learning_rate": 9.333872851846285e-06, + "loss": 0.0255, + "step": 551 + }, + { + "epoch": 0.19173324070857936, + "grad_norm": 0.7498296132522941, + "learning_rate": 9.33106439121365e-06, + "loss": 0.0262, + "step": 552 + }, + { + "epoch": 0.19208058353594998, + "grad_norm": 0.38935101782706116, + "learning_rate": 9.328250447009493e-06, + "loss": 0.0147, + "step": 553 + }, + { + "epoch": 0.1924279263633206, + "grad_norm": 0.8699070629651166, + "learning_rate": 9.325431022796559e-06, + "loss": 0.0213, + "step": 554 + }, + { + "epoch": 0.1927752691906912, + "grad_norm": 0.8664760328709794, + "learning_rate": 9.322606122144524e-06, + "loss": 0.0236, + "step": 555 + }, + { + "epoch": 0.19312261201806183, + "grad_norm": 2.250608635056857, + "learning_rate": 9.319775748630004e-06, + "loss": 0.0283, + "step": 556 + }, + { + "epoch": 0.19346995484543245, + "grad_norm": 1.7828983126970328, + "learning_rate": 9.316939905836543e-06, + "loss": 0.0182, + "step": 557 + }, + { + "epoch": 0.19381729767280306, + "grad_norm": 0.5137189754751468, + "learning_rate": 9.314098597354608e-06, + "loss": 0.0236, + "step": 558 + }, + { + "epoch": 0.19416464050017368, + "grad_norm": 0.6266424659785979, + "learning_rate": 9.311251826781587e-06, + "loss": 0.0228, + "step": 559 + }, + { + "epoch": 0.1945119833275443, + "grad_norm": 1.662923917212151, + "learning_rate": 9.308399597721782e-06, + "loss": 0.0232, + "step": 560 + }, + { + "epoch": 0.1948593261549149, + "grad_norm": 0.7492399109293421, + "learning_rate": 9.305541913786409e-06, + "loss": 0.0224, + "step": 561 + }, + { + "epoch": 0.1952066689822855, + "grad_norm": 0.5213910576317119, + "learning_rate": 9.302678778593586e-06, + "loss": 0.0232, + "step": 562 + }, + { + "epoch": 0.19555401180965612, + "grad_norm": 1.4562306345231064, + "learning_rate": 9.299810195768341e-06, + "loss": 0.0282, + "step": 563 + }, + { + "epoch": 0.19590135463702674, + "grad_norm": 0.9407546811393638, + "learning_rate": 9.296936168942589e-06, + "loss": 0.0199, + "step": 564 + }, + { + "epoch": 0.19624869746439735, + "grad_norm": 0.8454850863651415, + "learning_rate": 9.294056701755144e-06, + "loss": 0.0169, + "step": 565 + }, + { + "epoch": 0.19659604029176797, + "grad_norm": 0.27061382482871477, + "learning_rate": 9.291171797851708e-06, + "loss": 0.0131, + "step": 566 + }, + { + "epoch": 0.1969433831191386, + "grad_norm": 0.49862269994808495, + "learning_rate": 9.288281460884864e-06, + "loss": 0.0171, + "step": 567 + }, + { + "epoch": 0.1972907259465092, + "grad_norm": 0.8522107595930313, + "learning_rate": 9.285385694514075e-06, + "loss": 0.0218, + "step": 568 + }, + { + "epoch": 0.19763806877387982, + "grad_norm": 0.8148444399323233, + "learning_rate": 9.282484502405677e-06, + "loss": 0.021, + "step": 569 + }, + { + "epoch": 0.19798541160125044, + "grad_norm": 1.375134823060513, + "learning_rate": 9.27957788823288e-06, + "loss": 0.0298, + "step": 570 + }, + { + "epoch": 0.19833275442862106, + "grad_norm": 1.1166507276668756, + "learning_rate": 9.276665855675751e-06, + "loss": 0.0186, + "step": 571 + }, + { + "epoch": 0.19868009725599167, + "grad_norm": 1.3330794003539927, + "learning_rate": 9.273748408421224e-06, + "loss": 0.0254, + "step": 572 + }, + { + "epoch": 0.1990274400833623, + "grad_norm": 0.369557234187912, + "learning_rate": 9.270825550163088e-06, + "loss": 0.0144, + "step": 573 + }, + { + "epoch": 0.1993747829107329, + "grad_norm": 0.5934833218153897, + "learning_rate": 9.267897284601976e-06, + "loss": 0.0234, + "step": 574 + }, + { + "epoch": 0.1997221257381035, + "grad_norm": 0.5376211438611554, + "learning_rate": 9.264963615445378e-06, + "loss": 0.0159, + "step": 575 + }, + { + "epoch": 0.2000694685654741, + "grad_norm": 0.5613556695221377, + "learning_rate": 9.26202454640762e-06, + "loss": 0.0193, + "step": 576 + }, + { + "epoch": 0.20041681139284473, + "grad_norm": 0.8368239688441027, + "learning_rate": 9.259080081209861e-06, + "loss": 0.023, + "step": 577 + }, + { + "epoch": 0.20076415422021535, + "grad_norm": 1.2341577296539838, + "learning_rate": 9.256130223580096e-06, + "loss": 0.0198, + "step": 578 + }, + { + "epoch": 0.20111149704758596, + "grad_norm": 0.3531135473026086, + "learning_rate": 9.25317497725315e-06, + "loss": 0.0081, + "step": 579 + }, + { + "epoch": 0.20145883987495658, + "grad_norm": 0.7328484748799605, + "learning_rate": 9.250214345970665e-06, + "loss": 0.0184, + "step": 580 + }, + { + "epoch": 0.2018061827023272, + "grad_norm": 1.9853290246566473, + "learning_rate": 9.247248333481105e-06, + "loss": 0.0214, + "step": 581 + }, + { + "epoch": 0.20215352552969781, + "grad_norm": 1.4811646122579638, + "learning_rate": 9.244276943539746e-06, + "loss": 0.0241, + "step": 582 + }, + { + "epoch": 0.20250086835706843, + "grad_norm": 1.272707363559077, + "learning_rate": 9.241300179908672e-06, + "loss": 0.0339, + "step": 583 + }, + { + "epoch": 0.20284821118443905, + "grad_norm": 0.6314607624470274, + "learning_rate": 9.238318046356772e-06, + "loss": 0.0245, + "step": 584 + }, + { + "epoch": 0.20319555401180966, + "grad_norm": 1.0288562093872682, + "learning_rate": 9.235330546659731e-06, + "loss": 0.0232, + "step": 585 + }, + { + "epoch": 0.20354289683918028, + "grad_norm": 0.6216044232979469, + "learning_rate": 9.23233768460003e-06, + "loss": 0.0163, + "step": 586 + }, + { + "epoch": 0.2038902396665509, + "grad_norm": 0.7839133361622146, + "learning_rate": 9.229339463966942e-06, + "loss": 0.017, + "step": 587 + }, + { + "epoch": 0.2042375824939215, + "grad_norm": 1.065822018842104, + "learning_rate": 9.226335888556517e-06, + "loss": 0.0195, + "step": 588 + }, + { + "epoch": 0.2045849253212921, + "grad_norm": 1.0799416510268132, + "learning_rate": 9.223326962171594e-06, + "loss": 0.0329, + "step": 589 + }, + { + "epoch": 0.20493226814866272, + "grad_norm": 2.7643010329522597, + "learning_rate": 9.22031268862178e-06, + "loss": 0.0276, + "step": 590 + }, + { + "epoch": 0.20527961097603334, + "grad_norm": 2.7311705318356703, + "learning_rate": 9.217293071723455e-06, + "loss": 0.0267, + "step": 591 + }, + { + "epoch": 0.20562695380340396, + "grad_norm": 0.5493925419488683, + "learning_rate": 9.214268115299761e-06, + "loss": 0.0264, + "step": 592 + }, + { + "epoch": 0.20597429663077457, + "grad_norm": 1.2139756694249682, + "learning_rate": 9.211237823180605e-06, + "loss": 0.0216, + "step": 593 + }, + { + "epoch": 0.2063216394581452, + "grad_norm": 0.4237660059277888, + "learning_rate": 9.208202199202649e-06, + "loss": 0.0141, + "step": 594 + }, + { + "epoch": 0.2066689822855158, + "grad_norm": 0.8419781686821314, + "learning_rate": 9.205161247209303e-06, + "loss": 0.0166, + "step": 595 + }, + { + "epoch": 0.20701632511288642, + "grad_norm": 1.0055329342881736, + "learning_rate": 9.202114971050722e-06, + "loss": 0.0269, + "step": 596 + }, + { + "epoch": 0.20736366794025704, + "grad_norm": 0.58123379031455, + "learning_rate": 9.199063374583807e-06, + "loss": 0.0272, + "step": 597 + }, + { + "epoch": 0.20771101076762766, + "grad_norm": 1.258373507803076, + "learning_rate": 9.19600646167219e-06, + "loss": 0.0243, + "step": 598 + }, + { + "epoch": 0.20805835359499827, + "grad_norm": 0.8337160786397171, + "learning_rate": 9.192944236186237e-06, + "loss": 0.0175, + "step": 599 + }, + { + "epoch": 0.2084056964223689, + "grad_norm": 0.5397096011880417, + "learning_rate": 9.189876702003037e-06, + "loss": 0.0268, + "step": 600 + }, + { + "epoch": 0.20875303924973948, + "grad_norm": 2.0609037827991252, + "learning_rate": 9.186803863006408e-06, + "loss": 0.0288, + "step": 601 + }, + { + "epoch": 0.2091003820771101, + "grad_norm": 1.1561596599740218, + "learning_rate": 9.183725723086873e-06, + "loss": 0.0149, + "step": 602 + }, + { + "epoch": 0.20944772490448071, + "grad_norm": 1.7470410681014348, + "learning_rate": 9.180642286141678e-06, + "loss": 0.0284, + "step": 603 + }, + { + "epoch": 0.20979506773185133, + "grad_norm": 0.44157047622672, + "learning_rate": 9.177553556074766e-06, + "loss": 0.0204, + "step": 604 + }, + { + "epoch": 0.21014241055922195, + "grad_norm": 0.44448775934421736, + "learning_rate": 9.17445953679679e-06, + "loss": 0.0188, + "step": 605 + }, + { + "epoch": 0.21048975338659257, + "grad_norm": 1.4869913866249045, + "learning_rate": 9.171360232225091e-06, + "loss": 0.0216, + "step": 606 + }, + { + "epoch": 0.21083709621396318, + "grad_norm": 0.7283847092929767, + "learning_rate": 9.16825564628371e-06, + "loss": 0.0226, + "step": 607 + }, + { + "epoch": 0.2111844390413338, + "grad_norm": 0.44538635448331376, + "learning_rate": 9.165145782903369e-06, + "loss": 0.0112, + "step": 608 + }, + { + "epoch": 0.21153178186870442, + "grad_norm": 0.5848098598044253, + "learning_rate": 9.162030646021477e-06, + "loss": 0.0272, + "step": 609 + }, + { + "epoch": 0.21187912469607503, + "grad_norm": 0.6127021429282402, + "learning_rate": 9.15891023958211e-06, + "loss": 0.0183, + "step": 610 + }, + { + "epoch": 0.21222646752344565, + "grad_norm": 0.8170743093460934, + "learning_rate": 9.15578456753603e-06, + "loss": 0.0283, + "step": 611 + }, + { + "epoch": 0.21257381035081627, + "grad_norm": 1.1249613859894405, + "learning_rate": 9.152653633840654e-06, + "loss": 0.026, + "step": 612 + }, + { + "epoch": 0.21292115317818688, + "grad_norm": 0.985266358777264, + "learning_rate": 9.149517442460065e-06, + "loss": 0.0242, + "step": 613 + }, + { + "epoch": 0.21326849600555747, + "grad_norm": 0.5401136574602342, + "learning_rate": 9.146375997365006e-06, + "loss": 0.0236, + "step": 614 + }, + { + "epoch": 0.2136158388329281, + "grad_norm": 0.8086803852304764, + "learning_rate": 9.143229302532866e-06, + "loss": 0.0258, + "step": 615 + }, + { + "epoch": 0.2139631816602987, + "grad_norm": 0.8975085166076582, + "learning_rate": 9.140077361947681e-06, + "loss": 0.0194, + "step": 616 + }, + { + "epoch": 0.21431052448766932, + "grad_norm": 1.1788638290843472, + "learning_rate": 9.136920179600137e-06, + "loss": 0.0204, + "step": 617 + }, + { + "epoch": 0.21465786731503994, + "grad_norm": 0.5154845254827837, + "learning_rate": 9.133757759487545e-06, + "loss": 0.021, + "step": 618 + }, + { + "epoch": 0.21500521014241056, + "grad_norm": 0.8015539062724568, + "learning_rate": 9.130590105613854e-06, + "loss": 0.0227, + "step": 619 + }, + { + "epoch": 0.21535255296978117, + "grad_norm": 1.5385432558467305, + "learning_rate": 9.127417221989643e-06, + "loss": 0.0423, + "step": 620 + }, + { + "epoch": 0.2156998957971518, + "grad_norm": 0.9984997426393253, + "learning_rate": 9.1242391126321e-06, + "loss": 0.0217, + "step": 621 + }, + { + "epoch": 0.2160472386245224, + "grad_norm": 0.28392172266267085, + "learning_rate": 9.121055781565044e-06, + "loss": 0.0238, + "step": 622 + }, + { + "epoch": 0.21639458145189303, + "grad_norm": 1.264151293918839, + "learning_rate": 9.117867232818897e-06, + "loss": 0.0209, + "step": 623 + }, + { + "epoch": 0.21674192427926364, + "grad_norm": 1.0368184665287132, + "learning_rate": 9.114673470430688e-06, + "loss": 0.021, + "step": 624 + }, + { + "epoch": 0.21708926710663426, + "grad_norm": 0.6918438110380035, + "learning_rate": 9.111474498444046e-06, + "loss": 0.0263, + "step": 625 + }, + { + "epoch": 0.21743660993400488, + "grad_norm": 2.1367175315409477, + "learning_rate": 9.1082703209092e-06, + "loss": 0.0295, + "step": 626 + }, + { + "epoch": 0.21778395276137547, + "grad_norm": 0.7119870045258382, + "learning_rate": 9.105060941882966e-06, + "loss": 0.023, + "step": 627 + }, + { + "epoch": 0.21813129558874608, + "grad_norm": 1.14033999216725, + "learning_rate": 9.101846365428747e-06, + "loss": 0.0164, + "step": 628 + }, + { + "epoch": 0.2184786384161167, + "grad_norm": 0.4345990776226327, + "learning_rate": 9.098626595616527e-06, + "loss": 0.0226, + "step": 629 + }, + { + "epoch": 0.21882598124348732, + "grad_norm": 1.4156487567989067, + "learning_rate": 9.095401636522863e-06, + "loss": 0.0254, + "step": 630 + }, + { + "epoch": 0.21917332407085793, + "grad_norm": 1.7788155380270627, + "learning_rate": 9.092171492230883e-06, + "loss": 0.0287, + "step": 631 + }, + { + "epoch": 0.21952066689822855, + "grad_norm": 0.9127292614192289, + "learning_rate": 9.088936166830285e-06, + "loss": 0.0266, + "step": 632 + }, + { + "epoch": 0.21986800972559917, + "grad_norm": 0.847423011464306, + "learning_rate": 9.08569566441732e-06, + "loss": 0.0232, + "step": 633 + }, + { + "epoch": 0.22021535255296978, + "grad_norm": 0.43930165735953547, + "learning_rate": 9.082449989094798e-06, + "loss": 0.0183, + "step": 634 + }, + { + "epoch": 0.2205626953803404, + "grad_norm": 0.7809095217627077, + "learning_rate": 9.079199144972072e-06, + "loss": 0.014, + "step": 635 + }, + { + "epoch": 0.22091003820771102, + "grad_norm": 1.0253606245776024, + "learning_rate": 9.075943136165049e-06, + "loss": 0.0156, + "step": 636 + }, + { + "epoch": 0.22125738103508164, + "grad_norm": 1.5700732329185856, + "learning_rate": 9.072681966796169e-06, + "loss": 0.0255, + "step": 637 + }, + { + "epoch": 0.22160472386245225, + "grad_norm": 0.49771019117253024, + "learning_rate": 9.069415640994403e-06, + "loss": 0.0243, + "step": 638 + }, + { + "epoch": 0.22195206668982287, + "grad_norm": 0.46212016511074455, + "learning_rate": 9.066144162895259e-06, + "loss": 0.023, + "step": 639 + }, + { + "epoch": 0.22229940951719346, + "grad_norm": 0.7782758261073233, + "learning_rate": 9.062867536640762e-06, + "loss": 0.017, + "step": 640 + }, + { + "epoch": 0.22264675234456408, + "grad_norm": 0.7838906985841836, + "learning_rate": 9.059585766379455e-06, + "loss": 0.0165, + "step": 641 + }, + { + "epoch": 0.2229940951719347, + "grad_norm": 1.011196077425446, + "learning_rate": 9.056298856266399e-06, + "loss": 0.0218, + "step": 642 + }, + { + "epoch": 0.2233414379993053, + "grad_norm": 1.518375606641246, + "learning_rate": 9.053006810463156e-06, + "loss": 0.0254, + "step": 643 + }, + { + "epoch": 0.22368878082667593, + "grad_norm": 0.5838655335928358, + "learning_rate": 9.049709633137796e-06, + "loss": 0.0255, + "step": 644 + }, + { + "epoch": 0.22403612365404654, + "grad_norm": 0.7210470798591194, + "learning_rate": 9.04640732846488e-06, + "loss": 0.0238, + "step": 645 + }, + { + "epoch": 0.22438346648141716, + "grad_norm": 0.6437322596951384, + "learning_rate": 9.043099900625468e-06, + "loss": 0.0237, + "step": 646 + }, + { + "epoch": 0.22473080930878778, + "grad_norm": 1.0308720116120658, + "learning_rate": 9.039787353807101e-06, + "loss": 0.0305, + "step": 647 + }, + { + "epoch": 0.2250781521361584, + "grad_norm": 1.6497385641943867, + "learning_rate": 9.036469692203804e-06, + "loss": 0.0296, + "step": 648 + }, + { + "epoch": 0.225425494963529, + "grad_norm": 0.3313837148743059, + "learning_rate": 9.033146920016073e-06, + "loss": 0.0224, + "step": 649 + }, + { + "epoch": 0.22577283779089963, + "grad_norm": 0.5293106608754529, + "learning_rate": 9.029819041450884e-06, + "loss": 0.0203, + "step": 650 + }, + { + "epoch": 0.22612018061827024, + "grad_norm": 1.1616316561406497, + "learning_rate": 9.026486060721668e-06, + "loss": 0.0288, + "step": 651 + }, + { + "epoch": 0.22646752344564083, + "grad_norm": 1.3686370104380872, + "learning_rate": 9.023147982048322e-06, + "loss": 0.0247, + "step": 652 + }, + { + "epoch": 0.22681486627301145, + "grad_norm": 0.9530520238325823, + "learning_rate": 9.019804809657195e-06, + "loss": 0.0222, + "step": 653 + }, + { + "epoch": 0.22716220910038207, + "grad_norm": 0.4302751935992573, + "learning_rate": 9.016456547781088e-06, + "loss": 0.0222, + "step": 654 + }, + { + "epoch": 0.22750955192775268, + "grad_norm": 0.4268832341371163, + "learning_rate": 9.01310320065924e-06, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.2278568947551233, + "grad_norm": 0.3637254931806751, + "learning_rate": 9.009744772537336e-06, + "loss": 0.0208, + "step": 656 + }, + { + "epoch": 0.22820423758249392, + "grad_norm": 0.9443986440525276, + "learning_rate": 9.006381267667489e-06, + "loss": 0.0157, + "step": 657 + }, + { + "epoch": 0.22855158040986454, + "grad_norm": 1.033293442849867, + "learning_rate": 9.00301269030824e-06, + "loss": 0.0235, + "step": 658 + }, + { + "epoch": 0.22889892323723515, + "grad_norm": 0.7206031262994533, + "learning_rate": 8.999639044724555e-06, + "loss": 0.0144, + "step": 659 + }, + { + "epoch": 0.22924626606460577, + "grad_norm": 0.5569325989347864, + "learning_rate": 8.996260335187813e-06, + "loss": 0.0273, + "step": 660 + }, + { + "epoch": 0.2295936088919764, + "grad_norm": 0.6950933663288503, + "learning_rate": 8.992876565975809e-06, + "loss": 0.0351, + "step": 661 + }, + { + "epoch": 0.229940951719347, + "grad_norm": 0.961443300736354, + "learning_rate": 8.98948774137274e-06, + "loss": 0.0249, + "step": 662 + }, + { + "epoch": 0.23028829454671762, + "grad_norm": 1.9014588737941847, + "learning_rate": 8.986093865669205e-06, + "loss": 0.0313, + "step": 663 + }, + { + "epoch": 0.23063563737408824, + "grad_norm": 0.9492079615362812, + "learning_rate": 8.9826949431622e-06, + "loss": 0.0248, + "step": 664 + }, + { + "epoch": 0.23098298020145883, + "grad_norm": 0.5262876532553753, + "learning_rate": 8.97929097815511e-06, + "loss": 0.0271, + "step": 665 + }, + { + "epoch": 0.23133032302882944, + "grad_norm": 1.4774684801073679, + "learning_rate": 8.9758819749577e-06, + "loss": 0.0259, + "step": 666 + }, + { + "epoch": 0.23167766585620006, + "grad_norm": 0.47912155013309676, + "learning_rate": 8.972467937886122e-06, + "loss": 0.0198, + "step": 667 + }, + { + "epoch": 0.23202500868357068, + "grad_norm": 0.4857405075161966, + "learning_rate": 8.969048871262895e-06, + "loss": 0.021, + "step": 668 + }, + { + "epoch": 0.2323723515109413, + "grad_norm": 0.5501028852401879, + "learning_rate": 8.965624779416907e-06, + "loss": 0.0229, + "step": 669 + }, + { + "epoch": 0.2327196943383119, + "grad_norm": 0.37333376724281975, + "learning_rate": 8.96219566668341e-06, + "loss": 0.0216, + "step": 670 + }, + { + "epoch": 0.23306703716568253, + "grad_norm": 0.48066801556311134, + "learning_rate": 8.958761537404012e-06, + "loss": 0.0198, + "step": 671 + }, + { + "epoch": 0.23341437999305314, + "grad_norm": 0.813514292061828, + "learning_rate": 8.955322395926673e-06, + "loss": 0.0139, + "step": 672 + }, + { + "epoch": 0.23376172282042376, + "grad_norm": 0.7125740685535722, + "learning_rate": 8.9518782466057e-06, + "loss": 0.0158, + "step": 673 + }, + { + "epoch": 0.23410906564779438, + "grad_norm": 0.3825152111739057, + "learning_rate": 8.948429093801738e-06, + "loss": 0.0215, + "step": 674 + }, + { + "epoch": 0.234456408475165, + "grad_norm": 1.3795608714085683, + "learning_rate": 8.944974941881766e-06, + "loss": 0.0274, + "step": 675 + }, + { + "epoch": 0.2348037513025356, + "grad_norm": 0.5497870254567753, + "learning_rate": 8.941515795219098e-06, + "loss": 0.0197, + "step": 676 + }, + { + "epoch": 0.23515109412990623, + "grad_norm": 0.8354225349175881, + "learning_rate": 8.938051658193365e-06, + "loss": 0.0227, + "step": 677 + }, + { + "epoch": 0.23549843695727682, + "grad_norm": 0.5221880740171788, + "learning_rate": 8.934582535190522e-06, + "loss": 0.0212, + "step": 678 + }, + { + "epoch": 0.23584577978464744, + "grad_norm": 0.5930924226753035, + "learning_rate": 8.931108430602834e-06, + "loss": 0.0176, + "step": 679 + }, + { + "epoch": 0.23619312261201805, + "grad_norm": 1.4128463606001023, + "learning_rate": 8.927629348828874e-06, + "loss": 0.0171, + "step": 680 + }, + { + "epoch": 0.23654046543938867, + "grad_norm": 0.8753822994129902, + "learning_rate": 8.924145294273515e-06, + "loss": 0.0211, + "step": 681 + }, + { + "epoch": 0.2368878082667593, + "grad_norm": 0.7822190197357625, + "learning_rate": 8.920656271347925e-06, + "loss": 0.0251, + "step": 682 + }, + { + "epoch": 0.2372351510941299, + "grad_norm": 1.4919418777559417, + "learning_rate": 8.917162284469569e-06, + "loss": 0.0224, + "step": 683 + }, + { + "epoch": 0.23758249392150052, + "grad_norm": 1.5473820670531835, + "learning_rate": 8.91366333806219e-06, + "loss": 0.0277, + "step": 684 + }, + { + "epoch": 0.23792983674887114, + "grad_norm": 1.1016478393828795, + "learning_rate": 8.910159436555813e-06, + "loss": 0.0252, + "step": 685 + }, + { + "epoch": 0.23827717957624175, + "grad_norm": 1.5287564476717936, + "learning_rate": 8.90665058438674e-06, + "loss": 0.0188, + "step": 686 + }, + { + "epoch": 0.23862452240361237, + "grad_norm": 0.7366291157632144, + "learning_rate": 8.903136785997533e-06, + "loss": 0.0295, + "step": 687 + }, + { + "epoch": 0.238971865230983, + "grad_norm": 0.7896406244087025, + "learning_rate": 8.899618045837025e-06, + "loss": 0.0237, + "step": 688 + }, + { + "epoch": 0.2393192080583536, + "grad_norm": 1.1875606974598063, + "learning_rate": 8.896094368360297e-06, + "loss": 0.0162, + "step": 689 + }, + { + "epoch": 0.23966655088572422, + "grad_norm": 0.47042040685656056, + "learning_rate": 8.892565758028688e-06, + "loss": 0.0282, + "step": 690 + }, + { + "epoch": 0.2400138937130948, + "grad_norm": 0.5817301391235052, + "learning_rate": 8.889032219309781e-06, + "loss": 0.0291, + "step": 691 + }, + { + "epoch": 0.24036123654046543, + "grad_norm": 0.6761086942386368, + "learning_rate": 8.885493756677399e-06, + "loss": 0.0248, + "step": 692 + }, + { + "epoch": 0.24070857936783605, + "grad_norm": 0.8209577079998156, + "learning_rate": 8.881950374611597e-06, + "loss": 0.0196, + "step": 693 + }, + { + "epoch": 0.24105592219520666, + "grad_norm": 1.314377357321471, + "learning_rate": 8.878402077598662e-06, + "loss": 0.023, + "step": 694 + }, + { + "epoch": 0.24140326502257728, + "grad_norm": 1.1012244390968478, + "learning_rate": 8.874848870131098e-06, + "loss": 0.0255, + "step": 695 + }, + { + "epoch": 0.2417506078499479, + "grad_norm": 0.7845674765254901, + "learning_rate": 8.871290756707634e-06, + "loss": 0.0221, + "step": 696 + }, + { + "epoch": 0.2420979506773185, + "grad_norm": 0.59132264309778, + "learning_rate": 8.867727741833204e-06, + "loss": 0.0179, + "step": 697 + }, + { + "epoch": 0.24244529350468913, + "grad_norm": 1.2702082566630686, + "learning_rate": 8.86415983001895e-06, + "loss": 0.0284, + "step": 698 + }, + { + "epoch": 0.24279263633205975, + "grad_norm": 0.7795467424110684, + "learning_rate": 8.860587025782215e-06, + "loss": 0.0171, + "step": 699 + }, + { + "epoch": 0.24313997915943036, + "grad_norm": 1.84470369765152, + "learning_rate": 8.857009333646535e-06, + "loss": 0.0214, + "step": 700 + }, + { + "epoch": 0.24348732198680098, + "grad_norm": 1.9319346606245589, + "learning_rate": 8.853426758141635e-06, + "loss": 0.0242, + "step": 701 + }, + { + "epoch": 0.2438346648141716, + "grad_norm": 1.3940656814702743, + "learning_rate": 8.849839303803425e-06, + "loss": 0.0239, + "step": 702 + }, + { + "epoch": 0.24418200764154221, + "grad_norm": 0.6685433203227141, + "learning_rate": 8.846246975173985e-06, + "loss": 0.0224, + "step": 703 + }, + { + "epoch": 0.2445293504689128, + "grad_norm": 0.8276105742660492, + "learning_rate": 8.842649776801576e-06, + "loss": 0.031, + "step": 704 + }, + { + "epoch": 0.24487669329628342, + "grad_norm": 0.9152658175517937, + "learning_rate": 8.839047713240619e-06, + "loss": 0.0157, + "step": 705 + }, + { + "epoch": 0.24522403612365404, + "grad_norm": 1.0704740818766338, + "learning_rate": 8.835440789051692e-06, + "loss": 0.0143, + "step": 706 + }, + { + "epoch": 0.24557137895102465, + "grad_norm": 1.0779902900776792, + "learning_rate": 8.831829008801536e-06, + "loss": 0.0175, + "step": 707 + }, + { + "epoch": 0.24591872177839527, + "grad_norm": 0.6030988446891595, + "learning_rate": 8.828212377063033e-06, + "loss": 0.0194, + "step": 708 + }, + { + "epoch": 0.2462660646057659, + "grad_norm": 0.8610359328267158, + "learning_rate": 8.824590898415209e-06, + "loss": 0.0258, + "step": 709 + }, + { + "epoch": 0.2466134074331365, + "grad_norm": 0.5357367765825686, + "learning_rate": 8.820964577443227e-06, + "loss": 0.0278, + "step": 710 + }, + { + "epoch": 0.24696075026050712, + "grad_norm": 1.5065093568870387, + "learning_rate": 8.817333418738382e-06, + "loss": 0.0195, + "step": 711 + }, + { + "epoch": 0.24730809308787774, + "grad_norm": 1.4013893590690267, + "learning_rate": 8.813697426898094e-06, + "loss": 0.0276, + "step": 712 + }, + { + "epoch": 0.24765543591524836, + "grad_norm": 1.4557088461414078, + "learning_rate": 8.810056606525899e-06, + "loss": 0.0232, + "step": 713 + }, + { + "epoch": 0.24800277874261897, + "grad_norm": 1.1924673869323386, + "learning_rate": 8.80641096223145e-06, + "loss": 0.0163, + "step": 714 + }, + { + "epoch": 0.2483501215699896, + "grad_norm": 0.42056499190089386, + "learning_rate": 8.802760498630507e-06, + "loss": 0.0249, + "step": 715 + }, + { + "epoch": 0.2486974643973602, + "grad_norm": 0.3585210385651358, + "learning_rate": 8.79910522034493e-06, + "loss": 0.019, + "step": 716 + }, + { + "epoch": 0.2490448072247308, + "grad_norm": 0.46610691242048347, + "learning_rate": 8.795445132002679e-06, + "loss": 0.0154, + "step": 717 + }, + { + "epoch": 0.2493921500521014, + "grad_norm": 1.3759596235421945, + "learning_rate": 8.791780238237794e-06, + "loss": 0.0249, + "step": 718 + }, + { + "epoch": 0.24973949287947203, + "grad_norm": 10.041742445276327, + "learning_rate": 8.788110543690415e-06, + "loss": 0.0274, + "step": 719 + }, + { + "epoch": 0.25008683570684265, + "grad_norm": 0.9018639927916149, + "learning_rate": 8.784436053006746e-06, + "loss": 0.026, + "step": 720 + }, + { + "epoch": 0.2504341785342133, + "grad_norm": 0.6316727580755663, + "learning_rate": 8.780756770839071e-06, + "loss": 0.0117, + "step": 721 + }, + { + "epoch": 0.2507815213615839, + "grad_norm": 0.8380724086190908, + "learning_rate": 8.777072701845738e-06, + "loss": 0.0286, + "step": 722 + }, + { + "epoch": 0.25112886418895447, + "grad_norm": 0.7088681744129693, + "learning_rate": 8.773383850691155e-06, + "loss": 0.0264, + "step": 723 + }, + { + "epoch": 0.2514762070163251, + "grad_norm": 1.6040597806062724, + "learning_rate": 8.769690222045787e-06, + "loss": 0.0213, + "step": 724 + }, + { + "epoch": 0.2518235498436957, + "grad_norm": 0.59234889125779, + "learning_rate": 8.765991820586147e-06, + "loss": 0.0193, + "step": 725 + }, + { + "epoch": 0.25217089267106635, + "grad_norm": 1.3987574754257743, + "learning_rate": 8.762288650994786e-06, + "loss": 0.0302, + "step": 726 + }, + { + "epoch": 0.25251823549843694, + "grad_norm": 0.7554420842981747, + "learning_rate": 8.758580717960303e-06, + "loss": 0.02, + "step": 727 + }, + { + "epoch": 0.2528655783258076, + "grad_norm": 0.5299359962965386, + "learning_rate": 8.754868026177317e-06, + "loss": 0.0228, + "step": 728 + }, + { + "epoch": 0.25321292115317817, + "grad_norm": 0.5066652476216912, + "learning_rate": 8.751150580346477e-06, + "loss": 0.0152, + "step": 729 + }, + { + "epoch": 0.2535602639805488, + "grad_norm": 0.8528730067781101, + "learning_rate": 8.747428385174452e-06, + "loss": 0.0207, + "step": 730 + }, + { + "epoch": 0.2539076068079194, + "grad_norm": 0.6236339038844014, + "learning_rate": 8.743701445373922e-06, + "loss": 0.0217, + "step": 731 + }, + { + "epoch": 0.25425494963529005, + "grad_norm": 1.1276577230096638, + "learning_rate": 8.739969765663574e-06, + "loss": 0.018, + "step": 732 + }, + { + "epoch": 0.25460229246266064, + "grad_norm": 1.0286105613817527, + "learning_rate": 8.736233350768097e-06, + "loss": 0.0166, + "step": 733 + }, + { + "epoch": 0.2549496352900313, + "grad_norm": 1.4581045926440899, + "learning_rate": 8.732492205418176e-06, + "loss": 0.031, + "step": 734 + }, + { + "epoch": 0.2552969781174019, + "grad_norm": 1.8876644886015665, + "learning_rate": 8.728746334350483e-06, + "loss": 0.027, + "step": 735 + }, + { + "epoch": 0.25564432094477246, + "grad_norm": 1.0450409512942507, + "learning_rate": 8.72499574230768e-06, + "loss": 0.0207, + "step": 736 + }, + { + "epoch": 0.2559916637721431, + "grad_norm": 0.6743560512268864, + "learning_rate": 8.721240434038395e-06, + "loss": 0.0244, + "step": 737 + }, + { + "epoch": 0.2563390065995137, + "grad_norm": 0.8447479142593041, + "learning_rate": 8.717480414297236e-06, + "loss": 0.0288, + "step": 738 + }, + { + "epoch": 0.25668634942688434, + "grad_norm": 0.9145315370795962, + "learning_rate": 8.713715687844772e-06, + "loss": 0.0185, + "step": 739 + }, + { + "epoch": 0.25703369225425493, + "grad_norm": 1.5301247728960465, + "learning_rate": 8.709946259447535e-06, + "loss": 0.0275, + "step": 740 + }, + { + "epoch": 0.2573810350816256, + "grad_norm": 1.901274677450291, + "learning_rate": 8.706172133878006e-06, + "loss": 0.015, + "step": 741 + }, + { + "epoch": 0.25772837790899616, + "grad_norm": 0.49087360665043905, + "learning_rate": 8.702393315914615e-06, + "loss": 0.022, + "step": 742 + }, + { + "epoch": 0.2580757207363668, + "grad_norm": 1.0922750212562884, + "learning_rate": 8.698609810341733e-06, + "loss": 0.0218, + "step": 743 + }, + { + "epoch": 0.2584230635637374, + "grad_norm": 0.7102928208875048, + "learning_rate": 8.694821621949667e-06, + "loss": 0.0322, + "step": 744 + }, + { + "epoch": 0.25877040639110804, + "grad_norm": 0.886964613687143, + "learning_rate": 8.69102875553465e-06, + "loss": 0.0159, + "step": 745 + }, + { + "epoch": 0.25911774921847863, + "grad_norm": 0.7318076441241718, + "learning_rate": 8.68723121589884e-06, + "loss": 0.0233, + "step": 746 + }, + { + "epoch": 0.2594650920458493, + "grad_norm": 0.6058986578436707, + "learning_rate": 8.683429007850313e-06, + "loss": 0.0251, + "step": 747 + }, + { + "epoch": 0.25981243487321987, + "grad_norm": 0.9052930522075193, + "learning_rate": 8.679622136203055e-06, + "loss": 0.0149, + "step": 748 + }, + { + "epoch": 0.26015977770059046, + "grad_norm": 1.2032045295627478, + "learning_rate": 8.67581060577695e-06, + "loss": 0.0224, + "step": 749 + }, + { + "epoch": 0.2605071205279611, + "grad_norm": 0.539015089299102, + "learning_rate": 8.671994421397793e-06, + "loss": 0.0246, + "step": 750 + }, + { + "epoch": 0.2608544633553317, + "grad_norm": 1.5525307354093696, + "learning_rate": 8.668173587897261e-06, + "loss": 0.0191, + "step": 751 + }, + { + "epoch": 0.26120180618270233, + "grad_norm": 0.8769437334779882, + "learning_rate": 8.664348110112923e-06, + "loss": 0.0283, + "step": 752 + }, + { + "epoch": 0.2615491490100729, + "grad_norm": 0.8682417310780673, + "learning_rate": 8.660517992888225e-06, + "loss": 0.0158, + "step": 753 + }, + { + "epoch": 0.26189649183744357, + "grad_norm": 0.6663404002194353, + "learning_rate": 8.656683241072488e-06, + "loss": 0.0267, + "step": 754 + }, + { + "epoch": 0.26224383466481416, + "grad_norm": 0.5892947695489575, + "learning_rate": 8.6528438595209e-06, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.2625911774921848, + "grad_norm": 1.202175698552294, + "learning_rate": 8.648999853094514e-06, + "loss": 0.0266, + "step": 756 + }, + { + "epoch": 0.2629385203195554, + "grad_norm": 0.7025664951173237, + "learning_rate": 8.645151226660234e-06, + "loss": 0.0204, + "step": 757 + }, + { + "epoch": 0.26328586314692604, + "grad_norm": 0.4207213062287094, + "learning_rate": 8.641297985090815e-06, + "loss": 0.0231, + "step": 758 + }, + { + "epoch": 0.2636332059742966, + "grad_norm": 0.5565332142739162, + "learning_rate": 8.637440133264858e-06, + "loss": 0.0201, + "step": 759 + }, + { + "epoch": 0.26398054880166727, + "grad_norm": 0.98247646350947, + "learning_rate": 8.6335776760668e-06, + "loss": 0.03, + "step": 760 + }, + { + "epoch": 0.26432789162903786, + "grad_norm": 1.331753783533657, + "learning_rate": 8.629710618386903e-06, + "loss": 0.0237, + "step": 761 + }, + { + "epoch": 0.26467523445640845, + "grad_norm": 0.8360917594131502, + "learning_rate": 8.625838965121263e-06, + "loss": 0.0214, + "step": 762 + }, + { + "epoch": 0.2650225772837791, + "grad_norm": 0.6406382478000888, + "learning_rate": 8.621962721171789e-06, + "loss": 0.0252, + "step": 763 + }, + { + "epoch": 0.2653699201111497, + "grad_norm": 0.9906478193068576, + "learning_rate": 8.618081891446201e-06, + "loss": 0.0167, + "step": 764 + }, + { + "epoch": 0.2657172629385203, + "grad_norm": 0.5398729124754951, + "learning_rate": 8.61419648085803e-06, + "loss": 0.02, + "step": 765 + }, + { + "epoch": 0.2660646057658909, + "grad_norm": 1.1959531243334165, + "learning_rate": 8.610306494326601e-06, + "loss": 0.0253, + "step": 766 + }, + { + "epoch": 0.26641194859326156, + "grad_norm": 1.1692706174579726, + "learning_rate": 8.60641193677704e-06, + "loss": 0.0297, + "step": 767 + }, + { + "epoch": 0.26675929142063215, + "grad_norm": 1.0701239722359508, + "learning_rate": 8.602512813140251e-06, + "loss": 0.0214, + "step": 768 + }, + { + "epoch": 0.2671066342480028, + "grad_norm": 0.4828984533321399, + "learning_rate": 8.59860912835293e-06, + "loss": 0.0253, + "step": 769 + }, + { + "epoch": 0.2674539770753734, + "grad_norm": 1.1545942236415125, + "learning_rate": 8.594700887357537e-06, + "loss": 0.022, + "step": 770 + }, + { + "epoch": 0.26780131990274403, + "grad_norm": 1.5002177682572735, + "learning_rate": 8.59078809510231e-06, + "loss": 0.0247, + "step": 771 + }, + { + "epoch": 0.2681486627301146, + "grad_norm": 2.1953312403285024, + "learning_rate": 8.58687075654124e-06, + "loss": 0.0263, + "step": 772 + }, + { + "epoch": 0.26849600555748526, + "grad_norm": 1.2121243717081425, + "learning_rate": 8.582948876634084e-06, + "loss": 0.0167, + "step": 773 + }, + { + "epoch": 0.26884334838485585, + "grad_norm": 0.7901330703897526, + "learning_rate": 8.579022460346343e-06, + "loss": 0.0207, + "step": 774 + }, + { + "epoch": 0.26919069121222644, + "grad_norm": 0.44307666800843953, + "learning_rate": 8.57509151264926e-06, + "loss": 0.0142, + "step": 775 + }, + { + "epoch": 0.2695380340395971, + "grad_norm": 1.1332463643190542, + "learning_rate": 8.57115603851982e-06, + "loss": 0.0248, + "step": 776 + }, + { + "epoch": 0.2698853768669677, + "grad_norm": 1.2837269229641983, + "learning_rate": 8.567216042940735e-06, + "loss": 0.0159, + "step": 777 + }, + { + "epoch": 0.2702327196943383, + "grad_norm": 2.1182651419994714, + "learning_rate": 8.563271530900448e-06, + "loss": 0.0345, + "step": 778 + }, + { + "epoch": 0.2705800625217089, + "grad_norm": 1.0165246753047423, + "learning_rate": 8.55932250739311e-06, + "loss": 0.0254, + "step": 779 + }, + { + "epoch": 0.27092740534907955, + "grad_norm": 2.105285605392031, + "learning_rate": 8.555368977418593e-06, + "loss": 0.0306, + "step": 780 + }, + { + "epoch": 0.27127474817645014, + "grad_norm": 1.9706982563429236, + "learning_rate": 8.551410945982469e-06, + "loss": 0.0388, + "step": 781 + }, + { + "epoch": 0.2716220910038208, + "grad_norm": 1.2642900443762466, + "learning_rate": 8.547448418096012e-06, + "loss": 0.0169, + "step": 782 + }, + { + "epoch": 0.2719694338311914, + "grad_norm": 0.6266295783177325, + "learning_rate": 8.543481398776188e-06, + "loss": 0.0218, + "step": 783 + }, + { + "epoch": 0.272316776658562, + "grad_norm": 0.5973986334152628, + "learning_rate": 8.539509893045654e-06, + "loss": 0.0338, + "step": 784 + }, + { + "epoch": 0.2726641194859326, + "grad_norm": 2.4245373478500687, + "learning_rate": 8.535533905932739e-06, + "loss": 0.0231, + "step": 785 + }, + { + "epoch": 0.27301146231330325, + "grad_norm": 2.516583165841234, + "learning_rate": 8.531553442471453e-06, + "loss": 0.0262, + "step": 786 + }, + { + "epoch": 0.27335880514067384, + "grad_norm": 3.32894137710812, + "learning_rate": 8.527568507701467e-06, + "loss": 0.0319, + "step": 787 + }, + { + "epoch": 0.27370614796804443, + "grad_norm": 2.3072141104017625, + "learning_rate": 8.523579106668121e-06, + "loss": 0.0308, + "step": 788 + }, + { + "epoch": 0.2740534907954151, + "grad_norm": 1.1713281727561171, + "learning_rate": 8.519585244422405e-06, + "loss": 0.0184, + "step": 789 + }, + { + "epoch": 0.27440083362278567, + "grad_norm": 1.0031858926420991, + "learning_rate": 8.515586926020959e-06, + "loss": 0.0235, + "step": 790 + }, + { + "epoch": 0.2747481764501563, + "grad_norm": 0.36533679027936755, + "learning_rate": 8.511584156526059e-06, + "loss": 0.0183, + "step": 791 + }, + { + "epoch": 0.2750955192775269, + "grad_norm": 1.1097620564934818, + "learning_rate": 8.507576941005626e-06, + "loss": 0.0265, + "step": 792 + }, + { + "epoch": 0.27544286210489755, + "grad_norm": 0.8446308489408966, + "learning_rate": 8.503565284533206e-06, + "loss": 0.0201, + "step": 793 + }, + { + "epoch": 0.27579020493226813, + "grad_norm": 1.5612638631332916, + "learning_rate": 8.499549192187965e-06, + "loss": 0.022, + "step": 794 + }, + { + "epoch": 0.2761375477596388, + "grad_norm": 1.0436423254930283, + "learning_rate": 8.495528669054688e-06, + "loss": 0.0188, + "step": 795 + }, + { + "epoch": 0.27648489058700937, + "grad_norm": 0.7033478660193114, + "learning_rate": 8.49150372022377e-06, + "loss": 0.016, + "step": 796 + }, + { + "epoch": 0.27683223341438, + "grad_norm": 0.7998512004622108, + "learning_rate": 8.48747435079121e-06, + "loss": 0.0297, + "step": 797 + }, + { + "epoch": 0.2771795762417506, + "grad_norm": 0.5199486841483459, + "learning_rate": 8.483440565858599e-06, + "loss": 0.0216, + "step": 798 + }, + { + "epoch": 0.27752691906912125, + "grad_norm": 1.2032922365716692, + "learning_rate": 8.479402370533127e-06, + "loss": 0.03, + "step": 799 + }, + { + "epoch": 0.27787426189649184, + "grad_norm": 1.9065992211352163, + "learning_rate": 8.47535976992756e-06, + "loss": 0.0242, + "step": 800 + }, + { + "epoch": 0.2782216047238624, + "grad_norm": 0.7910306233638731, + "learning_rate": 8.471312769160247e-06, + "loss": 0.014, + "step": 801 + }, + { + "epoch": 0.27856894755123307, + "grad_norm": 0.9658546663370163, + "learning_rate": 8.467261373355104e-06, + "loss": 0.0234, + "step": 802 + }, + { + "epoch": 0.27891629037860366, + "grad_norm": 0.6546478423971199, + "learning_rate": 8.463205587641614e-06, + "loss": 0.0189, + "step": 803 + }, + { + "epoch": 0.2792636332059743, + "grad_norm": 0.6930129832525015, + "learning_rate": 8.459145417154817e-06, + "loss": 0.0258, + "step": 804 + }, + { + "epoch": 0.2796109760333449, + "grad_norm": 0.4441678217661302, + "learning_rate": 8.455080867035307e-06, + "loss": 0.0125, + "step": 805 + }, + { + "epoch": 0.27995831886071554, + "grad_norm": 1.1591532386678776, + "learning_rate": 8.451011942429219e-06, + "loss": 0.0283, + "step": 806 + }, + { + "epoch": 0.2803056616880861, + "grad_norm": 1.2540462658684661, + "learning_rate": 8.44693864848823e-06, + "loss": 0.0144, + "step": 807 + }, + { + "epoch": 0.28065300451545677, + "grad_norm": 0.8407468838106213, + "learning_rate": 8.442860990369545e-06, + "loss": 0.0194, + "step": 808 + }, + { + "epoch": 0.28100034734282736, + "grad_norm": 1.02240345065518, + "learning_rate": 8.438778973235904e-06, + "loss": 0.0372, + "step": 809 + }, + { + "epoch": 0.281347690170198, + "grad_norm": 0.711512705654793, + "learning_rate": 8.43469260225555e-06, + "loss": 0.0136, + "step": 810 + }, + { + "epoch": 0.2816950329975686, + "grad_norm": 0.5328105749063056, + "learning_rate": 8.430601882602256e-06, + "loss": 0.0228, + "step": 811 + }, + { + "epoch": 0.28204237582493924, + "grad_norm": 0.4977620396488783, + "learning_rate": 8.426506819455285e-06, + "loss": 0.0173, + "step": 812 + }, + { + "epoch": 0.28238971865230983, + "grad_norm": 0.7517007103690861, + "learning_rate": 8.422407417999413e-06, + "loss": 0.0215, + "step": 813 + }, + { + "epoch": 0.2827370614796804, + "grad_norm": 1.988030057566032, + "learning_rate": 8.418303683424898e-06, + "loss": 0.0236, + "step": 814 + }, + { + "epoch": 0.28308440430705106, + "grad_norm": 0.564222331342316, + "learning_rate": 8.414195620927491e-06, + "loss": 0.0168, + "step": 815 + }, + { + "epoch": 0.28343174713442165, + "grad_norm": 0.6853557880647918, + "learning_rate": 8.410083235708422e-06, + "loss": 0.0154, + "step": 816 + }, + { + "epoch": 0.2837790899617923, + "grad_norm": 2.11338878622302, + "learning_rate": 8.405966532974388e-06, + "loss": 0.0264, + "step": 817 + }, + { + "epoch": 0.2841264327891629, + "grad_norm": 0.8210234110109191, + "learning_rate": 8.401845517937558e-06, + "loss": 0.0291, + "step": 818 + }, + { + "epoch": 0.28447377561653353, + "grad_norm": 0.889999073550466, + "learning_rate": 8.397720195815561e-06, + "loss": 0.0295, + "step": 819 + }, + { + "epoch": 0.2848211184439041, + "grad_norm": 0.631173907871462, + "learning_rate": 8.393590571831478e-06, + "loss": 0.0217, + "step": 820 + }, + { + "epoch": 0.28516846127127476, + "grad_norm": 0.7731774878641796, + "learning_rate": 8.389456651213834e-06, + "loss": 0.0155, + "step": 821 + }, + { + "epoch": 0.28551580409864535, + "grad_norm": 1.5606323479981803, + "learning_rate": 8.385318439196597e-06, + "loss": 0.0249, + "step": 822 + }, + { + "epoch": 0.285863146926016, + "grad_norm": 0.596618016585276, + "learning_rate": 8.381175941019171e-06, + "loss": 0.0325, + "step": 823 + }, + { + "epoch": 0.2862104897533866, + "grad_norm": 1.0647983506484207, + "learning_rate": 8.377029161926378e-06, + "loss": 0.0268, + "step": 824 + }, + { + "epoch": 0.28655783258075723, + "grad_norm": 0.5119216226794532, + "learning_rate": 8.372878107168469e-06, + "loss": 0.0238, + "step": 825 + }, + { + "epoch": 0.2869051754081278, + "grad_norm": 0.46410901637663077, + "learning_rate": 8.368722782001104e-06, + "loss": 0.0169, + "step": 826 + }, + { + "epoch": 0.2872525182354984, + "grad_norm": 1.3248050562513085, + "learning_rate": 8.364563191685348e-06, + "loss": 0.0224, + "step": 827 + }, + { + "epoch": 0.28759986106286906, + "grad_norm": 0.9361071547212366, + "learning_rate": 8.360399341487675e-06, + "loss": 0.0259, + "step": 828 + }, + { + "epoch": 0.28794720389023964, + "grad_norm": 1.1870448572485746, + "learning_rate": 8.35623123667994e-06, + "loss": 0.0193, + "step": 829 + }, + { + "epoch": 0.2882945467176103, + "grad_norm": 0.9586109702853123, + "learning_rate": 8.352058882539394e-06, + "loss": 0.0206, + "step": 830 + }, + { + "epoch": 0.2886418895449809, + "grad_norm": 0.5309149834858535, + "learning_rate": 8.347882284348665e-06, + "loss": 0.0208, + "step": 831 + }, + { + "epoch": 0.2889892323723515, + "grad_norm": 1.3733779916187276, + "learning_rate": 8.343701447395754e-06, + "loss": 0.0183, + "step": 832 + }, + { + "epoch": 0.2893365751997221, + "grad_norm": 1.3507083935119284, + "learning_rate": 8.339516376974028e-06, + "loss": 0.0228, + "step": 833 + }, + { + "epoch": 0.28968391802709276, + "grad_norm": 0.9249455391192148, + "learning_rate": 8.33532707838222e-06, + "loss": 0.02, + "step": 834 + }, + { + "epoch": 0.29003126085446335, + "grad_norm": 1.6816616489709881, + "learning_rate": 8.331133556924404e-06, + "loss": 0.0134, + "step": 835 + }, + { + "epoch": 0.290378603681834, + "grad_norm": 1.0063521448061856, + "learning_rate": 8.326935817910014e-06, + "loss": 0.0188, + "step": 836 + }, + { + "epoch": 0.2907259465092046, + "grad_norm": 0.8258758559049553, + "learning_rate": 8.322733866653814e-06, + "loss": 0.0263, + "step": 837 + }, + { + "epoch": 0.2910732893365752, + "grad_norm": 1.173146724262853, + "learning_rate": 8.31852770847591e-06, + "loss": 0.0249, + "step": 838 + }, + { + "epoch": 0.2914206321639458, + "grad_norm": 6.319556253427923, + "learning_rate": 8.314317348701724e-06, + "loss": 0.0334, + "step": 839 + }, + { + "epoch": 0.2917679749913164, + "grad_norm": 3.142508712422825, + "learning_rate": 8.310102792662006e-06, + "loss": 0.0402, + "step": 840 + }, + { + "epoch": 0.29211531781868705, + "grad_norm": 2.293229808909149, + "learning_rate": 8.305884045692815e-06, + "loss": 0.0401, + "step": 841 + }, + { + "epoch": 0.29246266064605764, + "grad_norm": 1.021991216996574, + "learning_rate": 8.30166111313552e-06, + "loss": 0.0162, + "step": 842 + }, + { + "epoch": 0.2928100034734283, + "grad_norm": 1.1326524259624917, + "learning_rate": 8.297434000336781e-06, + "loss": 0.0184, + "step": 843 + }, + { + "epoch": 0.29315734630079887, + "grad_norm": 0.6967485472695889, + "learning_rate": 8.29320271264856e-06, + "loss": 0.0228, + "step": 844 + }, + { + "epoch": 0.2935046891281695, + "grad_norm": 1.5690012404474116, + "learning_rate": 8.288967255428102e-06, + "loss": 0.0356, + "step": 845 + }, + { + "epoch": 0.2938520319555401, + "grad_norm": 1.6508824584996764, + "learning_rate": 8.284727634037928e-06, + "loss": 0.022, + "step": 846 + }, + { + "epoch": 0.29419937478291075, + "grad_norm": 1.7189879458010435, + "learning_rate": 8.280483853845831e-06, + "loss": 0.0176, + "step": 847 + }, + { + "epoch": 0.29454671761028134, + "grad_norm": 1.3338043589868147, + "learning_rate": 8.276235920224877e-06, + "loss": 0.0139, + "step": 848 + }, + { + "epoch": 0.294894060437652, + "grad_norm": 0.48731311842832525, + "learning_rate": 8.271983838553383e-06, + "loss": 0.0344, + "step": 849 + }, + { + "epoch": 0.2952414032650226, + "grad_norm": 0.6636965122047247, + "learning_rate": 8.26772761421492e-06, + "loss": 0.0213, + "step": 850 + }, + { + "epoch": 0.2955887460923932, + "grad_norm": 0.7449395913052544, + "learning_rate": 8.263467252598303e-06, + "loss": 0.0283, + "step": 851 + }, + { + "epoch": 0.2959360889197638, + "grad_norm": 0.5866063388294399, + "learning_rate": 8.25920275909759e-06, + "loss": 0.0237, + "step": 852 + }, + { + "epoch": 0.2962834317471344, + "grad_norm": 0.5762702547689298, + "learning_rate": 8.254934139112062e-06, + "loss": 0.0197, + "step": 853 + }, + { + "epoch": 0.29663077457450504, + "grad_norm": 0.4188655050848168, + "learning_rate": 8.250661398046236e-06, + "loss": 0.0201, + "step": 854 + }, + { + "epoch": 0.29697811740187563, + "grad_norm": 0.4948572771836202, + "learning_rate": 8.246384541309835e-06, + "loss": 0.0228, + "step": 855 + }, + { + "epoch": 0.2973254602292463, + "grad_norm": 0.8130265751045145, + "learning_rate": 8.242103574317802e-06, + "loss": 0.0192, + "step": 856 + }, + { + "epoch": 0.29767280305661686, + "grad_norm": 0.7538997021511098, + "learning_rate": 8.237818502490273e-06, + "loss": 0.0207, + "step": 857 + }, + { + "epoch": 0.2980201458839875, + "grad_norm": 0.7989130279170902, + "learning_rate": 8.233529331252598e-06, + "loss": 0.0203, + "step": 858 + }, + { + "epoch": 0.2983674887113581, + "grad_norm": 1.7902955280949013, + "learning_rate": 8.2292360660353e-06, + "loss": 0.0287, + "step": 859 + }, + { + "epoch": 0.29871483153872874, + "grad_norm": 0.6270804680604328, + "learning_rate": 8.224938712274097e-06, + "loss": 0.0241, + "step": 860 + }, + { + "epoch": 0.29906217436609933, + "grad_norm": 0.8032825382756686, + "learning_rate": 8.220637275409878e-06, + "loss": 0.0194, + "step": 861 + }, + { + "epoch": 0.29940951719347, + "grad_norm": 1.1586121906804554, + "learning_rate": 8.2163317608887e-06, + "loss": 0.0207, + "step": 862 + }, + { + "epoch": 0.29975686002084057, + "grad_norm": 0.9100243599349078, + "learning_rate": 8.21202217416179e-06, + "loss": 0.0156, + "step": 863 + }, + { + "epoch": 0.3001042028482112, + "grad_norm": 0.6532828480664288, + "learning_rate": 8.207708520685526e-06, + "loss": 0.0187, + "step": 864 + }, + { + "epoch": 0.3004515456755818, + "grad_norm": 0.5964555383406674, + "learning_rate": 8.203390805921437e-06, + "loss": 0.0176, + "step": 865 + }, + { + "epoch": 0.3007988885029524, + "grad_norm": 0.6421563749310066, + "learning_rate": 8.199069035336186e-06, + "loss": 0.017, + "step": 866 + }, + { + "epoch": 0.30114623133032303, + "grad_norm": 0.5207710470189775, + "learning_rate": 8.194743214401587e-06, + "loss": 0.0169, + "step": 867 + }, + { + "epoch": 0.3014935741576936, + "grad_norm": 0.6923218916041749, + "learning_rate": 8.190413348594564e-06, + "loss": 0.027, + "step": 868 + }, + { + "epoch": 0.30184091698506427, + "grad_norm": 0.5670711050522923, + "learning_rate": 8.186079443397174e-06, + "loss": 0.0278, + "step": 869 + }, + { + "epoch": 0.30218825981243486, + "grad_norm": 0.7702120413403443, + "learning_rate": 8.181741504296588e-06, + "loss": 0.0251, + "step": 870 + }, + { + "epoch": 0.3025356026398055, + "grad_norm": 0.6513641907856846, + "learning_rate": 8.17739953678508e-06, + "loss": 0.0193, + "step": 871 + }, + { + "epoch": 0.3028829454671761, + "grad_norm": 1.2819392100935951, + "learning_rate": 8.173053546360025e-06, + "loss": 0.0239, + "step": 872 + }, + { + "epoch": 0.30323028829454673, + "grad_norm": 0.8392899596493151, + "learning_rate": 8.168703538523892e-06, + "loss": 0.0281, + "step": 873 + }, + { + "epoch": 0.3035776311219173, + "grad_norm": 0.5848723579885045, + "learning_rate": 8.16434951878424e-06, + "loss": 0.0191, + "step": 874 + }, + { + "epoch": 0.30392497394928797, + "grad_norm": 1.1086246012363608, + "learning_rate": 8.1599914926537e-06, + "loss": 0.0167, + "step": 875 + }, + { + "epoch": 0.30427231677665856, + "grad_norm": 0.7106156523093721, + "learning_rate": 8.155629465649983e-06, + "loss": 0.0231, + "step": 876 + }, + { + "epoch": 0.3046196596040292, + "grad_norm": 0.770138606469383, + "learning_rate": 8.15126344329586e-06, + "loss": 0.0157, + "step": 877 + }, + { + "epoch": 0.3049670024313998, + "grad_norm": 0.47477270596349086, + "learning_rate": 8.146893431119166e-06, + "loss": 0.0281, + "step": 878 + }, + { + "epoch": 0.3053143452587704, + "grad_norm": 0.6077495292480687, + "learning_rate": 8.142519434652782e-06, + "loss": 0.0203, + "step": 879 + }, + { + "epoch": 0.305661688086141, + "grad_norm": 0.8375044162407927, + "learning_rate": 8.138141459434638e-06, + "loss": 0.0207, + "step": 880 + }, + { + "epoch": 0.3060090309135116, + "grad_norm": 0.7961579743921388, + "learning_rate": 8.133759511007697e-06, + "loss": 0.018, + "step": 881 + }, + { + "epoch": 0.30635637374088226, + "grad_norm": 0.9954610793959501, + "learning_rate": 8.129373594919957e-06, + "loss": 0.0177, + "step": 882 + }, + { + "epoch": 0.30670371656825285, + "grad_norm": 0.2668325158805243, + "learning_rate": 8.124983716724434e-06, + "loss": 0.0165, + "step": 883 + }, + { + "epoch": 0.3070510593956235, + "grad_norm": 0.7195212219421603, + "learning_rate": 8.120589881979167e-06, + "loss": 0.0174, + "step": 884 + }, + { + "epoch": 0.3073984022229941, + "grad_norm": 0.5395482390763875, + "learning_rate": 8.116192096247202e-06, + "loss": 0.0271, + "step": 885 + }, + { + "epoch": 0.3077457450503647, + "grad_norm": 1.297519296316108, + "learning_rate": 8.111790365096584e-06, + "loss": 0.0211, + "step": 886 + }, + { + "epoch": 0.3080930878777353, + "grad_norm": 1.110719079206034, + "learning_rate": 8.107384694100355e-06, + "loss": 0.0305, + "step": 887 + }, + { + "epoch": 0.30844043070510596, + "grad_norm": 0.4497669645124215, + "learning_rate": 8.102975088836551e-06, + "loss": 0.0152, + "step": 888 + }, + { + "epoch": 0.30878777353247655, + "grad_norm": 0.6803690138422418, + "learning_rate": 8.098561554888181e-06, + "loss": 0.036, + "step": 889 + }, + { + "epoch": 0.3091351163598472, + "grad_norm": 0.5046240454353891, + "learning_rate": 8.09414409784323e-06, + "loss": 0.019, + "step": 890 + }, + { + "epoch": 0.3094824591872178, + "grad_norm": 0.7226803158978641, + "learning_rate": 8.089722723294654e-06, + "loss": 0.0221, + "step": 891 + }, + { + "epoch": 0.3098298020145884, + "grad_norm": 1.0803635312237747, + "learning_rate": 8.085297436840365e-06, + "loss": 0.0224, + "step": 892 + }, + { + "epoch": 0.310177144841959, + "grad_norm": 0.41686248676010906, + "learning_rate": 8.080868244083232e-06, + "loss": 0.0257, + "step": 893 + }, + { + "epoch": 0.3105244876693296, + "grad_norm": 1.0314297720832435, + "learning_rate": 8.076435150631064e-06, + "loss": 0.0182, + "step": 894 + }, + { + "epoch": 0.31087183049670025, + "grad_norm": 2.59167850646609, + "learning_rate": 8.071998162096613e-06, + "loss": 0.0291, + "step": 895 + }, + { + "epoch": 0.31121917332407084, + "grad_norm": 1.7037505969911395, + "learning_rate": 8.06755728409756e-06, + "loss": 0.0255, + "step": 896 + }, + { + "epoch": 0.3115665161514415, + "grad_norm": 0.5736683932010955, + "learning_rate": 8.063112522256516e-06, + "loss": 0.0352, + "step": 897 + }, + { + "epoch": 0.3119138589788121, + "grad_norm": 0.6074990637984911, + "learning_rate": 8.058663882200998e-06, + "loss": 0.0221, + "step": 898 + }, + { + "epoch": 0.3122612018061827, + "grad_norm": 1.047458516096098, + "learning_rate": 8.054211369563448e-06, + "loss": 0.0239, + "step": 899 + }, + { + "epoch": 0.3126085446335533, + "grad_norm": 1.3336377703933007, + "learning_rate": 8.049754989981198e-06, + "loss": 0.0258, + "step": 900 + }, + { + "epoch": 0.31295588746092395, + "grad_norm": 0.5743775466749255, + "learning_rate": 8.045294749096485e-06, + "loss": 0.031, + "step": 901 + }, + { + "epoch": 0.31330323028829454, + "grad_norm": 0.6348423077667337, + "learning_rate": 8.040830652556429e-06, + "loss": 0.0225, + "step": 902 + }, + { + "epoch": 0.3136505731156652, + "grad_norm": 0.4690059985847731, + "learning_rate": 8.036362706013033e-06, + "loss": 0.0219, + "step": 903 + }, + { + "epoch": 0.3139979159430358, + "grad_norm": 0.6063025129771344, + "learning_rate": 8.031890915123178e-06, + "loss": 0.0306, + "step": 904 + }, + { + "epoch": 0.31434525877040637, + "grad_norm": 0.8197671720949243, + "learning_rate": 8.02741528554861e-06, + "loss": 0.023, + "step": 905 + }, + { + "epoch": 0.314692601597777, + "grad_norm": 0.6251152385832162, + "learning_rate": 8.02293582295593e-06, + "loss": 0.0215, + "step": 906 + }, + { + "epoch": 0.3150399444251476, + "grad_norm": 0.6072902912466627, + "learning_rate": 8.018452533016604e-06, + "loss": 0.0231, + "step": 907 + }, + { + "epoch": 0.31538728725251824, + "grad_norm": 0.31262167331309026, + "learning_rate": 8.01396542140693e-06, + "loss": 0.0179, + "step": 908 + }, + { + "epoch": 0.31573463007988883, + "grad_norm": 1.1201275890016398, + "learning_rate": 8.009474493808054e-06, + "loss": 0.0173, + "step": 909 + }, + { + "epoch": 0.3160819729072595, + "grad_norm": 0.614103933391006, + "learning_rate": 8.004979755905953e-06, + "loss": 0.0204, + "step": 910 + }, + { + "epoch": 0.31642931573463007, + "grad_norm": 1.2968687606304459, + "learning_rate": 8.000481213391422e-06, + "loss": 0.0304, + "step": 911 + }, + { + "epoch": 0.3167766585620007, + "grad_norm": 0.5937663477943244, + "learning_rate": 7.995978871960079e-06, + "loss": 0.0205, + "step": 912 + }, + { + "epoch": 0.3171240013893713, + "grad_norm": 0.5499626397370164, + "learning_rate": 7.991472737312351e-06, + "loss": 0.0185, + "step": 913 + }, + { + "epoch": 0.31747134421674195, + "grad_norm": 1.101916609969889, + "learning_rate": 7.986962815153466e-06, + "loss": 0.0301, + "step": 914 + }, + { + "epoch": 0.31781868704411254, + "grad_norm": 1.091262784145427, + "learning_rate": 7.982449111193445e-06, + "loss": 0.0208, + "step": 915 + }, + { + "epoch": 0.3181660298714832, + "grad_norm": 0.6932386123391473, + "learning_rate": 7.977931631147102e-06, + "loss": 0.019, + "step": 916 + }, + { + "epoch": 0.31851337269885377, + "grad_norm": 1.697367211349032, + "learning_rate": 7.97341038073403e-06, + "loss": 0.0225, + "step": 917 + }, + { + "epoch": 0.31886071552622436, + "grad_norm": 0.48030348066887196, + "learning_rate": 7.968885365678596e-06, + "loss": 0.0198, + "step": 918 + }, + { + "epoch": 0.319208058353595, + "grad_norm": 0.7911731814497744, + "learning_rate": 7.96435659170993e-06, + "loss": 0.0179, + "step": 919 + }, + { + "epoch": 0.3195554011809656, + "grad_norm": 0.507749020308773, + "learning_rate": 7.959824064561927e-06, + "loss": 0.01, + "step": 920 + }, + { + "epoch": 0.31990274400833624, + "grad_norm": 1.4662590811027465, + "learning_rate": 7.955287789973231e-06, + "loss": 0.0183, + "step": 921 + }, + { + "epoch": 0.3202500868357068, + "grad_norm": 1.2953903150942983, + "learning_rate": 7.950747773687231e-06, + "loss": 0.0191, + "step": 922 + }, + { + "epoch": 0.32059742966307747, + "grad_norm": 0.5523865660394646, + "learning_rate": 7.946204021452049e-06, + "loss": 0.0194, + "step": 923 + }, + { + "epoch": 0.32094477249044806, + "grad_norm": 0.6034008798581597, + "learning_rate": 7.941656539020546e-06, + "loss": 0.0207, + "step": 924 + }, + { + "epoch": 0.3212921153178187, + "grad_norm": 0.5589263526644, + "learning_rate": 7.9371053321503e-06, + "loss": 0.0185, + "step": 925 + }, + { + "epoch": 0.3216394581451893, + "grad_norm": 0.5269322519646701, + "learning_rate": 7.932550406603603e-06, + "loss": 0.0196, + "step": 926 + }, + { + "epoch": 0.32198680097255994, + "grad_norm": 1.2367553916721554, + "learning_rate": 7.92799176814746e-06, + "loss": 0.0224, + "step": 927 + }, + { + "epoch": 0.32233414379993053, + "grad_norm": 0.5712602991453356, + "learning_rate": 7.923429422553574e-06, + "loss": 0.0239, + "step": 928 + }, + { + "epoch": 0.3226814866273012, + "grad_norm": 0.7155877194834468, + "learning_rate": 7.91886337559834e-06, + "loss": 0.0178, + "step": 929 + }, + { + "epoch": 0.32302882945467176, + "grad_norm": 0.9800497349713431, + "learning_rate": 7.914293633062845e-06, + "loss": 0.0189, + "step": 930 + }, + { + "epoch": 0.32337617228204235, + "grad_norm": 1.6132143422048213, + "learning_rate": 7.90972020073285e-06, + "loss": 0.0229, + "step": 931 + }, + { + "epoch": 0.323723515109413, + "grad_norm": 0.7257100719458756, + "learning_rate": 7.905143084398792e-06, + "loss": 0.0172, + "step": 932 + }, + { + "epoch": 0.3240708579367836, + "grad_norm": 1.2683287542082773, + "learning_rate": 7.900562289855763e-06, + "loss": 0.017, + "step": 933 + }, + { + "epoch": 0.32441820076415423, + "grad_norm": 0.7872928535413164, + "learning_rate": 7.895977822903524e-06, + "loss": 0.0218, + "step": 934 + }, + { + "epoch": 0.3247655435915248, + "grad_norm": 0.9240714912057278, + "learning_rate": 7.891389689346479e-06, + "loss": 0.03, + "step": 935 + }, + { + "epoch": 0.32511288641889546, + "grad_norm": 1.2754448071139646, + "learning_rate": 7.886797894993674e-06, + "loss": 0.0295, + "step": 936 + }, + { + "epoch": 0.32546022924626605, + "grad_norm": 0.7875701058115853, + "learning_rate": 7.882202445658792e-06, + "loss": 0.0191, + "step": 937 + }, + { + "epoch": 0.3258075720736367, + "grad_norm": 1.1225423591432484, + "learning_rate": 7.877603347160144e-06, + "loss": 0.0174, + "step": 938 + }, + { + "epoch": 0.3261549149010073, + "grad_norm": 0.7416127576619395, + "learning_rate": 7.873000605320658e-06, + "loss": 0.0163, + "step": 939 + }, + { + "epoch": 0.32650225772837793, + "grad_norm": 0.8331153940172414, + "learning_rate": 7.868394225967881e-06, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.3268496005557485, + "grad_norm": 2.008150547125355, + "learning_rate": 7.863784214933957e-06, + "loss": 0.031, + "step": 941 + }, + { + "epoch": 0.32719694338311917, + "grad_norm": 0.7135068543742416, + "learning_rate": 7.859170578055633e-06, + "loss": 0.0239, + "step": 942 + }, + { + "epoch": 0.32754428621048975, + "grad_norm": 0.7525548520537905, + "learning_rate": 7.85455332117425e-06, + "loss": 0.0157, + "step": 943 + }, + { + "epoch": 0.32789162903786034, + "grad_norm": 0.7034077910253422, + "learning_rate": 7.849932450135726e-06, + "loss": 0.027, + "step": 944 + }, + { + "epoch": 0.328238971865231, + "grad_norm": 0.5421480780633295, + "learning_rate": 7.84530797079056e-06, + "loss": 0.0206, + "step": 945 + }, + { + "epoch": 0.3285863146926016, + "grad_norm": 0.5588408113827876, + "learning_rate": 7.84067988899381e-06, + "loss": 0.0274, + "step": 946 + }, + { + "epoch": 0.3289336575199722, + "grad_norm": 0.43480112336904514, + "learning_rate": 7.836048210605109e-06, + "loss": 0.0226, + "step": 947 + }, + { + "epoch": 0.3292810003473428, + "grad_norm": 0.769442840903224, + "learning_rate": 7.831412941488634e-06, + "loss": 0.0148, + "step": 948 + }, + { + "epoch": 0.32962834317471346, + "grad_norm": 1.3172824990527807, + "learning_rate": 7.826774087513113e-06, + "loss": 0.0276, + "step": 949 + }, + { + "epoch": 0.32997568600208405, + "grad_norm": 2.0103119905473577, + "learning_rate": 7.822131654551807e-06, + "loss": 0.0295, + "step": 950 + }, + { + "epoch": 0.3303230288294547, + "grad_norm": 0.41890011549619927, + "learning_rate": 7.817485648482514e-06, + "loss": 0.0203, + "step": 951 + }, + { + "epoch": 0.3306703716568253, + "grad_norm": 0.34546681427916026, + "learning_rate": 7.812836075187555e-06, + "loss": 0.0159, + "step": 952 + }, + { + "epoch": 0.3310177144841959, + "grad_norm": 1.3371854680462747, + "learning_rate": 7.808182940553765e-06, + "loss": 0.0306, + "step": 953 + }, + { + "epoch": 0.3313650573115665, + "grad_norm": 1.8044729130465431, + "learning_rate": 7.803526250472488e-06, + "loss": 0.0338, + "step": 954 + }, + { + "epoch": 0.33171240013893716, + "grad_norm": 1.0737141556403031, + "learning_rate": 7.798866010839577e-06, + "loss": 0.016, + "step": 955 + }, + { + "epoch": 0.33205974296630775, + "grad_norm": 0.9299379110949423, + "learning_rate": 7.794202227555365e-06, + "loss": 0.0242, + "step": 956 + }, + { + "epoch": 0.33240708579367834, + "grad_norm": 0.6512501538111303, + "learning_rate": 7.789534906524684e-06, + "loss": 0.0259, + "step": 957 + }, + { + "epoch": 0.332754428621049, + "grad_norm": 0.7546601828240426, + "learning_rate": 7.784864053656842e-06, + "loss": 0.0251, + "step": 958 + }, + { + "epoch": 0.33310177144841957, + "grad_norm": 0.4817810206017834, + "learning_rate": 7.780189674865617e-06, + "loss": 0.0198, + "step": 959 + }, + { + "epoch": 0.3334491142757902, + "grad_norm": 1.3719190919474487, + "learning_rate": 7.77551177606925e-06, + "loss": 0.0219, + "step": 960 + }, + { + "epoch": 0.3337964571031608, + "grad_norm": 1.3853549674094732, + "learning_rate": 7.770830363190442e-06, + "loss": 0.0184, + "step": 961 + }, + { + "epoch": 0.33414379993053145, + "grad_norm": 0.9212792032039464, + "learning_rate": 7.76614544215634e-06, + "loss": 0.0238, + "step": 962 + }, + { + "epoch": 0.33449114275790204, + "grad_norm": 0.5577408235128498, + "learning_rate": 7.761457018898536e-06, + "loss": 0.0222, + "step": 963 + }, + { + "epoch": 0.3348384855852727, + "grad_norm": 1.9775404910250387, + "learning_rate": 7.756765099353052e-06, + "loss": 0.0236, + "step": 964 + }, + { + "epoch": 0.33518582841264327, + "grad_norm": 1.3737059966757152, + "learning_rate": 7.752069689460345e-06, + "loss": 0.025, + "step": 965 + }, + { + "epoch": 0.3355331712400139, + "grad_norm": 0.5574547827963353, + "learning_rate": 7.747370795165277e-06, + "loss": 0.0161, + "step": 966 + }, + { + "epoch": 0.3358805140673845, + "grad_norm": 0.9820016637326211, + "learning_rate": 7.742668422417137e-06, + "loss": 0.0133, + "step": 967 + }, + { + "epoch": 0.33622785689475515, + "grad_norm": 0.4953092620978607, + "learning_rate": 7.737962577169606e-06, + "loss": 0.0134, + "step": 968 + }, + { + "epoch": 0.33657519972212574, + "grad_norm": 0.8358656568579974, + "learning_rate": 7.73325326538077e-06, + "loss": 0.0209, + "step": 969 + }, + { + "epoch": 0.33692254254949633, + "grad_norm": 1.3762297024347394, + "learning_rate": 7.728540493013098e-06, + "loss": 0.0221, + "step": 970 + }, + { + "epoch": 0.337269885376867, + "grad_norm": 1.8553914564827385, + "learning_rate": 7.723824266033444e-06, + "loss": 0.0247, + "step": 971 + }, + { + "epoch": 0.33761722820423756, + "grad_norm": 1.3117680989693183, + "learning_rate": 7.719104590413036e-06, + "loss": 0.0241, + "step": 972 + }, + { + "epoch": 0.3379645710316082, + "grad_norm": 1.1924550600193087, + "learning_rate": 7.714381472127466e-06, + "loss": 0.0251, + "step": 973 + }, + { + "epoch": 0.3383119138589788, + "grad_norm": 0.7787134219349535, + "learning_rate": 7.709654917156683e-06, + "loss": 0.0213, + "step": 974 + }, + { + "epoch": 0.33865925668634944, + "grad_norm": 0.5251026227118649, + "learning_rate": 7.704924931484997e-06, + "loss": 0.0135, + "step": 975 + }, + { + "epoch": 0.33900659951372003, + "grad_norm": 0.6878735825950388, + "learning_rate": 7.700191521101047e-06, + "loss": 0.0193, + "step": 976 + }, + { + "epoch": 0.3393539423410907, + "grad_norm": 1.2530620449693388, + "learning_rate": 7.695454691997824e-06, + "loss": 0.0234, + "step": 977 + }, + { + "epoch": 0.33970128516846126, + "grad_norm": 1.9903113138666264, + "learning_rate": 7.690714450172633e-06, + "loss": 0.027, + "step": 978 + }, + { + "epoch": 0.3400486279958319, + "grad_norm": 0.9816472108674068, + "learning_rate": 7.685970801627108e-06, + "loss": 0.0153, + "step": 979 + }, + { + "epoch": 0.3403959708232025, + "grad_norm": 1.218057722488078, + "learning_rate": 7.681223752367195e-06, + "loss": 0.0261, + "step": 980 + }, + { + "epoch": 0.34074331365057314, + "grad_norm": 0.5081769560208316, + "learning_rate": 7.676473308403142e-06, + "loss": 0.0149, + "step": 981 + }, + { + "epoch": 0.34109065647794373, + "grad_norm": 0.541037545259369, + "learning_rate": 7.671719475749502e-06, + "loss": 0.0179, + "step": 982 + }, + { + "epoch": 0.3414379993053143, + "grad_norm": 1.0709500749793777, + "learning_rate": 7.666962260425113e-06, + "loss": 0.0296, + "step": 983 + }, + { + "epoch": 0.34178534213268497, + "grad_norm": 1.0183627382474816, + "learning_rate": 7.662201668453098e-06, + "loss": 0.0241, + "step": 984 + }, + { + "epoch": 0.34213268496005556, + "grad_norm": 0.490710306987966, + "learning_rate": 7.657437705860853e-06, + "loss": 0.0083, + "step": 985 + }, + { + "epoch": 0.3424800277874262, + "grad_norm": 0.9120936689637056, + "learning_rate": 7.652670378680043e-06, + "loss": 0.0183, + "step": 986 + }, + { + "epoch": 0.3428273706147968, + "grad_norm": 0.8239513692459495, + "learning_rate": 7.647899692946594e-06, + "loss": 0.0199, + "step": 987 + }, + { + "epoch": 0.34317471344216743, + "grad_norm": 0.6785851737249843, + "learning_rate": 7.643125654700684e-06, + "loss": 0.0203, + "step": 988 + }, + { + "epoch": 0.343522056269538, + "grad_norm": 0.9952364860290294, + "learning_rate": 7.638348269986733e-06, + "loss": 0.0233, + "step": 989 + }, + { + "epoch": 0.34386939909690867, + "grad_norm": 0.7687287712488833, + "learning_rate": 7.6335675448534e-06, + "loss": 0.0143, + "step": 990 + }, + { + "epoch": 0.34421674192427926, + "grad_norm": 1.0499353749760423, + "learning_rate": 7.628783485353573e-06, + "loss": 0.0259, + "step": 991 + }, + { + "epoch": 0.3445640847516499, + "grad_norm": 1.474581388095261, + "learning_rate": 7.623996097544364e-06, + "loss": 0.0258, + "step": 992 + }, + { + "epoch": 0.3449114275790205, + "grad_norm": 1.8524514914939911, + "learning_rate": 7.619205387487094e-06, + "loss": 0.0263, + "step": 993 + }, + { + "epoch": 0.34525877040639114, + "grad_norm": 1.9787996414155324, + "learning_rate": 7.614411361247296e-06, + "loss": 0.0241, + "step": 994 + }, + { + "epoch": 0.3456061132337617, + "grad_norm": 0.9020458030038447, + "learning_rate": 7.609614024894694e-06, + "loss": 0.0282, + "step": 995 + }, + { + "epoch": 0.3459534560611323, + "grad_norm": 1.6705983295073557, + "learning_rate": 7.604813384503212e-06, + "loss": 0.0297, + "step": 996 + }, + { + "epoch": 0.34630079888850296, + "grad_norm": 0.6271112596802708, + "learning_rate": 7.600009446150951e-06, + "loss": 0.0202, + "step": 997 + }, + { + "epoch": 0.34664814171587355, + "grad_norm": 0.43730341616505064, + "learning_rate": 7.59520221592019e-06, + "loss": 0.0165, + "step": 998 + }, + { + "epoch": 0.3469954845432442, + "grad_norm": 0.6300030347921803, + "learning_rate": 7.5903916998973745e-06, + "loss": 0.0226, + "step": 999 + }, + { + "epoch": 0.3473428273706148, + "grad_norm": 0.6996825232281534, + "learning_rate": 7.585577904173113e-06, + "loss": 0.0172, + "step": 1000 + }, + { + "epoch": 0.3476901701979854, + "grad_norm": 0.6380166925486069, + "learning_rate": 7.580760834842162e-06, + "loss": 0.0201, + "step": 1001 + }, + { + "epoch": 0.348037513025356, + "grad_norm": 1.3568481449306653, + "learning_rate": 7.575940498003425e-06, + "loss": 0.0212, + "step": 1002 + }, + { + "epoch": 0.34838485585272666, + "grad_norm": 1.2549578263294605, + "learning_rate": 7.571116899759945e-06, + "loss": 0.0238, + "step": 1003 + }, + { + "epoch": 0.34873219868009725, + "grad_norm": 1.0886856632970126, + "learning_rate": 7.566290046218889e-06, + "loss": 0.0243, + "step": 1004 + }, + { + "epoch": 0.3490795415074679, + "grad_norm": 0.4043124260071018, + "learning_rate": 7.5614599434915514e-06, + "loss": 0.0149, + "step": 1005 + }, + { + "epoch": 0.3494268843348385, + "grad_norm": 0.4143945807665345, + "learning_rate": 7.556626597693335e-06, + "loss": 0.0172, + "step": 1006 + }, + { + "epoch": 0.34977422716220913, + "grad_norm": 0.47662985164661864, + "learning_rate": 7.551790014943752e-06, + "loss": 0.0194, + "step": 1007 + }, + { + "epoch": 0.3501215699895797, + "grad_norm": 0.9261090190118699, + "learning_rate": 7.546950201366412e-06, + "loss": 0.0261, + "step": 1008 + }, + { + "epoch": 0.3504689128169503, + "grad_norm": 1.2825084254203554, + "learning_rate": 7.542107163089016e-06, + "loss": 0.0151, + "step": 1009 + }, + { + "epoch": 0.35081625564432095, + "grad_norm": 0.8371175718056099, + "learning_rate": 7.537260906243344e-06, + "loss": 0.0267, + "step": 1010 + }, + { + "epoch": 0.35116359847169154, + "grad_norm": 0.6892235320884719, + "learning_rate": 7.532411436965258e-06, + "loss": 0.0253, + "step": 1011 + }, + { + "epoch": 0.3515109412990622, + "grad_norm": 0.731192690685414, + "learning_rate": 7.52755876139468e-06, + "loss": 0.0139, + "step": 1012 + }, + { + "epoch": 0.3518582841264328, + "grad_norm": 0.6312927924236015, + "learning_rate": 7.522702885675597e-06, + "loss": 0.0228, + "step": 1013 + }, + { + "epoch": 0.3522056269538034, + "grad_norm": 0.9551305586895227, + "learning_rate": 7.517843815956045e-06, + "loss": 0.0178, + "step": 1014 + }, + { + "epoch": 0.352552969781174, + "grad_norm": 0.5677401120338555, + "learning_rate": 7.512981558388101e-06, + "loss": 0.0225, + "step": 1015 + }, + { + "epoch": 0.35290031260854465, + "grad_norm": 0.7040980402544591, + "learning_rate": 7.5081161191278874e-06, + "loss": 0.0176, + "step": 1016 + }, + { + "epoch": 0.35324765543591524, + "grad_norm": 0.9674600691403624, + "learning_rate": 7.5032475043355444e-06, + "loss": 0.0243, + "step": 1017 + }, + { + "epoch": 0.3535949982632859, + "grad_norm": 0.6023226292917273, + "learning_rate": 7.498375720175239e-06, + "loss": 0.0183, + "step": 1018 + }, + { + "epoch": 0.3539423410906565, + "grad_norm": 0.9673022657430923, + "learning_rate": 7.49350077281515e-06, + "loss": 0.0287, + "step": 1019 + }, + { + "epoch": 0.35428968391802707, + "grad_norm": 0.606921696103078, + "learning_rate": 7.48862266842746e-06, + "loss": 0.0226, + "step": 1020 + }, + { + "epoch": 0.3546370267453977, + "grad_norm": 0.9175466939892832, + "learning_rate": 7.483741413188349e-06, + "loss": 0.0205, + "step": 1021 + }, + { + "epoch": 0.3549843695727683, + "grad_norm": 0.5474768689731467, + "learning_rate": 7.478857013277987e-06, + "loss": 0.0253, + "step": 1022 + }, + { + "epoch": 0.35533171240013894, + "grad_norm": 0.7005547254579504, + "learning_rate": 7.473969474880527e-06, + "loss": 0.0239, + "step": 1023 + }, + { + "epoch": 0.35567905522750953, + "grad_norm": 0.6566673110253187, + "learning_rate": 7.469078804184088e-06, + "loss": 0.0153, + "step": 1024 + }, + { + "epoch": 0.3560263980548802, + "grad_norm": 0.44731824465158737, + "learning_rate": 7.464185007380767e-06, + "loss": 0.0226, + "step": 1025 + }, + { + "epoch": 0.35637374088225077, + "grad_norm": 0.929371757186016, + "learning_rate": 7.459288090666605e-06, + "loss": 0.0215, + "step": 1026 + }, + { + "epoch": 0.3567210837096214, + "grad_norm": 1.754520347098642, + "learning_rate": 7.45438806024161e-06, + "loss": 0.0245, + "step": 1027 + }, + { + "epoch": 0.357068426536992, + "grad_norm": 1.154684337826048, + "learning_rate": 7.449484922309713e-06, + "loss": 0.0183, + "step": 1028 + }, + { + "epoch": 0.35741576936436265, + "grad_norm": 0.7743738340491029, + "learning_rate": 7.444578683078798e-06, + "loss": 0.0265, + "step": 1029 + }, + { + "epoch": 0.35776311219173323, + "grad_norm": 0.39037178440555265, + "learning_rate": 7.4396693487606605e-06, + "loss": 0.0183, + "step": 1030 + }, + { + "epoch": 0.3581104550191039, + "grad_norm": 0.6375860488882318, + "learning_rate": 7.4347569255710254e-06, + "loss": 0.0309, + "step": 1031 + }, + { + "epoch": 0.35845779784647447, + "grad_norm": 0.6020351665546125, + "learning_rate": 7.429841419729521e-06, + "loss": 0.025, + "step": 1032 + }, + { + "epoch": 0.35880514067384506, + "grad_norm": 1.3286872104930505, + "learning_rate": 7.424922837459683e-06, + "loss": 0.0224, + "step": 1033 + }, + { + "epoch": 0.3591524835012157, + "grad_norm": 0.5744413112024466, + "learning_rate": 7.42000118498894e-06, + "loss": 0.0184, + "step": 1034 + }, + { + "epoch": 0.3594998263285863, + "grad_norm": 1.1943187896682033, + "learning_rate": 7.41507646854861e-06, + "loss": 0.0262, + "step": 1035 + }, + { + "epoch": 0.35984716915595694, + "grad_norm": 0.5497582747698295, + "learning_rate": 7.4101486943738865e-06, + "loss": 0.0211, + "step": 1036 + }, + { + "epoch": 0.3601945119833275, + "grad_norm": 0.4778102052106275, + "learning_rate": 7.405217868703839e-06, + "loss": 0.0165, + "step": 1037 + }, + { + "epoch": 0.36054185481069817, + "grad_norm": 0.7214250751175056, + "learning_rate": 7.400283997781397e-06, + "loss": 0.0286, + "step": 1038 + }, + { + "epoch": 0.36088919763806876, + "grad_norm": 0.7687435033252493, + "learning_rate": 7.395347087853349e-06, + "loss": 0.0214, + "step": 1039 + }, + { + "epoch": 0.3612365404654394, + "grad_norm": 1.0790158493374187, + "learning_rate": 7.390407145170325e-06, + "loss": 0.027, + "step": 1040 + }, + { + "epoch": 0.36158388329281, + "grad_norm": 0.37118096549279656, + "learning_rate": 7.385464175986803e-06, + "loss": 0.0124, + "step": 1041 + }, + { + "epoch": 0.36193122612018064, + "grad_norm": 1.4499606402413763, + "learning_rate": 7.380518186561086e-06, + "loss": 0.0224, + "step": 1042 + }, + { + "epoch": 0.3622785689475512, + "grad_norm": 1.7916464514957589, + "learning_rate": 7.375569183155306e-06, + "loss": 0.0269, + "step": 1043 + }, + { + "epoch": 0.36262591177492187, + "grad_norm": 0.6417320309447412, + "learning_rate": 7.370617172035406e-06, + "loss": 0.0187, + "step": 1044 + }, + { + "epoch": 0.36297325460229246, + "grad_norm": 0.9602465969222208, + "learning_rate": 7.365662159471142e-06, + "loss": 0.0171, + "step": 1045 + }, + { + "epoch": 0.36332059742966305, + "grad_norm": 0.5700015399771756, + "learning_rate": 7.3607041517360666e-06, + "loss": 0.023, + "step": 1046 + }, + { + "epoch": 0.3636679402570337, + "grad_norm": 0.42616867425402144, + "learning_rate": 7.355743155107526e-06, + "loss": 0.0185, + "step": 1047 + }, + { + "epoch": 0.3640152830844043, + "grad_norm": 0.5254170573018199, + "learning_rate": 7.3507791758666514e-06, + "loss": 0.0125, + "step": 1048 + }, + { + "epoch": 0.36436262591177493, + "grad_norm": 0.8659700459185796, + "learning_rate": 7.3458122202983495e-06, + "loss": 0.0184, + "step": 1049 + }, + { + "epoch": 0.3647099687391455, + "grad_norm": 0.78273420283934, + "learning_rate": 7.340842294691292e-06, + "loss": 0.0134, + "step": 1050 + }, + { + "epoch": 0.36505731156651616, + "grad_norm": 1.2893737814650572, + "learning_rate": 7.335869405337919e-06, + "loss": 0.0188, + "step": 1051 + }, + { + "epoch": 0.36540465439388675, + "grad_norm": 0.901363581151114, + "learning_rate": 7.3308935585344135e-06, + "loss": 0.0217, + "step": 1052 + }, + { + "epoch": 0.3657519972212574, + "grad_norm": 0.481669992576634, + "learning_rate": 7.325914760580712e-06, + "loss": 0.0102, + "step": 1053 + }, + { + "epoch": 0.366099340048628, + "grad_norm": 1.0260877050410049, + "learning_rate": 7.32093301778048e-06, + "loss": 0.0263, + "step": 1054 + }, + { + "epoch": 0.36644668287599863, + "grad_norm": 0.8353120616059849, + "learning_rate": 7.3159483364411175e-06, + "loss": 0.022, + "step": 1055 + }, + { + "epoch": 0.3667940257033692, + "grad_norm": 0.7273624057486092, + "learning_rate": 7.310960722873739e-06, + "loss": 0.0308, + "step": 1056 + }, + { + "epoch": 0.36714136853073986, + "grad_norm": 1.13748674597797, + "learning_rate": 7.3059701833931766e-06, + "loss": 0.0135, + "step": 1057 + }, + { + "epoch": 0.36748871135811045, + "grad_norm": 0.6859340482408613, + "learning_rate": 7.300976724317964e-06, + "loss": 0.016, + "step": 1058 + }, + { + "epoch": 0.36783605418548104, + "grad_norm": 1.2719149842779836, + "learning_rate": 7.295980351970331e-06, + "loss": 0.0178, + "step": 1059 + }, + { + "epoch": 0.3681833970128517, + "grad_norm": 1.0679079371197404, + "learning_rate": 7.290981072676202e-06, + "loss": 0.0333, + "step": 1060 + }, + { + "epoch": 0.3685307398402223, + "grad_norm": 0.5473217513456246, + "learning_rate": 7.285978892765171e-06, + "loss": 0.0196, + "step": 1061 + }, + { + "epoch": 0.3688780826675929, + "grad_norm": 0.6477963035607689, + "learning_rate": 7.280973818570515e-06, + "loss": 0.0229, + "step": 1062 + }, + { + "epoch": 0.3692254254949635, + "grad_norm": 0.9779272803664357, + "learning_rate": 7.275965856429167e-06, + "loss": 0.0148, + "step": 1063 + }, + { + "epoch": 0.36957276832233416, + "grad_norm": 0.6549296775726913, + "learning_rate": 7.270955012681726e-06, + "loss": 0.0219, + "step": 1064 + }, + { + "epoch": 0.36992011114970474, + "grad_norm": 0.8692466378138357, + "learning_rate": 7.26594129367243e-06, + "loss": 0.0202, + "step": 1065 + }, + { + "epoch": 0.3702674539770754, + "grad_norm": 0.6315864672971158, + "learning_rate": 7.26092470574916e-06, + "loss": 0.0181, + "step": 1066 + }, + { + "epoch": 0.370614796804446, + "grad_norm": 0.8066486900818035, + "learning_rate": 7.255905255263434e-06, + "loss": 0.0234, + "step": 1067 + }, + { + "epoch": 0.3709621396318166, + "grad_norm": 0.8450924908715876, + "learning_rate": 7.25088294857039e-06, + "loss": 0.0212, + "step": 1068 + }, + { + "epoch": 0.3713094824591872, + "grad_norm": 0.7148838980127935, + "learning_rate": 7.245857792028781e-06, + "loss": 0.0204, + "step": 1069 + }, + { + "epoch": 0.37165682528655786, + "grad_norm": 0.7769253902283783, + "learning_rate": 7.240829792000974e-06, + "loss": 0.0184, + "step": 1070 + }, + { + "epoch": 0.37200416811392845, + "grad_norm": 0.5818321434788865, + "learning_rate": 7.235798954852929e-06, + "loss": 0.0218, + "step": 1071 + }, + { + "epoch": 0.37235151094129904, + "grad_norm": 0.7181155976463919, + "learning_rate": 7.230765286954204e-06, + "loss": 0.0229, + "step": 1072 + }, + { + "epoch": 0.3726988537686697, + "grad_norm": 0.5311889445893856, + "learning_rate": 7.2257287946779365e-06, + "loss": 0.0208, + "step": 1073 + }, + { + "epoch": 0.37304619659604027, + "grad_norm": 0.6470157370215145, + "learning_rate": 7.220689484400844e-06, + "loss": 0.0286, + "step": 1074 + }, + { + "epoch": 0.3733935394234109, + "grad_norm": 0.5839896666697149, + "learning_rate": 7.2156473625032075e-06, + "loss": 0.0208, + "step": 1075 + }, + { + "epoch": 0.3737408822507815, + "grad_norm": 0.5003859905535379, + "learning_rate": 7.210602435368873e-06, + "loss": 0.0277, + "step": 1076 + }, + { + "epoch": 0.37408822507815215, + "grad_norm": 0.938126994232941, + "learning_rate": 7.205554709385234e-06, + "loss": 0.0193, + "step": 1077 + }, + { + "epoch": 0.37443556790552274, + "grad_norm": 0.9825447432197518, + "learning_rate": 7.20050419094323e-06, + "loss": 0.0327, + "step": 1078 + }, + { + "epoch": 0.3747829107328934, + "grad_norm": 0.5808094753024735, + "learning_rate": 7.195450886437334e-06, + "loss": 0.0233, + "step": 1079 + }, + { + "epoch": 0.37513025356026397, + "grad_norm": 0.729519930229865, + "learning_rate": 7.190394802265548e-06, + "loss": 0.0254, + "step": 1080 + }, + { + "epoch": 0.3754775963876346, + "grad_norm": 0.32897954890179926, + "learning_rate": 7.185335944829391e-06, + "loss": 0.0193, + "step": 1081 + }, + { + "epoch": 0.3758249392150052, + "grad_norm": 0.4730691255515786, + "learning_rate": 7.1802743205339e-06, + "loss": 0.0255, + "step": 1082 + }, + { + "epoch": 0.37617228204237585, + "grad_norm": 0.8306626500245984, + "learning_rate": 7.175209935787605e-06, + "loss": 0.0261, + "step": 1083 + }, + { + "epoch": 0.37651962486974644, + "grad_norm": 0.5382124730055174, + "learning_rate": 7.17014279700254e-06, + "loss": 0.025, + "step": 1084 + }, + { + "epoch": 0.37686696769711703, + "grad_norm": 0.6717725725696359, + "learning_rate": 7.165072910594219e-06, + "loss": 0.0271, + "step": 1085 + }, + { + "epoch": 0.3772143105244877, + "grad_norm": 0.7370235521381453, + "learning_rate": 7.160000282981641e-06, + "loss": 0.0273, + "step": 1086 + }, + { + "epoch": 0.37756165335185826, + "grad_norm": 0.4891684754047165, + "learning_rate": 7.154924920587269e-06, + "loss": 0.0264, + "step": 1087 + }, + { + "epoch": 0.3779089961792289, + "grad_norm": 0.3231257861866142, + "learning_rate": 7.149846829837036e-06, + "loss": 0.0206, + "step": 1088 + }, + { + "epoch": 0.3782563390065995, + "grad_norm": 0.4101311080573536, + "learning_rate": 7.144766017160324e-06, + "loss": 0.0205, + "step": 1089 + }, + { + "epoch": 0.37860368183397014, + "grad_norm": 0.8294757845956588, + "learning_rate": 7.139682488989961e-06, + "loss": 0.0265, + "step": 1090 + }, + { + "epoch": 0.37895102466134073, + "grad_norm": 0.5156785295009514, + "learning_rate": 7.134596251762217e-06, + "loss": 0.0209, + "step": 1091 + }, + { + "epoch": 0.3792983674887114, + "grad_norm": 0.6068386255398981, + "learning_rate": 7.129507311916789e-06, + "loss": 0.0191, + "step": 1092 + }, + { + "epoch": 0.37964571031608196, + "grad_norm": 0.42053223931776523, + "learning_rate": 7.124415675896796e-06, + "loss": 0.018, + "step": 1093 + }, + { + "epoch": 0.3799930531434526, + "grad_norm": 1.529702363413615, + "learning_rate": 7.119321350148772e-06, + "loss": 0.0305, + "step": 1094 + }, + { + "epoch": 0.3803403959708232, + "grad_norm": 0.9603413062647468, + "learning_rate": 7.114224341122655e-06, + "loss": 0.0211, + "step": 1095 + }, + { + "epoch": 0.38068773879819384, + "grad_norm": 0.46028100214920925, + "learning_rate": 7.109124655271782e-06, + "loss": 0.0121, + "step": 1096 + }, + { + "epoch": 0.38103508162556443, + "grad_norm": 0.38425802665843994, + "learning_rate": 7.1040222990528775e-06, + "loss": 0.0165, + "step": 1097 + }, + { + "epoch": 0.381382424452935, + "grad_norm": 0.34490477943437864, + "learning_rate": 7.098917278926046e-06, + "loss": 0.0215, + "step": 1098 + }, + { + "epoch": 0.38172976728030567, + "grad_norm": 0.4917657059195774, + "learning_rate": 7.093809601354769e-06, + "loss": 0.0193, + "step": 1099 + }, + { + "epoch": 0.38207711010767625, + "grad_norm": 0.3304192390082568, + "learning_rate": 7.088699272805888e-06, + "loss": 0.0161, + "step": 1100 + }, + { + "epoch": 0.3824244529350469, + "grad_norm": 0.8538379691183342, + "learning_rate": 7.0835862997496045e-06, + "loss": 0.0174, + "step": 1101 + }, + { + "epoch": 0.3827717957624175, + "grad_norm": 1.066827953382512, + "learning_rate": 7.078470688659465e-06, + "loss": 0.0257, + "step": 1102 + }, + { + "epoch": 0.38311913858978813, + "grad_norm": 0.7479008835369523, + "learning_rate": 7.073352446012357e-06, + "loss": 0.0213, + "step": 1103 + }, + { + "epoch": 0.3834664814171587, + "grad_norm": 1.0510022725835733, + "learning_rate": 7.068231578288502e-06, + "loss": 0.0259, + "step": 1104 + }, + { + "epoch": 0.38381382424452937, + "grad_norm": 0.6431698987590786, + "learning_rate": 7.063108091971444e-06, + "loss": 0.0161, + "step": 1105 + }, + { + "epoch": 0.38416116707189996, + "grad_norm": 0.626180547232632, + "learning_rate": 7.05798199354804e-06, + "loss": 0.015, + "step": 1106 + }, + { + "epoch": 0.3845085098992706, + "grad_norm": 0.7399567466553991, + "learning_rate": 7.052853289508458e-06, + "loss": 0.0229, + "step": 1107 + }, + { + "epoch": 0.3848558527266412, + "grad_norm": 0.5500014449728272, + "learning_rate": 7.047721986346161e-06, + "loss": 0.0232, + "step": 1108 + }, + { + "epoch": 0.38520319555401183, + "grad_norm": 1.4534975737472637, + "learning_rate": 7.042588090557906e-06, + "loss": 0.0228, + "step": 1109 + }, + { + "epoch": 0.3855505383813824, + "grad_norm": 1.3334339962937882, + "learning_rate": 7.037451608643732e-06, + "loss": 0.0201, + "step": 1110 + }, + { + "epoch": 0.385897881208753, + "grad_norm": 1.0089775574518236, + "learning_rate": 7.03231254710695e-06, + "loss": 0.0184, + "step": 1111 + }, + { + "epoch": 0.38624522403612366, + "grad_norm": 0.439074094269593, + "learning_rate": 7.027170912454141e-06, + "loss": 0.0162, + "step": 1112 + }, + { + "epoch": 0.38659256686349425, + "grad_norm": 0.8348064008657952, + "learning_rate": 7.02202671119514e-06, + "loss": 0.017, + "step": 1113 + }, + { + "epoch": 0.3869399096908649, + "grad_norm": 0.7951186556024389, + "learning_rate": 7.016879949843032e-06, + "loss": 0.0163, + "step": 1114 + }, + { + "epoch": 0.3872872525182355, + "grad_norm": 0.7523417760723944, + "learning_rate": 7.0117306349141485e-06, + "loss": 0.0236, + "step": 1115 + }, + { + "epoch": 0.3876345953456061, + "grad_norm": 1.3735341548254056, + "learning_rate": 7.006578772928045e-06, + "loss": 0.0218, + "step": 1116 + }, + { + "epoch": 0.3879819381729767, + "grad_norm": 0.5385452791468442, + "learning_rate": 7.0014243704075115e-06, + "loss": 0.0205, + "step": 1117 + }, + { + "epoch": 0.38832928100034736, + "grad_norm": 0.8605606189346207, + "learning_rate": 6.996267433878545e-06, + "loss": 0.0222, + "step": 1118 + }, + { + "epoch": 0.38867662382771795, + "grad_norm": 0.823981661806264, + "learning_rate": 6.991107969870363e-06, + "loss": 0.0264, + "step": 1119 + }, + { + "epoch": 0.3890239666550886, + "grad_norm": 0.7575159321093746, + "learning_rate": 6.985945984915368e-06, + "loss": 0.0229, + "step": 1120 + }, + { + "epoch": 0.3893713094824592, + "grad_norm": 0.45426836006848514, + "learning_rate": 6.9807814855491685e-06, + "loss": 0.0162, + "step": 1121 + }, + { + "epoch": 0.3897186523098298, + "grad_norm": 1.2049213408577124, + "learning_rate": 6.975614478310546e-06, + "loss": 0.0322, + "step": 1122 + }, + { + "epoch": 0.3900659951372004, + "grad_norm": 0.7306802680657365, + "learning_rate": 6.970444969741462e-06, + "loss": 0.0198, + "step": 1123 + }, + { + "epoch": 0.390413337964571, + "grad_norm": 0.7391729308842311, + "learning_rate": 6.965272966387046e-06, + "loss": 0.0184, + "step": 1124 + }, + { + "epoch": 0.39076068079194165, + "grad_norm": 0.6086940453113556, + "learning_rate": 6.960098474795583e-06, + "loss": 0.0243, + "step": 1125 + }, + { + "epoch": 0.39110802361931224, + "grad_norm": 0.4851910309801865, + "learning_rate": 6.954921501518511e-06, + "loss": 0.012, + "step": 1126 + }, + { + "epoch": 0.3914553664466829, + "grad_norm": 0.45385633767002753, + "learning_rate": 6.949742053110408e-06, + "loss": 0.0198, + "step": 1127 + }, + { + "epoch": 0.3918027092740535, + "grad_norm": 1.1129782341928363, + "learning_rate": 6.944560136128986e-06, + "loss": 0.0287, + "step": 1128 + }, + { + "epoch": 0.3921500521014241, + "grad_norm": 0.7720433053769181, + "learning_rate": 6.939375757135085e-06, + "loss": 0.0238, + "step": 1129 + }, + { + "epoch": 0.3924973949287947, + "grad_norm": 1.5172156671202623, + "learning_rate": 6.934188922692659e-06, + "loss": 0.0254, + "step": 1130 + }, + { + "epoch": 0.39284473775616535, + "grad_norm": 0.5132675364055385, + "learning_rate": 6.928999639368773e-06, + "loss": 0.0176, + "step": 1131 + }, + { + "epoch": 0.39319208058353594, + "grad_norm": 0.9917189063692697, + "learning_rate": 6.923807913733591e-06, + "loss": 0.0212, + "step": 1132 + }, + { + "epoch": 0.3935394234109066, + "grad_norm": 1.0346614661684557, + "learning_rate": 6.918613752360369e-06, + "loss": 0.0237, + "step": 1133 + }, + { + "epoch": 0.3938867662382772, + "grad_norm": 0.7774794514027609, + "learning_rate": 6.913417161825449e-06, + "loss": 0.021, + "step": 1134 + }, + { + "epoch": 0.3942341090656478, + "grad_norm": 0.8284608514835325, + "learning_rate": 6.908218148708248e-06, + "loss": 0.0196, + "step": 1135 + }, + { + "epoch": 0.3945814518930184, + "grad_norm": 0.5659423206737776, + "learning_rate": 6.903016719591249e-06, + "loss": 0.0221, + "step": 1136 + }, + { + "epoch": 0.394928794720389, + "grad_norm": 1.0709171868083134, + "learning_rate": 6.8978128810599935e-06, + "loss": 0.035, + "step": 1137 + }, + { + "epoch": 0.39527613754775964, + "grad_norm": 0.6527457095839493, + "learning_rate": 6.8926066397030745e-06, + "loss": 0.0204, + "step": 1138 + }, + { + "epoch": 0.39562348037513023, + "grad_norm": 0.5573043903274787, + "learning_rate": 6.887398002112129e-06, + "loss": 0.0211, + "step": 1139 + }, + { + "epoch": 0.3959708232025009, + "grad_norm": 0.8397780950700933, + "learning_rate": 6.8821869748818235e-06, + "loss": 0.018, + "step": 1140 + }, + { + "epoch": 0.39631816602987147, + "grad_norm": 0.39026678264711556, + "learning_rate": 6.876973564609857e-06, + "loss": 0.0175, + "step": 1141 + }, + { + "epoch": 0.3966655088572421, + "grad_norm": 0.5775231347276166, + "learning_rate": 6.871757777896937e-06, + "loss": 0.0176, + "step": 1142 + }, + { + "epoch": 0.3970128516846127, + "grad_norm": 0.3504293355564611, + "learning_rate": 6.866539621346786e-06, + "loss": 0.0136, + "step": 1143 + }, + { + "epoch": 0.39736019451198334, + "grad_norm": 0.3267695926847211, + "learning_rate": 6.861319101566126e-06, + "loss": 0.0152, + "step": 1144 + }, + { + "epoch": 0.39770753733935393, + "grad_norm": 1.0251884908691054, + "learning_rate": 6.856096225164669e-06, + "loss": 0.017, + "step": 1145 + }, + { + "epoch": 0.3980548801667246, + "grad_norm": 0.484956168822129, + "learning_rate": 6.850870998755111e-06, + "loss": 0.0192, + "step": 1146 + }, + { + "epoch": 0.39840222299409517, + "grad_norm": 0.6237901046105527, + "learning_rate": 6.845643428953127e-06, + "loss": 0.0322, + "step": 1147 + }, + { + "epoch": 0.3987495658214658, + "grad_norm": 1.2212183860294297, + "learning_rate": 6.840413522377355e-06, + "loss": 0.0211, + "step": 1148 + }, + { + "epoch": 0.3990969086488364, + "grad_norm": 0.5405735574477039, + "learning_rate": 6.8351812856493905e-06, + "loss": 0.0227, + "step": 1149 + }, + { + "epoch": 0.399444251476207, + "grad_norm": 1.7602429727127982, + "learning_rate": 6.829946725393787e-06, + "loss": 0.0264, + "step": 1150 + }, + { + "epoch": 0.39979159430357764, + "grad_norm": 0.9253181842069885, + "learning_rate": 6.824709848238028e-06, + "loss": 0.0167, + "step": 1151 + }, + { + "epoch": 0.4001389371309482, + "grad_norm": 0.39829378649964736, + "learning_rate": 6.819470660812543e-06, + "loss": 0.0153, + "step": 1152 + }, + { + "epoch": 0.40048627995831887, + "grad_norm": 1.4005769801829058, + "learning_rate": 6.814229169750675e-06, + "loss": 0.0239, + "step": 1153 + }, + { + "epoch": 0.40083362278568946, + "grad_norm": 0.599123382279925, + "learning_rate": 6.808985381688692e-06, + "loss": 0.0239, + "step": 1154 + }, + { + "epoch": 0.4011809656130601, + "grad_norm": 0.502770976659655, + "learning_rate": 6.8037393032657665e-06, + "loss": 0.0161, + "step": 1155 + }, + { + "epoch": 0.4015283084404307, + "grad_norm": 0.715985711190083, + "learning_rate": 6.798490941123972e-06, + "loss": 0.0321, + "step": 1156 + }, + { + "epoch": 0.40187565126780134, + "grad_norm": 1.3666815307149158, + "learning_rate": 6.793240301908273e-06, + "loss": 0.0217, + "step": 1157 + }, + { + "epoch": 0.4022229940951719, + "grad_norm": 1.3893597610770267, + "learning_rate": 6.7879873922665164e-06, + "loss": 0.0255, + "step": 1158 + }, + { + "epoch": 0.40257033692254257, + "grad_norm": 1.699863532332705, + "learning_rate": 6.782732218849425e-06, + "loss": 0.0191, + "step": 1159 + }, + { + "epoch": 0.40291767974991316, + "grad_norm": 0.5862064856816661, + "learning_rate": 6.777474788310586e-06, + "loss": 0.0278, + "step": 1160 + }, + { + "epoch": 0.4032650225772838, + "grad_norm": 0.5792277534533721, + "learning_rate": 6.772215107306448e-06, + "loss": 0.0244, + "step": 1161 + }, + { + "epoch": 0.4036123654046544, + "grad_norm": 0.7441381287676478, + "learning_rate": 6.766953182496303e-06, + "loss": 0.0164, + "step": 1162 + }, + { + "epoch": 0.403959708232025, + "grad_norm": 0.874933616008428, + "learning_rate": 6.761689020542288e-06, + "loss": 0.024, + "step": 1163 + }, + { + "epoch": 0.40430705105939563, + "grad_norm": 0.7249602510280612, + "learning_rate": 6.756422628109374e-06, + "loss": 0.0291, + "step": 1164 + }, + { + "epoch": 0.4046543938867662, + "grad_norm": 0.38558307647762535, + "learning_rate": 6.751154011865352e-06, + "loss": 0.0193, + "step": 1165 + }, + { + "epoch": 0.40500173671413686, + "grad_norm": 0.3569257075132759, + "learning_rate": 6.74588317848083e-06, + "loss": 0.0224, + "step": 1166 + }, + { + "epoch": 0.40534907954150745, + "grad_norm": 0.7027161271352107, + "learning_rate": 6.740610134629224e-06, + "loss": 0.0236, + "step": 1167 + }, + { + "epoch": 0.4056964223688781, + "grad_norm": 0.5055672476223064, + "learning_rate": 6.735334886986749e-06, + "loss": 0.0169, + "step": 1168 + }, + { + "epoch": 0.4060437651962487, + "grad_norm": 1.1794513716622257, + "learning_rate": 6.730057442232407e-06, + "loss": 0.0156, + "step": 1169 + }, + { + "epoch": 0.40639110802361933, + "grad_norm": 1.210535705797956, + "learning_rate": 6.724777807047985e-06, + "loss": 0.0195, + "step": 1170 + }, + { + "epoch": 0.4067384508509899, + "grad_norm": 0.48600002401232995, + "learning_rate": 6.719495988118043e-06, + "loss": 0.0256, + "step": 1171 + }, + { + "epoch": 0.40708579367836056, + "grad_norm": 0.30168099824930594, + "learning_rate": 6.714211992129906e-06, + "loss": 0.0151, + "step": 1172 + }, + { + "epoch": 0.40743313650573115, + "grad_norm": 0.41890959033819075, + "learning_rate": 6.708925825773653e-06, + "loss": 0.0214, + "step": 1173 + }, + { + "epoch": 0.4077804793331018, + "grad_norm": 0.4456883093941609, + "learning_rate": 6.7036374957421125e-06, + "loss": 0.0164, + "step": 1174 + }, + { + "epoch": 0.4081278221604724, + "grad_norm": 0.8050022922465697, + "learning_rate": 6.698347008730854e-06, + "loss": 0.0177, + "step": 1175 + }, + { + "epoch": 0.408475164987843, + "grad_norm": 0.7402939338287975, + "learning_rate": 6.6930543714381745e-06, + "loss": 0.0217, + "step": 1176 + }, + { + "epoch": 0.4088225078152136, + "grad_norm": 0.7580203605323533, + "learning_rate": 6.687759590565097e-06, + "loss": 0.0217, + "step": 1177 + }, + { + "epoch": 0.4091698506425842, + "grad_norm": 0.5276697193580777, + "learning_rate": 6.6824626728153565e-06, + "loss": 0.0177, + "step": 1178 + }, + { + "epoch": 0.40951719346995485, + "grad_norm": 1.3525910498294889, + "learning_rate": 6.677163624895393e-06, + "loss": 0.0262, + "step": 1179 + }, + { + "epoch": 0.40986453629732544, + "grad_norm": 0.7784159092511207, + "learning_rate": 6.671862453514346e-06, + "loss": 0.0224, + "step": 1180 + }, + { + "epoch": 0.4102118791246961, + "grad_norm": 0.5263360435847655, + "learning_rate": 6.666559165384041e-06, + "loss": 0.018, + "step": 1181 + }, + { + "epoch": 0.4105592219520667, + "grad_norm": 0.9366706697301209, + "learning_rate": 6.661253767218982e-06, + "loss": 0.0232, + "step": 1182 + }, + { + "epoch": 0.4109065647794373, + "grad_norm": 0.38867181577970566, + "learning_rate": 6.6559462657363525e-06, + "loss": 0.0182, + "step": 1183 + }, + { + "epoch": 0.4112539076068079, + "grad_norm": 1.6435992793264294, + "learning_rate": 6.6506366676559885e-06, + "loss": 0.0253, + "step": 1184 + }, + { + "epoch": 0.41160125043417856, + "grad_norm": 1.0752706014174525, + "learning_rate": 6.6453249797003885e-06, + "loss": 0.0266, + "step": 1185 + }, + { + "epoch": 0.41194859326154915, + "grad_norm": 0.8205362924547063, + "learning_rate": 6.640011208594691e-06, + "loss": 0.0228, + "step": 1186 + }, + { + "epoch": 0.4122959360889198, + "grad_norm": 0.7528049121740515, + "learning_rate": 6.634695361066679e-06, + "loss": 0.0205, + "step": 1187 + }, + { + "epoch": 0.4126432789162904, + "grad_norm": 0.5476931400142733, + "learning_rate": 6.629377443846756e-06, + "loss": 0.0256, + "step": 1188 + }, + { + "epoch": 0.41299062174366097, + "grad_norm": 1.964533528727283, + "learning_rate": 6.624057463667954e-06, + "loss": 0.0249, + "step": 1189 + }, + { + "epoch": 0.4133379645710316, + "grad_norm": 0.9431680832361578, + "learning_rate": 6.618735427265912e-06, + "loss": 0.0127, + "step": 1190 + }, + { + "epoch": 0.4136853073984022, + "grad_norm": 0.31081854506575984, + "learning_rate": 6.613411341378872e-06, + "loss": 0.0168, + "step": 1191 + }, + { + "epoch": 0.41403265022577285, + "grad_norm": 0.7037510968761942, + "learning_rate": 6.608085212747676e-06, + "loss": 0.0158, + "step": 1192 + }, + { + "epoch": 0.41437999305314344, + "grad_norm": 0.7279786999120493, + "learning_rate": 6.602757048115745e-06, + "loss": 0.0208, + "step": 1193 + }, + { + "epoch": 0.4147273358805141, + "grad_norm": 1.5397066182596297, + "learning_rate": 6.597426854229085e-06, + "loss": 0.0178, + "step": 1194 + }, + { + "epoch": 0.41507467870788467, + "grad_norm": 0.8672894314292771, + "learning_rate": 6.592094637836266e-06, + "loss": 0.0207, + "step": 1195 + }, + { + "epoch": 0.4154220215352553, + "grad_norm": 0.5979467691977439, + "learning_rate": 6.586760405688421e-06, + "loss": 0.0184, + "step": 1196 + }, + { + "epoch": 0.4157693643626259, + "grad_norm": 1.304178348576468, + "learning_rate": 6.581424164539235e-06, + "loss": 0.0222, + "step": 1197 + }, + { + "epoch": 0.41611670718999655, + "grad_norm": 0.744170926834296, + "learning_rate": 6.5760859211449355e-06, + "loss": 0.0186, + "step": 1198 + }, + { + "epoch": 0.41646405001736714, + "grad_norm": 0.43247222642972794, + "learning_rate": 6.570745682264288e-06, + "loss": 0.0241, + "step": 1199 + }, + { + "epoch": 0.4168113928447378, + "grad_norm": 0.7241231215446736, + "learning_rate": 6.565403454658579e-06, + "loss": 0.0115, + "step": 1200 + }, + { + "epoch": 0.41715873567210837, + "grad_norm": 1.5981944791092622, + "learning_rate": 6.560059245091619e-06, + "loss": 0.0215, + "step": 1201 + }, + { + "epoch": 0.41750607849947896, + "grad_norm": 0.6073741226445987, + "learning_rate": 6.554713060329725e-06, + "loss": 0.0159, + "step": 1202 + }, + { + "epoch": 0.4178534213268496, + "grad_norm": 0.35103622430566045, + "learning_rate": 6.549364907141713e-06, + "loss": 0.0165, + "step": 1203 + }, + { + "epoch": 0.4182007641542202, + "grad_norm": 0.3060822356877603, + "learning_rate": 6.544014792298896e-06, + "loss": 0.0136, + "step": 1204 + }, + { + "epoch": 0.41854810698159084, + "grad_norm": 1.2509560719304376, + "learning_rate": 6.538662722575067e-06, + "loss": 0.0232, + "step": 1205 + }, + { + "epoch": 0.41889544980896143, + "grad_norm": 1.1252212083506805, + "learning_rate": 6.533308704746492e-06, + "loss": 0.0261, + "step": 1206 + }, + { + "epoch": 0.4192427926363321, + "grad_norm": 1.0051855699617804, + "learning_rate": 6.527952745591911e-06, + "loss": 0.0176, + "step": 1207 + }, + { + "epoch": 0.41959013546370266, + "grad_norm": 0.4225505750014969, + "learning_rate": 6.522594851892513e-06, + "loss": 0.0157, + "step": 1208 + }, + { + "epoch": 0.4199374782910733, + "grad_norm": 0.675487063845996, + "learning_rate": 6.5172350304319456e-06, + "loss": 0.0187, + "step": 1209 + }, + { + "epoch": 0.4202848211184439, + "grad_norm": 0.5556340570184015, + "learning_rate": 6.5118732879962866e-06, + "loss": 0.021, + "step": 1210 + }, + { + "epoch": 0.42063216394581454, + "grad_norm": 1.1461752497953057, + "learning_rate": 6.506509631374056e-06, + "loss": 0.024, + "step": 1211 + }, + { + "epoch": 0.42097950677318513, + "grad_norm": 0.8587515989365709, + "learning_rate": 6.501144067356191e-06, + "loss": 0.0171, + "step": 1212 + }, + { + "epoch": 0.4213268496005558, + "grad_norm": 0.6554268552282353, + "learning_rate": 6.4957766027360455e-06, + "loss": 0.0162, + "step": 1213 + }, + { + "epoch": 0.42167419242792636, + "grad_norm": 0.9717352789041409, + "learning_rate": 6.490407244309382e-06, + "loss": 0.0177, + "step": 1214 + }, + { + "epoch": 0.42202153525529695, + "grad_norm": 0.4666185404754438, + "learning_rate": 6.485035998874356e-06, + "loss": 0.0179, + "step": 1215 + }, + { + "epoch": 0.4223688780826676, + "grad_norm": 0.6687573112666673, + "learning_rate": 6.479662873231518e-06, + "loss": 0.0278, + "step": 1216 + }, + { + "epoch": 0.4227162209100382, + "grad_norm": 0.39201919773447125, + "learning_rate": 6.4742878741837924e-06, + "loss": 0.021, + "step": 1217 + }, + { + "epoch": 0.42306356373740883, + "grad_norm": 1.9376472430652754, + "learning_rate": 6.468911008536483e-06, + "loss": 0.0315, + "step": 1218 + }, + { + "epoch": 0.4234109065647794, + "grad_norm": 0.5349195261746512, + "learning_rate": 6.4635322830972465e-06, + "loss": 0.022, + "step": 1219 + }, + { + "epoch": 0.42375824939215007, + "grad_norm": 0.5994260198762377, + "learning_rate": 6.458151704676108e-06, + "loss": 0.0249, + "step": 1220 + }, + { + "epoch": 0.42410559221952066, + "grad_norm": 0.45829461719718667, + "learning_rate": 6.452769280085427e-06, + "loss": 0.0172, + "step": 1221 + }, + { + "epoch": 0.4244529350468913, + "grad_norm": 0.4015948567021689, + "learning_rate": 6.447385016139906e-06, + "loss": 0.0159, + "step": 1222 + }, + { + "epoch": 0.4248002778742619, + "grad_norm": 0.41359006368663803, + "learning_rate": 6.441998919656575e-06, + "loss": 0.0217, + "step": 1223 + }, + { + "epoch": 0.42514762070163253, + "grad_norm": 0.784104999649454, + "learning_rate": 6.436610997454785e-06, + "loss": 0.0153, + "step": 1224 + }, + { + "epoch": 0.4254949635290031, + "grad_norm": 0.4486969339475269, + "learning_rate": 6.431221256356197e-06, + "loss": 0.0191, + "step": 1225 + }, + { + "epoch": 0.42584230635637377, + "grad_norm": 0.807568173718478, + "learning_rate": 6.425829703184776e-06, + "loss": 0.0191, + "step": 1226 + }, + { + "epoch": 0.42618964918374436, + "grad_norm": 1.214899649576311, + "learning_rate": 6.420436344766781e-06, + "loss": 0.0242, + "step": 1227 + }, + { + "epoch": 0.42653699201111495, + "grad_norm": 0.41836170664521255, + "learning_rate": 6.415041187930757e-06, + "loss": 0.0195, + "step": 1228 + }, + { + "epoch": 0.4268843348384856, + "grad_norm": 0.8701313451372041, + "learning_rate": 6.409644239507524e-06, + "loss": 0.0199, + "step": 1229 + }, + { + "epoch": 0.4272316776658562, + "grad_norm": 0.46742950813151507, + "learning_rate": 6.404245506330175e-06, + "loss": 0.013, + "step": 1230 + }, + { + "epoch": 0.4275790204932268, + "grad_norm": 0.5597197002970087, + "learning_rate": 6.398844995234057e-06, + "loss": 0.0246, + "step": 1231 + }, + { + "epoch": 0.4279263633205974, + "grad_norm": 1.3414468877460513, + "learning_rate": 6.393442713056772e-06, + "loss": 0.021, + "step": 1232 + }, + { + "epoch": 0.42827370614796806, + "grad_norm": 0.6723941756786661, + "learning_rate": 6.388038666638163e-06, + "loss": 0.0228, + "step": 1233 + }, + { + "epoch": 0.42862104897533865, + "grad_norm": 0.5282980798460091, + "learning_rate": 6.382632862820306e-06, + "loss": 0.019, + "step": 1234 + }, + { + "epoch": 0.4289683918027093, + "grad_norm": 0.8434630536972346, + "learning_rate": 6.377225308447503e-06, + "loss": 0.0279, + "step": 1235 + }, + { + "epoch": 0.4293157346300799, + "grad_norm": 1.2873804248352099, + "learning_rate": 6.371816010366274e-06, + "loss": 0.0242, + "step": 1236 + }, + { + "epoch": 0.4296630774574505, + "grad_norm": 1.0288152920759024, + "learning_rate": 6.366404975425342e-06, + "loss": 0.0136, + "step": 1237 + }, + { + "epoch": 0.4300104202848211, + "grad_norm": 0.9931125336676734, + "learning_rate": 6.360992210475635e-06, + "loss": 0.0238, + "step": 1238 + }, + { + "epoch": 0.43035776311219176, + "grad_norm": 1.136404288127991, + "learning_rate": 6.355577722370264e-06, + "loss": 0.0171, + "step": 1239 + }, + { + "epoch": 0.43070510593956235, + "grad_norm": 0.6828220416786258, + "learning_rate": 6.3501615179645315e-06, + "loss": 0.0195, + "step": 1240 + }, + { + "epoch": 0.43105244876693294, + "grad_norm": 1.3364697049657948, + "learning_rate": 6.344743604115903e-06, + "loss": 0.0293, + "step": 1241 + }, + { + "epoch": 0.4313997915943036, + "grad_norm": 1.2606216983253666, + "learning_rate": 6.339323987684015e-06, + "loss": 0.0197, + "step": 1242 + }, + { + "epoch": 0.4317471344216742, + "grad_norm": 0.5059234102433385, + "learning_rate": 6.333902675530657e-06, + "loss": 0.0216, + "step": 1243 + }, + { + "epoch": 0.4320944772490448, + "grad_norm": 0.6807195324342239, + "learning_rate": 6.328479674519766e-06, + "loss": 0.0225, + "step": 1244 + }, + { + "epoch": 0.4324418200764154, + "grad_norm": 0.5504876720573679, + "learning_rate": 6.323054991517416e-06, + "loss": 0.0128, + "step": 1245 + }, + { + "epoch": 0.43278916290378605, + "grad_norm": 0.8748832096236718, + "learning_rate": 6.317628633391816e-06, + "loss": 0.0208, + "step": 1246 + }, + { + "epoch": 0.43313650573115664, + "grad_norm": 1.3807423200829632, + "learning_rate": 6.312200607013287e-06, + "loss": 0.0192, + "step": 1247 + }, + { + "epoch": 0.4334838485585273, + "grad_norm": 0.4591330354830137, + "learning_rate": 6.306770919254268e-06, + "loss": 0.0182, + "step": 1248 + }, + { + "epoch": 0.4338311913858979, + "grad_norm": 1.1115703829370096, + "learning_rate": 6.301339576989301e-06, + "loss": 0.0257, + "step": 1249 + }, + { + "epoch": 0.4341785342132685, + "grad_norm": 0.9601555993164028, + "learning_rate": 6.295906587095023e-06, + "loss": 0.0293, + "step": 1250 + }, + { + "epoch": 0.4345258770406391, + "grad_norm": 0.3602112221728658, + "learning_rate": 6.2904719564501545e-06, + "loss": 0.0209, + "step": 1251 + }, + { + "epoch": 0.43487321986800975, + "grad_norm": 0.65050746016075, + "learning_rate": 6.285035691935495e-06, + "loss": 0.0201, + "step": 1252 + }, + { + "epoch": 0.43522056269538034, + "grad_norm": 0.43904022220763456, + "learning_rate": 6.279597800433915e-06, + "loss": 0.0244, + "step": 1253 + }, + { + "epoch": 0.43556790552275093, + "grad_norm": 1.0501132280980383, + "learning_rate": 6.274158288830339e-06, + "loss": 0.0152, + "step": 1254 + }, + { + "epoch": 0.4359152483501216, + "grad_norm": 1.368313597379767, + "learning_rate": 6.268717164011751e-06, + "loss": 0.0299, + "step": 1255 + }, + { + "epoch": 0.43626259117749216, + "grad_norm": 0.29948345258209563, + "learning_rate": 6.263274432867168e-06, + "loss": 0.0175, + "step": 1256 + }, + { + "epoch": 0.4366099340048628, + "grad_norm": 0.5520058587984689, + "learning_rate": 6.257830102287649e-06, + "loss": 0.0131, + "step": 1257 + }, + { + "epoch": 0.4369572768322334, + "grad_norm": 0.703456209843452, + "learning_rate": 6.252384179166272e-06, + "loss": 0.026, + "step": 1258 + }, + { + "epoch": 0.43730461965960404, + "grad_norm": 0.31650775648430113, + "learning_rate": 6.246936670398136e-06, + "loss": 0.0119, + "step": 1259 + }, + { + "epoch": 0.43765196248697463, + "grad_norm": 0.6575613262249127, + "learning_rate": 6.2414875828803446e-06, + "loss": 0.018, + "step": 1260 + }, + { + "epoch": 0.4379993053143453, + "grad_norm": 0.7650908913989808, + "learning_rate": 6.236036923512002e-06, + "loss": 0.0193, + "step": 1261 + }, + { + "epoch": 0.43834664814171587, + "grad_norm": 0.6645282671067363, + "learning_rate": 6.230584699194201e-06, + "loss": 0.0191, + "step": 1262 + }, + { + "epoch": 0.4386939909690865, + "grad_norm": 0.89119449808857, + "learning_rate": 6.225130916830017e-06, + "loss": 0.0165, + "step": 1263 + }, + { + "epoch": 0.4390413337964571, + "grad_norm": 1.4336588300538304, + "learning_rate": 6.2196755833244975e-06, + "loss": 0.0172, + "step": 1264 + }, + { + "epoch": 0.43938867662382775, + "grad_norm": 0.5141694363351154, + "learning_rate": 6.214218705584653e-06, + "loss": 0.017, + "step": 1265 + }, + { + "epoch": 0.43973601945119833, + "grad_norm": 0.7024489932275159, + "learning_rate": 6.208760290519451e-06, + "loss": 0.0266, + "step": 1266 + }, + { + "epoch": 0.4400833622785689, + "grad_norm": 0.5137267630010462, + "learning_rate": 6.203300345039804e-06, + "loss": 0.0208, + "step": 1267 + }, + { + "epoch": 0.44043070510593957, + "grad_norm": 0.9545655799932716, + "learning_rate": 6.197838876058564e-06, + "loss": 0.0212, + "step": 1268 + }, + { + "epoch": 0.44077804793331016, + "grad_norm": 1.274602132775298, + "learning_rate": 6.19237589049051e-06, + "loss": 0.02, + "step": 1269 + }, + { + "epoch": 0.4411253907606808, + "grad_norm": 1.1485868760171818, + "learning_rate": 6.186911395252342e-06, + "loss": 0.0325, + "step": 1270 + }, + { + "epoch": 0.4414727335880514, + "grad_norm": 1.0719692070927125, + "learning_rate": 6.181445397262671e-06, + "loss": 0.0246, + "step": 1271 + }, + { + "epoch": 0.44182007641542204, + "grad_norm": 0.5336479484346233, + "learning_rate": 6.175977903442008e-06, + "loss": 0.0154, + "step": 1272 + }, + { + "epoch": 0.4421674192427926, + "grad_norm": 0.6172067020914168, + "learning_rate": 6.170508920712765e-06, + "loss": 0.0188, + "step": 1273 + }, + { + "epoch": 0.44251476207016327, + "grad_norm": 0.9412324301612399, + "learning_rate": 6.165038455999233e-06, + "loss": 0.0216, + "step": 1274 + }, + { + "epoch": 0.44286210489753386, + "grad_norm": 0.5832155616020471, + "learning_rate": 6.159566516227582e-06, + "loss": 0.0214, + "step": 1275 + }, + { + "epoch": 0.4432094477249045, + "grad_norm": 0.5486296573893363, + "learning_rate": 6.154093108325846e-06, + "loss": 0.0163, + "step": 1276 + }, + { + "epoch": 0.4435567905522751, + "grad_norm": 0.6013895518841912, + "learning_rate": 6.148618239223924e-06, + "loss": 0.0146, + "step": 1277 + }, + { + "epoch": 0.44390413337964574, + "grad_norm": 0.886747002104457, + "learning_rate": 6.143141915853558e-06, + "loss": 0.0141, + "step": 1278 + }, + { + "epoch": 0.4442514762070163, + "grad_norm": 0.7983414083814292, + "learning_rate": 6.137664145148339e-06, + "loss": 0.0229, + "step": 1279 + }, + { + "epoch": 0.4445988190343869, + "grad_norm": 0.4982201099217589, + "learning_rate": 6.1321849340436824e-06, + "loss": 0.0226, + "step": 1280 + }, + { + "epoch": 0.44494616186175756, + "grad_norm": 0.6104658236755754, + "learning_rate": 6.126704289476834e-06, + "loss": 0.0176, + "step": 1281 + }, + { + "epoch": 0.44529350468912815, + "grad_norm": 0.4850325700357511, + "learning_rate": 6.121222218386848e-06, + "loss": 0.0275, + "step": 1282 + }, + { + "epoch": 0.4456408475164988, + "grad_norm": 1.1453236673507199, + "learning_rate": 6.115738727714593e-06, + "loss": 0.0157, + "step": 1283 + }, + { + "epoch": 0.4459881903438694, + "grad_norm": 1.1501958397477723, + "learning_rate": 6.110253824402728e-06, + "loss": 0.0233, + "step": 1284 + }, + { + "epoch": 0.44633553317124003, + "grad_norm": 1.2142627028854323, + "learning_rate": 6.104767515395702e-06, + "loss": 0.0217, + "step": 1285 + }, + { + "epoch": 0.4466828759986106, + "grad_norm": 0.8286577032771891, + "learning_rate": 6.0992798076397465e-06, + "loss": 0.0281, + "step": 1286 + }, + { + "epoch": 0.44703021882598126, + "grad_norm": 1.1668550772372472, + "learning_rate": 6.093790708082861e-06, + "loss": 0.0122, + "step": 1287 + }, + { + "epoch": 0.44737756165335185, + "grad_norm": 0.8595111659192216, + "learning_rate": 6.088300223674808e-06, + "loss": 0.013, + "step": 1288 + }, + { + "epoch": 0.4477249044807225, + "grad_norm": 1.202956918343463, + "learning_rate": 6.0828083613671055e-06, + "loss": 0.0155, + "step": 1289 + }, + { + "epoch": 0.4480722473080931, + "grad_norm": 1.144767269040697, + "learning_rate": 6.077315128113011e-06, + "loss": 0.0234, + "step": 1290 + }, + { + "epoch": 0.44841959013546373, + "grad_norm": 0.3829734675474034, + "learning_rate": 6.071820530867524e-06, + "loss": 0.017, + "step": 1291 + }, + { + "epoch": 0.4487669329628343, + "grad_norm": 0.705925698459192, + "learning_rate": 6.066324576587367e-06, + "loss": 0.0207, + "step": 1292 + }, + { + "epoch": 0.4491142757902049, + "grad_norm": 0.9927898285052602, + "learning_rate": 6.06082727223098e-06, + "loss": 0.0227, + "step": 1293 + }, + { + "epoch": 0.44946161861757555, + "grad_norm": 0.5058096386894589, + "learning_rate": 6.055328624758515e-06, + "loss": 0.025, + "step": 1294 + }, + { + "epoch": 0.44980896144494614, + "grad_norm": 0.9125209454948322, + "learning_rate": 6.0498286411318255e-06, + "loss": 0.0224, + "step": 1295 + }, + { + "epoch": 0.4501563042723168, + "grad_norm": 1.1242298901659808, + "learning_rate": 6.04432732831445e-06, + "loss": 0.0191, + "step": 1296 + }, + { + "epoch": 0.4505036470996874, + "grad_norm": 0.8939518763202791, + "learning_rate": 6.038824693271619e-06, + "loss": 0.0249, + "step": 1297 + }, + { + "epoch": 0.450850989927058, + "grad_norm": 1.0950080836356835, + "learning_rate": 6.033320742970229e-06, + "loss": 0.0248, + "step": 1298 + }, + { + "epoch": 0.4511983327544286, + "grad_norm": 0.8937794294216942, + "learning_rate": 6.027815484378848e-06, + "loss": 0.0247, + "step": 1299 + }, + { + "epoch": 0.45154567558179926, + "grad_norm": 1.211557408088087, + "learning_rate": 6.0223089244676965e-06, + "loss": 0.0257, + "step": 1300 + }, + { + "epoch": 0.45189301840916984, + "grad_norm": 0.7913795101404235, + "learning_rate": 6.016801070208644e-06, + "loss": 0.0201, + "step": 1301 + }, + { + "epoch": 0.4522403612365405, + "grad_norm": 1.022217982353667, + "learning_rate": 6.011291928575199e-06, + "loss": 0.0177, + "step": 1302 + }, + { + "epoch": 0.4525877040639111, + "grad_norm": 0.5643877352773056, + "learning_rate": 6.005781506542498e-06, + "loss": 0.018, + "step": 1303 + }, + { + "epoch": 0.45293504689128167, + "grad_norm": 0.46969855875618594, + "learning_rate": 6.000269811087304e-06, + "loss": 0.0164, + "step": 1304 + }, + { + "epoch": 0.4532823897186523, + "grad_norm": 0.64817857112924, + "learning_rate": 5.994756849187984e-06, + "loss": 0.0224, + "step": 1305 + }, + { + "epoch": 0.4536297325460229, + "grad_norm": 1.0142085570957111, + "learning_rate": 5.989242627824516e-06, + "loss": 0.0293, + "step": 1306 + }, + { + "epoch": 0.45397707537339355, + "grad_norm": 0.44811883050529416, + "learning_rate": 5.983727153978467e-06, + "loss": 0.0188, + "step": 1307 + }, + { + "epoch": 0.45432441820076414, + "grad_norm": 0.8638051888219721, + "learning_rate": 5.978210434632996e-06, + "loss": 0.0233, + "step": 1308 + }, + { + "epoch": 0.4546717610281348, + "grad_norm": 0.7100938005096301, + "learning_rate": 5.97269247677283e-06, + "loss": 0.0197, + "step": 1309 + }, + { + "epoch": 0.45501910385550537, + "grad_norm": 0.5543422204245909, + "learning_rate": 5.967173287384275e-06, + "loss": 0.0163, + "step": 1310 + }, + { + "epoch": 0.455366446682876, + "grad_norm": 0.8347623418009196, + "learning_rate": 5.961652873455186e-06, + "loss": 0.0142, + "step": 1311 + }, + { + "epoch": 0.4557137895102466, + "grad_norm": 0.2539498893920443, + "learning_rate": 5.956131241974976e-06, + "loss": 0.0132, + "step": 1312 + }, + { + "epoch": 0.45606113233761725, + "grad_norm": 1.1630972005065157, + "learning_rate": 5.950608399934594e-06, + "loss": 0.0242, + "step": 1313 + }, + { + "epoch": 0.45640847516498784, + "grad_norm": 1.0539497447728725, + "learning_rate": 5.945084354326527e-06, + "loss": 0.0179, + "step": 1314 + }, + { + "epoch": 0.4567558179923585, + "grad_norm": 0.838017365343182, + "learning_rate": 5.939559112144781e-06, + "loss": 0.0265, + "step": 1315 + }, + { + "epoch": 0.45710316081972907, + "grad_norm": 0.5217217424418361, + "learning_rate": 5.93403268038488e-06, + "loss": 0.0173, + "step": 1316 + }, + { + "epoch": 0.45745050364709966, + "grad_norm": 1.049739897466213, + "learning_rate": 5.928505066043852e-06, + "loss": 0.0154, + "step": 1317 + }, + { + "epoch": 0.4577978464744703, + "grad_norm": 0.514558882908952, + "learning_rate": 5.922976276120225e-06, + "loss": 0.0236, + "step": 1318 + }, + { + "epoch": 0.4581451893018409, + "grad_norm": 0.8103740247236859, + "learning_rate": 5.917446317614012e-06, + "loss": 0.0115, + "step": 1319 + }, + { + "epoch": 0.45849253212921154, + "grad_norm": 1.0239011468521235, + "learning_rate": 5.911915197526709e-06, + "loss": 0.0194, + "step": 1320 + }, + { + "epoch": 0.4588398749565821, + "grad_norm": 0.45055190346246493, + "learning_rate": 5.9063829228612805e-06, + "loss": 0.0244, + "step": 1321 + }, + { + "epoch": 0.4591872177839528, + "grad_norm": 0.4096700783073768, + "learning_rate": 5.900849500622153e-06, + "loss": 0.0192, + "step": 1322 + }, + { + "epoch": 0.45953456061132336, + "grad_norm": 5.0746307124667585, + "learning_rate": 5.895314937815206e-06, + "loss": 0.0249, + "step": 1323 + }, + { + "epoch": 0.459881903438694, + "grad_norm": 0.6287936978756935, + "learning_rate": 5.889779241447765e-06, + "loss": 0.0192, + "step": 1324 + }, + { + "epoch": 0.4602292462660646, + "grad_norm": 0.5423899884873425, + "learning_rate": 5.884242418528588e-06, + "loss": 0.0219, + "step": 1325 + }, + { + "epoch": 0.46057658909343524, + "grad_norm": 0.5268649486665123, + "learning_rate": 5.878704476067862e-06, + "loss": 0.0166, + "step": 1326 + }, + { + "epoch": 0.46092393192080583, + "grad_norm": 0.922315522446586, + "learning_rate": 5.873165421077186e-06, + "loss": 0.0244, + "step": 1327 + }, + { + "epoch": 0.4612712747481765, + "grad_norm": 0.6105099198285099, + "learning_rate": 5.867625260569575e-06, + "loss": 0.0278, + "step": 1328 + }, + { + "epoch": 0.46161861757554706, + "grad_norm": 1.472196223378207, + "learning_rate": 5.862084001559438e-06, + "loss": 0.0233, + "step": 1329 + }, + { + "epoch": 0.46196596040291765, + "grad_norm": 0.6221418880436498, + "learning_rate": 5.85654165106258e-06, + "loss": 0.0113, + "step": 1330 + }, + { + "epoch": 0.4623133032302883, + "grad_norm": 0.6121127340609553, + "learning_rate": 5.850998216096181e-06, + "loss": 0.0249, + "step": 1331 + }, + { + "epoch": 0.4626606460576589, + "grad_norm": 0.7691635062367261, + "learning_rate": 5.845453703678801e-06, + "loss": 0.0263, + "step": 1332 + }, + { + "epoch": 0.46300798888502953, + "grad_norm": 0.4516319265859705, + "learning_rate": 5.8399081208303595e-06, + "loss": 0.0217, + "step": 1333 + }, + { + "epoch": 0.4633553317124001, + "grad_norm": 1.0317569643624573, + "learning_rate": 5.834361474572134e-06, + "loss": 0.0174, + "step": 1334 + }, + { + "epoch": 0.46370267453977076, + "grad_norm": 0.6672608515849126, + "learning_rate": 5.828813771926746e-06, + "loss": 0.0202, + "step": 1335 + }, + { + "epoch": 0.46405001736714135, + "grad_norm": 2.1225607990268083, + "learning_rate": 5.823265019918156e-06, + "loss": 0.0202, + "step": 1336 + }, + { + "epoch": 0.464397360194512, + "grad_norm": 0.5824621371505889, + "learning_rate": 5.817715225571654e-06, + "loss": 0.0189, + "step": 1337 + }, + { + "epoch": 0.4647447030218826, + "grad_norm": 0.5556260631755074, + "learning_rate": 5.812164395913848e-06, + "loss": 0.02, + "step": 1338 + }, + { + "epoch": 0.46509204584925323, + "grad_norm": 0.5236250786169708, + "learning_rate": 5.806612537972658e-06, + "loss": 0.0295, + "step": 1339 + }, + { + "epoch": 0.4654393886766238, + "grad_norm": 0.915218352922461, + "learning_rate": 5.801059658777303e-06, + "loss": 0.0247, + "step": 1340 + }, + { + "epoch": 0.46578673150399447, + "grad_norm": 0.8125906729186393, + "learning_rate": 5.7955057653583e-06, + "loss": 0.0168, + "step": 1341 + }, + { + "epoch": 0.46613407433136506, + "grad_norm": 0.500901462927243, + "learning_rate": 5.789950864747446e-06, + "loss": 0.0163, + "step": 1342 + }, + { + "epoch": 0.46648141715873565, + "grad_norm": 0.8445734370127098, + "learning_rate": 5.784394963977815e-06, + "loss": 0.0156, + "step": 1343 + }, + { + "epoch": 0.4668287599861063, + "grad_norm": 0.8202499613757581, + "learning_rate": 5.778838070083747e-06, + "loss": 0.0165, + "step": 1344 + }, + { + "epoch": 0.4671761028134769, + "grad_norm": 0.9521555125223035, + "learning_rate": 5.77328019010084e-06, + "loss": 0.0236, + "step": 1345 + }, + { + "epoch": 0.4675234456408475, + "grad_norm": 0.3706684952737426, + "learning_rate": 5.7677213310659375e-06, + "loss": 0.0134, + "step": 1346 + }, + { + "epoch": 0.4678707884682181, + "grad_norm": 0.4599731154160651, + "learning_rate": 5.762161500017128e-06, + "loss": 0.0155, + "step": 1347 + }, + { + "epoch": 0.46821813129558876, + "grad_norm": 0.2427917500418682, + "learning_rate": 5.756600703993725e-06, + "loss": 0.0089, + "step": 1348 + }, + { + "epoch": 0.46856547412295935, + "grad_norm": 0.7157300676183229, + "learning_rate": 5.751038950036267e-06, + "loss": 0.026, + "step": 1349 + }, + { + "epoch": 0.46891281695033, + "grad_norm": 0.5005873987026797, + "learning_rate": 5.745476245186506e-06, + "loss": 0.0213, + "step": 1350 + }, + { + "epoch": 0.4692601597777006, + "grad_norm": 0.9673326052882976, + "learning_rate": 5.739912596487396e-06, + "loss": 0.0269, + "step": 1351 + }, + { + "epoch": 0.4696075026050712, + "grad_norm": 0.457111905319674, + "learning_rate": 5.7343480109830865e-06, + "loss": 0.0177, + "step": 1352 + }, + { + "epoch": 0.4699548454324418, + "grad_norm": 0.8619795743501616, + "learning_rate": 5.728782495718912e-06, + "loss": 0.0156, + "step": 1353 + }, + { + "epoch": 0.47030218825981246, + "grad_norm": 0.40333546480267507, + "learning_rate": 5.7232160577413866e-06, + "loss": 0.0186, + "step": 1354 + }, + { + "epoch": 0.47064953108718305, + "grad_norm": 0.5088273149893132, + "learning_rate": 5.717648704098191e-06, + "loss": 0.0121, + "step": 1355 + }, + { + "epoch": 0.47099687391455364, + "grad_norm": 0.5242704677164807, + "learning_rate": 5.712080441838167e-06, + "loss": 0.0162, + "step": 1356 + }, + { + "epoch": 0.4713442167419243, + "grad_norm": 1.2256987250615423, + "learning_rate": 5.706511278011303e-06, + "loss": 0.0207, + "step": 1357 + }, + { + "epoch": 0.47169155956929487, + "grad_norm": 0.9464524842305495, + "learning_rate": 5.700941219668733e-06, + "loss": 0.0191, + "step": 1358 + }, + { + "epoch": 0.4720389023966655, + "grad_norm": 0.8126675490446509, + "learning_rate": 5.6953702738627215e-06, + "loss": 0.0195, + "step": 1359 + }, + { + "epoch": 0.4723862452240361, + "grad_norm": 1.0495606290514148, + "learning_rate": 5.689798447646657e-06, + "loss": 0.0216, + "step": 1360 + }, + { + "epoch": 0.47273358805140675, + "grad_norm": 0.5981699030064619, + "learning_rate": 5.684225748075044e-06, + "loss": 0.0143, + "step": 1361 + }, + { + "epoch": 0.47308093087877734, + "grad_norm": 0.478440133230351, + "learning_rate": 5.678652182203489e-06, + "loss": 0.0215, + "step": 1362 + }, + { + "epoch": 0.473428273706148, + "grad_norm": 0.9677500646745865, + "learning_rate": 5.6730777570887e-06, + "loss": 0.0195, + "step": 1363 + }, + { + "epoch": 0.4737756165335186, + "grad_norm": 0.7312470814458618, + "learning_rate": 5.667502479788467e-06, + "loss": 0.0109, + "step": 1364 + }, + { + "epoch": 0.4741229593608892, + "grad_norm": 0.36719507217506836, + "learning_rate": 5.6619263573616676e-06, + "loss": 0.0152, + "step": 1365 + }, + { + "epoch": 0.4744703021882598, + "grad_norm": 0.503824691086867, + "learning_rate": 5.6563493968682405e-06, + "loss": 0.021, + "step": 1366 + }, + { + "epoch": 0.47481764501563045, + "grad_norm": 1.1250583221317176, + "learning_rate": 5.6507716053691916e-06, + "loss": 0.0259, + "step": 1367 + }, + { + "epoch": 0.47516498784300104, + "grad_norm": 0.574390097728659, + "learning_rate": 5.645192989926577e-06, + "loss": 0.0237, + "step": 1368 + }, + { + "epoch": 0.47551233067037163, + "grad_norm": 1.5251526377041664, + "learning_rate": 5.639613557603494e-06, + "loss": 0.0281, + "step": 1369 + }, + { + "epoch": 0.4758596734977423, + "grad_norm": 0.6785324566307315, + "learning_rate": 5.634033315464076e-06, + "loss": 0.0124, + "step": 1370 + }, + { + "epoch": 0.47620701632511286, + "grad_norm": 0.5227461607632101, + "learning_rate": 5.628452270573483e-06, + "loss": 0.0177, + "step": 1371 + }, + { + "epoch": 0.4765543591524835, + "grad_norm": 1.1646192631978813, + "learning_rate": 5.6228704299978905e-06, + "loss": 0.0241, + "step": 1372 + }, + { + "epoch": 0.4769017019798541, + "grad_norm": 0.5872534337661728, + "learning_rate": 5.617287800804478e-06, + "loss": 0.0218, + "step": 1373 + }, + { + "epoch": 0.47724904480722474, + "grad_norm": 0.5225710058655455, + "learning_rate": 5.61170439006143e-06, + "loss": 0.0223, + "step": 1374 + }, + { + "epoch": 0.47759638763459533, + "grad_norm": 1.1334161288347557, + "learning_rate": 5.6061202048379125e-06, + "loss": 0.0254, + "step": 1375 + }, + { + "epoch": 0.477943730461966, + "grad_norm": 1.0865327629480914, + "learning_rate": 5.600535252204081e-06, + "loss": 0.0237, + "step": 1376 + }, + { + "epoch": 0.47829107328933657, + "grad_norm": 1.279397014560853, + "learning_rate": 5.5949495392310535e-06, + "loss": 0.0286, + "step": 1377 + }, + { + "epoch": 0.4786384161167072, + "grad_norm": 0.32759908340107924, + "learning_rate": 5.589363072990921e-06, + "loss": 0.0149, + "step": 1378 + }, + { + "epoch": 0.4789857589440778, + "grad_norm": 0.5810913478185202, + "learning_rate": 5.583775860556717e-06, + "loss": 0.0225, + "step": 1379 + }, + { + "epoch": 0.47933310177144844, + "grad_norm": 0.3907616220341205, + "learning_rate": 5.578187909002428e-06, + "loss": 0.0246, + "step": 1380 + }, + { + "epoch": 0.47968044459881903, + "grad_norm": 0.5016926806740227, + "learning_rate": 5.572599225402974e-06, + "loss": 0.0196, + "step": 1381 + }, + { + "epoch": 0.4800277874261896, + "grad_norm": 0.5103118178433353, + "learning_rate": 5.567009816834199e-06, + "loss": 0.0155, + "step": 1382 + }, + { + "epoch": 0.48037513025356027, + "grad_norm": 0.9141679829913468, + "learning_rate": 5.561419690372869e-06, + "loss": 0.025, + "step": 1383 + }, + { + "epoch": 0.48072247308093086, + "grad_norm": 0.5420971166376601, + "learning_rate": 5.555828853096656e-06, + "loss": 0.0164, + "step": 1384 + }, + { + "epoch": 0.4810698159083015, + "grad_norm": 0.6355535194466504, + "learning_rate": 5.5502373120841346e-06, + "loss": 0.0214, + "step": 1385 + }, + { + "epoch": 0.4814171587356721, + "grad_norm": 0.3671380997420511, + "learning_rate": 5.544645074414768e-06, + "loss": 0.0149, + "step": 1386 + }, + { + "epoch": 0.48176450156304274, + "grad_norm": 0.8978543017268891, + "learning_rate": 5.539052147168903e-06, + "loss": 0.0133, + "step": 1387 + }, + { + "epoch": 0.4821118443904133, + "grad_norm": 0.9548917687698516, + "learning_rate": 5.533458537427758e-06, + "loss": 0.0214, + "step": 1388 + }, + { + "epoch": 0.48245918721778397, + "grad_norm": 0.8756025165481638, + "learning_rate": 5.5278642522734175e-06, + "loss": 0.0225, + "step": 1389 + }, + { + "epoch": 0.48280653004515456, + "grad_norm": 0.9092163623639533, + "learning_rate": 5.52226929878882e-06, + "loss": 0.0146, + "step": 1390 + }, + { + "epoch": 0.4831538728725252, + "grad_norm": 0.39815894527941675, + "learning_rate": 5.516673684057747e-06, + "loss": 0.0166, + "step": 1391 + }, + { + "epoch": 0.4835012156998958, + "grad_norm": 0.2966587066038902, + "learning_rate": 5.511077415164825e-06, + "loss": 0.0159, + "step": 1392 + }, + { + "epoch": 0.48384855852726644, + "grad_norm": 0.4884869491519517, + "learning_rate": 5.505480499195502e-06, + "loss": 0.0166, + "step": 1393 + }, + { + "epoch": 0.484195901354637, + "grad_norm": 0.35698741794966127, + "learning_rate": 5.499882943236045e-06, + "loss": 0.0114, + "step": 1394 + }, + { + "epoch": 0.4845432441820076, + "grad_norm": 0.7090902718663725, + "learning_rate": 5.494284754373538e-06, + "loss": 0.0146, + "step": 1395 + }, + { + "epoch": 0.48489058700937826, + "grad_norm": 0.5026358347696848, + "learning_rate": 5.488685939695862e-06, + "loss": 0.0148, + "step": 1396 + }, + { + "epoch": 0.48523792983674885, + "grad_norm": 0.7545042851491367, + "learning_rate": 5.4830865062916835e-06, + "loss": 0.022, + "step": 1397 + }, + { + "epoch": 0.4855852726641195, + "grad_norm": 0.5369844064837273, + "learning_rate": 5.477486461250469e-06, + "loss": 0.0209, + "step": 1398 + }, + { + "epoch": 0.4859326154914901, + "grad_norm": 0.6861158062273787, + "learning_rate": 5.471885811662442e-06, + "loss": 0.0271, + "step": 1399 + }, + { + "epoch": 0.4862799583188607, + "grad_norm": 0.7673744461691656, + "learning_rate": 5.466284564618603e-06, + "loss": 0.0296, + "step": 1400 + }, + { + "epoch": 0.4866273011462313, + "grad_norm": 0.5475029370005332, + "learning_rate": 5.460682727210702e-06, + "loss": 0.0248, + "step": 1401 + }, + { + "epoch": 0.48697464397360196, + "grad_norm": 0.9512795806158605, + "learning_rate": 5.455080306531244e-06, + "loss": 0.0218, + "step": 1402 + }, + { + "epoch": 0.48732198680097255, + "grad_norm": 0.5444903179080952, + "learning_rate": 5.449477309673462e-06, + "loss": 0.0191, + "step": 1403 + }, + { + "epoch": 0.4876693296283432, + "grad_norm": 0.8861826575673567, + "learning_rate": 5.443873743731331e-06, + "loss": 0.0109, + "step": 1404 + }, + { + "epoch": 0.4880166724557138, + "grad_norm": 0.9771566586732063, + "learning_rate": 5.438269615799534e-06, + "loss": 0.0197, + "step": 1405 + }, + { + "epoch": 0.48836401528308443, + "grad_norm": 1.9427129264091787, + "learning_rate": 5.432664932973474e-06, + "loss": 0.0282, + "step": 1406 + }, + { + "epoch": 0.488711358110455, + "grad_norm": 1.7104554294560776, + "learning_rate": 5.427059702349255e-06, + "loss": 0.0204, + "step": 1407 + }, + { + "epoch": 0.4890587009378256, + "grad_norm": 1.526360473399325, + "learning_rate": 5.4214539310236716e-06, + "loss": 0.0235, + "step": 1408 + }, + { + "epoch": 0.48940604376519625, + "grad_norm": 0.4997923973605721, + "learning_rate": 5.4158476260942075e-06, + "loss": 0.0174, + "step": 1409 + }, + { + "epoch": 0.48975338659256684, + "grad_norm": 0.4525414144834577, + "learning_rate": 5.410240794659016e-06, + "loss": 0.0195, + "step": 1410 + }, + { + "epoch": 0.4901007294199375, + "grad_norm": 0.410000828468513, + "learning_rate": 5.4046334438169245e-06, + "loss": 0.018, + "step": 1411 + }, + { + "epoch": 0.4904480722473081, + "grad_norm": 0.8060585957623981, + "learning_rate": 5.39902558066741e-06, + "loss": 0.0186, + "step": 1412 + }, + { + "epoch": 0.4907954150746787, + "grad_norm": 2.1170543916864037, + "learning_rate": 5.393417212310605e-06, + "loss": 0.0307, + "step": 1413 + }, + { + "epoch": 0.4911427579020493, + "grad_norm": 0.8364632533994034, + "learning_rate": 5.387808345847277e-06, + "loss": 0.0214, + "step": 1414 + }, + { + "epoch": 0.49149010072941995, + "grad_norm": 1.3428878485248303, + "learning_rate": 5.382198988378829e-06, + "loss": 0.0218, + "step": 1415 + }, + { + "epoch": 0.49183744355679054, + "grad_norm": 0.6747820600344949, + "learning_rate": 5.376589147007279e-06, + "loss": 0.0184, + "step": 1416 + }, + { + "epoch": 0.4921847863841612, + "grad_norm": 0.8594425526853627, + "learning_rate": 5.3709788288352615e-06, + "loss": 0.0182, + "step": 1417 + }, + { + "epoch": 0.4925321292115318, + "grad_norm": 0.5487772743825399, + "learning_rate": 5.365368040966016e-06, + "loss": 0.0221, + "step": 1418 + }, + { + "epoch": 0.4928794720389024, + "grad_norm": 0.8978739980804368, + "learning_rate": 5.359756790503376e-06, + "loss": 0.0161, + "step": 1419 + }, + { + "epoch": 0.493226814866273, + "grad_norm": 1.165143564643093, + "learning_rate": 5.354145084551757e-06, + "loss": 0.0215, + "step": 1420 + }, + { + "epoch": 0.4935741576936436, + "grad_norm": 0.8557699204572717, + "learning_rate": 5.348532930216157e-06, + "loss": 0.0239, + "step": 1421 + }, + { + "epoch": 0.49392150052101425, + "grad_norm": 1.203420881332666, + "learning_rate": 5.342920334602137e-06, + "loss": 0.0199, + "step": 1422 + }, + { + "epoch": 0.49426884334838483, + "grad_norm": 0.49131730078379343, + "learning_rate": 5.337307304815817e-06, + "loss": 0.0211, + "step": 1423 + }, + { + "epoch": 0.4946161861757555, + "grad_norm": 0.7833574610940411, + "learning_rate": 5.331693847963871e-06, + "loss": 0.0167, + "step": 1424 + }, + { + "epoch": 0.49496352900312607, + "grad_norm": 0.4454425944376909, + "learning_rate": 5.32607997115351e-06, + "loss": 0.0218, + "step": 1425 + }, + { + "epoch": 0.4953108718304967, + "grad_norm": 0.4471730702798169, + "learning_rate": 5.320465681492478e-06, + "loss": 0.016, + "step": 1426 + }, + { + "epoch": 0.4956582146578673, + "grad_norm": 0.9677891658268274, + "learning_rate": 5.31485098608904e-06, + "loss": 0.0142, + "step": 1427 + }, + { + "epoch": 0.49600555748523795, + "grad_norm": 0.5375414719758962, + "learning_rate": 5.309235892051976e-06, + "loss": 0.0147, + "step": 1428 + }, + { + "epoch": 0.49635290031260854, + "grad_norm": 0.5293248840266622, + "learning_rate": 5.303620406490573e-06, + "loss": 0.0155, + "step": 1429 + }, + { + "epoch": 0.4967002431399792, + "grad_norm": 0.7923702737792816, + "learning_rate": 5.298004536514606e-06, + "loss": 0.0137, + "step": 1430 + }, + { + "epoch": 0.49704758596734977, + "grad_norm": 1.2096835572281968, + "learning_rate": 5.292388289234349e-06, + "loss": 0.0227, + "step": 1431 + }, + { + "epoch": 0.4973949287947204, + "grad_norm": 0.9552234758771334, + "learning_rate": 5.286771671760541e-06, + "loss": 0.0318, + "step": 1432 + }, + { + "epoch": 0.497742271622091, + "grad_norm": 0.9769572098077361, + "learning_rate": 5.2811546912044e-06, + "loss": 0.0211, + "step": 1433 + }, + { + "epoch": 0.4980896144494616, + "grad_norm": 1.3870302191088562, + "learning_rate": 5.275537354677595e-06, + "loss": 0.0262, + "step": 1434 + }, + { + "epoch": 0.49843695727683224, + "grad_norm": 0.455287552427971, + "learning_rate": 5.2699196692922546e-06, + "loss": 0.0153, + "step": 1435 + }, + { + "epoch": 0.4987843001042028, + "grad_norm": 0.45542023402718185, + "learning_rate": 5.264301642160939e-06, + "loss": 0.0133, + "step": 1436 + }, + { + "epoch": 0.49913164293157347, + "grad_norm": 0.5647376621549762, + "learning_rate": 5.2586832803966525e-06, + "loss": 0.0183, + "step": 1437 + }, + { + "epoch": 0.49947898575894406, + "grad_norm": 0.4851885791856978, + "learning_rate": 5.2530645911128135e-06, + "loss": 0.0159, + "step": 1438 + }, + { + "epoch": 0.4998263285863147, + "grad_norm": 0.8448863799291079, + "learning_rate": 5.247445581423257e-06, + "loss": 0.015, + "step": 1439 + }, + { + "epoch": 0.5001736714136853, + "grad_norm": 0.9930674011154853, + "learning_rate": 5.24182625844223e-06, + "loss": 0.0191, + "step": 1440 + }, + { + "epoch": 0.5005210142410559, + "grad_norm": 0.9939816699848936, + "learning_rate": 5.236206629284367e-06, + "loss": 0.0147, + "step": 1441 + }, + { + "epoch": 0.5008683570684266, + "grad_norm": 0.8530137576376285, + "learning_rate": 5.2305867010646975e-06, + "loss": 0.0131, + "step": 1442 + }, + { + "epoch": 0.5012156998957972, + "grad_norm": 0.6190001699293848, + "learning_rate": 5.224966480898624e-06, + "loss": 0.0205, + "step": 1443 + }, + { + "epoch": 0.5015630427231678, + "grad_norm": 0.5694946459177951, + "learning_rate": 5.219345975901925e-06, + "loss": 0.0125, + "step": 1444 + }, + { + "epoch": 0.5019103855505384, + "grad_norm": 0.6351471531109476, + "learning_rate": 5.2137251931907315e-06, + "loss": 0.0133, + "step": 1445 + }, + { + "epoch": 0.5022577283779089, + "grad_norm": 0.9258615858237176, + "learning_rate": 5.208104139881537e-06, + "loss": 0.0228, + "step": 1446 + }, + { + "epoch": 0.5026050712052796, + "grad_norm": 1.1728546136592717, + "learning_rate": 5.202482823091165e-06, + "loss": 0.027, + "step": 1447 + }, + { + "epoch": 0.5029524140326502, + "grad_norm": 0.4906390164717808, + "learning_rate": 5.196861249936782e-06, + "loss": 0.0164, + "step": 1448 + }, + { + "epoch": 0.5032997568600208, + "grad_norm": 0.8785844081111323, + "learning_rate": 5.191239427535876e-06, + "loss": 0.03, + "step": 1449 + }, + { + "epoch": 0.5036470996873914, + "grad_norm": 0.47561204331956825, + "learning_rate": 5.185617363006249e-06, + "loss": 0.0164, + "step": 1450 + }, + { + "epoch": 0.5039944425147621, + "grad_norm": 1.0364600327319036, + "learning_rate": 5.179995063466011e-06, + "loss": 0.0224, + "step": 1451 + }, + { + "epoch": 0.5043417853421327, + "grad_norm": 0.4607390170994428, + "learning_rate": 5.174372536033572e-06, + "loss": 0.0186, + "step": 1452 + }, + { + "epoch": 0.5046891281695033, + "grad_norm": 1.258104528985896, + "learning_rate": 5.168749787827625e-06, + "loss": 0.0207, + "step": 1453 + }, + { + "epoch": 0.5050364709968739, + "grad_norm": 0.4431006510218478, + "learning_rate": 5.163126825967147e-06, + "loss": 0.0248, + "step": 1454 + }, + { + "epoch": 0.5053838138242446, + "grad_norm": 1.0796514546664886, + "learning_rate": 5.157503657571386e-06, + "loss": 0.0226, + "step": 1455 + }, + { + "epoch": 0.5057311566516152, + "grad_norm": 0.7305079505359174, + "learning_rate": 5.151880289759847e-06, + "loss": 0.0162, + "step": 1456 + }, + { + "epoch": 0.5060784994789858, + "grad_norm": 0.5142991616801853, + "learning_rate": 5.14625672965229e-06, + "loss": 0.0231, + "step": 1457 + }, + { + "epoch": 0.5064258423063563, + "grad_norm": 0.3053355660863188, + "learning_rate": 5.140632984368721e-06, + "loss": 0.0169, + "step": 1458 + }, + { + "epoch": 0.5067731851337269, + "grad_norm": 0.6123341558370508, + "learning_rate": 5.1350090610293765e-06, + "loss": 0.0229, + "step": 1459 + }, + { + "epoch": 0.5071205279610976, + "grad_norm": 0.4030206492046645, + "learning_rate": 5.12938496675472e-06, + "loss": 0.0175, + "step": 1460 + }, + { + "epoch": 0.5074678707884682, + "grad_norm": 1.0886198281053343, + "learning_rate": 5.123760708665432e-06, + "loss": 0.0186, + "step": 1461 + }, + { + "epoch": 0.5078152136158388, + "grad_norm": 0.48403013770913567, + "learning_rate": 5.1181362938823995e-06, + "loss": 0.0186, + "step": 1462 + }, + { + "epoch": 0.5081625564432094, + "grad_norm": 0.740876359392756, + "learning_rate": 5.112511729526708e-06, + "loss": 0.0202, + "step": 1463 + }, + { + "epoch": 0.5085098992705801, + "grad_norm": 1.3075365516854722, + "learning_rate": 5.106887022719633e-06, + "loss": 0.0182, + "step": 1464 + }, + { + "epoch": 0.5088572420979507, + "grad_norm": 0.5392655690852264, + "learning_rate": 5.101262180582628e-06, + "loss": 0.0274, + "step": 1465 + }, + { + "epoch": 0.5092045849253213, + "grad_norm": 0.7466155039978699, + "learning_rate": 5.095637210237324e-06, + "loss": 0.0198, + "step": 1466 + }, + { + "epoch": 0.5095519277526919, + "grad_norm": 0.4642989380683977, + "learning_rate": 5.090012118805505e-06, + "loss": 0.0235, + "step": 1467 + }, + { + "epoch": 0.5098992705800626, + "grad_norm": 0.5150423584413911, + "learning_rate": 5.084386913409118e-06, + "loss": 0.0256, + "step": 1468 + }, + { + "epoch": 0.5102466134074332, + "grad_norm": 0.5093614850962735, + "learning_rate": 5.0787616011702455e-06, + "loss": 0.0195, + "step": 1469 + }, + { + "epoch": 0.5105939562348037, + "grad_norm": 0.7660637501450626, + "learning_rate": 5.073136189211114e-06, + "loss": 0.0176, + "step": 1470 + }, + { + "epoch": 0.5109412990621743, + "grad_norm": 1.0473364217411674, + "learning_rate": 5.067510684654069e-06, + "loss": 0.0261, + "step": 1471 + }, + { + "epoch": 0.5112886418895449, + "grad_norm": 0.8782397765062985, + "learning_rate": 5.061885094621575e-06, + "loss": 0.0199, + "step": 1472 + }, + { + "epoch": 0.5116359847169156, + "grad_norm": 0.4607201639452861, + "learning_rate": 5.056259426236207e-06, + "loss": 0.0146, + "step": 1473 + }, + { + "epoch": 0.5119833275442862, + "grad_norm": 0.6661850564864881, + "learning_rate": 5.05063368662064e-06, + "loss": 0.0179, + "step": 1474 + }, + { + "epoch": 0.5123306703716568, + "grad_norm": 0.4772014886025317, + "learning_rate": 5.0450078828976326e-06, + "loss": 0.0198, + "step": 1475 + }, + { + "epoch": 0.5126780131990274, + "grad_norm": 1.305023780400015, + "learning_rate": 5.0393820221900325e-06, + "loss": 0.0219, + "step": 1476 + }, + { + "epoch": 0.5130253560263981, + "grad_norm": 0.8537297309135922, + "learning_rate": 5.0337561116207546e-06, + "loss": 0.0191, + "step": 1477 + }, + { + "epoch": 0.5133726988537687, + "grad_norm": 0.5326568118271483, + "learning_rate": 5.028130158312779e-06, + "loss": 0.0246, + "step": 1478 + }, + { + "epoch": 0.5137200416811393, + "grad_norm": 0.3866710810781936, + "learning_rate": 5.02250416938914e-06, + "loss": 0.0178, + "step": 1479 + }, + { + "epoch": 0.5140673845085099, + "grad_norm": 0.22396563755414556, + "learning_rate": 5.016878151972915e-06, + "loss": 0.0123, + "step": 1480 + }, + { + "epoch": 0.5144147273358806, + "grad_norm": 0.4451777326858171, + "learning_rate": 5.01125211318722e-06, + "loss": 0.0214, + "step": 1481 + }, + { + "epoch": 0.5147620701632512, + "grad_norm": 0.6070766959527533, + "learning_rate": 5.005626060155194e-06, + "loss": 0.0252, + "step": 1482 + }, + { + "epoch": 0.5151094129906217, + "grad_norm": 1.0407371235155332, + "learning_rate": 5e-06, + "loss": 0.0221, + "step": 1483 + }, + { + "epoch": 0.5154567558179923, + "grad_norm": 0.9630652449889104, + "learning_rate": 4.994373939844807e-06, + "loss": 0.0292, + "step": 1484 + }, + { + "epoch": 0.5158040986453629, + "grad_norm": 0.5372033115683907, + "learning_rate": 4.988747886812781e-06, + "loss": 0.0332, + "step": 1485 + }, + { + "epoch": 0.5161514414727336, + "grad_norm": 0.42093625107990457, + "learning_rate": 4.983121848027088e-06, + "loss": 0.0154, + "step": 1486 + }, + { + "epoch": 0.5164987843001042, + "grad_norm": 0.5437961266541049, + "learning_rate": 4.977495830610862e-06, + "loss": 0.0162, + "step": 1487 + }, + { + "epoch": 0.5168461271274748, + "grad_norm": 0.6036534712036593, + "learning_rate": 4.9718698416872215e-06, + "loss": 0.0267, + "step": 1488 + }, + { + "epoch": 0.5171934699548454, + "grad_norm": 0.531411114394444, + "learning_rate": 4.966243888379245e-06, + "loss": 0.0273, + "step": 1489 + }, + { + "epoch": 0.5175408127822161, + "grad_norm": 0.907790815597591, + "learning_rate": 4.96061797780997e-06, + "loss": 0.0143, + "step": 1490 + }, + { + "epoch": 0.5178881556095867, + "grad_norm": 1.0791005798691276, + "learning_rate": 4.954992117102369e-06, + "loss": 0.0203, + "step": 1491 + }, + { + "epoch": 0.5182354984369573, + "grad_norm": 1.0050228768407403, + "learning_rate": 4.949366313379362e-06, + "loss": 0.0159, + "step": 1492 + }, + { + "epoch": 0.5185828412643279, + "grad_norm": 0.31620179318811703, + "learning_rate": 4.943740573763794e-06, + "loss": 0.0181, + "step": 1493 + }, + { + "epoch": 0.5189301840916986, + "grad_norm": 0.693985447212686, + "learning_rate": 4.938114905378428e-06, + "loss": 0.0143, + "step": 1494 + }, + { + "epoch": 0.5192775269190691, + "grad_norm": 0.9981712152035553, + "learning_rate": 4.932489315345933e-06, + "loss": 0.022, + "step": 1495 + }, + { + "epoch": 0.5196248697464397, + "grad_norm": 0.47907771487206496, + "learning_rate": 4.9268638107888875e-06, + "loss": 0.0158, + "step": 1496 + }, + { + "epoch": 0.5199722125738103, + "grad_norm": 0.6623027870764544, + "learning_rate": 4.9212383988297545e-06, + "loss": 0.0134, + "step": 1497 + }, + { + "epoch": 0.5203195554011809, + "grad_norm": 1.0794034174253464, + "learning_rate": 4.9156130865908845e-06, + "loss": 0.0292, + "step": 1498 + }, + { + "epoch": 0.5206668982285516, + "grad_norm": 0.3138682448100754, + "learning_rate": 4.9099878811944965e-06, + "loss": 0.0155, + "step": 1499 + }, + { + "epoch": 0.5210142410559222, + "grad_norm": 0.7583850057995437, + "learning_rate": 4.904362789762677e-06, + "loss": 0.0147, + "step": 1500 + }, + { + "epoch": 0.5213615838832928, + "grad_norm": 0.7155832485484563, + "learning_rate": 4.898737819417372e-06, + "loss": 0.0203, + "step": 1501 + }, + { + "epoch": 0.5217089267106634, + "grad_norm": 0.35672739687797744, + "learning_rate": 4.893112977280369e-06, + "loss": 0.0069, + "step": 1502 + }, + { + "epoch": 0.5220562695380341, + "grad_norm": 0.987017131142079, + "learning_rate": 4.887488270473294e-06, + "loss": 0.0232, + "step": 1503 + }, + { + "epoch": 0.5224036123654047, + "grad_norm": 0.34804891464014953, + "learning_rate": 4.881863706117601e-06, + "loss": 0.0134, + "step": 1504 + }, + { + "epoch": 0.5227509551927753, + "grad_norm": 0.6251911038205994, + "learning_rate": 4.876239291334568e-06, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.5230982980201458, + "grad_norm": 1.3491600406620532, + "learning_rate": 4.8706150332452815e-06, + "loss": 0.0237, + "step": 1506 + }, + { + "epoch": 0.5234456408475165, + "grad_norm": 0.6296332865053156, + "learning_rate": 4.864990938970624e-06, + "loss": 0.0162, + "step": 1507 + }, + { + "epoch": 0.5237929836748871, + "grad_norm": 0.6175960968496856, + "learning_rate": 4.85936701563128e-06, + "loss": 0.0223, + "step": 1508 + }, + { + "epoch": 0.5241403265022577, + "grad_norm": 0.6135026142466486, + "learning_rate": 4.85374327034771e-06, + "loss": 0.0192, + "step": 1509 + }, + { + "epoch": 0.5244876693296283, + "grad_norm": 0.8443887642336403, + "learning_rate": 4.848119710240156e-06, + "loss": 0.0244, + "step": 1510 + }, + { + "epoch": 0.5248350121569989, + "grad_norm": 0.7215133934035978, + "learning_rate": 4.842496342428616e-06, + "loss": 0.0306, + "step": 1511 + }, + { + "epoch": 0.5251823549843696, + "grad_norm": 0.3768241222787331, + "learning_rate": 4.8368731740328536e-06, + "loss": 0.0139, + "step": 1512 + }, + { + "epoch": 0.5255296978117402, + "grad_norm": 0.3510966580810152, + "learning_rate": 4.8312502121723755e-06, + "loss": 0.0137, + "step": 1513 + }, + { + "epoch": 0.5258770406391108, + "grad_norm": 0.9763840468277634, + "learning_rate": 4.825627463966431e-06, + "loss": 0.0218, + "step": 1514 + }, + { + "epoch": 0.5262243834664814, + "grad_norm": 0.9974349698601298, + "learning_rate": 4.8200049365339905e-06, + "loss": 0.0229, + "step": 1515 + }, + { + "epoch": 0.5265717262938521, + "grad_norm": 0.395840326358885, + "learning_rate": 4.814382636993753e-06, + "loss": 0.0224, + "step": 1516 + }, + { + "epoch": 0.5269190691212227, + "grad_norm": 0.7070158097071757, + "learning_rate": 4.808760572464126e-06, + "loss": 0.0237, + "step": 1517 + }, + { + "epoch": 0.5272664119485932, + "grad_norm": 0.34439167819910455, + "learning_rate": 4.80313875006322e-06, + "loss": 0.0209, + "step": 1518 + }, + { + "epoch": 0.5276137547759638, + "grad_norm": 0.333265253316496, + "learning_rate": 4.7975171769088366e-06, + "loss": 0.0203, + "step": 1519 + }, + { + "epoch": 0.5279610976033345, + "grad_norm": 0.7184684037655478, + "learning_rate": 4.791895860118465e-06, + "loss": 0.0182, + "step": 1520 + }, + { + "epoch": 0.5283084404307051, + "grad_norm": 0.8473043460095674, + "learning_rate": 4.7862748068092685e-06, + "loss": 0.0174, + "step": 1521 + }, + { + "epoch": 0.5286557832580757, + "grad_norm": 0.376539100887896, + "learning_rate": 4.780654024098076e-06, + "loss": 0.0167, + "step": 1522 + }, + { + "epoch": 0.5290031260854463, + "grad_norm": 1.1649611690839885, + "learning_rate": 4.775033519101378e-06, + "loss": 0.0217, + "step": 1523 + }, + { + "epoch": 0.5293504689128169, + "grad_norm": 0.62058919496297, + "learning_rate": 4.769413298935305e-06, + "loss": 0.0136, + "step": 1524 + }, + { + "epoch": 0.5296978117401876, + "grad_norm": 0.5251282587265036, + "learning_rate": 4.763793370715635e-06, + "loss": 0.0219, + "step": 1525 + }, + { + "epoch": 0.5300451545675582, + "grad_norm": 0.6236545128248215, + "learning_rate": 4.758173741557772e-06, + "loss": 0.0181, + "step": 1526 + }, + { + "epoch": 0.5303924973949288, + "grad_norm": 0.9945963688156065, + "learning_rate": 4.752554418576744e-06, + "loss": 0.0203, + "step": 1527 + }, + { + "epoch": 0.5307398402222994, + "grad_norm": 0.45683681677249527, + "learning_rate": 4.746935408887188e-06, + "loss": 0.0094, + "step": 1528 + }, + { + "epoch": 0.5310871830496701, + "grad_norm": 0.2846095731280691, + "learning_rate": 4.741316719603348e-06, + "loss": 0.0102, + "step": 1529 + }, + { + "epoch": 0.5314345258770407, + "grad_norm": 0.3347130074226901, + "learning_rate": 4.735698357839061e-06, + "loss": 0.016, + "step": 1530 + }, + { + "epoch": 0.5317818687044112, + "grad_norm": 0.28383945406870187, + "learning_rate": 4.730080330707748e-06, + "loss": 0.018, + "step": 1531 + }, + { + "epoch": 0.5321292115317818, + "grad_norm": 0.39618404004198604, + "learning_rate": 4.724462645322406e-06, + "loss": 0.0146, + "step": 1532 + }, + { + "epoch": 0.5324765543591525, + "grad_norm": 0.48107525602946016, + "learning_rate": 4.718845308795601e-06, + "loss": 0.0084, + "step": 1533 + }, + { + "epoch": 0.5328238971865231, + "grad_norm": 0.7775999794726074, + "learning_rate": 4.71322832823946e-06, + "loss": 0.017, + "step": 1534 + }, + { + "epoch": 0.5331712400138937, + "grad_norm": 1.069007509619891, + "learning_rate": 4.707611710765654e-06, + "loss": 0.0288, + "step": 1535 + }, + { + "epoch": 0.5335185828412643, + "grad_norm": 0.39849903984305235, + "learning_rate": 4.701995463485395e-06, + "loss": 0.0117, + "step": 1536 + }, + { + "epoch": 0.5338659256686349, + "grad_norm": 0.677236043933078, + "learning_rate": 4.696379593509429e-06, + "loss": 0.015, + "step": 1537 + }, + { + "epoch": 0.5342132684960056, + "grad_norm": 0.4265486094005142, + "learning_rate": 4.690764107948025e-06, + "loss": 0.0122, + "step": 1538 + }, + { + "epoch": 0.5345606113233762, + "grad_norm": 0.4715494638944048, + "learning_rate": 4.685149013910962e-06, + "loss": 0.0156, + "step": 1539 + }, + { + "epoch": 0.5349079541507468, + "grad_norm": 0.7150624971216185, + "learning_rate": 4.6795343185075235e-06, + "loss": 0.0169, + "step": 1540 + }, + { + "epoch": 0.5352552969781174, + "grad_norm": 0.6525023335363956, + "learning_rate": 4.6739200288464905e-06, + "loss": 0.0291, + "step": 1541 + }, + { + "epoch": 0.5356026398054881, + "grad_norm": 0.47839611155737516, + "learning_rate": 4.668306152036129e-06, + "loss": 0.0162, + "step": 1542 + }, + { + "epoch": 0.5359499826328586, + "grad_norm": 0.7470286100606716, + "learning_rate": 4.662692695184184e-06, + "loss": 0.026, + "step": 1543 + }, + { + "epoch": 0.5362973254602292, + "grad_norm": 0.4794922661466092, + "learning_rate": 4.657079665397865e-06, + "loss": 0.0233, + "step": 1544 + }, + { + "epoch": 0.5366446682875998, + "grad_norm": 0.7903918813270774, + "learning_rate": 4.651467069783845e-06, + "loss": 0.0232, + "step": 1545 + }, + { + "epoch": 0.5369920111149705, + "grad_norm": 1.610986156977707, + "learning_rate": 4.645854915448243e-06, + "loss": 0.0298, + "step": 1546 + }, + { + "epoch": 0.5373393539423411, + "grad_norm": 0.6560078832036462, + "learning_rate": 4.640243209496627e-06, + "loss": 0.0186, + "step": 1547 + }, + { + "epoch": 0.5376866967697117, + "grad_norm": 1.6026250043381345, + "learning_rate": 4.634631959033985e-06, + "loss": 0.0218, + "step": 1548 + }, + { + "epoch": 0.5380340395970823, + "grad_norm": 0.6898766822043673, + "learning_rate": 4.62902117116474e-06, + "loss": 0.017, + "step": 1549 + }, + { + "epoch": 0.5383813824244529, + "grad_norm": 0.5467408900610035, + "learning_rate": 4.623410852992724e-06, + "loss": 0.0233, + "step": 1550 + }, + { + "epoch": 0.5387287252518236, + "grad_norm": 1.0551467144246778, + "learning_rate": 4.617801011621175e-06, + "loss": 0.0208, + "step": 1551 + }, + { + "epoch": 0.5390760680791942, + "grad_norm": 1.162941958095386, + "learning_rate": 4.6121916541527235e-06, + "loss": 0.0259, + "step": 1552 + }, + { + "epoch": 0.5394234109065648, + "grad_norm": 0.3761300022318631, + "learning_rate": 4.606582787689396e-06, + "loss": 0.0185, + "step": 1553 + }, + { + "epoch": 0.5397707537339353, + "grad_norm": 0.6376765879619961, + "learning_rate": 4.600974419332591e-06, + "loss": 0.0172, + "step": 1554 + }, + { + "epoch": 0.540118096561306, + "grad_norm": 0.8179982640637906, + "learning_rate": 4.595366556183079e-06, + "loss": 0.0192, + "step": 1555 + }, + { + "epoch": 0.5404654393886766, + "grad_norm": 0.6660409394522981, + "learning_rate": 4.589759205340986e-06, + "loss": 0.0201, + "step": 1556 + }, + { + "epoch": 0.5408127822160472, + "grad_norm": 0.37364095824936294, + "learning_rate": 4.584152373905794e-06, + "loss": 0.0242, + "step": 1557 + }, + { + "epoch": 0.5411601250434178, + "grad_norm": 0.6214912023671911, + "learning_rate": 4.578546068976329e-06, + "loss": 0.0186, + "step": 1558 + }, + { + "epoch": 0.5415074678707885, + "grad_norm": 0.3570650197481583, + "learning_rate": 4.572940297650747e-06, + "loss": 0.0137, + "step": 1559 + }, + { + "epoch": 0.5418548106981591, + "grad_norm": 0.5029530205971685, + "learning_rate": 4.567335067026528e-06, + "loss": 0.0144, + "step": 1560 + }, + { + "epoch": 0.5422021535255297, + "grad_norm": 0.8095089739524693, + "learning_rate": 4.561730384200467e-06, + "loss": 0.021, + "step": 1561 + }, + { + "epoch": 0.5425494963529003, + "grad_norm": 0.45079050575045515, + "learning_rate": 4.556126256268671e-06, + "loss": 0.0199, + "step": 1562 + }, + { + "epoch": 0.5428968391802709, + "grad_norm": 0.544012994922186, + "learning_rate": 4.550522690326538e-06, + "loss": 0.0178, + "step": 1563 + }, + { + "epoch": 0.5432441820076416, + "grad_norm": 0.4351027380527296, + "learning_rate": 4.544919693468759e-06, + "loss": 0.0187, + "step": 1564 + }, + { + "epoch": 0.5435915248350122, + "grad_norm": 0.735577031685772, + "learning_rate": 4.539317272789299e-06, + "loss": 0.0176, + "step": 1565 + }, + { + "epoch": 0.5439388676623828, + "grad_norm": 0.7448332665427014, + "learning_rate": 4.533715435381398e-06, + "loss": 0.0203, + "step": 1566 + }, + { + "epoch": 0.5442862104897533, + "grad_norm": 0.8816422119244176, + "learning_rate": 4.528114188337559e-06, + "loss": 0.0205, + "step": 1567 + }, + { + "epoch": 0.544633553317124, + "grad_norm": 0.7125240794309594, + "learning_rate": 4.522513538749534e-06, + "loss": 0.019, + "step": 1568 + }, + { + "epoch": 0.5449808961444946, + "grad_norm": 0.6691735284658429, + "learning_rate": 4.516913493708317e-06, + "loss": 0.021, + "step": 1569 + }, + { + "epoch": 0.5453282389718652, + "grad_norm": 0.8271292240524128, + "learning_rate": 4.511314060304141e-06, + "loss": 0.0189, + "step": 1570 + }, + { + "epoch": 0.5456755817992358, + "grad_norm": 0.9574204304906477, + "learning_rate": 4.505715245626462e-06, + "loss": 0.023, + "step": 1571 + }, + { + "epoch": 0.5460229246266065, + "grad_norm": 0.6547097514027208, + "learning_rate": 4.500117056763956e-06, + "loss": 0.0225, + "step": 1572 + }, + { + "epoch": 0.5463702674539771, + "grad_norm": 1.0813224646564668, + "learning_rate": 4.494519500804501e-06, + "loss": 0.0173, + "step": 1573 + }, + { + "epoch": 0.5467176102813477, + "grad_norm": 0.9496654452606548, + "learning_rate": 4.488922584835177e-06, + "loss": 0.0146, + "step": 1574 + }, + { + "epoch": 0.5470649531087183, + "grad_norm": 0.49774440254371105, + "learning_rate": 4.483326315942253e-06, + "loss": 0.0157, + "step": 1575 + }, + { + "epoch": 0.5474122959360889, + "grad_norm": 0.4088463704083283, + "learning_rate": 4.477730701211183e-06, + "loss": 0.017, + "step": 1576 + }, + { + "epoch": 0.5477596387634596, + "grad_norm": 0.6482438150567166, + "learning_rate": 4.472135747726583e-06, + "loss": 0.0265, + "step": 1577 + }, + { + "epoch": 0.5481069815908302, + "grad_norm": 1.1293197974606506, + "learning_rate": 4.466541462572243e-06, + "loss": 0.0189, + "step": 1578 + }, + { + "epoch": 0.5484543244182007, + "grad_norm": 0.5822884700457213, + "learning_rate": 4.460947852831097e-06, + "loss": 0.0183, + "step": 1579 + }, + { + "epoch": 0.5488016672455713, + "grad_norm": 0.5604041072365497, + "learning_rate": 4.455354925585234e-06, + "loss": 0.0283, + "step": 1580 + }, + { + "epoch": 0.549149010072942, + "grad_norm": 0.44375445497432964, + "learning_rate": 4.449762687915866e-06, + "loss": 0.0131, + "step": 1581 + }, + { + "epoch": 0.5494963529003126, + "grad_norm": 1.0388835523370303, + "learning_rate": 4.444171146903345e-06, + "loss": 0.016, + "step": 1582 + }, + { + "epoch": 0.5498436957276832, + "grad_norm": 0.6674378343975669, + "learning_rate": 4.438580309627132e-06, + "loss": 0.0198, + "step": 1583 + }, + { + "epoch": 0.5501910385550538, + "grad_norm": 0.39270508786634684, + "learning_rate": 4.4329901831658035e-06, + "loss": 0.0128, + "step": 1584 + }, + { + "epoch": 0.5505383813824245, + "grad_norm": 0.7015995821207273, + "learning_rate": 4.427400774597028e-06, + "loss": 0.0246, + "step": 1585 + }, + { + "epoch": 0.5508857242097951, + "grad_norm": 0.4993737094804665, + "learning_rate": 4.421812090997573e-06, + "loss": 0.0231, + "step": 1586 + }, + { + "epoch": 0.5512330670371657, + "grad_norm": 0.549124903537338, + "learning_rate": 4.4162241394432834e-06, + "loss": 0.0219, + "step": 1587 + }, + { + "epoch": 0.5515804098645363, + "grad_norm": 0.9191413987980975, + "learning_rate": 4.4106369270090814e-06, + "loss": 0.0128, + "step": 1588 + }, + { + "epoch": 0.5519277526919069, + "grad_norm": 0.47115453283840025, + "learning_rate": 4.405050460768947e-06, + "loss": 0.0151, + "step": 1589 + }, + { + "epoch": 0.5522750955192776, + "grad_norm": 0.5360090594122309, + "learning_rate": 4.3994647477959205e-06, + "loss": 0.0194, + "step": 1590 + }, + { + "epoch": 0.5526224383466481, + "grad_norm": 0.6178093745994127, + "learning_rate": 4.393879795162088e-06, + "loss": 0.0185, + "step": 1591 + }, + { + "epoch": 0.5529697811740187, + "grad_norm": 0.5705601551799697, + "learning_rate": 4.388295609938572e-06, + "loss": 0.0194, + "step": 1592 + }, + { + "epoch": 0.5533171240013893, + "grad_norm": 0.6254120310087067, + "learning_rate": 4.3827121991955235e-06, + "loss": 0.0181, + "step": 1593 + }, + { + "epoch": 0.55366446682876, + "grad_norm": 0.3870391915004128, + "learning_rate": 4.37712957000211e-06, + "loss": 0.0148, + "step": 1594 + }, + { + "epoch": 0.5540118096561306, + "grad_norm": 0.9142400891756279, + "learning_rate": 4.371547729426517e-06, + "loss": 0.0171, + "step": 1595 + }, + { + "epoch": 0.5543591524835012, + "grad_norm": 0.49985847505258046, + "learning_rate": 4.365966684535925e-06, + "loss": 0.0201, + "step": 1596 + }, + { + "epoch": 0.5547064953108718, + "grad_norm": 1.3223267741569067, + "learning_rate": 4.360386442396508e-06, + "loss": 0.0191, + "step": 1597 + }, + { + "epoch": 0.5550538381382425, + "grad_norm": 0.3660339615210978, + "learning_rate": 4.354807010073425e-06, + "loss": 0.0116, + "step": 1598 + }, + { + "epoch": 0.5554011809656131, + "grad_norm": 1.0210647191568252, + "learning_rate": 4.349228394630808e-06, + "loss": 0.0294, + "step": 1599 + }, + { + "epoch": 0.5557485237929837, + "grad_norm": 0.5885533492032752, + "learning_rate": 4.3436506031317594e-06, + "loss": 0.0175, + "step": 1600 + }, + { + "epoch": 0.5560958666203543, + "grad_norm": 0.6342172768582458, + "learning_rate": 4.338073642638334e-06, + "loss": 0.014, + "step": 1601 + }, + { + "epoch": 0.5564432094477249, + "grad_norm": 0.5569542724926432, + "learning_rate": 4.3324975202115345e-06, + "loss": 0.0195, + "step": 1602 + }, + { + "epoch": 0.5567905522750956, + "grad_norm": 0.7223546169720154, + "learning_rate": 4.326922242911302e-06, + "loss": 0.0142, + "step": 1603 + }, + { + "epoch": 0.5571378951024661, + "grad_norm": 0.3683037424124626, + "learning_rate": 4.321347817796511e-06, + "loss": 0.0196, + "step": 1604 + }, + { + "epoch": 0.5574852379298367, + "grad_norm": 1.2092694294770592, + "learning_rate": 4.3157742519249576e-06, + "loss": 0.0251, + "step": 1605 + }, + { + "epoch": 0.5578325807572073, + "grad_norm": 0.4910970055629151, + "learning_rate": 4.3102015523533436e-06, + "loss": 0.0164, + "step": 1606 + }, + { + "epoch": 0.558179923584578, + "grad_norm": 0.36288578186019704, + "learning_rate": 4.304629726137279e-06, + "loss": 0.0142, + "step": 1607 + }, + { + "epoch": 0.5585272664119486, + "grad_norm": 0.5776094098850557, + "learning_rate": 4.299058780331267e-06, + "loss": 0.0147, + "step": 1608 + }, + { + "epoch": 0.5588746092393192, + "grad_norm": 0.4606276309503416, + "learning_rate": 4.293488721988698e-06, + "loss": 0.0171, + "step": 1609 + }, + { + "epoch": 0.5592219520666898, + "grad_norm": 1.9663566917991069, + "learning_rate": 4.287919558161835e-06, + "loss": 0.0173, + "step": 1610 + }, + { + "epoch": 0.5595692948940605, + "grad_norm": 0.9161917744112766, + "learning_rate": 4.28235129590181e-06, + "loss": 0.0164, + "step": 1611 + }, + { + "epoch": 0.5599166377214311, + "grad_norm": 0.3841077769256294, + "learning_rate": 4.276783942258613e-06, + "loss": 0.0136, + "step": 1612 + }, + { + "epoch": 0.5602639805488017, + "grad_norm": 0.40125517813129213, + "learning_rate": 4.27121750428109e-06, + "loss": 0.0181, + "step": 1613 + }, + { + "epoch": 0.5606113233761723, + "grad_norm": 0.8054623880288195, + "learning_rate": 4.265651989016915e-06, + "loss": 0.019, + "step": 1614 + }, + { + "epoch": 0.5609586662035428, + "grad_norm": 0.9023479920127903, + "learning_rate": 4.260087403512605e-06, + "loss": 0.0185, + "step": 1615 + }, + { + "epoch": 0.5613060090309135, + "grad_norm": 0.4648304244403317, + "learning_rate": 4.254523754813495e-06, + "loss": 0.0193, + "step": 1616 + }, + { + "epoch": 0.5616533518582841, + "grad_norm": 0.9102616101626732, + "learning_rate": 4.2489610499637346e-06, + "loss": 0.0173, + "step": 1617 + }, + { + "epoch": 0.5620006946856547, + "grad_norm": 0.5742925429597601, + "learning_rate": 4.243399296006276e-06, + "loss": 0.0224, + "step": 1618 + }, + { + "epoch": 0.5623480375130253, + "grad_norm": 1.1494398692641448, + "learning_rate": 4.237838499982874e-06, + "loss": 0.0195, + "step": 1619 + }, + { + "epoch": 0.562695380340396, + "grad_norm": 0.8792697527207582, + "learning_rate": 4.232278668934063e-06, + "loss": 0.0135, + "step": 1620 + }, + { + "epoch": 0.5630427231677666, + "grad_norm": 0.46740643343738697, + "learning_rate": 4.226719809899163e-06, + "loss": 0.0199, + "step": 1621 + }, + { + "epoch": 0.5633900659951372, + "grad_norm": 1.1632556584909721, + "learning_rate": 4.221161929916255e-06, + "loss": 0.0304, + "step": 1622 + }, + { + "epoch": 0.5637374088225078, + "grad_norm": 0.5407257650582535, + "learning_rate": 4.2156050360221855e-06, + "loss": 0.0109, + "step": 1623 + }, + { + "epoch": 0.5640847516498785, + "grad_norm": 0.6586876902269023, + "learning_rate": 4.210049135252554e-06, + "loss": 0.0149, + "step": 1624 + }, + { + "epoch": 0.5644320944772491, + "grad_norm": 0.7069930675788888, + "learning_rate": 4.204494234641701e-06, + "loss": 0.0099, + "step": 1625 + }, + { + "epoch": 0.5647794373046197, + "grad_norm": 0.8706649712202481, + "learning_rate": 4.198940341222699e-06, + "loss": 0.0259, + "step": 1626 + }, + { + "epoch": 0.5651267801319902, + "grad_norm": 0.4998724109402863, + "learning_rate": 4.193387462027343e-06, + "loss": 0.019, + "step": 1627 + }, + { + "epoch": 0.5654741229593608, + "grad_norm": 0.6031533765692226, + "learning_rate": 4.1878356040861525e-06, + "loss": 0.0192, + "step": 1628 + }, + { + "epoch": 0.5658214657867315, + "grad_norm": 0.7768746063951399, + "learning_rate": 4.182284774428348e-06, + "loss": 0.0232, + "step": 1629 + }, + { + "epoch": 0.5661688086141021, + "grad_norm": 0.46684861318924753, + "learning_rate": 4.176734980081845e-06, + "loss": 0.0174, + "step": 1630 + }, + { + "epoch": 0.5665161514414727, + "grad_norm": 1.2406437022629773, + "learning_rate": 4.171186228073256e-06, + "loss": 0.026, + "step": 1631 + }, + { + "epoch": 0.5668634942688433, + "grad_norm": 0.42437690132062417, + "learning_rate": 4.165638525427867e-06, + "loss": 0.0181, + "step": 1632 + }, + { + "epoch": 0.567210837096214, + "grad_norm": 0.4618761042287134, + "learning_rate": 4.160091879169642e-06, + "loss": 0.0179, + "step": 1633 + }, + { + "epoch": 0.5675581799235846, + "grad_norm": 1.1659549434015535, + "learning_rate": 4.154546296321201e-06, + "loss": 0.016, + "step": 1634 + }, + { + "epoch": 0.5679055227509552, + "grad_norm": 0.8666502913074675, + "learning_rate": 4.14900178390382e-06, + "loss": 0.0216, + "step": 1635 + }, + { + "epoch": 0.5682528655783258, + "grad_norm": 1.5564805764903915, + "learning_rate": 4.143458348937421e-06, + "loss": 0.021, + "step": 1636 + }, + { + "epoch": 0.5686002084056965, + "grad_norm": 1.1023216366479238, + "learning_rate": 4.137915998440564e-06, + "loss": 0.0182, + "step": 1637 + }, + { + "epoch": 0.5689475512330671, + "grad_norm": 0.6297911166518874, + "learning_rate": 4.132374739430427e-06, + "loss": 0.0238, + "step": 1638 + }, + { + "epoch": 0.5692948940604377, + "grad_norm": 1.2566214150953217, + "learning_rate": 4.126834578922816e-06, + "loss": 0.0165, + "step": 1639 + }, + { + "epoch": 0.5696422368878082, + "grad_norm": 0.49215545540415767, + "learning_rate": 4.121295523932141e-06, + "loss": 0.013, + "step": 1640 + }, + { + "epoch": 0.5699895797151788, + "grad_norm": 0.4038473277561727, + "learning_rate": 4.115757581471412e-06, + "loss": 0.0156, + "step": 1641 + }, + { + "epoch": 0.5703369225425495, + "grad_norm": 0.5382862896323736, + "learning_rate": 4.110220758552236e-06, + "loss": 0.0329, + "step": 1642 + }, + { + "epoch": 0.5706842653699201, + "grad_norm": 1.1409766712611276, + "learning_rate": 4.104685062184795e-06, + "loss": 0.0222, + "step": 1643 + }, + { + "epoch": 0.5710316081972907, + "grad_norm": 1.0556759974525016, + "learning_rate": 4.0991504993778485e-06, + "loss": 0.0267, + "step": 1644 + }, + { + "epoch": 0.5713789510246613, + "grad_norm": 0.3881030370366468, + "learning_rate": 4.09361707713872e-06, + "loss": 0.0194, + "step": 1645 + }, + { + "epoch": 0.571726293852032, + "grad_norm": 0.4925901427231959, + "learning_rate": 4.088084802473294e-06, + "loss": 0.0195, + "step": 1646 + }, + { + "epoch": 0.5720736366794026, + "grad_norm": 0.5519961509625639, + "learning_rate": 4.0825536823859895e-06, + "loss": 0.0244, + "step": 1647 + }, + { + "epoch": 0.5724209795067732, + "grad_norm": 0.30715357160720397, + "learning_rate": 4.077023723879777e-06, + "loss": 0.0107, + "step": 1648 + }, + { + "epoch": 0.5727683223341438, + "grad_norm": 0.5494519120842487, + "learning_rate": 4.0714949339561495e-06, + "loss": 0.0167, + "step": 1649 + }, + { + "epoch": 0.5731156651615145, + "grad_norm": 0.3374000317637518, + "learning_rate": 4.065967319615123e-06, + "loss": 0.0126, + "step": 1650 + }, + { + "epoch": 0.573463007988885, + "grad_norm": 0.6301590836558902, + "learning_rate": 4.06044088785522e-06, + "loss": 0.0122, + "step": 1651 + }, + { + "epoch": 0.5738103508162556, + "grad_norm": 1.309711699788591, + "learning_rate": 4.054915645673475e-06, + "loss": 0.0225, + "step": 1652 + }, + { + "epoch": 0.5741576936436262, + "grad_norm": 0.8057778580750404, + "learning_rate": 4.049391600065407e-06, + "loss": 0.0324, + "step": 1653 + }, + { + "epoch": 0.5745050364709968, + "grad_norm": 0.6846173645835267, + "learning_rate": 4.043868758025027e-06, + "loss": 0.0149, + "step": 1654 + }, + { + "epoch": 0.5748523792983675, + "grad_norm": 0.983006587398994, + "learning_rate": 4.038347126544816e-06, + "loss": 0.0202, + "step": 1655 + }, + { + "epoch": 0.5751997221257381, + "grad_norm": 0.4000451176994636, + "learning_rate": 4.032826712615727e-06, + "loss": 0.0146, + "step": 1656 + }, + { + "epoch": 0.5755470649531087, + "grad_norm": 0.6557928230605302, + "learning_rate": 4.02730752322717e-06, + "loss": 0.0185, + "step": 1657 + }, + { + "epoch": 0.5758944077804793, + "grad_norm": 0.39754184151942606, + "learning_rate": 4.021789565367007e-06, + "loss": 0.0137, + "step": 1658 + }, + { + "epoch": 0.57624175060785, + "grad_norm": 0.4884113890090465, + "learning_rate": 4.016272846021534e-06, + "loss": 0.0227, + "step": 1659 + }, + { + "epoch": 0.5765890934352206, + "grad_norm": 0.7745976315929322, + "learning_rate": 4.010757372175485e-06, + "loss": 0.0208, + "step": 1660 + }, + { + "epoch": 0.5769364362625912, + "grad_norm": 0.4156533149774267, + "learning_rate": 4.005243150812017e-06, + "loss": 0.0202, + "step": 1661 + }, + { + "epoch": 0.5772837790899618, + "grad_norm": 0.5057128862814502, + "learning_rate": 3.999730188912698e-06, + "loss": 0.0164, + "step": 1662 + }, + { + "epoch": 0.5776311219173325, + "grad_norm": 0.3684895934447478, + "learning_rate": 3.994218493457503e-06, + "loss": 0.0181, + "step": 1663 + }, + { + "epoch": 0.577978464744703, + "grad_norm": 0.3642952572602709, + "learning_rate": 3.988708071424803e-06, + "loss": 0.013, + "step": 1664 + }, + { + "epoch": 0.5783258075720736, + "grad_norm": 0.9994996067257631, + "learning_rate": 3.983198929791357e-06, + "loss": 0.0249, + "step": 1665 + }, + { + "epoch": 0.5786731503994442, + "grad_norm": 0.3528391599891256, + "learning_rate": 3.977691075532305e-06, + "loss": 0.0134, + "step": 1666 + }, + { + "epoch": 0.5790204932268148, + "grad_norm": 0.4584118521555668, + "learning_rate": 3.9721845156211535e-06, + "loss": 0.0232, + "step": 1667 + }, + { + "epoch": 0.5793678360541855, + "grad_norm": 0.6681777982150439, + "learning_rate": 3.966679257029772e-06, + "loss": 0.0171, + "step": 1668 + }, + { + "epoch": 0.5797151788815561, + "grad_norm": 0.9132998487091103, + "learning_rate": 3.961175306728382e-06, + "loss": 0.0152, + "step": 1669 + }, + { + "epoch": 0.5800625217089267, + "grad_norm": 0.6259597245750973, + "learning_rate": 3.955672671685552e-06, + "loss": 0.0131, + "step": 1670 + }, + { + "epoch": 0.5804098645362973, + "grad_norm": 0.4912882189301569, + "learning_rate": 3.950171358868177e-06, + "loss": 0.0177, + "step": 1671 + }, + { + "epoch": 0.580757207363668, + "grad_norm": 0.9807953706195164, + "learning_rate": 3.944671375241485e-06, + "loss": 0.0187, + "step": 1672 + }, + { + "epoch": 0.5811045501910386, + "grad_norm": 0.9981909862170388, + "learning_rate": 3.939172727769021e-06, + "loss": 0.0179, + "step": 1673 + }, + { + "epoch": 0.5814518930184092, + "grad_norm": 0.6504993573564067, + "learning_rate": 3.933675423412636e-06, + "loss": 0.0179, + "step": 1674 + }, + { + "epoch": 0.5817992358457797, + "grad_norm": 0.513719596446845, + "learning_rate": 3.928179469132477e-06, + "loss": 0.0189, + "step": 1675 + }, + { + "epoch": 0.5821465786731504, + "grad_norm": 0.5682561853770528, + "learning_rate": 3.9226848718869905e-06, + "loss": 0.0162, + "step": 1676 + }, + { + "epoch": 0.582493921500521, + "grad_norm": 0.5510658649823071, + "learning_rate": 3.917191638632897e-06, + "loss": 0.0182, + "step": 1677 + }, + { + "epoch": 0.5828412643278916, + "grad_norm": 0.7127772237528726, + "learning_rate": 3.911699776325191e-06, + "loss": 0.0204, + "step": 1678 + }, + { + "epoch": 0.5831886071552622, + "grad_norm": 0.4103249102772578, + "learning_rate": 3.906209291917141e-06, + "loss": 0.0121, + "step": 1679 + }, + { + "epoch": 0.5835359499826328, + "grad_norm": 0.5473544971789079, + "learning_rate": 3.900720192360255e-06, + "loss": 0.027, + "step": 1680 + }, + { + "epoch": 0.5838832928100035, + "grad_norm": 0.39319317320120634, + "learning_rate": 3.895232484604299e-06, + "loss": 0.0142, + "step": 1681 + }, + { + "epoch": 0.5842306356373741, + "grad_norm": 0.5760042859983425, + "learning_rate": 3.889746175597274e-06, + "loss": 0.0075, + "step": 1682 + }, + { + "epoch": 0.5845779784647447, + "grad_norm": 0.5384592196281841, + "learning_rate": 3.884261272285409e-06, + "loss": 0.0162, + "step": 1683 + }, + { + "epoch": 0.5849253212921153, + "grad_norm": 0.5430426343042051, + "learning_rate": 3.8787777816131525e-06, + "loss": 0.0132, + "step": 1684 + }, + { + "epoch": 0.585272664119486, + "grad_norm": 0.49939162433255296, + "learning_rate": 3.873295710523168e-06, + "loss": 0.017, + "step": 1685 + }, + { + "epoch": 0.5856200069468566, + "grad_norm": 0.6649735504087517, + "learning_rate": 3.867815065956319e-06, + "loss": 0.0173, + "step": 1686 + }, + { + "epoch": 0.5859673497742272, + "grad_norm": 0.878731758013457, + "learning_rate": 3.862335854851664e-06, + "loss": 0.0225, + "step": 1687 + }, + { + "epoch": 0.5863146926015977, + "grad_norm": 0.6320050052238549, + "learning_rate": 3.856858084146444e-06, + "loss": 0.0216, + "step": 1688 + }, + { + "epoch": 0.5866620354289684, + "grad_norm": 0.9414138865126228, + "learning_rate": 3.851381760776077e-06, + "loss": 0.0211, + "step": 1689 + }, + { + "epoch": 0.587009378256339, + "grad_norm": 0.7355116706005304, + "learning_rate": 3.845906891674155e-06, + "loss": 0.0193, + "step": 1690 + }, + { + "epoch": 0.5873567210837096, + "grad_norm": 0.5623242566677557, + "learning_rate": 3.8404334837724205e-06, + "loss": 0.0201, + "step": 1691 + }, + { + "epoch": 0.5877040639110802, + "grad_norm": 0.5982393659307643, + "learning_rate": 3.834961544000769e-06, + "loss": 0.024, + "step": 1692 + }, + { + "epoch": 0.5880514067384508, + "grad_norm": 0.5212721299546837, + "learning_rate": 3.8294910792872355e-06, + "loss": 0.0194, + "step": 1693 + }, + { + "epoch": 0.5883987495658215, + "grad_norm": 0.9699697508233305, + "learning_rate": 3.824022096557992e-06, + "loss": 0.0246, + "step": 1694 + }, + { + "epoch": 0.5887460923931921, + "grad_norm": 0.6506075968369963, + "learning_rate": 3.8185546027373325e-06, + "loss": 0.0181, + "step": 1695 + }, + { + "epoch": 0.5890934352205627, + "grad_norm": 1.1944480293880813, + "learning_rate": 3.81308860474766e-06, + "loss": 0.0181, + "step": 1696 + }, + { + "epoch": 0.5894407780479333, + "grad_norm": 0.8174985452252524, + "learning_rate": 3.807624109509491e-06, + "loss": 0.0205, + "step": 1697 + }, + { + "epoch": 0.589788120875304, + "grad_norm": 0.9297467401132783, + "learning_rate": 3.802161123941436e-06, + "loss": 0.0181, + "step": 1698 + }, + { + "epoch": 0.5901354637026746, + "grad_norm": 1.1350479422662159, + "learning_rate": 3.7966996549601968e-06, + "loss": 0.022, + "step": 1699 + }, + { + "epoch": 0.5904828065300451, + "grad_norm": 1.0512262693137089, + "learning_rate": 3.7912397094805508e-06, + "loss": 0.0132, + "step": 1700 + }, + { + "epoch": 0.5908301493574157, + "grad_norm": 1.2991204215566619, + "learning_rate": 3.785781294415349e-06, + "loss": 0.0254, + "step": 1701 + }, + { + "epoch": 0.5911774921847864, + "grad_norm": 0.5226731278223045, + "learning_rate": 3.780324416675504e-06, + "loss": 0.0236, + "step": 1702 + }, + { + "epoch": 0.591524835012157, + "grad_norm": 0.7465469795767791, + "learning_rate": 3.7748690831699858e-06, + "loss": 0.0199, + "step": 1703 + }, + { + "epoch": 0.5918721778395276, + "grad_norm": 0.39840312715848447, + "learning_rate": 3.7694153008058005e-06, + "loss": 0.0144, + "step": 1704 + }, + { + "epoch": 0.5922195206668982, + "grad_norm": 0.62786992782903, + "learning_rate": 3.7639630764879996e-06, + "loss": 0.0152, + "step": 1705 + }, + { + "epoch": 0.5925668634942688, + "grad_norm": 0.6340042279876545, + "learning_rate": 3.7585124171196563e-06, + "loss": 0.0208, + "step": 1706 + }, + { + "epoch": 0.5929142063216395, + "grad_norm": 0.3158368812851925, + "learning_rate": 3.7530633296018664e-06, + "loss": 0.0158, + "step": 1707 + }, + { + "epoch": 0.5932615491490101, + "grad_norm": 0.6469044082386353, + "learning_rate": 3.747615820833729e-06, + "loss": 0.0155, + "step": 1708 + }, + { + "epoch": 0.5936088919763807, + "grad_norm": 0.5138966576020948, + "learning_rate": 3.7421698977123533e-06, + "loss": 0.0141, + "step": 1709 + }, + { + "epoch": 0.5939562348037513, + "grad_norm": 1.0369760306163973, + "learning_rate": 3.736725567132833e-06, + "loss": 0.0182, + "step": 1710 + }, + { + "epoch": 0.594303577631122, + "grad_norm": 0.8561393743072047, + "learning_rate": 3.731282835988252e-06, + "loss": 0.0195, + "step": 1711 + }, + { + "epoch": 0.5946509204584925, + "grad_norm": 0.3744227619737715, + "learning_rate": 3.725841711169662e-06, + "loss": 0.0125, + "step": 1712 + }, + { + "epoch": 0.5949982632858631, + "grad_norm": 0.44797174308593246, + "learning_rate": 3.7204021995660865e-06, + "loss": 0.023, + "step": 1713 + }, + { + "epoch": 0.5953456061132337, + "grad_norm": 0.5892043090184133, + "learning_rate": 3.7149643080645055e-06, + "loss": 0.0237, + "step": 1714 + }, + { + "epoch": 0.5956929489406044, + "grad_norm": 1.2898844568707646, + "learning_rate": 3.7095280435498476e-06, + "loss": 0.0208, + "step": 1715 + }, + { + "epoch": 0.596040291767975, + "grad_norm": 0.32721676156286766, + "learning_rate": 3.7040934129049794e-06, + "loss": 0.0121, + "step": 1716 + }, + { + "epoch": 0.5963876345953456, + "grad_norm": 0.8377490237817452, + "learning_rate": 3.6986604230106993e-06, + "loss": 0.0202, + "step": 1717 + }, + { + "epoch": 0.5967349774227162, + "grad_norm": 1.3845327772962783, + "learning_rate": 3.6932290807457326e-06, + "loss": 0.0232, + "step": 1718 + }, + { + "epoch": 0.5970823202500868, + "grad_norm": 0.5700167886253666, + "learning_rate": 3.6877993929867146e-06, + "loss": 0.0121, + "step": 1719 + }, + { + "epoch": 0.5974296630774575, + "grad_norm": 0.6239261997221, + "learning_rate": 3.6823713666081864e-06, + "loss": 0.0122, + "step": 1720 + }, + { + "epoch": 0.5977770059048281, + "grad_norm": 0.486573773418933, + "learning_rate": 3.676945008482585e-06, + "loss": 0.0163, + "step": 1721 + }, + { + "epoch": 0.5981243487321987, + "grad_norm": 0.8899160417072958, + "learning_rate": 3.671520325480235e-06, + "loss": 0.0152, + "step": 1722 + }, + { + "epoch": 0.5984716915595693, + "grad_norm": 0.6204522930553988, + "learning_rate": 3.6660973244693443e-06, + "loss": 0.0136, + "step": 1723 + }, + { + "epoch": 0.59881903438694, + "grad_norm": 0.6351473439134602, + "learning_rate": 3.6606760123159867e-06, + "loss": 0.0124, + "step": 1724 + }, + { + "epoch": 0.5991663772143105, + "grad_norm": 0.43393477090853044, + "learning_rate": 3.6552563958840994e-06, + "loss": 0.011, + "step": 1725 + }, + { + "epoch": 0.5995137200416811, + "grad_norm": 0.5715600265285414, + "learning_rate": 3.6498384820354693e-06, + "loss": 0.0182, + "step": 1726 + }, + { + "epoch": 0.5998610628690517, + "grad_norm": 0.8520750202499213, + "learning_rate": 3.6444222776297356e-06, + "loss": 0.0156, + "step": 1727 + }, + { + "epoch": 0.6002084056964224, + "grad_norm": 0.919378604709449, + "learning_rate": 3.6390077895243676e-06, + "loss": 0.0124, + "step": 1728 + }, + { + "epoch": 0.600555748523793, + "grad_norm": 0.5905220548628203, + "learning_rate": 3.6335950245746593e-06, + "loss": 0.0148, + "step": 1729 + }, + { + "epoch": 0.6009030913511636, + "grad_norm": 0.45040394474125944, + "learning_rate": 3.6281839896337277e-06, + "loss": 0.0191, + "step": 1730 + }, + { + "epoch": 0.6012504341785342, + "grad_norm": 0.5316669983658013, + "learning_rate": 3.6227746915524964e-06, + "loss": 0.016, + "step": 1731 + }, + { + "epoch": 0.6015977770059048, + "grad_norm": 0.4609606149512745, + "learning_rate": 3.6173671371796946e-06, + "loss": 0.0148, + "step": 1732 + }, + { + "epoch": 0.6019451198332755, + "grad_norm": 0.41149254163013027, + "learning_rate": 3.6119613333618386e-06, + "loss": 0.0133, + "step": 1733 + }, + { + "epoch": 0.6022924626606461, + "grad_norm": 1.325138871791973, + "learning_rate": 3.606557286943229e-06, + "loss": 0.0172, + "step": 1734 + }, + { + "epoch": 0.6026398054880167, + "grad_norm": 0.44045335177247696, + "learning_rate": 3.601155004765943e-06, + "loss": 0.009, + "step": 1735 + }, + { + "epoch": 0.6029871483153872, + "grad_norm": 0.6696722263208196, + "learning_rate": 3.5957544936698272e-06, + "loss": 0.018, + "step": 1736 + }, + { + "epoch": 0.603334491142758, + "grad_norm": 1.0184366709017578, + "learning_rate": 3.5903557604924764e-06, + "loss": 0.0137, + "step": 1737 + }, + { + "epoch": 0.6036818339701285, + "grad_norm": 0.43044179281096573, + "learning_rate": 3.5849588120692446e-06, + "loss": 0.0104, + "step": 1738 + }, + { + "epoch": 0.6040291767974991, + "grad_norm": 0.3641616392590476, + "learning_rate": 3.5795636552332203e-06, + "loss": 0.0106, + "step": 1739 + }, + { + "epoch": 0.6043765196248697, + "grad_norm": 0.7270321800788714, + "learning_rate": 3.5741702968152263e-06, + "loss": 0.0262, + "step": 1740 + }, + { + "epoch": 0.6047238624522404, + "grad_norm": 0.9048183481948355, + "learning_rate": 3.5687787436438044e-06, + "loss": 0.0179, + "step": 1741 + }, + { + "epoch": 0.605071205279611, + "grad_norm": 1.0702279052136698, + "learning_rate": 3.5633890025452162e-06, + "loss": 0.022, + "step": 1742 + }, + { + "epoch": 0.6054185481069816, + "grad_norm": 2.7184991390092743, + "learning_rate": 3.5580010803434254e-06, + "loss": 0.0263, + "step": 1743 + }, + { + "epoch": 0.6057658909343522, + "grad_norm": 0.5480717716362404, + "learning_rate": 3.552614983860096e-06, + "loss": 0.025, + "step": 1744 + }, + { + "epoch": 0.6061132337617228, + "grad_norm": 0.5845582912450411, + "learning_rate": 3.547230719914575e-06, + "loss": 0.0121, + "step": 1745 + }, + { + "epoch": 0.6064605765890935, + "grad_norm": 0.706154766729172, + "learning_rate": 3.541848295323893e-06, + "loss": 0.0117, + "step": 1746 + }, + { + "epoch": 0.6068079194164641, + "grad_norm": 0.6491617666226633, + "learning_rate": 3.536467716902754e-06, + "loss": 0.0227, + "step": 1747 + }, + { + "epoch": 0.6071552622438346, + "grad_norm": 0.5492141027171614, + "learning_rate": 3.5310889914635205e-06, + "loss": 0.0269, + "step": 1748 + }, + { + "epoch": 0.6075026050712052, + "grad_norm": 0.5439963844863106, + "learning_rate": 3.5257121258162092e-06, + "loss": 0.0148, + "step": 1749 + }, + { + "epoch": 0.6078499478985759, + "grad_norm": 1.3037710079779645, + "learning_rate": 3.5203371267684827e-06, + "loss": 0.0235, + "step": 1750 + }, + { + "epoch": 0.6081972907259465, + "grad_norm": 0.5104769061119815, + "learning_rate": 3.5149640011256438e-06, + "loss": 0.0198, + "step": 1751 + }, + { + "epoch": 0.6085446335533171, + "grad_norm": 0.3616939548509015, + "learning_rate": 3.5095927556906193e-06, + "loss": 0.0132, + "step": 1752 + }, + { + "epoch": 0.6088919763806877, + "grad_norm": 1.4959841416039057, + "learning_rate": 3.504223397263955e-06, + "loss": 0.0291, + "step": 1753 + }, + { + "epoch": 0.6092393192080584, + "grad_norm": 1.3319542700316978, + "learning_rate": 3.498855932643811e-06, + "loss": 0.0282, + "step": 1754 + }, + { + "epoch": 0.609586662035429, + "grad_norm": 0.3634235019022115, + "learning_rate": 3.4934903686259445e-06, + "loss": 0.0092, + "step": 1755 + }, + { + "epoch": 0.6099340048627996, + "grad_norm": 0.26043584442925766, + "learning_rate": 3.4881267120037143e-06, + "loss": 0.0109, + "step": 1756 + }, + { + "epoch": 0.6102813476901702, + "grad_norm": 0.6366073358882772, + "learning_rate": 3.4827649695680578e-06, + "loss": 0.0285, + "step": 1757 + }, + { + "epoch": 0.6106286905175408, + "grad_norm": 1.3257848109030876, + "learning_rate": 3.4774051481074885e-06, + "loss": 0.0223, + "step": 1758 + }, + { + "epoch": 0.6109760333449115, + "grad_norm": 0.4347554670924022, + "learning_rate": 3.472047254408091e-06, + "loss": 0.0156, + "step": 1759 + }, + { + "epoch": 0.611323376172282, + "grad_norm": 0.5238569207098802, + "learning_rate": 3.466691295253508e-06, + "loss": 0.0184, + "step": 1760 + }, + { + "epoch": 0.6116707189996526, + "grad_norm": 0.5864988138134234, + "learning_rate": 3.4613372774249355e-06, + "loss": 0.0192, + "step": 1761 + }, + { + "epoch": 0.6120180618270232, + "grad_norm": 0.4574609279537837, + "learning_rate": 3.455985207701105e-06, + "loss": 0.0208, + "step": 1762 + }, + { + "epoch": 0.6123654046543939, + "grad_norm": 0.6793197760170264, + "learning_rate": 3.4506350928582878e-06, + "loss": 0.0168, + "step": 1763 + }, + { + "epoch": 0.6127127474817645, + "grad_norm": 0.8979077135480431, + "learning_rate": 3.4452869396702754e-06, + "loss": 0.0193, + "step": 1764 + }, + { + "epoch": 0.6130600903091351, + "grad_norm": 1.5966359913446138, + "learning_rate": 3.439940754908382e-06, + "loss": 0.018, + "step": 1765 + }, + { + "epoch": 0.6134074331365057, + "grad_norm": 0.3362644354278385, + "learning_rate": 3.4345965453414222e-06, + "loss": 0.011, + "step": 1766 + }, + { + "epoch": 0.6137547759638764, + "grad_norm": 0.9478737143318928, + "learning_rate": 3.429254317735714e-06, + "loss": 0.0229, + "step": 1767 + }, + { + "epoch": 0.614102118791247, + "grad_norm": 0.5981398739431912, + "learning_rate": 3.423914078855064e-06, + "loss": 0.014, + "step": 1768 + }, + { + "epoch": 0.6144494616186176, + "grad_norm": 1.2492792940989146, + "learning_rate": 3.418575835460767e-06, + "loss": 0.0159, + "step": 1769 + }, + { + "epoch": 0.6147968044459882, + "grad_norm": 0.5085854726417735, + "learning_rate": 3.4132395943115803e-06, + "loss": 0.0206, + "step": 1770 + }, + { + "epoch": 0.6151441472733588, + "grad_norm": 0.3446709459428576, + "learning_rate": 3.4079053621637346e-06, + "loss": 0.0164, + "step": 1771 + }, + { + "epoch": 0.6154914901007295, + "grad_norm": 0.5545946109354595, + "learning_rate": 3.402573145770916e-06, + "loss": 0.0157, + "step": 1772 + }, + { + "epoch": 0.6158388329281, + "grad_norm": 0.9460892604144455, + "learning_rate": 3.3972429518842566e-06, + "loss": 0.0383, + "step": 1773 + }, + { + "epoch": 0.6161861757554706, + "grad_norm": 0.4367290077518178, + "learning_rate": 3.3919147872523257e-06, + "loss": 0.0159, + "step": 1774 + }, + { + "epoch": 0.6165335185828412, + "grad_norm": 0.45854417375935685, + "learning_rate": 3.3865886586211285e-06, + "loss": 0.0209, + "step": 1775 + }, + { + "epoch": 0.6168808614102119, + "grad_norm": 0.6207845739989777, + "learning_rate": 3.38126457273409e-06, + "loss": 0.0273, + "step": 1776 + }, + { + "epoch": 0.6172282042375825, + "grad_norm": 0.8158895384794037, + "learning_rate": 3.3759425363320482e-06, + "loss": 0.0205, + "step": 1777 + }, + { + "epoch": 0.6175755470649531, + "grad_norm": 0.4183124541564, + "learning_rate": 3.3706225561532457e-06, + "loss": 0.017, + "step": 1778 + }, + { + "epoch": 0.6179228898923237, + "grad_norm": 0.5875812018496149, + "learning_rate": 3.365304638933322e-06, + "loss": 0.0226, + "step": 1779 + }, + { + "epoch": 0.6182702327196944, + "grad_norm": 0.4777993091444752, + "learning_rate": 3.359988791405309e-06, + "loss": 0.0187, + "step": 1780 + }, + { + "epoch": 0.618617575547065, + "grad_norm": 0.48444108861589447, + "learning_rate": 3.3546750202996136e-06, + "loss": 0.0185, + "step": 1781 + }, + { + "epoch": 0.6189649183744356, + "grad_norm": 0.4370386372661205, + "learning_rate": 3.349363332344013e-06, + "loss": 0.0114, + "step": 1782 + }, + { + "epoch": 0.6193122612018062, + "grad_norm": 0.3131795277127707, + "learning_rate": 3.3440537342636483e-06, + "loss": 0.0162, + "step": 1783 + }, + { + "epoch": 0.6196596040291767, + "grad_norm": 0.5666886716999282, + "learning_rate": 3.338746232781017e-06, + "loss": 0.013, + "step": 1784 + }, + { + "epoch": 0.6200069468565474, + "grad_norm": 0.30906398669450097, + "learning_rate": 3.333440834615961e-06, + "loss": 0.019, + "step": 1785 + }, + { + "epoch": 0.620354289683918, + "grad_norm": 0.26856637246235177, + "learning_rate": 3.3281375464856556e-06, + "loss": 0.0118, + "step": 1786 + }, + { + "epoch": 0.6207016325112886, + "grad_norm": 0.449006224074791, + "learning_rate": 3.322836375104608e-06, + "loss": 0.0157, + "step": 1787 + }, + { + "epoch": 0.6210489753386592, + "grad_norm": 0.9291779318103394, + "learning_rate": 3.3175373271846434e-06, + "loss": 0.0165, + "step": 1788 + }, + { + "epoch": 0.6213963181660299, + "grad_norm": 0.8633956849713047, + "learning_rate": 3.3122404094349037e-06, + "loss": 0.0159, + "step": 1789 + }, + { + "epoch": 0.6217436609934005, + "grad_norm": 0.43556926498499643, + "learning_rate": 3.3069456285618263e-06, + "loss": 0.0205, + "step": 1790 + }, + { + "epoch": 0.6220910038207711, + "grad_norm": 0.4644429061948247, + "learning_rate": 3.3016529912691476e-06, + "loss": 0.0174, + "step": 1791 + }, + { + "epoch": 0.6224383466481417, + "grad_norm": 0.4928017975644182, + "learning_rate": 3.2963625042578875e-06, + "loss": 0.0195, + "step": 1792 + }, + { + "epoch": 0.6227856894755124, + "grad_norm": 0.39183290751040956, + "learning_rate": 3.2910741742263495e-06, + "loss": 0.0119, + "step": 1793 + }, + { + "epoch": 0.623133032302883, + "grad_norm": 0.5461515569612917, + "learning_rate": 3.2857880078700953e-06, + "loss": 0.018, + "step": 1794 + }, + { + "epoch": 0.6234803751302536, + "grad_norm": 0.7497392589922698, + "learning_rate": 3.2805040118819574e-06, + "loss": 0.0261, + "step": 1795 + }, + { + "epoch": 0.6238277179576242, + "grad_norm": 0.802555297478242, + "learning_rate": 3.2752221929520164e-06, + "loss": 0.0158, + "step": 1796 + }, + { + "epoch": 0.6241750607849947, + "grad_norm": 0.576305432582867, + "learning_rate": 3.2699425577675935e-06, + "loss": 0.0158, + "step": 1797 + }, + { + "epoch": 0.6245224036123654, + "grad_norm": 0.5846916212785859, + "learning_rate": 3.2646651130132533e-06, + "loss": 0.0201, + "step": 1798 + }, + { + "epoch": 0.624869746439736, + "grad_norm": 0.4703989112906414, + "learning_rate": 3.2593898653707773e-06, + "loss": 0.0153, + "step": 1799 + }, + { + "epoch": 0.6252170892671066, + "grad_norm": 0.5825293697611765, + "learning_rate": 3.254116821519171e-06, + "loss": 0.0127, + "step": 1800 + }, + { + "epoch": 0.6255644320944772, + "grad_norm": 0.4387644946692422, + "learning_rate": 3.2488459881346483e-06, + "loss": 0.02, + "step": 1801 + }, + { + "epoch": 0.6259117749218479, + "grad_norm": 0.4193154456369623, + "learning_rate": 3.2435773718906284e-06, + "loss": 0.0244, + "step": 1802 + }, + { + "epoch": 0.6262591177492185, + "grad_norm": 0.39690908782734347, + "learning_rate": 3.238310979457713e-06, + "loss": 0.0077, + "step": 1803 + }, + { + "epoch": 0.6266064605765891, + "grad_norm": 1.233442099164475, + "learning_rate": 3.233046817503699e-06, + "loss": 0.0186, + "step": 1804 + }, + { + "epoch": 0.6269538034039597, + "grad_norm": 1.6141038525452736, + "learning_rate": 3.2277848926935528e-06, + "loss": 0.0281, + "step": 1805 + }, + { + "epoch": 0.6273011462313304, + "grad_norm": 0.7139609495476705, + "learning_rate": 3.2225252116894155e-06, + "loss": 0.0258, + "step": 1806 + }, + { + "epoch": 0.627648489058701, + "grad_norm": 0.7513223031684714, + "learning_rate": 3.2172677811505766e-06, + "loss": 0.0225, + "step": 1807 + }, + { + "epoch": 0.6279958318860716, + "grad_norm": 0.8056062866255712, + "learning_rate": 3.2120126077334844e-06, + "loss": 0.0158, + "step": 1808 + }, + { + "epoch": 0.6283431747134421, + "grad_norm": 0.523846225092398, + "learning_rate": 3.2067596980917282e-06, + "loss": 0.0189, + "step": 1809 + }, + { + "epoch": 0.6286905175408127, + "grad_norm": 0.6441750993488121, + "learning_rate": 3.20150905887603e-06, + "loss": 0.0282, + "step": 1810 + }, + { + "epoch": 0.6290378603681834, + "grad_norm": 1.0001326879988315, + "learning_rate": 3.1962606967342356e-06, + "loss": 0.0201, + "step": 1811 + }, + { + "epoch": 0.629385203195554, + "grad_norm": 0.5018682485066364, + "learning_rate": 3.191014618311309e-06, + "loss": 0.0192, + "step": 1812 + }, + { + "epoch": 0.6297325460229246, + "grad_norm": 0.8969102194509483, + "learning_rate": 3.185770830249326e-06, + "loss": 0.0176, + "step": 1813 + }, + { + "epoch": 0.6300798888502952, + "grad_norm": 0.9231788627357475, + "learning_rate": 3.1805293391874604e-06, + "loss": 0.0239, + "step": 1814 + }, + { + "epoch": 0.6304272316776659, + "grad_norm": 0.4778698280139559, + "learning_rate": 3.1752901517619733e-06, + "loss": 0.0174, + "step": 1815 + }, + { + "epoch": 0.6307745745050365, + "grad_norm": 0.7761414647990466, + "learning_rate": 3.1700532746062148e-06, + "loss": 0.0237, + "step": 1816 + }, + { + "epoch": 0.6311219173324071, + "grad_norm": 0.49952446026830283, + "learning_rate": 3.1648187143506095e-06, + "loss": 0.0145, + "step": 1817 + }, + { + "epoch": 0.6314692601597777, + "grad_norm": 0.6271738528394313, + "learning_rate": 3.159586477622647e-06, + "loss": 0.0176, + "step": 1818 + }, + { + "epoch": 0.6318166029871484, + "grad_norm": 0.2541241193835112, + "learning_rate": 3.1543565710468743e-06, + "loss": 0.0113, + "step": 1819 + }, + { + "epoch": 0.632163945814519, + "grad_norm": 0.6685782936301835, + "learning_rate": 3.14912900124489e-06, + "loss": 0.0191, + "step": 1820 + }, + { + "epoch": 0.6325112886418895, + "grad_norm": 0.376455933499608, + "learning_rate": 3.1439037748353316e-06, + "loss": 0.0233, + "step": 1821 + }, + { + "epoch": 0.6328586314692601, + "grad_norm": 0.42194631315752845, + "learning_rate": 3.1386808984338758e-06, + "loss": 0.0192, + "step": 1822 + }, + { + "epoch": 0.6332059742966307, + "grad_norm": 0.5449588242931194, + "learning_rate": 3.1334603786532147e-06, + "loss": 0.0254, + "step": 1823 + }, + { + "epoch": 0.6335533171240014, + "grad_norm": 0.22995057823733359, + "learning_rate": 3.128242222103064e-06, + "loss": 0.0109, + "step": 1824 + }, + { + "epoch": 0.633900659951372, + "grad_norm": 0.48777742225584697, + "learning_rate": 3.123026435390144e-06, + "loss": 0.0231, + "step": 1825 + }, + { + "epoch": 0.6342480027787426, + "grad_norm": 0.3576773382846993, + "learning_rate": 3.117813025118178e-06, + "loss": 0.0152, + "step": 1826 + }, + { + "epoch": 0.6345953456061132, + "grad_norm": 0.2928349054625997, + "learning_rate": 3.112601997887873e-06, + "loss": 0.017, + "step": 1827 + }, + { + "epoch": 0.6349426884334839, + "grad_norm": 0.5006421572877223, + "learning_rate": 3.107393360296927e-06, + "loss": 0.0167, + "step": 1828 + }, + { + "epoch": 0.6352900312608545, + "grad_norm": 0.4143562291378625, + "learning_rate": 3.1021871189400077e-06, + "loss": 0.013, + "step": 1829 + }, + { + "epoch": 0.6356373740882251, + "grad_norm": 1.0102185381541648, + "learning_rate": 3.096983280408754e-06, + "loss": 0.032, + "step": 1830 + }, + { + "epoch": 0.6359847169155957, + "grad_norm": 0.5038454928470375, + "learning_rate": 3.091781851291753e-06, + "loss": 0.0195, + "step": 1831 + }, + { + "epoch": 0.6363320597429664, + "grad_norm": 0.9531592705539882, + "learning_rate": 3.0865828381745515e-06, + "loss": 0.0257, + "step": 1832 + }, + { + "epoch": 0.636679402570337, + "grad_norm": 0.7522264848981152, + "learning_rate": 3.0813862476396323e-06, + "loss": 0.0193, + "step": 1833 + }, + { + "epoch": 0.6370267453977075, + "grad_norm": 0.4979623790577095, + "learning_rate": 3.07619208626641e-06, + "loss": 0.018, + "step": 1834 + }, + { + "epoch": 0.6373740882250781, + "grad_norm": 1.156688985936038, + "learning_rate": 3.0710003606312292e-06, + "loss": 0.023, + "step": 1835 + }, + { + "epoch": 0.6377214310524487, + "grad_norm": 0.7241396081280072, + "learning_rate": 3.065811077307342e-06, + "loss": 0.0221, + "step": 1836 + }, + { + "epoch": 0.6380687738798194, + "grad_norm": 0.4866234304079524, + "learning_rate": 3.060624242864916e-06, + "loss": 0.0123, + "step": 1837 + }, + { + "epoch": 0.63841611670719, + "grad_norm": 0.4496824373815353, + "learning_rate": 3.0554398638710136e-06, + "loss": 0.011, + "step": 1838 + }, + { + "epoch": 0.6387634595345606, + "grad_norm": 0.7591414273681181, + "learning_rate": 3.050257946889594e-06, + "loss": 0.0151, + "step": 1839 + }, + { + "epoch": 0.6391108023619312, + "grad_norm": 0.37588557206384826, + "learning_rate": 3.045078498481491e-06, + "loss": 0.0172, + "step": 1840 + }, + { + "epoch": 0.6394581451893019, + "grad_norm": 0.5904069516982642, + "learning_rate": 3.0399015252044185e-06, + "loss": 0.0209, + "step": 1841 + }, + { + "epoch": 0.6398054880166725, + "grad_norm": 0.5503945994322635, + "learning_rate": 3.0347270336129554e-06, + "loss": 0.0161, + "step": 1842 + }, + { + "epoch": 0.6401528308440431, + "grad_norm": 0.8287427058376795, + "learning_rate": 3.02955503025854e-06, + "loss": 0.0231, + "step": 1843 + }, + { + "epoch": 0.6405001736714137, + "grad_norm": 0.8132926128652753, + "learning_rate": 3.0243855216894557e-06, + "loss": 0.0222, + "step": 1844 + }, + { + "epoch": 0.6408475164987844, + "grad_norm": 0.34673836499554306, + "learning_rate": 3.0192185144508336e-06, + "loss": 0.0096, + "step": 1845 + }, + { + "epoch": 0.6411948593261549, + "grad_norm": 0.6070961612743545, + "learning_rate": 3.0140540150846324e-06, + "loss": 0.0255, + "step": 1846 + }, + { + "epoch": 0.6415422021535255, + "grad_norm": 0.37708682146770417, + "learning_rate": 3.00889203012964e-06, + "loss": 0.0142, + "step": 1847 + }, + { + "epoch": 0.6418895449808961, + "grad_norm": 0.4761721285878218, + "learning_rate": 3.0037325661214555e-06, + "loss": 0.0172, + "step": 1848 + }, + { + "epoch": 0.6422368878082667, + "grad_norm": 0.43162467961906015, + "learning_rate": 2.99857562959249e-06, + "loss": 0.014, + "step": 1849 + }, + { + "epoch": 0.6425842306356374, + "grad_norm": 0.4418457043261697, + "learning_rate": 2.9934212270719555e-06, + "loss": 0.0178, + "step": 1850 + }, + { + "epoch": 0.642931573463008, + "grad_norm": 0.736952954263188, + "learning_rate": 2.988269365085854e-06, + "loss": 0.022, + "step": 1851 + }, + { + "epoch": 0.6432789162903786, + "grad_norm": 0.5793129124364786, + "learning_rate": 2.983120050156969e-06, + "loss": 0.0166, + "step": 1852 + }, + { + "epoch": 0.6436262591177492, + "grad_norm": 0.4059013512127999, + "learning_rate": 2.9779732888048607e-06, + "loss": 0.0134, + "step": 1853 + }, + { + "epoch": 0.6439736019451199, + "grad_norm": 0.5434418757990271, + "learning_rate": 2.9728290875458597e-06, + "loss": 0.0184, + "step": 1854 + }, + { + "epoch": 0.6443209447724905, + "grad_norm": 0.6080536174329941, + "learning_rate": 2.967687452893051e-06, + "loss": 0.0151, + "step": 1855 + }, + { + "epoch": 0.6446682875998611, + "grad_norm": 0.3794734225409095, + "learning_rate": 2.9625483913562696e-06, + "loss": 0.0051, + "step": 1856 + }, + { + "epoch": 0.6450156304272316, + "grad_norm": 0.4025726954879858, + "learning_rate": 2.957411909442095e-06, + "loss": 0.0138, + "step": 1857 + }, + { + "epoch": 0.6453629732546023, + "grad_norm": 0.9717609808679778, + "learning_rate": 2.95227801365384e-06, + "loss": 0.0234, + "step": 1858 + }, + { + "epoch": 0.6457103160819729, + "grad_norm": 0.9793666448755983, + "learning_rate": 2.947146710491545e-06, + "loss": 0.0243, + "step": 1859 + }, + { + "epoch": 0.6460576589093435, + "grad_norm": 0.6545294963642488, + "learning_rate": 2.942018006451961e-06, + "loss": 0.0198, + "step": 1860 + }, + { + "epoch": 0.6464050017367141, + "grad_norm": 0.543720085501545, + "learning_rate": 2.9368919080285574e-06, + "loss": 0.0148, + "step": 1861 + }, + { + "epoch": 0.6467523445640847, + "grad_norm": 0.5001478214174131, + "learning_rate": 2.9317684217114977e-06, + "loss": 0.0153, + "step": 1862 + }, + { + "epoch": 0.6470996873914554, + "grad_norm": 0.8634338411303856, + "learning_rate": 2.9266475539876447e-06, + "loss": 0.0153, + "step": 1863 + }, + { + "epoch": 0.647447030218826, + "grad_norm": 0.3794776934466785, + "learning_rate": 2.921529311340537e-06, + "loss": 0.0145, + "step": 1864 + }, + { + "epoch": 0.6477943730461966, + "grad_norm": 0.5456181574813935, + "learning_rate": 2.916413700250397e-06, + "loss": 0.0223, + "step": 1865 + }, + { + "epoch": 0.6481417158735672, + "grad_norm": 0.7344763458259643, + "learning_rate": 2.9113007271941118e-06, + "loss": 0.0263, + "step": 1866 + }, + { + "epoch": 0.6484890587009379, + "grad_norm": 0.42076158970673955, + "learning_rate": 2.9061903986452323e-06, + "loss": 0.0171, + "step": 1867 + }, + { + "epoch": 0.6488364015283085, + "grad_norm": 0.5699294311522565, + "learning_rate": 2.9010827210739557e-06, + "loss": 0.0223, + "step": 1868 + }, + { + "epoch": 0.649183744355679, + "grad_norm": 0.7798211226417748, + "learning_rate": 2.895977700947124e-06, + "loss": 0.0223, + "step": 1869 + }, + { + "epoch": 0.6495310871830496, + "grad_norm": 0.6520811194584589, + "learning_rate": 2.890875344728218e-06, + "loss": 0.0253, + "step": 1870 + }, + { + "epoch": 0.6498784300104203, + "grad_norm": 0.5155130215134897, + "learning_rate": 2.8857756588773457e-06, + "loss": 0.012, + "step": 1871 + }, + { + "epoch": 0.6502257728377909, + "grad_norm": 0.6695698436955599, + "learning_rate": 2.88067864985123e-06, + "loss": 0.0189, + "step": 1872 + }, + { + "epoch": 0.6505731156651615, + "grad_norm": 0.5691514228238037, + "learning_rate": 2.875584324103205e-06, + "loss": 0.022, + "step": 1873 + }, + { + "epoch": 0.6509204584925321, + "grad_norm": 0.7629069218607217, + "learning_rate": 2.8704926880832117e-06, + "loss": 0.0201, + "step": 1874 + }, + { + "epoch": 0.6512678013199027, + "grad_norm": 0.4637233958651768, + "learning_rate": 2.865403748237784e-06, + "loss": 0.0207, + "step": 1875 + }, + { + "epoch": 0.6516151441472734, + "grad_norm": 0.702869419072292, + "learning_rate": 2.860317511010041e-06, + "loss": 0.0173, + "step": 1876 + }, + { + "epoch": 0.651962486974644, + "grad_norm": 0.9400016037194756, + "learning_rate": 2.855233982839678e-06, + "loss": 0.0148, + "step": 1877 + }, + { + "epoch": 0.6523098298020146, + "grad_norm": 0.4312318941463103, + "learning_rate": 2.8501531701629658e-06, + "loss": 0.0156, + "step": 1878 + }, + { + "epoch": 0.6526571726293852, + "grad_norm": 0.8618641382406451, + "learning_rate": 2.845075079412731e-06, + "loss": 0.0163, + "step": 1879 + }, + { + "epoch": 0.6530045154567559, + "grad_norm": 0.7664712979039602, + "learning_rate": 2.8399997170183625e-06, + "loss": 0.0156, + "step": 1880 + }, + { + "epoch": 0.6533518582841265, + "grad_norm": 0.9200611024228682, + "learning_rate": 2.8349270894057822e-06, + "loss": 0.0202, + "step": 1881 + }, + { + "epoch": 0.653699201111497, + "grad_norm": 0.509279359509403, + "learning_rate": 2.8298572029974624e-06, + "loss": 0.0182, + "step": 1882 + }, + { + "epoch": 0.6540465439388676, + "grad_norm": 0.6113621614337158, + "learning_rate": 2.824790064212396e-06, + "loss": 0.019, + "step": 1883 + }, + { + "epoch": 0.6543938867662383, + "grad_norm": 0.6185846522906022, + "learning_rate": 2.8197256794661023e-06, + "loss": 0.0278, + "step": 1884 + }, + { + "epoch": 0.6547412295936089, + "grad_norm": 0.36054081111019864, + "learning_rate": 2.814664055170609e-06, + "loss": 0.0148, + "step": 1885 + }, + { + "epoch": 0.6550885724209795, + "grad_norm": 0.46357034653680274, + "learning_rate": 2.809605197734454e-06, + "loss": 0.0174, + "step": 1886 + }, + { + "epoch": 0.6554359152483501, + "grad_norm": 1.1073051215502399, + "learning_rate": 2.804549113562667e-06, + "loss": 0.0198, + "step": 1887 + }, + { + "epoch": 0.6557832580757207, + "grad_norm": 0.2971627242142241, + "learning_rate": 2.7994958090567715e-06, + "loss": 0.0175, + "step": 1888 + }, + { + "epoch": 0.6561306009030914, + "grad_norm": 0.36560530569067073, + "learning_rate": 2.7944452906147656e-06, + "loss": 0.0134, + "step": 1889 + }, + { + "epoch": 0.656477943730462, + "grad_norm": 0.3549139995169762, + "learning_rate": 2.7893975646311276e-06, + "loss": 0.0132, + "step": 1890 + }, + { + "epoch": 0.6568252865578326, + "grad_norm": 0.4732148819461115, + "learning_rate": 2.784352637496792e-06, + "loss": 0.0148, + "step": 1891 + }, + { + "epoch": 0.6571726293852032, + "grad_norm": 0.40360522752171646, + "learning_rate": 2.7793105155991584e-06, + "loss": 0.0158, + "step": 1892 + }, + { + "epoch": 0.6575199722125739, + "grad_norm": 0.7623569339524721, + "learning_rate": 2.774271205322066e-06, + "loss": 0.0177, + "step": 1893 + }, + { + "epoch": 0.6578673150399444, + "grad_norm": 1.367237702513199, + "learning_rate": 2.769234713045798e-06, + "loss": 0.0228, + "step": 1894 + }, + { + "epoch": 0.658214657867315, + "grad_norm": 0.675243175984947, + "learning_rate": 2.764201045147071e-06, + "loss": 0.0105, + "step": 1895 + }, + { + "epoch": 0.6585620006946856, + "grad_norm": 0.9439336995618546, + "learning_rate": 2.7591702079990277e-06, + "loss": 0.0181, + "step": 1896 + }, + { + "epoch": 0.6589093435220563, + "grad_norm": 0.5276873160106954, + "learning_rate": 2.754142207971221e-06, + "loss": 0.0099, + "step": 1897 + }, + { + "epoch": 0.6592566863494269, + "grad_norm": 0.569158156227268, + "learning_rate": 2.749117051429612e-06, + "loss": 0.0129, + "step": 1898 + }, + { + "epoch": 0.6596040291767975, + "grad_norm": 0.5137501939519034, + "learning_rate": 2.7440947447365664e-06, + "loss": 0.0141, + "step": 1899 + }, + { + "epoch": 0.6599513720041681, + "grad_norm": 0.7005662168512808, + "learning_rate": 2.739075294250841e-06, + "loss": 0.0201, + "step": 1900 + }, + { + "epoch": 0.6602987148315387, + "grad_norm": 0.7235707108221842, + "learning_rate": 2.7340587063275736e-06, + "loss": 0.0208, + "step": 1901 + }, + { + "epoch": 0.6606460576589094, + "grad_norm": 0.710616178049963, + "learning_rate": 2.7290449873182755e-06, + "loss": 0.0149, + "step": 1902 + }, + { + "epoch": 0.66099340048628, + "grad_norm": 0.48224009794578837, + "learning_rate": 2.7240341435708316e-06, + "loss": 0.0099, + "step": 1903 + }, + { + "epoch": 0.6613407433136506, + "grad_norm": 0.7252312473237166, + "learning_rate": 2.7190261814294873e-06, + "loss": 0.0197, + "step": 1904 + }, + { + "epoch": 0.6616880861410211, + "grad_norm": 0.4216043473064591, + "learning_rate": 2.714021107234831e-06, + "loss": 0.012, + "step": 1905 + }, + { + "epoch": 0.6620354289683918, + "grad_norm": 0.556375549796574, + "learning_rate": 2.7090189273238e-06, + "loss": 0.0203, + "step": 1906 + }, + { + "epoch": 0.6623827717957624, + "grad_norm": 1.4931977841749278, + "learning_rate": 2.7040196480296677e-06, + "loss": 0.031, + "step": 1907 + }, + { + "epoch": 0.662730114623133, + "grad_norm": 0.6694116899417739, + "learning_rate": 2.6990232756820396e-06, + "loss": 0.015, + "step": 1908 + }, + { + "epoch": 0.6630774574505036, + "grad_norm": 0.7127860652752314, + "learning_rate": 2.6940298166068255e-06, + "loss": 0.0151, + "step": 1909 + }, + { + "epoch": 0.6634248002778743, + "grad_norm": 0.5936061309196379, + "learning_rate": 2.6890392771262618e-06, + "loss": 0.0169, + "step": 1910 + }, + { + "epoch": 0.6637721431052449, + "grad_norm": 0.7367028197758674, + "learning_rate": 2.684051663558884e-06, + "loss": 0.0209, + "step": 1911 + }, + { + "epoch": 0.6641194859326155, + "grad_norm": 0.6177550979223532, + "learning_rate": 2.6790669822195202e-06, + "loss": 0.0177, + "step": 1912 + }, + { + "epoch": 0.6644668287599861, + "grad_norm": 1.1958625864238086, + "learning_rate": 2.6740852394192896e-06, + "loss": 0.0196, + "step": 1913 + }, + { + "epoch": 0.6648141715873567, + "grad_norm": 0.46507928549224564, + "learning_rate": 2.6691064414655864e-06, + "loss": 0.0123, + "step": 1914 + }, + { + "epoch": 0.6651615144147274, + "grad_norm": 0.5722070188703421, + "learning_rate": 2.664130594662083e-06, + "loss": 0.0224, + "step": 1915 + }, + { + "epoch": 0.665508857242098, + "grad_norm": 0.9009029726789275, + "learning_rate": 2.6591577053087084e-06, + "loss": 0.015, + "step": 1916 + }, + { + "epoch": 0.6658562000694686, + "grad_norm": 0.5063091771412535, + "learning_rate": 2.6541877797016535e-06, + "loss": 0.0138, + "step": 1917 + }, + { + "epoch": 0.6662035428968391, + "grad_norm": 0.45072043809201573, + "learning_rate": 2.6492208241333494e-06, + "loss": 0.016, + "step": 1918 + }, + { + "epoch": 0.6665508857242098, + "grad_norm": 0.39804190140708595, + "learning_rate": 2.6442568448924754e-06, + "loss": 0.0187, + "step": 1919 + }, + { + "epoch": 0.6668982285515804, + "grad_norm": 0.3519857413757391, + "learning_rate": 2.6392958482639343e-06, + "loss": 0.0153, + "step": 1920 + }, + { + "epoch": 0.667245571378951, + "grad_norm": 0.7945291965330532, + "learning_rate": 2.63433784052886e-06, + "loss": 0.0156, + "step": 1921 + }, + { + "epoch": 0.6675929142063216, + "grad_norm": 0.49910796071011826, + "learning_rate": 2.6293828279645938e-06, + "loss": 0.015, + "step": 1922 + }, + { + "epoch": 0.6679402570336923, + "grad_norm": 0.6606502012921845, + "learning_rate": 2.6244308168446958e-06, + "loss": 0.0329, + "step": 1923 + }, + { + "epoch": 0.6682875998610629, + "grad_norm": 0.5820565201288449, + "learning_rate": 2.6194818134389143e-06, + "loss": 0.0199, + "step": 1924 + }, + { + "epoch": 0.6686349426884335, + "grad_norm": 0.6077071562564756, + "learning_rate": 2.614535824013199e-06, + "loss": 0.0212, + "step": 1925 + }, + { + "epoch": 0.6689822855158041, + "grad_norm": 0.9648264400971077, + "learning_rate": 2.6095928548296773e-06, + "loss": 0.0308, + "step": 1926 + }, + { + "epoch": 0.6693296283431747, + "grad_norm": 0.6590307822381734, + "learning_rate": 2.6046529121466537e-06, + "loss": 0.016, + "step": 1927 + }, + { + "epoch": 0.6696769711705454, + "grad_norm": 0.647299441184019, + "learning_rate": 2.5997160022186028e-06, + "loss": 0.0259, + "step": 1928 + }, + { + "epoch": 0.670024313997916, + "grad_norm": 0.6135071672946737, + "learning_rate": 2.594782131296163e-06, + "loss": 0.0087, + "step": 1929 + }, + { + "epoch": 0.6703716568252865, + "grad_norm": 1.2610496636585344, + "learning_rate": 2.589851305626116e-06, + "loss": 0.0239, + "step": 1930 + }, + { + "epoch": 0.6707189996526571, + "grad_norm": 0.3954009222925035, + "learning_rate": 2.5849235314513923e-06, + "loss": 0.0122, + "step": 1931 + }, + { + "epoch": 0.6710663424800278, + "grad_norm": 0.5156724723554922, + "learning_rate": 2.57999881501106e-06, + "loss": 0.0241, + "step": 1932 + }, + { + "epoch": 0.6714136853073984, + "grad_norm": 0.8288162912643778, + "learning_rate": 2.575077162540318e-06, + "loss": 0.0258, + "step": 1933 + }, + { + "epoch": 0.671761028134769, + "grad_norm": 0.3963519210942034, + "learning_rate": 2.570158580270481e-06, + "loss": 0.0156, + "step": 1934 + }, + { + "epoch": 0.6721083709621396, + "grad_norm": 0.23269559096942974, + "learning_rate": 2.565243074428976e-06, + "loss": 0.0104, + "step": 1935 + }, + { + "epoch": 0.6724557137895103, + "grad_norm": 0.5076358202743023, + "learning_rate": 2.5603306512393387e-06, + "loss": 0.0165, + "step": 1936 + }, + { + "epoch": 0.6728030566168809, + "grad_norm": 0.4803777245917521, + "learning_rate": 2.555421316921203e-06, + "loss": 0.0161, + "step": 1937 + }, + { + "epoch": 0.6731503994442515, + "grad_norm": 1.7249480773077772, + "learning_rate": 2.5505150776902877e-06, + "loss": 0.0299, + "step": 1938 + }, + { + "epoch": 0.6734977422716221, + "grad_norm": 0.5695623395281586, + "learning_rate": 2.5456119397583923e-06, + "loss": 0.0226, + "step": 1939 + }, + { + "epoch": 0.6738450850989927, + "grad_norm": 0.6090762926972669, + "learning_rate": 2.540711909333394e-06, + "loss": 0.0138, + "step": 1940 + }, + { + "epoch": 0.6741924279263634, + "grad_norm": 0.8830972943432503, + "learning_rate": 2.535814992619237e-06, + "loss": 0.024, + "step": 1941 + }, + { + "epoch": 0.674539770753734, + "grad_norm": 0.672673665772259, + "learning_rate": 2.5309211958159135e-06, + "loss": 0.0143, + "step": 1942 + }, + { + "epoch": 0.6748871135811045, + "grad_norm": 0.44731362978442346, + "learning_rate": 2.526030525119475e-06, + "loss": 0.0204, + "step": 1943 + }, + { + "epoch": 0.6752344564084751, + "grad_norm": 0.3413850862351891, + "learning_rate": 2.521142986722014e-06, + "loss": 0.0168, + "step": 1944 + }, + { + "epoch": 0.6755817992358458, + "grad_norm": 1.356486822592823, + "learning_rate": 2.516258586811653e-06, + "loss": 0.0181, + "step": 1945 + }, + { + "epoch": 0.6759291420632164, + "grad_norm": 0.5590617678902554, + "learning_rate": 2.5113773315725407e-06, + "loss": 0.0167, + "step": 1946 + }, + { + "epoch": 0.676276484890587, + "grad_norm": 1.1480811352959883, + "learning_rate": 2.5064992271848504e-06, + "loss": 0.014, + "step": 1947 + }, + { + "epoch": 0.6766238277179576, + "grad_norm": 1.1960798962121544, + "learning_rate": 2.5016242798247623e-06, + "loss": 0.0217, + "step": 1948 + }, + { + "epoch": 0.6769711705453283, + "grad_norm": 1.4243039228724932, + "learning_rate": 2.496752495664457e-06, + "loss": 0.0233, + "step": 1949 + }, + { + "epoch": 0.6773185133726989, + "grad_norm": 1.0255082402145865, + "learning_rate": 2.491883880872115e-06, + "loss": 0.0272, + "step": 1950 + }, + { + "epoch": 0.6776658562000695, + "grad_norm": 0.7762367778334762, + "learning_rate": 2.487018441611899e-06, + "loss": 0.019, + "step": 1951 + }, + { + "epoch": 0.6780131990274401, + "grad_norm": 0.6073071568906027, + "learning_rate": 2.482156184043958e-06, + "loss": 0.0259, + "step": 1952 + }, + { + "epoch": 0.6783605418548107, + "grad_norm": 0.7255049359634366, + "learning_rate": 2.4772971143244033e-06, + "loss": 0.0257, + "step": 1953 + }, + { + "epoch": 0.6787078846821814, + "grad_norm": 0.8117217423490592, + "learning_rate": 2.4724412386053208e-06, + "loss": 0.0133, + "step": 1954 + }, + { + "epoch": 0.6790552275095519, + "grad_norm": 0.43153608387126113, + "learning_rate": 2.4675885630347423e-06, + "loss": 0.0144, + "step": 1955 + }, + { + "epoch": 0.6794025703369225, + "grad_norm": 0.31393892010490915, + "learning_rate": 2.4627390937566566e-06, + "loss": 0.0122, + "step": 1956 + }, + { + "epoch": 0.6797499131642931, + "grad_norm": 0.49140451557972653, + "learning_rate": 2.457892836910985e-06, + "loss": 0.0192, + "step": 1957 + }, + { + "epoch": 0.6800972559916638, + "grad_norm": 0.32574396229410224, + "learning_rate": 2.4530497986335888e-06, + "loss": 0.0111, + "step": 1958 + }, + { + "epoch": 0.6804445988190344, + "grad_norm": 0.5568119405077808, + "learning_rate": 2.4482099850562496e-06, + "loss": 0.0199, + "step": 1959 + }, + { + "epoch": 0.680791941646405, + "grad_norm": 1.3668016268563767, + "learning_rate": 2.4433734023066662e-06, + "loss": 0.018, + "step": 1960 + }, + { + "epoch": 0.6811392844737756, + "grad_norm": 0.375357049604211, + "learning_rate": 2.438540056508449e-06, + "loss": 0.009, + "step": 1961 + }, + { + "epoch": 0.6814866273011463, + "grad_norm": 0.7359559626951213, + "learning_rate": 2.4337099537811114e-06, + "loss": 0.0124, + "step": 1962 + }, + { + "epoch": 0.6818339701285169, + "grad_norm": 0.6182352327681956, + "learning_rate": 2.4288831002400574e-06, + "loss": 0.0159, + "step": 1963 + }, + { + "epoch": 0.6821813129558875, + "grad_norm": 0.9957988289105886, + "learning_rate": 2.4240595019965755e-06, + "loss": 0.0226, + "step": 1964 + }, + { + "epoch": 0.682528655783258, + "grad_norm": 0.5234908070793383, + "learning_rate": 2.4192391651578384e-06, + "loss": 0.0176, + "step": 1965 + }, + { + "epoch": 0.6828759986106286, + "grad_norm": 0.8440220877656187, + "learning_rate": 2.4144220958268883e-06, + "loss": 0.0202, + "step": 1966 + }, + { + "epoch": 0.6832233414379993, + "grad_norm": 0.5472941032734703, + "learning_rate": 2.409608300102627e-06, + "loss": 0.0197, + "step": 1967 + }, + { + "epoch": 0.6835706842653699, + "grad_norm": 0.37475787591312015, + "learning_rate": 2.404797784079811e-06, + "loss": 0.0159, + "step": 1968 + }, + { + "epoch": 0.6839180270927405, + "grad_norm": 0.41355656518558104, + "learning_rate": 2.3999905538490487e-06, + "loss": 0.0069, + "step": 1969 + }, + { + "epoch": 0.6842653699201111, + "grad_norm": 1.433554613823539, + "learning_rate": 2.395186615496789e-06, + "loss": 0.0268, + "step": 1970 + }, + { + "epoch": 0.6846127127474818, + "grad_norm": 1.1620950677445072, + "learning_rate": 2.390385975105308e-06, + "loss": 0.0218, + "step": 1971 + }, + { + "epoch": 0.6849600555748524, + "grad_norm": 0.37254931419419035, + "learning_rate": 2.3855886387527062e-06, + "loss": 0.0112, + "step": 1972 + }, + { + "epoch": 0.685307398402223, + "grad_norm": 0.4943387705379173, + "learning_rate": 2.3807946125129056e-06, + "loss": 0.0205, + "step": 1973 + }, + { + "epoch": 0.6856547412295936, + "grad_norm": 0.7882856389300709, + "learning_rate": 2.3760039024556387e-06, + "loss": 0.0198, + "step": 1974 + }, + { + "epoch": 0.6860020840569643, + "grad_norm": 1.2217971154250373, + "learning_rate": 2.371216514646428e-06, + "loss": 0.0135, + "step": 1975 + }, + { + "epoch": 0.6863494268843349, + "grad_norm": 0.6619688439348922, + "learning_rate": 2.3664324551466007e-06, + "loss": 0.0163, + "step": 1976 + }, + { + "epoch": 0.6866967697117055, + "grad_norm": 0.4763487225723533, + "learning_rate": 2.361651730013269e-06, + "loss": 0.0148, + "step": 1977 + }, + { + "epoch": 0.687044112539076, + "grad_norm": 0.38275391078177107, + "learning_rate": 2.356874345299319e-06, + "loss": 0.013, + "step": 1978 + }, + { + "epoch": 0.6873914553664466, + "grad_norm": 0.7080298751661178, + "learning_rate": 2.3521003070534065e-06, + "loss": 0.0205, + "step": 1979 + }, + { + "epoch": 0.6877387981938173, + "grad_norm": 0.9866082614707886, + "learning_rate": 2.347329621319957e-06, + "loss": 0.0203, + "step": 1980 + }, + { + "epoch": 0.6880861410211879, + "grad_norm": 0.5240609641902144, + "learning_rate": 2.3425622941391485e-06, + "loss": 0.0144, + "step": 1981 + }, + { + "epoch": 0.6884334838485585, + "grad_norm": 0.5375077106046096, + "learning_rate": 2.3377983315469045e-06, + "loss": 0.0151, + "step": 1982 + }, + { + "epoch": 0.6887808266759291, + "grad_norm": 0.9763971466094741, + "learning_rate": 2.3330377395748878e-06, + "loss": 0.0168, + "step": 1983 + }, + { + "epoch": 0.6891281695032998, + "grad_norm": 0.6777867916323312, + "learning_rate": 2.328280524250498e-06, + "loss": 0.0196, + "step": 1984 + }, + { + "epoch": 0.6894755123306704, + "grad_norm": 0.6717483439928033, + "learning_rate": 2.3235266915968586e-06, + "loss": 0.0297, + "step": 1985 + }, + { + "epoch": 0.689822855158041, + "grad_norm": 0.659428567653895, + "learning_rate": 2.3187762476328086e-06, + "loss": 0.0154, + "step": 1986 + }, + { + "epoch": 0.6901701979854116, + "grad_norm": 0.48768769720756106, + "learning_rate": 2.3140291983728936e-06, + "loss": 0.0175, + "step": 1987 + }, + { + "epoch": 0.6905175408127823, + "grad_norm": 0.6148456012864478, + "learning_rate": 2.3092855498273674e-06, + "loss": 0.0177, + "step": 1988 + }, + { + "epoch": 0.6908648836401529, + "grad_norm": 0.5297818902205734, + "learning_rate": 2.3045453080021775e-06, + "loss": 0.0134, + "step": 1989 + }, + { + "epoch": 0.6912122264675234, + "grad_norm": 0.6794761025303996, + "learning_rate": 2.2998084788989514e-06, + "loss": 0.0148, + "step": 1990 + }, + { + "epoch": 0.691559569294894, + "grad_norm": 0.4063679508309279, + "learning_rate": 2.2950750685150045e-06, + "loss": 0.0087, + "step": 1991 + }, + { + "epoch": 0.6919069121222646, + "grad_norm": 0.6095392803855911, + "learning_rate": 2.290345082843318e-06, + "loss": 0.0164, + "step": 1992 + }, + { + "epoch": 0.6922542549496353, + "grad_norm": 0.7441531444252605, + "learning_rate": 2.285618527872537e-06, + "loss": 0.0269, + "step": 1993 + }, + { + "epoch": 0.6926015977770059, + "grad_norm": 0.5717566310228904, + "learning_rate": 2.2808954095869653e-06, + "loss": 0.0131, + "step": 1994 + }, + { + "epoch": 0.6929489406043765, + "grad_norm": 0.6596567441963315, + "learning_rate": 2.2761757339665576e-06, + "loss": 0.0204, + "step": 1995 + }, + { + "epoch": 0.6932962834317471, + "grad_norm": 0.608634510397694, + "learning_rate": 2.2714595069869044e-06, + "loss": 0.0201, + "step": 1996 + }, + { + "epoch": 0.6936436262591178, + "grad_norm": 0.8865789858612154, + "learning_rate": 2.2667467346192325e-06, + "loss": 0.012, + "step": 1997 + }, + { + "epoch": 0.6939909690864884, + "grad_norm": 0.7362361397592723, + "learning_rate": 2.2620374228303944e-06, + "loss": 0.0193, + "step": 1998 + }, + { + "epoch": 0.694338311913859, + "grad_norm": 0.42850603654943825, + "learning_rate": 2.2573315775828655e-06, + "loss": 0.016, + "step": 1999 + }, + { + "epoch": 0.6946856547412296, + "grad_norm": 0.46854751088934754, + "learning_rate": 2.2526292048347246e-06, + "loss": 0.0184, + "step": 2000 + }, + { + "epoch": 0.6950329975686003, + "grad_norm": 0.6941328111513891, + "learning_rate": 2.2479303105396576e-06, + "loss": 0.0252, + "step": 2001 + }, + { + "epoch": 0.6953803403959709, + "grad_norm": 0.5761073352376276, + "learning_rate": 2.2432349006469468e-06, + "loss": 0.0282, + "step": 2002 + }, + { + "epoch": 0.6957276832233414, + "grad_norm": 0.5163738104718749, + "learning_rate": 2.2385429811014654e-06, + "loss": 0.0175, + "step": 2003 + }, + { + "epoch": 0.696075026050712, + "grad_norm": 0.5317310005945695, + "learning_rate": 2.2338545578436623e-06, + "loss": 0.0251, + "step": 2004 + }, + { + "epoch": 0.6964223688780826, + "grad_norm": 0.407307788912211, + "learning_rate": 2.2291696368095595e-06, + "loss": 0.0191, + "step": 2005 + }, + { + "epoch": 0.6967697117054533, + "grad_norm": 0.9234682659843976, + "learning_rate": 2.2244882239307497e-06, + "loss": 0.0277, + "step": 2006 + }, + { + "epoch": 0.6971170545328239, + "grad_norm": 0.4498944617872236, + "learning_rate": 2.2198103251343856e-06, + "loss": 0.0153, + "step": 2007 + }, + { + "epoch": 0.6974643973601945, + "grad_norm": 0.3445137017847305, + "learning_rate": 2.215135946343159e-06, + "loss": 0.0151, + "step": 2008 + }, + { + "epoch": 0.6978117401875651, + "grad_norm": 0.4550738434401644, + "learning_rate": 2.2104650934753157e-06, + "loss": 0.0198, + "step": 2009 + }, + { + "epoch": 0.6981590830149358, + "grad_norm": 0.4009917053292384, + "learning_rate": 2.2057977724446365e-06, + "loss": 0.0181, + "step": 2010 + }, + { + "epoch": 0.6985064258423064, + "grad_norm": 0.7002896810570791, + "learning_rate": 2.201133989160427e-06, + "loss": 0.0177, + "step": 2011 + }, + { + "epoch": 0.698853768669677, + "grad_norm": 0.6950029004017243, + "learning_rate": 2.1964737495275122e-06, + "loss": 0.0182, + "step": 2012 + }, + { + "epoch": 0.6992011114970476, + "grad_norm": 0.4199036421278682, + "learning_rate": 2.191817059446236e-06, + "loss": 0.0165, + "step": 2013 + }, + { + "epoch": 0.6995484543244183, + "grad_norm": 0.5045888082776578, + "learning_rate": 2.1871639248124465e-06, + "loss": 0.018, + "step": 2014 + }, + { + "epoch": 0.6998957971517888, + "grad_norm": 0.6958614459354281, + "learning_rate": 2.182514351517488e-06, + "loss": 0.0176, + "step": 2015 + }, + { + "epoch": 0.7002431399791594, + "grad_norm": 0.46312645884075154, + "learning_rate": 2.1778683454481946e-06, + "loss": 0.0202, + "step": 2016 + }, + { + "epoch": 0.70059048280653, + "grad_norm": 0.4799742697736775, + "learning_rate": 2.1732259124868883e-06, + "loss": 0.015, + "step": 2017 + }, + { + "epoch": 0.7009378256339006, + "grad_norm": 0.33079209530907566, + "learning_rate": 2.1685870585113666e-06, + "loss": 0.0107, + "step": 2018 + }, + { + "epoch": 0.7012851684612713, + "grad_norm": 0.5741815050233295, + "learning_rate": 2.1639517893948926e-06, + "loss": 0.0168, + "step": 2019 + }, + { + "epoch": 0.7016325112886419, + "grad_norm": 0.8688246670359403, + "learning_rate": 2.1593201110061906e-06, + "loss": 0.0179, + "step": 2020 + }, + { + "epoch": 0.7019798541160125, + "grad_norm": 0.47477951851972594, + "learning_rate": 2.154692029209442e-06, + "loss": 0.0131, + "step": 2021 + }, + { + "epoch": 0.7023271969433831, + "grad_norm": 1.249034719322415, + "learning_rate": 2.1500675498642746e-06, + "loss": 0.0237, + "step": 2022 + }, + { + "epoch": 0.7026745397707538, + "grad_norm": 1.4687157493254421, + "learning_rate": 2.145446678825751e-06, + "loss": 0.0217, + "step": 2023 + }, + { + "epoch": 0.7030218825981244, + "grad_norm": 0.38679253812681325, + "learning_rate": 2.140829421944367e-06, + "loss": 0.0181, + "step": 2024 + }, + { + "epoch": 0.703369225425495, + "grad_norm": 0.4319877902351082, + "learning_rate": 2.136215785066046e-06, + "loss": 0.0077, + "step": 2025 + }, + { + "epoch": 0.7037165682528655, + "grad_norm": 1.2606438198673813, + "learning_rate": 2.1316057740321212e-06, + "loss": 0.0298, + "step": 2026 + }, + { + "epoch": 0.7040639110802361, + "grad_norm": 0.7241622040775874, + "learning_rate": 2.1269993946793414e-06, + "loss": 0.0127, + "step": 2027 + }, + { + "epoch": 0.7044112539076068, + "grad_norm": 0.7645077476228309, + "learning_rate": 2.1223966528398577e-06, + "loss": 0.0249, + "step": 2028 + }, + { + "epoch": 0.7047585967349774, + "grad_norm": 0.6348186548588091, + "learning_rate": 2.11779755434121e-06, + "loss": 0.0142, + "step": 2029 + }, + { + "epoch": 0.705105939562348, + "grad_norm": 0.5434630709530021, + "learning_rate": 2.113202105006327e-06, + "loss": 0.0148, + "step": 2030 + }, + { + "epoch": 0.7054532823897186, + "grad_norm": 0.7784838597282087, + "learning_rate": 2.1086103106535214e-06, + "loss": 0.0197, + "step": 2031 + }, + { + "epoch": 0.7058006252170893, + "grad_norm": 1.0412089267934344, + "learning_rate": 2.104022177096477e-06, + "loss": 0.0207, + "step": 2032 + }, + { + "epoch": 0.7061479680444599, + "grad_norm": 0.5527143676070977, + "learning_rate": 2.0994377101442387e-06, + "loss": 0.0183, + "step": 2033 + }, + { + "epoch": 0.7064953108718305, + "grad_norm": 1.1085673643586902, + "learning_rate": 2.09485691560121e-06, + "loss": 0.0153, + "step": 2034 + }, + { + "epoch": 0.7068426536992011, + "grad_norm": 0.3690027590773899, + "learning_rate": 2.0902797992671485e-06, + "loss": 0.009, + "step": 2035 + }, + { + "epoch": 0.7071899965265718, + "grad_norm": 0.4620521220600708, + "learning_rate": 2.0857063669371545e-06, + "loss": 0.0117, + "step": 2036 + }, + { + "epoch": 0.7075373393539424, + "grad_norm": 0.7530837284547335, + "learning_rate": 2.081136624401661e-06, + "loss": 0.0109, + "step": 2037 + }, + { + "epoch": 0.707884682181313, + "grad_norm": 1.090635412666822, + "learning_rate": 2.076570577446428e-06, + "loss": 0.0197, + "step": 2038 + }, + { + "epoch": 0.7082320250086835, + "grad_norm": 0.5421154350298361, + "learning_rate": 2.0720082318525405e-06, + "loss": 0.021, + "step": 2039 + }, + { + "epoch": 0.7085793678360541, + "grad_norm": 0.675053408269602, + "learning_rate": 2.0674495933963997e-06, + "loss": 0.021, + "step": 2040 + }, + { + "epoch": 0.7089267106634248, + "grad_norm": 0.6491462598285759, + "learning_rate": 2.062894667849702e-06, + "loss": 0.0332, + "step": 2041 + }, + { + "epoch": 0.7092740534907954, + "grad_norm": 0.4073609530744494, + "learning_rate": 2.058343460979454e-06, + "loss": 0.0158, + "step": 2042 + }, + { + "epoch": 0.709621396318166, + "grad_norm": 0.6131847890148999, + "learning_rate": 2.0537959785479517e-06, + "loss": 0.0179, + "step": 2043 + }, + { + "epoch": 0.7099687391455366, + "grad_norm": 0.7676568413480472, + "learning_rate": 2.049252226312772e-06, + "loss": 0.0231, + "step": 2044 + }, + { + "epoch": 0.7103160819729073, + "grad_norm": 0.6639394977446624, + "learning_rate": 2.04471221002677e-06, + "loss": 0.0255, + "step": 2045 + }, + { + "epoch": 0.7106634248002779, + "grad_norm": 0.6596012575548285, + "learning_rate": 2.0401759354380728e-06, + "loss": 0.0197, + "step": 2046 + }, + { + "epoch": 0.7110107676276485, + "grad_norm": 0.7895955447252033, + "learning_rate": 2.035643408290071e-06, + "loss": 0.0258, + "step": 2047 + }, + { + "epoch": 0.7113581104550191, + "grad_norm": 0.6477495390019533, + "learning_rate": 2.0311146343214073e-06, + "loss": 0.0102, + "step": 2048 + }, + { + "epoch": 0.7117054532823898, + "grad_norm": 1.064483048484202, + "learning_rate": 2.0265896192659717e-06, + "loss": 0.0227, + "step": 2049 + }, + { + "epoch": 0.7120527961097604, + "grad_norm": 0.6008333641221011, + "learning_rate": 2.0220683688528988e-06, + "loss": 0.0128, + "step": 2050 + }, + { + "epoch": 0.712400138937131, + "grad_norm": 0.7586223473422381, + "learning_rate": 2.0175508888065563e-06, + "loss": 0.0207, + "step": 2051 + }, + { + "epoch": 0.7127474817645015, + "grad_norm": 0.6734658920557806, + "learning_rate": 2.013037184846537e-06, + "loss": 0.0146, + "step": 2052 + }, + { + "epoch": 0.7130948245918721, + "grad_norm": 0.43786239417939005, + "learning_rate": 2.0085272626876496e-06, + "loss": 0.025, + "step": 2053 + }, + { + "epoch": 0.7134421674192428, + "grad_norm": 0.43477244650995783, + "learning_rate": 2.00402112803992e-06, + "loss": 0.0224, + "step": 2054 + }, + { + "epoch": 0.7137895102466134, + "grad_norm": 0.738487309514483, + "learning_rate": 1.9995187866085786e-06, + "loss": 0.0203, + "step": 2055 + }, + { + "epoch": 0.714136853073984, + "grad_norm": 0.370434091066062, + "learning_rate": 1.9950202440940496e-06, + "loss": 0.0159, + "step": 2056 + }, + { + "epoch": 0.7144841959013546, + "grad_norm": 0.6924851015465208, + "learning_rate": 1.9905255061919464e-06, + "loss": 0.0131, + "step": 2057 + }, + { + "epoch": 0.7148315387287253, + "grad_norm": 0.5673188746010913, + "learning_rate": 1.9860345785930726e-06, + "loss": 0.0172, + "step": 2058 + }, + { + "epoch": 0.7151788815560959, + "grad_norm": 0.3438440626602846, + "learning_rate": 1.9815474669833985e-06, + "loss": 0.0145, + "step": 2059 + }, + { + "epoch": 0.7155262243834665, + "grad_norm": 0.45539417984892605, + "learning_rate": 1.977064177044071e-06, + "loss": 0.0183, + "step": 2060 + }, + { + "epoch": 0.7158735672108371, + "grad_norm": 1.484472402316562, + "learning_rate": 1.972584714451392e-06, + "loss": 0.0221, + "step": 2061 + }, + { + "epoch": 0.7162209100382078, + "grad_norm": 0.5404845318844554, + "learning_rate": 1.9681090848768237e-06, + "loss": 0.0195, + "step": 2062 + }, + { + "epoch": 0.7165682528655783, + "grad_norm": 0.47909379000212127, + "learning_rate": 1.9636372939869677e-06, + "loss": 0.0169, + "step": 2063 + }, + { + "epoch": 0.7169155956929489, + "grad_norm": 0.5622427498011785, + "learning_rate": 1.9591693474435735e-06, + "loss": 0.0304, + "step": 2064 + }, + { + "epoch": 0.7172629385203195, + "grad_norm": 0.7467315115677308, + "learning_rate": 1.9547052509035164e-06, + "loss": 0.0165, + "step": 2065 + }, + { + "epoch": 0.7176102813476901, + "grad_norm": 0.5056808774501013, + "learning_rate": 1.9502450100188037e-06, + "loss": 0.0215, + "step": 2066 + }, + { + "epoch": 0.7179576241750608, + "grad_norm": 1.4391119548200895, + "learning_rate": 1.9457886304365533e-06, + "loss": 0.0231, + "step": 2067 + }, + { + "epoch": 0.7183049670024314, + "grad_norm": 0.6476533862493838, + "learning_rate": 1.9413361177990015e-06, + "loss": 0.0241, + "step": 2068 + }, + { + "epoch": 0.718652309829802, + "grad_norm": 0.530719837422982, + "learning_rate": 1.9368874777434864e-06, + "loss": 0.0199, + "step": 2069 + }, + { + "epoch": 0.7189996526571726, + "grad_norm": 0.519016068544451, + "learning_rate": 1.932442715902441e-06, + "loss": 0.0201, + "step": 2070 + }, + { + "epoch": 0.7193469954845433, + "grad_norm": 0.704754836241455, + "learning_rate": 1.9280018379033884e-06, + "loss": 0.0292, + "step": 2071 + }, + { + "epoch": 0.7196943383119139, + "grad_norm": 0.5687572422985191, + "learning_rate": 1.923564849368936e-06, + "loss": 0.0214, + "step": 2072 + }, + { + "epoch": 0.7200416811392845, + "grad_norm": 0.3734820838812652, + "learning_rate": 1.919131755916771e-06, + "loss": 0.0187, + "step": 2073 + }, + { + "epoch": 0.720389023966655, + "grad_norm": 0.3791262946433928, + "learning_rate": 1.9147025631596362e-06, + "loss": 0.02, + "step": 2074 + }, + { + "epoch": 0.7207363667940258, + "grad_norm": 0.4220291272178966, + "learning_rate": 1.9102772767053467e-06, + "loss": 0.0249, + "step": 2075 + }, + { + "epoch": 0.7210837096213963, + "grad_norm": 0.5391949275021815, + "learning_rate": 1.9058559021567718e-06, + "loss": 0.0192, + "step": 2076 + }, + { + "epoch": 0.7214310524487669, + "grad_norm": 1.4908074892631773, + "learning_rate": 1.9014384451118229e-06, + "loss": 0.0206, + "step": 2077 + }, + { + "epoch": 0.7217783952761375, + "grad_norm": 0.711125711354139, + "learning_rate": 1.897024911163451e-06, + "loss": 0.0247, + "step": 2078 + }, + { + "epoch": 0.7221257381035081, + "grad_norm": 1.0280025482928667, + "learning_rate": 1.892615305899645e-06, + "loss": 0.027, + "step": 2079 + }, + { + "epoch": 0.7224730809308788, + "grad_norm": 0.6833450502355668, + "learning_rate": 1.8882096349034184e-06, + "loss": 0.0264, + "step": 2080 + }, + { + "epoch": 0.7228204237582494, + "grad_norm": 0.7449238137674074, + "learning_rate": 1.8838079037528012e-06, + "loss": 0.0226, + "step": 2081 + }, + { + "epoch": 0.72316776658562, + "grad_norm": 0.745751826998084, + "learning_rate": 1.879410118020834e-06, + "loss": 0.0306, + "step": 2082 + }, + { + "epoch": 0.7235151094129906, + "grad_norm": 0.3996052258819095, + "learning_rate": 1.8750162832755669e-06, + "loss": 0.0172, + "step": 2083 + }, + { + "epoch": 0.7238624522403613, + "grad_norm": 0.9447949016982862, + "learning_rate": 1.870626405080046e-06, + "loss": 0.02, + "step": 2084 + }, + { + "epoch": 0.7242097950677319, + "grad_norm": 0.7392285624719003, + "learning_rate": 1.8662404889923058e-06, + "loss": 0.0147, + "step": 2085 + }, + { + "epoch": 0.7245571378951025, + "grad_norm": 0.506159285212194, + "learning_rate": 1.8618585405653639e-06, + "loss": 0.0188, + "step": 2086 + }, + { + "epoch": 0.724904480722473, + "grad_norm": 0.5116325753968864, + "learning_rate": 1.8574805653472178e-06, + "loss": 0.0143, + "step": 2087 + }, + { + "epoch": 0.7252518235498437, + "grad_norm": 0.7618959377624293, + "learning_rate": 1.8531065688808346e-06, + "loss": 0.0205, + "step": 2088 + }, + { + "epoch": 0.7255991663772143, + "grad_norm": 0.6293199274360505, + "learning_rate": 1.848736556704141e-06, + "loss": 0.0122, + "step": 2089 + }, + { + "epoch": 0.7259465092045849, + "grad_norm": 0.9005126434840245, + "learning_rate": 1.8443705343500185e-06, + "loss": 0.0146, + "step": 2090 + }, + { + "epoch": 0.7262938520319555, + "grad_norm": 1.1648440673296412, + "learning_rate": 1.840008507346302e-06, + "loss": 0.0241, + "step": 2091 + }, + { + "epoch": 0.7266411948593261, + "grad_norm": 0.5855114659510036, + "learning_rate": 1.8356504812157623e-06, + "loss": 0.0207, + "step": 2092 + }, + { + "epoch": 0.7269885376866968, + "grad_norm": 0.5068661065981903, + "learning_rate": 1.831296461476109e-06, + "loss": 0.0146, + "step": 2093 + }, + { + "epoch": 0.7273358805140674, + "grad_norm": 0.5837482870866362, + "learning_rate": 1.826946453639976e-06, + "loss": 0.0163, + "step": 2094 + }, + { + "epoch": 0.727683223341438, + "grad_norm": 0.6908638053788853, + "learning_rate": 1.822600463214922e-06, + "loss": 0.0189, + "step": 2095 + }, + { + "epoch": 0.7280305661688086, + "grad_norm": 0.6943496634239659, + "learning_rate": 1.818258495703412e-06, + "loss": 0.0207, + "step": 2096 + }, + { + "epoch": 0.7283779089961793, + "grad_norm": 0.8592869838174363, + "learning_rate": 1.813920556602826e-06, + "loss": 0.0214, + "step": 2097 + }, + { + "epoch": 0.7287252518235499, + "grad_norm": 0.9767864211568215, + "learning_rate": 1.8095866514054372e-06, + "loss": 0.0151, + "step": 2098 + }, + { + "epoch": 0.7290725946509204, + "grad_norm": 0.24915866573453857, + "learning_rate": 1.805256785598416e-06, + "loss": 0.0104, + "step": 2099 + }, + { + "epoch": 0.729419937478291, + "grad_norm": 0.34781878859312965, + "learning_rate": 1.8009309646638128e-06, + "loss": 0.0116, + "step": 2100 + }, + { + "epoch": 0.7297672803056617, + "grad_norm": 0.6277095397756866, + "learning_rate": 1.7966091940785653e-06, + "loss": 0.016, + "step": 2101 + }, + { + "epoch": 0.7301146231330323, + "grad_norm": 0.7019890305202732, + "learning_rate": 1.792291479314473e-06, + "loss": 0.0188, + "step": 2102 + }, + { + "epoch": 0.7304619659604029, + "grad_norm": 0.4558680260256953, + "learning_rate": 1.7879778258382103e-06, + "loss": 0.0153, + "step": 2103 + }, + { + "epoch": 0.7308093087877735, + "grad_norm": 0.5209152944828331, + "learning_rate": 1.7836682391113002e-06, + "loss": 0.0205, + "step": 2104 + }, + { + "epoch": 0.7311566516151441, + "grad_norm": 0.6854168427153546, + "learning_rate": 1.7793627245901236e-06, + "loss": 0.0175, + "step": 2105 + }, + { + "epoch": 0.7315039944425148, + "grad_norm": 0.594503372287282, + "learning_rate": 1.775061287725906e-06, + "loss": 0.0228, + "step": 2106 + }, + { + "epoch": 0.7318513372698854, + "grad_norm": 1.3778446243347249, + "learning_rate": 1.7707639339647015e-06, + "loss": 0.0168, + "step": 2107 + }, + { + "epoch": 0.732198680097256, + "grad_norm": 0.5276235706829822, + "learning_rate": 1.766470668747403e-06, + "loss": 0.0175, + "step": 2108 + }, + { + "epoch": 0.7325460229246266, + "grad_norm": 0.45943667910861274, + "learning_rate": 1.7621814975097274e-06, + "loss": 0.023, + "step": 2109 + }, + { + "epoch": 0.7328933657519973, + "grad_norm": 0.3538644823362223, + "learning_rate": 1.7578964256822018e-06, + "loss": 0.0146, + "step": 2110 + }, + { + "epoch": 0.7332407085793679, + "grad_norm": 0.7312777836477884, + "learning_rate": 1.753615458690166e-06, + "loss": 0.0172, + "step": 2111 + }, + { + "epoch": 0.7335880514067384, + "grad_norm": 0.9654485421097003, + "learning_rate": 1.7493386019537645e-06, + "loss": 0.0279, + "step": 2112 + }, + { + "epoch": 0.733935394234109, + "grad_norm": 0.4641407414319063, + "learning_rate": 1.7450658608879384e-06, + "loss": 0.0216, + "step": 2113 + }, + { + "epoch": 0.7342827370614797, + "grad_norm": 0.6553590581631387, + "learning_rate": 1.7407972409024133e-06, + "loss": 0.0106, + "step": 2114 + }, + { + "epoch": 0.7346300798888503, + "grad_norm": 0.7171981611687974, + "learning_rate": 1.7365327474016979e-06, + "loss": 0.0132, + "step": 2115 + }, + { + "epoch": 0.7349774227162209, + "grad_norm": 0.6691806372000761, + "learning_rate": 1.7322723857850816e-06, + "loss": 0.0125, + "step": 2116 + }, + { + "epoch": 0.7353247655435915, + "grad_norm": 0.8491625521965726, + "learning_rate": 1.7280161614466185e-06, + "loss": 0.0237, + "step": 2117 + }, + { + "epoch": 0.7356721083709621, + "grad_norm": 0.7559623864731797, + "learning_rate": 1.7237640797751249e-06, + "loss": 0.0169, + "step": 2118 + }, + { + "epoch": 0.7360194511983328, + "grad_norm": 0.9441548462460033, + "learning_rate": 1.7195161461541692e-06, + "loss": 0.0278, + "step": 2119 + }, + { + "epoch": 0.7363667940257034, + "grad_norm": 0.5416094909605166, + "learning_rate": 1.7152723659620735e-06, + "loss": 0.0143, + "step": 2120 + }, + { + "epoch": 0.736714136853074, + "grad_norm": 0.5132478425979125, + "learning_rate": 1.7110327445718995e-06, + "loss": 0.0145, + "step": 2121 + }, + { + "epoch": 0.7370614796804446, + "grad_norm": 0.9110313850628509, + "learning_rate": 1.706797287351441e-06, + "loss": 0.0255, + "step": 2122 + }, + { + "epoch": 0.7374088225078153, + "grad_norm": 0.9146610099417635, + "learning_rate": 1.7025659996632198e-06, + "loss": 0.0152, + "step": 2123 + }, + { + "epoch": 0.7377561653351858, + "grad_norm": 0.6272856149050274, + "learning_rate": 1.6983388868644834e-06, + "loss": 0.0161, + "step": 2124 + }, + { + "epoch": 0.7381035081625564, + "grad_norm": 0.4677956955658168, + "learning_rate": 1.6941159543071855e-06, + "loss": 0.0159, + "step": 2125 + }, + { + "epoch": 0.738450850989927, + "grad_norm": 0.4336551332394349, + "learning_rate": 1.689897207337996e-06, + "loss": 0.0199, + "step": 2126 + }, + { + "epoch": 0.7387981938172977, + "grad_norm": 0.5590648865175777, + "learning_rate": 1.6856826512982772e-06, + "loss": 0.0224, + "step": 2127 + }, + { + "epoch": 0.7391455366446683, + "grad_norm": 0.6332517012226541, + "learning_rate": 1.6814722915240922e-06, + "loss": 0.0177, + "step": 2128 + }, + { + "epoch": 0.7394928794720389, + "grad_norm": 0.5184386784352486, + "learning_rate": 1.6772661333461858e-06, + "loss": 0.0118, + "step": 2129 + }, + { + "epoch": 0.7398402222994095, + "grad_norm": 0.8189234197423481, + "learning_rate": 1.673064182089988e-06, + "loss": 0.0147, + "step": 2130 + }, + { + "epoch": 0.7401875651267801, + "grad_norm": 0.6863762658701025, + "learning_rate": 1.6688664430755964e-06, + "loss": 0.0227, + "step": 2131 + }, + { + "epoch": 0.7405349079541508, + "grad_norm": 0.5772056540395054, + "learning_rate": 1.6646729216177827e-06, + "loss": 0.0181, + "step": 2132 + }, + { + "epoch": 0.7408822507815214, + "grad_norm": 0.5758071456756751, + "learning_rate": 1.6604836230259713e-06, + "loss": 0.0143, + "step": 2133 + }, + { + "epoch": 0.741229593608892, + "grad_norm": 0.5432719143241201, + "learning_rate": 1.6562985526042474e-06, + "loss": 0.0192, + "step": 2134 + }, + { + "epoch": 0.7415769364362625, + "grad_norm": 0.4669709741954846, + "learning_rate": 1.6521177156513351e-06, + "loss": 0.0186, + "step": 2135 + }, + { + "epoch": 0.7419242792636332, + "grad_norm": 0.6288408754669941, + "learning_rate": 1.6479411174606069e-06, + "loss": 0.0164, + "step": 2136 + }, + { + "epoch": 0.7422716220910038, + "grad_norm": 0.5089242192772048, + "learning_rate": 1.6437687633200604e-06, + "loss": 0.0207, + "step": 2137 + }, + { + "epoch": 0.7426189649183744, + "grad_norm": 0.6965392337483026, + "learning_rate": 1.639600658512327e-06, + "loss": 0.0147, + "step": 2138 + }, + { + "epoch": 0.742966307745745, + "grad_norm": 0.2523207249854437, + "learning_rate": 1.6354368083146532e-06, + "loss": 0.0099, + "step": 2139 + }, + { + "epoch": 0.7433136505731157, + "grad_norm": 0.9510970817995196, + "learning_rate": 1.6312772179988983e-06, + "loss": 0.0257, + "step": 2140 + }, + { + "epoch": 0.7436609934004863, + "grad_norm": 0.7060524417767109, + "learning_rate": 1.6271218928315325e-06, + "loss": 0.0199, + "step": 2141 + }, + { + "epoch": 0.7440083362278569, + "grad_norm": 0.3884044654632346, + "learning_rate": 1.6229708380736237e-06, + "loss": 0.0153, + "step": 2142 + }, + { + "epoch": 0.7443556790552275, + "grad_norm": 0.5770229127268146, + "learning_rate": 1.6188240589808325e-06, + "loss": 0.0227, + "step": 2143 + }, + { + "epoch": 0.7447030218825981, + "grad_norm": 0.5435182445456208, + "learning_rate": 1.6146815608034033e-06, + "loss": 0.0142, + "step": 2144 + }, + { + "epoch": 0.7450503647099688, + "grad_norm": 0.40026444675826955, + "learning_rate": 1.6105433487861666e-06, + "loss": 0.0172, + "step": 2145 + }, + { + "epoch": 0.7453977075373394, + "grad_norm": 0.4362346680833222, + "learning_rate": 1.6064094281685239e-06, + "loss": 0.0113, + "step": 2146 + }, + { + "epoch": 0.74574505036471, + "grad_norm": 1.0318993306154467, + "learning_rate": 1.6022798041844407e-06, + "loss": 0.0174, + "step": 2147 + }, + { + "epoch": 0.7460923931920805, + "grad_norm": 0.5504252591968342, + "learning_rate": 1.598154482062443e-06, + "loss": 0.0142, + "step": 2148 + }, + { + "epoch": 0.7464397360194512, + "grad_norm": 0.5110175837775568, + "learning_rate": 1.594033467025613e-06, + "loss": 0.0114, + "step": 2149 + }, + { + "epoch": 0.7467870788468218, + "grad_norm": 0.9348497562735546, + "learning_rate": 1.5899167642915803e-06, + "loss": 0.0244, + "step": 2150 + }, + { + "epoch": 0.7471344216741924, + "grad_norm": 0.7812495568758424, + "learning_rate": 1.5858043790725096e-06, + "loss": 0.0246, + "step": 2151 + }, + { + "epoch": 0.747481764501563, + "grad_norm": 0.9116632509191444, + "learning_rate": 1.5816963165751026e-06, + "loss": 0.0135, + "step": 2152 + }, + { + "epoch": 0.7478291073289337, + "grad_norm": 0.5280524057084818, + "learning_rate": 1.5775925820005878e-06, + "loss": 0.0111, + "step": 2153 + }, + { + "epoch": 0.7481764501563043, + "grad_norm": 1.0020440686822643, + "learning_rate": 1.5734931805447151e-06, + "loss": 0.0312, + "step": 2154 + }, + { + "epoch": 0.7485237929836749, + "grad_norm": 0.5849497270719394, + "learning_rate": 1.5693981173977468e-06, + "loss": 0.0202, + "step": 2155 + }, + { + "epoch": 0.7488711358110455, + "grad_norm": 0.4259160011756788, + "learning_rate": 1.56530739774445e-06, + "loss": 0.0211, + "step": 2156 + }, + { + "epoch": 0.7492184786384161, + "grad_norm": 0.7638640166062615, + "learning_rate": 1.5612210267640987e-06, + "loss": 0.0113, + "step": 2157 + }, + { + "epoch": 0.7495658214657868, + "grad_norm": 0.43127591153772443, + "learning_rate": 1.5571390096304545e-06, + "loss": 0.0171, + "step": 2158 + }, + { + "epoch": 0.7499131642931574, + "grad_norm": 0.7943011759564156, + "learning_rate": 1.5530613515117721e-06, + "loss": 0.0187, + "step": 2159 + }, + { + "epoch": 0.7502605071205279, + "grad_norm": 1.4178672936850725, + "learning_rate": 1.5489880575707821e-06, + "loss": 0.0226, + "step": 2160 + }, + { + "epoch": 0.7506078499478985, + "grad_norm": 0.42032498283687775, + "learning_rate": 1.5449191329646951e-06, + "loss": 0.0152, + "step": 2161 + }, + { + "epoch": 0.7509551927752692, + "grad_norm": 0.4736721695112062, + "learning_rate": 1.5408545828451838e-06, + "loss": 0.0163, + "step": 2162 + }, + { + "epoch": 0.7513025356026398, + "grad_norm": 0.44347628963881275, + "learning_rate": 1.5367944123583884e-06, + "loss": 0.0177, + "step": 2163 + }, + { + "epoch": 0.7516498784300104, + "grad_norm": 0.4305512192092599, + "learning_rate": 1.5327386266448973e-06, + "loss": 0.0197, + "step": 2164 + }, + { + "epoch": 0.751997221257381, + "grad_norm": 0.304280894510165, + "learning_rate": 1.528687230839755e-06, + "loss": 0.0144, + "step": 2165 + }, + { + "epoch": 0.7523445640847517, + "grad_norm": 0.3806340910734509, + "learning_rate": 1.5246402300724406e-06, + "loss": 0.0186, + "step": 2166 + }, + { + "epoch": 0.7526919069121223, + "grad_norm": 0.49033443578842467, + "learning_rate": 1.5205976294668745e-06, + "loss": 0.0112, + "step": 2167 + }, + { + "epoch": 0.7530392497394929, + "grad_norm": 0.6297931905210379, + "learning_rate": 1.5165594341414014e-06, + "loss": 0.0139, + "step": 2168 + }, + { + "epoch": 0.7533865925668635, + "grad_norm": 0.3703644340810331, + "learning_rate": 1.5125256492087925e-06, + "loss": 0.0108, + "step": 2169 + }, + { + "epoch": 0.7537339353942341, + "grad_norm": 0.9633658417879537, + "learning_rate": 1.5084962797762303e-06, + "loss": 0.0187, + "step": 2170 + }, + { + "epoch": 0.7540812782216048, + "grad_norm": 0.3787687052461065, + "learning_rate": 1.5044713309453135e-06, + "loss": 0.0139, + "step": 2171 + }, + { + "epoch": 0.7544286210489753, + "grad_norm": 1.1842179559226889, + "learning_rate": 1.5004508078120378e-06, + "loss": 0.0211, + "step": 2172 + }, + { + "epoch": 0.7547759638763459, + "grad_norm": 0.617894991221659, + "learning_rate": 1.4964347154667959e-06, + "loss": 0.0235, + "step": 2173 + }, + { + "epoch": 0.7551233067037165, + "grad_norm": 1.2108002626639653, + "learning_rate": 1.4924230589943738e-06, + "loss": 0.025, + "step": 2174 + }, + { + "epoch": 0.7554706495310872, + "grad_norm": 0.5389764197428066, + "learning_rate": 1.488415843473942e-06, + "loss": 0.0222, + "step": 2175 + }, + { + "epoch": 0.7558179923584578, + "grad_norm": 0.5678337368948643, + "learning_rate": 1.4844130739790441e-06, + "loss": 0.0121, + "step": 2176 + }, + { + "epoch": 0.7561653351858284, + "grad_norm": 0.5096074221105189, + "learning_rate": 1.4804147555775955e-06, + "loss": 0.0162, + "step": 2177 + }, + { + "epoch": 0.756512678013199, + "grad_norm": 0.3945803267973934, + "learning_rate": 1.4764208933318786e-06, + "loss": 0.0148, + "step": 2178 + }, + { + "epoch": 0.7568600208405697, + "grad_norm": 0.7056464507505118, + "learning_rate": 1.472431492298534e-06, + "loss": 0.0277, + "step": 2179 + }, + { + "epoch": 0.7572073636679403, + "grad_norm": 0.4702850476473721, + "learning_rate": 1.4684465575285507e-06, + "loss": 0.0227, + "step": 2180 + }, + { + "epoch": 0.7575547064953109, + "grad_norm": 0.7774483178392437, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.0165, + "step": 2181 + }, + { + "epoch": 0.7579020493226815, + "grad_norm": 0.63743536927089, + "learning_rate": 1.4604901069543475e-06, + "loss": 0.0147, + "step": 2182 + }, + { + "epoch": 0.758249392150052, + "grad_norm": 1.2057859393359813, + "learning_rate": 1.4565186012238126e-06, + "loss": 0.02, + "step": 2183 + }, + { + "epoch": 0.7585967349774227, + "grad_norm": 0.6058695506497026, + "learning_rate": 1.452551581903991e-06, + "loss": 0.0171, + "step": 2184 + }, + { + "epoch": 0.7589440778047933, + "grad_norm": 1.0470651557089286, + "learning_rate": 1.4485890540175335e-06, + "loss": 0.0211, + "step": 2185 + }, + { + "epoch": 0.7592914206321639, + "grad_norm": 0.5419134679450275, + "learning_rate": 1.4446310225814087e-06, + "loss": 0.017, + "step": 2186 + }, + { + "epoch": 0.7596387634595345, + "grad_norm": 0.6607230911522339, + "learning_rate": 1.4406774926068912e-06, + "loss": 0.0097, + "step": 2187 + }, + { + "epoch": 0.7599861062869052, + "grad_norm": 0.3575619358712941, + "learning_rate": 1.4367284690995543e-06, + "loss": 0.0156, + "step": 2188 + }, + { + "epoch": 0.7603334491142758, + "grad_norm": 0.36101367127865347, + "learning_rate": 1.4327839570592644e-06, + "loss": 0.0132, + "step": 2189 + }, + { + "epoch": 0.7606807919416464, + "grad_norm": 0.5040284990352497, + "learning_rate": 1.4288439614801803e-06, + "loss": 0.0162, + "step": 2190 + }, + { + "epoch": 0.761028134769017, + "grad_norm": 0.9760018252254876, + "learning_rate": 1.4249084873507412e-06, + "loss": 0.0233, + "step": 2191 + }, + { + "epoch": 0.7613754775963877, + "grad_norm": 0.6260035980310652, + "learning_rate": 1.4209775396536595e-06, + "loss": 0.0247, + "step": 2192 + }, + { + "epoch": 0.7617228204237583, + "grad_norm": 0.723938835995245, + "learning_rate": 1.4170511233659167e-06, + "loss": 0.0232, + "step": 2193 + }, + { + "epoch": 0.7620701632511289, + "grad_norm": 0.5719862112874641, + "learning_rate": 1.4131292434587613e-06, + "loss": 0.0187, + "step": 2194 + }, + { + "epoch": 0.7624175060784995, + "grad_norm": 0.6344034231978065, + "learning_rate": 1.409211904897692e-06, + "loss": 0.0128, + "step": 2195 + }, + { + "epoch": 0.76276484890587, + "grad_norm": 1.0837290646085158, + "learning_rate": 1.4052991126424642e-06, + "loss": 0.03, + "step": 2196 + }, + { + "epoch": 0.7631121917332407, + "grad_norm": 0.5566631869774307, + "learning_rate": 1.4013908716470714e-06, + "loss": 0.0188, + "step": 2197 + }, + { + "epoch": 0.7634595345606113, + "grad_norm": 0.9291107450950402, + "learning_rate": 1.3974871868597495e-06, + "loss": 0.0212, + "step": 2198 + }, + { + "epoch": 0.7638068773879819, + "grad_norm": 0.6433035528020483, + "learning_rate": 1.3935880632229614e-06, + "loss": 0.0148, + "step": 2199 + }, + { + "epoch": 0.7641542202153525, + "grad_norm": 0.5318264623442518, + "learning_rate": 1.3896935056734001e-06, + "loss": 0.0143, + "step": 2200 + }, + { + "epoch": 0.7645015630427232, + "grad_norm": 0.3392416546423096, + "learning_rate": 1.385803519141971e-06, + "loss": 0.01, + "step": 2201 + }, + { + "epoch": 0.7648489058700938, + "grad_norm": 0.4357906997308162, + "learning_rate": 1.3819181085538002e-06, + "loss": 0.0158, + "step": 2202 + }, + { + "epoch": 0.7651962486974644, + "grad_norm": 0.8938906147298566, + "learning_rate": 1.378037278828212e-06, + "loss": 0.0157, + "step": 2203 + }, + { + "epoch": 0.765543591524835, + "grad_norm": 0.812151390132714, + "learning_rate": 1.3741610348787382e-06, + "loss": 0.0201, + "step": 2204 + }, + { + "epoch": 0.7658909343522057, + "grad_norm": 1.046728570840569, + "learning_rate": 1.3702893816130968e-06, + "loss": 0.0138, + "step": 2205 + }, + { + "epoch": 0.7662382771795763, + "grad_norm": 0.8470882919333848, + "learning_rate": 1.366422323933202e-06, + "loss": 0.0217, + "step": 2206 + }, + { + "epoch": 0.7665856200069469, + "grad_norm": 0.6419335535775957, + "learning_rate": 1.362559866735142e-06, + "loss": 0.0188, + "step": 2207 + }, + { + "epoch": 0.7669329628343174, + "grad_norm": 1.0195274448965153, + "learning_rate": 1.3587020149091856e-06, + "loss": 0.0188, + "step": 2208 + }, + { + "epoch": 0.767280305661688, + "grad_norm": 0.8491806303868048, + "learning_rate": 1.3548487733397686e-06, + "loss": 0.0136, + "step": 2209 + }, + { + "epoch": 0.7676276484890587, + "grad_norm": 0.496099824248827, + "learning_rate": 1.351000146905488e-06, + "loss": 0.0159, + "step": 2210 + }, + { + "epoch": 0.7679749913164293, + "grad_norm": 0.36784724782559614, + "learning_rate": 1.3471561404791e-06, + "loss": 0.0099, + "step": 2211 + }, + { + "epoch": 0.7683223341437999, + "grad_norm": 0.853519257078698, + "learning_rate": 1.343316758927513e-06, + "loss": 0.0152, + "step": 2212 + }, + { + "epoch": 0.7686696769711705, + "grad_norm": 0.4794009903353391, + "learning_rate": 1.3394820071117765e-06, + "loss": 0.0178, + "step": 2213 + }, + { + "epoch": 0.7690170197985412, + "grad_norm": 0.5638174933594083, + "learning_rate": 1.3356518898870773e-06, + "loss": 0.0226, + "step": 2214 + }, + { + "epoch": 0.7693643626259118, + "grad_norm": 0.7867218167416455, + "learning_rate": 1.331826412102738e-06, + "loss": 0.02, + "step": 2215 + }, + { + "epoch": 0.7697117054532824, + "grad_norm": 1.2101224505178252, + "learning_rate": 1.3280055786022078e-06, + "loss": 0.0339, + "step": 2216 + }, + { + "epoch": 0.770059048280653, + "grad_norm": 0.4560784481807581, + "learning_rate": 1.3241893942230511e-06, + "loss": 0.0153, + "step": 2217 + }, + { + "epoch": 0.7704063911080237, + "grad_norm": 1.089455223796198, + "learning_rate": 1.3203778637969478e-06, + "loss": 0.02, + "step": 2218 + }, + { + "epoch": 0.7707537339353943, + "grad_norm": 0.8102592696114593, + "learning_rate": 1.3165709921496873e-06, + "loss": 0.0146, + "step": 2219 + }, + { + "epoch": 0.7711010767627648, + "grad_norm": 0.2248352987503352, + "learning_rate": 1.312768784101161e-06, + "loss": 0.0096, + "step": 2220 + }, + { + "epoch": 0.7714484195901354, + "grad_norm": 0.6361836378952002, + "learning_rate": 1.3089712444653525e-06, + "loss": 0.0282, + "step": 2221 + }, + { + "epoch": 0.771795762417506, + "grad_norm": 0.6975087143669682, + "learning_rate": 1.3051783780503353e-06, + "loss": 0.0123, + "step": 2222 + }, + { + "epoch": 0.7721431052448767, + "grad_norm": 1.2464836526873286, + "learning_rate": 1.3013901896582677e-06, + "loss": 0.027, + "step": 2223 + }, + { + "epoch": 0.7724904480722473, + "grad_norm": 0.4282700872207025, + "learning_rate": 1.2976066840853862e-06, + "loss": 0.0198, + "step": 2224 + }, + { + "epoch": 0.7728377908996179, + "grad_norm": 0.4954085912972942, + "learning_rate": 1.2938278661219961e-06, + "loss": 0.011, + "step": 2225 + }, + { + "epoch": 0.7731851337269885, + "grad_norm": 0.64840741080104, + "learning_rate": 1.290053740552466e-06, + "loss": 0.0198, + "step": 2226 + }, + { + "epoch": 0.7735324765543592, + "grad_norm": 0.582268992296493, + "learning_rate": 1.2862843121552293e-06, + "loss": 0.0121, + "step": 2227 + }, + { + "epoch": 0.7738798193817298, + "grad_norm": 0.5788123624597133, + "learning_rate": 1.282519585702765e-06, + "loss": 0.0181, + "step": 2228 + }, + { + "epoch": 0.7742271622091004, + "grad_norm": 0.42036499722806164, + "learning_rate": 1.2787595659616063e-06, + "loss": 0.019, + "step": 2229 + }, + { + "epoch": 0.774574505036471, + "grad_norm": 1.2403906461674592, + "learning_rate": 1.275004257692321e-06, + "loss": 0.0192, + "step": 2230 + }, + { + "epoch": 0.7749218478638417, + "grad_norm": 0.33842093190437655, + "learning_rate": 1.2712536656495167e-06, + "loss": 0.0132, + "step": 2231 + }, + { + "epoch": 0.7752691906912123, + "grad_norm": 0.9723522981189887, + "learning_rate": 1.2675077945818249e-06, + "loss": 0.0194, + "step": 2232 + }, + { + "epoch": 0.7756165335185828, + "grad_norm": 0.6452976079476872, + "learning_rate": 1.263766649231905e-06, + "loss": 0.0235, + "step": 2233 + }, + { + "epoch": 0.7759638763459534, + "grad_norm": 0.5932675671658317, + "learning_rate": 1.260030234336428e-06, + "loss": 0.016, + "step": 2234 + }, + { + "epoch": 0.776311219173324, + "grad_norm": 0.76806867313472, + "learning_rate": 1.2562985546260804e-06, + "loss": 0.0187, + "step": 2235 + }, + { + "epoch": 0.7766585620006947, + "grad_norm": 0.5650815303570309, + "learning_rate": 1.252571614825549e-06, + "loss": 0.0169, + "step": 2236 + }, + { + "epoch": 0.7770059048280653, + "grad_norm": 0.48992636927446587, + "learning_rate": 1.2488494196535238e-06, + "loss": 0.0198, + "step": 2237 + }, + { + "epoch": 0.7773532476554359, + "grad_norm": 0.47434548335024923, + "learning_rate": 1.2451319738226835e-06, + "loss": 0.0148, + "step": 2238 + }, + { + "epoch": 0.7777005904828065, + "grad_norm": 1.587581496347539, + "learning_rate": 1.2414192820396987e-06, + "loss": 0.016, + "step": 2239 + }, + { + "epoch": 0.7780479333101772, + "grad_norm": 0.5941570896375952, + "learning_rate": 1.237711349005214e-06, + "loss": 0.0224, + "step": 2240 + }, + { + "epoch": 0.7783952761375478, + "grad_norm": 0.45192769841614433, + "learning_rate": 1.234008179413856e-06, + "loss": 0.018, + "step": 2241 + }, + { + "epoch": 0.7787426189649184, + "grad_norm": 0.5359045238517698, + "learning_rate": 1.2303097779542151e-06, + "loss": 0.015, + "step": 2242 + }, + { + "epoch": 0.779089961792289, + "grad_norm": 0.3170868716065785, + "learning_rate": 1.2266161493088463e-06, + "loss": 0.0157, + "step": 2243 + }, + { + "epoch": 0.7794373046196597, + "grad_norm": 0.5339325400301773, + "learning_rate": 1.2229272981542628e-06, + "loss": 0.017, + "step": 2244 + }, + { + "epoch": 0.7797846474470302, + "grad_norm": 0.5749594088375014, + "learning_rate": 1.2192432291609296e-06, + "loss": 0.0191, + "step": 2245 + }, + { + "epoch": 0.7801319902744008, + "grad_norm": 0.8657654364305364, + "learning_rate": 1.2155639469932551e-06, + "loss": 0.0157, + "step": 2246 + }, + { + "epoch": 0.7804793331017714, + "grad_norm": 0.6662929638257185, + "learning_rate": 1.2118894563095857e-06, + "loss": 0.0103, + "step": 2247 + }, + { + "epoch": 0.780826675929142, + "grad_norm": 0.6185784496325207, + "learning_rate": 1.2082197617622049e-06, + "loss": 0.0271, + "step": 2248 + }, + { + "epoch": 0.7811740187565127, + "grad_norm": 0.6728910637327504, + "learning_rate": 1.2045548679973234e-06, + "loss": 0.0204, + "step": 2249 + }, + { + "epoch": 0.7815213615838833, + "grad_norm": 0.503622779557992, + "learning_rate": 1.2008947796550714e-06, + "loss": 0.0168, + "step": 2250 + }, + { + "epoch": 0.7818687044112539, + "grad_norm": 0.7057212026990866, + "learning_rate": 1.1972395013694944e-06, + "loss": 0.0178, + "step": 2251 + }, + { + "epoch": 0.7822160472386245, + "grad_norm": 0.9263129101479496, + "learning_rate": 1.1935890377685499e-06, + "loss": 0.0177, + "step": 2252 + }, + { + "epoch": 0.7825633900659952, + "grad_norm": 0.5928517673288792, + "learning_rate": 1.1899433934741023e-06, + "loss": 0.0242, + "step": 2253 + }, + { + "epoch": 0.7829107328933658, + "grad_norm": 0.286499819754983, + "learning_rate": 1.186302573101908e-06, + "loss": 0.011, + "step": 2254 + }, + { + "epoch": 0.7832580757207364, + "grad_norm": 1.0908468375007774, + "learning_rate": 1.1826665812616183e-06, + "loss": 0.0343, + "step": 2255 + }, + { + "epoch": 0.783605418548107, + "grad_norm": 0.3623784367226302, + "learning_rate": 1.1790354225567724e-06, + "loss": 0.0112, + "step": 2256 + }, + { + "epoch": 0.7839527613754776, + "grad_norm": 0.3445252725726735, + "learning_rate": 1.175409101584793e-06, + "loss": 0.0093, + "step": 2257 + }, + { + "epoch": 0.7843001042028482, + "grad_norm": 0.5433146108276392, + "learning_rate": 1.1717876229369679e-06, + "loss": 0.0151, + "step": 2258 + }, + { + "epoch": 0.7846474470302188, + "grad_norm": 0.63386269312144, + "learning_rate": 1.168170991198464e-06, + "loss": 0.0211, + "step": 2259 + }, + { + "epoch": 0.7849947898575894, + "grad_norm": 0.5237366655341864, + "learning_rate": 1.1645592109483083e-06, + "loss": 0.0106, + "step": 2260 + }, + { + "epoch": 0.78534213268496, + "grad_norm": 0.4926580735814283, + "learning_rate": 1.1609522867593825e-06, + "loss": 0.0164, + "step": 2261 + }, + { + "epoch": 0.7856894755123307, + "grad_norm": 0.618563652888975, + "learning_rate": 1.1573502231984252e-06, + "loss": 0.0171, + "step": 2262 + }, + { + "epoch": 0.7860368183397013, + "grad_norm": 0.42754249596279326, + "learning_rate": 1.1537530248260154e-06, + "loss": 0.0094, + "step": 2263 + }, + { + "epoch": 0.7863841611670719, + "grad_norm": 0.4479479051275309, + "learning_rate": 1.1501606961965772e-06, + "loss": 0.0164, + "step": 2264 + }, + { + "epoch": 0.7867315039944425, + "grad_norm": 1.0791717141203099, + "learning_rate": 1.1465732418583652e-06, + "loss": 0.0216, + "step": 2265 + }, + { + "epoch": 0.7870788468218132, + "grad_norm": 0.7585334147960321, + "learning_rate": 1.1429906663534661e-06, + "loss": 0.0199, + "step": 2266 + }, + { + "epoch": 0.7874261896491838, + "grad_norm": 0.5631229523842227, + "learning_rate": 1.1394129742177856e-06, + "loss": 0.0155, + "step": 2267 + }, + { + "epoch": 0.7877735324765544, + "grad_norm": 0.7372418292459169, + "learning_rate": 1.1358401699810513e-06, + "loss": 0.0162, + "step": 2268 + }, + { + "epoch": 0.7881208753039249, + "grad_norm": 0.3814232843076739, + "learning_rate": 1.1322722581667972e-06, + "loss": 0.012, + "step": 2269 + }, + { + "epoch": 0.7884682181312956, + "grad_norm": 0.3900814023948877, + "learning_rate": 1.1287092432923675e-06, + "loss": 0.0165, + "step": 2270 + }, + { + "epoch": 0.7888155609586662, + "grad_norm": 0.3684093471163428, + "learning_rate": 1.1251511298689015e-06, + "loss": 0.0129, + "step": 2271 + }, + { + "epoch": 0.7891629037860368, + "grad_norm": 0.41541558092287234, + "learning_rate": 1.1215979224013395e-06, + "loss": 0.0073, + "step": 2272 + }, + { + "epoch": 0.7895102466134074, + "grad_norm": 0.45431105796049687, + "learning_rate": 1.1180496253884028e-06, + "loss": 0.0135, + "step": 2273 + }, + { + "epoch": 0.789857589440778, + "grad_norm": 0.8135773753424956, + "learning_rate": 1.1145062433226018e-06, + "loss": 0.0125, + "step": 2274 + }, + { + "epoch": 0.7902049322681487, + "grad_norm": 0.6545808136769573, + "learning_rate": 1.1109677806902203e-06, + "loss": 0.0136, + "step": 2275 + }, + { + "epoch": 0.7905522750955193, + "grad_norm": 0.5330476620089217, + "learning_rate": 1.107434241971313e-06, + "loss": 0.0079, + "step": 2276 + }, + { + "epoch": 0.7908996179228899, + "grad_norm": 0.6003919050746848, + "learning_rate": 1.1039056316397046e-06, + "loss": 0.014, + "step": 2277 + }, + { + "epoch": 0.7912469607502605, + "grad_norm": 0.4179604464910238, + "learning_rate": 1.1003819541629772e-06, + "loss": 0.01, + "step": 2278 + }, + { + "epoch": 0.7915943035776312, + "grad_norm": 0.7808363334276369, + "learning_rate": 1.0968632140024683e-06, + "loss": 0.0152, + "step": 2279 + }, + { + "epoch": 0.7919416464050018, + "grad_norm": 0.4167603863620862, + "learning_rate": 1.0933494156132607e-06, + "loss": 0.0169, + "step": 2280 + }, + { + "epoch": 0.7922889892323723, + "grad_norm": 1.012723748239046, + "learning_rate": 1.0898405634441856e-06, + "loss": 0.0185, + "step": 2281 + }, + { + "epoch": 0.7926363320597429, + "grad_norm": 0.6056013118255872, + "learning_rate": 1.0863366619378107e-06, + "loss": 0.016, + "step": 2282 + }, + { + "epoch": 0.7929836748871136, + "grad_norm": 0.7450736652041653, + "learning_rate": 1.0828377155304332e-06, + "loss": 0.0224, + "step": 2283 + }, + { + "epoch": 0.7933310177144842, + "grad_norm": 1.0221605650577856, + "learning_rate": 1.0793437286520765e-06, + "loss": 0.0195, + "step": 2284 + }, + { + "epoch": 0.7936783605418548, + "grad_norm": 0.6636410309989438, + "learning_rate": 1.0758547057264873e-06, + "loss": 0.0199, + "step": 2285 + }, + { + "epoch": 0.7940257033692254, + "grad_norm": 1.1325176009144873, + "learning_rate": 1.072370651171128e-06, + "loss": 0.0233, + "step": 2286 + }, + { + "epoch": 0.794373046196596, + "grad_norm": 0.5341830366952061, + "learning_rate": 1.0688915693971675e-06, + "loss": 0.012, + "step": 2287 + }, + { + "epoch": 0.7947203890239667, + "grad_norm": 0.6153155573713803, + "learning_rate": 1.0654174648094783e-06, + "loss": 0.0198, + "step": 2288 + }, + { + "epoch": 0.7950677318513373, + "grad_norm": 0.9623001317484534, + "learning_rate": 1.0619483418066346e-06, + "loss": 0.0169, + "step": 2289 + }, + { + "epoch": 0.7954150746787079, + "grad_norm": 0.7223221290889841, + "learning_rate": 1.0584842047809047e-06, + "loss": 0.0155, + "step": 2290 + }, + { + "epoch": 0.7957624175060785, + "grad_norm": 0.6504553961602982, + "learning_rate": 1.0550250581182353e-06, + "loss": 0.0224, + "step": 2291 + }, + { + "epoch": 0.7961097603334492, + "grad_norm": 1.0998554308853323, + "learning_rate": 1.0515709061982632e-06, + "loss": 0.0255, + "step": 2292 + }, + { + "epoch": 0.7964571031608197, + "grad_norm": 0.4931210569774452, + "learning_rate": 1.048121753394301e-06, + "loss": 0.0246, + "step": 2293 + }, + { + "epoch": 0.7968044459881903, + "grad_norm": 0.8526641555443321, + "learning_rate": 1.044677604073328e-06, + "loss": 0.028, + "step": 2294 + }, + { + "epoch": 0.7971517888155609, + "grad_norm": 0.6692895381848059, + "learning_rate": 1.0412384625959887e-06, + "loss": 0.02, + "step": 2295 + }, + { + "epoch": 0.7974991316429316, + "grad_norm": 0.419733606925217, + "learning_rate": 1.037804333316591e-06, + "loss": 0.0143, + "step": 2296 + }, + { + "epoch": 0.7978464744703022, + "grad_norm": 0.4921795286660977, + "learning_rate": 1.0343752205830948e-06, + "loss": 0.0166, + "step": 2297 + }, + { + "epoch": 0.7981938172976728, + "grad_norm": 0.7772755035303158, + "learning_rate": 1.030951128737106e-06, + "loss": 0.0218, + "step": 2298 + }, + { + "epoch": 0.7985411601250434, + "grad_norm": 0.5358777594915758, + "learning_rate": 1.027532062113879e-06, + "loss": 0.0237, + "step": 2299 + }, + { + "epoch": 0.798888502952414, + "grad_norm": 0.5660888423523717, + "learning_rate": 1.0241180250423e-06, + "loss": 0.0152, + "step": 2300 + }, + { + "epoch": 0.7992358457797847, + "grad_norm": 1.1542651595626319, + "learning_rate": 1.0207090218448923e-06, + "loss": 0.0125, + "step": 2301 + }, + { + "epoch": 0.7995831886071553, + "grad_norm": 1.2171482202491108, + "learning_rate": 1.0173050568378002e-06, + "loss": 0.0215, + "step": 2302 + }, + { + "epoch": 0.7999305314345259, + "grad_norm": 1.1045425181167858, + "learning_rate": 1.013906134330796e-06, + "loss": 0.0132, + "step": 2303 + }, + { + "epoch": 0.8002778742618964, + "grad_norm": 0.8205260763175557, + "learning_rate": 1.0105122586272615e-06, + "loss": 0.0168, + "step": 2304 + }, + { + "epoch": 0.8006252170892671, + "grad_norm": 0.6460529317376611, + "learning_rate": 1.0071234340241925e-06, + "loss": 0.0146, + "step": 2305 + }, + { + "epoch": 0.8009725599166377, + "grad_norm": 1.1028425848766075, + "learning_rate": 1.0037396648121872e-06, + "loss": 0.0203, + "step": 2306 + }, + { + "epoch": 0.8013199027440083, + "grad_norm": 0.6040772659608722, + "learning_rate": 1.0003609552754468e-06, + "loss": 0.0131, + "step": 2307 + }, + { + "epoch": 0.8016672455713789, + "grad_norm": 0.42232669891432706, + "learning_rate": 9.969873096917614e-07, + "loss": 0.0113, + "step": 2308 + }, + { + "epoch": 0.8020145883987496, + "grad_norm": 0.700969940979702, + "learning_rate": 9.93618732332512e-07, + "loss": 0.0191, + "step": 2309 + }, + { + "epoch": 0.8023619312261202, + "grad_norm": 0.8526190683813358, + "learning_rate": 9.902552274626638e-07, + "loss": 0.0206, + "step": 2310 + }, + { + "epoch": 0.8027092740534908, + "grad_norm": 0.5131311092706733, + "learning_rate": 9.868967993407603e-07, + "loss": 0.0139, + "step": 2311 + }, + { + "epoch": 0.8030566168808614, + "grad_norm": 0.5377387868045655, + "learning_rate": 9.83543452218914e-07, + "loss": 0.0208, + "step": 2312 + }, + { + "epoch": 0.803403959708232, + "grad_norm": 0.6207978174819988, + "learning_rate": 9.801951903428053e-07, + "loss": 0.0169, + "step": 2313 + }, + { + "epoch": 0.8037513025356027, + "grad_norm": 1.0623394552894256, + "learning_rate": 9.768520179516782e-07, + "loss": 0.033, + "step": 2314 + }, + { + "epoch": 0.8040986453629733, + "grad_norm": 0.29785651145166425, + "learning_rate": 9.735139392783326e-07, + "loss": 0.0109, + "step": 2315 + }, + { + "epoch": 0.8044459881903439, + "grad_norm": 2.62736664673775, + "learning_rate": 9.70180958549118e-07, + "loss": 0.025, + "step": 2316 + }, + { + "epoch": 0.8047933310177144, + "grad_norm": 0.5896374439097164, + "learning_rate": 9.66853079983927e-07, + "loss": 0.0225, + "step": 2317 + }, + { + "epoch": 0.8051406738450851, + "grad_norm": 0.7960436391597935, + "learning_rate": 9.63530307796197e-07, + "loss": 0.0112, + "step": 2318 + }, + { + "epoch": 0.8054880166724557, + "grad_norm": 0.615407883572327, + "learning_rate": 9.602126461929002e-07, + "loss": 0.0224, + "step": 2319 + }, + { + "epoch": 0.8058353594998263, + "grad_norm": 0.48130754370985307, + "learning_rate": 9.569000993745336e-07, + "loss": 0.0161, + "step": 2320 + }, + { + "epoch": 0.8061827023271969, + "grad_norm": 0.41555455314622086, + "learning_rate": 9.535926715351207e-07, + "loss": 0.0124, + "step": 2321 + }, + { + "epoch": 0.8065300451545676, + "grad_norm": 0.8444184620512747, + "learning_rate": 9.502903668622055e-07, + "loss": 0.0166, + "step": 2322 + }, + { + "epoch": 0.8068773879819382, + "grad_norm": 0.6578650196091546, + "learning_rate": 9.469931895368462e-07, + "loss": 0.0193, + "step": 2323 + }, + { + "epoch": 0.8072247308093088, + "grad_norm": 0.8813060676876311, + "learning_rate": 9.43701143733603e-07, + "loss": 0.017, + "step": 2324 + }, + { + "epoch": 0.8075720736366794, + "grad_norm": 0.48427747281839323, + "learning_rate": 9.404142336205452e-07, + "loss": 0.018, + "step": 2325 + }, + { + "epoch": 0.80791941646405, + "grad_norm": 0.5218223865429743, + "learning_rate": 9.371324633592399e-07, + "loss": 0.0165, + "step": 2326 + }, + { + "epoch": 0.8082667592914207, + "grad_norm": 0.5497878546670253, + "learning_rate": 9.338558371047429e-07, + "loss": 0.0155, + "step": 2327 + }, + { + "epoch": 0.8086141021187913, + "grad_norm": 0.5828845938065368, + "learning_rate": 9.30584359005598e-07, + "loss": 0.0226, + "step": 2328 + }, + { + "epoch": 0.8089614449461618, + "grad_norm": 0.45607595995290867, + "learning_rate": 9.273180332038328e-07, + "loss": 0.012, + "step": 2329 + }, + { + "epoch": 0.8093087877735324, + "grad_norm": 0.37620117048721097, + "learning_rate": 9.240568638349523e-07, + "loss": 0.0131, + "step": 2330 + }, + { + "epoch": 0.8096561306009031, + "grad_norm": 0.3290671929012356, + "learning_rate": 9.208008550279296e-07, + "loss": 0.0103, + "step": 2331 + }, + { + "epoch": 0.8100034734282737, + "grad_norm": 0.623859993612422, + "learning_rate": 9.175500109052044e-07, + "loss": 0.022, + "step": 2332 + }, + { + "epoch": 0.8103508162556443, + "grad_norm": 0.38503253698459383, + "learning_rate": 9.143043355826802e-07, + "loss": 0.0169, + "step": 2333 + }, + { + "epoch": 0.8106981590830149, + "grad_norm": 0.5608051701389355, + "learning_rate": 9.110638331697158e-07, + "loss": 0.0145, + "step": 2334 + }, + { + "epoch": 0.8110455019103856, + "grad_norm": 0.8102603807769083, + "learning_rate": 9.078285077691179e-07, + "loss": 0.0205, + "step": 2335 + }, + { + "epoch": 0.8113928447377562, + "grad_norm": 0.8701051653222716, + "learning_rate": 9.045983634771388e-07, + "loss": 0.0144, + "step": 2336 + }, + { + "epoch": 0.8117401875651268, + "grad_norm": 0.6909556680271054, + "learning_rate": 9.013734043834743e-07, + "loss": 0.0159, + "step": 2337 + }, + { + "epoch": 0.8120875303924974, + "grad_norm": 1.0316061837686277, + "learning_rate": 8.981536345712544e-07, + "loss": 0.0184, + "step": 2338 + }, + { + "epoch": 0.812434873219868, + "grad_norm": 0.7754820020926894, + "learning_rate": 8.949390581170341e-07, + "loss": 0.0241, + "step": 2339 + }, + { + "epoch": 0.8127822160472387, + "grad_norm": 0.39205221031248105, + "learning_rate": 8.917296790908009e-07, + "loss": 0.0134, + "step": 2340 + }, + { + "epoch": 0.8131295588746092, + "grad_norm": 0.7321244654973654, + "learning_rate": 8.885255015559552e-07, + "loss": 0.0192, + "step": 2341 + }, + { + "epoch": 0.8134769017019798, + "grad_norm": 1.6237885505587426, + "learning_rate": 8.853265295693131e-07, + "loss": 0.0182, + "step": 2342 + }, + { + "epoch": 0.8138242445293504, + "grad_norm": 0.40790172026137156, + "learning_rate": 8.821327671811025e-07, + "loss": 0.0158, + "step": 2343 + }, + { + "epoch": 0.8141715873567211, + "grad_norm": 0.6612071453798002, + "learning_rate": 8.789442184349556e-07, + "loss": 0.0232, + "step": 2344 + }, + { + "epoch": 0.8145189301840917, + "grad_norm": 0.5315701172252235, + "learning_rate": 8.757608873679008e-07, + "loss": 0.0226, + "step": 2345 + }, + { + "epoch": 0.8148662730114623, + "grad_norm": 0.47678792802998987, + "learning_rate": 8.72582778010359e-07, + "loss": 0.0095, + "step": 2346 + }, + { + "epoch": 0.8152136158388329, + "grad_norm": 0.4744887984080288, + "learning_rate": 8.694098943861457e-07, + "loss": 0.0134, + "step": 2347 + }, + { + "epoch": 0.8155609586662036, + "grad_norm": 0.5992243344701217, + "learning_rate": 8.662422405124565e-07, + "loss": 0.0147, + "step": 2348 + }, + { + "epoch": 0.8159083014935742, + "grad_norm": 0.45940543311353116, + "learning_rate": 8.630798203998653e-07, + "loss": 0.0171, + "step": 2349 + }, + { + "epoch": 0.8162556443209448, + "grad_norm": 0.5247973512112908, + "learning_rate": 8.59922638052319e-07, + "loss": 0.0167, + "step": 2350 + }, + { + "epoch": 0.8166029871483154, + "grad_norm": 0.5656596886450084, + "learning_rate": 8.567706974671353e-07, + "loss": 0.012, + "step": 2351 + }, + { + "epoch": 0.816950329975686, + "grad_norm": 0.8482170289088908, + "learning_rate": 8.536240026349951e-07, + "loss": 0.0219, + "step": 2352 + }, + { + "epoch": 0.8172976728030567, + "grad_norm": 0.6035908558385402, + "learning_rate": 8.504825575399356e-07, + "loss": 0.0169, + "step": 2353 + }, + { + "epoch": 0.8176450156304272, + "grad_norm": 1.3540920045468, + "learning_rate": 8.473463661593473e-07, + "loss": 0.0197, + "step": 2354 + }, + { + "epoch": 0.8179923584577978, + "grad_norm": 0.7084599678534361, + "learning_rate": 8.442154324639706e-07, + "loss": 0.0156, + "step": 2355 + }, + { + "epoch": 0.8183397012851684, + "grad_norm": 0.40515237224446915, + "learning_rate": 8.410897604178913e-07, + "loss": 0.0144, + "step": 2356 + }, + { + "epoch": 0.8186870441125391, + "grad_norm": 1.0892964020587188, + "learning_rate": 8.379693539785266e-07, + "loss": 0.0292, + "step": 2357 + }, + { + "epoch": 0.8190343869399097, + "grad_norm": 1.4771693089117002, + "learning_rate": 8.348542170966317e-07, + "loss": 0.0215, + "step": 2358 + }, + { + "epoch": 0.8193817297672803, + "grad_norm": 0.4065081151813109, + "learning_rate": 8.317443537162922e-07, + "loss": 0.0146, + "step": 2359 + }, + { + "epoch": 0.8197290725946509, + "grad_norm": 0.5743831870975237, + "learning_rate": 8.286397677749114e-07, + "loss": 0.0201, + "step": 2360 + }, + { + "epoch": 0.8200764154220216, + "grad_norm": 0.5683746981641697, + "learning_rate": 8.255404632032126e-07, + "loss": 0.017, + "step": 2361 + }, + { + "epoch": 0.8204237582493922, + "grad_norm": 0.6228554253741776, + "learning_rate": 8.224464439252344e-07, + "loss": 0.0233, + "step": 2362 + }, + { + "epoch": 0.8207711010767628, + "grad_norm": 0.9700168786950992, + "learning_rate": 8.193577138583242e-07, + "loss": 0.0145, + "step": 2363 + }, + { + "epoch": 0.8211184439041334, + "grad_norm": 0.5794044470730007, + "learning_rate": 8.162742769131282e-07, + "loss": 0.0197, + "step": 2364 + }, + { + "epoch": 0.8214657867315039, + "grad_norm": 0.36573146405518225, + "learning_rate": 8.131961369935943e-07, + "loss": 0.0165, + "step": 2365 + }, + { + "epoch": 0.8218131295588746, + "grad_norm": 0.8465877042129827, + "learning_rate": 8.101232979969625e-07, + "loss": 0.0154, + "step": 2366 + }, + { + "epoch": 0.8221604723862452, + "grad_norm": 0.40438541583542964, + "learning_rate": 8.070557638137649e-07, + "loss": 0.0104, + "step": 2367 + }, + { + "epoch": 0.8225078152136158, + "grad_norm": 0.8024309888658482, + "learning_rate": 8.039935383278119e-07, + "loss": 0.0194, + "step": 2368 + }, + { + "epoch": 0.8228551580409864, + "grad_norm": 0.5129057990979784, + "learning_rate": 8.009366254161943e-07, + "loss": 0.0228, + "step": 2369 + }, + { + "epoch": 0.8232025008683571, + "grad_norm": 0.3255972401547693, + "learning_rate": 7.978850289492779e-07, + "loss": 0.013, + "step": 2370 + }, + { + "epoch": 0.8235498436957277, + "grad_norm": 0.7555922278203009, + "learning_rate": 7.948387527906987e-07, + "loss": 0.0198, + "step": 2371 + }, + { + "epoch": 0.8238971865230983, + "grad_norm": 0.7146376306999533, + "learning_rate": 7.91797800797352e-07, + "loss": 0.0164, + "step": 2372 + }, + { + "epoch": 0.8242445293504689, + "grad_norm": 0.37704727475605315, + "learning_rate": 7.887621768193954e-07, + "loss": 0.0195, + "step": 2373 + }, + { + "epoch": 0.8245918721778396, + "grad_norm": 0.7235950861900677, + "learning_rate": 7.85731884700241e-07, + "loss": 0.0123, + "step": 2374 + }, + { + "epoch": 0.8249392150052102, + "grad_norm": 0.5352341742681154, + "learning_rate": 7.827069282765475e-07, + "loss": 0.0173, + "step": 2375 + }, + { + "epoch": 0.8252865578325808, + "grad_norm": 0.35228903667411676, + "learning_rate": 7.796873113782205e-07, + "loss": 0.0138, + "step": 2376 + }, + { + "epoch": 0.8256339006599513, + "grad_norm": 0.8622702937938816, + "learning_rate": 7.766730378284065e-07, + "loss": 0.0171, + "step": 2377 + }, + { + "epoch": 0.8259812434873219, + "grad_norm": 0.9369503338878897, + "learning_rate": 7.736641114434834e-07, + "loss": 0.0206, + "step": 2378 + }, + { + "epoch": 0.8263285863146926, + "grad_norm": 0.45846125634992196, + "learning_rate": 7.706605360330594e-07, + "loss": 0.0174, + "step": 2379 + }, + { + "epoch": 0.8266759291420632, + "grad_norm": 0.3955188331989464, + "learning_rate": 7.676623153999696e-07, + "loss": 0.016, + "step": 2380 + }, + { + "epoch": 0.8270232719694338, + "grad_norm": 0.7836478004454387, + "learning_rate": 7.646694533402699e-07, + "loss": 0.0261, + "step": 2381 + }, + { + "epoch": 0.8273706147968044, + "grad_norm": 0.33544601587638473, + "learning_rate": 7.616819536432296e-07, + "loss": 0.0162, + "step": 2382 + }, + { + "epoch": 0.8277179576241751, + "grad_norm": 0.7414613842115939, + "learning_rate": 7.586998200913282e-07, + "loss": 0.0209, + "step": 2383 + }, + { + "epoch": 0.8280653004515457, + "grad_norm": 0.4693067215853339, + "learning_rate": 7.557230564602541e-07, + "loss": 0.0238, + "step": 2384 + }, + { + "epoch": 0.8284126432789163, + "grad_norm": 0.43643356007744005, + "learning_rate": 7.527516665188956e-07, + "loss": 0.0112, + "step": 2385 + }, + { + "epoch": 0.8287599861062869, + "grad_norm": 0.7401366594367462, + "learning_rate": 7.497856540293369e-07, + "loss": 0.0249, + "step": 2386 + }, + { + "epoch": 0.8291073289336576, + "grad_norm": 0.6044238634021094, + "learning_rate": 7.468250227468515e-07, + "loss": 0.0167, + "step": 2387 + }, + { + "epoch": 0.8294546717610282, + "grad_norm": 0.42786196774482066, + "learning_rate": 7.438697764199043e-07, + "loss": 0.0167, + "step": 2388 + }, + { + "epoch": 0.8298020145883988, + "grad_norm": 0.2809219680220145, + "learning_rate": 7.409199187901417e-07, + "loss": 0.0103, + "step": 2389 + }, + { + "epoch": 0.8301493574157693, + "grad_norm": 0.3850053448956444, + "learning_rate": 7.379754535923817e-07, + "loss": 0.011, + "step": 2390 + }, + { + "epoch": 0.8304967002431399, + "grad_norm": 0.5129627163049738, + "learning_rate": 7.35036384554621e-07, + "loss": 0.0144, + "step": 2391 + }, + { + "epoch": 0.8308440430705106, + "grad_norm": 0.7155517690937496, + "learning_rate": 7.321027153980237e-07, + "loss": 0.0183, + "step": 2392 + }, + { + "epoch": 0.8311913858978812, + "grad_norm": 0.7899372791574505, + "learning_rate": 7.291744498369146e-07, + "loss": 0.0179, + "step": 2393 + }, + { + "epoch": 0.8315387287252518, + "grad_norm": 0.6650626503017965, + "learning_rate": 7.262515915787771e-07, + "loss": 0.0193, + "step": 2394 + }, + { + "epoch": 0.8318860715526224, + "grad_norm": 0.5901749401101161, + "learning_rate": 7.233341443242504e-07, + "loss": 0.0212, + "step": 2395 + }, + { + "epoch": 0.8322334143799931, + "grad_norm": 0.4629616119625876, + "learning_rate": 7.204221117671229e-07, + "loss": 0.0154, + "step": 2396 + }, + { + "epoch": 0.8325807572073637, + "grad_norm": 0.5047026630331894, + "learning_rate": 7.175154975943244e-07, + "loss": 0.0177, + "step": 2397 + }, + { + "epoch": 0.8329281000347343, + "grad_norm": 0.9117791956297813, + "learning_rate": 7.146143054859267e-07, + "loss": 0.025, + "step": 2398 + }, + { + "epoch": 0.8332754428621049, + "grad_norm": 0.9994693023274762, + "learning_rate": 7.117185391151371e-07, + "loss": 0.0238, + "step": 2399 + }, + { + "epoch": 0.8336227856894756, + "grad_norm": 0.2469147876407477, + "learning_rate": 7.088282021482934e-07, + "loss": 0.0069, + "step": 2400 + }, + { + "epoch": 0.8339701285168462, + "grad_norm": 0.4503780485766356, + "learning_rate": 7.059432982448571e-07, + "loss": 0.0235, + "step": 2401 + }, + { + "epoch": 0.8343174713442167, + "grad_norm": 0.39895909831018983, + "learning_rate": 7.030638310574123e-07, + "loss": 0.0139, + "step": 2402 + }, + { + "epoch": 0.8346648141715873, + "grad_norm": 0.8121294300478942, + "learning_rate": 7.001898042316602e-07, + "loss": 0.0143, + "step": 2403 + }, + { + "epoch": 0.8350121569989579, + "grad_norm": 0.5682479563314492, + "learning_rate": 6.97321221406414e-07, + "loss": 0.0254, + "step": 2404 + }, + { + "epoch": 0.8353594998263286, + "grad_norm": 0.5858957903067712, + "learning_rate": 6.944580862135935e-07, + "loss": 0.0164, + "step": 2405 + }, + { + "epoch": 0.8357068426536992, + "grad_norm": 0.8591367268668065, + "learning_rate": 6.916004022782191e-07, + "loss": 0.012, + "step": 2406 + }, + { + "epoch": 0.8360541854810698, + "grad_norm": 0.3931982527826898, + "learning_rate": 6.887481732184148e-07, + "loss": 0.0185, + "step": 2407 + }, + { + "epoch": 0.8364015283084404, + "grad_norm": 0.7603616933042572, + "learning_rate": 6.859014026453925e-07, + "loss": 0.0149, + "step": 2408 + }, + { + "epoch": 0.8367488711358111, + "grad_norm": 0.40064016500672056, + "learning_rate": 6.830600941634579e-07, + "loss": 0.0212, + "step": 2409 + }, + { + "epoch": 0.8370962139631817, + "grad_norm": 0.7197529488413131, + "learning_rate": 6.802242513699963e-07, + "loss": 0.0187, + "step": 2410 + }, + { + "epoch": 0.8374435567905523, + "grad_norm": 1.1480554116820771, + "learning_rate": 6.773938778554773e-07, + "loss": 0.015, + "step": 2411 + }, + { + "epoch": 0.8377908996179229, + "grad_norm": 0.45140844123391066, + "learning_rate": 6.745689772034425e-07, + "loss": 0.02, + "step": 2412 + }, + { + "epoch": 0.8381382424452936, + "grad_norm": 0.26189833795464385, + "learning_rate": 6.717495529905077e-07, + "loss": 0.0118, + "step": 2413 + }, + { + "epoch": 0.8384855852726641, + "grad_norm": 0.42657880248327595, + "learning_rate": 6.689356087863508e-07, + "loss": 0.0166, + "step": 2414 + }, + { + "epoch": 0.8388329281000347, + "grad_norm": 0.6022359413113759, + "learning_rate": 6.661271481537157e-07, + "loss": 0.0171, + "step": 2415 + }, + { + "epoch": 0.8391802709274053, + "grad_norm": 0.4458461340423978, + "learning_rate": 6.633241746483993e-07, + "loss": 0.0193, + "step": 2416 + }, + { + "epoch": 0.8395276137547759, + "grad_norm": 0.4109499161581392, + "learning_rate": 6.605266918192543e-07, + "loss": 0.0158, + "step": 2417 + }, + { + "epoch": 0.8398749565821466, + "grad_norm": 0.8284740415704146, + "learning_rate": 6.577347032081816e-07, + "loss": 0.0196, + "step": 2418 + }, + { + "epoch": 0.8402222994095172, + "grad_norm": 0.5823268709121688, + "learning_rate": 6.549482123501249e-07, + "loss": 0.0259, + "step": 2419 + }, + { + "epoch": 0.8405696422368878, + "grad_norm": 0.4351391472723503, + "learning_rate": 6.521672227730658e-07, + "loss": 0.0119, + "step": 2420 + }, + { + "epoch": 0.8409169850642584, + "grad_norm": 0.7478210142980828, + "learning_rate": 6.49391737998023e-07, + "loss": 0.0179, + "step": 2421 + }, + { + "epoch": 0.8412643278916291, + "grad_norm": 0.714684958708105, + "learning_rate": 6.466217615390468e-07, + "loss": 0.0152, + "step": 2422 + }, + { + "epoch": 0.8416116707189997, + "grad_norm": 0.47658348097908554, + "learning_rate": 6.438572969032075e-07, + "loss": 0.0247, + "step": 2423 + }, + { + "epoch": 0.8419590135463703, + "grad_norm": 0.5726724916032655, + "learning_rate": 6.410983475906024e-07, + "loss": 0.0213, + "step": 2424 + }, + { + "epoch": 0.8423063563737408, + "grad_norm": 0.4699141551972462, + "learning_rate": 6.383449170943457e-07, + "loss": 0.0073, + "step": 2425 + }, + { + "epoch": 0.8426536992011116, + "grad_norm": 0.48804476100038013, + "learning_rate": 6.355970089005615e-07, + "loss": 0.0189, + "step": 2426 + }, + { + "epoch": 0.8430010420284821, + "grad_norm": 0.5447605632942257, + "learning_rate": 6.328546264883822e-07, + "loss": 0.0241, + "step": 2427 + }, + { + "epoch": 0.8433483848558527, + "grad_norm": 1.1943754155338666, + "learning_rate": 6.301177733299457e-07, + "loss": 0.0231, + "step": 2428 + }, + { + "epoch": 0.8436957276832233, + "grad_norm": 0.4781400798615543, + "learning_rate": 6.273864528903906e-07, + "loss": 0.0119, + "step": 2429 + }, + { + "epoch": 0.8440430705105939, + "grad_norm": 0.6541732578812876, + "learning_rate": 6.246606686278467e-07, + "loss": 0.0275, + "step": 2430 + }, + { + "epoch": 0.8443904133379646, + "grad_norm": 0.40455279688964085, + "learning_rate": 6.219404239934357e-07, + "loss": 0.0115, + "step": 2431 + }, + { + "epoch": 0.8447377561653352, + "grad_norm": 0.6875625753644061, + "learning_rate": 6.19225722431267e-07, + "loss": 0.0187, + "step": 2432 + }, + { + "epoch": 0.8450850989927058, + "grad_norm": 0.7921616555433921, + "learning_rate": 6.165165673784318e-07, + "loss": 0.0171, + "step": 2433 + }, + { + "epoch": 0.8454324418200764, + "grad_norm": 0.8867189304336157, + "learning_rate": 6.13812962264998e-07, + "loss": 0.0236, + "step": 2434 + }, + { + "epoch": 0.8457797846474471, + "grad_norm": 1.115479157352609, + "learning_rate": 6.111149105140052e-07, + "loss": 0.0177, + "step": 2435 + }, + { + "epoch": 0.8461271274748177, + "grad_norm": 0.5543931361704746, + "learning_rate": 6.084224155414647e-07, + "loss": 0.0105, + "step": 2436 + }, + { + "epoch": 0.8464744703021883, + "grad_norm": 0.48262329533304676, + "learning_rate": 6.057354807563526e-07, + "loss": 0.0109, + "step": 2437 + }, + { + "epoch": 0.8468218131295588, + "grad_norm": 0.41747137999997436, + "learning_rate": 6.030541095606018e-07, + "loss": 0.011, + "step": 2438 + }, + { + "epoch": 0.8471691559569295, + "grad_norm": 0.49664059634379965, + "learning_rate": 6.003783053491025e-07, + "loss": 0.013, + "step": 2439 + }, + { + "epoch": 0.8475164987843001, + "grad_norm": 0.4877777397413885, + "learning_rate": 5.977080715096995e-07, + "loss": 0.0232, + "step": 2440 + }, + { + "epoch": 0.8478638416116707, + "grad_norm": 1.2489307888455528, + "learning_rate": 5.950434114231801e-07, + "loss": 0.0314, + "step": 2441 + }, + { + "epoch": 0.8482111844390413, + "grad_norm": 0.4900829237700161, + "learning_rate": 5.923843284632796e-07, + "loss": 0.0184, + "step": 2442 + }, + { + "epoch": 0.8485585272664119, + "grad_norm": 0.4575837010051548, + "learning_rate": 5.897308259966672e-07, + "loss": 0.0176, + "step": 2443 + }, + { + "epoch": 0.8489058700937826, + "grad_norm": 0.38584962250720306, + "learning_rate": 5.870829073829515e-07, + "loss": 0.0179, + "step": 2444 + }, + { + "epoch": 0.8492532129211532, + "grad_norm": 0.4177353453219893, + "learning_rate": 5.844405759746663e-07, + "loss": 0.013, + "step": 2445 + }, + { + "epoch": 0.8496005557485238, + "grad_norm": 0.4869716570678539, + "learning_rate": 5.818038351172767e-07, + "loss": 0.0191, + "step": 2446 + }, + { + "epoch": 0.8499478985758944, + "grad_norm": 0.6783571708436903, + "learning_rate": 5.791726881491644e-07, + "loss": 0.0213, + "step": 2447 + }, + { + "epoch": 0.8502952414032651, + "grad_norm": 0.6897537136000237, + "learning_rate": 5.765471384016341e-07, + "loss": 0.0162, + "step": 2448 + }, + { + "epoch": 0.8506425842306357, + "grad_norm": 0.44891761410994796, + "learning_rate": 5.739271891988974e-07, + "loss": 0.0214, + "step": 2449 + }, + { + "epoch": 0.8509899270580062, + "grad_norm": 1.0985759896890122, + "learning_rate": 5.713128438580823e-07, + "loss": 0.0229, + "step": 2450 + }, + { + "epoch": 0.8513372698853768, + "grad_norm": 0.4352898494688454, + "learning_rate": 5.687041056892145e-07, + "loss": 0.0143, + "step": 2451 + }, + { + "epoch": 0.8516846127127475, + "grad_norm": 0.5789110156147614, + "learning_rate": 5.66100977995227e-07, + "loss": 0.0201, + "step": 2452 + }, + { + "epoch": 0.8520319555401181, + "grad_norm": 0.6518621410478981, + "learning_rate": 5.635034640719433e-07, + "loss": 0.0216, + "step": 2453 + }, + { + "epoch": 0.8523792983674887, + "grad_norm": 0.9846895298135566, + "learning_rate": 5.609115672080845e-07, + "loss": 0.018, + "step": 2454 + }, + { + "epoch": 0.8527266411948593, + "grad_norm": 0.8407928319843598, + "learning_rate": 5.583252906852594e-07, + "loss": 0.0144, + "step": 2455 + }, + { + "epoch": 0.8530739840222299, + "grad_norm": 0.5601226706008987, + "learning_rate": 5.557446377779546e-07, + "loss": 0.0215, + "step": 2456 + }, + { + "epoch": 0.8534213268496006, + "grad_norm": 0.5001353143572673, + "learning_rate": 5.53169611753544e-07, + "loss": 0.0183, + "step": 2457 + }, + { + "epoch": 0.8537686696769712, + "grad_norm": 0.5236537168132929, + "learning_rate": 5.506002158722751e-07, + "loss": 0.0218, + "step": 2458 + }, + { + "epoch": 0.8541160125043418, + "grad_norm": 0.36795187111448785, + "learning_rate": 5.48036453387265e-07, + "loss": 0.0129, + "step": 2459 + }, + { + "epoch": 0.8544633553317124, + "grad_norm": 0.7797517640305994, + "learning_rate": 5.454783275445003e-07, + "loss": 0.0167, + "step": 2460 + }, + { + "epoch": 0.8548106981590831, + "grad_norm": 0.40671604163293146, + "learning_rate": 5.429258415828298e-07, + "loss": 0.0179, + "step": 2461 + }, + { + "epoch": 0.8551580409864536, + "grad_norm": 0.8392548489754016, + "learning_rate": 5.403789987339647e-07, + "loss": 0.0164, + "step": 2462 + }, + { + "epoch": 0.8555053838138242, + "grad_norm": 1.8042310161892017, + "learning_rate": 5.378378022224679e-07, + "loss": 0.0182, + "step": 2463 + }, + { + "epoch": 0.8558527266411948, + "grad_norm": 0.9597712322684139, + "learning_rate": 5.353022552657533e-07, + "loss": 0.0242, + "step": 2464 + }, + { + "epoch": 0.8562000694685655, + "grad_norm": 1.2902699361504208, + "learning_rate": 5.327723610740843e-07, + "loss": 0.0194, + "step": 2465 + }, + { + "epoch": 0.8565474122959361, + "grad_norm": 0.9025915486754845, + "learning_rate": 5.302481228505674e-07, + "loss": 0.0159, + "step": 2466 + }, + { + "epoch": 0.8568947551233067, + "grad_norm": 0.8935041671025581, + "learning_rate": 5.277295437911462e-07, + "loss": 0.0206, + "step": 2467 + }, + { + "epoch": 0.8572420979506773, + "grad_norm": 0.8398011042782978, + "learning_rate": 5.252166270845994e-07, + "loss": 0.0175, + "step": 2468 + }, + { + "epoch": 0.8575894407780479, + "grad_norm": 0.6273784672353577, + "learning_rate": 5.227093759125368e-07, + "loss": 0.013, + "step": 2469 + }, + { + "epoch": 0.8579367836054186, + "grad_norm": 0.7599839379620519, + "learning_rate": 5.20207793449397e-07, + "loss": 0.0148, + "step": 2470 + }, + { + "epoch": 0.8582841264327892, + "grad_norm": 0.9137884963550477, + "learning_rate": 5.177118828624395e-07, + "loss": 0.0177, + "step": 2471 + }, + { + "epoch": 0.8586314692601598, + "grad_norm": 0.4027490941272288, + "learning_rate": 5.152216473117416e-07, + "loss": 0.0115, + "step": 2472 + }, + { + "epoch": 0.8589788120875304, + "grad_norm": 0.4067939554578114, + "learning_rate": 5.127370899501988e-07, + "loss": 0.0132, + "step": 2473 + }, + { + "epoch": 0.859326154914901, + "grad_norm": 0.39233639414180854, + "learning_rate": 5.10258213923513e-07, + "loss": 0.0134, + "step": 2474 + }, + { + "epoch": 0.8596734977422716, + "grad_norm": 0.5636819438029829, + "learning_rate": 5.07785022370198e-07, + "loss": 0.0162, + "step": 2475 + }, + { + "epoch": 0.8600208405696422, + "grad_norm": 0.45051474131353364, + "learning_rate": 5.053175184215653e-07, + "loss": 0.019, + "step": 2476 + }, + { + "epoch": 0.8603681833970128, + "grad_norm": 0.504428208604678, + "learning_rate": 5.028557052017302e-07, + "loss": 0.0128, + "step": 2477 + }, + { + "epoch": 0.8607155262243835, + "grad_norm": 0.5002145933355393, + "learning_rate": 5.003995858275984e-07, + "loss": 0.0148, + "step": 2478 + }, + { + "epoch": 0.8610628690517541, + "grad_norm": 0.7066450698383614, + "learning_rate": 4.979491634088712e-07, + "loss": 0.0216, + "step": 2479 + }, + { + "epoch": 0.8614102118791247, + "grad_norm": 0.7209554287180902, + "learning_rate": 4.955044410480326e-07, + "loss": 0.0102, + "step": 2480 + }, + { + "epoch": 0.8617575547064953, + "grad_norm": 0.37457104151587356, + "learning_rate": 4.93065421840353e-07, + "loss": 0.0129, + "step": 2481 + }, + { + "epoch": 0.8621048975338659, + "grad_norm": 0.35405427472628026, + "learning_rate": 4.906321088738791e-07, + "loss": 0.0145, + "step": 2482 + }, + { + "epoch": 0.8624522403612366, + "grad_norm": 0.9145616386018184, + "learning_rate": 4.882045052294371e-07, + "loss": 0.0154, + "step": 2483 + }, + { + "epoch": 0.8627995831886072, + "grad_norm": 0.45595222296109655, + "learning_rate": 4.857826139806194e-07, + "loss": 0.0159, + "step": 2484 + }, + { + "epoch": 0.8631469260159778, + "grad_norm": 0.6853468761771402, + "learning_rate": 4.833664381937908e-07, + "loss": 0.015, + "step": 2485 + }, + { + "epoch": 0.8634942688433483, + "grad_norm": 0.9539717763403811, + "learning_rate": 4.809559809280756e-07, + "loss": 0.0173, + "step": 2486 + }, + { + "epoch": 0.863841611670719, + "grad_norm": 0.8437730803188537, + "learning_rate": 4.785512452353619e-07, + "loss": 0.0244, + "step": 2487 + }, + { + "epoch": 0.8641889544980896, + "grad_norm": 0.7966421861571771, + "learning_rate": 4.7615223416029086e-07, + "loss": 0.0232, + "step": 2488 + }, + { + "epoch": 0.8645362973254602, + "grad_norm": 0.5829026692062281, + "learning_rate": 4.737589507402546e-07, + "loss": 0.0191, + "step": 2489 + }, + { + "epoch": 0.8648836401528308, + "grad_norm": 0.6971309820400614, + "learning_rate": 4.7137139800539746e-07, + "loss": 0.0147, + "step": 2490 + }, + { + "epoch": 0.8652309829802015, + "grad_norm": 0.9730338109684425, + "learning_rate": 4.689895789786059e-07, + "loss": 0.027, + "step": 2491 + }, + { + "epoch": 0.8655783258075721, + "grad_norm": 0.46701122458001026, + "learning_rate": 4.666134966755059e-07, + "loss": 0.0179, + "step": 2492 + }, + { + "epoch": 0.8659256686349427, + "grad_norm": 0.7261076983049326, + "learning_rate": 4.6424315410446117e-07, + "loss": 0.0231, + "step": 2493 + }, + { + "epoch": 0.8662730114623133, + "grad_norm": 0.4333800390573832, + "learning_rate": 4.618785542665688e-07, + "loss": 0.0171, + "step": 2494 + }, + { + "epoch": 0.8666203542896839, + "grad_norm": 0.7645732325430994, + "learning_rate": 4.5951970015565617e-07, + "loss": 0.0152, + "step": 2495 + }, + { + "epoch": 0.8669676971170546, + "grad_norm": 1.7725027824937925, + "learning_rate": 4.571665947582726e-07, + "loss": 0.0164, + "step": 2496 + }, + { + "epoch": 0.8673150399444252, + "grad_norm": 0.5820223340143823, + "learning_rate": 4.5481924105369e-07, + "loss": 0.0147, + "step": 2497 + }, + { + "epoch": 0.8676623827717957, + "grad_norm": 0.9426845874078936, + "learning_rate": 4.5247764201390045e-07, + "loss": 0.0181, + "step": 2498 + }, + { + "epoch": 0.8680097255991663, + "grad_norm": 0.40736158785553966, + "learning_rate": 4.5014180060360843e-07, + "loss": 0.0161, + "step": 2499 + }, + { + "epoch": 0.868357068426537, + "grad_norm": 0.35291016663434527, + "learning_rate": 4.4781171978022786e-07, + "loss": 0.0136, + "step": 2500 + }, + { + "epoch": 0.8687044112539076, + "grad_norm": 0.7083995120920576, + "learning_rate": 4.4548740249387934e-07, + "loss": 0.0287, + "step": 2501 + }, + { + "epoch": 0.8690517540812782, + "grad_norm": 0.6893058334285791, + "learning_rate": 4.4316885168738776e-07, + "loss": 0.0191, + "step": 2502 + }, + { + "epoch": 0.8693990969086488, + "grad_norm": 0.4921717937501951, + "learning_rate": 4.4085607029627717e-07, + "loss": 0.0112, + "step": 2503 + }, + { + "epoch": 0.8697464397360195, + "grad_norm": 0.5902281066197675, + "learning_rate": 4.3854906124876415e-07, + "loss": 0.0157, + "step": 2504 + }, + { + "epoch": 0.8700937825633901, + "grad_norm": 0.5650017177846125, + "learning_rate": 4.3624782746575886e-07, + "loss": 0.0221, + "step": 2505 + }, + { + "epoch": 0.8704411253907607, + "grad_norm": 0.5938641449620281, + "learning_rate": 4.3395237186086014e-07, + "loss": 0.0119, + "step": 2506 + }, + { + "epoch": 0.8707884682181313, + "grad_norm": 0.44786371675122244, + "learning_rate": 4.316626973403487e-07, + "loss": 0.0138, + "step": 2507 + }, + { + "epoch": 0.8711358110455019, + "grad_norm": 0.529485412783005, + "learning_rate": 4.2937880680318846e-07, + "loss": 0.0203, + "step": 2508 + }, + { + "epoch": 0.8714831538728726, + "grad_norm": 0.4695178384130346, + "learning_rate": 4.2710070314101845e-07, + "loss": 0.0171, + "step": 2509 + }, + { + "epoch": 0.8718304967002432, + "grad_norm": 0.6865782270212439, + "learning_rate": 4.2482838923815163e-07, + "loss": 0.0221, + "step": 2510 + }, + { + "epoch": 0.8721778395276137, + "grad_norm": 1.008599802575518, + "learning_rate": 4.2256186797156986e-07, + "loss": 0.0163, + "step": 2511 + }, + { + "epoch": 0.8725251823549843, + "grad_norm": 0.8277810535713074, + "learning_rate": 4.203011422109227e-07, + "loss": 0.0131, + "step": 2512 + }, + { + "epoch": 0.872872525182355, + "grad_norm": 0.47209426725218306, + "learning_rate": 4.180462148185188e-07, + "loss": 0.0115, + "step": 2513 + }, + { + "epoch": 0.8732198680097256, + "grad_norm": 1.0335741947460342, + "learning_rate": 4.1579708864932956e-07, + "loss": 0.0202, + "step": 2514 + }, + { + "epoch": 0.8735672108370962, + "grad_norm": 0.4371396632860345, + "learning_rate": 4.1355376655097704e-07, + "loss": 0.0187, + "step": 2515 + }, + { + "epoch": 0.8739145536644668, + "grad_norm": 0.4841174212281427, + "learning_rate": 4.113162513637392e-07, + "loss": 0.0146, + "step": 2516 + }, + { + "epoch": 0.8742618964918375, + "grad_norm": 0.5418422212985032, + "learning_rate": 4.090845459205378e-07, + "loss": 0.0205, + "step": 2517 + }, + { + "epoch": 0.8746092393192081, + "grad_norm": 0.6175530773760641, + "learning_rate": 4.0685865304694205e-07, + "loss": 0.0257, + "step": 2518 + }, + { + "epoch": 0.8749565821465787, + "grad_norm": 0.6600694577668837, + "learning_rate": 4.0463857556115924e-07, + "loss": 0.0126, + "step": 2519 + }, + { + "epoch": 0.8753039249739493, + "grad_norm": 0.45151100408746436, + "learning_rate": 4.0242431627403656e-07, + "loss": 0.0108, + "step": 2520 + }, + { + "epoch": 0.8756512678013199, + "grad_norm": 0.7267112922014047, + "learning_rate": 4.0021587798905247e-07, + "loss": 0.0165, + "step": 2521 + }, + { + "epoch": 0.8759986106286906, + "grad_norm": 0.7989326256489759, + "learning_rate": 3.980132635023154e-07, + "loss": 0.022, + "step": 2522 + }, + { + "epoch": 0.8763459534560611, + "grad_norm": 1.297364986774049, + "learning_rate": 3.9581647560256175e-07, + "loss": 0.0218, + "step": 2523 + }, + { + "epoch": 0.8766932962834317, + "grad_norm": 0.5435363177683816, + "learning_rate": 3.9362551707115114e-07, + "loss": 0.0247, + "step": 2524 + }, + { + "epoch": 0.8770406391108023, + "grad_norm": 0.38578321310369523, + "learning_rate": 3.914403906820613e-07, + "loss": 0.0133, + "step": 2525 + }, + { + "epoch": 0.877387981938173, + "grad_norm": 0.4932282625829498, + "learning_rate": 3.892610992018847e-07, + "loss": 0.016, + "step": 2526 + }, + { + "epoch": 0.8777353247655436, + "grad_norm": 0.3669442115959157, + "learning_rate": 3.870876453898292e-07, + "loss": 0.0166, + "step": 2527 + }, + { + "epoch": 0.8780826675929142, + "grad_norm": 0.9342073340109053, + "learning_rate": 3.849200319977109e-07, + "loss": 0.0228, + "step": 2528 + }, + { + "epoch": 0.8784300104202848, + "grad_norm": 0.25770986108617316, + "learning_rate": 3.8275826176994936e-07, + "loss": 0.0092, + "step": 2529 + }, + { + "epoch": 0.8787773532476555, + "grad_norm": 0.4339581883005602, + "learning_rate": 3.8060233744356634e-07, + "loss": 0.0168, + "step": 2530 + }, + { + "epoch": 0.8791246960750261, + "grad_norm": 0.9289029914869522, + "learning_rate": 3.784522617481845e-07, + "loss": 0.0215, + "step": 2531 + }, + { + "epoch": 0.8794720389023967, + "grad_norm": 0.5849896866294249, + "learning_rate": 3.7630803740602073e-07, + "loss": 0.0224, + "step": 2532 + }, + { + "epoch": 0.8798193817297673, + "grad_norm": 0.7327664241177156, + "learning_rate": 3.7416966713188174e-07, + "loss": 0.0106, + "step": 2533 + }, + { + "epoch": 0.8801667245571378, + "grad_norm": 0.3696886941163735, + "learning_rate": 3.7203715363316294e-07, + "loss": 0.0089, + "step": 2534 + }, + { + "epoch": 0.8805140673845085, + "grad_norm": 0.49024463460685797, + "learning_rate": 3.699104996098457e-07, + "loss": 0.0151, + "step": 2535 + }, + { + "epoch": 0.8808614102118791, + "grad_norm": 0.7947374546207403, + "learning_rate": 3.6778970775449283e-07, + "loss": 0.0142, + "step": 2536 + }, + { + "epoch": 0.8812087530392497, + "grad_norm": 0.4625649202944896, + "learning_rate": 3.656747807522437e-07, + "loss": 0.0109, + "step": 2537 + }, + { + "epoch": 0.8815560958666203, + "grad_norm": 0.5558918378768948, + "learning_rate": 3.6356572128081134e-07, + "loss": 0.0238, + "step": 2538 + }, + { + "epoch": 0.881903438693991, + "grad_norm": 0.42918432489972863, + "learning_rate": 3.614625320104831e-07, + "loss": 0.0153, + "step": 2539 + }, + { + "epoch": 0.8822507815213616, + "grad_norm": 1.027886704845079, + "learning_rate": 3.593652156041122e-07, + "loss": 0.0195, + "step": 2540 + }, + { + "epoch": 0.8825981243487322, + "grad_norm": 0.33562683798972165, + "learning_rate": 3.572737747171151e-07, + "loss": 0.0125, + "step": 2541 + }, + { + "epoch": 0.8829454671761028, + "grad_norm": 0.5951105843622697, + "learning_rate": 3.5518821199747035e-07, + "loss": 0.0175, + "step": 2542 + }, + { + "epoch": 0.8832928100034735, + "grad_norm": 0.6577768489226572, + "learning_rate": 3.531085300857151e-07, + "loss": 0.0252, + "step": 2543 + }, + { + "epoch": 0.8836401528308441, + "grad_norm": 1.1447374774363468, + "learning_rate": 3.510347316149393e-07, + "loss": 0.0253, + "step": 2544 + }, + { + "epoch": 0.8839874956582147, + "grad_norm": 1.075748573924704, + "learning_rate": 3.4896681921078477e-07, + "loss": 0.0138, + "step": 2545 + }, + { + "epoch": 0.8843348384855853, + "grad_norm": 0.5241854815133766, + "learning_rate": 3.469047954914395e-07, + "loss": 0.0178, + "step": 2546 + }, + { + "epoch": 0.8846821813129558, + "grad_norm": 0.48865176944956945, + "learning_rate": 3.4484866306763896e-07, + "loss": 0.0123, + "step": 2547 + }, + { + "epoch": 0.8850295241403265, + "grad_norm": 0.48077206825883056, + "learning_rate": 3.4279842454265523e-07, + "loss": 0.0114, + "step": 2548 + }, + { + "epoch": 0.8853768669676971, + "grad_norm": 0.7919381272023467, + "learning_rate": 3.407540825123024e-07, + "loss": 0.015, + "step": 2549 + }, + { + "epoch": 0.8857242097950677, + "grad_norm": 0.9115719414888127, + "learning_rate": 3.3871563956492546e-07, + "loss": 0.0215, + "step": 2550 + }, + { + "epoch": 0.8860715526224383, + "grad_norm": 0.9428448234794181, + "learning_rate": 3.36683098281404e-07, + "loss": 0.019, + "step": 2551 + }, + { + "epoch": 0.886418895449809, + "grad_norm": 0.5725716009079287, + "learning_rate": 3.346564612351416e-07, + "loss": 0.0221, + "step": 2552 + }, + { + "epoch": 0.8867662382771796, + "grad_norm": 0.5339812873256146, + "learning_rate": 3.3263573099207025e-07, + "loss": 0.0156, + "step": 2553 + }, + { + "epoch": 0.8871135811045502, + "grad_norm": 0.7200318917306987, + "learning_rate": 3.3062091011064e-07, + "loss": 0.0189, + "step": 2554 + }, + { + "epoch": 0.8874609239319208, + "grad_norm": 0.8289865754088781, + "learning_rate": 3.2861200114182257e-07, + "loss": 0.0191, + "step": 2555 + }, + { + "epoch": 0.8878082667592915, + "grad_norm": 0.41087275655115824, + "learning_rate": 3.2660900662910056e-07, + "loss": 0.014, + "step": 2556 + }, + { + "epoch": 0.8881556095866621, + "grad_norm": 0.4168171217347719, + "learning_rate": 3.2461192910847263e-07, + "loss": 0.0127, + "step": 2557 + }, + { + "epoch": 0.8885029524140327, + "grad_norm": 0.3718290081453453, + "learning_rate": 3.2262077110844224e-07, + "loss": 0.0094, + "step": 2558 + }, + { + "epoch": 0.8888502952414032, + "grad_norm": 0.32435438005787876, + "learning_rate": 3.206355351500184e-07, + "loss": 0.0123, + "step": 2559 + }, + { + "epoch": 0.8891976380687738, + "grad_norm": 0.4727940603046291, + "learning_rate": 3.186562237467156e-07, + "loss": 0.014, + "step": 2560 + }, + { + "epoch": 0.8895449808961445, + "grad_norm": 0.3199322989762899, + "learning_rate": 3.16682839404544e-07, + "loss": 0.0156, + "step": 2561 + }, + { + "epoch": 0.8898923237235151, + "grad_norm": 0.5052862572229199, + "learning_rate": 3.147153846220108e-07, + "loss": 0.0162, + "step": 2562 + }, + { + "epoch": 0.8902396665508857, + "grad_norm": 0.4935045434697062, + "learning_rate": 3.127538618901144e-07, + "loss": 0.0201, + "step": 2563 + }, + { + "epoch": 0.8905870093782563, + "grad_norm": 0.45484641731771275, + "learning_rate": 3.107982736923448e-07, + "loss": 0.012, + "step": 2564 + }, + { + "epoch": 0.890934352205627, + "grad_norm": 0.4589036891401816, + "learning_rate": 3.0884862250467715e-07, + "loss": 0.0185, + "step": 2565 + }, + { + "epoch": 0.8912816950329976, + "grad_norm": 0.4845147421338059, + "learning_rate": 3.069049107955696e-07, + "loss": 0.0122, + "step": 2566 + }, + { + "epoch": 0.8916290378603682, + "grad_norm": 0.4953532759104348, + "learning_rate": 3.0496714102595914e-07, + "loss": 0.0105, + "step": 2567 + }, + { + "epoch": 0.8919763806877388, + "grad_norm": 1.1581053470310272, + "learning_rate": 3.030353156492627e-07, + "loss": 0.0217, + "step": 2568 + }, + { + "epoch": 0.8923237235151095, + "grad_norm": 0.5167371489599756, + "learning_rate": 3.0110943711136874e-07, + "loss": 0.0119, + "step": 2569 + }, + { + "epoch": 0.8926710663424801, + "grad_norm": 0.6648490939529547, + "learning_rate": 2.9918950785063684e-07, + "loss": 0.015, + "step": 2570 + }, + { + "epoch": 0.8930184091698506, + "grad_norm": 0.7019645201421016, + "learning_rate": 2.9727553029789303e-07, + "loss": 0.019, + "step": 2571 + }, + { + "epoch": 0.8933657519972212, + "grad_norm": 0.3570823917219566, + "learning_rate": 2.953675068764311e-07, + "loss": 0.0123, + "step": 2572 + }, + { + "epoch": 0.8937130948245918, + "grad_norm": 0.44958846952146136, + "learning_rate": 2.9346544000200373e-07, + "loss": 0.0175, + "step": 2573 + }, + { + "epoch": 0.8940604376519625, + "grad_norm": 0.8033961501217101, + "learning_rate": 2.915693320828222e-07, + "loss": 0.0245, + "step": 2574 + }, + { + "epoch": 0.8944077804793331, + "grad_norm": 0.6221815609588339, + "learning_rate": 2.89679185519553e-07, + "loss": 0.0164, + "step": 2575 + }, + { + "epoch": 0.8947551233067037, + "grad_norm": 0.8670700629752854, + "learning_rate": 2.877950027053167e-07, + "loss": 0.0251, + "step": 2576 + }, + { + "epoch": 0.8951024661340743, + "grad_norm": 0.33526691551846566, + "learning_rate": 2.859167860256801e-07, + "loss": 0.0145, + "step": 2577 + }, + { + "epoch": 0.895449808961445, + "grad_norm": 0.3738576749343047, + "learning_rate": 2.8404453785866037e-07, + "loss": 0.0126, + "step": 2578 + }, + { + "epoch": 0.8957971517888156, + "grad_norm": 0.6834813763355805, + "learning_rate": 2.8217826057471423e-07, + "loss": 0.0273, + "step": 2579 + }, + { + "epoch": 0.8961444946161862, + "grad_norm": 0.47554504790974056, + "learning_rate": 2.8031795653674033e-07, + "loss": 0.0161, + "step": 2580 + }, + { + "epoch": 0.8964918374435568, + "grad_norm": 0.3690282253450739, + "learning_rate": 2.7846362810007355e-07, + "loss": 0.0151, + "step": 2581 + }, + { + "epoch": 0.8968391802709275, + "grad_norm": 0.7307835358362869, + "learning_rate": 2.766152776124853e-07, + "loss": 0.0207, + "step": 2582 + }, + { + "epoch": 0.897186523098298, + "grad_norm": 0.755714402406654, + "learning_rate": 2.7477290741417526e-07, + "loss": 0.0172, + "step": 2583 + }, + { + "epoch": 0.8975338659256686, + "grad_norm": 0.43973806244842545, + "learning_rate": 2.729365198377748e-07, + "loss": 0.0147, + "step": 2584 + }, + { + "epoch": 0.8978812087530392, + "grad_norm": 0.9009144598568006, + "learning_rate": 2.711061172083368e-07, + "loss": 0.0192, + "step": 2585 + }, + { + "epoch": 0.8982285515804098, + "grad_norm": 0.8203741129744253, + "learning_rate": 2.692817018433397e-07, + "loss": 0.0147, + "step": 2586 + }, + { + "epoch": 0.8985758944077805, + "grad_norm": 0.8664237660996313, + "learning_rate": 2.6746327605268017e-07, + "loss": 0.018, + "step": 2587 + }, + { + "epoch": 0.8989232372351511, + "grad_norm": 0.49174214473254213, + "learning_rate": 2.656508421386722e-07, + "loss": 0.019, + "step": 2588 + }, + { + "epoch": 0.8992705800625217, + "grad_norm": 0.41984596348114944, + "learning_rate": 2.638444023960418e-07, + "loss": 0.0147, + "step": 2589 + }, + { + "epoch": 0.8996179228898923, + "grad_norm": 0.6425656015149634, + "learning_rate": 2.6204395911192836e-07, + "loss": 0.0142, + "step": 2590 + }, + { + "epoch": 0.899965265717263, + "grad_norm": 0.6500871093127806, + "learning_rate": 2.6024951456587677e-07, + "loss": 0.0146, + "step": 2591 + }, + { + "epoch": 0.9003126085446336, + "grad_norm": 0.7086875286298896, + "learning_rate": 2.5846107102983744e-07, + "loss": 0.0243, + "step": 2592 + }, + { + "epoch": 0.9006599513720042, + "grad_norm": 0.38145659806209425, + "learning_rate": 2.566786307681635e-07, + "loss": 0.0131, + "step": 2593 + }, + { + "epoch": 0.9010072941993748, + "grad_norm": 0.6205010960638877, + "learning_rate": 2.549021960376075e-07, + "loss": 0.015, + "step": 2594 + }, + { + "epoch": 0.9013546370267453, + "grad_norm": 0.5706136061121067, + "learning_rate": 2.531317690873181e-07, + "loss": 0.0197, + "step": 2595 + }, + { + "epoch": 0.901701979854116, + "grad_norm": 0.3901107728118566, + "learning_rate": 2.5136735215883613e-07, + "loss": 0.0095, + "step": 2596 + }, + { + "epoch": 0.9020493226814866, + "grad_norm": 0.5881915928887412, + "learning_rate": 2.4960894748609465e-07, + "loss": 0.0202, + "step": 2597 + }, + { + "epoch": 0.9023966655088572, + "grad_norm": 0.42911566159079534, + "learning_rate": 2.4785655729541555e-07, + "loss": 0.0159, + "step": 2598 + }, + { + "epoch": 0.9027440083362278, + "grad_norm": 0.27083599132726704, + "learning_rate": 2.46110183805503e-07, + "loss": 0.0074, + "step": 2599 + }, + { + "epoch": 0.9030913511635985, + "grad_norm": 0.6530377955588946, + "learning_rate": 2.4436982922744547e-07, + "loss": 0.0168, + "step": 2600 + }, + { + "epoch": 0.9034386939909691, + "grad_norm": 0.8639097600309369, + "learning_rate": 2.426354957647098e-07, + "loss": 0.0139, + "step": 2601 + }, + { + "epoch": 0.9037860368183397, + "grad_norm": 0.5654469170463523, + "learning_rate": 2.4090718561314064e-07, + "loss": 0.0218, + "step": 2602 + }, + { + "epoch": 0.9041333796457103, + "grad_norm": 0.8583056813584015, + "learning_rate": 2.391849009609559e-07, + "loss": 0.0109, + "step": 2603 + }, + { + "epoch": 0.904480722473081, + "grad_norm": 0.6257033656528606, + "learning_rate": 2.374686439887436e-07, + "loss": 0.0149, + "step": 2604 + }, + { + "epoch": 0.9048280653004516, + "grad_norm": 0.5398376002785664, + "learning_rate": 2.3575841686946155e-07, + "loss": 0.0164, + "step": 2605 + }, + { + "epoch": 0.9051754081278222, + "grad_norm": 0.6221925809468902, + "learning_rate": 2.3405422176843329e-07, + "loss": 0.0256, + "step": 2606 + }, + { + "epoch": 0.9055227509551927, + "grad_norm": 0.5713859427829016, + "learning_rate": 2.3235606084334285e-07, + "loss": 0.0226, + "step": 2607 + }, + { + "epoch": 0.9058700937825633, + "grad_norm": 1.3040906101475984, + "learning_rate": 2.3066393624423754e-07, + "loss": 0.0166, + "step": 2608 + }, + { + "epoch": 0.906217436609934, + "grad_norm": 0.7255451058697687, + "learning_rate": 2.2897785011351982e-07, + "loss": 0.0209, + "step": 2609 + }, + { + "epoch": 0.9065647794373046, + "grad_norm": 0.6422064768024528, + "learning_rate": 2.2729780458594751e-07, + "loss": 0.0109, + "step": 2610 + }, + { + "epoch": 0.9069121222646752, + "grad_norm": 0.3372897791705386, + "learning_rate": 2.256238017886314e-07, + "loss": 0.0187, + "step": 2611 + }, + { + "epoch": 0.9072594650920458, + "grad_norm": 0.6097453676252612, + "learning_rate": 2.2395584384102943e-07, + "loss": 0.0207, + "step": 2612 + }, + { + "epoch": 0.9076068079194165, + "grad_norm": 0.6884196263626761, + "learning_rate": 2.2229393285494893e-07, + "loss": 0.0144, + "step": 2613 + }, + { + "epoch": 0.9079541507467871, + "grad_norm": 1.0714957544176478, + "learning_rate": 2.2063807093453736e-07, + "loss": 0.0182, + "step": 2614 + }, + { + "epoch": 0.9083014935741577, + "grad_norm": 0.8267148959880947, + "learning_rate": 2.1898826017628772e-07, + "loss": 0.0239, + "step": 2615 + }, + { + "epoch": 0.9086488364015283, + "grad_norm": 0.7458685484740719, + "learning_rate": 2.173445026690285e-07, + "loss": 0.0211, + "step": 2616 + }, + { + "epoch": 0.908996179228899, + "grad_norm": 0.5686980841248143, + "learning_rate": 2.1570680049392556e-07, + "loss": 0.0125, + "step": 2617 + }, + { + "epoch": 0.9093435220562696, + "grad_norm": 0.6449900442523265, + "learning_rate": 2.1407515572447747e-07, + "loss": 0.0241, + "step": 2618 + }, + { + "epoch": 0.9096908648836401, + "grad_norm": 0.4341128255677446, + "learning_rate": 2.1244957042651394e-07, + "loss": 0.0169, + "step": 2619 + }, + { + "epoch": 0.9100382077110107, + "grad_norm": 0.6600049909611397, + "learning_rate": 2.108300466581925e-07, + "loss": 0.0152, + "step": 2620 + }, + { + "epoch": 0.9103855505383813, + "grad_norm": 0.6245210505743247, + "learning_rate": 2.0921658646999687e-07, + "loss": 0.025, + "step": 2621 + }, + { + "epoch": 0.910732893365752, + "grad_norm": 0.5922289848107282, + "learning_rate": 2.0760919190473182e-07, + "loss": 0.0109, + "step": 2622 + }, + { + "epoch": 0.9110802361931226, + "grad_norm": 0.39775195287639503, + "learning_rate": 2.06007864997525e-07, + "loss": 0.015, + "step": 2623 + }, + { + "epoch": 0.9114275790204932, + "grad_norm": 0.7009054983235024, + "learning_rate": 2.0441260777582018e-07, + "loss": 0.0231, + "step": 2624 + }, + { + "epoch": 0.9117749218478638, + "grad_norm": 0.6973007401473146, + "learning_rate": 2.0282342225937503e-07, + "loss": 0.021, + "step": 2625 + }, + { + "epoch": 0.9121222646752345, + "grad_norm": 0.5883004994975823, + "learning_rate": 2.0124031046026283e-07, + "loss": 0.0159, + "step": 2626 + }, + { + "epoch": 0.9124696075026051, + "grad_norm": 0.454183134142274, + "learning_rate": 1.9966327438286582e-07, + "loss": 0.0169, + "step": 2627 + }, + { + "epoch": 0.9128169503299757, + "grad_norm": 0.384092493102016, + "learning_rate": 1.9809231602387236e-07, + "loss": 0.0132, + "step": 2628 + }, + { + "epoch": 0.9131642931573463, + "grad_norm": 0.4491174239681359, + "learning_rate": 1.9652743737227643e-07, + "loss": 0.0199, + "step": 2629 + }, + { + "epoch": 0.913511635984717, + "grad_norm": 0.59277378736531, + "learning_rate": 1.9496864040937536e-07, + "loss": 0.0196, + "step": 2630 + }, + { + "epoch": 0.9138589788120876, + "grad_norm": 0.7391457567820585, + "learning_rate": 1.9341592710876656e-07, + "loss": 0.0164, + "step": 2631 + }, + { + "epoch": 0.9142063216394581, + "grad_norm": 0.667688347659246, + "learning_rate": 1.9186929943634358e-07, + "loss": 0.026, + "step": 2632 + }, + { + "epoch": 0.9145536644668287, + "grad_norm": 0.4364968802938354, + "learning_rate": 1.9032875935029504e-07, + "loss": 0.0128, + "step": 2633 + }, + { + "epoch": 0.9149010072941993, + "grad_norm": 0.6408274013619847, + "learning_rate": 1.887943088011035e-07, + "loss": 0.0157, + "step": 2634 + }, + { + "epoch": 0.91524835012157, + "grad_norm": 0.5188280320693144, + "learning_rate": 1.87265949731541e-07, + "loss": 0.0108, + "step": 2635 + }, + { + "epoch": 0.9155956929489406, + "grad_norm": 0.8443286605185657, + "learning_rate": 1.8574368407666576e-07, + "loss": 0.0234, + "step": 2636 + }, + { + "epoch": 0.9159430357763112, + "grad_norm": 0.4426808473145947, + "learning_rate": 1.8422751376382274e-07, + "loss": 0.0067, + "step": 2637 + }, + { + "epoch": 0.9162903786036818, + "grad_norm": 0.47113894581099847, + "learning_rate": 1.8271744071263808e-07, + "loss": 0.0206, + "step": 2638 + }, + { + "epoch": 0.9166377214310525, + "grad_norm": 0.8457616638367563, + "learning_rate": 1.8121346683502183e-07, + "loss": 0.021, + "step": 2639 + }, + { + "epoch": 0.9169850642584231, + "grad_norm": 0.6588573901077968, + "learning_rate": 1.7971559403515526e-07, + "loss": 0.0191, + "step": 2640 + }, + { + "epoch": 0.9173324070857937, + "grad_norm": 0.45622450404307274, + "learning_rate": 1.782238242095008e-07, + "loss": 0.014, + "step": 2641 + }, + { + "epoch": 0.9176797499131643, + "grad_norm": 0.4489208891532178, + "learning_rate": 1.767381592467926e-07, + "loss": 0.0163, + "step": 2642 + }, + { + "epoch": 0.918027092740535, + "grad_norm": 0.52794845468743, + "learning_rate": 1.7525860102803438e-07, + "loss": 0.0214, + "step": 2643 + }, + { + "epoch": 0.9183744355679055, + "grad_norm": 0.33511808267174975, + "learning_rate": 1.7378515142649767e-07, + "loss": 0.0113, + "step": 2644 + }, + { + "epoch": 0.9187217783952761, + "grad_norm": 0.8011767384068549, + "learning_rate": 1.7231781230772127e-07, + "loss": 0.0294, + "step": 2645 + }, + { + "epoch": 0.9190691212226467, + "grad_norm": 0.3396441784353466, + "learning_rate": 1.7085658552950746e-07, + "loss": 0.0134, + "step": 2646 + }, + { + "epoch": 0.9194164640500173, + "grad_norm": 0.30267544503409916, + "learning_rate": 1.69401472941918e-07, + "loss": 0.0108, + "step": 2647 + }, + { + "epoch": 0.919763806877388, + "grad_norm": 0.7104785989934016, + "learning_rate": 1.6795247638727585e-07, + "loss": 0.0232, + "step": 2648 + }, + { + "epoch": 0.9201111497047586, + "grad_norm": 0.27352163599926305, + "learning_rate": 1.6650959770015796e-07, + "loss": 0.0135, + "step": 2649 + }, + { + "epoch": 0.9204584925321292, + "grad_norm": 0.494764482965323, + "learning_rate": 1.6507283870739798e-07, + "loss": 0.0164, + "step": 2650 + }, + { + "epoch": 0.9208058353594998, + "grad_norm": 0.6063381987250709, + "learning_rate": 1.6364220122807862e-07, + "loss": 0.014, + "step": 2651 + }, + { + "epoch": 0.9211531781868705, + "grad_norm": 0.3484396467979152, + "learning_rate": 1.6221768707353536e-07, + "loss": 0.0146, + "step": 2652 + }, + { + "epoch": 0.9215005210142411, + "grad_norm": 0.5811657495793892, + "learning_rate": 1.6079929804734716e-07, + "loss": 0.0186, + "step": 2653 + }, + { + "epoch": 0.9218478638416117, + "grad_norm": 0.38993789351226965, + "learning_rate": 1.59387035945342e-07, + "loss": 0.0104, + "step": 2654 + }, + { + "epoch": 0.9221952066689822, + "grad_norm": 0.2970396986761084, + "learning_rate": 1.5798090255558617e-07, + "loss": 0.0094, + "step": 2655 + }, + { + "epoch": 0.922542549496353, + "grad_norm": 0.7453340514328095, + "learning_rate": 1.565808996583912e-07, + "loss": 0.0236, + "step": 2656 + }, + { + "epoch": 0.9228898923237235, + "grad_norm": 0.6111888530371641, + "learning_rate": 1.55187029026303e-07, + "loss": 0.0112, + "step": 2657 + }, + { + "epoch": 0.9232372351510941, + "grad_norm": 0.8674798788983026, + "learning_rate": 1.5379929242410385e-07, + "loss": 0.0263, + "step": 2658 + }, + { + "epoch": 0.9235845779784647, + "grad_norm": 0.527019561763946, + "learning_rate": 1.5241769160881104e-07, + "loss": 0.0142, + "step": 2659 + }, + { + "epoch": 0.9239319208058353, + "grad_norm": 0.7830231346293527, + "learning_rate": 1.5104222832967419e-07, + "loss": 0.0209, + "step": 2660 + }, + { + "epoch": 0.924279263633206, + "grad_norm": 0.5153735628692967, + "learning_rate": 1.4967290432817028e-07, + "loss": 0.0184, + "step": 2661 + }, + { + "epoch": 0.9246266064605766, + "grad_norm": 0.2844914040623888, + "learning_rate": 1.4830972133800247e-07, + "loss": 0.0093, + "step": 2662 + }, + { + "epoch": 0.9249739492879472, + "grad_norm": 0.43842457587592126, + "learning_rate": 1.4695268108510075e-07, + "loss": 0.0146, + "step": 2663 + }, + { + "epoch": 0.9253212921153178, + "grad_norm": 0.7398002758095784, + "learning_rate": 1.4560178528761848e-07, + "loss": 0.0177, + "step": 2664 + }, + { + "epoch": 0.9256686349426885, + "grad_norm": 0.35972295634537466, + "learning_rate": 1.4425703565592753e-07, + "loss": 0.0108, + "step": 2665 + }, + { + "epoch": 0.9260159777700591, + "grad_norm": 0.7120797307812906, + "learning_rate": 1.429184338926176e-07, + "loss": 0.0157, + "step": 2666 + }, + { + "epoch": 0.9263633205974297, + "grad_norm": 0.45611345610981685, + "learning_rate": 1.415859816924975e-07, + "loss": 0.0236, + "step": 2667 + }, + { + "epoch": 0.9267106634248002, + "grad_norm": 0.6910514224473907, + "learning_rate": 1.402596807425871e-07, + "loss": 0.0259, + "step": 2668 + }, + { + "epoch": 0.9270580062521709, + "grad_norm": 0.3565369326923046, + "learning_rate": 1.3893953272212046e-07, + "loss": 0.0147, + "step": 2669 + }, + { + "epoch": 0.9274053490795415, + "grad_norm": 0.4838529238212452, + "learning_rate": 1.3762553930253885e-07, + "loss": 0.0196, + "step": 2670 + }, + { + "epoch": 0.9277526919069121, + "grad_norm": 0.32075955939895817, + "learning_rate": 1.3631770214749374e-07, + "loss": 0.016, + "step": 2671 + }, + { + "epoch": 0.9281000347342827, + "grad_norm": 0.5031442718892309, + "learning_rate": 1.3501602291284166e-07, + "loss": 0.0201, + "step": 2672 + }, + { + "epoch": 0.9284473775616533, + "grad_norm": 0.3459439042987737, + "learning_rate": 1.3372050324663988e-07, + "loss": 0.0214, + "step": 2673 + }, + { + "epoch": 0.928794720389024, + "grad_norm": 0.24515986422727065, + "learning_rate": 1.3243114478915076e-07, + "loss": 0.0098, + "step": 2674 + }, + { + "epoch": 0.9291420632163946, + "grad_norm": 0.5721382453019332, + "learning_rate": 1.3114794917283403e-07, + "loss": 0.0195, + "step": 2675 + }, + { + "epoch": 0.9294894060437652, + "grad_norm": 1.008562948955966, + "learning_rate": 1.2987091802234675e-07, + "loss": 0.0146, + "step": 2676 + }, + { + "epoch": 0.9298367488711358, + "grad_norm": 0.3165525870137379, + "learning_rate": 1.286000529545406e-07, + "loss": 0.0125, + "step": 2677 + }, + { + "epoch": 0.9301840916985065, + "grad_norm": 0.3519330637522624, + "learning_rate": 1.2733535557846176e-07, + "loss": 0.0132, + "step": 2678 + }, + { + "epoch": 0.930531434525877, + "grad_norm": 0.609989019077514, + "learning_rate": 1.2607682749534723e-07, + "loss": 0.0227, + "step": 2679 + }, + { + "epoch": 0.9308787773532476, + "grad_norm": 0.9719297935968579, + "learning_rate": 1.2482447029862177e-07, + "loss": 0.0175, + "step": 2680 + }, + { + "epoch": 0.9312261201806182, + "grad_norm": 0.5004324439145527, + "learning_rate": 1.2357828557389762e-07, + "loss": 0.0198, + "step": 2681 + }, + { + "epoch": 0.9315734630079889, + "grad_norm": 0.6042127292832795, + "learning_rate": 1.223382748989732e-07, + "loss": 0.0156, + "step": 2682 + }, + { + "epoch": 0.9319208058353595, + "grad_norm": 0.6442994121709944, + "learning_rate": 1.2110443984382936e-07, + "loss": 0.0178, + "step": 2683 + }, + { + "epoch": 0.9322681486627301, + "grad_norm": 0.4373568494058291, + "learning_rate": 1.1987678197062758e-07, + "loss": 0.0116, + "step": 2684 + }, + { + "epoch": 0.9326154914901007, + "grad_norm": 0.4862790006331638, + "learning_rate": 1.186553028337073e-07, + "loss": 0.0133, + "step": 2685 + }, + { + "epoch": 0.9329628343174713, + "grad_norm": 0.4434022690215712, + "learning_rate": 1.1744000397958755e-07, + "loss": 0.0176, + "step": 2686 + }, + { + "epoch": 0.933310177144842, + "grad_norm": 0.8580640939743742, + "learning_rate": 1.1623088694696194e-07, + "loss": 0.0198, + "step": 2687 + }, + { + "epoch": 0.9336575199722126, + "grad_norm": 0.4912841363664674, + "learning_rate": 1.150279532666948e-07, + "loss": 0.0137, + "step": 2688 + }, + { + "epoch": 0.9340048627995832, + "grad_norm": 0.6768885959717444, + "learning_rate": 1.1383120446182505e-07, + "loss": 0.023, + "step": 2689 + }, + { + "epoch": 0.9343522056269538, + "grad_norm": 0.4122917358139824, + "learning_rate": 1.1264064204755898e-07, + "loss": 0.0146, + "step": 2690 + }, + { + "epoch": 0.9346995484543245, + "grad_norm": 0.2906332176742442, + "learning_rate": 1.114562675312697e-07, + "loss": 0.0091, + "step": 2691 + }, + { + "epoch": 0.935046891281695, + "grad_norm": 0.788414928922905, + "learning_rate": 1.1027808241249715e-07, + "loss": 0.0234, + "step": 2692 + }, + { + "epoch": 0.9353942341090656, + "grad_norm": 0.7000778534595465, + "learning_rate": 1.0910608818294588e-07, + "loss": 0.0171, + "step": 2693 + }, + { + "epoch": 0.9357415769364362, + "grad_norm": 0.3365753374263883, + "learning_rate": 1.079402863264789e-07, + "loss": 0.011, + "step": 2694 + }, + { + "epoch": 0.9360889197638069, + "grad_norm": 0.500493153669285, + "learning_rate": 1.0678067831912164e-07, + "loss": 0.0246, + "step": 2695 + }, + { + "epoch": 0.9364362625911775, + "grad_norm": 0.49634500860470243, + "learning_rate": 1.056272656290569e-07, + "loss": 0.0151, + "step": 2696 + }, + { + "epoch": 0.9367836054185481, + "grad_norm": 0.2959869869958872, + "learning_rate": 1.0448004971662317e-07, + "loss": 0.0144, + "step": 2697 + }, + { + "epoch": 0.9371309482459187, + "grad_norm": 0.5721150903591803, + "learning_rate": 1.0333903203431362e-07, + "loss": 0.0158, + "step": 2698 + }, + { + "epoch": 0.9374782910732893, + "grad_norm": 0.5174080846934402, + "learning_rate": 1.0220421402677261e-07, + "loss": 0.0172, + "step": 2699 + }, + { + "epoch": 0.93782563390066, + "grad_norm": 0.6111422252957059, + "learning_rate": 1.0107559713079751e-07, + "loss": 0.0159, + "step": 2700 + }, + { + "epoch": 0.9381729767280306, + "grad_norm": 0.3308102163778448, + "learning_rate": 9.99531827753325e-08, + "loss": 0.0116, + "step": 2701 + }, + { + "epoch": 0.9385203195554012, + "grad_norm": 0.4117220886730794, + "learning_rate": 9.883697238146917e-08, + "loss": 0.0159, + "step": 2702 + }, + { + "epoch": 0.9388676623827718, + "grad_norm": 0.6253135827117662, + "learning_rate": 9.772696736244369e-08, + "loss": 0.0161, + "step": 2703 + }, + { + "epoch": 0.9392150052101425, + "grad_norm": 0.7918100091195154, + "learning_rate": 9.662316912363634e-08, + "loss": 0.0202, + "step": 2704 + }, + { + "epoch": 0.939562348037513, + "grad_norm": 0.8027328036502868, + "learning_rate": 9.552557906257032e-08, + "loss": 0.018, + "step": 2705 + }, + { + "epoch": 0.9399096908648836, + "grad_norm": 0.399436967336629, + "learning_rate": 9.443419856890568e-08, + "loss": 0.0154, + "step": 2706 + }, + { + "epoch": 0.9402570336922542, + "grad_norm": 0.5291507460704015, + "learning_rate": 9.33490290244421e-08, + "loss": 0.0147, + "step": 2707 + }, + { + "epoch": 0.9406043765196249, + "grad_norm": 0.9303566715405308, + "learning_rate": 9.227007180311609e-08, + "loss": 0.013, + "step": 2708 + }, + { + "epoch": 0.9409517193469955, + "grad_norm": 0.48135730059288107, + "learning_rate": 9.119732827099826e-08, + "loss": 0.0178, + "step": 2709 + }, + { + "epoch": 0.9412990621743661, + "grad_norm": 0.289263017584217, + "learning_rate": 9.013079978629047e-08, + "loss": 0.0139, + "step": 2710 + }, + { + "epoch": 0.9416464050017367, + "grad_norm": 1.2281515688429252, + "learning_rate": 8.907048769932813e-08, + "loss": 0.0232, + "step": 2711 + }, + { + "epoch": 0.9419937478291073, + "grad_norm": 0.45867978328141407, + "learning_rate": 8.801639335257573e-08, + "loss": 0.018, + "step": 2712 + }, + { + "epoch": 0.942341090656478, + "grad_norm": 0.8116884665077297, + "learning_rate": 8.696851808062401e-08, + "loss": 0.0166, + "step": 2713 + }, + { + "epoch": 0.9426884334838486, + "grad_norm": 0.629444316419227, + "learning_rate": 8.592686321019005e-08, + "loss": 0.0187, + "step": 2714 + }, + { + "epoch": 0.9430357763112192, + "grad_norm": 0.467448231376131, + "learning_rate": 8.489143006011613e-08, + "loss": 0.0191, + "step": 2715 + }, + { + "epoch": 0.9433831191385897, + "grad_norm": 1.540746884539893, + "learning_rate": 8.38622199413669e-08, + "loss": 0.0276, + "step": 2716 + }, + { + "epoch": 0.9437304619659604, + "grad_norm": 0.9960497105684367, + "learning_rate": 8.283923415702832e-08, + "loss": 0.0185, + "step": 2717 + }, + { + "epoch": 0.944077804793331, + "grad_norm": 0.6697810432465229, + "learning_rate": 8.182247400230381e-08, + "loss": 0.0239, + "step": 2718 + }, + { + "epoch": 0.9444251476207016, + "grad_norm": 0.5928459085259657, + "learning_rate": 8.081194076451749e-08, + "loss": 0.0155, + "step": 2719 + }, + { + "epoch": 0.9447724904480722, + "grad_norm": 0.9024042874062206, + "learning_rate": 7.980763572310702e-08, + "loss": 0.0175, + "step": 2720 + }, + { + "epoch": 0.9451198332754429, + "grad_norm": 0.9579838145780024, + "learning_rate": 7.880956014962694e-08, + "loss": 0.0271, + "step": 2721 + }, + { + "epoch": 0.9454671761028135, + "grad_norm": 0.49204147819791966, + "learning_rate": 7.781771530774085e-08, + "loss": 0.0233, + "step": 2722 + }, + { + "epoch": 0.9458145189301841, + "grad_norm": 0.6787970584155775, + "learning_rate": 7.683210245322869e-08, + "loss": 0.025, + "step": 2723 + }, + { + "epoch": 0.9461618617575547, + "grad_norm": 0.32439298949722206, + "learning_rate": 7.585272283397504e-08, + "loss": 0.0123, + "step": 2724 + }, + { + "epoch": 0.9465092045849253, + "grad_norm": 0.8141650539318078, + "learning_rate": 7.487957768997633e-08, + "loss": 0.0142, + "step": 2725 + }, + { + "epoch": 0.946856547412296, + "grad_norm": 0.8468827577767781, + "learning_rate": 7.391266825333365e-08, + "loss": 0.0111, + "step": 2726 + }, + { + "epoch": 0.9472038902396666, + "grad_norm": 0.5687781476564356, + "learning_rate": 7.295199574825384e-08, + "loss": 0.0198, + "step": 2727 + }, + { + "epoch": 0.9475512330670371, + "grad_norm": 0.4074296259542572, + "learning_rate": 7.199756139104563e-08, + "loss": 0.0144, + "step": 2728 + }, + { + "epoch": 0.9478985758944077, + "grad_norm": 0.7996569231526766, + "learning_rate": 7.104936639012239e-08, + "loss": 0.0304, + "step": 2729 + }, + { + "epoch": 0.9482459187217784, + "grad_norm": 0.45337039410172214, + "learning_rate": 7.01074119459949e-08, + "loss": 0.0146, + "step": 2730 + }, + { + "epoch": 0.948593261549149, + "grad_norm": 0.7844203973946874, + "learning_rate": 6.917169925127476e-08, + "loss": 0.0272, + "step": 2731 + }, + { + "epoch": 0.9489406043765196, + "grad_norm": 0.4489625006480948, + "learning_rate": 6.824222949066983e-08, + "loss": 0.0195, + "step": 2732 + }, + { + "epoch": 0.9492879472038902, + "grad_norm": 1.1267855989046178, + "learning_rate": 6.731900384098433e-08, + "loss": 0.0137, + "step": 2733 + }, + { + "epoch": 0.9496352900312609, + "grad_norm": 1.0751845620346967, + "learning_rate": 6.640202347111657e-08, + "loss": 0.019, + "step": 2734 + }, + { + "epoch": 0.9499826328586315, + "grad_norm": 0.5627865416818346, + "learning_rate": 6.54912895420573e-08, + "loss": 0.0185, + "step": 2735 + }, + { + "epoch": 0.9503299756860021, + "grad_norm": 0.49730875779627454, + "learning_rate": 6.458680320688914e-08, + "loss": 0.0198, + "step": 2736 + }, + { + "epoch": 0.9506773185133727, + "grad_norm": 0.5114294099131146, + "learning_rate": 6.368856561078496e-08, + "loss": 0.0115, + "step": 2737 + }, + { + "epoch": 0.9510246613407433, + "grad_norm": 0.8871643487076939, + "learning_rate": 6.279657789100612e-08, + "loss": 0.016, + "step": 2738 + }, + { + "epoch": 0.951372004168114, + "grad_norm": 0.6824181407432921, + "learning_rate": 6.191084117689871e-08, + "loss": 0.0225, + "step": 2739 + }, + { + "epoch": 0.9517193469954845, + "grad_norm": 0.3788527139900645, + "learning_rate": 6.103135658989789e-08, + "loss": 0.0135, + "step": 2740 + }, + { + "epoch": 0.9520666898228551, + "grad_norm": 2.27805723002845, + "learning_rate": 6.015812524352072e-08, + "loss": 0.0262, + "step": 2741 + }, + { + "epoch": 0.9524140326502257, + "grad_norm": 0.5250351000200569, + "learning_rate": 5.9291148243367235e-08, + "loss": 0.017, + "step": 2742 + }, + { + "epoch": 0.9527613754775964, + "grad_norm": 0.6854351694783098, + "learning_rate": 5.8430426687119954e-08, + "loss": 0.0209, + "step": 2743 + }, + { + "epoch": 0.953108718304967, + "grad_norm": 0.8186234378831619, + "learning_rate": 5.7575961664539384e-08, + "loss": 0.0262, + "step": 2744 + }, + { + "epoch": 0.9534560611323376, + "grad_norm": 0.5783055322077533, + "learning_rate": 5.672775425746735e-08, + "loss": 0.0164, + "step": 2745 + }, + { + "epoch": 0.9538034039597082, + "grad_norm": 1.1613038468708938, + "learning_rate": 5.588580553982092e-08, + "loss": 0.0306, + "step": 2746 + }, + { + "epoch": 0.9541507467870789, + "grad_norm": 0.8318319702699184, + "learning_rate": 5.505011657759296e-08, + "loss": 0.0129, + "step": 2747 + }, + { + "epoch": 0.9544980896144495, + "grad_norm": 0.7343407926498003, + "learning_rate": 5.4220688428850974e-08, + "loss": 0.0125, + "step": 2748 + }, + { + "epoch": 0.9548454324418201, + "grad_norm": 0.5822397422630301, + "learning_rate": 5.3397522143737725e-08, + "loss": 0.0187, + "step": 2749 + }, + { + "epoch": 0.9551927752691907, + "grad_norm": 0.36611892790873857, + "learning_rate": 5.258061876446507e-08, + "loss": 0.0129, + "step": 2750 + }, + { + "epoch": 0.9555401180965613, + "grad_norm": 0.506143736687749, + "learning_rate": 5.176997932531569e-08, + "loss": 0.0111, + "step": 2751 + }, + { + "epoch": 0.955887460923932, + "grad_norm": 0.37104961585297513, + "learning_rate": 5.096560485264301e-08, + "loss": 0.0178, + "step": 2752 + }, + { + "epoch": 0.9562348037513025, + "grad_norm": 0.39087699228743555, + "learning_rate": 5.016749636486851e-08, + "loss": 0.0219, + "step": 2753 + }, + { + "epoch": 0.9565821465786731, + "grad_norm": 0.3940695373758418, + "learning_rate": 4.937565487247775e-08, + "loss": 0.0217, + "step": 2754 + }, + { + "epoch": 0.9569294894060437, + "grad_norm": 0.8139685846877176, + "learning_rate": 4.859008137802379e-08, + "loss": 0.0188, + "step": 2755 + }, + { + "epoch": 0.9572768322334144, + "grad_norm": 0.3884984470065128, + "learning_rate": 4.781077687612379e-08, + "loss": 0.0118, + "step": 2756 + }, + { + "epoch": 0.957624175060785, + "grad_norm": 0.9041840556875058, + "learning_rate": 4.703774235345626e-08, + "loss": 0.0204, + "step": 2757 + }, + { + "epoch": 0.9579715178881556, + "grad_norm": 0.46537968867455554, + "learning_rate": 4.627097878876274e-08, + "loss": 0.0156, + "step": 2758 + }, + { + "epoch": 0.9583188607155262, + "grad_norm": 0.41094808662250093, + "learning_rate": 4.551048715284445e-08, + "loss": 0.0174, + "step": 2759 + }, + { + "epoch": 0.9586662035428969, + "grad_norm": 0.3654417168577741, + "learning_rate": 4.4756268408561174e-08, + "loss": 0.0193, + "step": 2760 + }, + { + "epoch": 0.9590135463702675, + "grad_norm": 0.6891593746161022, + "learning_rate": 4.400832351083184e-08, + "loss": 0.0185, + "step": 2761 + }, + { + "epoch": 0.9593608891976381, + "grad_norm": 0.9417802736827884, + "learning_rate": 4.326665340663117e-08, + "loss": 0.0176, + "step": 2762 + }, + { + "epoch": 0.9597082320250087, + "grad_norm": 0.6092562454977165, + "learning_rate": 4.253125903498967e-08, + "loss": 0.0113, + "step": 2763 + }, + { + "epoch": 0.9600555748523792, + "grad_norm": 0.4780402820280536, + "learning_rate": 4.180214132699201e-08, + "loss": 0.0132, + "step": 2764 + }, + { + "epoch": 0.96040291767975, + "grad_norm": 0.7203305496862711, + "learning_rate": 4.10793012057753e-08, + "loss": 0.0201, + "step": 2765 + }, + { + "epoch": 0.9607502605071205, + "grad_norm": 0.78632337470775, + "learning_rate": 4.03627395865297e-08, + "loss": 0.0161, + "step": 2766 + }, + { + "epoch": 0.9610976033344911, + "grad_norm": 0.7800907681147895, + "learning_rate": 3.9652457376496146e-08, + "loss": 0.0226, + "step": 2767 + }, + { + "epoch": 0.9614449461618617, + "grad_norm": 0.7579603624294482, + "learning_rate": 3.894845547496418e-08, + "loss": 0.018, + "step": 2768 + }, + { + "epoch": 0.9617922889892324, + "grad_norm": 0.4992736774720632, + "learning_rate": 3.8250734773272455e-08, + "loss": 0.0202, + "step": 2769 + }, + { + "epoch": 0.962139631816603, + "grad_norm": 0.5447806191032949, + "learning_rate": 3.755929615480658e-08, + "loss": 0.0158, + "step": 2770 + }, + { + "epoch": 0.9624869746439736, + "grad_norm": 0.4587220635778511, + "learning_rate": 3.687414049500015e-08, + "loss": 0.0174, + "step": 2771 + }, + { + "epoch": 0.9628343174713442, + "grad_norm": 0.6017122263910978, + "learning_rate": 3.619526866132872e-08, + "loss": 0.0212, + "step": 2772 + }, + { + "epoch": 0.9631816602987149, + "grad_norm": 0.45153146666560434, + "learning_rate": 3.552268151331417e-08, + "loss": 0.0129, + "step": 2773 + }, + { + "epoch": 0.9635290031260855, + "grad_norm": 0.8050815601154685, + "learning_rate": 3.485637990252089e-08, + "loss": 0.0203, + "step": 2774 + }, + { + "epoch": 0.9638763459534561, + "grad_norm": 0.5550194035824456, + "learning_rate": 3.4196364672555715e-08, + "loss": 0.0149, + "step": 2775 + }, + { + "epoch": 0.9642236887808266, + "grad_norm": 0.6061372955316445, + "learning_rate": 3.3542636659064095e-08, + "loss": 0.017, + "step": 2776 + }, + { + "epoch": 0.9645710316081972, + "grad_norm": 0.7681878631554084, + "learning_rate": 3.2895196689733953e-08, + "loss": 0.0143, + "step": 2777 + }, + { + "epoch": 0.9649183744355679, + "grad_norm": 0.7249193630109478, + "learning_rate": 3.225404558429068e-08, + "loss": 0.0175, + "step": 2778 + }, + { + "epoch": 0.9652657172629385, + "grad_norm": 0.31018295263385637, + "learning_rate": 3.1619184154496605e-08, + "loss": 0.0096, + "step": 2779 + }, + { + "epoch": 0.9656130600903091, + "grad_norm": 0.5323520395260942, + "learning_rate": 3.099061320415153e-08, + "loss": 0.02, + "step": 2780 + }, + { + "epoch": 0.9659604029176797, + "grad_norm": 0.5460021526097322, + "learning_rate": 3.036833352909052e-08, + "loss": 0.0138, + "step": 2781 + }, + { + "epoch": 0.9663077457450504, + "grad_norm": 0.4231306425496086, + "learning_rate": 2.9752345917184456e-08, + "loss": 0.0128, + "step": 2782 + }, + { + "epoch": 0.966655088572421, + "grad_norm": 0.5104133537482568, + "learning_rate": 2.914265114833614e-08, + "loss": 0.0181, + "step": 2783 + }, + { + "epoch": 0.9670024313997916, + "grad_norm": 0.32680316277058136, + "learning_rate": 2.8539249994480878e-08, + "loss": 0.0142, + "step": 2784 + }, + { + "epoch": 0.9673497742271622, + "grad_norm": 0.6787838217925692, + "learning_rate": 2.7942143219587547e-08, + "loss": 0.0227, + "step": 2785 + }, + { + "epoch": 0.9676971170545329, + "grad_norm": 0.5393149296688037, + "learning_rate": 2.7351331579654194e-08, + "loss": 0.0199, + "step": 2786 + }, + { + "epoch": 0.9680444598819035, + "grad_norm": 0.3709030328592483, + "learning_rate": 2.6766815822709124e-08, + "loss": 0.0189, + "step": 2787 + }, + { + "epoch": 0.968391802709274, + "grad_norm": 2.1813300319236433, + "learning_rate": 2.618859668880869e-08, + "loss": 0.0233, + "step": 2788 + }, + { + "epoch": 0.9687391455366446, + "grad_norm": 0.6115166749222286, + "learning_rate": 2.561667491003783e-08, + "loss": 0.0151, + "step": 2789 + }, + { + "epoch": 0.9690864883640152, + "grad_norm": 0.7565981079039917, + "learning_rate": 2.5051051210508437e-08, + "loss": 0.0133, + "step": 2790 + }, + { + "epoch": 0.9694338311913859, + "grad_norm": 0.6494118382951497, + "learning_rate": 2.4491726306357656e-08, + "loss": 0.0129, + "step": 2791 + }, + { + "epoch": 0.9697811740187565, + "grad_norm": 0.8914095771286661, + "learning_rate": 2.3938700905747902e-08, + "loss": 0.0165, + "step": 2792 + }, + { + "epoch": 0.9701285168461271, + "grad_norm": 0.3651793620369032, + "learning_rate": 2.3391975708866866e-08, + "loss": 0.014, + "step": 2793 + }, + { + "epoch": 0.9704758596734977, + "grad_norm": 0.6009630665505777, + "learning_rate": 2.2851551407924166e-08, + "loss": 0.0126, + "step": 2794 + }, + { + "epoch": 0.9708232025008684, + "grad_norm": 0.6517704643511893, + "learning_rate": 2.231742868715303e-08, + "loss": 0.0215, + "step": 2795 + }, + { + "epoch": 0.971170545328239, + "grad_norm": 0.5374743831920767, + "learning_rate": 2.17896082228064e-08, + "loss": 0.0109, + "step": 2796 + }, + { + "epoch": 0.9715178881556096, + "grad_norm": 0.43751454745868124, + "learning_rate": 2.1268090683159714e-08, + "loss": 0.0151, + "step": 2797 + }, + { + "epoch": 0.9718652309829802, + "grad_norm": 0.9225025519084343, + "learning_rate": 2.075287672850812e-08, + "loss": 0.0185, + "step": 2798 + }, + { + "epoch": 0.9722125738103509, + "grad_norm": 0.6499562483522786, + "learning_rate": 2.0243967011164267e-08, + "loss": 0.0143, + "step": 2799 + }, + { + "epoch": 0.9725599166377215, + "grad_norm": 0.4078620285370667, + "learning_rate": 1.9741362175461076e-08, + "loss": 0.0095, + "step": 2800 + }, + { + "epoch": 0.972907259465092, + "grad_norm": 0.649018054650212, + "learning_rate": 1.9245062857746744e-08, + "loss": 0.015, + "step": 2801 + }, + { + "epoch": 0.9732546022924626, + "grad_norm": 0.8446552989012637, + "learning_rate": 1.8755069686388074e-08, + "loss": 0.0265, + "step": 2802 + }, + { + "epoch": 0.9736019451198332, + "grad_norm": 0.8836022726540815, + "learning_rate": 1.827138328176603e-08, + "loss": 0.0197, + "step": 2803 + }, + { + "epoch": 0.9739492879472039, + "grad_norm": 0.39841982862426034, + "learning_rate": 1.7794004256277976e-08, + "loss": 0.0112, + "step": 2804 + }, + { + "epoch": 0.9742966307745745, + "grad_norm": 0.6590609628914503, + "learning_rate": 1.732293321433376e-08, + "loss": 0.022, + "step": 2805 + }, + { + "epoch": 0.9746439736019451, + "grad_norm": 0.5300192127082276, + "learning_rate": 1.6858170752357407e-08, + "loss": 0.0164, + "step": 2806 + }, + { + "epoch": 0.9749913164293157, + "grad_norm": 1.0204073151035766, + "learning_rate": 1.63997174587871e-08, + "loss": 0.0123, + "step": 2807 + }, + { + "epoch": 0.9753386592566864, + "grad_norm": 0.571323310061111, + "learning_rate": 1.594757391407076e-08, + "loss": 0.0157, + "step": 2808 + }, + { + "epoch": 0.975686002084057, + "grad_norm": 0.3778707394308839, + "learning_rate": 1.5501740690668788e-08, + "loss": 0.0235, + "step": 2809 + }, + { + "epoch": 0.9760333449114276, + "grad_norm": 0.44292675404107196, + "learning_rate": 1.506221835305133e-08, + "loss": 0.0173, + "step": 2810 + }, + { + "epoch": 0.9763806877387982, + "grad_norm": 0.2763995858299458, + "learning_rate": 1.4629007457699906e-08, + "loss": 0.0093, + "step": 2811 + }, + { + "epoch": 0.9767280305661689, + "grad_norm": 0.8458253024017103, + "learning_rate": 1.4202108553102445e-08, + "loss": 0.0216, + "step": 2812 + }, + { + "epoch": 0.9770753733935394, + "grad_norm": 0.47122826092790115, + "learning_rate": 1.3781522179757146e-08, + "loss": 0.0211, + "step": 2813 + }, + { + "epoch": 0.97742271622091, + "grad_norm": 0.6437597372265773, + "learning_rate": 1.3367248870170269e-08, + "loss": 0.0253, + "step": 2814 + }, + { + "epoch": 0.9777700590482806, + "grad_norm": 0.7953386444191836, + "learning_rate": 1.295928914885336e-08, + "loss": 0.0216, + "step": 2815 + }, + { + "epoch": 0.9781174018756512, + "grad_norm": 0.7510223662917004, + "learning_rate": 1.255764353232547e-08, + "loss": 0.0162, + "step": 2816 + }, + { + "epoch": 0.9784647447030219, + "grad_norm": 0.38965761008763483, + "learning_rate": 1.2162312529111487e-08, + "loss": 0.0089, + "step": 2817 + }, + { + "epoch": 0.9788120875303925, + "grad_norm": 0.8708571004518411, + "learning_rate": 1.1773296639741028e-08, + "loss": 0.0186, + "step": 2818 + }, + { + "epoch": 0.9791594303577631, + "grad_norm": 0.4280076925155896, + "learning_rate": 1.139059635674733e-08, + "loss": 0.0125, + "step": 2819 + }, + { + "epoch": 0.9795067731851337, + "grad_norm": 0.5982212585888964, + "learning_rate": 1.1014212164668914e-08, + "loss": 0.0128, + "step": 2820 + }, + { + "epoch": 0.9798541160125044, + "grad_norm": 0.4802236337534653, + "learning_rate": 1.0644144540046253e-08, + "loss": 0.0154, + "step": 2821 + }, + { + "epoch": 0.980201458839875, + "grad_norm": 0.6709260979923914, + "learning_rate": 1.028039395142344e-08, + "loss": 0.0162, + "step": 2822 + }, + { + "epoch": 0.9805488016672456, + "grad_norm": 0.3994630255273555, + "learning_rate": 9.92296085934541e-09, + "loss": 0.0136, + "step": 2823 + }, + { + "epoch": 0.9808961444946162, + "grad_norm": 0.8512917771148851, + "learning_rate": 9.571845716360162e-09, + "loss": 0.0137, + "step": 2824 + }, + { + "epoch": 0.9812434873219869, + "grad_norm": 0.7023478964706273, + "learning_rate": 9.227048967013762e-09, + "loss": 0.0199, + "step": 2825 + }, + { + "epoch": 0.9815908301493574, + "grad_norm": 0.5923172701448587, + "learning_rate": 8.888571047855899e-09, + "loss": 0.0149, + "step": 2826 + }, + { + "epoch": 0.981938172976728, + "grad_norm": 0.18336419674101587, + "learning_rate": 8.55641238743321e-09, + "loss": 0.0045, + "step": 2827 + }, + { + "epoch": 0.9822855158040986, + "grad_norm": 0.7801052776351165, + "learning_rate": 8.230573406293186e-09, + "loss": 0.0185, + "step": 2828 + }, + { + "epoch": 0.9826328586314692, + "grad_norm": 0.4666029152584524, + "learning_rate": 7.911054516981376e-09, + "loss": 0.0092, + "step": 2829 + }, + { + "epoch": 0.9829802014588399, + "grad_norm": 0.5508063511757076, + "learning_rate": 7.597856124040848e-09, + "loss": 0.0278, + "step": 2830 + }, + { + "epoch": 0.9833275442862105, + "grad_norm": 0.47041522453276524, + "learning_rate": 7.290978624013289e-09, + "loss": 0.0131, + "step": 2831 + }, + { + "epoch": 0.9836748871135811, + "grad_norm": 0.5117867904313976, + "learning_rate": 6.990422405437903e-09, + "loss": 0.0169, + "step": 2832 + }, + { + "epoch": 0.9840222299409517, + "grad_norm": 0.4928579747341889, + "learning_rate": 6.696187848848068e-09, + "loss": 0.014, + "step": 2833 + }, + { + "epoch": 0.9843695727683224, + "grad_norm": 0.3862793465920075, + "learning_rate": 6.408275326775792e-09, + "loss": 0.0194, + "step": 2834 + }, + { + "epoch": 0.984716915595693, + "grad_norm": 0.38744597996896796, + "learning_rate": 6.126685203747818e-09, + "loss": 0.0142, + "step": 2835 + }, + { + "epoch": 0.9850642584230636, + "grad_norm": 0.36149263951244837, + "learning_rate": 5.851417836286177e-09, + "loss": 0.0152, + "step": 2836 + }, + { + "epoch": 0.9854116012504341, + "grad_norm": 0.6059984628372787, + "learning_rate": 5.582473572907643e-09, + "loss": 0.0125, + "step": 2837 + }, + { + "epoch": 0.9857589440778048, + "grad_norm": 1.0805004360514219, + "learning_rate": 5.319852754122612e-09, + "loss": 0.0188, + "step": 2838 + }, + { + "epoch": 0.9861062869051754, + "grad_norm": 0.7645678869514635, + "learning_rate": 5.0635557124362185e-09, + "loss": 0.0189, + "step": 2839 + }, + { + "epoch": 0.986453629732546, + "grad_norm": 1.4270577859101476, + "learning_rate": 4.813582772347225e-09, + "loss": 0.027, + "step": 2840 + }, + { + "epoch": 0.9868009725599166, + "grad_norm": 0.6651264047671889, + "learning_rate": 4.569934250346908e-09, + "loss": 0.0171, + "step": 2841 + }, + { + "epoch": 0.9871483153872872, + "grad_norm": 0.562975977952286, + "learning_rate": 4.332610454919062e-09, + "loss": 0.0155, + "step": 2842 + }, + { + "epoch": 0.9874956582146579, + "grad_norm": 0.5118103669731423, + "learning_rate": 4.101611686539442e-09, + "loss": 0.013, + "step": 2843 + }, + { + "epoch": 0.9878430010420285, + "grad_norm": 0.7783130801930213, + "learning_rate": 3.876938237676875e-09, + "loss": 0.0253, + "step": 2844 + }, + { + "epoch": 0.9881903438693991, + "grad_norm": 0.34801718183552194, + "learning_rate": 3.6585903927910395e-09, + "loss": 0.0118, + "step": 2845 + }, + { + "epoch": 0.9885376866967697, + "grad_norm": 0.535293498292763, + "learning_rate": 3.446568428332464e-09, + "loss": 0.014, + "step": 2846 + }, + { + "epoch": 0.9888850295241404, + "grad_norm": 0.504081454212183, + "learning_rate": 3.2408726127425294e-09, + "loss": 0.0184, + "step": 2847 + }, + { + "epoch": 0.989232372351511, + "grad_norm": 0.5720403882389867, + "learning_rate": 3.0415032064534668e-09, + "loss": 0.0243, + "step": 2848 + }, + { + "epoch": 0.9895797151788815, + "grad_norm": 0.5342584944130937, + "learning_rate": 2.848460461887248e-09, + "loss": 0.0212, + "step": 2849 + }, + { + "epoch": 0.9899270580062521, + "grad_norm": 0.37088583044080237, + "learning_rate": 2.6617446234555866e-09, + "loss": 0.0199, + "step": 2850 + }, + { + "epoch": 0.9902744008336228, + "grad_norm": 0.36631835755907144, + "learning_rate": 2.4813559275604914e-09, + "loss": 0.0147, + "step": 2851 + }, + { + "epoch": 0.9906217436609934, + "grad_norm": 0.7717204298531588, + "learning_rate": 2.3072946025920474e-09, + "loss": 0.0133, + "step": 2852 + }, + { + "epoch": 0.990969086488364, + "grad_norm": 0.4286410759035607, + "learning_rate": 2.1395608689295244e-09, + "loss": 0.0172, + "step": 2853 + }, + { + "epoch": 0.9913164293157346, + "grad_norm": 0.7876162638170817, + "learning_rate": 1.978154938941379e-09, + "loss": 0.0192, + "step": 2854 + }, + { + "epoch": 0.9916637721431052, + "grad_norm": 0.9045977670509248, + "learning_rate": 1.8230770169841427e-09, + "loss": 0.0145, + "step": 2855 + }, + { + "epoch": 0.9920111149704759, + "grad_norm": 0.34387118239672126, + "learning_rate": 1.674327299402423e-09, + "loss": 0.0089, + "step": 2856 + }, + { + "epoch": 0.9923584577978465, + "grad_norm": 0.37959493138909545, + "learning_rate": 1.531905974528347e-09, + "loss": 0.0163, + "step": 2857 + }, + { + "epoch": 0.9927058006252171, + "grad_norm": 0.7818912632587496, + "learning_rate": 1.3958132226821187e-09, + "loss": 0.02, + "step": 2858 + }, + { + "epoch": 0.9930531434525877, + "grad_norm": 0.8616967656128703, + "learning_rate": 1.266049216170906e-09, + "loss": 0.0235, + "step": 2859 + }, + { + "epoch": 0.9934004862799584, + "grad_norm": 0.8090514774003552, + "learning_rate": 1.142614119289398e-09, + "loss": 0.0208, + "step": 2860 + }, + { + "epoch": 0.993747829107329, + "grad_norm": 0.40981264825919256, + "learning_rate": 1.025508088318694e-09, + "loss": 0.0119, + "step": 2861 + }, + { + "epoch": 0.9940951719346995, + "grad_norm": 0.9853414305979534, + "learning_rate": 9.147312715279688e-10, + "loss": 0.0194, + "step": 2862 + }, + { + "epoch": 0.9944425147620701, + "grad_norm": 0.6823065427028371, + "learning_rate": 8.102838091705867e-10, + "loss": 0.0171, + "step": 2863 + }, + { + "epoch": 0.9947898575894408, + "grad_norm": 0.49217144790501227, + "learning_rate": 7.121658334890979e-10, + "loss": 0.0127, + "step": 2864 + }, + { + "epoch": 0.9951372004168114, + "grad_norm": 0.7961270955956514, + "learning_rate": 6.203774687102426e-10, + "loss": 0.0213, + "step": 2865 + }, + { + "epoch": 0.995484543244182, + "grad_norm": 0.5901951990218518, + "learning_rate": 5.34918831047726e-10, + "loss": 0.0181, + "step": 2866 + }, + { + "epoch": 0.9958318860715526, + "grad_norm": 1.0523471488146157, + "learning_rate": 4.5579002870110854e-10, + "loss": 0.0231, + "step": 2867 + }, + { + "epoch": 0.9961792288989232, + "grad_norm": 0.5550763297657431, + "learning_rate": 3.8299116185525066e-10, + "loss": 0.011, + "step": 2868 + }, + { + "epoch": 0.9965265717262939, + "grad_norm": 0.5408286201029091, + "learning_rate": 3.165223226808678e-10, + "loss": 0.0161, + "step": 2869 + }, + { + "epoch": 0.9968739145536645, + "grad_norm": 0.4991725997802592, + "learning_rate": 2.563835953345306e-10, + "loss": 0.0175, + "step": 2870 + }, + { + "epoch": 0.9972212573810351, + "grad_norm": 0.5996623257508237, + "learning_rate": 2.0257505595810966e-10, + "loss": 0.0183, + "step": 2871 + }, + { + "epoch": 0.9975686002084057, + "grad_norm": 0.7177885884770104, + "learning_rate": 1.5509677267877555e-10, + "loss": 0.0194, + "step": 2872 + }, + { + "epoch": 0.9979159430357764, + "grad_norm": 0.639091556482265, + "learning_rate": 1.1394880560844368e-10, + "loss": 0.0201, + "step": 2873 + }, + { + "epoch": 0.9982632858631469, + "grad_norm": 0.5680423116235028, + "learning_rate": 7.913120684488462e-11, + "loss": 0.0189, + "step": 2874 + }, + { + "epoch": 0.9986106286905175, + "grad_norm": 0.23508445434740197, + "learning_rate": 5.0644020471168894e-11, + "loss": 0.0098, + "step": 2875 + }, + { + "epoch": 0.9989579715178881, + "grad_norm": 0.3673594266436054, + "learning_rate": 2.8487282554556796e-11, + "loss": 0.0109, + "step": 2876 + }, + { + "epoch": 0.9993053143452588, + "grad_norm": 0.4480479527832172, + "learning_rate": 1.2661021148163699e-11, + "loss": 0.0206, + "step": 2877 + }, + { + "epoch": 0.9996526571726294, + "grad_norm": 0.48054013159980413, + "learning_rate": 3.1652562887396486e-12, + "loss": 0.0155, + "step": 2878 + }, + { + "epoch": 1.0, + "grad_norm": 0.7081858993617053, + "learning_rate": 0.0, + "loss": 0.0175, + "step": 2879 + }, + { + "epoch": 1.0, + "step": 2879, + "total_flos": 49962393949184.0, + "train_loss": 0.021164150436296568, + "train_runtime": 16194.0514, + "train_samples_per_second": 11.381, + "train_steps_per_second": 0.178 + } + ], + "logging_steps": 1.0, + "max_steps": 2879, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 49962393949184.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}