{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 7130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00014025245441795232, "grad_norm": 13.661454734360627, "learning_rate": 4.672897196261682e-08, "loss": 1.4474, "step": 1 }, { "epoch": 0.00028050490883590464, "grad_norm": 11.652364702042846, "learning_rate": 9.345794392523364e-08, "loss": 1.3264, "step": 2 }, { "epoch": 0.00042075736325385696, "grad_norm": 14.080603581409317, "learning_rate": 1.4018691588785048e-07, "loss": 1.3986, "step": 3 }, { "epoch": 0.0005610098176718093, "grad_norm": 13.216997612124556, "learning_rate": 1.8691588785046729e-07, "loss": 1.4604, "step": 4 }, { "epoch": 0.0007012622720897616, "grad_norm": 13.473275754095528, "learning_rate": 2.3364485981308412e-07, "loss": 1.4155, "step": 5 }, { "epoch": 0.0008415147265077139, "grad_norm": 14.316948211945018, "learning_rate": 2.8037383177570096e-07, "loss": 1.409, "step": 6 }, { "epoch": 0.0009817671809256663, "grad_norm": 13.272260284759573, "learning_rate": 3.2710280373831776e-07, "loss": 1.4131, "step": 7 }, { "epoch": 0.0011220196353436186, "grad_norm": 11.733722206376212, "learning_rate": 3.7383177570093457e-07, "loss": 1.3955, "step": 8 }, { "epoch": 0.0012622720897615708, "grad_norm": 13.253857321048795, "learning_rate": 4.2056074766355143e-07, "loss": 1.4297, "step": 9 }, { "epoch": 0.001402524544179523, "grad_norm": 15.15472769416775, "learning_rate": 4.6728971962616824e-07, "loss": 1.5023, "step": 10 }, { "epoch": 0.0015427769985974754, "grad_norm": 14.32592783214941, "learning_rate": 5.140186915887851e-07, "loss": 1.3748, "step": 11 }, { "epoch": 0.0016830294530154279, "grad_norm": 14.096768142745077, "learning_rate": 5.607476635514019e-07, "loss": 1.3932, "step": 12 }, { "epoch": 0.0018232819074333801, "grad_norm": 12.327740062510571, "learning_rate": 6.074766355140187e-07, "loss": 1.294, "step": 13 }, { "epoch": 0.0019635343618513326, "grad_norm": 11.411498150508658, "learning_rate": 6.542056074766355e-07, "loss": 1.3779, "step": 14 }, { "epoch": 0.0021037868162692847, "grad_norm": 12.706843321120317, "learning_rate": 7.009345794392523e-07, "loss": 1.2252, "step": 15 }, { "epoch": 0.002244039270687237, "grad_norm": 12.702905181446807, "learning_rate": 7.476635514018691e-07, "loss": 1.1854, "step": 16 }, { "epoch": 0.002384291725105189, "grad_norm": 11.68784466403558, "learning_rate": 7.94392523364486e-07, "loss": 1.1701, "step": 17 }, { "epoch": 0.0025245441795231417, "grad_norm": 10.974135769334598, "learning_rate": 8.411214953271029e-07, "loss": 1.2076, "step": 18 }, { "epoch": 0.002664796633941094, "grad_norm": 10.77184860983223, "learning_rate": 8.878504672897197e-07, "loss": 1.1467, "step": 19 }, { "epoch": 0.002805049088359046, "grad_norm": 8.746388589268841, "learning_rate": 9.345794392523365e-07, "loss": 1.0417, "step": 20 }, { "epoch": 0.0029453015427769987, "grad_norm": 7.124787979986887, "learning_rate": 9.813084112149534e-07, "loss": 1.0061, "step": 21 }, { "epoch": 0.0030855539971949507, "grad_norm": 6.335205035786844, "learning_rate": 1.0280373831775702e-06, "loss": 0.9409, "step": 22 }, { "epoch": 0.0032258064516129032, "grad_norm": 5.601907418882994, "learning_rate": 1.074766355140187e-06, "loss": 0.9028, "step": 23 }, { "epoch": 0.0033660589060308557, "grad_norm": 5.160024290397307, "learning_rate": 1.1214953271028038e-06, "loss": 0.91, "step": 24 }, { "epoch": 0.0035063113604488078, "grad_norm": 4.43592914125005, "learning_rate": 1.1682242990654206e-06, "loss": 0.8257, "step": 25 }, { "epoch": 0.0036465638148667602, "grad_norm": 5.155283519448136, "learning_rate": 1.2149532710280374e-06, "loss": 0.8183, "step": 26 }, { "epoch": 0.0037868162692847123, "grad_norm": 4.5863816730169695, "learning_rate": 1.2616822429906543e-06, "loss": 0.7743, "step": 27 }, { "epoch": 0.003927068723702665, "grad_norm": 5.7306784374979465, "learning_rate": 1.308411214953271e-06, "loss": 0.7569, "step": 28 }, { "epoch": 0.004067321178120617, "grad_norm": 5.6621386819050095, "learning_rate": 1.3551401869158879e-06, "loss": 0.7621, "step": 29 }, { "epoch": 0.004207573632538569, "grad_norm": 6.28696830427706, "learning_rate": 1.4018691588785047e-06, "loss": 0.8344, "step": 30 }, { "epoch": 0.004347826086956522, "grad_norm": 4.527708577028351, "learning_rate": 1.4485981308411215e-06, "loss": 0.6995, "step": 31 }, { "epoch": 0.004488078541374474, "grad_norm": 3.823721209414868, "learning_rate": 1.4953271028037383e-06, "loss": 0.719, "step": 32 }, { "epoch": 0.004628330995792427, "grad_norm": 3.7322723981548185, "learning_rate": 1.542056074766355e-06, "loss": 0.7347, "step": 33 }, { "epoch": 0.004768583450210378, "grad_norm": 3.596050024005017, "learning_rate": 1.588785046728972e-06, "loss": 0.7049, "step": 34 }, { "epoch": 0.004908835904628331, "grad_norm": 2.8044936769291433, "learning_rate": 1.6355140186915887e-06, "loss": 0.7007, "step": 35 }, { "epoch": 0.005049088359046283, "grad_norm": 3.320112873134955, "learning_rate": 1.6822429906542057e-06, "loss": 0.7024, "step": 36 }, { "epoch": 0.005189340813464236, "grad_norm": 3.429517598386339, "learning_rate": 1.7289719626168225e-06, "loss": 0.6795, "step": 37 }, { "epoch": 0.005329593267882188, "grad_norm": 3.2349477067402863, "learning_rate": 1.7757009345794394e-06, "loss": 0.5921, "step": 38 }, { "epoch": 0.00546984572230014, "grad_norm": 3.6712892755077213, "learning_rate": 1.8224299065420562e-06, "loss": 0.7182, "step": 39 }, { "epoch": 0.005610098176718092, "grad_norm": 3.607521539043542, "learning_rate": 1.869158878504673e-06, "loss": 0.6387, "step": 40 }, { "epoch": 0.005750350631136045, "grad_norm": 3.1687167374710383, "learning_rate": 1.9158878504672898e-06, "loss": 0.6739, "step": 41 }, { "epoch": 0.005890603085553997, "grad_norm": 3.917755638404281, "learning_rate": 1.962616822429907e-06, "loss": 0.6093, "step": 42 }, { "epoch": 0.00603085553997195, "grad_norm": 3.1539899836195517, "learning_rate": 2.0093457943925234e-06, "loss": 0.6461, "step": 43 }, { "epoch": 0.0061711079943899015, "grad_norm": 2.7970553718244044, "learning_rate": 2.0560747663551404e-06, "loss": 0.6713, "step": 44 }, { "epoch": 0.006311360448807854, "grad_norm": 3.378191110421054, "learning_rate": 2.102803738317757e-06, "loss": 0.6094, "step": 45 }, { "epoch": 0.0064516129032258064, "grad_norm": 3.113807011606497, "learning_rate": 2.149532710280374e-06, "loss": 0.686, "step": 46 }, { "epoch": 0.006591865357643759, "grad_norm": 7.931595175396131, "learning_rate": 2.1962616822429906e-06, "loss": 0.6347, "step": 47 }, { "epoch": 0.006732117812061711, "grad_norm": 2.9221977830774652, "learning_rate": 2.2429906542056077e-06, "loss": 0.5823, "step": 48 }, { "epoch": 0.006872370266479663, "grad_norm": 2.756270888199664, "learning_rate": 2.2897196261682247e-06, "loss": 0.6081, "step": 49 }, { "epoch": 0.0070126227208976155, "grad_norm": 2.5633470949449273, "learning_rate": 2.3364485981308413e-06, "loss": 0.6211, "step": 50 }, { "epoch": 0.007152875175315568, "grad_norm": 2.431050301712823, "learning_rate": 2.3831775700934583e-06, "loss": 0.5952, "step": 51 }, { "epoch": 0.0072931276297335205, "grad_norm": 2.502156281393821, "learning_rate": 2.429906542056075e-06, "loss": 0.5901, "step": 52 }, { "epoch": 0.007433380084151473, "grad_norm": 2.8927597823402476, "learning_rate": 2.476635514018692e-06, "loss": 0.5322, "step": 53 }, { "epoch": 0.007573632538569425, "grad_norm": 2.823575486071377, "learning_rate": 2.5233644859813085e-06, "loss": 0.6088, "step": 54 }, { "epoch": 0.007713884992987377, "grad_norm": 2.3749801233363357, "learning_rate": 2.570093457943925e-06, "loss": 0.5664, "step": 55 }, { "epoch": 0.00785413744740533, "grad_norm": 6.038239377321177, "learning_rate": 2.616822429906542e-06, "loss": 0.6008, "step": 56 }, { "epoch": 0.007994389901823282, "grad_norm": 2.709237744859725, "learning_rate": 2.6635514018691587e-06, "loss": 0.546, "step": 57 }, { "epoch": 0.008134642356241234, "grad_norm": 2.582624912546111, "learning_rate": 2.7102803738317757e-06, "loss": 0.5844, "step": 58 }, { "epoch": 0.008274894810659187, "grad_norm": 2.9544317997233085, "learning_rate": 2.7570093457943923e-06, "loss": 0.5631, "step": 59 }, { "epoch": 0.008415147265077139, "grad_norm": 2.6056831420355455, "learning_rate": 2.8037383177570094e-06, "loss": 0.5304, "step": 60 }, { "epoch": 0.008555399719495092, "grad_norm": 2.4488024684859915, "learning_rate": 2.8504672897196264e-06, "loss": 0.5022, "step": 61 }, { "epoch": 0.008695652173913044, "grad_norm": 2.541940882747964, "learning_rate": 2.897196261682243e-06, "loss": 0.6031, "step": 62 }, { "epoch": 0.008835904628330995, "grad_norm": 2.2663267956034527, "learning_rate": 2.94392523364486e-06, "loss": 0.5485, "step": 63 }, { "epoch": 0.008976157082748949, "grad_norm": 2.8042865774770886, "learning_rate": 2.9906542056074766e-06, "loss": 0.5517, "step": 64 }, { "epoch": 0.0091164095371669, "grad_norm": 3.8375715516219837, "learning_rate": 3.0373831775700936e-06, "loss": 0.5794, "step": 65 }, { "epoch": 0.009256661991584854, "grad_norm": 2.5126065929063692, "learning_rate": 3.08411214953271e-06, "loss": 0.5446, "step": 66 }, { "epoch": 0.009396914446002805, "grad_norm": 2.98158797101787, "learning_rate": 3.1308411214953272e-06, "loss": 0.4908, "step": 67 }, { "epoch": 0.009537166900420757, "grad_norm": 2.166071320927866, "learning_rate": 3.177570093457944e-06, "loss": 0.5105, "step": 68 }, { "epoch": 0.00967741935483871, "grad_norm": 2.5281475504658086, "learning_rate": 3.224299065420561e-06, "loss": 0.551, "step": 69 }, { "epoch": 0.009817671809256662, "grad_norm": 2.473513004309663, "learning_rate": 3.2710280373831774e-06, "loss": 0.5112, "step": 70 }, { "epoch": 0.009957924263674615, "grad_norm": 2.359169856367669, "learning_rate": 3.3177570093457945e-06, "loss": 0.5559, "step": 71 }, { "epoch": 0.010098176718092567, "grad_norm": 2.8093961755211234, "learning_rate": 3.3644859813084115e-06, "loss": 0.5144, "step": 72 }, { "epoch": 0.010238429172510518, "grad_norm": 2.840281288830847, "learning_rate": 3.411214953271028e-06, "loss": 0.5191, "step": 73 }, { "epoch": 0.010378681626928472, "grad_norm": 2.1312129132128756, "learning_rate": 3.457943925233645e-06, "loss": 0.5524, "step": 74 }, { "epoch": 0.010518934081346423, "grad_norm": 3.2541440805847226, "learning_rate": 3.5046728971962617e-06, "loss": 0.4996, "step": 75 }, { "epoch": 0.010659186535764377, "grad_norm": 2.4664219369712668, "learning_rate": 3.5514018691588787e-06, "loss": 0.4917, "step": 76 }, { "epoch": 0.010799438990182328, "grad_norm": 1.8505733117148453, "learning_rate": 3.5981308411214953e-06, "loss": 0.4527, "step": 77 }, { "epoch": 0.01093969144460028, "grad_norm": 3.1081107938600225, "learning_rate": 3.6448598130841123e-06, "loss": 0.49, "step": 78 }, { "epoch": 0.011079943899018233, "grad_norm": 2.2399443461112014, "learning_rate": 3.691588785046729e-06, "loss": 0.4954, "step": 79 }, { "epoch": 0.011220196353436185, "grad_norm": 2.426412552464205, "learning_rate": 3.738317757009346e-06, "loss": 0.5283, "step": 80 }, { "epoch": 0.011360448807854138, "grad_norm": 2.1635754860108283, "learning_rate": 3.785046728971963e-06, "loss": 0.5162, "step": 81 }, { "epoch": 0.01150070126227209, "grad_norm": 12.432751846692884, "learning_rate": 3.8317757009345796e-06, "loss": 0.5213, "step": 82 }, { "epoch": 0.011640953716690041, "grad_norm": 2.2515305505586367, "learning_rate": 3.878504672897196e-06, "loss": 0.5692, "step": 83 }, { "epoch": 0.011781206171107995, "grad_norm": 2.6768165089275433, "learning_rate": 3.925233644859814e-06, "loss": 0.4898, "step": 84 }, { "epoch": 0.011921458625525946, "grad_norm": 2.375804564070908, "learning_rate": 3.97196261682243e-06, "loss": 0.5063, "step": 85 }, { "epoch": 0.0120617110799439, "grad_norm": 2.4232912906146264, "learning_rate": 4.018691588785047e-06, "loss": 0.5118, "step": 86 }, { "epoch": 0.012201963534361851, "grad_norm": 2.137906095064392, "learning_rate": 4.065420560747663e-06, "loss": 0.4843, "step": 87 }, { "epoch": 0.012342215988779803, "grad_norm": 2.529576631488027, "learning_rate": 4.112149532710281e-06, "loss": 0.4882, "step": 88 }, { "epoch": 0.012482468443197756, "grad_norm": 2.4081983728663574, "learning_rate": 4.1588785046728974e-06, "loss": 0.4941, "step": 89 }, { "epoch": 0.012622720897615708, "grad_norm": 2.473316942025269, "learning_rate": 4.205607476635514e-06, "loss": 0.4752, "step": 90 }, { "epoch": 0.012762973352033661, "grad_norm": 2.542254697806033, "learning_rate": 4.2523364485981315e-06, "loss": 0.5384, "step": 91 }, { "epoch": 0.012903225806451613, "grad_norm": 2.930920150840522, "learning_rate": 4.299065420560748e-06, "loss": 0.4801, "step": 92 }, { "epoch": 0.013043478260869565, "grad_norm": 2.4199862098300935, "learning_rate": 4.345794392523365e-06, "loss": 0.4703, "step": 93 }, { "epoch": 0.013183730715287518, "grad_norm": 2.898503963397338, "learning_rate": 4.392523364485981e-06, "loss": 0.5149, "step": 94 }, { "epoch": 0.01332398316970547, "grad_norm": 2.4811731014298526, "learning_rate": 4.439252336448599e-06, "loss": 0.4948, "step": 95 }, { "epoch": 0.013464235624123423, "grad_norm": 2.4104896945380965, "learning_rate": 4.485981308411215e-06, "loss": 0.4735, "step": 96 }, { "epoch": 0.013604488078541374, "grad_norm": 2.3108481616225283, "learning_rate": 4.532710280373832e-06, "loss": 0.4711, "step": 97 }, { "epoch": 0.013744740532959326, "grad_norm": 2.4729119480498323, "learning_rate": 4.579439252336449e-06, "loss": 0.4596, "step": 98 }, { "epoch": 0.01388499298737728, "grad_norm": 3.0604731801015026, "learning_rate": 4.626168224299066e-06, "loss": 0.5267, "step": 99 }, { "epoch": 0.014025245441795231, "grad_norm": 2.2018374745801137, "learning_rate": 4.6728971962616825e-06, "loss": 0.4495, "step": 100 }, { "epoch": 0.014165497896213184, "grad_norm": 2.6875785106457837, "learning_rate": 4.719626168224299e-06, "loss": 0.5106, "step": 101 }, { "epoch": 0.014305750350631136, "grad_norm": 2.4805458357239525, "learning_rate": 4.766355140186917e-06, "loss": 0.4712, "step": 102 }, { "epoch": 0.014446002805049088, "grad_norm": 2.6687734348241965, "learning_rate": 4.813084112149533e-06, "loss": 0.4764, "step": 103 }, { "epoch": 0.014586255259467041, "grad_norm": 2.4712394041877164, "learning_rate": 4.85981308411215e-06, "loss": 0.522, "step": 104 }, { "epoch": 0.014726507713884993, "grad_norm": 2.4421232345628563, "learning_rate": 4.906542056074766e-06, "loss": 0.4984, "step": 105 }, { "epoch": 0.014866760168302946, "grad_norm": 2.34273476546865, "learning_rate": 4.953271028037384e-06, "loss": 0.4553, "step": 106 }, { "epoch": 0.015007012622720898, "grad_norm": 2.0392392838236155, "learning_rate": 5e-06, "loss": 0.4636, "step": 107 }, { "epoch": 0.01514726507713885, "grad_norm": 2.6694824035659046, "learning_rate": 5.046728971962617e-06, "loss": 0.4991, "step": 108 }, { "epoch": 0.015287517531556803, "grad_norm": 2.283615349201864, "learning_rate": 5.0934579439252344e-06, "loss": 0.4207, "step": 109 }, { "epoch": 0.015427769985974754, "grad_norm": 3.2743346958115493, "learning_rate": 5.14018691588785e-06, "loss": 0.4307, "step": 110 }, { "epoch": 0.015568022440392707, "grad_norm": 2.4879573689086465, "learning_rate": 5.186915887850468e-06, "loss": 0.4511, "step": 111 }, { "epoch": 0.01570827489481066, "grad_norm": 2.93407874028616, "learning_rate": 5.233644859813084e-06, "loss": 0.5184, "step": 112 }, { "epoch": 0.015848527349228612, "grad_norm": 2.8606874549387165, "learning_rate": 5.280373831775702e-06, "loss": 0.5225, "step": 113 }, { "epoch": 0.015988779803646564, "grad_norm": 2.1285504244996054, "learning_rate": 5.3271028037383174e-06, "loss": 0.47, "step": 114 }, { "epoch": 0.016129032258064516, "grad_norm": 2.4979235608979047, "learning_rate": 5.373831775700935e-06, "loss": 0.4828, "step": 115 }, { "epoch": 0.016269284712482467, "grad_norm": 2.6649907528142474, "learning_rate": 5.4205607476635515e-06, "loss": 0.4716, "step": 116 }, { "epoch": 0.016409537166900422, "grad_norm": 3.420902479791717, "learning_rate": 5.467289719626169e-06, "loss": 0.5071, "step": 117 }, { "epoch": 0.016549789621318374, "grad_norm": 2.219265837545501, "learning_rate": 5.514018691588785e-06, "loss": 0.4426, "step": 118 }, { "epoch": 0.016690042075736326, "grad_norm": 2.3545280336731054, "learning_rate": 5.560747663551402e-06, "loss": 0.4866, "step": 119 }, { "epoch": 0.016830294530154277, "grad_norm": 2.387420301455272, "learning_rate": 5.607476635514019e-06, "loss": 0.452, "step": 120 }, { "epoch": 0.01697054698457223, "grad_norm": 2.6728180230586607, "learning_rate": 5.654205607476636e-06, "loss": 0.4309, "step": 121 }, { "epoch": 0.017110799438990184, "grad_norm": 2.2308907184860374, "learning_rate": 5.700934579439253e-06, "loss": 0.5054, "step": 122 }, { "epoch": 0.017251051893408136, "grad_norm": 2.810934012274622, "learning_rate": 5.747663551401869e-06, "loss": 0.5124, "step": 123 }, { "epoch": 0.017391304347826087, "grad_norm": 2.7662963914320975, "learning_rate": 5.794392523364486e-06, "loss": 0.5333, "step": 124 }, { "epoch": 0.01753155680224404, "grad_norm": 2.4800159912209088, "learning_rate": 5.841121495327103e-06, "loss": 0.4727, "step": 125 }, { "epoch": 0.01767180925666199, "grad_norm": 2.2948587092370913, "learning_rate": 5.88785046728972e-06, "loss": 0.4578, "step": 126 }, { "epoch": 0.017812061711079945, "grad_norm": 2.301623567494857, "learning_rate": 5.9345794392523374e-06, "loss": 0.4518, "step": 127 }, { "epoch": 0.017952314165497897, "grad_norm": 2.3776599644899354, "learning_rate": 5.981308411214953e-06, "loss": 0.3798, "step": 128 }, { "epoch": 0.01809256661991585, "grad_norm": 2.961128812462258, "learning_rate": 6.028037383177571e-06, "loss": 0.4212, "step": 129 }, { "epoch": 0.0182328190743338, "grad_norm": 1.9936150914770603, "learning_rate": 6.074766355140187e-06, "loss": 0.4897, "step": 130 }, { "epoch": 0.018373071528751752, "grad_norm": 2.3537941922148162, "learning_rate": 6.121495327102805e-06, "loss": 0.4071, "step": 131 }, { "epoch": 0.018513323983169707, "grad_norm": 2.5750306951674746, "learning_rate": 6.16822429906542e-06, "loss": 0.4788, "step": 132 }, { "epoch": 0.01865357643758766, "grad_norm": 2.4451015003565595, "learning_rate": 6.214953271028038e-06, "loss": 0.4962, "step": 133 }, { "epoch": 0.01879382889200561, "grad_norm": 2.4963653846476466, "learning_rate": 6.2616822429906544e-06, "loss": 0.4788, "step": 134 }, { "epoch": 0.018934081346423562, "grad_norm": 3.0003013459490875, "learning_rate": 6.308411214953272e-06, "loss": 0.4555, "step": 135 }, { "epoch": 0.019074333800841514, "grad_norm": 3.182269423553265, "learning_rate": 6.355140186915888e-06, "loss": 0.5034, "step": 136 }, { "epoch": 0.01921458625525947, "grad_norm": 2.2024493973945227, "learning_rate": 6.401869158878505e-06, "loss": 0.4764, "step": 137 }, { "epoch": 0.01935483870967742, "grad_norm": 2.1137250339109706, "learning_rate": 6.448598130841122e-06, "loss": 0.486, "step": 138 }, { "epoch": 0.019495091164095372, "grad_norm": 3.0070982245947406, "learning_rate": 6.495327102803739e-06, "loss": 0.4372, "step": 139 }, { "epoch": 0.019635343618513323, "grad_norm": 3.397719033249549, "learning_rate": 6.542056074766355e-06, "loss": 0.4585, "step": 140 }, { "epoch": 0.019775596072931275, "grad_norm": 2.1466440934520645, "learning_rate": 6.588785046728972e-06, "loss": 0.4137, "step": 141 }, { "epoch": 0.01991584852734923, "grad_norm": 2.1297944901950254, "learning_rate": 6.635514018691589e-06, "loss": 0.447, "step": 142 }, { "epoch": 0.020056100981767182, "grad_norm": 2.5290440495622977, "learning_rate": 6.682242990654206e-06, "loss": 0.4413, "step": 143 }, { "epoch": 0.020196353436185133, "grad_norm": 2.9100510546213667, "learning_rate": 6.728971962616823e-06, "loss": 0.4258, "step": 144 }, { "epoch": 0.020336605890603085, "grad_norm": 2.6962183218038125, "learning_rate": 6.77570093457944e-06, "loss": 0.5215, "step": 145 }, { "epoch": 0.020476858345021037, "grad_norm": 2.9961710973352034, "learning_rate": 6.822429906542056e-06, "loss": 0.4584, "step": 146 }, { "epoch": 0.02061711079943899, "grad_norm": 2.39908951337045, "learning_rate": 6.869158878504674e-06, "loss": 0.4675, "step": 147 }, { "epoch": 0.020757363253856943, "grad_norm": 2.1663055363495816, "learning_rate": 6.91588785046729e-06, "loss": 0.4575, "step": 148 }, { "epoch": 0.020897615708274895, "grad_norm": 2.5819320775087506, "learning_rate": 6.962616822429908e-06, "loss": 0.4347, "step": 149 }, { "epoch": 0.021037868162692847, "grad_norm": 2.786407660383044, "learning_rate": 7.009345794392523e-06, "loss": 0.4351, "step": 150 }, { "epoch": 0.021178120617110798, "grad_norm": 2.189282372355883, "learning_rate": 7.056074766355141e-06, "loss": 0.4187, "step": 151 }, { "epoch": 0.021318373071528753, "grad_norm": 2.3930198292849205, "learning_rate": 7.1028037383177574e-06, "loss": 0.4296, "step": 152 }, { "epoch": 0.021458625525946705, "grad_norm": 3.4065329057993052, "learning_rate": 7.149532710280375e-06, "loss": 0.437, "step": 153 }, { "epoch": 0.021598877980364656, "grad_norm": 3.1904730301999273, "learning_rate": 7.196261682242991e-06, "loss": 0.4148, "step": 154 }, { "epoch": 0.021739130434782608, "grad_norm": 2.7429837975324225, "learning_rate": 7.242990654205608e-06, "loss": 0.4268, "step": 155 }, { "epoch": 0.02187938288920056, "grad_norm": 3.1382514752776776, "learning_rate": 7.289719626168225e-06, "loss": 0.4564, "step": 156 }, { "epoch": 0.022019635343618515, "grad_norm": 2.5054584994223874, "learning_rate": 7.336448598130842e-06, "loss": 0.4777, "step": 157 }, { "epoch": 0.022159887798036466, "grad_norm": 2.890473562861722, "learning_rate": 7.383177570093458e-06, "loss": 0.4147, "step": 158 }, { "epoch": 0.022300140252454418, "grad_norm": 2.7279150813245576, "learning_rate": 7.429906542056075e-06, "loss": 0.4773, "step": 159 }, { "epoch": 0.02244039270687237, "grad_norm": 2.67332784850334, "learning_rate": 7.476635514018692e-06, "loss": 0.454, "step": 160 }, { "epoch": 0.02258064516129032, "grad_norm": 3.4481133042856777, "learning_rate": 7.523364485981309e-06, "loss": 0.4233, "step": 161 }, { "epoch": 0.022720897615708276, "grad_norm": 6.936568463993134, "learning_rate": 7.570093457943926e-06, "loss": 0.4393, "step": 162 }, { "epoch": 0.022861150070126228, "grad_norm": 3.033072012519631, "learning_rate": 7.616822429906543e-06, "loss": 0.4479, "step": 163 }, { "epoch": 0.02300140252454418, "grad_norm": 2.4739299464688567, "learning_rate": 7.663551401869159e-06, "loss": 0.4697, "step": 164 }, { "epoch": 0.02314165497896213, "grad_norm": 2.509415444016097, "learning_rate": 7.710280373831777e-06, "loss": 0.4766, "step": 165 }, { "epoch": 0.023281907433380083, "grad_norm": 3.1794433500015598, "learning_rate": 7.757009345794392e-06, "loss": 0.4068, "step": 166 }, { "epoch": 0.023422159887798038, "grad_norm": 2.8367639603685224, "learning_rate": 7.80373831775701e-06, "loss": 0.4944, "step": 167 }, { "epoch": 0.02356241234221599, "grad_norm": 2.857527506004238, "learning_rate": 7.850467289719627e-06, "loss": 0.4659, "step": 168 }, { "epoch": 0.02370266479663394, "grad_norm": 2.5049948386559833, "learning_rate": 7.897196261682244e-06, "loss": 0.4369, "step": 169 }, { "epoch": 0.023842917251051893, "grad_norm": 3.3132809851928977, "learning_rate": 7.94392523364486e-06, "loss": 0.4741, "step": 170 }, { "epoch": 0.023983169705469844, "grad_norm": 2.4062973612298255, "learning_rate": 7.990654205607477e-06, "loss": 0.4718, "step": 171 }, { "epoch": 0.0241234221598878, "grad_norm": 3.3146345350785285, "learning_rate": 8.037383177570094e-06, "loss": 0.4426, "step": 172 }, { "epoch": 0.02426367461430575, "grad_norm": 4.009771570943594, "learning_rate": 8.084112149532712e-06, "loss": 0.5129, "step": 173 }, { "epoch": 0.024403927068723703, "grad_norm": 3.229490268038816, "learning_rate": 8.130841121495327e-06, "loss": 0.397, "step": 174 }, { "epoch": 0.024544179523141654, "grad_norm": 2.7474466087145015, "learning_rate": 8.177570093457945e-06, "loss": 0.4625, "step": 175 }, { "epoch": 0.024684431977559606, "grad_norm": 2.818404040706357, "learning_rate": 8.224299065420562e-06, "loss": 0.4419, "step": 176 }, { "epoch": 0.02482468443197756, "grad_norm": 2.2949543922230857, "learning_rate": 8.271028037383178e-06, "loss": 0.4383, "step": 177 }, { "epoch": 0.024964936886395513, "grad_norm": 3.3778996197436735, "learning_rate": 8.317757009345795e-06, "loss": 0.4454, "step": 178 }, { "epoch": 0.025105189340813464, "grad_norm": 2.5405408106294285, "learning_rate": 8.364485981308411e-06, "loss": 0.4624, "step": 179 }, { "epoch": 0.025245441795231416, "grad_norm": 2.5942044147310668, "learning_rate": 8.411214953271028e-06, "loss": 0.4706, "step": 180 }, { "epoch": 0.025385694249649367, "grad_norm": 2.385842619822456, "learning_rate": 8.457943925233646e-06, "loss": 0.4364, "step": 181 }, { "epoch": 0.025525946704067323, "grad_norm": 2.4913725761274312, "learning_rate": 8.504672897196263e-06, "loss": 0.475, "step": 182 }, { "epoch": 0.025666199158485274, "grad_norm": 3.344250365280128, "learning_rate": 8.55140186915888e-06, "loss": 0.4733, "step": 183 }, { "epoch": 0.025806451612903226, "grad_norm": 2.4657216873840873, "learning_rate": 8.598130841121496e-06, "loss": 0.4213, "step": 184 }, { "epoch": 0.025946704067321177, "grad_norm": 2.723476291890163, "learning_rate": 8.644859813084113e-06, "loss": 0.4721, "step": 185 }, { "epoch": 0.02608695652173913, "grad_norm": 2.3472765014010473, "learning_rate": 8.69158878504673e-06, "loss": 0.4296, "step": 186 }, { "epoch": 0.026227208976157084, "grad_norm": 2.4548499973254776, "learning_rate": 8.738317757009348e-06, "loss": 0.4524, "step": 187 }, { "epoch": 0.026367461430575036, "grad_norm": 3.093154079187258, "learning_rate": 8.785046728971963e-06, "loss": 0.4314, "step": 188 }, { "epoch": 0.026507713884992987, "grad_norm": 2.4236905091939214, "learning_rate": 8.83177570093458e-06, "loss": 0.4645, "step": 189 }, { "epoch": 0.02664796633941094, "grad_norm": 2.223185658886129, "learning_rate": 8.878504672897197e-06, "loss": 0.4534, "step": 190 }, { "epoch": 0.02678821879382889, "grad_norm": 2.4879994375038126, "learning_rate": 8.925233644859814e-06, "loss": 0.4601, "step": 191 }, { "epoch": 0.026928471248246846, "grad_norm": 2.4159928292237534, "learning_rate": 8.97196261682243e-06, "loss": 0.4215, "step": 192 }, { "epoch": 0.027068723702664797, "grad_norm": 2.639071938930907, "learning_rate": 9.018691588785047e-06, "loss": 0.403, "step": 193 }, { "epoch": 0.02720897615708275, "grad_norm": 2.3513794887970962, "learning_rate": 9.065420560747664e-06, "loss": 0.4547, "step": 194 }, { "epoch": 0.0273492286115007, "grad_norm": 2.648785413547633, "learning_rate": 9.112149532710282e-06, "loss": 0.4573, "step": 195 }, { "epoch": 0.027489481065918652, "grad_norm": 1.9824272268439151, "learning_rate": 9.158878504672899e-06, "loss": 0.4611, "step": 196 }, { "epoch": 0.027629733520336607, "grad_norm": 2.106337783750016, "learning_rate": 9.205607476635515e-06, "loss": 0.453, "step": 197 }, { "epoch": 0.02776998597475456, "grad_norm": 2.754139472338549, "learning_rate": 9.252336448598132e-06, "loss": 0.4613, "step": 198 }, { "epoch": 0.02791023842917251, "grad_norm": 2.1091617052534097, "learning_rate": 9.299065420560748e-06, "loss": 0.4403, "step": 199 }, { "epoch": 0.028050490883590462, "grad_norm": 2.6631317750226433, "learning_rate": 9.345794392523365e-06, "loss": 0.5109, "step": 200 }, { "epoch": 0.028190743338008414, "grad_norm": 2.7530027814879467, "learning_rate": 9.392523364485983e-06, "loss": 0.4506, "step": 201 }, { "epoch": 0.02833099579242637, "grad_norm": 2.977965016986435, "learning_rate": 9.439252336448598e-06, "loss": 0.4316, "step": 202 }, { "epoch": 0.02847124824684432, "grad_norm": 2.581687177162465, "learning_rate": 9.485981308411217e-06, "loss": 0.4669, "step": 203 }, { "epoch": 0.028611500701262272, "grad_norm": 3.023653559887397, "learning_rate": 9.532710280373833e-06, "loss": 0.4842, "step": 204 }, { "epoch": 0.028751753155680224, "grad_norm": 2.251101647995522, "learning_rate": 9.57943925233645e-06, "loss": 0.4696, "step": 205 }, { "epoch": 0.028892005610098175, "grad_norm": 4.548572551120584, "learning_rate": 9.626168224299066e-06, "loss": 0.4198, "step": 206 }, { "epoch": 0.02903225806451613, "grad_norm": 2.4447480058990565, "learning_rate": 9.672897196261683e-06, "loss": 0.5004, "step": 207 }, { "epoch": 0.029172510518934082, "grad_norm": 2.4963083667454917, "learning_rate": 9.7196261682243e-06, "loss": 0.4501, "step": 208 }, { "epoch": 0.029312762973352034, "grad_norm": 3.389083441550459, "learning_rate": 9.766355140186918e-06, "loss": 0.4711, "step": 209 }, { "epoch": 0.029453015427769985, "grad_norm": 2.44389999592741, "learning_rate": 9.813084112149533e-06, "loss": 0.4404, "step": 210 }, { "epoch": 0.029593267882187937, "grad_norm": 2.7659040407263054, "learning_rate": 9.859813084112151e-06, "loss": 0.4667, "step": 211 }, { "epoch": 0.029733520336605892, "grad_norm": 2.3237430374180734, "learning_rate": 9.906542056074768e-06, "loss": 0.4111, "step": 212 }, { "epoch": 0.029873772791023843, "grad_norm": 3.530638110054946, "learning_rate": 9.953271028037384e-06, "loss": 0.4329, "step": 213 }, { "epoch": 0.030014025245441795, "grad_norm": 4.789895413079215, "learning_rate": 1e-05, "loss": 0.4408, "step": 214 }, { "epoch": 0.030154277699859747, "grad_norm": 4.241107838409851, "learning_rate": 9.999999484142467e-06, "loss": 0.4598, "step": 215 }, { "epoch": 0.0302945301542777, "grad_norm": 2.7110071530587203, "learning_rate": 9.999997936569974e-06, "loss": 0.4757, "step": 216 }, { "epoch": 0.030434782608695653, "grad_norm": 3.1539139364591593, "learning_rate": 9.999995357282836e-06, "loss": 0.4891, "step": 217 }, { "epoch": 0.030575035063113605, "grad_norm": 3.171695046417335, "learning_rate": 9.999991746281591e-06, "loss": 0.4617, "step": 218 }, { "epoch": 0.030715287517531557, "grad_norm": 7.457190086787282, "learning_rate": 9.999987103566983e-06, "loss": 0.4733, "step": 219 }, { "epoch": 0.030855539971949508, "grad_norm": 3.8577441271643957, "learning_rate": 9.999981429139967e-06, "loss": 0.533, "step": 220 }, { "epoch": 0.03099579242636746, "grad_norm": 4.5759186619685615, "learning_rate": 9.999974723001716e-06, "loss": 0.4259, "step": 221 }, { "epoch": 0.031136044880785415, "grad_norm": 4.491693459586005, "learning_rate": 9.999966985153615e-06, "loss": 0.4972, "step": 222 }, { "epoch": 0.03127629733520337, "grad_norm": 5.173661675373215, "learning_rate": 9.999958215597257e-06, "loss": 0.4109, "step": 223 }, { "epoch": 0.03141654978962132, "grad_norm": 3.916440199669267, "learning_rate": 9.999948414334455e-06, "loss": 0.436, "step": 224 }, { "epoch": 0.03155680224403927, "grad_norm": 6.064143950955498, "learning_rate": 9.99993758136723e-06, "loss": 0.4106, "step": 225 }, { "epoch": 0.031697054698457225, "grad_norm": 6.448893418126817, "learning_rate": 9.999925716697817e-06, "loss": 0.4182, "step": 226 }, { "epoch": 0.03183730715287517, "grad_norm": 4.138705287220603, "learning_rate": 9.999912820328665e-06, "loss": 0.4654, "step": 227 }, { "epoch": 0.03197755960729313, "grad_norm": 4.1848981375016345, "learning_rate": 9.999898892262433e-06, "loss": 0.5169, "step": 228 }, { "epoch": 0.03211781206171108, "grad_norm": 5.729900558168016, "learning_rate": 9.999883932502e-06, "loss": 0.4591, "step": 229 }, { "epoch": 0.03225806451612903, "grad_norm": 5.312650431578789, "learning_rate": 9.999867941050447e-06, "loss": 0.4888, "step": 230 }, { "epoch": 0.032398316970546986, "grad_norm": 3.2773287639798165, "learning_rate": 9.999850917911077e-06, "loss": 0.4727, "step": 231 }, { "epoch": 0.032538569424964935, "grad_norm": 5.090892153872765, "learning_rate": 9.999832863087403e-06, "loss": 0.496, "step": 232 }, { "epoch": 0.03267882187938289, "grad_norm": 2.6749013971712348, "learning_rate": 9.999813776583148e-06, "loss": 0.4557, "step": 233 }, { "epoch": 0.032819074333800845, "grad_norm": 3.4653662596172223, "learning_rate": 9.99979365840225e-06, "loss": 0.4448, "step": 234 }, { "epoch": 0.03295932678821879, "grad_norm": 3.0513598177977794, "learning_rate": 9.999772508548863e-06, "loss": 0.4645, "step": 235 }, { "epoch": 0.03309957924263675, "grad_norm": 3.938342170872232, "learning_rate": 9.999750327027351e-06, "loss": 0.4441, "step": 236 }, { "epoch": 0.033239831697054696, "grad_norm": 3.0984241356419577, "learning_rate": 9.999727113842291e-06, "loss": 0.4604, "step": 237 }, { "epoch": 0.03338008415147265, "grad_norm": 3.1530727597645347, "learning_rate": 9.999702868998469e-06, "loss": 0.4454, "step": 238 }, { "epoch": 0.033520336605890606, "grad_norm": 2.4716036757375632, "learning_rate": 9.999677592500892e-06, "loss": 0.4657, "step": 239 }, { "epoch": 0.033660589060308554, "grad_norm": 3.000683070937668, "learning_rate": 9.999651284354774e-06, "loss": 0.459, "step": 240 }, { "epoch": 0.03380084151472651, "grad_norm": 2.5906398464463103, "learning_rate": 9.999623944565545e-06, "loss": 0.4551, "step": 241 }, { "epoch": 0.03394109396914446, "grad_norm": 3.4031914094132296, "learning_rate": 9.999595573138845e-06, "loss": 0.4755, "step": 242 }, { "epoch": 0.03408134642356241, "grad_norm": 2.8565402962306092, "learning_rate": 9.999566170080528e-06, "loss": 0.4243, "step": 243 }, { "epoch": 0.03422159887798037, "grad_norm": 3.2899238382817217, "learning_rate": 9.999535735396662e-06, "loss": 0.4628, "step": 244 }, { "epoch": 0.034361851332398316, "grad_norm": 8.290443010629769, "learning_rate": 9.999504269093525e-06, "loss": 0.4826, "step": 245 }, { "epoch": 0.03450210378681627, "grad_norm": 2.719656080774345, "learning_rate": 9.999471771177612e-06, "loss": 0.4288, "step": 246 }, { "epoch": 0.03464235624123422, "grad_norm": 2.7756213549302475, "learning_rate": 9.999438241655629e-06, "loss": 0.4144, "step": 247 }, { "epoch": 0.034782608695652174, "grad_norm": 2.4149045987764, "learning_rate": 9.999403680534492e-06, "loss": 0.4561, "step": 248 }, { "epoch": 0.03492286115007013, "grad_norm": 2.7172461548891307, "learning_rate": 9.999368087821337e-06, "loss": 0.4623, "step": 249 }, { "epoch": 0.03506311360448808, "grad_norm": 2.4256778516862316, "learning_rate": 9.999331463523502e-06, "loss": 0.4749, "step": 250 }, { "epoch": 0.03520336605890603, "grad_norm": 3.724395569784377, "learning_rate": 9.99929380764855e-06, "loss": 0.4399, "step": 251 }, { "epoch": 0.03534361851332398, "grad_norm": 3.568258617129064, "learning_rate": 9.999255120204248e-06, "loss": 0.4683, "step": 252 }, { "epoch": 0.035483870967741936, "grad_norm": 2.6187052866320912, "learning_rate": 9.999215401198579e-06, "loss": 0.4298, "step": 253 }, { "epoch": 0.03562412342215989, "grad_norm": 2.3684910373498034, "learning_rate": 9.99917465063974e-06, "loss": 0.44, "step": 254 }, { "epoch": 0.03576437587657784, "grad_norm": 2.412211971186012, "learning_rate": 9.999132868536139e-06, "loss": 0.4445, "step": 255 }, { "epoch": 0.035904628330995794, "grad_norm": 3.6754283837004547, "learning_rate": 9.999090054896397e-06, "loss": 0.4472, "step": 256 }, { "epoch": 0.03604488078541374, "grad_norm": 2.1935283972817397, "learning_rate": 9.999046209729347e-06, "loss": 0.4419, "step": 257 }, { "epoch": 0.0361851332398317, "grad_norm": 3.219258240508144, "learning_rate": 9.999001333044039e-06, "loss": 0.4845, "step": 258 }, { "epoch": 0.03632538569424965, "grad_norm": 3.194816868030383, "learning_rate": 9.998955424849733e-06, "loss": 0.462, "step": 259 }, { "epoch": 0.0364656381486676, "grad_norm": 3.0603513381318126, "learning_rate": 9.998908485155898e-06, "loss": 0.4309, "step": 260 }, { "epoch": 0.036605890603085556, "grad_norm": 3.412580434668683, "learning_rate": 9.998860513972224e-06, "loss": 0.4182, "step": 261 }, { "epoch": 0.036746143057503504, "grad_norm": 2.1959985495946435, "learning_rate": 9.998811511308608e-06, "loss": 0.4187, "step": 262 }, { "epoch": 0.03688639551192146, "grad_norm": 3.1898764932574313, "learning_rate": 9.99876147717516e-06, "loss": 0.4299, "step": 263 }, { "epoch": 0.037026647966339414, "grad_norm": 2.5861625458214, "learning_rate": 9.998710411582205e-06, "loss": 0.4583, "step": 264 }, { "epoch": 0.03716690042075736, "grad_norm": 2.761361925957318, "learning_rate": 9.998658314540282e-06, "loss": 0.4413, "step": 265 }, { "epoch": 0.03730715287517532, "grad_norm": 2.5023223580676617, "learning_rate": 9.998605186060138e-06, "loss": 0.4774, "step": 266 }, { "epoch": 0.037447405329593265, "grad_norm": 2.594543427596996, "learning_rate": 9.998551026152736e-06, "loss": 0.5158, "step": 267 }, { "epoch": 0.03758765778401122, "grad_norm": 2.1353457431335343, "learning_rate": 9.998495834829255e-06, "loss": 0.4678, "step": 268 }, { "epoch": 0.037727910238429176, "grad_norm": 1.9760431504499556, "learning_rate": 9.998439612101079e-06, "loss": 0.4884, "step": 269 }, { "epoch": 0.037868162692847124, "grad_norm": 2.299898609746157, "learning_rate": 9.99838235797981e-06, "loss": 0.4694, "step": 270 }, { "epoch": 0.03800841514726508, "grad_norm": 2.877641210993647, "learning_rate": 9.998324072477266e-06, "loss": 0.466, "step": 271 }, { "epoch": 0.03814866760168303, "grad_norm": 2.4696916160660214, "learning_rate": 9.998264755605467e-06, "loss": 0.4104, "step": 272 }, { "epoch": 0.03828892005610098, "grad_norm": 3.0000419613764624, "learning_rate": 9.99820440737666e-06, "loss": 0.5101, "step": 273 }, { "epoch": 0.03842917251051894, "grad_norm": 2.571668922075563, "learning_rate": 9.998143027803292e-06, "loss": 0.3905, "step": 274 }, { "epoch": 0.038569424964936885, "grad_norm": 2.7421483746207205, "learning_rate": 9.998080616898028e-06, "loss": 0.48, "step": 275 }, { "epoch": 0.03870967741935484, "grad_norm": 2.3289613181702276, "learning_rate": 9.998017174673752e-06, "loss": 0.4387, "step": 276 }, { "epoch": 0.03884992987377279, "grad_norm": 2.272201883970651, "learning_rate": 9.997952701143547e-06, "loss": 0.4177, "step": 277 }, { "epoch": 0.038990182328190744, "grad_norm": 2.290366676825513, "learning_rate": 9.997887196320723e-06, "loss": 0.4676, "step": 278 }, { "epoch": 0.0391304347826087, "grad_norm": 2.7377494698340126, "learning_rate": 9.997820660218793e-06, "loss": 0.4464, "step": 279 }, { "epoch": 0.03927068723702665, "grad_norm": 5.596344756243784, "learning_rate": 9.997753092851488e-06, "loss": 0.4646, "step": 280 }, { "epoch": 0.0394109396914446, "grad_norm": 2.338321145810563, "learning_rate": 9.99768449423275e-06, "loss": 0.4701, "step": 281 }, { "epoch": 0.03955119214586255, "grad_norm": 2.1076395313053173, "learning_rate": 9.997614864376732e-06, "loss": 0.4388, "step": 282 }, { "epoch": 0.039691444600280505, "grad_norm": 2.999200078494238, "learning_rate": 9.997544203297801e-06, "loss": 0.4057, "step": 283 }, { "epoch": 0.03983169705469846, "grad_norm": 2.8111029652044843, "learning_rate": 9.997472511010543e-06, "loss": 0.4448, "step": 284 }, { "epoch": 0.03997194950911641, "grad_norm": 2.41178396400633, "learning_rate": 9.997399787529744e-06, "loss": 0.4118, "step": 285 }, { "epoch": 0.040112201963534364, "grad_norm": 2.775796923605218, "learning_rate": 9.997326032870417e-06, "loss": 0.424, "step": 286 }, { "epoch": 0.04025245441795231, "grad_norm": 2.27484328600029, "learning_rate": 9.997251247047775e-06, "loss": 0.4841, "step": 287 }, { "epoch": 0.04039270687237027, "grad_norm": 3.7872507699478013, "learning_rate": 9.997175430077253e-06, "loss": 0.4039, "step": 288 }, { "epoch": 0.04053295932678822, "grad_norm": 2.4579956878860183, "learning_rate": 9.997098581974492e-06, "loss": 0.4481, "step": 289 }, { "epoch": 0.04067321178120617, "grad_norm": 7.967051844948522, "learning_rate": 9.997020702755353e-06, "loss": 0.4396, "step": 290 }, { "epoch": 0.040813464235624125, "grad_norm": 2.7912498466604405, "learning_rate": 9.996941792435903e-06, "loss": 0.4852, "step": 291 }, { "epoch": 0.04095371669004207, "grad_norm": 2.4139024996279286, "learning_rate": 9.996861851032426e-06, "loss": 0.4419, "step": 292 }, { "epoch": 0.04109396914446003, "grad_norm": 3.2394699289439663, "learning_rate": 9.996780878561417e-06, "loss": 0.4403, "step": 293 }, { "epoch": 0.04123422159887798, "grad_norm": 2.7592036815093386, "learning_rate": 9.996698875039583e-06, "loss": 0.4903, "step": 294 }, { "epoch": 0.04137447405329593, "grad_norm": 2.3936474212898418, "learning_rate": 9.996615840483847e-06, "loss": 0.4097, "step": 295 }, { "epoch": 0.04151472650771389, "grad_norm": 3.5436822578642895, "learning_rate": 9.99653177491134e-06, "loss": 0.4876, "step": 296 }, { "epoch": 0.041654978962131835, "grad_norm": 3.959402535575374, "learning_rate": 9.996446678339413e-06, "loss": 0.425, "step": 297 }, { "epoch": 0.04179523141654979, "grad_norm": 2.6463237418097703, "learning_rate": 9.996360550785619e-06, "loss": 0.4225, "step": 298 }, { "epoch": 0.041935483870967745, "grad_norm": 4.9228909851687215, "learning_rate": 9.996273392267733e-06, "loss": 0.4407, "step": 299 }, { "epoch": 0.04207573632538569, "grad_norm": 2.655592174978057, "learning_rate": 9.99618520280374e-06, "loss": 0.4416, "step": 300 }, { "epoch": 0.04221598877980365, "grad_norm": 2.1464593832072727, "learning_rate": 9.996095982411835e-06, "loss": 0.4552, "step": 301 }, { "epoch": 0.042356241234221596, "grad_norm": 2.299876840030588, "learning_rate": 9.996005731110431e-06, "loss": 0.4766, "step": 302 }, { "epoch": 0.04249649368863955, "grad_norm": 2.182841326578513, "learning_rate": 9.99591444891815e-06, "loss": 0.427, "step": 303 }, { "epoch": 0.042636746143057506, "grad_norm": 1.9763327001342343, "learning_rate": 9.995822135853824e-06, "loss": 0.437, "step": 304 }, { "epoch": 0.042776998597475455, "grad_norm": 2.4374035547200656, "learning_rate": 9.995728791936505e-06, "loss": 0.4357, "step": 305 }, { "epoch": 0.04291725105189341, "grad_norm": 3.335041577996492, "learning_rate": 9.995634417185454e-06, "loss": 0.4545, "step": 306 }, { "epoch": 0.04305750350631136, "grad_norm": 2.791786893466748, "learning_rate": 9.995539011620143e-06, "loss": 0.4411, "step": 307 }, { "epoch": 0.04319775596072931, "grad_norm": 2.2639917208584217, "learning_rate": 9.995442575260257e-06, "loss": 0.4121, "step": 308 }, { "epoch": 0.04333800841514727, "grad_norm": 2.828405292594402, "learning_rate": 9.995345108125698e-06, "loss": 0.4585, "step": 309 }, { "epoch": 0.043478260869565216, "grad_norm": 2.2326249958856543, "learning_rate": 9.995246610236575e-06, "loss": 0.4493, "step": 310 }, { "epoch": 0.04361851332398317, "grad_norm": 2.260294464136413, "learning_rate": 9.995147081613214e-06, "loss": 0.4859, "step": 311 }, { "epoch": 0.04375876577840112, "grad_norm": 2.1496741602412075, "learning_rate": 9.995046522276152e-06, "loss": 0.4799, "step": 312 }, { "epoch": 0.043899018232819075, "grad_norm": 2.4892454569806644, "learning_rate": 9.994944932246137e-06, "loss": 0.4069, "step": 313 }, { "epoch": 0.04403927068723703, "grad_norm": 3.165611509574923, "learning_rate": 9.994842311544135e-06, "loss": 0.471, "step": 314 }, { "epoch": 0.04417952314165498, "grad_norm": 2.415118927643495, "learning_rate": 9.994738660191316e-06, "loss": 0.4696, "step": 315 }, { "epoch": 0.04431977559607293, "grad_norm": 2.2355811858156267, "learning_rate": 9.994633978209073e-06, "loss": 0.4269, "step": 316 }, { "epoch": 0.04446002805049088, "grad_norm": 2.7164345012135938, "learning_rate": 9.994528265619004e-06, "loss": 0.4284, "step": 317 }, { "epoch": 0.044600280504908836, "grad_norm": 2.140912001923874, "learning_rate": 9.99442152244292e-06, "loss": 0.395, "step": 318 }, { "epoch": 0.04474053295932679, "grad_norm": 2.5090098152135902, "learning_rate": 9.994313748702848e-06, "loss": 0.4667, "step": 319 }, { "epoch": 0.04488078541374474, "grad_norm": 2.202827379401287, "learning_rate": 9.994204944421029e-06, "loss": 0.5083, "step": 320 }, { "epoch": 0.045021037868162694, "grad_norm": 2.282237218163507, "learning_rate": 9.994095109619912e-06, "loss": 0.4951, "step": 321 }, { "epoch": 0.04516129032258064, "grad_norm": 3.9044745721114813, "learning_rate": 9.993984244322158e-06, "loss": 0.4194, "step": 322 }, { "epoch": 0.0453015427769986, "grad_norm": 2.0614550862440146, "learning_rate": 9.993872348550649e-06, "loss": 0.4627, "step": 323 }, { "epoch": 0.04544179523141655, "grad_norm": 2.3684085932063708, "learning_rate": 9.993759422328471e-06, "loss": 0.3941, "step": 324 }, { "epoch": 0.0455820476858345, "grad_norm": 2.243267902234387, "learning_rate": 9.993645465678923e-06, "loss": 0.459, "step": 325 }, { "epoch": 0.045722300140252456, "grad_norm": 2.33580399453878, "learning_rate": 9.993530478625524e-06, "loss": 0.4344, "step": 326 }, { "epoch": 0.045862552594670404, "grad_norm": 1.9560050786660381, "learning_rate": 9.993414461191996e-06, "loss": 0.3959, "step": 327 }, { "epoch": 0.04600280504908836, "grad_norm": 2.192379767536861, "learning_rate": 9.993297413402282e-06, "loss": 0.4596, "step": 328 }, { "epoch": 0.046143057503506314, "grad_norm": 2.3848361068170227, "learning_rate": 9.993179335280532e-06, "loss": 0.4794, "step": 329 }, { "epoch": 0.04628330995792426, "grad_norm": 2.2372163432947922, "learning_rate": 9.993060226851112e-06, "loss": 0.4718, "step": 330 }, { "epoch": 0.04642356241234222, "grad_norm": 2.0827652601764326, "learning_rate": 9.992940088138598e-06, "loss": 0.4325, "step": 331 }, { "epoch": 0.046563814866760166, "grad_norm": 2.0082875443088875, "learning_rate": 9.992818919167779e-06, "loss": 0.4598, "step": 332 }, { "epoch": 0.04670406732117812, "grad_norm": 2.675181434396497, "learning_rate": 9.992696719963662e-06, "loss": 0.4272, "step": 333 }, { "epoch": 0.046844319775596076, "grad_norm": 2.6326097359386624, "learning_rate": 9.992573490551457e-06, "loss": 0.4285, "step": 334 }, { "epoch": 0.046984572230014024, "grad_norm": 2.264642436622871, "learning_rate": 9.992449230956591e-06, "loss": 0.449, "step": 335 }, { "epoch": 0.04712482468443198, "grad_norm": 2.739256158784037, "learning_rate": 9.992323941204709e-06, "loss": 0.444, "step": 336 }, { "epoch": 0.04726507713884993, "grad_norm": 2.200321442789467, "learning_rate": 9.99219762132166e-06, "loss": 0.4697, "step": 337 }, { "epoch": 0.04740532959326788, "grad_norm": 2.7014323320684666, "learning_rate": 9.992070271333508e-06, "loss": 0.4886, "step": 338 }, { "epoch": 0.04754558204768584, "grad_norm": 1.9649803766333644, "learning_rate": 9.991941891266535e-06, "loss": 0.4282, "step": 339 }, { "epoch": 0.047685834502103785, "grad_norm": 2.1970225559998844, "learning_rate": 9.99181248114723e-06, "loss": 0.4428, "step": 340 }, { "epoch": 0.04782608695652174, "grad_norm": 2.280902679340636, "learning_rate": 9.991682041002294e-06, "loss": 0.4978, "step": 341 }, { "epoch": 0.04796633941093969, "grad_norm": 1.8243627062569885, "learning_rate": 9.991550570858642e-06, "loss": 0.4087, "step": 342 }, { "epoch": 0.048106591865357644, "grad_norm": 2.146800468294943, "learning_rate": 9.991418070743406e-06, "loss": 0.3955, "step": 343 }, { "epoch": 0.0482468443197756, "grad_norm": 1.9664065021913326, "learning_rate": 9.991284540683922e-06, "loss": 0.4296, "step": 344 }, { "epoch": 0.04838709677419355, "grad_norm": 2.204615528383741, "learning_rate": 9.991149980707746e-06, "loss": 0.4565, "step": 345 }, { "epoch": 0.0485273492286115, "grad_norm": 1.9048901069162731, "learning_rate": 9.991014390842642e-06, "loss": 0.481, "step": 346 }, { "epoch": 0.04866760168302945, "grad_norm": 2.147333700826464, "learning_rate": 9.990877771116588e-06, "loss": 0.4565, "step": 347 }, { "epoch": 0.048807854137447405, "grad_norm": 3.100564099332543, "learning_rate": 9.990740121557776e-06, "loss": 0.401, "step": 348 }, { "epoch": 0.04894810659186536, "grad_norm": 2.05657489378133, "learning_rate": 9.990601442194607e-06, "loss": 0.4256, "step": 349 }, { "epoch": 0.04908835904628331, "grad_norm": 2.9890188466228027, "learning_rate": 9.990461733055698e-06, "loss": 0.4117, "step": 350 }, { "epoch": 0.049228611500701264, "grad_norm": 2.9191849509046, "learning_rate": 9.990320994169876e-06, "loss": 0.4441, "step": 351 }, { "epoch": 0.04936886395511921, "grad_norm": 2.4351511666688803, "learning_rate": 9.990179225566184e-06, "loss": 0.4408, "step": 352 }, { "epoch": 0.04950911640953717, "grad_norm": 2.3557971832916627, "learning_rate": 9.99003642727387e-06, "loss": 0.4232, "step": 353 }, { "epoch": 0.04964936886395512, "grad_norm": 2.081023639821482, "learning_rate": 9.989892599322404e-06, "loss": 0.4079, "step": 354 }, { "epoch": 0.04978962131837307, "grad_norm": 2.04506142913407, "learning_rate": 9.989747741741462e-06, "loss": 0.4199, "step": 355 }, { "epoch": 0.049929873772791025, "grad_norm": 2.338372839686613, "learning_rate": 9.989601854560935e-06, "loss": 0.4641, "step": 356 }, { "epoch": 0.05007012622720897, "grad_norm": 3.097644243060746, "learning_rate": 9.989454937810926e-06, "loss": 0.4067, "step": 357 }, { "epoch": 0.05021037868162693, "grad_norm": 2.418508047377956, "learning_rate": 9.989306991521748e-06, "loss": 0.4857, "step": 358 }, { "epoch": 0.050350631136044884, "grad_norm": 2.4960379676376463, "learning_rate": 9.989158015723933e-06, "loss": 0.3945, "step": 359 }, { "epoch": 0.05049088359046283, "grad_norm": 3.0467826633644743, "learning_rate": 9.989008010448216e-06, "loss": 0.4802, "step": 360 }, { "epoch": 0.05063113604488079, "grad_norm": 2.9041844072351926, "learning_rate": 9.988856975725551e-06, "loss": 0.4073, "step": 361 }, { "epoch": 0.050771388499298735, "grad_norm": 2.8008760784700564, "learning_rate": 9.988704911587106e-06, "loss": 0.4451, "step": 362 }, { "epoch": 0.05091164095371669, "grad_norm": 2.1058253055364644, "learning_rate": 9.988551818064257e-06, "loss": 0.4193, "step": 363 }, { "epoch": 0.051051893408134645, "grad_norm": 4.016680983099743, "learning_rate": 9.988397695188592e-06, "loss": 0.443, "step": 364 }, { "epoch": 0.05119214586255259, "grad_norm": 2.240242037639467, "learning_rate": 9.988242542991914e-06, "loss": 0.4573, "step": 365 }, { "epoch": 0.05133239831697055, "grad_norm": 3.900217143151508, "learning_rate": 9.98808636150624e-06, "loss": 0.4569, "step": 366 }, { "epoch": 0.051472650771388496, "grad_norm": 4.681467676131186, "learning_rate": 9.987929150763791e-06, "loss": 0.4644, "step": 367 }, { "epoch": 0.05161290322580645, "grad_norm": 2.497936159831502, "learning_rate": 9.987770910797014e-06, "loss": 0.4262, "step": 368 }, { "epoch": 0.05175315568022441, "grad_norm": 2.5257348152798422, "learning_rate": 9.987611641638555e-06, "loss": 0.4294, "step": 369 }, { "epoch": 0.051893408134642355, "grad_norm": 3.3062575918878454, "learning_rate": 9.98745134332128e-06, "loss": 0.4351, "step": 370 }, { "epoch": 0.05203366058906031, "grad_norm": 2.6053801697367356, "learning_rate": 9.987290015878266e-06, "loss": 0.4043, "step": 371 }, { "epoch": 0.05217391304347826, "grad_norm": 2.7186338027151398, "learning_rate": 9.9871276593428e-06, "loss": 0.4769, "step": 372 }, { "epoch": 0.05231416549789621, "grad_norm": 3.154600259935923, "learning_rate": 9.986964273748385e-06, "loss": 0.4398, "step": 373 }, { "epoch": 0.05245441795231417, "grad_norm": 2.75939818012874, "learning_rate": 9.986799859128735e-06, "loss": 0.3672, "step": 374 }, { "epoch": 0.052594670406732116, "grad_norm": 3.224108337626685, "learning_rate": 9.986634415517774e-06, "loss": 0.4438, "step": 375 }, { "epoch": 0.05273492286115007, "grad_norm": 2.780420586859395, "learning_rate": 9.98646794294964e-06, "loss": 0.4126, "step": 376 }, { "epoch": 0.05287517531556802, "grad_norm": 2.452331997933473, "learning_rate": 9.986300441458683e-06, "loss": 0.4359, "step": 377 }, { "epoch": 0.053015427769985975, "grad_norm": 3.4553352290486927, "learning_rate": 9.986131911079469e-06, "loss": 0.4799, "step": 378 }, { "epoch": 0.05315568022440393, "grad_norm": 2.540674255569296, "learning_rate": 9.98596235184677e-06, "loss": 0.4535, "step": 379 }, { "epoch": 0.05329593267882188, "grad_norm": 4.561415033802088, "learning_rate": 9.985791763795576e-06, "loss": 0.4403, "step": 380 }, { "epoch": 0.05343618513323983, "grad_norm": 3.5456563729858837, "learning_rate": 9.985620146961083e-06, "loss": 0.4166, "step": 381 }, { "epoch": 0.05357643758765778, "grad_norm": 3.056757623060372, "learning_rate": 9.985447501378706e-06, "loss": 0.4446, "step": 382 }, { "epoch": 0.053716690042075736, "grad_norm": 2.65376111280259, "learning_rate": 9.985273827084068e-06, "loss": 0.4174, "step": 383 }, { "epoch": 0.05385694249649369, "grad_norm": 2.2571335137882653, "learning_rate": 9.985099124113006e-06, "loss": 0.4569, "step": 384 }, { "epoch": 0.05399719495091164, "grad_norm": 3.140464816587059, "learning_rate": 9.984923392501567e-06, "loss": 0.405, "step": 385 }, { "epoch": 0.054137447405329595, "grad_norm": 2.978348163837495, "learning_rate": 9.984746632286016e-06, "loss": 0.4393, "step": 386 }, { "epoch": 0.05427769985974754, "grad_norm": 2.911406833327405, "learning_rate": 9.984568843502822e-06, "loss": 0.4244, "step": 387 }, { "epoch": 0.0544179523141655, "grad_norm": 3.448299574934883, "learning_rate": 9.984390026188671e-06, "loss": 0.4606, "step": 388 }, { "epoch": 0.05455820476858345, "grad_norm": 3.858196866373252, "learning_rate": 9.984210180380464e-06, "loss": 0.4029, "step": 389 }, { "epoch": 0.0546984572230014, "grad_norm": 2.456964197744901, "learning_rate": 9.984029306115307e-06, "loss": 0.4547, "step": 390 }, { "epoch": 0.054838709677419356, "grad_norm": 2.7040302326442203, "learning_rate": 9.983847403430525e-06, "loss": 0.3927, "step": 391 }, { "epoch": 0.054978962131837304, "grad_norm": 3.113182611949032, "learning_rate": 9.98366447236365e-06, "loss": 0.4277, "step": 392 }, { "epoch": 0.05511921458625526, "grad_norm": 5.221665350439687, "learning_rate": 9.983480512952432e-06, "loss": 0.4287, "step": 393 }, { "epoch": 0.055259467040673214, "grad_norm": 4.395863079434432, "learning_rate": 9.983295525234827e-06, "loss": 0.4268, "step": 394 }, { "epoch": 0.05539971949509116, "grad_norm": 2.525984762802741, "learning_rate": 9.983109509249004e-06, "loss": 0.4171, "step": 395 }, { "epoch": 0.05553997194950912, "grad_norm": 2.3485680458195644, "learning_rate": 9.98292246503335e-06, "loss": 0.4277, "step": 396 }, { "epoch": 0.055680224403927066, "grad_norm": 2.6902959606908716, "learning_rate": 9.98273439262646e-06, "loss": 0.4234, "step": 397 }, { "epoch": 0.05582047685834502, "grad_norm": 3.229392521660995, "learning_rate": 9.982545292067138e-06, "loss": 0.3998, "step": 398 }, { "epoch": 0.055960729312762976, "grad_norm": 3.3481118816882818, "learning_rate": 9.982355163394407e-06, "loss": 0.4255, "step": 399 }, { "epoch": 0.056100981767180924, "grad_norm": 4.178073122077656, "learning_rate": 9.982164006647497e-06, "loss": 0.4778, "step": 400 }, { "epoch": 0.05624123422159888, "grad_norm": 2.7662857042327764, "learning_rate": 9.981971821865853e-06, "loss": 0.376, "step": 401 }, { "epoch": 0.05638148667601683, "grad_norm": 3.1243894362035056, "learning_rate": 9.98177860908913e-06, "loss": 0.4587, "step": 402 }, { "epoch": 0.05652173913043478, "grad_norm": 2.8668578579450243, "learning_rate": 9.981584368357198e-06, "loss": 0.4969, "step": 403 }, { "epoch": 0.05666199158485274, "grad_norm": 2.9526239202142204, "learning_rate": 9.981389099710132e-06, "loss": 0.4319, "step": 404 }, { "epoch": 0.056802244039270686, "grad_norm": 3.4498605617042895, "learning_rate": 9.981192803188232e-06, "loss": 0.4591, "step": 405 }, { "epoch": 0.05694249649368864, "grad_norm": 2.4476655114294243, "learning_rate": 9.980995478831997e-06, "loss": 0.4806, "step": 406 }, { "epoch": 0.05708274894810659, "grad_norm": 2.60632731005128, "learning_rate": 9.980797126682145e-06, "loss": 0.4234, "step": 407 }, { "epoch": 0.057223001402524544, "grad_norm": 2.547888057004348, "learning_rate": 9.980597746779604e-06, "loss": 0.479, "step": 408 }, { "epoch": 0.0573632538569425, "grad_norm": 2.9135041302765767, "learning_rate": 9.980397339165514e-06, "loss": 0.4603, "step": 409 }, { "epoch": 0.05750350631136045, "grad_norm": 2.4318942630958817, "learning_rate": 9.980195903881231e-06, "loss": 0.3862, "step": 410 }, { "epoch": 0.0576437587657784, "grad_norm": 4.859069225834828, "learning_rate": 9.979993440968317e-06, "loss": 0.4348, "step": 411 }, { "epoch": 0.05778401122019635, "grad_norm": 3.1546327583358766, "learning_rate": 9.979789950468549e-06, "loss": 0.3903, "step": 412 }, { "epoch": 0.057924263674614306, "grad_norm": 2.3941324471315197, "learning_rate": 9.979585432423917e-06, "loss": 0.4625, "step": 413 }, { "epoch": 0.05806451612903226, "grad_norm": 2.1689398163060707, "learning_rate": 9.97937988687662e-06, "loss": 0.3648, "step": 414 }, { "epoch": 0.05820476858345021, "grad_norm": 2.4445118918891358, "learning_rate": 9.979173313869072e-06, "loss": 0.4376, "step": 415 }, { "epoch": 0.058345021037868164, "grad_norm": 2.431349207562791, "learning_rate": 9.9789657134439e-06, "loss": 0.4026, "step": 416 }, { "epoch": 0.05848527349228611, "grad_norm": 2.2644796699256524, "learning_rate": 9.978757085643937e-06, "loss": 0.4361, "step": 417 }, { "epoch": 0.05862552594670407, "grad_norm": 2.9941974791263095, "learning_rate": 9.978547430512235e-06, "loss": 0.4146, "step": 418 }, { "epoch": 0.05876577840112202, "grad_norm": 3.2043005298155127, "learning_rate": 9.978336748092053e-06, "loss": 0.4846, "step": 419 }, { "epoch": 0.05890603085553997, "grad_norm": 2.892740001695145, "learning_rate": 9.978125038426865e-06, "loss": 0.4259, "step": 420 }, { "epoch": 0.059046283309957925, "grad_norm": 2.875252754047582, "learning_rate": 9.977912301560354e-06, "loss": 0.4322, "step": 421 }, { "epoch": 0.059186535764375874, "grad_norm": 3.3902067286830766, "learning_rate": 9.97769853753642e-06, "loss": 0.4289, "step": 422 }, { "epoch": 0.05932678821879383, "grad_norm": 2.283201558656702, "learning_rate": 9.977483746399168e-06, "loss": 0.4062, "step": 423 }, { "epoch": 0.059467040673211784, "grad_norm": 2.9137736732179103, "learning_rate": 9.97726792819292e-06, "loss": 0.4466, "step": 424 }, { "epoch": 0.05960729312762973, "grad_norm": 3.0026838695119156, "learning_rate": 9.97705108296221e-06, "loss": 0.4266, "step": 425 }, { "epoch": 0.05974754558204769, "grad_norm": 2.271799342851588, "learning_rate": 9.976833210751781e-06, "loss": 0.4098, "step": 426 }, { "epoch": 0.059887798036465635, "grad_norm": 2.2688546148723887, "learning_rate": 9.97661431160659e-06, "loss": 0.4412, "step": 427 }, { "epoch": 0.06002805049088359, "grad_norm": 2.2506609051993753, "learning_rate": 9.976394385571805e-06, "loss": 0.4229, "step": 428 }, { "epoch": 0.060168302945301545, "grad_norm": 3.4311202061186155, "learning_rate": 9.976173432692806e-06, "loss": 0.3871, "step": 429 }, { "epoch": 0.06030855539971949, "grad_norm": 2.4693623036970833, "learning_rate": 9.975951453015187e-06, "loss": 0.4449, "step": 430 }, { "epoch": 0.06044880785413745, "grad_norm": 2.119309941572607, "learning_rate": 9.975728446584748e-06, "loss": 0.4375, "step": 431 }, { "epoch": 0.0605890603085554, "grad_norm": 2.781737467019255, "learning_rate": 9.97550441344751e-06, "loss": 0.4723, "step": 432 }, { "epoch": 0.06072931276297335, "grad_norm": 3.916772205202428, "learning_rate": 9.975279353649696e-06, "loss": 0.3807, "step": 433 }, { "epoch": 0.06086956521739131, "grad_norm": 2.697268807135161, "learning_rate": 9.975053267237748e-06, "loss": 0.4048, "step": 434 }, { "epoch": 0.061009817671809255, "grad_norm": 2.2991946178625473, "learning_rate": 9.974826154258318e-06, "loss": 0.4444, "step": 435 }, { "epoch": 0.06115007012622721, "grad_norm": 2.288303913539505, "learning_rate": 9.974598014758267e-06, "loss": 0.4321, "step": 436 }, { "epoch": 0.06129032258064516, "grad_norm": 2.1306842623609916, "learning_rate": 9.97436884878467e-06, "loss": 0.4538, "step": 437 }, { "epoch": 0.06143057503506311, "grad_norm": 2.8779805531325233, "learning_rate": 9.974138656384815e-06, "loss": 0.4544, "step": 438 }, { "epoch": 0.06157082748948107, "grad_norm": 2.6811096154655076, "learning_rate": 9.973907437606201e-06, "loss": 0.4559, "step": 439 }, { "epoch": 0.061711079943899017, "grad_norm": 2.527840755595699, "learning_rate": 9.973675192496539e-06, "loss": 0.3951, "step": 440 }, { "epoch": 0.06185133239831697, "grad_norm": 2.61175465149756, "learning_rate": 9.973441921103748e-06, "loss": 0.4135, "step": 441 }, { "epoch": 0.06199158485273492, "grad_norm": 3.0474108102591577, "learning_rate": 9.973207623475964e-06, "loss": 0.4642, "step": 442 }, { "epoch": 0.062131837307152875, "grad_norm": 3.273171789633585, "learning_rate": 9.972972299661533e-06, "loss": 0.405, "step": 443 }, { "epoch": 0.06227208976157083, "grad_norm": 2.4906402855455507, "learning_rate": 9.972735949709012e-06, "loss": 0.4952, "step": 444 }, { "epoch": 0.06241234221598878, "grad_norm": 2.752149781997046, "learning_rate": 9.97249857366717e-06, "loss": 0.3933, "step": 445 }, { "epoch": 0.06255259467040673, "grad_norm": 5.226205582684293, "learning_rate": 9.972260171584987e-06, "loss": 0.4855, "step": 446 }, { "epoch": 0.06269284712482469, "grad_norm": 2.138832399797569, "learning_rate": 9.972020743511657e-06, "loss": 0.4183, "step": 447 }, { "epoch": 0.06283309957924264, "grad_norm": 2.7575107657947813, "learning_rate": 9.971780289496585e-06, "loss": 0.4044, "step": 448 }, { "epoch": 0.06297335203366058, "grad_norm": 2.551724830411371, "learning_rate": 9.971538809589385e-06, "loss": 0.4481, "step": 449 }, { "epoch": 0.06311360448807854, "grad_norm": 2.798675383867582, "learning_rate": 9.971296303839884e-06, "loss": 0.4512, "step": 450 }, { "epoch": 0.0632538569424965, "grad_norm": 3.428840550010312, "learning_rate": 9.971052772298125e-06, "loss": 0.4238, "step": 451 }, { "epoch": 0.06339410939691445, "grad_norm": 3.709139418412272, "learning_rate": 9.970808215014357e-06, "loss": 0.3889, "step": 452 }, { "epoch": 0.0635343618513324, "grad_norm": 3.030208133309783, "learning_rate": 9.970562632039043e-06, "loss": 0.3957, "step": 453 }, { "epoch": 0.06367461430575035, "grad_norm": 6.885326883150995, "learning_rate": 9.970316023422855e-06, "loss": 0.4285, "step": 454 }, { "epoch": 0.0638148667601683, "grad_norm": 2.5276179688162843, "learning_rate": 9.970068389216681e-06, "loss": 0.4407, "step": 455 }, { "epoch": 0.06395511921458626, "grad_norm": 3.072987718353407, "learning_rate": 9.969819729471621e-06, "loss": 0.4541, "step": 456 }, { "epoch": 0.06409537166900421, "grad_norm": 3.3686920840463737, "learning_rate": 9.96957004423898e-06, "loss": 0.3829, "step": 457 }, { "epoch": 0.06423562412342217, "grad_norm": 4.102687485227266, "learning_rate": 9.96931933357028e-06, "loss": 0.3965, "step": 458 }, { "epoch": 0.06437587657784011, "grad_norm": 4.748896822064187, "learning_rate": 9.969067597517255e-06, "loss": 0.3776, "step": 459 }, { "epoch": 0.06451612903225806, "grad_norm": 7.97001488314475, "learning_rate": 9.968814836131849e-06, "loss": 0.4132, "step": 460 }, { "epoch": 0.06465638148667602, "grad_norm": 2.586267456025277, "learning_rate": 9.968561049466214e-06, "loss": 0.3967, "step": 461 }, { "epoch": 0.06479663394109397, "grad_norm": 3.3607247413390375, "learning_rate": 9.96830623757272e-06, "loss": 0.4624, "step": 462 }, { "epoch": 0.06493688639551193, "grad_norm": 2.81145076253653, "learning_rate": 9.968050400503946e-06, "loss": 0.4169, "step": 463 }, { "epoch": 0.06507713884992987, "grad_norm": 2.3608408031441948, "learning_rate": 9.967793538312683e-06, "loss": 0.3983, "step": 464 }, { "epoch": 0.06521739130434782, "grad_norm": 3.6264460080234047, "learning_rate": 9.967535651051927e-06, "loss": 0.4062, "step": 465 }, { "epoch": 0.06535764375876578, "grad_norm": 3.058153922269525, "learning_rate": 9.967276738774897e-06, "loss": 0.4581, "step": 466 }, { "epoch": 0.06549789621318373, "grad_norm": 2.6000922920279175, "learning_rate": 9.967016801535018e-06, "loss": 0.4484, "step": 467 }, { "epoch": 0.06563814866760169, "grad_norm": 4.530200073674399, "learning_rate": 9.966755839385925e-06, "loss": 0.4339, "step": 468 }, { "epoch": 0.06577840112201963, "grad_norm": 3.1136100719837874, "learning_rate": 9.966493852381463e-06, "loss": 0.4572, "step": 469 }, { "epoch": 0.06591865357643759, "grad_norm": 2.905310037847187, "learning_rate": 9.966230840575693e-06, "loss": 0.424, "step": 470 }, { "epoch": 0.06605890603085554, "grad_norm": 2.9207955880839394, "learning_rate": 9.965966804022887e-06, "loss": 0.4279, "step": 471 }, { "epoch": 0.0661991584852735, "grad_norm": 2.4950931077605185, "learning_rate": 9.965701742777524e-06, "loss": 0.3819, "step": 472 }, { "epoch": 0.06633941093969145, "grad_norm": 2.968480629997674, "learning_rate": 9.9654356568943e-06, "loss": 0.4006, "step": 473 }, { "epoch": 0.06647966339410939, "grad_norm": 2.6876135513802017, "learning_rate": 9.965168546428122e-06, "loss": 0.4062, "step": 474 }, { "epoch": 0.06661991584852735, "grad_norm": 2.6265824407379954, "learning_rate": 9.964900411434103e-06, "loss": 0.4296, "step": 475 }, { "epoch": 0.0667601683029453, "grad_norm": 3.589584191769265, "learning_rate": 9.96463125196757e-06, "loss": 0.4646, "step": 476 }, { "epoch": 0.06690042075736326, "grad_norm": 3.0776954002541586, "learning_rate": 9.964361068084063e-06, "loss": 0.3919, "step": 477 }, { "epoch": 0.06704067321178121, "grad_norm": 2.55216692314013, "learning_rate": 9.964089859839335e-06, "loss": 0.4514, "step": 478 }, { "epoch": 0.06718092566619915, "grad_norm": 3.138857215864084, "learning_rate": 9.963817627289347e-06, "loss": 0.4173, "step": 479 }, { "epoch": 0.06732117812061711, "grad_norm": 3.824038196388918, "learning_rate": 9.96354437049027e-06, "loss": 0.4645, "step": 480 }, { "epoch": 0.06746143057503506, "grad_norm": 3.000051021142068, "learning_rate": 9.963270089498492e-06, "loss": 0.4441, "step": 481 }, { "epoch": 0.06760168302945302, "grad_norm": 2.9219727525321875, "learning_rate": 9.962994784370605e-06, "loss": 0.4251, "step": 482 }, { "epoch": 0.06774193548387097, "grad_norm": 2.1701257920611745, "learning_rate": 9.96271845516342e-06, "loss": 0.4549, "step": 483 }, { "epoch": 0.06788218793828892, "grad_norm": 2.6204877855044986, "learning_rate": 9.962441101933956e-06, "loss": 0.4111, "step": 484 }, { "epoch": 0.06802244039270687, "grad_norm": 2.9931973578388766, "learning_rate": 9.962162724739437e-06, "loss": 0.4482, "step": 485 }, { "epoch": 0.06816269284712483, "grad_norm": 2.1543447950883508, "learning_rate": 9.961883323637312e-06, "loss": 0.397, "step": 486 }, { "epoch": 0.06830294530154278, "grad_norm": 3.734622452330382, "learning_rate": 9.961602898685225e-06, "loss": 0.3931, "step": 487 }, { "epoch": 0.06844319775596074, "grad_norm": 2.50450748360693, "learning_rate": 9.961321449941049e-06, "loss": 0.464, "step": 488 }, { "epoch": 0.06858345021037868, "grad_norm": 2.241605463811463, "learning_rate": 9.961038977462852e-06, "loss": 0.4422, "step": 489 }, { "epoch": 0.06872370266479663, "grad_norm": 2.500196735270521, "learning_rate": 9.960755481308923e-06, "loss": 0.4486, "step": 490 }, { "epoch": 0.06886395511921459, "grad_norm": 2.105636335225222, "learning_rate": 9.960470961537758e-06, "loss": 0.4134, "step": 491 }, { "epoch": 0.06900420757363254, "grad_norm": 2.1332761379363143, "learning_rate": 9.960185418208068e-06, "loss": 0.4311, "step": 492 }, { "epoch": 0.0691444600280505, "grad_norm": 2.252543495257875, "learning_rate": 9.95989885137877e-06, "loss": 0.3975, "step": 493 }, { "epoch": 0.06928471248246844, "grad_norm": 1.877928164000942, "learning_rate": 9.959611261108999e-06, "loss": 0.3964, "step": 494 }, { "epoch": 0.0694249649368864, "grad_norm": 2.127128881053362, "learning_rate": 9.959322647458093e-06, "loss": 0.4041, "step": 495 }, { "epoch": 0.06956521739130435, "grad_norm": 2.386453317220393, "learning_rate": 9.959033010485608e-06, "loss": 0.396, "step": 496 }, { "epoch": 0.0697054698457223, "grad_norm": 2.2658162003440294, "learning_rate": 9.958742350251307e-06, "loss": 0.4343, "step": 497 }, { "epoch": 0.06984572230014026, "grad_norm": 2.793246168750746, "learning_rate": 9.958450666815168e-06, "loss": 0.4587, "step": 498 }, { "epoch": 0.0699859747545582, "grad_norm": 2.8381594525976057, "learning_rate": 9.958157960237376e-06, "loss": 0.4395, "step": 499 }, { "epoch": 0.07012622720897616, "grad_norm": 2.3910692587553366, "learning_rate": 9.95786423057833e-06, "loss": 0.4423, "step": 500 }, { "epoch": 0.07026647966339411, "grad_norm": 2.2315230942406568, "learning_rate": 9.957569477898636e-06, "loss": 0.4346, "step": 501 }, { "epoch": 0.07040673211781207, "grad_norm": 2.716305828898804, "learning_rate": 9.957273702259118e-06, "loss": 0.404, "step": 502 }, { "epoch": 0.07054698457223002, "grad_norm": 2.932740759238852, "learning_rate": 9.956976903720806e-06, "loss": 0.4647, "step": 503 }, { "epoch": 0.07068723702664796, "grad_norm": 2.343874508411583, "learning_rate": 9.956679082344941e-06, "loss": 0.414, "step": 504 }, { "epoch": 0.07082748948106592, "grad_norm": 2.279088080223534, "learning_rate": 9.956380238192978e-06, "loss": 0.3991, "step": 505 }, { "epoch": 0.07096774193548387, "grad_norm": 1.969531070840866, "learning_rate": 9.956080371326582e-06, "loss": 0.4259, "step": 506 }, { "epoch": 0.07110799438990183, "grad_norm": 3.2446055024668534, "learning_rate": 9.955779481807626e-06, "loss": 0.3774, "step": 507 }, { "epoch": 0.07124824684431978, "grad_norm": 2.5281786711362937, "learning_rate": 9.955477569698197e-06, "loss": 0.4195, "step": 508 }, { "epoch": 0.07138849929873772, "grad_norm": 2.253901172546274, "learning_rate": 9.955174635060595e-06, "loss": 0.4324, "step": 509 }, { "epoch": 0.07152875175315568, "grad_norm": 2.2731718452163974, "learning_rate": 9.954870677957327e-06, "loss": 0.4136, "step": 510 }, { "epoch": 0.07166900420757363, "grad_norm": 2.678187715203363, "learning_rate": 9.95456569845111e-06, "loss": 0.4948, "step": 511 }, { "epoch": 0.07180925666199159, "grad_norm": 1.9039431512424003, "learning_rate": 9.954259696604878e-06, "loss": 0.4102, "step": 512 }, { "epoch": 0.07194950911640954, "grad_norm": 3.16024633026936, "learning_rate": 9.95395267248177e-06, "loss": 0.4286, "step": 513 }, { "epoch": 0.07208976157082748, "grad_norm": 2.9262212567585966, "learning_rate": 9.95364462614514e-06, "loss": 0.3897, "step": 514 }, { "epoch": 0.07223001402524544, "grad_norm": 2.308117812615739, "learning_rate": 9.95333555765855e-06, "loss": 0.426, "step": 515 }, { "epoch": 0.0723702664796634, "grad_norm": 2.3842910931823758, "learning_rate": 9.953025467085774e-06, "loss": 0.4173, "step": 516 }, { "epoch": 0.07251051893408135, "grad_norm": 4.239549016540257, "learning_rate": 9.952714354490799e-06, "loss": 0.4429, "step": 517 }, { "epoch": 0.0726507713884993, "grad_norm": 2.6658770961891856, "learning_rate": 9.952402219937817e-06, "loss": 0.4402, "step": 518 }, { "epoch": 0.07279102384291725, "grad_norm": 3.6264705963520583, "learning_rate": 9.952089063491237e-06, "loss": 0.4296, "step": 519 }, { "epoch": 0.0729312762973352, "grad_norm": 5.67574596459696, "learning_rate": 9.951774885215679e-06, "loss": 0.3891, "step": 520 }, { "epoch": 0.07307152875175316, "grad_norm": 3.5136992881261095, "learning_rate": 9.951459685175968e-06, "loss": 0.4451, "step": 521 }, { "epoch": 0.07321178120617111, "grad_norm": 6.318712859605858, "learning_rate": 9.951143463437145e-06, "loss": 0.4053, "step": 522 }, { "epoch": 0.07335203366058907, "grad_norm": 2.8805837725691252, "learning_rate": 9.95082622006446e-06, "loss": 0.4392, "step": 523 }, { "epoch": 0.07349228611500701, "grad_norm": 3.026507413458335, "learning_rate": 9.950507955123372e-06, "loss": 0.4052, "step": 524 }, { "epoch": 0.07363253856942496, "grad_norm": 3.9677406289023214, "learning_rate": 9.950188668679558e-06, "loss": 0.4294, "step": 525 }, { "epoch": 0.07377279102384292, "grad_norm": 2.6406734845828543, "learning_rate": 9.949868360798893e-06, "loss": 0.4434, "step": 526 }, { "epoch": 0.07391304347826087, "grad_norm": 7.1962068314654815, "learning_rate": 9.949547031547475e-06, "loss": 0.444, "step": 527 }, { "epoch": 0.07405329593267883, "grad_norm": 2.363341779958304, "learning_rate": 9.94922468099161e-06, "loss": 0.4043, "step": 528 }, { "epoch": 0.07419354838709677, "grad_norm": 2.604781716512908, "learning_rate": 9.948901309197807e-06, "loss": 0.4414, "step": 529 }, { "epoch": 0.07433380084151472, "grad_norm": 2.395452872764815, "learning_rate": 9.948576916232796e-06, "loss": 0.421, "step": 530 }, { "epoch": 0.07447405329593268, "grad_norm": 1.869005794305938, "learning_rate": 9.948251502163512e-06, "loss": 0.4112, "step": 531 }, { "epoch": 0.07461430575035063, "grad_norm": 2.058564795737436, "learning_rate": 9.947925067057102e-06, "loss": 0.4169, "step": 532 }, { "epoch": 0.07475455820476859, "grad_norm": 2.178728884114789, "learning_rate": 9.94759761098092e-06, "loss": 0.3915, "step": 533 }, { "epoch": 0.07489481065918653, "grad_norm": 2.4635334189493996, "learning_rate": 9.947269134002542e-06, "loss": 0.4013, "step": 534 }, { "epoch": 0.07503506311360449, "grad_norm": 2.319099494526946, "learning_rate": 9.946939636189741e-06, "loss": 0.4422, "step": 535 }, { "epoch": 0.07517531556802244, "grad_norm": 2.4325207446508528, "learning_rate": 9.946609117610508e-06, "loss": 0.4542, "step": 536 }, { "epoch": 0.0753155680224404, "grad_norm": 2.3704946851527944, "learning_rate": 9.946277578333045e-06, "loss": 0.4286, "step": 537 }, { "epoch": 0.07545582047685835, "grad_norm": 4.493791111531237, "learning_rate": 9.945945018425759e-06, "loss": 0.4195, "step": 538 }, { "epoch": 0.07559607293127629, "grad_norm": 3.4720842986794302, "learning_rate": 9.945611437957274e-06, "loss": 0.4568, "step": 539 }, { "epoch": 0.07573632538569425, "grad_norm": 3.338403438061192, "learning_rate": 9.945276836996422e-06, "loss": 0.4358, "step": 540 }, { "epoch": 0.0758765778401122, "grad_norm": 2.053028205436276, "learning_rate": 9.944941215612244e-06, "loss": 0.4184, "step": 541 }, { "epoch": 0.07601683029453016, "grad_norm": 2.228621858944948, "learning_rate": 9.944604573873996e-06, "loss": 0.4477, "step": 542 }, { "epoch": 0.07615708274894811, "grad_norm": 2.261248762089278, "learning_rate": 9.94426691185114e-06, "loss": 0.4258, "step": 543 }, { "epoch": 0.07629733520336605, "grad_norm": 2.819725672993946, "learning_rate": 9.943928229613349e-06, "loss": 0.4036, "step": 544 }, { "epoch": 0.07643758765778401, "grad_norm": 7.578812337949622, "learning_rate": 9.943588527230508e-06, "loss": 0.432, "step": 545 }, { "epoch": 0.07657784011220196, "grad_norm": 2.3253149701138542, "learning_rate": 9.943247804772714e-06, "loss": 0.3997, "step": 546 }, { "epoch": 0.07671809256661992, "grad_norm": 2.6782872403476596, "learning_rate": 9.942906062310272e-06, "loss": 0.4297, "step": 547 }, { "epoch": 0.07685834502103787, "grad_norm": 2.7059734529150243, "learning_rate": 9.942563299913698e-06, "loss": 0.4296, "step": 548 }, { "epoch": 0.07699859747545582, "grad_norm": 3.064931410353403, "learning_rate": 9.942219517653718e-06, "loss": 0.4798, "step": 549 }, { "epoch": 0.07713884992987377, "grad_norm": 2.9262448848792095, "learning_rate": 9.94187471560127e-06, "loss": 0.4008, "step": 550 }, { "epoch": 0.07727910238429173, "grad_norm": 2.4556075248378026, "learning_rate": 9.9415288938275e-06, "loss": 0.4927, "step": 551 }, { "epoch": 0.07741935483870968, "grad_norm": 2.536805919562837, "learning_rate": 9.941182052403768e-06, "loss": 0.4321, "step": 552 }, { "epoch": 0.07755960729312764, "grad_norm": 2.5497361476748943, "learning_rate": 9.940834191401642e-06, "loss": 0.407, "step": 553 }, { "epoch": 0.07769985974754558, "grad_norm": 2.6513769890760357, "learning_rate": 9.940485310892901e-06, "loss": 0.3805, "step": 554 }, { "epoch": 0.07784011220196353, "grad_norm": 2.224983163834213, "learning_rate": 9.94013541094953e-06, "loss": 0.4337, "step": 555 }, { "epoch": 0.07798036465638149, "grad_norm": 2.758949196818198, "learning_rate": 9.939784491643734e-06, "loss": 0.3878, "step": 556 }, { "epoch": 0.07812061711079944, "grad_norm": 4.113823517125366, "learning_rate": 9.939432553047919e-06, "loss": 0.4511, "step": 557 }, { "epoch": 0.0782608695652174, "grad_norm": 2.6464227535492, "learning_rate": 9.939079595234706e-06, "loss": 0.4624, "step": 558 }, { "epoch": 0.07840112201963534, "grad_norm": 2.688141502288919, "learning_rate": 9.938725618276926e-06, "loss": 0.4662, "step": 559 }, { "epoch": 0.0785413744740533, "grad_norm": 2.8573708102708153, "learning_rate": 9.938370622247619e-06, "loss": 0.4127, "step": 560 }, { "epoch": 0.07868162692847125, "grad_norm": 2.8526057595014116, "learning_rate": 9.938014607220036e-06, "loss": 0.3666, "step": 561 }, { "epoch": 0.0788218793828892, "grad_norm": 3.9592941456572706, "learning_rate": 9.93765757326764e-06, "loss": 0.3906, "step": 562 }, { "epoch": 0.07896213183730716, "grad_norm": 2.932131955465213, "learning_rate": 9.9372995204641e-06, "loss": 0.471, "step": 563 }, { "epoch": 0.0791023842917251, "grad_norm": 2.905563273357825, "learning_rate": 9.936940448883299e-06, "loss": 0.4304, "step": 564 }, { "epoch": 0.07924263674614306, "grad_norm": 2.5895361624176068, "learning_rate": 9.936580358599327e-06, "loss": 0.4006, "step": 565 }, { "epoch": 0.07938288920056101, "grad_norm": 3.4988994230395343, "learning_rate": 9.93621924968649e-06, "loss": 0.4132, "step": 566 }, { "epoch": 0.07952314165497897, "grad_norm": 2.964015035696671, "learning_rate": 9.935857122219297e-06, "loss": 0.4112, "step": 567 }, { "epoch": 0.07966339410939692, "grad_norm": 5.810852145342308, "learning_rate": 9.935493976272473e-06, "loss": 0.4046, "step": 568 }, { "epoch": 0.07980364656381486, "grad_norm": 2.410573963340091, "learning_rate": 9.935129811920947e-06, "loss": 0.4047, "step": 569 }, { "epoch": 0.07994389901823282, "grad_norm": 2.6910832492251617, "learning_rate": 9.934764629239863e-06, "loss": 0.4178, "step": 570 }, { "epoch": 0.08008415147265077, "grad_norm": 3.1171618008809063, "learning_rate": 9.934398428304577e-06, "loss": 0.4337, "step": 571 }, { "epoch": 0.08022440392706873, "grad_norm": 2.7934641578768775, "learning_rate": 9.93403120919065e-06, "loss": 0.4095, "step": 572 }, { "epoch": 0.08036465638148668, "grad_norm": 2.688027529728785, "learning_rate": 9.933662971973851e-06, "loss": 0.3839, "step": 573 }, { "epoch": 0.08050490883590462, "grad_norm": 2.9424136033901704, "learning_rate": 9.933293716730172e-06, "loss": 0.4243, "step": 574 }, { "epoch": 0.08064516129032258, "grad_norm": 2.3308109432486637, "learning_rate": 9.932923443535798e-06, "loss": 0.4662, "step": 575 }, { "epoch": 0.08078541374474053, "grad_norm": 2.749524248758854, "learning_rate": 9.932552152467137e-06, "loss": 0.3932, "step": 576 }, { "epoch": 0.08092566619915849, "grad_norm": 2.444681982097326, "learning_rate": 9.9321798436008e-06, "loss": 0.4114, "step": 577 }, { "epoch": 0.08106591865357644, "grad_norm": 3.46460717048328, "learning_rate": 9.931806517013612e-06, "loss": 0.411, "step": 578 }, { "epoch": 0.08120617110799438, "grad_norm": 2.9782210407602308, "learning_rate": 9.931432172782606e-06, "loss": 0.4557, "step": 579 }, { "epoch": 0.08134642356241234, "grad_norm": 2.627155145868136, "learning_rate": 9.931056810985024e-06, "loss": 0.3953, "step": 580 }, { "epoch": 0.0814866760168303, "grad_norm": 6.5731086360346485, "learning_rate": 9.93068043169832e-06, "loss": 0.4036, "step": 581 }, { "epoch": 0.08162692847124825, "grad_norm": 2.563433593389875, "learning_rate": 9.930303035000159e-06, "loss": 0.4301, "step": 582 }, { "epoch": 0.0817671809256662, "grad_norm": 2.813436394682458, "learning_rate": 9.929924620968409e-06, "loss": 0.4587, "step": 583 }, { "epoch": 0.08190743338008415, "grad_norm": 2.2688752110926553, "learning_rate": 9.92954518968116e-06, "loss": 0.4647, "step": 584 }, { "epoch": 0.0820476858345021, "grad_norm": 2.4365217740162155, "learning_rate": 9.929164741216702e-06, "loss": 0.4226, "step": 585 }, { "epoch": 0.08218793828892006, "grad_norm": 2.3070974390215184, "learning_rate": 9.928783275653534e-06, "loss": 0.3934, "step": 586 }, { "epoch": 0.08232819074333801, "grad_norm": 1.9404043148937231, "learning_rate": 9.928400793070375e-06, "loss": 0.4123, "step": 587 }, { "epoch": 0.08246844319775597, "grad_norm": 2.6048628118198662, "learning_rate": 9.928017293546144e-06, "loss": 0.4545, "step": 588 }, { "epoch": 0.08260869565217391, "grad_norm": 2.237539864296864, "learning_rate": 9.927632777159975e-06, "loss": 0.4053, "step": 589 }, { "epoch": 0.08274894810659186, "grad_norm": 2.106464440723593, "learning_rate": 9.927247243991209e-06, "loss": 0.4086, "step": 590 }, { "epoch": 0.08288920056100982, "grad_norm": 4.108699844653068, "learning_rate": 9.9268606941194e-06, "loss": 0.4469, "step": 591 }, { "epoch": 0.08302945301542777, "grad_norm": 2.2622056084572204, "learning_rate": 9.926473127624306e-06, "loss": 0.432, "step": 592 }, { "epoch": 0.08316970546984573, "grad_norm": 3.1122191949742097, "learning_rate": 9.926084544585904e-06, "loss": 0.4296, "step": 593 }, { "epoch": 0.08330995792426367, "grad_norm": 3.4069466534674056, "learning_rate": 9.925694945084369e-06, "loss": 0.4404, "step": 594 }, { "epoch": 0.08345021037868162, "grad_norm": 2.3797838318050246, "learning_rate": 9.925304329200098e-06, "loss": 0.4615, "step": 595 }, { "epoch": 0.08359046283309958, "grad_norm": 2.6397311712432208, "learning_rate": 9.92491269701369e-06, "loss": 0.4209, "step": 596 }, { "epoch": 0.08373071528751753, "grad_norm": 2.2164889352816157, "learning_rate": 9.924520048605955e-06, "loss": 0.4023, "step": 597 }, { "epoch": 0.08387096774193549, "grad_norm": 2.4390800441230716, "learning_rate": 9.924126384057913e-06, "loss": 0.4341, "step": 598 }, { "epoch": 0.08401122019635343, "grad_norm": 2.0206406753833477, "learning_rate": 9.923731703450794e-06, "loss": 0.4465, "step": 599 }, { "epoch": 0.08415147265077139, "grad_norm": 2.3044698321469497, "learning_rate": 9.923336006866038e-06, "loss": 0.3857, "step": 600 }, { "epoch": 0.08429172510518934, "grad_norm": 2.4004909410613644, "learning_rate": 9.922939294385294e-06, "loss": 0.4197, "step": 601 }, { "epoch": 0.0844319775596073, "grad_norm": 2.401645962239625, "learning_rate": 9.922541566090422e-06, "loss": 0.4212, "step": 602 }, { "epoch": 0.08457223001402525, "grad_norm": 2.099050388889036, "learning_rate": 9.922142822063488e-06, "loss": 0.3966, "step": 603 }, { "epoch": 0.08471248246844319, "grad_norm": 3.3282125398826805, "learning_rate": 9.921743062386773e-06, "loss": 0.3842, "step": 604 }, { "epoch": 0.08485273492286115, "grad_norm": 2.508136668896997, "learning_rate": 9.92134228714276e-06, "loss": 0.4454, "step": 605 }, { "epoch": 0.0849929873772791, "grad_norm": 2.700962502618605, "learning_rate": 9.920940496414153e-06, "loss": 0.4267, "step": 606 }, { "epoch": 0.08513323983169706, "grad_norm": 2.771552506430004, "learning_rate": 9.920537690283853e-06, "loss": 0.4177, "step": 607 }, { "epoch": 0.08527349228611501, "grad_norm": 3.061680275338762, "learning_rate": 9.92013386883498e-06, "loss": 0.3966, "step": 608 }, { "epoch": 0.08541374474053295, "grad_norm": 3.2256444775001243, "learning_rate": 9.919729032150855e-06, "loss": 0.4103, "step": 609 }, { "epoch": 0.08555399719495091, "grad_norm": 2.309504966537246, "learning_rate": 9.91932318031502e-06, "loss": 0.4252, "step": 610 }, { "epoch": 0.08569424964936886, "grad_norm": 2.297905100576898, "learning_rate": 9.918916313411213e-06, "loss": 0.3915, "step": 611 }, { "epoch": 0.08583450210378682, "grad_norm": 2.9628528046881666, "learning_rate": 9.918508431523392e-06, "loss": 0.3876, "step": 612 }, { "epoch": 0.08597475455820477, "grad_norm": 2.581944451193296, "learning_rate": 9.91809953473572e-06, "loss": 0.4353, "step": 613 }, { "epoch": 0.08611500701262272, "grad_norm": 6.792611584917445, "learning_rate": 9.917689623132568e-06, "loss": 0.43, "step": 614 }, { "epoch": 0.08625525946704067, "grad_norm": 3.082437505405656, "learning_rate": 9.91727869679852e-06, "loss": 0.4128, "step": 615 }, { "epoch": 0.08639551192145863, "grad_norm": 2.9533436204833015, "learning_rate": 9.916866755818368e-06, "loss": 0.4259, "step": 616 }, { "epoch": 0.08653576437587658, "grad_norm": 2.707020681121143, "learning_rate": 9.916453800277115e-06, "loss": 0.4051, "step": 617 }, { "epoch": 0.08667601683029454, "grad_norm": 4.012798813956297, "learning_rate": 9.916039830259967e-06, "loss": 0.4268, "step": 618 }, { "epoch": 0.08681626928471248, "grad_norm": 2.510555965204729, "learning_rate": 9.915624845852347e-06, "loss": 0.4317, "step": 619 }, { "epoch": 0.08695652173913043, "grad_norm": 3.1114674337633086, "learning_rate": 9.915208847139883e-06, "loss": 0.4148, "step": 620 }, { "epoch": 0.08709677419354839, "grad_norm": 2.6402545195046807, "learning_rate": 9.914791834208415e-06, "loss": 0.4007, "step": 621 }, { "epoch": 0.08723702664796634, "grad_norm": 3.57474888273078, "learning_rate": 9.91437380714399e-06, "loss": 0.3921, "step": 622 }, { "epoch": 0.0873772791023843, "grad_norm": 2.6138733790288042, "learning_rate": 9.913954766032861e-06, "loss": 0.4088, "step": 623 }, { "epoch": 0.08751753155680224, "grad_norm": 2.9242495103828965, "learning_rate": 9.9135347109615e-06, "loss": 0.4424, "step": 624 }, { "epoch": 0.0876577840112202, "grad_norm": 2.130216814566768, "learning_rate": 9.91311364201658e-06, "loss": 0.43, "step": 625 }, { "epoch": 0.08779803646563815, "grad_norm": 2.4060293319385533, "learning_rate": 9.912691559284985e-06, "loss": 0.4293, "step": 626 }, { "epoch": 0.0879382889200561, "grad_norm": 3.259512258914764, "learning_rate": 9.912268462853811e-06, "loss": 0.4063, "step": 627 }, { "epoch": 0.08807854137447406, "grad_norm": 2.4595403671410714, "learning_rate": 9.911844352810359e-06, "loss": 0.3681, "step": 628 }, { "epoch": 0.088218793828892, "grad_norm": 2.8738075768940536, "learning_rate": 9.91141922924214e-06, "loss": 0.4042, "step": 629 }, { "epoch": 0.08835904628330996, "grad_norm": 2.46226712592563, "learning_rate": 9.910993092236878e-06, "loss": 0.4459, "step": 630 }, { "epoch": 0.08849929873772791, "grad_norm": 3.8799595012714176, "learning_rate": 9.910565941882501e-06, "loss": 0.4457, "step": 631 }, { "epoch": 0.08863955119214587, "grad_norm": 2.2927665036378775, "learning_rate": 9.910137778267153e-06, "loss": 0.3553, "step": 632 }, { "epoch": 0.08877980364656382, "grad_norm": 2.375620108511329, "learning_rate": 9.909708601479178e-06, "loss": 0.4236, "step": 633 }, { "epoch": 0.08892005610098176, "grad_norm": 2.371846363965291, "learning_rate": 9.909278411607134e-06, "loss": 0.3367, "step": 634 }, { "epoch": 0.08906030855539972, "grad_norm": 2.7951895467670806, "learning_rate": 9.908847208739788e-06, "loss": 0.4098, "step": 635 }, { "epoch": 0.08920056100981767, "grad_norm": 2.639822753389235, "learning_rate": 9.908414992966119e-06, "loss": 0.4755, "step": 636 }, { "epoch": 0.08934081346423563, "grad_norm": 2.785910056295134, "learning_rate": 9.907981764375307e-06, "loss": 0.4334, "step": 637 }, { "epoch": 0.08948106591865358, "grad_norm": 2.852628660117186, "learning_rate": 9.907547523056748e-06, "loss": 0.4167, "step": 638 }, { "epoch": 0.08962131837307152, "grad_norm": 2.7571054043827314, "learning_rate": 9.907112269100045e-06, "loss": 0.4233, "step": 639 }, { "epoch": 0.08976157082748948, "grad_norm": 3.4838484977865765, "learning_rate": 9.90667600259501e-06, "loss": 0.4067, "step": 640 }, { "epoch": 0.08990182328190743, "grad_norm": 2.853054043241597, "learning_rate": 9.906238723631662e-06, "loss": 0.4684, "step": 641 }, { "epoch": 0.09004207573632539, "grad_norm": 2.3391280313509992, "learning_rate": 9.905800432300232e-06, "loss": 0.4375, "step": 642 }, { "epoch": 0.09018232819074334, "grad_norm": 2.635695421355907, "learning_rate": 9.905361128691156e-06, "loss": 0.4424, "step": 643 }, { "epoch": 0.09032258064516129, "grad_norm": 2.9267385874827374, "learning_rate": 9.904920812895082e-06, "loss": 0.3907, "step": 644 }, { "epoch": 0.09046283309957924, "grad_norm": 3.1888003221441816, "learning_rate": 9.904479485002869e-06, "loss": 0.404, "step": 645 }, { "epoch": 0.0906030855539972, "grad_norm": 2.4179096861791174, "learning_rate": 9.904037145105577e-06, "loss": 0.3998, "step": 646 }, { "epoch": 0.09074333800841515, "grad_norm": 4.689159658989563, "learning_rate": 9.903593793294484e-06, "loss": 0.4289, "step": 647 }, { "epoch": 0.0908835904628331, "grad_norm": 3.2351038014946774, "learning_rate": 9.903149429661072e-06, "loss": 0.4068, "step": 648 }, { "epoch": 0.09102384291725105, "grad_norm": 2.789755342482678, "learning_rate": 9.902704054297028e-06, "loss": 0.4724, "step": 649 }, { "epoch": 0.091164095371669, "grad_norm": 3.7744858195798634, "learning_rate": 9.902257667294259e-06, "loss": 0.4228, "step": 650 }, { "epoch": 0.09130434782608696, "grad_norm": 2.68132800720131, "learning_rate": 9.901810268744868e-06, "loss": 0.3946, "step": 651 }, { "epoch": 0.09144460028050491, "grad_norm": 3.2079914620968415, "learning_rate": 9.901361858741177e-06, "loss": 0.3925, "step": 652 }, { "epoch": 0.09158485273492287, "grad_norm": 2.6380354242903548, "learning_rate": 9.900912437375708e-06, "loss": 0.4617, "step": 653 }, { "epoch": 0.09172510518934081, "grad_norm": 2.6376430671542788, "learning_rate": 9.900462004741198e-06, "loss": 0.4913, "step": 654 }, { "epoch": 0.09186535764375876, "grad_norm": 2.546610438173832, "learning_rate": 9.90001056093059e-06, "loss": 0.3981, "step": 655 }, { "epoch": 0.09200561009817672, "grad_norm": 2.508642136669367, "learning_rate": 9.899558106037039e-06, "loss": 0.4517, "step": 656 }, { "epoch": 0.09214586255259467, "grad_norm": 2.6862991666488, "learning_rate": 9.899104640153904e-06, "loss": 0.3706, "step": 657 }, { "epoch": 0.09228611500701263, "grad_norm": 2.628741549679193, "learning_rate": 9.898650163374751e-06, "loss": 0.4695, "step": 658 }, { "epoch": 0.09242636746143057, "grad_norm": 2.694523173314856, "learning_rate": 9.898194675793365e-06, "loss": 0.4437, "step": 659 }, { "epoch": 0.09256661991584852, "grad_norm": 3.146987619061112, "learning_rate": 9.897738177503729e-06, "loss": 0.4315, "step": 660 }, { "epoch": 0.09270687237026648, "grad_norm": 2.968430672587619, "learning_rate": 9.897280668600037e-06, "loss": 0.4503, "step": 661 }, { "epoch": 0.09284712482468443, "grad_norm": 2.4055953683512343, "learning_rate": 9.896822149176695e-06, "loss": 0.4188, "step": 662 }, { "epoch": 0.09298737727910239, "grad_norm": 2.9929472354682223, "learning_rate": 9.896362619328314e-06, "loss": 0.4037, "step": 663 }, { "epoch": 0.09312762973352033, "grad_norm": 2.250263622522839, "learning_rate": 9.895902079149715e-06, "loss": 0.4621, "step": 664 }, { "epoch": 0.09326788218793829, "grad_norm": 2.427144083946365, "learning_rate": 9.895440528735927e-06, "loss": 0.4444, "step": 665 }, { "epoch": 0.09340813464235624, "grad_norm": 3.001947724356723, "learning_rate": 9.894977968182189e-06, "loss": 0.4368, "step": 666 }, { "epoch": 0.0935483870967742, "grad_norm": 1.7161206333778094, "learning_rate": 9.894514397583947e-06, "loss": 0.3923, "step": 667 }, { "epoch": 0.09368863955119215, "grad_norm": 2.050272211749686, "learning_rate": 9.894049817036854e-06, "loss": 0.4058, "step": 668 }, { "epoch": 0.09382889200561009, "grad_norm": 2.6703072571856445, "learning_rate": 9.893584226636773e-06, "loss": 0.4187, "step": 669 }, { "epoch": 0.09396914446002805, "grad_norm": 2.29934829103304, "learning_rate": 9.893117626479778e-06, "loss": 0.4224, "step": 670 }, { "epoch": 0.094109396914446, "grad_norm": 2.117931952978256, "learning_rate": 9.892650016662144e-06, "loss": 0.4186, "step": 671 }, { "epoch": 0.09424964936886396, "grad_norm": 2.928117254973665, "learning_rate": 9.892181397280365e-06, "loss": 0.3833, "step": 672 }, { "epoch": 0.09438990182328191, "grad_norm": 1.9800764368696875, "learning_rate": 9.891711768431131e-06, "loss": 0.4262, "step": 673 }, { "epoch": 0.09453015427769985, "grad_norm": 2.631203088772597, "learning_rate": 9.891241130211353e-06, "loss": 0.424, "step": 674 }, { "epoch": 0.09467040673211781, "grad_norm": 2.573355036365654, "learning_rate": 9.89076948271814e-06, "loss": 0.4323, "step": 675 }, { "epoch": 0.09481065918653576, "grad_norm": 2.5308140508800396, "learning_rate": 9.89029682604881e-06, "loss": 0.4255, "step": 676 }, { "epoch": 0.09495091164095372, "grad_norm": 6.263074703077405, "learning_rate": 9.8898231603009e-06, "loss": 0.4706, "step": 677 }, { "epoch": 0.09509116409537167, "grad_norm": 1.9784421163386703, "learning_rate": 9.889348485572144e-06, "loss": 0.3975, "step": 678 }, { "epoch": 0.09523141654978962, "grad_norm": 1.7511227956748854, "learning_rate": 9.888872801960486e-06, "loss": 0.4006, "step": 679 }, { "epoch": 0.09537166900420757, "grad_norm": 2.2266958096399727, "learning_rate": 9.888396109564082e-06, "loss": 0.4519, "step": 680 }, { "epoch": 0.09551192145862553, "grad_norm": 3.908997324897981, "learning_rate": 9.887918408481295e-06, "loss": 0.4007, "step": 681 }, { "epoch": 0.09565217391304348, "grad_norm": 2.439371808592663, "learning_rate": 9.887439698810694e-06, "loss": 0.4283, "step": 682 }, { "epoch": 0.09579242636746144, "grad_norm": 3.543474895153141, "learning_rate": 9.886959980651056e-06, "loss": 0.4233, "step": 683 }, { "epoch": 0.09593267882187938, "grad_norm": 2.3517648603493555, "learning_rate": 9.886479254101372e-06, "loss": 0.4512, "step": 684 }, { "epoch": 0.09607293127629733, "grad_norm": 2.459871285180292, "learning_rate": 9.885997519260831e-06, "loss": 0.4385, "step": 685 }, { "epoch": 0.09621318373071529, "grad_norm": 2.7796031919820336, "learning_rate": 9.885514776228837e-06, "loss": 0.425, "step": 686 }, { "epoch": 0.09635343618513324, "grad_norm": 4.073655057481395, "learning_rate": 9.885031025105005e-06, "loss": 0.4417, "step": 687 }, { "epoch": 0.0964936886395512, "grad_norm": 3.3321846572247087, "learning_rate": 9.884546265989148e-06, "loss": 0.392, "step": 688 }, { "epoch": 0.09663394109396914, "grad_norm": 3.0825323318803, "learning_rate": 9.884060498981297e-06, "loss": 0.4687, "step": 689 }, { "epoch": 0.0967741935483871, "grad_norm": 2.915756265152888, "learning_rate": 9.883573724181683e-06, "loss": 0.4377, "step": 690 }, { "epoch": 0.09691444600280505, "grad_norm": 1.9791401576893655, "learning_rate": 9.883085941690752e-06, "loss": 0.4135, "step": 691 }, { "epoch": 0.097054698457223, "grad_norm": 2.3294381144422176, "learning_rate": 9.882597151609153e-06, "loss": 0.463, "step": 692 }, { "epoch": 0.09719495091164096, "grad_norm": 2.5857971706519183, "learning_rate": 9.882107354037743e-06, "loss": 0.4294, "step": 693 }, { "epoch": 0.0973352033660589, "grad_norm": 3.377955237824373, "learning_rate": 9.881616549077591e-06, "loss": 0.4355, "step": 694 }, { "epoch": 0.09747545582047686, "grad_norm": 2.0239998172337748, "learning_rate": 9.881124736829968e-06, "loss": 0.3903, "step": 695 }, { "epoch": 0.09761570827489481, "grad_norm": 2.324796409306966, "learning_rate": 9.880631917396358e-06, "loss": 0.4011, "step": 696 }, { "epoch": 0.09775596072931277, "grad_norm": 1.9300017931045346, "learning_rate": 9.880138090878452e-06, "loss": 0.4243, "step": 697 }, { "epoch": 0.09789621318373072, "grad_norm": 2.0679366388483134, "learning_rate": 9.879643257378146e-06, "loss": 0.4019, "step": 698 }, { "epoch": 0.09803646563814866, "grad_norm": 2.6681583722534667, "learning_rate": 9.879147416997544e-06, "loss": 0.4249, "step": 699 }, { "epoch": 0.09817671809256662, "grad_norm": 2.508378484917336, "learning_rate": 9.878650569838963e-06, "loss": 0.4455, "step": 700 }, { "epoch": 0.09831697054698457, "grad_norm": 2.8439817229379787, "learning_rate": 9.878152716004921e-06, "loss": 0.3742, "step": 701 }, { "epoch": 0.09845722300140253, "grad_norm": 2.649351249312995, "learning_rate": 9.877653855598148e-06, "loss": 0.4235, "step": 702 }, { "epoch": 0.09859747545582048, "grad_norm": 4.604803786348187, "learning_rate": 9.87715398872158e-06, "loss": 0.4095, "step": 703 }, { "epoch": 0.09873772791023842, "grad_norm": 3.1705496889656524, "learning_rate": 9.87665311547836e-06, "loss": 0.3916, "step": 704 }, { "epoch": 0.09887798036465638, "grad_norm": 2.3647080331150105, "learning_rate": 9.87615123597184e-06, "loss": 0.4688, "step": 705 }, { "epoch": 0.09901823281907433, "grad_norm": 3.7469128935284113, "learning_rate": 9.875648350305582e-06, "loss": 0.384, "step": 706 }, { "epoch": 0.09915848527349229, "grad_norm": 2.7851332226158423, "learning_rate": 9.87514445858335e-06, "loss": 0.3958, "step": 707 }, { "epoch": 0.09929873772791024, "grad_norm": 4.455041947943315, "learning_rate": 9.874639560909118e-06, "loss": 0.4299, "step": 708 }, { "epoch": 0.09943899018232819, "grad_norm": 2.4501353713024754, "learning_rate": 9.87413365738707e-06, "loss": 0.4572, "step": 709 }, { "epoch": 0.09957924263674614, "grad_norm": 2.2838563009242687, "learning_rate": 9.873626748121597e-06, "loss": 0.3923, "step": 710 }, { "epoch": 0.0997194950911641, "grad_norm": 2.3631663396973073, "learning_rate": 9.873118833217294e-06, "loss": 0.4312, "step": 711 }, { "epoch": 0.09985974754558205, "grad_norm": 2.4806088866049043, "learning_rate": 9.872609912778966e-06, "loss": 0.4005, "step": 712 }, { "epoch": 0.1, "grad_norm": 2.0023346625757017, "learning_rate": 9.872099986911625e-06, "loss": 0.4369, "step": 713 }, { "epoch": 0.10014025245441795, "grad_norm": 3.2275309367806186, "learning_rate": 9.871589055720489e-06, "loss": 0.4363, "step": 714 }, { "epoch": 0.1002805049088359, "grad_norm": 1.9443648761030785, "learning_rate": 9.87107711931099e-06, "loss": 0.3703, "step": 715 }, { "epoch": 0.10042075736325386, "grad_norm": 2.094399536499077, "learning_rate": 9.870564177788758e-06, "loss": 0.433, "step": 716 }, { "epoch": 0.10056100981767181, "grad_norm": 3.0015091279193054, "learning_rate": 9.870050231259636e-06, "loss": 0.3796, "step": 717 }, { "epoch": 0.10070126227208977, "grad_norm": 2.31805381914514, "learning_rate": 9.869535279829674e-06, "loss": 0.4246, "step": 718 }, { "epoch": 0.10084151472650771, "grad_norm": 2.3824709788233105, "learning_rate": 9.86901932360513e-06, "loss": 0.4044, "step": 719 }, { "epoch": 0.10098176718092566, "grad_norm": 1.8945659201684293, "learning_rate": 9.868502362692463e-06, "loss": 0.4089, "step": 720 }, { "epoch": 0.10112201963534362, "grad_norm": 2.5057419640142853, "learning_rate": 9.867984397198349e-06, "loss": 0.4598, "step": 721 }, { "epoch": 0.10126227208976157, "grad_norm": 2.706606877078446, "learning_rate": 9.867465427229665e-06, "loss": 0.4719, "step": 722 }, { "epoch": 0.10140252454417953, "grad_norm": 2.8465567351234817, "learning_rate": 9.866945452893497e-06, "loss": 0.4188, "step": 723 }, { "epoch": 0.10154277699859747, "grad_norm": 2.111371261339523, "learning_rate": 9.866424474297139e-06, "loss": 0.4069, "step": 724 }, { "epoch": 0.10168302945301542, "grad_norm": 3.4225936901345775, "learning_rate": 9.86590249154809e-06, "loss": 0.4465, "step": 725 }, { "epoch": 0.10182328190743338, "grad_norm": 2.1327977441968957, "learning_rate": 9.865379504754056e-06, "loss": 0.4267, "step": 726 }, { "epoch": 0.10196353436185134, "grad_norm": 3.295065288215254, "learning_rate": 9.864855514022955e-06, "loss": 0.3909, "step": 727 }, { "epoch": 0.10210378681626929, "grad_norm": 2.563103488477448, "learning_rate": 9.864330519462906e-06, "loss": 0.4222, "step": 728 }, { "epoch": 0.10224403927068723, "grad_norm": 2.233006304904973, "learning_rate": 9.86380452118224e-06, "loss": 0.4569, "step": 729 }, { "epoch": 0.10238429172510519, "grad_norm": 2.0686464911897042, "learning_rate": 9.863277519289493e-06, "loss": 0.4134, "step": 730 }, { "epoch": 0.10252454417952314, "grad_norm": 2.2164254705048148, "learning_rate": 9.862749513893405e-06, "loss": 0.4526, "step": 731 }, { "epoch": 0.1026647966339411, "grad_norm": 2.1318399636356853, "learning_rate": 9.862220505102933e-06, "loss": 0.4812, "step": 732 }, { "epoch": 0.10280504908835905, "grad_norm": 3.259832327807662, "learning_rate": 9.861690493027226e-06, "loss": 0.4289, "step": 733 }, { "epoch": 0.10294530154277699, "grad_norm": 1.8344255430676608, "learning_rate": 9.861159477775653e-06, "loss": 0.4309, "step": 734 }, { "epoch": 0.10308555399719495, "grad_norm": 2.3153925152211743, "learning_rate": 9.860627459457785e-06, "loss": 0.3976, "step": 735 }, { "epoch": 0.1032258064516129, "grad_norm": 2.228805108621936, "learning_rate": 9.8600944381834e-06, "loss": 0.3945, "step": 736 }, { "epoch": 0.10336605890603086, "grad_norm": 2.957773092887003, "learning_rate": 9.859560414062483e-06, "loss": 0.4616, "step": 737 }, { "epoch": 0.10350631136044881, "grad_norm": 4.192957268506033, "learning_rate": 9.859025387205225e-06, "loss": 0.3786, "step": 738 }, { "epoch": 0.10364656381486675, "grad_norm": 2.261538799815849, "learning_rate": 9.858489357722028e-06, "loss": 0.4094, "step": 739 }, { "epoch": 0.10378681626928471, "grad_norm": 1.9781449809579394, "learning_rate": 9.857952325723496e-06, "loss": 0.4258, "step": 740 }, { "epoch": 0.10392706872370266, "grad_norm": 3.0515006141618226, "learning_rate": 9.857414291320441e-06, "loss": 0.4287, "step": 741 }, { "epoch": 0.10406732117812062, "grad_norm": 2.941254671403553, "learning_rate": 9.856875254623883e-06, "loss": 0.4309, "step": 742 }, { "epoch": 0.10420757363253857, "grad_norm": 3.2492503255239478, "learning_rate": 9.85633521574505e-06, "loss": 0.3882, "step": 743 }, { "epoch": 0.10434782608695652, "grad_norm": 2.8602378668936614, "learning_rate": 9.855794174795374e-06, "loss": 0.431, "step": 744 }, { "epoch": 0.10448807854137447, "grad_norm": 2.165234300123747, "learning_rate": 9.855252131886495e-06, "loss": 0.4007, "step": 745 }, { "epoch": 0.10462833099579243, "grad_norm": 2.9322890467431586, "learning_rate": 9.854709087130261e-06, "loss": 0.4117, "step": 746 }, { "epoch": 0.10476858345021038, "grad_norm": 3.8150778638260667, "learning_rate": 9.854165040638724e-06, "loss": 0.3997, "step": 747 }, { "epoch": 0.10490883590462834, "grad_norm": 2.3792130095538573, "learning_rate": 9.853619992524144e-06, "loss": 0.425, "step": 748 }, { "epoch": 0.10504908835904628, "grad_norm": 2.2364872904944515, "learning_rate": 9.85307394289899e-06, "loss": 0.3893, "step": 749 }, { "epoch": 0.10518934081346423, "grad_norm": 2.5435134140329834, "learning_rate": 9.852526891875932e-06, "loss": 0.4489, "step": 750 }, { "epoch": 0.10532959326788219, "grad_norm": 3.293084446875812, "learning_rate": 9.851978839567856e-06, "loss": 0.4373, "step": 751 }, { "epoch": 0.10546984572230014, "grad_norm": 2.177715325379126, "learning_rate": 9.851429786087842e-06, "loss": 0.457, "step": 752 }, { "epoch": 0.1056100981767181, "grad_norm": 3.253567571519585, "learning_rate": 9.850879731549188e-06, "loss": 0.4378, "step": 753 }, { "epoch": 0.10575035063113604, "grad_norm": 2.5268366450838724, "learning_rate": 9.85032867606539e-06, "loss": 0.4572, "step": 754 }, { "epoch": 0.105890603085554, "grad_norm": 2.540245179785859, "learning_rate": 9.84977661975016e-06, "loss": 0.3745, "step": 755 }, { "epoch": 0.10603085553997195, "grad_norm": 3.3641119627764624, "learning_rate": 9.849223562717404e-06, "loss": 0.4701, "step": 756 }, { "epoch": 0.1061711079943899, "grad_norm": 3.351479411484646, "learning_rate": 9.848669505081248e-06, "loss": 0.4161, "step": 757 }, { "epoch": 0.10631136044880786, "grad_norm": 3.0313470569291834, "learning_rate": 9.848114446956015e-06, "loss": 0.4443, "step": 758 }, { "epoch": 0.1064516129032258, "grad_norm": 3.397566339492137, "learning_rate": 9.847558388456237e-06, "loss": 0.4039, "step": 759 }, { "epoch": 0.10659186535764376, "grad_norm": 3.120050707184945, "learning_rate": 9.847001329696653e-06, "loss": 0.4159, "step": 760 }, { "epoch": 0.10673211781206171, "grad_norm": 3.133738849317252, "learning_rate": 9.846443270792209e-06, "loss": 0.417, "step": 761 }, { "epoch": 0.10687237026647967, "grad_norm": 2.726730210357395, "learning_rate": 9.845884211858054e-06, "loss": 0.3905, "step": 762 }, { "epoch": 0.10701262272089762, "grad_norm": 2.4747384751989214, "learning_rate": 9.84532415300955e-06, "loss": 0.4183, "step": 763 }, { "epoch": 0.10715287517531556, "grad_norm": 2.678313233783397, "learning_rate": 9.84476309436226e-06, "loss": 0.3922, "step": 764 }, { "epoch": 0.10729312762973352, "grad_norm": 2.292223313668754, "learning_rate": 9.844201036031952e-06, "loss": 0.4401, "step": 765 }, { "epoch": 0.10743338008415147, "grad_norm": 2.389456369829056, "learning_rate": 9.843637978134604e-06, "loss": 0.3891, "step": 766 }, { "epoch": 0.10757363253856943, "grad_norm": 2.6836818808308855, "learning_rate": 9.843073920786402e-06, "loss": 0.4201, "step": 767 }, { "epoch": 0.10771388499298738, "grad_norm": 2.0464798640554407, "learning_rate": 9.84250886410373e-06, "loss": 0.4135, "step": 768 }, { "epoch": 0.10785413744740532, "grad_norm": 3.007364769841982, "learning_rate": 9.841942808203188e-06, "loss": 0.3846, "step": 769 }, { "epoch": 0.10799438990182328, "grad_norm": 2.807392823915712, "learning_rate": 9.841375753201575e-06, "loss": 0.3458, "step": 770 }, { "epoch": 0.10813464235624123, "grad_norm": 2.824721759833576, "learning_rate": 9.8408076992159e-06, "loss": 0.4155, "step": 771 }, { "epoch": 0.10827489481065919, "grad_norm": 2.45043866259281, "learning_rate": 9.840238646363378e-06, "loss": 0.4109, "step": 772 }, { "epoch": 0.10841514726507714, "grad_norm": 2.7188753997997464, "learning_rate": 9.839668594761427e-06, "loss": 0.4679, "step": 773 }, { "epoch": 0.10855539971949509, "grad_norm": 2.3847993165681585, "learning_rate": 9.839097544527674e-06, "loss": 0.4439, "step": 774 }, { "epoch": 0.10869565217391304, "grad_norm": 2.776012675770339, "learning_rate": 9.838525495779952e-06, "loss": 0.4231, "step": 775 }, { "epoch": 0.108835904628331, "grad_norm": 7.936009304610304, "learning_rate": 9.837952448636298e-06, "loss": 0.4633, "step": 776 }, { "epoch": 0.10897615708274895, "grad_norm": 3.7583725605092515, "learning_rate": 9.837378403214957e-06, "loss": 0.4045, "step": 777 }, { "epoch": 0.1091164095371669, "grad_norm": 3.702018089485178, "learning_rate": 9.836803359634379e-06, "loss": 0.428, "step": 778 }, { "epoch": 0.10925666199158485, "grad_norm": 2.394826948567772, "learning_rate": 9.836227318013219e-06, "loss": 0.4636, "step": 779 }, { "epoch": 0.1093969144460028, "grad_norm": 2.836542238434387, "learning_rate": 9.835650278470343e-06, "loss": 0.4353, "step": 780 }, { "epoch": 0.10953716690042076, "grad_norm": 2.2712740222624177, "learning_rate": 9.835072241124815e-06, "loss": 0.3667, "step": 781 }, { "epoch": 0.10967741935483871, "grad_norm": 3.353774543413632, "learning_rate": 9.834493206095911e-06, "loss": 0.414, "step": 782 }, { "epoch": 0.10981767180925667, "grad_norm": 2.513856633531474, "learning_rate": 9.83391317350311e-06, "loss": 0.4186, "step": 783 }, { "epoch": 0.10995792426367461, "grad_norm": 2.148116986035295, "learning_rate": 9.833332143466099e-06, "loss": 0.4224, "step": 784 }, { "epoch": 0.11009817671809256, "grad_norm": 2.120995057340813, "learning_rate": 9.832750116104768e-06, "loss": 0.4, "step": 785 }, { "epoch": 0.11023842917251052, "grad_norm": 2.9049666657890763, "learning_rate": 9.832167091539215e-06, "loss": 0.4156, "step": 786 }, { "epoch": 0.11037868162692847, "grad_norm": 2.0270828931859612, "learning_rate": 9.831583069889742e-06, "loss": 0.4413, "step": 787 }, { "epoch": 0.11051893408134643, "grad_norm": 2.719837190579929, "learning_rate": 9.830998051276858e-06, "loss": 0.4132, "step": 788 }, { "epoch": 0.11065918653576437, "grad_norm": 6.453595825458017, "learning_rate": 9.83041203582128e-06, "loss": 0.413, "step": 789 }, { "epoch": 0.11079943899018233, "grad_norm": 2.571938600367921, "learning_rate": 9.829825023643926e-06, "loss": 0.4153, "step": 790 }, { "epoch": 0.11093969144460028, "grad_norm": 4.072679881882209, "learning_rate": 9.829237014865921e-06, "loss": 0.4188, "step": 791 }, { "epoch": 0.11107994389901824, "grad_norm": 2.567249269955474, "learning_rate": 9.828648009608598e-06, "loss": 0.4763, "step": 792 }, { "epoch": 0.11122019635343619, "grad_norm": 3.0306669638024233, "learning_rate": 9.828058007993496e-06, "loss": 0.4551, "step": 793 }, { "epoch": 0.11136044880785413, "grad_norm": 2.5668201780444253, "learning_rate": 9.827467010142352e-06, "loss": 0.4332, "step": 794 }, { "epoch": 0.11150070126227209, "grad_norm": 2.196304694391903, "learning_rate": 9.82687501617712e-06, "loss": 0.429, "step": 795 }, { "epoch": 0.11164095371669004, "grad_norm": 2.0864192106401043, "learning_rate": 9.826282026219953e-06, "loss": 0.4301, "step": 796 }, { "epoch": 0.111781206171108, "grad_norm": 2.979370074675202, "learning_rate": 9.825688040393206e-06, "loss": 0.4356, "step": 797 }, { "epoch": 0.11192145862552595, "grad_norm": 2.701817455362829, "learning_rate": 9.825093058819448e-06, "loss": 0.4379, "step": 798 }, { "epoch": 0.1120617110799439, "grad_norm": 2.8518501032190433, "learning_rate": 9.824497081621449e-06, "loss": 0.3844, "step": 799 }, { "epoch": 0.11220196353436185, "grad_norm": 3.0058642927083716, "learning_rate": 9.823900108922183e-06, "loss": 0.4006, "step": 800 }, { "epoch": 0.1123422159887798, "grad_norm": 2.612046752744941, "learning_rate": 9.823302140844833e-06, "loss": 0.4017, "step": 801 }, { "epoch": 0.11248246844319776, "grad_norm": 11.518853696376025, "learning_rate": 9.822703177512783e-06, "loss": 0.4233, "step": 802 }, { "epoch": 0.11262272089761571, "grad_norm": 2.718768152636732, "learning_rate": 9.822103219049625e-06, "loss": 0.4791, "step": 803 }, { "epoch": 0.11276297335203365, "grad_norm": 3.4314865225341475, "learning_rate": 9.82150226557916e-06, "loss": 0.4689, "step": 804 }, { "epoch": 0.11290322580645161, "grad_norm": 3.1112021440478483, "learning_rate": 9.820900317225388e-06, "loss": 0.4518, "step": 805 }, { "epoch": 0.11304347826086956, "grad_norm": 2.399677465541807, "learning_rate": 9.820297374112518e-06, "loss": 0.3647, "step": 806 }, { "epoch": 0.11318373071528752, "grad_norm": 2.4453337549620375, "learning_rate": 9.81969343636496e-06, "loss": 0.439, "step": 807 }, { "epoch": 0.11332398316970548, "grad_norm": 2.6232748266317882, "learning_rate": 9.819088504107335e-06, "loss": 0.3992, "step": 808 }, { "epoch": 0.11346423562412342, "grad_norm": 4.417556649035361, "learning_rate": 9.818482577464466e-06, "loss": 0.4111, "step": 809 }, { "epoch": 0.11360448807854137, "grad_norm": 2.2717629259425403, "learning_rate": 9.817875656561382e-06, "loss": 0.4294, "step": 810 }, { "epoch": 0.11374474053295933, "grad_norm": 2.3737422222634765, "learning_rate": 9.817267741523318e-06, "loss": 0.4107, "step": 811 }, { "epoch": 0.11388499298737728, "grad_norm": 3.380901968878859, "learning_rate": 9.816658832475709e-06, "loss": 0.4314, "step": 812 }, { "epoch": 0.11402524544179524, "grad_norm": 3.1898527251301148, "learning_rate": 9.816048929544202e-06, "loss": 0.3904, "step": 813 }, { "epoch": 0.11416549789621318, "grad_norm": 2.8553862553276845, "learning_rate": 9.815438032854648e-06, "loss": 0.3901, "step": 814 }, { "epoch": 0.11430575035063113, "grad_norm": 2.5111782032191092, "learning_rate": 9.814826142533098e-06, "loss": 0.3837, "step": 815 }, { "epoch": 0.11444600280504909, "grad_norm": 3.4285194302203448, "learning_rate": 9.814213258705813e-06, "loss": 0.4319, "step": 816 }, { "epoch": 0.11458625525946704, "grad_norm": 3.1309580653503524, "learning_rate": 9.813599381499256e-06, "loss": 0.4072, "step": 817 }, { "epoch": 0.114726507713885, "grad_norm": 2.482951014059725, "learning_rate": 9.812984511040099e-06, "loss": 0.4587, "step": 818 }, { "epoch": 0.11486676016830294, "grad_norm": 2.1088340913459738, "learning_rate": 9.812368647455212e-06, "loss": 0.408, "step": 819 }, { "epoch": 0.1150070126227209, "grad_norm": 4.06988909783507, "learning_rate": 9.811751790871677e-06, "loss": 0.4454, "step": 820 }, { "epoch": 0.11514726507713885, "grad_norm": 1.8865566144506738, "learning_rate": 9.811133941416778e-06, "loss": 0.392, "step": 821 }, { "epoch": 0.1152875175315568, "grad_norm": 12.298435911891652, "learning_rate": 9.810515099218004e-06, "loss": 0.4354, "step": 822 }, { "epoch": 0.11542776998597476, "grad_norm": 3.182664232354045, "learning_rate": 9.809895264403046e-06, "loss": 0.4247, "step": 823 }, { "epoch": 0.1155680224403927, "grad_norm": 2.258350875711285, "learning_rate": 9.809274437099807e-06, "loss": 0.4286, "step": 824 }, { "epoch": 0.11570827489481066, "grad_norm": 2.4596109357066225, "learning_rate": 9.808652617436386e-06, "loss": 0.4023, "step": 825 }, { "epoch": 0.11584852734922861, "grad_norm": 2.6044013769554795, "learning_rate": 9.808029805541097e-06, "loss": 0.3784, "step": 826 }, { "epoch": 0.11598877980364657, "grad_norm": 2.3819645608016073, "learning_rate": 9.807406001542447e-06, "loss": 0.4447, "step": 827 }, { "epoch": 0.11612903225806452, "grad_norm": 2.540778416579888, "learning_rate": 9.806781205569155e-06, "loss": 0.4323, "step": 828 }, { "epoch": 0.11626928471248246, "grad_norm": 2.5052212551574122, "learning_rate": 9.806155417750146e-06, "loss": 0.4013, "step": 829 }, { "epoch": 0.11640953716690042, "grad_norm": 3.5601511121674716, "learning_rate": 9.805528638214543e-06, "loss": 0.4157, "step": 830 }, { "epoch": 0.11654978962131837, "grad_norm": 3.4347167798684883, "learning_rate": 9.80490086709168e-06, "loss": 0.426, "step": 831 }, { "epoch": 0.11669004207573633, "grad_norm": 2.1668793011521705, "learning_rate": 9.804272104511093e-06, "loss": 0.4199, "step": 832 }, { "epoch": 0.11683029453015428, "grad_norm": 2.790386323878258, "learning_rate": 9.803642350602524e-06, "loss": 0.4641, "step": 833 }, { "epoch": 0.11697054698457222, "grad_norm": 2.229598267981134, "learning_rate": 9.803011605495916e-06, "loss": 0.4156, "step": 834 }, { "epoch": 0.11711079943899018, "grad_norm": 2.372155242567933, "learning_rate": 9.802379869321419e-06, "loss": 0.3742, "step": 835 }, { "epoch": 0.11725105189340813, "grad_norm": 2.5267910714167305, "learning_rate": 9.801747142209388e-06, "loss": 0.41, "step": 836 }, { "epoch": 0.11739130434782609, "grad_norm": 2.798671635514141, "learning_rate": 9.801113424290381e-06, "loss": 0.4195, "step": 837 }, { "epoch": 0.11753155680224404, "grad_norm": 2.201043951993503, "learning_rate": 9.800478715695165e-06, "loss": 0.3917, "step": 838 }, { "epoch": 0.11767180925666199, "grad_norm": 2.8375879954671084, "learning_rate": 9.799843016554701e-06, "loss": 0.4322, "step": 839 }, { "epoch": 0.11781206171107994, "grad_norm": 3.0192007103507255, "learning_rate": 9.799206327000168e-06, "loss": 0.4355, "step": 840 }, { "epoch": 0.1179523141654979, "grad_norm": 2.4579407868853793, "learning_rate": 9.798568647162939e-06, "loss": 0.3999, "step": 841 }, { "epoch": 0.11809256661991585, "grad_norm": 2.868648312829615, "learning_rate": 9.797929977174593e-06, "loss": 0.4202, "step": 842 }, { "epoch": 0.1182328190743338, "grad_norm": 2.444392704656942, "learning_rate": 9.79729031716692e-06, "loss": 0.4535, "step": 843 }, { "epoch": 0.11837307152875175, "grad_norm": 2.4466970990477193, "learning_rate": 9.796649667271905e-06, "loss": 0.4754, "step": 844 }, { "epoch": 0.1185133239831697, "grad_norm": 2.1970605373037944, "learning_rate": 9.796008027621744e-06, "loss": 0.4266, "step": 845 }, { "epoch": 0.11865357643758766, "grad_norm": 3.0419740042594596, "learning_rate": 9.795365398348833e-06, "loss": 0.418, "step": 846 }, { "epoch": 0.11879382889200561, "grad_norm": 3.108357632075865, "learning_rate": 9.794721779585776e-06, "loss": 0.4455, "step": 847 }, { "epoch": 0.11893408134642357, "grad_norm": 3.702574354992489, "learning_rate": 9.794077171465376e-06, "loss": 0.4027, "step": 848 }, { "epoch": 0.11907433380084151, "grad_norm": 2.5720740314288455, "learning_rate": 9.79343157412065e-06, "loss": 0.4496, "step": 849 }, { "epoch": 0.11921458625525946, "grad_norm": 2.6534680327852422, "learning_rate": 9.792784987684804e-06, "loss": 0.4453, "step": 850 }, { "epoch": 0.11935483870967742, "grad_norm": 3.4562435653654218, "learning_rate": 9.792137412291265e-06, "loss": 0.4067, "step": 851 }, { "epoch": 0.11949509116409537, "grad_norm": 3.6623015574572646, "learning_rate": 9.791488848073649e-06, "loss": 0.4412, "step": 852 }, { "epoch": 0.11963534361851333, "grad_norm": 2.262821962831728, "learning_rate": 9.790839295165786e-06, "loss": 0.3982, "step": 853 }, { "epoch": 0.11977559607293127, "grad_norm": 2.088117053722709, "learning_rate": 9.790188753701704e-06, "loss": 0.4554, "step": 854 }, { "epoch": 0.11991584852734923, "grad_norm": 2.795286356087783, "learning_rate": 9.789537223815642e-06, "loss": 0.4554, "step": 855 }, { "epoch": 0.12005610098176718, "grad_norm": 2.168853414307077, "learning_rate": 9.788884705642035e-06, "loss": 0.4453, "step": 856 }, { "epoch": 0.12019635343618514, "grad_norm": 2.8575380188289246, "learning_rate": 9.788231199315528e-06, "loss": 0.4203, "step": 857 }, { "epoch": 0.12033660589060309, "grad_norm": 2.2694481524226155, "learning_rate": 9.787576704970965e-06, "loss": 0.4025, "step": 858 }, { "epoch": 0.12047685834502103, "grad_norm": 1.9459592670909753, "learning_rate": 9.786921222743397e-06, "loss": 0.3947, "step": 859 }, { "epoch": 0.12061711079943899, "grad_norm": 4.6113394996021295, "learning_rate": 9.78626475276808e-06, "loss": 0.4228, "step": 860 }, { "epoch": 0.12075736325385694, "grad_norm": 2.7197508449600467, "learning_rate": 9.78560729518047e-06, "loss": 0.3511, "step": 861 }, { "epoch": 0.1208976157082749, "grad_norm": 3.3173452964190853, "learning_rate": 9.78494885011623e-06, "loss": 0.4611, "step": 862 }, { "epoch": 0.12103786816269285, "grad_norm": 2.579088620518456, "learning_rate": 9.784289417711225e-06, "loss": 0.4245, "step": 863 }, { "epoch": 0.1211781206171108, "grad_norm": 2.41681768230522, "learning_rate": 9.783628998101525e-06, "loss": 0.3745, "step": 864 }, { "epoch": 0.12131837307152875, "grad_norm": 2.52596154364446, "learning_rate": 9.7829675914234e-06, "loss": 0.4442, "step": 865 }, { "epoch": 0.1214586255259467, "grad_norm": 2.217709256935435, "learning_rate": 9.782305197813332e-06, "loss": 0.3895, "step": 866 }, { "epoch": 0.12159887798036466, "grad_norm": 3.0233614810977767, "learning_rate": 9.781641817407997e-06, "loss": 0.4022, "step": 867 }, { "epoch": 0.12173913043478261, "grad_norm": 3.2395492666401555, "learning_rate": 9.78097745034428e-06, "loss": 0.4439, "step": 868 }, { "epoch": 0.12187938288920055, "grad_norm": 2.616566507483794, "learning_rate": 9.780312096759269e-06, "loss": 0.4003, "step": 869 }, { "epoch": 0.12201963534361851, "grad_norm": 2.7211906615180435, "learning_rate": 9.779645756790255e-06, "loss": 0.3798, "step": 870 }, { "epoch": 0.12215988779803647, "grad_norm": 2.6812172129656013, "learning_rate": 9.77897843057473e-06, "loss": 0.4149, "step": 871 }, { "epoch": 0.12230014025245442, "grad_norm": 2.76963326033593, "learning_rate": 9.778310118250397e-06, "loss": 0.3767, "step": 872 }, { "epoch": 0.12244039270687238, "grad_norm": 2.943419316016232, "learning_rate": 9.777640819955154e-06, "loss": 0.3773, "step": 873 }, { "epoch": 0.12258064516129032, "grad_norm": 2.0576256460804427, "learning_rate": 9.776970535827109e-06, "loss": 0.4245, "step": 874 }, { "epoch": 0.12272089761570827, "grad_norm": 2.1816033588119184, "learning_rate": 9.776299266004565e-06, "loss": 0.3999, "step": 875 }, { "epoch": 0.12286115007012623, "grad_norm": 2.684569957257161, "learning_rate": 9.775627010626039e-06, "loss": 0.3935, "step": 876 }, { "epoch": 0.12300140252454418, "grad_norm": 2.4828538376263944, "learning_rate": 9.774953769830245e-06, "loss": 0.4415, "step": 877 }, { "epoch": 0.12314165497896214, "grad_norm": 2.1320748083935577, "learning_rate": 9.7742795437561e-06, "loss": 0.4531, "step": 878 }, { "epoch": 0.12328190743338008, "grad_norm": 2.6471226672034236, "learning_rate": 9.77360433254273e-06, "loss": 0.4024, "step": 879 }, { "epoch": 0.12342215988779803, "grad_norm": 2.0813399378500494, "learning_rate": 9.772928136329454e-06, "loss": 0.4201, "step": 880 }, { "epoch": 0.12356241234221599, "grad_norm": 2.34991904585982, "learning_rate": 9.772250955255804e-06, "loss": 0.4167, "step": 881 }, { "epoch": 0.12370266479663394, "grad_norm": 2.025609680513328, "learning_rate": 9.77157278946151e-06, "loss": 0.4023, "step": 882 }, { "epoch": 0.1238429172510519, "grad_norm": 3.895887622692978, "learning_rate": 9.77089363908651e-06, "loss": 0.4525, "step": 883 }, { "epoch": 0.12398316970546984, "grad_norm": 2.4118179701255458, "learning_rate": 9.770213504270939e-06, "loss": 0.4303, "step": 884 }, { "epoch": 0.1241234221598878, "grad_norm": 2.529015484920472, "learning_rate": 9.769532385155137e-06, "loss": 0.4452, "step": 885 }, { "epoch": 0.12426367461430575, "grad_norm": 2.3987646221532235, "learning_rate": 9.768850281879651e-06, "loss": 0.3993, "step": 886 }, { "epoch": 0.1244039270687237, "grad_norm": 3.0088480170100538, "learning_rate": 9.768167194585227e-06, "loss": 0.4171, "step": 887 }, { "epoch": 0.12454417952314166, "grad_norm": 3.5148509890144113, "learning_rate": 9.767483123412817e-06, "loss": 0.4016, "step": 888 }, { "epoch": 0.1246844319775596, "grad_norm": 2.9459247306950833, "learning_rate": 9.766798068503572e-06, "loss": 0.4302, "step": 889 }, { "epoch": 0.12482468443197756, "grad_norm": 2.6089431496710183, "learning_rate": 9.766112029998847e-06, "loss": 0.4113, "step": 890 }, { "epoch": 0.12496493688639551, "grad_norm": 2.138543557972791, "learning_rate": 9.765425008040206e-06, "loss": 0.414, "step": 891 }, { "epoch": 0.12510518934081347, "grad_norm": 3.734299279885769, "learning_rate": 9.764737002769406e-06, "loss": 0.4288, "step": 892 }, { "epoch": 0.12524544179523142, "grad_norm": 3.4320503732768075, "learning_rate": 9.764048014328417e-06, "loss": 0.4266, "step": 893 }, { "epoch": 0.12538569424964938, "grad_norm": 2.7518215215660256, "learning_rate": 9.763358042859403e-06, "loss": 0.4388, "step": 894 }, { "epoch": 0.12552594670406733, "grad_norm": 2.5858301791598963, "learning_rate": 9.762667088504737e-06, "loss": 0.4239, "step": 895 }, { "epoch": 0.1256661991584853, "grad_norm": 2.5484777918658708, "learning_rate": 9.761975151406991e-06, "loss": 0.3979, "step": 896 }, { "epoch": 0.12580645161290321, "grad_norm": 2.997854260150099, "learning_rate": 9.761282231708942e-06, "loss": 0.4462, "step": 897 }, { "epoch": 0.12594670406732117, "grad_norm": 2.349823267358076, "learning_rate": 9.76058832955357e-06, "loss": 0.4192, "step": 898 }, { "epoch": 0.12608695652173912, "grad_norm": 4.1198200879119335, "learning_rate": 9.759893445084059e-06, "loss": 0.3794, "step": 899 }, { "epoch": 0.12622720897615708, "grad_norm": 3.0641468835986925, "learning_rate": 9.759197578443787e-06, "loss": 0.4222, "step": 900 }, { "epoch": 0.12636746143057503, "grad_norm": 2.389251214527134, "learning_rate": 9.758500729776348e-06, "loss": 0.4267, "step": 901 }, { "epoch": 0.126507713884993, "grad_norm": 3.261539806094398, "learning_rate": 9.757802899225527e-06, "loss": 0.3882, "step": 902 }, { "epoch": 0.12664796633941094, "grad_norm": 8.668433913977546, "learning_rate": 9.757104086935319e-06, "loss": 0.3938, "step": 903 }, { "epoch": 0.1267882187938289, "grad_norm": 2.1680835590377336, "learning_rate": 9.756404293049918e-06, "loss": 0.4654, "step": 904 }, { "epoch": 0.12692847124824685, "grad_norm": 3.875022642007943, "learning_rate": 9.755703517713722e-06, "loss": 0.4069, "step": 905 }, { "epoch": 0.1270687237026648, "grad_norm": 2.9620203136852528, "learning_rate": 9.755001761071333e-06, "loss": 0.4111, "step": 906 }, { "epoch": 0.12720897615708274, "grad_norm": 3.3351562442810816, "learning_rate": 9.754299023267548e-06, "loss": 0.392, "step": 907 }, { "epoch": 0.1273492286115007, "grad_norm": 2.243303931479571, "learning_rate": 9.753595304447379e-06, "loss": 0.374, "step": 908 }, { "epoch": 0.12748948106591865, "grad_norm": 3.672374934799436, "learning_rate": 9.752890604756029e-06, "loss": 0.4477, "step": 909 }, { "epoch": 0.1276297335203366, "grad_norm": 2.5034939192844154, "learning_rate": 9.75218492433891e-06, "loss": 0.4434, "step": 910 }, { "epoch": 0.12776998597475456, "grad_norm": 2.384482477048135, "learning_rate": 9.751478263341631e-06, "loss": 0.4102, "step": 911 }, { "epoch": 0.1279102384291725, "grad_norm": 2.4232628568172254, "learning_rate": 9.75077062191001e-06, "loss": 0.381, "step": 912 }, { "epoch": 0.12805049088359047, "grad_norm": 4.9417764035027725, "learning_rate": 9.750062000190063e-06, "loss": 0.4107, "step": 913 }, { "epoch": 0.12819074333800842, "grad_norm": 4.520883211250388, "learning_rate": 9.74935239832801e-06, "loss": 0.4231, "step": 914 }, { "epoch": 0.12833099579242638, "grad_norm": 3.435301615259594, "learning_rate": 9.748641816470268e-06, "loss": 0.4046, "step": 915 }, { "epoch": 0.12847124824684433, "grad_norm": 3.6134869821075655, "learning_rate": 9.747930254763467e-06, "loss": 0.4328, "step": 916 }, { "epoch": 0.12861150070126226, "grad_norm": 4.823240309114587, "learning_rate": 9.747217713354428e-06, "loss": 0.4142, "step": 917 }, { "epoch": 0.12875175315568022, "grad_norm": 3.1103437438579125, "learning_rate": 9.746504192390181e-06, "loss": 0.4307, "step": 918 }, { "epoch": 0.12889200561009817, "grad_norm": 2.9851943883428844, "learning_rate": 9.745789692017955e-06, "loss": 0.4469, "step": 919 }, { "epoch": 0.12903225806451613, "grad_norm": 2.440124468208358, "learning_rate": 9.745074212385183e-06, "loss": 0.4925, "step": 920 }, { "epoch": 0.12917251051893408, "grad_norm": 3.015941385858175, "learning_rate": 9.7443577536395e-06, "loss": 0.3947, "step": 921 }, { "epoch": 0.12931276297335204, "grad_norm": 3.8219685777762846, "learning_rate": 9.74364031592874e-06, "loss": 0.4495, "step": 922 }, { "epoch": 0.12945301542777, "grad_norm": 3.3775169029914087, "learning_rate": 9.742921899400942e-06, "loss": 0.4206, "step": 923 }, { "epoch": 0.12959326788218795, "grad_norm": 2.3453750063237404, "learning_rate": 9.742202504204348e-06, "loss": 0.4499, "step": 924 }, { "epoch": 0.1297335203366059, "grad_norm": 2.4678149806678364, "learning_rate": 9.741482130487398e-06, "loss": 0.4204, "step": 925 }, { "epoch": 0.12987377279102386, "grad_norm": 1.8421419534399446, "learning_rate": 9.740760778398737e-06, "loss": 0.4204, "step": 926 }, { "epoch": 0.13001402524544178, "grad_norm": 3.2702555702966323, "learning_rate": 9.740038448087213e-06, "loss": 0.4287, "step": 927 }, { "epoch": 0.13015427769985974, "grad_norm": 2.8921565766744832, "learning_rate": 9.739315139701868e-06, "loss": 0.3727, "step": 928 }, { "epoch": 0.1302945301542777, "grad_norm": 3.3246820398187786, "learning_rate": 9.738590853391959e-06, "loss": 0.3994, "step": 929 }, { "epoch": 0.13043478260869565, "grad_norm": 3.3943434335178178, "learning_rate": 9.737865589306932e-06, "loss": 0.3994, "step": 930 }, { "epoch": 0.1305750350631136, "grad_norm": 2.353189912789768, "learning_rate": 9.737139347596443e-06, "loss": 0.4097, "step": 931 }, { "epoch": 0.13071528751753156, "grad_norm": 2.677553355792189, "learning_rate": 9.736412128410346e-06, "loss": 0.4509, "step": 932 }, { "epoch": 0.1308555399719495, "grad_norm": 3.2895532092470074, "learning_rate": 9.735683931898697e-06, "loss": 0.3784, "step": 933 }, { "epoch": 0.13099579242636747, "grad_norm": 2.628733477647113, "learning_rate": 9.734954758211754e-06, "loss": 0.3452, "step": 934 }, { "epoch": 0.13113604488078542, "grad_norm": 2.404896838697879, "learning_rate": 9.734224607499978e-06, "loss": 0.3717, "step": 935 }, { "epoch": 0.13127629733520338, "grad_norm": 2.388326699495475, "learning_rate": 9.733493479914031e-06, "loss": 0.379, "step": 936 }, { "epoch": 0.1314165497896213, "grad_norm": 3.113983572855933, "learning_rate": 9.732761375604773e-06, "loss": 0.4343, "step": 937 }, { "epoch": 0.13155680224403926, "grad_norm": 2.8419655032121263, "learning_rate": 9.732028294723273e-06, "loss": 0.4055, "step": 938 }, { "epoch": 0.13169705469845722, "grad_norm": 2.582295783684513, "learning_rate": 9.731294237420795e-06, "loss": 0.3656, "step": 939 }, { "epoch": 0.13183730715287517, "grad_norm": 2.5227941868161188, "learning_rate": 9.730559203848807e-06, "loss": 0.4454, "step": 940 }, { "epoch": 0.13197755960729313, "grad_norm": 2.8135475750865475, "learning_rate": 9.729823194158977e-06, "loss": 0.4111, "step": 941 }, { "epoch": 0.13211781206171108, "grad_norm": 2.9114703801997153, "learning_rate": 9.729086208503174e-06, "loss": 0.4128, "step": 942 }, { "epoch": 0.13225806451612904, "grad_norm": 2.6629800247377187, "learning_rate": 9.728348247033474e-06, "loss": 0.3627, "step": 943 }, { "epoch": 0.132398316970547, "grad_norm": 2.1609781367631555, "learning_rate": 9.727609309902148e-06, "loss": 0.4349, "step": 944 }, { "epoch": 0.13253856942496495, "grad_norm": 2.8621154326932787, "learning_rate": 9.72686939726167e-06, "loss": 0.4068, "step": 945 }, { "epoch": 0.1326788218793829, "grad_norm": 3.307619061622956, "learning_rate": 9.726128509264715e-06, "loss": 0.4172, "step": 946 }, { "epoch": 0.13281907433380083, "grad_norm": 4.036258159309049, "learning_rate": 9.725386646064164e-06, "loss": 0.387, "step": 947 }, { "epoch": 0.13295932678821878, "grad_norm": 2.617637663885529, "learning_rate": 9.724643807813092e-06, "loss": 0.3825, "step": 948 }, { "epoch": 0.13309957924263674, "grad_norm": 8.049112249671543, "learning_rate": 9.723899994664779e-06, "loss": 0.3981, "step": 949 }, { "epoch": 0.1332398316970547, "grad_norm": 2.627640531431317, "learning_rate": 9.723155206772705e-06, "loss": 0.3793, "step": 950 }, { "epoch": 0.13338008415147265, "grad_norm": 2.887861425726099, "learning_rate": 9.722409444290555e-06, "loss": 0.3941, "step": 951 }, { "epoch": 0.1335203366058906, "grad_norm": 2.5768409074280902, "learning_rate": 9.721662707372208e-06, "loss": 0.3862, "step": 952 }, { "epoch": 0.13366058906030856, "grad_norm": 3.131483752268668, "learning_rate": 9.720914996171748e-06, "loss": 0.4969, "step": 953 }, { "epoch": 0.13380084151472652, "grad_norm": 3.2768043556143454, "learning_rate": 9.720166310843464e-06, "loss": 0.4152, "step": 954 }, { "epoch": 0.13394109396914447, "grad_norm": 3.9301772068441476, "learning_rate": 9.719416651541839e-06, "loss": 0.4067, "step": 955 }, { "epoch": 0.13408134642356243, "grad_norm": 3.0164926496302686, "learning_rate": 9.71866601842156e-06, "loss": 0.4036, "step": 956 }, { "epoch": 0.13422159887798035, "grad_norm": 3.7718778115604166, "learning_rate": 9.717914411637515e-06, "loss": 0.4159, "step": 957 }, { "epoch": 0.1343618513323983, "grad_norm": 3.5147383864653383, "learning_rate": 9.717161831344792e-06, "loss": 0.3842, "step": 958 }, { "epoch": 0.13450210378681626, "grad_norm": 2.3479731848841063, "learning_rate": 9.716408277698684e-06, "loss": 0.4142, "step": 959 }, { "epoch": 0.13464235624123422, "grad_norm": 3.2119418241902236, "learning_rate": 9.71565375085468e-06, "loss": 0.4163, "step": 960 }, { "epoch": 0.13478260869565217, "grad_norm": 3.0045776410583005, "learning_rate": 9.714898250968468e-06, "loss": 0.411, "step": 961 }, { "epoch": 0.13492286115007013, "grad_norm": 2.7489467684690556, "learning_rate": 9.714141778195945e-06, "loss": 0.4149, "step": 962 }, { "epoch": 0.13506311360448808, "grad_norm": 2.616605688303831, "learning_rate": 9.713384332693199e-06, "loss": 0.3891, "step": 963 }, { "epoch": 0.13520336605890604, "grad_norm": 3.734042867151906, "learning_rate": 9.712625914616528e-06, "loss": 0.4248, "step": 964 }, { "epoch": 0.135343618513324, "grad_norm": 14.348713865695746, "learning_rate": 9.711866524122424e-06, "loss": 0.4384, "step": 965 }, { "epoch": 0.13548387096774195, "grad_norm": 2.625704534534177, "learning_rate": 9.711106161367583e-06, "loss": 0.342, "step": 966 }, { "epoch": 0.13562412342215988, "grad_norm": 3.9837470064609795, "learning_rate": 9.710344826508901e-06, "loss": 0.4036, "step": 967 }, { "epoch": 0.13576437587657783, "grad_norm": 2.5930050618938405, "learning_rate": 9.70958251970347e-06, "loss": 0.4178, "step": 968 }, { "epoch": 0.13590462833099579, "grad_norm": 2.8534213761385976, "learning_rate": 9.708819241108594e-06, "loss": 0.3618, "step": 969 }, { "epoch": 0.13604488078541374, "grad_norm": 3.294126838683279, "learning_rate": 9.708054990881763e-06, "loss": 0.38, "step": 970 }, { "epoch": 0.1361851332398317, "grad_norm": 2.622642548417069, "learning_rate": 9.70728976918068e-06, "loss": 0.3757, "step": 971 }, { "epoch": 0.13632538569424965, "grad_norm": 3.0116166271900346, "learning_rate": 9.706523576163238e-06, "loss": 0.4254, "step": 972 }, { "epoch": 0.1364656381486676, "grad_norm": 2.645765278659375, "learning_rate": 9.70575641198754e-06, "loss": 0.411, "step": 973 }, { "epoch": 0.13660589060308556, "grad_norm": 4.805513140442834, "learning_rate": 9.704988276811883e-06, "loss": 0.4039, "step": 974 }, { "epoch": 0.13674614305750352, "grad_norm": 3.4626146648714484, "learning_rate": 9.704219170794766e-06, "loss": 0.4016, "step": 975 }, { "epoch": 0.13688639551192147, "grad_norm": 2.688336973934779, "learning_rate": 9.703449094094891e-06, "loss": 0.3756, "step": 976 }, { "epoch": 0.1370266479663394, "grad_norm": 4.203176958975426, "learning_rate": 9.702678046871157e-06, "loss": 0.3975, "step": 977 }, { "epoch": 0.13716690042075735, "grad_norm": 1.929001288469437, "learning_rate": 9.701906029282662e-06, "loss": 0.4516, "step": 978 }, { "epoch": 0.1373071528751753, "grad_norm": 3.0748518373103986, "learning_rate": 9.701133041488707e-06, "loss": 0.3766, "step": 979 }, { "epoch": 0.13744740532959326, "grad_norm": 2.981742776574924, "learning_rate": 9.700359083648795e-06, "loss": 0.451, "step": 980 }, { "epoch": 0.13758765778401122, "grad_norm": 3.729218821809074, "learning_rate": 9.699584155922625e-06, "loss": 0.4326, "step": 981 }, { "epoch": 0.13772791023842917, "grad_norm": 2.8034787774504792, "learning_rate": 9.698808258470098e-06, "loss": 0.3542, "step": 982 }, { "epoch": 0.13786816269284713, "grad_norm": 2.4394320142701442, "learning_rate": 9.698031391451317e-06, "loss": 0.4563, "step": 983 }, { "epoch": 0.13800841514726508, "grad_norm": 2.885935345428439, "learning_rate": 9.69725355502658e-06, "loss": 0.4053, "step": 984 }, { "epoch": 0.13814866760168304, "grad_norm": 3.043594393972688, "learning_rate": 9.69647474935639e-06, "loss": 0.4153, "step": 985 }, { "epoch": 0.138288920056101, "grad_norm": 2.764984487728043, "learning_rate": 9.695694974601447e-06, "loss": 0.408, "step": 986 }, { "epoch": 0.13842917251051892, "grad_norm": 2.4223383306484383, "learning_rate": 9.694914230922655e-06, "loss": 0.4151, "step": 987 }, { "epoch": 0.13856942496493688, "grad_norm": 3.6252609685157493, "learning_rate": 9.69413251848111e-06, "loss": 0.3885, "step": 988 }, { "epoch": 0.13870967741935483, "grad_norm": 2.2759624499423015, "learning_rate": 9.693349837438115e-06, "loss": 0.3923, "step": 989 }, { "epoch": 0.1388499298737728, "grad_norm": 2.9365669554266276, "learning_rate": 9.692566187955174e-06, "loss": 0.3845, "step": 990 }, { "epoch": 0.13899018232819074, "grad_norm": 3.3294872665883593, "learning_rate": 9.691781570193983e-06, "loss": 0.481, "step": 991 }, { "epoch": 0.1391304347826087, "grad_norm": 3.469740167833233, "learning_rate": 9.690995984316446e-06, "loss": 0.3969, "step": 992 }, { "epoch": 0.13927068723702665, "grad_norm": 2.1178940160744526, "learning_rate": 9.69020943048466e-06, "loss": 0.4025, "step": 993 }, { "epoch": 0.1394109396914446, "grad_norm": 2.4580767761667897, "learning_rate": 9.689421908860928e-06, "loss": 0.4416, "step": 994 }, { "epoch": 0.13955119214586256, "grad_norm": 2.97540597149203, "learning_rate": 9.688633419607746e-06, "loss": 0.4181, "step": 995 }, { "epoch": 0.13969144460028052, "grad_norm": 2.33622826645214, "learning_rate": 9.687843962887817e-06, "loss": 0.4092, "step": 996 }, { "epoch": 0.13983169705469845, "grad_norm": 2.6173824408509363, "learning_rate": 9.687053538864037e-06, "loss": 0.3904, "step": 997 }, { "epoch": 0.1399719495091164, "grad_norm": 4.421298471346341, "learning_rate": 9.686262147699507e-06, "loss": 0.4033, "step": 998 }, { "epoch": 0.14011220196353436, "grad_norm": 2.6223914491426754, "learning_rate": 9.685469789557522e-06, "loss": 0.369, "step": 999 }, { "epoch": 0.1402524544179523, "grad_norm": 20.335259184807093, "learning_rate": 9.684676464601583e-06, "loss": 0.3867, "step": 1000 }, { "epoch": 0.14039270687237027, "grad_norm": 2.5383140047589405, "learning_rate": 9.683882172995385e-06, "loss": 0.4046, "step": 1001 }, { "epoch": 0.14053295932678822, "grad_norm": 3.27502432078875, "learning_rate": 9.683086914902825e-06, "loss": 0.4237, "step": 1002 }, { "epoch": 0.14067321178120618, "grad_norm": 3.0193039980076954, "learning_rate": 9.682290690487997e-06, "loss": 0.4615, "step": 1003 }, { "epoch": 0.14081346423562413, "grad_norm": 2.327672619999904, "learning_rate": 9.681493499915198e-06, "loss": 0.4194, "step": 1004 }, { "epoch": 0.14095371669004209, "grad_norm": 2.625620450762989, "learning_rate": 9.680695343348923e-06, "loss": 0.444, "step": 1005 }, { "epoch": 0.14109396914446004, "grad_norm": 3.7116859082901015, "learning_rate": 9.679896220953866e-06, "loss": 0.401, "step": 1006 }, { "epoch": 0.14123422159887797, "grad_norm": 2.67865005741062, "learning_rate": 9.679096132894922e-06, "loss": 0.3901, "step": 1007 }, { "epoch": 0.14137447405329592, "grad_norm": 2.8502717741833923, "learning_rate": 9.678295079337182e-06, "loss": 0.3979, "step": 1008 }, { "epoch": 0.14151472650771388, "grad_norm": 2.3008797953150473, "learning_rate": 9.677493060445936e-06, "loss": 0.4327, "step": 1009 }, { "epoch": 0.14165497896213183, "grad_norm": 2.4525737750577106, "learning_rate": 9.676690076386674e-06, "loss": 0.4726, "step": 1010 }, { "epoch": 0.1417952314165498, "grad_norm": 3.1129769017348754, "learning_rate": 9.675886127325091e-06, "loss": 0.3784, "step": 1011 }, { "epoch": 0.14193548387096774, "grad_norm": 2.9323719797377805, "learning_rate": 9.675081213427076e-06, "loss": 0.417, "step": 1012 }, { "epoch": 0.1420757363253857, "grad_norm": 3.341162277423039, "learning_rate": 9.674275334858712e-06, "loss": 0.3979, "step": 1013 }, { "epoch": 0.14221598877980365, "grad_norm": 2.6588434461376877, "learning_rate": 9.673468491786291e-06, "loss": 0.4255, "step": 1014 }, { "epoch": 0.1423562412342216, "grad_norm": 2.739230134385568, "learning_rate": 9.672660684376298e-06, "loss": 0.424, "step": 1015 }, { "epoch": 0.14249649368863956, "grad_norm": 4.256385942212301, "learning_rate": 9.67185191279542e-06, "loss": 0.4094, "step": 1016 }, { "epoch": 0.1426367461430575, "grad_norm": 2.5727243905031387, "learning_rate": 9.671042177210539e-06, "loss": 0.4388, "step": 1017 }, { "epoch": 0.14277699859747545, "grad_norm": 2.187796436387607, "learning_rate": 9.670231477788738e-06, "loss": 0.4022, "step": 1018 }, { "epoch": 0.1429172510518934, "grad_norm": 2.4363492547960393, "learning_rate": 9.669419814697303e-06, "loss": 0.3693, "step": 1019 }, { "epoch": 0.14305750350631136, "grad_norm": 2.7023554241174192, "learning_rate": 9.668607188103708e-06, "loss": 0.4447, "step": 1020 }, { "epoch": 0.1431977559607293, "grad_norm": 2.8140796887488646, "learning_rate": 9.667793598175641e-06, "loss": 0.4771, "step": 1021 }, { "epoch": 0.14333800841514727, "grad_norm": 2.7237793454554082, "learning_rate": 9.666979045080977e-06, "loss": 0.3448, "step": 1022 }, { "epoch": 0.14347826086956522, "grad_norm": 2.3018341504961604, "learning_rate": 9.666163528987793e-06, "loss": 0.3679, "step": 1023 }, { "epoch": 0.14361851332398318, "grad_norm": 2.617725373979371, "learning_rate": 9.665347050064362e-06, "loss": 0.3945, "step": 1024 }, { "epoch": 0.14375876577840113, "grad_norm": 2.428341508047377, "learning_rate": 9.664529608479165e-06, "loss": 0.38, "step": 1025 }, { "epoch": 0.1438990182328191, "grad_norm": 3.367756594119279, "learning_rate": 9.663711204400872e-06, "loss": 0.4291, "step": 1026 }, { "epoch": 0.14403927068723701, "grad_norm": 2.8160973325239986, "learning_rate": 9.662891837998354e-06, "loss": 0.382, "step": 1027 }, { "epoch": 0.14417952314165497, "grad_norm": 2.455965960437588, "learning_rate": 9.662071509440683e-06, "loss": 0.3799, "step": 1028 }, { "epoch": 0.14431977559607292, "grad_norm": 2.5339836147654746, "learning_rate": 9.661250218897129e-06, "loss": 0.4704, "step": 1029 }, { "epoch": 0.14446002805049088, "grad_norm": 4.8754043411417705, "learning_rate": 9.660427966537157e-06, "loss": 0.4086, "step": 1030 }, { "epoch": 0.14460028050490883, "grad_norm": 2.761186216557979, "learning_rate": 9.659604752530434e-06, "loss": 0.3941, "step": 1031 }, { "epoch": 0.1447405329593268, "grad_norm": 1.9542255200251777, "learning_rate": 9.658780577046826e-06, "loss": 0.3949, "step": 1032 }, { "epoch": 0.14488078541374474, "grad_norm": 2.252739615952699, "learning_rate": 9.657955440256396e-06, "loss": 0.4001, "step": 1033 }, { "epoch": 0.1450210378681627, "grad_norm": 2.7491462422614648, "learning_rate": 9.657129342329403e-06, "loss": 0.4347, "step": 1034 }, { "epoch": 0.14516129032258066, "grad_norm": 1.9876620484513357, "learning_rate": 9.656302283436306e-06, "loss": 0.4149, "step": 1035 }, { "epoch": 0.1453015427769986, "grad_norm": 2.0437005739914085, "learning_rate": 9.655474263747765e-06, "loss": 0.3812, "step": 1036 }, { "epoch": 0.14544179523141654, "grad_norm": 2.3959405184003217, "learning_rate": 9.654645283434636e-06, "loss": 0.4127, "step": 1037 }, { "epoch": 0.1455820476858345, "grad_norm": 1.950102030701282, "learning_rate": 9.653815342667973e-06, "loss": 0.3966, "step": 1038 }, { "epoch": 0.14572230014025245, "grad_norm": 2.8776803169128526, "learning_rate": 9.652984441619028e-06, "loss": 0.391, "step": 1039 }, { "epoch": 0.1458625525946704, "grad_norm": 2.58316619906647, "learning_rate": 9.65215258045925e-06, "loss": 0.3968, "step": 1040 }, { "epoch": 0.14600280504908836, "grad_norm": 2.1810511763496523, "learning_rate": 9.651319759360293e-06, "loss": 0.4204, "step": 1041 }, { "epoch": 0.1461430575035063, "grad_norm": 2.0801372680852457, "learning_rate": 9.650485978493998e-06, "loss": 0.4226, "step": 1042 }, { "epoch": 0.14628330995792427, "grad_norm": 2.943572241079221, "learning_rate": 9.649651238032412e-06, "loss": 0.4113, "step": 1043 }, { "epoch": 0.14642356241234222, "grad_norm": 2.073304515446183, "learning_rate": 9.64881553814778e-06, "loss": 0.4012, "step": 1044 }, { "epoch": 0.14656381486676018, "grad_norm": 2.465948325885005, "learning_rate": 9.647978879012539e-06, "loss": 0.4245, "step": 1045 }, { "epoch": 0.14670406732117813, "grad_norm": 2.030857576463728, "learning_rate": 9.64714126079933e-06, "loss": 0.3889, "step": 1046 }, { "epoch": 0.14684431977559606, "grad_norm": 2.376926945437948, "learning_rate": 9.64630268368099e-06, "loss": 0.4391, "step": 1047 }, { "epoch": 0.14698457223001402, "grad_norm": 2.701034354834928, "learning_rate": 9.645463147830551e-06, "loss": 0.4072, "step": 1048 }, { "epoch": 0.14712482468443197, "grad_norm": 3.014489152689817, "learning_rate": 9.644622653421249e-06, "loss": 0.4145, "step": 1049 }, { "epoch": 0.14726507713884993, "grad_norm": 3.4944008591168783, "learning_rate": 9.643781200626512e-06, "loss": 0.4052, "step": 1050 }, { "epoch": 0.14740532959326788, "grad_norm": 2.886312701377284, "learning_rate": 9.64293878961997e-06, "loss": 0.4213, "step": 1051 }, { "epoch": 0.14754558204768584, "grad_norm": 3.3666889857191293, "learning_rate": 9.642095420575443e-06, "loss": 0.4341, "step": 1052 }, { "epoch": 0.1476858345021038, "grad_norm": 2.549141092196029, "learning_rate": 9.641251093666961e-06, "loss": 0.4339, "step": 1053 }, { "epoch": 0.14782608695652175, "grad_norm": 2.1635618215940524, "learning_rate": 9.640405809068743e-06, "loss": 0.3898, "step": 1054 }, { "epoch": 0.1479663394109397, "grad_norm": 1.9952349522994326, "learning_rate": 9.639559566955204e-06, "loss": 0.4109, "step": 1055 }, { "epoch": 0.14810659186535766, "grad_norm": 2.473388467038814, "learning_rate": 9.638712367500964e-06, "loss": 0.4313, "step": 1056 }, { "epoch": 0.14824684431977558, "grad_norm": 2.72466319041275, "learning_rate": 9.637864210880836e-06, "loss": 0.4333, "step": 1057 }, { "epoch": 0.14838709677419354, "grad_norm": 2.7842817605587578, "learning_rate": 9.63701509726983e-06, "loss": 0.4199, "step": 1058 }, { "epoch": 0.1485273492286115, "grad_norm": 3.424883908719828, "learning_rate": 9.636165026843155e-06, "loss": 0.4466, "step": 1059 }, { "epoch": 0.14866760168302945, "grad_norm": 2.0427576251374937, "learning_rate": 9.63531399977622e-06, "loss": 0.4165, "step": 1060 }, { "epoch": 0.1488078541374474, "grad_norm": 4.31985907218794, "learning_rate": 9.634462016244625e-06, "loss": 0.3888, "step": 1061 }, { "epoch": 0.14894810659186536, "grad_norm": 4.148297955883368, "learning_rate": 9.633609076424171e-06, "loss": 0.4666, "step": 1062 }, { "epoch": 0.14908835904628331, "grad_norm": 2.4568796763039598, "learning_rate": 9.632755180490858e-06, "loss": 0.4003, "step": 1063 }, { "epoch": 0.14922861150070127, "grad_norm": 2.5828160003804546, "learning_rate": 9.63190032862088e-06, "loss": 0.3755, "step": 1064 }, { "epoch": 0.14936886395511922, "grad_norm": 2.8218962642168433, "learning_rate": 9.631044520990628e-06, "loss": 0.3939, "step": 1065 }, { "epoch": 0.14950911640953718, "grad_norm": 2.357733755648507, "learning_rate": 9.630187757776697e-06, "loss": 0.4194, "step": 1066 }, { "epoch": 0.1496493688639551, "grad_norm": 3.1452347815472677, "learning_rate": 9.629330039155872e-06, "loss": 0.3781, "step": 1067 }, { "epoch": 0.14978962131837306, "grad_norm": 2.7153605236855913, "learning_rate": 9.628471365305134e-06, "loss": 0.4333, "step": 1068 }, { "epoch": 0.14992987377279102, "grad_norm": 2.4797130484848298, "learning_rate": 9.627611736401668e-06, "loss": 0.4175, "step": 1069 }, { "epoch": 0.15007012622720897, "grad_norm": 2.3284972232526, "learning_rate": 9.62675115262285e-06, "loss": 0.4352, "step": 1070 }, { "epoch": 0.15021037868162693, "grad_norm": 2.3642635605374673, "learning_rate": 9.625889614146258e-06, "loss": 0.439, "step": 1071 }, { "epoch": 0.15035063113604488, "grad_norm": 3.048220137018182, "learning_rate": 9.625027121149665e-06, "loss": 0.4081, "step": 1072 }, { "epoch": 0.15049088359046284, "grad_norm": 2.7320320772993902, "learning_rate": 9.624163673811036e-06, "loss": 0.4282, "step": 1073 }, { "epoch": 0.1506311360448808, "grad_norm": 2.0980072012051325, "learning_rate": 9.62329927230854e-06, "loss": 0.405, "step": 1074 }, { "epoch": 0.15077138849929875, "grad_norm": 2.2985592849384924, "learning_rate": 9.622433916820539e-06, "loss": 0.4004, "step": 1075 }, { "epoch": 0.1509116409537167, "grad_norm": 2.4278306151730766, "learning_rate": 9.621567607525597e-06, "loss": 0.4195, "step": 1076 }, { "epoch": 0.15105189340813463, "grad_norm": 2.180788589249815, "learning_rate": 9.620700344602465e-06, "loss": 0.444, "step": 1077 }, { "epoch": 0.15119214586255258, "grad_norm": 3.0347592205834886, "learning_rate": 9.619832128230102e-06, "loss": 0.4117, "step": 1078 }, { "epoch": 0.15133239831697054, "grad_norm": 1.9319990267227911, "learning_rate": 9.618962958587656e-06, "loss": 0.3891, "step": 1079 }, { "epoch": 0.1514726507713885, "grad_norm": 2.2633574615488334, "learning_rate": 9.618092835854474e-06, "loss": 0.3929, "step": 1080 }, { "epoch": 0.15161290322580645, "grad_norm": 2.3929805744114385, "learning_rate": 9.617221760210097e-06, "loss": 0.4334, "step": 1081 }, { "epoch": 0.1517531556802244, "grad_norm": 2.5438583388443448, "learning_rate": 9.616349731834271e-06, "loss": 0.3803, "step": 1082 }, { "epoch": 0.15189340813464236, "grad_norm": 2.8384846999492006, "learning_rate": 9.61547675090693e-06, "loss": 0.4036, "step": 1083 }, { "epoch": 0.15203366058906032, "grad_norm": 3.2382551866317697, "learning_rate": 9.614602817608207e-06, "loss": 0.4007, "step": 1084 }, { "epoch": 0.15217391304347827, "grad_norm": 2.4687658381859436, "learning_rate": 9.613727932118435e-06, "loss": 0.3984, "step": 1085 }, { "epoch": 0.15231416549789623, "grad_norm": 2.6900388568780325, "learning_rate": 9.612852094618135e-06, "loss": 0.439, "step": 1086 }, { "epoch": 0.15245441795231415, "grad_norm": 2.5914035892559166, "learning_rate": 9.611975305288035e-06, "loss": 0.3796, "step": 1087 }, { "epoch": 0.1525946704067321, "grad_norm": 2.7013164489848562, "learning_rate": 9.611097564309054e-06, "loss": 0.4026, "step": 1088 }, { "epoch": 0.15273492286115006, "grad_norm": 2.3553114501659254, "learning_rate": 9.610218871862303e-06, "loss": 0.4008, "step": 1089 }, { "epoch": 0.15287517531556802, "grad_norm": 2.2941843479120396, "learning_rate": 9.609339228129098e-06, "loss": 0.4124, "step": 1090 }, { "epoch": 0.15301542776998597, "grad_norm": 3.0215582317653946, "learning_rate": 9.608458633290949e-06, "loss": 0.4124, "step": 1091 }, { "epoch": 0.15315568022440393, "grad_norm": 2.297298055081629, "learning_rate": 9.607577087529555e-06, "loss": 0.3776, "step": 1092 }, { "epoch": 0.15329593267882188, "grad_norm": 2.5290971242612517, "learning_rate": 9.606694591026823e-06, "loss": 0.4053, "step": 1093 }, { "epoch": 0.15343618513323984, "grad_norm": 2.707537450980487, "learning_rate": 9.605811143964846e-06, "loss": 0.4241, "step": 1094 }, { "epoch": 0.1535764375876578, "grad_norm": 2.0348920595752285, "learning_rate": 9.604926746525918e-06, "loss": 0.4216, "step": 1095 }, { "epoch": 0.15371669004207575, "grad_norm": 3.602553886947677, "learning_rate": 9.604041398892528e-06, "loss": 0.4163, "step": 1096 }, { "epoch": 0.15385694249649368, "grad_norm": 2.5830213809603455, "learning_rate": 9.603155101247363e-06, "loss": 0.4138, "step": 1097 }, { "epoch": 0.15399719495091163, "grad_norm": 5.650148420156749, "learning_rate": 9.602267853773301e-06, "loss": 0.4264, "step": 1098 }, { "epoch": 0.1541374474053296, "grad_norm": 2.5522059716062664, "learning_rate": 9.601379656653424e-06, "loss": 0.4532, "step": 1099 }, { "epoch": 0.15427769985974754, "grad_norm": 2.1496229195934924, "learning_rate": 9.600490510071001e-06, "loss": 0.421, "step": 1100 }, { "epoch": 0.1544179523141655, "grad_norm": 1.9552626241694175, "learning_rate": 9.599600414209503e-06, "loss": 0.443, "step": 1101 }, { "epoch": 0.15455820476858345, "grad_norm": 2.4666002788717067, "learning_rate": 9.598709369252595e-06, "loss": 0.4381, "step": 1102 }, { "epoch": 0.1546984572230014, "grad_norm": 2.385906274737349, "learning_rate": 9.597817375384138e-06, "loss": 0.384, "step": 1103 }, { "epoch": 0.15483870967741936, "grad_norm": 2.7324615759211506, "learning_rate": 9.596924432788188e-06, "loss": 0.409, "step": 1104 }, { "epoch": 0.15497896213183732, "grad_norm": 2.740828565912536, "learning_rate": 9.596030541648999e-06, "loss": 0.3933, "step": 1105 }, { "epoch": 0.15511921458625527, "grad_norm": 1.8453785444387232, "learning_rate": 9.595135702151017e-06, "loss": 0.4253, "step": 1106 }, { "epoch": 0.1552594670406732, "grad_norm": 2.492595675056487, "learning_rate": 9.594239914478886e-06, "loss": 0.3938, "step": 1107 }, { "epoch": 0.15539971949509115, "grad_norm": 2.3764125237428178, "learning_rate": 9.593343178817448e-06, "loss": 0.4503, "step": 1108 }, { "epoch": 0.1555399719495091, "grad_norm": 2.2884922814600546, "learning_rate": 9.592445495351738e-06, "loss": 0.3992, "step": 1109 }, { "epoch": 0.15568022440392706, "grad_norm": 4.806067787952794, "learning_rate": 9.591546864266983e-06, "loss": 0.4033, "step": 1110 }, { "epoch": 0.15582047685834502, "grad_norm": 2.567340465964004, "learning_rate": 9.590647285748614e-06, "loss": 0.4139, "step": 1111 }, { "epoch": 0.15596072931276297, "grad_norm": 3.9090763700670523, "learning_rate": 9.589746759982248e-06, "loss": 0.3944, "step": 1112 }, { "epoch": 0.15610098176718093, "grad_norm": 2.72169116304878, "learning_rate": 9.588845287153705e-06, "loss": 0.4031, "step": 1113 }, { "epoch": 0.15624123422159888, "grad_norm": 2.0091026109662993, "learning_rate": 9.587942867448998e-06, "loss": 0.3521, "step": 1114 }, { "epoch": 0.15638148667601684, "grad_norm": 2.230409851899958, "learning_rate": 9.587039501054335e-06, "loss": 0.4059, "step": 1115 }, { "epoch": 0.1565217391304348, "grad_norm": 2.8783031230402103, "learning_rate": 9.586135188156116e-06, "loss": 0.453, "step": 1116 }, { "epoch": 0.15666199158485272, "grad_norm": 2.1051851835366935, "learning_rate": 9.585229928940944e-06, "loss": 0.4069, "step": 1117 }, { "epoch": 0.15680224403927068, "grad_norm": 2.1059716920897347, "learning_rate": 9.584323723595612e-06, "loss": 0.3927, "step": 1118 }, { "epoch": 0.15694249649368863, "grad_norm": 2.0551007801598855, "learning_rate": 9.583416572307107e-06, "loss": 0.41, "step": 1119 }, { "epoch": 0.1570827489481066, "grad_norm": 2.5229899951496044, "learning_rate": 9.582508475262615e-06, "loss": 0.4201, "step": 1120 }, { "epoch": 0.15722300140252454, "grad_norm": 2.3618850997704826, "learning_rate": 9.581599432649515e-06, "loss": 0.3849, "step": 1121 }, { "epoch": 0.1573632538569425, "grad_norm": 2.4202884522653885, "learning_rate": 9.580689444655381e-06, "loss": 0.4441, "step": 1122 }, { "epoch": 0.15750350631136045, "grad_norm": 2.818625920645712, "learning_rate": 9.579778511467985e-06, "loss": 0.4273, "step": 1123 }, { "epoch": 0.1576437587657784, "grad_norm": 2.712378830554588, "learning_rate": 9.578866633275289e-06, "loss": 0.3783, "step": 1124 }, { "epoch": 0.15778401122019636, "grad_norm": 2.326053901454548, "learning_rate": 9.577953810265453e-06, "loss": 0.4356, "step": 1125 }, { "epoch": 0.15792426367461432, "grad_norm": 2.3586008178564604, "learning_rate": 9.577040042626832e-06, "loss": 0.432, "step": 1126 }, { "epoch": 0.15806451612903225, "grad_norm": 2.124613221816073, "learning_rate": 9.576125330547977e-06, "loss": 0.4262, "step": 1127 }, { "epoch": 0.1582047685834502, "grad_norm": 3.2891273039099116, "learning_rate": 9.575209674217632e-06, "loss": 0.4362, "step": 1128 }, { "epoch": 0.15834502103786816, "grad_norm": 2.9829424070961768, "learning_rate": 9.574293073824734e-06, "loss": 0.3821, "step": 1129 }, { "epoch": 0.1584852734922861, "grad_norm": 2.1867071622300935, "learning_rate": 9.57337552955842e-06, "loss": 0.4005, "step": 1130 }, { "epoch": 0.15862552594670407, "grad_norm": 3.400795865876443, "learning_rate": 9.572457041608018e-06, "loss": 0.4295, "step": 1131 }, { "epoch": 0.15876577840112202, "grad_norm": 2.818576606103275, "learning_rate": 9.57153761016305e-06, "loss": 0.466, "step": 1132 }, { "epoch": 0.15890603085553998, "grad_norm": 2.3378325975398395, "learning_rate": 9.570617235413235e-06, "loss": 0.4219, "step": 1133 }, { "epoch": 0.15904628330995793, "grad_norm": 3.5590919120819935, "learning_rate": 9.569695917548488e-06, "loss": 0.4149, "step": 1134 }, { "epoch": 0.15918653576437589, "grad_norm": 2.6004553836221596, "learning_rate": 9.568773656758913e-06, "loss": 0.4164, "step": 1135 }, { "epoch": 0.15932678821879384, "grad_norm": 1.9978827952622087, "learning_rate": 9.567850453234816e-06, "loss": 0.3767, "step": 1136 }, { "epoch": 0.15946704067321177, "grad_norm": 2.1536955145410523, "learning_rate": 9.56692630716669e-06, "loss": 0.3865, "step": 1137 }, { "epoch": 0.15960729312762972, "grad_norm": 2.4451260724162602, "learning_rate": 9.56600121874523e-06, "loss": 0.4038, "step": 1138 }, { "epoch": 0.15974754558204768, "grad_norm": 2.6160236560731867, "learning_rate": 9.565075188161316e-06, "loss": 0.4032, "step": 1139 }, { "epoch": 0.15988779803646563, "grad_norm": 2.326345288325165, "learning_rate": 9.564148215606033e-06, "loss": 0.4153, "step": 1140 }, { "epoch": 0.1600280504908836, "grad_norm": 2.2909924179944072, "learning_rate": 9.563220301270652e-06, "loss": 0.4402, "step": 1141 }, { "epoch": 0.16016830294530154, "grad_norm": 3.021430559424996, "learning_rate": 9.562291445346642e-06, "loss": 0.3899, "step": 1142 }, { "epoch": 0.1603085553997195, "grad_norm": 2.446424432538204, "learning_rate": 9.561361648025671e-06, "loss": 0.4103, "step": 1143 }, { "epoch": 0.16044880785413745, "grad_norm": 3.003415230055348, "learning_rate": 9.560430909499589e-06, "loss": 0.3539, "step": 1144 }, { "epoch": 0.1605890603085554, "grad_norm": 3.0235065777598096, "learning_rate": 9.55949922996045e-06, "loss": 0.4177, "step": 1145 }, { "epoch": 0.16072931276297336, "grad_norm": 2.534913602481532, "learning_rate": 9.558566609600502e-06, "loss": 0.4516, "step": 1146 }, { "epoch": 0.1608695652173913, "grad_norm": 2.100353815154539, "learning_rate": 9.557633048612183e-06, "loss": 0.3994, "step": 1147 }, { "epoch": 0.16100981767180925, "grad_norm": 1.9203085990102577, "learning_rate": 9.556698547188126e-06, "loss": 0.398, "step": 1148 }, { "epoch": 0.1611500701262272, "grad_norm": 1.89492950972649, "learning_rate": 9.555763105521159e-06, "loss": 0.4365, "step": 1149 }, { "epoch": 0.16129032258064516, "grad_norm": 2.7139931643012707, "learning_rate": 9.554826723804304e-06, "loss": 0.4431, "step": 1150 }, { "epoch": 0.1614305750350631, "grad_norm": 2.560696036666536, "learning_rate": 9.553889402230776e-06, "loss": 0.3947, "step": 1151 }, { "epoch": 0.16157082748948107, "grad_norm": 1.7362460791788672, "learning_rate": 9.55295114099399e-06, "loss": 0.3914, "step": 1152 }, { "epoch": 0.16171107994389902, "grad_norm": 3.732927624478831, "learning_rate": 9.55201194028754e-06, "loss": 0.3865, "step": 1153 }, { "epoch": 0.16185133239831698, "grad_norm": 2.6886451410607464, "learning_rate": 9.551071800305233e-06, "loss": 0.3904, "step": 1154 }, { "epoch": 0.16199158485273493, "grad_norm": 2.379096864476931, "learning_rate": 9.550130721241056e-06, "loss": 0.4546, "step": 1155 }, { "epoch": 0.1621318373071529, "grad_norm": 2.461502736891504, "learning_rate": 9.549188703289192e-06, "loss": 0.3657, "step": 1156 }, { "epoch": 0.16227208976157081, "grad_norm": 2.2762197138255154, "learning_rate": 9.548245746644025e-06, "loss": 0.4343, "step": 1157 }, { "epoch": 0.16241234221598877, "grad_norm": 2.546500242780415, "learning_rate": 9.547301851500123e-06, "loss": 0.428, "step": 1158 }, { "epoch": 0.16255259467040672, "grad_norm": 2.5021262663610164, "learning_rate": 9.546357018052254e-06, "loss": 0.3946, "step": 1159 }, { "epoch": 0.16269284712482468, "grad_norm": 1.9582843842036564, "learning_rate": 9.545411246495377e-06, "loss": 0.4227, "step": 1160 }, { "epoch": 0.16283309957924264, "grad_norm": 2.359148128288013, "learning_rate": 9.544464537024648e-06, "loss": 0.4191, "step": 1161 }, { "epoch": 0.1629733520336606, "grad_norm": 2.4439744304811586, "learning_rate": 9.54351688983541e-06, "loss": 0.3898, "step": 1162 }, { "epoch": 0.16311360448807855, "grad_norm": 2.6416373936167408, "learning_rate": 9.542568305123207e-06, "loss": 0.4311, "step": 1163 }, { "epoch": 0.1632538569424965, "grad_norm": 2.2498189005554714, "learning_rate": 9.54161878308377e-06, "loss": 0.3881, "step": 1164 }, { "epoch": 0.16339410939691446, "grad_norm": 1.9983997499481057, "learning_rate": 9.54066832391303e-06, "loss": 0.3777, "step": 1165 }, { "epoch": 0.1635343618513324, "grad_norm": 2.5831920421067824, "learning_rate": 9.539716927807102e-06, "loss": 0.3733, "step": 1166 }, { "epoch": 0.16367461430575034, "grad_norm": 1.7356110713126478, "learning_rate": 9.538764594962302e-06, "loss": 0.4383, "step": 1167 }, { "epoch": 0.1638148667601683, "grad_norm": 2.2847990881741733, "learning_rate": 9.537811325575142e-06, "loss": 0.4128, "step": 1168 }, { "epoch": 0.16395511921458625, "grad_norm": 2.7139922858222056, "learning_rate": 9.536857119842315e-06, "loss": 0.3244, "step": 1169 }, { "epoch": 0.1640953716690042, "grad_norm": 2.658353443926694, "learning_rate": 9.53590197796072e-06, "loss": 0.4127, "step": 1170 }, { "epoch": 0.16423562412342216, "grad_norm": 2.43866222745694, "learning_rate": 9.534945900127441e-06, "loss": 0.3763, "step": 1171 }, { "epoch": 0.1643758765778401, "grad_norm": 2.5244907507693934, "learning_rate": 9.533988886539761e-06, "loss": 0.4239, "step": 1172 }, { "epoch": 0.16451612903225807, "grad_norm": 2.520608550419327, "learning_rate": 9.533030937395151e-06, "loss": 0.3735, "step": 1173 }, { "epoch": 0.16465638148667602, "grad_norm": 2.3409335698247653, "learning_rate": 9.532072052891276e-06, "loss": 0.3895, "step": 1174 }, { "epoch": 0.16479663394109398, "grad_norm": 2.674189763938731, "learning_rate": 9.531112233225998e-06, "loss": 0.3949, "step": 1175 }, { "epoch": 0.16493688639551193, "grad_norm": 1.9253873905921681, "learning_rate": 9.530151478597366e-06, "loss": 0.4235, "step": 1176 }, { "epoch": 0.16507713884992986, "grad_norm": 3.5982233484824095, "learning_rate": 9.529189789203628e-06, "loss": 0.4053, "step": 1177 }, { "epoch": 0.16521739130434782, "grad_norm": 4.013193307083466, "learning_rate": 9.52822716524322e-06, "loss": 0.4376, "step": 1178 }, { "epoch": 0.16535764375876577, "grad_norm": 2.991055824873512, "learning_rate": 9.527263606914772e-06, "loss": 0.3655, "step": 1179 }, { "epoch": 0.16549789621318373, "grad_norm": 3.578869471499451, "learning_rate": 9.526299114417108e-06, "loss": 0.3989, "step": 1180 }, { "epoch": 0.16563814866760168, "grad_norm": 2.96286889849689, "learning_rate": 9.525333687949247e-06, "loss": 0.435, "step": 1181 }, { "epoch": 0.16577840112201964, "grad_norm": 1.7487858238668246, "learning_rate": 9.524367327710396e-06, "loss": 0.4205, "step": 1182 }, { "epoch": 0.1659186535764376, "grad_norm": 2.627491994277561, "learning_rate": 9.523400033899957e-06, "loss": 0.4342, "step": 1183 }, { "epoch": 0.16605890603085555, "grad_norm": 2.5171275893599323, "learning_rate": 9.522431806717523e-06, "loss": 0.4001, "step": 1184 }, { "epoch": 0.1661991584852735, "grad_norm": 3.483885863316731, "learning_rate": 9.52146264636288e-06, "loss": 0.4341, "step": 1185 }, { "epoch": 0.16633941093969146, "grad_norm": 2.3925726442858144, "learning_rate": 9.520492553036012e-06, "loss": 0.3603, "step": 1186 }, { "epoch": 0.16647966339410938, "grad_norm": 2.4015746831740015, "learning_rate": 9.519521526937087e-06, "loss": 0.4177, "step": 1187 }, { "epoch": 0.16661991584852734, "grad_norm": 2.448855043020823, "learning_rate": 9.518549568266474e-06, "loss": 0.4601, "step": 1188 }, { "epoch": 0.1667601683029453, "grad_norm": 2.044396217210435, "learning_rate": 9.517576677224723e-06, "loss": 0.436, "step": 1189 }, { "epoch": 0.16690042075736325, "grad_norm": 2.4423940384525595, "learning_rate": 9.516602854012587e-06, "loss": 0.3716, "step": 1190 }, { "epoch": 0.1670406732117812, "grad_norm": 1.9781591427524672, "learning_rate": 9.515628098831009e-06, "loss": 0.3487, "step": 1191 }, { "epoch": 0.16718092566619916, "grad_norm": 2.1303713735352496, "learning_rate": 9.514652411881122e-06, "loss": 0.4224, "step": 1192 }, { "epoch": 0.16732117812061711, "grad_norm": 2.2392595160536475, "learning_rate": 9.51367579336425e-06, "loss": 0.4361, "step": 1193 }, { "epoch": 0.16746143057503507, "grad_norm": 1.9880941186765244, "learning_rate": 9.512698243481914e-06, "loss": 0.3987, "step": 1194 }, { "epoch": 0.16760168302945302, "grad_norm": 2.2046618140418803, "learning_rate": 9.511719762435822e-06, "loss": 0.395, "step": 1195 }, { "epoch": 0.16774193548387098, "grad_norm": 4.369878550572421, "learning_rate": 9.51074035042788e-06, "loss": 0.4797, "step": 1196 }, { "epoch": 0.1678821879382889, "grad_norm": 1.816185656862949, "learning_rate": 9.509760007660182e-06, "loss": 0.389, "step": 1197 }, { "epoch": 0.16802244039270686, "grad_norm": 2.4334654062389744, "learning_rate": 9.508778734335013e-06, "loss": 0.3873, "step": 1198 }, { "epoch": 0.16816269284712482, "grad_norm": 2.1890452785209344, "learning_rate": 9.507796530654854e-06, "loss": 0.4145, "step": 1199 }, { "epoch": 0.16830294530154277, "grad_norm": 3.2225224877542797, "learning_rate": 9.506813396822373e-06, "loss": 0.4182, "step": 1200 }, { "epoch": 0.16844319775596073, "grad_norm": 1.96718074685317, "learning_rate": 9.505829333040437e-06, "loss": 0.4093, "step": 1201 }, { "epoch": 0.16858345021037868, "grad_norm": 2.340145136252884, "learning_rate": 9.504844339512096e-06, "loss": 0.3925, "step": 1202 }, { "epoch": 0.16872370266479664, "grad_norm": 1.9437631465553142, "learning_rate": 9.503858416440602e-06, "loss": 0.4138, "step": 1203 }, { "epoch": 0.1688639551192146, "grad_norm": 1.9766972311186057, "learning_rate": 9.502871564029386e-06, "loss": 0.4369, "step": 1204 }, { "epoch": 0.16900420757363255, "grad_norm": 1.729387897295848, "learning_rate": 9.501883782482084e-06, "loss": 0.4281, "step": 1205 }, { "epoch": 0.1691444600280505, "grad_norm": 2.628780639731081, "learning_rate": 9.500895072002517e-06, "loss": 0.4124, "step": 1206 }, { "epoch": 0.16928471248246843, "grad_norm": 7.411661352154686, "learning_rate": 9.499905432794699e-06, "loss": 0.405, "step": 1207 }, { "epoch": 0.16942496493688639, "grad_norm": 2.783183165361544, "learning_rate": 9.498914865062831e-06, "loss": 0.4386, "step": 1208 }, { "epoch": 0.16956521739130434, "grad_norm": 2.982545141884103, "learning_rate": 9.497923369011312e-06, "loss": 0.3758, "step": 1209 }, { "epoch": 0.1697054698457223, "grad_norm": 2.345994307131417, "learning_rate": 9.496930944844733e-06, "loss": 0.3261, "step": 1210 }, { "epoch": 0.16984572230014025, "grad_norm": 3.5091945584820152, "learning_rate": 9.495937592767873e-06, "loss": 0.4107, "step": 1211 }, { "epoch": 0.1699859747545582, "grad_norm": 2.065666657689991, "learning_rate": 9.494943312985698e-06, "loss": 0.4038, "step": 1212 }, { "epoch": 0.17012622720897616, "grad_norm": 2.5238550744140418, "learning_rate": 9.493948105703376e-06, "loss": 0.4376, "step": 1213 }, { "epoch": 0.17026647966339412, "grad_norm": 4.103484715123352, "learning_rate": 9.49295197112626e-06, "loss": 0.4272, "step": 1214 }, { "epoch": 0.17040673211781207, "grad_norm": 2.298970000135591, "learning_rate": 9.491954909459895e-06, "loss": 0.3997, "step": 1215 }, { "epoch": 0.17054698457223003, "grad_norm": 4.247622329719606, "learning_rate": 9.490956920910016e-06, "loss": 0.3835, "step": 1216 }, { "epoch": 0.17068723702664795, "grad_norm": 4.1277944174000885, "learning_rate": 9.489958005682555e-06, "loss": 0.4511, "step": 1217 }, { "epoch": 0.1708274894810659, "grad_norm": 3.259215713661228, "learning_rate": 9.488958163983629e-06, "loss": 0.4328, "step": 1218 }, { "epoch": 0.17096774193548386, "grad_norm": 2.9630929152371537, "learning_rate": 9.487957396019547e-06, "loss": 0.4259, "step": 1219 }, { "epoch": 0.17110799438990182, "grad_norm": 2.0584259254335158, "learning_rate": 9.486955701996811e-06, "loss": 0.378, "step": 1220 }, { "epoch": 0.17124824684431977, "grad_norm": 2.4670107545653037, "learning_rate": 9.485953082122116e-06, "loss": 0.4353, "step": 1221 }, { "epoch": 0.17138849929873773, "grad_norm": 2.3662240362178717, "learning_rate": 9.484949536602343e-06, "loss": 0.4423, "step": 1222 }, { "epoch": 0.17152875175315568, "grad_norm": 2.2723231779997004, "learning_rate": 9.48394506564457e-06, "loss": 0.3982, "step": 1223 }, { "epoch": 0.17166900420757364, "grad_norm": 2.559165925556135, "learning_rate": 9.482939669456056e-06, "loss": 0.4306, "step": 1224 }, { "epoch": 0.1718092566619916, "grad_norm": 2.6037049965121444, "learning_rate": 9.481933348244264e-06, "loss": 0.424, "step": 1225 }, { "epoch": 0.17194950911640955, "grad_norm": 2.553673499217684, "learning_rate": 9.480926102216836e-06, "loss": 0.428, "step": 1226 }, { "epoch": 0.17208976157082748, "grad_norm": 6.408181992402674, "learning_rate": 9.479917931581616e-06, "loss": 0.4529, "step": 1227 }, { "epoch": 0.17223001402524543, "grad_norm": 2.448997183334332, "learning_rate": 9.478908836546629e-06, "loss": 0.3834, "step": 1228 }, { "epoch": 0.1723702664796634, "grad_norm": 2.8029912638152146, "learning_rate": 9.477898817320094e-06, "loss": 0.4123, "step": 1229 }, { "epoch": 0.17251051893408134, "grad_norm": 3.202817129830967, "learning_rate": 9.476887874110426e-06, "loss": 0.3825, "step": 1230 }, { "epoch": 0.1726507713884993, "grad_norm": 2.8321866538719878, "learning_rate": 9.475876007126222e-06, "loss": 0.4591, "step": 1231 }, { "epoch": 0.17279102384291725, "grad_norm": 2.4171645101133303, "learning_rate": 9.474863216576276e-06, "loss": 0.4224, "step": 1232 }, { "epoch": 0.1729312762973352, "grad_norm": 7.168051504546139, "learning_rate": 9.473849502669568e-06, "loss": 0.3713, "step": 1233 }, { "epoch": 0.17307152875175316, "grad_norm": 2.78534482769613, "learning_rate": 9.472834865615271e-06, "loss": 0.3697, "step": 1234 }, { "epoch": 0.17321178120617112, "grad_norm": 2.8460343795812273, "learning_rate": 9.47181930562275e-06, "loss": 0.4576, "step": 1235 }, { "epoch": 0.17335203366058907, "grad_norm": 2.5761921428493375, "learning_rate": 9.470802822901558e-06, "loss": 0.4131, "step": 1236 }, { "epoch": 0.173492286115007, "grad_norm": 4.779160984416179, "learning_rate": 9.469785417661439e-06, "loss": 0.4242, "step": 1237 }, { "epoch": 0.17363253856942495, "grad_norm": 2.2333713224605662, "learning_rate": 9.468767090112328e-06, "loss": 0.4347, "step": 1238 }, { "epoch": 0.1737727910238429, "grad_norm": 2.323396322453706, "learning_rate": 9.467747840464348e-06, "loss": 0.4057, "step": 1239 }, { "epoch": 0.17391304347826086, "grad_norm": 2.201717928401065, "learning_rate": 9.466727668927817e-06, "loss": 0.4482, "step": 1240 }, { "epoch": 0.17405329593267882, "grad_norm": 3.6871073885602232, "learning_rate": 9.465706575713235e-06, "loss": 0.3794, "step": 1241 }, { "epoch": 0.17419354838709677, "grad_norm": 2.4971196748213007, "learning_rate": 9.464684561031306e-06, "loss": 0.4636, "step": 1242 }, { "epoch": 0.17433380084151473, "grad_norm": 3.0448723887495732, "learning_rate": 9.463661625092907e-06, "loss": 0.3944, "step": 1243 }, { "epoch": 0.17447405329593269, "grad_norm": 2.6748980101634925, "learning_rate": 9.462637768109119e-06, "loss": 0.3785, "step": 1244 }, { "epoch": 0.17461430575035064, "grad_norm": 3.436399942722793, "learning_rate": 9.461612990291205e-06, "loss": 0.4122, "step": 1245 }, { "epoch": 0.1747545582047686, "grad_norm": 3.2400271652401527, "learning_rate": 9.460587291850623e-06, "loss": 0.3889, "step": 1246 }, { "epoch": 0.17489481065918652, "grad_norm": 1.816902207786625, "learning_rate": 9.459560672999016e-06, "loss": 0.3969, "step": 1247 }, { "epoch": 0.17503506311360448, "grad_norm": 1.916401146727593, "learning_rate": 9.458533133948223e-06, "loss": 0.4011, "step": 1248 }, { "epoch": 0.17517531556802243, "grad_norm": 2.0131263326920577, "learning_rate": 9.457504674910265e-06, "loss": 0.3718, "step": 1249 }, { "epoch": 0.1753155680224404, "grad_norm": 2.584319377701908, "learning_rate": 9.45647529609736e-06, "loss": 0.382, "step": 1250 }, { "epoch": 0.17545582047685834, "grad_norm": 2.230159597801251, "learning_rate": 9.455444997721916e-06, "loss": 0.4054, "step": 1251 }, { "epoch": 0.1755960729312763, "grad_norm": 2.29695378382502, "learning_rate": 9.454413779996523e-06, "loss": 0.4555, "step": 1252 }, { "epoch": 0.17573632538569425, "grad_norm": 2.1711072044875364, "learning_rate": 9.453381643133968e-06, "loss": 0.4293, "step": 1253 }, { "epoch": 0.1758765778401122, "grad_norm": 1.8950240605218354, "learning_rate": 9.452348587347224e-06, "loss": 0.4058, "step": 1254 }, { "epoch": 0.17601683029453016, "grad_norm": 2.1408903973751197, "learning_rate": 9.451314612849456e-06, "loss": 0.444, "step": 1255 }, { "epoch": 0.17615708274894812, "grad_norm": 1.8594666786993295, "learning_rate": 9.450279719854016e-06, "loss": 0.4166, "step": 1256 }, { "epoch": 0.17629733520336605, "grad_norm": 1.9891962070879203, "learning_rate": 9.44924390857445e-06, "loss": 0.396, "step": 1257 }, { "epoch": 0.176437587657784, "grad_norm": 2.8283465388238334, "learning_rate": 9.448207179224487e-06, "loss": 0.395, "step": 1258 }, { "epoch": 0.17657784011220196, "grad_norm": 6.6509716614753405, "learning_rate": 9.44716953201805e-06, "loss": 0.4057, "step": 1259 }, { "epoch": 0.1767180925666199, "grad_norm": 2.4270617657359415, "learning_rate": 9.446130967169251e-06, "loss": 0.4031, "step": 1260 }, { "epoch": 0.17685834502103787, "grad_norm": 1.956520546921998, "learning_rate": 9.44509148489239e-06, "loss": 0.3916, "step": 1261 }, { "epoch": 0.17699859747545582, "grad_norm": 2.619777024659837, "learning_rate": 9.444051085401957e-06, "loss": 0.3839, "step": 1262 }, { "epoch": 0.17713884992987378, "grad_norm": 2.361267240510828, "learning_rate": 9.44300976891263e-06, "loss": 0.3769, "step": 1263 }, { "epoch": 0.17727910238429173, "grad_norm": 2.458383160964393, "learning_rate": 9.44196753563928e-06, "loss": 0.4219, "step": 1264 }, { "epoch": 0.1774193548387097, "grad_norm": 1.9114083708869307, "learning_rate": 9.440924385796964e-06, "loss": 0.392, "step": 1265 }, { "epoch": 0.17755960729312764, "grad_norm": 1.7480757897399084, "learning_rate": 9.439880319600924e-06, "loss": 0.336, "step": 1266 }, { "epoch": 0.17769985974754557, "grad_norm": 2.565847304048959, "learning_rate": 9.438835337266603e-06, "loss": 0.4076, "step": 1267 }, { "epoch": 0.17784011220196352, "grad_norm": 2.56245878232976, "learning_rate": 9.43778943900962e-06, "loss": 0.4294, "step": 1268 }, { "epoch": 0.17798036465638148, "grad_norm": 2.130491565880132, "learning_rate": 9.436742625045794e-06, "loss": 0.3948, "step": 1269 }, { "epoch": 0.17812061711079943, "grad_norm": 4.078736942608223, "learning_rate": 9.435694895591124e-06, "loss": 0.4387, "step": 1270 }, { "epoch": 0.1782608695652174, "grad_norm": 2.231195388253836, "learning_rate": 9.434646250861801e-06, "loss": 0.3925, "step": 1271 }, { "epoch": 0.17840112201963534, "grad_norm": 2.37259783479781, "learning_rate": 9.433596691074207e-06, "loss": 0.3996, "step": 1272 }, { "epoch": 0.1785413744740533, "grad_norm": 2.0671959793943393, "learning_rate": 9.432546216444912e-06, "loss": 0.401, "step": 1273 }, { "epoch": 0.17868162692847125, "grad_norm": 2.1230714686003656, "learning_rate": 9.431494827190673e-06, "loss": 0.4037, "step": 1274 }, { "epoch": 0.1788218793828892, "grad_norm": 2.3101467840615477, "learning_rate": 9.430442523528437e-06, "loss": 0.4683, "step": 1275 }, { "epoch": 0.17896213183730716, "grad_norm": 2.1702603712199924, "learning_rate": 9.429389305675342e-06, "loss": 0.3653, "step": 1276 }, { "epoch": 0.1791023842917251, "grad_norm": 1.916233248702735, "learning_rate": 9.428335173848708e-06, "loss": 0.4099, "step": 1277 }, { "epoch": 0.17924263674614305, "grad_norm": 2.026866349476185, "learning_rate": 9.427280128266049e-06, "loss": 0.4409, "step": 1278 }, { "epoch": 0.179382889200561, "grad_norm": 1.7998743649188929, "learning_rate": 9.42622416914507e-06, "loss": 0.3886, "step": 1279 }, { "epoch": 0.17952314165497896, "grad_norm": 2.4149245417398375, "learning_rate": 9.425167296703655e-06, "loss": 0.3896, "step": 1280 }, { "epoch": 0.1796633941093969, "grad_norm": 3.1346046858024827, "learning_rate": 9.424109511159887e-06, "loss": 0.3891, "step": 1281 }, { "epoch": 0.17980364656381487, "grad_norm": 1.8760595506842588, "learning_rate": 9.423050812732029e-06, "loss": 0.4091, "step": 1282 }, { "epoch": 0.17994389901823282, "grad_norm": 2.065049762818974, "learning_rate": 9.421991201638539e-06, "loss": 0.3934, "step": 1283 }, { "epoch": 0.18008415147265078, "grad_norm": 3.8631468345470634, "learning_rate": 9.420930678098057e-06, "loss": 0.4096, "step": 1284 }, { "epoch": 0.18022440392706873, "grad_norm": 2.5088842840896723, "learning_rate": 9.419869242329417e-06, "loss": 0.484, "step": 1285 }, { "epoch": 0.1803646563814867, "grad_norm": 2.3024466272519746, "learning_rate": 9.41880689455164e-06, "loss": 0.3983, "step": 1286 }, { "epoch": 0.18050490883590461, "grad_norm": 5.286998802581762, "learning_rate": 9.417743634983933e-06, "loss": 0.3722, "step": 1287 }, { "epoch": 0.18064516129032257, "grad_norm": 1.9964600467711022, "learning_rate": 9.416679463845691e-06, "loss": 0.4233, "step": 1288 }, { "epoch": 0.18078541374474053, "grad_norm": 1.7774773516063807, "learning_rate": 9.415614381356496e-06, "loss": 0.4363, "step": 1289 }, { "epoch": 0.18092566619915848, "grad_norm": 2.0877998410820227, "learning_rate": 9.414548387736127e-06, "loss": 0.377, "step": 1290 }, { "epoch": 0.18106591865357644, "grad_norm": 2.008161699973019, "learning_rate": 9.413481483204541e-06, "loss": 0.3891, "step": 1291 }, { "epoch": 0.1812061711079944, "grad_norm": 1.9249842234373735, "learning_rate": 9.412413667981884e-06, "loss": 0.4526, "step": 1292 }, { "epoch": 0.18134642356241235, "grad_norm": 2.222781963849238, "learning_rate": 9.411344942288493e-06, "loss": 0.3976, "step": 1293 }, { "epoch": 0.1814866760168303, "grad_norm": 1.9944357359414755, "learning_rate": 9.410275306344895e-06, "loss": 0.4038, "step": 1294 }, { "epoch": 0.18162692847124826, "grad_norm": 2.1647634094154253, "learning_rate": 9.409204760371803e-06, "loss": 0.4259, "step": 1295 }, { "epoch": 0.1817671809256662, "grad_norm": 2.0766959188410454, "learning_rate": 9.40813330459011e-06, "loss": 0.4083, "step": 1296 }, { "epoch": 0.18190743338008414, "grad_norm": 1.8051194503864763, "learning_rate": 9.407060939220907e-06, "loss": 0.4057, "step": 1297 }, { "epoch": 0.1820476858345021, "grad_norm": 2.2887919920356232, "learning_rate": 9.405987664485472e-06, "loss": 0.4404, "step": 1298 }, { "epoch": 0.18218793828892005, "grad_norm": 2.82084857780661, "learning_rate": 9.404913480605264e-06, "loss": 0.3819, "step": 1299 }, { "epoch": 0.182328190743338, "grad_norm": 2.7994823556334794, "learning_rate": 9.403838387801933e-06, "loss": 0.477, "step": 1300 }, { "epoch": 0.18246844319775596, "grad_norm": 5.310348793255512, "learning_rate": 9.40276238629732e-06, "loss": 0.4191, "step": 1301 }, { "epoch": 0.1826086956521739, "grad_norm": 2.2485722144238562, "learning_rate": 9.401685476313447e-06, "loss": 0.4196, "step": 1302 }, { "epoch": 0.18274894810659187, "grad_norm": 2.3517466121343826, "learning_rate": 9.400607658072531e-06, "loss": 0.3727, "step": 1303 }, { "epoch": 0.18288920056100982, "grad_norm": 2.7313407411494257, "learning_rate": 9.399528931796968e-06, "loss": 0.4151, "step": 1304 }, { "epoch": 0.18302945301542778, "grad_norm": 2.6340662294488815, "learning_rate": 9.398449297709349e-06, "loss": 0.4394, "step": 1305 }, { "epoch": 0.18316970546984573, "grad_norm": 2.001794963264513, "learning_rate": 9.397368756032445e-06, "loss": 0.4329, "step": 1306 }, { "epoch": 0.18330995792426366, "grad_norm": 2.3730756090724356, "learning_rate": 9.396287306989224e-06, "loss": 0.4592, "step": 1307 }, { "epoch": 0.18345021037868162, "grad_norm": 2.6745381607240053, "learning_rate": 9.39520495080283e-06, "loss": 0.4108, "step": 1308 }, { "epoch": 0.18359046283309957, "grad_norm": 1.9216225931297322, "learning_rate": 9.394121687696602e-06, "loss": 0.4359, "step": 1309 }, { "epoch": 0.18373071528751753, "grad_norm": 1.9238677199883212, "learning_rate": 9.393037517894063e-06, "loss": 0.3791, "step": 1310 }, { "epoch": 0.18387096774193548, "grad_norm": 2.4132352232441616, "learning_rate": 9.391952441618926e-06, "loss": 0.4041, "step": 1311 }, { "epoch": 0.18401122019635344, "grad_norm": 1.878841343510048, "learning_rate": 9.390866459095085e-06, "loss": 0.3719, "step": 1312 }, { "epoch": 0.1841514726507714, "grad_norm": 5.015415460600611, "learning_rate": 9.389779570546628e-06, "loss": 0.3606, "step": 1313 }, { "epoch": 0.18429172510518935, "grad_norm": 3.078994550699514, "learning_rate": 9.388691776197827e-06, "loss": 0.4732, "step": 1314 }, { "epoch": 0.1844319775596073, "grad_norm": 2.5962899881514936, "learning_rate": 9.38760307627314e-06, "loss": 0.3743, "step": 1315 }, { "epoch": 0.18457223001402526, "grad_norm": 1.8070094376503871, "learning_rate": 9.38651347099721e-06, "loss": 0.3603, "step": 1316 }, { "epoch": 0.18471248246844318, "grad_norm": 2.8092288173370177, "learning_rate": 9.385422960594875e-06, "loss": 0.4026, "step": 1317 }, { "epoch": 0.18485273492286114, "grad_norm": 2.2046945811049135, "learning_rate": 9.384331545291149e-06, "loss": 0.3708, "step": 1318 }, { "epoch": 0.1849929873772791, "grad_norm": 2.891487776383314, "learning_rate": 9.38323922531124e-06, "loss": 0.4461, "step": 1319 }, { "epoch": 0.18513323983169705, "grad_norm": 2.8598962527577965, "learning_rate": 9.38214600088054e-06, "loss": 0.3994, "step": 1320 }, { "epoch": 0.185273492286115, "grad_norm": 2.067061725922081, "learning_rate": 9.381051872224632e-06, "loss": 0.4338, "step": 1321 }, { "epoch": 0.18541374474053296, "grad_norm": 3.6242260600018565, "learning_rate": 9.379956839569275e-06, "loss": 0.4035, "step": 1322 }, { "epoch": 0.18555399719495091, "grad_norm": 2.6338959680345133, "learning_rate": 9.378860903140428e-06, "loss": 0.3675, "step": 1323 }, { "epoch": 0.18569424964936887, "grad_norm": 2.3073136190112105, "learning_rate": 9.377764063164224e-06, "loss": 0.3736, "step": 1324 }, { "epoch": 0.18583450210378682, "grad_norm": 2.775240116212779, "learning_rate": 9.376666319866993e-06, "loss": 0.4577, "step": 1325 }, { "epoch": 0.18597475455820478, "grad_norm": 2.0588283803225518, "learning_rate": 9.375567673475246e-06, "loss": 0.4372, "step": 1326 }, { "epoch": 0.1861150070126227, "grad_norm": 1.9292353667954527, "learning_rate": 9.374468124215676e-06, "loss": 0.4161, "step": 1327 }, { "epoch": 0.18625525946704066, "grad_norm": 1.9314512771358503, "learning_rate": 9.373367672315174e-06, "loss": 0.4031, "step": 1328 }, { "epoch": 0.18639551192145862, "grad_norm": 1.967248131845537, "learning_rate": 9.372266318000806e-06, "loss": 0.3874, "step": 1329 }, { "epoch": 0.18653576437587657, "grad_norm": 2.249186156693607, "learning_rate": 9.371164061499831e-06, "loss": 0.4165, "step": 1330 }, { "epoch": 0.18667601683029453, "grad_norm": 2.184013421908207, "learning_rate": 9.37006090303969e-06, "loss": 0.3871, "step": 1331 }, { "epoch": 0.18681626928471248, "grad_norm": 2.6464915820935078, "learning_rate": 9.368956842848014e-06, "loss": 0.4071, "step": 1332 }, { "epoch": 0.18695652173913044, "grad_norm": 1.945588119162185, "learning_rate": 9.367851881152618e-06, "loss": 0.3855, "step": 1333 }, { "epoch": 0.1870967741935484, "grad_norm": 2.4132731607041604, "learning_rate": 9.366746018181503e-06, "loss": 0.4268, "step": 1334 }, { "epoch": 0.18723702664796635, "grad_norm": 2.343077703056088, "learning_rate": 9.365639254162855e-06, "loss": 0.3824, "step": 1335 }, { "epoch": 0.1873772791023843, "grad_norm": 2.504778015443031, "learning_rate": 9.364531589325048e-06, "loss": 0.4229, "step": 1336 }, { "epoch": 0.18751753155680223, "grad_norm": 4.257230106385012, "learning_rate": 9.363423023896641e-06, "loss": 0.4264, "step": 1337 }, { "epoch": 0.18765778401122019, "grad_norm": 2.5754928561633474, "learning_rate": 9.362313558106376e-06, "loss": 0.3747, "step": 1338 }, { "epoch": 0.18779803646563814, "grad_norm": 2.2302334689761425, "learning_rate": 9.361203192183188e-06, "loss": 0.3789, "step": 1339 }, { "epoch": 0.1879382889200561, "grad_norm": 2.0942703283752526, "learning_rate": 9.36009192635619e-06, "loss": 0.4049, "step": 1340 }, { "epoch": 0.18807854137447405, "grad_norm": 5.170522276656714, "learning_rate": 9.358979760854686e-06, "loss": 0.3775, "step": 1341 }, { "epoch": 0.188218793828892, "grad_norm": 2.752491082810207, "learning_rate": 9.357866695908162e-06, "loss": 0.418, "step": 1342 }, { "epoch": 0.18835904628330996, "grad_norm": 2.6952625214739037, "learning_rate": 9.356752731746292e-06, "loss": 0.4513, "step": 1343 }, { "epoch": 0.18849929873772792, "grad_norm": 2.7231864248811504, "learning_rate": 9.355637868598935e-06, "loss": 0.4096, "step": 1344 }, { "epoch": 0.18863955119214587, "grad_norm": 2.953142761499993, "learning_rate": 9.354522106696133e-06, "loss": 0.3857, "step": 1345 }, { "epoch": 0.18877980364656383, "grad_norm": 2.09650422185122, "learning_rate": 9.353405446268119e-06, "loss": 0.3789, "step": 1346 }, { "epoch": 0.18892005610098175, "grad_norm": 2.269377238683035, "learning_rate": 9.352287887545305e-06, "loss": 0.3962, "step": 1347 }, { "epoch": 0.1890603085553997, "grad_norm": 2.515045383567181, "learning_rate": 9.351169430758293e-06, "loss": 0.4204, "step": 1348 }, { "epoch": 0.18920056100981766, "grad_norm": 2.3111362044284456, "learning_rate": 9.350050076137871e-06, "loss": 0.4028, "step": 1349 }, { "epoch": 0.18934081346423562, "grad_norm": 2.980511786273907, "learning_rate": 9.348929823915005e-06, "loss": 0.4341, "step": 1350 }, { "epoch": 0.18948106591865357, "grad_norm": 1.936557048096184, "learning_rate": 9.347808674320855e-06, "loss": 0.4075, "step": 1351 }, { "epoch": 0.18962131837307153, "grad_norm": 2.6976251825200275, "learning_rate": 9.346686627586761e-06, "loss": 0.4315, "step": 1352 }, { "epoch": 0.18976157082748948, "grad_norm": 2.407604480349974, "learning_rate": 9.345563683944249e-06, "loss": 0.3889, "step": 1353 }, { "epoch": 0.18990182328190744, "grad_norm": 2.501290465127207, "learning_rate": 9.344439843625034e-06, "loss": 0.4287, "step": 1354 }, { "epoch": 0.1900420757363254, "grad_norm": 2.50757177035308, "learning_rate": 9.343315106861008e-06, "loss": 0.3932, "step": 1355 }, { "epoch": 0.19018232819074335, "grad_norm": 1.9337898540273668, "learning_rate": 9.342189473884254e-06, "loss": 0.4143, "step": 1356 }, { "epoch": 0.19032258064516128, "grad_norm": 3.0467529272420832, "learning_rate": 9.341062944927039e-06, "loss": 0.4613, "step": 1357 }, { "epoch": 0.19046283309957923, "grad_norm": 2.042810262509889, "learning_rate": 9.339935520221816e-06, "loss": 0.4288, "step": 1358 }, { "epoch": 0.1906030855539972, "grad_norm": 2.2597999141534757, "learning_rate": 9.338807200001218e-06, "loss": 0.3858, "step": 1359 }, { "epoch": 0.19074333800841514, "grad_norm": 2.200438031025038, "learning_rate": 9.337677984498069e-06, "loss": 0.389, "step": 1360 }, { "epoch": 0.1908835904628331, "grad_norm": 2.014207089888786, "learning_rate": 9.336547873945372e-06, "loss": 0.3699, "step": 1361 }, { "epoch": 0.19102384291725105, "grad_norm": 1.9686073978186245, "learning_rate": 9.33541686857632e-06, "loss": 0.3764, "step": 1362 }, { "epoch": 0.191164095371669, "grad_norm": 2.473033713506233, "learning_rate": 9.334284968624286e-06, "loss": 0.435, "step": 1363 }, { "epoch": 0.19130434782608696, "grad_norm": 2.099282996071567, "learning_rate": 9.33315217432283e-06, "loss": 0.4149, "step": 1364 }, { "epoch": 0.19144460028050492, "grad_norm": 2.178538467372289, "learning_rate": 9.332018485905699e-06, "loss": 0.386, "step": 1365 }, { "epoch": 0.19158485273492287, "grad_norm": 2.190162346453066, "learning_rate": 9.330883903606816e-06, "loss": 0.3973, "step": 1366 }, { "epoch": 0.1917251051893408, "grad_norm": 2.5280598446969447, "learning_rate": 9.329748427660299e-06, "loss": 0.411, "step": 1367 }, { "epoch": 0.19186535764375875, "grad_norm": 2.2984575282403537, "learning_rate": 9.328612058300443e-06, "loss": 0.4265, "step": 1368 }, { "epoch": 0.1920056100981767, "grad_norm": 2.063347151201084, "learning_rate": 9.327474795761734e-06, "loss": 0.4387, "step": 1369 }, { "epoch": 0.19214586255259467, "grad_norm": 2.006159595579826, "learning_rate": 9.326336640278831e-06, "loss": 0.3644, "step": 1370 }, { "epoch": 0.19228611500701262, "grad_norm": 3.438110158694435, "learning_rate": 9.32519759208659e-06, "loss": 0.3748, "step": 1371 }, { "epoch": 0.19242636746143058, "grad_norm": 2.0239572925208282, "learning_rate": 9.324057651420045e-06, "loss": 0.4758, "step": 1372 }, { "epoch": 0.19256661991584853, "grad_norm": 2.21528236311621, "learning_rate": 9.322916818514414e-06, "loss": 0.383, "step": 1373 }, { "epoch": 0.19270687237026649, "grad_norm": 2.3135107769736756, "learning_rate": 9.321775093605097e-06, "loss": 0.3693, "step": 1374 }, { "epoch": 0.19284712482468444, "grad_norm": 2.2661269059498035, "learning_rate": 9.320632476927687e-06, "loss": 0.4537, "step": 1375 }, { "epoch": 0.1929873772791024, "grad_norm": 3.500027656445907, "learning_rate": 9.31948896871795e-06, "loss": 0.4226, "step": 1376 }, { "epoch": 0.19312762973352032, "grad_norm": 2.7175424459137743, "learning_rate": 9.318344569211843e-06, "loss": 0.3953, "step": 1377 }, { "epoch": 0.19326788218793828, "grad_norm": 2.4276732899721574, "learning_rate": 9.317199278645506e-06, "loss": 0.447, "step": 1378 }, { "epoch": 0.19340813464235623, "grad_norm": 3.0923585170740013, "learning_rate": 9.316053097255258e-06, "loss": 0.4311, "step": 1379 }, { "epoch": 0.1935483870967742, "grad_norm": 2.499313641744383, "learning_rate": 9.314906025277609e-06, "loss": 0.4257, "step": 1380 }, { "epoch": 0.19368863955119214, "grad_norm": 2.2872057991322574, "learning_rate": 9.313758062949246e-06, "loss": 0.4493, "step": 1381 }, { "epoch": 0.1938288920056101, "grad_norm": 4.697298515347429, "learning_rate": 9.312609210507046e-06, "loss": 0.4156, "step": 1382 }, { "epoch": 0.19396914446002805, "grad_norm": 2.701639285300845, "learning_rate": 9.311459468188066e-06, "loss": 0.3711, "step": 1383 }, { "epoch": 0.194109396914446, "grad_norm": 2.3189878439442775, "learning_rate": 9.310308836229548e-06, "loss": 0.4267, "step": 1384 }, { "epoch": 0.19424964936886396, "grad_norm": 4.523857874599961, "learning_rate": 9.309157314868916e-06, "loss": 0.4061, "step": 1385 }, { "epoch": 0.19438990182328192, "grad_norm": 2.713873600686242, "learning_rate": 9.308004904343776e-06, "loss": 0.4249, "step": 1386 }, { "epoch": 0.19453015427769985, "grad_norm": 1.772783738684182, "learning_rate": 9.306851604891925e-06, "loss": 0.4243, "step": 1387 }, { "epoch": 0.1946704067321178, "grad_norm": 2.4064009978241576, "learning_rate": 9.305697416751333e-06, "loss": 0.4165, "step": 1388 }, { "epoch": 0.19481065918653576, "grad_norm": 2.1153757148502077, "learning_rate": 9.304542340160162e-06, "loss": 0.3876, "step": 1389 }, { "epoch": 0.1949509116409537, "grad_norm": 2.384502474475808, "learning_rate": 9.303386375356752e-06, "loss": 0.4414, "step": 1390 }, { "epoch": 0.19509116409537167, "grad_norm": 2.489382896707716, "learning_rate": 9.302229522579631e-06, "loss": 0.4028, "step": 1391 }, { "epoch": 0.19523141654978962, "grad_norm": 1.977480588448759, "learning_rate": 9.301071782067504e-06, "loss": 0.4126, "step": 1392 }, { "epoch": 0.19537166900420758, "grad_norm": 2.118782709369969, "learning_rate": 9.299913154059265e-06, "loss": 0.4234, "step": 1393 }, { "epoch": 0.19551192145862553, "grad_norm": 2.380494137444876, "learning_rate": 9.29875363879399e-06, "loss": 0.4174, "step": 1394 }, { "epoch": 0.1956521739130435, "grad_norm": 2.6250699351849307, "learning_rate": 9.297593236510933e-06, "loss": 0.458, "step": 1395 }, { "epoch": 0.19579242636746144, "grad_norm": 2.9620154841686075, "learning_rate": 9.296431947449538e-06, "loss": 0.4409, "step": 1396 }, { "epoch": 0.19593267882187937, "grad_norm": 2.3089261662840843, "learning_rate": 9.295269771849426e-06, "loss": 0.4022, "step": 1397 }, { "epoch": 0.19607293127629732, "grad_norm": 1.9510896377110778, "learning_rate": 9.294106709950408e-06, "loss": 0.4083, "step": 1398 }, { "epoch": 0.19621318373071528, "grad_norm": 2.206352196938933, "learning_rate": 9.292942761992471e-06, "loss": 0.3777, "step": 1399 }, { "epoch": 0.19635343618513323, "grad_norm": 1.8893318482776549, "learning_rate": 9.291777928215787e-06, "loss": 0.4069, "step": 1400 }, { "epoch": 0.1964936886395512, "grad_norm": 2.592875505582783, "learning_rate": 9.290612208860713e-06, "loss": 0.3711, "step": 1401 }, { "epoch": 0.19663394109396914, "grad_norm": 2.2361284325075563, "learning_rate": 9.289445604167786e-06, "loss": 0.4059, "step": 1402 }, { "epoch": 0.1967741935483871, "grad_norm": 1.8215574010599955, "learning_rate": 9.288278114377727e-06, "loss": 0.4116, "step": 1403 }, { "epoch": 0.19691444600280505, "grad_norm": 2.4257357434904807, "learning_rate": 9.28710973973144e-06, "loss": 0.4123, "step": 1404 }, { "epoch": 0.197054698457223, "grad_norm": 2.92372537864309, "learning_rate": 9.28594048047001e-06, "loss": 0.4824, "step": 1405 }, { "epoch": 0.19719495091164096, "grad_norm": 2.236373541091785, "learning_rate": 9.284770336834706e-06, "loss": 0.393, "step": 1406 }, { "epoch": 0.1973352033660589, "grad_norm": 2.3508061304674217, "learning_rate": 9.283599309066977e-06, "loss": 0.4082, "step": 1407 }, { "epoch": 0.19747545582047685, "grad_norm": 6.242480828527329, "learning_rate": 9.28242739740846e-06, "loss": 0.3843, "step": 1408 }, { "epoch": 0.1976157082748948, "grad_norm": 2.343863014039593, "learning_rate": 9.281254602100968e-06, "loss": 0.415, "step": 1409 }, { "epoch": 0.19775596072931276, "grad_norm": 2.5285849022063736, "learning_rate": 9.280080923386501e-06, "loss": 0.4191, "step": 1410 }, { "epoch": 0.1978962131837307, "grad_norm": 2.886189454816124, "learning_rate": 9.278906361507238e-06, "loss": 0.3954, "step": 1411 }, { "epoch": 0.19803646563814867, "grad_norm": 2.471271434453646, "learning_rate": 9.277730916705544e-06, "loss": 0.3775, "step": 1412 }, { "epoch": 0.19817671809256662, "grad_norm": 3.3561141400392924, "learning_rate": 9.276554589223958e-06, "loss": 0.4067, "step": 1413 }, { "epoch": 0.19831697054698458, "grad_norm": 2.1284411000943964, "learning_rate": 9.275377379305214e-06, "loss": 0.3623, "step": 1414 }, { "epoch": 0.19845722300140253, "grad_norm": 2.1647020627436877, "learning_rate": 9.274199287192215e-06, "loss": 0.3959, "step": 1415 }, { "epoch": 0.1985974754558205, "grad_norm": 1.9409602859970183, "learning_rate": 9.273020313128059e-06, "loss": 0.3607, "step": 1416 }, { "epoch": 0.19873772791023842, "grad_norm": 2.194122643623745, "learning_rate": 9.271840457356013e-06, "loss": 0.3979, "step": 1417 }, { "epoch": 0.19887798036465637, "grad_norm": 2.058356197194156, "learning_rate": 9.270659720119533e-06, "loss": 0.4303, "step": 1418 }, { "epoch": 0.19901823281907433, "grad_norm": 2.9008734407166363, "learning_rate": 9.26947810166226e-06, "loss": 0.4401, "step": 1419 }, { "epoch": 0.19915848527349228, "grad_norm": 2.3008698477145173, "learning_rate": 9.268295602228005e-06, "loss": 0.3912, "step": 1420 }, { "epoch": 0.19929873772791024, "grad_norm": 1.9900117249838303, "learning_rate": 9.267112222060777e-06, "loss": 0.4056, "step": 1421 }, { "epoch": 0.1994389901823282, "grad_norm": 1.6835911301594684, "learning_rate": 9.265927961404754e-06, "loss": 0.4402, "step": 1422 }, { "epoch": 0.19957924263674615, "grad_norm": 2.769281760789626, "learning_rate": 9.2647428205043e-06, "loss": 0.4287, "step": 1423 }, { "epoch": 0.1997194950911641, "grad_norm": 2.4279233163856357, "learning_rate": 9.26355679960396e-06, "loss": 0.391, "step": 1424 }, { "epoch": 0.19985974754558206, "grad_norm": 2.7814875297698856, "learning_rate": 9.262369898948462e-06, "loss": 0.3719, "step": 1425 }, { "epoch": 0.2, "grad_norm": 2.462462712504218, "learning_rate": 9.261182118782714e-06, "loss": 0.3894, "step": 1426 }, { "epoch": 0.20014025245441797, "grad_norm": 2.2714046028017347, "learning_rate": 9.259993459351809e-06, "loss": 0.3685, "step": 1427 }, { "epoch": 0.2002805049088359, "grad_norm": 1.8996433776661321, "learning_rate": 9.258803920901014e-06, "loss": 0.3699, "step": 1428 }, { "epoch": 0.20042075736325385, "grad_norm": 2.4279677999268627, "learning_rate": 9.257613503675787e-06, "loss": 0.3952, "step": 1429 }, { "epoch": 0.2005610098176718, "grad_norm": 2.5510466877109206, "learning_rate": 9.256422207921757e-06, "loss": 0.3901, "step": 1430 }, { "epoch": 0.20070126227208976, "grad_norm": 2.087034815506554, "learning_rate": 9.255230033884743e-06, "loss": 0.3665, "step": 1431 }, { "epoch": 0.2008415147265077, "grad_norm": 1.796151056823173, "learning_rate": 9.254036981810741e-06, "loss": 0.4049, "step": 1432 }, { "epoch": 0.20098176718092567, "grad_norm": 2.5513892858407226, "learning_rate": 9.252843051945928e-06, "loss": 0.3709, "step": 1433 }, { "epoch": 0.20112201963534362, "grad_norm": 2.0344492440219453, "learning_rate": 9.251648244536665e-06, "loss": 0.4165, "step": 1434 }, { "epoch": 0.20126227208976158, "grad_norm": 2.146802689446459, "learning_rate": 9.25045255982949e-06, "loss": 0.4006, "step": 1435 }, { "epoch": 0.20140252454417953, "grad_norm": 2.0697541257630774, "learning_rate": 9.249255998071127e-06, "loss": 0.4222, "step": 1436 }, { "epoch": 0.2015427769985975, "grad_norm": 2.232451778288349, "learning_rate": 9.248058559508476e-06, "loss": 0.3758, "step": 1437 }, { "epoch": 0.20168302945301542, "grad_norm": 3.4215996339688517, "learning_rate": 9.246860244388621e-06, "loss": 0.4274, "step": 1438 }, { "epoch": 0.20182328190743337, "grad_norm": 2.863454607910686, "learning_rate": 9.245661052958823e-06, "loss": 0.3895, "step": 1439 }, { "epoch": 0.20196353436185133, "grad_norm": 2.1933680709978995, "learning_rate": 9.244460985466532e-06, "loss": 0.3714, "step": 1440 }, { "epoch": 0.20210378681626928, "grad_norm": 2.669707104765996, "learning_rate": 9.24326004215937e-06, "loss": 0.4157, "step": 1441 }, { "epoch": 0.20224403927068724, "grad_norm": 3.4700354866170096, "learning_rate": 9.242058223285143e-06, "loss": 0.3998, "step": 1442 }, { "epoch": 0.2023842917251052, "grad_norm": 2.505279592873909, "learning_rate": 9.24085552909184e-06, "loss": 0.4437, "step": 1443 }, { "epoch": 0.20252454417952315, "grad_norm": 3.434467434528662, "learning_rate": 9.239651959827627e-06, "loss": 0.3708, "step": 1444 }, { "epoch": 0.2026647966339411, "grad_norm": 2.5130967413682725, "learning_rate": 9.238447515740853e-06, "loss": 0.4148, "step": 1445 }, { "epoch": 0.20280504908835906, "grad_norm": 2.105501022516173, "learning_rate": 9.237242197080045e-06, "loss": 0.411, "step": 1446 }, { "epoch": 0.202945301542777, "grad_norm": 1.8854235513747604, "learning_rate": 9.236036004093916e-06, "loss": 0.3427, "step": 1447 }, { "epoch": 0.20308555399719494, "grad_norm": 2.55042155652096, "learning_rate": 9.23482893703135e-06, "loss": 0.3727, "step": 1448 }, { "epoch": 0.2032258064516129, "grad_norm": 2.420095368232234, "learning_rate": 9.233620996141421e-06, "loss": 0.4375, "step": 1449 }, { "epoch": 0.20336605890603085, "grad_norm": 2.0828050960972937, "learning_rate": 9.232412181673378e-06, "loss": 0.3818, "step": 1450 }, { "epoch": 0.2035063113604488, "grad_norm": 4.233784426787666, "learning_rate": 9.231202493876652e-06, "loss": 0.434, "step": 1451 }, { "epoch": 0.20364656381486676, "grad_norm": 2.1116800782405782, "learning_rate": 9.229991933000852e-06, "loss": 0.4015, "step": 1452 }, { "epoch": 0.20378681626928472, "grad_norm": 2.615944911849222, "learning_rate": 9.22878049929577e-06, "loss": 0.3836, "step": 1453 }, { "epoch": 0.20392706872370267, "grad_norm": 4.375384504587943, "learning_rate": 9.227568193011375e-06, "loss": 0.4528, "step": 1454 }, { "epoch": 0.20406732117812063, "grad_norm": 3.3040938328864926, "learning_rate": 9.226355014397823e-06, "loss": 0.4139, "step": 1455 }, { "epoch": 0.20420757363253858, "grad_norm": 2.314455803334421, "learning_rate": 9.225140963705439e-06, "loss": 0.3664, "step": 1456 }, { "epoch": 0.20434782608695654, "grad_norm": 2.0065778803061414, "learning_rate": 9.223926041184737e-06, "loss": 0.385, "step": 1457 }, { "epoch": 0.20448807854137446, "grad_norm": 2.186802998262635, "learning_rate": 9.222710247086405e-06, "loss": 0.4081, "step": 1458 }, { "epoch": 0.20462833099579242, "grad_norm": 2.475878696323528, "learning_rate": 9.221493581661318e-06, "loss": 0.4083, "step": 1459 }, { "epoch": 0.20476858345021037, "grad_norm": 2.719296630627898, "learning_rate": 9.220276045160524e-06, "loss": 0.4588, "step": 1460 }, { "epoch": 0.20490883590462833, "grad_norm": 2.2386737366194165, "learning_rate": 9.219057637835252e-06, "loss": 0.3836, "step": 1461 }, { "epoch": 0.20504908835904628, "grad_norm": 2.3176780670132886, "learning_rate": 9.217838359936914e-06, "loss": 0.4386, "step": 1462 }, { "epoch": 0.20518934081346424, "grad_norm": 2.8205283127317022, "learning_rate": 9.216618211717098e-06, "loss": 0.404, "step": 1463 }, { "epoch": 0.2053295932678822, "grad_norm": 2.394257123729619, "learning_rate": 9.215397193427575e-06, "loss": 0.3781, "step": 1464 }, { "epoch": 0.20546984572230015, "grad_norm": 3.109693654663509, "learning_rate": 9.21417530532029e-06, "loss": 0.4384, "step": 1465 }, { "epoch": 0.2056100981767181, "grad_norm": 2.2488498927221303, "learning_rate": 9.212952547647375e-06, "loss": 0.4156, "step": 1466 }, { "epoch": 0.20575035063113606, "grad_norm": 2.986240782374211, "learning_rate": 9.211728920661136e-06, "loss": 0.4524, "step": 1467 }, { "epoch": 0.20589060308555399, "grad_norm": 2.354129745959072, "learning_rate": 9.21050442461406e-06, "loss": 0.3689, "step": 1468 }, { "epoch": 0.20603085553997194, "grad_norm": 2.8101150573307514, "learning_rate": 9.20927905975881e-06, "loss": 0.427, "step": 1469 }, { "epoch": 0.2061711079943899, "grad_norm": 2.3664697748765042, "learning_rate": 9.208052826348238e-06, "loss": 0.4078, "step": 1470 }, { "epoch": 0.20631136044880785, "grad_norm": 3.456054549528466, "learning_rate": 9.206825724635363e-06, "loss": 0.4205, "step": 1471 }, { "epoch": 0.2064516129032258, "grad_norm": 2.734210548905228, "learning_rate": 9.205597754873391e-06, "loss": 0.3855, "step": 1472 }, { "epoch": 0.20659186535764376, "grad_norm": 2.876762679064388, "learning_rate": 9.204368917315706e-06, "loss": 0.4278, "step": 1473 }, { "epoch": 0.20673211781206172, "grad_norm": 2.3318016156438435, "learning_rate": 9.203139212215868e-06, "loss": 0.4172, "step": 1474 }, { "epoch": 0.20687237026647967, "grad_norm": 2.9254057252330936, "learning_rate": 9.201908639827619e-06, "loss": 0.452, "step": 1475 }, { "epoch": 0.20701262272089763, "grad_norm": 2.717553500269121, "learning_rate": 9.20067720040488e-06, "loss": 0.4106, "step": 1476 }, { "epoch": 0.20715287517531558, "grad_norm": 2.114915029383557, "learning_rate": 9.199444894201748e-06, "loss": 0.4107, "step": 1477 }, { "epoch": 0.2072931276297335, "grad_norm": 2.726447772478814, "learning_rate": 9.198211721472503e-06, "loss": 0.4128, "step": 1478 }, { "epoch": 0.20743338008415146, "grad_norm": 2.5159765910043626, "learning_rate": 9.1969776824716e-06, "loss": 0.3829, "step": 1479 }, { "epoch": 0.20757363253856942, "grad_norm": 2.1955806256204426, "learning_rate": 9.195742777453674e-06, "loss": 0.3776, "step": 1480 }, { "epoch": 0.20771388499298737, "grad_norm": 2.57472504194049, "learning_rate": 9.19450700667354e-06, "loss": 0.4003, "step": 1481 }, { "epoch": 0.20785413744740533, "grad_norm": 2.0983922935041828, "learning_rate": 9.193270370386188e-06, "loss": 0.3897, "step": 1482 }, { "epoch": 0.20799438990182328, "grad_norm": 2.236613825909605, "learning_rate": 9.192032868846794e-06, "loss": 0.4157, "step": 1483 }, { "epoch": 0.20813464235624124, "grad_norm": 2.1168935649137723, "learning_rate": 9.190794502310704e-06, "loss": 0.4028, "step": 1484 }, { "epoch": 0.2082748948106592, "grad_norm": 2.199760736112584, "learning_rate": 9.18955527103345e-06, "loss": 0.3974, "step": 1485 }, { "epoch": 0.20841514726507715, "grad_norm": 1.7899034014801607, "learning_rate": 9.188315175270735e-06, "loss": 0.4169, "step": 1486 }, { "epoch": 0.2085553997194951, "grad_norm": 2.1197719213767376, "learning_rate": 9.187074215278444e-06, "loss": 0.3827, "step": 1487 }, { "epoch": 0.20869565217391303, "grad_norm": 6.672248347465336, "learning_rate": 9.185832391312644e-06, "loss": 0.4774, "step": 1488 }, { "epoch": 0.208835904628331, "grad_norm": 2.302661276604301, "learning_rate": 9.184589703629575e-06, "loss": 0.4104, "step": 1489 }, { "epoch": 0.20897615708274894, "grad_norm": 2.700757969321983, "learning_rate": 9.183346152485654e-06, "loss": 0.4013, "step": 1490 }, { "epoch": 0.2091164095371669, "grad_norm": 1.9589725087073622, "learning_rate": 9.182101738137483e-06, "loss": 0.4001, "step": 1491 }, { "epoch": 0.20925666199158485, "grad_norm": 3.34356817303709, "learning_rate": 9.180856460841836e-06, "loss": 0.3882, "step": 1492 }, { "epoch": 0.2093969144460028, "grad_norm": 3.005874445371172, "learning_rate": 9.17961032085567e-06, "loss": 0.3971, "step": 1493 }, { "epoch": 0.20953716690042076, "grad_norm": 1.9845622417389834, "learning_rate": 9.178363318436115e-06, "loss": 0.3454, "step": 1494 }, { "epoch": 0.20967741935483872, "grad_norm": 2.2403692123422614, "learning_rate": 9.17711545384048e-06, "loss": 0.3917, "step": 1495 }, { "epoch": 0.20981767180925667, "grad_norm": 2.483576808318475, "learning_rate": 9.175866727326255e-06, "loss": 0.3951, "step": 1496 }, { "epoch": 0.20995792426367463, "grad_norm": 2.048815091712597, "learning_rate": 9.174617139151108e-06, "loss": 0.3927, "step": 1497 }, { "epoch": 0.21009817671809256, "grad_norm": 1.9829325201506038, "learning_rate": 9.173366689572878e-06, "loss": 0.3798, "step": 1498 }, { "epoch": 0.2102384291725105, "grad_norm": 1.8619798638046845, "learning_rate": 9.172115378849588e-06, "loss": 0.3571, "step": 1499 }, { "epoch": 0.21037868162692847, "grad_norm": 1.9489584464218077, "learning_rate": 9.170863207239442e-06, "loss": 0.3732, "step": 1500 }, { "epoch": 0.21051893408134642, "grad_norm": 2.0950831254295657, "learning_rate": 9.169610175000812e-06, "loss": 0.4218, "step": 1501 }, { "epoch": 0.21065918653576438, "grad_norm": 1.8420837761407387, "learning_rate": 9.168356282392253e-06, "loss": 0.3985, "step": 1502 }, { "epoch": 0.21079943899018233, "grad_norm": 2.015921282616536, "learning_rate": 9.167101529672496e-06, "loss": 0.4444, "step": 1503 }, { "epoch": 0.21093969144460029, "grad_norm": 2.042364494283169, "learning_rate": 9.165845917100454e-06, "loss": 0.3971, "step": 1504 }, { "epoch": 0.21107994389901824, "grad_norm": 2.4184792615984407, "learning_rate": 9.164589444935211e-06, "loss": 0.4013, "step": 1505 }, { "epoch": 0.2112201963534362, "grad_norm": 2.601623167871354, "learning_rate": 9.163332113436031e-06, "loss": 0.3685, "step": 1506 }, { "epoch": 0.21136044880785415, "grad_norm": 2.190409659822618, "learning_rate": 9.16207392286236e-06, "loss": 0.3305, "step": 1507 }, { "epoch": 0.21150070126227208, "grad_norm": 2.541480031483082, "learning_rate": 9.160814873473811e-06, "loss": 0.4558, "step": 1508 }, { "epoch": 0.21164095371669003, "grad_norm": 2.623477494262688, "learning_rate": 9.159554965530184e-06, "loss": 0.4059, "step": 1509 }, { "epoch": 0.211781206171108, "grad_norm": 2.6626097893801357, "learning_rate": 9.15829419929145e-06, "loss": 0.4111, "step": 1510 }, { "epoch": 0.21192145862552594, "grad_norm": 2.600870401235836, "learning_rate": 9.157032575017762e-06, "loss": 0.4285, "step": 1511 }, { "epoch": 0.2120617110799439, "grad_norm": 1.8254412653221328, "learning_rate": 9.155770092969443e-06, "loss": 0.4262, "step": 1512 }, { "epoch": 0.21220196353436185, "grad_norm": 2.2689426729131585, "learning_rate": 9.154506753407e-06, "loss": 0.4277, "step": 1513 }, { "epoch": 0.2123422159887798, "grad_norm": 2.63141592775659, "learning_rate": 9.153242556591115e-06, "loss": 0.3921, "step": 1514 }, { "epoch": 0.21248246844319776, "grad_norm": 2.239057209996259, "learning_rate": 9.151977502782645e-06, "loss": 0.3789, "step": 1515 }, { "epoch": 0.21262272089761572, "grad_norm": 3.3126547435388036, "learning_rate": 9.150711592242627e-06, "loss": 0.4585, "step": 1516 }, { "epoch": 0.21276297335203367, "grad_norm": 3.024333180475233, "learning_rate": 9.149444825232269e-06, "loss": 0.3752, "step": 1517 }, { "epoch": 0.2129032258064516, "grad_norm": 2.314355054620543, "learning_rate": 9.148177202012962e-06, "loss": 0.4336, "step": 1518 }, { "epoch": 0.21304347826086956, "grad_norm": 2.9054186872829684, "learning_rate": 9.146908722846271e-06, "loss": 0.3722, "step": 1519 }, { "epoch": 0.2131837307152875, "grad_norm": 2.0111219391200548, "learning_rate": 9.145639387993939e-06, "loss": 0.4274, "step": 1520 }, { "epoch": 0.21332398316970547, "grad_norm": 3.9138090923457325, "learning_rate": 9.14436919771788e-06, "loss": 0.4008, "step": 1521 }, { "epoch": 0.21346423562412342, "grad_norm": 3.6613886009438525, "learning_rate": 9.143098152280195e-06, "loss": 0.427, "step": 1522 }, { "epoch": 0.21360448807854138, "grad_norm": 2.242507007974925, "learning_rate": 9.14182625194315e-06, "loss": 0.4512, "step": 1523 }, { "epoch": 0.21374474053295933, "grad_norm": 2.700592265691038, "learning_rate": 9.140553496969195e-06, "loss": 0.4084, "step": 1524 }, { "epoch": 0.2138849929873773, "grad_norm": 2.586223206194609, "learning_rate": 9.139279887620955e-06, "loss": 0.344, "step": 1525 }, { "epoch": 0.21402524544179524, "grad_norm": 2.480895190490221, "learning_rate": 9.13800542416123e-06, "loss": 0.377, "step": 1526 }, { "epoch": 0.2141654978962132, "grad_norm": 2.6939844370210495, "learning_rate": 9.136730106852995e-06, "loss": 0.4272, "step": 1527 }, { "epoch": 0.21430575035063112, "grad_norm": 2.335792970811034, "learning_rate": 9.135453935959405e-06, "loss": 0.3912, "step": 1528 }, { "epoch": 0.21444600280504908, "grad_norm": 2.0814241307818375, "learning_rate": 9.134176911743787e-06, "loss": 0.418, "step": 1529 }, { "epoch": 0.21458625525946703, "grad_norm": 2.561429055816967, "learning_rate": 9.132899034469648e-06, "loss": 0.4307, "step": 1530 }, { "epoch": 0.214726507713885, "grad_norm": 2.2047880132244133, "learning_rate": 9.131620304400667e-06, "loss": 0.4589, "step": 1531 }, { "epoch": 0.21486676016830294, "grad_norm": 2.092484504523836, "learning_rate": 9.1303407218007e-06, "loss": 0.4, "step": 1532 }, { "epoch": 0.2150070126227209, "grad_norm": 2.417213531531512, "learning_rate": 9.129060286933786e-06, "loss": 0.4325, "step": 1533 }, { "epoch": 0.21514726507713886, "grad_norm": 3.479223985637649, "learning_rate": 9.127779000064127e-06, "loss": 0.4306, "step": 1534 }, { "epoch": 0.2152875175315568, "grad_norm": 2.045423968886132, "learning_rate": 9.126496861456113e-06, "loss": 0.3936, "step": 1535 }, { "epoch": 0.21542776998597477, "grad_norm": 3.1986736350206884, "learning_rate": 9.125213871374298e-06, "loss": 0.3774, "step": 1536 }, { "epoch": 0.21556802244039272, "grad_norm": 3.076003684638189, "learning_rate": 9.123930030083425e-06, "loss": 0.4076, "step": 1537 }, { "epoch": 0.21570827489481065, "grad_norm": 2.372546484665667, "learning_rate": 9.1226453378484e-06, "loss": 0.4431, "step": 1538 }, { "epoch": 0.2158485273492286, "grad_norm": 4.21751781412375, "learning_rate": 9.121359794934312e-06, "loss": 0.3694, "step": 1539 }, { "epoch": 0.21598877980364656, "grad_norm": 2.9005920134733683, "learning_rate": 9.120073401606427e-06, "loss": 0.4202, "step": 1540 }, { "epoch": 0.2161290322580645, "grad_norm": 1.9262216308207813, "learning_rate": 9.11878615813018e-06, "loss": 0.3786, "step": 1541 }, { "epoch": 0.21626928471248247, "grad_norm": 3.183721359597388, "learning_rate": 9.117498064771185e-06, "loss": 0.3864, "step": 1542 }, { "epoch": 0.21640953716690042, "grad_norm": 2.303215979509217, "learning_rate": 9.11620912179523e-06, "loss": 0.4329, "step": 1543 }, { "epoch": 0.21654978962131838, "grad_norm": 2.265762745846327, "learning_rate": 9.114919329468283e-06, "loss": 0.4413, "step": 1544 }, { "epoch": 0.21669004207573633, "grad_norm": 2.1847844298146994, "learning_rate": 9.113628688056481e-06, "loss": 0.3797, "step": 1545 }, { "epoch": 0.2168302945301543, "grad_norm": 2.411927201654379, "learning_rate": 9.112337197826138e-06, "loss": 0.4022, "step": 1546 }, { "epoch": 0.21697054698457224, "grad_norm": 3.038010442096778, "learning_rate": 9.111044859043747e-06, "loss": 0.3552, "step": 1547 }, { "epoch": 0.21711079943899017, "grad_norm": 2.491823941203527, "learning_rate": 9.10975167197597e-06, "loss": 0.3809, "step": 1548 }, { "epoch": 0.21725105189340813, "grad_norm": 2.0882758703137543, "learning_rate": 9.10845763688965e-06, "loss": 0.4056, "step": 1549 }, { "epoch": 0.21739130434782608, "grad_norm": 1.8782589882074907, "learning_rate": 9.107162754051799e-06, "loss": 0.3244, "step": 1550 }, { "epoch": 0.21753155680224404, "grad_norm": 1.838473382677785, "learning_rate": 9.10586702372961e-06, "loss": 0.3706, "step": 1551 }, { "epoch": 0.217671809256662, "grad_norm": 5.9059399720218115, "learning_rate": 9.104570446190445e-06, "loss": 0.4006, "step": 1552 }, { "epoch": 0.21781206171107995, "grad_norm": 4.79985167751032, "learning_rate": 9.103273021701846e-06, "loss": 0.3793, "step": 1553 }, { "epoch": 0.2179523141654979, "grad_norm": 2.490988703153485, "learning_rate": 9.101974750531526e-06, "loss": 0.4171, "step": 1554 }, { "epoch": 0.21809256661991586, "grad_norm": 2.8877136415655196, "learning_rate": 9.100675632947376e-06, "loss": 0.4251, "step": 1555 }, { "epoch": 0.2182328190743338, "grad_norm": 2.589451088345622, "learning_rate": 9.099375669217458e-06, "loss": 0.4597, "step": 1556 }, { "epoch": 0.21837307152875177, "grad_norm": 2.4756500777563692, "learning_rate": 9.098074859610012e-06, "loss": 0.3863, "step": 1557 }, { "epoch": 0.2185133239831697, "grad_norm": 2.5113724489108415, "learning_rate": 9.09677320439345e-06, "loss": 0.3952, "step": 1558 }, { "epoch": 0.21865357643758765, "grad_norm": 2.2457956663616634, "learning_rate": 9.095470703836358e-06, "loss": 0.3877, "step": 1559 }, { "epoch": 0.2187938288920056, "grad_norm": 2.5690051061179324, "learning_rate": 9.094167358207502e-06, "loss": 0.3553, "step": 1560 }, { "epoch": 0.21893408134642356, "grad_norm": 2.1940285399883583, "learning_rate": 9.092863167775813e-06, "loss": 0.3706, "step": 1561 }, { "epoch": 0.21907433380084151, "grad_norm": 2.022564673975247, "learning_rate": 9.091558132810407e-06, "loss": 0.467, "step": 1562 }, { "epoch": 0.21921458625525947, "grad_norm": 2.0309817797295735, "learning_rate": 9.090252253580565e-06, "loss": 0.3954, "step": 1563 }, { "epoch": 0.21935483870967742, "grad_norm": 2.645635332122965, "learning_rate": 9.088945530355746e-06, "loss": 0.4259, "step": 1564 }, { "epoch": 0.21949509116409538, "grad_norm": 2.6097401803141893, "learning_rate": 9.087637963405586e-06, "loss": 0.4323, "step": 1565 }, { "epoch": 0.21963534361851333, "grad_norm": 2.6476916497937775, "learning_rate": 9.08632955299989e-06, "loss": 0.3635, "step": 1566 }, { "epoch": 0.2197755960729313, "grad_norm": 2.8157562055935133, "learning_rate": 9.085020299408642e-06, "loss": 0.3463, "step": 1567 }, { "epoch": 0.21991584852734922, "grad_norm": 2.0668448720004484, "learning_rate": 9.083710202901994e-06, "loss": 0.3498, "step": 1568 }, { "epoch": 0.22005610098176717, "grad_norm": 2.5532363363057446, "learning_rate": 9.082399263750276e-06, "loss": 0.4385, "step": 1569 }, { "epoch": 0.22019635343618513, "grad_norm": 2.5028252849616686, "learning_rate": 9.081087482223993e-06, "loss": 0.4157, "step": 1570 }, { "epoch": 0.22033660589060308, "grad_norm": 3.2169142738140164, "learning_rate": 9.07977485859382e-06, "loss": 0.3789, "step": 1571 }, { "epoch": 0.22047685834502104, "grad_norm": 2.9239779977349, "learning_rate": 9.07846139313061e-06, "loss": 0.3983, "step": 1572 }, { "epoch": 0.220617110799439, "grad_norm": 1.8895485075321286, "learning_rate": 9.077147086105382e-06, "loss": 0.3849, "step": 1573 }, { "epoch": 0.22075736325385695, "grad_norm": 2.5063486550717617, "learning_rate": 9.075831937789341e-06, "loss": 0.3572, "step": 1574 }, { "epoch": 0.2208976157082749, "grad_norm": 4.627223949308136, "learning_rate": 9.074515948453855e-06, "loss": 0.3991, "step": 1575 }, { "epoch": 0.22103786816269286, "grad_norm": 3.150065642384388, "learning_rate": 9.073199118370471e-06, "loss": 0.4272, "step": 1576 }, { "epoch": 0.2211781206171108, "grad_norm": 2.62013666044059, "learning_rate": 9.071881447810907e-06, "loss": 0.395, "step": 1577 }, { "epoch": 0.22131837307152874, "grad_norm": 2.704967422148151, "learning_rate": 9.070562937047052e-06, "loss": 0.3499, "step": 1578 }, { "epoch": 0.2214586255259467, "grad_norm": 2.4416971506379226, "learning_rate": 9.069243586350976e-06, "loss": 0.3922, "step": 1579 }, { "epoch": 0.22159887798036465, "grad_norm": 2.5300621730719137, "learning_rate": 9.067923395994916e-06, "loss": 0.3629, "step": 1580 }, { "epoch": 0.2217391304347826, "grad_norm": 2.3300566372516127, "learning_rate": 9.066602366251283e-06, "loss": 0.4092, "step": 1581 }, { "epoch": 0.22187938288920056, "grad_norm": 2.9805736998232613, "learning_rate": 9.065280497392663e-06, "loss": 0.3782, "step": 1582 }, { "epoch": 0.22201963534361852, "grad_norm": 2.3213024147044465, "learning_rate": 9.063957789691816e-06, "loss": 0.3972, "step": 1583 }, { "epoch": 0.22215988779803647, "grad_norm": 2.28036189101728, "learning_rate": 9.06263424342167e-06, "loss": 0.3893, "step": 1584 }, { "epoch": 0.22230014025245443, "grad_norm": 3.1747847116124097, "learning_rate": 9.061309858855334e-06, "loss": 0.3958, "step": 1585 }, { "epoch": 0.22244039270687238, "grad_norm": 2.012031960409847, "learning_rate": 9.059984636266082e-06, "loss": 0.3837, "step": 1586 }, { "epoch": 0.22258064516129034, "grad_norm": 3.2309730809565287, "learning_rate": 9.058658575927368e-06, "loss": 0.3639, "step": 1587 }, { "epoch": 0.22272089761570826, "grad_norm": 4.4926515555145805, "learning_rate": 9.057331678112809e-06, "loss": 0.3915, "step": 1588 }, { "epoch": 0.22286115007012622, "grad_norm": 2.5871076903812176, "learning_rate": 9.056003943096208e-06, "loss": 0.4108, "step": 1589 }, { "epoch": 0.22300140252454417, "grad_norm": 2.19299193758337, "learning_rate": 9.05467537115153e-06, "loss": 0.4002, "step": 1590 }, { "epoch": 0.22314165497896213, "grad_norm": 2.6996796983124174, "learning_rate": 9.053345962552915e-06, "loss": 0.3931, "step": 1591 }, { "epoch": 0.22328190743338008, "grad_norm": 2.591372951860609, "learning_rate": 9.052015717574683e-06, "loss": 0.4354, "step": 1592 }, { "epoch": 0.22342215988779804, "grad_norm": 2.7543098316312573, "learning_rate": 9.050684636491317e-06, "loss": 0.4613, "step": 1593 }, { "epoch": 0.223562412342216, "grad_norm": 2.5999256746965926, "learning_rate": 9.049352719577474e-06, "loss": 0.4052, "step": 1594 }, { "epoch": 0.22370266479663395, "grad_norm": 3.433717206833776, "learning_rate": 9.04801996710799e-06, "loss": 0.3687, "step": 1595 }, { "epoch": 0.2238429172510519, "grad_norm": 3.500176016604993, "learning_rate": 9.046686379357867e-06, "loss": 0.3675, "step": 1596 }, { "epoch": 0.22398316970546986, "grad_norm": 2.290175652243173, "learning_rate": 9.045351956602282e-06, "loss": 0.3665, "step": 1597 }, { "epoch": 0.2241234221598878, "grad_norm": 2.117190653105355, "learning_rate": 9.044016699116584e-06, "loss": 0.3596, "step": 1598 }, { "epoch": 0.22426367461430574, "grad_norm": 2.229228782382571, "learning_rate": 9.042680607176296e-06, "loss": 0.4171, "step": 1599 }, { "epoch": 0.2244039270687237, "grad_norm": 2.2541234062213307, "learning_rate": 9.041343681057106e-06, "loss": 0.3792, "step": 1600 }, { "epoch": 0.22454417952314165, "grad_norm": 2.4005378200012486, "learning_rate": 9.040005921034884e-06, "loss": 0.4065, "step": 1601 }, { "epoch": 0.2246844319775596, "grad_norm": 2.676985554573149, "learning_rate": 9.038667327385664e-06, "loss": 0.4133, "step": 1602 }, { "epoch": 0.22482468443197756, "grad_norm": 3.708125990882431, "learning_rate": 9.03732790038566e-06, "loss": 0.4195, "step": 1603 }, { "epoch": 0.22496493688639552, "grad_norm": 2.197735192880922, "learning_rate": 9.03598764031125e-06, "loss": 0.4185, "step": 1604 }, { "epoch": 0.22510518934081347, "grad_norm": 1.8357170459605696, "learning_rate": 9.034646547438987e-06, "loss": 0.3766, "step": 1605 }, { "epoch": 0.22524544179523143, "grad_norm": 2.702345542763752, "learning_rate": 9.033304622045597e-06, "loss": 0.4174, "step": 1606 }, { "epoch": 0.22538569424964938, "grad_norm": 2.1375341446140386, "learning_rate": 9.03196186440798e-06, "loss": 0.3466, "step": 1607 }, { "epoch": 0.2255259467040673, "grad_norm": 1.946357844506364, "learning_rate": 9.0306182748032e-06, "loss": 0.4189, "step": 1608 }, { "epoch": 0.22566619915848526, "grad_norm": 1.8715071251397501, "learning_rate": 9.029273853508498e-06, "loss": 0.4265, "step": 1609 }, { "epoch": 0.22580645161290322, "grad_norm": 2.531521347007128, "learning_rate": 9.027928600801288e-06, "loss": 0.4044, "step": 1610 }, { "epoch": 0.22594670406732117, "grad_norm": 2.910527844354641, "learning_rate": 9.026582516959153e-06, "loss": 0.3976, "step": 1611 }, { "epoch": 0.22608695652173913, "grad_norm": 2.219301289586148, "learning_rate": 9.025235602259848e-06, "loss": 0.332, "step": 1612 }, { "epoch": 0.22622720897615708, "grad_norm": 3.07740054827471, "learning_rate": 9.023887856981298e-06, "loss": 0.3935, "step": 1613 }, { "epoch": 0.22636746143057504, "grad_norm": 3.6821033622079256, "learning_rate": 9.022539281401601e-06, "loss": 0.4112, "step": 1614 }, { "epoch": 0.226507713884993, "grad_norm": 3.452672592147198, "learning_rate": 9.021189875799027e-06, "loss": 0.4322, "step": 1615 }, { "epoch": 0.22664796633941095, "grad_norm": 2.746822775907403, "learning_rate": 9.019839640452018e-06, "loss": 0.3953, "step": 1616 }, { "epoch": 0.2267882187938289, "grad_norm": 1.9746478913698404, "learning_rate": 9.018488575639184e-06, "loss": 0.4149, "step": 1617 }, { "epoch": 0.22692847124824683, "grad_norm": 1.9926586715909664, "learning_rate": 9.017136681639307e-06, "loss": 0.3755, "step": 1618 }, { "epoch": 0.2270687237026648, "grad_norm": 2.296329251802608, "learning_rate": 9.01578395873134e-06, "loss": 0.3892, "step": 1619 }, { "epoch": 0.22720897615708274, "grad_norm": 1.9822364156630172, "learning_rate": 9.014430407194413e-06, "loss": 0.4096, "step": 1620 }, { "epoch": 0.2273492286115007, "grad_norm": 2.296627732026712, "learning_rate": 9.013076027307817e-06, "loss": 0.4187, "step": 1621 }, { "epoch": 0.22748948106591865, "grad_norm": 1.9803310248586163, "learning_rate": 9.01172081935102e-06, "loss": 0.3624, "step": 1622 }, { "epoch": 0.2276297335203366, "grad_norm": 2.3947345591325178, "learning_rate": 9.01036478360366e-06, "loss": 0.4036, "step": 1623 }, { "epoch": 0.22776998597475456, "grad_norm": 2.034718296132587, "learning_rate": 9.009007920345547e-06, "loss": 0.3788, "step": 1624 }, { "epoch": 0.22791023842917252, "grad_norm": 2.2989757039967036, "learning_rate": 9.007650229856658e-06, "loss": 0.3868, "step": 1625 }, { "epoch": 0.22805049088359047, "grad_norm": 1.897762790922765, "learning_rate": 9.006291712417143e-06, "loss": 0.4124, "step": 1626 }, { "epoch": 0.22819074333800843, "grad_norm": 2.0790352799585916, "learning_rate": 9.004932368307324e-06, "loss": 0.3878, "step": 1627 }, { "epoch": 0.22833099579242636, "grad_norm": 4.544013417412197, "learning_rate": 9.00357219780769e-06, "loss": 0.4067, "step": 1628 }, { "epoch": 0.2284712482468443, "grad_norm": 2.85240733551978, "learning_rate": 9.002211201198906e-06, "loss": 0.349, "step": 1629 }, { "epoch": 0.22861150070126227, "grad_norm": 3.3276788116949794, "learning_rate": 9.000849378761802e-06, "loss": 0.3851, "step": 1630 }, { "epoch": 0.22875175315568022, "grad_norm": 1.88933222685364, "learning_rate": 8.99948673077738e-06, "loss": 0.419, "step": 1631 }, { "epoch": 0.22889200561009818, "grad_norm": 2.0304971326757504, "learning_rate": 8.998123257526814e-06, "loss": 0.3542, "step": 1632 }, { "epoch": 0.22903225806451613, "grad_norm": 2.4956064718711297, "learning_rate": 8.996758959291447e-06, "loss": 0.4525, "step": 1633 }, { "epoch": 0.22917251051893409, "grad_norm": 2.8405366285823304, "learning_rate": 8.995393836352793e-06, "loss": 0.3752, "step": 1634 }, { "epoch": 0.22931276297335204, "grad_norm": 2.260088133486008, "learning_rate": 8.994027888992533e-06, "loss": 0.4087, "step": 1635 }, { "epoch": 0.22945301542777, "grad_norm": 1.9000110776477925, "learning_rate": 8.992661117492526e-06, "loss": 0.3813, "step": 1636 }, { "epoch": 0.22959326788218795, "grad_norm": 2.5853449669875075, "learning_rate": 8.991293522134789e-06, "loss": 0.3683, "step": 1637 }, { "epoch": 0.22973352033660588, "grad_norm": 2.003331747112609, "learning_rate": 8.98992510320152e-06, "loss": 0.3777, "step": 1638 }, { "epoch": 0.22987377279102383, "grad_norm": 2.5589198712724035, "learning_rate": 8.988555860975082e-06, "loss": 0.4048, "step": 1639 }, { "epoch": 0.2300140252454418, "grad_norm": 2.2356720474236806, "learning_rate": 8.987185795738007e-06, "loss": 0.4177, "step": 1640 }, { "epoch": 0.23015427769985974, "grad_norm": 2.441388183526904, "learning_rate": 8.985814907773004e-06, "loss": 0.4235, "step": 1641 }, { "epoch": 0.2302945301542777, "grad_norm": 2.151370068867226, "learning_rate": 8.984443197362938e-06, "loss": 0.3803, "step": 1642 }, { "epoch": 0.23043478260869565, "grad_norm": 2.267550367239667, "learning_rate": 8.983070664790856e-06, "loss": 0.4109, "step": 1643 }, { "epoch": 0.2305750350631136, "grad_norm": 2.1053224417219716, "learning_rate": 8.981697310339972e-06, "loss": 0.3954, "step": 1644 }, { "epoch": 0.23071528751753156, "grad_norm": 1.7570527872179347, "learning_rate": 8.980323134293664e-06, "loss": 0.3774, "step": 1645 }, { "epoch": 0.23085553997194952, "grad_norm": 2.434043291677052, "learning_rate": 8.978948136935488e-06, "loss": 0.381, "step": 1646 }, { "epoch": 0.23099579242636747, "grad_norm": 1.9277178638991808, "learning_rate": 8.977572318549164e-06, "loss": 0.3794, "step": 1647 }, { "epoch": 0.2311360448807854, "grad_norm": 2.8909639726819867, "learning_rate": 8.97619567941858e-06, "loss": 0.3999, "step": 1648 }, { "epoch": 0.23127629733520336, "grad_norm": 1.8991705338741542, "learning_rate": 8.974818219827796e-06, "loss": 0.3801, "step": 1649 }, { "epoch": 0.2314165497896213, "grad_norm": 2.3324845223357937, "learning_rate": 8.973439940061044e-06, "loss": 0.3954, "step": 1650 }, { "epoch": 0.23155680224403927, "grad_norm": 2.116780146863398, "learning_rate": 8.972060840402721e-06, "loss": 0.3853, "step": 1651 }, { "epoch": 0.23169705469845722, "grad_norm": 2.162627036712677, "learning_rate": 8.970680921137396e-06, "loss": 0.387, "step": 1652 }, { "epoch": 0.23183730715287518, "grad_norm": 1.992855244346062, "learning_rate": 8.969300182549802e-06, "loss": 0.3881, "step": 1653 }, { "epoch": 0.23197755960729313, "grad_norm": 1.900349850819968, "learning_rate": 8.967918624924849e-06, "loss": 0.4256, "step": 1654 }, { "epoch": 0.2321178120617111, "grad_norm": 1.9903624907971762, "learning_rate": 8.966536248547608e-06, "loss": 0.4404, "step": 1655 }, { "epoch": 0.23225806451612904, "grad_norm": 1.8558296575397093, "learning_rate": 8.965153053703325e-06, "loss": 0.4001, "step": 1656 }, { "epoch": 0.232398316970547, "grad_norm": 1.9107665040809345, "learning_rate": 8.963769040677413e-06, "loss": 0.4166, "step": 1657 }, { "epoch": 0.23253856942496492, "grad_norm": 2.299496562426827, "learning_rate": 8.962384209755453e-06, "loss": 0.4118, "step": 1658 }, { "epoch": 0.23267882187938288, "grad_norm": 2.2810366805161437, "learning_rate": 8.960998561223193e-06, "loss": 0.3915, "step": 1659 }, { "epoch": 0.23281907433380084, "grad_norm": 2.0407852553902295, "learning_rate": 8.959612095366556e-06, "loss": 0.3591, "step": 1660 }, { "epoch": 0.2329593267882188, "grad_norm": 2.159094109601291, "learning_rate": 8.958224812471625e-06, "loss": 0.3823, "step": 1661 }, { "epoch": 0.23309957924263675, "grad_norm": 2.0192802465057254, "learning_rate": 8.95683671282466e-06, "loss": 0.4396, "step": 1662 }, { "epoch": 0.2332398316970547, "grad_norm": 2.8927017587310693, "learning_rate": 8.955447796712083e-06, "loss": 0.3687, "step": 1663 }, { "epoch": 0.23338008415147266, "grad_norm": 2.728144582916274, "learning_rate": 8.954058064420487e-06, "loss": 0.4356, "step": 1664 }, { "epoch": 0.2335203366058906, "grad_norm": 1.8896194179780765, "learning_rate": 8.952667516236635e-06, "loss": 0.3628, "step": 1665 }, { "epoch": 0.23366058906030857, "grad_norm": 2.0293034543324526, "learning_rate": 8.951276152447458e-06, "loss": 0.41, "step": 1666 }, { "epoch": 0.23380084151472652, "grad_norm": 2.2500371929909355, "learning_rate": 8.949883973340051e-06, "loss": 0.3951, "step": 1667 }, { "epoch": 0.23394109396914445, "grad_norm": 2.3940381387144067, "learning_rate": 8.948490979201683e-06, "loss": 0.4154, "step": 1668 }, { "epoch": 0.2340813464235624, "grad_norm": 1.7812550193314356, "learning_rate": 8.947097170319789e-06, "loss": 0.4004, "step": 1669 }, { "epoch": 0.23422159887798036, "grad_norm": 2.1472347707684687, "learning_rate": 8.94570254698197e-06, "loss": 0.4362, "step": 1670 }, { "epoch": 0.2343618513323983, "grad_norm": 2.2844318513279873, "learning_rate": 8.944307109475996e-06, "loss": 0.3723, "step": 1671 }, { "epoch": 0.23450210378681627, "grad_norm": 3.0805714755788256, "learning_rate": 8.942910858089806e-06, "loss": 0.4107, "step": 1672 }, { "epoch": 0.23464235624123422, "grad_norm": 1.9922744432812676, "learning_rate": 8.94151379311151e-06, "loss": 0.3842, "step": 1673 }, { "epoch": 0.23478260869565218, "grad_norm": 3.341890192206006, "learning_rate": 8.940115914829382e-06, "loss": 0.4123, "step": 1674 }, { "epoch": 0.23492286115007013, "grad_norm": 3.807837715425462, "learning_rate": 8.93871722353186e-06, "loss": 0.4424, "step": 1675 }, { "epoch": 0.2350631136044881, "grad_norm": 2.428124713283998, "learning_rate": 8.937317719507556e-06, "loss": 0.3734, "step": 1676 }, { "epoch": 0.23520336605890604, "grad_norm": 1.8818099649138518, "learning_rate": 8.935917403045251e-06, "loss": 0.4008, "step": 1677 }, { "epoch": 0.23534361851332397, "grad_norm": 1.8918695058197808, "learning_rate": 8.934516274433889e-06, "loss": 0.4031, "step": 1678 }, { "epoch": 0.23548387096774193, "grad_norm": 1.8727454936470198, "learning_rate": 8.93311433396258e-06, "loss": 0.3772, "step": 1679 }, { "epoch": 0.23562412342215988, "grad_norm": 1.8489457322528533, "learning_rate": 8.93171158192061e-06, "loss": 0.3969, "step": 1680 }, { "epoch": 0.23576437587657784, "grad_norm": 1.782463329735169, "learning_rate": 8.930308018597422e-06, "loss": 0.3341, "step": 1681 }, { "epoch": 0.2359046283309958, "grad_norm": 2.447395680496483, "learning_rate": 8.928903644282635e-06, "loss": 0.4359, "step": 1682 }, { "epoch": 0.23604488078541375, "grad_norm": 1.724058035035235, "learning_rate": 8.92749845926603e-06, "loss": 0.4187, "step": 1683 }, { "epoch": 0.2361851332398317, "grad_norm": 2.5647117559867008, "learning_rate": 8.926092463837557e-06, "loss": 0.3799, "step": 1684 }, { "epoch": 0.23632538569424966, "grad_norm": 1.9698294450542004, "learning_rate": 8.924685658287334e-06, "loss": 0.3827, "step": 1685 }, { "epoch": 0.2364656381486676, "grad_norm": 2.074864748487916, "learning_rate": 8.923278042905647e-06, "loss": 0.4297, "step": 1686 }, { "epoch": 0.23660589060308557, "grad_norm": 1.9723837484857996, "learning_rate": 8.921869617982945e-06, "loss": 0.4084, "step": 1687 }, { "epoch": 0.2367461430575035, "grad_norm": 2.181385103867383, "learning_rate": 8.920460383809847e-06, "loss": 0.4329, "step": 1688 }, { "epoch": 0.23688639551192145, "grad_norm": 1.458910836813871, "learning_rate": 8.91905034067714e-06, "loss": 0.3816, "step": 1689 }, { "epoch": 0.2370266479663394, "grad_norm": 2.040082303799142, "learning_rate": 8.917639488875776e-06, "loss": 0.3957, "step": 1690 }, { "epoch": 0.23716690042075736, "grad_norm": 2.0712335804833493, "learning_rate": 8.916227828696873e-06, "loss": 0.369, "step": 1691 }, { "epoch": 0.23730715287517531, "grad_norm": 2.2525521744624557, "learning_rate": 8.91481536043172e-06, "loss": 0.4085, "step": 1692 }, { "epoch": 0.23744740532959327, "grad_norm": 2.846440143223222, "learning_rate": 8.913402084371767e-06, "loss": 0.3944, "step": 1693 }, { "epoch": 0.23758765778401122, "grad_norm": 2.1343981539139207, "learning_rate": 8.911988000808636e-06, "loss": 0.4309, "step": 1694 }, { "epoch": 0.23772791023842918, "grad_norm": 1.7394265505861504, "learning_rate": 8.910573110034113e-06, "loss": 0.3643, "step": 1695 }, { "epoch": 0.23786816269284713, "grad_norm": 1.9952329209006225, "learning_rate": 8.90915741234015e-06, "loss": 0.3924, "step": 1696 }, { "epoch": 0.2380084151472651, "grad_norm": 3.0299040962376123, "learning_rate": 8.907740908018866e-06, "loss": 0.3975, "step": 1697 }, { "epoch": 0.23814866760168302, "grad_norm": 1.9531878041660242, "learning_rate": 8.906323597362547e-06, "loss": 0.4184, "step": 1698 }, { "epoch": 0.23828892005610097, "grad_norm": 2.2203150857243257, "learning_rate": 8.904905480663646e-06, "loss": 0.4144, "step": 1699 }, { "epoch": 0.23842917251051893, "grad_norm": 2.222344294480045, "learning_rate": 8.90348655821478e-06, "loss": 0.378, "step": 1700 }, { "epoch": 0.23856942496493688, "grad_norm": 2.1053773651390606, "learning_rate": 8.902066830308735e-06, "loss": 0.4399, "step": 1701 }, { "epoch": 0.23870967741935484, "grad_norm": 3.228736865245466, "learning_rate": 8.900646297238462e-06, "loss": 0.3865, "step": 1702 }, { "epoch": 0.2388499298737728, "grad_norm": 2.168370774943011, "learning_rate": 8.899224959297078e-06, "loss": 0.404, "step": 1703 }, { "epoch": 0.23899018232819075, "grad_norm": 4.8979443266863845, "learning_rate": 8.897802816777866e-06, "loss": 0.4161, "step": 1704 }, { "epoch": 0.2391304347826087, "grad_norm": 2.6500597461227207, "learning_rate": 8.896379869974273e-06, "loss": 0.4087, "step": 1705 }, { "epoch": 0.23927068723702666, "grad_norm": 3.057828649167323, "learning_rate": 8.894956119179918e-06, "loss": 0.4235, "step": 1706 }, { "epoch": 0.2394109396914446, "grad_norm": 2.5599749149643207, "learning_rate": 8.89353156468858e-06, "loss": 0.387, "step": 1707 }, { "epoch": 0.23955119214586254, "grad_norm": 2.6350365365841326, "learning_rate": 8.892106206794204e-06, "loss": 0.3877, "step": 1708 }, { "epoch": 0.2396914446002805, "grad_norm": 3.1900285153923176, "learning_rate": 8.890680045790907e-06, "loss": 0.4292, "step": 1709 }, { "epoch": 0.23983169705469845, "grad_norm": 2.82073413518562, "learning_rate": 8.889253081972963e-06, "loss": 0.4004, "step": 1710 }, { "epoch": 0.2399719495091164, "grad_norm": 1.8333597470316982, "learning_rate": 8.88782531563482e-06, "loss": 0.3766, "step": 1711 }, { "epoch": 0.24011220196353436, "grad_norm": 2.232362067221606, "learning_rate": 8.886396747071085e-06, "loss": 0.4563, "step": 1712 }, { "epoch": 0.24025245441795232, "grad_norm": 2.48197667701577, "learning_rate": 8.884967376576534e-06, "loss": 0.4349, "step": 1713 }, { "epoch": 0.24039270687237027, "grad_norm": 2.744797988295589, "learning_rate": 8.883537204446105e-06, "loss": 0.4015, "step": 1714 }, { "epoch": 0.24053295932678823, "grad_norm": 2.4051826388667767, "learning_rate": 8.88210623097491e-06, "loss": 0.3699, "step": 1715 }, { "epoch": 0.24067321178120618, "grad_norm": 2.164231166394999, "learning_rate": 8.880674456458214e-06, "loss": 0.4416, "step": 1716 }, { "epoch": 0.24081346423562414, "grad_norm": 2.5652527328817194, "learning_rate": 8.879241881191458e-06, "loss": 0.368, "step": 1717 }, { "epoch": 0.24095371669004206, "grad_norm": 1.7471053843339004, "learning_rate": 8.877808505470242e-06, "loss": 0.3543, "step": 1718 }, { "epoch": 0.24109396914446002, "grad_norm": 2.8804187295417862, "learning_rate": 8.876374329590331e-06, "loss": 0.3405, "step": 1719 }, { "epoch": 0.24123422159887797, "grad_norm": 2.2787610415443846, "learning_rate": 8.874939353847662e-06, "loss": 0.3809, "step": 1720 }, { "epoch": 0.24137447405329593, "grad_norm": 3.084608192479369, "learning_rate": 8.87350357853833e-06, "loss": 0.4129, "step": 1721 }, { "epoch": 0.24151472650771388, "grad_norm": 5.009866706765327, "learning_rate": 8.872067003958597e-06, "loss": 0.4014, "step": 1722 }, { "epoch": 0.24165497896213184, "grad_norm": 1.772390518681213, "learning_rate": 8.87062963040489e-06, "loss": 0.4027, "step": 1723 }, { "epoch": 0.2417952314165498, "grad_norm": 2.332087377131572, "learning_rate": 8.869191458173801e-06, "loss": 0.3496, "step": 1724 }, { "epoch": 0.24193548387096775, "grad_norm": 2.1971708963955283, "learning_rate": 8.867752487562087e-06, "loss": 0.3459, "step": 1725 }, { "epoch": 0.2420757363253857, "grad_norm": 3.087015702415595, "learning_rate": 8.866312718866669e-06, "loss": 0.3949, "step": 1726 }, { "epoch": 0.24221598877980366, "grad_norm": 2.1088482234309263, "learning_rate": 8.864872152384635e-06, "loss": 0.4081, "step": 1727 }, { "epoch": 0.2423562412342216, "grad_norm": 2.2054834390354463, "learning_rate": 8.863430788413232e-06, "loss": 0.3819, "step": 1728 }, { "epoch": 0.24249649368863954, "grad_norm": 3.2512741525611624, "learning_rate": 8.86198862724988e-06, "loss": 0.3699, "step": 1729 }, { "epoch": 0.2426367461430575, "grad_norm": 3.234062142836094, "learning_rate": 8.860545669192155e-06, "loss": 0.3883, "step": 1730 }, { "epoch": 0.24277699859747545, "grad_norm": 2.039208885579974, "learning_rate": 8.859101914537804e-06, "loss": 0.3905, "step": 1731 }, { "epoch": 0.2429172510518934, "grad_norm": 2.293178672214395, "learning_rate": 8.857657363584736e-06, "loss": 0.3662, "step": 1732 }, { "epoch": 0.24305750350631136, "grad_norm": 2.0828516848427414, "learning_rate": 8.85621201663102e-06, "loss": 0.3828, "step": 1733 }, { "epoch": 0.24319775596072932, "grad_norm": 2.207938704828507, "learning_rate": 8.854765873974898e-06, "loss": 0.4198, "step": 1734 }, { "epoch": 0.24333800841514727, "grad_norm": 2.9174165215925933, "learning_rate": 8.85331893591477e-06, "loss": 0.4653, "step": 1735 }, { "epoch": 0.24347826086956523, "grad_norm": 2.2664337819270406, "learning_rate": 8.851871202749201e-06, "loss": 0.3783, "step": 1736 }, { "epoch": 0.24361851332398318, "grad_norm": 2.146807353857163, "learning_rate": 8.850422674776918e-06, "loss": 0.3834, "step": 1737 }, { "epoch": 0.2437587657784011, "grad_norm": 3.4281839537789196, "learning_rate": 8.84897335229682e-06, "loss": 0.3939, "step": 1738 }, { "epoch": 0.24389901823281906, "grad_norm": 2.4417787801718664, "learning_rate": 8.84752323560796e-06, "loss": 0.4119, "step": 1739 }, { "epoch": 0.24403927068723702, "grad_norm": 2.0966569448871355, "learning_rate": 8.846072325009562e-06, "loss": 0.3511, "step": 1740 }, { "epoch": 0.24417952314165497, "grad_norm": 2.258088197342238, "learning_rate": 8.84462062080101e-06, "loss": 0.4303, "step": 1741 }, { "epoch": 0.24431977559607293, "grad_norm": 2.5054597364976625, "learning_rate": 8.843168123281855e-06, "loss": 0.3977, "step": 1742 }, { "epoch": 0.24446002805049089, "grad_norm": 3.92578222170386, "learning_rate": 8.841714832751806e-06, "loss": 0.4287, "step": 1743 }, { "epoch": 0.24460028050490884, "grad_norm": 2.6346756769185817, "learning_rate": 8.840260749510744e-06, "loss": 0.4227, "step": 1744 }, { "epoch": 0.2447405329593268, "grad_norm": 2.7527575971896794, "learning_rate": 8.838805873858704e-06, "loss": 0.4165, "step": 1745 }, { "epoch": 0.24488078541374475, "grad_norm": 2.2262771155854875, "learning_rate": 8.837350206095894e-06, "loss": 0.3821, "step": 1746 }, { "epoch": 0.2450210378681627, "grad_norm": 2.6176660811848116, "learning_rate": 8.83589374652268e-06, "loss": 0.4032, "step": 1747 }, { "epoch": 0.24516129032258063, "grad_norm": 2.314385959600325, "learning_rate": 8.834436495439588e-06, "loss": 0.4427, "step": 1748 }, { "epoch": 0.2453015427769986, "grad_norm": 2.089073195961481, "learning_rate": 8.832978453147316e-06, "loss": 0.3704, "step": 1749 }, { "epoch": 0.24544179523141654, "grad_norm": 3.0954830055318703, "learning_rate": 8.83151961994672e-06, "loss": 0.4065, "step": 1750 }, { "epoch": 0.2455820476858345, "grad_norm": 3.7937963743690544, "learning_rate": 8.830059996138818e-06, "loss": 0.388, "step": 1751 }, { "epoch": 0.24572230014025245, "grad_norm": 2.2588277995692123, "learning_rate": 8.828599582024794e-06, "loss": 0.4209, "step": 1752 }, { "epoch": 0.2458625525946704, "grad_norm": 2.246356981857827, "learning_rate": 8.827138377905999e-06, "loss": 0.413, "step": 1753 }, { "epoch": 0.24600280504908836, "grad_norm": 2.2246019800346253, "learning_rate": 8.825676384083936e-06, "loss": 0.3786, "step": 1754 }, { "epoch": 0.24614305750350632, "grad_norm": 2.3258634502119815, "learning_rate": 8.824213600860278e-06, "loss": 0.4072, "step": 1755 }, { "epoch": 0.24628330995792427, "grad_norm": 2.453444441874324, "learning_rate": 8.822750028536863e-06, "loss": 0.4087, "step": 1756 }, { "epoch": 0.24642356241234223, "grad_norm": 2.850109325788278, "learning_rate": 8.821285667415688e-06, "loss": 0.4049, "step": 1757 }, { "epoch": 0.24656381486676016, "grad_norm": 5.25191844220837, "learning_rate": 8.819820517798911e-06, "loss": 0.3922, "step": 1758 }, { "epoch": 0.2467040673211781, "grad_norm": 2.8776234805924714, "learning_rate": 8.81835457998886e-06, "loss": 0.3401, "step": 1759 }, { "epoch": 0.24684431977559607, "grad_norm": 2.72167408110743, "learning_rate": 8.816887854288018e-06, "loss": 0.3746, "step": 1760 }, { "epoch": 0.24698457223001402, "grad_norm": 2.03635211742382, "learning_rate": 8.815420340999034e-06, "loss": 0.3548, "step": 1761 }, { "epoch": 0.24712482468443198, "grad_norm": 2.228948017936398, "learning_rate": 8.813952040424718e-06, "loss": 0.3908, "step": 1762 }, { "epoch": 0.24726507713884993, "grad_norm": 2.272067361739735, "learning_rate": 8.812482952868047e-06, "loss": 0.3677, "step": 1763 }, { "epoch": 0.2474053295932679, "grad_norm": 2.5296451520290995, "learning_rate": 8.811013078632154e-06, "loss": 0.3995, "step": 1764 }, { "epoch": 0.24754558204768584, "grad_norm": 3.1448746961609175, "learning_rate": 8.809542418020335e-06, "loss": 0.4081, "step": 1765 }, { "epoch": 0.2476858345021038, "grad_norm": 1.991928323080467, "learning_rate": 8.808070971336058e-06, "loss": 0.4135, "step": 1766 }, { "epoch": 0.24782608695652175, "grad_norm": 2.233814729458205, "learning_rate": 8.80659873888294e-06, "loss": 0.3954, "step": 1767 }, { "epoch": 0.24796633941093968, "grad_norm": 2.0783321879653203, "learning_rate": 8.805125720964766e-06, "loss": 0.4052, "step": 1768 }, { "epoch": 0.24810659186535763, "grad_norm": 2.197283528572893, "learning_rate": 8.803651917885486e-06, "loss": 0.4264, "step": 1769 }, { "epoch": 0.2482468443197756, "grad_norm": 2.3898712166601066, "learning_rate": 8.802177329949205e-06, "loss": 0.3857, "step": 1770 }, { "epoch": 0.24838709677419354, "grad_norm": 2.399110581578432, "learning_rate": 8.800701957460199e-06, "loss": 0.4092, "step": 1771 }, { "epoch": 0.2485273492286115, "grad_norm": 1.976417144134966, "learning_rate": 8.799225800722895e-06, "loss": 0.4004, "step": 1772 }, { "epoch": 0.24866760168302945, "grad_norm": 3.164961623551263, "learning_rate": 8.797748860041891e-06, "loss": 0.4035, "step": 1773 }, { "epoch": 0.2488078541374474, "grad_norm": 2.3482842898566494, "learning_rate": 8.796271135721944e-06, "loss": 0.3948, "step": 1774 }, { "epoch": 0.24894810659186536, "grad_norm": 1.6693538301954394, "learning_rate": 8.79479262806797e-06, "loss": 0.4039, "step": 1775 }, { "epoch": 0.24908835904628332, "grad_norm": 2.7814138985678643, "learning_rate": 8.79331333738505e-06, "loss": 0.4242, "step": 1776 }, { "epoch": 0.24922861150070127, "grad_norm": 2.2315645482366797, "learning_rate": 8.791833263978426e-06, "loss": 0.3643, "step": 1777 }, { "epoch": 0.2493688639551192, "grad_norm": 2.0523285178732666, "learning_rate": 8.7903524081535e-06, "loss": 0.403, "step": 1778 }, { "epoch": 0.24950911640953716, "grad_norm": 2.436298759054723, "learning_rate": 8.788870770215835e-06, "loss": 0.4465, "step": 1779 }, { "epoch": 0.2496493688639551, "grad_norm": 2.5515507564052506, "learning_rate": 8.787388350471158e-06, "loss": 0.3948, "step": 1780 }, { "epoch": 0.24978962131837307, "grad_norm": 2.25264055226364, "learning_rate": 8.785905149225356e-06, "loss": 0.4201, "step": 1781 }, { "epoch": 0.24992987377279102, "grad_norm": 1.8616700954058476, "learning_rate": 8.784421166784476e-06, "loss": 0.4022, "step": 1782 }, { "epoch": 0.250070126227209, "grad_norm": 3.0806936013153834, "learning_rate": 8.782936403454729e-06, "loss": 0.4205, "step": 1783 }, { "epoch": 0.25021037868162693, "grad_norm": 2.1931853390924596, "learning_rate": 8.781450859542484e-06, "loss": 0.3928, "step": 1784 }, { "epoch": 0.2503506311360449, "grad_norm": 2.413569131452635, "learning_rate": 8.779964535354274e-06, "loss": 0.374, "step": 1785 }, { "epoch": 0.25049088359046284, "grad_norm": 2.478009977232865, "learning_rate": 8.778477431196792e-06, "loss": 0.4174, "step": 1786 }, { "epoch": 0.2506311360448808, "grad_norm": 1.8967975707488671, "learning_rate": 8.77698954737689e-06, "loss": 0.3483, "step": 1787 }, { "epoch": 0.25077138849929875, "grad_norm": 1.7110772620124224, "learning_rate": 8.775500884201582e-06, "loss": 0.3989, "step": 1788 }, { "epoch": 0.2509116409537167, "grad_norm": 2.1189182991094255, "learning_rate": 8.774011441978046e-06, "loss": 0.4062, "step": 1789 }, { "epoch": 0.25105189340813466, "grad_norm": 2.0172415468287266, "learning_rate": 8.772521221013615e-06, "loss": 0.4258, "step": 1790 }, { "epoch": 0.2511921458625526, "grad_norm": 2.2282825986706833, "learning_rate": 8.771030221615786e-06, "loss": 0.4526, "step": 1791 }, { "epoch": 0.2513323983169706, "grad_norm": 2.1082367262990678, "learning_rate": 8.769538444092219e-06, "loss": 0.4004, "step": 1792 }, { "epoch": 0.2514726507713885, "grad_norm": 2.023116627588351, "learning_rate": 8.768045888750729e-06, "loss": 0.3842, "step": 1793 }, { "epoch": 0.25161290322580643, "grad_norm": 2.230766423499367, "learning_rate": 8.766552555899297e-06, "loss": 0.4112, "step": 1794 }, { "epoch": 0.2517531556802244, "grad_norm": 12.372882700420277, "learning_rate": 8.76505844584606e-06, "loss": 0.385, "step": 1795 }, { "epoch": 0.25189340813464234, "grad_norm": 2.230432299635121, "learning_rate": 8.763563558899317e-06, "loss": 0.3769, "step": 1796 }, { "epoch": 0.2520336605890603, "grad_norm": 2.3401758025226753, "learning_rate": 8.762067895367527e-06, "loss": 0.4169, "step": 1797 }, { "epoch": 0.25217391304347825, "grad_norm": 2.3017206265889416, "learning_rate": 8.760571455559313e-06, "loss": 0.3613, "step": 1798 }, { "epoch": 0.2523141654978962, "grad_norm": 1.8519542939496811, "learning_rate": 8.759074239783451e-06, "loss": 0.3539, "step": 1799 }, { "epoch": 0.25245441795231416, "grad_norm": 1.795128811435546, "learning_rate": 8.757576248348883e-06, "loss": 0.4556, "step": 1800 }, { "epoch": 0.2525946704067321, "grad_norm": 2.5662300130639015, "learning_rate": 8.756077481564708e-06, "loss": 0.3916, "step": 1801 }, { "epoch": 0.25273492286115007, "grad_norm": 2.0147116314161866, "learning_rate": 8.754577939740188e-06, "loss": 0.4047, "step": 1802 }, { "epoch": 0.252875175315568, "grad_norm": 2.114643553328905, "learning_rate": 8.75307762318474e-06, "loss": 0.4217, "step": 1803 }, { "epoch": 0.253015427769986, "grad_norm": 2.1305930638178814, "learning_rate": 8.751576532207947e-06, "loss": 0.3663, "step": 1804 }, { "epoch": 0.25315568022440393, "grad_norm": 2.031597870602574, "learning_rate": 8.750074667119546e-06, "loss": 0.4082, "step": 1805 }, { "epoch": 0.2532959326788219, "grad_norm": 2.1795838033969406, "learning_rate": 8.748572028229438e-06, "loss": 0.3633, "step": 1806 }, { "epoch": 0.25343618513323984, "grad_norm": 1.9731163168260866, "learning_rate": 8.747068615847683e-06, "loss": 0.3441, "step": 1807 }, { "epoch": 0.2535764375876578, "grad_norm": 2.18130979701933, "learning_rate": 8.745564430284495e-06, "loss": 0.434, "step": 1808 }, { "epoch": 0.25371669004207575, "grad_norm": 2.289373799791576, "learning_rate": 8.744059471850258e-06, "loss": 0.416, "step": 1809 }, { "epoch": 0.2538569424964937, "grad_norm": 2.9871894387706694, "learning_rate": 8.742553740855507e-06, "loss": 0.4362, "step": 1810 }, { "epoch": 0.25399719495091166, "grad_norm": 2.4547691909532494, "learning_rate": 8.741047237610938e-06, "loss": 0.4176, "step": 1811 }, { "epoch": 0.2541374474053296, "grad_norm": 2.42179840951222, "learning_rate": 8.739539962427408e-06, "loss": 0.4296, "step": 1812 }, { "epoch": 0.2542776998597475, "grad_norm": 2.4302580430339495, "learning_rate": 8.738031915615934e-06, "loss": 0.3877, "step": 1813 }, { "epoch": 0.2544179523141655, "grad_norm": 2.459516423774877, "learning_rate": 8.736523097487693e-06, "loss": 0.358, "step": 1814 }, { "epoch": 0.25455820476858343, "grad_norm": 1.7997585214126197, "learning_rate": 8.735013508354012e-06, "loss": 0.3501, "step": 1815 }, { "epoch": 0.2546984572230014, "grad_norm": 2.916415085614378, "learning_rate": 8.73350314852639e-06, "loss": 0.4427, "step": 1816 }, { "epoch": 0.25483870967741934, "grad_norm": 2.031047400127565, "learning_rate": 8.731992018316478e-06, "loss": 0.3813, "step": 1817 }, { "epoch": 0.2549789621318373, "grad_norm": 1.7100093187669403, "learning_rate": 8.730480118036087e-06, "loss": 0.3838, "step": 1818 }, { "epoch": 0.25511921458625525, "grad_norm": 2.324098112678016, "learning_rate": 8.728967447997185e-06, "loss": 0.3654, "step": 1819 }, { "epoch": 0.2552594670406732, "grad_norm": 1.8097878256938136, "learning_rate": 8.727454008511905e-06, "loss": 0.3925, "step": 1820 }, { "epoch": 0.25539971949509116, "grad_norm": 2.521438799149979, "learning_rate": 8.72593979989253e-06, "loss": 0.3573, "step": 1821 }, { "epoch": 0.2555399719495091, "grad_norm": 1.9643452734401718, "learning_rate": 8.724424822451512e-06, "loss": 0.3934, "step": 1822 }, { "epoch": 0.25568022440392707, "grad_norm": 2.1646705626999068, "learning_rate": 8.722909076501451e-06, "loss": 0.3542, "step": 1823 }, { "epoch": 0.255820476858345, "grad_norm": 2.1582689646348654, "learning_rate": 8.721392562355113e-06, "loss": 0.3713, "step": 1824 }, { "epoch": 0.255960729312763, "grad_norm": 2.401757840295143, "learning_rate": 8.719875280325418e-06, "loss": 0.4492, "step": 1825 }, { "epoch": 0.25610098176718094, "grad_norm": 2.8804397536024426, "learning_rate": 8.71835723072545e-06, "loss": 0.3875, "step": 1826 }, { "epoch": 0.2562412342215989, "grad_norm": 2.343629249005314, "learning_rate": 8.716838413868445e-06, "loss": 0.3836, "step": 1827 }, { "epoch": 0.25638148667601685, "grad_norm": 2.16886022985698, "learning_rate": 8.715318830067801e-06, "loss": 0.4179, "step": 1828 }, { "epoch": 0.2565217391304348, "grad_norm": 2.2775460050356195, "learning_rate": 8.713798479637073e-06, "loss": 0.4342, "step": 1829 }, { "epoch": 0.25666199158485276, "grad_norm": 2.115526737427353, "learning_rate": 8.712277362889975e-06, "loss": 0.4431, "step": 1830 }, { "epoch": 0.2568022440392707, "grad_norm": 2.5964463707644923, "learning_rate": 8.71075548014038e-06, "loss": 0.4239, "step": 1831 }, { "epoch": 0.25694249649368867, "grad_norm": 2.096060431774018, "learning_rate": 8.709232831702319e-06, "loss": 0.381, "step": 1832 }, { "epoch": 0.25708274894810657, "grad_norm": 2.1111865141715045, "learning_rate": 8.707709417889975e-06, "loss": 0.3954, "step": 1833 }, { "epoch": 0.2572230014025245, "grad_norm": 1.9854325007327296, "learning_rate": 8.706185239017699e-06, "loss": 0.3876, "step": 1834 }, { "epoch": 0.2573632538569425, "grad_norm": 2.8123602938391947, "learning_rate": 8.704660295399991e-06, "loss": 0.4228, "step": 1835 }, { "epoch": 0.25750350631136043, "grad_norm": 1.9994820877879265, "learning_rate": 8.703134587351514e-06, "loss": 0.3928, "step": 1836 }, { "epoch": 0.2576437587657784, "grad_norm": 1.5917584560883244, "learning_rate": 8.701608115187087e-06, "loss": 0.3915, "step": 1837 }, { "epoch": 0.25778401122019634, "grad_norm": 1.7824601864213876, "learning_rate": 8.700080879221689e-06, "loss": 0.3607, "step": 1838 }, { "epoch": 0.2579242636746143, "grad_norm": 1.7969392847918233, "learning_rate": 8.69855287977045e-06, "loss": 0.397, "step": 1839 }, { "epoch": 0.25806451612903225, "grad_norm": 3.2935680418767985, "learning_rate": 8.697024117148665e-06, "loss": 0.4094, "step": 1840 }, { "epoch": 0.2582047685834502, "grad_norm": 2.3393088415535592, "learning_rate": 8.695494591671782e-06, "loss": 0.4297, "step": 1841 }, { "epoch": 0.25834502103786816, "grad_norm": 2.1157633929157216, "learning_rate": 8.69396430365541e-06, "loss": 0.3899, "step": 1842 }, { "epoch": 0.2584852734922861, "grad_norm": 1.9184366498424226, "learning_rate": 8.69243325341531e-06, "loss": 0.3735, "step": 1843 }, { "epoch": 0.25862552594670407, "grad_norm": 2.456632888109686, "learning_rate": 8.690901441267409e-06, "loss": 0.3914, "step": 1844 }, { "epoch": 0.258765778401122, "grad_norm": 1.7217606965219199, "learning_rate": 8.689368867527781e-06, "loss": 0.4023, "step": 1845 }, { "epoch": 0.25890603085554, "grad_norm": 1.9828561692355213, "learning_rate": 8.687835532512662e-06, "loss": 0.3669, "step": 1846 }, { "epoch": 0.25904628330995794, "grad_norm": 2.5495032107057494, "learning_rate": 8.686301436538446e-06, "loss": 0.3849, "step": 1847 }, { "epoch": 0.2591865357643759, "grad_norm": 1.8525053432454466, "learning_rate": 8.684766579921684e-06, "loss": 0.387, "step": 1848 }, { "epoch": 0.25932678821879385, "grad_norm": 1.740659171681587, "learning_rate": 8.683230962979082e-06, "loss": 0.3819, "step": 1849 }, { "epoch": 0.2594670406732118, "grad_norm": 2.3032506569443543, "learning_rate": 8.681694586027506e-06, "loss": 0.3923, "step": 1850 }, { "epoch": 0.25960729312762976, "grad_norm": 1.754521659154257, "learning_rate": 8.68015744938397e-06, "loss": 0.3646, "step": 1851 }, { "epoch": 0.2597475455820477, "grad_norm": 2.0436054936921844, "learning_rate": 8.67861955336566e-06, "loss": 0.3578, "step": 1852 }, { "epoch": 0.2598877980364656, "grad_norm": 1.640286074552872, "learning_rate": 8.677080898289903e-06, "loss": 0.3667, "step": 1853 }, { "epoch": 0.26002805049088357, "grad_norm": 2.330152000413651, "learning_rate": 8.675541484474195e-06, "loss": 0.4138, "step": 1854 }, { "epoch": 0.2601683029453015, "grad_norm": 2.5643937161399717, "learning_rate": 8.67400131223618e-06, "loss": 0.4348, "step": 1855 }, { "epoch": 0.2603085553997195, "grad_norm": 2.342728964608888, "learning_rate": 8.672460381893662e-06, "loss": 0.4137, "step": 1856 }, { "epoch": 0.26044880785413743, "grad_norm": 2.4091909692049387, "learning_rate": 8.670918693764603e-06, "loss": 0.4115, "step": 1857 }, { "epoch": 0.2605890603085554, "grad_norm": 1.9344513195555226, "learning_rate": 8.669376248167118e-06, "loss": 0.3515, "step": 1858 }, { "epoch": 0.26072931276297334, "grad_norm": 2.2932831582783164, "learning_rate": 8.667833045419483e-06, "loss": 0.4041, "step": 1859 }, { "epoch": 0.2608695652173913, "grad_norm": 1.8623588406111193, "learning_rate": 8.666289085840122e-06, "loss": 0.3698, "step": 1860 }, { "epoch": 0.26100981767180925, "grad_norm": 1.697984900790457, "learning_rate": 8.664744369747622e-06, "loss": 0.3902, "step": 1861 }, { "epoch": 0.2611500701262272, "grad_norm": 2.102819471003304, "learning_rate": 8.663198897460727e-06, "loss": 0.4029, "step": 1862 }, { "epoch": 0.26129032258064516, "grad_norm": 2.098543856631733, "learning_rate": 8.661652669298332e-06, "loss": 0.4179, "step": 1863 }, { "epoch": 0.2614305750350631, "grad_norm": 2.1064507453859007, "learning_rate": 8.660105685579493e-06, "loss": 0.3965, "step": 1864 }, { "epoch": 0.2615708274894811, "grad_norm": 2.968723337154117, "learning_rate": 8.658557946623416e-06, "loss": 0.3631, "step": 1865 }, { "epoch": 0.261711079943899, "grad_norm": 2.1515177891780626, "learning_rate": 8.657009452749466e-06, "loss": 0.3782, "step": 1866 }, { "epoch": 0.261851332398317, "grad_norm": 3.7319835203814775, "learning_rate": 8.655460204277167e-06, "loss": 0.4801, "step": 1867 }, { "epoch": 0.26199158485273494, "grad_norm": 1.722021422763533, "learning_rate": 8.653910201526195e-06, "loss": 0.4096, "step": 1868 }, { "epoch": 0.2621318373071529, "grad_norm": 2.2192863568259393, "learning_rate": 8.652359444816379e-06, "loss": 0.3775, "step": 1869 }, { "epoch": 0.26227208976157085, "grad_norm": 1.970113978114331, "learning_rate": 8.65080793446771e-06, "loss": 0.4222, "step": 1870 }, { "epoch": 0.2624123422159888, "grad_norm": 2.2785623473359298, "learning_rate": 8.649255670800328e-06, "loss": 0.3658, "step": 1871 }, { "epoch": 0.26255259467040676, "grad_norm": 1.9152123559086334, "learning_rate": 8.647702654134535e-06, "loss": 0.4157, "step": 1872 }, { "epoch": 0.26269284712482466, "grad_norm": 2.0864722320149585, "learning_rate": 8.646148884790786e-06, "loss": 0.4045, "step": 1873 }, { "epoch": 0.2628330995792426, "grad_norm": 1.9043973456067995, "learning_rate": 8.644594363089687e-06, "loss": 0.4008, "step": 1874 }, { "epoch": 0.26297335203366057, "grad_norm": 2.779080691478046, "learning_rate": 8.643039089352005e-06, "loss": 0.3749, "step": 1875 }, { "epoch": 0.2631136044880785, "grad_norm": 2.770364181458188, "learning_rate": 8.64148306389866e-06, "loss": 0.3922, "step": 1876 }, { "epoch": 0.2632538569424965, "grad_norm": 2.091318341111565, "learning_rate": 8.639926287050726e-06, "loss": 0.411, "step": 1877 }, { "epoch": 0.26339410939691443, "grad_norm": 2.357830729581564, "learning_rate": 8.638368759129433e-06, "loss": 0.4514, "step": 1878 }, { "epoch": 0.2635343618513324, "grad_norm": 2.0686976630744565, "learning_rate": 8.636810480456165e-06, "loss": 0.4019, "step": 1879 }, { "epoch": 0.26367461430575034, "grad_norm": 2.765358432487882, "learning_rate": 8.635251451352463e-06, "loss": 0.3982, "step": 1880 }, { "epoch": 0.2638148667601683, "grad_norm": 2.245786961045472, "learning_rate": 8.633691672140022e-06, "loss": 0.373, "step": 1881 }, { "epoch": 0.26395511921458625, "grad_norm": 2.5532891882557145, "learning_rate": 8.632131143140694e-06, "loss": 0.3958, "step": 1882 }, { "epoch": 0.2640953716690042, "grad_norm": 2.2262751879115914, "learning_rate": 8.63056986467648e-06, "loss": 0.3719, "step": 1883 }, { "epoch": 0.26423562412342216, "grad_norm": 5.823579790256705, "learning_rate": 8.629007837069537e-06, "loss": 0.4001, "step": 1884 }, { "epoch": 0.2643758765778401, "grad_norm": 2.0944670403046124, "learning_rate": 8.627445060642182e-06, "loss": 0.3603, "step": 1885 }, { "epoch": 0.2645161290322581, "grad_norm": 2.336050177570873, "learning_rate": 8.625881535716883e-06, "loss": 0.4023, "step": 1886 }, { "epoch": 0.26465638148667603, "grad_norm": 1.5570389490468364, "learning_rate": 8.624317262616261e-06, "loss": 0.4039, "step": 1887 }, { "epoch": 0.264796633941094, "grad_norm": 2.610990926217625, "learning_rate": 8.622752241663094e-06, "loss": 0.4119, "step": 1888 }, { "epoch": 0.26493688639551194, "grad_norm": 1.922161794213683, "learning_rate": 8.621186473180312e-06, "loss": 0.3569, "step": 1889 }, { "epoch": 0.2650771388499299, "grad_norm": 1.79694260179815, "learning_rate": 8.619619957491e-06, "loss": 0.4201, "step": 1890 }, { "epoch": 0.26521739130434785, "grad_norm": 2.6570526705436173, "learning_rate": 8.6180526949184e-06, "loss": 0.4005, "step": 1891 }, { "epoch": 0.2653576437587658, "grad_norm": 2.0333176872818792, "learning_rate": 8.616484685785905e-06, "loss": 0.3845, "step": 1892 }, { "epoch": 0.2654978962131837, "grad_norm": 2.3472710164833965, "learning_rate": 8.614915930417058e-06, "loss": 0.4486, "step": 1893 }, { "epoch": 0.26563814866760166, "grad_norm": 2.113362591036528, "learning_rate": 8.613346429135567e-06, "loss": 0.3834, "step": 1894 }, { "epoch": 0.2657784011220196, "grad_norm": 2.8160780293480343, "learning_rate": 8.611776182265285e-06, "loss": 0.4311, "step": 1895 }, { "epoch": 0.26591865357643757, "grad_norm": 3.528094341729408, "learning_rate": 8.610205190130223e-06, "loss": 0.3854, "step": 1896 }, { "epoch": 0.2660589060308555, "grad_norm": 2.3232927801934005, "learning_rate": 8.608633453054541e-06, "loss": 0.3791, "step": 1897 }, { "epoch": 0.2661991584852735, "grad_norm": 1.9262337607578615, "learning_rate": 8.607060971362557e-06, "loss": 0.3894, "step": 1898 }, { "epoch": 0.26633941093969143, "grad_norm": 1.8099675079717121, "learning_rate": 8.605487745378745e-06, "loss": 0.4007, "step": 1899 }, { "epoch": 0.2664796633941094, "grad_norm": 2.195843181069265, "learning_rate": 8.603913775427726e-06, "loss": 0.3962, "step": 1900 }, { "epoch": 0.26661991584852734, "grad_norm": 2.5239010789532843, "learning_rate": 8.602339061834278e-06, "loss": 0.3825, "step": 1901 }, { "epoch": 0.2667601683029453, "grad_norm": 1.8738946199497548, "learning_rate": 8.600763604923332e-06, "loss": 0.4033, "step": 1902 }, { "epoch": 0.26690042075736325, "grad_norm": 2.5849364956520606, "learning_rate": 8.599187405019974e-06, "loss": 0.4197, "step": 1903 }, { "epoch": 0.2670406732117812, "grad_norm": 1.8922922648778524, "learning_rate": 8.597610462449441e-06, "loss": 0.4101, "step": 1904 }, { "epoch": 0.26718092566619916, "grad_norm": 1.539743650328855, "learning_rate": 8.596032777537124e-06, "loss": 0.3873, "step": 1905 }, { "epoch": 0.2673211781206171, "grad_norm": 1.5646290578988398, "learning_rate": 8.594454350608565e-06, "loss": 0.3506, "step": 1906 }, { "epoch": 0.2674614305750351, "grad_norm": 2.002680889060512, "learning_rate": 8.592875181989466e-06, "loss": 0.4104, "step": 1907 }, { "epoch": 0.26760168302945303, "grad_norm": 1.8974059013988245, "learning_rate": 8.591295272005674e-06, "loss": 0.419, "step": 1908 }, { "epoch": 0.267741935483871, "grad_norm": 2.5530264117514223, "learning_rate": 8.589714620983195e-06, "loss": 0.4579, "step": 1909 }, { "epoch": 0.26788218793828894, "grad_norm": 2.8885530211350114, "learning_rate": 8.588133229248182e-06, "loss": 0.3681, "step": 1910 }, { "epoch": 0.2680224403927069, "grad_norm": 1.921906323681632, "learning_rate": 8.586551097126945e-06, "loss": 0.3724, "step": 1911 }, { "epoch": 0.26816269284712485, "grad_norm": 2.3039845574576705, "learning_rate": 8.58496822494595e-06, "loss": 0.394, "step": 1912 }, { "epoch": 0.26830294530154275, "grad_norm": 2.4581419552379047, "learning_rate": 8.583384613031804e-06, "loss": 0.3755, "step": 1913 }, { "epoch": 0.2684431977559607, "grad_norm": 2.03704652478998, "learning_rate": 8.581800261711281e-06, "loss": 0.3682, "step": 1914 }, { "epoch": 0.26858345021037866, "grad_norm": 2.230393283188611, "learning_rate": 8.5802151713113e-06, "loss": 0.3738, "step": 1915 }, { "epoch": 0.2687237026647966, "grad_norm": 2.844086344184844, "learning_rate": 8.578629342158929e-06, "loss": 0.387, "step": 1916 }, { "epoch": 0.26886395511921457, "grad_norm": 4.858322569288278, "learning_rate": 8.577042774581397e-06, "loss": 0.4563, "step": 1917 }, { "epoch": 0.2690042075736325, "grad_norm": 2.102151781019419, "learning_rate": 8.57545546890608e-06, "loss": 0.3915, "step": 1918 }, { "epoch": 0.2691444600280505, "grad_norm": 2.125545543878169, "learning_rate": 8.573867425460506e-06, "loss": 0.422, "step": 1919 }, { "epoch": 0.26928471248246844, "grad_norm": 1.9576893687820685, "learning_rate": 8.572278644572358e-06, "loss": 0.3709, "step": 1920 }, { "epoch": 0.2694249649368864, "grad_norm": 2.8934073589307046, "learning_rate": 8.57068912656947e-06, "loss": 0.4329, "step": 1921 }, { "epoch": 0.26956521739130435, "grad_norm": 3.0953975105454576, "learning_rate": 8.569098871779828e-06, "loss": 0.3901, "step": 1922 }, { "epoch": 0.2697054698457223, "grad_norm": 2.26550734641981, "learning_rate": 8.567507880531567e-06, "loss": 0.45, "step": 1923 }, { "epoch": 0.26984572230014026, "grad_norm": 2.302109236305386, "learning_rate": 8.565916153152982e-06, "loss": 0.3977, "step": 1924 }, { "epoch": 0.2699859747545582, "grad_norm": 2.491833317859881, "learning_rate": 8.564323689972512e-06, "loss": 0.3613, "step": 1925 }, { "epoch": 0.27012622720897617, "grad_norm": 3.1616424724783947, "learning_rate": 8.56273049131875e-06, "loss": 0.3399, "step": 1926 }, { "epoch": 0.2702664796633941, "grad_norm": 2.516709846058792, "learning_rate": 8.561136557520444e-06, "loss": 0.3759, "step": 1927 }, { "epoch": 0.2704067321178121, "grad_norm": 3.215223287656667, "learning_rate": 8.559541888906486e-06, "loss": 0.4654, "step": 1928 }, { "epoch": 0.27054698457223003, "grad_norm": 1.869643571207829, "learning_rate": 8.557946485805932e-06, "loss": 0.3496, "step": 1929 }, { "epoch": 0.270687237026648, "grad_norm": 2.191051107586194, "learning_rate": 8.556350348547978e-06, "loss": 0.393, "step": 1930 }, { "epoch": 0.27082748948106594, "grad_norm": 1.9275542912339192, "learning_rate": 8.554753477461972e-06, "loss": 0.3902, "step": 1931 }, { "epoch": 0.2709677419354839, "grad_norm": 1.7725834071196545, "learning_rate": 8.553155872877425e-06, "loss": 0.3906, "step": 1932 }, { "epoch": 0.2711079943899018, "grad_norm": 2.804797412131901, "learning_rate": 8.551557535123988e-06, "loss": 0.438, "step": 1933 }, { "epoch": 0.27124824684431975, "grad_norm": 1.8478280566222092, "learning_rate": 8.549958464531465e-06, "loss": 0.3667, "step": 1934 }, { "epoch": 0.2713884992987377, "grad_norm": 1.9476510205066677, "learning_rate": 8.548358661429817e-06, "loss": 0.3832, "step": 1935 }, { "epoch": 0.27152875175315566, "grad_norm": 2.193762180914032, "learning_rate": 8.546758126149148e-06, "loss": 0.403, "step": 1936 }, { "epoch": 0.2716690042075736, "grad_norm": 2.686763396340027, "learning_rate": 8.545156859019721e-06, "loss": 0.357, "step": 1937 }, { "epoch": 0.27180925666199157, "grad_norm": 2.4497636252837096, "learning_rate": 8.543554860371942e-06, "loss": 0.4097, "step": 1938 }, { "epoch": 0.2719495091164095, "grad_norm": 2.2659394144711174, "learning_rate": 8.541952130536377e-06, "loss": 0.4214, "step": 1939 }, { "epoch": 0.2720897615708275, "grad_norm": 2.116612317546439, "learning_rate": 8.540348669843736e-06, "loss": 0.3638, "step": 1940 }, { "epoch": 0.27223001402524544, "grad_norm": 2.367573319521346, "learning_rate": 8.538744478624883e-06, "loss": 0.3828, "step": 1941 }, { "epoch": 0.2723702664796634, "grad_norm": 1.912587122709218, "learning_rate": 8.537139557210828e-06, "loss": 0.3923, "step": 1942 }, { "epoch": 0.27251051893408135, "grad_norm": 2.6533771911747186, "learning_rate": 8.535533905932739e-06, "loss": 0.3634, "step": 1943 }, { "epoch": 0.2726507713884993, "grad_norm": 2.6608145595246246, "learning_rate": 8.533927525121928e-06, "loss": 0.3968, "step": 1944 }, { "epoch": 0.27279102384291726, "grad_norm": 2.6831855741141495, "learning_rate": 8.532320415109864e-06, "loss": 0.4027, "step": 1945 }, { "epoch": 0.2729312762973352, "grad_norm": 2.1465467976734285, "learning_rate": 8.53071257622816e-06, "loss": 0.3948, "step": 1946 }, { "epoch": 0.27307152875175317, "grad_norm": 2.184859507777465, "learning_rate": 8.529104008808584e-06, "loss": 0.4376, "step": 1947 }, { "epoch": 0.2732117812061711, "grad_norm": 2.385611569189652, "learning_rate": 8.527494713183052e-06, "loss": 0.3855, "step": 1948 }, { "epoch": 0.2733520336605891, "grad_norm": 5.639806122660272, "learning_rate": 8.525884689683632e-06, "loss": 0.4211, "step": 1949 }, { "epoch": 0.27349228611500703, "grad_norm": 2.061708356188572, "learning_rate": 8.524273938642539e-06, "loss": 0.3728, "step": 1950 }, { "epoch": 0.273632538569425, "grad_norm": 3.2571343097957044, "learning_rate": 8.522662460392141e-06, "loss": 0.3731, "step": 1951 }, { "epoch": 0.27377279102384294, "grad_norm": 5.704271008039003, "learning_rate": 8.521050255264956e-06, "loss": 0.412, "step": 1952 }, { "epoch": 0.27391304347826084, "grad_norm": 2.0338436294132913, "learning_rate": 8.51943732359365e-06, "loss": 0.3768, "step": 1953 }, { "epoch": 0.2740532959326788, "grad_norm": 2.2392067052879123, "learning_rate": 8.517823665711043e-06, "loss": 0.4045, "step": 1954 }, { "epoch": 0.27419354838709675, "grad_norm": 1.8752295035729778, "learning_rate": 8.516209281950098e-06, "loss": 0.3631, "step": 1955 }, { "epoch": 0.2743338008415147, "grad_norm": 1.9185941651523495, "learning_rate": 8.514594172643934e-06, "loss": 0.3905, "step": 1956 }, { "epoch": 0.27447405329593266, "grad_norm": 2.0808382606473472, "learning_rate": 8.512978338125818e-06, "loss": 0.3623, "step": 1957 }, { "epoch": 0.2746143057503506, "grad_norm": 2.179664420275715, "learning_rate": 8.511361778729165e-06, "loss": 0.3849, "step": 1958 }, { "epoch": 0.2747545582047686, "grad_norm": 2.319733621333297, "learning_rate": 8.509744494787543e-06, "loss": 0.3628, "step": 1959 }, { "epoch": 0.27489481065918653, "grad_norm": 1.9976060964698326, "learning_rate": 8.508126486634664e-06, "loss": 0.3813, "step": 1960 }, { "epoch": 0.2750350631136045, "grad_norm": 1.7363586766373231, "learning_rate": 8.506507754604393e-06, "loss": 0.4096, "step": 1961 }, { "epoch": 0.27517531556802244, "grad_norm": 1.7032192799082124, "learning_rate": 8.504888299030748e-06, "loss": 0.4375, "step": 1962 }, { "epoch": 0.2753155680224404, "grad_norm": 2.1166125428296088, "learning_rate": 8.503268120247888e-06, "loss": 0.3722, "step": 1963 }, { "epoch": 0.27545582047685835, "grad_norm": 2.6186934508270143, "learning_rate": 8.501647218590127e-06, "loss": 0.3877, "step": 1964 }, { "epoch": 0.2755960729312763, "grad_norm": 2.5298676660479384, "learning_rate": 8.500025594391927e-06, "loss": 0.4027, "step": 1965 }, { "epoch": 0.27573632538569426, "grad_norm": 2.400180344958265, "learning_rate": 8.498403247987899e-06, "loss": 0.3818, "step": 1966 }, { "epoch": 0.2758765778401122, "grad_norm": 1.9054596309905054, "learning_rate": 8.496780179712804e-06, "loss": 0.3779, "step": 1967 }, { "epoch": 0.27601683029453017, "grad_norm": 2.9199983006302577, "learning_rate": 8.495156389901548e-06, "loss": 0.3671, "step": 1968 }, { "epoch": 0.2761570827489481, "grad_norm": 2.0482238518447207, "learning_rate": 8.49353187888919e-06, "loss": 0.3613, "step": 1969 }, { "epoch": 0.2762973352033661, "grad_norm": 4.292017406958045, "learning_rate": 8.491906647010937e-06, "loss": 0.3514, "step": 1970 }, { "epoch": 0.27643758765778403, "grad_norm": 2.3651074628226647, "learning_rate": 8.490280694602142e-06, "loss": 0.3675, "step": 1971 }, { "epoch": 0.276577840112202, "grad_norm": 2.248692450473476, "learning_rate": 8.488654021998313e-06, "loss": 0.3944, "step": 1972 }, { "epoch": 0.2767180925666199, "grad_norm": 2.194272918543235, "learning_rate": 8.4870266295351e-06, "loss": 0.415, "step": 1973 }, { "epoch": 0.27685834502103784, "grad_norm": 2.5410939702264534, "learning_rate": 8.485398517548303e-06, "loss": 0.4024, "step": 1974 }, { "epoch": 0.2769985974754558, "grad_norm": 1.7951171237487749, "learning_rate": 8.483769686373872e-06, "loss": 0.3656, "step": 1975 }, { "epoch": 0.27713884992987375, "grad_norm": 2.072755804162117, "learning_rate": 8.482140136347907e-06, "loss": 0.3525, "step": 1976 }, { "epoch": 0.2772791023842917, "grad_norm": 2.1762294889707223, "learning_rate": 8.480509867806655e-06, "loss": 0.4684, "step": 1977 }, { "epoch": 0.27741935483870966, "grad_norm": 1.84907254864733, "learning_rate": 8.478878881086505e-06, "loss": 0.373, "step": 1978 }, { "epoch": 0.2775596072931276, "grad_norm": 3.078471517664375, "learning_rate": 8.477247176524007e-06, "loss": 0.3638, "step": 1979 }, { "epoch": 0.2776998597475456, "grad_norm": 2.149284722866258, "learning_rate": 8.475614754455845e-06, "loss": 0.4024, "step": 1980 }, { "epoch": 0.27784011220196353, "grad_norm": 1.661085375916414, "learning_rate": 8.473981615218863e-06, "loss": 0.3863, "step": 1981 }, { "epoch": 0.2779803646563815, "grad_norm": 2.911253528977741, "learning_rate": 8.472347759150044e-06, "loss": 0.4179, "step": 1982 }, { "epoch": 0.27812061711079944, "grad_norm": 2.5618475106545042, "learning_rate": 8.470713186586526e-06, "loss": 0.3999, "step": 1983 }, { "epoch": 0.2782608695652174, "grad_norm": 2.0787290684695052, "learning_rate": 8.46907789786559e-06, "loss": 0.3664, "step": 1984 }, { "epoch": 0.27840112201963535, "grad_norm": 2.2599584815338614, "learning_rate": 8.467441893324667e-06, "loss": 0.3781, "step": 1985 }, { "epoch": 0.2785413744740533, "grad_norm": 3.101759897095279, "learning_rate": 8.465805173301333e-06, "loss": 0.394, "step": 1986 }, { "epoch": 0.27868162692847126, "grad_norm": 2.403749529143717, "learning_rate": 8.464167738133317e-06, "loss": 0.4104, "step": 1987 }, { "epoch": 0.2788218793828892, "grad_norm": 2.042999209000056, "learning_rate": 8.462529588158491e-06, "loss": 0.3861, "step": 1988 }, { "epoch": 0.27896213183730717, "grad_norm": 2.122307923413435, "learning_rate": 8.460890723714874e-06, "loss": 0.3736, "step": 1989 }, { "epoch": 0.2791023842917251, "grad_norm": 1.6328002345158334, "learning_rate": 8.459251145140639e-06, "loss": 0.4034, "step": 1990 }, { "epoch": 0.2792426367461431, "grad_norm": 2.0913696422657617, "learning_rate": 8.457610852774097e-06, "loss": 0.3582, "step": 1991 }, { "epoch": 0.27938288920056104, "grad_norm": 1.8425595594533861, "learning_rate": 8.455969846953711e-06, "loss": 0.4194, "step": 1992 }, { "epoch": 0.27952314165497893, "grad_norm": 2.6940249698787273, "learning_rate": 8.454328128018093e-06, "loss": 0.385, "step": 1993 }, { "epoch": 0.2796633941093969, "grad_norm": 5.380494436637811, "learning_rate": 8.452685696306e-06, "loss": 0.3974, "step": 1994 }, { "epoch": 0.27980364656381485, "grad_norm": 1.8393183968506612, "learning_rate": 8.451042552156335e-06, "loss": 0.4158, "step": 1995 }, { "epoch": 0.2799438990182328, "grad_norm": 1.9801218304577122, "learning_rate": 8.449398695908151e-06, "loss": 0.3812, "step": 1996 }, { "epoch": 0.28008415147265076, "grad_norm": 1.7930305154504615, "learning_rate": 8.447754127900645e-06, "loss": 0.4415, "step": 1997 }, { "epoch": 0.2802244039270687, "grad_norm": 1.9863592124764318, "learning_rate": 8.446108848473165e-06, "loss": 0.3819, "step": 1998 }, { "epoch": 0.28036465638148667, "grad_norm": 2.4154400398141744, "learning_rate": 8.444462857965198e-06, "loss": 0.4205, "step": 1999 }, { "epoch": 0.2805049088359046, "grad_norm": 3.0162779738531658, "learning_rate": 8.442816156716386e-06, "loss": 0.3709, "step": 2000 }, { "epoch": 0.2806451612903226, "grad_norm": 2.124307519615917, "learning_rate": 8.441168745066513e-06, "loss": 0.3945, "step": 2001 }, { "epoch": 0.28078541374474053, "grad_norm": 1.965962086021889, "learning_rate": 8.439520623355513e-06, "loss": 0.4628, "step": 2002 }, { "epoch": 0.2809256661991585, "grad_norm": 1.94843430874463, "learning_rate": 8.43787179192346e-06, "loss": 0.402, "step": 2003 }, { "epoch": 0.28106591865357644, "grad_norm": 1.6164321978455929, "learning_rate": 8.436222251110584e-06, "loss": 0.4205, "step": 2004 }, { "epoch": 0.2812061711079944, "grad_norm": 2.874593457212774, "learning_rate": 8.434572001257253e-06, "loss": 0.3794, "step": 2005 }, { "epoch": 0.28134642356241235, "grad_norm": 1.9116949269346306, "learning_rate": 8.432921042703985e-06, "loss": 0.3831, "step": 2006 }, { "epoch": 0.2814866760168303, "grad_norm": 1.930630102326828, "learning_rate": 8.431269375791444e-06, "loss": 0.3987, "step": 2007 }, { "epoch": 0.28162692847124826, "grad_norm": 2.465397163568844, "learning_rate": 8.429617000860441e-06, "loss": 0.4027, "step": 2008 }, { "epoch": 0.2817671809256662, "grad_norm": 1.7839813706832324, "learning_rate": 8.427963918251932e-06, "loss": 0.3749, "step": 2009 }, { "epoch": 0.28190743338008417, "grad_norm": 2.410140224508614, "learning_rate": 8.426310128307016e-06, "loss": 0.378, "step": 2010 }, { "epoch": 0.2820476858345021, "grad_norm": 1.982791479072939, "learning_rate": 8.424655631366945e-06, "loss": 0.3891, "step": 2011 }, { "epoch": 0.2821879382889201, "grad_norm": 1.6456976082123294, "learning_rate": 8.42300042777311e-06, "loss": 0.4031, "step": 2012 }, { "epoch": 0.282328190743338, "grad_norm": 1.7083401408486298, "learning_rate": 8.42134451786705e-06, "loss": 0.3772, "step": 2013 }, { "epoch": 0.28246844319775594, "grad_norm": 2.1972682291651235, "learning_rate": 8.419687901990454e-06, "loss": 0.371, "step": 2014 }, { "epoch": 0.2826086956521739, "grad_norm": 2.6676438945081284, "learning_rate": 8.41803058048515e-06, "loss": 0.409, "step": 2015 }, { "epoch": 0.28274894810659185, "grad_norm": 2.1337667154150166, "learning_rate": 8.416372553693118e-06, "loss": 0.4004, "step": 2016 }, { "epoch": 0.2828892005610098, "grad_norm": 2.1972137580683904, "learning_rate": 8.414713821956477e-06, "loss": 0.3757, "step": 2017 }, { "epoch": 0.28302945301542776, "grad_norm": 1.8607969857182447, "learning_rate": 8.413054385617495e-06, "loss": 0.4189, "step": 2018 }, { "epoch": 0.2831697054698457, "grad_norm": 3.6280329939867615, "learning_rate": 8.411394245018589e-06, "loss": 0.3783, "step": 2019 }, { "epoch": 0.28330995792426367, "grad_norm": 2.1460758059889846, "learning_rate": 8.409733400502311e-06, "loss": 0.3876, "step": 2020 }, { "epoch": 0.2834502103786816, "grad_norm": 2.0155866752352134, "learning_rate": 8.40807185241137e-06, "loss": 0.3831, "step": 2021 }, { "epoch": 0.2835904628330996, "grad_norm": 1.9741713342682568, "learning_rate": 8.406409601088612e-06, "loss": 0.3919, "step": 2022 }, { "epoch": 0.28373071528751753, "grad_norm": 2.0708475824962154, "learning_rate": 8.404746646877033e-06, "loss": 0.4318, "step": 2023 }, { "epoch": 0.2838709677419355, "grad_norm": 2.147420746625381, "learning_rate": 8.40308299011977e-06, "loss": 0.4323, "step": 2024 }, { "epoch": 0.28401122019635344, "grad_norm": 2.5912033815878286, "learning_rate": 8.401418631160109e-06, "loss": 0.3917, "step": 2025 }, { "epoch": 0.2841514726507714, "grad_norm": 2.0905007728879967, "learning_rate": 8.399753570341475e-06, "loss": 0.3675, "step": 2026 }, { "epoch": 0.28429172510518935, "grad_norm": 2.532844229798891, "learning_rate": 8.398087808007447e-06, "loss": 0.324, "step": 2027 }, { "epoch": 0.2844319775596073, "grad_norm": 2.0360164179111884, "learning_rate": 8.39642134450174e-06, "loss": 0.429, "step": 2028 }, { "epoch": 0.28457223001402526, "grad_norm": 1.9246541839703972, "learning_rate": 8.394754180168218e-06, "loss": 0.3939, "step": 2029 }, { "epoch": 0.2847124824684432, "grad_norm": 2.034711382782424, "learning_rate": 8.393086315350887e-06, "loss": 0.368, "step": 2030 }, { "epoch": 0.2848527349228612, "grad_norm": 1.8123260283012756, "learning_rate": 8.391417750393903e-06, "loss": 0.3866, "step": 2031 }, { "epoch": 0.28499298737727913, "grad_norm": 2.9954588216066194, "learning_rate": 8.38974848564156e-06, "loss": 0.4104, "step": 2032 }, { "epoch": 0.285133239831697, "grad_norm": 2.104818891404387, "learning_rate": 8.388078521438299e-06, "loss": 0.3898, "step": 2033 }, { "epoch": 0.285273492286115, "grad_norm": 2.013439087311307, "learning_rate": 8.386407858128707e-06, "loss": 0.4037, "step": 2034 }, { "epoch": 0.28541374474053294, "grad_norm": 2.5360782868609166, "learning_rate": 8.38473649605751e-06, "loss": 0.4272, "step": 2035 }, { "epoch": 0.2855539971949509, "grad_norm": 1.9654114284233963, "learning_rate": 8.383064435569587e-06, "loss": 0.3522, "step": 2036 }, { "epoch": 0.28569424964936885, "grad_norm": 2.695067109809957, "learning_rate": 8.381391677009954e-06, "loss": 0.3766, "step": 2037 }, { "epoch": 0.2858345021037868, "grad_norm": 4.958714168679723, "learning_rate": 8.379718220723772e-06, "loss": 0.4305, "step": 2038 }, { "epoch": 0.28597475455820476, "grad_norm": 2.2216902043020315, "learning_rate": 8.378044067056348e-06, "loss": 0.4076, "step": 2039 }, { "epoch": 0.2861150070126227, "grad_norm": 3.0217510714755886, "learning_rate": 8.376369216353132e-06, "loss": 0.3769, "step": 2040 }, { "epoch": 0.28625525946704067, "grad_norm": 2.4143684464932775, "learning_rate": 8.374693668959717e-06, "loss": 0.3704, "step": 2041 }, { "epoch": 0.2863955119214586, "grad_norm": 2.8704516049537983, "learning_rate": 8.373017425221841e-06, "loss": 0.3753, "step": 2042 }, { "epoch": 0.2865357643758766, "grad_norm": 2.452911562047353, "learning_rate": 8.371340485485384e-06, "loss": 0.4044, "step": 2043 }, { "epoch": 0.28667601683029453, "grad_norm": 1.9048770900934886, "learning_rate": 8.369662850096374e-06, "loss": 0.4221, "step": 2044 }, { "epoch": 0.2868162692847125, "grad_norm": 1.6405235622473011, "learning_rate": 8.367984519400976e-06, "loss": 0.3667, "step": 2045 }, { "epoch": 0.28695652173913044, "grad_norm": 1.7648879681874103, "learning_rate": 8.366305493745502e-06, "loss": 0.3676, "step": 2046 }, { "epoch": 0.2870967741935484, "grad_norm": 1.829144837315095, "learning_rate": 8.36462577347641e-06, "loss": 0.4257, "step": 2047 }, { "epoch": 0.28723702664796635, "grad_norm": 2.2416230132474344, "learning_rate": 8.362945358940295e-06, "loss": 0.3743, "step": 2048 }, { "epoch": 0.2873772791023843, "grad_norm": 1.9966126605432148, "learning_rate": 8.361264250483903e-06, "loss": 0.359, "step": 2049 }, { "epoch": 0.28751753155680226, "grad_norm": 1.9181819261224626, "learning_rate": 8.359582448454114e-06, "loss": 0.3865, "step": 2050 }, { "epoch": 0.2876577840112202, "grad_norm": 2.314355878758693, "learning_rate": 8.357899953197959e-06, "loss": 0.3974, "step": 2051 }, { "epoch": 0.2877980364656382, "grad_norm": 3.978629723449816, "learning_rate": 8.35621676506261e-06, "loss": 0.4274, "step": 2052 }, { "epoch": 0.2879382889200561, "grad_norm": 3.4541337278948343, "learning_rate": 8.354532884395381e-06, "loss": 0.4033, "step": 2053 }, { "epoch": 0.28807854137447403, "grad_norm": 2.454124975794849, "learning_rate": 8.352848311543726e-06, "loss": 0.3851, "step": 2054 }, { "epoch": 0.288218793828892, "grad_norm": 1.98882977127263, "learning_rate": 8.351163046855246e-06, "loss": 0.4103, "step": 2055 }, { "epoch": 0.28835904628330994, "grad_norm": 2.0469990467018997, "learning_rate": 8.349477090677686e-06, "loss": 0.3498, "step": 2056 }, { "epoch": 0.2884992987377279, "grad_norm": 1.9361363811415777, "learning_rate": 8.34779044335893e-06, "loss": 0.3742, "step": 2057 }, { "epoch": 0.28863955119214585, "grad_norm": 1.6277148969495066, "learning_rate": 8.346103105247004e-06, "loss": 0.3689, "step": 2058 }, { "epoch": 0.2887798036465638, "grad_norm": 3.7857220898982216, "learning_rate": 8.34441507669008e-06, "loss": 0.352, "step": 2059 }, { "epoch": 0.28892005610098176, "grad_norm": 1.8252570977119367, "learning_rate": 8.342726358036473e-06, "loss": 0.3535, "step": 2060 }, { "epoch": 0.2890603085553997, "grad_norm": 1.994422765606317, "learning_rate": 8.341036949634633e-06, "loss": 0.3991, "step": 2061 }, { "epoch": 0.28920056100981767, "grad_norm": 2.1137938380193178, "learning_rate": 8.339346851833163e-06, "loss": 0.4185, "step": 2062 }, { "epoch": 0.2893408134642356, "grad_norm": 2.6572276391949305, "learning_rate": 8.337656064980801e-06, "loss": 0.4079, "step": 2063 }, { "epoch": 0.2894810659186536, "grad_norm": 2.529826387924585, "learning_rate": 8.335964589426429e-06, "loss": 0.3931, "step": 2064 }, { "epoch": 0.28962131837307153, "grad_norm": 2.7859199835739554, "learning_rate": 8.334272425519069e-06, "loss": 0.4661, "step": 2065 }, { "epoch": 0.2897615708274895, "grad_norm": 2.245219875009994, "learning_rate": 8.33257957360789e-06, "loss": 0.3807, "step": 2066 }, { "epoch": 0.28990182328190744, "grad_norm": 1.846290777397913, "learning_rate": 8.330886034042198e-06, "loss": 0.3663, "step": 2067 }, { "epoch": 0.2900420757363254, "grad_norm": 2.6281187513279423, "learning_rate": 8.329191807171447e-06, "loss": 0.3913, "step": 2068 }, { "epoch": 0.29018232819074335, "grad_norm": 2.922661007507906, "learning_rate": 8.327496893345223e-06, "loss": 0.3935, "step": 2069 }, { "epoch": 0.2903225806451613, "grad_norm": 1.9079077969431208, "learning_rate": 8.325801292913265e-06, "loss": 0.3935, "step": 2070 }, { "epoch": 0.29046283309957927, "grad_norm": 3.1233806228993175, "learning_rate": 8.324105006225444e-06, "loss": 0.3871, "step": 2071 }, { "epoch": 0.2906030855539972, "grad_norm": 2.8115247095071365, "learning_rate": 8.32240803363178e-06, "loss": 0.3786, "step": 2072 }, { "epoch": 0.2907433380084151, "grad_norm": 2.0744786216459934, "learning_rate": 8.320710375482432e-06, "loss": 0.3892, "step": 2073 }, { "epoch": 0.2908835904628331, "grad_norm": 2.0533894547104463, "learning_rate": 8.319012032127698e-06, "loss": 0.393, "step": 2074 }, { "epoch": 0.29102384291725103, "grad_norm": 2.0383829321744065, "learning_rate": 8.317313003918017e-06, "loss": 0.3717, "step": 2075 }, { "epoch": 0.291164095371669, "grad_norm": 2.0247831252017, "learning_rate": 8.315613291203977e-06, "loss": 0.4398, "step": 2076 }, { "epoch": 0.29130434782608694, "grad_norm": 1.9283604556951024, "learning_rate": 8.313912894336298e-06, "loss": 0.3893, "step": 2077 }, { "epoch": 0.2914446002805049, "grad_norm": 2.0797136014221835, "learning_rate": 8.312211813665848e-06, "loss": 0.3805, "step": 2078 }, { "epoch": 0.29158485273492285, "grad_norm": 3.2149438668626296, "learning_rate": 8.310510049543628e-06, "loss": 0.3408, "step": 2079 }, { "epoch": 0.2917251051893408, "grad_norm": 1.98311611102613, "learning_rate": 8.30880760232079e-06, "loss": 0.3638, "step": 2080 }, { "epoch": 0.29186535764375876, "grad_norm": 2.448884153249969, "learning_rate": 8.307104472348619e-06, "loss": 0.3961, "step": 2081 }, { "epoch": 0.2920056100981767, "grad_norm": 2.5653130512420512, "learning_rate": 8.305400659978547e-06, "loss": 0.4071, "step": 2082 }, { "epoch": 0.29214586255259467, "grad_norm": 2.752057346295419, "learning_rate": 8.303696165562141e-06, "loss": 0.4087, "step": 2083 }, { "epoch": 0.2922861150070126, "grad_norm": 2.0282528406417404, "learning_rate": 8.301990989451114e-06, "loss": 0.3831, "step": 2084 }, { "epoch": 0.2924263674614306, "grad_norm": 1.9096606026929017, "learning_rate": 8.300285131997315e-06, "loss": 0.3793, "step": 2085 }, { "epoch": 0.29256661991584854, "grad_norm": 1.9258908130360277, "learning_rate": 8.298578593552737e-06, "loss": 0.338, "step": 2086 }, { "epoch": 0.2927068723702665, "grad_norm": 4.284687082622965, "learning_rate": 8.296871374469511e-06, "loss": 0.4034, "step": 2087 }, { "epoch": 0.29284712482468445, "grad_norm": 2.629651444403835, "learning_rate": 8.295163475099911e-06, "loss": 0.3948, "step": 2088 }, { "epoch": 0.2929873772791024, "grad_norm": 2.6515235587808332, "learning_rate": 8.293454895796351e-06, "loss": 0.3882, "step": 2089 }, { "epoch": 0.29312762973352036, "grad_norm": 2.2166437171761464, "learning_rate": 8.291745636911382e-06, "loss": 0.3615, "step": 2090 }, { "epoch": 0.2932678821879383, "grad_norm": 2.3002612007099756, "learning_rate": 8.2900356987977e-06, "loss": 0.3707, "step": 2091 }, { "epoch": 0.29340813464235627, "grad_norm": 2.288243169913762, "learning_rate": 8.288325081808134e-06, "loss": 0.4152, "step": 2092 }, { "epoch": 0.29354838709677417, "grad_norm": 1.7614318884773124, "learning_rate": 8.286613786295666e-06, "loss": 0.4022, "step": 2093 }, { "epoch": 0.2936886395511921, "grad_norm": 1.6696173853202558, "learning_rate": 8.284901812613403e-06, "loss": 0.4109, "step": 2094 }, { "epoch": 0.2938288920056101, "grad_norm": 4.319676054005422, "learning_rate": 8.283189161114602e-06, "loss": 0.4101, "step": 2095 }, { "epoch": 0.29396914446002803, "grad_norm": 3.0241739327399584, "learning_rate": 8.281475832152655e-06, "loss": 0.3953, "step": 2096 }, { "epoch": 0.294109396914446, "grad_norm": 2.3366791941496956, "learning_rate": 8.279761826081096e-06, "loss": 0.3794, "step": 2097 }, { "epoch": 0.29424964936886394, "grad_norm": 2.1285467281718837, "learning_rate": 8.2780471432536e-06, "loss": 0.3705, "step": 2098 }, { "epoch": 0.2943899018232819, "grad_norm": 1.898297214426797, "learning_rate": 8.276331784023976e-06, "loss": 0.3772, "step": 2099 }, { "epoch": 0.29453015427769985, "grad_norm": 2.105764732717809, "learning_rate": 8.27461574874618e-06, "loss": 0.3903, "step": 2100 }, { "epoch": 0.2946704067321178, "grad_norm": 2.126178414711259, "learning_rate": 8.272899037774302e-06, "loss": 0.412, "step": 2101 }, { "epoch": 0.29481065918653576, "grad_norm": 4.847686349435665, "learning_rate": 8.271181651462575e-06, "loss": 0.3925, "step": 2102 }, { "epoch": 0.2949509116409537, "grad_norm": 1.9651083796462991, "learning_rate": 8.269463590165368e-06, "loss": 0.3721, "step": 2103 }, { "epoch": 0.29509116409537167, "grad_norm": 2.66404656995558, "learning_rate": 8.26774485423719e-06, "loss": 0.3814, "step": 2104 }, { "epoch": 0.2952314165497896, "grad_norm": 2.0170938034221906, "learning_rate": 8.266025444032694e-06, "loss": 0.3834, "step": 2105 }, { "epoch": 0.2953716690042076, "grad_norm": 2.699145082829306, "learning_rate": 8.264305359906664e-06, "loss": 0.3915, "step": 2106 }, { "epoch": 0.29551192145862554, "grad_norm": 2.1751018675703127, "learning_rate": 8.26258460221403e-06, "loss": 0.3809, "step": 2107 }, { "epoch": 0.2956521739130435, "grad_norm": 2.0102372427085955, "learning_rate": 8.260863171309857e-06, "loss": 0.3755, "step": 2108 }, { "epoch": 0.29579242636746145, "grad_norm": 2.1862630344043756, "learning_rate": 8.25914106754935e-06, "loss": 0.3799, "step": 2109 }, { "epoch": 0.2959326788218794, "grad_norm": 2.1306685966312675, "learning_rate": 8.257418291287855e-06, "loss": 0.3374, "step": 2110 }, { "epoch": 0.29607293127629736, "grad_norm": 4.926736035785619, "learning_rate": 8.255694842880854e-06, "loss": 0.4177, "step": 2111 }, { "epoch": 0.2962131837307153, "grad_norm": 3.0856561073742608, "learning_rate": 8.253970722683968e-06, "loss": 0.3788, "step": 2112 }, { "epoch": 0.2963534361851332, "grad_norm": 1.8817922906807496, "learning_rate": 8.252245931052958e-06, "loss": 0.3877, "step": 2113 }, { "epoch": 0.29649368863955117, "grad_norm": 7.795719772101657, "learning_rate": 8.250520468343722e-06, "loss": 0.3775, "step": 2114 }, { "epoch": 0.2966339410939691, "grad_norm": 2.2941997284682962, "learning_rate": 8.248794334912297e-06, "loss": 0.3845, "step": 2115 }, { "epoch": 0.2967741935483871, "grad_norm": 3.1427453318436727, "learning_rate": 8.247067531114858e-06, "loss": 0.3832, "step": 2116 }, { "epoch": 0.29691444600280503, "grad_norm": 2.1658731866065994, "learning_rate": 8.245340057307722e-06, "loss": 0.4201, "step": 2117 }, { "epoch": 0.297054698457223, "grad_norm": 2.1934600290656965, "learning_rate": 8.243611913847337e-06, "loss": 0.4464, "step": 2118 }, { "epoch": 0.29719495091164094, "grad_norm": 1.7379775564239317, "learning_rate": 8.241883101090296e-06, "loss": 0.3596, "step": 2119 }, { "epoch": 0.2973352033660589, "grad_norm": 2.020164641289784, "learning_rate": 8.240153619393325e-06, "loss": 0.3881, "step": 2120 }, { "epoch": 0.29747545582047685, "grad_norm": 2.090539434988478, "learning_rate": 8.238423469113294e-06, "loss": 0.3836, "step": 2121 }, { "epoch": 0.2976157082748948, "grad_norm": 1.844075578295086, "learning_rate": 8.236692650607205e-06, "loss": 0.3258, "step": 2122 }, { "epoch": 0.29775596072931276, "grad_norm": 3.8265461059568042, "learning_rate": 8.2349611642322e-06, "loss": 0.4089, "step": 2123 }, { "epoch": 0.2978962131837307, "grad_norm": 2.880623831737345, "learning_rate": 8.233229010345561e-06, "loss": 0.3996, "step": 2124 }, { "epoch": 0.2980364656381487, "grad_norm": 3.206971775483844, "learning_rate": 8.231496189304704e-06, "loss": 0.4055, "step": 2125 }, { "epoch": 0.29817671809256663, "grad_norm": 2.7135037195599483, "learning_rate": 8.229762701467187e-06, "loss": 0.3559, "step": 2126 }, { "epoch": 0.2983169705469846, "grad_norm": 2.7865619295536117, "learning_rate": 8.2280285471907e-06, "loss": 0.4346, "step": 2127 }, { "epoch": 0.29845722300140254, "grad_norm": 2.2144601748777024, "learning_rate": 8.226293726833077e-06, "loss": 0.4049, "step": 2128 }, { "epoch": 0.2985974754558205, "grad_norm": 3.596369178661456, "learning_rate": 8.224558240752282e-06, "loss": 0.3634, "step": 2129 }, { "epoch": 0.29873772791023845, "grad_norm": 2.2244717603122672, "learning_rate": 8.222822089306423e-06, "loss": 0.3657, "step": 2130 }, { "epoch": 0.2988779803646564, "grad_norm": 2.0354943168558925, "learning_rate": 8.221085272853743e-06, "loss": 0.3231, "step": 2131 }, { "epoch": 0.29901823281907436, "grad_norm": 2.221840009668665, "learning_rate": 8.21934779175262e-06, "loss": 0.4098, "step": 2132 }, { "epoch": 0.29915848527349226, "grad_norm": 2.363376583982346, "learning_rate": 8.217609646361574e-06, "loss": 0.3773, "step": 2133 }, { "epoch": 0.2992987377279102, "grad_norm": 2.9407561091546293, "learning_rate": 8.215870837039258e-06, "loss": 0.3736, "step": 2134 }, { "epoch": 0.29943899018232817, "grad_norm": 2.092441890370004, "learning_rate": 8.21413136414446e-06, "loss": 0.4006, "step": 2135 }, { "epoch": 0.2995792426367461, "grad_norm": 2.6833049946427825, "learning_rate": 8.212391228036111e-06, "loss": 0.417, "step": 2136 }, { "epoch": 0.2997194950911641, "grad_norm": 2.937513635481983, "learning_rate": 8.210650429073278e-06, "loss": 0.3488, "step": 2137 }, { "epoch": 0.29985974754558203, "grad_norm": 1.7572655399040793, "learning_rate": 8.208908967615159e-06, "loss": 0.3499, "step": 2138 }, { "epoch": 0.3, "grad_norm": 2.4271575414312343, "learning_rate": 8.207166844021093e-06, "loss": 0.3656, "step": 2139 }, { "epoch": 0.30014025245441794, "grad_norm": 1.9399610086828964, "learning_rate": 8.205424058650557e-06, "loss": 0.3795, "step": 2140 }, { "epoch": 0.3002805049088359, "grad_norm": 2.1563743887143016, "learning_rate": 8.203680611863161e-06, "loss": 0.3787, "step": 2141 }, { "epoch": 0.30042075736325385, "grad_norm": 1.9535460361614778, "learning_rate": 8.201936504018653e-06, "loss": 0.3215, "step": 2142 }, { "epoch": 0.3005610098176718, "grad_norm": 2.0141089600501014, "learning_rate": 8.200191735476918e-06, "loss": 0.3664, "step": 2143 }, { "epoch": 0.30070126227208976, "grad_norm": 2.088290826537911, "learning_rate": 8.198446306597977e-06, "loss": 0.3713, "step": 2144 }, { "epoch": 0.3008415147265077, "grad_norm": 2.2860951638492764, "learning_rate": 8.196700217741987e-06, "loss": 0.3709, "step": 2145 }, { "epoch": 0.3009817671809257, "grad_norm": 2.004121229749821, "learning_rate": 8.19495346926924e-06, "loss": 0.3616, "step": 2146 }, { "epoch": 0.30112201963534363, "grad_norm": 2.6905026185385044, "learning_rate": 8.193206061540167e-06, "loss": 0.3639, "step": 2147 }, { "epoch": 0.3012622720897616, "grad_norm": 2.6251891840018953, "learning_rate": 8.191457994915334e-06, "loss": 0.3954, "step": 2148 }, { "epoch": 0.30140252454417954, "grad_norm": 3.0459803123514146, "learning_rate": 8.18970926975544e-06, "loss": 0.4426, "step": 2149 }, { "epoch": 0.3015427769985975, "grad_norm": 2.8704062540507147, "learning_rate": 8.187959886421322e-06, "loss": 0.3841, "step": 2150 }, { "epoch": 0.30168302945301545, "grad_norm": 1.7224156910232515, "learning_rate": 8.186209845273954e-06, "loss": 0.3872, "step": 2151 }, { "epoch": 0.3018232819074334, "grad_norm": 1.9432145414583362, "learning_rate": 8.184459146674447e-06, "loss": 0.3739, "step": 2152 }, { "epoch": 0.30196353436185136, "grad_norm": 1.9952429583804874, "learning_rate": 8.182707790984043e-06, "loss": 0.3958, "step": 2153 }, { "epoch": 0.30210378681626926, "grad_norm": 2.0696880046481576, "learning_rate": 8.180955778564122e-06, "loss": 0.3964, "step": 2154 }, { "epoch": 0.3022440392706872, "grad_norm": 1.6842580666232094, "learning_rate": 8.1792031097762e-06, "loss": 0.3527, "step": 2155 }, { "epoch": 0.30238429172510517, "grad_norm": 2.6652135963082833, "learning_rate": 8.177449784981927e-06, "loss": 0.4207, "step": 2156 }, { "epoch": 0.3025245441795231, "grad_norm": 1.9541580519930288, "learning_rate": 8.175695804543093e-06, "loss": 0.3816, "step": 2157 }, { "epoch": 0.3026647966339411, "grad_norm": 1.9573398735060985, "learning_rate": 8.173941168821615e-06, "loss": 0.3649, "step": 2158 }, { "epoch": 0.30280504908835904, "grad_norm": 1.8407808739765645, "learning_rate": 8.172185878179553e-06, "loss": 0.4146, "step": 2159 }, { "epoch": 0.302945301542777, "grad_norm": 2.3113105395461866, "learning_rate": 8.170429932979097e-06, "loss": 0.398, "step": 2160 }, { "epoch": 0.30308555399719495, "grad_norm": 3.030407660729817, "learning_rate": 8.168673333582572e-06, "loss": 0.3413, "step": 2161 }, { "epoch": 0.3032258064516129, "grad_norm": 1.6785322607972446, "learning_rate": 8.166916080352447e-06, "loss": 0.3947, "step": 2162 }, { "epoch": 0.30336605890603086, "grad_norm": 1.8131414133543353, "learning_rate": 8.165158173651313e-06, "loss": 0.4159, "step": 2163 }, { "epoch": 0.3035063113604488, "grad_norm": 2.4342955041326495, "learning_rate": 8.163399613841903e-06, "loss": 0.4456, "step": 2164 }, { "epoch": 0.30364656381486677, "grad_norm": 2.224569720550326, "learning_rate": 8.161640401287084e-06, "loss": 0.4068, "step": 2165 }, { "epoch": 0.3037868162692847, "grad_norm": 1.893543607336071, "learning_rate": 8.159880536349858e-06, "loss": 0.3876, "step": 2166 }, { "epoch": 0.3039270687237027, "grad_norm": 2.127696178101129, "learning_rate": 8.15812001939336e-06, "loss": 0.405, "step": 2167 }, { "epoch": 0.30406732117812063, "grad_norm": 2.9861863316566577, "learning_rate": 8.156358850780858e-06, "loss": 0.342, "step": 2168 }, { "epoch": 0.3042075736325386, "grad_norm": 2.193005636059241, "learning_rate": 8.154597030875762e-06, "loss": 0.433, "step": 2169 }, { "epoch": 0.30434782608695654, "grad_norm": 2.3754694625250217, "learning_rate": 8.152834560041607e-06, "loss": 0.3775, "step": 2170 }, { "epoch": 0.3044880785413745, "grad_norm": 2.673078477876989, "learning_rate": 8.15107143864207e-06, "loss": 0.3682, "step": 2171 }, { "epoch": 0.30462833099579245, "grad_norm": 4.703118713983029, "learning_rate": 8.149307667040954e-06, "loss": 0.3452, "step": 2172 }, { "epoch": 0.3047685834502104, "grad_norm": 2.277373796268076, "learning_rate": 8.147543245602204e-06, "loss": 0.4216, "step": 2173 }, { "epoch": 0.3049088359046283, "grad_norm": 2.022083669091645, "learning_rate": 8.145778174689897e-06, "loss": 0.3599, "step": 2174 }, { "epoch": 0.30504908835904626, "grad_norm": 2.2134787038188377, "learning_rate": 8.144012454668241e-06, "loss": 0.3841, "step": 2175 }, { "epoch": 0.3051893408134642, "grad_norm": 2.2910215567977876, "learning_rate": 8.142246085901581e-06, "loss": 0.4187, "step": 2176 }, { "epoch": 0.30532959326788217, "grad_norm": 2.451140457499104, "learning_rate": 8.140479068754396e-06, "loss": 0.3879, "step": 2177 }, { "epoch": 0.3054698457223001, "grad_norm": 2.8276933372229385, "learning_rate": 8.138711403591295e-06, "loss": 0.3992, "step": 2178 }, { "epoch": 0.3056100981767181, "grad_norm": 2.37082003124329, "learning_rate": 8.136943090777025e-06, "loss": 0.4225, "step": 2179 }, { "epoch": 0.30575035063113604, "grad_norm": 2.1829502201529407, "learning_rate": 8.135174130676464e-06, "loss": 0.3544, "step": 2180 }, { "epoch": 0.305890603085554, "grad_norm": 2.617295812386776, "learning_rate": 8.133404523654626e-06, "loss": 0.4068, "step": 2181 }, { "epoch": 0.30603085553997195, "grad_norm": 2.0892847757488746, "learning_rate": 8.131634270076657e-06, "loss": 0.3957, "step": 2182 }, { "epoch": 0.3061711079943899, "grad_norm": 2.2764629145868502, "learning_rate": 8.129863370307833e-06, "loss": 0.3807, "step": 2183 }, { "epoch": 0.30631136044880786, "grad_norm": 2.8685846484698705, "learning_rate": 8.128091824713571e-06, "loss": 0.4161, "step": 2184 }, { "epoch": 0.3064516129032258, "grad_norm": 2.1901891255692325, "learning_rate": 8.126319633659416e-06, "loss": 0.3775, "step": 2185 }, { "epoch": 0.30659186535764377, "grad_norm": 2.3252623707387166, "learning_rate": 8.124546797511046e-06, "loss": 0.386, "step": 2186 }, { "epoch": 0.3067321178120617, "grad_norm": 2.067826185222951, "learning_rate": 8.122773316634276e-06, "loss": 0.4171, "step": 2187 }, { "epoch": 0.3068723702664797, "grad_norm": 2.724974248921923, "learning_rate": 8.120999191395048e-06, "loss": 0.4506, "step": 2188 }, { "epoch": 0.30701262272089763, "grad_norm": 1.8373023621025466, "learning_rate": 8.119224422159441e-06, "loss": 0.3795, "step": 2189 }, { "epoch": 0.3071528751753156, "grad_norm": 2.099796616832632, "learning_rate": 8.117449009293668e-06, "loss": 0.3593, "step": 2190 }, { "epoch": 0.30729312762973354, "grad_norm": 2.260378320137247, "learning_rate": 8.115672953164073e-06, "loss": 0.3675, "step": 2191 }, { "epoch": 0.3074333800841515, "grad_norm": 3.0535911680522076, "learning_rate": 8.113896254137131e-06, "loss": 0.4079, "step": 2192 }, { "epoch": 0.30757363253856945, "grad_norm": 1.851037123944131, "learning_rate": 8.112118912579452e-06, "loss": 0.3492, "step": 2193 }, { "epoch": 0.30771388499298735, "grad_norm": 2.0999790735564487, "learning_rate": 8.110340928857779e-06, "loss": 0.3966, "step": 2194 }, { "epoch": 0.3078541374474053, "grad_norm": 2.2863134339690103, "learning_rate": 8.108562303338987e-06, "loss": 0.3676, "step": 2195 }, { "epoch": 0.30799438990182326, "grad_norm": 1.616899252740245, "learning_rate": 8.10678303639008e-06, "loss": 0.3719, "step": 2196 }, { "epoch": 0.3081346423562412, "grad_norm": 1.9026439516043294, "learning_rate": 8.1050031283782e-06, "loss": 0.3955, "step": 2197 }, { "epoch": 0.3082748948106592, "grad_norm": 1.8766172428071146, "learning_rate": 8.103222579670618e-06, "loss": 0.3902, "step": 2198 }, { "epoch": 0.3084151472650771, "grad_norm": 2.128621437549606, "learning_rate": 8.101441390634736e-06, "loss": 0.3842, "step": 2199 }, { "epoch": 0.3085553997194951, "grad_norm": 1.5484493220519866, "learning_rate": 8.099659561638092e-06, "loss": 0.3689, "step": 2200 }, { "epoch": 0.30869565217391304, "grad_norm": 2.0247993746614488, "learning_rate": 8.097877093048354e-06, "loss": 0.4129, "step": 2201 }, { "epoch": 0.308835904628331, "grad_norm": 2.205177595464606, "learning_rate": 8.096093985233323e-06, "loss": 0.3824, "step": 2202 }, { "epoch": 0.30897615708274895, "grad_norm": 2.5801998228123475, "learning_rate": 8.094310238560926e-06, "loss": 0.4089, "step": 2203 }, { "epoch": 0.3091164095371669, "grad_norm": 3.9177762076473956, "learning_rate": 8.092525853399231e-06, "loss": 0.3898, "step": 2204 }, { "epoch": 0.30925666199158486, "grad_norm": 3.272130591493574, "learning_rate": 8.090740830116432e-06, "loss": 0.4042, "step": 2205 }, { "epoch": 0.3093969144460028, "grad_norm": 2.539198182222068, "learning_rate": 8.088955169080856e-06, "loss": 0.3859, "step": 2206 }, { "epoch": 0.30953716690042077, "grad_norm": 1.7323134258262423, "learning_rate": 8.087168870660964e-06, "loss": 0.4205, "step": 2207 }, { "epoch": 0.3096774193548387, "grad_norm": 2.1988005706367217, "learning_rate": 8.085381935225342e-06, "loss": 0.3865, "step": 2208 }, { "epoch": 0.3098176718092567, "grad_norm": 2.3541928405206924, "learning_rate": 8.083594363142717e-06, "loss": 0.3139, "step": 2209 }, { "epoch": 0.30995792426367463, "grad_norm": 2.485741197662744, "learning_rate": 8.081806154781936e-06, "loss": 0.4228, "step": 2210 }, { "epoch": 0.3100981767180926, "grad_norm": 2.208808008197552, "learning_rate": 8.080017310511987e-06, "loss": 0.4026, "step": 2211 }, { "epoch": 0.31023842917251054, "grad_norm": 1.7956333680427607, "learning_rate": 8.078227830701985e-06, "loss": 0.3569, "step": 2212 }, { "epoch": 0.3103786816269285, "grad_norm": 2.288421541122139, "learning_rate": 8.076437715721174e-06, "loss": 0.4007, "step": 2213 }, { "epoch": 0.3105189340813464, "grad_norm": 2.7860932773548823, "learning_rate": 8.074646965938937e-06, "loss": 0.4589, "step": 2214 }, { "epoch": 0.31065918653576435, "grad_norm": 2.253996372900332, "learning_rate": 8.072855581724778e-06, "loss": 0.4302, "step": 2215 }, { "epoch": 0.3107994389901823, "grad_norm": 1.9691610966734285, "learning_rate": 8.071063563448341e-06, "loss": 0.357, "step": 2216 }, { "epoch": 0.31093969144460026, "grad_norm": 2.737957770685815, "learning_rate": 8.06927091147939e-06, "loss": 0.3727, "step": 2217 }, { "epoch": 0.3110799438990182, "grad_norm": 1.9685142012576333, "learning_rate": 8.067477626187831e-06, "loss": 0.3663, "step": 2218 }, { "epoch": 0.3112201963534362, "grad_norm": 2.0052910197330513, "learning_rate": 8.065683707943696e-06, "loss": 0.3606, "step": 2219 }, { "epoch": 0.31136044880785413, "grad_norm": 2.5818417662838926, "learning_rate": 8.063889157117148e-06, "loss": 0.4069, "step": 2220 }, { "epoch": 0.3115007012622721, "grad_norm": 1.9977260417778602, "learning_rate": 8.062093974078478e-06, "loss": 0.3314, "step": 2221 }, { "epoch": 0.31164095371669004, "grad_norm": 1.8538974895181894, "learning_rate": 8.060298159198107e-06, "loss": 0.4012, "step": 2222 }, { "epoch": 0.311781206171108, "grad_norm": 2.506972317762278, "learning_rate": 8.058501712846594e-06, "loss": 0.4327, "step": 2223 }, { "epoch": 0.31192145862552595, "grad_norm": 2.1493888829241308, "learning_rate": 8.056704635394621e-06, "loss": 0.3717, "step": 2224 }, { "epoch": 0.3120617110799439, "grad_norm": 2.0530731473323396, "learning_rate": 8.054906927213e-06, "loss": 0.3568, "step": 2225 }, { "epoch": 0.31220196353436186, "grad_norm": 2.065786344427999, "learning_rate": 8.05310858867268e-06, "loss": 0.4181, "step": 2226 }, { "epoch": 0.3123422159887798, "grad_norm": 2.300136922824164, "learning_rate": 8.051309620144733e-06, "loss": 0.3884, "step": 2227 }, { "epoch": 0.31248246844319777, "grad_norm": 3.0152160683511178, "learning_rate": 8.049510022000365e-06, "loss": 0.4056, "step": 2228 }, { "epoch": 0.3126227208976157, "grad_norm": 2.389163299689456, "learning_rate": 8.047709794610907e-06, "loss": 0.3686, "step": 2229 }, { "epoch": 0.3127629733520337, "grad_norm": 3.321035364786867, "learning_rate": 8.045908938347828e-06, "loss": 0.3865, "step": 2230 }, { "epoch": 0.31290322580645163, "grad_norm": 2.31740750421422, "learning_rate": 8.04410745358272e-06, "loss": 0.3923, "step": 2231 }, { "epoch": 0.3130434782608696, "grad_norm": 2.8486348211982975, "learning_rate": 8.042305340687307e-06, "loss": 0.4217, "step": 2232 }, { "epoch": 0.31318373071528754, "grad_norm": 2.1042025975347927, "learning_rate": 8.040502600033441e-06, "loss": 0.3795, "step": 2233 }, { "epoch": 0.31332398316970544, "grad_norm": 2.285068503757385, "learning_rate": 8.038699231993106e-06, "loss": 0.3854, "step": 2234 }, { "epoch": 0.3134642356241234, "grad_norm": 2.1754397769799563, "learning_rate": 8.036895236938416e-06, "loss": 0.3727, "step": 2235 }, { "epoch": 0.31360448807854135, "grad_norm": 2.4114890594392433, "learning_rate": 8.03509061524161e-06, "loss": 0.3539, "step": 2236 }, { "epoch": 0.3137447405329593, "grad_norm": 2.4342818902245975, "learning_rate": 8.03328536727506e-06, "loss": 0.4188, "step": 2237 }, { "epoch": 0.31388499298737726, "grad_norm": 2.581267135600186, "learning_rate": 8.031479493411268e-06, "loss": 0.4197, "step": 2238 }, { "epoch": 0.3140252454417952, "grad_norm": 2.1154362378790768, "learning_rate": 8.029672994022861e-06, "loss": 0.3966, "step": 2239 }, { "epoch": 0.3141654978962132, "grad_norm": 2.1405316631824034, "learning_rate": 8.027865869482599e-06, "loss": 0.3859, "step": 2240 }, { "epoch": 0.31430575035063113, "grad_norm": 2.423109072400933, "learning_rate": 8.02605812016337e-06, "loss": 0.3493, "step": 2241 }, { "epoch": 0.3144460028050491, "grad_norm": 1.7392404031712216, "learning_rate": 8.024249746438189e-06, "loss": 0.3639, "step": 2242 }, { "epoch": 0.31458625525946704, "grad_norm": 2.1875422882352202, "learning_rate": 8.022440748680202e-06, "loss": 0.3669, "step": 2243 }, { "epoch": 0.314726507713885, "grad_norm": 1.991900793650069, "learning_rate": 8.020631127262681e-06, "loss": 0.3711, "step": 2244 }, { "epoch": 0.31486676016830295, "grad_norm": 1.7238689833552583, "learning_rate": 8.018820882559034e-06, "loss": 0.3963, "step": 2245 }, { "epoch": 0.3150070126227209, "grad_norm": 1.7066443941158096, "learning_rate": 8.017010014942788e-06, "loss": 0.3787, "step": 2246 }, { "epoch": 0.31514726507713886, "grad_norm": 2.401744935372743, "learning_rate": 8.015198524787603e-06, "loss": 0.382, "step": 2247 }, { "epoch": 0.3152875175315568, "grad_norm": 5.288497556550409, "learning_rate": 8.013386412467268e-06, "loss": 0.3587, "step": 2248 }, { "epoch": 0.31542776998597477, "grad_norm": 1.9781105703879194, "learning_rate": 8.0115736783557e-06, "loss": 0.3705, "step": 2249 }, { "epoch": 0.3155680224403927, "grad_norm": 2.7045237720281876, "learning_rate": 8.009760322826945e-06, "loss": 0.3735, "step": 2250 }, { "epoch": 0.3157082748948107, "grad_norm": 2.23414664001861, "learning_rate": 8.007946346255176e-06, "loss": 0.4121, "step": 2251 }, { "epoch": 0.31584852734922864, "grad_norm": 1.8496678311900048, "learning_rate": 8.006131749014692e-06, "loss": 0.4051, "step": 2252 }, { "epoch": 0.3159887798036466, "grad_norm": 3.107255989984166, "learning_rate": 8.004316531479924e-06, "loss": 0.412, "step": 2253 }, { "epoch": 0.3161290322580645, "grad_norm": 2.0664564389802367, "learning_rate": 8.00250069402543e-06, "loss": 0.3983, "step": 2254 }, { "epoch": 0.31626928471248245, "grad_norm": 1.5957451094633568, "learning_rate": 8.000684237025894e-06, "loss": 0.3737, "step": 2255 }, { "epoch": 0.3164095371669004, "grad_norm": 2.449441172646948, "learning_rate": 7.998867160856133e-06, "loss": 0.4479, "step": 2256 }, { "epoch": 0.31654978962131836, "grad_norm": 3.034635403397775, "learning_rate": 7.997049465891083e-06, "loss": 0.3817, "step": 2257 }, { "epoch": 0.3166900420757363, "grad_norm": 2.1371485192505832, "learning_rate": 7.995231152505815e-06, "loss": 0.4284, "step": 2258 }, { "epoch": 0.31683029453015427, "grad_norm": 2.103018444247045, "learning_rate": 7.993412221075525e-06, "loss": 0.3598, "step": 2259 }, { "epoch": 0.3169705469845722, "grad_norm": 2.0663157913801635, "learning_rate": 7.991592671975536e-06, "loss": 0.4007, "step": 2260 }, { "epoch": 0.3171107994389902, "grad_norm": 4.100768152246181, "learning_rate": 7.9897725055813e-06, "loss": 0.3542, "step": 2261 }, { "epoch": 0.31725105189340813, "grad_norm": 2.007098950602527, "learning_rate": 7.987951722268399e-06, "loss": 0.3738, "step": 2262 }, { "epoch": 0.3173913043478261, "grad_norm": 2.696578195800087, "learning_rate": 7.986130322412532e-06, "loss": 0.3687, "step": 2263 }, { "epoch": 0.31753155680224404, "grad_norm": 2.009144856753719, "learning_rate": 7.984308306389536e-06, "loss": 0.364, "step": 2264 }, { "epoch": 0.317671809256662, "grad_norm": 4.901461262701729, "learning_rate": 7.982485674575373e-06, "loss": 0.373, "step": 2265 }, { "epoch": 0.31781206171107995, "grad_norm": 1.7093962952230575, "learning_rate": 7.980662427346127e-06, "loss": 0.3988, "step": 2266 }, { "epoch": 0.3179523141654979, "grad_norm": 2.874669760935338, "learning_rate": 7.978838565078015e-06, "loss": 0.3611, "step": 2267 }, { "epoch": 0.31809256661991586, "grad_norm": 2.7640808449886296, "learning_rate": 7.977014088147375e-06, "loss": 0.4062, "step": 2268 }, { "epoch": 0.3182328190743338, "grad_norm": 2.2714153092332197, "learning_rate": 7.975188996930679e-06, "loss": 0.3719, "step": 2269 }, { "epoch": 0.31837307152875177, "grad_norm": 2.7586359194220056, "learning_rate": 7.973363291804518e-06, "loss": 0.4288, "step": 2270 }, { "epoch": 0.3185133239831697, "grad_norm": 2.0566691426530688, "learning_rate": 7.971536973145614e-06, "loss": 0.4049, "step": 2271 }, { "epoch": 0.3186535764375877, "grad_norm": 2.645378754580899, "learning_rate": 7.96971004133082e-06, "loss": 0.3915, "step": 2272 }, { "epoch": 0.31879382889200564, "grad_norm": 2.3592871845399666, "learning_rate": 7.967882496737106e-06, "loss": 0.3707, "step": 2273 }, { "epoch": 0.31893408134642354, "grad_norm": 2.4749599530611786, "learning_rate": 7.966054339741573e-06, "loss": 0.4039, "step": 2274 }, { "epoch": 0.3190743338008415, "grad_norm": 1.8905120689775639, "learning_rate": 7.96422557072145e-06, "loss": 0.3938, "step": 2275 }, { "epoch": 0.31921458625525945, "grad_norm": 2.676230466683171, "learning_rate": 7.962396190054089e-06, "loss": 0.3574, "step": 2276 }, { "epoch": 0.3193548387096774, "grad_norm": 2.148687951488609, "learning_rate": 7.960566198116973e-06, "loss": 0.425, "step": 2277 }, { "epoch": 0.31949509116409536, "grad_norm": 3.1232190970337275, "learning_rate": 7.958735595287706e-06, "loss": 0.345, "step": 2278 }, { "epoch": 0.3196353436185133, "grad_norm": 2.2374484391904743, "learning_rate": 7.95690438194402e-06, "loss": 0.3673, "step": 2279 }, { "epoch": 0.31977559607293127, "grad_norm": 2.1124972371641975, "learning_rate": 7.955072558463772e-06, "loss": 0.3808, "step": 2280 }, { "epoch": 0.3199158485273492, "grad_norm": 2.0835221268623476, "learning_rate": 7.953240125224948e-06, "loss": 0.4209, "step": 2281 }, { "epoch": 0.3200561009817672, "grad_norm": 2.007836961411668, "learning_rate": 7.951407082605657e-06, "loss": 0.365, "step": 2282 }, { "epoch": 0.32019635343618513, "grad_norm": 2.4335629873030586, "learning_rate": 7.949573430984137e-06, "loss": 0.4528, "step": 2283 }, { "epoch": 0.3203366058906031, "grad_norm": 2.7069682764872676, "learning_rate": 7.947739170738744e-06, "loss": 0.385, "step": 2284 }, { "epoch": 0.32047685834502104, "grad_norm": 3.632064118773018, "learning_rate": 7.945904302247968e-06, "loss": 0.4091, "step": 2285 }, { "epoch": 0.320617110799439, "grad_norm": 2.464407663111946, "learning_rate": 7.944068825890424e-06, "loss": 0.343, "step": 2286 }, { "epoch": 0.32075736325385695, "grad_norm": 1.7060924890497633, "learning_rate": 7.942232742044842e-06, "loss": 0.3816, "step": 2287 }, { "epoch": 0.3208976157082749, "grad_norm": 1.9806092342231936, "learning_rate": 7.940396051090093e-06, "loss": 0.3719, "step": 2288 }, { "epoch": 0.32103786816269286, "grad_norm": 1.7567527458676493, "learning_rate": 7.938558753405162e-06, "loss": 0.4074, "step": 2289 }, { "epoch": 0.3211781206171108, "grad_norm": 1.9619867321814028, "learning_rate": 7.93672084936916e-06, "loss": 0.4201, "step": 2290 }, { "epoch": 0.3213183730715288, "grad_norm": 2.1708211194339726, "learning_rate": 7.934882339361331e-06, "loss": 0.3895, "step": 2291 }, { "epoch": 0.32145862552594673, "grad_norm": 2.2776281790983575, "learning_rate": 7.933043223761035e-06, "loss": 0.3696, "step": 2292 }, { "epoch": 0.3215988779803647, "grad_norm": 2.225626645086249, "learning_rate": 7.931203502947762e-06, "loss": 0.37, "step": 2293 }, { "epoch": 0.3217391304347826, "grad_norm": 2.0233115374267334, "learning_rate": 7.929363177301124e-06, "loss": 0.3833, "step": 2294 }, { "epoch": 0.32187938288920054, "grad_norm": 4.124759320260182, "learning_rate": 7.927522247200864e-06, "loss": 0.4233, "step": 2295 }, { "epoch": 0.3220196353436185, "grad_norm": 1.6645099593483654, "learning_rate": 7.925680713026837e-06, "loss": 0.3589, "step": 2296 }, { "epoch": 0.32215988779803645, "grad_norm": 2.6125153244518216, "learning_rate": 7.923838575159038e-06, "loss": 0.4116, "step": 2297 }, { "epoch": 0.3223001402524544, "grad_norm": 2.5757111317396366, "learning_rate": 7.921995833977575e-06, "loss": 0.3671, "step": 2298 }, { "epoch": 0.32244039270687236, "grad_norm": 2.8395032088202314, "learning_rate": 7.920152489862687e-06, "loss": 0.4082, "step": 2299 }, { "epoch": 0.3225806451612903, "grad_norm": 2.3124470060307725, "learning_rate": 7.918308543194735e-06, "loss": 0.3825, "step": 2300 }, { "epoch": 0.32272089761570827, "grad_norm": 6.590749916857166, "learning_rate": 7.916463994354203e-06, "loss": 0.3883, "step": 2301 }, { "epoch": 0.3228611500701262, "grad_norm": 2.096803857815951, "learning_rate": 7.914618843721704e-06, "loss": 0.3728, "step": 2302 }, { "epoch": 0.3230014025245442, "grad_norm": 2.4404516688498346, "learning_rate": 7.912773091677968e-06, "loss": 0.4176, "step": 2303 }, { "epoch": 0.32314165497896213, "grad_norm": 2.3188365005028824, "learning_rate": 7.910926738603855e-06, "loss": 0.3491, "step": 2304 }, { "epoch": 0.3232819074333801, "grad_norm": 2.428806059185959, "learning_rate": 7.909079784880347e-06, "loss": 0.3704, "step": 2305 }, { "epoch": 0.32342215988779804, "grad_norm": 2.3858480160663675, "learning_rate": 7.907232230888549e-06, "loss": 0.3682, "step": 2306 }, { "epoch": 0.323562412342216, "grad_norm": 2.6305744382168696, "learning_rate": 7.905384077009693e-06, "loss": 0.3828, "step": 2307 }, { "epoch": 0.32370266479663395, "grad_norm": 1.9328989957854303, "learning_rate": 7.90353532362513e-06, "loss": 0.4115, "step": 2308 }, { "epoch": 0.3238429172510519, "grad_norm": 2.29793965065034, "learning_rate": 7.90168597111634e-06, "loss": 0.3635, "step": 2309 }, { "epoch": 0.32398316970546986, "grad_norm": 1.8021945000319568, "learning_rate": 7.899836019864922e-06, "loss": 0.3607, "step": 2310 }, { "epoch": 0.3241234221598878, "grad_norm": 2.1721283469873813, "learning_rate": 7.897985470252601e-06, "loss": 0.3892, "step": 2311 }, { "epoch": 0.3242636746143058, "grad_norm": 1.7412101159385331, "learning_rate": 7.896134322661225e-06, "loss": 0.3661, "step": 2312 }, { "epoch": 0.32440392706872373, "grad_norm": 1.7566352123120275, "learning_rate": 7.894282577472764e-06, "loss": 0.3627, "step": 2313 }, { "epoch": 0.32454417952314163, "grad_norm": 1.7557127894570657, "learning_rate": 7.892430235069317e-06, "loss": 0.3883, "step": 2314 }, { "epoch": 0.3246844319775596, "grad_norm": 1.9306366474248546, "learning_rate": 7.8905772958331e-06, "loss": 0.3428, "step": 2315 }, { "epoch": 0.32482468443197754, "grad_norm": 2.1882779509176644, "learning_rate": 7.888723760146451e-06, "loss": 0.359, "step": 2316 }, { "epoch": 0.3249649368863955, "grad_norm": 1.8946200949689767, "learning_rate": 7.886869628391835e-06, "loss": 0.3839, "step": 2317 }, { "epoch": 0.32510518934081345, "grad_norm": 1.9532042220261234, "learning_rate": 7.885014900951842e-06, "loss": 0.393, "step": 2318 }, { "epoch": 0.3252454417952314, "grad_norm": 1.78474271229, "learning_rate": 7.883159578209181e-06, "loss": 0.3488, "step": 2319 }, { "epoch": 0.32538569424964936, "grad_norm": 1.8161068249090417, "learning_rate": 7.881303660546684e-06, "loss": 0.3442, "step": 2320 }, { "epoch": 0.3255259467040673, "grad_norm": 2.2887154273729875, "learning_rate": 7.879447148347307e-06, "loss": 0.3764, "step": 2321 }, { "epoch": 0.32566619915848527, "grad_norm": 2.1621102571701, "learning_rate": 7.877590041994128e-06, "loss": 0.3973, "step": 2322 }, { "epoch": 0.3258064516129032, "grad_norm": 2.3703114257488194, "learning_rate": 7.875732341870349e-06, "loss": 0.4186, "step": 2323 }, { "epoch": 0.3259467040673212, "grad_norm": 2.213481719757349, "learning_rate": 7.873874048359293e-06, "loss": 0.4075, "step": 2324 }, { "epoch": 0.32608695652173914, "grad_norm": 2.1171470722688244, "learning_rate": 7.872015161844404e-06, "loss": 0.3755, "step": 2325 }, { "epoch": 0.3262272089761571, "grad_norm": 2.992194989944526, "learning_rate": 7.870155682709253e-06, "loss": 0.3333, "step": 2326 }, { "epoch": 0.32636746143057505, "grad_norm": 2.5419015827950955, "learning_rate": 7.868295611337529e-06, "loss": 0.3672, "step": 2327 }, { "epoch": 0.326507713884993, "grad_norm": 2.0264711946199756, "learning_rate": 7.866434948113046e-06, "loss": 0.3837, "step": 2328 }, { "epoch": 0.32664796633941096, "grad_norm": 2.392167435256061, "learning_rate": 7.864573693419736e-06, "loss": 0.3492, "step": 2329 }, { "epoch": 0.3267882187938289, "grad_norm": 2.6356328239804228, "learning_rate": 7.86271184764166e-06, "loss": 0.4173, "step": 2330 }, { "epoch": 0.32692847124824687, "grad_norm": 2.710258552773532, "learning_rate": 7.860849411162995e-06, "loss": 0.3956, "step": 2331 }, { "epoch": 0.3270687237026648, "grad_norm": 2.5767390369386733, "learning_rate": 7.85898638436804e-06, "loss": 0.3483, "step": 2332 }, { "epoch": 0.3272089761570828, "grad_norm": 1.879747864474537, "learning_rate": 7.857122767641218e-06, "loss": 0.3355, "step": 2333 }, { "epoch": 0.3273492286115007, "grad_norm": 2.936038851041767, "learning_rate": 7.855258561367077e-06, "loss": 0.3801, "step": 2334 }, { "epoch": 0.32748948106591863, "grad_norm": 3.018427520018249, "learning_rate": 7.853393765930279e-06, "loss": 0.3858, "step": 2335 }, { "epoch": 0.3276297335203366, "grad_norm": 2.141056657692691, "learning_rate": 7.851528381715612e-06, "loss": 0.4352, "step": 2336 }, { "epoch": 0.32776998597475454, "grad_norm": 2.1496665074947843, "learning_rate": 7.849662409107987e-06, "loss": 0.3276, "step": 2337 }, { "epoch": 0.3279102384291725, "grad_norm": 1.960200440998415, "learning_rate": 7.847795848492432e-06, "loss": 0.3958, "step": 2338 }, { "epoch": 0.32805049088359045, "grad_norm": 2.8645196711084027, "learning_rate": 7.845928700254101e-06, "loss": 0.3691, "step": 2339 }, { "epoch": 0.3281907433380084, "grad_norm": 2.1281922989341626, "learning_rate": 7.844060964778264e-06, "loss": 0.4168, "step": 2340 }, { "epoch": 0.32833099579242636, "grad_norm": 1.9750802132457514, "learning_rate": 7.842192642450319e-06, "loss": 0.3629, "step": 2341 }, { "epoch": 0.3284712482468443, "grad_norm": 2.8224021377019075, "learning_rate": 7.84032373365578e-06, "loss": 0.3811, "step": 2342 }, { "epoch": 0.32861150070126227, "grad_norm": 2.608970839215009, "learning_rate": 7.838454238780282e-06, "loss": 0.3773, "step": 2343 }, { "epoch": 0.3287517531556802, "grad_norm": 2.193014333459828, "learning_rate": 7.836584158209581e-06, "loss": 0.3571, "step": 2344 }, { "epoch": 0.3288920056100982, "grad_norm": 6.386539860357833, "learning_rate": 7.83471349232956e-06, "loss": 0.3563, "step": 2345 }, { "epoch": 0.32903225806451614, "grad_norm": 5.171146583546573, "learning_rate": 7.832842241526212e-06, "loss": 0.381, "step": 2346 }, { "epoch": 0.3291725105189341, "grad_norm": 2.0223915018183787, "learning_rate": 7.83097040618566e-06, "loss": 0.3569, "step": 2347 }, { "epoch": 0.32931276297335205, "grad_norm": 1.9955994951510179, "learning_rate": 7.829097986694145e-06, "loss": 0.4056, "step": 2348 }, { "epoch": 0.32945301542777, "grad_norm": 2.2628873925471584, "learning_rate": 7.827224983438024e-06, "loss": 0.3753, "step": 2349 }, { "epoch": 0.32959326788218796, "grad_norm": 2.3359155558190103, "learning_rate": 7.825351396803783e-06, "loss": 0.4256, "step": 2350 }, { "epoch": 0.3297335203366059, "grad_norm": 2.09880594184664, "learning_rate": 7.823477227178019e-06, "loss": 0.4326, "step": 2351 }, { "epoch": 0.32987377279102387, "grad_norm": 2.1746188608095944, "learning_rate": 7.821602474947454e-06, "loss": 0.3954, "step": 2352 }, { "epoch": 0.3300140252454418, "grad_norm": 2.5494184840731022, "learning_rate": 7.819727140498933e-06, "loss": 0.3784, "step": 2353 }, { "epoch": 0.3301542776998597, "grad_norm": 2.7994321076925286, "learning_rate": 7.817851224219417e-06, "loss": 0.4024, "step": 2354 }, { "epoch": 0.3302945301542777, "grad_norm": 2.2650453023190544, "learning_rate": 7.815974726495988e-06, "loss": 0.3878, "step": 2355 }, { "epoch": 0.33043478260869563, "grad_norm": 2.250966182527586, "learning_rate": 7.814097647715848e-06, "loss": 0.3697, "step": 2356 }, { "epoch": 0.3305750350631136, "grad_norm": 2.2789900572402146, "learning_rate": 7.812219988266318e-06, "loss": 0.373, "step": 2357 }, { "epoch": 0.33071528751753154, "grad_norm": 2.065844050117118, "learning_rate": 7.810341748534843e-06, "loss": 0.404, "step": 2358 }, { "epoch": 0.3308555399719495, "grad_norm": 2.104844604156731, "learning_rate": 7.808462928908982e-06, "loss": 0.4121, "step": 2359 }, { "epoch": 0.33099579242636745, "grad_norm": 2.038429600502812, "learning_rate": 7.806583529776417e-06, "loss": 0.3584, "step": 2360 }, { "epoch": 0.3311360448807854, "grad_norm": 1.806893721714098, "learning_rate": 7.804703551524948e-06, "loss": 0.3586, "step": 2361 }, { "epoch": 0.33127629733520336, "grad_norm": 2.1756104106024874, "learning_rate": 7.802822994542498e-06, "loss": 0.4208, "step": 2362 }, { "epoch": 0.3314165497896213, "grad_norm": 1.8956128365981368, "learning_rate": 7.800941859217103e-06, "loss": 0.3645, "step": 2363 }, { "epoch": 0.3315568022440393, "grad_norm": 1.9215483349079334, "learning_rate": 7.799060145936928e-06, "loss": 0.3531, "step": 2364 }, { "epoch": 0.3316970546984572, "grad_norm": 2.000847279368834, "learning_rate": 7.797177855090246e-06, "loss": 0.3398, "step": 2365 }, { "epoch": 0.3318373071528752, "grad_norm": 1.7520650532328665, "learning_rate": 7.795294987065456e-06, "loss": 0.3294, "step": 2366 }, { "epoch": 0.33197755960729314, "grad_norm": 2.994069595639065, "learning_rate": 7.793411542251074e-06, "loss": 0.3965, "step": 2367 }, { "epoch": 0.3321178120617111, "grad_norm": 2.7628920678213573, "learning_rate": 7.791527521035736e-06, "loss": 0.3472, "step": 2368 }, { "epoch": 0.33225806451612905, "grad_norm": 2.155078625052337, "learning_rate": 7.789642923808199e-06, "loss": 0.3909, "step": 2369 }, { "epoch": 0.332398316970547, "grad_norm": 2.462173296948808, "learning_rate": 7.787757750957335e-06, "loss": 0.367, "step": 2370 }, { "epoch": 0.33253856942496496, "grad_norm": 5.262808660665833, "learning_rate": 7.785872002872134e-06, "loss": 0.3641, "step": 2371 }, { "epoch": 0.3326788218793829, "grad_norm": 2.6289115100824976, "learning_rate": 7.78398567994171e-06, "loss": 0.3994, "step": 2372 }, { "epoch": 0.33281907433380087, "grad_norm": 5.153026336040004, "learning_rate": 7.78209878255529e-06, "loss": 0.3913, "step": 2373 }, { "epoch": 0.33295932678821877, "grad_norm": 1.9966613201500707, "learning_rate": 7.780211311102226e-06, "loss": 0.3873, "step": 2374 }, { "epoch": 0.3330995792426367, "grad_norm": 2.1044308965480303, "learning_rate": 7.77832326597198e-06, "loss": 0.3905, "step": 2375 }, { "epoch": 0.3332398316970547, "grad_norm": 2.6982361789654417, "learning_rate": 7.77643464755414e-06, "loss": 0.3305, "step": 2376 }, { "epoch": 0.33338008415147263, "grad_norm": 2.3561274934284238, "learning_rate": 7.77454545623841e-06, "loss": 0.3842, "step": 2377 }, { "epoch": 0.3335203366058906, "grad_norm": 2.310475674819604, "learning_rate": 7.772655692414606e-06, "loss": 0.4197, "step": 2378 }, { "epoch": 0.33366058906030854, "grad_norm": 2.2470347680600873, "learning_rate": 7.770765356472672e-06, "loss": 0.373, "step": 2379 }, { "epoch": 0.3338008415147265, "grad_norm": 2.0585568042892564, "learning_rate": 7.768874448802665e-06, "loss": 0.3725, "step": 2380 }, { "epoch": 0.33394109396914445, "grad_norm": 2.0365224634660084, "learning_rate": 7.766982969794762e-06, "loss": 0.4236, "step": 2381 }, { "epoch": 0.3340813464235624, "grad_norm": 2.223542852847889, "learning_rate": 7.765090919839253e-06, "loss": 0.3733, "step": 2382 }, { "epoch": 0.33422159887798036, "grad_norm": 2.993459565602722, "learning_rate": 7.763198299326553e-06, "loss": 0.3895, "step": 2383 }, { "epoch": 0.3343618513323983, "grad_norm": 1.876552701824441, "learning_rate": 7.761305108647188e-06, "loss": 0.4387, "step": 2384 }, { "epoch": 0.3345021037868163, "grad_norm": 1.9082531644259433, "learning_rate": 7.759411348191806e-06, "loss": 0.4458, "step": 2385 }, { "epoch": 0.33464235624123423, "grad_norm": 1.869581403749016, "learning_rate": 7.75751701835117e-06, "loss": 0.366, "step": 2386 }, { "epoch": 0.3347826086956522, "grad_norm": 3.364589458889104, "learning_rate": 7.755622119516163e-06, "loss": 0.3282, "step": 2387 }, { "epoch": 0.33492286115007014, "grad_norm": 3.0999076521869813, "learning_rate": 7.753726652077787e-06, "loss": 0.3788, "step": 2388 }, { "epoch": 0.3350631136044881, "grad_norm": 1.8586692432338272, "learning_rate": 7.751830616427151e-06, "loss": 0.334, "step": 2389 }, { "epoch": 0.33520336605890605, "grad_norm": 3.339386990193841, "learning_rate": 7.749934012955497e-06, "loss": 0.3783, "step": 2390 }, { "epoch": 0.335343618513324, "grad_norm": 2.4828943602773106, "learning_rate": 7.74803684205417e-06, "loss": 0.4059, "step": 2391 }, { "epoch": 0.33548387096774196, "grad_norm": 2.226278400700491, "learning_rate": 7.74613910411464e-06, "loss": 0.314, "step": 2392 }, { "epoch": 0.3356241234221599, "grad_norm": 2.8409002088928093, "learning_rate": 7.744240799528492e-06, "loss": 0.3928, "step": 2393 }, { "epoch": 0.3357643758765778, "grad_norm": 2.353856688040037, "learning_rate": 7.742341928687427e-06, "loss": 0.425, "step": 2394 }, { "epoch": 0.33590462833099577, "grad_norm": 2.5053773269174315, "learning_rate": 7.740442491983266e-06, "loss": 0.3987, "step": 2395 }, { "epoch": 0.3360448807854137, "grad_norm": 2.6549699279461416, "learning_rate": 7.738542489807942e-06, "loss": 0.3777, "step": 2396 }, { "epoch": 0.3361851332398317, "grad_norm": 2.4223000430322665, "learning_rate": 7.736641922553509e-06, "loss": 0.4271, "step": 2397 }, { "epoch": 0.33632538569424963, "grad_norm": 2.9602781180583158, "learning_rate": 7.734740790612137e-06, "loss": 0.3811, "step": 2398 }, { "epoch": 0.3364656381486676, "grad_norm": 5.159054634965838, "learning_rate": 7.732839094376106e-06, "loss": 0.3788, "step": 2399 }, { "epoch": 0.33660589060308554, "grad_norm": 4.063659737182079, "learning_rate": 7.730936834237821e-06, "loss": 0.3857, "step": 2400 }, { "epoch": 0.3367461430575035, "grad_norm": 2.183506835954752, "learning_rate": 7.7290340105898e-06, "loss": 0.3857, "step": 2401 }, { "epoch": 0.33688639551192145, "grad_norm": 2.299229459643657, "learning_rate": 7.72713062382468e-06, "loss": 0.4217, "step": 2402 }, { "epoch": 0.3370266479663394, "grad_norm": 1.8967634441622792, "learning_rate": 7.725226674335208e-06, "loss": 0.3645, "step": 2403 }, { "epoch": 0.33716690042075736, "grad_norm": 1.9791544328278008, "learning_rate": 7.72332216251425e-06, "loss": 0.4066, "step": 2404 }, { "epoch": 0.3373071528751753, "grad_norm": 1.7195509691507262, "learning_rate": 7.72141708875479e-06, "loss": 0.3532, "step": 2405 }, { "epoch": 0.3374474053295933, "grad_norm": 2.0027431510910088, "learning_rate": 7.71951145344993e-06, "loss": 0.3883, "step": 2406 }, { "epoch": 0.33758765778401123, "grad_norm": 4.2355480188109444, "learning_rate": 7.71760525699288e-06, "loss": 0.3466, "step": 2407 }, { "epoch": 0.3377279102384292, "grad_norm": 2.4843475532215202, "learning_rate": 7.715698499776973e-06, "loss": 0.3571, "step": 2408 }, { "epoch": 0.33786816269284714, "grad_norm": 2.0882875156272913, "learning_rate": 7.713791182195653e-06, "loss": 0.3594, "step": 2409 }, { "epoch": 0.3380084151472651, "grad_norm": 3.5033987436863625, "learning_rate": 7.711883304642482e-06, "loss": 0.3713, "step": 2410 }, { "epoch": 0.33814866760168305, "grad_norm": 2.610094713560664, "learning_rate": 7.709974867511139e-06, "loss": 0.3516, "step": 2411 }, { "epoch": 0.338288920056101, "grad_norm": 2.9864461213097147, "learning_rate": 7.708065871195413e-06, "loss": 0.3503, "step": 2412 }, { "epoch": 0.33842917251051896, "grad_norm": 3.567500436484745, "learning_rate": 7.706156316089218e-06, "loss": 0.42, "step": 2413 }, { "epoch": 0.33856942496493686, "grad_norm": 2.7769600003541455, "learning_rate": 7.704246202586572e-06, "loss": 0.3487, "step": 2414 }, { "epoch": 0.3387096774193548, "grad_norm": 2.391267577932771, "learning_rate": 7.702335531081616e-06, "loss": 0.3733, "step": 2415 }, { "epoch": 0.33884992987377277, "grad_norm": 3.2589702058584917, "learning_rate": 7.700424301968603e-06, "loss": 0.4378, "step": 2416 }, { "epoch": 0.3389901823281907, "grad_norm": 2.8489535167218674, "learning_rate": 7.698512515641903e-06, "loss": 0.4128, "step": 2417 }, { "epoch": 0.3391304347826087, "grad_norm": 3.465459826895792, "learning_rate": 7.696600172495997e-06, "loss": 0.3582, "step": 2418 }, { "epoch": 0.33927068723702664, "grad_norm": 4.119207534499843, "learning_rate": 7.694687272925487e-06, "loss": 0.3811, "step": 2419 }, { "epoch": 0.3394109396914446, "grad_norm": 2.8037132334923984, "learning_rate": 7.692773817325082e-06, "loss": 0.3816, "step": 2420 }, { "epoch": 0.33955119214586255, "grad_norm": 2.214389007549896, "learning_rate": 7.690859806089615e-06, "loss": 0.4281, "step": 2421 }, { "epoch": 0.3396914446002805, "grad_norm": 3.14416744425953, "learning_rate": 7.688945239614027e-06, "loss": 0.3599, "step": 2422 }, { "epoch": 0.33983169705469846, "grad_norm": 3.3112959202843673, "learning_rate": 7.687030118293375e-06, "loss": 0.4218, "step": 2423 }, { "epoch": 0.3399719495091164, "grad_norm": 1.9679367565599826, "learning_rate": 7.685114442522831e-06, "loss": 0.3866, "step": 2424 }, { "epoch": 0.34011220196353437, "grad_norm": 2.8687340829445067, "learning_rate": 7.683198212697682e-06, "loss": 0.3192, "step": 2425 }, { "epoch": 0.3402524544179523, "grad_norm": 2.141744947177933, "learning_rate": 7.681281429213328e-06, "loss": 0.3446, "step": 2426 }, { "epoch": 0.3403927068723703, "grad_norm": 3.884461449960031, "learning_rate": 7.679364092465282e-06, "loss": 0.3409, "step": 2427 }, { "epoch": 0.34053295932678823, "grad_norm": 3.252291678369861, "learning_rate": 7.677446202849178e-06, "loss": 0.4001, "step": 2428 }, { "epoch": 0.3406732117812062, "grad_norm": 2.5693135740843074, "learning_rate": 7.675527760760755e-06, "loss": 0.3625, "step": 2429 }, { "epoch": 0.34081346423562414, "grad_norm": 1.94779264765855, "learning_rate": 7.67360876659587e-06, "loss": 0.3817, "step": 2430 }, { "epoch": 0.3409537166900421, "grad_norm": 2.9129512191758122, "learning_rate": 7.671689220750497e-06, "loss": 0.4316, "step": 2431 }, { "epoch": 0.34109396914446005, "grad_norm": 2.414421277097682, "learning_rate": 7.669769123620719e-06, "loss": 0.3527, "step": 2432 }, { "epoch": 0.341234221598878, "grad_norm": 2.5488487038201817, "learning_rate": 7.667848475602735e-06, "loss": 0.3615, "step": 2433 }, { "epoch": 0.3413744740532959, "grad_norm": 2.1510068753610114, "learning_rate": 7.665927277092855e-06, "loss": 0.3252, "step": 2434 }, { "epoch": 0.34151472650771386, "grad_norm": 4.5114987010055545, "learning_rate": 7.664005528487508e-06, "loss": 0.3551, "step": 2435 }, { "epoch": 0.3416549789621318, "grad_norm": 2.224840428491665, "learning_rate": 7.662083230183234e-06, "loss": 0.4257, "step": 2436 }, { "epoch": 0.34179523141654977, "grad_norm": 2.1687518380896615, "learning_rate": 7.660160382576683e-06, "loss": 0.3622, "step": 2437 }, { "epoch": 0.3419354838709677, "grad_norm": 2.0700605131140737, "learning_rate": 7.658236986064624e-06, "loss": 0.4092, "step": 2438 }, { "epoch": 0.3420757363253857, "grad_norm": 2.952289281394626, "learning_rate": 7.656313041043934e-06, "loss": 0.3376, "step": 2439 }, { "epoch": 0.34221598877980364, "grad_norm": 2.1084519238598425, "learning_rate": 7.654388547911605e-06, "loss": 0.3996, "step": 2440 }, { "epoch": 0.3423562412342216, "grad_norm": 2.488196546708823, "learning_rate": 7.652463507064745e-06, "loss": 0.3486, "step": 2441 }, { "epoch": 0.34249649368863955, "grad_norm": 2.6391789014858107, "learning_rate": 7.650537918900573e-06, "loss": 0.3781, "step": 2442 }, { "epoch": 0.3426367461430575, "grad_norm": 2.290251647582189, "learning_rate": 7.648611783816417e-06, "loss": 0.3674, "step": 2443 }, { "epoch": 0.34277699859747546, "grad_norm": 2.127596222867731, "learning_rate": 7.646685102209726e-06, "loss": 0.4083, "step": 2444 }, { "epoch": 0.3429172510518934, "grad_norm": 3.774445519556818, "learning_rate": 7.644757874478056e-06, "loss": 0.3805, "step": 2445 }, { "epoch": 0.34305750350631137, "grad_norm": 2.7730142619677265, "learning_rate": 7.642830101019075e-06, "loss": 0.4198, "step": 2446 }, { "epoch": 0.3431977559607293, "grad_norm": 2.4049010025237414, "learning_rate": 7.640901782230567e-06, "loss": 0.3435, "step": 2447 }, { "epoch": 0.3433380084151473, "grad_norm": 3.2781681665245817, "learning_rate": 7.638972918510428e-06, "loss": 0.3723, "step": 2448 }, { "epoch": 0.34347826086956523, "grad_norm": 2.103059823782007, "learning_rate": 7.637043510256663e-06, "loss": 0.3603, "step": 2449 }, { "epoch": 0.3436185133239832, "grad_norm": 2.5055351969344577, "learning_rate": 7.635113557867395e-06, "loss": 0.36, "step": 2450 }, { "epoch": 0.34375876577840114, "grad_norm": 2.0768178399640798, "learning_rate": 7.633183061740853e-06, "loss": 0.4151, "step": 2451 }, { "epoch": 0.3438990182328191, "grad_norm": 3.3709916224210077, "learning_rate": 7.631252022275386e-06, "loss": 0.4023, "step": 2452 }, { "epoch": 0.34403927068723705, "grad_norm": 4.505497647148638, "learning_rate": 7.6293204398694455e-06, "loss": 0.3903, "step": 2453 }, { "epoch": 0.34417952314165495, "grad_norm": 2.319049427204839, "learning_rate": 7.627388314921602e-06, "loss": 0.3493, "step": 2454 }, { "epoch": 0.3443197755960729, "grad_norm": 1.861841313165259, "learning_rate": 7.625455647830537e-06, "loss": 0.3726, "step": 2455 }, { "epoch": 0.34446002805049086, "grad_norm": 2.9880319927703813, "learning_rate": 7.62352243899504e-06, "loss": 0.3716, "step": 2456 }, { "epoch": 0.3446002805049088, "grad_norm": 2.859842230973515, "learning_rate": 7.621588688814019e-06, "loss": 0.4044, "step": 2457 }, { "epoch": 0.3447405329593268, "grad_norm": 3.1532608834694837, "learning_rate": 7.619654397686488e-06, "loss": 0.3611, "step": 2458 }, { "epoch": 0.34488078541374473, "grad_norm": 2.467739429549011, "learning_rate": 7.617719566011575e-06, "loss": 0.378, "step": 2459 }, { "epoch": 0.3450210378681627, "grad_norm": 2.0329663581297535, "learning_rate": 7.615784194188516e-06, "loss": 0.3856, "step": 2460 }, { "epoch": 0.34516129032258064, "grad_norm": 1.9632030636952378, "learning_rate": 7.613848282616665e-06, "loss": 0.3656, "step": 2461 }, { "epoch": 0.3453015427769986, "grad_norm": 2.4892160525135836, "learning_rate": 7.611911831695482e-06, "loss": 0.3964, "step": 2462 }, { "epoch": 0.34544179523141655, "grad_norm": 2.1418058382327154, "learning_rate": 7.609974841824543e-06, "loss": 0.3911, "step": 2463 }, { "epoch": 0.3455820476858345, "grad_norm": 2.8403491130680476, "learning_rate": 7.608037313403529e-06, "loss": 0.3763, "step": 2464 }, { "epoch": 0.34572230014025246, "grad_norm": 3.4438748982621843, "learning_rate": 7.606099246832234e-06, "loss": 0.3976, "step": 2465 }, { "epoch": 0.3458625525946704, "grad_norm": 2.3548973997407328, "learning_rate": 7.60416064251057e-06, "loss": 0.3882, "step": 2466 }, { "epoch": 0.34600280504908837, "grad_norm": 1.840302428394339, "learning_rate": 7.602221500838553e-06, "loss": 0.4043, "step": 2467 }, { "epoch": 0.3461430575035063, "grad_norm": 2.402767286724887, "learning_rate": 7.600281822216307e-06, "loss": 0.3924, "step": 2468 }, { "epoch": 0.3462833099579243, "grad_norm": 2.2058958120496452, "learning_rate": 7.598341607044075e-06, "loss": 0.3676, "step": 2469 }, { "epoch": 0.34642356241234223, "grad_norm": 3.0197888026377973, "learning_rate": 7.596400855722206e-06, "loss": 0.4034, "step": 2470 }, { "epoch": 0.3465638148667602, "grad_norm": 3.4526554668760707, "learning_rate": 7.594459568651159e-06, "loss": 0.4048, "step": 2471 }, { "epoch": 0.34670406732117814, "grad_norm": 2.188374371847153, "learning_rate": 7.592517746231507e-06, "loss": 0.3253, "step": 2472 }, { "epoch": 0.3468443197755961, "grad_norm": 1.8931366182048215, "learning_rate": 7.590575388863932e-06, "loss": 0.3797, "step": 2473 }, { "epoch": 0.346984572230014, "grad_norm": 2.881291678369242, "learning_rate": 7.588632496949223e-06, "loss": 0.4079, "step": 2474 }, { "epoch": 0.34712482468443195, "grad_norm": 2.4911064266592207, "learning_rate": 7.586689070888284e-06, "loss": 0.3704, "step": 2475 }, { "epoch": 0.3472650771388499, "grad_norm": 2.190078741251853, "learning_rate": 7.584745111082128e-06, "loss": 0.4035, "step": 2476 }, { "epoch": 0.34740532959326786, "grad_norm": 1.7503280332203313, "learning_rate": 7.582800617931876e-06, "loss": 0.3706, "step": 2477 }, { "epoch": 0.3475455820476858, "grad_norm": 2.190366882702773, "learning_rate": 7.580855591838763e-06, "loss": 0.4068, "step": 2478 }, { "epoch": 0.3476858345021038, "grad_norm": 2.887774902733068, "learning_rate": 7.578910033204129e-06, "loss": 0.3917, "step": 2479 }, { "epoch": 0.34782608695652173, "grad_norm": 2.0859305278075775, "learning_rate": 7.576963942429427e-06, "loss": 0.4256, "step": 2480 }, { "epoch": 0.3479663394109397, "grad_norm": 2.963583214618868, "learning_rate": 7.5750173199162204e-06, "loss": 0.4112, "step": 2481 }, { "epoch": 0.34810659186535764, "grad_norm": 2.2355417259859562, "learning_rate": 7.5730701660661795e-06, "loss": 0.4027, "step": 2482 }, { "epoch": 0.3482468443197756, "grad_norm": 2.409134857045145, "learning_rate": 7.571122481281088e-06, "loss": 0.3822, "step": 2483 }, { "epoch": 0.34838709677419355, "grad_norm": 2.283823834474823, "learning_rate": 7.569174265962834e-06, "loss": 0.3528, "step": 2484 }, { "epoch": 0.3485273492286115, "grad_norm": 2.3789784086962555, "learning_rate": 7.567225520513422e-06, "loss": 0.3977, "step": 2485 }, { "epoch": 0.34866760168302946, "grad_norm": 2.490071034777873, "learning_rate": 7.565276245334957e-06, "loss": 0.3935, "step": 2486 }, { "epoch": 0.3488078541374474, "grad_norm": 1.9318882674493174, "learning_rate": 7.563326440829662e-06, "loss": 0.3977, "step": 2487 }, { "epoch": 0.34894810659186537, "grad_norm": 2.3107523113380735, "learning_rate": 7.561376107399867e-06, "loss": 0.4, "step": 2488 }, { "epoch": 0.3490883590462833, "grad_norm": 2.1177477781491043, "learning_rate": 7.559425245448006e-06, "loss": 0.3658, "step": 2489 }, { "epoch": 0.3492286115007013, "grad_norm": 2.581725409742212, "learning_rate": 7.557473855376627e-06, "loss": 0.379, "step": 2490 }, { "epoch": 0.34936886395511924, "grad_norm": 2.2414967606074563, "learning_rate": 7.555521937588386e-06, "loss": 0.3653, "step": 2491 }, { "epoch": 0.3495091164095372, "grad_norm": 2.2774972225972063, "learning_rate": 7.553569492486048e-06, "loss": 0.3768, "step": 2492 }, { "epoch": 0.34964936886395515, "grad_norm": 3.2972873231038076, "learning_rate": 7.551616520472485e-06, "loss": 0.3989, "step": 2493 }, { "epoch": 0.34978962131837305, "grad_norm": 1.9989039875042462, "learning_rate": 7.5496630219506805e-06, "loss": 0.4047, "step": 2494 }, { "epoch": 0.349929873772791, "grad_norm": 2.494488362541893, "learning_rate": 7.547708997323724e-06, "loss": 0.3927, "step": 2495 }, { "epoch": 0.35007012622720896, "grad_norm": 2.2986865522663305, "learning_rate": 7.5457544469948164e-06, "loss": 0.3916, "step": 2496 }, { "epoch": 0.3502103786816269, "grad_norm": 2.8952696729643357, "learning_rate": 7.543799371367264e-06, "loss": 0.3776, "step": 2497 }, { "epoch": 0.35035063113604487, "grad_norm": 3.6559564195452015, "learning_rate": 7.541843770844486e-06, "loss": 0.3756, "step": 2498 }, { "epoch": 0.3504908835904628, "grad_norm": 3.529219862451816, "learning_rate": 7.539887645830002e-06, "loss": 0.3758, "step": 2499 }, { "epoch": 0.3506311360448808, "grad_norm": 1.9723541330570262, "learning_rate": 7.537930996727448e-06, "loss": 0.3996, "step": 2500 }, { "epoch": 0.35077138849929873, "grad_norm": 2.571584647603979, "learning_rate": 7.535973823940566e-06, "loss": 0.3423, "step": 2501 }, { "epoch": 0.3509116409537167, "grad_norm": 2.3237401645856504, "learning_rate": 7.5340161278732e-06, "loss": 0.4229, "step": 2502 }, { "epoch": 0.35105189340813464, "grad_norm": 2.8951128788421694, "learning_rate": 7.532057908929311e-06, "loss": 0.3902, "step": 2503 }, { "epoch": 0.3511921458625526, "grad_norm": 2.360675983600532, "learning_rate": 7.530099167512965e-06, "loss": 0.4054, "step": 2504 }, { "epoch": 0.35133239831697055, "grad_norm": 2.5494885286984696, "learning_rate": 7.528139904028331e-06, "loss": 0.4239, "step": 2505 }, { "epoch": 0.3514726507713885, "grad_norm": 2.5976076049839043, "learning_rate": 7.5261801188796904e-06, "loss": 0.394, "step": 2506 }, { "epoch": 0.35161290322580646, "grad_norm": 3.608933458940407, "learning_rate": 7.524219812471432e-06, "loss": 0.3978, "step": 2507 }, { "epoch": 0.3517531556802244, "grad_norm": 2.094553551275927, "learning_rate": 7.5222589852080505e-06, "loss": 0.4001, "step": 2508 }, { "epoch": 0.35189340813464237, "grad_norm": 2.2619169211001062, "learning_rate": 7.520297637494149e-06, "loss": 0.4428, "step": 2509 }, { "epoch": 0.3520336605890603, "grad_norm": 1.9980056475043073, "learning_rate": 7.5183357697344395e-06, "loss": 0.3501, "step": 2510 }, { "epoch": 0.3521739130434783, "grad_norm": 2.8890903007362208, "learning_rate": 7.516373382333737e-06, "loss": 0.3937, "step": 2511 }, { "epoch": 0.35231416549789624, "grad_norm": 5.590624228957727, "learning_rate": 7.51441047569697e-06, "loss": 0.3872, "step": 2512 }, { "epoch": 0.3524544179523142, "grad_norm": 2.074817520758344, "learning_rate": 7.512447050229166e-06, "loss": 0.4052, "step": 2513 }, { "epoch": 0.3525946704067321, "grad_norm": 2.200763708551853, "learning_rate": 7.510483106335468e-06, "loss": 0.3893, "step": 2514 }, { "epoch": 0.35273492286115005, "grad_norm": 2.684573509683436, "learning_rate": 7.508518644421119e-06, "loss": 0.3639, "step": 2515 }, { "epoch": 0.352875175315568, "grad_norm": 2.3800293725068524, "learning_rate": 7.506553664891475e-06, "loss": 0.3835, "step": 2516 }, { "epoch": 0.35301542776998596, "grad_norm": 2.1144641660990136, "learning_rate": 7.504588168151994e-06, "loss": 0.3707, "step": 2517 }, { "epoch": 0.3531556802244039, "grad_norm": 2.574608734181712, "learning_rate": 7.502622154608243e-06, "loss": 0.3698, "step": 2518 }, { "epoch": 0.35329593267882187, "grad_norm": 2.7953633746767834, "learning_rate": 7.500655624665896e-06, "loss": 0.389, "step": 2519 }, { "epoch": 0.3534361851332398, "grad_norm": 2.5719559352001125, "learning_rate": 7.498688578730731e-06, "loss": 0.3512, "step": 2520 }, { "epoch": 0.3535764375876578, "grad_norm": 3.540543606286348, "learning_rate": 7.496721017208634e-06, "loss": 0.3497, "step": 2521 }, { "epoch": 0.35371669004207573, "grad_norm": 2.952888600409061, "learning_rate": 7.4947529405056005e-06, "loss": 0.3782, "step": 2522 }, { "epoch": 0.3538569424964937, "grad_norm": 2.6493845458869996, "learning_rate": 7.492784349027726e-06, "loss": 0.4001, "step": 2523 }, { "epoch": 0.35399719495091164, "grad_norm": 2.7959957339045514, "learning_rate": 7.4908152431812175e-06, "loss": 0.4351, "step": 2524 }, { "epoch": 0.3541374474053296, "grad_norm": 2.281654740054621, "learning_rate": 7.488845623372386e-06, "loss": 0.4025, "step": 2525 }, { "epoch": 0.35427769985974755, "grad_norm": 3.563813486058933, "learning_rate": 7.486875490007648e-06, "loss": 0.3807, "step": 2526 }, { "epoch": 0.3544179523141655, "grad_norm": 3.1964825490913458, "learning_rate": 7.484904843493528e-06, "loss": 0.36, "step": 2527 }, { "epoch": 0.35455820476858346, "grad_norm": 4.18792380850054, "learning_rate": 7.482933684236654e-06, "loss": 0.3721, "step": 2528 }, { "epoch": 0.3546984572230014, "grad_norm": 4.316806577872319, "learning_rate": 7.480962012643762e-06, "loss": 0.4127, "step": 2529 }, { "epoch": 0.3548387096774194, "grad_norm": 2.639916138617495, "learning_rate": 7.478989829121691e-06, "loss": 0.4203, "step": 2530 }, { "epoch": 0.35497896213183733, "grad_norm": 4.334808807556717, "learning_rate": 7.477017134077389e-06, "loss": 0.3761, "step": 2531 }, { "epoch": 0.3551192145862553, "grad_norm": 2.9008196889075526, "learning_rate": 7.475043927917908e-06, "loss": 0.3883, "step": 2532 }, { "epoch": 0.35525946704067324, "grad_norm": 2.070973071392494, "learning_rate": 7.473070211050404e-06, "loss": 0.3827, "step": 2533 }, { "epoch": 0.35539971949509114, "grad_norm": 2.356417286244183, "learning_rate": 7.47109598388214e-06, "loss": 0.4041, "step": 2534 }, { "epoch": 0.3555399719495091, "grad_norm": 2.879972718427506, "learning_rate": 7.469121246820483e-06, "loss": 0.363, "step": 2535 }, { "epoch": 0.35568022440392705, "grad_norm": 2.394098189804574, "learning_rate": 7.467146000272909e-06, "loss": 0.3474, "step": 2536 }, { "epoch": 0.355820476858345, "grad_norm": 4.893085092558933, "learning_rate": 7.4651702446469944e-06, "loss": 0.371, "step": 2537 }, { "epoch": 0.35596072931276296, "grad_norm": 2.3821635378758743, "learning_rate": 7.4631939803504215e-06, "loss": 0.3941, "step": 2538 }, { "epoch": 0.3561009817671809, "grad_norm": 2.992127420404914, "learning_rate": 7.4612172077909815e-06, "loss": 0.3235, "step": 2539 }, { "epoch": 0.35624123422159887, "grad_norm": 2.1565393447240924, "learning_rate": 7.459239927376566e-06, "loss": 0.4045, "step": 2540 }, { "epoch": 0.3563814866760168, "grad_norm": 3.4030323755078915, "learning_rate": 7.457262139515172e-06, "loss": 0.4094, "step": 2541 }, { "epoch": 0.3565217391304348, "grad_norm": 2.5291703711728193, "learning_rate": 7.455283844614906e-06, "loss": 0.4277, "step": 2542 }, { "epoch": 0.35666199158485273, "grad_norm": 2.4378799729190153, "learning_rate": 7.453305043083969e-06, "loss": 0.3948, "step": 2543 }, { "epoch": 0.3568022440392707, "grad_norm": 2.3756464028341484, "learning_rate": 7.451325735330679e-06, "loss": 0.3698, "step": 2544 }, { "epoch": 0.35694249649368864, "grad_norm": 2.795381797413467, "learning_rate": 7.449345921763449e-06, "loss": 0.3866, "step": 2545 }, { "epoch": 0.3570827489481066, "grad_norm": 2.75375785698378, "learning_rate": 7.4473656027908005e-06, "loss": 0.4009, "step": 2546 }, { "epoch": 0.35722300140252455, "grad_norm": 2.2805925231778916, "learning_rate": 7.445384778821358e-06, "loss": 0.3339, "step": 2547 }, { "epoch": 0.3573632538569425, "grad_norm": 1.807017881852776, "learning_rate": 7.443403450263852e-06, "loss": 0.3592, "step": 2548 }, { "epoch": 0.35750350631136046, "grad_norm": 2.623471950647878, "learning_rate": 7.441421617527116e-06, "loss": 0.3805, "step": 2549 }, { "epoch": 0.3576437587657784, "grad_norm": 4.469224811218161, "learning_rate": 7.439439281020085e-06, "loss": 0.3692, "step": 2550 }, { "epoch": 0.3577840112201964, "grad_norm": 2.238702278416659, "learning_rate": 7.4374564411518e-06, "loss": 0.383, "step": 2551 }, { "epoch": 0.35792426367461433, "grad_norm": 2.512817901692813, "learning_rate": 7.435473098331411e-06, "loss": 0.3578, "step": 2552 }, { "epoch": 0.3580645161290323, "grad_norm": 2.3466446358376287, "learning_rate": 7.4334892529681625e-06, "loss": 0.3449, "step": 2553 }, { "epoch": 0.3582047685834502, "grad_norm": 2.1224095880417666, "learning_rate": 7.431504905471407e-06, "loss": 0.3974, "step": 2554 }, { "epoch": 0.35834502103786814, "grad_norm": 3.0151948770327226, "learning_rate": 7.4295200562506045e-06, "loss": 0.4193, "step": 2555 }, { "epoch": 0.3584852734922861, "grad_norm": 2.840051278739372, "learning_rate": 7.427534705715311e-06, "loss": 0.3618, "step": 2556 }, { "epoch": 0.35862552594670405, "grad_norm": 1.8249205193130171, "learning_rate": 7.425548854275191e-06, "loss": 0.3699, "step": 2557 }, { "epoch": 0.358765778401122, "grad_norm": 2.06784301882875, "learning_rate": 7.42356250234001e-06, "loss": 0.3345, "step": 2558 }, { "epoch": 0.35890603085553996, "grad_norm": 2.629152283626657, "learning_rate": 7.421575650319641e-06, "loss": 0.3969, "step": 2559 }, { "epoch": 0.3590462833099579, "grad_norm": 2.129968892438941, "learning_rate": 7.419588298624054e-06, "loss": 0.3352, "step": 2560 }, { "epoch": 0.35918653576437587, "grad_norm": 2.109145653583698, "learning_rate": 7.417600447663327e-06, "loss": 0.3965, "step": 2561 }, { "epoch": 0.3593267882187938, "grad_norm": 1.9931339783105941, "learning_rate": 7.415612097847638e-06, "loss": 0.3801, "step": 2562 }, { "epoch": 0.3594670406732118, "grad_norm": 1.755883476592616, "learning_rate": 7.4136232495872695e-06, "loss": 0.3474, "step": 2563 }, { "epoch": 0.35960729312762973, "grad_norm": 1.6746028144553087, "learning_rate": 7.411633903292605e-06, "loss": 0.3849, "step": 2564 }, { "epoch": 0.3597475455820477, "grad_norm": 2.0947921209146725, "learning_rate": 7.409644059374136e-06, "loss": 0.424, "step": 2565 }, { "epoch": 0.35988779803646564, "grad_norm": 2.283381889922185, "learning_rate": 7.407653718242449e-06, "loss": 0.3869, "step": 2566 }, { "epoch": 0.3600280504908836, "grad_norm": 2.5098415259376528, "learning_rate": 7.405662880308239e-06, "loss": 0.4101, "step": 2567 }, { "epoch": 0.36016830294530155, "grad_norm": 3.4601683440075672, "learning_rate": 7.403671545982299e-06, "loss": 0.3592, "step": 2568 }, { "epoch": 0.3603085553997195, "grad_norm": 2.3954757644742712, "learning_rate": 7.401679715675531e-06, "loss": 0.3753, "step": 2569 }, { "epoch": 0.36044880785413747, "grad_norm": 1.9883600905100367, "learning_rate": 7.399687389798933e-06, "loss": 0.3435, "step": 2570 }, { "epoch": 0.3605890603085554, "grad_norm": 2.639416389836353, "learning_rate": 7.397694568763607e-06, "loss": 0.3902, "step": 2571 }, { "epoch": 0.3607293127629734, "grad_norm": 2.083128906392836, "learning_rate": 7.395701252980758e-06, "loss": 0.3524, "step": 2572 }, { "epoch": 0.36086956521739133, "grad_norm": 2.540861179311404, "learning_rate": 7.393707442861693e-06, "loss": 0.3644, "step": 2573 }, { "epoch": 0.36100981767180923, "grad_norm": 2.5971928003812685, "learning_rate": 7.391713138817822e-06, "loss": 0.4045, "step": 2574 }, { "epoch": 0.3611500701262272, "grad_norm": 2.586675348496697, "learning_rate": 7.389718341260654e-06, "loss": 0.388, "step": 2575 }, { "epoch": 0.36129032258064514, "grad_norm": 2.08230629083009, "learning_rate": 7.387723050601804e-06, "loss": 0.372, "step": 2576 }, { "epoch": 0.3614305750350631, "grad_norm": 3.3773648489249584, "learning_rate": 7.385727267252983e-06, "loss": 0.4274, "step": 2577 }, { "epoch": 0.36157082748948105, "grad_norm": 3.4426538844955963, "learning_rate": 7.383730991626007e-06, "loss": 0.3648, "step": 2578 }, { "epoch": 0.361711079943899, "grad_norm": 9.793128457762329, "learning_rate": 7.381734224132796e-06, "loss": 0.4114, "step": 2579 }, { "epoch": 0.36185133239831696, "grad_norm": 2.4929754751432127, "learning_rate": 7.379736965185369e-06, "loss": 0.3874, "step": 2580 }, { "epoch": 0.3619915848527349, "grad_norm": 2.882129288430603, "learning_rate": 7.3777392151958435e-06, "loss": 0.4169, "step": 2581 }, { "epoch": 0.36213183730715287, "grad_norm": 2.492471517325515, "learning_rate": 7.375740974576444e-06, "loss": 0.4064, "step": 2582 }, { "epoch": 0.3622720897615708, "grad_norm": 2.3357046770403977, "learning_rate": 7.373742243739493e-06, "loss": 0.385, "step": 2583 }, { "epoch": 0.3624123422159888, "grad_norm": 2.216743744200852, "learning_rate": 7.3717430230974155e-06, "loss": 0.3975, "step": 2584 }, { "epoch": 0.36255259467040674, "grad_norm": 2.695912346917678, "learning_rate": 7.369743313062734e-06, "loss": 0.4067, "step": 2585 }, { "epoch": 0.3626928471248247, "grad_norm": 2.655723699152729, "learning_rate": 7.367743114048076e-06, "loss": 0.4192, "step": 2586 }, { "epoch": 0.36283309957924265, "grad_norm": 3.0299871901439657, "learning_rate": 7.365742426466169e-06, "loss": 0.3713, "step": 2587 }, { "epoch": 0.3629733520336606, "grad_norm": 2.450428932944969, "learning_rate": 7.3637412507298415e-06, "loss": 0.4427, "step": 2588 }, { "epoch": 0.36311360448807856, "grad_norm": 1.8514180187398916, "learning_rate": 7.361739587252019e-06, "loss": 0.3735, "step": 2589 }, { "epoch": 0.3632538569424965, "grad_norm": 2.3778943946302915, "learning_rate": 7.359737436445735e-06, "loss": 0.395, "step": 2590 }, { "epoch": 0.36339410939691447, "grad_norm": 4.216947495428324, "learning_rate": 7.3577347987241176e-06, "loss": 0.3816, "step": 2591 }, { "epoch": 0.3635343618513324, "grad_norm": 2.2805235241607327, "learning_rate": 7.355731674500396e-06, "loss": 0.3505, "step": 2592 }, { "epoch": 0.3636746143057504, "grad_norm": 2.199853077230697, "learning_rate": 7.353728064187901e-06, "loss": 0.3395, "step": 2593 }, { "epoch": 0.3638148667601683, "grad_norm": 1.9726486961295655, "learning_rate": 7.3517239682000675e-06, "loss": 0.3528, "step": 2594 }, { "epoch": 0.36395511921458623, "grad_norm": 3.307952350296336, "learning_rate": 7.349719386950422e-06, "loss": 0.3582, "step": 2595 }, { "epoch": 0.3640953716690042, "grad_norm": 2.3407667374318577, "learning_rate": 7.347714320852597e-06, "loss": 0.4029, "step": 2596 }, { "epoch": 0.36423562412342214, "grad_norm": 2.985186401630228, "learning_rate": 7.345708770320324e-06, "loss": 0.3491, "step": 2597 }, { "epoch": 0.3643758765778401, "grad_norm": 3.4139351199063066, "learning_rate": 7.343702735767435e-06, "loss": 0.3948, "step": 2598 }, { "epoch": 0.36451612903225805, "grad_norm": 3.1926437462667474, "learning_rate": 7.341696217607861e-06, "loss": 0.3456, "step": 2599 }, { "epoch": 0.364656381486676, "grad_norm": 3.9351693173423103, "learning_rate": 7.339689216255632e-06, "loss": 0.3537, "step": 2600 }, { "epoch": 0.36479663394109396, "grad_norm": 2.074336219234299, "learning_rate": 7.337681732124882e-06, "loss": 0.3581, "step": 2601 }, { "epoch": 0.3649368863955119, "grad_norm": 2.4942581041549854, "learning_rate": 7.335673765629837e-06, "loss": 0.3644, "step": 2602 }, { "epoch": 0.36507713884992987, "grad_norm": 2.3353223724340326, "learning_rate": 7.333665317184829e-06, "loss": 0.4102, "step": 2603 }, { "epoch": 0.3652173913043478, "grad_norm": 3.3822074507620368, "learning_rate": 7.3316563872042865e-06, "loss": 0.3734, "step": 2604 }, { "epoch": 0.3653576437587658, "grad_norm": 2.590510171846359, "learning_rate": 7.329646976102741e-06, "loss": 0.3773, "step": 2605 }, { "epoch": 0.36549789621318374, "grad_norm": 3.123350547354971, "learning_rate": 7.327637084294818e-06, "loss": 0.3985, "step": 2606 }, { "epoch": 0.3656381486676017, "grad_norm": 2.710728354177903, "learning_rate": 7.325626712195242e-06, "loss": 0.4421, "step": 2607 }, { "epoch": 0.36577840112201965, "grad_norm": 2.6896713711261757, "learning_rate": 7.323615860218844e-06, "loss": 0.3636, "step": 2608 }, { "epoch": 0.3659186535764376, "grad_norm": 2.325462713650891, "learning_rate": 7.321604528780546e-06, "loss": 0.3891, "step": 2609 }, { "epoch": 0.36605890603085556, "grad_norm": 2.5894860758564513, "learning_rate": 7.319592718295374e-06, "loss": 0.3386, "step": 2610 }, { "epoch": 0.3661991584852735, "grad_norm": 2.0062626300544935, "learning_rate": 7.317580429178452e-06, "loss": 0.4057, "step": 2611 }, { "epoch": 0.36633941093969147, "grad_norm": 2.5548971014335535, "learning_rate": 7.315567661844999e-06, "loss": 0.3867, "step": 2612 }, { "epoch": 0.3664796633941094, "grad_norm": 2.2378452269120483, "learning_rate": 7.313554416710337e-06, "loss": 0.3825, "step": 2613 }, { "epoch": 0.3666199158485273, "grad_norm": 3.1128247930358484, "learning_rate": 7.311540694189885e-06, "loss": 0.3922, "step": 2614 }, { "epoch": 0.3667601683029453, "grad_norm": 2.6382339757965676, "learning_rate": 7.30952649469916e-06, "loss": 0.3823, "step": 2615 }, { "epoch": 0.36690042075736323, "grad_norm": 1.9376780520426813, "learning_rate": 7.307511818653778e-06, "loss": 0.4007, "step": 2616 }, { "epoch": 0.3670406732117812, "grad_norm": 2.0710307476831726, "learning_rate": 7.305496666469456e-06, "loss": 0.3688, "step": 2617 }, { "epoch": 0.36718092566619914, "grad_norm": 2.6979782411674917, "learning_rate": 7.3034810385620035e-06, "loss": 0.4235, "step": 2618 }, { "epoch": 0.3673211781206171, "grad_norm": 2.3795665206764998, "learning_rate": 7.301464935347331e-06, "loss": 0.359, "step": 2619 }, { "epoch": 0.36746143057503505, "grad_norm": 2.0266195484799328, "learning_rate": 7.299448357241448e-06, "loss": 0.3995, "step": 2620 }, { "epoch": 0.367601683029453, "grad_norm": 2.2305254018774163, "learning_rate": 7.297431304660464e-06, "loss": 0.3484, "step": 2621 }, { "epoch": 0.36774193548387096, "grad_norm": 2.4227204846647012, "learning_rate": 7.295413778020579e-06, "loss": 0.4161, "step": 2622 }, { "epoch": 0.3678821879382889, "grad_norm": 2.91374405253559, "learning_rate": 7.293395777738099e-06, "loss": 0.3287, "step": 2623 }, { "epoch": 0.3680224403927069, "grad_norm": 2.9312154218834388, "learning_rate": 7.291377304229423e-06, "loss": 0.4088, "step": 2624 }, { "epoch": 0.36816269284712483, "grad_norm": 2.932389216497157, "learning_rate": 7.28935835791105e-06, "loss": 0.3809, "step": 2625 }, { "epoch": 0.3683029453015428, "grad_norm": 2.310382388943893, "learning_rate": 7.287338939199574e-06, "loss": 0.3857, "step": 2626 }, { "epoch": 0.36844319775596074, "grad_norm": 2.1110186919121627, "learning_rate": 7.28531904851169e-06, "loss": 0.3772, "step": 2627 }, { "epoch": 0.3685834502103787, "grad_norm": 2.818313947220294, "learning_rate": 7.283298686264184e-06, "loss": 0.405, "step": 2628 }, { "epoch": 0.36872370266479665, "grad_norm": 2.3937089762147945, "learning_rate": 7.281277852873947e-06, "loss": 0.4092, "step": 2629 }, { "epoch": 0.3688639551192146, "grad_norm": 2.0548031850527626, "learning_rate": 7.279256548757964e-06, "loss": 0.3966, "step": 2630 }, { "epoch": 0.36900420757363256, "grad_norm": 2.1457592726090104, "learning_rate": 7.277234774333317e-06, "loss": 0.3443, "step": 2631 }, { "epoch": 0.3691444600280505, "grad_norm": 1.9191386894652678, "learning_rate": 7.2752125300171835e-06, "loss": 0.3674, "step": 2632 }, { "epoch": 0.36928471248246847, "grad_norm": 2.949105731743844, "learning_rate": 7.27318981622684e-06, "loss": 0.3433, "step": 2633 }, { "epoch": 0.36942496493688637, "grad_norm": 3.0184232546806573, "learning_rate": 7.271166633379661e-06, "loss": 0.3645, "step": 2634 }, { "epoch": 0.3695652173913043, "grad_norm": 2.2647388698457243, "learning_rate": 7.269142981893114e-06, "loss": 0.4023, "step": 2635 }, { "epoch": 0.3697054698457223, "grad_norm": 1.8647964682496092, "learning_rate": 7.267118862184767e-06, "loss": 0.3317, "step": 2636 }, { "epoch": 0.36984572230014023, "grad_norm": 2.357368474954432, "learning_rate": 7.265094274672282e-06, "loss": 0.389, "step": 2637 }, { "epoch": 0.3699859747545582, "grad_norm": 6.685407649916095, "learning_rate": 7.263069219773417e-06, "loss": 0.3664, "step": 2638 }, { "epoch": 0.37012622720897614, "grad_norm": 2.0653320289412322, "learning_rate": 7.26104369790603e-06, "loss": 0.3368, "step": 2639 }, { "epoch": 0.3702664796633941, "grad_norm": 1.951410991080459, "learning_rate": 7.259017709488073e-06, "loss": 0.3912, "step": 2640 }, { "epoch": 0.37040673211781205, "grad_norm": 2.4032127731425708, "learning_rate": 7.256991254937595e-06, "loss": 0.4393, "step": 2641 }, { "epoch": 0.37054698457223, "grad_norm": 2.773227307588479, "learning_rate": 7.25496433467274e-06, "loss": 0.3652, "step": 2642 }, { "epoch": 0.37068723702664796, "grad_norm": 2.0992767496334337, "learning_rate": 7.252936949111749e-06, "loss": 0.3895, "step": 2643 }, { "epoch": 0.3708274894810659, "grad_norm": 1.8346186019599804, "learning_rate": 7.250909098672958e-06, "loss": 0.3879, "step": 2644 }, { "epoch": 0.3709677419354839, "grad_norm": 2.9875457057888246, "learning_rate": 7.248880783774801e-06, "loss": 0.3751, "step": 2645 }, { "epoch": 0.37110799438990183, "grad_norm": 2.4846658596381754, "learning_rate": 7.246852004835807e-06, "loss": 0.4625, "step": 2646 }, { "epoch": 0.3712482468443198, "grad_norm": 2.5329693265037356, "learning_rate": 7.2448227622746e-06, "loss": 0.3595, "step": 2647 }, { "epoch": 0.37138849929873774, "grad_norm": 1.9896371710984335, "learning_rate": 7.242793056509898e-06, "loss": 0.3655, "step": 2648 }, { "epoch": 0.3715287517531557, "grad_norm": 2.65739981504629, "learning_rate": 7.240762887960518e-06, "loss": 0.3679, "step": 2649 }, { "epoch": 0.37166900420757365, "grad_norm": 2.3821040867084986, "learning_rate": 7.2387322570453724e-06, "loss": 0.3723, "step": 2650 }, { "epoch": 0.3718092566619916, "grad_norm": 1.8763319370948819, "learning_rate": 7.236701164183466e-06, "loss": 0.394, "step": 2651 }, { "epoch": 0.37194950911640956, "grad_norm": 3.2589482584916687, "learning_rate": 7.2346696097939025e-06, "loss": 0.3896, "step": 2652 }, { "epoch": 0.3720897615708275, "grad_norm": 2.9309941747497543, "learning_rate": 7.232637594295876e-06, "loss": 0.3974, "step": 2653 }, { "epoch": 0.3722300140252454, "grad_norm": 1.9401587425253404, "learning_rate": 7.23060511810868e-06, "loss": 0.3489, "step": 2654 }, { "epoch": 0.37237026647966337, "grad_norm": 3.813944558551307, "learning_rate": 7.228572181651703e-06, "loss": 0.4014, "step": 2655 }, { "epoch": 0.3725105189340813, "grad_norm": 2.1585601373265186, "learning_rate": 7.226538785344427e-06, "loss": 0.4384, "step": 2656 }, { "epoch": 0.3726507713884993, "grad_norm": 1.8833090099800784, "learning_rate": 7.224504929606429e-06, "loss": 0.3416, "step": 2657 }, { "epoch": 0.37279102384291724, "grad_norm": 2.566109232212122, "learning_rate": 7.22247061485738e-06, "loss": 0.4173, "step": 2658 }, { "epoch": 0.3729312762973352, "grad_norm": 2.4010808338824003, "learning_rate": 7.220435841517045e-06, "loss": 0.3552, "step": 2659 }, { "epoch": 0.37307152875175315, "grad_norm": 2.3016393126847223, "learning_rate": 7.2184006100052885e-06, "loss": 0.3524, "step": 2660 }, { "epoch": 0.3732117812061711, "grad_norm": 1.9403418342042806, "learning_rate": 7.216364920742065e-06, "loss": 0.3399, "step": 2661 }, { "epoch": 0.37335203366058906, "grad_norm": 2.297717089270709, "learning_rate": 7.214328774147425e-06, "loss": 0.4446, "step": 2662 }, { "epoch": 0.373492286115007, "grad_norm": 2.0321738856324605, "learning_rate": 7.212292170641514e-06, "loss": 0.3556, "step": 2663 }, { "epoch": 0.37363253856942497, "grad_norm": 2.1208648839176343, "learning_rate": 7.210255110644569e-06, "loss": 0.388, "step": 2664 }, { "epoch": 0.3737727910238429, "grad_norm": 2.441592863961637, "learning_rate": 7.2082175945769226e-06, "loss": 0.3787, "step": 2665 }, { "epoch": 0.3739130434782609, "grad_norm": 3.269145619245797, "learning_rate": 7.206179622859005e-06, "loss": 0.4324, "step": 2666 }, { "epoch": 0.37405329593267883, "grad_norm": 1.7342886688465817, "learning_rate": 7.204141195911336e-06, "loss": 0.3633, "step": 2667 }, { "epoch": 0.3741935483870968, "grad_norm": 2.247254604058616, "learning_rate": 7.202102314154531e-06, "loss": 0.4224, "step": 2668 }, { "epoch": 0.37433380084151474, "grad_norm": 2.998866343878918, "learning_rate": 7.200062978009297e-06, "loss": 0.4204, "step": 2669 }, { "epoch": 0.3744740532959327, "grad_norm": 1.95165465895523, "learning_rate": 7.198023187896439e-06, "loss": 0.3802, "step": 2670 }, { "epoch": 0.37461430575035065, "grad_norm": 2.3514281579864673, "learning_rate": 7.195982944236853e-06, "loss": 0.389, "step": 2671 }, { "epoch": 0.3747545582047686, "grad_norm": 2.1633194842529586, "learning_rate": 7.193942247451528e-06, "loss": 0.3704, "step": 2672 }, { "epoch": 0.37489481065918656, "grad_norm": 3.6690834634706886, "learning_rate": 7.191901097961549e-06, "loss": 0.3073, "step": 2673 }, { "epoch": 0.37503506311360446, "grad_norm": 1.7635468262268166, "learning_rate": 7.189859496188092e-06, "loss": 0.3648, "step": 2674 }, { "epoch": 0.3751753155680224, "grad_norm": 2.009354292553742, "learning_rate": 7.187817442552427e-06, "loss": 0.3689, "step": 2675 }, { "epoch": 0.37531556802244037, "grad_norm": 2.02493042531996, "learning_rate": 7.185774937475919e-06, "loss": 0.3848, "step": 2676 }, { "epoch": 0.3754558204768583, "grad_norm": 1.9914280659381172, "learning_rate": 7.183731981380024e-06, "loss": 0.3969, "step": 2677 }, { "epoch": 0.3755960729312763, "grad_norm": 1.9733611707506664, "learning_rate": 7.181688574686292e-06, "loss": 0.3992, "step": 2678 }, { "epoch": 0.37573632538569424, "grad_norm": 1.963039107898463, "learning_rate": 7.179644717816363e-06, "loss": 0.4142, "step": 2679 }, { "epoch": 0.3758765778401122, "grad_norm": 1.9738013239463472, "learning_rate": 7.177600411191976e-06, "loss": 0.3233, "step": 2680 }, { "epoch": 0.37601683029453015, "grad_norm": 1.6404519852736195, "learning_rate": 7.175555655234958e-06, "loss": 0.359, "step": 2681 }, { "epoch": 0.3761570827489481, "grad_norm": 2.042675923646879, "learning_rate": 7.173510450367229e-06, "loss": 0.414, "step": 2682 }, { "epoch": 0.37629733520336606, "grad_norm": 2.8844720462496602, "learning_rate": 7.1714647970108056e-06, "loss": 0.365, "step": 2683 }, { "epoch": 0.376437587657784, "grad_norm": 1.771170759777421, "learning_rate": 7.169418695587791e-06, "loss": 0.3971, "step": 2684 }, { "epoch": 0.37657784011220197, "grad_norm": 2.1165920419629356, "learning_rate": 7.167372146520386e-06, "loss": 0.3781, "step": 2685 }, { "epoch": 0.3767180925666199, "grad_norm": 2.7971823752514746, "learning_rate": 7.165325150230881e-06, "loss": 0.3686, "step": 2686 }, { "epoch": 0.3768583450210379, "grad_norm": 2.1217444670447616, "learning_rate": 7.1632777071416606e-06, "loss": 0.4086, "step": 2687 }, { "epoch": 0.37699859747545583, "grad_norm": 2.0857937081028006, "learning_rate": 7.161229817675198e-06, "loss": 0.3901, "step": 2688 }, { "epoch": 0.3771388499298738, "grad_norm": 2.210010347644177, "learning_rate": 7.159181482254062e-06, "loss": 0.3787, "step": 2689 }, { "epoch": 0.37727910238429174, "grad_norm": 1.6996225863736485, "learning_rate": 7.157132701300911e-06, "loss": 0.3688, "step": 2690 }, { "epoch": 0.3774193548387097, "grad_norm": 2.6115570082178237, "learning_rate": 7.1550834752385e-06, "loss": 0.3662, "step": 2691 }, { "epoch": 0.37755960729312765, "grad_norm": 2.7297591131781216, "learning_rate": 7.15303380448967e-06, "loss": 0.4395, "step": 2692 }, { "epoch": 0.3776998597475456, "grad_norm": 2.5221248082279373, "learning_rate": 7.150983689477357e-06, "loss": 0.3924, "step": 2693 }, { "epoch": 0.3778401122019635, "grad_norm": 1.6853880205144463, "learning_rate": 7.148933130624587e-06, "loss": 0.3585, "step": 2694 }, { "epoch": 0.37798036465638146, "grad_norm": 1.8355528083505668, "learning_rate": 7.146882128354479e-06, "loss": 0.3716, "step": 2695 }, { "epoch": 0.3781206171107994, "grad_norm": 2.4638459956542564, "learning_rate": 7.144830683090242e-06, "loss": 0.3877, "step": 2696 }, { "epoch": 0.3782608695652174, "grad_norm": 2.586682353545663, "learning_rate": 7.14277879525518e-06, "loss": 0.3457, "step": 2697 }, { "epoch": 0.3784011220196353, "grad_norm": 2.0312158435003806, "learning_rate": 7.140726465272686e-06, "loss": 0.3851, "step": 2698 }, { "epoch": 0.3785413744740533, "grad_norm": 1.9745050510275235, "learning_rate": 7.138673693566241e-06, "loss": 0.384, "step": 2699 }, { "epoch": 0.37868162692847124, "grad_norm": 2.9102868377265216, "learning_rate": 7.1366204805594205e-06, "loss": 0.3597, "step": 2700 }, { "epoch": 0.3788218793828892, "grad_norm": 2.523986095402258, "learning_rate": 7.134566826675892e-06, "loss": 0.3666, "step": 2701 }, { "epoch": 0.37896213183730715, "grad_norm": 2.0368041178287855, "learning_rate": 7.13251273233941e-06, "loss": 0.3697, "step": 2702 }, { "epoch": 0.3791023842917251, "grad_norm": 2.182700313888236, "learning_rate": 7.130458197973828e-06, "loss": 0.4175, "step": 2703 }, { "epoch": 0.37924263674614306, "grad_norm": 1.7856352516123042, "learning_rate": 7.12840322400308e-06, "loss": 0.3761, "step": 2704 }, { "epoch": 0.379382889200561, "grad_norm": 1.8700717372699933, "learning_rate": 7.1263478108511955e-06, "loss": 0.3478, "step": 2705 }, { "epoch": 0.37952314165497897, "grad_norm": 2.146491039428917, "learning_rate": 7.1242919589422974e-06, "loss": 0.3666, "step": 2706 }, { "epoch": 0.3796633941093969, "grad_norm": 1.758262813955884, "learning_rate": 7.122235668700594e-06, "loss": 0.365, "step": 2707 }, { "epoch": 0.3798036465638149, "grad_norm": 1.6914870879345465, "learning_rate": 7.12017894055039e-06, "loss": 0.3607, "step": 2708 }, { "epoch": 0.37994389901823283, "grad_norm": 1.8810810659604993, "learning_rate": 7.118121774916074e-06, "loss": 0.3916, "step": 2709 }, { "epoch": 0.3800841514726508, "grad_norm": 2.002667317347584, "learning_rate": 7.1160641722221255e-06, "loss": 0.3769, "step": 2710 }, { "epoch": 0.38022440392706874, "grad_norm": 2.429506939130892, "learning_rate": 7.114006132893121e-06, "loss": 0.3904, "step": 2711 }, { "epoch": 0.3803646563814867, "grad_norm": 2.7939574299042547, "learning_rate": 7.111947657353719e-06, "loss": 0.3409, "step": 2712 }, { "epoch": 0.38050490883590465, "grad_norm": 1.79968267928662, "learning_rate": 7.1098887460286745e-06, "loss": 0.3828, "step": 2713 }, { "epoch": 0.38064516129032255, "grad_norm": 2.3324422042898596, "learning_rate": 7.1078293993428285e-06, "loss": 0.3642, "step": 2714 }, { "epoch": 0.3807854137447405, "grad_norm": 1.6515685804069273, "learning_rate": 7.105769617721111e-06, "loss": 0.3708, "step": 2715 }, { "epoch": 0.38092566619915846, "grad_norm": 1.7811308034534186, "learning_rate": 7.1037094015885456e-06, "loss": 0.4029, "step": 2716 }, { "epoch": 0.3810659186535764, "grad_norm": 2.375109218294629, "learning_rate": 7.101648751370243e-06, "loss": 0.3916, "step": 2717 }, { "epoch": 0.3812061711079944, "grad_norm": 2.1253549615931346, "learning_rate": 7.099587667491404e-06, "loss": 0.3755, "step": 2718 }, { "epoch": 0.38134642356241233, "grad_norm": 2.238059467292633, "learning_rate": 7.097526150377319e-06, "loss": 0.4146, "step": 2719 }, { "epoch": 0.3814866760168303, "grad_norm": 1.608583199983806, "learning_rate": 7.095464200453366e-06, "loss": 0.3084, "step": 2720 }, { "epoch": 0.38162692847124824, "grad_norm": 2.698270109337977, "learning_rate": 7.093401818145016e-06, "loss": 0.3955, "step": 2721 }, { "epoch": 0.3817671809256662, "grad_norm": 2.173661958250716, "learning_rate": 7.091339003877826e-06, "loss": 0.3652, "step": 2722 }, { "epoch": 0.38190743338008415, "grad_norm": 2.213002564847195, "learning_rate": 7.0892757580774455e-06, "loss": 0.3753, "step": 2723 }, { "epoch": 0.3820476858345021, "grad_norm": 2.08170314586838, "learning_rate": 7.087212081169608e-06, "loss": 0.3946, "step": 2724 }, { "epoch": 0.38218793828892006, "grad_norm": 1.8079209858119716, "learning_rate": 7.08514797358014e-06, "loss": 0.4261, "step": 2725 }, { "epoch": 0.382328190743338, "grad_norm": 2.3784720489675144, "learning_rate": 7.083083435734955e-06, "loss": 0.3421, "step": 2726 }, { "epoch": 0.38246844319775597, "grad_norm": 2.560923673023371, "learning_rate": 7.081018468060057e-06, "loss": 0.4263, "step": 2727 }, { "epoch": 0.3826086956521739, "grad_norm": 1.9860750860520162, "learning_rate": 7.078953070981538e-06, "loss": 0.4225, "step": 2728 }, { "epoch": 0.3827489481065919, "grad_norm": 2.022692215394565, "learning_rate": 7.0768872449255765e-06, "loss": 0.3675, "step": 2729 }, { "epoch": 0.38288920056100983, "grad_norm": 2.4224701180991883, "learning_rate": 7.074820990318444e-06, "loss": 0.3681, "step": 2730 }, { "epoch": 0.3830294530154278, "grad_norm": 2.240185205773074, "learning_rate": 7.072754307586495e-06, "loss": 0.387, "step": 2731 }, { "epoch": 0.38316970546984574, "grad_norm": 2.001635717025117, "learning_rate": 7.070687197156175e-06, "loss": 0.35, "step": 2732 }, { "epoch": 0.3833099579242637, "grad_norm": 9.34241325487182, "learning_rate": 7.068619659454019e-06, "loss": 0.3894, "step": 2733 }, { "epoch": 0.3834502103786816, "grad_norm": 2.30715344935689, "learning_rate": 7.066551694906651e-06, "loss": 0.3435, "step": 2734 }, { "epoch": 0.38359046283309955, "grad_norm": 1.8624227213348574, "learning_rate": 7.064483303940777e-06, "loss": 0.3816, "step": 2735 }, { "epoch": 0.3837307152875175, "grad_norm": 2.114541853574622, "learning_rate": 7.062414486983197e-06, "loss": 0.3732, "step": 2736 }, { "epoch": 0.38387096774193546, "grad_norm": 1.8986329029131497, "learning_rate": 7.060345244460797e-06, "loss": 0.3693, "step": 2737 }, { "epoch": 0.3840112201963534, "grad_norm": 2.026134917034492, "learning_rate": 7.05827557680055e-06, "loss": 0.3971, "step": 2738 }, { "epoch": 0.3841514726507714, "grad_norm": 2.19224534995001, "learning_rate": 7.056205484429519e-06, "loss": 0.3927, "step": 2739 }, { "epoch": 0.38429172510518933, "grad_norm": 2.1723162525288475, "learning_rate": 7.0541349677748524e-06, "loss": 0.4036, "step": 2740 }, { "epoch": 0.3844319775596073, "grad_norm": 1.6751322167438003, "learning_rate": 7.052064027263785e-06, "loss": 0.3715, "step": 2741 }, { "epoch": 0.38457223001402524, "grad_norm": 1.831945356295364, "learning_rate": 7.049992663323642e-06, "loss": 0.3584, "step": 2742 }, { "epoch": 0.3847124824684432, "grad_norm": 1.8147570927228163, "learning_rate": 7.047920876381837e-06, "loss": 0.3601, "step": 2743 }, { "epoch": 0.38485273492286115, "grad_norm": 2.0593086752970784, "learning_rate": 7.045848666865867e-06, "loss": 0.3974, "step": 2744 }, { "epoch": 0.3849929873772791, "grad_norm": 4.138751603025116, "learning_rate": 7.043776035203318e-06, "loss": 0.3822, "step": 2745 }, { "epoch": 0.38513323983169706, "grad_norm": 5.317272062523193, "learning_rate": 7.041702981821862e-06, "loss": 0.4017, "step": 2746 }, { "epoch": 0.385273492286115, "grad_norm": 2.031893584190763, "learning_rate": 7.039629507149261e-06, "loss": 0.3947, "step": 2747 }, { "epoch": 0.38541374474053297, "grad_norm": 1.930081778953072, "learning_rate": 7.0375556116133605e-06, "loss": 0.3834, "step": 2748 }, { "epoch": 0.3855539971949509, "grad_norm": 1.8266058584635771, "learning_rate": 7.035481295642096e-06, "loss": 0.3396, "step": 2749 }, { "epoch": 0.3856942496493689, "grad_norm": 2.187581524011011, "learning_rate": 7.033406559663486e-06, "loss": 0.3678, "step": 2750 }, { "epoch": 0.38583450210378684, "grad_norm": 2.3217121873251054, "learning_rate": 7.03133140410564e-06, "loss": 0.3878, "step": 2751 }, { "epoch": 0.3859747545582048, "grad_norm": 3.0723880808895685, "learning_rate": 7.029255829396751e-06, "loss": 0.376, "step": 2752 }, { "epoch": 0.38611500701262275, "grad_norm": 1.9281320828349446, "learning_rate": 7.027179835965097e-06, "loss": 0.3896, "step": 2753 }, { "epoch": 0.38625525946704065, "grad_norm": 2.8745669163127623, "learning_rate": 7.025103424239049e-06, "loss": 0.3623, "step": 2754 }, { "epoch": 0.3863955119214586, "grad_norm": 2.3002719801267313, "learning_rate": 7.023026594647057e-06, "loss": 0.3546, "step": 2755 }, { "epoch": 0.38653576437587656, "grad_norm": 2.0389703605559317, "learning_rate": 7.02094934761766e-06, "loss": 0.3723, "step": 2756 }, { "epoch": 0.3866760168302945, "grad_norm": 2.9409158204089985, "learning_rate": 7.018871683579487e-06, "loss": 0.382, "step": 2757 }, { "epoch": 0.38681626928471247, "grad_norm": 2.0436059603547623, "learning_rate": 7.016793602961245e-06, "loss": 0.3681, "step": 2758 }, { "epoch": 0.3869565217391304, "grad_norm": 1.7090881664910806, "learning_rate": 7.0147151061917355e-06, "loss": 0.3647, "step": 2759 }, { "epoch": 0.3870967741935484, "grad_norm": 2.3634208701009043, "learning_rate": 7.012636193699838e-06, "loss": 0.3354, "step": 2760 }, { "epoch": 0.38723702664796633, "grad_norm": 2.9577348847037253, "learning_rate": 7.010556865914525e-06, "loss": 0.3759, "step": 2761 }, { "epoch": 0.3873772791023843, "grad_norm": 2.0398032059835147, "learning_rate": 7.008477123264849e-06, "loss": 0.402, "step": 2762 }, { "epoch": 0.38751753155680224, "grad_norm": 2.8535680955348677, "learning_rate": 7.006396966179949e-06, "loss": 0.3694, "step": 2763 }, { "epoch": 0.3876577840112202, "grad_norm": 2.3300804784080213, "learning_rate": 7.004316395089055e-06, "loss": 0.4104, "step": 2764 }, { "epoch": 0.38779803646563815, "grad_norm": 2.3251774709915036, "learning_rate": 7.002235410421476e-06, "loss": 0.3872, "step": 2765 }, { "epoch": 0.3879382889200561, "grad_norm": 2.190882982417195, "learning_rate": 7.000154012606608e-06, "loss": 0.3669, "step": 2766 }, { "epoch": 0.38807854137447406, "grad_norm": 1.7113138420729719, "learning_rate": 6.998072202073933e-06, "loss": 0.3904, "step": 2767 }, { "epoch": 0.388218793828892, "grad_norm": 2.481627570570841, "learning_rate": 6.9959899792530195e-06, "loss": 0.3392, "step": 2768 }, { "epoch": 0.38835904628330997, "grad_norm": 2.669436943033956, "learning_rate": 6.9939073445735205e-06, "loss": 0.3725, "step": 2769 }, { "epoch": 0.3884992987377279, "grad_norm": 1.7911573913443863, "learning_rate": 6.99182429846517e-06, "loss": 0.3495, "step": 2770 }, { "epoch": 0.3886395511921459, "grad_norm": 1.911090021916523, "learning_rate": 6.9897408413577905e-06, "loss": 0.3862, "step": 2771 }, { "epoch": 0.38877980364656384, "grad_norm": 2.2005145251542966, "learning_rate": 6.987656973681291e-06, "loss": 0.3553, "step": 2772 }, { "epoch": 0.3889200561009818, "grad_norm": 2.650891812529021, "learning_rate": 6.985572695865662e-06, "loss": 0.3659, "step": 2773 }, { "epoch": 0.3890603085553997, "grad_norm": 1.8968112714893508, "learning_rate": 6.98348800834098e-06, "loss": 0.3633, "step": 2774 }, { "epoch": 0.38920056100981765, "grad_norm": 2.1181284935349205, "learning_rate": 6.981402911537405e-06, "loss": 0.3675, "step": 2775 }, { "epoch": 0.3893408134642356, "grad_norm": 2.311301565219156, "learning_rate": 6.9793174058851805e-06, "loss": 0.334, "step": 2776 }, { "epoch": 0.38948106591865356, "grad_norm": 3.1931824241398186, "learning_rate": 6.97723149181464e-06, "loss": 0.3521, "step": 2777 }, { "epoch": 0.3896213183730715, "grad_norm": 1.7317256453123064, "learning_rate": 6.975145169756193e-06, "loss": 0.3607, "step": 2778 }, { "epoch": 0.38976157082748947, "grad_norm": 5.834295938177826, "learning_rate": 6.973058440140341e-06, "loss": 0.3662, "step": 2779 }, { "epoch": 0.3899018232819074, "grad_norm": 1.9658092758088719, "learning_rate": 6.9709713033976655e-06, "loss": 0.3617, "step": 2780 }, { "epoch": 0.3900420757363254, "grad_norm": 2.005914882348768, "learning_rate": 6.968883759958831e-06, "loss": 0.4063, "step": 2781 }, { "epoch": 0.39018232819074333, "grad_norm": 2.3280984761820163, "learning_rate": 6.96679581025459e-06, "loss": 0.3613, "step": 2782 }, { "epoch": 0.3903225806451613, "grad_norm": 3.4377659087937547, "learning_rate": 6.964707454715772e-06, "loss": 0.411, "step": 2783 }, { "epoch": 0.39046283309957924, "grad_norm": 2.175526465354706, "learning_rate": 6.962618693773299e-06, "loss": 0.3531, "step": 2784 }, { "epoch": 0.3906030855539972, "grad_norm": 1.9304835727667538, "learning_rate": 6.960529527858171e-06, "loss": 0.3156, "step": 2785 }, { "epoch": 0.39074333800841515, "grad_norm": 2.430980080965851, "learning_rate": 6.958439957401471e-06, "loss": 0.33, "step": 2786 }, { "epoch": 0.3908835904628331, "grad_norm": 2.2167235240480196, "learning_rate": 6.956349982834367e-06, "loss": 0.4087, "step": 2787 }, { "epoch": 0.39102384291725106, "grad_norm": 2.3529689296319822, "learning_rate": 6.954259604588114e-06, "loss": 0.3814, "step": 2788 }, { "epoch": 0.391164095371669, "grad_norm": 1.9722438267821434, "learning_rate": 6.9521688230940454e-06, "loss": 0.3858, "step": 2789 }, { "epoch": 0.391304347826087, "grad_norm": 2.0846286053201757, "learning_rate": 6.9500776387835785e-06, "loss": 0.3906, "step": 2790 }, { "epoch": 0.39144460028050493, "grad_norm": 2.0932108626206456, "learning_rate": 6.947986052088216e-06, "loss": 0.3638, "step": 2791 }, { "epoch": 0.3915848527349229, "grad_norm": 2.1413487234255335, "learning_rate": 6.945894063439542e-06, "loss": 0.3969, "step": 2792 }, { "epoch": 0.39172510518934084, "grad_norm": 2.564965062637792, "learning_rate": 6.943801673269222e-06, "loss": 0.3809, "step": 2793 }, { "epoch": 0.39186535764375874, "grad_norm": 1.8310054680884822, "learning_rate": 6.941708882009006e-06, "loss": 0.3528, "step": 2794 }, { "epoch": 0.3920056100981767, "grad_norm": 2.191259369033134, "learning_rate": 6.9396156900907295e-06, "loss": 0.3281, "step": 2795 }, { "epoch": 0.39214586255259465, "grad_norm": 1.956944446580266, "learning_rate": 6.937522097946306e-06, "loss": 0.3342, "step": 2796 }, { "epoch": 0.3922861150070126, "grad_norm": 1.7812798480829113, "learning_rate": 6.935428106007734e-06, "loss": 0.4142, "step": 2797 }, { "epoch": 0.39242636746143056, "grad_norm": 1.8440234094693648, "learning_rate": 6.933333714707094e-06, "loss": 0.3337, "step": 2798 }, { "epoch": 0.3925666199158485, "grad_norm": 2.0130442576425236, "learning_rate": 6.931238924476551e-06, "loss": 0.3911, "step": 2799 }, { "epoch": 0.39270687237026647, "grad_norm": 1.7551259173354243, "learning_rate": 6.929143735748348e-06, "loss": 0.3983, "step": 2800 }, { "epoch": 0.3928471248246844, "grad_norm": 1.8954054241248117, "learning_rate": 6.9270481489548125e-06, "loss": 0.4167, "step": 2801 }, { "epoch": 0.3929873772791024, "grad_norm": 2.2257547619985187, "learning_rate": 6.924952164528355e-06, "loss": 0.3829, "step": 2802 }, { "epoch": 0.39312762973352033, "grad_norm": 2.3009327448399826, "learning_rate": 6.922855782901468e-06, "loss": 0.3772, "step": 2803 }, { "epoch": 0.3932678821879383, "grad_norm": 2.2510698741844273, "learning_rate": 6.920759004506723e-06, "loss": 0.3412, "step": 2804 }, { "epoch": 0.39340813464235624, "grad_norm": 2.3434082799387723, "learning_rate": 6.918661829776778e-06, "loss": 0.4037, "step": 2805 }, { "epoch": 0.3935483870967742, "grad_norm": 1.8285699286625023, "learning_rate": 6.916564259144369e-06, "loss": 0.4049, "step": 2806 }, { "epoch": 0.39368863955119215, "grad_norm": 1.9366074013409922, "learning_rate": 6.9144662930423144e-06, "loss": 0.4165, "step": 2807 }, { "epoch": 0.3938288920056101, "grad_norm": 2.0170469960537734, "learning_rate": 6.912367931903516e-06, "loss": 0.3661, "step": 2808 }, { "epoch": 0.39396914446002806, "grad_norm": 3.068442049132869, "learning_rate": 6.910269176160957e-06, "loss": 0.3858, "step": 2809 }, { "epoch": 0.394109396914446, "grad_norm": 2.0466238552093334, "learning_rate": 6.9081700262477e-06, "loss": 0.3652, "step": 2810 }, { "epoch": 0.394249649368864, "grad_norm": 2.240531815169965, "learning_rate": 6.906070482596887e-06, "loss": 0.3786, "step": 2811 }, { "epoch": 0.39438990182328193, "grad_norm": 2.1921137519335114, "learning_rate": 6.903970545641749e-06, "loss": 0.3733, "step": 2812 }, { "epoch": 0.3945301542776999, "grad_norm": 2.9701212075415024, "learning_rate": 6.901870215815591e-06, "loss": 0.3977, "step": 2813 }, { "epoch": 0.3946704067321178, "grad_norm": 1.8126571850198405, "learning_rate": 6.8997694935518e-06, "loss": 0.3804, "step": 2814 }, { "epoch": 0.39481065918653574, "grad_norm": 1.916742620483538, "learning_rate": 6.897668379283848e-06, "loss": 0.3582, "step": 2815 }, { "epoch": 0.3949509116409537, "grad_norm": 2.085359986329221, "learning_rate": 6.895566873445285e-06, "loss": 0.3685, "step": 2816 }, { "epoch": 0.39509116409537165, "grad_norm": 1.7739849841956417, "learning_rate": 6.893464976469739e-06, "loss": 0.3853, "step": 2817 }, { "epoch": 0.3952314165497896, "grad_norm": 2.078938604574011, "learning_rate": 6.891362688790925e-06, "loss": 0.3944, "step": 2818 }, { "epoch": 0.39537166900420756, "grad_norm": 2.69869835525436, "learning_rate": 6.889260010842633e-06, "loss": 0.3546, "step": 2819 }, { "epoch": 0.3955119214586255, "grad_norm": 1.9177047641050273, "learning_rate": 6.887156943058739e-06, "loss": 0.3697, "step": 2820 }, { "epoch": 0.39565217391304347, "grad_norm": 1.7719737999229914, "learning_rate": 6.8850534858731945e-06, "loss": 0.3891, "step": 2821 }, { "epoch": 0.3957924263674614, "grad_norm": 4.434331272976904, "learning_rate": 6.882949639720032e-06, "loss": 0.4028, "step": 2822 }, { "epoch": 0.3959326788218794, "grad_norm": 2.1868988300953727, "learning_rate": 6.880845405033368e-06, "loss": 0.3669, "step": 2823 }, { "epoch": 0.39607293127629734, "grad_norm": 1.9415471660966932, "learning_rate": 6.878740782247395e-06, "loss": 0.377, "step": 2824 }, { "epoch": 0.3962131837307153, "grad_norm": 2.051923626205308, "learning_rate": 6.876635771796386e-06, "loss": 0.3483, "step": 2825 }, { "epoch": 0.39635343618513325, "grad_norm": 2.033043289864516, "learning_rate": 6.874530374114699e-06, "loss": 0.351, "step": 2826 }, { "epoch": 0.3964936886395512, "grad_norm": 2.5905405433425783, "learning_rate": 6.8724245896367636e-06, "loss": 0.4028, "step": 2827 }, { "epoch": 0.39663394109396916, "grad_norm": 2.8603262231081437, "learning_rate": 6.870318418797098e-06, "loss": 0.369, "step": 2828 }, { "epoch": 0.3967741935483871, "grad_norm": 2.0297749959339466, "learning_rate": 6.868211862030291e-06, "loss": 0.3454, "step": 2829 }, { "epoch": 0.39691444600280507, "grad_norm": 2.1999661139566156, "learning_rate": 6.86610491977102e-06, "loss": 0.3576, "step": 2830 }, { "epoch": 0.397054698457223, "grad_norm": 2.2875719653206534, "learning_rate": 6.863997592454038e-06, "loss": 0.3712, "step": 2831 }, { "epoch": 0.397194950911641, "grad_norm": 2.701703618491338, "learning_rate": 6.8618898805141744e-06, "loss": 0.3603, "step": 2832 }, { "epoch": 0.39733520336605893, "grad_norm": 2.0266571940443283, "learning_rate": 6.859781784386341e-06, "loss": 0.3878, "step": 2833 }, { "epoch": 0.39747545582047683, "grad_norm": 2.7958568232800722, "learning_rate": 6.857673304505532e-06, "loss": 0.383, "step": 2834 }, { "epoch": 0.3976157082748948, "grad_norm": 2.1007855762509493, "learning_rate": 6.855564441306815e-06, "loss": 0.3786, "step": 2835 }, { "epoch": 0.39775596072931274, "grad_norm": 2.0260560755147847, "learning_rate": 6.8534551952253395e-06, "loss": 0.3629, "step": 2836 }, { "epoch": 0.3978962131837307, "grad_norm": 2.089466781129425, "learning_rate": 6.8513455666963325e-06, "loss": 0.4235, "step": 2837 }, { "epoch": 0.39803646563814865, "grad_norm": 3.1489299833885167, "learning_rate": 6.849235556155103e-06, "loss": 0.344, "step": 2838 }, { "epoch": 0.3981767180925666, "grad_norm": 2.1933930717998162, "learning_rate": 6.847125164037036e-06, "loss": 0.3859, "step": 2839 }, { "epoch": 0.39831697054698456, "grad_norm": 3.3305302914242843, "learning_rate": 6.845014390777595e-06, "loss": 0.3565, "step": 2840 }, { "epoch": 0.3984572230014025, "grad_norm": 2.415918025988191, "learning_rate": 6.842903236812328e-06, "loss": 0.404, "step": 2841 }, { "epoch": 0.39859747545582047, "grad_norm": 2.2959983346381443, "learning_rate": 6.840791702576852e-06, "loss": 0.3632, "step": 2842 }, { "epoch": 0.3987377279102384, "grad_norm": 4.6335780008241665, "learning_rate": 6.838679788506869e-06, "loss": 0.4097, "step": 2843 }, { "epoch": 0.3988779803646564, "grad_norm": 2.1784628432324666, "learning_rate": 6.836567495038157e-06, "loss": 0.3214, "step": 2844 }, { "epoch": 0.39901823281907434, "grad_norm": 2.0384581390042467, "learning_rate": 6.834454822606576e-06, "loss": 0.3771, "step": 2845 }, { "epoch": 0.3991584852734923, "grad_norm": 2.3244054382743755, "learning_rate": 6.832341771648057e-06, "loss": 0.3956, "step": 2846 }, { "epoch": 0.39929873772791025, "grad_norm": 2.151519008133184, "learning_rate": 6.830228342598615e-06, "loss": 0.3344, "step": 2847 }, { "epoch": 0.3994389901823282, "grad_norm": 2.012262304260118, "learning_rate": 6.828114535894342e-06, "loss": 0.3718, "step": 2848 }, { "epoch": 0.39957924263674616, "grad_norm": 2.2242488150244557, "learning_rate": 6.826000351971407e-06, "loss": 0.4137, "step": 2849 }, { "epoch": 0.3997194950911641, "grad_norm": 1.8661437369343805, "learning_rate": 6.823885791266056e-06, "loss": 0.3731, "step": 2850 }, { "epoch": 0.39985974754558207, "grad_norm": 1.8568077636584384, "learning_rate": 6.821770854214615e-06, "loss": 0.3751, "step": 2851 }, { "epoch": 0.4, "grad_norm": 2.0018616594959218, "learning_rate": 6.819655541253487e-06, "loss": 0.3782, "step": 2852 }, { "epoch": 0.400140252454418, "grad_norm": 2.335391590030284, "learning_rate": 6.817539852819149e-06, "loss": 0.3833, "step": 2853 }, { "epoch": 0.40028050490883593, "grad_norm": 1.939921619267324, "learning_rate": 6.8154237893481625e-06, "loss": 0.3408, "step": 2854 }, { "epoch": 0.40042075736325383, "grad_norm": 2.414459195921677, "learning_rate": 6.813307351277161e-06, "loss": 0.366, "step": 2855 }, { "epoch": 0.4005610098176718, "grad_norm": 2.281650351314389, "learning_rate": 6.811190539042855e-06, "loss": 0.3704, "step": 2856 }, { "epoch": 0.40070126227208974, "grad_norm": 2.246586010934994, "learning_rate": 6.809073353082038e-06, "loss": 0.3385, "step": 2857 }, { "epoch": 0.4008415147265077, "grad_norm": 2.0917018152857794, "learning_rate": 6.8069557938315715e-06, "loss": 0.3905, "step": 2858 }, { "epoch": 0.40098176718092565, "grad_norm": 1.738901160640023, "learning_rate": 6.8048378617284005e-06, "loss": 0.4044, "step": 2859 }, { "epoch": 0.4011220196353436, "grad_norm": 2.1443918318325537, "learning_rate": 6.802719557209547e-06, "loss": 0.3909, "step": 2860 }, { "epoch": 0.40126227208976156, "grad_norm": 1.8905397506283899, "learning_rate": 6.800600880712107e-06, "loss": 0.3461, "step": 2861 }, { "epoch": 0.4014025245441795, "grad_norm": 2.0879866575275186, "learning_rate": 6.798481832673257e-06, "loss": 0.4092, "step": 2862 }, { "epoch": 0.4015427769985975, "grad_norm": 2.059589413029318, "learning_rate": 6.796362413530245e-06, "loss": 0.3434, "step": 2863 }, { "epoch": 0.4016830294530154, "grad_norm": 2.804562111604573, "learning_rate": 6.794242623720399e-06, "loss": 0.4021, "step": 2864 }, { "epoch": 0.4018232819074334, "grad_norm": 1.631267930440174, "learning_rate": 6.792122463681126e-06, "loss": 0.3617, "step": 2865 }, { "epoch": 0.40196353436185134, "grad_norm": 2.1560401261511184, "learning_rate": 6.7900019338499005e-06, "loss": 0.3731, "step": 2866 }, { "epoch": 0.4021037868162693, "grad_norm": 2.9970811313546992, "learning_rate": 6.787881034664283e-06, "loss": 0.3543, "step": 2867 }, { "epoch": 0.40224403927068725, "grad_norm": 2.484675167352034, "learning_rate": 6.785759766561903e-06, "loss": 0.3826, "step": 2868 }, { "epoch": 0.4023842917251052, "grad_norm": 2.8073681954808776, "learning_rate": 6.783638129980474e-06, "loss": 0.3664, "step": 2869 }, { "epoch": 0.40252454417952316, "grad_norm": 2.3355159315326603, "learning_rate": 6.781516125357777e-06, "loss": 0.3834, "step": 2870 }, { "epoch": 0.4026647966339411, "grad_norm": 1.9199631027808943, "learning_rate": 6.779393753131674e-06, "loss": 0.3182, "step": 2871 }, { "epoch": 0.40280504908835907, "grad_norm": 3.8423943338651414, "learning_rate": 6.7772710137401044e-06, "loss": 0.3335, "step": 2872 }, { "epoch": 0.402945301542777, "grad_norm": 2.3478489953764767, "learning_rate": 6.775147907621076e-06, "loss": 0.4064, "step": 2873 }, { "epoch": 0.403085553997195, "grad_norm": 2.25245342007486, "learning_rate": 6.773024435212678e-06, "loss": 0.3506, "step": 2874 }, { "epoch": 0.4032258064516129, "grad_norm": 3.3659917916681183, "learning_rate": 6.770900596953076e-06, "loss": 0.3354, "step": 2875 }, { "epoch": 0.40336605890603083, "grad_norm": 1.9653768555641453, "learning_rate": 6.76877639328051e-06, "loss": 0.3697, "step": 2876 }, { "epoch": 0.4035063113604488, "grad_norm": 2.072145046857204, "learning_rate": 6.766651824633292e-06, "loss": 0.38, "step": 2877 }, { "epoch": 0.40364656381486674, "grad_norm": 2.5673818838553855, "learning_rate": 6.764526891449813e-06, "loss": 0.3232, "step": 2878 }, { "epoch": 0.4037868162692847, "grad_norm": 1.9900222680269288, "learning_rate": 6.762401594168537e-06, "loss": 0.3591, "step": 2879 }, { "epoch": 0.40392706872370265, "grad_norm": 1.9995406338530466, "learning_rate": 6.7602759332280045e-06, "loss": 0.393, "step": 2880 }, { "epoch": 0.4040673211781206, "grad_norm": 2.0464191657197093, "learning_rate": 6.758149909066832e-06, "loss": 0.4433, "step": 2881 }, { "epoch": 0.40420757363253856, "grad_norm": 2.5133510761326145, "learning_rate": 6.7560235221237115e-06, "loss": 0.3685, "step": 2882 }, { "epoch": 0.4043478260869565, "grad_norm": 2.8399995658766723, "learning_rate": 6.753896772837403e-06, "loss": 0.361, "step": 2883 }, { "epoch": 0.4044880785413745, "grad_norm": 2.3714684782593434, "learning_rate": 6.75176966164675e-06, "loss": 0.3374, "step": 2884 }, { "epoch": 0.40462833099579243, "grad_norm": 5.2156490775638815, "learning_rate": 6.749642188990666e-06, "loss": 0.3834, "step": 2885 }, { "epoch": 0.4047685834502104, "grad_norm": 2.3745669923875954, "learning_rate": 6.74751435530814e-06, "loss": 0.4136, "step": 2886 }, { "epoch": 0.40490883590462834, "grad_norm": 1.9922008439159415, "learning_rate": 6.745386161038237e-06, "loss": 0.3961, "step": 2887 }, { "epoch": 0.4050490883590463, "grad_norm": 2.007372499932463, "learning_rate": 6.743257606620094e-06, "loss": 0.3869, "step": 2888 }, { "epoch": 0.40518934081346425, "grad_norm": 2.7073903031462896, "learning_rate": 6.741128692492922e-06, "loss": 0.3796, "step": 2889 }, { "epoch": 0.4053295932678822, "grad_norm": 2.6032272373744014, "learning_rate": 6.7389994190960085e-06, "loss": 0.3423, "step": 2890 }, { "epoch": 0.40546984572230016, "grad_norm": 2.130256210821289, "learning_rate": 6.7368697868687146e-06, "loss": 0.3426, "step": 2891 }, { "epoch": 0.4056100981767181, "grad_norm": 2.107243343848278, "learning_rate": 6.734739796250477e-06, "loss": 0.4077, "step": 2892 }, { "epoch": 0.40575035063113607, "grad_norm": 1.8965197637459976, "learning_rate": 6.7326094476808e-06, "loss": 0.3869, "step": 2893 }, { "epoch": 0.405890603085554, "grad_norm": 2.0741441501341917, "learning_rate": 6.730478741599269e-06, "loss": 0.3754, "step": 2894 }, { "epoch": 0.4060308555399719, "grad_norm": 2.019668426872192, "learning_rate": 6.728347678445539e-06, "loss": 0.4069, "step": 2895 }, { "epoch": 0.4061711079943899, "grad_norm": 2.8716264918154124, "learning_rate": 6.726216258659343e-06, "loss": 0.3741, "step": 2896 }, { "epoch": 0.40631136044880783, "grad_norm": 2.1436181964894208, "learning_rate": 6.724084482680482e-06, "loss": 0.383, "step": 2897 }, { "epoch": 0.4064516129032258, "grad_norm": 2.4143706189895213, "learning_rate": 6.721952350948833e-06, "loss": 0.406, "step": 2898 }, { "epoch": 0.40659186535764374, "grad_norm": 5.134607567282539, "learning_rate": 6.719819863904345e-06, "loss": 0.3649, "step": 2899 }, { "epoch": 0.4067321178120617, "grad_norm": 2.0743840325949914, "learning_rate": 6.717687021987045e-06, "loss": 0.3866, "step": 2900 }, { "epoch": 0.40687237026647965, "grad_norm": 1.9798454793688127, "learning_rate": 6.715553825637029e-06, "loss": 0.3995, "step": 2901 }, { "epoch": 0.4070126227208976, "grad_norm": 1.9348766656253935, "learning_rate": 6.713420275294467e-06, "loss": 0.3652, "step": 2902 }, { "epoch": 0.40715287517531557, "grad_norm": 2.099339213178405, "learning_rate": 6.711286371399602e-06, "loss": 0.4118, "step": 2903 }, { "epoch": 0.4072931276297335, "grad_norm": 1.9887904506288367, "learning_rate": 6.7091521143927495e-06, "loss": 0.3395, "step": 2904 }, { "epoch": 0.4074333800841515, "grad_norm": 2.0673660904717557, "learning_rate": 6.707017504714299e-06, "loss": 0.3737, "step": 2905 }, { "epoch": 0.40757363253856943, "grad_norm": 2.2934740279901646, "learning_rate": 6.704882542804714e-06, "loss": 0.3864, "step": 2906 }, { "epoch": 0.4077138849929874, "grad_norm": 2.0377739236564802, "learning_rate": 6.702747229104527e-06, "loss": 0.3544, "step": 2907 }, { "epoch": 0.40785413744740534, "grad_norm": 2.001311825637749, "learning_rate": 6.700611564054346e-06, "loss": 0.3752, "step": 2908 }, { "epoch": 0.4079943899018233, "grad_norm": 2.2973312651362363, "learning_rate": 6.69847554809485e-06, "loss": 0.3609, "step": 2909 }, { "epoch": 0.40813464235624125, "grad_norm": 2.6263678483699064, "learning_rate": 6.696339181666791e-06, "loss": 0.3732, "step": 2910 }, { "epoch": 0.4082748948106592, "grad_norm": 2.1224398056498983, "learning_rate": 6.694202465210993e-06, "loss": 0.3791, "step": 2911 }, { "epoch": 0.40841514726507716, "grad_norm": 4.583175633347297, "learning_rate": 6.692065399168352e-06, "loss": 0.3978, "step": 2912 }, { "epoch": 0.4085553997194951, "grad_norm": 2.435008903502806, "learning_rate": 6.689927983979841e-06, "loss": 0.3441, "step": 2913 }, { "epoch": 0.40869565217391307, "grad_norm": 2.663327023288221, "learning_rate": 6.687790220086494e-06, "loss": 0.3421, "step": 2914 }, { "epoch": 0.40883590462833097, "grad_norm": 1.9536352482909567, "learning_rate": 6.6856521079294275e-06, "loss": 0.3704, "step": 2915 }, { "epoch": 0.4089761570827489, "grad_norm": 2.1554075060249764, "learning_rate": 6.683513647949826e-06, "loss": 0.3767, "step": 2916 }, { "epoch": 0.4091164095371669, "grad_norm": 5.60575337501475, "learning_rate": 6.681374840588946e-06, "loss": 0.3412, "step": 2917 }, { "epoch": 0.40925666199158484, "grad_norm": 1.9187452061496952, "learning_rate": 6.6792356862881144e-06, "loss": 0.3601, "step": 2918 }, { "epoch": 0.4093969144460028, "grad_norm": 1.7575353107188094, "learning_rate": 6.6770961854887296e-06, "loss": 0.382, "step": 2919 }, { "epoch": 0.40953716690042075, "grad_norm": 1.9285661164972525, "learning_rate": 6.674956338632265e-06, "loss": 0.3702, "step": 2920 }, { "epoch": 0.4096774193548387, "grad_norm": 2.3891920395155704, "learning_rate": 6.672816146160262e-06, "loss": 0.3662, "step": 2921 }, { "epoch": 0.40981767180925666, "grad_norm": 2.0833510461690046, "learning_rate": 6.6706756085143345e-06, "loss": 0.3816, "step": 2922 }, { "epoch": 0.4099579242636746, "grad_norm": 1.9985807867034608, "learning_rate": 6.668534726136166e-06, "loss": 0.3535, "step": 2923 }, { "epoch": 0.41009817671809257, "grad_norm": 2.491742611581422, "learning_rate": 6.666393499467516e-06, "loss": 0.3717, "step": 2924 }, { "epoch": 0.4102384291725105, "grad_norm": 2.2670765403342443, "learning_rate": 6.664251928950209e-06, "loss": 0.3624, "step": 2925 }, { "epoch": 0.4103786816269285, "grad_norm": 2.3017248734782525, "learning_rate": 6.662110015026144e-06, "loss": 0.3265, "step": 2926 }, { "epoch": 0.41051893408134643, "grad_norm": 2.415196420962704, "learning_rate": 6.659967758137289e-06, "loss": 0.3697, "step": 2927 }, { "epoch": 0.4106591865357644, "grad_norm": 2.2957168046440843, "learning_rate": 6.657825158725686e-06, "loss": 0.4034, "step": 2928 }, { "epoch": 0.41079943899018234, "grad_norm": 2.3243421505778534, "learning_rate": 6.655682217233445e-06, "loss": 0.3815, "step": 2929 }, { "epoch": 0.4109396914446003, "grad_norm": 2.47256917935782, "learning_rate": 6.653538934102743e-06, "loss": 0.3529, "step": 2930 }, { "epoch": 0.41107994389901825, "grad_norm": 2.711316524690193, "learning_rate": 6.651395309775837e-06, "loss": 0.3833, "step": 2931 }, { "epoch": 0.4112201963534362, "grad_norm": 2.745401438975129, "learning_rate": 6.6492513446950444e-06, "loss": 0.4036, "step": 2932 }, { "epoch": 0.41136044880785416, "grad_norm": 2.242587914418082, "learning_rate": 6.64710703930276e-06, "loss": 0.3714, "step": 2933 }, { "epoch": 0.4115007012622721, "grad_norm": 2.0559998009306817, "learning_rate": 6.644962394041447e-06, "loss": 0.3982, "step": 2934 }, { "epoch": 0.41164095371669, "grad_norm": 2.4016660153001816, "learning_rate": 6.642817409353635e-06, "loss": 0.3709, "step": 2935 }, { "epoch": 0.41178120617110797, "grad_norm": 1.9817901356409142, "learning_rate": 6.640672085681928e-06, "loss": 0.3778, "step": 2936 }, { "epoch": 0.4119214586255259, "grad_norm": 2.526899295740091, "learning_rate": 6.638526423468999e-06, "loss": 0.3267, "step": 2937 }, { "epoch": 0.4120617110799439, "grad_norm": 1.7734660679873095, "learning_rate": 6.636380423157591e-06, "loss": 0.3817, "step": 2938 }, { "epoch": 0.41220196353436184, "grad_norm": 3.027816085884175, "learning_rate": 6.634234085190516e-06, "loss": 0.3904, "step": 2939 }, { "epoch": 0.4123422159887798, "grad_norm": 3.141939258301542, "learning_rate": 6.632087410010653e-06, "loss": 0.3346, "step": 2940 }, { "epoch": 0.41248246844319775, "grad_norm": 2.0572552899070016, "learning_rate": 6.629940398060957e-06, "loss": 0.3598, "step": 2941 }, { "epoch": 0.4126227208976157, "grad_norm": 1.9555734165175536, "learning_rate": 6.627793049784448e-06, "loss": 0.3121, "step": 2942 }, { "epoch": 0.41276297335203366, "grad_norm": 2.9455637533747785, "learning_rate": 6.625645365624214e-06, "loss": 0.3714, "step": 2943 }, { "epoch": 0.4129032258064516, "grad_norm": 2.6087770976261804, "learning_rate": 6.6234973460234184e-06, "loss": 0.3658, "step": 2944 }, { "epoch": 0.41304347826086957, "grad_norm": 2.6286738889580326, "learning_rate": 6.621348991425287e-06, "loss": 0.3766, "step": 2945 }, { "epoch": 0.4131837307152875, "grad_norm": 2.465878325431431, "learning_rate": 6.619200302273119e-06, "loss": 0.3786, "step": 2946 }, { "epoch": 0.4133239831697055, "grad_norm": 2.3915526142461854, "learning_rate": 6.61705127901028e-06, "loss": 0.4377, "step": 2947 }, { "epoch": 0.41346423562412343, "grad_norm": 2.6531857037648363, "learning_rate": 6.614901922080211e-06, "loss": 0.3468, "step": 2948 }, { "epoch": 0.4136044880785414, "grad_norm": 2.432788893352973, "learning_rate": 6.612752231926411e-06, "loss": 0.3318, "step": 2949 }, { "epoch": 0.41374474053295934, "grad_norm": 2.0949029737050546, "learning_rate": 6.6106022089924535e-06, "loss": 0.3524, "step": 2950 }, { "epoch": 0.4138849929873773, "grad_norm": 2.0690827946199812, "learning_rate": 6.608451853721985e-06, "loss": 0.4044, "step": 2951 }, { "epoch": 0.41402524544179525, "grad_norm": 2.384088893538771, "learning_rate": 6.606301166558713e-06, "loss": 0.388, "step": 2952 }, { "epoch": 0.4141654978962132, "grad_norm": 2.887515648473307, "learning_rate": 6.604150147946418e-06, "loss": 0.3435, "step": 2953 }, { "epoch": 0.41430575035063116, "grad_norm": 3.2461522139553147, "learning_rate": 6.601998798328948e-06, "loss": 0.3906, "step": 2954 }, { "epoch": 0.41444600280504906, "grad_norm": 2.1591470026641932, "learning_rate": 6.599847118150218e-06, "loss": 0.4012, "step": 2955 }, { "epoch": 0.414586255259467, "grad_norm": 2.184112760113168, "learning_rate": 6.597695107854212e-06, "loss": 0.4034, "step": 2956 }, { "epoch": 0.414726507713885, "grad_norm": 2.247956937818674, "learning_rate": 6.595542767884984e-06, "loss": 0.3556, "step": 2957 }, { "epoch": 0.41486676016830293, "grad_norm": 2.202788848122221, "learning_rate": 6.593390098686653e-06, "loss": 0.3397, "step": 2958 }, { "epoch": 0.4150070126227209, "grad_norm": 2.49757401059819, "learning_rate": 6.591237100703407e-06, "loss": 0.4076, "step": 2959 }, { "epoch": 0.41514726507713884, "grad_norm": 1.9641122840513912, "learning_rate": 6.589083774379503e-06, "loss": 0.3868, "step": 2960 }, { "epoch": 0.4152875175315568, "grad_norm": 2.819796443653793, "learning_rate": 6.586930120159263e-06, "loss": 0.3656, "step": 2961 }, { "epoch": 0.41542776998597475, "grad_norm": 2.3057450276643507, "learning_rate": 6.584776138487081e-06, "loss": 0.3944, "step": 2962 }, { "epoch": 0.4155680224403927, "grad_norm": 1.857510826315462, "learning_rate": 6.5826218298074144e-06, "loss": 0.3756, "step": 2963 }, { "epoch": 0.41570827489481066, "grad_norm": 3.068894075635903, "learning_rate": 6.5804671945647916e-06, "loss": 0.3729, "step": 2964 }, { "epoch": 0.4158485273492286, "grad_norm": 2.6594827166212838, "learning_rate": 6.578312233203804e-06, "loss": 0.3198, "step": 2965 }, { "epoch": 0.41598877980364657, "grad_norm": 2.942706910979791, "learning_rate": 6.5761569461691145e-06, "loss": 0.3957, "step": 2966 }, { "epoch": 0.4161290322580645, "grad_norm": 2.711136868316297, "learning_rate": 6.57400133390545e-06, "loss": 0.4335, "step": 2967 }, { "epoch": 0.4162692847124825, "grad_norm": 2.447395388244688, "learning_rate": 6.5718453968576076e-06, "loss": 0.3956, "step": 2968 }, { "epoch": 0.41640953716690043, "grad_norm": 2.216683620971242, "learning_rate": 6.569689135470451e-06, "loss": 0.3778, "step": 2969 }, { "epoch": 0.4165497896213184, "grad_norm": 3.577581639241376, "learning_rate": 6.567532550188908e-06, "loss": 0.3818, "step": 2970 }, { "epoch": 0.41669004207573634, "grad_norm": 2.2638294339580196, "learning_rate": 6.565375641457973e-06, "loss": 0.3656, "step": 2971 }, { "epoch": 0.4168302945301543, "grad_norm": 2.2339672436818554, "learning_rate": 6.563218409722712e-06, "loss": 0.3535, "step": 2972 }, { "epoch": 0.41697054698457225, "grad_norm": 2.306871833809432, "learning_rate": 6.561060855428252e-06, "loss": 0.3854, "step": 2973 }, { "epoch": 0.4171107994389902, "grad_norm": 2.45367503226088, "learning_rate": 6.558902979019793e-06, "loss": 0.3581, "step": 2974 }, { "epoch": 0.4172510518934081, "grad_norm": 2.5308552186899136, "learning_rate": 6.556744780942594e-06, "loss": 0.3544, "step": 2975 }, { "epoch": 0.41739130434782606, "grad_norm": 2.720058345589643, "learning_rate": 6.5545862616419865e-06, "loss": 0.3867, "step": 2976 }, { "epoch": 0.417531556802244, "grad_norm": 2.163755098950633, "learning_rate": 6.552427421563365e-06, "loss": 0.3569, "step": 2977 }, { "epoch": 0.417671809256662, "grad_norm": 2.936813761317621, "learning_rate": 6.550268261152192e-06, "loss": 0.3576, "step": 2978 }, { "epoch": 0.41781206171107993, "grad_norm": 2.255972881681715, "learning_rate": 6.548108780853995e-06, "loss": 0.3863, "step": 2979 }, { "epoch": 0.4179523141654979, "grad_norm": 2.7095336797197493, "learning_rate": 6.545948981114365e-06, "loss": 0.3861, "step": 2980 }, { "epoch": 0.41809256661991584, "grad_norm": 2.091114378075385, "learning_rate": 6.543788862378965e-06, "loss": 0.3753, "step": 2981 }, { "epoch": 0.4182328190743338, "grad_norm": 1.9166822294626182, "learning_rate": 6.541628425093518e-06, "loss": 0.3571, "step": 2982 }, { "epoch": 0.41837307152875175, "grad_norm": 2.768288315972146, "learning_rate": 6.539467669703816e-06, "loss": 0.3893, "step": 2983 }, { "epoch": 0.4185133239831697, "grad_norm": 2.04604526621096, "learning_rate": 6.537306596655716e-06, "loss": 0.3529, "step": 2984 }, { "epoch": 0.41865357643758766, "grad_norm": 2.2046941485395966, "learning_rate": 6.535145206395141e-06, "loss": 0.4155, "step": 2985 }, { "epoch": 0.4187938288920056, "grad_norm": 2.124995960904939, "learning_rate": 6.532983499368078e-06, "loss": 0.4011, "step": 2986 }, { "epoch": 0.41893408134642357, "grad_norm": 2.75722093153955, "learning_rate": 6.530821476020579e-06, "loss": 0.3531, "step": 2987 }, { "epoch": 0.4190743338008415, "grad_norm": 2.3984737828396194, "learning_rate": 6.5286591367987655e-06, "loss": 0.3654, "step": 2988 }, { "epoch": 0.4192145862552595, "grad_norm": 1.9285650038738311, "learning_rate": 6.5264964821488184e-06, "loss": 0.3974, "step": 2989 }, { "epoch": 0.41935483870967744, "grad_norm": 1.9835378127382342, "learning_rate": 6.524333512516987e-06, "loss": 0.353, "step": 2990 }, { "epoch": 0.4194950911640954, "grad_norm": 1.8940887248960436, "learning_rate": 6.522170228349585e-06, "loss": 0.412, "step": 2991 }, { "epoch": 0.41963534361851335, "grad_norm": 2.8751978598801635, "learning_rate": 6.520006630092991e-06, "loss": 0.3996, "step": 2992 }, { "epoch": 0.4197755960729313, "grad_norm": 2.075483091259303, "learning_rate": 6.5178427181936485e-06, "loss": 0.4116, "step": 2993 }, { "epoch": 0.41991584852734926, "grad_norm": 1.8964327678474917, "learning_rate": 6.515678493098065e-06, "loss": 0.4029, "step": 2994 }, { "epoch": 0.42005610098176716, "grad_norm": 1.9062612564505, "learning_rate": 6.513513955252816e-06, "loss": 0.3984, "step": 2995 }, { "epoch": 0.4201963534361851, "grad_norm": 2.2711529871324596, "learning_rate": 6.511349105104534e-06, "loss": 0.4117, "step": 2996 }, { "epoch": 0.42033660589060307, "grad_norm": 2.1683068913873877, "learning_rate": 6.509183943099925e-06, "loss": 0.4131, "step": 2997 }, { "epoch": 0.420476858345021, "grad_norm": 1.8909044295700346, "learning_rate": 6.507018469685752e-06, "loss": 0.3843, "step": 2998 }, { "epoch": 0.420617110799439, "grad_norm": 1.9831648012357606, "learning_rate": 6.504852685308849e-06, "loss": 0.3762, "step": 2999 }, { "epoch": 0.42075736325385693, "grad_norm": 1.979135941384055, "learning_rate": 6.502686590416105e-06, "loss": 0.37, "step": 3000 }, { "epoch": 0.4208976157082749, "grad_norm": 2.6214865832148257, "learning_rate": 6.5005201854544845e-06, "loss": 0.3603, "step": 3001 }, { "epoch": 0.42103786816269284, "grad_norm": 2.2516730763842734, "learning_rate": 6.498353470871006e-06, "loss": 0.3892, "step": 3002 }, { "epoch": 0.4211781206171108, "grad_norm": 1.8068297251119418, "learning_rate": 6.4961864471127556e-06, "loss": 0.3856, "step": 3003 }, { "epoch": 0.42131837307152875, "grad_norm": 2.144867528457207, "learning_rate": 6.494019114626887e-06, "loss": 0.3794, "step": 3004 }, { "epoch": 0.4214586255259467, "grad_norm": 2.861507100311274, "learning_rate": 6.491851473860612e-06, "loss": 0.3856, "step": 3005 }, { "epoch": 0.42159887798036466, "grad_norm": 2.0756143878319735, "learning_rate": 6.489683525261208e-06, "loss": 0.3853, "step": 3006 }, { "epoch": 0.4217391304347826, "grad_norm": 2.1205773049771794, "learning_rate": 6.487515269276015e-06, "loss": 0.3988, "step": 3007 }, { "epoch": 0.42187938288920057, "grad_norm": 2.0807943767566957, "learning_rate": 6.48534670635244e-06, "loss": 0.3648, "step": 3008 }, { "epoch": 0.4220196353436185, "grad_norm": 2.4839576034674935, "learning_rate": 6.48317783693795e-06, "loss": 0.3622, "step": 3009 }, { "epoch": 0.4221598877980365, "grad_norm": 1.75251677598964, "learning_rate": 6.481008661480075e-06, "loss": 0.4208, "step": 3010 }, { "epoch": 0.42230014025245444, "grad_norm": 2.088056766476447, "learning_rate": 6.478839180426411e-06, "loss": 0.358, "step": 3011 }, { "epoch": 0.4224403927068724, "grad_norm": 2.08511153309319, "learning_rate": 6.476669394224613e-06, "loss": 0.3157, "step": 3012 }, { "epoch": 0.42258064516129035, "grad_norm": 1.9006509945567969, "learning_rate": 6.474499303322402e-06, "loss": 0.3775, "step": 3013 }, { "epoch": 0.4227208976157083, "grad_norm": 2.117886698789312, "learning_rate": 6.472328908167562e-06, "loss": 0.3546, "step": 3014 }, { "epoch": 0.4228611500701262, "grad_norm": 2.6404464029262313, "learning_rate": 6.470158209207939e-06, "loss": 0.3631, "step": 3015 }, { "epoch": 0.42300140252454416, "grad_norm": 2.1789452167362633, "learning_rate": 6.46798720689144e-06, "loss": 0.3697, "step": 3016 }, { "epoch": 0.4231416549789621, "grad_norm": 2.544308357912146, "learning_rate": 6.465815901666036e-06, "loss": 0.3836, "step": 3017 }, { "epoch": 0.42328190743338007, "grad_norm": 1.7922059807327582, "learning_rate": 6.463644293979763e-06, "loss": 0.345, "step": 3018 }, { "epoch": 0.423422159887798, "grad_norm": 1.8206209240706366, "learning_rate": 6.461472384280715e-06, "loss": 0.3353, "step": 3019 }, { "epoch": 0.423562412342216, "grad_norm": 2.194489348498048, "learning_rate": 6.459300173017052e-06, "loss": 0.4084, "step": 3020 }, { "epoch": 0.42370266479663393, "grad_norm": 3.0100395380002416, "learning_rate": 6.457127660636994e-06, "loss": 0.3663, "step": 3021 }, { "epoch": 0.4238429172510519, "grad_norm": 1.8586443579487488, "learning_rate": 6.454954847588824e-06, "loss": 0.3797, "step": 3022 }, { "epoch": 0.42398316970546984, "grad_norm": 2.561671425445379, "learning_rate": 6.452781734320884e-06, "loss": 0.3769, "step": 3023 }, { "epoch": 0.4241234221598878, "grad_norm": 2.2018671435503734, "learning_rate": 6.450608321281584e-06, "loss": 0.3653, "step": 3024 }, { "epoch": 0.42426367461430575, "grad_norm": 2.679648796669525, "learning_rate": 6.4484346089193926e-06, "loss": 0.3841, "step": 3025 }, { "epoch": 0.4244039270687237, "grad_norm": 2.034056734424741, "learning_rate": 6.4462605976828395e-06, "loss": 0.3898, "step": 3026 }, { "epoch": 0.42454417952314166, "grad_norm": 3.5660952446028555, "learning_rate": 6.444086288020514e-06, "loss": 0.3988, "step": 3027 }, { "epoch": 0.4246844319775596, "grad_norm": 1.6028086210677315, "learning_rate": 6.441911680381074e-06, "loss": 0.3815, "step": 3028 }, { "epoch": 0.4248246844319776, "grad_norm": 2.2986737947346096, "learning_rate": 6.4397367752132325e-06, "loss": 0.3505, "step": 3029 }, { "epoch": 0.42496493688639553, "grad_norm": 2.0119138160129935, "learning_rate": 6.437561572965767e-06, "loss": 0.3845, "step": 3030 }, { "epoch": 0.4251051893408135, "grad_norm": 2.2746656318847043, "learning_rate": 6.435386074087514e-06, "loss": 0.3433, "step": 3031 }, { "epoch": 0.42524544179523144, "grad_norm": 2.1931543568613723, "learning_rate": 6.433210279027373e-06, "loss": 0.3484, "step": 3032 }, { "epoch": 0.4253856942496494, "grad_norm": 2.0355128233994577, "learning_rate": 6.431034188234304e-06, "loss": 0.3253, "step": 3033 }, { "epoch": 0.42552594670406735, "grad_norm": 2.4458276370587106, "learning_rate": 6.4288578021573275e-06, "loss": 0.3651, "step": 3034 }, { "epoch": 0.42566619915848525, "grad_norm": 1.9610652198080876, "learning_rate": 6.426681121245527e-06, "loss": 0.3597, "step": 3035 }, { "epoch": 0.4258064516129032, "grad_norm": 1.7913879872724987, "learning_rate": 6.424504145948045e-06, "loss": 0.3895, "step": 3036 }, { "epoch": 0.42594670406732116, "grad_norm": 2.1310446542387083, "learning_rate": 6.422326876714084e-06, "loss": 0.3455, "step": 3037 }, { "epoch": 0.4260869565217391, "grad_norm": 2.4881883061894863, "learning_rate": 6.420149313992909e-06, "loss": 0.3599, "step": 3038 }, { "epoch": 0.42622720897615707, "grad_norm": 2.0379705903661285, "learning_rate": 6.417971458233847e-06, "loss": 0.3718, "step": 3039 }, { "epoch": 0.426367461430575, "grad_norm": 1.8652640934518958, "learning_rate": 6.41579330988628e-06, "loss": 0.3655, "step": 3040 }, { "epoch": 0.426507713884993, "grad_norm": 2.480208063319188, "learning_rate": 6.413614869399655e-06, "loss": 0.3985, "step": 3041 }, { "epoch": 0.42664796633941093, "grad_norm": 2.043465140086211, "learning_rate": 6.411436137223479e-06, "loss": 0.3542, "step": 3042 }, { "epoch": 0.4267882187938289, "grad_norm": 2.6543387324813104, "learning_rate": 6.409257113807316e-06, "loss": 0.3631, "step": 3043 }, { "epoch": 0.42692847124824684, "grad_norm": 1.9786603447272457, "learning_rate": 6.4070777996007925e-06, "loss": 0.3625, "step": 3044 }, { "epoch": 0.4270687237026648, "grad_norm": 2.008187819257285, "learning_rate": 6.4048981950535975e-06, "loss": 0.4293, "step": 3045 }, { "epoch": 0.42720897615708275, "grad_norm": 3.1085667158640073, "learning_rate": 6.402718300615475e-06, "loss": 0.3624, "step": 3046 }, { "epoch": 0.4273492286115007, "grad_norm": 2.0100220867860323, "learning_rate": 6.40053811673623e-06, "loss": 0.3892, "step": 3047 }, { "epoch": 0.42748948106591866, "grad_norm": 2.045806605934265, "learning_rate": 6.398357643865731e-06, "loss": 0.3523, "step": 3048 }, { "epoch": 0.4276297335203366, "grad_norm": 2.3876733447353446, "learning_rate": 6.396176882453902e-06, "loss": 0.3669, "step": 3049 }, { "epoch": 0.4277699859747546, "grad_norm": 2.5873330030132276, "learning_rate": 6.393995832950725e-06, "loss": 0.3676, "step": 3050 }, { "epoch": 0.42791023842917253, "grad_norm": 4.398591006701837, "learning_rate": 6.391814495806251e-06, "loss": 0.3863, "step": 3051 }, { "epoch": 0.4280504908835905, "grad_norm": 2.101332981665496, "learning_rate": 6.389632871470578e-06, "loss": 0.3602, "step": 3052 }, { "epoch": 0.42819074333800844, "grad_norm": 2.943751723497538, "learning_rate": 6.3874509603938706e-06, "loss": 0.3634, "step": 3053 }, { "epoch": 0.4283309957924264, "grad_norm": 1.9780114935298978, "learning_rate": 6.385268763026351e-06, "loss": 0.3304, "step": 3054 }, { "epoch": 0.4284712482468443, "grad_norm": 2.2855364100181284, "learning_rate": 6.3830862798183006e-06, "loss": 0.3792, "step": 3055 }, { "epoch": 0.42861150070126225, "grad_norm": 1.7138861065551223, "learning_rate": 6.38090351122006e-06, "loss": 0.3556, "step": 3056 }, { "epoch": 0.4287517531556802, "grad_norm": 2.032355491878188, "learning_rate": 6.378720457682027e-06, "loss": 0.4151, "step": 3057 }, { "epoch": 0.42889200561009816, "grad_norm": 2.2035514472926376, "learning_rate": 6.37653711965466e-06, "loss": 0.4183, "step": 3058 }, { "epoch": 0.4290322580645161, "grad_norm": 1.7776122173399367, "learning_rate": 6.374353497588475e-06, "loss": 0.3088, "step": 3059 }, { "epoch": 0.42917251051893407, "grad_norm": 2.377906727141462, "learning_rate": 6.372169591934048e-06, "loss": 0.4141, "step": 3060 }, { "epoch": 0.429312762973352, "grad_norm": 2.4111111571589796, "learning_rate": 6.369985403142014e-06, "loss": 0.3828, "step": 3061 }, { "epoch": 0.42945301542777, "grad_norm": 2.365883445983861, "learning_rate": 6.367800931663062e-06, "loss": 0.3847, "step": 3062 }, { "epoch": 0.42959326788218793, "grad_norm": 3.5569456201170992, "learning_rate": 6.365616177947945e-06, "loss": 0.365, "step": 3063 }, { "epoch": 0.4297335203366059, "grad_norm": 2.2949676898684834, "learning_rate": 6.363431142447469e-06, "loss": 0.3995, "step": 3064 }, { "epoch": 0.42987377279102384, "grad_norm": 1.9338572312986662, "learning_rate": 6.361245825612505e-06, "loss": 0.3936, "step": 3065 }, { "epoch": 0.4300140252454418, "grad_norm": 3.179374060775951, "learning_rate": 6.359060227893972e-06, "loss": 0.3405, "step": 3066 }, { "epoch": 0.43015427769985976, "grad_norm": 1.825747190654818, "learning_rate": 6.356874349742859e-06, "loss": 0.3736, "step": 3067 }, { "epoch": 0.4302945301542777, "grad_norm": 2.475885148184037, "learning_rate": 6.354688191610202e-06, "loss": 0.3953, "step": 3068 }, { "epoch": 0.43043478260869567, "grad_norm": 2.036260323682717, "learning_rate": 6.352501753947103e-06, "loss": 0.3756, "step": 3069 }, { "epoch": 0.4305750350631136, "grad_norm": 2.7191551169749215, "learning_rate": 6.350315037204714e-06, "loss": 0.3961, "step": 3070 }, { "epoch": 0.4307152875175316, "grad_norm": 2.023818874871799, "learning_rate": 6.3481280418342536e-06, "loss": 0.3247, "step": 3071 }, { "epoch": 0.43085553997194953, "grad_norm": 2.3403417601802006, "learning_rate": 6.3459407682869885e-06, "loss": 0.3359, "step": 3072 }, { "epoch": 0.4309957924263675, "grad_norm": 1.935849132476107, "learning_rate": 6.34375321701425e-06, "loss": 0.3849, "step": 3073 }, { "epoch": 0.43113604488078544, "grad_norm": 1.7900019015936102, "learning_rate": 6.341565388467425e-06, "loss": 0.401, "step": 3074 }, { "epoch": 0.43127629733520334, "grad_norm": 2.0797246068428428, "learning_rate": 6.339377283097953e-06, "loss": 0.4215, "step": 3075 }, { "epoch": 0.4314165497896213, "grad_norm": 1.7003992649650197, "learning_rate": 6.3371889013573365e-06, "loss": 0.3835, "step": 3076 }, { "epoch": 0.43155680224403925, "grad_norm": 1.8710926032013058, "learning_rate": 6.335000243697134e-06, "loss": 0.3161, "step": 3077 }, { "epoch": 0.4316970546984572, "grad_norm": 2.3780231308642255, "learning_rate": 6.332811310568956e-06, "loss": 0.3765, "step": 3078 }, { "epoch": 0.43183730715287516, "grad_norm": 2.3691038929229435, "learning_rate": 6.330622102424478e-06, "loss": 0.4037, "step": 3079 }, { "epoch": 0.4319775596072931, "grad_norm": 2.052958642267089, "learning_rate": 6.328432619715424e-06, "loss": 0.3626, "step": 3080 }, { "epoch": 0.43211781206171107, "grad_norm": 1.9562199599424515, "learning_rate": 6.326242862893581e-06, "loss": 0.3449, "step": 3081 }, { "epoch": 0.432258064516129, "grad_norm": 2.0147609780960725, "learning_rate": 6.324052832410788e-06, "loss": 0.3595, "step": 3082 }, { "epoch": 0.432398316970547, "grad_norm": 2.3811630793849927, "learning_rate": 6.321862528718945e-06, "loss": 0.378, "step": 3083 }, { "epoch": 0.43253856942496494, "grad_norm": 2.5347334828119386, "learning_rate": 6.319671952270004e-06, "loss": 0.3383, "step": 3084 }, { "epoch": 0.4326788218793829, "grad_norm": 2.044908583633381, "learning_rate": 6.317481103515976e-06, "loss": 0.3692, "step": 3085 }, { "epoch": 0.43281907433380085, "grad_norm": 2.2898061540342183, "learning_rate": 6.3152899829089254e-06, "loss": 0.3529, "step": 3086 }, { "epoch": 0.4329593267882188, "grad_norm": 2.8818273334122657, "learning_rate": 6.313098590900978e-06, "loss": 0.3856, "step": 3087 }, { "epoch": 0.43309957924263676, "grad_norm": 2.1146473867051396, "learning_rate": 6.310906927944309e-06, "loss": 0.421, "step": 3088 }, { "epoch": 0.4332398316970547, "grad_norm": 2.254707815352376, "learning_rate": 6.308714994491155e-06, "loss": 0.3892, "step": 3089 }, { "epoch": 0.43338008415147267, "grad_norm": 3.983253829886218, "learning_rate": 6.306522790993805e-06, "loss": 0.3853, "step": 3090 }, { "epoch": 0.4335203366058906, "grad_norm": 1.98163914754682, "learning_rate": 6.304330317904605e-06, "loss": 0.386, "step": 3091 }, { "epoch": 0.4336605890603086, "grad_norm": 2.060933645387968, "learning_rate": 6.3021375756759575e-06, "loss": 0.3571, "step": 3092 }, { "epoch": 0.43380084151472653, "grad_norm": 2.2093568145564673, "learning_rate": 6.299944564760318e-06, "loss": 0.367, "step": 3093 }, { "epoch": 0.4339410939691445, "grad_norm": 6.592785226830853, "learning_rate": 6.2977512856101994e-06, "loss": 0.3878, "step": 3094 }, { "epoch": 0.4340813464235624, "grad_norm": 5.216528322540592, "learning_rate": 6.295557738678171e-06, "loss": 0.4287, "step": 3095 }, { "epoch": 0.43422159887798034, "grad_norm": 2.4271044968977096, "learning_rate": 6.2933639244168535e-06, "loss": 0.3633, "step": 3096 }, { "epoch": 0.4343618513323983, "grad_norm": 3.7555400615450276, "learning_rate": 6.291169843278927e-06, "loss": 0.366, "step": 3097 }, { "epoch": 0.43450210378681625, "grad_norm": 2.2663304776159086, "learning_rate": 6.288975495717124e-06, "loss": 0.3689, "step": 3098 }, { "epoch": 0.4346423562412342, "grad_norm": 2.886574871932473, "learning_rate": 6.286780882184233e-06, "loss": 0.3859, "step": 3099 }, { "epoch": 0.43478260869565216, "grad_norm": 2.4017684619869413, "learning_rate": 6.284586003133096e-06, "loss": 0.4306, "step": 3100 }, { "epoch": 0.4349228611500701, "grad_norm": 2.194044622650793, "learning_rate": 6.282390859016613e-06, "loss": 0.3833, "step": 3101 }, { "epoch": 0.43506311360448807, "grad_norm": 2.837959010435082, "learning_rate": 6.280195450287736e-06, "loss": 0.3677, "step": 3102 }, { "epoch": 0.435203366058906, "grad_norm": 1.804763396017228, "learning_rate": 6.277999777399473e-06, "loss": 0.3379, "step": 3103 }, { "epoch": 0.435343618513324, "grad_norm": 5.791890348144692, "learning_rate": 6.2758038408048825e-06, "loss": 0.3717, "step": 3104 }, { "epoch": 0.43548387096774194, "grad_norm": 2.085206778904916, "learning_rate": 6.273607640957085e-06, "loss": 0.3592, "step": 3105 }, { "epoch": 0.4356241234221599, "grad_norm": 2.217525412877289, "learning_rate": 6.271411178309247e-06, "loss": 0.3905, "step": 3106 }, { "epoch": 0.43576437587657785, "grad_norm": 2.4927270000400195, "learning_rate": 6.269214453314596e-06, "loss": 0.3468, "step": 3107 }, { "epoch": 0.4359046283309958, "grad_norm": 2.0800766349393136, "learning_rate": 6.267017466426411e-06, "loss": 0.3826, "step": 3108 }, { "epoch": 0.43604488078541376, "grad_norm": 1.8737999572608952, "learning_rate": 6.264820218098022e-06, "loss": 0.3359, "step": 3109 }, { "epoch": 0.4361851332398317, "grad_norm": 2.1993587036049402, "learning_rate": 6.262622708782818e-06, "loss": 0.3751, "step": 3110 }, { "epoch": 0.43632538569424967, "grad_norm": 1.914893456525548, "learning_rate": 6.260424938934241e-06, "loss": 0.3594, "step": 3111 }, { "epoch": 0.4364656381486676, "grad_norm": 1.7085917517742373, "learning_rate": 6.258226909005783e-06, "loss": 0.3283, "step": 3112 }, { "epoch": 0.4366058906030856, "grad_norm": 2.7808260648146774, "learning_rate": 6.256028619450993e-06, "loss": 0.347, "step": 3113 }, { "epoch": 0.43674614305750353, "grad_norm": 2.1780719854615778, "learning_rate": 6.253830070723472e-06, "loss": 0.4053, "step": 3114 }, { "epoch": 0.43688639551192143, "grad_norm": 1.7256471442791652, "learning_rate": 6.251631263276877e-06, "loss": 0.3547, "step": 3115 }, { "epoch": 0.4370266479663394, "grad_norm": 2.063718378112415, "learning_rate": 6.2494321975649155e-06, "loss": 0.4002, "step": 3116 }, { "epoch": 0.43716690042075734, "grad_norm": 2.5092678897558574, "learning_rate": 6.247232874041348e-06, "loss": 0.399, "step": 3117 }, { "epoch": 0.4373071528751753, "grad_norm": 4.5309753334722735, "learning_rate": 6.2450332931599926e-06, "loss": 0.3513, "step": 3118 }, { "epoch": 0.43744740532959325, "grad_norm": 2.0605368085136218, "learning_rate": 6.2428334553747135e-06, "loss": 0.3814, "step": 3119 }, { "epoch": 0.4375876577840112, "grad_norm": 1.8861750417569874, "learning_rate": 6.240633361139435e-06, "loss": 0.3613, "step": 3120 }, { "epoch": 0.43772791023842916, "grad_norm": 1.9643117135048858, "learning_rate": 6.238433010908131e-06, "loss": 0.368, "step": 3121 }, { "epoch": 0.4378681626928471, "grad_norm": 3.468365209733309, "learning_rate": 6.236232405134827e-06, "loss": 0.3747, "step": 3122 }, { "epoch": 0.4380084151472651, "grad_norm": 2.1193137665083635, "learning_rate": 6.234031544273602e-06, "loss": 0.3932, "step": 3123 }, { "epoch": 0.43814866760168303, "grad_norm": 2.70094811429153, "learning_rate": 6.23183042877859e-06, "loss": 0.4386, "step": 3124 }, { "epoch": 0.438288920056101, "grad_norm": 1.5011661288220675, "learning_rate": 6.229629059103975e-06, "loss": 0.3572, "step": 3125 }, { "epoch": 0.43842917251051894, "grad_norm": 2.4942781773382947, "learning_rate": 6.227427435703997e-06, "loss": 0.3713, "step": 3126 }, { "epoch": 0.4385694249649369, "grad_norm": 2.014961665808795, "learning_rate": 6.225225559032941e-06, "loss": 0.3816, "step": 3127 }, { "epoch": 0.43870967741935485, "grad_norm": 2.5368756545602387, "learning_rate": 6.223023429545152e-06, "loss": 0.3384, "step": 3128 }, { "epoch": 0.4388499298737728, "grad_norm": 1.8690475073951176, "learning_rate": 6.2208210476950215e-06, "loss": 0.376, "step": 3129 }, { "epoch": 0.43899018232819076, "grad_norm": 2.0655791677217845, "learning_rate": 6.218618413936999e-06, "loss": 0.3723, "step": 3130 }, { "epoch": 0.4391304347826087, "grad_norm": 2.653944563343457, "learning_rate": 6.216415528725579e-06, "loss": 0.3617, "step": 3131 }, { "epoch": 0.43927068723702667, "grad_norm": 2.4351836716078177, "learning_rate": 6.2142123925153135e-06, "loss": 0.4117, "step": 3132 }, { "epoch": 0.4394109396914446, "grad_norm": 1.7904881956025063, "learning_rate": 6.212009005760805e-06, "loss": 0.3873, "step": 3133 }, { "epoch": 0.4395511921458626, "grad_norm": 2.9173768041860035, "learning_rate": 6.209805368916705e-06, "loss": 0.3762, "step": 3134 }, { "epoch": 0.4396914446002805, "grad_norm": 2.36465288191589, "learning_rate": 6.207601482437719e-06, "loss": 0.3669, "step": 3135 }, { "epoch": 0.43983169705469843, "grad_norm": 2.1025277229141497, "learning_rate": 6.2053973467786065e-06, "loss": 0.4253, "step": 3136 }, { "epoch": 0.4399719495091164, "grad_norm": 3.1177516034852997, "learning_rate": 6.203192962394171e-06, "loss": 0.3776, "step": 3137 }, { "epoch": 0.44011220196353434, "grad_norm": 2.5542018790460004, "learning_rate": 6.200988329739275e-06, "loss": 0.3614, "step": 3138 }, { "epoch": 0.4402524544179523, "grad_norm": 2.010436247366258, "learning_rate": 6.198783449268827e-06, "loss": 0.3418, "step": 3139 }, { "epoch": 0.44039270687237025, "grad_norm": 1.915634132848877, "learning_rate": 6.1965783214377895e-06, "loss": 0.3876, "step": 3140 }, { "epoch": 0.4405329593267882, "grad_norm": 2.0789454855329006, "learning_rate": 6.194372946701176e-06, "loss": 0.371, "step": 3141 }, { "epoch": 0.44067321178120616, "grad_norm": 1.943198652666048, "learning_rate": 6.192167325514049e-06, "loss": 0.3923, "step": 3142 }, { "epoch": 0.4408134642356241, "grad_norm": 2.4950206760321896, "learning_rate": 6.189961458331523e-06, "loss": 0.3757, "step": 3143 }, { "epoch": 0.4409537166900421, "grad_norm": 1.8742401490733025, "learning_rate": 6.1877553456087655e-06, "loss": 0.3563, "step": 3144 }, { "epoch": 0.44109396914446003, "grad_norm": 2.412410430036062, "learning_rate": 6.1855489878009885e-06, "loss": 0.3976, "step": 3145 }, { "epoch": 0.441234221598878, "grad_norm": 1.6552014180497685, "learning_rate": 6.183342385363462e-06, "loss": 0.3718, "step": 3146 }, { "epoch": 0.44137447405329594, "grad_norm": 2.472184219020633, "learning_rate": 6.181135538751504e-06, "loss": 0.3734, "step": 3147 }, { "epoch": 0.4415147265077139, "grad_norm": 1.8066278240678253, "learning_rate": 6.178928448420476e-06, "loss": 0.3548, "step": 3148 }, { "epoch": 0.44165497896213185, "grad_norm": 2.868216597556832, "learning_rate": 6.176721114825802e-06, "loss": 0.3595, "step": 3149 }, { "epoch": 0.4417952314165498, "grad_norm": 2.332550042240132, "learning_rate": 6.174513538422946e-06, "loss": 0.357, "step": 3150 }, { "epoch": 0.44193548387096776, "grad_norm": 2.5689158253850626, "learning_rate": 6.172305719667427e-06, "loss": 0.3589, "step": 3151 }, { "epoch": 0.4420757363253857, "grad_norm": 2.272913082503428, "learning_rate": 6.170097659014812e-06, "loss": 0.3328, "step": 3152 }, { "epoch": 0.44221598877980367, "grad_norm": 2.108863599031784, "learning_rate": 6.167889356920722e-06, "loss": 0.3446, "step": 3153 }, { "epoch": 0.4423562412342216, "grad_norm": 3.0872897118990554, "learning_rate": 6.165680813840822e-06, "loss": 0.3452, "step": 3154 }, { "epoch": 0.4424964936886395, "grad_norm": 2.235750002033823, "learning_rate": 6.163472030230831e-06, "loss": 0.3674, "step": 3155 }, { "epoch": 0.4426367461430575, "grad_norm": 2.295988054364062, "learning_rate": 6.161263006546513e-06, "loss": 0.402, "step": 3156 }, { "epoch": 0.44277699859747544, "grad_norm": 2.0929471654515854, "learning_rate": 6.159053743243689e-06, "loss": 0.3756, "step": 3157 }, { "epoch": 0.4429172510518934, "grad_norm": 2.321635065221947, "learning_rate": 6.156844240778221e-06, "loss": 0.349, "step": 3158 }, { "epoch": 0.44305750350631135, "grad_norm": 2.719119255158404, "learning_rate": 6.1546344996060294e-06, "loss": 0.3586, "step": 3159 }, { "epoch": 0.4431977559607293, "grad_norm": 2.2467076267089383, "learning_rate": 6.152424520183072e-06, "loss": 0.3601, "step": 3160 }, { "epoch": 0.44333800841514726, "grad_norm": 5.078444814147996, "learning_rate": 6.150214302965368e-06, "loss": 0.3603, "step": 3161 }, { "epoch": 0.4434782608695652, "grad_norm": 2.7884946613722366, "learning_rate": 6.148003848408979e-06, "loss": 0.3826, "step": 3162 }, { "epoch": 0.44361851332398317, "grad_norm": 2.202152984682112, "learning_rate": 6.145793156970017e-06, "loss": 0.3122, "step": 3163 }, { "epoch": 0.4437587657784011, "grad_norm": 2.5317235433063074, "learning_rate": 6.143582229104641e-06, "loss": 0.382, "step": 3164 }, { "epoch": 0.4438990182328191, "grad_norm": 2.4112428660902743, "learning_rate": 6.141371065269061e-06, "loss": 0.403, "step": 3165 }, { "epoch": 0.44403927068723703, "grad_norm": 3.437184406011725, "learning_rate": 6.1391596659195366e-06, "loss": 0.3423, "step": 3166 }, { "epoch": 0.444179523141655, "grad_norm": 2.384027090225938, "learning_rate": 6.136948031512375e-06, "loss": 0.3771, "step": 3167 }, { "epoch": 0.44431977559607294, "grad_norm": 1.8979953831896892, "learning_rate": 6.134736162503929e-06, "loss": 0.3486, "step": 3168 }, { "epoch": 0.4444600280504909, "grad_norm": 3.7057591336777325, "learning_rate": 6.132524059350607e-06, "loss": 0.3708, "step": 3169 }, { "epoch": 0.44460028050490885, "grad_norm": 1.99404198592203, "learning_rate": 6.130311722508854e-06, "loss": 0.3698, "step": 3170 }, { "epoch": 0.4447405329593268, "grad_norm": 2.089432092925555, "learning_rate": 6.128099152435175e-06, "loss": 0.3076, "step": 3171 }, { "epoch": 0.44488078541374476, "grad_norm": 2.2652376041037403, "learning_rate": 6.125886349586117e-06, "loss": 0.3843, "step": 3172 }, { "epoch": 0.4450210378681627, "grad_norm": 1.817629625716848, "learning_rate": 6.123673314418277e-06, "loss": 0.3865, "step": 3173 }, { "epoch": 0.44516129032258067, "grad_norm": 2.3003070128823295, "learning_rate": 6.121460047388301e-06, "loss": 0.3545, "step": 3174 }, { "epoch": 0.44530154277699857, "grad_norm": 3.1265590593356256, "learning_rate": 6.119246548952877e-06, "loss": 0.381, "step": 3175 }, { "epoch": 0.4454417952314165, "grad_norm": 2.597207304514465, "learning_rate": 6.117032819568749e-06, "loss": 0.4274, "step": 3176 }, { "epoch": 0.4455820476858345, "grad_norm": 5.478136998112841, "learning_rate": 6.114818859692701e-06, "loss": 0.3614, "step": 3177 }, { "epoch": 0.44572230014025244, "grad_norm": 2.1964999751093823, "learning_rate": 6.112604669781572e-06, "loss": 0.3703, "step": 3178 }, { "epoch": 0.4458625525946704, "grad_norm": 2.591953712668207, "learning_rate": 6.110390250292244e-06, "loss": 0.3462, "step": 3179 }, { "epoch": 0.44600280504908835, "grad_norm": 2.5600698159055724, "learning_rate": 6.108175601681643e-06, "loss": 0.3505, "step": 3180 }, { "epoch": 0.4461430575035063, "grad_norm": 2.718778851235431, "learning_rate": 6.1059607244067485e-06, "loss": 0.3362, "step": 3181 }, { "epoch": 0.44628330995792426, "grad_norm": 2.580673253094917, "learning_rate": 6.103745618924587e-06, "loss": 0.3338, "step": 3182 }, { "epoch": 0.4464235624123422, "grad_norm": 3.5236694530254353, "learning_rate": 6.101530285692228e-06, "loss": 0.3945, "step": 3183 }, { "epoch": 0.44656381486676017, "grad_norm": 2.38945786651994, "learning_rate": 6.09931472516679e-06, "loss": 0.3596, "step": 3184 }, { "epoch": 0.4467040673211781, "grad_norm": 1.5620214110798465, "learning_rate": 6.097098937805439e-06, "loss": 0.3664, "step": 3185 }, { "epoch": 0.4468443197755961, "grad_norm": 1.8800200969555259, "learning_rate": 6.094882924065387e-06, "loss": 0.3533, "step": 3186 }, { "epoch": 0.44698457223001403, "grad_norm": 1.897013754381057, "learning_rate": 6.092666684403893e-06, "loss": 0.3512, "step": 3187 }, { "epoch": 0.447124824684432, "grad_norm": 2.0445609953847983, "learning_rate": 6.090450219278264e-06, "loss": 0.3314, "step": 3188 }, { "epoch": 0.44726507713884994, "grad_norm": 3.0185093501427844, "learning_rate": 6.088233529145849e-06, "loss": 0.3504, "step": 3189 }, { "epoch": 0.4474053295932679, "grad_norm": 1.8722599353381495, "learning_rate": 6.08601661446405e-06, "loss": 0.3604, "step": 3190 }, { "epoch": 0.44754558204768585, "grad_norm": 2.0700753705958412, "learning_rate": 6.08379947569031e-06, "loss": 0.3827, "step": 3191 }, { "epoch": 0.4476858345021038, "grad_norm": 2.4587920994686443, "learning_rate": 6.081582113282118e-06, "loss": 0.3639, "step": 3192 }, { "epoch": 0.44782608695652176, "grad_norm": 1.998231582829719, "learning_rate": 6.0793645276970145e-06, "loss": 0.4057, "step": 3193 }, { "epoch": 0.4479663394109397, "grad_norm": 4.129201425559402, "learning_rate": 6.077146719392582e-06, "loss": 0.3539, "step": 3194 }, { "epoch": 0.4481065918653576, "grad_norm": 2.300687260157076, "learning_rate": 6.07492868882645e-06, "loss": 0.3819, "step": 3195 }, { "epoch": 0.4482468443197756, "grad_norm": 2.4248465735281823, "learning_rate": 6.072710436456293e-06, "loss": 0.3839, "step": 3196 }, { "epoch": 0.4483870967741935, "grad_norm": 2.1497907226197652, "learning_rate": 6.070491962739831e-06, "loss": 0.377, "step": 3197 }, { "epoch": 0.4485273492286115, "grad_norm": 2.6192772564392364, "learning_rate": 6.068273268134832e-06, "loss": 0.3142, "step": 3198 }, { "epoch": 0.44866760168302944, "grad_norm": 2.0445735893486354, "learning_rate": 6.066054353099109e-06, "loss": 0.3713, "step": 3199 }, { "epoch": 0.4488078541374474, "grad_norm": 1.8651355018174338, "learning_rate": 6.063835218090517e-06, "loss": 0.3405, "step": 3200 }, { "epoch": 0.44894810659186535, "grad_norm": 1.9812117467610952, "learning_rate": 6.061615863566961e-06, "loss": 0.386, "step": 3201 }, { "epoch": 0.4490883590462833, "grad_norm": 2.090148903502393, "learning_rate": 6.059396289986386e-06, "loss": 0.3842, "step": 3202 }, { "epoch": 0.44922861150070126, "grad_norm": 2.192210112908474, "learning_rate": 6.057176497806791e-06, "loss": 0.3693, "step": 3203 }, { "epoch": 0.4493688639551192, "grad_norm": 2.4998645745790613, "learning_rate": 6.054956487486212e-06, "loss": 0.3648, "step": 3204 }, { "epoch": 0.44950911640953717, "grad_norm": 1.9365350720031556, "learning_rate": 6.05273625948273e-06, "loss": 0.3431, "step": 3205 }, { "epoch": 0.4496493688639551, "grad_norm": 2.1014495025093227, "learning_rate": 6.050515814254477e-06, "loss": 0.3537, "step": 3206 }, { "epoch": 0.4497896213183731, "grad_norm": 2.9500370734519947, "learning_rate": 6.0482951522596245e-06, "loss": 0.3943, "step": 3207 }, { "epoch": 0.44992987377279103, "grad_norm": 2.245159133844503, "learning_rate": 6.046074273956392e-06, "loss": 0.3545, "step": 3208 }, { "epoch": 0.450070126227209, "grad_norm": 5.0542825952344845, "learning_rate": 6.043853179803042e-06, "loss": 0.3886, "step": 3209 }, { "epoch": 0.45021037868162694, "grad_norm": 1.9314116524505913, "learning_rate": 6.041631870257882e-06, "loss": 0.3453, "step": 3210 }, { "epoch": 0.4503506311360449, "grad_norm": 2.0703286224313278, "learning_rate": 6.039410345779262e-06, "loss": 0.3513, "step": 3211 }, { "epoch": 0.45049088359046285, "grad_norm": 1.9566098062072412, "learning_rate": 6.037188606825578e-06, "loss": 0.3484, "step": 3212 }, { "epoch": 0.4506311360448808, "grad_norm": 1.8968896403368298, "learning_rate": 6.034966653855272e-06, "loss": 0.3608, "step": 3213 }, { "epoch": 0.45077138849929876, "grad_norm": 2.3166707548841816, "learning_rate": 6.032744487326827e-06, "loss": 0.3397, "step": 3214 }, { "epoch": 0.45091164095371666, "grad_norm": 3.0505120894988615, "learning_rate": 6.030522107698775e-06, "loss": 0.3851, "step": 3215 }, { "epoch": 0.4510518934081346, "grad_norm": 2.2994873346260714, "learning_rate": 6.028299515429683e-06, "loss": 0.3833, "step": 3216 }, { "epoch": 0.4511921458625526, "grad_norm": 2.173624774644811, "learning_rate": 6.026076710978172e-06, "loss": 0.3976, "step": 3217 }, { "epoch": 0.45133239831697053, "grad_norm": 2.659105359403506, "learning_rate": 6.023853694802899e-06, "loss": 0.3805, "step": 3218 }, { "epoch": 0.4514726507713885, "grad_norm": 1.9559256653458905, "learning_rate": 6.021630467362571e-06, "loss": 0.3572, "step": 3219 }, { "epoch": 0.45161290322580644, "grad_norm": 1.9863725355316788, "learning_rate": 6.0194070291159346e-06, "loss": 0.3404, "step": 3220 }, { "epoch": 0.4517531556802244, "grad_norm": 2.4565558284391837, "learning_rate": 6.017183380521777e-06, "loss": 0.3639, "step": 3221 }, { "epoch": 0.45189340813464235, "grad_norm": 1.7592824935308335, "learning_rate": 6.014959522038937e-06, "loss": 0.3115, "step": 3222 }, { "epoch": 0.4520336605890603, "grad_norm": 2.2539731020739904, "learning_rate": 6.012735454126289e-06, "loss": 0.3586, "step": 3223 }, { "epoch": 0.45217391304347826, "grad_norm": 2.2256324297880847, "learning_rate": 6.010511177242757e-06, "loss": 0.3765, "step": 3224 }, { "epoch": 0.4523141654978962, "grad_norm": 2.038525039171005, "learning_rate": 6.008286691847305e-06, "loss": 0.307, "step": 3225 }, { "epoch": 0.45245441795231417, "grad_norm": 2.0037489563480544, "learning_rate": 6.006061998398937e-06, "loss": 0.3606, "step": 3226 }, { "epoch": 0.4525946704067321, "grad_norm": 4.9308927728137935, "learning_rate": 6.003837097356704e-06, "loss": 0.3522, "step": 3227 }, { "epoch": 0.4527349228611501, "grad_norm": 2.296342852962521, "learning_rate": 6.0016119891797e-06, "loss": 0.3778, "step": 3228 }, { "epoch": 0.45287517531556803, "grad_norm": 2.4191128638479578, "learning_rate": 5.999386674327059e-06, "loss": 0.3674, "step": 3229 }, { "epoch": 0.453015427769986, "grad_norm": 2.405635606678439, "learning_rate": 5.997161153257963e-06, "loss": 0.337, "step": 3230 }, { "epoch": 0.45315568022440395, "grad_norm": 2.820683927725578, "learning_rate": 5.994935426431627e-06, "loss": 0.3826, "step": 3231 }, { "epoch": 0.4532959326788219, "grad_norm": 2.048661362486077, "learning_rate": 5.992709494307317e-06, "loss": 0.3206, "step": 3232 }, { "epoch": 0.45343618513323986, "grad_norm": 2.0968902723776677, "learning_rate": 5.9904833573443385e-06, "loss": 0.3615, "step": 3233 }, { "epoch": 0.4535764375876578, "grad_norm": 2.0925976443096888, "learning_rate": 5.9882570160020395e-06, "loss": 0.4131, "step": 3234 }, { "epoch": 0.4537166900420757, "grad_norm": 1.9800916685212002, "learning_rate": 5.986030470739811e-06, "loss": 0.3396, "step": 3235 }, { "epoch": 0.45385694249649366, "grad_norm": 1.996309809926368, "learning_rate": 5.983803722017083e-06, "loss": 0.3675, "step": 3236 }, { "epoch": 0.4539971949509116, "grad_norm": 2.26499235594009, "learning_rate": 5.981576770293329e-06, "loss": 0.3618, "step": 3237 }, { "epoch": 0.4541374474053296, "grad_norm": 2.2277240065979345, "learning_rate": 5.979349616028067e-06, "loss": 0.3764, "step": 3238 }, { "epoch": 0.45427769985974753, "grad_norm": 2.2412833806715216, "learning_rate": 5.977122259680854e-06, "loss": 0.3784, "step": 3239 }, { "epoch": 0.4544179523141655, "grad_norm": 1.7765281734320806, "learning_rate": 5.974894701711291e-06, "loss": 0.3448, "step": 3240 }, { "epoch": 0.45455820476858344, "grad_norm": 2.0283248967434355, "learning_rate": 5.9726669425790175e-06, "loss": 0.3953, "step": 3241 }, { "epoch": 0.4546984572230014, "grad_norm": 1.721646036478072, "learning_rate": 5.970438982743715e-06, "loss": 0.3801, "step": 3242 }, { "epoch": 0.45483870967741935, "grad_norm": 2.0502169750108896, "learning_rate": 5.9682108226651084e-06, "loss": 0.3713, "step": 3243 }, { "epoch": 0.4549789621318373, "grad_norm": 2.258007318348002, "learning_rate": 5.965982462802962e-06, "loss": 0.3648, "step": 3244 }, { "epoch": 0.45511921458625526, "grad_norm": 2.885161809763547, "learning_rate": 5.963753903617084e-06, "loss": 0.3749, "step": 3245 }, { "epoch": 0.4552594670406732, "grad_norm": 3.2923500864575255, "learning_rate": 5.961525145567322e-06, "loss": 0.411, "step": 3246 }, { "epoch": 0.45539971949509117, "grad_norm": 1.8702406561110336, "learning_rate": 5.959296189113563e-06, "loss": 0.3308, "step": 3247 }, { "epoch": 0.4555399719495091, "grad_norm": 2.058069730654273, "learning_rate": 5.9570670347157375e-06, "loss": 0.3714, "step": 3248 }, { "epoch": 0.4556802244039271, "grad_norm": 1.7925590096820072, "learning_rate": 5.954837682833816e-06, "loss": 0.3584, "step": 3249 }, { "epoch": 0.45582047685834504, "grad_norm": 2.027971174287306, "learning_rate": 5.95260813392781e-06, "loss": 0.3622, "step": 3250 }, { "epoch": 0.455960729312763, "grad_norm": 2.189689956909735, "learning_rate": 5.950378388457774e-06, "loss": 0.3396, "step": 3251 }, { "epoch": 0.45610098176718095, "grad_norm": 1.9452521379903005, "learning_rate": 5.948148446883794e-06, "loss": 0.3721, "step": 3252 }, { "epoch": 0.4562412342215989, "grad_norm": 2.15293961877132, "learning_rate": 5.945918309666005e-06, "loss": 0.3749, "step": 3253 }, { "epoch": 0.45638148667601686, "grad_norm": 2.0038506870114303, "learning_rate": 5.943687977264584e-06, "loss": 0.3247, "step": 3254 }, { "epoch": 0.45652173913043476, "grad_norm": 2.0209705529047195, "learning_rate": 5.941457450139741e-06, "loss": 0.3809, "step": 3255 }, { "epoch": 0.4566619915848527, "grad_norm": 2.192862776318144, "learning_rate": 5.939226728751733e-06, "loss": 0.3788, "step": 3256 }, { "epoch": 0.45680224403927067, "grad_norm": 1.7903194093495989, "learning_rate": 5.9369958135608485e-06, "loss": 0.3242, "step": 3257 }, { "epoch": 0.4569424964936886, "grad_norm": 2.9390605271458026, "learning_rate": 5.934764705027425e-06, "loss": 0.3575, "step": 3258 }, { "epoch": 0.4570827489481066, "grad_norm": 1.9489514123605827, "learning_rate": 5.932533403611835e-06, "loss": 0.4118, "step": 3259 }, { "epoch": 0.45722300140252453, "grad_norm": 6.0008687344106235, "learning_rate": 5.930301909774494e-06, "loss": 0.3886, "step": 3260 }, { "epoch": 0.4573632538569425, "grad_norm": 2.08134577991854, "learning_rate": 5.928070223975853e-06, "loss": 0.309, "step": 3261 }, { "epoch": 0.45750350631136044, "grad_norm": 2.162594293732569, "learning_rate": 5.925838346676405e-06, "loss": 0.3486, "step": 3262 }, { "epoch": 0.4576437587657784, "grad_norm": 1.9173750881207732, "learning_rate": 5.9236062783366825e-06, "loss": 0.4128, "step": 3263 }, { "epoch": 0.45778401122019635, "grad_norm": 2.1152535364581087, "learning_rate": 5.9213740194172565e-06, "loss": 0.3415, "step": 3264 }, { "epoch": 0.4579242636746143, "grad_norm": 1.8453330663040461, "learning_rate": 5.919141570378739e-06, "loss": 0.3868, "step": 3265 }, { "epoch": 0.45806451612903226, "grad_norm": 1.7771971254337957, "learning_rate": 5.916908931681781e-06, "loss": 0.341, "step": 3266 }, { "epoch": 0.4582047685834502, "grad_norm": 2.219529136004141, "learning_rate": 5.914676103787071e-06, "loss": 0.3448, "step": 3267 }, { "epoch": 0.45834502103786817, "grad_norm": 2.2818127290811847, "learning_rate": 5.912443087155336e-06, "loss": 0.3281, "step": 3268 }, { "epoch": 0.4584852734922861, "grad_norm": 2.213883987585131, "learning_rate": 5.910209882247346e-06, "loss": 0.3873, "step": 3269 }, { "epoch": 0.4586255259467041, "grad_norm": 1.802093821882768, "learning_rate": 5.9079764895239066e-06, "loss": 0.346, "step": 3270 }, { "epoch": 0.45876577840112204, "grad_norm": 2.5466901091883067, "learning_rate": 5.905742909445863e-06, "loss": 0.283, "step": 3271 }, { "epoch": 0.45890603085554, "grad_norm": 2.3335159661707436, "learning_rate": 5.903509142474095e-06, "loss": 0.3718, "step": 3272 }, { "epoch": 0.45904628330995795, "grad_norm": 2.0416784351035653, "learning_rate": 5.90127518906953e-06, "loss": 0.4019, "step": 3273 }, { "epoch": 0.4591865357643759, "grad_norm": 2.228976256417363, "learning_rate": 5.899041049693125e-06, "loss": 0.3804, "step": 3274 }, { "epoch": 0.4593267882187938, "grad_norm": 1.8191624229522616, "learning_rate": 5.896806724805881e-06, "loss": 0.3927, "step": 3275 }, { "epoch": 0.45946704067321176, "grad_norm": 2.4887976477376133, "learning_rate": 5.894572214868837e-06, "loss": 0.4222, "step": 3276 }, { "epoch": 0.4596072931276297, "grad_norm": 3.287435024765238, "learning_rate": 5.8923375203430645e-06, "loss": 0.396, "step": 3277 }, { "epoch": 0.45974754558204767, "grad_norm": 2.8422171685010915, "learning_rate": 5.890102641689679e-06, "loss": 0.358, "step": 3278 }, { "epoch": 0.4598877980364656, "grad_norm": 2.2139045567465314, "learning_rate": 5.887867579369833e-06, "loss": 0.3446, "step": 3279 }, { "epoch": 0.4600280504908836, "grad_norm": 1.5319630266762803, "learning_rate": 5.885632333844714e-06, "loss": 0.3255, "step": 3280 }, { "epoch": 0.46016830294530153, "grad_norm": 3.415389093487316, "learning_rate": 5.883396905575552e-06, "loss": 0.3676, "step": 3281 }, { "epoch": 0.4603085553997195, "grad_norm": 2.042504923521572, "learning_rate": 5.88116129502361e-06, "loss": 0.3608, "step": 3282 }, { "epoch": 0.46044880785413744, "grad_norm": 2.049226531211399, "learning_rate": 5.87892550265019e-06, "loss": 0.352, "step": 3283 }, { "epoch": 0.4605890603085554, "grad_norm": 2.1610039535973504, "learning_rate": 5.876689528916634e-06, "loss": 0.3339, "step": 3284 }, { "epoch": 0.46072931276297335, "grad_norm": 2.019556750020344, "learning_rate": 5.874453374284318e-06, "loss": 0.387, "step": 3285 }, { "epoch": 0.4608695652173913, "grad_norm": 1.742825994112724, "learning_rate": 5.872217039214659e-06, "loss": 0.3381, "step": 3286 }, { "epoch": 0.46100981767180926, "grad_norm": 1.7603801369874148, "learning_rate": 5.8699805241691065e-06, "loss": 0.3462, "step": 3287 }, { "epoch": 0.4611500701262272, "grad_norm": 2.0988487675722145, "learning_rate": 5.867743829609152e-06, "loss": 0.3237, "step": 3288 }, { "epoch": 0.4612903225806452, "grad_norm": 1.8856515334161965, "learning_rate": 5.86550695599632e-06, "loss": 0.3804, "step": 3289 }, { "epoch": 0.46143057503506313, "grad_norm": 1.71656480073384, "learning_rate": 5.863269903792174e-06, "loss": 0.3695, "step": 3290 }, { "epoch": 0.4615708274894811, "grad_norm": 2.19705717327195, "learning_rate": 5.861032673458316e-06, "loss": 0.3692, "step": 3291 }, { "epoch": 0.46171107994389904, "grad_norm": 2.3173786971887185, "learning_rate": 5.858795265456382e-06, "loss": 0.389, "step": 3292 }, { "epoch": 0.461851332398317, "grad_norm": 2.1148940611295464, "learning_rate": 5.856557680248043e-06, "loss": 0.3433, "step": 3293 }, { "epoch": 0.46199158485273495, "grad_norm": 1.6152004252213574, "learning_rate": 5.854319918295012e-06, "loss": 0.3476, "step": 3294 }, { "epoch": 0.46213183730715285, "grad_norm": 2.6472973014891856, "learning_rate": 5.8520819800590345e-06, "loss": 0.3691, "step": 3295 }, { "epoch": 0.4622720897615708, "grad_norm": 2.026335537215173, "learning_rate": 5.849843866001893e-06, "loss": 0.3845, "step": 3296 }, { "epoch": 0.46241234221598876, "grad_norm": 2.122171539125287, "learning_rate": 5.847605576585409e-06, "loss": 0.3165, "step": 3297 }, { "epoch": 0.4625525946704067, "grad_norm": 1.8328678522751738, "learning_rate": 5.845367112271434e-06, "loss": 0.3598, "step": 3298 }, { "epoch": 0.46269284712482467, "grad_norm": 6.689192228032785, "learning_rate": 5.843128473521863e-06, "loss": 0.3743, "step": 3299 }, { "epoch": 0.4628330995792426, "grad_norm": 1.911267914672481, "learning_rate": 5.840889660798621e-06, "loss": 0.3616, "step": 3300 }, { "epoch": 0.4629733520336606, "grad_norm": 1.772081840054675, "learning_rate": 5.838650674563674e-06, "loss": 0.4195, "step": 3301 }, { "epoch": 0.46311360448807853, "grad_norm": 1.750050748361724, "learning_rate": 5.836411515279018e-06, "loss": 0.3369, "step": 3302 }, { "epoch": 0.4632538569424965, "grad_norm": 3.2578135245708073, "learning_rate": 5.834172183406691e-06, "loss": 0.3728, "step": 3303 }, { "epoch": 0.46339410939691444, "grad_norm": 2.5216723430373835, "learning_rate": 5.831932679408761e-06, "loss": 0.4086, "step": 3304 }, { "epoch": 0.4635343618513324, "grad_norm": 1.7575140127434614, "learning_rate": 5.829693003747334e-06, "loss": 0.3579, "step": 3305 }, { "epoch": 0.46367461430575035, "grad_norm": 1.8340550938175775, "learning_rate": 5.827453156884553e-06, "loss": 0.3864, "step": 3306 }, { "epoch": 0.4638148667601683, "grad_norm": 1.860122466311265, "learning_rate": 5.825213139282595e-06, "loss": 0.3745, "step": 3307 }, { "epoch": 0.46395511921458626, "grad_norm": 1.9725781717995015, "learning_rate": 5.82297295140367e-06, "loss": 0.3714, "step": 3308 }, { "epoch": 0.4640953716690042, "grad_norm": 2.504362686618192, "learning_rate": 5.820732593710027e-06, "loss": 0.3386, "step": 3309 }, { "epoch": 0.4642356241234222, "grad_norm": 2.666997908841355, "learning_rate": 5.818492066663947e-06, "loss": 0.3548, "step": 3310 }, { "epoch": 0.46437587657784013, "grad_norm": 2.030195343878973, "learning_rate": 5.816251370727748e-06, "loss": 0.3548, "step": 3311 }, { "epoch": 0.4645161290322581, "grad_norm": 2.2591310234827477, "learning_rate": 5.814010506363781e-06, "loss": 0.3617, "step": 3312 }, { "epoch": 0.46465638148667604, "grad_norm": 2.0222442525611495, "learning_rate": 5.811769474034434e-06, "loss": 0.3739, "step": 3313 }, { "epoch": 0.464796633941094, "grad_norm": 1.7153365485058296, "learning_rate": 5.8095282742021265e-06, "loss": 0.3658, "step": 3314 }, { "epoch": 0.4649368863955119, "grad_norm": 1.8944831763391643, "learning_rate": 5.807286907329315e-06, "loss": 0.3341, "step": 3315 }, { "epoch": 0.46507713884992985, "grad_norm": 2.1334005250879535, "learning_rate": 5.8050453738784905e-06, "loss": 0.3499, "step": 3316 }, { "epoch": 0.4652173913043478, "grad_norm": 1.8840253413818493, "learning_rate": 5.802803674312178e-06, "loss": 0.365, "step": 3317 }, { "epoch": 0.46535764375876576, "grad_norm": 1.954463652578499, "learning_rate": 5.800561809092937e-06, "loss": 0.3251, "step": 3318 }, { "epoch": 0.4654978962131837, "grad_norm": 2.4611560542832622, "learning_rate": 5.798319778683359e-06, "loss": 0.3462, "step": 3319 }, { "epoch": 0.46563814866760167, "grad_norm": 2.0683997189024566, "learning_rate": 5.796077583546071e-06, "loss": 0.3972, "step": 3320 }, { "epoch": 0.4657784011220196, "grad_norm": 1.7039439480969512, "learning_rate": 5.793835224143737e-06, "loss": 0.3339, "step": 3321 }, { "epoch": 0.4659186535764376, "grad_norm": 1.9839075694395618, "learning_rate": 5.79159270093905e-06, "loss": 0.3833, "step": 3322 }, { "epoch": 0.46605890603085554, "grad_norm": 2.1598084430172424, "learning_rate": 5.78935001439474e-06, "loss": 0.3361, "step": 3323 }, { "epoch": 0.4661991584852735, "grad_norm": 1.9102086721142557, "learning_rate": 5.787107164973571e-06, "loss": 0.3955, "step": 3324 }, { "epoch": 0.46633941093969145, "grad_norm": 3.303319047986144, "learning_rate": 5.784864153138335e-06, "loss": 0.3655, "step": 3325 }, { "epoch": 0.4664796633941094, "grad_norm": 3.623106494834591, "learning_rate": 5.782620979351865e-06, "loss": 0.3544, "step": 3326 }, { "epoch": 0.46661991584852736, "grad_norm": 2.2960837941247556, "learning_rate": 5.780377644077025e-06, "loss": 0.3739, "step": 3327 }, { "epoch": 0.4667601683029453, "grad_norm": 2.4266344127328043, "learning_rate": 5.77813414777671e-06, "loss": 0.396, "step": 3328 }, { "epoch": 0.46690042075736327, "grad_norm": 1.9167860933773733, "learning_rate": 5.7758904909138495e-06, "loss": 0.3429, "step": 3329 }, { "epoch": 0.4670406732117812, "grad_norm": 1.970371063354331, "learning_rate": 5.773646673951406e-06, "loss": 0.3456, "step": 3330 }, { "epoch": 0.4671809256661992, "grad_norm": 2.556987601938278, "learning_rate": 5.771402697352377e-06, "loss": 0.368, "step": 3331 }, { "epoch": 0.46732117812061713, "grad_norm": 2.1711919794582464, "learning_rate": 5.769158561579793e-06, "loss": 0.3621, "step": 3332 }, { "epoch": 0.4674614305750351, "grad_norm": 1.7560727474593718, "learning_rate": 5.766914267096712e-06, "loss": 0.3555, "step": 3333 }, { "epoch": 0.46760168302945304, "grad_norm": 1.7464173292354848, "learning_rate": 5.764669814366231e-06, "loss": 0.3221, "step": 3334 }, { "epoch": 0.46774193548387094, "grad_norm": 2.2322115794910693, "learning_rate": 5.762425203851475e-06, "loss": 0.3751, "step": 3335 }, { "epoch": 0.4678821879382889, "grad_norm": 2.2694211529341057, "learning_rate": 5.760180436015604e-06, "loss": 0.3624, "step": 3336 }, { "epoch": 0.46802244039270685, "grad_norm": 2.2039002481654033, "learning_rate": 5.7579355113218125e-06, "loss": 0.3795, "step": 3337 }, { "epoch": 0.4681626928471248, "grad_norm": 2.006466781930607, "learning_rate": 5.7556904302333246e-06, "loss": 0.4034, "step": 3338 }, { "epoch": 0.46830294530154276, "grad_norm": 2.0005510286374535, "learning_rate": 5.753445193213394e-06, "loss": 0.3861, "step": 3339 }, { "epoch": 0.4684431977559607, "grad_norm": 2.5605942687559446, "learning_rate": 5.751199800725314e-06, "loss": 0.3631, "step": 3340 }, { "epoch": 0.46858345021037867, "grad_norm": 1.879051155323806, "learning_rate": 5.748954253232401e-06, "loss": 0.3753, "step": 3341 }, { "epoch": 0.4687237026647966, "grad_norm": 1.6570065048219016, "learning_rate": 5.7467085511980115e-06, "loss": 0.365, "step": 3342 }, { "epoch": 0.4688639551192146, "grad_norm": 2.313059043325296, "learning_rate": 5.74446269508553e-06, "loss": 0.4197, "step": 3343 }, { "epoch": 0.46900420757363254, "grad_norm": 1.7720323280847174, "learning_rate": 5.742216685358373e-06, "loss": 0.3931, "step": 3344 }, { "epoch": 0.4691444600280505, "grad_norm": 1.6759847593001471, "learning_rate": 5.739970522479986e-06, "loss": 0.2938, "step": 3345 }, { "epoch": 0.46928471248246845, "grad_norm": 3.2631461486540236, "learning_rate": 5.737724206913853e-06, "loss": 0.3444, "step": 3346 }, { "epoch": 0.4694249649368864, "grad_norm": 1.9549225736735016, "learning_rate": 5.735477739123484e-06, "loss": 0.4205, "step": 3347 }, { "epoch": 0.46956521739130436, "grad_norm": 2.015764571687795, "learning_rate": 5.7332311195724235e-06, "loss": 0.3939, "step": 3348 }, { "epoch": 0.4697054698457223, "grad_norm": 2.1747607263953275, "learning_rate": 5.730984348724242e-06, "loss": 0.3842, "step": 3349 }, { "epoch": 0.46984572230014027, "grad_norm": 3.6645903054684834, "learning_rate": 5.7287374270425475e-06, "loss": 0.3668, "step": 3350 }, { "epoch": 0.4699859747545582, "grad_norm": 2.122060200807133, "learning_rate": 5.7264903549909765e-06, "loss": 0.3251, "step": 3351 }, { "epoch": 0.4701262272089762, "grad_norm": 1.7407277239582615, "learning_rate": 5.724243133033197e-06, "loss": 0.3515, "step": 3352 }, { "epoch": 0.47026647966339413, "grad_norm": 2.566781351904343, "learning_rate": 5.721995761632907e-06, "loss": 0.3688, "step": 3353 }, { "epoch": 0.4704067321178121, "grad_norm": 2.8823840642212755, "learning_rate": 5.719748241253835e-06, "loss": 0.3657, "step": 3354 }, { "epoch": 0.47054698457223, "grad_norm": 1.6688280237366695, "learning_rate": 5.717500572359743e-06, "loss": 0.372, "step": 3355 }, { "epoch": 0.47068723702664794, "grad_norm": 2.09412380696328, "learning_rate": 5.71525275541442e-06, "loss": 0.3726, "step": 3356 }, { "epoch": 0.4708274894810659, "grad_norm": 2.5241915399990997, "learning_rate": 5.7130047908816884e-06, "loss": 0.3899, "step": 3357 }, { "epoch": 0.47096774193548385, "grad_norm": 1.7718955575332396, "learning_rate": 5.7107566792254e-06, "loss": 0.3203, "step": 3358 }, { "epoch": 0.4711079943899018, "grad_norm": 2.283924677306611, "learning_rate": 5.7085084209094365e-06, "loss": 0.3634, "step": 3359 }, { "epoch": 0.47124824684431976, "grad_norm": 2.0808053764611127, "learning_rate": 5.70626001639771e-06, "loss": 0.4106, "step": 3360 }, { "epoch": 0.4713884992987377, "grad_norm": 2.374983737287557, "learning_rate": 5.704011466154162e-06, "loss": 0.3463, "step": 3361 }, { "epoch": 0.4715287517531557, "grad_norm": 2.022873847840879, "learning_rate": 5.701762770642768e-06, "loss": 0.3849, "step": 3362 }, { "epoch": 0.47166900420757363, "grad_norm": 1.6075870989849208, "learning_rate": 5.6995139303275304e-06, "loss": 0.3378, "step": 3363 }, { "epoch": 0.4718092566619916, "grad_norm": 2.577531688506492, "learning_rate": 5.69726494567248e-06, "loss": 0.3853, "step": 3364 }, { "epoch": 0.47194950911640954, "grad_norm": 1.768815794890373, "learning_rate": 5.69501581714168e-06, "loss": 0.379, "step": 3365 }, { "epoch": 0.4720897615708275, "grad_norm": 2.185680940963181, "learning_rate": 5.69276654519922e-06, "loss": 0.387, "step": 3366 }, { "epoch": 0.47223001402524545, "grad_norm": 1.7264637638358167, "learning_rate": 5.690517130309223e-06, "loss": 0.298, "step": 3367 }, { "epoch": 0.4723702664796634, "grad_norm": 1.8070506027643298, "learning_rate": 5.688267572935843e-06, "loss": 0.3442, "step": 3368 }, { "epoch": 0.47251051893408136, "grad_norm": 2.3430635591450786, "learning_rate": 5.686017873543256e-06, "loss": 0.4104, "step": 3369 }, { "epoch": 0.4726507713884993, "grad_norm": 2.619387302829446, "learning_rate": 5.683768032595673e-06, "loss": 0.3674, "step": 3370 }, { "epoch": 0.47279102384291727, "grad_norm": 2.38228013392099, "learning_rate": 5.681518050557336e-06, "loss": 0.4098, "step": 3371 }, { "epoch": 0.4729312762973352, "grad_norm": 2.0181962992955453, "learning_rate": 5.679267927892509e-06, "loss": 0.3849, "step": 3372 }, { "epoch": 0.4730715287517532, "grad_norm": 1.5752061345674044, "learning_rate": 5.677017665065492e-06, "loss": 0.3858, "step": 3373 }, { "epoch": 0.47321178120617113, "grad_norm": 2.42823488043311, "learning_rate": 5.674767262540609e-06, "loss": 0.3726, "step": 3374 }, { "epoch": 0.47335203366058903, "grad_norm": 1.8587451156418637, "learning_rate": 5.672516720782216e-06, "loss": 0.4061, "step": 3375 }, { "epoch": 0.473492286115007, "grad_norm": 1.7988537715114945, "learning_rate": 5.670266040254697e-06, "loss": 0.3522, "step": 3376 }, { "epoch": 0.47363253856942494, "grad_norm": 2.201632163013018, "learning_rate": 5.668015221422463e-06, "loss": 0.3503, "step": 3377 }, { "epoch": 0.4737727910238429, "grad_norm": 1.8931766661894756, "learning_rate": 5.6657642647499545e-06, "loss": 0.3321, "step": 3378 }, { "epoch": 0.47391304347826085, "grad_norm": 2.390712861085868, "learning_rate": 5.6635131707016425e-06, "loss": 0.3922, "step": 3379 }, { "epoch": 0.4740532959326788, "grad_norm": 2.4976578231295874, "learning_rate": 5.6612619397420225e-06, "loss": 0.3675, "step": 3380 }, { "epoch": 0.47419354838709676, "grad_norm": 1.7961631359952799, "learning_rate": 5.65901057233562e-06, "loss": 0.3696, "step": 3381 }, { "epoch": 0.4743338008415147, "grad_norm": 2.4715599775620998, "learning_rate": 5.656759068946992e-06, "loss": 0.3623, "step": 3382 }, { "epoch": 0.4744740532959327, "grad_norm": 2.3838031648049163, "learning_rate": 5.6545074300407184e-06, "loss": 0.3363, "step": 3383 }, { "epoch": 0.47461430575035063, "grad_norm": 2.3269334854698323, "learning_rate": 5.652255656081409e-06, "loss": 0.3637, "step": 3384 }, { "epoch": 0.4747545582047686, "grad_norm": 2.3025510033872134, "learning_rate": 5.650003747533701e-06, "loss": 0.3759, "step": 3385 }, { "epoch": 0.47489481065918654, "grad_norm": 2.8000856011793434, "learning_rate": 5.647751704862263e-06, "loss": 0.4076, "step": 3386 }, { "epoch": 0.4750350631136045, "grad_norm": 2.9542182634490133, "learning_rate": 5.645499528531785e-06, "loss": 0.3276, "step": 3387 }, { "epoch": 0.47517531556802245, "grad_norm": 2.3767712663778657, "learning_rate": 5.643247219006989e-06, "loss": 0.3556, "step": 3388 }, { "epoch": 0.4753155680224404, "grad_norm": 1.8810863892643879, "learning_rate": 5.640994776752626e-06, "loss": 0.3316, "step": 3389 }, { "epoch": 0.47545582047685836, "grad_norm": 2.780808317281315, "learning_rate": 5.638742202233466e-06, "loss": 0.394, "step": 3390 }, { "epoch": 0.4755960729312763, "grad_norm": 2.22458493936865, "learning_rate": 5.636489495914316e-06, "loss": 0.3958, "step": 3391 }, { "epoch": 0.47573632538569427, "grad_norm": 1.764911068413006, "learning_rate": 5.6342366582600035e-06, "loss": 0.3388, "step": 3392 }, { "epoch": 0.4758765778401122, "grad_norm": 1.7564479702726126, "learning_rate": 5.6319836897353915e-06, "loss": 0.3925, "step": 3393 }, { "epoch": 0.4760168302945302, "grad_norm": 2.2664148466623737, "learning_rate": 5.629730590805358e-06, "loss": 0.3489, "step": 3394 }, { "epoch": 0.4761570827489481, "grad_norm": 3.1882549214695652, "learning_rate": 5.627477361934818e-06, "loss": 0.3658, "step": 3395 }, { "epoch": 0.47629733520336603, "grad_norm": 2.184436505806564, "learning_rate": 5.625224003588708e-06, "loss": 0.3775, "step": 3396 }, { "epoch": 0.476437587657784, "grad_norm": 2.5345394283440466, "learning_rate": 5.6229705162319926e-06, "loss": 0.3417, "step": 3397 }, { "epoch": 0.47657784011220194, "grad_norm": 2.015614827448289, "learning_rate": 5.620716900329664e-06, "loss": 0.3766, "step": 3398 }, { "epoch": 0.4767180925666199, "grad_norm": 2.294413902391428, "learning_rate": 5.61846315634674e-06, "loss": 0.3823, "step": 3399 }, { "epoch": 0.47685834502103785, "grad_norm": 1.9069105160886886, "learning_rate": 5.616209284748263e-06, "loss": 0.3784, "step": 3400 }, { "epoch": 0.4769985974754558, "grad_norm": 1.8348091716443151, "learning_rate": 5.613955285999306e-06, "loss": 0.3303, "step": 3401 }, { "epoch": 0.47713884992987377, "grad_norm": 2.007718688910692, "learning_rate": 5.611701160564965e-06, "loss": 0.3633, "step": 3402 }, { "epoch": 0.4772791023842917, "grad_norm": 1.810191558901829, "learning_rate": 5.609446908910363e-06, "loss": 0.3429, "step": 3403 }, { "epoch": 0.4774193548387097, "grad_norm": 2.0450417265797167, "learning_rate": 5.607192531500651e-06, "loss": 0.3701, "step": 3404 }, { "epoch": 0.47755960729312763, "grad_norm": 2.090208560083418, "learning_rate": 5.6049380288010016e-06, "loss": 0.3565, "step": 3405 }, { "epoch": 0.4776998597475456, "grad_norm": 2.1022561215581375, "learning_rate": 5.6026834012766155e-06, "loss": 0.412, "step": 3406 }, { "epoch": 0.47784011220196354, "grad_norm": 1.9550504425253594, "learning_rate": 5.600428649392722e-06, "loss": 0.3017, "step": 3407 }, { "epoch": 0.4779803646563815, "grad_norm": 2.109797117594696, "learning_rate": 5.5981737736145695e-06, "loss": 0.3862, "step": 3408 }, { "epoch": 0.47812061711079945, "grad_norm": 2.838247152092697, "learning_rate": 5.5959187744074396e-06, "loss": 0.3504, "step": 3409 }, { "epoch": 0.4782608695652174, "grad_norm": 1.537278744044049, "learning_rate": 5.593663652236632e-06, "loss": 0.3721, "step": 3410 }, { "epoch": 0.47840112201963536, "grad_norm": 2.149475069059831, "learning_rate": 5.59140840756748e-06, "loss": 0.3577, "step": 3411 }, { "epoch": 0.4785413744740533, "grad_norm": 1.9685367287059707, "learning_rate": 5.589153040865333e-06, "loss": 0.3619, "step": 3412 }, { "epoch": 0.47868162692847127, "grad_norm": 1.6135134033459715, "learning_rate": 5.586897552595573e-06, "loss": 0.3213, "step": 3413 }, { "epoch": 0.4788218793828892, "grad_norm": 1.6070999805846602, "learning_rate": 5.584641943223603e-06, "loss": 0.3719, "step": 3414 }, { "epoch": 0.4789621318373071, "grad_norm": 2.690635626192631, "learning_rate": 5.582386213214853e-06, "loss": 0.39, "step": 3415 }, { "epoch": 0.4791023842917251, "grad_norm": 2.3162580317251464, "learning_rate": 5.580130363034777e-06, "loss": 0.3718, "step": 3416 }, { "epoch": 0.47924263674614304, "grad_norm": 2.0521383391106705, "learning_rate": 5.577874393148854e-06, "loss": 0.402, "step": 3417 }, { "epoch": 0.479382889200561, "grad_norm": 1.8445005020881315, "learning_rate": 5.575618304022586e-06, "loss": 0.3662, "step": 3418 }, { "epoch": 0.47952314165497895, "grad_norm": 1.8203273657981704, "learning_rate": 5.573362096121504e-06, "loss": 0.3407, "step": 3419 }, { "epoch": 0.4796633941093969, "grad_norm": 2.1431574633729737, "learning_rate": 5.571105769911159e-06, "loss": 0.3632, "step": 3420 }, { "epoch": 0.47980364656381486, "grad_norm": 1.8571702724571157, "learning_rate": 5.568849325857127e-06, "loss": 0.4112, "step": 3421 }, { "epoch": 0.4799438990182328, "grad_norm": 1.8449189957615024, "learning_rate": 5.566592764425012e-06, "loss": 0.3585, "step": 3422 }, { "epoch": 0.48008415147265077, "grad_norm": 2.538577552847517, "learning_rate": 5.5643360860804385e-06, "loss": 0.3459, "step": 3423 }, { "epoch": 0.4802244039270687, "grad_norm": 2.346928183693328, "learning_rate": 5.562079291289058e-06, "loss": 0.3817, "step": 3424 }, { "epoch": 0.4803646563814867, "grad_norm": 1.8396235696691525, "learning_rate": 5.559822380516539e-06, "loss": 0.3171, "step": 3425 }, { "epoch": 0.48050490883590463, "grad_norm": 2.1224421646272247, "learning_rate": 5.557565354228586e-06, "loss": 0.3417, "step": 3426 }, { "epoch": 0.4806451612903226, "grad_norm": 2.1039042419356018, "learning_rate": 5.555308212890917e-06, "loss": 0.3585, "step": 3427 }, { "epoch": 0.48078541374474054, "grad_norm": 1.7546328120825783, "learning_rate": 5.553050956969278e-06, "loss": 0.3367, "step": 3428 }, { "epoch": 0.4809256661991585, "grad_norm": 7.208643853292607, "learning_rate": 5.550793586929437e-06, "loss": 0.368, "step": 3429 }, { "epoch": 0.48106591865357645, "grad_norm": 1.7010825973541792, "learning_rate": 5.54853610323719e-06, "loss": 0.3535, "step": 3430 }, { "epoch": 0.4812061711079944, "grad_norm": 1.8869018465844478, "learning_rate": 5.546278506358348e-06, "loss": 0.3718, "step": 3431 }, { "epoch": 0.48134642356241236, "grad_norm": 2.030271676022865, "learning_rate": 5.544020796758754e-06, "loss": 0.3677, "step": 3432 }, { "epoch": 0.4814866760168303, "grad_norm": 2.6586432950909806, "learning_rate": 5.5417629749042676e-06, "loss": 0.4122, "step": 3433 }, { "epoch": 0.4816269284712483, "grad_norm": 2.542980750739732, "learning_rate": 5.539505041260779e-06, "loss": 0.3571, "step": 3434 }, { "epoch": 0.48176718092566617, "grad_norm": 4.073130856448745, "learning_rate": 5.537246996294192e-06, "loss": 0.4074, "step": 3435 }, { "epoch": 0.4819074333800841, "grad_norm": 2.113311259678906, "learning_rate": 5.534988840470442e-06, "loss": 0.3503, "step": 3436 }, { "epoch": 0.4820476858345021, "grad_norm": 2.152660644243231, "learning_rate": 5.532730574255482e-06, "loss": 0.3299, "step": 3437 }, { "epoch": 0.48218793828892004, "grad_norm": 2.427454864050904, "learning_rate": 5.530472198115291e-06, "loss": 0.3888, "step": 3438 }, { "epoch": 0.482328190743338, "grad_norm": 1.940220246053316, "learning_rate": 5.528213712515867e-06, "loss": 0.3913, "step": 3439 }, { "epoch": 0.48246844319775595, "grad_norm": 2.677326463477122, "learning_rate": 5.525955117923235e-06, "loss": 0.3949, "step": 3440 }, { "epoch": 0.4826086956521739, "grad_norm": 1.6475824968120016, "learning_rate": 5.523696414803438e-06, "loss": 0.3056, "step": 3441 }, { "epoch": 0.48274894810659186, "grad_norm": 1.9489180766817007, "learning_rate": 5.521437603622545e-06, "loss": 0.3531, "step": 3442 }, { "epoch": 0.4828892005610098, "grad_norm": 2.204931722262916, "learning_rate": 5.519178684846646e-06, "loss": 0.3832, "step": 3443 }, { "epoch": 0.48302945301542777, "grad_norm": 1.840306833225775, "learning_rate": 5.51691965894185e-06, "loss": 0.3152, "step": 3444 }, { "epoch": 0.4831697054698457, "grad_norm": 1.677703324231544, "learning_rate": 5.514660526374298e-06, "loss": 0.3215, "step": 3445 }, { "epoch": 0.4833099579242637, "grad_norm": 2.2958022746104345, "learning_rate": 5.51240128761014e-06, "loss": 0.3513, "step": 3446 }, { "epoch": 0.48345021037868163, "grad_norm": 2.3874673371801958, "learning_rate": 5.510141943115556e-06, "loss": 0.4343, "step": 3447 }, { "epoch": 0.4835904628330996, "grad_norm": 1.7850819912413272, "learning_rate": 5.507882493356745e-06, "loss": 0.3758, "step": 3448 }, { "epoch": 0.48373071528751754, "grad_norm": 1.9206499987290235, "learning_rate": 5.505622938799933e-06, "loss": 0.3659, "step": 3449 }, { "epoch": 0.4838709677419355, "grad_norm": 1.9581775468045717, "learning_rate": 5.503363279911359e-06, "loss": 0.3599, "step": 3450 }, { "epoch": 0.48401122019635345, "grad_norm": 1.8537623861237322, "learning_rate": 5.501103517157288e-06, "loss": 0.3478, "step": 3451 }, { "epoch": 0.4841514726507714, "grad_norm": 3.8072046495226872, "learning_rate": 5.498843651004008e-06, "loss": 0.3362, "step": 3452 }, { "epoch": 0.48429172510518936, "grad_norm": 2.633015791744595, "learning_rate": 5.496583681917824e-06, "loss": 0.3554, "step": 3453 }, { "epoch": 0.4844319775596073, "grad_norm": 2.2976589811192754, "learning_rate": 5.494323610365069e-06, "loss": 0.3684, "step": 3454 }, { "epoch": 0.4845722300140252, "grad_norm": 2.609890549948166, "learning_rate": 5.49206343681209e-06, "loss": 0.3713, "step": 3455 }, { "epoch": 0.4847124824684432, "grad_norm": 2.8041734144593478, "learning_rate": 5.489803161725258e-06, "loss": 0.3958, "step": 3456 }, { "epoch": 0.48485273492286113, "grad_norm": 1.6574972243019672, "learning_rate": 5.487542785570966e-06, "loss": 0.3363, "step": 3457 }, { "epoch": 0.4849929873772791, "grad_norm": 2.212293337205754, "learning_rate": 5.485282308815626e-06, "loss": 0.3647, "step": 3458 }, { "epoch": 0.48513323983169704, "grad_norm": 1.7722098516844986, "learning_rate": 5.483021731925673e-06, "loss": 0.3631, "step": 3459 }, { "epoch": 0.485273492286115, "grad_norm": 1.913147999806873, "learning_rate": 5.48076105536756e-06, "loss": 0.3592, "step": 3460 }, { "epoch": 0.48541374474053295, "grad_norm": 2.7117592393407013, "learning_rate": 5.478500279607762e-06, "loss": 0.4193, "step": 3461 }, { "epoch": 0.4855539971949509, "grad_norm": 1.650907501000138, "learning_rate": 5.476239405112775e-06, "loss": 0.3357, "step": 3462 }, { "epoch": 0.48569424964936886, "grad_norm": 2.4092608354290586, "learning_rate": 5.4739784323491115e-06, "loss": 0.3772, "step": 3463 }, { "epoch": 0.4858345021037868, "grad_norm": 1.58084815728063, "learning_rate": 5.471717361783312e-06, "loss": 0.3757, "step": 3464 }, { "epoch": 0.48597475455820477, "grad_norm": 2.0404149974120016, "learning_rate": 5.469456193881931e-06, "loss": 0.3428, "step": 3465 }, { "epoch": 0.4861150070126227, "grad_norm": 1.6985842868575367, "learning_rate": 5.467194929111544e-06, "loss": 0.3597, "step": 3466 }, { "epoch": 0.4862552594670407, "grad_norm": 2.16851019069396, "learning_rate": 5.464933567938746e-06, "loss": 0.4061, "step": 3467 }, { "epoch": 0.48639551192145863, "grad_norm": 1.9651278523597377, "learning_rate": 5.462672110830155e-06, "loss": 0.3642, "step": 3468 }, { "epoch": 0.4865357643758766, "grad_norm": 1.7147008188939055, "learning_rate": 5.460410558252408e-06, "loss": 0.3377, "step": 3469 }, { "epoch": 0.48667601683029454, "grad_norm": 2.391262193918918, "learning_rate": 5.458148910672157e-06, "loss": 0.3791, "step": 3470 }, { "epoch": 0.4868162692847125, "grad_norm": 1.7631322768208932, "learning_rate": 5.455887168556081e-06, "loss": 0.374, "step": 3471 }, { "epoch": 0.48695652173913045, "grad_norm": 1.6535578025955604, "learning_rate": 5.453625332370872e-06, "loss": 0.3319, "step": 3472 }, { "epoch": 0.4870967741935484, "grad_norm": 2.333337261559949, "learning_rate": 5.451363402583244e-06, "loss": 0.3613, "step": 3473 }, { "epoch": 0.48723702664796636, "grad_norm": 1.7053602756422965, "learning_rate": 5.449101379659933e-06, "loss": 0.3389, "step": 3474 }, { "epoch": 0.48737727910238426, "grad_norm": 2.121889755355027, "learning_rate": 5.446839264067689e-06, "loss": 0.435, "step": 3475 }, { "epoch": 0.4875175315568022, "grad_norm": 1.9865257442447095, "learning_rate": 5.444577056273284e-06, "loss": 0.3367, "step": 3476 }, { "epoch": 0.4876577840112202, "grad_norm": 1.7861729237269275, "learning_rate": 5.442314756743511e-06, "loss": 0.3057, "step": 3477 }, { "epoch": 0.48779803646563813, "grad_norm": 2.5492892378035408, "learning_rate": 5.4400523659451775e-06, "loss": 0.3304, "step": 3478 }, { "epoch": 0.4879382889200561, "grad_norm": 2.2886710499470477, "learning_rate": 5.4377898843451126e-06, "loss": 0.352, "step": 3479 }, { "epoch": 0.48807854137447404, "grad_norm": 5.545186562181602, "learning_rate": 5.4355273124101645e-06, "loss": 0.3184, "step": 3480 }, { "epoch": 0.488218793828892, "grad_norm": 3.774633624758788, "learning_rate": 5.4332646506071986e-06, "loss": 0.356, "step": 3481 }, { "epoch": 0.48835904628330995, "grad_norm": 2.3165079387196945, "learning_rate": 5.4310018994030974e-06, "loss": 0.3809, "step": 3482 }, { "epoch": 0.4884992987377279, "grad_norm": 2.2852046602778207, "learning_rate": 5.428739059264767e-06, "loss": 0.4027, "step": 3483 }, { "epoch": 0.48863955119214586, "grad_norm": 2.0430223155247664, "learning_rate": 5.426476130659126e-06, "loss": 0.3837, "step": 3484 }, { "epoch": 0.4887798036465638, "grad_norm": 3.440571331884098, "learning_rate": 5.424213114053115e-06, "loss": 0.3405, "step": 3485 }, { "epoch": 0.48892005610098177, "grad_norm": 1.9599636745006113, "learning_rate": 5.421950009913694e-06, "loss": 0.3447, "step": 3486 }, { "epoch": 0.4890603085553997, "grad_norm": 1.5293759146540704, "learning_rate": 5.4196868187078335e-06, "loss": 0.3432, "step": 3487 }, { "epoch": 0.4892005610098177, "grad_norm": 4.38111145409954, "learning_rate": 5.417423540902531e-06, "loss": 0.3826, "step": 3488 }, { "epoch": 0.48934081346423564, "grad_norm": 1.8238884506330926, "learning_rate": 5.4151601769647974e-06, "loss": 0.3353, "step": 3489 }, { "epoch": 0.4894810659186536, "grad_norm": 1.8828008501001097, "learning_rate": 5.412896727361663e-06, "loss": 0.3363, "step": 3490 }, { "epoch": 0.48962131837307155, "grad_norm": 1.7800706673642333, "learning_rate": 5.410633192560173e-06, "loss": 0.3578, "step": 3491 }, { "epoch": 0.4897615708274895, "grad_norm": 2.9638739445969104, "learning_rate": 5.408369573027391e-06, "loss": 0.3391, "step": 3492 }, { "epoch": 0.48990182328190746, "grad_norm": 1.9088439177844712, "learning_rate": 5.406105869230402e-06, "loss": 0.3822, "step": 3493 }, { "epoch": 0.4900420757363254, "grad_norm": 2.2339910430728644, "learning_rate": 5.403842081636303e-06, "loss": 0.3821, "step": 3494 }, { "epoch": 0.4901823281907433, "grad_norm": 1.9917402775248143, "learning_rate": 5.401578210712214e-06, "loss": 0.3654, "step": 3495 }, { "epoch": 0.49032258064516127, "grad_norm": 2.0570830678009218, "learning_rate": 5.399314256925265e-06, "loss": 0.3252, "step": 3496 }, { "epoch": 0.4904628330995792, "grad_norm": 1.8526916923649244, "learning_rate": 5.39705022074261e-06, "loss": 0.3715, "step": 3497 }, { "epoch": 0.4906030855539972, "grad_norm": 2.1243883262366685, "learning_rate": 5.394786102631415e-06, "loss": 0.345, "step": 3498 }, { "epoch": 0.49074333800841513, "grad_norm": 1.8091739523079235, "learning_rate": 5.392521903058867e-06, "loss": 0.3482, "step": 3499 }, { "epoch": 0.4908835904628331, "grad_norm": 2.0642107024543783, "learning_rate": 5.390257622492166e-06, "loss": 0.3701, "step": 3500 }, { "epoch": 0.49102384291725104, "grad_norm": 1.9751294636847048, "learning_rate": 5.387993261398532e-06, "loss": 0.4017, "step": 3501 }, { "epoch": 0.491164095371669, "grad_norm": 1.6122819886206905, "learning_rate": 5.3857288202452e-06, "loss": 0.3876, "step": 3502 }, { "epoch": 0.49130434782608695, "grad_norm": 2.4007172823973297, "learning_rate": 5.383464299499419e-06, "loss": 0.3724, "step": 3503 }, { "epoch": 0.4914446002805049, "grad_norm": 1.8953572468456088, "learning_rate": 5.381199699628459e-06, "loss": 0.3597, "step": 3504 }, { "epoch": 0.49158485273492286, "grad_norm": 1.6357099751958848, "learning_rate": 5.378935021099604e-06, "loss": 0.3415, "step": 3505 }, { "epoch": 0.4917251051893408, "grad_norm": 2.2879267143436914, "learning_rate": 5.376670264380157e-06, "loss": 0.3784, "step": 3506 }, { "epoch": 0.49186535764375877, "grad_norm": 2.1087541584907776, "learning_rate": 5.374405429937431e-06, "loss": 0.3354, "step": 3507 }, { "epoch": 0.4920056100981767, "grad_norm": 1.8314910938276547, "learning_rate": 5.3721405182387595e-06, "loss": 0.3663, "step": 3508 }, { "epoch": 0.4921458625525947, "grad_norm": 2.0976318130632574, "learning_rate": 5.369875529751492e-06, "loss": 0.4138, "step": 3509 }, { "epoch": 0.49228611500701264, "grad_norm": 1.986068183446864, "learning_rate": 5.367610464942994e-06, "loss": 0.4021, "step": 3510 }, { "epoch": 0.4924263674614306, "grad_norm": 2.073745817303248, "learning_rate": 5.365345324280646e-06, "loss": 0.3797, "step": 3511 }, { "epoch": 0.49256661991584855, "grad_norm": 3.389437841205313, "learning_rate": 5.363080108231843e-06, "loss": 0.3435, "step": 3512 }, { "epoch": 0.4927068723702665, "grad_norm": 1.8007754245092866, "learning_rate": 5.360814817263995e-06, "loss": 0.3422, "step": 3513 }, { "epoch": 0.49284712482468446, "grad_norm": 2.0893833687192767, "learning_rate": 5.35854945184453e-06, "loss": 0.3853, "step": 3514 }, { "epoch": 0.49298737727910236, "grad_norm": 1.7853410153588127, "learning_rate": 5.35628401244089e-06, "loss": 0.3553, "step": 3515 }, { "epoch": 0.4931276297335203, "grad_norm": 2.5773009861835248, "learning_rate": 5.354018499520536e-06, "loss": 0.3357, "step": 3516 }, { "epoch": 0.49326788218793827, "grad_norm": 2.091574720309659, "learning_rate": 5.351752913550936e-06, "loss": 0.3581, "step": 3517 }, { "epoch": 0.4934081346423562, "grad_norm": 1.6789155827578406, "learning_rate": 5.349487254999579e-06, "loss": 0.3428, "step": 3518 }, { "epoch": 0.4935483870967742, "grad_norm": 1.900068673348398, "learning_rate": 5.34722152433397e-06, "loss": 0.3888, "step": 3519 }, { "epoch": 0.49368863955119213, "grad_norm": 6.758937993342709, "learning_rate": 5.3449557220216245e-06, "loss": 0.3516, "step": 3520 }, { "epoch": 0.4938288920056101, "grad_norm": 2.5406540797447383, "learning_rate": 5.342689848530077e-06, "loss": 0.4309, "step": 3521 }, { "epoch": 0.49396914446002804, "grad_norm": 1.7292062532727037, "learning_rate": 5.3404239043268734e-06, "loss": 0.3941, "step": 3522 }, { "epoch": 0.494109396914446, "grad_norm": 1.669510925769043, "learning_rate": 5.338157889879575e-06, "loss": 0.3484, "step": 3523 }, { "epoch": 0.49424964936886395, "grad_norm": 2.198314311379287, "learning_rate": 5.335891805655758e-06, "loss": 0.3321, "step": 3524 }, { "epoch": 0.4943899018232819, "grad_norm": 2.3344231739716306, "learning_rate": 5.333625652123014e-06, "loss": 0.3567, "step": 3525 }, { "epoch": 0.49453015427769986, "grad_norm": 1.6880870786868467, "learning_rate": 5.331359429748948e-06, "loss": 0.331, "step": 3526 }, { "epoch": 0.4946704067321178, "grad_norm": 2.1150280965467405, "learning_rate": 5.329093139001179e-06, "loss": 0.3071, "step": 3527 }, { "epoch": 0.4948106591865358, "grad_norm": 2.5290660148925186, "learning_rate": 5.326826780347339e-06, "loss": 0.384, "step": 3528 }, { "epoch": 0.49495091164095373, "grad_norm": 3.138408884876908, "learning_rate": 5.324560354255077e-06, "loss": 0.3707, "step": 3529 }, { "epoch": 0.4950911640953717, "grad_norm": 2.194923339234974, "learning_rate": 5.322293861192052e-06, "loss": 0.3669, "step": 3530 }, { "epoch": 0.49523141654978964, "grad_norm": 2.3783969678521357, "learning_rate": 5.320027301625944e-06, "loss": 0.3815, "step": 3531 }, { "epoch": 0.4953716690042076, "grad_norm": 1.4931852190753048, "learning_rate": 5.317760676024436e-06, "loss": 0.377, "step": 3532 }, { "epoch": 0.49551192145862555, "grad_norm": 1.5125846649938026, "learning_rate": 5.315493984855233e-06, "loss": 0.3236, "step": 3533 }, { "epoch": 0.4956521739130435, "grad_norm": 3.03376901313347, "learning_rate": 5.313227228586049e-06, "loss": 0.3972, "step": 3534 }, { "epoch": 0.4957924263674614, "grad_norm": 2.072108687948789, "learning_rate": 5.310960407684616e-06, "loss": 0.3874, "step": 3535 }, { "epoch": 0.49593267882187936, "grad_norm": 2.2044055004081016, "learning_rate": 5.308693522618674e-06, "loss": 0.385, "step": 3536 }, { "epoch": 0.4960729312762973, "grad_norm": 1.8551794850589114, "learning_rate": 5.306426573855983e-06, "loss": 0.3996, "step": 3537 }, { "epoch": 0.49621318373071527, "grad_norm": 2.2629413363969197, "learning_rate": 5.3041595618643075e-06, "loss": 0.3636, "step": 3538 }, { "epoch": 0.4963534361851332, "grad_norm": 2.051905964836472, "learning_rate": 5.301892487111431e-06, "loss": 0.3739, "step": 3539 }, { "epoch": 0.4964936886395512, "grad_norm": 2.8018398439106567, "learning_rate": 5.2996253500651494e-06, "loss": 0.3318, "step": 3540 }, { "epoch": 0.49663394109396913, "grad_norm": 1.9338218477361708, "learning_rate": 5.297358151193271e-06, "loss": 0.3572, "step": 3541 }, { "epoch": 0.4967741935483871, "grad_norm": 1.854012650797458, "learning_rate": 5.2950908909636144e-06, "loss": 0.3432, "step": 3542 }, { "epoch": 0.49691444600280504, "grad_norm": 2.7044443427783893, "learning_rate": 5.292823569844016e-06, "loss": 0.3533, "step": 3543 }, { "epoch": 0.497054698457223, "grad_norm": 1.7242928340813182, "learning_rate": 5.2905561883023185e-06, "loss": 0.3878, "step": 3544 }, { "epoch": 0.49719495091164095, "grad_norm": 1.7061705350711203, "learning_rate": 5.288288746806381e-06, "loss": 0.3568, "step": 3545 }, { "epoch": 0.4973352033660589, "grad_norm": 2.1937535951590355, "learning_rate": 5.286021245824075e-06, "loss": 0.3746, "step": 3546 }, { "epoch": 0.49747545582047686, "grad_norm": 2.710114015946774, "learning_rate": 5.283753685823284e-06, "loss": 0.3749, "step": 3547 }, { "epoch": 0.4976157082748948, "grad_norm": 1.7869258598782944, "learning_rate": 5.2814860672719015e-06, "loss": 0.4042, "step": 3548 }, { "epoch": 0.4977559607293128, "grad_norm": 2.0259953536858712, "learning_rate": 5.2792183906378355e-06, "loss": 0.3557, "step": 3549 }, { "epoch": 0.49789621318373073, "grad_norm": 1.730785263427423, "learning_rate": 5.276950656389006e-06, "loss": 0.3417, "step": 3550 }, { "epoch": 0.4980364656381487, "grad_norm": 1.6818038737135972, "learning_rate": 5.274682864993344e-06, "loss": 0.3935, "step": 3551 }, { "epoch": 0.49817671809256664, "grad_norm": 1.7037149509958602, "learning_rate": 5.272415016918792e-06, "loss": 0.3445, "step": 3552 }, { "epoch": 0.4983169705469846, "grad_norm": 1.990156387433322, "learning_rate": 5.270147112633304e-06, "loss": 0.3534, "step": 3553 }, { "epoch": 0.49845722300140255, "grad_norm": 1.7371029406995182, "learning_rate": 5.2678791526048465e-06, "loss": 0.3388, "step": 3554 }, { "epoch": 0.49859747545582045, "grad_norm": 2.3898317105406837, "learning_rate": 5.265611137301397e-06, "loss": 0.3668, "step": 3555 }, { "epoch": 0.4987377279102384, "grad_norm": 2.6702580606570874, "learning_rate": 5.263343067190945e-06, "loss": 0.3949, "step": 3556 }, { "epoch": 0.49887798036465636, "grad_norm": 1.9808089285783332, "learning_rate": 5.261074942741492e-06, "loss": 0.3311, "step": 3557 }, { "epoch": 0.4990182328190743, "grad_norm": 1.8343156509286567, "learning_rate": 5.258806764421048e-06, "loss": 0.3429, "step": 3558 }, { "epoch": 0.49915848527349227, "grad_norm": 2.0537746707969164, "learning_rate": 5.256538532697636e-06, "loss": 0.3697, "step": 3559 }, { "epoch": 0.4992987377279102, "grad_norm": 1.907017975244886, "learning_rate": 5.254270248039291e-06, "loss": 0.3934, "step": 3560 }, { "epoch": 0.4994389901823282, "grad_norm": 2.1977482108936703, "learning_rate": 5.2520019109140555e-06, "loss": 0.3232, "step": 3561 }, { "epoch": 0.49957924263674613, "grad_norm": 2.6939093876746236, "learning_rate": 5.249733521789987e-06, "loss": 0.3435, "step": 3562 }, { "epoch": 0.4997194950911641, "grad_norm": 2.0596759998739578, "learning_rate": 5.247465081135153e-06, "loss": 0.358, "step": 3563 }, { "epoch": 0.49985974754558204, "grad_norm": 2.9638768404908706, "learning_rate": 5.245196589417625e-06, "loss": 0.3614, "step": 3564 }, { "epoch": 0.5, "grad_norm": 1.8750654209167998, "learning_rate": 5.2429280471054954e-06, "loss": 0.3104, "step": 3565 }, { "epoch": 0.500140252454418, "grad_norm": 2.060994726027136, "learning_rate": 5.24065945466686e-06, "loss": 0.3716, "step": 3566 }, { "epoch": 0.5002805049088359, "grad_norm": 1.9472321380215232, "learning_rate": 5.238390812569828e-06, "loss": 0.3796, "step": 3567 }, { "epoch": 0.5004207573632539, "grad_norm": 1.9917087353635414, "learning_rate": 5.2361221212825175e-06, "loss": 0.3885, "step": 3568 }, { "epoch": 0.5005610098176718, "grad_norm": 1.4907006650390289, "learning_rate": 5.2338533812730565e-06, "loss": 0.3813, "step": 3569 }, { "epoch": 0.5007012622720898, "grad_norm": 2.0036792291909427, "learning_rate": 5.2315845930095845e-06, "loss": 0.3572, "step": 3570 }, { "epoch": 0.5008415147265077, "grad_norm": 2.1267380058297434, "learning_rate": 5.229315756960249e-06, "loss": 0.3263, "step": 3571 }, { "epoch": 0.5009817671809257, "grad_norm": 2.264529259207773, "learning_rate": 5.227046873593211e-06, "loss": 0.4315, "step": 3572 }, { "epoch": 0.5011220196353436, "grad_norm": 4.043644503082531, "learning_rate": 5.224777943376635e-06, "loss": 0.3515, "step": 3573 }, { "epoch": 0.5012622720897616, "grad_norm": 2.473619125936771, "learning_rate": 5.222508966778702e-06, "loss": 0.3678, "step": 3574 }, { "epoch": 0.5014025245441796, "grad_norm": 2.0144578970275067, "learning_rate": 5.220239944267598e-06, "loss": 0.3573, "step": 3575 }, { "epoch": 0.5015427769985975, "grad_norm": 2.1544543543345003, "learning_rate": 5.21797087631152e-06, "loss": 0.3841, "step": 3576 }, { "epoch": 0.5016830294530155, "grad_norm": 1.7002194319081827, "learning_rate": 5.215701763378673e-06, "loss": 0.3742, "step": 3577 }, { "epoch": 0.5018232819074334, "grad_norm": 5.329862339324754, "learning_rate": 5.213432605937278e-06, "loss": 0.3282, "step": 3578 }, { "epoch": 0.5019635343618514, "grad_norm": 2.3313369612032413, "learning_rate": 5.211163404455553e-06, "loss": 0.3547, "step": 3579 }, { "epoch": 0.5021037868162693, "grad_norm": 1.9136849167195042, "learning_rate": 5.208894159401735e-06, "loss": 0.41, "step": 3580 }, { "epoch": 0.5022440392706873, "grad_norm": 2.0810312012913914, "learning_rate": 5.206624871244066e-06, "loss": 0.3375, "step": 3581 }, { "epoch": 0.5023842917251052, "grad_norm": 1.9107683133379803, "learning_rate": 5.204355540450799e-06, "loss": 0.3901, "step": 3582 }, { "epoch": 0.5025245441795232, "grad_norm": 2.1011609682990424, "learning_rate": 5.202086167490196e-06, "loss": 0.3749, "step": 3583 }, { "epoch": 0.5026647966339411, "grad_norm": 1.8980022920568553, "learning_rate": 5.199816752830523e-06, "loss": 0.3784, "step": 3584 }, { "epoch": 0.5028050490883591, "grad_norm": 1.8199829982131124, "learning_rate": 5.197547296940059e-06, "loss": 0.37, "step": 3585 }, { "epoch": 0.502945301542777, "grad_norm": 2.9263206565308026, "learning_rate": 5.19527780028709e-06, "loss": 0.4367, "step": 3586 }, { "epoch": 0.5030855539971949, "grad_norm": 1.8786349348515863, "learning_rate": 5.19300826333991e-06, "loss": 0.3172, "step": 3587 }, { "epoch": 0.5032258064516129, "grad_norm": 2.3408089049836764, "learning_rate": 5.190738686566826e-06, "loss": 0.362, "step": 3588 }, { "epoch": 0.5033660589060308, "grad_norm": 2.537648719977014, "learning_rate": 5.188469070436145e-06, "loss": 0.3214, "step": 3589 }, { "epoch": 0.5035063113604488, "grad_norm": 2.062053632118277, "learning_rate": 5.186199415416188e-06, "loss": 0.4148, "step": 3590 }, { "epoch": 0.5036465638148667, "grad_norm": 1.6267476587508785, "learning_rate": 5.183929721975282e-06, "loss": 0.3257, "step": 3591 }, { "epoch": 0.5037868162692847, "grad_norm": 2.3694154433751047, "learning_rate": 5.181659990581764e-06, "loss": 0.3757, "step": 3592 }, { "epoch": 0.5039270687237026, "grad_norm": 1.6616743977684734, "learning_rate": 5.1793902217039775e-06, "loss": 0.3486, "step": 3593 }, { "epoch": 0.5040673211781206, "grad_norm": 2.2586167958791594, "learning_rate": 5.177120415810271e-06, "loss": 0.3302, "step": 3594 }, { "epoch": 0.5042075736325385, "grad_norm": 1.9068985132938883, "learning_rate": 5.1748505733690035e-06, "loss": 0.3592, "step": 3595 }, { "epoch": 0.5043478260869565, "grad_norm": 2.5848335606544186, "learning_rate": 5.172580694848541e-06, "loss": 0.3949, "step": 3596 }, { "epoch": 0.5044880785413745, "grad_norm": 1.7342057703984415, "learning_rate": 5.170310780717259e-06, "loss": 0.3646, "step": 3597 }, { "epoch": 0.5046283309957924, "grad_norm": 1.9273732010315718, "learning_rate": 5.1680408314435385e-06, "loss": 0.3116, "step": 3598 }, { "epoch": 0.5047685834502104, "grad_norm": 2.009923159520715, "learning_rate": 5.1657708474957645e-06, "loss": 0.3863, "step": 3599 }, { "epoch": 0.5049088359046283, "grad_norm": 2.21667964137972, "learning_rate": 5.163500829342334e-06, "loss": 0.3442, "step": 3600 }, { "epoch": 0.5050490883590463, "grad_norm": 1.883318378029018, "learning_rate": 5.16123077745165e-06, "loss": 0.3613, "step": 3601 }, { "epoch": 0.5051893408134642, "grad_norm": 2.1520639227268417, "learning_rate": 5.158960692292122e-06, "loss": 0.3846, "step": 3602 }, { "epoch": 0.5053295932678822, "grad_norm": 1.798095752848141, "learning_rate": 5.156690574332167e-06, "loss": 0.3953, "step": 3603 }, { "epoch": 0.5054698457223001, "grad_norm": 2.810529569634874, "learning_rate": 5.154420424040205e-06, "loss": 0.3171, "step": 3604 }, { "epoch": 0.5056100981767181, "grad_norm": 1.724401303761626, "learning_rate": 5.152150241884669e-06, "loss": 0.3198, "step": 3605 }, { "epoch": 0.505750350631136, "grad_norm": 2.76260840777175, "learning_rate": 5.149880028333992e-06, "loss": 0.3707, "step": 3606 }, { "epoch": 0.505890603085554, "grad_norm": 3.0973069521989802, "learning_rate": 5.147609783856619e-06, "loss": 0.3905, "step": 3607 }, { "epoch": 0.506030855539972, "grad_norm": 2.164701401908316, "learning_rate": 5.145339508920998e-06, "loss": 0.3626, "step": 3608 }, { "epoch": 0.5061711079943899, "grad_norm": 2.131523441437247, "learning_rate": 5.143069203995586e-06, "loss": 0.3517, "step": 3609 }, { "epoch": 0.5063113604488079, "grad_norm": 2.3198181036641934, "learning_rate": 5.140798869548841e-06, "loss": 0.3654, "step": 3610 }, { "epoch": 0.5064516129032258, "grad_norm": 2.3831153552092545, "learning_rate": 5.138528506049234e-06, "loss": 0.379, "step": 3611 }, { "epoch": 0.5065918653576438, "grad_norm": 1.9456881731060405, "learning_rate": 5.1362581139652375e-06, "loss": 0.3575, "step": 3612 }, { "epoch": 0.5067321178120617, "grad_norm": 1.5284935411261253, "learning_rate": 5.133987693765332e-06, "loss": 0.419, "step": 3613 }, { "epoch": 0.5068723702664797, "grad_norm": 1.9078647465780565, "learning_rate": 5.131717245918001e-06, "loss": 0.3807, "step": 3614 }, { "epoch": 0.5070126227208976, "grad_norm": 1.6538089541583236, "learning_rate": 5.129446770891738e-06, "loss": 0.3516, "step": 3615 }, { "epoch": 0.5071528751753156, "grad_norm": 2.3782428887681077, "learning_rate": 5.1271762691550375e-06, "loss": 0.3643, "step": 3616 }, { "epoch": 0.5072931276297336, "grad_norm": 2.3744836547121904, "learning_rate": 5.124905741176402e-06, "loss": 0.3823, "step": 3617 }, { "epoch": 0.5074333800841515, "grad_norm": 1.7064750690622985, "learning_rate": 5.122635187424339e-06, "loss": 0.3316, "step": 3618 }, { "epoch": 0.5075736325385695, "grad_norm": 4.934697188620978, "learning_rate": 5.120364608367363e-06, "loss": 0.3847, "step": 3619 }, { "epoch": 0.5077138849929874, "grad_norm": 1.9209568342554906, "learning_rate": 5.11809400447399e-06, "loss": 0.4058, "step": 3620 }, { "epoch": 0.5078541374474054, "grad_norm": 3.247819168816595, "learning_rate": 5.115823376212744e-06, "loss": 0.3651, "step": 3621 }, { "epoch": 0.5079943899018233, "grad_norm": 1.863488271796615, "learning_rate": 5.113552724052154e-06, "loss": 0.3654, "step": 3622 }, { "epoch": 0.5081346423562413, "grad_norm": 2.1383164480627426, "learning_rate": 5.111282048460753e-06, "loss": 0.3802, "step": 3623 }, { "epoch": 0.5082748948106592, "grad_norm": 1.8316205505846053, "learning_rate": 5.109011349907079e-06, "loss": 0.3543, "step": 3624 }, { "epoch": 0.5084151472650772, "grad_norm": 2.4632399672010936, "learning_rate": 5.106740628859674e-06, "loss": 0.363, "step": 3625 }, { "epoch": 0.508555399719495, "grad_norm": 1.7758033868540197, "learning_rate": 5.1044698857870875e-06, "loss": 0.3646, "step": 3626 }, { "epoch": 0.508695652173913, "grad_norm": 1.9179749032288962, "learning_rate": 5.102199121157869e-06, "loss": 0.3397, "step": 3627 }, { "epoch": 0.508835904628331, "grad_norm": 3.0746252622449557, "learning_rate": 5.099928335440575e-06, "loss": 0.3228, "step": 3628 }, { "epoch": 0.5089761570827489, "grad_norm": 1.968233010364934, "learning_rate": 5.097657529103769e-06, "loss": 0.3754, "step": 3629 }, { "epoch": 0.5091164095371669, "grad_norm": 1.7492075215669722, "learning_rate": 5.095386702616012e-06, "loss": 0.3433, "step": 3630 }, { "epoch": 0.5092566619915848, "grad_norm": 4.336460501256189, "learning_rate": 5.093115856445876e-06, "loss": 0.3771, "step": 3631 }, { "epoch": 0.5093969144460028, "grad_norm": 2.24498730815197, "learning_rate": 5.090844991061934e-06, "loss": 0.3238, "step": 3632 }, { "epoch": 0.5095371669004207, "grad_norm": 2.1608264290488095, "learning_rate": 5.088574106932762e-06, "loss": 0.3813, "step": 3633 }, { "epoch": 0.5096774193548387, "grad_norm": 2.1584342176899267, "learning_rate": 5.0863032045269435e-06, "loss": 0.3762, "step": 3634 }, { "epoch": 0.5098176718092566, "grad_norm": 1.5673269101450311, "learning_rate": 5.0840322843130606e-06, "loss": 0.3247, "step": 3635 }, { "epoch": 0.5099579242636746, "grad_norm": 1.6479393807576714, "learning_rate": 5.081761346759703e-06, "loss": 0.3493, "step": 3636 }, { "epoch": 0.5100981767180925, "grad_norm": 2.0252255820382308, "learning_rate": 5.079490392335463e-06, "loss": 0.3473, "step": 3637 }, { "epoch": 0.5102384291725105, "grad_norm": 2.540666091417728, "learning_rate": 5.077219421508936e-06, "loss": 0.3793, "step": 3638 }, { "epoch": 0.5103786816269285, "grad_norm": 4.3856048664415335, "learning_rate": 5.074948434748721e-06, "loss": 0.3508, "step": 3639 }, { "epoch": 0.5105189340813464, "grad_norm": 2.013121003244975, "learning_rate": 5.072677432523418e-06, "loss": 0.3542, "step": 3640 }, { "epoch": 0.5106591865357644, "grad_norm": 2.7909712043788404, "learning_rate": 5.070406415301637e-06, "loss": 0.3897, "step": 3641 }, { "epoch": 0.5107994389901823, "grad_norm": 1.9031620967863687, "learning_rate": 5.068135383551983e-06, "loss": 0.357, "step": 3642 }, { "epoch": 0.5109396914446003, "grad_norm": 2.06840986238842, "learning_rate": 5.065864337743068e-06, "loss": 0.4024, "step": 3643 }, { "epoch": 0.5110799438990182, "grad_norm": 1.7308085433049116, "learning_rate": 5.06359327834351e-06, "loss": 0.4072, "step": 3644 }, { "epoch": 0.5112201963534362, "grad_norm": 1.795656271906842, "learning_rate": 5.06132220582192e-06, "loss": 0.3863, "step": 3645 }, { "epoch": 0.5113604488078541, "grad_norm": 2.0431594322012825, "learning_rate": 5.059051120646924e-06, "loss": 0.3829, "step": 3646 }, { "epoch": 0.5115007012622721, "grad_norm": 2.0105892232522793, "learning_rate": 5.0567800232871404e-06, "loss": 0.3652, "step": 3647 }, { "epoch": 0.51164095371669, "grad_norm": 2.2719089049022347, "learning_rate": 5.0545089142111945e-06, "loss": 0.3538, "step": 3648 }, { "epoch": 0.511781206171108, "grad_norm": 2.0813496746217575, "learning_rate": 5.052237793887717e-06, "loss": 0.3653, "step": 3649 }, { "epoch": 0.511921458625526, "grad_norm": 2.3126416549598363, "learning_rate": 5.049966662785335e-06, "loss": 0.3948, "step": 3650 }, { "epoch": 0.5120617110799439, "grad_norm": 2.4722443006960737, "learning_rate": 5.047695521372681e-06, "loss": 0.3796, "step": 3651 }, { "epoch": 0.5122019635343619, "grad_norm": 1.6406566798466669, "learning_rate": 5.045424370118389e-06, "loss": 0.3172, "step": 3652 }, { "epoch": 0.5123422159887798, "grad_norm": 3.292379632022623, "learning_rate": 5.043153209491095e-06, "loss": 0.3419, "step": 3653 }, { "epoch": 0.5124824684431978, "grad_norm": 1.9069795306927018, "learning_rate": 5.04088203995944e-06, "loss": 0.3676, "step": 3654 }, { "epoch": 0.5126227208976157, "grad_norm": 2.2450576117245093, "learning_rate": 5.03861086199206e-06, "loss": 0.3471, "step": 3655 }, { "epoch": 0.5127629733520337, "grad_norm": 1.822749916806008, "learning_rate": 5.036339676057599e-06, "loss": 0.3918, "step": 3656 }, { "epoch": 0.5129032258064516, "grad_norm": 1.960890932784039, "learning_rate": 5.0340684826247e-06, "loss": 0.3236, "step": 3657 }, { "epoch": 0.5130434782608696, "grad_norm": 1.8710556504227125, "learning_rate": 5.031797282162007e-06, "loss": 0.3751, "step": 3658 }, { "epoch": 0.5131837307152876, "grad_norm": 1.8810988102482114, "learning_rate": 5.029526075138167e-06, "loss": 0.3539, "step": 3659 }, { "epoch": 0.5133239831697055, "grad_norm": 1.880791113259444, "learning_rate": 5.027254862021829e-06, "loss": 0.3937, "step": 3660 }, { "epoch": 0.5134642356241235, "grad_norm": 1.8354542197615762, "learning_rate": 5.024983643281639e-06, "loss": 0.3614, "step": 3661 }, { "epoch": 0.5136044880785414, "grad_norm": 1.9082546012440125, "learning_rate": 5.022712419386248e-06, "loss": 0.402, "step": 3662 }, { "epoch": 0.5137447405329594, "grad_norm": 2.2186705884020026, "learning_rate": 5.020441190804309e-06, "loss": 0.376, "step": 3663 }, { "epoch": 0.5138849929873773, "grad_norm": 2.4373399730872958, "learning_rate": 5.018169958004474e-06, "loss": 0.3995, "step": 3664 }, { "epoch": 0.5140252454417953, "grad_norm": 2.752847151315721, "learning_rate": 5.015898721455394e-06, "loss": 0.3203, "step": 3665 }, { "epoch": 0.5141654978962131, "grad_norm": 2.470483099145603, "learning_rate": 5.013627481625725e-06, "loss": 0.3551, "step": 3666 }, { "epoch": 0.5143057503506311, "grad_norm": 1.817088731097163, "learning_rate": 5.011356238984121e-06, "loss": 0.3939, "step": 3667 }, { "epoch": 0.514446002805049, "grad_norm": 2.81747767455658, "learning_rate": 5.009084993999234e-06, "loss": 0.358, "step": 3668 }, { "epoch": 0.514586255259467, "grad_norm": 2.215296784747574, "learning_rate": 5.006813747139722e-06, "loss": 0.3998, "step": 3669 }, { "epoch": 0.514726507713885, "grad_norm": 2.158325302325101, "learning_rate": 5.004542498874244e-06, "loss": 0.3307, "step": 3670 }, { "epoch": 0.5148667601683029, "grad_norm": 2.1141317351987348, "learning_rate": 5.002271249671451e-06, "loss": 0.3485, "step": 3671 }, { "epoch": 0.5150070126227209, "grad_norm": 1.9612532281697088, "learning_rate": 5e-06, "loss": 0.3578, "step": 3672 }, { "epoch": 0.5151472650771388, "grad_norm": 2.0698050398859325, "learning_rate": 4.997728750328551e-06, "loss": 0.3458, "step": 3673 }, { "epoch": 0.5152875175315568, "grad_norm": 2.2935707042998605, "learning_rate": 4.995457501125758e-06, "loss": 0.3664, "step": 3674 }, { "epoch": 0.5154277699859747, "grad_norm": 1.951666447579458, "learning_rate": 4.9931862528602784e-06, "loss": 0.3625, "step": 3675 }, { "epoch": 0.5155680224403927, "grad_norm": 3.081016770022228, "learning_rate": 4.990915006000767e-06, "loss": 0.3442, "step": 3676 }, { "epoch": 0.5157082748948106, "grad_norm": 1.8115296890685593, "learning_rate": 4.988643761015881e-06, "loss": 0.3434, "step": 3677 }, { "epoch": 0.5158485273492286, "grad_norm": 1.6531970118394341, "learning_rate": 4.986372518374276e-06, "loss": 0.3027, "step": 3678 }, { "epoch": 0.5159887798036465, "grad_norm": 3.186839016466021, "learning_rate": 4.984101278544607e-06, "loss": 0.3046, "step": 3679 }, { "epoch": 0.5161290322580645, "grad_norm": 1.7695053806056051, "learning_rate": 4.981830041995527e-06, "loss": 0.3846, "step": 3680 }, { "epoch": 0.5162692847124825, "grad_norm": 1.729302489067152, "learning_rate": 4.9795588091956906e-06, "loss": 0.3639, "step": 3681 }, { "epoch": 0.5164095371669004, "grad_norm": 1.8667818530624347, "learning_rate": 4.977287580613752e-06, "loss": 0.3874, "step": 3682 }, { "epoch": 0.5165497896213184, "grad_norm": 2.211993069953479, "learning_rate": 4.975016356718364e-06, "loss": 0.3514, "step": 3683 }, { "epoch": 0.5166900420757363, "grad_norm": 2.602827343174196, "learning_rate": 4.9727451379781735e-06, "loss": 0.3593, "step": 3684 }, { "epoch": 0.5168302945301543, "grad_norm": 2.4770237827014014, "learning_rate": 4.970473924861835e-06, "loss": 0.3509, "step": 3685 }, { "epoch": 0.5169705469845722, "grad_norm": 1.836307179993493, "learning_rate": 4.968202717837996e-06, "loss": 0.3625, "step": 3686 }, { "epoch": 0.5171107994389902, "grad_norm": 2.473193262491885, "learning_rate": 4.9659315173753026e-06, "loss": 0.3937, "step": 3687 }, { "epoch": 0.5172510518934081, "grad_norm": 2.1534112128276806, "learning_rate": 4.963660323942403e-06, "loss": 0.3567, "step": 3688 }, { "epoch": 0.5173913043478261, "grad_norm": 1.8020580341422825, "learning_rate": 4.961389138007942e-06, "loss": 0.3647, "step": 3689 }, { "epoch": 0.517531556802244, "grad_norm": 1.6571617495935018, "learning_rate": 4.9591179600405615e-06, "loss": 0.3688, "step": 3690 }, { "epoch": 0.517671809256662, "grad_norm": 2.2593634007579593, "learning_rate": 4.956846790508906e-06, "loss": 0.3648, "step": 3691 }, { "epoch": 0.51781206171108, "grad_norm": 1.8395922705612158, "learning_rate": 4.954575629881613e-06, "loss": 0.3832, "step": 3692 }, { "epoch": 0.5179523141654979, "grad_norm": 1.942237844758353, "learning_rate": 4.9523044786273214e-06, "loss": 0.3417, "step": 3693 }, { "epoch": 0.5180925666199159, "grad_norm": 2.1211825469874213, "learning_rate": 4.950033337214667e-06, "loss": 0.3591, "step": 3694 }, { "epoch": 0.5182328190743338, "grad_norm": 2.213239462175558, "learning_rate": 4.947762206112285e-06, "loss": 0.3899, "step": 3695 }, { "epoch": 0.5183730715287518, "grad_norm": 1.825873006752082, "learning_rate": 4.945491085788806e-06, "loss": 0.3789, "step": 3696 }, { "epoch": 0.5185133239831697, "grad_norm": 1.8896315305283706, "learning_rate": 4.943219976712862e-06, "loss": 0.3432, "step": 3697 }, { "epoch": 0.5186535764375877, "grad_norm": 2.1163108671823134, "learning_rate": 4.940948879353078e-06, "loss": 0.3378, "step": 3698 }, { "epoch": 0.5187938288920056, "grad_norm": 2.6058237728980043, "learning_rate": 4.93867779417808e-06, "loss": 0.292, "step": 3699 }, { "epoch": 0.5189340813464236, "grad_norm": 1.6119031585209922, "learning_rate": 4.936406721656492e-06, "loss": 0.3112, "step": 3700 }, { "epoch": 0.5190743338008416, "grad_norm": 2.1174694591423013, "learning_rate": 4.934135662256932e-06, "loss": 0.3844, "step": 3701 }, { "epoch": 0.5192145862552595, "grad_norm": 7.220989193768638, "learning_rate": 4.9318646164480175e-06, "loss": 0.3769, "step": 3702 }, { "epoch": 0.5193548387096775, "grad_norm": 3.0002214032172283, "learning_rate": 4.929593584698363e-06, "loss": 0.3206, "step": 3703 }, { "epoch": 0.5194950911640954, "grad_norm": 2.3109269849268466, "learning_rate": 4.927322567476584e-06, "loss": 0.3531, "step": 3704 }, { "epoch": 0.5196353436185134, "grad_norm": 4.505017344518672, "learning_rate": 4.925051565251282e-06, "loss": 0.3527, "step": 3705 }, { "epoch": 0.5197755960729312, "grad_norm": 2.2171866053336813, "learning_rate": 4.922780578491067e-06, "loss": 0.4189, "step": 3706 }, { "epoch": 0.5199158485273492, "grad_norm": 2.1710867789349786, "learning_rate": 4.92050960766454e-06, "loss": 0.3455, "step": 3707 }, { "epoch": 0.5200561009817671, "grad_norm": 2.3662179906647873, "learning_rate": 4.918238653240299e-06, "loss": 0.35, "step": 3708 }, { "epoch": 0.5201963534361851, "grad_norm": 1.7285903828289937, "learning_rate": 4.915967715686941e-06, "loss": 0.3587, "step": 3709 }, { "epoch": 0.520336605890603, "grad_norm": 1.4787423326643403, "learning_rate": 4.913696795473058e-06, "loss": 0.3185, "step": 3710 }, { "epoch": 0.520476858345021, "grad_norm": 1.82482076836039, "learning_rate": 4.911425893067239e-06, "loss": 0.3134, "step": 3711 }, { "epoch": 0.520617110799439, "grad_norm": 2.0070776874995264, "learning_rate": 4.909155008938068e-06, "loss": 0.3574, "step": 3712 }, { "epoch": 0.5207573632538569, "grad_norm": 1.733567333321009, "learning_rate": 4.906884143554126e-06, "loss": 0.384, "step": 3713 }, { "epoch": 0.5208976157082749, "grad_norm": 3.099018735323684, "learning_rate": 4.9046132973839895e-06, "loss": 0.3718, "step": 3714 }, { "epoch": 0.5210378681626928, "grad_norm": 3.708477417507696, "learning_rate": 4.9023424708962334e-06, "loss": 0.391, "step": 3715 }, { "epoch": 0.5211781206171108, "grad_norm": 2.453857021086955, "learning_rate": 4.900071664559427e-06, "loss": 0.3457, "step": 3716 }, { "epoch": 0.5213183730715287, "grad_norm": 2.3273839603265722, "learning_rate": 4.897800878842133e-06, "loss": 0.3587, "step": 3717 }, { "epoch": 0.5214586255259467, "grad_norm": 2.996705312370072, "learning_rate": 4.895530114212913e-06, "loss": 0.3471, "step": 3718 }, { "epoch": 0.5215988779803646, "grad_norm": 6.115569659544578, "learning_rate": 4.893259371140326e-06, "loss": 0.3556, "step": 3719 }, { "epoch": 0.5217391304347826, "grad_norm": 1.8397433826583083, "learning_rate": 4.890988650092922e-06, "loss": 0.3339, "step": 3720 }, { "epoch": 0.5218793828892005, "grad_norm": 1.6192506170703065, "learning_rate": 4.8887179515392465e-06, "loss": 0.3689, "step": 3721 }, { "epoch": 0.5220196353436185, "grad_norm": 2.706800839084897, "learning_rate": 4.886447275947846e-06, "loss": 0.3595, "step": 3722 }, { "epoch": 0.5221598877980365, "grad_norm": 2.034408461241353, "learning_rate": 4.8841766237872555e-06, "loss": 0.3373, "step": 3723 }, { "epoch": 0.5223001402524544, "grad_norm": 2.7751033454415457, "learning_rate": 4.8819059955260105e-06, "loss": 0.3561, "step": 3724 }, { "epoch": 0.5224403927068724, "grad_norm": 2.630091860367186, "learning_rate": 4.87963539163264e-06, "loss": 0.3449, "step": 3725 }, { "epoch": 0.5225806451612903, "grad_norm": 2.647019808612603, "learning_rate": 4.877364812575663e-06, "loss": 0.3791, "step": 3726 }, { "epoch": 0.5227208976157083, "grad_norm": 1.968073229043711, "learning_rate": 4.875094258823601e-06, "loss": 0.3294, "step": 3727 }, { "epoch": 0.5228611500701262, "grad_norm": 1.6059381026314938, "learning_rate": 4.872823730844966e-06, "loss": 0.349, "step": 3728 }, { "epoch": 0.5230014025245442, "grad_norm": 2.0848585903231633, "learning_rate": 4.8705532291082644e-06, "loss": 0.356, "step": 3729 }, { "epoch": 0.5231416549789621, "grad_norm": 1.9999534482306636, "learning_rate": 4.868282754082e-06, "loss": 0.3147, "step": 3730 }, { "epoch": 0.5232819074333801, "grad_norm": 1.9251763746040882, "learning_rate": 4.866012306234669e-06, "loss": 0.3915, "step": 3731 }, { "epoch": 0.523422159887798, "grad_norm": 2.9706849869494767, "learning_rate": 4.863741886034764e-06, "loss": 0.3318, "step": 3732 }, { "epoch": 0.523562412342216, "grad_norm": 1.9825949908056737, "learning_rate": 4.861471493950767e-06, "loss": 0.3618, "step": 3733 }, { "epoch": 0.523702664796634, "grad_norm": 2.237727391286987, "learning_rate": 4.859201130451161e-06, "loss": 0.3849, "step": 3734 }, { "epoch": 0.5238429172510519, "grad_norm": 2.0979564032884186, "learning_rate": 4.856930796004417e-06, "loss": 0.3926, "step": 3735 }, { "epoch": 0.5239831697054699, "grad_norm": 1.9183888023945277, "learning_rate": 4.854660491079004e-06, "loss": 0.3871, "step": 3736 }, { "epoch": 0.5241234221598878, "grad_norm": 2.1296195139007383, "learning_rate": 4.852390216143383e-06, "loss": 0.3588, "step": 3737 }, { "epoch": 0.5242636746143058, "grad_norm": 2.303929815369107, "learning_rate": 4.850119971666009e-06, "loss": 0.3681, "step": 3738 }, { "epoch": 0.5244039270687237, "grad_norm": 2.1408420647601116, "learning_rate": 4.847849758115333e-06, "loss": 0.363, "step": 3739 }, { "epoch": 0.5245441795231417, "grad_norm": 1.9511784733841133, "learning_rate": 4.845579575959795e-06, "loss": 0.392, "step": 3740 }, { "epoch": 0.5246844319775597, "grad_norm": 1.954720783636369, "learning_rate": 4.843309425667834e-06, "loss": 0.3235, "step": 3741 }, { "epoch": 0.5248246844319776, "grad_norm": 5.039376277173541, "learning_rate": 4.841039307707878e-06, "loss": 0.3806, "step": 3742 }, { "epoch": 0.5249649368863956, "grad_norm": 3.0999156509737253, "learning_rate": 4.838769222548349e-06, "loss": 0.3626, "step": 3743 }, { "epoch": 0.5251051893408135, "grad_norm": 2.447411559458202, "learning_rate": 4.8364991706576655e-06, "loss": 0.3339, "step": 3744 }, { "epoch": 0.5252454417952315, "grad_norm": 2.9730291746718036, "learning_rate": 4.834229152504239e-06, "loss": 0.4044, "step": 3745 }, { "epoch": 0.5253856942496493, "grad_norm": 1.8721455781280327, "learning_rate": 4.831959168556464e-06, "loss": 0.3569, "step": 3746 }, { "epoch": 0.5255259467040673, "grad_norm": 2.014664650360418, "learning_rate": 4.829689219282742e-06, "loss": 0.3708, "step": 3747 }, { "epoch": 0.5256661991584852, "grad_norm": 2.043516009173536, "learning_rate": 4.827419305151461e-06, "loss": 0.3498, "step": 3748 }, { "epoch": 0.5258064516129032, "grad_norm": 6.739501984079254, "learning_rate": 4.825149426630999e-06, "loss": 0.3854, "step": 3749 }, { "epoch": 0.5259467040673211, "grad_norm": 2.2036201515826037, "learning_rate": 4.822879584189732e-06, "loss": 0.3522, "step": 3750 }, { "epoch": 0.5260869565217391, "grad_norm": 2.270229002538842, "learning_rate": 4.820609778296024e-06, "loss": 0.3502, "step": 3751 }, { "epoch": 0.526227208976157, "grad_norm": 1.8278016676624995, "learning_rate": 4.818340009418237e-06, "loss": 0.3142, "step": 3752 }, { "epoch": 0.526367461430575, "grad_norm": 1.8311794228347154, "learning_rate": 4.8160702780247184e-06, "loss": 0.3682, "step": 3753 }, { "epoch": 0.526507713884993, "grad_norm": 2.9541725843735263, "learning_rate": 4.813800584583813e-06, "loss": 0.3885, "step": 3754 }, { "epoch": 0.5266479663394109, "grad_norm": 1.7103564525541748, "learning_rate": 4.8115309295638566e-06, "loss": 0.3606, "step": 3755 }, { "epoch": 0.5267882187938289, "grad_norm": 1.836085666602667, "learning_rate": 4.809261313433176e-06, "loss": 0.3691, "step": 3756 }, { "epoch": 0.5269284712482468, "grad_norm": 2.397690647409258, "learning_rate": 4.806991736660091e-06, "loss": 0.3507, "step": 3757 }, { "epoch": 0.5270687237026648, "grad_norm": 2.132516805531155, "learning_rate": 4.8047221997129126e-06, "loss": 0.3898, "step": 3758 }, { "epoch": 0.5272089761570827, "grad_norm": 2.2276906150494495, "learning_rate": 4.802452703059943e-06, "loss": 0.296, "step": 3759 }, { "epoch": 0.5273492286115007, "grad_norm": 2.158573833405449, "learning_rate": 4.800183247169478e-06, "loss": 0.3319, "step": 3760 }, { "epoch": 0.5274894810659186, "grad_norm": 1.9254663918018025, "learning_rate": 4.797913832509806e-06, "loss": 0.3363, "step": 3761 }, { "epoch": 0.5276297335203366, "grad_norm": 1.7542979051633234, "learning_rate": 4.795644459549201e-06, "loss": 0.3782, "step": 3762 }, { "epoch": 0.5277699859747546, "grad_norm": 1.7331262853035079, "learning_rate": 4.793375128755934e-06, "loss": 0.3614, "step": 3763 }, { "epoch": 0.5279102384291725, "grad_norm": 2.3612281645638133, "learning_rate": 4.791105840598266e-06, "loss": 0.3573, "step": 3764 }, { "epoch": 0.5280504908835905, "grad_norm": 1.8234160983440382, "learning_rate": 4.788836595544448e-06, "loss": 0.3652, "step": 3765 }, { "epoch": 0.5281907433380084, "grad_norm": 2.3850019558112803, "learning_rate": 4.7865673940627255e-06, "loss": 0.3768, "step": 3766 }, { "epoch": 0.5283309957924264, "grad_norm": 1.6687251571349104, "learning_rate": 4.7842982366213275e-06, "loss": 0.3159, "step": 3767 }, { "epoch": 0.5284712482468443, "grad_norm": 1.7348096148336387, "learning_rate": 4.782029123688483e-06, "loss": 0.3346, "step": 3768 }, { "epoch": 0.5286115007012623, "grad_norm": 1.6782108647207223, "learning_rate": 4.779760055732405e-06, "loss": 0.3338, "step": 3769 }, { "epoch": 0.5287517531556802, "grad_norm": 1.6730762337498548, "learning_rate": 4.7774910332213005e-06, "loss": 0.3408, "step": 3770 }, { "epoch": 0.5288920056100982, "grad_norm": 3.180567514174041, "learning_rate": 4.775222056623366e-06, "loss": 0.4107, "step": 3771 }, { "epoch": 0.5290322580645161, "grad_norm": 1.9225132820454358, "learning_rate": 4.77295312640679e-06, "loss": 0.3425, "step": 3772 }, { "epoch": 0.5291725105189341, "grad_norm": 1.8724585793654311, "learning_rate": 4.770684243039752e-06, "loss": 0.3514, "step": 3773 }, { "epoch": 0.5293127629733521, "grad_norm": 2.3046916379729803, "learning_rate": 4.768415406990417e-06, "loss": 0.3288, "step": 3774 }, { "epoch": 0.52945301542777, "grad_norm": 1.8607976263537893, "learning_rate": 4.766146618726944e-06, "loss": 0.371, "step": 3775 }, { "epoch": 0.529593267882188, "grad_norm": 1.736131895152826, "learning_rate": 4.763877878717484e-06, "loss": 0.3425, "step": 3776 }, { "epoch": 0.5297335203366059, "grad_norm": 2.523536703799325, "learning_rate": 4.761609187430174e-06, "loss": 0.392, "step": 3777 }, { "epoch": 0.5298737727910239, "grad_norm": 1.9900356863646342, "learning_rate": 4.759340545333142e-06, "loss": 0.3454, "step": 3778 }, { "epoch": 0.5300140252454418, "grad_norm": 2.703840480777204, "learning_rate": 4.757071952894506e-06, "loss": 0.3768, "step": 3779 }, { "epoch": 0.5301542776998598, "grad_norm": 2.5751979973973964, "learning_rate": 4.754803410582376e-06, "loss": 0.3776, "step": 3780 }, { "epoch": 0.5302945301542777, "grad_norm": 2.456608625249978, "learning_rate": 4.75253491886485e-06, "loss": 0.335, "step": 3781 }, { "epoch": 0.5304347826086957, "grad_norm": 2.0070395558958634, "learning_rate": 4.750266478210014e-06, "loss": 0.36, "step": 3782 }, { "epoch": 0.5305750350631137, "grad_norm": 2.003743958919221, "learning_rate": 4.747998089085945e-06, "loss": 0.3354, "step": 3783 }, { "epoch": 0.5307152875175316, "grad_norm": 1.8612221270357825, "learning_rate": 4.74572975196071e-06, "loss": 0.3478, "step": 3784 }, { "epoch": 0.5308555399719496, "grad_norm": 2.1073672311396283, "learning_rate": 4.743461467302364e-06, "loss": 0.3983, "step": 3785 }, { "epoch": 0.5309957924263674, "grad_norm": 1.686334914210802, "learning_rate": 4.741193235578953e-06, "loss": 0.338, "step": 3786 }, { "epoch": 0.5311360448807854, "grad_norm": 2.02845101814661, "learning_rate": 4.7389250572585104e-06, "loss": 0.3569, "step": 3787 }, { "epoch": 0.5312762973352033, "grad_norm": 2.451489140012877, "learning_rate": 4.736656932809056e-06, "loss": 0.3463, "step": 3788 }, { "epoch": 0.5314165497896213, "grad_norm": 2.799396933597216, "learning_rate": 4.734388862698605e-06, "loss": 0.3703, "step": 3789 }, { "epoch": 0.5315568022440392, "grad_norm": 1.8924983807938576, "learning_rate": 4.732120847395156e-06, "loss": 0.381, "step": 3790 }, { "epoch": 0.5316970546984572, "grad_norm": 2.076633692684222, "learning_rate": 4.7298528873666985e-06, "loss": 0.3935, "step": 3791 }, { "epoch": 0.5318373071528751, "grad_norm": 1.825428857449898, "learning_rate": 4.72758498308121e-06, "loss": 0.3725, "step": 3792 }, { "epoch": 0.5319775596072931, "grad_norm": 1.8969901890478373, "learning_rate": 4.725317135006658e-06, "loss": 0.351, "step": 3793 }, { "epoch": 0.532117812061711, "grad_norm": 1.6724535217441499, "learning_rate": 4.723049343610996e-06, "loss": 0.3038, "step": 3794 }, { "epoch": 0.532258064516129, "grad_norm": 1.798939456041783, "learning_rate": 4.720781609362165e-06, "loss": 0.341, "step": 3795 }, { "epoch": 0.532398316970547, "grad_norm": 1.785256547843999, "learning_rate": 4.7185139327281e-06, "loss": 0.3394, "step": 3796 }, { "epoch": 0.5325385694249649, "grad_norm": 2.4772830721119776, "learning_rate": 4.716246314176717e-06, "loss": 0.3354, "step": 3797 }, { "epoch": 0.5326788218793829, "grad_norm": 1.8989217007260413, "learning_rate": 4.713978754175926e-06, "loss": 0.3443, "step": 3798 }, { "epoch": 0.5328190743338008, "grad_norm": 1.7351941073228352, "learning_rate": 4.71171125319362e-06, "loss": 0.3366, "step": 3799 }, { "epoch": 0.5329593267882188, "grad_norm": 1.936322438884095, "learning_rate": 4.709443811697683e-06, "loss": 0.3549, "step": 3800 }, { "epoch": 0.5330995792426367, "grad_norm": 1.5804332810257844, "learning_rate": 4.707176430155986e-06, "loss": 0.3533, "step": 3801 }, { "epoch": 0.5332398316970547, "grad_norm": 1.7651832500566387, "learning_rate": 4.704909109036387e-06, "loss": 0.3259, "step": 3802 }, { "epoch": 0.5333800841514726, "grad_norm": 1.9921138974225232, "learning_rate": 4.70264184880673e-06, "loss": 0.3786, "step": 3803 }, { "epoch": 0.5335203366058906, "grad_norm": 2.000472251449842, "learning_rate": 4.700374649934851e-06, "loss": 0.345, "step": 3804 }, { "epoch": 0.5336605890603086, "grad_norm": 3.065104097524603, "learning_rate": 4.69810751288857e-06, "loss": 0.3585, "step": 3805 }, { "epoch": 0.5338008415147265, "grad_norm": 2.2290730560707375, "learning_rate": 4.695840438135693e-06, "loss": 0.4052, "step": 3806 }, { "epoch": 0.5339410939691445, "grad_norm": 2.058200400483346, "learning_rate": 4.6935734261440195e-06, "loss": 0.4094, "step": 3807 }, { "epoch": 0.5340813464235624, "grad_norm": 1.6804502662624656, "learning_rate": 4.6913064773813274e-06, "loss": 0.3973, "step": 3808 }, { "epoch": 0.5342215988779804, "grad_norm": 3.615529151905542, "learning_rate": 4.689039592315387e-06, "loss": 0.3786, "step": 3809 }, { "epoch": 0.5343618513323983, "grad_norm": 1.7978308747451779, "learning_rate": 4.686772771413954e-06, "loss": 0.3478, "step": 3810 }, { "epoch": 0.5345021037868163, "grad_norm": 2.670788103296928, "learning_rate": 4.68450601514477e-06, "loss": 0.3661, "step": 3811 }, { "epoch": 0.5346423562412342, "grad_norm": 2.379760288219893, "learning_rate": 4.682239323975566e-06, "loss": 0.3571, "step": 3812 }, { "epoch": 0.5347826086956522, "grad_norm": 1.9908688238702827, "learning_rate": 4.679972698374058e-06, "loss": 0.396, "step": 3813 }, { "epoch": 0.5349228611500702, "grad_norm": 1.8402141352111134, "learning_rate": 4.6777061388079485e-06, "loss": 0.3276, "step": 3814 }, { "epoch": 0.5350631136044881, "grad_norm": 1.895674339027143, "learning_rate": 4.675439645744924e-06, "loss": 0.3148, "step": 3815 }, { "epoch": 0.5352033660589061, "grad_norm": 1.7269382672036369, "learning_rate": 4.673173219652662e-06, "loss": 0.3524, "step": 3816 }, { "epoch": 0.535343618513324, "grad_norm": 1.8932986310120272, "learning_rate": 4.6709068609988225e-06, "loss": 0.327, "step": 3817 }, { "epoch": 0.535483870967742, "grad_norm": 1.8934102625508196, "learning_rate": 4.668640570251054e-06, "loss": 0.3831, "step": 3818 }, { "epoch": 0.5356241234221599, "grad_norm": 2.4952139818859562, "learning_rate": 4.666374347876987e-06, "loss": 0.377, "step": 3819 }, { "epoch": 0.5357643758765779, "grad_norm": 2.088910673526141, "learning_rate": 4.6641081943442425e-06, "loss": 0.3608, "step": 3820 }, { "epoch": 0.5359046283309958, "grad_norm": 1.7012049499870237, "learning_rate": 4.661842110120426e-06, "loss": 0.3244, "step": 3821 }, { "epoch": 0.5360448807854138, "grad_norm": 2.0095416390005343, "learning_rate": 4.659576095673127e-06, "loss": 0.4246, "step": 3822 }, { "epoch": 0.5361851332398317, "grad_norm": 1.9607024640876656, "learning_rate": 4.657310151469924e-06, "loss": 0.4061, "step": 3823 }, { "epoch": 0.5363253856942497, "grad_norm": 2.121843349574675, "learning_rate": 4.6550442779783755e-06, "loss": 0.3376, "step": 3824 }, { "epoch": 0.5364656381486677, "grad_norm": 2.966589854528299, "learning_rate": 4.65277847566603e-06, "loss": 0.3538, "step": 3825 }, { "epoch": 0.5366058906030855, "grad_norm": 1.6712579747240142, "learning_rate": 4.6505127450004216e-06, "loss": 0.3317, "step": 3826 }, { "epoch": 0.5367461430575035, "grad_norm": 2.1949406101820985, "learning_rate": 4.648247086449064e-06, "loss": 0.3328, "step": 3827 }, { "epoch": 0.5368863955119214, "grad_norm": 2.4053969419291668, "learning_rate": 4.645981500479466e-06, "loss": 0.34, "step": 3828 }, { "epoch": 0.5370266479663394, "grad_norm": 3.0017739455309354, "learning_rate": 4.643715987559111e-06, "loss": 0.3688, "step": 3829 }, { "epoch": 0.5371669004207573, "grad_norm": 1.4903857952898907, "learning_rate": 4.641450548155473e-06, "loss": 0.3544, "step": 3830 }, { "epoch": 0.5373071528751753, "grad_norm": 2.1454245786014106, "learning_rate": 4.639185182736008e-06, "loss": 0.3339, "step": 3831 }, { "epoch": 0.5374474053295932, "grad_norm": 2.1925558247087147, "learning_rate": 4.63691989176816e-06, "loss": 0.3689, "step": 3832 }, { "epoch": 0.5375876577840112, "grad_norm": 1.6358050069560464, "learning_rate": 4.634654675719355e-06, "loss": 0.3685, "step": 3833 }, { "epoch": 0.5377279102384291, "grad_norm": 1.8921292838631343, "learning_rate": 4.632389535057007e-06, "loss": 0.3562, "step": 3834 }, { "epoch": 0.5378681626928471, "grad_norm": 1.6850069491868522, "learning_rate": 4.6301244702485084e-06, "loss": 0.3216, "step": 3835 }, { "epoch": 0.538008415147265, "grad_norm": 2.067552792891055, "learning_rate": 4.627859481761242e-06, "loss": 0.3768, "step": 3836 }, { "epoch": 0.538148667601683, "grad_norm": 1.8790312346765001, "learning_rate": 4.625594570062571e-06, "loss": 0.3639, "step": 3837 }, { "epoch": 0.538288920056101, "grad_norm": 1.7960907924871665, "learning_rate": 4.6233297356198446e-06, "loss": 0.3216, "step": 3838 }, { "epoch": 0.5384291725105189, "grad_norm": 2.1672577662996013, "learning_rate": 4.621064978900397e-06, "loss": 0.3975, "step": 3839 }, { "epoch": 0.5385694249649369, "grad_norm": 2.421871357576646, "learning_rate": 4.618800300371543e-06, "loss": 0.3772, "step": 3840 }, { "epoch": 0.5387096774193548, "grad_norm": 2.115980641887648, "learning_rate": 4.616535700500583e-06, "loss": 0.3289, "step": 3841 }, { "epoch": 0.5388499298737728, "grad_norm": 1.9303108474949164, "learning_rate": 4.614271179754802e-06, "loss": 0.3653, "step": 3842 }, { "epoch": 0.5389901823281907, "grad_norm": 1.9264690416721257, "learning_rate": 4.612006738601469e-06, "loss": 0.3521, "step": 3843 }, { "epoch": 0.5391304347826087, "grad_norm": 2.4079605562014854, "learning_rate": 4.609742377507834e-06, "loss": 0.3744, "step": 3844 }, { "epoch": 0.5392706872370266, "grad_norm": 2.2634823895092286, "learning_rate": 4.607478096941133e-06, "loss": 0.3491, "step": 3845 }, { "epoch": 0.5394109396914446, "grad_norm": 3.1666324680556377, "learning_rate": 4.605213897368584e-06, "loss": 0.3527, "step": 3846 }, { "epoch": 0.5395511921458626, "grad_norm": 1.93498953330982, "learning_rate": 4.60294977925739e-06, "loss": 0.3215, "step": 3847 }, { "epoch": 0.5396914446002805, "grad_norm": 1.8191757909755943, "learning_rate": 4.600685743074736e-06, "loss": 0.3486, "step": 3848 }, { "epoch": 0.5398316970546985, "grad_norm": 2.4975145382691903, "learning_rate": 4.598421789287787e-06, "loss": 0.3928, "step": 3849 }, { "epoch": 0.5399719495091164, "grad_norm": 2.063299890944582, "learning_rate": 4.596157918363699e-06, "loss": 0.3489, "step": 3850 }, { "epoch": 0.5401122019635344, "grad_norm": 2.0370839776625704, "learning_rate": 4.5938941307696004e-06, "loss": 0.3458, "step": 3851 }, { "epoch": 0.5402524544179523, "grad_norm": 1.9805628887728552, "learning_rate": 4.591630426972611e-06, "loss": 0.3546, "step": 3852 }, { "epoch": 0.5403927068723703, "grad_norm": 1.8024059587514745, "learning_rate": 4.58936680743983e-06, "loss": 0.3526, "step": 3853 }, { "epoch": 0.5405329593267882, "grad_norm": 2.997080813153648, "learning_rate": 4.587103272638339e-06, "loss": 0.3765, "step": 3854 }, { "epoch": 0.5406732117812062, "grad_norm": 1.6489810860904184, "learning_rate": 4.584839823035204e-06, "loss": 0.3616, "step": 3855 }, { "epoch": 0.5408134642356242, "grad_norm": 2.035892053361841, "learning_rate": 4.58257645909747e-06, "loss": 0.3377, "step": 3856 }, { "epoch": 0.5409537166900421, "grad_norm": 2.7038778678861553, "learning_rate": 4.580313181292168e-06, "loss": 0.3654, "step": 3857 }, { "epoch": 0.5410939691444601, "grad_norm": 3.0510882077889825, "learning_rate": 4.578049990086309e-06, "loss": 0.3724, "step": 3858 }, { "epoch": 0.541234221598878, "grad_norm": 2.3639348915286815, "learning_rate": 4.575786885946886e-06, "loss": 0.3758, "step": 3859 }, { "epoch": 0.541374474053296, "grad_norm": 2.8292038688030243, "learning_rate": 4.573523869340875e-06, "loss": 0.3558, "step": 3860 }, { "epoch": 0.5415147265077139, "grad_norm": 1.904698412694173, "learning_rate": 4.571260940735235e-06, "loss": 0.3687, "step": 3861 }, { "epoch": 0.5416549789621319, "grad_norm": 1.8712332719298508, "learning_rate": 4.568998100596903e-06, "loss": 0.3761, "step": 3862 }, { "epoch": 0.5417952314165498, "grad_norm": 1.8831217025186084, "learning_rate": 4.566735349392802e-06, "loss": 0.3501, "step": 3863 }, { "epoch": 0.5419354838709678, "grad_norm": 1.6454830360245232, "learning_rate": 4.564472687589836e-06, "loss": 0.3509, "step": 3864 }, { "epoch": 0.5420757363253857, "grad_norm": 1.8091647274663516, "learning_rate": 4.562210115654887e-06, "loss": 0.3747, "step": 3865 }, { "epoch": 0.5422159887798036, "grad_norm": 1.6665782269219154, "learning_rate": 4.5599476340548225e-06, "loss": 0.3904, "step": 3866 }, { "epoch": 0.5423562412342215, "grad_norm": 3.104678340105464, "learning_rate": 4.5576852432564896e-06, "loss": 0.3847, "step": 3867 }, { "epoch": 0.5424964936886395, "grad_norm": 2.6605894657456677, "learning_rate": 4.555422943726715e-06, "loss": 0.402, "step": 3868 }, { "epoch": 0.5426367461430575, "grad_norm": 2.0226962230352714, "learning_rate": 4.5531607359323125e-06, "loss": 0.3683, "step": 3869 }, { "epoch": 0.5427769985974754, "grad_norm": 1.9697688660360277, "learning_rate": 4.550898620340069e-06, "loss": 0.3541, "step": 3870 }, { "epoch": 0.5429172510518934, "grad_norm": 1.8209327656056056, "learning_rate": 4.548636597416758e-06, "loss": 0.3945, "step": 3871 }, { "epoch": 0.5430575035063113, "grad_norm": 3.081041532509555, "learning_rate": 4.546374667629131e-06, "loss": 0.3717, "step": 3872 }, { "epoch": 0.5431977559607293, "grad_norm": 2.605151018389007, "learning_rate": 4.544112831443921e-06, "loss": 0.3905, "step": 3873 }, { "epoch": 0.5433380084151472, "grad_norm": 1.8229765164677507, "learning_rate": 4.541851089327844e-06, "loss": 0.3509, "step": 3874 }, { "epoch": 0.5434782608695652, "grad_norm": 1.8193377066341037, "learning_rate": 4.539589441747595e-06, "loss": 0.2899, "step": 3875 }, { "epoch": 0.5436185133239831, "grad_norm": 1.967936393105313, "learning_rate": 4.537327889169847e-06, "loss": 0.3223, "step": 3876 }, { "epoch": 0.5437587657784011, "grad_norm": 2.2267729007292414, "learning_rate": 4.535066432061256e-06, "loss": 0.3587, "step": 3877 }, { "epoch": 0.543899018232819, "grad_norm": 1.7616622475911814, "learning_rate": 4.532805070888459e-06, "loss": 0.3764, "step": 3878 }, { "epoch": 0.544039270687237, "grad_norm": 3.1818069779830456, "learning_rate": 4.53054380611807e-06, "loss": 0.3705, "step": 3879 }, { "epoch": 0.544179523141655, "grad_norm": 1.835864710886692, "learning_rate": 4.528282638216689e-06, "loss": 0.331, "step": 3880 }, { "epoch": 0.5443197755960729, "grad_norm": 2.011107001759995, "learning_rate": 4.526021567650889e-06, "loss": 0.3621, "step": 3881 }, { "epoch": 0.5444600280504909, "grad_norm": 1.8267039457169438, "learning_rate": 4.523760594887228e-06, "loss": 0.3491, "step": 3882 }, { "epoch": 0.5446002805049088, "grad_norm": 2.0383710017535557, "learning_rate": 4.5214997203922394e-06, "loss": 0.4078, "step": 3883 }, { "epoch": 0.5447405329593268, "grad_norm": 2.6147167784742043, "learning_rate": 4.519238944632442e-06, "loss": 0.3195, "step": 3884 }, { "epoch": 0.5448807854137447, "grad_norm": 2.177517265137506, "learning_rate": 4.516978268074328e-06, "loss": 0.3893, "step": 3885 }, { "epoch": 0.5450210378681627, "grad_norm": 2.8368955733046377, "learning_rate": 4.5147176911843746e-06, "loss": 0.3465, "step": 3886 }, { "epoch": 0.5451612903225806, "grad_norm": 6.072248977240768, "learning_rate": 4.5124572144290345e-06, "loss": 0.3514, "step": 3887 }, { "epoch": 0.5453015427769986, "grad_norm": 2.030698261154632, "learning_rate": 4.510196838274742e-06, "loss": 0.3747, "step": 3888 }, { "epoch": 0.5454417952314166, "grad_norm": 1.682193891047895, "learning_rate": 4.507936563187911e-06, "loss": 0.3856, "step": 3889 }, { "epoch": 0.5455820476858345, "grad_norm": 2.372250420936067, "learning_rate": 4.505676389634932e-06, "loss": 0.3155, "step": 3890 }, { "epoch": 0.5457223001402525, "grad_norm": 2.2192439214552913, "learning_rate": 4.5034163180821775e-06, "loss": 0.3842, "step": 3891 }, { "epoch": 0.5458625525946704, "grad_norm": 2.173165137134506, "learning_rate": 4.5011563489959945e-06, "loss": 0.3311, "step": 3892 }, { "epoch": 0.5460028050490884, "grad_norm": 2.2312166526717365, "learning_rate": 4.498896482842715e-06, "loss": 0.3745, "step": 3893 }, { "epoch": 0.5461430575035063, "grad_norm": 1.7927490631684098, "learning_rate": 4.496636720088643e-06, "loss": 0.3468, "step": 3894 }, { "epoch": 0.5462833099579243, "grad_norm": 2.29388866784975, "learning_rate": 4.4943770612000686e-06, "loss": 0.3566, "step": 3895 }, { "epoch": 0.5464235624123422, "grad_norm": 1.7380833202359072, "learning_rate": 4.492117506643256e-06, "loss": 0.3688, "step": 3896 }, { "epoch": 0.5465638148667602, "grad_norm": 1.9628051131871231, "learning_rate": 4.489858056884446e-06, "loss": 0.3568, "step": 3897 }, { "epoch": 0.5467040673211782, "grad_norm": 1.7493356397316042, "learning_rate": 4.487598712389862e-06, "loss": 0.3526, "step": 3898 }, { "epoch": 0.5468443197755961, "grad_norm": 1.8857004011869167, "learning_rate": 4.485339473625704e-06, "loss": 0.3635, "step": 3899 }, { "epoch": 0.5469845722300141, "grad_norm": 2.77714115053065, "learning_rate": 4.4830803410581506e-06, "loss": 0.4128, "step": 3900 }, { "epoch": 0.547124824684432, "grad_norm": 2.2753753268664916, "learning_rate": 4.480821315153356e-06, "loss": 0.3294, "step": 3901 }, { "epoch": 0.54726507713885, "grad_norm": 2.0920410013794273, "learning_rate": 4.478562396377457e-06, "loss": 0.3432, "step": 3902 }, { "epoch": 0.5474053295932679, "grad_norm": 2.90700595007131, "learning_rate": 4.476303585196563e-06, "loss": 0.3558, "step": 3903 }, { "epoch": 0.5475455820476859, "grad_norm": 3.0075940337665656, "learning_rate": 4.474044882076766e-06, "loss": 0.3309, "step": 3904 }, { "epoch": 0.5476858345021038, "grad_norm": 2.2447326251155295, "learning_rate": 4.471786287484134e-06, "loss": 0.2912, "step": 3905 }, { "epoch": 0.5478260869565217, "grad_norm": 2.0240551804796993, "learning_rate": 4.46952780188471e-06, "loss": 0.3622, "step": 3906 }, { "epoch": 0.5479663394109396, "grad_norm": 2.793983456545589, "learning_rate": 4.467269425744518e-06, "loss": 0.3768, "step": 3907 }, { "epoch": 0.5481065918653576, "grad_norm": 1.9396775837038343, "learning_rate": 4.465011159529559e-06, "loss": 0.3917, "step": 3908 }, { "epoch": 0.5482468443197756, "grad_norm": 2.1350374458834924, "learning_rate": 4.462753003705808e-06, "loss": 0.3438, "step": 3909 }, { "epoch": 0.5483870967741935, "grad_norm": 2.0753567262591863, "learning_rate": 4.460494958739223e-06, "loss": 0.3568, "step": 3910 }, { "epoch": 0.5485273492286115, "grad_norm": 2.484770641372815, "learning_rate": 4.458237025095733e-06, "loss": 0.3791, "step": 3911 }, { "epoch": 0.5486676016830294, "grad_norm": 1.9310991938205846, "learning_rate": 4.45597920324125e-06, "loss": 0.3475, "step": 3912 }, { "epoch": 0.5488078541374474, "grad_norm": 2.3812475850250734, "learning_rate": 4.453721493641655e-06, "loss": 0.3669, "step": 3913 }, { "epoch": 0.5489481065918653, "grad_norm": 2.515563395438916, "learning_rate": 4.451463896762813e-06, "loss": 0.3357, "step": 3914 }, { "epoch": 0.5490883590462833, "grad_norm": 2.297783289266765, "learning_rate": 4.449206413070565e-06, "loss": 0.3753, "step": 3915 }, { "epoch": 0.5492286115007012, "grad_norm": 1.8882573963910838, "learning_rate": 4.446949043030724e-06, "loss": 0.3422, "step": 3916 }, { "epoch": 0.5493688639551192, "grad_norm": 1.9788357775971421, "learning_rate": 4.444691787109085e-06, "loss": 0.3183, "step": 3917 }, { "epoch": 0.5495091164095371, "grad_norm": 2.0455493856849616, "learning_rate": 4.442434645771416e-06, "loss": 0.3287, "step": 3918 }, { "epoch": 0.5496493688639551, "grad_norm": 2.2545894863214135, "learning_rate": 4.4401776194834615e-06, "loss": 0.3719, "step": 3919 }, { "epoch": 0.5497896213183731, "grad_norm": 1.9582421978245121, "learning_rate": 4.437920708710944e-06, "loss": 0.3928, "step": 3920 }, { "epoch": 0.549929873772791, "grad_norm": 1.843475062674204, "learning_rate": 4.435663913919563e-06, "loss": 0.3843, "step": 3921 }, { "epoch": 0.550070126227209, "grad_norm": 2.362382601288497, "learning_rate": 4.433407235574989e-06, "loss": 0.3627, "step": 3922 }, { "epoch": 0.5502103786816269, "grad_norm": 1.682032593736763, "learning_rate": 4.431150674142874e-06, "loss": 0.3514, "step": 3923 }, { "epoch": 0.5503506311360449, "grad_norm": 1.9547678637431727, "learning_rate": 4.428894230088842e-06, "loss": 0.3493, "step": 3924 }, { "epoch": 0.5504908835904628, "grad_norm": 2.3956817800160928, "learning_rate": 4.426637903878498e-06, "loss": 0.4052, "step": 3925 }, { "epoch": 0.5506311360448808, "grad_norm": 1.972846477136798, "learning_rate": 4.424381695977415e-06, "loss": 0.3218, "step": 3926 }, { "epoch": 0.5507713884992987, "grad_norm": 2.6929020372343873, "learning_rate": 4.422125606851147e-06, "loss": 0.3498, "step": 3927 }, { "epoch": 0.5509116409537167, "grad_norm": 2.248596177324145, "learning_rate": 4.419869636965223e-06, "loss": 0.3458, "step": 3928 }, { "epoch": 0.5510518934081347, "grad_norm": 2.1531526738175297, "learning_rate": 4.417613786785147e-06, "loss": 0.3578, "step": 3929 }, { "epoch": 0.5511921458625526, "grad_norm": 1.8298157266694628, "learning_rate": 4.415358056776398e-06, "loss": 0.3121, "step": 3930 }, { "epoch": 0.5513323983169706, "grad_norm": 2.1541765720673407, "learning_rate": 4.413102447404428e-06, "loss": 0.3728, "step": 3931 }, { "epoch": 0.5514726507713885, "grad_norm": 2.0610473603531414, "learning_rate": 4.410846959134667e-06, "loss": 0.3419, "step": 3932 }, { "epoch": 0.5516129032258065, "grad_norm": 1.6841794298541857, "learning_rate": 4.4085915924325226e-06, "loss": 0.3509, "step": 3933 }, { "epoch": 0.5517531556802244, "grad_norm": 2.2575638050005766, "learning_rate": 4.406336347763369e-06, "loss": 0.3978, "step": 3934 }, { "epoch": 0.5518934081346424, "grad_norm": 2.884226382509099, "learning_rate": 4.404081225592562e-06, "loss": 0.331, "step": 3935 }, { "epoch": 0.5520336605890603, "grad_norm": 2.002456229656637, "learning_rate": 4.401826226385431e-06, "loss": 0.3618, "step": 3936 }, { "epoch": 0.5521739130434783, "grad_norm": 2.516485410651428, "learning_rate": 4.399571350607281e-06, "loss": 0.3783, "step": 3937 }, { "epoch": 0.5523141654978962, "grad_norm": 1.9928447761093335, "learning_rate": 4.397316598723385e-06, "loss": 0.4104, "step": 3938 }, { "epoch": 0.5524544179523142, "grad_norm": 5.061050772746348, "learning_rate": 4.395061971199e-06, "loss": 0.3787, "step": 3939 }, { "epoch": 0.5525946704067322, "grad_norm": 1.8323935209514903, "learning_rate": 4.39280746849935e-06, "loss": 0.3743, "step": 3940 }, { "epoch": 0.5527349228611501, "grad_norm": 3.5232849764746055, "learning_rate": 4.390553091089637e-06, "loss": 0.3463, "step": 3941 }, { "epoch": 0.5528751753155681, "grad_norm": 1.9255368462590554, "learning_rate": 4.388298839435036e-06, "loss": 0.3465, "step": 3942 }, { "epoch": 0.553015427769986, "grad_norm": 1.9038140413529716, "learning_rate": 4.386044714000695e-06, "loss": 0.3391, "step": 3943 }, { "epoch": 0.553155680224404, "grad_norm": 1.9011684915400022, "learning_rate": 4.383790715251739e-06, "loss": 0.3737, "step": 3944 }, { "epoch": 0.5532959326788219, "grad_norm": 2.0758023009347295, "learning_rate": 4.381536843653262e-06, "loss": 0.3487, "step": 3945 }, { "epoch": 0.5534361851332398, "grad_norm": 2.2002244618089604, "learning_rate": 4.379283099670338e-06, "loss": 0.3625, "step": 3946 }, { "epoch": 0.5535764375876577, "grad_norm": 2.922614019436026, "learning_rate": 4.377029483768009e-06, "loss": 0.3633, "step": 3947 }, { "epoch": 0.5537166900420757, "grad_norm": 1.7813143802184654, "learning_rate": 4.3747759964112936e-06, "loss": 0.3572, "step": 3948 }, { "epoch": 0.5538569424964936, "grad_norm": 2.1892607006278904, "learning_rate": 4.372522638065183e-06, "loss": 0.3643, "step": 3949 }, { "epoch": 0.5539971949509116, "grad_norm": 2.387352692218868, "learning_rate": 4.370269409194642e-06, "loss": 0.3452, "step": 3950 }, { "epoch": 0.5541374474053296, "grad_norm": 1.8691123713128777, "learning_rate": 4.36801631026461e-06, "loss": 0.3797, "step": 3951 }, { "epoch": 0.5542776998597475, "grad_norm": 3.09714791551377, "learning_rate": 4.365763341739996e-06, "loss": 0.3598, "step": 3952 }, { "epoch": 0.5544179523141655, "grad_norm": 1.8317313853739752, "learning_rate": 4.363510504085685e-06, "loss": 0.2957, "step": 3953 }, { "epoch": 0.5545582047685834, "grad_norm": 2.040786073178243, "learning_rate": 4.361257797766537e-06, "loss": 0.417, "step": 3954 }, { "epoch": 0.5546984572230014, "grad_norm": 2.564329285257168, "learning_rate": 4.359005223247378e-06, "loss": 0.3398, "step": 3955 }, { "epoch": 0.5548387096774193, "grad_norm": 2.1021277365958464, "learning_rate": 4.356752780993012e-06, "loss": 0.3701, "step": 3956 }, { "epoch": 0.5549789621318373, "grad_norm": 3.10998651826641, "learning_rate": 4.354500471468217e-06, "loss": 0.3333, "step": 3957 }, { "epoch": 0.5551192145862552, "grad_norm": 2.1287674884895367, "learning_rate": 4.352248295137739e-06, "loss": 0.3716, "step": 3958 }, { "epoch": 0.5552594670406732, "grad_norm": 1.9927558836088566, "learning_rate": 4.3499962524662995e-06, "loss": 0.3757, "step": 3959 }, { "epoch": 0.5553997194950911, "grad_norm": 1.9628789645176046, "learning_rate": 4.347744343918593e-06, "loss": 0.4037, "step": 3960 }, { "epoch": 0.5555399719495091, "grad_norm": 2.5501542773980352, "learning_rate": 4.345492569959283e-06, "loss": 0.3686, "step": 3961 }, { "epoch": 0.5556802244039271, "grad_norm": 1.980111535716656, "learning_rate": 4.3432409310530096e-06, "loss": 0.3881, "step": 3962 }, { "epoch": 0.555820476858345, "grad_norm": 1.771262831119859, "learning_rate": 4.340989427664381e-06, "loss": 0.3829, "step": 3963 }, { "epoch": 0.555960729312763, "grad_norm": 1.5055358776731897, "learning_rate": 4.338738060257979e-06, "loss": 0.3202, "step": 3964 }, { "epoch": 0.5561009817671809, "grad_norm": 1.5640431222849003, "learning_rate": 4.336486829298359e-06, "loss": 0.3532, "step": 3965 }, { "epoch": 0.5562412342215989, "grad_norm": 1.8157504302671275, "learning_rate": 4.334235735250047e-06, "loss": 0.3442, "step": 3966 }, { "epoch": 0.5563814866760168, "grad_norm": 3.753603538986645, "learning_rate": 4.331984778577539e-06, "loss": 0.3288, "step": 3967 }, { "epoch": 0.5565217391304348, "grad_norm": 1.8673389325523846, "learning_rate": 4.3297339597453046e-06, "loss": 0.3055, "step": 3968 }, { "epoch": 0.5566619915848527, "grad_norm": 2.63599246724721, "learning_rate": 4.3274832792177845e-06, "loss": 0.4148, "step": 3969 }, { "epoch": 0.5568022440392707, "grad_norm": 2.0529265889952, "learning_rate": 4.325232737459391e-06, "loss": 0.3447, "step": 3970 }, { "epoch": 0.5569424964936887, "grad_norm": 2.863405316047186, "learning_rate": 4.322982334934509e-06, "loss": 0.3627, "step": 3971 }, { "epoch": 0.5570827489481066, "grad_norm": 7.75094008127754, "learning_rate": 4.320732072107491e-06, "loss": 0.3467, "step": 3972 }, { "epoch": 0.5572230014025246, "grad_norm": 2.0167040634891387, "learning_rate": 4.318481949442665e-06, "loss": 0.3709, "step": 3973 }, { "epoch": 0.5573632538569425, "grad_norm": 2.9767971797352577, "learning_rate": 4.316231967404326e-06, "loss": 0.3656, "step": 3974 }, { "epoch": 0.5575035063113605, "grad_norm": 2.058524259085654, "learning_rate": 4.313982126456747e-06, "loss": 0.3359, "step": 3975 }, { "epoch": 0.5576437587657784, "grad_norm": 1.7439150605646927, "learning_rate": 4.31173242706416e-06, "loss": 0.3943, "step": 3976 }, { "epoch": 0.5577840112201964, "grad_norm": 1.974679826787042, "learning_rate": 4.309482869690779e-06, "loss": 0.3596, "step": 3977 }, { "epoch": 0.5579242636746143, "grad_norm": 2.6952937471041465, "learning_rate": 4.307233454800783e-06, "loss": 0.3331, "step": 3978 }, { "epoch": 0.5580645161290323, "grad_norm": 1.5705695250338612, "learning_rate": 4.304984182858324e-06, "loss": 0.3316, "step": 3979 }, { "epoch": 0.5582047685834503, "grad_norm": 2.4750063847931476, "learning_rate": 4.302735054327523e-06, "loss": 0.3354, "step": 3980 }, { "epoch": 0.5583450210378682, "grad_norm": 1.8427319059558782, "learning_rate": 4.300486069672471e-06, "loss": 0.3303, "step": 3981 }, { "epoch": 0.5584852734922862, "grad_norm": 2.2387593609195107, "learning_rate": 4.298237229357233e-06, "loss": 0.3516, "step": 3982 }, { "epoch": 0.5586255259467041, "grad_norm": 1.9649249869629832, "learning_rate": 4.2959885338458385e-06, "loss": 0.3727, "step": 3983 }, { "epoch": 0.5587657784011221, "grad_norm": 1.9914933734451215, "learning_rate": 4.293739983602292e-06, "loss": 0.3451, "step": 3984 }, { "epoch": 0.55890603085554, "grad_norm": 1.8368202846668076, "learning_rate": 4.291491579090565e-06, "loss": 0.3055, "step": 3985 }, { "epoch": 0.5590462833099579, "grad_norm": 1.9741925894872447, "learning_rate": 4.289243320774601e-06, "loss": 0.355, "step": 3986 }, { "epoch": 0.5591865357643758, "grad_norm": 2.2796349229822037, "learning_rate": 4.286995209118313e-06, "loss": 0.3685, "step": 3987 }, { "epoch": 0.5593267882187938, "grad_norm": 2.42933392175206, "learning_rate": 4.284747244585581e-06, "loss": 0.3602, "step": 3988 }, { "epoch": 0.5594670406732117, "grad_norm": 2.359462509048195, "learning_rate": 4.282499427640258e-06, "loss": 0.3937, "step": 3989 }, { "epoch": 0.5596072931276297, "grad_norm": 2.4346362188043242, "learning_rate": 4.280251758746165e-06, "loss": 0.407, "step": 3990 }, { "epoch": 0.5597475455820476, "grad_norm": 1.9350033332736856, "learning_rate": 4.278004238367093e-06, "loss": 0.349, "step": 3991 }, { "epoch": 0.5598877980364656, "grad_norm": 2.212681490248612, "learning_rate": 4.275756866966804e-06, "loss": 0.3662, "step": 3992 }, { "epoch": 0.5600280504908836, "grad_norm": 1.782911763616484, "learning_rate": 4.273509645009023e-06, "loss": 0.3814, "step": 3993 }, { "epoch": 0.5601683029453015, "grad_norm": 2.508265754406132, "learning_rate": 4.271262572957453e-06, "loss": 0.3607, "step": 3994 }, { "epoch": 0.5603085553997195, "grad_norm": 2.0931657573824434, "learning_rate": 4.269015651275761e-06, "loss": 0.3543, "step": 3995 }, { "epoch": 0.5604488078541374, "grad_norm": 2.481147829349639, "learning_rate": 4.26676888042758e-06, "loss": 0.3574, "step": 3996 }, { "epoch": 0.5605890603085554, "grad_norm": 1.5637285361886657, "learning_rate": 4.264522260876518e-06, "loss": 0.3703, "step": 3997 }, { "epoch": 0.5607293127629733, "grad_norm": 2.0011552811842517, "learning_rate": 4.262275793086149e-06, "loss": 0.3532, "step": 3998 }, { "epoch": 0.5608695652173913, "grad_norm": 4.584021198789568, "learning_rate": 4.260029477520016e-06, "loss": 0.3791, "step": 3999 }, { "epoch": 0.5610098176718092, "grad_norm": 2.3692501134016046, "learning_rate": 4.25778331464163e-06, "loss": 0.3675, "step": 4000 }, { "epoch": 0.5611500701262272, "grad_norm": 1.995452419504221, "learning_rate": 4.255537304914472e-06, "loss": 0.3515, "step": 4001 }, { "epoch": 0.5612903225806452, "grad_norm": 1.9120007056430488, "learning_rate": 4.253291448801989e-06, "loss": 0.3505, "step": 4002 }, { "epoch": 0.5614305750350631, "grad_norm": 2.0482409629552576, "learning_rate": 4.251045746767601e-06, "loss": 0.3517, "step": 4003 }, { "epoch": 0.5615708274894811, "grad_norm": 3.569560766901653, "learning_rate": 4.248800199274689e-06, "loss": 0.3418, "step": 4004 }, { "epoch": 0.561711079943899, "grad_norm": 1.836702163212503, "learning_rate": 4.246554806786607e-06, "loss": 0.3388, "step": 4005 }, { "epoch": 0.561851332398317, "grad_norm": 2.824366479916942, "learning_rate": 4.244309569766677e-06, "loss": 0.4082, "step": 4006 }, { "epoch": 0.5619915848527349, "grad_norm": 3.0586480029910557, "learning_rate": 4.242064488678188e-06, "loss": 0.3457, "step": 4007 }, { "epoch": 0.5621318373071529, "grad_norm": 2.000085471234275, "learning_rate": 4.239819563984397e-06, "loss": 0.3829, "step": 4008 }, { "epoch": 0.5622720897615708, "grad_norm": 1.816690139735518, "learning_rate": 4.237574796148527e-06, "loss": 0.3454, "step": 4009 }, { "epoch": 0.5624123422159888, "grad_norm": 2.294144960747169, "learning_rate": 4.23533018563377e-06, "loss": 0.3405, "step": 4010 }, { "epoch": 0.5625525946704067, "grad_norm": 2.0517414279376136, "learning_rate": 4.233085732903288e-06, "loss": 0.3738, "step": 4011 }, { "epoch": 0.5626928471248247, "grad_norm": 2.931601100301933, "learning_rate": 4.230841438420209e-06, "loss": 0.3538, "step": 4012 }, { "epoch": 0.5628330995792427, "grad_norm": 2.27883365111371, "learning_rate": 4.228597302647622e-06, "loss": 0.3612, "step": 4013 }, { "epoch": 0.5629733520336606, "grad_norm": 3.1511775888599494, "learning_rate": 4.226353326048594e-06, "loss": 0.3501, "step": 4014 }, { "epoch": 0.5631136044880786, "grad_norm": 1.6144590350471384, "learning_rate": 4.224109509086151e-06, "loss": 0.3443, "step": 4015 }, { "epoch": 0.5632538569424965, "grad_norm": 1.890248221693349, "learning_rate": 4.221865852223293e-06, "loss": 0.3369, "step": 4016 }, { "epoch": 0.5633941093969145, "grad_norm": 2.230133726275793, "learning_rate": 4.219622355922976e-06, "loss": 0.3472, "step": 4017 }, { "epoch": 0.5635343618513324, "grad_norm": 1.796377760047483, "learning_rate": 4.217379020648135e-06, "loss": 0.392, "step": 4018 }, { "epoch": 0.5636746143057504, "grad_norm": 1.6747253221218918, "learning_rate": 4.2151358468616675e-06, "loss": 0.3612, "step": 4019 }, { "epoch": 0.5638148667601683, "grad_norm": 2.7354067354586107, "learning_rate": 4.212892835026432e-06, "loss": 0.3715, "step": 4020 }, { "epoch": 0.5639551192145863, "grad_norm": 5.220324067619909, "learning_rate": 4.2106499856052604e-06, "loss": 0.3983, "step": 4021 }, { "epoch": 0.5640953716690043, "grad_norm": 1.9543880193585925, "learning_rate": 4.2084072990609505e-06, "loss": 0.3922, "step": 4022 }, { "epoch": 0.5642356241234222, "grad_norm": 2.089358949134604, "learning_rate": 4.206164775856265e-06, "loss": 0.3637, "step": 4023 }, { "epoch": 0.5643758765778402, "grad_norm": 3.107859640018066, "learning_rate": 4.2039224164539306e-06, "loss": 0.3743, "step": 4024 }, { "epoch": 0.5645161290322581, "grad_norm": 2.259329104979652, "learning_rate": 4.201680221316643e-06, "loss": 0.359, "step": 4025 }, { "epoch": 0.564656381486676, "grad_norm": 2.7861196341806282, "learning_rate": 4.1994381909070645e-06, "loss": 0.3375, "step": 4026 }, { "epoch": 0.5647966339410939, "grad_norm": 1.9161412997933962, "learning_rate": 4.1971963256878224e-06, "loss": 0.3395, "step": 4027 }, { "epoch": 0.5649368863955119, "grad_norm": 1.8622376660437714, "learning_rate": 4.194954626121511e-06, "loss": 0.3879, "step": 4028 }, { "epoch": 0.5650771388499298, "grad_norm": 1.9647463097499447, "learning_rate": 4.192713092670687e-06, "loss": 0.3, "step": 4029 }, { "epoch": 0.5652173913043478, "grad_norm": 2.1680854284832543, "learning_rate": 4.190471725797875e-06, "loss": 0.4018, "step": 4030 }, { "epoch": 0.5653576437587657, "grad_norm": 2.4319991617264054, "learning_rate": 4.188230525965567e-06, "loss": 0.3962, "step": 4031 }, { "epoch": 0.5654978962131837, "grad_norm": 2.7505121621128428, "learning_rate": 4.185989493636219e-06, "loss": 0.3428, "step": 4032 }, { "epoch": 0.5656381486676016, "grad_norm": 2.3025213892105794, "learning_rate": 4.183748629272254e-06, "loss": 0.3309, "step": 4033 }, { "epoch": 0.5657784011220196, "grad_norm": 2.0003605755972007, "learning_rate": 4.181507933336054e-06, "loss": 0.3762, "step": 4034 }, { "epoch": 0.5659186535764376, "grad_norm": 2.7179611640159838, "learning_rate": 4.179267406289974e-06, "loss": 0.3772, "step": 4035 }, { "epoch": 0.5660589060308555, "grad_norm": 2.1792011333630676, "learning_rate": 4.17702704859633e-06, "loss": 0.3388, "step": 4036 }, { "epoch": 0.5661991584852735, "grad_norm": 2.1503614654088468, "learning_rate": 4.174786860717408e-06, "loss": 0.3605, "step": 4037 }, { "epoch": 0.5663394109396914, "grad_norm": 1.9815515571488207, "learning_rate": 4.172546843115449e-06, "loss": 0.3638, "step": 4038 }, { "epoch": 0.5664796633941094, "grad_norm": 2.500379342862865, "learning_rate": 4.170306996252669e-06, "loss": 0.3447, "step": 4039 }, { "epoch": 0.5666199158485273, "grad_norm": 1.7325283908031537, "learning_rate": 4.1680673205912425e-06, "loss": 0.3438, "step": 4040 }, { "epoch": 0.5667601683029453, "grad_norm": 2.233380182923048, "learning_rate": 4.165827816593312e-06, "loss": 0.3471, "step": 4041 }, { "epoch": 0.5669004207573632, "grad_norm": 8.051422789516575, "learning_rate": 4.163588484720984e-06, "loss": 0.3638, "step": 4042 }, { "epoch": 0.5670406732117812, "grad_norm": 2.3848292086885925, "learning_rate": 4.161349325436328e-06, "loss": 0.3537, "step": 4043 }, { "epoch": 0.5671809256661992, "grad_norm": 2.030663625737846, "learning_rate": 4.159110339201381e-06, "loss": 0.3039, "step": 4044 }, { "epoch": 0.5673211781206171, "grad_norm": 2.183869864761213, "learning_rate": 4.156871526478139e-06, "loss": 0.3543, "step": 4045 }, { "epoch": 0.5674614305750351, "grad_norm": 4.95938898383839, "learning_rate": 4.1546328877285665e-06, "loss": 0.3324, "step": 4046 }, { "epoch": 0.567601683029453, "grad_norm": 2.3391394470646083, "learning_rate": 4.152394423414593e-06, "loss": 0.3408, "step": 4047 }, { "epoch": 0.567741935483871, "grad_norm": 1.9948738685450287, "learning_rate": 4.1501561339981086e-06, "loss": 0.3632, "step": 4048 }, { "epoch": 0.5678821879382889, "grad_norm": 1.9703244165820992, "learning_rate": 4.147918019940967e-06, "loss": 0.3855, "step": 4049 }, { "epoch": 0.5680224403927069, "grad_norm": 2.760256022451316, "learning_rate": 4.145680081704989e-06, "loss": 0.4089, "step": 4050 }, { "epoch": 0.5681626928471248, "grad_norm": 2.311451442227457, "learning_rate": 4.143442319751958e-06, "loss": 0.3402, "step": 4051 }, { "epoch": 0.5683029453015428, "grad_norm": 2.402487451240722, "learning_rate": 4.14120473454362e-06, "loss": 0.3526, "step": 4052 }, { "epoch": 0.5684431977559607, "grad_norm": 2.042516596340329, "learning_rate": 4.138967326541685e-06, "loss": 0.3308, "step": 4053 }, { "epoch": 0.5685834502103787, "grad_norm": 2.4331368741545063, "learning_rate": 4.136730096207827e-06, "loss": 0.3503, "step": 4054 }, { "epoch": 0.5687237026647967, "grad_norm": 2.192411848186068, "learning_rate": 4.134493044003681e-06, "loss": 0.349, "step": 4055 }, { "epoch": 0.5688639551192146, "grad_norm": 2.7949759573405664, "learning_rate": 4.132256170390848e-06, "loss": 0.3555, "step": 4056 }, { "epoch": 0.5690042075736326, "grad_norm": 2.2904647333927173, "learning_rate": 4.1300194758308935e-06, "loss": 0.4011, "step": 4057 }, { "epoch": 0.5691444600280505, "grad_norm": 4.207944921314374, "learning_rate": 4.127782960785344e-06, "loss": 0.3558, "step": 4058 }, { "epoch": 0.5692847124824685, "grad_norm": 2.1428089000175983, "learning_rate": 4.125546625715683e-06, "loss": 0.368, "step": 4059 }, { "epoch": 0.5694249649368864, "grad_norm": 1.7858487378320997, "learning_rate": 4.123310471083368e-06, "loss": 0.3317, "step": 4060 }, { "epoch": 0.5695652173913044, "grad_norm": 2.350428310889083, "learning_rate": 4.121074497349811e-06, "loss": 0.3393, "step": 4061 }, { "epoch": 0.5697054698457223, "grad_norm": 3.1231477970423382, "learning_rate": 4.118838704976392e-06, "loss": 0.3643, "step": 4062 }, { "epoch": 0.5698457223001403, "grad_norm": 2.4124420554362342, "learning_rate": 4.116603094424449e-06, "loss": 0.3804, "step": 4063 }, { "epoch": 0.5699859747545583, "grad_norm": 1.7870402671918004, "learning_rate": 4.1143676661552876e-06, "loss": 0.38, "step": 4064 }, { "epoch": 0.5701262272089762, "grad_norm": 2.6742602849660737, "learning_rate": 4.112132420630169e-06, "loss": 0.325, "step": 4065 }, { "epoch": 0.570266479663394, "grad_norm": 2.3125673232717587, "learning_rate": 4.1098973583103226e-06, "loss": 0.3332, "step": 4066 }, { "epoch": 0.570406732117812, "grad_norm": 2.7874232157579404, "learning_rate": 4.107662479656937e-06, "loss": 0.3422, "step": 4067 }, { "epoch": 0.57054698457223, "grad_norm": 2.1500779426108623, "learning_rate": 4.105427785131165e-06, "loss": 0.3659, "step": 4068 }, { "epoch": 0.5706872370266479, "grad_norm": 2.408367760840079, "learning_rate": 4.10319327519412e-06, "loss": 0.3454, "step": 4069 }, { "epoch": 0.5708274894810659, "grad_norm": 2.678832693528013, "learning_rate": 4.1009589503068755e-06, "loss": 0.3558, "step": 4070 }, { "epoch": 0.5709677419354838, "grad_norm": 1.825368384191693, "learning_rate": 4.098724810930472e-06, "loss": 0.3035, "step": 4071 }, { "epoch": 0.5711079943899018, "grad_norm": 2.283628817278272, "learning_rate": 4.096490857525906e-06, "loss": 0.3396, "step": 4072 }, { "epoch": 0.5712482468443197, "grad_norm": 1.9964474478515783, "learning_rate": 4.094257090554139e-06, "loss": 0.3428, "step": 4073 }, { "epoch": 0.5713884992987377, "grad_norm": 1.807067622681657, "learning_rate": 4.092023510476095e-06, "loss": 0.344, "step": 4074 }, { "epoch": 0.5715287517531557, "grad_norm": 2.518016839108204, "learning_rate": 4.089790117752655e-06, "loss": 0.3669, "step": 4075 }, { "epoch": 0.5716690042075736, "grad_norm": 2.0935175752450963, "learning_rate": 4.087556912844664e-06, "loss": 0.3312, "step": 4076 }, { "epoch": 0.5718092566619916, "grad_norm": 2.089938894227294, "learning_rate": 4.08532389621293e-06, "loss": 0.3378, "step": 4077 }, { "epoch": 0.5719495091164095, "grad_norm": 1.8908652791327984, "learning_rate": 4.08309106831822e-06, "loss": 0.3357, "step": 4078 }, { "epoch": 0.5720897615708275, "grad_norm": 3.4453951248330204, "learning_rate": 4.080858429621262e-06, "loss": 0.3927, "step": 4079 }, { "epoch": 0.5722300140252454, "grad_norm": 1.8776968317185028, "learning_rate": 4.078625980582746e-06, "loss": 0.3829, "step": 4080 }, { "epoch": 0.5723702664796634, "grad_norm": 1.9442539015764289, "learning_rate": 4.076393721663321e-06, "loss": 0.3639, "step": 4081 }, { "epoch": 0.5725105189340813, "grad_norm": 1.8928874326991776, "learning_rate": 4.0741616533235975e-06, "loss": 0.3402, "step": 4082 }, { "epoch": 0.5726507713884993, "grad_norm": 2.9785241098752095, "learning_rate": 4.071929776024149e-06, "loss": 0.3626, "step": 4083 }, { "epoch": 0.5727910238429172, "grad_norm": 2.0957183695129373, "learning_rate": 4.069698090225508e-06, "loss": 0.3352, "step": 4084 }, { "epoch": 0.5729312762973352, "grad_norm": 2.9927657641804326, "learning_rate": 4.067466596388166e-06, "loss": 0.3503, "step": 4085 }, { "epoch": 0.5730715287517532, "grad_norm": 2.314194032999261, "learning_rate": 4.065235294972577e-06, "loss": 0.3065, "step": 4086 }, { "epoch": 0.5732117812061711, "grad_norm": 2.37091596713029, "learning_rate": 4.063004186439153e-06, "loss": 0.2762, "step": 4087 }, { "epoch": 0.5733520336605891, "grad_norm": 2.537302951020786, "learning_rate": 4.06077327124827e-06, "loss": 0.3668, "step": 4088 }, { "epoch": 0.573492286115007, "grad_norm": 3.281149871751669, "learning_rate": 4.0585425498602605e-06, "loss": 0.3309, "step": 4089 }, { "epoch": 0.573632538569425, "grad_norm": 1.9202906583284154, "learning_rate": 4.056312022735417e-06, "loss": 0.3415, "step": 4090 }, { "epoch": 0.5737727910238429, "grad_norm": 2.0862442533986445, "learning_rate": 4.054081690333995e-06, "loss": 0.3544, "step": 4091 }, { "epoch": 0.5739130434782609, "grad_norm": 1.964025914982669, "learning_rate": 4.051851553116208e-06, "loss": 0.3419, "step": 4092 }, { "epoch": 0.5740532959326788, "grad_norm": 2.2269258972227144, "learning_rate": 4.049621611542228e-06, "loss": 0.3648, "step": 4093 }, { "epoch": 0.5741935483870968, "grad_norm": 2.4151050081978958, "learning_rate": 4.04739186607219e-06, "loss": 0.3309, "step": 4094 }, { "epoch": 0.5743338008415148, "grad_norm": 2.4053281529251667, "learning_rate": 4.045162317166184e-06, "loss": 0.4001, "step": 4095 }, { "epoch": 0.5744740532959327, "grad_norm": 1.9920034884571645, "learning_rate": 4.0429329652842625e-06, "loss": 0.3321, "step": 4096 }, { "epoch": 0.5746143057503507, "grad_norm": 3.5364054627696375, "learning_rate": 4.040703810886437e-06, "loss": 0.3325, "step": 4097 }, { "epoch": 0.5747545582047686, "grad_norm": 1.6961010623951722, "learning_rate": 4.038474854432679e-06, "loss": 0.3669, "step": 4098 }, { "epoch": 0.5748948106591866, "grad_norm": 2.2859871160789926, "learning_rate": 4.036246096382916e-06, "loss": 0.329, "step": 4099 }, { "epoch": 0.5750350631136045, "grad_norm": 2.257150519980587, "learning_rate": 4.03401753719704e-06, "loss": 0.4075, "step": 4100 }, { "epoch": 0.5751753155680225, "grad_norm": 2.83911022593347, "learning_rate": 4.031789177334895e-06, "loss": 0.321, "step": 4101 }, { "epoch": 0.5753155680224404, "grad_norm": 2.415466295484493, "learning_rate": 4.029561017256288e-06, "loss": 0.3603, "step": 4102 }, { "epoch": 0.5754558204768584, "grad_norm": 2.6474540932685415, "learning_rate": 4.027333057420985e-06, "loss": 0.3543, "step": 4103 }, { "epoch": 0.5755960729312763, "grad_norm": 2.710192487292419, "learning_rate": 4.0251052982887105e-06, "loss": 0.3019, "step": 4104 }, { "epoch": 0.5757363253856943, "grad_norm": 1.9153099510411178, "learning_rate": 4.022877740319147e-06, "loss": 0.3537, "step": 4105 }, { "epoch": 0.5758765778401121, "grad_norm": 2.413161915316689, "learning_rate": 4.0206503839719335e-06, "loss": 0.3565, "step": 4106 }, { "epoch": 0.5760168302945301, "grad_norm": 2.1596803882577276, "learning_rate": 4.018423229706672e-06, "loss": 0.3405, "step": 4107 }, { "epoch": 0.5761570827489481, "grad_norm": 3.2498479954312023, "learning_rate": 4.016196277982919e-06, "loss": 0.4166, "step": 4108 }, { "epoch": 0.576297335203366, "grad_norm": 5.20678270781313, "learning_rate": 4.013969529260191e-06, "loss": 0.3417, "step": 4109 }, { "epoch": 0.576437587657784, "grad_norm": 2.3844484810361957, "learning_rate": 4.011742983997961e-06, "loss": 0.3696, "step": 4110 }, { "epoch": 0.5765778401122019, "grad_norm": 4.76942967362148, "learning_rate": 4.009516642655662e-06, "loss": 0.369, "step": 4111 }, { "epoch": 0.5767180925666199, "grad_norm": 1.697423789289751, "learning_rate": 4.007290505692684e-06, "loss": 0.3553, "step": 4112 }, { "epoch": 0.5768583450210378, "grad_norm": 3.95416523511651, "learning_rate": 4.0050645735683745e-06, "loss": 0.3909, "step": 4113 }, { "epoch": 0.5769985974754558, "grad_norm": 2.139558512522974, "learning_rate": 4.002838846742039e-06, "loss": 0.3137, "step": 4114 }, { "epoch": 0.5771388499298737, "grad_norm": 2.137558236918246, "learning_rate": 4.000613325672942e-06, "loss": 0.3775, "step": 4115 }, { "epoch": 0.5772791023842917, "grad_norm": 2.6642570536433197, "learning_rate": 3.998388010820301e-06, "loss": 0.3336, "step": 4116 }, { "epoch": 0.5774193548387097, "grad_norm": 2.0161749510919544, "learning_rate": 3.996162902643296e-06, "loss": 0.3749, "step": 4117 }, { "epoch": 0.5775596072931276, "grad_norm": 2.49786867843826, "learning_rate": 3.993938001601064e-06, "loss": 0.3729, "step": 4118 }, { "epoch": 0.5776998597475456, "grad_norm": 2.08646720418379, "learning_rate": 3.991713308152696e-06, "loss": 0.3339, "step": 4119 }, { "epoch": 0.5778401122019635, "grad_norm": 2.3520681654984763, "learning_rate": 3.989488822757244e-06, "loss": 0.3286, "step": 4120 }, { "epoch": 0.5779803646563815, "grad_norm": 1.8700974904111196, "learning_rate": 3.987264545873712e-06, "loss": 0.3962, "step": 4121 }, { "epoch": 0.5781206171107994, "grad_norm": 2.6825891027379534, "learning_rate": 3.985040477961066e-06, "loss": 0.3333, "step": 4122 }, { "epoch": 0.5782608695652174, "grad_norm": 2.5669499348304448, "learning_rate": 3.982816619478225e-06, "loss": 0.361, "step": 4123 }, { "epoch": 0.5784011220196353, "grad_norm": 3.6099286335999854, "learning_rate": 3.980592970884069e-06, "loss": 0.3846, "step": 4124 }, { "epoch": 0.5785413744740533, "grad_norm": 2.451039490791346, "learning_rate": 3.97836953263743e-06, "loss": 0.3346, "step": 4125 }, { "epoch": 0.5786816269284712, "grad_norm": 2.0278558396710644, "learning_rate": 3.976146305197102e-06, "loss": 0.3454, "step": 4126 }, { "epoch": 0.5788218793828892, "grad_norm": 2.512018307491578, "learning_rate": 3.973923289021829e-06, "loss": 0.3333, "step": 4127 }, { "epoch": 0.5789621318373072, "grad_norm": 3.0108364212973355, "learning_rate": 3.9717004845703175e-06, "loss": 0.384, "step": 4128 }, { "epoch": 0.5791023842917251, "grad_norm": 1.7171409704761555, "learning_rate": 3.969477892301227e-06, "loss": 0.3382, "step": 4129 }, { "epoch": 0.5792426367461431, "grad_norm": 1.9232288337275685, "learning_rate": 3.967255512673174e-06, "loss": 0.3725, "step": 4130 }, { "epoch": 0.579382889200561, "grad_norm": 2.1859354145948102, "learning_rate": 3.96503334614473e-06, "loss": 0.3182, "step": 4131 }, { "epoch": 0.579523141654979, "grad_norm": 1.6894634268562354, "learning_rate": 3.962811393174423e-06, "loss": 0.3196, "step": 4132 }, { "epoch": 0.5796633941093969, "grad_norm": 2.491464826984387, "learning_rate": 3.96058965422074e-06, "loss": 0.3203, "step": 4133 }, { "epoch": 0.5798036465638149, "grad_norm": 1.875576312187306, "learning_rate": 3.9583681297421194e-06, "loss": 0.3811, "step": 4134 }, { "epoch": 0.5799438990182328, "grad_norm": 2.800764480178831, "learning_rate": 3.956146820196959e-06, "loss": 0.3523, "step": 4135 }, { "epoch": 0.5800841514726508, "grad_norm": 1.9469945904539534, "learning_rate": 3.9539257260436085e-06, "loss": 0.3401, "step": 4136 }, { "epoch": 0.5802244039270688, "grad_norm": 1.7060160466028682, "learning_rate": 3.9517048477403755e-06, "loss": 0.3508, "step": 4137 }, { "epoch": 0.5803646563814867, "grad_norm": 1.7768616410221871, "learning_rate": 3.949484185745523e-06, "loss": 0.3836, "step": 4138 }, { "epoch": 0.5805049088359047, "grad_norm": 2.009590282150025, "learning_rate": 3.94726374051727e-06, "loss": 0.3463, "step": 4139 }, { "epoch": 0.5806451612903226, "grad_norm": 2.384161995389065, "learning_rate": 3.94504351251379e-06, "loss": 0.3441, "step": 4140 }, { "epoch": 0.5807854137447406, "grad_norm": 2.250092292588194, "learning_rate": 3.9428235021932104e-06, "loss": 0.3215, "step": 4141 }, { "epoch": 0.5809256661991585, "grad_norm": 2.057154461756021, "learning_rate": 3.940603710013615e-06, "loss": 0.3749, "step": 4142 }, { "epoch": 0.5810659186535765, "grad_norm": 2.2480537686213267, "learning_rate": 3.9383841364330425e-06, "loss": 0.3554, "step": 4143 }, { "epoch": 0.5812061711079944, "grad_norm": 2.984949186521969, "learning_rate": 3.936164781909485e-06, "loss": 0.3668, "step": 4144 }, { "epoch": 0.5813464235624124, "grad_norm": 1.8679294787865324, "learning_rate": 3.933945646900893e-06, "loss": 0.3709, "step": 4145 }, { "epoch": 0.5814866760168302, "grad_norm": 1.750664380525044, "learning_rate": 3.93172673186517e-06, "loss": 0.3513, "step": 4146 }, { "epoch": 0.5816269284712482, "grad_norm": 2.297870238715008, "learning_rate": 3.92950803726017e-06, "loss": 0.3583, "step": 4147 }, { "epoch": 0.5817671809256661, "grad_norm": 1.8358281529105722, "learning_rate": 3.927289563543709e-06, "loss": 0.3554, "step": 4148 }, { "epoch": 0.5819074333800841, "grad_norm": 2.239116413029075, "learning_rate": 3.925071311173551e-06, "loss": 0.3297, "step": 4149 }, { "epoch": 0.5820476858345021, "grad_norm": 2.5718902105389176, "learning_rate": 3.9228532806074184e-06, "loss": 0.3594, "step": 4150 }, { "epoch": 0.58218793828892, "grad_norm": 2.0238983925506586, "learning_rate": 3.920635472302986e-06, "loss": 0.387, "step": 4151 }, { "epoch": 0.582328190743338, "grad_norm": 1.830865419905645, "learning_rate": 3.918417886717884e-06, "loss": 0.3827, "step": 4152 }, { "epoch": 0.5824684431977559, "grad_norm": 1.8999162730792385, "learning_rate": 3.916200524309693e-06, "loss": 0.3711, "step": 4153 }, { "epoch": 0.5826086956521739, "grad_norm": 1.771343322319912, "learning_rate": 3.913983385535951e-06, "loss": 0.3397, "step": 4154 }, { "epoch": 0.5827489481065918, "grad_norm": 2.2062049061432676, "learning_rate": 3.911766470854152e-06, "loss": 0.4011, "step": 4155 }, { "epoch": 0.5828892005610098, "grad_norm": 2.2108300927698354, "learning_rate": 3.9095497807217375e-06, "loss": 0.4168, "step": 4156 }, { "epoch": 0.5830294530154277, "grad_norm": 2.0882882006437122, "learning_rate": 3.907333315596107e-06, "loss": 0.3646, "step": 4157 }, { "epoch": 0.5831697054698457, "grad_norm": 2.041904850774565, "learning_rate": 3.905117075934613e-06, "loss": 0.3221, "step": 4158 }, { "epoch": 0.5833099579242637, "grad_norm": 1.7812451145038197, "learning_rate": 3.902901062194561e-06, "loss": 0.3229, "step": 4159 }, { "epoch": 0.5834502103786816, "grad_norm": 2.1272066664052933, "learning_rate": 3.900685274833211e-06, "loss": 0.3354, "step": 4160 }, { "epoch": 0.5835904628330996, "grad_norm": 1.7319853532669596, "learning_rate": 3.898469714307773e-06, "loss": 0.3773, "step": 4161 }, { "epoch": 0.5837307152875175, "grad_norm": 2.294328692453924, "learning_rate": 3.896254381075416e-06, "loss": 0.3692, "step": 4162 }, { "epoch": 0.5838709677419355, "grad_norm": 2.3797379466449975, "learning_rate": 3.894039275593253e-06, "loss": 0.3706, "step": 4163 }, { "epoch": 0.5840112201963534, "grad_norm": 2.2372456496602755, "learning_rate": 3.891824398318359e-06, "loss": 0.3712, "step": 4164 }, { "epoch": 0.5841514726507714, "grad_norm": 2.671888295636921, "learning_rate": 3.889609749707759e-06, "loss": 0.3605, "step": 4165 }, { "epoch": 0.5842917251051893, "grad_norm": 2.101199888123125, "learning_rate": 3.887395330218429e-06, "loss": 0.4009, "step": 4166 }, { "epoch": 0.5844319775596073, "grad_norm": 2.753203173771317, "learning_rate": 3.8851811403073e-06, "loss": 0.3443, "step": 4167 }, { "epoch": 0.5845722300140253, "grad_norm": 2.624668191193143, "learning_rate": 3.882967180431253e-06, "loss": 0.3406, "step": 4168 }, { "epoch": 0.5847124824684432, "grad_norm": 2.0941259701356043, "learning_rate": 3.880753451047124e-06, "loss": 0.3835, "step": 4169 }, { "epoch": 0.5848527349228612, "grad_norm": 2.07792444623775, "learning_rate": 3.8785399526117e-06, "loss": 0.4415, "step": 4170 }, { "epoch": 0.5849929873772791, "grad_norm": 2.7015436210436, "learning_rate": 3.876326685581724e-06, "loss": 0.3334, "step": 4171 }, { "epoch": 0.5851332398316971, "grad_norm": 2.4641662730617764, "learning_rate": 3.874113650413884e-06, "loss": 0.383, "step": 4172 }, { "epoch": 0.585273492286115, "grad_norm": 2.2109716837366777, "learning_rate": 3.8719008475648265e-06, "loss": 0.3734, "step": 4173 }, { "epoch": 0.585413744740533, "grad_norm": 2.9551476711805598, "learning_rate": 3.869688277491148e-06, "loss": 0.3484, "step": 4174 }, { "epoch": 0.5855539971949509, "grad_norm": 1.9807678839441782, "learning_rate": 3.867475940649396e-06, "loss": 0.358, "step": 4175 }, { "epoch": 0.5856942496493689, "grad_norm": 1.93026156515909, "learning_rate": 3.865263837496072e-06, "loss": 0.3752, "step": 4176 }, { "epoch": 0.5858345021037868, "grad_norm": 2.6938512406939856, "learning_rate": 3.8630519684876264e-06, "loss": 0.3334, "step": 4177 }, { "epoch": 0.5859747545582048, "grad_norm": 1.9554531412267513, "learning_rate": 3.860840334080463e-06, "loss": 0.348, "step": 4178 }, { "epoch": 0.5861150070126228, "grad_norm": 1.9979749202357044, "learning_rate": 3.858628934730939e-06, "loss": 0.3604, "step": 4179 }, { "epoch": 0.5862552594670407, "grad_norm": 2.492117566888369, "learning_rate": 3.8564177708953595e-06, "loss": 0.3474, "step": 4180 }, { "epoch": 0.5863955119214587, "grad_norm": 1.7499045618463536, "learning_rate": 3.854206843029985e-06, "loss": 0.3244, "step": 4181 }, { "epoch": 0.5865357643758766, "grad_norm": 2.8248339308816983, "learning_rate": 3.851996151591022e-06, "loss": 0.3953, "step": 4182 }, { "epoch": 0.5866760168302946, "grad_norm": 2.0510990732146728, "learning_rate": 3.849785697034634e-06, "loss": 0.3516, "step": 4183 }, { "epoch": 0.5868162692847125, "grad_norm": 2.5842645771463886, "learning_rate": 3.847575479816929e-06, "loss": 0.3418, "step": 4184 }, { "epoch": 0.5869565217391305, "grad_norm": 2.3155732224991037, "learning_rate": 3.845365500393974e-06, "loss": 0.3506, "step": 4185 }, { "epoch": 0.5870967741935483, "grad_norm": 2.9519515355171384, "learning_rate": 3.84315575922178e-06, "loss": 0.3698, "step": 4186 }, { "epoch": 0.5872370266479663, "grad_norm": 2.4845983057009717, "learning_rate": 3.840946256756314e-06, "loss": 0.4092, "step": 4187 }, { "epoch": 0.5873772791023842, "grad_norm": 3.67948339588697, "learning_rate": 3.838736993453489e-06, "loss": 0.343, "step": 4188 }, { "epoch": 0.5875175315568022, "grad_norm": 2.7732259320437476, "learning_rate": 3.836527969769172e-06, "loss": 0.3358, "step": 4189 }, { "epoch": 0.5876577840112202, "grad_norm": 2.899068511795467, "learning_rate": 3.834319186159179e-06, "loss": 0.3636, "step": 4190 }, { "epoch": 0.5877980364656381, "grad_norm": 1.931005051272881, "learning_rate": 3.83211064307928e-06, "loss": 0.3908, "step": 4191 }, { "epoch": 0.5879382889200561, "grad_norm": 2.257105627580685, "learning_rate": 3.829902340985189e-06, "loss": 0.3682, "step": 4192 }, { "epoch": 0.588078541374474, "grad_norm": 2.693213488616094, "learning_rate": 3.827694280332575e-06, "loss": 0.3605, "step": 4193 }, { "epoch": 0.588218793828892, "grad_norm": 2.5910509152814916, "learning_rate": 3.8254864615770556e-06, "loss": 0.3623, "step": 4194 }, { "epoch": 0.5883590462833099, "grad_norm": 3.725701126577724, "learning_rate": 3.8232788851742e-06, "loss": 0.3629, "step": 4195 }, { "epoch": 0.5884992987377279, "grad_norm": 1.8032178963940022, "learning_rate": 3.821071551579525e-06, "loss": 0.3515, "step": 4196 }, { "epoch": 0.5886395511921458, "grad_norm": 3.49545879032298, "learning_rate": 3.818864461248498e-06, "loss": 0.382, "step": 4197 }, { "epoch": 0.5887798036465638, "grad_norm": 1.9855891440674973, "learning_rate": 3.816657614636538e-06, "loss": 0.3487, "step": 4198 }, { "epoch": 0.5889200561009817, "grad_norm": 5.877248455523978, "learning_rate": 3.8144510121990106e-06, "loss": 0.3456, "step": 4199 }, { "epoch": 0.5890603085553997, "grad_norm": 2.6702061845698797, "learning_rate": 3.812244654391235e-06, "loss": 0.3363, "step": 4200 }, { "epoch": 0.5892005610098177, "grad_norm": 1.8190881106787657, "learning_rate": 3.810038541668477e-06, "loss": 0.3409, "step": 4201 }, { "epoch": 0.5893408134642356, "grad_norm": 3.192438525964976, "learning_rate": 3.8078326744859516e-06, "loss": 0.3674, "step": 4202 }, { "epoch": 0.5894810659186536, "grad_norm": 2.1368540942409857, "learning_rate": 3.805627053298825e-06, "loss": 0.3639, "step": 4203 }, { "epoch": 0.5896213183730715, "grad_norm": 2.170118101571448, "learning_rate": 3.803421678562213e-06, "loss": 0.3482, "step": 4204 }, { "epoch": 0.5897615708274895, "grad_norm": 2.154160191755568, "learning_rate": 3.8012165507311756e-06, "loss": 0.3694, "step": 4205 }, { "epoch": 0.5899018232819074, "grad_norm": 1.9468775202702702, "learning_rate": 3.799011670260727e-06, "loss": 0.3535, "step": 4206 }, { "epoch": 0.5900420757363254, "grad_norm": 2.7769812925517035, "learning_rate": 3.7968070376058304e-06, "loss": 0.341, "step": 4207 }, { "epoch": 0.5901823281907433, "grad_norm": 2.408261436876393, "learning_rate": 3.7946026532213965e-06, "loss": 0.3423, "step": 4208 }, { "epoch": 0.5903225806451613, "grad_norm": 2.3279229402864403, "learning_rate": 3.792398517562282e-06, "loss": 0.3517, "step": 4209 }, { "epoch": 0.5904628330995793, "grad_norm": 2.4121815288423836, "learning_rate": 3.7901946310832966e-06, "loss": 0.3555, "step": 4210 }, { "epoch": 0.5906030855539972, "grad_norm": 2.052644707693104, "learning_rate": 3.7879909942391963e-06, "loss": 0.3513, "step": 4211 }, { "epoch": 0.5907433380084152, "grad_norm": 2.631633369987309, "learning_rate": 3.7857876074846878e-06, "loss": 0.2984, "step": 4212 }, { "epoch": 0.5908835904628331, "grad_norm": 2.072797557789031, "learning_rate": 3.7835844712744228e-06, "loss": 0.3642, "step": 4213 }, { "epoch": 0.5910238429172511, "grad_norm": 2.409708980068304, "learning_rate": 3.7813815860630034e-06, "loss": 0.3531, "step": 4214 }, { "epoch": 0.591164095371669, "grad_norm": 2.698151881349362, "learning_rate": 3.7791789523049793e-06, "loss": 0.3653, "step": 4215 }, { "epoch": 0.591304347826087, "grad_norm": 2.2721115387952597, "learning_rate": 3.7769765704548494e-06, "loss": 0.3591, "step": 4216 }, { "epoch": 0.5914446002805049, "grad_norm": 1.8682055391452301, "learning_rate": 3.7747744409670608e-06, "loss": 0.3683, "step": 4217 }, { "epoch": 0.5915848527349229, "grad_norm": 2.0743420810276967, "learning_rate": 3.7725725642960047e-06, "loss": 0.3553, "step": 4218 }, { "epoch": 0.5917251051893408, "grad_norm": 1.9468333722068214, "learning_rate": 3.770370940896025e-06, "loss": 0.3344, "step": 4219 }, { "epoch": 0.5918653576437588, "grad_norm": 1.8901671808971614, "learning_rate": 3.76816957122141e-06, "loss": 0.3315, "step": 4220 }, { "epoch": 0.5920056100981768, "grad_norm": 2.5630499086912333, "learning_rate": 3.765968455726398e-06, "loss": 0.3568, "step": 4221 }, { "epoch": 0.5921458625525947, "grad_norm": 2.6366259749303174, "learning_rate": 3.7637675948651754e-06, "loss": 0.356, "step": 4222 }, { "epoch": 0.5922861150070127, "grad_norm": 2.0677242582718445, "learning_rate": 3.7615669890918706e-06, "loss": 0.3448, "step": 4223 }, { "epoch": 0.5924263674614306, "grad_norm": 2.4100187438615825, "learning_rate": 3.7593666388605654e-06, "loss": 0.3103, "step": 4224 }, { "epoch": 0.5925666199158486, "grad_norm": 2.843861169004524, "learning_rate": 3.7571665446252886e-06, "loss": 0.3897, "step": 4225 }, { "epoch": 0.5927068723702664, "grad_norm": 2.3019327543099495, "learning_rate": 3.7549667068400104e-06, "loss": 0.3735, "step": 4226 }, { "epoch": 0.5928471248246844, "grad_norm": 2.275953545744929, "learning_rate": 3.7527671259586536e-06, "loss": 0.3011, "step": 4227 }, { "epoch": 0.5929873772791023, "grad_norm": 1.7204963135462241, "learning_rate": 3.7505678024350874e-06, "loss": 0.3226, "step": 4228 }, { "epoch": 0.5931276297335203, "grad_norm": 3.8261648959864885, "learning_rate": 3.748368736723125e-06, "loss": 0.3311, "step": 4229 }, { "epoch": 0.5932678821879382, "grad_norm": 4.350846236776729, "learning_rate": 3.746169929276529e-06, "loss": 0.347, "step": 4230 }, { "epoch": 0.5934081346423562, "grad_norm": 2.436453080929339, "learning_rate": 3.7439713805490087e-06, "loss": 0.3529, "step": 4231 }, { "epoch": 0.5935483870967742, "grad_norm": 3.4349530581358283, "learning_rate": 3.7417730909942184e-06, "loss": 0.3745, "step": 4232 }, { "epoch": 0.5936886395511921, "grad_norm": 2.330227510535075, "learning_rate": 3.739575061065761e-06, "loss": 0.3097, "step": 4233 }, { "epoch": 0.5938288920056101, "grad_norm": 1.7743758353365402, "learning_rate": 3.7373772912171825e-06, "loss": 0.3535, "step": 4234 }, { "epoch": 0.593969144460028, "grad_norm": 2.196454602927952, "learning_rate": 3.7351797819019788e-06, "loss": 0.3359, "step": 4235 }, { "epoch": 0.594109396914446, "grad_norm": 2.124044203382685, "learning_rate": 3.73298253357359e-06, "loss": 0.347, "step": 4236 }, { "epoch": 0.5942496493688639, "grad_norm": 2.5641156197484523, "learning_rate": 3.7307855466854053e-06, "loss": 0.3407, "step": 4237 }, { "epoch": 0.5943899018232819, "grad_norm": 2.887024983228268, "learning_rate": 3.728588821690754e-06, "loss": 0.3302, "step": 4238 }, { "epoch": 0.5945301542776998, "grad_norm": 2.1395413516889916, "learning_rate": 3.726392359042917e-06, "loss": 0.3748, "step": 4239 }, { "epoch": 0.5946704067321178, "grad_norm": 2.0811834559527767, "learning_rate": 3.7241961591951183e-06, "loss": 0.3446, "step": 4240 }, { "epoch": 0.5948106591865358, "grad_norm": 2.0085811108637004, "learning_rate": 3.722000222600528e-06, "loss": 0.3766, "step": 4241 }, { "epoch": 0.5949509116409537, "grad_norm": 2.721441230415507, "learning_rate": 3.7198045497122647e-06, "loss": 0.3372, "step": 4242 }, { "epoch": 0.5950911640953717, "grad_norm": 1.9012920127706099, "learning_rate": 3.717609140983387e-06, "loss": 0.3537, "step": 4243 }, { "epoch": 0.5952314165497896, "grad_norm": 1.9686418533897232, "learning_rate": 3.7154139968669043e-06, "loss": 0.3723, "step": 4244 }, { "epoch": 0.5953716690042076, "grad_norm": 2.1407567561456284, "learning_rate": 3.71321911781577e-06, "loss": 0.3137, "step": 4245 }, { "epoch": 0.5955119214586255, "grad_norm": 10.838285712917159, "learning_rate": 3.7110245042828786e-06, "loss": 0.3466, "step": 4246 }, { "epoch": 0.5956521739130435, "grad_norm": 2.1101207403918814, "learning_rate": 3.708830156721075e-06, "loss": 0.3352, "step": 4247 }, { "epoch": 0.5957924263674614, "grad_norm": 2.1787994654036233, "learning_rate": 3.706636075583148e-06, "loss": 0.3236, "step": 4248 }, { "epoch": 0.5959326788218794, "grad_norm": 1.9806009282352337, "learning_rate": 3.7044422613218322e-06, "loss": 0.3916, "step": 4249 }, { "epoch": 0.5960729312762973, "grad_norm": 2.203474192981284, "learning_rate": 3.7022487143898022e-06, "loss": 0.386, "step": 4250 }, { "epoch": 0.5962131837307153, "grad_norm": 2.017799092071447, "learning_rate": 3.700055435239684e-06, "loss": 0.3743, "step": 4251 }, { "epoch": 0.5963534361851333, "grad_norm": 2.0964838655207134, "learning_rate": 3.697862424324044e-06, "loss": 0.3432, "step": 4252 }, { "epoch": 0.5964936886395512, "grad_norm": 4.940246783744893, "learning_rate": 3.695669682095397e-06, "loss": 0.3148, "step": 4253 }, { "epoch": 0.5966339410939692, "grad_norm": 1.8992584086286368, "learning_rate": 3.6934772090061966e-06, "loss": 0.3891, "step": 4254 }, { "epoch": 0.5967741935483871, "grad_norm": 3.6758545433463947, "learning_rate": 3.691285005508847e-06, "loss": 0.3574, "step": 4255 }, { "epoch": 0.5969144460028051, "grad_norm": 1.8013059302706191, "learning_rate": 3.689093072055692e-06, "loss": 0.343, "step": 4256 }, { "epoch": 0.597054698457223, "grad_norm": 2.0326691585027965, "learning_rate": 3.686901409099023e-06, "loss": 0.3338, "step": 4257 }, { "epoch": 0.597194950911641, "grad_norm": 2.0916453929087773, "learning_rate": 3.6847100170910754e-06, "loss": 0.3508, "step": 4258 }, { "epoch": 0.5973352033660589, "grad_norm": 2.7867650418811993, "learning_rate": 3.682518896484026e-06, "loss": 0.3417, "step": 4259 }, { "epoch": 0.5974754558204769, "grad_norm": 1.9317979893164554, "learning_rate": 3.6803280477299975e-06, "loss": 0.2917, "step": 4260 }, { "epoch": 0.5976157082748949, "grad_norm": 1.83661720193116, "learning_rate": 3.6781374712810558e-06, "loss": 0.3435, "step": 4261 }, { "epoch": 0.5977559607293128, "grad_norm": 2.4077858915350214, "learning_rate": 3.675947167589212e-06, "loss": 0.3558, "step": 4262 }, { "epoch": 0.5978962131837308, "grad_norm": 1.9965214280474397, "learning_rate": 3.6737571371064205e-06, "loss": 0.4022, "step": 4263 }, { "epoch": 0.5980364656381487, "grad_norm": 2.1240814692144245, "learning_rate": 3.6715673802845768e-06, "loss": 0.3042, "step": 4264 }, { "epoch": 0.5981767180925667, "grad_norm": 2.241403156755719, "learning_rate": 3.6693778975755235e-06, "loss": 0.3563, "step": 4265 }, { "epoch": 0.5983169705469845, "grad_norm": 2.4444567855850954, "learning_rate": 3.667188689431046e-06, "loss": 0.379, "step": 4266 }, { "epoch": 0.5984572230014025, "grad_norm": 3.785584542655182, "learning_rate": 3.664999756302869e-06, "loss": 0.3515, "step": 4267 }, { "epoch": 0.5985974754558204, "grad_norm": 2.0121158541243847, "learning_rate": 3.662811098642665e-06, "loss": 0.362, "step": 4268 }, { "epoch": 0.5987377279102384, "grad_norm": 2.9032270234970223, "learning_rate": 3.66062271690205e-06, "loss": 0.3429, "step": 4269 }, { "epoch": 0.5988779803646563, "grad_norm": 2.3263629159062664, "learning_rate": 3.658434611532578e-06, "loss": 0.342, "step": 4270 }, { "epoch": 0.5990182328190743, "grad_norm": 2.053752730049088, "learning_rate": 3.656246782985751e-06, "loss": 0.3757, "step": 4271 }, { "epoch": 0.5991584852734922, "grad_norm": 1.9144579420513643, "learning_rate": 3.654059231713013e-06, "loss": 0.3498, "step": 4272 }, { "epoch": 0.5992987377279102, "grad_norm": 1.8557776906659542, "learning_rate": 3.651871958165748e-06, "loss": 0.3788, "step": 4273 }, { "epoch": 0.5994389901823282, "grad_norm": 2.5721592167863605, "learning_rate": 3.6496849627952875e-06, "loss": 0.3271, "step": 4274 }, { "epoch": 0.5995792426367461, "grad_norm": 2.0748444096704746, "learning_rate": 3.6474982460528998e-06, "loss": 0.3515, "step": 4275 }, { "epoch": 0.5997194950911641, "grad_norm": 2.7852095803482406, "learning_rate": 3.6453118083897988e-06, "loss": 0.3391, "step": 4276 }, { "epoch": 0.599859747545582, "grad_norm": 1.948589154469682, "learning_rate": 3.6431256502571422e-06, "loss": 0.3513, "step": 4277 }, { "epoch": 0.6, "grad_norm": 1.7279774663215346, "learning_rate": 3.640939772106029e-06, "loss": 0.3441, "step": 4278 }, { "epoch": 0.6001402524544179, "grad_norm": 2.2718346047778355, "learning_rate": 3.638754174387498e-06, "loss": 0.3577, "step": 4279 }, { "epoch": 0.6002805049088359, "grad_norm": 2.1540052366981146, "learning_rate": 3.6365688575525315e-06, "loss": 0.3495, "step": 4280 }, { "epoch": 0.6004207573632538, "grad_norm": 2.2821707761554757, "learning_rate": 3.634383822052057e-06, "loss": 0.3826, "step": 4281 }, { "epoch": 0.6005610098176718, "grad_norm": 1.888680585271086, "learning_rate": 3.6321990683369384e-06, "loss": 0.3532, "step": 4282 }, { "epoch": 0.6007012622720898, "grad_norm": 3.642471434638938, "learning_rate": 3.6300145968579876e-06, "loss": 0.3721, "step": 4283 }, { "epoch": 0.6008415147265077, "grad_norm": 1.529307320316739, "learning_rate": 3.627830408065952e-06, "loss": 0.343, "step": 4284 }, { "epoch": 0.6009817671809257, "grad_norm": 1.85106565352229, "learning_rate": 3.625646502411525e-06, "loss": 0.3351, "step": 4285 }, { "epoch": 0.6011220196353436, "grad_norm": 1.6993304336501163, "learning_rate": 3.623462880345341e-06, "loss": 0.3167, "step": 4286 }, { "epoch": 0.6012622720897616, "grad_norm": 2.039517640512858, "learning_rate": 3.6212795423179754e-06, "loss": 0.3277, "step": 4287 }, { "epoch": 0.6014025245441795, "grad_norm": 1.6008608886211184, "learning_rate": 3.6190964887799418e-06, "loss": 0.4017, "step": 4288 }, { "epoch": 0.6015427769985975, "grad_norm": 2.0974937106050717, "learning_rate": 3.6169137201817007e-06, "loss": 0.3473, "step": 4289 }, { "epoch": 0.6016830294530154, "grad_norm": 1.9228467267984521, "learning_rate": 3.614731236973651e-06, "loss": 0.363, "step": 4290 }, { "epoch": 0.6018232819074334, "grad_norm": 2.0389393736398005, "learning_rate": 3.6125490396061315e-06, "loss": 0.38, "step": 4291 }, { "epoch": 0.6019635343618513, "grad_norm": 3.0728297506853153, "learning_rate": 3.610367128529424e-06, "loss": 0.3738, "step": 4292 }, { "epoch": 0.6021037868162693, "grad_norm": 2.132214361024675, "learning_rate": 3.6081855041937507e-06, "loss": 0.3394, "step": 4293 }, { "epoch": 0.6022440392706873, "grad_norm": 1.4584039761826493, "learning_rate": 3.606004167049275e-06, "loss": 0.3151, "step": 4294 }, { "epoch": 0.6023842917251052, "grad_norm": 2.0255274050517613, "learning_rate": 3.6038231175461004e-06, "loss": 0.3578, "step": 4295 }, { "epoch": 0.6025245441795232, "grad_norm": 2.123655679369667, "learning_rate": 3.6016423561342707e-06, "loss": 0.384, "step": 4296 }, { "epoch": 0.6026647966339411, "grad_norm": 2.101620471254049, "learning_rate": 3.5994618832637706e-06, "loss": 0.3924, "step": 4297 }, { "epoch": 0.6028050490883591, "grad_norm": 1.5563425427233322, "learning_rate": 3.597281699384526e-06, "loss": 0.3246, "step": 4298 }, { "epoch": 0.602945301542777, "grad_norm": 1.8189422296092204, "learning_rate": 3.595101804946404e-06, "loss": 0.3488, "step": 4299 }, { "epoch": 0.603085553997195, "grad_norm": 2.364861179334206, "learning_rate": 3.5929222003992083e-06, "loss": 0.3487, "step": 4300 }, { "epoch": 0.603225806451613, "grad_norm": 4.028720268970686, "learning_rate": 3.5907428861926857e-06, "loss": 0.3425, "step": 4301 }, { "epoch": 0.6033660589060309, "grad_norm": 1.988516683073515, "learning_rate": 3.5885638627765228e-06, "loss": 0.3121, "step": 4302 }, { "epoch": 0.6035063113604489, "grad_norm": 3.8848709391362175, "learning_rate": 3.586385130600345e-06, "loss": 0.365, "step": 4303 }, { "epoch": 0.6036465638148668, "grad_norm": 2.1905162180230757, "learning_rate": 3.584206690113721e-06, "loss": 0.3611, "step": 4304 }, { "epoch": 0.6037868162692848, "grad_norm": 2.459869152869124, "learning_rate": 3.582028541766154e-06, "loss": 0.3443, "step": 4305 }, { "epoch": 0.6039270687237027, "grad_norm": 2.653502176206626, "learning_rate": 3.5798506860070904e-06, "loss": 0.3323, "step": 4306 }, { "epoch": 0.6040673211781206, "grad_norm": 1.744194072660565, "learning_rate": 3.5776731232859156e-06, "loss": 0.3904, "step": 4307 }, { "epoch": 0.6042075736325385, "grad_norm": 2.240101871047314, "learning_rate": 3.575495854051957e-06, "loss": 0.3482, "step": 4308 }, { "epoch": 0.6043478260869565, "grad_norm": 2.3815332199236896, "learning_rate": 3.573318878754475e-06, "loss": 0.3602, "step": 4309 }, { "epoch": 0.6044880785413744, "grad_norm": 2.3758423967655764, "learning_rate": 3.5711421978426746e-06, "loss": 0.3952, "step": 4310 }, { "epoch": 0.6046283309957924, "grad_norm": 1.8251077907553779, "learning_rate": 3.568965811765699e-06, "loss": 0.3419, "step": 4311 }, { "epoch": 0.6047685834502103, "grad_norm": 1.9703723943744282, "learning_rate": 3.5667897209726287e-06, "loss": 0.3436, "step": 4312 }, { "epoch": 0.6049088359046283, "grad_norm": 1.9864605134639635, "learning_rate": 3.564613925912488e-06, "loss": 0.3852, "step": 4313 }, { "epoch": 0.6050490883590462, "grad_norm": 2.0148545796287087, "learning_rate": 3.562438427034234e-06, "loss": 0.3714, "step": 4314 }, { "epoch": 0.6051893408134642, "grad_norm": 1.9747675408804797, "learning_rate": 3.5602632247867687e-06, "loss": 0.3359, "step": 4315 }, { "epoch": 0.6053295932678822, "grad_norm": 4.8990725359649945, "learning_rate": 3.5580883196189265e-06, "loss": 0.3364, "step": 4316 }, { "epoch": 0.6054698457223001, "grad_norm": 3.1115846689784417, "learning_rate": 3.555913711979486e-06, "loss": 0.3918, "step": 4317 }, { "epoch": 0.6056100981767181, "grad_norm": 1.9817920605144639, "learning_rate": 3.553739402317162e-06, "loss": 0.3725, "step": 4318 }, { "epoch": 0.605750350631136, "grad_norm": 2.0237731655178086, "learning_rate": 3.551565391080609e-06, "loss": 0.3597, "step": 4319 }, { "epoch": 0.605890603085554, "grad_norm": 1.9535262039102557, "learning_rate": 3.549391678718417e-06, "loss": 0.3575, "step": 4320 }, { "epoch": 0.6060308555399719, "grad_norm": 1.8527534614173429, "learning_rate": 3.5472182656791165e-06, "loss": 0.3103, "step": 4321 }, { "epoch": 0.6061711079943899, "grad_norm": 2.2799961354406966, "learning_rate": 3.545045152411178e-06, "loss": 0.3506, "step": 4322 }, { "epoch": 0.6063113604488078, "grad_norm": 1.8539091924426185, "learning_rate": 3.5428723393630067e-06, "loss": 0.3227, "step": 4323 }, { "epoch": 0.6064516129032258, "grad_norm": 1.6580855077360623, "learning_rate": 3.5406998269829485e-06, "loss": 0.3362, "step": 4324 }, { "epoch": 0.6065918653576438, "grad_norm": 1.9477421551491219, "learning_rate": 3.538527615719285e-06, "loss": 0.3435, "step": 4325 }, { "epoch": 0.6067321178120617, "grad_norm": 2.2625670744505983, "learning_rate": 3.5363557060202375e-06, "loss": 0.3489, "step": 4326 }, { "epoch": 0.6068723702664797, "grad_norm": 2.1946981527511826, "learning_rate": 3.5341840983339636e-06, "loss": 0.36, "step": 4327 }, { "epoch": 0.6070126227208976, "grad_norm": 2.067955200934907, "learning_rate": 3.532012793108561e-06, "loss": 0.3321, "step": 4328 }, { "epoch": 0.6071528751753156, "grad_norm": 1.7949605279689447, "learning_rate": 3.5298417907920633e-06, "loss": 0.324, "step": 4329 }, { "epoch": 0.6072931276297335, "grad_norm": 1.772084799965578, "learning_rate": 3.52767109183244e-06, "loss": 0.369, "step": 4330 }, { "epoch": 0.6074333800841515, "grad_norm": 2.156329830737455, "learning_rate": 3.5255006966776005e-06, "loss": 0.4056, "step": 4331 }, { "epoch": 0.6075736325385694, "grad_norm": 2.6108693851920672, "learning_rate": 3.523330605775389e-06, "loss": 0.3627, "step": 4332 }, { "epoch": 0.6077138849929874, "grad_norm": 1.8027850275166566, "learning_rate": 3.5211608195735914e-06, "loss": 0.3219, "step": 4333 }, { "epoch": 0.6078541374474054, "grad_norm": 2.4882090032560122, "learning_rate": 3.518991338519926e-06, "loss": 0.3736, "step": 4334 }, { "epoch": 0.6079943899018233, "grad_norm": 2.132108691001789, "learning_rate": 3.516822163062052e-06, "loss": 0.3451, "step": 4335 }, { "epoch": 0.6081346423562413, "grad_norm": 4.92783075299627, "learning_rate": 3.514653293647561e-06, "loss": 0.3489, "step": 4336 }, { "epoch": 0.6082748948106592, "grad_norm": 1.9123994414377583, "learning_rate": 3.5124847307239863e-06, "loss": 0.3244, "step": 4337 }, { "epoch": 0.6084151472650772, "grad_norm": 4.693474789207228, "learning_rate": 3.510316474738794e-06, "loss": 0.3431, "step": 4338 }, { "epoch": 0.6085553997194951, "grad_norm": 2.2118403340361263, "learning_rate": 3.5081485261393894e-06, "loss": 0.3909, "step": 4339 }, { "epoch": 0.6086956521739131, "grad_norm": 2.076815314363698, "learning_rate": 3.5059808853731146e-06, "loss": 0.3753, "step": 4340 }, { "epoch": 0.608835904628331, "grad_norm": 1.9349908886677722, "learning_rate": 3.5038135528872453e-06, "loss": 0.3487, "step": 4341 }, { "epoch": 0.608976157082749, "grad_norm": 1.9679945449986103, "learning_rate": 3.5016465291289957e-06, "loss": 0.3382, "step": 4342 }, { "epoch": 0.609116409537167, "grad_norm": 1.6875717006568307, "learning_rate": 3.4994798145455167e-06, "loss": 0.3484, "step": 4343 }, { "epoch": 0.6092566619915849, "grad_norm": 1.7962124473771668, "learning_rate": 3.4973134095838943e-06, "loss": 0.3442, "step": 4344 }, { "epoch": 0.6093969144460029, "grad_norm": 2.0884620733783263, "learning_rate": 3.495147314691153e-06, "loss": 0.273, "step": 4345 }, { "epoch": 0.6095371669004208, "grad_norm": 1.8186427632077298, "learning_rate": 3.4929815303142483e-06, "loss": 0.3614, "step": 4346 }, { "epoch": 0.6096774193548387, "grad_norm": 1.8281769052293504, "learning_rate": 3.490816056900076e-06, "loss": 0.3226, "step": 4347 }, { "epoch": 0.6098176718092566, "grad_norm": 2.223116701296679, "learning_rate": 3.4886508948954656e-06, "loss": 0.3886, "step": 4348 }, { "epoch": 0.6099579242636746, "grad_norm": 2.728598110778955, "learning_rate": 3.486486044747186e-06, "loss": 0.376, "step": 4349 }, { "epoch": 0.6100981767180925, "grad_norm": 1.7482228110659543, "learning_rate": 3.484321506901936e-06, "loss": 0.3132, "step": 4350 }, { "epoch": 0.6102384291725105, "grad_norm": 2.003599980021922, "learning_rate": 3.4821572818063544e-06, "loss": 0.4345, "step": 4351 }, { "epoch": 0.6103786816269284, "grad_norm": 1.7644904888758262, "learning_rate": 3.4799933699070115e-06, "loss": 0.3953, "step": 4352 }, { "epoch": 0.6105189340813464, "grad_norm": 3.39412399676018, "learning_rate": 3.477829771650417e-06, "loss": 0.367, "step": 4353 }, { "epoch": 0.6106591865357643, "grad_norm": 2.066529585793821, "learning_rate": 3.4756664874830147e-06, "loss": 0.3257, "step": 4354 }, { "epoch": 0.6107994389901823, "grad_norm": 2.2240134119960433, "learning_rate": 3.4735035178511832e-06, "loss": 0.3221, "step": 4355 }, { "epoch": 0.6109396914446003, "grad_norm": 2.079361855844014, "learning_rate": 3.471340863201237e-06, "loss": 0.3717, "step": 4356 }, { "epoch": 0.6110799438990182, "grad_norm": 2.5928811146132063, "learning_rate": 3.469178523979422e-06, "loss": 0.3454, "step": 4357 }, { "epoch": 0.6112201963534362, "grad_norm": 1.9893070115609928, "learning_rate": 3.4670165006319236e-06, "loss": 0.3507, "step": 4358 }, { "epoch": 0.6113604488078541, "grad_norm": 2.8649412914072863, "learning_rate": 3.4648547936048597e-06, "loss": 0.3581, "step": 4359 }, { "epoch": 0.6115007012622721, "grad_norm": 2.324811382231182, "learning_rate": 3.4626934033442856e-06, "loss": 0.3689, "step": 4360 }, { "epoch": 0.61164095371669, "grad_norm": 2.089206491442504, "learning_rate": 3.4605323302961857e-06, "loss": 0.3527, "step": 4361 }, { "epoch": 0.611781206171108, "grad_norm": 2.074881065361166, "learning_rate": 3.458371574906484e-06, "loss": 0.355, "step": 4362 }, { "epoch": 0.6119214586255259, "grad_norm": 3.7156631095606607, "learning_rate": 3.456211137621037e-06, "loss": 0.3419, "step": 4363 }, { "epoch": 0.6120617110799439, "grad_norm": 2.3329962645891995, "learning_rate": 3.4540510188856357e-06, "loss": 0.333, "step": 4364 }, { "epoch": 0.6122019635343618, "grad_norm": 2.4543838672560327, "learning_rate": 3.4518912191460073e-06, "loss": 0.3611, "step": 4365 }, { "epoch": 0.6123422159887798, "grad_norm": 2.2936147790284305, "learning_rate": 3.449731738847809e-06, "loss": 0.331, "step": 4366 }, { "epoch": 0.6124824684431978, "grad_norm": 1.7890824895683208, "learning_rate": 3.447572578436635e-06, "loss": 0.3713, "step": 4367 }, { "epoch": 0.6126227208976157, "grad_norm": 2.198428077921574, "learning_rate": 3.4454137383580135e-06, "loss": 0.3458, "step": 4368 }, { "epoch": 0.6127629733520337, "grad_norm": 1.7246301116215097, "learning_rate": 3.4432552190574055e-06, "loss": 0.3272, "step": 4369 }, { "epoch": 0.6129032258064516, "grad_norm": 2.1581439118008254, "learning_rate": 3.4410970209802096e-06, "loss": 0.3944, "step": 4370 }, { "epoch": 0.6130434782608696, "grad_norm": 2.3616853227688352, "learning_rate": 3.438939144571749e-06, "loss": 0.3519, "step": 4371 }, { "epoch": 0.6131837307152875, "grad_norm": 1.915360178230315, "learning_rate": 3.4367815902772917e-06, "loss": 0.3695, "step": 4372 }, { "epoch": 0.6133239831697055, "grad_norm": 1.9699757711014638, "learning_rate": 3.4346243585420297e-06, "loss": 0.3719, "step": 4373 }, { "epoch": 0.6134642356241234, "grad_norm": 1.9591654639630485, "learning_rate": 3.4324674498110956e-06, "loss": 0.3355, "step": 4374 }, { "epoch": 0.6136044880785414, "grad_norm": 1.9766826366960935, "learning_rate": 3.43031086452955e-06, "loss": 0.3756, "step": 4375 }, { "epoch": 0.6137447405329594, "grad_norm": 2.09786810062667, "learning_rate": 3.4281546031423933e-06, "loss": 0.3516, "step": 4376 }, { "epoch": 0.6138849929873773, "grad_norm": 1.7469417552442383, "learning_rate": 3.425998666094551e-06, "loss": 0.3842, "step": 4377 }, { "epoch": 0.6140252454417953, "grad_norm": 2.38398558710209, "learning_rate": 3.4238430538308876e-06, "loss": 0.3571, "step": 4378 }, { "epoch": 0.6141654978962132, "grad_norm": 2.216296599146189, "learning_rate": 3.4216877667961975e-06, "loss": 0.3246, "step": 4379 }, { "epoch": 0.6143057503506312, "grad_norm": 1.6360010289606342, "learning_rate": 3.4195328054352097e-06, "loss": 0.3561, "step": 4380 }, { "epoch": 0.6144460028050491, "grad_norm": 2.0961595022358095, "learning_rate": 3.417378170192587e-06, "loss": 0.2913, "step": 4381 }, { "epoch": 0.6145862552594671, "grad_norm": 1.8339831399964226, "learning_rate": 3.4152238615129208e-06, "loss": 0.3441, "step": 4382 }, { "epoch": 0.614726507713885, "grad_norm": 2.756747723748065, "learning_rate": 3.413069879840738e-06, "loss": 0.3386, "step": 4383 }, { "epoch": 0.614866760168303, "grad_norm": 2.056533970109622, "learning_rate": 3.4109162256204988e-06, "loss": 0.3845, "step": 4384 }, { "epoch": 0.615007012622721, "grad_norm": 1.9620190559573933, "learning_rate": 3.4087628992965937e-06, "loss": 0.3466, "step": 4385 }, { "epoch": 0.6151472650771389, "grad_norm": 2.395230314135045, "learning_rate": 3.406609901313349e-06, "loss": 0.433, "step": 4386 }, { "epoch": 0.6152875175315567, "grad_norm": 4.720998266334918, "learning_rate": 3.404457232115017e-06, "loss": 0.3398, "step": 4387 }, { "epoch": 0.6154277699859747, "grad_norm": 2.3935329727314674, "learning_rate": 3.402304892145788e-06, "loss": 0.3682, "step": 4388 }, { "epoch": 0.6155680224403927, "grad_norm": 1.919283538596872, "learning_rate": 3.4001528818497826e-06, "loss": 0.362, "step": 4389 }, { "epoch": 0.6157082748948106, "grad_norm": 2.507789302730524, "learning_rate": 3.3980012016710533e-06, "loss": 0.313, "step": 4390 }, { "epoch": 0.6158485273492286, "grad_norm": 1.7911317677420253, "learning_rate": 3.395849852053584e-06, "loss": 0.3232, "step": 4391 }, { "epoch": 0.6159887798036465, "grad_norm": 2.602774031518904, "learning_rate": 3.3936988334412895e-06, "loss": 0.3787, "step": 4392 }, { "epoch": 0.6161290322580645, "grad_norm": 2.1162838291769814, "learning_rate": 3.3915481462780174e-06, "loss": 0.3558, "step": 4393 }, { "epoch": 0.6162692847124824, "grad_norm": 2.020274396331996, "learning_rate": 3.389397791007548e-06, "loss": 0.3406, "step": 4394 }, { "epoch": 0.6164095371669004, "grad_norm": 2.5636787494670137, "learning_rate": 3.3872477680735915e-06, "loss": 0.368, "step": 4395 }, { "epoch": 0.6165497896213183, "grad_norm": 1.6640758245230183, "learning_rate": 3.385098077919791e-06, "loss": 0.3303, "step": 4396 }, { "epoch": 0.6166900420757363, "grad_norm": 2.0837048644341767, "learning_rate": 3.3829487209897207e-06, "loss": 0.3385, "step": 4397 }, { "epoch": 0.6168302945301543, "grad_norm": 2.166659452964316, "learning_rate": 3.3807996977268825e-06, "loss": 0.3254, "step": 4398 }, { "epoch": 0.6169705469845722, "grad_norm": 2.14184413080648, "learning_rate": 3.3786510085747145e-06, "loss": 0.3696, "step": 4399 }, { "epoch": 0.6171107994389902, "grad_norm": 1.788784755153217, "learning_rate": 3.3765026539765832e-06, "loss": 0.3462, "step": 4400 }, { "epoch": 0.6172510518934081, "grad_norm": 2.244079642135408, "learning_rate": 3.3743546343757872e-06, "loss": 0.3661, "step": 4401 }, { "epoch": 0.6173913043478261, "grad_norm": 2.5572643291516837, "learning_rate": 3.3722069502155543e-06, "loss": 0.3353, "step": 4402 }, { "epoch": 0.617531556802244, "grad_norm": 1.7719158753216584, "learning_rate": 3.370059601939044e-06, "loss": 0.3331, "step": 4403 }, { "epoch": 0.617671809256662, "grad_norm": 2.1734355561763974, "learning_rate": 3.3679125899893474e-06, "loss": 0.3341, "step": 4404 }, { "epoch": 0.6178120617110799, "grad_norm": 1.8119890216216927, "learning_rate": 3.3657659148094855e-06, "loss": 0.3478, "step": 4405 }, { "epoch": 0.6179523141654979, "grad_norm": 2.248818405154216, "learning_rate": 3.36361957684241e-06, "loss": 0.3737, "step": 4406 }, { "epoch": 0.6180925666199159, "grad_norm": 1.8517112833113465, "learning_rate": 3.3614735765310013e-06, "loss": 0.3739, "step": 4407 }, { "epoch": 0.6182328190743338, "grad_norm": 1.9878941367072047, "learning_rate": 3.3593279143180723e-06, "loss": 0.4018, "step": 4408 }, { "epoch": 0.6183730715287518, "grad_norm": 2.1501275091285805, "learning_rate": 3.357182590646366e-06, "loss": 0.3807, "step": 4409 }, { "epoch": 0.6185133239831697, "grad_norm": 2.0735349515825794, "learning_rate": 3.355037605958554e-06, "loss": 0.3493, "step": 4410 }, { "epoch": 0.6186535764375877, "grad_norm": 1.8811474160648354, "learning_rate": 3.3528929606972407e-06, "loss": 0.3096, "step": 4411 }, { "epoch": 0.6187938288920056, "grad_norm": 2.3151646289003147, "learning_rate": 3.3507486553049572e-06, "loss": 0.3784, "step": 4412 }, { "epoch": 0.6189340813464236, "grad_norm": 3.573511072359696, "learning_rate": 3.3486046902241663e-06, "loss": 0.3747, "step": 4413 }, { "epoch": 0.6190743338008415, "grad_norm": 2.037821190915984, "learning_rate": 3.3464610658972584e-06, "loss": 0.3391, "step": 4414 }, { "epoch": 0.6192145862552595, "grad_norm": 1.9200223351212615, "learning_rate": 3.344317782766558e-06, "loss": 0.3862, "step": 4415 }, { "epoch": 0.6193548387096774, "grad_norm": 2.054567050666888, "learning_rate": 3.342174841274315e-06, "loss": 0.3239, "step": 4416 }, { "epoch": 0.6194950911640954, "grad_norm": 3.476778790892976, "learning_rate": 3.3400322418627117e-06, "loss": 0.3166, "step": 4417 }, { "epoch": 0.6196353436185134, "grad_norm": 2.1363802960851173, "learning_rate": 3.337889984973858e-06, "loss": 0.308, "step": 4418 }, { "epoch": 0.6197755960729313, "grad_norm": 2.1849234529612915, "learning_rate": 3.3357480710497925e-06, "loss": 0.3627, "step": 4419 }, { "epoch": 0.6199158485273493, "grad_norm": 2.1862200670935885, "learning_rate": 3.3336065005324847e-06, "loss": 0.3147, "step": 4420 }, { "epoch": 0.6200561009817672, "grad_norm": 1.937681558777636, "learning_rate": 3.331465273863834e-06, "loss": 0.3547, "step": 4421 }, { "epoch": 0.6201963534361852, "grad_norm": 2.0081232327721494, "learning_rate": 3.3293243914856676e-06, "loss": 0.3449, "step": 4422 }, { "epoch": 0.6203366058906031, "grad_norm": 1.85053936338573, "learning_rate": 3.32718385383974e-06, "loss": 0.3721, "step": 4423 }, { "epoch": 0.6204768583450211, "grad_norm": 5.283646260008643, "learning_rate": 3.3250436613677366e-06, "loss": 0.3561, "step": 4424 }, { "epoch": 0.620617110799439, "grad_norm": 1.7050215634187804, "learning_rate": 3.3229038145112713e-06, "loss": 0.3319, "step": 4425 }, { "epoch": 0.620757363253857, "grad_norm": 2.1540374461313196, "learning_rate": 3.3207643137118872e-06, "loss": 0.3501, "step": 4426 }, { "epoch": 0.6208976157082748, "grad_norm": 2.086014535460115, "learning_rate": 3.318625159411056e-06, "loss": 0.3396, "step": 4427 }, { "epoch": 0.6210378681626928, "grad_norm": 1.6800643143377814, "learning_rate": 3.3164863520501744e-06, "loss": 0.3346, "step": 4428 }, { "epoch": 0.6211781206171108, "grad_norm": 2.128774544377825, "learning_rate": 3.314347892070573e-06, "loss": 0.3444, "step": 4429 }, { "epoch": 0.6213183730715287, "grad_norm": 2.2955677698127803, "learning_rate": 3.3122097799135066e-06, "loss": 0.3269, "step": 4430 }, { "epoch": 0.6214586255259467, "grad_norm": 1.8260493436290384, "learning_rate": 3.3100720160201615e-06, "loss": 0.3489, "step": 4431 }, { "epoch": 0.6215988779803646, "grad_norm": 2.193172837543114, "learning_rate": 3.307934600831648e-06, "loss": 0.3836, "step": 4432 }, { "epoch": 0.6217391304347826, "grad_norm": 2.0765223238741863, "learning_rate": 3.30579753478901e-06, "loss": 0.366, "step": 4433 }, { "epoch": 0.6218793828892005, "grad_norm": 1.8817112024655251, "learning_rate": 3.303660818333212e-06, "loss": 0.3176, "step": 4434 }, { "epoch": 0.6220196353436185, "grad_norm": 2.409517225132294, "learning_rate": 3.3015244519051525e-06, "loss": 0.4114, "step": 4435 }, { "epoch": 0.6221598877980364, "grad_norm": 1.944579143248559, "learning_rate": 3.2993884359456557e-06, "loss": 0.3663, "step": 4436 }, { "epoch": 0.6223001402524544, "grad_norm": 2.218416108273978, "learning_rate": 3.2972527708954737e-06, "loss": 0.3758, "step": 4437 }, { "epoch": 0.6224403927068723, "grad_norm": 2.124198762519748, "learning_rate": 3.295117457195288e-06, "loss": 0.3691, "step": 4438 }, { "epoch": 0.6225806451612903, "grad_norm": 1.8243388559146436, "learning_rate": 3.2929824952857014e-06, "loss": 0.386, "step": 4439 }, { "epoch": 0.6227208976157083, "grad_norm": 2.110256886686457, "learning_rate": 3.2908478856072518e-06, "loss": 0.3775, "step": 4440 }, { "epoch": 0.6228611500701262, "grad_norm": 1.850135671950415, "learning_rate": 3.2887136286003997e-06, "loss": 0.3078, "step": 4441 }, { "epoch": 0.6230014025245442, "grad_norm": 2.3813354917383327, "learning_rate": 3.2865797247055354e-06, "loss": 0.3784, "step": 4442 }, { "epoch": 0.6231416549789621, "grad_norm": 1.7585870689288905, "learning_rate": 3.2844461743629725e-06, "loss": 0.3873, "step": 4443 }, { "epoch": 0.6232819074333801, "grad_norm": 2.2404897821930203, "learning_rate": 3.282312978012956e-06, "loss": 0.3758, "step": 4444 }, { "epoch": 0.623422159887798, "grad_norm": 2.061052103160456, "learning_rate": 3.2801801360956557e-06, "loss": 0.3572, "step": 4445 }, { "epoch": 0.623562412342216, "grad_norm": 1.8882111832699573, "learning_rate": 3.2780476490511694e-06, "loss": 0.3954, "step": 4446 }, { "epoch": 0.6237026647966339, "grad_norm": 2.0889731046370543, "learning_rate": 3.27591551731952e-06, "loss": 0.3562, "step": 4447 }, { "epoch": 0.6238429172510519, "grad_norm": 2.263854920406114, "learning_rate": 3.273783741340658e-06, "loss": 0.3687, "step": 4448 }, { "epoch": 0.6239831697054699, "grad_norm": 2.3011147948687434, "learning_rate": 3.2716523215544602e-06, "loss": 0.3351, "step": 4449 }, { "epoch": 0.6241234221598878, "grad_norm": 1.9925843685002225, "learning_rate": 3.269521258400731e-06, "loss": 0.3504, "step": 4450 }, { "epoch": 0.6242636746143058, "grad_norm": 2.0951980593704027, "learning_rate": 3.2673905523192e-06, "loss": 0.3716, "step": 4451 }, { "epoch": 0.6244039270687237, "grad_norm": 3.050645422262411, "learning_rate": 3.2652602037495247e-06, "loss": 0.3699, "step": 4452 }, { "epoch": 0.6245441795231417, "grad_norm": 3.1397779850314396, "learning_rate": 3.2631302131312854e-06, "loss": 0.3832, "step": 4453 }, { "epoch": 0.6246844319775596, "grad_norm": 1.8468113070121392, "learning_rate": 3.2610005809039936e-06, "loss": 0.3768, "step": 4454 }, { "epoch": 0.6248246844319776, "grad_norm": 1.8401996891880095, "learning_rate": 3.258871307507081e-06, "loss": 0.3362, "step": 4455 }, { "epoch": 0.6249649368863955, "grad_norm": 2.162123158866688, "learning_rate": 3.256742393379909e-06, "loss": 0.3345, "step": 4456 }, { "epoch": 0.6251051893408135, "grad_norm": 1.9774089704508973, "learning_rate": 3.254613838961765e-06, "loss": 0.311, "step": 4457 }, { "epoch": 0.6252454417952314, "grad_norm": 1.6822473226869763, "learning_rate": 3.252485644691862e-06, "loss": 0.2772, "step": 4458 }, { "epoch": 0.6253856942496494, "grad_norm": 3.0043092613574953, "learning_rate": 3.2503578110093358e-06, "loss": 0.3381, "step": 4459 }, { "epoch": 0.6255259467040674, "grad_norm": 1.9601816491206723, "learning_rate": 3.248230338353252e-06, "loss": 0.3463, "step": 4460 }, { "epoch": 0.6256661991584853, "grad_norm": 1.893437272238824, "learning_rate": 3.2461032271625982e-06, "loss": 0.3776, "step": 4461 }, { "epoch": 0.6258064516129033, "grad_norm": 1.8747348597935514, "learning_rate": 3.2439764778762906e-06, "loss": 0.3542, "step": 4462 }, { "epoch": 0.6259467040673212, "grad_norm": 2.12824685620318, "learning_rate": 3.2418500909331684e-06, "loss": 0.3022, "step": 4463 }, { "epoch": 0.6260869565217392, "grad_norm": 1.7486535069291265, "learning_rate": 3.2397240667719963e-06, "loss": 0.3005, "step": 4464 }, { "epoch": 0.6262272089761571, "grad_norm": 1.9781555872476282, "learning_rate": 3.2375984058314647e-06, "loss": 0.3385, "step": 4465 }, { "epoch": 0.6263674614305751, "grad_norm": 2.805830103803709, "learning_rate": 3.235473108550189e-06, "loss": 0.3415, "step": 4466 }, { "epoch": 0.6265077138849929, "grad_norm": 2.2315230942406568, "learning_rate": 3.233348175366709e-06, "loss": 0.4034, "step": 4467 }, { "epoch": 0.6266479663394109, "grad_norm": 1.9608690470263896, "learning_rate": 3.2312236067194913e-06, "loss": 0.3344, "step": 4468 }, { "epoch": 0.6267882187938288, "grad_norm": 1.8807564426987695, "learning_rate": 3.2290994030469237e-06, "loss": 0.3529, "step": 4469 }, { "epoch": 0.6269284712482468, "grad_norm": 2.721551876399985, "learning_rate": 3.226975564787322e-06, "loss": 0.3569, "step": 4470 }, { "epoch": 0.6270687237026648, "grad_norm": 6.783111176223187, "learning_rate": 3.224852092378925e-06, "loss": 0.3592, "step": 4471 }, { "epoch": 0.6272089761570827, "grad_norm": 2.198620350600676, "learning_rate": 3.2227289862598976e-06, "loss": 0.3864, "step": 4472 }, { "epoch": 0.6273492286115007, "grad_norm": 2.0113101877674753, "learning_rate": 3.220606246868326e-06, "loss": 0.3373, "step": 4473 }, { "epoch": 0.6274894810659186, "grad_norm": 1.9854114859300918, "learning_rate": 3.2184838746422233e-06, "loss": 0.3461, "step": 4474 }, { "epoch": 0.6276297335203366, "grad_norm": 2.0358610195921947, "learning_rate": 3.2163618700195285e-06, "loss": 0.3876, "step": 4475 }, { "epoch": 0.6277699859747545, "grad_norm": 1.7221775211494363, "learning_rate": 3.2142402334380984e-06, "loss": 0.3518, "step": 4476 }, { "epoch": 0.6279102384291725, "grad_norm": 1.9751394222639023, "learning_rate": 3.21211896533572e-06, "loss": 0.37, "step": 4477 }, { "epoch": 0.6280504908835904, "grad_norm": 1.8196223837672574, "learning_rate": 3.2099980661501016e-06, "loss": 0.3333, "step": 4478 }, { "epoch": 0.6281907433380084, "grad_norm": 1.7610174504487206, "learning_rate": 3.2078775363188775e-06, "loss": 0.3595, "step": 4479 }, { "epoch": 0.6283309957924264, "grad_norm": 2.2911030394129006, "learning_rate": 3.205757376279602e-06, "loss": 0.3077, "step": 4480 }, { "epoch": 0.6284712482468443, "grad_norm": 1.5870498739961734, "learning_rate": 3.203637586469756e-06, "loss": 0.3054, "step": 4481 }, { "epoch": 0.6286115007012623, "grad_norm": 1.878691218834135, "learning_rate": 3.2015181673267435e-06, "loss": 0.3691, "step": 4482 }, { "epoch": 0.6287517531556802, "grad_norm": 1.8826255230640434, "learning_rate": 3.199399119287894e-06, "loss": 0.3468, "step": 4483 }, { "epoch": 0.6288920056100982, "grad_norm": 2.501134043021219, "learning_rate": 3.197280442790455e-06, "loss": 0.3495, "step": 4484 }, { "epoch": 0.6290322580645161, "grad_norm": 1.8727212410010685, "learning_rate": 3.1951621382716015e-06, "loss": 0.3651, "step": 4485 }, { "epoch": 0.6291725105189341, "grad_norm": 2.2809491939012343, "learning_rate": 3.1930442061684306e-06, "loss": 0.3981, "step": 4486 }, { "epoch": 0.629312762973352, "grad_norm": 2.972648550417098, "learning_rate": 3.1909266469179644e-06, "loss": 0.3392, "step": 4487 }, { "epoch": 0.62945301542777, "grad_norm": 1.940978158461384, "learning_rate": 3.1888094609571463e-06, "loss": 0.2972, "step": 4488 }, { "epoch": 0.629593267882188, "grad_norm": 1.5125886055710485, "learning_rate": 3.18669264872284e-06, "loss": 0.3233, "step": 4489 }, { "epoch": 0.6297335203366059, "grad_norm": 2.116794000635274, "learning_rate": 3.1845762106518374e-06, "loss": 0.308, "step": 4490 }, { "epoch": 0.6298737727910239, "grad_norm": 1.9100346126715593, "learning_rate": 3.1824601471808504e-06, "loss": 0.321, "step": 4491 }, { "epoch": 0.6300140252454418, "grad_norm": 1.8993772841474652, "learning_rate": 3.180344458746514e-06, "loss": 0.3533, "step": 4492 }, { "epoch": 0.6301542776998598, "grad_norm": 1.843474674680991, "learning_rate": 3.178229145785386e-06, "loss": 0.3413, "step": 4493 }, { "epoch": 0.6302945301542777, "grad_norm": 2.7419767380276525, "learning_rate": 3.1761142087339446e-06, "loss": 0.3849, "step": 4494 }, { "epoch": 0.6304347826086957, "grad_norm": 2.6959122584806106, "learning_rate": 3.1739996480285963e-06, "loss": 0.344, "step": 4495 }, { "epoch": 0.6305750350631136, "grad_norm": 1.8567263547501958, "learning_rate": 3.171885464105661e-06, "loss": 0.3539, "step": 4496 }, { "epoch": 0.6307152875175316, "grad_norm": 2.214904461386831, "learning_rate": 3.169771657401387e-06, "loss": 0.3491, "step": 4497 }, { "epoch": 0.6308555399719495, "grad_norm": 2.3015393496175847, "learning_rate": 3.1676582283519454e-06, "loss": 0.3171, "step": 4498 }, { "epoch": 0.6309957924263675, "grad_norm": 2.2839814646848975, "learning_rate": 3.165545177393427e-06, "loss": 0.3529, "step": 4499 }, { "epoch": 0.6311360448807855, "grad_norm": 2.0924276474780243, "learning_rate": 3.1634325049618443e-06, "loss": 0.3688, "step": 4500 }, { "epoch": 0.6312762973352034, "grad_norm": 1.7791027042704783, "learning_rate": 3.161320211493133e-06, "loss": 0.316, "step": 4501 }, { "epoch": 0.6314165497896214, "grad_norm": 2.182453437824107, "learning_rate": 3.15920829742315e-06, "loss": 0.3754, "step": 4502 }, { "epoch": 0.6315568022440393, "grad_norm": 2.2554687161737963, "learning_rate": 3.1570967631876733e-06, "loss": 0.4206, "step": 4503 }, { "epoch": 0.6316970546984573, "grad_norm": 2.0332195416904475, "learning_rate": 3.154985609222405e-06, "loss": 0.3274, "step": 4504 }, { "epoch": 0.6318373071528752, "grad_norm": 1.6553149192811103, "learning_rate": 3.1528748359629657e-06, "loss": 0.3651, "step": 4505 }, { "epoch": 0.6319775596072932, "grad_norm": 2.0617873376652684, "learning_rate": 3.1507644438448987e-06, "loss": 0.4181, "step": 4506 }, { "epoch": 0.632117812061711, "grad_norm": 2.6060304508084267, "learning_rate": 3.1486544333036687e-06, "loss": 0.3579, "step": 4507 }, { "epoch": 0.632258064516129, "grad_norm": 1.8247594910048301, "learning_rate": 3.1465448047746626e-06, "loss": 0.3772, "step": 4508 }, { "epoch": 0.6323983169705469, "grad_norm": 1.9582347709797019, "learning_rate": 3.1444355586931876e-06, "loss": 0.3817, "step": 4509 }, { "epoch": 0.6325385694249649, "grad_norm": 2.7868882367542005, "learning_rate": 3.1423266954944694e-06, "loss": 0.3441, "step": 4510 }, { "epoch": 0.6326788218793828, "grad_norm": 3.0028583896515637, "learning_rate": 3.1402182156136586e-06, "loss": 0.3575, "step": 4511 }, { "epoch": 0.6328190743338008, "grad_norm": 2.1012603656029767, "learning_rate": 3.1381101194858264e-06, "loss": 0.3496, "step": 4512 }, { "epoch": 0.6329593267882188, "grad_norm": 4.551077143009234, "learning_rate": 3.136002407545964e-06, "loss": 0.3565, "step": 4513 }, { "epoch": 0.6330995792426367, "grad_norm": 1.6054086789961206, "learning_rate": 3.1338950802289802e-06, "loss": 0.3479, "step": 4514 }, { "epoch": 0.6332398316970547, "grad_norm": 2.6545066947055016, "learning_rate": 3.131788137969709e-06, "loss": 0.3499, "step": 4515 }, { "epoch": 0.6333800841514726, "grad_norm": 2.0053540329617285, "learning_rate": 3.1296815812029058e-06, "loss": 0.338, "step": 4516 }, { "epoch": 0.6335203366058906, "grad_norm": 1.8970723836599421, "learning_rate": 3.1275754103632385e-06, "loss": 0.3494, "step": 4517 }, { "epoch": 0.6336605890603085, "grad_norm": 2.002465278429241, "learning_rate": 3.1254696258853034e-06, "loss": 0.2897, "step": 4518 }, { "epoch": 0.6338008415147265, "grad_norm": 1.9478185972390356, "learning_rate": 3.1233642282036147e-06, "loss": 0.362, "step": 4519 }, { "epoch": 0.6339410939691444, "grad_norm": 2.0847051173194684, "learning_rate": 3.121259217752608e-06, "loss": 0.4214, "step": 4520 }, { "epoch": 0.6340813464235624, "grad_norm": 1.709321186118227, "learning_rate": 3.119154594966634e-06, "loss": 0.3926, "step": 4521 }, { "epoch": 0.6342215988779804, "grad_norm": 3.474513034868019, "learning_rate": 3.1170503602799695e-06, "loss": 0.3445, "step": 4522 }, { "epoch": 0.6343618513323983, "grad_norm": 1.7033494573950991, "learning_rate": 3.114946514126807e-06, "loss": 0.3354, "step": 4523 }, { "epoch": 0.6345021037868163, "grad_norm": 1.723414939532117, "learning_rate": 3.112843056941263e-06, "loss": 0.355, "step": 4524 }, { "epoch": 0.6346423562412342, "grad_norm": 2.4963643340783115, "learning_rate": 3.1107399891573675e-06, "loss": 0.3273, "step": 4525 }, { "epoch": 0.6347826086956522, "grad_norm": 1.7502858745863457, "learning_rate": 3.1086373112090762e-06, "loss": 0.3301, "step": 4526 }, { "epoch": 0.6349228611500701, "grad_norm": 2.12727357746543, "learning_rate": 3.106535023530262e-06, "loss": 0.3441, "step": 4527 }, { "epoch": 0.6350631136044881, "grad_norm": 1.96508957403895, "learning_rate": 3.1044331265547168e-06, "loss": 0.3411, "step": 4528 }, { "epoch": 0.635203366058906, "grad_norm": 1.8779178168704709, "learning_rate": 3.1023316207161535e-06, "loss": 0.3713, "step": 4529 }, { "epoch": 0.635343618513324, "grad_norm": 2.896004449925244, "learning_rate": 3.1002305064482006e-06, "loss": 0.3896, "step": 4530 }, { "epoch": 0.635483870967742, "grad_norm": 1.8324450450530976, "learning_rate": 3.0981297841844106e-06, "loss": 0.3034, "step": 4531 }, { "epoch": 0.6356241234221599, "grad_norm": 1.9464898893464622, "learning_rate": 3.0960294543582513e-06, "loss": 0.32, "step": 4532 }, { "epoch": 0.6357643758765779, "grad_norm": 2.455072686698432, "learning_rate": 3.0939295174031127e-06, "loss": 0.3627, "step": 4533 }, { "epoch": 0.6359046283309958, "grad_norm": 2.055774473727265, "learning_rate": 3.0918299737523016e-06, "loss": 0.369, "step": 4534 }, { "epoch": 0.6360448807854138, "grad_norm": 3.01562088882087, "learning_rate": 3.0897308238390432e-06, "loss": 0.3974, "step": 4535 }, { "epoch": 0.6361851332398317, "grad_norm": 2.2624573761523665, "learning_rate": 3.087632068096483e-06, "loss": 0.3867, "step": 4536 }, { "epoch": 0.6363253856942497, "grad_norm": 2.323906782995709, "learning_rate": 3.0855337069576872e-06, "loss": 0.3613, "step": 4537 }, { "epoch": 0.6364656381486676, "grad_norm": 2.8973748657484584, "learning_rate": 3.0834357408556333e-06, "loss": 0.3726, "step": 4538 }, { "epoch": 0.6366058906030856, "grad_norm": 2.0203821392994072, "learning_rate": 3.0813381702232235e-06, "loss": 0.3743, "step": 4539 }, { "epoch": 0.6367461430575035, "grad_norm": 1.671030062781153, "learning_rate": 3.079240995493279e-06, "loss": 0.347, "step": 4540 }, { "epoch": 0.6368863955119215, "grad_norm": 2.2229207556942265, "learning_rate": 3.0771442170985344e-06, "loss": 0.4049, "step": 4541 }, { "epoch": 0.6370266479663395, "grad_norm": 2.2035882341080124, "learning_rate": 3.0750478354716463e-06, "loss": 0.3522, "step": 4542 }, { "epoch": 0.6371669004207574, "grad_norm": 1.7015928323527512, "learning_rate": 3.0729518510451888e-06, "loss": 0.2752, "step": 4543 }, { "epoch": 0.6373071528751754, "grad_norm": 2.2296046839774006, "learning_rate": 3.0708562642516538e-06, "loss": 0.3118, "step": 4544 }, { "epoch": 0.6374474053295933, "grad_norm": 1.9898353004351053, "learning_rate": 3.068761075523451e-06, "loss": 0.41, "step": 4545 }, { "epoch": 0.6375876577840113, "grad_norm": 1.9692703649289445, "learning_rate": 3.0666662852929063e-06, "loss": 0.3181, "step": 4546 }, { "epoch": 0.6377279102384291, "grad_norm": 1.9840463080731596, "learning_rate": 3.0645718939922668e-06, "loss": 0.3099, "step": 4547 }, { "epoch": 0.6378681626928471, "grad_norm": 2.123101901382688, "learning_rate": 3.0624779020536955e-06, "loss": 0.3696, "step": 4548 }, { "epoch": 0.638008415147265, "grad_norm": 2.1057792250647633, "learning_rate": 3.0603843099092713e-06, "loss": 0.3504, "step": 4549 }, { "epoch": 0.638148667601683, "grad_norm": 2.5099752731929046, "learning_rate": 3.058291117990996e-06, "loss": 0.3683, "step": 4550 }, { "epoch": 0.6382889200561009, "grad_norm": 2.002760412687919, "learning_rate": 3.0561983267307803e-06, "loss": 0.3304, "step": 4551 }, { "epoch": 0.6384291725105189, "grad_norm": 1.9658643980052655, "learning_rate": 3.0541059365604597e-06, "loss": 0.3564, "step": 4552 }, { "epoch": 0.6385694249649368, "grad_norm": 2.0715207586774484, "learning_rate": 3.0520139479117844e-06, "loss": 0.3732, "step": 4553 }, { "epoch": 0.6387096774193548, "grad_norm": 2.3694462339278464, "learning_rate": 3.049922361216422e-06, "loss": 0.3856, "step": 4554 }, { "epoch": 0.6388499298737728, "grad_norm": 4.295252712218037, "learning_rate": 3.0478311769059554e-06, "loss": 0.351, "step": 4555 }, { "epoch": 0.6389901823281907, "grad_norm": 2.049159747723904, "learning_rate": 3.045740395411886e-06, "loss": 0.3388, "step": 4556 }, { "epoch": 0.6391304347826087, "grad_norm": 2.5809980625114277, "learning_rate": 3.0436500171656327e-06, "loss": 0.3509, "step": 4557 }, { "epoch": 0.6392706872370266, "grad_norm": 2.0725467933369535, "learning_rate": 3.041560042598532e-06, "loss": 0.3394, "step": 4558 }, { "epoch": 0.6394109396914446, "grad_norm": 2.1502446013304013, "learning_rate": 3.039470472141832e-06, "loss": 0.3715, "step": 4559 }, { "epoch": 0.6395511921458625, "grad_norm": 1.933816792888678, "learning_rate": 3.0373813062267025e-06, "loss": 0.3517, "step": 4560 }, { "epoch": 0.6396914446002805, "grad_norm": 2.4159210852688426, "learning_rate": 3.03529254528423e-06, "loss": 0.3431, "step": 4561 }, { "epoch": 0.6398316970546984, "grad_norm": 1.8660322629790949, "learning_rate": 3.033204189745413e-06, "loss": 0.3974, "step": 4562 }, { "epoch": 0.6399719495091164, "grad_norm": 1.8247432893984266, "learning_rate": 3.0311162400411697e-06, "loss": 0.3155, "step": 4563 }, { "epoch": 0.6401122019635344, "grad_norm": 1.883526298742388, "learning_rate": 3.0290286966023353e-06, "loss": 0.3252, "step": 4564 }, { "epoch": 0.6402524544179523, "grad_norm": 2.004238168097906, "learning_rate": 3.0269415598596604e-06, "loss": 0.3367, "step": 4565 }, { "epoch": 0.6403927068723703, "grad_norm": 1.7507196036792376, "learning_rate": 3.024854830243808e-06, "loss": 0.3485, "step": 4566 }, { "epoch": 0.6405329593267882, "grad_norm": 1.8850351723948737, "learning_rate": 3.022768508185362e-06, "loss": 0.3939, "step": 4567 }, { "epoch": 0.6406732117812062, "grad_norm": 2.5476508333814794, "learning_rate": 3.0206825941148203e-06, "loss": 0.3769, "step": 4568 }, { "epoch": 0.6408134642356241, "grad_norm": 1.986979899941672, "learning_rate": 3.018597088462597e-06, "loss": 0.3244, "step": 4569 }, { "epoch": 0.6409537166900421, "grad_norm": 3.0521013869097735, "learning_rate": 3.0165119916590224e-06, "loss": 0.3706, "step": 4570 }, { "epoch": 0.64109396914446, "grad_norm": 1.7491560672162012, "learning_rate": 3.0144273041343393e-06, "loss": 0.3718, "step": 4571 }, { "epoch": 0.641234221598878, "grad_norm": 2.184723536200053, "learning_rate": 3.0123430263187092e-06, "loss": 0.2994, "step": 4572 }, { "epoch": 0.641374474053296, "grad_norm": 1.7168160395154741, "learning_rate": 3.01025915864221e-06, "loss": 0.3525, "step": 4573 }, { "epoch": 0.6415147265077139, "grad_norm": 1.5907500961665788, "learning_rate": 3.008175701534831e-06, "loss": 0.3468, "step": 4574 }, { "epoch": 0.6416549789621319, "grad_norm": 2.0082021847188276, "learning_rate": 3.006092655426481e-06, "loss": 0.3338, "step": 4575 }, { "epoch": 0.6417952314165498, "grad_norm": 4.189800256961816, "learning_rate": 3.00401002074698e-06, "loss": 0.3742, "step": 4576 }, { "epoch": 0.6419354838709678, "grad_norm": 2.153385194246446, "learning_rate": 3.001927797926067e-06, "loss": 0.3752, "step": 4577 }, { "epoch": 0.6420757363253857, "grad_norm": 2.1230894363474717, "learning_rate": 2.9998459873933927e-06, "loss": 0.3738, "step": 4578 }, { "epoch": 0.6422159887798037, "grad_norm": 1.9055915070390135, "learning_rate": 2.997764589578527e-06, "loss": 0.3081, "step": 4579 }, { "epoch": 0.6423562412342216, "grad_norm": 1.891160305944515, "learning_rate": 2.995683604910947e-06, "loss": 0.36, "step": 4580 }, { "epoch": 0.6424964936886396, "grad_norm": 2.1911503443495084, "learning_rate": 2.9936030338200527e-06, "loss": 0.3469, "step": 4581 }, { "epoch": 0.6426367461430575, "grad_norm": 1.6665888847638703, "learning_rate": 2.991522876735154e-06, "loss": 0.3441, "step": 4582 }, { "epoch": 0.6427769985974755, "grad_norm": 2.2880495714072806, "learning_rate": 2.989443134085477e-06, "loss": 0.3461, "step": 4583 }, { "epoch": 0.6429172510518935, "grad_norm": 1.7869778944333592, "learning_rate": 2.9873638063001633e-06, "loss": 0.3367, "step": 4584 }, { "epoch": 0.6430575035063114, "grad_norm": 2.4827918040407484, "learning_rate": 2.9852848938082657e-06, "loss": 0.3684, "step": 4585 }, { "epoch": 0.6431977559607294, "grad_norm": 1.9377946321979833, "learning_rate": 2.983206397038756e-06, "loss": 0.3512, "step": 4586 }, { "epoch": 0.6433380084151472, "grad_norm": 2.1198192722731886, "learning_rate": 2.981128316420515e-06, "loss": 0.3193, "step": 4587 }, { "epoch": 0.6434782608695652, "grad_norm": 1.8526557881567933, "learning_rate": 2.97905065238234e-06, "loss": 0.328, "step": 4588 }, { "epoch": 0.6436185133239831, "grad_norm": 2.0941407707284037, "learning_rate": 2.9769734053529443e-06, "loss": 0.3047, "step": 4589 }, { "epoch": 0.6437587657784011, "grad_norm": 1.9440640652700383, "learning_rate": 2.974896575760952e-06, "loss": 0.3708, "step": 4590 }, { "epoch": 0.643899018232819, "grad_norm": 2.494536820192507, "learning_rate": 2.972820164034904e-06, "loss": 0.3809, "step": 4591 }, { "epoch": 0.644039270687237, "grad_norm": 1.8840684935638463, "learning_rate": 2.9707441706032515e-06, "loss": 0.3424, "step": 4592 }, { "epoch": 0.6441795231416549, "grad_norm": 1.9816701281137397, "learning_rate": 2.968668595894361e-06, "loss": 0.329, "step": 4593 }, { "epoch": 0.6443197755960729, "grad_norm": 1.8141631684147626, "learning_rate": 2.9665934403365148e-06, "loss": 0.3041, "step": 4594 }, { "epoch": 0.6444600280504909, "grad_norm": 1.7478385247036152, "learning_rate": 2.964518704357906e-06, "loss": 0.3424, "step": 4595 }, { "epoch": 0.6446002805049088, "grad_norm": 2.006157337556306, "learning_rate": 2.9624443883866403e-06, "loss": 0.3469, "step": 4596 }, { "epoch": 0.6447405329593268, "grad_norm": 2.3234138697927795, "learning_rate": 2.9603704928507406e-06, "loss": 0.3881, "step": 4597 }, { "epoch": 0.6448807854137447, "grad_norm": 2.5939210582535885, "learning_rate": 2.958297018178139e-06, "loss": 0.3583, "step": 4598 }, { "epoch": 0.6450210378681627, "grad_norm": 1.9126333078037303, "learning_rate": 2.956223964796685e-06, "loss": 0.3154, "step": 4599 }, { "epoch": 0.6451612903225806, "grad_norm": 2.36846486491677, "learning_rate": 2.9541513331341353e-06, "loss": 0.3476, "step": 4600 }, { "epoch": 0.6453015427769986, "grad_norm": 1.992982174818043, "learning_rate": 2.9520791236181645e-06, "loss": 0.3524, "step": 4601 }, { "epoch": 0.6454417952314165, "grad_norm": 1.7403118488778395, "learning_rate": 2.9500073366763593e-06, "loss": 0.3335, "step": 4602 }, { "epoch": 0.6455820476858345, "grad_norm": 2.1247540499900732, "learning_rate": 2.947935972736217e-06, "loss": 0.3488, "step": 4603 }, { "epoch": 0.6457223001402524, "grad_norm": 2.2549819891436615, "learning_rate": 2.9458650322251505e-06, "loss": 0.371, "step": 4604 }, { "epoch": 0.6458625525946704, "grad_norm": 2.4169141598831962, "learning_rate": 2.943794515570483e-06, "loss": 0.3189, "step": 4605 }, { "epoch": 0.6460028050490884, "grad_norm": 2.3737820965935676, "learning_rate": 2.941724423199451e-06, "loss": 0.3507, "step": 4606 }, { "epoch": 0.6461430575035063, "grad_norm": 2.5483753489125798, "learning_rate": 2.9396547555392054e-06, "loss": 0.3329, "step": 4607 }, { "epoch": 0.6462833099579243, "grad_norm": 2.061467345688524, "learning_rate": 2.9375855130168046e-06, "loss": 0.321, "step": 4608 }, { "epoch": 0.6464235624123422, "grad_norm": 1.9846043491739127, "learning_rate": 2.9355166960592242e-06, "loss": 0.3931, "step": 4609 }, { "epoch": 0.6465638148667602, "grad_norm": 1.7651755512023444, "learning_rate": 2.9334483050933506e-06, "loss": 0.3799, "step": 4610 }, { "epoch": 0.6467040673211781, "grad_norm": 2.0617031523247142, "learning_rate": 2.9313803405459816e-06, "loss": 0.3735, "step": 4611 }, { "epoch": 0.6468443197755961, "grad_norm": 2.5427692294040187, "learning_rate": 2.929312802843826e-06, "loss": 0.3287, "step": 4612 }, { "epoch": 0.646984572230014, "grad_norm": 1.8859294243286475, "learning_rate": 2.927245692413507e-06, "loss": 0.3637, "step": 4613 }, { "epoch": 0.647124824684432, "grad_norm": 2.110236437056754, "learning_rate": 2.925179009681557e-06, "loss": 0.3319, "step": 4614 }, { "epoch": 0.64726507713885, "grad_norm": 2.1491777843923896, "learning_rate": 2.923112755074423e-06, "loss": 0.3268, "step": 4615 }, { "epoch": 0.6474053295932679, "grad_norm": 2.699141549581143, "learning_rate": 2.9210469290184627e-06, "loss": 0.3449, "step": 4616 }, { "epoch": 0.6475455820476859, "grad_norm": 2.2763101054283967, "learning_rate": 2.9189815319399422e-06, "loss": 0.3362, "step": 4617 }, { "epoch": 0.6476858345021038, "grad_norm": 3.4246626694784466, "learning_rate": 2.9169165642650467e-06, "loss": 0.3431, "step": 4618 }, { "epoch": 0.6478260869565218, "grad_norm": 2.0543354153395357, "learning_rate": 2.914852026419862e-06, "loss": 0.3756, "step": 4619 }, { "epoch": 0.6479663394109397, "grad_norm": 1.6626460534668486, "learning_rate": 2.9127879188303954e-06, "loss": 0.3101, "step": 4620 }, { "epoch": 0.6481065918653577, "grad_norm": 3.074064103033329, "learning_rate": 2.910724241922558e-06, "loss": 0.3574, "step": 4621 }, { "epoch": 0.6482468443197756, "grad_norm": 1.8684943186913718, "learning_rate": 2.9086609961221758e-06, "loss": 0.3604, "step": 4622 }, { "epoch": 0.6483870967741936, "grad_norm": 2.2664296792997876, "learning_rate": 2.906598181854986e-06, "loss": 0.349, "step": 4623 }, { "epoch": 0.6485273492286115, "grad_norm": 2.326094490769146, "learning_rate": 2.904535799546636e-06, "loss": 0.3347, "step": 4624 }, { "epoch": 0.6486676016830295, "grad_norm": 1.6826762024664168, "learning_rate": 2.902473849622683e-06, "loss": 0.3217, "step": 4625 }, { "epoch": 0.6488078541374475, "grad_norm": 7.8489513401404505, "learning_rate": 2.9004123325085976e-06, "loss": 0.3536, "step": 4626 }, { "epoch": 0.6489481065918653, "grad_norm": 1.899992380628868, "learning_rate": 2.8983512486297582e-06, "loss": 0.3405, "step": 4627 }, { "epoch": 0.6490883590462833, "grad_norm": 1.7764575132082756, "learning_rate": 2.8962905984114553e-06, "loss": 0.3673, "step": 4628 }, { "epoch": 0.6492286115007012, "grad_norm": 3.2582564017213116, "learning_rate": 2.8942303822788916e-06, "loss": 0.3746, "step": 4629 }, { "epoch": 0.6493688639551192, "grad_norm": 1.8176649101193338, "learning_rate": 2.8921706006571744e-06, "loss": 0.3473, "step": 4630 }, { "epoch": 0.6495091164095371, "grad_norm": 2.4060758057002025, "learning_rate": 2.890111253971327e-06, "loss": 0.3359, "step": 4631 }, { "epoch": 0.6496493688639551, "grad_norm": 3.275301090988934, "learning_rate": 2.8880523426462824e-06, "loss": 0.359, "step": 4632 }, { "epoch": 0.649789621318373, "grad_norm": 2.3130684231319947, "learning_rate": 2.885993867106881e-06, "loss": 0.3601, "step": 4633 }, { "epoch": 0.649929873772791, "grad_norm": 2.5167180879236417, "learning_rate": 2.8839358277778758e-06, "loss": 0.3677, "step": 4634 }, { "epoch": 0.6500701262272089, "grad_norm": 3.0954326330959225, "learning_rate": 2.8818782250839282e-06, "loss": 0.3736, "step": 4635 }, { "epoch": 0.6502103786816269, "grad_norm": 2.5139943399904277, "learning_rate": 2.879821059449611e-06, "loss": 0.3666, "step": 4636 }, { "epoch": 0.6503506311360449, "grad_norm": 1.7542489785328284, "learning_rate": 2.8777643312994046e-06, "loss": 0.311, "step": 4637 }, { "epoch": 0.6504908835904628, "grad_norm": 2.3791102931018484, "learning_rate": 2.8757080410577042e-06, "loss": 0.3578, "step": 4638 }, { "epoch": 0.6506311360448808, "grad_norm": 2.3984617548905907, "learning_rate": 2.8736521891488057e-06, "loss": 0.3409, "step": 4639 }, { "epoch": 0.6507713884992987, "grad_norm": 3.3016425639824027, "learning_rate": 2.8715967759969222e-06, "loss": 0.3811, "step": 4640 }, { "epoch": 0.6509116409537167, "grad_norm": 1.7915204084566343, "learning_rate": 2.8695418020261755e-06, "loss": 0.3916, "step": 4641 }, { "epoch": 0.6510518934081346, "grad_norm": 2.8164749031323475, "learning_rate": 2.8674872676605914e-06, "loss": 0.3928, "step": 4642 }, { "epoch": 0.6511921458625526, "grad_norm": 1.8285579331785564, "learning_rate": 2.8654331733241113e-06, "loss": 0.315, "step": 4643 }, { "epoch": 0.6513323983169705, "grad_norm": 3.862429626218797, "learning_rate": 2.8633795194405824e-06, "loss": 0.3694, "step": 4644 }, { "epoch": 0.6514726507713885, "grad_norm": 1.9473305770770066, "learning_rate": 2.8613263064337617e-06, "loss": 0.3355, "step": 4645 }, { "epoch": 0.6516129032258065, "grad_norm": 1.9278602133881784, "learning_rate": 2.859273534727316e-06, "loss": 0.3497, "step": 4646 }, { "epoch": 0.6517531556802244, "grad_norm": 2.398239475487615, "learning_rate": 2.8572212047448196e-06, "loss": 0.3661, "step": 4647 }, { "epoch": 0.6518934081346424, "grad_norm": 2.635364868700427, "learning_rate": 2.8551693169097573e-06, "loss": 0.291, "step": 4648 }, { "epoch": 0.6520336605890603, "grad_norm": 2.59978582527105, "learning_rate": 2.8531178716455217e-06, "loss": 0.3385, "step": 4649 }, { "epoch": 0.6521739130434783, "grad_norm": 1.8331195388749015, "learning_rate": 2.8510668693754157e-06, "loss": 0.3514, "step": 4650 }, { "epoch": 0.6523141654978962, "grad_norm": 1.8445192445706031, "learning_rate": 2.8490163105226454e-06, "loss": 0.3045, "step": 4651 }, { "epoch": 0.6524544179523142, "grad_norm": 1.6860395398871482, "learning_rate": 2.846966195510332e-06, "loss": 0.3466, "step": 4652 }, { "epoch": 0.6525946704067321, "grad_norm": 1.9178165289923703, "learning_rate": 2.844916524761502e-06, "loss": 0.3136, "step": 4653 }, { "epoch": 0.6527349228611501, "grad_norm": 2.1315857429842233, "learning_rate": 2.8428672986990894e-06, "loss": 0.3923, "step": 4654 }, { "epoch": 0.652875175315568, "grad_norm": 2.0212318446960578, "learning_rate": 2.84081851774594e-06, "loss": 0.3653, "step": 4655 }, { "epoch": 0.653015427769986, "grad_norm": 3.0373076640957057, "learning_rate": 2.8387701823248035e-06, "loss": 0.3499, "step": 4656 }, { "epoch": 0.653155680224404, "grad_norm": 2.061589126588437, "learning_rate": 2.8367222928583403e-06, "loss": 0.3396, "step": 4657 }, { "epoch": 0.6532959326788219, "grad_norm": 2.2320376889112916, "learning_rate": 2.834674849769119e-06, "loss": 0.353, "step": 4658 }, { "epoch": 0.6534361851332399, "grad_norm": 2.42074292704061, "learning_rate": 2.8326278534796154e-06, "loss": 0.2906, "step": 4659 }, { "epoch": 0.6535764375876578, "grad_norm": 2.0843648518511158, "learning_rate": 2.83058130441221e-06, "loss": 0.3169, "step": 4660 }, { "epoch": 0.6537166900420758, "grad_norm": 1.9394999610800565, "learning_rate": 2.8285352029891957e-06, "loss": 0.3225, "step": 4661 }, { "epoch": 0.6538569424964937, "grad_norm": 2.0162747540878865, "learning_rate": 2.826489549632773e-06, "loss": 0.3499, "step": 4662 }, { "epoch": 0.6539971949509117, "grad_norm": 2.064376901827244, "learning_rate": 2.8244443447650448e-06, "loss": 0.3106, "step": 4663 }, { "epoch": 0.6541374474053296, "grad_norm": 2.305044269426271, "learning_rate": 2.8223995888080263e-06, "loss": 0.3198, "step": 4664 }, { "epoch": 0.6542776998597476, "grad_norm": 1.5590281251691258, "learning_rate": 2.8203552821836388e-06, "loss": 0.297, "step": 4665 }, { "epoch": 0.6544179523141656, "grad_norm": 1.9711392754594148, "learning_rate": 2.81831142531371e-06, "loss": 0.3417, "step": 4666 }, { "epoch": 0.6545582047685834, "grad_norm": 2.334647852872854, "learning_rate": 2.816268018619977e-06, "loss": 0.3249, "step": 4667 }, { "epoch": 0.6546984572230014, "grad_norm": 2.3746166923800525, "learning_rate": 2.8142250625240806e-06, "loss": 0.3302, "step": 4668 }, { "epoch": 0.6548387096774193, "grad_norm": 2.3418481867293406, "learning_rate": 2.8121825574475727e-06, "loss": 0.3391, "step": 4669 }, { "epoch": 0.6549789621318373, "grad_norm": 2.4825000525529286, "learning_rate": 2.81014050381191e-06, "loss": 0.3246, "step": 4670 }, { "epoch": 0.6551192145862552, "grad_norm": 1.8045615280244405, "learning_rate": 2.808098902038453e-06, "loss": 0.302, "step": 4671 }, { "epoch": 0.6552594670406732, "grad_norm": 2.0861615121404977, "learning_rate": 2.8060577525484735e-06, "loss": 0.3293, "step": 4672 }, { "epoch": 0.6553997194950911, "grad_norm": 2.5970633611753158, "learning_rate": 2.804017055763149e-06, "loss": 0.3702, "step": 4673 }, { "epoch": 0.6555399719495091, "grad_norm": 2.614142625199795, "learning_rate": 2.8019768121035627e-06, "loss": 0.4004, "step": 4674 }, { "epoch": 0.655680224403927, "grad_norm": 2.2539857952817783, "learning_rate": 2.799937021990704e-06, "loss": 0.3261, "step": 4675 }, { "epoch": 0.655820476858345, "grad_norm": 2.5801004875190037, "learning_rate": 2.797897685845471e-06, "loss": 0.3352, "step": 4676 }, { "epoch": 0.655960729312763, "grad_norm": 1.7454343229038092, "learning_rate": 2.7958588040886647e-06, "loss": 0.3847, "step": 4677 }, { "epoch": 0.6561009817671809, "grad_norm": 2.012056962998581, "learning_rate": 2.7938203771409945e-06, "loss": 0.3486, "step": 4678 }, { "epoch": 0.6562412342215989, "grad_norm": 1.981845715612273, "learning_rate": 2.7917824054230787e-06, "loss": 0.3521, "step": 4679 }, { "epoch": 0.6563814866760168, "grad_norm": 1.912802830574912, "learning_rate": 2.7897448893554335e-06, "loss": 0.3773, "step": 4680 }, { "epoch": 0.6565217391304348, "grad_norm": 1.8860624763399523, "learning_rate": 2.787707829358488e-06, "loss": 0.3298, "step": 4681 }, { "epoch": 0.6566619915848527, "grad_norm": 2.11202361925963, "learning_rate": 2.7856712258525755e-06, "loss": 0.3467, "step": 4682 }, { "epoch": 0.6568022440392707, "grad_norm": 2.4087430251316984, "learning_rate": 2.783635079257937e-06, "loss": 0.3528, "step": 4683 }, { "epoch": 0.6569424964936886, "grad_norm": 1.9470473064755167, "learning_rate": 2.7815993899947135e-06, "loss": 0.3713, "step": 4684 }, { "epoch": 0.6570827489481066, "grad_norm": 2.473218905054572, "learning_rate": 2.779564158482957e-06, "loss": 0.3633, "step": 4685 }, { "epoch": 0.6572230014025245, "grad_norm": 2.5197186067693758, "learning_rate": 2.7775293851426233e-06, "loss": 0.3765, "step": 4686 }, { "epoch": 0.6573632538569425, "grad_norm": 2.4310388272321974, "learning_rate": 2.7754950703935735e-06, "loss": 0.324, "step": 4687 }, { "epoch": 0.6575035063113605, "grad_norm": 1.8321771949922003, "learning_rate": 2.7734612146555738e-06, "loss": 0.3663, "step": 4688 }, { "epoch": 0.6576437587657784, "grad_norm": 2.2703847412881637, "learning_rate": 2.7714278183482967e-06, "loss": 0.3459, "step": 4689 }, { "epoch": 0.6577840112201964, "grad_norm": 1.8240195580153358, "learning_rate": 2.7693948818913197e-06, "loss": 0.3, "step": 4690 }, { "epoch": 0.6579242636746143, "grad_norm": 2.76302400685283, "learning_rate": 2.767362405704126e-06, "loss": 0.3025, "step": 4691 }, { "epoch": 0.6580645161290323, "grad_norm": 1.637770212606875, "learning_rate": 2.7653303902061e-06, "loss": 0.3043, "step": 4692 }, { "epoch": 0.6582047685834502, "grad_norm": 1.752883034938037, "learning_rate": 2.763298835816535e-06, "loss": 0.3988, "step": 4693 }, { "epoch": 0.6583450210378682, "grad_norm": 2.000782932577204, "learning_rate": 2.761267742954629e-06, "loss": 0.3642, "step": 4694 }, { "epoch": 0.6584852734922861, "grad_norm": 2.5979672812206878, "learning_rate": 2.7592371120394825e-06, "loss": 0.3408, "step": 4695 }, { "epoch": 0.6586255259467041, "grad_norm": 2.2892040430254896, "learning_rate": 2.757206943490103e-06, "loss": 0.3333, "step": 4696 }, { "epoch": 0.658765778401122, "grad_norm": 1.7572594344780899, "learning_rate": 2.7551772377254018e-06, "loss": 0.3216, "step": 4697 }, { "epoch": 0.65890603085554, "grad_norm": 2.191060030373805, "learning_rate": 2.7531479951641928e-06, "loss": 0.3497, "step": 4698 }, { "epoch": 0.659046283309958, "grad_norm": 1.9372309221033999, "learning_rate": 2.751119216225198e-06, "loss": 0.3376, "step": 4699 }, { "epoch": 0.6591865357643759, "grad_norm": 2.025164953036077, "learning_rate": 2.749090901327043e-06, "loss": 0.3649, "step": 4700 }, { "epoch": 0.6593267882187939, "grad_norm": 2.416887525353876, "learning_rate": 2.7470630508882525e-06, "loss": 0.3229, "step": 4701 }, { "epoch": 0.6594670406732118, "grad_norm": 2.5935240497757364, "learning_rate": 2.7450356653272614e-06, "loss": 0.3489, "step": 4702 }, { "epoch": 0.6596072931276298, "grad_norm": 1.8588028997347341, "learning_rate": 2.7430087450624053e-06, "loss": 0.344, "step": 4703 }, { "epoch": 0.6597475455820477, "grad_norm": 2.1908006018158073, "learning_rate": 2.740982290511929e-06, "loss": 0.3292, "step": 4704 }, { "epoch": 0.6598877980364657, "grad_norm": 2.266288502319294, "learning_rate": 2.7389563020939724e-06, "loss": 0.3525, "step": 4705 }, { "epoch": 0.6600280504908836, "grad_norm": 2.1414814370450985, "learning_rate": 2.7369307802265854e-06, "loss": 0.3371, "step": 4706 }, { "epoch": 0.6601683029453015, "grad_norm": 1.9773840120542923, "learning_rate": 2.734905725327721e-06, "loss": 0.3522, "step": 4707 }, { "epoch": 0.6603085553997194, "grad_norm": 2.7644149805671248, "learning_rate": 2.7328811378152355e-06, "loss": 0.3436, "step": 4708 }, { "epoch": 0.6604488078541374, "grad_norm": 1.6660650677516469, "learning_rate": 2.7308570181068872e-06, "loss": 0.3189, "step": 4709 }, { "epoch": 0.6605890603085554, "grad_norm": 2.2498889471834387, "learning_rate": 2.72883336662034e-06, "loss": 0.338, "step": 4710 }, { "epoch": 0.6607293127629733, "grad_norm": 2.072071983213755, "learning_rate": 2.726810183773162e-06, "loss": 0.315, "step": 4711 }, { "epoch": 0.6608695652173913, "grad_norm": 1.8845091333934638, "learning_rate": 2.7247874699828186e-06, "loss": 0.3219, "step": 4712 }, { "epoch": 0.6610098176718092, "grad_norm": 5.763176788504465, "learning_rate": 2.7227652256666848e-06, "loss": 0.3359, "step": 4713 }, { "epoch": 0.6611500701262272, "grad_norm": 1.8985609477529983, "learning_rate": 2.7207434512420374e-06, "loss": 0.3844, "step": 4714 }, { "epoch": 0.6612903225806451, "grad_norm": 2.5145070689935056, "learning_rate": 2.718722147126054e-06, "loss": 0.3305, "step": 4715 }, { "epoch": 0.6614305750350631, "grad_norm": 1.9832996849069426, "learning_rate": 2.7167013137358173e-06, "loss": 0.325, "step": 4716 }, { "epoch": 0.661570827489481, "grad_norm": 2.008781941224616, "learning_rate": 2.714680951488312e-06, "loss": 0.3624, "step": 4717 }, { "epoch": 0.661711079943899, "grad_norm": 1.9941882091378325, "learning_rate": 2.7126610608004263e-06, "loss": 0.34, "step": 4718 }, { "epoch": 0.661851332398317, "grad_norm": 2.385054837122267, "learning_rate": 2.71064164208895e-06, "loss": 0.3251, "step": 4719 }, { "epoch": 0.6619915848527349, "grad_norm": 2.4934164144406368, "learning_rate": 2.7086226957705773e-06, "loss": 0.3786, "step": 4720 }, { "epoch": 0.6621318373071529, "grad_norm": 2.09854067550914, "learning_rate": 2.7066042222619017e-06, "loss": 0.3854, "step": 4721 }, { "epoch": 0.6622720897615708, "grad_norm": 1.8302577692895277, "learning_rate": 2.704586221979422e-06, "loss": 0.3195, "step": 4722 }, { "epoch": 0.6624123422159888, "grad_norm": 3.1317948284252393, "learning_rate": 2.7025686953395368e-06, "loss": 0.3644, "step": 4723 }, { "epoch": 0.6625525946704067, "grad_norm": 1.7526700223327374, "learning_rate": 2.7005516427585537e-06, "loss": 0.3281, "step": 4724 }, { "epoch": 0.6626928471248247, "grad_norm": 1.7663058428295533, "learning_rate": 2.6985350646526713e-06, "loss": 0.3556, "step": 4725 }, { "epoch": 0.6628330995792426, "grad_norm": 2.7559095522761896, "learning_rate": 2.6965189614379995e-06, "loss": 0.3546, "step": 4726 }, { "epoch": 0.6629733520336606, "grad_norm": 1.6614809742244903, "learning_rate": 2.6945033335305458e-06, "loss": 0.3072, "step": 4727 }, { "epoch": 0.6631136044880785, "grad_norm": 2.2053696040192516, "learning_rate": 2.6924881813462225e-06, "loss": 0.365, "step": 4728 }, { "epoch": 0.6632538569424965, "grad_norm": 1.6586938498105346, "learning_rate": 2.6904735053008405e-06, "loss": 0.3457, "step": 4729 }, { "epoch": 0.6633941093969145, "grad_norm": 2.810974385484437, "learning_rate": 2.688459305810116e-06, "loss": 0.3483, "step": 4730 }, { "epoch": 0.6635343618513324, "grad_norm": 2.446048711059328, "learning_rate": 2.6864455832896633e-06, "loss": 0.3705, "step": 4731 }, { "epoch": 0.6636746143057504, "grad_norm": 1.9848479660963572, "learning_rate": 2.684432338155003e-06, "loss": 0.3039, "step": 4732 }, { "epoch": 0.6638148667601683, "grad_norm": 1.9624996574061393, "learning_rate": 2.6824195708215504e-06, "loss": 0.2931, "step": 4733 }, { "epoch": 0.6639551192145863, "grad_norm": 2.279642453183277, "learning_rate": 2.6804072817046266e-06, "loss": 0.3284, "step": 4734 }, { "epoch": 0.6640953716690042, "grad_norm": 2.1523510609137406, "learning_rate": 2.678395471219455e-06, "loss": 0.3261, "step": 4735 }, { "epoch": 0.6642356241234222, "grad_norm": 3.236647540061164, "learning_rate": 2.6763841397811576e-06, "loss": 0.3104, "step": 4736 }, { "epoch": 0.6643758765778401, "grad_norm": 2.3175174820291, "learning_rate": 2.674373287804759e-06, "loss": 0.3223, "step": 4737 }, { "epoch": 0.6645161290322581, "grad_norm": 2.044567059154848, "learning_rate": 2.6723629157051844e-06, "loss": 0.3573, "step": 4738 }, { "epoch": 0.664656381486676, "grad_norm": 1.8697812249220616, "learning_rate": 2.6703530238972597e-06, "loss": 0.3498, "step": 4739 }, { "epoch": 0.664796633941094, "grad_norm": 1.8125449207921789, "learning_rate": 2.6683436127957122e-06, "loss": 0.3434, "step": 4740 }, { "epoch": 0.664936886395512, "grad_norm": 2.433619026065066, "learning_rate": 2.6663346828151727e-06, "loss": 0.3327, "step": 4741 }, { "epoch": 0.6650771388499299, "grad_norm": 2.9677482119382854, "learning_rate": 2.664326234370164e-06, "loss": 0.3943, "step": 4742 }, { "epoch": 0.6652173913043479, "grad_norm": 3.028960946577812, "learning_rate": 2.662318267875119e-06, "loss": 0.3211, "step": 4743 }, { "epoch": 0.6653576437587658, "grad_norm": 1.8940958368249898, "learning_rate": 2.6603107837443675e-06, "loss": 0.318, "step": 4744 }, { "epoch": 0.6654978962131838, "grad_norm": 1.8530857564492274, "learning_rate": 2.658303782392141e-06, "loss": 0.3508, "step": 4745 }, { "epoch": 0.6656381486676017, "grad_norm": 1.537619984604868, "learning_rate": 2.656297264232567e-06, "loss": 0.3237, "step": 4746 }, { "epoch": 0.6657784011220196, "grad_norm": 1.6202378552862653, "learning_rate": 2.654291229679678e-06, "loss": 0.3291, "step": 4747 }, { "epoch": 0.6659186535764375, "grad_norm": 1.8886748415461125, "learning_rate": 2.652285679147405e-06, "loss": 0.357, "step": 4748 }, { "epoch": 0.6660589060308555, "grad_norm": 2.0224770877327765, "learning_rate": 2.65028061304958e-06, "loss": 0.3423, "step": 4749 }, { "epoch": 0.6661991584852734, "grad_norm": 2.8815385851867266, "learning_rate": 2.6482760317999338e-06, "loss": 0.3389, "step": 4750 }, { "epoch": 0.6663394109396914, "grad_norm": 2.3082438298947254, "learning_rate": 2.6462719358120983e-06, "loss": 0.3756, "step": 4751 }, { "epoch": 0.6664796633941094, "grad_norm": 2.0251263378904842, "learning_rate": 2.644268325499606e-06, "loss": 0.3481, "step": 4752 }, { "epoch": 0.6666199158485273, "grad_norm": 2.493588523130223, "learning_rate": 2.642265201275885e-06, "loss": 0.3424, "step": 4753 }, { "epoch": 0.6667601683029453, "grad_norm": 1.9319181946757449, "learning_rate": 2.640262563554267e-06, "loss": 0.3397, "step": 4754 }, { "epoch": 0.6669004207573632, "grad_norm": 2.442674670499213, "learning_rate": 2.6382604127479815e-06, "loss": 0.3937, "step": 4755 }, { "epoch": 0.6670406732117812, "grad_norm": 2.0225662064060237, "learning_rate": 2.636258749270161e-06, "loss": 0.3022, "step": 4756 }, { "epoch": 0.6671809256661991, "grad_norm": 2.3421753743718603, "learning_rate": 2.634257573533833e-06, "loss": 0.3411, "step": 4757 }, { "epoch": 0.6673211781206171, "grad_norm": 2.2014546656951817, "learning_rate": 2.632256885951925e-06, "loss": 0.3402, "step": 4758 }, { "epoch": 0.667461430575035, "grad_norm": 2.219863826939898, "learning_rate": 2.630256686937267e-06, "loss": 0.3245, "step": 4759 }, { "epoch": 0.667601683029453, "grad_norm": 2.4774018318443582, "learning_rate": 2.6282569769025857e-06, "loss": 0.3313, "step": 4760 }, { "epoch": 0.667741935483871, "grad_norm": 1.5628846267321943, "learning_rate": 2.6262577562605086e-06, "loss": 0.3064, "step": 4761 }, { "epoch": 0.6678821879382889, "grad_norm": 4.691355022878102, "learning_rate": 2.6242590254235566e-06, "loss": 0.3282, "step": 4762 }, { "epoch": 0.6680224403927069, "grad_norm": 1.9757068576284977, "learning_rate": 2.622260784804157e-06, "loss": 0.3239, "step": 4763 }, { "epoch": 0.6681626928471248, "grad_norm": 55.64012917498822, "learning_rate": 2.6202630348146323e-06, "loss": 0.3356, "step": 4764 }, { "epoch": 0.6683029453015428, "grad_norm": 2.218992300638231, "learning_rate": 2.6182657758672046e-06, "loss": 0.356, "step": 4765 }, { "epoch": 0.6684431977559607, "grad_norm": 2.0142887624661494, "learning_rate": 2.616269008373995e-06, "loss": 0.3301, "step": 4766 }, { "epoch": 0.6685834502103787, "grad_norm": 2.291814302254986, "learning_rate": 2.6142727327470203e-06, "loss": 0.3951, "step": 4767 }, { "epoch": 0.6687237026647966, "grad_norm": 3.155596523809966, "learning_rate": 2.612276949398199e-06, "loss": 0.3386, "step": 4768 }, { "epoch": 0.6688639551192146, "grad_norm": 2.0918613854869323, "learning_rate": 2.610281658739347e-06, "loss": 0.3455, "step": 4769 }, { "epoch": 0.6690042075736325, "grad_norm": 1.7799757029986059, "learning_rate": 2.6082868611821787e-06, "loss": 0.3454, "step": 4770 }, { "epoch": 0.6691444600280505, "grad_norm": 2.3196245708560768, "learning_rate": 2.606292557138307e-06, "loss": 0.3836, "step": 4771 }, { "epoch": 0.6692847124824685, "grad_norm": 1.8635379766346316, "learning_rate": 2.6042987470192425e-06, "loss": 0.3536, "step": 4772 }, { "epoch": 0.6694249649368864, "grad_norm": 2.2895219881993363, "learning_rate": 2.602305431236396e-06, "loss": 0.3439, "step": 4773 }, { "epoch": 0.6695652173913044, "grad_norm": 1.811038908205031, "learning_rate": 2.6003126102010696e-06, "loss": 0.3139, "step": 4774 }, { "epoch": 0.6697054698457223, "grad_norm": 2.1723970293547663, "learning_rate": 2.598320284324471e-06, "loss": 0.3565, "step": 4775 }, { "epoch": 0.6698457223001403, "grad_norm": 1.9545669725858683, "learning_rate": 2.596328454017702e-06, "loss": 0.3639, "step": 4776 }, { "epoch": 0.6699859747545582, "grad_norm": 1.9651293689181057, "learning_rate": 2.5943371196917633e-06, "loss": 0.383, "step": 4777 }, { "epoch": 0.6701262272089762, "grad_norm": 2.184369490238603, "learning_rate": 2.592346281757552e-06, "loss": 0.3377, "step": 4778 }, { "epoch": 0.6702664796633941, "grad_norm": 2.2520091305440917, "learning_rate": 2.590355940625865e-06, "loss": 0.356, "step": 4779 }, { "epoch": 0.6704067321178121, "grad_norm": 1.6715399861027498, "learning_rate": 2.5883660967073944e-06, "loss": 0.3716, "step": 4780 }, { "epoch": 0.67054698457223, "grad_norm": 2.390283398126288, "learning_rate": 2.5863767504127313e-06, "loss": 0.3208, "step": 4781 }, { "epoch": 0.670687237026648, "grad_norm": 2.179289969559659, "learning_rate": 2.5843879021523636e-06, "loss": 0.3361, "step": 4782 }, { "epoch": 0.670827489481066, "grad_norm": 1.8004717447086647, "learning_rate": 2.582399552336674e-06, "loss": 0.3465, "step": 4783 }, { "epoch": 0.6709677419354839, "grad_norm": 2.6576084029444367, "learning_rate": 2.5804117013759466e-06, "loss": 0.3136, "step": 4784 }, { "epoch": 0.6711079943899019, "grad_norm": 2.2337519704066096, "learning_rate": 2.5784243496803596e-06, "loss": 0.3612, "step": 4785 }, { "epoch": 0.6712482468443198, "grad_norm": 3.4833930988447315, "learning_rate": 2.5764374976599894e-06, "loss": 0.3155, "step": 4786 }, { "epoch": 0.6713884992987377, "grad_norm": 2.3751951438640884, "learning_rate": 2.574451145724812e-06, "loss": 0.3162, "step": 4787 }, { "epoch": 0.6715287517531556, "grad_norm": 2.1865011932209284, "learning_rate": 2.5724652942846916e-06, "loss": 0.3525, "step": 4788 }, { "epoch": 0.6716690042075736, "grad_norm": 1.989145626899199, "learning_rate": 2.5704799437493976e-06, "loss": 0.3378, "step": 4789 }, { "epoch": 0.6718092566619915, "grad_norm": 2.4700663473508038, "learning_rate": 2.5684950945285937e-06, "loss": 0.3413, "step": 4790 }, { "epoch": 0.6719495091164095, "grad_norm": 1.7890071943638448, "learning_rate": 2.5665107470318396e-06, "loss": 0.2885, "step": 4791 }, { "epoch": 0.6720897615708274, "grad_norm": 9.053408778479733, "learning_rate": 2.5645269016685905e-06, "loss": 0.3846, "step": 4792 }, { "epoch": 0.6722300140252454, "grad_norm": 1.8155621936328632, "learning_rate": 2.5625435588482017e-06, "loss": 0.39, "step": 4793 }, { "epoch": 0.6723702664796634, "grad_norm": 1.5217388366319835, "learning_rate": 2.5605607189799177e-06, "loss": 0.3252, "step": 4794 }, { "epoch": 0.6725105189340813, "grad_norm": 2.169840456639448, "learning_rate": 2.558578382472887e-06, "loss": 0.3354, "step": 4795 }, { "epoch": 0.6726507713884993, "grad_norm": 1.7608257322543013, "learning_rate": 2.5565965497361494e-06, "loss": 0.3393, "step": 4796 }, { "epoch": 0.6727910238429172, "grad_norm": 1.9172696395228257, "learning_rate": 2.5546152211786428e-06, "loss": 0.3055, "step": 4797 }, { "epoch": 0.6729312762973352, "grad_norm": 2.3876126327709613, "learning_rate": 2.5526343972092003e-06, "loss": 0.3662, "step": 4798 }, { "epoch": 0.6730715287517531, "grad_norm": 1.622893802480131, "learning_rate": 2.550654078236552e-06, "loss": 0.324, "step": 4799 }, { "epoch": 0.6732117812061711, "grad_norm": 2.191883382006201, "learning_rate": 2.5486742646693217e-06, "loss": 0.3955, "step": 4800 }, { "epoch": 0.673352033660589, "grad_norm": 2.545905180155053, "learning_rate": 2.5466949569160306e-06, "loss": 0.3654, "step": 4801 }, { "epoch": 0.673492286115007, "grad_norm": 2.8056506360081626, "learning_rate": 2.5447161553850974e-06, "loss": 0.3629, "step": 4802 }, { "epoch": 0.673632538569425, "grad_norm": 2.1400262593948836, "learning_rate": 2.5427378604848285e-06, "loss": 0.3579, "step": 4803 }, { "epoch": 0.6737727910238429, "grad_norm": 1.7256024483461303, "learning_rate": 2.5407600726234356e-06, "loss": 0.333, "step": 4804 }, { "epoch": 0.6739130434782609, "grad_norm": 1.8363989067596194, "learning_rate": 2.538782792209019e-06, "loss": 0.3468, "step": 4805 }, { "epoch": 0.6740532959326788, "grad_norm": 2.321343292514414, "learning_rate": 2.5368060196495785e-06, "loss": 0.3451, "step": 4806 }, { "epoch": 0.6741935483870968, "grad_norm": 1.6191084505388524, "learning_rate": 2.5348297553530064e-06, "loss": 0.2886, "step": 4807 }, { "epoch": 0.6743338008415147, "grad_norm": 1.8480164255469123, "learning_rate": 2.5328539997270927e-06, "loss": 0.3027, "step": 4808 }, { "epoch": 0.6744740532959327, "grad_norm": 1.9105289150307192, "learning_rate": 2.5308787531795186e-06, "loss": 0.3497, "step": 4809 }, { "epoch": 0.6746143057503506, "grad_norm": 2.1354458938707426, "learning_rate": 2.5289040161178623e-06, "loss": 0.3392, "step": 4810 }, { "epoch": 0.6747545582047686, "grad_norm": 1.6922082563344254, "learning_rate": 2.526929788949598e-06, "loss": 0.3395, "step": 4811 }, { "epoch": 0.6748948106591866, "grad_norm": 2.5258509193579277, "learning_rate": 2.524956072082093e-06, "loss": 0.4006, "step": 4812 }, { "epoch": 0.6750350631136045, "grad_norm": 2.5034864909952486, "learning_rate": 2.5229828659226114e-06, "loss": 0.3507, "step": 4813 }, { "epoch": 0.6751753155680225, "grad_norm": 3.8638668715721702, "learning_rate": 2.521010170878311e-06, "loss": 0.3521, "step": 4814 }, { "epoch": 0.6753155680224404, "grad_norm": 2.4862366425014915, "learning_rate": 2.5190379873562402e-06, "loss": 0.3639, "step": 4815 }, { "epoch": 0.6754558204768584, "grad_norm": 2.6345516991776132, "learning_rate": 2.517066315763348e-06, "loss": 0.3882, "step": 4816 }, { "epoch": 0.6755960729312763, "grad_norm": 2.0635241653288183, "learning_rate": 2.5150951565064737e-06, "loss": 0.3373, "step": 4817 }, { "epoch": 0.6757363253856943, "grad_norm": 2.216093272666554, "learning_rate": 2.513124509992353e-06, "loss": 0.3393, "step": 4818 }, { "epoch": 0.6758765778401122, "grad_norm": 3.0399091860106435, "learning_rate": 2.511154376627615e-06, "loss": 0.356, "step": 4819 }, { "epoch": 0.6760168302945302, "grad_norm": 3.8105693994400913, "learning_rate": 2.5091847568187834e-06, "loss": 0.3533, "step": 4820 }, { "epoch": 0.6761570827489481, "grad_norm": 2.979912742258916, "learning_rate": 2.5072156509722745e-06, "loss": 0.3542, "step": 4821 }, { "epoch": 0.6762973352033661, "grad_norm": 1.9496235704220481, "learning_rate": 2.5052470594944e-06, "loss": 0.3708, "step": 4822 }, { "epoch": 0.6764375876577841, "grad_norm": 1.9776677596046714, "learning_rate": 2.5032789827913672e-06, "loss": 0.3302, "step": 4823 }, { "epoch": 0.676577840112202, "grad_norm": 1.8552918038572037, "learning_rate": 2.5013114212692713e-06, "loss": 0.3384, "step": 4824 }, { "epoch": 0.67671809256662, "grad_norm": 3.066828724131287, "learning_rate": 2.499344375334106e-06, "loss": 0.3313, "step": 4825 }, { "epoch": 0.6768583450210379, "grad_norm": 2.0423257373817774, "learning_rate": 2.4973778453917574e-06, "loss": 0.3407, "step": 4826 }, { "epoch": 0.6769985974754558, "grad_norm": 1.9275461895426618, "learning_rate": 2.4954118318480063e-06, "loss": 0.3369, "step": 4827 }, { "epoch": 0.6771388499298737, "grad_norm": 1.6136146921272896, "learning_rate": 2.4934463351085254e-06, "loss": 0.3356, "step": 4828 }, { "epoch": 0.6772791023842917, "grad_norm": 2.023826532265707, "learning_rate": 2.4914813555788827e-06, "loss": 0.3857, "step": 4829 }, { "epoch": 0.6774193548387096, "grad_norm": 2.6130023345586464, "learning_rate": 2.489516893664535e-06, "loss": 0.333, "step": 4830 }, { "epoch": 0.6775596072931276, "grad_norm": 1.9802990601805766, "learning_rate": 2.4875529497708356e-06, "loss": 0.3525, "step": 4831 }, { "epoch": 0.6776998597475455, "grad_norm": 1.7728034411030296, "learning_rate": 2.4855895243030325e-06, "loss": 0.3657, "step": 4832 }, { "epoch": 0.6778401122019635, "grad_norm": 2.1376590647266593, "learning_rate": 2.483626617666264e-06, "loss": 0.3367, "step": 4833 }, { "epoch": 0.6779803646563815, "grad_norm": 1.9989144240091217, "learning_rate": 2.4816642302655634e-06, "loss": 0.3564, "step": 4834 }, { "epoch": 0.6781206171107994, "grad_norm": 1.6249887025880574, "learning_rate": 2.479702362505853e-06, "loss": 0.3611, "step": 4835 }, { "epoch": 0.6782608695652174, "grad_norm": 3.828242459733366, "learning_rate": 2.4777410147919516e-06, "loss": 0.3107, "step": 4836 }, { "epoch": 0.6784011220196353, "grad_norm": 1.7606868724628917, "learning_rate": 2.4757801875285705e-06, "loss": 0.3754, "step": 4837 }, { "epoch": 0.6785413744740533, "grad_norm": 1.6891278080484637, "learning_rate": 2.4738198811203112e-06, "loss": 0.3604, "step": 4838 }, { "epoch": 0.6786816269284712, "grad_norm": 1.6802695042290212, "learning_rate": 2.471860095971671e-06, "loss": 0.3568, "step": 4839 }, { "epoch": 0.6788218793828892, "grad_norm": 2.112436404202052, "learning_rate": 2.4699008324870366e-06, "loss": 0.3469, "step": 4840 }, { "epoch": 0.6789621318373071, "grad_norm": 2.258230731599385, "learning_rate": 2.4679420910706887e-06, "loss": 0.3676, "step": 4841 }, { "epoch": 0.6791023842917251, "grad_norm": 1.7790280588134102, "learning_rate": 2.4659838721268005e-06, "loss": 0.2956, "step": 4842 }, { "epoch": 0.679242636746143, "grad_norm": 8.145004295315204, "learning_rate": 2.4640261760594377e-06, "loss": 0.3464, "step": 4843 }, { "epoch": 0.679382889200561, "grad_norm": 4.064096929220273, "learning_rate": 2.4620690032725536e-06, "loss": 0.3414, "step": 4844 }, { "epoch": 0.679523141654979, "grad_norm": 2.1768670103905094, "learning_rate": 2.4601123541699996e-06, "loss": 0.3786, "step": 4845 }, { "epoch": 0.6796633941093969, "grad_norm": 2.837693188514416, "learning_rate": 2.458156229155516e-06, "loss": 0.3681, "step": 4846 }, { "epoch": 0.6798036465638149, "grad_norm": 2.0115092048026675, "learning_rate": 2.456200628632736e-06, "loss": 0.3524, "step": 4847 }, { "epoch": 0.6799438990182328, "grad_norm": 1.803622505403581, "learning_rate": 2.454245553005184e-06, "loss": 0.3421, "step": 4848 }, { "epoch": 0.6800841514726508, "grad_norm": 1.9715777479916263, "learning_rate": 2.452291002676278e-06, "loss": 0.3438, "step": 4849 }, { "epoch": 0.6802244039270687, "grad_norm": 2.4935131792324867, "learning_rate": 2.450336978049322e-06, "loss": 0.3397, "step": 4850 }, { "epoch": 0.6803646563814867, "grad_norm": 1.737553476292829, "learning_rate": 2.448383479527517e-06, "loss": 0.3432, "step": 4851 }, { "epoch": 0.6805049088359046, "grad_norm": 1.8921600289402105, "learning_rate": 2.446430507513954e-06, "loss": 0.3505, "step": 4852 }, { "epoch": 0.6806451612903226, "grad_norm": 2.602933871612999, "learning_rate": 2.4444780624116147e-06, "loss": 0.3539, "step": 4853 }, { "epoch": 0.6807854137447406, "grad_norm": 1.524525415335669, "learning_rate": 2.4425261446233738e-06, "loss": 0.3193, "step": 4854 }, { "epoch": 0.6809256661991585, "grad_norm": 1.9725900166789263, "learning_rate": 2.4405747545519966e-06, "loss": 0.3038, "step": 4855 }, { "epoch": 0.6810659186535765, "grad_norm": 3.4232317181456127, "learning_rate": 2.4386238926001352e-06, "loss": 0.369, "step": 4856 }, { "epoch": 0.6812061711079944, "grad_norm": 2.1701234849171183, "learning_rate": 2.436673559170339e-06, "loss": 0.3556, "step": 4857 }, { "epoch": 0.6813464235624124, "grad_norm": 1.9295519576558962, "learning_rate": 2.4347237546650443e-06, "loss": 0.3173, "step": 4858 }, { "epoch": 0.6814866760168303, "grad_norm": 1.9591650380341858, "learning_rate": 2.4327744794865803e-06, "loss": 0.3185, "step": 4859 }, { "epoch": 0.6816269284712483, "grad_norm": 1.7390676027070593, "learning_rate": 2.430825734037167e-06, "loss": 0.315, "step": 4860 }, { "epoch": 0.6817671809256662, "grad_norm": 3.183563344611323, "learning_rate": 2.4288775187189134e-06, "loss": 0.3528, "step": 4861 }, { "epoch": 0.6819074333800842, "grad_norm": 1.8374877877381035, "learning_rate": 2.4269298339338205e-06, "loss": 0.3497, "step": 4862 }, { "epoch": 0.6820476858345021, "grad_norm": 1.823141494692052, "learning_rate": 2.42498268008378e-06, "loss": 0.335, "step": 4863 }, { "epoch": 0.6821879382889201, "grad_norm": 2.5986848329242163, "learning_rate": 2.4230360575705743e-06, "loss": 0.3326, "step": 4864 }, { "epoch": 0.6823281907433381, "grad_norm": 2.1540750785359655, "learning_rate": 2.421089966795873e-06, "loss": 0.3142, "step": 4865 }, { "epoch": 0.682468443197756, "grad_norm": 1.8623304840365607, "learning_rate": 2.4191444081612382e-06, "loss": 0.3304, "step": 4866 }, { "epoch": 0.6826086956521739, "grad_norm": 2.2993695473094347, "learning_rate": 2.417199382068124e-06, "loss": 0.3736, "step": 4867 }, { "epoch": 0.6827489481065918, "grad_norm": 1.9145809794824167, "learning_rate": 2.4152548889178722e-06, "loss": 0.3813, "step": 4868 }, { "epoch": 0.6828892005610098, "grad_norm": 1.5904168822831621, "learning_rate": 2.4133109291117156e-06, "loss": 0.3336, "step": 4869 }, { "epoch": 0.6830294530154277, "grad_norm": 1.565604140091316, "learning_rate": 2.4113675030507786e-06, "loss": 0.3482, "step": 4870 }, { "epoch": 0.6831697054698457, "grad_norm": 1.4806693150082852, "learning_rate": 2.40942461113607e-06, "loss": 0.3241, "step": 4871 }, { "epoch": 0.6833099579242636, "grad_norm": 1.7401013393049916, "learning_rate": 2.4074822537684945e-06, "loss": 0.3274, "step": 4872 }, { "epoch": 0.6834502103786816, "grad_norm": 2.2783320328941996, "learning_rate": 2.4055404313488424e-06, "loss": 0.3537, "step": 4873 }, { "epoch": 0.6835904628330995, "grad_norm": 2.029054129715597, "learning_rate": 2.4035991442777963e-06, "loss": 0.3373, "step": 4874 }, { "epoch": 0.6837307152875175, "grad_norm": 2.043027216875199, "learning_rate": 2.401658392955928e-06, "loss": 0.3117, "step": 4875 }, { "epoch": 0.6838709677419355, "grad_norm": 1.8483148090728783, "learning_rate": 2.3997181777836955e-06, "loss": 0.2996, "step": 4876 }, { "epoch": 0.6840112201963534, "grad_norm": 1.9384667230543733, "learning_rate": 2.39777849916145e-06, "loss": 0.352, "step": 4877 }, { "epoch": 0.6841514726507714, "grad_norm": 1.869122639632318, "learning_rate": 2.395839357489431e-06, "loss": 0.3871, "step": 4878 }, { "epoch": 0.6842917251051893, "grad_norm": 1.8982037058851864, "learning_rate": 2.3939007531677656e-06, "loss": 0.3518, "step": 4879 }, { "epoch": 0.6844319775596073, "grad_norm": 1.6800525357342693, "learning_rate": 2.391962686596473e-06, "loss": 0.3167, "step": 4880 }, { "epoch": 0.6845722300140252, "grad_norm": 1.574222536295462, "learning_rate": 2.390025158175458e-06, "loss": 0.3311, "step": 4881 }, { "epoch": 0.6847124824684432, "grad_norm": 1.8750896432428326, "learning_rate": 2.3880881683045176e-06, "loss": 0.3711, "step": 4882 }, { "epoch": 0.6848527349228611, "grad_norm": 2.0688174049483172, "learning_rate": 2.3861517173833347e-06, "loss": 0.3775, "step": 4883 }, { "epoch": 0.6849929873772791, "grad_norm": 2.175492053442408, "learning_rate": 2.3842158058114855e-06, "loss": 0.346, "step": 4884 }, { "epoch": 0.685133239831697, "grad_norm": 1.4958595510893002, "learning_rate": 2.3822804339884283e-06, "loss": 0.3717, "step": 4885 }, { "epoch": 0.685273492286115, "grad_norm": 2.0031945226926537, "learning_rate": 2.3803456023135135e-06, "loss": 0.3666, "step": 4886 }, { "epoch": 0.685413744740533, "grad_norm": 1.765770965214067, "learning_rate": 2.3784113111859818e-06, "loss": 0.3755, "step": 4887 }, { "epoch": 0.6855539971949509, "grad_norm": 1.7711917589259518, "learning_rate": 2.37647756100496e-06, "loss": 0.3461, "step": 4888 }, { "epoch": 0.6856942496493689, "grad_norm": 2.6354565121097484, "learning_rate": 2.3745443521694644e-06, "loss": 0.3195, "step": 4889 }, { "epoch": 0.6858345021037868, "grad_norm": 2.8539677757149495, "learning_rate": 2.3726116850783987e-06, "loss": 0.3525, "step": 4890 }, { "epoch": 0.6859747545582048, "grad_norm": 1.7553092255917502, "learning_rate": 2.370679560130557e-06, "loss": 0.3057, "step": 4891 }, { "epoch": 0.6861150070126227, "grad_norm": 2.0170381308995338, "learning_rate": 2.3687479777246165e-06, "loss": 0.3685, "step": 4892 }, { "epoch": 0.6862552594670407, "grad_norm": 1.9162794081290784, "learning_rate": 2.366816938259148e-06, "loss": 0.3561, "step": 4893 }, { "epoch": 0.6863955119214586, "grad_norm": 2.0485275237134055, "learning_rate": 2.364886442132606e-06, "loss": 0.3203, "step": 4894 }, { "epoch": 0.6865357643758766, "grad_norm": 3.18272640840319, "learning_rate": 2.3629564897433376e-06, "loss": 0.3668, "step": 4895 }, { "epoch": 0.6866760168302946, "grad_norm": 2.0510066607891155, "learning_rate": 2.361027081489575e-06, "loss": 0.3449, "step": 4896 }, { "epoch": 0.6868162692847125, "grad_norm": 2.3276141329438356, "learning_rate": 2.3590982177694348e-06, "loss": 0.332, "step": 4897 }, { "epoch": 0.6869565217391305, "grad_norm": 2.7521228399528583, "learning_rate": 2.357169898980927e-06, "loss": 0.3469, "step": 4898 }, { "epoch": 0.6870967741935484, "grad_norm": 2.2696359843316927, "learning_rate": 2.3552421255219465e-06, "loss": 0.2938, "step": 4899 }, { "epoch": 0.6872370266479664, "grad_norm": 2.272610438551627, "learning_rate": 2.3533148977902755e-06, "loss": 0.2979, "step": 4900 }, { "epoch": 0.6873772791023843, "grad_norm": 1.659763639812041, "learning_rate": 2.3513882161835835e-06, "loss": 0.3288, "step": 4901 }, { "epoch": 0.6875175315568023, "grad_norm": 1.682054351350338, "learning_rate": 2.349462081099429e-06, "loss": 0.3532, "step": 4902 }, { "epoch": 0.6876577840112202, "grad_norm": 2.0667674680963497, "learning_rate": 2.3475364929352554e-06, "loss": 0.3756, "step": 4903 }, { "epoch": 0.6877980364656382, "grad_norm": 2.3794944800120286, "learning_rate": 2.3456114520883956e-06, "loss": 0.3078, "step": 4904 }, { "epoch": 0.6879382889200562, "grad_norm": 2.102733073339568, "learning_rate": 2.343686958956069e-06, "loss": 0.372, "step": 4905 }, { "epoch": 0.6880785413744741, "grad_norm": 1.8473964774689269, "learning_rate": 2.3417630139353782e-06, "loss": 0.3488, "step": 4906 }, { "epoch": 0.688218793828892, "grad_norm": 1.811526859413884, "learning_rate": 2.339839617423318e-06, "loss": 0.3572, "step": 4907 }, { "epoch": 0.6883590462833099, "grad_norm": 1.9006112922292053, "learning_rate": 2.3379167698167666e-06, "loss": 0.3284, "step": 4908 }, { "epoch": 0.6884992987377279, "grad_norm": 6.836867403269759, "learning_rate": 2.3359944715124915e-06, "loss": 0.3425, "step": 4909 }, { "epoch": 0.6886395511921458, "grad_norm": 1.930044411459565, "learning_rate": 2.3340727229071445e-06, "loss": 0.3266, "step": 4910 }, { "epoch": 0.6887798036465638, "grad_norm": 1.8514138978968704, "learning_rate": 2.3321515243972663e-06, "loss": 0.3412, "step": 4911 }, { "epoch": 0.6889200561009817, "grad_norm": 1.923306064158969, "learning_rate": 2.330230876379283e-06, "loss": 0.3572, "step": 4912 }, { "epoch": 0.6890603085553997, "grad_norm": 1.7974390222767227, "learning_rate": 2.3283107792495046e-06, "loss": 0.3788, "step": 4913 }, { "epoch": 0.6892005610098176, "grad_norm": 2.0320310998173725, "learning_rate": 2.326391233404131e-06, "loss": 0.3269, "step": 4914 }, { "epoch": 0.6893408134642356, "grad_norm": 1.9156506858709157, "learning_rate": 2.3244722392392467e-06, "loss": 0.3512, "step": 4915 }, { "epoch": 0.6894810659186535, "grad_norm": 6.679958476320143, "learning_rate": 2.322553797150825e-06, "loss": 0.3371, "step": 4916 }, { "epoch": 0.6896213183730715, "grad_norm": 2.275004472832946, "learning_rate": 2.3206359075347194e-06, "loss": 0.3048, "step": 4917 }, { "epoch": 0.6897615708274895, "grad_norm": 2.4697752163801874, "learning_rate": 2.318718570786675e-06, "loss": 0.3442, "step": 4918 }, { "epoch": 0.6899018232819074, "grad_norm": 1.95949498346129, "learning_rate": 2.3168017873023203e-06, "loss": 0.3365, "step": 4919 }, { "epoch": 0.6900420757363254, "grad_norm": 1.561890368742812, "learning_rate": 2.3148855574771706e-06, "loss": 0.3206, "step": 4920 }, { "epoch": 0.6901823281907433, "grad_norm": 1.9940354098100093, "learning_rate": 2.3129698817066267e-06, "loss": 0.3189, "step": 4921 }, { "epoch": 0.6903225806451613, "grad_norm": 2.8386582283562336, "learning_rate": 2.311054760385974e-06, "loss": 0.3585, "step": 4922 }, { "epoch": 0.6904628330995792, "grad_norm": 1.6717226903826001, "learning_rate": 2.309140193910385e-06, "loss": 0.3252, "step": 4923 }, { "epoch": 0.6906030855539972, "grad_norm": 1.9455905087529684, "learning_rate": 2.307226182674918e-06, "loss": 0.3729, "step": 4924 }, { "epoch": 0.6907433380084151, "grad_norm": 2.100293992671468, "learning_rate": 2.3053127270745163e-06, "loss": 0.3224, "step": 4925 }, { "epoch": 0.6908835904628331, "grad_norm": 2.144882757010462, "learning_rate": 2.3033998275040047e-06, "loss": 0.3883, "step": 4926 }, { "epoch": 0.691023842917251, "grad_norm": 1.8407416288928191, "learning_rate": 2.301487484358099e-06, "loss": 0.3537, "step": 4927 }, { "epoch": 0.691164095371669, "grad_norm": 1.8411367614896692, "learning_rate": 2.2995756980313984e-06, "loss": 0.3351, "step": 4928 }, { "epoch": 0.691304347826087, "grad_norm": 4.384000838091693, "learning_rate": 2.2976644689183848e-06, "loss": 0.3624, "step": 4929 }, { "epoch": 0.6914446002805049, "grad_norm": 2.053349164224674, "learning_rate": 2.295753797413428e-06, "loss": 0.3051, "step": 4930 }, { "epoch": 0.6915848527349229, "grad_norm": 1.8416763541310615, "learning_rate": 2.2938436839107825e-06, "loss": 0.314, "step": 4931 }, { "epoch": 0.6917251051893408, "grad_norm": 2.9839517183360793, "learning_rate": 2.2919341288045853e-06, "loss": 0.3345, "step": 4932 }, { "epoch": 0.6918653576437588, "grad_norm": 2.4474906604810753, "learning_rate": 2.2900251324888627e-06, "loss": 0.3304, "step": 4933 }, { "epoch": 0.6920056100981767, "grad_norm": 2.0332256392845682, "learning_rate": 2.288116695357519e-06, "loss": 0.3946, "step": 4934 }, { "epoch": 0.6921458625525947, "grad_norm": 2.425418550217218, "learning_rate": 2.2862088178043483e-06, "loss": 0.3636, "step": 4935 }, { "epoch": 0.6922861150070126, "grad_norm": 2.319435133569523, "learning_rate": 2.2843015002230283e-06, "loss": 0.3636, "step": 4936 }, { "epoch": 0.6924263674614306, "grad_norm": 1.695037397574108, "learning_rate": 2.282394743007122e-06, "loss": 0.3128, "step": 4937 }, { "epoch": 0.6925666199158486, "grad_norm": 1.674633139674579, "learning_rate": 2.280488546550072e-06, "loss": 0.3229, "step": 4938 }, { "epoch": 0.6927068723702665, "grad_norm": 2.4116542618473575, "learning_rate": 2.27858291124521e-06, "loss": 0.3729, "step": 4939 }, { "epoch": 0.6928471248246845, "grad_norm": 1.710757938525467, "learning_rate": 2.276677837485752e-06, "loss": 0.3182, "step": 4940 }, { "epoch": 0.6929873772791024, "grad_norm": 2.1489788691146137, "learning_rate": 2.2747733256647946e-06, "loss": 0.3468, "step": 4941 }, { "epoch": 0.6931276297335204, "grad_norm": 1.7799265445597243, "learning_rate": 2.2728693761753216e-06, "loss": 0.3844, "step": 4942 }, { "epoch": 0.6932678821879383, "grad_norm": 1.915242419224993, "learning_rate": 2.2709659894102e-06, "loss": 0.3649, "step": 4943 }, { "epoch": 0.6934081346423563, "grad_norm": 1.7433303027129712, "learning_rate": 2.26906316576218e-06, "loss": 0.339, "step": 4944 }, { "epoch": 0.6935483870967742, "grad_norm": 2.1112188122141378, "learning_rate": 2.2671609056238953e-06, "loss": 0.3906, "step": 4945 }, { "epoch": 0.6936886395511922, "grad_norm": 2.231766785685112, "learning_rate": 2.265259209387867e-06, "loss": 0.3461, "step": 4946 }, { "epoch": 0.69382889200561, "grad_norm": 1.5217532506725975, "learning_rate": 2.263358077446492e-06, "loss": 0.3758, "step": 4947 }, { "epoch": 0.693969144460028, "grad_norm": 2.2807142072664544, "learning_rate": 2.2614575101920585e-06, "loss": 0.3365, "step": 4948 }, { "epoch": 0.694109396914446, "grad_norm": 1.737891884389821, "learning_rate": 2.2595575080167348e-06, "loss": 0.3469, "step": 4949 }, { "epoch": 0.6942496493688639, "grad_norm": 1.6898442692079068, "learning_rate": 2.257658071312573e-06, "loss": 0.3348, "step": 4950 }, { "epoch": 0.6943899018232819, "grad_norm": 3.685137234206689, "learning_rate": 2.2557592004715084e-06, "loss": 0.3366, "step": 4951 }, { "epoch": 0.6945301542776998, "grad_norm": 2.268858923280868, "learning_rate": 2.25386089588536e-06, "loss": 0.3405, "step": 4952 }, { "epoch": 0.6946704067321178, "grad_norm": 4.717015231747, "learning_rate": 2.25196315794583e-06, "loss": 0.3553, "step": 4953 }, { "epoch": 0.6948106591865357, "grad_norm": 1.750220625457748, "learning_rate": 2.250065987044505e-06, "loss": 0.3487, "step": 4954 }, { "epoch": 0.6949509116409537, "grad_norm": 1.773653462574046, "learning_rate": 2.248169383572849e-06, "loss": 0.3293, "step": 4955 }, { "epoch": 0.6950911640953716, "grad_norm": 1.7211895362356864, "learning_rate": 2.2462733479222147e-06, "loss": 0.352, "step": 4956 }, { "epoch": 0.6952314165497896, "grad_norm": 1.991094611092209, "learning_rate": 2.244377880483838e-06, "loss": 0.3665, "step": 4957 }, { "epoch": 0.6953716690042075, "grad_norm": 1.7630594569548512, "learning_rate": 2.242482981648831e-06, "loss": 0.3318, "step": 4958 }, { "epoch": 0.6955119214586255, "grad_norm": 1.8962335548366729, "learning_rate": 2.2405886518081967e-06, "loss": 0.2876, "step": 4959 }, { "epoch": 0.6956521739130435, "grad_norm": 1.7127470436944867, "learning_rate": 2.238694891352814e-06, "loss": 0.384, "step": 4960 }, { "epoch": 0.6957924263674614, "grad_norm": 1.8407797730532787, "learning_rate": 2.236801700673449e-06, "loss": 0.3004, "step": 4961 }, { "epoch": 0.6959326788218794, "grad_norm": 1.6073013575672346, "learning_rate": 2.2349090801607477e-06, "loss": 0.307, "step": 4962 }, { "epoch": 0.6960729312762973, "grad_norm": 2.234191246746476, "learning_rate": 2.233017030205239e-06, "loss": 0.3429, "step": 4963 }, { "epoch": 0.6962131837307153, "grad_norm": 2.1695388194890683, "learning_rate": 2.2311255511973347e-06, "loss": 0.3327, "step": 4964 }, { "epoch": 0.6963534361851332, "grad_norm": 2.319662086443192, "learning_rate": 2.2292346435273277e-06, "loss": 0.3384, "step": 4965 }, { "epoch": 0.6964936886395512, "grad_norm": 2.435123948344073, "learning_rate": 2.227344307585396e-06, "loss": 0.3099, "step": 4966 }, { "epoch": 0.6966339410939691, "grad_norm": 1.9782460397145347, "learning_rate": 2.2254545437615932e-06, "loss": 0.354, "step": 4967 }, { "epoch": 0.6967741935483871, "grad_norm": 1.855506077441144, "learning_rate": 2.223565352445861e-06, "loss": 0.3603, "step": 4968 }, { "epoch": 0.696914446002805, "grad_norm": 2.5492210582239876, "learning_rate": 2.2216767340280206e-06, "loss": 0.335, "step": 4969 }, { "epoch": 0.697054698457223, "grad_norm": 4.32902060376448, "learning_rate": 2.219788688897775e-06, "loss": 0.3546, "step": 4970 }, { "epoch": 0.697194950911641, "grad_norm": 1.9668548181478436, "learning_rate": 2.2179012174447097e-06, "loss": 0.3423, "step": 4971 }, { "epoch": 0.6973352033660589, "grad_norm": 1.9644566300581427, "learning_rate": 2.2160143200582906e-06, "loss": 0.3427, "step": 4972 }, { "epoch": 0.6974754558204769, "grad_norm": 2.3470263151717936, "learning_rate": 2.2141279971278663e-06, "loss": 0.295, "step": 4973 }, { "epoch": 0.6976157082748948, "grad_norm": 2.1396924514456934, "learning_rate": 2.2122422490426676e-06, "loss": 0.3017, "step": 4974 }, { "epoch": 0.6977559607293128, "grad_norm": 2.338386502176925, "learning_rate": 2.2103570761918023e-06, "loss": 0.3375, "step": 4975 }, { "epoch": 0.6978962131837307, "grad_norm": 1.5508060020950503, "learning_rate": 2.208472478964265e-06, "loss": 0.3048, "step": 4976 }, { "epoch": 0.6980364656381487, "grad_norm": 1.6475311969819226, "learning_rate": 2.2065884577489276e-06, "loss": 0.3641, "step": 4977 }, { "epoch": 0.6981767180925667, "grad_norm": 1.866884917318121, "learning_rate": 2.2047050129345478e-06, "loss": 0.3832, "step": 4978 }, { "epoch": 0.6983169705469846, "grad_norm": 2.3844679788082015, "learning_rate": 2.202822144909757e-06, "loss": 0.3286, "step": 4979 }, { "epoch": 0.6984572230014026, "grad_norm": 1.8905640269133994, "learning_rate": 2.2009398540630742e-06, "loss": 0.3831, "step": 4980 }, { "epoch": 0.6985974754558205, "grad_norm": 2.1308290530109444, "learning_rate": 2.199058140782897e-06, "loss": 0.3591, "step": 4981 }, { "epoch": 0.6987377279102385, "grad_norm": 1.8859487032300772, "learning_rate": 2.197177005457503e-06, "loss": 0.3507, "step": 4982 }, { "epoch": 0.6988779803646564, "grad_norm": 2.2174662785078785, "learning_rate": 2.1952964484750527e-06, "loss": 0.3817, "step": 4983 }, { "epoch": 0.6990182328190744, "grad_norm": 1.8149222269550918, "learning_rate": 2.1934164702235844e-06, "loss": 0.3741, "step": 4984 }, { "epoch": 0.6991584852734923, "grad_norm": 1.7668686934184172, "learning_rate": 2.1915370710910188e-06, "loss": 0.3567, "step": 4985 }, { "epoch": 0.6992987377279103, "grad_norm": 1.94357320097987, "learning_rate": 2.1896582514651577e-06, "loss": 0.3166, "step": 4986 }, { "epoch": 0.6994389901823281, "grad_norm": 2.4325619097493045, "learning_rate": 2.1877800117336835e-06, "loss": 0.3491, "step": 4987 }, { "epoch": 0.6995792426367461, "grad_norm": 3.0002342768427908, "learning_rate": 2.1859023522841543e-06, "loss": 0.3291, "step": 4988 }, { "epoch": 0.699719495091164, "grad_norm": 2.2086329226938166, "learning_rate": 2.184025273504014e-06, "loss": 0.3632, "step": 4989 }, { "epoch": 0.699859747545582, "grad_norm": 1.8811335378698089, "learning_rate": 2.1821487757805843e-06, "loss": 0.3229, "step": 4990 }, { "epoch": 0.7, "grad_norm": 2.146365744702934, "learning_rate": 2.180272859501068e-06, "loss": 0.3457, "step": 4991 }, { "epoch": 0.7001402524544179, "grad_norm": 2.0006237249540146, "learning_rate": 2.178397525052546e-06, "loss": 0.3432, "step": 4992 }, { "epoch": 0.7002805049088359, "grad_norm": 1.9857248836246286, "learning_rate": 2.176522772821983e-06, "loss": 0.3176, "step": 4993 }, { "epoch": 0.7004207573632538, "grad_norm": 1.8293669304964608, "learning_rate": 2.1746486031962183e-06, "loss": 0.3725, "step": 4994 }, { "epoch": 0.7005610098176718, "grad_norm": 1.9802026212457033, "learning_rate": 2.172775016561977e-06, "loss": 0.349, "step": 4995 }, { "epoch": 0.7007012622720897, "grad_norm": 1.8544252051078014, "learning_rate": 2.1709020133058566e-06, "loss": 0.3084, "step": 4996 }, { "epoch": 0.7008415147265077, "grad_norm": 1.720086566387843, "learning_rate": 2.16902959381434e-06, "loss": 0.3581, "step": 4997 }, { "epoch": 0.7009817671809256, "grad_norm": 4.118131757306003, "learning_rate": 2.16715775847379e-06, "loss": 0.3129, "step": 4998 }, { "epoch": 0.7011220196353436, "grad_norm": 3.0040660959482373, "learning_rate": 2.1652865076704432e-06, "loss": 0.3419, "step": 4999 }, { "epoch": 0.7012622720897616, "grad_norm": 2.307826707056283, "learning_rate": 2.16341584179042e-06, "loss": 0.3435, "step": 5000 }, { "epoch": 0.7014025245441795, "grad_norm": 1.8347030060453506, "learning_rate": 2.1615457612197206e-06, "loss": 0.3742, "step": 5001 }, { "epoch": 0.7015427769985975, "grad_norm": 1.8641945062342542, "learning_rate": 2.159676266344222e-06, "loss": 0.361, "step": 5002 }, { "epoch": 0.7016830294530154, "grad_norm": 1.6426599869984482, "learning_rate": 2.1578073575496814e-06, "loss": 0.3136, "step": 5003 }, { "epoch": 0.7018232819074334, "grad_norm": 1.7516965816470764, "learning_rate": 2.1559390352217357e-06, "loss": 0.3369, "step": 5004 }, { "epoch": 0.7019635343618513, "grad_norm": 1.9462112740428954, "learning_rate": 2.1540712997459e-06, "loss": 0.3154, "step": 5005 }, { "epoch": 0.7021037868162693, "grad_norm": 3.2220678913675966, "learning_rate": 2.1522041515075686e-06, "loss": 0.3223, "step": 5006 }, { "epoch": 0.7022440392706872, "grad_norm": 1.9223240273527498, "learning_rate": 2.150337590892016e-06, "loss": 0.3785, "step": 5007 }, { "epoch": 0.7023842917251052, "grad_norm": 1.86673581074765, "learning_rate": 2.14847161828439e-06, "loss": 0.3279, "step": 5008 }, { "epoch": 0.7025245441795231, "grad_norm": 2.3034515687634225, "learning_rate": 2.1466062340697234e-06, "loss": 0.3406, "step": 5009 }, { "epoch": 0.7026647966339411, "grad_norm": 3.33387553255807, "learning_rate": 2.144741438632925e-06, "loss": 0.325, "step": 5010 }, { "epoch": 0.7028050490883591, "grad_norm": 2.2296706607345795, "learning_rate": 2.1428772323587827e-06, "loss": 0.3782, "step": 5011 }, { "epoch": 0.702945301542777, "grad_norm": 1.8868797344136996, "learning_rate": 2.141013615631962e-06, "loss": 0.4091, "step": 5012 }, { "epoch": 0.703085553997195, "grad_norm": 1.5316438752236938, "learning_rate": 2.1391505888370067e-06, "loss": 0.3135, "step": 5013 }, { "epoch": 0.7032258064516129, "grad_norm": 1.6077696561897485, "learning_rate": 2.13728815235834e-06, "loss": 0.3243, "step": 5014 }, { "epoch": 0.7033660589060309, "grad_norm": 3.6524656877205, "learning_rate": 2.1354263065802627e-06, "loss": 0.3282, "step": 5015 }, { "epoch": 0.7035063113604488, "grad_norm": 1.8402118679055728, "learning_rate": 2.1335650518869555e-06, "loss": 0.3384, "step": 5016 }, { "epoch": 0.7036465638148668, "grad_norm": 1.870062047877553, "learning_rate": 2.1317043886624718e-06, "loss": 0.3474, "step": 5017 }, { "epoch": 0.7037868162692847, "grad_norm": 1.588845054129988, "learning_rate": 2.1298443172907475e-06, "loss": 0.335, "step": 5018 }, { "epoch": 0.7039270687237027, "grad_norm": 3.3327880572331683, "learning_rate": 2.127984838155598e-06, "loss": 0.3245, "step": 5019 }, { "epoch": 0.7040673211781207, "grad_norm": 1.7506758883297056, "learning_rate": 2.1261259516407098e-06, "loss": 0.33, "step": 5020 }, { "epoch": 0.7042075736325386, "grad_norm": 1.7420627566441307, "learning_rate": 2.1242676581296527e-06, "loss": 0.3738, "step": 5021 }, { "epoch": 0.7043478260869566, "grad_norm": 2.1044719084362273, "learning_rate": 2.1224099580058734e-06, "loss": 0.3428, "step": 5022 }, { "epoch": 0.7044880785413745, "grad_norm": 1.5899683648087246, "learning_rate": 2.120552851652694e-06, "loss": 0.3215, "step": 5023 }, { "epoch": 0.7046283309957925, "grad_norm": 2.245380533842817, "learning_rate": 2.1186963394533165e-06, "loss": 0.361, "step": 5024 }, { "epoch": 0.7047685834502104, "grad_norm": 2.143175262725051, "learning_rate": 2.1168404217908194e-06, "loss": 0.3255, "step": 5025 }, { "epoch": 0.7049088359046284, "grad_norm": 1.9523550728556962, "learning_rate": 2.114985099048158e-06, "loss": 0.3606, "step": 5026 }, { "epoch": 0.7050490883590463, "grad_norm": 1.81453183170757, "learning_rate": 2.113130371608165e-06, "loss": 0.3773, "step": 5027 }, { "epoch": 0.7051893408134642, "grad_norm": 1.5598531811656515, "learning_rate": 2.111276239853552e-06, "loss": 0.3217, "step": 5028 }, { "epoch": 0.7053295932678821, "grad_norm": 1.768485461290956, "learning_rate": 2.109422704166903e-06, "loss": 0.3775, "step": 5029 }, { "epoch": 0.7054698457223001, "grad_norm": 2.2490331903826744, "learning_rate": 2.1075697649306838e-06, "loss": 0.3283, "step": 5030 }, { "epoch": 0.705610098176718, "grad_norm": 2.2071010342258375, "learning_rate": 2.105717422527235e-06, "loss": 0.2999, "step": 5031 }, { "epoch": 0.705750350631136, "grad_norm": 1.97347528054505, "learning_rate": 2.103865677338776e-06, "loss": 0.351, "step": 5032 }, { "epoch": 0.705890603085554, "grad_norm": 1.8673725754698942, "learning_rate": 2.1020145297474003e-06, "loss": 0.3367, "step": 5033 }, { "epoch": 0.7060308555399719, "grad_norm": 2.1983279766897668, "learning_rate": 2.1001639801350793e-06, "loss": 0.3309, "step": 5034 }, { "epoch": 0.7061711079943899, "grad_norm": 3.0904254968510263, "learning_rate": 2.0983140288836607e-06, "loss": 0.3457, "step": 5035 }, { "epoch": 0.7063113604488078, "grad_norm": 2.3794811537529927, "learning_rate": 2.0964646763748696e-06, "loss": 0.37, "step": 5036 }, { "epoch": 0.7064516129032258, "grad_norm": 2.3211278028888778, "learning_rate": 2.094615922990309e-06, "loss": 0.3297, "step": 5037 }, { "epoch": 0.7065918653576437, "grad_norm": 2.467789958238873, "learning_rate": 2.092767769111452e-06, "loss": 0.3797, "step": 5038 }, { "epoch": 0.7067321178120617, "grad_norm": 1.6758116961650011, "learning_rate": 2.090920215119657e-06, "loss": 0.3299, "step": 5039 }, { "epoch": 0.7068723702664796, "grad_norm": 1.8454785407138252, "learning_rate": 2.089073261396148e-06, "loss": 0.3473, "step": 5040 }, { "epoch": 0.7070126227208976, "grad_norm": 2.0545678629692774, "learning_rate": 2.0872269083220346e-06, "loss": 0.3501, "step": 5041 }, { "epoch": 0.7071528751753156, "grad_norm": 2.129468483133855, "learning_rate": 2.085381156278299e-06, "loss": 0.3555, "step": 5042 }, { "epoch": 0.7072931276297335, "grad_norm": 1.6214863864681273, "learning_rate": 2.0835360056457983e-06, "loss": 0.337, "step": 5043 }, { "epoch": 0.7074333800841515, "grad_norm": 2.07729041758089, "learning_rate": 2.0816914568052664e-06, "loss": 0.4135, "step": 5044 }, { "epoch": 0.7075736325385694, "grad_norm": 1.9305697134887765, "learning_rate": 2.079847510137314e-06, "loss": 0.3554, "step": 5045 }, { "epoch": 0.7077138849929874, "grad_norm": 1.7000211377793173, "learning_rate": 2.078004166022426e-06, "loss": 0.3478, "step": 5046 }, { "epoch": 0.7078541374474053, "grad_norm": 2.0439659587839825, "learning_rate": 2.0761614248409635e-06, "loss": 0.3086, "step": 5047 }, { "epoch": 0.7079943899018233, "grad_norm": 2.127269318532695, "learning_rate": 2.0743192869731655e-06, "loss": 0.3327, "step": 5048 }, { "epoch": 0.7081346423562412, "grad_norm": 2.3993226446514457, "learning_rate": 2.07247775279914e-06, "loss": 0.3253, "step": 5049 }, { "epoch": 0.7082748948106592, "grad_norm": 1.73579252583626, "learning_rate": 2.0706368226988772e-06, "loss": 0.2972, "step": 5050 }, { "epoch": 0.7084151472650771, "grad_norm": 1.946504220204907, "learning_rate": 2.0687964970522394e-06, "loss": 0.3529, "step": 5051 }, { "epoch": 0.7085553997194951, "grad_norm": 2.1665721897165517, "learning_rate": 2.066956776238966e-06, "loss": 0.3016, "step": 5052 }, { "epoch": 0.7086956521739131, "grad_norm": 1.818096531146455, "learning_rate": 2.0651176606386697e-06, "loss": 0.3423, "step": 5053 }, { "epoch": 0.708835904628331, "grad_norm": 1.7765876251564428, "learning_rate": 2.06327915063084e-06, "loss": 0.3325, "step": 5054 }, { "epoch": 0.708976157082749, "grad_norm": 1.6127443320701487, "learning_rate": 2.0614412465948392e-06, "loss": 0.3415, "step": 5055 }, { "epoch": 0.7091164095371669, "grad_norm": 1.841172760834422, "learning_rate": 2.0596039489099066e-06, "loss": 0.3227, "step": 5056 }, { "epoch": 0.7092566619915849, "grad_norm": 2.0288886796956525, "learning_rate": 2.057767257955157e-06, "loss": 0.3243, "step": 5057 }, { "epoch": 0.7093969144460028, "grad_norm": 2.589526955401833, "learning_rate": 2.055931174109579e-06, "loss": 0.2918, "step": 5058 }, { "epoch": 0.7095371669004208, "grad_norm": 2.176714219435069, "learning_rate": 2.054095697752032e-06, "loss": 0.3591, "step": 5059 }, { "epoch": 0.7096774193548387, "grad_norm": 4.561776926930897, "learning_rate": 2.0522608292612583e-06, "loss": 0.3153, "step": 5060 }, { "epoch": 0.7098176718092567, "grad_norm": 1.909942427761125, "learning_rate": 2.050426569015866e-06, "loss": 0.3624, "step": 5061 }, { "epoch": 0.7099579242636747, "grad_norm": 1.9319457149737589, "learning_rate": 2.0485929173943436e-06, "loss": 0.3198, "step": 5062 }, { "epoch": 0.7100981767180926, "grad_norm": 1.8323259909579115, "learning_rate": 2.0467598747750533e-06, "loss": 0.3358, "step": 5063 }, { "epoch": 0.7102384291725106, "grad_norm": 1.9768674713194019, "learning_rate": 2.044927441536229e-06, "loss": 0.3771, "step": 5064 }, { "epoch": 0.7103786816269285, "grad_norm": 3.681989917324834, "learning_rate": 2.043095618055982e-06, "loss": 0.3515, "step": 5065 }, { "epoch": 0.7105189340813465, "grad_norm": 2.0974489248102084, "learning_rate": 2.0412644047122953e-06, "loss": 0.3675, "step": 5066 }, { "epoch": 0.7106591865357644, "grad_norm": 1.506785937683091, "learning_rate": 2.039433801883027e-06, "loss": 0.3187, "step": 5067 }, { "epoch": 0.7107994389901823, "grad_norm": 1.8397792148813068, "learning_rate": 2.0376038099459104e-06, "loss": 0.3792, "step": 5068 }, { "epoch": 0.7109396914446002, "grad_norm": 2.7023361025081964, "learning_rate": 2.035774429278552e-06, "loss": 0.3168, "step": 5069 }, { "epoch": 0.7110799438990182, "grad_norm": 2.156760666679022, "learning_rate": 2.033945660258429e-06, "loss": 0.3038, "step": 5070 }, { "epoch": 0.7112201963534361, "grad_norm": 2.145730050079679, "learning_rate": 2.032117503262896e-06, "loss": 0.3655, "step": 5071 }, { "epoch": 0.7113604488078541, "grad_norm": 2.192895610963617, "learning_rate": 2.030289958669181e-06, "loss": 0.3168, "step": 5072 }, { "epoch": 0.711500701262272, "grad_norm": 2.448345898811741, "learning_rate": 2.0284630268543853e-06, "loss": 0.3575, "step": 5073 }, { "epoch": 0.71164095371669, "grad_norm": 2.8186177157130663, "learning_rate": 2.026636708195483e-06, "loss": 0.3079, "step": 5074 }, { "epoch": 0.711781206171108, "grad_norm": 1.4530044013401062, "learning_rate": 2.0248110030693223e-06, "loss": 0.3223, "step": 5075 }, { "epoch": 0.7119214586255259, "grad_norm": 1.8277664485040477, "learning_rate": 2.0229859118526244e-06, "loss": 0.3405, "step": 5076 }, { "epoch": 0.7120617110799439, "grad_norm": 2.3654508861731753, "learning_rate": 2.0211614349219855e-06, "loss": 0.3208, "step": 5077 }, { "epoch": 0.7122019635343618, "grad_norm": 2.0013950965791545, "learning_rate": 2.0193375726538737e-06, "loss": 0.3498, "step": 5078 }, { "epoch": 0.7123422159887798, "grad_norm": 1.857968800404947, "learning_rate": 2.0175143254246277e-06, "loss": 0.3568, "step": 5079 }, { "epoch": 0.7124824684431977, "grad_norm": 3.6731879301653625, "learning_rate": 2.0156916936104654e-06, "loss": 0.3397, "step": 5080 }, { "epoch": 0.7126227208976157, "grad_norm": 1.8642040982308885, "learning_rate": 2.01386967758747e-06, "loss": 0.3229, "step": 5081 }, { "epoch": 0.7127629733520336, "grad_norm": 2.046725435106983, "learning_rate": 2.012048277731604e-06, "loss": 0.3416, "step": 5082 }, { "epoch": 0.7129032258064516, "grad_norm": 1.8774035624843246, "learning_rate": 2.0102274944187005e-06, "loss": 0.3433, "step": 5083 }, { "epoch": 0.7130434782608696, "grad_norm": 2.9286754436828866, "learning_rate": 2.008407328024465e-06, "loss": 0.3543, "step": 5084 }, { "epoch": 0.7131837307152875, "grad_norm": 2.4573121509180638, "learning_rate": 2.0065877789244762e-06, "loss": 0.3134, "step": 5085 }, { "epoch": 0.7133239831697055, "grad_norm": 1.9388622294247575, "learning_rate": 2.004768847494186e-06, "loss": 0.3316, "step": 5086 }, { "epoch": 0.7134642356241234, "grad_norm": 2.3780639359884086, "learning_rate": 2.0029505341089183e-06, "loss": 0.3465, "step": 5087 }, { "epoch": 0.7136044880785414, "grad_norm": 1.7964356258419587, "learning_rate": 2.0011328391438685e-06, "loss": 0.2894, "step": 5088 }, { "epoch": 0.7137447405329593, "grad_norm": 1.973605511083435, "learning_rate": 1.999315762974107e-06, "loss": 0.354, "step": 5089 }, { "epoch": 0.7138849929873773, "grad_norm": 3.554051738519123, "learning_rate": 1.997499305974572e-06, "loss": 0.323, "step": 5090 }, { "epoch": 0.7140252454417952, "grad_norm": 2.675031704581072, "learning_rate": 1.9956834685200778e-06, "loss": 0.3995, "step": 5091 }, { "epoch": 0.7141654978962132, "grad_norm": 1.8131502234812635, "learning_rate": 1.9938682509853097e-06, "loss": 0.393, "step": 5092 }, { "epoch": 0.7143057503506312, "grad_norm": 1.989104874152798, "learning_rate": 1.992053653744826e-06, "loss": 0.3895, "step": 5093 }, { "epoch": 0.7144460028050491, "grad_norm": 1.628787980943264, "learning_rate": 1.990239677173056e-06, "loss": 0.3497, "step": 5094 }, { "epoch": 0.7145862552594671, "grad_norm": 2.4860239374753004, "learning_rate": 1.9884263216443002e-06, "loss": 0.2967, "step": 5095 }, { "epoch": 0.714726507713885, "grad_norm": 1.7492160403531798, "learning_rate": 1.9866135875327325e-06, "loss": 0.379, "step": 5096 }, { "epoch": 0.714866760168303, "grad_norm": 3.971354673625309, "learning_rate": 1.9848014752123977e-06, "loss": 0.3841, "step": 5097 }, { "epoch": 0.7150070126227209, "grad_norm": 2.167217172475929, "learning_rate": 1.982989985057213e-06, "loss": 0.3422, "step": 5098 }, { "epoch": 0.7151472650771389, "grad_norm": 3.3701777445841534, "learning_rate": 1.9811791174409676e-06, "loss": 0.3379, "step": 5099 }, { "epoch": 0.7152875175315568, "grad_norm": 1.9345081363643923, "learning_rate": 1.979368872737319e-06, "loss": 0.3364, "step": 5100 }, { "epoch": 0.7154277699859748, "grad_norm": 2.008132493461925, "learning_rate": 1.9775592513198015e-06, "loss": 0.334, "step": 5101 }, { "epoch": 0.7155680224403927, "grad_norm": 1.938069905786888, "learning_rate": 1.9757502535618137e-06, "loss": 0.3427, "step": 5102 }, { "epoch": 0.7157082748948107, "grad_norm": 1.7472972434628686, "learning_rate": 1.973941879836633e-06, "loss": 0.2977, "step": 5103 }, { "epoch": 0.7158485273492287, "grad_norm": 1.900339123928792, "learning_rate": 1.9721341305174025e-06, "loss": 0.3047, "step": 5104 }, { "epoch": 0.7159887798036466, "grad_norm": 2.1029331885013116, "learning_rate": 1.9703270059771406e-06, "loss": 0.3781, "step": 5105 }, { "epoch": 0.7161290322580646, "grad_norm": 2.111016433109741, "learning_rate": 1.9685205065887336e-06, "loss": 0.356, "step": 5106 }, { "epoch": 0.7162692847124825, "grad_norm": 2.310444201576799, "learning_rate": 1.966714632724941e-06, "loss": 0.385, "step": 5107 }, { "epoch": 0.7164095371669004, "grad_norm": 1.799053298160675, "learning_rate": 1.964909384758391e-06, "loss": 0.3372, "step": 5108 }, { "epoch": 0.7165497896213183, "grad_norm": 2.0032243962852885, "learning_rate": 1.963104763061585e-06, "loss": 0.3212, "step": 5109 }, { "epoch": 0.7166900420757363, "grad_norm": 2.4295163600297784, "learning_rate": 1.9613007680068957e-06, "loss": 0.3218, "step": 5110 }, { "epoch": 0.7168302945301542, "grad_norm": 2.6042331127590748, "learning_rate": 1.959497399966561e-06, "loss": 0.329, "step": 5111 }, { "epoch": 0.7169705469845722, "grad_norm": 1.9213926749034111, "learning_rate": 1.957694659312695e-06, "loss": 0.3484, "step": 5112 }, { "epoch": 0.7171107994389901, "grad_norm": 2.014328413900129, "learning_rate": 1.955892546417281e-06, "loss": 0.3194, "step": 5113 }, { "epoch": 0.7172510518934081, "grad_norm": 1.9335946747748736, "learning_rate": 1.954091061652172e-06, "loss": 0.3013, "step": 5114 }, { "epoch": 0.717391304347826, "grad_norm": 2.0745085030742567, "learning_rate": 1.9522902053890925e-06, "loss": 0.3635, "step": 5115 }, { "epoch": 0.717531556802244, "grad_norm": 2.1666974163318717, "learning_rate": 1.9504899779996354e-06, "loss": 0.3312, "step": 5116 }, { "epoch": 0.717671809256662, "grad_norm": 1.9068403736872883, "learning_rate": 1.9486903798552665e-06, "loss": 0.3671, "step": 5117 }, { "epoch": 0.7178120617110799, "grad_norm": 1.5146708680489749, "learning_rate": 1.946891411327319e-06, "loss": 0.2997, "step": 5118 }, { "epoch": 0.7179523141654979, "grad_norm": 1.7557612000240834, "learning_rate": 1.9450930727870004e-06, "loss": 0.3106, "step": 5119 }, { "epoch": 0.7180925666199158, "grad_norm": 2.1320388005431523, "learning_rate": 1.943295364605381e-06, "loss": 0.3366, "step": 5120 }, { "epoch": 0.7182328190743338, "grad_norm": 1.6361367731864962, "learning_rate": 1.941498287153409e-06, "loss": 0.3276, "step": 5121 }, { "epoch": 0.7183730715287517, "grad_norm": 1.800253437002403, "learning_rate": 1.9397018408018947e-06, "loss": 0.3651, "step": 5122 }, { "epoch": 0.7185133239831697, "grad_norm": 1.830366732741697, "learning_rate": 1.9379060259215255e-06, "loss": 0.323, "step": 5123 }, { "epoch": 0.7186535764375876, "grad_norm": 1.8870877681025053, "learning_rate": 1.936110842882854e-06, "loss": 0.3439, "step": 5124 }, { "epoch": 0.7187938288920056, "grad_norm": 1.8852424611354548, "learning_rate": 1.934316292056304e-06, "loss": 0.3496, "step": 5125 }, { "epoch": 0.7189340813464236, "grad_norm": 1.6054238269024805, "learning_rate": 1.9325223738121685e-06, "loss": 0.306, "step": 5126 }, { "epoch": 0.7190743338008415, "grad_norm": 2.1570231944938545, "learning_rate": 1.9307290885206102e-06, "loss": 0.342, "step": 5127 }, { "epoch": 0.7192145862552595, "grad_norm": 2.6756731665841893, "learning_rate": 1.928936436551661e-06, "loss": 0.3238, "step": 5128 }, { "epoch": 0.7193548387096774, "grad_norm": 1.8203018908539417, "learning_rate": 1.927144418275222e-06, "loss": 0.3078, "step": 5129 }, { "epoch": 0.7194950911640954, "grad_norm": 2.59700387205732, "learning_rate": 1.925353034061065e-06, "loss": 0.3662, "step": 5130 }, { "epoch": 0.7196353436185133, "grad_norm": 2.035901187739777, "learning_rate": 1.9235622842788264e-06, "loss": 0.3155, "step": 5131 }, { "epoch": 0.7197755960729313, "grad_norm": 1.7428021223509278, "learning_rate": 1.9217721692980172e-06, "loss": 0.3596, "step": 5132 }, { "epoch": 0.7199158485273492, "grad_norm": 1.8377145810596403, "learning_rate": 1.9199826894880147e-06, "loss": 0.3281, "step": 5133 }, { "epoch": 0.7200561009817672, "grad_norm": 2.692423192365857, "learning_rate": 1.9181938452180654e-06, "loss": 0.3103, "step": 5134 }, { "epoch": 0.7201963534361852, "grad_norm": 2.707137641082121, "learning_rate": 1.9164056368572847e-06, "loss": 0.3334, "step": 5135 }, { "epoch": 0.7203366058906031, "grad_norm": 1.8280596599150534, "learning_rate": 1.9146180647746575e-06, "loss": 0.3524, "step": 5136 }, { "epoch": 0.7204768583450211, "grad_norm": 2.908479389214046, "learning_rate": 1.9128311293390362e-06, "loss": 0.3193, "step": 5137 }, { "epoch": 0.720617110799439, "grad_norm": 2.7801014103677426, "learning_rate": 1.9110448309191428e-06, "loss": 0.3171, "step": 5138 }, { "epoch": 0.720757363253857, "grad_norm": 2.1351596088292277, "learning_rate": 1.9092591698835673e-06, "loss": 0.3616, "step": 5139 }, { "epoch": 0.7208976157082749, "grad_norm": 2.304026880789615, "learning_rate": 1.90747414660077e-06, "loss": 0.3552, "step": 5140 }, { "epoch": 0.7210378681626929, "grad_norm": 2.2322983060931785, "learning_rate": 1.905689761439075e-06, "loss": 0.2825, "step": 5141 }, { "epoch": 0.7211781206171108, "grad_norm": 1.7348705649122362, "learning_rate": 1.903906014766681e-06, "loss": 0.3432, "step": 5142 }, { "epoch": 0.7213183730715288, "grad_norm": 2.171638750358942, "learning_rate": 1.9021229069516477e-06, "loss": 0.3781, "step": 5143 }, { "epoch": 0.7214586255259468, "grad_norm": 2.0720818785981865, "learning_rate": 1.9003404383619094e-06, "loss": 0.3608, "step": 5144 }, { "epoch": 0.7215988779803647, "grad_norm": 2.9442813596206077, "learning_rate": 1.8985586093652658e-06, "loss": 0.3103, "step": 5145 }, { "epoch": 0.7217391304347827, "grad_norm": 2.0272327311847347, "learning_rate": 1.8967774203293843e-06, "loss": 0.3485, "step": 5146 }, { "epoch": 0.7218793828892006, "grad_norm": 1.7557215482700594, "learning_rate": 1.894996871621802e-06, "loss": 0.3473, "step": 5147 }, { "epoch": 0.7220196353436185, "grad_norm": 1.9601626746147267, "learning_rate": 1.8932169636099213e-06, "loss": 0.3323, "step": 5148 }, { "epoch": 0.7221598877980364, "grad_norm": 1.6426349498522037, "learning_rate": 1.891437696661015e-06, "loss": 0.3117, "step": 5149 }, { "epoch": 0.7223001402524544, "grad_norm": 1.9908866075637346, "learning_rate": 1.8896590711422215e-06, "loss": 0.3601, "step": 5150 }, { "epoch": 0.7224403927068723, "grad_norm": 2.065296819935879, "learning_rate": 1.8878810874205494e-06, "loss": 0.2681, "step": 5151 }, { "epoch": 0.7225806451612903, "grad_norm": 1.5103534378322283, "learning_rate": 1.8861037458628712e-06, "loss": 0.3068, "step": 5152 }, { "epoch": 0.7227208976157082, "grad_norm": 2.3221881105978897, "learning_rate": 1.8843270468359287e-06, "loss": 0.3574, "step": 5153 }, { "epoch": 0.7228611500701262, "grad_norm": 2.4639939477028765, "learning_rate": 1.8825509907063328e-06, "loss": 0.34, "step": 5154 }, { "epoch": 0.7230014025245441, "grad_norm": 1.808272924454319, "learning_rate": 1.8807755778405596e-06, "loss": 0.3777, "step": 5155 }, { "epoch": 0.7231416549789621, "grad_norm": 1.9720768748111204, "learning_rate": 1.8790008086049534e-06, "loss": 0.3331, "step": 5156 }, { "epoch": 0.7232819074333801, "grad_norm": 3.604243342001116, "learning_rate": 1.8772266833657254e-06, "loss": 0.3318, "step": 5157 }, { "epoch": 0.723422159887798, "grad_norm": 1.601873302540678, "learning_rate": 1.8754532024889537e-06, "loss": 0.3413, "step": 5158 }, { "epoch": 0.723562412342216, "grad_norm": 3.661717947831014, "learning_rate": 1.873680366340584e-06, "loss": 0.3657, "step": 5159 }, { "epoch": 0.7237026647966339, "grad_norm": 1.4857368245484992, "learning_rate": 1.8719081752864298e-06, "loss": 0.361, "step": 5160 }, { "epoch": 0.7238429172510519, "grad_norm": 3.4033823803970757, "learning_rate": 1.8701366296921675e-06, "loss": 0.3264, "step": 5161 }, { "epoch": 0.7239831697054698, "grad_norm": 2.1102636655254976, "learning_rate": 1.8683657299233464e-06, "loss": 0.3389, "step": 5162 }, { "epoch": 0.7241234221598878, "grad_norm": 2.1942275003107854, "learning_rate": 1.8665954763453764e-06, "loss": 0.3747, "step": 5163 }, { "epoch": 0.7242636746143057, "grad_norm": 1.6916834932188847, "learning_rate": 1.8648258693235376e-06, "loss": 0.3201, "step": 5164 }, { "epoch": 0.7244039270687237, "grad_norm": 1.9092961970923215, "learning_rate": 1.8630569092229766e-06, "loss": 0.3531, "step": 5165 }, { "epoch": 0.7245441795231417, "grad_norm": 2.9038233316450657, "learning_rate": 1.8612885964087063e-06, "loss": 0.3551, "step": 5166 }, { "epoch": 0.7246844319775596, "grad_norm": 1.865516329720176, "learning_rate": 1.8595209312456052e-06, "loss": 0.3326, "step": 5167 }, { "epoch": 0.7248246844319776, "grad_norm": 1.5641212444725243, "learning_rate": 1.857753914098419e-06, "loss": 0.3483, "step": 5168 }, { "epoch": 0.7249649368863955, "grad_norm": 1.927130607506261, "learning_rate": 1.8559875453317588e-06, "loss": 0.3485, "step": 5169 }, { "epoch": 0.7251051893408135, "grad_norm": 1.6542935602184174, "learning_rate": 1.854221825310103e-06, "loss": 0.359, "step": 5170 }, { "epoch": 0.7252454417952314, "grad_norm": 2.1849017379978006, "learning_rate": 1.8524567543977973e-06, "loss": 0.3351, "step": 5171 }, { "epoch": 0.7253856942496494, "grad_norm": 1.878083618012832, "learning_rate": 1.8506923329590482e-06, "loss": 0.3103, "step": 5172 }, { "epoch": 0.7255259467040673, "grad_norm": 2.2461141303423204, "learning_rate": 1.8489285613579328e-06, "loss": 0.3305, "step": 5173 }, { "epoch": 0.7256661991584853, "grad_norm": 2.0265537847068282, "learning_rate": 1.8471654399583938e-06, "loss": 0.3354, "step": 5174 }, { "epoch": 0.7258064516129032, "grad_norm": 2.1371733968073907, "learning_rate": 1.8454029691242392e-06, "loss": 0.3487, "step": 5175 }, { "epoch": 0.7259467040673212, "grad_norm": 2.0998881310230026, "learning_rate": 1.843641149219142e-06, "loss": 0.3026, "step": 5176 }, { "epoch": 0.7260869565217392, "grad_norm": 1.7197930378930555, "learning_rate": 1.8418799806066413e-06, "loss": 0.3323, "step": 5177 }, { "epoch": 0.7262272089761571, "grad_norm": 1.6163411897570767, "learning_rate": 1.8401194636501424e-06, "loss": 0.2911, "step": 5178 }, { "epoch": 0.7263674614305751, "grad_norm": 1.9477384217116067, "learning_rate": 1.8383595987129155e-06, "loss": 0.3676, "step": 5179 }, { "epoch": 0.726507713884993, "grad_norm": 1.6835785265453334, "learning_rate": 1.8366003861580966e-06, "loss": 0.2978, "step": 5180 }, { "epoch": 0.726647966339411, "grad_norm": 2.1312936817514228, "learning_rate": 1.8348418263486884e-06, "loss": 0.3647, "step": 5181 }, { "epoch": 0.7267882187938289, "grad_norm": 2.2603974114460756, "learning_rate": 1.8330839196475542e-06, "loss": 0.3258, "step": 5182 }, { "epoch": 0.7269284712482469, "grad_norm": 1.688695131168653, "learning_rate": 1.831326666417429e-06, "loss": 0.3592, "step": 5183 }, { "epoch": 0.7270687237026648, "grad_norm": 2.3815822739950887, "learning_rate": 1.829570067020906e-06, "loss": 0.3434, "step": 5184 }, { "epoch": 0.7272089761570828, "grad_norm": 2.0811552742144506, "learning_rate": 1.8278141218204499e-06, "loss": 0.3305, "step": 5185 }, { "epoch": 0.7273492286115008, "grad_norm": 2.1815690427561, "learning_rate": 1.8260588311783866e-06, "loss": 0.3611, "step": 5186 }, { "epoch": 0.7274894810659187, "grad_norm": 2.0354904515439065, "learning_rate": 1.8243041954569085e-06, "loss": 0.3577, "step": 5187 }, { "epoch": 0.7276297335203366, "grad_norm": 1.9442601555579526, "learning_rate": 1.822550215018073e-06, "loss": 0.3623, "step": 5188 }, { "epoch": 0.7277699859747545, "grad_norm": 2.144165230177413, "learning_rate": 1.820796890223801e-06, "loss": 0.3534, "step": 5189 }, { "epoch": 0.7279102384291725, "grad_norm": 2.1672147522239413, "learning_rate": 1.8190442214358788e-06, "loss": 0.3477, "step": 5190 }, { "epoch": 0.7280504908835904, "grad_norm": 2.4162481098986643, "learning_rate": 1.8172922090159578e-06, "loss": 0.3275, "step": 5191 }, { "epoch": 0.7281907433380084, "grad_norm": 2.2502103283380053, "learning_rate": 1.8155408533255553e-06, "loss": 0.3081, "step": 5192 }, { "epoch": 0.7283309957924263, "grad_norm": 1.9128511293496415, "learning_rate": 1.8137901547260472e-06, "loss": 0.3869, "step": 5193 }, { "epoch": 0.7284712482468443, "grad_norm": 2.136809017638813, "learning_rate": 1.8120401135786803e-06, "loss": 0.3349, "step": 5194 }, { "epoch": 0.7286115007012622, "grad_norm": 2.0280648714859497, "learning_rate": 1.8102907302445627e-06, "loss": 0.3153, "step": 5195 }, { "epoch": 0.7287517531556802, "grad_norm": 2.16892233831681, "learning_rate": 1.808542005084668e-06, "loss": 0.3486, "step": 5196 }, { "epoch": 0.7288920056100981, "grad_norm": 1.7990733092438704, "learning_rate": 1.8067939384598337e-06, "loss": 0.3267, "step": 5197 }, { "epoch": 0.7290322580645161, "grad_norm": 1.7380560225400656, "learning_rate": 1.8050465307307602e-06, "loss": 0.3708, "step": 5198 }, { "epoch": 0.7291725105189341, "grad_norm": 1.5623925744321192, "learning_rate": 1.8032997822580139e-06, "loss": 0.3478, "step": 5199 }, { "epoch": 0.729312762973352, "grad_norm": 2.056504407177262, "learning_rate": 1.8015536934020229e-06, "loss": 0.3331, "step": 5200 }, { "epoch": 0.72945301542777, "grad_norm": 1.7019922531704705, "learning_rate": 1.7998082645230835e-06, "loss": 0.3402, "step": 5201 }, { "epoch": 0.7295932678821879, "grad_norm": 1.878817613695927, "learning_rate": 1.798063495981348e-06, "loss": 0.3377, "step": 5202 }, { "epoch": 0.7297335203366059, "grad_norm": 1.989660058531053, "learning_rate": 1.7963193881368402e-06, "loss": 0.3692, "step": 5203 }, { "epoch": 0.7298737727910238, "grad_norm": 1.8423269566885208, "learning_rate": 1.7945759413494458e-06, "loss": 0.2823, "step": 5204 }, { "epoch": 0.7300140252454418, "grad_norm": 1.863589087434298, "learning_rate": 1.7928331559789087e-06, "loss": 0.3426, "step": 5205 }, { "epoch": 0.7301542776998597, "grad_norm": 2.458546472997347, "learning_rate": 1.7910910323848435e-06, "loss": 0.3589, "step": 5206 }, { "epoch": 0.7302945301542777, "grad_norm": 2.2911234356293506, "learning_rate": 1.789349570926724e-06, "loss": 0.3563, "step": 5207 }, { "epoch": 0.7304347826086957, "grad_norm": 1.830823553120265, "learning_rate": 1.7876087719638896e-06, "loss": 0.3302, "step": 5208 }, { "epoch": 0.7305750350631136, "grad_norm": 1.709315676592499, "learning_rate": 1.7858686358555411e-06, "loss": 0.3029, "step": 5209 }, { "epoch": 0.7307152875175316, "grad_norm": 2.3108799388801975, "learning_rate": 1.7841291629607443e-06, "loss": 0.3096, "step": 5210 }, { "epoch": 0.7308555399719495, "grad_norm": 1.8611165073500004, "learning_rate": 1.7823903536384262e-06, "loss": 0.3729, "step": 5211 }, { "epoch": 0.7309957924263675, "grad_norm": 2.558811044565814, "learning_rate": 1.7806522082473809e-06, "loss": 0.3111, "step": 5212 }, { "epoch": 0.7311360448807854, "grad_norm": 3.380450756782409, "learning_rate": 1.7789147271462586e-06, "loss": 0.3243, "step": 5213 }, { "epoch": 0.7312762973352034, "grad_norm": 1.8097226140036573, "learning_rate": 1.7771779106935783e-06, "loss": 0.297, "step": 5214 }, { "epoch": 0.7314165497896213, "grad_norm": 1.9066193019657003, "learning_rate": 1.7754417592477192e-06, "loss": 0.3755, "step": 5215 }, { "epoch": 0.7315568022440393, "grad_norm": 1.7265130635774562, "learning_rate": 1.7737062731669246e-06, "loss": 0.3586, "step": 5216 }, { "epoch": 0.7316970546984572, "grad_norm": 4.992608137721776, "learning_rate": 1.7719714528093e-06, "loss": 0.3424, "step": 5217 }, { "epoch": 0.7318373071528752, "grad_norm": 1.901539469997147, "learning_rate": 1.7702372985328132e-06, "loss": 0.3304, "step": 5218 }, { "epoch": 0.7319775596072932, "grad_norm": 2.302248113151951, "learning_rate": 1.7685038106952952e-06, "loss": 0.4008, "step": 5219 }, { "epoch": 0.7321178120617111, "grad_norm": 2.8534623179590453, "learning_rate": 1.766770989654439e-06, "loss": 0.3638, "step": 5220 }, { "epoch": 0.7322580645161291, "grad_norm": 1.9951256720558088, "learning_rate": 1.7650388357677994e-06, "loss": 0.346, "step": 5221 }, { "epoch": 0.732398316970547, "grad_norm": 2.360981577392984, "learning_rate": 1.7633073493927965e-06, "loss": 0.3475, "step": 5222 }, { "epoch": 0.732538569424965, "grad_norm": 1.7721064609812565, "learning_rate": 1.7615765308867071e-06, "loss": 0.3171, "step": 5223 }, { "epoch": 0.7326788218793829, "grad_norm": 2.448313958143518, "learning_rate": 1.7598463806066774e-06, "loss": 0.3211, "step": 5224 }, { "epoch": 0.7328190743338009, "grad_norm": 2.8386204326562154, "learning_rate": 1.7581168989097075e-06, "loss": 0.3333, "step": 5225 }, { "epoch": 0.7329593267882188, "grad_norm": 2.2743286274005827, "learning_rate": 1.7563880861526656e-06, "loss": 0.3332, "step": 5226 }, { "epoch": 0.7330995792426368, "grad_norm": 3.3518747748880418, "learning_rate": 1.7546599426922812e-06, "loss": 0.3283, "step": 5227 }, { "epoch": 0.7332398316970546, "grad_norm": 2.431451776364008, "learning_rate": 1.7529324688851429e-06, "loss": 0.3652, "step": 5228 }, { "epoch": 0.7333800841514726, "grad_norm": 2.079936220585059, "learning_rate": 1.7512056650877047e-06, "loss": 0.3381, "step": 5229 }, { "epoch": 0.7335203366058906, "grad_norm": 1.5573857341751294, "learning_rate": 1.7494795316562791e-06, "loss": 0.2989, "step": 5230 }, { "epoch": 0.7336605890603085, "grad_norm": 2.1574642798162063, "learning_rate": 1.7477540689470424e-06, "loss": 0.3684, "step": 5231 }, { "epoch": 0.7338008415147265, "grad_norm": 2.788714218833427, "learning_rate": 1.7460292773160315e-06, "loss": 0.3587, "step": 5232 }, { "epoch": 0.7339410939691444, "grad_norm": 2.117539042323461, "learning_rate": 1.7443051571191472e-06, "loss": 0.3351, "step": 5233 }, { "epoch": 0.7340813464235624, "grad_norm": 2.084264305641265, "learning_rate": 1.7425817087121455e-06, "loss": 0.3457, "step": 5234 }, { "epoch": 0.7342215988779803, "grad_norm": 2.4439564805471874, "learning_rate": 1.7408589324506504e-06, "loss": 0.3697, "step": 5235 }, { "epoch": 0.7343618513323983, "grad_norm": 3.301799549379433, "learning_rate": 1.7391368286901444e-06, "loss": 0.3731, "step": 5236 }, { "epoch": 0.7345021037868162, "grad_norm": 5.560056170996278, "learning_rate": 1.7374153977859715e-06, "loss": 0.3013, "step": 5237 }, { "epoch": 0.7346423562412342, "grad_norm": 2.2043678619727785, "learning_rate": 1.7356946400933373e-06, "loss": 0.3138, "step": 5238 }, { "epoch": 0.7347826086956522, "grad_norm": 2.0351061091771974, "learning_rate": 1.7339745559673071e-06, "loss": 0.3386, "step": 5239 }, { "epoch": 0.7349228611500701, "grad_norm": 1.7182250868590931, "learning_rate": 1.73225514576281e-06, "loss": 0.332, "step": 5240 }, { "epoch": 0.7350631136044881, "grad_norm": 2.1520080858525388, "learning_rate": 1.7305364098346328e-06, "loss": 0.3192, "step": 5241 }, { "epoch": 0.735203366058906, "grad_norm": 1.7805447437092352, "learning_rate": 1.7288183485374267e-06, "loss": 0.3869, "step": 5242 }, { "epoch": 0.735343618513324, "grad_norm": 1.7518191419250488, "learning_rate": 1.7271009622256985e-06, "loss": 0.3287, "step": 5243 }, { "epoch": 0.7354838709677419, "grad_norm": 2.413231864169483, "learning_rate": 1.7253842512538204e-06, "loss": 0.3524, "step": 5244 }, { "epoch": 0.7356241234221599, "grad_norm": 3.6427293188712175, "learning_rate": 1.723668215976026e-06, "loss": 0.3474, "step": 5245 }, { "epoch": 0.7357643758765778, "grad_norm": 1.7375320019965204, "learning_rate": 1.7219528567464028e-06, "loss": 0.3046, "step": 5246 }, { "epoch": 0.7359046283309958, "grad_norm": 2.106907284109343, "learning_rate": 1.7202381739189055e-06, "loss": 0.308, "step": 5247 }, { "epoch": 0.7360448807854137, "grad_norm": 3.010538664109083, "learning_rate": 1.7185241678473468e-06, "loss": 0.3215, "step": 5248 }, { "epoch": 0.7361851332398317, "grad_norm": 1.690245055765956, "learning_rate": 1.7168108388853999e-06, "loss": 0.2991, "step": 5249 }, { "epoch": 0.7363253856942497, "grad_norm": 2.196055765003282, "learning_rate": 1.7150981873865979e-06, "loss": 0.404, "step": 5250 }, { "epoch": 0.7364656381486676, "grad_norm": 2.374275900230434, "learning_rate": 1.713386213704335e-06, "loss": 0.3297, "step": 5251 }, { "epoch": 0.7366058906030856, "grad_norm": 1.8325085374079029, "learning_rate": 1.7116749181918652e-06, "loss": 0.339, "step": 5252 }, { "epoch": 0.7367461430575035, "grad_norm": 1.901003888595272, "learning_rate": 1.7099643012023032e-06, "loss": 0.348, "step": 5253 }, { "epoch": 0.7368863955119215, "grad_norm": 1.8538604512045354, "learning_rate": 1.70825436308862e-06, "loss": 0.3439, "step": 5254 }, { "epoch": 0.7370266479663394, "grad_norm": 1.7297925467498292, "learning_rate": 1.7065451042036507e-06, "loss": 0.2956, "step": 5255 }, { "epoch": 0.7371669004207574, "grad_norm": 2.131573774964797, "learning_rate": 1.7048365249000897e-06, "loss": 0.3678, "step": 5256 }, { "epoch": 0.7373071528751753, "grad_norm": 1.8323315860230363, "learning_rate": 1.7031286255304896e-06, "loss": 0.3485, "step": 5257 }, { "epoch": 0.7374474053295933, "grad_norm": 1.7956019660383182, "learning_rate": 1.7014214064472646e-06, "loss": 0.3945, "step": 5258 }, { "epoch": 0.7375876577840113, "grad_norm": 2.5445091633106043, "learning_rate": 1.6997148680026859e-06, "loss": 0.3765, "step": 5259 }, { "epoch": 0.7377279102384292, "grad_norm": 1.5487217523863663, "learning_rate": 1.6980090105488866e-06, "loss": 0.3528, "step": 5260 }, { "epoch": 0.7378681626928472, "grad_norm": 1.7530471974888568, "learning_rate": 1.696303834437859e-06, "loss": 0.3232, "step": 5261 }, { "epoch": 0.7380084151472651, "grad_norm": 2.042309277149941, "learning_rate": 1.6945993400214534e-06, "loss": 0.3309, "step": 5262 }, { "epoch": 0.7381486676016831, "grad_norm": 1.724968290728605, "learning_rate": 1.6928955276513826e-06, "loss": 0.3227, "step": 5263 }, { "epoch": 0.738288920056101, "grad_norm": 1.9871208237081424, "learning_rate": 1.6911923976792123e-06, "loss": 0.3791, "step": 5264 }, { "epoch": 0.738429172510519, "grad_norm": 1.9357374849975357, "learning_rate": 1.6894899504563738e-06, "loss": 0.3228, "step": 5265 }, { "epoch": 0.7385694249649369, "grad_norm": 3.682063864100731, "learning_rate": 1.6877881863341567e-06, "loss": 0.3669, "step": 5266 }, { "epoch": 0.7387096774193549, "grad_norm": 1.8207032926142381, "learning_rate": 1.686087105663704e-06, "loss": 0.3111, "step": 5267 }, { "epoch": 0.7388499298737727, "grad_norm": 1.4221648874415136, "learning_rate": 1.6843867087960252e-06, "loss": 0.3187, "step": 5268 }, { "epoch": 0.7389901823281907, "grad_norm": 2.3056584785907606, "learning_rate": 1.6826869960819835e-06, "loss": 0.3723, "step": 5269 }, { "epoch": 0.7391304347826086, "grad_norm": 2.465266123521746, "learning_rate": 1.6809879678723045e-06, "loss": 0.3204, "step": 5270 }, { "epoch": 0.7392706872370266, "grad_norm": 1.7927755280808841, "learning_rate": 1.6792896245175693e-06, "loss": 0.3757, "step": 5271 }, { "epoch": 0.7394109396914446, "grad_norm": 1.7148334615828895, "learning_rate": 1.67759196636822e-06, "loss": 0.3619, "step": 5272 }, { "epoch": 0.7395511921458625, "grad_norm": 2.2879921555684737, "learning_rate": 1.6758949937745562e-06, "loss": 0.3612, "step": 5273 }, { "epoch": 0.7396914446002805, "grad_norm": 1.735688545511626, "learning_rate": 1.6741987070867377e-06, "loss": 0.3205, "step": 5274 }, { "epoch": 0.7398316970546984, "grad_norm": 1.7605154322018521, "learning_rate": 1.6725031066547786e-06, "loss": 0.34, "step": 5275 }, { "epoch": 0.7399719495091164, "grad_norm": 2.233906410005426, "learning_rate": 1.6708081928285558e-06, "loss": 0.3648, "step": 5276 }, { "epoch": 0.7401122019635343, "grad_norm": 1.636982607456062, "learning_rate": 1.6691139659578032e-06, "loss": 0.3373, "step": 5277 }, { "epoch": 0.7402524544179523, "grad_norm": 1.933009327123472, "learning_rate": 1.6674204263921118e-06, "loss": 0.3531, "step": 5278 }, { "epoch": 0.7403927068723702, "grad_norm": 2.1217639068151355, "learning_rate": 1.6657275744809327e-06, "loss": 0.3424, "step": 5279 }, { "epoch": 0.7405329593267882, "grad_norm": 1.6153657391601572, "learning_rate": 1.6640354105735728e-06, "loss": 0.3141, "step": 5280 }, { "epoch": 0.7406732117812062, "grad_norm": 2.1935849162965315, "learning_rate": 1.6623439350191995e-06, "loss": 0.373, "step": 5281 }, { "epoch": 0.7408134642356241, "grad_norm": 2.073334068817466, "learning_rate": 1.6606531481668364e-06, "loss": 0.3, "step": 5282 }, { "epoch": 0.7409537166900421, "grad_norm": 1.597045781806837, "learning_rate": 1.658963050365367e-06, "loss": 0.282, "step": 5283 }, { "epoch": 0.74109396914446, "grad_norm": 3.201213880615952, "learning_rate": 1.6572736419635288e-06, "loss": 0.3288, "step": 5284 }, { "epoch": 0.741234221598878, "grad_norm": 1.5715544993860078, "learning_rate": 1.6555849233099202e-06, "loss": 0.3531, "step": 5285 }, { "epoch": 0.7413744740532959, "grad_norm": 1.4536333733023163, "learning_rate": 1.6538968947529965e-06, "loss": 0.3034, "step": 5286 }, { "epoch": 0.7415147265077139, "grad_norm": 2.576253408164149, "learning_rate": 1.6522095566410728e-06, "loss": 0.3941, "step": 5287 }, { "epoch": 0.7416549789621318, "grad_norm": 1.8442899915390842, "learning_rate": 1.6505229093223158e-06, "loss": 0.3493, "step": 5288 }, { "epoch": 0.7417952314165498, "grad_norm": 2.3369810905360775, "learning_rate": 1.648836953144755e-06, "loss": 0.3351, "step": 5289 }, { "epoch": 0.7419354838709677, "grad_norm": 2.0408650465910814, "learning_rate": 1.647151688456276e-06, "loss": 0.3637, "step": 5290 }, { "epoch": 0.7420757363253857, "grad_norm": 1.755958902256073, "learning_rate": 1.6454671156046214e-06, "loss": 0.342, "step": 5291 }, { "epoch": 0.7422159887798037, "grad_norm": 2.0912285040590293, "learning_rate": 1.6437832349373906e-06, "loss": 0.352, "step": 5292 }, { "epoch": 0.7423562412342216, "grad_norm": 1.6744456982469045, "learning_rate": 1.642100046802041e-06, "loss": 0.2912, "step": 5293 }, { "epoch": 0.7424964936886396, "grad_norm": 1.8811889232973702, "learning_rate": 1.6404175515458882e-06, "loss": 0.3338, "step": 5294 }, { "epoch": 0.7426367461430575, "grad_norm": 2.6832431525325453, "learning_rate": 1.6387357495161e-06, "loss": 0.3602, "step": 5295 }, { "epoch": 0.7427769985974755, "grad_norm": 1.6231402245219924, "learning_rate": 1.6370546410597066e-06, "loss": 0.3741, "step": 5296 }, { "epoch": 0.7429172510518934, "grad_norm": 2.4813120452272415, "learning_rate": 1.6353742265235923e-06, "loss": 0.3498, "step": 5297 }, { "epoch": 0.7430575035063114, "grad_norm": 1.7329304495897215, "learning_rate": 1.633694506254499e-06, "loss": 0.3174, "step": 5298 }, { "epoch": 0.7431977559607293, "grad_norm": 2.5076551536563843, "learning_rate": 1.6320154805990258e-06, "loss": 0.3471, "step": 5299 }, { "epoch": 0.7433380084151473, "grad_norm": 1.6935705561439403, "learning_rate": 1.6303371499036275e-06, "loss": 0.3315, "step": 5300 }, { "epoch": 0.7434782608695653, "grad_norm": 1.733728305430638, "learning_rate": 1.6286595145146162e-06, "loss": 0.3413, "step": 5301 }, { "epoch": 0.7436185133239832, "grad_norm": 3.5352889736825484, "learning_rate": 1.6269825747781598e-06, "loss": 0.3608, "step": 5302 }, { "epoch": 0.7437587657784012, "grad_norm": 2.209918538831851, "learning_rate": 1.6253063310402833e-06, "loss": 0.3355, "step": 5303 }, { "epoch": 0.7438990182328191, "grad_norm": 1.9843469452564386, "learning_rate": 1.6236307836468695e-06, "loss": 0.3554, "step": 5304 }, { "epoch": 0.7440392706872371, "grad_norm": 2.044412310780819, "learning_rate": 1.6219559329436528e-06, "loss": 0.3666, "step": 5305 }, { "epoch": 0.744179523141655, "grad_norm": 2.2105173311552866, "learning_rate": 1.6202817792762283e-06, "loss": 0.3257, "step": 5306 }, { "epoch": 0.744319775596073, "grad_norm": 1.567990080227906, "learning_rate": 1.6186083229900462e-06, "loss": 0.3351, "step": 5307 }, { "epoch": 0.7444600280504908, "grad_norm": 2.146632875784692, "learning_rate": 1.616935564430414e-06, "loss": 0.3148, "step": 5308 }, { "epoch": 0.7446002805049088, "grad_norm": 1.6912904483895126, "learning_rate": 1.6152635039424907e-06, "loss": 0.3049, "step": 5309 }, { "epoch": 0.7447405329593267, "grad_norm": 2.4312690910629287, "learning_rate": 1.6135921418712959e-06, "loss": 0.3218, "step": 5310 }, { "epoch": 0.7448807854137447, "grad_norm": 1.957398340208964, "learning_rate": 1.6119214785617027e-06, "loss": 0.3293, "step": 5311 }, { "epoch": 0.7450210378681626, "grad_norm": 1.6518775516739996, "learning_rate": 1.6102515143584412e-06, "loss": 0.2943, "step": 5312 }, { "epoch": 0.7451612903225806, "grad_norm": 1.9106666803477559, "learning_rate": 1.6085822496060976e-06, "loss": 0.3489, "step": 5313 }, { "epoch": 0.7453015427769986, "grad_norm": 1.9158735776736553, "learning_rate": 1.6069136846491124e-06, "loss": 0.3317, "step": 5314 }, { "epoch": 0.7454417952314165, "grad_norm": 1.6038742129121992, "learning_rate": 1.6052458198317844e-06, "loss": 0.3135, "step": 5315 }, { "epoch": 0.7455820476858345, "grad_norm": 1.9133885033835578, "learning_rate": 1.6035786554982614e-06, "loss": 0.3391, "step": 5316 }, { "epoch": 0.7457223001402524, "grad_norm": 1.732591347646306, "learning_rate": 1.601912191992554e-06, "loss": 0.3646, "step": 5317 }, { "epoch": 0.7458625525946704, "grad_norm": 1.6952019580716342, "learning_rate": 1.6002464296585253e-06, "loss": 0.3394, "step": 5318 }, { "epoch": 0.7460028050490883, "grad_norm": 1.8672598980390895, "learning_rate": 1.5985813688398927e-06, "loss": 0.345, "step": 5319 }, { "epoch": 0.7461430575035063, "grad_norm": 2.4239522577878194, "learning_rate": 1.59691700988023e-06, "loss": 0.3321, "step": 5320 }, { "epoch": 0.7462833099579242, "grad_norm": 1.9655834951703797, "learning_rate": 1.5952533531229675e-06, "loss": 0.2737, "step": 5321 }, { "epoch": 0.7464235624123422, "grad_norm": 1.937611730491857, "learning_rate": 1.5935903989113877e-06, "loss": 0.2932, "step": 5322 }, { "epoch": 0.7465638148667602, "grad_norm": 1.5592507719961863, "learning_rate": 1.59192814758863e-06, "loss": 0.3791, "step": 5323 }, { "epoch": 0.7467040673211781, "grad_norm": 2.201839207085718, "learning_rate": 1.5902665994976896e-06, "loss": 0.3694, "step": 5324 }, { "epoch": 0.7468443197755961, "grad_norm": 1.7937604365560251, "learning_rate": 1.5886057549814133e-06, "loss": 0.3236, "step": 5325 }, { "epoch": 0.746984572230014, "grad_norm": 2.326017308965358, "learning_rate": 1.5869456143825051e-06, "loss": 0.3245, "step": 5326 }, { "epoch": 0.747124824684432, "grad_norm": 1.9614850985631718, "learning_rate": 1.5852861780435237e-06, "loss": 0.3429, "step": 5327 }, { "epoch": 0.7472650771388499, "grad_norm": 2.180545323611778, "learning_rate": 1.583627446306883e-06, "loss": 0.3167, "step": 5328 }, { "epoch": 0.7474053295932679, "grad_norm": 1.908928224827889, "learning_rate": 1.581969419514851e-06, "loss": 0.3454, "step": 5329 }, { "epoch": 0.7475455820476858, "grad_norm": 2.8077578938926613, "learning_rate": 1.5803120980095477e-06, "loss": 0.3429, "step": 5330 }, { "epoch": 0.7476858345021038, "grad_norm": 2.0328350045368087, "learning_rate": 1.5786554821329515e-06, "loss": 0.374, "step": 5331 }, { "epoch": 0.7478260869565218, "grad_norm": 1.7823989910167393, "learning_rate": 1.5769995722268926e-06, "loss": 0.3371, "step": 5332 }, { "epoch": 0.7479663394109397, "grad_norm": 2.0925052415788965, "learning_rate": 1.5753443686330572e-06, "loss": 0.3539, "step": 5333 }, { "epoch": 0.7481065918653577, "grad_norm": 2.104408350998319, "learning_rate": 1.5736898716929848e-06, "loss": 0.3309, "step": 5334 }, { "epoch": 0.7482468443197756, "grad_norm": 2.086807926604718, "learning_rate": 1.5720360817480712e-06, "loss": 0.3923, "step": 5335 }, { "epoch": 0.7483870967741936, "grad_norm": 1.6954502454076055, "learning_rate": 1.5703829991395602e-06, "loss": 0.3676, "step": 5336 }, { "epoch": 0.7485273492286115, "grad_norm": 2.176607642763364, "learning_rate": 1.5687306242085565e-06, "loss": 0.3342, "step": 5337 }, { "epoch": 0.7486676016830295, "grad_norm": 2.6320449208249617, "learning_rate": 1.567078957296016e-06, "loss": 0.3215, "step": 5338 }, { "epoch": 0.7488078541374474, "grad_norm": 2.035737582482139, "learning_rate": 1.565427998742748e-06, "loss": 0.3255, "step": 5339 }, { "epoch": 0.7489481065918654, "grad_norm": 2.014725121995481, "learning_rate": 1.5637777488894167e-06, "loss": 0.3662, "step": 5340 }, { "epoch": 0.7490883590462833, "grad_norm": 7.301237121685618, "learning_rate": 1.5621282080765399e-06, "loss": 0.2935, "step": 5341 }, { "epoch": 0.7492286115007013, "grad_norm": 2.776033717580058, "learning_rate": 1.5604793766444882e-06, "loss": 0.3036, "step": 5342 }, { "epoch": 0.7493688639551193, "grad_norm": 1.9263406989388097, "learning_rate": 1.5588312549334867e-06, "loss": 0.3158, "step": 5343 }, { "epoch": 0.7495091164095372, "grad_norm": 2.193111415852494, "learning_rate": 1.557183843283614e-06, "loss": 0.3103, "step": 5344 }, { "epoch": 0.7496493688639552, "grad_norm": 2.1123410317463924, "learning_rate": 1.5555371420348031e-06, "loss": 0.3598, "step": 5345 }, { "epoch": 0.7497896213183731, "grad_norm": 1.7752991706281942, "learning_rate": 1.5538911515268368e-06, "loss": 0.336, "step": 5346 }, { "epoch": 0.7499298737727911, "grad_norm": 1.6625836028682857, "learning_rate": 1.552245872099355e-06, "loss": 0.3422, "step": 5347 }, { "epoch": 0.7500701262272089, "grad_norm": 3.631068313439282, "learning_rate": 1.5506013040918494e-06, "loss": 0.3201, "step": 5348 }, { "epoch": 0.7502103786816269, "grad_norm": 2.0488197464676157, "learning_rate": 1.5489574478436664e-06, "loss": 0.3167, "step": 5349 }, { "epoch": 0.7503506311360448, "grad_norm": 1.9815610623380784, "learning_rate": 1.5473143036940026e-06, "loss": 0.3339, "step": 5350 }, { "epoch": 0.7504908835904628, "grad_norm": 2.07300102279498, "learning_rate": 1.5456718719819092e-06, "loss": 0.3842, "step": 5351 }, { "epoch": 0.7506311360448807, "grad_norm": 2.3243686146865126, "learning_rate": 1.544030153046291e-06, "loss": 0.3448, "step": 5352 }, { "epoch": 0.7507713884992987, "grad_norm": 1.9360414983542087, "learning_rate": 1.5423891472259056e-06, "loss": 0.3097, "step": 5353 }, { "epoch": 0.7509116409537167, "grad_norm": 2.1254365136090443, "learning_rate": 1.5407488548593629e-06, "loss": 0.3691, "step": 5354 }, { "epoch": 0.7510518934081346, "grad_norm": 1.819221005620609, "learning_rate": 1.5391092762851257e-06, "loss": 0.3068, "step": 5355 }, { "epoch": 0.7511921458625526, "grad_norm": 2.178673511766741, "learning_rate": 1.5374704118415112e-06, "loss": 0.3107, "step": 5356 }, { "epoch": 0.7513323983169705, "grad_norm": 2.5865565362898186, "learning_rate": 1.535832261866685e-06, "loss": 0.3368, "step": 5357 }, { "epoch": 0.7514726507713885, "grad_norm": 2.1100044724114193, "learning_rate": 1.5341948266986683e-06, "loss": 0.3822, "step": 5358 }, { "epoch": 0.7516129032258064, "grad_norm": 2.6812531371198323, "learning_rate": 1.5325581066753354e-06, "loss": 0.3218, "step": 5359 }, { "epoch": 0.7517531556802244, "grad_norm": 1.8979995285130067, "learning_rate": 1.5309221021344118e-06, "loss": 0.3602, "step": 5360 }, { "epoch": 0.7518934081346423, "grad_norm": 2.7624707526820673, "learning_rate": 1.5292868134134754e-06, "loss": 0.3446, "step": 5361 }, { "epoch": 0.7520336605890603, "grad_norm": 2.4392638060482157, "learning_rate": 1.5276522408499567e-06, "loss": 0.3438, "step": 5362 }, { "epoch": 0.7521739130434782, "grad_norm": 1.9799534354851513, "learning_rate": 1.5260183847811383e-06, "loss": 0.3452, "step": 5363 }, { "epoch": 0.7523141654978962, "grad_norm": 2.0625130624068486, "learning_rate": 1.5243852455441555e-06, "loss": 0.3323, "step": 5364 }, { "epoch": 0.7524544179523142, "grad_norm": 2.6449542249673907, "learning_rate": 1.5227528234759958e-06, "loss": 0.4118, "step": 5365 }, { "epoch": 0.7525946704067321, "grad_norm": 1.704245802231899, "learning_rate": 1.5211211189134955e-06, "loss": 0.3415, "step": 5366 }, { "epoch": 0.7527349228611501, "grad_norm": 2.211351874782759, "learning_rate": 1.519490132193347e-06, "loss": 0.3248, "step": 5367 }, { "epoch": 0.752875175315568, "grad_norm": 1.7899428954152856, "learning_rate": 1.517859863652093e-06, "loss": 0.3516, "step": 5368 }, { "epoch": 0.753015427769986, "grad_norm": 2.1069201843612975, "learning_rate": 1.516230313626128e-06, "loss": 0.3598, "step": 5369 }, { "epoch": 0.7531556802244039, "grad_norm": 1.7663997199550696, "learning_rate": 1.5146014824516997e-06, "loss": 0.3405, "step": 5370 }, { "epoch": 0.7532959326788219, "grad_norm": 1.931904372674142, "learning_rate": 1.512973370464903e-06, "loss": 0.3627, "step": 5371 }, { "epoch": 0.7534361851332398, "grad_norm": 1.5642756481680389, "learning_rate": 1.5113459780016887e-06, "loss": 0.3233, "step": 5372 }, { "epoch": 0.7535764375876578, "grad_norm": 1.614935518636846, "learning_rate": 1.5097193053978587e-06, "loss": 0.3385, "step": 5373 }, { "epoch": 0.7537166900420758, "grad_norm": 1.8659110714164076, "learning_rate": 1.5080933529890645e-06, "loss": 0.3304, "step": 5374 }, { "epoch": 0.7538569424964937, "grad_norm": 2.0682329205841015, "learning_rate": 1.5064681211108112e-06, "loss": 0.3691, "step": 5375 }, { "epoch": 0.7539971949509117, "grad_norm": 2.0436490095192195, "learning_rate": 1.5048436100984549e-06, "loss": 0.3828, "step": 5376 }, { "epoch": 0.7541374474053296, "grad_norm": 1.7451644575124283, "learning_rate": 1.5032198202871983e-06, "loss": 0.3441, "step": 5377 }, { "epoch": 0.7542776998597476, "grad_norm": 1.7371855636506286, "learning_rate": 1.5015967520121016e-06, "loss": 0.2981, "step": 5378 }, { "epoch": 0.7544179523141655, "grad_norm": 1.808180034602921, "learning_rate": 1.4999744056080734e-06, "loss": 0.3712, "step": 5379 }, { "epoch": 0.7545582047685835, "grad_norm": 3.2144279720978624, "learning_rate": 1.4983527814098736e-06, "loss": 0.3377, "step": 5380 }, { "epoch": 0.7546984572230014, "grad_norm": 1.9447988421800337, "learning_rate": 1.496731879752113e-06, "loss": 0.3222, "step": 5381 }, { "epoch": 0.7548387096774194, "grad_norm": 2.0364857027079726, "learning_rate": 1.4951117009692528e-06, "loss": 0.3621, "step": 5382 }, { "epoch": 0.7549789621318374, "grad_norm": 1.9322076935185653, "learning_rate": 1.4934922453956064e-06, "loss": 0.3724, "step": 5383 }, { "epoch": 0.7551192145862553, "grad_norm": 1.7266682035790357, "learning_rate": 1.4918735133653368e-06, "loss": 0.3484, "step": 5384 }, { "epoch": 0.7552594670406733, "grad_norm": 3.1612525801763116, "learning_rate": 1.4902555052124579e-06, "loss": 0.3484, "step": 5385 }, { "epoch": 0.7553997194950912, "grad_norm": 1.8036256118362006, "learning_rate": 1.4886382212708361e-06, "loss": 0.329, "step": 5386 }, { "epoch": 0.7555399719495092, "grad_norm": 1.5160553311439011, "learning_rate": 1.4870216618741833e-06, "loss": 0.3637, "step": 5387 }, { "epoch": 0.755680224403927, "grad_norm": 1.8147913162861125, "learning_rate": 1.4854058273560667e-06, "loss": 0.3366, "step": 5388 }, { "epoch": 0.755820476858345, "grad_norm": 3.5734487569253854, "learning_rate": 1.4837907180499035e-06, "loss": 0.3892, "step": 5389 }, { "epoch": 0.7559607293127629, "grad_norm": 3.218964060359669, "learning_rate": 1.4821763342889588e-06, "loss": 0.3049, "step": 5390 }, { "epoch": 0.7561009817671809, "grad_norm": 1.767840994999226, "learning_rate": 1.480562676406352e-06, "loss": 0.3388, "step": 5391 }, { "epoch": 0.7562412342215988, "grad_norm": 2.327752512377683, "learning_rate": 1.4789497447350465e-06, "loss": 0.3283, "step": 5392 }, { "epoch": 0.7563814866760168, "grad_norm": 2.197860596530687, "learning_rate": 1.477337539607861e-06, "loss": 0.3477, "step": 5393 }, { "epoch": 0.7565217391304347, "grad_norm": 2.1448802003995846, "learning_rate": 1.475726061357463e-06, "loss": 0.319, "step": 5394 }, { "epoch": 0.7566619915848527, "grad_norm": 2.002064592933341, "learning_rate": 1.4741153103163696e-06, "loss": 0.356, "step": 5395 }, { "epoch": 0.7568022440392707, "grad_norm": 1.5318112026145136, "learning_rate": 1.4725052868169482e-06, "loss": 0.3525, "step": 5396 }, { "epoch": 0.7569424964936886, "grad_norm": 1.7033552661586027, "learning_rate": 1.4708959911914177e-06, "loss": 0.3019, "step": 5397 }, { "epoch": 0.7570827489481066, "grad_norm": 1.7214921750730936, "learning_rate": 1.4692874237718413e-06, "loss": 0.3316, "step": 5398 }, { "epoch": 0.7572230014025245, "grad_norm": 1.8012813113771748, "learning_rate": 1.4676795848901376e-06, "loss": 0.3726, "step": 5399 }, { "epoch": 0.7573632538569425, "grad_norm": 1.689158013648949, "learning_rate": 1.466072474878073e-06, "loss": 0.3254, "step": 5400 }, { "epoch": 0.7575035063113604, "grad_norm": 1.8979264815656323, "learning_rate": 1.4644660940672628e-06, "loss": 0.3595, "step": 5401 }, { "epoch": 0.7576437587657784, "grad_norm": 1.713991200886311, "learning_rate": 1.4628604427891728e-06, "loss": 0.3221, "step": 5402 }, { "epoch": 0.7577840112201963, "grad_norm": 1.8760979934305082, "learning_rate": 1.4612555213751185e-06, "loss": 0.3552, "step": 5403 }, { "epoch": 0.7579242636746143, "grad_norm": 1.818435290398036, "learning_rate": 1.4596513301562636e-06, "loss": 0.347, "step": 5404 }, { "epoch": 0.7580645161290323, "grad_norm": 2.1214919585849428, "learning_rate": 1.458047869463622e-06, "loss": 0.3079, "step": 5405 }, { "epoch": 0.7582047685834502, "grad_norm": 3.7373087703302756, "learning_rate": 1.4564451396280577e-06, "loss": 0.3423, "step": 5406 }, { "epoch": 0.7583450210378682, "grad_norm": 1.8864824293153972, "learning_rate": 1.4548431409802804e-06, "loss": 0.3473, "step": 5407 }, { "epoch": 0.7584852734922861, "grad_norm": 3.3069646398305133, "learning_rate": 1.4532418738508525e-06, "loss": 0.3285, "step": 5408 }, { "epoch": 0.7586255259467041, "grad_norm": 1.8105686025951542, "learning_rate": 1.4516413385701845e-06, "loss": 0.3057, "step": 5409 }, { "epoch": 0.758765778401122, "grad_norm": 1.8756810540858204, "learning_rate": 1.4500415354685349e-06, "loss": 0.352, "step": 5410 }, { "epoch": 0.75890603085554, "grad_norm": 1.537404595493174, "learning_rate": 1.4484424648760125e-06, "loss": 0.2971, "step": 5411 }, { "epoch": 0.7590462833099579, "grad_norm": 1.747487239654624, "learning_rate": 1.4468441271225764e-06, "loss": 0.3498, "step": 5412 }, { "epoch": 0.7591865357643759, "grad_norm": 1.8081452244017502, "learning_rate": 1.4452465225380285e-06, "loss": 0.3418, "step": 5413 }, { "epoch": 0.7593267882187938, "grad_norm": 1.5120138821595321, "learning_rate": 1.4436496514520253e-06, "loss": 0.2918, "step": 5414 }, { "epoch": 0.7594670406732118, "grad_norm": 1.6813368395532586, "learning_rate": 1.44205351419407e-06, "loss": 0.3356, "step": 5415 }, { "epoch": 0.7596072931276298, "grad_norm": 2.6252493058754, "learning_rate": 1.440458111093514e-06, "loss": 0.3125, "step": 5416 }, { "epoch": 0.7597475455820477, "grad_norm": 1.7250163201928383, "learning_rate": 1.4388634424795594e-06, "loss": 0.3574, "step": 5417 }, { "epoch": 0.7598877980364657, "grad_norm": 2.2932404287320045, "learning_rate": 1.4372695086812522e-06, "loss": 0.3402, "step": 5418 }, { "epoch": 0.7600280504908836, "grad_norm": 1.7072148344367855, "learning_rate": 1.4356763100274901e-06, "loss": 0.3159, "step": 5419 }, { "epoch": 0.7601683029453016, "grad_norm": 2.1409783732975862, "learning_rate": 1.4340838468470198e-06, "loss": 0.3343, "step": 5420 }, { "epoch": 0.7603085553997195, "grad_norm": 2.289619143769733, "learning_rate": 1.4324921194684337e-06, "loss": 0.3197, "step": 5421 }, { "epoch": 0.7604488078541375, "grad_norm": 1.8028124692126355, "learning_rate": 1.430901128220174e-06, "loss": 0.3423, "step": 5422 }, { "epoch": 0.7605890603085554, "grad_norm": 1.7531063575229409, "learning_rate": 1.4293108734305311e-06, "loss": 0.3936, "step": 5423 }, { "epoch": 0.7607293127629734, "grad_norm": 1.5460500347632529, "learning_rate": 1.4277213554276426e-06, "loss": 0.33, "step": 5424 }, { "epoch": 0.7608695652173914, "grad_norm": 1.7624590686346935, "learning_rate": 1.426132574539495e-06, "loss": 0.3133, "step": 5425 }, { "epoch": 0.7610098176718093, "grad_norm": 2.063618790021201, "learning_rate": 1.424544531093921e-06, "loss": 0.3811, "step": 5426 }, { "epoch": 0.7611500701262273, "grad_norm": 2.428099085449811, "learning_rate": 1.4229572254186047e-06, "loss": 0.322, "step": 5427 }, { "epoch": 0.7612903225806451, "grad_norm": 1.8664109917792424, "learning_rate": 1.4213706578410718e-06, "loss": 0.3192, "step": 5428 }, { "epoch": 0.7614305750350631, "grad_norm": 2.239618085251511, "learning_rate": 1.4197848286887017e-06, "loss": 0.3379, "step": 5429 }, { "epoch": 0.761570827489481, "grad_norm": 2.008103761438966, "learning_rate": 1.4181997382887192e-06, "loss": 0.3073, "step": 5430 }, { "epoch": 0.761711079943899, "grad_norm": 1.9215041632427265, "learning_rate": 1.416615386968196e-06, "loss": 0.3484, "step": 5431 }, { "epoch": 0.7618513323983169, "grad_norm": 3.3563778304152425, "learning_rate": 1.4150317750540515e-06, "loss": 0.3217, "step": 5432 }, { "epoch": 0.7619915848527349, "grad_norm": 1.5837235806375916, "learning_rate": 1.4134489028730557e-06, "loss": 0.3321, "step": 5433 }, { "epoch": 0.7621318373071528, "grad_norm": 2.1148224743948525, "learning_rate": 1.4118667707518202e-06, "loss": 0.3421, "step": 5434 }, { "epoch": 0.7622720897615708, "grad_norm": 2.0722848386170423, "learning_rate": 1.410285379016807e-06, "loss": 0.3907, "step": 5435 }, { "epoch": 0.7624123422159887, "grad_norm": 1.8033304768443914, "learning_rate": 1.4087047279943267e-06, "loss": 0.3124, "step": 5436 }, { "epoch": 0.7625525946704067, "grad_norm": 1.7135278610399958, "learning_rate": 1.4071248180105346e-06, "loss": 0.3579, "step": 5437 }, { "epoch": 0.7626928471248247, "grad_norm": 1.6952978036832402, "learning_rate": 1.405545649391436e-06, "loss": 0.3391, "step": 5438 }, { "epoch": 0.7628330995792426, "grad_norm": 1.8522394286873611, "learning_rate": 1.4039672224628786e-06, "loss": 0.3389, "step": 5439 }, { "epoch": 0.7629733520336606, "grad_norm": 2.011625239597013, "learning_rate": 1.4023895375505608e-06, "loss": 0.3134, "step": 5440 }, { "epoch": 0.7631136044880785, "grad_norm": 2.2318195588450713, "learning_rate": 1.4008125949800272e-06, "loss": 0.3401, "step": 5441 }, { "epoch": 0.7632538569424965, "grad_norm": 1.7545048768961482, "learning_rate": 1.3992363950766686e-06, "loss": 0.3342, "step": 5442 }, { "epoch": 0.7633941093969144, "grad_norm": 1.935869823193459, "learning_rate": 1.397660938165723e-06, "loss": 0.2948, "step": 5443 }, { "epoch": 0.7635343618513324, "grad_norm": 1.7942149500086986, "learning_rate": 1.3960862245722746e-06, "loss": 0.306, "step": 5444 }, { "epoch": 0.7636746143057503, "grad_norm": 2.872970528174927, "learning_rate": 1.3945122546212552e-06, "loss": 0.3829, "step": 5445 }, { "epoch": 0.7638148667601683, "grad_norm": 2.1523152815083586, "learning_rate": 1.3929390286374416e-06, "loss": 0.3348, "step": 5446 }, { "epoch": 0.7639551192145863, "grad_norm": 1.6803437126599372, "learning_rate": 1.3913665469454606e-06, "loss": 0.3584, "step": 5447 }, { "epoch": 0.7640953716690042, "grad_norm": 1.6866220380259067, "learning_rate": 1.3897948098697789e-06, "loss": 0.2842, "step": 5448 }, { "epoch": 0.7642356241234222, "grad_norm": 1.4713265565515339, "learning_rate": 1.3882238177347157e-06, "loss": 0.3022, "step": 5449 }, { "epoch": 0.7643758765778401, "grad_norm": 1.6232155024841535, "learning_rate": 1.3866535708644335e-06, "loss": 0.306, "step": 5450 }, { "epoch": 0.7645161290322581, "grad_norm": 2.202802269206031, "learning_rate": 1.385084069582942e-06, "loss": 0.3549, "step": 5451 }, { "epoch": 0.764656381486676, "grad_norm": 2.02787794304139, "learning_rate": 1.3835153142140971e-06, "loss": 0.3316, "step": 5452 }, { "epoch": 0.764796633941094, "grad_norm": 1.6817068337267562, "learning_rate": 1.3819473050816002e-06, "loss": 0.3266, "step": 5453 }, { "epoch": 0.7649368863955119, "grad_norm": 3.269731777518092, "learning_rate": 1.380380042509001e-06, "loss": 0.3315, "step": 5454 }, { "epoch": 0.7650771388499299, "grad_norm": 1.8272502795349816, "learning_rate": 1.3788135268196894e-06, "loss": 0.3388, "step": 5455 }, { "epoch": 0.7652173913043478, "grad_norm": 4.58989133120284, "learning_rate": 1.377247758336907e-06, "loss": 0.3488, "step": 5456 }, { "epoch": 0.7653576437587658, "grad_norm": 1.67390518278175, "learning_rate": 1.3756827373837396e-06, "loss": 0.3118, "step": 5457 }, { "epoch": 0.7654978962131838, "grad_norm": 1.865032468722799, "learning_rate": 1.374118464283119e-06, "loss": 0.3658, "step": 5458 }, { "epoch": 0.7656381486676017, "grad_norm": 2.102835117444648, "learning_rate": 1.3725549393578197e-06, "loss": 0.3515, "step": 5459 }, { "epoch": 0.7657784011220197, "grad_norm": 1.8440908989709814, "learning_rate": 1.370992162930465e-06, "loss": 0.3665, "step": 5460 }, { "epoch": 0.7659186535764376, "grad_norm": 2.343159613955257, "learning_rate": 1.3694301353235235e-06, "loss": 0.3029, "step": 5461 }, { "epoch": 0.7660589060308556, "grad_norm": 1.7596793201211025, "learning_rate": 1.367868856859308e-06, "loss": 0.3538, "step": 5462 }, { "epoch": 0.7661991584852735, "grad_norm": 5.851081695972075, "learning_rate": 1.3663083278599781e-06, "loss": 0.3357, "step": 5463 }, { "epoch": 0.7663394109396915, "grad_norm": 2.2776798896638333, "learning_rate": 1.3647485486475376e-06, "loss": 0.3601, "step": 5464 }, { "epoch": 0.7664796633941094, "grad_norm": 2.001795558776411, "learning_rate": 1.3631895195438361e-06, "loss": 0.3195, "step": 5465 }, { "epoch": 0.7666199158485274, "grad_norm": 1.63141429848353, "learning_rate": 1.361631240870569e-06, "loss": 0.3089, "step": 5466 }, { "epoch": 0.7667601683029454, "grad_norm": 2.1079087424168583, "learning_rate": 1.3600737129492752e-06, "loss": 0.2987, "step": 5467 }, { "epoch": 0.7669004207573632, "grad_norm": 1.713057297847868, "learning_rate": 1.3585169361013418e-06, "loss": 0.3096, "step": 5468 }, { "epoch": 0.7670406732117812, "grad_norm": 2.298338754530892, "learning_rate": 1.3569609106479958e-06, "loss": 0.3496, "step": 5469 }, { "epoch": 0.7671809256661991, "grad_norm": 3.03476959082916, "learning_rate": 1.3554056369103136e-06, "loss": 0.3447, "step": 5470 }, { "epoch": 0.7673211781206171, "grad_norm": 1.9646292051729148, "learning_rate": 1.353851115209215e-06, "loss": 0.3268, "step": 5471 }, { "epoch": 0.767461430575035, "grad_norm": 2.2836468789924873, "learning_rate": 1.3522973458654648e-06, "loss": 0.3571, "step": 5472 }, { "epoch": 0.767601683029453, "grad_norm": 2.1972810329320884, "learning_rate": 1.3507443291996724e-06, "loss": 0.3171, "step": 5473 }, { "epoch": 0.7677419354838709, "grad_norm": 3.004734594776448, "learning_rate": 1.3491920655322931e-06, "loss": 0.3265, "step": 5474 }, { "epoch": 0.7678821879382889, "grad_norm": 2.0213523932231885, "learning_rate": 1.3476405551836235e-06, "loss": 0.32, "step": 5475 }, { "epoch": 0.7680224403927068, "grad_norm": 1.7738492378502255, "learning_rate": 1.346089798473808e-06, "loss": 0.3216, "step": 5476 }, { "epoch": 0.7681626928471248, "grad_norm": 1.584703087178074, "learning_rate": 1.344539795722834e-06, "loss": 0.3147, "step": 5477 }, { "epoch": 0.7683029453015428, "grad_norm": 2.0027310083612844, "learning_rate": 1.3429905472505344e-06, "loss": 0.3669, "step": 5478 }, { "epoch": 0.7684431977559607, "grad_norm": 2.46891339884, "learning_rate": 1.341442053376587e-06, "loss": 0.3335, "step": 5479 }, { "epoch": 0.7685834502103787, "grad_norm": 1.8013489463467418, "learning_rate": 1.3398943144205095e-06, "loss": 0.2999, "step": 5480 }, { "epoch": 0.7687237026647966, "grad_norm": 2.667523415345634, "learning_rate": 1.3383473307016687e-06, "loss": 0.3379, "step": 5481 }, { "epoch": 0.7688639551192146, "grad_norm": 3.2722052027704627, "learning_rate": 1.3368011025392735e-06, "loss": 0.3931, "step": 5482 }, { "epoch": 0.7690042075736325, "grad_norm": 1.802315676784272, "learning_rate": 1.3352556302523783e-06, "loss": 0.4007, "step": 5483 }, { "epoch": 0.7691444600280505, "grad_norm": 2.174979865320776, "learning_rate": 1.3337109141598798e-06, "loss": 0.3909, "step": 5484 }, { "epoch": 0.7692847124824684, "grad_norm": 2.122574150848066, "learning_rate": 1.3321669545805188e-06, "loss": 0.2839, "step": 5485 }, { "epoch": 0.7694249649368864, "grad_norm": 1.7859068112811267, "learning_rate": 1.3306237518328819e-06, "loss": 0.359, "step": 5486 }, { "epoch": 0.7695652173913043, "grad_norm": 2.1375147367270575, "learning_rate": 1.3290813062353969e-06, "loss": 0.3563, "step": 5487 }, { "epoch": 0.7697054698457223, "grad_norm": 1.7818421835070155, "learning_rate": 1.3275396181063394e-06, "loss": 0.3427, "step": 5488 }, { "epoch": 0.7698457223001403, "grad_norm": 2.4102401347841296, "learning_rate": 1.325998687763822e-06, "loss": 0.3353, "step": 5489 }, { "epoch": 0.7699859747545582, "grad_norm": 1.8653207808847747, "learning_rate": 1.324458515525807e-06, "loss": 0.3522, "step": 5490 }, { "epoch": 0.7701262272089762, "grad_norm": 1.6765169960523418, "learning_rate": 1.3229191017100978e-06, "loss": 0.3528, "step": 5491 }, { "epoch": 0.7702664796633941, "grad_norm": 1.4296081187405678, "learning_rate": 1.321380446634342e-06, "loss": 0.3195, "step": 5492 }, { "epoch": 0.7704067321178121, "grad_norm": 1.9597924525474038, "learning_rate": 1.3198425506160302e-06, "loss": 0.3932, "step": 5493 }, { "epoch": 0.77054698457223, "grad_norm": 1.8015749293193855, "learning_rate": 1.318305413972496e-06, "loss": 0.3099, "step": 5494 }, { "epoch": 0.770687237026648, "grad_norm": 2.2557407892938155, "learning_rate": 1.316769037020919e-06, "loss": 0.3475, "step": 5495 }, { "epoch": 0.7708274894810659, "grad_norm": 2.054797731640889, "learning_rate": 1.3152334200783167e-06, "loss": 0.3427, "step": 5496 }, { "epoch": 0.7709677419354839, "grad_norm": 1.7489541879786186, "learning_rate": 1.3136985634615546e-06, "loss": 0.3229, "step": 5497 }, { "epoch": 0.7711079943899019, "grad_norm": 2.155146662064966, "learning_rate": 1.312164467487339e-06, "loss": 0.3391, "step": 5498 }, { "epoch": 0.7712482468443198, "grad_norm": 1.6106437940762957, "learning_rate": 1.310631132472222e-06, "loss": 0.3271, "step": 5499 }, { "epoch": 0.7713884992987378, "grad_norm": 2.7825938203662828, "learning_rate": 1.3090985587325932e-06, "loss": 0.3381, "step": 5500 }, { "epoch": 0.7715287517531557, "grad_norm": 1.8502871470944398, "learning_rate": 1.3075667465846904e-06, "loss": 0.3343, "step": 5501 }, { "epoch": 0.7716690042075737, "grad_norm": 1.9989665460807124, "learning_rate": 1.306035696344592e-06, "loss": 0.3074, "step": 5502 }, { "epoch": 0.7718092566619916, "grad_norm": 2.0267723616009548, "learning_rate": 1.3045054083282194e-06, "loss": 0.3178, "step": 5503 }, { "epoch": 0.7719495091164096, "grad_norm": 1.9101139368156632, "learning_rate": 1.3029758828513368e-06, "loss": 0.3373, "step": 5504 }, { "epoch": 0.7720897615708275, "grad_norm": 3.0012562029042424, "learning_rate": 1.3014471202295514e-06, "loss": 0.3641, "step": 5505 }, { "epoch": 0.7722300140252455, "grad_norm": 1.832290663582457, "learning_rate": 1.2999191207783129e-06, "loss": 0.3285, "step": 5506 }, { "epoch": 0.7723702664796634, "grad_norm": 2.2269572660939287, "learning_rate": 1.298391884812913e-06, "loss": 0.3491, "step": 5507 }, { "epoch": 0.7725105189340813, "grad_norm": 1.8252461254380226, "learning_rate": 1.2968654126484858e-06, "loss": 0.3228, "step": 5508 }, { "epoch": 0.7726507713884992, "grad_norm": 1.9474375810524291, "learning_rate": 1.2953397046000105e-06, "loss": 0.293, "step": 5509 }, { "epoch": 0.7727910238429172, "grad_norm": 1.71627622657958, "learning_rate": 1.2938147609823026e-06, "loss": 0.3438, "step": 5510 }, { "epoch": 0.7729312762973352, "grad_norm": 1.607563814406419, "learning_rate": 1.2922905821100256e-06, "loss": 0.3258, "step": 5511 }, { "epoch": 0.7730715287517531, "grad_norm": 2.0176060130877516, "learning_rate": 1.2907671682976824e-06, "loss": 0.3288, "step": 5512 }, { "epoch": 0.7732117812061711, "grad_norm": 1.8301278253114919, "learning_rate": 1.2892445198596198e-06, "loss": 0.3284, "step": 5513 }, { "epoch": 0.773352033660589, "grad_norm": 1.7395995209753552, "learning_rate": 1.287722637110025e-06, "loss": 0.3367, "step": 5514 }, { "epoch": 0.773492286115007, "grad_norm": 1.694733903303401, "learning_rate": 1.2862015203629274e-06, "loss": 0.3728, "step": 5515 }, { "epoch": 0.7736325385694249, "grad_norm": 1.901310508913826, "learning_rate": 1.2846811699322014e-06, "loss": 0.3275, "step": 5516 }, { "epoch": 0.7737727910238429, "grad_norm": 1.9238296121504403, "learning_rate": 1.2831615861315572e-06, "loss": 0.3105, "step": 5517 }, { "epoch": 0.7739130434782608, "grad_norm": 2.05338515864443, "learning_rate": 1.281642769274552e-06, "loss": 0.3272, "step": 5518 }, { "epoch": 0.7740532959326788, "grad_norm": 1.8056059642830145, "learning_rate": 1.2801247196745826e-06, "loss": 0.3658, "step": 5519 }, { "epoch": 0.7741935483870968, "grad_norm": 2.1669977986562663, "learning_rate": 1.27860743764489e-06, "loss": 0.3544, "step": 5520 }, { "epoch": 0.7743338008415147, "grad_norm": 2.868992541330401, "learning_rate": 1.2770909234985513e-06, "loss": 0.305, "step": 5521 }, { "epoch": 0.7744740532959327, "grad_norm": 1.7161912772182477, "learning_rate": 1.2755751775484898e-06, "loss": 0.3344, "step": 5522 }, { "epoch": 0.7746143057503506, "grad_norm": 2.6487019724719976, "learning_rate": 1.2740602001074697e-06, "loss": 0.306, "step": 5523 }, { "epoch": 0.7747545582047686, "grad_norm": 2.055987624847336, "learning_rate": 1.2725459914880961e-06, "loss": 0.3336, "step": 5524 }, { "epoch": 0.7748948106591865, "grad_norm": 2.723068665803196, "learning_rate": 1.271032552002815e-06, "loss": 0.3227, "step": 5525 }, { "epoch": 0.7750350631136045, "grad_norm": 1.7054352096915222, "learning_rate": 1.2695198819639143e-06, "loss": 0.2843, "step": 5526 }, { "epoch": 0.7751753155680224, "grad_norm": 1.8316081194882392, "learning_rate": 1.2680079816835228e-06, "loss": 0.361, "step": 5527 }, { "epoch": 0.7753155680224404, "grad_norm": 1.9786748040684015, "learning_rate": 1.2664968514736104e-06, "loss": 0.361, "step": 5528 }, { "epoch": 0.7754558204768583, "grad_norm": 1.6685832765423594, "learning_rate": 1.2649864916459897e-06, "loss": 0.3705, "step": 5529 }, { "epoch": 0.7755960729312763, "grad_norm": 4.315740722165435, "learning_rate": 1.26347690251231e-06, "loss": 0.3254, "step": 5530 }, { "epoch": 0.7757363253856943, "grad_norm": 2.3362523507830457, "learning_rate": 1.261968084384066e-06, "loss": 0.3357, "step": 5531 }, { "epoch": 0.7758765778401122, "grad_norm": 2.4577774346063768, "learning_rate": 1.2604600375725922e-06, "loss": 0.3464, "step": 5532 }, { "epoch": 0.7760168302945302, "grad_norm": 2.615925407705269, "learning_rate": 1.2589527623890629e-06, "loss": 0.354, "step": 5533 }, { "epoch": 0.7761570827489481, "grad_norm": 1.6176130683918148, "learning_rate": 1.257446259144494e-06, "loss": 0.3274, "step": 5534 }, { "epoch": 0.7762973352033661, "grad_norm": 1.9333421331512302, "learning_rate": 1.2559405281497427e-06, "loss": 0.3042, "step": 5535 }, { "epoch": 0.776437587657784, "grad_norm": 2.0073110941441827, "learning_rate": 1.2544355697155048e-06, "loss": 0.3683, "step": 5536 }, { "epoch": 0.776577840112202, "grad_norm": 1.6912544305691197, "learning_rate": 1.25293138415232e-06, "loss": 0.3234, "step": 5537 }, { "epoch": 0.7767180925666199, "grad_norm": 2.266898803417683, "learning_rate": 1.2514279717705636e-06, "loss": 0.3892, "step": 5538 }, { "epoch": 0.7768583450210379, "grad_norm": 1.762106030308148, "learning_rate": 1.249925332880455e-06, "loss": 0.3596, "step": 5539 }, { "epoch": 0.7769985974754559, "grad_norm": 1.6701505805558243, "learning_rate": 1.248423467792056e-06, "loss": 0.3632, "step": 5540 }, { "epoch": 0.7771388499298738, "grad_norm": 1.9333835063650269, "learning_rate": 1.2469223768152622e-06, "loss": 0.3548, "step": 5541 }, { "epoch": 0.7772791023842918, "grad_norm": 2.729945753487802, "learning_rate": 1.245422060259815e-06, "loss": 0.3187, "step": 5542 }, { "epoch": 0.7774193548387097, "grad_norm": 1.92851623324979, "learning_rate": 1.2439225184352938e-06, "loss": 0.3581, "step": 5543 }, { "epoch": 0.7775596072931277, "grad_norm": 1.5612339230909196, "learning_rate": 1.242423751651119e-06, "loss": 0.3155, "step": 5544 }, { "epoch": 0.7776998597475456, "grad_norm": 2.013880958395357, "learning_rate": 1.2409257602165509e-06, "loss": 0.35, "step": 5545 }, { "epoch": 0.7778401122019636, "grad_norm": 1.7712406213656107, "learning_rate": 1.239428544440689e-06, "loss": 0.3452, "step": 5546 }, { "epoch": 0.7779803646563815, "grad_norm": 1.93062139607714, "learning_rate": 1.2379321046324732e-06, "loss": 0.3284, "step": 5547 }, { "epoch": 0.7781206171107994, "grad_norm": 2.9576952250547257, "learning_rate": 1.2364364411006841e-06, "loss": 0.3022, "step": 5548 }, { "epoch": 0.7782608695652173, "grad_norm": 1.6978101482406665, "learning_rate": 1.2349415541539406e-06, "loss": 0.3166, "step": 5549 }, { "epoch": 0.7784011220196353, "grad_norm": 1.852199589699668, "learning_rate": 1.2334474441007045e-06, "loss": 0.3515, "step": 5550 }, { "epoch": 0.7785413744740532, "grad_norm": 1.800886304194853, "learning_rate": 1.2319541112492717e-06, "loss": 0.3671, "step": 5551 }, { "epoch": 0.7786816269284712, "grad_norm": 1.7221987715871248, "learning_rate": 1.230461555907782e-06, "loss": 0.3233, "step": 5552 }, { "epoch": 0.7788218793828892, "grad_norm": 1.8208669710289334, "learning_rate": 1.2289697783842142e-06, "loss": 0.3135, "step": 5553 }, { "epoch": 0.7789621318373071, "grad_norm": 1.878421966889591, "learning_rate": 1.2274787789863862e-06, "loss": 0.3945, "step": 5554 }, { "epoch": 0.7791023842917251, "grad_norm": 2.224133689333846, "learning_rate": 1.2259885580219555e-06, "loss": 0.3247, "step": 5555 }, { "epoch": 0.779242636746143, "grad_norm": 2.182730461441235, "learning_rate": 1.224499115798418e-06, "loss": 0.3549, "step": 5556 }, { "epoch": 0.779382889200561, "grad_norm": 2.467984237694756, "learning_rate": 1.2230104526231107e-06, "loss": 0.3576, "step": 5557 }, { "epoch": 0.7795231416549789, "grad_norm": 2.0174426968554062, "learning_rate": 1.22152256880321e-06, "loss": 0.3579, "step": 5558 }, { "epoch": 0.7796633941093969, "grad_norm": 1.8609788532931197, "learning_rate": 1.220035464645727e-06, "loss": 0.3449, "step": 5559 }, { "epoch": 0.7798036465638148, "grad_norm": 1.7594371832094933, "learning_rate": 1.2185491404575166e-06, "loss": 0.3438, "step": 5560 }, { "epoch": 0.7799438990182328, "grad_norm": 1.6770595130255492, "learning_rate": 1.2170635965452737e-06, "loss": 0.2762, "step": 5561 }, { "epoch": 0.7800841514726508, "grad_norm": 1.8939585026441734, "learning_rate": 1.215578833215526e-06, "loss": 0.332, "step": 5562 }, { "epoch": 0.7802244039270687, "grad_norm": 2.0241806256360118, "learning_rate": 1.2140948507746465e-06, "loss": 0.3049, "step": 5563 }, { "epoch": 0.7803646563814867, "grad_norm": 3.0051009998073224, "learning_rate": 1.2126116495288436e-06, "loss": 0.318, "step": 5564 }, { "epoch": 0.7805049088359046, "grad_norm": 2.2001015769743737, "learning_rate": 1.2111292297841666e-06, "loss": 0.3462, "step": 5565 }, { "epoch": 0.7806451612903226, "grad_norm": 2.286968432089739, "learning_rate": 1.2096475918465016e-06, "loss": 0.4091, "step": 5566 }, { "epoch": 0.7807854137447405, "grad_norm": 1.8284229133635672, "learning_rate": 1.2081667360215743e-06, "loss": 0.3726, "step": 5567 }, { "epoch": 0.7809256661991585, "grad_norm": 1.5242776754731406, "learning_rate": 1.2066866626149499e-06, "loss": 0.3384, "step": 5568 }, { "epoch": 0.7810659186535764, "grad_norm": 2.099572469697456, "learning_rate": 1.2052073719320296e-06, "loss": 0.3417, "step": 5569 }, { "epoch": 0.7812061711079944, "grad_norm": 1.8500487501577372, "learning_rate": 1.2037288642780575e-06, "loss": 0.2819, "step": 5570 }, { "epoch": 0.7813464235624124, "grad_norm": 2.0148341083628725, "learning_rate": 1.20225113995811e-06, "loss": 0.3398, "step": 5571 }, { "epoch": 0.7814866760168303, "grad_norm": 1.8447730895072216, "learning_rate": 1.2007741992771065e-06, "loss": 0.2809, "step": 5572 }, { "epoch": 0.7816269284712483, "grad_norm": 3.9497060352502915, "learning_rate": 1.1992980425398033e-06, "loss": 0.3209, "step": 5573 }, { "epoch": 0.7817671809256662, "grad_norm": 2.1555340449034417, "learning_rate": 1.1978226700507956e-06, "loss": 0.3177, "step": 5574 }, { "epoch": 0.7819074333800842, "grad_norm": 2.1044830109646218, "learning_rate": 1.1963480821145157e-06, "loss": 0.3329, "step": 5575 }, { "epoch": 0.7820476858345021, "grad_norm": 1.6940213397820114, "learning_rate": 1.1948742790352342e-06, "loss": 0.3424, "step": 5576 }, { "epoch": 0.7821879382889201, "grad_norm": 1.9048425450171396, "learning_rate": 1.193401261117061e-06, "loss": 0.3148, "step": 5577 }, { "epoch": 0.782328190743338, "grad_norm": 1.6003253159012631, "learning_rate": 1.1919290286639424e-06, "loss": 0.3289, "step": 5578 }, { "epoch": 0.782468443197756, "grad_norm": 2.3899296765376103, "learning_rate": 1.1904575819796648e-06, "loss": 0.3624, "step": 5579 }, { "epoch": 0.782608695652174, "grad_norm": 1.9761375603728804, "learning_rate": 1.1889869213678485e-06, "loss": 0.3546, "step": 5580 }, { "epoch": 0.7827489481065919, "grad_norm": 2.185402436720707, "learning_rate": 1.1875170471319565e-06, "loss": 0.322, "step": 5581 }, { "epoch": 0.7828892005610099, "grad_norm": 2.0770793377816488, "learning_rate": 1.1860479595752838e-06, "loss": 0.3157, "step": 5582 }, { "epoch": 0.7830294530154278, "grad_norm": 1.8057938524456543, "learning_rate": 1.1845796590009684e-06, "loss": 0.3234, "step": 5583 }, { "epoch": 0.7831697054698458, "grad_norm": 2.325188442510063, "learning_rate": 1.1831121457119842e-06, "loss": 0.3738, "step": 5584 }, { "epoch": 0.7833099579242637, "grad_norm": 1.6443400147216793, "learning_rate": 1.1816454200111415e-06, "loss": 0.3666, "step": 5585 }, { "epoch": 0.7834502103786817, "grad_norm": 1.802549540755552, "learning_rate": 1.1801794822010893e-06, "loss": 0.3348, "step": 5586 }, { "epoch": 0.7835904628330996, "grad_norm": 2.019375409545668, "learning_rate": 1.1787143325843131e-06, "loss": 0.3245, "step": 5587 }, { "epoch": 0.7837307152875175, "grad_norm": 2.753198324346358, "learning_rate": 1.1772499714631375e-06, "loss": 0.3683, "step": 5588 }, { "epoch": 0.7838709677419354, "grad_norm": 2.2299409632715648, "learning_rate": 1.1757863991397222e-06, "loss": 0.3288, "step": 5589 }, { "epoch": 0.7840112201963534, "grad_norm": 1.6155406287942473, "learning_rate": 1.1743236159160654e-06, "loss": 0.3569, "step": 5590 }, { "epoch": 0.7841514726507713, "grad_norm": 2.511121331768935, "learning_rate": 1.172861622094003e-06, "loss": 0.2925, "step": 5591 }, { "epoch": 0.7842917251051893, "grad_norm": 2.2419204818753053, "learning_rate": 1.1714004179752058e-06, "loss": 0.3356, "step": 5592 }, { "epoch": 0.7844319775596073, "grad_norm": 11.546187157075314, "learning_rate": 1.169940003861183e-06, "loss": 0.3551, "step": 5593 }, { "epoch": 0.7845722300140252, "grad_norm": 2.117761400430089, "learning_rate": 1.1684803800532819e-06, "loss": 0.3109, "step": 5594 }, { "epoch": 0.7847124824684432, "grad_norm": 3.1172918240100715, "learning_rate": 1.1670215468526852e-06, "loss": 0.3427, "step": 5595 }, { "epoch": 0.7848527349228611, "grad_norm": 2.364279896988707, "learning_rate": 1.165563504560413e-06, "loss": 0.3161, "step": 5596 }, { "epoch": 0.7849929873772791, "grad_norm": 1.7774277216598986, "learning_rate": 1.1641062534773218e-06, "loss": 0.3428, "step": 5597 }, { "epoch": 0.785133239831697, "grad_norm": 3.781100561798781, "learning_rate": 1.162649793904106e-06, "loss": 0.3211, "step": 5598 }, { "epoch": 0.785273492286115, "grad_norm": 1.805575527993617, "learning_rate": 1.1611941261412962e-06, "loss": 0.3636, "step": 5599 }, { "epoch": 0.7854137447405329, "grad_norm": 2.047779378496523, "learning_rate": 1.1597392504892574e-06, "loss": 0.3108, "step": 5600 }, { "epoch": 0.7855539971949509, "grad_norm": 1.9605607966574607, "learning_rate": 1.1582851672481943e-06, "loss": 0.3181, "step": 5601 }, { "epoch": 0.7856942496493688, "grad_norm": 4.769938933423453, "learning_rate": 1.156831876718148e-06, "loss": 0.3327, "step": 5602 }, { "epoch": 0.7858345021037868, "grad_norm": 3.845098119508138, "learning_rate": 1.1553793791989914e-06, "loss": 0.3603, "step": 5603 }, { "epoch": 0.7859747545582048, "grad_norm": 2.0993012809927225, "learning_rate": 1.15392767499044e-06, "loss": 0.3579, "step": 5604 }, { "epoch": 0.7861150070126227, "grad_norm": 1.629328025898715, "learning_rate": 1.1524767643920415e-06, "loss": 0.322, "step": 5605 }, { "epoch": 0.7862552594670407, "grad_norm": 2.5661377555875395, "learning_rate": 1.1510266477031823e-06, "loss": 0.3405, "step": 5606 }, { "epoch": 0.7863955119214586, "grad_norm": 1.7576094446521446, "learning_rate": 1.149577325223083e-06, "loss": 0.3097, "step": 5607 }, { "epoch": 0.7865357643758766, "grad_norm": 1.740294724091247, "learning_rate": 1.148128797250801e-06, "loss": 0.2985, "step": 5608 }, { "epoch": 0.7866760168302945, "grad_norm": 2.172606969203831, "learning_rate": 1.146681064085231e-06, "loss": 0.3644, "step": 5609 }, { "epoch": 0.7868162692847125, "grad_norm": 1.7255453850860665, "learning_rate": 1.145234126025102e-06, "loss": 0.3155, "step": 5610 }, { "epoch": 0.7869565217391304, "grad_norm": 2.1474584949821263, "learning_rate": 1.1437879833689808e-06, "loss": 0.3472, "step": 5611 }, { "epoch": 0.7870967741935484, "grad_norm": 2.518843398717106, "learning_rate": 1.1423426364152663e-06, "loss": 0.3506, "step": 5612 }, { "epoch": 0.7872370266479664, "grad_norm": 2.0227274587862984, "learning_rate": 1.1408980854621965e-06, "loss": 0.3031, "step": 5613 }, { "epoch": 0.7873772791023843, "grad_norm": 2.5066460959963477, "learning_rate": 1.1394543308078454e-06, "loss": 0.347, "step": 5614 }, { "epoch": 0.7875175315568023, "grad_norm": 2.059096099537695, "learning_rate": 1.1380113727501213e-06, "loss": 0.3451, "step": 5615 }, { "epoch": 0.7876577840112202, "grad_norm": 1.7545830115495633, "learning_rate": 1.1365692115867682e-06, "loss": 0.3061, "step": 5616 }, { "epoch": 0.7877980364656382, "grad_norm": 2.3832536132746696, "learning_rate": 1.1351278476153665e-06, "loss": 0.3662, "step": 5617 }, { "epoch": 0.7879382889200561, "grad_norm": 2.0026887463259597, "learning_rate": 1.133687281133331e-06, "loss": 0.3293, "step": 5618 }, { "epoch": 0.7880785413744741, "grad_norm": 2.6352229191624756, "learning_rate": 1.1322475124379134e-06, "loss": 0.3689, "step": 5619 }, { "epoch": 0.788218793828892, "grad_norm": 2.0457165184259978, "learning_rate": 1.1308085418262004e-06, "loss": 0.3337, "step": 5620 }, { "epoch": 0.78835904628331, "grad_norm": 1.9130572735059104, "learning_rate": 1.1293703695951109e-06, "loss": 0.4108, "step": 5621 }, { "epoch": 0.788499298737728, "grad_norm": 1.7077376870175092, "learning_rate": 1.1279329960414047e-06, "loss": 0.3654, "step": 5622 }, { "epoch": 0.7886395511921459, "grad_norm": 1.7246010111587606, "learning_rate": 1.1264964214616715e-06, "loss": 0.3586, "step": 5623 }, { "epoch": 0.7887798036465639, "grad_norm": 2.0780491635818983, "learning_rate": 1.1250606461523389e-06, "loss": 0.3335, "step": 5624 }, { "epoch": 0.7889200561009818, "grad_norm": 1.964810259251106, "learning_rate": 1.1236256704096693e-06, "loss": 0.3336, "step": 5625 }, { "epoch": 0.7890603085553998, "grad_norm": 1.9218862889896469, "learning_rate": 1.1221914945297601e-06, "loss": 0.3232, "step": 5626 }, { "epoch": 0.7892005610098177, "grad_norm": 1.8349141616909292, "learning_rate": 1.1207581188085436e-06, "loss": 0.357, "step": 5627 }, { "epoch": 0.7893408134642356, "grad_norm": 2.2783764024117907, "learning_rate": 1.119325543541787e-06, "loss": 0.3744, "step": 5628 }, { "epoch": 0.7894810659186535, "grad_norm": 1.8179069636410412, "learning_rate": 1.1178937690250917e-06, "loss": 0.3316, "step": 5629 }, { "epoch": 0.7896213183730715, "grad_norm": 2.50935330696278, "learning_rate": 1.1164627955538948e-06, "loss": 0.3283, "step": 5630 }, { "epoch": 0.7897615708274894, "grad_norm": 2.3333185626879436, "learning_rate": 1.1150326234234675e-06, "loss": 0.3277, "step": 5631 }, { "epoch": 0.7899018232819074, "grad_norm": 1.7401253851295573, "learning_rate": 1.113603252928917e-06, "loss": 0.3513, "step": 5632 }, { "epoch": 0.7900420757363253, "grad_norm": 2.107094216882188, "learning_rate": 1.1121746843651815e-06, "loss": 0.3603, "step": 5633 }, { "epoch": 0.7901823281907433, "grad_norm": 1.851664286845704, "learning_rate": 1.1107469180270375e-06, "loss": 0.3486, "step": 5634 }, { "epoch": 0.7903225806451613, "grad_norm": 2.134160548505115, "learning_rate": 1.1093199542090944e-06, "loss": 0.3532, "step": 5635 }, { "epoch": 0.7904628330995792, "grad_norm": 1.7084402035415662, "learning_rate": 1.107893793205796e-06, "loss": 0.3418, "step": 5636 }, { "epoch": 0.7906030855539972, "grad_norm": 2.1783879825982035, "learning_rate": 1.1064684353114213e-06, "loss": 0.356, "step": 5637 }, { "epoch": 0.7907433380084151, "grad_norm": 1.9902299426088401, "learning_rate": 1.1050438808200824e-06, "loss": 0.2903, "step": 5638 }, { "epoch": 0.7908835904628331, "grad_norm": 2.097792864280986, "learning_rate": 1.1036201300257266e-06, "loss": 0.3328, "step": 5639 }, { "epoch": 0.791023842917251, "grad_norm": 1.5702810711348787, "learning_rate": 1.1021971832221345e-06, "loss": 0.3274, "step": 5640 }, { "epoch": 0.791164095371669, "grad_norm": 1.6669152392353133, "learning_rate": 1.1007750407029232e-06, "loss": 0.3128, "step": 5641 }, { "epoch": 0.7913043478260869, "grad_norm": 2.0804641293837163, "learning_rate": 1.0993537027615387e-06, "loss": 0.3225, "step": 5642 }, { "epoch": 0.7914446002805049, "grad_norm": 1.9666110332615956, "learning_rate": 1.0979331696912666e-06, "loss": 0.2962, "step": 5643 }, { "epoch": 0.7915848527349229, "grad_norm": 3.2575056997323983, "learning_rate": 1.0965134417852213e-06, "loss": 0.3701, "step": 5644 }, { "epoch": 0.7917251051893408, "grad_norm": 2.0197738413215154, "learning_rate": 1.095094519336356e-06, "loss": 0.3645, "step": 5645 }, { "epoch": 0.7918653576437588, "grad_norm": 2.9561071123427776, "learning_rate": 1.0936764026374547e-06, "loss": 0.3345, "step": 5646 }, { "epoch": 0.7920056100981767, "grad_norm": 1.9860582796671338, "learning_rate": 1.0922590919811355e-06, "loss": 0.3453, "step": 5647 }, { "epoch": 0.7921458625525947, "grad_norm": 1.863408050381167, "learning_rate": 1.0908425876598512e-06, "loss": 0.3616, "step": 5648 }, { "epoch": 0.7922861150070126, "grad_norm": 1.9969779667104437, "learning_rate": 1.0894268899658877e-06, "loss": 0.3084, "step": 5649 }, { "epoch": 0.7924263674614306, "grad_norm": 2.6149735381208012, "learning_rate": 1.088011999191364e-06, "loss": 0.3348, "step": 5650 }, { "epoch": 0.7925666199158485, "grad_norm": 3.044535986211674, "learning_rate": 1.0865979156282325e-06, "loss": 0.3725, "step": 5651 }, { "epoch": 0.7927068723702665, "grad_norm": 2.4961841071404813, "learning_rate": 1.085184639568282e-06, "loss": 0.3684, "step": 5652 }, { "epoch": 0.7928471248246844, "grad_norm": 1.6559067856358471, "learning_rate": 1.083772171303128e-06, "loss": 0.3508, "step": 5653 }, { "epoch": 0.7929873772791024, "grad_norm": 1.9758185995907542, "learning_rate": 1.0823605111242259e-06, "loss": 0.3484, "step": 5654 }, { "epoch": 0.7931276297335204, "grad_norm": 2.2113540310968727, "learning_rate": 1.0809496593228614e-06, "loss": 0.3343, "step": 5655 }, { "epoch": 0.7932678821879383, "grad_norm": 1.7747452405324897, "learning_rate": 1.079539616190154e-06, "loss": 0.3602, "step": 5656 }, { "epoch": 0.7934081346423563, "grad_norm": 1.579490146704042, "learning_rate": 1.0781303820170563e-06, "loss": 0.3275, "step": 5657 }, { "epoch": 0.7935483870967742, "grad_norm": 1.6870645031765732, "learning_rate": 1.0767219570943543e-06, "loss": 0.3451, "step": 5658 }, { "epoch": 0.7936886395511922, "grad_norm": 2.344472443180921, "learning_rate": 1.075314341712666e-06, "loss": 0.3459, "step": 5659 }, { "epoch": 0.7938288920056101, "grad_norm": 2.7627789352236944, "learning_rate": 1.073907536162443e-06, "loss": 0.3474, "step": 5660 }, { "epoch": 0.7939691444600281, "grad_norm": 1.8440318134691451, "learning_rate": 1.0725015407339718e-06, "loss": 0.3138, "step": 5661 }, { "epoch": 0.794109396914446, "grad_norm": 2.03676841567004, "learning_rate": 1.0710963557173664e-06, "loss": 0.3172, "step": 5662 }, { "epoch": 0.794249649368864, "grad_norm": 1.8063233462954715, "learning_rate": 1.0696919814025803e-06, "loss": 0.3441, "step": 5663 }, { "epoch": 0.794389901823282, "grad_norm": 1.8511881389495108, "learning_rate": 1.0682884180793923e-06, "loss": 0.3081, "step": 5664 }, { "epoch": 0.7945301542776999, "grad_norm": 1.7153049969626335, "learning_rate": 1.066885666037421e-06, "loss": 0.3463, "step": 5665 }, { "epoch": 0.7946704067321179, "grad_norm": 1.7254733279346333, "learning_rate": 1.0654837255661131e-06, "loss": 0.3742, "step": 5666 }, { "epoch": 0.7948106591865358, "grad_norm": 1.6485211681086387, "learning_rate": 1.0640825969547498e-06, "loss": 0.3085, "step": 5667 }, { "epoch": 0.7949509116409537, "grad_norm": 1.564319390570846, "learning_rate": 1.062682280492444e-06, "loss": 0.2908, "step": 5668 }, { "epoch": 0.7950911640953716, "grad_norm": 2.405887824211006, "learning_rate": 1.0612827764681417e-06, "loss": 0.3344, "step": 5669 }, { "epoch": 0.7952314165497896, "grad_norm": 1.905344388374912, "learning_rate": 1.0598840851706204e-06, "loss": 0.3551, "step": 5670 }, { "epoch": 0.7953716690042075, "grad_norm": 2.0632993131841375, "learning_rate": 1.05848620688849e-06, "loss": 0.3272, "step": 5671 }, { "epoch": 0.7955119214586255, "grad_norm": 2.3569699591436595, "learning_rate": 1.0570891419101931e-06, "loss": 0.3477, "step": 5672 }, { "epoch": 0.7956521739130434, "grad_norm": 2.2150500973446037, "learning_rate": 1.055692890524006e-06, "loss": 0.3683, "step": 5673 }, { "epoch": 0.7957924263674614, "grad_norm": 2.319900013643524, "learning_rate": 1.0542974530180327e-06, "loss": 0.3716, "step": 5674 }, { "epoch": 0.7959326788218793, "grad_norm": 3.262861771092343, "learning_rate": 1.0529028296802129e-06, "loss": 0.3264, "step": 5675 }, { "epoch": 0.7960729312762973, "grad_norm": 6.472918802667598, "learning_rate": 1.0515090207983175e-06, "loss": 0.3143, "step": 5676 }, { "epoch": 0.7962131837307153, "grad_norm": 1.770896401871693, "learning_rate": 1.0501160266599492e-06, "loss": 0.3187, "step": 5677 }, { "epoch": 0.7963534361851332, "grad_norm": 1.7966566533651267, "learning_rate": 1.048723847552543e-06, "loss": 0.3789, "step": 5678 }, { "epoch": 0.7964936886395512, "grad_norm": 1.942745000443771, "learning_rate": 1.0473324837633653e-06, "loss": 0.3173, "step": 5679 }, { "epoch": 0.7966339410939691, "grad_norm": 2.0303323873867263, "learning_rate": 1.0459419355795137e-06, "loss": 0.3209, "step": 5680 }, { "epoch": 0.7967741935483871, "grad_norm": 1.6179107670361652, "learning_rate": 1.0445522032879184e-06, "loss": 0.3575, "step": 5681 }, { "epoch": 0.796914446002805, "grad_norm": 2.084925373855064, "learning_rate": 1.0431632871753421e-06, "loss": 0.3222, "step": 5682 }, { "epoch": 0.797054698457223, "grad_norm": 2.0772841050105995, "learning_rate": 1.041775187528376e-06, "loss": 0.3361, "step": 5683 }, { "epoch": 0.7971949509116409, "grad_norm": 1.5633805654720567, "learning_rate": 1.040387904633447e-06, "loss": 0.3255, "step": 5684 }, { "epoch": 0.7973352033660589, "grad_norm": 1.7530008200661482, "learning_rate": 1.0390014387768083e-06, "loss": 0.353, "step": 5685 }, { "epoch": 0.7974754558204769, "grad_norm": 1.9015732601115234, "learning_rate": 1.037615790244549e-06, "loss": 0.3194, "step": 5686 }, { "epoch": 0.7976157082748948, "grad_norm": 1.622848847538799, "learning_rate": 1.0362309593225877e-06, "loss": 0.2932, "step": 5687 }, { "epoch": 0.7977559607293128, "grad_norm": 1.889292980739497, "learning_rate": 1.0348469462966753e-06, "loss": 0.3646, "step": 5688 }, { "epoch": 0.7978962131837307, "grad_norm": 2.7106220160730223, "learning_rate": 1.0334637514523927e-06, "loss": 0.3489, "step": 5689 }, { "epoch": 0.7980364656381487, "grad_norm": 2.2346860228992265, "learning_rate": 1.0320813750751523e-06, "loss": 0.329, "step": 5690 }, { "epoch": 0.7981767180925666, "grad_norm": 2.0883379779022193, "learning_rate": 1.030699817450198e-06, "loss": 0.3688, "step": 5691 }, { "epoch": 0.7983169705469846, "grad_norm": 2.085250226929884, "learning_rate": 1.029319078862605e-06, "loss": 0.3429, "step": 5692 }, { "epoch": 0.7984572230014025, "grad_norm": 1.619173387810302, "learning_rate": 1.0279391595972798e-06, "loss": 0.355, "step": 5693 }, { "epoch": 0.7985974754558205, "grad_norm": 1.773116229697513, "learning_rate": 1.0265600599389569e-06, "loss": 0.3614, "step": 5694 }, { "epoch": 0.7987377279102384, "grad_norm": 1.829789862632043, "learning_rate": 1.0251817801722047e-06, "loss": 0.2866, "step": 5695 }, { "epoch": 0.7988779803646564, "grad_norm": 1.8418024412942797, "learning_rate": 1.0238043205814219e-06, "loss": 0.3396, "step": 5696 }, { "epoch": 0.7990182328190744, "grad_norm": 1.6423051496151726, "learning_rate": 1.0224276814508376e-06, "loss": 0.3135, "step": 5697 }, { "epoch": 0.7991584852734923, "grad_norm": 3.7559773173190667, "learning_rate": 1.0210518630645122e-06, "loss": 0.3367, "step": 5698 }, { "epoch": 0.7992987377279103, "grad_norm": 2.444374270106192, "learning_rate": 1.0196768657063355e-06, "loss": 0.2674, "step": 5699 }, { "epoch": 0.7994389901823282, "grad_norm": 1.9377186559534711, "learning_rate": 1.0183026896600284e-06, "loss": 0.3008, "step": 5700 }, { "epoch": 0.7995792426367462, "grad_norm": 1.8864328234875283, "learning_rate": 1.0169293352091436e-06, "loss": 0.3668, "step": 5701 }, { "epoch": 0.7997194950911641, "grad_norm": 2.0545708800896247, "learning_rate": 1.0155568026370637e-06, "loss": 0.3194, "step": 5702 }, { "epoch": 0.7998597475455821, "grad_norm": 1.9448815293503119, "learning_rate": 1.0141850922269986e-06, "loss": 0.3548, "step": 5703 }, { "epoch": 0.8, "grad_norm": 5.412346373882036, "learning_rate": 1.0128142042619938e-06, "loss": 0.3274, "step": 5704 }, { "epoch": 0.800140252454418, "grad_norm": 1.7511844713692766, "learning_rate": 1.0114441390249202e-06, "loss": 0.3228, "step": 5705 }, { "epoch": 0.800280504908836, "grad_norm": 2.2507204385875608, "learning_rate": 1.010074896798482e-06, "loss": 0.3258, "step": 5706 }, { "epoch": 0.8004207573632539, "grad_norm": 2.151957343944403, "learning_rate": 1.0087064778652129e-06, "loss": 0.3303, "step": 5707 }, { "epoch": 0.8005610098176719, "grad_norm": 1.9601307459449562, "learning_rate": 1.007338882507477e-06, "loss": 0.3272, "step": 5708 }, { "epoch": 0.8007012622720897, "grad_norm": 1.9591374741551855, "learning_rate": 1.0059721110074678e-06, "loss": 0.321, "step": 5709 }, { "epoch": 0.8008415147265077, "grad_norm": 1.7443652718524574, "learning_rate": 1.0046061636472087e-06, "loss": 0.3227, "step": 5710 }, { "epoch": 0.8009817671809256, "grad_norm": 2.572176457440362, "learning_rate": 1.003241040708554e-06, "loss": 0.3052, "step": 5711 }, { "epoch": 0.8011220196353436, "grad_norm": 2.466239715260843, "learning_rate": 1.0018767424731867e-06, "loss": 0.3554, "step": 5712 }, { "epoch": 0.8012622720897615, "grad_norm": 1.7741794462566438, "learning_rate": 1.000513269222621e-06, "loss": 0.3411, "step": 5713 }, { "epoch": 0.8014025245441795, "grad_norm": 2.0104676735951377, "learning_rate": 9.991506212382007e-07, "loss": 0.3063, "step": 5714 }, { "epoch": 0.8015427769985974, "grad_norm": 1.8451099875729509, "learning_rate": 9.977887988010958e-07, "loss": 0.3736, "step": 5715 }, { "epoch": 0.8016830294530154, "grad_norm": 2.0101962059251623, "learning_rate": 9.964278021923107e-07, "loss": 0.4055, "step": 5716 }, { "epoch": 0.8018232819074333, "grad_norm": 2.169130526092723, "learning_rate": 9.950676316926777e-07, "loss": 0.3094, "step": 5717 }, { "epoch": 0.8019635343618513, "grad_norm": 3.2461397280231328, "learning_rate": 9.937082875828586e-07, "loss": 0.3499, "step": 5718 }, { "epoch": 0.8021037868162693, "grad_norm": 2.040985019582136, "learning_rate": 9.923497701433437e-07, "loss": 0.3047, "step": 5719 }, { "epoch": 0.8022440392706872, "grad_norm": 1.605090911286988, "learning_rate": 9.909920796544542e-07, "loss": 0.3059, "step": 5720 }, { "epoch": 0.8023842917251052, "grad_norm": 2.5044523646355334, "learning_rate": 9.896352163963397e-07, "loss": 0.3628, "step": 5721 }, { "epoch": 0.8025245441795231, "grad_norm": 2.461740707837052, "learning_rate": 9.8827918064898e-07, "loss": 0.3359, "step": 5722 }, { "epoch": 0.8026647966339411, "grad_norm": 1.93196163460843, "learning_rate": 9.869239726921843e-07, "loss": 0.3268, "step": 5723 }, { "epoch": 0.802805049088359, "grad_norm": 3.0180064863333698, "learning_rate": 9.85569592805588e-07, "loss": 0.3414, "step": 5724 }, { "epoch": 0.802945301542777, "grad_norm": 2.031314085903108, "learning_rate": 9.842160412686603e-07, "loss": 0.363, "step": 5725 }, { "epoch": 0.803085553997195, "grad_norm": 1.8243241534934667, "learning_rate": 9.82863318360695e-07, "loss": 0.3249, "step": 5726 }, { "epoch": 0.8032258064516129, "grad_norm": 2.122215915535193, "learning_rate": 9.815114243608182e-07, "loss": 0.3526, "step": 5727 }, { "epoch": 0.8033660589060309, "grad_norm": 1.8246452274119915, "learning_rate": 9.801603595479831e-07, "loss": 0.3472, "step": 5728 }, { "epoch": 0.8035063113604488, "grad_norm": 1.949177407111033, "learning_rate": 9.788101242009735e-07, "loss": 0.3347, "step": 5729 }, { "epoch": 0.8036465638148668, "grad_norm": 1.5108238548294288, "learning_rate": 9.774607185984004e-07, "loss": 0.3432, "step": 5730 }, { "epoch": 0.8037868162692847, "grad_norm": 4.90478843738224, "learning_rate": 9.761121430187037e-07, "loss": 0.3552, "step": 5731 }, { "epoch": 0.8039270687237027, "grad_norm": 1.9336230959736478, "learning_rate": 9.747643977401538e-07, "loss": 0.3237, "step": 5732 }, { "epoch": 0.8040673211781206, "grad_norm": 3.037949385190606, "learning_rate": 9.734174830408478e-07, "loss": 0.3743, "step": 5733 }, { "epoch": 0.8042075736325386, "grad_norm": 1.7365452016037863, "learning_rate": 9.720713991987136e-07, "loss": 0.3611, "step": 5734 }, { "epoch": 0.8043478260869565, "grad_norm": 1.7765378361502226, "learning_rate": 9.707261464915036e-07, "loss": 0.3501, "step": 5735 }, { "epoch": 0.8044880785413745, "grad_norm": 2.1466727482174486, "learning_rate": 9.693817251968025e-07, "loss": 0.2964, "step": 5736 }, { "epoch": 0.8046283309957925, "grad_norm": 5.470628862513632, "learning_rate": 9.68038135592022e-07, "loss": 0.3643, "step": 5737 }, { "epoch": 0.8047685834502104, "grad_norm": 2.0073348490239384, "learning_rate": 9.666953779544025e-07, "loss": 0.3078, "step": 5738 }, { "epoch": 0.8049088359046284, "grad_norm": 2.07769552970156, "learning_rate": 9.653534525610137e-07, "loss": 0.3354, "step": 5739 }, { "epoch": 0.8050490883590463, "grad_norm": 1.6301909944467379, "learning_rate": 9.640123596887507e-07, "loss": 0.313, "step": 5740 }, { "epoch": 0.8051893408134643, "grad_norm": 2.2509133286603618, "learning_rate": 9.626720996143407e-07, "loss": 0.3685, "step": 5741 }, { "epoch": 0.8053295932678822, "grad_norm": 1.8537578203444045, "learning_rate": 9.613326726143352e-07, "loss": 0.2815, "step": 5742 }, { "epoch": 0.8054698457223002, "grad_norm": 1.7312063879586224, "learning_rate": 9.59994078965118e-07, "loss": 0.3316, "step": 5743 }, { "epoch": 0.8056100981767181, "grad_norm": 2.2204873173915622, "learning_rate": 9.586563189428954e-07, "loss": 0.3443, "step": 5744 }, { "epoch": 0.8057503506311361, "grad_norm": 2.475509564126404, "learning_rate": 9.573193928237073e-07, "loss": 0.3344, "step": 5745 }, { "epoch": 0.805890603085554, "grad_norm": 2.4244828488779158, "learning_rate": 9.559833008834175e-07, "loss": 0.3511, "step": 5746 }, { "epoch": 0.806030855539972, "grad_norm": 2.298858512735912, "learning_rate": 9.546480433977195e-07, "loss": 0.3297, "step": 5747 }, { "epoch": 0.80617110799439, "grad_norm": 1.9968073515559546, "learning_rate": 9.533136206421345e-07, "loss": 0.3669, "step": 5748 }, { "epoch": 0.8063113604488078, "grad_norm": 1.7465393726662555, "learning_rate": 9.519800328920115e-07, "loss": 0.3508, "step": 5749 }, { "epoch": 0.8064516129032258, "grad_norm": 2.263841861270207, "learning_rate": 9.50647280422527e-07, "loss": 0.3354, "step": 5750 }, { "epoch": 0.8065918653576437, "grad_norm": 1.8702609890861124, "learning_rate": 9.493153635086855e-07, "loss": 0.3091, "step": 5751 }, { "epoch": 0.8067321178120617, "grad_norm": 1.8627230849850065, "learning_rate": 9.479842824253182e-07, "loss": 0.3218, "step": 5752 }, { "epoch": 0.8068723702664796, "grad_norm": 2.165996252167648, "learning_rate": 9.466540374470845e-07, "loss": 0.316, "step": 5753 }, { "epoch": 0.8070126227208976, "grad_norm": 1.817937193474944, "learning_rate": 9.453246288484713e-07, "loss": 0.3506, "step": 5754 }, { "epoch": 0.8071528751753155, "grad_norm": 2.0764247277383734, "learning_rate": 9.439960569037943e-07, "loss": 0.3247, "step": 5755 }, { "epoch": 0.8072931276297335, "grad_norm": 2.2852675712401522, "learning_rate": 9.426683218871918e-07, "loss": 0.3002, "step": 5756 }, { "epoch": 0.8074333800841514, "grad_norm": 2.443217394612114, "learning_rate": 9.413414240726349e-07, "loss": 0.3252, "step": 5757 }, { "epoch": 0.8075736325385694, "grad_norm": 1.990428072831685, "learning_rate": 9.400153637339182e-07, "loss": 0.3132, "step": 5758 }, { "epoch": 0.8077138849929874, "grad_norm": 1.68767101692836, "learning_rate": 9.386901411446664e-07, "loss": 0.3247, "step": 5759 }, { "epoch": 0.8078541374474053, "grad_norm": 1.8257938095659758, "learning_rate": 9.373657565783295e-07, "loss": 0.3431, "step": 5760 }, { "epoch": 0.8079943899018233, "grad_norm": 1.7401334003305737, "learning_rate": 9.360422103081851e-07, "loss": 0.3446, "step": 5761 }, { "epoch": 0.8081346423562412, "grad_norm": 2.1734092288231954, "learning_rate": 9.347195026073369e-07, "loss": 0.3573, "step": 5762 }, { "epoch": 0.8082748948106592, "grad_norm": 2.6941365648360462, "learning_rate": 9.333976337487178e-07, "loss": 0.3428, "step": 5763 }, { "epoch": 0.8084151472650771, "grad_norm": 1.725155909030087, "learning_rate": 9.32076604005086e-07, "loss": 0.3378, "step": 5764 }, { "epoch": 0.8085553997194951, "grad_norm": 1.7444695550214129, "learning_rate": 9.307564136490255e-07, "loss": 0.3244, "step": 5765 }, { "epoch": 0.808695652173913, "grad_norm": 2.642887838840806, "learning_rate": 9.294370629529503e-07, "loss": 0.3453, "step": 5766 }, { "epoch": 0.808835904628331, "grad_norm": 2.0914858062284405, "learning_rate": 9.281185521890962e-07, "loss": 0.3736, "step": 5767 }, { "epoch": 0.808976157082749, "grad_norm": 2.053682610955385, "learning_rate": 9.26800881629531e-07, "loss": 0.3152, "step": 5768 }, { "epoch": 0.8091164095371669, "grad_norm": 2.240800381956439, "learning_rate": 9.254840515461455e-07, "loss": 0.3729, "step": 5769 }, { "epoch": 0.8092566619915849, "grad_norm": 3.133655006724875, "learning_rate": 9.241680622106597e-07, "loss": 0.3404, "step": 5770 }, { "epoch": 0.8093969144460028, "grad_norm": 2.8585078691257793, "learning_rate": 9.22852913894618e-07, "loss": 0.3217, "step": 5771 }, { "epoch": 0.8095371669004208, "grad_norm": 1.7039475860549054, "learning_rate": 9.215386068693927e-07, "loss": 0.3158, "step": 5772 }, { "epoch": 0.8096774193548387, "grad_norm": 2.0100561289755268, "learning_rate": 9.202251414061813e-07, "loss": 0.3353, "step": 5773 }, { "epoch": 0.8098176718092567, "grad_norm": 2.3034458759855787, "learning_rate": 9.189125177760083e-07, "loss": 0.3192, "step": 5774 }, { "epoch": 0.8099579242636746, "grad_norm": 1.9723596330988504, "learning_rate": 9.176007362497258e-07, "loss": 0.3281, "step": 5775 }, { "epoch": 0.8100981767180926, "grad_norm": 2.0757019142665087, "learning_rate": 9.162897970980083e-07, "loss": 0.3266, "step": 5776 }, { "epoch": 0.8102384291725105, "grad_norm": 3.415380716619031, "learning_rate": 9.149797005913602e-07, "loss": 0.3546, "step": 5777 }, { "epoch": 0.8103786816269285, "grad_norm": 1.8925355447835168, "learning_rate": 9.136704470001101e-07, "loss": 0.3493, "step": 5778 }, { "epoch": 0.8105189340813465, "grad_norm": 2.4612542811366676, "learning_rate": 9.123620365944147e-07, "loss": 0.3535, "step": 5779 }, { "epoch": 0.8106591865357644, "grad_norm": 2.3744879722780583, "learning_rate": 9.110544696442542e-07, "loss": 0.3671, "step": 5780 }, { "epoch": 0.8107994389901824, "grad_norm": 2.46101248490671, "learning_rate": 9.097477464194359e-07, "loss": 0.3673, "step": 5781 }, { "epoch": 0.8109396914446003, "grad_norm": 2.339743483087492, "learning_rate": 9.084418671895939e-07, "loss": 0.2883, "step": 5782 }, { "epoch": 0.8110799438990183, "grad_norm": 1.8807450969834456, "learning_rate": 9.071368322241864e-07, "loss": 0.3713, "step": 5783 }, { "epoch": 0.8112201963534362, "grad_norm": 1.9206210131468826, "learning_rate": 9.058326417925001e-07, "loss": 0.3804, "step": 5784 }, { "epoch": 0.8113604488078542, "grad_norm": 1.8776017893133752, "learning_rate": 9.045292961636426e-07, "loss": 0.3398, "step": 5785 }, { "epoch": 0.8115007012622721, "grad_norm": 1.8245361182750697, "learning_rate": 9.032267956065516e-07, "loss": 0.3392, "step": 5786 }, { "epoch": 0.8116409537166901, "grad_norm": 2.105556847139408, "learning_rate": 9.019251403899903e-07, "loss": 0.3409, "step": 5787 }, { "epoch": 0.811781206171108, "grad_norm": 7.110580845109208, "learning_rate": 9.006243307825435e-07, "loss": 0.3287, "step": 5788 }, { "epoch": 0.8119214586255259, "grad_norm": 2.004054608718371, "learning_rate": 8.993243670526258e-07, "loss": 0.3089, "step": 5789 }, { "epoch": 0.8120617110799438, "grad_norm": 1.739839759551441, "learning_rate": 8.980252494684749e-07, "loss": 0.3655, "step": 5790 }, { "epoch": 0.8122019635343618, "grad_norm": 1.5793280217951071, "learning_rate": 8.967269782981558e-07, "loss": 0.3239, "step": 5791 }, { "epoch": 0.8123422159887798, "grad_norm": 2.0103331896065493, "learning_rate": 8.954295538095564e-07, "loss": 0.3498, "step": 5792 }, { "epoch": 0.8124824684431977, "grad_norm": 1.651150175393771, "learning_rate": 8.941329762703921e-07, "loss": 0.3118, "step": 5793 }, { "epoch": 0.8126227208976157, "grad_norm": 2.254585785134493, "learning_rate": 8.928372459482021e-07, "loss": 0.356, "step": 5794 }, { "epoch": 0.8127629733520336, "grad_norm": 2.126751794735609, "learning_rate": 8.915423631103514e-07, "loss": 0.3477, "step": 5795 }, { "epoch": 0.8129032258064516, "grad_norm": 1.6356049528485896, "learning_rate": 8.902483280240315e-07, "loss": 0.3511, "step": 5796 }, { "epoch": 0.8130434782608695, "grad_norm": 1.9790083035472674, "learning_rate": 8.889551409562552e-07, "loss": 0.3221, "step": 5797 }, { "epoch": 0.8131837307152875, "grad_norm": 2.487666894115915, "learning_rate": 8.876628021738631e-07, "loss": 0.3378, "step": 5798 }, { "epoch": 0.8133239831697054, "grad_norm": 2.0407927322970085, "learning_rate": 8.863713119435208e-07, "loss": 0.3623, "step": 5799 }, { "epoch": 0.8134642356241234, "grad_norm": 2.1613033622124274, "learning_rate": 8.850806705317183e-07, "loss": 0.3229, "step": 5800 }, { "epoch": 0.8136044880785414, "grad_norm": 2.0114786245493743, "learning_rate": 8.8379087820477e-07, "loss": 0.3531, "step": 5801 }, { "epoch": 0.8137447405329593, "grad_norm": 1.941715453794252, "learning_rate": 8.825019352288162e-07, "loss": 0.3708, "step": 5802 }, { "epoch": 0.8138849929873773, "grad_norm": 1.9075044350428787, "learning_rate": 8.812138418698207e-07, "loss": 0.324, "step": 5803 }, { "epoch": 0.8140252454417952, "grad_norm": 1.5082312284161705, "learning_rate": 8.799265983935734e-07, "loss": 0.3293, "step": 5804 }, { "epoch": 0.8141654978962132, "grad_norm": 3.839564083311218, "learning_rate": 8.786402050656878e-07, "loss": 0.3563, "step": 5805 }, { "epoch": 0.8143057503506311, "grad_norm": 2.7640195162866656, "learning_rate": 8.77354662151601e-07, "loss": 0.3078, "step": 5806 }, { "epoch": 0.8144460028050491, "grad_norm": 1.7297261110594955, "learning_rate": 8.76069969916577e-07, "loss": 0.3557, "step": 5807 }, { "epoch": 0.814586255259467, "grad_norm": 1.9786278710717704, "learning_rate": 8.747861286257031e-07, "loss": 0.3463, "step": 5808 }, { "epoch": 0.814726507713885, "grad_norm": 2.1937101224660274, "learning_rate": 8.735031385438897e-07, "loss": 0.3164, "step": 5809 }, { "epoch": 0.814866760168303, "grad_norm": 3.2108051801830535, "learning_rate": 8.722209999358738e-07, "loss": 0.3168, "step": 5810 }, { "epoch": 0.8150070126227209, "grad_norm": 2.5617230796021184, "learning_rate": 8.709397130662151e-07, "loss": 0.3707, "step": 5811 }, { "epoch": 0.8151472650771389, "grad_norm": 1.6767654907548426, "learning_rate": 8.696592781992991e-07, "loss": 0.3561, "step": 5812 }, { "epoch": 0.8152875175315568, "grad_norm": 1.7958386293526407, "learning_rate": 8.68379695599334e-07, "loss": 0.3264, "step": 5813 }, { "epoch": 0.8154277699859748, "grad_norm": 1.8976315643978994, "learning_rate": 8.671009655303531e-07, "loss": 0.3521, "step": 5814 }, { "epoch": 0.8155680224403927, "grad_norm": 1.704093028049904, "learning_rate": 8.658230882562135e-07, "loss": 0.3058, "step": 5815 }, { "epoch": 0.8157082748948107, "grad_norm": 2.0508220559556336, "learning_rate": 8.645460640405967e-07, "loss": 0.3677, "step": 5816 }, { "epoch": 0.8158485273492286, "grad_norm": 3.168897161132635, "learning_rate": 8.632698931470063e-07, "loss": 0.356, "step": 5817 }, { "epoch": 0.8159887798036466, "grad_norm": 1.8862199775627215, "learning_rate": 8.619945758387716e-07, "loss": 0.3351, "step": 5818 }, { "epoch": 0.8161290322580645, "grad_norm": 1.7788708508654691, "learning_rate": 8.60720112379046e-07, "loss": 0.3257, "step": 5819 }, { "epoch": 0.8162692847124825, "grad_norm": 2.1905164357056157, "learning_rate": 8.594465030308052e-07, "loss": 0.3206, "step": 5820 }, { "epoch": 0.8164095371669005, "grad_norm": 1.8315833872131473, "learning_rate": 8.581737480568514e-07, "loss": 0.3528, "step": 5821 }, { "epoch": 0.8165497896213184, "grad_norm": 1.62700815096304, "learning_rate": 8.569018477198065e-07, "loss": 0.3356, "step": 5822 }, { "epoch": 0.8166900420757364, "grad_norm": 1.7163175536528217, "learning_rate": 8.556308022821202e-07, "loss": 0.3166, "step": 5823 }, { "epoch": 0.8168302945301543, "grad_norm": 2.0827485471781526, "learning_rate": 8.543606120060627e-07, "loss": 0.3316, "step": 5824 }, { "epoch": 0.8169705469845723, "grad_norm": 2.397592998526746, "learning_rate": 8.530912771537303e-07, "loss": 0.3386, "step": 5825 }, { "epoch": 0.8171107994389902, "grad_norm": 2.0602279633245097, "learning_rate": 8.518227979870392e-07, "loss": 0.3686, "step": 5826 }, { "epoch": 0.8172510518934082, "grad_norm": 1.5238339421822042, "learning_rate": 8.505551747677321e-07, "loss": 0.3418, "step": 5827 }, { "epoch": 0.8173913043478261, "grad_norm": 2.147094417725767, "learning_rate": 8.492884077573749e-07, "loss": 0.3196, "step": 5828 }, { "epoch": 0.817531556802244, "grad_norm": 1.8971183180883944, "learning_rate": 8.480224972173562e-07, "loss": 0.3312, "step": 5829 }, { "epoch": 0.8176718092566619, "grad_norm": 1.7751109424359743, "learning_rate": 8.46757443408886e-07, "loss": 0.3139, "step": 5830 }, { "epoch": 0.8178120617110799, "grad_norm": 2.3422132413033654, "learning_rate": 8.45493246593001e-07, "loss": 0.3481, "step": 5831 }, { "epoch": 0.8179523141654979, "grad_norm": 1.8702055349970348, "learning_rate": 8.442299070305582e-07, "loss": 0.3601, "step": 5832 }, { "epoch": 0.8180925666199158, "grad_norm": 1.88676588422135, "learning_rate": 8.429674249822401e-07, "loss": 0.3621, "step": 5833 }, { "epoch": 0.8182328190743338, "grad_norm": 1.7514782520352044, "learning_rate": 8.417058007085505e-07, "loss": 0.3692, "step": 5834 }, { "epoch": 0.8183730715287517, "grad_norm": 2.0023376393308654, "learning_rate": 8.404450344698167e-07, "loss": 0.3087, "step": 5835 }, { "epoch": 0.8185133239831697, "grad_norm": 2.36910278592, "learning_rate": 8.391851265261886e-07, "loss": 0.3207, "step": 5836 }, { "epoch": 0.8186535764375876, "grad_norm": 1.7855773696181456, "learning_rate": 8.379260771376419e-07, "loss": 0.2824, "step": 5837 }, { "epoch": 0.8187938288920056, "grad_norm": 2.798208078834312, "learning_rate": 8.366678865639688e-07, "loss": 0.3459, "step": 5838 }, { "epoch": 0.8189340813464235, "grad_norm": 1.8766746671696948, "learning_rate": 8.354105550647901e-07, "loss": 0.363, "step": 5839 }, { "epoch": 0.8190743338008415, "grad_norm": 1.9517239848167403, "learning_rate": 8.341540828995476e-07, "loss": 0.3131, "step": 5840 }, { "epoch": 0.8192145862552594, "grad_norm": 1.5362395912109557, "learning_rate": 8.32898470327505e-07, "loss": 0.2952, "step": 5841 }, { "epoch": 0.8193548387096774, "grad_norm": 2.3947829445614692, "learning_rate": 8.316437176077491e-07, "loss": 0.3508, "step": 5842 }, { "epoch": 0.8194950911640954, "grad_norm": 2.0311315648690442, "learning_rate": 8.303898249991899e-07, "loss": 0.3039, "step": 5843 }, { "epoch": 0.8196353436185133, "grad_norm": 1.9421654182093329, "learning_rate": 8.291367927605592e-07, "loss": 0.3526, "step": 5844 }, { "epoch": 0.8197755960729313, "grad_norm": 2.7752537757374323, "learning_rate": 8.27884621150411e-07, "loss": 0.3279, "step": 5845 }, { "epoch": 0.8199158485273492, "grad_norm": 1.7460376295249311, "learning_rate": 8.266333104271241e-07, "loss": 0.3221, "step": 5846 }, { "epoch": 0.8200561009817672, "grad_norm": 2.0667482031868265, "learning_rate": 8.253828608488946e-07, "loss": 0.3135, "step": 5847 }, { "epoch": 0.8201963534361851, "grad_norm": 1.9865050410550704, "learning_rate": 8.241332726737455e-07, "loss": 0.3021, "step": 5848 }, { "epoch": 0.8203366058906031, "grad_norm": 1.5112588661003188, "learning_rate": 8.228845461595225e-07, "loss": 0.3727, "step": 5849 }, { "epoch": 0.820476858345021, "grad_norm": 2.0461429131228472, "learning_rate": 8.216366815638882e-07, "loss": 0.3135, "step": 5850 }, { "epoch": 0.820617110799439, "grad_norm": 2.0007325261449407, "learning_rate": 8.203896791443322e-07, "loss": 0.3279, "step": 5851 }, { "epoch": 0.820757363253857, "grad_norm": 1.599781516178261, "learning_rate": 8.191435391581648e-07, "loss": 0.3065, "step": 5852 }, { "epoch": 0.8208976157082749, "grad_norm": 1.703463669448008, "learning_rate": 8.178982618625186e-07, "loss": 0.3008, "step": 5853 }, { "epoch": 0.8210378681626929, "grad_norm": 2.1060813904238365, "learning_rate": 8.16653847514347e-07, "loss": 0.3158, "step": 5854 }, { "epoch": 0.8211781206171108, "grad_norm": 1.9188823815818137, "learning_rate": 8.154102963704274e-07, "loss": 0.2834, "step": 5855 }, { "epoch": 0.8213183730715288, "grad_norm": 1.7954208460850856, "learning_rate": 8.141676086873574e-07, "loss": 0.3243, "step": 5856 }, { "epoch": 0.8214586255259467, "grad_norm": 1.8033431689733954, "learning_rate": 8.129257847215571e-07, "loss": 0.3648, "step": 5857 }, { "epoch": 0.8215988779803647, "grad_norm": 1.750238402340337, "learning_rate": 8.116848247292674e-07, "loss": 0.3353, "step": 5858 }, { "epoch": 0.8217391304347826, "grad_norm": 2.0612695665049636, "learning_rate": 8.104447289665523e-07, "loss": 0.3537, "step": 5859 }, { "epoch": 0.8218793828892006, "grad_norm": 1.6407404541082031, "learning_rate": 8.092054976892966e-07, "loss": 0.317, "step": 5860 }, { "epoch": 0.8220196353436185, "grad_norm": 3.5031199854220216, "learning_rate": 8.079671311532072e-07, "loss": 0.3405, "step": 5861 }, { "epoch": 0.8221598877980365, "grad_norm": 2.935043769003174, "learning_rate": 8.067296296138128e-07, "loss": 0.3584, "step": 5862 }, { "epoch": 0.8223001402524545, "grad_norm": 2.8804809735594654, "learning_rate": 8.054929933264626e-07, "loss": 0.3662, "step": 5863 }, { "epoch": 0.8224403927068724, "grad_norm": 1.9098530472150959, "learning_rate": 8.04257222546328e-07, "loss": 0.4007, "step": 5864 }, { "epoch": 0.8225806451612904, "grad_norm": 2.869083120913703, "learning_rate": 8.030223175284019e-07, "loss": 0.3405, "step": 5865 }, { "epoch": 0.8227208976157083, "grad_norm": 2.2868398870841253, "learning_rate": 8.017882785274988e-07, "loss": 0.3353, "step": 5866 }, { "epoch": 0.8228611500701263, "grad_norm": 1.9512142142436164, "learning_rate": 8.005551057982531e-07, "loss": 0.3313, "step": 5867 }, { "epoch": 0.8230014025245442, "grad_norm": 1.6856202853274438, "learning_rate": 7.993227995951208e-07, "loss": 0.3183, "step": 5868 }, { "epoch": 0.8231416549789621, "grad_norm": 2.0861045970496166, "learning_rate": 7.980913601723811e-07, "loss": 0.2856, "step": 5869 }, { "epoch": 0.82328190743338, "grad_norm": 2.25674561105616, "learning_rate": 7.968607877841333e-07, "loss": 0.3462, "step": 5870 }, { "epoch": 0.823422159887798, "grad_norm": 2.233675760616221, "learning_rate": 7.956310826842955e-07, "loss": 0.3526, "step": 5871 }, { "epoch": 0.8235624123422159, "grad_norm": 1.7621768601081615, "learning_rate": 7.944022451266098e-07, "loss": 0.3397, "step": 5872 }, { "epoch": 0.8237026647966339, "grad_norm": 2.678401360335284, "learning_rate": 7.931742753646382e-07, "loss": 0.3252, "step": 5873 }, { "epoch": 0.8238429172510519, "grad_norm": 1.8346189918261142, "learning_rate": 7.919471736517631e-07, "loss": 0.3271, "step": 5874 }, { "epoch": 0.8239831697054698, "grad_norm": 2.310120260237604, "learning_rate": 7.907209402411897e-07, "loss": 0.3598, "step": 5875 }, { "epoch": 0.8241234221598878, "grad_norm": 1.8612980234728231, "learning_rate": 7.894955753859412e-07, "loss": 0.3267, "step": 5876 }, { "epoch": 0.8242636746143057, "grad_norm": 1.676075018184822, "learning_rate": 7.882710793388643e-07, "loss": 0.3227, "step": 5877 }, { "epoch": 0.8244039270687237, "grad_norm": 1.859567712365909, "learning_rate": 7.870474523526262e-07, "loss": 0.3273, "step": 5878 }, { "epoch": 0.8245441795231416, "grad_norm": 2.1535659892093277, "learning_rate": 7.858246946797104e-07, "loss": 0.2998, "step": 5879 }, { "epoch": 0.8246844319775596, "grad_norm": 1.7578729915806797, "learning_rate": 7.846028065724264e-07, "loss": 0.3407, "step": 5880 }, { "epoch": 0.8248246844319775, "grad_norm": 1.6492877231994678, "learning_rate": 7.833817882829025e-07, "loss": 0.3128, "step": 5881 }, { "epoch": 0.8249649368863955, "grad_norm": 2.6111116398028478, "learning_rate": 7.821616400630866e-07, "loss": 0.3265, "step": 5882 }, { "epoch": 0.8251051893408134, "grad_norm": 2.957570761239578, "learning_rate": 7.809423621647483e-07, "loss": 0.3168, "step": 5883 }, { "epoch": 0.8252454417952314, "grad_norm": 1.9169339187169225, "learning_rate": 7.79723954839477e-07, "loss": 0.3645, "step": 5884 }, { "epoch": 0.8253856942496494, "grad_norm": 1.5261219337714407, "learning_rate": 7.785064183386826e-07, "loss": 0.3282, "step": 5885 }, { "epoch": 0.8255259467040673, "grad_norm": 2.8281430407206876, "learning_rate": 7.772897529135947e-07, "loss": 0.3213, "step": 5886 }, { "epoch": 0.8256661991584853, "grad_norm": 1.9784204247364008, "learning_rate": 7.760739588152655e-07, "loss": 0.314, "step": 5887 }, { "epoch": 0.8258064516129032, "grad_norm": 2.4941880379205292, "learning_rate": 7.74859036294563e-07, "loss": 0.3144, "step": 5888 }, { "epoch": 0.8259467040673212, "grad_norm": 1.9567120990428433, "learning_rate": 7.736449856021788e-07, "loss": 0.3128, "step": 5889 }, { "epoch": 0.8260869565217391, "grad_norm": 1.8834135750187262, "learning_rate": 7.72431806988625e-07, "loss": 0.3328, "step": 5890 }, { "epoch": 0.8262272089761571, "grad_norm": 1.8822348567268872, "learning_rate": 7.712195007042322e-07, "loss": 0.3549, "step": 5891 }, { "epoch": 0.826367461430575, "grad_norm": 1.8272646974553661, "learning_rate": 7.7000806699915e-07, "loss": 0.3552, "step": 5892 }, { "epoch": 0.826507713884993, "grad_norm": 1.8917971673839595, "learning_rate": 7.687975061233499e-07, "loss": 0.3492, "step": 5893 }, { "epoch": 0.826647966339411, "grad_norm": 1.9909909114160795, "learning_rate": 7.675878183266228e-07, "loss": 0.3274, "step": 5894 }, { "epoch": 0.8267882187938289, "grad_norm": 1.8233574543228601, "learning_rate": 7.663790038585794e-07, "loss": 0.3108, "step": 5895 }, { "epoch": 0.8269284712482469, "grad_norm": 1.8419453469037088, "learning_rate": 7.651710629686504e-07, "loss": 0.3397, "step": 5896 }, { "epoch": 0.8270687237026648, "grad_norm": 1.6867987977292953, "learning_rate": 7.639639959060857e-07, "loss": 0.3742, "step": 5897 }, { "epoch": 0.8272089761570828, "grad_norm": 2.34353250130135, "learning_rate": 7.627578029199562e-07, "loss": 0.3365, "step": 5898 }, { "epoch": 0.8273492286115007, "grad_norm": 1.7427913149701546, "learning_rate": 7.615524842591493e-07, "loss": 0.3783, "step": 5899 }, { "epoch": 0.8274894810659187, "grad_norm": 1.8866160741948856, "learning_rate": 7.603480401723745e-07, "loss": 0.3301, "step": 5900 }, { "epoch": 0.8276297335203366, "grad_norm": 1.851636796579322, "learning_rate": 7.591444709081619e-07, "loss": 0.335, "step": 5901 }, { "epoch": 0.8277699859747546, "grad_norm": 2.039721386152992, "learning_rate": 7.579417767148583e-07, "loss": 0.3804, "step": 5902 }, { "epoch": 0.8279102384291726, "grad_norm": 1.9702069779892553, "learning_rate": 7.56739957840632e-07, "loss": 0.3229, "step": 5903 }, { "epoch": 0.8280504908835905, "grad_norm": 1.8008265955050329, "learning_rate": 7.555390145334696e-07, "loss": 0.3151, "step": 5904 }, { "epoch": 0.8281907433380085, "grad_norm": 2.0113235826231723, "learning_rate": 7.543389470411772e-07, "loss": 0.3157, "step": 5905 }, { "epoch": 0.8283309957924264, "grad_norm": 1.8861464744152876, "learning_rate": 7.531397556113806e-07, "loss": 0.3626, "step": 5906 }, { "epoch": 0.8284712482468444, "grad_norm": 1.8546790440148893, "learning_rate": 7.519414404915254e-07, "loss": 0.3247, "step": 5907 }, { "epoch": 0.8286115007012623, "grad_norm": 2.0027327940614494, "learning_rate": 7.507440019288742e-07, "loss": 0.3327, "step": 5908 }, { "epoch": 0.8287517531556802, "grad_norm": 2.647663374660763, "learning_rate": 7.4954744017051e-07, "loss": 0.3403, "step": 5909 }, { "epoch": 0.8288920056100981, "grad_norm": 1.8293053492615146, "learning_rate": 7.483517554633357e-07, "loss": 0.3211, "step": 5910 }, { "epoch": 0.8290322580645161, "grad_norm": 3.723512949241147, "learning_rate": 7.471569480540725e-07, "loss": 0.3301, "step": 5911 }, { "epoch": 0.829172510518934, "grad_norm": 2.2210324864319597, "learning_rate": 7.459630181892608e-07, "loss": 0.2945, "step": 5912 }, { "epoch": 0.829312762973352, "grad_norm": 2.494243479305422, "learning_rate": 7.447699661152586e-07, "loss": 0.3431, "step": 5913 }, { "epoch": 0.82945301542777, "grad_norm": 2.2497983418270935, "learning_rate": 7.435777920782444e-07, "loss": 0.3238, "step": 5914 }, { "epoch": 0.8295932678821879, "grad_norm": 1.6138152561837147, "learning_rate": 7.423864963242155e-07, "loss": 0.3337, "step": 5915 }, { "epoch": 0.8297335203366059, "grad_norm": 1.659031745193029, "learning_rate": 7.411960790989863e-07, "loss": 0.3253, "step": 5916 }, { "epoch": 0.8298737727910238, "grad_norm": 1.596046364146649, "learning_rate": 7.400065406481926e-07, "loss": 0.3564, "step": 5917 }, { "epoch": 0.8300140252454418, "grad_norm": 2.411427958374746, "learning_rate": 7.388178812172859e-07, "loss": 0.3284, "step": 5918 }, { "epoch": 0.8301542776998597, "grad_norm": 1.6704416280422771, "learning_rate": 7.376301010515397e-07, "loss": 0.3316, "step": 5919 }, { "epoch": 0.8302945301542777, "grad_norm": 2.0502537221438706, "learning_rate": 7.36443200396042e-07, "loss": 0.3456, "step": 5920 }, { "epoch": 0.8304347826086956, "grad_norm": 2.784069903405944, "learning_rate": 7.352571794957025e-07, "loss": 0.2897, "step": 5921 }, { "epoch": 0.8305750350631136, "grad_norm": 1.8757480400733109, "learning_rate": 7.340720385952476e-07, "loss": 0.3049, "step": 5922 }, { "epoch": 0.8307152875175315, "grad_norm": 1.7903903880716123, "learning_rate": 7.328877779392235e-07, "loss": 0.3363, "step": 5923 }, { "epoch": 0.8308555399719495, "grad_norm": 1.9409527930248527, "learning_rate": 7.317043977719945e-07, "loss": 0.3838, "step": 5924 }, { "epoch": 0.8309957924263675, "grad_norm": 1.875694464185648, "learning_rate": 7.305218983377422e-07, "loss": 0.2978, "step": 5925 }, { "epoch": 0.8311360448807854, "grad_norm": 2.2371251181142955, "learning_rate": 7.293402798804667e-07, "loss": 0.3491, "step": 5926 }, { "epoch": 0.8312762973352034, "grad_norm": 2.176854853223327, "learning_rate": 7.281595426439875e-07, "loss": 0.3173, "step": 5927 }, { "epoch": 0.8314165497896213, "grad_norm": 2.2230408776523607, "learning_rate": 7.269796868719426e-07, "loss": 0.3249, "step": 5928 }, { "epoch": 0.8315568022440393, "grad_norm": 7.15568893432043, "learning_rate": 7.258007128077843e-07, "loss": 0.3476, "step": 5929 }, { "epoch": 0.8316970546984572, "grad_norm": 1.735562579495822, "learning_rate": 7.24622620694787e-07, "loss": 0.334, "step": 5930 }, { "epoch": 0.8318373071528752, "grad_norm": 2.2811862466343023, "learning_rate": 7.23445410776042e-07, "loss": 0.3338, "step": 5931 }, { "epoch": 0.8319775596072931, "grad_norm": 1.8630811155948452, "learning_rate": 7.222690832944579e-07, "loss": 0.3498, "step": 5932 }, { "epoch": 0.8321178120617111, "grad_norm": 1.8348565348534098, "learning_rate": 7.210936384927631e-07, "loss": 0.3594, "step": 5933 }, { "epoch": 0.832258064516129, "grad_norm": 1.609483511516469, "learning_rate": 7.199190766135001e-07, "loss": 0.3169, "step": 5934 }, { "epoch": 0.832398316970547, "grad_norm": 2.3220203420912773, "learning_rate": 7.187453978990328e-07, "loss": 0.3099, "step": 5935 }, { "epoch": 0.832538569424965, "grad_norm": 1.7907693375338614, "learning_rate": 7.175726025915409e-07, "loss": 0.3197, "step": 5936 }, { "epoch": 0.8326788218793829, "grad_norm": 1.9922263945728322, "learning_rate": 7.164006909330234e-07, "loss": 0.3125, "step": 5937 }, { "epoch": 0.8328190743338009, "grad_norm": 2.0490245452200084, "learning_rate": 7.152296631652955e-07, "loss": 0.3466, "step": 5938 }, { "epoch": 0.8329593267882188, "grad_norm": 2.789434840889348, "learning_rate": 7.140595195299921e-07, "loss": 0.3455, "step": 5939 }, { "epoch": 0.8330995792426368, "grad_norm": 2.6088231582509787, "learning_rate": 7.128902602685617e-07, "loss": 0.3696, "step": 5940 }, { "epoch": 0.8332398316970547, "grad_norm": 2.2986714091720395, "learning_rate": 7.117218856222741e-07, "loss": 0.3017, "step": 5941 }, { "epoch": 0.8333800841514727, "grad_norm": 2.6683109896135573, "learning_rate": 7.105543958322154e-07, "loss": 0.3158, "step": 5942 }, { "epoch": 0.8335203366058906, "grad_norm": 1.557099661159203, "learning_rate": 7.093877911392882e-07, "loss": 0.3166, "step": 5943 }, { "epoch": 0.8336605890603086, "grad_norm": 1.942696708528512, "learning_rate": 7.082220717842137e-07, "loss": 0.3142, "step": 5944 }, { "epoch": 0.8338008415147266, "grad_norm": 1.9173665703774032, "learning_rate": 7.070572380075302e-07, "loss": 0.3275, "step": 5945 }, { "epoch": 0.8339410939691445, "grad_norm": 1.93024180246546, "learning_rate": 7.058932900495929e-07, "loss": 0.3745, "step": 5946 }, { "epoch": 0.8340813464235625, "grad_norm": 2.787778585565696, "learning_rate": 7.047302281505735e-07, "loss": 0.363, "step": 5947 }, { "epoch": 0.8342215988779804, "grad_norm": 1.7713322852814406, "learning_rate": 7.03568052550464e-07, "loss": 0.3292, "step": 5948 }, { "epoch": 0.8343618513323983, "grad_norm": 2.116686096502722, "learning_rate": 7.024067634890686e-07, "loss": 0.3577, "step": 5949 }, { "epoch": 0.8345021037868162, "grad_norm": 2.8753280452494203, "learning_rate": 7.012463612060122e-07, "loss": 0.3252, "step": 5950 }, { "epoch": 0.8346423562412342, "grad_norm": 2.0730617478812743, "learning_rate": 7.000868459407357e-07, "loss": 0.341, "step": 5951 }, { "epoch": 0.8347826086956521, "grad_norm": 2.5781062269972286, "learning_rate": 6.989282179324963e-07, "loss": 0.338, "step": 5952 }, { "epoch": 0.8349228611500701, "grad_norm": 1.896673128299176, "learning_rate": 6.977704774203703e-07, "loss": 0.3187, "step": 5953 }, { "epoch": 0.835063113604488, "grad_norm": 1.585913897559515, "learning_rate": 6.966136246432492e-07, "loss": 0.3353, "step": 5954 }, { "epoch": 0.835203366058906, "grad_norm": 2.496019914517064, "learning_rate": 6.954576598398399e-07, "loss": 0.3523, "step": 5955 }, { "epoch": 0.835343618513324, "grad_norm": 2.017724060511966, "learning_rate": 6.943025832486682e-07, "loss": 0.3164, "step": 5956 }, { "epoch": 0.8354838709677419, "grad_norm": 4.432005583528933, "learning_rate": 6.931483951080769e-07, "loss": 0.3534, "step": 5957 }, { "epoch": 0.8356241234221599, "grad_norm": 1.5581541277951183, "learning_rate": 6.919950956562244e-07, "loss": 0.3022, "step": 5958 }, { "epoch": 0.8357643758765778, "grad_norm": 1.772605936303927, "learning_rate": 6.908426851310851e-07, "loss": 0.3231, "step": 5959 }, { "epoch": 0.8359046283309958, "grad_norm": 5.7808936756744735, "learning_rate": 6.896911637704534e-07, "loss": 0.2925, "step": 5960 }, { "epoch": 0.8360448807854137, "grad_norm": 1.6853368104454178, "learning_rate": 6.885405318119342e-07, "loss": 0.3098, "step": 5961 }, { "epoch": 0.8361851332398317, "grad_norm": 3.7081130030180924, "learning_rate": 6.873907894929543e-07, "loss": 0.3661, "step": 5962 }, { "epoch": 0.8363253856942496, "grad_norm": 2.076226306391961, "learning_rate": 6.862419370507545e-07, "loss": 0.3236, "step": 5963 }, { "epoch": 0.8364656381486676, "grad_norm": 1.8623008467940207, "learning_rate": 6.850939747223928e-07, "loss": 0.3675, "step": 5964 }, { "epoch": 0.8366058906030855, "grad_norm": 1.93870445468513, "learning_rate": 6.839469027447431e-07, "loss": 0.3488, "step": 5965 }, { "epoch": 0.8367461430575035, "grad_norm": 1.8637422195364273, "learning_rate": 6.828007213544957e-07, "loss": 0.3689, "step": 5966 }, { "epoch": 0.8368863955119215, "grad_norm": 2.124894756628103, "learning_rate": 6.816554307881574e-07, "loss": 0.3196, "step": 5967 }, { "epoch": 0.8370266479663394, "grad_norm": 4.484146590845727, "learning_rate": 6.805110312820501e-07, "loss": 0.3567, "step": 5968 }, { "epoch": 0.8371669004207574, "grad_norm": 2.032548694660719, "learning_rate": 6.793675230723145e-07, "loss": 0.3294, "step": 5969 }, { "epoch": 0.8373071528751753, "grad_norm": 1.621800353697337, "learning_rate": 6.782249063949031e-07, "loss": 0.3409, "step": 5970 }, { "epoch": 0.8374474053295933, "grad_norm": 1.6870937564848512, "learning_rate": 6.770831814855882e-07, "loss": 0.3403, "step": 5971 }, { "epoch": 0.8375876577840112, "grad_norm": 2.0616989892241713, "learning_rate": 6.75942348579956e-07, "loss": 0.3305, "step": 5972 }, { "epoch": 0.8377279102384292, "grad_norm": 2.049175920231566, "learning_rate": 6.748024079134102e-07, "loss": 0.3531, "step": 5973 }, { "epoch": 0.8378681626928471, "grad_norm": 2.2037848167676723, "learning_rate": 6.736633597211706e-07, "loss": 0.3244, "step": 5974 }, { "epoch": 0.8380084151472651, "grad_norm": 2.3520586371216377, "learning_rate": 6.725252042382691e-07, "loss": 0.3406, "step": 5975 }, { "epoch": 0.838148667601683, "grad_norm": 1.9766252228807857, "learning_rate": 6.713879416995572e-07, "loss": 0.3666, "step": 5976 }, { "epoch": 0.838288920056101, "grad_norm": 2.012382679238463, "learning_rate": 6.702515723397024e-07, "loss": 0.3268, "step": 5977 }, { "epoch": 0.838429172510519, "grad_norm": 2.3056053273702646, "learning_rate": 6.691160963931848e-07, "loss": 0.3311, "step": 5978 }, { "epoch": 0.8385694249649369, "grad_norm": 1.7251773300797073, "learning_rate": 6.67981514094303e-07, "loss": 0.3414, "step": 5979 }, { "epoch": 0.8387096774193549, "grad_norm": 2.131301736063884, "learning_rate": 6.668478256771716e-07, "loss": 0.3187, "step": 5980 }, { "epoch": 0.8388499298737728, "grad_norm": 2.4760383508574724, "learning_rate": 6.657150313757155e-07, "loss": 0.3751, "step": 5981 }, { "epoch": 0.8389901823281908, "grad_norm": 2.2141479260339847, "learning_rate": 6.645831314236817e-07, "loss": 0.3494, "step": 5982 }, { "epoch": 0.8391304347826087, "grad_norm": 1.9183240510907655, "learning_rate": 6.634521260546289e-07, "loss": 0.2969, "step": 5983 }, { "epoch": 0.8392706872370267, "grad_norm": 2.6188554146246297, "learning_rate": 6.623220155019322e-07, "loss": 0.3421, "step": 5984 }, { "epoch": 0.8394109396914446, "grad_norm": 2.2504458515506918, "learning_rate": 6.611927999987821e-07, "loss": 0.39, "step": 5985 }, { "epoch": 0.8395511921458626, "grad_norm": 1.8906382410515796, "learning_rate": 6.600644797781847e-07, "loss": 0.3552, "step": 5986 }, { "epoch": 0.8396914446002806, "grad_norm": 3.430929528228952, "learning_rate": 6.589370550729607e-07, "loss": 0.3432, "step": 5987 }, { "epoch": 0.8398316970546985, "grad_norm": 2.069211271094334, "learning_rate": 6.578105261157464e-07, "loss": 0.3252, "step": 5988 }, { "epoch": 0.8399719495091164, "grad_norm": 2.2624390398949368, "learning_rate": 6.566848931389935e-07, "loss": 0.3826, "step": 5989 }, { "epoch": 0.8401122019635343, "grad_norm": 2.1597756573062346, "learning_rate": 6.555601563749675e-07, "loss": 0.321, "step": 5990 }, { "epoch": 0.8402524544179523, "grad_norm": 1.681750640637653, "learning_rate": 6.54436316055751e-07, "loss": 0.3177, "step": 5991 }, { "epoch": 0.8403927068723702, "grad_norm": 2.0913650822141245, "learning_rate": 6.533133724132396e-07, "loss": 0.3432, "step": 5992 }, { "epoch": 0.8405329593267882, "grad_norm": 1.9211184865779194, "learning_rate": 6.521913256791457e-07, "loss": 0.3443, "step": 5993 }, { "epoch": 0.8406732117812061, "grad_norm": 2.02490699460296, "learning_rate": 6.510701760849952e-07, "loss": 0.3535, "step": 5994 }, { "epoch": 0.8408134642356241, "grad_norm": 2.0128200442381976, "learning_rate": 6.499499238621315e-07, "loss": 0.3582, "step": 5995 }, { "epoch": 0.840953716690042, "grad_norm": 2.044152232493797, "learning_rate": 6.488305692417074e-07, "loss": 0.3635, "step": 5996 }, { "epoch": 0.84109396914446, "grad_norm": 2.027611158077854, "learning_rate": 6.477121124546965e-07, "loss": 0.3121, "step": 5997 }, { "epoch": 0.841234221598878, "grad_norm": 1.6533406454098938, "learning_rate": 6.46594553731883e-07, "loss": 0.3346, "step": 5998 }, { "epoch": 0.8413744740532959, "grad_norm": 2.2162560429197526, "learning_rate": 6.454778933038681e-07, "loss": 0.3244, "step": 5999 }, { "epoch": 0.8415147265077139, "grad_norm": 2.014075459866966, "learning_rate": 6.443621314010673e-07, "loss": 0.3673, "step": 6000 }, { "epoch": 0.8416549789621318, "grad_norm": 2.2366298155536892, "learning_rate": 6.432472682537105e-07, "loss": 0.3507, "step": 6001 }, { "epoch": 0.8417952314165498, "grad_norm": 1.8507162486070836, "learning_rate": 6.421333040918398e-07, "loss": 0.3528, "step": 6002 }, { "epoch": 0.8419354838709677, "grad_norm": 2.208053811094836, "learning_rate": 6.410202391453157e-07, "loss": 0.3512, "step": 6003 }, { "epoch": 0.8420757363253857, "grad_norm": 2.6097733427703576, "learning_rate": 6.399080736438113e-07, "loss": 0.3787, "step": 6004 }, { "epoch": 0.8422159887798036, "grad_norm": 2.5709288997188677, "learning_rate": 6.387968078168133e-07, "loss": 0.3308, "step": 6005 }, { "epoch": 0.8423562412342216, "grad_norm": 1.6977866967506767, "learning_rate": 6.376864418936246e-07, "loss": 0.3189, "step": 6006 }, { "epoch": 0.8424964936886395, "grad_norm": 2.069032899811632, "learning_rate": 6.365769761033608e-07, "loss": 0.3866, "step": 6007 }, { "epoch": 0.8426367461430575, "grad_norm": 2.2906525072889536, "learning_rate": 6.354684106749531e-07, "loss": 0.298, "step": 6008 }, { "epoch": 0.8427769985974755, "grad_norm": 2.8560573320911984, "learning_rate": 6.343607458371459e-07, "loss": 0.3232, "step": 6009 }, { "epoch": 0.8429172510518934, "grad_norm": 1.7659643066618205, "learning_rate": 6.332539818184985e-07, "loss": 0.321, "step": 6010 }, { "epoch": 0.8430575035063114, "grad_norm": 1.9057627821344394, "learning_rate": 6.321481188473827e-07, "loss": 0.3122, "step": 6011 }, { "epoch": 0.8431977559607293, "grad_norm": 2.4928993476046317, "learning_rate": 6.310431571519865e-07, "loss": 0.326, "step": 6012 }, { "epoch": 0.8433380084151473, "grad_norm": 2.427642648671933, "learning_rate": 6.299390969603108e-07, "loss": 0.3464, "step": 6013 }, { "epoch": 0.8434782608695652, "grad_norm": 1.69797486125548, "learning_rate": 6.288359385001702e-07, "loss": 0.365, "step": 6014 }, { "epoch": 0.8436185133239832, "grad_norm": 1.7153653196714806, "learning_rate": 6.277336819991953e-07, "loss": 0.343, "step": 6015 }, { "epoch": 0.8437587657784011, "grad_norm": 3.257182767092917, "learning_rate": 6.266323276848285e-07, "loss": 0.3006, "step": 6016 }, { "epoch": 0.8438990182328191, "grad_norm": 1.8505231942282456, "learning_rate": 6.255318757843249e-07, "loss": 0.31, "step": 6017 }, { "epoch": 0.844039270687237, "grad_norm": 2.555817800233562, "learning_rate": 6.244323265247565e-07, "loss": 0.3776, "step": 6018 }, { "epoch": 0.844179523141655, "grad_norm": 1.6185535797781898, "learning_rate": 6.233336801330076e-07, "loss": 0.3536, "step": 6019 }, { "epoch": 0.844319775596073, "grad_norm": 1.8701322948353567, "learning_rate": 6.222359368357761e-07, "loss": 0.2935, "step": 6020 }, { "epoch": 0.8444600280504909, "grad_norm": 1.95327245537127, "learning_rate": 6.211390968595743e-07, "loss": 0.3718, "step": 6021 }, { "epoch": 0.8446002805049089, "grad_norm": 2.4527583729067683, "learning_rate": 6.200431604307255e-07, "loss": 0.3182, "step": 6022 }, { "epoch": 0.8447405329593268, "grad_norm": 1.8037636113595021, "learning_rate": 6.1894812777537e-07, "loss": 0.3728, "step": 6023 }, { "epoch": 0.8448807854137448, "grad_norm": 4.7849250581905665, "learning_rate": 6.178539991194599e-07, "loss": 0.3513, "step": 6024 }, { "epoch": 0.8450210378681627, "grad_norm": 1.8427973080106381, "learning_rate": 6.16760774688761e-07, "loss": 0.3401, "step": 6025 }, { "epoch": 0.8451612903225807, "grad_norm": 3.2215639424873546, "learning_rate": 6.15668454708852e-07, "loss": 0.338, "step": 6026 }, { "epoch": 0.8453015427769986, "grad_norm": 1.5921298000673347, "learning_rate": 6.145770394051265e-07, "loss": 0.2723, "step": 6027 }, { "epoch": 0.8454417952314166, "grad_norm": 1.686424124333253, "learning_rate": 6.134865290027903e-07, "loss": 0.364, "step": 6028 }, { "epoch": 0.8455820476858344, "grad_norm": 1.679836887547496, "learning_rate": 6.123969237268617e-07, "loss": 0.3235, "step": 6029 }, { "epoch": 0.8457223001402524, "grad_norm": 2.1749759190421134, "learning_rate": 6.113082238021745e-07, "loss": 0.3068, "step": 6030 }, { "epoch": 0.8458625525946704, "grad_norm": 2.09664318548658, "learning_rate": 6.102204294533731e-07, "loss": 0.3522, "step": 6031 }, { "epoch": 0.8460028050490883, "grad_norm": 2.6375446297724987, "learning_rate": 6.091335409049159e-07, "loss": 0.3367, "step": 6032 }, { "epoch": 0.8461430575035063, "grad_norm": 1.773297879945007, "learning_rate": 6.080475583810758e-07, "loss": 0.2993, "step": 6033 }, { "epoch": 0.8462833099579242, "grad_norm": 1.8628776321070017, "learning_rate": 6.069624821059378e-07, "loss": 0.3197, "step": 6034 }, { "epoch": 0.8464235624123422, "grad_norm": 2.126443484841785, "learning_rate": 6.05878312303399e-07, "loss": 0.3677, "step": 6035 }, { "epoch": 0.8465638148667601, "grad_norm": 2.0184801564992894, "learning_rate": 6.04795049197171e-07, "loss": 0.3416, "step": 6036 }, { "epoch": 0.8467040673211781, "grad_norm": 2.523825411905077, "learning_rate": 6.037126930107779e-07, "loss": 0.309, "step": 6037 }, { "epoch": 0.846844319775596, "grad_norm": 1.9578989507611473, "learning_rate": 6.026312439675553e-07, "loss": 0.3564, "step": 6038 }, { "epoch": 0.846984572230014, "grad_norm": 1.7386849952464973, "learning_rate": 6.015507022906525e-07, "loss": 0.3048, "step": 6039 }, { "epoch": 0.847124824684432, "grad_norm": 1.9307526031243454, "learning_rate": 6.004710682030324e-07, "loss": 0.3167, "step": 6040 }, { "epoch": 0.8472650771388499, "grad_norm": 2.235520822658841, "learning_rate": 5.993923419274699e-07, "loss": 0.3311, "step": 6041 }, { "epoch": 0.8474053295932679, "grad_norm": 1.7091770957183454, "learning_rate": 5.983145236865534e-07, "loss": 0.2649, "step": 6042 }, { "epoch": 0.8475455820476858, "grad_norm": 2.554799675301453, "learning_rate": 5.972376137026814e-07, "loss": 0.3042, "step": 6043 }, { "epoch": 0.8476858345021038, "grad_norm": 1.8978700142858804, "learning_rate": 5.961616121980679e-07, "loss": 0.3705, "step": 6044 }, { "epoch": 0.8478260869565217, "grad_norm": 1.6428209413366355, "learning_rate": 5.95086519394738e-07, "loss": 0.2903, "step": 6045 }, { "epoch": 0.8479663394109397, "grad_norm": 2.0745175823460476, "learning_rate": 5.940123355145294e-07, "loss": 0.3268, "step": 6046 }, { "epoch": 0.8481065918653576, "grad_norm": 2.1057000820489065, "learning_rate": 5.929390607790931e-07, "loss": 0.3615, "step": 6047 }, { "epoch": 0.8482468443197756, "grad_norm": 7.285221702956641, "learning_rate": 5.918666954098912e-07, "loss": 0.3378, "step": 6048 }, { "epoch": 0.8483870967741935, "grad_norm": 2.6413442320652574, "learning_rate": 5.90795239628199e-07, "loss": 0.3087, "step": 6049 }, { "epoch": 0.8485273492286115, "grad_norm": 2.1448385160966863, "learning_rate": 5.897246936551043e-07, "loss": 0.3478, "step": 6050 }, { "epoch": 0.8486676016830295, "grad_norm": 1.8457792720527355, "learning_rate": 5.886550577115069e-07, "loss": 0.3427, "step": 6051 }, { "epoch": 0.8488078541374474, "grad_norm": 2.1754995057498454, "learning_rate": 5.875863320181175e-07, "loss": 0.3215, "step": 6052 }, { "epoch": 0.8489481065918654, "grad_norm": 1.709931308526679, "learning_rate": 5.865185167954612e-07, "loss": 0.3136, "step": 6053 }, { "epoch": 0.8490883590462833, "grad_norm": 2.1225717920174256, "learning_rate": 5.854516122638737e-07, "loss": 0.3152, "step": 6054 }, { "epoch": 0.8492286115007013, "grad_norm": 1.5578365924774313, "learning_rate": 5.843856186435032e-07, "loss": 0.2882, "step": 6055 }, { "epoch": 0.8493688639551192, "grad_norm": 2.180186990365222, "learning_rate": 5.833205361543109e-07, "loss": 0.3369, "step": 6056 }, { "epoch": 0.8495091164095372, "grad_norm": 2.224083306570469, "learning_rate": 5.822563650160684e-07, "loss": 0.3681, "step": 6057 }, { "epoch": 0.8496493688639551, "grad_norm": 1.8469486615616075, "learning_rate": 5.81193105448361e-07, "loss": 0.2978, "step": 6058 }, { "epoch": 0.8497896213183731, "grad_norm": 1.817914308028695, "learning_rate": 5.801307576705833e-07, "loss": 0.3455, "step": 6059 }, { "epoch": 0.8499298737727911, "grad_norm": 1.8011416655873491, "learning_rate": 5.790693219019439e-07, "loss": 0.3161, "step": 6060 }, { "epoch": 0.850070126227209, "grad_norm": 3.3997522376049125, "learning_rate": 5.78008798361463e-07, "loss": 0.3459, "step": 6061 }, { "epoch": 0.850210378681627, "grad_norm": 2.194004741778331, "learning_rate": 5.769491872679733e-07, "loss": 0.3115, "step": 6062 }, { "epoch": 0.8503506311360449, "grad_norm": 1.7685496995287888, "learning_rate": 5.758904888401156e-07, "loss": 0.2988, "step": 6063 }, { "epoch": 0.8504908835904629, "grad_norm": 1.5764153917561585, "learning_rate": 5.748327032963464e-07, "loss": 0.3057, "step": 6064 }, { "epoch": 0.8506311360448808, "grad_norm": 2.3427791364822306, "learning_rate": 5.737758308549319e-07, "loss": 0.3583, "step": 6065 }, { "epoch": 0.8507713884992988, "grad_norm": 1.9202509275021205, "learning_rate": 5.727198717339511e-07, "loss": 0.3273, "step": 6066 }, { "epoch": 0.8509116409537167, "grad_norm": 3.282592063013716, "learning_rate": 5.716648261512931e-07, "loss": 0.2924, "step": 6067 }, { "epoch": 0.8510518934081347, "grad_norm": 2.0339465509029315, "learning_rate": 5.706106943246592e-07, "loss": 0.3325, "step": 6068 }, { "epoch": 0.8511921458625525, "grad_norm": 3.183092548258119, "learning_rate": 5.695574764715628e-07, "loss": 0.3515, "step": 6069 }, { "epoch": 0.8513323983169705, "grad_norm": 1.963248118778689, "learning_rate": 5.685051728093271e-07, "loss": 0.3292, "step": 6070 }, { "epoch": 0.8514726507713885, "grad_norm": 2.1576682692380924, "learning_rate": 5.674537835550897e-07, "loss": 0.3739, "step": 6071 }, { "epoch": 0.8516129032258064, "grad_norm": 1.607726428635446, "learning_rate": 5.664033089257948e-07, "loss": 0.2824, "step": 6072 }, { "epoch": 0.8517531556802244, "grad_norm": 1.8666605824416778, "learning_rate": 5.653537491382011e-07, "loss": 0.3571, "step": 6073 }, { "epoch": 0.8518934081346423, "grad_norm": 16.54785891049003, "learning_rate": 5.643051044088787e-07, "loss": 0.3395, "step": 6074 }, { "epoch": 0.8520336605890603, "grad_norm": 1.797449567402985, "learning_rate": 5.632573749542075e-07, "loss": 0.3377, "step": 6075 }, { "epoch": 0.8521739130434782, "grad_norm": 1.9316008190531682, "learning_rate": 5.622105609903794e-07, "loss": 0.3547, "step": 6076 }, { "epoch": 0.8523141654978962, "grad_norm": 2.062292146323976, "learning_rate": 5.611646627333977e-07, "loss": 0.3243, "step": 6077 }, { "epoch": 0.8524544179523141, "grad_norm": 1.772400405732911, "learning_rate": 5.601196803990755e-07, "loss": 0.3582, "step": 6078 }, { "epoch": 0.8525946704067321, "grad_norm": 1.5540623773896571, "learning_rate": 5.590756142030385e-07, "loss": 0.2533, "step": 6079 }, { "epoch": 0.85273492286115, "grad_norm": 1.647772777012749, "learning_rate": 5.58032464360721e-07, "loss": 0.3481, "step": 6080 }, { "epoch": 0.852875175315568, "grad_norm": 2.8274269717876424, "learning_rate": 5.569902310873703e-07, "loss": 0.3354, "step": 6081 }, { "epoch": 0.853015427769986, "grad_norm": 1.6888542392728894, "learning_rate": 5.55948914598044e-07, "loss": 0.3287, "step": 6082 }, { "epoch": 0.8531556802244039, "grad_norm": 1.4950542773086866, "learning_rate": 5.549085151076122e-07, "loss": 0.3234, "step": 6083 }, { "epoch": 0.8532959326788219, "grad_norm": 2.2691632233405725, "learning_rate": 5.538690328307505e-07, "loss": 0.3874, "step": 6084 }, { "epoch": 0.8534361851332398, "grad_norm": 1.7555042624860497, "learning_rate": 5.528304679819513e-07, "loss": 0.3295, "step": 6085 }, { "epoch": 0.8535764375876578, "grad_norm": 2.047387444799126, "learning_rate": 5.517928207755146e-07, "loss": 0.3006, "step": 6086 }, { "epoch": 0.8537166900420757, "grad_norm": 2.3183178275289342, "learning_rate": 5.507560914255516e-07, "loss": 0.3289, "step": 6087 }, { "epoch": 0.8538569424964937, "grad_norm": 2.027587523164664, "learning_rate": 5.497202801459844e-07, "loss": 0.3712, "step": 6088 }, { "epoch": 0.8539971949509116, "grad_norm": 2.0034693191422077, "learning_rate": 5.486853871505455e-07, "loss": 0.3455, "step": 6089 }, { "epoch": 0.8541374474053296, "grad_norm": 1.5420972463369598, "learning_rate": 5.476514126527771e-07, "loss": 0.3511, "step": 6090 }, { "epoch": 0.8542776998597476, "grad_norm": 1.7251298579113123, "learning_rate": 5.466183568660332e-07, "loss": 0.2941, "step": 6091 }, { "epoch": 0.8544179523141655, "grad_norm": 2.008034779160064, "learning_rate": 5.45586220003479e-07, "loss": 0.3602, "step": 6092 }, { "epoch": 0.8545582047685835, "grad_norm": 4.738031467228583, "learning_rate": 5.44555002278086e-07, "loss": 0.3157, "step": 6093 }, { "epoch": 0.8546984572230014, "grad_norm": 1.9561062452251008, "learning_rate": 5.435247039026398e-07, "loss": 0.3759, "step": 6094 }, { "epoch": 0.8548387096774194, "grad_norm": 1.8167542483131938, "learning_rate": 5.424953250897358e-07, "loss": 0.2973, "step": 6095 }, { "epoch": 0.8549789621318373, "grad_norm": 1.62523744755668, "learning_rate": 5.414668660517791e-07, "loss": 0.3296, "step": 6096 }, { "epoch": 0.8551192145862553, "grad_norm": 1.8130455511465304, "learning_rate": 5.404393270009844e-07, "loss": 0.3309, "step": 6097 }, { "epoch": 0.8552594670406732, "grad_norm": 2.4729313268015383, "learning_rate": 5.394127081493783e-07, "loss": 0.2892, "step": 6098 }, { "epoch": 0.8553997194950912, "grad_norm": 1.9803118822320447, "learning_rate": 5.383870097087962e-07, "loss": 0.306, "step": 6099 }, { "epoch": 0.8555399719495091, "grad_norm": 1.6516971991546492, "learning_rate": 5.373622318908822e-07, "loss": 0.3312, "step": 6100 }, { "epoch": 0.8556802244039271, "grad_norm": 1.9901851988642159, "learning_rate": 5.363383749070939e-07, "loss": 0.3639, "step": 6101 }, { "epoch": 0.8558204768583451, "grad_norm": 1.9386208891289105, "learning_rate": 5.353154389686954e-07, "loss": 0.3329, "step": 6102 }, { "epoch": 0.855960729312763, "grad_norm": 2.8572910406649954, "learning_rate": 5.342934242867648e-07, "loss": 0.3147, "step": 6103 }, { "epoch": 0.856100981767181, "grad_norm": 1.8968582177715962, "learning_rate": 5.332723310721855e-07, "loss": 0.3423, "step": 6104 }, { "epoch": 0.8562412342215989, "grad_norm": 1.9959974769622206, "learning_rate": 5.322521595356533e-07, "loss": 0.39, "step": 6105 }, { "epoch": 0.8563814866760169, "grad_norm": 1.6576896744223273, "learning_rate": 5.312329098876734e-07, "loss": 0.3099, "step": 6106 }, { "epoch": 0.8565217391304348, "grad_norm": 3.028156865348899, "learning_rate": 5.302145823385618e-07, "loss": 0.3125, "step": 6107 }, { "epoch": 0.8566619915848528, "grad_norm": 1.6721440749356884, "learning_rate": 5.291971770984428e-07, "loss": 0.3083, "step": 6108 }, { "epoch": 0.8568022440392706, "grad_norm": 1.5664244386753763, "learning_rate": 5.281806943772505e-07, "loss": 0.3218, "step": 6109 }, { "epoch": 0.8569424964936886, "grad_norm": 2.001829026260339, "learning_rate": 5.271651343847295e-07, "loss": 0.3173, "step": 6110 }, { "epoch": 0.8570827489481065, "grad_norm": 1.7406645135144216, "learning_rate": 5.261504973304332e-07, "loss": 0.3605, "step": 6111 }, { "epoch": 0.8572230014025245, "grad_norm": 1.949301677766893, "learning_rate": 5.251367834237264e-07, "loss": 0.3331, "step": 6112 }, { "epoch": 0.8573632538569425, "grad_norm": 1.6607421559164777, "learning_rate": 5.241239928737791e-07, "loss": 0.3137, "step": 6113 }, { "epoch": 0.8575035063113604, "grad_norm": 2.658054143022672, "learning_rate": 5.231121258895749e-07, "loss": 0.3229, "step": 6114 }, { "epoch": 0.8576437587657784, "grad_norm": 2.220127376648124, "learning_rate": 5.221011826799055e-07, "loss": 0.3296, "step": 6115 }, { "epoch": 0.8577840112201963, "grad_norm": 1.9048314679306597, "learning_rate": 5.210911634533722e-07, "loss": 0.3558, "step": 6116 }, { "epoch": 0.8579242636746143, "grad_norm": 1.9844102030357418, "learning_rate": 5.200820684183849e-07, "loss": 0.3434, "step": 6117 }, { "epoch": 0.8580645161290322, "grad_norm": 2.5911214907219993, "learning_rate": 5.190738977831638e-07, "loss": 0.4046, "step": 6118 }, { "epoch": 0.8582047685834502, "grad_norm": 1.9521554600416147, "learning_rate": 5.180666517557375e-07, "loss": 0.2708, "step": 6119 }, { "epoch": 0.8583450210378681, "grad_norm": 2.1307050752237457, "learning_rate": 5.170603305439454e-07, "loss": 0.3149, "step": 6120 }, { "epoch": 0.8584852734922861, "grad_norm": 1.6818998449355316, "learning_rate": 5.160549343554327e-07, "loss": 0.3407, "step": 6121 }, { "epoch": 0.858625525946704, "grad_norm": 2.355537476810119, "learning_rate": 5.150504633976572e-07, "loss": 0.3513, "step": 6122 }, { "epoch": 0.858765778401122, "grad_norm": 2.3363586862590586, "learning_rate": 5.140469178778845e-07, "loss": 0.3245, "step": 6123 }, { "epoch": 0.85890603085554, "grad_norm": 1.600906163150386, "learning_rate": 5.130442980031892e-07, "loss": 0.3271, "step": 6124 }, { "epoch": 0.8590462833099579, "grad_norm": 1.747766977416793, "learning_rate": 5.120426039804544e-07, "loss": 0.3233, "step": 6125 }, { "epoch": 0.8591865357643759, "grad_norm": 1.8807559356291546, "learning_rate": 5.110418360163722e-07, "loss": 0.3149, "step": 6126 }, { "epoch": 0.8593267882187938, "grad_norm": 1.7970212793931848, "learning_rate": 5.10041994317445e-07, "loss": 0.3228, "step": 6127 }, { "epoch": 0.8594670406732118, "grad_norm": 1.8260132419820505, "learning_rate": 5.090430790899836e-07, "loss": 0.3549, "step": 6128 }, { "epoch": 0.8596072931276297, "grad_norm": 1.9466884909455928, "learning_rate": 5.080450905401057e-07, "loss": 0.2916, "step": 6129 }, { "epoch": 0.8597475455820477, "grad_norm": 2.239626814545811, "learning_rate": 5.070480288737406e-07, "loss": 0.3428, "step": 6130 }, { "epoch": 0.8598877980364656, "grad_norm": 1.8719173522131916, "learning_rate": 5.060518942966242e-07, "loss": 0.3085, "step": 6131 }, { "epoch": 0.8600280504908836, "grad_norm": 2.6332475885684232, "learning_rate": 5.050566870143025e-07, "loss": 0.3674, "step": 6132 }, { "epoch": 0.8601683029453016, "grad_norm": 2.0176389819824037, "learning_rate": 5.040624072321299e-07, "loss": 0.3213, "step": 6133 }, { "epoch": 0.8603085553997195, "grad_norm": 2.0443898030709886, "learning_rate": 5.030690551552675e-07, "loss": 0.3264, "step": 6134 }, { "epoch": 0.8604488078541375, "grad_norm": 2.481831718303744, "learning_rate": 5.020766309886876e-07, "loss": 0.3135, "step": 6135 }, { "epoch": 0.8605890603085554, "grad_norm": 1.803464532946029, "learning_rate": 5.010851349371704e-07, "loss": 0.3321, "step": 6136 }, { "epoch": 0.8607293127629734, "grad_norm": 1.6609170763513093, "learning_rate": 5.000945672053032e-07, "loss": 0.3657, "step": 6137 }, { "epoch": 0.8608695652173913, "grad_norm": 1.8111355348252078, "learning_rate": 4.99104927997483e-07, "loss": 0.2851, "step": 6138 }, { "epoch": 0.8610098176718093, "grad_norm": 2.0842943899418027, "learning_rate": 4.981162175179155e-07, "loss": 0.2894, "step": 6139 }, { "epoch": 0.8611500701262272, "grad_norm": 2.067301968889042, "learning_rate": 4.971284359706141e-07, "loss": 0.3055, "step": 6140 }, { "epoch": 0.8612903225806452, "grad_norm": 2.018817354856072, "learning_rate": 4.961415835594007e-07, "loss": 0.3269, "step": 6141 }, { "epoch": 0.8614305750350632, "grad_norm": 2.081729945841518, "learning_rate": 4.951556604879049e-07, "loss": 0.3516, "step": 6142 }, { "epoch": 0.8615708274894811, "grad_norm": 1.8779569833153424, "learning_rate": 4.941706669595647e-07, "loss": 0.2899, "step": 6143 }, { "epoch": 0.8617110799438991, "grad_norm": 1.969667372336686, "learning_rate": 4.931866031776283e-07, "loss": 0.3436, "step": 6144 }, { "epoch": 0.861851332398317, "grad_norm": 1.7094670774776866, "learning_rate": 4.922034693451483e-07, "loss": 0.318, "step": 6145 }, { "epoch": 0.861991584852735, "grad_norm": 2.304611773782397, "learning_rate": 4.912212656649879e-07, "loss": 0.3665, "step": 6146 }, { "epoch": 0.8621318373071529, "grad_norm": 1.6669800304825453, "learning_rate": 4.902399923398193e-07, "loss": 0.3475, "step": 6147 }, { "epoch": 0.8622720897615709, "grad_norm": 1.8138926844665062, "learning_rate": 4.892596495721202e-07, "loss": 0.3295, "step": 6148 }, { "epoch": 0.8624123422159887, "grad_norm": 1.9394263258942779, "learning_rate": 4.882802375641777e-07, "loss": 0.331, "step": 6149 }, { "epoch": 0.8625525946704067, "grad_norm": 1.7434561177565802, "learning_rate": 4.873017565180871e-07, "loss": 0.3426, "step": 6150 }, { "epoch": 0.8626928471248246, "grad_norm": 1.9122465900755303, "learning_rate": 4.86324206635751e-07, "loss": 0.3014, "step": 6151 }, { "epoch": 0.8628330995792426, "grad_norm": 1.9520255694668907, "learning_rate": 4.853475881188796e-07, "loss": 0.3149, "step": 6152 }, { "epoch": 0.8629733520336605, "grad_norm": 2.021487441498278, "learning_rate": 4.843719011689924e-07, "loss": 0.3633, "step": 6153 }, { "epoch": 0.8631136044880785, "grad_norm": 2.0556805317778513, "learning_rate": 4.833971459874137e-07, "loss": 0.3818, "step": 6154 }, { "epoch": 0.8632538569424965, "grad_norm": 1.9207956027657835, "learning_rate": 4.824233227752789e-07, "loss": 0.3737, "step": 6155 }, { "epoch": 0.8633941093969144, "grad_norm": 2.6620565340615046, "learning_rate": 4.814504317335289e-07, "loss": 0.3294, "step": 6156 }, { "epoch": 0.8635343618513324, "grad_norm": 2.2018176589503002, "learning_rate": 4.804784730629131e-07, "loss": 0.3508, "step": 6157 }, { "epoch": 0.8636746143057503, "grad_norm": 2.0023934824375305, "learning_rate": 4.795074469639888e-07, "loss": 0.3548, "step": 6158 }, { "epoch": 0.8638148667601683, "grad_norm": 1.695598103597831, "learning_rate": 4.785373536371196e-07, "loss": 0.3422, "step": 6159 }, { "epoch": 0.8639551192145862, "grad_norm": 2.0864526919667816, "learning_rate": 4.775681932824783e-07, "loss": 0.3122, "step": 6160 }, { "epoch": 0.8640953716690042, "grad_norm": 2.016306089317565, "learning_rate": 4.7659996610004423e-07, "loss": 0.3933, "step": 6161 }, { "epoch": 0.8642356241234221, "grad_norm": 1.6900118041895953, "learning_rate": 4.756326722896054e-07, "loss": 0.3496, "step": 6162 }, { "epoch": 0.8643758765778401, "grad_norm": 5.151620902371692, "learning_rate": 4.7466631205075333e-07, "loss": 0.3196, "step": 6163 }, { "epoch": 0.864516129032258, "grad_norm": 1.7722064883863387, "learning_rate": 4.7370088558289175e-07, "loss": 0.3503, "step": 6164 }, { "epoch": 0.864656381486676, "grad_norm": 1.622432440951444, "learning_rate": 4.7273639308523023e-07, "loss": 0.313, "step": 6165 }, { "epoch": 0.864796633941094, "grad_norm": 2.2409615704136474, "learning_rate": 4.717728347567829e-07, "loss": 0.3461, "step": 6166 }, { "epoch": 0.8649368863955119, "grad_norm": 1.757736407858271, "learning_rate": 4.708102107963741e-07, "loss": 0.2997, "step": 6167 }, { "epoch": 0.8650771388499299, "grad_norm": 1.9983383667123933, "learning_rate": 4.698485214026349e-07, "loss": 0.3834, "step": 6168 }, { "epoch": 0.8652173913043478, "grad_norm": 3.1373323859318596, "learning_rate": 4.6888776677400384e-07, "loss": 0.3208, "step": 6169 }, { "epoch": 0.8653576437587658, "grad_norm": 2.352382073522565, "learning_rate": 4.6792794710872446e-07, "loss": 0.3235, "step": 6170 }, { "epoch": 0.8654978962131837, "grad_norm": 3.1639075724029904, "learning_rate": 4.6696906260485007e-07, "loss": 0.3512, "step": 6171 }, { "epoch": 0.8656381486676017, "grad_norm": 1.9639443980346738, "learning_rate": 4.6601111346023963e-07, "loss": 0.3574, "step": 6172 }, { "epoch": 0.8657784011220196, "grad_norm": 1.892510789868173, "learning_rate": 4.6505409987255833e-07, "loss": 0.2837, "step": 6173 }, { "epoch": 0.8659186535764376, "grad_norm": 2.603573500153803, "learning_rate": 4.64098022039281e-07, "loss": 0.3226, "step": 6174 }, { "epoch": 0.8660589060308556, "grad_norm": 1.6795884214056205, "learning_rate": 4.6314288015768595e-07, "loss": 0.3549, "step": 6175 }, { "epoch": 0.8661991584852735, "grad_norm": 1.9386218729974738, "learning_rate": 4.621886744248605e-07, "loss": 0.2914, "step": 6176 }, { "epoch": 0.8663394109396915, "grad_norm": 2.5953144732610745, "learning_rate": 4.612354050376977e-07, "loss": 0.3244, "step": 6177 }, { "epoch": 0.8664796633941094, "grad_norm": 1.8586149826215994, "learning_rate": 4.602830721928997e-07, "loss": 0.3248, "step": 6178 }, { "epoch": 0.8666199158485274, "grad_norm": 1.9694344677991502, "learning_rate": 4.5933167608697204e-07, "loss": 0.357, "step": 6179 }, { "epoch": 0.8667601683029453, "grad_norm": 1.7994479127443317, "learning_rate": 4.5838121691622995e-07, "loss": 0.3546, "step": 6180 }, { "epoch": 0.8669004207573633, "grad_norm": 1.6335047664333426, "learning_rate": 4.574316948767932e-07, "loss": 0.3622, "step": 6181 }, { "epoch": 0.8670406732117812, "grad_norm": 2.1867567707121363, "learning_rate": 4.5648311016458943e-07, "loss": 0.3325, "step": 6182 }, { "epoch": 0.8671809256661992, "grad_norm": 1.6769049729303462, "learning_rate": 4.555354629753533e-07, "loss": 0.3434, "step": 6183 }, { "epoch": 0.8673211781206172, "grad_norm": 1.623600210162833, "learning_rate": 4.545887535046228e-07, "loss": 0.3277, "step": 6184 }, { "epoch": 0.8674614305750351, "grad_norm": 2.259558296418964, "learning_rate": 4.536429819477478e-07, "loss": 0.3524, "step": 6185 }, { "epoch": 0.8676016830294531, "grad_norm": 1.7577016837899422, "learning_rate": 4.526981484998788e-07, "loss": 0.3456, "step": 6186 }, { "epoch": 0.867741935483871, "grad_norm": 2.0092064438242847, "learning_rate": 4.517542533559771e-07, "loss": 0.3278, "step": 6187 }, { "epoch": 0.867882187938289, "grad_norm": 2.0981223158695617, "learning_rate": 4.508112967108091e-07, "loss": 0.3477, "step": 6188 }, { "epoch": 0.8680224403927068, "grad_norm": 2.3139637618858253, "learning_rate": 4.4986927875894646e-07, "loss": 0.3905, "step": 6189 }, { "epoch": 0.8681626928471248, "grad_norm": 1.8357978930048868, "learning_rate": 4.489281996947681e-07, "loss": 0.3591, "step": 6190 }, { "epoch": 0.8683029453015427, "grad_norm": 1.7530442054340942, "learning_rate": 4.479880597124597e-07, "loss": 0.315, "step": 6191 }, { "epoch": 0.8684431977559607, "grad_norm": 1.9814538555985741, "learning_rate": 4.4704885900601236e-07, "loss": 0.3456, "step": 6192 }, { "epoch": 0.8685834502103786, "grad_norm": 2.1454821425765793, "learning_rate": 4.461105977692237e-07, "loss": 0.3248, "step": 6193 }, { "epoch": 0.8687237026647966, "grad_norm": 2.2922351709985165, "learning_rate": 4.4517327619569784e-07, "loss": 0.3454, "step": 6194 }, { "epoch": 0.8688639551192145, "grad_norm": 1.8228991553055858, "learning_rate": 4.442368944788428e-07, "loss": 0.3862, "step": 6195 }, { "epoch": 0.8690042075736325, "grad_norm": 2.088126873071465, "learning_rate": 4.4330145281187566e-07, "loss": 0.3468, "step": 6196 }, { "epoch": 0.8691444600280505, "grad_norm": 2.232995200774267, "learning_rate": 4.423669513878182e-07, "loss": 0.3752, "step": 6197 }, { "epoch": 0.8692847124824684, "grad_norm": 1.8777537151854016, "learning_rate": 4.414333903994983e-07, "loss": 0.3461, "step": 6198 }, { "epoch": 0.8694249649368864, "grad_norm": 2.2544860622967557, "learning_rate": 4.405007700395497e-07, "loss": 0.3425, "step": 6199 }, { "epoch": 0.8695652173913043, "grad_norm": 2.545593220011076, "learning_rate": 4.3956909050041186e-07, "loss": 0.341, "step": 6200 }, { "epoch": 0.8697054698457223, "grad_norm": 1.9872264649838791, "learning_rate": 4.3863835197433037e-07, "loss": 0.3123, "step": 6201 }, { "epoch": 0.8698457223001402, "grad_norm": 2.4011734755960483, "learning_rate": 4.377085546533566e-07, "loss": 0.3173, "step": 6202 }, { "epoch": 0.8699859747545582, "grad_norm": 2.4892906644740282, "learning_rate": 4.3677969872934824e-07, "loss": 0.378, "step": 6203 }, { "epoch": 0.8701262272089761, "grad_norm": 1.9302509427364138, "learning_rate": 4.3585178439396856e-07, "loss": 0.3327, "step": 6204 }, { "epoch": 0.8702664796633941, "grad_norm": 2.1926549933704558, "learning_rate": 4.349248118386851e-07, "loss": 0.2987, "step": 6205 }, { "epoch": 0.870406732117812, "grad_norm": 2.2739894760545907, "learning_rate": 4.33998781254773e-07, "loss": 0.3171, "step": 6206 }, { "epoch": 0.87054698457223, "grad_norm": 1.9066204273957095, "learning_rate": 4.330736928333107e-07, "loss": 0.3185, "step": 6207 }, { "epoch": 0.870687237026648, "grad_norm": 1.9608341508964438, "learning_rate": 4.321495467651854e-07, "loss": 0.3559, "step": 6208 }, { "epoch": 0.8708274894810659, "grad_norm": 1.653961616061505, "learning_rate": 4.312263432410868e-07, "loss": 0.3138, "step": 6209 }, { "epoch": 0.8709677419354839, "grad_norm": 2.027096066201325, "learning_rate": 4.303040824515131e-07, "loss": 0.3411, "step": 6210 }, { "epoch": 0.8711079943899018, "grad_norm": 2.1758933457925744, "learning_rate": 4.293827645867649e-07, "loss": 0.3149, "step": 6211 }, { "epoch": 0.8712482468443198, "grad_norm": 2.0769198946099747, "learning_rate": 4.284623898369511e-07, "loss": 0.31, "step": 6212 }, { "epoch": 0.8713884992987377, "grad_norm": 1.7664437885167765, "learning_rate": 4.2754295839198325e-07, "loss": 0.3337, "step": 6213 }, { "epoch": 0.8715287517531557, "grad_norm": 1.7721961293879005, "learning_rate": 4.266244704415806e-07, "loss": 0.3719, "step": 6214 }, { "epoch": 0.8716690042075736, "grad_norm": 2.597400440441202, "learning_rate": 4.2570692617526667e-07, "loss": 0.3565, "step": 6215 }, { "epoch": 0.8718092566619916, "grad_norm": 1.953520833912209, "learning_rate": 4.2479032578236934e-07, "loss": 0.3463, "step": 6216 }, { "epoch": 0.8719495091164096, "grad_norm": 1.7699329854919152, "learning_rate": 4.2387466945202347e-07, "loss": 0.3422, "step": 6217 }, { "epoch": 0.8720897615708275, "grad_norm": 1.8870084867867771, "learning_rate": 4.2295995737316854e-07, "loss": 0.3173, "step": 6218 }, { "epoch": 0.8722300140252455, "grad_norm": 2.5841188057021762, "learning_rate": 4.220461897345485e-07, "loss": 0.31, "step": 6219 }, { "epoch": 0.8723702664796634, "grad_norm": 2.425071231543078, "learning_rate": 4.211333667247125e-07, "loss": 0.366, "step": 6220 }, { "epoch": 0.8725105189340814, "grad_norm": 2.653631198432156, "learning_rate": 4.202214885320166e-07, "loss": 0.3219, "step": 6221 }, { "epoch": 0.8726507713884993, "grad_norm": 1.9855591252211922, "learning_rate": 4.193105553446192e-07, "loss": 0.3601, "step": 6222 }, { "epoch": 0.8727910238429173, "grad_norm": 1.973098191715661, "learning_rate": 4.184005673504854e-07, "loss": 0.3467, "step": 6223 }, { "epoch": 0.8729312762973352, "grad_norm": 1.5475179703859994, "learning_rate": 4.174915247373862e-07, "loss": 0.2777, "step": 6224 }, { "epoch": 0.8730715287517532, "grad_norm": 2.216656409027153, "learning_rate": 4.1658342769289374e-07, "loss": 0.3515, "step": 6225 }, { "epoch": 0.8732117812061712, "grad_norm": 1.7922433619951623, "learning_rate": 4.156762764043898e-07, "loss": 0.2931, "step": 6226 }, { "epoch": 0.8733520336605891, "grad_norm": 2.1849540063133115, "learning_rate": 4.147700710590563e-07, "loss": 0.3102, "step": 6227 }, { "epoch": 0.8734922861150071, "grad_norm": 2.1353028678034778, "learning_rate": 4.1386481184388427e-07, "loss": 0.345, "step": 6228 }, { "epoch": 0.8736325385694249, "grad_norm": 1.9927665317755603, "learning_rate": 4.1296049894566646e-07, "loss": 0.3132, "step": 6229 }, { "epoch": 0.8737727910238429, "grad_norm": 1.9397185759577098, "learning_rate": 4.1205713255100253e-07, "loss": 0.3408, "step": 6230 }, { "epoch": 0.8739130434782608, "grad_norm": 2.2291248234419507, "learning_rate": 4.1115471284629504e-07, "loss": 0.3152, "step": 6231 }, { "epoch": 0.8740532959326788, "grad_norm": 1.6399069531724892, "learning_rate": 4.102532400177528e-07, "loss": 0.3261, "step": 6232 }, { "epoch": 0.8741935483870967, "grad_norm": 2.301290199968847, "learning_rate": 4.0935271425138757e-07, "loss": 0.3632, "step": 6233 }, { "epoch": 0.8743338008415147, "grad_norm": 2.089099444887941, "learning_rate": 4.0845313573301736e-07, "loss": 0.3338, "step": 6234 }, { "epoch": 0.8744740532959326, "grad_norm": 1.9892611084556335, "learning_rate": 4.0755450464826375e-07, "loss": 0.3539, "step": 6235 }, { "epoch": 0.8746143057503506, "grad_norm": 1.961349808710393, "learning_rate": 4.0665682118255225e-07, "loss": 0.3653, "step": 6236 }, { "epoch": 0.8747545582047686, "grad_norm": 2.0523057484675307, "learning_rate": 4.0576008552111414e-07, "loss": 0.3138, "step": 6237 }, { "epoch": 0.8748948106591865, "grad_norm": 1.8497758446032617, "learning_rate": 4.048642978489842e-07, "loss": 0.3208, "step": 6238 }, { "epoch": 0.8750350631136045, "grad_norm": 1.6028257272836108, "learning_rate": 4.0396945835100286e-07, "loss": 0.3179, "step": 6239 }, { "epoch": 0.8751753155680224, "grad_norm": 3.575344821332715, "learning_rate": 4.030755672118125e-07, "loss": 0.3289, "step": 6240 }, { "epoch": 0.8753155680224404, "grad_norm": 1.8369215904783278, "learning_rate": 4.021826246158628e-07, "loss": 0.3171, "step": 6241 }, { "epoch": 0.8754558204768583, "grad_norm": 2.408774500723742, "learning_rate": 4.012906307474057e-07, "loss": 0.314, "step": 6242 }, { "epoch": 0.8755960729312763, "grad_norm": 2.7633347160725497, "learning_rate": 4.003995857904974e-07, "loss": 0.2717, "step": 6243 }, { "epoch": 0.8757363253856942, "grad_norm": 1.874794885224466, "learning_rate": 3.9950948992899917e-07, "loss": 0.2934, "step": 6244 }, { "epoch": 0.8758765778401122, "grad_norm": 2.0450207414018253, "learning_rate": 3.986203433465774e-07, "loss": 0.3033, "step": 6245 }, { "epoch": 0.8760168302945301, "grad_norm": 2.1962849373988607, "learning_rate": 3.9773214622669974e-07, "loss": 0.3606, "step": 6246 }, { "epoch": 0.8761570827489481, "grad_norm": 1.925182442875088, "learning_rate": 3.968448987526391e-07, "loss": 0.297, "step": 6247 }, { "epoch": 0.8762973352033661, "grad_norm": 3.0837439358160283, "learning_rate": 3.959586011074729e-07, "loss": 0.4101, "step": 6248 }, { "epoch": 0.876437587657784, "grad_norm": 1.820047121800348, "learning_rate": 3.9507325347408365e-07, "loss": 0.3715, "step": 6249 }, { "epoch": 0.876577840112202, "grad_norm": 2.123989538163392, "learning_rate": 3.9418885603515535e-07, "loss": 0.3285, "step": 6250 }, { "epoch": 0.8767180925666199, "grad_norm": 1.9028326577334098, "learning_rate": 3.9330540897317805e-07, "loss": 0.3282, "step": 6251 }, { "epoch": 0.8768583450210379, "grad_norm": 2.5625067920129596, "learning_rate": 3.9242291247044484e-07, "loss": 0.3355, "step": 6252 }, { "epoch": 0.8769985974754558, "grad_norm": 1.9377393267084986, "learning_rate": 3.9154136670905287e-07, "loss": 0.3043, "step": 6253 }, { "epoch": 0.8771388499298738, "grad_norm": 2.133249091432173, "learning_rate": 3.9066077187090215e-07, "loss": 0.3595, "step": 6254 }, { "epoch": 0.8772791023842917, "grad_norm": 8.592033297994583, "learning_rate": 3.8978112813769786e-07, "loss": 0.2706, "step": 6255 }, { "epoch": 0.8774193548387097, "grad_norm": 1.7053572698235018, "learning_rate": 3.8890243569094874e-07, "loss": 0.331, "step": 6256 }, { "epoch": 0.8775596072931277, "grad_norm": 1.7764867707738832, "learning_rate": 3.880246947119659e-07, "loss": 0.4069, "step": 6257 }, { "epoch": 0.8776998597475456, "grad_norm": 2.1733428605383747, "learning_rate": 3.8714790538186553e-07, "loss": 0.3234, "step": 6258 }, { "epoch": 0.8778401122019636, "grad_norm": 2.723296912411615, "learning_rate": 3.862720678815668e-07, "loss": 0.3723, "step": 6259 }, { "epoch": 0.8779803646563815, "grad_norm": 1.9761420846968247, "learning_rate": 3.853971823917929e-07, "loss": 0.3464, "step": 6260 }, { "epoch": 0.8781206171107995, "grad_norm": 4.592249618790261, "learning_rate": 3.845232490930706e-07, "loss": 0.3224, "step": 6261 }, { "epoch": 0.8782608695652174, "grad_norm": 1.7811874914076617, "learning_rate": 3.836502681657289e-07, "loss": 0.3361, "step": 6262 }, { "epoch": 0.8784011220196354, "grad_norm": 1.7916467650735006, "learning_rate": 3.827782397899021e-07, "loss": 0.3057, "step": 6263 }, { "epoch": 0.8785413744740533, "grad_norm": 1.819111571142397, "learning_rate": 3.819071641455274e-07, "loss": 0.336, "step": 6264 }, { "epoch": 0.8786816269284713, "grad_norm": 2.0791107363876984, "learning_rate": 3.810370414123454e-07, "loss": 0.3953, "step": 6265 }, { "epoch": 0.8788218793828892, "grad_norm": 2.1047932916120384, "learning_rate": 3.801678717698987e-07, "loss": 0.2798, "step": 6266 }, { "epoch": 0.8789621318373072, "grad_norm": 1.8515435712788382, "learning_rate": 3.792996553975359e-07, "loss": 0.3328, "step": 6267 }, { "epoch": 0.8791023842917252, "grad_norm": 1.5640742191938888, "learning_rate": 3.7843239247440545e-07, "loss": 0.3314, "step": 6268 }, { "epoch": 0.879242636746143, "grad_norm": 1.838173401632113, "learning_rate": 3.7756608317946144e-07, "loss": 0.3628, "step": 6269 }, { "epoch": 0.879382889200561, "grad_norm": 1.8346106746639579, "learning_rate": 3.767007276914619e-07, "loss": 0.3167, "step": 6270 }, { "epoch": 0.8795231416549789, "grad_norm": 1.5852026528851482, "learning_rate": 3.7583632618896635e-07, "loss": 0.3313, "step": 6271 }, { "epoch": 0.8796633941093969, "grad_norm": 2.039834296658605, "learning_rate": 3.7497287885033763e-07, "loss": 0.3324, "step": 6272 }, { "epoch": 0.8798036465638148, "grad_norm": 1.9923923162752146, "learning_rate": 3.7411038585374206e-07, "loss": 0.3583, "step": 6273 }, { "epoch": 0.8799438990182328, "grad_norm": 2.0758346901052915, "learning_rate": 3.7324884737715003e-07, "loss": 0.3352, "step": 6274 }, { "epoch": 0.8800841514726507, "grad_norm": 2.649444028724977, "learning_rate": 3.723882635983328e-07, "loss": 0.3543, "step": 6275 }, { "epoch": 0.8802244039270687, "grad_norm": 1.6356024019096354, "learning_rate": 3.715286346948671e-07, "loss": 0.3016, "step": 6276 }, { "epoch": 0.8803646563814866, "grad_norm": 2.361852898354247, "learning_rate": 3.7066996084413e-07, "loss": 0.3336, "step": 6277 }, { "epoch": 0.8805049088359046, "grad_norm": 2.416907550601449, "learning_rate": 3.698122422233036e-07, "loss": 0.3203, "step": 6278 }, { "epoch": 0.8806451612903226, "grad_norm": 3.276667710462878, "learning_rate": 3.6895547900937136e-07, "loss": 0.3193, "step": 6279 }, { "epoch": 0.8807854137447405, "grad_norm": 1.992461660631314, "learning_rate": 3.6809967137912183e-07, "loss": 0.3555, "step": 6280 }, { "epoch": 0.8809256661991585, "grad_norm": 2.0779222661992085, "learning_rate": 3.6724481950914326e-07, "loss": 0.3006, "step": 6281 }, { "epoch": 0.8810659186535764, "grad_norm": 3.5406671048494918, "learning_rate": 3.663909235758295e-07, "loss": 0.3716, "step": 6282 }, { "epoch": 0.8812061711079944, "grad_norm": 1.6455470773783436, "learning_rate": 3.6553798375537574e-07, "loss": 0.3384, "step": 6283 }, { "epoch": 0.8813464235624123, "grad_norm": 2.403501947848549, "learning_rate": 3.646860002237801e-07, "loss": 0.353, "step": 6284 }, { "epoch": 0.8814866760168303, "grad_norm": 2.6385936259562617, "learning_rate": 3.638349731568436e-07, "loss": 0.2969, "step": 6285 }, { "epoch": 0.8816269284712482, "grad_norm": 2.313372679310739, "learning_rate": 3.6298490273017017e-07, "loss": 0.3529, "step": 6286 }, { "epoch": 0.8817671809256662, "grad_norm": 1.7654749713723603, "learning_rate": 3.621357891191657e-07, "loss": 0.37, "step": 6287 }, { "epoch": 0.8819074333800841, "grad_norm": 2.0446533493182097, "learning_rate": 3.612876324990372e-07, "loss": 0.3335, "step": 6288 }, { "epoch": 0.8820476858345021, "grad_norm": 1.959427210159347, "learning_rate": 3.6044043304479745e-07, "loss": 0.328, "step": 6289 }, { "epoch": 0.8821879382889201, "grad_norm": 1.581465707171352, "learning_rate": 3.595941909312595e-07, "loss": 0.3288, "step": 6290 }, { "epoch": 0.882328190743338, "grad_norm": 2.754262135553992, "learning_rate": 3.587489063330402e-07, "loss": 0.3232, "step": 6291 }, { "epoch": 0.882468443197756, "grad_norm": 2.337862580273782, "learning_rate": 3.5790457942455725e-07, "loss": 0.3691, "step": 6292 }, { "epoch": 0.8826086956521739, "grad_norm": 2.1880469592074623, "learning_rate": 3.570612103800325e-07, "loss": 0.3031, "step": 6293 }, { "epoch": 0.8827489481065919, "grad_norm": 1.8746505411647414, "learning_rate": 3.5621879937348836e-07, "loss": 0.308, "step": 6294 }, { "epoch": 0.8828892005610098, "grad_norm": 2.230027992004426, "learning_rate": 3.5537734657875136e-07, "loss": 0.3336, "step": 6295 }, { "epoch": 0.8830294530154278, "grad_norm": 1.757850341389559, "learning_rate": 3.545368521694487e-07, "loss": 0.2911, "step": 6296 }, { "epoch": 0.8831697054698457, "grad_norm": 2.000659833781798, "learning_rate": 3.5369731631901214e-07, "loss": 0.3766, "step": 6297 }, { "epoch": 0.8833099579242637, "grad_norm": 2.2010871715207694, "learning_rate": 3.528587392006716e-07, "loss": 0.3526, "step": 6298 }, { "epoch": 0.8834502103786817, "grad_norm": 1.734279045681108, "learning_rate": 3.520211209874624e-07, "loss": 0.2763, "step": 6299 }, { "epoch": 0.8835904628330996, "grad_norm": 1.7584538624871486, "learning_rate": 3.51184461852222e-07, "loss": 0.3872, "step": 6300 }, { "epoch": 0.8837307152875176, "grad_norm": 1.9306525778537398, "learning_rate": 3.5034876196758825e-07, "loss": 0.3531, "step": 6301 }, { "epoch": 0.8838709677419355, "grad_norm": 2.088028220771579, "learning_rate": 3.4951402150600275e-07, "loss": 0.3196, "step": 6302 }, { "epoch": 0.8840112201963535, "grad_norm": 2.7617141745177354, "learning_rate": 3.486802406397083e-07, "loss": 0.3291, "step": 6303 }, { "epoch": 0.8841514726507714, "grad_norm": 1.826882094722809, "learning_rate": 3.4784741954074884e-07, "loss": 0.2972, "step": 6304 }, { "epoch": 0.8842917251051894, "grad_norm": 1.9117986876716981, "learning_rate": 3.470155583809726e-07, "loss": 0.2747, "step": 6305 }, { "epoch": 0.8844319775596073, "grad_norm": 2.161411575975973, "learning_rate": 3.4618465733202765e-07, "loss": 0.3381, "step": 6306 }, { "epoch": 0.8845722300140253, "grad_norm": 2.6001967319005272, "learning_rate": 3.453547165653642e-07, "loss": 0.3306, "step": 6307 }, { "epoch": 0.8847124824684433, "grad_norm": 1.8931393888473897, "learning_rate": 3.4452573625223584e-07, "loss": 0.3835, "step": 6308 }, { "epoch": 0.8848527349228611, "grad_norm": 1.8680725553827058, "learning_rate": 3.436977165636951e-07, "loss": 0.3033, "step": 6309 }, { "epoch": 0.884992987377279, "grad_norm": 2.0719808514052653, "learning_rate": 3.428706576705992e-07, "loss": 0.3403, "step": 6310 }, { "epoch": 0.885133239831697, "grad_norm": 1.6961861745567095, "learning_rate": 3.420445597436056e-07, "loss": 0.3213, "step": 6311 }, { "epoch": 0.885273492286115, "grad_norm": 1.7356935592369231, "learning_rate": 3.41219422953174e-07, "loss": 0.37, "step": 6312 }, { "epoch": 0.8854137447405329, "grad_norm": 2.298424956846988, "learning_rate": 3.4039524746956597e-07, "loss": 0.3029, "step": 6313 }, { "epoch": 0.8855539971949509, "grad_norm": 1.7647260554880972, "learning_rate": 3.395720334628438e-07, "loss": 0.3174, "step": 6314 }, { "epoch": 0.8856942496493688, "grad_norm": 3.1359185781321424, "learning_rate": 3.3874978110287224e-07, "loss": 0.393, "step": 6315 }, { "epoch": 0.8858345021037868, "grad_norm": 1.85676417053083, "learning_rate": 3.3792849055931776e-07, "loss": 0.341, "step": 6316 }, { "epoch": 0.8859747545582047, "grad_norm": 1.834654924132413, "learning_rate": 3.371081620016475e-07, "loss": 0.337, "step": 6317 }, { "epoch": 0.8861150070126227, "grad_norm": 3.3153277149726867, "learning_rate": 3.362887955991301e-07, "loss": 0.347, "step": 6318 }, { "epoch": 0.8862552594670406, "grad_norm": 2.264503464493783, "learning_rate": 3.354703915208363e-07, "loss": 0.295, "step": 6319 }, { "epoch": 0.8863955119214586, "grad_norm": 2.0317260184418746, "learning_rate": 3.3465294993563826e-07, "loss": 0.3375, "step": 6320 }, { "epoch": 0.8865357643758766, "grad_norm": 1.9126458355520601, "learning_rate": 3.338364710122094e-07, "loss": 0.3271, "step": 6321 }, { "epoch": 0.8866760168302945, "grad_norm": 1.96007388121654, "learning_rate": 3.330209549190244e-07, "loss": 0.3763, "step": 6322 }, { "epoch": 0.8868162692847125, "grad_norm": 1.850327864797175, "learning_rate": 3.322064018243587e-07, "loss": 0.3581, "step": 6323 }, { "epoch": 0.8869565217391304, "grad_norm": 1.8820289469285605, "learning_rate": 3.313928118962906e-07, "loss": 0.329, "step": 6324 }, { "epoch": 0.8870967741935484, "grad_norm": 1.7805049074057675, "learning_rate": 3.305801853026985e-07, "loss": 0.2933, "step": 6325 }, { "epoch": 0.8872370266479663, "grad_norm": 1.6507930468143621, "learning_rate": 3.297685222112623e-07, "loss": 0.2983, "step": 6326 }, { "epoch": 0.8873772791023843, "grad_norm": 1.8365418433727847, "learning_rate": 3.2895782278946244e-07, "loss": 0.3528, "step": 6327 }, { "epoch": 0.8875175315568022, "grad_norm": 1.8484968084009057, "learning_rate": 3.2814808720458226e-07, "loss": 0.3511, "step": 6328 }, { "epoch": 0.8876577840112202, "grad_norm": 2.2172339189225623, "learning_rate": 3.2733931562370257e-07, "loss": 0.3376, "step": 6329 }, { "epoch": 0.8877980364656382, "grad_norm": 1.805411519720349, "learning_rate": 3.265315082137099e-07, "loss": 0.2986, "step": 6330 }, { "epoch": 0.8879382889200561, "grad_norm": 1.893011242385263, "learning_rate": 3.2572466514128876e-07, "loss": 0.3043, "step": 6331 }, { "epoch": 0.8880785413744741, "grad_norm": 2.4741823329238963, "learning_rate": 3.2491878657292643e-07, "loss": 0.3805, "step": 6332 }, { "epoch": 0.888218793828892, "grad_norm": 2.211128361420085, "learning_rate": 3.2411387267490937e-07, "loss": 0.327, "step": 6333 }, { "epoch": 0.88835904628331, "grad_norm": 1.8169338977611755, "learning_rate": 3.233099236133264e-07, "loss": 0.2745, "step": 6334 }, { "epoch": 0.8884992987377279, "grad_norm": 2.1614977239366913, "learning_rate": 3.2250693955406697e-07, "loss": 0.3514, "step": 6335 }, { "epoch": 0.8886395511921459, "grad_norm": 5.061884901567189, "learning_rate": 3.217049206628209e-07, "loss": 0.3363, "step": 6336 }, { "epoch": 0.8887798036465638, "grad_norm": 5.0219397796459715, "learning_rate": 3.2090386710507906e-07, "loss": 0.3207, "step": 6337 }, { "epoch": 0.8889200561009818, "grad_norm": 2.380255055264623, "learning_rate": 3.201037790461342e-07, "loss": 0.3436, "step": 6338 }, { "epoch": 0.8890603085553997, "grad_norm": 2.099317521503431, "learning_rate": 3.193046566510777e-07, "loss": 0.3419, "step": 6339 }, { "epoch": 0.8892005610098177, "grad_norm": 4.856192868558223, "learning_rate": 3.185065000848031e-07, "loss": 0.3244, "step": 6340 }, { "epoch": 0.8893408134642357, "grad_norm": 1.8512882077783777, "learning_rate": 3.1770930951200483e-07, "loss": 0.3461, "step": 6341 }, { "epoch": 0.8894810659186536, "grad_norm": 1.7256280778403572, "learning_rate": 3.16913085097178e-07, "loss": 0.3362, "step": 6342 }, { "epoch": 0.8896213183730716, "grad_norm": 2.257556412373256, "learning_rate": 3.161178270046167e-07, "loss": 0.3812, "step": 6343 }, { "epoch": 0.8897615708274895, "grad_norm": 1.942270619953052, "learning_rate": 3.15323535398418e-07, "loss": 0.3305, "step": 6344 }, { "epoch": 0.8899018232819075, "grad_norm": 2.513542118591529, "learning_rate": 3.14530210442478e-07, "loss": 0.3796, "step": 6345 }, { "epoch": 0.8900420757363254, "grad_norm": 2.9497962234720156, "learning_rate": 3.1373785230049356e-07, "loss": 0.3605, "step": 6346 }, { "epoch": 0.8901823281907434, "grad_norm": 2.4746160228612855, "learning_rate": 3.129464611359634e-07, "loss": 0.3978, "step": 6347 }, { "epoch": 0.8903225806451613, "grad_norm": 2.0952153558137048, "learning_rate": 3.12156037112184e-07, "loss": 0.3059, "step": 6348 }, { "epoch": 0.8904628330995792, "grad_norm": 1.708006556201296, "learning_rate": 3.1136658039225497e-07, "loss": 0.3166, "step": 6349 }, { "epoch": 0.8906030855539971, "grad_norm": 1.800262575071448, "learning_rate": 3.105780911390738e-07, "loss": 0.3589, "step": 6350 }, { "epoch": 0.8907433380084151, "grad_norm": 1.7566654786329698, "learning_rate": 3.097905695153408e-07, "loss": 0.2929, "step": 6351 }, { "epoch": 0.890883590462833, "grad_norm": 1.8035065721561834, "learning_rate": 3.090040156835555e-07, "loss": 0.3514, "step": 6352 }, { "epoch": 0.891023842917251, "grad_norm": 1.7010293368725664, "learning_rate": 3.0821842980601756e-07, "loss": 0.3043, "step": 6353 }, { "epoch": 0.891164095371669, "grad_norm": 1.7516822903290543, "learning_rate": 3.0743381204482726e-07, "loss": 0.3359, "step": 6354 }, { "epoch": 0.8913043478260869, "grad_norm": 1.7836326591535243, "learning_rate": 3.066501625618851e-07, "loss": 0.3246, "step": 6355 }, { "epoch": 0.8914446002805049, "grad_norm": 1.7662744593563413, "learning_rate": 3.058674815188917e-07, "loss": 0.3267, "step": 6356 }, { "epoch": 0.8915848527349228, "grad_norm": 2.1919827986272016, "learning_rate": 3.0508576907734734e-07, "loss": 0.3202, "step": 6357 }, { "epoch": 0.8917251051893408, "grad_norm": 2.9177214940319165, "learning_rate": 3.043050253985541e-07, "loss": 0.3252, "step": 6358 }, { "epoch": 0.8918653576437587, "grad_norm": 1.811432754426676, "learning_rate": 3.0352525064361147e-07, "loss": 0.3388, "step": 6359 }, { "epoch": 0.8920056100981767, "grad_norm": 1.736775635591189, "learning_rate": 3.0274644497342133e-07, "loss": 0.3246, "step": 6360 }, { "epoch": 0.8921458625525946, "grad_norm": 1.8262877402038598, "learning_rate": 3.0196860854868447e-07, "loss": 0.2895, "step": 6361 }, { "epoch": 0.8922861150070126, "grad_norm": 1.7191176974850781, "learning_rate": 3.0119174152990204e-07, "loss": 0.3255, "step": 6362 }, { "epoch": 0.8924263674614306, "grad_norm": 1.9905385092924774, "learning_rate": 3.0041584407737577e-07, "loss": 0.3416, "step": 6363 }, { "epoch": 0.8925666199158485, "grad_norm": 2.838382392171113, "learning_rate": 2.996409163512054e-07, "loss": 0.2938, "step": 6364 }, { "epoch": 0.8927068723702665, "grad_norm": 1.7577040575273946, "learning_rate": 2.9886695851129297e-07, "loss": 0.3255, "step": 6365 }, { "epoch": 0.8928471248246844, "grad_norm": 1.705563680371963, "learning_rate": 2.980939707173391e-07, "loss": 0.3891, "step": 6366 }, { "epoch": 0.8929873772791024, "grad_norm": 2.20109670353286, "learning_rate": 2.9732195312884515e-07, "loss": 0.3688, "step": 6367 }, { "epoch": 0.8931276297335203, "grad_norm": 2.0139455970000237, "learning_rate": 2.965509059051097e-07, "loss": 0.3176, "step": 6368 }, { "epoch": 0.8932678821879383, "grad_norm": 1.6841023466854168, "learning_rate": 2.9578082920523387e-07, "loss": 0.3734, "step": 6369 }, { "epoch": 0.8934081346423562, "grad_norm": 2.105283824605893, "learning_rate": 2.9501172318811834e-07, "loss": 0.3286, "step": 6370 }, { "epoch": 0.8935483870967742, "grad_norm": 2.508199026202168, "learning_rate": 2.9424358801246167e-07, "loss": 0.3455, "step": 6371 }, { "epoch": 0.8936886395511922, "grad_norm": 1.7060103167750753, "learning_rate": 2.934764238367632e-07, "loss": 0.3388, "step": 6372 }, { "epoch": 0.8938288920056101, "grad_norm": 2.207762794448052, "learning_rate": 2.927102308193225e-07, "loss": 0.3049, "step": 6373 }, { "epoch": 0.8939691444600281, "grad_norm": 2.377933296968513, "learning_rate": 2.91945009118238e-07, "loss": 0.3223, "step": 6374 }, { "epoch": 0.894109396914446, "grad_norm": 2.134356264895068, "learning_rate": 2.911807588914078e-07, "loss": 0.323, "step": 6375 }, { "epoch": 0.894249649368864, "grad_norm": 3.4754981308274724, "learning_rate": 2.904174802965293e-07, "loss": 0.3468, "step": 6376 }, { "epoch": 0.8943899018232819, "grad_norm": 1.6229265629764944, "learning_rate": 2.8965517349110015e-07, "loss": 0.3615, "step": 6377 }, { "epoch": 0.8945301542776999, "grad_norm": 1.7426960976812025, "learning_rate": 2.888938386324169e-07, "loss": 0.3728, "step": 6378 }, { "epoch": 0.8946704067321178, "grad_norm": 2.5721471667982874, "learning_rate": 2.8813347587757667e-07, "loss": 0.3275, "step": 6379 }, { "epoch": 0.8948106591865358, "grad_norm": 7.014666859335583, "learning_rate": 2.873740853834728e-07, "loss": 0.364, "step": 6380 }, { "epoch": 0.8949509116409538, "grad_norm": 1.8831372119546843, "learning_rate": 2.866156673068016e-07, "loss": 0.2918, "step": 6381 }, { "epoch": 0.8950911640953717, "grad_norm": 1.6226080382691666, "learning_rate": 2.858582218040573e-07, "loss": 0.3077, "step": 6382 }, { "epoch": 0.8952314165497897, "grad_norm": 1.7182186345701416, "learning_rate": 2.851017490315333e-07, "loss": 0.3249, "step": 6383 }, { "epoch": 0.8953716690042076, "grad_norm": 1.6869435982538288, "learning_rate": 2.843462491453219e-07, "loss": 0.3067, "step": 6384 }, { "epoch": 0.8955119214586256, "grad_norm": 2.1809606628018785, "learning_rate": 2.8359172230131626e-07, "loss": 0.3509, "step": 6385 }, { "epoch": 0.8956521739130435, "grad_norm": 2.210615909763722, "learning_rate": 2.828381686552073e-07, "loss": 0.3623, "step": 6386 }, { "epoch": 0.8957924263674615, "grad_norm": 1.5366265242595494, "learning_rate": 2.820855883624857e-07, "loss": 0.3078, "step": 6387 }, { "epoch": 0.8959326788218794, "grad_norm": 1.74725473879402, "learning_rate": 2.813339815784416e-07, "loss": 0.3224, "step": 6388 }, { "epoch": 0.8960729312762973, "grad_norm": 2.6935483604778945, "learning_rate": 2.8058334845816214e-07, "loss": 0.3646, "step": 6389 }, { "epoch": 0.8962131837307152, "grad_norm": 1.9151870226856185, "learning_rate": 2.7983368915653674e-07, "loss": 0.2947, "step": 6390 }, { "epoch": 0.8963534361851332, "grad_norm": 1.7127801039565111, "learning_rate": 2.790850038282522e-07, "loss": 0.3424, "step": 6391 }, { "epoch": 0.8964936886395511, "grad_norm": 2.3244892379698694, "learning_rate": 2.7833729262779383e-07, "loss": 0.3343, "step": 6392 }, { "epoch": 0.8966339410939691, "grad_norm": 3.043613037537069, "learning_rate": 2.7759055570944715e-07, "loss": 0.3596, "step": 6393 }, { "epoch": 0.896774193548387, "grad_norm": 1.659154971490423, "learning_rate": 2.768447932272955e-07, "loss": 0.3176, "step": 6394 }, { "epoch": 0.896914446002805, "grad_norm": 1.9829444835315149, "learning_rate": 2.76100005335222e-07, "loss": 0.2822, "step": 6395 }, { "epoch": 0.897054698457223, "grad_norm": 2.3087549556726494, "learning_rate": 2.753561921869091e-07, "loss": 0.3237, "step": 6396 }, { "epoch": 0.8971949509116409, "grad_norm": 2.4650936816135123, "learning_rate": 2.74613353935837e-07, "loss": 0.3091, "step": 6397 }, { "epoch": 0.8973352033660589, "grad_norm": 2.315761019563859, "learning_rate": 2.7387149073528464e-07, "loss": 0.3411, "step": 6398 }, { "epoch": 0.8974754558204768, "grad_norm": 1.9166401225477205, "learning_rate": 2.731306027383318e-07, "loss": 0.336, "step": 6399 }, { "epoch": 0.8976157082748948, "grad_norm": 2.5515846751383022, "learning_rate": 2.72390690097854e-07, "loss": 0.3392, "step": 6400 }, { "epoch": 0.8977559607293127, "grad_norm": 1.8463056277273768, "learning_rate": 2.7165175296652746e-07, "loss": 0.3794, "step": 6401 }, { "epoch": 0.8978962131837307, "grad_norm": 2.2725232570284843, "learning_rate": 2.7091379149682683e-07, "loss": 0.3407, "step": 6402 }, { "epoch": 0.8980364656381487, "grad_norm": 1.8793436600026, "learning_rate": 2.7017680584102537e-07, "loss": 0.3409, "step": 6403 }, { "epoch": 0.8981767180925666, "grad_norm": 2.0360867941067706, "learning_rate": 2.694407961511947e-07, "loss": 0.319, "step": 6404 }, { "epoch": 0.8983169705469846, "grad_norm": 2.130367456538409, "learning_rate": 2.6870576257920553e-07, "loss": 0.3355, "step": 6405 }, { "epoch": 0.8984572230014025, "grad_norm": 1.8509436108219248, "learning_rate": 2.6797170527672723e-07, "loss": 0.3405, "step": 6406 }, { "epoch": 0.8985974754558205, "grad_norm": 2.442118841317848, "learning_rate": 2.672386243952263e-07, "loss": 0.362, "step": 6407 }, { "epoch": 0.8987377279102384, "grad_norm": 2.011796020208985, "learning_rate": 2.6650652008597067e-07, "loss": 0.3043, "step": 6408 }, { "epoch": 0.8988779803646564, "grad_norm": 1.8372185316350906, "learning_rate": 2.657753925000228e-07, "loss": 0.3341, "step": 6409 }, { "epoch": 0.8990182328190743, "grad_norm": 1.5716785922496417, "learning_rate": 2.6504524178824706e-07, "loss": 0.3359, "step": 6410 }, { "epoch": 0.8991584852734923, "grad_norm": 1.8070496132293217, "learning_rate": 2.643160681013046e-07, "loss": 0.3408, "step": 6411 }, { "epoch": 0.8992987377279102, "grad_norm": 1.5685709695371164, "learning_rate": 2.6358787158965616e-07, "loss": 0.3508, "step": 6412 }, { "epoch": 0.8994389901823282, "grad_norm": 2.749902896900691, "learning_rate": 2.628606524035582e-07, "loss": 0.3598, "step": 6413 }, { "epoch": 0.8995792426367462, "grad_norm": 1.6787762676351194, "learning_rate": 2.62134410693069e-07, "loss": 0.2974, "step": 6414 }, { "epoch": 0.8997194950911641, "grad_norm": 2.3596508546958583, "learning_rate": 2.6140914660804205e-07, "loss": 0.3483, "step": 6415 }, { "epoch": 0.8998597475455821, "grad_norm": 1.7489755901903796, "learning_rate": 2.6068486029813154e-07, "loss": 0.3472, "step": 6416 }, { "epoch": 0.9, "grad_norm": 3.43849764131923, "learning_rate": 2.599615519127885e-07, "loss": 0.3005, "step": 6417 }, { "epoch": 0.900140252454418, "grad_norm": 2.081259636384117, "learning_rate": 2.592392216012629e-07, "loss": 0.3582, "step": 6418 }, { "epoch": 0.9002805049088359, "grad_norm": 2.6991352780542623, "learning_rate": 2.585178695126023e-07, "loss": 0.3901, "step": 6419 }, { "epoch": 0.9004207573632539, "grad_norm": 1.8602617857889976, "learning_rate": 2.577974957956536e-07, "loss": 0.35, "step": 6420 }, { "epoch": 0.9005610098176718, "grad_norm": 1.876405443516473, "learning_rate": 2.5707810059905914e-07, "loss": 0.3109, "step": 6421 }, { "epoch": 0.9007012622720898, "grad_norm": 1.7687461071173758, "learning_rate": 2.5635968407126175e-07, "loss": 0.364, "step": 6422 }, { "epoch": 0.9008415147265078, "grad_norm": 1.875204456626101, "learning_rate": 2.556422463605024e-07, "loss": 0.3147, "step": 6423 }, { "epoch": 0.9009817671809257, "grad_norm": 2.2041094589260966, "learning_rate": 2.549257876148181e-07, "loss": 0.3424, "step": 6424 }, { "epoch": 0.9011220196353437, "grad_norm": 3.062173164210323, "learning_rate": 2.542103079820463e-07, "loss": 0.3089, "step": 6425 }, { "epoch": 0.9012622720897616, "grad_norm": 1.8930857384112518, "learning_rate": 2.534958076098204e-07, "loss": 0.3855, "step": 6426 }, { "epoch": 0.9014025245441796, "grad_norm": 2.0237503104536585, "learning_rate": 2.5278228664557315e-07, "loss": 0.2977, "step": 6427 }, { "epoch": 0.9015427769985975, "grad_norm": 2.8737569692559544, "learning_rate": 2.520697452365345e-07, "loss": 0.3242, "step": 6428 }, { "epoch": 0.9016830294530155, "grad_norm": 2.060874298260323, "learning_rate": 2.513581835297324e-07, "loss": 0.3887, "step": 6429 }, { "epoch": 0.9018232819074333, "grad_norm": 4.4434439380689525, "learning_rate": 2.506476016719922e-07, "loss": 0.3178, "step": 6430 }, { "epoch": 0.9019635343618513, "grad_norm": 2.0877908195714006, "learning_rate": 2.499379998099377e-07, "loss": 0.357, "step": 6431 }, { "epoch": 0.9021037868162692, "grad_norm": 1.9384758245433105, "learning_rate": 2.492293780899907e-07, "loss": 0.3236, "step": 6432 }, { "epoch": 0.9022440392706872, "grad_norm": 2.668964697177363, "learning_rate": 2.4852173665837034e-07, "loss": 0.3664, "step": 6433 }, { "epoch": 0.9023842917251051, "grad_norm": 1.869608885208811, "learning_rate": 2.478150756610925e-07, "loss": 0.321, "step": 6434 }, { "epoch": 0.9025245441795231, "grad_norm": 2.1069577530661516, "learning_rate": 2.4710939524397235e-07, "loss": 0.3268, "step": 6435 }, { "epoch": 0.9026647966339411, "grad_norm": 1.9612765683963285, "learning_rate": 2.4640469555262226e-07, "loss": 0.3717, "step": 6436 }, { "epoch": 0.902805049088359, "grad_norm": 2.0318256442817595, "learning_rate": 2.4570097673245197e-07, "loss": 0.3759, "step": 6437 }, { "epoch": 0.902945301542777, "grad_norm": 1.7072184654247193, "learning_rate": 2.4499823892866924e-07, "loss": 0.3135, "step": 6438 }, { "epoch": 0.9030855539971949, "grad_norm": 2.2673687768416793, "learning_rate": 2.442964822862781e-07, "loss": 0.3281, "step": 6439 }, { "epoch": 0.9032258064516129, "grad_norm": 2.159538415226643, "learning_rate": 2.4359570695008327e-07, "loss": 0.3629, "step": 6440 }, { "epoch": 0.9033660589060308, "grad_norm": 1.7782799571117813, "learning_rate": 2.4289591306468244e-07, "loss": 0.2931, "step": 6441 }, { "epoch": 0.9035063113604488, "grad_norm": 2.531211475973964, "learning_rate": 2.4219710077447446e-07, "loss": 0.3785, "step": 6442 }, { "epoch": 0.9036465638148667, "grad_norm": 1.6680236378556847, "learning_rate": 2.4149927022365406e-07, "loss": 0.3628, "step": 6443 }, { "epoch": 0.9037868162692847, "grad_norm": 1.8115116581700923, "learning_rate": 2.4080242155621327e-07, "loss": 0.3502, "step": 6444 }, { "epoch": 0.9039270687237027, "grad_norm": 3.7752446177470573, "learning_rate": 2.401065549159426e-07, "loss": 0.3296, "step": 6445 }, { "epoch": 0.9040673211781206, "grad_norm": 2.0571705714130015, "learning_rate": 2.394116704464294e-07, "loss": 0.319, "step": 6446 }, { "epoch": 0.9042075736325386, "grad_norm": 1.8337153845850283, "learning_rate": 2.387177682910574e-07, "loss": 0.3131, "step": 6447 }, { "epoch": 0.9043478260869565, "grad_norm": 1.898020380608137, "learning_rate": 2.3802484859300922e-07, "loss": 0.3488, "step": 6448 }, { "epoch": 0.9044880785413745, "grad_norm": 2.1345956356010043, "learning_rate": 2.3733291149526495e-07, "loss": 0.3383, "step": 6449 }, { "epoch": 0.9046283309957924, "grad_norm": 1.8144939898435781, "learning_rate": 2.366419571405981e-07, "loss": 0.3612, "step": 6450 }, { "epoch": 0.9047685834502104, "grad_norm": 1.9396023572841739, "learning_rate": 2.3595198567158473e-07, "loss": 0.37, "step": 6451 }, { "epoch": 0.9049088359046283, "grad_norm": 1.9392519382645634, "learning_rate": 2.352629972305942e-07, "loss": 0.3211, "step": 6452 }, { "epoch": 0.9050490883590463, "grad_norm": 1.7169912876981206, "learning_rate": 2.3457499195979616e-07, "loss": 0.3248, "step": 6453 }, { "epoch": 0.9051893408134642, "grad_norm": 1.8409406303932152, "learning_rate": 2.3388797000115427e-07, "loss": 0.3319, "step": 6454 }, { "epoch": 0.9053295932678822, "grad_norm": 2.7406971242324687, "learning_rate": 2.3320193149643067e-07, "loss": 0.3605, "step": 6455 }, { "epoch": 0.9054698457223002, "grad_norm": 2.961669026296774, "learning_rate": 2.325168765871849e-07, "loss": 0.3153, "step": 6456 }, { "epoch": 0.9056100981767181, "grad_norm": 1.6950753744915061, "learning_rate": 2.318328054147734e-07, "loss": 0.3574, "step": 6457 }, { "epoch": 0.9057503506311361, "grad_norm": 1.9092387548807057, "learning_rate": 2.3114971812034981e-07, "loss": 0.3308, "step": 6458 }, { "epoch": 0.905890603085554, "grad_norm": 1.7908026881707528, "learning_rate": 2.304676148448637e-07, "loss": 0.3726, "step": 6459 }, { "epoch": 0.906030855539972, "grad_norm": 2.146073139704114, "learning_rate": 2.2978649572906298e-07, "loss": 0.3341, "step": 6460 }, { "epoch": 0.9061711079943899, "grad_norm": 2.0290856200945084, "learning_rate": 2.2910636091349192e-07, "loss": 0.3498, "step": 6461 }, { "epoch": 0.9063113604488079, "grad_norm": 2.0355555752143877, "learning_rate": 2.2842721053849048e-07, "loss": 0.3419, "step": 6462 }, { "epoch": 0.9064516129032258, "grad_norm": 1.8168796373642095, "learning_rate": 2.2774904474419768e-07, "loss": 0.3271, "step": 6463 }, { "epoch": 0.9065918653576438, "grad_norm": 2.452611104021504, "learning_rate": 2.2707186367054767e-07, "loss": 0.3328, "step": 6464 }, { "epoch": 0.9067321178120618, "grad_norm": 2.234727951663096, "learning_rate": 2.2639566745727203e-07, "loss": 0.3079, "step": 6465 }, { "epoch": 0.9068723702664797, "grad_norm": 1.9857965618938644, "learning_rate": 2.2572045624389972e-07, "loss": 0.3555, "step": 6466 }, { "epoch": 0.9070126227208977, "grad_norm": 1.5458213223027557, "learning_rate": 2.2504623016975536e-07, "loss": 0.3482, "step": 6467 }, { "epoch": 0.9071528751753156, "grad_norm": 1.9122035127091819, "learning_rate": 2.24372989373961e-07, "loss": 0.3457, "step": 6468 }, { "epoch": 0.9072931276297336, "grad_norm": 2.4124207083365934, "learning_rate": 2.23700733995435e-07, "loss": 0.2914, "step": 6469 }, { "epoch": 0.9074333800841514, "grad_norm": 1.971585789674584, "learning_rate": 2.2302946417289305e-07, "loss": 0.418, "step": 6470 }, { "epoch": 0.9075736325385694, "grad_norm": 2.1671853789502733, "learning_rate": 2.223591800448466e-07, "loss": 0.3333, "step": 6471 }, { "epoch": 0.9077138849929873, "grad_norm": 1.8622393304071339, "learning_rate": 2.2168988174960382e-07, "loss": 0.3354, "step": 6472 }, { "epoch": 0.9078541374474053, "grad_norm": 2.2991299102968035, "learning_rate": 2.2102156942526986e-07, "loss": 0.364, "step": 6473 }, { "epoch": 0.9079943899018232, "grad_norm": 2.0240534135909893, "learning_rate": 2.203542432097472e-07, "loss": 0.3136, "step": 6474 }, { "epoch": 0.9081346423562412, "grad_norm": 1.3943806200927298, "learning_rate": 2.1968790324073285e-07, "loss": 0.295, "step": 6475 }, { "epoch": 0.9082748948106592, "grad_norm": 2.145686493327701, "learning_rate": 2.1902254965572134e-07, "loss": 0.362, "step": 6476 }, { "epoch": 0.9084151472650771, "grad_norm": 1.9714300896064172, "learning_rate": 2.1835818259200448e-07, "loss": 0.3827, "step": 6477 }, { "epoch": 0.9085553997194951, "grad_norm": 1.7382667798072229, "learning_rate": 2.1769480218666927e-07, "loss": 0.3179, "step": 6478 }, { "epoch": 0.908695652173913, "grad_norm": 3.221825620936998, "learning_rate": 2.1703240857659958e-07, "loss": 0.3752, "step": 6479 }, { "epoch": 0.908835904628331, "grad_norm": 1.9515586981266762, "learning_rate": 2.163710018984766e-07, "loss": 0.2827, "step": 6480 }, { "epoch": 0.9089761570827489, "grad_norm": 2.067412104554531, "learning_rate": 2.1571058228877617e-07, "loss": 0.3329, "step": 6481 }, { "epoch": 0.9091164095371669, "grad_norm": 1.4986873287177127, "learning_rate": 2.1505114988377096e-07, "loss": 0.288, "step": 6482 }, { "epoch": 0.9092566619915848, "grad_norm": 2.3155555127795298, "learning_rate": 2.14392704819531e-07, "loss": 0.3225, "step": 6483 }, { "epoch": 0.9093969144460028, "grad_norm": 3.6750651035900033, "learning_rate": 2.137352472319215e-07, "loss": 0.3276, "step": 6484 }, { "epoch": 0.9095371669004207, "grad_norm": 2.05581146944025, "learning_rate": 2.1307877725660398e-07, "loss": 0.3108, "step": 6485 }, { "epoch": 0.9096774193548387, "grad_norm": 2.0669038169935225, "learning_rate": 2.124232950290367e-07, "loss": 0.3097, "step": 6486 }, { "epoch": 0.9098176718092567, "grad_norm": 1.8422944740769531, "learning_rate": 2.117688006844737e-07, "loss": 0.3132, "step": 6487 }, { "epoch": 0.9099579242636746, "grad_norm": 1.784975873319275, "learning_rate": 2.1111529435796584e-07, "loss": 0.3492, "step": 6488 }, { "epoch": 0.9100981767180926, "grad_norm": 2.1534436525758776, "learning_rate": 2.104627761843592e-07, "loss": 0.3334, "step": 6489 }, { "epoch": 0.9102384291725105, "grad_norm": 2.2759782679011353, "learning_rate": 2.0981124629829651e-07, "loss": 0.3185, "step": 6490 }, { "epoch": 0.9103786816269285, "grad_norm": 2.0355962178880844, "learning_rate": 2.0916070483421592e-07, "loss": 0.3262, "step": 6491 }, { "epoch": 0.9105189340813464, "grad_norm": 2.61705030394832, "learning_rate": 2.0851115192635218e-07, "loss": 0.3758, "step": 6492 }, { "epoch": 0.9106591865357644, "grad_norm": 2.392020223538441, "learning_rate": 2.0786258770873647e-07, "loss": 0.3347, "step": 6493 }, { "epoch": 0.9107994389901823, "grad_norm": 1.8840452092247766, "learning_rate": 2.0721501231519558e-07, "loss": 0.3383, "step": 6494 }, { "epoch": 0.9109396914446003, "grad_norm": 1.5870277903959382, "learning_rate": 2.065684258793521e-07, "loss": 0.32, "step": 6495 }, { "epoch": 0.9110799438990183, "grad_norm": 1.963483396385638, "learning_rate": 2.0592282853462377e-07, "loss": 0.3402, "step": 6496 }, { "epoch": 0.9112201963534362, "grad_norm": 1.9601562889223791, "learning_rate": 2.0527822041422563e-07, "loss": 0.3095, "step": 6497 }, { "epoch": 0.9113604488078542, "grad_norm": 1.6955009390879858, "learning_rate": 2.04634601651168e-07, "loss": 0.3416, "step": 6498 }, { "epoch": 0.9115007012622721, "grad_norm": 3.94637654374392, "learning_rate": 2.039919723782574e-07, "loss": 0.335, "step": 6499 }, { "epoch": 0.9116409537166901, "grad_norm": 2.609268460411564, "learning_rate": 2.0335033272809612e-07, "loss": 0.3162, "step": 6500 }, { "epoch": 0.911781206171108, "grad_norm": 1.8922804843474608, "learning_rate": 2.0270968283308102e-07, "loss": 0.3167, "step": 6501 }, { "epoch": 0.911921458625526, "grad_norm": 1.866661859687792, "learning_rate": 2.0207002282540744e-07, "loss": 0.3533, "step": 6502 }, { "epoch": 0.9120617110799439, "grad_norm": 1.711954868176322, "learning_rate": 2.014313528370626e-07, "loss": 0.2689, "step": 6503 }, { "epoch": 0.9122019635343619, "grad_norm": 2.61727367657796, "learning_rate": 2.0079367299983276e-07, "loss": 0.3634, "step": 6504 }, { "epoch": 0.9123422159887798, "grad_norm": 1.664704391539879, "learning_rate": 2.0015698344529877e-07, "loss": 0.3155, "step": 6505 }, { "epoch": 0.9124824684431978, "grad_norm": 1.683728984880033, "learning_rate": 1.9952128430483718e-07, "loss": 0.3319, "step": 6506 }, { "epoch": 0.9126227208976158, "grad_norm": 2.766460917378963, "learning_rate": 1.9888657570961924e-07, "loss": 0.3634, "step": 6507 }, { "epoch": 0.9127629733520337, "grad_norm": 2.0925759966869544, "learning_rate": 1.9825285779061344e-07, "loss": 0.283, "step": 6508 }, { "epoch": 0.9129032258064517, "grad_norm": 2.6910608344032707, "learning_rate": 1.9762013067858243e-07, "loss": 0.3418, "step": 6509 }, { "epoch": 0.9130434782608695, "grad_norm": 2.8313984341479577, "learning_rate": 1.9698839450408568e-07, "loss": 0.3192, "step": 6510 }, { "epoch": 0.9131837307152875, "grad_norm": 1.5782042379598649, "learning_rate": 1.9635764939747782e-07, "loss": 0.3261, "step": 6511 }, { "epoch": 0.9133239831697054, "grad_norm": 1.8616428169264159, "learning_rate": 1.9572789548890748e-07, "loss": 0.3028, "step": 6512 }, { "epoch": 0.9134642356241234, "grad_norm": 2.494612897760707, "learning_rate": 1.9509913290832073e-07, "loss": 0.3678, "step": 6513 }, { "epoch": 0.9136044880785413, "grad_norm": 2.3881481029967393, "learning_rate": 1.9447136178545766e-07, "loss": 0.3908, "step": 6514 }, { "epoch": 0.9137447405329593, "grad_norm": 2.2436378387214755, "learning_rate": 1.938445822498558e-07, "loss": 0.3419, "step": 6515 }, { "epoch": 0.9138849929873772, "grad_norm": 1.7663563926709596, "learning_rate": 1.932187944308461e-07, "loss": 0.3199, "step": 6516 }, { "epoch": 0.9140252454417952, "grad_norm": 1.846807434085411, "learning_rate": 1.925939984575548e-07, "loss": 0.3448, "step": 6517 }, { "epoch": 0.9141654978962132, "grad_norm": 1.7260625469239388, "learning_rate": 1.919701944589042e-07, "loss": 0.3109, "step": 6518 }, { "epoch": 0.9143057503506311, "grad_norm": 2.143209081086744, "learning_rate": 1.9134738256361306e-07, "loss": 0.3181, "step": 6519 }, { "epoch": 0.9144460028050491, "grad_norm": 4.028731394744626, "learning_rate": 1.9072556290019362e-07, "loss": 0.3243, "step": 6520 }, { "epoch": 0.914586255259467, "grad_norm": 1.8834519309564963, "learning_rate": 1.9010473559695376e-07, "loss": 0.3135, "step": 6521 }, { "epoch": 0.914726507713885, "grad_norm": 1.933139138772827, "learning_rate": 1.8948490078199767e-07, "loss": 0.3417, "step": 6522 }, { "epoch": 0.9148667601683029, "grad_norm": 2.0987715215982483, "learning_rate": 1.8886605858322304e-07, "loss": 0.326, "step": 6523 }, { "epoch": 0.9150070126227209, "grad_norm": 2.4846504106753216, "learning_rate": 1.8824820912832387e-07, "loss": 0.3308, "step": 6524 }, { "epoch": 0.9151472650771388, "grad_norm": 1.9761975219051628, "learning_rate": 1.8763135254478925e-07, "loss": 0.303, "step": 6525 }, { "epoch": 0.9152875175315568, "grad_norm": 1.8039433650200412, "learning_rate": 1.8701548895990295e-07, "loss": 0.3252, "step": 6526 }, { "epoch": 0.9154277699859747, "grad_norm": 1.8482777233833467, "learning_rate": 1.8640061850074443e-07, "loss": 0.353, "step": 6527 }, { "epoch": 0.9155680224403927, "grad_norm": 1.7612766971554672, "learning_rate": 1.857867412941883e-07, "loss": 0.3308, "step": 6528 }, { "epoch": 0.9157082748948107, "grad_norm": 2.139668494600196, "learning_rate": 1.8517385746690264e-07, "loss": 0.34, "step": 6529 }, { "epoch": 0.9158485273492286, "grad_norm": 2.7150719855444407, "learning_rate": 1.8456196714535302e-07, "loss": 0.3444, "step": 6530 }, { "epoch": 0.9159887798036466, "grad_norm": 6.318024889716728, "learning_rate": 1.839510704557984e-07, "loss": 0.3432, "step": 6531 }, { "epoch": 0.9161290322580645, "grad_norm": 1.4105254559091014, "learning_rate": 1.8334116752429243e-07, "loss": 0.3195, "step": 6532 }, { "epoch": 0.9162692847124825, "grad_norm": 2.309538001062297, "learning_rate": 1.8273225847668442e-07, "loss": 0.3074, "step": 6533 }, { "epoch": 0.9164095371669004, "grad_norm": 1.7117698421540013, "learning_rate": 1.8212434343861886e-07, "loss": 0.3415, "step": 6534 }, { "epoch": 0.9165497896213184, "grad_norm": 2.245402725732361, "learning_rate": 1.8151742253553483e-07, "loss": 0.2881, "step": 6535 }, { "epoch": 0.9166900420757363, "grad_norm": 1.8490478565589743, "learning_rate": 1.8091149589266554e-07, "loss": 0.2994, "step": 6536 }, { "epoch": 0.9168302945301543, "grad_norm": 1.8075189218546368, "learning_rate": 1.8030656363504152e-07, "loss": 0.2875, "step": 6537 }, { "epoch": 0.9169705469845723, "grad_norm": 1.7207027786029376, "learning_rate": 1.79702625887484e-07, "loss": 0.3254, "step": 6538 }, { "epoch": 0.9171107994389902, "grad_norm": 1.7013413775118333, "learning_rate": 1.7909968277461276e-07, "loss": 0.2961, "step": 6539 }, { "epoch": 0.9172510518934082, "grad_norm": 2.2446303401840373, "learning_rate": 1.7849773442084051e-07, "loss": 0.3078, "step": 6540 }, { "epoch": 0.9173913043478261, "grad_norm": 1.7442635795083248, "learning_rate": 1.7789678095037456e-07, "loss": 0.3159, "step": 6541 }, { "epoch": 0.9175315568022441, "grad_norm": 2.2332372369541384, "learning_rate": 1.7729682248721848e-07, "loss": 0.2935, "step": 6542 }, { "epoch": 0.917671809256662, "grad_norm": 1.814182158605373, "learning_rate": 1.7669785915516935e-07, "loss": 0.3287, "step": 6543 }, { "epoch": 0.91781206171108, "grad_norm": 1.8916335135200069, "learning_rate": 1.7609989107781834e-07, "loss": 0.3098, "step": 6544 }, { "epoch": 0.9179523141654979, "grad_norm": 2.346933263063092, "learning_rate": 1.7550291837855226e-07, "loss": 0.3237, "step": 6545 }, { "epoch": 0.9180925666199159, "grad_norm": 2.5337828220674905, "learning_rate": 1.7490694118055263e-07, "loss": 0.3159, "step": 6546 }, { "epoch": 0.9182328190743339, "grad_norm": 1.8358312697823946, "learning_rate": 1.7431195960679436e-07, "loss": 0.3344, "step": 6547 }, { "epoch": 0.9183730715287518, "grad_norm": 2.047690309243461, "learning_rate": 1.7371797378004874e-07, "loss": 0.3044, "step": 6548 }, { "epoch": 0.9185133239831698, "grad_norm": 1.8988222587854253, "learning_rate": 1.731249838228799e-07, "loss": 0.3778, "step": 6549 }, { "epoch": 0.9186535764375876, "grad_norm": 2.363687578059927, "learning_rate": 1.7253298985764777e-07, "loss": 0.2884, "step": 6550 }, { "epoch": 0.9187938288920056, "grad_norm": 2.0565524032523457, "learning_rate": 1.7194199200650518e-07, "loss": 0.3357, "step": 6551 }, { "epoch": 0.9189340813464235, "grad_norm": 1.9636810684975385, "learning_rate": 1.7135199039140239e-07, "loss": 0.3413, "step": 6552 }, { "epoch": 0.9190743338008415, "grad_norm": 1.955582682223935, "learning_rate": 1.7076298513407973e-07, "loss": 0.3264, "step": 6553 }, { "epoch": 0.9192145862552594, "grad_norm": 1.8403737465008818, "learning_rate": 1.701749763560756e-07, "loss": 0.3126, "step": 6554 }, { "epoch": 0.9193548387096774, "grad_norm": 2.2001739649915883, "learning_rate": 1.695879641787207e-07, "loss": 0.3524, "step": 6555 }, { "epoch": 0.9194950911640953, "grad_norm": 1.7826117279647893, "learning_rate": 1.69001948723142e-07, "loss": 0.2939, "step": 6556 }, { "epoch": 0.9196353436185133, "grad_norm": 1.7222002944097243, "learning_rate": 1.684169301102595e-07, "loss": 0.3165, "step": 6557 }, { "epoch": 0.9197755960729312, "grad_norm": 1.9297374147453752, "learning_rate": 1.6783290846078714e-07, "loss": 0.3266, "step": 6558 }, { "epoch": 0.9199158485273492, "grad_norm": 1.7498777210566778, "learning_rate": 1.6724988389523356e-07, "loss": 0.3386, "step": 6559 }, { "epoch": 0.9200561009817672, "grad_norm": 2.0468470447757037, "learning_rate": 1.666678565339025e-07, "loss": 0.3195, "step": 6560 }, { "epoch": 0.9201963534361851, "grad_norm": 2.0537156972498933, "learning_rate": 1.6608682649689068e-07, "loss": 0.3448, "step": 6561 }, { "epoch": 0.9203366058906031, "grad_norm": 2.138053407471259, "learning_rate": 1.6550679390408998e-07, "loss": 0.3311, "step": 6562 }, { "epoch": 0.920476858345021, "grad_norm": 2.1394554340154373, "learning_rate": 1.649277588751863e-07, "loss": 0.3118, "step": 6563 }, { "epoch": 0.920617110799439, "grad_norm": 2.02860275307692, "learning_rate": 1.6434972152965855e-07, "loss": 0.3199, "step": 6564 }, { "epoch": 0.9207573632538569, "grad_norm": 3.0271142491133465, "learning_rate": 1.6377268198678131e-07, "loss": 0.3489, "step": 6565 }, { "epoch": 0.9208976157082749, "grad_norm": 2.511646228176392, "learning_rate": 1.6319664036562266e-07, "loss": 0.3517, "step": 6566 }, { "epoch": 0.9210378681626928, "grad_norm": 2.047690192810529, "learning_rate": 1.6262159678504475e-07, "loss": 0.3429, "step": 6567 }, { "epoch": 0.9211781206171108, "grad_norm": 6.131734143350992, "learning_rate": 1.620475513637032e-07, "loss": 0.311, "step": 6568 }, { "epoch": 0.9213183730715288, "grad_norm": 1.902329086466522, "learning_rate": 1.614745042200494e-07, "loss": 0.3442, "step": 6569 }, { "epoch": 0.9214586255259467, "grad_norm": 1.7882286027363872, "learning_rate": 1.6090245547232707e-07, "loss": 0.3331, "step": 6570 }, { "epoch": 0.9215988779803647, "grad_norm": 1.9829275904952604, "learning_rate": 1.6033140523857405e-07, "loss": 0.3329, "step": 6571 }, { "epoch": 0.9217391304347826, "grad_norm": 1.646009274828564, "learning_rate": 1.5976135363662383e-07, "loss": 0.3373, "step": 6572 }, { "epoch": 0.9218793828892006, "grad_norm": 1.879228401720701, "learning_rate": 1.5919230078410064e-07, "loss": 0.3128, "step": 6573 }, { "epoch": 0.9220196353436185, "grad_norm": 2.382185556450233, "learning_rate": 1.5862424679842614e-07, "loss": 0.3585, "step": 6574 }, { "epoch": 0.9221598877980365, "grad_norm": 1.6139994724372086, "learning_rate": 1.5805719179681377e-07, "loss": 0.3096, "step": 6575 }, { "epoch": 0.9223001402524544, "grad_norm": 2.164650406836998, "learning_rate": 1.5749113589627108e-07, "loss": 0.3076, "step": 6576 }, { "epoch": 0.9224403927068724, "grad_norm": 1.9358338606143035, "learning_rate": 1.5692607921360014e-07, "loss": 0.3191, "step": 6577 }, { "epoch": 0.9225806451612903, "grad_norm": 1.9165922924483476, "learning_rate": 1.5636202186539663e-07, "loss": 0.3283, "step": 6578 }, { "epoch": 0.9227208976157083, "grad_norm": 1.711457961537682, "learning_rate": 1.557989639680496e-07, "loss": 0.3365, "step": 6579 }, { "epoch": 0.9228611500701263, "grad_norm": 2.067580122219716, "learning_rate": 1.5523690563774175e-07, "loss": 0.3176, "step": 6580 }, { "epoch": 0.9230014025245442, "grad_norm": 1.844478980313012, "learning_rate": 1.5467584699045024e-07, "loss": 0.3226, "step": 6581 }, { "epoch": 0.9231416549789622, "grad_norm": 1.7999907307915974, "learning_rate": 1.5411578814194583e-07, "loss": 0.332, "step": 6582 }, { "epoch": 0.9232819074333801, "grad_norm": 2.083644106256067, "learning_rate": 1.535567292077922e-07, "loss": 0.332, "step": 6583 }, { "epoch": 0.9234221598877981, "grad_norm": 2.34832398729741, "learning_rate": 1.5299867030334815e-07, "loss": 0.3739, "step": 6584 }, { "epoch": 0.923562412342216, "grad_norm": 1.806899659428255, "learning_rate": 1.5244161154376437e-07, "loss": 0.3198, "step": 6585 }, { "epoch": 0.923702664796634, "grad_norm": 2.1579342221478157, "learning_rate": 1.518855530439861e-07, "loss": 0.3349, "step": 6586 }, { "epoch": 0.9238429172510519, "grad_norm": 1.8378865386885033, "learning_rate": 1.5133049491875275e-07, "loss": 0.3376, "step": 6587 }, { "epoch": 0.9239831697054699, "grad_norm": 2.114911534688992, "learning_rate": 1.5077643728259594e-07, "loss": 0.3455, "step": 6588 }, { "epoch": 0.9241234221598879, "grad_norm": 2.6276515103256743, "learning_rate": 1.502233802498415e-07, "loss": 0.3219, "step": 6589 }, { "epoch": 0.9242636746143057, "grad_norm": 1.6685174758585788, "learning_rate": 1.4967132393460983e-07, "loss": 0.3253, "step": 6590 }, { "epoch": 0.9244039270687237, "grad_norm": 1.5947954825221315, "learning_rate": 1.491202684508136e-07, "loss": 0.3282, "step": 6591 }, { "epoch": 0.9245441795231416, "grad_norm": 2.08878352286138, "learning_rate": 1.4857021391215865e-07, "loss": 0.2955, "step": 6592 }, { "epoch": 0.9246844319775596, "grad_norm": 1.7592033478662086, "learning_rate": 1.4802116043214575e-07, "loss": 0.3638, "step": 6593 }, { "epoch": 0.9248246844319775, "grad_norm": 1.888648584295217, "learning_rate": 1.4747310812406768e-07, "loss": 0.2916, "step": 6594 }, { "epoch": 0.9249649368863955, "grad_norm": 2.429384267841834, "learning_rate": 1.4692605710101116e-07, "loss": 0.2917, "step": 6595 }, { "epoch": 0.9251051893408134, "grad_norm": 1.8467883275280157, "learning_rate": 1.4638000747585646e-07, "loss": 0.3678, "step": 6596 }, { "epoch": 0.9252454417952314, "grad_norm": 1.9553472465170143, "learning_rate": 1.4583495936127678e-07, "loss": 0.3524, "step": 6597 }, { "epoch": 0.9253856942496493, "grad_norm": 1.7344101223525308, "learning_rate": 1.4529091286973994e-07, "loss": 0.3559, "step": 6598 }, { "epoch": 0.9255259467040673, "grad_norm": 2.2156352356323206, "learning_rate": 1.447478681135056e-07, "loss": 0.3386, "step": 6599 }, { "epoch": 0.9256661991584852, "grad_norm": 1.813486291287178, "learning_rate": 1.442058252046269e-07, "loss": 0.3612, "step": 6600 }, { "epoch": 0.9258064516129032, "grad_norm": 2.0140367504808836, "learning_rate": 1.43664784254951e-07, "loss": 0.3665, "step": 6601 }, { "epoch": 0.9259467040673212, "grad_norm": 2.1154893208980035, "learning_rate": 1.4312474537611752e-07, "loss": 0.3432, "step": 6602 }, { "epoch": 0.9260869565217391, "grad_norm": 1.420607032914001, "learning_rate": 1.425857086795601e-07, "loss": 0.3033, "step": 6603 }, { "epoch": 0.9262272089761571, "grad_norm": 3.13226054744572, "learning_rate": 1.420476742765059e-07, "loss": 0.3281, "step": 6604 }, { "epoch": 0.926367461430575, "grad_norm": 1.9455346895966856, "learning_rate": 1.415106422779733e-07, "loss": 0.3429, "step": 6605 }, { "epoch": 0.926507713884993, "grad_norm": 1.8218811218263558, "learning_rate": 1.409746127947753e-07, "loss": 0.3259, "step": 6606 }, { "epoch": 0.9266479663394109, "grad_norm": 2.150655371560418, "learning_rate": 1.4043958593751794e-07, "loss": 0.3263, "step": 6607 }, { "epoch": 0.9267882187938289, "grad_norm": 1.6566892077539037, "learning_rate": 1.3990556181660065e-07, "loss": 0.3049, "step": 6608 }, { "epoch": 0.9269284712482468, "grad_norm": 3.342102215192838, "learning_rate": 1.3937254054221526e-07, "loss": 0.3078, "step": 6609 }, { "epoch": 0.9270687237026648, "grad_norm": 1.871233845286181, "learning_rate": 1.388405222243472e-07, "loss": 0.3534, "step": 6610 }, { "epoch": 0.9272089761570828, "grad_norm": 1.6744606487699778, "learning_rate": 1.3830950697277468e-07, "loss": 0.3173, "step": 6611 }, { "epoch": 0.9273492286115007, "grad_norm": 1.871660501102518, "learning_rate": 1.3777949489706898e-07, "loss": 0.3467, "step": 6612 }, { "epoch": 0.9274894810659187, "grad_norm": 2.239072756267944, "learning_rate": 1.3725048610659487e-07, "loss": 0.3303, "step": 6613 }, { "epoch": 0.9276297335203366, "grad_norm": 1.5782304483064018, "learning_rate": 1.367224807105083e-07, "loss": 0.2848, "step": 6614 }, { "epoch": 0.9277699859747546, "grad_norm": 2.122793061386278, "learning_rate": 1.3619547881776052e-07, "loss": 0.2982, "step": 6615 }, { "epoch": 0.9279102384291725, "grad_norm": 1.9083302513108775, "learning_rate": 1.356694805370945e-07, "loss": 0.3444, "step": 6616 }, { "epoch": 0.9280504908835905, "grad_norm": 2.4186182582571405, "learning_rate": 1.3514448597704623e-07, "loss": 0.3216, "step": 6617 }, { "epoch": 0.9281907433380084, "grad_norm": 2.068188538669029, "learning_rate": 1.3462049524594456e-07, "loss": 0.331, "step": 6618 }, { "epoch": 0.9283309957924264, "grad_norm": 2.415676429270153, "learning_rate": 1.3409750845191138e-07, "loss": 0.3587, "step": 6619 }, { "epoch": 0.9284712482468443, "grad_norm": 1.6002750339156029, "learning_rate": 1.335755257028626e-07, "loss": 0.2921, "step": 6620 }, { "epoch": 0.9286115007012623, "grad_norm": 2.794741536099729, "learning_rate": 1.330545471065031e-07, "loss": 0.3061, "step": 6621 }, { "epoch": 0.9287517531556803, "grad_norm": 1.9764234772895763, "learning_rate": 1.3253457277033533e-07, "loss": 0.3683, "step": 6622 }, { "epoch": 0.9288920056100982, "grad_norm": 2.168578136131443, "learning_rate": 1.3201560280165117e-07, "loss": 0.3605, "step": 6623 }, { "epoch": 0.9290322580645162, "grad_norm": 2.134183338323099, "learning_rate": 1.3149763730753772e-07, "loss": 0.338, "step": 6624 }, { "epoch": 0.9291725105189341, "grad_norm": 1.782302679239299, "learning_rate": 1.3098067639487232e-07, "loss": 0.2931, "step": 6625 }, { "epoch": 0.9293127629733521, "grad_norm": 1.9152591001162227, "learning_rate": 1.3046472017032685e-07, "loss": 0.3095, "step": 6626 }, { "epoch": 0.92945301542777, "grad_norm": 1.9278451255823978, "learning_rate": 1.2994976874036503e-07, "loss": 0.2964, "step": 6627 }, { "epoch": 0.929593267882188, "grad_norm": 1.8916573346393755, "learning_rate": 1.2943582221124296e-07, "loss": 0.3704, "step": 6628 }, { "epoch": 0.929733520336606, "grad_norm": 3.2316006911023827, "learning_rate": 1.2892288068901136e-07, "loss": 0.3516, "step": 6629 }, { "epoch": 0.9298737727910238, "grad_norm": 2.1125142791084883, "learning_rate": 1.284109442795106e-07, "loss": 0.337, "step": 6630 }, { "epoch": 0.9300140252454417, "grad_norm": 2.1681403014895877, "learning_rate": 1.2790001308837618e-07, "loss": 0.3384, "step": 6631 }, { "epoch": 0.9301542776998597, "grad_norm": 1.4015100067412902, "learning_rate": 1.2739008722103486e-07, "loss": 0.3236, "step": 6632 }, { "epoch": 0.9302945301542777, "grad_norm": 2.4628169557714794, "learning_rate": 1.2688116678270636e-07, "loss": 0.3647, "step": 6633 }, { "epoch": 0.9304347826086956, "grad_norm": 2.7381745016973635, "learning_rate": 1.2637325187840332e-07, "loss": 0.3959, "step": 6634 }, { "epoch": 0.9305750350631136, "grad_norm": 1.8364509027005826, "learning_rate": 1.2586634261292918e-07, "loss": 0.319, "step": 6635 }, { "epoch": 0.9307152875175315, "grad_norm": 1.6682706108586483, "learning_rate": 1.253604390908819e-07, "loss": 0.3287, "step": 6636 }, { "epoch": 0.9308555399719495, "grad_norm": 2.8326916996594993, "learning_rate": 1.2485554141665134e-07, "loss": 0.3121, "step": 6637 }, { "epoch": 0.9309957924263674, "grad_norm": 2.032210020468175, "learning_rate": 1.2435164969441915e-07, "loss": 0.2921, "step": 6638 }, { "epoch": 0.9311360448807854, "grad_norm": 1.9140064464847182, "learning_rate": 1.2384876402815993e-07, "loss": 0.3429, "step": 6639 }, { "epoch": 0.9312762973352033, "grad_norm": 2.026400131509312, "learning_rate": 1.2334688452164122e-07, "loss": 0.3125, "step": 6640 }, { "epoch": 0.9314165497896213, "grad_norm": 1.953947885733953, "learning_rate": 1.2284601127842187e-07, "loss": 0.337, "step": 6641 }, { "epoch": 0.9315568022440393, "grad_norm": 1.870401847827674, "learning_rate": 1.2234614440185365e-07, "loss": 0.3916, "step": 6642 }, { "epoch": 0.9316970546984572, "grad_norm": 1.7539224942480975, "learning_rate": 1.2184728399508016e-07, "loss": 0.2934, "step": 6643 }, { "epoch": 0.9318373071528752, "grad_norm": 1.8106569587960397, "learning_rate": 1.2134943016103794e-07, "loss": 0.3192, "step": 6644 }, { "epoch": 0.9319775596072931, "grad_norm": 1.7074997038722306, "learning_rate": 1.208525830024565e-07, "loss": 0.3364, "step": 6645 }, { "epoch": 0.9321178120617111, "grad_norm": 2.174307688882564, "learning_rate": 1.2035674262185603e-07, "loss": 0.3045, "step": 6646 }, { "epoch": 0.932258064516129, "grad_norm": 1.4831958303405273, "learning_rate": 1.198619091215497e-07, "loss": 0.334, "step": 6647 }, { "epoch": 0.932398316970547, "grad_norm": 2.7656068962927067, "learning_rate": 1.1936808260364252e-07, "loss": 0.3343, "step": 6648 }, { "epoch": 0.9325385694249649, "grad_norm": 1.731182355970807, "learning_rate": 1.1887526317003351e-07, "loss": 0.3103, "step": 6649 }, { "epoch": 0.9326788218793829, "grad_norm": 2.330214004839386, "learning_rate": 1.1838345092241132e-07, "loss": 0.3474, "step": 6650 }, { "epoch": 0.9328190743338008, "grad_norm": 1.5974586569819582, "learning_rate": 1.1789264596225814e-07, "loss": 0.309, "step": 6651 }, { "epoch": 0.9329593267882188, "grad_norm": 1.91959177247065, "learning_rate": 1.1740284839084848e-07, "loss": 0.3632, "step": 6652 }, { "epoch": 0.9330995792426368, "grad_norm": 1.8530330049732362, "learning_rate": 1.1691405830924873e-07, "loss": 0.31, "step": 6653 }, { "epoch": 0.9332398316970547, "grad_norm": 1.7005358272278972, "learning_rate": 1.1642627581831767e-07, "loss": 0.3176, "step": 6654 }, { "epoch": 0.9333800841514727, "grad_norm": 2.117864971932148, "learning_rate": 1.1593950101870422e-07, "loss": 0.3399, "step": 6655 }, { "epoch": 0.9335203366058906, "grad_norm": 1.7568738677768594, "learning_rate": 1.1545373401085247e-07, "loss": 0.3267, "step": 6656 }, { "epoch": 0.9336605890603086, "grad_norm": 1.8403935673911973, "learning_rate": 1.149689748949967e-07, "loss": 0.324, "step": 6657 }, { "epoch": 0.9338008415147265, "grad_norm": 1.8586898952019615, "learning_rate": 1.14485223771163e-07, "loss": 0.3058, "step": 6658 }, { "epoch": 0.9339410939691445, "grad_norm": 2.1421602410182725, "learning_rate": 1.1400248073917042e-07, "loss": 0.2896, "step": 6659 }, { "epoch": 0.9340813464235624, "grad_norm": 1.8791247456929219, "learning_rate": 1.1352074589862983e-07, "loss": 0.3249, "step": 6660 }, { "epoch": 0.9342215988779804, "grad_norm": 2.6790799266800605, "learning_rate": 1.1304001934894393e-07, "loss": 0.2742, "step": 6661 }, { "epoch": 0.9343618513323984, "grad_norm": 1.7939718262151678, "learning_rate": 1.1256030118930727e-07, "loss": 0.3381, "step": 6662 }, { "epoch": 0.9345021037868163, "grad_norm": 1.650901652114266, "learning_rate": 1.1208159151870567e-07, "loss": 0.3402, "step": 6663 }, { "epoch": 0.9346423562412343, "grad_norm": 1.7405745904769603, "learning_rate": 1.116038904359179e-07, "loss": 0.294, "step": 6664 }, { "epoch": 0.9347826086956522, "grad_norm": 1.9748711869930793, "learning_rate": 1.1112719803951455e-07, "loss": 0.3481, "step": 6665 }, { "epoch": 0.9349228611500702, "grad_norm": 2.0185098038672256, "learning_rate": 1.1065151442785749e-07, "loss": 0.3115, "step": 6666 }, { "epoch": 0.9350631136044881, "grad_norm": 2.0087902493767764, "learning_rate": 1.1017683969910042e-07, "loss": 0.3115, "step": 6667 }, { "epoch": 0.9352033660589061, "grad_norm": 2.3521472292640966, "learning_rate": 1.0970317395119001e-07, "loss": 0.3321, "step": 6668 }, { "epoch": 0.935343618513324, "grad_norm": 2.2172591882666493, "learning_rate": 1.0923051728186251e-07, "loss": 0.3421, "step": 6669 }, { "epoch": 0.9354838709677419, "grad_norm": 2.1276466493011066, "learning_rate": 1.0875886978864881e-07, "loss": 0.3583, "step": 6670 }, { "epoch": 0.9356241234221598, "grad_norm": 1.8247800041227011, "learning_rate": 1.0828823156886881e-07, "loss": 0.3307, "step": 6671 }, { "epoch": 0.9357643758765778, "grad_norm": 1.9219088667700115, "learning_rate": 1.0781860271963651e-07, "loss": 0.3331, "step": 6672 }, { "epoch": 0.9359046283309957, "grad_norm": 1.9015074346589105, "learning_rate": 1.0734998333785607e-07, "loss": 0.3184, "step": 6673 }, { "epoch": 0.9360448807854137, "grad_norm": 2.6042317395027363, "learning_rate": 1.0688237352022346e-07, "loss": 0.4029, "step": 6674 }, { "epoch": 0.9361851332398317, "grad_norm": 1.5007378035238323, "learning_rate": 1.0641577336322761e-07, "loss": 0.3433, "step": 6675 }, { "epoch": 0.9363253856942496, "grad_norm": 2.0792788407129343, "learning_rate": 1.0595018296314763e-07, "loss": 0.321, "step": 6676 }, { "epoch": 0.9364656381486676, "grad_norm": 2.0656938965542486, "learning_rate": 1.0548560241605444e-07, "loss": 0.3342, "step": 6677 }, { "epoch": 0.9366058906030855, "grad_norm": 1.480984157737675, "learning_rate": 1.0502203181781135e-07, "loss": 0.2905, "step": 6678 }, { "epoch": 0.9367461430575035, "grad_norm": 2.0277413215437385, "learning_rate": 1.0455947126407296e-07, "loss": 0.3149, "step": 6679 }, { "epoch": 0.9368863955119214, "grad_norm": 2.0502269758781364, "learning_rate": 1.040979208502857e-07, "loss": 0.3087, "step": 6680 }, { "epoch": 0.9370266479663394, "grad_norm": 2.528514184219136, "learning_rate": 1.0363738067168672e-07, "loss": 0.3182, "step": 6681 }, { "epoch": 0.9371669004207573, "grad_norm": 1.968403437553339, "learning_rate": 1.0317785082330555e-07, "loss": 0.3744, "step": 6682 }, { "epoch": 0.9373071528751753, "grad_norm": 2.2301071060871407, "learning_rate": 1.027193313999636e-07, "loss": 0.2949, "step": 6683 }, { "epoch": 0.9374474053295933, "grad_norm": 1.750376660819731, "learning_rate": 1.0226182249627181e-07, "loss": 0.3229, "step": 6684 }, { "epoch": 0.9375876577840112, "grad_norm": 7.738464137635623, "learning_rate": 1.0180532420663525e-07, "loss": 0.3512, "step": 6685 }, { "epoch": 0.9377279102384292, "grad_norm": 2.8562025804986537, "learning_rate": 1.0134983662524856e-07, "loss": 0.3003, "step": 6686 }, { "epoch": 0.9378681626928471, "grad_norm": 2.171436292950088, "learning_rate": 1.0089535984609766e-07, "loss": 0.2919, "step": 6687 }, { "epoch": 0.9380084151472651, "grad_norm": 1.5988770836017374, "learning_rate": 1.0044189396296144e-07, "loss": 0.3163, "step": 6688 }, { "epoch": 0.938148667601683, "grad_norm": 2.519233248334775, "learning_rate": 9.998943906941005e-08, "loss": 0.3628, "step": 6689 }, { "epoch": 0.938288920056101, "grad_norm": 1.7104082377602061, "learning_rate": 9.953799525880325e-08, "loss": 0.2953, "step": 6690 }, { "epoch": 0.9384291725105189, "grad_norm": 1.9062556282335585, "learning_rate": 9.908756262429376e-08, "loss": 0.3263, "step": 6691 }, { "epoch": 0.9385694249649369, "grad_norm": 1.4382548630763323, "learning_rate": 9.863814125882498e-08, "loss": 0.3036, "step": 6692 }, { "epoch": 0.9387096774193548, "grad_norm": 2.4152025413482123, "learning_rate": 9.818973125513275e-08, "loss": 0.3388, "step": 6693 }, { "epoch": 0.9388499298737728, "grad_norm": 2.0095498253690725, "learning_rate": 9.774233270574252e-08, "loss": 0.34, "step": 6694 }, { "epoch": 0.9389901823281908, "grad_norm": 1.9968226346834501, "learning_rate": 9.729594570297207e-08, "loss": 0.3456, "step": 6695 }, { "epoch": 0.9391304347826087, "grad_norm": 2.3881906319379778, "learning_rate": 9.685057033892998e-08, "loss": 0.3246, "step": 6696 }, { "epoch": 0.9392706872370267, "grad_norm": 2.515793883536749, "learning_rate": 9.640620670551659e-08, "loss": 0.3152, "step": 6697 }, { "epoch": 0.9394109396914446, "grad_norm": 1.6561729125311107, "learning_rate": 9.596285489442359e-08, "loss": 0.365, "step": 6698 }, { "epoch": 0.9395511921458626, "grad_norm": 1.8681782284731467, "learning_rate": 9.552051499713278e-08, "loss": 0.3052, "step": 6699 }, { "epoch": 0.9396914446002805, "grad_norm": 2.5307647805155846, "learning_rate": 9.507918710491838e-08, "loss": 0.4002, "step": 6700 }, { "epoch": 0.9398316970546985, "grad_norm": 1.705043656917329, "learning_rate": 9.46388713088453e-08, "loss": 0.3678, "step": 6701 }, { "epoch": 0.9399719495091164, "grad_norm": 1.6747395583409663, "learning_rate": 9.419956769976979e-08, "loss": 0.3711, "step": 6702 }, { "epoch": 0.9401122019635344, "grad_norm": 1.8401993005042068, "learning_rate": 9.376127636833876e-08, "loss": 0.3355, "step": 6703 }, { "epoch": 0.9402524544179524, "grad_norm": 1.5423021639138137, "learning_rate": 9.332399740499043e-08, "loss": 0.3463, "step": 6704 }, { "epoch": 0.9403927068723703, "grad_norm": 1.5704777568953194, "learning_rate": 9.288773089995484e-08, "loss": 0.2712, "step": 6705 }, { "epoch": 0.9405329593267883, "grad_norm": 2.2328915239662432, "learning_rate": 9.245247694325166e-08, "loss": 0.3356, "step": 6706 }, { "epoch": 0.9406732117812062, "grad_norm": 1.823704649143701, "learning_rate": 9.201823562469347e-08, "loss": 0.3364, "step": 6707 }, { "epoch": 0.9408134642356242, "grad_norm": 3.1361528886194145, "learning_rate": 9.158500703388252e-08, "loss": 0.3633, "step": 6708 }, { "epoch": 0.9409537166900421, "grad_norm": 1.9773325268303703, "learning_rate": 9.115279126021226e-08, "loss": 0.2843, "step": 6709 }, { "epoch": 0.94109396914446, "grad_norm": 1.685644047523068, "learning_rate": 9.072158839286748e-08, "loss": 0.3549, "step": 6710 }, { "epoch": 0.9412342215988779, "grad_norm": 1.6767550397882214, "learning_rate": 9.029139852082425e-08, "loss": 0.2785, "step": 6711 }, { "epoch": 0.9413744740532959, "grad_norm": 1.8905452364655115, "learning_rate": 8.986222173284876e-08, "loss": 0.3298, "step": 6712 }, { "epoch": 0.9415147265077138, "grad_norm": 2.266881870357177, "learning_rate": 8.94340581174985e-08, "loss": 0.3333, "step": 6713 }, { "epoch": 0.9416549789621318, "grad_norm": 1.9823558476919985, "learning_rate": 8.900690776312282e-08, "loss": 0.3121, "step": 6714 }, { "epoch": 0.9417952314165497, "grad_norm": 2.0148965864882356, "learning_rate": 8.85807707578612e-08, "loss": 0.2502, "step": 6715 }, { "epoch": 0.9419354838709677, "grad_norm": 1.6007710059505142, "learning_rate": 8.815564718964331e-08, "loss": 0.3006, "step": 6716 }, { "epoch": 0.9420757363253857, "grad_norm": 1.899377597958946, "learning_rate": 8.773153714619064e-08, "loss": 0.3231, "step": 6717 }, { "epoch": 0.9422159887798036, "grad_norm": 1.8475743733303898, "learning_rate": 8.730844071501599e-08, "loss": 0.3441, "step": 6718 }, { "epoch": 0.9423562412342216, "grad_norm": 1.6455136805579718, "learning_rate": 8.688635798342116e-08, "loss": 0.3173, "step": 6719 }, { "epoch": 0.9424964936886395, "grad_norm": 1.6979893237835335, "learning_rate": 8.646528903850093e-08, "loss": 0.3171, "step": 6720 }, { "epoch": 0.9426367461430575, "grad_norm": 2.4574258605701567, "learning_rate": 8.604523396713915e-08, "loss": 0.3302, "step": 6721 }, { "epoch": 0.9427769985974754, "grad_norm": 2.1320135275892906, "learning_rate": 8.562619285601259e-08, "loss": 0.3699, "step": 6722 }, { "epoch": 0.9429172510518934, "grad_norm": 1.8041230346948087, "learning_rate": 8.520816579158598e-08, "loss": 0.3006, "step": 6723 }, { "epoch": 0.9430575035063113, "grad_norm": 1.741294871653738, "learning_rate": 8.479115286011752e-08, "loss": 0.3084, "step": 6724 }, { "epoch": 0.9431977559607293, "grad_norm": 2.2473719401468952, "learning_rate": 8.437515414765341e-08, "loss": 0.3175, "step": 6725 }, { "epoch": 0.9433380084151473, "grad_norm": 5.341308214337735, "learning_rate": 8.396016974003385e-08, "loss": 0.3475, "step": 6726 }, { "epoch": 0.9434782608695652, "grad_norm": 2.320098147782597, "learning_rate": 8.354619972288703e-08, "loss": 0.3404, "step": 6727 }, { "epoch": 0.9436185133239832, "grad_norm": 1.706750632818319, "learning_rate": 8.313324418163238e-08, "loss": 0.3166, "step": 6728 }, { "epoch": 0.9437587657784011, "grad_norm": 1.4860137727337914, "learning_rate": 8.272130320148063e-08, "loss": 0.2915, "step": 6729 }, { "epoch": 0.9438990182328191, "grad_norm": 2.1778567703133076, "learning_rate": 8.231037686743326e-08, "loss": 0.3054, "step": 6730 }, { "epoch": 0.944039270687237, "grad_norm": 2.0111889187752587, "learning_rate": 8.190046526428241e-08, "loss": 0.2894, "step": 6731 }, { "epoch": 0.944179523141655, "grad_norm": 2.107421875, "learning_rate": 8.149156847660933e-08, "loss": 0.3369, "step": 6732 }, { "epoch": 0.9443197755960729, "grad_norm": 1.9532220434875733, "learning_rate": 8.108368658878818e-08, "loss": 0.3248, "step": 6733 }, { "epoch": 0.9444600280504909, "grad_norm": 5.560519090749869, "learning_rate": 8.067681968498164e-08, "loss": 0.3306, "step": 6734 }, { "epoch": 0.9446002805049089, "grad_norm": 1.8531577404918933, "learning_rate": 8.027096784914479e-08, "loss": 0.3191, "step": 6735 }, { "epoch": 0.9447405329593268, "grad_norm": 2.5112560552027507, "learning_rate": 7.986613116502173e-08, "loss": 0.3602, "step": 6736 }, { "epoch": 0.9448807854137448, "grad_norm": 2.0638125318068186, "learning_rate": 7.946230971614732e-08, "loss": 0.3504, "step": 6737 }, { "epoch": 0.9450210378681627, "grad_norm": 1.9234644202324795, "learning_rate": 7.905950358584768e-08, "loss": 0.3603, "step": 6738 }, { "epoch": 0.9451612903225807, "grad_norm": 1.640491371162321, "learning_rate": 7.865771285723911e-08, "loss": 0.3496, "step": 6739 }, { "epoch": 0.9453015427769986, "grad_norm": 2.5441573920415648, "learning_rate": 7.825693761322861e-08, "loss": 0.3406, "step": 6740 }, { "epoch": 0.9454417952314166, "grad_norm": 2.2453493161723728, "learning_rate": 7.785717793651282e-08, "loss": 0.3872, "step": 6741 }, { "epoch": 0.9455820476858345, "grad_norm": 1.8699859334039226, "learning_rate": 7.745843390957908e-08, "loss": 0.3319, "step": 6742 }, { "epoch": 0.9457223001402525, "grad_norm": 2.010658236469907, "learning_rate": 7.706070561470657e-08, "loss": 0.3546, "step": 6743 }, { "epoch": 0.9458625525946704, "grad_norm": 1.8456122481967792, "learning_rate": 7.666399313396245e-08, "loss": 0.3384, "step": 6744 }, { "epoch": 0.9460028050490884, "grad_norm": 1.9208561127057369, "learning_rate": 7.626829654920732e-08, "loss": 0.3176, "step": 6745 }, { "epoch": 0.9461430575035064, "grad_norm": 2.0927113476192423, "learning_rate": 7.587361594208808e-08, "loss": 0.3233, "step": 6746 }, { "epoch": 0.9462833099579243, "grad_norm": 2.2825182172167366, "learning_rate": 7.54799513940463e-08, "loss": 0.32, "step": 6747 }, { "epoch": 0.9464235624123423, "grad_norm": 1.4624331564992614, "learning_rate": 7.508730298631084e-08, "loss": 0.2874, "step": 6748 }, { "epoch": 0.9465638148667602, "grad_norm": 1.6642034050113415, "learning_rate": 7.469567079990248e-08, "loss": 0.353, "step": 6749 }, { "epoch": 0.9467040673211781, "grad_norm": 1.8105449655928105, "learning_rate": 7.430505491563101e-08, "loss": 0.3307, "step": 6750 }, { "epoch": 0.946844319775596, "grad_norm": 2.0276922907583823, "learning_rate": 7.391545541409806e-08, "loss": 0.2869, "step": 6751 }, { "epoch": 0.946984572230014, "grad_norm": 2.3169712451900644, "learning_rate": 7.352687237569489e-08, "loss": 0.2957, "step": 6752 }, { "epoch": 0.9471248246844319, "grad_norm": 1.8697647758682472, "learning_rate": 7.31393058806018e-08, "loss": 0.3187, "step": 6753 }, { "epoch": 0.9472650771388499, "grad_norm": 1.7739001103461847, "learning_rate": 7.275275600879206e-08, "loss": 0.3277, "step": 6754 }, { "epoch": 0.9474053295932678, "grad_norm": 2.775332467035578, "learning_rate": 7.236722284002573e-08, "loss": 0.3454, "step": 6755 }, { "epoch": 0.9475455820476858, "grad_norm": 1.9544554794522409, "learning_rate": 7.198270645385641e-08, "loss": 0.3635, "step": 6756 }, { "epoch": 0.9476858345021038, "grad_norm": 2.2120813437020765, "learning_rate": 7.159920692962563e-08, "loss": 0.342, "step": 6757 }, { "epoch": 0.9478260869565217, "grad_norm": 3.272276315060281, "learning_rate": 7.12167243464662e-08, "loss": 0.2782, "step": 6758 }, { "epoch": 0.9479663394109397, "grad_norm": 2.099498089347273, "learning_rate": 7.08352587833e-08, "loss": 0.325, "step": 6759 }, { "epoch": 0.9481065918653576, "grad_norm": 1.919260992044781, "learning_rate": 7.045481031884071e-08, "loss": 0.3272, "step": 6760 }, { "epoch": 0.9482468443197756, "grad_norm": 1.8680636852076475, "learning_rate": 7.007537903159057e-08, "loss": 0.3586, "step": 6761 }, { "epoch": 0.9483870967741935, "grad_norm": 2.195413281716921, "learning_rate": 6.969696499984246e-08, "loss": 0.3797, "step": 6762 }, { "epoch": 0.9485273492286115, "grad_norm": 1.8759226754038234, "learning_rate": 6.931956830168007e-08, "loss": 0.3632, "step": 6763 }, { "epoch": 0.9486676016830294, "grad_norm": 2.071448939087052, "learning_rate": 6.894318901497665e-08, "loss": 0.3281, "step": 6764 }, { "epoch": 0.9488078541374474, "grad_norm": 1.8984849711946763, "learning_rate": 6.856782721739452e-08, "loss": 0.31, "step": 6765 }, { "epoch": 0.9489481065918653, "grad_norm": 1.8518359147375056, "learning_rate": 6.819348298638839e-08, "loss": 0.3042, "step": 6766 }, { "epoch": 0.9490883590462833, "grad_norm": 2.034163044143987, "learning_rate": 6.782015639919982e-08, "loss": 0.3073, "step": 6767 }, { "epoch": 0.9492286115007013, "grad_norm": 1.9289667428265918, "learning_rate": 6.744784753286382e-08, "loss": 0.317, "step": 6768 }, { "epoch": 0.9493688639551192, "grad_norm": 1.9879177038440172, "learning_rate": 6.70765564642023e-08, "loss": 0.345, "step": 6769 }, { "epoch": 0.9495091164095372, "grad_norm": 1.8874829878893036, "learning_rate": 6.670628326982953e-08, "loss": 0.3215, "step": 6770 }, { "epoch": 0.9496493688639551, "grad_norm": 1.822936561112609, "learning_rate": 6.633702802614828e-08, "loss": 0.323, "step": 6771 }, { "epoch": 0.9497896213183731, "grad_norm": 1.9085431298731017, "learning_rate": 6.596879080935203e-08, "loss": 0.3265, "step": 6772 }, { "epoch": 0.949929873772791, "grad_norm": 2.0445116682794837, "learning_rate": 6.560157169542391e-08, "loss": 0.3244, "step": 6773 }, { "epoch": 0.950070126227209, "grad_norm": 2.3498078450310222, "learning_rate": 6.52353707601372e-08, "loss": 0.3152, "step": 6774 }, { "epoch": 0.9502103786816269, "grad_norm": 2.849720650082794, "learning_rate": 6.487018807905421e-08, "loss": 0.3192, "step": 6775 }, { "epoch": 0.9503506311360449, "grad_norm": 2.029606315510344, "learning_rate": 6.450602372752912e-08, "loss": 0.3417, "step": 6776 }, { "epoch": 0.9504908835904629, "grad_norm": 2.4383671514113265, "learning_rate": 6.414287778070404e-08, "loss": 0.3406, "step": 6777 }, { "epoch": 0.9506311360448808, "grad_norm": 1.958311960090176, "learning_rate": 6.378075031351072e-08, "loss": 0.2919, "step": 6778 }, { "epoch": 0.9507713884992988, "grad_norm": 1.6491996124328188, "learning_rate": 6.34196414006727e-08, "loss": 0.3129, "step": 6779 }, { "epoch": 0.9509116409537167, "grad_norm": 1.8340383243446128, "learning_rate": 6.305955111670204e-08, "loss": 0.3265, "step": 6780 }, { "epoch": 0.9510518934081347, "grad_norm": 2.1330844580002353, "learning_rate": 6.270047953590097e-08, "loss": 0.301, "step": 6781 }, { "epoch": 0.9511921458625526, "grad_norm": 1.9498437403157618, "learning_rate": 6.234242673236079e-08, "loss": 0.2775, "step": 6782 }, { "epoch": 0.9513323983169706, "grad_norm": 2.071992012965888, "learning_rate": 6.198539277996407e-08, "loss": 0.3433, "step": 6783 }, { "epoch": 0.9514726507713885, "grad_norm": 1.7451211494866188, "learning_rate": 6.162937775238187e-08, "loss": 0.3545, "step": 6784 }, { "epoch": 0.9516129032258065, "grad_norm": 1.754293148467464, "learning_rate": 6.127438172307487e-08, "loss": 0.2823, "step": 6785 }, { "epoch": 0.9517531556802244, "grad_norm": 2.6037076520108573, "learning_rate": 6.092040476529504e-08, "loss": 0.3298, "step": 6786 }, { "epoch": 0.9518934081346424, "grad_norm": 2.2871887036906866, "learning_rate": 6.056744695208283e-08, "loss": 0.3013, "step": 6787 }, { "epoch": 0.9520336605890604, "grad_norm": 1.8678150479122384, "learning_rate": 6.021550835626777e-08, "loss": 0.3013, "step": 6788 }, { "epoch": 0.9521739130434783, "grad_norm": 1.8589458451162781, "learning_rate": 5.986458905047066e-08, "loss": 0.3179, "step": 6789 }, { "epoch": 0.9523141654978962, "grad_norm": 2.5860782826988875, "learning_rate": 5.9514689107101345e-08, "loss": 0.3439, "step": 6790 }, { "epoch": 0.9524544179523141, "grad_norm": 2.005740272678113, "learning_rate": 5.9165808598358745e-08, "loss": 0.3183, "step": 6791 }, { "epoch": 0.9525946704067321, "grad_norm": 1.9104044934310975, "learning_rate": 5.881794759623194e-08, "loss": 0.3752, "step": 6792 }, { "epoch": 0.95273492286115, "grad_norm": 1.8990240878384865, "learning_rate": 5.8471106172499625e-08, "loss": 0.3657, "step": 6793 }, { "epoch": 0.952875175315568, "grad_norm": 1.7161652984055515, "learning_rate": 5.8125284398730666e-08, "loss": 0.3108, "step": 6794 }, { "epoch": 0.9530154277699859, "grad_norm": 1.8565069566632817, "learning_rate": 5.778048234628242e-08, "loss": 0.3812, "step": 6795 }, { "epoch": 0.9531556802244039, "grad_norm": 1.7967060838998146, "learning_rate": 5.743670008630298e-08, "loss": 0.3461, "step": 6796 }, { "epoch": 0.9532959326788218, "grad_norm": 1.8520707989995513, "learning_rate": 5.709393768972837e-08, "loss": 0.2878, "step": 6797 }, { "epoch": 0.9534361851332398, "grad_norm": 3.3518715029102, "learning_rate": 5.675219522728648e-08, "loss": 0.3628, "step": 6798 }, { "epoch": 0.9535764375876578, "grad_norm": 2.1697678258071105, "learning_rate": 5.6411472769492547e-08, "loss": 0.323, "step": 6799 }, { "epoch": 0.9537166900420757, "grad_norm": 1.7234031805251036, "learning_rate": 5.607177038665257e-08, "loss": 0.3337, "step": 6800 }, { "epoch": 0.9538569424964937, "grad_norm": 1.8849138745128529, "learning_rate": 5.573308814886158e-08, "loss": 0.3142, "step": 6801 }, { "epoch": 0.9539971949509116, "grad_norm": 2.0951984007489544, "learning_rate": 5.539542612600479e-08, "loss": 0.2973, "step": 6802 }, { "epoch": 0.9541374474053296, "grad_norm": 1.8343039529744198, "learning_rate": 5.5058784387755915e-08, "loss": 0.3276, "step": 6803 }, { "epoch": 0.9542776998597475, "grad_norm": 1.6309108337689948, "learning_rate": 5.472316300357883e-08, "loss": 0.3331, "step": 6804 }, { "epoch": 0.9544179523141655, "grad_norm": 2.203490098502096, "learning_rate": 5.438856204272647e-08, "loss": 0.3487, "step": 6805 }, { "epoch": 0.9545582047685834, "grad_norm": 1.9693165524016785, "learning_rate": 5.405498157424194e-08, "loss": 0.308, "step": 6806 }, { "epoch": 0.9546984572230014, "grad_norm": 1.6907367746912394, "learning_rate": 5.372242166695685e-08, "loss": 0.3226, "step": 6807 }, { "epoch": 0.9548387096774194, "grad_norm": 2.3080101760163916, "learning_rate": 5.339088238949186e-08, "loss": 0.3229, "step": 6808 }, { "epoch": 0.9549789621318373, "grad_norm": 2.465026268345145, "learning_rate": 5.3060363810259475e-08, "loss": 0.3406, "step": 6809 }, { "epoch": 0.9551192145862553, "grad_norm": 2.2878749227481587, "learning_rate": 5.273086599745847e-08, "loss": 0.3091, "step": 6810 }, { "epoch": 0.9552594670406732, "grad_norm": 1.6893987922531368, "learning_rate": 5.2402389019078904e-08, "loss": 0.3075, "step": 6811 }, { "epoch": 0.9553997194950912, "grad_norm": 2.1230025159593606, "learning_rate": 5.207493294289989e-08, "loss": 0.3184, "step": 6812 }, { "epoch": 0.9555399719495091, "grad_norm": 1.8442982004141253, "learning_rate": 5.174849783648905e-08, "loss": 0.3197, "step": 6813 }, { "epoch": 0.9556802244039271, "grad_norm": 1.830772829869067, "learning_rate": 5.142308376720473e-08, "loss": 0.3005, "step": 6814 }, { "epoch": 0.955820476858345, "grad_norm": 1.9777450341229956, "learning_rate": 5.109869080219376e-08, "loss": 0.3054, "step": 6815 }, { "epoch": 0.955960729312763, "grad_norm": 1.7234996711812538, "learning_rate": 5.0775319008392054e-08, "loss": 0.3606, "step": 6816 }, { "epoch": 0.956100981767181, "grad_norm": 1.7733903786200882, "learning_rate": 5.045296845252512e-08, "loss": 0.3299, "step": 6817 }, { "epoch": 0.9562412342215989, "grad_norm": 1.7217231697394833, "learning_rate": 5.013163920110864e-08, "loss": 0.3107, "step": 6818 }, { "epoch": 0.9563814866760169, "grad_norm": 2.526474769404399, "learning_rate": 4.9811331320445135e-08, "loss": 0.2892, "step": 6819 }, { "epoch": 0.9565217391304348, "grad_norm": 2.4136077551312853, "learning_rate": 4.9492044876628396e-08, "loss": 0.3401, "step": 6820 }, { "epoch": 0.9566619915848528, "grad_norm": 1.9785941316444133, "learning_rate": 4.917377993554184e-08, "loss": 0.352, "step": 6821 }, { "epoch": 0.9568022440392707, "grad_norm": 1.9318916612818058, "learning_rate": 4.885653656285627e-08, "loss": 0.3556, "step": 6822 }, { "epoch": 0.9569424964936887, "grad_norm": 2.1368134807111447, "learning_rate": 4.854031482403321e-08, "loss": 0.2664, "step": 6823 }, { "epoch": 0.9570827489481066, "grad_norm": 1.8959370539312435, "learning_rate": 4.822511478432212e-08, "loss": 0.3424, "step": 6824 }, { "epoch": 0.9572230014025246, "grad_norm": 1.882591772793428, "learning_rate": 4.791093650876322e-08, "loss": 0.339, "step": 6825 }, { "epoch": 0.9573632538569425, "grad_norm": 1.6518307874855795, "learning_rate": 4.759778006218407e-08, "loss": 0.3265, "step": 6826 }, { "epoch": 0.9575035063113605, "grad_norm": 1.5995230738796147, "learning_rate": 4.7285645509203e-08, "loss": 0.3134, "step": 6827 }, { "epoch": 0.9576437587657785, "grad_norm": 2.7837506314548004, "learning_rate": 4.6974532914226825e-08, "loss": 0.3061, "step": 6828 }, { "epoch": 0.9577840112201964, "grad_norm": 3.8986960652134877, "learning_rate": 4.666444234145084e-08, "loss": 0.3608, "step": 6829 }, { "epoch": 0.9579242636746143, "grad_norm": 1.789151718380436, "learning_rate": 4.635537385486111e-08, "loss": 0.3235, "step": 6830 }, { "epoch": 0.9580645161290322, "grad_norm": 2.003017771405416, "learning_rate": 4.604732751823049e-08, "loss": 0.3241, "step": 6831 }, { "epoch": 0.9582047685834502, "grad_norm": 1.6175367927199098, "learning_rate": 4.5740303395122585e-08, "loss": 0.3639, "step": 6832 }, { "epoch": 0.9583450210378681, "grad_norm": 1.8295555712195632, "learning_rate": 4.543430154889006e-08, "loss": 0.3156, "step": 6833 }, { "epoch": 0.9584852734922861, "grad_norm": 1.8207387138707252, "learning_rate": 4.512932204267406e-08, "loss": 0.3598, "step": 6834 }, { "epoch": 0.958625525946704, "grad_norm": 2.015989048928081, "learning_rate": 4.482536493940537e-08, "loss": 0.351, "step": 6835 }, { "epoch": 0.958765778401122, "grad_norm": 2.0607370876364484, "learning_rate": 4.45224303018027e-08, "loss": 0.3272, "step": 6836 }, { "epoch": 0.9589060308555399, "grad_norm": 2.244964474947079, "learning_rate": 4.422051819237494e-08, "loss": 0.3191, "step": 6837 }, { "epoch": 0.9590462833099579, "grad_norm": 2.6168137012970836, "learning_rate": 4.3919628673418926e-08, "loss": 0.3311, "step": 6838 }, { "epoch": 0.9591865357643758, "grad_norm": 2.081305686961756, "learning_rate": 4.361976180702221e-08, "loss": 0.2902, "step": 6839 }, { "epoch": 0.9593267882187938, "grad_norm": 1.8400436908261635, "learning_rate": 4.3320917655059744e-08, "loss": 0.335, "step": 6840 }, { "epoch": 0.9594670406732118, "grad_norm": 2.861753547992175, "learning_rate": 4.3023096279195544e-08, "loss": 0.315, "step": 6841 }, { "epoch": 0.9596072931276297, "grad_norm": 1.7847379031484063, "learning_rate": 4.2726297740883215e-08, "loss": 0.3287, "step": 6842 }, { "epoch": 0.9597475455820477, "grad_norm": 2.209708475416086, "learning_rate": 4.2430522101364894e-08, "loss": 0.3106, "step": 6843 }, { "epoch": 0.9598877980364656, "grad_norm": 1.878955356646157, "learning_rate": 4.21357694216723e-08, "loss": 0.3633, "step": 6844 }, { "epoch": 0.9600280504908836, "grad_norm": 1.6570793090930034, "learning_rate": 4.184203976262513e-08, "loss": 0.3415, "step": 6845 }, { "epoch": 0.9601683029453015, "grad_norm": 2.73784500108004, "learning_rate": 4.1549333184832675e-08, "loss": 0.3163, "step": 6846 }, { "epoch": 0.9603085553997195, "grad_norm": 1.6604915504688564, "learning_rate": 4.1257649748693284e-08, "loss": 0.3373, "step": 6847 }, { "epoch": 0.9604488078541374, "grad_norm": 1.9855255035745194, "learning_rate": 4.0966989514392705e-08, "loss": 0.2942, "step": 6848 }, { "epoch": 0.9605890603085554, "grad_norm": 1.9093395273562788, "learning_rate": 4.0677352541907963e-08, "loss": 0.3283, "step": 6849 }, { "epoch": 0.9607293127629734, "grad_norm": 2.351842413683376, "learning_rate": 4.038873889100237e-08, "loss": 0.3659, "step": 6850 }, { "epoch": 0.9608695652173913, "grad_norm": 3.192796980771688, "learning_rate": 4.010114862123049e-08, "loss": 0.3652, "step": 6851 }, { "epoch": 0.9610098176718093, "grad_norm": 2.9252670101006677, "learning_rate": 3.981458179193321e-08, "loss": 0.2928, "step": 6852 }, { "epoch": 0.9611500701262272, "grad_norm": 2.4607695022642666, "learning_rate": 3.952903846224265e-08, "loss": 0.3353, "step": 6853 }, { "epoch": 0.9612903225806452, "grad_norm": 1.8055313581491765, "learning_rate": 3.9244518691078925e-08, "loss": 0.2715, "step": 6854 }, { "epoch": 0.9614305750350631, "grad_norm": 2.114058318207997, "learning_rate": 3.8961022537149505e-08, "loss": 0.3114, "step": 6855 }, { "epoch": 0.9615708274894811, "grad_norm": 2.245820189692927, "learning_rate": 3.86785500589526e-08, "loss": 0.3534, "step": 6856 }, { "epoch": 0.961711079943899, "grad_norm": 2.5238872871260427, "learning_rate": 3.839710131477492e-08, "loss": 0.3688, "step": 6857 }, { "epoch": 0.961851332398317, "grad_norm": 1.7384740829664376, "learning_rate": 3.811667636269001e-08, "loss": 0.3305, "step": 6858 }, { "epoch": 0.961991584852735, "grad_norm": 1.9439769281226988, "learning_rate": 3.7837275260563244e-08, "loss": 0.3543, "step": 6859 }, { "epoch": 0.9621318373071529, "grad_norm": 2.030804277080956, "learning_rate": 3.755889806604629e-08, "loss": 0.3141, "step": 6860 }, { "epoch": 0.9622720897615709, "grad_norm": 2.164263113480276, "learning_rate": 3.728154483657986e-08, "loss": 0.3333, "step": 6861 }, { "epoch": 0.9624123422159888, "grad_norm": 2.285130792459692, "learning_rate": 3.700521562939485e-08, "loss": 0.3338, "step": 6862 }, { "epoch": 0.9625525946704068, "grad_norm": 1.7627436665589045, "learning_rate": 3.672991050150898e-08, "loss": 0.3134, "step": 6863 }, { "epoch": 0.9626928471248247, "grad_norm": 2.022295655441503, "learning_rate": 3.645562950973014e-08, "loss": 0.3168, "step": 6864 }, { "epoch": 0.9628330995792427, "grad_norm": 2.4641930738454705, "learning_rate": 3.618237271065417e-08, "loss": 0.3211, "step": 6865 }, { "epoch": 0.9629733520336606, "grad_norm": 2.177571462516267, "learning_rate": 3.591014016066541e-08, "loss": 0.3516, "step": 6866 }, { "epoch": 0.9631136044880786, "grad_norm": 1.61777644098987, "learning_rate": 3.563893191593726e-08, "loss": 0.3126, "step": 6867 }, { "epoch": 0.9632538569424965, "grad_norm": 1.372134430496963, "learning_rate": 3.5368748032431624e-08, "loss": 0.2821, "step": 6868 }, { "epoch": 0.9633941093969145, "grad_norm": 2.1865836267675984, "learning_rate": 3.509958856590001e-08, "loss": 0.3502, "step": 6869 }, { "epoch": 0.9635343618513323, "grad_norm": 1.674688307385927, "learning_rate": 3.483145357187967e-08, "loss": 0.3405, "step": 6870 }, { "epoch": 0.9636746143057503, "grad_norm": 4.775680589356111, "learning_rate": 3.456434310570023e-08, "loss": 0.3143, "step": 6871 }, { "epoch": 0.9638148667601683, "grad_norm": 1.509979509252502, "learning_rate": 3.429825722247704e-08, "loss": 0.3187, "step": 6872 }, { "epoch": 0.9639551192145862, "grad_norm": 1.7459413602754554, "learning_rate": 3.403319597711563e-08, "loss": 0.3514, "step": 6873 }, { "epoch": 0.9640953716690042, "grad_norm": 2.086018192859291, "learning_rate": 3.3769159424308917e-08, "loss": 0.3305, "step": 6874 }, { "epoch": 0.9642356241234221, "grad_norm": 1.8846004117194253, "learning_rate": 3.3506147618538874e-08, "loss": 0.3638, "step": 6875 }, { "epoch": 0.9643758765778401, "grad_norm": 2.634913028244453, "learning_rate": 3.324416061407709e-08, "loss": 0.3372, "step": 6876 }, { "epoch": 0.964516129032258, "grad_norm": 2.1519258789287607, "learning_rate": 3.298319846498254e-08, "loss": 0.3397, "step": 6877 }, { "epoch": 0.964656381486676, "grad_norm": 1.7700900443928804, "learning_rate": 3.2723261225102164e-08, "loss": 0.3803, "step": 6878 }, { "epoch": 0.9647966339410939, "grad_norm": 2.3261941160841126, "learning_rate": 3.246434894807304e-08, "loss": 0.335, "step": 6879 }, { "epoch": 0.9649368863955119, "grad_norm": 2.317850161438424, "learning_rate": 3.2206461687319666e-08, "loss": 0.2882, "step": 6880 }, { "epoch": 0.9650771388499298, "grad_norm": 1.9930923501346247, "learning_rate": 3.1949599496054475e-08, "loss": 0.3407, "step": 6881 }, { "epoch": 0.9652173913043478, "grad_norm": 2.071194097562228, "learning_rate": 3.169376242728062e-08, "loss": 0.3636, "step": 6882 }, { "epoch": 0.9653576437587658, "grad_norm": 2.075505606397217, "learning_rate": 3.143895053378698e-08, "loss": 0.2992, "step": 6883 }, { "epoch": 0.9654978962131837, "grad_norm": 1.908366919587051, "learning_rate": 3.118516386815318e-08, "loss": 0.3265, "step": 6884 }, { "epoch": 0.9656381486676017, "grad_norm": 2.0316261530094657, "learning_rate": 3.093240248274565e-08, "loss": 0.3436, "step": 6885 }, { "epoch": 0.9657784011220196, "grad_norm": 1.9161664959753528, "learning_rate": 3.068066642972045e-08, "loss": 0.3113, "step": 6886 }, { "epoch": 0.9659186535764376, "grad_norm": 1.8475174640041743, "learning_rate": 3.042995576102104e-08, "loss": 0.3669, "step": 6887 }, { "epoch": 0.9660589060308555, "grad_norm": 3.1987404192072564, "learning_rate": 3.018027052838046e-08, "loss": 0.3377, "step": 6888 }, { "epoch": 0.9661991584852735, "grad_norm": 1.7068394044665627, "learning_rate": 2.993161078331919e-08, "loss": 0.323, "step": 6889 }, { "epoch": 0.9663394109396914, "grad_norm": 2.2114007147818984, "learning_rate": 2.9683976577146166e-08, "loss": 0.3307, "step": 6890 }, { "epoch": 0.9664796633941094, "grad_norm": 2.061199471783769, "learning_rate": 2.9437367960959417e-08, "loss": 0.3147, "step": 6891 }, { "epoch": 0.9666199158485274, "grad_norm": 2.848375352609453, "learning_rate": 2.9191784985644345e-08, "loss": 0.3783, "step": 6892 }, { "epoch": 0.9667601683029453, "grad_norm": 1.8116456353032202, "learning_rate": 2.894722770187597e-08, "loss": 0.3364, "step": 6893 }, { "epoch": 0.9669004207573633, "grad_norm": 1.7995207174962906, "learning_rate": 2.8703696160116146e-08, "loss": 0.3783, "step": 6894 }, { "epoch": 0.9670406732117812, "grad_norm": 3.52820949132285, "learning_rate": 2.8461190410616347e-08, "loss": 0.3397, "step": 6895 }, { "epoch": 0.9671809256661992, "grad_norm": 1.8297594377170325, "learning_rate": 2.8219710503416543e-08, "loss": 0.349, "step": 6896 }, { "epoch": 0.9673211781206171, "grad_norm": 1.8074482200455293, "learning_rate": 2.7979256488343542e-08, "loss": 0.3322, "step": 6897 }, { "epoch": 0.9674614305750351, "grad_norm": 1.6312315373453812, "learning_rate": 2.773982841501377e-08, "loss": 0.2954, "step": 6898 }, { "epoch": 0.967601683029453, "grad_norm": 3.3095476921346325, "learning_rate": 2.7501426332831594e-08, "loss": 0.3172, "step": 6899 }, { "epoch": 0.967741935483871, "grad_norm": 1.6852966335008814, "learning_rate": 2.726405029098933e-08, "loss": 0.309, "step": 6900 }, { "epoch": 0.967882187938289, "grad_norm": 1.7513922874921226, "learning_rate": 2.70277003384678e-08, "loss": 0.3215, "step": 6901 }, { "epoch": 0.9680224403927069, "grad_norm": 1.7986701291227465, "learning_rate": 2.6792376524036878e-08, "loss": 0.2972, "step": 6902 }, { "epoch": 0.9681626928471249, "grad_norm": 1.6948014996631606, "learning_rate": 2.6558078896252725e-08, "loss": 0.3355, "step": 6903 }, { "epoch": 0.9683029453015428, "grad_norm": 1.9089437119391899, "learning_rate": 2.6324807503462223e-08, "loss": 0.3451, "step": 6904 }, { "epoch": 0.9684431977559608, "grad_norm": 1.9048835985159995, "learning_rate": 2.6092562393799094e-08, "loss": 0.2934, "step": 6905 }, { "epoch": 0.9685834502103787, "grad_norm": 1.4797856105999352, "learning_rate": 2.5861343615184997e-08, "loss": 0.3346, "step": 6906 }, { "epoch": 0.9687237026647967, "grad_norm": 2.089001409169512, "learning_rate": 2.5631151215330107e-08, "loss": 0.3516, "step": 6907 }, { "epoch": 0.9688639551192146, "grad_norm": 2.568520799792851, "learning_rate": 2.5401985241734207e-08, "loss": 0.321, "step": 6908 }, { "epoch": 0.9690042075736326, "grad_norm": 2.6566199549928315, "learning_rate": 2.5173845741682802e-08, "loss": 0.2762, "step": 6909 }, { "epoch": 0.9691444600280504, "grad_norm": 1.769372928770437, "learning_rate": 2.4946732762252125e-08, "loss": 0.3486, "step": 6910 }, { "epoch": 0.9692847124824684, "grad_norm": 1.7618534011301776, "learning_rate": 2.4720646350304134e-08, "loss": 0.3407, "step": 6911 }, { "epoch": 0.9694249649368863, "grad_norm": 2.016468078776808, "learning_rate": 2.4495586552490958e-08, "loss": 0.3668, "step": 6912 }, { "epoch": 0.9695652173913043, "grad_norm": 2.3641300411119186, "learning_rate": 2.427155341525156e-08, "loss": 0.3168, "step": 6913 }, { "epoch": 0.9697054698457223, "grad_norm": 1.8476393458941975, "learning_rate": 2.4048546984813957e-08, "loss": 0.3249, "step": 6914 }, { "epoch": 0.9698457223001402, "grad_norm": 2.1510117523354686, "learning_rate": 2.3826567307194127e-08, "loss": 0.327, "step": 6915 }, { "epoch": 0.9699859747545582, "grad_norm": 1.711104223759044, "learning_rate": 2.360561442819598e-08, "loss": 0.3117, "step": 6916 }, { "epoch": 0.9701262272089761, "grad_norm": 1.9177661174941776, "learning_rate": 2.338568839341082e-08, "loss": 0.3283, "step": 6917 }, { "epoch": 0.9702664796633941, "grad_norm": 2.1546403709607382, "learning_rate": 2.3166789248220134e-08, "loss": 0.3457, "step": 6918 }, { "epoch": 0.970406732117812, "grad_norm": 1.6086490067657344, "learning_rate": 2.294891703779112e-08, "loss": 0.335, "step": 6919 }, { "epoch": 0.97054698457223, "grad_norm": 2.495590135276828, "learning_rate": 2.2732071807081147e-08, "loss": 0.3171, "step": 6920 }, { "epoch": 0.9706872370266479, "grad_norm": 2.228933256769876, "learning_rate": 2.251625360083387e-08, "loss": 0.3326, "step": 6921 }, { "epoch": 0.9708274894810659, "grad_norm": 1.739643720237338, "learning_rate": 2.230146246358256e-08, "loss": 0.3429, "step": 6922 }, { "epoch": 0.9709677419354839, "grad_norm": 2.144460986155397, "learning_rate": 2.2087698439646756e-08, "loss": 0.3543, "step": 6923 }, { "epoch": 0.9711079943899018, "grad_norm": 1.7617003446239223, "learning_rate": 2.1874961573136734e-08, "loss": 0.3433, "step": 6924 }, { "epoch": 0.9712482468443198, "grad_norm": 2.106882388662975, "learning_rate": 2.1663251907947935e-08, "loss": 0.3186, "step": 6925 }, { "epoch": 0.9713884992987377, "grad_norm": 2.9334674508327536, "learning_rate": 2.1452569487765973e-08, "loss": 0.3249, "step": 6926 }, { "epoch": 0.9715287517531557, "grad_norm": 1.9021588809444159, "learning_rate": 2.1242914356063292e-08, "loss": 0.3506, "step": 6927 }, { "epoch": 0.9716690042075736, "grad_norm": 2.178758867763347, "learning_rate": 2.1034286556100847e-08, "loss": 0.3197, "step": 6928 }, { "epoch": 0.9718092566619916, "grad_norm": 1.7642679316174432, "learning_rate": 2.082668613092753e-08, "loss": 0.3546, "step": 6929 }, { "epoch": 0.9719495091164095, "grad_norm": 2.024741205703396, "learning_rate": 2.0620113123380746e-08, "loss": 0.2879, "step": 6930 }, { "epoch": 0.9720897615708275, "grad_norm": 2.2417654244981438, "learning_rate": 2.0414567576084176e-08, "loss": 0.3765, "step": 6931 }, { "epoch": 0.9722300140252454, "grad_norm": 1.6519353555773142, "learning_rate": 2.021004953145167e-08, "loss": 0.2957, "step": 6932 }, { "epoch": 0.9723702664796634, "grad_norm": 2.480118950636816, "learning_rate": 2.000655903168447e-08, "loss": 0.3146, "step": 6933 }, { "epoch": 0.9725105189340814, "grad_norm": 1.548173147784194, "learning_rate": 1.98040961187701e-08, "loss": 0.3555, "step": 6934 }, { "epoch": 0.9726507713884993, "grad_norm": 3.8052676030821924, "learning_rate": 1.9602660834486253e-08, "loss": 0.3122, "step": 6935 }, { "epoch": 0.9727910238429173, "grad_norm": 2.0996884069428017, "learning_rate": 1.9402253220398014e-08, "loss": 0.3338, "step": 6936 }, { "epoch": 0.9729312762973352, "grad_norm": 3.0433162936359985, "learning_rate": 1.9202873317856752e-08, "loss": 0.3611, "step": 6937 }, { "epoch": 0.9730715287517532, "grad_norm": 1.726733445210379, "learning_rate": 1.900452116800455e-08, "loss": 0.3212, "step": 6938 }, { "epoch": 0.9732117812061711, "grad_norm": 2.168199791496611, "learning_rate": 1.8807196811769236e-08, "loss": 0.345, "step": 6939 }, { "epoch": 0.9733520336605891, "grad_norm": 2.0695186604526943, "learning_rate": 1.8610900289867673e-08, "loss": 0.3747, "step": 6940 }, { "epoch": 0.973492286115007, "grad_norm": 2.525995900372027, "learning_rate": 1.841563164280413e-08, "loss": 0.3202, "step": 6941 }, { "epoch": 0.973632538569425, "grad_norm": 2.2008566575678308, "learning_rate": 1.822139091087083e-08, "loss": 0.3425, "step": 6942 }, { "epoch": 0.973772791023843, "grad_norm": 1.9939880611632366, "learning_rate": 1.802817813414792e-08, "loss": 0.301, "step": 6943 }, { "epoch": 0.9739130434782609, "grad_norm": 2.395165812062486, "learning_rate": 1.7835993352503524e-08, "loss": 0.3518, "step": 6944 }, { "epoch": 0.9740532959326789, "grad_norm": 1.9435400797345095, "learning_rate": 1.764483660559424e-08, "loss": 0.3246, "step": 6945 }, { "epoch": 0.9741935483870968, "grad_norm": 1.6021597888232368, "learning_rate": 1.745470793286297e-08, "loss": 0.3261, "step": 6946 }, { "epoch": 0.9743338008415148, "grad_norm": 1.76142248128104, "learning_rate": 1.726560737354166e-08, "loss": 0.3349, "step": 6947 }, { "epoch": 0.9744740532959327, "grad_norm": 2.1330115815754516, "learning_rate": 1.7077534966650767e-08, "loss": 0.3714, "step": 6948 }, { "epoch": 0.9746143057503507, "grad_norm": 2.1109746448289806, "learning_rate": 1.6890490750997025e-08, "loss": 0.3255, "step": 6949 }, { "epoch": 0.9747545582047685, "grad_norm": 2.1459100456475872, "learning_rate": 1.6704474765175115e-08, "loss": 0.3629, "step": 6950 }, { "epoch": 0.9748948106591865, "grad_norm": 1.7336263329502652, "learning_rate": 1.6519487047569338e-08, "loss": 0.3143, "step": 6951 }, { "epoch": 0.9750350631136044, "grad_norm": 1.8197520952848103, "learning_rate": 1.6335527636350267e-08, "loss": 0.3112, "step": 6952 }, { "epoch": 0.9751753155680224, "grad_norm": 2.1264879963399466, "learning_rate": 1.6152596569475877e-08, "loss": 0.3272, "step": 6953 }, { "epoch": 0.9753155680224403, "grad_norm": 2.5653568253118935, "learning_rate": 1.5970693884693745e-08, "loss": 0.3567, "step": 6954 }, { "epoch": 0.9754558204768583, "grad_norm": 1.9539150013626614, "learning_rate": 1.5789819619537182e-08, "loss": 0.331, "step": 6955 }, { "epoch": 0.9755960729312763, "grad_norm": 1.639369593272163, "learning_rate": 1.5609973811329116e-08, "loss": 0.3282, "step": 6956 }, { "epoch": 0.9757363253856942, "grad_norm": 2.0810067982670297, "learning_rate": 1.5431156497179856e-08, "loss": 0.3238, "step": 6957 }, { "epoch": 0.9758765778401122, "grad_norm": 2.2223468692577684, "learning_rate": 1.5253367713985444e-08, "loss": 0.355, "step": 6958 }, { "epoch": 0.9760168302945301, "grad_norm": 2.022645183958781, "learning_rate": 1.5076607498433203e-08, "loss": 0.3448, "step": 6959 }, { "epoch": 0.9761570827489481, "grad_norm": 2.1896754755098566, "learning_rate": 1.490087588699507e-08, "loss": 0.3126, "step": 6960 }, { "epoch": 0.976297335203366, "grad_norm": 2.0119698673289803, "learning_rate": 1.4726172915933146e-08, "loss": 0.3229, "step": 6961 }, { "epoch": 0.976437587657784, "grad_norm": 1.5223322062116849, "learning_rate": 1.4552498621295264e-08, "loss": 0.3171, "step": 6962 }, { "epoch": 0.976577840112202, "grad_norm": 2.180829258430162, "learning_rate": 1.4379853038917757e-08, "loss": 0.3321, "step": 6963 }, { "epoch": 0.9767180925666199, "grad_norm": 1.7928534576647364, "learning_rate": 1.4208236204426018e-08, "loss": 0.3371, "step": 6964 }, { "epoch": 0.9768583450210379, "grad_norm": 1.9880872230814195, "learning_rate": 1.403764815323061e-08, "loss": 0.2968, "step": 6965 }, { "epoch": 0.9769985974754558, "grad_norm": 3.1878461462824963, "learning_rate": 1.3868088920532263e-08, "loss": 0.294, "step": 6966 }, { "epoch": 0.9771388499298738, "grad_norm": 1.9792679677101683, "learning_rate": 1.3699558541317437e-08, "loss": 0.3215, "step": 6967 }, { "epoch": 0.9772791023842917, "grad_norm": 2.201612345535296, "learning_rate": 1.3532057050361646e-08, "loss": 0.4144, "step": 6968 }, { "epoch": 0.9774193548387097, "grad_norm": 3.0719044370589335, "learning_rate": 1.3365584482228356e-08, "loss": 0.3379, "step": 6969 }, { "epoch": 0.9775596072931276, "grad_norm": 1.7739490325857143, "learning_rate": 1.3200140871266754e-08, "loss": 0.3445, "step": 6970 }, { "epoch": 0.9776998597475456, "grad_norm": 1.971182637245946, "learning_rate": 1.3035726251615644e-08, "loss": 0.3734, "step": 6971 }, { "epoch": 0.9778401122019635, "grad_norm": 2.864127290433025, "learning_rate": 1.2872340657200666e-08, "loss": 0.3259, "step": 6972 }, { "epoch": 0.9779803646563815, "grad_norm": 2.7583103324953018, "learning_rate": 1.2709984121735407e-08, "loss": 0.3531, "step": 6973 }, { "epoch": 0.9781206171107995, "grad_norm": 1.6661653082487407, "learning_rate": 1.2548656678721404e-08, "loss": 0.2938, "step": 6974 }, { "epoch": 0.9782608695652174, "grad_norm": 1.6411081238529122, "learning_rate": 1.2388358361446473e-08, "loss": 0.3215, "step": 6975 }, { "epoch": 0.9784011220196354, "grad_norm": 1.8096072691897056, "learning_rate": 1.2229089202987487e-08, "loss": 0.3257, "step": 6976 }, { "epoch": 0.9785413744740533, "grad_norm": 1.7345780262379857, "learning_rate": 1.2070849236208716e-08, "loss": 0.361, "step": 6977 }, { "epoch": 0.9786816269284713, "grad_norm": 2.003078832704314, "learning_rate": 1.1913638493762369e-08, "loss": 0.3515, "step": 6978 }, { "epoch": 0.9788218793828892, "grad_norm": 2.1185393012054208, "learning_rate": 1.1757457008086393e-08, "loss": 0.3649, "step": 6979 }, { "epoch": 0.9789621318373072, "grad_norm": 2.318011237649609, "learning_rate": 1.1602304811408893e-08, "loss": 0.3725, "step": 6980 }, { "epoch": 0.9791023842917251, "grad_norm": 2.4755974944256867, "learning_rate": 1.1448181935744262e-08, "loss": 0.3353, "step": 6981 }, { "epoch": 0.9792426367461431, "grad_norm": 1.932705021848148, "learning_rate": 1.1295088412894285e-08, "loss": 0.3427, "step": 6982 }, { "epoch": 0.979382889200561, "grad_norm": 1.843193616808991, "learning_rate": 1.1143024274448689e-08, "loss": 0.3462, "step": 6983 }, { "epoch": 0.979523141654979, "grad_norm": 1.7363907381145973, "learning_rate": 1.0991989551785708e-08, "loss": 0.3384, "step": 6984 }, { "epoch": 0.979663394109397, "grad_norm": 1.7355447209782942, "learning_rate": 1.0841984276069306e-08, "loss": 0.3551, "step": 6985 }, { "epoch": 0.9798036465638149, "grad_norm": 2.5687334491725586, "learning_rate": 1.0693008478252498e-08, "loss": 0.3355, "step": 6986 }, { "epoch": 0.9799438990182329, "grad_norm": 2.0444300368856387, "learning_rate": 1.0545062189075139e-08, "loss": 0.3522, "step": 6987 }, { "epoch": 0.9800841514726508, "grad_norm": 1.888648142463353, "learning_rate": 1.0398145439065588e-08, "loss": 0.3266, "step": 6988 }, { "epoch": 0.9802244039270688, "grad_norm": 2.2163758806602636, "learning_rate": 1.0252258258537929e-08, "loss": 0.3318, "step": 6989 }, { "epoch": 0.9803646563814866, "grad_norm": 1.4781148510961541, "learning_rate": 1.0107400677596413e-08, "loss": 0.3236, "step": 6990 }, { "epoch": 0.9805049088359046, "grad_norm": 1.8701176974542386, "learning_rate": 9.963572726129911e-09, "loss": 0.3438, "step": 6991 }, { "epoch": 0.9806451612903225, "grad_norm": 2.3022615757863583, "learning_rate": 9.82077443381746e-09, "loss": 0.3299, "step": 6992 }, { "epoch": 0.9807854137447405, "grad_norm": 1.9215475904500783, "learning_rate": 9.679005830124376e-09, "loss": 0.3164, "step": 6993 }, { "epoch": 0.9809256661991584, "grad_norm": 2.354190713765256, "learning_rate": 9.53826694430282e-09, "loss": 0.3163, "step": 6994 }, { "epoch": 0.9810659186535764, "grad_norm": 1.8962011155206373, "learning_rate": 9.398557805394003e-09, "loss": 0.3212, "step": 6995 }, { "epoch": 0.9812061711079944, "grad_norm": 1.737051333673081, "learning_rate": 9.259878442225422e-09, "loss": 0.2991, "step": 6996 }, { "epoch": 0.9813464235624123, "grad_norm": 2.0739370036181923, "learning_rate": 9.12222888341252e-09, "loss": 0.337, "step": 6997 }, { "epoch": 0.9814866760168303, "grad_norm": 3.0198056983336654, "learning_rate": 8.985609157359243e-09, "loss": 0.3078, "step": 6998 }, { "epoch": 0.9816269284712482, "grad_norm": 2.4857989375102427, "learning_rate": 8.850019292255263e-09, "loss": 0.3402, "step": 6999 }, { "epoch": 0.9817671809256662, "grad_norm": 2.239733160167448, "learning_rate": 8.715459316078756e-09, "loss": 0.3638, "step": 7000 }, { "epoch": 0.9819074333800841, "grad_norm": 1.844160260607244, "learning_rate": 8.581929256595844e-09, "loss": 0.3495, "step": 7001 }, { "epoch": 0.9820476858345021, "grad_norm": 1.7637742732031654, "learning_rate": 8.449429141358378e-09, "loss": 0.3513, "step": 7002 }, { "epoch": 0.98218793828892, "grad_norm": 1.8230067276278605, "learning_rate": 8.317958997708374e-09, "loss": 0.3179, "step": 7003 }, { "epoch": 0.982328190743338, "grad_norm": 3.797825195974302, "learning_rate": 8.187518852771914e-09, "loss": 0.3378, "step": 7004 }, { "epoch": 0.982468443197756, "grad_norm": 2.6197214186687647, "learning_rate": 8.058108733465797e-09, "loss": 0.3196, "step": 7005 }, { "epoch": 0.9826086956521739, "grad_norm": 2.2768127974066763, "learning_rate": 7.929728666492553e-09, "loss": 0.3388, "step": 7006 }, { "epoch": 0.9827489481065919, "grad_norm": 2.1874182549597934, "learning_rate": 7.802378678342105e-09, "loss": 0.3082, "step": 7007 }, { "epoch": 0.9828892005610098, "grad_norm": 1.8473112336641073, "learning_rate": 7.676058795292873e-09, "loss": 0.2986, "step": 7008 }, { "epoch": 0.9830294530154278, "grad_norm": 1.7340333619301749, "learning_rate": 7.550769043409567e-09, "loss": 0.3024, "step": 7009 }, { "epoch": 0.9831697054698457, "grad_norm": 2.7785773133276166, "learning_rate": 7.426509448545394e-09, "loss": 0.2922, "step": 7010 }, { "epoch": 0.9833099579242637, "grad_norm": 1.9353620977959036, "learning_rate": 7.3032800363398435e-09, "loss": 0.3093, "step": 7011 }, { "epoch": 0.9834502103786816, "grad_norm": 1.8281876642947563, "learning_rate": 7.18108083222091e-09, "loss": 0.3305, "step": 7012 }, { "epoch": 0.9835904628330996, "grad_norm": 1.8850985376319274, "learning_rate": 7.0599118614034235e-09, "loss": 0.3445, "step": 7013 }, { "epoch": 0.9837307152875175, "grad_norm": 2.069942340508495, "learning_rate": 6.939773148889051e-09, "loss": 0.3376, "step": 7014 }, { "epoch": 0.9838709677419355, "grad_norm": 2.1912792801410412, "learning_rate": 6.820664719469072e-09, "loss": 0.3395, "step": 7015 }, { "epoch": 0.9840112201963535, "grad_norm": 2.1815345075817105, "learning_rate": 6.702586597719385e-09, "loss": 0.3542, "step": 7016 }, { "epoch": 0.9841514726507714, "grad_norm": 1.9681281061229745, "learning_rate": 6.585538808004943e-09, "loss": 0.3164, "step": 7017 }, { "epoch": 0.9842917251051894, "grad_norm": 2.2809307972445914, "learning_rate": 6.469521374477539e-09, "loss": 0.3031, "step": 7018 }, { "epoch": 0.9844319775596073, "grad_norm": 2.81271598834278, "learning_rate": 6.354534321077465e-09, "loss": 0.3586, "step": 7019 }, { "epoch": 0.9845722300140253, "grad_norm": 1.766811343632893, "learning_rate": 6.24057767153019e-09, "loss": 0.3094, "step": 7020 }, { "epoch": 0.9847124824684432, "grad_norm": 1.9827992347892807, "learning_rate": 6.1276514493513466e-09, "loss": 0.3841, "step": 7021 }, { "epoch": 0.9848527349228612, "grad_norm": 2.0834922602752113, "learning_rate": 6.0157556778411844e-09, "loss": 0.3589, "step": 7022 }, { "epoch": 0.9849929873772791, "grad_norm": 1.6604575927106815, "learning_rate": 5.904890380089012e-09, "loss": 0.3535, "step": 7023 }, { "epoch": 0.9851332398316971, "grad_norm": 1.849249638636689, "learning_rate": 5.795055578971531e-09, "loss": 0.2979, "step": 7024 }, { "epoch": 0.985273492286115, "grad_norm": 2.1086277521196757, "learning_rate": 5.686251297151724e-09, "loss": 0.3452, "step": 7025 }, { "epoch": 0.985413744740533, "grad_norm": 2.017633191785424, "learning_rate": 5.578477557081074e-09, "loss": 0.3144, "step": 7026 }, { "epoch": 0.985553997194951, "grad_norm": 1.7342644046197875, "learning_rate": 5.471734380997906e-09, "loss": 0.3361, "step": 7027 }, { "epoch": 0.9856942496493689, "grad_norm": 1.4852054882766985, "learning_rate": 5.366021790927378e-09, "loss": 0.3106, "step": 7028 }, { "epoch": 0.9858345021037869, "grad_norm": 2.3414887773634625, "learning_rate": 5.261339808683707e-09, "loss": 0.3364, "step": 7029 }, { "epoch": 0.9859747545582047, "grad_norm": 1.5287347493693741, "learning_rate": 5.157688455865728e-09, "loss": 0.3157, "step": 7030 }, { "epoch": 0.9861150070126227, "grad_norm": 1.6782467362843645, "learning_rate": 5.055067753862442e-09, "loss": 0.3145, "step": 7031 }, { "epoch": 0.9862552594670406, "grad_norm": 2.2836597204711975, "learning_rate": 4.9534777238485764e-09, "loss": 0.3618, "step": 7032 }, { "epoch": 0.9863955119214586, "grad_norm": 2.2912733838708577, "learning_rate": 4.852918386786254e-09, "loss": 0.3465, "step": 7033 }, { "epoch": 0.9865357643758765, "grad_norm": 2.123225873965707, "learning_rate": 4.753389763425542e-09, "loss": 0.3562, "step": 7034 }, { "epoch": 0.9866760168302945, "grad_norm": 2.6014407934181696, "learning_rate": 4.654891874303347e-09, "loss": 0.2982, "step": 7035 }, { "epoch": 0.9868162692847124, "grad_norm": 2.0661023210297103, "learning_rate": 4.55742473974341e-09, "loss": 0.324, "step": 7036 }, { "epoch": 0.9869565217391304, "grad_norm": 3.987655665309898, "learning_rate": 4.460988379858533e-09, "loss": 0.3653, "step": 7037 }, { "epoch": 0.9870967741935484, "grad_norm": 2.324958747579836, "learning_rate": 4.365582814546687e-09, "loss": 0.315, "step": 7038 }, { "epoch": 0.9872370266479663, "grad_norm": 2.3145150349380637, "learning_rate": 4.2712080634949024e-09, "loss": 0.3015, "step": 7039 }, { "epoch": 0.9873772791023843, "grad_norm": 3.197096795137812, "learning_rate": 4.1778641461764916e-09, "loss": 0.3249, "step": 7040 }, { "epoch": 0.9875175315568022, "grad_norm": 1.724405866390223, "learning_rate": 4.085551081851602e-09, "loss": 0.3078, "step": 7041 }, { "epoch": 0.9876577840112202, "grad_norm": 2.0545719244764054, "learning_rate": 3.994268889569442e-09, "loss": 0.3362, "step": 7042 }, { "epoch": 0.9877980364656381, "grad_norm": 3.784429553242738, "learning_rate": 3.904017588164943e-09, "loss": 0.3165, "step": 7043 }, { "epoch": 0.9879382889200561, "grad_norm": 3.121939419224, "learning_rate": 3.814797196261544e-09, "loss": 0.3187, "step": 7044 }, { "epoch": 0.988078541374474, "grad_norm": 1.918127671007153, "learning_rate": 3.726607732267851e-09, "loss": 0.317, "step": 7045 }, { "epoch": 0.988218793828892, "grad_norm": 2.122370495479186, "learning_rate": 3.6394492143820847e-09, "loss": 0.4136, "step": 7046 }, { "epoch": 0.98835904628331, "grad_norm": 1.8414429924349593, "learning_rate": 3.553321660588749e-09, "loss": 0.2722, "step": 7047 }, { "epoch": 0.9884992987377279, "grad_norm": 1.6685789184882778, "learning_rate": 3.468225088659738e-09, "loss": 0.3365, "step": 7048 }, { "epoch": 0.9886395511921459, "grad_norm": 2.0828852235155924, "learning_rate": 3.3841595161537842e-09, "loss": 0.3409, "step": 7049 }, { "epoch": 0.9887798036465638, "grad_norm": 2.35572643989733, "learning_rate": 3.3011249604170124e-09, "loss": 0.3671, "step": 7050 }, { "epoch": 0.9889200561009818, "grad_norm": 1.8969862300431353, "learning_rate": 3.2191214385840498e-09, "loss": 0.3206, "step": 7051 }, { "epoch": 0.9890603085553997, "grad_norm": 1.9554634438601601, "learning_rate": 3.1381489675746946e-09, "loss": 0.3491, "step": 7052 }, { "epoch": 0.9892005610098177, "grad_norm": 1.6366118985715643, "learning_rate": 3.0582075640972487e-09, "loss": 0.3415, "step": 7053 }, { "epoch": 0.9893408134642356, "grad_norm": 2.0319114195071677, "learning_rate": 2.9792972446479605e-09, "loss": 0.3512, "step": 7054 }, { "epoch": 0.9894810659186536, "grad_norm": 3.765417955966864, "learning_rate": 2.9014180255082515e-09, "loss": 0.3231, "step": 7055 }, { "epoch": 0.9896213183730715, "grad_norm": 2.068452741120409, "learning_rate": 2.824569922748599e-09, "loss": 0.2907, "step": 7056 }, { "epoch": 0.9897615708274895, "grad_norm": 2.6674607604554064, "learning_rate": 2.7487529522257637e-09, "loss": 0.3414, "step": 7057 }, { "epoch": 0.9899018232819075, "grad_norm": 2.4476430104199265, "learning_rate": 2.6739671295838986e-09, "loss": 0.2938, "step": 7058 }, { "epoch": 0.9900420757363254, "grad_norm": 1.8463221566492758, "learning_rate": 2.6002124702556585e-09, "loss": 0.3251, "step": 7059 }, { "epoch": 0.9901823281907434, "grad_norm": 2.1455342599786627, "learning_rate": 2.5274889894583156e-09, "loss": 0.319, "step": 7060 }, { "epoch": 0.9903225806451613, "grad_norm": 2.0769030197783556, "learning_rate": 2.455796702198754e-09, "loss": 0.3236, "step": 7061 }, { "epoch": 0.9904628330995793, "grad_norm": 2.9287772023284573, "learning_rate": 2.3851356232695855e-09, "loss": 0.3464, "step": 7062 }, { "epoch": 0.9906030855539972, "grad_norm": 3.632268130626663, "learning_rate": 2.3155057672519244e-09, "loss": 0.3418, "step": 7063 }, { "epoch": 0.9907433380084152, "grad_norm": 2.0404546085273374, "learning_rate": 2.246907148513167e-09, "loss": 0.2938, "step": 7064 }, { "epoch": 0.9908835904628331, "grad_norm": 2.1601198272669633, "learning_rate": 2.179339781208101e-09, "loss": 0.3437, "step": 7065 }, { "epoch": 0.9910238429172511, "grad_norm": 2.5819283838600797, "learning_rate": 2.1128036792783526e-09, "loss": 0.3113, "step": 7066 }, { "epoch": 0.991164095371669, "grad_norm": 2.5702515568798536, "learning_rate": 2.0472988564540496e-09, "loss": 0.3398, "step": 7067 }, { "epoch": 0.991304347826087, "grad_norm": 2.073958385958656, "learning_rate": 1.982825326250493e-09, "loss": 0.3553, "step": 7068 }, { "epoch": 0.991444600280505, "grad_norm": 2.0484930733621116, "learning_rate": 1.919383101972594e-09, "loss": 0.3446, "step": 7069 }, { "epoch": 0.9915848527349228, "grad_norm": 2.0164375737272437, "learning_rate": 1.856972196710438e-09, "loss": 0.312, "step": 7070 }, { "epoch": 0.9917251051893408, "grad_norm": 2.294530073768904, "learning_rate": 1.7955926233420573e-09, "loss": 0.2994, "step": 7071 }, { "epoch": 0.9918653576437587, "grad_norm": 1.7331397666811066, "learning_rate": 1.7352443945334308e-09, "loss": 0.3197, "step": 7072 }, { "epoch": 0.9920056100981767, "grad_norm": 2.1116657392965967, "learning_rate": 1.6759275227357098e-09, "loss": 0.35, "step": 7073 }, { "epoch": 0.9921458625525946, "grad_norm": 2.626575814417055, "learning_rate": 1.6176420201902132e-09, "loss": 0.2825, "step": 7074 }, { "epoch": 0.9922861150070126, "grad_norm": 3.197740597594557, "learning_rate": 1.560387898922322e-09, "loss": 0.329, "step": 7075 }, { "epoch": 0.9924263674614305, "grad_norm": 1.8922180526119, "learning_rate": 1.5041651707464744e-09, "loss": 0.3594, "step": 7076 }, { "epoch": 0.9925666199158485, "grad_norm": 2.3169031237492144, "learning_rate": 1.4489738472639459e-09, "loss": 0.3294, "step": 7077 }, { "epoch": 0.9927068723702664, "grad_norm": 1.7995159478483842, "learning_rate": 1.3948139398628492e-09, "loss": 0.3397, "step": 7078 }, { "epoch": 0.9928471248246844, "grad_norm": 1.9404430187364463, "learning_rate": 1.3416854597192441e-09, "loss": 0.3219, "step": 7079 }, { "epoch": 0.9929873772791024, "grad_norm": 2.070890641011154, "learning_rate": 1.2895884177954732e-09, "loss": 0.3431, "step": 7080 }, { "epoch": 0.9931276297335203, "grad_norm": 2.2443074163567185, "learning_rate": 1.2385228248407155e-09, "loss": 0.3546, "step": 7081 }, { "epoch": 0.9932678821879383, "grad_norm": 1.982793944071344, "learning_rate": 1.1884886913932082e-09, "loss": 0.3221, "step": 7082 }, { "epoch": 0.9934081346423562, "grad_norm": 2.742405287742413, "learning_rate": 1.1394860277763597e-09, "loss": 0.3109, "step": 7083 }, { "epoch": 0.9935483870967742, "grad_norm": 2.510864777096434, "learning_rate": 1.0915148441020817e-09, "loss": 0.3595, "step": 7084 }, { "epoch": 0.9936886395511921, "grad_norm": 1.7686502650957845, "learning_rate": 1.0445751502685676e-09, "loss": 0.2833, "step": 7085 }, { "epoch": 0.9938288920056101, "grad_norm": 1.913805076260073, "learning_rate": 9.986669559614027e-10, "loss": 0.2992, "step": 7086 }, { "epoch": 0.993969144460028, "grad_norm": 2.764108878049031, "learning_rate": 9.537902706535652e-10, "loss": 0.3131, "step": 7087 }, { "epoch": 0.994109396914446, "grad_norm": 1.8324669683791863, "learning_rate": 9.099451036048701e-10, "loss": 0.2991, "step": 7088 }, { "epoch": 0.994249649368864, "grad_norm": 1.903760566506771, "learning_rate": 8.67131463862525e-10, "loss": 0.3324, "step": 7089 }, { "epoch": 0.9943899018232819, "grad_norm": 2.18812737323945, "learning_rate": 8.253493602611295e-10, "loss": 0.3727, "step": 7090 }, { "epoch": 0.9945301542776999, "grad_norm": 2.7696960141949445, "learning_rate": 7.845988014215655e-10, "loss": 0.2828, "step": 7091 }, { "epoch": 0.9946704067321178, "grad_norm": 2.1376590647266593, "learning_rate": 7.448797957526621e-10, "loss": 0.3226, "step": 7092 }, { "epoch": 0.9948106591865358, "grad_norm": 1.8907727270499315, "learning_rate": 7.061923514506409e-10, "loss": 0.3257, "step": 7093 }, { "epoch": 0.9949509116409537, "grad_norm": 1.7812308594863207, "learning_rate": 6.685364764980051e-10, "loss": 0.3105, "step": 7094 }, { "epoch": 0.9950911640953717, "grad_norm": 1.9195794763589107, "learning_rate": 6.319121786646509e-10, "loss": 0.3249, "step": 7095 }, { "epoch": 0.9952314165497896, "grad_norm": 1.9496716296208418, "learning_rate": 5.963194655078663e-10, "loss": 0.2845, "step": 7096 }, { "epoch": 0.9953716690042076, "grad_norm": 2.1652600662108448, "learning_rate": 5.617583443717767e-10, "loss": 0.3012, "step": 7097 }, { "epoch": 0.9955119214586255, "grad_norm": 1.7504815392783346, "learning_rate": 5.282288223884546e-10, "loss": 0.3593, "step": 7098 }, { "epoch": 0.9956521739130435, "grad_norm": 1.8551711958163708, "learning_rate": 4.957309064756998e-10, "loss": 0.3236, "step": 7099 }, { "epoch": 0.9957924263674615, "grad_norm": 2.1420189989460185, "learning_rate": 4.642646033398146e-10, "loss": 0.3509, "step": 7100 }, { "epoch": 0.9959326788218794, "grad_norm": 1.6297799747084554, "learning_rate": 4.3382991947338306e-10, "loss": 0.298, "step": 7101 }, { "epoch": 0.9960729312762974, "grad_norm": 1.8351976567806327, "learning_rate": 4.0442686115582665e-10, "loss": 0.3478, "step": 7102 }, { "epoch": 0.9962131837307153, "grad_norm": 1.7828118687861776, "learning_rate": 3.760554344556244e-10, "loss": 0.2857, "step": 7103 }, { "epoch": 0.9963534361851333, "grad_norm": 1.5356729429009237, "learning_rate": 3.487156452258722e-10, "loss": 0.3211, "step": 7104 }, { "epoch": 0.9964936886395512, "grad_norm": 1.7639418149335162, "learning_rate": 3.2240749910816825e-10, "loss": 0.3706, "step": 7105 }, { "epoch": 0.9966339410939692, "grad_norm": 1.8902128416641317, "learning_rate": 2.971310015315032e-10, "loss": 0.3144, "step": 7106 }, { "epoch": 0.9967741935483871, "grad_norm": 2.4859166190602946, "learning_rate": 2.7288615771114966e-10, "loss": 0.3374, "step": 7107 }, { "epoch": 0.9969144460028051, "grad_norm": 1.7699716453651817, "learning_rate": 2.496729726497726e-10, "loss": 0.3233, "step": 7108 }, { "epoch": 0.997054698457223, "grad_norm": 3.8033556477201067, "learning_rate": 2.274914511374293e-10, "loss": 0.3465, "step": 7109 }, { "epoch": 0.9971949509116409, "grad_norm": 2.006456444121876, "learning_rate": 2.0634159775045904e-10, "loss": 0.316, "step": 7110 }, { "epoch": 0.9973352033660589, "grad_norm": 1.948212511032593, "learning_rate": 1.8622341685425872e-10, "loss": 0.3587, "step": 7111 }, { "epoch": 0.9974754558204768, "grad_norm": 2.259944344907112, "learning_rate": 1.6713691259939713e-10, "loss": 0.3204, "step": 7112 }, { "epoch": 0.9976157082748948, "grad_norm": 1.8447136381230682, "learning_rate": 1.4908208892383536e-10, "loss": 0.3639, "step": 7113 }, { "epoch": 0.9977559607293127, "grad_norm": 1.8138665933814442, "learning_rate": 1.3205894955348187e-10, "loss": 0.3683, "step": 7114 }, { "epoch": 0.9978962131837307, "grad_norm": 1.771872077414711, "learning_rate": 1.160674980010823e-10, "loss": 0.3283, "step": 7115 }, { "epoch": 0.9980364656381486, "grad_norm": 1.932355881384, "learning_rate": 1.011077375662195e-10, "loss": 0.3214, "step": 7116 }, { "epoch": 0.9981767180925666, "grad_norm": 1.8956369850290455, "learning_rate": 8.717967133586857e-11, "loss": 0.3423, "step": 7117 }, { "epoch": 0.9983169705469845, "grad_norm": 1.566992899563982, "learning_rate": 7.428330218384183e-11, "loss": 0.2764, "step": 7118 }, { "epoch": 0.9984572230014025, "grad_norm": 2.4810578855613885, "learning_rate": 6.241863277078874e-11, "loss": 0.3159, "step": 7119 }, { "epoch": 0.9985974754558204, "grad_norm": 1.6707840684713395, "learning_rate": 5.1585665545861305e-11, "loss": 0.2777, "step": 7120 }, { "epoch": 0.9987377279102384, "grad_norm": 3.090766624237503, "learning_rate": 4.1784402743383357e-11, "loss": 0.3819, "step": 7121 }, { "epoch": 0.9988779803646564, "grad_norm": 2.2281827686412914, "learning_rate": 3.301484638618124e-11, "loss": 0.343, "step": 7122 }, { "epoch": 0.9990182328190743, "grad_norm": 1.6322343158526365, "learning_rate": 2.5276998284473608e-11, "loss": 0.3062, "step": 7123 }, { "epoch": 0.9991584852734923, "grad_norm": 1.5780935001062246, "learning_rate": 1.857086003365094e-11, "loss": 0.3153, "step": 7124 }, { "epoch": 0.9992987377279102, "grad_norm": 2.536575836864684, "learning_rate": 1.2896433018161348e-11, "loss": 0.2997, "step": 7125 }, { "epoch": 0.9994389901823282, "grad_norm": 2.1491291944065365, "learning_rate": 8.253718408735013e-12, "loss": 0.331, "step": 7126 }, { "epoch": 0.9995792426367461, "grad_norm": 1.9257449133116333, "learning_rate": 4.642717164049515e-12, "loss": 0.2941, "step": 7127 }, { "epoch": 0.9997194950911641, "grad_norm": 2.142527938714024, "learning_rate": 2.063430027954283e-12, "loss": 0.2826, "step": 7128 }, { "epoch": 0.999859747545582, "grad_norm": 1.8741560626150489, "learning_rate": 5.158575333563675e-13, "loss": 0.3203, "step": 7129 }, { "epoch": 1.0, "grad_norm": 2.348740008598381, "learning_rate": 0.0, "loss": 0.3445, "step": 7130 }, { "epoch": 1.0, "step": 7130, "total_flos": 2.3623676556921012e+19, "train_loss": 0.0059655218313318815, "train_runtime": 3517.247, "train_samples_per_second": 583.832, "train_steps_per_second": 2.027 } ], "logging_steps": 1.0, "max_steps": 7130, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3623676556921012e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }