diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24367 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 3475, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014388489208633094, + "grad_norm": 2.4105954227484694, + "learning_rate": 2.2988505747126437e-07, + "loss": 0.7695, + "step": 1 + }, + { + "epoch": 0.0028776978417266188, + "grad_norm": 2.491958180835405, + "learning_rate": 4.5977011494252875e-07, + "loss": 0.7668, + "step": 2 + }, + { + "epoch": 0.004316546762589928, + "grad_norm": 2.4515145211929483, + "learning_rate": 6.896551724137931e-07, + "loss": 0.778, + "step": 3 + }, + { + "epoch": 0.0057553956834532375, + "grad_norm": 2.464306284194726, + "learning_rate": 9.195402298850575e-07, + "loss": 0.7771, + "step": 4 + }, + { + "epoch": 0.007194244604316547, + "grad_norm": 2.4381995924967113, + "learning_rate": 1.1494252873563219e-06, + "loss": 0.7718, + "step": 5 + }, + { + "epoch": 0.008633093525179856, + "grad_norm": 2.2863926582720127, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.7634, + "step": 6 + }, + { + "epoch": 0.010071942446043165, + "grad_norm": 2.347937860490624, + "learning_rate": 1.6091954022988506e-06, + "loss": 0.7653, + "step": 7 + }, + { + "epoch": 0.011510791366906475, + "grad_norm": 1.9546287546043806, + "learning_rate": 1.839080459770115e-06, + "loss": 0.7385, + "step": 8 + }, + { + "epoch": 0.012949640287769784, + "grad_norm": 1.85957724143468, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.7353, + "step": 9 + }, + { + "epoch": 0.014388489208633094, + "grad_norm": 1.3128121565973474, + "learning_rate": 2.2988505747126437e-06, + "loss": 0.7172, + "step": 10 + }, + { + "epoch": 0.015827338129496403, + "grad_norm": 1.2823467031872628, + "learning_rate": 2.5287356321839083e-06, + "loss": 0.7171, + "step": 11 + }, + { + "epoch": 0.017266187050359712, + "grad_norm": 1.1777835656701032, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.7044, + "step": 12 + }, + { + "epoch": 0.01870503597122302, + "grad_norm": 1.016282103804724, + "learning_rate": 2.988505747126437e-06, + "loss": 0.6922, + "step": 13 + }, + { + "epoch": 0.02014388489208633, + "grad_norm": 1.124753468365821, + "learning_rate": 3.2183908045977012e-06, + "loss": 0.6778, + "step": 14 + }, + { + "epoch": 0.02158273381294964, + "grad_norm": 1.0897618312033936, + "learning_rate": 3.448275862068966e-06, + "loss": 0.6684, + "step": 15 + }, + { + "epoch": 0.02302158273381295, + "grad_norm": 0.9987802726468263, + "learning_rate": 3.67816091954023e-06, + "loss": 0.6666, + "step": 16 + }, + { + "epoch": 0.02446043165467626, + "grad_norm": 0.9119063742889313, + "learning_rate": 3.908045977011495e-06, + "loss": 0.6553, + "step": 17 + }, + { + "epoch": 0.025899280575539568, + "grad_norm": 0.6845690635011229, + "learning_rate": 4.137931034482759e-06, + "loss": 0.6285, + "step": 18 + }, + { + "epoch": 0.027338129496402876, + "grad_norm": 0.8002047902305283, + "learning_rate": 4.367816091954023e-06, + "loss": 0.6057, + "step": 19 + }, + { + "epoch": 0.02877697841726619, + "grad_norm": 0.743409237994264, + "learning_rate": 4.5977011494252875e-06, + "loss": 0.5936, + "step": 20 + }, + { + "epoch": 0.030215827338129497, + "grad_norm": 0.6250234115670129, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.593, + "step": 21 + }, + { + "epoch": 0.031654676258992806, + "grad_norm": 0.5164780369653312, + "learning_rate": 5.057471264367817e-06, + "loss": 0.5858, + "step": 22 + }, + { + "epoch": 0.033093525179856115, + "grad_norm": 0.5234892365042625, + "learning_rate": 5.287356321839081e-06, + "loss": 0.5919, + "step": 23 + }, + { + "epoch": 0.034532374100719423, + "grad_norm": 0.5072797188187648, + "learning_rate": 5.517241379310345e-06, + "loss": 0.5745, + "step": 24 + }, + { + "epoch": 0.03597122302158273, + "grad_norm": 0.6212376093865479, + "learning_rate": 5.747126436781609e-06, + "loss": 0.5645, + "step": 25 + }, + { + "epoch": 0.03741007194244604, + "grad_norm": 0.6306527534732702, + "learning_rate": 5.977011494252874e-06, + "loss": 0.5592, + "step": 26 + }, + { + "epoch": 0.03884892086330935, + "grad_norm": 0.5440901874907684, + "learning_rate": 6.206896551724138e-06, + "loss": 0.5634, + "step": 27 + }, + { + "epoch": 0.04028776978417266, + "grad_norm": 0.4763183956174834, + "learning_rate": 6.4367816091954025e-06, + "loss": 0.5588, + "step": 28 + }, + { + "epoch": 0.041726618705035974, + "grad_norm": 0.5298392880409118, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5372, + "step": 29 + }, + { + "epoch": 0.04316546762589928, + "grad_norm": 0.43602918957586, + "learning_rate": 6.896551724137932e-06, + "loss": 0.5359, + "step": 30 + }, + { + "epoch": 0.04460431654676259, + "grad_norm": 0.39977253671051743, + "learning_rate": 7.126436781609196e-06, + "loss": 0.5391, + "step": 31 + }, + { + "epoch": 0.0460431654676259, + "grad_norm": 0.40590313043219917, + "learning_rate": 7.35632183908046e-06, + "loss": 0.5315, + "step": 32 + }, + { + "epoch": 0.04748201438848921, + "grad_norm": 0.3360824835256222, + "learning_rate": 7.586206896551724e-06, + "loss": 0.5276, + "step": 33 + }, + { + "epoch": 0.04892086330935252, + "grad_norm": 0.2659791474022181, + "learning_rate": 7.81609195402299e-06, + "loss": 0.5205, + "step": 34 + }, + { + "epoch": 0.050359712230215826, + "grad_norm": 0.2761746224449288, + "learning_rate": 8.045977011494253e-06, + "loss": 0.5231, + "step": 35 + }, + { + "epoch": 0.051798561151079135, + "grad_norm": 0.27773851379862535, + "learning_rate": 8.275862068965518e-06, + "loss": 0.5212, + "step": 36 + }, + { + "epoch": 0.053237410071942444, + "grad_norm": 0.27271770524700484, + "learning_rate": 8.505747126436782e-06, + "loss": 0.5081, + "step": 37 + }, + { + "epoch": 0.05467625899280575, + "grad_norm": 0.27581155555156306, + "learning_rate": 8.735632183908047e-06, + "loss": 0.5151, + "step": 38 + }, + { + "epoch": 0.05611510791366906, + "grad_norm": 0.21882673011712764, + "learning_rate": 8.965517241379312e-06, + "loss": 0.5062, + "step": 39 + }, + { + "epoch": 0.05755395683453238, + "grad_norm": 0.1992687663487771, + "learning_rate": 9.195402298850575e-06, + "loss": 0.5222, + "step": 40 + }, + { + "epoch": 0.058992805755395686, + "grad_norm": 0.20624278775881216, + "learning_rate": 9.42528735632184e-06, + "loss": 0.4948, + "step": 41 + }, + { + "epoch": 0.060431654676258995, + "grad_norm": 0.20069347615852262, + "learning_rate": 9.655172413793105e-06, + "loss": 0.5078, + "step": 42 + }, + { + "epoch": 0.0618705035971223, + "grad_norm": 0.1845034563605306, + "learning_rate": 9.885057471264368e-06, + "loss": 0.5051, + "step": 43 + }, + { + "epoch": 0.06330935251798561, + "grad_norm": 0.17534127903696262, + "learning_rate": 1.0114942528735633e-05, + "loss": 0.4967, + "step": 44 + }, + { + "epoch": 0.06474820143884892, + "grad_norm": 0.1750837167101689, + "learning_rate": 1.0344827586206898e-05, + "loss": 0.4903, + "step": 45 + }, + { + "epoch": 0.06618705035971223, + "grad_norm": 0.16672130368854124, + "learning_rate": 1.0574712643678162e-05, + "loss": 0.4873, + "step": 46 + }, + { + "epoch": 0.06762589928057554, + "grad_norm": 0.15519991857027912, + "learning_rate": 1.0804597701149427e-05, + "loss": 0.499, + "step": 47 + }, + { + "epoch": 0.06906474820143885, + "grad_norm": 0.15004507712336698, + "learning_rate": 1.103448275862069e-05, + "loss": 0.4813, + "step": 48 + }, + { + "epoch": 0.07050359712230216, + "grad_norm": 0.15344504306667656, + "learning_rate": 1.1264367816091955e-05, + "loss": 0.4813, + "step": 49 + }, + { + "epoch": 0.07194244604316546, + "grad_norm": 0.14368931627859793, + "learning_rate": 1.1494252873563218e-05, + "loss": 0.4858, + "step": 50 + }, + { + "epoch": 0.07338129496402877, + "grad_norm": 0.1304589493399049, + "learning_rate": 1.1724137931034483e-05, + "loss": 0.4854, + "step": 51 + }, + { + "epoch": 0.07482014388489208, + "grad_norm": 0.1377449195642743, + "learning_rate": 1.1954022988505748e-05, + "loss": 0.4789, + "step": 52 + }, + { + "epoch": 0.07625899280575539, + "grad_norm": 0.15496411963731735, + "learning_rate": 1.2183908045977013e-05, + "loss": 0.4853, + "step": 53 + }, + { + "epoch": 0.0776978417266187, + "grad_norm": 0.13357411915265002, + "learning_rate": 1.2413793103448277e-05, + "loss": 0.4802, + "step": 54 + }, + { + "epoch": 0.07913669064748201, + "grad_norm": 0.14332033734170296, + "learning_rate": 1.2643678160919542e-05, + "loss": 0.4922, + "step": 55 + }, + { + "epoch": 0.08057553956834532, + "grad_norm": 0.1292592407744019, + "learning_rate": 1.2873563218390805e-05, + "loss": 0.4835, + "step": 56 + }, + { + "epoch": 0.08201438848920864, + "grad_norm": 0.1279098304442974, + "learning_rate": 1.310344827586207e-05, + "loss": 0.4772, + "step": 57 + }, + { + "epoch": 0.08345323741007195, + "grad_norm": 0.12546369838992738, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.4758, + "step": 58 + }, + { + "epoch": 0.08489208633093526, + "grad_norm": 0.11644069764775271, + "learning_rate": 1.3563218390804598e-05, + "loss": 0.4683, + "step": 59 + }, + { + "epoch": 0.08633093525179857, + "grad_norm": 0.119357895427173, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.478, + "step": 60 + }, + { + "epoch": 0.08776978417266187, + "grad_norm": 0.10056032693404654, + "learning_rate": 1.4022988505747128e-05, + "loss": 0.4697, + "step": 61 + }, + { + "epoch": 0.08920863309352518, + "grad_norm": 0.1168386481490424, + "learning_rate": 1.4252873563218392e-05, + "loss": 0.4645, + "step": 62 + }, + { + "epoch": 0.09064748201438849, + "grad_norm": 5.064701562877305, + "learning_rate": 1.4482758620689657e-05, + "loss": 0.4704, + "step": 63 + }, + { + "epoch": 0.0920863309352518, + "grad_norm": 0.13239631855143605, + "learning_rate": 1.471264367816092e-05, + "loss": 0.4666, + "step": 64 + }, + { + "epoch": 0.09352517985611511, + "grad_norm": 0.10626289271608953, + "learning_rate": 1.4942528735632185e-05, + "loss": 0.4633, + "step": 65 + }, + { + "epoch": 0.09496402877697842, + "grad_norm": 0.11998633127665166, + "learning_rate": 1.5172413793103448e-05, + "loss": 0.4669, + "step": 66 + }, + { + "epoch": 0.09640287769784173, + "grad_norm": 0.12948805456339993, + "learning_rate": 1.540229885057471e-05, + "loss": 0.464, + "step": 67 + }, + { + "epoch": 0.09784172661870504, + "grad_norm": 0.11881891499790373, + "learning_rate": 1.563218390804598e-05, + "loss": 0.4523, + "step": 68 + }, + { + "epoch": 0.09928057553956834, + "grad_norm": 0.1153614566652368, + "learning_rate": 1.586206896551724e-05, + "loss": 0.4545, + "step": 69 + }, + { + "epoch": 0.10071942446043165, + "grad_norm": 0.12797301351533724, + "learning_rate": 1.6091954022988507e-05, + "loss": 0.4553, + "step": 70 + }, + { + "epoch": 0.10215827338129496, + "grad_norm": 0.10554962913118272, + "learning_rate": 1.632183908045977e-05, + "loss": 0.4641, + "step": 71 + }, + { + "epoch": 0.10359712230215827, + "grad_norm": 0.12020889100383784, + "learning_rate": 1.6551724137931037e-05, + "loss": 0.458, + "step": 72 + }, + { + "epoch": 0.10503597122302158, + "grad_norm": 0.12992964208869473, + "learning_rate": 1.6781609195402298e-05, + "loss": 0.4486, + "step": 73 + }, + { + "epoch": 0.10647482014388489, + "grad_norm": 0.12529418437433326, + "learning_rate": 1.7011494252873563e-05, + "loss": 0.4487, + "step": 74 + }, + { + "epoch": 0.1079136690647482, + "grad_norm": 0.13838801655443417, + "learning_rate": 1.7241379310344828e-05, + "loss": 0.4471, + "step": 75 + }, + { + "epoch": 0.1093525179856115, + "grad_norm": 0.19416526233841697, + "learning_rate": 1.7471264367816093e-05, + "loss": 0.4493, + "step": 76 + }, + { + "epoch": 0.11079136690647481, + "grad_norm": 0.23172341553218395, + "learning_rate": 1.770114942528736e-05, + "loss": 0.456, + "step": 77 + }, + { + "epoch": 0.11223021582733812, + "grad_norm": 0.25887867069435166, + "learning_rate": 1.7931034482758623e-05, + "loss": 0.4617, + "step": 78 + }, + { + "epoch": 0.11366906474820145, + "grad_norm": 0.34859196212947413, + "learning_rate": 1.8160919540229885e-05, + "loss": 0.4513, + "step": 79 + }, + { + "epoch": 0.11510791366906475, + "grad_norm": 0.29709836278105234, + "learning_rate": 1.839080459770115e-05, + "loss": 0.4453, + "step": 80 + }, + { + "epoch": 0.11654676258992806, + "grad_norm": 0.22415770328366957, + "learning_rate": 1.8620689655172415e-05, + "loss": 0.4473, + "step": 81 + }, + { + "epoch": 0.11798561151079137, + "grad_norm": 0.1832543617929052, + "learning_rate": 1.885057471264368e-05, + "loss": 0.4504, + "step": 82 + }, + { + "epoch": 0.11942446043165468, + "grad_norm": 0.16025347616908636, + "learning_rate": 1.908045977011494e-05, + "loss": 0.4571, + "step": 83 + }, + { + "epoch": 0.12086330935251799, + "grad_norm": 0.155497884160083, + "learning_rate": 1.931034482758621e-05, + "loss": 0.4416, + "step": 84 + }, + { + "epoch": 0.1223021582733813, + "grad_norm": 0.19185065201692705, + "learning_rate": 1.9540229885057475e-05, + "loss": 0.4479, + "step": 85 + }, + { + "epoch": 0.1237410071942446, + "grad_norm": 0.2389793638238605, + "learning_rate": 1.9770114942528737e-05, + "loss": 0.4548, + "step": 86 + }, + { + "epoch": 0.1251798561151079, + "grad_norm": 0.2371356137047519, + "learning_rate": 2e-05, + "loss": 0.4498, + "step": 87 + }, + { + "epoch": 0.12661870503597122, + "grad_norm": 0.2056326072963956, + "learning_rate": 2.0229885057471267e-05, + "loss": 0.4441, + "step": 88 + }, + { + "epoch": 0.12805755395683452, + "grad_norm": 0.1392259208106514, + "learning_rate": 2.0459770114942528e-05, + "loss": 0.4411, + "step": 89 + }, + { + "epoch": 0.12949640287769784, + "grad_norm": 0.14101469473373407, + "learning_rate": 2.0689655172413797e-05, + "loss": 0.4432, + "step": 90 + }, + { + "epoch": 0.13093525179856116, + "grad_norm": 0.1701408324626712, + "learning_rate": 2.0919540229885058e-05, + "loss": 0.4465, + "step": 91 + }, + { + "epoch": 0.13237410071942446, + "grad_norm": 0.20477036899270176, + "learning_rate": 2.1149425287356323e-05, + "loss": 0.4422, + "step": 92 + }, + { + "epoch": 0.13381294964028778, + "grad_norm": 0.19497091900720495, + "learning_rate": 2.1379310344827585e-05, + "loss": 0.4408, + "step": 93 + }, + { + "epoch": 0.13525179856115108, + "grad_norm": 0.19755905692235068, + "learning_rate": 2.1609195402298853e-05, + "loss": 0.4386, + "step": 94 + }, + { + "epoch": 0.1366906474820144, + "grad_norm": 0.2538534216995381, + "learning_rate": 2.183908045977012e-05, + "loss": 0.4411, + "step": 95 + }, + { + "epoch": 0.1381294964028777, + "grad_norm": 0.33652040167995695, + "learning_rate": 2.206896551724138e-05, + "loss": 0.4534, + "step": 96 + }, + { + "epoch": 0.13956834532374102, + "grad_norm": 0.4499557903583448, + "learning_rate": 2.229885057471265e-05, + "loss": 0.4337, + "step": 97 + }, + { + "epoch": 0.1410071942446043, + "grad_norm": 0.4787848785232312, + "learning_rate": 2.252873563218391e-05, + "loss": 0.4483, + "step": 98 + }, + { + "epoch": 0.14244604316546763, + "grad_norm": 0.3652726105292967, + "learning_rate": 2.2758620689655175e-05, + "loss": 0.4448, + "step": 99 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.1850523458420075, + "learning_rate": 2.2988505747126437e-05, + "loss": 0.436, + "step": 100 + }, + { + "epoch": 0.14532374100719425, + "grad_norm": 0.28013625574877, + "learning_rate": 2.3218390804597705e-05, + "loss": 0.4378, + "step": 101 + }, + { + "epoch": 0.14676258992805755, + "grad_norm": 0.34859128737402756, + "learning_rate": 2.3448275862068967e-05, + "loss": 0.4485, + "step": 102 + }, + { + "epoch": 0.14820143884892087, + "grad_norm": 0.2386322213525175, + "learning_rate": 2.367816091954023e-05, + "loss": 0.4386, + "step": 103 + }, + { + "epoch": 0.14964028776978416, + "grad_norm": 0.21644678275392948, + "learning_rate": 2.3908045977011497e-05, + "loss": 0.4342, + "step": 104 + }, + { + "epoch": 0.1510791366906475, + "grad_norm": 0.3563969211116265, + "learning_rate": 2.413793103448276e-05, + "loss": 0.4382, + "step": 105 + }, + { + "epoch": 0.15251798561151078, + "grad_norm": 0.29362358534583805, + "learning_rate": 2.4367816091954027e-05, + "loss": 0.4422, + "step": 106 + }, + { + "epoch": 0.1539568345323741, + "grad_norm": 0.1966080484385412, + "learning_rate": 2.4597701149425288e-05, + "loss": 0.436, + "step": 107 + }, + { + "epoch": 0.1553956834532374, + "grad_norm": 0.3027821801910231, + "learning_rate": 2.4827586206896553e-05, + "loss": 0.4431, + "step": 108 + }, + { + "epoch": 0.15683453237410072, + "grad_norm": 0.3459544142754879, + "learning_rate": 2.5057471264367815e-05, + "loss": 0.4394, + "step": 109 + }, + { + "epoch": 0.15827338129496402, + "grad_norm": 0.29343882759370143, + "learning_rate": 2.5287356321839083e-05, + "loss": 0.4368, + "step": 110 + }, + { + "epoch": 0.15971223021582734, + "grad_norm": 0.3049054900232845, + "learning_rate": 2.551724137931035e-05, + "loss": 0.4364, + "step": 111 + }, + { + "epoch": 0.16115107913669063, + "grad_norm": 0.4170403918987231, + "learning_rate": 2.574712643678161e-05, + "loss": 0.4407, + "step": 112 + }, + { + "epoch": 0.16258992805755396, + "grad_norm": 0.6096237752048831, + "learning_rate": 2.597701149425288e-05, + "loss": 0.4436, + "step": 113 + }, + { + "epoch": 0.16402877697841728, + "grad_norm": 0.872737579366971, + "learning_rate": 2.620689655172414e-05, + "loss": 0.4469, + "step": 114 + }, + { + "epoch": 0.16546762589928057, + "grad_norm": 0.7464229294451004, + "learning_rate": 2.6436781609195405e-05, + "loss": 0.4385, + "step": 115 + }, + { + "epoch": 0.1669064748201439, + "grad_norm": 0.3430285738778335, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.4307, + "step": 116 + }, + { + "epoch": 0.1683453237410072, + "grad_norm": 0.5713328379650735, + "learning_rate": 2.6896551724137935e-05, + "loss": 0.4389, + "step": 117 + }, + { + "epoch": 0.1697841726618705, + "grad_norm": 0.5230987770214107, + "learning_rate": 2.7126436781609197e-05, + "loss": 0.4335, + "step": 118 + }, + { + "epoch": 0.1712230215827338, + "grad_norm": 0.5925850527079884, + "learning_rate": 2.735632183908046e-05, + "loss": 0.4423, + "step": 119 + }, + { + "epoch": 0.17266187050359713, + "grad_norm": 0.5490079016610805, + "learning_rate": 2.7586206896551727e-05, + "loss": 0.4427, + "step": 120 + }, + { + "epoch": 0.17410071942446043, + "grad_norm": 0.4270003450740747, + "learning_rate": 2.781609195402299e-05, + "loss": 0.4304, + "step": 121 + }, + { + "epoch": 0.17553956834532375, + "grad_norm": 0.6019702794042444, + "learning_rate": 2.8045977011494257e-05, + "loss": 0.4354, + "step": 122 + }, + { + "epoch": 0.17697841726618704, + "grad_norm": 0.334370783331933, + "learning_rate": 2.8275862068965518e-05, + "loss": 0.4429, + "step": 123 + }, + { + "epoch": 0.17841726618705037, + "grad_norm": 0.47395349878838183, + "learning_rate": 2.8505747126436783e-05, + "loss": 0.4287, + "step": 124 + }, + { + "epoch": 0.17985611510791366, + "grad_norm": 0.3038871824866802, + "learning_rate": 2.8735632183908045e-05, + "loss": 0.4248, + "step": 125 + }, + { + "epoch": 0.18129496402877698, + "grad_norm": 0.47453715308720656, + "learning_rate": 2.8965517241379313e-05, + "loss": 0.4343, + "step": 126 + }, + { + "epoch": 0.18273381294964028, + "grad_norm": 0.31926301463397466, + "learning_rate": 2.919540229885058e-05, + "loss": 0.4353, + "step": 127 + }, + { + "epoch": 0.1841726618705036, + "grad_norm": 0.33941879358451277, + "learning_rate": 2.942528735632184e-05, + "loss": 0.4295, + "step": 128 + }, + { + "epoch": 0.1856115107913669, + "grad_norm": 0.27089134597336756, + "learning_rate": 2.965517241379311e-05, + "loss": 0.4384, + "step": 129 + }, + { + "epoch": 0.18705035971223022, + "grad_norm": 0.31129593176521775, + "learning_rate": 2.988505747126437e-05, + "loss": 0.4203, + "step": 130 + }, + { + "epoch": 0.1884892086330935, + "grad_norm": 0.283986981943612, + "learning_rate": 3.0114942528735635e-05, + "loss": 0.4308, + "step": 131 + }, + { + "epoch": 0.18992805755395684, + "grad_norm": 0.25674815225905595, + "learning_rate": 3.0344827586206897e-05, + "loss": 0.4196, + "step": 132 + }, + { + "epoch": 0.19136690647482013, + "grad_norm": 0.28252583896309685, + "learning_rate": 3.057471264367816e-05, + "loss": 0.4237, + "step": 133 + }, + { + "epoch": 0.19280575539568345, + "grad_norm": 0.26632639356638954, + "learning_rate": 3.080459770114942e-05, + "loss": 0.4232, + "step": 134 + }, + { + "epoch": 0.19424460431654678, + "grad_norm": 0.30656043922613224, + "learning_rate": 3.103448275862069e-05, + "loss": 0.4289, + "step": 135 + }, + { + "epoch": 0.19568345323741007, + "grad_norm": 0.25927043256426724, + "learning_rate": 3.126436781609196e-05, + "loss": 0.427, + "step": 136 + }, + { + "epoch": 0.1971223021582734, + "grad_norm": 0.24798130007018074, + "learning_rate": 3.149425287356322e-05, + "loss": 0.4345, + "step": 137 + }, + { + "epoch": 0.1985611510791367, + "grad_norm": 0.23475327425958922, + "learning_rate": 3.172413793103448e-05, + "loss": 0.4287, + "step": 138 + }, + { + "epoch": 0.2, + "grad_norm": 0.2377350899828933, + "learning_rate": 3.195402298850575e-05, + "loss": 0.4273, + "step": 139 + }, + { + "epoch": 0.2014388489208633, + "grad_norm": 0.3071734095000456, + "learning_rate": 3.218390804597701e-05, + "loss": 0.4248, + "step": 140 + }, + { + "epoch": 0.20287769784172663, + "grad_norm": 0.3350793570904282, + "learning_rate": 3.2413793103448275e-05, + "loss": 0.4261, + "step": 141 + }, + { + "epoch": 0.20431654676258992, + "grad_norm": 0.43440066658930093, + "learning_rate": 3.264367816091954e-05, + "loss": 0.4434, + "step": 142 + }, + { + "epoch": 0.20575539568345325, + "grad_norm": 0.5538333001404411, + "learning_rate": 3.287356321839081e-05, + "loss": 0.4326, + "step": 143 + }, + { + "epoch": 0.20719424460431654, + "grad_norm": 0.5881012244147088, + "learning_rate": 3.310344827586207e-05, + "loss": 0.4297, + "step": 144 + }, + { + "epoch": 0.20863309352517986, + "grad_norm": 0.4374203738344661, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.4342, + "step": 145 + }, + { + "epoch": 0.21007194244604316, + "grad_norm": 0.4160120130456441, + "learning_rate": 3.3563218390804597e-05, + "loss": 0.4251, + "step": 146 + }, + { + "epoch": 0.21151079136690648, + "grad_norm": 0.5392228648425607, + "learning_rate": 3.3793103448275865e-05, + "loss": 0.439, + "step": 147 + }, + { + "epoch": 0.21294964028776978, + "grad_norm": 0.4290554187950581, + "learning_rate": 3.4022988505747127e-05, + "loss": 0.4263, + "step": 148 + }, + { + "epoch": 0.2143884892086331, + "grad_norm": 0.445806647816008, + "learning_rate": 3.4252873563218395e-05, + "loss": 0.4266, + "step": 149 + }, + { + "epoch": 0.2158273381294964, + "grad_norm": 0.5143056157416875, + "learning_rate": 3.4482758620689657e-05, + "loss": 0.4302, + "step": 150 + }, + { + "epoch": 0.21726618705035972, + "grad_norm": 0.31334789130815455, + "learning_rate": 3.4712643678160925e-05, + "loss": 0.4334, + "step": 151 + }, + { + "epoch": 0.218705035971223, + "grad_norm": 0.4208883505046265, + "learning_rate": 3.4942528735632187e-05, + "loss": 0.4311, + "step": 152 + }, + { + "epoch": 0.22014388489208633, + "grad_norm": 0.34323260654772186, + "learning_rate": 3.517241379310345e-05, + "loss": 0.432, + "step": 153 + }, + { + "epoch": 0.22158273381294963, + "grad_norm": 0.3711247664184252, + "learning_rate": 3.540229885057472e-05, + "loss": 0.427, + "step": 154 + }, + { + "epoch": 0.22302158273381295, + "grad_norm": 0.41996688614643096, + "learning_rate": 3.563218390804598e-05, + "loss": 0.4256, + "step": 155 + }, + { + "epoch": 0.22446043165467625, + "grad_norm": 0.4405013162802177, + "learning_rate": 3.586206896551725e-05, + "loss": 0.4234, + "step": 156 + }, + { + "epoch": 0.22589928057553957, + "grad_norm": 0.440866752806929, + "learning_rate": 3.609195402298851e-05, + "loss": 0.4211, + "step": 157 + }, + { + "epoch": 0.2273381294964029, + "grad_norm": 0.3803054960625481, + "learning_rate": 3.632183908045977e-05, + "loss": 0.4291, + "step": 158 + }, + { + "epoch": 0.22877697841726619, + "grad_norm": 0.47391874678742224, + "learning_rate": 3.655172413793104e-05, + "loss": 0.4259, + "step": 159 + }, + { + "epoch": 0.2302158273381295, + "grad_norm": 0.4766341025501577, + "learning_rate": 3.67816091954023e-05, + "loss": 0.4284, + "step": 160 + }, + { + "epoch": 0.2316546762589928, + "grad_norm": 0.7700120448534205, + "learning_rate": 3.701149425287357e-05, + "loss": 0.4216, + "step": 161 + }, + { + "epoch": 0.23309352517985613, + "grad_norm": 0.7842053963294792, + "learning_rate": 3.724137931034483e-05, + "loss": 0.4303, + "step": 162 + }, + { + "epoch": 0.23453237410071942, + "grad_norm": 0.4967000795881762, + "learning_rate": 3.74712643678161e-05, + "loss": 0.4191, + "step": 163 + }, + { + "epoch": 0.23597122302158274, + "grad_norm": 0.49658628735384647, + "learning_rate": 3.770114942528736e-05, + "loss": 0.4224, + "step": 164 + }, + { + "epoch": 0.23741007194244604, + "grad_norm": 0.39151704923998104, + "learning_rate": 3.793103448275862e-05, + "loss": 0.4255, + "step": 165 + }, + { + "epoch": 0.23884892086330936, + "grad_norm": 0.5367521813703079, + "learning_rate": 3.816091954022988e-05, + "loss": 0.4174, + "step": 166 + }, + { + "epoch": 0.24028776978417266, + "grad_norm": 0.3717546418391886, + "learning_rate": 3.839080459770115e-05, + "loss": 0.4245, + "step": 167 + }, + { + "epoch": 0.24172661870503598, + "grad_norm": 0.36917102874526303, + "learning_rate": 3.862068965517242e-05, + "loss": 0.4175, + "step": 168 + }, + { + "epoch": 0.24316546762589927, + "grad_norm": 0.3939287560108561, + "learning_rate": 3.885057471264368e-05, + "loss": 0.4254, + "step": 169 + }, + { + "epoch": 0.2446043165467626, + "grad_norm": 0.3705339045689148, + "learning_rate": 3.908045977011495e-05, + "loss": 0.4183, + "step": 170 + }, + { + "epoch": 0.2460431654676259, + "grad_norm": 0.3977076335757685, + "learning_rate": 3.931034482758621e-05, + "loss": 0.4359, + "step": 171 + }, + { + "epoch": 0.2474820143884892, + "grad_norm": 0.3239142876763284, + "learning_rate": 3.954022988505747e-05, + "loss": 0.4131, + "step": 172 + }, + { + "epoch": 0.2489208633093525, + "grad_norm": 0.4004560562375386, + "learning_rate": 3.9770114942528735e-05, + "loss": 0.4154, + "step": 173 + }, + { + "epoch": 0.2503597122302158, + "grad_norm": 0.3482246073839111, + "learning_rate": 4e-05, + "loss": 0.4191, + "step": 174 + }, + { + "epoch": 0.2517985611510791, + "grad_norm": 0.30916823043183983, + "learning_rate": 4.022988505747127e-05, + "loss": 0.4296, + "step": 175 + }, + { + "epoch": 0.25323741007194245, + "grad_norm": 0.3926285238360458, + "learning_rate": 4.045977011494253e-05, + "loss": 0.4298, + "step": 176 + }, + { + "epoch": 0.25467625899280577, + "grad_norm": 0.3473702277340475, + "learning_rate": 4.0689655172413795e-05, + "loss": 0.4276, + "step": 177 + }, + { + "epoch": 0.25611510791366904, + "grad_norm": 0.39203554127421786, + "learning_rate": 4.0919540229885057e-05, + "loss": 0.4294, + "step": 178 + }, + { + "epoch": 0.25755395683453236, + "grad_norm": 0.36858650708676194, + "learning_rate": 4.1149425287356325e-05, + "loss": 0.4272, + "step": 179 + }, + { + "epoch": 0.2589928057553957, + "grad_norm": 0.35331416241514024, + "learning_rate": 4.137931034482759e-05, + "loss": 0.4165, + "step": 180 + }, + { + "epoch": 0.260431654676259, + "grad_norm": 0.3650719681137465, + "learning_rate": 4.160919540229885e-05, + "loss": 0.4246, + "step": 181 + }, + { + "epoch": 0.26187050359712233, + "grad_norm": 0.396532271347279, + "learning_rate": 4.1839080459770117e-05, + "loss": 0.4077, + "step": 182 + }, + { + "epoch": 0.2633093525179856, + "grad_norm": 0.5027339023410375, + "learning_rate": 4.2068965517241385e-05, + "loss": 0.4171, + "step": 183 + }, + { + "epoch": 0.2647482014388489, + "grad_norm": 0.5398000888843012, + "learning_rate": 4.2298850574712647e-05, + "loss": 0.4183, + "step": 184 + }, + { + "epoch": 0.26618705035971224, + "grad_norm": 0.388305209367546, + "learning_rate": 4.2528735632183915e-05, + "loss": 0.4215, + "step": 185 + }, + { + "epoch": 0.26762589928057556, + "grad_norm": 0.3594204739218431, + "learning_rate": 4.275862068965517e-05, + "loss": 0.4192, + "step": 186 + }, + { + "epoch": 0.26906474820143883, + "grad_norm": 0.40748119218221474, + "learning_rate": 4.298850574712644e-05, + "loss": 0.4267, + "step": 187 + }, + { + "epoch": 0.27050359712230215, + "grad_norm": 0.4831661157601768, + "learning_rate": 4.321839080459771e-05, + "loss": 0.4161, + "step": 188 + }, + { + "epoch": 0.2719424460431655, + "grad_norm": 0.6431491770515896, + "learning_rate": 4.344827586206897e-05, + "loss": 0.4214, + "step": 189 + }, + { + "epoch": 0.2733812949640288, + "grad_norm": 0.7689279391826508, + "learning_rate": 4.367816091954024e-05, + "loss": 0.4326, + "step": 190 + }, + { + "epoch": 0.27482014388489207, + "grad_norm": 0.8194099131112732, + "learning_rate": 4.39080459770115e-05, + "loss": 0.4224, + "step": 191 + }, + { + "epoch": 0.2762589928057554, + "grad_norm": 0.5719341880685053, + "learning_rate": 4.413793103448276e-05, + "loss": 0.4142, + "step": 192 + }, + { + "epoch": 0.2776978417266187, + "grad_norm": 0.48369772529062066, + "learning_rate": 4.436781609195403e-05, + "loss": 0.4178, + "step": 193 + }, + { + "epoch": 0.27913669064748203, + "grad_norm": 0.5930946177166372, + "learning_rate": 4.45977011494253e-05, + "loss": 0.4179, + "step": 194 + }, + { + "epoch": 0.2805755395683453, + "grad_norm": 0.4258909411652574, + "learning_rate": 4.482758620689655e-05, + "loss": 0.4234, + "step": 195 + }, + { + "epoch": 0.2820143884892086, + "grad_norm": 0.38845248341947575, + "learning_rate": 4.505747126436782e-05, + "loss": 0.4177, + "step": 196 + }, + { + "epoch": 0.28345323741007195, + "grad_norm": 0.5374461347132847, + "learning_rate": 4.528735632183908e-05, + "loss": 0.4186, + "step": 197 + }, + { + "epoch": 0.28489208633093527, + "grad_norm": 0.414301384735006, + "learning_rate": 4.551724137931035e-05, + "loss": 0.4123, + "step": 198 + }, + { + "epoch": 0.28633093525179854, + "grad_norm": 0.3421929046493428, + "learning_rate": 4.574712643678162e-05, + "loss": 0.4116, + "step": 199 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.4720206403909787, + "learning_rate": 4.597701149425287e-05, + "loss": 0.4212, + "step": 200 + }, + { + "epoch": 0.2892086330935252, + "grad_norm": 0.41305656015425807, + "learning_rate": 4.620689655172414e-05, + "loss": 0.4112, + "step": 201 + }, + { + "epoch": 0.2906474820143885, + "grad_norm": 0.303624198533764, + "learning_rate": 4.643678160919541e-05, + "loss": 0.4066, + "step": 202 + }, + { + "epoch": 0.2920863309352518, + "grad_norm": 0.4023935675809489, + "learning_rate": 4.666666666666667e-05, + "loss": 0.4195, + "step": 203 + }, + { + "epoch": 0.2935251798561151, + "grad_norm": 0.4065299617485997, + "learning_rate": 4.689655172413793e-05, + "loss": 0.416, + "step": 204 + }, + { + "epoch": 0.2949640287769784, + "grad_norm": 0.4328671507220006, + "learning_rate": 4.7126436781609195e-05, + "loss": 0.4193, + "step": 205 + }, + { + "epoch": 0.29640287769784174, + "grad_norm": 0.5357838410994751, + "learning_rate": 4.735632183908046e-05, + "loss": 0.4117, + "step": 206 + }, + { + "epoch": 0.29784172661870506, + "grad_norm": 0.4771204429978033, + "learning_rate": 4.758620689655173e-05, + "loss": 0.412, + "step": 207 + }, + { + "epoch": 0.2992805755395683, + "grad_norm": 0.437631567776442, + "learning_rate": 4.781609195402299e-05, + "loss": 0.4159, + "step": 208 + }, + { + "epoch": 0.30071942446043165, + "grad_norm": 0.5244670541769951, + "learning_rate": 4.8045977011494255e-05, + "loss": 0.4065, + "step": 209 + }, + { + "epoch": 0.302158273381295, + "grad_norm": 0.47470433749275237, + "learning_rate": 4.827586206896552e-05, + "loss": 0.4165, + "step": 210 + }, + { + "epoch": 0.3035971223021583, + "grad_norm": 0.37019131484391476, + "learning_rate": 4.8505747126436785e-05, + "loss": 0.4027, + "step": 211 + }, + { + "epoch": 0.30503597122302156, + "grad_norm": 0.3677924799998598, + "learning_rate": 4.873563218390805e-05, + "loss": 0.4221, + "step": 212 + }, + { + "epoch": 0.3064748201438849, + "grad_norm": 0.4213221362717633, + "learning_rate": 4.896551724137931e-05, + "loss": 0.4136, + "step": 213 + }, + { + "epoch": 0.3079136690647482, + "grad_norm": 0.5005616556129306, + "learning_rate": 4.9195402298850577e-05, + "loss": 0.4146, + "step": 214 + }, + { + "epoch": 0.30935251798561153, + "grad_norm": 0.5660260676340962, + "learning_rate": 4.9425287356321845e-05, + "loss": 0.415, + "step": 215 + }, + { + "epoch": 0.3107913669064748, + "grad_norm": 0.6288560002121155, + "learning_rate": 4.9655172413793107e-05, + "loss": 0.4127, + "step": 216 + }, + { + "epoch": 0.3122302158273381, + "grad_norm": 0.629138529219548, + "learning_rate": 4.9885057471264375e-05, + "loss": 0.4139, + "step": 217 + }, + { + "epoch": 0.31366906474820144, + "grad_norm": 0.423519291278689, + "learning_rate": 5.011494252873563e-05, + "loss": 0.4143, + "step": 218 + }, + { + "epoch": 0.31510791366906477, + "grad_norm": 0.34837499698479096, + "learning_rate": 5.03448275862069e-05, + "loss": 0.4143, + "step": 219 + }, + { + "epoch": 0.31654676258992803, + "grad_norm": 0.4615319088669839, + "learning_rate": 5.057471264367817e-05, + "loss": 0.4126, + "step": 220 + }, + { + "epoch": 0.31798561151079136, + "grad_norm": 0.4937186470978348, + "learning_rate": 5.0804597701149435e-05, + "loss": 0.4138, + "step": 221 + }, + { + "epoch": 0.3194244604316547, + "grad_norm": 0.414189734087535, + "learning_rate": 5.10344827586207e-05, + "loss": 0.4217, + "step": 222 + }, + { + "epoch": 0.320863309352518, + "grad_norm": 0.28717519451044027, + "learning_rate": 5.126436781609196e-05, + "loss": 0.4143, + "step": 223 + }, + { + "epoch": 0.32230215827338127, + "grad_norm": 0.48031210981271294, + "learning_rate": 5.149425287356322e-05, + "loss": 0.4211, + "step": 224 + }, + { + "epoch": 0.3237410071942446, + "grad_norm": 0.4729273176257217, + "learning_rate": 5.172413793103449e-05, + "loss": 0.4188, + "step": 225 + }, + { + "epoch": 0.3251798561151079, + "grad_norm": 0.25093035536711344, + "learning_rate": 5.195402298850576e-05, + "loss": 0.4087, + "step": 226 + }, + { + "epoch": 0.32661870503597124, + "grad_norm": 0.3013410763134809, + "learning_rate": 5.218390804597701e-05, + "loss": 0.4158, + "step": 227 + }, + { + "epoch": 0.32805755395683456, + "grad_norm": 0.44716508261402804, + "learning_rate": 5.241379310344828e-05, + "loss": 0.4099, + "step": 228 + }, + { + "epoch": 0.3294964028776978, + "grad_norm": 0.3809050702014656, + "learning_rate": 5.264367816091954e-05, + "loss": 0.4113, + "step": 229 + }, + { + "epoch": 0.33093525179856115, + "grad_norm": 0.20636335745276008, + "learning_rate": 5.287356321839081e-05, + "loss": 0.4105, + "step": 230 + }, + { + "epoch": 0.33237410071942447, + "grad_norm": 0.3511948529002789, + "learning_rate": 5.310344827586208e-05, + "loss": 0.4185, + "step": 231 + }, + { + "epoch": 0.3338129496402878, + "grad_norm": 0.40331439495888555, + "learning_rate": 5.333333333333333e-05, + "loss": 0.4221, + "step": 232 + }, + { + "epoch": 0.33525179856115106, + "grad_norm": 0.34719092742015417, + "learning_rate": 5.35632183908046e-05, + "loss": 0.4145, + "step": 233 + }, + { + "epoch": 0.3366906474820144, + "grad_norm": 0.5114677709909795, + "learning_rate": 5.379310344827587e-05, + "loss": 0.4112, + "step": 234 + }, + { + "epoch": 0.3381294964028777, + "grad_norm": 0.7803945133643926, + "learning_rate": 5.402298850574713e-05, + "loss": 0.4165, + "step": 235 + }, + { + "epoch": 0.339568345323741, + "grad_norm": 0.9609656354531556, + "learning_rate": 5.425287356321839e-05, + "loss": 0.4212, + "step": 236 + }, + { + "epoch": 0.3410071942446043, + "grad_norm": 0.8077612130876954, + "learning_rate": 5.4482758620689655e-05, + "loss": 0.4195, + "step": 237 + }, + { + "epoch": 0.3424460431654676, + "grad_norm": 0.44530673257470504, + "learning_rate": 5.471264367816092e-05, + "loss": 0.415, + "step": 238 + }, + { + "epoch": 0.34388489208633094, + "grad_norm": 1.1511565326195332, + "learning_rate": 5.494252873563219e-05, + "loss": 0.4222, + "step": 239 + }, + { + "epoch": 0.34532374100719426, + "grad_norm": 0.6381960947846417, + "learning_rate": 5.517241379310345e-05, + "loss": 0.4246, + "step": 240 + }, + { + "epoch": 0.34676258992805753, + "grad_norm": 0.49519466414702334, + "learning_rate": 5.5402298850574715e-05, + "loss": 0.4146, + "step": 241 + }, + { + "epoch": 0.34820143884892085, + "grad_norm": 0.5237504473154212, + "learning_rate": 5.563218390804598e-05, + "loss": 0.4192, + "step": 242 + }, + { + "epoch": 0.3496402877697842, + "grad_norm": 0.5941024386408629, + "learning_rate": 5.5862068965517245e-05, + "loss": 0.4262, + "step": 243 + }, + { + "epoch": 0.3510791366906475, + "grad_norm": 0.5466007109154629, + "learning_rate": 5.609195402298851e-05, + "loss": 0.4207, + "step": 244 + }, + { + "epoch": 0.35251798561151076, + "grad_norm": 0.5070998561917002, + "learning_rate": 5.632183908045977e-05, + "loss": 0.4178, + "step": 245 + }, + { + "epoch": 0.3539568345323741, + "grad_norm": 0.5532223210228329, + "learning_rate": 5.6551724137931037e-05, + "loss": 0.4204, + "step": 246 + }, + { + "epoch": 0.3553956834532374, + "grad_norm": 0.5476749200012132, + "learning_rate": 5.6781609195402305e-05, + "loss": 0.4173, + "step": 247 + }, + { + "epoch": 0.35683453237410073, + "grad_norm": 0.4702862030858147, + "learning_rate": 5.7011494252873567e-05, + "loss": 0.4208, + "step": 248 + }, + { + "epoch": 0.35827338129496406, + "grad_norm": 0.4491819673301651, + "learning_rate": 5.7241379310344835e-05, + "loss": 0.4189, + "step": 249 + }, + { + "epoch": 0.3597122302158273, + "grad_norm": 0.47023442603839466, + "learning_rate": 5.747126436781609e-05, + "loss": 0.4223, + "step": 250 + }, + { + "epoch": 0.36115107913669064, + "grad_norm": 0.39940693933184623, + "learning_rate": 5.770114942528736e-05, + "loss": 0.4097, + "step": 251 + }, + { + "epoch": 0.36258992805755397, + "grad_norm": 0.3758340984896894, + "learning_rate": 5.7931034482758627e-05, + "loss": 0.4172, + "step": 252 + }, + { + "epoch": 0.3640287769784173, + "grad_norm": 0.40155533823476564, + "learning_rate": 5.8160919540229895e-05, + "loss": 0.4151, + "step": 253 + }, + { + "epoch": 0.36546762589928056, + "grad_norm": 0.34253263674719914, + "learning_rate": 5.839080459770116e-05, + "loss": 0.4203, + "step": 254 + }, + { + "epoch": 0.3669064748201439, + "grad_norm": 0.36390271929883083, + "learning_rate": 5.862068965517242e-05, + "loss": 0.4134, + "step": 255 + }, + { + "epoch": 0.3683453237410072, + "grad_norm": 0.33231707017315143, + "learning_rate": 5.885057471264368e-05, + "loss": 0.419, + "step": 256 + }, + { + "epoch": 0.3697841726618705, + "grad_norm": 0.33117090224747675, + "learning_rate": 5.908045977011495e-05, + "loss": 0.4105, + "step": 257 + }, + { + "epoch": 0.3712230215827338, + "grad_norm": 0.3823966378687382, + "learning_rate": 5.931034482758622e-05, + "loss": 0.4271, + "step": 258 + }, + { + "epoch": 0.3726618705035971, + "grad_norm": 0.3940172882104445, + "learning_rate": 5.954022988505747e-05, + "loss": 0.4157, + "step": 259 + }, + { + "epoch": 0.37410071942446044, + "grad_norm": 0.3871761128804785, + "learning_rate": 5.977011494252874e-05, + "loss": 0.4193, + "step": 260 + }, + { + "epoch": 0.37553956834532376, + "grad_norm": 0.5047843144990062, + "learning_rate": 6.000000000000001e-05, + "loss": 0.4194, + "step": 261 + }, + { + "epoch": 0.376978417266187, + "grad_norm": 0.703100287188996, + "learning_rate": 6.022988505747127e-05, + "loss": 0.4153, + "step": 262 + }, + { + "epoch": 0.37841726618705035, + "grad_norm": 0.8316949370232966, + "learning_rate": 6.045977011494254e-05, + "loss": 0.423, + "step": 263 + }, + { + "epoch": 0.37985611510791367, + "grad_norm": 0.6133608576551307, + "learning_rate": 6.068965517241379e-05, + "loss": 0.4203, + "step": 264 + }, + { + "epoch": 0.381294964028777, + "grad_norm": 0.3077241840978202, + "learning_rate": 6.091954022988506e-05, + "loss": 0.4138, + "step": 265 + }, + { + "epoch": 0.38273381294964026, + "grad_norm": 0.574646804178916, + "learning_rate": 6.114942528735632e-05, + "loss": 0.4126, + "step": 266 + }, + { + "epoch": 0.3841726618705036, + "grad_norm": 0.5005718982159811, + "learning_rate": 6.137931034482759e-05, + "loss": 0.4171, + "step": 267 + }, + { + "epoch": 0.3856115107913669, + "grad_norm": 0.3374353201004547, + "learning_rate": 6.160919540229885e-05, + "loss": 0.4084, + "step": 268 + }, + { + "epoch": 0.38705035971223023, + "grad_norm": 0.4562006878600622, + "learning_rate": 6.183908045977011e-05, + "loss": 0.4163, + "step": 269 + }, + { + "epoch": 0.38848920863309355, + "grad_norm": 0.48033384427276327, + "learning_rate": 6.206896551724138e-05, + "loss": 0.4065, + "step": 270 + }, + { + "epoch": 0.3899280575539568, + "grad_norm": 0.32216192622007456, + "learning_rate": 6.229885057471265e-05, + "loss": 0.4112, + "step": 271 + }, + { + "epoch": 0.39136690647482014, + "grad_norm": 0.44669616782797317, + "learning_rate": 6.252873563218392e-05, + "loss": 0.4079, + "step": 272 + }, + { + "epoch": 0.39280575539568346, + "grad_norm": 0.4476950772763014, + "learning_rate": 6.275862068965517e-05, + "loss": 0.4111, + "step": 273 + }, + { + "epoch": 0.3942446043165468, + "grad_norm": 0.2886411172690327, + "learning_rate": 6.298850574712644e-05, + "loss": 0.4043, + "step": 274 + }, + { + "epoch": 0.39568345323741005, + "grad_norm": 0.3702149250397203, + "learning_rate": 6.321839080459771e-05, + "loss": 0.402, + "step": 275 + }, + { + "epoch": 0.3971223021582734, + "grad_norm": 0.3735855064136341, + "learning_rate": 6.344827586206897e-05, + "loss": 0.4097, + "step": 276 + }, + { + "epoch": 0.3985611510791367, + "grad_norm": 0.2578446895913922, + "learning_rate": 6.367816091954023e-05, + "loss": 0.4085, + "step": 277 + }, + { + "epoch": 0.4, + "grad_norm": 0.36680592540229395, + "learning_rate": 6.39080459770115e-05, + "loss": 0.41, + "step": 278 + }, + { + "epoch": 0.4014388489208633, + "grad_norm": 0.2847257307786734, + "learning_rate": 6.413793103448276e-05, + "loss": 0.4083, + "step": 279 + }, + { + "epoch": 0.4028776978417266, + "grad_norm": 0.31354966366517834, + "learning_rate": 6.436781609195403e-05, + "loss": 0.4079, + "step": 280 + }, + { + "epoch": 0.40431654676258993, + "grad_norm": 0.34653289172604856, + "learning_rate": 6.45977011494253e-05, + "loss": 0.4108, + "step": 281 + }, + { + "epoch": 0.40575539568345326, + "grad_norm": 0.24017294677401724, + "learning_rate": 6.482758620689655e-05, + "loss": 0.3986, + "step": 282 + }, + { + "epoch": 0.4071942446043165, + "grad_norm": 0.26441107666341884, + "learning_rate": 6.505747126436782e-05, + "loss": 0.4123, + "step": 283 + }, + { + "epoch": 0.40863309352517985, + "grad_norm": 0.2911074896661384, + "learning_rate": 6.528735632183909e-05, + "loss": 0.4196, + "step": 284 + }, + { + "epoch": 0.41007194244604317, + "grad_norm": 0.338550336780261, + "learning_rate": 6.551724137931035e-05, + "loss": 0.4039, + "step": 285 + }, + { + "epoch": 0.4115107913669065, + "grad_norm": 0.4212636484391325, + "learning_rate": 6.574712643678162e-05, + "loss": 0.4191, + "step": 286 + }, + { + "epoch": 0.41294964028776976, + "grad_norm": 0.4720387151383318, + "learning_rate": 6.597701149425288e-05, + "loss": 0.3996, + "step": 287 + }, + { + "epoch": 0.4143884892086331, + "grad_norm": 0.5665310272932965, + "learning_rate": 6.620689655172415e-05, + "loss": 0.4168, + "step": 288 + }, + { + "epoch": 0.4158273381294964, + "grad_norm": 0.6879396891468814, + "learning_rate": 6.643678160919542e-05, + "loss": 0.4113, + "step": 289 + }, + { + "epoch": 0.4172661870503597, + "grad_norm": 0.9471828010742368, + "learning_rate": 6.666666666666667e-05, + "loss": 0.4192, + "step": 290 + }, + { + "epoch": 0.418705035971223, + "grad_norm": 0.589750102641128, + "learning_rate": 6.689655172413794e-05, + "loss": 0.3972, + "step": 291 + }, + { + "epoch": 0.4201438848920863, + "grad_norm": 0.512995567918449, + "learning_rate": 6.712643678160919e-05, + "loss": 0.4174, + "step": 292 + }, + { + "epoch": 0.42158273381294964, + "grad_norm": 0.6129621783395007, + "learning_rate": 6.735632183908046e-05, + "loss": 0.4188, + "step": 293 + }, + { + "epoch": 0.42302158273381296, + "grad_norm": 0.6662371744372287, + "learning_rate": 6.758620689655173e-05, + "loss": 0.4128, + "step": 294 + }, + { + "epoch": 0.4244604316546763, + "grad_norm": 0.49955639912919664, + "learning_rate": 6.7816091954023e-05, + "loss": 0.4103, + "step": 295 + }, + { + "epoch": 0.42589928057553955, + "grad_norm": 0.659145883377898, + "learning_rate": 6.804597701149425e-05, + "loss": 0.4103, + "step": 296 + }, + { + "epoch": 0.4273381294964029, + "grad_norm": 0.5342696552963402, + "learning_rate": 6.827586206896552e-05, + "loss": 0.4125, + "step": 297 + }, + { + "epoch": 0.4287769784172662, + "grad_norm": 0.5954892014584885, + "learning_rate": 6.850574712643679e-05, + "loss": 0.4085, + "step": 298 + }, + { + "epoch": 0.4302158273381295, + "grad_norm": 0.5668914244262512, + "learning_rate": 6.873563218390806e-05, + "loss": 0.4121, + "step": 299 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.5107590244214486, + "learning_rate": 6.896551724137931e-05, + "loss": 0.4106, + "step": 300 + }, + { + "epoch": 0.4330935251798561, + "grad_norm": 0.6755319559179165, + "learning_rate": 6.919540229885058e-05, + "loss": 0.4117, + "step": 301 + }, + { + "epoch": 0.43453237410071943, + "grad_norm": 0.4179083230881996, + "learning_rate": 6.942528735632185e-05, + "loss": 0.4046, + "step": 302 + }, + { + "epoch": 0.43597122302158275, + "grad_norm": 0.5770119275066495, + "learning_rate": 6.96551724137931e-05, + "loss": 0.4123, + "step": 303 + }, + { + "epoch": 0.437410071942446, + "grad_norm": 0.4400220981417635, + "learning_rate": 6.988505747126437e-05, + "loss": 0.414, + "step": 304 + }, + { + "epoch": 0.43884892086330934, + "grad_norm": 0.7787505087188255, + "learning_rate": 7.011494252873563e-05, + "loss": 0.4154, + "step": 305 + }, + { + "epoch": 0.44028776978417267, + "grad_norm": 0.5438951189611558, + "learning_rate": 7.03448275862069e-05, + "loss": 0.4103, + "step": 306 + }, + { + "epoch": 0.441726618705036, + "grad_norm": 0.7182301959610059, + "learning_rate": 7.057471264367816e-05, + "loss": 0.4248, + "step": 307 + }, + { + "epoch": 0.44316546762589926, + "grad_norm": 0.784357548693455, + "learning_rate": 7.080459770114943e-05, + "loss": 0.4248, + "step": 308 + }, + { + "epoch": 0.4446043165467626, + "grad_norm": 1.4074663898040851, + "learning_rate": 7.10344827586207e-05, + "loss": 0.4331, + "step": 309 + }, + { + "epoch": 0.4460431654676259, + "grad_norm": 0.6547039263701318, + "learning_rate": 7.126436781609196e-05, + "loss": 0.4134, + "step": 310 + }, + { + "epoch": 0.4474820143884892, + "grad_norm": 0.7020988819775665, + "learning_rate": 7.149425287356322e-05, + "loss": 0.4173, + "step": 311 + }, + { + "epoch": 0.4489208633093525, + "grad_norm": 0.5880726013717825, + "learning_rate": 7.17241379310345e-05, + "loss": 0.4129, + "step": 312 + }, + { + "epoch": 0.4503597122302158, + "grad_norm": 0.5521119116996107, + "learning_rate": 7.195402298850576e-05, + "loss": 0.4153, + "step": 313 + }, + { + "epoch": 0.45179856115107914, + "grad_norm": 3.9881369791462293, + "learning_rate": 7.218390804597702e-05, + "loss": 0.4232, + "step": 314 + }, + { + "epoch": 0.45323741007194246, + "grad_norm": 0.7734742568120329, + "learning_rate": 7.241379310344828e-05, + "loss": 0.4254, + "step": 315 + }, + { + "epoch": 0.4546762589928058, + "grad_norm": 0.6392054913190713, + "learning_rate": 7.264367816091954e-05, + "loss": 0.4154, + "step": 316 + }, + { + "epoch": 0.45611510791366905, + "grad_norm": 0.5343121642978103, + "learning_rate": 7.287356321839081e-05, + "loss": 0.4183, + "step": 317 + }, + { + "epoch": 0.45755395683453237, + "grad_norm": 0.534464146362328, + "learning_rate": 7.310344827586208e-05, + "loss": 0.4176, + "step": 318 + }, + { + "epoch": 0.4589928057553957, + "grad_norm": 0.4591671523934069, + "learning_rate": 7.333333333333333e-05, + "loss": 0.4065, + "step": 319 + }, + { + "epoch": 0.460431654676259, + "grad_norm": 0.4570818633965499, + "learning_rate": 7.35632183908046e-05, + "loss": 0.4183, + "step": 320 + }, + { + "epoch": 0.4618705035971223, + "grad_norm": 0.4108070417925211, + "learning_rate": 7.379310344827587e-05, + "loss": 0.4233, + "step": 321 + }, + { + "epoch": 0.4633093525179856, + "grad_norm": 0.40859086843130993, + "learning_rate": 7.402298850574714e-05, + "loss": 0.4126, + "step": 322 + }, + { + "epoch": 0.46474820143884893, + "grad_norm": 0.46481655751060136, + "learning_rate": 7.425287356321839e-05, + "loss": 0.4162, + "step": 323 + }, + { + "epoch": 0.46618705035971225, + "grad_norm": 0.35680701465011644, + "learning_rate": 7.448275862068966e-05, + "loss": 0.4112, + "step": 324 + }, + { + "epoch": 0.4676258992805755, + "grad_norm": 0.39342147794665006, + "learning_rate": 7.471264367816093e-05, + "loss": 0.4087, + "step": 325 + }, + { + "epoch": 0.46906474820143884, + "grad_norm": 0.343319754952168, + "learning_rate": 7.49425287356322e-05, + "loss": 0.4033, + "step": 326 + }, + { + "epoch": 0.47050359712230216, + "grad_norm": 0.32852062718493036, + "learning_rate": 7.517241379310345e-05, + "loss": 0.4135, + "step": 327 + }, + { + "epoch": 0.4719424460431655, + "grad_norm": 0.3022324788382363, + "learning_rate": 7.540229885057472e-05, + "loss": 0.4046, + "step": 328 + }, + { + "epoch": 0.47338129496402875, + "grad_norm": 0.326440648315175, + "learning_rate": 7.563218390804599e-05, + "loss": 0.4215, + "step": 329 + }, + { + "epoch": 0.4748201438848921, + "grad_norm": 0.31595727192347783, + "learning_rate": 7.586206896551724e-05, + "loss": 0.4141, + "step": 330 + }, + { + "epoch": 0.4762589928057554, + "grad_norm": 0.2663905387567008, + "learning_rate": 7.609195402298851e-05, + "loss": 0.4081, + "step": 331 + }, + { + "epoch": 0.4776978417266187, + "grad_norm": 0.278590485517421, + "learning_rate": 7.632183908045977e-05, + "loss": 0.4047, + "step": 332 + }, + { + "epoch": 0.479136690647482, + "grad_norm": 0.23279241923894028, + "learning_rate": 7.655172413793103e-05, + "loss": 0.3964, + "step": 333 + }, + { + "epoch": 0.4805755395683453, + "grad_norm": 0.29081713635668505, + "learning_rate": 7.67816091954023e-05, + "loss": 0.4087, + "step": 334 + }, + { + "epoch": 0.48201438848920863, + "grad_norm": 0.21801609645937103, + "learning_rate": 7.701149425287357e-05, + "loss": 0.4096, + "step": 335 + }, + { + "epoch": 0.48345323741007196, + "grad_norm": 0.2100066170024453, + "learning_rate": 7.724137931034484e-05, + "loss": 0.4019, + "step": 336 + }, + { + "epoch": 0.4848920863309353, + "grad_norm": 0.21436855692470322, + "learning_rate": 7.74712643678161e-05, + "loss": 0.403, + "step": 337 + }, + { + "epoch": 0.48633093525179855, + "grad_norm": 0.20241965859420183, + "learning_rate": 7.770114942528736e-05, + "loss": 0.4069, + "step": 338 + }, + { + "epoch": 0.48776978417266187, + "grad_norm": 0.19119778436069676, + "learning_rate": 7.793103448275863e-05, + "loss": 0.4143, + "step": 339 + }, + { + "epoch": 0.4892086330935252, + "grad_norm": 0.20104293091307832, + "learning_rate": 7.81609195402299e-05, + "loss": 0.4015, + "step": 340 + }, + { + "epoch": 0.4906474820143885, + "grad_norm": 0.20525291633468387, + "learning_rate": 7.839080459770115e-05, + "loss": 0.4007, + "step": 341 + }, + { + "epoch": 0.4920863309352518, + "grad_norm": 0.3065758658427838, + "learning_rate": 7.862068965517242e-05, + "loss": 0.3938, + "step": 342 + }, + { + "epoch": 0.4935251798561151, + "grad_norm": 0.42866554620728164, + "learning_rate": 7.885057471264368e-05, + "loss": 0.4059, + "step": 343 + }, + { + "epoch": 0.4949640287769784, + "grad_norm": 0.5089904520435118, + "learning_rate": 7.908045977011495e-05, + "loss": 0.401, + "step": 344 + }, + { + "epoch": 0.49640287769784175, + "grad_norm": 0.6058763556754064, + "learning_rate": 7.931034482758621e-05, + "loss": 0.4002, + "step": 345 + }, + { + "epoch": 0.497841726618705, + "grad_norm": 0.7046159485332273, + "learning_rate": 7.954022988505747e-05, + "loss": 0.4133, + "step": 346 + }, + { + "epoch": 0.49928057553956834, + "grad_norm": 0.7368240450982135, + "learning_rate": 7.977011494252874e-05, + "loss": 0.4102, + "step": 347 + }, + { + "epoch": 0.5007194244604316, + "grad_norm": 0.5207562288254234, + "learning_rate": 8e-05, + "loss": 0.4091, + "step": 348 + }, + { + "epoch": 0.5021582733812949, + "grad_norm": 0.29358142633665374, + "learning_rate": 7.999997981289966e-05, + "loss": 0.3977, + "step": 349 + }, + { + "epoch": 0.5035971223021583, + "grad_norm": 0.606870412872592, + "learning_rate": 7.999991925161896e-05, + "loss": 0.4094, + "step": 350 + }, + { + "epoch": 0.5050359712230216, + "grad_norm": 0.5733670548773588, + "learning_rate": 7.999981831621906e-05, + "loss": 0.4057, + "step": 351 + }, + { + "epoch": 0.5064748201438849, + "grad_norm": 0.2769785062462958, + "learning_rate": 7.999967700680183e-05, + "loss": 0.4063, + "step": 352 + }, + { + "epoch": 0.5079136690647482, + "grad_norm": 0.6086587925627267, + "learning_rate": 7.99994953235099e-05, + "loss": 0.4029, + "step": 353 + }, + { + "epoch": 0.5093525179856115, + "grad_norm": 0.5923654056002113, + "learning_rate": 7.999927326652667e-05, + "loss": 0.4108, + "step": 354 + }, + { + "epoch": 0.5107913669064749, + "grad_norm": 0.39072319098055625, + "learning_rate": 7.999901083607624e-05, + "loss": 0.4049, + "step": 355 + }, + { + "epoch": 0.5122302158273381, + "grad_norm": 0.49546215109939884, + "learning_rate": 7.99987080324235e-05, + "loss": 0.4052, + "step": 356 + }, + { + "epoch": 0.5136690647482014, + "grad_norm": 0.4142270363441574, + "learning_rate": 7.999836485587415e-05, + "loss": 0.4115, + "step": 357 + }, + { + "epoch": 0.5151079136690647, + "grad_norm": 0.34303257732768067, + "learning_rate": 7.99979813067745e-05, + "loss": 0.4105, + "step": 358 + }, + { + "epoch": 0.516546762589928, + "grad_norm": 0.37193739236992807, + "learning_rate": 7.999755738551171e-05, + "loss": 0.4003, + "step": 359 + }, + { + "epoch": 0.5179856115107914, + "grad_norm": 0.3618689619234623, + "learning_rate": 7.999709309251368e-05, + "loss": 0.4038, + "step": 360 + }, + { + "epoch": 0.5194244604316547, + "grad_norm": 0.3527534652754947, + "learning_rate": 7.999658842824904e-05, + "loss": 0.3951, + "step": 361 + }, + { + "epoch": 0.520863309352518, + "grad_norm": 0.31037934934255207, + "learning_rate": 7.999604339322717e-05, + "loss": 0.4049, + "step": 362 + }, + { + "epoch": 0.5223021582733813, + "grad_norm": 0.2960883056882076, + "learning_rate": 7.999545798799823e-05, + "loss": 0.4079, + "step": 363 + }, + { + "epoch": 0.5237410071942447, + "grad_norm": 0.3253854973145754, + "learning_rate": 7.999483221315307e-05, + "loss": 0.3932, + "step": 364 + }, + { + "epoch": 0.5251798561151079, + "grad_norm": 0.2638110731347279, + "learning_rate": 7.999416606932331e-05, + "loss": 0.4076, + "step": 365 + }, + { + "epoch": 0.5266187050359712, + "grad_norm": 0.29904682553238354, + "learning_rate": 7.999345955718136e-05, + "loss": 0.3919, + "step": 366 + }, + { + "epoch": 0.5280575539568345, + "grad_norm": 0.35910718776043077, + "learning_rate": 7.999271267744033e-05, + "loss": 0.4028, + "step": 367 + }, + { + "epoch": 0.5294964028776978, + "grad_norm": 0.2693411427809221, + "learning_rate": 7.999192543085407e-05, + "loss": 0.4036, + "step": 368 + }, + { + "epoch": 0.5309352517985612, + "grad_norm": 0.3550220152156702, + "learning_rate": 7.999109781821722e-05, + "loss": 0.4079, + "step": 369 + }, + { + "epoch": 0.5323741007194245, + "grad_norm": 0.3160549144611672, + "learning_rate": 7.999022984036512e-05, + "loss": 0.4111, + "step": 370 + }, + { + "epoch": 0.5338129496402878, + "grad_norm": 0.2066309224309784, + "learning_rate": 7.998932149817386e-05, + "loss": 0.3995, + "step": 371 + }, + { + "epoch": 0.5352517985611511, + "grad_norm": 0.2919697687299495, + "learning_rate": 7.998837279256028e-05, + "loss": 0.4004, + "step": 372 + }, + { + "epoch": 0.5366906474820143, + "grad_norm": 0.268649195427655, + "learning_rate": 7.998738372448196e-05, + "loss": 0.4066, + "step": 373 + }, + { + "epoch": 0.5381294964028777, + "grad_norm": 0.26476642665463757, + "learning_rate": 7.998635429493726e-05, + "loss": 0.3949, + "step": 374 + }, + { + "epoch": 0.539568345323741, + "grad_norm": 0.36794156033362396, + "learning_rate": 7.998528450496519e-05, + "loss": 0.3976, + "step": 375 + }, + { + "epoch": 0.5410071942446043, + "grad_norm": 0.40919097296904383, + "learning_rate": 7.998417435564557e-05, + "loss": 0.407, + "step": 376 + }, + { + "epoch": 0.5424460431654676, + "grad_norm": 0.4526556851986713, + "learning_rate": 7.998302384809893e-05, + "loss": 0.4076, + "step": 377 + }, + { + "epoch": 0.543884892086331, + "grad_norm": 0.4701472448170464, + "learning_rate": 7.998183298348654e-05, + "loss": 0.4046, + "step": 378 + }, + { + "epoch": 0.5453237410071943, + "grad_norm": 0.44907946268323634, + "learning_rate": 7.998060176301041e-05, + "loss": 0.4121, + "step": 379 + }, + { + "epoch": 0.5467625899280576, + "grad_norm": 0.26057683513153385, + "learning_rate": 7.997933018791327e-05, + "loss": 0.3997, + "step": 380 + }, + { + "epoch": 0.5482014388489208, + "grad_norm": 0.2229074658092342, + "learning_rate": 7.99780182594786e-05, + "loss": 0.4018, + "step": 381 + }, + { + "epoch": 0.5496402877697841, + "grad_norm": 0.31888109362416694, + "learning_rate": 7.99766659790306e-05, + "loss": 0.3984, + "step": 382 + }, + { + "epoch": 0.5510791366906475, + "grad_norm": 0.3334807967394207, + "learning_rate": 7.997527334793419e-05, + "loss": 0.4038, + "step": 383 + }, + { + "epoch": 0.5525179856115108, + "grad_norm": 0.20443061990304592, + "learning_rate": 7.997384036759505e-05, + "loss": 0.4077, + "step": 384 + }, + { + "epoch": 0.5539568345323741, + "grad_norm": 0.2983478446351588, + "learning_rate": 7.997236703945955e-05, + "loss": 0.3985, + "step": 385 + }, + { + "epoch": 0.5553956834532374, + "grad_norm": 0.3399128576179451, + "learning_rate": 7.99708533650148e-05, + "loss": 0.4133, + "step": 386 + }, + { + "epoch": 0.5568345323741007, + "grad_norm": 0.24599737198322885, + "learning_rate": 7.996929934578864e-05, + "loss": 0.3985, + "step": 387 + }, + { + "epoch": 0.5582733812949641, + "grad_norm": 0.25455223818502287, + "learning_rate": 7.996770498334963e-05, + "loss": 0.4051, + "step": 388 + }, + { + "epoch": 0.5597122302158274, + "grad_norm": 0.2876531237701059, + "learning_rate": 7.996607027930705e-05, + "loss": 0.4056, + "step": 389 + }, + { + "epoch": 0.5611510791366906, + "grad_norm": 0.23219710717556477, + "learning_rate": 7.996439523531088e-05, + "loss": 0.4052, + "step": 390 + }, + { + "epoch": 0.5625899280575539, + "grad_norm": 0.29661563286854414, + "learning_rate": 7.996267985305186e-05, + "loss": 0.4071, + "step": 391 + }, + { + "epoch": 0.5640287769784172, + "grad_norm": 0.3381873329962419, + "learning_rate": 7.99609241342614e-05, + "loss": 0.4035, + "step": 392 + }, + { + "epoch": 0.5654676258992806, + "grad_norm": 0.3183331177671327, + "learning_rate": 7.995912808071164e-05, + "loss": 0.3979, + "step": 393 + }, + { + "epoch": 0.5669064748201439, + "grad_norm": 0.362667614282157, + "learning_rate": 7.995729169421545e-05, + "loss": 0.4064, + "step": 394 + }, + { + "epoch": 0.5683453237410072, + "grad_norm": 0.36848162295344455, + "learning_rate": 7.99554149766264e-05, + "loss": 0.4067, + "step": 395 + }, + { + "epoch": 0.5697841726618705, + "grad_norm": 0.2309864523681632, + "learning_rate": 7.995349792983874e-05, + "loss": 0.3936, + "step": 396 + }, + { + "epoch": 0.5712230215827339, + "grad_norm": 0.18573579497904213, + "learning_rate": 7.995154055578748e-05, + "loss": 0.4016, + "step": 397 + }, + { + "epoch": 0.5726618705035971, + "grad_norm": 0.22848869369322164, + "learning_rate": 7.994954285644827e-05, + "loss": 0.4022, + "step": 398 + }, + { + "epoch": 0.5741007194244604, + "grad_norm": 1.242889611634213, + "learning_rate": 7.994750483383753e-05, + "loss": 0.4074, + "step": 399 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.15040364980189874, + "learning_rate": 7.994542649001235e-05, + "loss": 0.4005, + "step": 400 + }, + { + "epoch": 0.576978417266187, + "grad_norm": 0.238032913741372, + "learning_rate": 7.994330782707048e-05, + "loss": 0.4092, + "step": 401 + }, + { + "epoch": 0.5784172661870504, + "grad_norm": 0.23976492117285375, + "learning_rate": 7.994114884715045e-05, + "loss": 0.3969, + "step": 402 + }, + { + "epoch": 0.5798561151079137, + "grad_norm": 0.27796861633714126, + "learning_rate": 7.99389495524314e-05, + "loss": 0.4069, + "step": 403 + }, + { + "epoch": 0.581294964028777, + "grad_norm": 0.3870100203847514, + "learning_rate": 7.993670994513321e-05, + "loss": 0.4021, + "step": 404 + }, + { + "epoch": 0.5827338129496403, + "grad_norm": 0.6297558741722189, + "learning_rate": 7.993443002751646e-05, + "loss": 0.4045, + "step": 405 + }, + { + "epoch": 0.5841726618705037, + "grad_norm": 0.7519493513775716, + "learning_rate": 7.993210980188236e-05, + "loss": 0.4199, + "step": 406 + }, + { + "epoch": 0.5856115107913669, + "grad_norm": 0.5848170414817211, + "learning_rate": 7.992974927057287e-05, + "loss": 0.4027, + "step": 407 + }, + { + "epoch": 0.5870503597122302, + "grad_norm": 0.4741533309366105, + "learning_rate": 7.992734843597058e-05, + "loss": 0.397, + "step": 408 + }, + { + "epoch": 0.5884892086330935, + "grad_norm": 0.525392793642585, + "learning_rate": 7.992490730049881e-05, + "loss": 0.4048, + "step": 409 + }, + { + "epoch": 0.5899280575539568, + "grad_norm": 0.43840628431903, + "learning_rate": 7.992242586662152e-05, + "loss": 0.401, + "step": 410 + }, + { + "epoch": 0.5913669064748202, + "grad_norm": 0.44899208218013437, + "learning_rate": 7.991990413684336e-05, + "loss": 0.4102, + "step": 411 + }, + { + "epoch": 0.5928057553956835, + "grad_norm": 0.418529631501033, + "learning_rate": 7.991734211370965e-05, + "loss": 0.411, + "step": 412 + }, + { + "epoch": 0.5942446043165468, + "grad_norm": 0.4023009688954088, + "learning_rate": 7.991473979980637e-05, + "loss": 0.406, + "step": 413 + }, + { + "epoch": 0.5956834532374101, + "grad_norm": 0.3340069426859754, + "learning_rate": 7.99120971977602e-05, + "loss": 0.406, + "step": 414 + }, + { + "epoch": 0.5971223021582733, + "grad_norm": 0.3122487477412848, + "learning_rate": 7.990941431023844e-05, + "loss": 0.4064, + "step": 415 + }, + { + "epoch": 0.5985611510791367, + "grad_norm": 0.35328758358148876, + "learning_rate": 7.990669113994911e-05, + "loss": 0.3945, + "step": 416 + }, + { + "epoch": 0.6, + "grad_norm": 0.22445809971932562, + "learning_rate": 7.99039276896408e-05, + "loss": 0.3978, + "step": 417 + }, + { + "epoch": 0.6014388489208633, + "grad_norm": 0.3117911528173465, + "learning_rate": 7.990112396210288e-05, + "loss": 0.4079, + "step": 418 + }, + { + "epoch": 0.6028776978417266, + "grad_norm": 0.27110674043698635, + "learning_rate": 7.989827996016525e-05, + "loss": 0.3978, + "step": 419 + }, + { + "epoch": 0.60431654676259, + "grad_norm": 0.28342593104905844, + "learning_rate": 7.989539568669856e-05, + "loss": 0.3945, + "step": 420 + }, + { + "epoch": 0.6057553956834533, + "grad_norm": 0.2335891065020172, + "learning_rate": 7.989247114461403e-05, + "loss": 0.4012, + "step": 421 + }, + { + "epoch": 0.6071942446043166, + "grad_norm": 0.34132845531397416, + "learning_rate": 7.988950633686358e-05, + "loss": 0.4059, + "step": 422 + }, + { + "epoch": 0.6086330935251798, + "grad_norm": 0.3793753760188709, + "learning_rate": 7.988650126643976e-05, + "loss": 0.4083, + "step": 423 + }, + { + "epoch": 0.6100719424460431, + "grad_norm": 0.24033707461795106, + "learning_rate": 7.988345593637572e-05, + "loss": 0.3927, + "step": 424 + }, + { + "epoch": 0.6115107913669064, + "grad_norm": 0.26549905398007445, + "learning_rate": 7.988037034974532e-05, + "loss": 0.3936, + "step": 425 + }, + { + "epoch": 0.6129496402877698, + "grad_norm": 0.30599802789608876, + "learning_rate": 7.9877244509663e-05, + "loss": 0.399, + "step": 426 + }, + { + "epoch": 0.6143884892086331, + "grad_norm": 0.250416843178887, + "learning_rate": 7.987407841928384e-05, + "loss": 0.4016, + "step": 427 + }, + { + "epoch": 0.6158273381294964, + "grad_norm": 0.3261688992689598, + "learning_rate": 7.987087208180355e-05, + "loss": 0.4034, + "step": 428 + }, + { + "epoch": 0.6172661870503597, + "grad_norm": 0.2401440595172318, + "learning_rate": 7.986762550045844e-05, + "loss": 0.3891, + "step": 429 + }, + { + "epoch": 0.6187050359712231, + "grad_norm": 0.2743844101931455, + "learning_rate": 7.98643386785255e-05, + "loss": 0.3981, + "step": 430 + }, + { + "epoch": 0.6201438848920864, + "grad_norm": 0.2758490427203629, + "learning_rate": 7.986101161932227e-05, + "loss": 0.399, + "step": 431 + }, + { + "epoch": 0.6215827338129496, + "grad_norm": 0.24764514548640124, + "learning_rate": 7.985764432620695e-05, + "loss": 0.3958, + "step": 432 + }, + { + "epoch": 0.6230215827338129, + "grad_norm": 0.31498435577719125, + "learning_rate": 7.985423680257833e-05, + "loss": 0.4056, + "step": 433 + }, + { + "epoch": 0.6244604316546762, + "grad_norm": 0.3071596705693149, + "learning_rate": 7.985078905187582e-05, + "loss": 0.3932, + "step": 434 + }, + { + "epoch": 0.6258992805755396, + "grad_norm": 0.23562150322180472, + "learning_rate": 7.984730107757942e-05, + "loss": 0.391, + "step": 435 + }, + { + "epoch": 0.6273381294964029, + "grad_norm": 0.2106502372853218, + "learning_rate": 7.984377288320973e-05, + "loss": 0.3866, + "step": 436 + }, + { + "epoch": 0.6287769784172662, + "grad_norm": 0.18180087469996958, + "learning_rate": 7.984020447232795e-05, + "loss": 0.3948, + "step": 437 + }, + { + "epoch": 0.6302158273381295, + "grad_norm": 0.1697981029409121, + "learning_rate": 7.983659584853586e-05, + "loss": 0.402, + "step": 438 + }, + { + "epoch": 0.6316546762589929, + "grad_norm": 0.1599496406363699, + "learning_rate": 7.983294701547588e-05, + "loss": 0.3961, + "step": 439 + }, + { + "epoch": 0.6330935251798561, + "grad_norm": 0.14858876445054642, + "learning_rate": 7.982925797683095e-05, + "loss": 0.398, + "step": 440 + }, + { + "epoch": 0.6345323741007194, + "grad_norm": 0.15847796647568324, + "learning_rate": 7.982552873632461e-05, + "loss": 0.4042, + "step": 441 + }, + { + "epoch": 0.6359712230215827, + "grad_norm": 0.19809172933544417, + "learning_rate": 7.982175929772102e-05, + "loss": 0.3864, + "step": 442 + }, + { + "epoch": 0.637410071942446, + "grad_norm": 0.1811174238613106, + "learning_rate": 7.981794966482486e-05, + "loss": 0.3855, + "step": 443 + }, + { + "epoch": 0.6388489208633094, + "grad_norm": 0.1675328143083596, + "learning_rate": 7.98140998414814e-05, + "loss": 0.3971, + "step": 444 + }, + { + "epoch": 0.6402877697841727, + "grad_norm": 0.20034503405113696, + "learning_rate": 7.98102098315765e-05, + "loss": 0.399, + "step": 445 + }, + { + "epoch": 0.641726618705036, + "grad_norm": 0.211833894086791, + "learning_rate": 7.980627963903654e-05, + "loss": 0.3908, + "step": 446 + }, + { + "epoch": 0.6431654676258993, + "grad_norm": 0.23408497340134393, + "learning_rate": 7.980230926782848e-05, + "loss": 0.4003, + "step": 447 + }, + { + "epoch": 0.6446043165467625, + "grad_norm": 0.23077143132838263, + "learning_rate": 7.979829872195984e-05, + "loss": 0.3962, + "step": 448 + }, + { + "epoch": 0.6460431654676259, + "grad_norm": 0.2834731638435603, + "learning_rate": 7.979424800547869e-05, + "loss": 0.3995, + "step": 449 + }, + { + "epoch": 0.6474820143884892, + "grad_norm": 0.36818575331884823, + "learning_rate": 7.979015712247365e-05, + "loss": 0.4002, + "step": 450 + }, + { + "epoch": 0.6489208633093525, + "grad_norm": 0.44203983146087344, + "learning_rate": 7.978602607707383e-05, + "loss": 0.3948, + "step": 451 + }, + { + "epoch": 0.6503597122302158, + "grad_norm": 0.442645682673057, + "learning_rate": 7.978185487344897e-05, + "loss": 0.4012, + "step": 452 + }, + { + "epoch": 0.6517985611510791, + "grad_norm": 0.43444069228623855, + "learning_rate": 7.977764351580928e-05, + "loss": 0.4081, + "step": 453 + }, + { + "epoch": 0.6532374100719425, + "grad_norm": 0.39467412539574687, + "learning_rate": 7.97733920084055e-05, + "loss": 0.4003, + "step": 454 + }, + { + "epoch": 0.6546762589928058, + "grad_norm": 0.38297176846303266, + "learning_rate": 7.976910035552892e-05, + "loss": 0.393, + "step": 455 + }, + { + "epoch": 0.6561151079136691, + "grad_norm": 0.3990027114029954, + "learning_rate": 7.976476856151134e-05, + "loss": 0.3967, + "step": 456 + }, + { + "epoch": 0.6575539568345323, + "grad_norm": 0.38837742399412434, + "learning_rate": 7.976039663072509e-05, + "loss": 0.3908, + "step": 457 + }, + { + "epoch": 0.6589928057553956, + "grad_norm": 0.290277744761257, + "learning_rate": 7.975598456758298e-05, + "loss": 0.4067, + "step": 458 + }, + { + "epoch": 0.660431654676259, + "grad_norm": 0.21844368211822543, + "learning_rate": 7.975153237653836e-05, + "loss": 0.3988, + "step": 459 + }, + { + "epoch": 0.6618705035971223, + "grad_norm": 0.3289265056281082, + "learning_rate": 7.974704006208509e-05, + "loss": 0.3981, + "step": 460 + }, + { + "epoch": 0.6633093525179856, + "grad_norm": 0.42331452138501785, + "learning_rate": 7.974250762875747e-05, + "loss": 0.4028, + "step": 461 + }, + { + "epoch": 0.6647482014388489, + "grad_norm": 0.3580448522286446, + "learning_rate": 7.973793508113035e-05, + "loss": 0.4046, + "step": 462 + }, + { + "epoch": 0.6661870503597123, + "grad_norm": 0.168092987350451, + "learning_rate": 7.973332242381908e-05, + "loss": 0.3967, + "step": 463 + }, + { + "epoch": 0.6676258992805756, + "grad_norm": 0.24235827768404627, + "learning_rate": 7.972866966147942e-05, + "loss": 0.4027, + "step": 464 + }, + { + "epoch": 0.6690647482014388, + "grad_norm": 0.32639102896298916, + "learning_rate": 7.972397679880771e-05, + "loss": 0.4019, + "step": 465 + }, + { + "epoch": 0.6705035971223021, + "grad_norm": 0.300427946195229, + "learning_rate": 7.971924384054068e-05, + "loss": 0.391, + "step": 466 + }, + { + "epoch": 0.6719424460431654, + "grad_norm": 0.2047814273496808, + "learning_rate": 7.971447079145557e-05, + "loss": 0.3924, + "step": 467 + }, + { + "epoch": 0.6733812949640288, + "grad_norm": 0.19410806738189568, + "learning_rate": 7.970965765637011e-05, + "loss": 0.3905, + "step": 468 + }, + { + "epoch": 0.6748201438848921, + "grad_norm": 0.2461909540823566, + "learning_rate": 7.970480444014244e-05, + "loss": 0.3947, + "step": 469 + }, + { + "epoch": 0.6762589928057554, + "grad_norm": 0.23520468572683895, + "learning_rate": 7.969991114767114e-05, + "loss": 0.4047, + "step": 470 + }, + { + "epoch": 0.6776978417266187, + "grad_norm": 0.2288733575688635, + "learning_rate": 7.969497778389534e-05, + "loss": 0.3931, + "step": 471 + }, + { + "epoch": 0.679136690647482, + "grad_norm": 0.22993980558386445, + "learning_rate": 7.969000435379454e-05, + "loss": 0.3947, + "step": 472 + }, + { + "epoch": 0.6805755395683454, + "grad_norm": 0.23754711246173846, + "learning_rate": 7.968499086238867e-05, + "loss": 0.4004, + "step": 473 + }, + { + "epoch": 0.6820143884892086, + "grad_norm": 0.20366152685811229, + "learning_rate": 7.967993731473815e-05, + "loss": 0.396, + "step": 474 + }, + { + "epoch": 0.6834532374100719, + "grad_norm": 0.17920563839246975, + "learning_rate": 7.96748437159438e-05, + "loss": 0.3993, + "step": 475 + }, + { + "epoch": 0.6848920863309352, + "grad_norm": 0.18952993481603614, + "learning_rate": 7.966971007114686e-05, + "loss": 0.3928, + "step": 476 + }, + { + "epoch": 0.6863309352517986, + "grad_norm": 0.2172472942767943, + "learning_rate": 7.966453638552901e-05, + "loss": 0.3942, + "step": 477 + }, + { + "epoch": 0.6877697841726619, + "grad_norm": 0.2098368899511751, + "learning_rate": 7.965932266431232e-05, + "loss": 0.3933, + "step": 478 + }, + { + "epoch": 0.6892086330935252, + "grad_norm": 0.21759755000537567, + "learning_rate": 7.96540689127593e-05, + "loss": 0.3938, + "step": 479 + }, + { + "epoch": 0.6906474820143885, + "grad_norm": 0.27684281206502115, + "learning_rate": 7.964877513617285e-05, + "loss": 0.3908, + "step": 480 + }, + { + "epoch": 0.6920863309352518, + "grad_norm": 0.3955996614331896, + "learning_rate": 7.964344133989627e-05, + "loss": 0.3933, + "step": 481 + }, + { + "epoch": 0.6935251798561151, + "grad_norm": 0.3936258614387365, + "learning_rate": 7.963806752931324e-05, + "loss": 0.3998, + "step": 482 + }, + { + "epoch": 0.6949640287769784, + "grad_norm": 0.2970999309051372, + "learning_rate": 7.963265370984786e-05, + "loss": 0.3994, + "step": 483 + }, + { + "epoch": 0.6964028776978417, + "grad_norm": 0.3279006569452077, + "learning_rate": 7.962719988696458e-05, + "loss": 0.3965, + "step": 484 + }, + { + "epoch": 0.697841726618705, + "grad_norm": 0.40135146596828136, + "learning_rate": 7.962170606616826e-05, + "loss": 0.3986, + "step": 485 + }, + { + "epoch": 0.6992805755395683, + "grad_norm": 0.41780843703588655, + "learning_rate": 7.96161722530041e-05, + "loss": 0.3965, + "step": 486 + }, + { + "epoch": 0.7007194244604317, + "grad_norm": 0.4053211081313924, + "learning_rate": 7.96105984530577e-05, + "loss": 0.3774, + "step": 487 + }, + { + "epoch": 0.702158273381295, + "grad_norm": 0.4103765238612219, + "learning_rate": 7.9604984671955e-05, + "loss": 0.3866, + "step": 488 + }, + { + "epoch": 0.7035971223021583, + "grad_norm": 0.34964703097638195, + "learning_rate": 7.959933091536227e-05, + "loss": 0.3958, + "step": 489 + }, + { + "epoch": 0.7050359712230215, + "grad_norm": 0.25846513601369847, + "learning_rate": 7.95936371889862e-05, + "loss": 0.3938, + "step": 490 + }, + { + "epoch": 0.7064748201438849, + "grad_norm": 0.274682346218589, + "learning_rate": 7.958790349857375e-05, + "loss": 0.3891, + "step": 491 + }, + { + "epoch": 0.7079136690647482, + "grad_norm": 0.3113217352383488, + "learning_rate": 7.958212984991226e-05, + "loss": 0.3974, + "step": 492 + }, + { + "epoch": 0.7093525179856115, + "grad_norm": 0.324416073045715, + "learning_rate": 7.957631624882938e-05, + "loss": 0.3978, + "step": 493 + }, + { + "epoch": 0.7107913669064748, + "grad_norm": 0.2591042835794199, + "learning_rate": 7.957046270119313e-05, + "loss": 0.3979, + "step": 494 + }, + { + "epoch": 0.7122302158273381, + "grad_norm": 0.2393859657662622, + "learning_rate": 7.956456921291178e-05, + "loss": 0.3952, + "step": 495 + }, + { + "epoch": 0.7136690647482015, + "grad_norm": 0.3290105193793378, + "learning_rate": 7.955863578993396e-05, + "loss": 0.3982, + "step": 496 + }, + { + "epoch": 0.7151079136690648, + "grad_norm": 0.29739496982271724, + "learning_rate": 7.955266243824864e-05, + "loss": 0.3905, + "step": 497 + }, + { + "epoch": 0.7165467625899281, + "grad_norm": 0.20314300140842406, + "learning_rate": 7.954664916388499e-05, + "loss": 0.3966, + "step": 498 + }, + { + "epoch": 0.7179856115107913, + "grad_norm": 0.28034465884458026, + "learning_rate": 7.954059597291257e-05, + "loss": 0.3966, + "step": 499 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.2959049118620304, + "learning_rate": 7.953450287144121e-05, + "loss": 0.4031, + "step": 500 + }, + { + "epoch": 0.720863309352518, + "grad_norm": 0.2486789124385139, + "learning_rate": 7.952836986562099e-05, + "loss": 0.3921, + "step": 501 + }, + { + "epoch": 0.7223021582733813, + "grad_norm": 0.27972167275170323, + "learning_rate": 7.952219696164231e-05, + "loss": 0.3942, + "step": 502 + }, + { + "epoch": 0.7237410071942446, + "grad_norm": 0.24393341332660898, + "learning_rate": 7.95159841657358e-05, + "loss": 0.3899, + "step": 503 + }, + { + "epoch": 0.7251798561151079, + "grad_norm": 0.21560853713804515, + "learning_rate": 7.950973148417239e-05, + "loss": 0.4051, + "step": 504 + }, + { + "epoch": 0.7266187050359713, + "grad_norm": 0.24353818532928312, + "learning_rate": 7.950343892326327e-05, + "loss": 0.3992, + "step": 505 + }, + { + "epoch": 0.7280575539568346, + "grad_norm": 0.25016763270320097, + "learning_rate": 7.949710648935984e-05, + "loss": 0.3995, + "step": 506 + }, + { + "epoch": 0.7294964028776978, + "grad_norm": 0.2162108624913337, + "learning_rate": 7.949073418885378e-05, + "loss": 0.3966, + "step": 507 + }, + { + "epoch": 0.7309352517985611, + "grad_norm": 0.24692064398043742, + "learning_rate": 7.948432202817703e-05, + "loss": 0.3976, + "step": 508 + }, + { + "epoch": 0.7323741007194244, + "grad_norm": 0.29102943123049935, + "learning_rate": 7.94778700138017e-05, + "loss": 0.3899, + "step": 509 + }, + { + "epoch": 0.7338129496402878, + "grad_norm": 0.26026846085626515, + "learning_rate": 7.947137815224018e-05, + "loss": 0.3937, + "step": 510 + }, + { + "epoch": 0.7352517985611511, + "grad_norm": 0.277160471430053, + "learning_rate": 7.946484645004508e-05, + "loss": 0.3933, + "step": 511 + }, + { + "epoch": 0.7366906474820144, + "grad_norm": 0.3580864764505278, + "learning_rate": 7.945827491380916e-05, + "loss": 0.3924, + "step": 512 + }, + { + "epoch": 0.7381294964028777, + "grad_norm": 0.4100211305198917, + "learning_rate": 7.945166355016548e-05, + "loss": 0.3921, + "step": 513 + }, + { + "epoch": 0.739568345323741, + "grad_norm": 0.3752077589402531, + "learning_rate": 7.944501236578722e-05, + "loss": 0.3834, + "step": 514 + }, + { + "epoch": 0.7410071942446043, + "grad_norm": 0.35021889724147953, + "learning_rate": 7.943832136738783e-05, + "loss": 0.3894, + "step": 515 + }, + { + "epoch": 0.7424460431654676, + "grad_norm": 0.3295101949914119, + "learning_rate": 7.943159056172084e-05, + "loss": 0.3987, + "step": 516 + }, + { + "epoch": 0.7438848920863309, + "grad_norm": 0.29331915864233477, + "learning_rate": 7.942481995558007e-05, + "loss": 0.3867, + "step": 517 + }, + { + "epoch": 0.7453237410071942, + "grad_norm": 0.2853388653685031, + "learning_rate": 7.941800955579946e-05, + "loss": 0.3891, + "step": 518 + }, + { + "epoch": 0.7467625899280576, + "grad_norm": 0.2706330302215832, + "learning_rate": 7.941115936925311e-05, + "loss": 0.3929, + "step": 519 + }, + { + "epoch": 0.7482014388489209, + "grad_norm": 0.23279517043371323, + "learning_rate": 7.940426940285529e-05, + "loss": 0.3913, + "step": 520 + }, + { + "epoch": 0.7496402877697842, + "grad_norm": 0.2220900334502715, + "learning_rate": 7.939733966356042e-05, + "loss": 0.39, + "step": 521 + }, + { + "epoch": 0.7510791366906475, + "grad_norm": 0.23271376043917275, + "learning_rate": 7.939037015836308e-05, + "loss": 0.3931, + "step": 522 + }, + { + "epoch": 0.7525179856115108, + "grad_norm": 0.22106957223746632, + "learning_rate": 7.938336089429796e-05, + "loss": 0.3891, + "step": 523 + }, + { + "epoch": 0.753956834532374, + "grad_norm": 0.24197907887885128, + "learning_rate": 7.937631187843991e-05, + "loss": 0.3902, + "step": 524 + }, + { + "epoch": 0.7553956834532374, + "grad_norm": 0.22673980441736877, + "learning_rate": 7.936922311790388e-05, + "loss": 0.3973, + "step": 525 + }, + { + "epoch": 0.7568345323741007, + "grad_norm": 0.1573949178411432, + "learning_rate": 7.936209461984495e-05, + "loss": 0.3931, + "step": 526 + }, + { + "epoch": 0.758273381294964, + "grad_norm": 0.16989105140678423, + "learning_rate": 7.935492639145831e-05, + "loss": 0.3945, + "step": 527 + }, + { + "epoch": 0.7597122302158273, + "grad_norm": 0.16198079458205, + "learning_rate": 7.934771843997922e-05, + "loss": 0.3938, + "step": 528 + }, + { + "epoch": 0.7611510791366907, + "grad_norm": 0.14474801638735643, + "learning_rate": 7.934047077268311e-05, + "loss": 0.3932, + "step": 529 + }, + { + "epoch": 0.762589928057554, + "grad_norm": 0.19071898012891114, + "learning_rate": 7.93331833968854e-05, + "loss": 0.3963, + "step": 530 + }, + { + "epoch": 0.7640287769784173, + "grad_norm": 0.23415263060538, + "learning_rate": 7.932585631994168e-05, + "loss": 0.3995, + "step": 531 + }, + { + "epoch": 0.7654676258992805, + "grad_norm": 0.25461113306253436, + "learning_rate": 7.931848954924754e-05, + "loss": 0.3877, + "step": 532 + }, + { + "epoch": 0.7669064748201438, + "grad_norm": 0.27845571117851337, + "learning_rate": 7.931108309223868e-05, + "loss": 0.3955, + "step": 533 + }, + { + "epoch": 0.7683453237410072, + "grad_norm": 0.29125838184684827, + "learning_rate": 7.930363695639085e-05, + "loss": 0.403, + "step": 534 + }, + { + "epoch": 0.7697841726618705, + "grad_norm": 0.4030485435958359, + "learning_rate": 7.929615114921984e-05, + "loss": 0.3954, + "step": 535 + }, + { + "epoch": 0.7712230215827338, + "grad_norm": 0.5501020153467006, + "learning_rate": 7.92886256782815e-05, + "loss": 0.4011, + "step": 536 + }, + { + "epoch": 0.7726618705035971, + "grad_norm": 0.5591809285898809, + "learning_rate": 7.928106055117168e-05, + "loss": 0.3956, + "step": 537 + }, + { + "epoch": 0.7741007194244605, + "grad_norm": 0.4561015670623354, + "learning_rate": 7.927345577552627e-05, + "loss": 0.3939, + "step": 538 + }, + { + "epoch": 0.7755395683453238, + "grad_norm": 0.3534835719458935, + "learning_rate": 7.926581135902122e-05, + "loss": 0.3982, + "step": 539 + }, + { + "epoch": 0.7769784172661871, + "grad_norm": 0.29817006306468613, + "learning_rate": 7.925812730937245e-05, + "loss": 0.3987, + "step": 540 + }, + { + "epoch": 0.7784172661870503, + "grad_norm": 0.33721551414512624, + "learning_rate": 7.92504036343359e-05, + "loss": 0.3925, + "step": 541 + }, + { + "epoch": 0.7798561151079136, + "grad_norm": 0.3534990722780468, + "learning_rate": 7.924264034170747e-05, + "loss": 0.4035, + "step": 542 + }, + { + "epoch": 0.781294964028777, + "grad_norm": 0.28500786280558943, + "learning_rate": 7.923483743932311e-05, + "loss": 0.3841, + "step": 543 + }, + { + "epoch": 0.7827338129496403, + "grad_norm": 0.2935353003885132, + "learning_rate": 7.922699493505871e-05, + "loss": 0.3979, + "step": 544 + }, + { + "epoch": 0.7841726618705036, + "grad_norm": 0.2824720757373356, + "learning_rate": 7.921911283683013e-05, + "loss": 0.3812, + "step": 545 + }, + { + "epoch": 0.7856115107913669, + "grad_norm": 0.21131745701377888, + "learning_rate": 7.921119115259322e-05, + "loss": 0.3889, + "step": 546 + }, + { + "epoch": 0.7870503597122303, + "grad_norm": 0.17006553976446184, + "learning_rate": 7.920322989034377e-05, + "loss": 0.4014, + "step": 547 + }, + { + "epoch": 0.7884892086330936, + "grad_norm": 0.22941668905471813, + "learning_rate": 7.919522905811752e-05, + "loss": 0.395, + "step": 548 + }, + { + "epoch": 0.7899280575539568, + "grad_norm": 0.2595782530775398, + "learning_rate": 7.918718866399012e-05, + "loss": 0.3902, + "step": 549 + }, + { + "epoch": 0.7913669064748201, + "grad_norm": 0.2241314542850331, + "learning_rate": 7.917910871607723e-05, + "loss": 0.3962, + "step": 550 + }, + { + "epoch": 0.7928057553956834, + "grad_norm": 0.19297615840666787, + "learning_rate": 7.917098922253436e-05, + "loss": 0.3922, + "step": 551 + }, + { + "epoch": 0.7942446043165468, + "grad_norm": 0.24337496404512848, + "learning_rate": 7.916283019155696e-05, + "loss": 0.3938, + "step": 552 + }, + { + "epoch": 0.7956834532374101, + "grad_norm": 0.23584358774435407, + "learning_rate": 7.915463163138041e-05, + "loss": 0.3809, + "step": 553 + }, + { + "epoch": 0.7971223021582734, + "grad_norm": 0.2005090428309887, + "learning_rate": 7.914639355027995e-05, + "loss": 0.386, + "step": 554 + }, + { + "epoch": 0.7985611510791367, + "grad_norm": 0.20358168719753014, + "learning_rate": 7.913811595657072e-05, + "loss": 0.4024, + "step": 555 + }, + { + "epoch": 0.8, + "grad_norm": 0.23516057951207506, + "learning_rate": 7.912979885860776e-05, + "loss": 0.3946, + "step": 556 + }, + { + "epoch": 0.8014388489208633, + "grad_norm": 0.2436118231209871, + "learning_rate": 7.912144226478598e-05, + "loss": 0.394, + "step": 557 + }, + { + "epoch": 0.8028776978417266, + "grad_norm": 0.3057793105311322, + "learning_rate": 7.911304618354015e-05, + "loss": 0.403, + "step": 558 + }, + { + "epoch": 0.8043165467625899, + "grad_norm": 0.4044375403682651, + "learning_rate": 7.910461062334488e-05, + "loss": 0.3963, + "step": 559 + }, + { + "epoch": 0.8057553956834532, + "grad_norm": 0.4561042808982624, + "learning_rate": 7.909613559271467e-05, + "loss": 0.3926, + "step": 560 + }, + { + "epoch": 0.8071942446043165, + "grad_norm": 0.4063174953150093, + "learning_rate": 7.908762110020382e-05, + "loss": 0.4026, + "step": 561 + }, + { + "epoch": 0.8086330935251799, + "grad_norm": 0.3189075553153762, + "learning_rate": 7.907906715440649e-05, + "loss": 0.3875, + "step": 562 + }, + { + "epoch": 0.8100719424460432, + "grad_norm": 0.3138978072985748, + "learning_rate": 7.907047376395661e-05, + "loss": 0.4015, + "step": 563 + }, + { + "epoch": 0.8115107913669065, + "grad_norm": 0.3131591493013707, + "learning_rate": 7.906184093752801e-05, + "loss": 0.3837, + "step": 564 + }, + { + "epoch": 0.8129496402877698, + "grad_norm": 0.3206070232195477, + "learning_rate": 7.905316868383425e-05, + "loss": 0.4034, + "step": 565 + }, + { + "epoch": 0.814388489208633, + "grad_norm": 0.34778976345979296, + "learning_rate": 7.904445701162872e-05, + "loss": 0.3929, + "step": 566 + }, + { + "epoch": 0.8158273381294964, + "grad_norm": 0.30227284798904663, + "learning_rate": 7.903570592970458e-05, + "loss": 0.3902, + "step": 567 + }, + { + "epoch": 0.8172661870503597, + "grad_norm": 0.19280019472036092, + "learning_rate": 7.902691544689479e-05, + "loss": 0.3823, + "step": 568 + }, + { + "epoch": 0.818705035971223, + "grad_norm": 0.2160741916005615, + "learning_rate": 7.901808557207206e-05, + "loss": 0.391, + "step": 569 + }, + { + "epoch": 0.8201438848920863, + "grad_norm": 0.2543543474640509, + "learning_rate": 7.900921631414887e-05, + "loss": 0.3964, + "step": 570 + }, + { + "epoch": 0.8215827338129497, + "grad_norm": 0.22143844825733844, + "learning_rate": 7.900030768207746e-05, + "loss": 0.3865, + "step": 571 + }, + { + "epoch": 0.823021582733813, + "grad_norm": 0.20666393285687437, + "learning_rate": 7.899135968484979e-05, + "loss": 0.378, + "step": 572 + }, + { + "epoch": 0.8244604316546763, + "grad_norm": 0.2066782394153934, + "learning_rate": 7.898237233149758e-05, + "loss": 0.3962, + "step": 573 + }, + { + "epoch": 0.8258992805755395, + "grad_norm": 0.1938863528224781, + "learning_rate": 7.897334563109225e-05, + "loss": 0.3918, + "step": 574 + }, + { + "epoch": 0.8273381294964028, + "grad_norm": 0.20336201484248936, + "learning_rate": 7.896427959274494e-05, + "loss": 0.4001, + "step": 575 + }, + { + "epoch": 0.8287769784172662, + "grad_norm": 0.2258744950644811, + "learning_rate": 7.895517422560651e-05, + "loss": 0.3871, + "step": 576 + }, + { + "epoch": 0.8302158273381295, + "grad_norm": 0.23843133875317776, + "learning_rate": 7.89460295388675e-05, + "loss": 0.3955, + "step": 577 + }, + { + "epoch": 0.8316546762589928, + "grad_norm": 0.2312577140122766, + "learning_rate": 7.893684554175817e-05, + "loss": 0.3898, + "step": 578 + }, + { + "epoch": 0.8330935251798561, + "grad_norm": 0.27908521142623455, + "learning_rate": 7.892762224354839e-05, + "loss": 0.4008, + "step": 579 + }, + { + "epoch": 0.8345323741007195, + "grad_norm": 0.2999958408864618, + "learning_rate": 7.891835965354778e-05, + "loss": 0.3965, + "step": 580 + }, + { + "epoch": 0.8359712230215828, + "grad_norm": 0.26756124634390244, + "learning_rate": 7.890905778110557e-05, + "loss": 0.3849, + "step": 581 + }, + { + "epoch": 0.837410071942446, + "grad_norm": 0.2799734483589846, + "learning_rate": 7.889971663561065e-05, + "loss": 0.4019, + "step": 582 + }, + { + "epoch": 0.8388489208633093, + "grad_norm": 0.3158878705989888, + "learning_rate": 7.889033622649155e-05, + "loss": 0.3853, + "step": 583 + }, + { + "epoch": 0.8402877697841726, + "grad_norm": 0.2982911349211642, + "learning_rate": 7.888091656321644e-05, + "loss": 0.397, + "step": 584 + }, + { + "epoch": 0.841726618705036, + "grad_norm": 0.2053738696369239, + "learning_rate": 7.88714576552931e-05, + "loss": 0.3918, + "step": 585 + }, + { + "epoch": 0.8431654676258993, + "grad_norm": 0.17663025292577625, + "learning_rate": 7.886195951226892e-05, + "loss": 0.3832, + "step": 586 + }, + { + "epoch": 0.8446043165467626, + "grad_norm": 0.2826510855902402, + "learning_rate": 7.885242214373091e-05, + "loss": 0.3979, + "step": 587 + }, + { + "epoch": 0.8460431654676259, + "grad_norm": 0.3492728858016357, + "learning_rate": 7.884284555930564e-05, + "loss": 0.3906, + "step": 588 + }, + { + "epoch": 0.8474820143884892, + "grad_norm": 0.3158540804191533, + "learning_rate": 7.883322976865932e-05, + "loss": 0.3947, + "step": 589 + }, + { + "epoch": 0.8489208633093526, + "grad_norm": 0.25409408381096943, + "learning_rate": 7.882357478149767e-05, + "loss": 0.3969, + "step": 590 + }, + { + "epoch": 0.8503597122302158, + "grad_norm": 0.2075916331813354, + "learning_rate": 7.8813880607566e-05, + "loss": 0.3953, + "step": 591 + }, + { + "epoch": 0.8517985611510791, + "grad_norm": 0.21423174653614885, + "learning_rate": 7.880414725664918e-05, + "loss": 0.3942, + "step": 592 + }, + { + "epoch": 0.8532374100719424, + "grad_norm": 0.21588649263672796, + "learning_rate": 7.879437473857161e-05, + "loss": 0.3887, + "step": 593 + }, + { + "epoch": 0.8546762589928057, + "grad_norm": 0.16259134395034616, + "learning_rate": 7.878456306319723e-05, + "loss": 0.391, + "step": 594 + }, + { + "epoch": 0.8561151079136691, + "grad_norm": 0.186899603789238, + "learning_rate": 7.877471224042952e-05, + "loss": 0.3878, + "step": 595 + }, + { + "epoch": 0.8575539568345324, + "grad_norm": 0.19396290760022455, + "learning_rate": 7.876482228021144e-05, + "loss": 0.3846, + "step": 596 + }, + { + "epoch": 0.8589928057553957, + "grad_norm": 0.22712154579665747, + "learning_rate": 7.875489319252549e-05, + "loss": 0.3892, + "step": 597 + }, + { + "epoch": 0.860431654676259, + "grad_norm": 0.24463632384626877, + "learning_rate": 7.874492498739362e-05, + "loss": 0.3932, + "step": 598 + }, + { + "epoch": 0.8618705035971223, + "grad_norm": 0.2612460691086307, + "learning_rate": 7.87349176748773e-05, + "loss": 0.3882, + "step": 599 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.2974439892363158, + "learning_rate": 7.872487126507747e-05, + "loss": 0.3925, + "step": 600 + }, + { + "epoch": 0.8647482014388489, + "grad_norm": 0.3338655060234199, + "learning_rate": 7.87147857681345e-05, + "loss": 0.4075, + "step": 601 + }, + { + "epoch": 0.8661870503597122, + "grad_norm": 0.35575445911282527, + "learning_rate": 7.870466119422826e-05, + "loss": 0.3902, + "step": 602 + }, + { + "epoch": 0.8676258992805755, + "grad_norm": 0.3368046090318334, + "learning_rate": 7.869449755357803e-05, + "loss": 0.3898, + "step": 603 + }, + { + "epoch": 0.8690647482014389, + "grad_norm": 0.2883199718243486, + "learning_rate": 7.868429485644252e-05, + "loss": 0.3919, + "step": 604 + }, + { + "epoch": 0.8705035971223022, + "grad_norm": 0.26703764250144163, + "learning_rate": 7.86740531131199e-05, + "loss": 0.3975, + "step": 605 + }, + { + "epoch": 0.8719424460431655, + "grad_norm": 0.30522825564675843, + "learning_rate": 7.866377233394771e-05, + "loss": 0.3936, + "step": 606 + }, + { + "epoch": 0.8733812949640288, + "grad_norm": 0.32195739971921056, + "learning_rate": 7.865345252930291e-05, + "loss": 0.3953, + "step": 607 + }, + { + "epoch": 0.874820143884892, + "grad_norm": 0.280171972891727, + "learning_rate": 7.864309370960184e-05, + "loss": 0.392, + "step": 608 + }, + { + "epoch": 0.8762589928057554, + "grad_norm": 0.26936379941264654, + "learning_rate": 7.863269588530023e-05, + "loss": 0.3837, + "step": 609 + }, + { + "epoch": 0.8776978417266187, + "grad_norm": 0.3070891060623235, + "learning_rate": 7.862225906689319e-05, + "loss": 0.3847, + "step": 610 + }, + { + "epoch": 0.879136690647482, + "grad_norm": 0.3199770059266285, + "learning_rate": 7.861178326491514e-05, + "loss": 0.4021, + "step": 611 + }, + { + "epoch": 0.8805755395683453, + "grad_norm": 0.29592006497124823, + "learning_rate": 7.860126848993992e-05, + "loss": 0.3941, + "step": 612 + }, + { + "epoch": 0.8820143884892087, + "grad_norm": 0.2906273468610675, + "learning_rate": 7.859071475258065e-05, + "loss": 0.3927, + "step": 613 + }, + { + "epoch": 0.883453237410072, + "grad_norm": 0.2831202573537749, + "learning_rate": 7.85801220634898e-05, + "loss": 0.3895, + "step": 614 + }, + { + "epoch": 0.8848920863309353, + "grad_norm": 0.21640349754386537, + "learning_rate": 7.856949043335917e-05, + "loss": 0.388, + "step": 615 + }, + { + "epoch": 0.8863309352517985, + "grad_norm": 0.16980883227629529, + "learning_rate": 7.855881987291983e-05, + "loss": 0.3841, + "step": 616 + }, + { + "epoch": 0.8877697841726618, + "grad_norm": 0.2206574043305558, + "learning_rate": 7.854811039294216e-05, + "loss": 0.3995, + "step": 617 + }, + { + "epoch": 0.8892086330935252, + "grad_norm": 0.22489015667648418, + "learning_rate": 7.853736200423584e-05, + "loss": 0.3839, + "step": 618 + }, + { + "epoch": 0.8906474820143885, + "grad_norm": 0.19746269430804855, + "learning_rate": 7.852657471764983e-05, + "loss": 0.4014, + "step": 619 + }, + { + "epoch": 0.8920863309352518, + "grad_norm": 0.16512005648681502, + "learning_rate": 7.851574854407228e-05, + "loss": 0.3943, + "step": 620 + }, + { + "epoch": 0.8935251798561151, + "grad_norm": 0.2058837706591543, + "learning_rate": 7.85048834944307e-05, + "loss": 0.3913, + "step": 621 + }, + { + "epoch": 0.8949640287769784, + "grad_norm": 0.2377523174843737, + "learning_rate": 7.849397957969173e-05, + "loss": 0.3867, + "step": 622 + }, + { + "epoch": 0.8964028776978418, + "grad_norm": 0.2412960219227453, + "learning_rate": 7.848303681086134e-05, + "loss": 0.3885, + "step": 623 + }, + { + "epoch": 0.897841726618705, + "grad_norm": 0.2109556511078225, + "learning_rate": 7.847205519898461e-05, + "loss": 0.3831, + "step": 624 + }, + { + "epoch": 0.8992805755395683, + "grad_norm": 0.18051444186197507, + "learning_rate": 7.846103475514595e-05, + "loss": 0.3904, + "step": 625 + }, + { + "epoch": 0.9007194244604316, + "grad_norm": 0.2324178742893254, + "learning_rate": 7.844997549046886e-05, + "loss": 0.3888, + "step": 626 + }, + { + "epoch": 0.902158273381295, + "grad_norm": 0.34544320106868226, + "learning_rate": 7.843887741611608e-05, + "loss": 0.398, + "step": 627 + }, + { + "epoch": 0.9035971223021583, + "grad_norm": 0.3834819492352676, + "learning_rate": 7.842774054328949e-05, + "loss": 0.3804, + "step": 628 + }, + { + "epoch": 0.9050359712230216, + "grad_norm": 0.37512679464727644, + "learning_rate": 7.841656488323017e-05, + "loss": 0.3942, + "step": 629 + }, + { + "epoch": 0.9064748201438849, + "grad_norm": 0.33642676880227634, + "learning_rate": 7.840535044721832e-05, + "loss": 0.3906, + "step": 630 + }, + { + "epoch": 0.9079136690647482, + "grad_norm": 0.2776328123356685, + "learning_rate": 7.839409724657327e-05, + "loss": 0.39, + "step": 631 + }, + { + "epoch": 0.9093525179856116, + "grad_norm": 0.264229711157273, + "learning_rate": 7.838280529265353e-05, + "loss": 0.3928, + "step": 632 + }, + { + "epoch": 0.9107913669064748, + "grad_norm": 0.22856035604344255, + "learning_rate": 7.837147459685666e-05, + "loss": 0.38, + "step": 633 + }, + { + "epoch": 0.9122302158273381, + "grad_norm": 0.19021078124482907, + "learning_rate": 7.836010517061937e-05, + "loss": 0.3964, + "step": 634 + }, + { + "epoch": 0.9136690647482014, + "grad_norm": 0.22192646472597122, + "learning_rate": 7.834869702541742e-05, + "loss": 0.3918, + "step": 635 + }, + { + "epoch": 0.9151079136690647, + "grad_norm": 0.2247343099527254, + "learning_rate": 7.833725017276573e-05, + "loss": 0.3924, + "step": 636 + }, + { + "epoch": 0.9165467625899281, + "grad_norm": 0.20099820906716964, + "learning_rate": 7.83257646242182e-05, + "loss": 0.3886, + "step": 637 + }, + { + "epoch": 0.9179856115107914, + "grad_norm": 0.2006531932455901, + "learning_rate": 7.831424039136783e-05, + "loss": 0.3948, + "step": 638 + }, + { + "epoch": 0.9194244604316547, + "grad_norm": 0.2135581623713477, + "learning_rate": 7.830267748584666e-05, + "loss": 0.3909, + "step": 639 + }, + { + "epoch": 0.920863309352518, + "grad_norm": 0.2930249173439451, + "learning_rate": 7.829107591932578e-05, + "loss": 0.3834, + "step": 640 + }, + { + "epoch": 0.9223021582733812, + "grad_norm": 0.3510158419143827, + "learning_rate": 7.82794357035153e-05, + "loss": 0.3855, + "step": 641 + }, + { + "epoch": 0.9237410071942446, + "grad_norm": 0.334990714639028, + "learning_rate": 7.82677568501643e-05, + "loss": 0.3889, + "step": 642 + }, + { + "epoch": 0.9251798561151079, + "grad_norm": 0.3292222176629868, + "learning_rate": 7.82560393710609e-05, + "loss": 0.386, + "step": 643 + }, + { + "epoch": 0.9266187050359712, + "grad_norm": 0.33404852035613686, + "learning_rate": 7.824428327803221e-05, + "loss": 0.3919, + "step": 644 + }, + { + "epoch": 0.9280575539568345, + "grad_norm": 0.3436229866403737, + "learning_rate": 7.823248858294428e-05, + "loss": 0.3969, + "step": 645 + }, + { + "epoch": 0.9294964028776979, + "grad_norm": 0.34608808373655575, + "learning_rate": 7.822065529770216e-05, + "loss": 0.3918, + "step": 646 + }, + { + "epoch": 0.9309352517985612, + "grad_norm": 0.3568453570260214, + "learning_rate": 7.820878343424984e-05, + "loss": 0.3827, + "step": 647 + }, + { + "epoch": 0.9323741007194245, + "grad_norm": 0.4080463861388677, + "learning_rate": 7.819687300457021e-05, + "loss": 0.384, + "step": 648 + }, + { + "epoch": 0.9338129496402877, + "grad_norm": 0.4398849386803914, + "learning_rate": 7.818492402068517e-05, + "loss": 0.3936, + "step": 649 + }, + { + "epoch": 0.935251798561151, + "grad_norm": 0.4242173179698511, + "learning_rate": 7.817293649465546e-05, + "loss": 0.3946, + "step": 650 + }, + { + "epoch": 0.9366906474820144, + "grad_norm": 0.3000781182109413, + "learning_rate": 7.816091043858076e-05, + "loss": 0.3927, + "step": 651 + }, + { + "epoch": 0.9381294964028777, + "grad_norm": 0.2671248146821321, + "learning_rate": 7.814884586459962e-05, + "loss": 0.3927, + "step": 652 + }, + { + "epoch": 0.939568345323741, + "grad_norm": 0.3110113659917515, + "learning_rate": 7.813674278488949e-05, + "loss": 0.3853, + "step": 653 + }, + { + "epoch": 0.9410071942446043, + "grad_norm": 0.32913498505284944, + "learning_rate": 7.812460121166666e-05, + "loss": 0.4011, + "step": 654 + }, + { + "epoch": 0.9424460431654677, + "grad_norm": 0.2560732729098207, + "learning_rate": 7.81124211571863e-05, + "loss": 0.3849, + "step": 655 + }, + { + "epoch": 0.943884892086331, + "grad_norm": 0.23473288445415613, + "learning_rate": 7.810020263374239e-05, + "loss": 0.3849, + "step": 656 + }, + { + "epoch": 0.9453237410071943, + "grad_norm": 0.25502262467547715, + "learning_rate": 7.808794565366778e-05, + "loss": 0.3841, + "step": 657 + }, + { + "epoch": 0.9467625899280575, + "grad_norm": 0.2675264909920931, + "learning_rate": 7.807565022933412e-05, + "loss": 0.3807, + "step": 658 + }, + { + "epoch": 0.9482014388489208, + "grad_norm": 0.32022221655981714, + "learning_rate": 7.806331637315183e-05, + "loss": 0.3845, + "step": 659 + }, + { + "epoch": 0.9496402877697842, + "grad_norm": 0.3358875172701553, + "learning_rate": 7.805094409757017e-05, + "loss": 0.3898, + "step": 660 + }, + { + "epoch": 0.9510791366906475, + "grad_norm": 0.22558263781799218, + "learning_rate": 7.803853341507715e-05, + "loss": 0.3947, + "step": 661 + }, + { + "epoch": 0.9525179856115108, + "grad_norm": 0.22029002177773868, + "learning_rate": 7.802608433819957e-05, + "loss": 0.3944, + "step": 662 + }, + { + "epoch": 0.9539568345323741, + "grad_norm": 0.3013802495912675, + "learning_rate": 7.801359687950292e-05, + "loss": 0.4043, + "step": 663 + }, + { + "epoch": 0.9553956834532374, + "grad_norm": 0.2732169010636322, + "learning_rate": 7.800107105159155e-05, + "loss": 0.3878, + "step": 664 + }, + { + "epoch": 0.9568345323741008, + "grad_norm": 0.15983177234294135, + "learning_rate": 7.798850686710841e-05, + "loss": 0.3965, + "step": 665 + }, + { + "epoch": 0.958273381294964, + "grad_norm": 0.24494999138716617, + "learning_rate": 7.797590433873526e-05, + "loss": 0.388, + "step": 666 + }, + { + "epoch": 0.9597122302158273, + "grad_norm": 0.32383354768412387, + "learning_rate": 7.79632634791925e-05, + "loss": 0.387, + "step": 667 + }, + { + "epoch": 0.9611510791366906, + "grad_norm": 0.2620658766378879, + "learning_rate": 7.795058430123925e-05, + "loss": 0.3962, + "step": 668 + }, + { + "epoch": 0.962589928057554, + "grad_norm": 0.22541523028018362, + "learning_rate": 7.793786681767333e-05, + "loss": 0.3872, + "step": 669 + }, + { + "epoch": 0.9640287769784173, + "grad_norm": 0.22687402775204735, + "learning_rate": 7.792511104133117e-05, + "loss": 0.381, + "step": 670 + }, + { + "epoch": 0.9654676258992806, + "grad_norm": 0.25659393392471547, + "learning_rate": 7.791231698508786e-05, + "loss": 0.3876, + "step": 671 + }, + { + "epoch": 0.9669064748201439, + "grad_norm": 0.3045763973091339, + "learning_rate": 7.789948466185718e-05, + "loss": 0.3708, + "step": 672 + }, + { + "epoch": 0.9683453237410072, + "grad_norm": 0.2566836653530128, + "learning_rate": 7.788661408459146e-05, + "loss": 0.3886, + "step": 673 + }, + { + "epoch": 0.9697841726618706, + "grad_norm": 0.23594214728583138, + "learning_rate": 7.787370526628173e-05, + "loss": 0.4012, + "step": 674 + }, + { + "epoch": 0.9712230215827338, + "grad_norm": 0.24609866995203958, + "learning_rate": 7.786075821995754e-05, + "loss": 0.3934, + "step": 675 + }, + { + "epoch": 0.9726618705035971, + "grad_norm": 0.3197191898312636, + "learning_rate": 7.784777295868706e-05, + "loss": 0.3894, + "step": 676 + }, + { + "epoch": 0.9741007194244604, + "grad_norm": 0.4005845521559889, + "learning_rate": 7.783474949557704e-05, + "loss": 0.3856, + "step": 677 + }, + { + "epoch": 0.9755395683453237, + "grad_norm": 0.4425181618821247, + "learning_rate": 7.782168784377276e-05, + "loss": 0.3959, + "step": 678 + }, + { + "epoch": 0.9769784172661871, + "grad_norm": 0.4332856052063338, + "learning_rate": 7.780858801645806e-05, + "loss": 0.3885, + "step": 679 + }, + { + "epoch": 0.9784172661870504, + "grad_norm": 0.3952524434700546, + "learning_rate": 7.779545002685535e-05, + "loss": 0.393, + "step": 680 + }, + { + "epoch": 0.9798561151079137, + "grad_norm": 0.33755031917203393, + "learning_rate": 7.778227388822552e-05, + "loss": 0.389, + "step": 681 + }, + { + "epoch": 0.981294964028777, + "grad_norm": 0.273520410377852, + "learning_rate": 7.776905961386793e-05, + "loss": 0.3939, + "step": 682 + }, + { + "epoch": 0.9827338129496402, + "grad_norm": 0.2906169847789615, + "learning_rate": 7.77558072171205e-05, + "loss": 0.3961, + "step": 683 + }, + { + "epoch": 0.9841726618705036, + "grad_norm": 0.29722724301916326, + "learning_rate": 7.774251671135961e-05, + "loss": 0.3812, + "step": 684 + }, + { + "epoch": 0.9856115107913669, + "grad_norm": 0.2520810803176553, + "learning_rate": 7.77291881100001e-05, + "loss": 0.391, + "step": 685 + }, + { + "epoch": 0.9870503597122302, + "grad_norm": 0.18539473632642314, + "learning_rate": 7.771582142649523e-05, + "loss": 0.3854, + "step": 686 + }, + { + "epoch": 0.9884892086330935, + "grad_norm": 0.23387948076579174, + "learning_rate": 7.770241667433677e-05, + "loss": 0.3938, + "step": 687 + }, + { + "epoch": 0.9899280575539569, + "grad_norm": 0.32818364161003727, + "learning_rate": 7.768897386705488e-05, + "loss": 0.3881, + "step": 688 + }, + { + "epoch": 0.9913669064748202, + "grad_norm": 0.3151278015842664, + "learning_rate": 7.767549301821807e-05, + "loss": 0.3891, + "step": 689 + }, + { + "epoch": 0.9928057553956835, + "grad_norm": 0.1978620567014893, + "learning_rate": 7.766197414143333e-05, + "loss": 0.3837, + "step": 690 + }, + { + "epoch": 0.9942446043165467, + "grad_norm": 0.22559586800348716, + "learning_rate": 7.764841725034602e-05, + "loss": 0.3925, + "step": 691 + }, + { + "epoch": 0.99568345323741, + "grad_norm": 0.28410829917691, + "learning_rate": 7.763482235863985e-05, + "loss": 0.3892, + "step": 692 + }, + { + "epoch": 0.9971223021582734, + "grad_norm": 0.2648279300455164, + "learning_rate": 7.762118948003688e-05, + "loss": 0.3907, + "step": 693 + }, + { + "epoch": 0.9985611510791367, + "grad_norm": 0.19968267788630498, + "learning_rate": 7.760751862829754e-05, + "loss": 0.3839, + "step": 694 + }, + { + "epoch": 1.0, + "grad_norm": 0.2618737125473869, + "learning_rate": 7.759380981722055e-05, + "loss": 0.3857, + "step": 695 + }, + { + "epoch": 1.0014388489208632, + "grad_norm": 0.31553007501168995, + "learning_rate": 7.758006306064301e-05, + "loss": 0.3717, + "step": 696 + }, + { + "epoch": 1.0028776978417266, + "grad_norm": 0.33965441725150664, + "learning_rate": 7.756627837244023e-05, + "loss": 0.3734, + "step": 697 + }, + { + "epoch": 1.0043165467625899, + "grad_norm": 0.40641548452468257, + "learning_rate": 7.755245576652588e-05, + "loss": 0.3791, + "step": 698 + }, + { + "epoch": 1.0057553956834533, + "grad_norm": 0.43018797825921173, + "learning_rate": 7.753859525685187e-05, + "loss": 0.3713, + "step": 699 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.38735977677912703, + "learning_rate": 7.752469685740838e-05, + "loss": 0.379, + "step": 700 + }, + { + "epoch": 1.00863309352518, + "grad_norm": 0.34346977718231, + "learning_rate": 7.751076058222381e-05, + "loss": 0.3666, + "step": 701 + }, + { + "epoch": 1.0100719424460431, + "grad_norm": 0.2758009922986033, + "learning_rate": 7.749678644536485e-05, + "loss": 0.364, + "step": 702 + }, + { + "epoch": 1.0115107913669066, + "grad_norm": 0.2508222065726072, + "learning_rate": 7.748277446093631e-05, + "loss": 0.3722, + "step": 703 + }, + { + "epoch": 1.0129496402877698, + "grad_norm": 0.2581419457116818, + "learning_rate": 7.746872464308131e-05, + "loss": 0.3694, + "step": 704 + }, + { + "epoch": 1.014388489208633, + "grad_norm": 0.2663971309609672, + "learning_rate": 7.745463700598108e-05, + "loss": 0.3723, + "step": 705 + }, + { + "epoch": 1.0158273381294964, + "grad_norm": 0.24874575875769642, + "learning_rate": 7.744051156385503e-05, + "loss": 0.3707, + "step": 706 + }, + { + "epoch": 1.0172661870503596, + "grad_norm": 0.2274756545689248, + "learning_rate": 7.742634833096077e-05, + "loss": 0.373, + "step": 707 + }, + { + "epoch": 1.018705035971223, + "grad_norm": 0.22046600028071656, + "learning_rate": 7.741214732159403e-05, + "loss": 0.3793, + "step": 708 + }, + { + "epoch": 1.0201438848920863, + "grad_norm": 0.23300885777665975, + "learning_rate": 7.739790855008867e-05, + "loss": 0.3733, + "step": 709 + }, + { + "epoch": 1.0215827338129497, + "grad_norm": 0.24905008563481562, + "learning_rate": 7.738363203081664e-05, + "loss": 0.3747, + "step": 710 + }, + { + "epoch": 1.023021582733813, + "grad_norm": 0.2678606420343796, + "learning_rate": 7.736931777818805e-05, + "loss": 0.3851, + "step": 711 + }, + { + "epoch": 1.0244604316546762, + "grad_norm": 0.24026951579855546, + "learning_rate": 7.735496580665105e-05, + "loss": 0.3803, + "step": 712 + }, + { + "epoch": 1.0258992805755396, + "grad_norm": 0.27398099644606966, + "learning_rate": 7.734057613069188e-05, + "loss": 0.3711, + "step": 713 + }, + { + "epoch": 1.0273381294964028, + "grad_norm": 0.2690255911738012, + "learning_rate": 7.73261487648348e-05, + "loss": 0.3737, + "step": 714 + }, + { + "epoch": 1.0287769784172662, + "grad_norm": 0.22042977642621647, + "learning_rate": 7.731168372364219e-05, + "loss": 0.3657, + "step": 715 + }, + { + "epoch": 1.0302158273381294, + "grad_norm": 0.23820484191579894, + "learning_rate": 7.729718102171438e-05, + "loss": 0.3755, + "step": 716 + }, + { + "epoch": 1.0316546762589929, + "grad_norm": 0.26180916905216817, + "learning_rate": 7.728264067368976e-05, + "loss": 0.3718, + "step": 717 + }, + { + "epoch": 1.033093525179856, + "grad_norm": 0.27001803149465553, + "learning_rate": 7.726806269424469e-05, + "loss": 0.3657, + "step": 718 + }, + { + "epoch": 1.0345323741007195, + "grad_norm": 0.2937530686224215, + "learning_rate": 7.725344709809355e-05, + "loss": 0.376, + "step": 719 + }, + { + "epoch": 1.0359712230215827, + "grad_norm": 0.2973441276813663, + "learning_rate": 7.723879389998864e-05, + "loss": 0.3806, + "step": 720 + }, + { + "epoch": 1.037410071942446, + "grad_norm": 0.25307126825041176, + "learning_rate": 7.722410311472026e-05, + "loss": 0.3678, + "step": 721 + }, + { + "epoch": 1.0388489208633094, + "grad_norm": 0.19698468151176968, + "learning_rate": 7.72093747571166e-05, + "loss": 0.3688, + "step": 722 + }, + { + "epoch": 1.0402877697841726, + "grad_norm": 0.17878109299499642, + "learning_rate": 7.719460884204383e-05, + "loss": 0.368, + "step": 723 + }, + { + "epoch": 1.041726618705036, + "grad_norm": 0.22288327986868614, + "learning_rate": 7.717980538440599e-05, + "loss": 0.3683, + "step": 724 + }, + { + "epoch": 1.0431654676258992, + "grad_norm": 0.2320159277967116, + "learning_rate": 7.716496439914502e-05, + "loss": 0.3645, + "step": 725 + }, + { + "epoch": 1.0446043165467627, + "grad_norm": 0.21976899032515732, + "learning_rate": 7.715008590124076e-05, + "loss": 0.3783, + "step": 726 + }, + { + "epoch": 1.0460431654676259, + "grad_norm": 0.2629062946538339, + "learning_rate": 7.713516990571088e-05, + "loss": 0.3704, + "step": 727 + }, + { + "epoch": 1.0474820143884893, + "grad_norm": 0.28509364456056624, + "learning_rate": 7.71202164276109e-05, + "loss": 0.3721, + "step": 728 + }, + { + "epoch": 1.0489208633093525, + "grad_norm": 0.2454798623600427, + "learning_rate": 7.710522548203424e-05, + "loss": 0.3751, + "step": 729 + }, + { + "epoch": 1.0503597122302157, + "grad_norm": 0.2354289666700434, + "learning_rate": 7.709019708411202e-05, + "loss": 0.3758, + "step": 730 + }, + { + "epoch": 1.0517985611510792, + "grad_norm": 0.24512748729268605, + "learning_rate": 7.707513124901327e-05, + "loss": 0.3666, + "step": 731 + }, + { + "epoch": 1.0532374100719424, + "grad_norm": 0.3008060196418346, + "learning_rate": 7.706002799194476e-05, + "loss": 0.3647, + "step": 732 + }, + { + "epoch": 1.0546762589928058, + "grad_norm": 0.34124747562157987, + "learning_rate": 7.704488732815105e-05, + "loss": 0.3674, + "step": 733 + }, + { + "epoch": 1.056115107913669, + "grad_norm": 0.2934930694625762, + "learning_rate": 7.702970927291442e-05, + "loss": 0.3718, + "step": 734 + }, + { + "epoch": 1.0575539568345325, + "grad_norm": 0.19598463515263995, + "learning_rate": 7.701449384155492e-05, + "loss": 0.3808, + "step": 735 + }, + { + "epoch": 1.0589928057553957, + "grad_norm": 0.21231686194205143, + "learning_rate": 7.699924104943033e-05, + "loss": 0.3791, + "step": 736 + }, + { + "epoch": 1.0604316546762589, + "grad_norm": 0.2793791463484476, + "learning_rate": 7.698395091193615e-05, + "loss": 0.3678, + "step": 737 + }, + { + "epoch": 1.0618705035971223, + "grad_norm": 0.25077747032290765, + "learning_rate": 7.696862344450553e-05, + "loss": 0.3659, + "step": 738 + }, + { + "epoch": 1.0633093525179855, + "grad_norm": 0.2698416723588037, + "learning_rate": 7.695325866260932e-05, + "loss": 0.3773, + "step": 739 + }, + { + "epoch": 1.064748201438849, + "grad_norm": 0.25574664417176746, + "learning_rate": 7.693785658175607e-05, + "loss": 0.3634, + "step": 740 + }, + { + "epoch": 1.0661870503597122, + "grad_norm": 0.2120620821263056, + "learning_rate": 7.692241721749194e-05, + "loss": 0.3707, + "step": 741 + }, + { + "epoch": 1.0676258992805756, + "grad_norm": 0.2364943418652984, + "learning_rate": 7.69069405854007e-05, + "loss": 0.3737, + "step": 742 + }, + { + "epoch": 1.0690647482014388, + "grad_norm": 0.22536048745525106, + "learning_rate": 7.68914267011038e-05, + "loss": 0.3694, + "step": 743 + }, + { + "epoch": 1.0705035971223023, + "grad_norm": 0.22723369017031214, + "learning_rate": 7.687587558026024e-05, + "loss": 0.371, + "step": 744 + }, + { + "epoch": 1.0719424460431655, + "grad_norm": 0.21568294234696916, + "learning_rate": 7.686028723856664e-05, + "loss": 0.3688, + "step": 745 + }, + { + "epoch": 1.0733812949640287, + "grad_norm": 0.2528091241920581, + "learning_rate": 7.684466169175714e-05, + "loss": 0.3739, + "step": 746 + }, + { + "epoch": 1.074820143884892, + "grad_norm": 0.2829084202430895, + "learning_rate": 7.68289989556035e-05, + "loss": 0.3642, + "step": 747 + }, + { + "epoch": 1.0762589928057553, + "grad_norm": 0.2889957548752893, + "learning_rate": 7.681329904591495e-05, + "loss": 0.365, + "step": 748 + }, + { + "epoch": 1.0776978417266188, + "grad_norm": 0.30667124621667996, + "learning_rate": 7.67975619785383e-05, + "loss": 0.3681, + "step": 749 + }, + { + "epoch": 1.079136690647482, + "grad_norm": 0.29156344248640786, + "learning_rate": 7.678178776935781e-05, + "loss": 0.3717, + "step": 750 + }, + { + "epoch": 1.0805755395683454, + "grad_norm": 0.26343478914536866, + "learning_rate": 7.676597643429528e-05, + "loss": 0.3739, + "step": 751 + }, + { + "epoch": 1.0820143884892086, + "grad_norm": 0.26201198141793924, + "learning_rate": 7.675012798930994e-05, + "loss": 0.3742, + "step": 752 + }, + { + "epoch": 1.083453237410072, + "grad_norm": 0.2762651821216201, + "learning_rate": 7.673424245039852e-05, + "loss": 0.3715, + "step": 753 + }, + { + "epoch": 1.0848920863309353, + "grad_norm": 0.22200173851448812, + "learning_rate": 7.671831983359515e-05, + "loss": 0.3761, + "step": 754 + }, + { + "epoch": 1.0863309352517985, + "grad_norm": 0.22087460184150698, + "learning_rate": 7.670236015497141e-05, + "loss": 0.3671, + "step": 755 + }, + { + "epoch": 1.087769784172662, + "grad_norm": 0.266194809910504, + "learning_rate": 7.668636343063628e-05, + "loss": 0.3651, + "step": 756 + }, + { + "epoch": 1.0892086330935251, + "grad_norm": 0.30392490240355563, + "learning_rate": 7.667032967673614e-05, + "loss": 0.3726, + "step": 757 + }, + { + "epoch": 1.0906474820143885, + "grad_norm": 0.32390718405302116, + "learning_rate": 7.665425890945474e-05, + "loss": 0.3756, + "step": 758 + }, + { + "epoch": 1.0920863309352518, + "grad_norm": 0.313310504767798, + "learning_rate": 7.663815114501319e-05, + "loss": 0.3685, + "step": 759 + }, + { + "epoch": 1.0935251798561152, + "grad_norm": 0.275653965646266, + "learning_rate": 7.662200639966992e-05, + "loss": 0.3611, + "step": 760 + }, + { + "epoch": 1.0949640287769784, + "grad_norm": 0.2797921397183036, + "learning_rate": 7.660582468972074e-05, + "loss": 0.3804, + "step": 761 + }, + { + "epoch": 1.0964028776978418, + "grad_norm": 0.34469485096564356, + "learning_rate": 7.658960603149873e-05, + "loss": 0.3654, + "step": 762 + }, + { + "epoch": 1.097841726618705, + "grad_norm": 0.34903953742928767, + "learning_rate": 7.657335044137427e-05, + "loss": 0.3698, + "step": 763 + }, + { + "epoch": 1.0992805755395683, + "grad_norm": 0.27823069653353727, + "learning_rate": 7.655705793575504e-05, + "loss": 0.366, + "step": 764 + }, + { + "epoch": 1.1007194244604317, + "grad_norm": 0.24822074396235053, + "learning_rate": 7.654072853108592e-05, + "loss": 0.3717, + "step": 765 + }, + { + "epoch": 1.102158273381295, + "grad_norm": 0.25840628636978735, + "learning_rate": 7.652436224384911e-05, + "loss": 0.3703, + "step": 766 + }, + { + "epoch": 1.1035971223021583, + "grad_norm": 0.2529631240696829, + "learning_rate": 7.6507959090564e-05, + "loss": 0.3722, + "step": 767 + }, + { + "epoch": 1.1050359712230216, + "grad_norm": 0.24929231191974152, + "learning_rate": 7.649151908778721e-05, + "loss": 0.3722, + "step": 768 + }, + { + "epoch": 1.106474820143885, + "grad_norm": 0.2269118319413078, + "learning_rate": 7.64750422521125e-05, + "loss": 0.3746, + "step": 769 + }, + { + "epoch": 1.1079136690647482, + "grad_norm": 0.21647296032756777, + "learning_rate": 7.645852860017086e-05, + "loss": 0.367, + "step": 770 + }, + { + "epoch": 1.1093525179856114, + "grad_norm": 0.2471192399071732, + "learning_rate": 7.644197814863045e-05, + "loss": 0.3684, + "step": 771 + }, + { + "epoch": 1.1107913669064748, + "grad_norm": 0.25791371757926, + "learning_rate": 7.642539091419654e-05, + "loss": 0.3771, + "step": 772 + }, + { + "epoch": 1.112230215827338, + "grad_norm": 0.21350379236032627, + "learning_rate": 7.640876691361152e-05, + "loss": 0.372, + "step": 773 + }, + { + "epoch": 1.1136690647482015, + "grad_norm": 0.19325186323404028, + "learning_rate": 7.639210616365494e-05, + "loss": 0.3747, + "step": 774 + }, + { + "epoch": 1.1151079136690647, + "grad_norm": 0.2329585900087163, + "learning_rate": 7.637540868114338e-05, + "loss": 0.3767, + "step": 775 + }, + { + "epoch": 1.1165467625899281, + "grad_norm": 0.2312114835528738, + "learning_rate": 7.635867448293056e-05, + "loss": 0.3699, + "step": 776 + }, + { + "epoch": 1.1179856115107913, + "grad_norm": 0.22412843889749232, + "learning_rate": 7.63419035859072e-05, + "loss": 0.3687, + "step": 777 + }, + { + "epoch": 1.1194244604316548, + "grad_norm": 0.1692205798761735, + "learning_rate": 7.63250960070011e-05, + "loss": 0.3722, + "step": 778 + }, + { + "epoch": 1.120863309352518, + "grad_norm": 0.16114224565456758, + "learning_rate": 7.630825176317707e-05, + "loss": 0.3697, + "step": 779 + }, + { + "epoch": 1.1223021582733812, + "grad_norm": 0.2213503208640909, + "learning_rate": 7.629137087143693e-05, + "loss": 0.3738, + "step": 780 + }, + { + "epoch": 1.1237410071942446, + "grad_norm": 0.24905814357039824, + "learning_rate": 7.627445334881951e-05, + "loss": 0.3738, + "step": 781 + }, + { + "epoch": 1.1251798561151078, + "grad_norm": 0.27876416934327075, + "learning_rate": 7.625749921240058e-05, + "loss": 0.3702, + "step": 782 + }, + { + "epoch": 1.1266187050359713, + "grad_norm": 0.2907151065971304, + "learning_rate": 7.62405084792929e-05, + "loss": 0.3805, + "step": 783 + }, + { + "epoch": 1.1280575539568345, + "grad_norm": 0.3183459859308742, + "learning_rate": 7.622348116664611e-05, + "loss": 0.3761, + "step": 784 + }, + { + "epoch": 1.129496402877698, + "grad_norm": 0.28647369901052755, + "learning_rate": 7.620641729164686e-05, + "loss": 0.3804, + "step": 785 + }, + { + "epoch": 1.1309352517985611, + "grad_norm": 0.26695805060206323, + "learning_rate": 7.618931687151863e-05, + "loss": 0.3851, + "step": 786 + }, + { + "epoch": 1.1323741007194243, + "grad_norm": 0.21157618631799555, + "learning_rate": 7.617217992352183e-05, + "loss": 0.3712, + "step": 787 + }, + { + "epoch": 1.1338129496402878, + "grad_norm": 0.17102518487708407, + "learning_rate": 7.615500646495373e-05, + "loss": 0.3617, + "step": 788 + }, + { + "epoch": 1.135251798561151, + "grad_norm": 0.2327103761272104, + "learning_rate": 7.613779651314841e-05, + "loss": 0.3597, + "step": 789 + }, + { + "epoch": 1.1366906474820144, + "grad_norm": 0.2855044981002147, + "learning_rate": 7.612055008547688e-05, + "loss": 0.3778, + "step": 790 + }, + { + "epoch": 1.1381294964028776, + "grad_norm": 0.28780307592946286, + "learning_rate": 7.610326719934685e-05, + "loss": 0.3718, + "step": 791 + }, + { + "epoch": 1.139568345323741, + "grad_norm": 0.2829437935854506, + "learning_rate": 7.608594787220292e-05, + "loss": 0.3694, + "step": 792 + }, + { + "epoch": 1.1410071942446043, + "grad_norm": 0.28165833621378605, + "learning_rate": 7.606859212152644e-05, + "loss": 0.3734, + "step": 793 + }, + { + "epoch": 1.1424460431654677, + "grad_norm": 0.266286805674129, + "learning_rate": 7.605119996483551e-05, + "loss": 0.3679, + "step": 794 + }, + { + "epoch": 1.143884892086331, + "grad_norm": 0.2471187084073408, + "learning_rate": 7.6033771419685e-05, + "loss": 0.3724, + "step": 795 + }, + { + "epoch": 1.1453237410071941, + "grad_norm": 0.2983714367288603, + "learning_rate": 7.601630650366648e-05, + "loss": 0.3755, + "step": 796 + }, + { + "epoch": 1.1467625899280576, + "grad_norm": 0.3492960796329427, + "learning_rate": 7.59988052344083e-05, + "loss": 0.37, + "step": 797 + }, + { + "epoch": 1.1482014388489208, + "grad_norm": 0.3751692962462577, + "learning_rate": 7.59812676295754e-05, + "loss": 0.3713, + "step": 798 + }, + { + "epoch": 1.1496402877697842, + "grad_norm": 0.39396196639251957, + "learning_rate": 7.596369370686947e-05, + "loss": 0.3783, + "step": 799 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.3445354561658745, + "learning_rate": 7.594608348402885e-05, + "loss": 0.3804, + "step": 800 + }, + { + "epoch": 1.1525179856115109, + "grad_norm": 0.2735280407069058, + "learning_rate": 7.592843697882848e-05, + "loss": 0.3764, + "step": 801 + }, + { + "epoch": 1.153956834532374, + "grad_norm": 0.2726335410361186, + "learning_rate": 7.591075420907997e-05, + "loss": 0.3727, + "step": 802 + }, + { + "epoch": 1.1553956834532375, + "grad_norm": 0.2622606934047612, + "learning_rate": 7.589303519263151e-05, + "loss": 0.3684, + "step": 803 + }, + { + "epoch": 1.1568345323741007, + "grad_norm": 0.2148203795515544, + "learning_rate": 7.587527994736787e-05, + "loss": 0.3683, + "step": 804 + }, + { + "epoch": 1.158273381294964, + "grad_norm": 0.23153893883516943, + "learning_rate": 7.58574884912104e-05, + "loss": 0.3696, + "step": 805 + }, + { + "epoch": 1.1597122302158274, + "grad_norm": 0.25522408205885655, + "learning_rate": 7.5839660842117e-05, + "loss": 0.3759, + "step": 806 + }, + { + "epoch": 1.1611510791366906, + "grad_norm": 0.22696954301069733, + "learning_rate": 7.582179701808208e-05, + "loss": 0.3709, + "step": 807 + }, + { + "epoch": 1.162589928057554, + "grad_norm": 0.20794558001687014, + "learning_rate": 7.580389703713661e-05, + "loss": 0.3672, + "step": 808 + }, + { + "epoch": 1.1640287769784172, + "grad_norm": 0.23369271592611313, + "learning_rate": 7.5785960917348e-05, + "loss": 0.3638, + "step": 809 + }, + { + "epoch": 1.1654676258992807, + "grad_norm": 0.2386353589713116, + "learning_rate": 7.576798867682018e-05, + "loss": 0.374, + "step": 810 + }, + { + "epoch": 1.1669064748201439, + "grad_norm": 0.21507342856828493, + "learning_rate": 7.574998033369349e-05, + "loss": 0.3654, + "step": 811 + }, + { + "epoch": 1.1683453237410073, + "grad_norm": 0.21609386965419786, + "learning_rate": 7.573193590614479e-05, + "loss": 0.3719, + "step": 812 + }, + { + "epoch": 1.1697841726618705, + "grad_norm": 0.24765489873299398, + "learning_rate": 7.571385541238727e-05, + "loss": 0.3763, + "step": 813 + }, + { + "epoch": 1.1712230215827337, + "grad_norm": 0.2508495575608087, + "learning_rate": 7.569573887067059e-05, + "loss": 0.3675, + "step": 814 + }, + { + "epoch": 1.1726618705035972, + "grad_norm": 0.19808518068361278, + "learning_rate": 7.567758629928076e-05, + "loss": 0.3791, + "step": 815 + }, + { + "epoch": 1.1741007194244604, + "grad_norm": 0.1979656718305221, + "learning_rate": 7.565939771654018e-05, + "loss": 0.3681, + "step": 816 + }, + { + "epoch": 1.1755395683453238, + "grad_norm": 0.19541210281020807, + "learning_rate": 7.564117314080758e-05, + "loss": 0.3656, + "step": 817 + }, + { + "epoch": 1.176978417266187, + "grad_norm": 0.24875723856508886, + "learning_rate": 7.562291259047804e-05, + "loss": 0.366, + "step": 818 + }, + { + "epoch": 1.1784172661870504, + "grad_norm": 0.2593364294380872, + "learning_rate": 7.560461608398292e-05, + "loss": 0.3673, + "step": 819 + }, + { + "epoch": 1.1798561151079137, + "grad_norm": 0.2037316550239657, + "learning_rate": 7.558628363978991e-05, + "loss": 0.3697, + "step": 820 + }, + { + "epoch": 1.181294964028777, + "grad_norm": 0.24957260089896505, + "learning_rate": 7.556791527640292e-05, + "loss": 0.3691, + "step": 821 + }, + { + "epoch": 1.1827338129496403, + "grad_norm": 0.2765448339671684, + "learning_rate": 7.554951101236219e-05, + "loss": 0.3725, + "step": 822 + }, + { + "epoch": 1.1841726618705035, + "grad_norm": 0.20202979074251823, + "learning_rate": 7.553107086624413e-05, + "loss": 0.3595, + "step": 823 + }, + { + "epoch": 1.185611510791367, + "grad_norm": 0.1740436926149169, + "learning_rate": 7.551259485666141e-05, + "loss": 0.3635, + "step": 824 + }, + { + "epoch": 1.1870503597122302, + "grad_norm": 0.21631024506455465, + "learning_rate": 7.549408300226287e-05, + "loss": 0.3712, + "step": 825 + }, + { + "epoch": 1.1884892086330936, + "grad_norm": 0.25177375486379844, + "learning_rate": 7.547553532173356e-05, + "loss": 0.3699, + "step": 826 + }, + { + "epoch": 1.1899280575539568, + "grad_norm": 0.20550097869119144, + "learning_rate": 7.545695183379465e-05, + "loss": 0.3662, + "step": 827 + }, + { + "epoch": 1.19136690647482, + "grad_norm": 0.1833546056638634, + "learning_rate": 7.54383325572035e-05, + "loss": 0.3717, + "step": 828 + }, + { + "epoch": 1.1928057553956835, + "grad_norm": 0.21642428138305977, + "learning_rate": 7.541967751075354e-05, + "loss": 0.3779, + "step": 829 + }, + { + "epoch": 1.1942446043165469, + "grad_norm": 0.22311333328763097, + "learning_rate": 7.540098671327438e-05, + "loss": 0.3672, + "step": 830 + }, + { + "epoch": 1.19568345323741, + "grad_norm": 0.2230284422426472, + "learning_rate": 7.538226018363164e-05, + "loss": 0.3704, + "step": 831 + }, + { + "epoch": 1.1971223021582733, + "grad_norm": 0.2233858353142221, + "learning_rate": 7.536349794072705e-05, + "loss": 0.3661, + "step": 832 + }, + { + "epoch": 1.1985611510791367, + "grad_norm": 0.24431933896310673, + "learning_rate": 7.534470000349835e-05, + "loss": 0.3696, + "step": 833 + }, + { + "epoch": 1.2, + "grad_norm": 0.20570153241892766, + "learning_rate": 7.532586639091936e-05, + "loss": 0.378, + "step": 834 + }, + { + "epoch": 1.2014388489208634, + "grad_norm": 0.23363348010581506, + "learning_rate": 7.530699712199985e-05, + "loss": 0.3638, + "step": 835 + }, + { + "epoch": 1.2028776978417266, + "grad_norm": 0.2551061431484015, + "learning_rate": 7.528809221578565e-05, + "loss": 0.3663, + "step": 836 + }, + { + "epoch": 1.2043165467625898, + "grad_norm": 0.20603774908286682, + "learning_rate": 7.52691516913585e-05, + "loss": 0.3804, + "step": 837 + }, + { + "epoch": 1.2057553956834532, + "grad_norm": 0.230745543757371, + "learning_rate": 7.525017556783612e-05, + "loss": 0.3748, + "step": 838 + }, + { + "epoch": 1.2071942446043165, + "grad_norm": 0.27823629877405837, + "learning_rate": 7.523116386437216e-05, + "loss": 0.3636, + "step": 839 + }, + { + "epoch": 1.20863309352518, + "grad_norm": 0.27983234823070235, + "learning_rate": 7.521211660015615e-05, + "loss": 0.3713, + "step": 840 + }, + { + "epoch": 1.210071942446043, + "grad_norm": 0.20857782351282705, + "learning_rate": 7.519303379441357e-05, + "loss": 0.3682, + "step": 841 + }, + { + "epoch": 1.2115107913669065, + "grad_norm": 0.1809768437352649, + "learning_rate": 7.517391546640573e-05, + "loss": 0.374, + "step": 842 + }, + { + "epoch": 1.2129496402877697, + "grad_norm": 0.21711202508277216, + "learning_rate": 7.515476163542982e-05, + "loss": 0.3686, + "step": 843 + }, + { + "epoch": 1.2143884892086332, + "grad_norm": 0.1921623166614579, + "learning_rate": 7.513557232081887e-05, + "loss": 0.3694, + "step": 844 + }, + { + "epoch": 1.2158273381294964, + "grad_norm": 0.15316247122528281, + "learning_rate": 7.511634754194168e-05, + "loss": 0.3727, + "step": 845 + }, + { + "epoch": 1.2172661870503596, + "grad_norm": 0.18834756267417926, + "learning_rate": 7.50970873182029e-05, + "loss": 0.3692, + "step": 846 + }, + { + "epoch": 1.218705035971223, + "grad_norm": 0.23686774578710018, + "learning_rate": 7.507779166904292e-05, + "loss": 0.3711, + "step": 847 + }, + { + "epoch": 1.2201438848920863, + "grad_norm": 0.24771915125409488, + "learning_rate": 7.50584606139379e-05, + "loss": 0.3763, + "step": 848 + }, + { + "epoch": 1.2215827338129497, + "grad_norm": 0.24131346498923673, + "learning_rate": 7.503909417239975e-05, + "loss": 0.3725, + "step": 849 + }, + { + "epoch": 1.223021582733813, + "grad_norm": 0.25443424568582057, + "learning_rate": 7.501969236397607e-05, + "loss": 0.3716, + "step": 850 + }, + { + "epoch": 1.2244604316546763, + "grad_norm": 0.28102454004883953, + "learning_rate": 7.500025520825018e-05, + "loss": 0.3655, + "step": 851 + }, + { + "epoch": 1.2258992805755395, + "grad_norm": 0.2731890197889495, + "learning_rate": 7.498078272484108e-05, + "loss": 0.3723, + "step": 852 + }, + { + "epoch": 1.227338129496403, + "grad_norm": 0.24784435562954357, + "learning_rate": 7.496127493340341e-05, + "loss": 0.3811, + "step": 853 + }, + { + "epoch": 1.2287769784172662, + "grad_norm": 0.2603058743485281, + "learning_rate": 7.494173185362745e-05, + "loss": 0.3776, + "step": 854 + }, + { + "epoch": 1.2302158273381294, + "grad_norm": 0.24634148328332406, + "learning_rate": 7.492215350523913e-05, + "loss": 0.3685, + "step": 855 + }, + { + "epoch": 1.2316546762589928, + "grad_norm": 0.2529437285257151, + "learning_rate": 7.490253990799991e-05, + "loss": 0.3769, + "step": 856 + }, + { + "epoch": 1.233093525179856, + "grad_norm": 0.21585577519657578, + "learning_rate": 7.488289108170692e-05, + "loss": 0.371, + "step": 857 + }, + { + "epoch": 1.2345323741007195, + "grad_norm": 0.20070593268415463, + "learning_rate": 7.486320704619276e-05, + "loss": 0.362, + "step": 858 + }, + { + "epoch": 1.2359712230215827, + "grad_norm": 0.33278265943848906, + "learning_rate": 7.484348782132565e-05, + "loss": 0.3719, + "step": 859 + }, + { + "epoch": 1.2374100719424461, + "grad_norm": 0.39253677405101345, + "learning_rate": 7.482373342700927e-05, + "loss": 0.3699, + "step": 860 + }, + { + "epoch": 1.2388489208633093, + "grad_norm": 0.348245063526455, + "learning_rate": 7.48039438831828e-05, + "loss": 0.3668, + "step": 861 + }, + { + "epoch": 1.2402877697841728, + "grad_norm": 0.27252448611358787, + "learning_rate": 7.478411920982095e-05, + "loss": 0.3745, + "step": 862 + }, + { + "epoch": 1.241726618705036, + "grad_norm": 0.20338714632236685, + "learning_rate": 7.476425942693382e-05, + "loss": 0.3767, + "step": 863 + }, + { + "epoch": 1.2431654676258992, + "grad_norm": 0.21236877270999938, + "learning_rate": 7.474436455456701e-05, + "loss": 0.3735, + "step": 864 + }, + { + "epoch": 1.2446043165467626, + "grad_norm": 0.21143808627777527, + "learning_rate": 7.472443461280149e-05, + "loss": 0.3727, + "step": 865 + }, + { + "epoch": 1.2460431654676258, + "grad_norm": 0.22982317906740338, + "learning_rate": 7.470446962175367e-05, + "loss": 0.3764, + "step": 866 + }, + { + "epoch": 1.2474820143884893, + "grad_norm": 0.22235343681142253, + "learning_rate": 7.468446960157527e-05, + "loss": 0.3669, + "step": 867 + }, + { + "epoch": 1.2489208633093525, + "grad_norm": 0.18991249022815596, + "learning_rate": 7.466443457245344e-05, + "loss": 0.3772, + "step": 868 + }, + { + "epoch": 1.2503597122302157, + "grad_norm": 0.20548444109653444, + "learning_rate": 7.464436455461066e-05, + "loss": 0.3692, + "step": 869 + }, + { + "epoch": 1.2517985611510791, + "grad_norm": 0.24937818557657385, + "learning_rate": 7.462425956830466e-05, + "loss": 0.3733, + "step": 870 + }, + { + "epoch": 1.2532374100719426, + "grad_norm": 0.29764989918438073, + "learning_rate": 7.460411963382853e-05, + "loss": 0.3685, + "step": 871 + }, + { + "epoch": 1.2546762589928058, + "grad_norm": 0.31750242257491057, + "learning_rate": 7.45839447715106e-05, + "loss": 0.3652, + "step": 872 + }, + { + "epoch": 1.256115107913669, + "grad_norm": 0.3134852958397248, + "learning_rate": 7.456373500171449e-05, + "loss": 0.3644, + "step": 873 + }, + { + "epoch": 1.2575539568345324, + "grad_norm": 0.30230759591716544, + "learning_rate": 7.454349034483903e-05, + "loss": 0.3686, + "step": 874 + }, + { + "epoch": 1.2589928057553956, + "grad_norm": 0.318563757610303, + "learning_rate": 7.452321082131824e-05, + "loss": 0.3776, + "step": 875 + }, + { + "epoch": 1.260431654676259, + "grad_norm": 0.27034438572857183, + "learning_rate": 7.450289645162138e-05, + "loss": 0.3671, + "step": 876 + }, + { + "epoch": 1.2618705035971223, + "grad_norm": 0.20339298565882605, + "learning_rate": 7.448254725625287e-05, + "loss": 0.3711, + "step": 877 + }, + { + "epoch": 1.2633093525179855, + "grad_norm": 0.1882757355289913, + "learning_rate": 7.446216325575225e-05, + "loss": 0.3765, + "step": 878 + }, + { + "epoch": 1.264748201438849, + "grad_norm": 0.2282191365184483, + "learning_rate": 7.444174447069423e-05, + "loss": 0.3732, + "step": 879 + }, + { + "epoch": 1.2661870503597124, + "grad_norm": 0.24506792063831453, + "learning_rate": 7.442129092168859e-05, + "loss": 0.3764, + "step": 880 + }, + { + "epoch": 1.2676258992805756, + "grad_norm": 0.22089288271493143, + "learning_rate": 7.440080262938026e-05, + "loss": 0.3681, + "step": 881 + }, + { + "epoch": 1.2690647482014388, + "grad_norm": 0.2301162172592513, + "learning_rate": 7.438027961444916e-05, + "loss": 0.3748, + "step": 882 + }, + { + "epoch": 1.2705035971223022, + "grad_norm": 0.2542452738575605, + "learning_rate": 7.435972189761033e-05, + "loss": 0.3688, + "step": 883 + }, + { + "epoch": 1.2719424460431654, + "grad_norm": 0.22470200136134963, + "learning_rate": 7.43391294996138e-05, + "loss": 0.3611, + "step": 884 + }, + { + "epoch": 1.2733812949640289, + "grad_norm": 0.1898198683091336, + "learning_rate": 7.431850244124459e-05, + "loss": 0.3739, + "step": 885 + }, + { + "epoch": 1.274820143884892, + "grad_norm": 0.1820491835452753, + "learning_rate": 7.429784074332274e-05, + "loss": 0.3712, + "step": 886 + }, + { + "epoch": 1.2762589928057553, + "grad_norm": 0.25402600893388405, + "learning_rate": 7.427714442670324e-05, + "loss": 0.3609, + "step": 887 + }, + { + "epoch": 1.2776978417266187, + "grad_norm": 0.2932596065482579, + "learning_rate": 7.425641351227602e-05, + "loss": 0.3735, + "step": 888 + }, + { + "epoch": 1.2791366906474821, + "grad_norm": 0.254759984310122, + "learning_rate": 7.423564802096592e-05, + "loss": 0.3706, + "step": 889 + }, + { + "epoch": 1.2805755395683454, + "grad_norm": 0.15825456679635624, + "learning_rate": 7.42148479737327e-05, + "loss": 0.3732, + "step": 890 + }, + { + "epoch": 1.2820143884892086, + "grad_norm": 0.19782704607500964, + "learning_rate": 7.419401339157099e-05, + "loss": 0.3666, + "step": 891 + }, + { + "epoch": 1.283453237410072, + "grad_norm": 0.22692372888409243, + "learning_rate": 7.41731442955103e-05, + "loss": 0.3727, + "step": 892 + }, + { + "epoch": 1.2848920863309352, + "grad_norm": 0.22599493246768218, + "learning_rate": 7.415224070661492e-05, + "loss": 0.3735, + "step": 893 + }, + { + "epoch": 1.2863309352517986, + "grad_norm": 0.20090376197165424, + "learning_rate": 7.413130264598404e-05, + "loss": 0.3742, + "step": 894 + }, + { + "epoch": 1.2877697841726619, + "grad_norm": 0.18062614457652348, + "learning_rate": 7.411033013475156e-05, + "loss": 0.3661, + "step": 895 + }, + { + "epoch": 1.289208633093525, + "grad_norm": 0.22557471773322108, + "learning_rate": 7.408932319408619e-05, + "loss": 0.3706, + "step": 896 + }, + { + "epoch": 1.2906474820143885, + "grad_norm": 0.21324522732601672, + "learning_rate": 7.406828184519141e-05, + "loss": 0.373, + "step": 897 + }, + { + "epoch": 1.292086330935252, + "grad_norm": 0.20198963604788683, + "learning_rate": 7.40472061093054e-05, + "loss": 0.3727, + "step": 898 + }, + { + "epoch": 1.2935251798561151, + "grad_norm": 0.19423555758666586, + "learning_rate": 7.402609600770104e-05, + "loss": 0.369, + "step": 899 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.20589478044466908, + "learning_rate": 7.400495156168596e-05, + "loss": 0.3728, + "step": 900 + }, + { + "epoch": 1.2964028776978418, + "grad_norm": 0.25023522722452557, + "learning_rate": 7.39837727926024e-05, + "loss": 0.3636, + "step": 901 + }, + { + "epoch": 1.297841726618705, + "grad_norm": 0.2919924081876397, + "learning_rate": 7.396255972182723e-05, + "loss": 0.3675, + "step": 902 + }, + { + "epoch": 1.2992805755395684, + "grad_norm": 0.2561789804875008, + "learning_rate": 7.394131237077199e-05, + "loss": 0.3722, + "step": 903 + }, + { + "epoch": 1.3007194244604317, + "grad_norm": 0.2476025541732896, + "learning_rate": 7.39200307608828e-05, + "loss": 0.3724, + "step": 904 + }, + { + "epoch": 1.3021582733812949, + "grad_norm": 0.2756821793834618, + "learning_rate": 7.389871491364036e-05, + "loss": 0.3712, + "step": 905 + }, + { + "epoch": 1.3035971223021583, + "grad_norm": 0.2725376081819577, + "learning_rate": 7.387736485055993e-05, + "loss": 0.3691, + "step": 906 + }, + { + "epoch": 1.3050359712230215, + "grad_norm": 0.2632588761126749, + "learning_rate": 7.385598059319129e-05, + "loss": 0.3806, + "step": 907 + }, + { + "epoch": 1.306474820143885, + "grad_norm": 0.274824077738147, + "learning_rate": 7.383456216311875e-05, + "loss": 0.38, + "step": 908 + }, + { + "epoch": 1.3079136690647482, + "grad_norm": 0.2976218077528491, + "learning_rate": 7.381310958196112e-05, + "loss": 0.3745, + "step": 909 + }, + { + "epoch": 1.3093525179856116, + "grad_norm": 0.2740150521185998, + "learning_rate": 7.379162287137167e-05, + "loss": 0.3641, + "step": 910 + }, + { + "epoch": 1.3107913669064748, + "grad_norm": 0.18624314318474133, + "learning_rate": 7.37701020530381e-05, + "loss": 0.3656, + "step": 911 + }, + { + "epoch": 1.3122302158273382, + "grad_norm": 0.23514671142831362, + "learning_rate": 7.374854714868259e-05, + "loss": 0.3619, + "step": 912 + }, + { + "epoch": 1.3136690647482014, + "grad_norm": 0.2640162079181578, + "learning_rate": 7.372695818006167e-05, + "loss": 0.3737, + "step": 913 + }, + { + "epoch": 1.3151079136690647, + "grad_norm": 0.21895715772510793, + "learning_rate": 7.370533516896627e-05, + "loss": 0.3748, + "step": 914 + }, + { + "epoch": 1.316546762589928, + "grad_norm": 0.19204811742035638, + "learning_rate": 7.368367813722169e-05, + "loss": 0.3762, + "step": 915 + }, + { + "epoch": 1.3179856115107913, + "grad_norm": 0.23551727025175465, + "learning_rate": 7.366198710668755e-05, + "loss": 0.371, + "step": 916 + }, + { + "epoch": 1.3194244604316547, + "grad_norm": 0.21097683299141343, + "learning_rate": 7.364026209925783e-05, + "loss": 0.3749, + "step": 917 + }, + { + "epoch": 1.320863309352518, + "grad_norm": 0.18362234695221757, + "learning_rate": 7.361850313686076e-05, + "loss": 0.3728, + "step": 918 + }, + { + "epoch": 1.3223021582733812, + "grad_norm": 0.21469391732409804, + "learning_rate": 7.359671024145886e-05, + "loss": 0.3656, + "step": 919 + }, + { + "epoch": 1.3237410071942446, + "grad_norm": 0.21885651922020025, + "learning_rate": 7.35748834350489e-05, + "loss": 0.3671, + "step": 920 + }, + { + "epoch": 1.325179856115108, + "grad_norm": 0.1970601176982681, + "learning_rate": 7.355302273966186e-05, + "loss": 0.3716, + "step": 921 + }, + { + "epoch": 1.3266187050359712, + "grad_norm": 0.2417575513009536, + "learning_rate": 7.353112817736295e-05, + "loss": 0.3659, + "step": 922 + }, + { + "epoch": 1.3280575539568344, + "grad_norm": 0.22902836288856157, + "learning_rate": 7.350919977025157e-05, + "loss": 0.3689, + "step": 923 + }, + { + "epoch": 1.3294964028776979, + "grad_norm": 0.18807707508033616, + "learning_rate": 7.348723754046127e-05, + "loss": 0.3607, + "step": 924 + }, + { + "epoch": 1.330935251798561, + "grad_norm": 0.19350319400188104, + "learning_rate": 7.34652415101597e-05, + "loss": 0.3663, + "step": 925 + }, + { + "epoch": 1.3323741007194245, + "grad_norm": 0.22217192896787372, + "learning_rate": 7.344321170154871e-05, + "loss": 0.3689, + "step": 926 + }, + { + "epoch": 1.3338129496402877, + "grad_norm": 0.27901108408156117, + "learning_rate": 7.342114813686419e-05, + "loss": 0.3723, + "step": 927 + }, + { + "epoch": 1.335251798561151, + "grad_norm": 0.2869745209501443, + "learning_rate": 7.339905083837608e-05, + "loss": 0.3831, + "step": 928 + }, + { + "epoch": 1.3366906474820144, + "grad_norm": 0.22988927418717225, + "learning_rate": 7.337691982838841e-05, + "loss": 0.3633, + "step": 929 + }, + { + "epoch": 1.3381294964028778, + "grad_norm": 0.21230789850039003, + "learning_rate": 7.335475512923924e-05, + "loss": 0.3642, + "step": 930 + }, + { + "epoch": 1.339568345323741, + "grad_norm": 0.23346971236746394, + "learning_rate": 7.33325567633006e-05, + "loss": 0.3768, + "step": 931 + }, + { + "epoch": 1.3410071942446042, + "grad_norm": 0.17917347614029558, + "learning_rate": 7.331032475297855e-05, + "loss": 0.3684, + "step": 932 + }, + { + "epoch": 1.3424460431654677, + "grad_norm": 0.15564979709111454, + "learning_rate": 7.328805912071307e-05, + "loss": 0.3648, + "step": 933 + }, + { + "epoch": 1.3438848920863309, + "grad_norm": 0.18785506120243278, + "learning_rate": 7.326575988897807e-05, + "loss": 0.3752, + "step": 934 + }, + { + "epoch": 1.3453237410071943, + "grad_norm": 0.1965931833564018, + "learning_rate": 7.324342708028141e-05, + "loss": 0.3673, + "step": 935 + }, + { + "epoch": 1.3467625899280575, + "grad_norm": 0.20849109538010188, + "learning_rate": 7.322106071716483e-05, + "loss": 0.3618, + "step": 936 + }, + { + "epoch": 1.3482014388489207, + "grad_norm": 0.19947202708650083, + "learning_rate": 7.319866082220388e-05, + "loss": 0.3696, + "step": 937 + }, + { + "epoch": 1.3496402877697842, + "grad_norm": 0.16626015484382997, + "learning_rate": 7.317622741800808e-05, + "loss": 0.3869, + "step": 938 + }, + { + "epoch": 1.3510791366906476, + "grad_norm": 0.18899282070305004, + "learning_rate": 7.315376052722065e-05, + "loss": 0.369, + "step": 939 + }, + { + "epoch": 1.3525179856115108, + "grad_norm": 0.1743281306881679, + "learning_rate": 7.313126017251868e-05, + "loss": 0.3589, + "step": 940 + }, + { + "epoch": 1.353956834532374, + "grad_norm": 0.1450761098854889, + "learning_rate": 7.3108726376613e-05, + "loss": 0.3722, + "step": 941 + }, + { + "epoch": 1.3553956834532375, + "grad_norm": 0.15543250685925575, + "learning_rate": 7.308615916224823e-05, + "loss": 0.3634, + "step": 942 + }, + { + "epoch": 1.3568345323741007, + "grad_norm": 0.2076568865911509, + "learning_rate": 7.306355855220267e-05, + "loss": 0.3749, + "step": 943 + }, + { + "epoch": 1.358273381294964, + "grad_norm": 0.2164167100448256, + "learning_rate": 7.30409245692884e-05, + "loss": 0.3699, + "step": 944 + }, + { + "epoch": 1.3597122302158273, + "grad_norm": 0.20103550217189015, + "learning_rate": 7.301825723635111e-05, + "loss": 0.3653, + "step": 945 + }, + { + "epoch": 1.3611510791366905, + "grad_norm": 0.22906493617217463, + "learning_rate": 7.299555657627021e-05, + "loss": 0.3659, + "step": 946 + }, + { + "epoch": 1.362589928057554, + "grad_norm": 0.24804845625514324, + "learning_rate": 7.29728226119587e-05, + "loss": 0.3843, + "step": 947 + }, + { + "epoch": 1.3640287769784174, + "grad_norm": 0.2574010821529766, + "learning_rate": 7.295005536636325e-05, + "loss": 0.3694, + "step": 948 + }, + { + "epoch": 1.3654676258992806, + "grad_norm": 0.27319217453544853, + "learning_rate": 7.292725486246407e-05, + "loss": 0.3691, + "step": 949 + }, + { + "epoch": 1.3669064748201438, + "grad_norm": 0.3137763417062616, + "learning_rate": 7.290442112327498e-05, + "loss": 0.3728, + "step": 950 + }, + { + "epoch": 1.3683453237410073, + "grad_norm": 0.31367627699699374, + "learning_rate": 7.288155417184331e-05, + "loss": 0.3755, + "step": 951 + }, + { + "epoch": 1.3697841726618705, + "grad_norm": 0.31054301331184375, + "learning_rate": 7.285865403124995e-05, + "loss": 0.3701, + "step": 952 + }, + { + "epoch": 1.371223021582734, + "grad_norm": 0.264865486382744, + "learning_rate": 7.283572072460927e-05, + "loss": 0.3637, + "step": 953 + }, + { + "epoch": 1.3726618705035971, + "grad_norm": 0.24832730740175757, + "learning_rate": 7.28127542750691e-05, + "loss": 0.3666, + "step": 954 + }, + { + "epoch": 1.3741007194244603, + "grad_norm": 0.24633003699743544, + "learning_rate": 7.278975470581076e-05, + "loss": 0.3744, + "step": 955 + }, + { + "epoch": 1.3755395683453238, + "grad_norm": 0.23944991878988073, + "learning_rate": 7.276672204004898e-05, + "loss": 0.3723, + "step": 956 + }, + { + "epoch": 1.376978417266187, + "grad_norm": 0.23658561224616861, + "learning_rate": 7.274365630103189e-05, + "loss": 0.3683, + "step": 957 + }, + { + "epoch": 1.3784172661870504, + "grad_norm": 0.2592946740203759, + "learning_rate": 7.2720557512041e-05, + "loss": 0.3748, + "step": 958 + }, + { + "epoch": 1.3798561151079136, + "grad_norm": 0.23670621727444183, + "learning_rate": 7.269742569639121e-05, + "loss": 0.3744, + "step": 959 + }, + { + "epoch": 1.381294964028777, + "grad_norm": 0.2292635420385005, + "learning_rate": 7.267426087743073e-05, + "loss": 0.3768, + "step": 960 + }, + { + "epoch": 1.3827338129496403, + "grad_norm": 0.2268629950072278, + "learning_rate": 7.265106307854107e-05, + "loss": 0.3738, + "step": 961 + }, + { + "epoch": 1.3841726618705037, + "grad_norm": 0.22747961927056887, + "learning_rate": 7.262783232313706e-05, + "loss": 0.3639, + "step": 962 + }, + { + "epoch": 1.385611510791367, + "grad_norm": 0.22375027684102045, + "learning_rate": 7.260456863466676e-05, + "loss": 0.3693, + "step": 963 + }, + { + "epoch": 1.3870503597122301, + "grad_norm": 0.26756377015293353, + "learning_rate": 7.258127203661153e-05, + "loss": 0.3836, + "step": 964 + }, + { + "epoch": 1.3884892086330936, + "grad_norm": 0.2641592742373027, + "learning_rate": 7.255794255248587e-05, + "loss": 0.3687, + "step": 965 + }, + { + "epoch": 1.3899280575539568, + "grad_norm": 0.1741039452690814, + "learning_rate": 7.253458020583752e-05, + "loss": 0.365, + "step": 966 + }, + { + "epoch": 1.3913669064748202, + "grad_norm": 0.24619697055268608, + "learning_rate": 7.25111850202474e-05, + "loss": 0.3755, + "step": 967 + }, + { + "epoch": 1.3928057553956834, + "grad_norm": 0.29978393559314154, + "learning_rate": 7.248775701932953e-05, + "loss": 0.3806, + "step": 968 + }, + { + "epoch": 1.3942446043165468, + "grad_norm": 0.2198291463813615, + "learning_rate": 7.246429622673111e-05, + "loss": 0.3794, + "step": 969 + }, + { + "epoch": 1.39568345323741, + "grad_norm": 0.149923625810162, + "learning_rate": 7.244080266613238e-05, + "loss": 0.3758, + "step": 970 + }, + { + "epoch": 1.3971223021582735, + "grad_norm": 0.1727076360812617, + "learning_rate": 7.241727636124671e-05, + "loss": 0.3718, + "step": 971 + }, + { + "epoch": 1.3985611510791367, + "grad_norm": 0.2094557826048381, + "learning_rate": 7.239371733582047e-05, + "loss": 0.3776, + "step": 972 + }, + { + "epoch": 1.4, + "grad_norm": 0.15811082164113044, + "learning_rate": 7.23701256136331e-05, + "loss": 0.3623, + "step": 973 + }, + { + "epoch": 1.4014388489208633, + "grad_norm": 0.13596205161356006, + "learning_rate": 7.2346501218497e-05, + "loss": 0.3679, + "step": 974 + }, + { + "epoch": 1.4028776978417266, + "grad_norm": 0.1547108001168075, + "learning_rate": 7.23228441742576e-05, + "loss": 0.3694, + "step": 975 + }, + { + "epoch": 1.40431654676259, + "grad_norm": 0.1515226364651108, + "learning_rate": 7.229915450479324e-05, + "loss": 0.366, + "step": 976 + }, + { + "epoch": 1.4057553956834532, + "grad_norm": 0.13965019541850962, + "learning_rate": 7.227543223401522e-05, + "loss": 0.3707, + "step": 977 + }, + { + "epoch": 1.4071942446043164, + "grad_norm": 0.18867540462886592, + "learning_rate": 7.225167738586772e-05, + "loss": 0.3707, + "step": 978 + }, + { + "epoch": 1.4086330935251798, + "grad_norm": 0.2690397878961195, + "learning_rate": 7.22278899843278e-05, + "loss": 0.3727, + "step": 979 + }, + { + "epoch": 1.4100719424460433, + "grad_norm": 0.3127286924758779, + "learning_rate": 7.220407005340542e-05, + "loss": 0.3702, + "step": 980 + }, + { + "epoch": 1.4115107913669065, + "grad_norm": 0.35116182021408393, + "learning_rate": 7.218021761714336e-05, + "loss": 0.3752, + "step": 981 + }, + { + "epoch": 1.4129496402877697, + "grad_norm": 0.3130530174839103, + "learning_rate": 7.215633269961714e-05, + "loss": 0.3665, + "step": 982 + }, + { + "epoch": 1.4143884892086331, + "grad_norm": 0.254603450015429, + "learning_rate": 7.213241532493516e-05, + "loss": 0.3653, + "step": 983 + }, + { + "epoch": 1.4158273381294963, + "grad_norm": 0.26860387604138586, + "learning_rate": 7.210846551723855e-05, + "loss": 0.3679, + "step": 984 + }, + { + "epoch": 1.4172661870503598, + "grad_norm": 0.29070538996667433, + "learning_rate": 7.208448330070116e-05, + "loss": 0.3766, + "step": 985 + }, + { + "epoch": 1.418705035971223, + "grad_norm": 0.31265244907033013, + "learning_rate": 7.206046869952954e-05, + "loss": 0.3824, + "step": 986 + }, + { + "epoch": 1.4201438848920862, + "grad_norm": 0.2763893862741098, + "learning_rate": 7.203642173796298e-05, + "loss": 0.3673, + "step": 987 + }, + { + "epoch": 1.4215827338129496, + "grad_norm": 0.22370884435098334, + "learning_rate": 7.201234244027338e-05, + "loss": 0.3709, + "step": 988 + }, + { + "epoch": 1.423021582733813, + "grad_norm": 0.19105592811118327, + "learning_rate": 7.19882308307653e-05, + "loss": 0.3636, + "step": 989 + }, + { + "epoch": 1.4244604316546763, + "grad_norm": 0.20107475993801432, + "learning_rate": 7.196408693377594e-05, + "loss": 0.3684, + "step": 990 + }, + { + "epoch": 1.4258992805755395, + "grad_norm": 0.26309111130685686, + "learning_rate": 7.193991077367501e-05, + "loss": 0.3778, + "step": 991 + }, + { + "epoch": 1.427338129496403, + "grad_norm": 0.28549866613414865, + "learning_rate": 7.19157023748649e-05, + "loss": 0.372, + "step": 992 + }, + { + "epoch": 1.4287769784172661, + "grad_norm": 0.287943613682877, + "learning_rate": 7.189146176178044e-05, + "loss": 0.376, + "step": 993 + }, + { + "epoch": 1.4302158273381296, + "grad_norm": 0.24113088730328244, + "learning_rate": 7.186718895888904e-05, + "loss": 0.3663, + "step": 994 + }, + { + "epoch": 1.4316546762589928, + "grad_norm": 0.2138198760304494, + "learning_rate": 7.184288399069054e-05, + "loss": 0.3671, + "step": 995 + }, + { + "epoch": 1.433093525179856, + "grad_norm": 0.22492272003674318, + "learning_rate": 7.181854688171732e-05, + "loss": 0.3708, + "step": 996 + }, + { + "epoch": 1.4345323741007194, + "grad_norm": 0.20104573143639234, + "learning_rate": 7.179417765653413e-05, + "loss": 0.3695, + "step": 997 + }, + { + "epoch": 1.4359712230215829, + "grad_norm": 0.16938615838863322, + "learning_rate": 7.17697763397382e-05, + "loss": 0.3662, + "step": 998 + }, + { + "epoch": 1.437410071942446, + "grad_norm": 0.14805036489703202, + "learning_rate": 7.174534295595911e-05, + "loss": 0.3648, + "step": 999 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.17266866558604405, + "learning_rate": 7.17208775298588e-05, + "loss": 0.3721, + "step": 1000 + }, + { + "epoch": 1.4402877697841727, + "grad_norm": 0.19524748761472635, + "learning_rate": 7.169638008613158e-05, + "loss": 0.3731, + "step": 1001 + }, + { + "epoch": 1.441726618705036, + "grad_norm": 0.220526684507708, + "learning_rate": 7.16718506495041e-05, + "loss": 0.372, + "step": 1002 + }, + { + "epoch": 1.4431654676258994, + "grad_norm": 0.21718586501066883, + "learning_rate": 7.164728924473522e-05, + "loss": 0.3666, + "step": 1003 + }, + { + "epoch": 1.4446043165467626, + "grad_norm": 0.199258907361367, + "learning_rate": 7.162269589661614e-05, + "loss": 0.3735, + "step": 1004 + }, + { + "epoch": 1.4460431654676258, + "grad_norm": 0.2314267753915402, + "learning_rate": 7.15980706299703e-05, + "loss": 0.3569, + "step": 1005 + }, + { + "epoch": 1.4474820143884892, + "grad_norm": 0.26551029161174233, + "learning_rate": 7.15734134696533e-05, + "loss": 0.3759, + "step": 1006 + }, + { + "epoch": 1.4489208633093524, + "grad_norm": 0.22711136350517233, + "learning_rate": 7.1548724440553e-05, + "loss": 0.3576, + "step": 1007 + }, + { + "epoch": 1.4503597122302159, + "grad_norm": 0.17628765409818595, + "learning_rate": 7.152400356758937e-05, + "loss": 0.3721, + "step": 1008 + }, + { + "epoch": 1.451798561151079, + "grad_norm": 0.20206966855772898, + "learning_rate": 7.149925087571456e-05, + "loss": 0.3744, + "step": 1009 + }, + { + "epoch": 1.4532374100719425, + "grad_norm": 0.1865332111696231, + "learning_rate": 7.147446638991283e-05, + "loss": 0.362, + "step": 1010 + }, + { + "epoch": 1.4546762589928057, + "grad_norm": 0.16890414484534108, + "learning_rate": 7.14496501352005e-05, + "loss": 0.3672, + "step": 1011 + }, + { + "epoch": 1.4561151079136692, + "grad_norm": 0.17979472458157772, + "learning_rate": 7.1424802136626e-05, + "loss": 0.3719, + "step": 1012 + }, + { + "epoch": 1.4575539568345324, + "grad_norm": 0.15033256457936278, + "learning_rate": 7.139992241926978e-05, + "loss": 0.3607, + "step": 1013 + }, + { + "epoch": 1.4589928057553956, + "grad_norm": 0.1863289313868577, + "learning_rate": 7.137501100824432e-05, + "loss": 0.3698, + "step": 1014 + }, + { + "epoch": 1.460431654676259, + "grad_norm": 0.2298324813762039, + "learning_rate": 7.135006792869405e-05, + "loss": 0.3698, + "step": 1015 + }, + { + "epoch": 1.4618705035971222, + "grad_norm": 0.17501429166945917, + "learning_rate": 7.132509320579542e-05, + "loss": 0.3691, + "step": 1016 + }, + { + "epoch": 1.4633093525179857, + "grad_norm": 0.1568780008946555, + "learning_rate": 7.130008686475677e-05, + "loss": 0.3771, + "step": 1017 + }, + { + "epoch": 1.4647482014388489, + "grad_norm": 0.18883894264720721, + "learning_rate": 7.127504893081839e-05, + "loss": 0.361, + "step": 1018 + }, + { + "epoch": 1.4661870503597123, + "grad_norm": 0.21335909285528223, + "learning_rate": 7.124997942925244e-05, + "loss": 0.3716, + "step": 1019 + }, + { + "epoch": 1.4676258992805755, + "grad_norm": 0.2465896954841192, + "learning_rate": 7.122487838536295e-05, + "loss": 0.3624, + "step": 1020 + }, + { + "epoch": 1.469064748201439, + "grad_norm": 0.2432164708019936, + "learning_rate": 7.119974582448577e-05, + "loss": 0.366, + "step": 1021 + }, + { + "epoch": 1.4705035971223022, + "grad_norm": 0.2251933489831929, + "learning_rate": 7.11745817719886e-05, + "loss": 0.3675, + "step": 1022 + }, + { + "epoch": 1.4719424460431654, + "grad_norm": 0.21640744171179588, + "learning_rate": 7.114938625327088e-05, + "loss": 0.3644, + "step": 1023 + }, + { + "epoch": 1.4733812949640288, + "grad_norm": 0.20407010222209512, + "learning_rate": 7.112415929376385e-05, + "loss": 0.3785, + "step": 1024 + }, + { + "epoch": 1.474820143884892, + "grad_norm": 0.22433584927431985, + "learning_rate": 7.109890091893047e-05, + "loss": 0.3747, + "step": 1025 + }, + { + "epoch": 1.4762589928057555, + "grad_norm": 0.20891736414872672, + "learning_rate": 7.107361115426537e-05, + "loss": 0.3636, + "step": 1026 + }, + { + "epoch": 1.4776978417266187, + "grad_norm": 0.1846405972029036, + "learning_rate": 7.104829002529496e-05, + "loss": 0.3652, + "step": 1027 + }, + { + "epoch": 1.4791366906474819, + "grad_norm": 0.18316835473870297, + "learning_rate": 7.102293755757721e-05, + "loss": 0.3742, + "step": 1028 + }, + { + "epoch": 1.4805755395683453, + "grad_norm": 0.18940001949154145, + "learning_rate": 7.099755377670177e-05, + "loss": 0.3787, + "step": 1029 + }, + { + "epoch": 1.4820143884892087, + "grad_norm": 0.202197442529192, + "learning_rate": 7.097213870828989e-05, + "loss": 0.3682, + "step": 1030 + }, + { + "epoch": 1.483453237410072, + "grad_norm": 0.23653540527897343, + "learning_rate": 7.094669237799437e-05, + "loss": 0.365, + "step": 1031 + }, + { + "epoch": 1.4848920863309352, + "grad_norm": 0.25273303274457065, + "learning_rate": 7.092121481149964e-05, + "loss": 0.3816, + "step": 1032 + }, + { + "epoch": 1.4863309352517986, + "grad_norm": 0.2541430913646154, + "learning_rate": 7.089570603452157e-05, + "loss": 0.3571, + "step": 1033 + }, + { + "epoch": 1.4877697841726618, + "grad_norm": 0.23731631571248973, + "learning_rate": 7.087016607280758e-05, + "loss": 0.3654, + "step": 1034 + }, + { + "epoch": 1.4892086330935252, + "grad_norm": 0.2102000795141157, + "learning_rate": 7.084459495213658e-05, + "loss": 0.369, + "step": 1035 + }, + { + "epoch": 1.4906474820143885, + "grad_norm": 0.20018786941562838, + "learning_rate": 7.081899269831888e-05, + "loss": 0.3726, + "step": 1036 + }, + { + "epoch": 1.4920863309352517, + "grad_norm": 0.18174622680531183, + "learning_rate": 7.079335933719625e-05, + "loss": 0.3703, + "step": 1037 + }, + { + "epoch": 1.493525179856115, + "grad_norm": 0.1739627444880869, + "learning_rate": 7.076769489464188e-05, + "loss": 0.3689, + "step": 1038 + }, + { + "epoch": 1.4949640287769785, + "grad_norm": 0.19045338757680166, + "learning_rate": 7.074199939656027e-05, + "loss": 0.3684, + "step": 1039 + }, + { + "epoch": 1.4964028776978417, + "grad_norm": 0.17323825783489552, + "learning_rate": 7.071627286888731e-05, + "loss": 0.3592, + "step": 1040 + }, + { + "epoch": 1.497841726618705, + "grad_norm": 0.17301111473128455, + "learning_rate": 7.06905153375902e-05, + "loss": 0.3646, + "step": 1041 + }, + { + "epoch": 1.4992805755395684, + "grad_norm": 0.212194491495326, + "learning_rate": 7.066472682866744e-05, + "loss": 0.3654, + "step": 1042 + }, + { + "epoch": 1.5007194244604316, + "grad_norm": 0.24175569970243074, + "learning_rate": 7.063890736814878e-05, + "loss": 0.369, + "step": 1043 + }, + { + "epoch": 1.502158273381295, + "grad_norm": 0.19559789804626979, + "learning_rate": 7.061305698209524e-05, + "loss": 0.3592, + "step": 1044 + }, + { + "epoch": 1.5035971223021583, + "grad_norm": 0.1674547024455131, + "learning_rate": 7.058717569659901e-05, + "loss": 0.3686, + "step": 1045 + }, + { + "epoch": 1.5050359712230215, + "grad_norm": 0.21656195173077553, + "learning_rate": 7.05612635377835e-05, + "loss": 0.3665, + "step": 1046 + }, + { + "epoch": 1.506474820143885, + "grad_norm": 0.25183894141407687, + "learning_rate": 7.053532053180332e-05, + "loss": 0.373, + "step": 1047 + }, + { + "epoch": 1.5079136690647483, + "grad_norm": 0.22848196273677326, + "learning_rate": 7.050934670484413e-05, + "loss": 0.3649, + "step": 1048 + }, + { + "epoch": 1.5093525179856115, + "grad_norm": 0.20612656601373666, + "learning_rate": 7.048334208312273e-05, + "loss": 0.3762, + "step": 1049 + }, + { + "epoch": 1.5107913669064748, + "grad_norm": 0.2012284069329427, + "learning_rate": 7.045730669288706e-05, + "loss": 0.3739, + "step": 1050 + }, + { + "epoch": 1.512230215827338, + "grad_norm": 0.19784816330810212, + "learning_rate": 7.043124056041606e-05, + "loss": 0.3764, + "step": 1051 + }, + { + "epoch": 1.5136690647482014, + "grad_norm": 0.20070931859918947, + "learning_rate": 7.040514371201969e-05, + "loss": 0.3686, + "step": 1052 + }, + { + "epoch": 1.5151079136690648, + "grad_norm": 0.21327088238852762, + "learning_rate": 7.037901617403894e-05, + "loss": 0.3719, + "step": 1053 + }, + { + "epoch": 1.516546762589928, + "grad_norm": 0.18889305816471605, + "learning_rate": 7.035285797284578e-05, + "loss": 0.3629, + "step": 1054 + }, + { + "epoch": 1.5179856115107913, + "grad_norm": 0.1717405383978451, + "learning_rate": 7.032666913484313e-05, + "loss": 0.366, + "step": 1055 + }, + { + "epoch": 1.5194244604316547, + "grad_norm": 0.17505137907376786, + "learning_rate": 7.030044968646481e-05, + "loss": 0.3752, + "step": 1056 + }, + { + "epoch": 1.5208633093525181, + "grad_norm": 0.17506375531474533, + "learning_rate": 7.027419965417556e-05, + "loss": 0.362, + "step": 1057 + }, + { + "epoch": 1.5223021582733813, + "grad_norm": 0.19851104466139188, + "learning_rate": 7.024791906447098e-05, + "loss": 0.3642, + "step": 1058 + }, + { + "epoch": 1.5237410071942445, + "grad_norm": 0.20949914509907405, + "learning_rate": 7.022160794387751e-05, + "loss": 0.3672, + "step": 1059 + }, + { + "epoch": 1.5251798561151078, + "grad_norm": 0.22099560063799745, + "learning_rate": 7.019526631895242e-05, + "loss": 0.3655, + "step": 1060 + }, + { + "epoch": 1.5266187050359712, + "grad_norm": 0.19446322850465173, + "learning_rate": 7.016889421628374e-05, + "loss": 0.3729, + "step": 1061 + }, + { + "epoch": 1.5280575539568346, + "grad_norm": 0.19321062750086784, + "learning_rate": 7.014249166249032e-05, + "loss": 0.3769, + "step": 1062 + }, + { + "epoch": 1.5294964028776978, + "grad_norm": 0.1830637878435879, + "learning_rate": 7.011605868422168e-05, + "loss": 0.3733, + "step": 1063 + }, + { + "epoch": 1.530935251798561, + "grad_norm": 0.15814437083114577, + "learning_rate": 7.00895953081581e-05, + "loss": 0.363, + "step": 1064 + }, + { + "epoch": 1.5323741007194245, + "grad_norm": 0.13505172891850736, + "learning_rate": 7.00631015610105e-05, + "loss": 0.3643, + "step": 1065 + }, + { + "epoch": 1.533812949640288, + "grad_norm": 0.21924398456152785, + "learning_rate": 7.00365774695205e-05, + "loss": 0.3654, + "step": 1066 + }, + { + "epoch": 1.5352517985611511, + "grad_norm": 0.26690718573582306, + "learning_rate": 7.001002306046031e-05, + "loss": 0.3719, + "step": 1067 + }, + { + "epoch": 1.5366906474820143, + "grad_norm": 0.2733501865057393, + "learning_rate": 6.998343836063276e-05, + "loss": 0.364, + "step": 1068 + }, + { + "epoch": 1.5381294964028775, + "grad_norm": 0.3003023341903461, + "learning_rate": 6.995682339687125e-05, + "loss": 0.3698, + "step": 1069 + }, + { + "epoch": 1.539568345323741, + "grad_norm": 0.3855597047106891, + "learning_rate": 6.993017819603973e-05, + "loss": 0.3792, + "step": 1070 + }, + { + "epoch": 1.5410071942446044, + "grad_norm": 0.3930859327750899, + "learning_rate": 6.990350278503267e-05, + "loss": 0.3696, + "step": 1071 + }, + { + "epoch": 1.5424460431654676, + "grad_norm": 0.29835743193969017, + "learning_rate": 6.9876797190775e-05, + "loss": 0.3613, + "step": 1072 + }, + { + "epoch": 1.5438848920863308, + "grad_norm": 0.17425106011105304, + "learning_rate": 6.985006144022219e-05, + "loss": 0.3673, + "step": 1073 + }, + { + "epoch": 1.5453237410071943, + "grad_norm": 0.1512964787752433, + "learning_rate": 6.982329556036007e-05, + "loss": 0.3655, + "step": 1074 + }, + { + "epoch": 1.5467625899280577, + "grad_norm": 0.21301259298164918, + "learning_rate": 6.979649957820494e-05, + "loss": 0.3751, + "step": 1075 + }, + { + "epoch": 1.548201438848921, + "grad_norm": 0.27811188745530113, + "learning_rate": 6.976967352080345e-05, + "loss": 0.3628, + "step": 1076 + }, + { + "epoch": 1.5496402877697841, + "grad_norm": 0.2682485717058034, + "learning_rate": 6.974281741523259e-05, + "loss": 0.3649, + "step": 1077 + }, + { + "epoch": 1.5510791366906473, + "grad_norm": 0.2132643819684934, + "learning_rate": 6.971593128859974e-05, + "loss": 0.375, + "step": 1078 + }, + { + "epoch": 1.5525179856115108, + "grad_norm": 0.14719184623094933, + "learning_rate": 6.968901516804254e-05, + "loss": 0.3648, + "step": 1079 + }, + { + "epoch": 1.5539568345323742, + "grad_norm": 0.15999419220097852, + "learning_rate": 6.966206908072891e-05, + "loss": 0.3628, + "step": 1080 + }, + { + "epoch": 1.5553956834532374, + "grad_norm": 0.16389494346907643, + "learning_rate": 6.963509305385701e-05, + "loss": 0.37, + "step": 1081 + }, + { + "epoch": 1.5568345323741006, + "grad_norm": 0.16612092719907467, + "learning_rate": 6.960808711465524e-05, + "loss": 0.363, + "step": 1082 + }, + { + "epoch": 1.558273381294964, + "grad_norm": 0.17113416113904897, + "learning_rate": 6.958105129038216e-05, + "loss": 0.3642, + "step": 1083 + }, + { + "epoch": 1.5597122302158275, + "grad_norm": 0.19938671064970884, + "learning_rate": 6.955398560832654e-05, + "loss": 0.3707, + "step": 1084 + }, + { + "epoch": 1.5611510791366907, + "grad_norm": 0.19592331868330812, + "learning_rate": 6.952689009580724e-05, + "loss": 0.3715, + "step": 1085 + }, + { + "epoch": 1.562589928057554, + "grad_norm": 0.1593070837808773, + "learning_rate": 6.949976478017327e-05, + "loss": 0.3651, + "step": 1086 + }, + { + "epoch": 1.5640287769784171, + "grad_norm": 0.1729324239186276, + "learning_rate": 6.947260968880369e-05, + "loss": 0.3686, + "step": 1087 + }, + { + "epoch": 1.5654676258992806, + "grad_norm": 0.20192957746825058, + "learning_rate": 6.944542484910763e-05, + "loss": 0.3705, + "step": 1088 + }, + { + "epoch": 1.566906474820144, + "grad_norm": 0.20153030072972658, + "learning_rate": 6.941821028852424e-05, + "loss": 0.3656, + "step": 1089 + }, + { + "epoch": 1.5683453237410072, + "grad_norm": 0.18388173358459498, + "learning_rate": 6.939096603452269e-05, + "loss": 0.3786, + "step": 1090 + }, + { + "epoch": 1.5697841726618704, + "grad_norm": 0.20279535476864027, + "learning_rate": 6.93636921146021e-05, + "loss": 0.3695, + "step": 1091 + }, + { + "epoch": 1.5712230215827339, + "grad_norm": 0.2564529735884279, + "learning_rate": 6.933638855629153e-05, + "loss": 0.3706, + "step": 1092 + }, + { + "epoch": 1.572661870503597, + "grad_norm": 0.2592608372047231, + "learning_rate": 6.930905538714995e-05, + "loss": 0.3737, + "step": 1093 + }, + { + "epoch": 1.5741007194244605, + "grad_norm": 0.26381299512115763, + "learning_rate": 6.928169263476628e-05, + "loss": 0.3714, + "step": 1094 + }, + { + "epoch": 1.5755395683453237, + "grad_norm": 0.25979589437396916, + "learning_rate": 6.92543003267592e-05, + "loss": 0.3725, + "step": 1095 + }, + { + "epoch": 1.576978417266187, + "grad_norm": 0.22649384845684808, + "learning_rate": 6.922687849077729e-05, + "loss": 0.3757, + "step": 1096 + }, + { + "epoch": 1.5784172661870504, + "grad_norm": 0.23969878317497015, + "learning_rate": 6.919942715449893e-05, + "loss": 0.3734, + "step": 1097 + }, + { + "epoch": 1.5798561151079138, + "grad_norm": 0.24479553315852776, + "learning_rate": 6.917194634563225e-05, + "loss": 0.3711, + "step": 1098 + }, + { + "epoch": 1.581294964028777, + "grad_norm": 0.23940246332303425, + "learning_rate": 6.914443609191514e-05, + "loss": 0.37, + "step": 1099 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.2605546110742917, + "learning_rate": 6.911689642111523e-05, + "loss": 0.3653, + "step": 1100 + }, + { + "epoch": 1.5841726618705037, + "grad_norm": 0.2302503038672148, + "learning_rate": 6.90893273610298e-05, + "loss": 0.3743, + "step": 1101 + }, + { + "epoch": 1.5856115107913669, + "grad_norm": 0.18357282616918016, + "learning_rate": 6.906172893948585e-05, + "loss": 0.372, + "step": 1102 + }, + { + "epoch": 1.5870503597122303, + "grad_norm": 0.15529606711487257, + "learning_rate": 6.903410118433996e-05, + "loss": 0.3689, + "step": 1103 + }, + { + "epoch": 1.5884892086330935, + "grad_norm": 0.18466658574544775, + "learning_rate": 6.900644412347836e-05, + "loss": 0.375, + "step": 1104 + }, + { + "epoch": 1.5899280575539567, + "grad_norm": 0.19475119813923158, + "learning_rate": 6.897875778481682e-05, + "loss": 0.3618, + "step": 1105 + }, + { + "epoch": 1.5913669064748202, + "grad_norm": 0.17366024888705028, + "learning_rate": 6.89510421963007e-05, + "loss": 0.3712, + "step": 1106 + }, + { + "epoch": 1.5928057553956836, + "grad_norm": 0.15676286754267493, + "learning_rate": 6.892329738590489e-05, + "loss": 0.3683, + "step": 1107 + }, + { + "epoch": 1.5942446043165468, + "grad_norm": 0.2188595760487771, + "learning_rate": 6.889552338163372e-05, + "loss": 0.3707, + "step": 1108 + }, + { + "epoch": 1.59568345323741, + "grad_norm": 0.24277985356006634, + "learning_rate": 6.886772021152104e-05, + "loss": 0.3777, + "step": 1109 + }, + { + "epoch": 1.5971223021582732, + "grad_norm": 0.18816245997249675, + "learning_rate": 6.883988790363009e-05, + "loss": 0.3727, + "step": 1110 + }, + { + "epoch": 1.5985611510791367, + "grad_norm": 0.17503299519995805, + "learning_rate": 6.881202648605359e-05, + "loss": 0.3772, + "step": 1111 + }, + { + "epoch": 1.6, + "grad_norm": 0.16511086634598757, + "learning_rate": 6.878413598691358e-05, + "loss": 0.377, + "step": 1112 + }, + { + "epoch": 1.6014388489208633, + "grad_norm": 0.15203174372301287, + "learning_rate": 6.875621643436147e-05, + "loss": 0.3656, + "step": 1113 + }, + { + "epoch": 1.6028776978417265, + "grad_norm": 0.16503153227139583, + "learning_rate": 6.872826785657802e-05, + "loss": 0.3655, + "step": 1114 + }, + { + "epoch": 1.60431654676259, + "grad_norm": 0.15592639608997116, + "learning_rate": 6.870029028177324e-05, + "loss": 0.3689, + "step": 1115 + }, + { + "epoch": 1.6057553956834534, + "grad_norm": 0.1641954675900134, + "learning_rate": 6.867228373818648e-05, + "loss": 0.3665, + "step": 1116 + }, + { + "epoch": 1.6071942446043166, + "grad_norm": 0.19296460226555923, + "learning_rate": 6.864424825408624e-05, + "loss": 0.3586, + "step": 1117 + }, + { + "epoch": 1.6086330935251798, + "grad_norm": 0.2215282372407742, + "learning_rate": 6.861618385777028e-05, + "loss": 0.3781, + "step": 1118 + }, + { + "epoch": 1.610071942446043, + "grad_norm": 0.18461010675491152, + "learning_rate": 6.858809057756558e-05, + "loss": 0.3662, + "step": 1119 + }, + { + "epoch": 1.6115107913669064, + "grad_norm": 0.17631555123291315, + "learning_rate": 6.855996844182819e-05, + "loss": 0.3716, + "step": 1120 + }, + { + "epoch": 1.6129496402877699, + "grad_norm": 0.23550309117468388, + "learning_rate": 6.853181747894334e-05, + "loss": 0.3609, + "step": 1121 + }, + { + "epoch": 1.614388489208633, + "grad_norm": 0.24146841042380257, + "learning_rate": 6.850363771732536e-05, + "loss": 0.3708, + "step": 1122 + }, + { + "epoch": 1.6158273381294963, + "grad_norm": 0.17650593443303575, + "learning_rate": 6.847542918541762e-05, + "loss": 0.3747, + "step": 1123 + }, + { + "epoch": 1.6172661870503597, + "grad_norm": 0.17144861816967746, + "learning_rate": 6.844719191169254e-05, + "loss": 0.3686, + "step": 1124 + }, + { + "epoch": 1.6187050359712232, + "grad_norm": 0.19574566897019172, + "learning_rate": 6.841892592465158e-05, + "loss": 0.3622, + "step": 1125 + }, + { + "epoch": 1.6201438848920864, + "grad_norm": 0.2285187712932195, + "learning_rate": 6.839063125282512e-05, + "loss": 0.3717, + "step": 1126 + }, + { + "epoch": 1.6215827338129496, + "grad_norm": 0.2397466316648997, + "learning_rate": 6.836230792477256e-05, + "loss": 0.3659, + "step": 1127 + }, + { + "epoch": 1.6230215827338128, + "grad_norm": 0.2450769117093886, + "learning_rate": 6.833395596908217e-05, + "loss": 0.3711, + "step": 1128 + }, + { + "epoch": 1.6244604316546762, + "grad_norm": 0.20283090074104773, + "learning_rate": 6.830557541437114e-05, + "loss": 0.3612, + "step": 1129 + }, + { + "epoch": 1.6258992805755397, + "grad_norm": 0.1412473996698023, + "learning_rate": 6.827716628928556e-05, + "loss": 0.3704, + "step": 1130 + }, + { + "epoch": 1.6273381294964029, + "grad_norm": 0.16775492920558058, + "learning_rate": 6.824872862250028e-05, + "loss": 0.375, + "step": 1131 + }, + { + "epoch": 1.628776978417266, + "grad_norm": 0.21575772836874516, + "learning_rate": 6.822026244271903e-05, + "loss": 0.3674, + "step": 1132 + }, + { + "epoch": 1.6302158273381295, + "grad_norm": 0.2039009705419886, + "learning_rate": 6.819176777867425e-05, + "loss": 0.3689, + "step": 1133 + }, + { + "epoch": 1.631654676258993, + "grad_norm": 0.18811765846923917, + "learning_rate": 6.816324465912723e-05, + "loss": 0.3664, + "step": 1134 + }, + { + "epoch": 1.6330935251798562, + "grad_norm": 0.18184626894267752, + "learning_rate": 6.813469311286789e-05, + "loss": 0.3734, + "step": 1135 + }, + { + "epoch": 1.6345323741007194, + "grad_norm": 0.19266509419072342, + "learning_rate": 6.810611316871488e-05, + "loss": 0.3664, + "step": 1136 + }, + { + "epoch": 1.6359712230215826, + "grad_norm": 0.19628417801952316, + "learning_rate": 6.80775048555155e-05, + "loss": 0.3648, + "step": 1137 + }, + { + "epoch": 1.637410071942446, + "grad_norm": 0.18940598982443663, + "learning_rate": 6.804886820214572e-05, + "loss": 0.3693, + "step": 1138 + }, + { + "epoch": 1.6388489208633095, + "grad_norm": 0.16004033404127446, + "learning_rate": 6.802020323751008e-05, + "loss": 0.3685, + "step": 1139 + }, + { + "epoch": 1.6402877697841727, + "grad_norm": 0.13072688623804118, + "learning_rate": 6.799150999054169e-05, + "loss": 0.3755, + "step": 1140 + }, + { + "epoch": 1.641726618705036, + "grad_norm": 0.13198554865782608, + "learning_rate": 6.796278849020225e-05, + "loss": 0.369, + "step": 1141 + }, + { + "epoch": 1.6431654676258993, + "grad_norm": 0.1599842172754047, + "learning_rate": 6.79340387654819e-05, + "loss": 0.3642, + "step": 1142 + }, + { + "epoch": 1.6446043165467625, + "grad_norm": 0.17771857869118973, + "learning_rate": 6.790526084539939e-05, + "loss": 0.371, + "step": 1143 + }, + { + "epoch": 1.646043165467626, + "grad_norm": 0.15171821300344104, + "learning_rate": 6.787645475900182e-05, + "loss": 0.3622, + "step": 1144 + }, + { + "epoch": 1.6474820143884892, + "grad_norm": 0.14137633171084957, + "learning_rate": 6.784762053536475e-05, + "loss": 0.3748, + "step": 1145 + }, + { + "epoch": 1.6489208633093524, + "grad_norm": 0.15929284019782258, + "learning_rate": 6.781875820359216e-05, + "loss": 0.3634, + "step": 1146 + }, + { + "epoch": 1.6503597122302158, + "grad_norm": 0.17031051029268407, + "learning_rate": 6.778986779281639e-05, + "loss": 0.3712, + "step": 1147 + }, + { + "epoch": 1.6517985611510793, + "grad_norm": 0.21332579251788533, + "learning_rate": 6.776094933219811e-05, + "loss": 0.3616, + "step": 1148 + }, + { + "epoch": 1.6532374100719425, + "grad_norm": 0.17755181249847204, + "learning_rate": 6.773200285092633e-05, + "loss": 0.3652, + "step": 1149 + }, + { + "epoch": 1.6546762589928057, + "grad_norm": 0.1924293240221449, + "learning_rate": 6.770302837821833e-05, + "loss": 0.364, + "step": 1150 + }, + { + "epoch": 1.6561151079136691, + "grad_norm": 0.21501293495283566, + "learning_rate": 6.767402594331961e-05, + "loss": 0.3652, + "step": 1151 + }, + { + "epoch": 1.6575539568345323, + "grad_norm": 0.2001501970519388, + "learning_rate": 6.764499557550396e-05, + "loss": 0.3681, + "step": 1152 + }, + { + "epoch": 1.6589928057553958, + "grad_norm": 0.18563372424586336, + "learning_rate": 6.761593730407329e-05, + "loss": 0.3723, + "step": 1153 + }, + { + "epoch": 1.660431654676259, + "grad_norm": 0.1828232460234351, + "learning_rate": 6.758685115835776e-05, + "loss": 0.3685, + "step": 1154 + }, + { + "epoch": 1.6618705035971222, + "grad_norm": 0.17699360766600628, + "learning_rate": 6.755773716771555e-05, + "loss": 0.3679, + "step": 1155 + }, + { + "epoch": 1.6633093525179856, + "grad_norm": 0.20368342130435985, + "learning_rate": 6.752859536153306e-05, + "loss": 0.3618, + "step": 1156 + }, + { + "epoch": 1.664748201438849, + "grad_norm": 0.17640051533425183, + "learning_rate": 6.749942576922473e-05, + "loss": 0.3619, + "step": 1157 + }, + { + "epoch": 1.6661870503597123, + "grad_norm": 0.21632935911496773, + "learning_rate": 6.7470228420233e-05, + "loss": 0.3751, + "step": 1158 + }, + { + "epoch": 1.6676258992805755, + "grad_norm": 0.3076754300359413, + "learning_rate": 6.744100334402836e-05, + "loss": 0.3665, + "step": 1159 + }, + { + "epoch": 1.6690647482014387, + "grad_norm": 0.19521131323389862, + "learning_rate": 6.741175057010932e-05, + "loss": 0.3568, + "step": 1160 + }, + { + "epoch": 1.6705035971223021, + "grad_norm": 0.19925854450194885, + "learning_rate": 6.738247012800228e-05, + "loss": 0.3651, + "step": 1161 + }, + { + "epoch": 1.6719424460431656, + "grad_norm": 0.24512084833363884, + "learning_rate": 6.735316204726163e-05, + "loss": 0.3749, + "step": 1162 + }, + { + "epoch": 1.6733812949640288, + "grad_norm": 0.20602257731161921, + "learning_rate": 6.732382635746961e-05, + "loss": 0.3666, + "step": 1163 + }, + { + "epoch": 1.674820143884892, + "grad_norm": 0.18003047454062612, + "learning_rate": 6.729446308823635e-05, + "loss": 0.3599, + "step": 1164 + }, + { + "epoch": 1.6762589928057554, + "grad_norm": 0.16513473718549246, + "learning_rate": 6.72650722691998e-05, + "loss": 0.3725, + "step": 1165 + }, + { + "epoch": 1.6776978417266188, + "grad_norm": 0.19027418872001917, + "learning_rate": 6.723565393002576e-05, + "loss": 0.3784, + "step": 1166 + }, + { + "epoch": 1.679136690647482, + "grad_norm": 0.20054960977344133, + "learning_rate": 6.720620810040776e-05, + "loss": 0.3697, + "step": 1167 + }, + { + "epoch": 1.6805755395683453, + "grad_norm": 0.18110955789965194, + "learning_rate": 6.717673481006709e-05, + "loss": 0.3606, + "step": 1168 + }, + { + "epoch": 1.6820143884892085, + "grad_norm": 0.20028353836227986, + "learning_rate": 6.714723408875279e-05, + "loss": 0.3703, + "step": 1169 + }, + { + "epoch": 1.683453237410072, + "grad_norm": 0.24954652165243063, + "learning_rate": 6.711770596624153e-05, + "loss": 0.3726, + "step": 1170 + }, + { + "epoch": 1.6848920863309353, + "grad_norm": 0.2517417857934957, + "learning_rate": 6.708815047233768e-05, + "loss": 0.3693, + "step": 1171 + }, + { + "epoch": 1.6863309352517986, + "grad_norm": 0.2089087199180209, + "learning_rate": 6.705856763687324e-05, + "loss": 0.3704, + "step": 1172 + }, + { + "epoch": 1.6877697841726618, + "grad_norm": 0.1742666318915172, + "learning_rate": 6.702895748970776e-05, + "loss": 0.3634, + "step": 1173 + }, + { + "epoch": 1.6892086330935252, + "grad_norm": 0.18217385997746868, + "learning_rate": 6.699932006072842e-05, + "loss": 0.3756, + "step": 1174 + }, + { + "epoch": 1.6906474820143886, + "grad_norm": 0.2625774979517684, + "learning_rate": 6.69696553798499e-05, + "loss": 0.3621, + "step": 1175 + }, + { + "epoch": 1.6920863309352518, + "grad_norm": 0.2868987974011614, + "learning_rate": 6.693996347701442e-05, + "loss": 0.3697, + "step": 1176 + }, + { + "epoch": 1.693525179856115, + "grad_norm": 0.2503251710843237, + "learning_rate": 6.691024438219159e-05, + "loss": 0.3789, + "step": 1177 + }, + { + "epoch": 1.6949640287769783, + "grad_norm": 0.23069913579436455, + "learning_rate": 6.688049812537857e-05, + "loss": 0.3579, + "step": 1178 + }, + { + "epoch": 1.6964028776978417, + "grad_norm": 0.22720764849016103, + "learning_rate": 6.685072473659989e-05, + "loss": 0.3619, + "step": 1179 + }, + { + "epoch": 1.6978417266187051, + "grad_norm": 0.2096733860806973, + "learning_rate": 6.682092424590747e-05, + "loss": 0.3709, + "step": 1180 + }, + { + "epoch": 1.6992805755395683, + "grad_norm": 0.25133324773649096, + "learning_rate": 6.679109668338057e-05, + "loss": 0.3808, + "step": 1181 + }, + { + "epoch": 1.7007194244604316, + "grad_norm": 0.27145707465158153, + "learning_rate": 6.676124207912582e-05, + "loss": 0.3661, + "step": 1182 + }, + { + "epoch": 1.702158273381295, + "grad_norm": 0.27304035934956816, + "learning_rate": 6.673136046327707e-05, + "loss": 0.3616, + "step": 1183 + }, + { + "epoch": 1.7035971223021584, + "grad_norm": 0.2723443196995204, + "learning_rate": 6.670145186599552e-05, + "loss": 0.3665, + "step": 1184 + }, + { + "epoch": 1.7050359712230216, + "grad_norm": 0.22953328221226027, + "learning_rate": 6.667151631746953e-05, + "loss": 0.3627, + "step": 1185 + }, + { + "epoch": 1.7064748201438849, + "grad_norm": 0.1612118326533789, + "learning_rate": 6.664155384791473e-05, + "loss": 0.3589, + "step": 1186 + }, + { + "epoch": 1.707913669064748, + "grad_norm": 0.14286583024419586, + "learning_rate": 6.661156448757386e-05, + "loss": 0.3615, + "step": 1187 + }, + { + "epoch": 1.7093525179856115, + "grad_norm": 0.1388824638725049, + "learning_rate": 6.658154826671685e-05, + "loss": 0.3662, + "step": 1188 + }, + { + "epoch": 1.710791366906475, + "grad_norm": 0.14299854170823556, + "learning_rate": 6.655150521564072e-05, + "loss": 0.363, + "step": 1189 + }, + { + "epoch": 1.7122302158273381, + "grad_norm": 0.18627026369421645, + "learning_rate": 6.652143536466955e-05, + "loss": 0.3756, + "step": 1190 + }, + { + "epoch": 1.7136690647482014, + "grad_norm": 0.25452604401196904, + "learning_rate": 6.649133874415454e-05, + "loss": 0.3668, + "step": 1191 + }, + { + "epoch": 1.7151079136690648, + "grad_norm": 0.3430639021513708, + "learning_rate": 6.646121538447382e-05, + "loss": 0.3618, + "step": 1192 + }, + { + "epoch": 1.7165467625899282, + "grad_norm": 0.3712491097759974, + "learning_rate": 6.643106531603259e-05, + "loss": 0.3602, + "step": 1193 + }, + { + "epoch": 1.7179856115107914, + "grad_norm": 0.2362138858573847, + "learning_rate": 6.640088856926294e-05, + "loss": 0.364, + "step": 1194 + }, + { + "epoch": 1.7194244604316546, + "grad_norm": 0.13478226211555927, + "learning_rate": 6.637068517462395e-05, + "loss": 0.3583, + "step": 1195 + }, + { + "epoch": 1.7208633093525179, + "grad_norm": 0.20014562213498244, + "learning_rate": 6.634045516260156e-05, + "loss": 0.367, + "step": 1196 + }, + { + "epoch": 1.7223021582733813, + "grad_norm": 0.17170578915671195, + "learning_rate": 6.631019856370856e-05, + "loss": 0.3688, + "step": 1197 + }, + { + "epoch": 1.7237410071942447, + "grad_norm": 0.14360750660228105, + "learning_rate": 6.627991540848464e-05, + "loss": 0.3576, + "step": 1198 + }, + { + "epoch": 1.725179856115108, + "grad_norm": 0.15177488956197097, + "learning_rate": 6.624960572749622e-05, + "loss": 0.3609, + "step": 1199 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.19404811811754438, + "learning_rate": 6.621926955133657e-05, + "loss": 0.3679, + "step": 1200 + }, + { + "epoch": 1.7280575539568346, + "grad_norm": 0.2172039496918918, + "learning_rate": 6.618890691062561e-05, + "loss": 0.3731, + "step": 1201 + }, + { + "epoch": 1.7294964028776978, + "grad_norm": 0.208959610319358, + "learning_rate": 6.615851783601006e-05, + "loss": 0.3613, + "step": 1202 + }, + { + "epoch": 1.7309352517985612, + "grad_norm": 0.1838832578768312, + "learning_rate": 6.612810235816326e-05, + "loss": 0.3653, + "step": 1203 + }, + { + "epoch": 1.7323741007194244, + "grad_norm": 0.14991254472871976, + "learning_rate": 6.609766050778525e-05, + "loss": 0.3658, + "step": 1204 + }, + { + "epoch": 1.7338129496402876, + "grad_norm": 0.15056586517037424, + "learning_rate": 6.606719231560265e-05, + "loss": 0.3683, + "step": 1205 + }, + { + "epoch": 1.735251798561151, + "grad_norm": 0.15261729120006054, + "learning_rate": 6.60366978123687e-05, + "loss": 0.3647, + "step": 1206 + }, + { + "epoch": 1.7366906474820145, + "grad_norm": 0.15674945015806585, + "learning_rate": 6.600617702886314e-05, + "loss": 0.3637, + "step": 1207 + }, + { + "epoch": 1.7381294964028777, + "grad_norm": 0.217110724574672, + "learning_rate": 6.597562999589233e-05, + "loss": 0.365, + "step": 1208 + }, + { + "epoch": 1.739568345323741, + "grad_norm": 0.18175264752569695, + "learning_rate": 6.594505674428903e-05, + "loss": 0.369, + "step": 1209 + }, + { + "epoch": 1.7410071942446042, + "grad_norm": 0.1544655331884978, + "learning_rate": 6.59144573049125e-05, + "loss": 0.3627, + "step": 1210 + }, + { + "epoch": 1.7424460431654676, + "grad_norm": 0.20292392125680517, + "learning_rate": 6.588383170864849e-05, + "loss": 0.3677, + "step": 1211 + }, + { + "epoch": 1.743884892086331, + "grad_norm": 0.21311850938644913, + "learning_rate": 6.585317998640903e-05, + "loss": 0.3705, + "step": 1212 + }, + { + "epoch": 1.7453237410071942, + "grad_norm": 0.1860092526443083, + "learning_rate": 6.582250216913265e-05, + "loss": 0.3595, + "step": 1213 + }, + { + "epoch": 1.7467625899280574, + "grad_norm": 0.16196745930604664, + "learning_rate": 6.579179828778414e-05, + "loss": 0.3631, + "step": 1214 + }, + { + "epoch": 1.7482014388489209, + "grad_norm": 0.15627090063490578, + "learning_rate": 6.576106837335458e-05, + "loss": 0.3633, + "step": 1215 + }, + { + "epoch": 1.7496402877697843, + "grad_norm": 0.17278059621160283, + "learning_rate": 6.573031245686142e-05, + "loss": 0.3701, + "step": 1216 + }, + { + "epoch": 1.7510791366906475, + "grad_norm": 0.22942428046256286, + "learning_rate": 6.569953056934826e-05, + "loss": 0.3674, + "step": 1217 + }, + { + "epoch": 1.7525179856115107, + "grad_norm": 0.20358107561700375, + "learning_rate": 6.566872274188496e-05, + "loss": 0.3774, + "step": 1218 + }, + { + "epoch": 1.753956834532374, + "grad_norm": 0.17835485713159435, + "learning_rate": 6.563788900556756e-05, + "loss": 0.3693, + "step": 1219 + }, + { + "epoch": 1.7553956834532374, + "grad_norm": 0.17026695702516972, + "learning_rate": 6.560702939151826e-05, + "loss": 0.3695, + "step": 1220 + }, + { + "epoch": 1.7568345323741008, + "grad_norm": 0.19569792601854083, + "learning_rate": 6.557614393088534e-05, + "loss": 0.3719, + "step": 1221 + }, + { + "epoch": 1.758273381294964, + "grad_norm": 0.18425833648219234, + "learning_rate": 6.554523265484321e-05, + "loss": 0.3623, + "step": 1222 + }, + { + "epoch": 1.7597122302158272, + "grad_norm": 0.19413952365363657, + "learning_rate": 6.551429559459231e-05, + "loss": 0.3656, + "step": 1223 + }, + { + "epoch": 1.7611510791366907, + "grad_norm": 0.22492340242534004, + "learning_rate": 6.548333278135915e-05, + "loss": 0.3609, + "step": 1224 + }, + { + "epoch": 1.762589928057554, + "grad_norm": 0.23250710762288015, + "learning_rate": 6.545234424639616e-05, + "loss": 0.3651, + "step": 1225 + }, + { + "epoch": 1.7640287769784173, + "grad_norm": 0.2531774006159571, + "learning_rate": 6.542133002098178e-05, + "loss": 0.3714, + "step": 1226 + }, + { + "epoch": 1.7654676258992805, + "grad_norm": 0.2851427168782333, + "learning_rate": 6.53902901364204e-05, + "loss": 0.3733, + "step": 1227 + }, + { + "epoch": 1.7669064748201437, + "grad_norm": 0.22742460070704784, + "learning_rate": 6.535922462404226e-05, + "loss": 0.3606, + "step": 1228 + }, + { + "epoch": 1.7683453237410072, + "grad_norm": 0.17288087034904095, + "learning_rate": 6.53281335152035e-05, + "loss": 0.3693, + "step": 1229 + }, + { + "epoch": 1.7697841726618706, + "grad_norm": 0.19183373954992122, + "learning_rate": 6.529701684128608e-05, + "loss": 0.3639, + "step": 1230 + }, + { + "epoch": 1.7712230215827338, + "grad_norm": 0.23127921588481407, + "learning_rate": 6.526587463369779e-05, + "loss": 0.3736, + "step": 1231 + }, + { + "epoch": 1.772661870503597, + "grad_norm": 0.22274556172374138, + "learning_rate": 6.523470692387215e-05, + "loss": 0.3768, + "step": 1232 + }, + { + "epoch": 1.7741007194244605, + "grad_norm": 0.15866051821034918, + "learning_rate": 6.520351374326846e-05, + "loss": 0.3661, + "step": 1233 + }, + { + "epoch": 1.775539568345324, + "grad_norm": 0.19594762797331902, + "learning_rate": 6.51722951233717e-05, + "loss": 0.3746, + "step": 1234 + }, + { + "epoch": 1.776978417266187, + "grad_norm": 0.2515028034448848, + "learning_rate": 6.514105109569254e-05, + "loss": 0.3667, + "step": 1235 + }, + { + "epoch": 1.7784172661870503, + "grad_norm": 0.21371915658842414, + "learning_rate": 6.510978169176731e-05, + "loss": 0.3704, + "step": 1236 + }, + { + "epoch": 1.7798561151079135, + "grad_norm": 0.21260882546744508, + "learning_rate": 6.507848694315794e-05, + "loss": 0.3605, + "step": 1237 + }, + { + "epoch": 1.781294964028777, + "grad_norm": 0.21239711315848003, + "learning_rate": 6.504716688145192e-05, + "loss": 0.3638, + "step": 1238 + }, + { + "epoch": 1.7827338129496404, + "grad_norm": 0.21190449910072812, + "learning_rate": 6.501582153826235e-05, + "loss": 0.3696, + "step": 1239 + }, + { + "epoch": 1.7841726618705036, + "grad_norm": 0.2266603833816144, + "learning_rate": 6.498445094522776e-05, + "loss": 0.3682, + "step": 1240 + }, + { + "epoch": 1.7856115107913668, + "grad_norm": 0.2229483162436417, + "learning_rate": 6.495305513401226e-05, + "loss": 0.3644, + "step": 1241 + }, + { + "epoch": 1.7870503597122303, + "grad_norm": 0.231901254439104, + "learning_rate": 6.492163413630534e-05, + "loss": 0.3778, + "step": 1242 + }, + { + "epoch": 1.7884892086330937, + "grad_norm": 0.2586259725233526, + "learning_rate": 6.489018798382195e-05, + "loss": 0.3749, + "step": 1243 + }, + { + "epoch": 1.789928057553957, + "grad_norm": 0.25644600072677914, + "learning_rate": 6.485871670830243e-05, + "loss": 0.3729, + "step": 1244 + }, + { + "epoch": 1.79136690647482, + "grad_norm": 0.20278941867009576, + "learning_rate": 6.482722034151247e-05, + "loss": 0.3657, + "step": 1245 + }, + { + "epoch": 1.7928057553956833, + "grad_norm": 0.12947462225096898, + "learning_rate": 6.479569891524307e-05, + "loss": 0.3605, + "step": 1246 + }, + { + "epoch": 1.7942446043165468, + "grad_norm": 0.19473353666555931, + "learning_rate": 6.476415246131056e-05, + "loss": 0.3573, + "step": 1247 + }, + { + "epoch": 1.7956834532374102, + "grad_norm": 0.22175060764272925, + "learning_rate": 6.47325810115565e-05, + "loss": 0.3688, + "step": 1248 + }, + { + "epoch": 1.7971223021582734, + "grad_norm": 0.19107613478091598, + "learning_rate": 6.470098459784768e-05, + "loss": 0.3607, + "step": 1249 + }, + { + "epoch": 1.7985611510791366, + "grad_norm": 0.20692252089586538, + "learning_rate": 6.466936325207612e-05, + "loss": 0.3618, + "step": 1250 + }, + { + "epoch": 1.8, + "grad_norm": 0.2216428206800112, + "learning_rate": 6.463771700615898e-05, + "loss": 0.3706, + "step": 1251 + }, + { + "epoch": 1.8014388489208633, + "grad_norm": 0.21527489050122087, + "learning_rate": 6.460604589203854e-05, + "loss": 0.3642, + "step": 1252 + }, + { + "epoch": 1.8028776978417267, + "grad_norm": 0.22442501298074238, + "learning_rate": 6.457434994168224e-05, + "loss": 0.3738, + "step": 1253 + }, + { + "epoch": 1.80431654676259, + "grad_norm": 0.2334843631216741, + "learning_rate": 6.454262918708247e-05, + "loss": 0.3636, + "step": 1254 + }, + { + "epoch": 1.8057553956834531, + "grad_norm": 0.17986454834962382, + "learning_rate": 6.451088366025682e-05, + "loss": 0.3606, + "step": 1255 + }, + { + "epoch": 1.8071942446043165, + "grad_norm": 0.16168341390536148, + "learning_rate": 6.447911339324773e-05, + "loss": 0.3613, + "step": 1256 + }, + { + "epoch": 1.80863309352518, + "grad_norm": 0.15127179324517867, + "learning_rate": 6.444731841812274e-05, + "loss": 0.3731, + "step": 1257 + }, + { + "epoch": 1.8100719424460432, + "grad_norm": 0.13619364927493682, + "learning_rate": 6.44154987669742e-05, + "loss": 0.3603, + "step": 1258 + }, + { + "epoch": 1.8115107913669064, + "grad_norm": 0.13911547113387118, + "learning_rate": 6.438365447191947e-05, + "loss": 0.3671, + "step": 1259 + }, + { + "epoch": 1.8129496402877698, + "grad_norm": 0.1763023708267854, + "learning_rate": 6.435178556510076e-05, + "loss": 0.3667, + "step": 1260 + }, + { + "epoch": 1.814388489208633, + "grad_norm": 0.1844999431533389, + "learning_rate": 6.431989207868508e-05, + "loss": 0.3676, + "step": 1261 + }, + { + "epoch": 1.8158273381294965, + "grad_norm": 0.179685943790929, + "learning_rate": 6.428797404486431e-05, + "loss": 0.3723, + "step": 1262 + }, + { + "epoch": 1.8172661870503597, + "grad_norm": 0.18079082090576035, + "learning_rate": 6.425603149585507e-05, + "loss": 0.3667, + "step": 1263 + }, + { + "epoch": 1.818705035971223, + "grad_norm": 0.18067051290306504, + "learning_rate": 6.422406446389872e-05, + "loss": 0.3698, + "step": 1264 + }, + { + "epoch": 1.8201438848920863, + "grad_norm": 0.18456235795127293, + "learning_rate": 6.419207298126135e-05, + "loss": 0.3624, + "step": 1265 + }, + { + "epoch": 1.8215827338129498, + "grad_norm": 0.1989141620198725, + "learning_rate": 6.416005708023372e-05, + "loss": 0.36, + "step": 1266 + }, + { + "epoch": 1.823021582733813, + "grad_norm": 0.17040604847465962, + "learning_rate": 6.412801679313125e-05, + "loss": 0.3594, + "step": 1267 + }, + { + "epoch": 1.8244604316546762, + "grad_norm": 0.17522779502348249, + "learning_rate": 6.409595215229397e-05, + "loss": 0.3667, + "step": 1268 + }, + { + "epoch": 1.8258992805755394, + "grad_norm": 0.167039688944147, + "learning_rate": 6.406386319008647e-05, + "loss": 0.3641, + "step": 1269 + }, + { + "epoch": 1.8273381294964028, + "grad_norm": 0.1614900845830248, + "learning_rate": 6.403174993889791e-05, + "loss": 0.3715, + "step": 1270 + }, + { + "epoch": 1.8287769784172663, + "grad_norm": 0.17026645396291346, + "learning_rate": 6.399961243114197e-05, + "loss": 0.3592, + "step": 1271 + }, + { + "epoch": 1.8302158273381295, + "grad_norm": 0.16282300582633127, + "learning_rate": 6.39674506992568e-05, + "loss": 0.3648, + "step": 1272 + }, + { + "epoch": 1.8316546762589927, + "grad_norm": 0.16423560483078464, + "learning_rate": 6.393526477570499e-05, + "loss": 0.3583, + "step": 1273 + }, + { + "epoch": 1.8330935251798561, + "grad_norm": 0.15753644555304952, + "learning_rate": 6.390305469297357e-05, + "loss": 0.3666, + "step": 1274 + }, + { + "epoch": 1.8345323741007196, + "grad_norm": 0.132931699103671, + "learning_rate": 6.387082048357397e-05, + "loss": 0.3588, + "step": 1275 + }, + { + "epoch": 1.8359712230215828, + "grad_norm": 0.12381739999202473, + "learning_rate": 6.383856218004193e-05, + "loss": 0.3619, + "step": 1276 + }, + { + "epoch": 1.837410071942446, + "grad_norm": 0.1408804747953695, + "learning_rate": 6.380627981493753e-05, + "loss": 0.3745, + "step": 1277 + }, + { + "epoch": 1.8388489208633092, + "grad_norm": 0.15318148858885486, + "learning_rate": 6.377397342084514e-05, + "loss": 0.3641, + "step": 1278 + }, + { + "epoch": 1.8402877697841726, + "grad_norm": 0.15896624901132014, + "learning_rate": 6.37416430303734e-05, + "loss": 0.359, + "step": 1279 + }, + { + "epoch": 1.841726618705036, + "grad_norm": 0.16889794983736373, + "learning_rate": 6.370928867615513e-05, + "loss": 0.3604, + "step": 1280 + }, + { + "epoch": 1.8431654676258993, + "grad_norm": 0.13534285200730897, + "learning_rate": 6.367691039084736e-05, + "loss": 0.3616, + "step": 1281 + }, + { + "epoch": 1.8446043165467625, + "grad_norm": 0.15095602064699098, + "learning_rate": 6.36445082071313e-05, + "loss": 0.3658, + "step": 1282 + }, + { + "epoch": 1.846043165467626, + "grad_norm": 0.16119564119857152, + "learning_rate": 6.361208215771222e-05, + "loss": 0.3681, + "step": 1283 + }, + { + "epoch": 1.8474820143884894, + "grad_norm": 0.1687200328354194, + "learning_rate": 6.357963227531954e-05, + "loss": 0.3778, + "step": 1284 + }, + { + "epoch": 1.8489208633093526, + "grad_norm": 0.1927316215580099, + "learning_rate": 6.35471585927067e-05, + "loss": 0.3709, + "step": 1285 + }, + { + "epoch": 1.8503597122302158, + "grad_norm": 0.2255474866727223, + "learning_rate": 6.351466114265118e-05, + "loss": 0.3668, + "step": 1286 + }, + { + "epoch": 1.851798561151079, + "grad_norm": 0.23023478951407433, + "learning_rate": 6.348213995795445e-05, + "loss": 0.3777, + "step": 1287 + }, + { + "epoch": 1.8532374100719424, + "grad_norm": 0.2380129207576633, + "learning_rate": 6.344959507144192e-05, + "loss": 0.3684, + "step": 1288 + }, + { + "epoch": 1.8546762589928059, + "grad_norm": 0.19321416153221121, + "learning_rate": 6.341702651596293e-05, + "loss": 0.3539, + "step": 1289 + }, + { + "epoch": 1.856115107913669, + "grad_norm": 0.23807443862365457, + "learning_rate": 6.338443432439074e-05, + "loss": 0.3673, + "step": 1290 + }, + { + "epoch": 1.8575539568345323, + "grad_norm": 0.246407917353753, + "learning_rate": 6.335181852962242e-05, + "loss": 0.3594, + "step": 1291 + }, + { + "epoch": 1.8589928057553957, + "grad_norm": 0.239352433412642, + "learning_rate": 6.331917916457889e-05, + "loss": 0.3588, + "step": 1292 + }, + { + "epoch": 1.8604316546762591, + "grad_norm": 0.24341180965224837, + "learning_rate": 6.328651626220485e-05, + "loss": 0.3736, + "step": 1293 + }, + { + "epoch": 1.8618705035971224, + "grad_norm": 0.22174826280195295, + "learning_rate": 6.325382985546879e-05, + "loss": 0.3663, + "step": 1294 + }, + { + "epoch": 1.8633093525179856, + "grad_norm": 0.18922955181189371, + "learning_rate": 6.322111997736288e-05, + "loss": 0.3711, + "step": 1295 + }, + { + "epoch": 1.8647482014388488, + "grad_norm": 0.18508999656566633, + "learning_rate": 6.3188386660903e-05, + "loss": 0.3656, + "step": 1296 + }, + { + "epoch": 1.8661870503597122, + "grad_norm": 0.16258187592426696, + "learning_rate": 6.315562993912869e-05, + "loss": 0.3684, + "step": 1297 + }, + { + "epoch": 1.8676258992805757, + "grad_norm": 0.11809985776299295, + "learning_rate": 6.31228498451031e-05, + "loss": 0.3704, + "step": 1298 + }, + { + "epoch": 1.8690647482014389, + "grad_norm": 0.1473174157257907, + "learning_rate": 6.309004641191299e-05, + "loss": 0.367, + "step": 1299 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.15981579791476963, + "learning_rate": 6.305721967266869e-05, + "loss": 0.3722, + "step": 1300 + }, + { + "epoch": 1.8719424460431655, + "grad_norm": 0.15920152652070532, + "learning_rate": 6.302436966050401e-05, + "loss": 0.3614, + "step": 1301 + }, + { + "epoch": 1.873381294964029, + "grad_norm": 0.13753742153654028, + "learning_rate": 6.29914964085763e-05, + "loss": 0.3635, + "step": 1302 + }, + { + "epoch": 1.8748201438848922, + "grad_norm": 0.15314177588108846, + "learning_rate": 6.295859995006629e-05, + "loss": 0.3669, + "step": 1303 + }, + { + "epoch": 1.8762589928057554, + "grad_norm": 0.14603492031310347, + "learning_rate": 6.292568031817823e-05, + "loss": 0.3673, + "step": 1304 + }, + { + "epoch": 1.8776978417266186, + "grad_norm": 0.13655641297840962, + "learning_rate": 6.28927375461397e-05, + "loss": 0.3617, + "step": 1305 + }, + { + "epoch": 1.879136690647482, + "grad_norm": 0.1682465239379107, + "learning_rate": 6.285977166720166e-05, + "loss": 0.3637, + "step": 1306 + }, + { + "epoch": 1.8805755395683454, + "grad_norm": 0.21073997101971043, + "learning_rate": 6.28267827146384e-05, + "loss": 0.3565, + "step": 1307 + }, + { + "epoch": 1.8820143884892087, + "grad_norm": 0.16044415201678497, + "learning_rate": 6.279377072174744e-05, + "loss": 0.3682, + "step": 1308 + }, + { + "epoch": 1.8834532374100719, + "grad_norm": 0.19534802614931365, + "learning_rate": 6.276073572184964e-05, + "loss": 0.3717, + "step": 1309 + }, + { + "epoch": 1.8848920863309353, + "grad_norm": 0.1905370702491253, + "learning_rate": 6.272767774828903e-05, + "loss": 0.3694, + "step": 1310 + }, + { + "epoch": 1.8863309352517985, + "grad_norm": 0.22291681201611888, + "learning_rate": 6.269459683443283e-05, + "loss": 0.3683, + "step": 1311 + }, + { + "epoch": 1.887769784172662, + "grad_norm": 0.22979965511133224, + "learning_rate": 6.266149301367146e-05, + "loss": 0.3685, + "step": 1312 + }, + { + "epoch": 1.8892086330935252, + "grad_norm": 0.25371034308502655, + "learning_rate": 6.262836631941839e-05, + "loss": 0.3665, + "step": 1313 + }, + { + "epoch": 1.8906474820143884, + "grad_norm": 0.23742213179615146, + "learning_rate": 6.259521678511023e-05, + "loss": 0.3682, + "step": 1314 + }, + { + "epoch": 1.8920863309352518, + "grad_norm": 0.17859614363972742, + "learning_rate": 6.256204444420663e-05, + "loss": 0.3558, + "step": 1315 + }, + { + "epoch": 1.8935251798561152, + "grad_norm": 0.15944292200948446, + "learning_rate": 6.252884933019028e-05, + "loss": 0.3664, + "step": 1316 + }, + { + "epoch": 1.8949640287769784, + "grad_norm": 0.16886543532919862, + "learning_rate": 6.249563147656679e-05, + "loss": 0.3646, + "step": 1317 + }, + { + "epoch": 1.8964028776978417, + "grad_norm": 0.17695565927950804, + "learning_rate": 6.24623909168648e-05, + "loss": 0.3634, + "step": 1318 + }, + { + "epoch": 1.8978417266187049, + "grad_norm": 0.173219674595819, + "learning_rate": 6.242912768463581e-05, + "loss": 0.3707, + "step": 1319 + }, + { + "epoch": 1.8992805755395683, + "grad_norm": 0.18810502872275187, + "learning_rate": 6.239584181345426e-05, + "loss": 0.3658, + "step": 1320 + }, + { + "epoch": 1.9007194244604317, + "grad_norm": 0.18522148169382519, + "learning_rate": 6.236253333691739e-05, + "loss": 0.3657, + "step": 1321 + }, + { + "epoch": 1.902158273381295, + "grad_norm": 0.17037815206677648, + "learning_rate": 6.23292022886453e-05, + "loss": 0.3617, + "step": 1322 + }, + { + "epoch": 1.9035971223021582, + "grad_norm": 0.13651020757870233, + "learning_rate": 6.229584870228083e-05, + "loss": 0.3663, + "step": 1323 + }, + { + "epoch": 1.9050359712230216, + "grad_norm": 0.13238836277044766, + "learning_rate": 6.226247261148958e-05, + "loss": 0.3674, + "step": 1324 + }, + { + "epoch": 1.906474820143885, + "grad_norm": 0.17756725488705433, + "learning_rate": 6.22290740499599e-05, + "loss": 0.3802, + "step": 1325 + }, + { + "epoch": 1.9079136690647482, + "grad_norm": 0.19697424948513603, + "learning_rate": 6.21956530514028e-05, + "loss": 0.3667, + "step": 1326 + }, + { + "epoch": 1.9093525179856115, + "grad_norm": 0.18668259822133004, + "learning_rate": 6.216220964955192e-05, + "loss": 0.37, + "step": 1327 + }, + { + "epoch": 1.9107913669064747, + "grad_norm": 0.18432614814837855, + "learning_rate": 6.21287438781635e-05, + "loss": 0.3587, + "step": 1328 + }, + { + "epoch": 1.912230215827338, + "grad_norm": 0.21777479742632244, + "learning_rate": 6.209525577101642e-05, + "loss": 0.367, + "step": 1329 + }, + { + "epoch": 1.9136690647482015, + "grad_norm": 0.1551943406063464, + "learning_rate": 6.206174536191207e-05, + "loss": 0.3663, + "step": 1330 + }, + { + "epoch": 1.9151079136690647, + "grad_norm": 0.18586350947210925, + "learning_rate": 6.202821268467433e-05, + "loss": 0.376, + "step": 1331 + }, + { + "epoch": 1.916546762589928, + "grad_norm": 0.17665307141723707, + "learning_rate": 6.199465777314958e-05, + "loss": 0.3703, + "step": 1332 + }, + { + "epoch": 1.9179856115107914, + "grad_norm": 0.16332710030615785, + "learning_rate": 6.196108066120663e-05, + "loss": 0.3632, + "step": 1333 + }, + { + "epoch": 1.9194244604316548, + "grad_norm": 0.16610604271465892, + "learning_rate": 6.192748138273674e-05, + "loss": 0.3677, + "step": 1334 + }, + { + "epoch": 1.920863309352518, + "grad_norm": 0.14593869750353175, + "learning_rate": 6.189385997165348e-05, + "loss": 0.3676, + "step": 1335 + }, + { + "epoch": 1.9223021582733812, + "grad_norm": 0.15808245007787355, + "learning_rate": 6.186021646189281e-05, + "loss": 0.3571, + "step": 1336 + }, + { + "epoch": 1.9237410071942445, + "grad_norm": 0.19800729472492867, + "learning_rate": 6.182655088741294e-05, + "loss": 0.3672, + "step": 1337 + }, + { + "epoch": 1.925179856115108, + "grad_norm": 0.2671922414742339, + "learning_rate": 6.179286328219442e-05, + "loss": 0.3613, + "step": 1338 + }, + { + "epoch": 1.9266187050359713, + "grad_norm": 0.2986364081661207, + "learning_rate": 6.175915368024e-05, + "loss": 0.3702, + "step": 1339 + }, + { + "epoch": 1.9280575539568345, + "grad_norm": 0.263811149892615, + "learning_rate": 6.172542211557463e-05, + "loss": 0.3626, + "step": 1340 + }, + { + "epoch": 1.9294964028776977, + "grad_norm": 0.21786234216312614, + "learning_rate": 6.169166862224542e-05, + "loss": 0.3655, + "step": 1341 + }, + { + "epoch": 1.9309352517985612, + "grad_norm": 0.2121469588015191, + "learning_rate": 6.165789323432166e-05, + "loss": 0.3656, + "step": 1342 + }, + { + "epoch": 1.9323741007194246, + "grad_norm": 0.20357450615026512, + "learning_rate": 6.162409598589467e-05, + "loss": 0.372, + "step": 1343 + }, + { + "epoch": 1.9338129496402878, + "grad_norm": 0.19609243094112727, + "learning_rate": 6.159027691107791e-05, + "loss": 0.3678, + "step": 1344 + }, + { + "epoch": 1.935251798561151, + "grad_norm": 0.24947346016676034, + "learning_rate": 6.15564360440068e-05, + "loss": 0.367, + "step": 1345 + }, + { + "epoch": 1.9366906474820142, + "grad_norm": 0.28117985290654207, + "learning_rate": 6.15225734188388e-05, + "loss": 0.3694, + "step": 1346 + }, + { + "epoch": 1.9381294964028777, + "grad_norm": 0.23619911878557048, + "learning_rate": 6.148868906975334e-05, + "loss": 0.375, + "step": 1347 + }, + { + "epoch": 1.9395683453237411, + "grad_norm": 0.1889537191471565, + "learning_rate": 6.145478303095174e-05, + "loss": 0.3588, + "step": 1348 + }, + { + "epoch": 1.9410071942446043, + "grad_norm": 0.16659880981858904, + "learning_rate": 6.142085533665722e-05, + "loss": 0.3761, + "step": 1349 + }, + { + "epoch": 1.9424460431654675, + "grad_norm": 0.1548161898642397, + "learning_rate": 6.138690602111487e-05, + "loss": 0.3628, + "step": 1350 + }, + { + "epoch": 1.943884892086331, + "grad_norm": 0.16605248079429874, + "learning_rate": 6.135293511859164e-05, + "loss": 0.3622, + "step": 1351 + }, + { + "epoch": 1.9453237410071944, + "grad_norm": 0.21011815374468795, + "learning_rate": 6.131894266337618e-05, + "loss": 0.3614, + "step": 1352 + }, + { + "epoch": 1.9467625899280576, + "grad_norm": 0.26222420519160206, + "learning_rate": 6.128492868977897e-05, + "loss": 0.3745, + "step": 1353 + }, + { + "epoch": 1.9482014388489208, + "grad_norm": 0.2761941631424964, + "learning_rate": 6.12508932321322e-05, + "loss": 0.3702, + "step": 1354 + }, + { + "epoch": 1.949640287769784, + "grad_norm": 0.24631564378436294, + "learning_rate": 6.12168363247897e-05, + "loss": 0.3589, + "step": 1355 + }, + { + "epoch": 1.9510791366906475, + "grad_norm": 0.21284330567492582, + "learning_rate": 6.1182758002127e-05, + "loss": 0.3639, + "step": 1356 + }, + { + "epoch": 1.952517985611511, + "grad_norm": 0.2046714792956475, + "learning_rate": 6.114865829854123e-05, + "loss": 0.3761, + "step": 1357 + }, + { + "epoch": 1.9539568345323741, + "grad_norm": 0.2095542825539175, + "learning_rate": 6.111453724845106e-05, + "loss": 0.37, + "step": 1358 + }, + { + "epoch": 1.9553956834532373, + "grad_norm": 0.20867260956799893, + "learning_rate": 6.108039488629679e-05, + "loss": 0.3716, + "step": 1359 + }, + { + "epoch": 1.9568345323741008, + "grad_norm": 0.204268029457269, + "learning_rate": 6.104623124654016e-05, + "loss": 0.3718, + "step": 1360 + }, + { + "epoch": 1.958273381294964, + "grad_norm": 0.13842933139509825, + "learning_rate": 6.101204636366441e-05, + "loss": 0.3667, + "step": 1361 + }, + { + "epoch": 1.9597122302158274, + "grad_norm": 0.14559876830826288, + "learning_rate": 6.0977840272174224e-05, + "loss": 0.3694, + "step": 1362 + }, + { + "epoch": 1.9611510791366906, + "grad_norm": 0.21853957364813154, + "learning_rate": 6.094361300659571e-05, + "loss": 0.3661, + "step": 1363 + }, + { + "epoch": 1.9625899280575538, + "grad_norm": 0.2803796152416274, + "learning_rate": 6.090936460147632e-05, + "loss": 0.3683, + "step": 1364 + }, + { + "epoch": 1.9640287769784173, + "grad_norm": 0.24471699891354518, + "learning_rate": 6.087509509138483e-05, + "loss": 0.3567, + "step": 1365 + }, + { + "epoch": 1.9654676258992807, + "grad_norm": 0.19905025886723116, + "learning_rate": 6.0840804510911374e-05, + "loss": 0.3661, + "step": 1366 + }, + { + "epoch": 1.966906474820144, + "grad_norm": 0.1586018942006736, + "learning_rate": 6.0806492894667315e-05, + "loss": 0.355, + "step": 1367 + }, + { + "epoch": 1.9683453237410071, + "grad_norm": 0.1966596533936432, + "learning_rate": 6.077216027728524e-05, + "loss": 0.3608, + "step": 1368 + }, + { + "epoch": 1.9697841726618706, + "grad_norm": 0.1952894028757089, + "learning_rate": 6.073780669341896e-05, + "loss": 0.3605, + "step": 1369 + }, + { + "epoch": 1.9712230215827338, + "grad_norm": 0.1739202507995161, + "learning_rate": 6.070343217774343e-05, + "loss": 0.3618, + "step": 1370 + }, + { + "epoch": 1.9726618705035972, + "grad_norm": 0.18788051008043297, + "learning_rate": 6.066903676495477e-05, + "loss": 0.3682, + "step": 1371 + }, + { + "epoch": 1.9741007194244604, + "grad_norm": 0.17297079370821708, + "learning_rate": 6.063462048977011e-05, + "loss": 0.3619, + "step": 1372 + }, + { + "epoch": 1.9755395683453236, + "grad_norm": 0.14786038421172948, + "learning_rate": 6.060018338692774e-05, + "loss": 0.3704, + "step": 1373 + }, + { + "epoch": 1.976978417266187, + "grad_norm": 0.1526032428610714, + "learning_rate": 6.056572549118688e-05, + "loss": 0.372, + "step": 1374 + }, + { + "epoch": 1.9784172661870505, + "grad_norm": 0.15556697159769797, + "learning_rate": 6.053124683732781e-05, + "loss": 0.3815, + "step": 1375 + }, + { + "epoch": 1.9798561151079137, + "grad_norm": 0.14307035561229411, + "learning_rate": 6.049674746015172e-05, + "loss": 0.3612, + "step": 1376 + }, + { + "epoch": 1.981294964028777, + "grad_norm": 0.16303436232338575, + "learning_rate": 6.046222739448075e-05, + "loss": 0.3702, + "step": 1377 + }, + { + "epoch": 1.9827338129496401, + "grad_norm": 0.20362078098379227, + "learning_rate": 6.042768667515786e-05, + "loss": 0.3676, + "step": 1378 + }, + { + "epoch": 1.9841726618705036, + "grad_norm": 0.20224652092435189, + "learning_rate": 6.039312533704692e-05, + "loss": 0.3644, + "step": 1379 + }, + { + "epoch": 1.985611510791367, + "grad_norm": 0.1918306268378846, + "learning_rate": 6.0358543415032625e-05, + "loss": 0.3644, + "step": 1380 + }, + { + "epoch": 1.9870503597122302, + "grad_norm": 0.1977587698041893, + "learning_rate": 6.032394094402035e-05, + "loss": 0.3623, + "step": 1381 + }, + { + "epoch": 1.9884892086330934, + "grad_norm": 0.19004846555269467, + "learning_rate": 6.0289317958936305e-05, + "loss": 0.3709, + "step": 1382 + }, + { + "epoch": 1.9899280575539569, + "grad_norm": 0.189812208978719, + "learning_rate": 6.0254674494727374e-05, + "loss": 0.3596, + "step": 1383 + }, + { + "epoch": 1.9913669064748203, + "grad_norm": 0.17927036123690349, + "learning_rate": 6.022001058636111e-05, + "loss": 0.3669, + "step": 1384 + }, + { + "epoch": 1.9928057553956835, + "grad_norm": 0.12890285825777262, + "learning_rate": 6.01853262688257e-05, + "loss": 0.3645, + "step": 1385 + }, + { + "epoch": 1.9942446043165467, + "grad_norm": 0.15473138461434743, + "learning_rate": 6.0150621577129934e-05, + "loss": 0.3681, + "step": 1386 + }, + { + "epoch": 1.99568345323741, + "grad_norm": 0.168212010686727, + "learning_rate": 6.011589654630318e-05, + "loss": 0.3643, + "step": 1387 + }, + { + "epoch": 1.9971223021582734, + "grad_norm": 0.17058104280627068, + "learning_rate": 6.008115121139528e-05, + "loss": 0.3605, + "step": 1388 + }, + { + "epoch": 1.9985611510791368, + "grad_norm": 0.16992170065244036, + "learning_rate": 6.0046385607476655e-05, + "loss": 0.372, + "step": 1389 + }, + { + "epoch": 2.0, + "grad_norm": 0.13875453854266914, + "learning_rate": 6.001159976963814e-05, + "loss": 0.3767, + "step": 1390 + }, + { + "epoch": 2.001438848920863, + "grad_norm": 0.148563996887187, + "learning_rate": 5.9976793732990965e-05, + "loss": 0.3344, + "step": 1391 + }, + { + "epoch": 2.0028776978417264, + "grad_norm": 0.17757271066320204, + "learning_rate": 5.9941967532666806e-05, + "loss": 0.3414, + "step": 1392 + }, + { + "epoch": 2.00431654676259, + "grad_norm": 0.25728320339880906, + "learning_rate": 5.990712120381766e-05, + "loss": 0.3512, + "step": 1393 + }, + { + "epoch": 2.0057553956834533, + "grad_norm": 0.3018541720324142, + "learning_rate": 5.987225478161583e-05, + "loss": 0.3371, + "step": 1394 + }, + { + "epoch": 2.0071942446043165, + "grad_norm": 0.3411629721917868, + "learning_rate": 5.9837368301253905e-05, + "loss": 0.3352, + "step": 1395 + }, + { + "epoch": 2.0086330935251797, + "grad_norm": 0.36077816050495254, + "learning_rate": 5.980246179794476e-05, + "loss": 0.3481, + "step": 1396 + }, + { + "epoch": 2.0100719424460434, + "grad_norm": 0.3325755806427443, + "learning_rate": 5.976753530692144e-05, + "loss": 0.3358, + "step": 1397 + }, + { + "epoch": 2.0115107913669066, + "grad_norm": 0.28271431398627417, + "learning_rate": 5.9732588863437155e-05, + "loss": 0.3368, + "step": 1398 + }, + { + "epoch": 2.01294964028777, + "grad_norm": 0.22383842416931668, + "learning_rate": 5.96976225027653e-05, + "loss": 0.3476, + "step": 1399 + }, + { + "epoch": 2.014388489208633, + "grad_norm": 0.22964513577848436, + "learning_rate": 5.966263626019932e-05, + "loss": 0.3397, + "step": 1400 + }, + { + "epoch": 2.015827338129496, + "grad_norm": 0.19952963432087853, + "learning_rate": 5.9627630171052774e-05, + "loss": 0.3545, + "step": 1401 + }, + { + "epoch": 2.01726618705036, + "grad_norm": 0.215144227694722, + "learning_rate": 5.9592604270659234e-05, + "loss": 0.347, + "step": 1402 + }, + { + "epoch": 2.018705035971223, + "grad_norm": 0.27146520260633705, + "learning_rate": 5.955755859437225e-05, + "loss": 0.3402, + "step": 1403 + }, + { + "epoch": 2.0201438848920863, + "grad_norm": 0.2558100421691262, + "learning_rate": 5.9522493177565366e-05, + "loss": 0.3407, + "step": 1404 + }, + { + "epoch": 2.0215827338129495, + "grad_norm": 0.19499628780137643, + "learning_rate": 5.948740805563203e-05, + "loss": 0.3363, + "step": 1405 + }, + { + "epoch": 2.023021582733813, + "grad_norm": 0.25988959050213156, + "learning_rate": 5.94523032639856e-05, + "loss": 0.339, + "step": 1406 + }, + { + "epoch": 2.0244604316546764, + "grad_norm": 0.2691665083854109, + "learning_rate": 5.9417178838059254e-05, + "loss": 0.3306, + "step": 1407 + }, + { + "epoch": 2.0258992805755396, + "grad_norm": 0.16866973938494045, + "learning_rate": 5.9382034813306014e-05, + "loss": 0.3357, + "step": 1408 + }, + { + "epoch": 2.027338129496403, + "grad_norm": 0.1980678367873882, + "learning_rate": 5.934687122519868e-05, + "loss": 0.343, + "step": 1409 + }, + { + "epoch": 2.028776978417266, + "grad_norm": 0.24433337786013076, + "learning_rate": 5.93116881092298e-05, + "loss": 0.3354, + "step": 1410 + }, + { + "epoch": 2.0302158273381297, + "grad_norm": 0.17072295105484814, + "learning_rate": 5.927648550091162e-05, + "loss": 0.3415, + "step": 1411 + }, + { + "epoch": 2.031654676258993, + "grad_norm": 0.1471505339735923, + "learning_rate": 5.9241263435776087e-05, + "loss": 0.3363, + "step": 1412 + }, + { + "epoch": 2.033093525179856, + "grad_norm": 0.17589140549291227, + "learning_rate": 5.920602194937474e-05, + "loss": 0.3478, + "step": 1413 + }, + { + "epoch": 2.0345323741007193, + "grad_norm": 0.15623510804112709, + "learning_rate": 5.9170761077278766e-05, + "loss": 0.3396, + "step": 1414 + }, + { + "epoch": 2.0359712230215825, + "grad_norm": 0.16624956542152658, + "learning_rate": 5.9135480855078915e-05, + "loss": 0.3421, + "step": 1415 + }, + { + "epoch": 2.037410071942446, + "grad_norm": 0.16604428989665748, + "learning_rate": 5.910018131838544e-05, + "loss": 0.3458, + "step": 1416 + }, + { + "epoch": 2.0388489208633094, + "grad_norm": 0.1481243103484568, + "learning_rate": 5.906486250282811e-05, + "loss": 0.3407, + "step": 1417 + }, + { + "epoch": 2.0402877697841726, + "grad_norm": 0.16981494985197118, + "learning_rate": 5.902952444405615e-05, + "loss": 0.3437, + "step": 1418 + }, + { + "epoch": 2.041726618705036, + "grad_norm": 0.18613786633593082, + "learning_rate": 5.899416717773822e-05, + "loss": 0.3471, + "step": 1419 + }, + { + "epoch": 2.0431654676258995, + "grad_norm": 0.16572440895735488, + "learning_rate": 5.8958790739562316e-05, + "loss": 0.3387, + "step": 1420 + }, + { + "epoch": 2.0446043165467627, + "grad_norm": 0.13683264362354047, + "learning_rate": 5.892339516523586e-05, + "loss": 0.3354, + "step": 1421 + }, + { + "epoch": 2.046043165467626, + "grad_norm": 0.13439859226195303, + "learning_rate": 5.8887980490485536e-05, + "loss": 0.3338, + "step": 1422 + }, + { + "epoch": 2.047482014388489, + "grad_norm": 0.1439569376941477, + "learning_rate": 5.8852546751057337e-05, + "loss": 0.3413, + "step": 1423 + }, + { + "epoch": 2.0489208633093523, + "grad_norm": 0.1598768464407202, + "learning_rate": 5.8817093982716455e-05, + "loss": 0.3531, + "step": 1424 + }, + { + "epoch": 2.050359712230216, + "grad_norm": 0.14821641303085784, + "learning_rate": 5.878162222124735e-05, + "loss": 0.3356, + "step": 1425 + }, + { + "epoch": 2.051798561151079, + "grad_norm": 0.12491325288420484, + "learning_rate": 5.8746131502453623e-05, + "loss": 0.3385, + "step": 1426 + }, + { + "epoch": 2.0532374100719424, + "grad_norm": 0.12083502206900046, + "learning_rate": 5.871062186215799e-05, + "loss": 0.3409, + "step": 1427 + }, + { + "epoch": 2.0546762589928056, + "grad_norm": 0.13571360860778423, + "learning_rate": 5.867509333620231e-05, + "loss": 0.3342, + "step": 1428 + }, + { + "epoch": 2.0561151079136692, + "grad_norm": 0.13984388418344307, + "learning_rate": 5.863954596044744e-05, + "loss": 0.3361, + "step": 1429 + }, + { + "epoch": 2.0575539568345325, + "grad_norm": 0.11629574383383963, + "learning_rate": 5.8603979770773344e-05, + "loss": 0.3414, + "step": 1430 + }, + { + "epoch": 2.0589928057553957, + "grad_norm": 0.13495699245456752, + "learning_rate": 5.85683948030789e-05, + "loss": 0.3406, + "step": 1431 + }, + { + "epoch": 2.060431654676259, + "grad_norm": 0.13148052384917566, + "learning_rate": 5.8532791093282e-05, + "loss": 0.3357, + "step": 1432 + }, + { + "epoch": 2.061870503597122, + "grad_norm": 0.14840204064521428, + "learning_rate": 5.849716867731941e-05, + "loss": 0.3416, + "step": 1433 + }, + { + "epoch": 2.0633093525179858, + "grad_norm": 0.11713407890189519, + "learning_rate": 5.84615275911468e-05, + "loss": 0.3373, + "step": 1434 + }, + { + "epoch": 2.064748201438849, + "grad_norm": 0.11695249228680646, + "learning_rate": 5.8425867870738684e-05, + "loss": 0.3387, + "step": 1435 + }, + { + "epoch": 2.066187050359712, + "grad_norm": 0.12771762431703068, + "learning_rate": 5.839018955208838e-05, + "loss": 0.3404, + "step": 1436 + }, + { + "epoch": 2.0676258992805754, + "grad_norm": 0.14190392315489428, + "learning_rate": 5.835449267120796e-05, + "loss": 0.3405, + "step": 1437 + }, + { + "epoch": 2.069064748201439, + "grad_norm": 0.1314280361275215, + "learning_rate": 5.831877726412827e-05, + "loss": 0.3374, + "step": 1438 + }, + { + "epoch": 2.0705035971223023, + "grad_norm": 0.11923833830476203, + "learning_rate": 5.828304336689883e-05, + "loss": 0.3419, + "step": 1439 + }, + { + "epoch": 2.0719424460431655, + "grad_norm": 0.15719810095823256, + "learning_rate": 5.824729101558781e-05, + "loss": 0.3419, + "step": 1440 + }, + { + "epoch": 2.0733812949640287, + "grad_norm": 0.1448222991005709, + "learning_rate": 5.821152024628207e-05, + "loss": 0.3338, + "step": 1441 + }, + { + "epoch": 2.074820143884892, + "grad_norm": 0.16522879889055553, + "learning_rate": 5.8175731095086974e-05, + "loss": 0.3399, + "step": 1442 + }, + { + "epoch": 2.0762589928057555, + "grad_norm": 0.21170071164270177, + "learning_rate": 5.813992359812649e-05, + "loss": 0.3433, + "step": 1443 + }, + { + "epoch": 2.0776978417266188, + "grad_norm": 0.2202554706035317, + "learning_rate": 5.8104097791543104e-05, + "loss": 0.3376, + "step": 1444 + }, + { + "epoch": 2.079136690647482, + "grad_norm": 0.17559261793029335, + "learning_rate": 5.806825371149778e-05, + "loss": 0.342, + "step": 1445 + }, + { + "epoch": 2.080575539568345, + "grad_norm": 0.16582794991328403, + "learning_rate": 5.803239139416989e-05, + "loss": 0.3377, + "step": 1446 + }, + { + "epoch": 2.082014388489209, + "grad_norm": 0.16252281453261838, + "learning_rate": 5.799651087575728e-05, + "loss": 0.3348, + "step": 1447 + }, + { + "epoch": 2.083453237410072, + "grad_norm": 0.14014857137408293, + "learning_rate": 5.7960612192476096e-05, + "loss": 0.3362, + "step": 1448 + }, + { + "epoch": 2.0848920863309353, + "grad_norm": 0.16596189212725054, + "learning_rate": 5.792469538056089e-05, + "loss": 0.3368, + "step": 1449 + }, + { + "epoch": 2.0863309352517985, + "grad_norm": 0.23794659327159862, + "learning_rate": 5.7888760476264445e-05, + "loss": 0.3308, + "step": 1450 + }, + { + "epoch": 2.0877697841726617, + "grad_norm": 0.2519812743121408, + "learning_rate": 5.785280751585785e-05, + "loss": 0.3453, + "step": 1451 + }, + { + "epoch": 2.0892086330935253, + "grad_norm": 0.24031404589039398, + "learning_rate": 5.7816836535630436e-05, + "loss": 0.3387, + "step": 1452 + }, + { + "epoch": 2.0906474820143885, + "grad_norm": 0.18973703655841345, + "learning_rate": 5.7780847571889625e-05, + "loss": 0.3378, + "step": 1453 + }, + { + "epoch": 2.0920863309352518, + "grad_norm": 0.1682276408188406, + "learning_rate": 5.7744840660961126e-05, + "loss": 0.3399, + "step": 1454 + }, + { + "epoch": 2.093525179856115, + "grad_norm": 0.17641722119061082, + "learning_rate": 5.770881583918865e-05, + "loss": 0.3394, + "step": 1455 + }, + { + "epoch": 2.0949640287769786, + "grad_norm": 0.17089632987102674, + "learning_rate": 5.767277314293404e-05, + "loss": 0.3485, + "step": 1456 + }, + { + "epoch": 2.096402877697842, + "grad_norm": 0.13352662398274492, + "learning_rate": 5.76367126085772e-05, + "loss": 0.3479, + "step": 1457 + }, + { + "epoch": 2.097841726618705, + "grad_norm": 0.16704433169505478, + "learning_rate": 5.760063427251599e-05, + "loss": 0.3459, + "step": 1458 + }, + { + "epoch": 2.0992805755395683, + "grad_norm": 0.19259136865967705, + "learning_rate": 5.756453817116624e-05, + "loss": 0.3413, + "step": 1459 + }, + { + "epoch": 2.1007194244604315, + "grad_norm": 0.17415900235040346, + "learning_rate": 5.752842434096176e-05, + "loss": 0.3426, + "step": 1460 + }, + { + "epoch": 2.102158273381295, + "grad_norm": 0.12734265587328836, + "learning_rate": 5.7492292818354224e-05, + "loss": 0.3491, + "step": 1461 + }, + { + "epoch": 2.1035971223021583, + "grad_norm": 0.11675352615016288, + "learning_rate": 5.745614363981316e-05, + "loss": 0.338, + "step": 1462 + }, + { + "epoch": 2.1050359712230216, + "grad_norm": 0.12091941181674576, + "learning_rate": 5.741997684182591e-05, + "loss": 0.3345, + "step": 1463 + }, + { + "epoch": 2.1064748201438848, + "grad_norm": 0.14343425734938844, + "learning_rate": 5.7383792460897626e-05, + "loss": 0.3372, + "step": 1464 + }, + { + "epoch": 2.1079136690647484, + "grad_norm": 0.15837441419013518, + "learning_rate": 5.73475905335512e-05, + "loss": 0.3372, + "step": 1465 + }, + { + "epoch": 2.1093525179856116, + "grad_norm": 0.1703498771793665, + "learning_rate": 5.731137109632722e-05, + "loss": 0.3454, + "step": 1466 + }, + { + "epoch": 2.110791366906475, + "grad_norm": 0.14049879617314373, + "learning_rate": 5.727513418578397e-05, + "loss": 0.3368, + "step": 1467 + }, + { + "epoch": 2.112230215827338, + "grad_norm": 0.1500152804042965, + "learning_rate": 5.723887983849732e-05, + "loss": 0.3453, + "step": 1468 + }, + { + "epoch": 2.1136690647482013, + "grad_norm": 0.14449565334649642, + "learning_rate": 5.720260809106083e-05, + "loss": 0.3436, + "step": 1469 + }, + { + "epoch": 2.115107913669065, + "grad_norm": 0.1756781174580756, + "learning_rate": 5.716631898008553e-05, + "loss": 0.3467, + "step": 1470 + }, + { + "epoch": 2.116546762589928, + "grad_norm": 0.18614967731122278, + "learning_rate": 5.713001254220002e-05, + "loss": 0.3351, + "step": 1471 + }, + { + "epoch": 2.1179856115107913, + "grad_norm": 0.14261200052126147, + "learning_rate": 5.7093688814050425e-05, + "loss": 0.3332, + "step": 1472 + }, + { + "epoch": 2.1194244604316546, + "grad_norm": 0.1675466491240586, + "learning_rate": 5.705734783230022e-05, + "loss": 0.3352, + "step": 1473 + }, + { + "epoch": 2.1208633093525178, + "grad_norm": 0.19342404719601078, + "learning_rate": 5.7020989633630414e-05, + "loss": 0.3448, + "step": 1474 + }, + { + "epoch": 2.1223021582733814, + "grad_norm": 0.1449713706015703, + "learning_rate": 5.6984614254739306e-05, + "loss": 0.3419, + "step": 1475 + }, + { + "epoch": 2.1237410071942446, + "grad_norm": 0.13417399972592065, + "learning_rate": 5.694822173234257e-05, + "loss": 0.3365, + "step": 1476 + }, + { + "epoch": 2.125179856115108, + "grad_norm": 0.14800438024460638, + "learning_rate": 5.691181210317319e-05, + "loss": 0.3307, + "step": 1477 + }, + { + "epoch": 2.126618705035971, + "grad_norm": 0.1764029381697298, + "learning_rate": 5.687538540398141e-05, + "loss": 0.3371, + "step": 1478 + }, + { + "epoch": 2.1280575539568347, + "grad_norm": 0.1831382465599354, + "learning_rate": 5.683894167153468e-05, + "loss": 0.3428, + "step": 1479 + }, + { + "epoch": 2.129496402877698, + "grad_norm": 0.1768420571871011, + "learning_rate": 5.680248094261769e-05, + "loss": 0.3521, + "step": 1480 + }, + { + "epoch": 2.130935251798561, + "grad_norm": 0.21501595835453444, + "learning_rate": 5.676600325403224e-05, + "loss": 0.334, + "step": 1481 + }, + { + "epoch": 2.1323741007194243, + "grad_norm": 0.18877105575339637, + "learning_rate": 5.672950864259729e-05, + "loss": 0.338, + "step": 1482 + }, + { + "epoch": 2.133812949640288, + "grad_norm": 0.1294937427982825, + "learning_rate": 5.669299714514884e-05, + "loss": 0.3422, + "step": 1483 + }, + { + "epoch": 2.135251798561151, + "grad_norm": 0.1273382842813177, + "learning_rate": 5.665646879853995e-05, + "loss": 0.336, + "step": 1484 + }, + { + "epoch": 2.1366906474820144, + "grad_norm": 0.13317103255146273, + "learning_rate": 5.661992363964072e-05, + "loss": 0.3405, + "step": 1485 + }, + { + "epoch": 2.1381294964028776, + "grad_norm": 0.14529342520812324, + "learning_rate": 5.658336170533814e-05, + "loss": 0.3453, + "step": 1486 + }, + { + "epoch": 2.139568345323741, + "grad_norm": 0.1597014572285992, + "learning_rate": 5.654678303253624e-05, + "loss": 0.3414, + "step": 1487 + }, + { + "epoch": 2.1410071942446045, + "grad_norm": 0.1304551078464195, + "learning_rate": 5.6510187658155846e-05, + "loss": 0.3551, + "step": 1488 + }, + { + "epoch": 2.1424460431654677, + "grad_norm": 0.11581885124410166, + "learning_rate": 5.6473575619134686e-05, + "loss": 0.3391, + "step": 1489 + }, + { + "epoch": 2.143884892086331, + "grad_norm": 0.13725711766965948, + "learning_rate": 5.643694695242731e-05, + "loss": 0.338, + "step": 1490 + }, + { + "epoch": 2.145323741007194, + "grad_norm": 0.1451690787664973, + "learning_rate": 5.640030169500508e-05, + "loss": 0.3314, + "step": 1491 + }, + { + "epoch": 2.1467625899280574, + "grad_norm": 0.14030207891161744, + "learning_rate": 5.636363988385601e-05, + "loss": 0.343, + "step": 1492 + }, + { + "epoch": 2.148201438848921, + "grad_norm": 0.1610816781622484, + "learning_rate": 5.632696155598493e-05, + "loss": 0.3502, + "step": 1493 + }, + { + "epoch": 2.149640287769784, + "grad_norm": 0.12540923093393241, + "learning_rate": 5.6290266748413266e-05, + "loss": 0.3304, + "step": 1494 + }, + { + "epoch": 2.1510791366906474, + "grad_norm": 0.13418605248153795, + "learning_rate": 5.6253555498179124e-05, + "loss": 0.3363, + "step": 1495 + }, + { + "epoch": 2.1525179856115106, + "grad_norm": 0.14709879762829714, + "learning_rate": 5.621682784233718e-05, + "loss": 0.3336, + "step": 1496 + }, + { + "epoch": 2.1539568345323743, + "grad_norm": 0.1563659130071038, + "learning_rate": 5.618008381795868e-05, + "loss": 0.3414, + "step": 1497 + }, + { + "epoch": 2.1553956834532375, + "grad_norm": 0.18468733308318555, + "learning_rate": 5.61433234621314e-05, + "loss": 0.3375, + "step": 1498 + }, + { + "epoch": 2.1568345323741007, + "grad_norm": 0.18787953455404294, + "learning_rate": 5.610654681195957e-05, + "loss": 0.3363, + "step": 1499 + }, + { + "epoch": 2.158273381294964, + "grad_norm": 0.13890337995163107, + "learning_rate": 5.606975390456391e-05, + "loss": 0.337, + "step": 1500 + }, + { + "epoch": 2.159712230215827, + "grad_norm": 0.14644128624879602, + "learning_rate": 5.603294477708149e-05, + "loss": 0.338, + "step": 1501 + }, + { + "epoch": 2.161151079136691, + "grad_norm": 0.1501386328488287, + "learning_rate": 5.599611946666581e-05, + "loss": 0.3409, + "step": 1502 + }, + { + "epoch": 2.162589928057554, + "grad_norm": 0.17246425668599588, + "learning_rate": 5.595927801048669e-05, + "loss": 0.3377, + "step": 1503 + }, + { + "epoch": 2.1640287769784172, + "grad_norm": 0.2019670162251421, + "learning_rate": 5.5922420445730245e-05, + "loss": 0.3446, + "step": 1504 + }, + { + "epoch": 2.1654676258992804, + "grad_norm": 0.17811936053004404, + "learning_rate": 5.5885546809598805e-05, + "loss": 0.3351, + "step": 1505 + }, + { + "epoch": 2.166906474820144, + "grad_norm": 0.10886271085349164, + "learning_rate": 5.584865713931098e-05, + "loss": 0.3353, + "step": 1506 + }, + { + "epoch": 2.1683453237410073, + "grad_norm": 0.16795292903684267, + "learning_rate": 5.5811751472101564e-05, + "loss": 0.3371, + "step": 1507 + }, + { + "epoch": 2.1697841726618705, + "grad_norm": 0.1863404289138655, + "learning_rate": 5.577482984522145e-05, + "loss": 0.339, + "step": 1508 + }, + { + "epoch": 2.1712230215827337, + "grad_norm": 0.183043593865869, + "learning_rate": 5.573789229593767e-05, + "loss": 0.337, + "step": 1509 + }, + { + "epoch": 2.172661870503597, + "grad_norm": 0.17483783244947432, + "learning_rate": 5.570093886153334e-05, + "loss": 0.3486, + "step": 1510 + }, + { + "epoch": 2.1741007194244606, + "grad_norm": 0.15053536278422408, + "learning_rate": 5.5663969579307594e-05, + "loss": 0.3358, + "step": 1511 + }, + { + "epoch": 2.175539568345324, + "grad_norm": 0.1270283844779313, + "learning_rate": 5.562698448657553e-05, + "loss": 0.3512, + "step": 1512 + }, + { + "epoch": 2.176978417266187, + "grad_norm": 0.13060290196603938, + "learning_rate": 5.5589983620668286e-05, + "loss": 0.3431, + "step": 1513 + }, + { + "epoch": 2.1784172661870502, + "grad_norm": 0.13973132677886133, + "learning_rate": 5.555296701893284e-05, + "loss": 0.333, + "step": 1514 + }, + { + "epoch": 2.1798561151079134, + "grad_norm": 0.12157919806508072, + "learning_rate": 5.551593471873208e-05, + "loss": 0.3269, + "step": 1515 + }, + { + "epoch": 2.181294964028777, + "grad_norm": 0.15709986759465888, + "learning_rate": 5.547888675744476e-05, + "loss": 0.3472, + "step": 1516 + }, + { + "epoch": 2.1827338129496403, + "grad_norm": 0.1397129088825444, + "learning_rate": 5.5441823172465427e-05, + "loss": 0.3429, + "step": 1517 + }, + { + "epoch": 2.1841726618705035, + "grad_norm": 0.13661235608902214, + "learning_rate": 5.540474400120438e-05, + "loss": 0.3352, + "step": 1518 + }, + { + "epoch": 2.1856115107913667, + "grad_norm": 0.15779330501808156, + "learning_rate": 5.536764928108769e-05, + "loss": 0.3414, + "step": 1519 + }, + { + "epoch": 2.1870503597122304, + "grad_norm": 0.16129758993795465, + "learning_rate": 5.533053904955709e-05, + "loss": 0.3362, + "step": 1520 + }, + { + "epoch": 2.1884892086330936, + "grad_norm": 0.15720442376879168, + "learning_rate": 5.5293413344069964e-05, + "loss": 0.3485, + "step": 1521 + }, + { + "epoch": 2.189928057553957, + "grad_norm": 0.15507626372202452, + "learning_rate": 5.525627220209934e-05, + "loss": 0.3374, + "step": 1522 + }, + { + "epoch": 2.19136690647482, + "grad_norm": 0.16565853659345364, + "learning_rate": 5.5219115661133815e-05, + "loss": 0.3431, + "step": 1523 + }, + { + "epoch": 2.1928057553956837, + "grad_norm": 0.2042792580144636, + "learning_rate": 5.518194375867754e-05, + "loss": 0.346, + "step": 1524 + }, + { + "epoch": 2.194244604316547, + "grad_norm": 0.19753686236847903, + "learning_rate": 5.514475653225014e-05, + "loss": 0.3425, + "step": 1525 + }, + { + "epoch": 2.19568345323741, + "grad_norm": 0.17338936644955222, + "learning_rate": 5.510755401938676e-05, + "loss": 0.3442, + "step": 1526 + }, + { + "epoch": 2.1971223021582733, + "grad_norm": 0.14526568967476972, + "learning_rate": 5.5070336257637904e-05, + "loss": 0.3332, + "step": 1527 + }, + { + "epoch": 2.1985611510791365, + "grad_norm": 0.16128188345297867, + "learning_rate": 5.503310328456953e-05, + "loss": 0.34, + "step": 1528 + }, + { + "epoch": 2.2, + "grad_norm": 0.17205429431240457, + "learning_rate": 5.4995855137762926e-05, + "loss": 0.3517, + "step": 1529 + }, + { + "epoch": 2.2014388489208634, + "grad_norm": 0.17768080599054073, + "learning_rate": 5.4958591854814695e-05, + "loss": 0.349, + "step": 1530 + }, + { + "epoch": 2.2028776978417266, + "grad_norm": 0.20441850917447524, + "learning_rate": 5.492131347333671e-05, + "loss": 0.3395, + "step": 1531 + }, + { + "epoch": 2.20431654676259, + "grad_norm": 0.17639121611676273, + "learning_rate": 5.48840200309561e-05, + "loss": 0.3366, + "step": 1532 + }, + { + "epoch": 2.205755395683453, + "grad_norm": 0.1167227355953685, + "learning_rate": 5.484671156531519e-05, + "loss": 0.3297, + "step": 1533 + }, + { + "epoch": 2.2071942446043167, + "grad_norm": 0.13602920961715934, + "learning_rate": 5.480938811407146e-05, + "loss": 0.3415, + "step": 1534 + }, + { + "epoch": 2.20863309352518, + "grad_norm": 0.15865350852770105, + "learning_rate": 5.477204971489753e-05, + "loss": 0.3408, + "step": 1535 + }, + { + "epoch": 2.210071942446043, + "grad_norm": 0.1784279601859331, + "learning_rate": 5.473469640548109e-05, + "loss": 0.3444, + "step": 1536 + }, + { + "epoch": 2.2115107913669063, + "grad_norm": 0.15677594496706615, + "learning_rate": 5.469732822352491e-05, + "loss": 0.34, + "step": 1537 + }, + { + "epoch": 2.21294964028777, + "grad_norm": 0.1558089174768745, + "learning_rate": 5.465994520674672e-05, + "loss": 0.3323, + "step": 1538 + }, + { + "epoch": 2.214388489208633, + "grad_norm": 0.17636357175020703, + "learning_rate": 5.4622547392879295e-05, + "loss": 0.3433, + "step": 1539 + }, + { + "epoch": 2.2158273381294964, + "grad_norm": 0.15555718344338995, + "learning_rate": 5.458513481967027e-05, + "loss": 0.3448, + "step": 1540 + }, + { + "epoch": 2.2172661870503596, + "grad_norm": 0.18393076876807044, + "learning_rate": 5.454770752488223e-05, + "loss": 0.3291, + "step": 1541 + }, + { + "epoch": 2.218705035971223, + "grad_norm": 0.13756871917599844, + "learning_rate": 5.4510265546292615e-05, + "loss": 0.3444, + "step": 1542 + }, + { + "epoch": 2.2201438848920865, + "grad_norm": 0.14744324982539372, + "learning_rate": 5.4472808921693657e-05, + "loss": 0.3414, + "step": 1543 + }, + { + "epoch": 2.2215827338129497, + "grad_norm": 0.17324681650300475, + "learning_rate": 5.4435337688892396e-05, + "loss": 0.3348, + "step": 1544 + }, + { + "epoch": 2.223021582733813, + "grad_norm": 0.21017507418299664, + "learning_rate": 5.4397851885710595e-05, + "loss": 0.3476, + "step": 1545 + }, + { + "epoch": 2.224460431654676, + "grad_norm": 0.18560881927511783, + "learning_rate": 5.4360351549984755e-05, + "loss": 0.3395, + "step": 1546 + }, + { + "epoch": 2.2258992805755398, + "grad_norm": 0.16057845613316232, + "learning_rate": 5.432283671956601e-05, + "loss": 0.3374, + "step": 1547 + }, + { + "epoch": 2.227338129496403, + "grad_norm": 0.17400221956971498, + "learning_rate": 5.428530743232016e-05, + "loss": 0.3464, + "step": 1548 + }, + { + "epoch": 2.228776978417266, + "grad_norm": 0.13688750875923683, + "learning_rate": 5.4247763726127564e-05, + "loss": 0.3412, + "step": 1549 + }, + { + "epoch": 2.2302158273381294, + "grad_norm": 0.16895566858175826, + "learning_rate": 5.421020563888317e-05, + "loss": 0.3389, + "step": 1550 + }, + { + "epoch": 2.2316546762589926, + "grad_norm": 0.16049543492332127, + "learning_rate": 5.417263320849641e-05, + "loss": 0.3302, + "step": 1551 + }, + { + "epoch": 2.2330935251798563, + "grad_norm": 0.1529656726630707, + "learning_rate": 5.4135046472891205e-05, + "loss": 0.3388, + "step": 1552 + }, + { + "epoch": 2.2345323741007195, + "grad_norm": 0.1661012439318489, + "learning_rate": 5.409744547000591e-05, + "loss": 0.3409, + "step": 1553 + }, + { + "epoch": 2.2359712230215827, + "grad_norm": 0.1523798399883737, + "learning_rate": 5.405983023779328e-05, + "loss": 0.3364, + "step": 1554 + }, + { + "epoch": 2.237410071942446, + "grad_norm": 0.14102246718693223, + "learning_rate": 5.402220081422048e-05, + "loss": 0.3408, + "step": 1555 + }, + { + "epoch": 2.2388489208633096, + "grad_norm": 0.11049551216075727, + "learning_rate": 5.3984557237268905e-05, + "loss": 0.3433, + "step": 1556 + }, + { + "epoch": 2.2402877697841728, + "grad_norm": 0.1399944478206375, + "learning_rate": 5.394689954493432e-05, + "loss": 0.3488, + "step": 1557 + }, + { + "epoch": 2.241726618705036, + "grad_norm": 0.1479515174519836, + "learning_rate": 5.390922777522669e-05, + "loss": 0.3387, + "step": 1558 + }, + { + "epoch": 2.243165467625899, + "grad_norm": 0.12763995560575422, + "learning_rate": 5.3871541966170225e-05, + "loss": 0.3446, + "step": 1559 + }, + { + "epoch": 2.2446043165467624, + "grad_norm": 0.1309388399200539, + "learning_rate": 5.383384215580326e-05, + "loss": 0.3385, + "step": 1560 + }, + { + "epoch": 2.246043165467626, + "grad_norm": 0.13535157925392624, + "learning_rate": 5.37961283821783e-05, + "loss": 0.3472, + "step": 1561 + }, + { + "epoch": 2.2474820143884893, + "grad_norm": 0.14589245872554155, + "learning_rate": 5.3758400683361926e-05, + "loss": 0.3355, + "step": 1562 + }, + { + "epoch": 2.2489208633093525, + "grad_norm": 0.14696443357983832, + "learning_rate": 5.372065909743479e-05, + "loss": 0.3456, + "step": 1563 + }, + { + "epoch": 2.2503597122302157, + "grad_norm": 0.14858801303380195, + "learning_rate": 5.368290366249155e-05, + "loss": 0.3471, + "step": 1564 + }, + { + "epoch": 2.2517985611510793, + "grad_norm": 0.12730617646963155, + "learning_rate": 5.364513441664084e-05, + "loss": 0.3374, + "step": 1565 + }, + { + "epoch": 2.2532374100719426, + "grad_norm": 0.12323182979257986, + "learning_rate": 5.3607351398005234e-05, + "loss": 0.3381, + "step": 1566 + }, + { + "epoch": 2.2546762589928058, + "grad_norm": 0.14275410896069196, + "learning_rate": 5.356955464472121e-05, + "loss": 0.335, + "step": 1567 + }, + { + "epoch": 2.256115107913669, + "grad_norm": 0.16897913426284591, + "learning_rate": 5.353174419493913e-05, + "loss": 0.3375, + "step": 1568 + }, + { + "epoch": 2.257553956834532, + "grad_norm": 0.1779313024622998, + "learning_rate": 5.349392008682314e-05, + "loss": 0.3467, + "step": 1569 + }, + { + "epoch": 2.258992805755396, + "grad_norm": 0.14193982348022544, + "learning_rate": 5.3456082358551204e-05, + "loss": 0.335, + "step": 1570 + }, + { + "epoch": 2.260431654676259, + "grad_norm": 0.12525506601160286, + "learning_rate": 5.341823104831501e-05, + "loss": 0.3398, + "step": 1571 + }, + { + "epoch": 2.2618705035971223, + "grad_norm": 0.13843939017177903, + "learning_rate": 5.338036619431999e-05, + "loss": 0.3433, + "step": 1572 + }, + { + "epoch": 2.2633093525179855, + "grad_norm": 0.13249616791552016, + "learning_rate": 5.33424878347852e-05, + "loss": 0.3345, + "step": 1573 + }, + { + "epoch": 2.2647482014388487, + "grad_norm": 0.1681175570097203, + "learning_rate": 5.330459600794337e-05, + "loss": 0.3437, + "step": 1574 + }, + { + "epoch": 2.2661870503597124, + "grad_norm": 0.13046711251361295, + "learning_rate": 5.32666907520408e-05, + "loss": 0.3388, + "step": 1575 + }, + { + "epoch": 2.2676258992805756, + "grad_norm": 0.1294688814318854, + "learning_rate": 5.322877210533735e-05, + "loss": 0.3376, + "step": 1576 + }, + { + "epoch": 2.2690647482014388, + "grad_norm": 0.1689404345986412, + "learning_rate": 5.319084010610638e-05, + "loss": 0.3402, + "step": 1577 + }, + { + "epoch": 2.270503597122302, + "grad_norm": 0.13531193824777604, + "learning_rate": 5.3152894792634785e-05, + "loss": 0.3456, + "step": 1578 + }, + { + "epoch": 2.2719424460431656, + "grad_norm": 0.13641412543988995, + "learning_rate": 5.311493620322282e-05, + "loss": 0.3417, + "step": 1579 + }, + { + "epoch": 2.273381294964029, + "grad_norm": 0.14410478214500225, + "learning_rate": 5.3076964376184186e-05, + "loss": 0.3364, + "step": 1580 + }, + { + "epoch": 2.274820143884892, + "grad_norm": 0.17981099543469098, + "learning_rate": 5.303897934984595e-05, + "loss": 0.3413, + "step": 1581 + }, + { + "epoch": 2.2762589928057553, + "grad_norm": 0.1310151076813013, + "learning_rate": 5.300098116254848e-05, + "loss": 0.3358, + "step": 1582 + }, + { + "epoch": 2.277697841726619, + "grad_norm": 0.1397793816458818, + "learning_rate": 5.296296985264543e-05, + "loss": 0.3412, + "step": 1583 + }, + { + "epoch": 2.279136690647482, + "grad_norm": 0.14915863726588208, + "learning_rate": 5.2924945458503713e-05, + "loss": 0.3397, + "step": 1584 + }, + { + "epoch": 2.2805755395683454, + "grad_norm": 0.10674266202915253, + "learning_rate": 5.2886908018503454e-05, + "loss": 0.3454, + "step": 1585 + }, + { + "epoch": 2.2820143884892086, + "grad_norm": 0.12978752685607617, + "learning_rate": 5.284885757103792e-05, + "loss": 0.3509, + "step": 1586 + }, + { + "epoch": 2.283453237410072, + "grad_norm": 0.15617892058617083, + "learning_rate": 5.2810794154513503e-05, + "loss": 0.3425, + "step": 1587 + }, + { + "epoch": 2.2848920863309354, + "grad_norm": 0.1735752896636099, + "learning_rate": 5.277271780734975e-05, + "loss": 0.3362, + "step": 1588 + }, + { + "epoch": 2.2863309352517986, + "grad_norm": 0.15900631594222825, + "learning_rate": 5.273462856797918e-05, + "loss": 0.3442, + "step": 1589 + }, + { + "epoch": 2.287769784172662, + "grad_norm": 0.11309688348190682, + "learning_rate": 5.269652647484735e-05, + "loss": 0.3391, + "step": 1590 + }, + { + "epoch": 2.289208633093525, + "grad_norm": 0.14290407695721685, + "learning_rate": 5.2658411566412837e-05, + "loss": 0.3346, + "step": 1591 + }, + { + "epoch": 2.2906474820143883, + "grad_norm": 0.15093684348741013, + "learning_rate": 5.262028388114708e-05, + "loss": 0.3402, + "step": 1592 + }, + { + "epoch": 2.292086330935252, + "grad_norm": 0.15631978580232964, + "learning_rate": 5.258214345753446e-05, + "loss": 0.3439, + "step": 1593 + }, + { + "epoch": 2.293525179856115, + "grad_norm": 0.12869733397073002, + "learning_rate": 5.254399033407221e-05, + "loss": 0.3424, + "step": 1594 + }, + { + "epoch": 2.2949640287769784, + "grad_norm": 0.11517532335773004, + "learning_rate": 5.250582454927037e-05, + "loss": 0.3403, + "step": 1595 + }, + { + "epoch": 2.2964028776978416, + "grad_norm": 0.1340681459546274, + "learning_rate": 5.2467646141651764e-05, + "loss": 0.3377, + "step": 1596 + }, + { + "epoch": 2.2978417266187052, + "grad_norm": 0.12046792802010879, + "learning_rate": 5.2429455149751976e-05, + "loss": 0.3438, + "step": 1597 + }, + { + "epoch": 2.2992805755395684, + "grad_norm": 0.13079680094789867, + "learning_rate": 5.2391251612119256e-05, + "loss": 0.3452, + "step": 1598 + }, + { + "epoch": 2.3007194244604317, + "grad_norm": 0.16640190235613248, + "learning_rate": 5.235303556731456e-05, + "loss": 0.3461, + "step": 1599 + }, + { + "epoch": 2.302158273381295, + "grad_norm": 0.1577686684215034, + "learning_rate": 5.23148070539114e-05, + "loss": 0.341, + "step": 1600 + }, + { + "epoch": 2.3035971223021585, + "grad_norm": 0.14925437585718365, + "learning_rate": 5.227656611049598e-05, + "loss": 0.3387, + "step": 1601 + }, + { + "epoch": 2.3050359712230217, + "grad_norm": 0.13710986596933333, + "learning_rate": 5.2238312775666935e-05, + "loss": 0.3368, + "step": 1602 + }, + { + "epoch": 2.306474820143885, + "grad_norm": 0.1500167813711351, + "learning_rate": 5.220004708803548e-05, + "loss": 0.3375, + "step": 1603 + }, + { + "epoch": 2.307913669064748, + "grad_norm": 0.1566653927707589, + "learning_rate": 5.216176908622528e-05, + "loss": 0.3416, + "step": 1604 + }, + { + "epoch": 2.3093525179856114, + "grad_norm": 0.13490532368742505, + "learning_rate": 5.2123478808872436e-05, + "loss": 0.332, + "step": 1605 + }, + { + "epoch": 2.310791366906475, + "grad_norm": 0.14186561037817463, + "learning_rate": 5.208517629462541e-05, + "loss": 0.338, + "step": 1606 + }, + { + "epoch": 2.3122302158273382, + "grad_norm": 0.1738923611376932, + "learning_rate": 5.204686158214507e-05, + "loss": 0.3347, + "step": 1607 + }, + { + "epoch": 2.3136690647482014, + "grad_norm": 0.1643249308537781, + "learning_rate": 5.200853471010453e-05, + "loss": 0.3408, + "step": 1608 + }, + { + "epoch": 2.3151079136690647, + "grad_norm": 0.11412779665907977, + "learning_rate": 5.197019571718921e-05, + "loss": 0.3399, + "step": 1609 + }, + { + "epoch": 2.316546762589928, + "grad_norm": 0.12569340024427234, + "learning_rate": 5.19318446420968e-05, + "loss": 0.3453, + "step": 1610 + }, + { + "epoch": 2.3179856115107915, + "grad_norm": 0.11776087089400157, + "learning_rate": 5.189348152353712e-05, + "loss": 0.338, + "step": 1611 + }, + { + "epoch": 2.3194244604316547, + "grad_norm": 0.13584830504004294, + "learning_rate": 5.1855106400232196e-05, + "loss": 0.3444, + "step": 1612 + }, + { + "epoch": 2.320863309352518, + "grad_norm": 0.12917173409302307, + "learning_rate": 5.181671931091612e-05, + "loss": 0.3411, + "step": 1613 + }, + { + "epoch": 2.322302158273381, + "grad_norm": 0.10729175450876592, + "learning_rate": 5.1778320294335126e-05, + "loss": 0.3499, + "step": 1614 + }, + { + "epoch": 2.3237410071942444, + "grad_norm": 0.11881215722162419, + "learning_rate": 5.1739909389247445e-05, + "loss": 0.33, + "step": 1615 + }, + { + "epoch": 2.325179856115108, + "grad_norm": 0.13059841815215834, + "learning_rate": 5.17014866344233e-05, + "loss": 0.341, + "step": 1616 + }, + { + "epoch": 2.3266187050359712, + "grad_norm": 0.12307382947679618, + "learning_rate": 5.166305206864492e-05, + "loss": 0.337, + "step": 1617 + }, + { + "epoch": 2.3280575539568344, + "grad_norm": 0.13075069201133485, + "learning_rate": 5.162460573070642e-05, + "loss": 0.342, + "step": 1618 + }, + { + "epoch": 2.3294964028776977, + "grad_norm": 0.14771166131358693, + "learning_rate": 5.158614765941376e-05, + "loss": 0.3317, + "step": 1619 + }, + { + "epoch": 2.3309352517985613, + "grad_norm": 0.1507808153078834, + "learning_rate": 5.1547677893584846e-05, + "loss": 0.343, + "step": 1620 + }, + { + "epoch": 2.3323741007194245, + "grad_norm": 0.153815934182145, + "learning_rate": 5.15091964720493e-05, + "loss": 0.341, + "step": 1621 + }, + { + "epoch": 2.3338129496402877, + "grad_norm": 0.09981420149445622, + "learning_rate": 5.1470703433648556e-05, + "loss": 0.3342, + "step": 1622 + }, + { + "epoch": 2.335251798561151, + "grad_norm": 0.12819863045020796, + "learning_rate": 5.143219881723573e-05, + "loss": 0.3463, + "step": 1623 + }, + { + "epoch": 2.3366906474820146, + "grad_norm": 0.14046083524838301, + "learning_rate": 5.139368266167567e-05, + "loss": 0.3408, + "step": 1624 + }, + { + "epoch": 2.338129496402878, + "grad_norm": 0.12589480843570602, + "learning_rate": 5.135515500584484e-05, + "loss": 0.3397, + "step": 1625 + }, + { + "epoch": 2.339568345323741, + "grad_norm": 0.1426606838423531, + "learning_rate": 5.131661588863132e-05, + "loss": 0.3375, + "step": 1626 + }, + { + "epoch": 2.3410071942446042, + "grad_norm": 0.15078541911069537, + "learning_rate": 5.1278065348934786e-05, + "loss": 0.335, + "step": 1627 + }, + { + "epoch": 2.3424460431654675, + "grad_norm": 0.19084042148697, + "learning_rate": 5.123950342566639e-05, + "loss": 0.3491, + "step": 1628 + }, + { + "epoch": 2.343884892086331, + "grad_norm": 0.2067992073638373, + "learning_rate": 5.120093015774882e-05, + "loss": 0.3428, + "step": 1629 + }, + { + "epoch": 2.3453237410071943, + "grad_norm": 0.15228406978948328, + "learning_rate": 5.116234558411618e-05, + "loss": 0.3439, + "step": 1630 + }, + { + "epoch": 2.3467625899280575, + "grad_norm": 0.13129931838861417, + "learning_rate": 5.1123749743714024e-05, + "loss": 0.3337, + "step": 1631 + }, + { + "epoch": 2.3482014388489207, + "grad_norm": 0.14308636666369418, + "learning_rate": 5.1085142675499246e-05, + "loss": 0.3441, + "step": 1632 + }, + { + "epoch": 2.349640287769784, + "grad_norm": 0.15789616839818424, + "learning_rate": 5.1046524418440075e-05, + "loss": 0.3396, + "step": 1633 + }, + { + "epoch": 2.3510791366906476, + "grad_norm": 0.1466083603631073, + "learning_rate": 5.100789501151607e-05, + "loss": 0.3369, + "step": 1634 + }, + { + "epoch": 2.352517985611511, + "grad_norm": 0.13018873128250721, + "learning_rate": 5.0969254493717996e-05, + "loss": 0.3522, + "step": 1635 + }, + { + "epoch": 2.353956834532374, + "grad_norm": 0.17119707649582805, + "learning_rate": 5.093060290404785e-05, + "loss": 0.3413, + "step": 1636 + }, + { + "epoch": 2.3553956834532372, + "grad_norm": 0.20255061297486265, + "learning_rate": 5.089194028151882e-05, + "loss": 0.3462, + "step": 1637 + }, + { + "epoch": 2.356834532374101, + "grad_norm": 0.15442557209983027, + "learning_rate": 5.085326666515521e-05, + "loss": 0.3436, + "step": 1638 + }, + { + "epoch": 2.358273381294964, + "grad_norm": 0.1340549799852972, + "learning_rate": 5.081458209399243e-05, + "loss": 0.3408, + "step": 1639 + }, + { + "epoch": 2.3597122302158273, + "grad_norm": 0.1328264324241929, + "learning_rate": 5.0775886607076954e-05, + "loss": 0.3341, + "step": 1640 + }, + { + "epoch": 2.3611510791366905, + "grad_norm": 0.13228593919564147, + "learning_rate": 5.073718024346626e-05, + "loss": 0.3408, + "step": 1641 + }, + { + "epoch": 2.362589928057554, + "grad_norm": 0.11427875745181809, + "learning_rate": 5.06984630422288e-05, + "loss": 0.3405, + "step": 1642 + }, + { + "epoch": 2.3640287769784174, + "grad_norm": 0.12760861743227073, + "learning_rate": 5.065973504244399e-05, + "loss": 0.3395, + "step": 1643 + }, + { + "epoch": 2.3654676258992806, + "grad_norm": 0.13863017553609638, + "learning_rate": 5.062099628320213e-05, + "loss": 0.3436, + "step": 1644 + }, + { + "epoch": 2.366906474820144, + "grad_norm": 0.13931443184416437, + "learning_rate": 5.058224680360438e-05, + "loss": 0.3418, + "step": 1645 + }, + { + "epoch": 2.368345323741007, + "grad_norm": 0.12265615954125163, + "learning_rate": 5.054348664276271e-05, + "loss": 0.344, + "step": 1646 + }, + { + "epoch": 2.3697841726618707, + "grad_norm": 0.14603930332322596, + "learning_rate": 5.05047158397999e-05, + "loss": 0.3459, + "step": 1647 + }, + { + "epoch": 2.371223021582734, + "grad_norm": 0.1440671894672473, + "learning_rate": 5.046593443384945e-05, + "loss": 0.3416, + "step": 1648 + }, + { + "epoch": 2.372661870503597, + "grad_norm": 0.11840615315430422, + "learning_rate": 5.042714246405555e-05, + "loss": 0.3417, + "step": 1649 + }, + { + "epoch": 2.3741007194244603, + "grad_norm": 0.141585181622913, + "learning_rate": 5.038833996957309e-05, + "loss": 0.3405, + "step": 1650 + }, + { + "epoch": 2.3755395683453235, + "grad_norm": 0.1509373476302574, + "learning_rate": 5.0349526989567546e-05, + "loss": 0.3437, + "step": 1651 + }, + { + "epoch": 2.376978417266187, + "grad_norm": 0.14972886167306537, + "learning_rate": 5.0310703563215016e-05, + "loss": 0.3418, + "step": 1652 + }, + { + "epoch": 2.3784172661870504, + "grad_norm": 0.11796062644059152, + "learning_rate": 5.027186972970211e-05, + "loss": 0.3509, + "step": 1653 + }, + { + "epoch": 2.3798561151079136, + "grad_norm": 0.12176269985179156, + "learning_rate": 5.0233025528225934e-05, + "loss": 0.3427, + "step": 1654 + }, + { + "epoch": 2.381294964028777, + "grad_norm": 0.14452049677617043, + "learning_rate": 5.01941709979941e-05, + "loss": 0.3387, + "step": 1655 + }, + { + "epoch": 2.38273381294964, + "grad_norm": 0.14278552999151747, + "learning_rate": 5.015530617822462e-05, + "loss": 0.3454, + "step": 1656 + }, + { + "epoch": 2.3841726618705037, + "grad_norm": 0.12193205162577694, + "learning_rate": 5.011643110814589e-05, + "loss": 0.3412, + "step": 1657 + }, + { + "epoch": 2.385611510791367, + "grad_norm": 0.11282073766836978, + "learning_rate": 5.007754582699666e-05, + "loss": 0.3382, + "step": 1658 + }, + { + "epoch": 2.38705035971223, + "grad_norm": 0.11703149562420231, + "learning_rate": 5.003865037402598e-05, + "loss": 0.3409, + "step": 1659 + }, + { + "epoch": 2.3884892086330938, + "grad_norm": 0.12885309449382898, + "learning_rate": 4.999974478849319e-05, + "loss": 0.3312, + "step": 1660 + }, + { + "epoch": 2.389928057553957, + "grad_norm": 0.13183350461843132, + "learning_rate": 4.99608291096678e-05, + "loss": 0.3475, + "step": 1661 + }, + { + "epoch": 2.39136690647482, + "grad_norm": 0.13776304321963204, + "learning_rate": 4.9921903376829565e-05, + "loss": 0.3384, + "step": 1662 + }, + { + "epoch": 2.3928057553956834, + "grad_norm": 0.18077136229119795, + "learning_rate": 4.988296762926838e-05, + "loss": 0.3427, + "step": 1663 + }, + { + "epoch": 2.3942446043165466, + "grad_norm": 0.15410079075328875, + "learning_rate": 4.984402190628422e-05, + "loss": 0.3432, + "step": 1664 + }, + { + "epoch": 2.3956834532374103, + "grad_norm": 0.18997535918274108, + "learning_rate": 4.980506624718716e-05, + "loss": 0.3487, + "step": 1665 + }, + { + "epoch": 2.3971223021582735, + "grad_norm": 0.2531239787919152, + "learning_rate": 4.9766100691297284e-05, + "loss": 0.3426, + "step": 1666 + }, + { + "epoch": 2.3985611510791367, + "grad_norm": 0.1801137622984491, + "learning_rate": 4.9727125277944675e-05, + "loss": 0.3356, + "step": 1667 + }, + { + "epoch": 2.4, + "grad_norm": 0.1551017120725682, + "learning_rate": 4.968814004646934e-05, + "loss": 0.3359, + "step": 1668 + }, + { + "epoch": 2.401438848920863, + "grad_norm": 0.22300888322165083, + "learning_rate": 4.964914503622126e-05, + "loss": 0.3425, + "step": 1669 + }, + { + "epoch": 2.402877697841727, + "grad_norm": 0.1652182883526435, + "learning_rate": 4.961014028656021e-05, + "loss": 0.3461, + "step": 1670 + }, + { + "epoch": 2.40431654676259, + "grad_norm": 0.14298724843508484, + "learning_rate": 4.9571125836855825e-05, + "loss": 0.3472, + "step": 1671 + }, + { + "epoch": 2.405755395683453, + "grad_norm": 0.16991455955822699, + "learning_rate": 4.9532101726487564e-05, + "loss": 0.3321, + "step": 1672 + }, + { + "epoch": 2.4071942446043164, + "grad_norm": 0.14727464943168272, + "learning_rate": 4.9493067994844606e-05, + "loss": 0.3408, + "step": 1673 + }, + { + "epoch": 2.4086330935251796, + "grad_norm": 0.13934054200481163, + "learning_rate": 4.9454024681325815e-05, + "loss": 0.3425, + "step": 1674 + }, + { + "epoch": 2.4100719424460433, + "grad_norm": 0.14174089322523498, + "learning_rate": 4.941497182533978e-05, + "loss": 0.3432, + "step": 1675 + }, + { + "epoch": 2.4115107913669065, + "grad_norm": 0.12575370552729948, + "learning_rate": 4.937590946630469e-05, + "loss": 0.3454, + "step": 1676 + }, + { + "epoch": 2.4129496402877697, + "grad_norm": 0.1487581138720967, + "learning_rate": 4.9336837643648335e-05, + "loss": 0.336, + "step": 1677 + }, + { + "epoch": 2.414388489208633, + "grad_norm": 0.16032016529401594, + "learning_rate": 4.929775639680805e-05, + "loss": 0.3345, + "step": 1678 + }, + { + "epoch": 2.4158273381294966, + "grad_norm": 0.12110484420380835, + "learning_rate": 4.925866576523069e-05, + "loss": 0.3461, + "step": 1679 + }, + { + "epoch": 2.41726618705036, + "grad_norm": 0.13464064234471756, + "learning_rate": 4.921956578837259e-05, + "loss": 0.3417, + "step": 1680 + }, + { + "epoch": 2.418705035971223, + "grad_norm": 0.130308509890039, + "learning_rate": 4.918045650569949e-05, + "loss": 0.3414, + "step": 1681 + }, + { + "epoch": 2.420143884892086, + "grad_norm": 0.11452785815593626, + "learning_rate": 4.9141337956686564e-05, + "loss": 0.3397, + "step": 1682 + }, + { + "epoch": 2.42158273381295, + "grad_norm": 0.1305037076855578, + "learning_rate": 4.91022101808183e-05, + "loss": 0.3462, + "step": 1683 + }, + { + "epoch": 2.423021582733813, + "grad_norm": 0.11371114866586443, + "learning_rate": 4.90630732175885e-05, + "loss": 0.3433, + "step": 1684 + }, + { + "epoch": 2.4244604316546763, + "grad_norm": 0.13409494807980796, + "learning_rate": 4.902392710650028e-05, + "loss": 0.349, + "step": 1685 + }, + { + "epoch": 2.4258992805755395, + "grad_norm": 0.12018496996475034, + "learning_rate": 4.898477188706596e-05, + "loss": 0.3428, + "step": 1686 + }, + { + "epoch": 2.4273381294964027, + "grad_norm": 0.13131070325532423, + "learning_rate": 4.894560759880705e-05, + "loss": 0.3402, + "step": 1687 + }, + { + "epoch": 2.4287769784172664, + "grad_norm": 0.15857659122130136, + "learning_rate": 4.8906434281254223e-05, + "loss": 0.3327, + "step": 1688 + }, + { + "epoch": 2.4302158273381296, + "grad_norm": 0.13889386695086822, + "learning_rate": 4.886725197394726e-05, + "loss": 0.333, + "step": 1689 + }, + { + "epoch": 2.431654676258993, + "grad_norm": 0.13073855148187585, + "learning_rate": 4.882806071643503e-05, + "loss": 0.3509, + "step": 1690 + }, + { + "epoch": 2.433093525179856, + "grad_norm": 0.15118527435233725, + "learning_rate": 4.878886054827541e-05, + "loss": 0.3447, + "step": 1691 + }, + { + "epoch": 2.434532374100719, + "grad_norm": 0.14452053762841285, + "learning_rate": 4.874965150903529e-05, + "loss": 0.3428, + "step": 1692 + }, + { + "epoch": 2.435971223021583, + "grad_norm": 0.10431327788410687, + "learning_rate": 4.871043363829053e-05, + "loss": 0.3336, + "step": 1693 + }, + { + "epoch": 2.437410071942446, + "grad_norm": 0.4213052230566776, + "learning_rate": 4.8671206975625856e-05, + "loss": 0.3526, + "step": 1694 + }, + { + "epoch": 2.4388489208633093, + "grad_norm": 0.11184770364690542, + "learning_rate": 4.863197156063492e-05, + "loss": 0.3353, + "step": 1695 + }, + { + "epoch": 2.4402877697841725, + "grad_norm": 0.12222149419458228, + "learning_rate": 4.859272743292017e-05, + "loss": 0.3429, + "step": 1696 + }, + { + "epoch": 2.441726618705036, + "grad_norm": 0.11907575482636068, + "learning_rate": 4.855347463209287e-05, + "loss": 0.3481, + "step": 1697 + }, + { + "epoch": 2.4431654676258994, + "grad_norm": 0.13084626104764957, + "learning_rate": 4.851421319777304e-05, + "loss": 0.3399, + "step": 1698 + }, + { + "epoch": 2.4446043165467626, + "grad_norm": 0.1397141335067657, + "learning_rate": 4.847494316958939e-05, + "loss": 0.338, + "step": 1699 + }, + { + "epoch": 2.446043165467626, + "grad_norm": 0.1319380742709877, + "learning_rate": 4.8435664587179315e-05, + "loss": 0.348, + "step": 1700 + }, + { + "epoch": 2.4474820143884894, + "grad_norm": 0.1269627828943175, + "learning_rate": 4.839637749018887e-05, + "loss": 0.3372, + "step": 1701 + }, + { + "epoch": 2.4489208633093527, + "grad_norm": 0.1153638843635166, + "learning_rate": 4.835708191827268e-05, + "loss": 0.3449, + "step": 1702 + }, + { + "epoch": 2.450359712230216, + "grad_norm": 0.11531311545018066, + "learning_rate": 4.831777791109392e-05, + "loss": 0.3378, + "step": 1703 + }, + { + "epoch": 2.451798561151079, + "grad_norm": 0.11940463375470235, + "learning_rate": 4.827846550832428e-05, + "loss": 0.3417, + "step": 1704 + }, + { + "epoch": 2.4532374100719423, + "grad_norm": 0.1143421974967014, + "learning_rate": 4.8239144749643936e-05, + "loss": 0.3397, + "step": 1705 + }, + { + "epoch": 2.454676258992806, + "grad_norm": 0.1127354412143374, + "learning_rate": 4.819981567474152e-05, + "loss": 0.3393, + "step": 1706 + }, + { + "epoch": 2.456115107913669, + "grad_norm": 0.12792273576722857, + "learning_rate": 4.8160478323313974e-05, + "loss": 0.3454, + "step": 1707 + }, + { + "epoch": 2.4575539568345324, + "grad_norm": 0.12738881142266212, + "learning_rate": 4.812113273506671e-05, + "loss": 0.3363, + "step": 1708 + }, + { + "epoch": 2.4589928057553956, + "grad_norm": 0.1038933836953817, + "learning_rate": 4.808177894971336e-05, + "loss": 0.3429, + "step": 1709 + }, + { + "epoch": 2.460431654676259, + "grad_norm": 0.15633130637066203, + "learning_rate": 4.804241700697588e-05, + "loss": 0.341, + "step": 1710 + }, + { + "epoch": 2.4618705035971225, + "grad_norm": 0.1293726497004006, + "learning_rate": 4.800304694658443e-05, + "loss": 0.3409, + "step": 1711 + }, + { + "epoch": 2.4633093525179857, + "grad_norm": 0.13166280142095088, + "learning_rate": 4.796366880827739e-05, + "loss": 0.336, + "step": 1712 + }, + { + "epoch": 2.464748201438849, + "grad_norm": 0.14510994717815986, + "learning_rate": 4.792428263180128e-05, + "loss": 0.334, + "step": 1713 + }, + { + "epoch": 2.466187050359712, + "grad_norm": 0.10196507070564377, + "learning_rate": 4.7884888456910734e-05, + "loss": 0.3475, + "step": 1714 + }, + { + "epoch": 2.4676258992805753, + "grad_norm": 0.13058594222204808, + "learning_rate": 4.784548632336846e-05, + "loss": 0.3462, + "step": 1715 + }, + { + "epoch": 2.469064748201439, + "grad_norm": 0.11063133459925471, + "learning_rate": 4.7806076270945197e-05, + "loss": 0.3375, + "step": 1716 + }, + { + "epoch": 2.470503597122302, + "grad_norm": 0.10271160902895207, + "learning_rate": 4.776665833941968e-05, + "loss": 0.3394, + "step": 1717 + }, + { + "epoch": 2.4719424460431654, + "grad_norm": 0.12607363146537412, + "learning_rate": 4.772723256857859e-05, + "loss": 0.3451, + "step": 1718 + }, + { + "epoch": 2.4733812949640286, + "grad_norm": 0.1583496627807278, + "learning_rate": 4.768779899821655e-05, + "loss": 0.3366, + "step": 1719 + }, + { + "epoch": 2.4748201438848922, + "grad_norm": 0.14234848814645065, + "learning_rate": 4.7648357668135996e-05, + "loss": 0.3432, + "step": 1720 + }, + { + "epoch": 2.4762589928057555, + "grad_norm": 0.14076368701870684, + "learning_rate": 4.760890861814726e-05, + "loss": 0.3437, + "step": 1721 + }, + { + "epoch": 2.4776978417266187, + "grad_norm": 0.16272716328653117, + "learning_rate": 4.756945188806843e-05, + "loss": 0.336, + "step": 1722 + }, + { + "epoch": 2.479136690647482, + "grad_norm": 0.13042375807753362, + "learning_rate": 4.752998751772536e-05, + "loss": 0.3464, + "step": 1723 + }, + { + "epoch": 2.4805755395683455, + "grad_norm": 0.10483141311526244, + "learning_rate": 4.749051554695159e-05, + "loss": 0.3359, + "step": 1724 + }, + { + "epoch": 2.4820143884892087, + "grad_norm": 0.15640901733596446, + "learning_rate": 4.745103601558838e-05, + "loss": 0.3409, + "step": 1725 + }, + { + "epoch": 2.483453237410072, + "grad_norm": 0.14828497121504203, + "learning_rate": 4.741154896348458e-05, + "loss": 0.3447, + "step": 1726 + }, + { + "epoch": 2.484892086330935, + "grad_norm": 0.1103431235543875, + "learning_rate": 4.7372054430496636e-05, + "loss": 0.3364, + "step": 1727 + }, + { + "epoch": 2.4863309352517984, + "grad_norm": 0.16230804063638982, + "learning_rate": 4.733255245648857e-05, + "loss": 0.3445, + "step": 1728 + }, + { + "epoch": 2.487769784172662, + "grad_norm": 0.1351199508794201, + "learning_rate": 4.729304308133189e-05, + "loss": 0.3405, + "step": 1729 + }, + { + "epoch": 2.4892086330935252, + "grad_norm": 0.10339238683809066, + "learning_rate": 4.725352634490557e-05, + "loss": 0.3452, + "step": 1730 + }, + { + "epoch": 2.4906474820143885, + "grad_norm": 0.12454825077224349, + "learning_rate": 4.7214002287096035e-05, + "loss": 0.3411, + "step": 1731 + }, + { + "epoch": 2.4920863309352517, + "grad_norm": 0.1365182483518328, + "learning_rate": 4.7174470947797117e-05, + "loss": 0.3385, + "step": 1732 + }, + { + "epoch": 2.493525179856115, + "grad_norm": 0.11652871026144225, + "learning_rate": 4.7134932366909915e-05, + "loss": 0.3355, + "step": 1733 + }, + { + "epoch": 2.4949640287769785, + "grad_norm": 0.1398221290455718, + "learning_rate": 4.709538658434294e-05, + "loss": 0.3421, + "step": 1734 + }, + { + "epoch": 2.4964028776978417, + "grad_norm": 0.12643466864251143, + "learning_rate": 4.705583364001192e-05, + "loss": 0.3412, + "step": 1735 + }, + { + "epoch": 2.497841726618705, + "grad_norm": 0.12663940225696427, + "learning_rate": 4.701627357383981e-05, + "loss": 0.3427, + "step": 1736 + }, + { + "epoch": 2.499280575539568, + "grad_norm": 0.12867713456211932, + "learning_rate": 4.697670642575675e-05, + "loss": 0.3389, + "step": 1737 + }, + { + "epoch": 2.5007194244604314, + "grad_norm": 0.09729215423207732, + "learning_rate": 4.693713223570006e-05, + "loss": 0.3396, + "step": 1738 + }, + { + "epoch": 2.502158273381295, + "grad_norm": 0.10536205673373836, + "learning_rate": 4.689755104361414e-05, + "loss": 0.3358, + "step": 1739 + }, + { + "epoch": 2.5035971223021583, + "grad_norm": 0.09470267440128205, + "learning_rate": 4.685796288945046e-05, + "loss": 0.3354, + "step": 1740 + }, + { + "epoch": 2.5050359712230215, + "grad_norm": 0.10720684460790161, + "learning_rate": 4.6818367813167535e-05, + "loss": 0.3392, + "step": 1741 + }, + { + "epoch": 2.506474820143885, + "grad_norm": 0.12457788316646835, + "learning_rate": 4.6778765854730835e-05, + "loss": 0.3369, + "step": 1742 + }, + { + "epoch": 2.5079136690647483, + "grad_norm": 0.12249312745078086, + "learning_rate": 4.673915705411281e-05, + "loss": 0.3502, + "step": 1743 + }, + { + "epoch": 2.5093525179856115, + "grad_norm": 0.12834266627532392, + "learning_rate": 4.6699541451292786e-05, + "loss": 0.3385, + "step": 1744 + }, + { + "epoch": 2.5107913669064748, + "grad_norm": 0.12800319139369157, + "learning_rate": 4.665991908625699e-05, + "loss": 0.3434, + "step": 1745 + }, + { + "epoch": 2.512230215827338, + "grad_norm": 0.11948627362015994, + "learning_rate": 4.6620289998998445e-05, + "loss": 0.3344, + "step": 1746 + }, + { + "epoch": 2.5136690647482016, + "grad_norm": 0.13107752356793956, + "learning_rate": 4.658065422951697e-05, + "loss": 0.3359, + "step": 1747 + }, + { + "epoch": 2.515107913669065, + "grad_norm": 0.11391152786479106, + "learning_rate": 4.654101181781913e-05, + "loss": 0.341, + "step": 1748 + }, + { + "epoch": 2.516546762589928, + "grad_norm": 0.13063226539610714, + "learning_rate": 4.650136280391818e-05, + "loss": 0.3389, + "step": 1749 + }, + { + "epoch": 2.5179856115107913, + "grad_norm": 0.1261484669423309, + "learning_rate": 4.646170722783408e-05, + "loss": 0.3291, + "step": 1750 + }, + { + "epoch": 2.5194244604316545, + "grad_norm": 0.10139629190687235, + "learning_rate": 4.6422045129593344e-05, + "loss": 0.3355, + "step": 1751 + }, + { + "epoch": 2.520863309352518, + "grad_norm": 0.14901272921951966, + "learning_rate": 4.6382376549229146e-05, + "loss": 0.3288, + "step": 1752 + }, + { + "epoch": 2.5223021582733813, + "grad_norm": 0.1523215911606212, + "learning_rate": 4.634270152678115e-05, + "loss": 0.3355, + "step": 1753 + }, + { + "epoch": 2.5237410071942445, + "grad_norm": 0.11328006772432715, + "learning_rate": 4.630302010229555e-05, + "loss": 0.3305, + "step": 1754 + }, + { + "epoch": 2.5251798561151078, + "grad_norm": 0.14774464605253138, + "learning_rate": 4.6263332315824964e-05, + "loss": 0.3428, + "step": 1755 + }, + { + "epoch": 2.526618705035971, + "grad_norm": 0.13300622516392707, + "learning_rate": 4.622363820742848e-05, + "loss": 0.3439, + "step": 1756 + }, + { + "epoch": 2.5280575539568346, + "grad_norm": 0.1287918531160956, + "learning_rate": 4.618393781717156e-05, + "loss": 0.3378, + "step": 1757 + }, + { + "epoch": 2.529496402877698, + "grad_norm": 0.11652999713195161, + "learning_rate": 4.614423118512595e-05, + "loss": 0.3492, + "step": 1758 + }, + { + "epoch": 2.530935251798561, + "grad_norm": 0.11213280118872815, + "learning_rate": 4.610451835136978e-05, + "loss": 0.3437, + "step": 1759 + }, + { + "epoch": 2.5323741007194247, + "grad_norm": 0.11185644660014751, + "learning_rate": 4.606479935598738e-05, + "loss": 0.3501, + "step": 1760 + }, + { + "epoch": 2.533812949640288, + "grad_norm": 0.09436021263924985, + "learning_rate": 4.602507423906931e-05, + "loss": 0.3354, + "step": 1761 + }, + { + "epoch": 2.535251798561151, + "grad_norm": 0.11104665481821985, + "learning_rate": 4.598534304071233e-05, + "loss": 0.3338, + "step": 1762 + }, + { + "epoch": 2.5366906474820143, + "grad_norm": 0.10911876228788199, + "learning_rate": 4.5945605801019315e-05, + "loss": 0.3345, + "step": 1763 + }, + { + "epoch": 2.5381294964028775, + "grad_norm": 0.09465719016284928, + "learning_rate": 4.5905862560099255e-05, + "loss": 0.3387, + "step": 1764 + }, + { + "epoch": 2.539568345323741, + "grad_norm": 0.10650314757534553, + "learning_rate": 4.5866113358067187e-05, + "loss": 0.3389, + "step": 1765 + }, + { + "epoch": 2.5410071942446044, + "grad_norm": 0.11984942002554853, + "learning_rate": 4.582635823504416e-05, + "loss": 0.3379, + "step": 1766 + }, + { + "epoch": 2.5424460431654676, + "grad_norm": 0.11734119542612027, + "learning_rate": 4.5786597231157214e-05, + "loss": 0.3474, + "step": 1767 + }, + { + "epoch": 2.543884892086331, + "grad_norm": 0.11015931760329653, + "learning_rate": 4.574683038653932e-05, + "loss": 0.3396, + "step": 1768 + }, + { + "epoch": 2.545323741007194, + "grad_norm": 0.11844530280191058, + "learning_rate": 4.5707057741329324e-05, + "loss": 0.3391, + "step": 1769 + }, + { + "epoch": 2.5467625899280577, + "grad_norm": 0.12190665595078562, + "learning_rate": 4.5667279335671986e-05, + "loss": 0.3445, + "step": 1770 + }, + { + "epoch": 2.548201438848921, + "grad_norm": 0.11538645447979806, + "learning_rate": 4.56274952097178e-05, + "loss": 0.3369, + "step": 1771 + }, + { + "epoch": 2.549640287769784, + "grad_norm": 0.13013491237388408, + "learning_rate": 4.558770540362308e-05, + "loss": 0.341, + "step": 1772 + }, + { + "epoch": 2.5510791366906473, + "grad_norm": 0.11811645416708122, + "learning_rate": 4.554790995754988e-05, + "loss": 0.3421, + "step": 1773 + }, + { + "epoch": 2.5525179856115106, + "grad_norm": 0.14139738127617787, + "learning_rate": 4.5508108911665926e-05, + "loss": 0.343, + "step": 1774 + }, + { + "epoch": 2.553956834532374, + "grad_norm": 0.15176651595735202, + "learning_rate": 4.5468302306144594e-05, + "loss": 0.338, + "step": 1775 + }, + { + "epoch": 2.5553956834532374, + "grad_norm": 0.10735008156962264, + "learning_rate": 4.542849018116491e-05, + "loss": 0.3393, + "step": 1776 + }, + { + "epoch": 2.5568345323741006, + "grad_norm": 0.14879243298691355, + "learning_rate": 4.538867257691141e-05, + "loss": 0.3311, + "step": 1777 + }, + { + "epoch": 2.5582733812949643, + "grad_norm": 0.12535624465428738, + "learning_rate": 4.53488495335742e-05, + "loss": 0.3482, + "step": 1778 + }, + { + "epoch": 2.5597122302158275, + "grad_norm": 0.10908302463595965, + "learning_rate": 4.5309021091348885e-05, + "loss": 0.3427, + "step": 1779 + }, + { + "epoch": 2.5611510791366907, + "grad_norm": 0.15905673972568443, + "learning_rate": 4.5269187290436486e-05, + "loss": 0.3357, + "step": 1780 + }, + { + "epoch": 2.562589928057554, + "grad_norm": 0.13253803661911623, + "learning_rate": 4.5229348171043466e-05, + "loss": 0.3372, + "step": 1781 + }, + { + "epoch": 2.564028776978417, + "grad_norm": 0.10950791156714575, + "learning_rate": 4.51895037733816e-05, + "loss": 0.3408, + "step": 1782 + }, + { + "epoch": 2.565467625899281, + "grad_norm": 0.16002575202649105, + "learning_rate": 4.5149654137668095e-05, + "loss": 0.3408, + "step": 1783 + }, + { + "epoch": 2.566906474820144, + "grad_norm": 0.17440279143410928, + "learning_rate": 4.5109799304125333e-05, + "loss": 0.3445, + "step": 1784 + }, + { + "epoch": 2.568345323741007, + "grad_norm": 0.0900887649274377, + "learning_rate": 4.5069939312981e-05, + "loss": 0.3387, + "step": 1785 + }, + { + "epoch": 2.5697841726618704, + "grad_norm": 0.12332539288262825, + "learning_rate": 4.503007420446798e-05, + "loss": 0.3352, + "step": 1786 + }, + { + "epoch": 2.5712230215827336, + "grad_norm": 0.11515832396272689, + "learning_rate": 4.499020401882433e-05, + "loss": 0.3374, + "step": 1787 + }, + { + "epoch": 2.5726618705035973, + "grad_norm": 0.11235829008442862, + "learning_rate": 4.49503287962932e-05, + "loss": 0.3388, + "step": 1788 + }, + { + "epoch": 2.5741007194244605, + "grad_norm": 0.11147519106474965, + "learning_rate": 4.491044857712288e-05, + "loss": 0.3402, + "step": 1789 + }, + { + "epoch": 2.5755395683453237, + "grad_norm": 0.1262425182010213, + "learning_rate": 4.4870563401566634e-05, + "loss": 0.3458, + "step": 1790 + }, + { + "epoch": 2.576978417266187, + "grad_norm": 0.11324580140469244, + "learning_rate": 4.483067330988278e-05, + "loss": 0.3419, + "step": 1791 + }, + { + "epoch": 2.57841726618705, + "grad_norm": 0.1289434899627111, + "learning_rate": 4.479077834233458e-05, + "loss": 0.3436, + "step": 1792 + }, + { + "epoch": 2.579856115107914, + "grad_norm": 0.1122464263243746, + "learning_rate": 4.475087853919023e-05, + "loss": 0.3407, + "step": 1793 + }, + { + "epoch": 2.581294964028777, + "grad_norm": 0.13063460751115946, + "learning_rate": 4.4710973940722786e-05, + "loss": 0.3456, + "step": 1794 + }, + { + "epoch": 2.58273381294964, + "grad_norm": 0.13627796419509441, + "learning_rate": 4.4671064587210146e-05, + "loss": 0.3361, + "step": 1795 + }, + { + "epoch": 2.584172661870504, + "grad_norm": 0.13508693136071181, + "learning_rate": 4.4631150518935044e-05, + "loss": 0.336, + "step": 1796 + }, + { + "epoch": 2.5856115107913666, + "grad_norm": 0.12401876363541481, + "learning_rate": 4.459123177618491e-05, + "loss": 0.3379, + "step": 1797 + }, + { + "epoch": 2.5870503597122303, + "grad_norm": 0.12941049453432754, + "learning_rate": 4.455130839925195e-05, + "loss": 0.3435, + "step": 1798 + }, + { + "epoch": 2.5884892086330935, + "grad_norm": 0.14867071306096177, + "learning_rate": 4.451138042843302e-05, + "loss": 0.3425, + "step": 1799 + }, + { + "epoch": 2.5899280575539567, + "grad_norm": 0.12093240923116741, + "learning_rate": 4.447144790402963e-05, + "loss": 0.3408, + "step": 1800 + }, + { + "epoch": 2.5913669064748204, + "grad_norm": 0.12374790333094081, + "learning_rate": 4.4431510866347837e-05, + "loss": 0.3375, + "step": 1801 + }, + { + "epoch": 2.5928057553956836, + "grad_norm": 0.14857921532385407, + "learning_rate": 4.439156935569833e-05, + "loss": 0.3416, + "step": 1802 + }, + { + "epoch": 2.594244604316547, + "grad_norm": 0.12926069519248012, + "learning_rate": 4.435162341239625e-05, + "loss": 0.3435, + "step": 1803 + }, + { + "epoch": 2.59568345323741, + "grad_norm": 0.11651180748397517, + "learning_rate": 4.4311673076761254e-05, + "loss": 0.3373, + "step": 1804 + }, + { + "epoch": 2.597122302158273, + "grad_norm": 0.1507180724691262, + "learning_rate": 4.42717183891174e-05, + "loss": 0.3331, + "step": 1805 + }, + { + "epoch": 2.598561151079137, + "grad_norm": 0.1414529505953001, + "learning_rate": 4.4231759389793144e-05, + "loss": 0.3439, + "step": 1806 + }, + { + "epoch": 2.6, + "grad_norm": 0.11773868704444448, + "learning_rate": 4.4191796119121335e-05, + "loss": 0.3421, + "step": 1807 + }, + { + "epoch": 2.6014388489208633, + "grad_norm": 0.14819298825521954, + "learning_rate": 4.415182861743906e-05, + "loss": 0.3476, + "step": 1808 + }, + { + "epoch": 2.6028776978417265, + "grad_norm": 0.140434101586376, + "learning_rate": 4.411185692508774e-05, + "loss": 0.3414, + "step": 1809 + }, + { + "epoch": 2.6043165467625897, + "grad_norm": 0.10932990039065672, + "learning_rate": 4.4071881082413e-05, + "loss": 0.3375, + "step": 1810 + }, + { + "epoch": 2.6057553956834534, + "grad_norm": 0.13786451900966154, + "learning_rate": 4.4031901129764665e-05, + "loss": 0.3331, + "step": 1811 + }, + { + "epoch": 2.6071942446043166, + "grad_norm": 0.12996289496226116, + "learning_rate": 4.3991917107496695e-05, + "loss": 0.3431, + "step": 1812 + }, + { + "epoch": 2.60863309352518, + "grad_norm": 0.10387392325944972, + "learning_rate": 4.395192905596716e-05, + "loss": 0.3394, + "step": 1813 + }, + { + "epoch": 2.610071942446043, + "grad_norm": 0.12503653841902704, + "learning_rate": 4.3911937015538186e-05, + "loss": 0.3413, + "step": 1814 + }, + { + "epoch": 2.6115107913669062, + "grad_norm": 0.10971997429608449, + "learning_rate": 4.3871941026575965e-05, + "loss": 0.3374, + "step": 1815 + }, + { + "epoch": 2.61294964028777, + "grad_norm": 0.10023185050648277, + "learning_rate": 4.383194112945066e-05, + "loss": 0.3421, + "step": 1816 + }, + { + "epoch": 2.614388489208633, + "grad_norm": 0.11916928542493764, + "learning_rate": 4.379193736453633e-05, + "loss": 0.339, + "step": 1817 + }, + { + "epoch": 2.6158273381294963, + "grad_norm": 0.10586309598590037, + "learning_rate": 4.375192977221099e-05, + "loss": 0.3484, + "step": 1818 + }, + { + "epoch": 2.61726618705036, + "grad_norm": 0.1098671434915074, + "learning_rate": 4.371191839285651e-05, + "loss": 0.3354, + "step": 1819 + }, + { + "epoch": 2.618705035971223, + "grad_norm": 0.13188890631748518, + "learning_rate": 4.367190326685858e-05, + "loss": 0.3413, + "step": 1820 + }, + { + "epoch": 2.6201438848920864, + "grad_norm": 0.11743665747189141, + "learning_rate": 4.363188443460666e-05, + "loss": 0.3459, + "step": 1821 + }, + { + "epoch": 2.6215827338129496, + "grad_norm": 0.12017290401897036, + "learning_rate": 4.3591861936493964e-05, + "loss": 0.3355, + "step": 1822 + }, + { + "epoch": 2.623021582733813, + "grad_norm": 0.1146193053784997, + "learning_rate": 4.3551835812917395e-05, + "loss": 0.34, + "step": 1823 + }, + { + "epoch": 2.6244604316546765, + "grad_norm": 0.09860614798113755, + "learning_rate": 4.351180610427754e-05, + "loss": 0.3415, + "step": 1824 + }, + { + "epoch": 2.6258992805755397, + "grad_norm": 0.11114368863560233, + "learning_rate": 4.347177285097855e-05, + "loss": 0.3382, + "step": 1825 + }, + { + "epoch": 2.627338129496403, + "grad_norm": 0.11665309354902487, + "learning_rate": 4.343173609342822e-05, + "loss": 0.3446, + "step": 1826 + }, + { + "epoch": 2.628776978417266, + "grad_norm": 0.10277560427518727, + "learning_rate": 4.339169587203785e-05, + "loss": 0.3391, + "step": 1827 + }, + { + "epoch": 2.6302158273381293, + "grad_norm": 0.09599960246248385, + "learning_rate": 4.335165222722222e-05, + "loss": 0.3287, + "step": 1828 + }, + { + "epoch": 2.631654676258993, + "grad_norm": 0.11722916136015507, + "learning_rate": 4.331160519939962e-05, + "loss": 0.3328, + "step": 1829 + }, + { + "epoch": 2.633093525179856, + "grad_norm": 0.11023207099145288, + "learning_rate": 4.327155482899168e-05, + "loss": 0.3378, + "step": 1830 + }, + { + "epoch": 2.6345323741007194, + "grad_norm": 0.10202424983535885, + "learning_rate": 4.323150115642346e-05, + "loss": 0.3446, + "step": 1831 + }, + { + "epoch": 2.6359712230215826, + "grad_norm": 0.10413403149240487, + "learning_rate": 4.3191444222123326e-05, + "loss": 0.3477, + "step": 1832 + }, + { + "epoch": 2.637410071942446, + "grad_norm": 0.11491521916422581, + "learning_rate": 4.3151384066522964e-05, + "loss": 0.3367, + "step": 1833 + }, + { + "epoch": 2.6388489208633095, + "grad_norm": 0.10822247599269093, + "learning_rate": 4.311132073005727e-05, + "loss": 0.3422, + "step": 1834 + }, + { + "epoch": 2.6402877697841727, + "grad_norm": 0.12789621557709568, + "learning_rate": 4.3071254253164395e-05, + "loss": 0.339, + "step": 1835 + }, + { + "epoch": 2.641726618705036, + "grad_norm": 0.10547673251689751, + "learning_rate": 4.3031184676285625e-05, + "loss": 0.3297, + "step": 1836 + }, + { + "epoch": 2.6431654676258995, + "grad_norm": 0.12123644061016982, + "learning_rate": 4.299111203986539e-05, + "loss": 0.3312, + "step": 1837 + }, + { + "epoch": 2.6446043165467623, + "grad_norm": 0.12844557785693508, + "learning_rate": 4.29510363843512e-05, + "loss": 0.3367, + "step": 1838 + }, + { + "epoch": 2.646043165467626, + "grad_norm": 0.10624197613105688, + "learning_rate": 4.291095775019364e-05, + "loss": 0.3406, + "step": 1839 + }, + { + "epoch": 2.647482014388489, + "grad_norm": 0.11173704018377709, + "learning_rate": 4.287087617784627e-05, + "loss": 0.3451, + "step": 1840 + }, + { + "epoch": 2.6489208633093524, + "grad_norm": 0.10968755925479357, + "learning_rate": 4.283079170776561e-05, + "loss": 0.3384, + "step": 1841 + }, + { + "epoch": 2.650359712230216, + "grad_norm": 0.11346023095700333, + "learning_rate": 4.279070438041116e-05, + "loss": 0.3373, + "step": 1842 + }, + { + "epoch": 2.6517985611510793, + "grad_norm": 0.10204403289138868, + "learning_rate": 4.275061423624522e-05, + "loss": 0.3428, + "step": 1843 + }, + { + "epoch": 2.6532374100719425, + "grad_norm": 0.1169231203643409, + "learning_rate": 4.2710521315733e-05, + "loss": 0.3413, + "step": 1844 + }, + { + "epoch": 2.6546762589928057, + "grad_norm": 0.10943698461313435, + "learning_rate": 4.26704256593425e-05, + "loss": 0.3474, + "step": 1845 + }, + { + "epoch": 2.656115107913669, + "grad_norm": 0.1220135814725209, + "learning_rate": 4.2630327307544454e-05, + "loss": 0.333, + "step": 1846 + }, + { + "epoch": 2.6575539568345325, + "grad_norm": 0.109415482241854, + "learning_rate": 4.2590226300812335e-05, + "loss": 0.3381, + "step": 1847 + }, + { + "epoch": 2.6589928057553958, + "grad_norm": 0.1179174317816068, + "learning_rate": 4.255012267962232e-05, + "loss": 0.3453, + "step": 1848 + }, + { + "epoch": 2.660431654676259, + "grad_norm": 0.1275534930596529, + "learning_rate": 4.251001648445317e-05, + "loss": 0.3405, + "step": 1849 + }, + { + "epoch": 2.661870503597122, + "grad_norm": 0.13711098030323063, + "learning_rate": 4.246990775578628e-05, + "loss": 0.3393, + "step": 1850 + }, + { + "epoch": 2.6633093525179854, + "grad_norm": 0.12306157494688558, + "learning_rate": 4.242979653410562e-05, + "loss": 0.3397, + "step": 1851 + }, + { + "epoch": 2.664748201438849, + "grad_norm": 0.10730585499334598, + "learning_rate": 4.238968285989762e-05, + "loss": 0.337, + "step": 1852 + }, + { + "epoch": 2.6661870503597123, + "grad_norm": 0.15486322539275116, + "learning_rate": 4.2349566773651236e-05, + "loss": 0.336, + "step": 1853 + }, + { + "epoch": 2.6676258992805755, + "grad_norm": 0.13889360595217953, + "learning_rate": 4.2309448315857844e-05, + "loss": 0.3419, + "step": 1854 + }, + { + "epoch": 2.6690647482014387, + "grad_norm": 0.09999278397483768, + "learning_rate": 4.226932752701122e-05, + "loss": 0.3423, + "step": 1855 + }, + { + "epoch": 2.670503597122302, + "grad_norm": 0.14533948392392976, + "learning_rate": 4.2229204447607456e-05, + "loss": 0.3345, + "step": 1856 + }, + { + "epoch": 2.6719424460431656, + "grad_norm": 0.15935841097160405, + "learning_rate": 4.2189079118145e-05, + "loss": 0.3461, + "step": 1857 + }, + { + "epoch": 2.6733812949640288, + "grad_norm": 0.10932579096393788, + "learning_rate": 4.214895157912454e-05, + "loss": 0.3429, + "step": 1858 + }, + { + "epoch": 2.674820143884892, + "grad_norm": 0.13080358011840967, + "learning_rate": 4.210882187104904e-05, + "loss": 0.3366, + "step": 1859 + }, + { + "epoch": 2.6762589928057556, + "grad_norm": 0.13269828722864208, + "learning_rate": 4.206869003442358e-05, + "loss": 0.3408, + "step": 1860 + }, + { + "epoch": 2.677697841726619, + "grad_norm": 0.1305819791221825, + "learning_rate": 4.2028556109755465e-05, + "loss": 0.3391, + "step": 1861 + }, + { + "epoch": 2.679136690647482, + "grad_norm": 0.09724856574709481, + "learning_rate": 4.198842013755408e-05, + "loss": 0.3397, + "step": 1862 + }, + { + "epoch": 2.6805755395683453, + "grad_norm": 0.09635878900706464, + "learning_rate": 4.194828215833082e-05, + "loss": 0.3403, + "step": 1863 + }, + { + "epoch": 2.6820143884892085, + "grad_norm": 0.09396676697509772, + "learning_rate": 4.1908142212599206e-05, + "loss": 0.3288, + "step": 1864 + }, + { + "epoch": 2.683453237410072, + "grad_norm": 0.0974403067346001, + "learning_rate": 4.1868000340874674e-05, + "loss": 0.332, + "step": 1865 + }, + { + "epoch": 2.6848920863309353, + "grad_norm": 0.09508337444864512, + "learning_rate": 4.182785658367462e-05, + "loss": 0.3427, + "step": 1866 + }, + { + "epoch": 2.6863309352517986, + "grad_norm": 0.11136661124114632, + "learning_rate": 4.178771098151835e-05, + "loss": 0.3425, + "step": 1867 + }, + { + "epoch": 2.6877697841726618, + "grad_norm": 0.09821683497577681, + "learning_rate": 4.1747563574927034e-05, + "loss": 0.3443, + "step": 1868 + }, + { + "epoch": 2.689208633093525, + "grad_norm": 0.12174694705551985, + "learning_rate": 4.170741440442366e-05, + "loss": 0.3391, + "step": 1869 + }, + { + "epoch": 2.6906474820143886, + "grad_norm": 0.11064487171338318, + "learning_rate": 4.166726351053299e-05, + "loss": 0.3436, + "step": 1870 + }, + { + "epoch": 2.692086330935252, + "grad_norm": 0.11510251312155208, + "learning_rate": 4.1627110933781515e-05, + "loss": 0.3448, + "step": 1871 + }, + { + "epoch": 2.693525179856115, + "grad_norm": 0.14301855964769836, + "learning_rate": 4.158695671469746e-05, + "loss": 0.3506, + "step": 1872 + }, + { + "epoch": 2.6949640287769783, + "grad_norm": 0.13519930844792627, + "learning_rate": 4.154680089381068e-05, + "loss": 0.3455, + "step": 1873 + }, + { + "epoch": 2.6964028776978415, + "grad_norm": 0.11251336867088967, + "learning_rate": 4.150664351165266e-05, + "loss": 0.3312, + "step": 1874 + }, + { + "epoch": 2.697841726618705, + "grad_norm": 0.11106285385888814, + "learning_rate": 4.146648460875646e-05, + "loss": 0.3436, + "step": 1875 + }, + { + "epoch": 2.6992805755395683, + "grad_norm": 0.09324584790437306, + "learning_rate": 4.1426324225656644e-05, + "loss": 0.3377, + "step": 1876 + }, + { + "epoch": 2.7007194244604316, + "grad_norm": 0.10724805131104773, + "learning_rate": 4.138616240288934e-05, + "loss": 0.3434, + "step": 1877 + }, + { + "epoch": 2.702158273381295, + "grad_norm": 0.0868909954045789, + "learning_rate": 4.134599918099204e-05, + "loss": 0.3436, + "step": 1878 + }, + { + "epoch": 2.7035971223021584, + "grad_norm": 0.10193825613946285, + "learning_rate": 4.130583460050371e-05, + "loss": 0.3453, + "step": 1879 + }, + { + "epoch": 2.7050359712230216, + "grad_norm": 0.09979947035317822, + "learning_rate": 4.126566870196468e-05, + "loss": 0.3469, + "step": 1880 + }, + { + "epoch": 2.706474820143885, + "grad_norm": 0.10164179590282962, + "learning_rate": 4.12255015259166e-05, + "loss": 0.3402, + "step": 1881 + }, + { + "epoch": 2.707913669064748, + "grad_norm": 0.10457175175599578, + "learning_rate": 4.1185333112902394e-05, + "loss": 0.3387, + "step": 1882 + }, + { + "epoch": 2.7093525179856117, + "grad_norm": 0.10303970397298673, + "learning_rate": 4.114516350346626e-05, + "loss": 0.3342, + "step": 1883 + }, + { + "epoch": 2.710791366906475, + "grad_norm": 0.09774648641990759, + "learning_rate": 4.1104992738153616e-05, + "loss": 0.3438, + "step": 1884 + }, + { + "epoch": 2.712230215827338, + "grad_norm": 0.11011856691134238, + "learning_rate": 4.1064820857511e-05, + "loss": 0.3385, + "step": 1885 + }, + { + "epoch": 2.7136690647482014, + "grad_norm": 0.10500214455171199, + "learning_rate": 4.1024647902086107e-05, + "loss": 0.3484, + "step": 1886 + }, + { + "epoch": 2.7151079136690646, + "grad_norm": 0.09702583849457841, + "learning_rate": 4.0984473912427706e-05, + "loss": 0.3459, + "step": 1887 + }, + { + "epoch": 2.716546762589928, + "grad_norm": 0.11101139153632772, + "learning_rate": 4.0944298929085633e-05, + "loss": 0.3362, + "step": 1888 + }, + { + "epoch": 2.7179856115107914, + "grad_norm": 0.11344635607958369, + "learning_rate": 4.090412299261068e-05, + "loss": 0.3393, + "step": 1889 + }, + { + "epoch": 2.7194244604316546, + "grad_norm": 0.0849582398459722, + "learning_rate": 4.086394614355467e-05, + "loss": 0.3331, + "step": 1890 + }, + { + "epoch": 2.720863309352518, + "grad_norm": 0.09560759718322384, + "learning_rate": 4.082376842247027e-05, + "loss": 0.3416, + "step": 1891 + }, + { + "epoch": 2.722302158273381, + "grad_norm": 0.11213224875463144, + "learning_rate": 4.0783589869911074e-05, + "loss": 0.3443, + "step": 1892 + }, + { + "epoch": 2.7237410071942447, + "grad_norm": 0.1025268180157425, + "learning_rate": 4.074341052643152e-05, + "loss": 0.34, + "step": 1893 + }, + { + "epoch": 2.725179856115108, + "grad_norm": 0.10061029461157417, + "learning_rate": 4.070323043258683e-05, + "loss": 0.3456, + "step": 1894 + }, + { + "epoch": 2.726618705035971, + "grad_norm": 0.1315208555486423, + "learning_rate": 4.066304962893297e-05, + "loss": 0.3405, + "step": 1895 + }, + { + "epoch": 2.728057553956835, + "grad_norm": 0.10202207975723282, + "learning_rate": 4.062286815602661e-05, + "loss": 0.3452, + "step": 1896 + }, + { + "epoch": 2.7294964028776976, + "grad_norm": 0.10788336704356037, + "learning_rate": 4.0582686054425196e-05, + "loss": 0.3409, + "step": 1897 + }, + { + "epoch": 2.7309352517985612, + "grad_norm": 0.10995402693608562, + "learning_rate": 4.054250336468666e-05, + "loss": 0.3361, + "step": 1898 + }, + { + "epoch": 2.7323741007194244, + "grad_norm": 0.08228729974196912, + "learning_rate": 4.050232012736964e-05, + "loss": 0.3411, + "step": 1899 + }, + { + "epoch": 2.7338129496402876, + "grad_norm": 0.11300246784808828, + "learning_rate": 4.0462136383033285e-05, + "loss": 0.3437, + "step": 1900 + }, + { + "epoch": 2.7352517985611513, + "grad_norm": 0.10330823730912551, + "learning_rate": 4.0421952172237254e-05, + "loss": 0.3375, + "step": 1901 + }, + { + "epoch": 2.7366906474820145, + "grad_norm": 0.10158770831666479, + "learning_rate": 4.038176753554166e-05, + "loss": 0.34, + "step": 1902 + }, + { + "epoch": 2.7381294964028777, + "grad_norm": 0.09037962763345136, + "learning_rate": 4.034158251350711e-05, + "loss": 0.3334, + "step": 1903 + }, + { + "epoch": 2.739568345323741, + "grad_norm": 0.12040518845184638, + "learning_rate": 4.030139714669453e-05, + "loss": 0.339, + "step": 1904 + }, + { + "epoch": 2.741007194244604, + "grad_norm": 0.09999156206608065, + "learning_rate": 4.026121147566522e-05, + "loss": 0.3487, + "step": 1905 + }, + { + "epoch": 2.742446043165468, + "grad_norm": 0.10962480095976812, + "learning_rate": 4.02210255409808e-05, + "loss": 0.3409, + "step": 1906 + }, + { + "epoch": 2.743884892086331, + "grad_norm": 0.14188551617672068, + "learning_rate": 4.018083938320314e-05, + "loss": 0.3397, + "step": 1907 + }, + { + "epoch": 2.7453237410071942, + "grad_norm": 0.11311796878797645, + "learning_rate": 4.014065304289435e-05, + "loss": 0.3351, + "step": 1908 + }, + { + "epoch": 2.7467625899280574, + "grad_norm": 0.10314189465154625, + "learning_rate": 4.010046656061669e-05, + "loss": 0.3472, + "step": 1909 + }, + { + "epoch": 2.7482014388489207, + "grad_norm": 0.13708723801063197, + "learning_rate": 4.006027997693262e-05, + "loss": 0.3357, + "step": 1910 + }, + { + "epoch": 2.7496402877697843, + "grad_norm": 0.1116048166839257, + "learning_rate": 4.002009333240465e-05, + "loss": 0.3327, + "step": 1911 + }, + { + "epoch": 2.7510791366906475, + "grad_norm": 0.11477952516508165, + "learning_rate": 3.997990666759536e-05, + "loss": 0.3459, + "step": 1912 + }, + { + "epoch": 2.7525179856115107, + "grad_norm": 0.09994756450196751, + "learning_rate": 3.99397200230674e-05, + "loss": 0.3299, + "step": 1913 + }, + { + "epoch": 2.753956834532374, + "grad_norm": 0.10474276010820825, + "learning_rate": 3.989953343938331e-05, + "loss": 0.3431, + "step": 1914 + }, + { + "epoch": 2.755395683453237, + "grad_norm": 0.09656557000221296, + "learning_rate": 3.985934695710566e-05, + "loss": 0.3403, + "step": 1915 + }, + { + "epoch": 2.756834532374101, + "grad_norm": 0.08824704913143855, + "learning_rate": 3.9819160616796873e-05, + "loss": 0.343, + "step": 1916 + }, + { + "epoch": 2.758273381294964, + "grad_norm": 0.09649943443919938, + "learning_rate": 3.977897445901922e-05, + "loss": 0.3452, + "step": 1917 + }, + { + "epoch": 2.7597122302158272, + "grad_norm": 0.10209351657601745, + "learning_rate": 3.9738788524334794e-05, + "loss": 0.3392, + "step": 1918 + }, + { + "epoch": 2.761151079136691, + "grad_norm": 0.12454331122782362, + "learning_rate": 3.969860285330549e-05, + "loss": 0.3445, + "step": 1919 + }, + { + "epoch": 2.762589928057554, + "grad_norm": 0.10092792322414952, + "learning_rate": 3.965841748649291e-05, + "loss": 0.3425, + "step": 1920 + }, + { + "epoch": 2.7640287769784173, + "grad_norm": 0.1077236194758336, + "learning_rate": 3.961823246445834e-05, + "loss": 0.3386, + "step": 1921 + }, + { + "epoch": 2.7654676258992805, + "grad_norm": 0.13388696323456153, + "learning_rate": 3.957804782776276e-05, + "loss": 0.3441, + "step": 1922 + }, + { + "epoch": 2.7669064748201437, + "grad_norm": 0.10255911600542274, + "learning_rate": 3.953786361696673e-05, + "loss": 0.3394, + "step": 1923 + }, + { + "epoch": 2.7683453237410074, + "grad_norm": 0.103328615587966, + "learning_rate": 3.9497679872630366e-05, + "loss": 0.3431, + "step": 1924 + }, + { + "epoch": 2.7697841726618706, + "grad_norm": 0.14211785564146165, + "learning_rate": 3.945749663531334e-05, + "loss": 0.3425, + "step": 1925 + }, + { + "epoch": 2.771223021582734, + "grad_norm": 0.12904341075634934, + "learning_rate": 3.941731394557482e-05, + "loss": 0.3407, + "step": 1926 + }, + { + "epoch": 2.772661870503597, + "grad_norm": 0.11197846800896227, + "learning_rate": 3.9377131843973394e-05, + "loss": 0.339, + "step": 1927 + }, + { + "epoch": 2.7741007194244602, + "grad_norm": 0.1429322126035399, + "learning_rate": 3.933695037106705e-05, + "loss": 0.3348, + "step": 1928 + }, + { + "epoch": 2.775539568345324, + "grad_norm": 0.1106949557977696, + "learning_rate": 3.9296769567413177e-05, + "loss": 0.3485, + "step": 1929 + }, + { + "epoch": 2.776978417266187, + "grad_norm": 0.1510109014335804, + "learning_rate": 3.925658947356849e-05, + "loss": 0.3366, + "step": 1930 + }, + { + "epoch": 2.7784172661870503, + "grad_norm": 0.11513010328064764, + "learning_rate": 3.921641013008893e-05, + "loss": 0.3378, + "step": 1931 + }, + { + "epoch": 2.7798561151079135, + "grad_norm": 0.09727543648341261, + "learning_rate": 3.9176231577529734e-05, + "loss": 0.3465, + "step": 1932 + }, + { + "epoch": 2.7812949640287767, + "grad_norm": 0.0973114578702156, + "learning_rate": 3.913605385644535e-05, + "loss": 0.3394, + "step": 1933 + }, + { + "epoch": 2.7827338129496404, + "grad_norm": 0.08922787063098898, + "learning_rate": 3.909587700738933e-05, + "loss": 0.3343, + "step": 1934 + }, + { + "epoch": 2.7841726618705036, + "grad_norm": 0.1028230823021992, + "learning_rate": 3.9055701070914393e-05, + "loss": 0.3464, + "step": 1935 + }, + { + "epoch": 2.785611510791367, + "grad_norm": 0.09200732441336425, + "learning_rate": 3.90155260875723e-05, + "loss": 0.3322, + "step": 1936 + }, + { + "epoch": 2.7870503597122305, + "grad_norm": 0.108749718978638, + "learning_rate": 3.8975352097913914e-05, + "loss": 0.3428, + "step": 1937 + }, + { + "epoch": 2.7884892086330937, + "grad_norm": 0.10129148106185752, + "learning_rate": 3.8935179142489016e-05, + "loss": 0.3401, + "step": 1938 + }, + { + "epoch": 2.789928057553957, + "grad_norm": 0.0929517364536396, + "learning_rate": 3.8895007261846404e-05, + "loss": 0.3376, + "step": 1939 + }, + { + "epoch": 2.79136690647482, + "grad_norm": 0.12330365574595786, + "learning_rate": 3.885483649653374e-05, + "loss": 0.3401, + "step": 1940 + }, + { + "epoch": 2.7928057553956833, + "grad_norm": 0.11359836606019805, + "learning_rate": 3.881466688709761e-05, + "loss": 0.34, + "step": 1941 + }, + { + "epoch": 2.794244604316547, + "grad_norm": 0.11152528505935266, + "learning_rate": 3.877449847408342e-05, + "loss": 0.3456, + "step": 1942 + }, + { + "epoch": 2.79568345323741, + "grad_norm": 0.15288234424007455, + "learning_rate": 3.873433129803532e-05, + "loss": 0.3371, + "step": 1943 + }, + { + "epoch": 2.7971223021582734, + "grad_norm": 0.12451350839843649, + "learning_rate": 3.86941653994963e-05, + "loss": 0.3499, + "step": 1944 + }, + { + "epoch": 2.7985611510791366, + "grad_norm": 0.13729679818380444, + "learning_rate": 3.8654000819007974e-05, + "loss": 0.3486, + "step": 1945 + }, + { + "epoch": 2.8, + "grad_norm": 0.13096931728184064, + "learning_rate": 3.8613837597110686e-05, + "loss": 0.3443, + "step": 1946 + }, + { + "epoch": 2.8014388489208635, + "grad_norm": 0.0874959006555631, + "learning_rate": 3.8573675774343356e-05, + "loss": 0.3418, + "step": 1947 + }, + { + "epoch": 2.8028776978417267, + "grad_norm": 0.11067866700964694, + "learning_rate": 3.853351539124355e-05, + "loss": 0.3453, + "step": 1948 + }, + { + "epoch": 2.80431654676259, + "grad_norm": 0.11136162558610203, + "learning_rate": 3.8493356488347345e-05, + "loss": 0.3383, + "step": 1949 + }, + { + "epoch": 2.805755395683453, + "grad_norm": 0.10376991448915758, + "learning_rate": 3.845319910618933e-05, + "loss": 0.3351, + "step": 1950 + }, + { + "epoch": 2.8071942446043163, + "grad_norm": 0.11579359325574481, + "learning_rate": 3.841304328530254e-05, + "loss": 0.3407, + "step": 1951 + }, + { + "epoch": 2.80863309352518, + "grad_norm": 0.10374915093741167, + "learning_rate": 3.83728890662185e-05, + "loss": 0.3392, + "step": 1952 + }, + { + "epoch": 2.810071942446043, + "grad_norm": 0.10029067431414022, + "learning_rate": 3.833273648946704e-05, + "loss": 0.3373, + "step": 1953 + }, + { + "epoch": 2.8115107913669064, + "grad_norm": 0.09402337578438068, + "learning_rate": 3.829258559557635e-05, + "loss": 0.3272, + "step": 1954 + }, + { + "epoch": 2.81294964028777, + "grad_norm": 0.10650566276574056, + "learning_rate": 3.825243642507297e-05, + "loss": 0.3424, + "step": 1955 + }, + { + "epoch": 2.814388489208633, + "grad_norm": 0.10921757363647926, + "learning_rate": 3.8212289018481666e-05, + "loss": 0.3431, + "step": 1956 + }, + { + "epoch": 2.8158273381294965, + "grad_norm": 0.10534874915498878, + "learning_rate": 3.817214341632539e-05, + "loss": 0.3329, + "step": 1957 + }, + { + "epoch": 2.8172661870503597, + "grad_norm": 0.1015999510780134, + "learning_rate": 3.813199965912533e-05, + "loss": 0.3402, + "step": 1958 + }, + { + "epoch": 2.818705035971223, + "grad_norm": 0.09424198994137044, + "learning_rate": 3.80918577874008e-05, + "loss": 0.3428, + "step": 1959 + }, + { + "epoch": 2.8201438848920866, + "grad_norm": 0.10554116905363074, + "learning_rate": 3.8051717841669196e-05, + "loss": 0.3487, + "step": 1960 + }, + { + "epoch": 2.8215827338129498, + "grad_norm": 0.11403157042696999, + "learning_rate": 3.801157986244595e-05, + "loss": 0.3365, + "step": 1961 + }, + { + "epoch": 2.823021582733813, + "grad_norm": 0.10551621664168667, + "learning_rate": 3.7971443890244534e-05, + "loss": 0.3335, + "step": 1962 + }, + { + "epoch": 2.824460431654676, + "grad_norm": 0.10515730375531454, + "learning_rate": 3.7931309965576426e-05, + "loss": 0.3381, + "step": 1963 + }, + { + "epoch": 2.8258992805755394, + "grad_norm": 0.09972936999039739, + "learning_rate": 3.7891178128950975e-05, + "loss": 0.3359, + "step": 1964 + }, + { + "epoch": 2.827338129496403, + "grad_norm": 0.11430428733733113, + "learning_rate": 3.785104842087546e-05, + "loss": 0.3328, + "step": 1965 + }, + { + "epoch": 2.8287769784172663, + "grad_norm": 0.14020430699616854, + "learning_rate": 3.7810920881855016e-05, + "loss": 0.3338, + "step": 1966 + }, + { + "epoch": 2.8302158273381295, + "grad_norm": 0.096686224918284, + "learning_rate": 3.777079555239255e-05, + "loss": 0.335, + "step": 1967 + }, + { + "epoch": 2.8316546762589927, + "grad_norm": 0.16387851406407417, + "learning_rate": 3.77306724729888e-05, + "loss": 0.3459, + "step": 1968 + }, + { + "epoch": 2.833093525179856, + "grad_norm": 0.11753990070533193, + "learning_rate": 3.769055168414215e-05, + "loss": 0.3432, + "step": 1969 + }, + { + "epoch": 2.8345323741007196, + "grad_norm": 0.12735840341028878, + "learning_rate": 3.765043322634877e-05, + "loss": 0.3423, + "step": 1970 + }, + { + "epoch": 2.8359712230215828, + "grad_norm": 0.1329095366969948, + "learning_rate": 3.761031714010239e-05, + "loss": 0.3375, + "step": 1971 + }, + { + "epoch": 2.837410071942446, + "grad_norm": 0.09524702053754186, + "learning_rate": 3.75702034658944e-05, + "loss": 0.3357, + "step": 1972 + }, + { + "epoch": 2.838848920863309, + "grad_norm": 0.11737566541841826, + "learning_rate": 3.753009224421373e-05, + "loss": 0.3498, + "step": 1973 + }, + { + "epoch": 2.8402877697841724, + "grad_norm": 0.09924741895401636, + "learning_rate": 3.748998351554684e-05, + "loss": 0.3335, + "step": 1974 + }, + { + "epoch": 2.841726618705036, + "grad_norm": 0.11945876292369449, + "learning_rate": 3.74498773203777e-05, + "loss": 0.3348, + "step": 1975 + }, + { + "epoch": 2.8431654676258993, + "grad_norm": 0.12159139949878082, + "learning_rate": 3.7409773699187664e-05, + "loss": 0.3384, + "step": 1976 + }, + { + "epoch": 2.8446043165467625, + "grad_norm": 0.10690729286030871, + "learning_rate": 3.736967269245555e-05, + "loss": 0.3345, + "step": 1977 + }, + { + "epoch": 2.846043165467626, + "grad_norm": 0.10519347765975301, + "learning_rate": 3.732957434065751e-05, + "loss": 0.3409, + "step": 1978 + }, + { + "epoch": 2.8474820143884894, + "grad_norm": 0.09875549429217262, + "learning_rate": 3.728947868426701e-05, + "loss": 0.3333, + "step": 1979 + }, + { + "epoch": 2.8489208633093526, + "grad_norm": 0.0877136411088501, + "learning_rate": 3.724938576375479e-05, + "loss": 0.3265, + "step": 1980 + }, + { + "epoch": 2.850359712230216, + "grad_norm": 0.09133539784167514, + "learning_rate": 3.7209295619588856e-05, + "loss": 0.3438, + "step": 1981 + }, + { + "epoch": 2.851798561151079, + "grad_norm": 0.09612159565294272, + "learning_rate": 3.7169208292234395e-05, + "loss": 0.336, + "step": 1982 + }, + { + "epoch": 2.8532374100719426, + "grad_norm": 0.09685530731482227, + "learning_rate": 3.7129123822153746e-05, + "loss": 0.3426, + "step": 1983 + }, + { + "epoch": 2.854676258992806, + "grad_norm": 0.11645804028219138, + "learning_rate": 3.708904224980636e-05, + "loss": 0.3433, + "step": 1984 + }, + { + "epoch": 2.856115107913669, + "grad_norm": 0.09789254210421806, + "learning_rate": 3.704896361564881e-05, + "loss": 0.3403, + "step": 1985 + }, + { + "epoch": 2.8575539568345323, + "grad_norm": 0.09901140761085257, + "learning_rate": 3.700888796013462e-05, + "loss": 0.3358, + "step": 1986 + }, + { + "epoch": 2.8589928057553955, + "grad_norm": 0.11827022633011026, + "learning_rate": 3.696881532371439e-05, + "loss": 0.3344, + "step": 1987 + }, + { + "epoch": 2.860431654676259, + "grad_norm": 0.10327626295170113, + "learning_rate": 3.692874574683562e-05, + "loss": 0.3421, + "step": 1988 + }, + { + "epoch": 2.8618705035971224, + "grad_norm": 0.11279702701966186, + "learning_rate": 3.688867926994274e-05, + "loss": 0.3347, + "step": 1989 + }, + { + "epoch": 2.8633093525179856, + "grad_norm": 0.11634377484743742, + "learning_rate": 3.684861593347705e-05, + "loss": 0.3388, + "step": 1990 + }, + { + "epoch": 2.864748201438849, + "grad_norm": 0.09635632525853179, + "learning_rate": 3.6808555777876673e-05, + "loss": 0.3468, + "step": 1991 + }, + { + "epoch": 2.866187050359712, + "grad_norm": 0.09886557805436955, + "learning_rate": 3.676849884357655e-05, + "loss": 0.3456, + "step": 1992 + }, + { + "epoch": 2.8676258992805757, + "grad_norm": 0.10608790458889766, + "learning_rate": 3.672844517100833e-05, + "loss": 0.3444, + "step": 1993 + }, + { + "epoch": 2.869064748201439, + "grad_norm": 0.09498842901617018, + "learning_rate": 3.66883948006004e-05, + "loss": 0.3399, + "step": 1994 + }, + { + "epoch": 2.870503597122302, + "grad_norm": 0.14887013631600512, + "learning_rate": 3.664834777277777e-05, + "loss": 0.3446, + "step": 1995 + }, + { + "epoch": 2.8719424460431657, + "grad_norm": 0.0959301744637005, + "learning_rate": 3.6608304127962166e-05, + "loss": 0.3471, + "step": 1996 + }, + { + "epoch": 2.873381294964029, + "grad_norm": 0.12416508438264422, + "learning_rate": 3.656826390657179e-05, + "loss": 0.3422, + "step": 1997 + }, + { + "epoch": 2.874820143884892, + "grad_norm": 0.10944433209308456, + "learning_rate": 3.6528227149021455e-05, + "loss": 0.3336, + "step": 1998 + }, + { + "epoch": 2.8762589928057554, + "grad_norm": 0.09549258689455808, + "learning_rate": 3.648819389572248e-05, + "loss": 0.3355, + "step": 1999 + }, + { + "epoch": 2.8776978417266186, + "grad_norm": 0.10323613880773544, + "learning_rate": 3.644816418708261e-05, + "loss": 0.3396, + "step": 2000 + }, + { + "epoch": 2.8791366906474822, + "grad_norm": 0.11161380918729948, + "learning_rate": 3.6408138063506057e-05, + "loss": 0.3422, + "step": 2001 + }, + { + "epoch": 2.8805755395683454, + "grad_norm": 0.09144683037649778, + "learning_rate": 3.636811556539335e-05, + "loss": 0.3373, + "step": 2002 + }, + { + "epoch": 2.8820143884892087, + "grad_norm": 0.10156626895683483, + "learning_rate": 3.6328096733141423e-05, + "loss": 0.3375, + "step": 2003 + }, + { + "epoch": 2.883453237410072, + "grad_norm": 0.10981190963013603, + "learning_rate": 3.6288081607143496e-05, + "loss": 0.3374, + "step": 2004 + }, + { + "epoch": 2.884892086330935, + "grad_norm": 0.09486965282121589, + "learning_rate": 3.6248070227789034e-05, + "loss": 0.3335, + "step": 2005 + }, + { + "epoch": 2.8863309352517987, + "grad_norm": 0.12553572365453994, + "learning_rate": 3.620806263546369e-05, + "loss": 0.3375, + "step": 2006 + }, + { + "epoch": 2.887769784172662, + "grad_norm": 0.08744989482973632, + "learning_rate": 3.6168058870549355e-05, + "loss": 0.3399, + "step": 2007 + }, + { + "epoch": 2.889208633093525, + "grad_norm": 0.10419739270922672, + "learning_rate": 3.612805897342405e-05, + "loss": 0.3376, + "step": 2008 + }, + { + "epoch": 2.8906474820143884, + "grad_norm": 0.11601086378233597, + "learning_rate": 3.608806298446182e-05, + "loss": 0.3413, + "step": 2009 + }, + { + "epoch": 2.8920863309352516, + "grad_norm": 0.10100418652811571, + "learning_rate": 3.604807094403286e-05, + "loss": 0.3369, + "step": 2010 + }, + { + "epoch": 2.8935251798561152, + "grad_norm": 0.09789083730632218, + "learning_rate": 3.6008082892503325e-05, + "loss": 0.3339, + "step": 2011 + }, + { + "epoch": 2.8949640287769784, + "grad_norm": 0.113231870060525, + "learning_rate": 3.596809887023534e-05, + "loss": 0.3426, + "step": 2012 + }, + { + "epoch": 2.8964028776978417, + "grad_norm": 0.08853598945173508, + "learning_rate": 3.5928118917587e-05, + "loss": 0.3417, + "step": 2013 + }, + { + "epoch": 2.897841726618705, + "grad_norm": 0.12189669471820712, + "learning_rate": 3.588814307491227e-05, + "loss": 0.3352, + "step": 2014 + }, + { + "epoch": 2.899280575539568, + "grad_norm": 0.12976402273747667, + "learning_rate": 3.584817138256096e-05, + "loss": 0.3367, + "step": 2015 + }, + { + "epoch": 2.9007194244604317, + "grad_norm": 0.5517604227527648, + "learning_rate": 3.580820388087869e-05, + "loss": 0.3398, + "step": 2016 + }, + { + "epoch": 2.902158273381295, + "grad_norm": 0.11954468702025425, + "learning_rate": 3.5768240610206855e-05, + "loss": 0.3519, + "step": 2017 + }, + { + "epoch": 2.903597122302158, + "grad_norm": 0.12000771579204647, + "learning_rate": 3.572828161088262e-05, + "loss": 0.3386, + "step": 2018 + }, + { + "epoch": 2.905035971223022, + "grad_norm": 0.1160108192635706, + "learning_rate": 3.568832692323876e-05, + "loss": 0.3333, + "step": 2019 + }, + { + "epoch": 2.906474820143885, + "grad_norm": 0.33604009024132314, + "learning_rate": 3.564837658760376e-05, + "loss": 0.3488, + "step": 2020 + }, + { + "epoch": 2.9079136690647482, + "grad_norm": 0.13931330547334667, + "learning_rate": 3.560843064430168e-05, + "loss": 0.3425, + "step": 2021 + }, + { + "epoch": 2.9093525179856115, + "grad_norm": 0.2274196971791949, + "learning_rate": 3.556848913365218e-05, + "loss": 0.3448, + "step": 2022 + }, + { + "epoch": 2.9107913669064747, + "grad_norm": 0.14845749220476806, + "learning_rate": 3.552855209597039e-05, + "loss": 0.3501, + "step": 2023 + }, + { + "epoch": 2.9122302158273383, + "grad_norm": 0.14042284150124493, + "learning_rate": 3.548861957156698e-05, + "loss": 0.3344, + "step": 2024 + }, + { + "epoch": 2.9136690647482015, + "grad_norm": 0.19669862745580163, + "learning_rate": 3.544869160074806e-05, + "loss": 0.3445, + "step": 2025 + }, + { + "epoch": 2.9151079136690647, + "grad_norm": 0.12080075666826642, + "learning_rate": 3.5408768223815105e-05, + "loss": 0.3433, + "step": 2026 + }, + { + "epoch": 2.916546762589928, + "grad_norm": 0.11359617833009475, + "learning_rate": 3.536884948106498e-05, + "loss": 0.3418, + "step": 2027 + }, + { + "epoch": 2.917985611510791, + "grad_norm": 0.1241496184644772, + "learning_rate": 3.532893541278986e-05, + "loss": 0.3449, + "step": 2028 + }, + { + "epoch": 2.919424460431655, + "grad_norm": 0.1252420579703172, + "learning_rate": 3.528902605927722e-05, + "loss": 0.3485, + "step": 2029 + }, + { + "epoch": 2.920863309352518, + "grad_norm": 0.12099295193096923, + "learning_rate": 3.524912146080978e-05, + "loss": 0.3363, + "step": 2030 + }, + { + "epoch": 2.9223021582733812, + "grad_norm": 0.10313434029543694, + "learning_rate": 3.5209221657665436e-05, + "loss": 0.3442, + "step": 2031 + }, + { + "epoch": 2.9237410071942445, + "grad_norm": 0.11331349519355352, + "learning_rate": 3.516932669011723e-05, + "loss": 0.3457, + "step": 2032 + }, + { + "epoch": 2.9251798561151077, + "grad_norm": 0.10827193734464523, + "learning_rate": 3.512943659843337e-05, + "loss": 0.3446, + "step": 2033 + }, + { + "epoch": 2.9266187050359713, + "grad_norm": 0.13373768241898437, + "learning_rate": 3.508955142287714e-05, + "loss": 0.3443, + "step": 2034 + }, + { + "epoch": 2.9280575539568345, + "grad_norm": 0.11755344172389338, + "learning_rate": 3.50496712037068e-05, + "loss": 0.3378, + "step": 2035 + }, + { + "epoch": 2.9294964028776977, + "grad_norm": 0.1092987257661601, + "learning_rate": 3.5009795981175676e-05, + "loss": 0.3451, + "step": 2036 + }, + { + "epoch": 2.9309352517985614, + "grad_norm": 0.13471333476409209, + "learning_rate": 3.496992579553203e-05, + "loss": 0.3414, + "step": 2037 + }, + { + "epoch": 2.9323741007194246, + "grad_norm": 0.09778093965691664, + "learning_rate": 3.4930060687019015e-05, + "loss": 0.3398, + "step": 2038 + }, + { + "epoch": 2.933812949640288, + "grad_norm": 0.11407068505788961, + "learning_rate": 3.489020069587467e-05, + "loss": 0.3382, + "step": 2039 + }, + { + "epoch": 2.935251798561151, + "grad_norm": 0.10556720511828248, + "learning_rate": 3.485034586233192e-05, + "loss": 0.3315, + "step": 2040 + }, + { + "epoch": 2.9366906474820142, + "grad_norm": 0.09627186947222915, + "learning_rate": 3.4810496226618404e-05, + "loss": 0.3393, + "step": 2041 + }, + { + "epoch": 2.938129496402878, + "grad_norm": 0.11039089616509497, + "learning_rate": 3.477065182895656e-05, + "loss": 0.3369, + "step": 2042 + }, + { + "epoch": 2.939568345323741, + "grad_norm": 0.09607262830034538, + "learning_rate": 3.473081270956352e-05, + "loss": 0.3374, + "step": 2043 + }, + { + "epoch": 2.9410071942446043, + "grad_norm": 0.101459526198804, + "learning_rate": 3.469097890865113e-05, + "loss": 0.3401, + "step": 2044 + }, + { + "epoch": 2.9424460431654675, + "grad_norm": 0.11182305048557227, + "learning_rate": 3.465115046642581e-05, + "loss": 0.3394, + "step": 2045 + }, + { + "epoch": 2.9438848920863308, + "grad_norm": 0.10131498840239124, + "learning_rate": 3.461132742308859e-05, + "loss": 0.3389, + "step": 2046 + }, + { + "epoch": 2.9453237410071944, + "grad_norm": 0.0941180917869226, + "learning_rate": 3.45715098188351e-05, + "loss": 0.3368, + "step": 2047 + }, + { + "epoch": 2.9467625899280576, + "grad_norm": 0.10370672140159803, + "learning_rate": 3.453169769385541e-05, + "loss": 0.3326, + "step": 2048 + }, + { + "epoch": 2.948201438848921, + "grad_norm": 0.09591896159588302, + "learning_rate": 3.449189108833409e-05, + "loss": 0.3441, + "step": 2049 + }, + { + "epoch": 2.949640287769784, + "grad_norm": 0.09936462705055035, + "learning_rate": 3.445209004245012e-05, + "loss": 0.3385, + "step": 2050 + }, + { + "epoch": 2.9510791366906473, + "grad_norm": 0.09429491483624323, + "learning_rate": 3.441229459637693e-05, + "loss": 0.3504, + "step": 2051 + }, + { + "epoch": 2.952517985611511, + "grad_norm": 0.09881581153293392, + "learning_rate": 3.4372504790282215e-05, + "loss": 0.3428, + "step": 2052 + }, + { + "epoch": 2.953956834532374, + "grad_norm": 0.09743132407484635, + "learning_rate": 3.4332720664328034e-05, + "loss": 0.3378, + "step": 2053 + }, + { + "epoch": 2.9553956834532373, + "grad_norm": 0.09791054902725754, + "learning_rate": 3.4292942258670675e-05, + "loss": 0.333, + "step": 2054 + }, + { + "epoch": 2.956834532374101, + "grad_norm": 0.10633424241620104, + "learning_rate": 3.425316961346069e-05, + "loss": 0.3512, + "step": 2055 + }, + { + "epoch": 2.9582733812949638, + "grad_norm": 0.09523089276447394, + "learning_rate": 3.42134027688428e-05, + "loss": 0.3449, + "step": 2056 + }, + { + "epoch": 2.9597122302158274, + "grad_norm": 0.10504482623409091, + "learning_rate": 3.417364176495585e-05, + "loss": 0.337, + "step": 2057 + }, + { + "epoch": 2.9611510791366906, + "grad_norm": 0.0910028613044847, + "learning_rate": 3.4133886641932834e-05, + "loss": 0.3352, + "step": 2058 + }, + { + "epoch": 2.962589928057554, + "grad_norm": 0.08738779213644705, + "learning_rate": 3.409413743990076e-05, + "loss": 0.3352, + "step": 2059 + }, + { + "epoch": 2.9640287769784175, + "grad_norm": 0.11240302590903321, + "learning_rate": 3.4054394198980705e-05, + "loss": 0.3358, + "step": 2060 + }, + { + "epoch": 2.9654676258992807, + "grad_norm": 0.0923897668090509, + "learning_rate": 3.401465695928768e-05, + "loss": 0.3525, + "step": 2061 + }, + { + "epoch": 2.966906474820144, + "grad_norm": 0.10130489279016075, + "learning_rate": 3.3974925760930694e-05, + "loss": 0.3336, + "step": 2062 + }, + { + "epoch": 2.968345323741007, + "grad_norm": 0.09116925176685219, + "learning_rate": 3.393520064401264e-05, + "loss": 0.3437, + "step": 2063 + }, + { + "epoch": 2.9697841726618703, + "grad_norm": 0.09357240837963861, + "learning_rate": 3.3895481648630234e-05, + "loss": 0.3397, + "step": 2064 + }, + { + "epoch": 2.971223021582734, + "grad_norm": 0.08921155663052444, + "learning_rate": 3.385576881487405e-05, + "loss": 0.3463, + "step": 2065 + }, + { + "epoch": 2.972661870503597, + "grad_norm": 0.18879817973341476, + "learning_rate": 3.381606218282846e-05, + "loss": 0.3495, + "step": 2066 + }, + { + "epoch": 2.9741007194244604, + "grad_norm": 0.0828229553796737, + "learning_rate": 3.377636179257153e-05, + "loss": 0.34, + "step": 2067 + }, + { + "epoch": 2.9755395683453236, + "grad_norm": 0.0883365063239282, + "learning_rate": 3.373666768417505e-05, + "loss": 0.3385, + "step": 2068 + }, + { + "epoch": 2.976978417266187, + "grad_norm": 0.0843524304405967, + "learning_rate": 3.3696979897704466e-05, + "loss": 0.3367, + "step": 2069 + }, + { + "epoch": 2.9784172661870505, + "grad_norm": 0.08721692620414641, + "learning_rate": 3.3657298473218864e-05, + "loss": 0.3398, + "step": 2070 + }, + { + "epoch": 2.9798561151079137, + "grad_norm": 0.08381150306485105, + "learning_rate": 3.361762345077087e-05, + "loss": 0.3337, + "step": 2071 + }, + { + "epoch": 2.981294964028777, + "grad_norm": 0.08431930869069568, + "learning_rate": 3.3577954870406656e-05, + "loss": 0.3333, + "step": 2072 + }, + { + "epoch": 2.98273381294964, + "grad_norm": 0.09523256103847537, + "learning_rate": 3.3538292772165936e-05, + "loss": 0.3456, + "step": 2073 + }, + { + "epoch": 2.9841726618705033, + "grad_norm": 0.08844468183438381, + "learning_rate": 3.3498637196081825e-05, + "loss": 0.3467, + "step": 2074 + }, + { + "epoch": 2.985611510791367, + "grad_norm": 0.11491416674856708, + "learning_rate": 3.345898818218089e-05, + "loss": 0.3351, + "step": 2075 + }, + { + "epoch": 2.98705035971223, + "grad_norm": 0.0899026646635497, + "learning_rate": 3.341934577048304e-05, + "loss": 0.334, + "step": 2076 + }, + { + "epoch": 2.9884892086330934, + "grad_norm": 0.09680106609239067, + "learning_rate": 3.337971000100157e-05, + "loss": 0.3389, + "step": 2077 + }, + { + "epoch": 2.989928057553957, + "grad_norm": 0.09716428675247937, + "learning_rate": 3.334008091374303e-05, + "loss": 0.3421, + "step": 2078 + }, + { + "epoch": 2.9913669064748203, + "grad_norm": 0.09397649824466278, + "learning_rate": 3.3300458548707214e-05, + "loss": 0.3429, + "step": 2079 + }, + { + "epoch": 2.9928057553956835, + "grad_norm": 0.1032790560661443, + "learning_rate": 3.326084294588721e-05, + "loss": 0.3398, + "step": 2080 + }, + { + "epoch": 2.9942446043165467, + "grad_norm": 0.08806441098719499, + "learning_rate": 3.322123414526917e-05, + "loss": 0.3374, + "step": 2081 + }, + { + "epoch": 2.99568345323741, + "grad_norm": 0.09040602131734916, + "learning_rate": 3.3181632186832485e-05, + "loss": 0.3438, + "step": 2082 + }, + { + "epoch": 2.9971223021582736, + "grad_norm": 0.08450635151958065, + "learning_rate": 3.3142037110549546e-05, + "loss": 0.3374, + "step": 2083 + }, + { + "epoch": 2.998561151079137, + "grad_norm": 0.08672084923982686, + "learning_rate": 3.310244895638587e-05, + "loss": 0.3392, + "step": 2084 + }, + { + "epoch": 3.0, + "grad_norm": 0.11705616335925603, + "learning_rate": 3.306286776429995e-05, + "loss": 0.3239, + "step": 2085 + }, + { + "epoch": 3.001438848920863, + "grad_norm": 0.13793126150799295, + "learning_rate": 3.302329357424326e-05, + "loss": 0.3123, + "step": 2086 + }, + { + "epoch": 3.0028776978417264, + "grad_norm": 0.13603474523355813, + "learning_rate": 3.2983726426160204e-05, + "loss": 0.3217, + "step": 2087 + }, + { + "epoch": 3.00431654676259, + "grad_norm": 0.1856790529153839, + "learning_rate": 3.2944166359988083e-05, + "loss": 0.3162, + "step": 2088 + }, + { + "epoch": 3.0057553956834533, + "grad_norm": 0.1611176916591849, + "learning_rate": 3.290461341565707e-05, + "loss": 0.3148, + "step": 2089 + }, + { + "epoch": 3.0071942446043165, + "grad_norm": 0.16726575474877292, + "learning_rate": 3.286506763309009e-05, + "loss": 0.3167, + "step": 2090 + }, + { + "epoch": 3.0086330935251797, + "grad_norm": 0.15380789186230392, + "learning_rate": 3.2825529052202904e-05, + "loss": 0.3218, + "step": 2091 + }, + { + "epoch": 3.0100719424460434, + "grad_norm": 0.16337070969766343, + "learning_rate": 3.278599771290397e-05, + "loss": 0.3158, + "step": 2092 + }, + { + "epoch": 3.0115107913669066, + "grad_norm": 0.14398184962901214, + "learning_rate": 3.274647365509445e-05, + "loss": 0.3147, + "step": 2093 + }, + { + "epoch": 3.01294964028777, + "grad_norm": 0.11501375024418786, + "learning_rate": 3.2706956918668126e-05, + "loss": 0.309, + "step": 2094 + }, + { + "epoch": 3.014388489208633, + "grad_norm": 0.15656933461861816, + "learning_rate": 3.266744754351144e-05, + "loss": 0.3078, + "step": 2095 + }, + { + "epoch": 3.015827338129496, + "grad_norm": 0.11864428376109526, + "learning_rate": 3.262794556950338e-05, + "loss": 0.3087, + "step": 2096 + }, + { + "epoch": 3.01726618705036, + "grad_norm": 0.1068241730293408, + "learning_rate": 3.2588451036515435e-05, + "loss": 0.3012, + "step": 2097 + }, + { + "epoch": 3.018705035971223, + "grad_norm": 0.11991144173205176, + "learning_rate": 3.2548963984411623e-05, + "loss": 0.3064, + "step": 2098 + }, + { + "epoch": 3.0201438848920863, + "grad_norm": 0.11295063162718329, + "learning_rate": 3.2509484453048413e-05, + "loss": 0.3069, + "step": 2099 + }, + { + "epoch": 3.0215827338129495, + "grad_norm": 0.10013282575374777, + "learning_rate": 3.247001248227465e-05, + "loss": 0.3229, + "step": 2100 + }, + { + "epoch": 3.023021582733813, + "grad_norm": 0.12506616455815658, + "learning_rate": 3.2430548111931574e-05, + "loss": 0.3089, + "step": 2101 + }, + { + "epoch": 3.0244604316546764, + "grad_norm": 0.10517337087645212, + "learning_rate": 3.239109138185275e-05, + "loss": 0.3069, + "step": 2102 + }, + { + "epoch": 3.0258992805755396, + "grad_norm": 0.10493068657974713, + "learning_rate": 3.2351642331864024e-05, + "loss": 0.2984, + "step": 2103 + }, + { + "epoch": 3.027338129496403, + "grad_norm": 0.11380836450829623, + "learning_rate": 3.2312201001783473e-05, + "loss": 0.3162, + "step": 2104 + }, + { + "epoch": 3.028776978417266, + "grad_norm": 0.10722577915109091, + "learning_rate": 3.2272767431421416e-05, + "loss": 0.31, + "step": 2105 + }, + { + "epoch": 3.0302158273381297, + "grad_norm": 0.12162362941969909, + "learning_rate": 3.2233341660580335e-05, + "loss": 0.3083, + "step": 2106 + }, + { + "epoch": 3.031654676258993, + "grad_norm": 0.11168047128844485, + "learning_rate": 3.219392372905482e-05, + "loss": 0.3106, + "step": 2107 + }, + { + "epoch": 3.033093525179856, + "grad_norm": 0.12843621459373988, + "learning_rate": 3.215451367663156e-05, + "loss": 0.3081, + "step": 2108 + }, + { + "epoch": 3.0345323741007193, + "grad_norm": 0.09702141129244819, + "learning_rate": 3.211511154308927e-05, + "loss": 0.3084, + "step": 2109 + }, + { + "epoch": 3.0359712230215825, + "grad_norm": 0.12510881961189294, + "learning_rate": 3.207571736819873e-05, + "loss": 0.3097, + "step": 2110 + }, + { + "epoch": 3.037410071942446, + "grad_norm": 0.08843205023499215, + "learning_rate": 3.203633119172262e-05, + "loss": 0.313, + "step": 2111 + }, + { + "epoch": 3.0388489208633094, + "grad_norm": 0.10911141103303929, + "learning_rate": 3.1996953053415575e-05, + "loss": 0.3062, + "step": 2112 + }, + { + "epoch": 3.0402877697841726, + "grad_norm": 0.11128508018839607, + "learning_rate": 3.1957582993024135e-05, + "loss": 0.3032, + "step": 2113 + }, + { + "epoch": 3.041726618705036, + "grad_norm": 0.2589859061373978, + "learning_rate": 3.191822105028665e-05, + "loss": 0.3115, + "step": 2114 + }, + { + "epoch": 3.0431654676258995, + "grad_norm": 0.10701641516972452, + "learning_rate": 3.1878867264933305e-05, + "loss": 0.3222, + "step": 2115 + }, + { + "epoch": 3.0446043165467627, + "grad_norm": 0.10343551330065237, + "learning_rate": 3.1839521676686026e-05, + "loss": 0.3129, + "step": 2116 + }, + { + "epoch": 3.046043165467626, + "grad_norm": 0.09286316473328367, + "learning_rate": 3.1800184325258494e-05, + "loss": 0.31, + "step": 2117 + }, + { + "epoch": 3.047482014388489, + "grad_norm": 0.10606551785976652, + "learning_rate": 3.176085525035607e-05, + "loss": 0.3084, + "step": 2118 + }, + { + "epoch": 3.0489208633093523, + "grad_norm": 0.09781749171984888, + "learning_rate": 3.172153449167574e-05, + "loss": 0.3184, + "step": 2119 + }, + { + "epoch": 3.050359712230216, + "grad_norm": 0.10422034929359225, + "learning_rate": 3.1682222088906096e-05, + "loss": 0.3118, + "step": 2120 + }, + { + "epoch": 3.051798561151079, + "grad_norm": 0.08864611099382119, + "learning_rate": 3.1642918081727327e-05, + "loss": 0.3197, + "step": 2121 + }, + { + "epoch": 3.0532374100719424, + "grad_norm": 0.36668073828222714, + "learning_rate": 3.1603622509811144e-05, + "loss": 0.3119, + "step": 2122 + }, + { + "epoch": 3.0546762589928056, + "grad_norm": 0.10341831271326356, + "learning_rate": 3.156433541282069e-05, + "loss": 0.3085, + "step": 2123 + }, + { + "epoch": 3.0561151079136692, + "grad_norm": 0.10780961775187786, + "learning_rate": 3.152505683041062e-05, + "loss": 0.3123, + "step": 2124 + }, + { + "epoch": 3.0575539568345325, + "grad_norm": 0.12743920608580703, + "learning_rate": 3.1485786802226976e-05, + "loss": 0.3131, + "step": 2125 + }, + { + "epoch": 3.0589928057553957, + "grad_norm": 0.09614525287762177, + "learning_rate": 3.1446525367907134e-05, + "loss": 0.3159, + "step": 2126 + }, + { + "epoch": 3.060431654676259, + "grad_norm": 0.1015553385159086, + "learning_rate": 3.1407272567079834e-05, + "loss": 0.3121, + "step": 2127 + }, + { + "epoch": 3.061870503597122, + "grad_norm": 0.10752800721482238, + "learning_rate": 3.136802843936509e-05, + "loss": 0.312, + "step": 2128 + }, + { + "epoch": 3.0633093525179858, + "grad_norm": 0.19324908126515103, + "learning_rate": 3.132879302437416e-05, + "loss": 0.3131, + "step": 2129 + }, + { + "epoch": 3.064748201438849, + "grad_norm": 0.09708957563754647, + "learning_rate": 3.128956636170949e-05, + "loss": 0.3101, + "step": 2130 + }, + { + "epoch": 3.066187050359712, + "grad_norm": 0.09301531792483879, + "learning_rate": 3.125034849096471e-05, + "loss": 0.3059, + "step": 2131 + }, + { + "epoch": 3.0676258992805754, + "grad_norm": 0.10400184737956308, + "learning_rate": 3.1211139451724605e-05, + "loss": 0.3119, + "step": 2132 + }, + { + "epoch": 3.069064748201439, + "grad_norm": 0.10684526359696246, + "learning_rate": 3.1171939283564986e-05, + "loss": 0.3095, + "step": 2133 + }, + { + "epoch": 3.0705035971223023, + "grad_norm": 0.10738487120925314, + "learning_rate": 3.113274802605276e-05, + "loss": 0.3006, + "step": 2134 + }, + { + "epoch": 3.0719424460431655, + "grad_norm": 0.0810562667527916, + "learning_rate": 3.109356571874579e-05, + "loss": 0.3096, + "step": 2135 + }, + { + "epoch": 3.0733812949640287, + "grad_norm": 0.268649406492071, + "learning_rate": 3.105439240119296e-05, + "loss": 0.321, + "step": 2136 + }, + { + "epoch": 3.074820143884892, + "grad_norm": 0.10376615835087384, + "learning_rate": 3.101522811293405e-05, + "loss": 0.3194, + "step": 2137 + }, + { + "epoch": 3.0762589928057555, + "grad_norm": 0.10463073075381597, + "learning_rate": 3.0976072893499724e-05, + "loss": 0.3184, + "step": 2138 + }, + { + "epoch": 3.0776978417266188, + "grad_norm": 0.09668657967871916, + "learning_rate": 3.093692678241151e-05, + "loss": 0.3092, + "step": 2139 + }, + { + "epoch": 3.079136690647482, + "grad_norm": 0.10680206670349111, + "learning_rate": 3.0897789819181715e-05, + "loss": 0.3103, + "step": 2140 + }, + { + "epoch": 3.080575539568345, + "grad_norm": 0.0996584820848933, + "learning_rate": 3.0858662043313456e-05, + "loss": 0.3055, + "step": 2141 + }, + { + "epoch": 3.082014388489209, + "grad_norm": 0.12307212788294597, + "learning_rate": 3.081954349430051e-05, + "loss": 0.3091, + "step": 2142 + }, + { + "epoch": 3.083453237410072, + "grad_norm": 0.08558726684483627, + "learning_rate": 3.0780434211627415e-05, + "loss": 0.3131, + "step": 2143 + }, + { + "epoch": 3.0848920863309353, + "grad_norm": 0.1172950866722576, + "learning_rate": 3.074133423476932e-05, + "loss": 0.3156, + "step": 2144 + }, + { + "epoch": 3.0863309352517985, + "grad_norm": 0.10469453062047866, + "learning_rate": 3.070224360319197e-05, + "loss": 0.3218, + "step": 2145 + }, + { + "epoch": 3.0877697841726617, + "grad_norm": 0.0925454967053722, + "learning_rate": 3.066316235635168e-05, + "loss": 0.3041, + "step": 2146 + }, + { + "epoch": 3.0892086330935253, + "grad_norm": 0.10145715448704891, + "learning_rate": 3.0624090533695324e-05, + "loss": 0.3158, + "step": 2147 + }, + { + "epoch": 3.0906474820143885, + "grad_norm": 0.11844817927214425, + "learning_rate": 3.0585028174660236e-05, + "loss": 0.3095, + "step": 2148 + }, + { + "epoch": 3.0920863309352518, + "grad_norm": 0.09077288092441493, + "learning_rate": 3.054597531867419e-05, + "loss": 0.3059, + "step": 2149 + }, + { + "epoch": 3.093525179856115, + "grad_norm": 0.11748236066697641, + "learning_rate": 3.0506932005155407e-05, + "loss": 0.3081, + "step": 2150 + }, + { + "epoch": 3.0949640287769786, + "grad_norm": 0.11190838616637672, + "learning_rate": 3.0467898273512446e-05, + "loss": 0.3197, + "step": 2151 + }, + { + "epoch": 3.096402877697842, + "grad_norm": 0.11697295478129117, + "learning_rate": 3.042887416314418e-05, + "loss": 0.3155, + "step": 2152 + }, + { + "epoch": 3.097841726618705, + "grad_norm": 0.10233140792591265, + "learning_rate": 3.03898597134398e-05, + "loss": 0.3091, + "step": 2153 + }, + { + "epoch": 3.0992805755395683, + "grad_norm": 0.09267504676657475, + "learning_rate": 3.0350854963778755e-05, + "loss": 0.3126, + "step": 2154 + }, + { + "epoch": 3.1007194244604315, + "grad_norm": 0.11576708311909643, + "learning_rate": 3.0311859953530672e-05, + "loss": 0.3073, + "step": 2155 + }, + { + "epoch": 3.102158273381295, + "grad_norm": 0.0879816111435866, + "learning_rate": 3.027287472205535e-05, + "loss": 0.3064, + "step": 2156 + }, + { + "epoch": 3.1035971223021583, + "grad_norm": 0.09573994059522453, + "learning_rate": 3.0233899308702722e-05, + "loss": 0.3138, + "step": 2157 + }, + { + "epoch": 3.1050359712230216, + "grad_norm": 0.11650012965328882, + "learning_rate": 3.0194933752812853e-05, + "loss": 0.3111, + "step": 2158 + }, + { + "epoch": 3.1064748201438848, + "grad_norm": 0.0832860465680782, + "learning_rate": 3.0155978093715787e-05, + "loss": 0.3195, + "step": 2159 + }, + { + "epoch": 3.1079136690647484, + "grad_norm": 0.10908168360989089, + "learning_rate": 3.011703237073162e-05, + "loss": 0.3075, + "step": 2160 + }, + { + "epoch": 3.1093525179856116, + "grad_norm": 0.10251962926405986, + "learning_rate": 3.0078096623170442e-05, + "loss": 0.306, + "step": 2161 + }, + { + "epoch": 3.110791366906475, + "grad_norm": 0.08614700299827016, + "learning_rate": 3.0039170890332214e-05, + "loss": 0.3146, + "step": 2162 + }, + { + "epoch": 3.112230215827338, + "grad_norm": 0.10245047075815311, + "learning_rate": 3.0000255211506836e-05, + "loss": 0.3077, + "step": 2163 + }, + { + "epoch": 3.1136690647482013, + "grad_norm": 0.09054934591074866, + "learning_rate": 2.9961349625974022e-05, + "loss": 0.3114, + "step": 2164 + }, + { + "epoch": 3.115107913669065, + "grad_norm": 0.08382115872482102, + "learning_rate": 2.992245417300335e-05, + "loss": 0.3032, + "step": 2165 + }, + { + "epoch": 3.116546762589928, + "grad_norm": 0.09618357746864291, + "learning_rate": 2.9883568891854118e-05, + "loss": 0.3133, + "step": 2166 + }, + { + "epoch": 3.1179856115107913, + "grad_norm": 0.07157257749831544, + "learning_rate": 2.9844693821775394e-05, + "loss": 0.3151, + "step": 2167 + }, + { + "epoch": 3.1194244604316546, + "grad_norm": 0.09815562942238666, + "learning_rate": 2.9805829002005907e-05, + "loss": 0.3157, + "step": 2168 + }, + { + "epoch": 3.1208633093525178, + "grad_norm": 0.10227029481953566, + "learning_rate": 2.9766974471774072e-05, + "loss": 0.3091, + "step": 2169 + }, + { + "epoch": 3.1223021582733814, + "grad_norm": 0.19938457580233848, + "learning_rate": 2.9728130270297913e-05, + "loss": 0.3076, + "step": 2170 + }, + { + "epoch": 3.1237410071942446, + "grad_norm": 0.0938557233143553, + "learning_rate": 2.968929643678499e-05, + "loss": 0.3172, + "step": 2171 + }, + { + "epoch": 3.125179856115108, + "grad_norm": 0.08048511224507746, + "learning_rate": 2.965047301043246e-05, + "loss": 0.3129, + "step": 2172 + }, + { + "epoch": 3.126618705035971, + "grad_norm": 0.10823148930623065, + "learning_rate": 2.961166003042692e-05, + "loss": 0.3188, + "step": 2173 + }, + { + "epoch": 3.1280575539568347, + "grad_norm": 0.07990693174638736, + "learning_rate": 2.9572857535944473e-05, + "loss": 0.3151, + "step": 2174 + }, + { + "epoch": 3.129496402877698, + "grad_norm": 0.07803203834549868, + "learning_rate": 2.9534065566150567e-05, + "loss": 0.3054, + "step": 2175 + }, + { + "epoch": 3.130935251798561, + "grad_norm": 0.07812299012768215, + "learning_rate": 2.9495284160200105e-05, + "loss": 0.3145, + "step": 2176 + }, + { + "epoch": 3.1323741007194243, + "grad_norm": 0.07117217556524316, + "learning_rate": 2.9456513357237305e-05, + "loss": 0.31, + "step": 2177 + }, + { + "epoch": 3.133812949640288, + "grad_norm": 0.08353609633978562, + "learning_rate": 2.9417753196395637e-05, + "loss": 0.319, + "step": 2178 + }, + { + "epoch": 3.135251798561151, + "grad_norm": 0.09055984034536244, + "learning_rate": 2.9379003716797877e-05, + "loss": 0.321, + "step": 2179 + }, + { + "epoch": 3.1366906474820144, + "grad_norm": 0.08081929164201893, + "learning_rate": 2.9340264957556018e-05, + "loss": 0.3108, + "step": 2180 + }, + { + "epoch": 3.1381294964028776, + "grad_norm": 0.09007488058816597, + "learning_rate": 2.9301536957771218e-05, + "loss": 0.3141, + "step": 2181 + }, + { + "epoch": 3.139568345323741, + "grad_norm": 0.08158773384180579, + "learning_rate": 2.9262819756533754e-05, + "loss": 0.303, + "step": 2182 + }, + { + "epoch": 3.1410071942446045, + "grad_norm": 0.0887930878270568, + "learning_rate": 2.922411339292306e-05, + "loss": 0.3187, + "step": 2183 + }, + { + "epoch": 3.1424460431654677, + "grad_norm": 0.09076889185748031, + "learning_rate": 2.9185417906007586e-05, + "loss": 0.3075, + "step": 2184 + }, + { + "epoch": 3.143884892086331, + "grad_norm": 0.09010997903238532, + "learning_rate": 2.914673333484481e-05, + "loss": 0.3153, + "step": 2185 + }, + { + "epoch": 3.145323741007194, + "grad_norm": 0.08601074144589695, + "learning_rate": 2.9108059718481184e-05, + "loss": 0.3136, + "step": 2186 + }, + { + "epoch": 3.1467625899280574, + "grad_norm": 0.09473825701510877, + "learning_rate": 2.906939709595216e-05, + "loss": 0.3099, + "step": 2187 + }, + { + "epoch": 3.148201438848921, + "grad_norm": 0.07940853642607197, + "learning_rate": 2.9030745506282017e-05, + "loss": 0.3081, + "step": 2188 + }, + { + "epoch": 3.149640287769784, + "grad_norm": 0.09176280731150839, + "learning_rate": 2.8992104988483943e-05, + "loss": 0.3048, + "step": 2189 + }, + { + "epoch": 3.1510791366906474, + "grad_norm": 0.08137132079708974, + "learning_rate": 2.895347558155992e-05, + "loss": 0.3106, + "step": 2190 + }, + { + "epoch": 3.1525179856115106, + "grad_norm": 0.08813337885140969, + "learning_rate": 2.8914857324500767e-05, + "loss": 0.3152, + "step": 2191 + }, + { + "epoch": 3.1539568345323743, + "grad_norm": 0.07592244668519561, + "learning_rate": 2.887625025628599e-05, + "loss": 0.3064, + "step": 2192 + }, + { + "epoch": 3.1553956834532375, + "grad_norm": 0.10560558598349411, + "learning_rate": 2.8837654415883817e-05, + "loss": 0.3116, + "step": 2193 + }, + { + "epoch": 3.1568345323741007, + "grad_norm": 0.0755960624360644, + "learning_rate": 2.879906984225119e-05, + "loss": 0.3165, + "step": 2194 + }, + { + "epoch": 3.158273381294964, + "grad_norm": 0.09732919238129134, + "learning_rate": 2.8760496574333613e-05, + "loss": 0.3184, + "step": 2195 + }, + { + "epoch": 3.159712230215827, + "grad_norm": 0.07449648470746753, + "learning_rate": 2.8721934651065227e-05, + "loss": 0.3145, + "step": 2196 + }, + { + "epoch": 3.161151079136691, + "grad_norm": 0.0846053364879296, + "learning_rate": 2.8683384111368675e-05, + "loss": 0.3028, + "step": 2197 + }, + { + "epoch": 3.162589928057554, + "grad_norm": 0.08346396573559035, + "learning_rate": 2.864484499415517e-05, + "loss": 0.3074, + "step": 2198 + }, + { + "epoch": 3.1640287769784172, + "grad_norm": 0.09412025107338536, + "learning_rate": 2.8606317338324347e-05, + "loss": 0.3094, + "step": 2199 + }, + { + "epoch": 3.1654676258992804, + "grad_norm": 0.07562893334341883, + "learning_rate": 2.856780118276429e-05, + "loss": 0.3208, + "step": 2200 + }, + { + "epoch": 3.166906474820144, + "grad_norm": 0.08737397019390829, + "learning_rate": 2.852929656635146e-05, + "loss": 0.3107, + "step": 2201 + }, + { + "epoch": 3.1683453237410073, + "grad_norm": 0.08921765424618087, + "learning_rate": 2.8490803527950706e-05, + "loss": 0.3095, + "step": 2202 + }, + { + "epoch": 3.1697841726618705, + "grad_norm": 0.07644433162930088, + "learning_rate": 2.845232210641517e-05, + "loss": 0.3124, + "step": 2203 + }, + { + "epoch": 3.1712230215827337, + "grad_norm": 0.10046560790987005, + "learning_rate": 2.841385234058624e-05, + "loss": 0.3096, + "step": 2204 + }, + { + "epoch": 3.172661870503597, + "grad_norm": 0.09749723028751422, + "learning_rate": 2.83753942692936e-05, + "loss": 0.3159, + "step": 2205 + }, + { + "epoch": 3.1741007194244606, + "grad_norm": 0.07485393350887619, + "learning_rate": 2.8336947931355096e-05, + "loss": 0.3111, + "step": 2206 + }, + { + "epoch": 3.175539568345324, + "grad_norm": 0.08927179298519106, + "learning_rate": 2.8298513365576715e-05, + "loss": 0.3094, + "step": 2207 + }, + { + "epoch": 3.176978417266187, + "grad_norm": 0.3885049074315983, + "learning_rate": 2.826009061075257e-05, + "loss": 0.3207, + "step": 2208 + }, + { + "epoch": 3.1784172661870502, + "grad_norm": 0.09307844740534896, + "learning_rate": 2.822167970566488e-05, + "loss": 0.3, + "step": 2209 + }, + { + "epoch": 3.1798561151079134, + "grad_norm": 0.09350080081395308, + "learning_rate": 2.8183280689083895e-05, + "loss": 0.3101, + "step": 2210 + }, + { + "epoch": 3.181294964028777, + "grad_norm": 0.10753781473299533, + "learning_rate": 2.8144893599767828e-05, + "loss": 0.3042, + "step": 2211 + }, + { + "epoch": 3.1827338129496403, + "grad_norm": 0.09175164298439005, + "learning_rate": 2.8106518476462886e-05, + "loss": 0.3059, + "step": 2212 + }, + { + "epoch": 3.1841726618705035, + "grad_norm": 0.09216657565504674, + "learning_rate": 2.806815535790321e-05, + "loss": 0.3107, + "step": 2213 + }, + { + "epoch": 3.1856115107913667, + "grad_norm": 0.09845501579488738, + "learning_rate": 2.8029804282810794e-05, + "loss": 0.3051, + "step": 2214 + }, + { + "epoch": 3.1870503597122304, + "grad_norm": 0.08435034016576407, + "learning_rate": 2.7991465289895497e-05, + "loss": 0.3169, + "step": 2215 + }, + { + "epoch": 3.1884892086330936, + "grad_norm": 0.10859317166691841, + "learning_rate": 2.7953138417854952e-05, + "loss": 0.3105, + "step": 2216 + }, + { + "epoch": 3.189928057553957, + "grad_norm": 0.08268118205133658, + "learning_rate": 2.79148237053746e-05, + "loss": 0.3147, + "step": 2217 + }, + { + "epoch": 3.19136690647482, + "grad_norm": 0.10477342417518032, + "learning_rate": 2.787652119112758e-05, + "loss": 0.3103, + "step": 2218 + }, + { + "epoch": 3.1928057553956837, + "grad_norm": 0.0873830025004209, + "learning_rate": 2.783823091377472e-05, + "loss": 0.3114, + "step": 2219 + }, + { + "epoch": 3.194244604316547, + "grad_norm": 0.08259462797161037, + "learning_rate": 2.7799952911964535e-05, + "loss": 0.3081, + "step": 2220 + }, + { + "epoch": 3.19568345323741, + "grad_norm": 0.09863762634723984, + "learning_rate": 2.776168722433308e-05, + "loss": 0.3188, + "step": 2221 + }, + { + "epoch": 3.1971223021582733, + "grad_norm": 0.08376842485912665, + "learning_rate": 2.7723433889504046e-05, + "loss": 0.305, + "step": 2222 + }, + { + "epoch": 3.1985611510791365, + "grad_norm": 0.07444689807045426, + "learning_rate": 2.7685192946088597e-05, + "loss": 0.3088, + "step": 2223 + }, + { + "epoch": 3.2, + "grad_norm": 0.08472495106391134, + "learning_rate": 2.7646964432685456e-05, + "loss": 0.3073, + "step": 2224 + }, + { + "epoch": 3.2014388489208634, + "grad_norm": 0.07662930328782688, + "learning_rate": 2.7608748387880754e-05, + "loss": 0.3132, + "step": 2225 + }, + { + "epoch": 3.2028776978417266, + "grad_norm": 0.09687190373515468, + "learning_rate": 2.7570544850248047e-05, + "loss": 0.3097, + "step": 2226 + }, + { + "epoch": 3.20431654676259, + "grad_norm": 0.09810302622840962, + "learning_rate": 2.753235385834824e-05, + "loss": 0.3174, + "step": 2227 + }, + { + "epoch": 3.205755395683453, + "grad_norm": 0.10861137611650165, + "learning_rate": 2.749417545072964e-05, + "loss": 0.3147, + "step": 2228 + }, + { + "epoch": 3.2071942446043167, + "grad_norm": 0.11598410975855965, + "learning_rate": 2.7456009665927807e-05, + "loss": 0.3123, + "step": 2229 + }, + { + "epoch": 3.20863309352518, + "grad_norm": 0.0754146028025198, + "learning_rate": 2.741785654246555e-05, + "loss": 0.3116, + "step": 2230 + }, + { + "epoch": 3.210071942446043, + "grad_norm": 0.09958175259270793, + "learning_rate": 2.7379716118852927e-05, + "loss": 0.319, + "step": 2231 + }, + { + "epoch": 3.2115107913669063, + "grad_norm": 0.08298289356877014, + "learning_rate": 2.734158843358718e-05, + "loss": 0.3021, + "step": 2232 + }, + { + "epoch": 3.21294964028777, + "grad_norm": 0.07874592189015894, + "learning_rate": 2.730347352515266e-05, + "loss": 0.3162, + "step": 2233 + }, + { + "epoch": 3.214388489208633, + "grad_norm": 0.10664889226983967, + "learning_rate": 2.7265371432020836e-05, + "loss": 0.3116, + "step": 2234 + }, + { + "epoch": 3.2158273381294964, + "grad_norm": 0.06975698260073791, + "learning_rate": 2.7227282192650258e-05, + "loss": 0.3154, + "step": 2235 + }, + { + "epoch": 3.2172661870503596, + "grad_norm": 0.10253930117577448, + "learning_rate": 2.7189205845486503e-05, + "loss": 0.3164, + "step": 2236 + }, + { + "epoch": 3.218705035971223, + "grad_norm": 0.08077061379947138, + "learning_rate": 2.7151142428962103e-05, + "loss": 0.3145, + "step": 2237 + }, + { + "epoch": 3.2201438848920865, + "grad_norm": 0.10455540840609091, + "learning_rate": 2.711309198149655e-05, + "loss": 0.3078, + "step": 2238 + }, + { + "epoch": 3.2215827338129497, + "grad_norm": 0.10720051510695476, + "learning_rate": 2.7075054541496296e-05, + "loss": 0.3096, + "step": 2239 + }, + { + "epoch": 3.223021582733813, + "grad_norm": 0.08730198774992833, + "learning_rate": 2.7037030147354582e-05, + "loss": 0.3079, + "step": 2240 + }, + { + "epoch": 3.224460431654676, + "grad_norm": 0.08808245280975563, + "learning_rate": 2.6999018837451523e-05, + "loss": 0.3086, + "step": 2241 + }, + { + "epoch": 3.2258992805755398, + "grad_norm": 0.09690990476431383, + "learning_rate": 2.6961020650154057e-05, + "loss": 0.3122, + "step": 2242 + }, + { + "epoch": 3.227338129496403, + "grad_norm": 0.07541941606724827, + "learning_rate": 2.6923035623815824e-05, + "loss": 0.3149, + "step": 2243 + }, + { + "epoch": 3.228776978417266, + "grad_norm": 0.08953075238235308, + "learning_rate": 2.6885063796777195e-05, + "loss": 0.3109, + "step": 2244 + }, + { + "epoch": 3.2302158273381294, + "grad_norm": 0.0756041493121849, + "learning_rate": 2.6847105207365225e-05, + "loss": 0.3115, + "step": 2245 + }, + { + "epoch": 3.2316546762589926, + "grad_norm": 0.08150949209268828, + "learning_rate": 2.6809159893893624e-05, + "loss": 0.3072, + "step": 2246 + }, + { + "epoch": 3.2330935251798563, + "grad_norm": 0.07674594542794348, + "learning_rate": 2.6771227894662666e-05, + "loss": 0.3066, + "step": 2247 + }, + { + "epoch": 3.2345323741007195, + "grad_norm": 0.08437215838706465, + "learning_rate": 2.6733309247959217e-05, + "loss": 0.3084, + "step": 2248 + }, + { + "epoch": 3.2359712230215827, + "grad_norm": 0.07488814476800476, + "learning_rate": 2.669540399205664e-05, + "loss": 0.3109, + "step": 2249 + }, + { + "epoch": 3.237410071942446, + "grad_norm": 0.08477463075152412, + "learning_rate": 2.6657512165214806e-05, + "loss": 0.3081, + "step": 2250 + }, + { + "epoch": 3.2388489208633096, + "grad_norm": 0.08480300227796347, + "learning_rate": 2.6619633805680028e-05, + "loss": 0.3151, + "step": 2251 + }, + { + "epoch": 3.2402877697841728, + "grad_norm": 0.08417307835689034, + "learning_rate": 2.6581768951684992e-05, + "loss": 0.3126, + "step": 2252 + }, + { + "epoch": 3.241726618705036, + "grad_norm": 0.0869429807242692, + "learning_rate": 2.6543917641448813e-05, + "loss": 0.3104, + "step": 2253 + }, + { + "epoch": 3.243165467625899, + "grad_norm": 0.083339481220526, + "learning_rate": 2.650607991317687e-05, + "loss": 0.3093, + "step": 2254 + }, + { + "epoch": 3.2446043165467624, + "grad_norm": 0.0841705362687117, + "learning_rate": 2.6468255805060885e-05, + "loss": 0.3182, + "step": 2255 + }, + { + "epoch": 3.246043165467626, + "grad_norm": 0.07847401258381193, + "learning_rate": 2.6430445355278788e-05, + "loss": 0.3175, + "step": 2256 + }, + { + "epoch": 3.2474820143884893, + "grad_norm": 0.07747532307100845, + "learning_rate": 2.639264860199477e-05, + "loss": 0.3108, + "step": 2257 + }, + { + "epoch": 3.2489208633093525, + "grad_norm": 0.07087934641843581, + "learning_rate": 2.6354865583359175e-05, + "loss": 0.3105, + "step": 2258 + }, + { + "epoch": 3.2503597122302157, + "grad_norm": 0.08010150707605394, + "learning_rate": 2.631709633750847e-05, + "loss": 0.3029, + "step": 2259 + }, + { + "epoch": 3.2517985611510793, + "grad_norm": 0.06538151949892566, + "learning_rate": 2.6279340902565217e-05, + "loss": 0.3103, + "step": 2260 + }, + { + "epoch": 3.2532374100719426, + "grad_norm": 0.08170052188335858, + "learning_rate": 2.6241599316638084e-05, + "loss": 0.3014, + "step": 2261 + }, + { + "epoch": 3.2546762589928058, + "grad_norm": 0.0817078451653208, + "learning_rate": 2.6203871617821717e-05, + "loss": 0.3194, + "step": 2262 + }, + { + "epoch": 3.256115107913669, + "grad_norm": 0.07909170810469295, + "learning_rate": 2.6166157844196755e-05, + "loss": 0.3103, + "step": 2263 + }, + { + "epoch": 3.257553956834532, + "grad_norm": 0.08352539220044815, + "learning_rate": 2.6128458033829792e-05, + "loss": 0.3139, + "step": 2264 + }, + { + "epoch": 3.258992805755396, + "grad_norm": 0.07913451837672723, + "learning_rate": 2.609077222477332e-05, + "loss": 0.3147, + "step": 2265 + }, + { + "epoch": 3.260431654676259, + "grad_norm": 0.08618944966170508, + "learning_rate": 2.6053100455065693e-05, + "loss": 0.3123, + "step": 2266 + }, + { + "epoch": 3.2618705035971223, + "grad_norm": 0.07266722689750256, + "learning_rate": 2.6015442762731095e-05, + "loss": 0.3152, + "step": 2267 + }, + { + "epoch": 3.2633093525179855, + "grad_norm": 0.09435052405815617, + "learning_rate": 2.5977799185779534e-05, + "loss": 0.3127, + "step": 2268 + }, + { + "epoch": 3.2647482014388487, + "grad_norm": 0.07199685293057449, + "learning_rate": 2.5940169762206722e-05, + "loss": 0.3027, + "step": 2269 + }, + { + "epoch": 3.2661870503597124, + "grad_norm": 0.08625337724152561, + "learning_rate": 2.5902554529994105e-05, + "loss": 0.3144, + "step": 2270 + }, + { + "epoch": 3.2676258992805756, + "grad_norm": 0.08384084708818604, + "learning_rate": 2.5864953527108805e-05, + "loss": 0.3091, + "step": 2271 + }, + { + "epoch": 3.2690647482014388, + "grad_norm": 0.09045487677998157, + "learning_rate": 2.58273667915036e-05, + "loss": 0.3151, + "step": 2272 + }, + { + "epoch": 3.270503597122302, + "grad_norm": 0.08550608784459429, + "learning_rate": 2.578979436111684e-05, + "loss": 0.3069, + "step": 2273 + }, + { + "epoch": 3.2719424460431656, + "grad_norm": 0.07644305283410271, + "learning_rate": 2.5752236273872432e-05, + "loss": 0.3089, + "step": 2274 + }, + { + "epoch": 3.273381294964029, + "grad_norm": 0.08672218328853597, + "learning_rate": 2.5714692567679853e-05, + "loss": 0.3108, + "step": 2275 + }, + { + "epoch": 3.274820143884892, + "grad_norm": 0.0858839644826738, + "learning_rate": 2.5677163280433995e-05, + "loss": 0.3085, + "step": 2276 + }, + { + "epoch": 3.2762589928057553, + "grad_norm": 0.07779820348974278, + "learning_rate": 2.5639648450015268e-05, + "loss": 0.3105, + "step": 2277 + }, + { + "epoch": 3.277697841726619, + "grad_norm": 0.09608787847046683, + "learning_rate": 2.5602148114289415e-05, + "loss": 0.3106, + "step": 2278 + }, + { + "epoch": 3.279136690647482, + "grad_norm": 0.08810889838597738, + "learning_rate": 2.556466231110762e-05, + "loss": 0.3056, + "step": 2279 + }, + { + "epoch": 3.2805755395683454, + "grad_norm": 0.07600342745886936, + "learning_rate": 2.552719107830635e-05, + "loss": 0.3064, + "step": 2280 + }, + { + "epoch": 3.2820143884892086, + "grad_norm": 0.0905027839872227, + "learning_rate": 2.54897344537074e-05, + "loss": 0.3068, + "step": 2281 + }, + { + "epoch": 3.283453237410072, + "grad_norm": 0.07570693317848128, + "learning_rate": 2.5452292475117767e-05, + "loss": 0.3124, + "step": 2282 + }, + { + "epoch": 3.2848920863309354, + "grad_norm": 0.079114700228305, + "learning_rate": 2.541486518032973e-05, + "loss": 0.31, + "step": 2283 + }, + { + "epoch": 3.2863309352517986, + "grad_norm": 0.10010897960545055, + "learning_rate": 2.5377452607120722e-05, + "loss": 0.3209, + "step": 2284 + }, + { + "epoch": 3.287769784172662, + "grad_norm": 0.07373044643745348, + "learning_rate": 2.5340054793253276e-05, + "loss": 0.3154, + "step": 2285 + }, + { + "epoch": 3.289208633093525, + "grad_norm": 0.09561131286368094, + "learning_rate": 2.5302671776475098e-05, + "loss": 0.3072, + "step": 2286 + }, + { + "epoch": 3.2906474820143883, + "grad_norm": 0.08481304440195968, + "learning_rate": 2.526530359451892e-05, + "loss": 0.3149, + "step": 2287 + }, + { + "epoch": 3.292086330935252, + "grad_norm": 0.07033086586948158, + "learning_rate": 2.522795028510249e-05, + "loss": 0.3159, + "step": 2288 + }, + { + "epoch": 3.293525179856115, + "grad_norm": 0.08058728399603567, + "learning_rate": 2.5190611885928547e-05, + "loss": 0.3128, + "step": 2289 + }, + { + "epoch": 3.2949640287769784, + "grad_norm": 0.0758475875615919, + "learning_rate": 2.5153288434684816e-05, + "loss": 0.3171, + "step": 2290 + }, + { + "epoch": 3.2964028776978416, + "grad_norm": 0.09660252514185105, + "learning_rate": 2.5115979969043914e-05, + "loss": 0.3172, + "step": 2291 + }, + { + "epoch": 3.2978417266187052, + "grad_norm": 0.07392542301471512, + "learning_rate": 2.5078686526663304e-05, + "loss": 0.3123, + "step": 2292 + }, + { + "epoch": 3.2992805755395684, + "grad_norm": 0.07628148569031089, + "learning_rate": 2.5041408145185312e-05, + "loss": 0.3035, + "step": 2293 + }, + { + "epoch": 3.3007194244604317, + "grad_norm": 0.0816470604581458, + "learning_rate": 2.5004144862237084e-05, + "loss": 0.3142, + "step": 2294 + }, + { + "epoch": 3.302158273381295, + "grad_norm": 0.078780294760847, + "learning_rate": 2.4966896715430484e-05, + "loss": 0.3063, + "step": 2295 + }, + { + "epoch": 3.3035971223021585, + "grad_norm": 0.07937649700673445, + "learning_rate": 2.4929663742362103e-05, + "loss": 0.315, + "step": 2296 + }, + { + "epoch": 3.3050359712230217, + "grad_norm": 0.07099358599255484, + "learning_rate": 2.4892445980613254e-05, + "loss": 0.3066, + "step": 2297 + }, + { + "epoch": 3.306474820143885, + "grad_norm": 0.08230088318252803, + "learning_rate": 2.4855243467749865e-05, + "loss": 0.3166, + "step": 2298 + }, + { + "epoch": 3.307913669064748, + "grad_norm": 0.07839788409142841, + "learning_rate": 2.481805624132247e-05, + "loss": 0.3056, + "step": 2299 + }, + { + "epoch": 3.3093525179856114, + "grad_norm": 0.07333852877027737, + "learning_rate": 2.478088433886618e-05, + "loss": 0.3048, + "step": 2300 + }, + { + "epoch": 3.310791366906475, + "grad_norm": 0.09456550634859283, + "learning_rate": 2.4743727797900668e-05, + "loss": 0.31, + "step": 2301 + }, + { + "epoch": 3.3122302158273382, + "grad_norm": 0.06933779657260399, + "learning_rate": 2.4706586655930042e-05, + "loss": 0.3097, + "step": 2302 + }, + { + "epoch": 3.3136690647482014, + "grad_norm": 0.08714194098610613, + "learning_rate": 2.4669460950442926e-05, + "loss": 0.3138, + "step": 2303 + }, + { + "epoch": 3.3151079136690647, + "grad_norm": 0.09547255660921417, + "learning_rate": 2.463235071891231e-05, + "loss": 0.3132, + "step": 2304 + }, + { + "epoch": 3.316546762589928, + "grad_norm": 0.07190538663347976, + "learning_rate": 2.4595255998795625e-05, + "loss": 0.3015, + "step": 2305 + }, + { + "epoch": 3.3179856115107915, + "grad_norm": 0.09186247817974864, + "learning_rate": 2.4558176827534587e-05, + "loss": 0.3069, + "step": 2306 + }, + { + "epoch": 3.3194244604316547, + "grad_norm": 0.07892002449578553, + "learning_rate": 2.452111324255524e-05, + "loss": 0.3059, + "step": 2307 + }, + { + "epoch": 3.320863309352518, + "grad_norm": 0.08602518467621155, + "learning_rate": 2.448406528126793e-05, + "loss": 0.317, + "step": 2308 + }, + { + "epoch": 3.322302158273381, + "grad_norm": 0.09660912181886301, + "learning_rate": 2.444703298106718e-05, + "loss": 0.3067, + "step": 2309 + }, + { + "epoch": 3.3237410071942444, + "grad_norm": 0.08146049515388465, + "learning_rate": 2.441001637933173e-05, + "loss": 0.313, + "step": 2310 + }, + { + "epoch": 3.325179856115108, + "grad_norm": 0.09560703484537106, + "learning_rate": 2.437301551342447e-05, + "loss": 0.3154, + "step": 2311 + }, + { + "epoch": 3.3266187050359712, + "grad_norm": 0.07466795669935042, + "learning_rate": 2.433603042069242e-05, + "loss": 0.31, + "step": 2312 + }, + { + "epoch": 3.3280575539568344, + "grad_norm": 0.08325258763084104, + "learning_rate": 2.4299061138466667e-05, + "loss": 0.312, + "step": 2313 + }, + { + "epoch": 3.3294964028776977, + "grad_norm": 0.08796488903023479, + "learning_rate": 2.4262107704062343e-05, + "loss": 0.3121, + "step": 2314 + }, + { + "epoch": 3.3309352517985613, + "grad_norm": 0.09552304190021914, + "learning_rate": 2.4225170154778562e-05, + "loss": 0.298, + "step": 2315 + }, + { + "epoch": 3.3323741007194245, + "grad_norm": 0.08363284586867374, + "learning_rate": 2.4188248527898446e-05, + "loss": 0.3157, + "step": 2316 + }, + { + "epoch": 3.3338129496402877, + "grad_norm": 0.09022905284092302, + "learning_rate": 2.415134286068903e-05, + "loss": 0.3111, + "step": 2317 + }, + { + "epoch": 3.335251798561151, + "grad_norm": 0.0835668636717724, + "learning_rate": 2.411445319040121e-05, + "loss": 0.3079, + "step": 2318 + }, + { + "epoch": 3.3366906474820146, + "grad_norm": 0.0877973742789849, + "learning_rate": 2.407757955426977e-05, + "loss": 0.3104, + "step": 2319 + }, + { + "epoch": 3.338129496402878, + "grad_norm": 0.08331668340154326, + "learning_rate": 2.4040721989513314e-05, + "loss": 0.3119, + "step": 2320 + }, + { + "epoch": 3.339568345323741, + "grad_norm": 0.08697931951822054, + "learning_rate": 2.40038805333342e-05, + "loss": 0.3081, + "step": 2321 + }, + { + "epoch": 3.3410071942446042, + "grad_norm": 0.091564579637049, + "learning_rate": 2.396705522291852e-05, + "loss": 0.3151, + "step": 2322 + }, + { + "epoch": 3.3424460431654675, + "grad_norm": 0.08131858824210252, + "learning_rate": 2.393024609543611e-05, + "loss": 0.3084, + "step": 2323 + }, + { + "epoch": 3.343884892086331, + "grad_norm": 0.08590487089463734, + "learning_rate": 2.3893453188040442e-05, + "loss": 0.3037, + "step": 2324 + }, + { + "epoch": 3.3453237410071943, + "grad_norm": 0.07930906646637645, + "learning_rate": 2.3856676537868614e-05, + "loss": 0.3144, + "step": 2325 + }, + { + "epoch": 3.3467625899280575, + "grad_norm": 0.09001420314337577, + "learning_rate": 2.3819916182041318e-05, + "loss": 0.3042, + "step": 2326 + }, + { + "epoch": 3.3482014388489207, + "grad_norm": 0.08026328979707366, + "learning_rate": 2.378317215766283e-05, + "loss": 0.3042, + "step": 2327 + }, + { + "epoch": 3.349640287769784, + "grad_norm": 0.0733431469374671, + "learning_rate": 2.3746444501820886e-05, + "loss": 0.3163, + "step": 2328 + }, + { + "epoch": 3.3510791366906476, + "grad_norm": 0.08584777385276081, + "learning_rate": 2.370973325158675e-05, + "loss": 0.3049, + "step": 2329 + }, + { + "epoch": 3.352517985611511, + "grad_norm": 0.06972001281573628, + "learning_rate": 2.3673038444015087e-05, + "loss": 0.3141, + "step": 2330 + }, + { + "epoch": 3.353956834532374, + "grad_norm": 0.08060635092830382, + "learning_rate": 2.363636011614401e-05, + "loss": 0.3125, + "step": 2331 + }, + { + "epoch": 3.3553956834532372, + "grad_norm": 0.08013278285028885, + "learning_rate": 2.3599698304994946e-05, + "loss": 0.3177, + "step": 2332 + }, + { + "epoch": 3.356834532374101, + "grad_norm": 0.08918460812426662, + "learning_rate": 2.3563053047572683e-05, + "loss": 0.3154, + "step": 2333 + }, + { + "epoch": 3.358273381294964, + "grad_norm": 0.07615428352899022, + "learning_rate": 2.352642438086533e-05, + "loss": 0.2992, + "step": 2334 + }, + { + "epoch": 3.3597122302158273, + "grad_norm": 0.07729121670905142, + "learning_rate": 2.348981234184417e-05, + "loss": 0.307, + "step": 2335 + }, + { + "epoch": 3.3611510791366905, + "grad_norm": 0.07141656240900103, + "learning_rate": 2.3453216967463785e-05, + "loss": 0.3145, + "step": 2336 + }, + { + "epoch": 3.362589928057554, + "grad_norm": 0.08935058896085595, + "learning_rate": 2.3416638294661864e-05, + "loss": 0.3085, + "step": 2337 + }, + { + "epoch": 3.3640287769784174, + "grad_norm": 0.06400882009509953, + "learning_rate": 2.3380076360359293e-05, + "loss": 0.3124, + "step": 2338 + }, + { + "epoch": 3.3654676258992806, + "grad_norm": 0.09512916515917591, + "learning_rate": 2.3343531201460067e-05, + "loss": 0.3043, + "step": 2339 + }, + { + "epoch": 3.366906474820144, + "grad_norm": 0.06956346991078782, + "learning_rate": 2.3307002854851188e-05, + "loss": 0.3122, + "step": 2340 + }, + { + "epoch": 3.368345323741007, + "grad_norm": 0.08138811148963958, + "learning_rate": 2.3270491357402715e-05, + "loss": 0.3073, + "step": 2341 + }, + { + "epoch": 3.3697841726618707, + "grad_norm": 0.07568926449823735, + "learning_rate": 2.3233996745967772e-05, + "loss": 0.3143, + "step": 2342 + }, + { + "epoch": 3.371223021582734, + "grad_norm": 0.08713346179685631, + "learning_rate": 2.3197519057382326e-05, + "loss": 0.3011, + "step": 2343 + }, + { + "epoch": 3.372661870503597, + "grad_norm": 0.06657650013060197, + "learning_rate": 2.316105832846532e-05, + "loss": 0.3021, + "step": 2344 + }, + { + "epoch": 3.3741007194244603, + "grad_norm": 0.0909347921883191, + "learning_rate": 2.3124614596018606e-05, + "loss": 0.3033, + "step": 2345 + }, + { + "epoch": 3.3755395683453235, + "grad_norm": 0.06633577705016298, + "learning_rate": 2.308818789682682e-05, + "loss": 0.2986, + "step": 2346 + }, + { + "epoch": 3.376978417266187, + "grad_norm": 0.08813700890410717, + "learning_rate": 2.3051778267657436e-05, + "loss": 0.3099, + "step": 2347 + }, + { + "epoch": 3.3784172661870504, + "grad_norm": 0.08030048225395597, + "learning_rate": 2.3015385745260704e-05, + "loss": 0.3121, + "step": 2348 + }, + { + "epoch": 3.3798561151079136, + "grad_norm": 0.08772628266053589, + "learning_rate": 2.2979010366369595e-05, + "loss": 0.3138, + "step": 2349 + }, + { + "epoch": 3.381294964028777, + "grad_norm": 0.06915074511307691, + "learning_rate": 2.294265216769978e-05, + "loss": 0.3036, + "step": 2350 + }, + { + "epoch": 3.38273381294964, + "grad_norm": 0.0787032873720719, + "learning_rate": 2.2906311185949605e-05, + "loss": 0.3159, + "step": 2351 + }, + { + "epoch": 3.3841726618705037, + "grad_norm": 0.06474353356605671, + "learning_rate": 2.2869987457799977e-05, + "loss": 0.3013, + "step": 2352 + }, + { + "epoch": 3.385611510791367, + "grad_norm": 0.07936109383021454, + "learning_rate": 2.283368101991448e-05, + "loss": 0.3056, + "step": 2353 + }, + { + "epoch": 3.38705035971223, + "grad_norm": 0.06753730195534415, + "learning_rate": 2.2797391908939196e-05, + "loss": 0.3138, + "step": 2354 + }, + { + "epoch": 3.3884892086330938, + "grad_norm": 0.07238196048279752, + "learning_rate": 2.2761120161502674e-05, + "loss": 0.3043, + "step": 2355 + }, + { + "epoch": 3.389928057553957, + "grad_norm": 0.06924314961884438, + "learning_rate": 2.2724865814216042e-05, + "loss": 0.3009, + "step": 2356 + }, + { + "epoch": 3.39136690647482, + "grad_norm": 0.06361823031633611, + "learning_rate": 2.2688628903672792e-05, + "loss": 0.3087, + "step": 2357 + }, + { + "epoch": 3.3928057553956834, + "grad_norm": 0.07732323917041639, + "learning_rate": 2.265240946644881e-05, + "loss": 0.3157, + "step": 2358 + }, + { + "epoch": 3.3942446043165466, + "grad_norm": 0.06415742640174263, + "learning_rate": 2.261620753910238e-05, + "loss": 0.3182, + "step": 2359 + }, + { + "epoch": 3.3956834532374103, + "grad_norm": 0.07267628619293835, + "learning_rate": 2.25800231581741e-05, + "loss": 0.3076, + "step": 2360 + }, + { + "epoch": 3.3971223021582735, + "grad_norm": 0.061930859007014914, + "learning_rate": 2.254385636018686e-05, + "loss": 0.3084, + "step": 2361 + }, + { + "epoch": 3.3985611510791367, + "grad_norm": 0.07012006894365325, + "learning_rate": 2.250770718164579e-05, + "loss": 0.3107, + "step": 2362 + }, + { + "epoch": 3.4, + "grad_norm": 0.07266288046107679, + "learning_rate": 2.247157565903825e-05, + "loss": 0.3114, + "step": 2363 + }, + { + "epoch": 3.401438848920863, + "grad_norm": 0.06690274972297683, + "learning_rate": 2.243546182883377e-05, + "loss": 0.3102, + "step": 2364 + }, + { + "epoch": 3.402877697841727, + "grad_norm": 0.0692178004843867, + "learning_rate": 2.2399365727484047e-05, + "loss": 0.3068, + "step": 2365 + }, + { + "epoch": 3.40431654676259, + "grad_norm": 0.0722367138426255, + "learning_rate": 2.2363287391422806e-05, + "loss": 0.3081, + "step": 2366 + }, + { + "epoch": 3.405755395683453, + "grad_norm": 0.06969322455090261, + "learning_rate": 2.2327226857065954e-05, + "loss": 0.3126, + "step": 2367 + }, + { + "epoch": 3.4071942446043164, + "grad_norm": 0.0707675931902275, + "learning_rate": 2.2291184160811374e-05, + "loss": 0.3148, + "step": 2368 + }, + { + "epoch": 3.4086330935251796, + "grad_norm": 0.06728240197400746, + "learning_rate": 2.22551593390389e-05, + "loss": 0.3107, + "step": 2369 + }, + { + "epoch": 3.4100719424460433, + "grad_norm": 0.07436693837659339, + "learning_rate": 2.2219152428110368e-05, + "loss": 0.3069, + "step": 2370 + }, + { + "epoch": 3.4115107913669065, + "grad_norm": 0.06117093871196123, + "learning_rate": 2.218316346436959e-05, + "loss": 0.306, + "step": 2371 + }, + { + "epoch": 3.4129496402877697, + "grad_norm": 0.07245482528214216, + "learning_rate": 2.2147192484142154e-05, + "loss": 0.3062, + "step": 2372 + }, + { + "epoch": 3.414388489208633, + "grad_norm": 0.05973011321086604, + "learning_rate": 2.2111239523735568e-05, + "loss": 0.3046, + "step": 2373 + }, + { + "epoch": 3.4158273381294966, + "grad_norm": 0.07457067897114032, + "learning_rate": 2.2075304619439127e-05, + "loss": 0.3119, + "step": 2374 + }, + { + "epoch": 3.41726618705036, + "grad_norm": 0.27115162473455456, + "learning_rate": 2.2039387807523914e-05, + "loss": 0.3127, + "step": 2375 + }, + { + "epoch": 3.418705035971223, + "grad_norm": 0.08044914751899118, + "learning_rate": 2.2003489124242742e-05, + "loss": 0.3103, + "step": 2376 + }, + { + "epoch": 3.420143884892086, + "grad_norm": 0.07157904616628058, + "learning_rate": 2.1967608605830115e-05, + "loss": 0.3106, + "step": 2377 + }, + { + "epoch": 3.42158273381295, + "grad_norm": 0.07414897587449552, + "learning_rate": 2.1931746288502235e-05, + "loss": 0.3129, + "step": 2378 + }, + { + "epoch": 3.423021582733813, + "grad_norm": 0.07280082346273015, + "learning_rate": 2.1895902208456903e-05, + "loss": 0.3127, + "step": 2379 + }, + { + "epoch": 3.4244604316546763, + "grad_norm": 0.07413056477182658, + "learning_rate": 2.186007640187353e-05, + "loss": 0.3126, + "step": 2380 + }, + { + "epoch": 3.4258992805755395, + "grad_norm": 0.0712247235301936, + "learning_rate": 2.1824268904913036e-05, + "loss": 0.3079, + "step": 2381 + }, + { + "epoch": 3.4273381294964027, + "grad_norm": 0.08602698068407817, + "learning_rate": 2.1788479753717935e-05, + "loss": 0.309, + "step": 2382 + }, + { + "epoch": 3.4287769784172664, + "grad_norm": 0.06949678973950535, + "learning_rate": 2.1752708984412196e-05, + "loss": 0.305, + "step": 2383 + }, + { + "epoch": 3.4302158273381296, + "grad_norm": 0.0808604625712668, + "learning_rate": 2.171695663310119e-05, + "loss": 0.3059, + "step": 2384 + }, + { + "epoch": 3.431654676258993, + "grad_norm": 0.07221729607102836, + "learning_rate": 2.1681222735871747e-05, + "loss": 0.3114, + "step": 2385 + }, + { + "epoch": 3.433093525179856, + "grad_norm": 0.09025084305244628, + "learning_rate": 2.1645507328792058e-05, + "loss": 0.3136, + "step": 2386 + }, + { + "epoch": 3.434532374100719, + "grad_norm": 0.07081013288539319, + "learning_rate": 2.1609810447911637e-05, + "loss": 0.3141, + "step": 2387 + }, + { + "epoch": 3.435971223021583, + "grad_norm": 0.07325114649426695, + "learning_rate": 2.157413212926133e-05, + "loss": 0.3055, + "step": 2388 + }, + { + "epoch": 3.437410071942446, + "grad_norm": 0.06552754563676148, + "learning_rate": 2.1538472408853206e-05, + "loss": 0.3102, + "step": 2389 + }, + { + "epoch": 3.4388489208633093, + "grad_norm": 0.06958008689182883, + "learning_rate": 2.1502831322680598e-05, + "loss": 0.3136, + "step": 2390 + }, + { + "epoch": 3.4402877697841725, + "grad_norm": 0.07411701271075863, + "learning_rate": 2.1467208906718008e-05, + "loss": 0.3011, + "step": 2391 + }, + { + "epoch": 3.441726618705036, + "grad_norm": 0.06446652332050883, + "learning_rate": 2.1431605196921103e-05, + "loss": 0.3189, + "step": 2392 + }, + { + "epoch": 3.4431654676258994, + "grad_norm": 0.07008631497051318, + "learning_rate": 2.1396020229226666e-05, + "loss": 0.3122, + "step": 2393 + }, + { + "epoch": 3.4446043165467626, + "grad_norm": 0.07305952821190029, + "learning_rate": 2.1360454039552577e-05, + "loss": 0.3012, + "step": 2394 + }, + { + "epoch": 3.446043165467626, + "grad_norm": 0.06945354549100825, + "learning_rate": 2.1324906663797718e-05, + "loss": 0.313, + "step": 2395 + }, + { + "epoch": 3.4474820143884894, + "grad_norm": 0.07681370824552682, + "learning_rate": 2.1289378137842008e-05, + "loss": 0.3144, + "step": 2396 + }, + { + "epoch": 3.4489208633093527, + "grad_norm": 0.06996385633447534, + "learning_rate": 2.125386849754639e-05, + "loss": 0.3096, + "step": 2397 + }, + { + "epoch": 3.450359712230216, + "grad_norm": 0.07331328439643199, + "learning_rate": 2.121837777875266e-05, + "loss": 0.316, + "step": 2398 + }, + { + "epoch": 3.451798561151079, + "grad_norm": 0.07344637977225729, + "learning_rate": 2.118290601728354e-05, + "loss": 0.3074, + "step": 2399 + }, + { + "epoch": 3.4532374100719423, + "grad_norm": 0.07668692477154283, + "learning_rate": 2.1147453248942687e-05, + "loss": 0.3072, + "step": 2400 + }, + { + "epoch": 3.454676258992806, + "grad_norm": 0.07619582037841803, + "learning_rate": 2.1112019509514478e-05, + "loss": 0.3129, + "step": 2401 + }, + { + "epoch": 3.456115107913669, + "grad_norm": 0.07479747846629918, + "learning_rate": 2.1076604834764154e-05, + "loss": 0.3099, + "step": 2402 + }, + { + "epoch": 3.4575539568345324, + "grad_norm": 0.090326392764258, + "learning_rate": 2.1041209260437694e-05, + "loss": 0.3146, + "step": 2403 + }, + { + "epoch": 3.4589928057553956, + "grad_norm": 0.0777856669266028, + "learning_rate": 2.10058328222618e-05, + "loss": 0.3105, + "step": 2404 + }, + { + "epoch": 3.460431654676259, + "grad_norm": 0.10039220659650842, + "learning_rate": 2.097047555594385e-05, + "loss": 0.308, + "step": 2405 + }, + { + "epoch": 3.4618705035971225, + "grad_norm": 0.07880503894320168, + "learning_rate": 2.0935137497171904e-05, + "loss": 0.3123, + "step": 2406 + }, + { + "epoch": 3.4633093525179857, + "grad_norm": 0.10766337775820017, + "learning_rate": 2.0899818681614557e-05, + "loss": 0.3148, + "step": 2407 + }, + { + "epoch": 3.464748201438849, + "grad_norm": 0.07152630066014365, + "learning_rate": 2.086451914492108e-05, + "loss": 0.3095, + "step": 2408 + }, + { + "epoch": 3.466187050359712, + "grad_norm": 0.06931007783592509, + "learning_rate": 2.082923892272124e-05, + "loss": 0.3094, + "step": 2409 + }, + { + "epoch": 3.4676258992805753, + "grad_norm": 0.08750246007451234, + "learning_rate": 2.079397805062526e-05, + "loss": 0.3127, + "step": 2410 + }, + { + "epoch": 3.469064748201439, + "grad_norm": 0.07336793877490577, + "learning_rate": 2.0758736564223937e-05, + "loss": 0.3073, + "step": 2411 + }, + { + "epoch": 3.470503597122302, + "grad_norm": 0.06878588227635767, + "learning_rate": 2.0723514499088388e-05, + "loss": 0.3151, + "step": 2412 + }, + { + "epoch": 3.4719424460431654, + "grad_norm": 0.07205378099429692, + "learning_rate": 2.068831189077021e-05, + "loss": 0.3165, + "step": 2413 + }, + { + "epoch": 3.4733812949640286, + "grad_norm": 0.07254073531532791, + "learning_rate": 2.065312877480133e-05, + "loss": 0.2993, + "step": 2414 + }, + { + "epoch": 3.4748201438848922, + "grad_norm": 0.06448071539936977, + "learning_rate": 2.0617965186694e-05, + "loss": 0.3061, + "step": 2415 + }, + { + "epoch": 3.4762589928057555, + "grad_norm": 0.07969287347363285, + "learning_rate": 2.058282116194076e-05, + "loss": 0.3182, + "step": 2416 + }, + { + "epoch": 3.4776978417266187, + "grad_norm": 0.0703083994859436, + "learning_rate": 2.0547696736014415e-05, + "loss": 0.3158, + "step": 2417 + }, + { + "epoch": 3.479136690647482, + "grad_norm": 0.06391401118115385, + "learning_rate": 2.0512591944367976e-05, + "loss": 0.3146, + "step": 2418 + }, + { + "epoch": 3.4805755395683455, + "grad_norm": 0.07949909331350384, + "learning_rate": 2.0477506822434644e-05, + "loss": 0.3095, + "step": 2419 + }, + { + "epoch": 3.4820143884892087, + "grad_norm": 0.06808962841536517, + "learning_rate": 2.0442441405627776e-05, + "loss": 0.3106, + "step": 2420 + }, + { + "epoch": 3.483453237410072, + "grad_norm": 0.08055504695955595, + "learning_rate": 2.0407395729340792e-05, + "loss": 0.3074, + "step": 2421 + }, + { + "epoch": 3.484892086330935, + "grad_norm": 0.08062785800145268, + "learning_rate": 2.037236982894723e-05, + "loss": 0.3074, + "step": 2422 + }, + { + "epoch": 3.4863309352517984, + "grad_norm": 0.07718758511488966, + "learning_rate": 2.0337363739800695e-05, + "loss": 0.3062, + "step": 2423 + }, + { + "epoch": 3.487769784172662, + "grad_norm": 0.07031308809064436, + "learning_rate": 2.030237749723472e-05, + "loss": 0.3112, + "step": 2424 + }, + { + "epoch": 3.4892086330935252, + "grad_norm": 0.06298489171916702, + "learning_rate": 2.026741113656284e-05, + "loss": 0.3158, + "step": 2425 + }, + { + "epoch": 3.4906474820143885, + "grad_norm": 0.07920017440476117, + "learning_rate": 2.0232464693078578e-05, + "loss": 0.3181, + "step": 2426 + }, + { + "epoch": 3.4920863309352517, + "grad_norm": 0.06380879282023667, + "learning_rate": 2.0197538202055246e-05, + "loss": 0.3116, + "step": 2427 + }, + { + "epoch": 3.493525179856115, + "grad_norm": 0.06934399087100221, + "learning_rate": 2.01626316987461e-05, + "loss": 0.3093, + "step": 2428 + }, + { + "epoch": 3.4949640287769785, + "grad_norm": 0.06646782712492776, + "learning_rate": 2.0127745218384193e-05, + "loss": 0.3103, + "step": 2429 + }, + { + "epoch": 3.4964028776978417, + "grad_norm": 0.06950058637538396, + "learning_rate": 2.009287879618236e-05, + "loss": 0.3135, + "step": 2430 + }, + { + "epoch": 3.497841726618705, + "grad_norm": 0.06866221637545802, + "learning_rate": 2.0058032467333204e-05, + "loss": 0.3169, + "step": 2431 + }, + { + "epoch": 3.499280575539568, + "grad_norm": 0.0645057742473268, + "learning_rate": 2.0023206267009056e-05, + "loss": 0.3176, + "step": 2432 + }, + { + "epoch": 3.5007194244604314, + "grad_norm": 0.06589944743996938, + "learning_rate": 1.9988400230361872e-05, + "loss": 0.3139, + "step": 2433 + }, + { + "epoch": 3.502158273381295, + "grad_norm": 0.06639231700006921, + "learning_rate": 1.9953614392523345e-05, + "loss": 0.3002, + "step": 2434 + }, + { + "epoch": 3.5035971223021583, + "grad_norm": 0.07072191123946665, + "learning_rate": 1.9918848788604738e-05, + "loss": 0.3182, + "step": 2435 + }, + { + "epoch": 3.5050359712230215, + "grad_norm": 0.06800598042601348, + "learning_rate": 1.9884103453696837e-05, + "loss": 0.3187, + "step": 2436 + }, + { + "epoch": 3.506474820143885, + "grad_norm": 0.06296346756178148, + "learning_rate": 1.9849378422870082e-05, + "loss": 0.3143, + "step": 2437 + }, + { + "epoch": 3.5079136690647483, + "grad_norm": 0.06690443343286248, + "learning_rate": 1.9814673731174315e-05, + "loss": 0.3164, + "step": 2438 + }, + { + "epoch": 3.5093525179856115, + "grad_norm": 0.06677680553334467, + "learning_rate": 1.97799894136389e-05, + "loss": 0.3119, + "step": 2439 + }, + { + "epoch": 3.5107913669064748, + "grad_norm": 0.06444580512826723, + "learning_rate": 1.9745325505272633e-05, + "loss": 0.3049, + "step": 2440 + }, + { + "epoch": 3.512230215827338, + "grad_norm": 0.06874289491717574, + "learning_rate": 1.9710682041063705e-05, + "loss": 0.3145, + "step": 2441 + }, + { + "epoch": 3.5136690647482016, + "grad_norm": 0.07217181386706867, + "learning_rate": 1.9676059055979663e-05, + "loss": 0.3078, + "step": 2442 + }, + { + "epoch": 3.515107913669065, + "grad_norm": 0.06564361893188796, + "learning_rate": 1.9641456584967392e-05, + "loss": 0.3032, + "step": 2443 + }, + { + "epoch": 3.516546762589928, + "grad_norm": 0.07487083795793006, + "learning_rate": 1.9606874662953076e-05, + "loss": 0.3101, + "step": 2444 + }, + { + "epoch": 3.5179856115107913, + "grad_norm": 0.0618777293904377, + "learning_rate": 1.9572313324842148e-05, + "loss": 0.302, + "step": 2445 + }, + { + "epoch": 3.5194244604316545, + "grad_norm": 0.07249629854440197, + "learning_rate": 1.9537772605519285e-05, + "loss": 0.3063, + "step": 2446 + }, + { + "epoch": 3.520863309352518, + "grad_norm": 0.06472724303978827, + "learning_rate": 1.950325253984828e-05, + "loss": 0.3156, + "step": 2447 + }, + { + "epoch": 3.5223021582733813, + "grad_norm": 0.07258332278787377, + "learning_rate": 1.946875316267219e-05, + "loss": 0.3072, + "step": 2448 + }, + { + "epoch": 3.5237410071942445, + "grad_norm": 0.07185391155876489, + "learning_rate": 1.9434274508813135e-05, + "loss": 0.3146, + "step": 2449 + }, + { + "epoch": 3.5251798561151078, + "grad_norm": 0.065249610001787, + "learning_rate": 1.9399816613072287e-05, + "loss": 0.3059, + "step": 2450 + }, + { + "epoch": 3.526618705035971, + "grad_norm": 0.0720522516278948, + "learning_rate": 1.9365379510229888e-05, + "loss": 0.3145, + "step": 2451 + }, + { + "epoch": 3.5280575539568346, + "grad_norm": 0.07040021193745491, + "learning_rate": 1.9330963235045253e-05, + "loss": 0.3141, + "step": 2452 + }, + { + "epoch": 3.529496402877698, + "grad_norm": 0.06867038188715696, + "learning_rate": 1.9296567822256577e-05, + "loss": 0.3161, + "step": 2453 + }, + { + "epoch": 3.530935251798561, + "grad_norm": 0.06663744178748973, + "learning_rate": 1.9262193306581052e-05, + "loss": 0.3059, + "step": 2454 + }, + { + "epoch": 3.5323741007194247, + "grad_norm": 0.0645062042179706, + "learning_rate": 1.922783972271477e-05, + "loss": 0.3078, + "step": 2455 + }, + { + "epoch": 3.533812949640288, + "grad_norm": 0.0634394118400146, + "learning_rate": 1.9193507105332702e-05, + "loss": 0.2995, + "step": 2456 + }, + { + "epoch": 3.535251798561151, + "grad_norm": 0.06934873745320619, + "learning_rate": 1.9159195489088636e-05, + "loss": 0.3049, + "step": 2457 + }, + { + "epoch": 3.5366906474820143, + "grad_norm": 0.0639265233110726, + "learning_rate": 1.9124904908615178e-05, + "loss": 0.308, + "step": 2458 + }, + { + "epoch": 3.5381294964028775, + "grad_norm": 0.06375477688785211, + "learning_rate": 1.9090635398523698e-05, + "loss": 0.3042, + "step": 2459 + }, + { + "epoch": 3.539568345323741, + "grad_norm": 0.08106792318534015, + "learning_rate": 1.9056386993404294e-05, + "loss": 0.312, + "step": 2460 + }, + { + "epoch": 3.5410071942446044, + "grad_norm": 0.06438660808844053, + "learning_rate": 1.902215972782579e-05, + "loss": 0.3087, + "step": 2461 + }, + { + "epoch": 3.5424460431654676, + "grad_norm": 0.06804453264424278, + "learning_rate": 1.8987953636335595e-05, + "loss": 0.3055, + "step": 2462 + }, + { + "epoch": 3.543884892086331, + "grad_norm": 0.06606195736768096, + "learning_rate": 1.8953768753459863e-05, + "loss": 0.3126, + "step": 2463 + }, + { + "epoch": 3.545323741007194, + "grad_norm": 0.07120465726116934, + "learning_rate": 1.8919605113703227e-05, + "loss": 0.313, + "step": 2464 + }, + { + "epoch": 3.5467625899280577, + "grad_norm": 0.06464663246904867, + "learning_rate": 1.888546275154895e-05, + "loss": 0.3138, + "step": 2465 + }, + { + "epoch": 3.548201438848921, + "grad_norm": 0.06262826257038849, + "learning_rate": 1.885134170145879e-05, + "loss": 0.3089, + "step": 2466 + }, + { + "epoch": 3.549640287769784, + "grad_norm": 0.06815666810533745, + "learning_rate": 1.8817241997873007e-05, + "loss": 0.2966, + "step": 2467 + }, + { + "epoch": 3.5510791366906473, + "grad_norm": 0.05941656249430085, + "learning_rate": 1.8783163675210307e-05, + "loss": 0.3148, + "step": 2468 + }, + { + "epoch": 3.5525179856115106, + "grad_norm": 0.07041289292419853, + "learning_rate": 1.8749106767867808e-05, + "loss": 0.3122, + "step": 2469 + }, + { + "epoch": 3.553956834532374, + "grad_norm": 0.07358680810037943, + "learning_rate": 1.871507131022103e-05, + "loss": 0.3148, + "step": 2470 + }, + { + "epoch": 3.5553956834532374, + "grad_norm": 0.06256848030022917, + "learning_rate": 1.8681057336623825e-05, + "loss": 0.3128, + "step": 2471 + }, + { + "epoch": 3.5568345323741006, + "grad_norm": 0.06645120167514493, + "learning_rate": 1.864706488140839e-05, + "loss": 0.308, + "step": 2472 + }, + { + "epoch": 3.5582733812949643, + "grad_norm": 0.07095189238251999, + "learning_rate": 1.861309397888513e-05, + "loss": 0.3138, + "step": 2473 + }, + { + "epoch": 3.5597122302158275, + "grad_norm": 0.07511578845905682, + "learning_rate": 1.857914466334279e-05, + "loss": 0.3006, + "step": 2474 + }, + { + "epoch": 3.5611510791366907, + "grad_norm": 0.06184718761923469, + "learning_rate": 1.8545216969048288e-05, + "loss": 0.3067, + "step": 2475 + }, + { + "epoch": 3.562589928057554, + "grad_norm": 0.0732481003171402, + "learning_rate": 1.851131093024668e-05, + "loss": 0.3151, + "step": 2476 + }, + { + "epoch": 3.564028776978417, + "grad_norm": 0.07321066110757826, + "learning_rate": 1.8477426581161192e-05, + "loss": 0.3076, + "step": 2477 + }, + { + "epoch": 3.565467625899281, + "grad_norm": 0.06872200415539845, + "learning_rate": 1.844356395599322e-05, + "loss": 0.3177, + "step": 2478 + }, + { + "epoch": 3.566906474820144, + "grad_norm": 0.07148169759889017, + "learning_rate": 1.840972308892211e-05, + "loss": 0.3085, + "step": 2479 + }, + { + "epoch": 3.568345323741007, + "grad_norm": 0.06651479480795791, + "learning_rate": 1.837590401410532e-05, + "loss": 0.3083, + "step": 2480 + }, + { + "epoch": 3.5697841726618704, + "grad_norm": 0.07008142727674253, + "learning_rate": 1.8342106765678358e-05, + "loss": 0.3104, + "step": 2481 + }, + { + "epoch": 3.5712230215827336, + "grad_norm": 0.06859727234979068, + "learning_rate": 1.8308331377754584e-05, + "loss": 0.3128, + "step": 2482 + }, + { + "epoch": 3.5726618705035973, + "grad_norm": 0.07352856994415677, + "learning_rate": 1.8274577884425383e-05, + "loss": 0.3085, + "step": 2483 + }, + { + "epoch": 3.5741007194244605, + "grad_norm": 0.06259710447078544, + "learning_rate": 1.8240846319760012e-05, + "loss": 0.3063, + "step": 2484 + }, + { + "epoch": 3.5755395683453237, + "grad_norm": 0.0688473836174342, + "learning_rate": 1.8207136717805585e-05, + "loss": 0.297, + "step": 2485 + }, + { + "epoch": 3.576978417266187, + "grad_norm": 0.06606776561775955, + "learning_rate": 1.8173449112587062e-05, + "loss": 0.3054, + "step": 2486 + }, + { + "epoch": 3.57841726618705, + "grad_norm": 0.06220494904604853, + "learning_rate": 1.813978353810722e-05, + "loss": 0.2996, + "step": 2487 + }, + { + "epoch": 3.579856115107914, + "grad_norm": 0.08421515901945559, + "learning_rate": 1.8106140028346526e-05, + "loss": 0.3058, + "step": 2488 + }, + { + "epoch": 3.581294964028777, + "grad_norm": 0.06064233136038843, + "learning_rate": 1.8072518617263276e-05, + "loss": 0.3133, + "step": 2489 + }, + { + "epoch": 3.58273381294964, + "grad_norm": 0.07980235943596127, + "learning_rate": 1.803891933879338e-05, + "loss": 0.3071, + "step": 2490 + }, + { + "epoch": 3.584172661870504, + "grad_norm": 0.0640938741526328, + "learning_rate": 1.8005342226850423e-05, + "loss": 0.3146, + "step": 2491 + }, + { + "epoch": 3.5856115107913666, + "grad_norm": 0.0761611712183301, + "learning_rate": 1.7971787315325684e-05, + "loss": 0.316, + "step": 2492 + }, + { + "epoch": 3.5870503597122303, + "grad_norm": 0.06930077652154204, + "learning_rate": 1.7938254638087946e-05, + "loss": 0.3079, + "step": 2493 + }, + { + "epoch": 3.5884892086330935, + "grad_norm": 0.0716508617153888, + "learning_rate": 1.7904744228983585e-05, + "loss": 0.307, + "step": 2494 + }, + { + "epoch": 3.5899280575539567, + "grad_norm": 0.06378955887830169, + "learning_rate": 1.7871256121836507e-05, + "loss": 0.3084, + "step": 2495 + }, + { + "epoch": 3.5913669064748204, + "grad_norm": 0.06898893310679868, + "learning_rate": 1.7837790350448098e-05, + "loss": 0.3068, + "step": 2496 + }, + { + "epoch": 3.5928057553956836, + "grad_norm": 0.06604980702701138, + "learning_rate": 1.7804346948597206e-05, + "loss": 0.3044, + "step": 2497 + }, + { + "epoch": 3.594244604316547, + "grad_norm": 0.07092813218675978, + "learning_rate": 1.7770925950040114e-05, + "loss": 0.301, + "step": 2498 + }, + { + "epoch": 3.59568345323741, + "grad_norm": 0.06175603012900154, + "learning_rate": 1.773752738851042e-05, + "loss": 0.3058, + "step": 2499 + }, + { + "epoch": 3.597122302158273, + "grad_norm": 0.0685423524799416, + "learning_rate": 1.770415129771918e-05, + "loss": 0.3117, + "step": 2500 + }, + { + "epoch": 3.598561151079137, + "grad_norm": 0.06536157690154143, + "learning_rate": 1.7670797711354724e-05, + "loss": 0.3121, + "step": 2501 + }, + { + "epoch": 3.6, + "grad_norm": 0.0710392964305355, + "learning_rate": 1.763746666308261e-05, + "loss": 0.3121, + "step": 2502 + }, + { + "epoch": 3.6014388489208633, + "grad_norm": 0.06777863684239872, + "learning_rate": 1.760415818654574e-05, + "loss": 0.321, + "step": 2503 + }, + { + "epoch": 3.6028776978417265, + "grad_norm": 0.07728431278232174, + "learning_rate": 1.75708723153642e-05, + "loss": 0.3104, + "step": 2504 + }, + { + "epoch": 3.6043165467625897, + "grad_norm": 0.07168939295547448, + "learning_rate": 1.7537609083135224e-05, + "loss": 0.3069, + "step": 2505 + }, + { + "epoch": 3.6057553956834534, + "grad_norm": 0.08032230286800195, + "learning_rate": 1.7504368523433216e-05, + "loss": 0.3024, + "step": 2506 + }, + { + "epoch": 3.6071942446043166, + "grad_norm": 0.05749273372875255, + "learning_rate": 1.747115066980974e-05, + "loss": 0.3017, + "step": 2507 + }, + { + "epoch": 3.60863309352518, + "grad_norm": 0.07140683658749929, + "learning_rate": 1.7437955555793372e-05, + "loss": 0.311, + "step": 2508 + }, + { + "epoch": 3.610071942446043, + "grad_norm": 0.0732014880124806, + "learning_rate": 1.740478321488978e-05, + "loss": 0.3092, + "step": 2509 + }, + { + "epoch": 3.6115107913669062, + "grad_norm": 0.0662511977338535, + "learning_rate": 1.737163368058162e-05, + "loss": 0.3091, + "step": 2510 + }, + { + "epoch": 3.61294964028777, + "grad_norm": 0.06611797050487064, + "learning_rate": 1.7338506986328552e-05, + "loss": 0.3054, + "step": 2511 + }, + { + "epoch": 3.614388489208633, + "grad_norm": 0.06937017863617154, + "learning_rate": 1.730540316556717e-05, + "loss": 0.307, + "step": 2512 + }, + { + "epoch": 3.6158273381294963, + "grad_norm": 0.06385096341018605, + "learning_rate": 1.727232225171098e-05, + "loss": 0.3053, + "step": 2513 + }, + { + "epoch": 3.61726618705036, + "grad_norm": 0.07092946980616527, + "learning_rate": 1.7239264278150364e-05, + "loss": 0.317, + "step": 2514 + }, + { + "epoch": 3.618705035971223, + "grad_norm": 0.08718889810295921, + "learning_rate": 1.7206229278252577e-05, + "loss": 0.3105, + "step": 2515 + }, + { + "epoch": 3.6201438848920864, + "grad_norm": 0.0719947954139465, + "learning_rate": 1.717321728536163e-05, + "loss": 0.3113, + "step": 2516 + }, + { + "epoch": 3.6215827338129496, + "grad_norm": 0.06418350060711016, + "learning_rate": 1.7140228332798336e-05, + "loss": 0.3065, + "step": 2517 + }, + { + "epoch": 3.623021582733813, + "grad_norm": 0.07292396382004765, + "learning_rate": 1.7107262453860308e-05, + "loss": 0.3162, + "step": 2518 + }, + { + "epoch": 3.6244604316546765, + "grad_norm": 0.07322695486853029, + "learning_rate": 1.707431968182179e-05, + "loss": 0.3058, + "step": 2519 + }, + { + "epoch": 3.6258992805755397, + "grad_norm": 0.061263534587851595, + "learning_rate": 1.7041400049933726e-05, + "loss": 0.3083, + "step": 2520 + }, + { + "epoch": 3.627338129496403, + "grad_norm": 0.06960676333387338, + "learning_rate": 1.700850359142373e-05, + "loss": 0.3056, + "step": 2521 + }, + { + "epoch": 3.628776978417266, + "grad_norm": 0.06683108415097225, + "learning_rate": 1.6975630339496e-05, + "loss": 0.3116, + "step": 2522 + }, + { + "epoch": 3.6302158273381293, + "grad_norm": 0.06433834831333483, + "learning_rate": 1.6942780327331317e-05, + "loss": 0.3137, + "step": 2523 + }, + { + "epoch": 3.631654676258993, + "grad_norm": 0.0691490163582227, + "learning_rate": 1.6909953588087024e-05, + "loss": 0.3068, + "step": 2524 + }, + { + "epoch": 3.633093525179856, + "grad_norm": 0.06750580224774425, + "learning_rate": 1.687715015489691e-05, + "loss": 0.3013, + "step": 2525 + }, + { + "epoch": 3.6345323741007194, + "grad_norm": 0.07328497686061296, + "learning_rate": 1.6844370060871324e-05, + "loss": 0.3121, + "step": 2526 + }, + { + "epoch": 3.6359712230215826, + "grad_norm": 0.06526965025274721, + "learning_rate": 1.6811613339097022e-05, + "loss": 0.3036, + "step": 2527 + }, + { + "epoch": 3.637410071942446, + "grad_norm": 0.07280262414290611, + "learning_rate": 1.6778880022637123e-05, + "loss": 0.3101, + "step": 2528 + }, + { + "epoch": 3.6388489208633095, + "grad_norm": 0.06492770967340347, + "learning_rate": 1.674617014453121e-05, + "loss": 0.303, + "step": 2529 + }, + { + "epoch": 3.6402877697841727, + "grad_norm": 0.07030555773347764, + "learning_rate": 1.6713483737795155e-05, + "loss": 0.3086, + "step": 2530 + }, + { + "epoch": 3.641726618705036, + "grad_norm": 0.07097045668922858, + "learning_rate": 1.6680820835421124e-05, + "loss": 0.3136, + "step": 2531 + }, + { + "epoch": 3.6431654676258995, + "grad_norm": 0.06551477015746095, + "learning_rate": 1.664818147037758e-05, + "loss": 0.3116, + "step": 2532 + }, + { + "epoch": 3.6446043165467623, + "grad_norm": 0.07000518869280055, + "learning_rate": 1.6615565675609272e-05, + "loss": 0.3064, + "step": 2533 + }, + { + "epoch": 3.646043165467626, + "grad_norm": 0.06683324068068973, + "learning_rate": 1.6582973484037076e-05, + "loss": 0.3047, + "step": 2534 + }, + { + "epoch": 3.647482014388489, + "grad_norm": 0.06536828321825529, + "learning_rate": 1.6550404928558094e-05, + "loss": 0.3101, + "step": 2535 + }, + { + "epoch": 3.6489208633093524, + "grad_norm": 0.06941870589984539, + "learning_rate": 1.6517860042045564e-05, + "loss": 0.3107, + "step": 2536 + }, + { + "epoch": 3.650359712230216, + "grad_norm": 0.061878311241111955, + "learning_rate": 1.6485338857348826e-05, + "loss": 0.3115, + "step": 2537 + }, + { + "epoch": 3.6517985611510793, + "grad_norm": 0.0727082628618032, + "learning_rate": 1.6452841407293307e-05, + "loss": 0.3036, + "step": 2538 + }, + { + "epoch": 3.6532374100719425, + "grad_norm": 0.060659731854837945, + "learning_rate": 1.642036772468047e-05, + "loss": 0.3099, + "step": 2539 + }, + { + "epoch": 3.6546762589928057, + "grad_norm": 0.06868866377712568, + "learning_rate": 1.6387917842287783e-05, + "loss": 0.3123, + "step": 2540 + }, + { + "epoch": 3.656115107913669, + "grad_norm": 0.07082289976171845, + "learning_rate": 1.635549179286871e-05, + "loss": 0.3083, + "step": 2541 + }, + { + "epoch": 3.6575539568345325, + "grad_norm": 0.06037483578222037, + "learning_rate": 1.6323089609152648e-05, + "loss": 0.311, + "step": 2542 + }, + { + "epoch": 3.6589928057553958, + "grad_norm": 0.06728793506224287, + "learning_rate": 1.6290711323844866e-05, + "loss": 0.3107, + "step": 2543 + }, + { + "epoch": 3.660431654676259, + "grad_norm": 0.06837921156473406, + "learning_rate": 1.6258356969626614e-05, + "loss": 0.3121, + "step": 2544 + }, + { + "epoch": 3.661870503597122, + "grad_norm": 0.06371398556508498, + "learning_rate": 1.622602657915487e-05, + "loss": 0.3089, + "step": 2545 + }, + { + "epoch": 3.6633093525179854, + "grad_norm": 0.07837110896728085, + "learning_rate": 1.6193720185062484e-05, + "loss": 0.3193, + "step": 2546 + }, + { + "epoch": 3.664748201438849, + "grad_norm": 0.07209174598097369, + "learning_rate": 1.6161437819958087e-05, + "loss": 0.3161, + "step": 2547 + }, + { + "epoch": 3.6661870503597123, + "grad_norm": 0.07064470115761468, + "learning_rate": 1.6129179516426048e-05, + "loss": 0.3111, + "step": 2548 + }, + { + "epoch": 3.6676258992805755, + "grad_norm": 0.08100857076451701, + "learning_rate": 1.609694530702644e-05, + "loss": 0.305, + "step": 2549 + }, + { + "epoch": 3.6690647482014387, + "grad_norm": 0.06613161652594873, + "learning_rate": 1.6064735224295027e-05, + "loss": 0.3088, + "step": 2550 + }, + { + "epoch": 3.670503597122302, + "grad_norm": 0.0713894278866626, + "learning_rate": 1.603254930074322e-05, + "loss": 0.307, + "step": 2551 + }, + { + "epoch": 3.6719424460431656, + "grad_norm": 0.06583824106135408, + "learning_rate": 1.6000387568858042e-05, + "loss": 0.3136, + "step": 2552 + }, + { + "epoch": 3.6733812949640288, + "grad_norm": 0.06573697888323053, + "learning_rate": 1.5968250061102105e-05, + "loss": 0.3094, + "step": 2553 + }, + { + "epoch": 3.674820143884892, + "grad_norm": 0.06482376965631038, + "learning_rate": 1.593613680991353e-05, + "loss": 0.3092, + "step": 2554 + }, + { + "epoch": 3.6762589928057556, + "grad_norm": 0.06347306421022354, + "learning_rate": 1.590404784770603e-05, + "loss": 0.3123, + "step": 2555 + }, + { + "epoch": 3.677697841726619, + "grad_norm": 0.06437733500977535, + "learning_rate": 1.5871983206868756e-05, + "loss": 0.3204, + "step": 2556 + }, + { + "epoch": 3.679136690647482, + "grad_norm": 0.06822100891443192, + "learning_rate": 1.583994291976629e-05, + "loss": 0.3046, + "step": 2557 + }, + { + "epoch": 3.6805755395683453, + "grad_norm": 0.0615780409502821, + "learning_rate": 1.580792701873865e-05, + "loss": 0.3131, + "step": 2558 + }, + { + "epoch": 3.6820143884892085, + "grad_norm": 0.060451882456547056, + "learning_rate": 1.5775935536101296e-05, + "loss": 0.3077, + "step": 2559 + }, + { + "epoch": 3.683453237410072, + "grad_norm": 0.05949962881040066, + "learning_rate": 1.5743968504144946e-05, + "loss": 0.3067, + "step": 2560 + }, + { + "epoch": 3.6848920863309353, + "grad_norm": 0.06496845677062033, + "learning_rate": 1.57120259551357e-05, + "loss": 0.3046, + "step": 2561 + }, + { + "epoch": 3.6863309352517986, + "grad_norm": 0.06347335914473873, + "learning_rate": 1.5680107921314926e-05, + "loss": 0.3035, + "step": 2562 + }, + { + "epoch": 3.6877697841726618, + "grad_norm": 0.05931661956429885, + "learning_rate": 1.5648214434899257e-05, + "loss": 0.3086, + "step": 2563 + }, + { + "epoch": 3.689208633093525, + "grad_norm": 0.057231306144126665, + "learning_rate": 1.5616345528080537e-05, + "loss": 0.3024, + "step": 2564 + }, + { + "epoch": 3.6906474820143886, + "grad_norm": 0.05686512028571938, + "learning_rate": 1.5584501233025813e-05, + "loss": 0.3125, + "step": 2565 + }, + { + "epoch": 3.692086330935252, + "grad_norm": 0.058433105562290054, + "learning_rate": 1.555268158187728e-05, + "loss": 0.3075, + "step": 2566 + }, + { + "epoch": 3.693525179856115, + "grad_norm": 0.055485795231595025, + "learning_rate": 1.552088660675227e-05, + "loss": 0.3057, + "step": 2567 + }, + { + "epoch": 3.6949640287769783, + "grad_norm": 0.06043557385741081, + "learning_rate": 1.54891163397432e-05, + "loss": 0.3142, + "step": 2568 + }, + { + "epoch": 3.6964028776978415, + "grad_norm": 0.0601382784406409, + "learning_rate": 1.5457370812917526e-05, + "loss": 0.3042, + "step": 2569 + }, + { + "epoch": 3.697841726618705, + "grad_norm": 0.05986032978864982, + "learning_rate": 1.5425650058317795e-05, + "loss": 0.3104, + "step": 2570 + }, + { + "epoch": 3.6992805755395683, + "grad_norm": 0.05796349899696422, + "learning_rate": 1.5393954107961467e-05, + "loss": 0.309, + "step": 2571 + }, + { + "epoch": 3.7007194244604316, + "grad_norm": 0.06231745618425038, + "learning_rate": 1.536228299384102e-05, + "loss": 0.3125, + "step": 2572 + }, + { + "epoch": 3.702158273381295, + "grad_norm": 0.06033391802311592, + "learning_rate": 1.533063674792389e-05, + "loss": 0.3104, + "step": 2573 + }, + { + "epoch": 3.7035971223021584, + "grad_norm": 0.060051894055961556, + "learning_rate": 1.529901540215233e-05, + "loss": 0.3033, + "step": 2574 + }, + { + "epoch": 3.7050359712230216, + "grad_norm": 0.0660384081684206, + "learning_rate": 1.5267418988443517e-05, + "loss": 0.3106, + "step": 2575 + }, + { + "epoch": 3.706474820143885, + "grad_norm": 0.06951516962816624, + "learning_rate": 1.5235847538689452e-05, + "loss": 0.3085, + "step": 2576 + }, + { + "epoch": 3.707913669064748, + "grad_norm": 0.06775270235025226, + "learning_rate": 1.5204301084756936e-05, + "loss": 0.3133, + "step": 2577 + }, + { + "epoch": 3.7093525179856117, + "grad_norm": 0.06216993738083855, + "learning_rate": 1.5172779658487539e-05, + "loss": 0.3122, + "step": 2578 + }, + { + "epoch": 3.710791366906475, + "grad_norm": 0.06842070043738754, + "learning_rate": 1.5141283291697587e-05, + "loss": 0.3185, + "step": 2579 + }, + { + "epoch": 3.712230215827338, + "grad_norm": 0.0569504302697582, + "learning_rate": 1.5109812016178053e-05, + "loss": 0.312, + "step": 2580 + }, + { + "epoch": 3.7136690647482014, + "grad_norm": 0.06461140384892121, + "learning_rate": 1.5078365863694667e-05, + "loss": 0.2967, + "step": 2581 + }, + { + "epoch": 3.7151079136690646, + "grad_norm": 0.06872612298563056, + "learning_rate": 1.5046944865987763e-05, + "loss": 0.3032, + "step": 2582 + }, + { + "epoch": 3.716546762589928, + "grad_norm": 0.05831441535529986, + "learning_rate": 1.501554905477224e-05, + "loss": 0.3159, + "step": 2583 + }, + { + "epoch": 3.7179856115107914, + "grad_norm": 0.06402916303258853, + "learning_rate": 1.4984178461737663e-05, + "loss": 0.3135, + "step": 2584 + }, + { + "epoch": 3.7194244604316546, + "grad_norm": 0.06553586847679266, + "learning_rate": 1.4952833118548094e-05, + "loss": 0.3127, + "step": 2585 + }, + { + "epoch": 3.720863309352518, + "grad_norm": 0.06150736723057186, + "learning_rate": 1.492151305684208e-05, + "loss": 0.3072, + "step": 2586 + }, + { + "epoch": 3.722302158273381, + "grad_norm": 0.06696628246885926, + "learning_rate": 1.4890218308232704e-05, + "loss": 0.3099, + "step": 2587 + }, + { + "epoch": 3.7237410071942447, + "grad_norm": 0.06072453891762582, + "learning_rate": 1.4858948904307476e-05, + "loss": 0.3116, + "step": 2588 + }, + { + "epoch": 3.725179856115108, + "grad_norm": 0.06592704415232585, + "learning_rate": 1.4827704876628319e-05, + "loss": 0.3193, + "step": 2589 + }, + { + "epoch": 3.726618705035971, + "grad_norm": 0.06473227734575612, + "learning_rate": 1.4796486256731561e-05, + "loss": 0.3097, + "step": 2590 + }, + { + "epoch": 3.728057553956835, + "grad_norm": 0.05812771491009711, + "learning_rate": 1.4765293076127862e-05, + "loss": 0.3141, + "step": 2591 + }, + { + "epoch": 3.7294964028776976, + "grad_norm": 0.0602507431715896, + "learning_rate": 1.4734125366302224e-05, + "loss": 0.309, + "step": 2592 + }, + { + "epoch": 3.7309352517985612, + "grad_norm": 0.05787360066411294, + "learning_rate": 1.470298315871392e-05, + "loss": 0.3023, + "step": 2593 + }, + { + "epoch": 3.7323741007194244, + "grad_norm": 0.05675735349177474, + "learning_rate": 1.4671866484796505e-05, + "loss": 0.3111, + "step": 2594 + }, + { + "epoch": 3.7338129496402876, + "grad_norm": 0.06130772806247975, + "learning_rate": 1.4640775375957742e-05, + "loss": 0.3098, + "step": 2595 + }, + { + "epoch": 3.7352517985611513, + "grad_norm": 0.05898101582797852, + "learning_rate": 1.4609709863579622e-05, + "loss": 0.3107, + "step": 2596 + }, + { + "epoch": 3.7366906474820145, + "grad_norm": 0.061029615963601236, + "learning_rate": 1.4578669979018231e-05, + "loss": 0.3171, + "step": 2597 + }, + { + "epoch": 3.7381294964028777, + "grad_norm": 0.06442740289261426, + "learning_rate": 1.454765575360385e-05, + "loss": 0.3149, + "step": 2598 + }, + { + "epoch": 3.739568345323741, + "grad_norm": 0.06792699078247295, + "learning_rate": 1.4516667218640877e-05, + "loss": 0.3106, + "step": 2599 + }, + { + "epoch": 3.741007194244604, + "grad_norm": 0.06772812345824679, + "learning_rate": 1.4485704405407699e-05, + "loss": 0.3144, + "step": 2600 + }, + { + "epoch": 3.742446043165468, + "grad_norm": 0.06605950317390714, + "learning_rate": 1.4454767345156806e-05, + "loss": 0.3065, + "step": 2601 + }, + { + "epoch": 3.743884892086331, + "grad_norm": 0.05788478467738365, + "learning_rate": 1.4423856069114677e-05, + "loss": 0.308, + "step": 2602 + }, + { + "epoch": 3.7453237410071942, + "grad_norm": 0.06831851166439465, + "learning_rate": 1.4392970608481758e-05, + "loss": 0.3094, + "step": 2603 + }, + { + "epoch": 3.7467625899280574, + "grad_norm": 0.06028037648009121, + "learning_rate": 1.4362110994432445e-05, + "loss": 0.3124, + "step": 2604 + }, + { + "epoch": 3.7482014388489207, + "grad_norm": 0.06412255417807958, + "learning_rate": 1.433127725811505e-05, + "loss": 0.3081, + "step": 2605 + }, + { + "epoch": 3.7496402877697843, + "grad_norm": 0.05647372134593134, + "learning_rate": 1.4300469430651754e-05, + "loss": 0.3042, + "step": 2606 + }, + { + "epoch": 3.7510791366906475, + "grad_norm": 0.09179084647263566, + "learning_rate": 1.4269687543138594e-05, + "loss": 0.3141, + "step": 2607 + }, + { + "epoch": 3.7525179856115107, + "grad_norm": 0.06520308960062181, + "learning_rate": 1.4238931626645434e-05, + "loss": 0.3088, + "step": 2608 + }, + { + "epoch": 3.753956834532374, + "grad_norm": 0.07436791270658137, + "learning_rate": 1.4208201712215871e-05, + "loss": 0.3113, + "step": 2609 + }, + { + "epoch": 3.755395683453237, + "grad_norm": 0.05996695752235399, + "learning_rate": 1.4177497830867348e-05, + "loss": 0.3101, + "step": 2610 + }, + { + "epoch": 3.756834532374101, + "grad_norm": 0.05786446268831168, + "learning_rate": 1.4146820013590973e-05, + "loss": 0.3077, + "step": 2611 + }, + { + "epoch": 3.758273381294964, + "grad_norm": 0.06983516150573807, + "learning_rate": 1.411616829135153e-05, + "loss": 0.2973, + "step": 2612 + }, + { + "epoch": 3.7597122302158272, + "grad_norm": 0.0658565800376622, + "learning_rate": 1.4085542695087502e-05, + "loss": 0.3142, + "step": 2613 + }, + { + "epoch": 3.761151079136691, + "grad_norm": 0.16283982642674674, + "learning_rate": 1.4054943255710987e-05, + "loss": 0.3197, + "step": 2614 + }, + { + "epoch": 3.762589928057554, + "grad_norm": 0.06299679372917073, + "learning_rate": 1.4024370004107683e-05, + "loss": 0.3036, + "step": 2615 + }, + { + "epoch": 3.7640287769784173, + "grad_norm": 0.06638329576126877, + "learning_rate": 1.3993822971136859e-05, + "loss": 0.304, + "step": 2616 + }, + { + "epoch": 3.7654676258992805, + "grad_norm": 0.062057024816662185, + "learning_rate": 1.3963302187631316e-05, + "loss": 0.3182, + "step": 2617 + }, + { + "epoch": 3.7669064748201437, + "grad_norm": 0.05653817314485878, + "learning_rate": 1.3932807684397348e-05, + "loss": 0.3171, + "step": 2618 + }, + { + "epoch": 3.7683453237410074, + "grad_norm": 0.06107756482833042, + "learning_rate": 1.3902339492214751e-05, + "loss": 0.317, + "step": 2619 + }, + { + "epoch": 3.7697841726618706, + "grad_norm": 0.057954495172457515, + "learning_rate": 1.387189764183674e-05, + "loss": 0.3058, + "step": 2620 + }, + { + "epoch": 3.771223021582734, + "grad_norm": 0.06518604390601693, + "learning_rate": 1.384148216398995e-05, + "loss": 0.3053, + "step": 2621 + }, + { + "epoch": 3.772661870503597, + "grad_norm": 0.060156239559592144, + "learning_rate": 1.381109308937441e-05, + "loss": 0.3142, + "step": 2622 + }, + { + "epoch": 3.7741007194244602, + "grad_norm": 0.061219184840551555, + "learning_rate": 1.3780730448663456e-05, + "loss": 0.3029, + "step": 2623 + }, + { + "epoch": 3.775539568345324, + "grad_norm": 0.06872992961064582, + "learning_rate": 1.3750394272503775e-05, + "loss": 0.3038, + "step": 2624 + }, + { + "epoch": 3.776978417266187, + "grad_norm": 0.06207118803048583, + "learning_rate": 1.3720084591515374e-05, + "loss": 0.3155, + "step": 2625 + }, + { + "epoch": 3.7784172661870503, + "grad_norm": 0.06276725568796437, + "learning_rate": 1.3689801436291448e-05, + "loss": 0.3079, + "step": 2626 + }, + { + "epoch": 3.7798561151079135, + "grad_norm": 0.06301376305449499, + "learning_rate": 1.365954483739846e-05, + "loss": 0.3084, + "step": 2627 + }, + { + "epoch": 3.7812949640287767, + "grad_norm": 0.06786189913232177, + "learning_rate": 1.3629314825376061e-05, + "loss": 0.3017, + "step": 2628 + }, + { + "epoch": 3.7827338129496404, + "grad_norm": 0.06923629831247584, + "learning_rate": 1.359911143073707e-05, + "loss": 0.3126, + "step": 2629 + }, + { + "epoch": 3.7841726618705036, + "grad_norm": 0.05915691173482345, + "learning_rate": 1.3568934683967427e-05, + "loss": 0.3135, + "step": 2630 + }, + { + "epoch": 3.785611510791367, + "grad_norm": 0.06480400653976588, + "learning_rate": 1.3538784615526188e-05, + "loss": 0.3088, + "step": 2631 + }, + { + "epoch": 3.7870503597122305, + "grad_norm": 0.061066561210690914, + "learning_rate": 1.3508661255845477e-05, + "loss": 0.3166, + "step": 2632 + }, + { + "epoch": 3.7884892086330937, + "grad_norm": 0.05631072013232032, + "learning_rate": 1.3478564635330455e-05, + "loss": 0.3049, + "step": 2633 + }, + { + "epoch": 3.789928057553957, + "grad_norm": 0.06469049654437241, + "learning_rate": 1.344849478435931e-05, + "loss": 0.3153, + "step": 2634 + }, + { + "epoch": 3.79136690647482, + "grad_norm": 0.05619451199322114, + "learning_rate": 1.3418451733283156e-05, + "loss": 0.3076, + "step": 2635 + }, + { + "epoch": 3.7928057553956833, + "grad_norm": 0.06318767999957417, + "learning_rate": 1.3388435512426142e-05, + "loss": 0.3099, + "step": 2636 + }, + { + "epoch": 3.794244604316547, + "grad_norm": 0.11931093635445984, + "learning_rate": 1.3358446152085289e-05, + "loss": 0.3212, + "step": 2637 + }, + { + "epoch": 3.79568345323741, + "grad_norm": 0.05764009252951675, + "learning_rate": 1.332848368253048e-05, + "loss": 0.3197, + "step": 2638 + }, + { + "epoch": 3.7971223021582734, + "grad_norm": 0.06625803834027354, + "learning_rate": 1.3298548134004498e-05, + "loss": 0.3098, + "step": 2639 + }, + { + "epoch": 3.7985611510791366, + "grad_norm": 0.06722982196993153, + "learning_rate": 1.326863953672294e-05, + "loss": 0.3048, + "step": 2640 + }, + { + "epoch": 3.8, + "grad_norm": 0.05956157666694582, + "learning_rate": 1.3238757920874203e-05, + "loss": 0.3113, + "step": 2641 + }, + { + "epoch": 3.8014388489208635, + "grad_norm": 0.06408147015237593, + "learning_rate": 1.3208903316619436e-05, + "loss": 0.3138, + "step": 2642 + }, + { + "epoch": 3.8028776978417267, + "grad_norm": 0.05836561799046896, + "learning_rate": 1.317907575409254e-05, + "loss": 0.3113, + "step": 2643 + }, + { + "epoch": 3.80431654676259, + "grad_norm": 0.06586186700166624, + "learning_rate": 1.3149275263400116e-05, + "loss": 0.3068, + "step": 2644 + }, + { + "epoch": 3.805755395683453, + "grad_norm": 0.05574505585504934, + "learning_rate": 1.3119501874621437e-05, + "loss": 0.3045, + "step": 2645 + }, + { + "epoch": 3.8071942446043163, + "grad_norm": 0.06191955826077262, + "learning_rate": 1.3089755617808417e-05, + "loss": 0.3069, + "step": 2646 + }, + { + "epoch": 3.80863309352518, + "grad_norm": 0.06409880201156713, + "learning_rate": 1.3060036522985598e-05, + "loss": 0.3149, + "step": 2647 + }, + { + "epoch": 3.810071942446043, + "grad_norm": 0.05713343345695329, + "learning_rate": 1.3030344620150105e-05, + "loss": 0.3054, + "step": 2648 + }, + { + "epoch": 3.8115107913669064, + "grad_norm": 0.07271851070609117, + "learning_rate": 1.3000679939271588e-05, + "loss": 0.3102, + "step": 2649 + }, + { + "epoch": 3.81294964028777, + "grad_norm": 0.06882430273590348, + "learning_rate": 1.2971042510292238e-05, + "loss": 0.3021, + "step": 2650 + }, + { + "epoch": 3.814388489208633, + "grad_norm": 0.06367380750131976, + "learning_rate": 1.2941432363126784e-05, + "loss": 0.3134, + "step": 2651 + }, + { + "epoch": 3.8158273381294965, + "grad_norm": 0.06397499285479019, + "learning_rate": 1.2911849527662335e-05, + "loss": 0.3158, + "step": 2652 + }, + { + "epoch": 3.8172661870503597, + "grad_norm": 0.06668572646459466, + "learning_rate": 1.2882294033758473e-05, + "loss": 0.3144, + "step": 2653 + }, + { + "epoch": 3.818705035971223, + "grad_norm": 0.06703585684409313, + "learning_rate": 1.2852765911247227e-05, + "loss": 0.3043, + "step": 2654 + }, + { + "epoch": 3.8201438848920866, + "grad_norm": 0.06884979624293733, + "learning_rate": 1.2823265189932914e-05, + "loss": 0.3147, + "step": 2655 + }, + { + "epoch": 3.8215827338129498, + "grad_norm": 0.059878668782955315, + "learning_rate": 1.2793791899592254e-05, + "loss": 0.3191, + "step": 2656 + }, + { + "epoch": 3.823021582733813, + "grad_norm": 0.059485779702165203, + "learning_rate": 1.2764346069974249e-05, + "loss": 0.3041, + "step": 2657 + }, + { + "epoch": 3.824460431654676, + "grad_norm": 0.07647157975745708, + "learning_rate": 1.2734927730800206e-05, + "loss": 0.3062, + "step": 2658 + }, + { + "epoch": 3.8258992805755394, + "grad_norm": 0.16489538003955, + "learning_rate": 1.2705536911763665e-05, + "loss": 0.3165, + "step": 2659 + }, + { + "epoch": 3.827338129496403, + "grad_norm": 0.06318125861693077, + "learning_rate": 1.2676173642530417e-05, + "loss": 0.3033, + "step": 2660 + }, + { + "epoch": 3.8287769784172663, + "grad_norm": 0.06567256901298571, + "learning_rate": 1.2646837952738382e-05, + "loss": 0.3056, + "step": 2661 + }, + { + "epoch": 3.8302158273381295, + "grad_norm": 0.05518801916037263, + "learning_rate": 1.2617529871997727e-05, + "loss": 0.3084, + "step": 2662 + }, + { + "epoch": 3.8316546762589927, + "grad_norm": 0.06215454503091133, + "learning_rate": 1.2588249429890706e-05, + "loss": 0.3085, + "step": 2663 + }, + { + "epoch": 3.833093525179856, + "grad_norm": 0.06215747767088541, + "learning_rate": 1.2558996655971644e-05, + "loss": 0.3131, + "step": 2664 + }, + { + "epoch": 3.8345323741007196, + "grad_norm": 0.0571081373298139, + "learning_rate": 1.2529771579767024e-05, + "loss": 0.3157, + "step": 2665 + }, + { + "epoch": 3.8359712230215828, + "grad_norm": 0.05764282070698008, + "learning_rate": 1.2500574230775294e-05, + "loss": 0.3119, + "step": 2666 + }, + { + "epoch": 3.837410071942446, + "grad_norm": 0.061498174350714155, + "learning_rate": 1.2471404638466949e-05, + "loss": 0.3065, + "step": 2667 + }, + { + "epoch": 3.838848920863309, + "grad_norm": 0.06444471515107401, + "learning_rate": 1.2442262832284464e-05, + "loss": 0.3127, + "step": 2668 + }, + { + "epoch": 3.8402877697841724, + "grad_norm": 0.05249170213553225, + "learning_rate": 1.2413148841642268e-05, + "loss": 0.3022, + "step": 2669 + }, + { + "epoch": 3.841726618705036, + "grad_norm": 0.06441398889770678, + "learning_rate": 1.2384062695926713e-05, + "loss": 0.3068, + "step": 2670 + }, + { + "epoch": 3.8431654676258993, + "grad_norm": 0.062304092178714035, + "learning_rate": 1.235500442449605e-05, + "loss": 0.2992, + "step": 2671 + }, + { + "epoch": 3.8446043165467625, + "grad_norm": 0.060928419468288225, + "learning_rate": 1.232597405668039e-05, + "loss": 0.3064, + "step": 2672 + }, + { + "epoch": 3.846043165467626, + "grad_norm": 0.0614376313092846, + "learning_rate": 1.2296971621781677e-05, + "loss": 0.302, + "step": 2673 + }, + { + "epoch": 3.8474820143884894, + "grad_norm": 0.07040298267064284, + "learning_rate": 1.2267997149073679e-05, + "loss": 0.3107, + "step": 2674 + }, + { + "epoch": 3.8489208633093526, + "grad_norm": 0.06682703801152587, + "learning_rate": 1.2239050667801885e-05, + "loss": 0.3143, + "step": 2675 + }, + { + "epoch": 3.850359712230216, + "grad_norm": 0.057759323436254666, + "learning_rate": 1.2210132207183611e-05, + "loss": 0.3173, + "step": 2676 + }, + { + "epoch": 3.851798561151079, + "grad_norm": 0.06570271107765477, + "learning_rate": 1.2181241796407855e-05, + "loss": 0.3128, + "step": 2677 + }, + { + "epoch": 3.8532374100719426, + "grad_norm": 0.056153261565319953, + "learning_rate": 1.2152379464635264e-05, + "loss": 0.3121, + "step": 2678 + }, + { + "epoch": 3.854676258992806, + "grad_norm": 0.07057172176534204, + "learning_rate": 1.2123545240998182e-05, + "loss": 0.3091, + "step": 2679 + }, + { + "epoch": 3.856115107913669, + "grad_norm": 0.05974707197694837, + "learning_rate": 1.2094739154600616e-05, + "loss": 0.3077, + "step": 2680 + }, + { + "epoch": 3.8575539568345323, + "grad_norm": 0.0585049483720299, + "learning_rate": 1.2065961234518096e-05, + "loss": 0.3042, + "step": 2681 + }, + { + "epoch": 3.8589928057553955, + "grad_norm": 0.05401931011856757, + "learning_rate": 1.2037211509797771e-05, + "loss": 0.3084, + "step": 2682 + }, + { + "epoch": 3.860431654676259, + "grad_norm": 0.05991698970812978, + "learning_rate": 1.2008490009458322e-05, + "loss": 0.3026, + "step": 2683 + }, + { + "epoch": 3.8618705035971224, + "grad_norm": 0.06668618504592472, + "learning_rate": 1.1979796762489934e-05, + "loss": 0.3095, + "step": 2684 + }, + { + "epoch": 3.8633093525179856, + "grad_norm": 0.05842009339861893, + "learning_rate": 1.195113179785429e-05, + "loss": 0.3123, + "step": 2685 + }, + { + "epoch": 3.864748201438849, + "grad_norm": 0.05566524147630196, + "learning_rate": 1.1922495144484504e-05, + "loss": 0.3028, + "step": 2686 + }, + { + "epoch": 3.866187050359712, + "grad_norm": 0.061870388935277666, + "learning_rate": 1.1893886831285136e-05, + "loss": 0.3076, + "step": 2687 + }, + { + "epoch": 3.8676258992805757, + "grad_norm": 0.05847049335714899, + "learning_rate": 1.1865306887132122e-05, + "loss": 0.3023, + "step": 2688 + }, + { + "epoch": 3.869064748201439, + "grad_norm": 0.0528554059921117, + "learning_rate": 1.183675534087279e-05, + "loss": 0.3043, + "step": 2689 + }, + { + "epoch": 3.870503597122302, + "grad_norm": 0.0578540020472865, + "learning_rate": 1.1808232221325749e-05, + "loss": 0.301, + "step": 2690 + }, + { + "epoch": 3.8719424460431657, + "grad_norm": 0.057910505762297546, + "learning_rate": 1.1779737557280985e-05, + "loss": 0.3087, + "step": 2691 + }, + { + "epoch": 3.873381294964029, + "grad_norm": 0.06273079560410788, + "learning_rate": 1.1751271377499736e-05, + "loss": 0.3067, + "step": 2692 + }, + { + "epoch": 3.874820143884892, + "grad_norm": 0.05935430832911722, + "learning_rate": 1.1722833710714454e-05, + "loss": 0.3128, + "step": 2693 + }, + { + "epoch": 3.8762589928057554, + "grad_norm": 0.176977809547984, + "learning_rate": 1.1694424585628861e-05, + "loss": 0.3207, + "step": 2694 + }, + { + "epoch": 3.8776978417266186, + "grad_norm": 0.06022004430395275, + "learning_rate": 1.166604403091784e-05, + "loss": 0.3014, + "step": 2695 + }, + { + "epoch": 3.8791366906474822, + "grad_norm": 0.06223362207654486, + "learning_rate": 1.1637692075227451e-05, + "loss": 0.3089, + "step": 2696 + }, + { + "epoch": 3.8805755395683454, + "grad_norm": 0.06874855910589964, + "learning_rate": 1.1609368747174883e-05, + "loss": 0.3143, + "step": 2697 + }, + { + "epoch": 3.8820143884892087, + "grad_norm": 0.060370453841061725, + "learning_rate": 1.1581074075348431e-05, + "loss": 0.3172, + "step": 2698 + }, + { + "epoch": 3.883453237410072, + "grad_norm": 0.06925325983031541, + "learning_rate": 1.155280808830746e-05, + "loss": 0.3071, + "step": 2699 + }, + { + "epoch": 3.884892086330935, + "grad_norm": 0.06409960859469387, + "learning_rate": 1.15245708145824e-05, + "loss": 0.309, + "step": 2700 + }, + { + "epoch": 3.8863309352517987, + "grad_norm": 0.062347138259423694, + "learning_rate": 1.1496362282674647e-05, + "loss": 0.3066, + "step": 2701 + }, + { + "epoch": 3.887769784172662, + "grad_norm": 0.060137578227819184, + "learning_rate": 1.1468182521056663e-05, + "loss": 0.3057, + "step": 2702 + }, + { + "epoch": 3.889208633093525, + "grad_norm": 0.0663260823248241, + "learning_rate": 1.1440031558171834e-05, + "loss": 0.3119, + "step": 2703 + }, + { + "epoch": 3.8906474820143884, + "grad_norm": 0.05526910347359479, + "learning_rate": 1.1411909422434441e-05, + "loss": 0.3063, + "step": 2704 + }, + { + "epoch": 3.8920863309352516, + "grad_norm": 0.05847051252150774, + "learning_rate": 1.1383816142229715e-05, + "loss": 0.2982, + "step": 2705 + }, + { + "epoch": 3.8935251798561152, + "grad_norm": 0.06429140059660536, + "learning_rate": 1.1355751745913781e-05, + "loss": 0.3076, + "step": 2706 + }, + { + "epoch": 3.8949640287769784, + "grad_norm": 0.05672662126677904, + "learning_rate": 1.1327716261813539e-05, + "loss": 0.3074, + "step": 2707 + }, + { + "epoch": 3.8964028776978417, + "grad_norm": 0.06428826342831176, + "learning_rate": 1.1299709718226745e-05, + "loss": 0.3203, + "step": 2708 + }, + { + "epoch": 3.897841726618705, + "grad_norm": 0.06905960502906461, + "learning_rate": 1.1271732143421992e-05, + "loss": 0.3145, + "step": 2709 + }, + { + "epoch": 3.899280575539568, + "grad_norm": 0.05555046754317393, + "learning_rate": 1.1243783565638533e-05, + "loss": 0.3119, + "step": 2710 + }, + { + "epoch": 3.9007194244604317, + "grad_norm": 0.2581207444305228, + "learning_rate": 1.121586401308643e-05, + "loss": 0.3183, + "step": 2711 + }, + { + "epoch": 3.902158273381295, + "grad_norm": 0.05829992626182602, + "learning_rate": 1.1187973513946417e-05, + "loss": 0.3074, + "step": 2712 + }, + { + "epoch": 3.903597122302158, + "grad_norm": 0.056376048677371636, + "learning_rate": 1.1160112096369913e-05, + "loss": 0.318, + "step": 2713 + }, + { + "epoch": 3.905035971223022, + "grad_norm": 0.059928006320685914, + "learning_rate": 1.1132279788478977e-05, + "loss": 0.308, + "step": 2714 + }, + { + "epoch": 3.906474820143885, + "grad_norm": 0.05541920682418896, + "learning_rate": 1.1104476618366298e-05, + "loss": 0.3173, + "step": 2715 + }, + { + "epoch": 3.9079136690647482, + "grad_norm": 0.05309312395846593, + "learning_rate": 1.1076702614095116e-05, + "loss": 0.2997, + "step": 2716 + }, + { + "epoch": 3.9093525179856115, + "grad_norm": 0.060377652429816915, + "learning_rate": 1.1048957803699292e-05, + "loss": 0.3172, + "step": 2717 + }, + { + "epoch": 3.9107913669064747, + "grad_norm": 0.05214388021450585, + "learning_rate": 1.1021242215183193e-05, + "loss": 0.308, + "step": 2718 + }, + { + "epoch": 3.9122302158273383, + "grad_norm": 0.054476845725141956, + "learning_rate": 1.0993555876521658e-05, + "loss": 0.3024, + "step": 2719 + }, + { + "epoch": 3.9136690647482015, + "grad_norm": 0.05522494995246515, + "learning_rate": 1.096589881566005e-05, + "loss": 0.3062, + "step": 2720 + }, + { + "epoch": 3.9151079136690647, + "grad_norm": 0.05330150644521752, + "learning_rate": 1.0938271060514162e-05, + "loss": 0.3068, + "step": 2721 + }, + { + "epoch": 3.916546762589928, + "grad_norm": 0.05748988376868886, + "learning_rate": 1.0910672638970206e-05, + "loss": 0.3094, + "step": 2722 + }, + { + "epoch": 3.917985611510791, + "grad_norm": 0.0554307762195385, + "learning_rate": 1.0883103578884784e-05, + "loss": 0.306, + "step": 2723 + }, + { + "epoch": 3.919424460431655, + "grad_norm": 0.05323938446383709, + "learning_rate": 1.085556390808487e-05, + "loss": 0.3193, + "step": 2724 + }, + { + "epoch": 3.920863309352518, + "grad_norm": 0.056347745539254916, + "learning_rate": 1.082805365436776e-05, + "loss": 0.3082, + "step": 2725 + }, + { + "epoch": 3.9223021582733812, + "grad_norm": 0.056289500329134816, + "learning_rate": 1.0800572845501095e-05, + "loss": 0.3121, + "step": 2726 + }, + { + "epoch": 3.9237410071942445, + "grad_norm": 0.05554719201993376, + "learning_rate": 1.0773121509222712e-05, + "loss": 0.3182, + "step": 2727 + }, + { + "epoch": 3.9251798561151077, + "grad_norm": 0.06374598503779488, + "learning_rate": 1.0745699673240808e-05, + "loss": 0.3136, + "step": 2728 + }, + { + "epoch": 3.9266187050359713, + "grad_norm": 0.06042213685904112, + "learning_rate": 1.0718307365233737e-05, + "loss": 0.3083, + "step": 2729 + }, + { + "epoch": 3.9280575539568345, + "grad_norm": 0.05747646863163404, + "learning_rate": 1.0690944612850052e-05, + "loss": 0.3121, + "step": 2730 + }, + { + "epoch": 3.9294964028776977, + "grad_norm": 0.06334409390272748, + "learning_rate": 1.0663611443708471e-05, + "loss": 0.3054, + "step": 2731 + }, + { + "epoch": 3.9309352517985614, + "grad_norm": 0.053503891340662454, + "learning_rate": 1.0636307885397911e-05, + "loss": 0.3128, + "step": 2732 + }, + { + "epoch": 3.9323741007194246, + "grad_norm": 0.06154617585575193, + "learning_rate": 1.0609033965477318e-05, + "loss": 0.3101, + "step": 2733 + }, + { + "epoch": 3.933812949640288, + "grad_norm": 0.0581555444560647, + "learning_rate": 1.0581789711475752e-05, + "loss": 0.3088, + "step": 2734 + }, + { + "epoch": 3.935251798561151, + "grad_norm": 0.0592766340142503, + "learning_rate": 1.0554575150892386e-05, + "loss": 0.3144, + "step": 2735 + }, + { + "epoch": 3.9366906474820142, + "grad_norm": 0.05803536652402496, + "learning_rate": 1.0527390311196326e-05, + "loss": 0.316, + "step": 2736 + }, + { + "epoch": 3.938129496402878, + "grad_norm": 0.12891965481062795, + "learning_rate": 1.0500235219826748e-05, + "loss": 0.3216, + "step": 2737 + }, + { + "epoch": 3.939568345323741, + "grad_norm": 0.060135996685002246, + "learning_rate": 1.0473109904192773e-05, + "loss": 0.3016, + "step": 2738 + }, + { + "epoch": 3.9410071942446043, + "grad_norm": 0.05936836251326289, + "learning_rate": 1.0446014391673476e-05, + "loss": 0.3117, + "step": 2739 + }, + { + "epoch": 3.9424460431654675, + "grad_norm": 0.05606392843941533, + "learning_rate": 1.0418948709617846e-05, + "loss": 0.3051, + "step": 2740 + }, + { + "epoch": 3.9438848920863308, + "grad_norm": 0.0615264351109077, + "learning_rate": 1.0391912885344784e-05, + "loss": 0.315, + "step": 2741 + }, + { + "epoch": 3.9453237410071944, + "grad_norm": 0.05647955827623214, + "learning_rate": 1.0364906946142996e-05, + "loss": 0.3053, + "step": 2742 + }, + { + "epoch": 3.9467625899280576, + "grad_norm": 0.054273776569956106, + "learning_rate": 1.0337930919271094e-05, + "loss": 0.308, + "step": 2743 + }, + { + "epoch": 3.948201438848921, + "grad_norm": 0.10330088413233861, + "learning_rate": 1.0310984831957471e-05, + "loss": 0.3156, + "step": 2744 + }, + { + "epoch": 3.949640287769784, + "grad_norm": 0.05528293581596261, + "learning_rate": 1.0284068711400254e-05, + "loss": 0.3031, + "step": 2745 + }, + { + "epoch": 3.9510791366906473, + "grad_norm": 0.05894655309137916, + "learning_rate": 1.0257182584767423e-05, + "loss": 0.3075, + "step": 2746 + }, + { + "epoch": 3.952517985611511, + "grad_norm": 0.05607153633072281, + "learning_rate": 1.0230326479196573e-05, + "loss": 0.3065, + "step": 2747 + }, + { + "epoch": 3.953956834532374, + "grad_norm": 0.055195443443561164, + "learning_rate": 1.0203500421795075e-05, + "loss": 0.3152, + "step": 2748 + }, + { + "epoch": 3.9553956834532373, + "grad_norm": 0.06149427347933062, + "learning_rate": 1.017670443963994e-05, + "loss": 0.3079, + "step": 2749 + }, + { + "epoch": 3.956834532374101, + "grad_norm": 0.06094647156893332, + "learning_rate": 1.0149938559777825e-05, + "loss": 0.309, + "step": 2750 + }, + { + "epoch": 3.9582733812949638, + "grad_norm": 0.05664212483371998, + "learning_rate": 1.0123202809225009e-05, + "loss": 0.3117, + "step": 2751 + }, + { + "epoch": 3.9597122302158274, + "grad_norm": 0.06329360387733363, + "learning_rate": 1.0096497214967349e-05, + "loss": 0.3036, + "step": 2752 + }, + { + "epoch": 3.9611510791366906, + "grad_norm": 0.058119174435414415, + "learning_rate": 1.0069821803960277e-05, + "loss": 0.3131, + "step": 2753 + }, + { + "epoch": 3.962589928057554, + "grad_norm": 0.05595191987767366, + "learning_rate": 1.0043176603128755e-05, + "loss": 0.3103, + "step": 2754 + }, + { + "epoch": 3.9640287769784175, + "grad_norm": 0.05502864366856433, + "learning_rate": 1.0016561639367253e-05, + "loss": 0.3097, + "step": 2755 + }, + { + "epoch": 3.9654676258992807, + "grad_norm": 0.052737631100398905, + "learning_rate": 9.989976939539687e-06, + "loss": 0.3118, + "step": 2756 + }, + { + "epoch": 3.966906474820144, + "grad_norm": 0.05642274570543027, + "learning_rate": 9.963422530479496e-06, + "loss": 0.3046, + "step": 2757 + }, + { + "epoch": 3.968345323741007, + "grad_norm": 0.053250375328443185, + "learning_rate": 9.936898438989507e-06, + "loss": 0.3016, + "step": 2758 + }, + { + "epoch": 3.9697841726618703, + "grad_norm": 0.05711820644911219, + "learning_rate": 9.910404691841915e-06, + "loss": 0.3117, + "step": 2759 + }, + { + "epoch": 3.971223021582734, + "grad_norm": 0.05621493149689219, + "learning_rate": 9.883941315778319e-06, + "loss": 0.3079, + "step": 2760 + }, + { + "epoch": 3.972661870503597, + "grad_norm": 0.05627457631006604, + "learning_rate": 9.857508337509692e-06, + "loss": 0.3143, + "step": 2761 + }, + { + "epoch": 3.9741007194244604, + "grad_norm": 0.0528755363079526, + "learning_rate": 9.831105783716266e-06, + "loss": 0.3088, + "step": 2762 + }, + { + "epoch": 3.9755395683453236, + "grad_norm": 0.05672953310062539, + "learning_rate": 9.8047336810476e-06, + "loss": 0.3107, + "step": 2763 + }, + { + "epoch": 3.976978417266187, + "grad_norm": 0.05792026261999815, + "learning_rate": 9.778392056122503e-06, + "loss": 0.3116, + "step": 2764 + }, + { + "epoch": 3.9784172661870505, + "grad_norm": 0.05859681948245761, + "learning_rate": 9.752080935529037e-06, + "loss": 0.3099, + "step": 2765 + }, + { + "epoch": 3.9798561151079137, + "grad_norm": 0.062351998100664094, + "learning_rate": 9.725800345824453e-06, + "loss": 0.3054, + "step": 2766 + }, + { + "epoch": 3.981294964028777, + "grad_norm": 0.05538121828358899, + "learning_rate": 9.699550313535196e-06, + "loss": 0.3097, + "step": 2767 + }, + { + "epoch": 3.98273381294964, + "grad_norm": 0.054767113499204136, + "learning_rate": 9.673330865156875e-06, + "loss": 0.3009, + "step": 2768 + }, + { + "epoch": 3.9841726618705033, + "grad_norm": 0.05460313364278765, + "learning_rate": 9.647142027154222e-06, + "loss": 0.3121, + "step": 2769 + }, + { + "epoch": 3.985611510791367, + "grad_norm": 0.057503736480089745, + "learning_rate": 9.620983825961078e-06, + "loss": 0.3081, + "step": 2770 + }, + { + "epoch": 3.98705035971223, + "grad_norm": 0.05405237866099866, + "learning_rate": 9.594856287980323e-06, + "loss": 0.3043, + "step": 2771 + }, + { + "epoch": 3.9884892086330934, + "grad_norm": 0.05758398953886903, + "learning_rate": 9.56875943958396e-06, + "loss": 0.305, + "step": 2772 + }, + { + "epoch": 3.989928057553957, + "grad_norm": 0.059091244036538114, + "learning_rate": 9.542693307112949e-06, + "loss": 0.3065, + "step": 2773 + }, + { + "epoch": 3.9913669064748203, + "grad_norm": 0.05447280953122306, + "learning_rate": 9.516657916877272e-06, + "loss": 0.3101, + "step": 2774 + }, + { + "epoch": 3.9928057553956835, + "grad_norm": 0.052766134508733596, + "learning_rate": 9.490653295155891e-06, + "loss": 0.3047, + "step": 2775 + }, + { + "epoch": 3.9942446043165467, + "grad_norm": 0.05510053829467285, + "learning_rate": 9.464679468196696e-06, + "loss": 0.3141, + "step": 2776 + }, + { + "epoch": 3.99568345323741, + "grad_norm": 0.05623421629121918, + "learning_rate": 9.438736462216496e-06, + "loss": 0.3088, + "step": 2777 + }, + { + "epoch": 3.9971223021582736, + "grad_norm": 0.05780685261699166, + "learning_rate": 9.412824303401003e-06, + "loss": 0.3022, + "step": 2778 + }, + { + "epoch": 3.998561151079137, + "grad_norm": 0.05594874455328415, + "learning_rate": 9.38694301790478e-06, + "loss": 0.3117, + "step": 2779 + }, + { + "epoch": 4.0, + "grad_norm": 0.05307323279651753, + "learning_rate": 9.361092631851228e-06, + "loss": 0.3, + "step": 2780 + }, + { + "epoch": 4.001438848920864, + "grad_norm": 0.1043212830115124, + "learning_rate": 9.335273171332581e-06, + "loss": 0.2854, + "step": 2781 + }, + { + "epoch": 4.002877697841726, + "grad_norm": 0.07730198498278092, + "learning_rate": 9.30948466240981e-06, + "loss": 0.2831, + "step": 2782 + }, + { + "epoch": 4.00431654676259, + "grad_norm": 0.08892434575632624, + "learning_rate": 9.2837271311127e-06, + "loss": 0.2884, + "step": 2783 + }, + { + "epoch": 4.005755395683453, + "grad_norm": 0.09610014701185844, + "learning_rate": 9.25800060343975e-06, + "loss": 0.2853, + "step": 2784 + }, + { + "epoch": 4.0071942446043165, + "grad_norm": 0.09566633437440125, + "learning_rate": 9.232305105358139e-06, + "loss": 0.2764, + "step": 2785 + }, + { + "epoch": 4.00863309352518, + "grad_norm": 0.09732839889203922, + "learning_rate": 9.206640662803746e-06, + "loss": 0.2841, + "step": 2786 + }, + { + "epoch": 4.010071942446043, + "grad_norm": 0.08107566020332119, + "learning_rate": 9.181007301681135e-06, + "loss": 0.2827, + "step": 2787 + }, + { + "epoch": 4.011510791366907, + "grad_norm": 0.09798451170105568, + "learning_rate": 9.155405047863439e-06, + "loss": 0.2901, + "step": 2788 + }, + { + "epoch": 4.012949640287769, + "grad_norm": 0.09339308886157084, + "learning_rate": 9.12983392719243e-06, + "loss": 0.2843, + "step": 2789 + }, + { + "epoch": 4.014388489208633, + "grad_norm": 0.07831082980908396, + "learning_rate": 9.104293965478446e-06, + "loss": 0.2975, + "step": 2790 + }, + { + "epoch": 4.015827338129497, + "grad_norm": 0.09726756244872185, + "learning_rate": 9.078785188500378e-06, + "loss": 0.2786, + "step": 2791 + }, + { + "epoch": 4.017266187050359, + "grad_norm": 0.0839026602171746, + "learning_rate": 9.053307622005639e-06, + "loss": 0.2891, + "step": 2792 + }, + { + "epoch": 4.018705035971223, + "grad_norm": 0.2569418270484783, + "learning_rate": 9.02786129171013e-06, + "loss": 0.2872, + "step": 2793 + }, + { + "epoch": 4.020143884892087, + "grad_norm": 0.09036941895856587, + "learning_rate": 9.002446223298244e-06, + "loss": 0.2899, + "step": 2794 + }, + { + "epoch": 4.0215827338129495, + "grad_norm": 0.07823056260316286, + "learning_rate": 8.977062442422796e-06, + "loss": 0.2825, + "step": 2795 + }, + { + "epoch": 4.023021582733813, + "grad_norm": 0.06939967074653287, + "learning_rate": 8.951709974705057e-06, + "loss": 0.2839, + "step": 2796 + }, + { + "epoch": 4.024460431654676, + "grad_norm": 0.07775161937964527, + "learning_rate": 8.926388845734624e-06, + "loss": 0.2867, + "step": 2797 + }, + { + "epoch": 4.02589928057554, + "grad_norm": 0.08344607559720935, + "learning_rate": 8.901099081069553e-06, + "loss": 0.2841, + "step": 2798 + }, + { + "epoch": 4.027338129496403, + "grad_norm": 0.0697908363212248, + "learning_rate": 8.875840706236163e-06, + "loss": 0.2797, + "step": 2799 + }, + { + "epoch": 4.028776978417266, + "grad_norm": 0.0702787799782245, + "learning_rate": 8.850613746729117e-06, + "loss": 0.2824, + "step": 2800 + }, + { + "epoch": 4.03021582733813, + "grad_norm": 0.0784344071210075, + "learning_rate": 8.825418228011413e-06, + "loss": 0.2858, + "step": 2801 + }, + { + "epoch": 4.031654676258992, + "grad_norm": 0.06910497452870903, + "learning_rate": 8.80025417551424e-06, + "loss": 0.2845, + "step": 2802 + }, + { + "epoch": 4.033093525179856, + "grad_norm": 0.0703298040829297, + "learning_rate": 8.775121614637064e-06, + "loss": 0.2899, + "step": 2803 + }, + { + "epoch": 4.03453237410072, + "grad_norm": 0.07461618628151065, + "learning_rate": 8.750020570747568e-06, + "loss": 0.2925, + "step": 2804 + }, + { + "epoch": 4.0359712230215825, + "grad_norm": 0.060595293834618015, + "learning_rate": 8.724951069181617e-06, + "loss": 0.2826, + "step": 2805 + }, + { + "epoch": 4.037410071942446, + "grad_norm": 0.05947476428356925, + "learning_rate": 8.699913135243237e-06, + "loss": 0.2854, + "step": 2806 + }, + { + "epoch": 4.038848920863309, + "grad_norm": 0.06970235397677162, + "learning_rate": 8.6749067942046e-06, + "loss": 0.289, + "step": 2807 + }, + { + "epoch": 4.040287769784173, + "grad_norm": 0.057161855218202745, + "learning_rate": 8.649932071305952e-06, + "loss": 0.2812, + "step": 2808 + }, + { + "epoch": 4.041726618705036, + "grad_norm": 0.056968773294844594, + "learning_rate": 8.624988991755687e-06, + "loss": 0.2892, + "step": 2809 + }, + { + "epoch": 4.043165467625899, + "grad_norm": 0.06523034947999166, + "learning_rate": 8.60007758073023e-06, + "loss": 0.2918, + "step": 2810 + }, + { + "epoch": 4.044604316546763, + "grad_norm": 0.05586819145618191, + "learning_rate": 8.575197863374006e-06, + "loss": 0.2866, + "step": 2811 + }, + { + "epoch": 4.046043165467626, + "grad_norm": 0.05690920989007164, + "learning_rate": 8.550349864799505e-06, + "loss": 0.2859, + "step": 2812 + }, + { + "epoch": 4.047482014388489, + "grad_norm": 0.06543859621756212, + "learning_rate": 8.525533610087193e-06, + "loss": 0.2774, + "step": 2813 + }, + { + "epoch": 4.048920863309353, + "grad_norm": 0.05824744569514629, + "learning_rate": 8.500749124285455e-06, + "loss": 0.285, + "step": 2814 + }, + { + "epoch": 4.0503597122302155, + "grad_norm": 0.05628826449906327, + "learning_rate": 8.475996432410642e-06, + "loss": 0.2843, + "step": 2815 + }, + { + "epoch": 4.051798561151079, + "grad_norm": 0.0619226003891041, + "learning_rate": 8.451275559447011e-06, + "loss": 0.2814, + "step": 2816 + }, + { + "epoch": 4.053237410071943, + "grad_norm": 0.062451691131779286, + "learning_rate": 8.426586530346705e-06, + "loss": 0.2873, + "step": 2817 + }, + { + "epoch": 4.054676258992806, + "grad_norm": 0.05631344243537601, + "learning_rate": 8.401929370029708e-06, + "loss": 0.2905, + "step": 2818 + }, + { + "epoch": 4.056115107913669, + "grad_norm": 0.0676675896500192, + "learning_rate": 8.377304103383857e-06, + "loss": 0.2824, + "step": 2819 + }, + { + "epoch": 4.057553956834532, + "grad_norm": 0.05470175927050552, + "learning_rate": 8.352710755264786e-06, + "loss": 0.2843, + "step": 2820 + }, + { + "epoch": 4.058992805755396, + "grad_norm": 0.05634470720435986, + "learning_rate": 8.328149350495916e-06, + "loss": 0.2859, + "step": 2821 + }, + { + "epoch": 4.060431654676259, + "grad_norm": 0.062579593233253, + "learning_rate": 8.303619913868427e-06, + "loss": 0.29, + "step": 2822 + }, + { + "epoch": 4.061870503597122, + "grad_norm": 0.054386224661288236, + "learning_rate": 8.279122470141208e-06, + "loss": 0.2776, + "step": 2823 + }, + { + "epoch": 4.063309352517986, + "grad_norm": 0.062338182507232595, + "learning_rate": 8.254657044040914e-06, + "loss": 0.2892, + "step": 2824 + }, + { + "epoch": 4.0647482014388485, + "grad_norm": 0.052951289601991906, + "learning_rate": 8.230223660261814e-06, + "loss": 0.2882, + "step": 2825 + }, + { + "epoch": 4.066187050359712, + "grad_norm": 0.06809591160216252, + "learning_rate": 8.205822343465865e-06, + "loss": 0.2906, + "step": 2826 + }, + { + "epoch": 4.067625899280576, + "grad_norm": 0.05262875248953221, + "learning_rate": 8.181453118282694e-06, + "loss": 0.282, + "step": 2827 + }, + { + "epoch": 4.069064748201439, + "grad_norm": 0.05940219092006468, + "learning_rate": 8.157116009309467e-06, + "loss": 0.2852, + "step": 2828 + }, + { + "epoch": 4.070503597122302, + "grad_norm": 0.057024370076768506, + "learning_rate": 8.132811041110976e-06, + "loss": 0.2928, + "step": 2829 + }, + { + "epoch": 4.071942446043165, + "grad_norm": 0.05326639214772234, + "learning_rate": 8.108538238219564e-06, + "loss": 0.2848, + "step": 2830 + }, + { + "epoch": 4.073381294964029, + "grad_norm": 0.053437726714684115, + "learning_rate": 8.084297625135104e-06, + "loss": 0.2895, + "step": 2831 + }, + { + "epoch": 4.074820143884892, + "grad_norm": 0.058604609107049246, + "learning_rate": 8.060089226324987e-06, + "loss": 0.2856, + "step": 2832 + }, + { + "epoch": 4.076258992805755, + "grad_norm": 0.053144560045417705, + "learning_rate": 8.035913066224088e-06, + "loss": 0.2802, + "step": 2833 + }, + { + "epoch": 4.077697841726619, + "grad_norm": 0.05393640789482759, + "learning_rate": 8.0117691692347e-06, + "loss": 0.2861, + "step": 2834 + }, + { + "epoch": 4.079136690647482, + "grad_norm": 0.0554108586723657, + "learning_rate": 7.987657559726628e-06, + "loss": 0.2834, + "step": 2835 + }, + { + "epoch": 4.080575539568345, + "grad_norm": 0.052701212312735835, + "learning_rate": 7.963578262037038e-06, + "loss": 0.2852, + "step": 2836 + }, + { + "epoch": 4.082014388489209, + "grad_norm": 0.05505255808236752, + "learning_rate": 7.939531300470458e-06, + "loss": 0.2876, + "step": 2837 + }, + { + "epoch": 4.083453237410072, + "grad_norm": 0.05295422349610247, + "learning_rate": 7.915516699298847e-06, + "loss": 0.296, + "step": 2838 + }, + { + "epoch": 4.084892086330935, + "grad_norm": 0.051567097545618855, + "learning_rate": 7.891534482761463e-06, + "loss": 0.2833, + "step": 2839 + }, + { + "epoch": 4.086330935251799, + "grad_norm": 0.05219702522484231, + "learning_rate": 7.867584675064846e-06, + "loss": 0.2864, + "step": 2840 + }, + { + "epoch": 4.087769784172662, + "grad_norm": 0.05179815914083395, + "learning_rate": 7.843667300382863e-06, + "loss": 0.2806, + "step": 2841 + }, + { + "epoch": 4.089208633093525, + "grad_norm": 0.05368160641813598, + "learning_rate": 7.81978238285667e-06, + "loss": 0.2848, + "step": 2842 + }, + { + "epoch": 4.090647482014388, + "grad_norm": 0.05810459252023169, + "learning_rate": 7.795929946594584e-06, + "loss": 0.2789, + "step": 2843 + }, + { + "epoch": 4.092086330935252, + "grad_norm": 0.050098165455327956, + "learning_rate": 7.772110015672209e-06, + "loss": 0.2793, + "step": 2844 + }, + { + "epoch": 4.093525179856115, + "grad_norm": 0.050937414080107296, + "learning_rate": 7.748322614132297e-06, + "loss": 0.2897, + "step": 2845 + }, + { + "epoch": 4.094964028776978, + "grad_norm": 0.05238091268905972, + "learning_rate": 7.72456776598479e-06, + "loss": 0.2835, + "step": 2846 + }, + { + "epoch": 4.096402877697842, + "grad_norm": 0.05434773894332223, + "learning_rate": 7.70084549520676e-06, + "loss": 0.2825, + "step": 2847 + }, + { + "epoch": 4.097841726618705, + "grad_norm": 0.05174764145307135, + "learning_rate": 7.6771558257424e-06, + "loss": 0.2854, + "step": 2848 + }, + { + "epoch": 4.099280575539568, + "grad_norm": 0.057151568563367755, + "learning_rate": 7.653498781502997e-06, + "loss": 0.2813, + "step": 2849 + }, + { + "epoch": 4.100719424460432, + "grad_norm": 0.04875085068136776, + "learning_rate": 7.629874386366918e-06, + "loss": 0.2777, + "step": 2850 + }, + { + "epoch": 4.102158273381295, + "grad_norm": 0.05195638644729961, + "learning_rate": 7.606282664179545e-06, + "loss": 0.2883, + "step": 2851 + }, + { + "epoch": 4.103597122302158, + "grad_norm": 0.058528895470629484, + "learning_rate": 7.5827236387532976e-06, + "loss": 0.2874, + "step": 2852 + }, + { + "epoch": 4.105035971223022, + "grad_norm": 0.05478092147502408, + "learning_rate": 7.559197333867629e-06, + "loss": 0.274, + "step": 2853 + }, + { + "epoch": 4.106474820143885, + "grad_norm": 0.05885883121797831, + "learning_rate": 7.53570377326891e-06, + "loss": 0.2886, + "step": 2854 + }, + { + "epoch": 4.107913669064748, + "grad_norm": 0.05378570314666407, + "learning_rate": 7.512242980670481e-06, + "loss": 0.2856, + "step": 2855 + }, + { + "epoch": 4.109352517985611, + "grad_norm": 0.05047021641202359, + "learning_rate": 7.488814979752615e-06, + "loss": 0.2835, + "step": 2856 + }, + { + "epoch": 4.110791366906475, + "grad_norm": 0.050547742980969305, + "learning_rate": 7.465419794162487e-06, + "loss": 0.2867, + "step": 2857 + }, + { + "epoch": 4.1122302158273385, + "grad_norm": 0.0490690485972627, + "learning_rate": 7.442057447514144e-06, + "loss": 0.2797, + "step": 2858 + }, + { + "epoch": 4.113669064748201, + "grad_norm": 0.05206071276401406, + "learning_rate": 7.418727963388481e-06, + "loss": 0.2942, + "step": 2859 + }, + { + "epoch": 4.115107913669065, + "grad_norm": 0.04900725135791448, + "learning_rate": 7.395431365333241e-06, + "loss": 0.2897, + "step": 2860 + }, + { + "epoch": 4.116546762589928, + "grad_norm": 0.04918959977849764, + "learning_rate": 7.372167676862952e-06, + "loss": 0.2752, + "step": 2861 + }, + { + "epoch": 4.117985611510791, + "grad_norm": 0.05385482651396476, + "learning_rate": 7.348936921458949e-06, + "loss": 0.2832, + "step": 2862 + }, + { + "epoch": 4.119424460431655, + "grad_norm": 0.05256305439467217, + "learning_rate": 7.325739122569282e-06, + "loss": 0.2877, + "step": 2863 + }, + { + "epoch": 4.120863309352518, + "grad_norm": 0.05078066127205468, + "learning_rate": 7.302574303608794e-06, + "loss": 0.2825, + "step": 2864 + }, + { + "epoch": 4.122302158273381, + "grad_norm": 0.05145114814377242, + "learning_rate": 7.279442487959012e-06, + "loss": 0.2796, + "step": 2865 + }, + { + "epoch": 4.123741007194244, + "grad_norm": 0.05472436663389367, + "learning_rate": 7.256343698968131e-06, + "loss": 0.295, + "step": 2866 + }, + { + "epoch": 4.125179856115108, + "grad_norm": 0.054104706784023536, + "learning_rate": 7.233277959951026e-06, + "loss": 0.2896, + "step": 2867 + }, + { + "epoch": 4.1266187050359715, + "grad_norm": 0.05674997857150882, + "learning_rate": 7.210245294189251e-06, + "loss": 0.2823, + "step": 2868 + }, + { + "epoch": 4.128057553956834, + "grad_norm": 0.04874500387689634, + "learning_rate": 7.187245724930911e-06, + "loss": 0.2918, + "step": 2869 + }, + { + "epoch": 4.129496402877698, + "grad_norm": 0.04619627546770947, + "learning_rate": 7.164279275390749e-06, + "loss": 0.282, + "step": 2870 + }, + { + "epoch": 4.130935251798562, + "grad_norm": 0.049751766918390714, + "learning_rate": 7.14134596875006e-06, + "loss": 0.2848, + "step": 2871 + }, + { + "epoch": 4.132374100719424, + "grad_norm": 0.05350054049527425, + "learning_rate": 7.118445828156697e-06, + "loss": 0.282, + "step": 2872 + }, + { + "epoch": 4.133812949640288, + "grad_norm": 0.04690832284051088, + "learning_rate": 7.0955788767250334e-06, + "loss": 0.2863, + "step": 2873 + }, + { + "epoch": 4.135251798561151, + "grad_norm": 0.04754333086734813, + "learning_rate": 7.0727451375359345e-06, + "loss": 0.2879, + "step": 2874 + }, + { + "epoch": 4.136690647482014, + "grad_norm": 0.05222521043696767, + "learning_rate": 7.049944633636756e-06, + "loss": 0.2887, + "step": 2875 + }, + { + "epoch": 4.138129496402878, + "grad_norm": 0.049879256026004834, + "learning_rate": 7.027177388041311e-06, + "loss": 0.2852, + "step": 2876 + }, + { + "epoch": 4.139568345323741, + "grad_norm": 0.049328534052848386, + "learning_rate": 7.004443423729808e-06, + "loss": 0.2886, + "step": 2877 + }, + { + "epoch": 4.1410071942446045, + "grad_norm": 0.05023564759363154, + "learning_rate": 6.981742763648891e-06, + "loss": 0.2866, + "step": 2878 + }, + { + "epoch": 4.142446043165467, + "grad_norm": 0.20239413390092711, + "learning_rate": 6.959075430711614e-06, + "loss": 0.2971, + "step": 2879 + }, + { + "epoch": 4.143884892086331, + "grad_norm": 0.05076010444591421, + "learning_rate": 6.936441447797335e-06, + "loss": 0.2866, + "step": 2880 + }, + { + "epoch": 4.145323741007195, + "grad_norm": 0.049379185923188657, + "learning_rate": 6.913840837751778e-06, + "loss": 0.2833, + "step": 2881 + }, + { + "epoch": 4.146762589928057, + "grad_norm": 0.0529015164113446, + "learning_rate": 6.8912736233870095e-06, + "loss": 0.2891, + "step": 2882 + }, + { + "epoch": 4.148201438848921, + "grad_norm": 0.04926536197741727, + "learning_rate": 6.868739827481335e-06, + "loss": 0.2782, + "step": 2883 + }, + { + "epoch": 4.149640287769784, + "grad_norm": 0.04723571998825495, + "learning_rate": 6.846239472779359e-06, + "loss": 0.2848, + "step": 2884 + }, + { + "epoch": 4.151079136690647, + "grad_norm": 0.05075196787398027, + "learning_rate": 6.82377258199193e-06, + "loss": 0.2809, + "step": 2885 + }, + { + "epoch": 4.152517985611511, + "grad_norm": 0.05010532301754759, + "learning_rate": 6.80133917779612e-06, + "loss": 0.2842, + "step": 2886 + }, + { + "epoch": 4.153956834532374, + "grad_norm": 0.05281873962011146, + "learning_rate": 6.778939282835195e-06, + "loss": 0.2861, + "step": 2887 + }, + { + "epoch": 4.1553956834532375, + "grad_norm": 0.049782188980158876, + "learning_rate": 6.756572919718611e-06, + "loss": 0.2875, + "step": 2888 + }, + { + "epoch": 4.1568345323741, + "grad_norm": 0.055433584840673934, + "learning_rate": 6.734240111021937e-06, + "loss": 0.2891, + "step": 2889 + }, + { + "epoch": 4.158273381294964, + "grad_norm": 0.04956071896209945, + "learning_rate": 6.711940879286944e-06, + "loss": 0.2807, + "step": 2890 + }, + { + "epoch": 4.159712230215828, + "grad_norm": 0.05422120966942815, + "learning_rate": 6.689675247021461e-06, + "loss": 0.29, + "step": 2891 + }, + { + "epoch": 4.16115107913669, + "grad_norm": 0.05157553339280195, + "learning_rate": 6.667443236699398e-06, + "loss": 0.2847, + "step": 2892 + }, + { + "epoch": 4.162589928057554, + "grad_norm": 0.04944057862866461, + "learning_rate": 6.64524487076077e-06, + "loss": 0.293, + "step": 2893 + }, + { + "epoch": 4.164028776978418, + "grad_norm": 0.05023221501561263, + "learning_rate": 6.623080171611605e-06, + "loss": 0.2794, + "step": 2894 + }, + { + "epoch": 4.16546762589928, + "grad_norm": 0.05555620939833201, + "learning_rate": 6.600949161623939e-06, + "loss": 0.2769, + "step": 2895 + }, + { + "epoch": 4.166906474820144, + "grad_norm": 0.053803815152891114, + "learning_rate": 6.578851863135831e-06, + "loss": 0.2832, + "step": 2896 + }, + { + "epoch": 4.168345323741007, + "grad_norm": 0.047906703153829215, + "learning_rate": 6.556788298451291e-06, + "loss": 0.2848, + "step": 2897 + }, + { + "epoch": 4.1697841726618705, + "grad_norm": 0.056442437064043754, + "learning_rate": 6.534758489840296e-06, + "loss": 0.2852, + "step": 2898 + }, + { + "epoch": 4.171223021582734, + "grad_norm": 0.053616468579709003, + "learning_rate": 6.512762459538744e-06, + "loss": 0.2905, + "step": 2899 + }, + { + "epoch": 4.172661870503597, + "grad_norm": 0.04887273018888442, + "learning_rate": 6.49080022974843e-06, + "loss": 0.2821, + "step": 2900 + }, + { + "epoch": 4.174100719424461, + "grad_norm": 0.05351889400146498, + "learning_rate": 6.468871822637051e-06, + "loss": 0.2819, + "step": 2901 + }, + { + "epoch": 4.175539568345323, + "grad_norm": 0.05547256928044035, + "learning_rate": 6.446977260338152e-06, + "loss": 0.2887, + "step": 2902 + }, + { + "epoch": 4.176978417266187, + "grad_norm": 0.05084436131281766, + "learning_rate": 6.425116564951115e-06, + "loss": 0.2866, + "step": 2903 + }, + { + "epoch": 4.178417266187051, + "grad_norm": 0.052107981016798956, + "learning_rate": 6.403289758541143e-06, + "loss": 0.2891, + "step": 2904 + }, + { + "epoch": 4.179856115107913, + "grad_norm": 0.05355343441284484, + "learning_rate": 6.381496863139247e-06, + "loss": 0.2835, + "step": 2905 + }, + { + "epoch": 4.181294964028777, + "grad_norm": 0.049893232291485824, + "learning_rate": 6.3597379007421755e-06, + "loss": 0.2878, + "step": 2906 + }, + { + "epoch": 4.18273381294964, + "grad_norm": 0.07239619223915701, + "learning_rate": 6.338012893312444e-06, + "loss": 0.2949, + "step": 2907 + }, + { + "epoch": 4.1841726618705035, + "grad_norm": 0.04993222977179514, + "learning_rate": 6.31632186277833e-06, + "loss": 0.2844, + "step": 2908 + }, + { + "epoch": 4.185611510791367, + "grad_norm": 0.05044234362148215, + "learning_rate": 6.294664831033746e-06, + "loss": 0.2951, + "step": 2909 + }, + { + "epoch": 4.18705035971223, + "grad_norm": 0.048300204379515224, + "learning_rate": 6.273041819938343e-06, + "loss": 0.2876, + "step": 2910 + }, + { + "epoch": 4.188489208633094, + "grad_norm": 0.047482360054201984, + "learning_rate": 6.251452851317421e-06, + "loss": 0.2958, + "step": 2911 + }, + { + "epoch": 4.189928057553957, + "grad_norm": 0.047487943147196676, + "learning_rate": 6.229897946961903e-06, + "loss": 0.2908, + "step": 2912 + }, + { + "epoch": 4.19136690647482, + "grad_norm": 0.050600528349371905, + "learning_rate": 6.20837712862834e-06, + "loss": 0.2834, + "step": 2913 + }, + { + "epoch": 4.192805755395684, + "grad_norm": 0.05127597982053406, + "learning_rate": 6.186890418038887e-06, + "loss": 0.2929, + "step": 2914 + }, + { + "epoch": 4.194244604316546, + "grad_norm": 0.050312325645171053, + "learning_rate": 6.165437836881256e-06, + "loss": 0.2868, + "step": 2915 + }, + { + "epoch": 4.19568345323741, + "grad_norm": 0.049959117057170654, + "learning_rate": 6.144019406808724e-06, + "loss": 0.2888, + "step": 2916 + }, + { + "epoch": 4.197122302158274, + "grad_norm": 0.05182875516760443, + "learning_rate": 6.122635149440093e-06, + "loss": 0.2814, + "step": 2917 + }, + { + "epoch": 4.1985611510791365, + "grad_norm": 0.05272581281709548, + "learning_rate": 6.101285086359645e-06, + "loss": 0.2988, + "step": 2918 + }, + { + "epoch": 4.2, + "grad_norm": 0.048477762613489274, + "learning_rate": 6.079969239117201e-06, + "loss": 0.286, + "step": 2919 + }, + { + "epoch": 4.201438848920863, + "grad_norm": 0.048996323200934035, + "learning_rate": 6.05868762922802e-06, + "loss": 0.2876, + "step": 2920 + }, + { + "epoch": 4.202877697841727, + "grad_norm": 0.04875957798976624, + "learning_rate": 6.037440278172782e-06, + "loss": 0.289, + "step": 2921 + }, + { + "epoch": 4.20431654676259, + "grad_norm": 0.04765756332215801, + "learning_rate": 6.016227207397616e-06, + "loss": 0.2815, + "step": 2922 + }, + { + "epoch": 4.205755395683453, + "grad_norm": 0.0497126707077397, + "learning_rate": 5.995048438314044e-06, + "loss": 0.2791, + "step": 2923 + }, + { + "epoch": 4.207194244604317, + "grad_norm": 0.04653946342028209, + "learning_rate": 5.973903992298962e-06, + "loss": 0.2875, + "step": 2924 + }, + { + "epoch": 4.2086330935251794, + "grad_norm": 0.04924042551777451, + "learning_rate": 5.952793890694617e-06, + "loss": 0.2862, + "step": 2925 + }, + { + "epoch": 4.210071942446043, + "grad_norm": 0.051758404134836025, + "learning_rate": 5.9317181548086055e-06, + "loss": 0.2816, + "step": 2926 + }, + { + "epoch": 4.211510791366907, + "grad_norm": 0.047704734972427824, + "learning_rate": 5.910676805913822e-06, + "loss": 0.294, + "step": 2927 + }, + { + "epoch": 4.2129496402877695, + "grad_norm": 0.05089276802228076, + "learning_rate": 5.889669865248455e-06, + "loss": 0.2869, + "step": 2928 + }, + { + "epoch": 4.214388489208633, + "grad_norm": 0.047597601617219244, + "learning_rate": 5.8686973540159706e-06, + "loss": 0.2882, + "step": 2929 + }, + { + "epoch": 4.215827338129497, + "grad_norm": 0.04924280137176742, + "learning_rate": 5.847759293385075e-06, + "loss": 0.2842, + "step": 2930 + }, + { + "epoch": 4.21726618705036, + "grad_norm": 0.04903403474162036, + "learning_rate": 5.8268557044897175e-06, + "loss": 0.2924, + "step": 2931 + }, + { + "epoch": 4.218705035971223, + "grad_norm": 0.04818439389958206, + "learning_rate": 5.805986608429019e-06, + "loss": 0.2897, + "step": 2932 + }, + { + "epoch": 4.220143884892086, + "grad_norm": 0.04581373055337451, + "learning_rate": 5.785152026267309e-06, + "loss": 0.2798, + "step": 2933 + }, + { + "epoch": 4.22158273381295, + "grad_norm": 0.048126025527469155, + "learning_rate": 5.764351979034102e-06, + "loss": 0.2807, + "step": 2934 + }, + { + "epoch": 4.223021582733813, + "grad_norm": 0.04746825529325539, + "learning_rate": 5.743586487724e-06, + "loss": 0.2832, + "step": 2935 + }, + { + "epoch": 4.224460431654676, + "grad_norm": 0.05151018633435883, + "learning_rate": 5.722855573296775e-06, + "loss": 0.2869, + "step": 2936 + }, + { + "epoch": 4.22589928057554, + "grad_norm": 0.04761838167446043, + "learning_rate": 5.702159256677266e-06, + "loss": 0.28, + "step": 2937 + }, + { + "epoch": 4.2273381294964025, + "grad_norm": 0.05200047832651735, + "learning_rate": 5.681497558755417e-06, + "loss": 0.2941, + "step": 2938 + }, + { + "epoch": 4.228776978417266, + "grad_norm": 0.04740391127957364, + "learning_rate": 5.6608705003862085e-06, + "loss": 0.2791, + "step": 2939 + }, + { + "epoch": 4.23021582733813, + "grad_norm": 0.048150554546777156, + "learning_rate": 5.6402781023896695e-06, + "loss": 0.2857, + "step": 2940 + }, + { + "epoch": 4.231654676258993, + "grad_norm": 0.046799705680378056, + "learning_rate": 5.619720385550835e-06, + "loss": 0.2853, + "step": 2941 + }, + { + "epoch": 4.233093525179856, + "grad_norm": 0.04725239128908153, + "learning_rate": 5.5991973706197445e-06, + "loss": 0.2836, + "step": 2942 + }, + { + "epoch": 4.234532374100719, + "grad_norm": 0.044872887371123056, + "learning_rate": 5.578709078311417e-06, + "loss": 0.2863, + "step": 2943 + }, + { + "epoch": 4.235971223021583, + "grad_norm": 0.05015078020635212, + "learning_rate": 5.558255529305779e-06, + "loss": 0.2843, + "step": 2944 + }, + { + "epoch": 4.237410071942446, + "grad_norm": 0.051963637365920454, + "learning_rate": 5.537836744247753e-06, + "loss": 0.2859, + "step": 2945 + }, + { + "epoch": 4.238848920863309, + "grad_norm": 0.04704737505765592, + "learning_rate": 5.517452743747145e-06, + "loss": 0.2848, + "step": 2946 + }, + { + "epoch": 4.240287769784173, + "grad_norm": 0.04987546625046638, + "learning_rate": 5.497103548378628e-06, + "loss": 0.2858, + "step": 2947 + }, + { + "epoch": 4.2417266187050355, + "grad_norm": 0.04864707037982135, + "learning_rate": 5.476789178681769e-06, + "loss": 0.2813, + "step": 2948 + }, + { + "epoch": 4.243165467625899, + "grad_norm": 0.05300509050782678, + "learning_rate": 5.456509655160989e-06, + "loss": 0.284, + "step": 2949 + }, + { + "epoch": 4.244604316546763, + "grad_norm": 0.04704427554950865, + "learning_rate": 5.436264998285516e-06, + "loss": 0.2902, + "step": 2950 + }, + { + "epoch": 4.246043165467626, + "grad_norm": 0.04890309795539629, + "learning_rate": 5.4160552284894075e-06, + "loss": 0.2958, + "step": 2951 + }, + { + "epoch": 4.247482014388489, + "grad_norm": 0.049436994603225766, + "learning_rate": 5.3958803661714865e-06, + "loss": 0.2792, + "step": 2952 + }, + { + "epoch": 4.248920863309353, + "grad_norm": 0.05064437091392666, + "learning_rate": 5.375740431695353e-06, + "loss": 0.2879, + "step": 2953 + }, + { + "epoch": 4.250359712230216, + "grad_norm": 0.24718137686965794, + "learning_rate": 5.355635445389355e-06, + "loss": 0.3011, + "step": 2954 + }, + { + "epoch": 4.251798561151079, + "grad_norm": 0.051772752251279835, + "learning_rate": 5.3355654275465584e-06, + "loss": 0.2924, + "step": 2955 + }, + { + "epoch": 4.253237410071942, + "grad_norm": 0.05156888608226264, + "learning_rate": 5.315530398424735e-06, + "loss": 0.2843, + "step": 2956 + }, + { + "epoch": 4.254676258992806, + "grad_norm": 0.049485260469823086, + "learning_rate": 5.295530378246354e-06, + "loss": 0.2913, + "step": 2957 + }, + { + "epoch": 4.256115107913669, + "grad_norm": 0.04677394192798888, + "learning_rate": 5.27556538719852e-06, + "loss": 0.2944, + "step": 2958 + }, + { + "epoch": 4.257553956834532, + "grad_norm": 0.05233186413123022, + "learning_rate": 5.2556354454329895e-06, + "loss": 0.2794, + "step": 2959 + }, + { + "epoch": 4.258992805755396, + "grad_norm": 0.04930931732994099, + "learning_rate": 5.235740573066186e-06, + "loss": 0.2779, + "step": 2960 + }, + { + "epoch": 4.260431654676259, + "grad_norm": 0.049213575603995675, + "learning_rate": 5.21588079017906e-06, + "loss": 0.2845, + "step": 2961 + }, + { + "epoch": 4.261870503597122, + "grad_norm": 0.04659787031115534, + "learning_rate": 5.196056116817194e-06, + "loss": 0.2837, + "step": 2962 + }, + { + "epoch": 4.263309352517986, + "grad_norm": 0.05293959590454975, + "learning_rate": 5.1762665729907424e-06, + "loss": 0.2851, + "step": 2963 + }, + { + "epoch": 4.264748201438849, + "grad_norm": 0.04998528391845439, + "learning_rate": 5.156512178674358e-06, + "loss": 0.2863, + "step": 2964 + }, + { + "epoch": 4.266187050359712, + "grad_norm": 0.047380819754018616, + "learning_rate": 5.136792953807242e-06, + "loss": 0.2786, + "step": 2965 + }, + { + "epoch": 4.267625899280576, + "grad_norm": 0.05463844211593878, + "learning_rate": 5.117108918293095e-06, + "loss": 0.282, + "step": 2966 + }, + { + "epoch": 4.269064748201439, + "grad_norm": 0.047928910779058316, + "learning_rate": 5.097460092000095e-06, + "loss": 0.2849, + "step": 2967 + }, + { + "epoch": 4.270503597122302, + "grad_norm": 0.047804611394234274, + "learning_rate": 5.07784649476089e-06, + "loss": 0.2804, + "step": 2968 + }, + { + "epoch": 4.271942446043165, + "grad_norm": 0.0529496731240823, + "learning_rate": 5.058268146372562e-06, + "loss": 0.2933, + "step": 2969 + }, + { + "epoch": 4.273381294964029, + "grad_norm": 0.048064374075540754, + "learning_rate": 5.038725066596595e-06, + "loss": 0.2856, + "step": 2970 + }, + { + "epoch": 4.274820143884892, + "grad_norm": 0.054688345166117924, + "learning_rate": 5.019217275158923e-06, + "loss": 0.2903, + "step": 2971 + }, + { + "epoch": 4.276258992805755, + "grad_norm": 0.05497703483626272, + "learning_rate": 4.9997447917498276e-06, + "loss": 0.2852, + "step": 2972 + }, + { + "epoch": 4.277697841726619, + "grad_norm": 0.050839990022528606, + "learning_rate": 4.9803076360239335e-06, + "loss": 0.2816, + "step": 2973 + }, + { + "epoch": 4.279136690647482, + "grad_norm": 0.048344516305128385, + "learning_rate": 4.960905827600266e-06, + "loss": 0.2862, + "step": 2974 + }, + { + "epoch": 4.280575539568345, + "grad_norm": 0.05405339556587912, + "learning_rate": 4.941539386062113e-06, + "loss": 0.2904, + "step": 2975 + }, + { + "epoch": 4.282014388489209, + "grad_norm": 0.051578164450635594, + "learning_rate": 4.922208330957094e-06, + "loss": 0.2865, + "step": 2976 + }, + { + "epoch": 4.283453237410072, + "grad_norm": 0.04809405998317956, + "learning_rate": 4.902912681797114e-06, + "loss": 0.2876, + "step": 2977 + }, + { + "epoch": 4.284892086330935, + "grad_norm": 0.04874372234845439, + "learning_rate": 4.88365245805833e-06, + "loss": 0.2881, + "step": 2978 + }, + { + "epoch": 4.286330935251798, + "grad_norm": 0.04939751644300752, + "learning_rate": 4.864427679181143e-06, + "loss": 0.2755, + "step": 2979 + }, + { + "epoch": 4.287769784172662, + "grad_norm": 0.05277997570760775, + "learning_rate": 4.8452383645701815e-06, + "loss": 0.2828, + "step": 2980 + }, + { + "epoch": 4.2892086330935255, + "grad_norm": 0.04706576694360547, + "learning_rate": 4.826084533594277e-06, + "loss": 0.2818, + "step": 2981 + }, + { + "epoch": 4.290647482014388, + "grad_norm": 0.048706280989824775, + "learning_rate": 4.806966205586441e-06, + "loss": 0.2825, + "step": 2982 + }, + { + "epoch": 4.292086330935252, + "grad_norm": 0.04804532513842935, + "learning_rate": 4.787883399843871e-06, + "loss": 0.291, + "step": 2983 + }, + { + "epoch": 4.293525179856115, + "grad_norm": 0.045865371855854924, + "learning_rate": 4.768836135627859e-06, + "loss": 0.2872, + "step": 2984 + }, + { + "epoch": 4.294964028776978, + "grad_norm": 0.047470561617020374, + "learning_rate": 4.749824432163888e-06, + "loss": 0.2849, + "step": 2985 + }, + { + "epoch": 4.296402877697842, + "grad_norm": 0.04793577633768564, + "learning_rate": 4.730848308641509e-06, + "loss": 0.2793, + "step": 2986 + }, + { + "epoch": 4.297841726618705, + "grad_norm": 0.045949851344600036, + "learning_rate": 4.711907784214358e-06, + "loss": 0.2862, + "step": 2987 + }, + { + "epoch": 4.299280575539568, + "grad_norm": 0.046652218656282315, + "learning_rate": 4.693002878000146e-06, + "loss": 0.2819, + "step": 2988 + }, + { + "epoch": 4.300719424460432, + "grad_norm": 0.04750890426157474, + "learning_rate": 4.674133609080658e-06, + "loss": 0.2912, + "step": 2989 + }, + { + "epoch": 4.302158273381295, + "grad_norm": 0.045998792865143925, + "learning_rate": 4.6552999965016634e-06, + "loss": 0.2843, + "step": 2990 + }, + { + "epoch": 4.3035971223021585, + "grad_norm": 0.045687884736637555, + "learning_rate": 4.6365020592729694e-06, + "loss": 0.2889, + "step": 2991 + }, + { + "epoch": 4.305035971223021, + "grad_norm": 0.05166995591409614, + "learning_rate": 4.617739816368367e-06, + "loss": 0.2857, + "step": 2992 + }, + { + "epoch": 4.306474820143885, + "grad_norm": 0.048667639183769144, + "learning_rate": 4.599013286725624e-06, + "loss": 0.2878, + "step": 2993 + }, + { + "epoch": 4.307913669064749, + "grad_norm": 0.04707175291891023, + "learning_rate": 4.580322489246456e-06, + "loss": 0.2795, + "step": 2994 + }, + { + "epoch": 4.309352517985611, + "grad_norm": 0.04526119084528856, + "learning_rate": 4.5616674427965135e-06, + "loss": 0.286, + "step": 2995 + }, + { + "epoch": 4.310791366906475, + "grad_norm": 0.047858303369376694, + "learning_rate": 4.543048166205357e-06, + "loss": 0.289, + "step": 2996 + }, + { + "epoch": 4.312230215827338, + "grad_norm": 0.04558739027643893, + "learning_rate": 4.524464678266452e-06, + "loss": 0.2821, + "step": 2997 + }, + { + "epoch": 4.313669064748201, + "grad_norm": 0.0481016114455754, + "learning_rate": 4.505916997737143e-06, + "loss": 0.2831, + "step": 2998 + }, + { + "epoch": 4.315107913669065, + "grad_norm": 0.045722359491594936, + "learning_rate": 4.487405143338599e-06, + "loss": 0.2859, + "step": 2999 + }, + { + "epoch": 4.316546762589928, + "grad_norm": 0.0470161732124256, + "learning_rate": 4.468929133755881e-06, + "loss": 0.2812, + "step": 3000 + }, + { + "epoch": 4.3179856115107915, + "grad_norm": 0.04591954625816037, + "learning_rate": 4.450488987637824e-06, + "loss": 0.2886, + "step": 3001 + }, + { + "epoch": 4.319424460431654, + "grad_norm": 0.04657325914607652, + "learning_rate": 4.43208472359709e-06, + "loss": 0.2914, + "step": 3002 + }, + { + "epoch": 4.320863309352518, + "grad_norm": 0.04707613298814469, + "learning_rate": 4.4137163602101114e-06, + "loss": 0.293, + "step": 3003 + }, + { + "epoch": 4.322302158273382, + "grad_norm": 0.04785380336076747, + "learning_rate": 4.3953839160170906e-06, + "loss": 0.2857, + "step": 3004 + }, + { + "epoch": 4.323741007194244, + "grad_norm": 0.04516417550977605, + "learning_rate": 4.377087409521972e-06, + "loss": 0.2886, + "step": 3005 + }, + { + "epoch": 4.325179856115108, + "grad_norm": 0.05188756071690509, + "learning_rate": 4.358826859192422e-06, + "loss": 0.2795, + "step": 3006 + }, + { + "epoch": 4.326618705035971, + "grad_norm": 0.05010143541911294, + "learning_rate": 4.340602283459827e-06, + "loss": 0.2902, + "step": 3007 + }, + { + "epoch": 4.3280575539568344, + "grad_norm": 0.045632732949339, + "learning_rate": 4.322413700719246e-06, + "loss": 0.2881, + "step": 3008 + }, + { + "epoch": 4.329496402877698, + "grad_norm": 0.047000573332118295, + "learning_rate": 4.3042611293294276e-06, + "loss": 0.2845, + "step": 3009 + }, + { + "epoch": 4.330935251798561, + "grad_norm": 0.04966870119308216, + "learning_rate": 4.28614458761274e-06, + "loss": 0.2879, + "step": 3010 + }, + { + "epoch": 4.3323741007194245, + "grad_norm": 0.047705707552179094, + "learning_rate": 4.2680640938552245e-06, + "loss": 0.2788, + "step": 3011 + }, + { + "epoch": 4.333812949640288, + "grad_norm": 0.045191298550149736, + "learning_rate": 4.250019666306515e-06, + "loss": 0.2784, + "step": 3012 + }, + { + "epoch": 4.335251798561151, + "grad_norm": 0.049970789129198244, + "learning_rate": 4.232011323179839e-06, + "loss": 0.2856, + "step": 3013 + }, + { + "epoch": 4.336690647482015, + "grad_norm": 0.04438060522753159, + "learning_rate": 4.214039082652002e-06, + "loss": 0.29, + "step": 3014 + }, + { + "epoch": 4.338129496402877, + "grad_norm": 0.044969327942639095, + "learning_rate": 4.1961029628634e-06, + "loss": 0.2829, + "step": 3015 + }, + { + "epoch": 4.339568345323741, + "grad_norm": 0.04651094800999467, + "learning_rate": 4.17820298191792e-06, + "loss": 0.2822, + "step": 3016 + }, + { + "epoch": 4.341007194244605, + "grad_norm": 0.0448637402914183, + "learning_rate": 4.160339157883e-06, + "loss": 0.2852, + "step": 3017 + }, + { + "epoch": 4.3424460431654675, + "grad_norm": 0.045239343753276685, + "learning_rate": 4.142511508789606e-06, + "loss": 0.2851, + "step": 3018 + }, + { + "epoch": 4.343884892086331, + "grad_norm": 0.04649656395277292, + "learning_rate": 4.1247200526321364e-06, + "loss": 0.2913, + "step": 3019 + }, + { + "epoch": 4.345323741007194, + "grad_norm": 0.04437618628613109, + "learning_rate": 4.106964807368496e-06, + "loss": 0.2888, + "step": 3020 + }, + { + "epoch": 4.3467625899280575, + "grad_norm": 0.04763457548306963, + "learning_rate": 4.089245790920031e-06, + "loss": 0.2854, + "step": 3021 + }, + { + "epoch": 4.348201438848921, + "grad_norm": 0.047633259428891156, + "learning_rate": 4.071563021171523e-06, + "loss": 0.284, + "step": 3022 + }, + { + "epoch": 4.349640287769784, + "grad_norm": 0.04789058302376588, + "learning_rate": 4.0539165159711615e-06, + "loss": 0.2882, + "step": 3023 + }, + { + "epoch": 4.351079136690648, + "grad_norm": 0.04800378524550687, + "learning_rate": 4.036306293130543e-06, + "loss": 0.2865, + "step": 3024 + }, + { + "epoch": 4.35251798561151, + "grad_norm": 0.04766128596285838, + "learning_rate": 4.01873237042461e-06, + "loss": 0.2825, + "step": 3025 + }, + { + "epoch": 4.353956834532374, + "grad_norm": 0.04679739443368158, + "learning_rate": 4.001194765591723e-06, + "loss": 0.2801, + "step": 3026 + }, + { + "epoch": 4.355395683453238, + "grad_norm": 0.04681997545874503, + "learning_rate": 3.983693496333522e-06, + "loss": 0.2831, + "step": 3027 + }, + { + "epoch": 4.3568345323741005, + "grad_norm": 0.04428659452277041, + "learning_rate": 3.966228580315017e-06, + "loss": 0.2835, + "step": 3028 + }, + { + "epoch": 4.358273381294964, + "grad_norm": 0.046913444364728246, + "learning_rate": 3.9488000351645036e-06, + "loss": 0.2817, + "step": 3029 + }, + { + "epoch": 4.359712230215827, + "grad_norm": 0.04959811180884263, + "learning_rate": 3.931407878473575e-06, + "loss": 0.2864, + "step": 3030 + }, + { + "epoch": 4.3611510791366905, + "grad_norm": 0.04734009750172826, + "learning_rate": 3.914052127797088e-06, + "loss": 0.2829, + "step": 3031 + }, + { + "epoch": 4.362589928057554, + "grad_norm": 0.045639018353351817, + "learning_rate": 3.8967328006531605e-06, + "loss": 0.2849, + "step": 3032 + }, + { + "epoch": 4.364028776978417, + "grad_norm": 0.04457688635649878, + "learning_rate": 3.879449914523137e-06, + "loss": 0.284, + "step": 3033 + }, + { + "epoch": 4.365467625899281, + "grad_norm": 0.04532307133138978, + "learning_rate": 3.862203486851588e-06, + "loss": 0.2829, + "step": 3034 + }, + { + "epoch": 4.366906474820144, + "grad_norm": 0.04462611668844534, + "learning_rate": 3.844993535046291e-06, + "loss": 0.2845, + "step": 3035 + }, + { + "epoch": 4.368345323741007, + "grad_norm": 0.04572426626114917, + "learning_rate": 3.8278200764781725e-06, + "loss": 0.2863, + "step": 3036 + }, + { + "epoch": 4.369784172661871, + "grad_norm": 0.04710485572015182, + "learning_rate": 3.8106831284813718e-06, + "loss": 0.2775, + "step": 3037 + }, + { + "epoch": 4.3712230215827335, + "grad_norm": 0.04511969348995074, + "learning_rate": 3.7935827083531585e-06, + "loss": 0.2946, + "step": 3038 + }, + { + "epoch": 4.372661870503597, + "grad_norm": 0.046330465569408265, + "learning_rate": 3.7765188333539037e-06, + "loss": 0.2906, + "step": 3039 + }, + { + "epoch": 4.374100719424461, + "grad_norm": 0.045991357799606274, + "learning_rate": 3.759491520707119e-06, + "loss": 0.2858, + "step": 3040 + }, + { + "epoch": 4.3755395683453235, + "grad_norm": 0.04775861216740306, + "learning_rate": 3.74250078759943e-06, + "loss": 0.2827, + "step": 3041 + }, + { + "epoch": 4.376978417266187, + "grad_norm": 0.047533453521402605, + "learning_rate": 3.7255466511805007e-06, + "loss": 0.2917, + "step": 3042 + }, + { + "epoch": 4.37841726618705, + "grad_norm": 0.045122859269612174, + "learning_rate": 3.7086291285630683e-06, + "loss": 0.2796, + "step": 3043 + }, + { + "epoch": 4.379856115107914, + "grad_norm": 0.047364012253357936, + "learning_rate": 3.6917482368229406e-06, + "loss": 0.2852, + "step": 3044 + }, + { + "epoch": 4.381294964028777, + "grad_norm": 0.04676052940699597, + "learning_rate": 3.674903992998915e-06, + "loss": 0.275, + "step": 3045 + }, + { + "epoch": 4.38273381294964, + "grad_norm": 0.046548802436815834, + "learning_rate": 3.6580964140928133e-06, + "loss": 0.2871, + "step": 3046 + }, + { + "epoch": 4.384172661870504, + "grad_norm": 0.04508602407063141, + "learning_rate": 3.6413255170694515e-06, + "loss": 0.2895, + "step": 3047 + }, + { + "epoch": 4.385611510791367, + "grad_norm": 0.0488251074647898, + "learning_rate": 3.6245913188566227e-06, + "loss": 0.2774, + "step": 3048 + }, + { + "epoch": 4.38705035971223, + "grad_norm": 0.046903569771117315, + "learning_rate": 3.607893836345069e-06, + "loss": 0.2806, + "step": 3049 + }, + { + "epoch": 4.388489208633094, + "grad_norm": 0.04365027073587901, + "learning_rate": 3.5912330863884904e-06, + "loss": 0.2892, + "step": 3050 + }, + { + "epoch": 4.3899280575539565, + "grad_norm": 0.045772150095634324, + "learning_rate": 3.574609085803471e-06, + "loss": 0.282, + "step": 3051 + }, + { + "epoch": 4.39136690647482, + "grad_norm": 0.0457368953801093, + "learning_rate": 3.5580218513695573e-06, + "loss": 0.2892, + "step": 3052 + }, + { + "epoch": 4.392805755395684, + "grad_norm": 0.04792704812052965, + "learning_rate": 3.5414713998291483e-06, + "loss": 0.2943, + "step": 3053 + }, + { + "epoch": 4.394244604316547, + "grad_norm": 0.04752511450125093, + "learning_rate": 3.524957747887512e-06, + "loss": 0.2805, + "step": 3054 + }, + { + "epoch": 4.39568345323741, + "grad_norm": 0.04881368473854953, + "learning_rate": 3.5084809122128125e-06, + "loss": 0.2807, + "step": 3055 + }, + { + "epoch": 4.397122302158273, + "grad_norm": 0.0470367816327077, + "learning_rate": 3.4920409094360054e-06, + "loss": 0.2783, + "step": 3056 + }, + { + "epoch": 4.398561151079137, + "grad_norm": 0.0465763916174093, + "learning_rate": 3.475637756150896e-06, + "loss": 0.2917, + "step": 3057 + }, + { + "epoch": 4.4, + "grad_norm": 0.0451566593898251, + "learning_rate": 3.4592714689140895e-06, + "loss": 0.2828, + "step": 3058 + }, + { + "epoch": 4.401438848920863, + "grad_norm": 0.04725631766489214, + "learning_rate": 3.442942064244981e-06, + "loss": 0.2901, + "step": 3059 + }, + { + "epoch": 4.402877697841727, + "grad_norm": 0.045376106227395, + "learning_rate": 3.426649558625732e-06, + "loss": 0.2899, + "step": 3060 + }, + { + "epoch": 4.4043165467625895, + "grad_norm": 0.04467265942320661, + "learning_rate": 3.4103939685012823e-06, + "loss": 0.2853, + "step": 3061 + }, + { + "epoch": 4.405755395683453, + "grad_norm": 0.047030313069995736, + "learning_rate": 3.3941753102792617e-06, + "loss": 0.284, + "step": 3062 + }, + { + "epoch": 4.407194244604317, + "grad_norm": 0.04592765541447238, + "learning_rate": 3.377993600330083e-06, + "loss": 0.2819, + "step": 3063 + }, + { + "epoch": 4.40863309352518, + "grad_norm": 0.044707990629991354, + "learning_rate": 3.361848854986831e-06, + "loss": 0.2891, + "step": 3064 + }, + { + "epoch": 4.410071942446043, + "grad_norm": 0.0461525481123277, + "learning_rate": 3.3457410905452624e-06, + "loss": 0.2793, + "step": 3065 + }, + { + "epoch": 4.411510791366906, + "grad_norm": 0.04469960961460827, + "learning_rate": 3.3296703232638606e-06, + "loss": 0.2867, + "step": 3066 + }, + { + "epoch": 4.41294964028777, + "grad_norm": 0.045028596250381685, + "learning_rate": 3.3136365693637294e-06, + "loss": 0.2847, + "step": 3067 + }, + { + "epoch": 4.414388489208633, + "grad_norm": 0.0451846137511454, + "learning_rate": 3.297639845028604e-06, + "loss": 0.2938, + "step": 3068 + }, + { + "epoch": 4.415827338129496, + "grad_norm": 0.04576617100329387, + "learning_rate": 3.281680166404857e-06, + "loss": 0.2866, + "step": 3069 + }, + { + "epoch": 4.41726618705036, + "grad_norm": 0.048420259568492184, + "learning_rate": 3.265757549601496e-06, + "loss": 0.2819, + "step": 3070 + }, + { + "epoch": 4.418705035971223, + "grad_norm": 0.047462371508189154, + "learning_rate": 3.249872010690074e-06, + "loss": 0.2861, + "step": 3071 + }, + { + "epoch": 4.420143884892086, + "grad_norm": 0.04572972380824012, + "learning_rate": 3.234023565704738e-06, + "loss": 0.2836, + "step": 3072 + }, + { + "epoch": 4.42158273381295, + "grad_norm": 0.04665768998623574, + "learning_rate": 3.2182122306422035e-06, + "loss": 0.2818, + "step": 3073 + }, + { + "epoch": 4.423021582733813, + "grad_norm": 0.046315803895643375, + "learning_rate": 3.2024380214617136e-06, + "loss": 0.2803, + "step": 3074 + }, + { + "epoch": 4.424460431654676, + "grad_norm": 0.04494539815332072, + "learning_rate": 3.186700954085056e-06, + "loss": 0.2934, + "step": 3075 + }, + { + "epoch": 4.42589928057554, + "grad_norm": 0.04586469090336672, + "learning_rate": 3.1710010443965065e-06, + "loss": 0.2816, + "step": 3076 + }, + { + "epoch": 4.427338129496403, + "grad_norm": 0.044106793033919324, + "learning_rate": 3.1553383082428568e-06, + "loss": 0.2807, + "step": 3077 + }, + { + "epoch": 4.428776978417266, + "grad_norm": 0.046162670508771264, + "learning_rate": 3.139712761433367e-06, + "loss": 0.2807, + "step": 3078 + }, + { + "epoch": 4.430215827338129, + "grad_norm": 0.04590486676601341, + "learning_rate": 3.1241244197397626e-06, + "loss": 0.2835, + "step": 3079 + }, + { + "epoch": 4.431654676258993, + "grad_norm": 0.04625623474456893, + "learning_rate": 3.1085732988962003e-06, + "loss": 0.2827, + "step": 3080 + }, + { + "epoch": 4.433093525179856, + "grad_norm": 0.04544268712327201, + "learning_rate": 3.0930594145993063e-06, + "loss": 0.2845, + "step": 3081 + }, + { + "epoch": 4.434532374100719, + "grad_norm": 0.04629058843461137, + "learning_rate": 3.077582782508075e-06, + "loss": 0.2857, + "step": 3082 + }, + { + "epoch": 4.435971223021583, + "grad_norm": 0.05036946924362949, + "learning_rate": 3.0621434182439345e-06, + "loss": 0.2908, + "step": 3083 + }, + { + "epoch": 4.437410071942446, + "grad_norm": 0.04580338282889522, + "learning_rate": 3.0467413373906773e-06, + "loss": 0.2932, + "step": 3084 + }, + { + "epoch": 4.438848920863309, + "grad_norm": 0.04563365127109482, + "learning_rate": 3.0313765554944806e-06, + "loss": 0.2924, + "step": 3085 + }, + { + "epoch": 4.440287769784173, + "grad_norm": 0.04691341222883385, + "learning_rate": 3.0160490880638593e-06, + "loss": 0.2903, + "step": 3086 + }, + { + "epoch": 4.441726618705036, + "grad_norm": 0.04352245865742581, + "learning_rate": 3.0007589505696645e-06, + "loss": 0.2887, + "step": 3087 + }, + { + "epoch": 4.443165467625899, + "grad_norm": 0.04569527256131873, + "learning_rate": 2.9855061584450795e-06, + "loss": 0.2877, + "step": 3088 + }, + { + "epoch": 4.444604316546762, + "grad_norm": 0.04539684907823774, + "learning_rate": 2.97029072708559e-06, + "loss": 0.2878, + "step": 3089 + }, + { + "epoch": 4.446043165467626, + "grad_norm": 0.04655124115211836, + "learning_rate": 2.955112671848963e-06, + "loss": 0.277, + "step": 3090 + }, + { + "epoch": 4.4474820143884894, + "grad_norm": 0.04452158634531101, + "learning_rate": 2.9399720080552383e-06, + "loss": 0.2839, + "step": 3091 + }, + { + "epoch": 4.448920863309352, + "grad_norm": 0.04487552335110992, + "learning_rate": 2.924868750986729e-06, + "loss": 0.2836, + "step": 3092 + }, + { + "epoch": 4.450359712230216, + "grad_norm": 0.045344931430501334, + "learning_rate": 2.9098029158879914e-06, + "loss": 0.288, + "step": 3093 + }, + { + "epoch": 4.4517985611510795, + "grad_norm": 0.04471278511391404, + "learning_rate": 2.8947745179657815e-06, + "loss": 0.2819, + "step": 3094 + }, + { + "epoch": 4.453237410071942, + "grad_norm": 0.047830060036277836, + "learning_rate": 2.8797835723890944e-06, + "loss": 0.2802, + "step": 3095 + }, + { + "epoch": 4.454676258992806, + "grad_norm": 0.04946417523984009, + "learning_rate": 2.864830094289137e-06, + "loss": 0.296, + "step": 3096 + }, + { + "epoch": 4.456115107913669, + "grad_norm": 0.04623580196593618, + "learning_rate": 2.84991409875925e-06, + "loss": 0.2866, + "step": 3097 + }, + { + "epoch": 4.457553956834532, + "grad_norm": 0.04510489404083211, + "learning_rate": 2.8350356008549806e-06, + "loss": 0.2859, + "step": 3098 + }, + { + "epoch": 4.458992805755396, + "grad_norm": 0.045666606052583214, + "learning_rate": 2.8201946155940142e-06, + "loss": 0.2808, + "step": 3099 + }, + { + "epoch": 4.460431654676259, + "grad_norm": 0.04983704496186958, + "learning_rate": 2.8053911579561764e-06, + "loss": 0.2934, + "step": 3100 + }, + { + "epoch": 4.4618705035971225, + "grad_norm": 0.048865598009512075, + "learning_rate": 2.7906252428834044e-06, + "loss": 0.2844, + "step": 3101 + }, + { + "epoch": 4.463309352517985, + "grad_norm": 0.045393191516494076, + "learning_rate": 2.7758968852797542e-06, + "loss": 0.2822, + "step": 3102 + }, + { + "epoch": 4.464748201438849, + "grad_norm": 0.044241543153539384, + "learning_rate": 2.761206100011369e-06, + "loss": 0.2838, + "step": 3103 + }, + { + "epoch": 4.4661870503597125, + "grad_norm": 0.048380967966845406, + "learning_rate": 2.746552901906463e-06, + "loss": 0.2856, + "step": 3104 + }, + { + "epoch": 4.467625899280575, + "grad_norm": 0.04589060710974179, + "learning_rate": 2.731937305755321e-06, + "loss": 0.2898, + "step": 3105 + }, + { + "epoch": 4.469064748201439, + "grad_norm": 0.04799604377711833, + "learning_rate": 2.717359326310249e-06, + "loss": 0.2804, + "step": 3106 + }, + { + "epoch": 4.470503597122303, + "grad_norm": 0.04399166756583154, + "learning_rate": 2.702818978285633e-06, + "loss": 0.2802, + "step": 3107 + }, + { + "epoch": 4.471942446043165, + "grad_norm": 0.045173616948004265, + "learning_rate": 2.688316276357825e-06, + "loss": 0.2852, + "step": 3108 + }, + { + "epoch": 4.473381294964029, + "grad_norm": 0.04880876466760763, + "learning_rate": 2.6738512351652012e-06, + "loss": 0.2824, + "step": 3109 + }, + { + "epoch": 4.474820143884892, + "grad_norm": 0.04937337994587513, + "learning_rate": 2.65942386930814e-06, + "loss": 0.2874, + "step": 3110 + }, + { + "epoch": 4.4762589928057555, + "grad_norm": 0.04798329139322227, + "learning_rate": 2.645034193348961e-06, + "loss": 0.2918, + "step": 3111 + }, + { + "epoch": 4.477697841726619, + "grad_norm": 0.043177094305288106, + "learning_rate": 2.6306822218119533e-06, + "loss": 0.2825, + "step": 3112 + }, + { + "epoch": 4.479136690647482, + "grad_norm": 0.04726294683534518, + "learning_rate": 2.61636796918336e-06, + "loss": 0.2918, + "step": 3113 + }, + { + "epoch": 4.4805755395683455, + "grad_norm": 0.0506165230183232, + "learning_rate": 2.6020914499113438e-06, + "loss": 0.2893, + "step": 3114 + }, + { + "epoch": 4.482014388489208, + "grad_norm": 0.04698891076583604, + "learning_rate": 2.587852678405973e-06, + "loss": 0.2871, + "step": 3115 + }, + { + "epoch": 4.483453237410072, + "grad_norm": 0.045791826692126864, + "learning_rate": 2.5736516690392366e-06, + "loss": 0.2823, + "step": 3116 + }, + { + "epoch": 4.484892086330936, + "grad_norm": 0.045936359862766014, + "learning_rate": 2.5594884361449746e-06, + "loss": 0.2845, + "step": 3117 + }, + { + "epoch": 4.486330935251798, + "grad_norm": 0.049870589317582585, + "learning_rate": 2.5453629940189338e-06, + "loss": 0.29, + "step": 3118 + }, + { + "epoch": 4.487769784172662, + "grad_norm": 0.044952325310770015, + "learning_rate": 2.531275356918701e-06, + "loss": 0.2792, + "step": 3119 + }, + { + "epoch": 4.489208633093525, + "grad_norm": 0.04474980272543277, + "learning_rate": 2.5172255390636878e-06, + "loss": 0.282, + "step": 3120 + }, + { + "epoch": 4.4906474820143885, + "grad_norm": 0.04619342703112689, + "learning_rate": 2.5032135546351644e-06, + "loss": 0.2855, + "step": 3121 + }, + { + "epoch": 4.492086330935252, + "grad_norm": 0.04904437438371335, + "learning_rate": 2.4892394177761947e-06, + "loss": 0.2975, + "step": 3122 + }, + { + "epoch": 4.493525179856115, + "grad_norm": 0.04481478449252868, + "learning_rate": 2.475303142591634e-06, + "loss": 0.287, + "step": 3123 + }, + { + "epoch": 4.4949640287769785, + "grad_norm": 0.04430038256500023, + "learning_rate": 2.461404743148141e-06, + "loss": 0.2838, + "step": 3124 + }, + { + "epoch": 4.496402877697841, + "grad_norm": 0.04863974155020966, + "learning_rate": 2.4475442334741306e-06, + "loss": 0.289, + "step": 3125 + }, + { + "epoch": 4.497841726618705, + "grad_norm": 0.047389061537361936, + "learning_rate": 2.43372162755978e-06, + "loss": 0.2872, + "step": 3126 + }, + { + "epoch": 4.499280575539569, + "grad_norm": 0.04717389258461824, + "learning_rate": 2.419936939357004e-06, + "loss": 0.2805, + "step": 3127 + }, + { + "epoch": 4.500719424460431, + "grad_norm": 0.04648293926121682, + "learning_rate": 2.4061901827794466e-06, + "loss": 0.2762, + "step": 3128 + }, + { + "epoch": 4.502158273381295, + "grad_norm": 0.04495179640002291, + "learning_rate": 2.3924813717024663e-06, + "loss": 0.2798, + "step": 3129 + }, + { + "epoch": 4.503597122302159, + "grad_norm": 0.04786332178584631, + "learning_rate": 2.378810519963124e-06, + "loss": 0.2884, + "step": 3130 + }, + { + "epoch": 4.5050359712230215, + "grad_norm": 0.046038653109148474, + "learning_rate": 2.3651776413601634e-06, + "loss": 0.2879, + "step": 3131 + }, + { + "epoch": 4.506474820143885, + "grad_norm": 0.04552477765442219, + "learning_rate": 2.3515827496539823e-06, + "loss": 0.2891, + "step": 3132 + }, + { + "epoch": 4.507913669064748, + "grad_norm": 0.04590163299711689, + "learning_rate": 2.3380258585666793e-06, + "loss": 0.2835, + "step": 3133 + }, + { + "epoch": 4.5093525179856115, + "grad_norm": 0.04224172604796208, + "learning_rate": 2.324506981781949e-06, + "loss": 0.2789, + "step": 3134 + }, + { + "epoch": 4.510791366906475, + "grad_norm": 0.04482762947979673, + "learning_rate": 2.311026132945138e-06, + "loss": 0.2871, + "step": 3135 + }, + { + "epoch": 4.512230215827338, + "grad_norm": 0.04499861476986286, + "learning_rate": 2.297583325663233e-06, + "loss": 0.282, + "step": 3136 + }, + { + "epoch": 4.513669064748202, + "grad_norm": 0.042959657151869356, + "learning_rate": 2.2841785735047717e-06, + "loss": 0.2818, + "step": 3137 + }, + { + "epoch": 4.515107913669064, + "grad_norm": 0.04649272008206621, + "learning_rate": 2.2708118899999175e-06, + "loss": 0.2856, + "step": 3138 + }, + { + "epoch": 4.516546762589928, + "grad_norm": 0.0460211142434342, + "learning_rate": 2.2574832886403988e-06, + "loss": 0.2901, + "step": 3139 + }, + { + "epoch": 4.517985611510792, + "grad_norm": 0.0451331665609917, + "learning_rate": 2.2441927828795106e-06, + "loss": 0.2844, + "step": 3140 + }, + { + "epoch": 4.5194244604316545, + "grad_norm": 0.044479318953797725, + "learning_rate": 2.230940386132088e-06, + "loss": 0.2845, + "step": 3141 + }, + { + "epoch": 4.520863309352518, + "grad_norm": 0.044434221175025886, + "learning_rate": 2.21772611177451e-06, + "loss": 0.2871, + "step": 3142 + }, + { + "epoch": 4.522302158273382, + "grad_norm": 0.04497800404610502, + "learning_rate": 2.204549973144654e-06, + "loss": 0.2907, + "step": 3143 + }, + { + "epoch": 4.5237410071942445, + "grad_norm": 0.04434053092042553, + "learning_rate": 2.1914119835419358e-06, + "loss": 0.289, + "step": 3144 + }, + { + "epoch": 4.525179856115108, + "grad_norm": 0.044472727823722906, + "learning_rate": 2.178312156227258e-06, + "loss": 0.2903, + "step": 3145 + }, + { + "epoch": 4.526618705035971, + "grad_norm": 0.04352294261035754, + "learning_rate": 2.1652505044229734e-06, + "loss": 0.287, + "step": 3146 + }, + { + "epoch": 4.528057553956835, + "grad_norm": 0.047867061889741554, + "learning_rate": 2.1522270413129444e-06, + "loss": 0.2931, + "step": 3147 + }, + { + "epoch": 4.529496402877697, + "grad_norm": 0.04381400605651353, + "learning_rate": 2.1392417800424738e-06, + "loss": 0.2869, + "step": 3148 + }, + { + "epoch": 4.530935251798561, + "grad_norm": 0.04521821920349319, + "learning_rate": 2.1262947337182815e-06, + "loss": 0.2893, + "step": 3149 + }, + { + "epoch": 4.532374100719425, + "grad_norm": 0.04553689821434878, + "learning_rate": 2.113385915408546e-06, + "loss": 0.2836, + "step": 3150 + }, + { + "epoch": 4.5338129496402875, + "grad_norm": 0.04379746931920125, + "learning_rate": 2.100515338142839e-06, + "loss": 0.2811, + "step": 3151 + }, + { + "epoch": 4.535251798561151, + "grad_norm": 0.04413680616493023, + "learning_rate": 2.087683014912152e-06, + "loss": 0.279, + "step": 3152 + }, + { + "epoch": 4.536690647482015, + "grad_norm": 0.046387639158312635, + "learning_rate": 2.0748889586688526e-06, + "loss": 0.286, + "step": 3153 + }, + { + "epoch": 4.5381294964028775, + "grad_norm": 0.04508938254342468, + "learning_rate": 2.0621331823266777e-06, + "loss": 0.2795, + "step": 3154 + }, + { + "epoch": 4.539568345323741, + "grad_norm": 0.0468802883650182, + "learning_rate": 2.049415698760746e-06, + "loss": 0.2865, + "step": 3155 + }, + { + "epoch": 4.541007194244604, + "grad_norm": 0.04624928272944986, + "learning_rate": 2.036736520807505e-06, + "loss": 0.2889, + "step": 3156 + }, + { + "epoch": 4.542446043165468, + "grad_norm": 0.04474152738166633, + "learning_rate": 2.0240956612647487e-06, + "loss": 0.2876, + "step": 3157 + }, + { + "epoch": 4.543884892086331, + "grad_norm": 0.04216196285366899, + "learning_rate": 2.011493132891591e-06, + "loss": 0.2785, + "step": 3158 + }, + { + "epoch": 4.545323741007194, + "grad_norm": 0.04497742131776572, + "learning_rate": 1.998928948408465e-06, + "loss": 0.2885, + "step": 3159 + }, + { + "epoch": 4.546762589928058, + "grad_norm": 0.04569780342971736, + "learning_rate": 1.9864031204970847e-06, + "loss": 0.2855, + "step": 3160 + }, + { + "epoch": 4.5482014388489205, + "grad_norm": 0.04775506469401655, + "learning_rate": 1.973915661800452e-06, + "loss": 0.2822, + "step": 3161 + }, + { + "epoch": 4.549640287769784, + "grad_norm": 0.04382785070841345, + "learning_rate": 1.9614665849228666e-06, + "loss": 0.2893, + "step": 3162 + }, + { + "epoch": 4.551079136690648, + "grad_norm": 0.04308916245297393, + "learning_rate": 1.949055902429846e-06, + "loss": 0.2979, + "step": 3163 + }, + { + "epoch": 4.5525179856115106, + "grad_norm": 0.045927616822700745, + "learning_rate": 1.936683626848179e-06, + "loss": 0.2961, + "step": 3164 + }, + { + "epoch": 4.553956834532374, + "grad_norm": 0.04318819506247855, + "learning_rate": 1.9243497706658944e-06, + "loss": 0.2817, + "step": 3165 + }, + { + "epoch": 4.555395683453238, + "grad_norm": 0.04395040323865869, + "learning_rate": 1.9120543463322238e-06, + "loss": 0.2878, + "step": 3166 + }, + { + "epoch": 4.556834532374101, + "grad_norm": 0.04870584182566934, + "learning_rate": 1.899797366257614e-06, + "loss": 0.2904, + "step": 3167 + }, + { + "epoch": 4.558273381294964, + "grad_norm": 0.042490273184344104, + "learning_rate": 1.887578842813711e-06, + "loss": 0.2747, + "step": 3168 + }, + { + "epoch": 4.559712230215827, + "grad_norm": 0.044974677146473956, + "learning_rate": 1.875398788333347e-06, + "loss": 0.2916, + "step": 3169 + }, + { + "epoch": 4.561151079136691, + "grad_norm": 0.04218317060094236, + "learning_rate": 1.8632572151105189e-06, + "loss": 0.2782, + "step": 3170 + }, + { + "epoch": 4.5625899280575535, + "grad_norm": 0.04113782679463585, + "learning_rate": 1.8511541354003882e-06, + "loss": 0.2765, + "step": 3171 + }, + { + "epoch": 4.564028776978417, + "grad_norm": 0.04285203911801668, + "learning_rate": 1.8390895614192405e-06, + "loss": 0.2899, + "step": 3172 + }, + { + "epoch": 4.565467625899281, + "grad_norm": 0.04210684058326767, + "learning_rate": 1.8270635053445352e-06, + "loss": 0.2863, + "step": 3173 + }, + { + "epoch": 4.566906474820144, + "grad_norm": 0.04274374266127899, + "learning_rate": 1.8150759793148332e-06, + "loss": 0.289, + "step": 3174 + }, + { + "epoch": 4.568345323741007, + "grad_norm": 0.04433194830899028, + "learning_rate": 1.803126995429789e-06, + "loss": 0.2898, + "step": 3175 + }, + { + "epoch": 4.569784172661871, + "grad_norm": 0.04368810837599763, + "learning_rate": 1.7912165657501779e-06, + "loss": 0.2907, + "step": 3176 + }, + { + "epoch": 4.571223021582734, + "grad_norm": 0.04298745026737905, + "learning_rate": 1.779344702297845e-06, + "loss": 0.2894, + "step": 3177 + }, + { + "epoch": 4.572661870503597, + "grad_norm": 0.04218022078052001, + "learning_rate": 1.767511417055725e-06, + "loss": 0.2862, + "step": 3178 + }, + { + "epoch": 4.57410071942446, + "grad_norm": 0.04265609620970078, + "learning_rate": 1.7557167219678018e-06, + "loss": 0.277, + "step": 3179 + }, + { + "epoch": 4.575539568345324, + "grad_norm": 0.0439300849590657, + "learning_rate": 1.7439606289391032e-06, + "loss": 0.2821, + "step": 3180 + }, + { + "epoch": 4.576978417266187, + "grad_norm": 0.04227905131910428, + "learning_rate": 1.7322431498357063e-06, + "loss": 0.2809, + "step": 3181 + }, + { + "epoch": 4.57841726618705, + "grad_norm": 0.04472282355744868, + "learning_rate": 1.7205642964847103e-06, + "loss": 0.2885, + "step": 3182 + }, + { + "epoch": 4.579856115107914, + "grad_norm": 0.044805856192444427, + "learning_rate": 1.7089240806742147e-06, + "loss": 0.279, + "step": 3183 + }, + { + "epoch": 4.581294964028777, + "grad_norm": 0.046279114883507784, + "learning_rate": 1.697322514153341e-06, + "loss": 0.2905, + "step": 3184 + }, + { + "epoch": 4.58273381294964, + "grad_norm": 0.042683798448461145, + "learning_rate": 1.6857596086321848e-06, + "loss": 0.2815, + "step": 3185 + }, + { + "epoch": 4.584172661870504, + "grad_norm": 0.04562810926048852, + "learning_rate": 1.6742353757818187e-06, + "loss": 0.2852, + "step": 3186 + }, + { + "epoch": 4.585611510791367, + "grad_norm": 0.04768178060373971, + "learning_rate": 1.6627498272342802e-06, + "loss": 0.2831, + "step": 3187 + }, + { + "epoch": 4.58705035971223, + "grad_norm": 0.04487900169719795, + "learning_rate": 1.6513029745825803e-06, + "loss": 0.286, + "step": 3188 + }, + { + "epoch": 4.588489208633094, + "grad_norm": 0.042795918038530806, + "learning_rate": 1.6398948293806504e-06, + "loss": 0.2842, + "step": 3189 + }, + { + "epoch": 4.589928057553957, + "grad_norm": 0.04303209927416349, + "learning_rate": 1.6285254031433462e-06, + "loss": 0.2828, + "step": 3190 + }, + { + "epoch": 4.59136690647482, + "grad_norm": 0.042562349055923286, + "learning_rate": 1.6171947073464834e-06, + "loss": 0.2758, + "step": 3191 + }, + { + "epoch": 4.592805755395683, + "grad_norm": 0.044191634516535336, + "learning_rate": 1.6059027534267313e-06, + "loss": 0.2851, + "step": 3192 + }, + { + "epoch": 4.594244604316547, + "grad_norm": 0.042741946610148566, + "learning_rate": 1.594649552781693e-06, + "loss": 0.2836, + "step": 3193 + }, + { + "epoch": 4.5956834532374105, + "grad_norm": 0.04473222357427111, + "learning_rate": 1.5834351167698336e-06, + "loss": 0.2888, + "step": 3194 + }, + { + "epoch": 4.597122302158273, + "grad_norm": 0.04678881538800095, + "learning_rate": 1.572259456710512e-06, + "loss": 0.2889, + "step": 3195 + }, + { + "epoch": 4.598561151079137, + "grad_norm": 0.0445047082886457, + "learning_rate": 1.5611225838839272e-06, + "loss": 0.2866, + "step": 3196 + }, + { + "epoch": 4.6, + "grad_norm": 0.04198922277969195, + "learning_rate": 1.550024509531145e-06, + "loss": 0.2832, + "step": 3197 + }, + { + "epoch": 4.601438848920863, + "grad_norm": 0.04780944683412724, + "learning_rate": 1.5389652448540537e-06, + "loss": 0.2825, + "step": 3198 + }, + { + "epoch": 4.602877697841727, + "grad_norm": 0.047847664878454765, + "learning_rate": 1.527944801015382e-06, + "loss": 0.2883, + "step": 3199 + }, + { + "epoch": 4.60431654676259, + "grad_norm": 0.0458686980032547, + "learning_rate": 1.5169631891386805e-06, + "loss": 0.2896, + "step": 3200 + }, + { + "epoch": 4.605755395683453, + "grad_norm": 0.04451970044204277, + "learning_rate": 1.506020420308274e-06, + "loss": 0.2809, + "step": 3201 + }, + { + "epoch": 4.607194244604317, + "grad_norm": 0.047936867529953305, + "learning_rate": 1.495116505569314e-06, + "loss": 0.2807, + "step": 3202 + }, + { + "epoch": 4.60863309352518, + "grad_norm": 0.04600991740189068, + "learning_rate": 1.4842514559277254e-06, + "loss": 0.2827, + "step": 3203 + }, + { + "epoch": 4.6100719424460435, + "grad_norm": 0.042638124793315746, + "learning_rate": 1.4734252823501894e-06, + "loss": 0.2845, + "step": 3204 + }, + { + "epoch": 4.611510791366906, + "grad_norm": 0.04414564436388609, + "learning_rate": 1.4626379957641646e-06, + "loss": 0.285, + "step": 3205 + }, + { + "epoch": 4.61294964028777, + "grad_norm": 0.0429683620965709, + "learning_rate": 1.451889607057848e-06, + "loss": 0.2841, + "step": 3206 + }, + { + "epoch": 4.614388489208633, + "grad_norm": 0.043993994658264805, + "learning_rate": 1.4411801270801885e-06, + "loss": 0.2918, + "step": 3207 + }, + { + "epoch": 4.615827338129496, + "grad_norm": 0.044701654054872725, + "learning_rate": 1.4305095666408453e-06, + "loss": 0.2775, + "step": 3208 + }, + { + "epoch": 4.61726618705036, + "grad_norm": 0.0485189917496429, + "learning_rate": 1.4198779365102077e-06, + "loss": 0.2769, + "step": 3209 + }, + { + "epoch": 4.618705035971223, + "grad_norm": 0.04409380806889366, + "learning_rate": 1.409285247419363e-06, + "loss": 0.2815, + "step": 3210 + }, + { + "epoch": 4.620143884892086, + "grad_norm": 0.043161989577127885, + "learning_rate": 1.3987315100600961e-06, + "loss": 0.2882, + "step": 3211 + }, + { + "epoch": 4.62158273381295, + "grad_norm": 0.042634187481623204, + "learning_rate": 1.3882167350848686e-06, + "loss": 0.2779, + "step": 3212 + }, + { + "epoch": 4.623021582733813, + "grad_norm": 0.04483985224792961, + "learning_rate": 1.3777409331068258e-06, + "loss": 0.2838, + "step": 3213 + }, + { + "epoch": 4.6244604316546765, + "grad_norm": 0.04550005592664216, + "learning_rate": 1.3673041146997768e-06, + "loss": 0.2885, + "step": 3214 + }, + { + "epoch": 4.625899280575539, + "grad_norm": 0.044129693064556864, + "learning_rate": 1.35690629039817e-06, + "loss": 0.2885, + "step": 3215 + }, + { + "epoch": 4.627338129496403, + "grad_norm": 0.044326368086113545, + "learning_rate": 1.346547470697095e-06, + "loss": 0.2887, + "step": 3216 + }, + { + "epoch": 4.6287769784172665, + "grad_norm": 0.04446788991957044, + "learning_rate": 1.3362276660522943e-06, + "loss": 0.2831, + "step": 3217 + }, + { + "epoch": 4.630215827338129, + "grad_norm": 0.043743410105604154, + "learning_rate": 1.325946886880103e-06, + "loss": 0.2779, + "step": 3218 + }, + { + "epoch": 4.631654676258993, + "grad_norm": 0.04217257440269431, + "learning_rate": 1.315705143557482e-06, + "loss": 0.2919, + "step": 3219 + }, + { + "epoch": 4.633093525179856, + "grad_norm": 0.04389969404816933, + "learning_rate": 1.3055024464219846e-06, + "loss": 0.2857, + "step": 3220 + }, + { + "epoch": 4.634532374100719, + "grad_norm": 0.04109587558951776, + "learning_rate": 1.295338805771751e-06, + "loss": 0.2914, + "step": 3221 + }, + { + "epoch": 4.635971223021583, + "grad_norm": 0.04231590117042742, + "learning_rate": 1.285214231865508e-06, + "loss": 0.2856, + "step": 3222 + }, + { + "epoch": 4.637410071942446, + "grad_norm": 0.0443269152627286, + "learning_rate": 1.2751287349225484e-06, + "loss": 0.2855, + "step": 3223 + }, + { + "epoch": 4.6388489208633095, + "grad_norm": 0.0462980954449299, + "learning_rate": 1.2650823251227062e-06, + "loss": 0.2884, + "step": 3224 + }, + { + "epoch": 4.640287769784173, + "grad_norm": 0.040429547658139595, + "learning_rate": 1.255075012606386e-06, + "loss": 0.2832, + "step": 3225 + }, + { + "epoch": 4.641726618705036, + "grad_norm": 0.04486184187050472, + "learning_rate": 1.2451068074745254e-06, + "loss": 0.2843, + "step": 3226 + }, + { + "epoch": 4.6431654676258995, + "grad_norm": 0.042249295960709955, + "learning_rate": 1.2351777197885606e-06, + "loss": 0.2881, + "step": 3227 + }, + { + "epoch": 4.644604316546762, + "grad_norm": 0.04458340484085948, + "learning_rate": 1.2252877595704838e-06, + "loss": 0.2837, + "step": 3228 + }, + { + "epoch": 4.646043165467626, + "grad_norm": 0.04208468376135168, + "learning_rate": 1.2154369368027763e-06, + "loss": 0.2875, + "step": 3229 + }, + { + "epoch": 4.647482014388489, + "grad_norm": 0.04374783965571623, + "learning_rate": 1.2056252614284047e-06, + "loss": 0.2819, + "step": 3230 + }, + { + "epoch": 4.648920863309352, + "grad_norm": 0.04525843062003917, + "learning_rate": 1.1958527433508381e-06, + "loss": 0.2836, + "step": 3231 + }, + { + "epoch": 4.650359712230216, + "grad_norm": 0.04466294146865429, + "learning_rate": 1.1861193924340176e-06, + "loss": 0.2917, + "step": 3232 + }, + { + "epoch": 4.651798561151079, + "grad_norm": 0.04071085015830011, + "learning_rate": 1.176425218502346e-06, + "loss": 0.283, + "step": 3233 + }, + { + "epoch": 4.6532374100719425, + "grad_norm": 0.04326246127923558, + "learning_rate": 1.1667702313406903e-06, + "loss": 0.2778, + "step": 3234 + }, + { + "epoch": 4.654676258992806, + "grad_norm": 0.041761485943218456, + "learning_rate": 1.1571544406943614e-06, + "loss": 0.2802, + "step": 3235 + }, + { + "epoch": 4.656115107913669, + "grad_norm": 0.04394084699989742, + "learning_rate": 1.147577856269102e-06, + "loss": 0.2851, + "step": 3236 + }, + { + "epoch": 4.6575539568345325, + "grad_norm": 0.04384334299676405, + "learning_rate": 1.1380404877310957e-06, + "loss": 0.2799, + "step": 3237 + }, + { + "epoch": 4.658992805755395, + "grad_norm": 0.044116322355897984, + "learning_rate": 1.1285423447069133e-06, + "loss": 0.2828, + "step": 3238 + }, + { + "epoch": 4.660431654676259, + "grad_norm": 0.04552590103355667, + "learning_rate": 1.1190834367835701e-06, + "loss": 0.286, + "step": 3239 + }, + { + "epoch": 4.661870503597123, + "grad_norm": 0.0465443435010611, + "learning_rate": 1.1096637735084602e-06, + "loss": 0.2814, + "step": 3240 + }, + { + "epoch": 4.663309352517985, + "grad_norm": 0.04360432862627896, + "learning_rate": 1.1002833643893606e-06, + "loss": 0.2806, + "step": 3241 + }, + { + "epoch": 4.664748201438849, + "grad_norm": 0.041014423211236745, + "learning_rate": 1.0909422188944308e-06, + "loss": 0.288, + "step": 3242 + }, + { + "epoch": 4.666187050359712, + "grad_norm": 0.043487675209465046, + "learning_rate": 1.0816403464522262e-06, + "loss": 0.2781, + "step": 3243 + }, + { + "epoch": 4.6676258992805755, + "grad_norm": 0.0434688738922224, + "learning_rate": 1.0723777564516148e-06, + "loss": 0.283, + "step": 3244 + }, + { + "epoch": 4.669064748201439, + "grad_norm": 0.0436478159108985, + "learning_rate": 1.0631544582418463e-06, + "loss": 0.2798, + "step": 3245 + }, + { + "epoch": 4.670503597122302, + "grad_norm": 0.04698488687018127, + "learning_rate": 1.0539704611325008e-06, + "loss": 0.2891, + "step": 3246 + }, + { + "epoch": 4.6719424460431656, + "grad_norm": 0.04252447518711718, + "learning_rate": 1.0448257743934964e-06, + "loss": 0.2794, + "step": 3247 + }, + { + "epoch": 4.673381294964029, + "grad_norm": 0.04133669991529513, + "learning_rate": 1.0357204072550676e-06, + "loss": 0.2791, + "step": 3248 + }, + { + "epoch": 4.674820143884892, + "grad_norm": 0.04590539275118129, + "learning_rate": 1.0266543689077602e-06, + "loss": 0.2867, + "step": 3249 + }, + { + "epoch": 4.676258992805756, + "grad_norm": 0.04123553797223722, + "learning_rate": 1.0176276685024233e-06, + "loss": 0.2832, + "step": 3250 + }, + { + "epoch": 4.677697841726618, + "grad_norm": 0.04289768657332813, + "learning_rate": 1.0086403151502088e-06, + "loss": 0.2893, + "step": 3251 + }, + { + "epoch": 4.679136690647482, + "grad_norm": 0.042216007316176335, + "learning_rate": 9.996923179225448e-07, + "loss": 0.2832, + "step": 3252 + }, + { + "epoch": 4.680575539568346, + "grad_norm": 0.04522822430226165, + "learning_rate": 9.90783685851131e-07, + "loss": 0.2889, + "step": 3253 + }, + { + "epoch": 4.6820143884892085, + "grad_norm": 0.04486886217148146, + "learning_rate": 9.81914427927948e-07, + "loss": 0.2807, + "step": 3254 + }, + { + "epoch": 4.683453237410072, + "grad_norm": 0.04206957973117205, + "learning_rate": 9.730845531052214e-07, + "loss": 0.285, + "step": 3255 + }, + { + "epoch": 4.684892086330935, + "grad_norm": 0.04268891368382537, + "learning_rate": 9.642940702954306e-07, + "loss": 0.2899, + "step": 3256 + }, + { + "epoch": 4.686330935251799, + "grad_norm": 0.04223922193649368, + "learning_rate": 9.555429883712963e-07, + "loss": 0.2839, + "step": 3257 + }, + { + "epoch": 4.687769784172662, + "grad_norm": 0.04116027131343593, + "learning_rate": 9.468313161657617e-07, + "loss": 0.288, + "step": 3258 + }, + { + "epoch": 4.689208633093525, + "grad_norm": 0.042695974314423804, + "learning_rate": 9.381590624719972e-07, + "loss": 0.2844, + "step": 3259 + }, + { + "epoch": 4.690647482014389, + "grad_norm": 0.04190157933084132, + "learning_rate": 9.295262360433921e-07, + "loss": 0.2799, + "step": 3260 + }, + { + "epoch": 4.692086330935252, + "grad_norm": 0.04509694273732758, + "learning_rate": 9.209328455935274e-07, + "loss": 0.2827, + "step": 3261 + }, + { + "epoch": 4.693525179856115, + "grad_norm": 0.04320569445563929, + "learning_rate": 9.123788997961847e-07, + "loss": 0.2849, + "step": 3262 + }, + { + "epoch": 4.694964028776979, + "grad_norm": 0.04161867791634296, + "learning_rate": 9.038644072853331e-07, + "loss": 0.2789, + "step": 3263 + }, + { + "epoch": 4.6964028776978415, + "grad_norm": 0.042314134927405225, + "learning_rate": 8.953893766551203e-07, + "loss": 0.2861, + "step": 3264 + }, + { + "epoch": 4.697841726618705, + "grad_norm": 0.041629546653669705, + "learning_rate": 8.86953816459859e-07, + "loss": 0.2879, + "step": 3265 + }, + { + "epoch": 4.699280575539568, + "grad_norm": 0.041723942617953015, + "learning_rate": 8.785577352140317e-07, + "loss": 0.2806, + "step": 3266 + }, + { + "epoch": 4.700719424460432, + "grad_norm": 0.04454818248431817, + "learning_rate": 8.702011413922506e-07, + "loss": 0.2862, + "step": 3267 + }, + { + "epoch": 4.702158273381295, + "grad_norm": 0.04355139769352831, + "learning_rate": 8.61884043429293e-07, + "loss": 0.2818, + "step": 3268 + }, + { + "epoch": 4.703597122302158, + "grad_norm": 0.04204134591185999, + "learning_rate": 8.536064497200702e-07, + "loss": 0.2805, + "step": 3269 + }, + { + "epoch": 4.705035971223022, + "grad_norm": 0.04270053767629897, + "learning_rate": 8.453683686196012e-07, + "loss": 0.2837, + "step": 3270 + }, + { + "epoch": 4.706474820143885, + "grad_norm": 0.04240735124788586, + "learning_rate": 8.371698084430346e-07, + "loss": 0.2822, + "step": 3271 + }, + { + "epoch": 4.707913669064748, + "grad_norm": 0.04245460034719818, + "learning_rate": 8.290107774656441e-07, + "loss": 0.2882, + "step": 3272 + }, + { + "epoch": 4.709352517985612, + "grad_norm": 0.04200433994331035, + "learning_rate": 8.208912839227712e-07, + "loss": 0.2759, + "step": 3273 + }, + { + "epoch": 4.7107913669064745, + "grad_norm": 0.04154456441805117, + "learning_rate": 8.128113360098777e-07, + "loss": 0.2841, + "step": 3274 + }, + { + "epoch": 4.712230215827338, + "grad_norm": 0.043042952807340926, + "learning_rate": 8.047709418824934e-07, + "loss": 0.2871, + "step": 3275 + }, + { + "epoch": 4.713669064748202, + "grad_norm": 0.043440337986098886, + "learning_rate": 7.96770109656233e-07, + "loss": 0.2901, + "step": 3276 + }, + { + "epoch": 4.715107913669065, + "grad_norm": 0.04303030245868966, + "learning_rate": 7.88808847406779e-07, + "loss": 0.286, + "step": 3277 + }, + { + "epoch": 4.716546762589928, + "grad_norm": 0.041327486619416834, + "learning_rate": 7.808871631698723e-07, + "loss": 0.2942, + "step": 3278 + }, + { + "epoch": 4.717985611510791, + "grad_norm": 0.0407580233165075, + "learning_rate": 7.730050649412946e-07, + "loss": 0.2797, + "step": 3279 + }, + { + "epoch": 4.719424460431655, + "grad_norm": 0.045055105516116564, + "learning_rate": 7.651625606768908e-07, + "loss": 0.2856, + "step": 3280 + }, + { + "epoch": 4.720863309352518, + "grad_norm": 0.04163709593874125, + "learning_rate": 7.573596582925291e-07, + "loss": 0.277, + "step": 3281 + }, + { + "epoch": 4.722302158273381, + "grad_norm": 0.04259533465764146, + "learning_rate": 7.495963656641048e-07, + "loss": 0.2947, + "step": 3282 + }, + { + "epoch": 4.723741007194245, + "grad_norm": 0.043496681746895215, + "learning_rate": 7.418726906275497e-07, + "loss": 0.2899, + "step": 3283 + }, + { + "epoch": 4.725179856115108, + "grad_norm": 0.04103629481090085, + "learning_rate": 7.341886409787746e-07, + "loss": 0.2759, + "step": 3284 + }, + { + "epoch": 4.726618705035971, + "grad_norm": 0.042188130402698934, + "learning_rate": 7.265442244737264e-07, + "loss": 0.2772, + "step": 3285 + }, + { + "epoch": 4.728057553956835, + "grad_norm": 0.04149586786309318, + "learning_rate": 7.189394488283307e-07, + "loss": 0.2861, + "step": 3286 + }, + { + "epoch": 4.729496402877698, + "grad_norm": 0.041391815410889245, + "learning_rate": 7.113743217185099e-07, + "loss": 0.2805, + "step": 3287 + }, + { + "epoch": 4.730935251798561, + "grad_norm": 0.04115458244017699, + "learning_rate": 7.0384885078016e-07, + "loss": 0.293, + "step": 3288 + }, + { + "epoch": 4.732374100719424, + "grad_norm": 0.04172485851828882, + "learning_rate": 6.963630436091518e-07, + "loss": 0.2848, + "step": 3289 + }, + { + "epoch": 4.733812949640288, + "grad_norm": 0.040586990215047185, + "learning_rate": 6.889169077613212e-07, + "loss": 0.2868, + "step": 3290 + }, + { + "epoch": 4.735251798561151, + "grad_norm": 0.04112353475003874, + "learning_rate": 6.815104507524695e-07, + "loss": 0.2801, + "step": 3291 + }, + { + "epoch": 4.736690647482014, + "grad_norm": 0.04157853046899118, + "learning_rate": 6.741436800583367e-07, + "loss": 0.2927, + "step": 3292 + }, + { + "epoch": 4.738129496402878, + "grad_norm": 0.0438786444494133, + "learning_rate": 6.668166031146062e-07, + "loss": 0.2866, + "step": 3293 + }, + { + "epoch": 4.739568345323741, + "grad_norm": 0.042734560172589184, + "learning_rate": 6.595292273169041e-07, + "loss": 0.2839, + "step": 3294 + }, + { + "epoch": 4.741007194244604, + "grad_norm": 0.042515843125377976, + "learning_rate": 6.522815600207866e-07, + "loss": 0.2772, + "step": 3295 + }, + { + "epoch": 4.742446043165468, + "grad_norm": 0.0402613039250005, + "learning_rate": 6.450736085417086e-07, + "loss": 0.279, + "step": 3296 + }, + { + "epoch": 4.743884892086331, + "grad_norm": 0.04109263705843827, + "learning_rate": 6.379053801550594e-07, + "loss": 0.2822, + "step": 3297 + }, + { + "epoch": 4.745323741007194, + "grad_norm": 0.09293908614636029, + "learning_rate": 6.307768820961269e-07, + "loss": 0.2841, + "step": 3298 + }, + { + "epoch": 4.746762589928058, + "grad_norm": 0.04041588774309918, + "learning_rate": 6.236881215600976e-07, + "loss": 0.2862, + "step": 3299 + }, + { + "epoch": 4.748201438848921, + "grad_norm": 0.04211380249832902, + "learning_rate": 6.166391057020438e-07, + "loss": 0.2841, + "step": 3300 + }, + { + "epoch": 4.749640287769784, + "grad_norm": 0.04119439798710801, + "learning_rate": 6.096298416369273e-07, + "loss": 0.2942, + "step": 3301 + }, + { + "epoch": 4.751079136690647, + "grad_norm": 0.04190903520399297, + "learning_rate": 6.026603364395867e-07, + "loss": 0.2875, + "step": 3302 + }, + { + "epoch": 4.752517985611511, + "grad_norm": 0.046643300434686695, + "learning_rate": 5.957305971447192e-07, + "loss": 0.2799, + "step": 3303 + }, + { + "epoch": 4.753956834532374, + "grad_norm": 0.04302145218360165, + "learning_rate": 5.888406307468986e-07, + "loss": 0.2889, + "step": 3304 + }, + { + "epoch": 4.755395683453237, + "grad_norm": 0.04132944990889099, + "learning_rate": 5.819904442005442e-07, + "loss": 0.2838, + "step": 3305 + }, + { + "epoch": 4.756834532374101, + "grad_norm": 0.04161240484067499, + "learning_rate": 5.751800444199295e-07, + "loss": 0.2844, + "step": 3306 + }, + { + "epoch": 4.7582733812949645, + "grad_norm": 0.04256756179905745, + "learning_rate": 5.684094382791605e-07, + "loss": 0.2956, + "step": 3307 + }, + { + "epoch": 4.759712230215827, + "grad_norm": 0.043508014652496096, + "learning_rate": 5.616786326121837e-07, + "loss": 0.2856, + "step": 3308 + }, + { + "epoch": 4.761151079136691, + "grad_norm": 0.040238845233041366, + "learning_rate": 5.54987634212778e-07, + "loss": 0.2831, + "step": 3309 + }, + { + "epoch": 4.762589928057554, + "grad_norm": 0.04020307345574689, + "learning_rate": 5.483364498345279e-07, + "loss": 0.2814, + "step": 3310 + }, + { + "epoch": 4.764028776978417, + "grad_norm": 0.04346406741583254, + "learning_rate": 5.417250861908452e-07, + "loss": 0.2861, + "step": 3311 + }, + { + "epoch": 4.76546762589928, + "grad_norm": 0.04217980322246722, + "learning_rate": 5.351535499549387e-07, + "loss": 0.2864, + "step": 3312 + }, + { + "epoch": 4.766906474820144, + "grad_norm": 0.0431790541198276, + "learning_rate": 5.286218477598226e-07, + "loss": 0.2881, + "step": 3313 + }, + { + "epoch": 4.768345323741007, + "grad_norm": 0.040920642398328155, + "learning_rate": 5.221299861983075e-07, + "loss": 0.283, + "step": 3314 + }, + { + "epoch": 4.76978417266187, + "grad_norm": 0.04113680865948008, + "learning_rate": 5.156779718229787e-07, + "loss": 0.283, + "step": 3315 + }, + { + "epoch": 4.771223021582734, + "grad_norm": 0.04206869377518095, + "learning_rate": 5.092658111462179e-07, + "loss": 0.2819, + "step": 3316 + }, + { + "epoch": 4.7726618705035975, + "grad_norm": 0.04145270335602184, + "learning_rate": 5.028935106401678e-07, + "loss": 0.2862, + "step": 3317 + }, + { + "epoch": 4.77410071942446, + "grad_norm": 0.041916908198382855, + "learning_rate": 4.965610767367413e-07, + "loss": 0.284, + "step": 3318 + }, + { + "epoch": 4.775539568345324, + "grad_norm": 0.04318371796542412, + "learning_rate": 4.902685158276078e-07, + "loss": 0.2872, + "step": 3319 + }, + { + "epoch": 4.7769784172661875, + "grad_norm": 0.04269750788765892, + "learning_rate": 4.840158342642021e-07, + "loss": 0.2844, + "step": 3320 + }, + { + "epoch": 4.77841726618705, + "grad_norm": 0.04167765037493412, + "learning_rate": 4.778030383577026e-07, + "loss": 0.2817, + "step": 3321 + }, + { + "epoch": 4.779856115107914, + "grad_norm": 0.04676371457990791, + "learning_rate": 4.716301343790175e-07, + "loss": 0.2924, + "step": 3322 + }, + { + "epoch": 4.781294964028777, + "grad_norm": 0.04279536578991645, + "learning_rate": 4.6549712855879837e-07, + "loss": 0.2751, + "step": 3323 + }, + { + "epoch": 4.78273381294964, + "grad_norm": 0.040420197504620795, + "learning_rate": 4.5940402708744005e-07, + "loss": 0.2852, + "step": 3324 + }, + { + "epoch": 4.784172661870503, + "grad_norm": 0.04245397455825739, + "learning_rate": 4.5335083611502293e-07, + "loss": 0.2924, + "step": 3325 + }, + { + "epoch": 4.785611510791367, + "grad_norm": 0.042732378110714474, + "learning_rate": 4.473375617513842e-07, + "loss": 0.2889, + "step": 3326 + }, + { + "epoch": 4.7870503597122305, + "grad_norm": 0.04075680836884234, + "learning_rate": 4.41364210066042e-07, + "loss": 0.2857, + "step": 3327 + }, + { + "epoch": 4.788489208633093, + "grad_norm": 0.041591069620682895, + "learning_rate": 4.3543078708823126e-07, + "loss": 0.2886, + "step": 3328 + }, + { + "epoch": 4.789928057553957, + "grad_norm": 0.041438570320909945, + "learning_rate": 4.295372988068813e-07, + "loss": 0.2901, + "step": 3329 + }, + { + "epoch": 4.7913669064748206, + "grad_norm": 0.04146670829980007, + "learning_rate": 4.2368375117062043e-07, + "loss": 0.285, + "step": 3330 + }, + { + "epoch": 4.792805755395683, + "grad_norm": 0.0413497691397624, + "learning_rate": 4.178701500877491e-07, + "loss": 0.2769, + "step": 3331 + }, + { + "epoch": 4.794244604316547, + "grad_norm": 0.041040800163841984, + "learning_rate": 4.120965014262579e-07, + "loss": 0.2882, + "step": 3332 + }, + { + "epoch": 4.79568345323741, + "grad_norm": 0.04189017618203409, + "learning_rate": 4.063628110138096e-07, + "loss": 0.2904, + "step": 3333 + }, + { + "epoch": 4.797122302158273, + "grad_norm": 0.04061719266311827, + "learning_rate": 4.0066908463772593e-07, + "loss": 0.2803, + "step": 3334 + }, + { + "epoch": 4.798561151079137, + "grad_norm": 0.042636097032954345, + "learning_rate": 3.9501532804500974e-07, + "loss": 0.2904, + "step": 3335 + }, + { + "epoch": 4.8, + "grad_norm": 0.042510369470737885, + "learning_rate": 3.894015469423007e-07, + "loss": 0.2862, + "step": 3336 + }, + { + "epoch": 4.8014388489208635, + "grad_norm": 0.04042658066542344, + "learning_rate": 3.838277469958973e-07, + "loss": 0.2839, + "step": 3337 + }, + { + "epoch": 4.802877697841726, + "grad_norm": 0.04304923392502034, + "learning_rate": 3.7829393383174375e-07, + "loss": 0.2915, + "step": 3338 + }, + { + "epoch": 4.80431654676259, + "grad_norm": 0.04140898979796202, + "learning_rate": 3.7280011303542084e-07, + "loss": 0.2866, + "step": 3339 + }, + { + "epoch": 4.805755395683454, + "grad_norm": 0.041749156567567884, + "learning_rate": 3.673462901521463e-07, + "loss": 0.2822, + "step": 3340 + }, + { + "epoch": 4.807194244604316, + "grad_norm": 0.041097810617877185, + "learning_rate": 3.619324706867655e-07, + "loss": 0.2845, + "step": 3341 + }, + { + "epoch": 4.80863309352518, + "grad_norm": 0.04357761380107742, + "learning_rate": 3.5655866010373853e-07, + "loss": 0.2825, + "step": 3342 + }, + { + "epoch": 4.810071942446044, + "grad_norm": 0.04202729999373872, + "learning_rate": 3.5122486382715314e-07, + "loss": 0.287, + "step": 3343 + }, + { + "epoch": 4.811510791366906, + "grad_norm": 0.04187911764346768, + "learning_rate": 3.459310872407029e-07, + "loss": 0.2838, + "step": 3344 + }, + { + "epoch": 4.81294964028777, + "grad_norm": 0.04103324603536389, + "learning_rate": 3.4067733568768246e-07, + "loss": 0.2916, + "step": 3345 + }, + { + "epoch": 4.814388489208633, + "grad_norm": 0.041161310147595144, + "learning_rate": 3.3546361447099664e-07, + "loss": 0.2868, + "step": 3346 + }, + { + "epoch": 4.8158273381294965, + "grad_norm": 0.04100603431956255, + "learning_rate": 3.3028992885314247e-07, + "loss": 0.2885, + "step": 3347 + }, + { + "epoch": 4.817266187050359, + "grad_norm": 0.04072316462824549, + "learning_rate": 3.2515628405620503e-07, + "loss": 0.2834, + "step": 3348 + }, + { + "epoch": 4.818705035971223, + "grad_norm": 0.041417910096654374, + "learning_rate": 3.2006268526184824e-07, + "loss": 0.2861, + "step": 3349 + }, + { + "epoch": 4.820143884892087, + "grad_norm": 0.039557375878430695, + "learning_rate": 3.150091376113329e-07, + "loss": 0.2802, + "step": 3350 + }, + { + "epoch": 4.821582733812949, + "grad_norm": 0.04192453172556301, + "learning_rate": 3.0999564620547207e-07, + "loss": 0.2848, + "step": 3351 + }, + { + "epoch": 4.823021582733813, + "grad_norm": 0.04212047586899518, + "learning_rate": 3.0502221610465786e-07, + "loss": 0.2944, + "step": 3352 + }, + { + "epoch": 4.824460431654677, + "grad_norm": 0.04134459177446905, + "learning_rate": 3.0008885232886144e-07, + "loss": 0.2884, + "step": 3353 + }, + { + "epoch": 4.825899280575539, + "grad_norm": 0.04105737532473463, + "learning_rate": 2.95195559857584e-07, + "loss": 0.2785, + "step": 3354 + }, + { + "epoch": 4.827338129496403, + "grad_norm": 0.04026592035207497, + "learning_rate": 2.9034234362989687e-07, + "loss": 0.2819, + "step": 3355 + }, + { + "epoch": 4.828776978417266, + "grad_norm": 0.042417527147535355, + "learning_rate": 2.855292085444239e-07, + "loss": 0.2875, + "step": 3356 + }, + { + "epoch": 4.8302158273381295, + "grad_norm": 0.04173478745196841, + "learning_rate": 2.8075615945932333e-07, + "loss": 0.2861, + "step": 3357 + }, + { + "epoch": 4.831654676258993, + "grad_norm": 0.04172308960992333, + "learning_rate": 2.7602320119229254e-07, + "loss": 0.2822, + "step": 3358 + }, + { + "epoch": 4.833093525179856, + "grad_norm": 0.04299243006807228, + "learning_rate": 2.7133033852057675e-07, + "loss": 0.2888, + "step": 3359 + }, + { + "epoch": 4.83453237410072, + "grad_norm": 0.04150364052335503, + "learning_rate": 2.666775761809337e-07, + "loss": 0.2869, + "step": 3360 + }, + { + "epoch": 4.835971223021582, + "grad_norm": 0.04033984987840341, + "learning_rate": 2.620649188696511e-07, + "loss": 0.2857, + "step": 3361 + }, + { + "epoch": 4.837410071942446, + "grad_norm": 0.040675359063735185, + "learning_rate": 2.574923712425426e-07, + "loss": 0.2923, + "step": 3362 + }, + { + "epoch": 4.83884892086331, + "grad_norm": 0.04233903725882189, + "learning_rate": 2.52959937914925e-07, + "loss": 0.2911, + "step": 3363 + }, + { + "epoch": 4.840287769784172, + "grad_norm": 0.04076547458038119, + "learning_rate": 2.484676234616412e-07, + "loss": 0.2899, + "step": 3364 + }, + { + "epoch": 4.841726618705036, + "grad_norm": 0.041433913736535356, + "learning_rate": 2.440154324170285e-07, + "loss": 0.2884, + "step": 3365 + }, + { + "epoch": 4.8431654676259, + "grad_norm": 0.041269377675926526, + "learning_rate": 2.3960336927492333e-07, + "loss": 0.287, + "step": 3366 + }, + { + "epoch": 4.8446043165467625, + "grad_norm": 0.041061994988089365, + "learning_rate": 2.3523143848867003e-07, + "loss": 0.2827, + "step": 3367 + }, + { + "epoch": 4.846043165467626, + "grad_norm": 0.04187010556305427, + "learning_rate": 2.3089964447109425e-07, + "loss": 0.2809, + "step": 3368 + }, + { + "epoch": 4.847482014388489, + "grad_norm": 0.04303825153494671, + "learning_rate": 2.2660799159451629e-07, + "loss": 0.2815, + "step": 3369 + }, + { + "epoch": 4.848920863309353, + "grad_norm": 0.041983880590083476, + "learning_rate": 2.2235648419073773e-07, + "loss": 0.2796, + "step": 3370 + }, + { + "epoch": 4.850359712230215, + "grad_norm": 0.04145071337212364, + "learning_rate": 2.1814512655103703e-07, + "loss": 0.2846, + "step": 3371 + }, + { + "epoch": 4.851798561151079, + "grad_norm": 0.04035087733302524, + "learning_rate": 2.1397392292617392e-07, + "loss": 0.2809, + "step": 3372 + }, + { + "epoch": 4.853237410071943, + "grad_norm": 0.04175320728399013, + "learning_rate": 2.0984287752636722e-07, + "loss": 0.2871, + "step": 3373 + }, + { + "epoch": 4.854676258992805, + "grad_norm": 0.040747681149666495, + "learning_rate": 2.0575199452131268e-07, + "loss": 0.2835, + "step": 3374 + }, + { + "epoch": 4.856115107913669, + "grad_norm": 0.04010326090661748, + "learning_rate": 2.017012780401606e-07, + "loss": 0.2862, + "step": 3375 + }, + { + "epoch": 4.857553956834533, + "grad_norm": 0.040912678927391144, + "learning_rate": 1.9769073217152933e-07, + "loss": 0.2812, + "step": 3376 + }, + { + "epoch": 4.8589928057553955, + "grad_norm": 0.040760288220981, + "learning_rate": 1.9372036096347414e-07, + "loss": 0.2959, + "step": 3377 + }, + { + "epoch": 4.860431654676259, + "grad_norm": 0.041949715345728174, + "learning_rate": 1.8979016842350928e-07, + "loss": 0.2813, + "step": 3378 + }, + { + "epoch": 4.861870503597122, + "grad_norm": 0.04341096348456281, + "learning_rate": 1.8590015851860376e-07, + "loss": 0.2807, + "step": 3379 + }, + { + "epoch": 4.863309352517986, + "grad_norm": 0.043875483322259254, + "learning_rate": 1.8205033517515015e-07, + "loss": 0.2886, + "step": 3380 + }, + { + "epoch": 4.864748201438849, + "grad_norm": 0.04046833701881062, + "learning_rate": 1.7824070227899115e-07, + "loss": 0.2873, + "step": 3381 + }, + { + "epoch": 4.866187050359712, + "grad_norm": 0.04098147062761545, + "learning_rate": 1.7447126367539313e-07, + "loss": 0.2895, + "step": 3382 + }, + { + "epoch": 4.867625899280576, + "grad_norm": 0.04041993174822328, + "learning_rate": 1.7074202316906374e-07, + "loss": 0.2901, + "step": 3383 + }, + { + "epoch": 4.869064748201438, + "grad_norm": 0.040792225360416115, + "learning_rate": 1.6705298452412978e-07, + "loss": 0.2842, + "step": 3384 + }, + { + "epoch": 4.870503597122302, + "grad_norm": 0.04172365959670488, + "learning_rate": 1.6340415146414157e-07, + "loss": 0.2865, + "step": 3385 + }, + { + "epoch": 4.871942446043166, + "grad_norm": 0.04127806179352284, + "learning_rate": 1.597955276720642e-07, + "loss": 0.2777, + "step": 3386 + }, + { + "epoch": 4.8733812949640285, + "grad_norm": 0.041416591174507904, + "learning_rate": 1.562271167902818e-07, + "loss": 0.2827, + "step": 3387 + }, + { + "epoch": 4.874820143884892, + "grad_norm": 0.040091316222597644, + "learning_rate": 1.526989224205888e-07, + "loss": 0.2871, + "step": 3388 + }, + { + "epoch": 4.876258992805756, + "grad_norm": 0.04107944987078495, + "learning_rate": 1.4921094812418103e-07, + "loss": 0.2883, + "step": 3389 + }, + { + "epoch": 4.877697841726619, + "grad_norm": 0.041313202214486346, + "learning_rate": 1.457631974216689e-07, + "loss": 0.2885, + "step": 3390 + }, + { + "epoch": 4.879136690647482, + "grad_norm": 0.04079308145614227, + "learning_rate": 1.4235567379305536e-07, + "loss": 0.2856, + "step": 3391 + }, + { + "epoch": 4.880575539568345, + "grad_norm": 0.0415866997071899, + "learning_rate": 1.389883806777359e-07, + "loss": 0.2879, + "step": 3392 + }, + { + "epoch": 4.882014388489209, + "grad_norm": 0.04234944058835651, + "learning_rate": 1.356613214745117e-07, + "loss": 0.2876, + "step": 3393 + }, + { + "epoch": 4.883453237410072, + "grad_norm": 0.041253715501214684, + "learning_rate": 1.3237449954156767e-07, + "loss": 0.2869, + "step": 3394 + }, + { + "epoch": 4.884892086330935, + "grad_norm": 0.04213450458140177, + "learning_rate": 1.2912791819646774e-07, + "loss": 0.2905, + "step": 3395 + }, + { + "epoch": 4.886330935251799, + "grad_norm": 0.04155949050167922, + "learning_rate": 1.2592158071616844e-07, + "loss": 0.2832, + "step": 3396 + }, + { + "epoch": 4.8877697841726615, + "grad_norm": 0.041056507695268085, + "learning_rate": 1.2275549033700097e-07, + "loss": 0.2785, + "step": 3397 + }, + { + "epoch": 4.889208633093525, + "grad_norm": 0.04080476316594233, + "learning_rate": 1.1962965025467564e-07, + "loss": 0.2853, + "step": 3398 + }, + { + "epoch": 4.890647482014389, + "grad_norm": 0.0416474307419702, + "learning_rate": 1.1654406362427762e-07, + "loss": 0.2871, + "step": 3399 + }, + { + "epoch": 4.892086330935252, + "grad_norm": 0.04127363787913204, + "learning_rate": 1.1349873356025332e-07, + "loss": 0.286, + "step": 3400 + }, + { + "epoch": 4.893525179856115, + "grad_norm": 0.041206338684211516, + "learning_rate": 1.1049366313642395e-07, + "loss": 0.2896, + "step": 3401 + }, + { + "epoch": 4.894964028776979, + "grad_norm": 0.03976419154372853, + "learning_rate": 1.0752885538598102e-07, + "loss": 0.2836, + "step": 3402 + }, + { + "epoch": 4.896402877697842, + "grad_norm": 0.04048593648075003, + "learning_rate": 1.0460431330145515e-07, + "loss": 0.2758, + "step": 3403 + }, + { + "epoch": 4.897841726618705, + "grad_norm": 0.09191720051199531, + "learning_rate": 1.0172003983475176e-07, + "loss": 0.2869, + "step": 3404 + }, + { + "epoch": 4.899280575539568, + "grad_norm": 0.04114152568043639, + "learning_rate": 9.887603789712875e-08, + "loss": 0.278, + "step": 3405 + }, + { + "epoch": 4.900719424460432, + "grad_norm": 0.04285267729633756, + "learning_rate": 9.607231035919651e-08, + "loss": 0.2994, + "step": 3406 + }, + { + "epoch": 4.9021582733812945, + "grad_norm": 0.04083695716288777, + "learning_rate": 9.330886005090467e-08, + "loss": 0.2801, + "step": 3407 + }, + { + "epoch": 4.903597122302158, + "grad_norm": 0.0393357487579918, + "learning_rate": 9.058568976155979e-08, + "loss": 0.2851, + "step": 3408 + }, + { + "epoch": 4.905035971223022, + "grad_norm": 0.040494727324840725, + "learning_rate": 8.790280223980763e-08, + "loss": 0.2884, + "step": 3409 + }, + { + "epoch": 4.906474820143885, + "grad_norm": 0.041520914283993134, + "learning_rate": 8.526020019363313e-08, + "loss": 0.2866, + "step": 3410 + }, + { + "epoch": 4.907913669064748, + "grad_norm": 0.04080158727412204, + "learning_rate": 8.265788629036043e-08, + "loss": 0.2807, + "step": 3411 + }, + { + "epoch": 4.909352517985612, + "grad_norm": 0.03972572760435433, + "learning_rate": 8.009586315664842e-08, + "loss": 0.2847, + "step": 3412 + }, + { + "epoch": 4.910791366906475, + "grad_norm": 0.04125019779174967, + "learning_rate": 7.757413337848629e-08, + "loss": 0.2875, + "step": 3413 + }, + { + "epoch": 4.912230215827338, + "grad_norm": 0.04035270128054002, + "learning_rate": 7.509269950119358e-08, + "loss": 0.2852, + "step": 3414 + }, + { + "epoch": 4.913669064748201, + "grad_norm": 0.04091786942262446, + "learning_rate": 7.265156402942452e-08, + "loss": 0.2889, + "step": 3415 + }, + { + "epoch": 4.915107913669065, + "grad_norm": 0.04040951017184305, + "learning_rate": 7.025072942714595e-08, + "loss": 0.2865, + "step": 3416 + }, + { + "epoch": 4.916546762589928, + "grad_norm": 0.040690841064591504, + "learning_rate": 6.789019811765052e-08, + "loss": 0.2852, + "step": 3417 + }, + { + "epoch": 4.917985611510791, + "grad_norm": 0.04147303639496497, + "learning_rate": 6.556997248355679e-08, + "loss": 0.2905, + "step": 3418 + }, + { + "epoch": 4.919424460431655, + "grad_norm": 0.04404809316123878, + "learning_rate": 6.329005486679584e-08, + "loss": 0.2929, + "step": 3419 + }, + { + "epoch": 4.920863309352518, + "grad_norm": 0.04059398598689888, + "learning_rate": 6.105044756861134e-08, + "loss": 0.2824, + "step": 3420 + }, + { + "epoch": 4.922302158273381, + "grad_norm": 0.04216826638341303, + "learning_rate": 5.8851152849563886e-08, + "loss": 0.2819, + "step": 3421 + }, + { + "epoch": 4.923741007194245, + "grad_norm": 0.0415649921570707, + "learning_rate": 5.669217292952223e-08, + "loss": 0.2816, + "step": 3422 + }, + { + "epoch": 4.925179856115108, + "grad_norm": 0.04255796731782151, + "learning_rate": 5.4573509987663196e-08, + "loss": 0.2847, + "step": 3423 + }, + { + "epoch": 4.926618705035971, + "grad_norm": 0.03971819662193374, + "learning_rate": 5.2495166162471747e-08, + "loss": 0.2772, + "step": 3424 + }, + { + "epoch": 4.928057553956835, + "grad_norm": 0.04116670684744748, + "learning_rate": 5.045714355173203e-08, + "loss": 0.2943, + "step": 3425 + }, + { + "epoch": 4.929496402877698, + "grad_norm": 0.04207188074099609, + "learning_rate": 4.845944421253634e-08, + "loss": 0.2915, + "step": 3426 + }, + { + "epoch": 4.930935251798561, + "grad_norm": 0.040857963264732865, + "learning_rate": 4.650207016126729e-08, + "loss": 0.2871, + "step": 3427 + }, + { + "epoch": 4.932374100719424, + "grad_norm": 0.040268768737739886, + "learning_rate": 4.458502337361115e-08, + "loss": 0.2837, + "step": 3428 + }, + { + "epoch": 4.933812949640288, + "grad_norm": 0.07856901734688387, + "learning_rate": 4.270830578455343e-08, + "loss": 0.2801, + "step": 3429 + }, + { + "epoch": 4.935251798561151, + "grad_norm": 0.04017694922339272, + "learning_rate": 4.087191928836554e-08, + "loss": 0.2778, + "step": 3430 + }, + { + "epoch": 4.936690647482014, + "grad_norm": 0.041636434158801784, + "learning_rate": 3.907586573860922e-08, + "loss": 0.2875, + "step": 3431 + }, + { + "epoch": 4.938129496402878, + "grad_norm": 0.04151392621853386, + "learning_rate": 3.7320146948149894e-08, + "loss": 0.2801, + "step": 3432 + }, + { + "epoch": 4.939568345323741, + "grad_norm": 0.04170858249943328, + "learning_rate": 3.560476468912111e-08, + "loss": 0.2859, + "step": 3433 + }, + { + "epoch": 4.941007194244604, + "grad_norm": 0.04144939942650809, + "learning_rate": 3.392972069295564e-08, + "loss": 0.2896, + "step": 3434 + }, + { + "epoch": 4.942446043165468, + "grad_norm": 0.04072381817838615, + "learning_rate": 3.229501665037216e-08, + "loss": 0.2849, + "step": 3435 + }, + { + "epoch": 4.943884892086331, + "grad_norm": 0.03980670385989213, + "learning_rate": 3.0700654211361925e-08, + "loss": 0.2881, + "step": 3436 + }, + { + "epoch": 4.945323741007194, + "grad_norm": 0.040739092121564686, + "learning_rate": 2.9146634985206535e-08, + "loss": 0.2806, + "step": 3437 + }, + { + "epoch": 4.946762589928057, + "grad_norm": 0.04188578120567552, + "learning_rate": 2.7632960540460162e-08, + "loss": 0.2904, + "step": 3438 + }, + { + "epoch": 4.948201438848921, + "grad_norm": 0.04031944356055068, + "learning_rate": 2.6159632404958447e-08, + "loss": 0.2948, + "step": 3439 + }, + { + "epoch": 4.9496402877697845, + "grad_norm": 0.03966164742709178, + "learning_rate": 2.472665206581404e-08, + "loss": 0.2852, + "step": 3440 + }, + { + "epoch": 4.951079136690647, + "grad_norm": 0.04129213051444172, + "learning_rate": 2.3334020969407733e-08, + "loss": 0.2842, + "step": 3441 + }, + { + "epoch": 4.952517985611511, + "grad_norm": 0.04107420396143875, + "learning_rate": 2.1981740521406226e-08, + "loss": 0.2846, + "step": 3442 + }, + { + "epoch": 4.953956834532374, + "grad_norm": 0.041070165648897115, + "learning_rate": 2.0669812086735464e-08, + "loss": 0.2912, + "step": 3443 + }, + { + "epoch": 4.955395683453237, + "grad_norm": 0.040347565351461646, + "learning_rate": 1.9398236989598418e-08, + "loss": 0.2825, + "step": 3444 + }, + { + "epoch": 4.956834532374101, + "grad_norm": 0.039675938953722636, + "learning_rate": 1.8167016513470636e-08, + "loss": 0.2843, + "step": 3445 + }, + { + "epoch": 4.958273381294964, + "grad_norm": 0.04016568107289864, + "learning_rate": 1.697615190107804e-08, + "loss": 0.2842, + "step": 3446 + }, + { + "epoch": 4.959712230215827, + "grad_norm": 0.040123864608462276, + "learning_rate": 1.582564435444134e-08, + "loss": 0.2822, + "step": 3447 + }, + { + "epoch": 4.961151079136691, + "grad_norm": 0.04190621407744655, + "learning_rate": 1.4715495034818284e-08, + "loss": 0.2897, + "step": 3448 + }, + { + "epoch": 4.962589928057554, + "grad_norm": 0.0410268233631543, + "learning_rate": 1.3645705062748094e-08, + "loss": 0.2897, + "step": 3449 + }, + { + "epoch": 4.9640287769784175, + "grad_norm": 0.04235511323058934, + "learning_rate": 1.2616275518033683e-08, + "loss": 0.2884, + "step": 3450 + }, + { + "epoch": 4.96546762589928, + "grad_norm": 0.03967760133464778, + "learning_rate": 1.1627207439728339e-08, + "loss": 0.2842, + "step": 3451 + }, + { + "epoch": 4.966906474820144, + "grad_norm": 0.04083234181194213, + "learning_rate": 1.0678501826153486e-08, + "loss": 0.288, + "step": 3452 + }, + { + "epoch": 4.968345323741008, + "grad_norm": 0.04245635744108195, + "learning_rate": 9.770159634894249e-09, + "loss": 0.2869, + "step": 3453 + }, + { + "epoch": 4.96978417266187, + "grad_norm": 0.0413453937775347, + "learning_rate": 8.902181782786124e-09, + "loss": 0.2834, + "step": 3454 + }, + { + "epoch": 4.971223021582734, + "grad_norm": 0.04127801612306726, + "learning_rate": 8.074569145928301e-09, + "loss": 0.2856, + "step": 3455 + }, + { + "epoch": 4.972661870503597, + "grad_norm": 0.040774570467288315, + "learning_rate": 7.287322559679233e-09, + "loss": 0.2851, + "step": 3456 + }, + { + "epoch": 4.97410071942446, + "grad_norm": 0.041267924692496766, + "learning_rate": 6.5404428186433e-09, + "loss": 0.2932, + "step": 3457 + }, + { + "epoch": 4.975539568345324, + "grad_norm": 0.04191204714568303, + "learning_rate": 5.833930676693023e-09, + "loss": 0.2872, + "step": 3458 + }, + { + "epoch": 4.976978417266187, + "grad_norm": 0.04124089367530566, + "learning_rate": 5.167786846946854e-09, + "loss": 0.2859, + "step": 3459 + }, + { + "epoch": 4.9784172661870505, + "grad_norm": 0.0400216176741452, + "learning_rate": 4.542012001778062e-09, + "loss": 0.286, + "step": 3460 + }, + { + "epoch": 4.979856115107914, + "grad_norm": 0.04047404384354867, + "learning_rate": 3.956606772823613e-09, + "loss": 0.2883, + "step": 3461 + }, + { + "epoch": 4.981294964028777, + "grad_norm": 0.040197633392815, + "learning_rate": 3.4115717509619616e-09, + "loss": 0.285, + "step": 3462 + }, + { + "epoch": 4.982733812949641, + "grad_norm": 0.04129146054521192, + "learning_rate": 2.9069074863219414e-09, + "loss": 0.2841, + "step": 3463 + }, + { + "epoch": 4.984172661870503, + "grad_norm": 0.04042323219819034, + "learning_rate": 2.4426144882916392e-09, + "loss": 0.2853, + "step": 3464 + }, + { + "epoch": 4.985611510791367, + "grad_norm": 0.04112481051522228, + "learning_rate": 2.018693225509516e-09, + "loss": 0.2896, + "step": 3465 + }, + { + "epoch": 4.98705035971223, + "grad_norm": 0.04102996245279678, + "learning_rate": 1.6351441258644073e-09, + "loss": 0.2834, + "step": 3466 + }, + { + "epoch": 4.988489208633093, + "grad_norm": 0.042034830155993054, + "learning_rate": 1.2919675764910823e-09, + "loss": 0.2753, + "step": 3467 + }, + { + "epoch": 4.989928057553957, + "grad_norm": 0.04012243583380766, + "learning_rate": 9.89163923770242e-10, + "loss": 0.2795, + "step": 3468 + }, + { + "epoch": 4.99136690647482, + "grad_norm": 0.04165166345300335, + "learning_rate": 7.26733473350727e-10, + "loss": 0.2903, + "step": 3469 + }, + { + "epoch": 4.9928057553956835, + "grad_norm": 0.04109840813348842, + "learning_rate": 5.046764901095457e-10, + "loss": 0.2836, + "step": 3470 + }, + { + "epoch": 4.994244604316547, + "grad_norm": 0.042065480515719435, + "learning_rate": 3.2299319817852283e-10, + "loss": 0.2871, + "step": 3471 + }, + { + "epoch": 4.99568345323741, + "grad_norm": 0.04048558566819859, + "learning_rate": 1.816837809487382e-10, + "loss": 0.2906, + "step": 3472 + }, + { + "epoch": 4.997122302158274, + "grad_norm": 0.040932974287600316, + "learning_rate": 8.074838104832338e-11, + "loss": 0.2845, + "step": 3473 + }, + { + "epoch": 4.998561151079136, + "grad_norm": 0.04198094321209325, + "learning_rate": 2.0187100355784085e-11, + "loss": 0.2817, + "step": 3474 + }, + { + "epoch": 5.0, + "grad_norm": 0.040263433455370916, + "learning_rate": 0.0, + "loss": 0.277, + "step": 3475 + }, + { + "epoch": 5.0, + "step": 3475, + "total_flos": 1.957564489138176e+17, + "train_loss": 0.0, + "train_runtime": 5.6508, + "train_samples_per_second": 314804.431, + "train_steps_per_second": 614.958 + } + ], + "logging_steps": 1, + "max_steps": 3475, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.957564489138176e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}