{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.993958920660491, "eval_steps": 500, "global_step": 3100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016109544905356424, "grad_norm": 6.032404008426598, "learning_rate": 1.2903225806451614e-07, "loss": 0.8102, "step": 1 }, { "epoch": 0.003221908981071285, "grad_norm": 6.091224660092794, "learning_rate": 2.580645161290323e-07, "loss": 0.8201, "step": 2 }, { "epoch": 0.004832863471606927, "grad_norm": 6.242173191014313, "learning_rate": 3.870967741935484e-07, "loss": 0.8364, "step": 3 }, { "epoch": 0.00644381796214257, "grad_norm": 5.9113732582345815, "learning_rate": 5.161290322580646e-07, "loss": 0.8033, "step": 4 }, { "epoch": 0.008054772452678213, "grad_norm": 5.836423086831506, "learning_rate": 6.451612903225807e-07, "loss": 0.7877, "step": 5 }, { "epoch": 0.009665726943213854, "grad_norm": 5.755126086223614, "learning_rate": 7.741935483870968e-07, "loss": 0.7791, "step": 6 }, { "epoch": 0.011276681433749497, "grad_norm": 5.649783806242555, "learning_rate": 9.032258064516129e-07, "loss": 0.7931, "step": 7 }, { "epoch": 0.01288763592428514, "grad_norm": 5.479452292994705, "learning_rate": 1.0322580645161291e-06, "loss": 0.7854, "step": 8 }, { "epoch": 0.01449859041482078, "grad_norm": 5.366601689873154, "learning_rate": 1.1612903225806454e-06, "loss": 0.7727, "step": 9 }, { "epoch": 0.016109544905356425, "grad_norm": 4.434943674510098, "learning_rate": 1.2903225806451614e-06, "loss": 0.7384, "step": 10 }, { "epoch": 0.017720499395892066, "grad_norm": 4.293198637826971, "learning_rate": 1.4193548387096776e-06, "loss": 0.7394, "step": 11 }, { "epoch": 0.019331453886427707, "grad_norm": 4.083095267873627, "learning_rate": 1.5483870967741937e-06, "loss": 0.74, "step": 12 }, { "epoch": 0.020942408376963352, "grad_norm": 2.4257793361125777, "learning_rate": 1.67741935483871e-06, "loss": 0.6943, "step": 13 }, { "epoch": 0.022553362867498993, "grad_norm": 2.404485912753832, "learning_rate": 1.8064516129032258e-06, "loss": 0.7221, "step": 14 }, { "epoch": 0.024164317358034634, "grad_norm": 2.1157005039902415, "learning_rate": 1.935483870967742e-06, "loss": 0.7026, "step": 15 }, { "epoch": 0.02577527184857028, "grad_norm": 1.9329284071198143, "learning_rate": 2.0645161290322582e-06, "loss": 0.6697, "step": 16 }, { "epoch": 0.02738622633910592, "grad_norm": 1.856103169273838, "learning_rate": 2.1935483870967745e-06, "loss": 0.672, "step": 17 }, { "epoch": 0.02899718082964156, "grad_norm": 3.1117388382933022, "learning_rate": 2.3225806451612907e-06, "loss": 0.6815, "step": 18 }, { "epoch": 0.030608135320177206, "grad_norm": 3.2415743455441484, "learning_rate": 2.4516129032258066e-06, "loss": 0.6679, "step": 19 }, { "epoch": 0.03221908981071285, "grad_norm": 3.20677352745239, "learning_rate": 2.580645161290323e-06, "loss": 0.6551, "step": 20 }, { "epoch": 0.03383004430124849, "grad_norm": 2.823762323190898, "learning_rate": 2.709677419354839e-06, "loss": 0.6319, "step": 21 }, { "epoch": 0.03544099879178413, "grad_norm": 2.6726511366702796, "learning_rate": 2.8387096774193553e-06, "loss": 0.6458, "step": 22 }, { "epoch": 0.03705195328231978, "grad_norm": 2.2046688512156916, "learning_rate": 2.967741935483871e-06, "loss": 0.6299, "step": 23 }, { "epoch": 0.038662907772855415, "grad_norm": 1.6050271766621365, "learning_rate": 3.0967741935483874e-06, "loss": 0.6186, "step": 24 }, { "epoch": 0.04027386226339106, "grad_norm": 1.3573669680309588, "learning_rate": 3.225806451612903e-06, "loss": 0.5967, "step": 25 }, { "epoch": 0.041884816753926704, "grad_norm": 1.1627320193357804, "learning_rate": 3.35483870967742e-06, "loss": 0.6048, "step": 26 }, { "epoch": 0.04349577124446234, "grad_norm": 0.9897690073046419, "learning_rate": 3.4838709677419357e-06, "loss": 0.5834, "step": 27 }, { "epoch": 0.045106725734997986, "grad_norm": 0.9903767157665542, "learning_rate": 3.6129032258064515e-06, "loss": 0.5815, "step": 28 }, { "epoch": 0.04671768022553363, "grad_norm": 1.1368680688613164, "learning_rate": 3.741935483870968e-06, "loss": 0.5707, "step": 29 }, { "epoch": 0.04832863471606927, "grad_norm": 1.0426684828392754, "learning_rate": 3.870967741935484e-06, "loss": 0.5461, "step": 30 }, { "epoch": 0.04993958920660491, "grad_norm": 0.9337404457709985, "learning_rate": 4.000000000000001e-06, "loss": 0.5549, "step": 31 }, { "epoch": 0.05155054369714056, "grad_norm": 0.7242627578457423, "learning_rate": 4.1290322580645165e-06, "loss": 0.5507, "step": 32 }, { "epoch": 0.053161498187676196, "grad_norm": 0.6685301803699689, "learning_rate": 4.258064516129032e-06, "loss": 0.5403, "step": 33 }, { "epoch": 0.05477245267821184, "grad_norm": 0.6968371169058664, "learning_rate": 4.387096774193549e-06, "loss": 0.5268, "step": 34 }, { "epoch": 0.056383407168747485, "grad_norm": 0.8728817789992291, "learning_rate": 4.516129032258065e-06, "loss": 0.5347, "step": 35 }, { "epoch": 0.05799436165928312, "grad_norm": 0.8602530490558115, "learning_rate": 4.6451612903225815e-06, "loss": 0.5462, "step": 36 }, { "epoch": 0.05960531614981877, "grad_norm": 0.6378392806884997, "learning_rate": 4.774193548387097e-06, "loss": 0.5199, "step": 37 }, { "epoch": 0.06121627064035441, "grad_norm": 0.5998483205329147, "learning_rate": 4.903225806451613e-06, "loss": 0.5378, "step": 38 }, { "epoch": 0.06282722513089005, "grad_norm": 0.6059446636403915, "learning_rate": 5.032258064516129e-06, "loss": 0.5161, "step": 39 }, { "epoch": 0.0644381796214257, "grad_norm": 0.6817404761352738, "learning_rate": 5.161290322580646e-06, "loss": 0.5194, "step": 40 }, { "epoch": 0.06604913411196134, "grad_norm": 0.6331844245949004, "learning_rate": 5.290322580645162e-06, "loss": 0.5384, "step": 41 }, { "epoch": 0.06766008860249698, "grad_norm": 0.4948231352853153, "learning_rate": 5.419354838709678e-06, "loss": 0.5169, "step": 42 }, { "epoch": 0.06927104309303263, "grad_norm": 0.5035080428321753, "learning_rate": 5.548387096774194e-06, "loss": 0.5135, "step": 43 }, { "epoch": 0.07088199758356827, "grad_norm": 0.536385019533745, "learning_rate": 5.677419354838711e-06, "loss": 0.4819, "step": 44 }, { "epoch": 0.0724929520741039, "grad_norm": 0.5872680433979439, "learning_rate": 5.806451612903226e-06, "loss": 0.5404, "step": 45 }, { "epoch": 0.07410390656463955, "grad_norm": 0.553950328048959, "learning_rate": 5.935483870967742e-06, "loss": 0.5012, "step": 46 }, { "epoch": 0.07571486105517519, "grad_norm": 0.46649858895580004, "learning_rate": 6.064516129032259e-06, "loss": 0.5084, "step": 47 }, { "epoch": 0.07732581554571083, "grad_norm": 0.49980550410748753, "learning_rate": 6.193548387096775e-06, "loss": 0.4997, "step": 48 }, { "epoch": 0.07893677003624648, "grad_norm": 0.500125771445445, "learning_rate": 6.3225806451612906e-06, "loss": 0.5211, "step": 49 }, { "epoch": 0.08054772452678212, "grad_norm": 0.43794258077350834, "learning_rate": 6.451612903225806e-06, "loss": 0.5019, "step": 50 }, { "epoch": 0.08215867901731776, "grad_norm": 0.41542158356072195, "learning_rate": 6.580645161290323e-06, "loss": 0.5, "step": 51 }, { "epoch": 0.08376963350785341, "grad_norm": 0.4398106247413648, "learning_rate": 6.70967741935484e-06, "loss": 0.482, "step": 52 }, { "epoch": 0.08538058799838905, "grad_norm": 0.4658104909549922, "learning_rate": 6.838709677419355e-06, "loss": 0.51, "step": 53 }, { "epoch": 0.08699154248892468, "grad_norm": 0.44586970110991897, "learning_rate": 6.967741935483871e-06, "loss": 0.5066, "step": 54 }, { "epoch": 0.08860249697946034, "grad_norm": 0.36952288687065493, "learning_rate": 7.096774193548388e-06, "loss": 0.4868, "step": 55 }, { "epoch": 0.09021345146999597, "grad_norm": 0.37449257402238395, "learning_rate": 7.225806451612903e-06, "loss": 0.4874, "step": 56 }, { "epoch": 0.09182440596053161, "grad_norm": 0.43146034558081636, "learning_rate": 7.35483870967742e-06, "loss": 0.4961, "step": 57 }, { "epoch": 0.09343536045106726, "grad_norm": 0.33924473498486496, "learning_rate": 7.483870967741936e-06, "loss": 0.4659, "step": 58 }, { "epoch": 0.0950463149416029, "grad_norm": 0.3650384114275885, "learning_rate": 7.612903225806451e-06, "loss": 0.4922, "step": 59 }, { "epoch": 0.09665726943213854, "grad_norm": 0.42392175050797987, "learning_rate": 7.741935483870968e-06, "loss": 0.4825, "step": 60 }, { "epoch": 0.09826822392267419, "grad_norm": 0.41566274005192855, "learning_rate": 7.870967741935484e-06, "loss": 0.496, "step": 61 }, { "epoch": 0.09987917841320983, "grad_norm": 0.3532541757044527, "learning_rate": 8.000000000000001e-06, "loss": 0.4836, "step": 62 }, { "epoch": 0.10149013290374546, "grad_norm": 0.38676034561653744, "learning_rate": 8.129032258064517e-06, "loss": 0.4617, "step": 63 }, { "epoch": 0.10310108739428112, "grad_norm": 0.39235511829919395, "learning_rate": 8.258064516129033e-06, "loss": 0.4699, "step": 64 }, { "epoch": 0.10471204188481675, "grad_norm": 0.42853549136039776, "learning_rate": 8.387096774193549e-06, "loss": 0.4967, "step": 65 }, { "epoch": 0.10632299637535239, "grad_norm": 0.4006077949721882, "learning_rate": 8.516129032258065e-06, "loss": 0.4745, "step": 66 }, { "epoch": 0.10793395086588804, "grad_norm": 0.365858425655604, "learning_rate": 8.64516129032258e-06, "loss": 0.486, "step": 67 }, { "epoch": 0.10954490535642368, "grad_norm": 0.4029132008489552, "learning_rate": 8.774193548387098e-06, "loss": 0.4578, "step": 68 }, { "epoch": 0.11115585984695932, "grad_norm": 0.4416121827528027, "learning_rate": 8.903225806451614e-06, "loss": 0.4838, "step": 69 }, { "epoch": 0.11276681433749497, "grad_norm": 0.37290691713598234, "learning_rate": 9.03225806451613e-06, "loss": 0.4626, "step": 70 }, { "epoch": 0.11437776882803061, "grad_norm": 0.39437106054528326, "learning_rate": 9.161290322580645e-06, "loss": 0.4606, "step": 71 }, { "epoch": 0.11598872331856624, "grad_norm": 0.46230012352472577, "learning_rate": 9.290322580645163e-06, "loss": 0.4843, "step": 72 }, { "epoch": 0.1175996778091019, "grad_norm": 0.36168746091549053, "learning_rate": 9.419354838709677e-06, "loss": 0.4776, "step": 73 }, { "epoch": 0.11921063229963753, "grad_norm": 0.3887609362159801, "learning_rate": 9.548387096774195e-06, "loss": 0.4913, "step": 74 }, { "epoch": 0.12082158679017317, "grad_norm": 0.4212414686324945, "learning_rate": 9.67741935483871e-06, "loss": 0.472, "step": 75 }, { "epoch": 0.12243254128070882, "grad_norm": 0.3736240767300341, "learning_rate": 9.806451612903226e-06, "loss": 0.463, "step": 76 }, { "epoch": 0.12404349577124446, "grad_norm": 0.3897026748050938, "learning_rate": 9.935483870967742e-06, "loss": 0.4869, "step": 77 }, { "epoch": 0.1256544502617801, "grad_norm": 0.35819030543571656, "learning_rate": 1.0064516129032258e-05, "loss": 0.4821, "step": 78 }, { "epoch": 0.12726540475231574, "grad_norm": 0.419675735054713, "learning_rate": 1.0193548387096774e-05, "loss": 0.4572, "step": 79 }, { "epoch": 0.1288763592428514, "grad_norm": 0.45017621890591875, "learning_rate": 1.0322580645161291e-05, "loss": 0.4519, "step": 80 }, { "epoch": 0.13048731373338704, "grad_norm": 0.38873544789688036, "learning_rate": 1.0451612903225807e-05, "loss": 0.4704, "step": 81 }, { "epoch": 0.13209826822392268, "grad_norm": 0.4590008765481535, "learning_rate": 1.0580645161290325e-05, "loss": 0.4624, "step": 82 }, { "epoch": 0.13370922271445831, "grad_norm": 0.4017641572848416, "learning_rate": 1.070967741935484e-05, "loss": 0.4552, "step": 83 }, { "epoch": 0.13532017720499395, "grad_norm": 0.4919488749370493, "learning_rate": 1.0838709677419356e-05, "loss": 0.4824, "step": 84 }, { "epoch": 0.1369311316955296, "grad_norm": 0.46571573865087124, "learning_rate": 1.096774193548387e-05, "loss": 0.4593, "step": 85 }, { "epoch": 0.13854208618606526, "grad_norm": 0.483375616479342, "learning_rate": 1.1096774193548388e-05, "loss": 0.4762, "step": 86 }, { "epoch": 0.1401530406766009, "grad_norm": 0.390666645368303, "learning_rate": 1.1225806451612904e-05, "loss": 0.4543, "step": 87 }, { "epoch": 0.14176399516713653, "grad_norm": 0.5003728860520759, "learning_rate": 1.1354838709677421e-05, "loss": 0.4466, "step": 88 }, { "epoch": 0.14337494965767217, "grad_norm": 0.4761849607513898, "learning_rate": 1.1483870967741937e-05, "loss": 0.4744, "step": 89 }, { "epoch": 0.1449859041482078, "grad_norm": 0.4431212670764865, "learning_rate": 1.1612903225806453e-05, "loss": 0.4785, "step": 90 }, { "epoch": 0.14659685863874344, "grad_norm": 0.4645484282151763, "learning_rate": 1.1741935483870967e-05, "loss": 0.4747, "step": 91 }, { "epoch": 0.1482078131292791, "grad_norm": 0.4591410377370998, "learning_rate": 1.1870967741935484e-05, "loss": 0.4501, "step": 92 }, { "epoch": 0.14981876761981475, "grad_norm": 0.43211703633867105, "learning_rate": 1.2e-05, "loss": 0.4318, "step": 93 }, { "epoch": 0.15142972211035038, "grad_norm": 0.4134987376570425, "learning_rate": 1.2129032258064518e-05, "loss": 0.4554, "step": 94 }, { "epoch": 0.15304067660088602, "grad_norm": 0.40687971876636914, "learning_rate": 1.2258064516129034e-05, "loss": 0.4463, "step": 95 }, { "epoch": 0.15465163109142166, "grad_norm": 0.4821367578390539, "learning_rate": 1.238709677419355e-05, "loss": 0.4684, "step": 96 }, { "epoch": 0.1562625855819573, "grad_norm": 0.4008872991192848, "learning_rate": 1.2516129032258067e-05, "loss": 0.4587, "step": 97 }, { "epoch": 0.15787354007249296, "grad_norm": 0.4709306605581316, "learning_rate": 1.2645161290322581e-05, "loss": 0.4669, "step": 98 }, { "epoch": 0.1594844945630286, "grad_norm": 0.4270863547298053, "learning_rate": 1.2774193548387097e-05, "loss": 0.453, "step": 99 }, { "epoch": 0.16109544905356424, "grad_norm": 0.47904696715631884, "learning_rate": 1.2903225806451613e-05, "loss": 0.4331, "step": 100 }, { "epoch": 0.16270640354409988, "grad_norm": 0.5646915331058875, "learning_rate": 1.303225806451613e-05, "loss": 0.4647, "step": 101 }, { "epoch": 0.1643173580346355, "grad_norm": 0.421375992229102, "learning_rate": 1.3161290322580646e-05, "loss": 0.4628, "step": 102 }, { "epoch": 0.16592831252517115, "grad_norm": 0.4577925255478774, "learning_rate": 1.3290322580645164e-05, "loss": 0.4432, "step": 103 }, { "epoch": 0.16753926701570682, "grad_norm": 0.5080123906152704, "learning_rate": 1.341935483870968e-05, "loss": 0.4581, "step": 104 }, { "epoch": 0.16915022150624245, "grad_norm": 0.4302757240951249, "learning_rate": 1.3548387096774194e-05, "loss": 0.4252, "step": 105 }, { "epoch": 0.1707611759967781, "grad_norm": 0.543321933222442, "learning_rate": 1.367741935483871e-05, "loss": 0.4556, "step": 106 }, { "epoch": 0.17237213048731373, "grad_norm": 0.4342499733326806, "learning_rate": 1.3806451612903227e-05, "loss": 0.4474, "step": 107 }, { "epoch": 0.17398308497784937, "grad_norm": 0.625382634227691, "learning_rate": 1.3935483870967743e-05, "loss": 0.4262, "step": 108 }, { "epoch": 0.175594039468385, "grad_norm": 0.4889154545842034, "learning_rate": 1.406451612903226e-05, "loss": 0.4633, "step": 109 }, { "epoch": 0.17720499395892067, "grad_norm": 0.468374811877519, "learning_rate": 1.4193548387096776e-05, "loss": 0.4685, "step": 110 }, { "epoch": 0.1788159484494563, "grad_norm": 0.5750906393209809, "learning_rate": 1.4322580645161292e-05, "loss": 0.4521, "step": 111 }, { "epoch": 0.18042690293999195, "grad_norm": 0.43929194658608134, "learning_rate": 1.4451612903225806e-05, "loss": 0.458, "step": 112 }, { "epoch": 0.18203785743052758, "grad_norm": 0.5255857904785238, "learning_rate": 1.4580645161290324e-05, "loss": 0.4354, "step": 113 }, { "epoch": 0.18364881192106322, "grad_norm": 0.5697953246124879, "learning_rate": 1.470967741935484e-05, "loss": 0.4597, "step": 114 }, { "epoch": 0.18525976641159886, "grad_norm": 0.684779610893239, "learning_rate": 1.4838709677419357e-05, "loss": 0.4513, "step": 115 }, { "epoch": 0.18687072090213452, "grad_norm": 0.7663595177962778, "learning_rate": 1.4967741935483873e-05, "loss": 0.4501, "step": 116 }, { "epoch": 0.18848167539267016, "grad_norm": 0.6231547313160818, "learning_rate": 1.5096774193548389e-05, "loss": 0.4654, "step": 117 }, { "epoch": 0.1900926298832058, "grad_norm": 0.762366532637053, "learning_rate": 1.5225806451612903e-05, "loss": 0.4379, "step": 118 }, { "epoch": 0.19170358437374144, "grad_norm": 0.6014548144293846, "learning_rate": 1.535483870967742e-05, "loss": 0.4433, "step": 119 }, { "epoch": 0.19331453886427707, "grad_norm": 0.5976102150815602, "learning_rate": 1.5483870967741936e-05, "loss": 0.4412, "step": 120 }, { "epoch": 0.19492549335481274, "grad_norm": 0.7203469874012064, "learning_rate": 1.5612903225806454e-05, "loss": 0.4746, "step": 121 }, { "epoch": 0.19653644784534838, "grad_norm": 0.7853276912774007, "learning_rate": 1.5741935483870968e-05, "loss": 0.457, "step": 122 }, { "epoch": 0.19814740233588402, "grad_norm": 0.5932744430290895, "learning_rate": 1.5870967741935485e-05, "loss": 0.4576, "step": 123 }, { "epoch": 0.19975835682641965, "grad_norm": 0.7267109233788118, "learning_rate": 1.6000000000000003e-05, "loss": 0.4467, "step": 124 }, { "epoch": 0.2013693113169553, "grad_norm": 0.5406828652336474, "learning_rate": 1.6129032258064517e-05, "loss": 0.4462, "step": 125 }, { "epoch": 0.20298026580749093, "grad_norm": 0.5858021844743821, "learning_rate": 1.6258064516129034e-05, "loss": 0.4626, "step": 126 }, { "epoch": 0.2045912202980266, "grad_norm": 0.5877766270343217, "learning_rate": 1.638709677419355e-05, "loss": 0.455, "step": 127 }, { "epoch": 0.20620217478856223, "grad_norm": 0.47863537726359784, "learning_rate": 1.6516129032258066e-05, "loss": 0.4539, "step": 128 }, { "epoch": 0.20781312927909787, "grad_norm": 0.439070948766706, "learning_rate": 1.6645161290322583e-05, "loss": 0.4443, "step": 129 }, { "epoch": 0.2094240837696335, "grad_norm": 0.5181283081353542, "learning_rate": 1.6774193548387098e-05, "loss": 0.4522, "step": 130 }, { "epoch": 0.21103503826016914, "grad_norm": 0.4577673620554301, "learning_rate": 1.6903225806451615e-05, "loss": 0.479, "step": 131 }, { "epoch": 0.21264599275070478, "grad_norm": 0.4523908387978445, "learning_rate": 1.703225806451613e-05, "loss": 0.4456, "step": 132 }, { "epoch": 0.21425694724124045, "grad_norm": 0.5290727008449524, "learning_rate": 1.7161290322580647e-05, "loss": 0.4221, "step": 133 }, { "epoch": 0.21586790173177609, "grad_norm": 0.4980372169033664, "learning_rate": 1.729032258064516e-05, "loss": 0.4563, "step": 134 }, { "epoch": 0.21747885622231172, "grad_norm": 0.48606495543694916, "learning_rate": 1.741935483870968e-05, "loss": 0.4324, "step": 135 }, { "epoch": 0.21908981071284736, "grad_norm": 0.6957991032575429, "learning_rate": 1.7548387096774196e-05, "loss": 0.4722, "step": 136 }, { "epoch": 0.220700765203383, "grad_norm": 0.4643809416764509, "learning_rate": 1.7677419354838713e-05, "loss": 0.4521, "step": 137 }, { "epoch": 0.22231171969391864, "grad_norm": 0.6664029651503106, "learning_rate": 1.7806451612903228e-05, "loss": 0.4481, "step": 138 }, { "epoch": 0.2239226741844543, "grad_norm": 0.5260601890755087, "learning_rate": 1.7935483870967742e-05, "loss": 0.4525, "step": 139 }, { "epoch": 0.22553362867498994, "grad_norm": 0.5161740937718994, "learning_rate": 1.806451612903226e-05, "loss": 0.4455, "step": 140 }, { "epoch": 0.22714458316552558, "grad_norm": 0.5760986430160835, "learning_rate": 1.8193548387096777e-05, "loss": 0.4582, "step": 141 }, { "epoch": 0.22875553765606121, "grad_norm": 0.44639939244600985, "learning_rate": 1.832258064516129e-05, "loss": 0.4296, "step": 142 }, { "epoch": 0.23036649214659685, "grad_norm": 0.4882062915992461, "learning_rate": 1.845161290322581e-05, "loss": 0.446, "step": 143 }, { "epoch": 0.2319774466371325, "grad_norm": 0.4896222422614565, "learning_rate": 1.8580645161290326e-05, "loss": 0.4093, "step": 144 }, { "epoch": 0.23358840112766815, "grad_norm": 0.5601501144229972, "learning_rate": 1.870967741935484e-05, "loss": 0.4505, "step": 145 }, { "epoch": 0.2351993556182038, "grad_norm": 0.5437487520198745, "learning_rate": 1.8838709677419354e-05, "loss": 0.4322, "step": 146 }, { "epoch": 0.23681031010873943, "grad_norm": 0.526186951905453, "learning_rate": 1.896774193548387e-05, "loss": 0.4521, "step": 147 }, { "epoch": 0.23842126459927507, "grad_norm": 0.5667856097490956, "learning_rate": 1.909677419354839e-05, "loss": 0.4271, "step": 148 }, { "epoch": 0.2400322190898107, "grad_norm": 0.49095301330200275, "learning_rate": 1.9225806451612907e-05, "loss": 0.4253, "step": 149 }, { "epoch": 0.24164317358034634, "grad_norm": 0.5980300594255382, "learning_rate": 1.935483870967742e-05, "loss": 0.4244, "step": 150 }, { "epoch": 0.243254128070882, "grad_norm": 0.47376445836894726, "learning_rate": 1.948387096774194e-05, "loss": 0.4372, "step": 151 }, { "epoch": 0.24486508256141765, "grad_norm": 0.5782178587487257, "learning_rate": 1.9612903225806452e-05, "loss": 0.4616, "step": 152 }, { "epoch": 0.24647603705195328, "grad_norm": 0.6641367840166066, "learning_rate": 1.974193548387097e-05, "loss": 0.4373, "step": 153 }, { "epoch": 0.24808699154248892, "grad_norm": 0.4915152452969149, "learning_rate": 1.9870967741935484e-05, "loss": 0.4375, "step": 154 }, { "epoch": 0.24969794603302456, "grad_norm": 0.4715056217318344, "learning_rate": 2e-05, "loss": 0.4602, "step": 155 }, { "epoch": 0.2513089005235602, "grad_norm": 0.4588234482596573, "learning_rate": 2.0129032258064516e-05, "loss": 0.4364, "step": 156 }, { "epoch": 0.25291985501409586, "grad_norm": 0.49507022488662084, "learning_rate": 2.0258064516129033e-05, "loss": 0.4328, "step": 157 }, { "epoch": 0.2545308095046315, "grad_norm": 0.475753423255671, "learning_rate": 2.0387096774193547e-05, "loss": 0.428, "step": 158 }, { "epoch": 0.25614176399516714, "grad_norm": 0.5183928270573065, "learning_rate": 2.051612903225807e-05, "loss": 0.4392, "step": 159 }, { "epoch": 0.2577527184857028, "grad_norm": 0.4774005808982356, "learning_rate": 2.0645161290322582e-05, "loss": 0.4428, "step": 160 }, { "epoch": 0.2593636729762384, "grad_norm": 0.4572472041520854, "learning_rate": 2.07741935483871e-05, "loss": 0.4634, "step": 161 }, { "epoch": 0.2609746274667741, "grad_norm": 0.6081326727542029, "learning_rate": 2.0903225806451614e-05, "loss": 0.428, "step": 162 }, { "epoch": 0.2625855819573097, "grad_norm": 0.5023244790140038, "learning_rate": 2.1032258064516128e-05, "loss": 0.462, "step": 163 }, { "epoch": 0.26419653644784535, "grad_norm": 0.5479494420185165, "learning_rate": 2.116129032258065e-05, "loss": 0.4469, "step": 164 }, { "epoch": 0.26580749093838096, "grad_norm": 0.4584032900157975, "learning_rate": 2.1290322580645163e-05, "loss": 0.4477, "step": 165 }, { "epoch": 0.26741844542891663, "grad_norm": 0.6145948964239977, "learning_rate": 2.141935483870968e-05, "loss": 0.4371, "step": 166 }, { "epoch": 0.2690293999194523, "grad_norm": 0.5594470471380588, "learning_rate": 2.1548387096774195e-05, "loss": 0.456, "step": 167 }, { "epoch": 0.2706403544099879, "grad_norm": 0.45972051481741444, "learning_rate": 2.1677419354838712e-05, "loss": 0.4391, "step": 168 }, { "epoch": 0.27225130890052357, "grad_norm": 0.5461702197752573, "learning_rate": 2.1806451612903227e-05, "loss": 0.4409, "step": 169 }, { "epoch": 0.2738622633910592, "grad_norm": 0.5906536061989351, "learning_rate": 2.193548387096774e-05, "loss": 0.4586, "step": 170 }, { "epoch": 0.27547321788159485, "grad_norm": 0.599985383963263, "learning_rate": 2.206451612903226e-05, "loss": 0.4503, "step": 171 }, { "epoch": 0.2770841723721305, "grad_norm": 0.5604973755851727, "learning_rate": 2.2193548387096776e-05, "loss": 0.442, "step": 172 }, { "epoch": 0.2786951268626661, "grad_norm": 0.4798615338782793, "learning_rate": 2.2322580645161293e-05, "loss": 0.4436, "step": 173 }, { "epoch": 0.2803060813532018, "grad_norm": 0.6325144823189365, "learning_rate": 2.2451612903225807e-05, "loss": 0.4439, "step": 174 }, { "epoch": 0.2819170358437374, "grad_norm": 0.5641277931011702, "learning_rate": 2.2580645161290328e-05, "loss": 0.4288, "step": 175 }, { "epoch": 0.28352799033427306, "grad_norm": 0.4953611842384428, "learning_rate": 2.2709677419354842e-05, "loss": 0.4449, "step": 176 }, { "epoch": 0.2851389448248087, "grad_norm": 0.5224583314537559, "learning_rate": 2.2838709677419357e-05, "loss": 0.4162, "step": 177 }, { "epoch": 0.28674989931534434, "grad_norm": 0.5001528193199506, "learning_rate": 2.2967741935483874e-05, "loss": 0.4393, "step": 178 }, { "epoch": 0.28836085380588, "grad_norm": 0.48777820697444463, "learning_rate": 2.3096774193548388e-05, "loss": 0.4261, "step": 179 }, { "epoch": 0.2899718082964156, "grad_norm": 0.64062502637235, "learning_rate": 2.3225806451612906e-05, "loss": 0.4442, "step": 180 }, { "epoch": 0.2915827627869513, "grad_norm": 0.5683627649259488, "learning_rate": 2.335483870967742e-05, "loss": 0.447, "step": 181 }, { "epoch": 0.2931937172774869, "grad_norm": 0.7696086123267061, "learning_rate": 2.3483870967741934e-05, "loss": 0.4219, "step": 182 }, { "epoch": 0.29480467176802255, "grad_norm": 0.7701985708034406, "learning_rate": 2.3612903225806455e-05, "loss": 0.4178, "step": 183 }, { "epoch": 0.2964156262585582, "grad_norm": 0.655838324079945, "learning_rate": 2.374193548387097e-05, "loss": 0.4229, "step": 184 }, { "epoch": 0.29802658074909383, "grad_norm": 0.5294585112609449, "learning_rate": 2.3870967741935486e-05, "loss": 0.4374, "step": 185 }, { "epoch": 0.2996375352396295, "grad_norm": 0.6213012416475787, "learning_rate": 2.4e-05, "loss": 0.4307, "step": 186 }, { "epoch": 0.3012484897301651, "grad_norm": 0.7049316455544041, "learning_rate": 2.4129032258064518e-05, "loss": 0.4251, "step": 187 }, { "epoch": 0.30285944422070077, "grad_norm": 0.5847111370562961, "learning_rate": 2.4258064516129036e-05, "loss": 0.4322, "step": 188 }, { "epoch": 0.30447039871123643, "grad_norm": 0.5607890554854233, "learning_rate": 2.438709677419355e-05, "loss": 0.4283, "step": 189 }, { "epoch": 0.30608135320177204, "grad_norm": 0.4807120710699517, "learning_rate": 2.4516129032258067e-05, "loss": 0.4321, "step": 190 }, { "epoch": 0.3076923076923077, "grad_norm": 0.6290000689322379, "learning_rate": 2.464516129032258e-05, "loss": 0.4398, "step": 191 }, { "epoch": 0.3093032621828433, "grad_norm": 0.5860760329948072, "learning_rate": 2.47741935483871e-05, "loss": 0.4239, "step": 192 }, { "epoch": 0.310914216673379, "grad_norm": 0.5713603124420665, "learning_rate": 2.4903225806451613e-05, "loss": 0.4501, "step": 193 }, { "epoch": 0.3125251711639146, "grad_norm": 0.7292654172275151, "learning_rate": 2.5032258064516134e-05, "loss": 0.44, "step": 194 }, { "epoch": 0.31413612565445026, "grad_norm": 0.557304847589959, "learning_rate": 2.5161290322580648e-05, "loss": 0.4391, "step": 195 }, { "epoch": 0.3157470801449859, "grad_norm": 0.8185353596554609, "learning_rate": 2.5290322580645162e-05, "loss": 0.4495, "step": 196 }, { "epoch": 0.31735803463552154, "grad_norm": 0.6696922756603706, "learning_rate": 2.541935483870968e-05, "loss": 0.4229, "step": 197 }, { "epoch": 0.3189689891260572, "grad_norm": 0.5867477527038533, "learning_rate": 2.5548387096774194e-05, "loss": 0.4281, "step": 198 }, { "epoch": 0.3205799436165928, "grad_norm": 0.7383292533663941, "learning_rate": 2.567741935483871e-05, "loss": 0.4335, "step": 199 }, { "epoch": 0.3221908981071285, "grad_norm": 0.5494545052520469, "learning_rate": 2.5806451612903226e-05, "loss": 0.4123, "step": 200 }, { "epoch": 0.32380185259766414, "grad_norm": 0.787263534027303, "learning_rate": 2.5935483870967746e-05, "loss": 0.4334, "step": 201 }, { "epoch": 0.32541280708819975, "grad_norm": 0.671237794078286, "learning_rate": 2.606451612903226e-05, "loss": 0.4469, "step": 202 }, { "epoch": 0.3270237615787354, "grad_norm": 0.6087290607998392, "learning_rate": 2.6193548387096775e-05, "loss": 0.4468, "step": 203 }, { "epoch": 0.328634716069271, "grad_norm": 0.8456692623524803, "learning_rate": 2.6322580645161292e-05, "loss": 0.4349, "step": 204 }, { "epoch": 0.3302456705598067, "grad_norm": 0.5789430896858069, "learning_rate": 2.6451612903225806e-05, "loss": 0.4386, "step": 205 }, { "epoch": 0.3318566250503423, "grad_norm": 0.5227774356619872, "learning_rate": 2.6580645161290327e-05, "loss": 0.4237, "step": 206 }, { "epoch": 0.33346757954087797, "grad_norm": 0.5151727075586936, "learning_rate": 2.670967741935484e-05, "loss": 0.4397, "step": 207 }, { "epoch": 0.33507853403141363, "grad_norm": 0.48417771982658486, "learning_rate": 2.683870967741936e-05, "loss": 0.4479, "step": 208 }, { "epoch": 0.33668948852194924, "grad_norm": 0.5481117200758377, "learning_rate": 2.6967741935483873e-05, "loss": 0.4186, "step": 209 }, { "epoch": 0.3383004430124849, "grad_norm": 0.4448179461564387, "learning_rate": 2.7096774193548387e-05, "loss": 0.4084, "step": 210 }, { "epoch": 0.3399113975030205, "grad_norm": 0.49449277894946425, "learning_rate": 2.7225806451612905e-05, "loss": 0.442, "step": 211 }, { "epoch": 0.3415223519935562, "grad_norm": 0.5207685357019004, "learning_rate": 2.735483870967742e-05, "loss": 0.4353, "step": 212 }, { "epoch": 0.34313330648409185, "grad_norm": 0.7050029152494538, "learning_rate": 2.748387096774194e-05, "loss": 0.4226, "step": 213 }, { "epoch": 0.34474426097462746, "grad_norm": 0.6948829474376914, "learning_rate": 2.7612903225806454e-05, "loss": 0.4246, "step": 214 }, { "epoch": 0.3463552154651631, "grad_norm": 0.5285793312066525, "learning_rate": 2.774193548387097e-05, "loss": 0.4387, "step": 215 }, { "epoch": 0.34796616995569873, "grad_norm": 0.5892451924441414, "learning_rate": 2.7870967741935485e-05, "loss": 0.4455, "step": 216 }, { "epoch": 0.3495771244462344, "grad_norm": 0.8021238678624306, "learning_rate": 2.8e-05, "loss": 0.425, "step": 217 }, { "epoch": 0.35118807893677, "grad_norm": 0.5945002361956657, "learning_rate": 2.812903225806452e-05, "loss": 0.4286, "step": 218 }, { "epoch": 0.3527990334273057, "grad_norm": 0.428799155792937, "learning_rate": 2.8258064516129035e-05, "loss": 0.408, "step": 219 }, { "epoch": 0.35440998791784134, "grad_norm": 0.5963355738995051, "learning_rate": 2.8387096774193552e-05, "loss": 0.4067, "step": 220 }, { "epoch": 0.35602094240837695, "grad_norm": 0.6365882727245175, "learning_rate": 2.8516129032258066e-05, "loss": 0.4408, "step": 221 }, { "epoch": 0.3576318968989126, "grad_norm": 0.46796304829903396, "learning_rate": 2.8645161290322584e-05, "loss": 0.4338, "step": 222 }, { "epoch": 0.3592428513894482, "grad_norm": 0.47402969771177833, "learning_rate": 2.8774193548387098e-05, "loss": 0.4128, "step": 223 }, { "epoch": 0.3608538058799839, "grad_norm": 0.48803059781651736, "learning_rate": 2.8903225806451612e-05, "loss": 0.4407, "step": 224 }, { "epoch": 0.36246476037051956, "grad_norm": 0.49604586682206475, "learning_rate": 2.9032258064516133e-05, "loss": 0.4232, "step": 225 }, { "epoch": 0.36407571486105517, "grad_norm": 0.48189900699189114, "learning_rate": 2.9161290322580647e-05, "loss": 0.4377, "step": 226 }, { "epoch": 0.36568666935159083, "grad_norm": 0.48989884967543235, "learning_rate": 2.9290322580645165e-05, "loss": 0.4182, "step": 227 }, { "epoch": 0.36729762384212644, "grad_norm": 0.5085990736148946, "learning_rate": 2.941935483870968e-05, "loss": 0.4141, "step": 228 }, { "epoch": 0.3689085783326621, "grad_norm": 0.5542764435566585, "learning_rate": 2.95483870967742e-05, "loss": 0.4239, "step": 229 }, { "epoch": 0.3705195328231977, "grad_norm": 0.5509845441337561, "learning_rate": 2.9677419354838714e-05, "loss": 0.4327, "step": 230 }, { "epoch": 0.3721304873137334, "grad_norm": 0.5374789273502775, "learning_rate": 2.9806451612903228e-05, "loss": 0.4046, "step": 231 }, { "epoch": 0.37374144180426905, "grad_norm": 0.512328932947813, "learning_rate": 2.9935483870967745e-05, "loss": 0.4416, "step": 232 }, { "epoch": 0.37535239629480466, "grad_norm": 0.5980263212180401, "learning_rate": 3.006451612903226e-05, "loss": 0.437, "step": 233 }, { "epoch": 0.3769633507853403, "grad_norm": 0.4528657439497093, "learning_rate": 3.0193548387096777e-05, "loss": 0.4151, "step": 234 }, { "epoch": 0.37857430527587593, "grad_norm": 0.5472677539799308, "learning_rate": 3.032258064516129e-05, "loss": 0.4435, "step": 235 }, { "epoch": 0.3801852597664116, "grad_norm": 0.5284722744720981, "learning_rate": 3.0451612903225805e-05, "loss": 0.421, "step": 236 }, { "epoch": 0.38179621425694726, "grad_norm": 0.5145172039518136, "learning_rate": 3.0580645161290326e-05, "loss": 0.4255, "step": 237 }, { "epoch": 0.3834071687474829, "grad_norm": 0.48595473394452576, "learning_rate": 3.070967741935484e-05, "loss": 0.4237, "step": 238 }, { "epoch": 0.38501812323801854, "grad_norm": 0.6578784457151443, "learning_rate": 3.083870967741936e-05, "loss": 0.4372, "step": 239 }, { "epoch": 0.38662907772855415, "grad_norm": 0.6381289705882743, "learning_rate": 3.096774193548387e-05, "loss": 0.4405, "step": 240 }, { "epoch": 0.3882400322190898, "grad_norm": 0.5736653750108794, "learning_rate": 3.109677419354839e-05, "loss": 0.4304, "step": 241 }, { "epoch": 0.3898509867096255, "grad_norm": 0.61679678041202, "learning_rate": 3.122580645161291e-05, "loss": 0.4299, "step": 242 }, { "epoch": 0.3914619412001611, "grad_norm": 0.6665472313912354, "learning_rate": 3.135483870967742e-05, "loss": 0.4312, "step": 243 }, { "epoch": 0.39307289569069676, "grad_norm": 0.6298402893744898, "learning_rate": 3.1483870967741935e-05, "loss": 0.4161, "step": 244 }, { "epoch": 0.39468385018123237, "grad_norm": 0.5300708226898071, "learning_rate": 3.161290322580645e-05, "loss": 0.4277, "step": 245 }, { "epoch": 0.39629480467176803, "grad_norm": 0.5893160355497045, "learning_rate": 3.174193548387097e-05, "loss": 0.4285, "step": 246 }, { "epoch": 0.39790575916230364, "grad_norm": 0.6980967474518777, "learning_rate": 3.187096774193549e-05, "loss": 0.4181, "step": 247 }, { "epoch": 0.3995167136528393, "grad_norm": 0.5413223642212789, "learning_rate": 3.2000000000000005e-05, "loss": 0.4269, "step": 248 }, { "epoch": 0.40112766814337497, "grad_norm": 0.40290216945992247, "learning_rate": 3.2129032258064516e-05, "loss": 0.4347, "step": 249 }, { "epoch": 0.4027386226339106, "grad_norm": 0.5063526982810828, "learning_rate": 3.2258064516129034e-05, "loss": 0.4387, "step": 250 }, { "epoch": 0.40434957712444625, "grad_norm": 0.5220460662141977, "learning_rate": 3.238709677419355e-05, "loss": 0.4307, "step": 251 }, { "epoch": 0.40596053161498186, "grad_norm": 0.4914288485416426, "learning_rate": 3.251612903225807e-05, "loss": 0.4212, "step": 252 }, { "epoch": 0.4075714861055175, "grad_norm": 0.5074074899301333, "learning_rate": 3.2645161290322586e-05, "loss": 0.4247, "step": 253 }, { "epoch": 0.4091824405960532, "grad_norm": 0.5117479204044484, "learning_rate": 3.27741935483871e-05, "loss": 0.4199, "step": 254 }, { "epoch": 0.4107933950865888, "grad_norm": 0.5057623752314662, "learning_rate": 3.2903225806451614e-05, "loss": 0.4377, "step": 255 }, { "epoch": 0.41240434957712446, "grad_norm": 0.45973399006678667, "learning_rate": 3.303225806451613e-05, "loss": 0.4294, "step": 256 }, { "epoch": 0.4140153040676601, "grad_norm": 0.4690587970020925, "learning_rate": 3.316129032258064e-05, "loss": 0.43, "step": 257 }, { "epoch": 0.41562625855819574, "grad_norm": 0.5079472609720749, "learning_rate": 3.329032258064517e-05, "loss": 0.4218, "step": 258 }, { "epoch": 0.41723721304873135, "grad_norm": 0.5254404696098615, "learning_rate": 3.341935483870968e-05, "loss": 0.4236, "step": 259 }, { "epoch": 0.418848167539267, "grad_norm": 0.4097556199416461, "learning_rate": 3.3548387096774195e-05, "loss": 0.4297, "step": 260 }, { "epoch": 0.4204591220298027, "grad_norm": 0.5308773641962137, "learning_rate": 3.367741935483871e-05, "loss": 0.4144, "step": 261 }, { "epoch": 0.4220700765203383, "grad_norm": 0.5114920055423455, "learning_rate": 3.380645161290323e-05, "loss": 0.4063, "step": 262 }, { "epoch": 0.42368103101087395, "grad_norm": 0.5684133487028641, "learning_rate": 3.393548387096775e-05, "loss": 0.3987, "step": 263 }, { "epoch": 0.42529198550140956, "grad_norm": 0.4879815808284312, "learning_rate": 3.406451612903226e-05, "loss": 0.4301, "step": 264 }, { "epoch": 0.42690293999194523, "grad_norm": 0.5519017376318576, "learning_rate": 3.4193548387096776e-05, "loss": 0.4282, "step": 265 }, { "epoch": 0.4285138944824809, "grad_norm": 0.5172565641895582, "learning_rate": 3.4322580645161294e-05, "loss": 0.4293, "step": 266 }, { "epoch": 0.4301248489730165, "grad_norm": 0.5978796630371895, "learning_rate": 3.445161290322581e-05, "loss": 0.4256, "step": 267 }, { "epoch": 0.43173580346355217, "grad_norm": 0.46900318232253063, "learning_rate": 3.458064516129032e-05, "loss": 0.4472, "step": 268 }, { "epoch": 0.4333467579540878, "grad_norm": 0.5359562617290479, "learning_rate": 3.4709677419354846e-05, "loss": 0.434, "step": 269 }, { "epoch": 0.43495771244462345, "grad_norm": 0.5467288956413452, "learning_rate": 3.483870967741936e-05, "loss": 0.4273, "step": 270 }, { "epoch": 0.43656866693515906, "grad_norm": 0.5657718857551184, "learning_rate": 3.4967741935483874e-05, "loss": 0.4353, "step": 271 }, { "epoch": 0.4381796214256947, "grad_norm": 0.5222662238820099, "learning_rate": 3.509677419354839e-05, "loss": 0.4054, "step": 272 }, { "epoch": 0.4397905759162304, "grad_norm": 0.5104771085486108, "learning_rate": 3.52258064516129e-05, "loss": 0.4296, "step": 273 }, { "epoch": 0.441401530406766, "grad_norm": 0.573976020499011, "learning_rate": 3.535483870967743e-05, "loss": 0.4336, "step": 274 }, { "epoch": 0.44301248489730166, "grad_norm": 0.5651641640641581, "learning_rate": 3.548387096774194e-05, "loss": 0.4332, "step": 275 }, { "epoch": 0.44462343938783727, "grad_norm": 0.4432761779379142, "learning_rate": 3.5612903225806455e-05, "loss": 0.4372, "step": 276 }, { "epoch": 0.44623439387837294, "grad_norm": 0.5288954385305018, "learning_rate": 3.574193548387097e-05, "loss": 0.4319, "step": 277 }, { "epoch": 0.4478453483689086, "grad_norm": 0.4467858794531326, "learning_rate": 3.5870967741935483e-05, "loss": 0.398, "step": 278 }, { "epoch": 0.4494563028594442, "grad_norm": 0.4854143357523973, "learning_rate": 3.6e-05, "loss": 0.405, "step": 279 }, { "epoch": 0.4510672573499799, "grad_norm": 0.47026604377994136, "learning_rate": 3.612903225806452e-05, "loss": 0.4406, "step": 280 }, { "epoch": 0.4526782118405155, "grad_norm": 0.49527144296582787, "learning_rate": 3.6258064516129036e-05, "loss": 0.4397, "step": 281 }, { "epoch": 0.45428916633105115, "grad_norm": 0.5999996141858859, "learning_rate": 3.6387096774193553e-05, "loss": 0.4234, "step": 282 }, { "epoch": 0.45590012082158676, "grad_norm": 0.5220722098689459, "learning_rate": 3.651612903225807e-05, "loss": 0.4346, "step": 283 }, { "epoch": 0.45751107531212243, "grad_norm": 0.43711076153629586, "learning_rate": 3.664516129032258e-05, "loss": 0.4355, "step": 284 }, { "epoch": 0.4591220298026581, "grad_norm": 0.6172985122046, "learning_rate": 3.67741935483871e-05, "loss": 0.4184, "step": 285 }, { "epoch": 0.4607329842931937, "grad_norm": 0.5867425345199105, "learning_rate": 3.690322580645162e-05, "loss": 0.4438, "step": 286 }, { "epoch": 0.46234393878372937, "grad_norm": 0.4457544342019048, "learning_rate": 3.7032258064516134e-05, "loss": 0.4194, "step": 287 }, { "epoch": 0.463954893274265, "grad_norm": 0.666524347966105, "learning_rate": 3.716129032258065e-05, "loss": 0.422, "step": 288 }, { "epoch": 0.46556584776480064, "grad_norm": 0.48549716093426315, "learning_rate": 3.729032258064516e-05, "loss": 0.4163, "step": 289 }, { "epoch": 0.4671768022553363, "grad_norm": 0.528960986822585, "learning_rate": 3.741935483870968e-05, "loss": 0.4244, "step": 290 }, { "epoch": 0.4687877567458719, "grad_norm": 0.5886855824754854, "learning_rate": 3.75483870967742e-05, "loss": 0.4131, "step": 291 }, { "epoch": 0.4703987112364076, "grad_norm": 0.5041181448802059, "learning_rate": 3.767741935483871e-05, "loss": 0.4293, "step": 292 }, { "epoch": 0.4720096657269432, "grad_norm": 0.5833712014204475, "learning_rate": 3.780645161290323e-05, "loss": 0.4189, "step": 293 }, { "epoch": 0.47362062021747886, "grad_norm": 0.6344962543797149, "learning_rate": 3.793548387096774e-05, "loss": 0.4148, "step": 294 }, { "epoch": 0.4752315747080145, "grad_norm": 0.4209164542717526, "learning_rate": 3.806451612903226e-05, "loss": 0.3986, "step": 295 }, { "epoch": 0.47684252919855014, "grad_norm": 0.5561556783839265, "learning_rate": 3.819354838709678e-05, "loss": 0.4354, "step": 296 }, { "epoch": 0.4784534836890858, "grad_norm": 0.6257224120221472, "learning_rate": 3.832258064516129e-05, "loss": 0.4151, "step": 297 }, { "epoch": 0.4800644381796214, "grad_norm": 0.6035957973374384, "learning_rate": 3.8451612903225813e-05, "loss": 0.4116, "step": 298 }, { "epoch": 0.4816753926701571, "grad_norm": 0.5050188538136214, "learning_rate": 3.8580645161290324e-05, "loss": 0.4205, "step": 299 }, { "epoch": 0.4832863471606927, "grad_norm": 0.4716292658982768, "learning_rate": 3.870967741935484e-05, "loss": 0.4277, "step": 300 }, { "epoch": 0.48489730165122835, "grad_norm": 0.4741578168809942, "learning_rate": 3.883870967741936e-05, "loss": 0.4212, "step": 301 }, { "epoch": 0.486508256141764, "grad_norm": 0.4987417243199203, "learning_rate": 3.896774193548388e-05, "loss": 0.433, "step": 302 }, { "epoch": 0.4881192106322996, "grad_norm": 0.47028866203783526, "learning_rate": 3.909677419354839e-05, "loss": 0.4409, "step": 303 }, { "epoch": 0.4897301651228353, "grad_norm": 0.5419362505448754, "learning_rate": 3.9225806451612905e-05, "loss": 0.4072, "step": 304 }, { "epoch": 0.4913411196133709, "grad_norm": 0.6217806303253193, "learning_rate": 3.935483870967742e-05, "loss": 0.4285, "step": 305 }, { "epoch": 0.49295207410390657, "grad_norm": 0.5420446824564362, "learning_rate": 3.948387096774194e-05, "loss": 0.4405, "step": 306 }, { "epoch": 0.49456302859444223, "grad_norm": 0.6297593472421277, "learning_rate": 3.961290322580646e-05, "loss": 0.4336, "step": 307 }, { "epoch": 0.49617398308497784, "grad_norm": 0.6587570423137625, "learning_rate": 3.974193548387097e-05, "loss": 0.4241, "step": 308 }, { "epoch": 0.4977849375755135, "grad_norm": 0.670548883274989, "learning_rate": 3.9870967741935486e-05, "loss": 0.4347, "step": 309 }, { "epoch": 0.4993958920660491, "grad_norm": 0.6932491705418964, "learning_rate": 4e-05, "loss": 0.3976, "step": 310 }, { "epoch": 0.5010068465565848, "grad_norm": 0.4742576328363187, "learning_rate": 3.999998732081634e-05, "loss": 0.4291, "step": 311 }, { "epoch": 0.5026178010471204, "grad_norm": 0.529513963401129, "learning_rate": 3.999994928328141e-05, "loss": 0.4305, "step": 312 }, { "epoch": 0.5042287555376561, "grad_norm": 0.6091004305460854, "learning_rate": 3.9999885887443455e-05, "loss": 0.4216, "step": 313 }, { "epoch": 0.5058397100281917, "grad_norm": 0.5381494437696771, "learning_rate": 3.9999797133382855e-05, "loss": 0.4165, "step": 314 }, { "epoch": 0.5074506645187273, "grad_norm": 0.4317011506932105, "learning_rate": 3.9999683021212134e-05, "loss": 0.443, "step": 315 }, { "epoch": 0.509061619009263, "grad_norm": 0.47285920628489064, "learning_rate": 3.999954355107598e-05, "loss": 0.4282, "step": 316 }, { "epoch": 0.5106725734997987, "grad_norm": 0.5242267633007971, "learning_rate": 3.999937872315124e-05, "loss": 0.4179, "step": 317 }, { "epoch": 0.5122835279903343, "grad_norm": 0.4123011973763891, "learning_rate": 3.9999188537646894e-05, "loss": 0.4438, "step": 318 }, { "epoch": 0.5138944824808699, "grad_norm": 0.4057247056617174, "learning_rate": 3.999897299480408e-05, "loss": 0.4036, "step": 319 }, { "epoch": 0.5155054369714056, "grad_norm": 0.4188342427708109, "learning_rate": 3.9998732094896084e-05, "loss": 0.4299, "step": 320 }, { "epoch": 0.5171163914619412, "grad_norm": 0.43672985709660445, "learning_rate": 3.999846583822836e-05, "loss": 0.4237, "step": 321 }, { "epoch": 0.5187273459524768, "grad_norm": 0.48310073400224307, "learning_rate": 3.999817422513849e-05, "loss": 0.4187, "step": 322 }, { "epoch": 0.5203383004430124, "grad_norm": 0.5530788167112765, "learning_rate": 3.999785725599623e-05, "loss": 0.4311, "step": 323 }, { "epoch": 0.5219492549335482, "grad_norm": 0.5381555047474669, "learning_rate": 3.999751493120345e-05, "loss": 0.4291, "step": 324 }, { "epoch": 0.5235602094240838, "grad_norm": 0.5827345030273974, "learning_rate": 3.99971472511942e-05, "loss": 0.4305, "step": 325 }, { "epoch": 0.5251711639146194, "grad_norm": 0.38155681688778703, "learning_rate": 3.999675421643467e-05, "loss": 0.4174, "step": 326 }, { "epoch": 0.5267821184051551, "grad_norm": 0.5492754517767143, "learning_rate": 3.99963358274232e-05, "loss": 0.4212, "step": 327 }, { "epoch": 0.5283930728956907, "grad_norm": 0.48789623482177874, "learning_rate": 3.9995892084690256e-05, "loss": 0.4215, "step": 328 }, { "epoch": 0.5300040273862263, "grad_norm": 0.4422563535366751, "learning_rate": 3.9995422988798494e-05, "loss": 0.4118, "step": 329 }, { "epoch": 0.5316149818767619, "grad_norm": 0.6826787557912818, "learning_rate": 3.999492854034266e-05, "loss": 0.4322, "step": 330 }, { "epoch": 0.5332259363672976, "grad_norm": 0.7161197167511507, "learning_rate": 3.99944087399497e-05, "loss": 0.4297, "step": 331 }, { "epoch": 0.5348368908578333, "grad_norm": 0.45307187386139897, "learning_rate": 3.999386358827866e-05, "loss": 0.4017, "step": 332 }, { "epoch": 0.5364478453483689, "grad_norm": 0.5080846848977575, "learning_rate": 3.999329308602076e-05, "loss": 0.4176, "step": 333 }, { "epoch": 0.5380587998389046, "grad_norm": 0.44961265160037006, "learning_rate": 3.9992697233899345e-05, "loss": 0.4146, "step": 334 }, { "epoch": 0.5396697543294402, "grad_norm": 0.44569468180373156, "learning_rate": 3.9992076032669905e-05, "loss": 0.4276, "step": 335 }, { "epoch": 0.5412807088199758, "grad_norm": 0.4589301924407286, "learning_rate": 3.999142948312007e-05, "loss": 0.4068, "step": 336 }, { "epoch": 0.5428916633105115, "grad_norm": 0.46283215335566513, "learning_rate": 3.999075758606963e-05, "loss": 0.4102, "step": 337 }, { "epoch": 0.5445026178010471, "grad_norm": 0.4428187054855388, "learning_rate": 3.999006034237047e-05, "loss": 0.4076, "step": 338 }, { "epoch": 0.5461135722915827, "grad_norm": 0.3941621296766751, "learning_rate": 3.9989337752906656e-05, "loss": 0.4248, "step": 339 }, { "epoch": 0.5477245267821184, "grad_norm": 0.4911947798210392, "learning_rate": 3.998858981859436e-05, "loss": 0.4234, "step": 340 }, { "epoch": 0.5493354812726541, "grad_norm": 0.45142619025228986, "learning_rate": 3.998781654038192e-05, "loss": 0.4098, "step": 341 }, { "epoch": 0.5509464357631897, "grad_norm": 0.45716417313795044, "learning_rate": 3.998701791924977e-05, "loss": 0.4232, "step": 342 }, { "epoch": 0.5525573902537253, "grad_norm": 0.4811829968487867, "learning_rate": 3.998619395621051e-05, "loss": 0.4301, "step": 343 }, { "epoch": 0.554168344744261, "grad_norm": 0.3793740271087329, "learning_rate": 3.9985344652308846e-05, "loss": 0.4215, "step": 344 }, { "epoch": 0.5557792992347966, "grad_norm": 0.42003419142799425, "learning_rate": 3.998447000862164e-05, "loss": 0.4509, "step": 345 }, { "epoch": 0.5573902537253322, "grad_norm": 0.3906557697989691, "learning_rate": 3.9983570026257844e-05, "loss": 0.4231, "step": 346 }, { "epoch": 0.5590012082158679, "grad_norm": 0.39196036899925196, "learning_rate": 3.9982644706358596e-05, "loss": 0.4089, "step": 347 }, { "epoch": 0.5606121627064036, "grad_norm": 0.4913202549527037, "learning_rate": 3.998169405009711e-05, "loss": 0.422, "step": 348 }, { "epoch": 0.5622231171969392, "grad_norm": 0.4279358660405233, "learning_rate": 3.9980718058678733e-05, "loss": 0.4248, "step": 349 }, { "epoch": 0.5638340716874748, "grad_norm": 0.49969446549842206, "learning_rate": 3.997971673334095e-05, "loss": 0.4202, "step": 350 }, { "epoch": 0.5654450261780105, "grad_norm": 0.41732878429281683, "learning_rate": 3.997869007535336e-05, "loss": 0.422, "step": 351 }, { "epoch": 0.5670559806685461, "grad_norm": 0.4294137359439838, "learning_rate": 3.997763808601768e-05, "loss": 0.4252, "step": 352 }, { "epoch": 0.5686669351590817, "grad_norm": 0.3707661272418532, "learning_rate": 3.997656076666776e-05, "loss": 0.4027, "step": 353 }, { "epoch": 0.5702778896496175, "grad_norm": 0.3766908978268678, "learning_rate": 3.997545811866952e-05, "loss": 0.4312, "step": 354 }, { "epoch": 0.5718888441401531, "grad_norm": 0.45306234795270156, "learning_rate": 3.997433014342106e-05, "loss": 0.4223, "step": 355 }, { "epoch": 0.5734997986306887, "grad_norm": 0.4793083302776694, "learning_rate": 3.997317684235254e-05, "loss": 0.4229, "step": 356 }, { "epoch": 0.5751107531212243, "grad_norm": 0.4675021055165329, "learning_rate": 3.9971998216926274e-05, "loss": 0.4434, "step": 357 }, { "epoch": 0.57672170761176, "grad_norm": 0.4460710699457162, "learning_rate": 3.997079426863664e-05, "loss": 0.4013, "step": 358 }, { "epoch": 0.5783326621022956, "grad_norm": 0.45964782363764733, "learning_rate": 3.996956499901015e-05, "loss": 0.4245, "step": 359 }, { "epoch": 0.5799436165928312, "grad_norm": 0.5182341589054947, "learning_rate": 3.996831040960543e-05, "loss": 0.4038, "step": 360 }, { "epoch": 0.581554571083367, "grad_norm": 0.4117269217562021, "learning_rate": 3.996703050201319e-05, "loss": 0.4153, "step": 361 }, { "epoch": 0.5831655255739026, "grad_norm": 0.46882737702461946, "learning_rate": 3.996572527785625e-05, "loss": 0.3976, "step": 362 }, { "epoch": 0.5847764800644382, "grad_norm": 0.3796364352520464, "learning_rate": 3.996439473878952e-05, "loss": 0.4208, "step": 363 }, { "epoch": 0.5863874345549738, "grad_norm": 0.4992194206692767, "learning_rate": 3.996303888650002e-05, "loss": 0.4266, "step": 364 }, { "epoch": 0.5879983890455095, "grad_norm": 0.4787895226087798, "learning_rate": 3.9961657722706864e-05, "loss": 0.4301, "step": 365 }, { "epoch": 0.5896093435360451, "grad_norm": 0.5407494009323632, "learning_rate": 3.996025124916125e-05, "loss": 0.4157, "step": 366 }, { "epoch": 0.5912202980265807, "grad_norm": 0.4261943946875255, "learning_rate": 3.995881946764647e-05, "loss": 0.4089, "step": 367 }, { "epoch": 0.5928312525171164, "grad_norm": 0.3850462450614437, "learning_rate": 3.995736237997792e-05, "loss": 0.4272, "step": 368 }, { "epoch": 0.594442207007652, "grad_norm": 0.45463889935104884, "learning_rate": 3.9955879988003046e-05, "loss": 0.4133, "step": 369 }, { "epoch": 0.5960531614981877, "grad_norm": 0.48825127888326364, "learning_rate": 3.9954372293601415e-05, "loss": 0.4336, "step": 370 }, { "epoch": 0.5976641159887233, "grad_norm": 0.4068129703538119, "learning_rate": 3.9952839298684656e-05, "loss": 0.4001, "step": 371 }, { "epoch": 0.599275070479259, "grad_norm": 0.49552613300447507, "learning_rate": 3.9951281005196486e-05, "loss": 0.4023, "step": 372 }, { "epoch": 0.6008860249697946, "grad_norm": 0.46945317770031153, "learning_rate": 3.994969741511269e-05, "loss": 0.3857, "step": 373 }, { "epoch": 0.6024969794603302, "grad_norm": 0.4741587306281566, "learning_rate": 3.994808853044113e-05, "loss": 0.4461, "step": 374 }, { "epoch": 0.6041079339508659, "grad_norm": 0.4685179794707909, "learning_rate": 3.994645435322174e-05, "loss": 0.4094, "step": 375 }, { "epoch": 0.6057188884414015, "grad_norm": 0.45483642283060166, "learning_rate": 3.994479488552652e-05, "loss": 0.4228, "step": 376 }, { "epoch": 0.6073298429319371, "grad_norm": 0.4244946382664049, "learning_rate": 3.9943110129459555e-05, "loss": 0.4279, "step": 377 }, { "epoch": 0.6089407974224729, "grad_norm": 0.4294198854653389, "learning_rate": 3.994140008715697e-05, "loss": 0.4032, "step": 378 }, { "epoch": 0.6105517519130085, "grad_norm": 0.3989456663260737, "learning_rate": 3.993966476078694e-05, "loss": 0.4147, "step": 379 }, { "epoch": 0.6121627064035441, "grad_norm": 0.3713937522581636, "learning_rate": 3.9937904152549746e-05, "loss": 0.4192, "step": 380 }, { "epoch": 0.6137736608940797, "grad_norm": 0.4384496746610661, "learning_rate": 3.993611826467768e-05, "loss": 0.4356, "step": 381 }, { "epoch": 0.6153846153846154, "grad_norm": 0.42889466156762074, "learning_rate": 3.99343070994351e-05, "loss": 0.4299, "step": 382 }, { "epoch": 0.616995569875151, "grad_norm": 0.3555646074061858, "learning_rate": 3.993247065911844e-05, "loss": 0.4309, "step": 383 }, { "epoch": 0.6186065243656866, "grad_norm": 0.43396045989866394, "learning_rate": 3.993060894605612e-05, "loss": 0.3994, "step": 384 }, { "epoch": 0.6202174788562224, "grad_norm": 0.43776606757803926, "learning_rate": 3.992872196260866e-05, "loss": 0.4097, "step": 385 }, { "epoch": 0.621828433346758, "grad_norm": 0.420636085604741, "learning_rate": 3.99268097111686e-05, "loss": 0.417, "step": 386 }, { "epoch": 0.6234393878372936, "grad_norm": 0.5064346878522746, "learning_rate": 3.992487219416052e-05, "loss": 0.4183, "step": 387 }, { "epoch": 0.6250503423278292, "grad_norm": 0.5057851487040337, "learning_rate": 3.992290941404103e-05, "loss": 0.3974, "step": 388 }, { "epoch": 0.6266612968183649, "grad_norm": 0.5521508424756063, "learning_rate": 3.992092137329878e-05, "loss": 0.4108, "step": 389 }, { "epoch": 0.6282722513089005, "grad_norm": 0.5405974200010049, "learning_rate": 3.991890807445443e-05, "loss": 0.4477, "step": 390 }, { "epoch": 0.6298832057994361, "grad_norm": 0.39506341263585665, "learning_rate": 3.991686952006069e-05, "loss": 0.4326, "step": 391 }, { "epoch": 0.6314941602899719, "grad_norm": 0.511124831235241, "learning_rate": 3.991480571270228e-05, "loss": 0.4226, "step": 392 }, { "epoch": 0.6331051147805075, "grad_norm": 0.42776265768914273, "learning_rate": 3.991271665499594e-05, "loss": 0.4046, "step": 393 }, { "epoch": 0.6347160692710431, "grad_norm": 0.4959908838554333, "learning_rate": 3.991060234959042e-05, "loss": 0.416, "step": 394 }, { "epoch": 0.6363270237615787, "grad_norm": 0.4670521894890168, "learning_rate": 3.990846279916649e-05, "loss": 0.4224, "step": 395 }, { "epoch": 0.6379379782521144, "grad_norm": 0.4006152685344792, "learning_rate": 3.9906298006436924e-05, "loss": 0.4164, "step": 396 }, { "epoch": 0.63954893274265, "grad_norm": 0.4463899466226375, "learning_rate": 3.99041079741465e-05, "loss": 0.4298, "step": 397 }, { "epoch": 0.6411598872331856, "grad_norm": 0.39436113561085406, "learning_rate": 3.9901892705072004e-05, "loss": 0.4139, "step": 398 }, { "epoch": 0.6427708417237213, "grad_norm": 0.48684252012399337, "learning_rate": 3.989965220202221e-05, "loss": 0.4157, "step": 399 }, { "epoch": 0.644381796214257, "grad_norm": 0.36781064362322613, "learning_rate": 3.98973864678379e-05, "loss": 0.4094, "step": 400 }, { "epoch": 0.6459927507047926, "grad_norm": 0.4545736363221886, "learning_rate": 3.989509550539185e-05, "loss": 0.416, "step": 401 }, { "epoch": 0.6476037051953283, "grad_norm": 0.4021213653694868, "learning_rate": 3.989277931758879e-05, "loss": 0.3986, "step": 402 }, { "epoch": 0.6492146596858639, "grad_norm": 0.3938202729838156, "learning_rate": 3.989043790736547e-05, "loss": 0.3977, "step": 403 }, { "epoch": 0.6508256141763995, "grad_norm": 0.3714283205343456, "learning_rate": 3.9888071277690606e-05, "loss": 0.4008, "step": 404 }, { "epoch": 0.6524365686669351, "grad_norm": 0.49605923508034744, "learning_rate": 3.988567943156489e-05, "loss": 0.4044, "step": 405 }, { "epoch": 0.6540475231574708, "grad_norm": 0.5342861440227792, "learning_rate": 3.988326237202099e-05, "loss": 0.4053, "step": 406 }, { "epoch": 0.6556584776480064, "grad_norm": 0.3428099270926289, "learning_rate": 3.988082010212354e-05, "loss": 0.3928, "step": 407 }, { "epoch": 0.657269432138542, "grad_norm": 0.4004574130249575, "learning_rate": 3.987835262496913e-05, "loss": 0.4171, "step": 408 }, { "epoch": 0.6588803866290778, "grad_norm": 0.35970332120506987, "learning_rate": 3.9875859943686335e-05, "loss": 0.4024, "step": 409 }, { "epoch": 0.6604913411196134, "grad_norm": 0.4224502636249679, "learning_rate": 3.9873342061435664e-05, "loss": 0.4187, "step": 410 }, { "epoch": 0.662102295610149, "grad_norm": 0.3764938020888867, "learning_rate": 3.987079898140958e-05, "loss": 0.4072, "step": 411 }, { "epoch": 0.6637132501006846, "grad_norm": 0.3694724316905374, "learning_rate": 3.98682307068325e-05, "loss": 0.415, "step": 412 }, { "epoch": 0.6653242045912203, "grad_norm": 0.36693082697539087, "learning_rate": 3.98656372409608e-05, "loss": 0.4046, "step": 413 }, { "epoch": 0.6669351590817559, "grad_norm": 0.44730430916292946, "learning_rate": 3.986301858708278e-05, "loss": 0.4133, "step": 414 }, { "epoch": 0.6685461135722915, "grad_norm": 0.3809823415227652, "learning_rate": 3.9860374748518676e-05, "loss": 0.4211, "step": 415 }, { "epoch": 0.6701570680628273, "grad_norm": 0.3563474324193386, "learning_rate": 3.985770572862065e-05, "loss": 0.4088, "step": 416 }, { "epoch": 0.6717680225533629, "grad_norm": 0.3805173842463904, "learning_rate": 3.985501153077282e-05, "loss": 0.4124, "step": 417 }, { "epoch": 0.6733789770438985, "grad_norm": 0.43552193509232306, "learning_rate": 3.985229215839119e-05, "loss": 0.4261, "step": 418 }, { "epoch": 0.6749899315344342, "grad_norm": 0.42439761528072495, "learning_rate": 3.984954761492372e-05, "loss": 0.4017, "step": 419 }, { "epoch": 0.6766008860249698, "grad_norm": 0.3105332394794023, "learning_rate": 3.984677790385025e-05, "loss": 0.3985, "step": 420 }, { "epoch": 0.6782118405155054, "grad_norm": 0.4027207084117632, "learning_rate": 3.9843983028682555e-05, "loss": 0.4026, "step": 421 }, { "epoch": 0.679822795006041, "grad_norm": 0.3508235171579207, "learning_rate": 3.984116299296432e-05, "loss": 0.3896, "step": 422 }, { "epoch": 0.6814337494965768, "grad_norm": 0.3573507910316362, "learning_rate": 3.9838317800271105e-05, "loss": 0.4283, "step": 423 }, { "epoch": 0.6830447039871124, "grad_norm": 0.37632509861430463, "learning_rate": 3.983544745421038e-05, "loss": 0.3997, "step": 424 }, { "epoch": 0.684655658477648, "grad_norm": 0.3388804904806458, "learning_rate": 3.983255195842152e-05, "loss": 0.4124, "step": 425 }, { "epoch": 0.6862666129681837, "grad_norm": 0.4833298766343206, "learning_rate": 3.982963131657577e-05, "loss": 0.4177, "step": 426 }, { "epoch": 0.6878775674587193, "grad_norm": 0.42070364198235066, "learning_rate": 3.982668553237628e-05, "loss": 0.3904, "step": 427 }, { "epoch": 0.6894885219492549, "grad_norm": 0.4482515006894441, "learning_rate": 3.982371460955804e-05, "loss": 0.4311, "step": 428 }, { "epoch": 0.6910994764397905, "grad_norm": 0.40436392123161874, "learning_rate": 3.982071855188796e-05, "loss": 0.4045, "step": 429 }, { "epoch": 0.6927104309303262, "grad_norm": 0.3383709764320333, "learning_rate": 3.981769736316478e-05, "loss": 0.4006, "step": 430 }, { "epoch": 0.6943213854208619, "grad_norm": 0.42754996232658854, "learning_rate": 3.9814651047219135e-05, "loss": 0.4289, "step": 431 }, { "epoch": 0.6959323399113975, "grad_norm": 0.3876602252178334, "learning_rate": 3.9811579607913486e-05, "loss": 0.4246, "step": 432 }, { "epoch": 0.6975432944019332, "grad_norm": 0.4278854255881262, "learning_rate": 3.9808483049142185e-05, "loss": 0.4448, "step": 433 }, { "epoch": 0.6991542488924688, "grad_norm": 0.35865693620507294, "learning_rate": 3.980536137483141e-05, "loss": 0.4142, "step": 434 }, { "epoch": 0.7007652033830044, "grad_norm": 0.4625593533325866, "learning_rate": 3.980221458893919e-05, "loss": 0.4171, "step": 435 }, { "epoch": 0.70237615787354, "grad_norm": 0.32682701461120106, "learning_rate": 3.979904269545538e-05, "loss": 0.4266, "step": 436 }, { "epoch": 0.7039871123640757, "grad_norm": 0.415104731241559, "learning_rate": 3.979584569840171e-05, "loss": 0.4204, "step": 437 }, { "epoch": 0.7055980668546113, "grad_norm": 0.3592034123608851, "learning_rate": 3.979262360183169e-05, "loss": 0.4067, "step": 438 }, { "epoch": 0.707209021345147, "grad_norm": 0.43562091204913506, "learning_rate": 3.9789376409830674e-05, "loss": 0.4081, "step": 439 }, { "epoch": 0.7088199758356827, "grad_norm": 0.43196190361541487, "learning_rate": 3.978610412651584e-05, "loss": 0.401, "step": 440 }, { "epoch": 0.7104309303262183, "grad_norm": 0.4071385089261009, "learning_rate": 3.978280675603618e-05, "loss": 0.4163, "step": 441 }, { "epoch": 0.7120418848167539, "grad_norm": 0.41016039223243267, "learning_rate": 3.97794843025725e-05, "loss": 0.3942, "step": 442 }, { "epoch": 0.7136528393072896, "grad_norm": 0.3841321139543235, "learning_rate": 3.977613677033738e-05, "loss": 0.4118, "step": 443 }, { "epoch": 0.7152637937978252, "grad_norm": 0.35310267037819615, "learning_rate": 3.977276416357523e-05, "loss": 0.4218, "step": 444 }, { "epoch": 0.7168747482883608, "grad_norm": 0.4197080640230907, "learning_rate": 3.976936648656223e-05, "loss": 0.4261, "step": 445 }, { "epoch": 0.7184857027788965, "grad_norm": 0.3546111322565111, "learning_rate": 3.976594374360637e-05, "loss": 0.4133, "step": 446 }, { "epoch": 0.7200966572694322, "grad_norm": 0.4394668646642138, "learning_rate": 3.97624959390474e-05, "loss": 0.4168, "step": 447 }, { "epoch": 0.7217076117599678, "grad_norm": 0.3221466833352809, "learning_rate": 3.975902307725686e-05, "loss": 0.4175, "step": 448 }, { "epoch": 0.7233185662505034, "grad_norm": 0.46121007238081874, "learning_rate": 3.975552516263804e-05, "loss": 0.3995, "step": 449 }, { "epoch": 0.7249295207410391, "grad_norm": 0.40160398472276715, "learning_rate": 3.9752002199626035e-05, "loss": 0.4152, "step": 450 }, { "epoch": 0.7265404752315747, "grad_norm": 0.37117019417124525, "learning_rate": 3.974845419268766e-05, "loss": 0.3937, "step": 451 }, { "epoch": 0.7281514297221103, "grad_norm": 0.31852553564843433, "learning_rate": 3.97448811463215e-05, "loss": 0.396, "step": 452 }, { "epoch": 0.7297623842126459, "grad_norm": 0.42093190036658035, "learning_rate": 3.974128306505788e-05, "loss": 0.4105, "step": 453 }, { "epoch": 0.7313733387031817, "grad_norm": 0.3427182108489802, "learning_rate": 3.973765995345889e-05, "loss": 0.3974, "step": 454 }, { "epoch": 0.7329842931937173, "grad_norm": 0.38214069037483106, "learning_rate": 3.973401181611832e-05, "loss": 0.4309, "step": 455 }, { "epoch": 0.7345952476842529, "grad_norm": 0.41629666543067606, "learning_rate": 3.973033865766172e-05, "loss": 0.4094, "step": 456 }, { "epoch": 0.7362062021747886, "grad_norm": 0.34200464296520006, "learning_rate": 3.972664048274636e-05, "loss": 0.4161, "step": 457 }, { "epoch": 0.7378171566653242, "grad_norm": 0.3471638897635492, "learning_rate": 3.9722917296061216e-05, "loss": 0.4297, "step": 458 }, { "epoch": 0.7394281111558598, "grad_norm": 0.45161457244236064, "learning_rate": 3.971916910232699e-05, "loss": 0.4295, "step": 459 }, { "epoch": 0.7410390656463954, "grad_norm": 0.42974197063134223, "learning_rate": 3.971539590629608e-05, "loss": 0.399, "step": 460 }, { "epoch": 0.7426500201369312, "grad_norm": 0.43361932874840936, "learning_rate": 3.971159771275259e-05, "loss": 0.4096, "step": 461 }, { "epoch": 0.7442609746274668, "grad_norm": 0.36249109522453676, "learning_rate": 3.9707774526512334e-05, "loss": 0.3998, "step": 462 }, { "epoch": 0.7458719291180024, "grad_norm": 0.42345130014954085, "learning_rate": 3.970392635242278e-05, "loss": 0.4123, "step": 463 }, { "epoch": 0.7474828836085381, "grad_norm": 0.4028131720862331, "learning_rate": 3.970005319536311e-05, "loss": 0.4048, "step": 464 }, { "epoch": 0.7490938380990737, "grad_norm": 0.3922591817364784, "learning_rate": 3.9696155060244166e-05, "loss": 0.416, "step": 465 }, { "epoch": 0.7507047925896093, "grad_norm": 0.394910086124614, "learning_rate": 3.969223195200847e-05, "loss": 0.409, "step": 466 }, { "epoch": 0.752315747080145, "grad_norm": 0.4010895179099205, "learning_rate": 3.9688283875630193e-05, "loss": 0.4221, "step": 467 }, { "epoch": 0.7539267015706806, "grad_norm": 0.3344274716823261, "learning_rate": 3.96843108361152e-05, "loss": 0.3966, "step": 468 }, { "epoch": 0.7555376560612163, "grad_norm": 0.36508649487932493, "learning_rate": 3.968031283850094e-05, "loss": 0.3896, "step": 469 }, { "epoch": 0.7571486105517519, "grad_norm": 0.3194248632705373, "learning_rate": 3.967628988785658e-05, "loss": 0.3913, "step": 470 }, { "epoch": 0.7587595650422876, "grad_norm": 0.3348385327401639, "learning_rate": 3.967224198928289e-05, "loss": 0.4321, "step": 471 }, { "epoch": 0.7603705195328232, "grad_norm": 0.3331982859049919, "learning_rate": 3.966816914791226e-05, "loss": 0.4088, "step": 472 }, { "epoch": 0.7619814740233588, "grad_norm": 0.3252671342709306, "learning_rate": 3.9664071368908726e-05, "loss": 0.4324, "step": 473 }, { "epoch": 0.7635924285138945, "grad_norm": 0.3778011510229453, "learning_rate": 3.965994865746794e-05, "loss": 0.4083, "step": 474 }, { "epoch": 0.7652033830044301, "grad_norm": 0.3697341056634576, "learning_rate": 3.9655801018817166e-05, "loss": 0.417, "step": 475 }, { "epoch": 0.7668143374949657, "grad_norm": 0.3686614640695823, "learning_rate": 3.965162845821526e-05, "loss": 0.404, "step": 476 }, { "epoch": 0.7684252919855014, "grad_norm": 0.3198502413551383, "learning_rate": 3.96474309809527e-05, "loss": 0.3918, "step": 477 }, { "epoch": 0.7700362464760371, "grad_norm": 0.404426337364835, "learning_rate": 3.964320859235155e-05, "loss": 0.3978, "step": 478 }, { "epoch": 0.7716472009665727, "grad_norm": 0.37411618257407514, "learning_rate": 3.963896129776544e-05, "loss": 0.4447, "step": 479 }, { "epoch": 0.7732581554571083, "grad_norm": 0.3647067289069705, "learning_rate": 3.963468910257959e-05, "loss": 0.4123, "step": 480 }, { "epoch": 0.774869109947644, "grad_norm": 0.3646906272324245, "learning_rate": 3.9630392012210804e-05, "loss": 0.4106, "step": 481 }, { "epoch": 0.7764800644381796, "grad_norm": 0.34012637826464404, "learning_rate": 3.962607003210745e-05, "loss": 0.4228, "step": 482 }, { "epoch": 0.7780910189287152, "grad_norm": 0.3104491216102974, "learning_rate": 3.9621723167749424e-05, "loss": 0.4041, "step": 483 }, { "epoch": 0.779701973419251, "grad_norm": 0.35559733908050584, "learning_rate": 3.9617351424648215e-05, "loss": 0.4224, "step": 484 }, { "epoch": 0.7813129279097866, "grad_norm": 0.3461370454635196, "learning_rate": 3.961295480834683e-05, "loss": 0.4032, "step": 485 }, { "epoch": 0.7829238824003222, "grad_norm": 0.3625961813341553, "learning_rate": 3.960853332441981e-05, "loss": 0.4031, "step": 486 }, { "epoch": 0.7845348368908578, "grad_norm": 0.329895268062928, "learning_rate": 3.960408697847324e-05, "loss": 0.4115, "step": 487 }, { "epoch": 0.7861457913813935, "grad_norm": 0.38660120590435787, "learning_rate": 3.959961577614474e-05, "loss": 0.4166, "step": 488 }, { "epoch": 0.7877567458719291, "grad_norm": 0.33405951123771366, "learning_rate": 3.9595119723103416e-05, "loss": 0.4125, "step": 489 }, { "epoch": 0.7893677003624647, "grad_norm": 0.37519732027389735, "learning_rate": 3.9590598825049896e-05, "loss": 0.4171, "step": 490 }, { "epoch": 0.7909786548530005, "grad_norm": 0.3309560769864468, "learning_rate": 3.95860530877163e-05, "loss": 0.3816, "step": 491 }, { "epoch": 0.7925896093435361, "grad_norm": 0.5272231298290175, "learning_rate": 3.958148251686628e-05, "loss": 0.4174, "step": 492 }, { "epoch": 0.7942005638340717, "grad_norm": 0.3928743178231115, "learning_rate": 3.9576887118294915e-05, "loss": 0.4315, "step": 493 }, { "epoch": 0.7958115183246073, "grad_norm": 0.4279333846270503, "learning_rate": 3.957226689782882e-05, "loss": 0.3968, "step": 494 }, { "epoch": 0.797422472815143, "grad_norm": 0.5131288201799059, "learning_rate": 3.956762186132604e-05, "loss": 0.4145, "step": 495 }, { "epoch": 0.7990334273056786, "grad_norm": 0.4048881402151487, "learning_rate": 3.9562952014676116e-05, "loss": 0.4184, "step": 496 }, { "epoch": 0.8006443817962142, "grad_norm": 0.5006403294292532, "learning_rate": 3.955825736380002e-05, "loss": 0.4098, "step": 497 }, { "epoch": 0.8022553362867499, "grad_norm": 0.45269029887323053, "learning_rate": 3.95535379146502e-05, "loss": 0.3942, "step": 498 }, { "epoch": 0.8038662907772856, "grad_norm": 0.4401085637177939, "learning_rate": 3.9548793673210515e-05, "loss": 0.4112, "step": 499 }, { "epoch": 0.8054772452678212, "grad_norm": 0.4536636237917159, "learning_rate": 3.954402464549628e-05, "loss": 0.4069, "step": 500 }, { "epoch": 0.8070881997583568, "grad_norm": 0.4549086907445366, "learning_rate": 3.9539230837554253e-05, "loss": 0.3907, "step": 501 }, { "epoch": 0.8086991542488925, "grad_norm": 0.4852155503335648, "learning_rate": 3.953441225546257e-05, "loss": 0.3988, "step": 502 }, { "epoch": 0.8103101087394281, "grad_norm": 0.44412599397084784, "learning_rate": 3.95295689053308e-05, "loss": 0.4119, "step": 503 }, { "epoch": 0.8119210632299637, "grad_norm": 0.44492278549668857, "learning_rate": 3.9524700793299926e-05, "loss": 0.4121, "step": 504 }, { "epoch": 0.8135320177204994, "grad_norm": 0.34508494742909007, "learning_rate": 3.951980792554231e-05, "loss": 0.4033, "step": 505 }, { "epoch": 0.815142972211035, "grad_norm": 0.4855681935693533, "learning_rate": 3.9514890308261706e-05, "loss": 0.4152, "step": 506 }, { "epoch": 0.8167539267015707, "grad_norm": 0.3132705383046582, "learning_rate": 3.9509947947693266e-05, "loss": 0.3828, "step": 507 }, { "epoch": 0.8183648811921064, "grad_norm": 0.4283053623030758, "learning_rate": 3.950498085010348e-05, "loss": 0.4005, "step": 508 }, { "epoch": 0.819975835682642, "grad_norm": 0.3609521715579133, "learning_rate": 3.949998902179024e-05, "loss": 0.4155, "step": 509 }, { "epoch": 0.8215867901731776, "grad_norm": 0.36832356239597386, "learning_rate": 3.9494972469082764e-05, "loss": 0.4141, "step": 510 }, { "epoch": 0.8231977446637132, "grad_norm": 0.4503424316996966, "learning_rate": 3.948993119834164e-05, "loss": 0.4112, "step": 511 }, { "epoch": 0.8248086991542489, "grad_norm": 0.41571779430634015, "learning_rate": 3.948486521595878e-05, "loss": 0.427, "step": 512 }, { "epoch": 0.8264196536447845, "grad_norm": 0.3883909259491303, "learning_rate": 3.9479774528357445e-05, "loss": 0.3969, "step": 513 }, { "epoch": 0.8280306081353201, "grad_norm": 0.3932810351114709, "learning_rate": 3.9474659141992197e-05, "loss": 0.4121, "step": 514 }, { "epoch": 0.8296415626258559, "grad_norm": 0.4308748780875734, "learning_rate": 3.946951906334895e-05, "loss": 0.4076, "step": 515 }, { "epoch": 0.8312525171163915, "grad_norm": 0.35777478573661275, "learning_rate": 3.946435429894488e-05, "loss": 0.4132, "step": 516 }, { "epoch": 0.8328634716069271, "grad_norm": 0.48261083312441627, "learning_rate": 3.94591648553285e-05, "loss": 0.4006, "step": 517 }, { "epoch": 0.8344744260974627, "grad_norm": 0.40863315410267975, "learning_rate": 3.94539507390796e-05, "loss": 0.4063, "step": 518 }, { "epoch": 0.8360853805879984, "grad_norm": 0.43340425011865114, "learning_rate": 3.944871195680926e-05, "loss": 0.4138, "step": 519 }, { "epoch": 0.837696335078534, "grad_norm": 0.39224215584003436, "learning_rate": 3.9443448515159815e-05, "loss": 0.3847, "step": 520 }, { "epoch": 0.8393072895690696, "grad_norm": 0.4188929922999027, "learning_rate": 3.9438160420804886e-05, "loss": 0.4059, "step": 521 }, { "epoch": 0.8409182440596054, "grad_norm": 0.41744558940813187, "learning_rate": 3.943284768044935e-05, "loss": 0.4052, "step": 522 }, { "epoch": 0.842529198550141, "grad_norm": 0.46993836475096185, "learning_rate": 3.942751030082932e-05, "loss": 0.4243, "step": 523 }, { "epoch": 0.8441401530406766, "grad_norm": 0.36251580903388136, "learning_rate": 3.942214828871216e-05, "loss": 0.4198, "step": 524 }, { "epoch": 0.8457511075312123, "grad_norm": 0.3940717167893435, "learning_rate": 3.9416761650896456e-05, "loss": 0.4082, "step": 525 }, { "epoch": 0.8473620620217479, "grad_norm": 0.4274934534112222, "learning_rate": 3.941135039421204e-05, "loss": 0.3946, "step": 526 }, { "epoch": 0.8489730165122835, "grad_norm": 0.36645772020289197, "learning_rate": 3.940591452551993e-05, "loss": 0.4068, "step": 527 }, { "epoch": 0.8505839710028191, "grad_norm": 0.4316640291709618, "learning_rate": 3.9400454051712375e-05, "loss": 0.4009, "step": 528 }, { "epoch": 0.8521949254933548, "grad_norm": 0.34398127722906846, "learning_rate": 3.939496897971281e-05, "loss": 0.3907, "step": 529 }, { "epoch": 0.8538058799838905, "grad_norm": 0.40687953024824, "learning_rate": 3.938945931647585e-05, "loss": 0.3901, "step": 530 }, { "epoch": 0.8554168344744261, "grad_norm": 0.35490989918542215, "learning_rate": 3.9383925068987306e-05, "loss": 0.3928, "step": 531 }, { "epoch": 0.8570277889649618, "grad_norm": 0.3548604235172149, "learning_rate": 3.937836624426414e-05, "loss": 0.4037, "step": 532 }, { "epoch": 0.8586387434554974, "grad_norm": 0.3846710598884646, "learning_rate": 3.9372782849354496e-05, "loss": 0.4046, "step": 533 }, { "epoch": 0.860249697946033, "grad_norm": 0.46337690960360517, "learning_rate": 3.936717489133768e-05, "loss": 0.4049, "step": 534 }, { "epoch": 0.8618606524365686, "grad_norm": 0.3347915864544789, "learning_rate": 3.936154237732409e-05, "loss": 0.4199, "step": 535 }, { "epoch": 0.8634716069271043, "grad_norm": 0.4109719486832367, "learning_rate": 3.9355885314455316e-05, "loss": 0.4145, "step": 536 }, { "epoch": 0.86508256141764, "grad_norm": 0.4039943914910843, "learning_rate": 3.935020370990405e-05, "loss": 0.4094, "step": 537 }, { "epoch": 0.8666935159081756, "grad_norm": 0.33381054763996676, "learning_rate": 3.9344497570874105e-05, "loss": 0.3997, "step": 538 }, { "epoch": 0.8683044703987113, "grad_norm": 0.37323165887081494, "learning_rate": 3.933876690460039e-05, "loss": 0.4003, "step": 539 }, { "epoch": 0.8699154248892469, "grad_norm": 0.4031148093361396, "learning_rate": 3.9333011718348925e-05, "loss": 0.4066, "step": 540 }, { "epoch": 0.8715263793797825, "grad_norm": 0.35075255673198685, "learning_rate": 3.932723201941683e-05, "loss": 0.3941, "step": 541 }, { "epoch": 0.8731373338703181, "grad_norm": 0.40922218226065216, "learning_rate": 3.932142781513227e-05, "loss": 0.4147, "step": 542 }, { "epoch": 0.8747482883608538, "grad_norm": 0.3354099083614772, "learning_rate": 3.9315599112854513e-05, "loss": 0.4039, "step": 543 }, { "epoch": 0.8763592428513894, "grad_norm": 0.3748405697078558, "learning_rate": 3.930974591997387e-05, "loss": 0.4069, "step": 544 }, { "epoch": 0.877970197341925, "grad_norm": 0.3578120297382228, "learning_rate": 3.930386824391173e-05, "loss": 0.4237, "step": 545 }, { "epoch": 0.8795811518324608, "grad_norm": 0.3498596617280635, "learning_rate": 3.9297966092120494e-05, "loss": 0.4201, "step": 546 }, { "epoch": 0.8811921063229964, "grad_norm": 0.32782617117586416, "learning_rate": 3.9292039472083604e-05, "loss": 0.4052, "step": 547 }, { "epoch": 0.882803060813532, "grad_norm": 0.32420421899810425, "learning_rate": 3.928608839131554e-05, "loss": 0.3967, "step": 548 }, { "epoch": 0.8844140153040677, "grad_norm": 0.3610196304954956, "learning_rate": 3.9280112857361785e-05, "loss": 0.4073, "step": 549 }, { "epoch": 0.8860249697946033, "grad_norm": 0.3462247564175289, "learning_rate": 3.927411287779882e-05, "loss": 0.419, "step": 550 }, { "epoch": 0.8876359242851389, "grad_norm": 0.38210585668638986, "learning_rate": 3.926808846023414e-05, "loss": 0.398, "step": 551 }, { "epoch": 0.8892468787756745, "grad_norm": 0.38393152658210905, "learning_rate": 3.926203961230621e-05, "loss": 0.425, "step": 552 }, { "epoch": 0.8908578332662103, "grad_norm": 0.38660450532200585, "learning_rate": 3.925596634168447e-05, "loss": 0.4139, "step": 553 }, { "epoch": 0.8924687877567459, "grad_norm": 0.4455772806196931, "learning_rate": 3.9249868656069346e-05, "loss": 0.4076, "step": 554 }, { "epoch": 0.8940797422472815, "grad_norm": 0.30617516032050024, "learning_rate": 3.9243746563192184e-05, "loss": 0.4047, "step": 555 }, { "epoch": 0.8956906967378172, "grad_norm": 0.4660640316019242, "learning_rate": 3.923760007081532e-05, "loss": 0.4023, "step": 556 }, { "epoch": 0.8973016512283528, "grad_norm": 0.3429596427602513, "learning_rate": 3.9231429186731996e-05, "loss": 0.38, "step": 557 }, { "epoch": 0.8989126057188884, "grad_norm": 0.41289741497886173, "learning_rate": 3.922523391876638e-05, "loss": 0.3973, "step": 558 }, { "epoch": 0.900523560209424, "grad_norm": 0.4494827289948899, "learning_rate": 3.921901427477358e-05, "loss": 0.3997, "step": 559 }, { "epoch": 0.9021345146999598, "grad_norm": 0.33632315973719623, "learning_rate": 3.921277026263959e-05, "loss": 0.409, "step": 560 }, { "epoch": 0.9037454691904954, "grad_norm": 0.5070254674512086, "learning_rate": 3.9206501890281305e-05, "loss": 0.3968, "step": 561 }, { "epoch": 0.905356423681031, "grad_norm": 0.4187055671468531, "learning_rate": 3.920020916564652e-05, "loss": 0.4142, "step": 562 }, { "epoch": 0.9069673781715667, "grad_norm": 0.4460483809169745, "learning_rate": 3.9193892096713886e-05, "loss": 0.3944, "step": 563 }, { "epoch": 0.9085783326621023, "grad_norm": 0.39905515724281176, "learning_rate": 3.918755069149293e-05, "loss": 0.4127, "step": 564 }, { "epoch": 0.9101892871526379, "grad_norm": 0.3437786125257829, "learning_rate": 3.9181184958024045e-05, "loss": 0.4223, "step": 565 }, { "epoch": 0.9118002416431735, "grad_norm": 0.3388066571758449, "learning_rate": 3.917479490437845e-05, "loss": 0.402, "step": 566 }, { "epoch": 0.9134111961337092, "grad_norm": 0.38012535779270673, "learning_rate": 3.9168380538658224e-05, "loss": 0.4161, "step": 567 }, { "epoch": 0.9150221506242449, "grad_norm": 0.3391977799991008, "learning_rate": 3.916194186899626e-05, "loss": 0.4038, "step": 568 }, { "epoch": 0.9166331051147805, "grad_norm": 0.3646302581201986, "learning_rate": 3.915547890355625e-05, "loss": 0.4148, "step": 569 }, { "epoch": 0.9182440596053162, "grad_norm": 0.3135049185381983, "learning_rate": 3.914899165053272e-05, "loss": 0.4111, "step": 570 }, { "epoch": 0.9198550140958518, "grad_norm": 0.3535605107932701, "learning_rate": 3.9142480118150964e-05, "loss": 0.3874, "step": 571 }, { "epoch": 0.9214659685863874, "grad_norm": 0.3418138874943149, "learning_rate": 3.913594431466709e-05, "loss": 0.395, "step": 572 }, { "epoch": 0.9230769230769231, "grad_norm": 0.40111797732706755, "learning_rate": 3.912938424836795e-05, "loss": 0.412, "step": 573 }, { "epoch": 0.9246878775674587, "grad_norm": 0.3565226075692776, "learning_rate": 3.912279992757117e-05, "loss": 0.4047, "step": 574 }, { "epoch": 0.9262988320579943, "grad_norm": 0.43319435674290785, "learning_rate": 3.911619136062515e-05, "loss": 0.3937, "step": 575 }, { "epoch": 0.92790978654853, "grad_norm": 0.40127033071910445, "learning_rate": 3.9109558555909005e-05, "loss": 0.4009, "step": 576 }, { "epoch": 0.9295207410390657, "grad_norm": 0.40228778188193176, "learning_rate": 3.910290152183258e-05, "loss": 0.4035, "step": 577 }, { "epoch": 0.9311316955296013, "grad_norm": 0.41880827476759785, "learning_rate": 3.909622026683647e-05, "loss": 0.3902, "step": 578 }, { "epoch": 0.9327426500201369, "grad_norm": 0.4339612846528539, "learning_rate": 3.9089514799391936e-05, "loss": 0.427, "step": 579 }, { "epoch": 0.9343536045106726, "grad_norm": 0.40920471224475635, "learning_rate": 3.908278512800098e-05, "loss": 0.4053, "step": 580 }, { "epoch": 0.9359645590012082, "grad_norm": 0.3903565760977215, "learning_rate": 3.907603126119627e-05, "loss": 0.3851, "step": 581 }, { "epoch": 0.9375755134917438, "grad_norm": 0.36452212891712393, "learning_rate": 3.9069253207541165e-05, "loss": 0.4102, "step": 582 }, { "epoch": 0.9391864679822794, "grad_norm": 0.40963179376690034, "learning_rate": 3.906245097562968e-05, "loss": 0.4042, "step": 583 }, { "epoch": 0.9407974224728152, "grad_norm": 0.47835605254275715, "learning_rate": 3.9055624574086485e-05, "loss": 0.4083, "step": 584 }, { "epoch": 0.9424083769633508, "grad_norm": 0.35302712606627323, "learning_rate": 3.9048774011566906e-05, "loss": 0.399, "step": 585 }, { "epoch": 0.9440193314538864, "grad_norm": 0.45147544517664695, "learning_rate": 3.904189929675689e-05, "loss": 0.3835, "step": 586 }, { "epoch": 0.9456302859444221, "grad_norm": 0.3576207044412354, "learning_rate": 3.903500043837302e-05, "loss": 0.3983, "step": 587 }, { "epoch": 0.9472412404349577, "grad_norm": 0.47971033814742575, "learning_rate": 3.9028077445162486e-05, "loss": 0.3876, "step": 588 }, { "epoch": 0.9488521949254933, "grad_norm": 0.46648498519940407, "learning_rate": 3.9021130325903076e-05, "loss": 0.4104, "step": 589 }, { "epoch": 0.950463149416029, "grad_norm": 0.36648880409549883, "learning_rate": 3.9014159089403167e-05, "loss": 0.4145, "step": 590 }, { "epoch": 0.9520741039065647, "grad_norm": 0.4553737406525612, "learning_rate": 3.9007163744501723e-05, "loss": 0.4032, "step": 591 }, { "epoch": 0.9536850583971003, "grad_norm": 0.4171288408426286, "learning_rate": 3.900014430006827e-05, "loss": 0.4131, "step": 592 }, { "epoch": 0.9552960128876359, "grad_norm": 0.37296598077856796, "learning_rate": 3.8993100765002886e-05, "loss": 0.4098, "step": 593 }, { "epoch": 0.9569069673781716, "grad_norm": 0.3668599752905086, "learning_rate": 3.8986033148236206e-05, "loss": 0.4006, "step": 594 }, { "epoch": 0.9585179218687072, "grad_norm": 0.32605216315392055, "learning_rate": 3.897894145872939e-05, "loss": 0.389, "step": 595 }, { "epoch": 0.9601288763592428, "grad_norm": 0.3408714924804675, "learning_rate": 3.8971825705474104e-05, "loss": 0.3854, "step": 596 }, { "epoch": 0.9617398308497785, "grad_norm": 0.37330680663486493, "learning_rate": 3.8964685897492566e-05, "loss": 0.4025, "step": 597 }, { "epoch": 0.9633507853403142, "grad_norm": 0.3549667131762123, "learning_rate": 3.895752204383746e-05, "loss": 0.4013, "step": 598 }, { "epoch": 0.9649617398308498, "grad_norm": 0.3660249510077813, "learning_rate": 3.895033415359196e-05, "loss": 0.4057, "step": 599 }, { "epoch": 0.9665726943213854, "grad_norm": 0.39751018282200024, "learning_rate": 3.894312223586974e-05, "loss": 0.4048, "step": 600 }, { "epoch": 0.9681836488119211, "grad_norm": 0.31674666928386697, "learning_rate": 3.8935886299814904e-05, "loss": 0.3884, "step": 601 }, { "epoch": 0.9697946033024567, "grad_norm": 0.42282251139146454, "learning_rate": 3.8928626354602046e-05, "loss": 0.373, "step": 602 }, { "epoch": 0.9714055577929923, "grad_norm": 0.31609372557688337, "learning_rate": 3.8921342409436175e-05, "loss": 0.3946, "step": 603 }, { "epoch": 0.973016512283528, "grad_norm": 0.44022277125892706, "learning_rate": 3.891403447355274e-05, "loss": 0.3961, "step": 604 }, { "epoch": 0.9746274667740636, "grad_norm": 0.39405832611427605, "learning_rate": 3.890670255621761e-05, "loss": 0.3799, "step": 605 }, { "epoch": 0.9762384212645993, "grad_norm": 0.3119144672899951, "learning_rate": 3.889934666672706e-05, "loss": 0.3837, "step": 606 }, { "epoch": 0.9778493757551349, "grad_norm": 0.3896876780989498, "learning_rate": 3.8891966814407745e-05, "loss": 0.3998, "step": 607 }, { "epoch": 0.9794603302456706, "grad_norm": 0.360270898809426, "learning_rate": 3.8884563008616726e-05, "loss": 0.4064, "step": 608 }, { "epoch": 0.9810712847362062, "grad_norm": 0.42631951285751807, "learning_rate": 3.8877135258741416e-05, "loss": 0.392, "step": 609 }, { "epoch": 0.9826822392267418, "grad_norm": 0.4415412873812909, "learning_rate": 3.886968357419961e-05, "loss": 0.4088, "step": 610 }, { "epoch": 0.9842931937172775, "grad_norm": 0.36666132641993976, "learning_rate": 3.886220796443942e-05, "loss": 0.3939, "step": 611 }, { "epoch": 0.9859041482078131, "grad_norm": 0.4202033590716815, "learning_rate": 3.885470843893932e-05, "loss": 0.3816, "step": 612 }, { "epoch": 0.9875151026983487, "grad_norm": 0.3516656089440837, "learning_rate": 3.884718500720808e-05, "loss": 0.3865, "step": 613 }, { "epoch": 0.9891260571888845, "grad_norm": 0.42108876415853347, "learning_rate": 3.8839637678784815e-05, "loss": 0.4058, "step": 614 }, { "epoch": 0.9907370116794201, "grad_norm": 0.38505023099974517, "learning_rate": 3.883206646323892e-05, "loss": 0.4097, "step": 615 }, { "epoch": 0.9923479661699557, "grad_norm": 0.40822421339312126, "learning_rate": 3.882447137017007e-05, "loss": 0.3982, "step": 616 }, { "epoch": 0.9939589206604913, "grad_norm": 0.40702788912386473, "learning_rate": 3.881685240920822e-05, "loss": 0.4071, "step": 617 }, { "epoch": 0.995569875151027, "grad_norm": 0.3723642576686159, "learning_rate": 3.8809209590013606e-05, "loss": 0.3869, "step": 618 }, { "epoch": 0.9971808296415626, "grad_norm": 0.346526709604658, "learning_rate": 3.8801542922276686e-05, "loss": 0.3939, "step": 619 }, { "epoch": 0.9987917841320982, "grad_norm": 0.4059535452467616, "learning_rate": 3.879385241571817e-05, "loss": 0.3963, "step": 620 }, { "epoch": 1.000402738622634, "grad_norm": 0.41606971570933265, "learning_rate": 3.8786138080089e-05, "loss": 0.4643, "step": 621 }, { "epoch": 1.0020136931131696, "grad_norm": 0.4642191778743235, "learning_rate": 3.877839992517031e-05, "loss": 0.3676, "step": 622 }, { "epoch": 1.0036246476037052, "grad_norm": 0.4205733804052529, "learning_rate": 3.8770637960773465e-05, "loss": 0.358, "step": 623 }, { "epoch": 1.0052356020942408, "grad_norm": 0.40959547235243765, "learning_rate": 3.8762852196739994e-05, "loss": 0.3635, "step": 624 }, { "epoch": 1.0068465565847764, "grad_norm": 0.3791990728099719, "learning_rate": 3.875504264294161e-05, "loss": 0.3383, "step": 625 }, { "epoch": 1.0084575110753122, "grad_norm": 0.5130910962490318, "learning_rate": 3.8747209309280195e-05, "loss": 0.3667, "step": 626 }, { "epoch": 1.0100684655658478, "grad_norm": 0.39767900122923594, "learning_rate": 3.873935220568776e-05, "loss": 0.3597, "step": 627 }, { "epoch": 1.0116794200563834, "grad_norm": 0.44532028330504037, "learning_rate": 3.8731471342126495e-05, "loss": 0.356, "step": 628 }, { "epoch": 1.013290374546919, "grad_norm": 0.4136333579628707, "learning_rate": 3.872356672858868e-05, "loss": 0.3356, "step": 629 }, { "epoch": 1.0149013290374547, "grad_norm": 0.40179390937304404, "learning_rate": 3.871563837509672e-05, "loss": 0.3317, "step": 630 }, { "epoch": 1.0165122835279903, "grad_norm": 0.4824835553730714, "learning_rate": 3.870768629170311e-05, "loss": 0.3426, "step": 631 }, { "epoch": 1.018123238018526, "grad_norm": 0.367967163870901, "learning_rate": 3.869971048849046e-05, "loss": 0.3409, "step": 632 }, { "epoch": 1.0197341925090617, "grad_norm": 0.49092699271229584, "learning_rate": 3.8691710975571425e-05, "loss": 0.3658, "step": 633 }, { "epoch": 1.0213451469995973, "grad_norm": 0.344714835725969, "learning_rate": 3.8683687763088745e-05, "loss": 0.3418, "step": 634 }, { "epoch": 1.022956101490133, "grad_norm": 0.43359371603108976, "learning_rate": 3.867564086121519e-05, "loss": 0.3684, "step": 635 }, { "epoch": 1.0245670559806686, "grad_norm": 0.3727355148405864, "learning_rate": 3.866757028015357e-05, "loss": 0.3721, "step": 636 }, { "epoch": 1.0261780104712042, "grad_norm": 0.39174926745367894, "learning_rate": 3.865947603013674e-05, "loss": 0.3309, "step": 637 }, { "epoch": 1.0277889649617398, "grad_norm": 0.3255706942445588, "learning_rate": 3.865135812142753e-05, "loss": 0.3429, "step": 638 }, { "epoch": 1.0293999194522754, "grad_norm": 0.40845531163767246, "learning_rate": 3.86432165643188e-05, "loss": 0.3456, "step": 639 }, { "epoch": 1.0310108739428112, "grad_norm": 0.32752549293401095, "learning_rate": 3.863505136913337e-05, "loss": 0.3561, "step": 640 }, { "epoch": 1.0326218284333468, "grad_norm": 0.4306213011485377, "learning_rate": 3.862686254622405e-05, "loss": 0.3675, "step": 641 }, { "epoch": 1.0342327829238824, "grad_norm": 0.3768521574814054, "learning_rate": 3.8618650105973586e-05, "loss": 0.3551, "step": 642 }, { "epoch": 1.035843737414418, "grad_norm": 0.3398666804464596, "learning_rate": 3.8610414058794695e-05, "loss": 0.3195, "step": 643 }, { "epoch": 1.0374546919049537, "grad_norm": 0.355463485377364, "learning_rate": 3.860215441513001e-05, "loss": 0.3427, "step": 644 }, { "epoch": 1.0390656463954893, "grad_norm": 0.29625164040082436, "learning_rate": 3.8593871185452074e-05, "loss": 0.3446, "step": 645 }, { "epoch": 1.0406766008860249, "grad_norm": 0.3640138737256208, "learning_rate": 3.858556438026335e-05, "loss": 0.3728, "step": 646 }, { "epoch": 1.0422875553765607, "grad_norm": 0.32386946809285116, "learning_rate": 3.8577234010096206e-05, "loss": 0.3468, "step": 647 }, { "epoch": 1.0438985098670963, "grad_norm": 0.36609144049870035, "learning_rate": 3.856888008551285e-05, "loss": 0.3617, "step": 648 }, { "epoch": 1.045509464357632, "grad_norm": 0.4000523742064122, "learning_rate": 3.856050261710539e-05, "loss": 0.3591, "step": 649 }, { "epoch": 1.0471204188481675, "grad_norm": 0.3847983140548682, "learning_rate": 3.8552101615495755e-05, "loss": 0.3566, "step": 650 }, { "epoch": 1.0487313733387031, "grad_norm": 0.38930820839888375, "learning_rate": 3.854367709133575e-05, "loss": 0.3632, "step": 651 }, { "epoch": 1.0503423278292388, "grad_norm": 0.3309535311423929, "learning_rate": 3.853522905530698e-05, "loss": 0.3477, "step": 652 }, { "epoch": 1.0519532823197744, "grad_norm": 0.36099065328392765, "learning_rate": 3.8526757518120846e-05, "loss": 0.3498, "step": 653 }, { "epoch": 1.0535642368103102, "grad_norm": 0.37075788058964326, "learning_rate": 3.8518262490518585e-05, "loss": 0.3658, "step": 654 }, { "epoch": 1.0551751913008458, "grad_norm": 0.34165627904419094, "learning_rate": 3.8509743983271196e-05, "loss": 0.3506, "step": 655 }, { "epoch": 1.0567861457913814, "grad_norm": 0.30950248679253206, "learning_rate": 3.8501202007179447e-05, "loss": 0.3307, "step": 656 }, { "epoch": 1.058397100281917, "grad_norm": 0.3237159191997492, "learning_rate": 3.8492636573073866e-05, "loss": 0.3319, "step": 657 }, { "epoch": 1.0600080547724526, "grad_norm": 0.3517039667110292, "learning_rate": 3.8484047691814724e-05, "loss": 0.3707, "step": 658 }, { "epoch": 1.0616190092629882, "grad_norm": 0.339096140285127, "learning_rate": 3.847543537429202e-05, "loss": 0.3516, "step": 659 }, { "epoch": 1.0632299637535239, "grad_norm": 0.38263230843949914, "learning_rate": 3.8466799631425474e-05, "loss": 0.3811, "step": 660 }, { "epoch": 1.0648409182440597, "grad_norm": 0.36914183180517873, "learning_rate": 3.8458140474164503e-05, "loss": 0.3729, "step": 661 }, { "epoch": 1.0664518727345953, "grad_norm": 0.3636472527013866, "learning_rate": 3.8449457913488205e-05, "loss": 0.344, "step": 662 }, { "epoch": 1.068062827225131, "grad_norm": 0.3219280442977279, "learning_rate": 3.8440751960405365e-05, "loss": 0.3574, "step": 663 }, { "epoch": 1.0696737817156665, "grad_norm": 0.3405685714287699, "learning_rate": 3.843202262595442e-05, "loss": 0.3402, "step": 664 }, { "epoch": 1.0712847362062021, "grad_norm": 0.3498606308270493, "learning_rate": 3.842326992120345e-05, "loss": 0.3639, "step": 665 }, { "epoch": 1.0728956906967377, "grad_norm": 0.32207988596751275, "learning_rate": 3.841449385725018e-05, "loss": 0.3409, "step": 666 }, { "epoch": 1.0745066451872733, "grad_norm": 0.3491136718479199, "learning_rate": 3.8405694445221924e-05, "loss": 0.3462, "step": 667 }, { "epoch": 1.0761175996778092, "grad_norm": 0.36346049492233184, "learning_rate": 3.839687169627564e-05, "loss": 0.3675, "step": 668 }, { "epoch": 1.0777285541683448, "grad_norm": 0.3270189746678473, "learning_rate": 3.838802562159783e-05, "loss": 0.3428, "step": 669 }, { "epoch": 1.0793395086588804, "grad_norm": 0.3572958227451133, "learning_rate": 3.837915623240462e-05, "loss": 0.3538, "step": 670 }, { "epoch": 1.080950463149416, "grad_norm": 0.3646744219922745, "learning_rate": 3.8370263539941647e-05, "loss": 0.3521, "step": 671 }, { "epoch": 1.0825614176399516, "grad_norm": 0.3403146272839342, "learning_rate": 3.8361347555484136e-05, "loss": 0.3447, "step": 672 }, { "epoch": 1.0841723721304872, "grad_norm": 0.32652840028581526, "learning_rate": 3.835240829033682e-05, "loss": 0.3304, "step": 673 }, { "epoch": 1.085783326621023, "grad_norm": 0.3400766178269568, "learning_rate": 3.834344575583396e-05, "loss": 0.3599, "step": 674 }, { "epoch": 1.0873942811115587, "grad_norm": 0.38815523922016976, "learning_rate": 3.833445996333932e-05, "loss": 0.3359, "step": 675 }, { "epoch": 1.0890052356020943, "grad_norm": 0.41704665471764435, "learning_rate": 3.832545092424615e-05, "loss": 0.3596, "step": 676 }, { "epoch": 1.09061619009263, "grad_norm": 0.32531971390606135, "learning_rate": 3.831641864997717e-05, "loss": 0.3263, "step": 677 }, { "epoch": 1.0922271445831655, "grad_norm": 0.3585802875329914, "learning_rate": 3.830736315198457e-05, "loss": 0.3641, "step": 678 }, { "epoch": 1.093838099073701, "grad_norm": 0.31768810788040425, "learning_rate": 3.8298284441749985e-05, "loss": 0.3707, "step": 679 }, { "epoch": 1.0954490535642367, "grad_norm": 0.32746796457773686, "learning_rate": 3.828918253078448e-05, "loss": 0.3433, "step": 680 }, { "epoch": 1.0970600080547726, "grad_norm": 0.3224365136634425, "learning_rate": 3.828005743062853e-05, "loss": 0.3825, "step": 681 }, { "epoch": 1.0986709625453082, "grad_norm": 0.3486055061501249, "learning_rate": 3.827090915285202e-05, "loss": 0.3626, "step": 682 }, { "epoch": 1.1002819170358438, "grad_norm": 0.33829450630818075, "learning_rate": 3.826173770905422e-05, "loss": 0.35, "step": 683 }, { "epoch": 1.1018928715263794, "grad_norm": 0.31882058047495504, "learning_rate": 3.825254311086377e-05, "loss": 0.3405, "step": 684 }, { "epoch": 1.103503826016915, "grad_norm": 0.37226089185500716, "learning_rate": 3.8243325369938674e-05, "loss": 0.3469, "step": 685 }, { "epoch": 1.1051147805074506, "grad_norm": 0.4101836669177939, "learning_rate": 3.823408449796627e-05, "loss": 0.345, "step": 686 }, { "epoch": 1.1067257349979862, "grad_norm": 0.3137847522951066, "learning_rate": 3.822482050666322e-05, "loss": 0.3221, "step": 687 }, { "epoch": 1.108336689488522, "grad_norm": 0.36142432810250685, "learning_rate": 3.821553340777553e-05, "loss": 0.3635, "step": 688 }, { "epoch": 1.1099476439790577, "grad_norm": 0.33266260220336297, "learning_rate": 3.820622321307847e-05, "loss": 0.3545, "step": 689 }, { "epoch": 1.1115585984695933, "grad_norm": 0.31351708791895283, "learning_rate": 3.8196889934376617e-05, "loss": 0.3681, "step": 690 }, { "epoch": 1.1131695529601289, "grad_norm": 0.28063065842857104, "learning_rate": 3.818753358350379e-05, "loss": 0.3443, "step": 691 }, { "epoch": 1.1147805074506645, "grad_norm": 0.36353977199002524, "learning_rate": 3.8178154172323094e-05, "loss": 0.3753, "step": 692 }, { "epoch": 1.1163914619412, "grad_norm": 0.2768651552529601, "learning_rate": 3.8168751712726846e-05, "loss": 0.3446, "step": 693 }, { "epoch": 1.1180024164317357, "grad_norm": 0.3342687611239813, "learning_rate": 3.815932621663661e-05, "loss": 0.3724, "step": 694 }, { "epoch": 1.1196133709222715, "grad_norm": 0.33647821887748514, "learning_rate": 3.814987769600312e-05, "loss": 0.3455, "step": 695 }, { "epoch": 1.1212243254128071, "grad_norm": 0.31273923882238536, "learning_rate": 3.814040616280636e-05, "loss": 0.3462, "step": 696 }, { "epoch": 1.1228352799033428, "grad_norm": 0.34039417308107917, "learning_rate": 3.8130911629055443e-05, "loss": 0.3398, "step": 697 }, { "epoch": 1.1244462343938784, "grad_norm": 0.33579456080236614, "learning_rate": 3.812139410678866e-05, "loss": 0.3596, "step": 698 }, { "epoch": 1.126057188884414, "grad_norm": 0.3230165075161485, "learning_rate": 3.811185360807347e-05, "loss": 0.3497, "step": 699 }, { "epoch": 1.1276681433749496, "grad_norm": 0.3241972598923344, "learning_rate": 3.810229014500643e-05, "loss": 0.3324, "step": 700 }, { "epoch": 1.1292790978654854, "grad_norm": 0.3296772176461001, "learning_rate": 3.809270372971323e-05, "loss": 0.3506, "step": 701 }, { "epoch": 1.130890052356021, "grad_norm": 0.37285713064822357, "learning_rate": 3.8083094374348676e-05, "loss": 0.3732, "step": 702 }, { "epoch": 1.1325010068465566, "grad_norm": 0.2874339346820618, "learning_rate": 3.807346209109663e-05, "loss": 0.3483, "step": 703 }, { "epoch": 1.1341119613370922, "grad_norm": 0.37713299713195747, "learning_rate": 3.8063806892170055e-05, "loss": 0.3707, "step": 704 }, { "epoch": 1.1357229158276279, "grad_norm": 0.36683564876617375, "learning_rate": 3.805412878981095e-05, "loss": 0.3412, "step": 705 }, { "epoch": 1.1373338703181635, "grad_norm": 0.37846828032041174, "learning_rate": 3.804442779629035e-05, "loss": 0.3823, "step": 706 }, { "epoch": 1.138944824808699, "grad_norm": 0.39303680121813295, "learning_rate": 3.803470392390834e-05, "loss": 0.335, "step": 707 }, { "epoch": 1.140555779299235, "grad_norm": 0.32030974316744143, "learning_rate": 3.8024957184993986e-05, "loss": 0.3496, "step": 708 }, { "epoch": 1.1421667337897705, "grad_norm": 0.40038802750352454, "learning_rate": 3.8015187591905356e-05, "loss": 0.3631, "step": 709 }, { "epoch": 1.1437776882803061, "grad_norm": 0.3889019319477048, "learning_rate": 3.800539515702949e-05, "loss": 0.3617, "step": 710 }, { "epoch": 1.1453886427708417, "grad_norm": 0.3372232290880548, "learning_rate": 3.799557989278241e-05, "loss": 0.3763, "step": 711 }, { "epoch": 1.1469995972613773, "grad_norm": 0.37103844691689164, "learning_rate": 3.798574181160907e-05, "loss": 0.3707, "step": 712 }, { "epoch": 1.148610551751913, "grad_norm": 0.3193426723569192, "learning_rate": 3.7975880925983345e-05, "loss": 0.3419, "step": 713 }, { "epoch": 1.1502215062424486, "grad_norm": 0.3735797639088716, "learning_rate": 3.796599724840803e-05, "loss": 0.3612, "step": 714 }, { "epoch": 1.1518324607329844, "grad_norm": 0.3144326051598097, "learning_rate": 3.795609079141484e-05, "loss": 0.344, "step": 715 }, { "epoch": 1.15344341522352, "grad_norm": 0.3226419021515401, "learning_rate": 3.794616156756433e-05, "loss": 0.3352, "step": 716 }, { "epoch": 1.1550543697140556, "grad_norm": 0.2952244005389135, "learning_rate": 3.793620958944596e-05, "loss": 0.369, "step": 717 }, { "epoch": 1.1566653242045912, "grad_norm": 0.3017482997087285, "learning_rate": 3.792623486967802e-05, "loss": 0.3554, "step": 718 }, { "epoch": 1.1582762786951268, "grad_norm": 0.3332567088466206, "learning_rate": 3.791623742090765e-05, "loss": 0.3549, "step": 719 }, { "epoch": 1.1598872331856624, "grad_norm": 0.3496541869504861, "learning_rate": 3.790621725581079e-05, "loss": 0.3477, "step": 720 }, { "epoch": 1.161498187676198, "grad_norm": 0.29711261872162453, "learning_rate": 3.7896174387092194e-05, "loss": 0.338, "step": 721 }, { "epoch": 1.163109142166734, "grad_norm": 0.35281267419179585, "learning_rate": 3.788610882748539e-05, "loss": 0.329, "step": 722 }, { "epoch": 1.1647200966572695, "grad_norm": 0.40685923102090615, "learning_rate": 3.78760205897527e-05, "loss": 0.4075, "step": 723 }, { "epoch": 1.166331051147805, "grad_norm": 0.32685402009954984, "learning_rate": 3.786590968668518e-05, "loss": 0.3441, "step": 724 }, { "epoch": 1.1679420056383407, "grad_norm": 0.30475399585683605, "learning_rate": 3.785577613110264e-05, "loss": 0.3271, "step": 725 }, { "epoch": 1.1695529601288763, "grad_norm": 0.39855984307309933, "learning_rate": 3.784561993585358e-05, "loss": 0.3795, "step": 726 }, { "epoch": 1.171163914619412, "grad_norm": 0.320784768450466, "learning_rate": 3.783544111381524e-05, "loss": 0.3476, "step": 727 }, { "epoch": 1.1727748691099475, "grad_norm": 0.398225649315675, "learning_rate": 3.782523967789354e-05, "loss": 0.3998, "step": 728 }, { "epoch": 1.1743858236004834, "grad_norm": 0.33274129125722207, "learning_rate": 3.781501564102305e-05, "loss": 0.3594, "step": 729 }, { "epoch": 1.175996778091019, "grad_norm": 0.29733991185377096, "learning_rate": 3.7804769016167036e-05, "loss": 0.329, "step": 730 }, { "epoch": 1.1776077325815546, "grad_norm": 0.34843064282370634, "learning_rate": 3.779449981631737e-05, "loss": 0.3725, "step": 731 }, { "epoch": 1.1792186870720902, "grad_norm": 0.29488591686422405, "learning_rate": 3.7784208054494554e-05, "loss": 0.3618, "step": 732 }, { "epoch": 1.1808296415626258, "grad_norm": 0.32309697260812176, "learning_rate": 3.777389374374772e-05, "loss": 0.3567, "step": 733 }, { "epoch": 1.1824405960531614, "grad_norm": 0.2949421891798728, "learning_rate": 3.776355689715455e-05, "loss": 0.3425, "step": 734 }, { "epoch": 1.184051550543697, "grad_norm": 0.3234000905104195, "learning_rate": 3.775319752782133e-05, "loss": 0.3556, "step": 735 }, { "epoch": 1.1856625050342329, "grad_norm": 0.31297392729871315, "learning_rate": 3.7742815648882906e-05, "loss": 0.3596, "step": 736 }, { "epoch": 1.1872734595247685, "grad_norm": 0.3651886549670445, "learning_rate": 3.773241127350264e-05, "loss": 0.3675, "step": 737 }, { "epoch": 1.188884414015304, "grad_norm": 0.30965174965919956, "learning_rate": 3.772198441487243e-05, "loss": 0.343, "step": 738 }, { "epoch": 1.1904953685058397, "grad_norm": 0.3346542890794813, "learning_rate": 3.771153508621269e-05, "loss": 0.3554, "step": 739 }, { "epoch": 1.1921063229963753, "grad_norm": 0.3132631499207701, "learning_rate": 3.770106330077231e-05, "loss": 0.3428, "step": 740 }, { "epoch": 1.193717277486911, "grad_norm": 0.36626598122283643, "learning_rate": 3.769056907182866e-05, "loss": 0.3674, "step": 741 }, { "epoch": 1.1953282319774465, "grad_norm": 0.3296955790235176, "learning_rate": 3.768005241268757e-05, "loss": 0.3422, "step": 742 }, { "epoch": 1.1969391864679824, "grad_norm": 0.3214817268876617, "learning_rate": 3.76695133366833e-05, "loss": 0.3456, "step": 743 }, { "epoch": 1.198550140958518, "grad_norm": 0.2844760445477903, "learning_rate": 3.7658951857178544e-05, "loss": 0.3428, "step": 744 }, { "epoch": 1.2001610954490536, "grad_norm": 0.3371872324687368, "learning_rate": 3.764836798756439e-05, "loss": 0.3618, "step": 745 }, { "epoch": 1.2017720499395892, "grad_norm": 0.2908980367000638, "learning_rate": 3.763776174126031e-05, "loss": 0.3335, "step": 746 }, { "epoch": 1.2033830044301248, "grad_norm": 0.3261395185565908, "learning_rate": 3.762713313171419e-05, "loss": 0.3677, "step": 747 }, { "epoch": 1.2049939589206604, "grad_norm": 0.31868194245423315, "learning_rate": 3.761648217240221e-05, "loss": 0.3708, "step": 748 }, { "epoch": 1.206604913411196, "grad_norm": 0.29164880164956636, "learning_rate": 3.760580887682892e-05, "loss": 0.3267, "step": 749 }, { "epoch": 1.2082158679017319, "grad_norm": 0.28510156507177037, "learning_rate": 3.7595113258527206e-05, "loss": 0.3437, "step": 750 }, { "epoch": 1.2098268223922675, "grad_norm": 0.2957794927406389, "learning_rate": 3.758439533105822e-05, "loss": 0.3238, "step": 751 }, { "epoch": 1.211437776882803, "grad_norm": 0.348201286744665, "learning_rate": 3.757365510801143e-05, "loss": 0.3804, "step": 752 }, { "epoch": 1.2130487313733387, "grad_norm": 0.31209813548990145, "learning_rate": 3.756289260300456e-05, "loss": 0.3738, "step": 753 }, { "epoch": 1.2146596858638743, "grad_norm": 0.3194574953898529, "learning_rate": 3.755210782968358e-05, "loss": 0.3411, "step": 754 }, { "epoch": 1.21627064035441, "grad_norm": 0.315228642494671, "learning_rate": 3.7541300801722715e-05, "loss": 0.3442, "step": 755 }, { "epoch": 1.2178815948449455, "grad_norm": 0.2897162651173343, "learning_rate": 3.7530471532824385e-05, "loss": 0.3496, "step": 756 }, { "epoch": 1.2194925493354813, "grad_norm": 0.34000075648700634, "learning_rate": 3.751962003671922e-05, "loss": 0.3824, "step": 757 }, { "epoch": 1.221103503826017, "grad_norm": 0.2976452200617564, "learning_rate": 3.750874632716604e-05, "loss": 0.3424, "step": 758 }, { "epoch": 1.2227144583165526, "grad_norm": 0.29249500001948164, "learning_rate": 3.74978504179518e-05, "loss": 0.3723, "step": 759 }, { "epoch": 1.2243254128070882, "grad_norm": 0.3355440693648072, "learning_rate": 3.7486932322891646e-05, "loss": 0.354, "step": 760 }, { "epoch": 1.2259363672976238, "grad_norm": 0.2774173083048779, "learning_rate": 3.747599205582882e-05, "loss": 0.3388, "step": 761 }, { "epoch": 1.2275473217881594, "grad_norm": 0.307261387050007, "learning_rate": 3.746502963063469e-05, "loss": 0.3536, "step": 762 }, { "epoch": 1.229158276278695, "grad_norm": 0.2779362555047775, "learning_rate": 3.745404506120872e-05, "loss": 0.3418, "step": 763 }, { "epoch": 1.2307692307692308, "grad_norm": 0.2970571621131108, "learning_rate": 3.744303836147844e-05, "loss": 0.3885, "step": 764 }, { "epoch": 1.2323801852597664, "grad_norm": 0.299475415748665, "learning_rate": 3.743200954539945e-05, "loss": 0.3357, "step": 765 }, { "epoch": 1.233991139750302, "grad_norm": 0.28066511695868956, "learning_rate": 3.7420958626955395e-05, "loss": 0.3186, "step": 766 }, { "epoch": 1.2356020942408377, "grad_norm": 0.32650220462849366, "learning_rate": 3.7409885620157925e-05, "loss": 0.374, "step": 767 }, { "epoch": 1.2372130487313733, "grad_norm": 0.33812078920671956, "learning_rate": 3.739879053904672e-05, "loss": 0.3508, "step": 768 }, { "epoch": 1.2388240032219089, "grad_norm": 0.34383445727592993, "learning_rate": 3.738767339768942e-05, "loss": 0.3779, "step": 769 }, { "epoch": 1.2404349577124447, "grad_norm": 0.3202923747231639, "learning_rate": 3.737653421018168e-05, "loss": 0.3426, "step": 770 }, { "epoch": 1.2420459122029803, "grad_norm": 0.35409199577635775, "learning_rate": 3.736537299064706e-05, "loss": 0.3572, "step": 771 }, { "epoch": 1.243656866693516, "grad_norm": 0.3436578084892543, "learning_rate": 3.735418975323708e-05, "loss": 0.3388, "step": 772 }, { "epoch": 1.2452678211840515, "grad_norm": 0.29421403073429603, "learning_rate": 3.734298451213117e-05, "loss": 0.3496, "step": 773 }, { "epoch": 1.2468787756745872, "grad_norm": 0.3674454300701155, "learning_rate": 3.7331757281536665e-05, "loss": 0.3457, "step": 774 }, { "epoch": 1.2484897301651228, "grad_norm": 0.2658262781818514, "learning_rate": 3.732050807568878e-05, "loss": 0.3353, "step": 775 }, { "epoch": 1.2501006846556586, "grad_norm": 0.3558402384689967, "learning_rate": 3.7309236908850574e-05, "loss": 0.3531, "step": 776 }, { "epoch": 1.251711639146194, "grad_norm": 0.2968753461483703, "learning_rate": 3.729794379531299e-05, "loss": 0.3463, "step": 777 }, { "epoch": 1.2533225936367298, "grad_norm": 0.3068748390315143, "learning_rate": 3.7286628749394754e-05, "loss": 0.3495, "step": 778 }, { "epoch": 1.2549335481272654, "grad_norm": 0.2853593430828942, "learning_rate": 3.727529178544243e-05, "loss": 0.349, "step": 779 }, { "epoch": 1.256544502617801, "grad_norm": 0.36814006315805403, "learning_rate": 3.726393291783036e-05, "loss": 0.3683, "step": 780 }, { "epoch": 1.2581554571083367, "grad_norm": 0.34204662393847385, "learning_rate": 3.7252552160960666e-05, "loss": 0.3395, "step": 781 }, { "epoch": 1.2597664115988723, "grad_norm": 0.25952729501669486, "learning_rate": 3.724114952926322e-05, "loss": 0.3426, "step": 782 }, { "epoch": 1.261377366089408, "grad_norm": 0.3434914251843314, "learning_rate": 3.722972503719561e-05, "loss": 0.3277, "step": 783 }, { "epoch": 1.2629883205799435, "grad_norm": 0.36202104323985773, "learning_rate": 3.7218278699243176e-05, "loss": 0.3722, "step": 784 }, { "epoch": 1.2645992750704793, "grad_norm": 0.35145897106947177, "learning_rate": 3.7206810529918935e-05, "loss": 0.3711, "step": 785 }, { "epoch": 1.266210229561015, "grad_norm": 0.3119487302914577, "learning_rate": 3.7195320543763596e-05, "loss": 0.3352, "step": 786 }, { "epoch": 1.2678211840515505, "grad_norm": 0.35035093132752193, "learning_rate": 3.718380875534552e-05, "loss": 0.3743, "step": 787 }, { "epoch": 1.2694321385420861, "grad_norm": 0.3418946608898889, "learning_rate": 3.71722751792607e-05, "loss": 0.3429, "step": 788 }, { "epoch": 1.2710430930326218, "grad_norm": 0.31270337962677086, "learning_rate": 3.7160719830132794e-05, "loss": 0.344, "step": 789 }, { "epoch": 1.2726540475231576, "grad_norm": 0.3274369342826082, "learning_rate": 3.714914272261302e-05, "loss": 0.3499, "step": 790 }, { "epoch": 1.2742650020136932, "grad_norm": 0.34598565436756357, "learning_rate": 3.7137543871380224e-05, "loss": 0.3829, "step": 791 }, { "epoch": 1.2758759565042288, "grad_norm": 0.28889266818776443, "learning_rate": 3.712592329114079e-05, "loss": 0.3381, "step": 792 }, { "epoch": 1.2774869109947644, "grad_norm": 0.3717773811020485, "learning_rate": 3.7114280996628666e-05, "loss": 0.3721, "step": 793 }, { "epoch": 1.2790978654853, "grad_norm": 0.37135926966232047, "learning_rate": 3.710261700260534e-05, "loss": 0.3718, "step": 794 }, { "epoch": 1.2807088199758356, "grad_norm": 0.33049624744013434, "learning_rate": 3.7090931323859794e-05, "loss": 0.3586, "step": 795 }, { "epoch": 1.2823197744663712, "grad_norm": 0.3101839548871945, "learning_rate": 3.707922397520852e-05, "loss": 0.3397, "step": 796 }, { "epoch": 1.283930728956907, "grad_norm": 0.30737107820444354, "learning_rate": 3.706749497149547e-05, "loss": 0.336, "step": 797 }, { "epoch": 1.2855416834474427, "grad_norm": 0.3019490959641579, "learning_rate": 3.705574432759208e-05, "loss": 0.3512, "step": 798 }, { "epoch": 1.2871526379379783, "grad_norm": 0.3503088570735374, "learning_rate": 3.70439720583972e-05, "loss": 0.3822, "step": 799 }, { "epoch": 1.288763592428514, "grad_norm": 0.311137345604332, "learning_rate": 3.70321781788371e-05, "loss": 0.3641, "step": 800 }, { "epoch": 1.2903745469190495, "grad_norm": 0.31960209562676384, "learning_rate": 3.702036270386547e-05, "loss": 0.3813, "step": 801 }, { "epoch": 1.2919855014095851, "grad_norm": 0.3425160205698806, "learning_rate": 3.700852564846335e-05, "loss": 0.354, "step": 802 }, { "epoch": 1.2935964559001207, "grad_norm": 0.30986308980528005, "learning_rate": 3.6996667027639174e-05, "loss": 0.3579, "step": 803 }, { "epoch": 1.2952074103906566, "grad_norm": 0.3908250025109488, "learning_rate": 3.6984786856428705e-05, "loss": 0.3531, "step": 804 }, { "epoch": 1.2968183648811922, "grad_norm": 0.3475929563748349, "learning_rate": 3.697288514989502e-05, "loss": 0.374, "step": 805 }, { "epoch": 1.2984293193717278, "grad_norm": 0.36294126212027755, "learning_rate": 3.696096192312852e-05, "loss": 0.3576, "step": 806 }, { "epoch": 1.3000402738622634, "grad_norm": 0.32338264256863225, "learning_rate": 3.694901719124688e-05, "loss": 0.3312, "step": 807 }, { "epoch": 1.301651228352799, "grad_norm": 0.35527955178483733, "learning_rate": 3.6937050969395055e-05, "loss": 0.3836, "step": 808 }, { "epoch": 1.3032621828433346, "grad_norm": 0.3628976303009859, "learning_rate": 3.6925063272745214e-05, "loss": 0.3629, "step": 809 }, { "epoch": 1.3048731373338702, "grad_norm": 0.3022352084491082, "learning_rate": 3.6913054116496797e-05, "loss": 0.3605, "step": 810 }, { "epoch": 1.306484091824406, "grad_norm": 0.3393294470418059, "learning_rate": 3.690102351587643e-05, "loss": 0.3512, "step": 811 }, { "epoch": 1.3080950463149417, "grad_norm": 0.28363397227191833, "learning_rate": 3.688897148613794e-05, "loss": 0.3429, "step": 812 }, { "epoch": 1.3097060008054773, "grad_norm": 0.33963097783454826, "learning_rate": 3.687689804256229e-05, "loss": 0.3332, "step": 813 }, { "epoch": 1.3113169552960129, "grad_norm": 0.3600086874002223, "learning_rate": 3.6864803200457646e-05, "loss": 0.3625, "step": 814 }, { "epoch": 1.3129279097865485, "grad_norm": 0.2816481879831538, "learning_rate": 3.685268697515928e-05, "loss": 0.3684, "step": 815 }, { "epoch": 1.314538864277084, "grad_norm": 0.3139691237821825, "learning_rate": 3.684054938202956e-05, "loss": 0.3393, "step": 816 }, { "epoch": 1.3161498187676197, "grad_norm": 0.29165261191102926, "learning_rate": 3.682839043645798e-05, "loss": 0.3228, "step": 817 }, { "epoch": 1.3177607732581555, "grad_norm": 0.2718062507025103, "learning_rate": 3.681621015386108e-05, "loss": 0.3769, "step": 818 }, { "epoch": 1.3193717277486912, "grad_norm": 0.3635524032153051, "learning_rate": 3.680400854968246e-05, "loss": 0.36, "step": 819 }, { "epoch": 1.3209826822392268, "grad_norm": 0.28780321276593324, "learning_rate": 3.679178563939278e-05, "loss": 0.3479, "step": 820 }, { "epoch": 1.3225936367297624, "grad_norm": 0.3238210640271435, "learning_rate": 3.677954143848967e-05, "loss": 0.3767, "step": 821 }, { "epoch": 1.324204591220298, "grad_norm": 0.3080441012695841, "learning_rate": 3.676727596249779e-05, "loss": 0.3561, "step": 822 }, { "epoch": 1.3258155457108336, "grad_norm": 0.32081488942296216, "learning_rate": 3.675498922696876e-05, "loss": 0.3692, "step": 823 }, { "epoch": 1.3274265002013692, "grad_norm": 0.3278673509495895, "learning_rate": 3.6742681247481144e-05, "loss": 0.3231, "step": 824 }, { "epoch": 1.329037454691905, "grad_norm": 0.3096169164731479, "learning_rate": 3.6730352039640476e-05, "loss": 0.3821, "step": 825 }, { "epoch": 1.3306484091824407, "grad_norm": 0.33349256230646473, "learning_rate": 3.671800161907917e-05, "loss": 0.3268, "step": 826 }, { "epoch": 1.3322593636729763, "grad_norm": 0.3295767828308085, "learning_rate": 3.6705630001456556e-05, "loss": 0.335, "step": 827 }, { "epoch": 1.3338703181635119, "grad_norm": 0.2912807748587227, "learning_rate": 3.669323720245884e-05, "loss": 0.3409, "step": 828 }, { "epoch": 1.3354812726540475, "grad_norm": 0.30649973311303386, "learning_rate": 3.668082323779907e-05, "loss": 0.3512, "step": 829 }, { "epoch": 1.337092227144583, "grad_norm": 0.2959906133761005, "learning_rate": 3.6668388123217154e-05, "loss": 0.3612, "step": 830 }, { "epoch": 1.3387031816351187, "grad_norm": 0.29861778533728894, "learning_rate": 3.6655931874479783e-05, "loss": 0.3356, "step": 831 }, { "epoch": 1.3403141361256545, "grad_norm": 0.36174475564403585, "learning_rate": 3.664345450738048e-05, "loss": 0.3857, "step": 832 }, { "epoch": 1.3419250906161901, "grad_norm": 0.2964990585195445, "learning_rate": 3.663095603773952e-05, "loss": 0.3703, "step": 833 }, { "epoch": 1.3435360451067258, "grad_norm": 0.3444117342564229, "learning_rate": 3.6618436481403945e-05, "loss": 0.3477, "step": 834 }, { "epoch": 1.3451469995972614, "grad_norm": 0.2921618958671085, "learning_rate": 3.6605895854247534e-05, "loss": 0.3663, "step": 835 }, { "epoch": 1.346757954087797, "grad_norm": 0.3135599574497408, "learning_rate": 3.659333417217076e-05, "loss": 0.3556, "step": 836 }, { "epoch": 1.3483689085783326, "grad_norm": 0.28548160767634045, "learning_rate": 3.658075145110083e-05, "loss": 0.3401, "step": 837 }, { "epoch": 1.3499798630688682, "grad_norm": 0.33185921583222566, "learning_rate": 3.6568147706991616e-05, "loss": 0.3671, "step": 838 }, { "epoch": 1.351590817559404, "grad_norm": 0.29106614505484624, "learning_rate": 3.655552295582361e-05, "loss": 0.3751, "step": 839 }, { "epoch": 1.3532017720499396, "grad_norm": 0.32102312365154567, "learning_rate": 3.654287721360398e-05, "loss": 0.3854, "step": 840 }, { "epoch": 1.3548127265404752, "grad_norm": 0.28054378659585727, "learning_rate": 3.653021049636648e-05, "loss": 0.3331, "step": 841 }, { "epoch": 1.3564236810310109, "grad_norm": 0.3182024975537731, "learning_rate": 3.65175228201715e-05, "loss": 0.3423, "step": 842 }, { "epoch": 1.3580346355215465, "grad_norm": 0.29631447266060057, "learning_rate": 3.650481420110596e-05, "loss": 0.3454, "step": 843 }, { "epoch": 1.3596455900120823, "grad_norm": 0.39903373865432623, "learning_rate": 3.6492084655283355e-05, "loss": 0.369, "step": 844 }, { "epoch": 1.3612565445026177, "grad_norm": 0.29989201553650346, "learning_rate": 3.647933419884371e-05, "loss": 0.3304, "step": 845 }, { "epoch": 1.3628674989931535, "grad_norm": 0.35056143882565566, "learning_rate": 3.646656284795357e-05, "loss": 0.3723, "step": 846 }, { "epoch": 1.3644784534836891, "grad_norm": 0.3441551125299749, "learning_rate": 3.645377061880595e-05, "loss": 0.3557, "step": 847 }, { "epoch": 1.3660894079742247, "grad_norm": 0.3274845208948303, "learning_rate": 3.644095752762036e-05, "loss": 0.3399, "step": 848 }, { "epoch": 1.3677003624647603, "grad_norm": 0.3024421060736529, "learning_rate": 3.642812359064276e-05, "loss": 0.3596, "step": 849 }, { "epoch": 1.369311316955296, "grad_norm": 0.31600146060711926, "learning_rate": 3.641526882414553e-05, "loss": 0.3423, "step": 850 }, { "epoch": 1.3709222714458318, "grad_norm": 0.30071046711567967, "learning_rate": 3.640239324442746e-05, "loss": 0.3424, "step": 851 }, { "epoch": 1.3725332259363672, "grad_norm": 0.3203775229603514, "learning_rate": 3.638949686781374e-05, "loss": 0.3563, "step": 852 }, { "epoch": 1.374144180426903, "grad_norm": 0.2791617091294289, "learning_rate": 3.6376579710655915e-05, "loss": 0.3473, "step": 853 }, { "epoch": 1.3757551349174386, "grad_norm": 0.3262506737731935, "learning_rate": 3.63636417893319e-05, "loss": 0.3535, "step": 854 }, { "epoch": 1.3773660894079742, "grad_norm": 0.37963304107161205, "learning_rate": 3.6350683120245906e-05, "loss": 0.3877, "step": 855 }, { "epoch": 1.3789770438985098, "grad_norm": 0.26345344067107274, "learning_rate": 3.633770371982848e-05, "loss": 0.3344, "step": 856 }, { "epoch": 1.3805879983890454, "grad_norm": 0.3589958673951733, "learning_rate": 3.632470360453643e-05, "loss": 0.3624, "step": 857 }, { "epoch": 1.3821989528795813, "grad_norm": 0.2870645821680028, "learning_rate": 3.631168279085286e-05, "loss": 0.3442, "step": 858 }, { "epoch": 1.3838099073701167, "grad_norm": 0.3154900838253569, "learning_rate": 3.629864129528709e-05, "loss": 0.3544, "step": 859 }, { "epoch": 1.3854208618606525, "grad_norm": 0.32587561605795656, "learning_rate": 3.6285579134374655e-05, "loss": 0.3365, "step": 860 }, { "epoch": 1.387031816351188, "grad_norm": 0.28524552874452264, "learning_rate": 3.627249632467733e-05, "loss": 0.3513, "step": 861 }, { "epoch": 1.3886427708417237, "grad_norm": 0.3043822768243937, "learning_rate": 3.625939288278304e-05, "loss": 0.3505, "step": 862 }, { "epoch": 1.3902537253322593, "grad_norm": 0.3283867868538535, "learning_rate": 3.6246268825305886e-05, "loss": 0.328, "step": 863 }, { "epoch": 1.391864679822795, "grad_norm": 0.27028256421648467, "learning_rate": 3.6233124168886094e-05, "loss": 0.337, "step": 864 }, { "epoch": 1.3934756343133308, "grad_norm": 0.31823659410494604, "learning_rate": 3.621995893019003e-05, "loss": 0.3483, "step": 865 }, { "epoch": 1.3950865888038662, "grad_norm": 0.2853031194725599, "learning_rate": 3.620677312591012e-05, "loss": 0.3672, "step": 866 }, { "epoch": 1.396697543294402, "grad_norm": 0.3703793870257088, "learning_rate": 3.61935667727649e-05, "loss": 0.3466, "step": 867 }, { "epoch": 1.3983084977849376, "grad_norm": 0.3194336151665436, "learning_rate": 3.6180339887498953e-05, "loss": 0.3268, "step": 868 }, { "epoch": 1.3999194522754732, "grad_norm": 0.33386957554709146, "learning_rate": 3.616709248688288e-05, "loss": 0.3419, "step": 869 }, { "epoch": 1.4015304067660088, "grad_norm": 0.30916892946803903, "learning_rate": 3.61538245877133e-05, "loss": 0.3446, "step": 870 }, { "epoch": 1.4031413612565444, "grad_norm": 0.29073836232064454, "learning_rate": 3.614053620681284e-05, "loss": 0.3247, "step": 871 }, { "epoch": 1.4047523157470803, "grad_norm": 0.3341684803586826, "learning_rate": 3.6127227361030076e-05, "loss": 0.3811, "step": 872 }, { "epoch": 1.4063632702376159, "grad_norm": 0.31287760315769936, "learning_rate": 3.611389806723953e-05, "loss": 0.3606, "step": 873 }, { "epoch": 1.4079742247281515, "grad_norm": 0.3063668454004494, "learning_rate": 3.610054834234167e-05, "loss": 0.3625, "step": 874 }, { "epoch": 1.409585179218687, "grad_norm": 0.3168490265948009, "learning_rate": 3.608717820326285e-05, "loss": 0.373, "step": 875 }, { "epoch": 1.4111961337092227, "grad_norm": 0.3722746286249624, "learning_rate": 3.6073787666955326e-05, "loss": 0.3765, "step": 876 }, { "epoch": 1.4128070881997583, "grad_norm": 0.2752473149335231, "learning_rate": 3.6060376750397187e-05, "loss": 0.3144, "step": 877 }, { "epoch": 1.414418042690294, "grad_norm": 0.35239569804844095, "learning_rate": 3.6046945470592395e-05, "loss": 0.3759, "step": 878 }, { "epoch": 1.4160289971808298, "grad_norm": 0.3259555420279317, "learning_rate": 3.6033493844570704e-05, "loss": 0.3643, "step": 879 }, { "epoch": 1.4176399516713654, "grad_norm": 0.28683471938292443, "learning_rate": 3.602002188938769e-05, "loss": 0.3337, "step": 880 }, { "epoch": 1.419250906161901, "grad_norm": 0.32883729844020054, "learning_rate": 3.6006529622124694e-05, "loss": 0.345, "step": 881 }, { "epoch": 1.4208618606524366, "grad_norm": 0.2947797991693968, "learning_rate": 3.59930170598888e-05, "loss": 0.3356, "step": 882 }, { "epoch": 1.4224728151429722, "grad_norm": 0.2826135754232561, "learning_rate": 3.597948421981283e-05, "loss": 0.3107, "step": 883 }, { "epoch": 1.4240837696335078, "grad_norm": 0.3096811885777464, "learning_rate": 3.596593111905533e-05, "loss": 0.3754, "step": 884 }, { "epoch": 1.4256947241240434, "grad_norm": 0.2813838416003549, "learning_rate": 3.5952357774800526e-05, "loss": 0.3484, "step": 885 }, { "epoch": 1.4273056786145792, "grad_norm": 0.3147799342493275, "learning_rate": 3.5938764204258306e-05, "loss": 0.3464, "step": 886 }, { "epoch": 1.4289166331051149, "grad_norm": 0.27371781021865504, "learning_rate": 3.5925150424664206e-05, "loss": 0.3348, "step": 887 }, { "epoch": 1.4305275875956505, "grad_norm": 0.2929800361539922, "learning_rate": 3.591151645327939e-05, "loss": 0.3638, "step": 888 }, { "epoch": 1.432138542086186, "grad_norm": 0.3091933237540173, "learning_rate": 3.589786230739062e-05, "loss": 0.3751, "step": 889 }, { "epoch": 1.4337494965767217, "grad_norm": 0.30038308262391095, "learning_rate": 3.5884188004310244e-05, "loss": 0.3685, "step": 890 }, { "epoch": 1.4353604510672573, "grad_norm": 0.2702977590995165, "learning_rate": 3.587049356137615e-05, "loss": 0.3421, "step": 891 }, { "epoch": 1.436971405557793, "grad_norm": 0.31963763306891574, "learning_rate": 3.5856778995951794e-05, "loss": 0.3574, "step": 892 }, { "epoch": 1.4385823600483287, "grad_norm": 0.2789441221702962, "learning_rate": 3.58430443254261e-05, "loss": 0.3707, "step": 893 }, { "epoch": 1.4401933145388643, "grad_norm": 0.32066299952132377, "learning_rate": 3.582928956721352e-05, "loss": 0.3041, "step": 894 }, { "epoch": 1.4418042690294, "grad_norm": 0.2805130759068286, "learning_rate": 3.581551473875397e-05, "loss": 0.3738, "step": 895 }, { "epoch": 1.4434152235199356, "grad_norm": 0.32812904359127354, "learning_rate": 3.580171985751281e-05, "loss": 0.346, "step": 896 }, { "epoch": 1.4450261780104712, "grad_norm": 0.27111375470537674, "learning_rate": 3.578790494098081e-05, "loss": 0.3306, "step": 897 }, { "epoch": 1.4466371325010068, "grad_norm": 0.38149897363461827, "learning_rate": 3.5774070006674164e-05, "loss": 0.3516, "step": 898 }, { "epoch": 1.4482480869915424, "grad_norm": 0.2892182943854451, "learning_rate": 3.576021507213444e-05, "loss": 0.3594, "step": 899 }, { "epoch": 1.4498590414820782, "grad_norm": 0.3372883515472436, "learning_rate": 3.574634015492857e-05, "loss": 0.3532, "step": 900 }, { "epoch": 1.4514699959726138, "grad_norm": 0.26358417061513745, "learning_rate": 3.57324452726488e-05, "loss": 0.3288, "step": 901 }, { "epoch": 1.4530809504631494, "grad_norm": 0.31566876672673594, "learning_rate": 3.571853044291271e-05, "loss": 0.3362, "step": 902 }, { "epoch": 1.454691904953685, "grad_norm": 0.32363249428251567, "learning_rate": 3.5704595683363187e-05, "loss": 0.3781, "step": 903 }, { "epoch": 1.4563028594442207, "grad_norm": 0.3230781499413541, "learning_rate": 3.569064101166835e-05, "loss": 0.3541, "step": 904 }, { "epoch": 1.4579138139347563, "grad_norm": 0.3156721284052765, "learning_rate": 3.567666644552159e-05, "loss": 0.3609, "step": 905 }, { "epoch": 1.4595247684252919, "grad_norm": 0.24785776600781328, "learning_rate": 3.566267200264151e-05, "loss": 0.3287, "step": 906 }, { "epoch": 1.4611357229158277, "grad_norm": 0.3480224482497804, "learning_rate": 3.564865770077193e-05, "loss": 0.3661, "step": 907 }, { "epoch": 1.4627466774063633, "grad_norm": 0.2646519126771718, "learning_rate": 3.563462355768184e-05, "loss": 0.3346, "step": 908 }, { "epoch": 1.464357631896899, "grad_norm": 0.3172875453636419, "learning_rate": 3.562056959116538e-05, "loss": 0.358, "step": 909 }, { "epoch": 1.4659685863874345, "grad_norm": 0.3022971119624446, "learning_rate": 3.560649581904184e-05, "loss": 0.3738, "step": 910 }, { "epoch": 1.4675795408779702, "grad_norm": 0.3594652213181189, "learning_rate": 3.559240225915561e-05, "loss": 0.364, "step": 911 }, { "epoch": 1.4691904953685058, "grad_norm": 0.29252293105230515, "learning_rate": 3.557828892937617e-05, "loss": 0.3324, "step": 912 }, { "epoch": 1.4708014498590414, "grad_norm": 0.33755638959110873, "learning_rate": 3.5564155847598085e-05, "loss": 0.3463, "step": 913 }, { "epoch": 1.4724124043495772, "grad_norm": 0.28967085216937327, "learning_rate": 3.555000303174093e-05, "loss": 0.3529, "step": 914 }, { "epoch": 1.4740233588401128, "grad_norm": 0.3396436534360357, "learning_rate": 3.553583049974933e-05, "loss": 0.3821, "step": 915 }, { "epoch": 1.4756343133306484, "grad_norm": 0.27806521100206133, "learning_rate": 3.55216382695929e-05, "loss": 0.3594, "step": 916 }, { "epoch": 1.477245267821184, "grad_norm": 0.30304501601924577, "learning_rate": 3.550742635926622e-05, "loss": 0.332, "step": 917 }, { "epoch": 1.4788562223117196, "grad_norm": 0.28397653690883456, "learning_rate": 3.549319478678885e-05, "loss": 0.3478, "step": 918 }, { "epoch": 1.4804671768022553, "grad_norm": 0.3534165168824709, "learning_rate": 3.547894357020525e-05, "loss": 0.3502, "step": 919 }, { "epoch": 1.4820781312927909, "grad_norm": 0.30300554924196443, "learning_rate": 3.546467272758479e-05, "loss": 0.3427, "step": 920 }, { "epoch": 1.4836890857833267, "grad_norm": 0.2821833761521695, "learning_rate": 3.5450382277021745e-05, "loss": 0.336, "step": 921 }, { "epoch": 1.4853000402738623, "grad_norm": 0.3067490433896909, "learning_rate": 3.543607223663524e-05, "loss": 0.3478, "step": 922 }, { "epoch": 1.486910994764398, "grad_norm": 0.35973200105393427, "learning_rate": 3.542174262456924e-05, "loss": 0.3744, "step": 923 }, { "epoch": 1.4885219492549335, "grad_norm": 0.3624137649224464, "learning_rate": 3.540739345899252e-05, "loss": 0.3801, "step": 924 }, { "epoch": 1.4901329037454691, "grad_norm": 0.30165267187605543, "learning_rate": 3.5393024758098645e-05, "loss": 0.3286, "step": 925 }, { "epoch": 1.491743858236005, "grad_norm": 0.28896166859244726, "learning_rate": 3.537863654010597e-05, "loss": 0.3387, "step": 926 }, { "epoch": 1.4933548127265404, "grad_norm": 0.3064859067705259, "learning_rate": 3.5364228823257565e-05, "loss": 0.3568, "step": 927 }, { "epoch": 1.4949657672170762, "grad_norm": 0.32160688390289044, "learning_rate": 3.534980162582124e-05, "loss": 0.3688, "step": 928 }, { "epoch": 1.4965767217076118, "grad_norm": 0.3013538609491535, "learning_rate": 3.5335354966089514e-05, "loss": 0.3376, "step": 929 }, { "epoch": 1.4981876761981474, "grad_norm": 0.3057453257092756, "learning_rate": 3.532088886237956e-05, "loss": 0.3426, "step": 930 }, { "epoch": 1.499798630688683, "grad_norm": 0.3070652933287628, "learning_rate": 3.530640333303323e-05, "loss": 0.3572, "step": 931 }, { "epoch": 1.5014095851792186, "grad_norm": 0.303242364147413, "learning_rate": 3.5291898396416984e-05, "loss": 0.3766, "step": 932 }, { "epoch": 1.5030205396697545, "grad_norm": 0.30911330099386514, "learning_rate": 3.5277374070921904e-05, "loss": 0.3606, "step": 933 }, { "epoch": 1.5046314941602899, "grad_norm": 0.26387501860850937, "learning_rate": 3.5262830374963636e-05, "loss": 0.3132, "step": 934 }, { "epoch": 1.5062424486508257, "grad_norm": 0.31366992473226274, "learning_rate": 3.524826732698241e-05, "loss": 0.345, "step": 935 }, { "epoch": 1.5078534031413613, "grad_norm": 0.36305603623046623, "learning_rate": 3.523368494544298e-05, "loss": 0.3481, "step": 936 }, { "epoch": 1.509464357631897, "grad_norm": 0.3028100237793981, "learning_rate": 3.521908324883462e-05, "loss": 0.3494, "step": 937 }, { "epoch": 1.5110753121224325, "grad_norm": 0.34440302527582833, "learning_rate": 3.520446225567108e-05, "loss": 0.3635, "step": 938 }, { "epoch": 1.5126862666129681, "grad_norm": 0.2883451420805599, "learning_rate": 3.518982198449059e-05, "loss": 0.356, "step": 939 }, { "epoch": 1.514297221103504, "grad_norm": 0.3013627859596229, "learning_rate": 3.517516245385582e-05, "loss": 0.3289, "step": 940 }, { "epoch": 1.5159081755940393, "grad_norm": 0.3018221027079222, "learning_rate": 3.516048368235386e-05, "loss": 0.3604, "step": 941 }, { "epoch": 1.5175191300845752, "grad_norm": 0.30511935925361944, "learning_rate": 3.5145785688596184e-05, "loss": 0.3234, "step": 942 }, { "epoch": 1.5191300845751108, "grad_norm": 0.3007192258635043, "learning_rate": 3.513106849121866e-05, "loss": 0.3544, "step": 943 }, { "epoch": 1.5207410390656464, "grad_norm": 0.2641898297732925, "learning_rate": 3.5116332108881486e-05, "loss": 0.3408, "step": 944 }, { "epoch": 1.522351993556182, "grad_norm": 0.3410443834630316, "learning_rate": 3.5101576560269195e-05, "loss": 0.3533, "step": 945 }, { "epoch": 1.5239629480467176, "grad_norm": 0.28303295707105, "learning_rate": 3.508680186409062e-05, "loss": 0.3367, "step": 946 }, { "epoch": 1.5255739025372534, "grad_norm": 0.3345908288880972, "learning_rate": 3.507200803907886e-05, "loss": 0.3861, "step": 947 }, { "epoch": 1.5271848570277888, "grad_norm": 2.8670378142195654, "learning_rate": 3.505719510399129e-05, "loss": 0.4205, "step": 948 }, { "epoch": 1.5287958115183247, "grad_norm": 0.4210670854339028, "learning_rate": 3.504236307760949e-05, "loss": 0.3497, "step": 949 }, { "epoch": 1.5304067660088603, "grad_norm": 0.2976798975045218, "learning_rate": 3.502751197873927e-05, "loss": 0.3504, "step": 950 }, { "epoch": 1.5320177204993959, "grad_norm": 0.3591627202188393, "learning_rate": 3.501264182621061e-05, "loss": 0.3746, "step": 951 }, { "epoch": 1.5336286749899315, "grad_norm": 0.3210770679629104, "learning_rate": 3.499775263887764e-05, "loss": 0.3468, "step": 952 }, { "epoch": 1.535239629480467, "grad_norm": 0.33550226114199827, "learning_rate": 3.4982844435618643e-05, "loss": 0.3508, "step": 953 }, { "epoch": 1.536850583971003, "grad_norm": 0.32509668801494995, "learning_rate": 3.4967917235336e-05, "loss": 0.3508, "step": 954 }, { "epoch": 1.5384615384615383, "grad_norm": 0.338038211291058, "learning_rate": 3.4952971056956186e-05, "loss": 0.3496, "step": 955 }, { "epoch": 1.5400724929520742, "grad_norm": 0.3141904048502338, "learning_rate": 3.4938005919429724e-05, "loss": 0.3418, "step": 956 }, { "epoch": 1.5416834474426098, "grad_norm": 0.2815520350465673, "learning_rate": 3.49230218417312e-05, "loss": 0.3312, "step": 957 }, { "epoch": 1.5432944019331454, "grad_norm": 0.36948297955531856, "learning_rate": 3.49080188428592e-05, "loss": 0.3603, "step": 958 }, { "epoch": 1.544905356423681, "grad_norm": 0.3084260379112921, "learning_rate": 3.489299694183629e-05, "loss": 0.357, "step": 959 }, { "epoch": 1.5465163109142166, "grad_norm": 0.3549939328103352, "learning_rate": 3.4877956157709024e-05, "loss": 0.3565, "step": 960 }, { "epoch": 1.5481272654047524, "grad_norm": 0.28779131717746126, "learning_rate": 3.4862896509547886e-05, "loss": 0.3468, "step": 961 }, { "epoch": 1.5497382198952878, "grad_norm": 0.3232280682124179, "learning_rate": 3.4847818016447284e-05, "loss": 0.362, "step": 962 }, { "epoch": 1.5513491743858236, "grad_norm": 0.30311024972844897, "learning_rate": 3.483272069752551e-05, "loss": 0.3175, "step": 963 }, { "epoch": 1.5529601288763593, "grad_norm": 0.325020064316006, "learning_rate": 3.481760457192474e-05, "loss": 0.3729, "step": 964 }, { "epoch": 1.5545710833668949, "grad_norm": 0.308178367218141, "learning_rate": 3.4802469658810984e-05, "loss": 0.3433, "step": 965 }, { "epoch": 1.5561820378574305, "grad_norm": 0.3139480265008325, "learning_rate": 3.478731597737407e-05, "loss": 0.3455, "step": 966 }, { "epoch": 1.557792992347966, "grad_norm": 0.32224249506902913, "learning_rate": 3.4772143546827635e-05, "loss": 0.3526, "step": 967 }, { "epoch": 1.559403946838502, "grad_norm": 0.38577637148488075, "learning_rate": 3.475695238640908e-05, "loss": 0.3668, "step": 968 }, { "epoch": 1.5610149013290373, "grad_norm": 0.3160738969499725, "learning_rate": 3.474174251537956e-05, "loss": 0.3389, "step": 969 }, { "epoch": 1.5626258558195731, "grad_norm": 0.348308437450324, "learning_rate": 3.4726513953023944e-05, "loss": 0.3841, "step": 970 }, { "epoch": 1.5642368103101087, "grad_norm": 0.3192517617664544, "learning_rate": 3.471126671865082e-05, "loss": 0.3518, "step": 971 }, { "epoch": 1.5658477648006444, "grad_norm": 0.3315975814114015, "learning_rate": 3.469600083159241e-05, "loss": 0.3501, "step": 972 }, { "epoch": 1.56745871929118, "grad_norm": 0.3429653180075547, "learning_rate": 3.468071631120464e-05, "loss": 0.3673, "step": 973 }, { "epoch": 1.5690696737817156, "grad_norm": 0.2936081834216568, "learning_rate": 3.466541317686702e-05, "loss": 0.3726, "step": 974 }, { "epoch": 1.5706806282722514, "grad_norm": 0.2870880348305391, "learning_rate": 3.465009144798268e-05, "loss": 0.3473, "step": 975 }, { "epoch": 1.5722915827627868, "grad_norm": 0.31292709888328285, "learning_rate": 3.4634751143978317e-05, "loss": 0.3494, "step": 976 }, { "epoch": 1.5739025372533226, "grad_norm": 0.3344663746754926, "learning_rate": 3.461939228430419e-05, "loss": 0.3594, "step": 977 }, { "epoch": 1.5755134917438582, "grad_norm": 0.3243550278021458, "learning_rate": 3.460401488843407e-05, "loss": 0.3842, "step": 978 }, { "epoch": 1.5771244462343939, "grad_norm": 0.28798557728982177, "learning_rate": 3.458861897586524e-05, "loss": 0.3267, "step": 979 }, { "epoch": 1.5787354007249297, "grad_norm": 0.3366500322864135, "learning_rate": 3.4573204566118476e-05, "loss": 0.3929, "step": 980 }, { "epoch": 1.580346355215465, "grad_norm": 0.32529408271268495, "learning_rate": 3.455777167873798e-05, "loss": 0.3484, "step": 981 }, { "epoch": 1.581957309706001, "grad_norm": 0.3119605526527732, "learning_rate": 3.454232033329139e-05, "loss": 0.3498, "step": 982 }, { "epoch": 1.5835682641965363, "grad_norm": 0.3155331712421188, "learning_rate": 3.452685054936976e-05, "loss": 0.3472, "step": 983 }, { "epoch": 1.5851792186870721, "grad_norm": 0.2915430968431837, "learning_rate": 3.4511362346587504e-05, "loss": 0.3407, "step": 984 }, { "epoch": 1.5867901731776077, "grad_norm": 0.3137253719745165, "learning_rate": 3.44958557445824e-05, "loss": 0.355, "step": 985 }, { "epoch": 1.5884011276681433, "grad_norm": 0.3188776562958085, "learning_rate": 3.4480330763015564e-05, "loss": 0.3558, "step": 986 }, { "epoch": 1.5900120821586792, "grad_norm": 0.3129181773241444, "learning_rate": 3.4464787421571395e-05, "loss": 0.3584, "step": 987 }, { "epoch": 1.5916230366492146, "grad_norm": 0.3197808791803997, "learning_rate": 3.444922573995758e-05, "loss": 0.3538, "step": 988 }, { "epoch": 1.5932339911397504, "grad_norm": 0.29636880126371085, "learning_rate": 3.443364573790507e-05, "loss": 0.3371, "step": 989 }, { "epoch": 1.5948449456302858, "grad_norm": 0.30432129789976997, "learning_rate": 3.4418047435168025e-05, "loss": 0.3474, "step": 990 }, { "epoch": 1.5964559001208216, "grad_norm": 0.26763048833686587, "learning_rate": 3.4402430851523824e-05, "loss": 0.3407, "step": 991 }, { "epoch": 1.5980668546113572, "grad_norm": 0.27316984480658096, "learning_rate": 3.438679600677303e-05, "loss": 0.3422, "step": 992 }, { "epoch": 1.5996778091018928, "grad_norm": 0.28509710496947416, "learning_rate": 3.437114292073933e-05, "loss": 0.3438, "step": 993 }, { "epoch": 1.6012887635924287, "grad_norm": 0.2820397711644288, "learning_rate": 3.435547161326958e-05, "loss": 0.3758, "step": 994 }, { "epoch": 1.602899718082964, "grad_norm": 0.27746741501473066, "learning_rate": 3.43397821042337e-05, "loss": 0.357, "step": 995 }, { "epoch": 1.6045106725734999, "grad_norm": 0.329470817735326, "learning_rate": 3.4324074413524725e-05, "loss": 0.3732, "step": 996 }, { "epoch": 1.6061216270640355, "grad_norm": 0.2883189529050093, "learning_rate": 3.430834856105871e-05, "loss": 0.3113, "step": 997 }, { "epoch": 1.607732581554571, "grad_norm": 0.30812120490912964, "learning_rate": 3.429260456677475e-05, "loss": 0.3413, "step": 998 }, { "epoch": 1.6093435360451067, "grad_norm": 0.3016161794889211, "learning_rate": 3.4276842450634964e-05, "loss": 0.3732, "step": 999 }, { "epoch": 1.6109544905356423, "grad_norm": 0.30608350201014334, "learning_rate": 3.4261062232624405e-05, "loss": 0.345, "step": 1000 }, { "epoch": 1.6125654450261782, "grad_norm": 0.287021015994268, "learning_rate": 3.4245263932751124e-05, "loss": 0.3675, "step": 1001 }, { "epoch": 1.6141763995167135, "grad_norm": 0.29513629823587945, "learning_rate": 3.4229447571046055e-05, "loss": 0.3726, "step": 1002 }, { "epoch": 1.6157873540072494, "grad_norm": 0.28497512595967667, "learning_rate": 3.421361316756307e-05, "loss": 0.3417, "step": 1003 }, { "epoch": 1.617398308497785, "grad_norm": 0.27472976631378027, "learning_rate": 3.4197760742378886e-05, "loss": 0.3826, "step": 1004 }, { "epoch": 1.6190092629883206, "grad_norm": 0.32583486377402415, "learning_rate": 3.4181890315593104e-05, "loss": 0.3429, "step": 1005 }, { "epoch": 1.6206202174788562, "grad_norm": 0.3158909644175841, "learning_rate": 3.41660019073281e-05, "loss": 0.339, "step": 1006 }, { "epoch": 1.6222311719693918, "grad_norm": 0.2874717190588641, "learning_rate": 3.41500955377291e-05, "loss": 0.3448, "step": 1007 }, { "epoch": 1.6238421264599276, "grad_norm": 0.32333997510660806, "learning_rate": 3.413417122696408e-05, "loss": 0.3574, "step": 1008 }, { "epoch": 1.625453080950463, "grad_norm": 0.32153546396431953, "learning_rate": 3.411822899522376e-05, "loss": 0.343, "step": 1009 }, { "epoch": 1.6270640354409989, "grad_norm": 0.33818702288315083, "learning_rate": 3.410226886272159e-05, "loss": 0.3592, "step": 1010 }, { "epoch": 1.6286749899315345, "grad_norm": 0.2779482392923402, "learning_rate": 3.408629084969372e-05, "loss": 0.3575, "step": 1011 }, { "epoch": 1.63028594442207, "grad_norm": 0.2958503892623944, "learning_rate": 3.407029497639896e-05, "loss": 0.3431, "step": 1012 }, { "epoch": 1.6318968989126057, "grad_norm": 0.27651659921242794, "learning_rate": 3.405428126311878e-05, "loss": 0.3476, "step": 1013 }, { "epoch": 1.6335078534031413, "grad_norm": 0.2709900192224873, "learning_rate": 3.403824973015725e-05, "loss": 0.376, "step": 1014 }, { "epoch": 1.6351188078936771, "grad_norm": 0.33820245973489416, "learning_rate": 3.4022200397841056e-05, "loss": 0.3518, "step": 1015 }, { "epoch": 1.6367297623842125, "grad_norm": 0.2802490380827594, "learning_rate": 3.4006133286519435e-05, "loss": 0.3777, "step": 1016 }, { "epoch": 1.6383407168747484, "grad_norm": 0.3219486083753848, "learning_rate": 3.399004841656417e-05, "loss": 0.334, "step": 1017 }, { "epoch": 1.639951671365284, "grad_norm": 0.2804170578284511, "learning_rate": 3.3973945808369566e-05, "loss": 0.3353, "step": 1018 }, { "epoch": 1.6415626258558196, "grad_norm": 0.3397988556608258, "learning_rate": 3.395782548235242e-05, "loss": 0.3929, "step": 1019 }, { "epoch": 1.6431735803463552, "grad_norm": 0.25874400518260726, "learning_rate": 3.394168745895199e-05, "loss": 0.3425, "step": 1020 }, { "epoch": 1.6447845348368908, "grad_norm": 0.29769463217777464, "learning_rate": 3.392553175862996e-05, "loss": 0.3454, "step": 1021 }, { "epoch": 1.6463954893274266, "grad_norm": 0.27496871301352516, "learning_rate": 3.390935840187045e-05, "loss": 0.3574, "step": 1022 }, { "epoch": 1.648006443817962, "grad_norm": 0.3130236579001918, "learning_rate": 3.3893167409179945e-05, "loss": 0.353, "step": 1023 }, { "epoch": 1.6496173983084979, "grad_norm": 0.30244268078948694, "learning_rate": 3.387695880108732e-05, "loss": 0.3554, "step": 1024 }, { "epoch": 1.6512283527990335, "grad_norm": 0.2637267118602355, "learning_rate": 3.3860732598143754e-05, "loss": 0.3254, "step": 1025 }, { "epoch": 1.652839307289569, "grad_norm": 0.27265533782673024, "learning_rate": 3.3844488820922755e-05, "loss": 0.3352, "step": 1026 }, { "epoch": 1.6544502617801047, "grad_norm": 0.2847443661542053, "learning_rate": 3.3828227490020096e-05, "loss": 0.345, "step": 1027 }, { "epoch": 1.6560612162706403, "grad_norm": 0.2910046942769011, "learning_rate": 3.381194862605383e-05, "loss": 0.3406, "step": 1028 }, { "epoch": 1.6576721707611761, "grad_norm": 0.30278444886358213, "learning_rate": 3.3795652249664216e-05, "loss": 0.3744, "step": 1029 }, { "epoch": 1.6592831252517115, "grad_norm": 0.3181445159389258, "learning_rate": 3.377933838151374e-05, "loss": 0.3571, "step": 1030 }, { "epoch": 1.6608940797422473, "grad_norm": 0.2555750516659823, "learning_rate": 3.376300704228704e-05, "loss": 0.3293, "step": 1031 }, { "epoch": 1.662505034232783, "grad_norm": 0.2994395275689444, "learning_rate": 3.374665825269093e-05, "loss": 0.3625, "step": 1032 }, { "epoch": 1.6641159887233186, "grad_norm": 0.2941866705604041, "learning_rate": 3.373029203345435e-05, "loss": 0.3466, "step": 1033 }, { "epoch": 1.6657269432138542, "grad_norm": 0.2520003523058466, "learning_rate": 3.3713908405328316e-05, "loss": 0.3492, "step": 1034 }, { "epoch": 1.6673378977043898, "grad_norm": 0.27879316282952243, "learning_rate": 3.369750738908593e-05, "loss": 0.3658, "step": 1035 }, { "epoch": 1.6689488521949256, "grad_norm": 0.26750661336616033, "learning_rate": 3.368108900552236e-05, "loss": 0.3479, "step": 1036 }, { "epoch": 1.670559806685461, "grad_norm": 0.2703473771389898, "learning_rate": 3.366465327545475e-05, "loss": 0.3246, "step": 1037 }, { "epoch": 1.6721707611759968, "grad_norm": 0.2492707097376767, "learning_rate": 3.3648200219722285e-05, "loss": 0.3379, "step": 1038 }, { "epoch": 1.6737817156665324, "grad_norm": 0.28088000970492083, "learning_rate": 3.3631729859186086e-05, "loss": 0.3772, "step": 1039 }, { "epoch": 1.675392670157068, "grad_norm": 0.6148118511176305, "learning_rate": 3.3615242214729226e-05, "loss": 0.3457, "step": 1040 }, { "epoch": 1.6770036246476037, "grad_norm": 0.2741941991125401, "learning_rate": 3.35987373072567e-05, "loss": 0.3679, "step": 1041 }, { "epoch": 1.6786145791381393, "grad_norm": 0.30929820639855726, "learning_rate": 3.3582215157695376e-05, "loss": 0.3595, "step": 1042 }, { "epoch": 1.680225533628675, "grad_norm": 0.2855629710854195, "learning_rate": 3.3565675786994e-05, "loss": 0.3485, "step": 1043 }, { "epoch": 1.6818364881192105, "grad_norm": 0.3160188622883023, "learning_rate": 3.3549119216123125e-05, "loss": 0.3469, "step": 1044 }, { "epoch": 1.6834474426097463, "grad_norm": 0.3178650447881005, "learning_rate": 3.353254546607515e-05, "loss": 0.3631, "step": 1045 }, { "epoch": 1.685058397100282, "grad_norm": 0.26522002369737246, "learning_rate": 3.351595455786423e-05, "loss": 0.3164, "step": 1046 }, { "epoch": 1.6866693515908175, "grad_norm": 0.3103051673772948, "learning_rate": 3.3499346512526286e-05, "loss": 0.3537, "step": 1047 }, { "epoch": 1.6882803060813532, "grad_norm": 0.26705305622759873, "learning_rate": 3.348272135111895e-05, "loss": 0.3207, "step": 1048 }, { "epoch": 1.6898912605718888, "grad_norm": 0.2965365817221479, "learning_rate": 3.346607909472159e-05, "loss": 0.3301, "step": 1049 }, { "epoch": 1.6915022150624246, "grad_norm": 0.3039530865497769, "learning_rate": 3.344941976443521e-05, "loss": 0.3541, "step": 1050 }, { "epoch": 1.69311316955296, "grad_norm": 0.3019357707645832, "learning_rate": 3.3432743381382494e-05, "loss": 0.3476, "step": 1051 }, { "epoch": 1.6947241240434958, "grad_norm": 0.30486830582214286, "learning_rate": 3.341604996670773e-05, "loss": 0.3692, "step": 1052 }, { "epoch": 1.6963350785340314, "grad_norm": 0.2846970874116654, "learning_rate": 3.33993395415768e-05, "loss": 0.3489, "step": 1053 }, { "epoch": 1.697946033024567, "grad_norm": 0.2844019284784835, "learning_rate": 3.3382612127177166e-05, "loss": 0.3484, "step": 1054 }, { "epoch": 1.6995569875151026, "grad_norm": 0.31814090002272866, "learning_rate": 3.3365867744717827e-05, "loss": 0.3313, "step": 1055 }, { "epoch": 1.7011679420056383, "grad_norm": 0.3018174083258602, "learning_rate": 3.334910641542928e-05, "loss": 0.3928, "step": 1056 }, { "epoch": 1.702778896496174, "grad_norm": 0.2805949905286555, "learning_rate": 3.3332328160563534e-05, "loss": 0.3138, "step": 1057 }, { "epoch": 1.7043898509867095, "grad_norm": 0.294154845413384, "learning_rate": 3.331553300139404e-05, "loss": 0.3715, "step": 1058 }, { "epoch": 1.7060008054772453, "grad_norm": 0.28060148879956204, "learning_rate": 3.3298720959215686e-05, "loss": 0.3404, "step": 1059 }, { "epoch": 1.707611759967781, "grad_norm": 0.2898378970822988, "learning_rate": 3.328189205534479e-05, "loss": 0.3707, "step": 1060 }, { "epoch": 1.7092227144583165, "grad_norm": 0.29125221651243705, "learning_rate": 3.3265046311118996e-05, "loss": 0.3513, "step": 1061 }, { "epoch": 1.7108336689488524, "grad_norm": 0.2944155426474975, "learning_rate": 3.3248183747897354e-05, "loss": 0.3294, "step": 1062 }, { "epoch": 1.7124446234393877, "grad_norm": 0.2934301800909067, "learning_rate": 3.3231304387060215e-05, "loss": 0.3756, "step": 1063 }, { "epoch": 1.7140555779299236, "grad_norm": 0.3258895549591974, "learning_rate": 3.321440825000923e-05, "loss": 0.3307, "step": 1064 }, { "epoch": 1.715666532420459, "grad_norm": 0.25585412371136995, "learning_rate": 3.3197495358167314e-05, "loss": 0.3319, "step": 1065 }, { "epoch": 1.7172774869109948, "grad_norm": 0.33852886154261486, "learning_rate": 3.318056573297864e-05, "loss": 0.345, "step": 1066 }, { "epoch": 1.7188884414015304, "grad_norm": 0.29308836939109273, "learning_rate": 3.3163619395908594e-05, "loss": 0.3521, "step": 1067 }, { "epoch": 1.720499395892066, "grad_norm": 0.3260413931230068, "learning_rate": 3.314665636844374e-05, "loss": 0.3579, "step": 1068 }, { "epoch": 1.7221103503826019, "grad_norm": 0.315853677609962, "learning_rate": 3.3129676672091814e-05, "loss": 0.3693, "step": 1069 }, { "epoch": 1.7237213048731372, "grad_norm": 0.25252028334754817, "learning_rate": 3.311268032838169e-05, "loss": 0.311, "step": 1070 }, { "epoch": 1.725332259363673, "grad_norm": 0.30596679620107337, "learning_rate": 3.309566735886334e-05, "loss": 0.361, "step": 1071 }, { "epoch": 1.7269432138542085, "grad_norm": 0.2605593059669489, "learning_rate": 3.307863778510782e-05, "loss": 0.3559, "step": 1072 }, { "epoch": 1.7285541683447443, "grad_norm": 0.30530466414998403, "learning_rate": 3.306159162870724e-05, "loss": 0.3668, "step": 1073 }, { "epoch": 1.73016512283528, "grad_norm": 0.28036764595429337, "learning_rate": 3.304452891127474e-05, "loss": 0.3556, "step": 1074 }, { "epoch": 1.7317760773258155, "grad_norm": 0.3020190132811284, "learning_rate": 3.302744965444445e-05, "loss": 0.3526, "step": 1075 }, { "epoch": 1.7333870318163513, "grad_norm": 0.25272390853900173, "learning_rate": 3.301035387987146e-05, "loss": 0.3276, "step": 1076 }, { "epoch": 1.7349979863068867, "grad_norm": 0.342414793506701, "learning_rate": 3.299324160923184e-05, "loss": 0.3355, "step": 1077 }, { "epoch": 1.7366089407974226, "grad_norm": 0.3002251269987107, "learning_rate": 3.297611286422254e-05, "loss": 0.3466, "step": 1078 }, { "epoch": 1.738219895287958, "grad_norm": 0.32281240413837947, "learning_rate": 3.295896766656141e-05, "loss": 0.3627, "step": 1079 }, { "epoch": 1.7398308497784938, "grad_norm": 0.27577305817853215, "learning_rate": 3.294180603798716e-05, "loss": 0.3264, "step": 1080 }, { "epoch": 1.7414418042690294, "grad_norm": 0.3366688008037847, "learning_rate": 3.292462800025933e-05, "loss": 0.3553, "step": 1081 }, { "epoch": 1.743052758759565, "grad_norm": 0.2864923341321958, "learning_rate": 3.290743357515829e-05, "loss": 0.3715, "step": 1082 }, { "epoch": 1.7446637132501008, "grad_norm": 0.2885887017629758, "learning_rate": 3.289022278448513e-05, "loss": 0.3452, "step": 1083 }, { "epoch": 1.7462746677406362, "grad_norm": 0.28829785150227605, "learning_rate": 3.287299565006177e-05, "loss": 0.3275, "step": 1084 }, { "epoch": 1.747885622231172, "grad_norm": 0.3095311480709755, "learning_rate": 3.285575219373079e-05, "loss": 0.3884, "step": 1085 }, { "epoch": 1.7494965767217077, "grad_norm": 0.3841547619344348, "learning_rate": 3.2838492437355487e-05, "loss": 0.3251, "step": 1086 }, { "epoch": 1.7511075312122433, "grad_norm": 0.2583438353752805, "learning_rate": 3.2821216402819814e-05, "loss": 0.3298, "step": 1087 }, { "epoch": 1.7527184857027789, "grad_norm": 0.3520798827537718, "learning_rate": 3.280392411202838e-05, "loss": 0.3611, "step": 1088 }, { "epoch": 1.7543294401933145, "grad_norm": 0.24181815291715242, "learning_rate": 3.27866155869064e-05, "loss": 0.3353, "step": 1089 }, { "epoch": 1.7559403946838503, "grad_norm": 0.26861982821219954, "learning_rate": 3.276929084939967e-05, "loss": 0.3268, "step": 1090 }, { "epoch": 1.7575513491743857, "grad_norm": 0.30793537607947774, "learning_rate": 3.275194992147455e-05, "loss": 0.3758, "step": 1091 }, { "epoch": 1.7591623036649215, "grad_norm": 0.2735491056478904, "learning_rate": 3.27345928251179e-05, "loss": 0.3522, "step": 1092 }, { "epoch": 1.7607732581554572, "grad_norm": 0.31416721865401886, "learning_rate": 3.271721958233713e-05, "loss": 0.3887, "step": 1093 }, { "epoch": 1.7623842126459928, "grad_norm": 0.23777442546977362, "learning_rate": 3.269983021516006e-05, "loss": 0.341, "step": 1094 }, { "epoch": 1.7639951671365284, "grad_norm": 0.2966716961575548, "learning_rate": 3.268242474563502e-05, "loss": 0.3582, "step": 1095 }, { "epoch": 1.765606121627064, "grad_norm": 0.2695248187187801, "learning_rate": 3.2665003195830705e-05, "loss": 0.3663, "step": 1096 }, { "epoch": 1.7672170761175998, "grad_norm": 0.27648681082467197, "learning_rate": 3.2647565587836224e-05, "loss": 0.3367, "step": 1097 }, { "epoch": 1.7688280306081352, "grad_norm": 0.24769125048939025, "learning_rate": 3.2630111943761035e-05, "loss": 0.3161, "step": 1098 }, { "epoch": 1.770438985098671, "grad_norm": 0.32602085521437335, "learning_rate": 3.261264228573495e-05, "loss": 0.377, "step": 1099 }, { "epoch": 1.7720499395892066, "grad_norm": 0.26152980147545596, "learning_rate": 3.259515663590805e-05, "loss": 0.3594, "step": 1100 }, { "epoch": 1.7736608940797423, "grad_norm": 0.2382508327122932, "learning_rate": 3.257765501645072e-05, "loss": 0.3169, "step": 1101 }, { "epoch": 1.7752718485702779, "grad_norm": 0.3243836829426945, "learning_rate": 3.256013744955359e-05, "loss": 0.3719, "step": 1102 }, { "epoch": 1.7768828030608135, "grad_norm": 0.25232903385465316, "learning_rate": 3.25426039574275e-05, "loss": 0.3608, "step": 1103 }, { "epoch": 1.7784937575513493, "grad_norm": 0.2871596146859017, "learning_rate": 3.2525054562303485e-05, "loss": 0.36, "step": 1104 }, { "epoch": 1.7801047120418847, "grad_norm": 0.27660826788802817, "learning_rate": 3.250748928643274e-05, "loss": 0.3321, "step": 1105 }, { "epoch": 1.7817156665324205, "grad_norm": 0.26281335753440443, "learning_rate": 3.248990815208661e-05, "loss": 0.3365, "step": 1106 }, { "epoch": 1.7833266210229561, "grad_norm": 0.3107781337125716, "learning_rate": 3.247231118155654e-05, "loss": 0.3673, "step": 1107 }, { "epoch": 1.7849375755134917, "grad_norm": 0.2783443100448748, "learning_rate": 3.245469839715404e-05, "loss": 0.3394, "step": 1108 }, { "epoch": 1.7865485300040274, "grad_norm": 0.3194930726854231, "learning_rate": 3.24370698212107e-05, "loss": 0.3464, "step": 1109 }, { "epoch": 1.788159484494563, "grad_norm": 0.29259588976237577, "learning_rate": 3.24194254760781e-05, "loss": 0.351, "step": 1110 }, { "epoch": 1.7897704389850988, "grad_norm": 0.2833032383542838, "learning_rate": 3.240176538412783e-05, "loss": 0.3112, "step": 1111 }, { "epoch": 1.7913813934756342, "grad_norm": 0.3062775036699522, "learning_rate": 3.2384089567751464e-05, "loss": 0.3538, "step": 1112 }, { "epoch": 1.79299234796617, "grad_norm": 0.31520368622639755, "learning_rate": 3.236639804936047e-05, "loss": 0.365, "step": 1113 }, { "epoch": 1.7946033024567056, "grad_norm": 0.2706287135324734, "learning_rate": 3.234869085138626e-05, "loss": 0.3057, "step": 1114 }, { "epoch": 1.7962142569472412, "grad_norm": 0.2776347182974663, "learning_rate": 3.233096799628012e-05, "loss": 0.3591, "step": 1115 }, { "epoch": 1.7978252114377768, "grad_norm": 0.23921615357001141, "learning_rate": 3.2313229506513167e-05, "loss": 0.3287, "step": 1116 }, { "epoch": 1.7994361659283125, "grad_norm": 0.27114468404862, "learning_rate": 3.229547540457638e-05, "loss": 0.361, "step": 1117 }, { "epoch": 1.8010471204188483, "grad_norm": 0.24463881898739914, "learning_rate": 3.2277705712980495e-05, "loss": 0.3404, "step": 1118 }, { "epoch": 1.8026580749093837, "grad_norm": 0.3166270096090458, "learning_rate": 3.225992045425604e-05, "loss": 0.36, "step": 1119 }, { "epoch": 1.8042690293999195, "grad_norm": 0.2723260812159827, "learning_rate": 3.224211965095326e-05, "loss": 0.3402, "step": 1120 }, { "epoch": 1.8058799838904551, "grad_norm": 0.28717184216313857, "learning_rate": 3.222430332564213e-05, "loss": 0.3424, "step": 1121 }, { "epoch": 1.8074909383809907, "grad_norm": 0.30590726225785186, "learning_rate": 3.220647150091229e-05, "loss": 0.384, "step": 1122 }, { "epoch": 1.8091018928715263, "grad_norm": 0.3295960133235172, "learning_rate": 3.2188624199373054e-05, "loss": 0.3477, "step": 1123 }, { "epoch": 1.810712847362062, "grad_norm": 0.24641937154261023, "learning_rate": 3.217076144365332e-05, "loss": 0.3291, "step": 1124 }, { "epoch": 1.8123238018525978, "grad_norm": 0.36214805461319893, "learning_rate": 3.215288325640161e-05, "loss": 0.37, "step": 1125 }, { "epoch": 1.8139347563431332, "grad_norm": 0.27639672336136023, "learning_rate": 3.213498966028603e-05, "loss": 0.3474, "step": 1126 }, { "epoch": 1.815545710833669, "grad_norm": 0.2905471387956144, "learning_rate": 3.2117080677994156e-05, "loss": 0.3489, "step": 1127 }, { "epoch": 1.8171566653242046, "grad_norm": 0.2742465600770904, "learning_rate": 3.2099156332233155e-05, "loss": 0.3212, "step": 1128 }, { "epoch": 1.8187676198147402, "grad_norm": 0.32581522778160565, "learning_rate": 3.2081216645729615e-05, "loss": 0.3789, "step": 1129 }, { "epoch": 1.8203785743052758, "grad_norm": 0.25895277713528797, "learning_rate": 3.20632616412296e-05, "loss": 0.3548, "step": 1130 }, { "epoch": 1.8219895287958114, "grad_norm": 0.2567453896587822, "learning_rate": 3.204529134149858e-05, "loss": 0.337, "step": 1131 }, { "epoch": 1.8236004832863473, "grad_norm": 0.2545418099367893, "learning_rate": 3.2027305769321446e-05, "loss": 0.3757, "step": 1132 }, { "epoch": 1.8252114377768827, "grad_norm": 0.26861543778444336, "learning_rate": 3.2009304947502415e-05, "loss": 0.3309, "step": 1133 }, { "epoch": 1.8268223922674185, "grad_norm": 0.25906425044357323, "learning_rate": 3.1991288898865076e-05, "loss": 0.3425, "step": 1134 }, { "epoch": 1.828433346757954, "grad_norm": 0.2791495649866975, "learning_rate": 3.19732576462523e-05, "loss": 0.3626, "step": 1135 }, { "epoch": 1.8300443012484897, "grad_norm": 0.28566136887901167, "learning_rate": 3.195521121252625e-05, "loss": 0.3427, "step": 1136 }, { "epoch": 1.8316552557390253, "grad_norm": 0.3006511911637088, "learning_rate": 3.193714962056832e-05, "loss": 0.3689, "step": 1137 }, { "epoch": 1.833266210229561, "grad_norm": 0.2575714390405587, "learning_rate": 3.1919072893279144e-05, "loss": 0.3254, "step": 1138 }, { "epoch": 1.8348771647200968, "grad_norm": 0.29714340630201763, "learning_rate": 3.190098105357853e-05, "loss": 0.3114, "step": 1139 }, { "epoch": 1.8364881192106322, "grad_norm": 0.2739715615694121, "learning_rate": 3.188287412440546e-05, "loss": 0.3628, "step": 1140 }, { "epoch": 1.838099073701168, "grad_norm": 0.31490538424728864, "learning_rate": 3.186475212871803e-05, "loss": 0.3681, "step": 1141 }, { "epoch": 1.8397100281917036, "grad_norm": 0.25725247708626503, "learning_rate": 3.1846615089493465e-05, "loss": 0.3373, "step": 1142 }, { "epoch": 1.8413209826822392, "grad_norm": 0.29237114605953757, "learning_rate": 3.182846302972804e-05, "loss": 0.3928, "step": 1143 }, { "epoch": 1.8429319371727748, "grad_norm": 0.24162054032640948, "learning_rate": 3.181029597243709e-05, "loss": 0.3066, "step": 1144 }, { "epoch": 1.8445428916633104, "grad_norm": 0.2746441036778749, "learning_rate": 3.1792113940654976e-05, "loss": 0.3429, "step": 1145 }, { "epoch": 1.8461538461538463, "grad_norm": 0.27290548211325333, "learning_rate": 3.1773916957435e-05, "loss": 0.3567, "step": 1146 }, { "epoch": 1.8477648006443816, "grad_norm": 0.3015414783683767, "learning_rate": 3.1755705045849465e-05, "loss": 0.3386, "step": 1147 }, { "epoch": 1.8493757551349175, "grad_norm": 0.2540351001266061, "learning_rate": 3.173747822898959e-05, "loss": 0.3452, "step": 1148 }, { "epoch": 1.850986709625453, "grad_norm": 0.31221816074924413, "learning_rate": 3.1719236529965494e-05, "loss": 0.3592, "step": 1149 }, { "epoch": 1.8525976641159887, "grad_norm": 0.2423803158509095, "learning_rate": 3.170097997190615e-05, "loss": 0.3195, "step": 1150 }, { "epoch": 1.8542086186065245, "grad_norm": 0.2966064384297838, "learning_rate": 3.16827085779594e-05, "loss": 0.3355, "step": 1151 }, { "epoch": 1.85581957309706, "grad_norm": 0.2810752575128961, "learning_rate": 3.1664422371291866e-05, "loss": 0.3336, "step": 1152 }, { "epoch": 1.8574305275875957, "grad_norm": 0.2784162936236359, "learning_rate": 3.164612137508898e-05, "loss": 0.3542, "step": 1153 }, { "epoch": 1.8590414820781311, "grad_norm": 0.32474379464442327, "learning_rate": 3.162780561255489e-05, "loss": 0.368, "step": 1154 }, { "epoch": 1.860652436568667, "grad_norm": 0.25214198273713495, "learning_rate": 3.16094751069125e-05, "loss": 0.3208, "step": 1155 }, { "epoch": 1.8622633910592026, "grad_norm": 0.2804333800265253, "learning_rate": 3.15911298814034e-05, "loss": 0.3633, "step": 1156 }, { "epoch": 1.8638743455497382, "grad_norm": 0.3177102741885549, "learning_rate": 3.157276995928783e-05, "loss": 0.3548, "step": 1157 }, { "epoch": 1.865485300040274, "grad_norm": 0.2779089270761033, "learning_rate": 3.155439536384467e-05, "loss": 0.3397, "step": 1158 }, { "epoch": 1.8670962545308094, "grad_norm": 0.30157569369109105, "learning_rate": 3.153600611837142e-05, "loss": 0.3538, "step": 1159 }, { "epoch": 1.8687072090213452, "grad_norm": 0.3222782938570124, "learning_rate": 3.151760224618413e-05, "loss": 0.3584, "step": 1160 }, { "epoch": 1.8703181635118806, "grad_norm": 0.26202863886970124, "learning_rate": 3.1499183770617414e-05, "loss": 0.3294, "step": 1161 }, { "epoch": 1.8719291180024165, "grad_norm": 0.3313404569532206, "learning_rate": 3.1480750715024396e-05, "loss": 0.3607, "step": 1162 }, { "epoch": 1.873540072492952, "grad_norm": 0.2804570203558461, "learning_rate": 3.146230310277668e-05, "loss": 0.3684, "step": 1163 }, { "epoch": 1.8751510269834877, "grad_norm": 0.328059825473741, "learning_rate": 3.144384095726433e-05, "loss": 0.3345, "step": 1164 }, { "epoch": 1.8767619814740235, "grad_norm": 0.3310382940626733, "learning_rate": 3.142536430189585e-05, "loss": 0.3592, "step": 1165 }, { "epoch": 1.878372935964559, "grad_norm": 0.2571589985927003, "learning_rate": 3.140687316009812e-05, "loss": 0.334, "step": 1166 }, { "epoch": 1.8799838904550947, "grad_norm": 0.3084210247719844, "learning_rate": 3.138836755531641e-05, "loss": 0.3549, "step": 1167 }, { "epoch": 1.8815948449456303, "grad_norm": 0.3136180657588817, "learning_rate": 3.13698475110143e-05, "loss": 0.3385, "step": 1168 }, { "epoch": 1.883205799436166, "grad_norm": 0.28698454787368805, "learning_rate": 3.135131305067372e-05, "loss": 0.3459, "step": 1169 }, { "epoch": 1.8848167539267016, "grad_norm": 0.3203660869415405, "learning_rate": 3.1332764197794825e-05, "loss": 0.3665, "step": 1170 }, { "epoch": 1.8864277084172372, "grad_norm": 0.2760307046037802, "learning_rate": 3.131420097589606e-05, "loss": 0.3535, "step": 1171 }, { "epoch": 1.888038662907773, "grad_norm": 0.304173676179774, "learning_rate": 3.129562340851408e-05, "loss": 0.3598, "step": 1172 }, { "epoch": 1.8896496173983084, "grad_norm": 0.24789364158706606, "learning_rate": 3.127703151920371e-05, "loss": 0.3423, "step": 1173 }, { "epoch": 1.8912605718888442, "grad_norm": 0.27869905089039404, "learning_rate": 3.125842533153796e-05, "loss": 0.3801, "step": 1174 }, { "epoch": 1.8928715263793798, "grad_norm": 0.2659917053384485, "learning_rate": 3.1239804869107943e-05, "loss": 0.3162, "step": 1175 }, { "epoch": 1.8944824808699154, "grad_norm": 0.2740676239526807, "learning_rate": 3.1221170155522896e-05, "loss": 0.3763, "step": 1176 }, { "epoch": 1.896093435360451, "grad_norm": 0.3036149904699523, "learning_rate": 3.1202521214410116e-05, "loss": 0.3324, "step": 1177 }, { "epoch": 1.8977043898509867, "grad_norm": 0.2704344928402855, "learning_rate": 3.1183858069414936e-05, "loss": 0.3508, "step": 1178 }, { "epoch": 1.8993153443415225, "grad_norm": 0.2745284876347354, "learning_rate": 3.1165180744200704e-05, "loss": 0.3194, "step": 1179 }, { "epoch": 1.9009262988320579, "grad_norm": 0.2584540162273995, "learning_rate": 3.114648926244873e-05, "loss": 0.3385, "step": 1180 }, { "epoch": 1.9025372533225937, "grad_norm": 0.3116807537951856, "learning_rate": 3.11277836478583e-05, "loss": 0.3586, "step": 1181 }, { "epoch": 1.9041482078131293, "grad_norm": 0.2623123292830474, "learning_rate": 3.11090639241466e-05, "loss": 0.3502, "step": 1182 }, { "epoch": 1.905759162303665, "grad_norm": 0.2806030867053086, "learning_rate": 3.1090330115048716e-05, "loss": 0.3602, "step": 1183 }, { "epoch": 1.9073701167942005, "grad_norm": 0.30229750912862674, "learning_rate": 3.107158224431759e-05, "loss": 0.3303, "step": 1184 }, { "epoch": 1.9089810712847362, "grad_norm": 0.3148634173453989, "learning_rate": 3.105282033572398e-05, "loss": 0.3854, "step": 1185 }, { "epoch": 1.910592025775272, "grad_norm": 0.28827408407660937, "learning_rate": 3.1034044413056465e-05, "loss": 0.3304, "step": 1186 }, { "epoch": 1.9122029802658074, "grad_norm": 0.3063959496143443, "learning_rate": 3.1015254500121376e-05, "loss": 0.3747, "step": 1187 }, { "epoch": 1.9138139347563432, "grad_norm": 0.2714542673376483, "learning_rate": 3.09964506207428e-05, "loss": 0.3437, "step": 1188 }, { "epoch": 1.9154248892468788, "grad_norm": 0.2550631224442582, "learning_rate": 3.097763279876251e-05, "loss": 0.3379, "step": 1189 }, { "epoch": 1.9170358437374144, "grad_norm": 0.27675565441191935, "learning_rate": 3.095880105803997e-05, "loss": 0.3567, "step": 1190 }, { "epoch": 1.91864679822795, "grad_norm": 0.25174769021841753, "learning_rate": 3.09399554224523e-05, "loss": 0.3066, "step": 1191 }, { "epoch": 1.9202577527184856, "grad_norm": 0.3167995968880189, "learning_rate": 3.092109591589421e-05, "loss": 0.3712, "step": 1192 }, { "epoch": 1.9218687072090215, "grad_norm": 0.25594727349036206, "learning_rate": 3.0902222562278025e-05, "loss": 0.3463, "step": 1193 }, { "epoch": 1.9234796616995569, "grad_norm": 0.2623623493033658, "learning_rate": 3.088333538553361e-05, "loss": 0.3485, "step": 1194 }, { "epoch": 1.9250906161900927, "grad_norm": 0.24387996552506205, "learning_rate": 3.086443440960838e-05, "loss": 0.3247, "step": 1195 }, { "epoch": 1.9267015706806283, "grad_norm": 0.27837207857209045, "learning_rate": 3.084551965846721e-05, "loss": 0.3378, "step": 1196 }, { "epoch": 1.928312525171164, "grad_norm": 0.30825795217226376, "learning_rate": 3.0826591156092474e-05, "loss": 0.4079, "step": 1197 }, { "epoch": 1.9299234796616995, "grad_norm": 0.2893650060453642, "learning_rate": 3.080764892648396e-05, "loss": 0.3492, "step": 1198 }, { "epoch": 1.9315344341522351, "grad_norm": 0.2649652705405745, "learning_rate": 3.0788692993658874e-05, "loss": 0.3357, "step": 1199 }, { "epoch": 1.933145388642771, "grad_norm": 0.29712091496353926, "learning_rate": 3.076972338165178e-05, "loss": 0.3542, "step": 1200 }, { "epoch": 1.9347563431333064, "grad_norm": 0.27057553138069074, "learning_rate": 3.075074011451461e-05, "loss": 0.3755, "step": 1201 }, { "epoch": 1.9363672976238422, "grad_norm": 0.2491519925272819, "learning_rate": 3.07317432163166e-05, "loss": 0.3191, "step": 1202 }, { "epoch": 1.9379782521143778, "grad_norm": 0.26410068635338846, "learning_rate": 3.0712732711144254e-05, "loss": 0.3462, "step": 1203 }, { "epoch": 1.9395892066049134, "grad_norm": 0.3053998571246562, "learning_rate": 3.0693708623101345e-05, "loss": 0.3587, "step": 1204 }, { "epoch": 1.941200161095449, "grad_norm": 0.2766139786219459, "learning_rate": 3.067467097630886e-05, "loss": 0.3506, "step": 1205 }, { "epoch": 1.9428111155859846, "grad_norm": 0.30093579529593656, "learning_rate": 3.0655619794905e-05, "loss": 0.3676, "step": 1206 }, { "epoch": 1.9444220700765205, "grad_norm": 0.26128628640625445, "learning_rate": 3.063655510304508e-05, "loss": 0.3358, "step": 1207 }, { "epoch": 1.9460330245670558, "grad_norm": 0.29154302004266824, "learning_rate": 3.061747692490159e-05, "loss": 0.3621, "step": 1208 }, { "epoch": 1.9476439790575917, "grad_norm": 0.2518019757597743, "learning_rate": 3.05983852846641e-05, "loss": 0.3323, "step": 1209 }, { "epoch": 1.9492549335481273, "grad_norm": 0.2691404649318548, "learning_rate": 3.057928020653925e-05, "loss": 0.3441, "step": 1210 }, { "epoch": 1.950865888038663, "grad_norm": 0.2548234265739506, "learning_rate": 3.056016171475072e-05, "loss": 0.3453, "step": 1211 }, { "epoch": 1.9524768425291985, "grad_norm": 0.25140408398192154, "learning_rate": 3.0541029833539195e-05, "loss": 0.3427, "step": 1212 }, { "epoch": 1.9540877970197341, "grad_norm": 0.27769435781256435, "learning_rate": 3.0521884587162344e-05, "loss": 0.3448, "step": 1213 }, { "epoch": 1.95569875151027, "grad_norm": 0.2572998449613547, "learning_rate": 3.050272599989477e-05, "loss": 0.3495, "step": 1214 }, { "epoch": 1.9573097060008053, "grad_norm": 0.2557887816223843, "learning_rate": 3.0483554096027998e-05, "loss": 0.328, "step": 1215 }, { "epoch": 1.9589206604913412, "grad_norm": 0.2614863654535399, "learning_rate": 3.046436889987044e-05, "loss": 0.3426, "step": 1216 }, { "epoch": 1.9605316149818768, "grad_norm": 0.2375877870665729, "learning_rate": 3.0445170435747364e-05, "loss": 0.3031, "step": 1217 }, { "epoch": 1.9621425694724124, "grad_norm": 0.2894524906451252, "learning_rate": 3.0425958728000845e-05, "loss": 0.3805, "step": 1218 }, { "epoch": 1.963753523962948, "grad_norm": 0.2593732546970939, "learning_rate": 3.0406733800989766e-05, "loss": 0.3456, "step": 1219 }, { "epoch": 1.9653644784534836, "grad_norm": 0.2533421503146536, "learning_rate": 3.0387495679089753e-05, "loss": 0.3617, "step": 1220 }, { "epoch": 1.9669754329440194, "grad_norm": 0.2737014852345156, "learning_rate": 3.0368244386693196e-05, "loss": 0.3451, "step": 1221 }, { "epoch": 1.9685863874345548, "grad_norm": 0.2385100617519932, "learning_rate": 3.034897994820915e-05, "loss": 0.3462, "step": 1222 }, { "epoch": 1.9701973419250907, "grad_norm": 0.2679673464061188, "learning_rate": 3.0329702388063348e-05, "loss": 0.3474, "step": 1223 }, { "epoch": 1.9718082964156263, "grad_norm": 0.25455746591228023, "learning_rate": 3.0310411730698166e-05, "loss": 0.3403, "step": 1224 }, { "epoch": 1.9734192509061619, "grad_norm": 0.2559764269490091, "learning_rate": 3.029110800057258e-05, "loss": 0.3268, "step": 1225 }, { "epoch": 1.9750302053966975, "grad_norm": 0.26102725959877315, "learning_rate": 3.0271791222162145e-05, "loss": 0.3513, "step": 1226 }, { "epoch": 1.976641159887233, "grad_norm": 0.2837350041203044, "learning_rate": 3.0252461419958968e-05, "loss": 0.355, "step": 1227 }, { "epoch": 1.978252114377769, "grad_norm": 0.276477357654969, "learning_rate": 3.023311861847165e-05, "loss": 0.3156, "step": 1228 }, { "epoch": 1.9798630688683043, "grad_norm": 0.2630117478712932, "learning_rate": 3.0213762842225284e-05, "loss": 0.3311, "step": 1229 }, { "epoch": 1.9814740233588402, "grad_norm": 0.3217425165655156, "learning_rate": 3.0194394115761415e-05, "loss": 0.3413, "step": 1230 }, { "epoch": 1.9830849778493758, "grad_norm": 0.3176180655742061, "learning_rate": 3.0175012463638016e-05, "loss": 0.3541, "step": 1231 }, { "epoch": 1.9846959323399114, "grad_norm": 0.32893088484252825, "learning_rate": 3.0155617910429426e-05, "loss": 0.3467, "step": 1232 }, { "epoch": 1.9863068868304472, "grad_norm": 0.3081265438017872, "learning_rate": 3.0136210480726365e-05, "loss": 0.3385, "step": 1233 }, { "epoch": 1.9879178413209826, "grad_norm": 0.31493139090080025, "learning_rate": 3.011679019913587e-05, "loss": 0.3539, "step": 1234 }, { "epoch": 1.9895287958115184, "grad_norm": 0.3293171737255103, "learning_rate": 3.0097357090281267e-05, "loss": 0.3587, "step": 1235 }, { "epoch": 1.9911397503020538, "grad_norm": 0.2763097409073324, "learning_rate": 3.0077911178802152e-05, "loss": 0.3572, "step": 1236 }, { "epoch": 1.9927507047925896, "grad_norm": 0.29206192980238854, "learning_rate": 3.0058452489354358e-05, "loss": 0.334, "step": 1237 }, { "epoch": 1.9943616592831253, "grad_norm": 0.31050171676421406, "learning_rate": 3.0038981046609915e-05, "loss": 0.3542, "step": 1238 }, { "epoch": 1.9959726137736609, "grad_norm": 0.2887676714430983, "learning_rate": 3.0019496875257012e-05, "loss": 0.3474, "step": 1239 }, { "epoch": 1.9975835682641967, "grad_norm": 0.2960252968104312, "learning_rate": 3.0000000000000004e-05, "loss": 0.3486, "step": 1240 }, { "epoch": 1.999194522754732, "grad_norm": 0.28911173594595874, "learning_rate": 2.9980490445559325e-05, "loss": 0.3589, "step": 1241 }, { "epoch": 2.000805477245268, "grad_norm": 0.39770939551092377, "learning_rate": 2.9960968236671504e-05, "loss": 0.3328, "step": 1242 }, { "epoch": 2.0024164317358033, "grad_norm": 0.2857834417846077, "learning_rate": 2.9941433398089098e-05, "loss": 0.3006, "step": 1243 }, { "epoch": 2.004027386226339, "grad_norm": 0.6949713663880401, "learning_rate": 2.99218859545807e-05, "loss": 0.2604, "step": 1244 }, { "epoch": 2.005638340716875, "grad_norm": 0.5111752099726601, "learning_rate": 2.9902325930930868e-05, "loss": 0.2976, "step": 1245 }, { "epoch": 2.0072492952074104, "grad_norm": 0.39163668448086514, "learning_rate": 2.9882753351940115e-05, "loss": 0.2594, "step": 1246 }, { "epoch": 2.008860249697946, "grad_norm": 0.33796868345118525, "learning_rate": 2.9863168242424873e-05, "loss": 0.2955, "step": 1247 }, { "epoch": 2.0104712041884816, "grad_norm": 0.3941724192343171, "learning_rate": 2.9843570627217463e-05, "loss": 0.2846, "step": 1248 }, { "epoch": 2.0120821586790174, "grad_norm": 0.31642626423981934, "learning_rate": 2.9823960531166065e-05, "loss": 0.2781, "step": 1249 }, { "epoch": 2.013693113169553, "grad_norm": 0.37669153988473264, "learning_rate": 2.980433797913467e-05, "loss": 0.2853, "step": 1250 }, { "epoch": 2.0153040676600886, "grad_norm": 0.30754959333932375, "learning_rate": 2.978470299600308e-05, "loss": 0.2796, "step": 1251 }, { "epoch": 2.0169150221506245, "grad_norm": 0.3422825055523034, "learning_rate": 2.9765055606666844e-05, "loss": 0.2664, "step": 1252 }, { "epoch": 2.01852597664116, "grad_norm": 0.31316330889075605, "learning_rate": 2.9745395836037265e-05, "loss": 0.2814, "step": 1253 }, { "epoch": 2.0201369311316957, "grad_norm": 0.39624358182523745, "learning_rate": 2.9725723709041304e-05, "loss": 0.2882, "step": 1254 }, { "epoch": 2.021747885622231, "grad_norm": 0.29976044657757916, "learning_rate": 2.9706039250621626e-05, "loss": 0.2763, "step": 1255 }, { "epoch": 2.023358840112767, "grad_norm": 0.3659709335531895, "learning_rate": 2.968634248573651e-05, "loss": 0.2939, "step": 1256 }, { "epoch": 2.0249697946033023, "grad_norm": 0.30126786571812936, "learning_rate": 2.9666633439359857e-05, "loss": 0.2716, "step": 1257 }, { "epoch": 2.026580749093838, "grad_norm": 0.3122695764846602, "learning_rate": 2.9646912136481116e-05, "loss": 0.2658, "step": 1258 }, { "epoch": 2.028191703584374, "grad_norm": 0.2867439501044589, "learning_rate": 2.9627178602105296e-05, "loss": 0.2676, "step": 1259 }, { "epoch": 2.0298026580749093, "grad_norm": 0.30992230207590665, "learning_rate": 2.960743286125291e-05, "loss": 0.2955, "step": 1260 }, { "epoch": 2.031413612565445, "grad_norm": 0.3058682460420191, "learning_rate": 2.958767493895994e-05, "loss": 0.2896, "step": 1261 }, { "epoch": 2.0330245670559806, "grad_norm": 0.26862316860853597, "learning_rate": 2.9567904860277825e-05, "loss": 0.2566, "step": 1262 }, { "epoch": 2.0346355215465164, "grad_norm": 0.25194544144940745, "learning_rate": 2.95481226502734e-05, "loss": 0.2478, "step": 1263 }, { "epoch": 2.036246476037052, "grad_norm": 0.30167552158426153, "learning_rate": 2.9528328334028903e-05, "loss": 0.2744, "step": 1264 }, { "epoch": 2.0378574305275876, "grad_norm": 0.28046902794293055, "learning_rate": 2.9508521936641906e-05, "loss": 0.2833, "step": 1265 }, { "epoch": 2.0394683850181234, "grad_norm": 0.310202694342102, "learning_rate": 2.948870348322531e-05, "loss": 0.3095, "step": 1266 }, { "epoch": 2.041079339508659, "grad_norm": 0.2675039895958826, "learning_rate": 2.9468872998907285e-05, "loss": 0.2643, "step": 1267 }, { "epoch": 2.0426902939991947, "grad_norm": 0.2594449033037284, "learning_rate": 2.9449030508831273e-05, "loss": 0.2682, "step": 1268 }, { "epoch": 2.04430124848973, "grad_norm": 0.28891086210870076, "learning_rate": 2.9429176038155938e-05, "loss": 0.2621, "step": 1269 }, { "epoch": 2.045912202980266, "grad_norm": 0.3343156311987029, "learning_rate": 2.9409309612055116e-05, "loss": 0.3005, "step": 1270 }, { "epoch": 2.0475231574708013, "grad_norm": 0.31461832017319397, "learning_rate": 2.938943125571782e-05, "loss": 0.281, "step": 1271 }, { "epoch": 2.049134111961337, "grad_norm": 0.30370641290272277, "learning_rate": 2.9369540994348175e-05, "loss": 0.2864, "step": 1272 }, { "epoch": 2.050745066451873, "grad_norm": 0.27610662631816574, "learning_rate": 2.9349638853165427e-05, "loss": 0.2672, "step": 1273 }, { "epoch": 2.0523560209424083, "grad_norm": 0.3276272216535393, "learning_rate": 2.932972485740385e-05, "loss": 0.282, "step": 1274 }, { "epoch": 2.053966975432944, "grad_norm": 0.3009539346491537, "learning_rate": 2.9309799032312775e-05, "loss": 0.253, "step": 1275 }, { "epoch": 2.0555779299234795, "grad_norm": 0.28633699999733225, "learning_rate": 2.9289861403156504e-05, "loss": 0.2791, "step": 1276 }, { "epoch": 2.0571888844140154, "grad_norm": 0.35069115032081216, "learning_rate": 2.9269911995214354e-05, "loss": 0.2621, "step": 1277 }, { "epoch": 2.0587998389045508, "grad_norm": 0.2799484554842468, "learning_rate": 2.924995083378051e-05, "loss": 0.2723, "step": 1278 }, { "epoch": 2.0604107933950866, "grad_norm": 0.36643461872273314, "learning_rate": 2.922997794416412e-05, "loss": 0.2745, "step": 1279 }, { "epoch": 2.0620217478856224, "grad_norm": 0.2855006354725177, "learning_rate": 2.920999335168917e-05, "loss": 0.2604, "step": 1280 }, { "epoch": 2.063632702376158, "grad_norm": 0.37126926892403106, "learning_rate": 2.9189997081694493e-05, "loss": 0.2772, "step": 1281 }, { "epoch": 2.0652436568666936, "grad_norm": 0.3006772235704741, "learning_rate": 2.916998915953373e-05, "loss": 0.302, "step": 1282 }, { "epoch": 2.066854611357229, "grad_norm": 0.3753958972390217, "learning_rate": 2.914996961057528e-05, "loss": 0.2811, "step": 1283 }, { "epoch": 2.068465565847765, "grad_norm": 0.2994073547515185, "learning_rate": 2.9129938460202312e-05, "loss": 0.2859, "step": 1284 }, { "epoch": 2.0700765203383003, "grad_norm": 0.31202683354041316, "learning_rate": 2.910989573381268e-05, "loss": 0.2733, "step": 1285 }, { "epoch": 2.071687474828836, "grad_norm": 0.2767590884579919, "learning_rate": 2.9089841456818935e-05, "loss": 0.2515, "step": 1286 }, { "epoch": 2.073298429319372, "grad_norm": 0.2735836950606523, "learning_rate": 2.906977565464825e-05, "loss": 0.2903, "step": 1287 }, { "epoch": 2.0749093838099073, "grad_norm": 0.2921931649866422, "learning_rate": 2.9049698352742438e-05, "loss": 0.2981, "step": 1288 }, { "epoch": 2.076520338300443, "grad_norm": 0.267867103961345, "learning_rate": 2.9029609576557863e-05, "loss": 0.2652, "step": 1289 }, { "epoch": 2.0781312927909785, "grad_norm": 0.29224521078187127, "learning_rate": 2.9009509351565462e-05, "loss": 0.2647, "step": 1290 }, { "epoch": 2.0797422472815144, "grad_norm": 0.2729335202374728, "learning_rate": 2.8989397703250682e-05, "loss": 0.2798, "step": 1291 }, { "epoch": 2.0813532017720497, "grad_norm": 0.26817050857073244, "learning_rate": 2.8969274657113452e-05, "loss": 0.2839, "step": 1292 }, { "epoch": 2.0829641562625856, "grad_norm": 0.2954611997992363, "learning_rate": 2.8949140238668145e-05, "loss": 0.3008, "step": 1293 }, { "epoch": 2.0845751107531214, "grad_norm": 0.2584560509593604, "learning_rate": 2.8928994473443557e-05, "loss": 0.2863, "step": 1294 }, { "epoch": 2.086186065243657, "grad_norm": 0.2697921301073831, "learning_rate": 2.890883738698289e-05, "loss": 0.2786, "step": 1295 }, { "epoch": 2.0877970197341926, "grad_norm": 0.2803128875813109, "learning_rate": 2.8888669004843665e-05, "loss": 0.2734, "step": 1296 }, { "epoch": 2.089407974224728, "grad_norm": 0.2381547290712765, "learning_rate": 2.8868489352597762e-05, "loss": 0.251, "step": 1297 }, { "epoch": 2.091018928715264, "grad_norm": 0.3323977346030221, "learning_rate": 2.8848298455831317e-05, "loss": 0.308, "step": 1298 }, { "epoch": 2.0926298832057992, "grad_norm": 0.27153635242123286, "learning_rate": 2.882809634014475e-05, "loss": 0.2589, "step": 1299 }, { "epoch": 2.094240837696335, "grad_norm": 0.285739554905134, "learning_rate": 2.880788303115269e-05, "loss": 0.273, "step": 1300 }, { "epoch": 2.095851792186871, "grad_norm": 0.3010326450465779, "learning_rate": 2.878765855448396e-05, "loss": 0.3033, "step": 1301 }, { "epoch": 2.0974627466774063, "grad_norm": 0.24696037255401815, "learning_rate": 2.876742293578155e-05, "loss": 0.2609, "step": 1302 }, { "epoch": 2.099073701167942, "grad_norm": 0.27166822832389825, "learning_rate": 2.8747176200702572e-05, "loss": 0.2551, "step": 1303 }, { "epoch": 2.1006846556584775, "grad_norm": 0.2735038670067162, "learning_rate": 2.8726918374918233e-05, "loss": 0.2758, "step": 1304 }, { "epoch": 2.1022956101490133, "grad_norm": 0.3028520148216462, "learning_rate": 2.87066494841138e-05, "loss": 0.2911, "step": 1305 }, { "epoch": 2.1039065646395487, "grad_norm": 0.2872325900858017, "learning_rate": 2.8686369553988576e-05, "loss": 0.2843, "step": 1306 }, { "epoch": 2.1055175191300846, "grad_norm": 0.27879094380561814, "learning_rate": 2.8666078610255854e-05, "loss": 0.2792, "step": 1307 }, { "epoch": 2.1071284736206204, "grad_norm": 0.30483411415338657, "learning_rate": 2.8645776678642893e-05, "loss": 0.2495, "step": 1308 }, { "epoch": 2.108739428111156, "grad_norm": 0.2683464007417797, "learning_rate": 2.8625463784890884e-05, "loss": 0.2867, "step": 1309 }, { "epoch": 2.1103503826016916, "grad_norm": 0.29749249023584406, "learning_rate": 2.8605139954754923e-05, "loss": 0.278, "step": 1310 }, { "epoch": 2.111961337092227, "grad_norm": 0.2687970108131372, "learning_rate": 2.8584805214003967e-05, "loss": 0.2504, "step": 1311 }, { "epoch": 2.113572291582763, "grad_norm": 0.32466154003508846, "learning_rate": 2.8564459588420807e-05, "loss": 0.2991, "step": 1312 }, { "epoch": 2.115183246073298, "grad_norm": 0.26356374059791937, "learning_rate": 2.854410310380203e-05, "loss": 0.2656, "step": 1313 }, { "epoch": 2.116794200563834, "grad_norm": 0.2949078910210175, "learning_rate": 2.8523735785958e-05, "loss": 0.2684, "step": 1314 }, { "epoch": 2.11840515505437, "grad_norm": 0.31103645726920576, "learning_rate": 2.8503357660712815e-05, "loss": 0.3014, "step": 1315 }, { "epoch": 2.1200161095449053, "grad_norm": 0.3295336108394426, "learning_rate": 2.8482968753904277e-05, "loss": 0.2945, "step": 1316 }, { "epoch": 2.121627064035441, "grad_norm": 0.3043674560354506, "learning_rate": 2.8462569091383853e-05, "loss": 0.2761, "step": 1317 }, { "epoch": 2.1232380185259765, "grad_norm": 0.2945740096658187, "learning_rate": 2.844215869901664e-05, "loss": 0.2933, "step": 1318 }, { "epoch": 2.1248489730165123, "grad_norm": 0.2751385832645397, "learning_rate": 2.8421737602681364e-05, "loss": 0.2657, "step": 1319 }, { "epoch": 2.1264599275070477, "grad_norm": 0.2966644566850293, "learning_rate": 2.8401305828270302e-05, "loss": 0.2744, "step": 1320 }, { "epoch": 2.1280708819975835, "grad_norm": 0.27307884228282997, "learning_rate": 2.838086340168927e-05, "loss": 0.2798, "step": 1321 }, { "epoch": 2.1296818364881194, "grad_norm": 0.2877534471726608, "learning_rate": 2.836041034885761e-05, "loss": 0.2493, "step": 1322 }, { "epoch": 2.1312927909786548, "grad_norm": 0.2614334519008766, "learning_rate": 2.833994669570811e-05, "loss": 0.2874, "step": 1323 }, { "epoch": 2.1329037454691906, "grad_norm": 0.29160322965169916, "learning_rate": 2.831947246818702e-05, "loss": 0.2856, "step": 1324 }, { "epoch": 2.134514699959726, "grad_norm": 0.2660119080149266, "learning_rate": 2.829898769225399e-05, "loss": 0.3064, "step": 1325 }, { "epoch": 2.136125654450262, "grad_norm": 0.2912904811381936, "learning_rate": 2.8278492393882032e-05, "loss": 0.2657, "step": 1326 }, { "epoch": 2.1377366089407976, "grad_norm": 0.2448795402010965, "learning_rate": 2.8257986599057524e-05, "loss": 0.2437, "step": 1327 }, { "epoch": 2.139347563431333, "grad_norm": 0.27855483837999373, "learning_rate": 2.8237470333780136e-05, "loss": 0.281, "step": 1328 }, { "epoch": 2.140958517921869, "grad_norm": 0.28864912755883265, "learning_rate": 2.8216943624062815e-05, "loss": 0.2634, "step": 1329 }, { "epoch": 2.1425694724124043, "grad_norm": 0.2935166591639771, "learning_rate": 2.8196406495931753e-05, "loss": 0.2896, "step": 1330 }, { "epoch": 2.14418042690294, "grad_norm": 0.27542513543126673, "learning_rate": 2.8175858975426358e-05, "loss": 0.2674, "step": 1331 }, { "epoch": 2.1457913813934755, "grad_norm": 0.28218015707916827, "learning_rate": 2.81553010885992e-05, "loss": 0.2784, "step": 1332 }, { "epoch": 2.1474023358840113, "grad_norm": 0.26365559112393655, "learning_rate": 2.813473286151601e-05, "loss": 0.2661, "step": 1333 }, { "epoch": 2.1490132903745467, "grad_norm": 0.2649219889260671, "learning_rate": 2.8114154320255612e-05, "loss": 0.2729, "step": 1334 }, { "epoch": 2.1506242448650825, "grad_norm": 0.2526894475726239, "learning_rate": 2.809356549090992e-05, "loss": 0.2877, "step": 1335 }, { "epoch": 2.1522351993556184, "grad_norm": 0.2846315930814484, "learning_rate": 2.8072966399583897e-05, "loss": 0.2967, "step": 1336 }, { "epoch": 2.1538461538461537, "grad_norm": 0.24679278451222952, "learning_rate": 2.8052357072395494e-05, "loss": 0.2792, "step": 1337 }, { "epoch": 2.1554571083366896, "grad_norm": 0.25433364102855244, "learning_rate": 2.8031737535475668e-05, "loss": 0.273, "step": 1338 }, { "epoch": 2.157068062827225, "grad_norm": 0.32637103264618766, "learning_rate": 2.801110781496831e-05, "loss": 0.29, "step": 1339 }, { "epoch": 2.158679017317761, "grad_norm": 0.2554277138169469, "learning_rate": 2.799046793703021e-05, "loss": 0.268, "step": 1340 }, { "epoch": 2.1602899718082966, "grad_norm": 0.26969310352840337, "learning_rate": 2.796981792783105e-05, "loss": 0.2853, "step": 1341 }, { "epoch": 2.161900926298832, "grad_norm": 0.2648111637517603, "learning_rate": 2.7949157813553366e-05, "loss": 0.2626, "step": 1342 }, { "epoch": 2.163511880789368, "grad_norm": 0.2559746677208276, "learning_rate": 2.7928487620392487e-05, "loss": 0.3044, "step": 1343 }, { "epoch": 2.1651228352799032, "grad_norm": 0.2714805738931639, "learning_rate": 2.790780737455654e-05, "loss": 0.2633, "step": 1344 }, { "epoch": 2.166733789770439, "grad_norm": 0.24864674470379333, "learning_rate": 2.7887117102266373e-05, "loss": 0.2552, "step": 1345 }, { "epoch": 2.1683447442609745, "grad_norm": 0.2547234928945848, "learning_rate": 2.786641682975558e-05, "loss": 0.2819, "step": 1346 }, { "epoch": 2.1699556987515103, "grad_norm": 0.26524020894968475, "learning_rate": 2.78457065832704e-05, "loss": 0.3123, "step": 1347 }, { "epoch": 2.171566653242046, "grad_norm": 0.2449565344886634, "learning_rate": 2.782498638906975e-05, "loss": 0.2643, "step": 1348 }, { "epoch": 2.1731776077325815, "grad_norm": 0.2893223634942532, "learning_rate": 2.780425627342514e-05, "loss": 0.2816, "step": 1349 }, { "epoch": 2.1747885622231173, "grad_norm": 0.2641617143950078, "learning_rate": 2.7783516262620657e-05, "loss": 0.2727, "step": 1350 }, { "epoch": 2.1763995167136527, "grad_norm": 0.24627345310846985, "learning_rate": 2.7762766382952948e-05, "loss": 0.2814, "step": 1351 }, { "epoch": 2.1780104712041886, "grad_norm": 0.28246010031678387, "learning_rate": 2.7742006660731164e-05, "loss": 0.2872, "step": 1352 }, { "epoch": 2.179621425694724, "grad_norm": 0.27546870573554, "learning_rate": 2.7721237122276944e-05, "loss": 0.2822, "step": 1353 }, { "epoch": 2.18123238018526, "grad_norm": 0.2520502824176606, "learning_rate": 2.7700457793924357e-05, "loss": 0.2802, "step": 1354 }, { "epoch": 2.1828433346757956, "grad_norm": 0.26219847355655707, "learning_rate": 2.767966870201991e-05, "loss": 0.3008, "step": 1355 }, { "epoch": 2.184454289166331, "grad_norm": 0.24699012142387217, "learning_rate": 2.765886987292246e-05, "loss": 0.2777, "step": 1356 }, { "epoch": 2.186065243656867, "grad_norm": 0.23388920141854022, "learning_rate": 2.7638061333003236e-05, "loss": 0.2587, "step": 1357 }, { "epoch": 2.187676198147402, "grad_norm": 0.26392578878534023, "learning_rate": 2.7617243108645753e-05, "loss": 0.3032, "step": 1358 }, { "epoch": 2.189287152637938, "grad_norm": 0.23761776973362883, "learning_rate": 2.759641522624583e-05, "loss": 0.2498, "step": 1359 }, { "epoch": 2.1908981071284734, "grad_norm": 0.27175194130700814, "learning_rate": 2.7575577712211524e-05, "loss": 0.2634, "step": 1360 }, { "epoch": 2.1925090616190093, "grad_norm": 0.2676460221850602, "learning_rate": 2.755473059296309e-05, "loss": 0.2786, "step": 1361 }, { "epoch": 2.194120016109545, "grad_norm": 0.276401287454354, "learning_rate": 2.7533873894932996e-05, "loss": 0.2798, "step": 1362 }, { "epoch": 2.1957309706000805, "grad_norm": 0.24034592871532384, "learning_rate": 2.7513007644565806e-05, "loss": 0.262, "step": 1363 }, { "epoch": 2.1973419250906163, "grad_norm": 0.25935727042899787, "learning_rate": 2.7492131868318247e-05, "loss": 0.2785, "step": 1364 }, { "epoch": 2.1989528795811517, "grad_norm": 0.26803580380287895, "learning_rate": 2.7471246592659075e-05, "loss": 0.286, "step": 1365 }, { "epoch": 2.2005638340716875, "grad_norm": 0.2801935730796939, "learning_rate": 2.745035184406913e-05, "loss": 0.2848, "step": 1366 }, { "epoch": 2.202174788562223, "grad_norm": 0.25675623918826773, "learning_rate": 2.7429447649041243e-05, "loss": 0.2602, "step": 1367 }, { "epoch": 2.2037857430527588, "grad_norm": 0.25791438655113136, "learning_rate": 2.7408534034080228e-05, "loss": 0.262, "step": 1368 }, { "epoch": 2.2053966975432946, "grad_norm": 0.2753646395273099, "learning_rate": 2.7387611025702837e-05, "loss": 0.2958, "step": 1369 }, { "epoch": 2.20700765203383, "grad_norm": 0.2441198130082228, "learning_rate": 2.736667865043775e-05, "loss": 0.2689, "step": 1370 }, { "epoch": 2.208618606524366, "grad_norm": 0.25462467078769574, "learning_rate": 2.734573693482549e-05, "loss": 0.2802, "step": 1371 }, { "epoch": 2.210229561014901, "grad_norm": 0.25852451115604996, "learning_rate": 2.732478590541846e-05, "loss": 0.2712, "step": 1372 }, { "epoch": 2.211840515505437, "grad_norm": 0.25961347780273764, "learning_rate": 2.7303825588780844e-05, "loss": 0.2633, "step": 1373 }, { "epoch": 2.2134514699959724, "grad_norm": 0.3037253510918743, "learning_rate": 2.7282856011488615e-05, "loss": 0.2878, "step": 1374 }, { "epoch": 2.2150624244865083, "grad_norm": 0.22592118401629108, "learning_rate": 2.7261877200129495e-05, "loss": 0.2416, "step": 1375 }, { "epoch": 2.216673378977044, "grad_norm": 0.3573981841929647, "learning_rate": 2.724088918130289e-05, "loss": 0.3017, "step": 1376 }, { "epoch": 2.2182843334675795, "grad_norm": 0.2512932467094342, "learning_rate": 2.721989198161991e-05, "loss": 0.2794, "step": 1377 }, { "epoch": 2.2198952879581153, "grad_norm": 0.3159257756922127, "learning_rate": 2.7198885627703266e-05, "loss": 0.2618, "step": 1378 }, { "epoch": 2.2215062424486507, "grad_norm": 0.2599794920778706, "learning_rate": 2.7177870146187323e-05, "loss": 0.2728, "step": 1379 }, { "epoch": 2.2231171969391865, "grad_norm": 0.2978036146497738, "learning_rate": 2.7156845563717987e-05, "loss": 0.2991, "step": 1380 }, { "epoch": 2.224728151429722, "grad_norm": 0.24686915917646665, "learning_rate": 2.7135811906952714e-05, "loss": 0.2664, "step": 1381 }, { "epoch": 2.2263391059202577, "grad_norm": 0.28407633766585894, "learning_rate": 2.711476920256046e-05, "loss": 0.3044, "step": 1382 }, { "epoch": 2.2279500604107936, "grad_norm": 0.260934439271912, "learning_rate": 2.709371747722166e-05, "loss": 0.2783, "step": 1383 }, { "epoch": 2.229561014901329, "grad_norm": 0.24471982062639003, "learning_rate": 2.7072656757628188e-05, "loss": 0.2686, "step": 1384 }, { "epoch": 2.231171969391865, "grad_norm": 0.2633341856655095, "learning_rate": 2.7051587070483307e-05, "loss": 0.28, "step": 1385 }, { "epoch": 2.2327829238824, "grad_norm": 0.26507841081942923, "learning_rate": 2.7030508442501667e-05, "loss": 0.2783, "step": 1386 }, { "epoch": 2.234393878372936, "grad_norm": 0.25464764785100497, "learning_rate": 2.7009420900409237e-05, "loss": 0.2821, "step": 1387 }, { "epoch": 2.2360048328634714, "grad_norm": 0.2868050675323047, "learning_rate": 2.6988324470943315e-05, "loss": 0.283, "step": 1388 }, { "epoch": 2.2376157873540072, "grad_norm": 0.24611229723847267, "learning_rate": 2.6967219180852448e-05, "loss": 0.2822, "step": 1389 }, { "epoch": 2.239226741844543, "grad_norm": 0.27187189363506403, "learning_rate": 2.6946105056896406e-05, "loss": 0.2994, "step": 1390 }, { "epoch": 2.2408376963350785, "grad_norm": 0.21615468885402625, "learning_rate": 2.692498212584619e-05, "loss": 0.2415, "step": 1391 }, { "epoch": 2.2424486508256143, "grad_norm": 0.26600283632882193, "learning_rate": 2.6903850414483953e-05, "loss": 0.2836, "step": 1392 }, { "epoch": 2.2440596053161497, "grad_norm": 0.23728976141998126, "learning_rate": 2.6882709949602966e-05, "loss": 0.2841, "step": 1393 }, { "epoch": 2.2456705598066855, "grad_norm": 0.25798754292795073, "learning_rate": 2.6861560758007627e-05, "loss": 0.2658, "step": 1394 }, { "epoch": 2.247281514297221, "grad_norm": 0.2734561974922546, "learning_rate": 2.684040286651338e-05, "loss": 0.306, "step": 1395 }, { "epoch": 2.2488924687877567, "grad_norm": 0.2858311588641443, "learning_rate": 2.6819236301946697e-05, "loss": 0.2793, "step": 1396 }, { "epoch": 2.2505034232782926, "grad_norm": 0.2584315966942385, "learning_rate": 2.6798061091145062e-05, "loss": 0.2597, "step": 1397 }, { "epoch": 2.252114377768828, "grad_norm": 0.27927968714043205, "learning_rate": 2.677687726095691e-05, "loss": 0.2817, "step": 1398 }, { "epoch": 2.253725332259364, "grad_norm": 0.33168687660726137, "learning_rate": 2.6755684838241614e-05, "loss": 0.3129, "step": 1399 }, { "epoch": 2.255336286749899, "grad_norm": 0.2554635016264873, "learning_rate": 2.673448384986943e-05, "loss": 0.2854, "step": 1400 }, { "epoch": 2.256947241240435, "grad_norm": 0.2580831506179042, "learning_rate": 2.6713274322721484e-05, "loss": 0.2674, "step": 1401 }, { "epoch": 2.258558195730971, "grad_norm": 0.2716906409548147, "learning_rate": 2.669205628368972e-05, "loss": 0.2931, "step": 1402 }, { "epoch": 2.260169150221506, "grad_norm": 0.22672601567358644, "learning_rate": 2.6670829759676882e-05, "loss": 0.2621, "step": 1403 }, { "epoch": 2.261780104712042, "grad_norm": 0.27308559599455845, "learning_rate": 2.6649594777596476e-05, "loss": 0.2802, "step": 1404 }, { "epoch": 2.2633910592025774, "grad_norm": 0.25346462509049156, "learning_rate": 2.6628351364372717e-05, "loss": 0.2546, "step": 1405 }, { "epoch": 2.2650020136931133, "grad_norm": 0.24996293140763204, "learning_rate": 2.6607099546940526e-05, "loss": 0.2944, "step": 1406 }, { "epoch": 2.2666129681836487, "grad_norm": 0.24233262846738599, "learning_rate": 2.6585839352245467e-05, "loss": 0.2774, "step": 1407 }, { "epoch": 2.2682239226741845, "grad_norm": 0.2583809561916111, "learning_rate": 2.6564570807243728e-05, "loss": 0.2653, "step": 1408 }, { "epoch": 2.26983487716472, "grad_norm": 0.23405502541905018, "learning_rate": 2.65432939389021e-05, "loss": 0.2733, "step": 1409 }, { "epoch": 2.2714458316552557, "grad_norm": 0.27446386749155977, "learning_rate": 2.6522008774197902e-05, "loss": 0.3221, "step": 1410 }, { "epoch": 2.2730567861457915, "grad_norm": 0.2384738855739092, "learning_rate": 2.6500715340118995e-05, "loss": 0.2705, "step": 1411 }, { "epoch": 2.274667740636327, "grad_norm": 0.24307003681995026, "learning_rate": 2.6479413663663706e-05, "loss": 0.2785, "step": 1412 }, { "epoch": 2.2762786951268628, "grad_norm": 0.2526921045428496, "learning_rate": 2.6458103771840835e-05, "loss": 0.2871, "step": 1413 }, { "epoch": 2.277889649617398, "grad_norm": 0.24931242626377267, "learning_rate": 2.6436785691669578e-05, "loss": 0.2808, "step": 1414 }, { "epoch": 2.279500604107934, "grad_norm": 0.2887386630317653, "learning_rate": 2.6415459450179515e-05, "loss": 0.3049, "step": 1415 }, { "epoch": 2.28111155859847, "grad_norm": 0.25033808397227236, "learning_rate": 2.6394125074410584e-05, "loss": 0.2904, "step": 1416 }, { "epoch": 2.282722513089005, "grad_norm": 0.2699612396823536, "learning_rate": 2.6372782591413034e-05, "loss": 0.2729, "step": 1417 }, { "epoch": 2.284333467579541, "grad_norm": 0.23924615054624318, "learning_rate": 2.635143202824739e-05, "loss": 0.2686, "step": 1418 }, { "epoch": 2.2859444220700764, "grad_norm": 0.279887893837138, "learning_rate": 2.6330073411984418e-05, "loss": 0.2697, "step": 1419 }, { "epoch": 2.2875553765606123, "grad_norm": 0.2545245668178236, "learning_rate": 2.6308706769705118e-05, "loss": 0.2698, "step": 1420 }, { "epoch": 2.2891663310511476, "grad_norm": 0.2544237823113252, "learning_rate": 2.6287332128500616e-05, "loss": 0.2784, "step": 1421 }, { "epoch": 2.2907772855416835, "grad_norm": 0.2588334468257935, "learning_rate": 2.6265949515472247e-05, "loss": 0.2856, "step": 1422 }, { "epoch": 2.292388240032219, "grad_norm": 0.2640309275817744, "learning_rate": 2.6244558957731386e-05, "loss": 0.2921, "step": 1423 }, { "epoch": 2.2939991945227547, "grad_norm": 0.27035992387226954, "learning_rate": 2.622316048239954e-05, "loss": 0.2719, "step": 1424 }, { "epoch": 2.2956101490132905, "grad_norm": 0.2763448005510581, "learning_rate": 2.6201754116608222e-05, "loss": 0.2977, "step": 1425 }, { "epoch": 2.297221103503826, "grad_norm": 0.2454033455212278, "learning_rate": 2.618033988749895e-05, "loss": 0.261, "step": 1426 }, { "epoch": 2.2988320579943617, "grad_norm": 0.28206219976356545, "learning_rate": 2.615891782222322e-05, "loss": 0.3032, "step": 1427 }, { "epoch": 2.300443012484897, "grad_norm": 0.25919572585437717, "learning_rate": 2.6137487947942472e-05, "loss": 0.2932, "step": 1428 }, { "epoch": 2.302053966975433, "grad_norm": 0.23936651714279733, "learning_rate": 2.6116050291828026e-05, "loss": 0.2885, "step": 1429 }, { "epoch": 2.303664921465969, "grad_norm": 0.26079396517622616, "learning_rate": 2.6094604881061076e-05, "loss": 0.2644, "step": 1430 }, { "epoch": 2.305275875956504, "grad_norm": 0.27314314971219317, "learning_rate": 2.607315174283267e-05, "loss": 0.3197, "step": 1431 }, { "epoch": 2.30688683044704, "grad_norm": 0.23566335984437972, "learning_rate": 2.6051690904343616e-05, "loss": 0.2795, "step": 1432 }, { "epoch": 2.3084977849375754, "grad_norm": 0.2598558733558277, "learning_rate": 2.6030222392804526e-05, "loss": 0.2833, "step": 1433 }, { "epoch": 2.3101087394281112, "grad_norm": 0.2226975838547479, "learning_rate": 2.60087462354357e-05, "loss": 0.2551, "step": 1434 }, { "epoch": 2.3117196939186466, "grad_norm": 0.2798036466100915, "learning_rate": 2.5987262459467168e-05, "loss": 0.3226, "step": 1435 }, { "epoch": 2.3133306484091825, "grad_norm": 0.2241641247015054, "learning_rate": 2.5965771092138586e-05, "loss": 0.2618, "step": 1436 }, { "epoch": 2.314941602899718, "grad_norm": 0.28580401578871045, "learning_rate": 2.5944272160699272e-05, "loss": 0.2888, "step": 1437 }, { "epoch": 2.3165525573902537, "grad_norm": 0.256570118424594, "learning_rate": 2.5922765692408112e-05, "loss": 0.2771, "step": 1438 }, { "epoch": 2.3181635118807895, "grad_norm": 0.2578027303979569, "learning_rate": 2.5901251714533543e-05, "loss": 0.2574, "step": 1439 }, { "epoch": 2.319774466371325, "grad_norm": 0.2798470005022122, "learning_rate": 2.5879730254353543e-05, "loss": 0.2981, "step": 1440 }, { "epoch": 2.3213854208618607, "grad_norm": 0.27607698811706227, "learning_rate": 2.585820133915555e-05, "loss": 0.2865, "step": 1441 }, { "epoch": 2.322996375352396, "grad_norm": 0.26912286490683257, "learning_rate": 2.58366649962365e-05, "loss": 0.2695, "step": 1442 }, { "epoch": 2.324607329842932, "grad_norm": 0.26457014248103583, "learning_rate": 2.581512125290269e-05, "loss": 0.2812, "step": 1443 }, { "epoch": 2.326218284333468, "grad_norm": 0.2598505097593347, "learning_rate": 2.579357013646985e-05, "loss": 0.2606, "step": 1444 }, { "epoch": 2.327829238824003, "grad_norm": 0.31694547112574056, "learning_rate": 2.5772011674263017e-05, "loss": 0.2661, "step": 1445 }, { "epoch": 2.329440193314539, "grad_norm": 0.24909334654252163, "learning_rate": 2.575044589361657e-05, "loss": 0.2705, "step": 1446 }, { "epoch": 2.3310511478050744, "grad_norm": 0.2774620303831127, "learning_rate": 2.5728872821874155e-05, "loss": 0.2874, "step": 1447 }, { "epoch": 2.33266210229561, "grad_norm": 0.2735139105570509, "learning_rate": 2.5707292486388675e-05, "loss": 0.3037, "step": 1448 }, { "epoch": 2.3342730567861456, "grad_norm": 0.2740635594254122, "learning_rate": 2.5685704914522223e-05, "loss": 0.2897, "step": 1449 }, { "epoch": 2.3358840112766814, "grad_norm": 0.24084315748689009, "learning_rate": 2.566411013364608e-05, "loss": 0.2486, "step": 1450 }, { "epoch": 2.3374949657672173, "grad_norm": 0.287578601994385, "learning_rate": 2.5642508171140675e-05, "loss": 0.3072, "step": 1451 }, { "epoch": 2.3391059202577527, "grad_norm": 0.2550922001271773, "learning_rate": 2.562089905439552e-05, "loss": 0.2716, "step": 1452 }, { "epoch": 2.3407168747482885, "grad_norm": 0.28349657757580804, "learning_rate": 2.5599282810809222e-05, "loss": 0.2776, "step": 1453 }, { "epoch": 2.342327829238824, "grad_norm": 0.2539299708169048, "learning_rate": 2.5577659467789397e-05, "loss": 0.2882, "step": 1454 }, { "epoch": 2.3439387837293597, "grad_norm": 0.2638598040941746, "learning_rate": 2.5556029052752704e-05, "loss": 0.2753, "step": 1455 }, { "epoch": 2.345549738219895, "grad_norm": 0.26546655271952696, "learning_rate": 2.5534391593124717e-05, "loss": 0.2727, "step": 1456 }, { "epoch": 2.347160692710431, "grad_norm": 0.274114516324995, "learning_rate": 2.5512747116339985e-05, "loss": 0.2679, "step": 1457 }, { "epoch": 2.3487716472009668, "grad_norm": 0.27615786351600663, "learning_rate": 2.5491095649841936e-05, "loss": 0.29, "step": 1458 }, { "epoch": 2.350382601691502, "grad_norm": 0.2523360738477017, "learning_rate": 2.5469437221082855e-05, "loss": 0.2876, "step": 1459 }, { "epoch": 2.351993556182038, "grad_norm": 0.26083088114961767, "learning_rate": 2.5447771857523868e-05, "loss": 0.2615, "step": 1460 }, { "epoch": 2.3536045106725734, "grad_norm": 0.26930621483118783, "learning_rate": 2.5426099586634885e-05, "loss": 0.2798, "step": 1461 }, { "epoch": 2.355215465163109, "grad_norm": 0.25754096649737024, "learning_rate": 2.5404420435894578e-05, "loss": 0.2668, "step": 1462 }, { "epoch": 2.356826419653645, "grad_norm": 0.2699366881095369, "learning_rate": 2.538273443279033e-05, "loss": 0.3052, "step": 1463 }, { "epoch": 2.3584373741441804, "grad_norm": 0.2635570721152839, "learning_rate": 2.5361041604818244e-05, "loss": 0.2682, "step": 1464 }, { "epoch": 2.3600483286347163, "grad_norm": 0.2407160146785441, "learning_rate": 2.5339341979483037e-05, "loss": 0.2746, "step": 1465 }, { "epoch": 2.3616592831252516, "grad_norm": 0.29061883733407995, "learning_rate": 2.531763558429807e-05, "loss": 0.2968, "step": 1466 }, { "epoch": 2.3632702376157875, "grad_norm": 0.24834565149196813, "learning_rate": 2.5295922446785275e-05, "loss": 0.2667, "step": 1467 }, { "epoch": 2.364881192106323, "grad_norm": 0.2583516235073483, "learning_rate": 2.527420259447514e-05, "loss": 0.272, "step": 1468 }, { "epoch": 2.3664921465968587, "grad_norm": 0.23935397713698203, "learning_rate": 2.5252476054906668e-05, "loss": 0.2852, "step": 1469 }, { "epoch": 2.368103101087394, "grad_norm": 0.2530197056590334, "learning_rate": 2.523074285562734e-05, "loss": 0.2705, "step": 1470 }, { "epoch": 2.36971405557793, "grad_norm": 0.25163841004710474, "learning_rate": 2.5209003024193067e-05, "loss": 0.2905, "step": 1471 }, { "epoch": 2.3713250100684657, "grad_norm": 0.23237404626614086, "learning_rate": 2.518725658816819e-05, "loss": 0.2526, "step": 1472 }, { "epoch": 2.372935964559001, "grad_norm": 0.26024820542438537, "learning_rate": 2.5165503575125413e-05, "loss": 0.2846, "step": 1473 }, { "epoch": 2.374546919049537, "grad_norm": 0.2585972661841443, "learning_rate": 2.514374401264578e-05, "loss": 0.2749, "step": 1474 }, { "epoch": 2.3761578735400724, "grad_norm": 0.2845138208353183, "learning_rate": 2.5121977928318638e-05, "loss": 0.2748, "step": 1475 }, { "epoch": 2.377768828030608, "grad_norm": 0.2495500233521992, "learning_rate": 2.5100205349741602e-05, "loss": 0.2787, "step": 1476 }, { "epoch": 2.379379782521144, "grad_norm": 0.25151165206564124, "learning_rate": 2.507842630452054e-05, "loss": 0.3044, "step": 1477 }, { "epoch": 2.3809907370116794, "grad_norm": 0.26429785064547257, "learning_rate": 2.5056640820269484e-05, "loss": 0.2835, "step": 1478 }, { "epoch": 2.3826016915022152, "grad_norm": 0.2461207371041576, "learning_rate": 2.503484892461066e-05, "loss": 0.2774, "step": 1479 }, { "epoch": 2.3842126459927506, "grad_norm": 0.27735647570825706, "learning_rate": 2.5013050645174414e-05, "loss": 0.2757, "step": 1480 }, { "epoch": 2.3858236004832865, "grad_norm": 0.2755004610517415, "learning_rate": 2.499124600959918e-05, "loss": 0.2728, "step": 1481 }, { "epoch": 2.387434554973822, "grad_norm": 0.2752976426963208, "learning_rate": 2.4969435045531457e-05, "loss": 0.2604, "step": 1482 }, { "epoch": 2.3890455094643577, "grad_norm": 0.2716284118402911, "learning_rate": 2.494761778062577e-05, "loss": 0.282, "step": 1483 }, { "epoch": 2.390656463954893, "grad_norm": 0.25175456114152706, "learning_rate": 2.4925794242544626e-05, "loss": 0.2972, "step": 1484 }, { "epoch": 2.392267418445429, "grad_norm": 0.2690581205659193, "learning_rate": 2.490396445895849e-05, "loss": 0.2779, "step": 1485 }, { "epoch": 2.3938783729359647, "grad_norm": 0.25560689396934266, "learning_rate": 2.4882128457545748e-05, "loss": 0.2728, "step": 1486 }, { "epoch": 2.3954893274265, "grad_norm": 0.25135288220419316, "learning_rate": 2.4860286265992667e-05, "loss": 0.2888, "step": 1487 }, { "epoch": 2.397100281917036, "grad_norm": 0.2492488977293215, "learning_rate": 2.4838437911993355e-05, "loss": 0.2714, "step": 1488 }, { "epoch": 2.3987112364075713, "grad_norm": 0.2374341024869927, "learning_rate": 2.4816583423249756e-05, "loss": 0.2416, "step": 1489 }, { "epoch": 2.400322190898107, "grad_norm": 0.25762720542042816, "learning_rate": 2.479472282747157e-05, "loss": 0.3118, "step": 1490 }, { "epoch": 2.401933145388643, "grad_norm": 0.24506926236957213, "learning_rate": 2.4772856152376244e-05, "loss": 0.2944, "step": 1491 }, { "epoch": 2.4035440998791784, "grad_norm": 0.2640437205274289, "learning_rate": 2.4750983425688945e-05, "loss": 0.2895, "step": 1492 }, { "epoch": 2.405155054369714, "grad_norm": 0.23704849670354022, "learning_rate": 2.4729104675142496e-05, "loss": 0.2877, "step": 1493 }, { "epoch": 2.4067660088602496, "grad_norm": 0.24521000147844632, "learning_rate": 2.4707219928477372e-05, "loss": 0.2936, "step": 1494 }, { "epoch": 2.4083769633507854, "grad_norm": 0.23091038574228756, "learning_rate": 2.4685329213441645e-05, "loss": 0.2655, "step": 1495 }, { "epoch": 2.409987917841321, "grad_norm": 0.2466614443346807, "learning_rate": 2.4663432557790955e-05, "loss": 0.2762, "step": 1496 }, { "epoch": 2.4115988723318567, "grad_norm": 0.24469535305837437, "learning_rate": 2.464152998928848e-05, "loss": 0.2855, "step": 1497 }, { "epoch": 2.413209826822392, "grad_norm": 0.2505015124713008, "learning_rate": 2.461962153570487e-05, "loss": 0.3012, "step": 1498 }, { "epoch": 2.414820781312928, "grad_norm": 0.23025616103553312, "learning_rate": 2.459770722481827e-05, "loss": 0.2608, "step": 1499 }, { "epoch": 2.4164317358034637, "grad_norm": 0.23710616228759754, "learning_rate": 2.4575787084414244e-05, "loss": 0.2949, "step": 1500 }, { "epoch": 2.418042690293999, "grad_norm": 0.26950446786419663, "learning_rate": 2.4553861142285718e-05, "loss": 0.2945, "step": 1501 }, { "epoch": 2.419653644784535, "grad_norm": 0.22872827384203387, "learning_rate": 2.4531929426233017e-05, "loss": 0.2743, "step": 1502 }, { "epoch": 2.4212645992750703, "grad_norm": 0.24607877318799967, "learning_rate": 2.4509991964063762e-05, "loss": 0.2788, "step": 1503 }, { "epoch": 2.422875553765606, "grad_norm": 0.2392918287186858, "learning_rate": 2.4488048783592864e-05, "loss": 0.2588, "step": 1504 }, { "epoch": 2.424486508256142, "grad_norm": 0.2546711994645602, "learning_rate": 2.446609991264248e-05, "loss": 0.2949, "step": 1505 }, { "epoch": 2.4260974627466774, "grad_norm": 0.24785163146676142, "learning_rate": 2.4444145379041987e-05, "loss": 0.2906, "step": 1506 }, { "epoch": 2.427708417237213, "grad_norm": 0.23900765965369322, "learning_rate": 2.4422185210627943e-05, "loss": 0.2707, "step": 1507 }, { "epoch": 2.4293193717277486, "grad_norm": 0.2849435663798419, "learning_rate": 2.4400219435244047e-05, "loss": 0.3101, "step": 1508 }, { "epoch": 2.4309303262182844, "grad_norm": 0.2335728091178472, "learning_rate": 2.4378248080741123e-05, "loss": 0.2767, "step": 1509 }, { "epoch": 2.43254128070882, "grad_norm": 0.24500309892560315, "learning_rate": 2.435627117497703e-05, "loss": 0.2848, "step": 1510 }, { "epoch": 2.4341522351993556, "grad_norm": 0.26504756906620003, "learning_rate": 2.4334288745816714e-05, "loss": 0.3254, "step": 1511 }, { "epoch": 2.435763189689891, "grad_norm": 0.22953986456533704, "learning_rate": 2.4312300821132087e-05, "loss": 0.2533, "step": 1512 }, { "epoch": 2.437374144180427, "grad_norm": 0.2685582896959977, "learning_rate": 2.4290307428802047e-05, "loss": 0.2936, "step": 1513 }, { "epoch": 2.4389850986709627, "grad_norm": 0.257894081779025, "learning_rate": 2.426830859671242e-05, "loss": 0.2694, "step": 1514 }, { "epoch": 2.440596053161498, "grad_norm": 0.27245685735959463, "learning_rate": 2.4246304352755924e-05, "loss": 0.2743, "step": 1515 }, { "epoch": 2.442207007652034, "grad_norm": 0.2744009692046602, "learning_rate": 2.4224294724832152e-05, "loss": 0.2851, "step": 1516 }, { "epoch": 2.4438179621425693, "grad_norm": 0.2923107453438156, "learning_rate": 2.420227974084751e-05, "loss": 0.2836, "step": 1517 }, { "epoch": 2.445428916633105, "grad_norm": 0.259635977803087, "learning_rate": 2.4180259428715203e-05, "loss": 0.2757, "step": 1518 }, { "epoch": 2.447039871123641, "grad_norm": 0.2792257618006009, "learning_rate": 2.4158233816355185e-05, "loss": 0.2601, "step": 1519 }, { "epoch": 2.4486508256141764, "grad_norm": 0.2922956877345361, "learning_rate": 2.413620293169415e-05, "loss": 0.2859, "step": 1520 }, { "epoch": 2.450261780104712, "grad_norm": 0.27963346219976065, "learning_rate": 2.4114166802665437e-05, "loss": 0.2903, "step": 1521 }, { "epoch": 2.4518727345952476, "grad_norm": 0.26021125040145265, "learning_rate": 2.409212545720908e-05, "loss": 0.2755, "step": 1522 }, { "epoch": 2.4534836890857834, "grad_norm": 0.32453764777695815, "learning_rate": 2.4070078923271688e-05, "loss": 0.3209, "step": 1523 }, { "epoch": 2.455094643576319, "grad_norm": 0.25241900447923254, "learning_rate": 2.404802722880649e-05, "loss": 0.2571, "step": 1524 }, { "epoch": 2.4567055980668546, "grad_norm": 0.28154790258027407, "learning_rate": 2.4025970401773204e-05, "loss": 0.2609, "step": 1525 }, { "epoch": 2.45831655255739, "grad_norm": 0.2591231087696456, "learning_rate": 2.4003908470138106e-05, "loss": 0.2897, "step": 1526 }, { "epoch": 2.459927507047926, "grad_norm": 0.285177493044488, "learning_rate": 2.3981841461873927e-05, "loss": 0.2781, "step": 1527 }, { "epoch": 2.4615384615384617, "grad_norm": 0.28975036947156085, "learning_rate": 2.3959769404959817e-05, "loss": 0.28, "step": 1528 }, { "epoch": 2.463149416028997, "grad_norm": 0.26691677035364914, "learning_rate": 2.3937692327381356e-05, "loss": 0.2775, "step": 1529 }, { "epoch": 2.464760370519533, "grad_norm": 0.2681414848359182, "learning_rate": 2.3915610257130464e-05, "loss": 0.2622, "step": 1530 }, { "epoch": 2.4663713250100683, "grad_norm": 0.24939280874639216, "learning_rate": 2.3893523222205416e-05, "loss": 0.2722, "step": 1531 }, { "epoch": 2.467982279500604, "grad_norm": 0.2669734237482074, "learning_rate": 2.3871431250610765e-05, "loss": 0.2888, "step": 1532 }, { "epoch": 2.46959323399114, "grad_norm": 0.24728738996882668, "learning_rate": 2.3849334370357325e-05, "loss": 0.2927, "step": 1533 }, { "epoch": 2.4712041884816753, "grad_norm": 0.22346148184383519, "learning_rate": 2.382723260946213e-05, "loss": 0.2542, "step": 1534 }, { "epoch": 2.472815142972211, "grad_norm": 0.2713126696973073, "learning_rate": 2.3805125995948422e-05, "loss": 0.2982, "step": 1535 }, { "epoch": 2.4744260974627466, "grad_norm": 0.24113638235375398, "learning_rate": 2.3783014557845573e-05, "loss": 0.2818, "step": 1536 }, { "epoch": 2.4760370519532824, "grad_norm": 0.2340220966184417, "learning_rate": 2.376089832318909e-05, "loss": 0.2654, "step": 1537 }, { "epoch": 2.4776480064438178, "grad_norm": 0.2515922263908132, "learning_rate": 2.3738777320020544e-05, "loss": 0.2906, "step": 1538 }, { "epoch": 2.4792589609343536, "grad_norm": 0.23936876164172788, "learning_rate": 2.3716651576387562e-05, "loss": 0.2705, "step": 1539 }, { "epoch": 2.4808699154248894, "grad_norm": 0.23480838263802561, "learning_rate": 2.369452112034379e-05, "loss": 0.2845, "step": 1540 }, { "epoch": 2.482480869915425, "grad_norm": 0.27839276582383377, "learning_rate": 2.3672385979948825e-05, "loss": 0.303, "step": 1541 }, { "epoch": 2.4840918244059607, "grad_norm": 0.23665728826604437, "learning_rate": 2.3650246183268238e-05, "loss": 0.2787, "step": 1542 }, { "epoch": 2.485702778896496, "grad_norm": 0.25851421490627297, "learning_rate": 2.3628101758373464e-05, "loss": 0.2929, "step": 1543 }, { "epoch": 2.487313733387032, "grad_norm": 0.23517510701071315, "learning_rate": 2.360595273334184e-05, "loss": 0.2683, "step": 1544 }, { "epoch": 2.4889246878775673, "grad_norm": 0.24605001495252662, "learning_rate": 2.3583799136256505e-05, "loss": 0.2844, "step": 1545 }, { "epoch": 2.490535642368103, "grad_norm": 0.25239596472454423, "learning_rate": 2.356164099520643e-05, "loss": 0.2629, "step": 1546 }, { "epoch": 2.492146596858639, "grad_norm": 0.2693185313862507, "learning_rate": 2.3539478338286325e-05, "loss": 0.2827, "step": 1547 }, { "epoch": 2.4937575513491743, "grad_norm": 0.3241129729749029, "learning_rate": 2.351731119359662e-05, "loss": 0.2942, "step": 1548 }, { "epoch": 2.49536850583971, "grad_norm": 0.2568486623179716, "learning_rate": 2.3495139589243455e-05, "loss": 0.2826, "step": 1549 }, { "epoch": 2.4969794603302455, "grad_norm": 0.30686580882418024, "learning_rate": 2.3472963553338614e-05, "loss": 0.2742, "step": 1550 }, { "epoch": 2.4985904148207814, "grad_norm": 0.2576518847992097, "learning_rate": 2.3450783113999487e-05, "loss": 0.28, "step": 1551 }, { "epoch": 2.500201369311317, "grad_norm": 0.3350355156604317, "learning_rate": 2.3428598299349076e-05, "loss": 0.2764, "step": 1552 }, { "epoch": 2.5018123238018526, "grad_norm": 0.2557894175372604, "learning_rate": 2.3406409137515912e-05, "loss": 0.2601, "step": 1553 }, { "epoch": 2.503423278292388, "grad_norm": 0.26635573164209964, "learning_rate": 2.338421565663403e-05, "loss": 0.287, "step": 1554 }, { "epoch": 2.505034232782924, "grad_norm": 0.28770740938014006, "learning_rate": 2.3362017884842967e-05, "loss": 0.2796, "step": 1555 }, { "epoch": 2.5066451872734596, "grad_norm": 0.25946436577333, "learning_rate": 2.3339815850287676e-05, "loss": 0.2753, "step": 1556 }, { "epoch": 2.508256141763995, "grad_norm": 0.2588519714140495, "learning_rate": 2.3317609581118527e-05, "loss": 0.2652, "step": 1557 }, { "epoch": 2.509867096254531, "grad_norm": 0.2690897104911625, "learning_rate": 2.3295399105491256e-05, "loss": 0.2791, "step": 1558 }, { "epoch": 2.5114780507450662, "grad_norm": 0.2520218508663738, "learning_rate": 2.3273184451566934e-05, "loss": 0.2593, "step": 1559 }, { "epoch": 2.513089005235602, "grad_norm": 0.2621023752863465, "learning_rate": 2.325096564751193e-05, "loss": 0.2736, "step": 1560 }, { "epoch": 2.514699959726138, "grad_norm": 0.22160638366693367, "learning_rate": 2.322874272149787e-05, "loss": 0.2715, "step": 1561 }, { "epoch": 2.5163109142166733, "grad_norm": 0.28383611341277853, "learning_rate": 2.3206515701701612e-05, "loss": 0.3236, "step": 1562 }, { "epoch": 2.517921868707209, "grad_norm": 0.23283625465888177, "learning_rate": 2.3184284616305205e-05, "loss": 0.2733, "step": 1563 }, { "epoch": 2.5195328231977445, "grad_norm": 0.24394026083888837, "learning_rate": 2.316204949349585e-05, "loss": 0.2741, "step": 1564 }, { "epoch": 2.5211437776882804, "grad_norm": 0.230477810923903, "learning_rate": 2.3139810361465854e-05, "loss": 0.2541, "step": 1565 }, { "epoch": 2.522754732178816, "grad_norm": 0.2515917527239852, "learning_rate": 2.311756724841265e-05, "loss": 0.2867, "step": 1566 }, { "epoch": 2.5243656866693516, "grad_norm": 0.24050394233763317, "learning_rate": 2.3095320182538657e-05, "loss": 0.2944, "step": 1567 }, { "epoch": 2.525976641159887, "grad_norm": 0.24985185288588335, "learning_rate": 2.3073069192051364e-05, "loss": 0.2617, "step": 1568 }, { "epoch": 2.527587595650423, "grad_norm": 0.24018241301765708, "learning_rate": 2.305081430516319e-05, "loss": 0.2595, "step": 1569 }, { "epoch": 2.5291985501409586, "grad_norm": 0.25522923876634196, "learning_rate": 2.3028555550091536e-05, "loss": 0.2948, "step": 1570 }, { "epoch": 2.530809504631494, "grad_norm": 0.2517439540443441, "learning_rate": 2.300629295505867e-05, "loss": 0.2434, "step": 1571 }, { "epoch": 2.53242045912203, "grad_norm": 0.23925236470373543, "learning_rate": 2.2984026548291752e-05, "loss": 0.2815, "step": 1572 }, { "epoch": 2.5340314136125652, "grad_norm": 0.2870555456129622, "learning_rate": 2.2961756358022765e-05, "loss": 0.313, "step": 1573 }, { "epoch": 2.535642368103101, "grad_norm": 0.24767206562519722, "learning_rate": 2.2939482412488498e-05, "loss": 0.2575, "step": 1574 }, { "epoch": 2.537253322593637, "grad_norm": 0.2722004878155017, "learning_rate": 2.291720473993049e-05, "loss": 0.2865, "step": 1575 }, { "epoch": 2.5388642770841723, "grad_norm": 0.27997751844902186, "learning_rate": 2.289492336859501e-05, "loss": 0.2742, "step": 1576 }, { "epoch": 2.540475231574708, "grad_norm": 0.3147302217916788, "learning_rate": 2.2872638326733018e-05, "loss": 0.2786, "step": 1577 }, { "epoch": 2.5420861860652435, "grad_norm": 0.24157360086493543, "learning_rate": 2.2850349642600137e-05, "loss": 0.2671, "step": 1578 }, { "epoch": 2.5436971405557793, "grad_norm": 0.2931844117704698, "learning_rate": 2.282805734445659e-05, "loss": 0.2828, "step": 1579 }, { "epoch": 2.545308095046315, "grad_norm": 0.27831191484940104, "learning_rate": 2.2805761460567197e-05, "loss": 0.2876, "step": 1580 }, { "epoch": 2.5469190495368506, "grad_norm": 0.27135097897303195, "learning_rate": 2.278346201920131e-05, "loss": 0.3, "step": 1581 }, { "epoch": 2.5485300040273864, "grad_norm": 0.24836372557953207, "learning_rate": 2.2761159048632813e-05, "loss": 0.2653, "step": 1582 }, { "epoch": 2.5501409585179218, "grad_norm": 0.28745989489603807, "learning_rate": 2.273885257714004e-05, "loss": 0.263, "step": 1583 }, { "epoch": 2.5517519130084576, "grad_norm": 0.2616915357039271, "learning_rate": 2.2716542633005777e-05, "loss": 0.2864, "step": 1584 }, { "epoch": 2.553362867498993, "grad_norm": 0.2804002104585373, "learning_rate": 2.2694229244517226e-05, "loss": 0.271, "step": 1585 }, { "epoch": 2.554973821989529, "grad_norm": 0.30579400972491694, "learning_rate": 2.2671912439965923e-05, "loss": 0.2869, "step": 1586 }, { "epoch": 2.556584776480064, "grad_norm": 0.2577374199945713, "learning_rate": 2.264959224764777e-05, "loss": 0.2946, "step": 1587 }, { "epoch": 2.5581957309706, "grad_norm": 0.2970356900027897, "learning_rate": 2.262726869586293e-05, "loss": 0.282, "step": 1588 }, { "epoch": 2.559806685461136, "grad_norm": 0.2555540175655143, "learning_rate": 2.260494181291587e-05, "loss": 0.2948, "step": 1589 }, { "epoch": 2.5614176399516713, "grad_norm": 0.2410510585798002, "learning_rate": 2.258261162711523e-05, "loss": 0.2694, "step": 1590 }, { "epoch": 2.563028594442207, "grad_norm": 0.29264739126391853, "learning_rate": 2.256027816677388e-05, "loss": 0.2778, "step": 1591 }, { "epoch": 2.5646395489327425, "grad_norm": 0.2495793628483905, "learning_rate": 2.2537941460208818e-05, "loss": 0.2792, "step": 1592 }, { "epoch": 2.5662505034232783, "grad_norm": 0.25120155950587036, "learning_rate": 2.2515601535741168e-05, "loss": 0.2727, "step": 1593 }, { "epoch": 2.567861457913814, "grad_norm": 0.2895173143422783, "learning_rate": 2.2493258421696124e-05, "loss": 0.268, "step": 1594 }, { "epoch": 2.5694724124043495, "grad_norm": 0.235477824563421, "learning_rate": 2.2470912146402935e-05, "loss": 0.2559, "step": 1595 }, { "epoch": 2.5710833668948854, "grad_norm": 0.23324303749320113, "learning_rate": 2.244856273819485e-05, "loss": 0.276, "step": 1596 }, { "epoch": 2.5726943213854208, "grad_norm": 0.2874685117002799, "learning_rate": 2.24262102254091e-05, "loss": 0.3234, "step": 1597 }, { "epoch": 2.5743052758759566, "grad_norm": 0.22530125349976493, "learning_rate": 2.2403854636386843e-05, "loss": 0.262, "step": 1598 }, { "epoch": 2.5759162303664924, "grad_norm": 0.25364847023690723, "learning_rate": 2.238149599947314e-05, "loss": 0.2724, "step": 1599 }, { "epoch": 2.577527184857028, "grad_norm": 0.25154728625547484, "learning_rate": 2.2359134343016926e-05, "loss": 0.2899, "step": 1600 }, { "epoch": 2.579138139347563, "grad_norm": 0.2631559112148431, "learning_rate": 2.233676969537094e-05, "loss": 0.2735, "step": 1601 }, { "epoch": 2.580749093838099, "grad_norm": 0.26024689995575573, "learning_rate": 2.2314402084891746e-05, "loss": 0.2723, "step": 1602 }, { "epoch": 2.582360048328635, "grad_norm": 0.2432406812780365, "learning_rate": 2.2292031539939635e-05, "loss": 0.2651, "step": 1603 }, { "epoch": 2.5839710028191702, "grad_norm": 0.26945019384770447, "learning_rate": 2.2269658088878638e-05, "loss": 0.2906, "step": 1604 }, { "epoch": 2.585581957309706, "grad_norm": 0.25788545777611716, "learning_rate": 2.2247281760076468e-05, "loss": 0.2883, "step": 1605 }, { "epoch": 2.5871929118002415, "grad_norm": 0.2732432882264997, "learning_rate": 2.2224902581904476e-05, "loss": 0.268, "step": 1606 }, { "epoch": 2.5888038662907773, "grad_norm": 0.25613794655168626, "learning_rate": 2.2202520582737635e-05, "loss": 0.2596, "step": 1607 }, { "epoch": 2.590414820781313, "grad_norm": 0.3055836045703691, "learning_rate": 2.2180135790954494e-05, "loss": 0.2931, "step": 1608 }, { "epoch": 2.5920257752718485, "grad_norm": 0.23324193325162323, "learning_rate": 2.215774823493715e-05, "loss": 0.2667, "step": 1609 }, { "epoch": 2.5936367297623844, "grad_norm": 0.24811631478589213, "learning_rate": 2.213535794307118e-05, "loss": 0.2757, "step": 1610 }, { "epoch": 2.5952476842529197, "grad_norm": 0.2758092631908418, "learning_rate": 2.211296494374566e-05, "loss": 0.2924, "step": 1611 }, { "epoch": 2.5968586387434556, "grad_norm": 0.2643433716278074, "learning_rate": 2.209056926535307e-05, "loss": 0.3036, "step": 1612 }, { "epoch": 2.5984695932339914, "grad_norm": 0.2685767270547775, "learning_rate": 2.2068170936289323e-05, "loss": 0.2643, "step": 1613 }, { "epoch": 2.600080547724527, "grad_norm": 0.24441427558762532, "learning_rate": 2.2045769984953652e-05, "loss": 0.2685, "step": 1614 }, { "epoch": 2.601691502215062, "grad_norm": 0.29930220047366574, "learning_rate": 2.2023366439748647e-05, "loss": 0.2746, "step": 1615 }, { "epoch": 2.603302456705598, "grad_norm": 0.22953726566232438, "learning_rate": 2.2000960329080166e-05, "loss": 0.2864, "step": 1616 }, { "epoch": 2.604913411196134, "grad_norm": 0.25919477109412487, "learning_rate": 2.197855168135734e-05, "loss": 0.2724, "step": 1617 }, { "epoch": 2.6065243656866692, "grad_norm": 0.24583444723040715, "learning_rate": 2.1956140524992495e-05, "loss": 0.2721, "step": 1618 }, { "epoch": 2.608135320177205, "grad_norm": 0.2488426900524242, "learning_rate": 2.1933726888401146e-05, "loss": 0.3036, "step": 1619 }, { "epoch": 2.6097462746677405, "grad_norm": 0.2709750897529024, "learning_rate": 2.1911310800001967e-05, "loss": 0.2761, "step": 1620 }, { "epoch": 2.6113572291582763, "grad_norm": 0.27314838210070563, "learning_rate": 2.188889228821671e-05, "loss": 0.2854, "step": 1621 }, { "epoch": 2.612968183648812, "grad_norm": 0.24503356817611527, "learning_rate": 2.186647138147024e-05, "loss": 0.2713, "step": 1622 }, { "epoch": 2.6145791381393475, "grad_norm": 0.23765691589614468, "learning_rate": 2.184404810819041e-05, "loss": 0.2682, "step": 1623 }, { "epoch": 2.6161900926298833, "grad_norm": 0.25031315550528577, "learning_rate": 2.182162249680813e-05, "loss": 0.2755, "step": 1624 }, { "epoch": 2.6178010471204187, "grad_norm": 0.24667393832777065, "learning_rate": 2.179919457575722e-05, "loss": 0.2879, "step": 1625 }, { "epoch": 2.6194120016109546, "grad_norm": 0.22980591927089686, "learning_rate": 2.1776764373474465e-05, "loss": 0.267, "step": 1626 }, { "epoch": 2.6210229561014904, "grad_norm": 0.24144423356769742, "learning_rate": 2.1754331918399526e-05, "loss": 0.2771, "step": 1627 }, { "epoch": 2.6226339105920258, "grad_norm": 0.2665613251605162, "learning_rate": 2.1731897238974926e-05, "loss": 0.3079, "step": 1628 }, { "epoch": 2.624244865082561, "grad_norm": 0.22659947309530976, "learning_rate": 2.170946036364601e-05, "loss": 0.2617, "step": 1629 }, { "epoch": 2.625855819573097, "grad_norm": 0.2773783867044038, "learning_rate": 2.1687021320860893e-05, "loss": 0.2679, "step": 1630 }, { "epoch": 2.627466774063633, "grad_norm": 0.25665405302808597, "learning_rate": 2.166458013907047e-05, "loss": 0.295, "step": 1631 }, { "epoch": 2.629077728554168, "grad_norm": 0.2625122154554604, "learning_rate": 2.1642136846728313e-05, "loss": 0.2548, "step": 1632 }, { "epoch": 2.630688683044704, "grad_norm": 0.24524445937268322, "learning_rate": 2.1619691472290692e-05, "loss": 0.278, "step": 1633 }, { "epoch": 2.6322996375352394, "grad_norm": 0.24739471796623, "learning_rate": 2.159724404421649e-05, "loss": 0.2447, "step": 1634 }, { "epoch": 2.6339105920257753, "grad_norm": 0.24322288209895856, "learning_rate": 2.157479459096724e-05, "loss": 0.2823, "step": 1635 }, { "epoch": 2.635521546516311, "grad_norm": 0.22906031471382138, "learning_rate": 2.1552343141007e-05, "loss": 0.2674, "step": 1636 }, { "epoch": 2.6371325010068465, "grad_norm": 0.26408427275553314, "learning_rate": 2.1529889722802384e-05, "loss": 0.3259, "step": 1637 }, { "epoch": 2.6387434554973823, "grad_norm": 0.20577040124306828, "learning_rate": 2.1507434364822487e-05, "loss": 0.2442, "step": 1638 }, { "epoch": 2.6403544099879177, "grad_norm": 0.25336351837263493, "learning_rate": 2.148497709553887e-05, "loss": 0.2966, "step": 1639 }, { "epoch": 2.6419653644784535, "grad_norm": 0.2179088401032362, "learning_rate": 2.1462517943425523e-05, "loss": 0.2399, "step": 1640 }, { "epoch": 2.6435763189689894, "grad_norm": 0.2609474199496771, "learning_rate": 2.1440056936958815e-05, "loss": 0.2897, "step": 1641 }, { "epoch": 2.6451872734595248, "grad_norm": 0.250702552124162, "learning_rate": 2.141759410461746e-05, "loss": 0.2797, "step": 1642 }, { "epoch": 2.64679822795006, "grad_norm": 0.25202435733045636, "learning_rate": 2.1395129474882507e-05, "loss": 0.258, "step": 1643 }, { "epoch": 2.648409182440596, "grad_norm": 0.23559656524296946, "learning_rate": 2.1372663076237273e-05, "loss": 0.3022, "step": 1644 }, { "epoch": 2.650020136931132, "grad_norm": 0.2600623544743553, "learning_rate": 2.1350194937167307e-05, "loss": 0.284, "step": 1645 }, { "epoch": 2.651631091421667, "grad_norm": 0.23042665562467768, "learning_rate": 2.1327725086160385e-05, "loss": 0.2659, "step": 1646 }, { "epoch": 2.653242045912203, "grad_norm": 0.2319917641049967, "learning_rate": 2.1305253551706442e-05, "loss": 0.2755, "step": 1647 }, { "epoch": 2.6548530004027384, "grad_norm": 0.24708094663257485, "learning_rate": 2.1282780362297544e-05, "loss": 0.2895, "step": 1648 }, { "epoch": 2.6564639548932742, "grad_norm": 0.2463749021745463, "learning_rate": 2.1260305546427867e-05, "loss": 0.288, "step": 1649 }, { "epoch": 2.65807490938381, "grad_norm": 0.2540138947859815, "learning_rate": 2.123782913259364e-05, "loss": 0.2826, "step": 1650 }, { "epoch": 2.6596858638743455, "grad_norm": 0.23487737142224063, "learning_rate": 2.121535114929312e-05, "loss": 0.272, "step": 1651 }, { "epoch": 2.6612968183648813, "grad_norm": 0.2282373426932882, "learning_rate": 2.1192871625026553e-05, "loss": 0.2343, "step": 1652 }, { "epoch": 2.6629077728554167, "grad_norm": 0.2591476141147764, "learning_rate": 2.1170390588296148e-05, "loss": 0.3235, "step": 1653 }, { "epoch": 2.6645187273459525, "grad_norm": 0.2636302573560719, "learning_rate": 2.1147908067606012e-05, "loss": 0.2785, "step": 1654 }, { "epoch": 2.6661296818364884, "grad_norm": 0.23618006055822752, "learning_rate": 2.112542409146217e-05, "loss": 0.2655, "step": 1655 }, { "epoch": 2.6677406363270237, "grad_norm": 0.25207523326650344, "learning_rate": 2.1102938688372436e-05, "loss": 0.2836, "step": 1656 }, { "epoch": 2.669351590817559, "grad_norm": 0.25173559248973704, "learning_rate": 2.1080451886846486e-05, "loss": 0.261, "step": 1657 }, { "epoch": 2.670962545308095, "grad_norm": 0.2431590866242753, "learning_rate": 2.1057963715395746e-05, "loss": 0.3086, "step": 1658 }, { "epoch": 2.672573499798631, "grad_norm": 0.25202253459066404, "learning_rate": 2.1035474202533385e-05, "loss": 0.2567, "step": 1659 }, { "epoch": 2.674184454289166, "grad_norm": 0.2555417920210164, "learning_rate": 2.1012983376774255e-05, "loss": 0.2739, "step": 1660 }, { "epoch": 2.675795408779702, "grad_norm": 0.2420625615147683, "learning_rate": 2.0990491266634903e-05, "loss": 0.2653, "step": 1661 }, { "epoch": 2.6774063632702374, "grad_norm": 0.2396489356305853, "learning_rate": 2.0967997900633482e-05, "loss": 0.3129, "step": 1662 }, { "epoch": 2.6790173177607732, "grad_norm": 0.2131386533167181, "learning_rate": 2.094550330728974e-05, "loss": 0.2642, "step": 1663 }, { "epoch": 2.680628272251309, "grad_norm": 0.21745561076267786, "learning_rate": 2.092300751512499e-05, "loss": 0.2642, "step": 1664 }, { "epoch": 2.6822392267418445, "grad_norm": 0.2395086233145289, "learning_rate": 2.0900510552662057e-05, "loss": 0.2924, "step": 1665 }, { "epoch": 2.6838501812323803, "grad_norm": 0.2353999626748506, "learning_rate": 2.0878012448425258e-05, "loss": 0.2591, "step": 1666 }, { "epoch": 2.6854611357229157, "grad_norm": 0.24095219112832106, "learning_rate": 2.085551323094035e-05, "loss": 0.2962, "step": 1667 }, { "epoch": 2.6870720902134515, "grad_norm": 0.2553288916093136, "learning_rate": 2.08330129287345e-05, "loss": 0.3125, "step": 1668 }, { "epoch": 2.6886830447039873, "grad_norm": 0.24345181609728936, "learning_rate": 2.0810511570336262e-05, "loss": 0.2681, "step": 1669 }, { "epoch": 2.6902939991945227, "grad_norm": 0.256453338841592, "learning_rate": 2.0788009184275514e-05, "loss": 0.2694, "step": 1670 }, { "epoch": 2.6919049536850586, "grad_norm": 0.24077026988656322, "learning_rate": 2.0765505799083452e-05, "loss": 0.2741, "step": 1671 }, { "epoch": 2.693515908175594, "grad_norm": 0.2534029520291385, "learning_rate": 2.074300144329252e-05, "loss": 0.3124, "step": 1672 }, { "epoch": 2.6951268626661298, "grad_norm": 0.24039030268215603, "learning_rate": 2.0720496145436423e-05, "loss": 0.2761, "step": 1673 }, { "epoch": 2.696737817156665, "grad_norm": 0.2441294858776203, "learning_rate": 2.0697989934050025e-05, "loss": 0.2702, "step": 1674 }, { "epoch": 2.698348771647201, "grad_norm": 0.23858821941312688, "learning_rate": 2.0675482837669367e-05, "loss": 0.2585, "step": 1675 }, { "epoch": 2.6999597261377364, "grad_norm": 0.22912907435783178, "learning_rate": 2.0652974884831612e-05, "loss": 0.2735, "step": 1676 }, { "epoch": 2.701570680628272, "grad_norm": 0.25427589376548887, "learning_rate": 2.063046610407501e-05, "loss": 0.3006, "step": 1677 }, { "epoch": 2.703181635118808, "grad_norm": 0.25451643357504333, "learning_rate": 2.060795652393886e-05, "loss": 0.2771, "step": 1678 }, { "epoch": 2.7047925896093434, "grad_norm": 0.2470762873381859, "learning_rate": 2.0585446172963457e-05, "loss": 0.2967, "step": 1679 }, { "epoch": 2.7064035440998793, "grad_norm": 0.2405780999475265, "learning_rate": 2.05629350796901e-05, "loss": 0.2728, "step": 1680 }, { "epoch": 2.7080144985904147, "grad_norm": 0.2680658731906045, "learning_rate": 2.0540423272661024e-05, "loss": 0.2846, "step": 1681 }, { "epoch": 2.7096254530809505, "grad_norm": 0.22930174904587397, "learning_rate": 2.0517910780419355e-05, "loss": 0.279, "step": 1682 }, { "epoch": 2.7112364075714863, "grad_norm": 0.2513745451496423, "learning_rate": 2.0495397631509092e-05, "loss": 0.2498, "step": 1683 }, { "epoch": 2.7128473620620217, "grad_norm": 0.2607357243459458, "learning_rate": 2.047288385447507e-05, "loss": 0.29, "step": 1684 }, { "epoch": 2.7144583165525575, "grad_norm": 0.2680985429045293, "learning_rate": 2.0450369477862922e-05, "loss": 0.2788, "step": 1685 }, { "epoch": 2.716069271043093, "grad_norm": 0.28257542550577647, "learning_rate": 2.042785453021905e-05, "loss": 0.2705, "step": 1686 }, { "epoch": 2.7176802255336288, "grad_norm": 0.24906888756996964, "learning_rate": 2.0405339040090557e-05, "loss": 0.2845, "step": 1687 }, { "epoch": 2.7192911800241646, "grad_norm": 0.23828748868464236, "learning_rate": 2.0382823036025243e-05, "loss": 0.2871, "step": 1688 }, { "epoch": 2.7209021345147, "grad_norm": 0.2522899452359346, "learning_rate": 2.0360306546571582e-05, "loss": 0.2743, "step": 1689 }, { "epoch": 2.7225130890052354, "grad_norm": 0.24592334075728844, "learning_rate": 2.0337789600278623e-05, "loss": 0.27, "step": 1690 }, { "epoch": 2.724124043495771, "grad_norm": 0.2554007578885009, "learning_rate": 2.0315272225696034e-05, "loss": 0.3087, "step": 1691 }, { "epoch": 2.725734997986307, "grad_norm": 0.23843192140396555, "learning_rate": 2.0292754451373992e-05, "loss": 0.2685, "step": 1692 }, { "epoch": 2.7273459524768424, "grad_norm": 0.22520952757862236, "learning_rate": 2.027023630586321e-05, "loss": 0.261, "step": 1693 }, { "epoch": 2.7289569069673782, "grad_norm": 0.26311834190007133, "learning_rate": 2.024771781771485e-05, "loss": 0.2881, "step": 1694 }, { "epoch": 2.7305678614579136, "grad_norm": 0.20959220644219456, "learning_rate": 2.0225199015480518e-05, "loss": 0.2437, "step": 1695 }, { "epoch": 2.7321788159484495, "grad_norm": 0.26113815468998086, "learning_rate": 2.0202679927712224e-05, "loss": 0.2894, "step": 1696 }, { "epoch": 2.7337897704389853, "grad_norm": 0.2469561663058487, "learning_rate": 2.018016058296232e-05, "loss": 0.2702, "step": 1697 }, { "epoch": 2.7354007249295207, "grad_norm": 0.2641432503130729, "learning_rate": 2.0157641009783512e-05, "loss": 0.297, "step": 1698 }, { "epoch": 2.7370116794200565, "grad_norm": 0.23840297914176908, "learning_rate": 2.0135121236728762e-05, "loss": 0.2674, "step": 1699 }, { "epoch": 2.738622633910592, "grad_norm": 0.2764405943973875, "learning_rate": 2.0112601292351322e-05, "loss": 0.2947, "step": 1700 }, { "epoch": 2.7402335884011277, "grad_norm": 0.23575772294949454, "learning_rate": 2.009008120520463e-05, "loss": 0.2672, "step": 1701 }, { "epoch": 2.7418445428916636, "grad_norm": 0.2521615855835443, "learning_rate": 2.006756100384233e-05, "loss": 0.2814, "step": 1702 }, { "epoch": 2.743455497382199, "grad_norm": 0.2538001655861593, "learning_rate": 2.0045040716818184e-05, "loss": 0.2802, "step": 1703 }, { "epoch": 2.7450664518727343, "grad_norm": 0.2281414090679116, "learning_rate": 2.0022520372686092e-05, "loss": 0.2723, "step": 1704 }, { "epoch": 2.74667740636327, "grad_norm": 0.27249255745775103, "learning_rate": 2e-05, "loss": 0.2783, "step": 1705 }, { "epoch": 2.748288360853806, "grad_norm": 0.21913760224877482, "learning_rate": 1.9977479627313918e-05, "loss": 0.2666, "step": 1706 }, { "epoch": 2.7498993153443414, "grad_norm": 0.271276451382289, "learning_rate": 1.995495928318182e-05, "loss": 0.2592, "step": 1707 }, { "epoch": 2.7515102698348772, "grad_norm": 0.25181265659436014, "learning_rate": 1.9932438996157678e-05, "loss": 0.2672, "step": 1708 }, { "epoch": 2.7531212243254126, "grad_norm": 0.23000287341824674, "learning_rate": 1.9909918794795378e-05, "loss": 0.2883, "step": 1709 }, { "epoch": 2.7547321788159485, "grad_norm": 0.2509709521664799, "learning_rate": 1.988739870764869e-05, "loss": 0.2955, "step": 1710 }, { "epoch": 2.7563431333064843, "grad_norm": 0.255309204845929, "learning_rate": 1.986487876327124e-05, "loss": 0.2939, "step": 1711 }, { "epoch": 2.7579540877970197, "grad_norm": 0.2389408162743876, "learning_rate": 1.9842358990216498e-05, "loss": 0.276, "step": 1712 }, { "epoch": 2.7595650422875555, "grad_norm": 0.22548655871923712, "learning_rate": 1.9819839417037688e-05, "loss": 0.2551, "step": 1713 }, { "epoch": 2.761175996778091, "grad_norm": 0.2514072456845176, "learning_rate": 1.9797320072287786e-05, "loss": 0.2797, "step": 1714 }, { "epoch": 2.7627869512686267, "grad_norm": 0.2389348677377125, "learning_rate": 1.9774800984519485e-05, "loss": 0.2538, "step": 1715 }, { "epoch": 2.7643979057591626, "grad_norm": 0.24399474291305062, "learning_rate": 1.9752282182285158e-05, "loss": 0.2698, "step": 1716 }, { "epoch": 2.766008860249698, "grad_norm": 0.24492996641517525, "learning_rate": 1.9729763694136796e-05, "loss": 0.2571, "step": 1717 }, { "epoch": 2.7676198147402333, "grad_norm": 0.25512566620189414, "learning_rate": 1.9707245548626008e-05, "loss": 0.2931, "step": 1718 }, { "epoch": 2.769230769230769, "grad_norm": 0.24336440793633282, "learning_rate": 1.968472777430397e-05, "loss": 0.2718, "step": 1719 }, { "epoch": 2.770841723721305, "grad_norm": 0.2365556332430933, "learning_rate": 1.966221039972138e-05, "loss": 0.2833, "step": 1720 }, { "epoch": 2.7724526782118404, "grad_norm": 0.2489637128328502, "learning_rate": 1.9639693453428428e-05, "loss": 0.2614, "step": 1721 }, { "epoch": 2.774063632702376, "grad_norm": 0.23427312408705342, "learning_rate": 1.9617176963974757e-05, "loss": 0.237, "step": 1722 }, { "epoch": 2.7756745871929116, "grad_norm": 0.30355928331474424, "learning_rate": 1.959466095990945e-05, "loss": 0.3156, "step": 1723 }, { "epoch": 2.7772855416834474, "grad_norm": 0.23089885134745378, "learning_rate": 1.9572145469780957e-05, "loss": 0.262, "step": 1724 }, { "epoch": 2.7788964961739833, "grad_norm": 0.2650910801407474, "learning_rate": 1.9549630522137084e-05, "loss": 0.2691, "step": 1725 }, { "epoch": 2.7805074506645187, "grad_norm": 0.27538303821293014, "learning_rate": 1.9527116145524934e-05, "loss": 0.2905, "step": 1726 }, { "epoch": 2.7821184051550545, "grad_norm": 0.2529807875761806, "learning_rate": 1.9504602368490918e-05, "loss": 0.2971, "step": 1727 }, { "epoch": 2.78372935964559, "grad_norm": 0.2799965953217938, "learning_rate": 1.9482089219580655e-05, "loss": 0.2752, "step": 1728 }, { "epoch": 2.7853403141361257, "grad_norm": 0.25007926005011655, "learning_rate": 1.9459576727338986e-05, "loss": 0.2604, "step": 1729 }, { "epoch": 2.7869512686266615, "grad_norm": 0.254990987065823, "learning_rate": 1.9437064920309895e-05, "loss": 0.2971, "step": 1730 }, { "epoch": 2.788562223117197, "grad_norm": 0.26362113970903406, "learning_rate": 1.941455382703655e-05, "loss": 0.2538, "step": 1731 }, { "epoch": 2.7901731776077323, "grad_norm": 0.246885971572867, "learning_rate": 1.939204347606115e-05, "loss": 0.2721, "step": 1732 }, { "epoch": 2.791784132098268, "grad_norm": 0.266283943370594, "learning_rate": 1.9369533895924992e-05, "loss": 0.2866, "step": 1733 }, { "epoch": 2.793395086588804, "grad_norm": 0.2680098900263936, "learning_rate": 1.934702511516839e-05, "loss": 0.2967, "step": 1734 }, { "epoch": 2.7950060410793394, "grad_norm": 0.22740978433404274, "learning_rate": 1.932451716233064e-05, "loss": 0.2694, "step": 1735 }, { "epoch": 2.796616995569875, "grad_norm": 0.2561037233597848, "learning_rate": 1.930201006594999e-05, "loss": 0.2679, "step": 1736 }, { "epoch": 2.7982279500604106, "grad_norm": 0.23434576263054702, "learning_rate": 1.9279503854563584e-05, "loss": 0.2799, "step": 1737 }, { "epoch": 2.7998389045509464, "grad_norm": 0.22674515230572093, "learning_rate": 1.925699855670748e-05, "loss": 0.2728, "step": 1738 }, { "epoch": 2.8014498590414822, "grad_norm": 0.2428990371238118, "learning_rate": 1.9234494200916554e-05, "loss": 0.3036, "step": 1739 }, { "epoch": 2.8030608135320176, "grad_norm": 0.21954338652190178, "learning_rate": 1.9211990815724496e-05, "loss": 0.2474, "step": 1740 }, { "epoch": 2.8046717680225535, "grad_norm": 0.25787658937930996, "learning_rate": 1.918948842966374e-05, "loss": 0.2964, "step": 1741 }, { "epoch": 2.806282722513089, "grad_norm": 0.225083476296253, "learning_rate": 1.9166987071265506e-05, "loss": 0.2625, "step": 1742 }, { "epoch": 2.8078936770036247, "grad_norm": 0.26059530040772755, "learning_rate": 1.914448676905966e-05, "loss": 0.2689, "step": 1743 }, { "epoch": 2.8095046314941605, "grad_norm": 0.2149852949576493, "learning_rate": 1.9121987551574745e-05, "loss": 0.2373, "step": 1744 }, { "epoch": 2.811115585984696, "grad_norm": 0.26351487458727435, "learning_rate": 1.9099489447337946e-05, "loss": 0.2786, "step": 1745 }, { "epoch": 2.8127265404752317, "grad_norm": 0.26753566423015324, "learning_rate": 1.9076992484875014e-05, "loss": 0.3151, "step": 1746 }, { "epoch": 2.814337494965767, "grad_norm": 0.23423084235876232, "learning_rate": 1.905449669271027e-05, "loss": 0.276, "step": 1747 }, { "epoch": 2.815948449456303, "grad_norm": 0.239399463127817, "learning_rate": 1.9032002099366528e-05, "loss": 0.2747, "step": 1748 }, { "epoch": 2.8175594039468383, "grad_norm": 0.24223640625161968, "learning_rate": 1.9009508733365103e-05, "loss": 0.2576, "step": 1749 }, { "epoch": 2.819170358437374, "grad_norm": 0.2741888988118937, "learning_rate": 1.8987016623225748e-05, "loss": 0.2846, "step": 1750 }, { "epoch": 2.8207813129279096, "grad_norm": 0.26900885193390967, "learning_rate": 1.896452579746663e-05, "loss": 0.2628, "step": 1751 }, { "epoch": 2.8223922674184454, "grad_norm": 0.25791432339739123, "learning_rate": 1.8942036284604254e-05, "loss": 0.2684, "step": 1752 }, { "epoch": 2.8240032219089812, "grad_norm": 0.24175592345180597, "learning_rate": 1.8919548113153517e-05, "loss": 0.299, "step": 1753 }, { "epoch": 2.8256141763995166, "grad_norm": 0.24350609167590054, "learning_rate": 1.889706131162757e-05, "loss": 0.2491, "step": 1754 }, { "epoch": 2.8272251308900525, "grad_norm": 0.24783023887132075, "learning_rate": 1.887457590853784e-05, "loss": 0.2805, "step": 1755 }, { "epoch": 2.828836085380588, "grad_norm": 0.23044942043966332, "learning_rate": 1.8852091932393984e-05, "loss": 0.2919, "step": 1756 }, { "epoch": 2.8304470398711237, "grad_norm": 0.23547804631271316, "learning_rate": 1.8829609411703855e-05, "loss": 0.2729, "step": 1757 }, { "epoch": 2.8320579943616595, "grad_norm": 0.2241584722151506, "learning_rate": 1.8807128374973454e-05, "loss": 0.274, "step": 1758 }, { "epoch": 2.833668948852195, "grad_norm": 0.24199888275581322, "learning_rate": 1.8784648850706883e-05, "loss": 0.2853, "step": 1759 }, { "epoch": 2.8352799033427307, "grad_norm": 0.2298411732619803, "learning_rate": 1.8762170867406366e-05, "loss": 0.2626, "step": 1760 }, { "epoch": 2.836890857833266, "grad_norm": 0.21489914910143904, "learning_rate": 1.873969445357214e-05, "loss": 0.2683, "step": 1761 }, { "epoch": 2.838501812323802, "grad_norm": 0.2618260680636698, "learning_rate": 1.871721963770246e-05, "loss": 0.2862, "step": 1762 }, { "epoch": 2.8401127668143373, "grad_norm": 0.2845350731450781, "learning_rate": 1.869474644829356e-05, "loss": 0.2909, "step": 1763 }, { "epoch": 2.841723721304873, "grad_norm": 0.21927422061321766, "learning_rate": 1.867227491383962e-05, "loss": 0.265, "step": 1764 }, { "epoch": 2.8433346757954086, "grad_norm": 0.2603099243315801, "learning_rate": 1.8649805062832697e-05, "loss": 0.3039, "step": 1765 }, { "epoch": 2.8449456302859444, "grad_norm": 0.23315130279336155, "learning_rate": 1.8627336923762737e-05, "loss": 0.2669, "step": 1766 }, { "epoch": 2.84655658477648, "grad_norm": 0.2386078527910061, "learning_rate": 1.8604870525117496e-05, "loss": 0.2713, "step": 1767 }, { "epoch": 2.8481675392670156, "grad_norm": 0.25248905060755344, "learning_rate": 1.8582405895382544e-05, "loss": 0.2866, "step": 1768 }, { "epoch": 2.8497784937575514, "grad_norm": 0.2272473915570634, "learning_rate": 1.8559943063041195e-05, "loss": 0.2566, "step": 1769 }, { "epoch": 2.851389448248087, "grad_norm": 0.2377600612126955, "learning_rate": 1.853748205657448e-05, "loss": 0.2657, "step": 1770 }, { "epoch": 2.8530004027386227, "grad_norm": 0.25639490523892583, "learning_rate": 1.8515022904461136e-05, "loss": 0.2707, "step": 1771 }, { "epoch": 2.8546113572291585, "grad_norm": 0.22335477679276985, "learning_rate": 1.849256563517752e-05, "loss": 0.2578, "step": 1772 }, { "epoch": 2.856222311719694, "grad_norm": 0.26678857888253893, "learning_rate": 1.8470110277197622e-05, "loss": 0.2823, "step": 1773 }, { "epoch": 2.8578332662102297, "grad_norm": 0.2658334462945636, "learning_rate": 1.8447656858993e-05, "loss": 0.2657, "step": 1774 }, { "epoch": 2.859444220700765, "grad_norm": 0.23914934535135607, "learning_rate": 1.8425205409032767e-05, "loss": 0.2807, "step": 1775 }, { "epoch": 2.861055175191301, "grad_norm": 0.23755248829706438, "learning_rate": 1.8402755955783514e-05, "loss": 0.2806, "step": 1776 }, { "epoch": 2.8626661296818368, "grad_norm": 0.23622676175290067, "learning_rate": 1.838030852770932e-05, "loss": 0.2822, "step": 1777 }, { "epoch": 2.864277084172372, "grad_norm": 0.2774308646019204, "learning_rate": 1.835786315327169e-05, "loss": 0.2953, "step": 1778 }, { "epoch": 2.8658880386629075, "grad_norm": 0.23104176491388728, "learning_rate": 1.8335419860929532e-05, "loss": 0.252, "step": 1779 }, { "epoch": 2.8674989931534434, "grad_norm": 0.2640215420459852, "learning_rate": 1.831297867913911e-05, "loss": 0.3052, "step": 1780 }, { "epoch": 2.869109947643979, "grad_norm": 0.2508476637940116, "learning_rate": 1.8290539636354e-05, "loss": 0.2686, "step": 1781 }, { "epoch": 2.8707209021345146, "grad_norm": 0.259821408101572, "learning_rate": 1.8268102761025077e-05, "loss": 0.2747, "step": 1782 }, { "epoch": 2.8723318566250504, "grad_norm": 0.24412116210280163, "learning_rate": 1.8245668081600477e-05, "loss": 0.2755, "step": 1783 }, { "epoch": 2.873942811115586, "grad_norm": 0.24896780062311544, "learning_rate": 1.8223235626525542e-05, "loss": 0.2714, "step": 1784 }, { "epoch": 2.8755537656061216, "grad_norm": 0.24529433746790313, "learning_rate": 1.820080542424278e-05, "loss": 0.2637, "step": 1785 }, { "epoch": 2.8771647200966575, "grad_norm": 0.272289653576584, "learning_rate": 1.8178377503191875e-05, "loss": 0.2916, "step": 1786 }, { "epoch": 2.878775674587193, "grad_norm": 0.24673919563032362, "learning_rate": 1.8155951891809592e-05, "loss": 0.2817, "step": 1787 }, { "epoch": 2.8803866290777287, "grad_norm": 0.24031746045696809, "learning_rate": 1.813352861852977e-05, "loss": 0.2788, "step": 1788 }, { "epoch": 2.881997583568264, "grad_norm": 0.25670613044192553, "learning_rate": 1.8111107711783293e-05, "loss": 0.2658, "step": 1789 }, { "epoch": 2.8836085380588, "grad_norm": 0.24396556057950303, "learning_rate": 1.808868919999804e-05, "loss": 0.256, "step": 1790 }, { "epoch": 2.8852194925493357, "grad_norm": 0.24389592967270443, "learning_rate": 1.806627311159886e-05, "loss": 0.278, "step": 1791 }, { "epoch": 2.886830447039871, "grad_norm": 0.25159917649723634, "learning_rate": 1.8043859475007515e-05, "loss": 0.2755, "step": 1792 }, { "epoch": 2.8884414015304065, "grad_norm": 0.21554818631946265, "learning_rate": 1.8021448318642666e-05, "loss": 0.2546, "step": 1793 }, { "epoch": 2.8900523560209423, "grad_norm": 0.22780147011061957, "learning_rate": 1.7999039670919837e-05, "loss": 0.2675, "step": 1794 }, { "epoch": 2.891663310511478, "grad_norm": 0.2556179680309161, "learning_rate": 1.797663356025136e-05, "loss": 0.268, "step": 1795 }, { "epoch": 2.8932742650020136, "grad_norm": 0.22179477121268426, "learning_rate": 1.795423001504635e-05, "loss": 0.2625, "step": 1796 }, { "epoch": 2.8948852194925494, "grad_norm": 0.2566657026812676, "learning_rate": 1.793182906371068e-05, "loss": 0.3041, "step": 1797 }, { "epoch": 2.896496173983085, "grad_norm": 0.22649500722803623, "learning_rate": 1.7909430734646936e-05, "loss": 0.2626, "step": 1798 }, { "epoch": 2.8981071284736206, "grad_norm": 0.2382002490806491, "learning_rate": 1.788703505625435e-05, "loss": 0.2697, "step": 1799 }, { "epoch": 2.8997180829641565, "grad_norm": 0.23333967874773118, "learning_rate": 1.7864642056928823e-05, "loss": 0.282, "step": 1800 }, { "epoch": 2.901329037454692, "grad_norm": 0.22810377571609813, "learning_rate": 1.7842251765062858e-05, "loss": 0.2703, "step": 1801 }, { "epoch": 2.9029399919452277, "grad_norm": 0.2475532042645999, "learning_rate": 1.7819864209045512e-05, "loss": 0.2893, "step": 1802 }, { "epoch": 2.904550946435763, "grad_norm": 0.23819963629896387, "learning_rate": 1.7797479417262375e-05, "loss": 0.2693, "step": 1803 }, { "epoch": 2.906161900926299, "grad_norm": 0.2498421038014751, "learning_rate": 1.777509741809553e-05, "loss": 0.2685, "step": 1804 }, { "epoch": 2.9077728554168347, "grad_norm": 0.22386243561742944, "learning_rate": 1.775271823992354e-05, "loss": 0.2425, "step": 1805 }, { "epoch": 2.90938380990737, "grad_norm": 0.2524226759643067, "learning_rate": 1.773034191112137e-05, "loss": 0.3005, "step": 1806 }, { "epoch": 2.9109947643979055, "grad_norm": 0.23459453267664337, "learning_rate": 1.7707968460060375e-05, "loss": 0.2858, "step": 1807 }, { "epoch": 2.9126057188884413, "grad_norm": 0.24955656470007692, "learning_rate": 1.7685597915108257e-05, "loss": 0.2793, "step": 1808 }, { "epoch": 2.914216673378977, "grad_norm": 0.22015213316381949, "learning_rate": 1.7663230304629066e-05, "loss": 0.2702, "step": 1809 }, { "epoch": 2.9158276278695126, "grad_norm": 0.2655548953653868, "learning_rate": 1.7640865656983084e-05, "loss": 0.3032, "step": 1810 }, { "epoch": 2.9174385823600484, "grad_norm": 0.2323494023577661, "learning_rate": 1.7618504000526863e-05, "loss": 0.2761, "step": 1811 }, { "epoch": 2.9190495368505838, "grad_norm": 0.2109588847916537, "learning_rate": 1.759614536361316e-05, "loss": 0.2614, "step": 1812 }, { "epoch": 2.9206604913411196, "grad_norm": 0.25601356381819473, "learning_rate": 1.7573789774590903e-05, "loss": 0.3105, "step": 1813 }, { "epoch": 2.9222714458316554, "grad_norm": 0.22547870004031134, "learning_rate": 1.755143726180516e-05, "loss": 0.2899, "step": 1814 }, { "epoch": 2.923882400322191, "grad_norm": 0.2509635812735881, "learning_rate": 1.7529087853597072e-05, "loss": 0.2821, "step": 1815 }, { "epoch": 2.9254933548127267, "grad_norm": 0.23583864792349135, "learning_rate": 1.7506741578303883e-05, "loss": 0.2623, "step": 1816 }, { "epoch": 2.927104309303262, "grad_norm": 0.24318522703606663, "learning_rate": 1.748439846425884e-05, "loss": 0.2868, "step": 1817 }, { "epoch": 2.928715263793798, "grad_norm": 0.22768641363630263, "learning_rate": 1.7462058539791192e-05, "loss": 0.2942, "step": 1818 }, { "epoch": 2.9303262182843337, "grad_norm": 0.22037526237850094, "learning_rate": 1.743972183322612e-05, "loss": 0.2651, "step": 1819 }, { "epoch": 2.931937172774869, "grad_norm": 0.2261074846645067, "learning_rate": 1.7417388372884775e-05, "loss": 0.2863, "step": 1820 }, { "epoch": 2.9335481272654045, "grad_norm": 0.22274024245003757, "learning_rate": 1.739505818708414e-05, "loss": 0.2594, "step": 1821 }, { "epoch": 2.9351590817559403, "grad_norm": 0.24040076004716376, "learning_rate": 1.7372731304137072e-05, "loss": 0.2691, "step": 1822 }, { "epoch": 2.936770036246476, "grad_norm": 0.2506765713731466, "learning_rate": 1.735040775235224e-05, "loss": 0.2998, "step": 1823 }, { "epoch": 2.9383809907370115, "grad_norm": 0.22705774588973596, "learning_rate": 1.732808756003408e-05, "loss": 0.2528, "step": 1824 }, { "epoch": 2.9399919452275474, "grad_norm": 0.22020216475260657, "learning_rate": 1.7305770755482788e-05, "loss": 0.2594, "step": 1825 }, { "epoch": 2.9416028997180828, "grad_norm": 0.2525904487369489, "learning_rate": 1.7283457366994226e-05, "loss": 0.2634, "step": 1826 }, { "epoch": 2.9432138542086186, "grad_norm": 0.24948049883181214, "learning_rate": 1.7261147422859967e-05, "loss": 0.2824, "step": 1827 }, { "epoch": 2.9448248086991544, "grad_norm": 0.22594459892161928, "learning_rate": 1.7238840951367194e-05, "loss": 0.2685, "step": 1828 }, { "epoch": 2.94643576318969, "grad_norm": 0.23969345388513782, "learning_rate": 1.72165379807987e-05, "loss": 0.2776, "step": 1829 }, { "epoch": 2.9480467176802256, "grad_norm": 0.23702835747098586, "learning_rate": 1.7194238539432807e-05, "loss": 0.286, "step": 1830 }, { "epoch": 2.949657672170761, "grad_norm": 0.2308017426275451, "learning_rate": 1.7171942655543415e-05, "loss": 0.2553, "step": 1831 }, { "epoch": 2.951268626661297, "grad_norm": 0.25865741662991343, "learning_rate": 1.714965035739987e-05, "loss": 0.295, "step": 1832 }, { "epoch": 2.9528795811518327, "grad_norm": 0.22524224832126727, "learning_rate": 1.7127361673266982e-05, "loss": 0.2379, "step": 1833 }, { "epoch": 2.954490535642368, "grad_norm": 0.2874573108392332, "learning_rate": 1.7105076631404994e-05, "loss": 0.3053, "step": 1834 }, { "epoch": 2.956101490132904, "grad_norm": 0.2334494538238959, "learning_rate": 1.7082795260069515e-05, "loss": 0.2591, "step": 1835 }, { "epoch": 2.9577124446234393, "grad_norm": 0.25784150732290884, "learning_rate": 1.7060517587511512e-05, "loss": 0.2823, "step": 1836 }, { "epoch": 2.959323399113975, "grad_norm": 0.23058702287993874, "learning_rate": 1.7038243641977238e-05, "loss": 0.2464, "step": 1837 }, { "epoch": 2.9609343536045105, "grad_norm": 0.22596215888674936, "learning_rate": 1.701597345170825e-05, "loss": 0.286, "step": 1838 }, { "epoch": 2.9625453080950463, "grad_norm": 0.23090028001401205, "learning_rate": 1.6993707044941334e-05, "loss": 0.2828, "step": 1839 }, { "epoch": 2.9641562625855817, "grad_norm": 0.22216529160650983, "learning_rate": 1.6971444449908474e-05, "loss": 0.2655, "step": 1840 }, { "epoch": 2.9657672170761176, "grad_norm": 0.23782152766999168, "learning_rate": 1.6949185694836806e-05, "loss": 0.3075, "step": 1841 }, { "epoch": 2.9673781715666534, "grad_norm": 0.23394803632471256, "learning_rate": 1.6926930807948646e-05, "loss": 0.2607, "step": 1842 }, { "epoch": 2.968989126057189, "grad_norm": 0.22966128510957112, "learning_rate": 1.6904679817461347e-05, "loss": 0.2653, "step": 1843 }, { "epoch": 2.9706000805477246, "grad_norm": 0.23928795508961923, "learning_rate": 1.688243275158736e-05, "loss": 0.2515, "step": 1844 }, { "epoch": 2.97221103503826, "grad_norm": 0.2426394963369243, "learning_rate": 1.6860189638534142e-05, "loss": 0.269, "step": 1845 }, { "epoch": 2.973821989528796, "grad_norm": 0.2321987864421601, "learning_rate": 1.6837950506504158e-05, "loss": 0.2831, "step": 1846 }, { "epoch": 2.9754329440193317, "grad_norm": 0.25421214491796396, "learning_rate": 1.6815715383694805e-05, "loss": 0.284, "step": 1847 }, { "epoch": 2.977043898509867, "grad_norm": 0.2443429609504397, "learning_rate": 1.6793484298298387e-05, "loss": 0.284, "step": 1848 }, { "epoch": 2.978654853000403, "grad_norm": 0.21013676562255554, "learning_rate": 1.6771257278502135e-05, "loss": 0.2368, "step": 1849 }, { "epoch": 2.9802658074909383, "grad_norm": 0.25821210868342026, "learning_rate": 1.6749034352488077e-05, "loss": 0.2834, "step": 1850 }, { "epoch": 2.981876761981474, "grad_norm": 0.23933753025125024, "learning_rate": 1.6726815548433072e-05, "loss": 0.2712, "step": 1851 }, { "epoch": 2.98348771647201, "grad_norm": 0.23023492804389561, "learning_rate": 1.6704600894508743e-05, "loss": 0.2598, "step": 1852 }, { "epoch": 2.9850986709625453, "grad_norm": 0.270960478809936, "learning_rate": 1.668239041888148e-05, "loss": 0.3171, "step": 1853 }, { "epoch": 2.9867096254530807, "grad_norm": 0.2165989069224987, "learning_rate": 1.666018414971233e-05, "loss": 0.2687, "step": 1854 }, { "epoch": 2.9883205799436166, "grad_norm": 0.23657876878647432, "learning_rate": 1.663798211515704e-05, "loss": 0.2732, "step": 1855 }, { "epoch": 2.9899315344341524, "grad_norm": 0.2514218073257401, "learning_rate": 1.661578434336597e-05, "loss": 0.2824, "step": 1856 }, { "epoch": 2.9915424889246878, "grad_norm": 0.22874574938184158, "learning_rate": 1.6593590862484095e-05, "loss": 0.2753, "step": 1857 }, { "epoch": 2.9931534434152236, "grad_norm": 0.2270308899508822, "learning_rate": 1.6571401700650934e-05, "loss": 0.2533, "step": 1858 }, { "epoch": 2.994764397905759, "grad_norm": 0.23310437321259292, "learning_rate": 1.6549216886000513e-05, "loss": 0.2709, "step": 1859 }, { "epoch": 2.996375352396295, "grad_norm": 0.26450041707096045, "learning_rate": 1.6527036446661396e-05, "loss": 0.2978, "step": 1860 }, { "epoch": 2.9979863068868307, "grad_norm": 0.2340175999215853, "learning_rate": 1.6504860410756548e-05, "loss": 0.2694, "step": 1861 }, { "epoch": 2.999597261377366, "grad_norm": 0.27305237704293234, "learning_rate": 1.6482688806403383e-05, "loss": 0.3064, "step": 1862 }, { "epoch": 3.001208215867902, "grad_norm": 0.3397063087612353, "learning_rate": 1.646052166171368e-05, "loss": 0.2409, "step": 1863 }, { "epoch": 3.0028191703584373, "grad_norm": 0.30805634582005703, "learning_rate": 1.6438359004793572e-05, "loss": 0.2068, "step": 1864 }, { "epoch": 3.004430124848973, "grad_norm": 0.45744869192317866, "learning_rate": 1.64162008637435e-05, "loss": 0.2269, "step": 1865 }, { "epoch": 3.0060410793395085, "grad_norm": 0.2781379556860123, "learning_rate": 1.639404726665817e-05, "loss": 0.1878, "step": 1866 }, { "epoch": 3.0076520338300443, "grad_norm": 0.45200098984478193, "learning_rate": 1.637189824162654e-05, "loss": 0.2083, "step": 1867 }, { "epoch": 3.00926298832058, "grad_norm": 0.2842219362339001, "learning_rate": 1.634975381673177e-05, "loss": 0.1975, "step": 1868 }, { "epoch": 3.0108739428111155, "grad_norm": 0.3164436406112575, "learning_rate": 1.632761402005118e-05, "loss": 0.1975, "step": 1869 }, { "epoch": 3.0124848973016514, "grad_norm": 0.28217933369594117, "learning_rate": 1.630547887965622e-05, "loss": 0.2004, "step": 1870 }, { "epoch": 3.0140958517921868, "grad_norm": 0.2569624519531185, "learning_rate": 1.628334842361244e-05, "loss": 0.1879, "step": 1871 }, { "epoch": 3.0157068062827226, "grad_norm": 0.27431269684331583, "learning_rate": 1.6261222679979462e-05, "loss": 0.2135, "step": 1872 }, { "epoch": 3.017317760773258, "grad_norm": 0.24794482330292733, "learning_rate": 1.6239101676810917e-05, "loss": 0.2132, "step": 1873 }, { "epoch": 3.018928715263794, "grad_norm": 0.28290856893562144, "learning_rate": 1.6216985442154427e-05, "loss": 0.2237, "step": 1874 }, { "epoch": 3.0205396697543296, "grad_norm": 0.24267628598445265, "learning_rate": 1.619487400405158e-05, "loss": 0.1798, "step": 1875 }, { "epoch": 3.022150624244865, "grad_norm": 0.27946258366684773, "learning_rate": 1.6172767390537874e-05, "loss": 0.2003, "step": 1876 }, { "epoch": 3.023761578735401, "grad_norm": 0.2635177433404853, "learning_rate": 1.6150665629642685e-05, "loss": 0.1947, "step": 1877 }, { "epoch": 3.0253725332259362, "grad_norm": 0.2592624875109387, "learning_rate": 1.6128568749389238e-05, "loss": 0.1899, "step": 1878 }, { "epoch": 3.026983487716472, "grad_norm": 0.25701500127549515, "learning_rate": 1.6106476777794587e-05, "loss": 0.2059, "step": 1879 }, { "epoch": 3.0285944422070075, "grad_norm": 0.2531036287332326, "learning_rate": 1.6084389742869543e-05, "loss": 0.2029, "step": 1880 }, { "epoch": 3.0302053966975433, "grad_norm": 0.26247322863177375, "learning_rate": 1.6062307672618654e-05, "loss": 0.2158, "step": 1881 }, { "epoch": 3.031816351188079, "grad_norm": 0.2514848759579883, "learning_rate": 1.6040230595040186e-05, "loss": 0.2065, "step": 1882 }, { "epoch": 3.0334273056786145, "grad_norm": 0.26086089307040033, "learning_rate": 1.601815853812608e-05, "loss": 0.1832, "step": 1883 }, { "epoch": 3.0350382601691503, "grad_norm": 0.2650496609324423, "learning_rate": 1.5996091529861897e-05, "loss": 0.2189, "step": 1884 }, { "epoch": 3.0366492146596857, "grad_norm": 0.24975861001178923, "learning_rate": 1.5974029598226796e-05, "loss": 0.1956, "step": 1885 }, { "epoch": 3.0382601691502216, "grad_norm": 0.26398918888430833, "learning_rate": 1.595197277119352e-05, "loss": 0.2089, "step": 1886 }, { "epoch": 3.039871123640757, "grad_norm": 0.23695742331210073, "learning_rate": 1.5929921076728316e-05, "loss": 0.1951, "step": 1887 }, { "epoch": 3.041482078131293, "grad_norm": 0.23211753996183163, "learning_rate": 1.590787454279093e-05, "loss": 0.1923, "step": 1888 }, { "epoch": 3.0430930326218286, "grad_norm": 0.24973175927852212, "learning_rate": 1.5885833197334563e-05, "loss": 0.186, "step": 1889 }, { "epoch": 3.044703987112364, "grad_norm": 0.2529151185889554, "learning_rate": 1.586379706830586e-05, "loss": 0.2087, "step": 1890 }, { "epoch": 3.0463149416029, "grad_norm": 0.2482208969171952, "learning_rate": 1.584176618364482e-05, "loss": 0.2011, "step": 1891 }, { "epoch": 3.0479258960934352, "grad_norm": 0.2550637551299936, "learning_rate": 1.5819740571284807e-05, "loss": 0.2177, "step": 1892 }, { "epoch": 3.049536850583971, "grad_norm": 0.23750126804753882, "learning_rate": 1.5797720259152496e-05, "loss": 0.174, "step": 1893 }, { "epoch": 3.0511478050745064, "grad_norm": 0.24172300528999488, "learning_rate": 1.5775705275167854e-05, "loss": 0.2076, "step": 1894 }, { "epoch": 3.0527587595650423, "grad_norm": 0.26506057370117664, "learning_rate": 1.5753695647244083e-05, "loss": 0.2068, "step": 1895 }, { "epoch": 3.054369714055578, "grad_norm": 0.22576990598377747, "learning_rate": 1.5731691403287595e-05, "loss": 0.1925, "step": 1896 }, { "epoch": 3.0559806685461135, "grad_norm": 0.2545620829654115, "learning_rate": 1.5709692571197957e-05, "loss": 0.2, "step": 1897 }, { "epoch": 3.0575916230366493, "grad_norm": 0.26218591276871567, "learning_rate": 1.568769917886792e-05, "loss": 0.1976, "step": 1898 }, { "epoch": 3.0592025775271847, "grad_norm": 0.23957115050109906, "learning_rate": 1.5665711254183293e-05, "loss": 0.1963, "step": 1899 }, { "epoch": 3.0608135320177206, "grad_norm": 0.2595037940259688, "learning_rate": 1.564372882502297e-05, "loss": 0.2051, "step": 1900 }, { "epoch": 3.062424486508256, "grad_norm": 0.2224941487256705, "learning_rate": 1.5621751919258884e-05, "loss": 0.1869, "step": 1901 }, { "epoch": 3.0640354409987918, "grad_norm": 0.25468643872804336, "learning_rate": 1.5599780564755956e-05, "loss": 0.2183, "step": 1902 }, { "epoch": 3.0656463954893276, "grad_norm": 0.25613678517781996, "learning_rate": 1.5577814789372064e-05, "loss": 0.1931, "step": 1903 }, { "epoch": 3.067257349979863, "grad_norm": 0.24029809222854032, "learning_rate": 1.555585462095802e-05, "loss": 0.1899, "step": 1904 }, { "epoch": 3.068868304470399, "grad_norm": 0.25815577075329543, "learning_rate": 1.5533900087357527e-05, "loss": 0.2053, "step": 1905 }, { "epoch": 3.070479258960934, "grad_norm": 0.25550205510849705, "learning_rate": 1.5511951216407142e-05, "loss": 0.202, "step": 1906 }, { "epoch": 3.07209021345147, "grad_norm": 0.24282903913932946, "learning_rate": 1.5490008035936245e-05, "loss": 0.1987, "step": 1907 }, { "epoch": 3.0737011679420054, "grad_norm": 0.24787825404381872, "learning_rate": 1.5468070573766982e-05, "loss": 0.2126, "step": 1908 }, { "epoch": 3.0753121224325413, "grad_norm": 0.23805631453760473, "learning_rate": 1.5446138857714285e-05, "loss": 0.2152, "step": 1909 }, { "epoch": 3.076923076923077, "grad_norm": 0.22261953448924351, "learning_rate": 1.5424212915585766e-05, "loss": 0.2049, "step": 1910 }, { "epoch": 3.0785340314136125, "grad_norm": 0.23389483167172603, "learning_rate": 1.5402292775181732e-05, "loss": 0.1841, "step": 1911 }, { "epoch": 3.0801449859041483, "grad_norm": 0.24520165353665813, "learning_rate": 1.5380378464295133e-05, "loss": 0.2121, "step": 1912 }, { "epoch": 3.0817559403946837, "grad_norm": 0.2355879009329018, "learning_rate": 1.535847001071153e-05, "loss": 0.1831, "step": 1913 }, { "epoch": 3.0833668948852195, "grad_norm": 0.25353870145003116, "learning_rate": 1.5336567442209052e-05, "loss": 0.2093, "step": 1914 }, { "epoch": 3.084977849375755, "grad_norm": 0.23181775476490288, "learning_rate": 1.5314670786558358e-05, "loss": 0.1993, "step": 1915 }, { "epoch": 3.0865888038662908, "grad_norm": 0.24826824746417567, "learning_rate": 1.5292780071522634e-05, "loss": 0.2159, "step": 1916 }, { "epoch": 3.0881997583568266, "grad_norm": 0.23097701612004543, "learning_rate": 1.527089532485751e-05, "loss": 0.1875, "step": 1917 }, { "epoch": 3.089810712847362, "grad_norm": 0.23252448832192432, "learning_rate": 1.5249016574311069e-05, "loss": 0.1925, "step": 1918 }, { "epoch": 3.091421667337898, "grad_norm": 0.2335044424719987, "learning_rate": 1.5227143847623759e-05, "loss": 0.1917, "step": 1919 }, { "epoch": 3.093032621828433, "grad_norm": 0.2467731183493127, "learning_rate": 1.5205277172528438e-05, "loss": 0.2028, "step": 1920 }, { "epoch": 3.094643576318969, "grad_norm": 0.2478688042585764, "learning_rate": 1.5183416576750251e-05, "loss": 0.2155, "step": 1921 }, { "epoch": 3.096254530809505, "grad_norm": 0.23559604410809817, "learning_rate": 1.5161562088006649e-05, "loss": 0.1927, "step": 1922 }, { "epoch": 3.0978654853000402, "grad_norm": 0.25132210120716414, "learning_rate": 1.513971373400734e-05, "loss": 0.2165, "step": 1923 }, { "epoch": 3.099476439790576, "grad_norm": 0.21838775169451463, "learning_rate": 1.5117871542454259e-05, "loss": 0.1959, "step": 1924 }, { "epoch": 3.1010873942811115, "grad_norm": 0.24648218446618922, "learning_rate": 1.509603554104152e-05, "loss": 0.215, "step": 1925 }, { "epoch": 3.1026983487716473, "grad_norm": 0.23608736727692337, "learning_rate": 1.5074205757455382e-05, "loss": 0.1913, "step": 1926 }, { "epoch": 3.1043093032621827, "grad_norm": 0.24688604035464398, "learning_rate": 1.5052382219374238e-05, "loss": 0.2028, "step": 1927 }, { "epoch": 3.1059202577527185, "grad_norm": 0.21415210914040747, "learning_rate": 1.5030564954468548e-05, "loss": 0.1712, "step": 1928 }, { "epoch": 3.107531212243254, "grad_norm": 0.28181734443533013, "learning_rate": 1.500875399040083e-05, "loss": 0.2383, "step": 1929 }, { "epoch": 3.1091421667337897, "grad_norm": 0.23835213355733328, "learning_rate": 1.498694935482559e-05, "loss": 0.1919, "step": 1930 }, { "epoch": 3.1107531212243256, "grad_norm": 0.239890464679267, "learning_rate": 1.4965151075389344e-05, "loss": 0.1857, "step": 1931 }, { "epoch": 3.112364075714861, "grad_norm": 0.24414783199380563, "learning_rate": 1.4943359179730523e-05, "loss": 0.2172, "step": 1932 }, { "epoch": 3.113975030205397, "grad_norm": 0.23874299205907634, "learning_rate": 1.492157369547947e-05, "loss": 0.1947, "step": 1933 }, { "epoch": 3.115585984695932, "grad_norm": 0.26898773638453133, "learning_rate": 1.4899794650258397e-05, "loss": 0.224, "step": 1934 }, { "epoch": 3.117196939186468, "grad_norm": 0.2577407154196406, "learning_rate": 1.4878022071681368e-05, "loss": 0.1898, "step": 1935 }, { "epoch": 3.118807893677004, "grad_norm": 0.29459886996326795, "learning_rate": 1.4856255987354232e-05, "loss": 0.2289, "step": 1936 }, { "epoch": 3.1204188481675392, "grad_norm": 0.2512628187386988, "learning_rate": 1.4834496424874587e-05, "loss": 0.2068, "step": 1937 }, { "epoch": 3.122029802658075, "grad_norm": 0.267387282459161, "learning_rate": 1.4812743411831814e-05, "loss": 0.211, "step": 1938 }, { "epoch": 3.1236407571486104, "grad_norm": 0.25843048547184483, "learning_rate": 1.479099697580694e-05, "loss": 0.2177, "step": 1939 }, { "epoch": 3.1252517116391463, "grad_norm": 0.26113540198237006, "learning_rate": 1.4769257144372668e-05, "loss": 0.1874, "step": 1940 }, { "epoch": 3.1268626661296817, "grad_norm": 0.2508889947235532, "learning_rate": 1.4747523945093332e-05, "loss": 0.2058, "step": 1941 }, { "epoch": 3.1284736206202175, "grad_norm": 0.24792171081568534, "learning_rate": 1.4725797405524866e-05, "loss": 0.1922, "step": 1942 }, { "epoch": 3.130084575110753, "grad_norm": 0.2536281872683442, "learning_rate": 1.4704077553214734e-05, "loss": 0.187, "step": 1943 }, { "epoch": 3.1316955296012887, "grad_norm": 0.24634192517432316, "learning_rate": 1.468236441570194e-05, "loss": 0.2169, "step": 1944 }, { "epoch": 3.1333064840918246, "grad_norm": 0.26202184945627943, "learning_rate": 1.4660658020516966e-05, "loss": 0.2107, "step": 1945 }, { "epoch": 3.13491743858236, "grad_norm": 0.24692155351876013, "learning_rate": 1.463895839518176e-05, "loss": 0.2009, "step": 1946 }, { "epoch": 3.1365283930728958, "grad_norm": 0.2651977196274272, "learning_rate": 1.4617265567209673e-05, "loss": 0.2161, "step": 1947 }, { "epoch": 3.138139347563431, "grad_norm": 0.2254192338184251, "learning_rate": 1.4595579564105432e-05, "loss": 0.198, "step": 1948 }, { "epoch": 3.139750302053967, "grad_norm": 0.2456828260242202, "learning_rate": 1.457390041336512e-05, "loss": 0.2169, "step": 1949 }, { "epoch": 3.141361256544503, "grad_norm": 0.2383954425741448, "learning_rate": 1.4552228142476138e-05, "loss": 0.1913, "step": 1950 }, { "epoch": 3.142972211035038, "grad_norm": 0.2388875846403241, "learning_rate": 1.453056277891715e-05, "loss": 0.2074, "step": 1951 }, { "epoch": 3.144583165525574, "grad_norm": 0.24254252857096423, "learning_rate": 1.4508904350158069e-05, "loss": 0.184, "step": 1952 }, { "epoch": 3.1461941200161094, "grad_norm": 0.2677691092809072, "learning_rate": 1.4487252883660019e-05, "loss": 0.1969, "step": 1953 }, { "epoch": 3.1478050745066453, "grad_norm": 0.25451569692994286, "learning_rate": 1.446560840687529e-05, "loss": 0.2038, "step": 1954 }, { "epoch": 3.1494160289971806, "grad_norm": 0.26165425081978877, "learning_rate": 1.4443970947247308e-05, "loss": 0.2257, "step": 1955 }, { "epoch": 3.1510269834877165, "grad_norm": 0.2586993585923267, "learning_rate": 1.4422340532210601e-05, "loss": 0.2184, "step": 1956 }, { "epoch": 3.1526379379782523, "grad_norm": 0.22749947793278547, "learning_rate": 1.4400717189190783e-05, "loss": 0.1886, "step": 1957 }, { "epoch": 3.1542488924687877, "grad_norm": 0.25165536537988603, "learning_rate": 1.4379100945604486e-05, "loss": 0.1957, "step": 1958 }, { "epoch": 3.1558598469593235, "grad_norm": 0.26090767826355016, "learning_rate": 1.4357491828859333e-05, "loss": 0.2062, "step": 1959 }, { "epoch": 3.157470801449859, "grad_norm": 0.23823889920583088, "learning_rate": 1.433588986635392e-05, "loss": 0.1916, "step": 1960 }, { "epoch": 3.1590817559403948, "grad_norm": 0.25301272984117323, "learning_rate": 1.4314295085477784e-05, "loss": 0.2129, "step": 1961 }, { "epoch": 3.16069271043093, "grad_norm": 0.24454168144111912, "learning_rate": 1.4292707513611332e-05, "loss": 0.191, "step": 1962 }, { "epoch": 3.162303664921466, "grad_norm": 0.2519013664831757, "learning_rate": 1.4271127178125843e-05, "loss": 0.213, "step": 1963 }, { "epoch": 3.163914619412002, "grad_norm": 0.2322029065504692, "learning_rate": 1.4249554106383432e-05, "loss": 0.1915, "step": 1964 }, { "epoch": 3.165525573902537, "grad_norm": 0.268939820675357, "learning_rate": 1.4227988325736991e-05, "loss": 0.2017, "step": 1965 }, { "epoch": 3.167136528393073, "grad_norm": 0.24459588031927953, "learning_rate": 1.420642986353016e-05, "loss": 0.2095, "step": 1966 }, { "epoch": 3.1687474828836084, "grad_norm": 0.23268508577761784, "learning_rate": 1.4184878747097308e-05, "loss": 0.2032, "step": 1967 }, { "epoch": 3.1703584373741442, "grad_norm": 0.24314702305936226, "learning_rate": 1.4163335003763506e-05, "loss": 0.2094, "step": 1968 }, { "epoch": 3.1719693918646796, "grad_norm": 0.2466331707682189, "learning_rate": 1.414179866084445e-05, "loss": 0.2039, "step": 1969 }, { "epoch": 3.1735803463552155, "grad_norm": 0.24010384240031207, "learning_rate": 1.4120269745646469e-05, "loss": 0.2131, "step": 1970 }, { "epoch": 3.1751913008457513, "grad_norm": 0.2315431702165574, "learning_rate": 1.4098748285466464e-05, "loss": 0.1836, "step": 1971 }, { "epoch": 3.1768022553362867, "grad_norm": 0.24912715996998305, "learning_rate": 1.4077234307591896e-05, "loss": 0.2044, "step": 1972 }, { "epoch": 3.1784132098268225, "grad_norm": 0.23840799641981053, "learning_rate": 1.4055727839300733e-05, "loss": 0.2022, "step": 1973 }, { "epoch": 3.180024164317358, "grad_norm": 0.23128654085099198, "learning_rate": 1.4034228907861414e-05, "loss": 0.2028, "step": 1974 }, { "epoch": 3.1816351188078937, "grad_norm": 0.26483087418421175, "learning_rate": 1.4012737540532842e-05, "loss": 0.2106, "step": 1975 }, { "epoch": 3.183246073298429, "grad_norm": 0.23542978958227243, "learning_rate": 1.3991253764564308e-05, "loss": 0.2013, "step": 1976 }, { "epoch": 3.184857027788965, "grad_norm": 0.23883779350162376, "learning_rate": 1.3969777607195485e-05, "loss": 0.1905, "step": 1977 }, { "epoch": 3.186467982279501, "grad_norm": 0.22791901305740134, "learning_rate": 1.3948309095656382e-05, "loss": 0.2003, "step": 1978 }, { "epoch": 3.188078936770036, "grad_norm": 0.2585822611597578, "learning_rate": 1.3926848257167336e-05, "loss": 0.2197, "step": 1979 }, { "epoch": 3.189689891260572, "grad_norm": 0.2573536529025018, "learning_rate": 1.3905395118938929e-05, "loss": 0.206, "step": 1980 }, { "epoch": 3.1913008457511074, "grad_norm": 0.24021103836798227, "learning_rate": 1.3883949708171987e-05, "loss": 0.208, "step": 1981 }, { "epoch": 3.1929118002416432, "grad_norm": 0.24221200646286703, "learning_rate": 1.3862512052057535e-05, "loss": 0.2255, "step": 1982 }, { "epoch": 3.1945227547321786, "grad_norm": 0.23090236397527916, "learning_rate": 1.384108217777678e-05, "loss": 0.1995, "step": 1983 }, { "epoch": 3.1961337092227144, "grad_norm": 0.21761509754767042, "learning_rate": 1.3819660112501054e-05, "loss": 0.1835, "step": 1984 }, { "epoch": 3.1977446637132503, "grad_norm": 0.2703261644476621, "learning_rate": 1.3798245883391788e-05, "loss": 0.2289, "step": 1985 }, { "epoch": 3.1993556182037857, "grad_norm": 0.2230354526114237, "learning_rate": 1.3776839517600458e-05, "loss": 0.1979, "step": 1986 }, { "epoch": 3.2009665726943215, "grad_norm": 0.23711275306860455, "learning_rate": 1.3755441042268615e-05, "loss": 0.1923, "step": 1987 }, { "epoch": 3.202577527184857, "grad_norm": 0.24705039155672698, "learning_rate": 1.3734050484527765e-05, "loss": 0.2165, "step": 1988 }, { "epoch": 3.2041884816753927, "grad_norm": 0.2377534616627376, "learning_rate": 1.3712667871499385e-05, "loss": 0.1929, "step": 1989 }, { "epoch": 3.205799436165928, "grad_norm": 0.2449633515338068, "learning_rate": 1.369129323029489e-05, "loss": 0.2004, "step": 1990 }, { "epoch": 3.207410390656464, "grad_norm": 0.2593360096012776, "learning_rate": 1.3669926588015585e-05, "loss": 0.2185, "step": 1991 }, { "epoch": 3.2090213451469998, "grad_norm": 0.23324589375655463, "learning_rate": 1.364856797175262e-05, "loss": 0.1875, "step": 1992 }, { "epoch": 3.210632299637535, "grad_norm": 0.2413326158872254, "learning_rate": 1.362721740858697e-05, "loss": 0.1826, "step": 1993 }, { "epoch": 3.212243254128071, "grad_norm": 0.23918589432715148, "learning_rate": 1.3605874925589419e-05, "loss": 0.2068, "step": 1994 }, { "epoch": 3.2138542086186064, "grad_norm": 0.23689262549972226, "learning_rate": 1.3584540549820493e-05, "loss": 0.2253, "step": 1995 }, { "epoch": 3.215465163109142, "grad_norm": 0.23319941479929307, "learning_rate": 1.3563214308330434e-05, "loss": 0.1865, "step": 1996 }, { "epoch": 3.217076117599678, "grad_norm": 0.23879194349455857, "learning_rate": 1.3541896228159165e-05, "loss": 0.213, "step": 1997 }, { "epoch": 3.2186870720902134, "grad_norm": 0.25094170517305836, "learning_rate": 1.3520586336336296e-05, "loss": 0.2182, "step": 1998 }, { "epoch": 3.2202980265807493, "grad_norm": 0.25223079196864345, "learning_rate": 1.3499284659881013e-05, "loss": 0.2001, "step": 1999 }, { "epoch": 3.2219089810712846, "grad_norm": 0.24328823956234302, "learning_rate": 1.3477991225802103e-05, "loss": 0.2045, "step": 2000 }, { "epoch": 3.2235199355618205, "grad_norm": 0.23787972021657403, "learning_rate": 1.3456706061097905e-05, "loss": 0.1807, "step": 2001 }, { "epoch": 3.225130890052356, "grad_norm": 0.28090320878999836, "learning_rate": 1.3435429192756275e-05, "loss": 0.2231, "step": 2002 }, { "epoch": 3.2267418445428917, "grad_norm": 0.2591510357821683, "learning_rate": 1.3414160647754547e-05, "loss": 0.2125, "step": 2003 }, { "epoch": 3.228352799033427, "grad_norm": 0.22470518404451317, "learning_rate": 1.339290045305948e-05, "loss": 0.1817, "step": 2004 }, { "epoch": 3.229963753523963, "grad_norm": 0.23711240462541178, "learning_rate": 1.3371648635627285e-05, "loss": 0.202, "step": 2005 }, { "epoch": 3.2315747080144988, "grad_norm": 0.26273477899693026, "learning_rate": 1.3350405222403529e-05, "loss": 0.2079, "step": 2006 }, { "epoch": 3.233185662505034, "grad_norm": 0.2325874086405899, "learning_rate": 1.3329170240323124e-05, "loss": 0.2106, "step": 2007 }, { "epoch": 3.23479661699557, "grad_norm": 0.23774767585628861, "learning_rate": 1.330794371631028e-05, "loss": 0.1882, "step": 2008 }, { "epoch": 3.2364075714861054, "grad_norm": 0.22890478281530627, "learning_rate": 1.3286725677278525e-05, "loss": 0.1993, "step": 2009 }, { "epoch": 3.238018525976641, "grad_norm": 0.25495173404834265, "learning_rate": 1.3265516150130577e-05, "loss": 0.2195, "step": 2010 }, { "epoch": 3.239629480467177, "grad_norm": 0.2433403725605887, "learning_rate": 1.3244315161758392e-05, "loss": 0.2033, "step": 2011 }, { "epoch": 3.2412404349577124, "grad_norm": 0.24981637506327542, "learning_rate": 1.3223122739043091e-05, "loss": 0.2122, "step": 2012 }, { "epoch": 3.2428513894482482, "grad_norm": 0.23690220270645826, "learning_rate": 1.3201938908854942e-05, "loss": 0.2007, "step": 2013 }, { "epoch": 3.2444623439387836, "grad_norm": 0.23365777497487655, "learning_rate": 1.3180763698053311e-05, "loss": 0.2043, "step": 2014 }, { "epoch": 3.2460732984293195, "grad_norm": 0.24922388772794682, "learning_rate": 1.3159597133486628e-05, "loss": 0.2049, "step": 2015 }, { "epoch": 3.247684252919855, "grad_norm": 0.2615394743002068, "learning_rate": 1.3138439241992376e-05, "loss": 0.1982, "step": 2016 }, { "epoch": 3.2492952074103907, "grad_norm": 0.23085725558413792, "learning_rate": 1.3117290050397036e-05, "loss": 0.2063, "step": 2017 }, { "epoch": 3.250906161900926, "grad_norm": 0.2524638411576523, "learning_rate": 1.3096149585516059e-05, "loss": 0.201, "step": 2018 }, { "epoch": 3.252517116391462, "grad_norm": 0.22321898517204689, "learning_rate": 1.3075017874153808e-05, "loss": 0.1873, "step": 2019 }, { "epoch": 3.2541280708819977, "grad_norm": 0.24208007420834118, "learning_rate": 1.3053894943103598e-05, "loss": 0.2028, "step": 2020 }, { "epoch": 3.255739025372533, "grad_norm": 0.23464995395813093, "learning_rate": 1.303278081914756e-05, "loss": 0.1874, "step": 2021 }, { "epoch": 3.257349979863069, "grad_norm": 0.23529260639203314, "learning_rate": 1.3011675529056688e-05, "loss": 0.1984, "step": 2022 }, { "epoch": 3.2589609343536043, "grad_norm": 0.23749352729237172, "learning_rate": 1.2990579099590763e-05, "loss": 0.201, "step": 2023 }, { "epoch": 3.26057188884414, "grad_norm": 0.2427641865797294, "learning_rate": 1.2969491557498342e-05, "loss": 0.2031, "step": 2024 }, { "epoch": 3.262182843334676, "grad_norm": 0.2375077667321114, "learning_rate": 1.2948412929516703e-05, "loss": 0.1918, "step": 2025 }, { "epoch": 3.2637937978252114, "grad_norm": 0.24542218137105598, "learning_rate": 1.2927343242371815e-05, "loss": 0.207, "step": 2026 }, { "epoch": 3.2654047523157472, "grad_norm": 0.2390333630741558, "learning_rate": 1.2906282522778341e-05, "loss": 0.1998, "step": 2027 }, { "epoch": 3.2670157068062826, "grad_norm": 0.24837545936493144, "learning_rate": 1.2885230797439543e-05, "loss": 0.2157, "step": 2028 }, { "epoch": 3.2686266612968184, "grad_norm": 0.2519965003627847, "learning_rate": 1.2864188093047291e-05, "loss": 0.1995, "step": 2029 }, { "epoch": 3.270237615787354, "grad_norm": 0.23774917985531094, "learning_rate": 1.2843154436282014e-05, "loss": 0.2081, "step": 2030 }, { "epoch": 3.2718485702778897, "grad_norm": 0.24180882546372334, "learning_rate": 1.2822129853812682e-05, "loss": 0.1794, "step": 2031 }, { "epoch": 3.273459524768425, "grad_norm": 0.22947773377827083, "learning_rate": 1.2801114372296742e-05, "loss": 0.204, "step": 2032 }, { "epoch": 3.275070479258961, "grad_norm": 0.2455803270540699, "learning_rate": 1.2780108018380103e-05, "loss": 0.2038, "step": 2033 }, { "epoch": 3.2766814337494967, "grad_norm": 0.24059114783030533, "learning_rate": 1.2759110818697114e-05, "loss": 0.203, "step": 2034 }, { "epoch": 3.278292388240032, "grad_norm": 0.22748506298956164, "learning_rate": 1.273812279987051e-05, "loss": 0.2093, "step": 2035 }, { "epoch": 3.279903342730568, "grad_norm": 0.2469088454838881, "learning_rate": 1.2717143988511392e-05, "loss": 0.2227, "step": 2036 }, { "epoch": 3.2815142972211033, "grad_norm": 0.23674243065437753, "learning_rate": 1.2696174411219164e-05, "loss": 0.1981, "step": 2037 }, { "epoch": 3.283125251711639, "grad_norm": 0.22507856798719222, "learning_rate": 1.2675214094581547e-05, "loss": 0.1886, "step": 2038 }, { "epoch": 3.284736206202175, "grad_norm": 0.24558240776697116, "learning_rate": 1.2654263065174515e-05, "loss": 0.2104, "step": 2039 }, { "epoch": 3.2863471606927104, "grad_norm": 0.2247916689809979, "learning_rate": 1.263332134956226e-05, "loss": 0.1996, "step": 2040 }, { "epoch": 3.287958115183246, "grad_norm": 0.24098803655370316, "learning_rate": 1.2612388974297161e-05, "loss": 0.2301, "step": 2041 }, { "epoch": 3.2895690696737816, "grad_norm": 0.22790610018869215, "learning_rate": 1.259146596591978e-05, "loss": 0.2044, "step": 2042 }, { "epoch": 3.2911800241643174, "grad_norm": 0.29439964547743164, "learning_rate": 1.2570552350958764e-05, "loss": 0.1872, "step": 2043 }, { "epoch": 3.292790978654853, "grad_norm": 0.23266526895741785, "learning_rate": 1.2549648155930875e-05, "loss": 0.221, "step": 2044 }, { "epoch": 3.2944019331453887, "grad_norm": 0.23179183784366264, "learning_rate": 1.2528753407340929e-05, "loss": 0.194, "step": 2045 }, { "epoch": 3.296012887635924, "grad_norm": 0.21779866661814773, "learning_rate": 1.250786813168176e-05, "loss": 0.1891, "step": 2046 }, { "epoch": 3.29762384212646, "grad_norm": 0.2532975434083456, "learning_rate": 1.2486992355434197e-05, "loss": 0.2207, "step": 2047 }, { "epoch": 3.2992347966169957, "grad_norm": 0.22096278667714603, "learning_rate": 1.2466126105067014e-05, "loss": 0.1821, "step": 2048 }, { "epoch": 3.300845751107531, "grad_norm": 0.2530562863101158, "learning_rate": 1.2445269407036908e-05, "loss": 0.2133, "step": 2049 }, { "epoch": 3.302456705598067, "grad_norm": 0.24462052555468472, "learning_rate": 1.242442228778848e-05, "loss": 0.1947, "step": 2050 }, { "epoch": 3.3040676600886023, "grad_norm": 0.22993112034177768, "learning_rate": 1.2403584773754176e-05, "loss": 0.1884, "step": 2051 }, { "epoch": 3.305678614579138, "grad_norm": 0.2375252762228901, "learning_rate": 1.238275689135425e-05, "loss": 0.2068, "step": 2052 }, { "epoch": 3.307289569069674, "grad_norm": 0.23974997189809513, "learning_rate": 1.2361938666996772e-05, "loss": 0.2094, "step": 2053 }, { "epoch": 3.3089005235602094, "grad_norm": 0.23634430658941374, "learning_rate": 1.2341130127077548e-05, "loss": 0.1967, "step": 2054 }, { "epoch": 3.310511478050745, "grad_norm": 0.23446190903752556, "learning_rate": 1.2320331297980097e-05, "loss": 0.183, "step": 2055 }, { "epoch": 3.3121224325412806, "grad_norm": 0.23976096133391758, "learning_rate": 1.2299542206075641e-05, "loss": 0.2365, "step": 2056 }, { "epoch": 3.3137333870318164, "grad_norm": 0.23801927478749976, "learning_rate": 1.2278762877723058e-05, "loss": 0.2101, "step": 2057 }, { "epoch": 3.315344341522352, "grad_norm": 0.24082087940178765, "learning_rate": 1.2257993339268843e-05, "loss": 0.1928, "step": 2058 }, { "epoch": 3.3169552960128876, "grad_norm": 0.24713092173760648, "learning_rate": 1.223723361704706e-05, "loss": 0.1997, "step": 2059 }, { "epoch": 3.3185662505034235, "grad_norm": 0.24274357069914468, "learning_rate": 1.221648373737935e-05, "loss": 0.2094, "step": 2060 }, { "epoch": 3.320177204993959, "grad_norm": 0.23991670470326582, "learning_rate": 1.2195743726574869e-05, "loss": 0.2056, "step": 2061 }, { "epoch": 3.3217881594844947, "grad_norm": 0.23272961485583524, "learning_rate": 1.2175013610930253e-05, "loss": 0.2045, "step": 2062 }, { "epoch": 3.32339911397503, "grad_norm": 0.2549934176914694, "learning_rate": 1.2154293416729606e-05, "loss": 0.2088, "step": 2063 }, { "epoch": 3.325010068465566, "grad_norm": 0.2627470044380523, "learning_rate": 1.2133583170244422e-05, "loss": 0.2154, "step": 2064 }, { "epoch": 3.3266210229561013, "grad_norm": 0.2418643009081854, "learning_rate": 1.2112882897733634e-05, "loss": 0.1826, "step": 2065 }, { "epoch": 3.328231977446637, "grad_norm": 0.23964309123146355, "learning_rate": 1.2092192625443469e-05, "loss": 0.1956, "step": 2066 }, { "epoch": 3.329842931937173, "grad_norm": 0.26877059633483413, "learning_rate": 1.2071512379607515e-05, "loss": 0.2125, "step": 2067 }, { "epoch": 3.3314538864277083, "grad_norm": 0.2718834091071389, "learning_rate": 1.2050842186446636e-05, "loss": 0.1953, "step": 2068 }, { "epoch": 3.333064840918244, "grad_norm": 0.24287090855325613, "learning_rate": 1.2030182072168957e-05, "loss": 0.1866, "step": 2069 }, { "epoch": 3.3346757954087796, "grad_norm": 0.25823157441435457, "learning_rate": 1.2009532062969801e-05, "loss": 0.2084, "step": 2070 }, { "epoch": 3.3362867498993154, "grad_norm": 0.239590496772935, "learning_rate": 1.1988892185031697e-05, "loss": 0.1934, "step": 2071 }, { "epoch": 3.3378977043898512, "grad_norm": 0.25170794325761425, "learning_rate": 1.1968262464524333e-05, "loss": 0.2099, "step": 2072 }, { "epoch": 3.3395086588803866, "grad_norm": 0.2774141741661976, "learning_rate": 1.1947642927604507e-05, "loss": 0.1951, "step": 2073 }, { "epoch": 3.3411196133709224, "grad_norm": 0.25992979182223386, "learning_rate": 1.1927033600416113e-05, "loss": 0.1895, "step": 2074 }, { "epoch": 3.342730567861458, "grad_norm": 0.2676459769274524, "learning_rate": 1.190643450909008e-05, "loss": 0.2062, "step": 2075 }, { "epoch": 3.3443415223519937, "grad_norm": 0.24491077877038162, "learning_rate": 1.1885845679744396e-05, "loss": 0.1842, "step": 2076 }, { "epoch": 3.345952476842529, "grad_norm": 0.2573204990604907, "learning_rate": 1.1865267138484e-05, "loss": 0.2252, "step": 2077 }, { "epoch": 3.347563431333065, "grad_norm": 0.2488993418599943, "learning_rate": 1.1844698911400805e-05, "loss": 0.2058, "step": 2078 }, { "epoch": 3.3491743858236003, "grad_norm": 0.2221579767832753, "learning_rate": 1.1824141024573647e-05, "loss": 0.2012, "step": 2079 }, { "epoch": 3.350785340314136, "grad_norm": 0.25173011551503893, "learning_rate": 1.1803593504068256e-05, "loss": 0.2202, "step": 2080 }, { "epoch": 3.352396294804672, "grad_norm": 0.2403210358197002, "learning_rate": 1.1783056375937193e-05, "loss": 0.2103, "step": 2081 }, { "epoch": 3.3540072492952073, "grad_norm": 0.23783146032565713, "learning_rate": 1.1762529666219869e-05, "loss": 0.2067, "step": 2082 }, { "epoch": 3.355618203785743, "grad_norm": 0.2423185833751914, "learning_rate": 1.174201340094248e-05, "loss": 0.2043, "step": 2083 }, { "epoch": 3.3572291582762785, "grad_norm": 0.24591036904977595, "learning_rate": 1.172150760611797e-05, "loss": 0.1968, "step": 2084 }, { "epoch": 3.3588401127668144, "grad_norm": 0.23724633382039845, "learning_rate": 1.1701012307746021e-05, "loss": 0.2087, "step": 2085 }, { "epoch": 3.36045106725735, "grad_norm": 0.24067309607867643, "learning_rate": 1.168052753181298e-05, "loss": 0.2029, "step": 2086 }, { "epoch": 3.3620620217478856, "grad_norm": 0.24166724371741746, "learning_rate": 1.1660053304291894e-05, "loss": 0.1864, "step": 2087 }, { "epoch": 3.3636729762384214, "grad_norm": 0.24147797064255336, "learning_rate": 1.16395896511424e-05, "loss": 0.2112, "step": 2088 }, { "epoch": 3.365283930728957, "grad_norm": 0.24098212734239127, "learning_rate": 1.1619136598310725e-05, "loss": 0.2045, "step": 2089 }, { "epoch": 3.3668948852194927, "grad_norm": 0.25019441770575873, "learning_rate": 1.1598694171729703e-05, "loss": 0.211, "step": 2090 }, { "epoch": 3.368505839710028, "grad_norm": 0.2365678451068785, "learning_rate": 1.1578262397318642e-05, "loss": 0.1937, "step": 2091 }, { "epoch": 3.370116794200564, "grad_norm": 0.23164431406226096, "learning_rate": 1.1557841300983363e-05, "loss": 0.1858, "step": 2092 }, { "epoch": 3.3717277486910993, "grad_norm": 0.24563176728263572, "learning_rate": 1.1537430908616152e-05, "loss": 0.2159, "step": 2093 }, { "epoch": 3.373338703181635, "grad_norm": 0.23552543012415264, "learning_rate": 1.151703124609573e-05, "loss": 0.2163, "step": 2094 }, { "epoch": 3.374949657672171, "grad_norm": 0.23518558644454512, "learning_rate": 1.1496642339287191e-05, "loss": 0.2063, "step": 2095 }, { "epoch": 3.3765606121627063, "grad_norm": 0.2441080853367346, "learning_rate": 1.1476264214042004e-05, "loss": 0.2042, "step": 2096 }, { "epoch": 3.378171566653242, "grad_norm": 0.451665532949196, "learning_rate": 1.1455896896197974e-05, "loss": 0.2149, "step": 2097 }, { "epoch": 3.3797825211437775, "grad_norm": 0.25128065966750873, "learning_rate": 1.1435540411579203e-05, "loss": 0.2113, "step": 2098 }, { "epoch": 3.3813934756343134, "grad_norm": 0.2225475197951388, "learning_rate": 1.1415194785996045e-05, "loss": 0.1594, "step": 2099 }, { "epoch": 3.383004430124849, "grad_norm": 0.2617602545629556, "learning_rate": 1.1394860045245084e-05, "loss": 0.229, "step": 2100 }, { "epoch": 3.3846153846153846, "grad_norm": 0.24500084267347086, "learning_rate": 1.137453621510912e-05, "loss": 0.2098, "step": 2101 }, { "epoch": 3.3862263391059204, "grad_norm": 0.23416531343487373, "learning_rate": 1.1354223321357119e-05, "loss": 0.2084, "step": 2102 }, { "epoch": 3.387837293596456, "grad_norm": 0.22091357857993194, "learning_rate": 1.1333921389744153e-05, "loss": 0.1967, "step": 2103 }, { "epoch": 3.3894482480869916, "grad_norm": 0.2511364869911767, "learning_rate": 1.1313630446011427e-05, "loss": 0.216, "step": 2104 }, { "epoch": 3.391059202577527, "grad_norm": 0.23267127034952573, "learning_rate": 1.1293350515886203e-05, "loss": 0.2022, "step": 2105 }, { "epoch": 3.392670157068063, "grad_norm": 0.23555886598588704, "learning_rate": 1.1273081625081777e-05, "loss": 0.1995, "step": 2106 }, { "epoch": 3.3942811115585982, "grad_norm": 0.23868806139569818, "learning_rate": 1.1252823799297433e-05, "loss": 0.1773, "step": 2107 }, { "epoch": 3.395892066049134, "grad_norm": 0.27452473674180444, "learning_rate": 1.123257706421845e-05, "loss": 0.2199, "step": 2108 }, { "epoch": 3.39750302053967, "grad_norm": 0.2394943226611453, "learning_rate": 1.1212341445516043e-05, "loss": 0.1981, "step": 2109 }, { "epoch": 3.3991139750302053, "grad_norm": 0.24612894971879087, "learning_rate": 1.1192116968847313e-05, "loss": 0.1968, "step": 2110 }, { "epoch": 3.400724929520741, "grad_norm": 0.25883844195430433, "learning_rate": 1.1171903659855256e-05, "loss": 0.1949, "step": 2111 }, { "epoch": 3.4023358840112765, "grad_norm": 0.23984551387483893, "learning_rate": 1.1151701544168685e-05, "loss": 0.1993, "step": 2112 }, { "epoch": 3.4039468385018123, "grad_norm": 0.23016178489654152, "learning_rate": 1.1131510647402246e-05, "loss": 0.1948, "step": 2113 }, { "epoch": 3.405557792992348, "grad_norm": 0.2631642324887254, "learning_rate": 1.1111330995156335e-05, "loss": 0.1918, "step": 2114 }, { "epoch": 3.4071687474828836, "grad_norm": 0.2506220508839031, "learning_rate": 1.1091162613017113e-05, "loss": 0.2062, "step": 2115 }, { "epoch": 3.4087797019734194, "grad_norm": 0.25264234683049214, "learning_rate": 1.1071005526556444e-05, "loss": 0.2141, "step": 2116 }, { "epoch": 3.410390656463955, "grad_norm": 0.25278773105827357, "learning_rate": 1.1050859761331867e-05, "loss": 0.1915, "step": 2117 }, { "epoch": 3.4120016109544906, "grad_norm": 0.23224228492195387, "learning_rate": 1.1030725342886556e-05, "loss": 0.1938, "step": 2118 }, { "epoch": 3.413612565445026, "grad_norm": 0.2142415135543403, "learning_rate": 1.101060229674932e-05, "loss": 0.1936, "step": 2119 }, { "epoch": 3.415223519935562, "grad_norm": 0.23554798219735495, "learning_rate": 1.0990490648434541e-05, "loss": 0.1969, "step": 2120 }, { "epoch": 3.4168344744260972, "grad_norm": 0.26354880573854594, "learning_rate": 1.0970390423442142e-05, "loss": 0.2096, "step": 2121 }, { "epoch": 3.418445428916633, "grad_norm": 0.24440684389786652, "learning_rate": 1.0950301647257572e-05, "loss": 0.2051, "step": 2122 }, { "epoch": 3.420056383407169, "grad_norm": 0.2552755780564851, "learning_rate": 1.0930224345351751e-05, "loss": 0.2117, "step": 2123 }, { "epoch": 3.4216673378977043, "grad_norm": 0.29304190170396727, "learning_rate": 1.0910158543181073e-05, "loss": 0.1947, "step": 2124 }, { "epoch": 3.42327829238824, "grad_norm": 0.2458155097174551, "learning_rate": 1.089010426618732e-05, "loss": 0.213, "step": 2125 }, { "epoch": 3.4248892468787755, "grad_norm": 0.2462846208675487, "learning_rate": 1.0870061539797696e-05, "loss": 0.1995, "step": 2126 }, { "epoch": 3.4265002013693113, "grad_norm": 0.2367304617090815, "learning_rate": 1.0850030389424724e-05, "loss": 0.2052, "step": 2127 }, { "epoch": 3.428111155859847, "grad_norm": 0.24464716596958344, "learning_rate": 1.0830010840466282e-05, "loss": 0.2172, "step": 2128 }, { "epoch": 3.4297221103503825, "grad_norm": 0.24560443043574962, "learning_rate": 1.0810002918305508e-05, "loss": 0.2054, "step": 2129 }, { "epoch": 3.4313330648409184, "grad_norm": 0.24409525550351965, "learning_rate": 1.0790006648310828e-05, "loss": 0.2041, "step": 2130 }, { "epoch": 3.4329440193314538, "grad_norm": 0.23782893656758072, "learning_rate": 1.0770022055835882e-05, "loss": 0.2036, "step": 2131 }, { "epoch": 3.4345549738219896, "grad_norm": 0.23124086461460552, "learning_rate": 1.075004916621949e-05, "loss": 0.199, "step": 2132 }, { "epoch": 3.436165928312525, "grad_norm": 0.2438499354960223, "learning_rate": 1.073008800478566e-05, "loss": 0.1827, "step": 2133 }, { "epoch": 3.437776882803061, "grad_norm": 0.23923418585538098, "learning_rate": 1.0710138596843494e-05, "loss": 0.2235, "step": 2134 }, { "epoch": 3.439387837293596, "grad_norm": 0.23302741192753726, "learning_rate": 1.0690200967687234e-05, "loss": 0.1979, "step": 2135 }, { "epoch": 3.440998791784132, "grad_norm": 0.23649073229436382, "learning_rate": 1.0670275142596154e-05, "loss": 0.2074, "step": 2136 }, { "epoch": 3.442609746274668, "grad_norm": 0.23258425943151656, "learning_rate": 1.065036114683458e-05, "loss": 0.2043, "step": 2137 }, { "epoch": 3.4442207007652033, "grad_norm": 0.23498177622326072, "learning_rate": 1.0630459005651823e-05, "loss": 0.2005, "step": 2138 }, { "epoch": 3.445831655255739, "grad_norm": 0.26706198990505337, "learning_rate": 1.061056874428219e-05, "loss": 0.2026, "step": 2139 }, { "epoch": 3.4474426097462745, "grad_norm": 0.23061635890286963, "learning_rate": 1.059069038794489e-05, "loss": 0.214, "step": 2140 }, { "epoch": 3.4490535642368103, "grad_norm": 0.22470610502390026, "learning_rate": 1.0570823961844065e-05, "loss": 0.1919, "step": 2141 }, { "epoch": 3.450664518727346, "grad_norm": 0.24074853759574122, "learning_rate": 1.055096949116873e-05, "loss": 0.2134, "step": 2142 }, { "epoch": 3.4522754732178815, "grad_norm": 0.238304166701547, "learning_rate": 1.0531127001092715e-05, "loss": 0.2032, "step": 2143 }, { "epoch": 3.4538864277084174, "grad_norm": 0.22940631731584865, "learning_rate": 1.0511296516774699e-05, "loss": 0.1904, "step": 2144 }, { "epoch": 3.4554973821989527, "grad_norm": 0.24654104415440423, "learning_rate": 1.0491478063358096e-05, "loss": 0.2021, "step": 2145 }, { "epoch": 3.4571083366894886, "grad_norm": 0.26612320358388847, "learning_rate": 1.0471671665971104e-05, "loss": 0.2139, "step": 2146 }, { "epoch": 3.4587192911800244, "grad_norm": 0.40048278104943835, "learning_rate": 1.0451877349726605e-05, "loss": 0.2229, "step": 2147 }, { "epoch": 3.46033024567056, "grad_norm": 0.23711292098323433, "learning_rate": 1.0432095139722187e-05, "loss": 0.1839, "step": 2148 }, { "epoch": 3.4619412001610956, "grad_norm": 0.26048839006315294, "learning_rate": 1.0412325061040063e-05, "loss": 0.1968, "step": 2149 }, { "epoch": 3.463552154651631, "grad_norm": 0.2326769954633612, "learning_rate": 1.0392567138747101e-05, "loss": 0.2178, "step": 2150 }, { "epoch": 3.465163109142167, "grad_norm": 0.25045178314044103, "learning_rate": 1.0372821397894709e-05, "loss": 0.1879, "step": 2151 }, { "epoch": 3.4667740636327022, "grad_norm": 0.23568688727213294, "learning_rate": 1.0353087863518894e-05, "loss": 0.1919, "step": 2152 }, { "epoch": 3.468385018123238, "grad_norm": 0.2412245199550461, "learning_rate": 1.0333366560640151e-05, "loss": 0.1995, "step": 2153 }, { "epoch": 3.4699959726137735, "grad_norm": 0.2634416881896074, "learning_rate": 1.0313657514263492e-05, "loss": 0.2353, "step": 2154 }, { "epoch": 3.4716069271043093, "grad_norm": 0.22072632069118556, "learning_rate": 1.0293960749378384e-05, "loss": 0.1939, "step": 2155 }, { "epoch": 3.473217881594845, "grad_norm": 0.23393038614986122, "learning_rate": 1.0274276290958701e-05, "loss": 0.2051, "step": 2156 }, { "epoch": 3.4748288360853805, "grad_norm": 0.23009879755108045, "learning_rate": 1.0254604163962747e-05, "loss": 0.1869, "step": 2157 }, { "epoch": 3.4764397905759163, "grad_norm": 0.24631286359057225, "learning_rate": 1.0234944393333155e-05, "loss": 0.2072, "step": 2158 }, { "epoch": 3.4780507450664517, "grad_norm": 0.2396424841750826, "learning_rate": 1.0215297003996927e-05, "loss": 0.1985, "step": 2159 }, { "epoch": 3.4796616995569876, "grad_norm": 0.2229192233390955, "learning_rate": 1.0195662020865333e-05, "loss": 0.1877, "step": 2160 }, { "epoch": 3.4812726540475234, "grad_norm": 0.27640686696076017, "learning_rate": 1.0176039468833938e-05, "loss": 0.2072, "step": 2161 }, { "epoch": 3.482883608538059, "grad_norm": 0.2501269581468127, "learning_rate": 1.015642937278254e-05, "loss": 0.2093, "step": 2162 }, { "epoch": 3.4844945630285946, "grad_norm": 0.23553356609239753, "learning_rate": 1.0136831757575134e-05, "loss": 0.2097, "step": 2163 }, { "epoch": 3.48610551751913, "grad_norm": 0.2513360823413565, "learning_rate": 1.0117246648059888e-05, "loss": 0.2027, "step": 2164 }, { "epoch": 3.487716472009666, "grad_norm": 0.25559246873602354, "learning_rate": 1.0097674069069132e-05, "loss": 0.2042, "step": 2165 }, { "epoch": 3.4893274265002012, "grad_norm": 0.24381340477783897, "learning_rate": 1.0078114045419305e-05, "loss": 0.2193, "step": 2166 }, { "epoch": 3.490938380990737, "grad_norm": 0.23737314305360308, "learning_rate": 1.0058566601910903e-05, "loss": 0.2057, "step": 2167 }, { "epoch": 3.4925493354812724, "grad_norm": 0.24161527730895105, "learning_rate": 1.0039031763328506e-05, "loss": 0.1997, "step": 2168 }, { "epoch": 3.4941602899718083, "grad_norm": 0.24433075314270097, "learning_rate": 1.0019509554440678e-05, "loss": 0.2156, "step": 2169 }, { "epoch": 3.495771244462344, "grad_norm": 0.2280058138376306, "learning_rate": 1.0000000000000006e-05, "loss": 0.1945, "step": 2170 }, { "epoch": 3.4973821989528795, "grad_norm": 0.24779688243659267, "learning_rate": 9.980503124742988e-06, "loss": 0.2093, "step": 2171 }, { "epoch": 3.4989931534434153, "grad_norm": 0.2428801961690144, "learning_rate": 9.961018953390086e-06, "loss": 0.2065, "step": 2172 }, { "epoch": 3.5006041079339507, "grad_norm": 0.2219328596608684, "learning_rate": 9.941547510645647e-06, "loss": 0.1885, "step": 2173 }, { "epoch": 3.5022150624244865, "grad_norm": 0.2267440935335129, "learning_rate": 9.922088821197854e-06, "loss": 0.2089, "step": 2174 }, { "epoch": 3.5038260169150224, "grad_norm": 0.24159131404047698, "learning_rate": 9.902642909718737e-06, "loss": 0.2101, "step": 2175 }, { "epoch": 3.5054369714055578, "grad_norm": 0.23555256415137021, "learning_rate": 9.88320980086413e-06, "loss": 0.2021, "step": 2176 }, { "epoch": 3.5070479258960936, "grad_norm": 0.2274219679214926, "learning_rate": 9.863789519273638e-06, "loss": 0.1952, "step": 2177 }, { "epoch": 3.508658880386629, "grad_norm": 0.23187334643843013, "learning_rate": 9.844382089570584e-06, "loss": 0.2084, "step": 2178 }, { "epoch": 3.510269834877165, "grad_norm": 0.22771801310950746, "learning_rate": 9.824987536361992e-06, "loss": 0.1949, "step": 2179 }, { "epoch": 3.5118807893677, "grad_norm": 0.22370527590647052, "learning_rate": 9.805605884238587e-06, "loss": 0.205, "step": 2180 }, { "epoch": 3.513491743858236, "grad_norm": 0.23845020170094708, "learning_rate": 9.786237157774726e-06, "loss": 0.2032, "step": 2181 }, { "epoch": 3.5151026983487714, "grad_norm": 0.2283710304261633, "learning_rate": 9.766881381528357e-06, "loss": 0.2032, "step": 2182 }, { "epoch": 3.5167136528393073, "grad_norm": 0.2378550286458637, "learning_rate": 9.747538580041034e-06, "loss": 0.2106, "step": 2183 }, { "epoch": 3.518324607329843, "grad_norm": 0.22430006122712037, "learning_rate": 9.728208777837858e-06, "loss": 0.2024, "step": 2184 }, { "epoch": 3.5199355618203785, "grad_norm": 0.22469949464520347, "learning_rate": 9.70889199942743e-06, "loss": 0.2076, "step": 2185 }, { "epoch": 3.5215465163109143, "grad_norm": 0.2416906697350346, "learning_rate": 9.689588269301842e-06, "loss": 0.2171, "step": 2186 }, { "epoch": 3.5231574708014497, "grad_norm": 0.22087168616579303, "learning_rate": 9.670297611936657e-06, "loss": 0.2009, "step": 2187 }, { "epoch": 3.5247684252919855, "grad_norm": 0.23173269867776017, "learning_rate": 9.651020051790858e-06, "loss": 0.2073, "step": 2188 }, { "epoch": 3.5263793797825214, "grad_norm": 0.2277755728185813, "learning_rate": 9.631755613306814e-06, "loss": 0.2008, "step": 2189 }, { "epoch": 3.5279903342730567, "grad_norm": 0.22892461332859246, "learning_rate": 9.612504320910249e-06, "loss": 0.22, "step": 2190 }, { "epoch": 3.5296012887635926, "grad_norm": 0.22441902310723352, "learning_rate": 9.59326619901024e-06, "loss": 0.2079, "step": 2191 }, { "epoch": 3.531212243254128, "grad_norm": 0.2331194408439935, "learning_rate": 9.574041271999163e-06, "loss": 0.2019, "step": 2192 }, { "epoch": 3.532823197744664, "grad_norm": 0.2317551080792225, "learning_rate": 9.55482956425264e-06, "loss": 0.2088, "step": 2193 }, { "epoch": 3.534434152235199, "grad_norm": 0.24198116715159212, "learning_rate": 9.535631100129556e-06, "loss": 0.2047, "step": 2194 }, { "epoch": 3.536045106725735, "grad_norm": 0.2404786868991303, "learning_rate": 9.516445903972005e-06, "loss": 0.203, "step": 2195 }, { "epoch": 3.5376560612162704, "grad_norm": 0.21582358266853396, "learning_rate": 9.497274000105239e-06, "loss": 0.1868, "step": 2196 }, { "epoch": 3.5392670157068062, "grad_norm": 0.24330479425323304, "learning_rate": 9.478115412837661e-06, "loss": 0.2046, "step": 2197 }, { "epoch": 3.540877970197342, "grad_norm": 0.2290973058402262, "learning_rate": 9.458970166460804e-06, "loss": 0.1997, "step": 2198 }, { "epoch": 3.5424889246878775, "grad_norm": 0.24487357902072016, "learning_rate": 9.439838285249285e-06, "loss": 0.2109, "step": 2199 }, { "epoch": 3.5440998791784133, "grad_norm": 0.23275375857106345, "learning_rate": 9.420719793460758e-06, "loss": 0.2045, "step": 2200 }, { "epoch": 3.5457108336689487, "grad_norm": 0.23599591116541208, "learning_rate": 9.401614715335905e-06, "loss": 0.1868, "step": 2201 }, { "epoch": 3.5473217881594845, "grad_norm": 0.23362100236229594, "learning_rate": 9.38252307509841e-06, "loss": 0.193, "step": 2202 }, { "epoch": 3.5489327426500203, "grad_norm": 0.24650562505944937, "learning_rate": 9.363444896954927e-06, "loss": 0.2109, "step": 2203 }, { "epoch": 3.5505436971405557, "grad_norm": 0.23856000850206951, "learning_rate": 9.344380205095017e-06, "loss": 0.1888, "step": 2204 }, { "epoch": 3.5521546516310916, "grad_norm": 0.23064058859091333, "learning_rate": 9.325329023691137e-06, "loss": 0.2069, "step": 2205 }, { "epoch": 3.553765606121627, "grad_norm": 0.2318135856582349, "learning_rate": 9.30629137689866e-06, "loss": 0.2023, "step": 2206 }, { "epoch": 3.555376560612163, "grad_norm": 0.24342035330502437, "learning_rate": 9.287267288855756e-06, "loss": 0.2067, "step": 2207 }, { "epoch": 3.5569875151026986, "grad_norm": 0.25166127958235246, "learning_rate": 9.268256783683408e-06, "loss": 0.1903, "step": 2208 }, { "epoch": 3.558598469593234, "grad_norm": 0.22256962504013172, "learning_rate": 9.24925988548539e-06, "loss": 0.1935, "step": 2209 }, { "epoch": 3.5602094240837694, "grad_norm": 0.28219584567539696, "learning_rate": 9.230276618348224e-06, "loss": 0.2108, "step": 2210 }, { "epoch": 3.5618203785743052, "grad_norm": 0.24218404180288422, "learning_rate": 9.21130700634114e-06, "loss": 0.2002, "step": 2211 }, { "epoch": 3.563431333064841, "grad_norm": 0.24262274443668091, "learning_rate": 9.192351073516047e-06, "loss": 0.1875, "step": 2212 }, { "epoch": 3.5650422875553764, "grad_norm": 0.2364784795121163, "learning_rate": 9.173408843907529e-06, "loss": 0.1873, "step": 2213 }, { "epoch": 3.5666532420459123, "grad_norm": 0.2317621350392338, "learning_rate": 9.154480341532797e-06, "loss": 0.2008, "step": 2214 }, { "epoch": 3.5682641965364477, "grad_norm": 0.24422946032063308, "learning_rate": 9.135565590391633e-06, "loss": 0.2241, "step": 2215 }, { "epoch": 3.5698751510269835, "grad_norm": 0.23009545062501383, "learning_rate": 9.116664614466386e-06, "loss": 0.2057, "step": 2216 }, { "epoch": 3.5714861055175193, "grad_norm": 0.23204535283854022, "learning_rate": 9.097777437721982e-06, "loss": 0.1953, "step": 2217 }, { "epoch": 3.5730970600080547, "grad_norm": 0.24756670044336804, "learning_rate": 9.078904084105802e-06, "loss": 0.2071, "step": 2218 }, { "epoch": 3.5747080144985905, "grad_norm": 0.22696668780158338, "learning_rate": 9.060044577547711e-06, "loss": 0.197, "step": 2219 }, { "epoch": 3.576318968989126, "grad_norm": 0.21813119741031062, "learning_rate": 9.04119894196003e-06, "loss": 0.1943, "step": 2220 }, { "epoch": 3.5779299234796618, "grad_norm": 0.24547436858026264, "learning_rate": 9.022367201237495e-06, "loss": 0.1908, "step": 2221 }, { "epoch": 3.5795408779701976, "grad_norm": 0.24906258689712765, "learning_rate": 9.00354937925721e-06, "loss": 0.2125, "step": 2222 }, { "epoch": 3.581151832460733, "grad_norm": 0.2358356210611667, "learning_rate": 8.98474549987862e-06, "loss": 0.2037, "step": 2223 }, { "epoch": 3.5827627869512684, "grad_norm": 0.23154346550342686, "learning_rate": 8.965955586943538e-06, "loss": 0.1915, "step": 2224 }, { "epoch": 3.584373741441804, "grad_norm": 0.23932511820741448, "learning_rate": 8.947179664276028e-06, "loss": 0.2049, "step": 2225 }, { "epoch": 3.58598469593234, "grad_norm": 0.216916084634077, "learning_rate": 8.928417755682416e-06, "loss": 0.1896, "step": 2226 }, { "epoch": 3.5875956504228754, "grad_norm": 0.2200896036801776, "learning_rate": 8.909669884951284e-06, "loss": 0.2, "step": 2227 }, { "epoch": 3.5892066049134113, "grad_norm": 0.2485175178385473, "learning_rate": 8.890936075853403e-06, "loss": 0.2034, "step": 2228 }, { "epoch": 3.5908175594039466, "grad_norm": 0.2178344656723329, "learning_rate": 8.87221635214171e-06, "loss": 0.1798, "step": 2229 }, { "epoch": 3.5924285138944825, "grad_norm": 0.2302593584653443, "learning_rate": 8.853510737551274e-06, "loss": 0.2048, "step": 2230 }, { "epoch": 3.5940394683850183, "grad_norm": 0.2195145608319814, "learning_rate": 8.8348192557993e-06, "loss": 0.189, "step": 2231 }, { "epoch": 3.5956504228755537, "grad_norm": 0.2384324191704394, "learning_rate": 8.816141930585067e-06, "loss": 0.191, "step": 2232 }, { "epoch": 3.5972613773660895, "grad_norm": 0.27318029242884734, "learning_rate": 8.79747878558989e-06, "loss": 0.2054, "step": 2233 }, { "epoch": 3.598872331856625, "grad_norm": 0.2443948830306952, "learning_rate": 8.778829844477099e-06, "loss": 0.1963, "step": 2234 }, { "epoch": 3.6004832863471607, "grad_norm": 0.24789244954904005, "learning_rate": 8.76019513089206e-06, "loss": 0.2092, "step": 2235 }, { "epoch": 3.6020942408376966, "grad_norm": 0.24650887122937143, "learning_rate": 8.741574668462053e-06, "loss": 0.2046, "step": 2236 }, { "epoch": 3.603705195328232, "grad_norm": 0.23165112489745265, "learning_rate": 8.722968480796294e-06, "loss": 0.1855, "step": 2237 }, { "epoch": 3.6053161498187674, "grad_norm": 0.2576959078546253, "learning_rate": 8.704376591485923e-06, "loss": 0.2059, "step": 2238 }, { "epoch": 3.606927104309303, "grad_norm": 0.23768867314811368, "learning_rate": 8.685799024103942e-06, "loss": 0.1906, "step": 2239 }, { "epoch": 3.608538058799839, "grad_norm": 0.2395303763373646, "learning_rate": 8.667235802205183e-06, "loss": 0.1901, "step": 2240 }, { "epoch": 3.6101490132903744, "grad_norm": 0.2413793337525353, "learning_rate": 8.648686949326286e-06, "loss": 0.1961, "step": 2241 }, { "epoch": 3.6117599677809102, "grad_norm": 0.23591510967044874, "learning_rate": 8.630152488985694e-06, "loss": 0.2082, "step": 2242 }, { "epoch": 3.6133709222714456, "grad_norm": 0.2368705104725338, "learning_rate": 8.611632444683595e-06, "loss": 0.2016, "step": 2243 }, { "epoch": 3.6149818767619815, "grad_norm": 0.22430359624959098, "learning_rate": 8.593126839901886e-06, "loss": 0.2003, "step": 2244 }, { "epoch": 3.6165928312525173, "grad_norm": 0.2328209779003292, "learning_rate": 8.57463569810415e-06, "loss": 0.2193, "step": 2245 }, { "epoch": 3.6182037857430527, "grad_norm": 0.21765260530670116, "learning_rate": 8.556159042735672e-06, "loss": 0.1884, "step": 2246 }, { "epoch": 3.6198147402335885, "grad_norm": 0.2373213750878368, "learning_rate": 8.537696897223331e-06, "loss": 0.1916, "step": 2247 }, { "epoch": 3.621425694724124, "grad_norm": 0.23641497550916923, "learning_rate": 8.519249284975611e-06, "loss": 0.2159, "step": 2248 }, { "epoch": 3.6230366492146597, "grad_norm": 0.23628799850862556, "learning_rate": 8.500816229382584e-06, "loss": 0.2091, "step": 2249 }, { "epoch": 3.6246476037051956, "grad_norm": 0.2234944894492603, "learning_rate": 8.482397753815872e-06, "loss": 0.2012, "step": 2250 }, { "epoch": 3.626258558195731, "grad_norm": 0.21626870143948573, "learning_rate": 8.463993881628586e-06, "loss": 0.1922, "step": 2251 }, { "epoch": 3.6278695126862663, "grad_norm": 0.23755306979760044, "learning_rate": 8.445604636155328e-06, "loss": 0.2078, "step": 2252 }, { "epoch": 3.629480467176802, "grad_norm": 0.24117810761942085, "learning_rate": 8.427230040712171e-06, "loss": 0.2085, "step": 2253 }, { "epoch": 3.631091421667338, "grad_norm": 0.22854665517211442, "learning_rate": 8.408870118596606e-06, "loss": 0.1999, "step": 2254 }, { "epoch": 3.6327023761578734, "grad_norm": 0.22293728621187936, "learning_rate": 8.390524893087505e-06, "loss": 0.1836, "step": 2255 }, { "epoch": 3.6343133306484092, "grad_norm": 0.22707952398983391, "learning_rate": 8.37219438744511e-06, "loss": 0.1859, "step": 2256 }, { "epoch": 3.6359242851389446, "grad_norm": 0.23511966018985556, "learning_rate": 8.353878624911026e-06, "loss": 0.2185, "step": 2257 }, { "epoch": 3.6375352396294804, "grad_norm": 0.22703099897283688, "learning_rate": 8.335577628708137e-06, "loss": 0.1927, "step": 2258 }, { "epoch": 3.6391461941200163, "grad_norm": 0.25017536611561936, "learning_rate": 8.317291422040601e-06, "loss": 0.2087, "step": 2259 }, { "epoch": 3.6407571486105517, "grad_norm": 0.23049249283018605, "learning_rate": 8.299020028093844e-06, "loss": 0.202, "step": 2260 }, { "epoch": 3.6423681031010875, "grad_norm": 0.2271488606251097, "learning_rate": 8.28076347003451e-06, "loss": 0.1997, "step": 2261 }, { "epoch": 3.643979057591623, "grad_norm": 0.23883238541566312, "learning_rate": 8.262521771010417e-06, "loss": 0.2128, "step": 2262 }, { "epoch": 3.6455900120821587, "grad_norm": 0.22315904417254107, "learning_rate": 8.24429495415054e-06, "loss": 0.1997, "step": 2263 }, { "epoch": 3.6472009665726945, "grad_norm": 0.2234185102011281, "learning_rate": 8.226083042565006e-06, "loss": 0.2003, "step": 2264 }, { "epoch": 3.64881192106323, "grad_norm": 0.23225689520155557, "learning_rate": 8.207886059345034e-06, "loss": 0.1995, "step": 2265 }, { "epoch": 3.6504228755537658, "grad_norm": 0.23187295084042797, "learning_rate": 8.189704027562913e-06, "loss": 0.1775, "step": 2266 }, { "epoch": 3.652033830044301, "grad_norm": 0.23529056309699636, "learning_rate": 8.171536970271963e-06, "loss": 0.1999, "step": 2267 }, { "epoch": 3.653644784534837, "grad_norm": 0.23236799884409995, "learning_rate": 8.153384910506539e-06, "loss": 0.1871, "step": 2268 }, { "epoch": 3.6552557390253724, "grad_norm": 0.23485228349145965, "learning_rate": 8.135247871281977e-06, "loss": 0.2062, "step": 2269 }, { "epoch": 3.656866693515908, "grad_norm": 0.22646757698120465, "learning_rate": 8.11712587559455e-06, "loss": 0.1832, "step": 2270 }, { "epoch": 3.6584776480064436, "grad_norm": 0.2514790956885834, "learning_rate": 8.099018946421473e-06, "loss": 0.2196, "step": 2271 }, { "epoch": 3.6600886024969794, "grad_norm": 0.2340164808100686, "learning_rate": 8.080927106720862e-06, "loss": 0.2239, "step": 2272 }, { "epoch": 3.6616995569875153, "grad_norm": 0.2238317436694524, "learning_rate": 8.062850379431689e-06, "loss": 0.1888, "step": 2273 }, { "epoch": 3.6633105114780506, "grad_norm": 0.23788481343752213, "learning_rate": 8.044788787473756e-06, "loss": 0.2094, "step": 2274 }, { "epoch": 3.6649214659685865, "grad_norm": 0.23426880337639244, "learning_rate": 8.026742353747698e-06, "loss": 0.1888, "step": 2275 }, { "epoch": 3.666532420459122, "grad_norm": 0.23338079652138916, "learning_rate": 8.008711101134928e-06, "loss": 0.1917, "step": 2276 }, { "epoch": 3.6681433749496577, "grad_norm": 0.22694613559354795, "learning_rate": 7.99069505249759e-06, "loss": 0.2077, "step": 2277 }, { "epoch": 3.6697543294401935, "grad_norm": 0.23368251076797544, "learning_rate": 7.972694230678562e-06, "loss": 0.2019, "step": 2278 }, { "epoch": 3.671365283930729, "grad_norm": 0.2274192071332042, "learning_rate": 7.95470865850142e-06, "loss": 0.2081, "step": 2279 }, { "epoch": 3.6729762384212647, "grad_norm": 0.23272909394736677, "learning_rate": 7.936738358770409e-06, "loss": 0.2012, "step": 2280 }, { "epoch": 3.6745871929118, "grad_norm": 0.2283629788794831, "learning_rate": 7.918783354270388e-06, "loss": 0.1951, "step": 2281 }, { "epoch": 3.676198147402336, "grad_norm": 0.23382239466066568, "learning_rate": 7.900843667766845e-06, "loss": 0.2083, "step": 2282 }, { "epoch": 3.677809101892872, "grad_norm": 0.22454604048146312, "learning_rate": 7.882919322005844e-06, "loss": 0.1948, "step": 2283 }, { "epoch": 3.679420056383407, "grad_norm": 0.22453891515871524, "learning_rate": 7.865010339713986e-06, "loss": 0.1785, "step": 2284 }, { "epoch": 3.6810310108739426, "grad_norm": 0.23166411493780942, "learning_rate": 7.847116743598388e-06, "loss": 0.2198, "step": 2285 }, { "epoch": 3.6826419653644784, "grad_norm": 0.2347847692976649, "learning_rate": 7.829238556346683e-06, "loss": 0.1903, "step": 2286 }, { "epoch": 3.6842529198550142, "grad_norm": 0.2388300143720348, "learning_rate": 7.811375800626954e-06, "loss": 0.2082, "step": 2287 }, { "epoch": 3.6858638743455496, "grad_norm": 0.22396488198134365, "learning_rate": 7.793528499087709e-06, "loss": 0.2033, "step": 2288 }, { "epoch": 3.6874748288360855, "grad_norm": 0.2336053446605039, "learning_rate": 7.775696674357876e-06, "loss": 0.2105, "step": 2289 }, { "epoch": 3.689085783326621, "grad_norm": 0.23591046953244488, "learning_rate": 7.757880349046742e-06, "loss": 0.1914, "step": 2290 }, { "epoch": 3.6906967378171567, "grad_norm": 0.23748095616700868, "learning_rate": 7.74007954574397e-06, "loss": 0.2083, "step": 2291 }, { "epoch": 3.6923076923076925, "grad_norm": 0.2481715938640332, "learning_rate": 7.722294287019509e-06, "loss": 0.1945, "step": 2292 }, { "epoch": 3.693918646798228, "grad_norm": 0.24850684859956976, "learning_rate": 7.704524595423631e-06, "loss": 0.2224, "step": 2293 }, { "epoch": 3.6955296012887637, "grad_norm": 0.22449191890739228, "learning_rate": 7.686770493486835e-06, "loss": 0.1917, "step": 2294 }, { "epoch": 3.697140555779299, "grad_norm": 0.22240414266437813, "learning_rate": 7.669032003719894e-06, "loss": 0.2043, "step": 2295 }, { "epoch": 3.698751510269835, "grad_norm": 0.23811833779559768, "learning_rate": 7.651309148613745e-06, "loss": 0.2228, "step": 2296 }, { "epoch": 3.700362464760371, "grad_norm": 0.2346602801883088, "learning_rate": 7.633601950639532e-06, "loss": 0.2044, "step": 2297 }, { "epoch": 3.701973419250906, "grad_norm": 0.24881467892424516, "learning_rate": 7.615910432248546e-06, "loss": 0.1971, "step": 2298 }, { "epoch": 3.7035843737414416, "grad_norm": 0.21959606461798073, "learning_rate": 7.598234615872169e-06, "loss": 0.1723, "step": 2299 }, { "epoch": 3.7051953282319774, "grad_norm": 0.2275339064144191, "learning_rate": 7.580574523921906e-06, "loss": 0.1943, "step": 2300 }, { "epoch": 3.7068062827225132, "grad_norm": 0.2638835977196525, "learning_rate": 7.562930178789305e-06, "loss": 0.225, "step": 2301 }, { "epoch": 3.7084172372130486, "grad_norm": 0.22139223269245323, "learning_rate": 7.545301602845965e-06, "loss": 0.1974, "step": 2302 }, { "epoch": 3.7100281917035844, "grad_norm": 0.23262025318923577, "learning_rate": 7.527688818443466e-06, "loss": 0.1918, "step": 2303 }, { "epoch": 3.71163914619412, "grad_norm": 0.2281510665207774, "learning_rate": 7.510091847913396e-06, "loss": 0.1827, "step": 2304 }, { "epoch": 3.7132501006846557, "grad_norm": 0.2315480501775712, "learning_rate": 7.492510713567265e-06, "loss": 0.2254, "step": 2305 }, { "epoch": 3.7148610551751915, "grad_norm": 0.22423715853143875, "learning_rate": 7.474945437696528e-06, "loss": 0.18, "step": 2306 }, { "epoch": 3.716472009665727, "grad_norm": 0.23128163413988403, "learning_rate": 7.457396042572507e-06, "loss": 0.2008, "step": 2307 }, { "epoch": 3.7180829641562627, "grad_norm": 0.23426184961659158, "learning_rate": 7.4398625504464105e-06, "loss": 0.195, "step": 2308 }, { "epoch": 3.719693918646798, "grad_norm": 0.2242770154556143, "learning_rate": 7.422344983549281e-06, "loss": 0.1905, "step": 2309 }, { "epoch": 3.721304873137334, "grad_norm": 0.22078020401371204, "learning_rate": 7.404843364091951e-06, "loss": 0.1849, "step": 2310 }, { "epoch": 3.7229158276278698, "grad_norm": 0.2319635828085481, "learning_rate": 7.387357714265057e-06, "loss": 0.1882, "step": 2311 }, { "epoch": 3.724526782118405, "grad_norm": 0.245118103968261, "learning_rate": 7.369888056238963e-06, "loss": 0.2013, "step": 2312 }, { "epoch": 3.7261377366089405, "grad_norm": 0.22519823792644417, "learning_rate": 7.352434412163785e-06, "loss": 0.2004, "step": 2313 }, { "epoch": 3.7277486910994764, "grad_norm": 0.21704653831383208, "learning_rate": 7.334996804169301e-06, "loss": 0.1881, "step": 2314 }, { "epoch": 3.729359645590012, "grad_norm": 0.22862567502117337, "learning_rate": 7.31757525436499e-06, "loss": 0.2199, "step": 2315 }, { "epoch": 3.7309706000805476, "grad_norm": 0.21180370521549158, "learning_rate": 7.300169784839941e-06, "loss": 0.1748, "step": 2316 }, { "epoch": 3.7325815545710834, "grad_norm": 0.24645022888252976, "learning_rate": 7.282780417662885e-06, "loss": 0.2207, "step": 2317 }, { "epoch": 3.734192509061619, "grad_norm": 0.21457737876895103, "learning_rate": 7.265407174882102e-06, "loss": 0.1932, "step": 2318 }, { "epoch": 3.7358034635521546, "grad_norm": 0.2338070679618665, "learning_rate": 7.248050078525463e-06, "loss": 0.1986, "step": 2319 }, { "epoch": 3.7374144180426905, "grad_norm": 0.23031422564032217, "learning_rate": 7.2307091506003325e-06, "loss": 0.1966, "step": 2320 }, { "epoch": 3.739025372533226, "grad_norm": 0.23174071582081254, "learning_rate": 7.2133844130936e-06, "loss": 0.2106, "step": 2321 }, { "epoch": 3.7406363270237617, "grad_norm": 0.22866751182170306, "learning_rate": 7.1960758879716255e-06, "loss": 0.1961, "step": 2322 }, { "epoch": 3.742247281514297, "grad_norm": 0.24817377490035214, "learning_rate": 7.178783597180192e-06, "loss": 0.2104, "step": 2323 }, { "epoch": 3.743858236004833, "grad_norm": 0.21967760847457896, "learning_rate": 7.161507562644525e-06, "loss": 0.1868, "step": 2324 }, { "epoch": 3.7454691904953687, "grad_norm": 0.2256324233021988, "learning_rate": 7.1442478062692135e-06, "loss": 0.217, "step": 2325 }, { "epoch": 3.747080144985904, "grad_norm": 0.22260244060453915, "learning_rate": 7.127004349938234e-06, "loss": 0.2126, "step": 2326 }, { "epoch": 3.7486910994764395, "grad_norm": 0.23048377400215084, "learning_rate": 7.109777215514866e-06, "loss": 0.2021, "step": 2327 }, { "epoch": 3.7503020539669754, "grad_norm": 0.22938279327652358, "learning_rate": 7.092566424841724e-06, "loss": 0.2079, "step": 2328 }, { "epoch": 3.751913008457511, "grad_norm": 0.22136550368979707, "learning_rate": 7.0753719997406725e-06, "loss": 0.1841, "step": 2329 }, { "epoch": 3.7535239629480466, "grad_norm": 0.24306246271158838, "learning_rate": 7.0581939620128515e-06, "loss": 0.1993, "step": 2330 }, { "epoch": 3.7551349174385824, "grad_norm": 0.2292465000453504, "learning_rate": 7.0410323334386e-06, "loss": 0.1881, "step": 2331 }, { "epoch": 3.756745871929118, "grad_norm": 0.24266774620522708, "learning_rate": 7.0238871357774655e-06, "loss": 0.2136, "step": 2332 }, { "epoch": 3.7583568264196536, "grad_norm": 0.22008050143973384, "learning_rate": 7.0067583907681645e-06, "loss": 0.1973, "step": 2333 }, { "epoch": 3.7599677809101895, "grad_norm": 0.21463510410453734, "learning_rate": 6.989646120128537e-06, "loss": 0.1983, "step": 2334 }, { "epoch": 3.761578735400725, "grad_norm": 0.23707104664701026, "learning_rate": 6.97255034555556e-06, "loss": 0.2136, "step": 2335 }, { "epoch": 3.7631896898912607, "grad_norm": 0.21494086360905443, "learning_rate": 6.955471088725263e-06, "loss": 0.1896, "step": 2336 }, { "epoch": 3.764800644381796, "grad_norm": 0.22994725268735505, "learning_rate": 6.938408371292764e-06, "loss": 0.1971, "step": 2337 }, { "epoch": 3.766411598872332, "grad_norm": 0.23686841356483834, "learning_rate": 6.921362214892182e-06, "loss": 0.2062, "step": 2338 }, { "epoch": 3.7680225533628677, "grad_norm": 0.21759254488600177, "learning_rate": 6.904332641136668e-06, "loss": 0.1889, "step": 2339 }, { "epoch": 3.769633507853403, "grad_norm": 0.23896380350462712, "learning_rate": 6.887319671618315e-06, "loss": 0.2313, "step": 2340 }, { "epoch": 3.7712444623439385, "grad_norm": 0.22446891923766324, "learning_rate": 6.870323327908193e-06, "loss": 0.2126, "step": 2341 }, { "epoch": 3.7728554168344743, "grad_norm": 0.2192309543722886, "learning_rate": 6.8533436315562665e-06, "loss": 0.2035, "step": 2342 }, { "epoch": 3.77446637132501, "grad_norm": 0.22492224295878233, "learning_rate": 6.836380604091411e-06, "loss": 0.1933, "step": 2343 }, { "epoch": 3.7760773258155456, "grad_norm": 0.23367721504266944, "learning_rate": 6.819434267021366e-06, "loss": 0.207, "step": 2344 }, { "epoch": 3.7776882803060814, "grad_norm": 0.21684454903299977, "learning_rate": 6.80250464183269e-06, "loss": 0.1925, "step": 2345 }, { "epoch": 3.779299234796617, "grad_norm": 0.2220089960826439, "learning_rate": 6.785591749990779e-06, "loss": 0.1939, "step": 2346 }, { "epoch": 3.7809101892871526, "grad_norm": 0.2361697872498856, "learning_rate": 6.768695612939789e-06, "loss": 0.2059, "step": 2347 }, { "epoch": 3.7825211437776884, "grad_norm": 0.21291861216027663, "learning_rate": 6.751816252102652e-06, "loss": 0.1933, "step": 2348 }, { "epoch": 3.784132098268224, "grad_norm": 0.22876029211036183, "learning_rate": 6.734953688881007e-06, "loss": 0.2097, "step": 2349 }, { "epoch": 3.7857430527587597, "grad_norm": 0.23320502678881608, "learning_rate": 6.7181079446552165e-06, "loss": 0.1858, "step": 2350 }, { "epoch": 3.787354007249295, "grad_norm": 0.23397548643850805, "learning_rate": 6.701279040784314e-06, "loss": 0.2063, "step": 2351 }, { "epoch": 3.788964961739831, "grad_norm": 0.221465934796803, "learning_rate": 6.68446699860597e-06, "loss": 0.2066, "step": 2352 }, { "epoch": 3.7905759162303667, "grad_norm": 0.24482731312630718, "learning_rate": 6.667671839436473e-06, "loss": 0.2112, "step": 2353 }, { "epoch": 3.792186870720902, "grad_norm": 0.23761431330849628, "learning_rate": 6.650893584570724e-06, "loss": 0.2135, "step": 2354 }, { "epoch": 3.793797825211438, "grad_norm": 0.23123489562147165, "learning_rate": 6.634132255282182e-06, "loss": 0.1959, "step": 2355 }, { "epoch": 3.7954087797019733, "grad_norm": 0.22293241276120573, "learning_rate": 6.617387872822842e-06, "loss": 0.1976, "step": 2356 }, { "epoch": 3.797019734192509, "grad_norm": 0.23135387901728863, "learning_rate": 6.6006604584232044e-06, "loss": 0.1956, "step": 2357 }, { "epoch": 3.7986306886830445, "grad_norm": 0.2325139006331464, "learning_rate": 6.583950033292277e-06, "loss": 0.208, "step": 2358 }, { "epoch": 3.8002416431735804, "grad_norm": 0.233386229971809, "learning_rate": 6.567256618617515e-06, "loss": 0.2013, "step": 2359 }, { "epoch": 3.8018525976641158, "grad_norm": 0.2191968818040746, "learning_rate": 6.550580235564794e-06, "loss": 0.1866, "step": 2360 }, { "epoch": 3.8034635521546516, "grad_norm": 0.23952247025452705, "learning_rate": 6.533920905278415e-06, "loss": 0.1995, "step": 2361 }, { "epoch": 3.8050745066451874, "grad_norm": 0.21794716771682257, "learning_rate": 6.517278648881054e-06, "loss": 0.1924, "step": 2362 }, { "epoch": 3.806685461135723, "grad_norm": 0.24189326859388585, "learning_rate": 6.500653487473727e-06, "loss": 0.197, "step": 2363 }, { "epoch": 3.8082964156262586, "grad_norm": 0.22118662032379693, "learning_rate": 6.4840454421357755e-06, "loss": 0.1838, "step": 2364 }, { "epoch": 3.809907370116794, "grad_norm": 0.23506496434883922, "learning_rate": 6.46745453392485e-06, "loss": 0.2199, "step": 2365 }, { "epoch": 3.81151832460733, "grad_norm": 0.2226823424824965, "learning_rate": 6.450880783876878e-06, "loss": 0.2025, "step": 2366 }, { "epoch": 3.8131292790978657, "grad_norm": 0.22964938395731235, "learning_rate": 6.434324213006013e-06, "loss": 0.1831, "step": 2367 }, { "epoch": 3.814740233588401, "grad_norm": 0.24890163240709035, "learning_rate": 6.417784842304628e-06, "loss": 0.2056, "step": 2368 }, { "epoch": 3.816351188078937, "grad_norm": 0.23386480273146806, "learning_rate": 6.401262692743302e-06, "loss": 0.2001, "step": 2369 }, { "epoch": 3.8179621425694723, "grad_norm": 0.23767647586351798, "learning_rate": 6.384757785270777e-06, "loss": 0.1918, "step": 2370 }, { "epoch": 3.819573097060008, "grad_norm": 0.2378296385486382, "learning_rate": 6.368270140813917e-06, "loss": 0.2073, "step": 2371 }, { "epoch": 3.821184051550544, "grad_norm": 0.24265218985960949, "learning_rate": 6.351799780277716e-06, "loss": 0.2087, "step": 2372 }, { "epoch": 3.8227950060410794, "grad_norm": 0.21839384113148216, "learning_rate": 6.335346724545255e-06, "loss": 0.203, "step": 2373 }, { "epoch": 3.8244059605316147, "grad_norm": 0.21506408724015144, "learning_rate": 6.318910994477654e-06, "loss": 0.1757, "step": 2374 }, { "epoch": 3.8260169150221506, "grad_norm": 0.2272335909081642, "learning_rate": 6.3024926109140725e-06, "loss": 0.2218, "step": 2375 }, { "epoch": 3.8276278695126864, "grad_norm": 0.22244961279714315, "learning_rate": 6.286091594671688e-06, "loss": 0.1857, "step": 2376 }, { "epoch": 3.829238824003222, "grad_norm": 0.2588268718136773, "learning_rate": 6.2697079665456575e-06, "loss": 0.2055, "step": 2377 }, { "epoch": 3.8308497784937576, "grad_norm": 0.229250689941906, "learning_rate": 6.253341747309076e-06, "loss": 0.1985, "step": 2378 }, { "epoch": 3.832460732984293, "grad_norm": 0.22729115522993168, "learning_rate": 6.236992957712968e-06, "loss": 0.1987, "step": 2379 }, { "epoch": 3.834071687474829, "grad_norm": 0.2203706251189883, "learning_rate": 6.220661618486268e-06, "loss": 0.2028, "step": 2380 }, { "epoch": 3.8356826419653647, "grad_norm": 0.23167585742706767, "learning_rate": 6.204347750335791e-06, "loss": 0.2056, "step": 2381 }, { "epoch": 3.8372935964559, "grad_norm": 0.22942562541407732, "learning_rate": 6.188051373946182e-06, "loss": 0.1891, "step": 2382 }, { "epoch": 3.838904550946436, "grad_norm": 0.22056932768319534, "learning_rate": 6.171772509979903e-06, "loss": 0.2052, "step": 2383 }, { "epoch": 3.8405155054369713, "grad_norm": 0.22982020120028385, "learning_rate": 6.155511179077252e-06, "loss": 0.1968, "step": 2384 }, { "epoch": 3.842126459927507, "grad_norm": 0.24493108816991999, "learning_rate": 6.1392674018562525e-06, "loss": 0.2133, "step": 2385 }, { "epoch": 3.843737414418043, "grad_norm": 0.24068488930223084, "learning_rate": 6.1230411989126825e-06, "loss": 0.2114, "step": 2386 }, { "epoch": 3.8453483689085783, "grad_norm": 0.20980965674211466, "learning_rate": 6.106832590820053e-06, "loss": 0.1814, "step": 2387 }, { "epoch": 3.8469593233991137, "grad_norm": 0.22052853718987608, "learning_rate": 6.090641598129559e-06, "loss": 0.1941, "step": 2388 }, { "epoch": 3.8485702778896496, "grad_norm": 0.24442699536324203, "learning_rate": 6.074468241370053e-06, "loss": 0.2184, "step": 2389 }, { "epoch": 3.8501812323801854, "grad_norm": 0.22931064215122915, "learning_rate": 6.058312541048021e-06, "loss": 0.1814, "step": 2390 }, { "epoch": 3.851792186870721, "grad_norm": 0.21834781868614564, "learning_rate": 6.042174517647583e-06, "loss": 0.2028, "step": 2391 }, { "epoch": 3.8534031413612566, "grad_norm": 0.23946089513989083, "learning_rate": 6.026054191630439e-06, "loss": 0.1887, "step": 2392 }, { "epoch": 3.855014095851792, "grad_norm": 0.22970941758659105, "learning_rate": 6.00995158343584e-06, "loss": 0.2213, "step": 2393 }, { "epoch": 3.856625050342328, "grad_norm": 0.2314117292257891, "learning_rate": 5.993866713480567e-06, "loss": 0.2038, "step": 2394 }, { "epoch": 3.8582360048328637, "grad_norm": 0.2439904102574889, "learning_rate": 5.977799602158949e-06, "loss": 0.2155, "step": 2395 }, { "epoch": 3.859846959323399, "grad_norm": 0.20257437837005451, "learning_rate": 5.961750269842754e-06, "loss": 0.1751, "step": 2396 }, { "epoch": 3.861457913813935, "grad_norm": 0.22955556617831127, "learning_rate": 5.945718736881225e-06, "loss": 0.2154, "step": 2397 }, { "epoch": 3.8630688683044703, "grad_norm": 0.20870764713948106, "learning_rate": 5.929705023601038e-06, "loss": 0.1858, "step": 2398 }, { "epoch": 3.864679822795006, "grad_norm": 0.22806265886533345, "learning_rate": 5.913709150306284e-06, "loss": 0.1966, "step": 2399 }, { "epoch": 3.866290777285542, "grad_norm": 0.21759470538337408, "learning_rate": 5.897731137278417e-06, "loss": 0.206, "step": 2400 }, { "epoch": 3.8679017317760773, "grad_norm": 0.22726788817470528, "learning_rate": 5.881771004776243e-06, "loss": 0.218, "step": 2401 }, { "epoch": 3.8695126862666127, "grad_norm": 0.223006978113695, "learning_rate": 5.865828773035927e-06, "loss": 0.2127, "step": 2402 }, { "epoch": 3.8711236407571485, "grad_norm": 0.2113739938673185, "learning_rate": 5.849904462270908e-06, "loss": 0.1758, "step": 2403 }, { "epoch": 3.8727345952476844, "grad_norm": 0.2209374461562865, "learning_rate": 5.833998092671911e-06, "loss": 0.1949, "step": 2404 }, { "epoch": 3.8743455497382198, "grad_norm": 0.21472108401460543, "learning_rate": 5.8181096844069055e-06, "loss": 0.21, "step": 2405 }, { "epoch": 3.8759565042287556, "grad_norm": 0.2263705045461306, "learning_rate": 5.802239257621116e-06, "loss": 0.2014, "step": 2406 }, { "epoch": 3.877567458719291, "grad_norm": 0.2206794394457454, "learning_rate": 5.786386832436938e-06, "loss": 0.1958, "step": 2407 }, { "epoch": 3.879178413209827, "grad_norm": 0.21542639161588065, "learning_rate": 5.770552428953946e-06, "loss": 0.205, "step": 2408 }, { "epoch": 3.8807893677003626, "grad_norm": 0.21783160109169053, "learning_rate": 5.754736067248878e-06, "loss": 0.2038, "step": 2409 }, { "epoch": 3.882400322190898, "grad_norm": 0.22638816278271062, "learning_rate": 5.738937767375596e-06, "loss": 0.2086, "step": 2410 }, { "epoch": 3.884011276681434, "grad_norm": 0.23034123131475734, "learning_rate": 5.723157549365046e-06, "loss": 0.1996, "step": 2411 }, { "epoch": 3.8856222311719693, "grad_norm": 0.2403842941222503, "learning_rate": 5.707395433225247e-06, "loss": 0.1928, "step": 2412 }, { "epoch": 3.887233185662505, "grad_norm": 0.22829443560895596, "learning_rate": 5.691651438941297e-06, "loss": 0.1937, "step": 2413 }, { "epoch": 3.888844140153041, "grad_norm": 0.22379900687151044, "learning_rate": 5.675925586475286e-06, "loss": 0.2068, "step": 2414 }, { "epoch": 3.8904550946435763, "grad_norm": 0.21890279750969147, "learning_rate": 5.660217895766302e-06, "loss": 0.2006, "step": 2415 }, { "epoch": 3.8920660491341117, "grad_norm": 0.24292430296852172, "learning_rate": 5.644528386730424e-06, "loss": 0.2124, "step": 2416 }, { "epoch": 3.8936770036246475, "grad_norm": 0.23023882143912577, "learning_rate": 5.628857079260672e-06, "loss": 0.1948, "step": 2417 }, { "epoch": 3.8952879581151834, "grad_norm": 0.2182554614510497, "learning_rate": 5.613203993226981e-06, "loss": 0.1901, "step": 2418 }, { "epoch": 3.8968989126057187, "grad_norm": 0.2219288373965724, "learning_rate": 5.597569148476178e-06, "loss": 0.2212, "step": 2419 }, { "epoch": 3.8985098670962546, "grad_norm": 0.21537709000847655, "learning_rate": 5.581952564831978e-06, "loss": 0.2052, "step": 2420 }, { "epoch": 3.90012082158679, "grad_norm": 0.22629553986166404, "learning_rate": 5.56635426209494e-06, "loss": 0.1981, "step": 2421 }, { "epoch": 3.901731776077326, "grad_norm": 0.22348100651315547, "learning_rate": 5.550774260042428e-06, "loss": 0.2071, "step": 2422 }, { "epoch": 3.9033427305678616, "grad_norm": 0.22867321367174948, "learning_rate": 5.5352125784286085e-06, "loss": 0.2048, "step": 2423 }, { "epoch": 3.904953685058397, "grad_norm": 0.23348086911174862, "learning_rate": 5.519669236984442e-06, "loss": 0.2142, "step": 2424 }, { "epoch": 3.906564639548933, "grad_norm": 0.21081715913992716, "learning_rate": 5.504144255417605e-06, "loss": 0.1903, "step": 2425 }, { "epoch": 3.9081755940394682, "grad_norm": 0.22628394919950884, "learning_rate": 5.488637653412501e-06, "loss": 0.1932, "step": 2426 }, { "epoch": 3.909786548530004, "grad_norm": 0.24684525974817506, "learning_rate": 5.473149450630242e-06, "loss": 0.2193, "step": 2427 }, { "epoch": 3.91139750302054, "grad_norm": 0.22045186763448354, "learning_rate": 5.4576796667086125e-06, "loss": 0.1781, "step": 2428 }, { "epoch": 3.9130084575110753, "grad_norm": 0.2120017802530591, "learning_rate": 5.442228321262029e-06, "loss": 0.2007, "step": 2429 }, { "epoch": 3.914619412001611, "grad_norm": 0.23287109451180657, "learning_rate": 5.426795433881527e-06, "loss": 0.206, "step": 2430 }, { "epoch": 3.9162303664921465, "grad_norm": 0.22899308427743442, "learning_rate": 5.411381024134756e-06, "loss": 0.202, "step": 2431 }, { "epoch": 3.9178413209826823, "grad_norm": 0.23426089115523857, "learning_rate": 5.395985111565938e-06, "loss": 0.2043, "step": 2432 }, { "epoch": 3.9194522754732177, "grad_norm": 0.2205496319307363, "learning_rate": 5.380607715695822e-06, "loss": 0.1944, "step": 2433 }, { "epoch": 3.9210632299637536, "grad_norm": 0.23047985224867817, "learning_rate": 5.3652488560216875e-06, "loss": 0.2084, "step": 2434 }, { "epoch": 3.922674184454289, "grad_norm": 0.225561512407316, "learning_rate": 5.349908552017323e-06, "loss": 0.2004, "step": 2435 }, { "epoch": 3.924285138944825, "grad_norm": 0.23594038731291309, "learning_rate": 5.334586823132983e-06, "loss": 0.2138, "step": 2436 }, { "epoch": 3.9258960934353606, "grad_norm": 0.22443378504100486, "learning_rate": 5.31928368879536e-06, "loss": 0.1862, "step": 2437 }, { "epoch": 3.927507047925896, "grad_norm": 0.23429297324806259, "learning_rate": 5.303999168407585e-06, "loss": 0.2156, "step": 2438 }, { "epoch": 3.929118002416432, "grad_norm": 0.22875055640793499, "learning_rate": 5.288733281349186e-06, "loss": 0.1828, "step": 2439 }, { "epoch": 3.930728956906967, "grad_norm": 0.2247145591887911, "learning_rate": 5.273486046976057e-06, "loss": 0.1846, "step": 2440 }, { "epoch": 3.932339911397503, "grad_norm": 0.23458620492491555, "learning_rate": 5.258257484620441e-06, "loss": 0.2149, "step": 2441 }, { "epoch": 3.933950865888039, "grad_norm": 0.22576936655233082, "learning_rate": 5.243047613590919e-06, "loss": 0.1738, "step": 2442 }, { "epoch": 3.9355618203785743, "grad_norm": 0.2407340144249399, "learning_rate": 5.227856453172371e-06, "loss": 0.1849, "step": 2443 }, { "epoch": 3.93717277486911, "grad_norm": 0.21933224499037723, "learning_rate": 5.212684022625938e-06, "loss": 0.2001, "step": 2444 }, { "epoch": 3.9387837293596455, "grad_norm": 0.22669845460715846, "learning_rate": 5.1975303411890235e-06, "loss": 0.1974, "step": 2445 }, { "epoch": 3.9403946838501813, "grad_norm": 0.21697797198281793, "learning_rate": 5.182395428075262e-06, "loss": 0.183, "step": 2446 }, { "epoch": 3.9420056383407167, "grad_norm": 0.23111635845943326, "learning_rate": 5.167279302474493e-06, "loss": 0.2058, "step": 2447 }, { "epoch": 3.9436165928312525, "grad_norm": 0.21966365881724723, "learning_rate": 5.152181983552718e-06, "loss": 0.1975, "step": 2448 }, { "epoch": 3.945227547321788, "grad_norm": 0.23026550236047766, "learning_rate": 5.137103490452113e-06, "loss": 0.1977, "step": 2449 }, { "epoch": 3.9468385018123238, "grad_norm": 0.22759257630671995, "learning_rate": 5.12204384229098e-06, "loss": 0.204, "step": 2450 }, { "epoch": 3.9484494563028596, "grad_norm": 0.21139382664115194, "learning_rate": 5.10700305816372e-06, "loss": 0.1867, "step": 2451 }, { "epoch": 3.950060410793395, "grad_norm": 0.21338019048344403, "learning_rate": 5.091981157140808e-06, "loss": 0.1819, "step": 2452 }, { "epoch": 3.951671365283931, "grad_norm": 0.23306725549135782, "learning_rate": 5.076978158268801e-06, "loss": 0.1956, "step": 2453 }, { "epoch": 3.953282319774466, "grad_norm": 0.2281881871246552, "learning_rate": 5.061994080570281e-06, "loss": 0.1836, "step": 2454 }, { "epoch": 3.954893274265002, "grad_norm": 0.2225882034388656, "learning_rate": 5.047028943043826e-06, "loss": 0.181, "step": 2455 }, { "epoch": 3.956504228755538, "grad_norm": 0.22366930281729092, "learning_rate": 5.0320827646640054e-06, "loss": 0.2067, "step": 2456 }, { "epoch": 3.9581151832460733, "grad_norm": 0.2220234197721443, "learning_rate": 5.01715556438136e-06, "loss": 0.2225, "step": 2457 }, { "epoch": 3.959726137736609, "grad_norm": 0.20193439968881782, "learning_rate": 5.0022473611223635e-06, "loss": 0.1899, "step": 2458 }, { "epoch": 3.9613370922271445, "grad_norm": 0.22650822798732484, "learning_rate": 4.987358173789394e-06, "loss": 0.2058, "step": 2459 }, { "epoch": 3.9629480467176803, "grad_norm": 0.22033364513660997, "learning_rate": 4.972488021260733e-06, "loss": 0.2028, "step": 2460 }, { "epoch": 3.964559001208216, "grad_norm": 0.21837427939672938, "learning_rate": 4.95763692239051e-06, "loss": 0.2115, "step": 2461 }, { "epoch": 3.9661699556987515, "grad_norm": 0.2279553953983244, "learning_rate": 4.942804896008717e-06, "loss": 0.2063, "step": 2462 }, { "epoch": 3.967780910189287, "grad_norm": 0.2138127959490929, "learning_rate": 4.927991960921141e-06, "loss": 0.1793, "step": 2463 }, { "epoch": 3.9693918646798227, "grad_norm": 0.22416253513747308, "learning_rate": 4.9131981359093826e-06, "loss": 0.2017, "step": 2464 }, { "epoch": 3.9710028191703586, "grad_norm": 0.228356776210779, "learning_rate": 4.8984234397308086e-06, "loss": 0.1888, "step": 2465 }, { "epoch": 3.972613773660894, "grad_norm": 0.24338949114710415, "learning_rate": 4.883667891118515e-06, "loss": 0.2384, "step": 2466 }, { "epoch": 3.97422472815143, "grad_norm": 0.2099842060282153, "learning_rate": 4.868931508781345e-06, "loss": 0.1863, "step": 2467 }, { "epoch": 3.975835682641965, "grad_norm": 0.217213579023591, "learning_rate": 4.854214311403818e-06, "loss": 0.207, "step": 2468 }, { "epoch": 3.977446637132501, "grad_norm": 0.2198373072030007, "learning_rate": 4.839516317646149e-06, "loss": 0.198, "step": 2469 }, { "epoch": 3.979057591623037, "grad_norm": 0.2262853783161159, "learning_rate": 4.824837546144183e-06, "loss": 0.2114, "step": 2470 }, { "epoch": 3.9806685461135722, "grad_norm": 0.21939682500082897, "learning_rate": 4.810178015509415e-06, "loss": 0.2064, "step": 2471 }, { "epoch": 3.982279500604108, "grad_norm": 0.22515465224528589, "learning_rate": 4.795537744328924e-06, "loss": 0.1987, "step": 2472 }, { "epoch": 3.9838904550946435, "grad_norm": 0.21740935835993958, "learning_rate": 4.780916751165388e-06, "loss": 0.2032, "step": 2473 }, { "epoch": 3.9855014095851793, "grad_norm": 0.22140692196370554, "learning_rate": 4.76631505455702e-06, "loss": 0.1975, "step": 2474 }, { "epoch": 3.987112364075715, "grad_norm": 0.2284279380672877, "learning_rate": 4.751732673017589e-06, "loss": 0.2144, "step": 2475 }, { "epoch": 3.9887233185662505, "grad_norm": 0.22252220249176013, "learning_rate": 4.737169625036369e-06, "loss": 0.1961, "step": 2476 }, { "epoch": 3.990334273056786, "grad_norm": 0.2213040546419634, "learning_rate": 4.722625929078102e-06, "loss": 0.2141, "step": 2477 }, { "epoch": 3.9919452275473217, "grad_norm": 0.22175328254802687, "learning_rate": 4.708101603583019e-06, "loss": 0.184, "step": 2478 }, { "epoch": 3.9935561820378576, "grad_norm": 0.22787646419219595, "learning_rate": 4.693596666966771e-06, "loss": 0.2179, "step": 2479 }, { "epoch": 3.995167136528393, "grad_norm": 0.21931238676949583, "learning_rate": 4.679111137620442e-06, "loss": 0.2055, "step": 2480 }, { "epoch": 3.996778091018929, "grad_norm": 0.22831017006597473, "learning_rate": 4.664645033910491e-06, "loss": 0.2113, "step": 2481 }, { "epoch": 3.998389045509464, "grad_norm": 0.2179416073465815, "learning_rate": 4.650198374178767e-06, "loss": 0.1947, "step": 2482 }, { "epoch": 4.0, "grad_norm": 0.26286690041897504, "learning_rate": 4.635771176742443e-06, "loss": 0.2311, "step": 2483 }, { "epoch": 4.001610954490536, "grad_norm": 0.394604874099715, "learning_rate": 4.621363459894039e-06, "loss": 0.1444, "step": 2484 }, { "epoch": 4.003221908981072, "grad_norm": 0.30419988334377224, "learning_rate": 4.606975241901354e-06, "loss": 0.1577, "step": 2485 }, { "epoch": 4.004832863471607, "grad_norm": 0.24739619219289316, "learning_rate": 4.592606541007481e-06, "loss": 0.152, "step": 2486 }, { "epoch": 4.006443817962142, "grad_norm": 0.37269612926471524, "learning_rate": 4.578257375430764e-06, "loss": 0.1528, "step": 2487 }, { "epoch": 4.008054772452678, "grad_norm": 0.41792249971665374, "learning_rate": 4.563927763364759e-06, "loss": 0.1466, "step": 2488 }, { "epoch": 4.009665726943214, "grad_norm": 0.3578527160556587, "learning_rate": 4.549617722978259e-06, "loss": 0.1601, "step": 2489 }, { "epoch": 4.01127668143375, "grad_norm": 0.2889923380752588, "learning_rate": 4.535327272415215e-06, "loss": 0.1456, "step": 2490 }, { "epoch": 4.012887635924285, "grad_norm": 0.3818532366043304, "learning_rate": 4.521056429794763e-06, "loss": 0.158, "step": 2491 }, { "epoch": 4.014498590414821, "grad_norm": 0.36182018754446954, "learning_rate": 4.506805213211154e-06, "loss": 0.1496, "step": 2492 }, { "epoch": 4.0161095449053565, "grad_norm": 0.32110946672977075, "learning_rate": 4.492573640733781e-06, "loss": 0.1518, "step": 2493 }, { "epoch": 4.017720499395892, "grad_norm": 0.23828594766969258, "learning_rate": 4.478361730407104e-06, "loss": 0.1427, "step": 2494 }, { "epoch": 4.019331453886427, "grad_norm": 0.28940264741575655, "learning_rate": 4.464169500250677e-06, "loss": 0.137, "step": 2495 }, { "epoch": 4.020942408376963, "grad_norm": 0.32475999960770263, "learning_rate": 4.449996968259074e-06, "loss": 0.1552, "step": 2496 }, { "epoch": 4.022553362867499, "grad_norm": 0.3106520553812506, "learning_rate": 4.435844152401925e-06, "loss": 0.1604, "step": 2497 }, { "epoch": 4.024164317358035, "grad_norm": 0.25339318934044625, "learning_rate": 4.4217110706238305e-06, "loss": 0.1398, "step": 2498 }, { "epoch": 4.025775271848571, "grad_norm": 0.23074024688964512, "learning_rate": 4.407597740844393e-06, "loss": 0.1536, "step": 2499 }, { "epoch": 4.027386226339106, "grad_norm": 0.2298221033798615, "learning_rate": 4.393504180958166e-06, "loss": 0.1467, "step": 2500 }, { "epoch": 4.028997180829641, "grad_norm": 0.24652202184560998, "learning_rate": 4.3794304088346215e-06, "loss": 0.1499, "step": 2501 }, { "epoch": 4.030608135320177, "grad_norm": 0.25842548848300634, "learning_rate": 4.365376442318168e-06, "loss": 0.1491, "step": 2502 }, { "epoch": 4.032219089810713, "grad_norm": 0.2241710771974074, "learning_rate": 4.351342299228072e-06, "loss": 0.1581, "step": 2503 }, { "epoch": 4.033830044301249, "grad_norm": 0.226375077124321, "learning_rate": 4.337327997358494e-06, "loss": 0.1454, "step": 2504 }, { "epoch": 4.035440998791784, "grad_norm": 0.2386861417217738, "learning_rate": 4.323333554478415e-06, "loss": 0.1489, "step": 2505 }, { "epoch": 4.03705195328232, "grad_norm": 0.24161342029014413, "learning_rate": 4.309358988331658e-06, "loss": 0.1391, "step": 2506 }, { "epoch": 4.0386629077728555, "grad_norm": 0.24826467464092444, "learning_rate": 4.2954043166368176e-06, "loss": 0.1428, "step": 2507 }, { "epoch": 4.040273862263391, "grad_norm": 0.22543955458616258, "learning_rate": 4.281469557087292e-06, "loss": 0.1512, "step": 2508 }, { "epoch": 4.041884816753926, "grad_norm": 0.22191099450465107, "learning_rate": 4.267554727351209e-06, "loss": 0.1518, "step": 2509 }, { "epoch": 4.043495771244462, "grad_norm": 0.22413069894128723, "learning_rate": 4.253659845071436e-06, "loss": 0.1486, "step": 2510 }, { "epoch": 4.045106725734998, "grad_norm": 0.23604198643903934, "learning_rate": 4.239784927865562e-06, "loss": 0.1563, "step": 2511 }, { "epoch": 4.046717680225534, "grad_norm": 0.23765897381752274, "learning_rate": 4.225929993325837e-06, "loss": 0.1357, "step": 2512 }, { "epoch": 4.04832863471607, "grad_norm": 0.2325992132869097, "learning_rate": 4.2120950590191945e-06, "loss": 0.1412, "step": 2513 }, { "epoch": 4.049939589206605, "grad_norm": 0.219156691846048, "learning_rate": 4.198280142487194e-06, "loss": 0.1386, "step": 2514 }, { "epoch": 4.05155054369714, "grad_norm": 0.22634529978022389, "learning_rate": 4.184485261246032e-06, "loss": 0.1401, "step": 2515 }, { "epoch": 4.053161498187676, "grad_norm": 0.22623994036256284, "learning_rate": 4.17071043278648e-06, "loss": 0.1512, "step": 2516 }, { "epoch": 4.054772452678212, "grad_norm": 0.2348031400592028, "learning_rate": 4.156955674573908e-06, "loss": 0.1514, "step": 2517 }, { "epoch": 4.056383407168748, "grad_norm": 0.2422395543425095, "learning_rate": 4.143221004048215e-06, "loss": 0.1545, "step": 2518 }, { "epoch": 4.057994361659283, "grad_norm": 0.22710278247393073, "learning_rate": 4.129506438623854e-06, "loss": 0.154, "step": 2519 }, { "epoch": 4.059605316149819, "grad_norm": 0.2119993093135068, "learning_rate": 4.11581199568976e-06, "loss": 0.1508, "step": 2520 }, { "epoch": 4.0612162706403545, "grad_norm": 0.2266486757464885, "learning_rate": 4.10213769260938e-06, "loss": 0.1475, "step": 2521 }, { "epoch": 4.06282722513089, "grad_norm": 0.22567150872123293, "learning_rate": 4.088483546720614e-06, "loss": 0.1552, "step": 2522 }, { "epoch": 4.064438179621425, "grad_norm": 0.23043810924965868, "learning_rate": 4.074849575335804e-06, "loss": 0.1552, "step": 2523 }, { "epoch": 4.066049134111961, "grad_norm": 0.23119706591599065, "learning_rate": 4.061235795741702e-06, "loss": 0.1391, "step": 2524 }, { "epoch": 4.067660088602497, "grad_norm": 0.2167544238933369, "learning_rate": 4.04764222519948e-06, "loss": 0.1439, "step": 2525 }, { "epoch": 4.069271043093033, "grad_norm": 0.22872180237610718, "learning_rate": 4.0340688809446745e-06, "loss": 0.1652, "step": 2526 }, { "epoch": 4.070881997583569, "grad_norm": 0.22662520444169074, "learning_rate": 4.020515780187173e-06, "loss": 0.1401, "step": 2527 }, { "epoch": 4.072492952074104, "grad_norm": 0.22758725109112826, "learning_rate": 4.006982940111204e-06, "loss": 0.1474, "step": 2528 }, { "epoch": 4.074103906564639, "grad_norm": 0.21647537990769658, "learning_rate": 3.993470377875311e-06, "loss": 0.153, "step": 2529 }, { "epoch": 4.075714861055175, "grad_norm": 0.21820740866956023, "learning_rate": 3.979978110612313e-06, "loss": 0.165, "step": 2530 }, { "epoch": 4.077325815545711, "grad_norm": 0.2133491338774458, "learning_rate": 3.9665061554292946e-06, "loss": 0.1397, "step": 2531 }, { "epoch": 4.078936770036247, "grad_norm": 0.22608875122217786, "learning_rate": 3.9530545294076075e-06, "loss": 0.1541, "step": 2532 }, { "epoch": 4.080547724526782, "grad_norm": 0.21979115321693685, "learning_rate": 3.9396232496028176e-06, "loss": 0.1507, "step": 2533 }, { "epoch": 4.082158679017318, "grad_norm": 0.21782116069499285, "learning_rate": 3.926212333044683e-06, "loss": 0.1501, "step": 2534 }, { "epoch": 4.0837696335078535, "grad_norm": 0.22037323116617433, "learning_rate": 3.9128217967371515e-06, "loss": 0.1522, "step": 2535 }, { "epoch": 4.085380587998389, "grad_norm": 0.2144296673882382, "learning_rate": 3.899451657658331e-06, "loss": 0.1486, "step": 2536 }, { "epoch": 4.086991542488924, "grad_norm": 0.21177670307177632, "learning_rate": 3.8861019327604736e-06, "loss": 0.143, "step": 2537 }, { "epoch": 4.08860249697946, "grad_norm": 0.22542118104376185, "learning_rate": 3.872772638969929e-06, "loss": 0.1463, "step": 2538 }, { "epoch": 4.090213451469996, "grad_norm": 0.21745441278678204, "learning_rate": 3.859463793187159e-06, "loss": 0.1361, "step": 2539 }, { "epoch": 4.091824405960532, "grad_norm": 0.21992591585331572, "learning_rate": 3.846175412286701e-06, "loss": 0.1572, "step": 2540 }, { "epoch": 4.093435360451068, "grad_norm": 0.22629768234535663, "learning_rate": 3.83290751311713e-06, "loss": 0.1419, "step": 2541 }, { "epoch": 4.0950463149416025, "grad_norm": 0.2298622699155759, "learning_rate": 3.819660112501053e-06, "loss": 0.1355, "step": 2542 }, { "epoch": 4.096657269432138, "grad_norm": 0.21678800682951568, "learning_rate": 3.8064332272351e-06, "loss": 0.1496, "step": 2543 }, { "epoch": 4.098268223922674, "grad_norm": 0.22949965448343948, "learning_rate": 3.7932268740898836e-06, "loss": 0.1556, "step": 2544 }, { "epoch": 4.09987917841321, "grad_norm": 0.22071553901670435, "learning_rate": 3.7800410698099808e-06, "loss": 0.1574, "step": 2545 }, { "epoch": 4.101490132903746, "grad_norm": 0.2298993165539254, "learning_rate": 3.7668758311139077e-06, "loss": 0.151, "step": 2546 }, { "epoch": 4.103101087394281, "grad_norm": 0.2225517731771703, "learning_rate": 3.753731174694117e-06, "loss": 0.1467, "step": 2547 }, { "epoch": 4.104712041884817, "grad_norm": 0.225093485838971, "learning_rate": 3.7406071172169634e-06, "loss": 0.1352, "step": 2548 }, { "epoch": 4.1063229963753525, "grad_norm": 0.23400738843222163, "learning_rate": 3.727503675322681e-06, "loss": 0.1421, "step": 2549 }, { "epoch": 4.107933950865888, "grad_norm": 0.2366245815845293, "learning_rate": 3.7144208656253476e-06, "loss": 0.1485, "step": 2550 }, { "epoch": 4.109544905356423, "grad_norm": 0.2154306414839747, "learning_rate": 3.701358704712923e-06, "loss": 0.1463, "step": 2551 }, { "epoch": 4.111155859846959, "grad_norm": 0.2207385286375425, "learning_rate": 3.6883172091471474e-06, "loss": 0.1588, "step": 2552 }, { "epoch": 4.112766814337495, "grad_norm": 0.22269838904544906, "learning_rate": 3.67529639546357e-06, "loss": 0.1452, "step": 2553 }, { "epoch": 4.114377768828031, "grad_norm": 0.2313604111125716, "learning_rate": 3.6622962801715243e-06, "loss": 0.1342, "step": 2554 }, { "epoch": 4.115988723318567, "grad_norm": 0.2284422968399893, "learning_rate": 3.649316879754099e-06, "loss": 0.1417, "step": 2555 }, { "epoch": 4.1175996778091015, "grad_norm": 0.2465718086622103, "learning_rate": 3.6363582106681115e-06, "loss": 0.1408, "step": 2556 }, { "epoch": 4.119210632299637, "grad_norm": 0.23546119698074278, "learning_rate": 3.623420289344088e-06, "loss": 0.1367, "step": 2557 }, { "epoch": 4.120821586790173, "grad_norm": 0.23013205586732102, "learning_rate": 3.610503132186265e-06, "loss": 0.1566, "step": 2558 }, { "epoch": 4.122432541280709, "grad_norm": 0.22709715305840766, "learning_rate": 3.597606755572545e-06, "loss": 0.1421, "step": 2559 }, { "epoch": 4.124043495771245, "grad_norm": 0.2270302334606441, "learning_rate": 3.584731175854479e-06, "loss": 0.1417, "step": 2560 }, { "epoch": 4.12565445026178, "grad_norm": 0.22673844401199308, "learning_rate": 3.57187640935724e-06, "loss": 0.1433, "step": 2561 }, { "epoch": 4.127265404752316, "grad_norm": 0.22556426103212388, "learning_rate": 3.559042472379639e-06, "loss": 0.1545, "step": 2562 }, { "epoch": 4.1288763592428515, "grad_norm": 0.22805435104245703, "learning_rate": 3.546229381194057e-06, "loss": 0.1462, "step": 2563 }, { "epoch": 4.130487313733387, "grad_norm": 0.21900564349736037, "learning_rate": 3.5334371520464373e-06, "loss": 0.1522, "step": 2564 }, { "epoch": 4.132098268223922, "grad_norm": 0.2139940599235903, "learning_rate": 3.520665801156289e-06, "loss": 0.1372, "step": 2565 }, { "epoch": 4.133709222714458, "grad_norm": 0.22062229403040695, "learning_rate": 3.507915344716648e-06, "loss": 0.1491, "step": 2566 }, { "epoch": 4.135320177204994, "grad_norm": 0.23722546949444184, "learning_rate": 3.4951857988940475e-06, "loss": 0.1515, "step": 2567 }, { "epoch": 4.13693113169553, "grad_norm": 0.22131787664167898, "learning_rate": 3.482477179828505e-06, "loss": 0.1412, "step": 2568 }, { "epoch": 4.138542086186066, "grad_norm": 0.21918030255310963, "learning_rate": 3.4697895036335204e-06, "loss": 0.1545, "step": 2569 }, { "epoch": 4.1401530406766005, "grad_norm": 0.22054026589218842, "learning_rate": 3.457122786396032e-06, "loss": 0.14, "step": 2570 }, { "epoch": 4.141763995167136, "grad_norm": 0.22504736458991076, "learning_rate": 3.4444770441764043e-06, "loss": 0.1484, "step": 2571 }, { "epoch": 4.143374949657672, "grad_norm": 0.21902661316640898, "learning_rate": 3.431852293008391e-06, "loss": 0.1498, "step": 2572 }, { "epoch": 4.144985904148208, "grad_norm": 0.2210425406056193, "learning_rate": 3.419248548899168e-06, "loss": 0.1465, "step": 2573 }, { "epoch": 4.146596858638744, "grad_norm": 0.22089200704994782, "learning_rate": 3.406665827829243e-06, "loss": 0.1562, "step": 2574 }, { "epoch": 4.148207813129279, "grad_norm": 0.2252789843720791, "learning_rate": 3.3941041457524748e-06, "loss": 0.1465, "step": 2575 }, { "epoch": 4.149818767619815, "grad_norm": 0.23272666378438298, "learning_rate": 3.381563518596056e-06, "loss": 0.1466, "step": 2576 }, { "epoch": 4.15142972211035, "grad_norm": 0.22958528962750838, "learning_rate": 3.3690439622604832e-06, "loss": 0.1347, "step": 2577 }, { "epoch": 4.153040676600886, "grad_norm": 0.2255442065793069, "learning_rate": 3.3565454926195252e-06, "loss": 0.1469, "step": 2578 }, { "epoch": 4.154651631091422, "grad_norm": 0.21542881986949647, "learning_rate": 3.344068125520219e-06, "loss": 0.1515, "step": 2579 }, { "epoch": 4.156262585581957, "grad_norm": 0.22530311793594246, "learning_rate": 3.3316118767828498e-06, "loss": 0.1542, "step": 2580 }, { "epoch": 4.157873540072493, "grad_norm": 0.2156195896321625, "learning_rate": 3.3191767622009307e-06, "loss": 0.1528, "step": 2581 }, { "epoch": 4.159484494563029, "grad_norm": 0.21841843601381836, "learning_rate": 3.3067627975411675e-06, "loss": 0.1496, "step": 2582 }, { "epoch": 4.1610954490535645, "grad_norm": 0.23200002731345346, "learning_rate": 3.294369998543443e-06, "loss": 0.1515, "step": 2583 }, { "epoch": 4.1627064035440995, "grad_norm": 0.23379438316230453, "learning_rate": 3.2819983809208346e-06, "loss": 0.1475, "step": 2584 }, { "epoch": 4.164317358034635, "grad_norm": 0.21329943795679254, "learning_rate": 3.269647960359532e-06, "loss": 0.1473, "step": 2585 }, { "epoch": 4.165928312525171, "grad_norm": 0.23135542202637302, "learning_rate": 3.257318752518859e-06, "loss": 0.1548, "step": 2586 }, { "epoch": 4.167539267015707, "grad_norm": 0.21343772694404733, "learning_rate": 3.2450107730312473e-06, "loss": 0.1397, "step": 2587 }, { "epoch": 4.169150221506243, "grad_norm": 0.23309642536401526, "learning_rate": 3.232724037502215e-06, "loss": 0.1599, "step": 2588 }, { "epoch": 4.170761175996778, "grad_norm": 0.23103447802019342, "learning_rate": 3.2204585615103355e-06, "loss": 0.1534, "step": 2589 }, { "epoch": 4.172372130487314, "grad_norm": 0.2270645147859799, "learning_rate": 3.20821436060722e-06, "loss": 0.1529, "step": 2590 }, { "epoch": 4.173983084977849, "grad_norm": 0.23814634653183514, "learning_rate": 3.195991450317537e-06, "loss": 0.1466, "step": 2591 }, { "epoch": 4.175594039468385, "grad_norm": 0.22830584321137043, "learning_rate": 3.183789846138927e-06, "loss": 0.1445, "step": 2592 }, { "epoch": 4.177204993958921, "grad_norm": 0.21871275817927927, "learning_rate": 3.1716095635420265e-06, "loss": 0.1581, "step": 2593 }, { "epoch": 4.178815948449456, "grad_norm": 0.22189336798505455, "learning_rate": 3.159450617970441e-06, "loss": 0.1572, "step": 2594 }, { "epoch": 4.180426902939992, "grad_norm": 0.21870086409019102, "learning_rate": 3.1473130248407278e-06, "loss": 0.1366, "step": 2595 }, { "epoch": 4.182037857430528, "grad_norm": 0.22775597716347107, "learning_rate": 3.1351967995423594e-06, "loss": 0.1527, "step": 2596 }, { "epoch": 4.1836488119210635, "grad_norm": 0.22276143260948325, "learning_rate": 3.1231019574377153e-06, "loss": 0.1452, "step": 2597 }, { "epoch": 4.1852597664115985, "grad_norm": 0.2178191763823298, "learning_rate": 3.111028513862071e-06, "loss": 0.149, "step": 2598 }, { "epoch": 4.186870720902134, "grad_norm": 0.22160329576960924, "learning_rate": 3.0989764841235746e-06, "loss": 0.1484, "step": 2599 }, { "epoch": 4.18848167539267, "grad_norm": 0.22102821797818814, "learning_rate": 3.0869458835032097e-06, "loss": 0.1469, "step": 2600 }, { "epoch": 4.190092629883206, "grad_norm": 0.22697752307908553, "learning_rate": 3.074936727254785e-06, "loss": 0.1544, "step": 2601 }, { "epoch": 4.191703584373742, "grad_norm": 0.22392044668301703, "learning_rate": 3.0629490306049536e-06, "loss": 0.1477, "step": 2602 }, { "epoch": 4.193314538864277, "grad_norm": 0.22404327159580825, "learning_rate": 3.0509828087531224e-06, "loss": 0.1547, "step": 2603 }, { "epoch": 4.194925493354813, "grad_norm": 0.23084898184198646, "learning_rate": 3.039038076871481e-06, "loss": 0.1544, "step": 2604 }, { "epoch": 4.196536447845348, "grad_norm": 0.2232139079228995, "learning_rate": 3.0271148501049796e-06, "loss": 0.1402, "step": 2605 }, { "epoch": 4.198147402335884, "grad_norm": 0.22551484154069532, "learning_rate": 3.0152131435713007e-06, "loss": 0.1501, "step": 2606 }, { "epoch": 4.19975835682642, "grad_norm": 0.2282570119452304, "learning_rate": 3.003332972360831e-06, "loss": 0.154, "step": 2607 }, { "epoch": 4.201369311316955, "grad_norm": 0.2162893984726015, "learning_rate": 2.9914743515366516e-06, "loss": 0.1439, "step": 2608 }, { "epoch": 4.202980265807491, "grad_norm": 0.2221742722251089, "learning_rate": 2.9796372961345364e-06, "loss": 0.1533, "step": 2609 }, { "epoch": 4.204591220298027, "grad_norm": 0.2333853238695215, "learning_rate": 2.967821821162904e-06, "loss": 0.1454, "step": 2610 }, { "epoch": 4.2062021747885625, "grad_norm": 0.2120253738307176, "learning_rate": 2.9560279416028102e-06, "loss": 0.1506, "step": 2611 }, { "epoch": 4.2078131292790975, "grad_norm": 0.2140333611346206, "learning_rate": 2.944255672407925e-06, "loss": 0.1478, "step": 2612 }, { "epoch": 4.209424083769633, "grad_norm": 0.23068436963355107, "learning_rate": 2.932505028504531e-06, "loss": 0.1441, "step": 2613 }, { "epoch": 4.211035038260169, "grad_norm": 0.21776274650570995, "learning_rate": 2.9207760247914895e-06, "loss": 0.1479, "step": 2614 }, { "epoch": 4.212645992750705, "grad_norm": 0.23364491125107575, "learning_rate": 2.909068676140212e-06, "loss": 0.1507, "step": 2615 }, { "epoch": 4.214256947241241, "grad_norm": 0.2212861004899993, "learning_rate": 2.8973829973946645e-06, "loss": 0.1443, "step": 2616 }, { "epoch": 4.215867901731776, "grad_norm": 0.23588997528901365, "learning_rate": 2.8857190033713343e-06, "loss": 0.1414, "step": 2617 }, { "epoch": 4.217478856222312, "grad_norm": 0.22836799572319763, "learning_rate": 2.874076708859217e-06, "loss": 0.1497, "step": 2618 }, { "epoch": 4.219089810712847, "grad_norm": 0.21435982154109828, "learning_rate": 2.8624561286197793e-06, "loss": 0.1328, "step": 2619 }, { "epoch": 4.220700765203383, "grad_norm": 0.2221623295059318, "learning_rate": 2.850857277386978e-06, "loss": 0.1594, "step": 2620 }, { "epoch": 4.222311719693919, "grad_norm": 0.2224970582246465, "learning_rate": 2.8392801698672135e-06, "loss": 0.162, "step": 2621 }, { "epoch": 4.223922674184454, "grad_norm": 0.2155490975094429, "learning_rate": 2.827724820739306e-06, "loss": 0.1299, "step": 2622 }, { "epoch": 4.22553362867499, "grad_norm": 0.22520553516276554, "learning_rate": 2.8161912446544916e-06, "loss": 0.1516, "step": 2623 }, { "epoch": 4.227144583165526, "grad_norm": 0.2258052727971873, "learning_rate": 2.80467945623641e-06, "loss": 0.1415, "step": 2624 }, { "epoch": 4.2287555376560615, "grad_norm": 0.21802617466661803, "learning_rate": 2.7931894700810703e-06, "loss": 0.141, "step": 2625 }, { "epoch": 4.230366492146596, "grad_norm": 0.22525103510627617, "learning_rate": 2.781721300756828e-06, "loss": 0.1526, "step": 2626 }, { "epoch": 4.231977446637132, "grad_norm": 0.2225143843203524, "learning_rate": 2.7702749628043933e-06, "loss": 0.1539, "step": 2627 }, { "epoch": 4.233588401127668, "grad_norm": 0.22398176140606088, "learning_rate": 2.7588504707367913e-06, "loss": 0.155, "step": 2628 }, { "epoch": 4.235199355618204, "grad_norm": 0.21834762012587516, "learning_rate": 2.747447839039339e-06, "loss": 0.1572, "step": 2629 }, { "epoch": 4.23681031010874, "grad_norm": 0.21892692454842144, "learning_rate": 2.7360670821696422e-06, "loss": 0.1458, "step": 2630 }, { "epoch": 4.238421264599275, "grad_norm": 0.22088363148564538, "learning_rate": 2.724708214557572e-06, "loss": 0.1455, "step": 2631 }, { "epoch": 4.2400322190898105, "grad_norm": 0.23026390864295404, "learning_rate": 2.71337125060525e-06, "loss": 0.1396, "step": 2632 }, { "epoch": 4.241643173580346, "grad_norm": 0.2290115443147785, "learning_rate": 2.702056204687018e-06, "loss": 0.1526, "step": 2633 }, { "epoch": 4.243254128070882, "grad_norm": 0.2329308138537628, "learning_rate": 2.6907630911494287e-06, "loss": 0.156, "step": 2634 }, { "epoch": 4.244865082561418, "grad_norm": 0.2129377543079953, "learning_rate": 2.679491924311226e-06, "loss": 0.1425, "step": 2635 }, { "epoch": 4.246476037051953, "grad_norm": 0.2277121827224005, "learning_rate": 2.668242718463341e-06, "loss": 0.1403, "step": 2636 }, { "epoch": 4.248086991542489, "grad_norm": 0.22813667865109089, "learning_rate": 2.657015487868835e-06, "loss": 0.1505, "step": 2637 }, { "epoch": 4.249697946033025, "grad_norm": 0.22983529188220994, "learning_rate": 2.6458102467629275e-06, "loss": 0.1402, "step": 2638 }, { "epoch": 4.2513089005235605, "grad_norm": 0.21977565245784658, "learning_rate": 2.6346270093529457e-06, "loss": 0.1324, "step": 2639 }, { "epoch": 4.252919855014095, "grad_norm": 0.2170974293585535, "learning_rate": 2.623465789818327e-06, "loss": 0.141, "step": 2640 }, { "epoch": 4.254530809504631, "grad_norm": 0.2310399402068678, "learning_rate": 2.6123266023105774e-06, "loss": 0.1571, "step": 2641 }, { "epoch": 4.256141763995167, "grad_norm": 0.22177073988062584, "learning_rate": 2.6012094609532845e-06, "loss": 0.1489, "step": 2642 }, { "epoch": 4.257752718485703, "grad_norm": 0.2151846397977056, "learning_rate": 2.5901143798420792e-06, "loss": 0.1506, "step": 2643 }, { "epoch": 4.259363672976239, "grad_norm": 0.21855697442042452, "learning_rate": 2.579041373044613e-06, "loss": 0.1439, "step": 2644 }, { "epoch": 4.260974627466774, "grad_norm": 0.2208457852679748, "learning_rate": 2.5679904546005507e-06, "loss": 0.1447, "step": 2645 }, { "epoch": 4.2625855819573095, "grad_norm": 0.22198003753575765, "learning_rate": 2.5569616385215625e-06, "loss": 0.1485, "step": 2646 }, { "epoch": 4.264196536447845, "grad_norm": 0.21034373583782578, "learning_rate": 2.5459549387912843e-06, "loss": 0.1474, "step": 2647 }, { "epoch": 4.265807490938381, "grad_norm": 0.22413997527867038, "learning_rate": 2.5349703693653103e-06, "loss": 0.1454, "step": 2648 }, { "epoch": 4.267418445428917, "grad_norm": 0.22479134118096378, "learning_rate": 2.5240079441711853e-06, "loss": 0.1425, "step": 2649 }, { "epoch": 4.269029399919452, "grad_norm": 0.20947722912794117, "learning_rate": 2.5130676771083585e-06, "loss": 0.1489, "step": 2650 }, { "epoch": 4.270640354409988, "grad_norm": 0.22886596887508937, "learning_rate": 2.5021495820482057e-06, "loss": 0.1481, "step": 2651 }, { "epoch": 4.272251308900524, "grad_norm": 0.23491024852118703, "learning_rate": 2.4912536728339707e-06, "loss": 0.1446, "step": 2652 }, { "epoch": 4.2738622633910595, "grad_norm": 0.2414302632894677, "learning_rate": 2.4803799632807836e-06, "loss": 0.1521, "step": 2653 }, { "epoch": 4.275473217881595, "grad_norm": 0.21026196842028957, "learning_rate": 2.4695284671756215e-06, "loss": 0.1489, "step": 2654 }, { "epoch": 4.27708417237213, "grad_norm": 0.228563137088654, "learning_rate": 2.45869919827729e-06, "loss": 0.1428, "step": 2655 }, { "epoch": 4.278695126862666, "grad_norm": 0.21717230893266137, "learning_rate": 2.4478921703164236e-06, "loss": 0.1423, "step": 2656 }, { "epoch": 4.280306081353202, "grad_norm": 0.22308055219912287, "learning_rate": 2.4371073969954463e-06, "loss": 0.1366, "step": 2657 }, { "epoch": 4.281917035843738, "grad_norm": 0.22580670108095363, "learning_rate": 2.4263448919885745e-06, "loss": 0.1457, "step": 2658 }, { "epoch": 4.283527990334273, "grad_norm": 0.21867003917030386, "learning_rate": 2.4156046689417823e-06, "loss": 0.149, "step": 2659 }, { "epoch": 4.2851389448248085, "grad_norm": 0.21703864427650707, "learning_rate": 2.4048867414728004e-06, "loss": 0.1553, "step": 2660 }, { "epoch": 4.286749899315344, "grad_norm": 0.22892483066011338, "learning_rate": 2.394191123171081e-06, "loss": 0.1364, "step": 2661 }, { "epoch": 4.28836085380588, "grad_norm": 0.22083545583834652, "learning_rate": 2.3835178275978012e-06, "loss": 0.1294, "step": 2662 }, { "epoch": 4.289971808296416, "grad_norm": 0.22446324510789786, "learning_rate": 2.3728668682858193e-06, "loss": 0.1419, "step": 2663 }, { "epoch": 4.291582762786951, "grad_norm": 0.23116257281704497, "learning_rate": 2.3622382587396907e-06, "loss": 0.1445, "step": 2664 }, { "epoch": 4.293193717277487, "grad_norm": 0.21998960949875104, "learning_rate": 2.3516320124356186e-06, "loss": 0.151, "step": 2665 }, { "epoch": 4.294804671768023, "grad_norm": 0.2147293966021974, "learning_rate": 2.3410481428214602e-06, "loss": 0.1543, "step": 2666 }, { "epoch": 4.296415626258558, "grad_norm": 0.21872334682296563, "learning_rate": 2.330486663316702e-06, "loss": 0.1503, "step": 2667 }, { "epoch": 4.298026580749093, "grad_norm": 0.22709801396680795, "learning_rate": 2.31994758731243e-06, "loss": 0.1394, "step": 2668 }, { "epoch": 4.299637535239629, "grad_norm": 0.22239888881280748, "learning_rate": 2.309430928171341e-06, "loss": 0.1456, "step": 2669 }, { "epoch": 4.301248489730165, "grad_norm": 0.22267661198438218, "learning_rate": 2.2989366992276917e-06, "loss": 0.14, "step": 2670 }, { "epoch": 4.302859444220701, "grad_norm": 0.22089398672152466, "learning_rate": 2.288464913787316e-06, "loss": 0.1505, "step": 2671 }, { "epoch": 4.304470398711237, "grad_norm": 0.22234097375462933, "learning_rate": 2.278015585127573e-06, "loss": 0.1365, "step": 2672 }, { "epoch": 4.306081353201772, "grad_norm": 0.2268142943570914, "learning_rate": 2.2675887264973694e-06, "loss": 0.1521, "step": 2673 }, { "epoch": 4.3076923076923075, "grad_norm": 0.20658475916341448, "learning_rate": 2.257184351117101e-06, "loss": 0.1387, "step": 2674 }, { "epoch": 4.309303262182843, "grad_norm": 0.2302415242479479, "learning_rate": 2.246802472178675e-06, "loss": 0.1446, "step": 2675 }, { "epoch": 4.310914216673379, "grad_norm": 0.22560731900901113, "learning_rate": 2.236443102845458e-06, "loss": 0.1525, "step": 2676 }, { "epoch": 4.312525171163915, "grad_norm": 0.2234748667770128, "learning_rate": 2.2261062562522896e-06, "loss": 0.1549, "step": 2677 }, { "epoch": 4.31413612565445, "grad_norm": 0.22870884705411906, "learning_rate": 2.21579194550545e-06, "loss": 0.1386, "step": 2678 }, { "epoch": 4.315747080144986, "grad_norm": 0.2224606918323668, "learning_rate": 2.2055001836826364e-06, "loss": 0.1455, "step": 2679 }, { "epoch": 4.317358034635522, "grad_norm": 0.22036372420401462, "learning_rate": 2.19523098383297e-06, "loss": 0.1508, "step": 2680 }, { "epoch": 4.318968989126057, "grad_norm": 0.21510144959446054, "learning_rate": 2.1849843589769513e-06, "loss": 0.143, "step": 2681 }, { "epoch": 4.320579943616593, "grad_norm": 0.2359493205380365, "learning_rate": 2.1747603221064684e-06, "loss": 0.1585, "step": 2682 }, { "epoch": 4.322190898107128, "grad_norm": 0.21593001357688618, "learning_rate": 2.164558886184762e-06, "loss": 0.1323, "step": 2683 }, { "epoch": 4.323801852597664, "grad_norm": 0.2258254732968412, "learning_rate": 2.1543800641464263e-06, "loss": 0.141, "step": 2684 }, { "epoch": 4.3254128070882, "grad_norm": 0.22439808272135298, "learning_rate": 2.1442238688973682e-06, "loss": 0.1396, "step": 2685 }, { "epoch": 4.327023761578736, "grad_norm": 0.21143922501895449, "learning_rate": 2.1340903133148205e-06, "loss": 0.1527, "step": 2686 }, { "epoch": 4.328634716069271, "grad_norm": 0.22994834674226478, "learning_rate": 2.123979410247301e-06, "loss": 0.1544, "step": 2687 }, { "epoch": 4.3302456705598065, "grad_norm": 0.22793741030782735, "learning_rate": 2.1138911725146106e-06, "loss": 0.156, "step": 2688 }, { "epoch": 4.331856625050342, "grad_norm": 0.22320356349655823, "learning_rate": 2.103825612907815e-06, "loss": 0.1474, "step": 2689 }, { "epoch": 4.333467579540878, "grad_norm": 0.23523038616188513, "learning_rate": 2.093782744189217e-06, "loss": 0.1417, "step": 2690 }, { "epoch": 4.335078534031414, "grad_norm": 0.23099424056260986, "learning_rate": 2.0837625790923543e-06, "loss": 0.1432, "step": 2691 }, { "epoch": 4.336689488521949, "grad_norm": 0.21683386647539887, "learning_rate": 2.0737651303219787e-06, "loss": 0.1584, "step": 2692 }, { "epoch": 4.338300443012485, "grad_norm": 0.22280820135116566, "learning_rate": 2.0637904105540428e-06, "loss": 0.1394, "step": 2693 }, { "epoch": 4.339911397503021, "grad_norm": 0.2281065955008403, "learning_rate": 2.053838432435673e-06, "loss": 0.1492, "step": 2694 }, { "epoch": 4.341522351993556, "grad_norm": 0.20990894608507565, "learning_rate": 2.0439092085851685e-06, "loss": 0.1541, "step": 2695 }, { "epoch": 4.343133306484092, "grad_norm": 0.2410571666539787, "learning_rate": 2.0340027515919704e-06, "loss": 0.1504, "step": 2696 }, { "epoch": 4.344744260974627, "grad_norm": 0.21521786871981052, "learning_rate": 2.024119074016664e-06, "loss": 0.1458, "step": 2697 }, { "epoch": 4.346355215465163, "grad_norm": 0.23352779441098917, "learning_rate": 2.014258188390936e-06, "loss": 0.1319, "step": 2698 }, { "epoch": 4.347966169955699, "grad_norm": 0.21861563397638975, "learning_rate": 2.0044201072175884e-06, "loss": 0.1493, "step": 2699 }, { "epoch": 4.349577124446235, "grad_norm": 0.23187235646614845, "learning_rate": 1.9946048429705133e-06, "loss": 0.1424, "step": 2700 }, { "epoch": 4.35118807893677, "grad_norm": 0.23314062934543647, "learning_rate": 1.984812408094656e-06, "loss": 0.1443, "step": 2701 }, { "epoch": 4.3527990334273055, "grad_norm": 0.22716318100135857, "learning_rate": 1.975042815006023e-06, "loss": 0.1506, "step": 2702 }, { "epoch": 4.354409987917841, "grad_norm": 0.22789140251356194, "learning_rate": 1.9652960760916627e-06, "loss": 0.1509, "step": 2703 }, { "epoch": 4.356020942408377, "grad_norm": 0.2298570051084699, "learning_rate": 1.95557220370965e-06, "loss": 0.1388, "step": 2704 }, { "epoch": 4.357631896898913, "grad_norm": 0.22855340474841207, "learning_rate": 1.945871210189054e-06, "loss": 0.1609, "step": 2705 }, { "epoch": 4.359242851389448, "grad_norm": 0.22885905350588037, "learning_rate": 1.9361931078299443e-06, "loss": 0.166, "step": 2706 }, { "epoch": 4.360853805879984, "grad_norm": 0.21861981366054756, "learning_rate": 1.92653790890337e-06, "loss": 0.138, "step": 2707 }, { "epoch": 4.36246476037052, "grad_norm": 0.23241034579490025, "learning_rate": 1.916905625651331e-06, "loss": 0.148, "step": 2708 }, { "epoch": 4.364075714861055, "grad_norm": 0.22772695647417238, "learning_rate": 1.9072962702867714e-06, "loss": 0.1388, "step": 2709 }, { "epoch": 4.365686669351591, "grad_norm": 0.21875331128783337, "learning_rate": 1.8977098549935745e-06, "loss": 0.1417, "step": 2710 }, { "epoch": 4.367297623842126, "grad_norm": 0.22606749157296668, "learning_rate": 1.8881463919265374e-06, "loss": 0.1512, "step": 2711 }, { "epoch": 4.368908578332662, "grad_norm": 0.232376385563654, "learning_rate": 1.8786058932113428e-06, "loss": 0.1519, "step": 2712 }, { "epoch": 4.370519532823198, "grad_norm": 0.22299262190690977, "learning_rate": 1.8690883709445652e-06, "loss": 0.137, "step": 2713 }, { "epoch": 4.372130487313734, "grad_norm": 0.22002068793259277, "learning_rate": 1.859593837193645e-06, "loss": 0.1624, "step": 2714 }, { "epoch": 4.373741441804269, "grad_norm": 0.21192028521319847, "learning_rate": 1.850122303996882e-06, "loss": 0.1363, "step": 2715 }, { "epoch": 4.375352396294804, "grad_norm": 0.21021284454907238, "learning_rate": 1.8406737833634024e-06, "loss": 0.142, "step": 2716 }, { "epoch": 4.37696335078534, "grad_norm": 0.21658493528620826, "learning_rate": 1.8312482872731553e-06, "loss": 0.1456, "step": 2717 }, { "epoch": 4.378574305275876, "grad_norm": 0.226219598466499, "learning_rate": 1.8218458276769091e-06, "loss": 0.1518, "step": 2718 }, { "epoch": 4.380185259766412, "grad_norm": 0.21769790494716323, "learning_rate": 1.8124664164962124e-06, "loss": 0.1481, "step": 2719 }, { "epoch": 4.381796214256947, "grad_norm": 0.22650672628751944, "learning_rate": 1.803110065623388e-06, "loss": 0.1566, "step": 2720 }, { "epoch": 4.383407168747483, "grad_norm": 0.2125040635428062, "learning_rate": 1.7937767869215284e-06, "loss": 0.1363, "step": 2721 }, { "epoch": 4.3850181232380185, "grad_norm": 0.22322895400847495, "learning_rate": 1.784466592224472e-06, "loss": 0.1463, "step": 2722 }, { "epoch": 4.386629077728554, "grad_norm": 0.21766007587217553, "learning_rate": 1.7751794933367828e-06, "loss": 0.1509, "step": 2723 }, { "epoch": 4.38824003221909, "grad_norm": 0.22239483211233982, "learning_rate": 1.7659155020337392e-06, "loss": 0.1552, "step": 2724 }, { "epoch": 4.389850986709625, "grad_norm": 0.22028648628947187, "learning_rate": 1.7566746300613325e-06, "loss": 0.1551, "step": 2725 }, { "epoch": 4.391461941200161, "grad_norm": 0.2212460302725368, "learning_rate": 1.7474568891362342e-06, "loss": 0.1388, "step": 2726 }, { "epoch": 4.393072895690697, "grad_norm": 0.2215072355482207, "learning_rate": 1.738262290945787e-06, "loss": 0.1539, "step": 2727 }, { "epoch": 4.394683850181233, "grad_norm": 0.21970291081765841, "learning_rate": 1.7290908471479805e-06, "loss": 0.1612, "step": 2728 }, { "epoch": 4.3962948046717685, "grad_norm": 0.23823342781119003, "learning_rate": 1.7199425693714733e-06, "loss": 0.1489, "step": 2729 }, { "epoch": 4.397905759162303, "grad_norm": 0.21723293567882335, "learning_rate": 1.7108174692155266e-06, "loss": 0.1489, "step": 2730 }, { "epoch": 4.399516713652839, "grad_norm": 0.22473337832629095, "learning_rate": 1.701715558250019e-06, "loss": 0.1339, "step": 2731 }, { "epoch": 4.401127668143375, "grad_norm": 0.23218370282919404, "learning_rate": 1.6926368480154344e-06, "loss": 0.1534, "step": 2732 }, { "epoch": 4.402738622633911, "grad_norm": 0.22527228406674732, "learning_rate": 1.683581350022838e-06, "loss": 0.1492, "step": 2733 }, { "epoch": 4.404349577124446, "grad_norm": 0.22299618930067308, "learning_rate": 1.674549075753862e-06, "loss": 0.1559, "step": 2734 }, { "epoch": 4.405960531614982, "grad_norm": 0.2173548439071365, "learning_rate": 1.6655400366606867e-06, "loss": 0.1378, "step": 2735 }, { "epoch": 4.4075714861055175, "grad_norm": 0.21748819640917166, "learning_rate": 1.656554244166042e-06, "loss": 0.1418, "step": 2736 }, { "epoch": 4.409182440596053, "grad_norm": 0.22286227976513667, "learning_rate": 1.6475917096631855e-06, "loss": 0.1665, "step": 2737 }, { "epoch": 4.410793395086589, "grad_norm": 0.21834466918449652, "learning_rate": 1.6386524445158714e-06, "loss": 0.1451, "step": 2738 }, { "epoch": 4.412404349577124, "grad_norm": 0.2155920749793083, "learning_rate": 1.6297364600583554e-06, "loss": 0.1488, "step": 2739 }, { "epoch": 4.41401530406766, "grad_norm": 0.21694950266493748, "learning_rate": 1.620843767595388e-06, "loss": 0.1281, "step": 2740 }, { "epoch": 4.415626258558196, "grad_norm": 0.22513033325891293, "learning_rate": 1.6119743784021725e-06, "loss": 0.1465, "step": 2741 }, { "epoch": 4.417237213048732, "grad_norm": 0.2110624349498471, "learning_rate": 1.6031283037243684e-06, "loss": 0.1494, "step": 2742 }, { "epoch": 4.418848167539267, "grad_norm": 0.22603749621498606, "learning_rate": 1.594305554778075e-06, "loss": 0.1497, "step": 2743 }, { "epoch": 4.420459122029802, "grad_norm": 0.23203113436121725, "learning_rate": 1.5855061427498263e-06, "loss": 0.1441, "step": 2744 }, { "epoch": 4.422070076520338, "grad_norm": 0.23890073771122725, "learning_rate": 1.5767300787965512e-06, "loss": 0.1569, "step": 2745 }, { "epoch": 4.423681031010874, "grad_norm": 0.21934839826909436, "learning_rate": 1.5679773740455817e-06, "loss": 0.156, "step": 2746 }, { "epoch": 4.42529198550141, "grad_norm": 0.21682203320832888, "learning_rate": 1.5592480395946342e-06, "loss": 0.1399, "step": 2747 }, { "epoch": 4.426902939991945, "grad_norm": 0.21588619784464438, "learning_rate": 1.5505420865117993e-06, "loss": 0.1427, "step": 2748 }, { "epoch": 4.428513894482481, "grad_norm": 0.2196132757827625, "learning_rate": 1.541859525835505e-06, "loss": 0.1416, "step": 2749 }, { "epoch": 4.4301248489730165, "grad_norm": 0.22650353348090568, "learning_rate": 1.5332003685745279e-06, "loss": 0.1428, "step": 2750 }, { "epoch": 4.431735803463552, "grad_norm": 0.2163231272238479, "learning_rate": 1.524564625707985e-06, "loss": 0.1592, "step": 2751 }, { "epoch": 4.433346757954088, "grad_norm": 0.21345314169858762, "learning_rate": 1.5159523081852867e-06, "loss": 0.1491, "step": 2752 }, { "epoch": 4.434957712444623, "grad_norm": 0.21724471459393288, "learning_rate": 1.5073634269261427e-06, "loss": 0.1437, "step": 2753 }, { "epoch": 4.436568666935159, "grad_norm": 0.21107110747167726, "learning_rate": 1.4987979928205599e-06, "loss": 0.1498, "step": 2754 }, { "epoch": 4.438179621425695, "grad_norm": 0.23243728370562228, "learning_rate": 1.4902560167288105e-06, "loss": 0.1445, "step": 2755 }, { "epoch": 4.439790575916231, "grad_norm": 0.20420200944872982, "learning_rate": 1.4817375094814202e-06, "loss": 0.1411, "step": 2756 }, { "epoch": 4.441401530406766, "grad_norm": 0.23431831960543478, "learning_rate": 1.473242481879158e-06, "loss": 0.1596, "step": 2757 }, { "epoch": 4.443012484897301, "grad_norm": 0.22563203510054342, "learning_rate": 1.464770944693028e-06, "loss": 0.1306, "step": 2758 }, { "epoch": 4.444623439387837, "grad_norm": 0.21898437843550095, "learning_rate": 1.4563229086642538e-06, "loss": 0.1639, "step": 2759 }, { "epoch": 4.446234393878373, "grad_norm": 0.21799291473907836, "learning_rate": 1.4478983845042493e-06, "loss": 0.1409, "step": 2760 }, { "epoch": 4.447845348368909, "grad_norm": 0.21834671784511098, "learning_rate": 1.439497382894617e-06, "loss": 0.1451, "step": 2761 }, { "epoch": 4.449456302859444, "grad_norm": 0.21784261633030536, "learning_rate": 1.4311199144871534e-06, "loss": 0.1498, "step": 2762 }, { "epoch": 4.45106725734998, "grad_norm": 0.21551061217502962, "learning_rate": 1.4227659899038004e-06, "loss": 0.1483, "step": 2763 }, { "epoch": 4.4526782118405155, "grad_norm": 0.22482929437855542, "learning_rate": 1.4144356197366494e-06, "loss": 0.1447, "step": 2764 }, { "epoch": 4.454289166331051, "grad_norm": 0.21817533638318648, "learning_rate": 1.406128814547929e-06, "loss": 0.1583, "step": 2765 }, { "epoch": 4.455900120821587, "grad_norm": 0.22826403127027628, "learning_rate": 1.397845584869999e-06, "loss": 0.1391, "step": 2766 }, { "epoch": 4.457511075312122, "grad_norm": 0.22368572318707833, "learning_rate": 1.3895859412053093e-06, "loss": 0.1378, "step": 2767 }, { "epoch": 4.459122029802658, "grad_norm": 0.2249853087961331, "learning_rate": 1.3813498940264136e-06, "loss": 0.1463, "step": 2768 }, { "epoch": 4.460732984293194, "grad_norm": 0.2185064787669113, "learning_rate": 1.3731374537759544e-06, "loss": 0.1396, "step": 2769 }, { "epoch": 4.46234393878373, "grad_norm": 0.21864596897479208, "learning_rate": 1.3649486308666314e-06, "loss": 0.1565, "step": 2770 }, { "epoch": 4.4639548932742645, "grad_norm": 0.22727139547878636, "learning_rate": 1.3567834356812015e-06, "loss": 0.1437, "step": 2771 }, { "epoch": 4.4655658477648, "grad_norm": 0.2233584478305029, "learning_rate": 1.3486418785724697e-06, "loss": 0.144, "step": 2772 }, { "epoch": 4.467176802255336, "grad_norm": 0.351620925278512, "learning_rate": 1.3405239698632654e-06, "loss": 0.1532, "step": 2773 }, { "epoch": 4.468787756745872, "grad_norm": 0.2215381503484612, "learning_rate": 1.332429719846433e-06, "loss": 0.1398, "step": 2774 }, { "epoch": 4.470398711236408, "grad_norm": 0.2232986049913044, "learning_rate": 1.3243591387848164e-06, "loss": 0.1485, "step": 2775 }, { "epoch": 4.472009665726943, "grad_norm": 0.2202202269211146, "learning_rate": 1.3163122369112591e-06, "loss": 0.1574, "step": 2776 }, { "epoch": 4.473620620217479, "grad_norm": 0.22918240497697517, "learning_rate": 1.3082890244285773e-06, "loss": 0.1447, "step": 2777 }, { "epoch": 4.4752315747080145, "grad_norm": 0.22566546540240492, "learning_rate": 1.300289511509547e-06, "loss": 0.1463, "step": 2778 }, { "epoch": 4.47684252919855, "grad_norm": 0.21415981839053075, "learning_rate": 1.292313708296893e-06, "loss": 0.1542, "step": 2779 }, { "epoch": 4.478453483689086, "grad_norm": 0.221053938368025, "learning_rate": 1.2843616249032874e-06, "loss": 0.1522, "step": 2780 }, { "epoch": 4.480064438179621, "grad_norm": 0.21618116765332307, "learning_rate": 1.2764332714113258e-06, "loss": 0.1546, "step": 2781 }, { "epoch": 4.481675392670157, "grad_norm": 0.21974552955078566, "learning_rate": 1.2685286578735045e-06, "loss": 0.143, "step": 2782 }, { "epoch": 4.483286347160693, "grad_norm": 0.22078570986976645, "learning_rate": 1.2606477943122352e-06, "loss": 0.1449, "step": 2783 }, { "epoch": 4.484897301651229, "grad_norm": 0.22916711988692676, "learning_rate": 1.2527906907198094e-06, "loss": 0.1422, "step": 2784 }, { "epoch": 4.486508256141764, "grad_norm": 0.22001716488332673, "learning_rate": 1.244957357058394e-06, "loss": 0.1442, "step": 2785 }, { "epoch": 4.488119210632299, "grad_norm": 0.22386383475134952, "learning_rate": 1.2371478032600083e-06, "loss": 0.1601, "step": 2786 }, { "epoch": 4.489730165122835, "grad_norm": 0.21664981131161265, "learning_rate": 1.2293620392265338e-06, "loss": 0.1415, "step": 2787 }, { "epoch": 4.491341119613371, "grad_norm": 0.22629217846588368, "learning_rate": 1.2216000748296897e-06, "loss": 0.1433, "step": 2788 }, { "epoch": 4.492952074103907, "grad_norm": 0.235019336289307, "learning_rate": 1.213861919911008e-06, "loss": 0.1494, "step": 2789 }, { "epoch": 4.494563028594442, "grad_norm": 0.21964906508530987, "learning_rate": 1.2061475842818337e-06, "loss": 0.1425, "step": 2790 }, { "epoch": 4.496173983084978, "grad_norm": 0.21452579914651837, "learning_rate": 1.1984570777233184e-06, "loss": 0.1548, "step": 2791 }, { "epoch": 4.4977849375755135, "grad_norm": 0.22707174901989768, "learning_rate": 1.1907904099863999e-06, "loss": 0.1631, "step": 2792 }, { "epoch": 4.499395892066049, "grad_norm": 0.2155179519101744, "learning_rate": 1.18314759079178e-06, "loss": 0.1443, "step": 2793 }, { "epoch": 4.501006846556585, "grad_norm": 0.2226635289792672, "learning_rate": 1.1755286298299339e-06, "loss": 0.161, "step": 2794 }, { "epoch": 4.50261780104712, "grad_norm": 0.22080540831878404, "learning_rate": 1.1679335367610855e-06, "loss": 0.1413, "step": 2795 }, { "epoch": 4.504228755537656, "grad_norm": 0.22308278536499784, "learning_rate": 1.1603623212151872e-06, "loss": 0.1365, "step": 2796 }, { "epoch": 4.505839710028192, "grad_norm": 0.23108606533673448, "learning_rate": 1.152814992791922e-06, "loss": 0.1393, "step": 2797 }, { "epoch": 4.507450664518728, "grad_norm": 0.21861727723987914, "learning_rate": 1.1452915610606885e-06, "loss": 0.1497, "step": 2798 }, { "epoch": 4.5090616190092625, "grad_norm": 0.2218833710624545, "learning_rate": 1.1377920355605854e-06, "loss": 0.1468, "step": 2799 }, { "epoch": 4.510672573499798, "grad_norm": 0.21908566335116011, "learning_rate": 1.1303164258003974e-06, "loss": 0.1473, "step": 2800 }, { "epoch": 4.512283527990334, "grad_norm": 0.21884018169901615, "learning_rate": 1.1228647412585847e-06, "loss": 0.1318, "step": 2801 }, { "epoch": 4.51389448248087, "grad_norm": 0.22464962629724394, "learning_rate": 1.1154369913832762e-06, "loss": 0.1461, "step": 2802 }, { "epoch": 4.515505436971406, "grad_norm": 0.2233609804572511, "learning_rate": 1.1080331855922588e-06, "loss": 0.1448, "step": 2803 }, { "epoch": 4.517116391461942, "grad_norm": 0.21225483736906917, "learning_rate": 1.100653333272943e-06, "loss": 0.1452, "step": 2804 }, { "epoch": 4.518727345952477, "grad_norm": 0.2269518895274179, "learning_rate": 1.0932974437823884e-06, "loss": 0.1451, "step": 2805 }, { "epoch": 4.520338300443012, "grad_norm": 0.2261411870254825, "learning_rate": 1.0859655264472568e-06, "loss": 0.1538, "step": 2806 }, { "epoch": 4.521949254933548, "grad_norm": 0.22345780419863742, "learning_rate": 1.078657590563823e-06, "loss": 0.1295, "step": 2807 }, { "epoch": 4.523560209424084, "grad_norm": 0.21931553583779848, "learning_rate": 1.0713736453979528e-06, "loss": 0.1419, "step": 2808 }, { "epoch": 4.525171163914619, "grad_norm": 0.22080634847733432, "learning_rate": 1.064113700185092e-06, "loss": 0.1439, "step": 2809 }, { "epoch": 4.526782118405155, "grad_norm": 0.2261480900316319, "learning_rate": 1.0568777641302663e-06, "loss": 0.153, "step": 2810 }, { "epoch": 4.528393072895691, "grad_norm": 0.22439464056795858, "learning_rate": 1.0496658464080434e-06, "loss": 0.1537, "step": 2811 }, { "epoch": 4.5300040273862265, "grad_norm": 0.2248930635120482, "learning_rate": 1.0424779561625465e-06, "loss": 0.1488, "step": 2812 }, { "epoch": 4.531614981876762, "grad_norm": 0.2069570637163575, "learning_rate": 1.0353141025074364e-06, "loss": 0.1647, "step": 2813 }, { "epoch": 4.533225936367297, "grad_norm": 0.23132704696687886, "learning_rate": 1.0281742945258987e-06, "loss": 0.1462, "step": 2814 }, { "epoch": 4.534836890857833, "grad_norm": 0.21337858571941506, "learning_rate": 1.0210585412706187e-06, "loss": 0.1487, "step": 2815 }, { "epoch": 4.536447845348369, "grad_norm": 0.22309163223275602, "learning_rate": 1.0139668517637991e-06, "loss": 0.144, "step": 2816 }, { "epoch": 4.538058799838905, "grad_norm": 0.21907142466944238, "learning_rate": 1.006899234997114e-06, "loss": 0.1446, "step": 2817 }, { "epoch": 4.53966975432944, "grad_norm": 0.217901044651781, "learning_rate": 9.998556999317334e-07, "loss": 0.1367, "step": 2818 }, { "epoch": 4.541280708819976, "grad_norm": 0.21574027266533383, "learning_rate": 9.928362554982796e-07, "loss": 0.1456, "step": 2819 }, { "epoch": 4.542891663310511, "grad_norm": 0.22778052410416838, "learning_rate": 9.858409105968337e-07, "loss": 0.1468, "step": 2820 }, { "epoch": 4.544502617801047, "grad_norm": 0.23571660112324896, "learning_rate": 9.788696740969295e-07, "loss": 0.1455, "step": 2821 }, { "epoch": 4.546113572291583, "grad_norm": 0.2201132602209927, "learning_rate": 9.71922554837521e-07, "loss": 0.1446, "step": 2822 }, { "epoch": 4.547724526782118, "grad_norm": 0.2170027257100728, "learning_rate": 9.649995616269847e-07, "loss": 0.1421, "step": 2823 }, { "epoch": 4.549335481272654, "grad_norm": 0.21147627065327013, "learning_rate": 9.581007032431144e-07, "loss": 0.1486, "step": 2824 }, { "epoch": 4.55094643576319, "grad_norm": 0.21922962408896876, "learning_rate": 9.512259884331021e-07, "loss": 0.138, "step": 2825 }, { "epoch": 4.5525573902537255, "grad_norm": 0.21981668661472575, "learning_rate": 9.443754259135197e-07, "loss": 0.1415, "step": 2826 }, { "epoch": 4.554168344744261, "grad_norm": 0.21062790215118363, "learning_rate": 9.375490243703255e-07, "loss": 0.1388, "step": 2827 }, { "epoch": 4.555779299234796, "grad_norm": 0.20844877721721428, "learning_rate": 9.307467924588364e-07, "loss": 0.1425, "step": 2828 }, { "epoch": 4.557390253725332, "grad_norm": 0.2169075920951353, "learning_rate": 9.239687388037311e-07, "loss": 0.1451, "step": 2829 }, { "epoch": 4.559001208215868, "grad_norm": 0.2209905968566893, "learning_rate": 9.172148719990237e-07, "loss": 0.1419, "step": 2830 }, { "epoch": 4.560612162706404, "grad_norm": 0.22168194810993105, "learning_rate": 9.104852006080689e-07, "loss": 0.1425, "step": 2831 }, { "epoch": 4.56222311719694, "grad_norm": 0.23429944659967786, "learning_rate": 9.03779733163539e-07, "loss": 0.1352, "step": 2832 }, { "epoch": 4.563834071687475, "grad_norm": 0.22251765674022653, "learning_rate": 8.970984781674197e-07, "loss": 0.158, "step": 2833 }, { "epoch": 4.56544502617801, "grad_norm": 0.21439636612358917, "learning_rate": 8.904414440909992e-07, "loss": 0.1658, "step": 2834 }, { "epoch": 4.567055980668546, "grad_norm": 0.21055760967769271, "learning_rate": 8.83808639374848e-07, "loss": 0.1406, "step": 2835 }, { "epoch": 4.568666935159082, "grad_norm": 0.22812053637083266, "learning_rate": 8.772000724288277e-07, "loss": 0.1551, "step": 2836 }, { "epoch": 4.570277889649617, "grad_norm": 0.23203123173864854, "learning_rate": 8.706157516320557e-07, "loss": 0.1457, "step": 2837 }, { "epoch": 4.571888844140153, "grad_norm": 0.21647238240445146, "learning_rate": 8.640556853329185e-07, "loss": 0.1366, "step": 2838 }, { "epoch": 4.573499798630689, "grad_norm": 0.2262132480851578, "learning_rate": 8.575198818490405e-07, "loss": 0.1532, "step": 2839 }, { "epoch": 4.5751107531212245, "grad_norm": 0.22036901946478252, "learning_rate": 8.510083494672905e-07, "loss": 0.15, "step": 2840 }, { "epoch": 4.57672170761176, "grad_norm": 0.20881526264861075, "learning_rate": 8.445210964437556e-07, "loss": 0.1572, "step": 2841 }, { "epoch": 4.578332662102295, "grad_norm": 0.21997273950205165, "learning_rate": 8.380581310037472e-07, "loss": 0.1483, "step": 2842 }, { "epoch": 4.579943616592831, "grad_norm": 0.2115529692321746, "learning_rate": 8.316194613417749e-07, "loss": 0.15, "step": 2843 }, { "epoch": 4.581554571083367, "grad_norm": 0.2101065492853683, "learning_rate": 8.252050956215462e-07, "loss": 0.1327, "step": 2844 }, { "epoch": 4.583165525573903, "grad_norm": 0.22367025563934423, "learning_rate": 8.188150419759577e-07, "loss": 0.1539, "step": 2845 }, { "epoch": 4.584776480064438, "grad_norm": 0.21257442264917925, "learning_rate": 8.124493085070706e-07, "loss": 0.1459, "step": 2846 }, { "epoch": 4.5863874345549736, "grad_norm": 0.2175995649893487, "learning_rate": 8.061079032861197e-07, "loss": 0.1362, "step": 2847 }, { "epoch": 4.587998389045509, "grad_norm": 0.22731145011308046, "learning_rate": 7.997908343534844e-07, "loss": 0.1416, "step": 2848 }, { "epoch": 4.589609343536045, "grad_norm": 0.230612422282459, "learning_rate": 7.934981097186977e-07, "loss": 0.1616, "step": 2849 }, { "epoch": 4.591220298026581, "grad_norm": 0.22856353525173428, "learning_rate": 7.872297373604154e-07, "loss": 0.1481, "step": 2850 }, { "epoch": 4.592831252517117, "grad_norm": 0.21655429075000812, "learning_rate": 7.809857252264263e-07, "loss": 0.1478, "step": 2851 }, { "epoch": 4.594442207007652, "grad_norm": 0.2267420054274175, "learning_rate": 7.747660812336221e-07, "loss": 0.1386, "step": 2852 }, { "epoch": 4.596053161498188, "grad_norm": 0.21517894418513714, "learning_rate": 7.685708132680125e-07, "loss": 0.1575, "step": 2853 }, { "epoch": 4.5976641159887235, "grad_norm": 0.22771611741406794, "learning_rate": 7.623999291846829e-07, "loss": 0.1523, "step": 2854 }, { "epoch": 4.599275070479259, "grad_norm": 0.22020469314547333, "learning_rate": 7.562534368078167e-07, "loss": 0.1312, "step": 2855 }, { "epoch": 4.600886024969794, "grad_norm": 0.21911457185337718, "learning_rate": 7.501313439306623e-07, "loss": 0.1471, "step": 2856 }, { "epoch": 4.60249697946033, "grad_norm": 0.220369515835811, "learning_rate": 7.440336583155306e-07, "loss": 0.1467, "step": 2857 }, { "epoch": 4.604107933950866, "grad_norm": 0.2429223255819812, "learning_rate": 7.379603876937969e-07, "loss": 0.1393, "step": 2858 }, { "epoch": 4.605718888441402, "grad_norm": 0.22295148519094363, "learning_rate": 7.319115397658639e-07, "loss": 0.1477, "step": 2859 }, { "epoch": 4.607329842931938, "grad_norm": 0.2155898298714699, "learning_rate": 7.258871222011832e-07, "loss": 0.1365, "step": 2860 }, { "epoch": 4.6089407974224725, "grad_norm": 0.21243596261299383, "learning_rate": 7.198871426382203e-07, "loss": 0.1558, "step": 2861 }, { "epoch": 4.610551751913008, "grad_norm": 0.22122501445417958, "learning_rate": 7.139116086844655e-07, "loss": 0.161, "step": 2862 }, { "epoch": 4.612162706403544, "grad_norm": 0.21598365968425512, "learning_rate": 7.079605279163982e-07, "loss": 0.1493, "step": 2863 }, { "epoch": 4.61377366089408, "grad_norm": 0.2597213786138549, "learning_rate": 7.020339078795136e-07, "loss": 0.1559, "step": 2864 }, { "epoch": 4.615384615384615, "grad_norm": 0.22332642167151248, "learning_rate": 6.961317560882741e-07, "loss": 0.1501, "step": 2865 }, { "epoch": 4.616995569875151, "grad_norm": 0.21904255318165475, "learning_rate": 6.902540800261292e-07, "loss": 0.1627, "step": 2866 }, { "epoch": 4.618606524365687, "grad_norm": 0.2257051902642088, "learning_rate": 6.844008871454955e-07, "loss": 0.1383, "step": 2867 }, { "epoch": 4.6202174788562225, "grad_norm": 0.2028287385389341, "learning_rate": 6.785721848677406e-07, "loss": 0.1522, "step": 2868 }, { "epoch": 4.621828433346758, "grad_norm": 0.21967388469905091, "learning_rate": 6.727679805831821e-07, "loss": 0.136, "step": 2869 }, { "epoch": 4.623439387837293, "grad_norm": 0.2254164663996422, "learning_rate": 6.669882816510776e-07, "loss": 0.139, "step": 2870 }, { "epoch": 4.625050342327829, "grad_norm": 0.21946694917764376, "learning_rate": 6.61233095399616e-07, "loss": 0.1387, "step": 2871 }, { "epoch": 4.626661296818365, "grad_norm": 0.22027332476566938, "learning_rate": 6.555024291259005e-07, "loss": 0.1447, "step": 2872 }, { "epoch": 4.628272251308901, "grad_norm": 0.21396999455794583, "learning_rate": 6.497962900959542e-07, "loss": 0.1556, "step": 2873 }, { "epoch": 4.629883205799436, "grad_norm": 0.23178083820005146, "learning_rate": 6.441146855446856e-07, "loss": 0.135, "step": 2874 }, { "epoch": 4.6314941602899715, "grad_norm": 0.21644352493998795, "learning_rate": 6.384576226759165e-07, "loss": 0.1559, "step": 2875 }, { "epoch": 4.633105114780507, "grad_norm": 0.22303322809496479, "learning_rate": 6.328251086623294e-07, "loss": 0.1427, "step": 2876 }, { "epoch": 4.634716069271043, "grad_norm": 0.21071259795990352, "learning_rate": 6.272171506455005e-07, "loss": 0.1459, "step": 2877 }, { "epoch": 4.636327023761579, "grad_norm": 0.20980391695821105, "learning_rate": 6.21633755735862e-07, "loss": 0.138, "step": 2878 }, { "epoch": 4.637937978252115, "grad_norm": 0.21263059217171254, "learning_rate": 6.160749310127023e-07, "loss": 0.1484, "step": 2879 }, { "epoch": 4.63954893274265, "grad_norm": 0.204952813257361, "learning_rate": 6.105406835241545e-07, "loss": 0.1406, "step": 2880 }, { "epoch": 4.641159887233186, "grad_norm": 0.21197948810614625, "learning_rate": 6.050310202871922e-07, "loss": 0.1281, "step": 2881 }, { "epoch": 4.6427708417237215, "grad_norm": 0.20655087363236846, "learning_rate": 5.995459482876253e-07, "loss": 0.1522, "step": 2882 }, { "epoch": 4.644381796214257, "grad_norm": 0.22413109041423177, "learning_rate": 5.940854744800706e-07, "loss": 0.1642, "step": 2883 }, { "epoch": 4.645992750704792, "grad_norm": 0.21624222816321237, "learning_rate": 5.886496057879676e-07, "loss": 0.1359, "step": 2884 }, { "epoch": 4.647603705195328, "grad_norm": 0.23678058476570954, "learning_rate": 5.832383491035499e-07, "loss": 0.1397, "step": 2885 }, { "epoch": 4.649214659685864, "grad_norm": 0.21030051110070672, "learning_rate": 5.778517112878512e-07, "loss": 0.1485, "step": 2886 }, { "epoch": 4.6508256141764, "grad_norm": 0.21914807690463745, "learning_rate": 5.724896991706885e-07, "loss": 0.1456, "step": 2887 }, { "epoch": 4.652436568666936, "grad_norm": 0.21409204336065774, "learning_rate": 5.671523195506567e-07, "loss": 0.1543, "step": 2888 }, { "epoch": 4.6540475231574705, "grad_norm": 0.21075887789724632, "learning_rate": 5.618395791951159e-07, "loss": 0.15, "step": 2889 }, { "epoch": 4.655658477648006, "grad_norm": 0.22077872594297945, "learning_rate": 5.565514848401887e-07, "loss": 0.1354, "step": 2890 }, { "epoch": 4.657269432138542, "grad_norm": 0.21354385616306004, "learning_rate": 5.512880431907452e-07, "loss": 0.1408, "step": 2891 }, { "epoch": 4.658880386629078, "grad_norm": 0.212847283457817, "learning_rate": 5.460492609203982e-07, "loss": 0.1433, "step": 2892 }, { "epoch": 4.660491341119613, "grad_norm": 0.2187454345219124, "learning_rate": 5.40835144671501e-07, "loss": 0.1465, "step": 2893 }, { "epoch": 4.662102295610149, "grad_norm": 0.21640789825365564, "learning_rate": 5.356457010551253e-07, "loss": 0.1455, "step": 2894 }, { "epoch": 4.663713250100685, "grad_norm": 0.21575352229791922, "learning_rate": 5.304809366510566e-07, "loss": 0.1556, "step": 2895 }, { "epoch": 4.66532420459122, "grad_norm": 0.21723870718473914, "learning_rate": 5.253408580078035e-07, "loss": 0.1584, "step": 2896 }, { "epoch": 4.666935159081756, "grad_norm": 0.22077657920712968, "learning_rate": 5.202254716425636e-07, "loss": 0.1502, "step": 2897 }, { "epoch": 4.668546113572291, "grad_norm": 0.22750870337063364, "learning_rate": 5.151347840412224e-07, "loss": 0.1404, "step": 2898 }, { "epoch": 4.670157068062827, "grad_norm": 0.21780471723045816, "learning_rate": 5.100688016583632e-07, "loss": 0.1317, "step": 2899 }, { "epoch": 4.671768022553363, "grad_norm": 0.22372509521913855, "learning_rate": 5.05027530917237e-07, "loss": 0.1507, "step": 2900 }, { "epoch": 4.673378977043899, "grad_norm": 0.21577961205407864, "learning_rate": 5.00010978209764e-07, "loss": 0.1498, "step": 2901 }, { "epoch": 4.6749899315344345, "grad_norm": 0.22095270932077324, "learning_rate": 4.950191498965207e-07, "loss": 0.1567, "step": 2902 }, { "epoch": 4.6766008860249695, "grad_norm": 0.22385680458863616, "learning_rate": 4.900520523067376e-07, "loss": 0.1387, "step": 2903 }, { "epoch": 4.678211840515505, "grad_norm": 0.21480467606962006, "learning_rate": 4.851096917382925e-07, "loss": 0.1429, "step": 2904 }, { "epoch": 4.679822795006041, "grad_norm": 0.21972524220304457, "learning_rate": 4.801920744576949e-07, "loss": 0.1466, "step": 2905 }, { "epoch": 4.681433749496577, "grad_norm": 0.2143359141084018, "learning_rate": 4.7529920670007724e-07, "loss": 0.1559, "step": 2906 }, { "epoch": 4.683044703987113, "grad_norm": 0.2195510458192481, "learning_rate": 4.704310946692037e-07, "loss": 0.1436, "step": 2907 }, { "epoch": 4.684655658477648, "grad_norm": 0.2244650853064686, "learning_rate": 4.6558774453743684e-07, "loss": 0.1539, "step": 2908 }, { "epoch": 4.686266612968184, "grad_norm": 0.2215405869664864, "learning_rate": 4.607691624457511e-07, "loss": 0.1387, "step": 2909 }, { "epoch": 4.687877567458719, "grad_norm": 0.21906085385910223, "learning_rate": 4.559753545037171e-07, "loss": 0.1373, "step": 2910 }, { "epoch": 4.689488521949255, "grad_norm": 0.21588538081675898, "learning_rate": 4.512063267894906e-07, "loss": 0.1598, "step": 2911 }, { "epoch": 4.69109947643979, "grad_norm": 0.21543206275284182, "learning_rate": 4.4646208534980807e-07, "loss": 0.135, "step": 2912 }, { "epoch": 4.692710430930326, "grad_norm": 0.20168686340533482, "learning_rate": 4.4174263619998217e-07, "loss": 0.138, "step": 2913 }, { "epoch": 4.694321385420862, "grad_norm": 0.22114311160392375, "learning_rate": 4.3704798532388624e-07, "loss": 0.1521, "step": 2914 }, { "epoch": 4.695932339911398, "grad_norm": 0.2165961430758482, "learning_rate": 4.3237813867396117e-07, "loss": 0.1398, "step": 2915 }, { "epoch": 4.6975432944019335, "grad_norm": 0.21089525598082778, "learning_rate": 4.2773310217118394e-07, "loss": 0.1494, "step": 2916 }, { "epoch": 4.6991542488924685, "grad_norm": 0.20868547364457915, "learning_rate": 4.2311288170508336e-07, "loss": 0.1356, "step": 2917 }, { "epoch": 4.700765203383004, "grad_norm": 0.21119647807605038, "learning_rate": 4.1851748313372463e-07, "loss": 0.1551, "step": 2918 }, { "epoch": 4.70237615787354, "grad_norm": 0.21654917077953273, "learning_rate": 4.139469122836981e-07, "loss": 0.1387, "step": 2919 }, { "epoch": 4.703987112364076, "grad_norm": 0.21662833427984002, "learning_rate": 4.094011749501103e-07, "loss": 0.1502, "step": 2920 }, { "epoch": 4.705598066854611, "grad_norm": 0.20946333432367129, "learning_rate": 4.048802768965887e-07, "loss": 0.1413, "step": 2921 }, { "epoch": 4.707209021345147, "grad_norm": 0.22564245087432352, "learning_rate": 4.003842238552613e-07, "loss": 0.1545, "step": 2922 }, { "epoch": 4.708819975835683, "grad_norm": 0.22076168759631085, "learning_rate": 3.9591302152675703e-07, "loss": 0.1591, "step": 2923 }, { "epoch": 4.710430930326218, "grad_norm": 0.2097867230868185, "learning_rate": 3.9146667558019433e-07, "loss": 0.1571, "step": 2924 }, { "epoch": 4.712041884816754, "grad_norm": 0.22088949709015748, "learning_rate": 3.8704519165317923e-07, "loss": 0.1565, "step": 2925 }, { "epoch": 4.71365283930729, "grad_norm": 0.22150240058770918, "learning_rate": 3.8264857535178943e-07, "loss": 0.1324, "step": 2926 }, { "epoch": 4.715263793797825, "grad_norm": 0.20698205346690662, "learning_rate": 3.782768322505792e-07, "loss": 0.1315, "step": 2927 }, { "epoch": 4.716874748288361, "grad_norm": 0.20845720756492594, "learning_rate": 3.7392996789255673e-07, "loss": 0.1442, "step": 2928 }, { "epoch": 4.718485702778897, "grad_norm": 0.2156287236788521, "learning_rate": 3.6960798778919784e-07, "loss": 0.1499, "step": 2929 }, { "epoch": 4.7200966572694325, "grad_norm": 0.2196744399794609, "learning_rate": 3.653108974204145e-07, "loss": 0.1551, "step": 2930 }, { "epoch": 4.7217076117599674, "grad_norm": 0.22139174195462538, "learning_rate": 3.610387022345685e-07, "loss": 0.1534, "step": 2931 }, { "epoch": 4.723318566250503, "grad_norm": 0.2277220395224189, "learning_rate": 3.567914076484558e-07, "loss": 0.1498, "step": 2932 }, { "epoch": 4.724929520741039, "grad_norm": 0.22109687230688657, "learning_rate": 3.5256901904729967e-07, "loss": 0.1562, "step": 2933 }, { "epoch": 4.726540475231575, "grad_norm": 0.21580426849176798, "learning_rate": 3.483715417847422e-07, "loss": 0.1382, "step": 2934 }, { "epoch": 4.728151429722111, "grad_norm": 0.2137311430685286, "learning_rate": 3.441989811828417e-07, "loss": 0.1354, "step": 2935 }, { "epoch": 4.729762384212646, "grad_norm": 0.21046823458686373, "learning_rate": 3.4005134253206393e-07, "loss": 0.1464, "step": 2936 }, { "epoch": 4.7313733387031816, "grad_norm": 0.20552298607096997, "learning_rate": 3.3592863109128016e-07, "loss": 0.1471, "step": 2937 }, { "epoch": 4.732984293193717, "grad_norm": 0.2121097718281581, "learning_rate": 3.318308520877489e-07, "loss": 0.15, "step": 2938 }, { "epoch": 4.734595247684253, "grad_norm": 0.22848848728985083, "learning_rate": 3.277580107171163e-07, "loss": 0.1485, "step": 2939 }, { "epoch": 4.736206202174788, "grad_norm": 0.21565593846592726, "learning_rate": 3.2371011214342053e-07, "loss": 0.1582, "step": 2940 }, { "epoch": 4.737817156665324, "grad_norm": 0.22164738957939945, "learning_rate": 3.1968716149906043e-07, "loss": 0.1453, "step": 2941 }, { "epoch": 4.73942811115586, "grad_norm": 0.2168031711486235, "learning_rate": 3.156891638848092e-07, "loss": 0.1442, "step": 2942 }, { "epoch": 4.741039065646396, "grad_norm": 0.2167016023126979, "learning_rate": 3.117161243698052e-07, "loss": 0.1426, "step": 2943 }, { "epoch": 4.7426500201369315, "grad_norm": 0.21297127531599902, "learning_rate": 3.077680479915368e-07, "loss": 0.1462, "step": 2944 }, { "epoch": 4.744260974627466, "grad_norm": 0.21958491536222166, "learning_rate": 3.038449397558396e-07, "loss": 0.1338, "step": 2945 }, { "epoch": 4.745871929118002, "grad_norm": 0.21879043332580358, "learning_rate": 2.9994680463689697e-07, "loss": 0.1502, "step": 2946 }, { "epoch": 4.747482883608538, "grad_norm": 0.21967915410535746, "learning_rate": 2.9607364757722635e-07, "loss": 0.15, "step": 2947 }, { "epoch": 4.749093838099074, "grad_norm": 0.2113428697983794, "learning_rate": 2.9222547348767504e-07, "loss": 0.1267, "step": 2948 }, { "epoch": 4.750704792589609, "grad_norm": 0.21023635797177284, "learning_rate": 2.884022872474113e-07, "loss": 0.1336, "step": 2949 }, { "epoch": 4.752315747080145, "grad_norm": 0.2076654674576389, "learning_rate": 2.8460409370392405e-07, "loss": 0.1338, "step": 2950 }, { "epoch": 4.7539267015706805, "grad_norm": 0.2371363457394752, "learning_rate": 2.808308976730145e-07, "loss": 0.1463, "step": 2951 }, { "epoch": 4.755537656061216, "grad_norm": 0.21635183048072082, "learning_rate": 2.770827039387869e-07, "loss": 0.1327, "step": 2952 }, { "epoch": 4.757148610551752, "grad_norm": 0.2234269293362455, "learning_rate": 2.7335951725364185e-07, "loss": 0.1455, "step": 2953 }, { "epoch": 4.758759565042288, "grad_norm": 0.21747410916134383, "learning_rate": 2.696613423382788e-07, "loss": 0.1566, "step": 2954 }, { "epoch": 4.760370519532823, "grad_norm": 0.22281641279316053, "learning_rate": 2.6598818388168246e-07, "loss": 0.1429, "step": 2955 }, { "epoch": 4.761981474023359, "grad_norm": 0.2238241839767047, "learning_rate": 2.6234004654111854e-07, "loss": 0.1544, "step": 2956 }, { "epoch": 4.763592428513895, "grad_norm": 0.21188445273823583, "learning_rate": 2.5871693494212036e-07, "loss": 0.1399, "step": 2957 }, { "epoch": 4.7652033830044305, "grad_norm": 0.210545430187273, "learning_rate": 2.551188536785043e-07, "loss": 0.156, "step": 2958 }, { "epoch": 4.766814337494965, "grad_norm": 0.22470921530004012, "learning_rate": 2.5154580731234333e-07, "loss": 0.1457, "step": 2959 }, { "epoch": 4.768425291985501, "grad_norm": 0.21323401360872982, "learning_rate": 2.479978003739669e-07, "loss": 0.1601, "step": 2960 }, { "epoch": 4.770036246476037, "grad_norm": 0.20568485058813663, "learning_rate": 2.4447483736195877e-07, "loss": 0.1345, "step": 2961 }, { "epoch": 4.771647200966573, "grad_norm": 0.20709133915444555, "learning_rate": 2.40976922743148e-07, "loss": 0.1491, "step": 2962 }, { "epoch": 4.773258155457109, "grad_norm": 0.22004597986350832, "learning_rate": 2.3750406095260469e-07, "loss": 0.145, "step": 2963 }, { "epoch": 4.774869109947644, "grad_norm": 0.21770693112333073, "learning_rate": 2.340562563936355e-07, "loss": 0.1523, "step": 2964 }, { "epoch": 4.7764800644381795, "grad_norm": 0.20635080684518206, "learning_rate": 2.3063351343777241e-07, "loss": 0.1487, "step": 2965 }, { "epoch": 4.778091018928715, "grad_norm": 0.2116532190342573, "learning_rate": 2.272358364247773e-07, "loss": 0.1403, "step": 2966 }, { "epoch": 4.779701973419251, "grad_norm": 0.22159722188901532, "learning_rate": 2.238632296626242e-07, "loss": 0.1546, "step": 2967 }, { "epoch": 4.781312927909786, "grad_norm": 0.21591067485783777, "learning_rate": 2.205156974275058e-07, "loss": 0.1511, "step": 2968 }, { "epoch": 4.782923882400322, "grad_norm": 0.2191530577323583, "learning_rate": 2.1719324396381802e-07, "loss": 0.151, "step": 2969 }, { "epoch": 4.784534836890858, "grad_norm": 0.21158599130781444, "learning_rate": 2.138958734841623e-07, "loss": 0.1434, "step": 2970 }, { "epoch": 4.786145791381394, "grad_norm": 0.22489697455911142, "learning_rate": 2.106235901693321e-07, "loss": 0.1453, "step": 2971 }, { "epoch": 4.7877567458719295, "grad_norm": 0.20816987525634345, "learning_rate": 2.0737639816831967e-07, "loss": 0.1417, "step": 2972 }, { "epoch": 4.789367700362464, "grad_norm": 0.2208234962177496, "learning_rate": 2.0415430159829608e-07, "loss": 0.1449, "step": 2973 }, { "epoch": 4.790978654853, "grad_norm": 0.21872410598917383, "learning_rate": 2.0095730454461781e-07, "loss": 0.1539, "step": 2974 }, { "epoch": 4.792589609343536, "grad_norm": 0.2368138202783192, "learning_rate": 1.9778541106081572e-07, "loss": 0.144, "step": 2975 }, { "epoch": 4.794200563834072, "grad_norm": 0.2517246270809971, "learning_rate": 1.9463862516859277e-07, "loss": 0.1639, "step": 2976 }, { "epoch": 4.795811518324607, "grad_norm": 0.20392676903290943, "learning_rate": 1.915169508578174e-07, "loss": 0.1362, "step": 2977 }, { "epoch": 4.797422472815143, "grad_norm": 0.2251301182098052, "learning_rate": 1.8842039208651685e-07, "loss": 0.1366, "step": 2978 }, { "epoch": 4.7990334273056785, "grad_norm": 0.21438203052030705, "learning_rate": 1.8534895278087272e-07, "loss": 0.1443, "step": 2979 }, { "epoch": 4.800644381796214, "grad_norm": 0.21536055584983677, "learning_rate": 1.823026368352232e-07, "loss": 0.1456, "step": 2980 }, { "epoch": 4.80225533628675, "grad_norm": 0.21399263776091337, "learning_rate": 1.792814481120453e-07, "loss": 0.1454, "step": 2981 }, { "epoch": 4.803866290777286, "grad_norm": 0.2215447311821123, "learning_rate": 1.7628539044195924e-07, "loss": 0.1538, "step": 2982 }, { "epoch": 4.805477245267821, "grad_norm": 0.22845562785090617, "learning_rate": 1.7331446762372638e-07, "loss": 0.1379, "step": 2983 }, { "epoch": 4.807088199758357, "grad_norm": 0.21877133621326636, "learning_rate": 1.7036868342422687e-07, "loss": 0.1466, "step": 2984 }, { "epoch": 4.808699154248893, "grad_norm": 0.22197790124470415, "learning_rate": 1.6744804157848183e-07, "loss": 0.1648, "step": 2985 }, { "epoch": 4.810310108739428, "grad_norm": 0.21974675453737633, "learning_rate": 1.6455254578962243e-07, "loss": 0.1332, "step": 2986 }, { "epoch": 4.811921063229963, "grad_norm": 0.21753406916106324, "learning_rate": 1.6168219972890087e-07, "loss": 0.146, "step": 2987 }, { "epoch": 4.813532017720499, "grad_norm": 0.20824467291862445, "learning_rate": 1.5883700703568373e-07, "loss": 0.1513, "step": 2988 }, { "epoch": 4.815142972211035, "grad_norm": 0.21940558640107516, "learning_rate": 1.5601697131744308e-07, "loss": 0.1349, "step": 2989 }, { "epoch": 4.816753926701571, "grad_norm": 0.2169053830515682, "learning_rate": 1.5322209614975214e-07, "loss": 0.1427, "step": 2990 }, { "epoch": 4.818364881192107, "grad_norm": 0.20841544497570255, "learning_rate": 1.5045238507628513e-07, "loss": 0.1293, "step": 2991 }, { "epoch": 4.819975835682642, "grad_norm": 0.22675681327150446, "learning_rate": 1.477078416088107e-07, "loss": 0.1549, "step": 2992 }, { "epoch": 4.8215867901731775, "grad_norm": 0.2147630928023925, "learning_rate": 1.44988469227183e-07, "loss": 0.1446, "step": 2993 }, { "epoch": 4.823197744663713, "grad_norm": 0.22130327588900567, "learning_rate": 1.422942713793485e-07, "loss": 0.1624, "step": 2994 }, { "epoch": 4.824808699154249, "grad_norm": 0.21320882618830075, "learning_rate": 1.396252514813279e-07, "loss": 0.1432, "step": 2995 }, { "epoch": 4.826419653644784, "grad_norm": 0.2086230672911769, "learning_rate": 1.369814129172209e-07, "loss": 0.144, "step": 2996 }, { "epoch": 4.82803060813532, "grad_norm": 0.2134036368549228, "learning_rate": 1.3436275903919716e-07, "loss": 0.1468, "step": 2997 }, { "epoch": 4.829641562625856, "grad_norm": 0.21195574971999337, "learning_rate": 1.3176929316749632e-07, "loss": 0.131, "step": 2998 }, { "epoch": 4.831252517116392, "grad_norm": 0.21601750704641073, "learning_rate": 1.2920101859042578e-07, "loss": 0.1539, "step": 2999 }, { "epoch": 4.832863471606927, "grad_norm": 0.22404313051428473, "learning_rate": 1.2665793856434516e-07, "loss": 0.1334, "step": 3000 }, { "epoch": 4.834474426097462, "grad_norm": 0.21836677236601115, "learning_rate": 1.2414005631366855e-07, "loss": 0.1498, "step": 3001 }, { "epoch": 4.836085380587998, "grad_norm": 0.21793674127632842, "learning_rate": 1.2164737503087108e-07, "loss": 0.1456, "step": 3002 }, { "epoch": 4.837696335078534, "grad_norm": 0.20502370809178963, "learning_rate": 1.1917989787646689e-07, "loss": 0.145, "step": 3003 }, { "epoch": 4.83930728956907, "grad_norm": 0.21438747414467008, "learning_rate": 1.1673762797901334e-07, "loss": 0.1404, "step": 3004 }, { "epoch": 4.840918244059606, "grad_norm": 0.21658308626209727, "learning_rate": 1.1432056843511342e-07, "loss": 0.1366, "step": 3005 }, { "epoch": 4.842529198550141, "grad_norm": 0.2146719093301986, "learning_rate": 1.1192872230939789e-07, "loss": 0.1513, "step": 3006 }, { "epoch": 4.8441401530406765, "grad_norm": 0.24537221608744508, "learning_rate": 1.0956209263453421e-07, "loss": 0.1413, "step": 3007 }, { "epoch": 4.845751107531212, "grad_norm": 0.22340926795790925, "learning_rate": 1.0722068241121319e-07, "loss": 0.1458, "step": 3008 }, { "epoch": 4.847362062021748, "grad_norm": 0.22379919725594086, "learning_rate": 1.0490449460815788e-07, "loss": 0.1499, "step": 3009 }, { "epoch": 4.848973016512284, "grad_norm": 0.2221902130763464, "learning_rate": 1.0261353216209691e-07, "loss": 0.1473, "step": 3010 }, { "epoch": 4.850583971002819, "grad_norm": 0.22992060085331859, "learning_rate": 1.0034779797778893e-07, "loss": 0.1421, "step": 3011 }, { "epoch": 4.852194925493355, "grad_norm": 0.22018812958227876, "learning_rate": 9.810729492800042e-08, "loss": 0.1456, "step": 3012 }, { "epoch": 4.853805879983891, "grad_norm": 0.21391699346938456, "learning_rate": 9.589202585350565e-08, "loss": 0.1488, "step": 3013 }, { "epoch": 4.855416834474426, "grad_norm": 0.21652663225257696, "learning_rate": 9.370199356308229e-08, "loss": 0.1547, "step": 3014 }, { "epoch": 4.857027788964961, "grad_norm": 0.2075965000591131, "learning_rate": 9.153720083351358e-08, "loss": 0.1432, "step": 3015 }, { "epoch": 4.858638743455497, "grad_norm": 0.2162147197513201, "learning_rate": 8.939765040958392e-08, "loss": 0.1352, "step": 3016 }, { "epoch": 4.860249697946033, "grad_norm": 0.21261429233585386, "learning_rate": 8.728334500406332e-08, "loss": 0.1489, "step": 3017 }, { "epoch": 4.861860652436569, "grad_norm": 0.22990524788122235, "learning_rate": 8.519428729772072e-08, "loss": 0.1419, "step": 3018 }, { "epoch": 4.863471606927105, "grad_norm": 0.22341316580801132, "learning_rate": 8.313047993931067e-08, "loss": 0.1434, "step": 3019 }, { "epoch": 4.86508256141764, "grad_norm": 0.22386199673054813, "learning_rate": 8.109192554557333e-08, "loss": 0.1401, "step": 3020 }, { "epoch": 4.8666935159081754, "grad_norm": 0.22098592505813894, "learning_rate": 7.907862670122557e-08, "loss": 0.1294, "step": 3021 }, { "epoch": 4.868304470398711, "grad_norm": 0.21149506248274522, "learning_rate": 7.709058595897213e-08, "loss": 0.146, "step": 3022 }, { "epoch": 4.869915424889247, "grad_norm": 0.2120571099622338, "learning_rate": 7.51278058394811e-08, "loss": 0.1644, "step": 3023 }, { "epoch": 4.871526379379782, "grad_norm": 0.21984813243178256, "learning_rate": 7.319028883139956e-08, "loss": 0.1435, "step": 3024 }, { "epoch": 4.873137333870318, "grad_norm": 0.2046339895959685, "learning_rate": 7.12780373913402e-08, "loss": 0.1479, "step": 3025 }, { "epoch": 4.874748288360854, "grad_norm": 0.21213665620935812, "learning_rate": 6.939105394388356e-08, "loss": 0.1558, "step": 3026 }, { "epoch": 4.8763592428513896, "grad_norm": 0.20974836104729822, "learning_rate": 6.752934088156693e-08, "loss": 0.1607, "step": 3027 }, { "epoch": 4.877970197341925, "grad_norm": 0.21367805851277302, "learning_rate": 6.569290056489542e-08, "loss": 0.1412, "step": 3028 }, { "epoch": 4.879581151832461, "grad_norm": 0.21328781532029567, "learning_rate": 6.3881735322322e-08, "loss": 0.1393, "step": 3029 }, { "epoch": 4.881192106322996, "grad_norm": 0.2147863974760739, "learning_rate": 6.209584745025643e-08, "loss": 0.1507, "step": 3030 }, { "epoch": 4.882803060813532, "grad_norm": 0.2116750037903575, "learning_rate": 6.033523921306072e-08, "loss": 0.1423, "step": 3031 }, { "epoch": 4.884414015304068, "grad_norm": 0.21829717085107075, "learning_rate": 5.859991284303812e-08, "loss": 0.1416, "step": 3032 }, { "epoch": 4.886024969794604, "grad_norm": 0.2175366973660451, "learning_rate": 5.688987054044637e-08, "loss": 0.1472, "step": 3033 }, { "epoch": 4.887635924285139, "grad_norm": 0.20843713478870315, "learning_rate": 5.520511447347776e-08, "loss": 0.1381, "step": 3034 }, { "epoch": 4.889246878775674, "grad_norm": 0.2123862342443856, "learning_rate": 5.3545646778263575e-08, "loss": 0.1511, "step": 3035 }, { "epoch": 4.89085783326621, "grad_norm": 0.2219417417741511, "learning_rate": 5.191146955887405e-08, "loss": 0.1403, "step": 3036 }, { "epoch": 4.892468787756746, "grad_norm": 0.21174959404262084, "learning_rate": 5.0302584887313986e-08, "loss": 0.1451, "step": 3037 }, { "epoch": 4.894079742247282, "grad_norm": 0.21072496789865433, "learning_rate": 4.871899480351605e-08, "loss": 0.1449, "step": 3038 }, { "epoch": 4.895690696737817, "grad_norm": 0.22806728378504748, "learning_rate": 4.7160701315343e-08, "loss": 0.1472, "step": 3039 }, { "epoch": 4.897301651228353, "grad_norm": 0.2168054676106727, "learning_rate": 4.562770639858549e-08, "loss": 0.1575, "step": 3040 }, { "epoch": 4.8989126057188885, "grad_norm": 0.2168197173485377, "learning_rate": 4.412001199695537e-08, "loss": 0.1482, "step": 3041 }, { "epoch": 4.900523560209424, "grad_norm": 0.2250248057225666, "learning_rate": 4.2637620022085715e-08, "loss": 0.1377, "step": 3042 }, { "epoch": 4.902134514699959, "grad_norm": 0.21418687912333567, "learning_rate": 4.118053235352859e-08, "loss": 0.135, "step": 3043 }, { "epoch": 4.903745469190495, "grad_norm": 0.21982028957248173, "learning_rate": 3.974875083875285e-08, "loss": 0.1459, "step": 3044 }, { "epoch": 4.905356423681031, "grad_norm": 0.21288538105643232, "learning_rate": 3.834227729313966e-08, "loss": 0.1442, "step": 3045 }, { "epoch": 4.906967378171567, "grad_norm": 0.24690110457527772, "learning_rate": 3.696111349998255e-08, "loss": 0.1438, "step": 3046 }, { "epoch": 4.908578332662103, "grad_norm": 0.21048986451072557, "learning_rate": 3.5605261210485134e-08, "loss": 0.1472, "step": 3047 }, { "epoch": 4.910189287152638, "grad_norm": 0.21299506119550873, "learning_rate": 3.427472214375671e-08, "loss": 0.1595, "step": 3048 }, { "epoch": 4.911800241643173, "grad_norm": 0.2098004986312955, "learning_rate": 3.296949798681226e-08, "loss": 0.1418, "step": 3049 }, { "epoch": 4.913411196133709, "grad_norm": 0.2111424848272005, "learning_rate": 3.1689590394570204e-08, "loss": 0.1478, "step": 3050 }, { "epoch": 4.915022150624245, "grad_norm": 0.2056760445046121, "learning_rate": 3.0435000989850194e-08, "loss": 0.1522, "step": 3051 }, { "epoch": 4.91663310511478, "grad_norm": 0.22222100807383482, "learning_rate": 2.9205731363364244e-08, "loss": 0.1467, "step": 3052 }, { "epoch": 4.918244059605316, "grad_norm": 0.22694945838983843, "learning_rate": 2.8001783073732248e-08, "loss": 0.1368, "step": 3053 }, { "epoch": 4.919855014095852, "grad_norm": 0.20514202688175226, "learning_rate": 2.6823157647457577e-08, "loss": 0.1377, "step": 3054 }, { "epoch": 4.9214659685863875, "grad_norm": 0.21345732647144894, "learning_rate": 2.566985657894483e-08, "loss": 0.1485, "step": 3055 }, { "epoch": 4.923076923076923, "grad_norm": 0.21597379148933313, "learning_rate": 2.4541881330482075e-08, "loss": 0.1424, "step": 3056 }, { "epoch": 4.924687877567459, "grad_norm": 0.20890314593204515, "learning_rate": 2.3439233332251953e-08, "loss": 0.1628, "step": 3057 }, { "epoch": 4.926298832057994, "grad_norm": 0.21635021856559442, "learning_rate": 2.236191398232057e-08, "loss": 0.1445, "step": 3058 }, { "epoch": 4.92790978654853, "grad_norm": 0.2202683990747119, "learning_rate": 2.1309924646641945e-08, "loss": 0.1482, "step": 3059 }, { "epoch": 4.929520741039066, "grad_norm": 0.21390221711743562, "learning_rate": 2.0283266659051338e-08, "loss": 0.1388, "step": 3060 }, { "epoch": 4.931131695529602, "grad_norm": 0.22450839818366922, "learning_rate": 1.9281941321271925e-08, "loss": 0.1565, "step": 3061 }, { "epoch": 4.932742650020137, "grad_norm": 0.223216962644783, "learning_rate": 1.8305949902897026e-08, "loss": 0.1492, "step": 3062 }, { "epoch": 4.934353604510672, "grad_norm": 0.2005760944353557, "learning_rate": 1.7355293641405647e-08, "loss": 0.1545, "step": 3063 }, { "epoch": 4.935964559001208, "grad_norm": 0.22213102975484492, "learning_rate": 1.6429973742153606e-08, "loss": 0.1418, "step": 3064 }, { "epoch": 4.937575513491744, "grad_norm": 0.22253914887075765, "learning_rate": 1.552999137836908e-08, "loss": 0.136, "step": 3065 }, { "epoch": 4.93918646798228, "grad_norm": 0.22754585273340733, "learning_rate": 1.4655347691159283e-08, "loss": 0.1514, "step": 3066 }, { "epoch": 4.940797422472815, "grad_norm": 0.20712918988601425, "learning_rate": 1.3806043789497126e-08, "loss": 0.143, "step": 3067 }, { "epoch": 4.942408376963351, "grad_norm": 0.22069265881378214, "learning_rate": 1.2982080750234549e-08, "loss": 0.1491, "step": 3068 }, { "epoch": 4.9440193314538865, "grad_norm": 0.20959463552056892, "learning_rate": 1.2183459618084759e-08, "loss": 0.1523, "step": 3069 }, { "epoch": 4.945630285944422, "grad_norm": 0.21612258928364242, "learning_rate": 1.1410181405639986e-08, "loss": 0.1442, "step": 3070 }, { "epoch": 4.947241240434957, "grad_norm": 0.2162006333759751, "learning_rate": 1.0662247093349287e-08, "loss": 0.1244, "step": 3071 }, { "epoch": 4.948852194925493, "grad_norm": 0.22621189321655785, "learning_rate": 9.939657629534083e-09, "loss": 0.1624, "step": 3072 }, { "epoch": 4.950463149416029, "grad_norm": 0.22325164035423106, "learning_rate": 9.242413930377059e-09, "loss": 0.1501, "step": 3073 }, { "epoch": 4.952074103906565, "grad_norm": 0.20908742596922356, "learning_rate": 8.570516879928826e-09, "loss": 0.1313, "step": 3074 }, { "epoch": 4.953685058397101, "grad_norm": 0.22585183848954468, "learning_rate": 7.923967330099036e-09, "loss": 0.1474, "step": 3075 }, { "epoch": 4.9552960128876355, "grad_norm": 0.21204595764120757, "learning_rate": 7.3027661006586095e-09, "loss": 0.1531, "step": 3076 }, { "epoch": 4.956906967378171, "grad_norm": 0.20138290136485443, "learning_rate": 6.706913979241947e-09, "loss": 0.1603, "step": 3077 }, { "epoch": 4.958517921868707, "grad_norm": 0.21351586165159536, "learning_rate": 6.136411721340274e-09, "loss": 0.1383, "step": 3078 }, { "epoch": 4.960128876359243, "grad_norm": 0.21376007659935434, "learning_rate": 5.5912600503038594e-09, "loss": 0.1445, "step": 3079 }, { "epoch": 4.961739830849779, "grad_norm": 0.21489003486773706, "learning_rate": 5.071459657339794e-09, "loss": 0.1388, "step": 3080 }, { "epoch": 4.963350785340314, "grad_norm": 0.21755924068751195, "learning_rate": 4.577011201511994e-09, "loss": 0.1456, "step": 3081 }, { "epoch": 4.96496173983085, "grad_norm": 0.20180190976203755, "learning_rate": 4.107915309743416e-09, "loss": 0.146, "step": 3082 }, { "epoch": 4.9665726943213855, "grad_norm": 0.21855543222434343, "learning_rate": 3.664172576807179e-09, "loss": 0.1405, "step": 3083 }, { "epoch": 4.968183648811921, "grad_norm": 0.215264107680761, "learning_rate": 3.2457835653332272e-09, "loss": 0.1456, "step": 3084 }, { "epoch": 4.969794603302457, "grad_norm": 0.203470431035806, "learning_rate": 2.8527488058038844e-09, "loss": 0.1432, "step": 3085 }, { "epoch": 4.971405557792992, "grad_norm": 0.21276675372989584, "learning_rate": 2.485068796556078e-09, "loss": 0.1577, "step": 3086 }, { "epoch": 4.973016512283528, "grad_norm": 0.21088797347840574, "learning_rate": 2.142744003779118e-09, "loss": 0.1421, "step": 3087 }, { "epoch": 4.974627466774064, "grad_norm": 0.2246105636966214, "learning_rate": 1.8257748615102545e-09, "loss": 0.1351, "step": 3088 }, { "epoch": 4.9762384212646, "grad_norm": 0.21752395162393956, "learning_rate": 1.5341617716435608e-09, "loss": 0.1498, "step": 3089 }, { "epoch": 4.9778493757551345, "grad_norm": 0.21715767148751064, "learning_rate": 1.2679051039188317e-09, "loss": 0.1431, "step": 3090 }, { "epoch": 4.97946033024567, "grad_norm": 0.200604318655284, "learning_rate": 1.0270051959282434e-09, "loss": 0.1356, "step": 3091 }, { "epoch": 4.981071284736206, "grad_norm": 0.22114376968996727, "learning_rate": 8.114623531119137e-10, "loss": 0.1494, "step": 3092 }, { "epoch": 4.982682239226742, "grad_norm": 0.21912817007954846, "learning_rate": 6.212768487623422e-10, "loss": 0.147, "step": 3093 }, { "epoch": 4.984293193717278, "grad_norm": 0.21734907735157796, "learning_rate": 4.564489240177494e-10, "loss": 0.1427, "step": 3094 }, { "epoch": 4.985904148207813, "grad_norm": 0.2122215913186033, "learning_rate": 3.1697878786873804e-10, "loss": 0.1379, "step": 3095 }, { "epoch": 4.987515102698349, "grad_norm": 0.22045381925803034, "learning_rate": 2.0286661714941092e-10, "loss": 0.154, "step": 3096 }, { "epoch": 4.9891260571888845, "grad_norm": 0.21230236722633794, "learning_rate": 1.1411255654625309e-10, "loss": 0.1398, "step": 3097 }, { "epoch": 4.99073701167942, "grad_norm": 0.2155863280622798, "learning_rate": 5.0716718591470313e-11, "loss": 0.1415, "step": 3098 }, { "epoch": 4.992347966169955, "grad_norm": 0.21790703900678665, "learning_rate": 1.267918366743004e-11, "loss": 0.1517, "step": 3099 }, { "epoch": 4.993958920660491, "grad_norm": 0.21710189361301535, "learning_rate": 0.0, "loss": 0.1494, "step": 3100 }, { "epoch": 4.993958920660491, "step": 3100, "total_flos": 6.310856434702221e+18, "train_loss": 0.2839509548799646, "train_runtime": 73731.2238, "train_samples_per_second": 5.387, "train_steps_per_second": 0.042 } ], "logging_steps": 1.0, "max_steps": 3100, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.310856434702221e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }