diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21742 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.993958920660491, + "eval_steps": 500, + "global_step": 3100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016109544905356424, + "grad_norm": 6.032404008426598, + "learning_rate": 1.2903225806451614e-07, + "loss": 0.8102, + "step": 1 + }, + { + "epoch": 0.003221908981071285, + "grad_norm": 6.091224660092794, + "learning_rate": 2.580645161290323e-07, + "loss": 0.8201, + "step": 2 + }, + { + "epoch": 0.004832863471606927, + "grad_norm": 6.242173191014313, + "learning_rate": 3.870967741935484e-07, + "loss": 0.8364, + "step": 3 + }, + { + "epoch": 0.00644381796214257, + "grad_norm": 5.9113732582345815, + "learning_rate": 5.161290322580646e-07, + "loss": 0.8033, + "step": 4 + }, + { + "epoch": 0.008054772452678213, + "grad_norm": 5.836423086831506, + "learning_rate": 6.451612903225807e-07, + "loss": 0.7877, + "step": 5 + }, + { + "epoch": 0.009665726943213854, + "grad_norm": 5.755126086223614, + "learning_rate": 7.741935483870968e-07, + "loss": 0.7791, + "step": 6 + }, + { + "epoch": 0.011276681433749497, + "grad_norm": 5.649783806242555, + "learning_rate": 9.032258064516129e-07, + "loss": 0.7931, + "step": 7 + }, + { + "epoch": 0.01288763592428514, + "grad_norm": 5.479452292994705, + "learning_rate": 1.0322580645161291e-06, + "loss": 0.7854, + "step": 8 + }, + { + "epoch": 0.01449859041482078, + "grad_norm": 5.366601689873154, + "learning_rate": 1.1612903225806454e-06, + "loss": 0.7727, + "step": 9 + }, + { + "epoch": 0.016109544905356425, + "grad_norm": 4.434943674510098, + "learning_rate": 1.2903225806451614e-06, + "loss": 0.7384, + "step": 10 + }, + { + "epoch": 0.017720499395892066, + "grad_norm": 4.293198637826971, + "learning_rate": 1.4193548387096776e-06, + "loss": 0.7394, + "step": 11 + }, + { + "epoch": 0.019331453886427707, + "grad_norm": 4.083095267873627, + "learning_rate": 1.5483870967741937e-06, + "loss": 0.74, + "step": 12 + }, + { + "epoch": 0.020942408376963352, + "grad_norm": 2.4257793361125777, + "learning_rate": 1.67741935483871e-06, + "loss": 0.6943, + "step": 13 + }, + { + "epoch": 0.022553362867498993, + "grad_norm": 2.404485912753832, + "learning_rate": 1.8064516129032258e-06, + "loss": 0.7221, + "step": 14 + }, + { + "epoch": 0.024164317358034634, + "grad_norm": 2.1157005039902415, + "learning_rate": 1.935483870967742e-06, + "loss": 0.7026, + "step": 15 + }, + { + "epoch": 0.02577527184857028, + "grad_norm": 1.9329284071198143, + "learning_rate": 2.0645161290322582e-06, + "loss": 0.6697, + "step": 16 + }, + { + "epoch": 0.02738622633910592, + "grad_norm": 1.856103169273838, + "learning_rate": 2.1935483870967745e-06, + "loss": 0.672, + "step": 17 + }, + { + "epoch": 0.02899718082964156, + "grad_norm": 3.1117388382933022, + "learning_rate": 2.3225806451612907e-06, + "loss": 0.6815, + "step": 18 + }, + { + "epoch": 0.030608135320177206, + "grad_norm": 3.2415743455441484, + "learning_rate": 2.4516129032258066e-06, + "loss": 0.6679, + "step": 19 + }, + { + "epoch": 0.03221908981071285, + "grad_norm": 3.20677352745239, + "learning_rate": 2.580645161290323e-06, + "loss": 0.6551, + "step": 20 + }, + { + "epoch": 0.03383004430124849, + "grad_norm": 2.823762323190898, + "learning_rate": 2.709677419354839e-06, + "loss": 0.6319, + "step": 21 + }, + { + "epoch": 0.03544099879178413, + "grad_norm": 2.6726511366702796, + "learning_rate": 2.8387096774193553e-06, + "loss": 0.6458, + "step": 22 + }, + { + "epoch": 0.03705195328231978, + "grad_norm": 2.2046688512156916, + "learning_rate": 2.967741935483871e-06, + "loss": 0.6299, + "step": 23 + }, + { + "epoch": 0.038662907772855415, + "grad_norm": 1.6050271766621365, + "learning_rate": 3.0967741935483874e-06, + "loss": 0.6186, + "step": 24 + }, + { + "epoch": 0.04027386226339106, + "grad_norm": 1.3573669680309588, + "learning_rate": 3.225806451612903e-06, + "loss": 0.5967, + "step": 25 + }, + { + "epoch": 0.041884816753926704, + "grad_norm": 1.1627320193357804, + "learning_rate": 3.35483870967742e-06, + "loss": 0.6048, + "step": 26 + }, + { + "epoch": 0.04349577124446234, + "grad_norm": 0.9897690073046419, + "learning_rate": 3.4838709677419357e-06, + "loss": 0.5834, + "step": 27 + }, + { + "epoch": 0.045106725734997986, + "grad_norm": 0.9903767157665542, + "learning_rate": 3.6129032258064515e-06, + "loss": 0.5815, + "step": 28 + }, + { + "epoch": 0.04671768022553363, + "grad_norm": 1.1368680688613164, + "learning_rate": 3.741935483870968e-06, + "loss": 0.5707, + "step": 29 + }, + { + "epoch": 0.04832863471606927, + "grad_norm": 1.0426684828392754, + "learning_rate": 3.870967741935484e-06, + "loss": 0.5461, + "step": 30 + }, + { + "epoch": 0.04993958920660491, + "grad_norm": 0.9337404457709985, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5549, + "step": 31 + }, + { + "epoch": 0.05155054369714056, + "grad_norm": 0.7242627578457423, + "learning_rate": 4.1290322580645165e-06, + "loss": 0.5507, + "step": 32 + }, + { + "epoch": 0.053161498187676196, + "grad_norm": 0.6685301803699689, + "learning_rate": 4.258064516129032e-06, + "loss": 0.5403, + "step": 33 + }, + { + "epoch": 0.05477245267821184, + "grad_norm": 0.6968371169058664, + "learning_rate": 4.387096774193549e-06, + "loss": 0.5268, + "step": 34 + }, + { + "epoch": 0.056383407168747485, + "grad_norm": 0.8728817789992291, + "learning_rate": 4.516129032258065e-06, + "loss": 0.5347, + "step": 35 + }, + { + "epoch": 0.05799436165928312, + "grad_norm": 0.8602530490558115, + "learning_rate": 4.6451612903225815e-06, + "loss": 0.5462, + "step": 36 + }, + { + "epoch": 0.05960531614981877, + "grad_norm": 0.6378392806884997, + "learning_rate": 4.774193548387097e-06, + "loss": 0.5199, + "step": 37 + }, + { + "epoch": 0.06121627064035441, + "grad_norm": 0.5998483205329147, + "learning_rate": 4.903225806451613e-06, + "loss": 0.5378, + "step": 38 + }, + { + "epoch": 0.06282722513089005, + "grad_norm": 0.6059446636403915, + "learning_rate": 5.032258064516129e-06, + "loss": 0.5161, + "step": 39 + }, + { + "epoch": 0.0644381796214257, + "grad_norm": 0.6817404761352738, + "learning_rate": 5.161290322580646e-06, + "loss": 0.5194, + "step": 40 + }, + { + "epoch": 0.06604913411196134, + "grad_norm": 0.6331844245949004, + "learning_rate": 5.290322580645162e-06, + "loss": 0.5384, + "step": 41 + }, + { + "epoch": 0.06766008860249698, + "grad_norm": 0.4948231352853153, + "learning_rate": 5.419354838709678e-06, + "loss": 0.5169, + "step": 42 + }, + { + "epoch": 0.06927104309303263, + "grad_norm": 0.5035080428321753, + "learning_rate": 5.548387096774194e-06, + "loss": 0.5135, + "step": 43 + }, + { + "epoch": 0.07088199758356827, + "grad_norm": 0.536385019533745, + "learning_rate": 5.677419354838711e-06, + "loss": 0.4819, + "step": 44 + }, + { + "epoch": 0.0724929520741039, + "grad_norm": 0.5872680433979439, + "learning_rate": 5.806451612903226e-06, + "loss": 0.5404, + "step": 45 + }, + { + "epoch": 0.07410390656463955, + "grad_norm": 0.553950328048959, + "learning_rate": 5.935483870967742e-06, + "loss": 0.5012, + "step": 46 + }, + { + "epoch": 0.07571486105517519, + "grad_norm": 0.46649858895580004, + "learning_rate": 6.064516129032259e-06, + "loss": 0.5084, + "step": 47 + }, + { + "epoch": 0.07732581554571083, + "grad_norm": 0.49980550410748753, + "learning_rate": 6.193548387096775e-06, + "loss": 0.4997, + "step": 48 + }, + { + "epoch": 0.07893677003624648, + "grad_norm": 0.500125771445445, + "learning_rate": 6.3225806451612906e-06, + "loss": 0.5211, + "step": 49 + }, + { + "epoch": 0.08054772452678212, + "grad_norm": 0.43794258077350834, + "learning_rate": 6.451612903225806e-06, + "loss": 0.5019, + "step": 50 + }, + { + "epoch": 0.08215867901731776, + "grad_norm": 0.41542158356072195, + "learning_rate": 6.580645161290323e-06, + "loss": 0.5, + "step": 51 + }, + { + "epoch": 0.08376963350785341, + "grad_norm": 0.4398106247413648, + "learning_rate": 6.70967741935484e-06, + "loss": 0.482, + "step": 52 + }, + { + "epoch": 0.08538058799838905, + "grad_norm": 0.4658104909549922, + "learning_rate": 6.838709677419355e-06, + "loss": 0.51, + "step": 53 + }, + { + "epoch": 0.08699154248892468, + "grad_norm": 0.44586970110991897, + "learning_rate": 6.967741935483871e-06, + "loss": 0.5066, + "step": 54 + }, + { + "epoch": 0.08860249697946034, + "grad_norm": 0.36952288687065493, + "learning_rate": 7.096774193548388e-06, + "loss": 0.4868, + "step": 55 + }, + { + "epoch": 0.09021345146999597, + "grad_norm": 0.37449257402238395, + "learning_rate": 7.225806451612903e-06, + "loss": 0.4874, + "step": 56 + }, + { + "epoch": 0.09182440596053161, + "grad_norm": 0.43146034558081636, + "learning_rate": 7.35483870967742e-06, + "loss": 0.4961, + "step": 57 + }, + { + "epoch": 0.09343536045106726, + "grad_norm": 0.33924473498486496, + "learning_rate": 7.483870967741936e-06, + "loss": 0.4659, + "step": 58 + }, + { + "epoch": 0.0950463149416029, + "grad_norm": 0.3650384114275885, + "learning_rate": 7.612903225806451e-06, + "loss": 0.4922, + "step": 59 + }, + { + "epoch": 0.09665726943213854, + "grad_norm": 0.42392175050797987, + "learning_rate": 7.741935483870968e-06, + "loss": 0.4825, + "step": 60 + }, + { + "epoch": 0.09826822392267419, + "grad_norm": 0.41566274005192855, + "learning_rate": 7.870967741935484e-06, + "loss": 0.496, + "step": 61 + }, + { + "epoch": 0.09987917841320983, + "grad_norm": 0.3532541757044527, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4836, + "step": 62 + }, + { + "epoch": 0.10149013290374546, + "grad_norm": 0.38676034561653744, + "learning_rate": 8.129032258064517e-06, + "loss": 0.4617, + "step": 63 + }, + { + "epoch": 0.10310108739428112, + "grad_norm": 0.39235511829919395, + "learning_rate": 8.258064516129033e-06, + "loss": 0.4699, + "step": 64 + }, + { + "epoch": 0.10471204188481675, + "grad_norm": 0.42853549136039776, + "learning_rate": 8.387096774193549e-06, + "loss": 0.4967, + "step": 65 + }, + { + "epoch": 0.10632299637535239, + "grad_norm": 0.4006077949721882, + "learning_rate": 8.516129032258065e-06, + "loss": 0.4745, + "step": 66 + }, + { + "epoch": 0.10793395086588804, + "grad_norm": 0.365858425655604, + "learning_rate": 8.64516129032258e-06, + "loss": 0.486, + "step": 67 + }, + { + "epoch": 0.10954490535642368, + "grad_norm": 0.4029132008489552, + "learning_rate": 8.774193548387098e-06, + "loss": 0.4578, + "step": 68 + }, + { + "epoch": 0.11115585984695932, + "grad_norm": 0.4416121827528027, + "learning_rate": 8.903225806451614e-06, + "loss": 0.4838, + "step": 69 + }, + { + "epoch": 0.11276681433749497, + "grad_norm": 0.37290691713598234, + "learning_rate": 9.03225806451613e-06, + "loss": 0.4626, + "step": 70 + }, + { + "epoch": 0.11437776882803061, + "grad_norm": 0.39437106054528326, + "learning_rate": 9.161290322580645e-06, + "loss": 0.4606, + "step": 71 + }, + { + "epoch": 0.11598872331856624, + "grad_norm": 0.46230012352472577, + "learning_rate": 9.290322580645163e-06, + "loss": 0.4843, + "step": 72 + }, + { + "epoch": 0.1175996778091019, + "grad_norm": 0.36168746091549053, + "learning_rate": 9.419354838709677e-06, + "loss": 0.4776, + "step": 73 + }, + { + "epoch": 0.11921063229963753, + "grad_norm": 0.3887609362159801, + "learning_rate": 9.548387096774195e-06, + "loss": 0.4913, + "step": 74 + }, + { + "epoch": 0.12082158679017317, + "grad_norm": 0.4212414686324945, + "learning_rate": 9.67741935483871e-06, + "loss": 0.472, + "step": 75 + }, + { + "epoch": 0.12243254128070882, + "grad_norm": 0.3736240767300341, + "learning_rate": 9.806451612903226e-06, + "loss": 0.463, + "step": 76 + }, + { + "epoch": 0.12404349577124446, + "grad_norm": 0.3897026748050938, + "learning_rate": 9.935483870967742e-06, + "loss": 0.4869, + "step": 77 + }, + { + "epoch": 0.1256544502617801, + "grad_norm": 0.35819030543571656, + "learning_rate": 1.0064516129032258e-05, + "loss": 0.4821, + "step": 78 + }, + { + "epoch": 0.12726540475231574, + "grad_norm": 0.419675735054713, + "learning_rate": 1.0193548387096774e-05, + "loss": 0.4572, + "step": 79 + }, + { + "epoch": 0.1288763592428514, + "grad_norm": 0.45017621890591875, + "learning_rate": 1.0322580645161291e-05, + "loss": 0.4519, + "step": 80 + }, + { + "epoch": 0.13048731373338704, + "grad_norm": 0.38873544789688036, + "learning_rate": 1.0451612903225807e-05, + "loss": 0.4704, + "step": 81 + }, + { + "epoch": 0.13209826822392268, + "grad_norm": 0.4590008765481535, + "learning_rate": 1.0580645161290325e-05, + "loss": 0.4624, + "step": 82 + }, + { + "epoch": 0.13370922271445831, + "grad_norm": 0.4017641572848416, + "learning_rate": 1.070967741935484e-05, + "loss": 0.4552, + "step": 83 + }, + { + "epoch": 0.13532017720499395, + "grad_norm": 0.4919488749370493, + "learning_rate": 1.0838709677419356e-05, + "loss": 0.4824, + "step": 84 + }, + { + "epoch": 0.1369311316955296, + "grad_norm": 0.46571573865087124, + "learning_rate": 1.096774193548387e-05, + "loss": 0.4593, + "step": 85 + }, + { + "epoch": 0.13854208618606526, + "grad_norm": 0.483375616479342, + "learning_rate": 1.1096774193548388e-05, + "loss": 0.4762, + "step": 86 + }, + { + "epoch": 0.1401530406766009, + "grad_norm": 0.390666645368303, + "learning_rate": 1.1225806451612904e-05, + "loss": 0.4543, + "step": 87 + }, + { + "epoch": 0.14176399516713653, + "grad_norm": 0.5003728860520759, + "learning_rate": 1.1354838709677421e-05, + "loss": 0.4466, + "step": 88 + }, + { + "epoch": 0.14337494965767217, + "grad_norm": 0.4761849607513898, + "learning_rate": 1.1483870967741937e-05, + "loss": 0.4744, + "step": 89 + }, + { + "epoch": 0.1449859041482078, + "grad_norm": 0.4431212670764865, + "learning_rate": 1.1612903225806453e-05, + "loss": 0.4785, + "step": 90 + }, + { + "epoch": 0.14659685863874344, + "grad_norm": 0.4645484282151763, + "learning_rate": 1.1741935483870967e-05, + "loss": 0.4747, + "step": 91 + }, + { + "epoch": 0.1482078131292791, + "grad_norm": 0.4591410377370998, + "learning_rate": 1.1870967741935484e-05, + "loss": 0.4501, + "step": 92 + }, + { + "epoch": 0.14981876761981475, + "grad_norm": 0.43211703633867105, + "learning_rate": 1.2e-05, + "loss": 0.4318, + "step": 93 + }, + { + "epoch": 0.15142972211035038, + "grad_norm": 0.4134987376570425, + "learning_rate": 1.2129032258064518e-05, + "loss": 0.4554, + "step": 94 + }, + { + "epoch": 0.15304067660088602, + "grad_norm": 0.40687971876636914, + "learning_rate": 1.2258064516129034e-05, + "loss": 0.4463, + "step": 95 + }, + { + "epoch": 0.15465163109142166, + "grad_norm": 0.4821367578390539, + "learning_rate": 1.238709677419355e-05, + "loss": 0.4684, + "step": 96 + }, + { + "epoch": 0.1562625855819573, + "grad_norm": 0.4008872991192848, + "learning_rate": 1.2516129032258067e-05, + "loss": 0.4587, + "step": 97 + }, + { + "epoch": 0.15787354007249296, + "grad_norm": 0.4709306605581316, + "learning_rate": 1.2645161290322581e-05, + "loss": 0.4669, + "step": 98 + }, + { + "epoch": 0.1594844945630286, + "grad_norm": 0.4270863547298053, + "learning_rate": 1.2774193548387097e-05, + "loss": 0.453, + "step": 99 + }, + { + "epoch": 0.16109544905356424, + "grad_norm": 0.47904696715631884, + "learning_rate": 1.2903225806451613e-05, + "loss": 0.4331, + "step": 100 + }, + { + "epoch": 0.16270640354409988, + "grad_norm": 0.5646915331058875, + "learning_rate": 1.303225806451613e-05, + "loss": 0.4647, + "step": 101 + }, + { + "epoch": 0.1643173580346355, + "grad_norm": 0.421375992229102, + "learning_rate": 1.3161290322580646e-05, + "loss": 0.4628, + "step": 102 + }, + { + "epoch": 0.16592831252517115, + "grad_norm": 0.4577925255478774, + "learning_rate": 1.3290322580645164e-05, + "loss": 0.4432, + "step": 103 + }, + { + "epoch": 0.16753926701570682, + "grad_norm": 0.5080123906152704, + "learning_rate": 1.341935483870968e-05, + "loss": 0.4581, + "step": 104 + }, + { + "epoch": 0.16915022150624245, + "grad_norm": 0.4302757240951249, + "learning_rate": 1.3548387096774194e-05, + "loss": 0.4252, + "step": 105 + }, + { + "epoch": 0.1707611759967781, + "grad_norm": 0.543321933222442, + "learning_rate": 1.367741935483871e-05, + "loss": 0.4556, + "step": 106 + }, + { + "epoch": 0.17237213048731373, + "grad_norm": 0.4342499733326806, + "learning_rate": 1.3806451612903227e-05, + "loss": 0.4474, + "step": 107 + }, + { + "epoch": 0.17398308497784937, + "grad_norm": 0.625382634227691, + "learning_rate": 1.3935483870967743e-05, + "loss": 0.4262, + "step": 108 + }, + { + "epoch": 0.175594039468385, + "grad_norm": 0.4889154545842034, + "learning_rate": 1.406451612903226e-05, + "loss": 0.4633, + "step": 109 + }, + { + "epoch": 0.17720499395892067, + "grad_norm": 0.468374811877519, + "learning_rate": 1.4193548387096776e-05, + "loss": 0.4685, + "step": 110 + }, + { + "epoch": 0.1788159484494563, + "grad_norm": 0.5750906393209809, + "learning_rate": 1.4322580645161292e-05, + "loss": 0.4521, + "step": 111 + }, + { + "epoch": 0.18042690293999195, + "grad_norm": 0.43929194658608134, + "learning_rate": 1.4451612903225806e-05, + "loss": 0.458, + "step": 112 + }, + { + "epoch": 0.18203785743052758, + "grad_norm": 0.5255857904785238, + "learning_rate": 1.4580645161290324e-05, + "loss": 0.4354, + "step": 113 + }, + { + "epoch": 0.18364881192106322, + "grad_norm": 0.5697953246124879, + "learning_rate": 1.470967741935484e-05, + "loss": 0.4597, + "step": 114 + }, + { + "epoch": 0.18525976641159886, + "grad_norm": 0.684779610893239, + "learning_rate": 1.4838709677419357e-05, + "loss": 0.4513, + "step": 115 + }, + { + "epoch": 0.18687072090213452, + "grad_norm": 0.7663595177962778, + "learning_rate": 1.4967741935483873e-05, + "loss": 0.4501, + "step": 116 + }, + { + "epoch": 0.18848167539267016, + "grad_norm": 0.6231547313160818, + "learning_rate": 1.5096774193548389e-05, + "loss": 0.4654, + "step": 117 + }, + { + "epoch": 0.1900926298832058, + "grad_norm": 0.762366532637053, + "learning_rate": 1.5225806451612903e-05, + "loss": 0.4379, + "step": 118 + }, + { + "epoch": 0.19170358437374144, + "grad_norm": 0.6014548144293846, + "learning_rate": 1.535483870967742e-05, + "loss": 0.4433, + "step": 119 + }, + { + "epoch": 0.19331453886427707, + "grad_norm": 0.5976102150815602, + "learning_rate": 1.5483870967741936e-05, + "loss": 0.4412, + "step": 120 + }, + { + "epoch": 0.19492549335481274, + "grad_norm": 0.7203469874012064, + "learning_rate": 1.5612903225806454e-05, + "loss": 0.4746, + "step": 121 + }, + { + "epoch": 0.19653644784534838, + "grad_norm": 0.7853276912774007, + "learning_rate": 1.5741935483870968e-05, + "loss": 0.457, + "step": 122 + }, + { + "epoch": 0.19814740233588402, + "grad_norm": 0.5932744430290895, + "learning_rate": 1.5870967741935485e-05, + "loss": 0.4576, + "step": 123 + }, + { + "epoch": 0.19975835682641965, + "grad_norm": 0.7267109233788118, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4467, + "step": 124 + }, + { + "epoch": 0.2013693113169553, + "grad_norm": 0.5406828652336474, + "learning_rate": 1.6129032258064517e-05, + "loss": 0.4462, + "step": 125 + }, + { + "epoch": 0.20298026580749093, + "grad_norm": 0.5858021844743821, + "learning_rate": 1.6258064516129034e-05, + "loss": 0.4626, + "step": 126 + }, + { + "epoch": 0.2045912202980266, + "grad_norm": 0.5877766270343217, + "learning_rate": 1.638709677419355e-05, + "loss": 0.455, + "step": 127 + }, + { + "epoch": 0.20620217478856223, + "grad_norm": 0.47863537726359784, + "learning_rate": 1.6516129032258066e-05, + "loss": 0.4539, + "step": 128 + }, + { + "epoch": 0.20781312927909787, + "grad_norm": 0.439070948766706, + "learning_rate": 1.6645161290322583e-05, + "loss": 0.4443, + "step": 129 + }, + { + "epoch": 0.2094240837696335, + "grad_norm": 0.5181283081353542, + "learning_rate": 1.6774193548387098e-05, + "loss": 0.4522, + "step": 130 + }, + { + "epoch": 0.21103503826016914, + "grad_norm": 0.4577673620554301, + "learning_rate": 1.6903225806451615e-05, + "loss": 0.479, + "step": 131 + }, + { + "epoch": 0.21264599275070478, + "grad_norm": 0.4523908387978445, + "learning_rate": 1.703225806451613e-05, + "loss": 0.4456, + "step": 132 + }, + { + "epoch": 0.21425694724124045, + "grad_norm": 0.5290727008449524, + "learning_rate": 1.7161290322580647e-05, + "loss": 0.4221, + "step": 133 + }, + { + "epoch": 0.21586790173177609, + "grad_norm": 0.4980372169033664, + "learning_rate": 1.729032258064516e-05, + "loss": 0.4563, + "step": 134 + }, + { + "epoch": 0.21747885622231172, + "grad_norm": 0.48606495543694916, + "learning_rate": 1.741935483870968e-05, + "loss": 0.4324, + "step": 135 + }, + { + "epoch": 0.21908981071284736, + "grad_norm": 0.6957991032575429, + "learning_rate": 1.7548387096774196e-05, + "loss": 0.4722, + "step": 136 + }, + { + "epoch": 0.220700765203383, + "grad_norm": 0.4643809416764509, + "learning_rate": 1.7677419354838713e-05, + "loss": 0.4521, + "step": 137 + }, + { + "epoch": 0.22231171969391864, + "grad_norm": 0.6664029651503106, + "learning_rate": 1.7806451612903228e-05, + "loss": 0.4481, + "step": 138 + }, + { + "epoch": 0.2239226741844543, + "grad_norm": 0.5260601890755087, + "learning_rate": 1.7935483870967742e-05, + "loss": 0.4525, + "step": 139 + }, + { + "epoch": 0.22553362867498994, + "grad_norm": 0.5161740937718994, + "learning_rate": 1.806451612903226e-05, + "loss": 0.4455, + "step": 140 + }, + { + "epoch": 0.22714458316552558, + "grad_norm": 0.5760986430160835, + "learning_rate": 1.8193548387096777e-05, + "loss": 0.4582, + "step": 141 + }, + { + "epoch": 0.22875553765606121, + "grad_norm": 0.44639939244600985, + "learning_rate": 1.832258064516129e-05, + "loss": 0.4296, + "step": 142 + }, + { + "epoch": 0.23036649214659685, + "grad_norm": 0.4882062915992461, + "learning_rate": 1.845161290322581e-05, + "loss": 0.446, + "step": 143 + }, + { + "epoch": 0.2319774466371325, + "grad_norm": 0.4896222422614565, + "learning_rate": 1.8580645161290326e-05, + "loss": 0.4093, + "step": 144 + }, + { + "epoch": 0.23358840112766815, + "grad_norm": 0.5601501144229972, + "learning_rate": 1.870967741935484e-05, + "loss": 0.4505, + "step": 145 + }, + { + "epoch": 0.2351993556182038, + "grad_norm": 0.5437487520198745, + "learning_rate": 1.8838709677419354e-05, + "loss": 0.4322, + "step": 146 + }, + { + "epoch": 0.23681031010873943, + "grad_norm": 0.526186951905453, + "learning_rate": 1.896774193548387e-05, + "loss": 0.4521, + "step": 147 + }, + { + "epoch": 0.23842126459927507, + "grad_norm": 0.5667856097490956, + "learning_rate": 1.909677419354839e-05, + "loss": 0.4271, + "step": 148 + }, + { + "epoch": 0.2400322190898107, + "grad_norm": 0.49095301330200275, + "learning_rate": 1.9225806451612907e-05, + "loss": 0.4253, + "step": 149 + }, + { + "epoch": 0.24164317358034634, + "grad_norm": 0.5980300594255382, + "learning_rate": 1.935483870967742e-05, + "loss": 0.4244, + "step": 150 + }, + { + "epoch": 0.243254128070882, + "grad_norm": 0.47376445836894726, + "learning_rate": 1.948387096774194e-05, + "loss": 0.4372, + "step": 151 + }, + { + "epoch": 0.24486508256141765, + "grad_norm": 0.5782178587487257, + "learning_rate": 1.9612903225806452e-05, + "loss": 0.4616, + "step": 152 + }, + { + "epoch": 0.24647603705195328, + "grad_norm": 0.6641367840166066, + "learning_rate": 1.974193548387097e-05, + "loss": 0.4373, + "step": 153 + }, + { + "epoch": 0.24808699154248892, + "grad_norm": 0.4915152452969149, + "learning_rate": 1.9870967741935484e-05, + "loss": 0.4375, + "step": 154 + }, + { + "epoch": 0.24969794603302456, + "grad_norm": 0.4715056217318344, + "learning_rate": 2e-05, + "loss": 0.4602, + "step": 155 + }, + { + "epoch": 0.2513089005235602, + "grad_norm": 0.4588234482596573, + "learning_rate": 2.0129032258064516e-05, + "loss": 0.4364, + "step": 156 + }, + { + "epoch": 0.25291985501409586, + "grad_norm": 0.49507022488662084, + "learning_rate": 2.0258064516129033e-05, + "loss": 0.4328, + "step": 157 + }, + { + "epoch": 0.2545308095046315, + "grad_norm": 0.475753423255671, + "learning_rate": 2.0387096774193547e-05, + "loss": 0.428, + "step": 158 + }, + { + "epoch": 0.25614176399516714, + "grad_norm": 0.5183928270573065, + "learning_rate": 2.051612903225807e-05, + "loss": 0.4392, + "step": 159 + }, + { + "epoch": 0.2577527184857028, + "grad_norm": 0.4774005808982356, + "learning_rate": 2.0645161290322582e-05, + "loss": 0.4428, + "step": 160 + }, + { + "epoch": 0.2593636729762384, + "grad_norm": 0.4572472041520854, + "learning_rate": 2.07741935483871e-05, + "loss": 0.4634, + "step": 161 + }, + { + "epoch": 0.2609746274667741, + "grad_norm": 0.6081326727542029, + "learning_rate": 2.0903225806451614e-05, + "loss": 0.428, + "step": 162 + }, + { + "epoch": 0.2625855819573097, + "grad_norm": 0.5023244790140038, + "learning_rate": 2.1032258064516128e-05, + "loss": 0.462, + "step": 163 + }, + { + "epoch": 0.26419653644784535, + "grad_norm": 0.5479494420185165, + "learning_rate": 2.116129032258065e-05, + "loss": 0.4469, + "step": 164 + }, + { + "epoch": 0.26580749093838096, + "grad_norm": 0.4584032900157975, + "learning_rate": 2.1290322580645163e-05, + "loss": 0.4477, + "step": 165 + }, + { + "epoch": 0.26741844542891663, + "grad_norm": 0.6145948964239977, + "learning_rate": 2.141935483870968e-05, + "loss": 0.4371, + "step": 166 + }, + { + "epoch": 0.2690293999194523, + "grad_norm": 0.5594470471380588, + "learning_rate": 2.1548387096774195e-05, + "loss": 0.456, + "step": 167 + }, + { + "epoch": 0.2706403544099879, + "grad_norm": 0.45972051481741444, + "learning_rate": 2.1677419354838712e-05, + "loss": 0.4391, + "step": 168 + }, + { + "epoch": 0.27225130890052357, + "grad_norm": 0.5461702197752573, + "learning_rate": 2.1806451612903227e-05, + "loss": 0.4409, + "step": 169 + }, + { + "epoch": 0.2738622633910592, + "grad_norm": 0.5906536061989351, + "learning_rate": 2.193548387096774e-05, + "loss": 0.4586, + "step": 170 + }, + { + "epoch": 0.27547321788159485, + "grad_norm": 0.599985383963263, + "learning_rate": 2.206451612903226e-05, + "loss": 0.4503, + "step": 171 + }, + { + "epoch": 0.2770841723721305, + "grad_norm": 0.5604973755851727, + "learning_rate": 2.2193548387096776e-05, + "loss": 0.442, + "step": 172 + }, + { + "epoch": 0.2786951268626661, + "grad_norm": 0.4798615338782793, + "learning_rate": 2.2322580645161293e-05, + "loss": 0.4436, + "step": 173 + }, + { + "epoch": 0.2803060813532018, + "grad_norm": 0.6325144823189365, + "learning_rate": 2.2451612903225807e-05, + "loss": 0.4439, + "step": 174 + }, + { + "epoch": 0.2819170358437374, + "grad_norm": 0.5641277931011702, + "learning_rate": 2.2580645161290328e-05, + "loss": 0.4288, + "step": 175 + }, + { + "epoch": 0.28352799033427306, + "grad_norm": 0.4953611842384428, + "learning_rate": 2.2709677419354842e-05, + "loss": 0.4449, + "step": 176 + }, + { + "epoch": 0.2851389448248087, + "grad_norm": 0.5224583314537559, + "learning_rate": 2.2838709677419357e-05, + "loss": 0.4162, + "step": 177 + }, + { + "epoch": 0.28674989931534434, + "grad_norm": 0.5001528193199506, + "learning_rate": 2.2967741935483874e-05, + "loss": 0.4393, + "step": 178 + }, + { + "epoch": 0.28836085380588, + "grad_norm": 0.48777820697444463, + "learning_rate": 2.3096774193548388e-05, + "loss": 0.4261, + "step": 179 + }, + { + "epoch": 0.2899718082964156, + "grad_norm": 0.64062502637235, + "learning_rate": 2.3225806451612906e-05, + "loss": 0.4442, + "step": 180 + }, + { + "epoch": 0.2915827627869513, + "grad_norm": 0.5683627649259488, + "learning_rate": 2.335483870967742e-05, + "loss": 0.447, + "step": 181 + }, + { + "epoch": 0.2931937172774869, + "grad_norm": 0.7696086123267061, + "learning_rate": 2.3483870967741934e-05, + "loss": 0.4219, + "step": 182 + }, + { + "epoch": 0.29480467176802255, + "grad_norm": 0.7701985708034406, + "learning_rate": 2.3612903225806455e-05, + "loss": 0.4178, + "step": 183 + }, + { + "epoch": 0.2964156262585582, + "grad_norm": 0.655838324079945, + "learning_rate": 2.374193548387097e-05, + "loss": 0.4229, + "step": 184 + }, + { + "epoch": 0.29802658074909383, + "grad_norm": 0.5294585112609449, + "learning_rate": 2.3870967741935486e-05, + "loss": 0.4374, + "step": 185 + }, + { + "epoch": 0.2996375352396295, + "grad_norm": 0.6213012416475787, + "learning_rate": 2.4e-05, + "loss": 0.4307, + "step": 186 + }, + { + "epoch": 0.3012484897301651, + "grad_norm": 0.7049316455544041, + "learning_rate": 2.4129032258064518e-05, + "loss": 0.4251, + "step": 187 + }, + { + "epoch": 0.30285944422070077, + "grad_norm": 0.5847111370562961, + "learning_rate": 2.4258064516129036e-05, + "loss": 0.4322, + "step": 188 + }, + { + "epoch": 0.30447039871123643, + "grad_norm": 0.5607890554854233, + "learning_rate": 2.438709677419355e-05, + "loss": 0.4283, + "step": 189 + }, + { + "epoch": 0.30608135320177204, + "grad_norm": 0.4807120710699517, + "learning_rate": 2.4516129032258067e-05, + "loss": 0.4321, + "step": 190 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.6290000689322379, + "learning_rate": 2.464516129032258e-05, + "loss": 0.4398, + "step": 191 + }, + { + "epoch": 0.3093032621828433, + "grad_norm": 0.5860760329948072, + "learning_rate": 2.47741935483871e-05, + "loss": 0.4239, + "step": 192 + }, + { + "epoch": 0.310914216673379, + "grad_norm": 0.5713603124420665, + "learning_rate": 2.4903225806451613e-05, + "loss": 0.4501, + "step": 193 + }, + { + "epoch": 0.3125251711639146, + "grad_norm": 0.7292654172275151, + "learning_rate": 2.5032258064516134e-05, + "loss": 0.44, + "step": 194 + }, + { + "epoch": 0.31413612565445026, + "grad_norm": 0.557304847589959, + "learning_rate": 2.5161290322580648e-05, + "loss": 0.4391, + "step": 195 + }, + { + "epoch": 0.3157470801449859, + "grad_norm": 0.8185353596554609, + "learning_rate": 2.5290322580645162e-05, + "loss": 0.4495, + "step": 196 + }, + { + "epoch": 0.31735803463552154, + "grad_norm": 0.6696922756603706, + "learning_rate": 2.541935483870968e-05, + "loss": 0.4229, + "step": 197 + }, + { + "epoch": 0.3189689891260572, + "grad_norm": 0.5867477527038533, + "learning_rate": 2.5548387096774194e-05, + "loss": 0.4281, + "step": 198 + }, + { + "epoch": 0.3205799436165928, + "grad_norm": 0.7383292533663941, + "learning_rate": 2.567741935483871e-05, + "loss": 0.4335, + "step": 199 + }, + { + "epoch": 0.3221908981071285, + "grad_norm": 0.5494545052520469, + "learning_rate": 2.5806451612903226e-05, + "loss": 0.4123, + "step": 200 + }, + { + "epoch": 0.32380185259766414, + "grad_norm": 0.787263534027303, + "learning_rate": 2.5935483870967746e-05, + "loss": 0.4334, + "step": 201 + }, + { + "epoch": 0.32541280708819975, + "grad_norm": 0.671237794078286, + "learning_rate": 2.606451612903226e-05, + "loss": 0.4469, + "step": 202 + }, + { + "epoch": 0.3270237615787354, + "grad_norm": 0.6087290607998392, + "learning_rate": 2.6193548387096775e-05, + "loss": 0.4468, + "step": 203 + }, + { + "epoch": 0.328634716069271, + "grad_norm": 0.8456692623524803, + "learning_rate": 2.6322580645161292e-05, + "loss": 0.4349, + "step": 204 + }, + { + "epoch": 0.3302456705598067, + "grad_norm": 0.5789430896858069, + "learning_rate": 2.6451612903225806e-05, + "loss": 0.4386, + "step": 205 + }, + { + "epoch": 0.3318566250503423, + "grad_norm": 0.5227774356619872, + "learning_rate": 2.6580645161290327e-05, + "loss": 0.4237, + "step": 206 + }, + { + "epoch": 0.33346757954087797, + "grad_norm": 0.5151727075586936, + "learning_rate": 2.670967741935484e-05, + "loss": 0.4397, + "step": 207 + }, + { + "epoch": 0.33507853403141363, + "grad_norm": 0.48417771982658486, + "learning_rate": 2.683870967741936e-05, + "loss": 0.4479, + "step": 208 + }, + { + "epoch": 0.33668948852194924, + "grad_norm": 0.5481117200758377, + "learning_rate": 2.6967741935483873e-05, + "loss": 0.4186, + "step": 209 + }, + { + "epoch": 0.3383004430124849, + "grad_norm": 0.4448179461564387, + "learning_rate": 2.7096774193548387e-05, + "loss": 0.4084, + "step": 210 + }, + { + "epoch": 0.3399113975030205, + "grad_norm": 0.49449277894946425, + "learning_rate": 2.7225806451612905e-05, + "loss": 0.442, + "step": 211 + }, + { + "epoch": 0.3415223519935562, + "grad_norm": 0.5207685357019004, + "learning_rate": 2.735483870967742e-05, + "loss": 0.4353, + "step": 212 + }, + { + "epoch": 0.34313330648409185, + "grad_norm": 0.7050029152494538, + "learning_rate": 2.748387096774194e-05, + "loss": 0.4226, + "step": 213 + }, + { + "epoch": 0.34474426097462746, + "grad_norm": 0.6948829474376914, + "learning_rate": 2.7612903225806454e-05, + "loss": 0.4246, + "step": 214 + }, + { + "epoch": 0.3463552154651631, + "grad_norm": 0.5285793312066525, + "learning_rate": 2.774193548387097e-05, + "loss": 0.4387, + "step": 215 + }, + { + "epoch": 0.34796616995569873, + "grad_norm": 0.5892451924441414, + "learning_rate": 2.7870967741935485e-05, + "loss": 0.4455, + "step": 216 + }, + { + "epoch": 0.3495771244462344, + "grad_norm": 0.8021238678624306, + "learning_rate": 2.8e-05, + "loss": 0.425, + "step": 217 + }, + { + "epoch": 0.35118807893677, + "grad_norm": 0.5945002361956657, + "learning_rate": 2.812903225806452e-05, + "loss": 0.4286, + "step": 218 + }, + { + "epoch": 0.3527990334273057, + "grad_norm": 0.428799155792937, + "learning_rate": 2.8258064516129035e-05, + "loss": 0.408, + "step": 219 + }, + { + "epoch": 0.35440998791784134, + "grad_norm": 0.5963355738995051, + "learning_rate": 2.8387096774193552e-05, + "loss": 0.4067, + "step": 220 + }, + { + "epoch": 0.35602094240837695, + "grad_norm": 0.6365882727245175, + "learning_rate": 2.8516129032258066e-05, + "loss": 0.4408, + "step": 221 + }, + { + "epoch": 0.3576318968989126, + "grad_norm": 0.46796304829903396, + "learning_rate": 2.8645161290322584e-05, + "loss": 0.4338, + "step": 222 + }, + { + "epoch": 0.3592428513894482, + "grad_norm": 0.47402969771177833, + "learning_rate": 2.8774193548387098e-05, + "loss": 0.4128, + "step": 223 + }, + { + "epoch": 0.3608538058799839, + "grad_norm": 0.48803059781651736, + "learning_rate": 2.8903225806451612e-05, + "loss": 0.4407, + "step": 224 + }, + { + "epoch": 0.36246476037051956, + "grad_norm": 0.49604586682206475, + "learning_rate": 2.9032258064516133e-05, + "loss": 0.4232, + "step": 225 + }, + { + "epoch": 0.36407571486105517, + "grad_norm": 0.48189900699189114, + "learning_rate": 2.9161290322580647e-05, + "loss": 0.4377, + "step": 226 + }, + { + "epoch": 0.36568666935159083, + "grad_norm": 0.48989884967543235, + "learning_rate": 2.9290322580645165e-05, + "loss": 0.4182, + "step": 227 + }, + { + "epoch": 0.36729762384212644, + "grad_norm": 0.5085990736148946, + "learning_rate": 2.941935483870968e-05, + "loss": 0.4141, + "step": 228 + }, + { + "epoch": 0.3689085783326621, + "grad_norm": 0.5542764435566585, + "learning_rate": 2.95483870967742e-05, + "loss": 0.4239, + "step": 229 + }, + { + "epoch": 0.3705195328231977, + "grad_norm": 0.5509845441337561, + "learning_rate": 2.9677419354838714e-05, + "loss": 0.4327, + "step": 230 + }, + { + "epoch": 0.3721304873137334, + "grad_norm": 0.5374789273502775, + "learning_rate": 2.9806451612903228e-05, + "loss": 0.4046, + "step": 231 + }, + { + "epoch": 0.37374144180426905, + "grad_norm": 0.512328932947813, + "learning_rate": 2.9935483870967745e-05, + "loss": 0.4416, + "step": 232 + }, + { + "epoch": 0.37535239629480466, + "grad_norm": 0.5980263212180401, + "learning_rate": 3.006451612903226e-05, + "loss": 0.437, + "step": 233 + }, + { + "epoch": 0.3769633507853403, + "grad_norm": 0.4528657439497093, + "learning_rate": 3.0193548387096777e-05, + "loss": 0.4151, + "step": 234 + }, + { + "epoch": 0.37857430527587593, + "grad_norm": 0.5472677539799308, + "learning_rate": 3.032258064516129e-05, + "loss": 0.4435, + "step": 235 + }, + { + "epoch": 0.3801852597664116, + "grad_norm": 0.5284722744720981, + "learning_rate": 3.0451612903225805e-05, + "loss": 0.421, + "step": 236 + }, + { + "epoch": 0.38179621425694726, + "grad_norm": 0.5145172039518136, + "learning_rate": 3.0580645161290326e-05, + "loss": 0.4255, + "step": 237 + }, + { + "epoch": 0.3834071687474829, + "grad_norm": 0.48595473394452576, + "learning_rate": 3.070967741935484e-05, + "loss": 0.4237, + "step": 238 + }, + { + "epoch": 0.38501812323801854, + "grad_norm": 0.6578784457151443, + "learning_rate": 3.083870967741936e-05, + "loss": 0.4372, + "step": 239 + }, + { + "epoch": 0.38662907772855415, + "grad_norm": 0.6381289705882743, + "learning_rate": 3.096774193548387e-05, + "loss": 0.4405, + "step": 240 + }, + { + "epoch": 0.3882400322190898, + "grad_norm": 0.5736653750108794, + "learning_rate": 3.109677419354839e-05, + "loss": 0.4304, + "step": 241 + }, + { + "epoch": 0.3898509867096255, + "grad_norm": 0.61679678041202, + "learning_rate": 3.122580645161291e-05, + "loss": 0.4299, + "step": 242 + }, + { + "epoch": 0.3914619412001611, + "grad_norm": 0.6665472313912354, + "learning_rate": 3.135483870967742e-05, + "loss": 0.4312, + "step": 243 + }, + { + "epoch": 0.39307289569069676, + "grad_norm": 0.6298402893744898, + "learning_rate": 3.1483870967741935e-05, + "loss": 0.4161, + "step": 244 + }, + { + "epoch": 0.39468385018123237, + "grad_norm": 0.5300708226898071, + "learning_rate": 3.161290322580645e-05, + "loss": 0.4277, + "step": 245 + }, + { + "epoch": 0.39629480467176803, + "grad_norm": 0.5893160355497045, + "learning_rate": 3.174193548387097e-05, + "loss": 0.4285, + "step": 246 + }, + { + "epoch": 0.39790575916230364, + "grad_norm": 0.6980967474518777, + "learning_rate": 3.187096774193549e-05, + "loss": 0.4181, + "step": 247 + }, + { + "epoch": 0.3995167136528393, + "grad_norm": 0.5413223642212789, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.4269, + "step": 248 + }, + { + "epoch": 0.40112766814337497, + "grad_norm": 0.40290216945992247, + "learning_rate": 3.2129032258064516e-05, + "loss": 0.4347, + "step": 249 + }, + { + "epoch": 0.4027386226339106, + "grad_norm": 0.5063526982810828, + "learning_rate": 3.2258064516129034e-05, + "loss": 0.4387, + "step": 250 + }, + { + "epoch": 0.40434957712444625, + "grad_norm": 0.5220460662141977, + "learning_rate": 3.238709677419355e-05, + "loss": 0.4307, + "step": 251 + }, + { + "epoch": 0.40596053161498186, + "grad_norm": 0.4914288485416426, + "learning_rate": 3.251612903225807e-05, + "loss": 0.4212, + "step": 252 + }, + { + "epoch": 0.4075714861055175, + "grad_norm": 0.5074074899301333, + "learning_rate": 3.2645161290322586e-05, + "loss": 0.4247, + "step": 253 + }, + { + "epoch": 0.4091824405960532, + "grad_norm": 0.5117479204044484, + "learning_rate": 3.27741935483871e-05, + "loss": 0.4199, + "step": 254 + }, + { + "epoch": 0.4107933950865888, + "grad_norm": 0.5057623752314662, + "learning_rate": 3.2903225806451614e-05, + "loss": 0.4377, + "step": 255 + }, + { + "epoch": 0.41240434957712446, + "grad_norm": 0.45973399006678667, + "learning_rate": 3.303225806451613e-05, + "loss": 0.4294, + "step": 256 + }, + { + "epoch": 0.4140153040676601, + "grad_norm": 0.4690587970020925, + "learning_rate": 3.316129032258064e-05, + "loss": 0.43, + "step": 257 + }, + { + "epoch": 0.41562625855819574, + "grad_norm": 0.5079472609720749, + "learning_rate": 3.329032258064517e-05, + "loss": 0.4218, + "step": 258 + }, + { + "epoch": 0.41723721304873135, + "grad_norm": 0.5254404696098615, + "learning_rate": 3.341935483870968e-05, + "loss": 0.4236, + "step": 259 + }, + { + "epoch": 0.418848167539267, + "grad_norm": 0.4097556199416461, + "learning_rate": 3.3548387096774195e-05, + "loss": 0.4297, + "step": 260 + }, + { + "epoch": 0.4204591220298027, + "grad_norm": 0.5308773641962137, + "learning_rate": 3.367741935483871e-05, + "loss": 0.4144, + "step": 261 + }, + { + "epoch": 0.4220700765203383, + "grad_norm": 0.5114920055423455, + "learning_rate": 3.380645161290323e-05, + "loss": 0.4063, + "step": 262 + }, + { + "epoch": 0.42368103101087395, + "grad_norm": 0.5684133487028641, + "learning_rate": 3.393548387096775e-05, + "loss": 0.3987, + "step": 263 + }, + { + "epoch": 0.42529198550140956, + "grad_norm": 0.4879815808284312, + "learning_rate": 3.406451612903226e-05, + "loss": 0.4301, + "step": 264 + }, + { + "epoch": 0.42690293999194523, + "grad_norm": 0.5519017376318576, + "learning_rate": 3.4193548387096776e-05, + "loss": 0.4282, + "step": 265 + }, + { + "epoch": 0.4285138944824809, + "grad_norm": 0.5172565641895582, + "learning_rate": 3.4322580645161294e-05, + "loss": 0.4293, + "step": 266 + }, + { + "epoch": 0.4301248489730165, + "grad_norm": 0.5978796630371895, + "learning_rate": 3.445161290322581e-05, + "loss": 0.4256, + "step": 267 + }, + { + "epoch": 0.43173580346355217, + "grad_norm": 0.46900318232253063, + "learning_rate": 3.458064516129032e-05, + "loss": 0.4472, + "step": 268 + }, + { + "epoch": 0.4333467579540878, + "grad_norm": 0.5359562617290479, + "learning_rate": 3.4709677419354846e-05, + "loss": 0.434, + "step": 269 + }, + { + "epoch": 0.43495771244462345, + "grad_norm": 0.5467288956413452, + "learning_rate": 3.483870967741936e-05, + "loss": 0.4273, + "step": 270 + }, + { + "epoch": 0.43656866693515906, + "grad_norm": 0.5657718857551184, + "learning_rate": 3.4967741935483874e-05, + "loss": 0.4353, + "step": 271 + }, + { + "epoch": 0.4381796214256947, + "grad_norm": 0.5222662238820099, + "learning_rate": 3.509677419354839e-05, + "loss": 0.4054, + "step": 272 + }, + { + "epoch": 0.4397905759162304, + "grad_norm": 0.5104771085486108, + "learning_rate": 3.52258064516129e-05, + "loss": 0.4296, + "step": 273 + }, + { + "epoch": 0.441401530406766, + "grad_norm": 0.573976020499011, + "learning_rate": 3.535483870967743e-05, + "loss": 0.4336, + "step": 274 + }, + { + "epoch": 0.44301248489730166, + "grad_norm": 0.5651641640641581, + "learning_rate": 3.548387096774194e-05, + "loss": 0.4332, + "step": 275 + }, + { + "epoch": 0.44462343938783727, + "grad_norm": 0.4432761779379142, + "learning_rate": 3.5612903225806455e-05, + "loss": 0.4372, + "step": 276 + }, + { + "epoch": 0.44623439387837294, + "grad_norm": 0.5288954385305018, + "learning_rate": 3.574193548387097e-05, + "loss": 0.4319, + "step": 277 + }, + { + "epoch": 0.4478453483689086, + "grad_norm": 0.4467858794531326, + "learning_rate": 3.5870967741935483e-05, + "loss": 0.398, + "step": 278 + }, + { + "epoch": 0.4494563028594442, + "grad_norm": 0.4854143357523973, + "learning_rate": 3.6e-05, + "loss": 0.405, + "step": 279 + }, + { + "epoch": 0.4510672573499799, + "grad_norm": 0.47026604377994136, + "learning_rate": 3.612903225806452e-05, + "loss": 0.4406, + "step": 280 + }, + { + "epoch": 0.4526782118405155, + "grad_norm": 0.49527144296582787, + "learning_rate": 3.6258064516129036e-05, + "loss": 0.4397, + "step": 281 + }, + { + "epoch": 0.45428916633105115, + "grad_norm": 0.5999996141858859, + "learning_rate": 3.6387096774193553e-05, + "loss": 0.4234, + "step": 282 + }, + { + "epoch": 0.45590012082158676, + "grad_norm": 0.5220722098689459, + "learning_rate": 3.651612903225807e-05, + "loss": 0.4346, + "step": 283 + }, + { + "epoch": 0.45751107531212243, + "grad_norm": 0.43711076153629586, + "learning_rate": 3.664516129032258e-05, + "loss": 0.4355, + "step": 284 + }, + { + "epoch": 0.4591220298026581, + "grad_norm": 0.6172985122046, + "learning_rate": 3.67741935483871e-05, + "loss": 0.4184, + "step": 285 + }, + { + "epoch": 0.4607329842931937, + "grad_norm": 0.5867425345199105, + "learning_rate": 3.690322580645162e-05, + "loss": 0.4438, + "step": 286 + }, + { + "epoch": 0.46234393878372937, + "grad_norm": 0.4457544342019048, + "learning_rate": 3.7032258064516134e-05, + "loss": 0.4194, + "step": 287 + }, + { + "epoch": 0.463954893274265, + "grad_norm": 0.666524347966105, + "learning_rate": 3.716129032258065e-05, + "loss": 0.422, + "step": 288 + }, + { + "epoch": 0.46556584776480064, + "grad_norm": 0.48549716093426315, + "learning_rate": 3.729032258064516e-05, + "loss": 0.4163, + "step": 289 + }, + { + "epoch": 0.4671768022553363, + "grad_norm": 0.528960986822585, + "learning_rate": 3.741935483870968e-05, + "loss": 0.4244, + "step": 290 + }, + { + "epoch": 0.4687877567458719, + "grad_norm": 0.5886855824754854, + "learning_rate": 3.75483870967742e-05, + "loss": 0.4131, + "step": 291 + }, + { + "epoch": 0.4703987112364076, + "grad_norm": 0.5041181448802059, + "learning_rate": 3.767741935483871e-05, + "loss": 0.4293, + "step": 292 + }, + { + "epoch": 0.4720096657269432, + "grad_norm": 0.5833712014204475, + "learning_rate": 3.780645161290323e-05, + "loss": 0.4189, + "step": 293 + }, + { + "epoch": 0.47362062021747886, + "grad_norm": 0.6344962543797149, + "learning_rate": 3.793548387096774e-05, + "loss": 0.4148, + "step": 294 + }, + { + "epoch": 0.4752315747080145, + "grad_norm": 0.4209164542717526, + "learning_rate": 3.806451612903226e-05, + "loss": 0.3986, + "step": 295 + }, + { + "epoch": 0.47684252919855014, + "grad_norm": 0.5561556783839265, + "learning_rate": 3.819354838709678e-05, + "loss": 0.4354, + "step": 296 + }, + { + "epoch": 0.4784534836890858, + "grad_norm": 0.6257224120221472, + "learning_rate": 3.832258064516129e-05, + "loss": 0.4151, + "step": 297 + }, + { + "epoch": 0.4800644381796214, + "grad_norm": 0.6035957973374384, + "learning_rate": 3.8451612903225813e-05, + "loss": 0.4116, + "step": 298 + }, + { + "epoch": 0.4816753926701571, + "grad_norm": 0.5050188538136214, + "learning_rate": 3.8580645161290324e-05, + "loss": 0.4205, + "step": 299 + }, + { + "epoch": 0.4832863471606927, + "grad_norm": 0.4716292658982768, + "learning_rate": 3.870967741935484e-05, + "loss": 0.4277, + "step": 300 + }, + { + "epoch": 0.48489730165122835, + "grad_norm": 0.4741578168809942, + "learning_rate": 3.883870967741936e-05, + "loss": 0.4212, + "step": 301 + }, + { + "epoch": 0.486508256141764, + "grad_norm": 0.4987417243199203, + "learning_rate": 3.896774193548388e-05, + "loss": 0.433, + "step": 302 + }, + { + "epoch": 0.4881192106322996, + "grad_norm": 0.47028866203783526, + "learning_rate": 3.909677419354839e-05, + "loss": 0.4409, + "step": 303 + }, + { + "epoch": 0.4897301651228353, + "grad_norm": 0.5419362505448754, + "learning_rate": 3.9225806451612905e-05, + "loss": 0.4072, + "step": 304 + }, + { + "epoch": 0.4913411196133709, + "grad_norm": 0.6217806303253193, + "learning_rate": 3.935483870967742e-05, + "loss": 0.4285, + "step": 305 + }, + { + "epoch": 0.49295207410390657, + "grad_norm": 0.5420446824564362, + "learning_rate": 3.948387096774194e-05, + "loss": 0.4405, + "step": 306 + }, + { + "epoch": 0.49456302859444223, + "grad_norm": 0.6297593472421277, + "learning_rate": 3.961290322580646e-05, + "loss": 0.4336, + "step": 307 + }, + { + "epoch": 0.49617398308497784, + "grad_norm": 0.6587570423137625, + "learning_rate": 3.974193548387097e-05, + "loss": 0.4241, + "step": 308 + }, + { + "epoch": 0.4977849375755135, + "grad_norm": 0.670548883274989, + "learning_rate": 3.9870967741935486e-05, + "loss": 0.4347, + "step": 309 + }, + { + "epoch": 0.4993958920660491, + "grad_norm": 0.6932491705418964, + "learning_rate": 4e-05, + "loss": 0.3976, + "step": 310 + }, + { + "epoch": 0.5010068465565848, + "grad_norm": 0.4742576328363187, + "learning_rate": 3.999998732081634e-05, + "loss": 0.4291, + "step": 311 + }, + { + "epoch": 0.5026178010471204, + "grad_norm": 0.529513963401129, + "learning_rate": 3.999994928328141e-05, + "loss": 0.4305, + "step": 312 + }, + { + "epoch": 0.5042287555376561, + "grad_norm": 0.6091004305460854, + "learning_rate": 3.9999885887443455e-05, + "loss": 0.4216, + "step": 313 + }, + { + "epoch": 0.5058397100281917, + "grad_norm": 0.5381494437696771, + "learning_rate": 3.9999797133382855e-05, + "loss": 0.4165, + "step": 314 + }, + { + "epoch": 0.5074506645187273, + "grad_norm": 0.4317011506932105, + "learning_rate": 3.9999683021212134e-05, + "loss": 0.443, + "step": 315 + }, + { + "epoch": 0.509061619009263, + "grad_norm": 0.47285920628489064, + "learning_rate": 3.999954355107598e-05, + "loss": 0.4282, + "step": 316 + }, + { + "epoch": 0.5106725734997987, + "grad_norm": 0.5242267633007971, + "learning_rate": 3.999937872315124e-05, + "loss": 0.4179, + "step": 317 + }, + { + "epoch": 0.5122835279903343, + "grad_norm": 0.4123011973763891, + "learning_rate": 3.9999188537646894e-05, + "loss": 0.4438, + "step": 318 + }, + { + "epoch": 0.5138944824808699, + "grad_norm": 0.4057247056617174, + "learning_rate": 3.999897299480408e-05, + "loss": 0.4036, + "step": 319 + }, + { + "epoch": 0.5155054369714056, + "grad_norm": 0.4188342427708109, + "learning_rate": 3.9998732094896084e-05, + "loss": 0.4299, + "step": 320 + }, + { + "epoch": 0.5171163914619412, + "grad_norm": 0.43672985709660445, + "learning_rate": 3.999846583822836e-05, + "loss": 0.4237, + "step": 321 + }, + { + "epoch": 0.5187273459524768, + "grad_norm": 0.48310073400224307, + "learning_rate": 3.999817422513849e-05, + "loss": 0.4187, + "step": 322 + }, + { + "epoch": 0.5203383004430124, + "grad_norm": 0.5530788167112765, + "learning_rate": 3.999785725599623e-05, + "loss": 0.4311, + "step": 323 + }, + { + "epoch": 0.5219492549335482, + "grad_norm": 0.5381555047474669, + "learning_rate": 3.999751493120345e-05, + "loss": 0.4291, + "step": 324 + }, + { + "epoch": 0.5235602094240838, + "grad_norm": 0.5827345030273974, + "learning_rate": 3.99971472511942e-05, + "loss": 0.4305, + "step": 325 + }, + { + "epoch": 0.5251711639146194, + "grad_norm": 0.38155681688778703, + "learning_rate": 3.999675421643467e-05, + "loss": 0.4174, + "step": 326 + }, + { + "epoch": 0.5267821184051551, + "grad_norm": 0.5492754517767143, + "learning_rate": 3.99963358274232e-05, + "loss": 0.4212, + "step": 327 + }, + { + "epoch": 0.5283930728956907, + "grad_norm": 0.48789623482177874, + "learning_rate": 3.9995892084690256e-05, + "loss": 0.4215, + "step": 328 + }, + { + "epoch": 0.5300040273862263, + "grad_norm": 0.4422563535366751, + "learning_rate": 3.9995422988798494e-05, + "loss": 0.4118, + "step": 329 + }, + { + "epoch": 0.5316149818767619, + "grad_norm": 0.6826787557912818, + "learning_rate": 3.999492854034266e-05, + "loss": 0.4322, + "step": 330 + }, + { + "epoch": 0.5332259363672976, + "grad_norm": 0.7161197167511507, + "learning_rate": 3.99944087399497e-05, + "loss": 0.4297, + "step": 331 + }, + { + "epoch": 0.5348368908578333, + "grad_norm": 0.45307187386139897, + "learning_rate": 3.999386358827866e-05, + "loss": 0.4017, + "step": 332 + }, + { + "epoch": 0.5364478453483689, + "grad_norm": 0.5080846848977575, + "learning_rate": 3.999329308602076e-05, + "loss": 0.4176, + "step": 333 + }, + { + "epoch": 0.5380587998389046, + "grad_norm": 0.44961265160037006, + "learning_rate": 3.9992697233899345e-05, + "loss": 0.4146, + "step": 334 + }, + { + "epoch": 0.5396697543294402, + "grad_norm": 0.44569468180373156, + "learning_rate": 3.9992076032669905e-05, + "loss": 0.4276, + "step": 335 + }, + { + "epoch": 0.5412807088199758, + "grad_norm": 0.4589301924407286, + "learning_rate": 3.999142948312007e-05, + "loss": 0.4068, + "step": 336 + }, + { + "epoch": 0.5428916633105115, + "grad_norm": 0.46283215335566513, + "learning_rate": 3.999075758606963e-05, + "loss": 0.4102, + "step": 337 + }, + { + "epoch": 0.5445026178010471, + "grad_norm": 0.4428187054855388, + "learning_rate": 3.999006034237047e-05, + "loss": 0.4076, + "step": 338 + }, + { + "epoch": 0.5461135722915827, + "grad_norm": 0.3941621296766751, + "learning_rate": 3.9989337752906656e-05, + "loss": 0.4248, + "step": 339 + }, + { + "epoch": 0.5477245267821184, + "grad_norm": 0.4911947798210392, + "learning_rate": 3.998858981859436e-05, + "loss": 0.4234, + "step": 340 + }, + { + "epoch": 0.5493354812726541, + "grad_norm": 0.45142619025228986, + "learning_rate": 3.998781654038192e-05, + "loss": 0.4098, + "step": 341 + }, + { + "epoch": 0.5509464357631897, + "grad_norm": 0.45716417313795044, + "learning_rate": 3.998701791924977e-05, + "loss": 0.4232, + "step": 342 + }, + { + "epoch": 0.5525573902537253, + "grad_norm": 0.4811829968487867, + "learning_rate": 3.998619395621051e-05, + "loss": 0.4301, + "step": 343 + }, + { + "epoch": 0.554168344744261, + "grad_norm": 0.3793740271087329, + "learning_rate": 3.9985344652308846e-05, + "loss": 0.4215, + "step": 344 + }, + { + "epoch": 0.5557792992347966, + "grad_norm": 0.42003419142799425, + "learning_rate": 3.998447000862164e-05, + "loss": 0.4509, + "step": 345 + }, + { + "epoch": 0.5573902537253322, + "grad_norm": 0.3906557697989691, + "learning_rate": 3.9983570026257844e-05, + "loss": 0.4231, + "step": 346 + }, + { + "epoch": 0.5590012082158679, + "grad_norm": 0.39196036899925196, + "learning_rate": 3.9982644706358596e-05, + "loss": 0.4089, + "step": 347 + }, + { + "epoch": 0.5606121627064036, + "grad_norm": 0.4913202549527037, + "learning_rate": 3.998169405009711e-05, + "loss": 0.422, + "step": 348 + }, + { + "epoch": 0.5622231171969392, + "grad_norm": 0.4279358660405233, + "learning_rate": 3.9980718058678733e-05, + "loss": 0.4248, + "step": 349 + }, + { + "epoch": 0.5638340716874748, + "grad_norm": 0.49969446549842206, + "learning_rate": 3.997971673334095e-05, + "loss": 0.4202, + "step": 350 + }, + { + "epoch": 0.5654450261780105, + "grad_norm": 0.41732878429281683, + "learning_rate": 3.997869007535336e-05, + "loss": 0.422, + "step": 351 + }, + { + "epoch": 0.5670559806685461, + "grad_norm": 0.4294137359439838, + "learning_rate": 3.997763808601768e-05, + "loss": 0.4252, + "step": 352 + }, + { + "epoch": 0.5686669351590817, + "grad_norm": 0.3707661272418532, + "learning_rate": 3.997656076666776e-05, + "loss": 0.4027, + "step": 353 + }, + { + "epoch": 0.5702778896496175, + "grad_norm": 0.3766908978268678, + "learning_rate": 3.997545811866952e-05, + "loss": 0.4312, + "step": 354 + }, + { + "epoch": 0.5718888441401531, + "grad_norm": 0.45306234795270156, + "learning_rate": 3.997433014342106e-05, + "loss": 0.4223, + "step": 355 + }, + { + "epoch": 0.5734997986306887, + "grad_norm": 0.4793083302776694, + "learning_rate": 3.997317684235254e-05, + "loss": 0.4229, + "step": 356 + }, + { + "epoch": 0.5751107531212243, + "grad_norm": 0.4675021055165329, + "learning_rate": 3.9971998216926274e-05, + "loss": 0.4434, + "step": 357 + }, + { + "epoch": 0.57672170761176, + "grad_norm": 0.4460710699457162, + "learning_rate": 3.997079426863664e-05, + "loss": 0.4013, + "step": 358 + }, + { + "epoch": 0.5783326621022956, + "grad_norm": 0.45964782363764733, + "learning_rate": 3.996956499901015e-05, + "loss": 0.4245, + "step": 359 + }, + { + "epoch": 0.5799436165928312, + "grad_norm": 0.5182341589054947, + "learning_rate": 3.996831040960543e-05, + "loss": 0.4038, + "step": 360 + }, + { + "epoch": 0.581554571083367, + "grad_norm": 0.4117269217562021, + "learning_rate": 3.996703050201319e-05, + "loss": 0.4153, + "step": 361 + }, + { + "epoch": 0.5831655255739026, + "grad_norm": 0.46882737702461946, + "learning_rate": 3.996572527785625e-05, + "loss": 0.3976, + "step": 362 + }, + { + "epoch": 0.5847764800644382, + "grad_norm": 0.3796364352520464, + "learning_rate": 3.996439473878952e-05, + "loss": 0.4208, + "step": 363 + }, + { + "epoch": 0.5863874345549738, + "grad_norm": 0.4992194206692767, + "learning_rate": 3.996303888650002e-05, + "loss": 0.4266, + "step": 364 + }, + { + "epoch": 0.5879983890455095, + "grad_norm": 0.4787895226087798, + "learning_rate": 3.9961657722706864e-05, + "loss": 0.4301, + "step": 365 + }, + { + "epoch": 0.5896093435360451, + "grad_norm": 0.5407494009323632, + "learning_rate": 3.996025124916125e-05, + "loss": 0.4157, + "step": 366 + }, + { + "epoch": 0.5912202980265807, + "grad_norm": 0.4261943946875255, + "learning_rate": 3.995881946764647e-05, + "loss": 0.4089, + "step": 367 + }, + { + "epoch": 0.5928312525171164, + "grad_norm": 0.3850462450614437, + "learning_rate": 3.995736237997792e-05, + "loss": 0.4272, + "step": 368 + }, + { + "epoch": 0.594442207007652, + "grad_norm": 0.45463889935104884, + "learning_rate": 3.9955879988003046e-05, + "loss": 0.4133, + "step": 369 + }, + { + "epoch": 0.5960531614981877, + "grad_norm": 0.48825127888326364, + "learning_rate": 3.9954372293601415e-05, + "loss": 0.4336, + "step": 370 + }, + { + "epoch": 0.5976641159887233, + "grad_norm": 0.4068129703538119, + "learning_rate": 3.9952839298684656e-05, + "loss": 0.4001, + "step": 371 + }, + { + "epoch": 0.599275070479259, + "grad_norm": 0.49552613300447507, + "learning_rate": 3.9951281005196486e-05, + "loss": 0.4023, + "step": 372 + }, + { + "epoch": 0.6008860249697946, + "grad_norm": 0.46945317770031153, + "learning_rate": 3.994969741511269e-05, + "loss": 0.3857, + "step": 373 + }, + { + "epoch": 0.6024969794603302, + "grad_norm": 0.4741587306281566, + "learning_rate": 3.994808853044113e-05, + "loss": 0.4461, + "step": 374 + }, + { + "epoch": 0.6041079339508659, + "grad_norm": 0.4685179794707909, + "learning_rate": 3.994645435322174e-05, + "loss": 0.4094, + "step": 375 + }, + { + "epoch": 0.6057188884414015, + "grad_norm": 0.45483642283060166, + "learning_rate": 3.994479488552652e-05, + "loss": 0.4228, + "step": 376 + }, + { + "epoch": 0.6073298429319371, + "grad_norm": 0.4244946382664049, + "learning_rate": 3.9943110129459555e-05, + "loss": 0.4279, + "step": 377 + }, + { + "epoch": 0.6089407974224729, + "grad_norm": 0.4294198854653389, + "learning_rate": 3.994140008715697e-05, + "loss": 0.4032, + "step": 378 + }, + { + "epoch": 0.6105517519130085, + "grad_norm": 0.3989456663260737, + "learning_rate": 3.993966476078694e-05, + "loss": 0.4147, + "step": 379 + }, + { + "epoch": 0.6121627064035441, + "grad_norm": 0.3713937522581636, + "learning_rate": 3.9937904152549746e-05, + "loss": 0.4192, + "step": 380 + }, + { + "epoch": 0.6137736608940797, + "grad_norm": 0.4384496746610661, + "learning_rate": 3.993611826467768e-05, + "loss": 0.4356, + "step": 381 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.42889466156762074, + "learning_rate": 3.99343070994351e-05, + "loss": 0.4299, + "step": 382 + }, + { + "epoch": 0.616995569875151, + "grad_norm": 0.3555646074061858, + "learning_rate": 3.993247065911844e-05, + "loss": 0.4309, + "step": 383 + }, + { + "epoch": 0.6186065243656866, + "grad_norm": 0.43396045989866394, + "learning_rate": 3.993060894605612e-05, + "loss": 0.3994, + "step": 384 + }, + { + "epoch": 0.6202174788562224, + "grad_norm": 0.43776606757803926, + "learning_rate": 3.992872196260866e-05, + "loss": 0.4097, + "step": 385 + }, + { + "epoch": 0.621828433346758, + "grad_norm": 0.420636085604741, + "learning_rate": 3.99268097111686e-05, + "loss": 0.417, + "step": 386 + }, + { + "epoch": 0.6234393878372936, + "grad_norm": 0.5064346878522746, + "learning_rate": 3.992487219416052e-05, + "loss": 0.4183, + "step": 387 + }, + { + "epoch": 0.6250503423278292, + "grad_norm": 0.5057851487040337, + "learning_rate": 3.992290941404103e-05, + "loss": 0.3974, + "step": 388 + }, + { + "epoch": 0.6266612968183649, + "grad_norm": 0.5521508424756063, + "learning_rate": 3.992092137329878e-05, + "loss": 0.4108, + "step": 389 + }, + { + "epoch": 0.6282722513089005, + "grad_norm": 0.5405974200010049, + "learning_rate": 3.991890807445443e-05, + "loss": 0.4477, + "step": 390 + }, + { + "epoch": 0.6298832057994361, + "grad_norm": 0.39506341263585665, + "learning_rate": 3.991686952006069e-05, + "loss": 0.4326, + "step": 391 + }, + { + "epoch": 0.6314941602899719, + "grad_norm": 0.511124831235241, + "learning_rate": 3.991480571270228e-05, + "loss": 0.4226, + "step": 392 + }, + { + "epoch": 0.6331051147805075, + "grad_norm": 0.42776265768914273, + "learning_rate": 3.991271665499594e-05, + "loss": 0.4046, + "step": 393 + }, + { + "epoch": 0.6347160692710431, + "grad_norm": 0.4959908838554333, + "learning_rate": 3.991060234959042e-05, + "loss": 0.416, + "step": 394 + }, + { + "epoch": 0.6363270237615787, + "grad_norm": 0.4670521894890168, + "learning_rate": 3.990846279916649e-05, + "loss": 0.4224, + "step": 395 + }, + { + "epoch": 0.6379379782521144, + "grad_norm": 0.4006152685344792, + "learning_rate": 3.9906298006436924e-05, + "loss": 0.4164, + "step": 396 + }, + { + "epoch": 0.63954893274265, + "grad_norm": 0.4463899466226375, + "learning_rate": 3.99041079741465e-05, + "loss": 0.4298, + "step": 397 + }, + { + "epoch": 0.6411598872331856, + "grad_norm": 0.39436113561085406, + "learning_rate": 3.9901892705072004e-05, + "loss": 0.4139, + "step": 398 + }, + { + "epoch": 0.6427708417237213, + "grad_norm": 0.48684252012399337, + "learning_rate": 3.989965220202221e-05, + "loss": 0.4157, + "step": 399 + }, + { + "epoch": 0.644381796214257, + "grad_norm": 0.36781064362322613, + "learning_rate": 3.98973864678379e-05, + "loss": 0.4094, + "step": 400 + }, + { + "epoch": 0.6459927507047926, + "grad_norm": 0.4545736363221886, + "learning_rate": 3.989509550539185e-05, + "loss": 0.416, + "step": 401 + }, + { + "epoch": 0.6476037051953283, + "grad_norm": 0.4021213653694868, + "learning_rate": 3.989277931758879e-05, + "loss": 0.3986, + "step": 402 + }, + { + "epoch": 0.6492146596858639, + "grad_norm": 0.3938202729838156, + "learning_rate": 3.989043790736547e-05, + "loss": 0.3977, + "step": 403 + }, + { + "epoch": 0.6508256141763995, + "grad_norm": 0.3714283205343456, + "learning_rate": 3.9888071277690606e-05, + "loss": 0.4008, + "step": 404 + }, + { + "epoch": 0.6524365686669351, + "grad_norm": 0.49605923508034744, + "learning_rate": 3.988567943156489e-05, + "loss": 0.4044, + "step": 405 + }, + { + "epoch": 0.6540475231574708, + "grad_norm": 0.5342861440227792, + "learning_rate": 3.988326237202099e-05, + "loss": 0.4053, + "step": 406 + }, + { + "epoch": 0.6556584776480064, + "grad_norm": 0.3428099270926289, + "learning_rate": 3.988082010212354e-05, + "loss": 0.3928, + "step": 407 + }, + { + "epoch": 0.657269432138542, + "grad_norm": 0.4004574130249575, + "learning_rate": 3.987835262496913e-05, + "loss": 0.4171, + "step": 408 + }, + { + "epoch": 0.6588803866290778, + "grad_norm": 0.35970332120506987, + "learning_rate": 3.9875859943686335e-05, + "loss": 0.4024, + "step": 409 + }, + { + "epoch": 0.6604913411196134, + "grad_norm": 0.4224502636249679, + "learning_rate": 3.9873342061435664e-05, + "loss": 0.4187, + "step": 410 + }, + { + "epoch": 0.662102295610149, + "grad_norm": 0.3764938020888867, + "learning_rate": 3.987079898140958e-05, + "loss": 0.4072, + "step": 411 + }, + { + "epoch": 0.6637132501006846, + "grad_norm": 0.3694724316905374, + "learning_rate": 3.98682307068325e-05, + "loss": 0.415, + "step": 412 + }, + { + "epoch": 0.6653242045912203, + "grad_norm": 0.36693082697539087, + "learning_rate": 3.98656372409608e-05, + "loss": 0.4046, + "step": 413 + }, + { + "epoch": 0.6669351590817559, + "grad_norm": 0.44730430916292946, + "learning_rate": 3.986301858708278e-05, + "loss": 0.4133, + "step": 414 + }, + { + "epoch": 0.6685461135722915, + "grad_norm": 0.3809823415227652, + "learning_rate": 3.9860374748518676e-05, + "loss": 0.4211, + "step": 415 + }, + { + "epoch": 0.6701570680628273, + "grad_norm": 0.3563474324193386, + "learning_rate": 3.985770572862065e-05, + "loss": 0.4088, + "step": 416 + }, + { + "epoch": 0.6717680225533629, + "grad_norm": 0.3805173842463904, + "learning_rate": 3.985501153077282e-05, + "loss": 0.4124, + "step": 417 + }, + { + "epoch": 0.6733789770438985, + "grad_norm": 0.43552193509232306, + "learning_rate": 3.985229215839119e-05, + "loss": 0.4261, + "step": 418 + }, + { + "epoch": 0.6749899315344342, + "grad_norm": 0.42439761528072495, + "learning_rate": 3.984954761492372e-05, + "loss": 0.4017, + "step": 419 + }, + { + "epoch": 0.6766008860249698, + "grad_norm": 0.3105332394794023, + "learning_rate": 3.984677790385025e-05, + "loss": 0.3985, + "step": 420 + }, + { + "epoch": 0.6782118405155054, + "grad_norm": 0.4027207084117632, + "learning_rate": 3.9843983028682555e-05, + "loss": 0.4026, + "step": 421 + }, + { + "epoch": 0.679822795006041, + "grad_norm": 0.3508235171579207, + "learning_rate": 3.984116299296432e-05, + "loss": 0.3896, + "step": 422 + }, + { + "epoch": 0.6814337494965768, + "grad_norm": 0.3573507910316362, + "learning_rate": 3.9838317800271105e-05, + "loss": 0.4283, + "step": 423 + }, + { + "epoch": 0.6830447039871124, + "grad_norm": 0.37632509861430463, + "learning_rate": 3.983544745421038e-05, + "loss": 0.3997, + "step": 424 + }, + { + "epoch": 0.684655658477648, + "grad_norm": 0.3388804904806458, + "learning_rate": 3.983255195842152e-05, + "loss": 0.4124, + "step": 425 + }, + { + "epoch": 0.6862666129681837, + "grad_norm": 0.4833298766343206, + "learning_rate": 3.982963131657577e-05, + "loss": 0.4177, + "step": 426 + }, + { + "epoch": 0.6878775674587193, + "grad_norm": 0.42070364198235066, + "learning_rate": 3.982668553237628e-05, + "loss": 0.3904, + "step": 427 + }, + { + "epoch": 0.6894885219492549, + "grad_norm": 0.4482515006894441, + "learning_rate": 3.982371460955804e-05, + "loss": 0.4311, + "step": 428 + }, + { + "epoch": 0.6910994764397905, + "grad_norm": 0.40436392123161874, + "learning_rate": 3.982071855188796e-05, + "loss": 0.4045, + "step": 429 + }, + { + "epoch": 0.6927104309303262, + "grad_norm": 0.3383709764320333, + "learning_rate": 3.981769736316478e-05, + "loss": 0.4006, + "step": 430 + }, + { + "epoch": 0.6943213854208619, + "grad_norm": 0.42754996232658854, + "learning_rate": 3.9814651047219135e-05, + "loss": 0.4289, + "step": 431 + }, + { + "epoch": 0.6959323399113975, + "grad_norm": 0.3876602252178334, + "learning_rate": 3.9811579607913486e-05, + "loss": 0.4246, + "step": 432 + }, + { + "epoch": 0.6975432944019332, + "grad_norm": 0.4278854255881262, + "learning_rate": 3.9808483049142185e-05, + "loss": 0.4448, + "step": 433 + }, + { + "epoch": 0.6991542488924688, + "grad_norm": 0.35865693620507294, + "learning_rate": 3.980536137483141e-05, + "loss": 0.4142, + "step": 434 + }, + { + "epoch": 0.7007652033830044, + "grad_norm": 0.4625593533325866, + "learning_rate": 3.980221458893919e-05, + "loss": 0.4171, + "step": 435 + }, + { + "epoch": 0.70237615787354, + "grad_norm": 0.32682701461120106, + "learning_rate": 3.979904269545538e-05, + "loss": 0.4266, + "step": 436 + }, + { + "epoch": 0.7039871123640757, + "grad_norm": 0.415104731241559, + "learning_rate": 3.979584569840171e-05, + "loss": 0.4204, + "step": 437 + }, + { + "epoch": 0.7055980668546113, + "grad_norm": 0.3592034123608851, + "learning_rate": 3.979262360183169e-05, + "loss": 0.4067, + "step": 438 + }, + { + "epoch": 0.707209021345147, + "grad_norm": 0.43562091204913506, + "learning_rate": 3.9789376409830674e-05, + "loss": 0.4081, + "step": 439 + }, + { + "epoch": 0.7088199758356827, + "grad_norm": 0.43196190361541487, + "learning_rate": 3.978610412651584e-05, + "loss": 0.401, + "step": 440 + }, + { + "epoch": 0.7104309303262183, + "grad_norm": 0.4071385089261009, + "learning_rate": 3.978280675603618e-05, + "loss": 0.4163, + "step": 441 + }, + { + "epoch": 0.7120418848167539, + "grad_norm": 0.41016039223243267, + "learning_rate": 3.97794843025725e-05, + "loss": 0.3942, + "step": 442 + }, + { + "epoch": 0.7136528393072896, + "grad_norm": 0.3841321139543235, + "learning_rate": 3.977613677033738e-05, + "loss": 0.4118, + "step": 443 + }, + { + "epoch": 0.7152637937978252, + "grad_norm": 0.35310267037819615, + "learning_rate": 3.977276416357523e-05, + "loss": 0.4218, + "step": 444 + }, + { + "epoch": 0.7168747482883608, + "grad_norm": 0.4197080640230907, + "learning_rate": 3.976936648656223e-05, + "loss": 0.4261, + "step": 445 + }, + { + "epoch": 0.7184857027788965, + "grad_norm": 0.3546111322565111, + "learning_rate": 3.976594374360637e-05, + "loss": 0.4133, + "step": 446 + }, + { + "epoch": 0.7200966572694322, + "grad_norm": 0.4394668646642138, + "learning_rate": 3.97624959390474e-05, + "loss": 0.4168, + "step": 447 + }, + { + "epoch": 0.7217076117599678, + "grad_norm": 0.3221466833352809, + "learning_rate": 3.975902307725686e-05, + "loss": 0.4175, + "step": 448 + }, + { + "epoch": 0.7233185662505034, + "grad_norm": 0.46121007238081874, + "learning_rate": 3.975552516263804e-05, + "loss": 0.3995, + "step": 449 + }, + { + "epoch": 0.7249295207410391, + "grad_norm": 0.40160398472276715, + "learning_rate": 3.9752002199626035e-05, + "loss": 0.4152, + "step": 450 + }, + { + "epoch": 0.7265404752315747, + "grad_norm": 0.37117019417124525, + "learning_rate": 3.974845419268766e-05, + "loss": 0.3937, + "step": 451 + }, + { + "epoch": 0.7281514297221103, + "grad_norm": 0.31852553564843433, + "learning_rate": 3.97448811463215e-05, + "loss": 0.396, + "step": 452 + }, + { + "epoch": 0.7297623842126459, + "grad_norm": 0.42093190036658035, + "learning_rate": 3.974128306505788e-05, + "loss": 0.4105, + "step": 453 + }, + { + "epoch": 0.7313733387031817, + "grad_norm": 0.3427182108489802, + "learning_rate": 3.973765995345889e-05, + "loss": 0.3974, + "step": 454 + }, + { + "epoch": 0.7329842931937173, + "grad_norm": 0.38214069037483106, + "learning_rate": 3.973401181611832e-05, + "loss": 0.4309, + "step": 455 + }, + { + "epoch": 0.7345952476842529, + "grad_norm": 0.41629666543067606, + "learning_rate": 3.973033865766172e-05, + "loss": 0.4094, + "step": 456 + }, + { + "epoch": 0.7362062021747886, + "grad_norm": 0.34200464296520006, + "learning_rate": 3.972664048274636e-05, + "loss": 0.4161, + "step": 457 + }, + { + "epoch": 0.7378171566653242, + "grad_norm": 0.3471638897635492, + "learning_rate": 3.9722917296061216e-05, + "loss": 0.4297, + "step": 458 + }, + { + "epoch": 0.7394281111558598, + "grad_norm": 0.45161457244236064, + "learning_rate": 3.971916910232699e-05, + "loss": 0.4295, + "step": 459 + }, + { + "epoch": 0.7410390656463954, + "grad_norm": 0.42974197063134223, + "learning_rate": 3.971539590629608e-05, + "loss": 0.399, + "step": 460 + }, + { + "epoch": 0.7426500201369312, + "grad_norm": 0.43361932874840936, + "learning_rate": 3.971159771275259e-05, + "loss": 0.4096, + "step": 461 + }, + { + "epoch": 0.7442609746274668, + "grad_norm": 0.36249109522453676, + "learning_rate": 3.9707774526512334e-05, + "loss": 0.3998, + "step": 462 + }, + { + "epoch": 0.7458719291180024, + "grad_norm": 0.42345130014954085, + "learning_rate": 3.970392635242278e-05, + "loss": 0.4123, + "step": 463 + }, + { + "epoch": 0.7474828836085381, + "grad_norm": 0.4028131720862331, + "learning_rate": 3.970005319536311e-05, + "loss": 0.4048, + "step": 464 + }, + { + "epoch": 0.7490938380990737, + "grad_norm": 0.3922591817364784, + "learning_rate": 3.9696155060244166e-05, + "loss": 0.416, + "step": 465 + }, + { + "epoch": 0.7507047925896093, + "grad_norm": 0.394910086124614, + "learning_rate": 3.969223195200847e-05, + "loss": 0.409, + "step": 466 + }, + { + "epoch": 0.752315747080145, + "grad_norm": 0.4010895179099205, + "learning_rate": 3.9688283875630193e-05, + "loss": 0.4221, + "step": 467 + }, + { + "epoch": 0.7539267015706806, + "grad_norm": 0.3344274716823261, + "learning_rate": 3.96843108361152e-05, + "loss": 0.3966, + "step": 468 + }, + { + "epoch": 0.7555376560612163, + "grad_norm": 0.36508649487932493, + "learning_rate": 3.968031283850094e-05, + "loss": 0.3896, + "step": 469 + }, + { + "epoch": 0.7571486105517519, + "grad_norm": 0.3194248632705373, + "learning_rate": 3.967628988785658e-05, + "loss": 0.3913, + "step": 470 + }, + { + "epoch": 0.7587595650422876, + "grad_norm": 0.3348385327401639, + "learning_rate": 3.967224198928289e-05, + "loss": 0.4321, + "step": 471 + }, + { + "epoch": 0.7603705195328232, + "grad_norm": 0.3331982859049919, + "learning_rate": 3.966816914791226e-05, + "loss": 0.4088, + "step": 472 + }, + { + "epoch": 0.7619814740233588, + "grad_norm": 0.3252671342709306, + "learning_rate": 3.9664071368908726e-05, + "loss": 0.4324, + "step": 473 + }, + { + "epoch": 0.7635924285138945, + "grad_norm": 0.3778011510229453, + "learning_rate": 3.965994865746794e-05, + "loss": 0.4083, + "step": 474 + }, + { + "epoch": 0.7652033830044301, + "grad_norm": 0.3697341056634576, + "learning_rate": 3.9655801018817166e-05, + "loss": 0.417, + "step": 475 + }, + { + "epoch": 0.7668143374949657, + "grad_norm": 0.3686614640695823, + "learning_rate": 3.965162845821526e-05, + "loss": 0.404, + "step": 476 + }, + { + "epoch": 0.7684252919855014, + "grad_norm": 0.3198502413551383, + "learning_rate": 3.96474309809527e-05, + "loss": 0.3918, + "step": 477 + }, + { + "epoch": 0.7700362464760371, + "grad_norm": 0.404426337364835, + "learning_rate": 3.964320859235155e-05, + "loss": 0.3978, + "step": 478 + }, + { + "epoch": 0.7716472009665727, + "grad_norm": 0.37411618257407514, + "learning_rate": 3.963896129776544e-05, + "loss": 0.4447, + "step": 479 + }, + { + "epoch": 0.7732581554571083, + "grad_norm": 0.3647067289069705, + "learning_rate": 3.963468910257959e-05, + "loss": 0.4123, + "step": 480 + }, + { + "epoch": 0.774869109947644, + "grad_norm": 0.3646906272324245, + "learning_rate": 3.9630392012210804e-05, + "loss": 0.4106, + "step": 481 + }, + { + "epoch": 0.7764800644381796, + "grad_norm": 0.34012637826464404, + "learning_rate": 3.962607003210745e-05, + "loss": 0.4228, + "step": 482 + }, + { + "epoch": 0.7780910189287152, + "grad_norm": 0.3104491216102974, + "learning_rate": 3.9621723167749424e-05, + "loss": 0.4041, + "step": 483 + }, + { + "epoch": 0.779701973419251, + "grad_norm": 0.35559733908050584, + "learning_rate": 3.9617351424648215e-05, + "loss": 0.4224, + "step": 484 + }, + { + "epoch": 0.7813129279097866, + "grad_norm": 0.3461370454635196, + "learning_rate": 3.961295480834683e-05, + "loss": 0.4032, + "step": 485 + }, + { + "epoch": 0.7829238824003222, + "grad_norm": 0.3625961813341553, + "learning_rate": 3.960853332441981e-05, + "loss": 0.4031, + "step": 486 + }, + { + "epoch": 0.7845348368908578, + "grad_norm": 0.329895268062928, + "learning_rate": 3.960408697847324e-05, + "loss": 0.4115, + "step": 487 + }, + { + "epoch": 0.7861457913813935, + "grad_norm": 0.38660120590435787, + "learning_rate": 3.959961577614474e-05, + "loss": 0.4166, + "step": 488 + }, + { + "epoch": 0.7877567458719291, + "grad_norm": 0.33405951123771366, + "learning_rate": 3.9595119723103416e-05, + "loss": 0.4125, + "step": 489 + }, + { + "epoch": 0.7893677003624647, + "grad_norm": 0.37519732027389735, + "learning_rate": 3.9590598825049896e-05, + "loss": 0.4171, + "step": 490 + }, + { + "epoch": 0.7909786548530005, + "grad_norm": 0.3309560769864468, + "learning_rate": 3.95860530877163e-05, + "loss": 0.3816, + "step": 491 + }, + { + "epoch": 0.7925896093435361, + "grad_norm": 0.5272231298290175, + "learning_rate": 3.958148251686628e-05, + "loss": 0.4174, + "step": 492 + }, + { + "epoch": 0.7942005638340717, + "grad_norm": 0.3928743178231115, + "learning_rate": 3.9576887118294915e-05, + "loss": 0.4315, + "step": 493 + }, + { + "epoch": 0.7958115183246073, + "grad_norm": 0.4279333846270503, + "learning_rate": 3.957226689782882e-05, + "loss": 0.3968, + "step": 494 + }, + { + "epoch": 0.797422472815143, + "grad_norm": 0.5131288201799059, + "learning_rate": 3.956762186132604e-05, + "loss": 0.4145, + "step": 495 + }, + { + "epoch": 0.7990334273056786, + "grad_norm": 0.4048881402151487, + "learning_rate": 3.9562952014676116e-05, + "loss": 0.4184, + "step": 496 + }, + { + "epoch": 0.8006443817962142, + "grad_norm": 0.5006403294292532, + "learning_rate": 3.955825736380002e-05, + "loss": 0.4098, + "step": 497 + }, + { + "epoch": 0.8022553362867499, + "grad_norm": 0.45269029887323053, + "learning_rate": 3.95535379146502e-05, + "loss": 0.3942, + "step": 498 + }, + { + "epoch": 0.8038662907772856, + "grad_norm": 0.4401085637177939, + "learning_rate": 3.9548793673210515e-05, + "loss": 0.4112, + "step": 499 + }, + { + "epoch": 0.8054772452678212, + "grad_norm": 0.4536636237917159, + "learning_rate": 3.954402464549628e-05, + "loss": 0.4069, + "step": 500 + }, + { + "epoch": 0.8070881997583568, + "grad_norm": 0.4549086907445366, + "learning_rate": 3.9539230837554253e-05, + "loss": 0.3907, + "step": 501 + }, + { + "epoch": 0.8086991542488925, + "grad_norm": 0.4852155503335648, + "learning_rate": 3.953441225546257e-05, + "loss": 0.3988, + "step": 502 + }, + { + "epoch": 0.8103101087394281, + "grad_norm": 0.44412599397084784, + "learning_rate": 3.95295689053308e-05, + "loss": 0.4119, + "step": 503 + }, + { + "epoch": 0.8119210632299637, + "grad_norm": 0.44492278549668857, + "learning_rate": 3.9524700793299926e-05, + "loss": 0.4121, + "step": 504 + }, + { + "epoch": 0.8135320177204994, + "grad_norm": 0.34508494742909007, + "learning_rate": 3.951980792554231e-05, + "loss": 0.4033, + "step": 505 + }, + { + "epoch": 0.815142972211035, + "grad_norm": 0.4855681935693533, + "learning_rate": 3.9514890308261706e-05, + "loss": 0.4152, + "step": 506 + }, + { + "epoch": 0.8167539267015707, + "grad_norm": 0.3132705383046582, + "learning_rate": 3.9509947947693266e-05, + "loss": 0.3828, + "step": 507 + }, + { + "epoch": 0.8183648811921064, + "grad_norm": 0.4283053623030758, + "learning_rate": 3.950498085010348e-05, + "loss": 0.4005, + "step": 508 + }, + { + "epoch": 0.819975835682642, + "grad_norm": 0.3609521715579133, + "learning_rate": 3.949998902179024e-05, + "loss": 0.4155, + "step": 509 + }, + { + "epoch": 0.8215867901731776, + "grad_norm": 0.36832356239597386, + "learning_rate": 3.9494972469082764e-05, + "loss": 0.4141, + "step": 510 + }, + { + "epoch": 0.8231977446637132, + "grad_norm": 0.4503424316996966, + "learning_rate": 3.948993119834164e-05, + "loss": 0.4112, + "step": 511 + }, + { + "epoch": 0.8248086991542489, + "grad_norm": 0.41571779430634015, + "learning_rate": 3.948486521595878e-05, + "loss": 0.427, + "step": 512 + }, + { + "epoch": 0.8264196536447845, + "grad_norm": 0.3883909259491303, + "learning_rate": 3.9479774528357445e-05, + "loss": 0.3969, + "step": 513 + }, + { + "epoch": 0.8280306081353201, + "grad_norm": 0.3932810351114709, + "learning_rate": 3.9474659141992197e-05, + "loss": 0.4121, + "step": 514 + }, + { + "epoch": 0.8296415626258559, + "grad_norm": 0.4308748780875734, + "learning_rate": 3.946951906334895e-05, + "loss": 0.4076, + "step": 515 + }, + { + "epoch": 0.8312525171163915, + "grad_norm": 0.35777478573661275, + "learning_rate": 3.946435429894488e-05, + "loss": 0.4132, + "step": 516 + }, + { + "epoch": 0.8328634716069271, + "grad_norm": 0.48261083312441627, + "learning_rate": 3.94591648553285e-05, + "loss": 0.4006, + "step": 517 + }, + { + "epoch": 0.8344744260974627, + "grad_norm": 0.40863315410267975, + "learning_rate": 3.94539507390796e-05, + "loss": 0.4063, + "step": 518 + }, + { + "epoch": 0.8360853805879984, + "grad_norm": 0.43340425011865114, + "learning_rate": 3.944871195680926e-05, + "loss": 0.4138, + "step": 519 + }, + { + "epoch": 0.837696335078534, + "grad_norm": 0.39224215584003436, + "learning_rate": 3.9443448515159815e-05, + "loss": 0.3847, + "step": 520 + }, + { + "epoch": 0.8393072895690696, + "grad_norm": 0.4188929922999027, + "learning_rate": 3.9438160420804886e-05, + "loss": 0.4059, + "step": 521 + }, + { + "epoch": 0.8409182440596054, + "grad_norm": 0.41744558940813187, + "learning_rate": 3.943284768044935e-05, + "loss": 0.4052, + "step": 522 + }, + { + "epoch": 0.842529198550141, + "grad_norm": 0.46993836475096185, + "learning_rate": 3.942751030082932e-05, + "loss": 0.4243, + "step": 523 + }, + { + "epoch": 0.8441401530406766, + "grad_norm": 0.36251580903388136, + "learning_rate": 3.942214828871216e-05, + "loss": 0.4198, + "step": 524 + }, + { + "epoch": 0.8457511075312123, + "grad_norm": 0.3940717167893435, + "learning_rate": 3.9416761650896456e-05, + "loss": 0.4082, + "step": 525 + }, + { + "epoch": 0.8473620620217479, + "grad_norm": 0.4274934534112222, + "learning_rate": 3.941135039421204e-05, + "loss": 0.3946, + "step": 526 + }, + { + "epoch": 0.8489730165122835, + "grad_norm": 0.36645772020289197, + "learning_rate": 3.940591452551993e-05, + "loss": 0.4068, + "step": 527 + }, + { + "epoch": 0.8505839710028191, + "grad_norm": 0.4316640291709618, + "learning_rate": 3.9400454051712375e-05, + "loss": 0.4009, + "step": 528 + }, + { + "epoch": 0.8521949254933548, + "grad_norm": 0.34398127722906846, + "learning_rate": 3.939496897971281e-05, + "loss": 0.3907, + "step": 529 + }, + { + "epoch": 0.8538058799838905, + "grad_norm": 0.40687953024824, + "learning_rate": 3.938945931647585e-05, + "loss": 0.3901, + "step": 530 + }, + { + "epoch": 0.8554168344744261, + "grad_norm": 0.35490989918542215, + "learning_rate": 3.9383925068987306e-05, + "loss": 0.3928, + "step": 531 + }, + { + "epoch": 0.8570277889649618, + "grad_norm": 0.3548604235172149, + "learning_rate": 3.937836624426414e-05, + "loss": 0.4037, + "step": 532 + }, + { + "epoch": 0.8586387434554974, + "grad_norm": 0.3846710598884646, + "learning_rate": 3.9372782849354496e-05, + "loss": 0.4046, + "step": 533 + }, + { + "epoch": 0.860249697946033, + "grad_norm": 0.46337690960360517, + "learning_rate": 3.936717489133768e-05, + "loss": 0.4049, + "step": 534 + }, + { + "epoch": 0.8618606524365686, + "grad_norm": 0.3347915864544789, + "learning_rate": 3.936154237732409e-05, + "loss": 0.4199, + "step": 535 + }, + { + "epoch": 0.8634716069271043, + "grad_norm": 0.4109719486832367, + "learning_rate": 3.9355885314455316e-05, + "loss": 0.4145, + "step": 536 + }, + { + "epoch": 0.86508256141764, + "grad_norm": 0.4039943914910843, + "learning_rate": 3.935020370990405e-05, + "loss": 0.4094, + "step": 537 + }, + { + "epoch": 0.8666935159081756, + "grad_norm": 0.33381054763996676, + "learning_rate": 3.9344497570874105e-05, + "loss": 0.3997, + "step": 538 + }, + { + "epoch": 0.8683044703987113, + "grad_norm": 0.37323165887081494, + "learning_rate": 3.933876690460039e-05, + "loss": 0.4003, + "step": 539 + }, + { + "epoch": 0.8699154248892469, + "grad_norm": 0.4031148093361396, + "learning_rate": 3.9333011718348925e-05, + "loss": 0.4066, + "step": 540 + }, + { + "epoch": 0.8715263793797825, + "grad_norm": 0.35075255673198685, + "learning_rate": 3.932723201941683e-05, + "loss": 0.3941, + "step": 541 + }, + { + "epoch": 0.8731373338703181, + "grad_norm": 0.40922218226065216, + "learning_rate": 3.932142781513227e-05, + "loss": 0.4147, + "step": 542 + }, + { + "epoch": 0.8747482883608538, + "grad_norm": 0.3354099083614772, + "learning_rate": 3.9315599112854513e-05, + "loss": 0.4039, + "step": 543 + }, + { + "epoch": 0.8763592428513894, + "grad_norm": 0.3748405697078558, + "learning_rate": 3.930974591997387e-05, + "loss": 0.4069, + "step": 544 + }, + { + "epoch": 0.877970197341925, + "grad_norm": 0.3578120297382228, + "learning_rate": 3.930386824391173e-05, + "loss": 0.4237, + "step": 545 + }, + { + "epoch": 0.8795811518324608, + "grad_norm": 0.3498596617280635, + "learning_rate": 3.9297966092120494e-05, + "loss": 0.4201, + "step": 546 + }, + { + "epoch": 0.8811921063229964, + "grad_norm": 0.32782617117586416, + "learning_rate": 3.9292039472083604e-05, + "loss": 0.4052, + "step": 547 + }, + { + "epoch": 0.882803060813532, + "grad_norm": 0.32420421899810425, + "learning_rate": 3.928608839131554e-05, + "loss": 0.3967, + "step": 548 + }, + { + "epoch": 0.8844140153040677, + "grad_norm": 0.3610196304954956, + "learning_rate": 3.9280112857361785e-05, + "loss": 0.4073, + "step": 549 + }, + { + "epoch": 0.8860249697946033, + "grad_norm": 0.3462247564175289, + "learning_rate": 3.927411287779882e-05, + "loss": 0.419, + "step": 550 + }, + { + "epoch": 0.8876359242851389, + "grad_norm": 0.38210585668638986, + "learning_rate": 3.926808846023414e-05, + "loss": 0.398, + "step": 551 + }, + { + "epoch": 0.8892468787756745, + "grad_norm": 0.38393152658210905, + "learning_rate": 3.926203961230621e-05, + "loss": 0.425, + "step": 552 + }, + { + "epoch": 0.8908578332662103, + "grad_norm": 0.38660450532200585, + "learning_rate": 3.925596634168447e-05, + "loss": 0.4139, + "step": 553 + }, + { + "epoch": 0.8924687877567459, + "grad_norm": 0.4455772806196931, + "learning_rate": 3.9249868656069346e-05, + "loss": 0.4076, + "step": 554 + }, + { + "epoch": 0.8940797422472815, + "grad_norm": 0.30617516032050024, + "learning_rate": 3.9243746563192184e-05, + "loss": 0.4047, + "step": 555 + }, + { + "epoch": 0.8956906967378172, + "grad_norm": 0.4660640316019242, + "learning_rate": 3.923760007081532e-05, + "loss": 0.4023, + "step": 556 + }, + { + "epoch": 0.8973016512283528, + "grad_norm": 0.3429596427602513, + "learning_rate": 3.9231429186731996e-05, + "loss": 0.38, + "step": 557 + }, + { + "epoch": 0.8989126057188884, + "grad_norm": 0.41289741497886173, + "learning_rate": 3.922523391876638e-05, + "loss": 0.3973, + "step": 558 + }, + { + "epoch": 0.900523560209424, + "grad_norm": 0.4494827289948899, + "learning_rate": 3.921901427477358e-05, + "loss": 0.3997, + "step": 559 + }, + { + "epoch": 0.9021345146999598, + "grad_norm": 0.33632315973719623, + "learning_rate": 3.921277026263959e-05, + "loss": 0.409, + "step": 560 + }, + { + "epoch": 0.9037454691904954, + "grad_norm": 0.5070254674512086, + "learning_rate": 3.9206501890281305e-05, + "loss": 0.3968, + "step": 561 + }, + { + "epoch": 0.905356423681031, + "grad_norm": 0.4187055671468531, + "learning_rate": 3.920020916564652e-05, + "loss": 0.4142, + "step": 562 + }, + { + "epoch": 0.9069673781715667, + "grad_norm": 0.4460483809169745, + "learning_rate": 3.9193892096713886e-05, + "loss": 0.3944, + "step": 563 + }, + { + "epoch": 0.9085783326621023, + "grad_norm": 0.39905515724281176, + "learning_rate": 3.918755069149293e-05, + "loss": 0.4127, + "step": 564 + }, + { + "epoch": 0.9101892871526379, + "grad_norm": 0.3437786125257829, + "learning_rate": 3.9181184958024045e-05, + "loss": 0.4223, + "step": 565 + }, + { + "epoch": 0.9118002416431735, + "grad_norm": 0.3388066571758449, + "learning_rate": 3.917479490437845e-05, + "loss": 0.402, + "step": 566 + }, + { + "epoch": 0.9134111961337092, + "grad_norm": 0.38012535779270673, + "learning_rate": 3.9168380538658224e-05, + "loss": 0.4161, + "step": 567 + }, + { + "epoch": 0.9150221506242449, + "grad_norm": 0.3391977799991008, + "learning_rate": 3.916194186899626e-05, + "loss": 0.4038, + "step": 568 + }, + { + "epoch": 0.9166331051147805, + "grad_norm": 0.3646302581201986, + "learning_rate": 3.915547890355625e-05, + "loss": 0.4148, + "step": 569 + }, + { + "epoch": 0.9182440596053162, + "grad_norm": 0.3135049185381983, + "learning_rate": 3.914899165053272e-05, + "loss": 0.4111, + "step": 570 + }, + { + "epoch": 0.9198550140958518, + "grad_norm": 0.3535605107932701, + "learning_rate": 3.9142480118150964e-05, + "loss": 0.3874, + "step": 571 + }, + { + "epoch": 0.9214659685863874, + "grad_norm": 0.3418138874943149, + "learning_rate": 3.913594431466709e-05, + "loss": 0.395, + "step": 572 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.40111797732706755, + "learning_rate": 3.912938424836795e-05, + "loss": 0.412, + "step": 573 + }, + { + "epoch": 0.9246878775674587, + "grad_norm": 0.3565226075692776, + "learning_rate": 3.912279992757117e-05, + "loss": 0.4047, + "step": 574 + }, + { + "epoch": 0.9262988320579943, + "grad_norm": 0.43319435674290785, + "learning_rate": 3.911619136062515e-05, + "loss": 0.3937, + "step": 575 + }, + { + "epoch": 0.92790978654853, + "grad_norm": 0.40127033071910445, + "learning_rate": 3.9109558555909005e-05, + "loss": 0.4009, + "step": 576 + }, + { + "epoch": 0.9295207410390657, + "grad_norm": 0.40228778188193176, + "learning_rate": 3.910290152183258e-05, + "loss": 0.4035, + "step": 577 + }, + { + "epoch": 0.9311316955296013, + "grad_norm": 0.41880827476759785, + "learning_rate": 3.909622026683647e-05, + "loss": 0.3902, + "step": 578 + }, + { + "epoch": 0.9327426500201369, + "grad_norm": 0.4339612846528539, + "learning_rate": 3.9089514799391936e-05, + "loss": 0.427, + "step": 579 + }, + { + "epoch": 0.9343536045106726, + "grad_norm": 0.40920471224475635, + "learning_rate": 3.908278512800098e-05, + "loss": 0.4053, + "step": 580 + }, + { + "epoch": 0.9359645590012082, + "grad_norm": 0.3903565760977215, + "learning_rate": 3.907603126119627e-05, + "loss": 0.3851, + "step": 581 + }, + { + "epoch": 0.9375755134917438, + "grad_norm": 0.36452212891712393, + "learning_rate": 3.9069253207541165e-05, + "loss": 0.4102, + "step": 582 + }, + { + "epoch": 0.9391864679822794, + "grad_norm": 0.40963179376690034, + "learning_rate": 3.906245097562968e-05, + "loss": 0.4042, + "step": 583 + }, + { + "epoch": 0.9407974224728152, + "grad_norm": 0.47835605254275715, + "learning_rate": 3.9055624574086485e-05, + "loss": 0.4083, + "step": 584 + }, + { + "epoch": 0.9424083769633508, + "grad_norm": 0.35302712606627323, + "learning_rate": 3.9048774011566906e-05, + "loss": 0.399, + "step": 585 + }, + { + "epoch": 0.9440193314538864, + "grad_norm": 0.45147544517664695, + "learning_rate": 3.904189929675689e-05, + "loss": 0.3835, + "step": 586 + }, + { + "epoch": 0.9456302859444221, + "grad_norm": 0.3576207044412354, + "learning_rate": 3.903500043837302e-05, + "loss": 0.3983, + "step": 587 + }, + { + "epoch": 0.9472412404349577, + "grad_norm": 0.47971033814742575, + "learning_rate": 3.9028077445162486e-05, + "loss": 0.3876, + "step": 588 + }, + { + "epoch": 0.9488521949254933, + "grad_norm": 0.46648498519940407, + "learning_rate": 3.9021130325903076e-05, + "loss": 0.4104, + "step": 589 + }, + { + "epoch": 0.950463149416029, + "grad_norm": 0.36648880409549883, + "learning_rate": 3.9014159089403167e-05, + "loss": 0.4145, + "step": 590 + }, + { + "epoch": 0.9520741039065647, + "grad_norm": 0.4553737406525612, + "learning_rate": 3.9007163744501723e-05, + "loss": 0.4032, + "step": 591 + }, + { + "epoch": 0.9536850583971003, + "grad_norm": 0.4171288408426286, + "learning_rate": 3.900014430006827e-05, + "loss": 0.4131, + "step": 592 + }, + { + "epoch": 0.9552960128876359, + "grad_norm": 0.37296598077856796, + "learning_rate": 3.8993100765002886e-05, + "loss": 0.4098, + "step": 593 + }, + { + "epoch": 0.9569069673781716, + "grad_norm": 0.3668599752905086, + "learning_rate": 3.8986033148236206e-05, + "loss": 0.4006, + "step": 594 + }, + { + "epoch": 0.9585179218687072, + "grad_norm": 0.32605216315392055, + "learning_rate": 3.897894145872939e-05, + "loss": 0.389, + "step": 595 + }, + { + "epoch": 0.9601288763592428, + "grad_norm": 0.3408714924804675, + "learning_rate": 3.8971825705474104e-05, + "loss": 0.3854, + "step": 596 + }, + { + "epoch": 0.9617398308497785, + "grad_norm": 0.37330680663486493, + "learning_rate": 3.8964685897492566e-05, + "loss": 0.4025, + "step": 597 + }, + { + "epoch": 0.9633507853403142, + "grad_norm": 0.3549667131762123, + "learning_rate": 3.895752204383746e-05, + "loss": 0.4013, + "step": 598 + }, + { + "epoch": 0.9649617398308498, + "grad_norm": 0.3660249510077813, + "learning_rate": 3.895033415359196e-05, + "loss": 0.4057, + "step": 599 + }, + { + "epoch": 0.9665726943213854, + "grad_norm": 0.39751018282200024, + "learning_rate": 3.894312223586974e-05, + "loss": 0.4048, + "step": 600 + }, + { + "epoch": 0.9681836488119211, + "grad_norm": 0.31674666928386697, + "learning_rate": 3.8935886299814904e-05, + "loss": 0.3884, + "step": 601 + }, + { + "epoch": 0.9697946033024567, + "grad_norm": 0.42282251139146454, + "learning_rate": 3.8928626354602046e-05, + "loss": 0.373, + "step": 602 + }, + { + "epoch": 0.9714055577929923, + "grad_norm": 0.31609372557688337, + "learning_rate": 3.8921342409436175e-05, + "loss": 0.3946, + "step": 603 + }, + { + "epoch": 0.973016512283528, + "grad_norm": 0.44022277125892706, + "learning_rate": 3.891403447355274e-05, + "loss": 0.3961, + "step": 604 + }, + { + "epoch": 0.9746274667740636, + "grad_norm": 0.39405832611427605, + "learning_rate": 3.890670255621761e-05, + "loss": 0.3799, + "step": 605 + }, + { + "epoch": 0.9762384212645993, + "grad_norm": 0.3119144672899951, + "learning_rate": 3.889934666672706e-05, + "loss": 0.3837, + "step": 606 + }, + { + "epoch": 0.9778493757551349, + "grad_norm": 0.3896876780989498, + "learning_rate": 3.8891966814407745e-05, + "loss": 0.3998, + "step": 607 + }, + { + "epoch": 0.9794603302456706, + "grad_norm": 0.360270898809426, + "learning_rate": 3.8884563008616726e-05, + "loss": 0.4064, + "step": 608 + }, + { + "epoch": 0.9810712847362062, + "grad_norm": 0.42631951285751807, + "learning_rate": 3.8877135258741416e-05, + "loss": 0.392, + "step": 609 + }, + { + "epoch": 0.9826822392267418, + "grad_norm": 0.4415412873812909, + "learning_rate": 3.886968357419961e-05, + "loss": 0.4088, + "step": 610 + }, + { + "epoch": 0.9842931937172775, + "grad_norm": 0.36666132641993976, + "learning_rate": 3.886220796443942e-05, + "loss": 0.3939, + "step": 611 + }, + { + "epoch": 0.9859041482078131, + "grad_norm": 0.4202033590716815, + "learning_rate": 3.885470843893932e-05, + "loss": 0.3816, + "step": 612 + }, + { + "epoch": 0.9875151026983487, + "grad_norm": 0.3516656089440837, + "learning_rate": 3.884718500720808e-05, + "loss": 0.3865, + "step": 613 + }, + { + "epoch": 0.9891260571888845, + "grad_norm": 0.42108876415853347, + "learning_rate": 3.8839637678784815e-05, + "loss": 0.4058, + "step": 614 + }, + { + "epoch": 0.9907370116794201, + "grad_norm": 0.38505023099974517, + "learning_rate": 3.883206646323892e-05, + "loss": 0.4097, + "step": 615 + }, + { + "epoch": 0.9923479661699557, + "grad_norm": 0.40822421339312126, + "learning_rate": 3.882447137017007e-05, + "loss": 0.3982, + "step": 616 + }, + { + "epoch": 0.9939589206604913, + "grad_norm": 0.40702788912386473, + "learning_rate": 3.881685240920822e-05, + "loss": 0.4071, + "step": 617 + }, + { + "epoch": 0.995569875151027, + "grad_norm": 0.3723642576686159, + "learning_rate": 3.8809209590013606e-05, + "loss": 0.3869, + "step": 618 + }, + { + "epoch": 0.9971808296415626, + "grad_norm": 0.346526709604658, + "learning_rate": 3.8801542922276686e-05, + "loss": 0.3939, + "step": 619 + }, + { + "epoch": 0.9987917841320982, + "grad_norm": 0.4059535452467616, + "learning_rate": 3.879385241571817e-05, + "loss": 0.3963, + "step": 620 + }, + { + "epoch": 1.000402738622634, + "grad_norm": 0.41606971570933265, + "learning_rate": 3.8786138080089e-05, + "loss": 0.4643, + "step": 621 + }, + { + "epoch": 1.0020136931131696, + "grad_norm": 0.4642191778743235, + "learning_rate": 3.877839992517031e-05, + "loss": 0.3676, + "step": 622 + }, + { + "epoch": 1.0036246476037052, + "grad_norm": 0.4205733804052529, + "learning_rate": 3.8770637960773465e-05, + "loss": 0.358, + "step": 623 + }, + { + "epoch": 1.0052356020942408, + "grad_norm": 0.40959547235243765, + "learning_rate": 3.8762852196739994e-05, + "loss": 0.3635, + "step": 624 + }, + { + "epoch": 1.0068465565847764, + "grad_norm": 0.3791990728099719, + "learning_rate": 3.875504264294161e-05, + "loss": 0.3383, + "step": 625 + }, + { + "epoch": 1.0084575110753122, + "grad_norm": 0.5130910962490318, + "learning_rate": 3.8747209309280195e-05, + "loss": 0.3667, + "step": 626 + }, + { + "epoch": 1.0100684655658478, + "grad_norm": 0.39767900122923594, + "learning_rate": 3.873935220568776e-05, + "loss": 0.3597, + "step": 627 + }, + { + "epoch": 1.0116794200563834, + "grad_norm": 0.44532028330504037, + "learning_rate": 3.8731471342126495e-05, + "loss": 0.356, + "step": 628 + }, + { + "epoch": 1.013290374546919, + "grad_norm": 0.4136333579628707, + "learning_rate": 3.872356672858868e-05, + "loss": 0.3356, + "step": 629 + }, + { + "epoch": 1.0149013290374547, + "grad_norm": 0.40179390937304404, + "learning_rate": 3.871563837509672e-05, + "loss": 0.3317, + "step": 630 + }, + { + "epoch": 1.0165122835279903, + "grad_norm": 0.4824835553730714, + "learning_rate": 3.870768629170311e-05, + "loss": 0.3426, + "step": 631 + }, + { + "epoch": 1.018123238018526, + "grad_norm": 0.367967163870901, + "learning_rate": 3.869971048849046e-05, + "loss": 0.3409, + "step": 632 + }, + { + "epoch": 1.0197341925090617, + "grad_norm": 0.49092699271229584, + "learning_rate": 3.8691710975571425e-05, + "loss": 0.3658, + "step": 633 + }, + { + "epoch": 1.0213451469995973, + "grad_norm": 0.344714835725969, + "learning_rate": 3.8683687763088745e-05, + "loss": 0.3418, + "step": 634 + }, + { + "epoch": 1.022956101490133, + "grad_norm": 0.43359371603108976, + "learning_rate": 3.867564086121519e-05, + "loss": 0.3684, + "step": 635 + }, + { + "epoch": 1.0245670559806686, + "grad_norm": 0.3727355148405864, + "learning_rate": 3.866757028015357e-05, + "loss": 0.3721, + "step": 636 + }, + { + "epoch": 1.0261780104712042, + "grad_norm": 0.39174926745367894, + "learning_rate": 3.865947603013674e-05, + "loss": 0.3309, + "step": 637 + }, + { + "epoch": 1.0277889649617398, + "grad_norm": 0.3255706942445588, + "learning_rate": 3.865135812142753e-05, + "loss": 0.3429, + "step": 638 + }, + { + "epoch": 1.0293999194522754, + "grad_norm": 0.40845531163767246, + "learning_rate": 3.86432165643188e-05, + "loss": 0.3456, + "step": 639 + }, + { + "epoch": 1.0310108739428112, + "grad_norm": 0.32752549293401095, + "learning_rate": 3.863505136913337e-05, + "loss": 0.3561, + "step": 640 + }, + { + "epoch": 1.0326218284333468, + "grad_norm": 0.4306213011485377, + "learning_rate": 3.862686254622405e-05, + "loss": 0.3675, + "step": 641 + }, + { + "epoch": 1.0342327829238824, + "grad_norm": 0.3768521574814054, + "learning_rate": 3.8618650105973586e-05, + "loss": 0.3551, + "step": 642 + }, + { + "epoch": 1.035843737414418, + "grad_norm": 0.3398666804464596, + "learning_rate": 3.8610414058794695e-05, + "loss": 0.3195, + "step": 643 + }, + { + "epoch": 1.0374546919049537, + "grad_norm": 0.355463485377364, + "learning_rate": 3.860215441513001e-05, + "loss": 0.3427, + "step": 644 + }, + { + "epoch": 1.0390656463954893, + "grad_norm": 0.29625164040082436, + "learning_rate": 3.8593871185452074e-05, + "loss": 0.3446, + "step": 645 + }, + { + "epoch": 1.0406766008860249, + "grad_norm": 0.3640138737256208, + "learning_rate": 3.858556438026335e-05, + "loss": 0.3728, + "step": 646 + }, + { + "epoch": 1.0422875553765607, + "grad_norm": 0.32386946809285116, + "learning_rate": 3.8577234010096206e-05, + "loss": 0.3468, + "step": 647 + }, + { + "epoch": 1.0438985098670963, + "grad_norm": 0.36609144049870035, + "learning_rate": 3.856888008551285e-05, + "loss": 0.3617, + "step": 648 + }, + { + "epoch": 1.045509464357632, + "grad_norm": 0.4000523742064122, + "learning_rate": 3.856050261710539e-05, + "loss": 0.3591, + "step": 649 + }, + { + "epoch": 1.0471204188481675, + "grad_norm": 0.3847983140548682, + "learning_rate": 3.8552101615495755e-05, + "loss": 0.3566, + "step": 650 + }, + { + "epoch": 1.0487313733387031, + "grad_norm": 0.38930820839888375, + "learning_rate": 3.854367709133575e-05, + "loss": 0.3632, + "step": 651 + }, + { + "epoch": 1.0503423278292388, + "grad_norm": 0.3309535311423929, + "learning_rate": 3.853522905530698e-05, + "loss": 0.3477, + "step": 652 + }, + { + "epoch": 1.0519532823197744, + "grad_norm": 0.36099065328392765, + "learning_rate": 3.8526757518120846e-05, + "loss": 0.3498, + "step": 653 + }, + { + "epoch": 1.0535642368103102, + "grad_norm": 0.37075788058964326, + "learning_rate": 3.8518262490518585e-05, + "loss": 0.3658, + "step": 654 + }, + { + "epoch": 1.0551751913008458, + "grad_norm": 0.34165627904419094, + "learning_rate": 3.8509743983271196e-05, + "loss": 0.3506, + "step": 655 + }, + { + "epoch": 1.0567861457913814, + "grad_norm": 0.30950248679253206, + "learning_rate": 3.8501202007179447e-05, + "loss": 0.3307, + "step": 656 + }, + { + "epoch": 1.058397100281917, + "grad_norm": 0.3237159191997492, + "learning_rate": 3.8492636573073866e-05, + "loss": 0.3319, + "step": 657 + }, + { + "epoch": 1.0600080547724526, + "grad_norm": 0.3517039667110292, + "learning_rate": 3.8484047691814724e-05, + "loss": 0.3707, + "step": 658 + }, + { + "epoch": 1.0616190092629882, + "grad_norm": 0.339096140285127, + "learning_rate": 3.847543537429202e-05, + "loss": 0.3516, + "step": 659 + }, + { + "epoch": 1.0632299637535239, + "grad_norm": 0.38263230843949914, + "learning_rate": 3.8466799631425474e-05, + "loss": 0.3811, + "step": 660 + }, + { + "epoch": 1.0648409182440597, + "grad_norm": 0.36914183180517873, + "learning_rate": 3.8458140474164503e-05, + "loss": 0.3729, + "step": 661 + }, + { + "epoch": 1.0664518727345953, + "grad_norm": 0.3636472527013866, + "learning_rate": 3.8449457913488205e-05, + "loss": 0.344, + "step": 662 + }, + { + "epoch": 1.068062827225131, + "grad_norm": 0.3219280442977279, + "learning_rate": 3.8440751960405365e-05, + "loss": 0.3574, + "step": 663 + }, + { + "epoch": 1.0696737817156665, + "grad_norm": 0.3405685714287699, + "learning_rate": 3.843202262595442e-05, + "loss": 0.3402, + "step": 664 + }, + { + "epoch": 1.0712847362062021, + "grad_norm": 0.3498606308270493, + "learning_rate": 3.842326992120345e-05, + "loss": 0.3639, + "step": 665 + }, + { + "epoch": 1.0728956906967377, + "grad_norm": 0.32207988596751275, + "learning_rate": 3.841449385725018e-05, + "loss": 0.3409, + "step": 666 + }, + { + "epoch": 1.0745066451872733, + "grad_norm": 0.3491136718479199, + "learning_rate": 3.8405694445221924e-05, + "loss": 0.3462, + "step": 667 + }, + { + "epoch": 1.0761175996778092, + "grad_norm": 0.36346049492233184, + "learning_rate": 3.839687169627564e-05, + "loss": 0.3675, + "step": 668 + }, + { + "epoch": 1.0777285541683448, + "grad_norm": 0.3270189746678473, + "learning_rate": 3.838802562159783e-05, + "loss": 0.3428, + "step": 669 + }, + { + "epoch": 1.0793395086588804, + "grad_norm": 0.3572958227451133, + "learning_rate": 3.837915623240462e-05, + "loss": 0.3538, + "step": 670 + }, + { + "epoch": 1.080950463149416, + "grad_norm": 0.3646744219922745, + "learning_rate": 3.8370263539941647e-05, + "loss": 0.3521, + "step": 671 + }, + { + "epoch": 1.0825614176399516, + "grad_norm": 0.3403146272839342, + "learning_rate": 3.8361347555484136e-05, + "loss": 0.3447, + "step": 672 + }, + { + "epoch": 1.0841723721304872, + "grad_norm": 0.32652840028581526, + "learning_rate": 3.835240829033682e-05, + "loss": 0.3304, + "step": 673 + }, + { + "epoch": 1.085783326621023, + "grad_norm": 0.3400766178269568, + "learning_rate": 3.834344575583396e-05, + "loss": 0.3599, + "step": 674 + }, + { + "epoch": 1.0873942811115587, + "grad_norm": 0.38815523922016976, + "learning_rate": 3.833445996333932e-05, + "loss": 0.3359, + "step": 675 + }, + { + "epoch": 1.0890052356020943, + "grad_norm": 0.41704665471764435, + "learning_rate": 3.832545092424615e-05, + "loss": 0.3596, + "step": 676 + }, + { + "epoch": 1.09061619009263, + "grad_norm": 0.32531971390606135, + "learning_rate": 3.831641864997717e-05, + "loss": 0.3263, + "step": 677 + }, + { + "epoch": 1.0922271445831655, + "grad_norm": 0.3585802875329914, + "learning_rate": 3.830736315198457e-05, + "loss": 0.3641, + "step": 678 + }, + { + "epoch": 1.093838099073701, + "grad_norm": 0.31768810788040425, + "learning_rate": 3.8298284441749985e-05, + "loss": 0.3707, + "step": 679 + }, + { + "epoch": 1.0954490535642367, + "grad_norm": 0.32746796457773686, + "learning_rate": 3.828918253078448e-05, + "loss": 0.3433, + "step": 680 + }, + { + "epoch": 1.0970600080547726, + "grad_norm": 0.3224365136634425, + "learning_rate": 3.828005743062853e-05, + "loss": 0.3825, + "step": 681 + }, + { + "epoch": 1.0986709625453082, + "grad_norm": 0.3486055061501249, + "learning_rate": 3.827090915285202e-05, + "loss": 0.3626, + "step": 682 + }, + { + "epoch": 1.1002819170358438, + "grad_norm": 0.33829450630818075, + "learning_rate": 3.826173770905422e-05, + "loss": 0.35, + "step": 683 + }, + { + "epoch": 1.1018928715263794, + "grad_norm": 0.31882058047495504, + "learning_rate": 3.825254311086377e-05, + "loss": 0.3405, + "step": 684 + }, + { + "epoch": 1.103503826016915, + "grad_norm": 0.37226089185500716, + "learning_rate": 3.8243325369938674e-05, + "loss": 0.3469, + "step": 685 + }, + { + "epoch": 1.1051147805074506, + "grad_norm": 0.4101836669177939, + "learning_rate": 3.823408449796627e-05, + "loss": 0.345, + "step": 686 + }, + { + "epoch": 1.1067257349979862, + "grad_norm": 0.3137847522951066, + "learning_rate": 3.822482050666322e-05, + "loss": 0.3221, + "step": 687 + }, + { + "epoch": 1.108336689488522, + "grad_norm": 0.36142432810250685, + "learning_rate": 3.821553340777553e-05, + "loss": 0.3635, + "step": 688 + }, + { + "epoch": 1.1099476439790577, + "grad_norm": 0.33266260220336297, + "learning_rate": 3.820622321307847e-05, + "loss": 0.3545, + "step": 689 + }, + { + "epoch": 1.1115585984695933, + "grad_norm": 0.31351708791895283, + "learning_rate": 3.8196889934376617e-05, + "loss": 0.3681, + "step": 690 + }, + { + "epoch": 1.1131695529601289, + "grad_norm": 0.28063065842857104, + "learning_rate": 3.818753358350379e-05, + "loss": 0.3443, + "step": 691 + }, + { + "epoch": 1.1147805074506645, + "grad_norm": 0.36353977199002524, + "learning_rate": 3.8178154172323094e-05, + "loss": 0.3753, + "step": 692 + }, + { + "epoch": 1.1163914619412, + "grad_norm": 0.2768651552529601, + "learning_rate": 3.8168751712726846e-05, + "loss": 0.3446, + "step": 693 + }, + { + "epoch": 1.1180024164317357, + "grad_norm": 0.3342687611239813, + "learning_rate": 3.815932621663661e-05, + "loss": 0.3724, + "step": 694 + }, + { + "epoch": 1.1196133709222715, + "grad_norm": 0.33647821887748514, + "learning_rate": 3.814987769600312e-05, + "loss": 0.3455, + "step": 695 + }, + { + "epoch": 1.1212243254128071, + "grad_norm": 0.31273923882238536, + "learning_rate": 3.814040616280636e-05, + "loss": 0.3462, + "step": 696 + }, + { + "epoch": 1.1228352799033428, + "grad_norm": 0.34039417308107917, + "learning_rate": 3.8130911629055443e-05, + "loss": 0.3398, + "step": 697 + }, + { + "epoch": 1.1244462343938784, + "grad_norm": 0.33579456080236614, + "learning_rate": 3.812139410678866e-05, + "loss": 0.3596, + "step": 698 + }, + { + "epoch": 1.126057188884414, + "grad_norm": 0.3230165075161485, + "learning_rate": 3.811185360807347e-05, + "loss": 0.3497, + "step": 699 + }, + { + "epoch": 1.1276681433749496, + "grad_norm": 0.3241972598923344, + "learning_rate": 3.810229014500643e-05, + "loss": 0.3324, + "step": 700 + }, + { + "epoch": 1.1292790978654854, + "grad_norm": 0.3296772176461001, + "learning_rate": 3.809270372971323e-05, + "loss": 0.3506, + "step": 701 + }, + { + "epoch": 1.130890052356021, + "grad_norm": 0.37285713064822357, + "learning_rate": 3.8083094374348676e-05, + "loss": 0.3732, + "step": 702 + }, + { + "epoch": 1.1325010068465566, + "grad_norm": 0.2874339346820618, + "learning_rate": 3.807346209109663e-05, + "loss": 0.3483, + "step": 703 + }, + { + "epoch": 1.1341119613370922, + "grad_norm": 0.37713299713195747, + "learning_rate": 3.8063806892170055e-05, + "loss": 0.3707, + "step": 704 + }, + { + "epoch": 1.1357229158276279, + "grad_norm": 0.36683564876617375, + "learning_rate": 3.805412878981095e-05, + "loss": 0.3412, + "step": 705 + }, + { + "epoch": 1.1373338703181635, + "grad_norm": 0.37846828032041174, + "learning_rate": 3.804442779629035e-05, + "loss": 0.3823, + "step": 706 + }, + { + "epoch": 1.138944824808699, + "grad_norm": 0.39303680121813295, + "learning_rate": 3.803470392390834e-05, + "loss": 0.335, + "step": 707 + }, + { + "epoch": 1.140555779299235, + "grad_norm": 0.32030974316744143, + "learning_rate": 3.8024957184993986e-05, + "loss": 0.3496, + "step": 708 + }, + { + "epoch": 1.1421667337897705, + "grad_norm": 0.40038802750352454, + "learning_rate": 3.8015187591905356e-05, + "loss": 0.3631, + "step": 709 + }, + { + "epoch": 1.1437776882803061, + "grad_norm": 0.3889019319477048, + "learning_rate": 3.800539515702949e-05, + "loss": 0.3617, + "step": 710 + }, + { + "epoch": 1.1453886427708417, + "grad_norm": 0.3372232290880548, + "learning_rate": 3.799557989278241e-05, + "loss": 0.3763, + "step": 711 + }, + { + "epoch": 1.1469995972613773, + "grad_norm": 0.37103844691689164, + "learning_rate": 3.798574181160907e-05, + "loss": 0.3707, + "step": 712 + }, + { + "epoch": 1.148610551751913, + "grad_norm": 0.3193426723569192, + "learning_rate": 3.7975880925983345e-05, + "loss": 0.3419, + "step": 713 + }, + { + "epoch": 1.1502215062424486, + "grad_norm": 0.3735797639088716, + "learning_rate": 3.796599724840803e-05, + "loss": 0.3612, + "step": 714 + }, + { + "epoch": 1.1518324607329844, + "grad_norm": 0.3144326051598097, + "learning_rate": 3.795609079141484e-05, + "loss": 0.344, + "step": 715 + }, + { + "epoch": 1.15344341522352, + "grad_norm": 0.3226419021515401, + "learning_rate": 3.794616156756433e-05, + "loss": 0.3352, + "step": 716 + }, + { + "epoch": 1.1550543697140556, + "grad_norm": 0.2952244005389135, + "learning_rate": 3.793620958944596e-05, + "loss": 0.369, + "step": 717 + }, + { + "epoch": 1.1566653242045912, + "grad_norm": 0.3017482997087285, + "learning_rate": 3.792623486967802e-05, + "loss": 0.3554, + "step": 718 + }, + { + "epoch": 1.1582762786951268, + "grad_norm": 0.3332567088466206, + "learning_rate": 3.791623742090765e-05, + "loss": 0.3549, + "step": 719 + }, + { + "epoch": 1.1598872331856624, + "grad_norm": 0.3496541869504861, + "learning_rate": 3.790621725581079e-05, + "loss": 0.3477, + "step": 720 + }, + { + "epoch": 1.161498187676198, + "grad_norm": 0.29711261872162453, + "learning_rate": 3.7896174387092194e-05, + "loss": 0.338, + "step": 721 + }, + { + "epoch": 1.163109142166734, + "grad_norm": 0.35281267419179585, + "learning_rate": 3.788610882748539e-05, + "loss": 0.329, + "step": 722 + }, + { + "epoch": 1.1647200966572695, + "grad_norm": 0.40685923102090615, + "learning_rate": 3.78760205897527e-05, + "loss": 0.4075, + "step": 723 + }, + { + "epoch": 1.166331051147805, + "grad_norm": 0.32685402009954984, + "learning_rate": 3.786590968668518e-05, + "loss": 0.3441, + "step": 724 + }, + { + "epoch": 1.1679420056383407, + "grad_norm": 0.30475399585683605, + "learning_rate": 3.785577613110264e-05, + "loss": 0.3271, + "step": 725 + }, + { + "epoch": 1.1695529601288763, + "grad_norm": 0.39855984307309933, + "learning_rate": 3.784561993585358e-05, + "loss": 0.3795, + "step": 726 + }, + { + "epoch": 1.171163914619412, + "grad_norm": 0.320784768450466, + "learning_rate": 3.783544111381524e-05, + "loss": 0.3476, + "step": 727 + }, + { + "epoch": 1.1727748691099475, + "grad_norm": 0.398225649315675, + "learning_rate": 3.782523967789354e-05, + "loss": 0.3998, + "step": 728 + }, + { + "epoch": 1.1743858236004834, + "grad_norm": 0.33274129125722207, + "learning_rate": 3.781501564102305e-05, + "loss": 0.3594, + "step": 729 + }, + { + "epoch": 1.175996778091019, + "grad_norm": 0.29733991185377096, + "learning_rate": 3.7804769016167036e-05, + "loss": 0.329, + "step": 730 + }, + { + "epoch": 1.1776077325815546, + "grad_norm": 0.34843064282370634, + "learning_rate": 3.779449981631737e-05, + "loss": 0.3725, + "step": 731 + }, + { + "epoch": 1.1792186870720902, + "grad_norm": 0.29488591686422405, + "learning_rate": 3.7784208054494554e-05, + "loss": 0.3618, + "step": 732 + }, + { + "epoch": 1.1808296415626258, + "grad_norm": 0.32309697260812176, + "learning_rate": 3.777389374374772e-05, + "loss": 0.3567, + "step": 733 + }, + { + "epoch": 1.1824405960531614, + "grad_norm": 0.2949421891798728, + "learning_rate": 3.776355689715455e-05, + "loss": 0.3425, + "step": 734 + }, + { + "epoch": 1.184051550543697, + "grad_norm": 0.3234000905104195, + "learning_rate": 3.775319752782133e-05, + "loss": 0.3556, + "step": 735 + }, + { + "epoch": 1.1856625050342329, + "grad_norm": 0.31297392729871315, + "learning_rate": 3.7742815648882906e-05, + "loss": 0.3596, + "step": 736 + }, + { + "epoch": 1.1872734595247685, + "grad_norm": 0.3651886549670445, + "learning_rate": 3.773241127350264e-05, + "loss": 0.3675, + "step": 737 + }, + { + "epoch": 1.188884414015304, + "grad_norm": 0.30965174965919956, + "learning_rate": 3.772198441487243e-05, + "loss": 0.343, + "step": 738 + }, + { + "epoch": 1.1904953685058397, + "grad_norm": 0.3346542890794813, + "learning_rate": 3.771153508621269e-05, + "loss": 0.3554, + "step": 739 + }, + { + "epoch": 1.1921063229963753, + "grad_norm": 0.3132631499207701, + "learning_rate": 3.770106330077231e-05, + "loss": 0.3428, + "step": 740 + }, + { + "epoch": 1.193717277486911, + "grad_norm": 0.36626598122283643, + "learning_rate": 3.769056907182866e-05, + "loss": 0.3674, + "step": 741 + }, + { + "epoch": 1.1953282319774465, + "grad_norm": 0.3296955790235176, + "learning_rate": 3.768005241268757e-05, + "loss": 0.3422, + "step": 742 + }, + { + "epoch": 1.1969391864679824, + "grad_norm": 0.3214817268876617, + "learning_rate": 3.76695133366833e-05, + "loss": 0.3456, + "step": 743 + }, + { + "epoch": 1.198550140958518, + "grad_norm": 0.2844760445477903, + "learning_rate": 3.7658951857178544e-05, + "loss": 0.3428, + "step": 744 + }, + { + "epoch": 1.2001610954490536, + "grad_norm": 0.3371872324687368, + "learning_rate": 3.764836798756439e-05, + "loss": 0.3618, + "step": 745 + }, + { + "epoch": 1.2017720499395892, + "grad_norm": 0.2908980367000638, + "learning_rate": 3.763776174126031e-05, + "loss": 0.3335, + "step": 746 + }, + { + "epoch": 1.2033830044301248, + "grad_norm": 0.3261395185565908, + "learning_rate": 3.762713313171419e-05, + "loss": 0.3677, + "step": 747 + }, + { + "epoch": 1.2049939589206604, + "grad_norm": 0.31868194245423315, + "learning_rate": 3.761648217240221e-05, + "loss": 0.3708, + "step": 748 + }, + { + "epoch": 1.206604913411196, + "grad_norm": 0.29164880164956636, + "learning_rate": 3.760580887682892e-05, + "loss": 0.3267, + "step": 749 + }, + { + "epoch": 1.2082158679017319, + "grad_norm": 0.28510156507177037, + "learning_rate": 3.7595113258527206e-05, + "loss": 0.3437, + "step": 750 + }, + { + "epoch": 1.2098268223922675, + "grad_norm": 0.2957794927406389, + "learning_rate": 3.758439533105822e-05, + "loss": 0.3238, + "step": 751 + }, + { + "epoch": 1.211437776882803, + "grad_norm": 0.348201286744665, + "learning_rate": 3.757365510801143e-05, + "loss": 0.3804, + "step": 752 + }, + { + "epoch": 1.2130487313733387, + "grad_norm": 0.31209813548990145, + "learning_rate": 3.756289260300456e-05, + "loss": 0.3738, + "step": 753 + }, + { + "epoch": 1.2146596858638743, + "grad_norm": 0.3194574953898529, + "learning_rate": 3.755210782968358e-05, + "loss": 0.3411, + "step": 754 + }, + { + "epoch": 1.21627064035441, + "grad_norm": 0.315228642494671, + "learning_rate": 3.7541300801722715e-05, + "loss": 0.3442, + "step": 755 + }, + { + "epoch": 1.2178815948449455, + "grad_norm": 0.2897162651173343, + "learning_rate": 3.7530471532824385e-05, + "loss": 0.3496, + "step": 756 + }, + { + "epoch": 1.2194925493354813, + "grad_norm": 0.34000075648700634, + "learning_rate": 3.751962003671922e-05, + "loss": 0.3824, + "step": 757 + }, + { + "epoch": 1.221103503826017, + "grad_norm": 0.2976452200617564, + "learning_rate": 3.750874632716604e-05, + "loss": 0.3424, + "step": 758 + }, + { + "epoch": 1.2227144583165526, + "grad_norm": 0.29249500001948164, + "learning_rate": 3.74978504179518e-05, + "loss": 0.3723, + "step": 759 + }, + { + "epoch": 1.2243254128070882, + "grad_norm": 0.3355440693648072, + "learning_rate": 3.7486932322891646e-05, + "loss": 0.354, + "step": 760 + }, + { + "epoch": 1.2259363672976238, + "grad_norm": 0.2774173083048779, + "learning_rate": 3.747599205582882e-05, + "loss": 0.3388, + "step": 761 + }, + { + "epoch": 1.2275473217881594, + "grad_norm": 0.307261387050007, + "learning_rate": 3.746502963063469e-05, + "loss": 0.3536, + "step": 762 + }, + { + "epoch": 1.229158276278695, + "grad_norm": 0.2779362555047775, + "learning_rate": 3.745404506120872e-05, + "loss": 0.3418, + "step": 763 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.2970571621131108, + "learning_rate": 3.744303836147844e-05, + "loss": 0.3885, + "step": 764 + }, + { + "epoch": 1.2323801852597664, + "grad_norm": 0.299475415748665, + "learning_rate": 3.743200954539945e-05, + "loss": 0.3357, + "step": 765 + }, + { + "epoch": 1.233991139750302, + "grad_norm": 0.28066511695868956, + "learning_rate": 3.7420958626955395e-05, + "loss": 0.3186, + "step": 766 + }, + { + "epoch": 1.2356020942408377, + "grad_norm": 0.32650220462849366, + "learning_rate": 3.7409885620157925e-05, + "loss": 0.374, + "step": 767 + }, + { + "epoch": 1.2372130487313733, + "grad_norm": 0.33812078920671956, + "learning_rate": 3.739879053904672e-05, + "loss": 0.3508, + "step": 768 + }, + { + "epoch": 1.2388240032219089, + "grad_norm": 0.34383445727592993, + "learning_rate": 3.738767339768942e-05, + "loss": 0.3779, + "step": 769 + }, + { + "epoch": 1.2404349577124447, + "grad_norm": 0.3202923747231639, + "learning_rate": 3.737653421018168e-05, + "loss": 0.3426, + "step": 770 + }, + { + "epoch": 1.2420459122029803, + "grad_norm": 0.35409199577635775, + "learning_rate": 3.736537299064706e-05, + "loss": 0.3572, + "step": 771 + }, + { + "epoch": 1.243656866693516, + "grad_norm": 0.3436578084892543, + "learning_rate": 3.735418975323708e-05, + "loss": 0.3388, + "step": 772 + }, + { + "epoch": 1.2452678211840515, + "grad_norm": 0.29421403073429603, + "learning_rate": 3.734298451213117e-05, + "loss": 0.3496, + "step": 773 + }, + { + "epoch": 1.2468787756745872, + "grad_norm": 0.3674454300701155, + "learning_rate": 3.7331757281536665e-05, + "loss": 0.3457, + "step": 774 + }, + { + "epoch": 1.2484897301651228, + "grad_norm": 0.2658262781818514, + "learning_rate": 3.732050807568878e-05, + "loss": 0.3353, + "step": 775 + }, + { + "epoch": 1.2501006846556586, + "grad_norm": 0.3558402384689967, + "learning_rate": 3.7309236908850574e-05, + "loss": 0.3531, + "step": 776 + }, + { + "epoch": 1.251711639146194, + "grad_norm": 0.2968753461483703, + "learning_rate": 3.729794379531299e-05, + "loss": 0.3463, + "step": 777 + }, + { + "epoch": 1.2533225936367298, + "grad_norm": 0.3068748390315143, + "learning_rate": 3.7286628749394754e-05, + "loss": 0.3495, + "step": 778 + }, + { + "epoch": 1.2549335481272654, + "grad_norm": 0.2853593430828942, + "learning_rate": 3.727529178544243e-05, + "loss": 0.349, + "step": 779 + }, + { + "epoch": 1.256544502617801, + "grad_norm": 0.36814006315805403, + "learning_rate": 3.726393291783036e-05, + "loss": 0.3683, + "step": 780 + }, + { + "epoch": 1.2581554571083367, + "grad_norm": 0.34204662393847385, + "learning_rate": 3.7252552160960666e-05, + "loss": 0.3395, + "step": 781 + }, + { + "epoch": 1.2597664115988723, + "grad_norm": 0.25952729501669486, + "learning_rate": 3.724114952926322e-05, + "loss": 0.3426, + "step": 782 + }, + { + "epoch": 1.261377366089408, + "grad_norm": 0.3434914251843314, + "learning_rate": 3.722972503719561e-05, + "loss": 0.3277, + "step": 783 + }, + { + "epoch": 1.2629883205799435, + "grad_norm": 0.36202104323985773, + "learning_rate": 3.7218278699243176e-05, + "loss": 0.3722, + "step": 784 + }, + { + "epoch": 1.2645992750704793, + "grad_norm": 0.35145897106947177, + "learning_rate": 3.7206810529918935e-05, + "loss": 0.3711, + "step": 785 + }, + { + "epoch": 1.266210229561015, + "grad_norm": 0.3119487302914577, + "learning_rate": 3.7195320543763596e-05, + "loss": 0.3352, + "step": 786 + }, + { + "epoch": 1.2678211840515505, + "grad_norm": 0.35035093132752193, + "learning_rate": 3.718380875534552e-05, + "loss": 0.3743, + "step": 787 + }, + { + "epoch": 1.2694321385420861, + "grad_norm": 0.3418946608898889, + "learning_rate": 3.71722751792607e-05, + "loss": 0.3429, + "step": 788 + }, + { + "epoch": 1.2710430930326218, + "grad_norm": 0.31270337962677086, + "learning_rate": 3.7160719830132794e-05, + "loss": 0.344, + "step": 789 + }, + { + "epoch": 1.2726540475231576, + "grad_norm": 0.3274369342826082, + "learning_rate": 3.714914272261302e-05, + "loss": 0.3499, + "step": 790 + }, + { + "epoch": 1.2742650020136932, + "grad_norm": 0.34598565436756357, + "learning_rate": 3.7137543871380224e-05, + "loss": 0.3829, + "step": 791 + }, + { + "epoch": 1.2758759565042288, + "grad_norm": 0.28889266818776443, + "learning_rate": 3.712592329114079e-05, + "loss": 0.3381, + "step": 792 + }, + { + "epoch": 1.2774869109947644, + "grad_norm": 0.3717773811020485, + "learning_rate": 3.7114280996628666e-05, + "loss": 0.3721, + "step": 793 + }, + { + "epoch": 1.2790978654853, + "grad_norm": 0.37135926966232047, + "learning_rate": 3.710261700260534e-05, + "loss": 0.3718, + "step": 794 + }, + { + "epoch": 1.2807088199758356, + "grad_norm": 0.33049624744013434, + "learning_rate": 3.7090931323859794e-05, + "loss": 0.3586, + "step": 795 + }, + { + "epoch": 1.2823197744663712, + "grad_norm": 0.3101839548871945, + "learning_rate": 3.707922397520852e-05, + "loss": 0.3397, + "step": 796 + }, + { + "epoch": 1.283930728956907, + "grad_norm": 0.30737107820444354, + "learning_rate": 3.706749497149547e-05, + "loss": 0.336, + "step": 797 + }, + { + "epoch": 1.2855416834474427, + "grad_norm": 0.3019490959641579, + "learning_rate": 3.705574432759208e-05, + "loss": 0.3512, + "step": 798 + }, + { + "epoch": 1.2871526379379783, + "grad_norm": 0.3503088570735374, + "learning_rate": 3.70439720583972e-05, + "loss": 0.3822, + "step": 799 + }, + { + "epoch": 1.288763592428514, + "grad_norm": 0.311137345604332, + "learning_rate": 3.70321781788371e-05, + "loss": 0.3641, + "step": 800 + }, + { + "epoch": 1.2903745469190495, + "grad_norm": 0.31960209562676384, + "learning_rate": 3.702036270386547e-05, + "loss": 0.3813, + "step": 801 + }, + { + "epoch": 1.2919855014095851, + "grad_norm": 0.3425160205698806, + "learning_rate": 3.700852564846335e-05, + "loss": 0.354, + "step": 802 + }, + { + "epoch": 1.2935964559001207, + "grad_norm": 0.30986308980528005, + "learning_rate": 3.6996667027639174e-05, + "loss": 0.3579, + "step": 803 + }, + { + "epoch": 1.2952074103906566, + "grad_norm": 0.3908250025109488, + "learning_rate": 3.6984786856428705e-05, + "loss": 0.3531, + "step": 804 + }, + { + "epoch": 1.2968183648811922, + "grad_norm": 0.3475929563748349, + "learning_rate": 3.697288514989502e-05, + "loss": 0.374, + "step": 805 + }, + { + "epoch": 1.2984293193717278, + "grad_norm": 0.36294126212027755, + "learning_rate": 3.696096192312852e-05, + "loss": 0.3576, + "step": 806 + }, + { + "epoch": 1.3000402738622634, + "grad_norm": 0.32338264256863225, + "learning_rate": 3.694901719124688e-05, + "loss": 0.3312, + "step": 807 + }, + { + "epoch": 1.301651228352799, + "grad_norm": 0.35527955178483733, + "learning_rate": 3.6937050969395055e-05, + "loss": 0.3836, + "step": 808 + }, + { + "epoch": 1.3032621828433346, + "grad_norm": 0.3628976303009859, + "learning_rate": 3.6925063272745214e-05, + "loss": 0.3629, + "step": 809 + }, + { + "epoch": 1.3048731373338702, + "grad_norm": 0.3022352084491082, + "learning_rate": 3.6913054116496797e-05, + "loss": 0.3605, + "step": 810 + }, + { + "epoch": 1.306484091824406, + "grad_norm": 0.3393294470418059, + "learning_rate": 3.690102351587643e-05, + "loss": 0.3512, + "step": 811 + }, + { + "epoch": 1.3080950463149417, + "grad_norm": 0.28363397227191833, + "learning_rate": 3.688897148613794e-05, + "loss": 0.3429, + "step": 812 + }, + { + "epoch": 1.3097060008054773, + "grad_norm": 0.33963097783454826, + "learning_rate": 3.687689804256229e-05, + "loss": 0.3332, + "step": 813 + }, + { + "epoch": 1.3113169552960129, + "grad_norm": 0.3600086874002223, + "learning_rate": 3.6864803200457646e-05, + "loss": 0.3625, + "step": 814 + }, + { + "epoch": 1.3129279097865485, + "grad_norm": 0.2816481879831538, + "learning_rate": 3.685268697515928e-05, + "loss": 0.3684, + "step": 815 + }, + { + "epoch": 1.314538864277084, + "grad_norm": 0.3139691237821825, + "learning_rate": 3.684054938202956e-05, + "loss": 0.3393, + "step": 816 + }, + { + "epoch": 1.3161498187676197, + "grad_norm": 0.29165261191102926, + "learning_rate": 3.682839043645798e-05, + "loss": 0.3228, + "step": 817 + }, + { + "epoch": 1.3177607732581555, + "grad_norm": 0.2718062507025103, + "learning_rate": 3.681621015386108e-05, + "loss": 0.3769, + "step": 818 + }, + { + "epoch": 1.3193717277486912, + "grad_norm": 0.3635524032153051, + "learning_rate": 3.680400854968246e-05, + "loss": 0.36, + "step": 819 + }, + { + "epoch": 1.3209826822392268, + "grad_norm": 0.28780321276593324, + "learning_rate": 3.679178563939278e-05, + "loss": 0.3479, + "step": 820 + }, + { + "epoch": 1.3225936367297624, + "grad_norm": 0.3238210640271435, + "learning_rate": 3.677954143848967e-05, + "loss": 0.3767, + "step": 821 + }, + { + "epoch": 1.324204591220298, + "grad_norm": 0.3080441012695841, + "learning_rate": 3.676727596249779e-05, + "loss": 0.3561, + "step": 822 + }, + { + "epoch": 1.3258155457108336, + "grad_norm": 0.32081488942296216, + "learning_rate": 3.675498922696876e-05, + "loss": 0.3692, + "step": 823 + }, + { + "epoch": 1.3274265002013692, + "grad_norm": 0.3278673509495895, + "learning_rate": 3.6742681247481144e-05, + "loss": 0.3231, + "step": 824 + }, + { + "epoch": 1.329037454691905, + "grad_norm": 0.3096169164731479, + "learning_rate": 3.6730352039640476e-05, + "loss": 0.3821, + "step": 825 + }, + { + "epoch": 1.3306484091824407, + "grad_norm": 0.33349256230646473, + "learning_rate": 3.671800161907917e-05, + "loss": 0.3268, + "step": 826 + }, + { + "epoch": 1.3322593636729763, + "grad_norm": 0.3295767828308085, + "learning_rate": 3.6705630001456556e-05, + "loss": 0.335, + "step": 827 + }, + { + "epoch": 1.3338703181635119, + "grad_norm": 0.2912807748587227, + "learning_rate": 3.669323720245884e-05, + "loss": 0.3409, + "step": 828 + }, + { + "epoch": 1.3354812726540475, + "grad_norm": 0.30649973311303386, + "learning_rate": 3.668082323779907e-05, + "loss": 0.3512, + "step": 829 + }, + { + "epoch": 1.337092227144583, + "grad_norm": 0.2959906133761005, + "learning_rate": 3.6668388123217154e-05, + "loss": 0.3612, + "step": 830 + }, + { + "epoch": 1.3387031816351187, + "grad_norm": 0.29861778533728894, + "learning_rate": 3.6655931874479783e-05, + "loss": 0.3356, + "step": 831 + }, + { + "epoch": 1.3403141361256545, + "grad_norm": 0.36174475564403585, + "learning_rate": 3.664345450738048e-05, + "loss": 0.3857, + "step": 832 + }, + { + "epoch": 1.3419250906161901, + "grad_norm": 0.2964990585195445, + "learning_rate": 3.663095603773952e-05, + "loss": 0.3703, + "step": 833 + }, + { + "epoch": 1.3435360451067258, + "grad_norm": 0.3444117342564229, + "learning_rate": 3.6618436481403945e-05, + "loss": 0.3477, + "step": 834 + }, + { + "epoch": 1.3451469995972614, + "grad_norm": 0.2921618958671085, + "learning_rate": 3.6605895854247534e-05, + "loss": 0.3663, + "step": 835 + }, + { + "epoch": 1.346757954087797, + "grad_norm": 0.3135599574497408, + "learning_rate": 3.659333417217076e-05, + "loss": 0.3556, + "step": 836 + }, + { + "epoch": 1.3483689085783326, + "grad_norm": 0.28548160767634045, + "learning_rate": 3.658075145110083e-05, + "loss": 0.3401, + "step": 837 + }, + { + "epoch": 1.3499798630688682, + "grad_norm": 0.33185921583222566, + "learning_rate": 3.6568147706991616e-05, + "loss": 0.3671, + "step": 838 + }, + { + "epoch": 1.351590817559404, + "grad_norm": 0.29106614505484624, + "learning_rate": 3.655552295582361e-05, + "loss": 0.3751, + "step": 839 + }, + { + "epoch": 1.3532017720499396, + "grad_norm": 0.32102312365154567, + "learning_rate": 3.654287721360398e-05, + "loss": 0.3854, + "step": 840 + }, + { + "epoch": 1.3548127265404752, + "grad_norm": 0.28054378659585727, + "learning_rate": 3.653021049636648e-05, + "loss": 0.3331, + "step": 841 + }, + { + "epoch": 1.3564236810310109, + "grad_norm": 0.3182024975537731, + "learning_rate": 3.65175228201715e-05, + "loss": 0.3423, + "step": 842 + }, + { + "epoch": 1.3580346355215465, + "grad_norm": 0.29631447266060057, + "learning_rate": 3.650481420110596e-05, + "loss": 0.3454, + "step": 843 + }, + { + "epoch": 1.3596455900120823, + "grad_norm": 0.39903373865432623, + "learning_rate": 3.6492084655283355e-05, + "loss": 0.369, + "step": 844 + }, + { + "epoch": 1.3612565445026177, + "grad_norm": 0.29989201553650346, + "learning_rate": 3.647933419884371e-05, + "loss": 0.3304, + "step": 845 + }, + { + "epoch": 1.3628674989931535, + "grad_norm": 0.35056143882565566, + "learning_rate": 3.646656284795357e-05, + "loss": 0.3723, + "step": 846 + }, + { + "epoch": 1.3644784534836891, + "grad_norm": 0.3441551125299749, + "learning_rate": 3.645377061880595e-05, + "loss": 0.3557, + "step": 847 + }, + { + "epoch": 1.3660894079742247, + "grad_norm": 0.3274845208948303, + "learning_rate": 3.644095752762036e-05, + "loss": 0.3399, + "step": 848 + }, + { + "epoch": 1.3677003624647603, + "grad_norm": 0.3024421060736529, + "learning_rate": 3.642812359064276e-05, + "loss": 0.3596, + "step": 849 + }, + { + "epoch": 1.369311316955296, + "grad_norm": 0.31600146060711926, + "learning_rate": 3.641526882414553e-05, + "loss": 0.3423, + "step": 850 + }, + { + "epoch": 1.3709222714458318, + "grad_norm": 0.30071046711567967, + "learning_rate": 3.640239324442746e-05, + "loss": 0.3424, + "step": 851 + }, + { + "epoch": 1.3725332259363672, + "grad_norm": 0.3203775229603514, + "learning_rate": 3.638949686781374e-05, + "loss": 0.3563, + "step": 852 + }, + { + "epoch": 1.374144180426903, + "grad_norm": 0.2791617091294289, + "learning_rate": 3.6376579710655915e-05, + "loss": 0.3473, + "step": 853 + }, + { + "epoch": 1.3757551349174386, + "grad_norm": 0.3262506737731935, + "learning_rate": 3.63636417893319e-05, + "loss": 0.3535, + "step": 854 + }, + { + "epoch": 1.3773660894079742, + "grad_norm": 0.37963304107161205, + "learning_rate": 3.6350683120245906e-05, + "loss": 0.3877, + "step": 855 + }, + { + "epoch": 1.3789770438985098, + "grad_norm": 0.26345344067107274, + "learning_rate": 3.633770371982848e-05, + "loss": 0.3344, + "step": 856 + }, + { + "epoch": 1.3805879983890454, + "grad_norm": 0.3589958673951733, + "learning_rate": 3.632470360453643e-05, + "loss": 0.3624, + "step": 857 + }, + { + "epoch": 1.3821989528795813, + "grad_norm": 0.2870645821680028, + "learning_rate": 3.631168279085286e-05, + "loss": 0.3442, + "step": 858 + }, + { + "epoch": 1.3838099073701167, + "grad_norm": 0.3154900838253569, + "learning_rate": 3.629864129528709e-05, + "loss": 0.3544, + "step": 859 + }, + { + "epoch": 1.3854208618606525, + "grad_norm": 0.32587561605795656, + "learning_rate": 3.6285579134374655e-05, + "loss": 0.3365, + "step": 860 + }, + { + "epoch": 1.387031816351188, + "grad_norm": 0.28524552874452264, + "learning_rate": 3.627249632467733e-05, + "loss": 0.3513, + "step": 861 + }, + { + "epoch": 1.3886427708417237, + "grad_norm": 0.3043822768243937, + "learning_rate": 3.625939288278304e-05, + "loss": 0.3505, + "step": 862 + }, + { + "epoch": 1.3902537253322593, + "grad_norm": 0.3283867868538535, + "learning_rate": 3.6246268825305886e-05, + "loss": 0.328, + "step": 863 + }, + { + "epoch": 1.391864679822795, + "grad_norm": 0.27028256421648467, + "learning_rate": 3.6233124168886094e-05, + "loss": 0.337, + "step": 864 + }, + { + "epoch": 1.3934756343133308, + "grad_norm": 0.31823659410494604, + "learning_rate": 3.621995893019003e-05, + "loss": 0.3483, + "step": 865 + }, + { + "epoch": 1.3950865888038662, + "grad_norm": 0.2853031194725599, + "learning_rate": 3.620677312591012e-05, + "loss": 0.3672, + "step": 866 + }, + { + "epoch": 1.396697543294402, + "grad_norm": 0.3703793870257088, + "learning_rate": 3.61935667727649e-05, + "loss": 0.3466, + "step": 867 + }, + { + "epoch": 1.3983084977849376, + "grad_norm": 0.3194336151665436, + "learning_rate": 3.6180339887498953e-05, + "loss": 0.3268, + "step": 868 + }, + { + "epoch": 1.3999194522754732, + "grad_norm": 0.33386957554709146, + "learning_rate": 3.616709248688288e-05, + "loss": 0.3419, + "step": 869 + }, + { + "epoch": 1.4015304067660088, + "grad_norm": 0.30916892946803903, + "learning_rate": 3.61538245877133e-05, + "loss": 0.3446, + "step": 870 + }, + { + "epoch": 1.4031413612565444, + "grad_norm": 0.29073836232064454, + "learning_rate": 3.614053620681284e-05, + "loss": 0.3247, + "step": 871 + }, + { + "epoch": 1.4047523157470803, + "grad_norm": 0.3341684803586826, + "learning_rate": 3.6127227361030076e-05, + "loss": 0.3811, + "step": 872 + }, + { + "epoch": 1.4063632702376159, + "grad_norm": 0.31287760315769936, + "learning_rate": 3.611389806723953e-05, + "loss": 0.3606, + "step": 873 + }, + { + "epoch": 1.4079742247281515, + "grad_norm": 0.3063668454004494, + "learning_rate": 3.610054834234167e-05, + "loss": 0.3625, + "step": 874 + }, + { + "epoch": 1.409585179218687, + "grad_norm": 0.3168490265948009, + "learning_rate": 3.608717820326285e-05, + "loss": 0.373, + "step": 875 + }, + { + "epoch": 1.4111961337092227, + "grad_norm": 0.3722746286249624, + "learning_rate": 3.6073787666955326e-05, + "loss": 0.3765, + "step": 876 + }, + { + "epoch": 1.4128070881997583, + "grad_norm": 0.2752473149335231, + "learning_rate": 3.6060376750397187e-05, + "loss": 0.3144, + "step": 877 + }, + { + "epoch": 1.414418042690294, + "grad_norm": 0.35239569804844095, + "learning_rate": 3.6046945470592395e-05, + "loss": 0.3759, + "step": 878 + }, + { + "epoch": 1.4160289971808298, + "grad_norm": 0.3259555420279317, + "learning_rate": 3.6033493844570704e-05, + "loss": 0.3643, + "step": 879 + }, + { + "epoch": 1.4176399516713654, + "grad_norm": 0.28683471938292443, + "learning_rate": 3.602002188938769e-05, + "loss": 0.3337, + "step": 880 + }, + { + "epoch": 1.419250906161901, + "grad_norm": 0.32883729844020054, + "learning_rate": 3.6006529622124694e-05, + "loss": 0.345, + "step": 881 + }, + { + "epoch": 1.4208618606524366, + "grad_norm": 0.2947797991693968, + "learning_rate": 3.59930170598888e-05, + "loss": 0.3356, + "step": 882 + }, + { + "epoch": 1.4224728151429722, + "grad_norm": 0.2826135754232561, + "learning_rate": 3.597948421981283e-05, + "loss": 0.3107, + "step": 883 + }, + { + "epoch": 1.4240837696335078, + "grad_norm": 0.3096811885777464, + "learning_rate": 3.596593111905533e-05, + "loss": 0.3754, + "step": 884 + }, + { + "epoch": 1.4256947241240434, + "grad_norm": 0.2813838416003549, + "learning_rate": 3.5952357774800526e-05, + "loss": 0.3484, + "step": 885 + }, + { + "epoch": 1.4273056786145792, + "grad_norm": 0.3147799342493275, + "learning_rate": 3.5938764204258306e-05, + "loss": 0.3464, + "step": 886 + }, + { + "epoch": 1.4289166331051149, + "grad_norm": 0.27371781021865504, + "learning_rate": 3.5925150424664206e-05, + "loss": 0.3348, + "step": 887 + }, + { + "epoch": 1.4305275875956505, + "grad_norm": 0.2929800361539922, + "learning_rate": 3.591151645327939e-05, + "loss": 0.3638, + "step": 888 + }, + { + "epoch": 1.432138542086186, + "grad_norm": 0.3091933237540173, + "learning_rate": 3.589786230739062e-05, + "loss": 0.3751, + "step": 889 + }, + { + "epoch": 1.4337494965767217, + "grad_norm": 0.30038308262391095, + "learning_rate": 3.5884188004310244e-05, + "loss": 0.3685, + "step": 890 + }, + { + "epoch": 1.4353604510672573, + "grad_norm": 0.2702977590995165, + "learning_rate": 3.587049356137615e-05, + "loss": 0.3421, + "step": 891 + }, + { + "epoch": 1.436971405557793, + "grad_norm": 0.31963763306891574, + "learning_rate": 3.5856778995951794e-05, + "loss": 0.3574, + "step": 892 + }, + { + "epoch": 1.4385823600483287, + "grad_norm": 0.2789441221702962, + "learning_rate": 3.58430443254261e-05, + "loss": 0.3707, + "step": 893 + }, + { + "epoch": 1.4401933145388643, + "grad_norm": 0.32066299952132377, + "learning_rate": 3.582928956721352e-05, + "loss": 0.3041, + "step": 894 + }, + { + "epoch": 1.4418042690294, + "grad_norm": 0.2805130759068286, + "learning_rate": 3.581551473875397e-05, + "loss": 0.3738, + "step": 895 + }, + { + "epoch": 1.4434152235199356, + "grad_norm": 0.32812904359127354, + "learning_rate": 3.580171985751281e-05, + "loss": 0.346, + "step": 896 + }, + { + "epoch": 1.4450261780104712, + "grad_norm": 0.27111375470537674, + "learning_rate": 3.578790494098081e-05, + "loss": 0.3306, + "step": 897 + }, + { + "epoch": 1.4466371325010068, + "grad_norm": 0.38149897363461827, + "learning_rate": 3.5774070006674164e-05, + "loss": 0.3516, + "step": 898 + }, + { + "epoch": 1.4482480869915424, + "grad_norm": 0.2892182943854451, + "learning_rate": 3.576021507213444e-05, + "loss": 0.3594, + "step": 899 + }, + { + "epoch": 1.4498590414820782, + "grad_norm": 0.3372883515472436, + "learning_rate": 3.574634015492857e-05, + "loss": 0.3532, + "step": 900 + }, + { + "epoch": 1.4514699959726138, + "grad_norm": 0.26358417061513745, + "learning_rate": 3.57324452726488e-05, + "loss": 0.3288, + "step": 901 + }, + { + "epoch": 1.4530809504631494, + "grad_norm": 0.31566876672673594, + "learning_rate": 3.571853044291271e-05, + "loss": 0.3362, + "step": 902 + }, + { + "epoch": 1.454691904953685, + "grad_norm": 0.32363249428251567, + "learning_rate": 3.5704595683363187e-05, + "loss": 0.3781, + "step": 903 + }, + { + "epoch": 1.4563028594442207, + "grad_norm": 0.3230781499413541, + "learning_rate": 3.569064101166835e-05, + "loss": 0.3541, + "step": 904 + }, + { + "epoch": 1.4579138139347563, + "grad_norm": 0.3156721284052765, + "learning_rate": 3.567666644552159e-05, + "loss": 0.3609, + "step": 905 + }, + { + "epoch": 1.4595247684252919, + "grad_norm": 0.24785776600781328, + "learning_rate": 3.566267200264151e-05, + "loss": 0.3287, + "step": 906 + }, + { + "epoch": 1.4611357229158277, + "grad_norm": 0.3480224482497804, + "learning_rate": 3.564865770077193e-05, + "loss": 0.3661, + "step": 907 + }, + { + "epoch": 1.4627466774063633, + "grad_norm": 0.2646519126771718, + "learning_rate": 3.563462355768184e-05, + "loss": 0.3346, + "step": 908 + }, + { + "epoch": 1.464357631896899, + "grad_norm": 0.3172875453636419, + "learning_rate": 3.562056959116538e-05, + "loss": 0.358, + "step": 909 + }, + { + "epoch": 1.4659685863874345, + "grad_norm": 0.3022971119624446, + "learning_rate": 3.560649581904184e-05, + "loss": 0.3738, + "step": 910 + }, + { + "epoch": 1.4675795408779702, + "grad_norm": 0.3594652213181189, + "learning_rate": 3.559240225915561e-05, + "loss": 0.364, + "step": 911 + }, + { + "epoch": 1.4691904953685058, + "grad_norm": 0.29252293105230515, + "learning_rate": 3.557828892937617e-05, + "loss": 0.3324, + "step": 912 + }, + { + "epoch": 1.4708014498590414, + "grad_norm": 0.33755638959110873, + "learning_rate": 3.5564155847598085e-05, + "loss": 0.3463, + "step": 913 + }, + { + "epoch": 1.4724124043495772, + "grad_norm": 0.28967085216937327, + "learning_rate": 3.555000303174093e-05, + "loss": 0.3529, + "step": 914 + }, + { + "epoch": 1.4740233588401128, + "grad_norm": 0.3396436534360357, + "learning_rate": 3.553583049974933e-05, + "loss": 0.3821, + "step": 915 + }, + { + "epoch": 1.4756343133306484, + "grad_norm": 0.27806521100206133, + "learning_rate": 3.55216382695929e-05, + "loss": 0.3594, + "step": 916 + }, + { + "epoch": 1.477245267821184, + "grad_norm": 0.30304501601924577, + "learning_rate": 3.550742635926622e-05, + "loss": 0.332, + "step": 917 + }, + { + "epoch": 1.4788562223117196, + "grad_norm": 0.28397653690883456, + "learning_rate": 3.549319478678885e-05, + "loss": 0.3478, + "step": 918 + }, + { + "epoch": 1.4804671768022553, + "grad_norm": 0.3534165168824709, + "learning_rate": 3.547894357020525e-05, + "loss": 0.3502, + "step": 919 + }, + { + "epoch": 1.4820781312927909, + "grad_norm": 0.30300554924196443, + "learning_rate": 3.546467272758479e-05, + "loss": 0.3427, + "step": 920 + }, + { + "epoch": 1.4836890857833267, + "grad_norm": 0.2821833761521695, + "learning_rate": 3.5450382277021745e-05, + "loss": 0.336, + "step": 921 + }, + { + "epoch": 1.4853000402738623, + "grad_norm": 0.3067490433896909, + "learning_rate": 3.543607223663524e-05, + "loss": 0.3478, + "step": 922 + }, + { + "epoch": 1.486910994764398, + "grad_norm": 0.35973200105393427, + "learning_rate": 3.542174262456924e-05, + "loss": 0.3744, + "step": 923 + }, + { + "epoch": 1.4885219492549335, + "grad_norm": 0.3624137649224464, + "learning_rate": 3.540739345899252e-05, + "loss": 0.3801, + "step": 924 + }, + { + "epoch": 1.4901329037454691, + "grad_norm": 0.30165267187605543, + "learning_rate": 3.5393024758098645e-05, + "loss": 0.3286, + "step": 925 + }, + { + "epoch": 1.491743858236005, + "grad_norm": 0.28896166859244726, + "learning_rate": 3.537863654010597e-05, + "loss": 0.3387, + "step": 926 + }, + { + "epoch": 1.4933548127265404, + "grad_norm": 0.3064859067705259, + "learning_rate": 3.5364228823257565e-05, + "loss": 0.3568, + "step": 927 + }, + { + "epoch": 1.4949657672170762, + "grad_norm": 0.32160688390289044, + "learning_rate": 3.534980162582124e-05, + "loss": 0.3688, + "step": 928 + }, + { + "epoch": 1.4965767217076118, + "grad_norm": 0.3013538609491535, + "learning_rate": 3.5335354966089514e-05, + "loss": 0.3376, + "step": 929 + }, + { + "epoch": 1.4981876761981474, + "grad_norm": 0.3057453257092756, + "learning_rate": 3.532088886237956e-05, + "loss": 0.3426, + "step": 930 + }, + { + "epoch": 1.499798630688683, + "grad_norm": 0.3070652933287628, + "learning_rate": 3.530640333303323e-05, + "loss": 0.3572, + "step": 931 + }, + { + "epoch": 1.5014095851792186, + "grad_norm": 0.303242364147413, + "learning_rate": 3.5291898396416984e-05, + "loss": 0.3766, + "step": 932 + }, + { + "epoch": 1.5030205396697545, + "grad_norm": 0.30911330099386514, + "learning_rate": 3.5277374070921904e-05, + "loss": 0.3606, + "step": 933 + }, + { + "epoch": 1.5046314941602899, + "grad_norm": 0.26387501860850937, + "learning_rate": 3.5262830374963636e-05, + "loss": 0.3132, + "step": 934 + }, + { + "epoch": 1.5062424486508257, + "grad_norm": 0.31366992473226274, + "learning_rate": 3.524826732698241e-05, + "loss": 0.345, + "step": 935 + }, + { + "epoch": 1.5078534031413613, + "grad_norm": 0.36305603623046623, + "learning_rate": 3.523368494544298e-05, + "loss": 0.3481, + "step": 936 + }, + { + "epoch": 1.509464357631897, + "grad_norm": 0.3028100237793981, + "learning_rate": 3.521908324883462e-05, + "loss": 0.3494, + "step": 937 + }, + { + "epoch": 1.5110753121224325, + "grad_norm": 0.34440302527582833, + "learning_rate": 3.520446225567108e-05, + "loss": 0.3635, + "step": 938 + }, + { + "epoch": 1.5126862666129681, + "grad_norm": 0.2883451420805599, + "learning_rate": 3.518982198449059e-05, + "loss": 0.356, + "step": 939 + }, + { + "epoch": 1.514297221103504, + "grad_norm": 0.3013627859596229, + "learning_rate": 3.517516245385582e-05, + "loss": 0.3289, + "step": 940 + }, + { + "epoch": 1.5159081755940393, + "grad_norm": 0.3018221027079222, + "learning_rate": 3.516048368235386e-05, + "loss": 0.3604, + "step": 941 + }, + { + "epoch": 1.5175191300845752, + "grad_norm": 0.30511935925361944, + "learning_rate": 3.5145785688596184e-05, + "loss": 0.3234, + "step": 942 + }, + { + "epoch": 1.5191300845751108, + "grad_norm": 0.3007192258635043, + "learning_rate": 3.513106849121866e-05, + "loss": 0.3544, + "step": 943 + }, + { + "epoch": 1.5207410390656464, + "grad_norm": 0.2641898297732925, + "learning_rate": 3.5116332108881486e-05, + "loss": 0.3408, + "step": 944 + }, + { + "epoch": 1.522351993556182, + "grad_norm": 0.3410443834630316, + "learning_rate": 3.5101576560269195e-05, + "loss": 0.3533, + "step": 945 + }, + { + "epoch": 1.5239629480467176, + "grad_norm": 0.28303295707105, + "learning_rate": 3.508680186409062e-05, + "loss": 0.3367, + "step": 946 + }, + { + "epoch": 1.5255739025372534, + "grad_norm": 0.3345908288880972, + "learning_rate": 3.507200803907886e-05, + "loss": 0.3861, + "step": 947 + }, + { + "epoch": 1.5271848570277888, + "grad_norm": 2.8670378142195654, + "learning_rate": 3.505719510399129e-05, + "loss": 0.4205, + "step": 948 + }, + { + "epoch": 1.5287958115183247, + "grad_norm": 0.4210670854339028, + "learning_rate": 3.504236307760949e-05, + "loss": 0.3497, + "step": 949 + }, + { + "epoch": 1.5304067660088603, + "grad_norm": 0.2976798975045218, + "learning_rate": 3.502751197873927e-05, + "loss": 0.3504, + "step": 950 + }, + { + "epoch": 1.5320177204993959, + "grad_norm": 0.3591627202188393, + "learning_rate": 3.501264182621061e-05, + "loss": 0.3746, + "step": 951 + }, + { + "epoch": 1.5336286749899315, + "grad_norm": 0.3210770679629104, + "learning_rate": 3.499775263887764e-05, + "loss": 0.3468, + "step": 952 + }, + { + "epoch": 1.535239629480467, + "grad_norm": 0.33550226114199827, + "learning_rate": 3.4982844435618643e-05, + "loss": 0.3508, + "step": 953 + }, + { + "epoch": 1.536850583971003, + "grad_norm": 0.32509668801494995, + "learning_rate": 3.4967917235336e-05, + "loss": 0.3508, + "step": 954 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.338038211291058, + "learning_rate": 3.4952971056956186e-05, + "loss": 0.3496, + "step": 955 + }, + { + "epoch": 1.5400724929520742, + "grad_norm": 0.3141904048502338, + "learning_rate": 3.4938005919429724e-05, + "loss": 0.3418, + "step": 956 + }, + { + "epoch": 1.5416834474426098, + "grad_norm": 0.2815520350465673, + "learning_rate": 3.49230218417312e-05, + "loss": 0.3312, + "step": 957 + }, + { + "epoch": 1.5432944019331454, + "grad_norm": 0.36948297955531856, + "learning_rate": 3.49080188428592e-05, + "loss": 0.3603, + "step": 958 + }, + { + "epoch": 1.544905356423681, + "grad_norm": 0.3084260379112921, + "learning_rate": 3.489299694183629e-05, + "loss": 0.357, + "step": 959 + }, + { + "epoch": 1.5465163109142166, + "grad_norm": 0.3549939328103352, + "learning_rate": 3.4877956157709024e-05, + "loss": 0.3565, + "step": 960 + }, + { + "epoch": 1.5481272654047524, + "grad_norm": 0.28779131717746126, + "learning_rate": 3.4862896509547886e-05, + "loss": 0.3468, + "step": 961 + }, + { + "epoch": 1.5497382198952878, + "grad_norm": 0.3232280682124179, + "learning_rate": 3.4847818016447284e-05, + "loss": 0.362, + "step": 962 + }, + { + "epoch": 1.5513491743858236, + "grad_norm": 0.30311024972844897, + "learning_rate": 3.483272069752551e-05, + "loss": 0.3175, + "step": 963 + }, + { + "epoch": 1.5529601288763593, + "grad_norm": 0.325020064316006, + "learning_rate": 3.481760457192474e-05, + "loss": 0.3729, + "step": 964 + }, + { + "epoch": 1.5545710833668949, + "grad_norm": 0.308178367218141, + "learning_rate": 3.4802469658810984e-05, + "loss": 0.3433, + "step": 965 + }, + { + "epoch": 1.5561820378574305, + "grad_norm": 0.3139480265008325, + "learning_rate": 3.478731597737407e-05, + "loss": 0.3455, + "step": 966 + }, + { + "epoch": 1.557792992347966, + "grad_norm": 0.32224249506902913, + "learning_rate": 3.4772143546827635e-05, + "loss": 0.3526, + "step": 967 + }, + { + "epoch": 1.559403946838502, + "grad_norm": 0.38577637148488075, + "learning_rate": 3.475695238640908e-05, + "loss": 0.3668, + "step": 968 + }, + { + "epoch": 1.5610149013290373, + "grad_norm": 0.3160738969499725, + "learning_rate": 3.474174251537956e-05, + "loss": 0.3389, + "step": 969 + }, + { + "epoch": 1.5626258558195731, + "grad_norm": 0.348308437450324, + "learning_rate": 3.4726513953023944e-05, + "loss": 0.3841, + "step": 970 + }, + { + "epoch": 1.5642368103101087, + "grad_norm": 0.3192517617664544, + "learning_rate": 3.471126671865082e-05, + "loss": 0.3518, + "step": 971 + }, + { + "epoch": 1.5658477648006444, + "grad_norm": 0.3315975814114015, + "learning_rate": 3.469600083159241e-05, + "loss": 0.3501, + "step": 972 + }, + { + "epoch": 1.56745871929118, + "grad_norm": 0.3429653180075547, + "learning_rate": 3.468071631120464e-05, + "loss": 0.3673, + "step": 973 + }, + { + "epoch": 1.5690696737817156, + "grad_norm": 0.2936081834216568, + "learning_rate": 3.466541317686702e-05, + "loss": 0.3726, + "step": 974 + }, + { + "epoch": 1.5706806282722514, + "grad_norm": 0.2870880348305391, + "learning_rate": 3.465009144798268e-05, + "loss": 0.3473, + "step": 975 + }, + { + "epoch": 1.5722915827627868, + "grad_norm": 0.31292709888328285, + "learning_rate": 3.4634751143978317e-05, + "loss": 0.3494, + "step": 976 + }, + { + "epoch": 1.5739025372533226, + "grad_norm": 0.3344663746754926, + "learning_rate": 3.461939228430419e-05, + "loss": 0.3594, + "step": 977 + }, + { + "epoch": 1.5755134917438582, + "grad_norm": 0.3243550278021458, + "learning_rate": 3.460401488843407e-05, + "loss": 0.3842, + "step": 978 + }, + { + "epoch": 1.5771244462343939, + "grad_norm": 0.28798557728982177, + "learning_rate": 3.458861897586524e-05, + "loss": 0.3267, + "step": 979 + }, + { + "epoch": 1.5787354007249297, + "grad_norm": 0.3366500322864135, + "learning_rate": 3.4573204566118476e-05, + "loss": 0.3929, + "step": 980 + }, + { + "epoch": 1.580346355215465, + "grad_norm": 0.32529408271268495, + "learning_rate": 3.455777167873798e-05, + "loss": 0.3484, + "step": 981 + }, + { + "epoch": 1.581957309706001, + "grad_norm": 0.3119605526527732, + "learning_rate": 3.454232033329139e-05, + "loss": 0.3498, + "step": 982 + }, + { + "epoch": 1.5835682641965363, + "grad_norm": 0.3155331712421188, + "learning_rate": 3.452685054936976e-05, + "loss": 0.3472, + "step": 983 + }, + { + "epoch": 1.5851792186870721, + "grad_norm": 0.2915430968431837, + "learning_rate": 3.4511362346587504e-05, + "loss": 0.3407, + "step": 984 + }, + { + "epoch": 1.5867901731776077, + "grad_norm": 0.3137253719745165, + "learning_rate": 3.44958557445824e-05, + "loss": 0.355, + "step": 985 + }, + { + "epoch": 1.5884011276681433, + "grad_norm": 0.3188776562958085, + "learning_rate": 3.4480330763015564e-05, + "loss": 0.3558, + "step": 986 + }, + { + "epoch": 1.5900120821586792, + "grad_norm": 0.3129181773241444, + "learning_rate": 3.4464787421571395e-05, + "loss": 0.3584, + "step": 987 + }, + { + "epoch": 1.5916230366492146, + "grad_norm": 0.3197808791803997, + "learning_rate": 3.444922573995758e-05, + "loss": 0.3538, + "step": 988 + }, + { + "epoch": 1.5932339911397504, + "grad_norm": 0.29636880126371085, + "learning_rate": 3.443364573790507e-05, + "loss": 0.3371, + "step": 989 + }, + { + "epoch": 1.5948449456302858, + "grad_norm": 0.30432129789976997, + "learning_rate": 3.4418047435168025e-05, + "loss": 0.3474, + "step": 990 + }, + { + "epoch": 1.5964559001208216, + "grad_norm": 0.26763048833686587, + "learning_rate": 3.4402430851523824e-05, + "loss": 0.3407, + "step": 991 + }, + { + "epoch": 1.5980668546113572, + "grad_norm": 0.27316984480658096, + "learning_rate": 3.438679600677303e-05, + "loss": 0.3422, + "step": 992 + }, + { + "epoch": 1.5996778091018928, + "grad_norm": 0.28509710496947416, + "learning_rate": 3.437114292073933e-05, + "loss": 0.3438, + "step": 993 + }, + { + "epoch": 1.6012887635924287, + "grad_norm": 0.2820397711644288, + "learning_rate": 3.435547161326958e-05, + "loss": 0.3758, + "step": 994 + }, + { + "epoch": 1.602899718082964, + "grad_norm": 0.27746741501473066, + "learning_rate": 3.43397821042337e-05, + "loss": 0.357, + "step": 995 + }, + { + "epoch": 1.6045106725734999, + "grad_norm": 0.329470817735326, + "learning_rate": 3.4324074413524725e-05, + "loss": 0.3732, + "step": 996 + }, + { + "epoch": 1.6061216270640355, + "grad_norm": 0.2883189529050093, + "learning_rate": 3.430834856105871e-05, + "loss": 0.3113, + "step": 997 + }, + { + "epoch": 1.607732581554571, + "grad_norm": 0.30812120490912964, + "learning_rate": 3.429260456677475e-05, + "loss": 0.3413, + "step": 998 + }, + { + "epoch": 1.6093435360451067, + "grad_norm": 0.3016161794889211, + "learning_rate": 3.4276842450634964e-05, + "loss": 0.3732, + "step": 999 + }, + { + "epoch": 1.6109544905356423, + "grad_norm": 0.30608350201014334, + "learning_rate": 3.4261062232624405e-05, + "loss": 0.345, + "step": 1000 + }, + { + "epoch": 1.6125654450261782, + "grad_norm": 0.287021015994268, + "learning_rate": 3.4245263932751124e-05, + "loss": 0.3675, + "step": 1001 + }, + { + "epoch": 1.6141763995167135, + "grad_norm": 0.29513629823587945, + "learning_rate": 3.4229447571046055e-05, + "loss": 0.3726, + "step": 1002 + }, + { + "epoch": 1.6157873540072494, + "grad_norm": 0.28497512595967667, + "learning_rate": 3.421361316756307e-05, + "loss": 0.3417, + "step": 1003 + }, + { + "epoch": 1.617398308497785, + "grad_norm": 0.27472976631378027, + "learning_rate": 3.4197760742378886e-05, + "loss": 0.3826, + "step": 1004 + }, + { + "epoch": 1.6190092629883206, + "grad_norm": 0.32583486377402415, + "learning_rate": 3.4181890315593104e-05, + "loss": 0.3429, + "step": 1005 + }, + { + "epoch": 1.6206202174788562, + "grad_norm": 0.3158909644175841, + "learning_rate": 3.41660019073281e-05, + "loss": 0.339, + "step": 1006 + }, + { + "epoch": 1.6222311719693918, + "grad_norm": 0.2874717190588641, + "learning_rate": 3.41500955377291e-05, + "loss": 0.3448, + "step": 1007 + }, + { + "epoch": 1.6238421264599276, + "grad_norm": 0.32333997510660806, + "learning_rate": 3.413417122696408e-05, + "loss": 0.3574, + "step": 1008 + }, + { + "epoch": 1.625453080950463, + "grad_norm": 0.32153546396431953, + "learning_rate": 3.411822899522376e-05, + "loss": 0.343, + "step": 1009 + }, + { + "epoch": 1.6270640354409989, + "grad_norm": 0.33818702288315083, + "learning_rate": 3.410226886272159e-05, + "loss": 0.3592, + "step": 1010 + }, + { + "epoch": 1.6286749899315345, + "grad_norm": 0.2779482392923402, + "learning_rate": 3.408629084969372e-05, + "loss": 0.3575, + "step": 1011 + }, + { + "epoch": 1.63028594442207, + "grad_norm": 0.2958503892623944, + "learning_rate": 3.407029497639896e-05, + "loss": 0.3431, + "step": 1012 + }, + { + "epoch": 1.6318968989126057, + "grad_norm": 0.27651659921242794, + "learning_rate": 3.405428126311878e-05, + "loss": 0.3476, + "step": 1013 + }, + { + "epoch": 1.6335078534031413, + "grad_norm": 0.2709900192224873, + "learning_rate": 3.403824973015725e-05, + "loss": 0.376, + "step": 1014 + }, + { + "epoch": 1.6351188078936771, + "grad_norm": 0.33820245973489416, + "learning_rate": 3.4022200397841056e-05, + "loss": 0.3518, + "step": 1015 + }, + { + "epoch": 1.6367297623842125, + "grad_norm": 0.2802490380827594, + "learning_rate": 3.4006133286519435e-05, + "loss": 0.3777, + "step": 1016 + }, + { + "epoch": 1.6383407168747484, + "grad_norm": 0.3219486083753848, + "learning_rate": 3.399004841656417e-05, + "loss": 0.334, + "step": 1017 + }, + { + "epoch": 1.639951671365284, + "grad_norm": 0.2804170578284511, + "learning_rate": 3.3973945808369566e-05, + "loss": 0.3353, + "step": 1018 + }, + { + "epoch": 1.6415626258558196, + "grad_norm": 0.3397988556608258, + "learning_rate": 3.395782548235242e-05, + "loss": 0.3929, + "step": 1019 + }, + { + "epoch": 1.6431735803463552, + "grad_norm": 0.25874400518260726, + "learning_rate": 3.394168745895199e-05, + "loss": 0.3425, + "step": 1020 + }, + { + "epoch": 1.6447845348368908, + "grad_norm": 0.29769463217777464, + "learning_rate": 3.392553175862996e-05, + "loss": 0.3454, + "step": 1021 + }, + { + "epoch": 1.6463954893274266, + "grad_norm": 0.27496871301352516, + "learning_rate": 3.390935840187045e-05, + "loss": 0.3574, + "step": 1022 + }, + { + "epoch": 1.648006443817962, + "grad_norm": 0.3130236579001918, + "learning_rate": 3.3893167409179945e-05, + "loss": 0.353, + "step": 1023 + }, + { + "epoch": 1.6496173983084979, + "grad_norm": 0.30244268078948694, + "learning_rate": 3.387695880108732e-05, + "loss": 0.3554, + "step": 1024 + }, + { + "epoch": 1.6512283527990335, + "grad_norm": 0.2637267118602355, + "learning_rate": 3.3860732598143754e-05, + "loss": 0.3254, + "step": 1025 + }, + { + "epoch": 1.652839307289569, + "grad_norm": 0.27265533782673024, + "learning_rate": 3.3844488820922755e-05, + "loss": 0.3352, + "step": 1026 + }, + { + "epoch": 1.6544502617801047, + "grad_norm": 0.2847443661542053, + "learning_rate": 3.3828227490020096e-05, + "loss": 0.345, + "step": 1027 + }, + { + "epoch": 1.6560612162706403, + "grad_norm": 0.2910046942769011, + "learning_rate": 3.381194862605383e-05, + "loss": 0.3406, + "step": 1028 + }, + { + "epoch": 1.6576721707611761, + "grad_norm": 0.30278444886358213, + "learning_rate": 3.3795652249664216e-05, + "loss": 0.3744, + "step": 1029 + }, + { + "epoch": 1.6592831252517115, + "grad_norm": 0.3181445159389258, + "learning_rate": 3.377933838151374e-05, + "loss": 0.3571, + "step": 1030 + }, + { + "epoch": 1.6608940797422473, + "grad_norm": 0.2555750516659823, + "learning_rate": 3.376300704228704e-05, + "loss": 0.3293, + "step": 1031 + }, + { + "epoch": 1.662505034232783, + "grad_norm": 0.2994395275689444, + "learning_rate": 3.374665825269093e-05, + "loss": 0.3625, + "step": 1032 + }, + { + "epoch": 1.6641159887233186, + "grad_norm": 0.2941866705604041, + "learning_rate": 3.373029203345435e-05, + "loss": 0.3466, + "step": 1033 + }, + { + "epoch": 1.6657269432138542, + "grad_norm": 0.2520003523058466, + "learning_rate": 3.3713908405328316e-05, + "loss": 0.3492, + "step": 1034 + }, + { + "epoch": 1.6673378977043898, + "grad_norm": 0.27879316282952243, + "learning_rate": 3.369750738908593e-05, + "loss": 0.3658, + "step": 1035 + }, + { + "epoch": 1.6689488521949256, + "grad_norm": 0.26750661336616033, + "learning_rate": 3.368108900552236e-05, + "loss": 0.3479, + "step": 1036 + }, + { + "epoch": 1.670559806685461, + "grad_norm": 0.2703473771389898, + "learning_rate": 3.366465327545475e-05, + "loss": 0.3246, + "step": 1037 + }, + { + "epoch": 1.6721707611759968, + "grad_norm": 0.2492707097376767, + "learning_rate": 3.3648200219722285e-05, + "loss": 0.3379, + "step": 1038 + }, + { + "epoch": 1.6737817156665324, + "grad_norm": 0.28088000970492083, + "learning_rate": 3.3631729859186086e-05, + "loss": 0.3772, + "step": 1039 + }, + { + "epoch": 1.675392670157068, + "grad_norm": 0.6148118511176305, + "learning_rate": 3.3615242214729226e-05, + "loss": 0.3457, + "step": 1040 + }, + { + "epoch": 1.6770036246476037, + "grad_norm": 0.2741941991125401, + "learning_rate": 3.35987373072567e-05, + "loss": 0.3679, + "step": 1041 + }, + { + "epoch": 1.6786145791381393, + "grad_norm": 0.30929820639855726, + "learning_rate": 3.3582215157695376e-05, + "loss": 0.3595, + "step": 1042 + }, + { + "epoch": 1.680225533628675, + "grad_norm": 0.2855629710854195, + "learning_rate": 3.3565675786994e-05, + "loss": 0.3485, + "step": 1043 + }, + { + "epoch": 1.6818364881192105, + "grad_norm": 0.3160188622883023, + "learning_rate": 3.3549119216123125e-05, + "loss": 0.3469, + "step": 1044 + }, + { + "epoch": 1.6834474426097463, + "grad_norm": 0.3178650447881005, + "learning_rate": 3.353254546607515e-05, + "loss": 0.3631, + "step": 1045 + }, + { + "epoch": 1.685058397100282, + "grad_norm": 0.26522002369737246, + "learning_rate": 3.351595455786423e-05, + "loss": 0.3164, + "step": 1046 + }, + { + "epoch": 1.6866693515908175, + "grad_norm": 0.3103051673772948, + "learning_rate": 3.3499346512526286e-05, + "loss": 0.3537, + "step": 1047 + }, + { + "epoch": 1.6882803060813532, + "grad_norm": 0.26705305622759873, + "learning_rate": 3.348272135111895e-05, + "loss": 0.3207, + "step": 1048 + }, + { + "epoch": 1.6898912605718888, + "grad_norm": 0.2965365817221479, + "learning_rate": 3.346607909472159e-05, + "loss": 0.3301, + "step": 1049 + }, + { + "epoch": 1.6915022150624246, + "grad_norm": 0.3039530865497769, + "learning_rate": 3.344941976443521e-05, + "loss": 0.3541, + "step": 1050 + }, + { + "epoch": 1.69311316955296, + "grad_norm": 0.3019357707645832, + "learning_rate": 3.3432743381382494e-05, + "loss": 0.3476, + "step": 1051 + }, + { + "epoch": 1.6947241240434958, + "grad_norm": 0.30486830582214286, + "learning_rate": 3.341604996670773e-05, + "loss": 0.3692, + "step": 1052 + }, + { + "epoch": 1.6963350785340314, + "grad_norm": 0.2846970874116654, + "learning_rate": 3.33993395415768e-05, + "loss": 0.3489, + "step": 1053 + }, + { + "epoch": 1.697946033024567, + "grad_norm": 0.2844019284784835, + "learning_rate": 3.3382612127177166e-05, + "loss": 0.3484, + "step": 1054 + }, + { + "epoch": 1.6995569875151026, + "grad_norm": 0.31814090002272866, + "learning_rate": 3.3365867744717827e-05, + "loss": 0.3313, + "step": 1055 + }, + { + "epoch": 1.7011679420056383, + "grad_norm": 0.3018174083258602, + "learning_rate": 3.334910641542928e-05, + "loss": 0.3928, + "step": 1056 + }, + { + "epoch": 1.702778896496174, + "grad_norm": 0.2805949905286555, + "learning_rate": 3.3332328160563534e-05, + "loss": 0.3138, + "step": 1057 + }, + { + "epoch": 1.7043898509867095, + "grad_norm": 0.294154845413384, + "learning_rate": 3.331553300139404e-05, + "loss": 0.3715, + "step": 1058 + }, + { + "epoch": 1.7060008054772453, + "grad_norm": 0.28060148879956204, + "learning_rate": 3.3298720959215686e-05, + "loss": 0.3404, + "step": 1059 + }, + { + "epoch": 1.707611759967781, + "grad_norm": 0.2898378970822988, + "learning_rate": 3.328189205534479e-05, + "loss": 0.3707, + "step": 1060 + }, + { + "epoch": 1.7092227144583165, + "grad_norm": 0.29125221651243705, + "learning_rate": 3.3265046311118996e-05, + "loss": 0.3513, + "step": 1061 + }, + { + "epoch": 1.7108336689488524, + "grad_norm": 0.2944155426474975, + "learning_rate": 3.3248183747897354e-05, + "loss": 0.3294, + "step": 1062 + }, + { + "epoch": 1.7124446234393877, + "grad_norm": 0.2934301800909067, + "learning_rate": 3.3231304387060215e-05, + "loss": 0.3756, + "step": 1063 + }, + { + "epoch": 1.7140555779299236, + "grad_norm": 0.3258895549591974, + "learning_rate": 3.321440825000923e-05, + "loss": 0.3307, + "step": 1064 + }, + { + "epoch": 1.715666532420459, + "grad_norm": 0.25585412371136995, + "learning_rate": 3.3197495358167314e-05, + "loss": 0.3319, + "step": 1065 + }, + { + "epoch": 1.7172774869109948, + "grad_norm": 0.33852886154261486, + "learning_rate": 3.318056573297864e-05, + "loss": 0.345, + "step": 1066 + }, + { + "epoch": 1.7188884414015304, + "grad_norm": 0.29308836939109273, + "learning_rate": 3.3163619395908594e-05, + "loss": 0.3521, + "step": 1067 + }, + { + "epoch": 1.720499395892066, + "grad_norm": 0.3260413931230068, + "learning_rate": 3.314665636844374e-05, + "loss": 0.3579, + "step": 1068 + }, + { + "epoch": 1.7221103503826019, + "grad_norm": 0.315853677609962, + "learning_rate": 3.3129676672091814e-05, + "loss": 0.3693, + "step": 1069 + }, + { + "epoch": 1.7237213048731372, + "grad_norm": 0.25252028334754817, + "learning_rate": 3.311268032838169e-05, + "loss": 0.311, + "step": 1070 + }, + { + "epoch": 1.725332259363673, + "grad_norm": 0.30596679620107337, + "learning_rate": 3.309566735886334e-05, + "loss": 0.361, + "step": 1071 + }, + { + "epoch": 1.7269432138542085, + "grad_norm": 0.2605593059669489, + "learning_rate": 3.307863778510782e-05, + "loss": 0.3559, + "step": 1072 + }, + { + "epoch": 1.7285541683447443, + "grad_norm": 0.30530466414998403, + "learning_rate": 3.306159162870724e-05, + "loss": 0.3668, + "step": 1073 + }, + { + "epoch": 1.73016512283528, + "grad_norm": 0.28036764595429337, + "learning_rate": 3.304452891127474e-05, + "loss": 0.3556, + "step": 1074 + }, + { + "epoch": 1.7317760773258155, + "grad_norm": 0.3020190132811284, + "learning_rate": 3.302744965444445e-05, + "loss": 0.3526, + "step": 1075 + }, + { + "epoch": 1.7333870318163513, + "grad_norm": 0.25272390853900173, + "learning_rate": 3.301035387987146e-05, + "loss": 0.3276, + "step": 1076 + }, + { + "epoch": 1.7349979863068867, + "grad_norm": 0.342414793506701, + "learning_rate": 3.299324160923184e-05, + "loss": 0.3355, + "step": 1077 + }, + { + "epoch": 1.7366089407974226, + "grad_norm": 0.3002251269987107, + "learning_rate": 3.297611286422254e-05, + "loss": 0.3466, + "step": 1078 + }, + { + "epoch": 1.738219895287958, + "grad_norm": 0.32281240413837947, + "learning_rate": 3.295896766656141e-05, + "loss": 0.3627, + "step": 1079 + }, + { + "epoch": 1.7398308497784938, + "grad_norm": 0.27577305817853215, + "learning_rate": 3.294180603798716e-05, + "loss": 0.3264, + "step": 1080 + }, + { + "epoch": 1.7414418042690294, + "grad_norm": 0.3366688008037847, + "learning_rate": 3.292462800025933e-05, + "loss": 0.3553, + "step": 1081 + }, + { + "epoch": 1.743052758759565, + "grad_norm": 0.2864923341321958, + "learning_rate": 3.290743357515829e-05, + "loss": 0.3715, + "step": 1082 + }, + { + "epoch": 1.7446637132501008, + "grad_norm": 0.2885887017629758, + "learning_rate": 3.289022278448513e-05, + "loss": 0.3452, + "step": 1083 + }, + { + "epoch": 1.7462746677406362, + "grad_norm": 0.28829785150227605, + "learning_rate": 3.287299565006177e-05, + "loss": 0.3275, + "step": 1084 + }, + { + "epoch": 1.747885622231172, + "grad_norm": 0.3095311480709755, + "learning_rate": 3.285575219373079e-05, + "loss": 0.3884, + "step": 1085 + }, + { + "epoch": 1.7494965767217077, + "grad_norm": 0.3841547619344348, + "learning_rate": 3.2838492437355487e-05, + "loss": 0.3251, + "step": 1086 + }, + { + "epoch": 1.7511075312122433, + "grad_norm": 0.2583438353752805, + "learning_rate": 3.2821216402819814e-05, + "loss": 0.3298, + "step": 1087 + }, + { + "epoch": 1.7527184857027789, + "grad_norm": 0.3520798827537718, + "learning_rate": 3.280392411202838e-05, + "loss": 0.3611, + "step": 1088 + }, + { + "epoch": 1.7543294401933145, + "grad_norm": 0.24181815291715242, + "learning_rate": 3.27866155869064e-05, + "loss": 0.3353, + "step": 1089 + }, + { + "epoch": 1.7559403946838503, + "grad_norm": 0.26861982821219954, + "learning_rate": 3.276929084939967e-05, + "loss": 0.3268, + "step": 1090 + }, + { + "epoch": 1.7575513491743857, + "grad_norm": 0.30793537607947774, + "learning_rate": 3.275194992147455e-05, + "loss": 0.3758, + "step": 1091 + }, + { + "epoch": 1.7591623036649215, + "grad_norm": 0.2735491056478904, + "learning_rate": 3.27345928251179e-05, + "loss": 0.3522, + "step": 1092 + }, + { + "epoch": 1.7607732581554572, + "grad_norm": 0.31416721865401886, + "learning_rate": 3.271721958233713e-05, + "loss": 0.3887, + "step": 1093 + }, + { + "epoch": 1.7623842126459928, + "grad_norm": 0.23777442546977362, + "learning_rate": 3.269983021516006e-05, + "loss": 0.341, + "step": 1094 + }, + { + "epoch": 1.7639951671365284, + "grad_norm": 0.2966716961575548, + "learning_rate": 3.268242474563502e-05, + "loss": 0.3582, + "step": 1095 + }, + { + "epoch": 1.765606121627064, + "grad_norm": 0.2695248187187801, + "learning_rate": 3.2665003195830705e-05, + "loss": 0.3663, + "step": 1096 + }, + { + "epoch": 1.7672170761175998, + "grad_norm": 0.27648681082467197, + "learning_rate": 3.2647565587836224e-05, + "loss": 0.3367, + "step": 1097 + }, + { + "epoch": 1.7688280306081352, + "grad_norm": 0.24769125048939025, + "learning_rate": 3.2630111943761035e-05, + "loss": 0.3161, + "step": 1098 + }, + { + "epoch": 1.770438985098671, + "grad_norm": 0.32602085521437335, + "learning_rate": 3.261264228573495e-05, + "loss": 0.377, + "step": 1099 + }, + { + "epoch": 1.7720499395892066, + "grad_norm": 0.26152980147545596, + "learning_rate": 3.259515663590805e-05, + "loss": 0.3594, + "step": 1100 + }, + { + "epoch": 1.7736608940797423, + "grad_norm": 0.2382508327122932, + "learning_rate": 3.257765501645072e-05, + "loss": 0.3169, + "step": 1101 + }, + { + "epoch": 1.7752718485702779, + "grad_norm": 0.3243836829426945, + "learning_rate": 3.256013744955359e-05, + "loss": 0.3719, + "step": 1102 + }, + { + "epoch": 1.7768828030608135, + "grad_norm": 0.25232903385465316, + "learning_rate": 3.25426039574275e-05, + "loss": 0.3608, + "step": 1103 + }, + { + "epoch": 1.7784937575513493, + "grad_norm": 0.2871596146859017, + "learning_rate": 3.2525054562303485e-05, + "loss": 0.36, + "step": 1104 + }, + { + "epoch": 1.7801047120418847, + "grad_norm": 0.27660826788802817, + "learning_rate": 3.250748928643274e-05, + "loss": 0.3321, + "step": 1105 + }, + { + "epoch": 1.7817156665324205, + "grad_norm": 0.26281335753440443, + "learning_rate": 3.248990815208661e-05, + "loss": 0.3365, + "step": 1106 + }, + { + "epoch": 1.7833266210229561, + "grad_norm": 0.3107781337125716, + "learning_rate": 3.247231118155654e-05, + "loss": 0.3673, + "step": 1107 + }, + { + "epoch": 1.7849375755134917, + "grad_norm": 0.2783443100448748, + "learning_rate": 3.245469839715404e-05, + "loss": 0.3394, + "step": 1108 + }, + { + "epoch": 1.7865485300040274, + "grad_norm": 0.3194930726854231, + "learning_rate": 3.24370698212107e-05, + "loss": 0.3464, + "step": 1109 + }, + { + "epoch": 1.788159484494563, + "grad_norm": 0.29259588976237577, + "learning_rate": 3.24194254760781e-05, + "loss": 0.351, + "step": 1110 + }, + { + "epoch": 1.7897704389850988, + "grad_norm": 0.2833032383542838, + "learning_rate": 3.240176538412783e-05, + "loss": 0.3112, + "step": 1111 + }, + { + "epoch": 1.7913813934756342, + "grad_norm": 0.3062775036699522, + "learning_rate": 3.2384089567751464e-05, + "loss": 0.3538, + "step": 1112 + }, + { + "epoch": 1.79299234796617, + "grad_norm": 0.31520368622639755, + "learning_rate": 3.236639804936047e-05, + "loss": 0.365, + "step": 1113 + }, + { + "epoch": 1.7946033024567056, + "grad_norm": 0.2706287135324734, + "learning_rate": 3.234869085138626e-05, + "loss": 0.3057, + "step": 1114 + }, + { + "epoch": 1.7962142569472412, + "grad_norm": 0.2776347182974663, + "learning_rate": 3.233096799628012e-05, + "loss": 0.3591, + "step": 1115 + }, + { + "epoch": 1.7978252114377768, + "grad_norm": 0.23921615357001141, + "learning_rate": 3.2313229506513167e-05, + "loss": 0.3287, + "step": 1116 + }, + { + "epoch": 1.7994361659283125, + "grad_norm": 0.27114468404862, + "learning_rate": 3.229547540457638e-05, + "loss": 0.361, + "step": 1117 + }, + { + "epoch": 1.8010471204188483, + "grad_norm": 0.24463881898739914, + "learning_rate": 3.2277705712980495e-05, + "loss": 0.3404, + "step": 1118 + }, + { + "epoch": 1.8026580749093837, + "grad_norm": 0.3166270096090458, + "learning_rate": 3.225992045425604e-05, + "loss": 0.36, + "step": 1119 + }, + { + "epoch": 1.8042690293999195, + "grad_norm": 0.2723260812159827, + "learning_rate": 3.224211965095326e-05, + "loss": 0.3402, + "step": 1120 + }, + { + "epoch": 1.8058799838904551, + "grad_norm": 0.28717184216313857, + "learning_rate": 3.222430332564213e-05, + "loss": 0.3424, + "step": 1121 + }, + { + "epoch": 1.8074909383809907, + "grad_norm": 0.30590726225785186, + "learning_rate": 3.220647150091229e-05, + "loss": 0.384, + "step": 1122 + }, + { + "epoch": 1.8091018928715263, + "grad_norm": 0.3295960133235172, + "learning_rate": 3.2188624199373054e-05, + "loss": 0.3477, + "step": 1123 + }, + { + "epoch": 1.810712847362062, + "grad_norm": 0.24641937154261023, + "learning_rate": 3.217076144365332e-05, + "loss": 0.3291, + "step": 1124 + }, + { + "epoch": 1.8123238018525978, + "grad_norm": 0.36214805461319893, + "learning_rate": 3.215288325640161e-05, + "loss": 0.37, + "step": 1125 + }, + { + "epoch": 1.8139347563431332, + "grad_norm": 0.27639672336136023, + "learning_rate": 3.213498966028603e-05, + "loss": 0.3474, + "step": 1126 + }, + { + "epoch": 1.815545710833669, + "grad_norm": 0.2905471387956144, + "learning_rate": 3.2117080677994156e-05, + "loss": 0.3489, + "step": 1127 + }, + { + "epoch": 1.8171566653242046, + "grad_norm": 0.2742465600770904, + "learning_rate": 3.2099156332233155e-05, + "loss": 0.3212, + "step": 1128 + }, + { + "epoch": 1.8187676198147402, + "grad_norm": 0.32581522778160565, + "learning_rate": 3.2081216645729615e-05, + "loss": 0.3789, + "step": 1129 + }, + { + "epoch": 1.8203785743052758, + "grad_norm": 0.25895277713528797, + "learning_rate": 3.20632616412296e-05, + "loss": 0.3548, + "step": 1130 + }, + { + "epoch": 1.8219895287958114, + "grad_norm": 0.2567453896587822, + "learning_rate": 3.204529134149858e-05, + "loss": 0.337, + "step": 1131 + }, + { + "epoch": 1.8236004832863473, + "grad_norm": 0.2545418099367893, + "learning_rate": 3.2027305769321446e-05, + "loss": 0.3757, + "step": 1132 + }, + { + "epoch": 1.8252114377768827, + "grad_norm": 0.26861543778444336, + "learning_rate": 3.2009304947502415e-05, + "loss": 0.3309, + "step": 1133 + }, + { + "epoch": 1.8268223922674185, + "grad_norm": 0.25906425044357323, + "learning_rate": 3.1991288898865076e-05, + "loss": 0.3425, + "step": 1134 + }, + { + "epoch": 1.828433346757954, + "grad_norm": 0.2791495649866975, + "learning_rate": 3.19732576462523e-05, + "loss": 0.3626, + "step": 1135 + }, + { + "epoch": 1.8300443012484897, + "grad_norm": 0.28566136887901167, + "learning_rate": 3.195521121252625e-05, + "loss": 0.3427, + "step": 1136 + }, + { + "epoch": 1.8316552557390253, + "grad_norm": 0.3006511911637088, + "learning_rate": 3.193714962056832e-05, + "loss": 0.3689, + "step": 1137 + }, + { + "epoch": 1.833266210229561, + "grad_norm": 0.2575714390405587, + "learning_rate": 3.1919072893279144e-05, + "loss": 0.3254, + "step": 1138 + }, + { + "epoch": 1.8348771647200968, + "grad_norm": 0.29714340630201763, + "learning_rate": 3.190098105357853e-05, + "loss": 0.3114, + "step": 1139 + }, + { + "epoch": 1.8364881192106322, + "grad_norm": 0.2739715615694121, + "learning_rate": 3.188287412440546e-05, + "loss": 0.3628, + "step": 1140 + }, + { + "epoch": 1.838099073701168, + "grad_norm": 0.31490538424728864, + "learning_rate": 3.186475212871803e-05, + "loss": 0.3681, + "step": 1141 + }, + { + "epoch": 1.8397100281917036, + "grad_norm": 0.25725247708626503, + "learning_rate": 3.1846615089493465e-05, + "loss": 0.3373, + "step": 1142 + }, + { + "epoch": 1.8413209826822392, + "grad_norm": 0.29237114605953757, + "learning_rate": 3.182846302972804e-05, + "loss": 0.3928, + "step": 1143 + }, + { + "epoch": 1.8429319371727748, + "grad_norm": 0.24162054032640948, + "learning_rate": 3.181029597243709e-05, + "loss": 0.3066, + "step": 1144 + }, + { + "epoch": 1.8445428916633104, + "grad_norm": 0.2746441036778749, + "learning_rate": 3.1792113940654976e-05, + "loss": 0.3429, + "step": 1145 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.27290548211325333, + "learning_rate": 3.1773916957435e-05, + "loss": 0.3567, + "step": 1146 + }, + { + "epoch": 1.8477648006443816, + "grad_norm": 0.3015414783683767, + "learning_rate": 3.1755705045849465e-05, + "loss": 0.3386, + "step": 1147 + }, + { + "epoch": 1.8493757551349175, + "grad_norm": 0.2540351001266061, + "learning_rate": 3.173747822898959e-05, + "loss": 0.3452, + "step": 1148 + }, + { + "epoch": 1.850986709625453, + "grad_norm": 0.31221816074924413, + "learning_rate": 3.1719236529965494e-05, + "loss": 0.3592, + "step": 1149 + }, + { + "epoch": 1.8525976641159887, + "grad_norm": 0.2423803158509095, + "learning_rate": 3.170097997190615e-05, + "loss": 0.3195, + "step": 1150 + }, + { + "epoch": 1.8542086186065245, + "grad_norm": 0.2966064384297838, + "learning_rate": 3.16827085779594e-05, + "loss": 0.3355, + "step": 1151 + }, + { + "epoch": 1.85581957309706, + "grad_norm": 0.2810752575128961, + "learning_rate": 3.1664422371291866e-05, + "loss": 0.3336, + "step": 1152 + }, + { + "epoch": 1.8574305275875957, + "grad_norm": 0.2784162936236359, + "learning_rate": 3.164612137508898e-05, + "loss": 0.3542, + "step": 1153 + }, + { + "epoch": 1.8590414820781311, + "grad_norm": 0.32474379464442327, + "learning_rate": 3.162780561255489e-05, + "loss": 0.368, + "step": 1154 + }, + { + "epoch": 1.860652436568667, + "grad_norm": 0.25214198273713495, + "learning_rate": 3.16094751069125e-05, + "loss": 0.3208, + "step": 1155 + }, + { + "epoch": 1.8622633910592026, + "grad_norm": 0.2804333800265253, + "learning_rate": 3.15911298814034e-05, + "loss": 0.3633, + "step": 1156 + }, + { + "epoch": 1.8638743455497382, + "grad_norm": 0.3177102741885549, + "learning_rate": 3.157276995928783e-05, + "loss": 0.3548, + "step": 1157 + }, + { + "epoch": 1.865485300040274, + "grad_norm": 0.2779089270761033, + "learning_rate": 3.155439536384467e-05, + "loss": 0.3397, + "step": 1158 + }, + { + "epoch": 1.8670962545308094, + "grad_norm": 0.30157569369109105, + "learning_rate": 3.153600611837142e-05, + "loss": 0.3538, + "step": 1159 + }, + { + "epoch": 1.8687072090213452, + "grad_norm": 0.3222782938570124, + "learning_rate": 3.151760224618413e-05, + "loss": 0.3584, + "step": 1160 + }, + { + "epoch": 1.8703181635118806, + "grad_norm": 0.26202863886970124, + "learning_rate": 3.1499183770617414e-05, + "loss": 0.3294, + "step": 1161 + }, + { + "epoch": 1.8719291180024165, + "grad_norm": 0.3313404569532206, + "learning_rate": 3.1480750715024396e-05, + "loss": 0.3607, + "step": 1162 + }, + { + "epoch": 1.873540072492952, + "grad_norm": 0.2804570203558461, + "learning_rate": 3.146230310277668e-05, + "loss": 0.3684, + "step": 1163 + }, + { + "epoch": 1.8751510269834877, + "grad_norm": 0.328059825473741, + "learning_rate": 3.144384095726433e-05, + "loss": 0.3345, + "step": 1164 + }, + { + "epoch": 1.8767619814740235, + "grad_norm": 0.3310382940626733, + "learning_rate": 3.142536430189585e-05, + "loss": 0.3592, + "step": 1165 + }, + { + "epoch": 1.878372935964559, + "grad_norm": 0.2571589985927003, + "learning_rate": 3.140687316009812e-05, + "loss": 0.334, + "step": 1166 + }, + { + "epoch": 1.8799838904550947, + "grad_norm": 0.3084210247719844, + "learning_rate": 3.138836755531641e-05, + "loss": 0.3549, + "step": 1167 + }, + { + "epoch": 1.8815948449456303, + "grad_norm": 0.3136180657588817, + "learning_rate": 3.13698475110143e-05, + "loss": 0.3385, + "step": 1168 + }, + { + "epoch": 1.883205799436166, + "grad_norm": 0.28698454787368805, + "learning_rate": 3.135131305067372e-05, + "loss": 0.3459, + "step": 1169 + }, + { + "epoch": 1.8848167539267016, + "grad_norm": 0.3203660869415405, + "learning_rate": 3.1332764197794825e-05, + "loss": 0.3665, + "step": 1170 + }, + { + "epoch": 1.8864277084172372, + "grad_norm": 0.2760307046037802, + "learning_rate": 3.131420097589606e-05, + "loss": 0.3535, + "step": 1171 + }, + { + "epoch": 1.888038662907773, + "grad_norm": 0.304173676179774, + "learning_rate": 3.129562340851408e-05, + "loss": 0.3598, + "step": 1172 + }, + { + "epoch": 1.8896496173983084, + "grad_norm": 0.24789364158706606, + "learning_rate": 3.127703151920371e-05, + "loss": 0.3423, + "step": 1173 + }, + { + "epoch": 1.8912605718888442, + "grad_norm": 0.27869905089039404, + "learning_rate": 3.125842533153796e-05, + "loss": 0.3801, + "step": 1174 + }, + { + "epoch": 1.8928715263793798, + "grad_norm": 0.2659917053384485, + "learning_rate": 3.1239804869107943e-05, + "loss": 0.3162, + "step": 1175 + }, + { + "epoch": 1.8944824808699154, + "grad_norm": 0.2740676239526807, + "learning_rate": 3.1221170155522896e-05, + "loss": 0.3763, + "step": 1176 + }, + { + "epoch": 1.896093435360451, + "grad_norm": 0.3036149904699523, + "learning_rate": 3.1202521214410116e-05, + "loss": 0.3324, + "step": 1177 + }, + { + "epoch": 1.8977043898509867, + "grad_norm": 0.2704344928402855, + "learning_rate": 3.1183858069414936e-05, + "loss": 0.3508, + "step": 1178 + }, + { + "epoch": 1.8993153443415225, + "grad_norm": 0.2745284876347354, + "learning_rate": 3.1165180744200704e-05, + "loss": 0.3194, + "step": 1179 + }, + { + "epoch": 1.9009262988320579, + "grad_norm": 0.2584540162273995, + "learning_rate": 3.114648926244873e-05, + "loss": 0.3385, + "step": 1180 + }, + { + "epoch": 1.9025372533225937, + "grad_norm": 0.3116807537951856, + "learning_rate": 3.11277836478583e-05, + "loss": 0.3586, + "step": 1181 + }, + { + "epoch": 1.9041482078131293, + "grad_norm": 0.2623123292830474, + "learning_rate": 3.11090639241466e-05, + "loss": 0.3502, + "step": 1182 + }, + { + "epoch": 1.905759162303665, + "grad_norm": 0.2806030867053086, + "learning_rate": 3.1090330115048716e-05, + "loss": 0.3602, + "step": 1183 + }, + { + "epoch": 1.9073701167942005, + "grad_norm": 0.30229750912862674, + "learning_rate": 3.107158224431759e-05, + "loss": 0.3303, + "step": 1184 + }, + { + "epoch": 1.9089810712847362, + "grad_norm": 0.3148634173453989, + "learning_rate": 3.105282033572398e-05, + "loss": 0.3854, + "step": 1185 + }, + { + "epoch": 1.910592025775272, + "grad_norm": 0.28827408407660937, + "learning_rate": 3.1034044413056465e-05, + "loss": 0.3304, + "step": 1186 + }, + { + "epoch": 1.9122029802658074, + "grad_norm": 0.3063959496143443, + "learning_rate": 3.1015254500121376e-05, + "loss": 0.3747, + "step": 1187 + }, + { + "epoch": 1.9138139347563432, + "grad_norm": 0.2714542673376483, + "learning_rate": 3.09964506207428e-05, + "loss": 0.3437, + "step": 1188 + }, + { + "epoch": 1.9154248892468788, + "grad_norm": 0.2550631224442582, + "learning_rate": 3.097763279876251e-05, + "loss": 0.3379, + "step": 1189 + }, + { + "epoch": 1.9170358437374144, + "grad_norm": 0.27675565441191935, + "learning_rate": 3.095880105803997e-05, + "loss": 0.3567, + "step": 1190 + }, + { + "epoch": 1.91864679822795, + "grad_norm": 0.25174769021841753, + "learning_rate": 3.09399554224523e-05, + "loss": 0.3066, + "step": 1191 + }, + { + "epoch": 1.9202577527184856, + "grad_norm": 0.3167995968880189, + "learning_rate": 3.092109591589421e-05, + "loss": 0.3712, + "step": 1192 + }, + { + "epoch": 1.9218687072090215, + "grad_norm": 0.25594727349036206, + "learning_rate": 3.0902222562278025e-05, + "loss": 0.3463, + "step": 1193 + }, + { + "epoch": 1.9234796616995569, + "grad_norm": 0.2623623493033658, + "learning_rate": 3.088333538553361e-05, + "loss": 0.3485, + "step": 1194 + }, + { + "epoch": 1.9250906161900927, + "grad_norm": 0.24387996552506205, + "learning_rate": 3.086443440960838e-05, + "loss": 0.3247, + "step": 1195 + }, + { + "epoch": 1.9267015706806283, + "grad_norm": 0.27837207857209045, + "learning_rate": 3.084551965846721e-05, + "loss": 0.3378, + "step": 1196 + }, + { + "epoch": 1.928312525171164, + "grad_norm": 0.30825795217226376, + "learning_rate": 3.0826591156092474e-05, + "loss": 0.4079, + "step": 1197 + }, + { + "epoch": 1.9299234796616995, + "grad_norm": 0.2893650060453642, + "learning_rate": 3.080764892648396e-05, + "loss": 0.3492, + "step": 1198 + }, + { + "epoch": 1.9315344341522351, + "grad_norm": 0.2649652705405745, + "learning_rate": 3.0788692993658874e-05, + "loss": 0.3357, + "step": 1199 + }, + { + "epoch": 1.933145388642771, + "grad_norm": 0.29712091496353926, + "learning_rate": 3.076972338165178e-05, + "loss": 0.3542, + "step": 1200 + }, + { + "epoch": 1.9347563431333064, + "grad_norm": 0.27057553138069074, + "learning_rate": 3.075074011451461e-05, + "loss": 0.3755, + "step": 1201 + }, + { + "epoch": 1.9363672976238422, + "grad_norm": 0.2491519925272819, + "learning_rate": 3.07317432163166e-05, + "loss": 0.3191, + "step": 1202 + }, + { + "epoch": 1.9379782521143778, + "grad_norm": 0.26410068635338846, + "learning_rate": 3.0712732711144254e-05, + "loss": 0.3462, + "step": 1203 + }, + { + "epoch": 1.9395892066049134, + "grad_norm": 0.3053998571246562, + "learning_rate": 3.0693708623101345e-05, + "loss": 0.3587, + "step": 1204 + }, + { + "epoch": 1.941200161095449, + "grad_norm": 0.2766139786219459, + "learning_rate": 3.067467097630886e-05, + "loss": 0.3506, + "step": 1205 + }, + { + "epoch": 1.9428111155859846, + "grad_norm": 0.30093579529593656, + "learning_rate": 3.0655619794905e-05, + "loss": 0.3676, + "step": 1206 + }, + { + "epoch": 1.9444220700765205, + "grad_norm": 0.26128628640625445, + "learning_rate": 3.063655510304508e-05, + "loss": 0.3358, + "step": 1207 + }, + { + "epoch": 1.9460330245670558, + "grad_norm": 0.29154302004266824, + "learning_rate": 3.061747692490159e-05, + "loss": 0.3621, + "step": 1208 + }, + { + "epoch": 1.9476439790575917, + "grad_norm": 0.2518019757597743, + "learning_rate": 3.05983852846641e-05, + "loss": 0.3323, + "step": 1209 + }, + { + "epoch": 1.9492549335481273, + "grad_norm": 0.2691404649318548, + "learning_rate": 3.057928020653925e-05, + "loss": 0.3441, + "step": 1210 + }, + { + "epoch": 1.950865888038663, + "grad_norm": 0.2548234265739506, + "learning_rate": 3.056016171475072e-05, + "loss": 0.3453, + "step": 1211 + }, + { + "epoch": 1.9524768425291985, + "grad_norm": 0.25140408398192154, + "learning_rate": 3.0541029833539195e-05, + "loss": 0.3427, + "step": 1212 + }, + { + "epoch": 1.9540877970197341, + "grad_norm": 0.27769435781256435, + "learning_rate": 3.0521884587162344e-05, + "loss": 0.3448, + "step": 1213 + }, + { + "epoch": 1.95569875151027, + "grad_norm": 0.2572998449613547, + "learning_rate": 3.050272599989477e-05, + "loss": 0.3495, + "step": 1214 + }, + { + "epoch": 1.9573097060008053, + "grad_norm": 0.2557887816223843, + "learning_rate": 3.0483554096027998e-05, + "loss": 0.328, + "step": 1215 + }, + { + "epoch": 1.9589206604913412, + "grad_norm": 0.2614863654535399, + "learning_rate": 3.046436889987044e-05, + "loss": 0.3426, + "step": 1216 + }, + { + "epoch": 1.9605316149818768, + "grad_norm": 0.2375877870665729, + "learning_rate": 3.0445170435747364e-05, + "loss": 0.3031, + "step": 1217 + }, + { + "epoch": 1.9621425694724124, + "grad_norm": 0.2894524906451252, + "learning_rate": 3.0425958728000845e-05, + "loss": 0.3805, + "step": 1218 + }, + { + "epoch": 1.963753523962948, + "grad_norm": 0.2593732546970939, + "learning_rate": 3.0406733800989766e-05, + "loss": 0.3456, + "step": 1219 + }, + { + "epoch": 1.9653644784534836, + "grad_norm": 0.2533421503146536, + "learning_rate": 3.0387495679089753e-05, + "loss": 0.3617, + "step": 1220 + }, + { + "epoch": 1.9669754329440194, + "grad_norm": 0.2737014852345156, + "learning_rate": 3.0368244386693196e-05, + "loss": 0.3451, + "step": 1221 + }, + { + "epoch": 1.9685863874345548, + "grad_norm": 0.2385100617519932, + "learning_rate": 3.034897994820915e-05, + "loss": 0.3462, + "step": 1222 + }, + { + "epoch": 1.9701973419250907, + "grad_norm": 0.2679673464061188, + "learning_rate": 3.0329702388063348e-05, + "loss": 0.3474, + "step": 1223 + }, + { + "epoch": 1.9718082964156263, + "grad_norm": 0.25455746591228023, + "learning_rate": 3.0310411730698166e-05, + "loss": 0.3403, + "step": 1224 + }, + { + "epoch": 1.9734192509061619, + "grad_norm": 0.2559764269490091, + "learning_rate": 3.029110800057258e-05, + "loss": 0.3268, + "step": 1225 + }, + { + "epoch": 1.9750302053966975, + "grad_norm": 0.26102725959877315, + "learning_rate": 3.0271791222162145e-05, + "loss": 0.3513, + "step": 1226 + }, + { + "epoch": 1.976641159887233, + "grad_norm": 0.2837350041203044, + "learning_rate": 3.0252461419958968e-05, + "loss": 0.355, + "step": 1227 + }, + { + "epoch": 1.978252114377769, + "grad_norm": 0.276477357654969, + "learning_rate": 3.023311861847165e-05, + "loss": 0.3156, + "step": 1228 + }, + { + "epoch": 1.9798630688683043, + "grad_norm": 0.2630117478712932, + "learning_rate": 3.0213762842225284e-05, + "loss": 0.3311, + "step": 1229 + }, + { + "epoch": 1.9814740233588402, + "grad_norm": 0.3217425165655156, + "learning_rate": 3.0194394115761415e-05, + "loss": 0.3413, + "step": 1230 + }, + { + "epoch": 1.9830849778493758, + "grad_norm": 0.3176180655742061, + "learning_rate": 3.0175012463638016e-05, + "loss": 0.3541, + "step": 1231 + }, + { + "epoch": 1.9846959323399114, + "grad_norm": 0.32893088484252825, + "learning_rate": 3.0155617910429426e-05, + "loss": 0.3467, + "step": 1232 + }, + { + "epoch": 1.9863068868304472, + "grad_norm": 0.3081265438017872, + "learning_rate": 3.0136210480726365e-05, + "loss": 0.3385, + "step": 1233 + }, + { + "epoch": 1.9879178413209826, + "grad_norm": 0.31493139090080025, + "learning_rate": 3.011679019913587e-05, + "loss": 0.3539, + "step": 1234 + }, + { + "epoch": 1.9895287958115184, + "grad_norm": 0.3293171737255103, + "learning_rate": 3.0097357090281267e-05, + "loss": 0.3587, + "step": 1235 + }, + { + "epoch": 1.9911397503020538, + "grad_norm": 0.2763097409073324, + "learning_rate": 3.0077911178802152e-05, + "loss": 0.3572, + "step": 1236 + }, + { + "epoch": 1.9927507047925896, + "grad_norm": 0.29206192980238854, + "learning_rate": 3.0058452489354358e-05, + "loss": 0.334, + "step": 1237 + }, + { + "epoch": 1.9943616592831253, + "grad_norm": 0.31050171676421406, + "learning_rate": 3.0038981046609915e-05, + "loss": 0.3542, + "step": 1238 + }, + { + "epoch": 1.9959726137736609, + "grad_norm": 0.2887676714430983, + "learning_rate": 3.0019496875257012e-05, + "loss": 0.3474, + "step": 1239 + }, + { + "epoch": 1.9975835682641967, + "grad_norm": 0.2960252968104312, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.3486, + "step": 1240 + }, + { + "epoch": 1.999194522754732, + "grad_norm": 0.28911173594595874, + "learning_rate": 2.9980490445559325e-05, + "loss": 0.3589, + "step": 1241 + }, + { + "epoch": 2.000805477245268, + "grad_norm": 0.39770939551092377, + "learning_rate": 2.9960968236671504e-05, + "loss": 0.3328, + "step": 1242 + }, + { + "epoch": 2.0024164317358033, + "grad_norm": 0.2857834417846077, + "learning_rate": 2.9941433398089098e-05, + "loss": 0.3006, + "step": 1243 + }, + { + "epoch": 2.004027386226339, + "grad_norm": 0.6949713663880401, + "learning_rate": 2.99218859545807e-05, + "loss": 0.2604, + "step": 1244 + }, + { + "epoch": 2.005638340716875, + "grad_norm": 0.5111752099726601, + "learning_rate": 2.9902325930930868e-05, + "loss": 0.2976, + "step": 1245 + }, + { + "epoch": 2.0072492952074104, + "grad_norm": 0.39163668448086514, + "learning_rate": 2.9882753351940115e-05, + "loss": 0.2594, + "step": 1246 + }, + { + "epoch": 2.008860249697946, + "grad_norm": 0.33796868345118525, + "learning_rate": 2.9863168242424873e-05, + "loss": 0.2955, + "step": 1247 + }, + { + "epoch": 2.0104712041884816, + "grad_norm": 0.3941724192343171, + "learning_rate": 2.9843570627217463e-05, + "loss": 0.2846, + "step": 1248 + }, + { + "epoch": 2.0120821586790174, + "grad_norm": 0.31642626423981934, + "learning_rate": 2.9823960531166065e-05, + "loss": 0.2781, + "step": 1249 + }, + { + "epoch": 2.013693113169553, + "grad_norm": 0.37669153988473264, + "learning_rate": 2.980433797913467e-05, + "loss": 0.2853, + "step": 1250 + }, + { + "epoch": 2.0153040676600886, + "grad_norm": 0.30754959333932375, + "learning_rate": 2.978470299600308e-05, + "loss": 0.2796, + "step": 1251 + }, + { + "epoch": 2.0169150221506245, + "grad_norm": 0.3422825055523034, + "learning_rate": 2.9765055606666844e-05, + "loss": 0.2664, + "step": 1252 + }, + { + "epoch": 2.01852597664116, + "grad_norm": 0.31316330889075605, + "learning_rate": 2.9745395836037265e-05, + "loss": 0.2814, + "step": 1253 + }, + { + "epoch": 2.0201369311316957, + "grad_norm": 0.39624358182523745, + "learning_rate": 2.9725723709041304e-05, + "loss": 0.2882, + "step": 1254 + }, + { + "epoch": 2.021747885622231, + "grad_norm": 0.29976044657757916, + "learning_rate": 2.9706039250621626e-05, + "loss": 0.2763, + "step": 1255 + }, + { + "epoch": 2.023358840112767, + "grad_norm": 0.3659709335531895, + "learning_rate": 2.968634248573651e-05, + "loss": 0.2939, + "step": 1256 + }, + { + "epoch": 2.0249697946033023, + "grad_norm": 0.30126786571812936, + "learning_rate": 2.9666633439359857e-05, + "loss": 0.2716, + "step": 1257 + }, + { + "epoch": 2.026580749093838, + "grad_norm": 0.3122695764846602, + "learning_rate": 2.9646912136481116e-05, + "loss": 0.2658, + "step": 1258 + }, + { + "epoch": 2.028191703584374, + "grad_norm": 0.2867439501044589, + "learning_rate": 2.9627178602105296e-05, + "loss": 0.2676, + "step": 1259 + }, + { + "epoch": 2.0298026580749093, + "grad_norm": 0.30992230207590665, + "learning_rate": 2.960743286125291e-05, + "loss": 0.2955, + "step": 1260 + }, + { + "epoch": 2.031413612565445, + "grad_norm": 0.3058682460420191, + "learning_rate": 2.958767493895994e-05, + "loss": 0.2896, + "step": 1261 + }, + { + "epoch": 2.0330245670559806, + "grad_norm": 0.26862316860853597, + "learning_rate": 2.9567904860277825e-05, + "loss": 0.2566, + "step": 1262 + }, + { + "epoch": 2.0346355215465164, + "grad_norm": 0.25194544144940745, + "learning_rate": 2.95481226502734e-05, + "loss": 0.2478, + "step": 1263 + }, + { + "epoch": 2.036246476037052, + "grad_norm": 0.30167552158426153, + "learning_rate": 2.9528328334028903e-05, + "loss": 0.2744, + "step": 1264 + }, + { + "epoch": 2.0378574305275876, + "grad_norm": 0.28046902794293055, + "learning_rate": 2.9508521936641906e-05, + "loss": 0.2833, + "step": 1265 + }, + { + "epoch": 2.0394683850181234, + "grad_norm": 0.310202694342102, + "learning_rate": 2.948870348322531e-05, + "loss": 0.3095, + "step": 1266 + }, + { + "epoch": 2.041079339508659, + "grad_norm": 0.2675039895958826, + "learning_rate": 2.9468872998907285e-05, + "loss": 0.2643, + "step": 1267 + }, + { + "epoch": 2.0426902939991947, + "grad_norm": 0.2594449033037284, + "learning_rate": 2.9449030508831273e-05, + "loss": 0.2682, + "step": 1268 + }, + { + "epoch": 2.04430124848973, + "grad_norm": 0.28891086210870076, + "learning_rate": 2.9429176038155938e-05, + "loss": 0.2621, + "step": 1269 + }, + { + "epoch": 2.045912202980266, + "grad_norm": 0.3343156311987029, + "learning_rate": 2.9409309612055116e-05, + "loss": 0.3005, + "step": 1270 + }, + { + "epoch": 2.0475231574708013, + "grad_norm": 0.31461832017319397, + "learning_rate": 2.938943125571782e-05, + "loss": 0.281, + "step": 1271 + }, + { + "epoch": 2.049134111961337, + "grad_norm": 0.30370641290272277, + "learning_rate": 2.9369540994348175e-05, + "loss": 0.2864, + "step": 1272 + }, + { + "epoch": 2.050745066451873, + "grad_norm": 0.27610662631816574, + "learning_rate": 2.9349638853165427e-05, + "loss": 0.2672, + "step": 1273 + }, + { + "epoch": 2.0523560209424083, + "grad_norm": 0.3276272216535393, + "learning_rate": 2.932972485740385e-05, + "loss": 0.282, + "step": 1274 + }, + { + "epoch": 2.053966975432944, + "grad_norm": 0.3009539346491537, + "learning_rate": 2.9309799032312775e-05, + "loss": 0.253, + "step": 1275 + }, + { + "epoch": 2.0555779299234795, + "grad_norm": 0.28633699999733225, + "learning_rate": 2.9289861403156504e-05, + "loss": 0.2791, + "step": 1276 + }, + { + "epoch": 2.0571888844140154, + "grad_norm": 0.35069115032081216, + "learning_rate": 2.9269911995214354e-05, + "loss": 0.2621, + "step": 1277 + }, + { + "epoch": 2.0587998389045508, + "grad_norm": 0.2799484554842468, + "learning_rate": 2.924995083378051e-05, + "loss": 0.2723, + "step": 1278 + }, + { + "epoch": 2.0604107933950866, + "grad_norm": 0.36643461872273314, + "learning_rate": 2.922997794416412e-05, + "loss": 0.2745, + "step": 1279 + }, + { + "epoch": 2.0620217478856224, + "grad_norm": 0.2855006354725177, + "learning_rate": 2.920999335168917e-05, + "loss": 0.2604, + "step": 1280 + }, + { + "epoch": 2.063632702376158, + "grad_norm": 0.37126926892403106, + "learning_rate": 2.9189997081694493e-05, + "loss": 0.2772, + "step": 1281 + }, + { + "epoch": 2.0652436568666936, + "grad_norm": 0.3006772235704741, + "learning_rate": 2.916998915953373e-05, + "loss": 0.302, + "step": 1282 + }, + { + "epoch": 2.066854611357229, + "grad_norm": 0.3753958972390217, + "learning_rate": 2.914996961057528e-05, + "loss": 0.2811, + "step": 1283 + }, + { + "epoch": 2.068465565847765, + "grad_norm": 0.2994073547515185, + "learning_rate": 2.9129938460202312e-05, + "loss": 0.2859, + "step": 1284 + }, + { + "epoch": 2.0700765203383003, + "grad_norm": 0.31202683354041316, + "learning_rate": 2.910989573381268e-05, + "loss": 0.2733, + "step": 1285 + }, + { + "epoch": 2.071687474828836, + "grad_norm": 0.2767590884579919, + "learning_rate": 2.9089841456818935e-05, + "loss": 0.2515, + "step": 1286 + }, + { + "epoch": 2.073298429319372, + "grad_norm": 0.2735836950606523, + "learning_rate": 2.906977565464825e-05, + "loss": 0.2903, + "step": 1287 + }, + { + "epoch": 2.0749093838099073, + "grad_norm": 0.2921931649866422, + "learning_rate": 2.9049698352742438e-05, + "loss": 0.2981, + "step": 1288 + }, + { + "epoch": 2.076520338300443, + "grad_norm": 0.267867103961345, + "learning_rate": 2.9029609576557863e-05, + "loss": 0.2652, + "step": 1289 + }, + { + "epoch": 2.0781312927909785, + "grad_norm": 0.29224521078187127, + "learning_rate": 2.9009509351565462e-05, + "loss": 0.2647, + "step": 1290 + }, + { + "epoch": 2.0797422472815144, + "grad_norm": 0.2729335202374728, + "learning_rate": 2.8989397703250682e-05, + "loss": 0.2798, + "step": 1291 + }, + { + "epoch": 2.0813532017720497, + "grad_norm": 0.26817050857073244, + "learning_rate": 2.8969274657113452e-05, + "loss": 0.2839, + "step": 1292 + }, + { + "epoch": 2.0829641562625856, + "grad_norm": 0.2954611997992363, + "learning_rate": 2.8949140238668145e-05, + "loss": 0.3008, + "step": 1293 + }, + { + "epoch": 2.0845751107531214, + "grad_norm": 0.2584560509593604, + "learning_rate": 2.8928994473443557e-05, + "loss": 0.2863, + "step": 1294 + }, + { + "epoch": 2.086186065243657, + "grad_norm": 0.2697921301073831, + "learning_rate": 2.890883738698289e-05, + "loss": 0.2786, + "step": 1295 + }, + { + "epoch": 2.0877970197341926, + "grad_norm": 0.2803128875813109, + "learning_rate": 2.8888669004843665e-05, + "loss": 0.2734, + "step": 1296 + }, + { + "epoch": 2.089407974224728, + "grad_norm": 0.2381547290712765, + "learning_rate": 2.8868489352597762e-05, + "loss": 0.251, + "step": 1297 + }, + { + "epoch": 2.091018928715264, + "grad_norm": 0.3323977346030221, + "learning_rate": 2.8848298455831317e-05, + "loss": 0.308, + "step": 1298 + }, + { + "epoch": 2.0926298832057992, + "grad_norm": 0.27153635242123286, + "learning_rate": 2.882809634014475e-05, + "loss": 0.2589, + "step": 1299 + }, + { + "epoch": 2.094240837696335, + "grad_norm": 0.285739554905134, + "learning_rate": 2.880788303115269e-05, + "loss": 0.273, + "step": 1300 + }, + { + "epoch": 2.095851792186871, + "grad_norm": 0.3010326450465779, + "learning_rate": 2.878765855448396e-05, + "loss": 0.3033, + "step": 1301 + }, + { + "epoch": 2.0974627466774063, + "grad_norm": 0.24696037255401815, + "learning_rate": 2.876742293578155e-05, + "loss": 0.2609, + "step": 1302 + }, + { + "epoch": 2.099073701167942, + "grad_norm": 0.27166822832389825, + "learning_rate": 2.8747176200702572e-05, + "loss": 0.2551, + "step": 1303 + }, + { + "epoch": 2.1006846556584775, + "grad_norm": 0.2735038670067162, + "learning_rate": 2.8726918374918233e-05, + "loss": 0.2758, + "step": 1304 + }, + { + "epoch": 2.1022956101490133, + "grad_norm": 0.3028520148216462, + "learning_rate": 2.87066494841138e-05, + "loss": 0.2911, + "step": 1305 + }, + { + "epoch": 2.1039065646395487, + "grad_norm": 0.2872325900858017, + "learning_rate": 2.8686369553988576e-05, + "loss": 0.2843, + "step": 1306 + }, + { + "epoch": 2.1055175191300846, + "grad_norm": 0.27879094380561814, + "learning_rate": 2.8666078610255854e-05, + "loss": 0.2792, + "step": 1307 + }, + { + "epoch": 2.1071284736206204, + "grad_norm": 0.30483411415338657, + "learning_rate": 2.8645776678642893e-05, + "loss": 0.2495, + "step": 1308 + }, + { + "epoch": 2.108739428111156, + "grad_norm": 0.2683464007417797, + "learning_rate": 2.8625463784890884e-05, + "loss": 0.2867, + "step": 1309 + }, + { + "epoch": 2.1103503826016916, + "grad_norm": 0.29749249023584406, + "learning_rate": 2.8605139954754923e-05, + "loss": 0.278, + "step": 1310 + }, + { + "epoch": 2.111961337092227, + "grad_norm": 0.2687970108131372, + "learning_rate": 2.8584805214003967e-05, + "loss": 0.2504, + "step": 1311 + }, + { + "epoch": 2.113572291582763, + "grad_norm": 0.32466154003508846, + "learning_rate": 2.8564459588420807e-05, + "loss": 0.2991, + "step": 1312 + }, + { + "epoch": 2.115183246073298, + "grad_norm": 0.26356374059791937, + "learning_rate": 2.854410310380203e-05, + "loss": 0.2656, + "step": 1313 + }, + { + "epoch": 2.116794200563834, + "grad_norm": 0.2949078910210175, + "learning_rate": 2.8523735785958e-05, + "loss": 0.2684, + "step": 1314 + }, + { + "epoch": 2.11840515505437, + "grad_norm": 0.31103645726920576, + "learning_rate": 2.8503357660712815e-05, + "loss": 0.3014, + "step": 1315 + }, + { + "epoch": 2.1200161095449053, + "grad_norm": 0.3295336108394426, + "learning_rate": 2.8482968753904277e-05, + "loss": 0.2945, + "step": 1316 + }, + { + "epoch": 2.121627064035441, + "grad_norm": 0.3043674560354506, + "learning_rate": 2.8462569091383853e-05, + "loss": 0.2761, + "step": 1317 + }, + { + "epoch": 2.1232380185259765, + "grad_norm": 0.2945740096658187, + "learning_rate": 2.844215869901664e-05, + "loss": 0.2933, + "step": 1318 + }, + { + "epoch": 2.1248489730165123, + "grad_norm": 0.2751385832645397, + "learning_rate": 2.8421737602681364e-05, + "loss": 0.2657, + "step": 1319 + }, + { + "epoch": 2.1264599275070477, + "grad_norm": 0.2966644566850293, + "learning_rate": 2.8401305828270302e-05, + "loss": 0.2744, + "step": 1320 + }, + { + "epoch": 2.1280708819975835, + "grad_norm": 0.27307884228282997, + "learning_rate": 2.838086340168927e-05, + "loss": 0.2798, + "step": 1321 + }, + { + "epoch": 2.1296818364881194, + "grad_norm": 0.2877534471726608, + "learning_rate": 2.836041034885761e-05, + "loss": 0.2493, + "step": 1322 + }, + { + "epoch": 2.1312927909786548, + "grad_norm": 0.2614334519008766, + "learning_rate": 2.833994669570811e-05, + "loss": 0.2874, + "step": 1323 + }, + { + "epoch": 2.1329037454691906, + "grad_norm": 0.29160322965169916, + "learning_rate": 2.831947246818702e-05, + "loss": 0.2856, + "step": 1324 + }, + { + "epoch": 2.134514699959726, + "grad_norm": 0.2660119080149266, + "learning_rate": 2.829898769225399e-05, + "loss": 0.3064, + "step": 1325 + }, + { + "epoch": 2.136125654450262, + "grad_norm": 0.2912904811381936, + "learning_rate": 2.8278492393882032e-05, + "loss": 0.2657, + "step": 1326 + }, + { + "epoch": 2.1377366089407976, + "grad_norm": 0.2448795402010965, + "learning_rate": 2.8257986599057524e-05, + "loss": 0.2437, + "step": 1327 + }, + { + "epoch": 2.139347563431333, + "grad_norm": 0.27855483837999373, + "learning_rate": 2.8237470333780136e-05, + "loss": 0.281, + "step": 1328 + }, + { + "epoch": 2.140958517921869, + "grad_norm": 0.28864912755883265, + "learning_rate": 2.8216943624062815e-05, + "loss": 0.2634, + "step": 1329 + }, + { + "epoch": 2.1425694724124043, + "grad_norm": 0.2935166591639771, + "learning_rate": 2.8196406495931753e-05, + "loss": 0.2896, + "step": 1330 + }, + { + "epoch": 2.14418042690294, + "grad_norm": 0.27542513543126673, + "learning_rate": 2.8175858975426358e-05, + "loss": 0.2674, + "step": 1331 + }, + { + "epoch": 2.1457913813934755, + "grad_norm": 0.28218015707916827, + "learning_rate": 2.81553010885992e-05, + "loss": 0.2784, + "step": 1332 + }, + { + "epoch": 2.1474023358840113, + "grad_norm": 0.26365559112393655, + "learning_rate": 2.813473286151601e-05, + "loss": 0.2661, + "step": 1333 + }, + { + "epoch": 2.1490132903745467, + "grad_norm": 0.2649219889260671, + "learning_rate": 2.8114154320255612e-05, + "loss": 0.2729, + "step": 1334 + }, + { + "epoch": 2.1506242448650825, + "grad_norm": 0.2526894475726239, + "learning_rate": 2.809356549090992e-05, + "loss": 0.2877, + "step": 1335 + }, + { + "epoch": 2.1522351993556184, + "grad_norm": 0.2846315930814484, + "learning_rate": 2.8072966399583897e-05, + "loss": 0.2967, + "step": 1336 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.24679278451222952, + "learning_rate": 2.8052357072395494e-05, + "loss": 0.2792, + "step": 1337 + }, + { + "epoch": 2.1554571083366896, + "grad_norm": 0.25433364102855244, + "learning_rate": 2.8031737535475668e-05, + "loss": 0.273, + "step": 1338 + }, + { + "epoch": 2.157068062827225, + "grad_norm": 0.32637103264618766, + "learning_rate": 2.801110781496831e-05, + "loss": 0.29, + "step": 1339 + }, + { + "epoch": 2.158679017317761, + "grad_norm": 0.2554277138169469, + "learning_rate": 2.799046793703021e-05, + "loss": 0.268, + "step": 1340 + }, + { + "epoch": 2.1602899718082966, + "grad_norm": 0.26969310352840337, + "learning_rate": 2.796981792783105e-05, + "loss": 0.2853, + "step": 1341 + }, + { + "epoch": 2.161900926298832, + "grad_norm": 0.2648111637517603, + "learning_rate": 2.7949157813553366e-05, + "loss": 0.2626, + "step": 1342 + }, + { + "epoch": 2.163511880789368, + "grad_norm": 0.2559746677208276, + "learning_rate": 2.7928487620392487e-05, + "loss": 0.3044, + "step": 1343 + }, + { + "epoch": 2.1651228352799032, + "grad_norm": 0.2714805738931639, + "learning_rate": 2.790780737455654e-05, + "loss": 0.2633, + "step": 1344 + }, + { + "epoch": 2.166733789770439, + "grad_norm": 0.24864674470379333, + "learning_rate": 2.7887117102266373e-05, + "loss": 0.2552, + "step": 1345 + }, + { + "epoch": 2.1683447442609745, + "grad_norm": 0.2547234928945848, + "learning_rate": 2.786641682975558e-05, + "loss": 0.2819, + "step": 1346 + }, + { + "epoch": 2.1699556987515103, + "grad_norm": 0.26524020894968475, + "learning_rate": 2.78457065832704e-05, + "loss": 0.3123, + "step": 1347 + }, + { + "epoch": 2.171566653242046, + "grad_norm": 0.2449565344886634, + "learning_rate": 2.782498638906975e-05, + "loss": 0.2643, + "step": 1348 + }, + { + "epoch": 2.1731776077325815, + "grad_norm": 0.2893223634942532, + "learning_rate": 2.780425627342514e-05, + "loss": 0.2816, + "step": 1349 + }, + { + "epoch": 2.1747885622231173, + "grad_norm": 0.2641617143950078, + "learning_rate": 2.7783516262620657e-05, + "loss": 0.2727, + "step": 1350 + }, + { + "epoch": 2.1763995167136527, + "grad_norm": 0.24627345310846985, + "learning_rate": 2.7762766382952948e-05, + "loss": 0.2814, + "step": 1351 + }, + { + "epoch": 2.1780104712041886, + "grad_norm": 0.28246010031678387, + "learning_rate": 2.7742006660731164e-05, + "loss": 0.2872, + "step": 1352 + }, + { + "epoch": 2.179621425694724, + "grad_norm": 0.27546870573554, + "learning_rate": 2.7721237122276944e-05, + "loss": 0.2822, + "step": 1353 + }, + { + "epoch": 2.18123238018526, + "grad_norm": 0.2520502824176606, + "learning_rate": 2.7700457793924357e-05, + "loss": 0.2802, + "step": 1354 + }, + { + "epoch": 2.1828433346757956, + "grad_norm": 0.26219847355655707, + "learning_rate": 2.767966870201991e-05, + "loss": 0.3008, + "step": 1355 + }, + { + "epoch": 2.184454289166331, + "grad_norm": 0.24699012142387217, + "learning_rate": 2.765886987292246e-05, + "loss": 0.2777, + "step": 1356 + }, + { + "epoch": 2.186065243656867, + "grad_norm": 0.23388920141854022, + "learning_rate": 2.7638061333003236e-05, + "loss": 0.2587, + "step": 1357 + }, + { + "epoch": 2.187676198147402, + "grad_norm": 0.26392578878534023, + "learning_rate": 2.7617243108645753e-05, + "loss": 0.3032, + "step": 1358 + }, + { + "epoch": 2.189287152637938, + "grad_norm": 0.23761776973362883, + "learning_rate": 2.759641522624583e-05, + "loss": 0.2498, + "step": 1359 + }, + { + "epoch": 2.1908981071284734, + "grad_norm": 0.27175194130700814, + "learning_rate": 2.7575577712211524e-05, + "loss": 0.2634, + "step": 1360 + }, + { + "epoch": 2.1925090616190093, + "grad_norm": 0.2676460221850602, + "learning_rate": 2.755473059296309e-05, + "loss": 0.2786, + "step": 1361 + }, + { + "epoch": 2.194120016109545, + "grad_norm": 0.276401287454354, + "learning_rate": 2.7533873894932996e-05, + "loss": 0.2798, + "step": 1362 + }, + { + "epoch": 2.1957309706000805, + "grad_norm": 0.24034592871532384, + "learning_rate": 2.7513007644565806e-05, + "loss": 0.262, + "step": 1363 + }, + { + "epoch": 2.1973419250906163, + "grad_norm": 0.25935727042899787, + "learning_rate": 2.7492131868318247e-05, + "loss": 0.2785, + "step": 1364 + }, + { + "epoch": 2.1989528795811517, + "grad_norm": 0.26803580380287895, + "learning_rate": 2.7471246592659075e-05, + "loss": 0.286, + "step": 1365 + }, + { + "epoch": 2.2005638340716875, + "grad_norm": 0.2801935730796939, + "learning_rate": 2.745035184406913e-05, + "loss": 0.2848, + "step": 1366 + }, + { + "epoch": 2.202174788562223, + "grad_norm": 0.25675623918826773, + "learning_rate": 2.7429447649041243e-05, + "loss": 0.2602, + "step": 1367 + }, + { + "epoch": 2.2037857430527588, + "grad_norm": 0.25791438655113136, + "learning_rate": 2.7408534034080228e-05, + "loss": 0.262, + "step": 1368 + }, + { + "epoch": 2.2053966975432946, + "grad_norm": 0.2753646395273099, + "learning_rate": 2.7387611025702837e-05, + "loss": 0.2958, + "step": 1369 + }, + { + "epoch": 2.20700765203383, + "grad_norm": 0.2441198130082228, + "learning_rate": 2.736667865043775e-05, + "loss": 0.2689, + "step": 1370 + }, + { + "epoch": 2.208618606524366, + "grad_norm": 0.25462467078769574, + "learning_rate": 2.734573693482549e-05, + "loss": 0.2802, + "step": 1371 + }, + { + "epoch": 2.210229561014901, + "grad_norm": 0.25852451115604996, + "learning_rate": 2.732478590541846e-05, + "loss": 0.2712, + "step": 1372 + }, + { + "epoch": 2.211840515505437, + "grad_norm": 0.25961347780273764, + "learning_rate": 2.7303825588780844e-05, + "loss": 0.2633, + "step": 1373 + }, + { + "epoch": 2.2134514699959724, + "grad_norm": 0.3037253510918743, + "learning_rate": 2.7282856011488615e-05, + "loss": 0.2878, + "step": 1374 + }, + { + "epoch": 2.2150624244865083, + "grad_norm": 0.22592118401629108, + "learning_rate": 2.7261877200129495e-05, + "loss": 0.2416, + "step": 1375 + }, + { + "epoch": 2.216673378977044, + "grad_norm": 0.3573981841929647, + "learning_rate": 2.724088918130289e-05, + "loss": 0.3017, + "step": 1376 + }, + { + "epoch": 2.2182843334675795, + "grad_norm": 0.2512932467094342, + "learning_rate": 2.721989198161991e-05, + "loss": 0.2794, + "step": 1377 + }, + { + "epoch": 2.2198952879581153, + "grad_norm": 0.3159257756922127, + "learning_rate": 2.7198885627703266e-05, + "loss": 0.2618, + "step": 1378 + }, + { + "epoch": 2.2215062424486507, + "grad_norm": 0.2599794920778706, + "learning_rate": 2.7177870146187323e-05, + "loss": 0.2728, + "step": 1379 + }, + { + "epoch": 2.2231171969391865, + "grad_norm": 0.2978036146497738, + "learning_rate": 2.7156845563717987e-05, + "loss": 0.2991, + "step": 1380 + }, + { + "epoch": 2.224728151429722, + "grad_norm": 0.24686915917646665, + "learning_rate": 2.7135811906952714e-05, + "loss": 0.2664, + "step": 1381 + }, + { + "epoch": 2.2263391059202577, + "grad_norm": 0.28407633766585894, + "learning_rate": 2.711476920256046e-05, + "loss": 0.3044, + "step": 1382 + }, + { + "epoch": 2.2279500604107936, + "grad_norm": 0.260934439271912, + "learning_rate": 2.709371747722166e-05, + "loss": 0.2783, + "step": 1383 + }, + { + "epoch": 2.229561014901329, + "grad_norm": 0.24471982062639003, + "learning_rate": 2.7072656757628188e-05, + "loss": 0.2686, + "step": 1384 + }, + { + "epoch": 2.231171969391865, + "grad_norm": 0.2633341856655095, + "learning_rate": 2.7051587070483307e-05, + "loss": 0.28, + "step": 1385 + }, + { + "epoch": 2.2327829238824, + "grad_norm": 0.26507841081942923, + "learning_rate": 2.7030508442501667e-05, + "loss": 0.2783, + "step": 1386 + }, + { + "epoch": 2.234393878372936, + "grad_norm": 0.25464764785100497, + "learning_rate": 2.7009420900409237e-05, + "loss": 0.2821, + "step": 1387 + }, + { + "epoch": 2.2360048328634714, + "grad_norm": 0.2868050675323047, + "learning_rate": 2.6988324470943315e-05, + "loss": 0.283, + "step": 1388 + }, + { + "epoch": 2.2376157873540072, + "grad_norm": 0.24611229723847267, + "learning_rate": 2.6967219180852448e-05, + "loss": 0.2822, + "step": 1389 + }, + { + "epoch": 2.239226741844543, + "grad_norm": 0.27187189363506403, + "learning_rate": 2.6946105056896406e-05, + "loss": 0.2994, + "step": 1390 + }, + { + "epoch": 2.2408376963350785, + "grad_norm": 0.21615468885402625, + "learning_rate": 2.692498212584619e-05, + "loss": 0.2415, + "step": 1391 + }, + { + "epoch": 2.2424486508256143, + "grad_norm": 0.26600283632882193, + "learning_rate": 2.6903850414483953e-05, + "loss": 0.2836, + "step": 1392 + }, + { + "epoch": 2.2440596053161497, + "grad_norm": 0.23728976141998126, + "learning_rate": 2.6882709949602966e-05, + "loss": 0.2841, + "step": 1393 + }, + { + "epoch": 2.2456705598066855, + "grad_norm": 0.25798754292795073, + "learning_rate": 2.6861560758007627e-05, + "loss": 0.2658, + "step": 1394 + }, + { + "epoch": 2.247281514297221, + "grad_norm": 0.2734561974922546, + "learning_rate": 2.684040286651338e-05, + "loss": 0.306, + "step": 1395 + }, + { + "epoch": 2.2488924687877567, + "grad_norm": 0.2858311588641443, + "learning_rate": 2.6819236301946697e-05, + "loss": 0.2793, + "step": 1396 + }, + { + "epoch": 2.2505034232782926, + "grad_norm": 0.2584315966942385, + "learning_rate": 2.6798061091145062e-05, + "loss": 0.2597, + "step": 1397 + }, + { + "epoch": 2.252114377768828, + "grad_norm": 0.27927968714043205, + "learning_rate": 2.677687726095691e-05, + "loss": 0.2817, + "step": 1398 + }, + { + "epoch": 2.253725332259364, + "grad_norm": 0.33168687660726137, + "learning_rate": 2.6755684838241614e-05, + "loss": 0.3129, + "step": 1399 + }, + { + "epoch": 2.255336286749899, + "grad_norm": 0.2554635016264873, + "learning_rate": 2.673448384986943e-05, + "loss": 0.2854, + "step": 1400 + }, + { + "epoch": 2.256947241240435, + "grad_norm": 0.2580831506179042, + "learning_rate": 2.6713274322721484e-05, + "loss": 0.2674, + "step": 1401 + }, + { + "epoch": 2.258558195730971, + "grad_norm": 0.2716906409548147, + "learning_rate": 2.669205628368972e-05, + "loss": 0.2931, + "step": 1402 + }, + { + "epoch": 2.260169150221506, + "grad_norm": 0.22672601567358644, + "learning_rate": 2.6670829759676882e-05, + "loss": 0.2621, + "step": 1403 + }, + { + "epoch": 2.261780104712042, + "grad_norm": 0.27308559599455845, + "learning_rate": 2.6649594777596476e-05, + "loss": 0.2802, + "step": 1404 + }, + { + "epoch": 2.2633910592025774, + "grad_norm": 0.25346462509049156, + "learning_rate": 2.6628351364372717e-05, + "loss": 0.2546, + "step": 1405 + }, + { + "epoch": 2.2650020136931133, + "grad_norm": 0.24996293140763204, + "learning_rate": 2.6607099546940526e-05, + "loss": 0.2944, + "step": 1406 + }, + { + "epoch": 2.2666129681836487, + "grad_norm": 0.24233262846738599, + "learning_rate": 2.6585839352245467e-05, + "loss": 0.2774, + "step": 1407 + }, + { + "epoch": 2.2682239226741845, + "grad_norm": 0.2583809561916111, + "learning_rate": 2.6564570807243728e-05, + "loss": 0.2653, + "step": 1408 + }, + { + "epoch": 2.26983487716472, + "grad_norm": 0.23405502541905018, + "learning_rate": 2.65432939389021e-05, + "loss": 0.2733, + "step": 1409 + }, + { + "epoch": 2.2714458316552557, + "grad_norm": 0.27446386749155977, + "learning_rate": 2.6522008774197902e-05, + "loss": 0.3221, + "step": 1410 + }, + { + "epoch": 2.2730567861457915, + "grad_norm": 0.2384738855739092, + "learning_rate": 2.6500715340118995e-05, + "loss": 0.2705, + "step": 1411 + }, + { + "epoch": 2.274667740636327, + "grad_norm": 0.24307003681995026, + "learning_rate": 2.6479413663663706e-05, + "loss": 0.2785, + "step": 1412 + }, + { + "epoch": 2.2762786951268628, + "grad_norm": 0.2526921045428496, + "learning_rate": 2.6458103771840835e-05, + "loss": 0.2871, + "step": 1413 + }, + { + "epoch": 2.277889649617398, + "grad_norm": 0.24931242626377267, + "learning_rate": 2.6436785691669578e-05, + "loss": 0.2808, + "step": 1414 + }, + { + "epoch": 2.279500604107934, + "grad_norm": 0.2887386630317653, + "learning_rate": 2.6415459450179515e-05, + "loss": 0.3049, + "step": 1415 + }, + { + "epoch": 2.28111155859847, + "grad_norm": 0.25033808397227236, + "learning_rate": 2.6394125074410584e-05, + "loss": 0.2904, + "step": 1416 + }, + { + "epoch": 2.282722513089005, + "grad_norm": 0.2699612396823536, + "learning_rate": 2.6372782591413034e-05, + "loss": 0.2729, + "step": 1417 + }, + { + "epoch": 2.284333467579541, + "grad_norm": 0.23924615054624318, + "learning_rate": 2.635143202824739e-05, + "loss": 0.2686, + "step": 1418 + }, + { + "epoch": 2.2859444220700764, + "grad_norm": 0.279887893837138, + "learning_rate": 2.6330073411984418e-05, + "loss": 0.2697, + "step": 1419 + }, + { + "epoch": 2.2875553765606123, + "grad_norm": 0.2545245668178236, + "learning_rate": 2.6308706769705118e-05, + "loss": 0.2698, + "step": 1420 + }, + { + "epoch": 2.2891663310511476, + "grad_norm": 0.2544237823113252, + "learning_rate": 2.6287332128500616e-05, + "loss": 0.2784, + "step": 1421 + }, + { + "epoch": 2.2907772855416835, + "grad_norm": 0.2588334468257935, + "learning_rate": 2.6265949515472247e-05, + "loss": 0.2856, + "step": 1422 + }, + { + "epoch": 2.292388240032219, + "grad_norm": 0.2640309275817744, + "learning_rate": 2.6244558957731386e-05, + "loss": 0.2921, + "step": 1423 + }, + { + "epoch": 2.2939991945227547, + "grad_norm": 0.27035992387226954, + "learning_rate": 2.622316048239954e-05, + "loss": 0.2719, + "step": 1424 + }, + { + "epoch": 2.2956101490132905, + "grad_norm": 0.2763448005510581, + "learning_rate": 2.6201754116608222e-05, + "loss": 0.2977, + "step": 1425 + }, + { + "epoch": 2.297221103503826, + "grad_norm": 0.2454033455212278, + "learning_rate": 2.618033988749895e-05, + "loss": 0.261, + "step": 1426 + }, + { + "epoch": 2.2988320579943617, + "grad_norm": 0.28206219976356545, + "learning_rate": 2.615891782222322e-05, + "loss": 0.3032, + "step": 1427 + }, + { + "epoch": 2.300443012484897, + "grad_norm": 0.25919572585437717, + "learning_rate": 2.6137487947942472e-05, + "loss": 0.2932, + "step": 1428 + }, + { + "epoch": 2.302053966975433, + "grad_norm": 0.23936651714279733, + "learning_rate": 2.6116050291828026e-05, + "loss": 0.2885, + "step": 1429 + }, + { + "epoch": 2.303664921465969, + "grad_norm": 0.26079396517622616, + "learning_rate": 2.6094604881061076e-05, + "loss": 0.2644, + "step": 1430 + }, + { + "epoch": 2.305275875956504, + "grad_norm": 0.27314314971219317, + "learning_rate": 2.607315174283267e-05, + "loss": 0.3197, + "step": 1431 + }, + { + "epoch": 2.30688683044704, + "grad_norm": 0.23566335984437972, + "learning_rate": 2.6051690904343616e-05, + "loss": 0.2795, + "step": 1432 + }, + { + "epoch": 2.3084977849375754, + "grad_norm": 0.2598558733558277, + "learning_rate": 2.6030222392804526e-05, + "loss": 0.2833, + "step": 1433 + }, + { + "epoch": 2.3101087394281112, + "grad_norm": 0.2226975838547479, + "learning_rate": 2.60087462354357e-05, + "loss": 0.2551, + "step": 1434 + }, + { + "epoch": 2.3117196939186466, + "grad_norm": 0.2798036466100915, + "learning_rate": 2.5987262459467168e-05, + "loss": 0.3226, + "step": 1435 + }, + { + "epoch": 2.3133306484091825, + "grad_norm": 0.2241641247015054, + "learning_rate": 2.5965771092138586e-05, + "loss": 0.2618, + "step": 1436 + }, + { + "epoch": 2.314941602899718, + "grad_norm": 0.28580401578871045, + "learning_rate": 2.5944272160699272e-05, + "loss": 0.2888, + "step": 1437 + }, + { + "epoch": 2.3165525573902537, + "grad_norm": 0.256570118424594, + "learning_rate": 2.5922765692408112e-05, + "loss": 0.2771, + "step": 1438 + }, + { + "epoch": 2.3181635118807895, + "grad_norm": 0.2578027303979569, + "learning_rate": 2.5901251714533543e-05, + "loss": 0.2574, + "step": 1439 + }, + { + "epoch": 2.319774466371325, + "grad_norm": 0.2798470005022122, + "learning_rate": 2.5879730254353543e-05, + "loss": 0.2981, + "step": 1440 + }, + { + "epoch": 2.3213854208618607, + "grad_norm": 0.27607698811706227, + "learning_rate": 2.585820133915555e-05, + "loss": 0.2865, + "step": 1441 + }, + { + "epoch": 2.322996375352396, + "grad_norm": 0.26912286490683257, + "learning_rate": 2.58366649962365e-05, + "loss": 0.2695, + "step": 1442 + }, + { + "epoch": 2.324607329842932, + "grad_norm": 0.26457014248103583, + "learning_rate": 2.581512125290269e-05, + "loss": 0.2812, + "step": 1443 + }, + { + "epoch": 2.326218284333468, + "grad_norm": 0.2598505097593347, + "learning_rate": 2.579357013646985e-05, + "loss": 0.2606, + "step": 1444 + }, + { + "epoch": 2.327829238824003, + "grad_norm": 0.31694547112574056, + "learning_rate": 2.5772011674263017e-05, + "loss": 0.2661, + "step": 1445 + }, + { + "epoch": 2.329440193314539, + "grad_norm": 0.24909334654252163, + "learning_rate": 2.575044589361657e-05, + "loss": 0.2705, + "step": 1446 + }, + { + "epoch": 2.3310511478050744, + "grad_norm": 0.2774620303831127, + "learning_rate": 2.5728872821874155e-05, + "loss": 0.2874, + "step": 1447 + }, + { + "epoch": 2.33266210229561, + "grad_norm": 0.2735139105570509, + "learning_rate": 2.5707292486388675e-05, + "loss": 0.3037, + "step": 1448 + }, + { + "epoch": 2.3342730567861456, + "grad_norm": 0.2740635594254122, + "learning_rate": 2.5685704914522223e-05, + "loss": 0.2897, + "step": 1449 + }, + { + "epoch": 2.3358840112766814, + "grad_norm": 0.24084315748689009, + "learning_rate": 2.566411013364608e-05, + "loss": 0.2486, + "step": 1450 + }, + { + "epoch": 2.3374949657672173, + "grad_norm": 0.287578601994385, + "learning_rate": 2.5642508171140675e-05, + "loss": 0.3072, + "step": 1451 + }, + { + "epoch": 2.3391059202577527, + "grad_norm": 0.2550922001271773, + "learning_rate": 2.562089905439552e-05, + "loss": 0.2716, + "step": 1452 + }, + { + "epoch": 2.3407168747482885, + "grad_norm": 0.28349657757580804, + "learning_rate": 2.5599282810809222e-05, + "loss": 0.2776, + "step": 1453 + }, + { + "epoch": 2.342327829238824, + "grad_norm": 0.2539299708169048, + "learning_rate": 2.5577659467789397e-05, + "loss": 0.2882, + "step": 1454 + }, + { + "epoch": 2.3439387837293597, + "grad_norm": 0.2638598040941746, + "learning_rate": 2.5556029052752704e-05, + "loss": 0.2753, + "step": 1455 + }, + { + "epoch": 2.345549738219895, + "grad_norm": 0.26546655271952696, + "learning_rate": 2.5534391593124717e-05, + "loss": 0.2727, + "step": 1456 + }, + { + "epoch": 2.347160692710431, + "grad_norm": 0.274114516324995, + "learning_rate": 2.5512747116339985e-05, + "loss": 0.2679, + "step": 1457 + }, + { + "epoch": 2.3487716472009668, + "grad_norm": 0.27615786351600663, + "learning_rate": 2.5491095649841936e-05, + "loss": 0.29, + "step": 1458 + }, + { + "epoch": 2.350382601691502, + "grad_norm": 0.2523360738477017, + "learning_rate": 2.5469437221082855e-05, + "loss": 0.2876, + "step": 1459 + }, + { + "epoch": 2.351993556182038, + "grad_norm": 0.26083088114961767, + "learning_rate": 2.5447771857523868e-05, + "loss": 0.2615, + "step": 1460 + }, + { + "epoch": 2.3536045106725734, + "grad_norm": 0.26930621483118783, + "learning_rate": 2.5426099586634885e-05, + "loss": 0.2798, + "step": 1461 + }, + { + "epoch": 2.355215465163109, + "grad_norm": 0.25754096649737024, + "learning_rate": 2.5404420435894578e-05, + "loss": 0.2668, + "step": 1462 + }, + { + "epoch": 2.356826419653645, + "grad_norm": 0.2699366881095369, + "learning_rate": 2.538273443279033e-05, + "loss": 0.3052, + "step": 1463 + }, + { + "epoch": 2.3584373741441804, + "grad_norm": 0.2635570721152839, + "learning_rate": 2.5361041604818244e-05, + "loss": 0.2682, + "step": 1464 + }, + { + "epoch": 2.3600483286347163, + "grad_norm": 0.2407160146785441, + "learning_rate": 2.5339341979483037e-05, + "loss": 0.2746, + "step": 1465 + }, + { + "epoch": 2.3616592831252516, + "grad_norm": 0.29061883733407995, + "learning_rate": 2.531763558429807e-05, + "loss": 0.2968, + "step": 1466 + }, + { + "epoch": 2.3632702376157875, + "grad_norm": 0.24834565149196813, + "learning_rate": 2.5295922446785275e-05, + "loss": 0.2667, + "step": 1467 + }, + { + "epoch": 2.364881192106323, + "grad_norm": 0.2583516235073483, + "learning_rate": 2.527420259447514e-05, + "loss": 0.272, + "step": 1468 + }, + { + "epoch": 2.3664921465968587, + "grad_norm": 0.23935397713698203, + "learning_rate": 2.5252476054906668e-05, + "loss": 0.2852, + "step": 1469 + }, + { + "epoch": 2.368103101087394, + "grad_norm": 0.2530197056590334, + "learning_rate": 2.523074285562734e-05, + "loss": 0.2705, + "step": 1470 + }, + { + "epoch": 2.36971405557793, + "grad_norm": 0.25163841004710474, + "learning_rate": 2.5209003024193067e-05, + "loss": 0.2905, + "step": 1471 + }, + { + "epoch": 2.3713250100684657, + "grad_norm": 0.23237404626614086, + "learning_rate": 2.518725658816819e-05, + "loss": 0.2526, + "step": 1472 + }, + { + "epoch": 2.372935964559001, + "grad_norm": 0.26024820542438537, + "learning_rate": 2.5165503575125413e-05, + "loss": 0.2846, + "step": 1473 + }, + { + "epoch": 2.374546919049537, + "grad_norm": 0.2585972661841443, + "learning_rate": 2.514374401264578e-05, + "loss": 0.2749, + "step": 1474 + }, + { + "epoch": 2.3761578735400724, + "grad_norm": 0.2845138208353183, + "learning_rate": 2.5121977928318638e-05, + "loss": 0.2748, + "step": 1475 + }, + { + "epoch": 2.377768828030608, + "grad_norm": 0.2495500233521992, + "learning_rate": 2.5100205349741602e-05, + "loss": 0.2787, + "step": 1476 + }, + { + "epoch": 2.379379782521144, + "grad_norm": 0.25151165206564124, + "learning_rate": 2.507842630452054e-05, + "loss": 0.3044, + "step": 1477 + }, + { + "epoch": 2.3809907370116794, + "grad_norm": 0.26429785064547257, + "learning_rate": 2.5056640820269484e-05, + "loss": 0.2835, + "step": 1478 + }, + { + "epoch": 2.3826016915022152, + "grad_norm": 0.2461207371041576, + "learning_rate": 2.503484892461066e-05, + "loss": 0.2774, + "step": 1479 + }, + { + "epoch": 2.3842126459927506, + "grad_norm": 0.27735647570825706, + "learning_rate": 2.5013050645174414e-05, + "loss": 0.2757, + "step": 1480 + }, + { + "epoch": 2.3858236004832865, + "grad_norm": 0.2755004610517415, + "learning_rate": 2.499124600959918e-05, + "loss": 0.2728, + "step": 1481 + }, + { + "epoch": 2.387434554973822, + "grad_norm": 0.2752976426963208, + "learning_rate": 2.4969435045531457e-05, + "loss": 0.2604, + "step": 1482 + }, + { + "epoch": 2.3890455094643577, + "grad_norm": 0.2716284118402911, + "learning_rate": 2.494761778062577e-05, + "loss": 0.282, + "step": 1483 + }, + { + "epoch": 2.390656463954893, + "grad_norm": 0.25175456114152706, + "learning_rate": 2.4925794242544626e-05, + "loss": 0.2972, + "step": 1484 + }, + { + "epoch": 2.392267418445429, + "grad_norm": 0.2690581205659193, + "learning_rate": 2.490396445895849e-05, + "loss": 0.2779, + "step": 1485 + }, + { + "epoch": 2.3938783729359647, + "grad_norm": 0.25560689396934266, + "learning_rate": 2.4882128457545748e-05, + "loss": 0.2728, + "step": 1486 + }, + { + "epoch": 2.3954893274265, + "grad_norm": 0.25135288220419316, + "learning_rate": 2.4860286265992667e-05, + "loss": 0.2888, + "step": 1487 + }, + { + "epoch": 2.397100281917036, + "grad_norm": 0.2492488977293215, + "learning_rate": 2.4838437911993355e-05, + "loss": 0.2714, + "step": 1488 + }, + { + "epoch": 2.3987112364075713, + "grad_norm": 0.2374341024869927, + "learning_rate": 2.4816583423249756e-05, + "loss": 0.2416, + "step": 1489 + }, + { + "epoch": 2.400322190898107, + "grad_norm": 0.25762720542042816, + "learning_rate": 2.479472282747157e-05, + "loss": 0.3118, + "step": 1490 + }, + { + "epoch": 2.401933145388643, + "grad_norm": 0.24506926236957213, + "learning_rate": 2.4772856152376244e-05, + "loss": 0.2944, + "step": 1491 + }, + { + "epoch": 2.4035440998791784, + "grad_norm": 0.2640437205274289, + "learning_rate": 2.4750983425688945e-05, + "loss": 0.2895, + "step": 1492 + }, + { + "epoch": 2.405155054369714, + "grad_norm": 0.23704849670354022, + "learning_rate": 2.4729104675142496e-05, + "loss": 0.2877, + "step": 1493 + }, + { + "epoch": 2.4067660088602496, + "grad_norm": 0.24521000147844632, + "learning_rate": 2.4707219928477372e-05, + "loss": 0.2936, + "step": 1494 + }, + { + "epoch": 2.4083769633507854, + "grad_norm": 0.23091038574228756, + "learning_rate": 2.4685329213441645e-05, + "loss": 0.2655, + "step": 1495 + }, + { + "epoch": 2.409987917841321, + "grad_norm": 0.2466614443346807, + "learning_rate": 2.4663432557790955e-05, + "loss": 0.2762, + "step": 1496 + }, + { + "epoch": 2.4115988723318567, + "grad_norm": 0.24469535305837437, + "learning_rate": 2.464152998928848e-05, + "loss": 0.2855, + "step": 1497 + }, + { + "epoch": 2.413209826822392, + "grad_norm": 0.2505015124713008, + "learning_rate": 2.461962153570487e-05, + "loss": 0.3012, + "step": 1498 + }, + { + "epoch": 2.414820781312928, + "grad_norm": 0.23025616103553312, + "learning_rate": 2.459770722481827e-05, + "loss": 0.2608, + "step": 1499 + }, + { + "epoch": 2.4164317358034637, + "grad_norm": 0.23710616228759754, + "learning_rate": 2.4575787084414244e-05, + "loss": 0.2949, + "step": 1500 + }, + { + "epoch": 2.418042690293999, + "grad_norm": 0.26950446786419663, + "learning_rate": 2.4553861142285718e-05, + "loss": 0.2945, + "step": 1501 + }, + { + "epoch": 2.419653644784535, + "grad_norm": 0.22872827384203387, + "learning_rate": 2.4531929426233017e-05, + "loss": 0.2743, + "step": 1502 + }, + { + "epoch": 2.4212645992750703, + "grad_norm": 0.24607877318799967, + "learning_rate": 2.4509991964063762e-05, + "loss": 0.2788, + "step": 1503 + }, + { + "epoch": 2.422875553765606, + "grad_norm": 0.2392918287186858, + "learning_rate": 2.4488048783592864e-05, + "loss": 0.2588, + "step": 1504 + }, + { + "epoch": 2.424486508256142, + "grad_norm": 0.2546711994645602, + "learning_rate": 2.446609991264248e-05, + "loss": 0.2949, + "step": 1505 + }, + { + "epoch": 2.4260974627466774, + "grad_norm": 0.24785163146676142, + "learning_rate": 2.4444145379041987e-05, + "loss": 0.2906, + "step": 1506 + }, + { + "epoch": 2.427708417237213, + "grad_norm": 0.23900765965369322, + "learning_rate": 2.4422185210627943e-05, + "loss": 0.2707, + "step": 1507 + }, + { + "epoch": 2.4293193717277486, + "grad_norm": 0.2849435663798419, + "learning_rate": 2.4400219435244047e-05, + "loss": 0.3101, + "step": 1508 + }, + { + "epoch": 2.4309303262182844, + "grad_norm": 0.2335728091178472, + "learning_rate": 2.4378248080741123e-05, + "loss": 0.2767, + "step": 1509 + }, + { + "epoch": 2.43254128070882, + "grad_norm": 0.24500309892560315, + "learning_rate": 2.435627117497703e-05, + "loss": 0.2848, + "step": 1510 + }, + { + "epoch": 2.4341522351993556, + "grad_norm": 0.26504756906620003, + "learning_rate": 2.4334288745816714e-05, + "loss": 0.3254, + "step": 1511 + }, + { + "epoch": 2.435763189689891, + "grad_norm": 0.22953986456533704, + "learning_rate": 2.4312300821132087e-05, + "loss": 0.2533, + "step": 1512 + }, + { + "epoch": 2.437374144180427, + "grad_norm": 0.2685582896959977, + "learning_rate": 2.4290307428802047e-05, + "loss": 0.2936, + "step": 1513 + }, + { + "epoch": 2.4389850986709627, + "grad_norm": 0.257894081779025, + "learning_rate": 2.426830859671242e-05, + "loss": 0.2694, + "step": 1514 + }, + { + "epoch": 2.440596053161498, + "grad_norm": 0.27245685735959463, + "learning_rate": 2.4246304352755924e-05, + "loss": 0.2743, + "step": 1515 + }, + { + "epoch": 2.442207007652034, + "grad_norm": 0.2744009692046602, + "learning_rate": 2.4224294724832152e-05, + "loss": 0.2851, + "step": 1516 + }, + { + "epoch": 2.4438179621425693, + "grad_norm": 0.2923107453438156, + "learning_rate": 2.420227974084751e-05, + "loss": 0.2836, + "step": 1517 + }, + { + "epoch": 2.445428916633105, + "grad_norm": 0.259635977803087, + "learning_rate": 2.4180259428715203e-05, + "loss": 0.2757, + "step": 1518 + }, + { + "epoch": 2.447039871123641, + "grad_norm": 0.2792257618006009, + "learning_rate": 2.4158233816355185e-05, + "loss": 0.2601, + "step": 1519 + }, + { + "epoch": 2.4486508256141764, + "grad_norm": 0.2922956877345361, + "learning_rate": 2.413620293169415e-05, + "loss": 0.2859, + "step": 1520 + }, + { + "epoch": 2.450261780104712, + "grad_norm": 0.27963346219976065, + "learning_rate": 2.4114166802665437e-05, + "loss": 0.2903, + "step": 1521 + }, + { + "epoch": 2.4518727345952476, + "grad_norm": 0.26021125040145265, + "learning_rate": 2.409212545720908e-05, + "loss": 0.2755, + "step": 1522 + }, + { + "epoch": 2.4534836890857834, + "grad_norm": 0.32453764777695815, + "learning_rate": 2.4070078923271688e-05, + "loss": 0.3209, + "step": 1523 + }, + { + "epoch": 2.455094643576319, + "grad_norm": 0.25241900447923254, + "learning_rate": 2.404802722880649e-05, + "loss": 0.2571, + "step": 1524 + }, + { + "epoch": 2.4567055980668546, + "grad_norm": 0.28154790258027407, + "learning_rate": 2.4025970401773204e-05, + "loss": 0.2609, + "step": 1525 + }, + { + "epoch": 2.45831655255739, + "grad_norm": 0.2591231087696456, + "learning_rate": 2.4003908470138106e-05, + "loss": 0.2897, + "step": 1526 + }, + { + "epoch": 2.459927507047926, + "grad_norm": 0.285177493044488, + "learning_rate": 2.3981841461873927e-05, + "loss": 0.2781, + "step": 1527 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.28975036947156085, + "learning_rate": 2.3959769404959817e-05, + "loss": 0.28, + "step": 1528 + }, + { + "epoch": 2.463149416028997, + "grad_norm": 0.26691677035364914, + "learning_rate": 2.3937692327381356e-05, + "loss": 0.2775, + "step": 1529 + }, + { + "epoch": 2.464760370519533, + "grad_norm": 0.2681414848359182, + "learning_rate": 2.3915610257130464e-05, + "loss": 0.2622, + "step": 1530 + }, + { + "epoch": 2.4663713250100683, + "grad_norm": 0.24939280874639216, + "learning_rate": 2.3893523222205416e-05, + "loss": 0.2722, + "step": 1531 + }, + { + "epoch": 2.467982279500604, + "grad_norm": 0.2669734237482074, + "learning_rate": 2.3871431250610765e-05, + "loss": 0.2888, + "step": 1532 + }, + { + "epoch": 2.46959323399114, + "grad_norm": 0.24728738996882668, + "learning_rate": 2.3849334370357325e-05, + "loss": 0.2927, + "step": 1533 + }, + { + "epoch": 2.4712041884816753, + "grad_norm": 0.22346148184383519, + "learning_rate": 2.382723260946213e-05, + "loss": 0.2542, + "step": 1534 + }, + { + "epoch": 2.472815142972211, + "grad_norm": 0.2713126696973073, + "learning_rate": 2.3805125995948422e-05, + "loss": 0.2982, + "step": 1535 + }, + { + "epoch": 2.4744260974627466, + "grad_norm": 0.24113638235375398, + "learning_rate": 2.3783014557845573e-05, + "loss": 0.2818, + "step": 1536 + }, + { + "epoch": 2.4760370519532824, + "grad_norm": 0.2340220966184417, + "learning_rate": 2.376089832318909e-05, + "loss": 0.2654, + "step": 1537 + }, + { + "epoch": 2.4776480064438178, + "grad_norm": 0.2515922263908132, + "learning_rate": 2.3738777320020544e-05, + "loss": 0.2906, + "step": 1538 + }, + { + "epoch": 2.4792589609343536, + "grad_norm": 0.23936876164172788, + "learning_rate": 2.3716651576387562e-05, + "loss": 0.2705, + "step": 1539 + }, + { + "epoch": 2.4808699154248894, + "grad_norm": 0.23480838263802561, + "learning_rate": 2.369452112034379e-05, + "loss": 0.2845, + "step": 1540 + }, + { + "epoch": 2.482480869915425, + "grad_norm": 0.27839276582383377, + "learning_rate": 2.3672385979948825e-05, + "loss": 0.303, + "step": 1541 + }, + { + "epoch": 2.4840918244059607, + "grad_norm": 0.23665728826604437, + "learning_rate": 2.3650246183268238e-05, + "loss": 0.2787, + "step": 1542 + }, + { + "epoch": 2.485702778896496, + "grad_norm": 0.25851421490627297, + "learning_rate": 2.3628101758373464e-05, + "loss": 0.2929, + "step": 1543 + }, + { + "epoch": 2.487313733387032, + "grad_norm": 0.23517510701071315, + "learning_rate": 2.360595273334184e-05, + "loss": 0.2683, + "step": 1544 + }, + { + "epoch": 2.4889246878775673, + "grad_norm": 0.24605001495252662, + "learning_rate": 2.3583799136256505e-05, + "loss": 0.2844, + "step": 1545 + }, + { + "epoch": 2.490535642368103, + "grad_norm": 0.25239596472454423, + "learning_rate": 2.356164099520643e-05, + "loss": 0.2629, + "step": 1546 + }, + { + "epoch": 2.492146596858639, + "grad_norm": 0.2693185313862507, + "learning_rate": 2.3539478338286325e-05, + "loss": 0.2827, + "step": 1547 + }, + { + "epoch": 2.4937575513491743, + "grad_norm": 0.3241129729749029, + "learning_rate": 2.351731119359662e-05, + "loss": 0.2942, + "step": 1548 + }, + { + "epoch": 2.49536850583971, + "grad_norm": 0.2568486623179716, + "learning_rate": 2.3495139589243455e-05, + "loss": 0.2826, + "step": 1549 + }, + { + "epoch": 2.4969794603302455, + "grad_norm": 0.30686580882418024, + "learning_rate": 2.3472963553338614e-05, + "loss": 0.2742, + "step": 1550 + }, + { + "epoch": 2.4985904148207814, + "grad_norm": 0.2576518847992097, + "learning_rate": 2.3450783113999487e-05, + "loss": 0.28, + "step": 1551 + }, + { + "epoch": 2.500201369311317, + "grad_norm": 0.3350355156604317, + "learning_rate": 2.3428598299349076e-05, + "loss": 0.2764, + "step": 1552 + }, + { + "epoch": 2.5018123238018526, + "grad_norm": 0.2557894175372604, + "learning_rate": 2.3406409137515912e-05, + "loss": 0.2601, + "step": 1553 + }, + { + "epoch": 2.503423278292388, + "grad_norm": 0.26635573164209964, + "learning_rate": 2.338421565663403e-05, + "loss": 0.287, + "step": 1554 + }, + { + "epoch": 2.505034232782924, + "grad_norm": 0.28770740938014006, + "learning_rate": 2.3362017884842967e-05, + "loss": 0.2796, + "step": 1555 + }, + { + "epoch": 2.5066451872734596, + "grad_norm": 0.25946436577333, + "learning_rate": 2.3339815850287676e-05, + "loss": 0.2753, + "step": 1556 + }, + { + "epoch": 2.508256141763995, + "grad_norm": 0.2588519714140495, + "learning_rate": 2.3317609581118527e-05, + "loss": 0.2652, + "step": 1557 + }, + { + "epoch": 2.509867096254531, + "grad_norm": 0.2690897104911625, + "learning_rate": 2.3295399105491256e-05, + "loss": 0.2791, + "step": 1558 + }, + { + "epoch": 2.5114780507450662, + "grad_norm": 0.2520218508663738, + "learning_rate": 2.3273184451566934e-05, + "loss": 0.2593, + "step": 1559 + }, + { + "epoch": 2.513089005235602, + "grad_norm": 0.2621023752863465, + "learning_rate": 2.325096564751193e-05, + "loss": 0.2736, + "step": 1560 + }, + { + "epoch": 2.514699959726138, + "grad_norm": 0.22160638366693367, + "learning_rate": 2.322874272149787e-05, + "loss": 0.2715, + "step": 1561 + }, + { + "epoch": 2.5163109142166733, + "grad_norm": 0.28383611341277853, + "learning_rate": 2.3206515701701612e-05, + "loss": 0.3236, + "step": 1562 + }, + { + "epoch": 2.517921868707209, + "grad_norm": 0.23283625465888177, + "learning_rate": 2.3184284616305205e-05, + "loss": 0.2733, + "step": 1563 + }, + { + "epoch": 2.5195328231977445, + "grad_norm": 0.24394026083888837, + "learning_rate": 2.316204949349585e-05, + "loss": 0.2741, + "step": 1564 + }, + { + "epoch": 2.5211437776882804, + "grad_norm": 0.230477810923903, + "learning_rate": 2.3139810361465854e-05, + "loss": 0.2541, + "step": 1565 + }, + { + "epoch": 2.522754732178816, + "grad_norm": 0.2515917527239852, + "learning_rate": 2.311756724841265e-05, + "loss": 0.2867, + "step": 1566 + }, + { + "epoch": 2.5243656866693516, + "grad_norm": 0.24050394233763317, + "learning_rate": 2.3095320182538657e-05, + "loss": 0.2944, + "step": 1567 + }, + { + "epoch": 2.525976641159887, + "grad_norm": 0.24985185288588335, + "learning_rate": 2.3073069192051364e-05, + "loss": 0.2617, + "step": 1568 + }, + { + "epoch": 2.527587595650423, + "grad_norm": 0.24018241301765708, + "learning_rate": 2.305081430516319e-05, + "loss": 0.2595, + "step": 1569 + }, + { + "epoch": 2.5291985501409586, + "grad_norm": 0.25522923876634196, + "learning_rate": 2.3028555550091536e-05, + "loss": 0.2948, + "step": 1570 + }, + { + "epoch": 2.530809504631494, + "grad_norm": 0.2517439540443441, + "learning_rate": 2.300629295505867e-05, + "loss": 0.2434, + "step": 1571 + }, + { + "epoch": 2.53242045912203, + "grad_norm": 0.23925236470373543, + "learning_rate": 2.2984026548291752e-05, + "loss": 0.2815, + "step": 1572 + }, + { + "epoch": 2.5340314136125652, + "grad_norm": 0.2870555456129622, + "learning_rate": 2.2961756358022765e-05, + "loss": 0.313, + "step": 1573 + }, + { + "epoch": 2.535642368103101, + "grad_norm": 0.24767206562519722, + "learning_rate": 2.2939482412488498e-05, + "loss": 0.2575, + "step": 1574 + }, + { + "epoch": 2.537253322593637, + "grad_norm": 0.2722004878155017, + "learning_rate": 2.291720473993049e-05, + "loss": 0.2865, + "step": 1575 + }, + { + "epoch": 2.5388642770841723, + "grad_norm": 0.27997751844902186, + "learning_rate": 2.289492336859501e-05, + "loss": 0.2742, + "step": 1576 + }, + { + "epoch": 2.540475231574708, + "grad_norm": 0.3147302217916788, + "learning_rate": 2.2872638326733018e-05, + "loss": 0.2786, + "step": 1577 + }, + { + "epoch": 2.5420861860652435, + "grad_norm": 0.24157360086493543, + "learning_rate": 2.2850349642600137e-05, + "loss": 0.2671, + "step": 1578 + }, + { + "epoch": 2.5436971405557793, + "grad_norm": 0.2931844117704698, + "learning_rate": 2.282805734445659e-05, + "loss": 0.2828, + "step": 1579 + }, + { + "epoch": 2.545308095046315, + "grad_norm": 0.27831191484940104, + "learning_rate": 2.2805761460567197e-05, + "loss": 0.2876, + "step": 1580 + }, + { + "epoch": 2.5469190495368506, + "grad_norm": 0.27135097897303195, + "learning_rate": 2.278346201920131e-05, + "loss": 0.3, + "step": 1581 + }, + { + "epoch": 2.5485300040273864, + "grad_norm": 0.24836372557953207, + "learning_rate": 2.2761159048632813e-05, + "loss": 0.2653, + "step": 1582 + }, + { + "epoch": 2.5501409585179218, + "grad_norm": 0.28745989489603807, + "learning_rate": 2.273885257714004e-05, + "loss": 0.263, + "step": 1583 + }, + { + "epoch": 2.5517519130084576, + "grad_norm": 0.2616915357039271, + "learning_rate": 2.2716542633005777e-05, + "loss": 0.2864, + "step": 1584 + }, + { + "epoch": 2.553362867498993, + "grad_norm": 0.2804002104585373, + "learning_rate": 2.2694229244517226e-05, + "loss": 0.271, + "step": 1585 + }, + { + "epoch": 2.554973821989529, + "grad_norm": 0.30579400972491694, + "learning_rate": 2.2671912439965923e-05, + "loss": 0.2869, + "step": 1586 + }, + { + "epoch": 2.556584776480064, + "grad_norm": 0.2577374199945713, + "learning_rate": 2.264959224764777e-05, + "loss": 0.2946, + "step": 1587 + }, + { + "epoch": 2.5581957309706, + "grad_norm": 0.2970356900027897, + "learning_rate": 2.262726869586293e-05, + "loss": 0.282, + "step": 1588 + }, + { + "epoch": 2.559806685461136, + "grad_norm": 0.2555540175655143, + "learning_rate": 2.260494181291587e-05, + "loss": 0.2948, + "step": 1589 + }, + { + "epoch": 2.5614176399516713, + "grad_norm": 0.2410510585798002, + "learning_rate": 2.258261162711523e-05, + "loss": 0.2694, + "step": 1590 + }, + { + "epoch": 2.563028594442207, + "grad_norm": 0.29264739126391853, + "learning_rate": 2.256027816677388e-05, + "loss": 0.2778, + "step": 1591 + }, + { + "epoch": 2.5646395489327425, + "grad_norm": 0.2495793628483905, + "learning_rate": 2.2537941460208818e-05, + "loss": 0.2792, + "step": 1592 + }, + { + "epoch": 2.5662505034232783, + "grad_norm": 0.25120155950587036, + "learning_rate": 2.2515601535741168e-05, + "loss": 0.2727, + "step": 1593 + }, + { + "epoch": 2.567861457913814, + "grad_norm": 0.2895173143422783, + "learning_rate": 2.2493258421696124e-05, + "loss": 0.268, + "step": 1594 + }, + { + "epoch": 2.5694724124043495, + "grad_norm": 0.235477824563421, + "learning_rate": 2.2470912146402935e-05, + "loss": 0.2559, + "step": 1595 + }, + { + "epoch": 2.5710833668948854, + "grad_norm": 0.23324303749320113, + "learning_rate": 2.244856273819485e-05, + "loss": 0.276, + "step": 1596 + }, + { + "epoch": 2.5726943213854208, + "grad_norm": 0.2874685117002799, + "learning_rate": 2.24262102254091e-05, + "loss": 0.3234, + "step": 1597 + }, + { + "epoch": 2.5743052758759566, + "grad_norm": 0.22530125349976493, + "learning_rate": 2.2403854636386843e-05, + "loss": 0.262, + "step": 1598 + }, + { + "epoch": 2.5759162303664924, + "grad_norm": 0.25364847023690723, + "learning_rate": 2.238149599947314e-05, + "loss": 0.2724, + "step": 1599 + }, + { + "epoch": 2.577527184857028, + "grad_norm": 0.25154728625547484, + "learning_rate": 2.2359134343016926e-05, + "loss": 0.2899, + "step": 1600 + }, + { + "epoch": 2.579138139347563, + "grad_norm": 0.2631559112148431, + "learning_rate": 2.233676969537094e-05, + "loss": 0.2735, + "step": 1601 + }, + { + "epoch": 2.580749093838099, + "grad_norm": 0.26024689995575573, + "learning_rate": 2.2314402084891746e-05, + "loss": 0.2723, + "step": 1602 + }, + { + "epoch": 2.582360048328635, + "grad_norm": 0.2432406812780365, + "learning_rate": 2.2292031539939635e-05, + "loss": 0.2651, + "step": 1603 + }, + { + "epoch": 2.5839710028191702, + "grad_norm": 0.26945019384770447, + "learning_rate": 2.2269658088878638e-05, + "loss": 0.2906, + "step": 1604 + }, + { + "epoch": 2.585581957309706, + "grad_norm": 0.25788545777611716, + "learning_rate": 2.2247281760076468e-05, + "loss": 0.2883, + "step": 1605 + }, + { + "epoch": 2.5871929118002415, + "grad_norm": 0.2732432882264997, + "learning_rate": 2.2224902581904476e-05, + "loss": 0.268, + "step": 1606 + }, + { + "epoch": 2.5888038662907773, + "grad_norm": 0.25613794655168626, + "learning_rate": 2.2202520582737635e-05, + "loss": 0.2596, + "step": 1607 + }, + { + "epoch": 2.590414820781313, + "grad_norm": 0.3055836045703691, + "learning_rate": 2.2180135790954494e-05, + "loss": 0.2931, + "step": 1608 + }, + { + "epoch": 2.5920257752718485, + "grad_norm": 0.23324193325162323, + "learning_rate": 2.215774823493715e-05, + "loss": 0.2667, + "step": 1609 + }, + { + "epoch": 2.5936367297623844, + "grad_norm": 0.24811631478589213, + "learning_rate": 2.213535794307118e-05, + "loss": 0.2757, + "step": 1610 + }, + { + "epoch": 2.5952476842529197, + "grad_norm": 0.2758092631908418, + "learning_rate": 2.211296494374566e-05, + "loss": 0.2924, + "step": 1611 + }, + { + "epoch": 2.5968586387434556, + "grad_norm": 0.2643433716278074, + "learning_rate": 2.209056926535307e-05, + "loss": 0.3036, + "step": 1612 + }, + { + "epoch": 2.5984695932339914, + "grad_norm": 0.2685767270547775, + "learning_rate": 2.2068170936289323e-05, + "loss": 0.2643, + "step": 1613 + }, + { + "epoch": 2.600080547724527, + "grad_norm": 0.24441427558762532, + "learning_rate": 2.2045769984953652e-05, + "loss": 0.2685, + "step": 1614 + }, + { + "epoch": 2.601691502215062, + "grad_norm": 0.29930220047366574, + "learning_rate": 2.2023366439748647e-05, + "loss": 0.2746, + "step": 1615 + }, + { + "epoch": 2.603302456705598, + "grad_norm": 0.22953726566232438, + "learning_rate": 2.2000960329080166e-05, + "loss": 0.2864, + "step": 1616 + }, + { + "epoch": 2.604913411196134, + "grad_norm": 0.25919477109412487, + "learning_rate": 2.197855168135734e-05, + "loss": 0.2724, + "step": 1617 + }, + { + "epoch": 2.6065243656866692, + "grad_norm": 0.24583444723040715, + "learning_rate": 2.1956140524992495e-05, + "loss": 0.2721, + "step": 1618 + }, + { + "epoch": 2.608135320177205, + "grad_norm": 0.2488426900524242, + "learning_rate": 2.1933726888401146e-05, + "loss": 0.3036, + "step": 1619 + }, + { + "epoch": 2.6097462746677405, + "grad_norm": 0.2709750897529024, + "learning_rate": 2.1911310800001967e-05, + "loss": 0.2761, + "step": 1620 + }, + { + "epoch": 2.6113572291582763, + "grad_norm": 0.27314838210070563, + "learning_rate": 2.188889228821671e-05, + "loss": 0.2854, + "step": 1621 + }, + { + "epoch": 2.612968183648812, + "grad_norm": 0.24503356817611527, + "learning_rate": 2.186647138147024e-05, + "loss": 0.2713, + "step": 1622 + }, + { + "epoch": 2.6145791381393475, + "grad_norm": 0.23765691589614468, + "learning_rate": 2.184404810819041e-05, + "loss": 0.2682, + "step": 1623 + }, + { + "epoch": 2.6161900926298833, + "grad_norm": 0.25031315550528577, + "learning_rate": 2.182162249680813e-05, + "loss": 0.2755, + "step": 1624 + }, + { + "epoch": 2.6178010471204187, + "grad_norm": 0.24667393832777065, + "learning_rate": 2.179919457575722e-05, + "loss": 0.2879, + "step": 1625 + }, + { + "epoch": 2.6194120016109546, + "grad_norm": 0.22980591927089686, + "learning_rate": 2.1776764373474465e-05, + "loss": 0.267, + "step": 1626 + }, + { + "epoch": 2.6210229561014904, + "grad_norm": 0.24144423356769742, + "learning_rate": 2.1754331918399526e-05, + "loss": 0.2771, + "step": 1627 + }, + { + "epoch": 2.6226339105920258, + "grad_norm": 0.2665613251605162, + "learning_rate": 2.1731897238974926e-05, + "loss": 0.3079, + "step": 1628 + }, + { + "epoch": 2.624244865082561, + "grad_norm": 0.22659947309530976, + "learning_rate": 2.170946036364601e-05, + "loss": 0.2617, + "step": 1629 + }, + { + "epoch": 2.625855819573097, + "grad_norm": 0.2773783867044038, + "learning_rate": 2.1687021320860893e-05, + "loss": 0.2679, + "step": 1630 + }, + { + "epoch": 2.627466774063633, + "grad_norm": 0.25665405302808597, + "learning_rate": 2.166458013907047e-05, + "loss": 0.295, + "step": 1631 + }, + { + "epoch": 2.629077728554168, + "grad_norm": 0.2625122154554604, + "learning_rate": 2.1642136846728313e-05, + "loss": 0.2548, + "step": 1632 + }, + { + "epoch": 2.630688683044704, + "grad_norm": 0.24524445937268322, + "learning_rate": 2.1619691472290692e-05, + "loss": 0.278, + "step": 1633 + }, + { + "epoch": 2.6322996375352394, + "grad_norm": 0.24739471796623, + "learning_rate": 2.159724404421649e-05, + "loss": 0.2447, + "step": 1634 + }, + { + "epoch": 2.6339105920257753, + "grad_norm": 0.24322288209895856, + "learning_rate": 2.157479459096724e-05, + "loss": 0.2823, + "step": 1635 + }, + { + "epoch": 2.635521546516311, + "grad_norm": 0.22906031471382138, + "learning_rate": 2.1552343141007e-05, + "loss": 0.2674, + "step": 1636 + }, + { + "epoch": 2.6371325010068465, + "grad_norm": 0.26408427275553314, + "learning_rate": 2.1529889722802384e-05, + "loss": 0.3259, + "step": 1637 + }, + { + "epoch": 2.6387434554973823, + "grad_norm": 0.20577040124306828, + "learning_rate": 2.1507434364822487e-05, + "loss": 0.2442, + "step": 1638 + }, + { + "epoch": 2.6403544099879177, + "grad_norm": 0.25336351837263493, + "learning_rate": 2.148497709553887e-05, + "loss": 0.2966, + "step": 1639 + }, + { + "epoch": 2.6419653644784535, + "grad_norm": 0.2179088401032362, + "learning_rate": 2.1462517943425523e-05, + "loss": 0.2399, + "step": 1640 + }, + { + "epoch": 2.6435763189689894, + "grad_norm": 0.2609474199496771, + "learning_rate": 2.1440056936958815e-05, + "loss": 0.2897, + "step": 1641 + }, + { + "epoch": 2.6451872734595248, + "grad_norm": 0.250702552124162, + "learning_rate": 2.141759410461746e-05, + "loss": 0.2797, + "step": 1642 + }, + { + "epoch": 2.64679822795006, + "grad_norm": 0.25202435733045636, + "learning_rate": 2.1395129474882507e-05, + "loss": 0.258, + "step": 1643 + }, + { + "epoch": 2.648409182440596, + "grad_norm": 0.23559656524296946, + "learning_rate": 2.1372663076237273e-05, + "loss": 0.3022, + "step": 1644 + }, + { + "epoch": 2.650020136931132, + "grad_norm": 0.2600623544743553, + "learning_rate": 2.1350194937167307e-05, + "loss": 0.284, + "step": 1645 + }, + { + "epoch": 2.651631091421667, + "grad_norm": 0.23042665562467768, + "learning_rate": 2.1327725086160385e-05, + "loss": 0.2659, + "step": 1646 + }, + { + "epoch": 2.653242045912203, + "grad_norm": 0.2319917641049967, + "learning_rate": 2.1305253551706442e-05, + "loss": 0.2755, + "step": 1647 + }, + { + "epoch": 2.6548530004027384, + "grad_norm": 0.24708094663257485, + "learning_rate": 2.1282780362297544e-05, + "loss": 0.2895, + "step": 1648 + }, + { + "epoch": 2.6564639548932742, + "grad_norm": 0.2463749021745463, + "learning_rate": 2.1260305546427867e-05, + "loss": 0.288, + "step": 1649 + }, + { + "epoch": 2.65807490938381, + "grad_norm": 0.2540138947859815, + "learning_rate": 2.123782913259364e-05, + "loss": 0.2826, + "step": 1650 + }, + { + "epoch": 2.6596858638743455, + "grad_norm": 0.23487737142224063, + "learning_rate": 2.121535114929312e-05, + "loss": 0.272, + "step": 1651 + }, + { + "epoch": 2.6612968183648813, + "grad_norm": 0.2282373426932882, + "learning_rate": 2.1192871625026553e-05, + "loss": 0.2343, + "step": 1652 + }, + { + "epoch": 2.6629077728554167, + "grad_norm": 0.2591476141147764, + "learning_rate": 2.1170390588296148e-05, + "loss": 0.3235, + "step": 1653 + }, + { + "epoch": 2.6645187273459525, + "grad_norm": 0.2636302573560719, + "learning_rate": 2.1147908067606012e-05, + "loss": 0.2785, + "step": 1654 + }, + { + "epoch": 2.6661296818364884, + "grad_norm": 0.23618006055822752, + "learning_rate": 2.112542409146217e-05, + "loss": 0.2655, + "step": 1655 + }, + { + "epoch": 2.6677406363270237, + "grad_norm": 0.25207523326650344, + "learning_rate": 2.1102938688372436e-05, + "loss": 0.2836, + "step": 1656 + }, + { + "epoch": 2.669351590817559, + "grad_norm": 0.25173559248973704, + "learning_rate": 2.1080451886846486e-05, + "loss": 0.261, + "step": 1657 + }, + { + "epoch": 2.670962545308095, + "grad_norm": 0.2431590866242753, + "learning_rate": 2.1057963715395746e-05, + "loss": 0.3086, + "step": 1658 + }, + { + "epoch": 2.672573499798631, + "grad_norm": 0.25202253459066404, + "learning_rate": 2.1035474202533385e-05, + "loss": 0.2567, + "step": 1659 + }, + { + "epoch": 2.674184454289166, + "grad_norm": 0.2555417920210164, + "learning_rate": 2.1012983376774255e-05, + "loss": 0.2739, + "step": 1660 + }, + { + "epoch": 2.675795408779702, + "grad_norm": 0.2420625615147683, + "learning_rate": 2.0990491266634903e-05, + "loss": 0.2653, + "step": 1661 + }, + { + "epoch": 2.6774063632702374, + "grad_norm": 0.2396489356305853, + "learning_rate": 2.0967997900633482e-05, + "loss": 0.3129, + "step": 1662 + }, + { + "epoch": 2.6790173177607732, + "grad_norm": 0.2131386533167181, + "learning_rate": 2.094550330728974e-05, + "loss": 0.2642, + "step": 1663 + }, + { + "epoch": 2.680628272251309, + "grad_norm": 0.21745561076267786, + "learning_rate": 2.092300751512499e-05, + "loss": 0.2642, + "step": 1664 + }, + { + "epoch": 2.6822392267418445, + "grad_norm": 0.2395086233145289, + "learning_rate": 2.0900510552662057e-05, + "loss": 0.2924, + "step": 1665 + }, + { + "epoch": 2.6838501812323803, + "grad_norm": 0.2353999626748506, + "learning_rate": 2.0878012448425258e-05, + "loss": 0.2591, + "step": 1666 + }, + { + "epoch": 2.6854611357229157, + "grad_norm": 0.24095219112832106, + "learning_rate": 2.085551323094035e-05, + "loss": 0.2962, + "step": 1667 + }, + { + "epoch": 2.6870720902134515, + "grad_norm": 0.2553288916093136, + "learning_rate": 2.08330129287345e-05, + "loss": 0.3125, + "step": 1668 + }, + { + "epoch": 2.6886830447039873, + "grad_norm": 0.24345181609728936, + "learning_rate": 2.0810511570336262e-05, + "loss": 0.2681, + "step": 1669 + }, + { + "epoch": 2.6902939991945227, + "grad_norm": 0.256453338841592, + "learning_rate": 2.0788009184275514e-05, + "loss": 0.2694, + "step": 1670 + }, + { + "epoch": 2.6919049536850586, + "grad_norm": 0.24077026988656322, + "learning_rate": 2.0765505799083452e-05, + "loss": 0.2741, + "step": 1671 + }, + { + "epoch": 2.693515908175594, + "grad_norm": 0.2534029520291385, + "learning_rate": 2.074300144329252e-05, + "loss": 0.3124, + "step": 1672 + }, + { + "epoch": 2.6951268626661298, + "grad_norm": 0.24039030268215603, + "learning_rate": 2.0720496145436423e-05, + "loss": 0.2761, + "step": 1673 + }, + { + "epoch": 2.696737817156665, + "grad_norm": 0.2441294858776203, + "learning_rate": 2.0697989934050025e-05, + "loss": 0.2702, + "step": 1674 + }, + { + "epoch": 2.698348771647201, + "grad_norm": 0.23858821941312688, + "learning_rate": 2.0675482837669367e-05, + "loss": 0.2585, + "step": 1675 + }, + { + "epoch": 2.6999597261377364, + "grad_norm": 0.22912907435783178, + "learning_rate": 2.0652974884831612e-05, + "loss": 0.2735, + "step": 1676 + }, + { + "epoch": 2.701570680628272, + "grad_norm": 0.25427589376548887, + "learning_rate": 2.063046610407501e-05, + "loss": 0.3006, + "step": 1677 + }, + { + "epoch": 2.703181635118808, + "grad_norm": 0.25451643357504333, + "learning_rate": 2.060795652393886e-05, + "loss": 0.2771, + "step": 1678 + }, + { + "epoch": 2.7047925896093434, + "grad_norm": 0.2470762873381859, + "learning_rate": 2.0585446172963457e-05, + "loss": 0.2967, + "step": 1679 + }, + { + "epoch": 2.7064035440998793, + "grad_norm": 0.2405780999475265, + "learning_rate": 2.05629350796901e-05, + "loss": 0.2728, + "step": 1680 + }, + { + "epoch": 2.7080144985904147, + "grad_norm": 0.2680658731906045, + "learning_rate": 2.0540423272661024e-05, + "loss": 0.2846, + "step": 1681 + }, + { + "epoch": 2.7096254530809505, + "grad_norm": 0.22930174904587397, + "learning_rate": 2.0517910780419355e-05, + "loss": 0.279, + "step": 1682 + }, + { + "epoch": 2.7112364075714863, + "grad_norm": 0.2513745451496423, + "learning_rate": 2.0495397631509092e-05, + "loss": 0.2498, + "step": 1683 + }, + { + "epoch": 2.7128473620620217, + "grad_norm": 0.2607357243459458, + "learning_rate": 2.047288385447507e-05, + "loss": 0.29, + "step": 1684 + }, + { + "epoch": 2.7144583165525575, + "grad_norm": 0.2680985429045293, + "learning_rate": 2.0450369477862922e-05, + "loss": 0.2788, + "step": 1685 + }, + { + "epoch": 2.716069271043093, + "grad_norm": 0.28257542550577647, + "learning_rate": 2.042785453021905e-05, + "loss": 0.2705, + "step": 1686 + }, + { + "epoch": 2.7176802255336288, + "grad_norm": 0.24906888756996964, + "learning_rate": 2.0405339040090557e-05, + "loss": 0.2845, + "step": 1687 + }, + { + "epoch": 2.7192911800241646, + "grad_norm": 0.23828748868464236, + "learning_rate": 2.0382823036025243e-05, + "loss": 0.2871, + "step": 1688 + }, + { + "epoch": 2.7209021345147, + "grad_norm": 0.2522899452359346, + "learning_rate": 2.0360306546571582e-05, + "loss": 0.2743, + "step": 1689 + }, + { + "epoch": 2.7225130890052354, + "grad_norm": 0.24592334075728844, + "learning_rate": 2.0337789600278623e-05, + "loss": 0.27, + "step": 1690 + }, + { + "epoch": 2.724124043495771, + "grad_norm": 0.2554007578885009, + "learning_rate": 2.0315272225696034e-05, + "loss": 0.3087, + "step": 1691 + }, + { + "epoch": 2.725734997986307, + "grad_norm": 0.23843192140396555, + "learning_rate": 2.0292754451373992e-05, + "loss": 0.2685, + "step": 1692 + }, + { + "epoch": 2.7273459524768424, + "grad_norm": 0.22520952757862236, + "learning_rate": 2.027023630586321e-05, + "loss": 0.261, + "step": 1693 + }, + { + "epoch": 2.7289569069673782, + "grad_norm": 0.26311834190007133, + "learning_rate": 2.024771781771485e-05, + "loss": 0.2881, + "step": 1694 + }, + { + "epoch": 2.7305678614579136, + "grad_norm": 0.20959220644219456, + "learning_rate": 2.0225199015480518e-05, + "loss": 0.2437, + "step": 1695 + }, + { + "epoch": 2.7321788159484495, + "grad_norm": 0.26113815468998086, + "learning_rate": 2.0202679927712224e-05, + "loss": 0.2894, + "step": 1696 + }, + { + "epoch": 2.7337897704389853, + "grad_norm": 0.2469561663058487, + "learning_rate": 2.018016058296232e-05, + "loss": 0.2702, + "step": 1697 + }, + { + "epoch": 2.7354007249295207, + "grad_norm": 0.2641432503130729, + "learning_rate": 2.0157641009783512e-05, + "loss": 0.297, + "step": 1698 + }, + { + "epoch": 2.7370116794200565, + "grad_norm": 0.23840297914176908, + "learning_rate": 2.0135121236728762e-05, + "loss": 0.2674, + "step": 1699 + }, + { + "epoch": 2.738622633910592, + "grad_norm": 0.2764405943973875, + "learning_rate": 2.0112601292351322e-05, + "loss": 0.2947, + "step": 1700 + }, + { + "epoch": 2.7402335884011277, + "grad_norm": 0.23575772294949454, + "learning_rate": 2.009008120520463e-05, + "loss": 0.2672, + "step": 1701 + }, + { + "epoch": 2.7418445428916636, + "grad_norm": 0.2521615855835443, + "learning_rate": 2.006756100384233e-05, + "loss": 0.2814, + "step": 1702 + }, + { + "epoch": 2.743455497382199, + "grad_norm": 0.2538001655861593, + "learning_rate": 2.0045040716818184e-05, + "loss": 0.2802, + "step": 1703 + }, + { + "epoch": 2.7450664518727343, + "grad_norm": 0.2281414090679116, + "learning_rate": 2.0022520372686092e-05, + "loss": 0.2723, + "step": 1704 + }, + { + "epoch": 2.74667740636327, + "grad_norm": 0.27249255745775103, + "learning_rate": 2e-05, + "loss": 0.2783, + "step": 1705 + }, + { + "epoch": 2.748288360853806, + "grad_norm": 0.21913760224877482, + "learning_rate": 1.9977479627313918e-05, + "loss": 0.2666, + "step": 1706 + }, + { + "epoch": 2.7498993153443414, + "grad_norm": 0.271276451382289, + "learning_rate": 1.995495928318182e-05, + "loss": 0.2592, + "step": 1707 + }, + { + "epoch": 2.7515102698348772, + "grad_norm": 0.25181265659436014, + "learning_rate": 1.9932438996157678e-05, + "loss": 0.2672, + "step": 1708 + }, + { + "epoch": 2.7531212243254126, + "grad_norm": 0.23000287341824674, + "learning_rate": 1.9909918794795378e-05, + "loss": 0.2883, + "step": 1709 + }, + { + "epoch": 2.7547321788159485, + "grad_norm": 0.2509709521664799, + "learning_rate": 1.988739870764869e-05, + "loss": 0.2955, + "step": 1710 + }, + { + "epoch": 2.7563431333064843, + "grad_norm": 0.255309204845929, + "learning_rate": 1.986487876327124e-05, + "loss": 0.2939, + "step": 1711 + }, + { + "epoch": 2.7579540877970197, + "grad_norm": 0.2389408162743876, + "learning_rate": 1.9842358990216498e-05, + "loss": 0.276, + "step": 1712 + }, + { + "epoch": 2.7595650422875555, + "grad_norm": 0.22548655871923712, + "learning_rate": 1.9819839417037688e-05, + "loss": 0.2551, + "step": 1713 + }, + { + "epoch": 2.761175996778091, + "grad_norm": 0.2514072456845176, + "learning_rate": 1.9797320072287786e-05, + "loss": 0.2797, + "step": 1714 + }, + { + "epoch": 2.7627869512686267, + "grad_norm": 0.2389348677377125, + "learning_rate": 1.9774800984519485e-05, + "loss": 0.2538, + "step": 1715 + }, + { + "epoch": 2.7643979057591626, + "grad_norm": 0.24399474291305062, + "learning_rate": 1.9752282182285158e-05, + "loss": 0.2698, + "step": 1716 + }, + { + "epoch": 2.766008860249698, + "grad_norm": 0.24492996641517525, + "learning_rate": 1.9729763694136796e-05, + "loss": 0.2571, + "step": 1717 + }, + { + "epoch": 2.7676198147402333, + "grad_norm": 0.25512566620189414, + "learning_rate": 1.9707245548626008e-05, + "loss": 0.2931, + "step": 1718 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.24336440793633282, + "learning_rate": 1.968472777430397e-05, + "loss": 0.2718, + "step": 1719 + }, + { + "epoch": 2.770841723721305, + "grad_norm": 0.2365556332430933, + "learning_rate": 1.966221039972138e-05, + "loss": 0.2833, + "step": 1720 + }, + { + "epoch": 2.7724526782118404, + "grad_norm": 0.2489637128328502, + "learning_rate": 1.9639693453428428e-05, + "loss": 0.2614, + "step": 1721 + }, + { + "epoch": 2.774063632702376, + "grad_norm": 0.23427312408705342, + "learning_rate": 1.9617176963974757e-05, + "loss": 0.237, + "step": 1722 + }, + { + "epoch": 2.7756745871929116, + "grad_norm": 0.30355928331474424, + "learning_rate": 1.959466095990945e-05, + "loss": 0.3156, + "step": 1723 + }, + { + "epoch": 2.7772855416834474, + "grad_norm": 0.23089885134745378, + "learning_rate": 1.9572145469780957e-05, + "loss": 0.262, + "step": 1724 + }, + { + "epoch": 2.7788964961739833, + "grad_norm": 0.2650910801407474, + "learning_rate": 1.9549630522137084e-05, + "loss": 0.2691, + "step": 1725 + }, + { + "epoch": 2.7805074506645187, + "grad_norm": 0.27538303821293014, + "learning_rate": 1.9527116145524934e-05, + "loss": 0.2905, + "step": 1726 + }, + { + "epoch": 2.7821184051550545, + "grad_norm": 0.2529807875761806, + "learning_rate": 1.9504602368490918e-05, + "loss": 0.2971, + "step": 1727 + }, + { + "epoch": 2.78372935964559, + "grad_norm": 0.2799965953217938, + "learning_rate": 1.9482089219580655e-05, + "loss": 0.2752, + "step": 1728 + }, + { + "epoch": 2.7853403141361257, + "grad_norm": 0.25007926005011655, + "learning_rate": 1.9459576727338986e-05, + "loss": 0.2604, + "step": 1729 + }, + { + "epoch": 2.7869512686266615, + "grad_norm": 0.254990987065823, + "learning_rate": 1.9437064920309895e-05, + "loss": 0.2971, + "step": 1730 + }, + { + "epoch": 2.788562223117197, + "grad_norm": 0.26362113970903406, + "learning_rate": 1.941455382703655e-05, + "loss": 0.2538, + "step": 1731 + }, + { + "epoch": 2.7901731776077323, + "grad_norm": 0.246885971572867, + "learning_rate": 1.939204347606115e-05, + "loss": 0.2721, + "step": 1732 + }, + { + "epoch": 2.791784132098268, + "grad_norm": 0.266283943370594, + "learning_rate": 1.9369533895924992e-05, + "loss": 0.2866, + "step": 1733 + }, + { + "epoch": 2.793395086588804, + "grad_norm": 0.2680098900263936, + "learning_rate": 1.934702511516839e-05, + "loss": 0.2967, + "step": 1734 + }, + { + "epoch": 2.7950060410793394, + "grad_norm": 0.22740978433404274, + "learning_rate": 1.932451716233064e-05, + "loss": 0.2694, + "step": 1735 + }, + { + "epoch": 2.796616995569875, + "grad_norm": 0.2561037233597848, + "learning_rate": 1.930201006594999e-05, + "loss": 0.2679, + "step": 1736 + }, + { + "epoch": 2.7982279500604106, + "grad_norm": 0.23434576263054702, + "learning_rate": 1.9279503854563584e-05, + "loss": 0.2799, + "step": 1737 + }, + { + "epoch": 2.7998389045509464, + "grad_norm": 0.22674515230572093, + "learning_rate": 1.925699855670748e-05, + "loss": 0.2728, + "step": 1738 + }, + { + "epoch": 2.8014498590414822, + "grad_norm": 0.2428990371238118, + "learning_rate": 1.9234494200916554e-05, + "loss": 0.3036, + "step": 1739 + }, + { + "epoch": 2.8030608135320176, + "grad_norm": 0.21954338652190178, + "learning_rate": 1.9211990815724496e-05, + "loss": 0.2474, + "step": 1740 + }, + { + "epoch": 2.8046717680225535, + "grad_norm": 0.25787658937930996, + "learning_rate": 1.918948842966374e-05, + "loss": 0.2964, + "step": 1741 + }, + { + "epoch": 2.806282722513089, + "grad_norm": 0.225083476296253, + "learning_rate": 1.9166987071265506e-05, + "loss": 0.2625, + "step": 1742 + }, + { + "epoch": 2.8078936770036247, + "grad_norm": 0.26059530040772755, + "learning_rate": 1.914448676905966e-05, + "loss": 0.2689, + "step": 1743 + }, + { + "epoch": 2.8095046314941605, + "grad_norm": 0.2149852949576493, + "learning_rate": 1.9121987551574745e-05, + "loss": 0.2373, + "step": 1744 + }, + { + "epoch": 2.811115585984696, + "grad_norm": 0.26351487458727435, + "learning_rate": 1.9099489447337946e-05, + "loss": 0.2786, + "step": 1745 + }, + { + "epoch": 2.8127265404752317, + "grad_norm": 0.26753566423015324, + "learning_rate": 1.9076992484875014e-05, + "loss": 0.3151, + "step": 1746 + }, + { + "epoch": 2.814337494965767, + "grad_norm": 0.23423084235876232, + "learning_rate": 1.905449669271027e-05, + "loss": 0.276, + "step": 1747 + }, + { + "epoch": 2.815948449456303, + "grad_norm": 0.239399463127817, + "learning_rate": 1.9032002099366528e-05, + "loss": 0.2747, + "step": 1748 + }, + { + "epoch": 2.8175594039468383, + "grad_norm": 0.24223640625161968, + "learning_rate": 1.9009508733365103e-05, + "loss": 0.2576, + "step": 1749 + }, + { + "epoch": 2.819170358437374, + "grad_norm": 0.2741888988118937, + "learning_rate": 1.8987016623225748e-05, + "loss": 0.2846, + "step": 1750 + }, + { + "epoch": 2.8207813129279096, + "grad_norm": 0.26900885193390967, + "learning_rate": 1.896452579746663e-05, + "loss": 0.2628, + "step": 1751 + }, + { + "epoch": 2.8223922674184454, + "grad_norm": 0.25791432339739123, + "learning_rate": 1.8942036284604254e-05, + "loss": 0.2684, + "step": 1752 + }, + { + "epoch": 2.8240032219089812, + "grad_norm": 0.24175592345180597, + "learning_rate": 1.8919548113153517e-05, + "loss": 0.299, + "step": 1753 + }, + { + "epoch": 2.8256141763995166, + "grad_norm": 0.24350609167590054, + "learning_rate": 1.889706131162757e-05, + "loss": 0.2491, + "step": 1754 + }, + { + "epoch": 2.8272251308900525, + "grad_norm": 0.24783023887132075, + "learning_rate": 1.887457590853784e-05, + "loss": 0.2805, + "step": 1755 + }, + { + "epoch": 2.828836085380588, + "grad_norm": 0.23044942043966332, + "learning_rate": 1.8852091932393984e-05, + "loss": 0.2919, + "step": 1756 + }, + { + "epoch": 2.8304470398711237, + "grad_norm": 0.23547804631271316, + "learning_rate": 1.8829609411703855e-05, + "loss": 0.2729, + "step": 1757 + }, + { + "epoch": 2.8320579943616595, + "grad_norm": 0.2241584722151506, + "learning_rate": 1.8807128374973454e-05, + "loss": 0.274, + "step": 1758 + }, + { + "epoch": 2.833668948852195, + "grad_norm": 0.24199888275581322, + "learning_rate": 1.8784648850706883e-05, + "loss": 0.2853, + "step": 1759 + }, + { + "epoch": 2.8352799033427307, + "grad_norm": 0.2298411732619803, + "learning_rate": 1.8762170867406366e-05, + "loss": 0.2626, + "step": 1760 + }, + { + "epoch": 2.836890857833266, + "grad_norm": 0.21489914910143904, + "learning_rate": 1.873969445357214e-05, + "loss": 0.2683, + "step": 1761 + }, + { + "epoch": 2.838501812323802, + "grad_norm": 0.2618260680636698, + "learning_rate": 1.871721963770246e-05, + "loss": 0.2862, + "step": 1762 + }, + { + "epoch": 2.8401127668143373, + "grad_norm": 0.2845350731450781, + "learning_rate": 1.869474644829356e-05, + "loss": 0.2909, + "step": 1763 + }, + { + "epoch": 2.841723721304873, + "grad_norm": 0.21927422061321766, + "learning_rate": 1.867227491383962e-05, + "loss": 0.265, + "step": 1764 + }, + { + "epoch": 2.8433346757954086, + "grad_norm": 0.2603099243315801, + "learning_rate": 1.8649805062832697e-05, + "loss": 0.3039, + "step": 1765 + }, + { + "epoch": 2.8449456302859444, + "grad_norm": 0.23315130279336155, + "learning_rate": 1.8627336923762737e-05, + "loss": 0.2669, + "step": 1766 + }, + { + "epoch": 2.84655658477648, + "grad_norm": 0.2386078527910061, + "learning_rate": 1.8604870525117496e-05, + "loss": 0.2713, + "step": 1767 + }, + { + "epoch": 2.8481675392670156, + "grad_norm": 0.25248905060755344, + "learning_rate": 1.8582405895382544e-05, + "loss": 0.2866, + "step": 1768 + }, + { + "epoch": 2.8497784937575514, + "grad_norm": 0.2272473915570634, + "learning_rate": 1.8559943063041195e-05, + "loss": 0.2566, + "step": 1769 + }, + { + "epoch": 2.851389448248087, + "grad_norm": 0.2377600612126955, + "learning_rate": 1.853748205657448e-05, + "loss": 0.2657, + "step": 1770 + }, + { + "epoch": 2.8530004027386227, + "grad_norm": 0.25639490523892583, + "learning_rate": 1.8515022904461136e-05, + "loss": 0.2707, + "step": 1771 + }, + { + "epoch": 2.8546113572291585, + "grad_norm": 0.22335477679276985, + "learning_rate": 1.849256563517752e-05, + "loss": 0.2578, + "step": 1772 + }, + { + "epoch": 2.856222311719694, + "grad_norm": 0.26678857888253893, + "learning_rate": 1.8470110277197622e-05, + "loss": 0.2823, + "step": 1773 + }, + { + "epoch": 2.8578332662102297, + "grad_norm": 0.2658334462945636, + "learning_rate": 1.8447656858993e-05, + "loss": 0.2657, + "step": 1774 + }, + { + "epoch": 2.859444220700765, + "grad_norm": 0.23914934535135607, + "learning_rate": 1.8425205409032767e-05, + "loss": 0.2807, + "step": 1775 + }, + { + "epoch": 2.861055175191301, + "grad_norm": 0.23755248829706438, + "learning_rate": 1.8402755955783514e-05, + "loss": 0.2806, + "step": 1776 + }, + { + "epoch": 2.8626661296818368, + "grad_norm": 0.23622676175290067, + "learning_rate": 1.838030852770932e-05, + "loss": 0.2822, + "step": 1777 + }, + { + "epoch": 2.864277084172372, + "grad_norm": 0.2774308646019204, + "learning_rate": 1.835786315327169e-05, + "loss": 0.2953, + "step": 1778 + }, + { + "epoch": 2.8658880386629075, + "grad_norm": 0.23104176491388728, + "learning_rate": 1.8335419860929532e-05, + "loss": 0.252, + "step": 1779 + }, + { + "epoch": 2.8674989931534434, + "grad_norm": 0.2640215420459852, + "learning_rate": 1.831297867913911e-05, + "loss": 0.3052, + "step": 1780 + }, + { + "epoch": 2.869109947643979, + "grad_norm": 0.2508476637940116, + "learning_rate": 1.8290539636354e-05, + "loss": 0.2686, + "step": 1781 + }, + { + "epoch": 2.8707209021345146, + "grad_norm": 0.259821408101572, + "learning_rate": 1.8268102761025077e-05, + "loss": 0.2747, + "step": 1782 + }, + { + "epoch": 2.8723318566250504, + "grad_norm": 0.24412116210280163, + "learning_rate": 1.8245668081600477e-05, + "loss": 0.2755, + "step": 1783 + }, + { + "epoch": 2.873942811115586, + "grad_norm": 0.24896780062311544, + "learning_rate": 1.8223235626525542e-05, + "loss": 0.2714, + "step": 1784 + }, + { + "epoch": 2.8755537656061216, + "grad_norm": 0.24529433746790313, + "learning_rate": 1.820080542424278e-05, + "loss": 0.2637, + "step": 1785 + }, + { + "epoch": 2.8771647200966575, + "grad_norm": 0.272289653576584, + "learning_rate": 1.8178377503191875e-05, + "loss": 0.2916, + "step": 1786 + }, + { + "epoch": 2.878775674587193, + "grad_norm": 0.24673919563032362, + "learning_rate": 1.8155951891809592e-05, + "loss": 0.2817, + "step": 1787 + }, + { + "epoch": 2.8803866290777287, + "grad_norm": 0.24031746045696809, + "learning_rate": 1.813352861852977e-05, + "loss": 0.2788, + "step": 1788 + }, + { + "epoch": 2.881997583568264, + "grad_norm": 0.25670613044192553, + "learning_rate": 1.8111107711783293e-05, + "loss": 0.2658, + "step": 1789 + }, + { + "epoch": 2.8836085380588, + "grad_norm": 0.24396556057950303, + "learning_rate": 1.808868919999804e-05, + "loss": 0.256, + "step": 1790 + }, + { + "epoch": 2.8852194925493357, + "grad_norm": 0.24389592967270443, + "learning_rate": 1.806627311159886e-05, + "loss": 0.278, + "step": 1791 + }, + { + "epoch": 2.886830447039871, + "grad_norm": 0.25159917649723634, + "learning_rate": 1.8043859475007515e-05, + "loss": 0.2755, + "step": 1792 + }, + { + "epoch": 2.8884414015304065, + "grad_norm": 0.21554818631946265, + "learning_rate": 1.8021448318642666e-05, + "loss": 0.2546, + "step": 1793 + }, + { + "epoch": 2.8900523560209423, + "grad_norm": 0.22780147011061957, + "learning_rate": 1.7999039670919837e-05, + "loss": 0.2675, + "step": 1794 + }, + { + "epoch": 2.891663310511478, + "grad_norm": 0.2556179680309161, + "learning_rate": 1.797663356025136e-05, + "loss": 0.268, + "step": 1795 + }, + { + "epoch": 2.8932742650020136, + "grad_norm": 0.22179477121268426, + "learning_rate": 1.795423001504635e-05, + "loss": 0.2625, + "step": 1796 + }, + { + "epoch": 2.8948852194925494, + "grad_norm": 0.2566657026812676, + "learning_rate": 1.793182906371068e-05, + "loss": 0.3041, + "step": 1797 + }, + { + "epoch": 2.896496173983085, + "grad_norm": 0.22649500722803623, + "learning_rate": 1.7909430734646936e-05, + "loss": 0.2626, + "step": 1798 + }, + { + "epoch": 2.8981071284736206, + "grad_norm": 0.2382002490806491, + "learning_rate": 1.788703505625435e-05, + "loss": 0.2697, + "step": 1799 + }, + { + "epoch": 2.8997180829641565, + "grad_norm": 0.23333967874773118, + "learning_rate": 1.7864642056928823e-05, + "loss": 0.282, + "step": 1800 + }, + { + "epoch": 2.901329037454692, + "grad_norm": 0.22810377571609813, + "learning_rate": 1.7842251765062858e-05, + "loss": 0.2703, + "step": 1801 + }, + { + "epoch": 2.9029399919452277, + "grad_norm": 0.2475532042645999, + "learning_rate": 1.7819864209045512e-05, + "loss": 0.2893, + "step": 1802 + }, + { + "epoch": 2.904550946435763, + "grad_norm": 0.23819963629896387, + "learning_rate": 1.7797479417262375e-05, + "loss": 0.2693, + "step": 1803 + }, + { + "epoch": 2.906161900926299, + "grad_norm": 0.2498421038014751, + "learning_rate": 1.777509741809553e-05, + "loss": 0.2685, + "step": 1804 + }, + { + "epoch": 2.9077728554168347, + "grad_norm": 0.22386243561742944, + "learning_rate": 1.775271823992354e-05, + "loss": 0.2425, + "step": 1805 + }, + { + "epoch": 2.90938380990737, + "grad_norm": 0.2524226759643067, + "learning_rate": 1.773034191112137e-05, + "loss": 0.3005, + "step": 1806 + }, + { + "epoch": 2.9109947643979055, + "grad_norm": 0.23459453267664337, + "learning_rate": 1.7707968460060375e-05, + "loss": 0.2858, + "step": 1807 + }, + { + "epoch": 2.9126057188884413, + "grad_norm": 0.24955656470007692, + "learning_rate": 1.7685597915108257e-05, + "loss": 0.2793, + "step": 1808 + }, + { + "epoch": 2.914216673378977, + "grad_norm": 0.22015213316381949, + "learning_rate": 1.7663230304629066e-05, + "loss": 0.2702, + "step": 1809 + }, + { + "epoch": 2.9158276278695126, + "grad_norm": 0.2655548953653868, + "learning_rate": 1.7640865656983084e-05, + "loss": 0.3032, + "step": 1810 + }, + { + "epoch": 2.9174385823600484, + "grad_norm": 0.2323494023577661, + "learning_rate": 1.7618504000526863e-05, + "loss": 0.2761, + "step": 1811 + }, + { + "epoch": 2.9190495368505838, + "grad_norm": 0.2109588847916537, + "learning_rate": 1.759614536361316e-05, + "loss": 0.2614, + "step": 1812 + }, + { + "epoch": 2.9206604913411196, + "grad_norm": 0.25601356381819473, + "learning_rate": 1.7573789774590903e-05, + "loss": 0.3105, + "step": 1813 + }, + { + "epoch": 2.9222714458316554, + "grad_norm": 0.22547870004031134, + "learning_rate": 1.755143726180516e-05, + "loss": 0.2899, + "step": 1814 + }, + { + "epoch": 2.923882400322191, + "grad_norm": 0.2509635812735881, + "learning_rate": 1.7529087853597072e-05, + "loss": 0.2821, + "step": 1815 + }, + { + "epoch": 2.9254933548127267, + "grad_norm": 0.23583864792349135, + "learning_rate": 1.7506741578303883e-05, + "loss": 0.2623, + "step": 1816 + }, + { + "epoch": 2.927104309303262, + "grad_norm": 0.24318522703606663, + "learning_rate": 1.748439846425884e-05, + "loss": 0.2868, + "step": 1817 + }, + { + "epoch": 2.928715263793798, + "grad_norm": 0.22768641363630263, + "learning_rate": 1.7462058539791192e-05, + "loss": 0.2942, + "step": 1818 + }, + { + "epoch": 2.9303262182843337, + "grad_norm": 0.22037526237850094, + "learning_rate": 1.743972183322612e-05, + "loss": 0.2651, + "step": 1819 + }, + { + "epoch": 2.931937172774869, + "grad_norm": 0.2261074846645067, + "learning_rate": 1.7417388372884775e-05, + "loss": 0.2863, + "step": 1820 + }, + { + "epoch": 2.9335481272654045, + "grad_norm": 0.22274024245003757, + "learning_rate": 1.739505818708414e-05, + "loss": 0.2594, + "step": 1821 + }, + { + "epoch": 2.9351590817559403, + "grad_norm": 0.24040076004716376, + "learning_rate": 1.7372731304137072e-05, + "loss": 0.2691, + "step": 1822 + }, + { + "epoch": 2.936770036246476, + "grad_norm": 0.2506765713731466, + "learning_rate": 1.735040775235224e-05, + "loss": 0.2998, + "step": 1823 + }, + { + "epoch": 2.9383809907370115, + "grad_norm": 0.22705774588973596, + "learning_rate": 1.732808756003408e-05, + "loss": 0.2528, + "step": 1824 + }, + { + "epoch": 2.9399919452275474, + "grad_norm": 0.22020216475260657, + "learning_rate": 1.7305770755482788e-05, + "loss": 0.2594, + "step": 1825 + }, + { + "epoch": 2.9416028997180828, + "grad_norm": 0.2525904487369489, + "learning_rate": 1.7283457366994226e-05, + "loss": 0.2634, + "step": 1826 + }, + { + "epoch": 2.9432138542086186, + "grad_norm": 0.24948049883181214, + "learning_rate": 1.7261147422859967e-05, + "loss": 0.2824, + "step": 1827 + }, + { + "epoch": 2.9448248086991544, + "grad_norm": 0.22594459892161928, + "learning_rate": 1.7238840951367194e-05, + "loss": 0.2685, + "step": 1828 + }, + { + "epoch": 2.94643576318969, + "grad_norm": 0.23969345388513782, + "learning_rate": 1.72165379807987e-05, + "loss": 0.2776, + "step": 1829 + }, + { + "epoch": 2.9480467176802256, + "grad_norm": 0.23702835747098586, + "learning_rate": 1.7194238539432807e-05, + "loss": 0.286, + "step": 1830 + }, + { + "epoch": 2.949657672170761, + "grad_norm": 0.2308017426275451, + "learning_rate": 1.7171942655543415e-05, + "loss": 0.2553, + "step": 1831 + }, + { + "epoch": 2.951268626661297, + "grad_norm": 0.25865741662991343, + "learning_rate": 1.714965035739987e-05, + "loss": 0.295, + "step": 1832 + }, + { + "epoch": 2.9528795811518327, + "grad_norm": 0.22524224832126727, + "learning_rate": 1.7127361673266982e-05, + "loss": 0.2379, + "step": 1833 + }, + { + "epoch": 2.954490535642368, + "grad_norm": 0.2874573108392332, + "learning_rate": 1.7105076631404994e-05, + "loss": 0.3053, + "step": 1834 + }, + { + "epoch": 2.956101490132904, + "grad_norm": 0.2334494538238959, + "learning_rate": 1.7082795260069515e-05, + "loss": 0.2591, + "step": 1835 + }, + { + "epoch": 2.9577124446234393, + "grad_norm": 0.25784150732290884, + "learning_rate": 1.7060517587511512e-05, + "loss": 0.2823, + "step": 1836 + }, + { + "epoch": 2.959323399113975, + "grad_norm": 0.23058702287993874, + "learning_rate": 1.7038243641977238e-05, + "loss": 0.2464, + "step": 1837 + }, + { + "epoch": 2.9609343536045105, + "grad_norm": 0.22596215888674936, + "learning_rate": 1.701597345170825e-05, + "loss": 0.286, + "step": 1838 + }, + { + "epoch": 2.9625453080950463, + "grad_norm": 0.23090028001401205, + "learning_rate": 1.6993707044941334e-05, + "loss": 0.2828, + "step": 1839 + }, + { + "epoch": 2.9641562625855817, + "grad_norm": 0.22216529160650983, + "learning_rate": 1.6971444449908474e-05, + "loss": 0.2655, + "step": 1840 + }, + { + "epoch": 2.9657672170761176, + "grad_norm": 0.23782152766999168, + "learning_rate": 1.6949185694836806e-05, + "loss": 0.3075, + "step": 1841 + }, + { + "epoch": 2.9673781715666534, + "grad_norm": 0.23394803632471256, + "learning_rate": 1.6926930807948646e-05, + "loss": 0.2607, + "step": 1842 + }, + { + "epoch": 2.968989126057189, + "grad_norm": 0.22966128510957112, + "learning_rate": 1.6904679817461347e-05, + "loss": 0.2653, + "step": 1843 + }, + { + "epoch": 2.9706000805477246, + "grad_norm": 0.23928795508961923, + "learning_rate": 1.688243275158736e-05, + "loss": 0.2515, + "step": 1844 + }, + { + "epoch": 2.97221103503826, + "grad_norm": 0.2426394963369243, + "learning_rate": 1.6860189638534142e-05, + "loss": 0.269, + "step": 1845 + }, + { + "epoch": 2.973821989528796, + "grad_norm": 0.2321987864421601, + "learning_rate": 1.6837950506504158e-05, + "loss": 0.2831, + "step": 1846 + }, + { + "epoch": 2.9754329440193317, + "grad_norm": 0.25421214491796396, + "learning_rate": 1.6815715383694805e-05, + "loss": 0.284, + "step": 1847 + }, + { + "epoch": 2.977043898509867, + "grad_norm": 0.2443429609504397, + "learning_rate": 1.6793484298298387e-05, + "loss": 0.284, + "step": 1848 + }, + { + "epoch": 2.978654853000403, + "grad_norm": 0.21013676562255554, + "learning_rate": 1.6771257278502135e-05, + "loss": 0.2368, + "step": 1849 + }, + { + "epoch": 2.9802658074909383, + "grad_norm": 0.25821210868342026, + "learning_rate": 1.6749034352488077e-05, + "loss": 0.2834, + "step": 1850 + }, + { + "epoch": 2.981876761981474, + "grad_norm": 0.23933753025125024, + "learning_rate": 1.6726815548433072e-05, + "loss": 0.2712, + "step": 1851 + }, + { + "epoch": 2.98348771647201, + "grad_norm": 0.23023492804389561, + "learning_rate": 1.6704600894508743e-05, + "loss": 0.2598, + "step": 1852 + }, + { + "epoch": 2.9850986709625453, + "grad_norm": 0.270960478809936, + "learning_rate": 1.668239041888148e-05, + "loss": 0.3171, + "step": 1853 + }, + { + "epoch": 2.9867096254530807, + "grad_norm": 0.2165989069224987, + "learning_rate": 1.666018414971233e-05, + "loss": 0.2687, + "step": 1854 + }, + { + "epoch": 2.9883205799436166, + "grad_norm": 0.23657876878647432, + "learning_rate": 1.663798211515704e-05, + "loss": 0.2732, + "step": 1855 + }, + { + "epoch": 2.9899315344341524, + "grad_norm": 0.2514218073257401, + "learning_rate": 1.661578434336597e-05, + "loss": 0.2824, + "step": 1856 + }, + { + "epoch": 2.9915424889246878, + "grad_norm": 0.22874574938184158, + "learning_rate": 1.6593590862484095e-05, + "loss": 0.2753, + "step": 1857 + }, + { + "epoch": 2.9931534434152236, + "grad_norm": 0.2270308899508822, + "learning_rate": 1.6571401700650934e-05, + "loss": 0.2533, + "step": 1858 + }, + { + "epoch": 2.994764397905759, + "grad_norm": 0.23310437321259292, + "learning_rate": 1.6549216886000513e-05, + "loss": 0.2709, + "step": 1859 + }, + { + "epoch": 2.996375352396295, + "grad_norm": 0.26450041707096045, + "learning_rate": 1.6527036446661396e-05, + "loss": 0.2978, + "step": 1860 + }, + { + "epoch": 2.9979863068868307, + "grad_norm": 0.2340175999215853, + "learning_rate": 1.6504860410756548e-05, + "loss": 0.2694, + "step": 1861 + }, + { + "epoch": 2.999597261377366, + "grad_norm": 0.27305237704293234, + "learning_rate": 1.6482688806403383e-05, + "loss": 0.3064, + "step": 1862 + }, + { + "epoch": 3.001208215867902, + "grad_norm": 0.3397063087612353, + "learning_rate": 1.646052166171368e-05, + "loss": 0.2409, + "step": 1863 + }, + { + "epoch": 3.0028191703584373, + "grad_norm": 0.30805634582005703, + "learning_rate": 1.6438359004793572e-05, + "loss": 0.2068, + "step": 1864 + }, + { + "epoch": 3.004430124848973, + "grad_norm": 0.45744869192317866, + "learning_rate": 1.64162008637435e-05, + "loss": 0.2269, + "step": 1865 + }, + { + "epoch": 3.0060410793395085, + "grad_norm": 0.2781379556860123, + "learning_rate": 1.639404726665817e-05, + "loss": 0.1878, + "step": 1866 + }, + { + "epoch": 3.0076520338300443, + "grad_norm": 0.45200098984478193, + "learning_rate": 1.637189824162654e-05, + "loss": 0.2083, + "step": 1867 + }, + { + "epoch": 3.00926298832058, + "grad_norm": 0.2842219362339001, + "learning_rate": 1.634975381673177e-05, + "loss": 0.1975, + "step": 1868 + }, + { + "epoch": 3.0108739428111155, + "grad_norm": 0.3164436406112575, + "learning_rate": 1.632761402005118e-05, + "loss": 0.1975, + "step": 1869 + }, + { + "epoch": 3.0124848973016514, + "grad_norm": 0.28217933369594117, + "learning_rate": 1.630547887965622e-05, + "loss": 0.2004, + "step": 1870 + }, + { + "epoch": 3.0140958517921868, + "grad_norm": 0.2569624519531185, + "learning_rate": 1.628334842361244e-05, + "loss": 0.1879, + "step": 1871 + }, + { + "epoch": 3.0157068062827226, + "grad_norm": 0.27431269684331583, + "learning_rate": 1.6261222679979462e-05, + "loss": 0.2135, + "step": 1872 + }, + { + "epoch": 3.017317760773258, + "grad_norm": 0.24794482330292733, + "learning_rate": 1.6239101676810917e-05, + "loss": 0.2132, + "step": 1873 + }, + { + "epoch": 3.018928715263794, + "grad_norm": 0.28290856893562144, + "learning_rate": 1.6216985442154427e-05, + "loss": 0.2237, + "step": 1874 + }, + { + "epoch": 3.0205396697543296, + "grad_norm": 0.24267628598445265, + "learning_rate": 1.619487400405158e-05, + "loss": 0.1798, + "step": 1875 + }, + { + "epoch": 3.022150624244865, + "grad_norm": 0.27946258366684773, + "learning_rate": 1.6172767390537874e-05, + "loss": 0.2003, + "step": 1876 + }, + { + "epoch": 3.023761578735401, + "grad_norm": 0.2635177433404853, + "learning_rate": 1.6150665629642685e-05, + "loss": 0.1947, + "step": 1877 + }, + { + "epoch": 3.0253725332259362, + "grad_norm": 0.2592624875109387, + "learning_rate": 1.6128568749389238e-05, + "loss": 0.1899, + "step": 1878 + }, + { + "epoch": 3.026983487716472, + "grad_norm": 0.25701500127549515, + "learning_rate": 1.6106476777794587e-05, + "loss": 0.2059, + "step": 1879 + }, + { + "epoch": 3.0285944422070075, + "grad_norm": 0.2531036287332326, + "learning_rate": 1.6084389742869543e-05, + "loss": 0.2029, + "step": 1880 + }, + { + "epoch": 3.0302053966975433, + "grad_norm": 0.26247322863177375, + "learning_rate": 1.6062307672618654e-05, + "loss": 0.2158, + "step": 1881 + }, + { + "epoch": 3.031816351188079, + "grad_norm": 0.2514848759579883, + "learning_rate": 1.6040230595040186e-05, + "loss": 0.2065, + "step": 1882 + }, + { + "epoch": 3.0334273056786145, + "grad_norm": 0.26086089307040033, + "learning_rate": 1.601815853812608e-05, + "loss": 0.1832, + "step": 1883 + }, + { + "epoch": 3.0350382601691503, + "grad_norm": 0.2650496609324423, + "learning_rate": 1.5996091529861897e-05, + "loss": 0.2189, + "step": 1884 + }, + { + "epoch": 3.0366492146596857, + "grad_norm": 0.24975861001178923, + "learning_rate": 1.5974029598226796e-05, + "loss": 0.1956, + "step": 1885 + }, + { + "epoch": 3.0382601691502216, + "grad_norm": 0.26398918888430833, + "learning_rate": 1.595197277119352e-05, + "loss": 0.2089, + "step": 1886 + }, + { + "epoch": 3.039871123640757, + "grad_norm": 0.23695742331210073, + "learning_rate": 1.5929921076728316e-05, + "loss": 0.1951, + "step": 1887 + }, + { + "epoch": 3.041482078131293, + "grad_norm": 0.23211753996183163, + "learning_rate": 1.590787454279093e-05, + "loss": 0.1923, + "step": 1888 + }, + { + "epoch": 3.0430930326218286, + "grad_norm": 0.24973175927852212, + "learning_rate": 1.5885833197334563e-05, + "loss": 0.186, + "step": 1889 + }, + { + "epoch": 3.044703987112364, + "grad_norm": 0.2529151185889554, + "learning_rate": 1.586379706830586e-05, + "loss": 0.2087, + "step": 1890 + }, + { + "epoch": 3.0463149416029, + "grad_norm": 0.2482208969171952, + "learning_rate": 1.584176618364482e-05, + "loss": 0.2011, + "step": 1891 + }, + { + "epoch": 3.0479258960934352, + "grad_norm": 0.2550637551299936, + "learning_rate": 1.5819740571284807e-05, + "loss": 0.2177, + "step": 1892 + }, + { + "epoch": 3.049536850583971, + "grad_norm": 0.23750126804753882, + "learning_rate": 1.5797720259152496e-05, + "loss": 0.174, + "step": 1893 + }, + { + "epoch": 3.0511478050745064, + "grad_norm": 0.24172300528999488, + "learning_rate": 1.5775705275167854e-05, + "loss": 0.2076, + "step": 1894 + }, + { + "epoch": 3.0527587595650423, + "grad_norm": 0.26506057370117664, + "learning_rate": 1.5753695647244083e-05, + "loss": 0.2068, + "step": 1895 + }, + { + "epoch": 3.054369714055578, + "grad_norm": 0.22576990598377747, + "learning_rate": 1.5731691403287595e-05, + "loss": 0.1925, + "step": 1896 + }, + { + "epoch": 3.0559806685461135, + "grad_norm": 0.2545620829654115, + "learning_rate": 1.5709692571197957e-05, + "loss": 0.2, + "step": 1897 + }, + { + "epoch": 3.0575916230366493, + "grad_norm": 0.26218591276871567, + "learning_rate": 1.568769917886792e-05, + "loss": 0.1976, + "step": 1898 + }, + { + "epoch": 3.0592025775271847, + "grad_norm": 0.23957115050109906, + "learning_rate": 1.5665711254183293e-05, + "loss": 0.1963, + "step": 1899 + }, + { + "epoch": 3.0608135320177206, + "grad_norm": 0.2595037940259688, + "learning_rate": 1.564372882502297e-05, + "loss": 0.2051, + "step": 1900 + }, + { + "epoch": 3.062424486508256, + "grad_norm": 0.2224941487256705, + "learning_rate": 1.5621751919258884e-05, + "loss": 0.1869, + "step": 1901 + }, + { + "epoch": 3.0640354409987918, + "grad_norm": 0.25468643872804336, + "learning_rate": 1.5599780564755956e-05, + "loss": 0.2183, + "step": 1902 + }, + { + "epoch": 3.0656463954893276, + "grad_norm": 0.25613678517781996, + "learning_rate": 1.5577814789372064e-05, + "loss": 0.1931, + "step": 1903 + }, + { + "epoch": 3.067257349979863, + "grad_norm": 0.24029809222854032, + "learning_rate": 1.555585462095802e-05, + "loss": 0.1899, + "step": 1904 + }, + { + "epoch": 3.068868304470399, + "grad_norm": 0.25815577075329543, + "learning_rate": 1.5533900087357527e-05, + "loss": 0.2053, + "step": 1905 + }, + { + "epoch": 3.070479258960934, + "grad_norm": 0.25550205510849705, + "learning_rate": 1.5511951216407142e-05, + "loss": 0.202, + "step": 1906 + }, + { + "epoch": 3.07209021345147, + "grad_norm": 0.24282903913932946, + "learning_rate": 1.5490008035936245e-05, + "loss": 0.1987, + "step": 1907 + }, + { + "epoch": 3.0737011679420054, + "grad_norm": 0.24787825404381872, + "learning_rate": 1.5468070573766982e-05, + "loss": 0.2126, + "step": 1908 + }, + { + "epoch": 3.0753121224325413, + "grad_norm": 0.23805631453760473, + "learning_rate": 1.5446138857714285e-05, + "loss": 0.2152, + "step": 1909 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.22261953448924351, + "learning_rate": 1.5424212915585766e-05, + "loss": 0.2049, + "step": 1910 + }, + { + "epoch": 3.0785340314136125, + "grad_norm": 0.23389483167172603, + "learning_rate": 1.5402292775181732e-05, + "loss": 0.1841, + "step": 1911 + }, + { + "epoch": 3.0801449859041483, + "grad_norm": 0.24520165353665813, + "learning_rate": 1.5380378464295133e-05, + "loss": 0.2121, + "step": 1912 + }, + { + "epoch": 3.0817559403946837, + "grad_norm": 0.2355879009329018, + "learning_rate": 1.535847001071153e-05, + "loss": 0.1831, + "step": 1913 + }, + { + "epoch": 3.0833668948852195, + "grad_norm": 0.25353870145003116, + "learning_rate": 1.5336567442209052e-05, + "loss": 0.2093, + "step": 1914 + }, + { + "epoch": 3.084977849375755, + "grad_norm": 0.23181775476490288, + "learning_rate": 1.5314670786558358e-05, + "loss": 0.1993, + "step": 1915 + }, + { + "epoch": 3.0865888038662908, + "grad_norm": 0.24826824746417567, + "learning_rate": 1.5292780071522634e-05, + "loss": 0.2159, + "step": 1916 + }, + { + "epoch": 3.0881997583568266, + "grad_norm": 0.23097701612004543, + "learning_rate": 1.527089532485751e-05, + "loss": 0.1875, + "step": 1917 + }, + { + "epoch": 3.089810712847362, + "grad_norm": 0.23252448832192432, + "learning_rate": 1.5249016574311069e-05, + "loss": 0.1925, + "step": 1918 + }, + { + "epoch": 3.091421667337898, + "grad_norm": 0.2335044424719987, + "learning_rate": 1.5227143847623759e-05, + "loss": 0.1917, + "step": 1919 + }, + { + "epoch": 3.093032621828433, + "grad_norm": 0.2467731183493127, + "learning_rate": 1.5205277172528438e-05, + "loss": 0.2028, + "step": 1920 + }, + { + "epoch": 3.094643576318969, + "grad_norm": 0.2478688042585764, + "learning_rate": 1.5183416576750251e-05, + "loss": 0.2155, + "step": 1921 + }, + { + "epoch": 3.096254530809505, + "grad_norm": 0.23559604410809817, + "learning_rate": 1.5161562088006649e-05, + "loss": 0.1927, + "step": 1922 + }, + { + "epoch": 3.0978654853000402, + "grad_norm": 0.25132210120716414, + "learning_rate": 1.513971373400734e-05, + "loss": 0.2165, + "step": 1923 + }, + { + "epoch": 3.099476439790576, + "grad_norm": 0.21838775169451463, + "learning_rate": 1.5117871542454259e-05, + "loss": 0.1959, + "step": 1924 + }, + { + "epoch": 3.1010873942811115, + "grad_norm": 0.24648218446618922, + "learning_rate": 1.509603554104152e-05, + "loss": 0.215, + "step": 1925 + }, + { + "epoch": 3.1026983487716473, + "grad_norm": 0.23608736727692337, + "learning_rate": 1.5074205757455382e-05, + "loss": 0.1913, + "step": 1926 + }, + { + "epoch": 3.1043093032621827, + "grad_norm": 0.24688604035464398, + "learning_rate": 1.5052382219374238e-05, + "loss": 0.2028, + "step": 1927 + }, + { + "epoch": 3.1059202577527185, + "grad_norm": 0.21415210914040747, + "learning_rate": 1.5030564954468548e-05, + "loss": 0.1712, + "step": 1928 + }, + { + "epoch": 3.107531212243254, + "grad_norm": 0.28181734443533013, + "learning_rate": 1.500875399040083e-05, + "loss": 0.2383, + "step": 1929 + }, + { + "epoch": 3.1091421667337897, + "grad_norm": 0.23835213355733328, + "learning_rate": 1.498694935482559e-05, + "loss": 0.1919, + "step": 1930 + }, + { + "epoch": 3.1107531212243256, + "grad_norm": 0.239890464679267, + "learning_rate": 1.4965151075389344e-05, + "loss": 0.1857, + "step": 1931 + }, + { + "epoch": 3.112364075714861, + "grad_norm": 0.24414783199380563, + "learning_rate": 1.4943359179730523e-05, + "loss": 0.2172, + "step": 1932 + }, + { + "epoch": 3.113975030205397, + "grad_norm": 0.23874299205907634, + "learning_rate": 1.492157369547947e-05, + "loss": 0.1947, + "step": 1933 + }, + { + "epoch": 3.115585984695932, + "grad_norm": 0.26898773638453133, + "learning_rate": 1.4899794650258397e-05, + "loss": 0.224, + "step": 1934 + }, + { + "epoch": 3.117196939186468, + "grad_norm": 0.2577407154196406, + "learning_rate": 1.4878022071681368e-05, + "loss": 0.1898, + "step": 1935 + }, + { + "epoch": 3.118807893677004, + "grad_norm": 0.29459886996326795, + "learning_rate": 1.4856255987354232e-05, + "loss": 0.2289, + "step": 1936 + }, + { + "epoch": 3.1204188481675392, + "grad_norm": 0.2512628187386988, + "learning_rate": 1.4834496424874587e-05, + "loss": 0.2068, + "step": 1937 + }, + { + "epoch": 3.122029802658075, + "grad_norm": 0.267387282459161, + "learning_rate": 1.4812743411831814e-05, + "loss": 0.211, + "step": 1938 + }, + { + "epoch": 3.1236407571486104, + "grad_norm": 0.25843048547184483, + "learning_rate": 1.479099697580694e-05, + "loss": 0.2177, + "step": 1939 + }, + { + "epoch": 3.1252517116391463, + "grad_norm": 0.26113540198237006, + "learning_rate": 1.4769257144372668e-05, + "loss": 0.1874, + "step": 1940 + }, + { + "epoch": 3.1268626661296817, + "grad_norm": 0.2508889947235532, + "learning_rate": 1.4747523945093332e-05, + "loss": 0.2058, + "step": 1941 + }, + { + "epoch": 3.1284736206202175, + "grad_norm": 0.24792171081568534, + "learning_rate": 1.4725797405524866e-05, + "loss": 0.1922, + "step": 1942 + }, + { + "epoch": 3.130084575110753, + "grad_norm": 0.2536281872683442, + "learning_rate": 1.4704077553214734e-05, + "loss": 0.187, + "step": 1943 + }, + { + "epoch": 3.1316955296012887, + "grad_norm": 0.24634192517432316, + "learning_rate": 1.468236441570194e-05, + "loss": 0.2169, + "step": 1944 + }, + { + "epoch": 3.1333064840918246, + "grad_norm": 0.26202184945627943, + "learning_rate": 1.4660658020516966e-05, + "loss": 0.2107, + "step": 1945 + }, + { + "epoch": 3.13491743858236, + "grad_norm": 0.24692155351876013, + "learning_rate": 1.463895839518176e-05, + "loss": 0.2009, + "step": 1946 + }, + { + "epoch": 3.1365283930728958, + "grad_norm": 0.2651977196274272, + "learning_rate": 1.4617265567209673e-05, + "loss": 0.2161, + "step": 1947 + }, + { + "epoch": 3.138139347563431, + "grad_norm": 0.2254192338184251, + "learning_rate": 1.4595579564105432e-05, + "loss": 0.198, + "step": 1948 + }, + { + "epoch": 3.139750302053967, + "grad_norm": 0.2456828260242202, + "learning_rate": 1.457390041336512e-05, + "loss": 0.2169, + "step": 1949 + }, + { + "epoch": 3.141361256544503, + "grad_norm": 0.2383954425741448, + "learning_rate": 1.4552228142476138e-05, + "loss": 0.1913, + "step": 1950 + }, + { + "epoch": 3.142972211035038, + "grad_norm": 0.2388875846403241, + "learning_rate": 1.453056277891715e-05, + "loss": 0.2074, + "step": 1951 + }, + { + "epoch": 3.144583165525574, + "grad_norm": 0.24254252857096423, + "learning_rate": 1.4508904350158069e-05, + "loss": 0.184, + "step": 1952 + }, + { + "epoch": 3.1461941200161094, + "grad_norm": 0.2677691092809072, + "learning_rate": 1.4487252883660019e-05, + "loss": 0.1969, + "step": 1953 + }, + { + "epoch": 3.1478050745066453, + "grad_norm": 0.25451569692994286, + "learning_rate": 1.446560840687529e-05, + "loss": 0.2038, + "step": 1954 + }, + { + "epoch": 3.1494160289971806, + "grad_norm": 0.26165425081978877, + "learning_rate": 1.4443970947247308e-05, + "loss": 0.2257, + "step": 1955 + }, + { + "epoch": 3.1510269834877165, + "grad_norm": 0.2586993585923267, + "learning_rate": 1.4422340532210601e-05, + "loss": 0.2184, + "step": 1956 + }, + { + "epoch": 3.1526379379782523, + "grad_norm": 0.22749947793278547, + "learning_rate": 1.4400717189190783e-05, + "loss": 0.1886, + "step": 1957 + }, + { + "epoch": 3.1542488924687877, + "grad_norm": 0.25165536537988603, + "learning_rate": 1.4379100945604486e-05, + "loss": 0.1957, + "step": 1958 + }, + { + "epoch": 3.1558598469593235, + "grad_norm": 0.26090767826355016, + "learning_rate": 1.4357491828859333e-05, + "loss": 0.2062, + "step": 1959 + }, + { + "epoch": 3.157470801449859, + "grad_norm": 0.23823889920583088, + "learning_rate": 1.433588986635392e-05, + "loss": 0.1916, + "step": 1960 + }, + { + "epoch": 3.1590817559403948, + "grad_norm": 0.25301272984117323, + "learning_rate": 1.4314295085477784e-05, + "loss": 0.2129, + "step": 1961 + }, + { + "epoch": 3.16069271043093, + "grad_norm": 0.24454168144111912, + "learning_rate": 1.4292707513611332e-05, + "loss": 0.191, + "step": 1962 + }, + { + "epoch": 3.162303664921466, + "grad_norm": 0.2519013664831757, + "learning_rate": 1.4271127178125843e-05, + "loss": 0.213, + "step": 1963 + }, + { + "epoch": 3.163914619412002, + "grad_norm": 0.2322029065504692, + "learning_rate": 1.4249554106383432e-05, + "loss": 0.1915, + "step": 1964 + }, + { + "epoch": 3.165525573902537, + "grad_norm": 0.268939820675357, + "learning_rate": 1.4227988325736991e-05, + "loss": 0.2017, + "step": 1965 + }, + { + "epoch": 3.167136528393073, + "grad_norm": 0.24459588031927953, + "learning_rate": 1.420642986353016e-05, + "loss": 0.2095, + "step": 1966 + }, + { + "epoch": 3.1687474828836084, + "grad_norm": 0.23268508577761784, + "learning_rate": 1.4184878747097308e-05, + "loss": 0.2032, + "step": 1967 + }, + { + "epoch": 3.1703584373741442, + "grad_norm": 0.24314702305936226, + "learning_rate": 1.4163335003763506e-05, + "loss": 0.2094, + "step": 1968 + }, + { + "epoch": 3.1719693918646796, + "grad_norm": 0.2466331707682189, + "learning_rate": 1.414179866084445e-05, + "loss": 0.2039, + "step": 1969 + }, + { + "epoch": 3.1735803463552155, + "grad_norm": 0.24010384240031207, + "learning_rate": 1.4120269745646469e-05, + "loss": 0.2131, + "step": 1970 + }, + { + "epoch": 3.1751913008457513, + "grad_norm": 0.2315431702165574, + "learning_rate": 1.4098748285466464e-05, + "loss": 0.1836, + "step": 1971 + }, + { + "epoch": 3.1768022553362867, + "grad_norm": 0.24912715996998305, + "learning_rate": 1.4077234307591896e-05, + "loss": 0.2044, + "step": 1972 + }, + { + "epoch": 3.1784132098268225, + "grad_norm": 0.23840799641981053, + "learning_rate": 1.4055727839300733e-05, + "loss": 0.2022, + "step": 1973 + }, + { + "epoch": 3.180024164317358, + "grad_norm": 0.23128654085099198, + "learning_rate": 1.4034228907861414e-05, + "loss": 0.2028, + "step": 1974 + }, + { + "epoch": 3.1816351188078937, + "grad_norm": 0.26483087418421175, + "learning_rate": 1.4012737540532842e-05, + "loss": 0.2106, + "step": 1975 + }, + { + "epoch": 3.183246073298429, + "grad_norm": 0.23542978958227243, + "learning_rate": 1.3991253764564308e-05, + "loss": 0.2013, + "step": 1976 + }, + { + "epoch": 3.184857027788965, + "grad_norm": 0.23883779350162376, + "learning_rate": 1.3969777607195485e-05, + "loss": 0.1905, + "step": 1977 + }, + { + "epoch": 3.186467982279501, + "grad_norm": 0.22791901305740134, + "learning_rate": 1.3948309095656382e-05, + "loss": 0.2003, + "step": 1978 + }, + { + "epoch": 3.188078936770036, + "grad_norm": 0.2585822611597578, + "learning_rate": 1.3926848257167336e-05, + "loss": 0.2197, + "step": 1979 + }, + { + "epoch": 3.189689891260572, + "grad_norm": 0.2573536529025018, + "learning_rate": 1.3905395118938929e-05, + "loss": 0.206, + "step": 1980 + }, + { + "epoch": 3.1913008457511074, + "grad_norm": 0.24021103836798227, + "learning_rate": 1.3883949708171987e-05, + "loss": 0.208, + "step": 1981 + }, + { + "epoch": 3.1929118002416432, + "grad_norm": 0.24221200646286703, + "learning_rate": 1.3862512052057535e-05, + "loss": 0.2255, + "step": 1982 + }, + { + "epoch": 3.1945227547321786, + "grad_norm": 0.23090236397527916, + "learning_rate": 1.384108217777678e-05, + "loss": 0.1995, + "step": 1983 + }, + { + "epoch": 3.1961337092227144, + "grad_norm": 0.21761509754767042, + "learning_rate": 1.3819660112501054e-05, + "loss": 0.1835, + "step": 1984 + }, + { + "epoch": 3.1977446637132503, + "grad_norm": 0.2703261644476621, + "learning_rate": 1.3798245883391788e-05, + "loss": 0.2289, + "step": 1985 + }, + { + "epoch": 3.1993556182037857, + "grad_norm": 0.2230354526114237, + "learning_rate": 1.3776839517600458e-05, + "loss": 0.1979, + "step": 1986 + }, + { + "epoch": 3.2009665726943215, + "grad_norm": 0.23711275306860455, + "learning_rate": 1.3755441042268615e-05, + "loss": 0.1923, + "step": 1987 + }, + { + "epoch": 3.202577527184857, + "grad_norm": 0.24705039155672698, + "learning_rate": 1.3734050484527765e-05, + "loss": 0.2165, + "step": 1988 + }, + { + "epoch": 3.2041884816753927, + "grad_norm": 0.2377534616627376, + "learning_rate": 1.3712667871499385e-05, + "loss": 0.1929, + "step": 1989 + }, + { + "epoch": 3.205799436165928, + "grad_norm": 0.2449633515338068, + "learning_rate": 1.369129323029489e-05, + "loss": 0.2004, + "step": 1990 + }, + { + "epoch": 3.207410390656464, + "grad_norm": 0.2593360096012776, + "learning_rate": 1.3669926588015585e-05, + "loss": 0.2185, + "step": 1991 + }, + { + "epoch": 3.2090213451469998, + "grad_norm": 0.23324589375655463, + "learning_rate": 1.364856797175262e-05, + "loss": 0.1875, + "step": 1992 + }, + { + "epoch": 3.210632299637535, + "grad_norm": 0.2413326158872254, + "learning_rate": 1.362721740858697e-05, + "loss": 0.1826, + "step": 1993 + }, + { + "epoch": 3.212243254128071, + "grad_norm": 0.23918589432715148, + "learning_rate": 1.3605874925589419e-05, + "loss": 0.2068, + "step": 1994 + }, + { + "epoch": 3.2138542086186064, + "grad_norm": 0.23689262549972226, + "learning_rate": 1.3584540549820493e-05, + "loss": 0.2253, + "step": 1995 + }, + { + "epoch": 3.215465163109142, + "grad_norm": 0.23319941479929307, + "learning_rate": 1.3563214308330434e-05, + "loss": 0.1865, + "step": 1996 + }, + { + "epoch": 3.217076117599678, + "grad_norm": 0.23879194349455857, + "learning_rate": 1.3541896228159165e-05, + "loss": 0.213, + "step": 1997 + }, + { + "epoch": 3.2186870720902134, + "grad_norm": 0.25094170517305836, + "learning_rate": 1.3520586336336296e-05, + "loss": 0.2182, + "step": 1998 + }, + { + "epoch": 3.2202980265807493, + "grad_norm": 0.25223079196864345, + "learning_rate": 1.3499284659881013e-05, + "loss": 0.2001, + "step": 1999 + }, + { + "epoch": 3.2219089810712846, + "grad_norm": 0.24328823956234302, + "learning_rate": 1.3477991225802103e-05, + "loss": 0.2045, + "step": 2000 + }, + { + "epoch": 3.2235199355618205, + "grad_norm": 0.23787972021657403, + "learning_rate": 1.3456706061097905e-05, + "loss": 0.1807, + "step": 2001 + }, + { + "epoch": 3.225130890052356, + "grad_norm": 0.28090320878999836, + "learning_rate": 1.3435429192756275e-05, + "loss": 0.2231, + "step": 2002 + }, + { + "epoch": 3.2267418445428917, + "grad_norm": 0.2591510357821683, + "learning_rate": 1.3414160647754547e-05, + "loss": 0.2125, + "step": 2003 + }, + { + "epoch": 3.228352799033427, + "grad_norm": 0.22470518404451317, + "learning_rate": 1.339290045305948e-05, + "loss": 0.1817, + "step": 2004 + }, + { + "epoch": 3.229963753523963, + "grad_norm": 0.23711240462541178, + "learning_rate": 1.3371648635627285e-05, + "loss": 0.202, + "step": 2005 + }, + { + "epoch": 3.2315747080144988, + "grad_norm": 0.26273477899693026, + "learning_rate": 1.3350405222403529e-05, + "loss": 0.2079, + "step": 2006 + }, + { + "epoch": 3.233185662505034, + "grad_norm": 0.2325874086405899, + "learning_rate": 1.3329170240323124e-05, + "loss": 0.2106, + "step": 2007 + }, + { + "epoch": 3.23479661699557, + "grad_norm": 0.23774767585628861, + "learning_rate": 1.330794371631028e-05, + "loss": 0.1882, + "step": 2008 + }, + { + "epoch": 3.2364075714861054, + "grad_norm": 0.22890478281530627, + "learning_rate": 1.3286725677278525e-05, + "loss": 0.1993, + "step": 2009 + }, + { + "epoch": 3.238018525976641, + "grad_norm": 0.25495173404834265, + "learning_rate": 1.3265516150130577e-05, + "loss": 0.2195, + "step": 2010 + }, + { + "epoch": 3.239629480467177, + "grad_norm": 0.2433403725605887, + "learning_rate": 1.3244315161758392e-05, + "loss": 0.2033, + "step": 2011 + }, + { + "epoch": 3.2412404349577124, + "grad_norm": 0.24981637506327542, + "learning_rate": 1.3223122739043091e-05, + "loss": 0.2122, + "step": 2012 + }, + { + "epoch": 3.2428513894482482, + "grad_norm": 0.23690220270645826, + "learning_rate": 1.3201938908854942e-05, + "loss": 0.2007, + "step": 2013 + }, + { + "epoch": 3.2444623439387836, + "grad_norm": 0.23365777497487655, + "learning_rate": 1.3180763698053311e-05, + "loss": 0.2043, + "step": 2014 + }, + { + "epoch": 3.2460732984293195, + "grad_norm": 0.24922388772794682, + "learning_rate": 1.3159597133486628e-05, + "loss": 0.2049, + "step": 2015 + }, + { + "epoch": 3.247684252919855, + "grad_norm": 0.2615394743002068, + "learning_rate": 1.3138439241992376e-05, + "loss": 0.1982, + "step": 2016 + }, + { + "epoch": 3.2492952074103907, + "grad_norm": 0.23085725558413792, + "learning_rate": 1.3117290050397036e-05, + "loss": 0.2063, + "step": 2017 + }, + { + "epoch": 3.250906161900926, + "grad_norm": 0.2524638411576523, + "learning_rate": 1.3096149585516059e-05, + "loss": 0.201, + "step": 2018 + }, + { + "epoch": 3.252517116391462, + "grad_norm": 0.22321898517204689, + "learning_rate": 1.3075017874153808e-05, + "loss": 0.1873, + "step": 2019 + }, + { + "epoch": 3.2541280708819977, + "grad_norm": 0.24208007420834118, + "learning_rate": 1.3053894943103598e-05, + "loss": 0.2028, + "step": 2020 + }, + { + "epoch": 3.255739025372533, + "grad_norm": 0.23464995395813093, + "learning_rate": 1.303278081914756e-05, + "loss": 0.1874, + "step": 2021 + }, + { + "epoch": 3.257349979863069, + "grad_norm": 0.23529260639203314, + "learning_rate": 1.3011675529056688e-05, + "loss": 0.1984, + "step": 2022 + }, + { + "epoch": 3.2589609343536043, + "grad_norm": 0.23749352729237172, + "learning_rate": 1.2990579099590763e-05, + "loss": 0.201, + "step": 2023 + }, + { + "epoch": 3.26057188884414, + "grad_norm": 0.2427641865797294, + "learning_rate": 1.2969491557498342e-05, + "loss": 0.2031, + "step": 2024 + }, + { + "epoch": 3.262182843334676, + "grad_norm": 0.2375077667321114, + "learning_rate": 1.2948412929516703e-05, + "loss": 0.1918, + "step": 2025 + }, + { + "epoch": 3.2637937978252114, + "grad_norm": 0.24542218137105598, + "learning_rate": 1.2927343242371815e-05, + "loss": 0.207, + "step": 2026 + }, + { + "epoch": 3.2654047523157472, + "grad_norm": 0.2390333630741558, + "learning_rate": 1.2906282522778341e-05, + "loss": 0.1998, + "step": 2027 + }, + { + "epoch": 3.2670157068062826, + "grad_norm": 0.24837545936493144, + "learning_rate": 1.2885230797439543e-05, + "loss": 0.2157, + "step": 2028 + }, + { + "epoch": 3.2686266612968184, + "grad_norm": 0.2519965003627847, + "learning_rate": 1.2864188093047291e-05, + "loss": 0.1995, + "step": 2029 + }, + { + "epoch": 3.270237615787354, + "grad_norm": 0.23774917985531094, + "learning_rate": 1.2843154436282014e-05, + "loss": 0.2081, + "step": 2030 + }, + { + "epoch": 3.2718485702778897, + "grad_norm": 0.24180882546372334, + "learning_rate": 1.2822129853812682e-05, + "loss": 0.1794, + "step": 2031 + }, + { + "epoch": 3.273459524768425, + "grad_norm": 0.22947773377827083, + "learning_rate": 1.2801114372296742e-05, + "loss": 0.204, + "step": 2032 + }, + { + "epoch": 3.275070479258961, + "grad_norm": 0.2455803270540699, + "learning_rate": 1.2780108018380103e-05, + "loss": 0.2038, + "step": 2033 + }, + { + "epoch": 3.2766814337494967, + "grad_norm": 0.24059114783030533, + "learning_rate": 1.2759110818697114e-05, + "loss": 0.203, + "step": 2034 + }, + { + "epoch": 3.278292388240032, + "grad_norm": 0.22748506298956164, + "learning_rate": 1.273812279987051e-05, + "loss": 0.2093, + "step": 2035 + }, + { + "epoch": 3.279903342730568, + "grad_norm": 0.2469088454838881, + "learning_rate": 1.2717143988511392e-05, + "loss": 0.2227, + "step": 2036 + }, + { + "epoch": 3.2815142972211033, + "grad_norm": 0.23674243065437753, + "learning_rate": 1.2696174411219164e-05, + "loss": 0.1981, + "step": 2037 + }, + { + "epoch": 3.283125251711639, + "grad_norm": 0.22507856798719222, + "learning_rate": 1.2675214094581547e-05, + "loss": 0.1886, + "step": 2038 + }, + { + "epoch": 3.284736206202175, + "grad_norm": 0.24558240776697116, + "learning_rate": 1.2654263065174515e-05, + "loss": 0.2104, + "step": 2039 + }, + { + "epoch": 3.2863471606927104, + "grad_norm": 0.2247916689809979, + "learning_rate": 1.263332134956226e-05, + "loss": 0.1996, + "step": 2040 + }, + { + "epoch": 3.287958115183246, + "grad_norm": 0.24098803655370316, + "learning_rate": 1.2612388974297161e-05, + "loss": 0.2301, + "step": 2041 + }, + { + "epoch": 3.2895690696737816, + "grad_norm": 0.22790610018869215, + "learning_rate": 1.259146596591978e-05, + "loss": 0.2044, + "step": 2042 + }, + { + "epoch": 3.2911800241643174, + "grad_norm": 0.29439964547743164, + "learning_rate": 1.2570552350958764e-05, + "loss": 0.1872, + "step": 2043 + }, + { + "epoch": 3.292790978654853, + "grad_norm": 0.23266526895741785, + "learning_rate": 1.2549648155930875e-05, + "loss": 0.221, + "step": 2044 + }, + { + "epoch": 3.2944019331453887, + "grad_norm": 0.23179183784366264, + "learning_rate": 1.2528753407340929e-05, + "loss": 0.194, + "step": 2045 + }, + { + "epoch": 3.296012887635924, + "grad_norm": 0.21779866661814773, + "learning_rate": 1.250786813168176e-05, + "loss": 0.1891, + "step": 2046 + }, + { + "epoch": 3.29762384212646, + "grad_norm": 0.2532975434083456, + "learning_rate": 1.2486992355434197e-05, + "loss": 0.2207, + "step": 2047 + }, + { + "epoch": 3.2992347966169957, + "grad_norm": 0.22096278667714603, + "learning_rate": 1.2466126105067014e-05, + "loss": 0.1821, + "step": 2048 + }, + { + "epoch": 3.300845751107531, + "grad_norm": 0.2530562863101158, + "learning_rate": 1.2445269407036908e-05, + "loss": 0.2133, + "step": 2049 + }, + { + "epoch": 3.302456705598067, + "grad_norm": 0.24462052555468472, + "learning_rate": 1.242442228778848e-05, + "loss": 0.1947, + "step": 2050 + }, + { + "epoch": 3.3040676600886023, + "grad_norm": 0.22993112034177768, + "learning_rate": 1.2403584773754176e-05, + "loss": 0.1884, + "step": 2051 + }, + { + "epoch": 3.305678614579138, + "grad_norm": 0.2375252762228901, + "learning_rate": 1.238275689135425e-05, + "loss": 0.2068, + "step": 2052 + }, + { + "epoch": 3.307289569069674, + "grad_norm": 0.23974997189809513, + "learning_rate": 1.2361938666996772e-05, + "loss": 0.2094, + "step": 2053 + }, + { + "epoch": 3.3089005235602094, + "grad_norm": 0.23634430658941374, + "learning_rate": 1.2341130127077548e-05, + "loss": 0.1967, + "step": 2054 + }, + { + "epoch": 3.310511478050745, + "grad_norm": 0.23446190903752556, + "learning_rate": 1.2320331297980097e-05, + "loss": 0.183, + "step": 2055 + }, + { + "epoch": 3.3121224325412806, + "grad_norm": 0.23976096133391758, + "learning_rate": 1.2299542206075641e-05, + "loss": 0.2365, + "step": 2056 + }, + { + "epoch": 3.3137333870318164, + "grad_norm": 0.23801927478749976, + "learning_rate": 1.2278762877723058e-05, + "loss": 0.2101, + "step": 2057 + }, + { + "epoch": 3.315344341522352, + "grad_norm": 0.24082087940178765, + "learning_rate": 1.2257993339268843e-05, + "loss": 0.1928, + "step": 2058 + }, + { + "epoch": 3.3169552960128876, + "grad_norm": 0.24713092173760648, + "learning_rate": 1.223723361704706e-05, + "loss": 0.1997, + "step": 2059 + }, + { + "epoch": 3.3185662505034235, + "grad_norm": 0.24274357069914468, + "learning_rate": 1.221648373737935e-05, + "loss": 0.2094, + "step": 2060 + }, + { + "epoch": 3.320177204993959, + "grad_norm": 0.23991670470326582, + "learning_rate": 1.2195743726574869e-05, + "loss": 0.2056, + "step": 2061 + }, + { + "epoch": 3.3217881594844947, + "grad_norm": 0.23272961485583524, + "learning_rate": 1.2175013610930253e-05, + "loss": 0.2045, + "step": 2062 + }, + { + "epoch": 3.32339911397503, + "grad_norm": 0.2549934176914694, + "learning_rate": 1.2154293416729606e-05, + "loss": 0.2088, + "step": 2063 + }, + { + "epoch": 3.325010068465566, + "grad_norm": 0.2627470044380523, + "learning_rate": 1.2133583170244422e-05, + "loss": 0.2154, + "step": 2064 + }, + { + "epoch": 3.3266210229561013, + "grad_norm": 0.2418643009081854, + "learning_rate": 1.2112882897733634e-05, + "loss": 0.1826, + "step": 2065 + }, + { + "epoch": 3.328231977446637, + "grad_norm": 0.23964309123146355, + "learning_rate": 1.2092192625443469e-05, + "loss": 0.1956, + "step": 2066 + }, + { + "epoch": 3.329842931937173, + "grad_norm": 0.26877059633483413, + "learning_rate": 1.2071512379607515e-05, + "loss": 0.2125, + "step": 2067 + }, + { + "epoch": 3.3314538864277083, + "grad_norm": 0.2718834091071389, + "learning_rate": 1.2050842186446636e-05, + "loss": 0.1953, + "step": 2068 + }, + { + "epoch": 3.333064840918244, + "grad_norm": 0.24287090855325613, + "learning_rate": 1.2030182072168957e-05, + "loss": 0.1866, + "step": 2069 + }, + { + "epoch": 3.3346757954087796, + "grad_norm": 0.25823157441435457, + "learning_rate": 1.2009532062969801e-05, + "loss": 0.2084, + "step": 2070 + }, + { + "epoch": 3.3362867498993154, + "grad_norm": 0.239590496772935, + "learning_rate": 1.1988892185031697e-05, + "loss": 0.1934, + "step": 2071 + }, + { + "epoch": 3.3378977043898512, + "grad_norm": 0.25170794325761425, + "learning_rate": 1.1968262464524333e-05, + "loss": 0.2099, + "step": 2072 + }, + { + "epoch": 3.3395086588803866, + "grad_norm": 0.2774141741661976, + "learning_rate": 1.1947642927604507e-05, + "loss": 0.1951, + "step": 2073 + }, + { + "epoch": 3.3411196133709224, + "grad_norm": 0.25992979182223386, + "learning_rate": 1.1927033600416113e-05, + "loss": 0.1895, + "step": 2074 + }, + { + "epoch": 3.342730567861458, + "grad_norm": 0.2676459769274524, + "learning_rate": 1.190643450909008e-05, + "loss": 0.2062, + "step": 2075 + }, + { + "epoch": 3.3443415223519937, + "grad_norm": 0.24491077877038162, + "learning_rate": 1.1885845679744396e-05, + "loss": 0.1842, + "step": 2076 + }, + { + "epoch": 3.345952476842529, + "grad_norm": 0.2573204990604907, + "learning_rate": 1.1865267138484e-05, + "loss": 0.2252, + "step": 2077 + }, + { + "epoch": 3.347563431333065, + "grad_norm": 0.2488993418599943, + "learning_rate": 1.1844698911400805e-05, + "loss": 0.2058, + "step": 2078 + }, + { + "epoch": 3.3491743858236003, + "grad_norm": 0.2221579767832753, + "learning_rate": 1.1824141024573647e-05, + "loss": 0.2012, + "step": 2079 + }, + { + "epoch": 3.350785340314136, + "grad_norm": 0.25173011551503893, + "learning_rate": 1.1803593504068256e-05, + "loss": 0.2202, + "step": 2080 + }, + { + "epoch": 3.352396294804672, + "grad_norm": 0.2403210358197002, + "learning_rate": 1.1783056375937193e-05, + "loss": 0.2103, + "step": 2081 + }, + { + "epoch": 3.3540072492952073, + "grad_norm": 0.23783146032565713, + "learning_rate": 1.1762529666219869e-05, + "loss": 0.2067, + "step": 2082 + }, + { + "epoch": 3.355618203785743, + "grad_norm": 0.2423185833751914, + "learning_rate": 1.174201340094248e-05, + "loss": 0.2043, + "step": 2083 + }, + { + "epoch": 3.3572291582762785, + "grad_norm": 0.24591036904977595, + "learning_rate": 1.172150760611797e-05, + "loss": 0.1968, + "step": 2084 + }, + { + "epoch": 3.3588401127668144, + "grad_norm": 0.23724633382039845, + "learning_rate": 1.1701012307746021e-05, + "loss": 0.2087, + "step": 2085 + }, + { + "epoch": 3.36045106725735, + "grad_norm": 0.24067309607867643, + "learning_rate": 1.168052753181298e-05, + "loss": 0.2029, + "step": 2086 + }, + { + "epoch": 3.3620620217478856, + "grad_norm": 0.24166724371741746, + "learning_rate": 1.1660053304291894e-05, + "loss": 0.1864, + "step": 2087 + }, + { + "epoch": 3.3636729762384214, + "grad_norm": 0.24147797064255336, + "learning_rate": 1.16395896511424e-05, + "loss": 0.2112, + "step": 2088 + }, + { + "epoch": 3.365283930728957, + "grad_norm": 0.24098212734239127, + "learning_rate": 1.1619136598310725e-05, + "loss": 0.2045, + "step": 2089 + }, + { + "epoch": 3.3668948852194927, + "grad_norm": 0.25019441770575873, + "learning_rate": 1.1598694171729703e-05, + "loss": 0.211, + "step": 2090 + }, + { + "epoch": 3.368505839710028, + "grad_norm": 0.2365678451068785, + "learning_rate": 1.1578262397318642e-05, + "loss": 0.1937, + "step": 2091 + }, + { + "epoch": 3.370116794200564, + "grad_norm": 0.23164431406226096, + "learning_rate": 1.1557841300983363e-05, + "loss": 0.1858, + "step": 2092 + }, + { + "epoch": 3.3717277486910993, + "grad_norm": 0.24563176728263572, + "learning_rate": 1.1537430908616152e-05, + "loss": 0.2159, + "step": 2093 + }, + { + "epoch": 3.373338703181635, + "grad_norm": 0.23552543012415264, + "learning_rate": 1.151703124609573e-05, + "loss": 0.2163, + "step": 2094 + }, + { + "epoch": 3.374949657672171, + "grad_norm": 0.23518558644454512, + "learning_rate": 1.1496642339287191e-05, + "loss": 0.2063, + "step": 2095 + }, + { + "epoch": 3.3765606121627063, + "grad_norm": 0.2441080853367346, + "learning_rate": 1.1476264214042004e-05, + "loss": 0.2042, + "step": 2096 + }, + { + "epoch": 3.378171566653242, + "grad_norm": 0.451665532949196, + "learning_rate": 1.1455896896197974e-05, + "loss": 0.2149, + "step": 2097 + }, + { + "epoch": 3.3797825211437775, + "grad_norm": 0.25128065966750873, + "learning_rate": 1.1435540411579203e-05, + "loss": 0.2113, + "step": 2098 + }, + { + "epoch": 3.3813934756343134, + "grad_norm": 0.2225475197951388, + "learning_rate": 1.1415194785996045e-05, + "loss": 0.1594, + "step": 2099 + }, + { + "epoch": 3.383004430124849, + "grad_norm": 0.2617602545629556, + "learning_rate": 1.1394860045245084e-05, + "loss": 0.229, + "step": 2100 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 0.24500084267347086, + "learning_rate": 1.137453621510912e-05, + "loss": 0.2098, + "step": 2101 + }, + { + "epoch": 3.3862263391059204, + "grad_norm": 0.23416531343487373, + "learning_rate": 1.1354223321357119e-05, + "loss": 0.2084, + "step": 2102 + }, + { + "epoch": 3.387837293596456, + "grad_norm": 0.22091357857993194, + "learning_rate": 1.1333921389744153e-05, + "loss": 0.1967, + "step": 2103 + }, + { + "epoch": 3.3894482480869916, + "grad_norm": 0.2511364869911767, + "learning_rate": 1.1313630446011427e-05, + "loss": 0.216, + "step": 2104 + }, + { + "epoch": 3.391059202577527, + "grad_norm": 0.23267127034952573, + "learning_rate": 1.1293350515886203e-05, + "loss": 0.2022, + "step": 2105 + }, + { + "epoch": 3.392670157068063, + "grad_norm": 0.23555886598588704, + "learning_rate": 1.1273081625081777e-05, + "loss": 0.1995, + "step": 2106 + }, + { + "epoch": 3.3942811115585982, + "grad_norm": 0.23868806139569818, + "learning_rate": 1.1252823799297433e-05, + "loss": 0.1773, + "step": 2107 + }, + { + "epoch": 3.395892066049134, + "grad_norm": 0.27452473674180444, + "learning_rate": 1.123257706421845e-05, + "loss": 0.2199, + "step": 2108 + }, + { + "epoch": 3.39750302053967, + "grad_norm": 0.2394943226611453, + "learning_rate": 1.1212341445516043e-05, + "loss": 0.1981, + "step": 2109 + }, + { + "epoch": 3.3991139750302053, + "grad_norm": 0.24612894971879087, + "learning_rate": 1.1192116968847313e-05, + "loss": 0.1968, + "step": 2110 + }, + { + "epoch": 3.400724929520741, + "grad_norm": 0.25883844195430433, + "learning_rate": 1.1171903659855256e-05, + "loss": 0.1949, + "step": 2111 + }, + { + "epoch": 3.4023358840112765, + "grad_norm": 0.23984551387483893, + "learning_rate": 1.1151701544168685e-05, + "loss": 0.1993, + "step": 2112 + }, + { + "epoch": 3.4039468385018123, + "grad_norm": 0.23016178489654152, + "learning_rate": 1.1131510647402246e-05, + "loss": 0.1948, + "step": 2113 + }, + { + "epoch": 3.405557792992348, + "grad_norm": 0.2631642324887254, + "learning_rate": 1.1111330995156335e-05, + "loss": 0.1918, + "step": 2114 + }, + { + "epoch": 3.4071687474828836, + "grad_norm": 0.2506220508839031, + "learning_rate": 1.1091162613017113e-05, + "loss": 0.2062, + "step": 2115 + }, + { + "epoch": 3.4087797019734194, + "grad_norm": 0.25264234683049214, + "learning_rate": 1.1071005526556444e-05, + "loss": 0.2141, + "step": 2116 + }, + { + "epoch": 3.410390656463955, + "grad_norm": 0.25278773105827357, + "learning_rate": 1.1050859761331867e-05, + "loss": 0.1915, + "step": 2117 + }, + { + "epoch": 3.4120016109544906, + "grad_norm": 0.23224228492195387, + "learning_rate": 1.1030725342886556e-05, + "loss": 0.1938, + "step": 2118 + }, + { + "epoch": 3.413612565445026, + "grad_norm": 0.2142415135543403, + "learning_rate": 1.101060229674932e-05, + "loss": 0.1936, + "step": 2119 + }, + { + "epoch": 3.415223519935562, + "grad_norm": 0.23554798219735495, + "learning_rate": 1.0990490648434541e-05, + "loss": 0.1969, + "step": 2120 + }, + { + "epoch": 3.4168344744260972, + "grad_norm": 0.26354880573854594, + "learning_rate": 1.0970390423442142e-05, + "loss": 0.2096, + "step": 2121 + }, + { + "epoch": 3.418445428916633, + "grad_norm": 0.24440684389786652, + "learning_rate": 1.0950301647257572e-05, + "loss": 0.2051, + "step": 2122 + }, + { + "epoch": 3.420056383407169, + "grad_norm": 0.2552755780564851, + "learning_rate": 1.0930224345351751e-05, + "loss": 0.2117, + "step": 2123 + }, + { + "epoch": 3.4216673378977043, + "grad_norm": 0.29304190170396727, + "learning_rate": 1.0910158543181073e-05, + "loss": 0.1947, + "step": 2124 + }, + { + "epoch": 3.42327829238824, + "grad_norm": 0.2458155097174551, + "learning_rate": 1.089010426618732e-05, + "loss": 0.213, + "step": 2125 + }, + { + "epoch": 3.4248892468787755, + "grad_norm": 0.2462846208675487, + "learning_rate": 1.0870061539797696e-05, + "loss": 0.1995, + "step": 2126 + }, + { + "epoch": 3.4265002013693113, + "grad_norm": 0.2367304617090815, + "learning_rate": 1.0850030389424724e-05, + "loss": 0.2052, + "step": 2127 + }, + { + "epoch": 3.428111155859847, + "grad_norm": 0.24464716596958344, + "learning_rate": 1.0830010840466282e-05, + "loss": 0.2172, + "step": 2128 + }, + { + "epoch": 3.4297221103503825, + "grad_norm": 0.24560443043574962, + "learning_rate": 1.0810002918305508e-05, + "loss": 0.2054, + "step": 2129 + }, + { + "epoch": 3.4313330648409184, + "grad_norm": 0.24409525550351965, + "learning_rate": 1.0790006648310828e-05, + "loss": 0.2041, + "step": 2130 + }, + { + "epoch": 3.4329440193314538, + "grad_norm": 0.23782893656758072, + "learning_rate": 1.0770022055835882e-05, + "loss": 0.2036, + "step": 2131 + }, + { + "epoch": 3.4345549738219896, + "grad_norm": 0.23124086461460552, + "learning_rate": 1.075004916621949e-05, + "loss": 0.199, + "step": 2132 + }, + { + "epoch": 3.436165928312525, + "grad_norm": 0.2438499354960223, + "learning_rate": 1.073008800478566e-05, + "loss": 0.1827, + "step": 2133 + }, + { + "epoch": 3.437776882803061, + "grad_norm": 0.23923418585538098, + "learning_rate": 1.0710138596843494e-05, + "loss": 0.2235, + "step": 2134 + }, + { + "epoch": 3.439387837293596, + "grad_norm": 0.23302741192753726, + "learning_rate": 1.0690200967687234e-05, + "loss": 0.1979, + "step": 2135 + }, + { + "epoch": 3.440998791784132, + "grad_norm": 0.23649073229436382, + "learning_rate": 1.0670275142596154e-05, + "loss": 0.2074, + "step": 2136 + }, + { + "epoch": 3.442609746274668, + "grad_norm": 0.23258425943151656, + "learning_rate": 1.065036114683458e-05, + "loss": 0.2043, + "step": 2137 + }, + { + "epoch": 3.4442207007652033, + "grad_norm": 0.23498177622326072, + "learning_rate": 1.0630459005651823e-05, + "loss": 0.2005, + "step": 2138 + }, + { + "epoch": 3.445831655255739, + "grad_norm": 0.26706198990505337, + "learning_rate": 1.061056874428219e-05, + "loss": 0.2026, + "step": 2139 + }, + { + "epoch": 3.4474426097462745, + "grad_norm": 0.23061635890286963, + "learning_rate": 1.059069038794489e-05, + "loss": 0.214, + "step": 2140 + }, + { + "epoch": 3.4490535642368103, + "grad_norm": 0.22470610502390026, + "learning_rate": 1.0570823961844065e-05, + "loss": 0.1919, + "step": 2141 + }, + { + "epoch": 3.450664518727346, + "grad_norm": 0.24074853759574122, + "learning_rate": 1.055096949116873e-05, + "loss": 0.2134, + "step": 2142 + }, + { + "epoch": 3.4522754732178815, + "grad_norm": 0.238304166701547, + "learning_rate": 1.0531127001092715e-05, + "loss": 0.2032, + "step": 2143 + }, + { + "epoch": 3.4538864277084174, + "grad_norm": 0.22940631731584865, + "learning_rate": 1.0511296516774699e-05, + "loss": 0.1904, + "step": 2144 + }, + { + "epoch": 3.4554973821989527, + "grad_norm": 0.24654104415440423, + "learning_rate": 1.0491478063358096e-05, + "loss": 0.2021, + "step": 2145 + }, + { + "epoch": 3.4571083366894886, + "grad_norm": 0.26612320358388847, + "learning_rate": 1.0471671665971104e-05, + "loss": 0.2139, + "step": 2146 + }, + { + "epoch": 3.4587192911800244, + "grad_norm": 0.40048278104943835, + "learning_rate": 1.0451877349726605e-05, + "loss": 0.2229, + "step": 2147 + }, + { + "epoch": 3.46033024567056, + "grad_norm": 0.23711292098323433, + "learning_rate": 1.0432095139722187e-05, + "loss": 0.1839, + "step": 2148 + }, + { + "epoch": 3.4619412001610956, + "grad_norm": 0.26048839006315294, + "learning_rate": 1.0412325061040063e-05, + "loss": 0.1968, + "step": 2149 + }, + { + "epoch": 3.463552154651631, + "grad_norm": 0.2326769954633612, + "learning_rate": 1.0392567138747101e-05, + "loss": 0.2178, + "step": 2150 + }, + { + "epoch": 3.465163109142167, + "grad_norm": 0.25045178314044103, + "learning_rate": 1.0372821397894709e-05, + "loss": 0.1879, + "step": 2151 + }, + { + "epoch": 3.4667740636327022, + "grad_norm": 0.23568688727213294, + "learning_rate": 1.0353087863518894e-05, + "loss": 0.1919, + "step": 2152 + }, + { + "epoch": 3.468385018123238, + "grad_norm": 0.2412245199550461, + "learning_rate": 1.0333366560640151e-05, + "loss": 0.1995, + "step": 2153 + }, + { + "epoch": 3.4699959726137735, + "grad_norm": 0.2634416881896074, + "learning_rate": 1.0313657514263492e-05, + "loss": 0.2353, + "step": 2154 + }, + { + "epoch": 3.4716069271043093, + "grad_norm": 0.22072632069118556, + "learning_rate": 1.0293960749378384e-05, + "loss": 0.1939, + "step": 2155 + }, + { + "epoch": 3.473217881594845, + "grad_norm": 0.23393038614986122, + "learning_rate": 1.0274276290958701e-05, + "loss": 0.2051, + "step": 2156 + }, + { + "epoch": 3.4748288360853805, + "grad_norm": 0.23009879755108045, + "learning_rate": 1.0254604163962747e-05, + "loss": 0.1869, + "step": 2157 + }, + { + "epoch": 3.4764397905759163, + "grad_norm": 0.24631286359057225, + "learning_rate": 1.0234944393333155e-05, + "loss": 0.2072, + "step": 2158 + }, + { + "epoch": 3.4780507450664517, + "grad_norm": 0.2396424841750826, + "learning_rate": 1.0215297003996927e-05, + "loss": 0.1985, + "step": 2159 + }, + { + "epoch": 3.4796616995569876, + "grad_norm": 0.2229192233390955, + "learning_rate": 1.0195662020865333e-05, + "loss": 0.1877, + "step": 2160 + }, + { + "epoch": 3.4812726540475234, + "grad_norm": 0.27640686696076017, + "learning_rate": 1.0176039468833938e-05, + "loss": 0.2072, + "step": 2161 + }, + { + "epoch": 3.482883608538059, + "grad_norm": 0.2501269581468127, + "learning_rate": 1.015642937278254e-05, + "loss": 0.2093, + "step": 2162 + }, + { + "epoch": 3.4844945630285946, + "grad_norm": 0.23553356609239753, + "learning_rate": 1.0136831757575134e-05, + "loss": 0.2097, + "step": 2163 + }, + { + "epoch": 3.48610551751913, + "grad_norm": 0.2513360823413565, + "learning_rate": 1.0117246648059888e-05, + "loss": 0.2027, + "step": 2164 + }, + { + "epoch": 3.487716472009666, + "grad_norm": 0.25559246873602354, + "learning_rate": 1.0097674069069132e-05, + "loss": 0.2042, + "step": 2165 + }, + { + "epoch": 3.4893274265002012, + "grad_norm": 0.24381340477783897, + "learning_rate": 1.0078114045419305e-05, + "loss": 0.2193, + "step": 2166 + }, + { + "epoch": 3.490938380990737, + "grad_norm": 0.23737314305360308, + "learning_rate": 1.0058566601910903e-05, + "loss": 0.2057, + "step": 2167 + }, + { + "epoch": 3.4925493354812724, + "grad_norm": 0.24161527730895105, + "learning_rate": 1.0039031763328506e-05, + "loss": 0.1997, + "step": 2168 + }, + { + "epoch": 3.4941602899718083, + "grad_norm": 0.24433075314270097, + "learning_rate": 1.0019509554440678e-05, + "loss": 0.2156, + "step": 2169 + }, + { + "epoch": 3.495771244462344, + "grad_norm": 0.2280058138376306, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.1945, + "step": 2170 + }, + { + "epoch": 3.4973821989528795, + "grad_norm": 0.24779688243659267, + "learning_rate": 9.980503124742988e-06, + "loss": 0.2093, + "step": 2171 + }, + { + "epoch": 3.4989931534434153, + "grad_norm": 0.2428801961690144, + "learning_rate": 9.961018953390086e-06, + "loss": 0.2065, + "step": 2172 + }, + { + "epoch": 3.5006041079339507, + "grad_norm": 0.2219328596608684, + "learning_rate": 9.941547510645647e-06, + "loss": 0.1885, + "step": 2173 + }, + { + "epoch": 3.5022150624244865, + "grad_norm": 0.2267440935335129, + "learning_rate": 9.922088821197854e-06, + "loss": 0.2089, + "step": 2174 + }, + { + "epoch": 3.5038260169150224, + "grad_norm": 0.24159131404047698, + "learning_rate": 9.902642909718737e-06, + "loss": 0.2101, + "step": 2175 + }, + { + "epoch": 3.5054369714055578, + "grad_norm": 0.23555256415137021, + "learning_rate": 9.88320980086413e-06, + "loss": 0.2021, + "step": 2176 + }, + { + "epoch": 3.5070479258960936, + "grad_norm": 0.2274219679214926, + "learning_rate": 9.863789519273638e-06, + "loss": 0.1952, + "step": 2177 + }, + { + "epoch": 3.508658880386629, + "grad_norm": 0.23187334643843013, + "learning_rate": 9.844382089570584e-06, + "loss": 0.2084, + "step": 2178 + }, + { + "epoch": 3.510269834877165, + "grad_norm": 0.22771801310950746, + "learning_rate": 9.824987536361992e-06, + "loss": 0.1949, + "step": 2179 + }, + { + "epoch": 3.5118807893677, + "grad_norm": 0.22370527590647052, + "learning_rate": 9.805605884238587e-06, + "loss": 0.205, + "step": 2180 + }, + { + "epoch": 3.513491743858236, + "grad_norm": 0.23845020170094708, + "learning_rate": 9.786237157774726e-06, + "loss": 0.2032, + "step": 2181 + }, + { + "epoch": 3.5151026983487714, + "grad_norm": 0.2283710304261633, + "learning_rate": 9.766881381528357e-06, + "loss": 0.2032, + "step": 2182 + }, + { + "epoch": 3.5167136528393073, + "grad_norm": 0.2378550286458637, + "learning_rate": 9.747538580041034e-06, + "loss": 0.2106, + "step": 2183 + }, + { + "epoch": 3.518324607329843, + "grad_norm": 0.22430006122712037, + "learning_rate": 9.728208777837858e-06, + "loss": 0.2024, + "step": 2184 + }, + { + "epoch": 3.5199355618203785, + "grad_norm": 0.22469949464520347, + "learning_rate": 9.70889199942743e-06, + "loss": 0.2076, + "step": 2185 + }, + { + "epoch": 3.5215465163109143, + "grad_norm": 0.2416906697350346, + "learning_rate": 9.689588269301842e-06, + "loss": 0.2171, + "step": 2186 + }, + { + "epoch": 3.5231574708014497, + "grad_norm": 0.22087168616579303, + "learning_rate": 9.670297611936657e-06, + "loss": 0.2009, + "step": 2187 + }, + { + "epoch": 3.5247684252919855, + "grad_norm": 0.23173269867776017, + "learning_rate": 9.651020051790858e-06, + "loss": 0.2073, + "step": 2188 + }, + { + "epoch": 3.5263793797825214, + "grad_norm": 0.2277755728185813, + "learning_rate": 9.631755613306814e-06, + "loss": 0.2008, + "step": 2189 + }, + { + "epoch": 3.5279903342730567, + "grad_norm": 0.22892461332859246, + "learning_rate": 9.612504320910249e-06, + "loss": 0.22, + "step": 2190 + }, + { + "epoch": 3.5296012887635926, + "grad_norm": 0.22441902310723352, + "learning_rate": 9.59326619901024e-06, + "loss": 0.2079, + "step": 2191 + }, + { + "epoch": 3.531212243254128, + "grad_norm": 0.2331194408439935, + "learning_rate": 9.574041271999163e-06, + "loss": 0.2019, + "step": 2192 + }, + { + "epoch": 3.532823197744664, + "grad_norm": 0.2317551080792225, + "learning_rate": 9.55482956425264e-06, + "loss": 0.2088, + "step": 2193 + }, + { + "epoch": 3.534434152235199, + "grad_norm": 0.24198116715159212, + "learning_rate": 9.535631100129556e-06, + "loss": 0.2047, + "step": 2194 + }, + { + "epoch": 3.536045106725735, + "grad_norm": 0.2404786868991303, + "learning_rate": 9.516445903972005e-06, + "loss": 0.203, + "step": 2195 + }, + { + "epoch": 3.5376560612162704, + "grad_norm": 0.21582358266853396, + "learning_rate": 9.497274000105239e-06, + "loss": 0.1868, + "step": 2196 + }, + { + "epoch": 3.5392670157068062, + "grad_norm": 0.24330479425323304, + "learning_rate": 9.478115412837661e-06, + "loss": 0.2046, + "step": 2197 + }, + { + "epoch": 3.540877970197342, + "grad_norm": 0.2290973058402262, + "learning_rate": 9.458970166460804e-06, + "loss": 0.1997, + "step": 2198 + }, + { + "epoch": 3.5424889246878775, + "grad_norm": 0.24487357902072016, + "learning_rate": 9.439838285249285e-06, + "loss": 0.2109, + "step": 2199 + }, + { + "epoch": 3.5440998791784133, + "grad_norm": 0.23275375857106345, + "learning_rate": 9.420719793460758e-06, + "loss": 0.2045, + "step": 2200 + }, + { + "epoch": 3.5457108336689487, + "grad_norm": 0.23599591116541208, + "learning_rate": 9.401614715335905e-06, + "loss": 0.1868, + "step": 2201 + }, + { + "epoch": 3.5473217881594845, + "grad_norm": 0.23362100236229594, + "learning_rate": 9.38252307509841e-06, + "loss": 0.193, + "step": 2202 + }, + { + "epoch": 3.5489327426500203, + "grad_norm": 0.24650562505944937, + "learning_rate": 9.363444896954927e-06, + "loss": 0.2109, + "step": 2203 + }, + { + "epoch": 3.5505436971405557, + "grad_norm": 0.23856000850206951, + "learning_rate": 9.344380205095017e-06, + "loss": 0.1888, + "step": 2204 + }, + { + "epoch": 3.5521546516310916, + "grad_norm": 0.23064058859091333, + "learning_rate": 9.325329023691137e-06, + "loss": 0.2069, + "step": 2205 + }, + { + "epoch": 3.553765606121627, + "grad_norm": 0.2318135856582349, + "learning_rate": 9.30629137689866e-06, + "loss": 0.2023, + "step": 2206 + }, + { + "epoch": 3.555376560612163, + "grad_norm": 0.24342035330502437, + "learning_rate": 9.287267288855756e-06, + "loss": 0.2067, + "step": 2207 + }, + { + "epoch": 3.5569875151026986, + "grad_norm": 0.25166127958235246, + "learning_rate": 9.268256783683408e-06, + "loss": 0.1903, + "step": 2208 + }, + { + "epoch": 3.558598469593234, + "grad_norm": 0.22256962504013172, + "learning_rate": 9.24925988548539e-06, + "loss": 0.1935, + "step": 2209 + }, + { + "epoch": 3.5602094240837694, + "grad_norm": 0.28219584567539696, + "learning_rate": 9.230276618348224e-06, + "loss": 0.2108, + "step": 2210 + }, + { + "epoch": 3.5618203785743052, + "grad_norm": 0.24218404180288422, + "learning_rate": 9.21130700634114e-06, + "loss": 0.2002, + "step": 2211 + }, + { + "epoch": 3.563431333064841, + "grad_norm": 0.24262274443668091, + "learning_rate": 9.192351073516047e-06, + "loss": 0.1875, + "step": 2212 + }, + { + "epoch": 3.5650422875553764, + "grad_norm": 0.2364784795121163, + "learning_rate": 9.173408843907529e-06, + "loss": 0.1873, + "step": 2213 + }, + { + "epoch": 3.5666532420459123, + "grad_norm": 0.2317621350392338, + "learning_rate": 9.154480341532797e-06, + "loss": 0.2008, + "step": 2214 + }, + { + "epoch": 3.5682641965364477, + "grad_norm": 0.24422946032063308, + "learning_rate": 9.135565590391633e-06, + "loss": 0.2241, + "step": 2215 + }, + { + "epoch": 3.5698751510269835, + "grad_norm": 0.23009545062501383, + "learning_rate": 9.116664614466386e-06, + "loss": 0.2057, + "step": 2216 + }, + { + "epoch": 3.5714861055175193, + "grad_norm": 0.23204535283854022, + "learning_rate": 9.097777437721982e-06, + "loss": 0.1953, + "step": 2217 + }, + { + "epoch": 3.5730970600080547, + "grad_norm": 0.24756670044336804, + "learning_rate": 9.078904084105802e-06, + "loss": 0.2071, + "step": 2218 + }, + { + "epoch": 3.5747080144985905, + "grad_norm": 0.22696668780158338, + "learning_rate": 9.060044577547711e-06, + "loss": 0.197, + "step": 2219 + }, + { + "epoch": 3.576318968989126, + "grad_norm": 0.21813119741031062, + "learning_rate": 9.04119894196003e-06, + "loss": 0.1943, + "step": 2220 + }, + { + "epoch": 3.5779299234796618, + "grad_norm": 0.24547436858026264, + "learning_rate": 9.022367201237495e-06, + "loss": 0.1908, + "step": 2221 + }, + { + "epoch": 3.5795408779701976, + "grad_norm": 0.24906258689712765, + "learning_rate": 9.00354937925721e-06, + "loss": 0.2125, + "step": 2222 + }, + { + "epoch": 3.581151832460733, + "grad_norm": 0.2358356210611667, + "learning_rate": 8.98474549987862e-06, + "loss": 0.2037, + "step": 2223 + }, + { + "epoch": 3.5827627869512684, + "grad_norm": 0.23154346550342686, + "learning_rate": 8.965955586943538e-06, + "loss": 0.1915, + "step": 2224 + }, + { + "epoch": 3.584373741441804, + "grad_norm": 0.23932511820741448, + "learning_rate": 8.947179664276028e-06, + "loss": 0.2049, + "step": 2225 + }, + { + "epoch": 3.58598469593234, + "grad_norm": 0.216916084634077, + "learning_rate": 8.928417755682416e-06, + "loss": 0.1896, + "step": 2226 + }, + { + "epoch": 3.5875956504228754, + "grad_norm": 0.2200896036801776, + "learning_rate": 8.909669884951284e-06, + "loss": 0.2, + "step": 2227 + }, + { + "epoch": 3.5892066049134113, + "grad_norm": 0.2485175178385473, + "learning_rate": 8.890936075853403e-06, + "loss": 0.2034, + "step": 2228 + }, + { + "epoch": 3.5908175594039466, + "grad_norm": 0.2178344656723329, + "learning_rate": 8.87221635214171e-06, + "loss": 0.1798, + "step": 2229 + }, + { + "epoch": 3.5924285138944825, + "grad_norm": 0.2302593584653443, + "learning_rate": 8.853510737551274e-06, + "loss": 0.2048, + "step": 2230 + }, + { + "epoch": 3.5940394683850183, + "grad_norm": 0.2195145608319814, + "learning_rate": 8.8348192557993e-06, + "loss": 0.189, + "step": 2231 + }, + { + "epoch": 3.5956504228755537, + "grad_norm": 0.2384324191704394, + "learning_rate": 8.816141930585067e-06, + "loss": 0.191, + "step": 2232 + }, + { + "epoch": 3.5972613773660895, + "grad_norm": 0.27318029242884734, + "learning_rate": 8.79747878558989e-06, + "loss": 0.2054, + "step": 2233 + }, + { + "epoch": 3.598872331856625, + "grad_norm": 0.2443948830306952, + "learning_rate": 8.778829844477099e-06, + "loss": 0.1963, + "step": 2234 + }, + { + "epoch": 3.6004832863471607, + "grad_norm": 0.24789244954904005, + "learning_rate": 8.76019513089206e-06, + "loss": 0.2092, + "step": 2235 + }, + { + "epoch": 3.6020942408376966, + "grad_norm": 0.24650887122937143, + "learning_rate": 8.741574668462053e-06, + "loss": 0.2046, + "step": 2236 + }, + { + "epoch": 3.603705195328232, + "grad_norm": 0.23165112489745265, + "learning_rate": 8.722968480796294e-06, + "loss": 0.1855, + "step": 2237 + }, + { + "epoch": 3.6053161498187674, + "grad_norm": 0.2576959078546253, + "learning_rate": 8.704376591485923e-06, + "loss": 0.2059, + "step": 2238 + }, + { + "epoch": 3.606927104309303, + "grad_norm": 0.23768867314811368, + "learning_rate": 8.685799024103942e-06, + "loss": 0.1906, + "step": 2239 + }, + { + "epoch": 3.608538058799839, + "grad_norm": 0.2395303763373646, + "learning_rate": 8.667235802205183e-06, + "loss": 0.1901, + "step": 2240 + }, + { + "epoch": 3.6101490132903744, + "grad_norm": 0.2413793337525353, + "learning_rate": 8.648686949326286e-06, + "loss": 0.1961, + "step": 2241 + }, + { + "epoch": 3.6117599677809102, + "grad_norm": 0.23591510967044874, + "learning_rate": 8.630152488985694e-06, + "loss": 0.2082, + "step": 2242 + }, + { + "epoch": 3.6133709222714456, + "grad_norm": 0.2368705104725338, + "learning_rate": 8.611632444683595e-06, + "loss": 0.2016, + "step": 2243 + }, + { + "epoch": 3.6149818767619815, + "grad_norm": 0.22430359624959098, + "learning_rate": 8.593126839901886e-06, + "loss": 0.2003, + "step": 2244 + }, + { + "epoch": 3.6165928312525173, + "grad_norm": 0.2328209779003292, + "learning_rate": 8.57463569810415e-06, + "loss": 0.2193, + "step": 2245 + }, + { + "epoch": 3.6182037857430527, + "grad_norm": 0.21765260530670116, + "learning_rate": 8.556159042735672e-06, + "loss": 0.1884, + "step": 2246 + }, + { + "epoch": 3.6198147402335885, + "grad_norm": 0.2373213750878368, + "learning_rate": 8.537696897223331e-06, + "loss": 0.1916, + "step": 2247 + }, + { + "epoch": 3.621425694724124, + "grad_norm": 0.23641497550916923, + "learning_rate": 8.519249284975611e-06, + "loss": 0.2159, + "step": 2248 + }, + { + "epoch": 3.6230366492146597, + "grad_norm": 0.23628799850862556, + "learning_rate": 8.500816229382584e-06, + "loss": 0.2091, + "step": 2249 + }, + { + "epoch": 3.6246476037051956, + "grad_norm": 0.2234944894492603, + "learning_rate": 8.482397753815872e-06, + "loss": 0.2012, + "step": 2250 + }, + { + "epoch": 3.626258558195731, + "grad_norm": 0.21626870143948573, + "learning_rate": 8.463993881628586e-06, + "loss": 0.1922, + "step": 2251 + }, + { + "epoch": 3.6278695126862663, + "grad_norm": 0.23755306979760044, + "learning_rate": 8.445604636155328e-06, + "loss": 0.2078, + "step": 2252 + }, + { + "epoch": 3.629480467176802, + "grad_norm": 0.24117810761942085, + "learning_rate": 8.427230040712171e-06, + "loss": 0.2085, + "step": 2253 + }, + { + "epoch": 3.631091421667338, + "grad_norm": 0.22854665517211442, + "learning_rate": 8.408870118596606e-06, + "loss": 0.1999, + "step": 2254 + }, + { + "epoch": 3.6327023761578734, + "grad_norm": 0.22293728621187936, + "learning_rate": 8.390524893087505e-06, + "loss": 0.1836, + "step": 2255 + }, + { + "epoch": 3.6343133306484092, + "grad_norm": 0.22707952398983391, + "learning_rate": 8.37219438744511e-06, + "loss": 0.1859, + "step": 2256 + }, + { + "epoch": 3.6359242851389446, + "grad_norm": 0.23511966018985556, + "learning_rate": 8.353878624911026e-06, + "loss": 0.2185, + "step": 2257 + }, + { + "epoch": 3.6375352396294804, + "grad_norm": 0.22703099897283688, + "learning_rate": 8.335577628708137e-06, + "loss": 0.1927, + "step": 2258 + }, + { + "epoch": 3.6391461941200163, + "grad_norm": 0.25017536611561936, + "learning_rate": 8.317291422040601e-06, + "loss": 0.2087, + "step": 2259 + }, + { + "epoch": 3.6407571486105517, + "grad_norm": 0.23049249283018605, + "learning_rate": 8.299020028093844e-06, + "loss": 0.202, + "step": 2260 + }, + { + "epoch": 3.6423681031010875, + "grad_norm": 0.2271488606251097, + "learning_rate": 8.28076347003451e-06, + "loss": 0.1997, + "step": 2261 + }, + { + "epoch": 3.643979057591623, + "grad_norm": 0.23883238541566312, + "learning_rate": 8.262521771010417e-06, + "loss": 0.2128, + "step": 2262 + }, + { + "epoch": 3.6455900120821587, + "grad_norm": 0.22315904417254107, + "learning_rate": 8.24429495415054e-06, + "loss": 0.1997, + "step": 2263 + }, + { + "epoch": 3.6472009665726945, + "grad_norm": 0.2234185102011281, + "learning_rate": 8.226083042565006e-06, + "loss": 0.2003, + "step": 2264 + }, + { + "epoch": 3.64881192106323, + "grad_norm": 0.23225689520155557, + "learning_rate": 8.207886059345034e-06, + "loss": 0.1995, + "step": 2265 + }, + { + "epoch": 3.6504228755537658, + "grad_norm": 0.23187295084042797, + "learning_rate": 8.189704027562913e-06, + "loss": 0.1775, + "step": 2266 + }, + { + "epoch": 3.652033830044301, + "grad_norm": 0.23529056309699636, + "learning_rate": 8.171536970271963e-06, + "loss": 0.1999, + "step": 2267 + }, + { + "epoch": 3.653644784534837, + "grad_norm": 0.23236799884409995, + "learning_rate": 8.153384910506539e-06, + "loss": 0.1871, + "step": 2268 + }, + { + "epoch": 3.6552557390253724, + "grad_norm": 0.23485228349145965, + "learning_rate": 8.135247871281977e-06, + "loss": 0.2062, + "step": 2269 + }, + { + "epoch": 3.656866693515908, + "grad_norm": 0.22646757698120465, + "learning_rate": 8.11712587559455e-06, + "loss": 0.1832, + "step": 2270 + }, + { + "epoch": 3.6584776480064436, + "grad_norm": 0.2514790956885834, + "learning_rate": 8.099018946421473e-06, + "loss": 0.2196, + "step": 2271 + }, + { + "epoch": 3.6600886024969794, + "grad_norm": 0.2340164808100686, + "learning_rate": 8.080927106720862e-06, + "loss": 0.2239, + "step": 2272 + }, + { + "epoch": 3.6616995569875153, + "grad_norm": 0.2238317436694524, + "learning_rate": 8.062850379431689e-06, + "loss": 0.1888, + "step": 2273 + }, + { + "epoch": 3.6633105114780506, + "grad_norm": 0.23788481343752213, + "learning_rate": 8.044788787473756e-06, + "loss": 0.2094, + "step": 2274 + }, + { + "epoch": 3.6649214659685865, + "grad_norm": 0.23426880337639244, + "learning_rate": 8.026742353747698e-06, + "loss": 0.1888, + "step": 2275 + }, + { + "epoch": 3.666532420459122, + "grad_norm": 0.23338079652138916, + "learning_rate": 8.008711101134928e-06, + "loss": 0.1917, + "step": 2276 + }, + { + "epoch": 3.6681433749496577, + "grad_norm": 0.22694613559354795, + "learning_rate": 7.99069505249759e-06, + "loss": 0.2077, + "step": 2277 + }, + { + "epoch": 3.6697543294401935, + "grad_norm": 0.23368251076797544, + "learning_rate": 7.972694230678562e-06, + "loss": 0.2019, + "step": 2278 + }, + { + "epoch": 3.671365283930729, + "grad_norm": 0.2274192071332042, + "learning_rate": 7.95470865850142e-06, + "loss": 0.2081, + "step": 2279 + }, + { + "epoch": 3.6729762384212647, + "grad_norm": 0.23272909394736677, + "learning_rate": 7.936738358770409e-06, + "loss": 0.2012, + "step": 2280 + }, + { + "epoch": 3.6745871929118, + "grad_norm": 0.2283629788794831, + "learning_rate": 7.918783354270388e-06, + "loss": 0.1951, + "step": 2281 + }, + { + "epoch": 3.676198147402336, + "grad_norm": 0.23382239466066568, + "learning_rate": 7.900843667766845e-06, + "loss": 0.2083, + "step": 2282 + }, + { + "epoch": 3.677809101892872, + "grad_norm": 0.22454604048146312, + "learning_rate": 7.882919322005844e-06, + "loss": 0.1948, + "step": 2283 + }, + { + "epoch": 3.679420056383407, + "grad_norm": 0.22453891515871524, + "learning_rate": 7.865010339713986e-06, + "loss": 0.1785, + "step": 2284 + }, + { + "epoch": 3.6810310108739426, + "grad_norm": 0.23166411493780942, + "learning_rate": 7.847116743598388e-06, + "loss": 0.2198, + "step": 2285 + }, + { + "epoch": 3.6826419653644784, + "grad_norm": 0.2347847692976649, + "learning_rate": 7.829238556346683e-06, + "loss": 0.1903, + "step": 2286 + }, + { + "epoch": 3.6842529198550142, + "grad_norm": 0.2388300143720348, + "learning_rate": 7.811375800626954e-06, + "loss": 0.2082, + "step": 2287 + }, + { + "epoch": 3.6858638743455496, + "grad_norm": 0.22396488198134365, + "learning_rate": 7.793528499087709e-06, + "loss": 0.2033, + "step": 2288 + }, + { + "epoch": 3.6874748288360855, + "grad_norm": 0.2336053446605039, + "learning_rate": 7.775696674357876e-06, + "loss": 0.2105, + "step": 2289 + }, + { + "epoch": 3.689085783326621, + "grad_norm": 0.23591046953244488, + "learning_rate": 7.757880349046742e-06, + "loss": 0.1914, + "step": 2290 + }, + { + "epoch": 3.6906967378171567, + "grad_norm": 0.23748095616700868, + "learning_rate": 7.74007954574397e-06, + "loss": 0.2083, + "step": 2291 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 0.2481715938640332, + "learning_rate": 7.722294287019509e-06, + "loss": 0.1945, + "step": 2292 + }, + { + "epoch": 3.693918646798228, + "grad_norm": 0.24850684859956976, + "learning_rate": 7.704524595423631e-06, + "loss": 0.2224, + "step": 2293 + }, + { + "epoch": 3.6955296012887637, + "grad_norm": 0.22449191890739228, + "learning_rate": 7.686770493486835e-06, + "loss": 0.1917, + "step": 2294 + }, + { + "epoch": 3.697140555779299, + "grad_norm": 0.22240414266437813, + "learning_rate": 7.669032003719894e-06, + "loss": 0.2043, + "step": 2295 + }, + { + "epoch": 3.698751510269835, + "grad_norm": 0.23811833779559768, + "learning_rate": 7.651309148613745e-06, + "loss": 0.2228, + "step": 2296 + }, + { + "epoch": 3.700362464760371, + "grad_norm": 0.2346602801883088, + "learning_rate": 7.633601950639532e-06, + "loss": 0.2044, + "step": 2297 + }, + { + "epoch": 3.701973419250906, + "grad_norm": 0.24881467892424516, + "learning_rate": 7.615910432248546e-06, + "loss": 0.1971, + "step": 2298 + }, + { + "epoch": 3.7035843737414416, + "grad_norm": 0.21959606461798073, + "learning_rate": 7.598234615872169e-06, + "loss": 0.1723, + "step": 2299 + }, + { + "epoch": 3.7051953282319774, + "grad_norm": 0.2275339064144191, + "learning_rate": 7.580574523921906e-06, + "loss": 0.1943, + "step": 2300 + }, + { + "epoch": 3.7068062827225132, + "grad_norm": 0.2638835977196525, + "learning_rate": 7.562930178789305e-06, + "loss": 0.225, + "step": 2301 + }, + { + "epoch": 3.7084172372130486, + "grad_norm": 0.22139223269245323, + "learning_rate": 7.545301602845965e-06, + "loss": 0.1974, + "step": 2302 + }, + { + "epoch": 3.7100281917035844, + "grad_norm": 0.23262025318923577, + "learning_rate": 7.527688818443466e-06, + "loss": 0.1918, + "step": 2303 + }, + { + "epoch": 3.71163914619412, + "grad_norm": 0.2281510665207774, + "learning_rate": 7.510091847913396e-06, + "loss": 0.1827, + "step": 2304 + }, + { + "epoch": 3.7132501006846557, + "grad_norm": 0.2315480501775712, + "learning_rate": 7.492510713567265e-06, + "loss": 0.2254, + "step": 2305 + }, + { + "epoch": 3.7148610551751915, + "grad_norm": 0.22423715853143875, + "learning_rate": 7.474945437696528e-06, + "loss": 0.18, + "step": 2306 + }, + { + "epoch": 3.716472009665727, + "grad_norm": 0.23128163413988403, + "learning_rate": 7.457396042572507e-06, + "loss": 0.2008, + "step": 2307 + }, + { + "epoch": 3.7180829641562627, + "grad_norm": 0.23426184961659158, + "learning_rate": 7.4398625504464105e-06, + "loss": 0.195, + "step": 2308 + }, + { + "epoch": 3.719693918646798, + "grad_norm": 0.2242770154556143, + "learning_rate": 7.422344983549281e-06, + "loss": 0.1905, + "step": 2309 + }, + { + "epoch": 3.721304873137334, + "grad_norm": 0.22078020401371204, + "learning_rate": 7.404843364091951e-06, + "loss": 0.1849, + "step": 2310 + }, + { + "epoch": 3.7229158276278698, + "grad_norm": 0.2319635828085481, + "learning_rate": 7.387357714265057e-06, + "loss": 0.1882, + "step": 2311 + }, + { + "epoch": 3.724526782118405, + "grad_norm": 0.245118103968261, + "learning_rate": 7.369888056238963e-06, + "loss": 0.2013, + "step": 2312 + }, + { + "epoch": 3.7261377366089405, + "grad_norm": 0.22519823792644417, + "learning_rate": 7.352434412163785e-06, + "loss": 0.2004, + "step": 2313 + }, + { + "epoch": 3.7277486910994764, + "grad_norm": 0.21704653831383208, + "learning_rate": 7.334996804169301e-06, + "loss": 0.1881, + "step": 2314 + }, + { + "epoch": 3.729359645590012, + "grad_norm": 0.22862567502117337, + "learning_rate": 7.31757525436499e-06, + "loss": 0.2199, + "step": 2315 + }, + { + "epoch": 3.7309706000805476, + "grad_norm": 0.21180370521549158, + "learning_rate": 7.300169784839941e-06, + "loss": 0.1748, + "step": 2316 + }, + { + "epoch": 3.7325815545710834, + "grad_norm": 0.24645022888252976, + "learning_rate": 7.282780417662885e-06, + "loss": 0.2207, + "step": 2317 + }, + { + "epoch": 3.734192509061619, + "grad_norm": 0.21457737876895103, + "learning_rate": 7.265407174882102e-06, + "loss": 0.1932, + "step": 2318 + }, + { + "epoch": 3.7358034635521546, + "grad_norm": 0.2338070679618665, + "learning_rate": 7.248050078525463e-06, + "loss": 0.1986, + "step": 2319 + }, + { + "epoch": 3.7374144180426905, + "grad_norm": 0.23031422564032217, + "learning_rate": 7.2307091506003325e-06, + "loss": 0.1966, + "step": 2320 + }, + { + "epoch": 3.739025372533226, + "grad_norm": 0.23174071582081254, + "learning_rate": 7.2133844130936e-06, + "loss": 0.2106, + "step": 2321 + }, + { + "epoch": 3.7406363270237617, + "grad_norm": 0.22866751182170306, + "learning_rate": 7.1960758879716255e-06, + "loss": 0.1961, + "step": 2322 + }, + { + "epoch": 3.742247281514297, + "grad_norm": 0.24817377490035214, + "learning_rate": 7.178783597180192e-06, + "loss": 0.2104, + "step": 2323 + }, + { + "epoch": 3.743858236004833, + "grad_norm": 0.21967760847457896, + "learning_rate": 7.161507562644525e-06, + "loss": 0.1868, + "step": 2324 + }, + { + "epoch": 3.7454691904953687, + "grad_norm": 0.2256324233021988, + "learning_rate": 7.1442478062692135e-06, + "loss": 0.217, + "step": 2325 + }, + { + "epoch": 3.747080144985904, + "grad_norm": 0.22260244060453915, + "learning_rate": 7.127004349938234e-06, + "loss": 0.2126, + "step": 2326 + }, + { + "epoch": 3.7486910994764395, + "grad_norm": 0.23048377400215084, + "learning_rate": 7.109777215514866e-06, + "loss": 0.2021, + "step": 2327 + }, + { + "epoch": 3.7503020539669754, + "grad_norm": 0.22938279327652358, + "learning_rate": 7.092566424841724e-06, + "loss": 0.2079, + "step": 2328 + }, + { + "epoch": 3.751913008457511, + "grad_norm": 0.22136550368979707, + "learning_rate": 7.0753719997406725e-06, + "loss": 0.1841, + "step": 2329 + }, + { + "epoch": 3.7535239629480466, + "grad_norm": 0.24306246271158838, + "learning_rate": 7.0581939620128515e-06, + "loss": 0.1993, + "step": 2330 + }, + { + "epoch": 3.7551349174385824, + "grad_norm": 0.2292465000453504, + "learning_rate": 7.0410323334386e-06, + "loss": 0.1881, + "step": 2331 + }, + { + "epoch": 3.756745871929118, + "grad_norm": 0.24266774620522708, + "learning_rate": 7.0238871357774655e-06, + "loss": 0.2136, + "step": 2332 + }, + { + "epoch": 3.7583568264196536, + "grad_norm": 0.22008050143973384, + "learning_rate": 7.0067583907681645e-06, + "loss": 0.1973, + "step": 2333 + }, + { + "epoch": 3.7599677809101895, + "grad_norm": 0.21463510410453734, + "learning_rate": 6.989646120128537e-06, + "loss": 0.1983, + "step": 2334 + }, + { + "epoch": 3.761578735400725, + "grad_norm": 0.23707104664701026, + "learning_rate": 6.97255034555556e-06, + "loss": 0.2136, + "step": 2335 + }, + { + "epoch": 3.7631896898912607, + "grad_norm": 0.21494086360905443, + "learning_rate": 6.955471088725263e-06, + "loss": 0.1896, + "step": 2336 + }, + { + "epoch": 3.764800644381796, + "grad_norm": 0.22994725268735505, + "learning_rate": 6.938408371292764e-06, + "loss": 0.1971, + "step": 2337 + }, + { + "epoch": 3.766411598872332, + "grad_norm": 0.23686841356483834, + "learning_rate": 6.921362214892182e-06, + "loss": 0.2062, + "step": 2338 + }, + { + "epoch": 3.7680225533628677, + "grad_norm": 0.21759254488600177, + "learning_rate": 6.904332641136668e-06, + "loss": 0.1889, + "step": 2339 + }, + { + "epoch": 3.769633507853403, + "grad_norm": 0.23896380350462712, + "learning_rate": 6.887319671618315e-06, + "loss": 0.2313, + "step": 2340 + }, + { + "epoch": 3.7712444623439385, + "grad_norm": 0.22446891923766324, + "learning_rate": 6.870323327908193e-06, + "loss": 0.2126, + "step": 2341 + }, + { + "epoch": 3.7728554168344743, + "grad_norm": 0.2192309543722886, + "learning_rate": 6.8533436315562665e-06, + "loss": 0.2035, + "step": 2342 + }, + { + "epoch": 3.77446637132501, + "grad_norm": 0.22492224295878233, + "learning_rate": 6.836380604091411e-06, + "loss": 0.1933, + "step": 2343 + }, + { + "epoch": 3.7760773258155456, + "grad_norm": 0.23367721504266944, + "learning_rate": 6.819434267021366e-06, + "loss": 0.207, + "step": 2344 + }, + { + "epoch": 3.7776882803060814, + "grad_norm": 0.21684454903299977, + "learning_rate": 6.80250464183269e-06, + "loss": 0.1925, + "step": 2345 + }, + { + "epoch": 3.779299234796617, + "grad_norm": 0.2220089960826439, + "learning_rate": 6.785591749990779e-06, + "loss": 0.1939, + "step": 2346 + }, + { + "epoch": 3.7809101892871526, + "grad_norm": 0.2361697872498856, + "learning_rate": 6.768695612939789e-06, + "loss": 0.2059, + "step": 2347 + }, + { + "epoch": 3.7825211437776884, + "grad_norm": 0.21291861216027663, + "learning_rate": 6.751816252102652e-06, + "loss": 0.1933, + "step": 2348 + }, + { + "epoch": 3.784132098268224, + "grad_norm": 0.22876029211036183, + "learning_rate": 6.734953688881007e-06, + "loss": 0.2097, + "step": 2349 + }, + { + "epoch": 3.7857430527587597, + "grad_norm": 0.23320502678881608, + "learning_rate": 6.7181079446552165e-06, + "loss": 0.1858, + "step": 2350 + }, + { + "epoch": 3.787354007249295, + "grad_norm": 0.23397548643850805, + "learning_rate": 6.701279040784314e-06, + "loss": 0.2063, + "step": 2351 + }, + { + "epoch": 3.788964961739831, + "grad_norm": 0.221465934796803, + "learning_rate": 6.68446699860597e-06, + "loss": 0.2066, + "step": 2352 + }, + { + "epoch": 3.7905759162303667, + "grad_norm": 0.24482731312630718, + "learning_rate": 6.667671839436473e-06, + "loss": 0.2112, + "step": 2353 + }, + { + "epoch": 3.792186870720902, + "grad_norm": 0.23761431330849628, + "learning_rate": 6.650893584570724e-06, + "loss": 0.2135, + "step": 2354 + }, + { + "epoch": 3.793797825211438, + "grad_norm": 0.23123489562147165, + "learning_rate": 6.634132255282182e-06, + "loss": 0.1959, + "step": 2355 + }, + { + "epoch": 3.7954087797019733, + "grad_norm": 0.22293241276120573, + "learning_rate": 6.617387872822842e-06, + "loss": 0.1976, + "step": 2356 + }, + { + "epoch": 3.797019734192509, + "grad_norm": 0.23135387901728863, + "learning_rate": 6.6006604584232044e-06, + "loss": 0.1956, + "step": 2357 + }, + { + "epoch": 3.7986306886830445, + "grad_norm": 0.2325139006331464, + "learning_rate": 6.583950033292277e-06, + "loss": 0.208, + "step": 2358 + }, + { + "epoch": 3.8002416431735804, + "grad_norm": 0.233386229971809, + "learning_rate": 6.567256618617515e-06, + "loss": 0.2013, + "step": 2359 + }, + { + "epoch": 3.8018525976641158, + "grad_norm": 0.2191968818040746, + "learning_rate": 6.550580235564794e-06, + "loss": 0.1866, + "step": 2360 + }, + { + "epoch": 3.8034635521546516, + "grad_norm": 0.23952247025452705, + "learning_rate": 6.533920905278415e-06, + "loss": 0.1995, + "step": 2361 + }, + { + "epoch": 3.8050745066451874, + "grad_norm": 0.21794716771682257, + "learning_rate": 6.517278648881054e-06, + "loss": 0.1924, + "step": 2362 + }, + { + "epoch": 3.806685461135723, + "grad_norm": 0.24189326859388585, + "learning_rate": 6.500653487473727e-06, + "loss": 0.197, + "step": 2363 + }, + { + "epoch": 3.8082964156262586, + "grad_norm": 0.22118662032379693, + "learning_rate": 6.4840454421357755e-06, + "loss": 0.1838, + "step": 2364 + }, + { + "epoch": 3.809907370116794, + "grad_norm": 0.23506496434883922, + "learning_rate": 6.46745453392485e-06, + "loss": 0.2199, + "step": 2365 + }, + { + "epoch": 3.81151832460733, + "grad_norm": 0.2226823424824965, + "learning_rate": 6.450880783876878e-06, + "loss": 0.2025, + "step": 2366 + }, + { + "epoch": 3.8131292790978657, + "grad_norm": 0.22964938395731235, + "learning_rate": 6.434324213006013e-06, + "loss": 0.1831, + "step": 2367 + }, + { + "epoch": 3.814740233588401, + "grad_norm": 0.24890163240709035, + "learning_rate": 6.417784842304628e-06, + "loss": 0.2056, + "step": 2368 + }, + { + "epoch": 3.816351188078937, + "grad_norm": 0.23386480273146806, + "learning_rate": 6.401262692743302e-06, + "loss": 0.2001, + "step": 2369 + }, + { + "epoch": 3.8179621425694723, + "grad_norm": 0.23767647586351798, + "learning_rate": 6.384757785270777e-06, + "loss": 0.1918, + "step": 2370 + }, + { + "epoch": 3.819573097060008, + "grad_norm": 0.2378296385486382, + "learning_rate": 6.368270140813917e-06, + "loss": 0.2073, + "step": 2371 + }, + { + "epoch": 3.821184051550544, + "grad_norm": 0.24265218985960949, + "learning_rate": 6.351799780277716e-06, + "loss": 0.2087, + "step": 2372 + }, + { + "epoch": 3.8227950060410794, + "grad_norm": 0.21839384113148216, + "learning_rate": 6.335346724545255e-06, + "loss": 0.203, + "step": 2373 + }, + { + "epoch": 3.8244059605316147, + "grad_norm": 0.21506408724015144, + "learning_rate": 6.318910994477654e-06, + "loss": 0.1757, + "step": 2374 + }, + { + "epoch": 3.8260169150221506, + "grad_norm": 0.2272335909081642, + "learning_rate": 6.3024926109140725e-06, + "loss": 0.2218, + "step": 2375 + }, + { + "epoch": 3.8276278695126864, + "grad_norm": 0.22244961279714315, + "learning_rate": 6.286091594671688e-06, + "loss": 0.1857, + "step": 2376 + }, + { + "epoch": 3.829238824003222, + "grad_norm": 0.2588268718136773, + "learning_rate": 6.2697079665456575e-06, + "loss": 0.2055, + "step": 2377 + }, + { + "epoch": 3.8308497784937576, + "grad_norm": 0.229250689941906, + "learning_rate": 6.253341747309076e-06, + "loss": 0.1985, + "step": 2378 + }, + { + "epoch": 3.832460732984293, + "grad_norm": 0.22729115522993168, + "learning_rate": 6.236992957712968e-06, + "loss": 0.1987, + "step": 2379 + }, + { + "epoch": 3.834071687474829, + "grad_norm": 0.2203706251189883, + "learning_rate": 6.220661618486268e-06, + "loss": 0.2028, + "step": 2380 + }, + { + "epoch": 3.8356826419653647, + "grad_norm": 0.23167585742706767, + "learning_rate": 6.204347750335791e-06, + "loss": 0.2056, + "step": 2381 + }, + { + "epoch": 3.8372935964559, + "grad_norm": 0.22942562541407732, + "learning_rate": 6.188051373946182e-06, + "loss": 0.1891, + "step": 2382 + }, + { + "epoch": 3.838904550946436, + "grad_norm": 0.22056932768319534, + "learning_rate": 6.171772509979903e-06, + "loss": 0.2052, + "step": 2383 + }, + { + "epoch": 3.8405155054369713, + "grad_norm": 0.22982020120028385, + "learning_rate": 6.155511179077252e-06, + "loss": 0.1968, + "step": 2384 + }, + { + "epoch": 3.842126459927507, + "grad_norm": 0.24493108816991999, + "learning_rate": 6.1392674018562525e-06, + "loss": 0.2133, + "step": 2385 + }, + { + "epoch": 3.843737414418043, + "grad_norm": 0.24068488930223084, + "learning_rate": 6.1230411989126825e-06, + "loss": 0.2114, + "step": 2386 + }, + { + "epoch": 3.8453483689085783, + "grad_norm": 0.20980965674211466, + "learning_rate": 6.106832590820053e-06, + "loss": 0.1814, + "step": 2387 + }, + { + "epoch": 3.8469593233991137, + "grad_norm": 0.22052853718987608, + "learning_rate": 6.090641598129559e-06, + "loss": 0.1941, + "step": 2388 + }, + { + "epoch": 3.8485702778896496, + "grad_norm": 0.24442699536324203, + "learning_rate": 6.074468241370053e-06, + "loss": 0.2184, + "step": 2389 + }, + { + "epoch": 3.8501812323801854, + "grad_norm": 0.22931064215122915, + "learning_rate": 6.058312541048021e-06, + "loss": 0.1814, + "step": 2390 + }, + { + "epoch": 3.851792186870721, + "grad_norm": 0.21834781868614564, + "learning_rate": 6.042174517647583e-06, + "loss": 0.2028, + "step": 2391 + }, + { + "epoch": 3.8534031413612566, + "grad_norm": 0.23946089513989083, + "learning_rate": 6.026054191630439e-06, + "loss": 0.1887, + "step": 2392 + }, + { + "epoch": 3.855014095851792, + "grad_norm": 0.22970941758659105, + "learning_rate": 6.00995158343584e-06, + "loss": 0.2213, + "step": 2393 + }, + { + "epoch": 3.856625050342328, + "grad_norm": 0.2314117292257891, + "learning_rate": 5.993866713480567e-06, + "loss": 0.2038, + "step": 2394 + }, + { + "epoch": 3.8582360048328637, + "grad_norm": 0.2439904102574889, + "learning_rate": 5.977799602158949e-06, + "loss": 0.2155, + "step": 2395 + }, + { + "epoch": 3.859846959323399, + "grad_norm": 0.20257437837005451, + "learning_rate": 5.961750269842754e-06, + "loss": 0.1751, + "step": 2396 + }, + { + "epoch": 3.861457913813935, + "grad_norm": 0.22955556617831127, + "learning_rate": 5.945718736881225e-06, + "loss": 0.2154, + "step": 2397 + }, + { + "epoch": 3.8630688683044703, + "grad_norm": 0.20870764713948106, + "learning_rate": 5.929705023601038e-06, + "loss": 0.1858, + "step": 2398 + }, + { + "epoch": 3.864679822795006, + "grad_norm": 0.22806265886533345, + "learning_rate": 5.913709150306284e-06, + "loss": 0.1966, + "step": 2399 + }, + { + "epoch": 3.866290777285542, + "grad_norm": 0.21759470538337408, + "learning_rate": 5.897731137278417e-06, + "loss": 0.206, + "step": 2400 + }, + { + "epoch": 3.8679017317760773, + "grad_norm": 0.22726788817470528, + "learning_rate": 5.881771004776243e-06, + "loss": 0.218, + "step": 2401 + }, + { + "epoch": 3.8695126862666127, + "grad_norm": 0.223006978113695, + "learning_rate": 5.865828773035927e-06, + "loss": 0.2127, + "step": 2402 + }, + { + "epoch": 3.8711236407571485, + "grad_norm": 0.2113739938673185, + "learning_rate": 5.849904462270908e-06, + "loss": 0.1758, + "step": 2403 + }, + { + "epoch": 3.8727345952476844, + "grad_norm": 0.2209374461562865, + "learning_rate": 5.833998092671911e-06, + "loss": 0.1949, + "step": 2404 + }, + { + "epoch": 3.8743455497382198, + "grad_norm": 0.21472108401460543, + "learning_rate": 5.8181096844069055e-06, + "loss": 0.21, + "step": 2405 + }, + { + "epoch": 3.8759565042287556, + "grad_norm": 0.2263705045461306, + "learning_rate": 5.802239257621116e-06, + "loss": 0.2014, + "step": 2406 + }, + { + "epoch": 3.877567458719291, + "grad_norm": 0.2206794394457454, + "learning_rate": 5.786386832436938e-06, + "loss": 0.1958, + "step": 2407 + }, + { + "epoch": 3.879178413209827, + "grad_norm": 0.21542639161588065, + "learning_rate": 5.770552428953946e-06, + "loss": 0.205, + "step": 2408 + }, + { + "epoch": 3.8807893677003626, + "grad_norm": 0.21783160109169053, + "learning_rate": 5.754736067248878e-06, + "loss": 0.2038, + "step": 2409 + }, + { + "epoch": 3.882400322190898, + "grad_norm": 0.22638816278271062, + "learning_rate": 5.738937767375596e-06, + "loss": 0.2086, + "step": 2410 + }, + { + "epoch": 3.884011276681434, + "grad_norm": 0.23034123131475734, + "learning_rate": 5.723157549365046e-06, + "loss": 0.1996, + "step": 2411 + }, + { + "epoch": 3.8856222311719693, + "grad_norm": 0.2403842941222503, + "learning_rate": 5.707395433225247e-06, + "loss": 0.1928, + "step": 2412 + }, + { + "epoch": 3.887233185662505, + "grad_norm": 0.22829443560895596, + "learning_rate": 5.691651438941297e-06, + "loss": 0.1937, + "step": 2413 + }, + { + "epoch": 3.888844140153041, + "grad_norm": 0.22379900687151044, + "learning_rate": 5.675925586475286e-06, + "loss": 0.2068, + "step": 2414 + }, + { + "epoch": 3.8904550946435763, + "grad_norm": 0.21890279750969147, + "learning_rate": 5.660217895766302e-06, + "loss": 0.2006, + "step": 2415 + }, + { + "epoch": 3.8920660491341117, + "grad_norm": 0.24292430296852172, + "learning_rate": 5.644528386730424e-06, + "loss": 0.2124, + "step": 2416 + }, + { + "epoch": 3.8936770036246475, + "grad_norm": 0.23023882143912577, + "learning_rate": 5.628857079260672e-06, + "loss": 0.1948, + "step": 2417 + }, + { + "epoch": 3.8952879581151834, + "grad_norm": 0.2182554614510497, + "learning_rate": 5.613203993226981e-06, + "loss": 0.1901, + "step": 2418 + }, + { + "epoch": 3.8968989126057187, + "grad_norm": 0.2219288373965724, + "learning_rate": 5.597569148476178e-06, + "loss": 0.2212, + "step": 2419 + }, + { + "epoch": 3.8985098670962546, + "grad_norm": 0.21537709000847655, + "learning_rate": 5.581952564831978e-06, + "loss": 0.2052, + "step": 2420 + }, + { + "epoch": 3.90012082158679, + "grad_norm": 0.22629553986166404, + "learning_rate": 5.56635426209494e-06, + "loss": 0.1981, + "step": 2421 + }, + { + "epoch": 3.901731776077326, + "grad_norm": 0.22348100651315547, + "learning_rate": 5.550774260042428e-06, + "loss": 0.2071, + "step": 2422 + }, + { + "epoch": 3.9033427305678616, + "grad_norm": 0.22867321367174948, + "learning_rate": 5.5352125784286085e-06, + "loss": 0.2048, + "step": 2423 + }, + { + "epoch": 3.904953685058397, + "grad_norm": 0.23348086911174862, + "learning_rate": 5.519669236984442e-06, + "loss": 0.2142, + "step": 2424 + }, + { + "epoch": 3.906564639548933, + "grad_norm": 0.21081715913992716, + "learning_rate": 5.504144255417605e-06, + "loss": 0.1903, + "step": 2425 + }, + { + "epoch": 3.9081755940394682, + "grad_norm": 0.22628394919950884, + "learning_rate": 5.488637653412501e-06, + "loss": 0.1932, + "step": 2426 + }, + { + "epoch": 3.909786548530004, + "grad_norm": 0.24684525974817506, + "learning_rate": 5.473149450630242e-06, + "loss": 0.2193, + "step": 2427 + }, + { + "epoch": 3.91139750302054, + "grad_norm": 0.22045186763448354, + "learning_rate": 5.4576796667086125e-06, + "loss": 0.1781, + "step": 2428 + }, + { + "epoch": 3.9130084575110753, + "grad_norm": 0.2120017802530591, + "learning_rate": 5.442228321262029e-06, + "loss": 0.2007, + "step": 2429 + }, + { + "epoch": 3.914619412001611, + "grad_norm": 0.23287109451180657, + "learning_rate": 5.426795433881527e-06, + "loss": 0.206, + "step": 2430 + }, + { + "epoch": 3.9162303664921465, + "grad_norm": 0.22899308427743442, + "learning_rate": 5.411381024134756e-06, + "loss": 0.202, + "step": 2431 + }, + { + "epoch": 3.9178413209826823, + "grad_norm": 0.23426089115523857, + "learning_rate": 5.395985111565938e-06, + "loss": 0.2043, + "step": 2432 + }, + { + "epoch": 3.9194522754732177, + "grad_norm": 0.2205496319307363, + "learning_rate": 5.380607715695822e-06, + "loss": 0.1944, + "step": 2433 + }, + { + "epoch": 3.9210632299637536, + "grad_norm": 0.23047985224867817, + "learning_rate": 5.3652488560216875e-06, + "loss": 0.2084, + "step": 2434 + }, + { + "epoch": 3.922674184454289, + "grad_norm": 0.225561512407316, + "learning_rate": 5.349908552017323e-06, + "loss": 0.2004, + "step": 2435 + }, + { + "epoch": 3.924285138944825, + "grad_norm": 0.23594038731291309, + "learning_rate": 5.334586823132983e-06, + "loss": 0.2138, + "step": 2436 + }, + { + "epoch": 3.9258960934353606, + "grad_norm": 0.22443378504100486, + "learning_rate": 5.31928368879536e-06, + "loss": 0.1862, + "step": 2437 + }, + { + "epoch": 3.927507047925896, + "grad_norm": 0.23429297324806259, + "learning_rate": 5.303999168407585e-06, + "loss": 0.2156, + "step": 2438 + }, + { + "epoch": 3.929118002416432, + "grad_norm": 0.22875055640793499, + "learning_rate": 5.288733281349186e-06, + "loss": 0.1828, + "step": 2439 + }, + { + "epoch": 3.930728956906967, + "grad_norm": 0.2247145591887911, + "learning_rate": 5.273486046976057e-06, + "loss": 0.1846, + "step": 2440 + }, + { + "epoch": 3.932339911397503, + "grad_norm": 0.23458620492491555, + "learning_rate": 5.258257484620441e-06, + "loss": 0.2149, + "step": 2441 + }, + { + "epoch": 3.933950865888039, + "grad_norm": 0.22576936655233082, + "learning_rate": 5.243047613590919e-06, + "loss": 0.1738, + "step": 2442 + }, + { + "epoch": 3.9355618203785743, + "grad_norm": 0.2407340144249399, + "learning_rate": 5.227856453172371e-06, + "loss": 0.1849, + "step": 2443 + }, + { + "epoch": 3.93717277486911, + "grad_norm": 0.21933224499037723, + "learning_rate": 5.212684022625938e-06, + "loss": 0.2001, + "step": 2444 + }, + { + "epoch": 3.9387837293596455, + "grad_norm": 0.22669845460715846, + "learning_rate": 5.1975303411890235e-06, + "loss": 0.1974, + "step": 2445 + }, + { + "epoch": 3.9403946838501813, + "grad_norm": 0.21697797198281793, + "learning_rate": 5.182395428075262e-06, + "loss": 0.183, + "step": 2446 + }, + { + "epoch": 3.9420056383407167, + "grad_norm": 0.23111635845943326, + "learning_rate": 5.167279302474493e-06, + "loss": 0.2058, + "step": 2447 + }, + { + "epoch": 3.9436165928312525, + "grad_norm": 0.21966365881724723, + "learning_rate": 5.152181983552718e-06, + "loss": 0.1975, + "step": 2448 + }, + { + "epoch": 3.945227547321788, + "grad_norm": 0.23026550236047766, + "learning_rate": 5.137103490452113e-06, + "loss": 0.1977, + "step": 2449 + }, + { + "epoch": 3.9468385018123238, + "grad_norm": 0.22759257630671995, + "learning_rate": 5.12204384229098e-06, + "loss": 0.204, + "step": 2450 + }, + { + "epoch": 3.9484494563028596, + "grad_norm": 0.21139382664115194, + "learning_rate": 5.10700305816372e-06, + "loss": 0.1867, + "step": 2451 + }, + { + "epoch": 3.950060410793395, + "grad_norm": 0.21338019048344403, + "learning_rate": 5.091981157140808e-06, + "loss": 0.1819, + "step": 2452 + }, + { + "epoch": 3.951671365283931, + "grad_norm": 0.23306725549135782, + "learning_rate": 5.076978158268801e-06, + "loss": 0.1956, + "step": 2453 + }, + { + "epoch": 3.953282319774466, + "grad_norm": 0.2281881871246552, + "learning_rate": 5.061994080570281e-06, + "loss": 0.1836, + "step": 2454 + }, + { + "epoch": 3.954893274265002, + "grad_norm": 0.2225882034388656, + "learning_rate": 5.047028943043826e-06, + "loss": 0.181, + "step": 2455 + }, + { + "epoch": 3.956504228755538, + "grad_norm": 0.22366930281729092, + "learning_rate": 5.0320827646640054e-06, + "loss": 0.2067, + "step": 2456 + }, + { + "epoch": 3.9581151832460733, + "grad_norm": 0.2220234197721443, + "learning_rate": 5.01715556438136e-06, + "loss": 0.2225, + "step": 2457 + }, + { + "epoch": 3.959726137736609, + "grad_norm": 0.20193439968881782, + "learning_rate": 5.0022473611223635e-06, + "loss": 0.1899, + "step": 2458 + }, + { + "epoch": 3.9613370922271445, + "grad_norm": 0.22650822798732484, + "learning_rate": 4.987358173789394e-06, + "loss": 0.2058, + "step": 2459 + }, + { + "epoch": 3.9629480467176803, + "grad_norm": 0.22033364513660997, + "learning_rate": 4.972488021260733e-06, + "loss": 0.2028, + "step": 2460 + }, + { + "epoch": 3.964559001208216, + "grad_norm": 0.21837427939672938, + "learning_rate": 4.95763692239051e-06, + "loss": 0.2115, + "step": 2461 + }, + { + "epoch": 3.9661699556987515, + "grad_norm": 0.2279553953983244, + "learning_rate": 4.942804896008717e-06, + "loss": 0.2063, + "step": 2462 + }, + { + "epoch": 3.967780910189287, + "grad_norm": 0.2138127959490929, + "learning_rate": 4.927991960921141e-06, + "loss": 0.1793, + "step": 2463 + }, + { + "epoch": 3.9693918646798227, + "grad_norm": 0.22416253513747308, + "learning_rate": 4.9131981359093826e-06, + "loss": 0.2017, + "step": 2464 + }, + { + "epoch": 3.9710028191703586, + "grad_norm": 0.228356776210779, + "learning_rate": 4.8984234397308086e-06, + "loss": 0.1888, + "step": 2465 + }, + { + "epoch": 3.972613773660894, + "grad_norm": 0.24338949114710415, + "learning_rate": 4.883667891118515e-06, + "loss": 0.2384, + "step": 2466 + }, + { + "epoch": 3.97422472815143, + "grad_norm": 0.2099842060282153, + "learning_rate": 4.868931508781345e-06, + "loss": 0.1863, + "step": 2467 + }, + { + "epoch": 3.975835682641965, + "grad_norm": 0.217213579023591, + "learning_rate": 4.854214311403818e-06, + "loss": 0.207, + "step": 2468 + }, + { + "epoch": 3.977446637132501, + "grad_norm": 0.2198373072030007, + "learning_rate": 4.839516317646149e-06, + "loss": 0.198, + "step": 2469 + }, + { + "epoch": 3.979057591623037, + "grad_norm": 0.2262853783161159, + "learning_rate": 4.824837546144183e-06, + "loss": 0.2114, + "step": 2470 + }, + { + "epoch": 3.9806685461135722, + "grad_norm": 0.21939682500082897, + "learning_rate": 4.810178015509415e-06, + "loss": 0.2064, + "step": 2471 + }, + { + "epoch": 3.982279500604108, + "grad_norm": 0.22515465224528589, + "learning_rate": 4.795537744328924e-06, + "loss": 0.1987, + "step": 2472 + }, + { + "epoch": 3.9838904550946435, + "grad_norm": 0.21740935835993958, + "learning_rate": 4.780916751165388e-06, + "loss": 0.2032, + "step": 2473 + }, + { + "epoch": 3.9855014095851793, + "grad_norm": 0.22140692196370554, + "learning_rate": 4.76631505455702e-06, + "loss": 0.1975, + "step": 2474 + }, + { + "epoch": 3.987112364075715, + "grad_norm": 0.2284279380672877, + "learning_rate": 4.751732673017589e-06, + "loss": 0.2144, + "step": 2475 + }, + { + "epoch": 3.9887233185662505, + "grad_norm": 0.22252220249176013, + "learning_rate": 4.737169625036369e-06, + "loss": 0.1961, + "step": 2476 + }, + { + "epoch": 3.990334273056786, + "grad_norm": 0.2213040546419634, + "learning_rate": 4.722625929078102e-06, + "loss": 0.2141, + "step": 2477 + }, + { + "epoch": 3.9919452275473217, + "grad_norm": 0.22175328254802687, + "learning_rate": 4.708101603583019e-06, + "loss": 0.184, + "step": 2478 + }, + { + "epoch": 3.9935561820378576, + "grad_norm": 0.22787646419219595, + "learning_rate": 4.693596666966771e-06, + "loss": 0.2179, + "step": 2479 + }, + { + "epoch": 3.995167136528393, + "grad_norm": 0.21931238676949583, + "learning_rate": 4.679111137620442e-06, + "loss": 0.2055, + "step": 2480 + }, + { + "epoch": 3.996778091018929, + "grad_norm": 0.22831017006597473, + "learning_rate": 4.664645033910491e-06, + "loss": 0.2113, + "step": 2481 + }, + { + "epoch": 3.998389045509464, + "grad_norm": 0.2179416073465815, + "learning_rate": 4.650198374178767e-06, + "loss": 0.1947, + "step": 2482 + }, + { + "epoch": 4.0, + "grad_norm": 0.26286690041897504, + "learning_rate": 4.635771176742443e-06, + "loss": 0.2311, + "step": 2483 + }, + { + "epoch": 4.001610954490536, + "grad_norm": 0.394604874099715, + "learning_rate": 4.621363459894039e-06, + "loss": 0.1444, + "step": 2484 + }, + { + "epoch": 4.003221908981072, + "grad_norm": 0.30419988334377224, + "learning_rate": 4.606975241901354e-06, + "loss": 0.1577, + "step": 2485 + }, + { + "epoch": 4.004832863471607, + "grad_norm": 0.24739619219289316, + "learning_rate": 4.592606541007481e-06, + "loss": 0.152, + "step": 2486 + }, + { + "epoch": 4.006443817962142, + "grad_norm": 0.37269612926471524, + "learning_rate": 4.578257375430764e-06, + "loss": 0.1528, + "step": 2487 + }, + { + "epoch": 4.008054772452678, + "grad_norm": 0.41792249971665374, + "learning_rate": 4.563927763364759e-06, + "loss": 0.1466, + "step": 2488 + }, + { + "epoch": 4.009665726943214, + "grad_norm": 0.3578527160556587, + "learning_rate": 4.549617722978259e-06, + "loss": 0.1601, + "step": 2489 + }, + { + "epoch": 4.01127668143375, + "grad_norm": 0.2889923380752588, + "learning_rate": 4.535327272415215e-06, + "loss": 0.1456, + "step": 2490 + }, + { + "epoch": 4.012887635924285, + "grad_norm": 0.3818532366043304, + "learning_rate": 4.521056429794763e-06, + "loss": 0.158, + "step": 2491 + }, + { + "epoch": 4.014498590414821, + "grad_norm": 0.36182018754446954, + "learning_rate": 4.506805213211154e-06, + "loss": 0.1496, + "step": 2492 + }, + { + "epoch": 4.0161095449053565, + "grad_norm": 0.32110946672977075, + "learning_rate": 4.492573640733781e-06, + "loss": 0.1518, + "step": 2493 + }, + { + "epoch": 4.017720499395892, + "grad_norm": 0.23828594766969258, + "learning_rate": 4.478361730407104e-06, + "loss": 0.1427, + "step": 2494 + }, + { + "epoch": 4.019331453886427, + "grad_norm": 0.28940264741575655, + "learning_rate": 4.464169500250677e-06, + "loss": 0.137, + "step": 2495 + }, + { + "epoch": 4.020942408376963, + "grad_norm": 0.32475999960770263, + "learning_rate": 4.449996968259074e-06, + "loss": 0.1552, + "step": 2496 + }, + { + "epoch": 4.022553362867499, + "grad_norm": 0.3106520553812506, + "learning_rate": 4.435844152401925e-06, + "loss": 0.1604, + "step": 2497 + }, + { + "epoch": 4.024164317358035, + "grad_norm": 0.25339318934044625, + "learning_rate": 4.4217110706238305e-06, + "loss": 0.1398, + "step": 2498 + }, + { + "epoch": 4.025775271848571, + "grad_norm": 0.23074024688964512, + "learning_rate": 4.407597740844393e-06, + "loss": 0.1536, + "step": 2499 + }, + { + "epoch": 4.027386226339106, + "grad_norm": 0.2298221033798615, + "learning_rate": 4.393504180958166e-06, + "loss": 0.1467, + "step": 2500 + }, + { + "epoch": 4.028997180829641, + "grad_norm": 0.24652202184560998, + "learning_rate": 4.3794304088346215e-06, + "loss": 0.1499, + "step": 2501 + }, + { + "epoch": 4.030608135320177, + "grad_norm": 0.25842548848300634, + "learning_rate": 4.365376442318168e-06, + "loss": 0.1491, + "step": 2502 + }, + { + "epoch": 4.032219089810713, + "grad_norm": 0.2241710771974074, + "learning_rate": 4.351342299228072e-06, + "loss": 0.1581, + "step": 2503 + }, + { + "epoch": 4.033830044301249, + "grad_norm": 0.226375077124321, + "learning_rate": 4.337327997358494e-06, + "loss": 0.1454, + "step": 2504 + }, + { + "epoch": 4.035440998791784, + "grad_norm": 0.2386861417217738, + "learning_rate": 4.323333554478415e-06, + "loss": 0.1489, + "step": 2505 + }, + { + "epoch": 4.03705195328232, + "grad_norm": 0.24161342029014413, + "learning_rate": 4.309358988331658e-06, + "loss": 0.1391, + "step": 2506 + }, + { + "epoch": 4.0386629077728555, + "grad_norm": 0.24826467464092444, + "learning_rate": 4.2954043166368176e-06, + "loss": 0.1428, + "step": 2507 + }, + { + "epoch": 4.040273862263391, + "grad_norm": 0.22543955458616258, + "learning_rate": 4.281469557087292e-06, + "loss": 0.1512, + "step": 2508 + }, + { + "epoch": 4.041884816753926, + "grad_norm": 0.22191099450465107, + "learning_rate": 4.267554727351209e-06, + "loss": 0.1518, + "step": 2509 + }, + { + "epoch": 4.043495771244462, + "grad_norm": 0.22413069894128723, + "learning_rate": 4.253659845071436e-06, + "loss": 0.1486, + "step": 2510 + }, + { + "epoch": 4.045106725734998, + "grad_norm": 0.23604198643903934, + "learning_rate": 4.239784927865562e-06, + "loss": 0.1563, + "step": 2511 + }, + { + "epoch": 4.046717680225534, + "grad_norm": 0.23765897381752274, + "learning_rate": 4.225929993325837e-06, + "loss": 0.1357, + "step": 2512 + }, + { + "epoch": 4.04832863471607, + "grad_norm": 0.2325992132869097, + "learning_rate": 4.2120950590191945e-06, + "loss": 0.1412, + "step": 2513 + }, + { + "epoch": 4.049939589206605, + "grad_norm": 0.219156691846048, + "learning_rate": 4.198280142487194e-06, + "loss": 0.1386, + "step": 2514 + }, + { + "epoch": 4.05155054369714, + "grad_norm": 0.22634529978022389, + "learning_rate": 4.184485261246032e-06, + "loss": 0.1401, + "step": 2515 + }, + { + "epoch": 4.053161498187676, + "grad_norm": 0.22623994036256284, + "learning_rate": 4.17071043278648e-06, + "loss": 0.1512, + "step": 2516 + }, + { + "epoch": 4.054772452678212, + "grad_norm": 0.2348031400592028, + "learning_rate": 4.156955674573908e-06, + "loss": 0.1514, + "step": 2517 + }, + { + "epoch": 4.056383407168748, + "grad_norm": 0.2422395543425095, + "learning_rate": 4.143221004048215e-06, + "loss": 0.1545, + "step": 2518 + }, + { + "epoch": 4.057994361659283, + "grad_norm": 0.22710278247393073, + "learning_rate": 4.129506438623854e-06, + "loss": 0.154, + "step": 2519 + }, + { + "epoch": 4.059605316149819, + "grad_norm": 0.2119993093135068, + "learning_rate": 4.11581199568976e-06, + "loss": 0.1508, + "step": 2520 + }, + { + "epoch": 4.0612162706403545, + "grad_norm": 0.2266486757464885, + "learning_rate": 4.10213769260938e-06, + "loss": 0.1475, + "step": 2521 + }, + { + "epoch": 4.06282722513089, + "grad_norm": 0.22567150872123293, + "learning_rate": 4.088483546720614e-06, + "loss": 0.1552, + "step": 2522 + }, + { + "epoch": 4.064438179621425, + "grad_norm": 0.23043810924965868, + "learning_rate": 4.074849575335804e-06, + "loss": 0.1552, + "step": 2523 + }, + { + "epoch": 4.066049134111961, + "grad_norm": 0.23119706591599065, + "learning_rate": 4.061235795741702e-06, + "loss": 0.1391, + "step": 2524 + }, + { + "epoch": 4.067660088602497, + "grad_norm": 0.2167544238933369, + "learning_rate": 4.04764222519948e-06, + "loss": 0.1439, + "step": 2525 + }, + { + "epoch": 4.069271043093033, + "grad_norm": 0.22872180237610718, + "learning_rate": 4.0340688809446745e-06, + "loss": 0.1652, + "step": 2526 + }, + { + "epoch": 4.070881997583569, + "grad_norm": 0.22662520444169074, + "learning_rate": 4.020515780187173e-06, + "loss": 0.1401, + "step": 2527 + }, + { + "epoch": 4.072492952074104, + "grad_norm": 0.22758725109112826, + "learning_rate": 4.006982940111204e-06, + "loss": 0.1474, + "step": 2528 + }, + { + "epoch": 4.074103906564639, + "grad_norm": 0.21647537990769658, + "learning_rate": 3.993470377875311e-06, + "loss": 0.153, + "step": 2529 + }, + { + "epoch": 4.075714861055175, + "grad_norm": 0.21820740866956023, + "learning_rate": 3.979978110612313e-06, + "loss": 0.165, + "step": 2530 + }, + { + "epoch": 4.077325815545711, + "grad_norm": 0.2133491338774458, + "learning_rate": 3.9665061554292946e-06, + "loss": 0.1397, + "step": 2531 + }, + { + "epoch": 4.078936770036247, + "grad_norm": 0.22608875122217786, + "learning_rate": 3.9530545294076075e-06, + "loss": 0.1541, + "step": 2532 + }, + { + "epoch": 4.080547724526782, + "grad_norm": 0.21979115321693685, + "learning_rate": 3.9396232496028176e-06, + "loss": 0.1507, + "step": 2533 + }, + { + "epoch": 4.082158679017318, + "grad_norm": 0.21782116069499285, + "learning_rate": 3.926212333044683e-06, + "loss": 0.1501, + "step": 2534 + }, + { + "epoch": 4.0837696335078535, + "grad_norm": 0.22037323116617433, + "learning_rate": 3.9128217967371515e-06, + "loss": 0.1522, + "step": 2535 + }, + { + "epoch": 4.085380587998389, + "grad_norm": 0.2144296673882382, + "learning_rate": 3.899451657658331e-06, + "loss": 0.1486, + "step": 2536 + }, + { + "epoch": 4.086991542488924, + "grad_norm": 0.21177670307177632, + "learning_rate": 3.8861019327604736e-06, + "loss": 0.143, + "step": 2537 + }, + { + "epoch": 4.08860249697946, + "grad_norm": 0.22542118104376185, + "learning_rate": 3.872772638969929e-06, + "loss": 0.1463, + "step": 2538 + }, + { + "epoch": 4.090213451469996, + "grad_norm": 0.21745441278678204, + "learning_rate": 3.859463793187159e-06, + "loss": 0.1361, + "step": 2539 + }, + { + "epoch": 4.091824405960532, + "grad_norm": 0.21992591585331572, + "learning_rate": 3.846175412286701e-06, + "loss": 0.1572, + "step": 2540 + }, + { + "epoch": 4.093435360451068, + "grad_norm": 0.22629768234535663, + "learning_rate": 3.83290751311713e-06, + "loss": 0.1419, + "step": 2541 + }, + { + "epoch": 4.0950463149416025, + "grad_norm": 0.2298622699155759, + "learning_rate": 3.819660112501053e-06, + "loss": 0.1355, + "step": 2542 + }, + { + "epoch": 4.096657269432138, + "grad_norm": 0.21678800682951568, + "learning_rate": 3.8064332272351e-06, + "loss": 0.1496, + "step": 2543 + }, + { + "epoch": 4.098268223922674, + "grad_norm": 0.22949965448343948, + "learning_rate": 3.7932268740898836e-06, + "loss": 0.1556, + "step": 2544 + }, + { + "epoch": 4.09987917841321, + "grad_norm": 0.22071553901670435, + "learning_rate": 3.7800410698099808e-06, + "loss": 0.1574, + "step": 2545 + }, + { + "epoch": 4.101490132903746, + "grad_norm": 0.2298993165539254, + "learning_rate": 3.7668758311139077e-06, + "loss": 0.151, + "step": 2546 + }, + { + "epoch": 4.103101087394281, + "grad_norm": 0.2225517731771703, + "learning_rate": 3.753731174694117e-06, + "loss": 0.1467, + "step": 2547 + }, + { + "epoch": 4.104712041884817, + "grad_norm": 0.225093485838971, + "learning_rate": 3.7406071172169634e-06, + "loss": 0.1352, + "step": 2548 + }, + { + "epoch": 4.1063229963753525, + "grad_norm": 0.23400738843222163, + "learning_rate": 3.727503675322681e-06, + "loss": 0.1421, + "step": 2549 + }, + { + "epoch": 4.107933950865888, + "grad_norm": 0.2366245815845293, + "learning_rate": 3.7144208656253476e-06, + "loss": 0.1485, + "step": 2550 + }, + { + "epoch": 4.109544905356423, + "grad_norm": 0.2154306414839747, + "learning_rate": 3.701358704712923e-06, + "loss": 0.1463, + "step": 2551 + }, + { + "epoch": 4.111155859846959, + "grad_norm": 0.2207385286375425, + "learning_rate": 3.6883172091471474e-06, + "loss": 0.1588, + "step": 2552 + }, + { + "epoch": 4.112766814337495, + "grad_norm": 0.22269838904544906, + "learning_rate": 3.67529639546357e-06, + "loss": 0.1452, + "step": 2553 + }, + { + "epoch": 4.114377768828031, + "grad_norm": 0.2313604111125716, + "learning_rate": 3.6622962801715243e-06, + "loss": 0.1342, + "step": 2554 + }, + { + "epoch": 4.115988723318567, + "grad_norm": 0.2284422968399893, + "learning_rate": 3.649316879754099e-06, + "loss": 0.1417, + "step": 2555 + }, + { + "epoch": 4.1175996778091015, + "grad_norm": 0.2465718086622103, + "learning_rate": 3.6363582106681115e-06, + "loss": 0.1408, + "step": 2556 + }, + { + "epoch": 4.119210632299637, + "grad_norm": 0.23546119698074278, + "learning_rate": 3.623420289344088e-06, + "loss": 0.1367, + "step": 2557 + }, + { + "epoch": 4.120821586790173, + "grad_norm": 0.23013205586732102, + "learning_rate": 3.610503132186265e-06, + "loss": 0.1566, + "step": 2558 + }, + { + "epoch": 4.122432541280709, + "grad_norm": 0.22709715305840766, + "learning_rate": 3.597606755572545e-06, + "loss": 0.1421, + "step": 2559 + }, + { + "epoch": 4.124043495771245, + "grad_norm": 0.2270302334606441, + "learning_rate": 3.584731175854479e-06, + "loss": 0.1417, + "step": 2560 + }, + { + "epoch": 4.12565445026178, + "grad_norm": 0.22673844401199308, + "learning_rate": 3.57187640935724e-06, + "loss": 0.1433, + "step": 2561 + }, + { + "epoch": 4.127265404752316, + "grad_norm": 0.22556426103212388, + "learning_rate": 3.559042472379639e-06, + "loss": 0.1545, + "step": 2562 + }, + { + "epoch": 4.1288763592428515, + "grad_norm": 0.22805435104245703, + "learning_rate": 3.546229381194057e-06, + "loss": 0.1462, + "step": 2563 + }, + { + "epoch": 4.130487313733387, + "grad_norm": 0.21900564349736037, + "learning_rate": 3.5334371520464373e-06, + "loss": 0.1522, + "step": 2564 + }, + { + "epoch": 4.132098268223922, + "grad_norm": 0.2139940599235903, + "learning_rate": 3.520665801156289e-06, + "loss": 0.1372, + "step": 2565 + }, + { + "epoch": 4.133709222714458, + "grad_norm": 0.22062229403040695, + "learning_rate": 3.507915344716648e-06, + "loss": 0.1491, + "step": 2566 + }, + { + "epoch": 4.135320177204994, + "grad_norm": 0.23722546949444184, + "learning_rate": 3.4951857988940475e-06, + "loss": 0.1515, + "step": 2567 + }, + { + "epoch": 4.13693113169553, + "grad_norm": 0.22131787664167898, + "learning_rate": 3.482477179828505e-06, + "loss": 0.1412, + "step": 2568 + }, + { + "epoch": 4.138542086186066, + "grad_norm": 0.21918030255310963, + "learning_rate": 3.4697895036335204e-06, + "loss": 0.1545, + "step": 2569 + }, + { + "epoch": 4.1401530406766005, + "grad_norm": 0.22054026589218842, + "learning_rate": 3.457122786396032e-06, + "loss": 0.14, + "step": 2570 + }, + { + "epoch": 4.141763995167136, + "grad_norm": 0.22504736458991076, + "learning_rate": 3.4444770441764043e-06, + "loss": 0.1484, + "step": 2571 + }, + { + "epoch": 4.143374949657672, + "grad_norm": 0.21902661316640898, + "learning_rate": 3.431852293008391e-06, + "loss": 0.1498, + "step": 2572 + }, + { + "epoch": 4.144985904148208, + "grad_norm": 0.2210425406056193, + "learning_rate": 3.419248548899168e-06, + "loss": 0.1465, + "step": 2573 + }, + { + "epoch": 4.146596858638744, + "grad_norm": 0.22089200704994782, + "learning_rate": 3.406665827829243e-06, + "loss": 0.1562, + "step": 2574 + }, + { + "epoch": 4.148207813129279, + "grad_norm": 0.2252789843720791, + "learning_rate": 3.3941041457524748e-06, + "loss": 0.1465, + "step": 2575 + }, + { + "epoch": 4.149818767619815, + "grad_norm": 0.23272666378438298, + "learning_rate": 3.381563518596056e-06, + "loss": 0.1466, + "step": 2576 + }, + { + "epoch": 4.15142972211035, + "grad_norm": 0.22958528962750838, + "learning_rate": 3.3690439622604832e-06, + "loss": 0.1347, + "step": 2577 + }, + { + "epoch": 4.153040676600886, + "grad_norm": 0.2255442065793069, + "learning_rate": 3.3565454926195252e-06, + "loss": 0.1469, + "step": 2578 + }, + { + "epoch": 4.154651631091422, + "grad_norm": 0.21542881986949647, + "learning_rate": 3.344068125520219e-06, + "loss": 0.1515, + "step": 2579 + }, + { + "epoch": 4.156262585581957, + "grad_norm": 0.22530311793594246, + "learning_rate": 3.3316118767828498e-06, + "loss": 0.1542, + "step": 2580 + }, + { + "epoch": 4.157873540072493, + "grad_norm": 0.2156195896321625, + "learning_rate": 3.3191767622009307e-06, + "loss": 0.1528, + "step": 2581 + }, + { + "epoch": 4.159484494563029, + "grad_norm": 0.21841843601381836, + "learning_rate": 3.3067627975411675e-06, + "loss": 0.1496, + "step": 2582 + }, + { + "epoch": 4.1610954490535645, + "grad_norm": 0.23200002731345346, + "learning_rate": 3.294369998543443e-06, + "loss": 0.1515, + "step": 2583 + }, + { + "epoch": 4.1627064035440995, + "grad_norm": 0.23379438316230453, + "learning_rate": 3.2819983809208346e-06, + "loss": 0.1475, + "step": 2584 + }, + { + "epoch": 4.164317358034635, + "grad_norm": 0.21329943795679254, + "learning_rate": 3.269647960359532e-06, + "loss": 0.1473, + "step": 2585 + }, + { + "epoch": 4.165928312525171, + "grad_norm": 0.23135542202637302, + "learning_rate": 3.257318752518859e-06, + "loss": 0.1548, + "step": 2586 + }, + { + "epoch": 4.167539267015707, + "grad_norm": 0.21343772694404733, + "learning_rate": 3.2450107730312473e-06, + "loss": 0.1397, + "step": 2587 + }, + { + "epoch": 4.169150221506243, + "grad_norm": 0.23309642536401526, + "learning_rate": 3.232724037502215e-06, + "loss": 0.1599, + "step": 2588 + }, + { + "epoch": 4.170761175996778, + "grad_norm": 0.23103447802019342, + "learning_rate": 3.2204585615103355e-06, + "loss": 0.1534, + "step": 2589 + }, + { + "epoch": 4.172372130487314, + "grad_norm": 0.2270645147859799, + "learning_rate": 3.20821436060722e-06, + "loss": 0.1529, + "step": 2590 + }, + { + "epoch": 4.173983084977849, + "grad_norm": 0.23814634653183514, + "learning_rate": 3.195991450317537e-06, + "loss": 0.1466, + "step": 2591 + }, + { + "epoch": 4.175594039468385, + "grad_norm": 0.22830584321137043, + "learning_rate": 3.183789846138927e-06, + "loss": 0.1445, + "step": 2592 + }, + { + "epoch": 4.177204993958921, + "grad_norm": 0.21871275817927927, + "learning_rate": 3.1716095635420265e-06, + "loss": 0.1581, + "step": 2593 + }, + { + "epoch": 4.178815948449456, + "grad_norm": 0.22189336798505455, + "learning_rate": 3.159450617970441e-06, + "loss": 0.1572, + "step": 2594 + }, + { + "epoch": 4.180426902939992, + "grad_norm": 0.21870086409019102, + "learning_rate": 3.1473130248407278e-06, + "loss": 0.1366, + "step": 2595 + }, + { + "epoch": 4.182037857430528, + "grad_norm": 0.22775597716347107, + "learning_rate": 3.1351967995423594e-06, + "loss": 0.1527, + "step": 2596 + }, + { + "epoch": 4.1836488119210635, + "grad_norm": 0.22276143260948325, + "learning_rate": 3.1231019574377153e-06, + "loss": 0.1452, + "step": 2597 + }, + { + "epoch": 4.1852597664115985, + "grad_norm": 0.2178191763823298, + "learning_rate": 3.111028513862071e-06, + "loss": 0.149, + "step": 2598 + }, + { + "epoch": 4.186870720902134, + "grad_norm": 0.22160329576960924, + "learning_rate": 3.0989764841235746e-06, + "loss": 0.1484, + "step": 2599 + }, + { + "epoch": 4.18848167539267, + "grad_norm": 0.22102821797818814, + "learning_rate": 3.0869458835032097e-06, + "loss": 0.1469, + "step": 2600 + }, + { + "epoch": 4.190092629883206, + "grad_norm": 0.22697752307908553, + "learning_rate": 3.074936727254785e-06, + "loss": 0.1544, + "step": 2601 + }, + { + "epoch": 4.191703584373742, + "grad_norm": 0.22392044668301703, + "learning_rate": 3.0629490306049536e-06, + "loss": 0.1477, + "step": 2602 + }, + { + "epoch": 4.193314538864277, + "grad_norm": 0.22404327159580825, + "learning_rate": 3.0509828087531224e-06, + "loss": 0.1547, + "step": 2603 + }, + { + "epoch": 4.194925493354813, + "grad_norm": 0.23084898184198646, + "learning_rate": 3.039038076871481e-06, + "loss": 0.1544, + "step": 2604 + }, + { + "epoch": 4.196536447845348, + "grad_norm": 0.2232139079228995, + "learning_rate": 3.0271148501049796e-06, + "loss": 0.1402, + "step": 2605 + }, + { + "epoch": 4.198147402335884, + "grad_norm": 0.22551484154069532, + "learning_rate": 3.0152131435713007e-06, + "loss": 0.1501, + "step": 2606 + }, + { + "epoch": 4.19975835682642, + "grad_norm": 0.2282570119452304, + "learning_rate": 3.003332972360831e-06, + "loss": 0.154, + "step": 2607 + }, + { + "epoch": 4.201369311316955, + "grad_norm": 0.2162893984726015, + "learning_rate": 2.9914743515366516e-06, + "loss": 0.1439, + "step": 2608 + }, + { + "epoch": 4.202980265807491, + "grad_norm": 0.2221742722251089, + "learning_rate": 2.9796372961345364e-06, + "loss": 0.1533, + "step": 2609 + }, + { + "epoch": 4.204591220298027, + "grad_norm": 0.2333853238695215, + "learning_rate": 2.967821821162904e-06, + "loss": 0.1454, + "step": 2610 + }, + { + "epoch": 4.2062021747885625, + "grad_norm": 0.2120253738307176, + "learning_rate": 2.9560279416028102e-06, + "loss": 0.1506, + "step": 2611 + }, + { + "epoch": 4.2078131292790975, + "grad_norm": 0.2140333611346206, + "learning_rate": 2.944255672407925e-06, + "loss": 0.1478, + "step": 2612 + }, + { + "epoch": 4.209424083769633, + "grad_norm": 0.23068436963355107, + "learning_rate": 2.932505028504531e-06, + "loss": 0.1441, + "step": 2613 + }, + { + "epoch": 4.211035038260169, + "grad_norm": 0.21776274650570995, + "learning_rate": 2.9207760247914895e-06, + "loss": 0.1479, + "step": 2614 + }, + { + "epoch": 4.212645992750705, + "grad_norm": 0.23364491125107575, + "learning_rate": 2.909068676140212e-06, + "loss": 0.1507, + "step": 2615 + }, + { + "epoch": 4.214256947241241, + "grad_norm": 0.2212861004899993, + "learning_rate": 2.8973829973946645e-06, + "loss": 0.1443, + "step": 2616 + }, + { + "epoch": 4.215867901731776, + "grad_norm": 0.23588997528901365, + "learning_rate": 2.8857190033713343e-06, + "loss": 0.1414, + "step": 2617 + }, + { + "epoch": 4.217478856222312, + "grad_norm": 0.22836799572319763, + "learning_rate": 2.874076708859217e-06, + "loss": 0.1497, + "step": 2618 + }, + { + "epoch": 4.219089810712847, + "grad_norm": 0.21435982154109828, + "learning_rate": 2.8624561286197793e-06, + "loss": 0.1328, + "step": 2619 + }, + { + "epoch": 4.220700765203383, + "grad_norm": 0.2221623295059318, + "learning_rate": 2.850857277386978e-06, + "loss": 0.1594, + "step": 2620 + }, + { + "epoch": 4.222311719693919, + "grad_norm": 0.2224970582246465, + "learning_rate": 2.8392801698672135e-06, + "loss": 0.162, + "step": 2621 + }, + { + "epoch": 4.223922674184454, + "grad_norm": 0.2155490975094429, + "learning_rate": 2.827724820739306e-06, + "loss": 0.1299, + "step": 2622 + }, + { + "epoch": 4.22553362867499, + "grad_norm": 0.22520553516276554, + "learning_rate": 2.8161912446544916e-06, + "loss": 0.1516, + "step": 2623 + }, + { + "epoch": 4.227144583165526, + "grad_norm": 0.2258052727971873, + "learning_rate": 2.80467945623641e-06, + "loss": 0.1415, + "step": 2624 + }, + { + "epoch": 4.2287555376560615, + "grad_norm": 0.21802617466661803, + "learning_rate": 2.7931894700810703e-06, + "loss": 0.141, + "step": 2625 + }, + { + "epoch": 4.230366492146596, + "grad_norm": 0.22525103510627617, + "learning_rate": 2.781721300756828e-06, + "loss": 0.1526, + "step": 2626 + }, + { + "epoch": 4.231977446637132, + "grad_norm": 0.2225143843203524, + "learning_rate": 2.7702749628043933e-06, + "loss": 0.1539, + "step": 2627 + }, + { + "epoch": 4.233588401127668, + "grad_norm": 0.22398176140606088, + "learning_rate": 2.7588504707367913e-06, + "loss": 0.155, + "step": 2628 + }, + { + "epoch": 4.235199355618204, + "grad_norm": 0.21834762012587516, + "learning_rate": 2.747447839039339e-06, + "loss": 0.1572, + "step": 2629 + }, + { + "epoch": 4.23681031010874, + "grad_norm": 0.21892692454842144, + "learning_rate": 2.7360670821696422e-06, + "loss": 0.1458, + "step": 2630 + }, + { + "epoch": 4.238421264599275, + "grad_norm": 0.22088363148564538, + "learning_rate": 2.724708214557572e-06, + "loss": 0.1455, + "step": 2631 + }, + { + "epoch": 4.2400322190898105, + "grad_norm": 0.23026390864295404, + "learning_rate": 2.71337125060525e-06, + "loss": 0.1396, + "step": 2632 + }, + { + "epoch": 4.241643173580346, + "grad_norm": 0.2290115443147785, + "learning_rate": 2.702056204687018e-06, + "loss": 0.1526, + "step": 2633 + }, + { + "epoch": 4.243254128070882, + "grad_norm": 0.2329308138537628, + "learning_rate": 2.6907630911494287e-06, + "loss": 0.156, + "step": 2634 + }, + { + "epoch": 4.244865082561418, + "grad_norm": 0.2129377543079953, + "learning_rate": 2.679491924311226e-06, + "loss": 0.1425, + "step": 2635 + }, + { + "epoch": 4.246476037051953, + "grad_norm": 0.2277121827224005, + "learning_rate": 2.668242718463341e-06, + "loss": 0.1403, + "step": 2636 + }, + { + "epoch": 4.248086991542489, + "grad_norm": 0.22813667865109089, + "learning_rate": 2.657015487868835e-06, + "loss": 0.1505, + "step": 2637 + }, + { + "epoch": 4.249697946033025, + "grad_norm": 0.22983529188220994, + "learning_rate": 2.6458102467629275e-06, + "loss": 0.1402, + "step": 2638 + }, + { + "epoch": 4.2513089005235605, + "grad_norm": 0.21977565245784658, + "learning_rate": 2.6346270093529457e-06, + "loss": 0.1324, + "step": 2639 + }, + { + "epoch": 4.252919855014095, + "grad_norm": 0.2170974293585535, + "learning_rate": 2.623465789818327e-06, + "loss": 0.141, + "step": 2640 + }, + { + "epoch": 4.254530809504631, + "grad_norm": 0.2310399402068678, + "learning_rate": 2.6123266023105774e-06, + "loss": 0.1571, + "step": 2641 + }, + { + "epoch": 4.256141763995167, + "grad_norm": 0.22177073988062584, + "learning_rate": 2.6012094609532845e-06, + "loss": 0.1489, + "step": 2642 + }, + { + "epoch": 4.257752718485703, + "grad_norm": 0.2151846397977056, + "learning_rate": 2.5901143798420792e-06, + "loss": 0.1506, + "step": 2643 + }, + { + "epoch": 4.259363672976239, + "grad_norm": 0.21855697442042452, + "learning_rate": 2.579041373044613e-06, + "loss": 0.1439, + "step": 2644 + }, + { + "epoch": 4.260974627466774, + "grad_norm": 0.2208457852679748, + "learning_rate": 2.5679904546005507e-06, + "loss": 0.1447, + "step": 2645 + }, + { + "epoch": 4.2625855819573095, + "grad_norm": 0.22198003753575765, + "learning_rate": 2.5569616385215625e-06, + "loss": 0.1485, + "step": 2646 + }, + { + "epoch": 4.264196536447845, + "grad_norm": 0.21034373583782578, + "learning_rate": 2.5459549387912843e-06, + "loss": 0.1474, + "step": 2647 + }, + { + "epoch": 4.265807490938381, + "grad_norm": 0.22413997527867038, + "learning_rate": 2.5349703693653103e-06, + "loss": 0.1454, + "step": 2648 + }, + { + "epoch": 4.267418445428917, + "grad_norm": 0.22479134118096378, + "learning_rate": 2.5240079441711853e-06, + "loss": 0.1425, + "step": 2649 + }, + { + "epoch": 4.269029399919452, + "grad_norm": 0.20947722912794117, + "learning_rate": 2.5130676771083585e-06, + "loss": 0.1489, + "step": 2650 + }, + { + "epoch": 4.270640354409988, + "grad_norm": 0.22886596887508937, + "learning_rate": 2.5021495820482057e-06, + "loss": 0.1481, + "step": 2651 + }, + { + "epoch": 4.272251308900524, + "grad_norm": 0.23491024852118703, + "learning_rate": 2.4912536728339707e-06, + "loss": 0.1446, + "step": 2652 + }, + { + "epoch": 4.2738622633910595, + "grad_norm": 0.2414302632894677, + "learning_rate": 2.4803799632807836e-06, + "loss": 0.1521, + "step": 2653 + }, + { + "epoch": 4.275473217881595, + "grad_norm": 0.21026196842028957, + "learning_rate": 2.4695284671756215e-06, + "loss": 0.1489, + "step": 2654 + }, + { + "epoch": 4.27708417237213, + "grad_norm": 0.228563137088654, + "learning_rate": 2.45869919827729e-06, + "loss": 0.1428, + "step": 2655 + }, + { + "epoch": 4.278695126862666, + "grad_norm": 0.21717230893266137, + "learning_rate": 2.4478921703164236e-06, + "loss": 0.1423, + "step": 2656 + }, + { + "epoch": 4.280306081353202, + "grad_norm": 0.22308055219912287, + "learning_rate": 2.4371073969954463e-06, + "loss": 0.1366, + "step": 2657 + }, + { + "epoch": 4.281917035843738, + "grad_norm": 0.22580670108095363, + "learning_rate": 2.4263448919885745e-06, + "loss": 0.1457, + "step": 2658 + }, + { + "epoch": 4.283527990334273, + "grad_norm": 0.21867003917030386, + "learning_rate": 2.4156046689417823e-06, + "loss": 0.149, + "step": 2659 + }, + { + "epoch": 4.2851389448248085, + "grad_norm": 0.21703864427650707, + "learning_rate": 2.4048867414728004e-06, + "loss": 0.1553, + "step": 2660 + }, + { + "epoch": 4.286749899315344, + "grad_norm": 0.22892483066011338, + "learning_rate": 2.394191123171081e-06, + "loss": 0.1364, + "step": 2661 + }, + { + "epoch": 4.28836085380588, + "grad_norm": 0.22083545583834652, + "learning_rate": 2.3835178275978012e-06, + "loss": 0.1294, + "step": 2662 + }, + { + "epoch": 4.289971808296416, + "grad_norm": 0.22446324510789786, + "learning_rate": 2.3728668682858193e-06, + "loss": 0.1419, + "step": 2663 + }, + { + "epoch": 4.291582762786951, + "grad_norm": 0.23116257281704497, + "learning_rate": 2.3622382587396907e-06, + "loss": 0.1445, + "step": 2664 + }, + { + "epoch": 4.293193717277487, + "grad_norm": 0.21998960949875104, + "learning_rate": 2.3516320124356186e-06, + "loss": 0.151, + "step": 2665 + }, + { + "epoch": 4.294804671768023, + "grad_norm": 0.2147293966021974, + "learning_rate": 2.3410481428214602e-06, + "loss": 0.1543, + "step": 2666 + }, + { + "epoch": 4.296415626258558, + "grad_norm": 0.21872334682296563, + "learning_rate": 2.330486663316702e-06, + "loss": 0.1503, + "step": 2667 + }, + { + "epoch": 4.298026580749093, + "grad_norm": 0.22709801396680795, + "learning_rate": 2.31994758731243e-06, + "loss": 0.1394, + "step": 2668 + }, + { + "epoch": 4.299637535239629, + "grad_norm": 0.22239888881280748, + "learning_rate": 2.309430928171341e-06, + "loss": 0.1456, + "step": 2669 + }, + { + "epoch": 4.301248489730165, + "grad_norm": 0.22267661198438218, + "learning_rate": 2.2989366992276917e-06, + "loss": 0.14, + "step": 2670 + }, + { + "epoch": 4.302859444220701, + "grad_norm": 0.22089398672152466, + "learning_rate": 2.288464913787316e-06, + "loss": 0.1505, + "step": 2671 + }, + { + "epoch": 4.304470398711237, + "grad_norm": 0.22234097375462933, + "learning_rate": 2.278015585127573e-06, + "loss": 0.1365, + "step": 2672 + }, + { + "epoch": 4.306081353201772, + "grad_norm": 0.2268142943570914, + "learning_rate": 2.2675887264973694e-06, + "loss": 0.1521, + "step": 2673 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 0.20658475916341448, + "learning_rate": 2.257184351117101e-06, + "loss": 0.1387, + "step": 2674 + }, + { + "epoch": 4.309303262182843, + "grad_norm": 0.2302415242479479, + "learning_rate": 2.246802472178675e-06, + "loss": 0.1446, + "step": 2675 + }, + { + "epoch": 4.310914216673379, + "grad_norm": 0.22560731900901113, + "learning_rate": 2.236443102845458e-06, + "loss": 0.1525, + "step": 2676 + }, + { + "epoch": 4.312525171163915, + "grad_norm": 0.2234748667770128, + "learning_rate": 2.2261062562522896e-06, + "loss": 0.1549, + "step": 2677 + }, + { + "epoch": 4.31413612565445, + "grad_norm": 0.22870884705411906, + "learning_rate": 2.21579194550545e-06, + "loss": 0.1386, + "step": 2678 + }, + { + "epoch": 4.315747080144986, + "grad_norm": 0.2224606918323668, + "learning_rate": 2.2055001836826364e-06, + "loss": 0.1455, + "step": 2679 + }, + { + "epoch": 4.317358034635522, + "grad_norm": 0.22036372420401462, + "learning_rate": 2.19523098383297e-06, + "loss": 0.1508, + "step": 2680 + }, + { + "epoch": 4.318968989126057, + "grad_norm": 0.21510144959446054, + "learning_rate": 2.1849843589769513e-06, + "loss": 0.143, + "step": 2681 + }, + { + "epoch": 4.320579943616593, + "grad_norm": 0.2359493205380365, + "learning_rate": 2.1747603221064684e-06, + "loss": 0.1585, + "step": 2682 + }, + { + "epoch": 4.322190898107128, + "grad_norm": 0.21593001357688618, + "learning_rate": 2.164558886184762e-06, + "loss": 0.1323, + "step": 2683 + }, + { + "epoch": 4.323801852597664, + "grad_norm": 0.2258254732968412, + "learning_rate": 2.1543800641464263e-06, + "loss": 0.141, + "step": 2684 + }, + { + "epoch": 4.3254128070882, + "grad_norm": 0.22439808272135298, + "learning_rate": 2.1442238688973682e-06, + "loss": 0.1396, + "step": 2685 + }, + { + "epoch": 4.327023761578736, + "grad_norm": 0.21143922501895449, + "learning_rate": 2.1340903133148205e-06, + "loss": 0.1527, + "step": 2686 + }, + { + "epoch": 4.328634716069271, + "grad_norm": 0.22994834674226478, + "learning_rate": 2.123979410247301e-06, + "loss": 0.1544, + "step": 2687 + }, + { + "epoch": 4.3302456705598065, + "grad_norm": 0.22793741030782735, + "learning_rate": 2.1138911725146106e-06, + "loss": 0.156, + "step": 2688 + }, + { + "epoch": 4.331856625050342, + "grad_norm": 0.22320356349655823, + "learning_rate": 2.103825612907815e-06, + "loss": 0.1474, + "step": 2689 + }, + { + "epoch": 4.333467579540878, + "grad_norm": 0.23523038616188513, + "learning_rate": 2.093782744189217e-06, + "loss": 0.1417, + "step": 2690 + }, + { + "epoch": 4.335078534031414, + "grad_norm": 0.23099424056260986, + "learning_rate": 2.0837625790923543e-06, + "loss": 0.1432, + "step": 2691 + }, + { + "epoch": 4.336689488521949, + "grad_norm": 0.21683386647539887, + "learning_rate": 2.0737651303219787e-06, + "loss": 0.1584, + "step": 2692 + }, + { + "epoch": 4.338300443012485, + "grad_norm": 0.22280820135116566, + "learning_rate": 2.0637904105540428e-06, + "loss": 0.1394, + "step": 2693 + }, + { + "epoch": 4.339911397503021, + "grad_norm": 0.2281065955008403, + "learning_rate": 2.053838432435673e-06, + "loss": 0.1492, + "step": 2694 + }, + { + "epoch": 4.341522351993556, + "grad_norm": 0.20990894608507565, + "learning_rate": 2.0439092085851685e-06, + "loss": 0.1541, + "step": 2695 + }, + { + "epoch": 4.343133306484092, + "grad_norm": 0.2410571666539787, + "learning_rate": 2.0340027515919704e-06, + "loss": 0.1504, + "step": 2696 + }, + { + "epoch": 4.344744260974627, + "grad_norm": 0.21521786871981052, + "learning_rate": 2.024119074016664e-06, + "loss": 0.1458, + "step": 2697 + }, + { + "epoch": 4.346355215465163, + "grad_norm": 0.23352779441098917, + "learning_rate": 2.014258188390936e-06, + "loss": 0.1319, + "step": 2698 + }, + { + "epoch": 4.347966169955699, + "grad_norm": 0.21861563397638975, + "learning_rate": 2.0044201072175884e-06, + "loss": 0.1493, + "step": 2699 + }, + { + "epoch": 4.349577124446235, + "grad_norm": 0.23187235646614845, + "learning_rate": 1.9946048429705133e-06, + "loss": 0.1424, + "step": 2700 + }, + { + "epoch": 4.35118807893677, + "grad_norm": 0.23314062934543647, + "learning_rate": 1.984812408094656e-06, + "loss": 0.1443, + "step": 2701 + }, + { + "epoch": 4.3527990334273055, + "grad_norm": 0.22716318100135857, + "learning_rate": 1.975042815006023e-06, + "loss": 0.1506, + "step": 2702 + }, + { + "epoch": 4.354409987917841, + "grad_norm": 0.22789140251356194, + "learning_rate": 1.9652960760916627e-06, + "loss": 0.1509, + "step": 2703 + }, + { + "epoch": 4.356020942408377, + "grad_norm": 0.2298570051084699, + "learning_rate": 1.95557220370965e-06, + "loss": 0.1388, + "step": 2704 + }, + { + "epoch": 4.357631896898913, + "grad_norm": 0.22855340474841207, + "learning_rate": 1.945871210189054e-06, + "loss": 0.1609, + "step": 2705 + }, + { + "epoch": 4.359242851389448, + "grad_norm": 0.22885905350588037, + "learning_rate": 1.9361931078299443e-06, + "loss": 0.166, + "step": 2706 + }, + { + "epoch": 4.360853805879984, + "grad_norm": 0.21861981366054756, + "learning_rate": 1.92653790890337e-06, + "loss": 0.138, + "step": 2707 + }, + { + "epoch": 4.36246476037052, + "grad_norm": 0.23241034579490025, + "learning_rate": 1.916905625651331e-06, + "loss": 0.148, + "step": 2708 + }, + { + "epoch": 4.364075714861055, + "grad_norm": 0.22772695647417238, + "learning_rate": 1.9072962702867714e-06, + "loss": 0.1388, + "step": 2709 + }, + { + "epoch": 4.365686669351591, + "grad_norm": 0.21875331128783337, + "learning_rate": 1.8977098549935745e-06, + "loss": 0.1417, + "step": 2710 + }, + { + "epoch": 4.367297623842126, + "grad_norm": 0.22606749157296668, + "learning_rate": 1.8881463919265374e-06, + "loss": 0.1512, + "step": 2711 + }, + { + "epoch": 4.368908578332662, + "grad_norm": 0.232376385563654, + "learning_rate": 1.8786058932113428e-06, + "loss": 0.1519, + "step": 2712 + }, + { + "epoch": 4.370519532823198, + "grad_norm": 0.22299262190690977, + "learning_rate": 1.8690883709445652e-06, + "loss": 0.137, + "step": 2713 + }, + { + "epoch": 4.372130487313734, + "grad_norm": 0.22002068793259277, + "learning_rate": 1.859593837193645e-06, + "loss": 0.1624, + "step": 2714 + }, + { + "epoch": 4.373741441804269, + "grad_norm": 0.21192028521319847, + "learning_rate": 1.850122303996882e-06, + "loss": 0.1363, + "step": 2715 + }, + { + "epoch": 4.375352396294804, + "grad_norm": 0.21021284454907238, + "learning_rate": 1.8406737833634024e-06, + "loss": 0.142, + "step": 2716 + }, + { + "epoch": 4.37696335078534, + "grad_norm": 0.21658493528620826, + "learning_rate": 1.8312482872731553e-06, + "loss": 0.1456, + "step": 2717 + }, + { + "epoch": 4.378574305275876, + "grad_norm": 0.226219598466499, + "learning_rate": 1.8218458276769091e-06, + "loss": 0.1518, + "step": 2718 + }, + { + "epoch": 4.380185259766412, + "grad_norm": 0.21769790494716323, + "learning_rate": 1.8124664164962124e-06, + "loss": 0.1481, + "step": 2719 + }, + { + "epoch": 4.381796214256947, + "grad_norm": 0.22650672628751944, + "learning_rate": 1.803110065623388e-06, + "loss": 0.1566, + "step": 2720 + }, + { + "epoch": 4.383407168747483, + "grad_norm": 0.2125040635428062, + "learning_rate": 1.7937767869215284e-06, + "loss": 0.1363, + "step": 2721 + }, + { + "epoch": 4.3850181232380185, + "grad_norm": 0.22322895400847495, + "learning_rate": 1.784466592224472e-06, + "loss": 0.1463, + "step": 2722 + }, + { + "epoch": 4.386629077728554, + "grad_norm": 0.21766007587217553, + "learning_rate": 1.7751794933367828e-06, + "loss": 0.1509, + "step": 2723 + }, + { + "epoch": 4.38824003221909, + "grad_norm": 0.22239483211233982, + "learning_rate": 1.7659155020337392e-06, + "loss": 0.1552, + "step": 2724 + }, + { + "epoch": 4.389850986709625, + "grad_norm": 0.22028648628947187, + "learning_rate": 1.7566746300613325e-06, + "loss": 0.1551, + "step": 2725 + }, + { + "epoch": 4.391461941200161, + "grad_norm": 0.2212460302725368, + "learning_rate": 1.7474568891362342e-06, + "loss": 0.1388, + "step": 2726 + }, + { + "epoch": 4.393072895690697, + "grad_norm": 0.2215072355482207, + "learning_rate": 1.738262290945787e-06, + "loss": 0.1539, + "step": 2727 + }, + { + "epoch": 4.394683850181233, + "grad_norm": 0.21970291081765841, + "learning_rate": 1.7290908471479805e-06, + "loss": 0.1612, + "step": 2728 + }, + { + "epoch": 4.3962948046717685, + "grad_norm": 0.23823342781119003, + "learning_rate": 1.7199425693714733e-06, + "loss": 0.1489, + "step": 2729 + }, + { + "epoch": 4.397905759162303, + "grad_norm": 0.21723293567882335, + "learning_rate": 1.7108174692155266e-06, + "loss": 0.1489, + "step": 2730 + }, + { + "epoch": 4.399516713652839, + "grad_norm": 0.22473337832629095, + "learning_rate": 1.701715558250019e-06, + "loss": 0.1339, + "step": 2731 + }, + { + "epoch": 4.401127668143375, + "grad_norm": 0.23218370282919404, + "learning_rate": 1.6926368480154344e-06, + "loss": 0.1534, + "step": 2732 + }, + { + "epoch": 4.402738622633911, + "grad_norm": 0.22527228406674732, + "learning_rate": 1.683581350022838e-06, + "loss": 0.1492, + "step": 2733 + }, + { + "epoch": 4.404349577124446, + "grad_norm": 0.22299618930067308, + "learning_rate": 1.674549075753862e-06, + "loss": 0.1559, + "step": 2734 + }, + { + "epoch": 4.405960531614982, + "grad_norm": 0.2173548439071365, + "learning_rate": 1.6655400366606867e-06, + "loss": 0.1378, + "step": 2735 + }, + { + "epoch": 4.4075714861055175, + "grad_norm": 0.21748819640917166, + "learning_rate": 1.656554244166042e-06, + "loss": 0.1418, + "step": 2736 + }, + { + "epoch": 4.409182440596053, + "grad_norm": 0.22286227976513667, + "learning_rate": 1.6475917096631855e-06, + "loss": 0.1665, + "step": 2737 + }, + { + "epoch": 4.410793395086589, + "grad_norm": 0.21834466918449652, + "learning_rate": 1.6386524445158714e-06, + "loss": 0.1451, + "step": 2738 + }, + { + "epoch": 4.412404349577124, + "grad_norm": 0.2155920749793083, + "learning_rate": 1.6297364600583554e-06, + "loss": 0.1488, + "step": 2739 + }, + { + "epoch": 4.41401530406766, + "grad_norm": 0.21694950266493748, + "learning_rate": 1.620843767595388e-06, + "loss": 0.1281, + "step": 2740 + }, + { + "epoch": 4.415626258558196, + "grad_norm": 0.22513033325891293, + "learning_rate": 1.6119743784021725e-06, + "loss": 0.1465, + "step": 2741 + }, + { + "epoch": 4.417237213048732, + "grad_norm": 0.2110624349498471, + "learning_rate": 1.6031283037243684e-06, + "loss": 0.1494, + "step": 2742 + }, + { + "epoch": 4.418848167539267, + "grad_norm": 0.22603749621498606, + "learning_rate": 1.594305554778075e-06, + "loss": 0.1497, + "step": 2743 + }, + { + "epoch": 4.420459122029802, + "grad_norm": 0.23203113436121725, + "learning_rate": 1.5855061427498263e-06, + "loss": 0.1441, + "step": 2744 + }, + { + "epoch": 4.422070076520338, + "grad_norm": 0.23890073771122725, + "learning_rate": 1.5767300787965512e-06, + "loss": 0.1569, + "step": 2745 + }, + { + "epoch": 4.423681031010874, + "grad_norm": 0.21934839826909436, + "learning_rate": 1.5679773740455817e-06, + "loss": 0.156, + "step": 2746 + }, + { + "epoch": 4.42529198550141, + "grad_norm": 0.21682203320832888, + "learning_rate": 1.5592480395946342e-06, + "loss": 0.1399, + "step": 2747 + }, + { + "epoch": 4.426902939991945, + "grad_norm": 0.21588619784464438, + "learning_rate": 1.5505420865117993e-06, + "loss": 0.1427, + "step": 2748 + }, + { + "epoch": 4.428513894482481, + "grad_norm": 0.2196132757827625, + "learning_rate": 1.541859525835505e-06, + "loss": 0.1416, + "step": 2749 + }, + { + "epoch": 4.4301248489730165, + "grad_norm": 0.22650353348090568, + "learning_rate": 1.5332003685745279e-06, + "loss": 0.1428, + "step": 2750 + }, + { + "epoch": 4.431735803463552, + "grad_norm": 0.2163231272238479, + "learning_rate": 1.524564625707985e-06, + "loss": 0.1592, + "step": 2751 + }, + { + "epoch": 4.433346757954088, + "grad_norm": 0.21345314169858762, + "learning_rate": 1.5159523081852867e-06, + "loss": 0.1491, + "step": 2752 + }, + { + "epoch": 4.434957712444623, + "grad_norm": 0.21724471459393288, + "learning_rate": 1.5073634269261427e-06, + "loss": 0.1437, + "step": 2753 + }, + { + "epoch": 4.436568666935159, + "grad_norm": 0.21107110747167726, + "learning_rate": 1.4987979928205599e-06, + "loss": 0.1498, + "step": 2754 + }, + { + "epoch": 4.438179621425695, + "grad_norm": 0.23243728370562228, + "learning_rate": 1.4902560167288105e-06, + "loss": 0.1445, + "step": 2755 + }, + { + "epoch": 4.439790575916231, + "grad_norm": 0.20420200944872982, + "learning_rate": 1.4817375094814202e-06, + "loss": 0.1411, + "step": 2756 + }, + { + "epoch": 4.441401530406766, + "grad_norm": 0.23431831960543478, + "learning_rate": 1.473242481879158e-06, + "loss": 0.1596, + "step": 2757 + }, + { + "epoch": 4.443012484897301, + "grad_norm": 0.22563203510054342, + "learning_rate": 1.464770944693028e-06, + "loss": 0.1306, + "step": 2758 + }, + { + "epoch": 4.444623439387837, + "grad_norm": 0.21898437843550095, + "learning_rate": 1.4563229086642538e-06, + "loss": 0.1639, + "step": 2759 + }, + { + "epoch": 4.446234393878373, + "grad_norm": 0.21799291473907836, + "learning_rate": 1.4478983845042493e-06, + "loss": 0.1409, + "step": 2760 + }, + { + "epoch": 4.447845348368909, + "grad_norm": 0.21834671784511098, + "learning_rate": 1.439497382894617e-06, + "loss": 0.1451, + "step": 2761 + }, + { + "epoch": 4.449456302859444, + "grad_norm": 0.21784261633030536, + "learning_rate": 1.4311199144871534e-06, + "loss": 0.1498, + "step": 2762 + }, + { + "epoch": 4.45106725734998, + "grad_norm": 0.21551061217502962, + "learning_rate": 1.4227659899038004e-06, + "loss": 0.1483, + "step": 2763 + }, + { + "epoch": 4.4526782118405155, + "grad_norm": 0.22482929437855542, + "learning_rate": 1.4144356197366494e-06, + "loss": 0.1447, + "step": 2764 + }, + { + "epoch": 4.454289166331051, + "grad_norm": 0.21817533638318648, + "learning_rate": 1.406128814547929e-06, + "loss": 0.1583, + "step": 2765 + }, + { + "epoch": 4.455900120821587, + "grad_norm": 0.22826403127027628, + "learning_rate": 1.397845584869999e-06, + "loss": 0.1391, + "step": 2766 + }, + { + "epoch": 4.457511075312122, + "grad_norm": 0.22368572318707833, + "learning_rate": 1.3895859412053093e-06, + "loss": 0.1378, + "step": 2767 + }, + { + "epoch": 4.459122029802658, + "grad_norm": 0.2249853087961331, + "learning_rate": 1.3813498940264136e-06, + "loss": 0.1463, + "step": 2768 + }, + { + "epoch": 4.460732984293194, + "grad_norm": 0.2185064787669113, + "learning_rate": 1.3731374537759544e-06, + "loss": 0.1396, + "step": 2769 + }, + { + "epoch": 4.46234393878373, + "grad_norm": 0.21864596897479208, + "learning_rate": 1.3649486308666314e-06, + "loss": 0.1565, + "step": 2770 + }, + { + "epoch": 4.4639548932742645, + "grad_norm": 0.22727139547878636, + "learning_rate": 1.3567834356812015e-06, + "loss": 0.1437, + "step": 2771 + }, + { + "epoch": 4.4655658477648, + "grad_norm": 0.2233584478305029, + "learning_rate": 1.3486418785724697e-06, + "loss": 0.144, + "step": 2772 + }, + { + "epoch": 4.467176802255336, + "grad_norm": 0.351620925278512, + "learning_rate": 1.3405239698632654e-06, + "loss": 0.1532, + "step": 2773 + }, + { + "epoch": 4.468787756745872, + "grad_norm": 0.2215381503484612, + "learning_rate": 1.332429719846433e-06, + "loss": 0.1398, + "step": 2774 + }, + { + "epoch": 4.470398711236408, + "grad_norm": 0.2232986049913044, + "learning_rate": 1.3243591387848164e-06, + "loss": 0.1485, + "step": 2775 + }, + { + "epoch": 4.472009665726943, + "grad_norm": 0.2202202269211146, + "learning_rate": 1.3163122369112591e-06, + "loss": 0.1574, + "step": 2776 + }, + { + "epoch": 4.473620620217479, + "grad_norm": 0.22918240497697517, + "learning_rate": 1.3082890244285773e-06, + "loss": 0.1447, + "step": 2777 + }, + { + "epoch": 4.4752315747080145, + "grad_norm": 0.22566546540240492, + "learning_rate": 1.300289511509547e-06, + "loss": 0.1463, + "step": 2778 + }, + { + "epoch": 4.47684252919855, + "grad_norm": 0.21415981839053075, + "learning_rate": 1.292313708296893e-06, + "loss": 0.1542, + "step": 2779 + }, + { + "epoch": 4.478453483689086, + "grad_norm": 0.221053938368025, + "learning_rate": 1.2843616249032874e-06, + "loss": 0.1522, + "step": 2780 + }, + { + "epoch": 4.480064438179621, + "grad_norm": 0.21618116765332307, + "learning_rate": 1.2764332714113258e-06, + "loss": 0.1546, + "step": 2781 + }, + { + "epoch": 4.481675392670157, + "grad_norm": 0.21974552955078566, + "learning_rate": 1.2685286578735045e-06, + "loss": 0.143, + "step": 2782 + }, + { + "epoch": 4.483286347160693, + "grad_norm": 0.22078570986976645, + "learning_rate": 1.2606477943122352e-06, + "loss": 0.1449, + "step": 2783 + }, + { + "epoch": 4.484897301651229, + "grad_norm": 0.22916711988692676, + "learning_rate": 1.2527906907198094e-06, + "loss": 0.1422, + "step": 2784 + }, + { + "epoch": 4.486508256141764, + "grad_norm": 0.22001716488332673, + "learning_rate": 1.244957357058394e-06, + "loss": 0.1442, + "step": 2785 + }, + { + "epoch": 4.488119210632299, + "grad_norm": 0.22386383475134952, + "learning_rate": 1.2371478032600083e-06, + "loss": 0.1601, + "step": 2786 + }, + { + "epoch": 4.489730165122835, + "grad_norm": 0.21664981131161265, + "learning_rate": 1.2293620392265338e-06, + "loss": 0.1415, + "step": 2787 + }, + { + "epoch": 4.491341119613371, + "grad_norm": 0.22629217846588368, + "learning_rate": 1.2216000748296897e-06, + "loss": 0.1433, + "step": 2788 + }, + { + "epoch": 4.492952074103907, + "grad_norm": 0.235019336289307, + "learning_rate": 1.213861919911008e-06, + "loss": 0.1494, + "step": 2789 + }, + { + "epoch": 4.494563028594442, + "grad_norm": 0.21964906508530987, + "learning_rate": 1.2061475842818337e-06, + "loss": 0.1425, + "step": 2790 + }, + { + "epoch": 4.496173983084978, + "grad_norm": 0.21452579914651837, + "learning_rate": 1.1984570777233184e-06, + "loss": 0.1548, + "step": 2791 + }, + { + "epoch": 4.4977849375755135, + "grad_norm": 0.22707174901989768, + "learning_rate": 1.1907904099863999e-06, + "loss": 0.1631, + "step": 2792 + }, + { + "epoch": 4.499395892066049, + "grad_norm": 0.2155179519101744, + "learning_rate": 1.18314759079178e-06, + "loss": 0.1443, + "step": 2793 + }, + { + "epoch": 4.501006846556585, + "grad_norm": 0.2226635289792672, + "learning_rate": 1.1755286298299339e-06, + "loss": 0.161, + "step": 2794 + }, + { + "epoch": 4.50261780104712, + "grad_norm": 0.22080540831878404, + "learning_rate": 1.1679335367610855e-06, + "loss": 0.1413, + "step": 2795 + }, + { + "epoch": 4.504228755537656, + "grad_norm": 0.22308278536499784, + "learning_rate": 1.1603623212151872e-06, + "loss": 0.1365, + "step": 2796 + }, + { + "epoch": 4.505839710028192, + "grad_norm": 0.23108606533673448, + "learning_rate": 1.152814992791922e-06, + "loss": 0.1393, + "step": 2797 + }, + { + "epoch": 4.507450664518728, + "grad_norm": 0.21861727723987914, + "learning_rate": 1.1452915610606885e-06, + "loss": 0.1497, + "step": 2798 + }, + { + "epoch": 4.5090616190092625, + "grad_norm": 0.2218833710624545, + "learning_rate": 1.1377920355605854e-06, + "loss": 0.1468, + "step": 2799 + }, + { + "epoch": 4.510672573499798, + "grad_norm": 0.21908566335116011, + "learning_rate": 1.1303164258003974e-06, + "loss": 0.1473, + "step": 2800 + }, + { + "epoch": 4.512283527990334, + "grad_norm": 0.21884018169901615, + "learning_rate": 1.1228647412585847e-06, + "loss": 0.1318, + "step": 2801 + }, + { + "epoch": 4.51389448248087, + "grad_norm": 0.22464962629724394, + "learning_rate": 1.1154369913832762e-06, + "loss": 0.1461, + "step": 2802 + }, + { + "epoch": 4.515505436971406, + "grad_norm": 0.2233609804572511, + "learning_rate": 1.1080331855922588e-06, + "loss": 0.1448, + "step": 2803 + }, + { + "epoch": 4.517116391461942, + "grad_norm": 0.21225483736906917, + "learning_rate": 1.100653333272943e-06, + "loss": 0.1452, + "step": 2804 + }, + { + "epoch": 4.518727345952477, + "grad_norm": 0.2269518895274179, + "learning_rate": 1.0932974437823884e-06, + "loss": 0.1451, + "step": 2805 + }, + { + "epoch": 4.520338300443012, + "grad_norm": 0.2261411870254825, + "learning_rate": 1.0859655264472568e-06, + "loss": 0.1538, + "step": 2806 + }, + { + "epoch": 4.521949254933548, + "grad_norm": 0.22345780419863742, + "learning_rate": 1.078657590563823e-06, + "loss": 0.1295, + "step": 2807 + }, + { + "epoch": 4.523560209424084, + "grad_norm": 0.21931553583779848, + "learning_rate": 1.0713736453979528e-06, + "loss": 0.1419, + "step": 2808 + }, + { + "epoch": 4.525171163914619, + "grad_norm": 0.22080634847733432, + "learning_rate": 1.064113700185092e-06, + "loss": 0.1439, + "step": 2809 + }, + { + "epoch": 4.526782118405155, + "grad_norm": 0.2261480900316319, + "learning_rate": 1.0568777641302663e-06, + "loss": 0.153, + "step": 2810 + }, + { + "epoch": 4.528393072895691, + "grad_norm": 0.22439464056795858, + "learning_rate": 1.0496658464080434e-06, + "loss": 0.1537, + "step": 2811 + }, + { + "epoch": 4.5300040273862265, + "grad_norm": 0.2248930635120482, + "learning_rate": 1.0424779561625465e-06, + "loss": 0.1488, + "step": 2812 + }, + { + "epoch": 4.531614981876762, + "grad_norm": 0.2069570637163575, + "learning_rate": 1.0353141025074364e-06, + "loss": 0.1647, + "step": 2813 + }, + { + "epoch": 4.533225936367297, + "grad_norm": 0.23132704696687886, + "learning_rate": 1.0281742945258987e-06, + "loss": 0.1462, + "step": 2814 + }, + { + "epoch": 4.534836890857833, + "grad_norm": 0.21337858571941506, + "learning_rate": 1.0210585412706187e-06, + "loss": 0.1487, + "step": 2815 + }, + { + "epoch": 4.536447845348369, + "grad_norm": 0.22309163223275602, + "learning_rate": 1.0139668517637991e-06, + "loss": 0.144, + "step": 2816 + }, + { + "epoch": 4.538058799838905, + "grad_norm": 0.21907142466944238, + "learning_rate": 1.006899234997114e-06, + "loss": 0.1446, + "step": 2817 + }, + { + "epoch": 4.53966975432944, + "grad_norm": 0.217901044651781, + "learning_rate": 9.998556999317334e-07, + "loss": 0.1367, + "step": 2818 + }, + { + "epoch": 4.541280708819976, + "grad_norm": 0.21574027266533383, + "learning_rate": 9.928362554982796e-07, + "loss": 0.1456, + "step": 2819 + }, + { + "epoch": 4.542891663310511, + "grad_norm": 0.22778052410416838, + "learning_rate": 9.858409105968337e-07, + "loss": 0.1468, + "step": 2820 + }, + { + "epoch": 4.544502617801047, + "grad_norm": 0.23571660112324896, + "learning_rate": 9.788696740969295e-07, + "loss": 0.1455, + "step": 2821 + }, + { + "epoch": 4.546113572291583, + "grad_norm": 0.2201132602209927, + "learning_rate": 9.71922554837521e-07, + "loss": 0.1446, + "step": 2822 + }, + { + "epoch": 4.547724526782118, + "grad_norm": 0.2170027257100728, + "learning_rate": 9.649995616269847e-07, + "loss": 0.1421, + "step": 2823 + }, + { + "epoch": 4.549335481272654, + "grad_norm": 0.21147627065327013, + "learning_rate": 9.581007032431144e-07, + "loss": 0.1486, + "step": 2824 + }, + { + "epoch": 4.55094643576319, + "grad_norm": 0.21922962408896876, + "learning_rate": 9.512259884331021e-07, + "loss": 0.138, + "step": 2825 + }, + { + "epoch": 4.5525573902537255, + "grad_norm": 0.21981668661472575, + "learning_rate": 9.443754259135197e-07, + "loss": 0.1415, + "step": 2826 + }, + { + "epoch": 4.554168344744261, + "grad_norm": 0.21062790215118363, + "learning_rate": 9.375490243703255e-07, + "loss": 0.1388, + "step": 2827 + }, + { + "epoch": 4.555779299234796, + "grad_norm": 0.20844877721721428, + "learning_rate": 9.307467924588364e-07, + "loss": 0.1425, + "step": 2828 + }, + { + "epoch": 4.557390253725332, + "grad_norm": 0.2169075920951353, + "learning_rate": 9.239687388037311e-07, + "loss": 0.1451, + "step": 2829 + }, + { + "epoch": 4.559001208215868, + "grad_norm": 0.2209905968566893, + "learning_rate": 9.172148719990237e-07, + "loss": 0.1419, + "step": 2830 + }, + { + "epoch": 4.560612162706404, + "grad_norm": 0.22168194810993105, + "learning_rate": 9.104852006080689e-07, + "loss": 0.1425, + "step": 2831 + }, + { + "epoch": 4.56222311719694, + "grad_norm": 0.23429944659967786, + "learning_rate": 9.03779733163539e-07, + "loss": 0.1352, + "step": 2832 + }, + { + "epoch": 4.563834071687475, + "grad_norm": 0.22251765674022653, + "learning_rate": 8.970984781674197e-07, + "loss": 0.158, + "step": 2833 + }, + { + "epoch": 4.56544502617801, + "grad_norm": 0.21439636612358917, + "learning_rate": 8.904414440909992e-07, + "loss": 0.1658, + "step": 2834 + }, + { + "epoch": 4.567055980668546, + "grad_norm": 0.21055760967769271, + "learning_rate": 8.83808639374848e-07, + "loss": 0.1406, + "step": 2835 + }, + { + "epoch": 4.568666935159082, + "grad_norm": 0.22812053637083266, + "learning_rate": 8.772000724288277e-07, + "loss": 0.1551, + "step": 2836 + }, + { + "epoch": 4.570277889649617, + "grad_norm": 0.23203123173864854, + "learning_rate": 8.706157516320557e-07, + "loss": 0.1457, + "step": 2837 + }, + { + "epoch": 4.571888844140153, + "grad_norm": 0.21647238240445146, + "learning_rate": 8.640556853329185e-07, + "loss": 0.1366, + "step": 2838 + }, + { + "epoch": 4.573499798630689, + "grad_norm": 0.2262132480851578, + "learning_rate": 8.575198818490405e-07, + "loss": 0.1532, + "step": 2839 + }, + { + "epoch": 4.5751107531212245, + "grad_norm": 0.22036901946478252, + "learning_rate": 8.510083494672905e-07, + "loss": 0.15, + "step": 2840 + }, + { + "epoch": 4.57672170761176, + "grad_norm": 0.20881526264861075, + "learning_rate": 8.445210964437556e-07, + "loss": 0.1572, + "step": 2841 + }, + { + "epoch": 4.578332662102295, + "grad_norm": 0.21997273950205165, + "learning_rate": 8.380581310037472e-07, + "loss": 0.1483, + "step": 2842 + }, + { + "epoch": 4.579943616592831, + "grad_norm": 0.2115529692321746, + "learning_rate": 8.316194613417749e-07, + "loss": 0.15, + "step": 2843 + }, + { + "epoch": 4.581554571083367, + "grad_norm": 0.2101065492853683, + "learning_rate": 8.252050956215462e-07, + "loss": 0.1327, + "step": 2844 + }, + { + "epoch": 4.583165525573903, + "grad_norm": 0.22367025563934423, + "learning_rate": 8.188150419759577e-07, + "loss": 0.1539, + "step": 2845 + }, + { + "epoch": 4.584776480064438, + "grad_norm": 0.21257442264917925, + "learning_rate": 8.124493085070706e-07, + "loss": 0.1459, + "step": 2846 + }, + { + "epoch": 4.5863874345549736, + "grad_norm": 0.2175995649893487, + "learning_rate": 8.061079032861197e-07, + "loss": 0.1362, + "step": 2847 + }, + { + "epoch": 4.587998389045509, + "grad_norm": 0.22731145011308046, + "learning_rate": 7.997908343534844e-07, + "loss": 0.1416, + "step": 2848 + }, + { + "epoch": 4.589609343536045, + "grad_norm": 0.230612422282459, + "learning_rate": 7.934981097186977e-07, + "loss": 0.1616, + "step": 2849 + }, + { + "epoch": 4.591220298026581, + "grad_norm": 0.22856353525173428, + "learning_rate": 7.872297373604154e-07, + "loss": 0.1481, + "step": 2850 + }, + { + "epoch": 4.592831252517117, + "grad_norm": 0.21655429075000812, + "learning_rate": 7.809857252264263e-07, + "loss": 0.1478, + "step": 2851 + }, + { + "epoch": 4.594442207007652, + "grad_norm": 0.2267420054274175, + "learning_rate": 7.747660812336221e-07, + "loss": 0.1386, + "step": 2852 + }, + { + "epoch": 4.596053161498188, + "grad_norm": 0.21517894418513714, + "learning_rate": 7.685708132680125e-07, + "loss": 0.1575, + "step": 2853 + }, + { + "epoch": 4.5976641159887235, + "grad_norm": 0.22771611741406794, + "learning_rate": 7.623999291846829e-07, + "loss": 0.1523, + "step": 2854 + }, + { + "epoch": 4.599275070479259, + "grad_norm": 0.22020469314547333, + "learning_rate": 7.562534368078167e-07, + "loss": 0.1312, + "step": 2855 + }, + { + "epoch": 4.600886024969794, + "grad_norm": 0.21911457185337718, + "learning_rate": 7.501313439306623e-07, + "loss": 0.1471, + "step": 2856 + }, + { + "epoch": 4.60249697946033, + "grad_norm": 0.220369515835811, + "learning_rate": 7.440336583155306e-07, + "loss": 0.1467, + "step": 2857 + }, + { + "epoch": 4.604107933950866, + "grad_norm": 0.2429223255819812, + "learning_rate": 7.379603876937969e-07, + "loss": 0.1393, + "step": 2858 + }, + { + "epoch": 4.605718888441402, + "grad_norm": 0.22295148519094363, + "learning_rate": 7.319115397658639e-07, + "loss": 0.1477, + "step": 2859 + }, + { + "epoch": 4.607329842931938, + "grad_norm": 0.2155898298714699, + "learning_rate": 7.258871222011832e-07, + "loss": 0.1365, + "step": 2860 + }, + { + "epoch": 4.6089407974224725, + "grad_norm": 0.21243596261299383, + "learning_rate": 7.198871426382203e-07, + "loss": 0.1558, + "step": 2861 + }, + { + "epoch": 4.610551751913008, + "grad_norm": 0.22122501445417958, + "learning_rate": 7.139116086844655e-07, + "loss": 0.161, + "step": 2862 + }, + { + "epoch": 4.612162706403544, + "grad_norm": 0.21598365968425512, + "learning_rate": 7.079605279163982e-07, + "loss": 0.1493, + "step": 2863 + }, + { + "epoch": 4.61377366089408, + "grad_norm": 0.2597213786138549, + "learning_rate": 7.020339078795136e-07, + "loss": 0.1559, + "step": 2864 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.22332642167151248, + "learning_rate": 6.961317560882741e-07, + "loss": 0.1501, + "step": 2865 + }, + { + "epoch": 4.616995569875151, + "grad_norm": 0.21904255318165475, + "learning_rate": 6.902540800261292e-07, + "loss": 0.1627, + "step": 2866 + }, + { + "epoch": 4.618606524365687, + "grad_norm": 0.2257051902642088, + "learning_rate": 6.844008871454955e-07, + "loss": 0.1383, + "step": 2867 + }, + { + "epoch": 4.6202174788562225, + "grad_norm": 0.2028287385389341, + "learning_rate": 6.785721848677406e-07, + "loss": 0.1522, + "step": 2868 + }, + { + "epoch": 4.621828433346758, + "grad_norm": 0.21967388469905091, + "learning_rate": 6.727679805831821e-07, + "loss": 0.136, + "step": 2869 + }, + { + "epoch": 4.623439387837293, + "grad_norm": 0.2254164663996422, + "learning_rate": 6.669882816510776e-07, + "loss": 0.139, + "step": 2870 + }, + { + "epoch": 4.625050342327829, + "grad_norm": 0.21946694917764376, + "learning_rate": 6.61233095399616e-07, + "loss": 0.1387, + "step": 2871 + }, + { + "epoch": 4.626661296818365, + "grad_norm": 0.22027332476566938, + "learning_rate": 6.555024291259005e-07, + "loss": 0.1447, + "step": 2872 + }, + { + "epoch": 4.628272251308901, + "grad_norm": 0.21396999455794583, + "learning_rate": 6.497962900959542e-07, + "loss": 0.1556, + "step": 2873 + }, + { + "epoch": 4.629883205799436, + "grad_norm": 0.23178083820005146, + "learning_rate": 6.441146855446856e-07, + "loss": 0.135, + "step": 2874 + }, + { + "epoch": 4.6314941602899715, + "grad_norm": 0.21644352493998795, + "learning_rate": 6.384576226759165e-07, + "loss": 0.1559, + "step": 2875 + }, + { + "epoch": 4.633105114780507, + "grad_norm": 0.22303322809496479, + "learning_rate": 6.328251086623294e-07, + "loss": 0.1427, + "step": 2876 + }, + { + "epoch": 4.634716069271043, + "grad_norm": 0.21071259795990352, + "learning_rate": 6.272171506455005e-07, + "loss": 0.1459, + "step": 2877 + }, + { + "epoch": 4.636327023761579, + "grad_norm": 0.20980391695821105, + "learning_rate": 6.21633755735862e-07, + "loss": 0.138, + "step": 2878 + }, + { + "epoch": 4.637937978252115, + "grad_norm": 0.21263059217171254, + "learning_rate": 6.160749310127023e-07, + "loss": 0.1484, + "step": 2879 + }, + { + "epoch": 4.63954893274265, + "grad_norm": 0.204952813257361, + "learning_rate": 6.105406835241545e-07, + "loss": 0.1406, + "step": 2880 + }, + { + "epoch": 4.641159887233186, + "grad_norm": 0.21197948810614625, + "learning_rate": 6.050310202871922e-07, + "loss": 0.1281, + "step": 2881 + }, + { + "epoch": 4.6427708417237215, + "grad_norm": 0.20655087363236846, + "learning_rate": 5.995459482876253e-07, + "loss": 0.1522, + "step": 2882 + }, + { + "epoch": 4.644381796214257, + "grad_norm": 0.22413109041423177, + "learning_rate": 5.940854744800706e-07, + "loss": 0.1642, + "step": 2883 + }, + { + "epoch": 4.645992750704792, + "grad_norm": 0.21624222816321237, + "learning_rate": 5.886496057879676e-07, + "loss": 0.1359, + "step": 2884 + }, + { + "epoch": 4.647603705195328, + "grad_norm": 0.23678058476570954, + "learning_rate": 5.832383491035499e-07, + "loss": 0.1397, + "step": 2885 + }, + { + "epoch": 4.649214659685864, + "grad_norm": 0.21030051110070672, + "learning_rate": 5.778517112878512e-07, + "loss": 0.1485, + "step": 2886 + }, + { + "epoch": 4.6508256141764, + "grad_norm": 0.21914807690463745, + "learning_rate": 5.724896991706885e-07, + "loss": 0.1456, + "step": 2887 + }, + { + "epoch": 4.652436568666936, + "grad_norm": 0.21409204336065774, + "learning_rate": 5.671523195506567e-07, + "loss": 0.1543, + "step": 2888 + }, + { + "epoch": 4.6540475231574705, + "grad_norm": 0.21075887789724632, + "learning_rate": 5.618395791951159e-07, + "loss": 0.15, + "step": 2889 + }, + { + "epoch": 4.655658477648006, + "grad_norm": 0.22077872594297945, + "learning_rate": 5.565514848401887e-07, + "loss": 0.1354, + "step": 2890 + }, + { + "epoch": 4.657269432138542, + "grad_norm": 0.21354385616306004, + "learning_rate": 5.512880431907452e-07, + "loss": 0.1408, + "step": 2891 + }, + { + "epoch": 4.658880386629078, + "grad_norm": 0.212847283457817, + "learning_rate": 5.460492609203982e-07, + "loss": 0.1433, + "step": 2892 + }, + { + "epoch": 4.660491341119613, + "grad_norm": 0.2187454345219124, + "learning_rate": 5.40835144671501e-07, + "loss": 0.1465, + "step": 2893 + }, + { + "epoch": 4.662102295610149, + "grad_norm": 0.21640789825365564, + "learning_rate": 5.356457010551253e-07, + "loss": 0.1455, + "step": 2894 + }, + { + "epoch": 4.663713250100685, + "grad_norm": 0.21575352229791922, + "learning_rate": 5.304809366510566e-07, + "loss": 0.1556, + "step": 2895 + }, + { + "epoch": 4.66532420459122, + "grad_norm": 0.21723870718473914, + "learning_rate": 5.253408580078035e-07, + "loss": 0.1584, + "step": 2896 + }, + { + "epoch": 4.666935159081756, + "grad_norm": 0.22077657920712968, + "learning_rate": 5.202254716425636e-07, + "loss": 0.1502, + "step": 2897 + }, + { + "epoch": 4.668546113572291, + "grad_norm": 0.22750870337063364, + "learning_rate": 5.151347840412224e-07, + "loss": 0.1404, + "step": 2898 + }, + { + "epoch": 4.670157068062827, + "grad_norm": 0.21780471723045816, + "learning_rate": 5.100688016583632e-07, + "loss": 0.1317, + "step": 2899 + }, + { + "epoch": 4.671768022553363, + "grad_norm": 0.22372509521913855, + "learning_rate": 5.05027530917237e-07, + "loss": 0.1507, + "step": 2900 + }, + { + "epoch": 4.673378977043899, + "grad_norm": 0.21577961205407864, + "learning_rate": 5.00010978209764e-07, + "loss": 0.1498, + "step": 2901 + }, + { + "epoch": 4.6749899315344345, + "grad_norm": 0.22095270932077324, + "learning_rate": 4.950191498965207e-07, + "loss": 0.1567, + "step": 2902 + }, + { + "epoch": 4.6766008860249695, + "grad_norm": 0.22385680458863616, + "learning_rate": 4.900520523067376e-07, + "loss": 0.1387, + "step": 2903 + }, + { + "epoch": 4.678211840515505, + "grad_norm": 0.21480467606962006, + "learning_rate": 4.851096917382925e-07, + "loss": 0.1429, + "step": 2904 + }, + { + "epoch": 4.679822795006041, + "grad_norm": 0.21972524220304457, + "learning_rate": 4.801920744576949e-07, + "loss": 0.1466, + "step": 2905 + }, + { + "epoch": 4.681433749496577, + "grad_norm": 0.2143359141084018, + "learning_rate": 4.7529920670007724e-07, + "loss": 0.1559, + "step": 2906 + }, + { + "epoch": 4.683044703987113, + "grad_norm": 0.2195510458192481, + "learning_rate": 4.704310946692037e-07, + "loss": 0.1436, + "step": 2907 + }, + { + "epoch": 4.684655658477648, + "grad_norm": 0.2244650853064686, + "learning_rate": 4.6558774453743684e-07, + "loss": 0.1539, + "step": 2908 + }, + { + "epoch": 4.686266612968184, + "grad_norm": 0.2215405869664864, + "learning_rate": 4.607691624457511e-07, + "loss": 0.1387, + "step": 2909 + }, + { + "epoch": 4.687877567458719, + "grad_norm": 0.21906085385910223, + "learning_rate": 4.559753545037171e-07, + "loss": 0.1373, + "step": 2910 + }, + { + "epoch": 4.689488521949255, + "grad_norm": 0.21588538081675898, + "learning_rate": 4.512063267894906e-07, + "loss": 0.1598, + "step": 2911 + }, + { + "epoch": 4.69109947643979, + "grad_norm": 0.21543206275284182, + "learning_rate": 4.4646208534980807e-07, + "loss": 0.135, + "step": 2912 + }, + { + "epoch": 4.692710430930326, + "grad_norm": 0.20168686340533482, + "learning_rate": 4.4174263619998217e-07, + "loss": 0.138, + "step": 2913 + }, + { + "epoch": 4.694321385420862, + "grad_norm": 0.22114311160392375, + "learning_rate": 4.3704798532388624e-07, + "loss": 0.1521, + "step": 2914 + }, + { + "epoch": 4.695932339911398, + "grad_norm": 0.2165961430758482, + "learning_rate": 4.3237813867396117e-07, + "loss": 0.1398, + "step": 2915 + }, + { + "epoch": 4.6975432944019335, + "grad_norm": 0.21089525598082778, + "learning_rate": 4.2773310217118394e-07, + "loss": 0.1494, + "step": 2916 + }, + { + "epoch": 4.6991542488924685, + "grad_norm": 0.20868547364457915, + "learning_rate": 4.2311288170508336e-07, + "loss": 0.1356, + "step": 2917 + }, + { + "epoch": 4.700765203383004, + "grad_norm": 0.21119647807605038, + "learning_rate": 4.1851748313372463e-07, + "loss": 0.1551, + "step": 2918 + }, + { + "epoch": 4.70237615787354, + "grad_norm": 0.21654917077953273, + "learning_rate": 4.139469122836981e-07, + "loss": 0.1387, + "step": 2919 + }, + { + "epoch": 4.703987112364076, + "grad_norm": 0.21662833427984002, + "learning_rate": 4.094011749501103e-07, + "loss": 0.1502, + "step": 2920 + }, + { + "epoch": 4.705598066854611, + "grad_norm": 0.20946333432367129, + "learning_rate": 4.048802768965887e-07, + "loss": 0.1413, + "step": 2921 + }, + { + "epoch": 4.707209021345147, + "grad_norm": 0.22564245087432352, + "learning_rate": 4.003842238552613e-07, + "loss": 0.1545, + "step": 2922 + }, + { + "epoch": 4.708819975835683, + "grad_norm": 0.22076168759631085, + "learning_rate": 3.9591302152675703e-07, + "loss": 0.1591, + "step": 2923 + }, + { + "epoch": 4.710430930326218, + "grad_norm": 0.2097867230868185, + "learning_rate": 3.9146667558019433e-07, + "loss": 0.1571, + "step": 2924 + }, + { + "epoch": 4.712041884816754, + "grad_norm": 0.22088949709015748, + "learning_rate": 3.8704519165317923e-07, + "loss": 0.1565, + "step": 2925 + }, + { + "epoch": 4.71365283930729, + "grad_norm": 0.22150240058770918, + "learning_rate": 3.8264857535178943e-07, + "loss": 0.1324, + "step": 2926 + }, + { + "epoch": 4.715263793797825, + "grad_norm": 0.20698205346690662, + "learning_rate": 3.782768322505792e-07, + "loss": 0.1315, + "step": 2927 + }, + { + "epoch": 4.716874748288361, + "grad_norm": 0.20845720756492594, + "learning_rate": 3.7392996789255673e-07, + "loss": 0.1442, + "step": 2928 + }, + { + "epoch": 4.718485702778897, + "grad_norm": 0.2156287236788521, + "learning_rate": 3.6960798778919784e-07, + "loss": 0.1499, + "step": 2929 + }, + { + "epoch": 4.7200966572694325, + "grad_norm": 0.2196744399794609, + "learning_rate": 3.653108974204145e-07, + "loss": 0.1551, + "step": 2930 + }, + { + "epoch": 4.7217076117599674, + "grad_norm": 0.22139174195462538, + "learning_rate": 3.610387022345685e-07, + "loss": 0.1534, + "step": 2931 + }, + { + "epoch": 4.723318566250503, + "grad_norm": 0.2277220395224189, + "learning_rate": 3.567914076484558e-07, + "loss": 0.1498, + "step": 2932 + }, + { + "epoch": 4.724929520741039, + "grad_norm": 0.22109687230688657, + "learning_rate": 3.5256901904729967e-07, + "loss": 0.1562, + "step": 2933 + }, + { + "epoch": 4.726540475231575, + "grad_norm": 0.21580426849176798, + "learning_rate": 3.483715417847422e-07, + "loss": 0.1382, + "step": 2934 + }, + { + "epoch": 4.728151429722111, + "grad_norm": 0.2137311430685286, + "learning_rate": 3.441989811828417e-07, + "loss": 0.1354, + "step": 2935 + }, + { + "epoch": 4.729762384212646, + "grad_norm": 0.21046823458686373, + "learning_rate": 3.4005134253206393e-07, + "loss": 0.1464, + "step": 2936 + }, + { + "epoch": 4.7313733387031816, + "grad_norm": 0.20552298607096997, + "learning_rate": 3.3592863109128016e-07, + "loss": 0.1471, + "step": 2937 + }, + { + "epoch": 4.732984293193717, + "grad_norm": 0.2121097718281581, + "learning_rate": 3.318308520877489e-07, + "loss": 0.15, + "step": 2938 + }, + { + "epoch": 4.734595247684253, + "grad_norm": 0.22848848728985083, + "learning_rate": 3.277580107171163e-07, + "loss": 0.1485, + "step": 2939 + }, + { + "epoch": 4.736206202174788, + "grad_norm": 0.21565593846592726, + "learning_rate": 3.2371011214342053e-07, + "loss": 0.1582, + "step": 2940 + }, + { + "epoch": 4.737817156665324, + "grad_norm": 0.22164738957939945, + "learning_rate": 3.1968716149906043e-07, + "loss": 0.1453, + "step": 2941 + }, + { + "epoch": 4.73942811115586, + "grad_norm": 0.2168031711486235, + "learning_rate": 3.156891638848092e-07, + "loss": 0.1442, + "step": 2942 + }, + { + "epoch": 4.741039065646396, + "grad_norm": 0.2167016023126979, + "learning_rate": 3.117161243698052e-07, + "loss": 0.1426, + "step": 2943 + }, + { + "epoch": 4.7426500201369315, + "grad_norm": 0.21297127531599902, + "learning_rate": 3.077680479915368e-07, + "loss": 0.1462, + "step": 2944 + }, + { + "epoch": 4.744260974627466, + "grad_norm": 0.21958491536222166, + "learning_rate": 3.038449397558396e-07, + "loss": 0.1338, + "step": 2945 + }, + { + "epoch": 4.745871929118002, + "grad_norm": 0.21879043332580358, + "learning_rate": 2.9994680463689697e-07, + "loss": 0.1502, + "step": 2946 + }, + { + "epoch": 4.747482883608538, + "grad_norm": 0.21967915410535746, + "learning_rate": 2.9607364757722635e-07, + "loss": 0.15, + "step": 2947 + }, + { + "epoch": 4.749093838099074, + "grad_norm": 0.2113428697983794, + "learning_rate": 2.9222547348767504e-07, + "loss": 0.1267, + "step": 2948 + }, + { + "epoch": 4.750704792589609, + "grad_norm": 0.21023635797177284, + "learning_rate": 2.884022872474113e-07, + "loss": 0.1336, + "step": 2949 + }, + { + "epoch": 4.752315747080145, + "grad_norm": 0.2076654674576389, + "learning_rate": 2.8460409370392405e-07, + "loss": 0.1338, + "step": 2950 + }, + { + "epoch": 4.7539267015706805, + "grad_norm": 0.2371363457394752, + "learning_rate": 2.808308976730145e-07, + "loss": 0.1463, + "step": 2951 + }, + { + "epoch": 4.755537656061216, + "grad_norm": 0.21635183048072082, + "learning_rate": 2.770827039387869e-07, + "loss": 0.1327, + "step": 2952 + }, + { + "epoch": 4.757148610551752, + "grad_norm": 0.2234269293362455, + "learning_rate": 2.7335951725364185e-07, + "loss": 0.1455, + "step": 2953 + }, + { + "epoch": 4.758759565042288, + "grad_norm": 0.21747410916134383, + "learning_rate": 2.696613423382788e-07, + "loss": 0.1566, + "step": 2954 + }, + { + "epoch": 4.760370519532823, + "grad_norm": 0.22281641279316053, + "learning_rate": 2.6598818388168246e-07, + "loss": 0.1429, + "step": 2955 + }, + { + "epoch": 4.761981474023359, + "grad_norm": 0.2238241839767047, + "learning_rate": 2.6234004654111854e-07, + "loss": 0.1544, + "step": 2956 + }, + { + "epoch": 4.763592428513895, + "grad_norm": 0.21188445273823583, + "learning_rate": 2.5871693494212036e-07, + "loss": 0.1399, + "step": 2957 + }, + { + "epoch": 4.7652033830044305, + "grad_norm": 0.210545430187273, + "learning_rate": 2.551188536785043e-07, + "loss": 0.156, + "step": 2958 + }, + { + "epoch": 4.766814337494965, + "grad_norm": 0.22470921530004012, + "learning_rate": 2.5154580731234333e-07, + "loss": 0.1457, + "step": 2959 + }, + { + "epoch": 4.768425291985501, + "grad_norm": 0.21323401360872982, + "learning_rate": 2.479978003739669e-07, + "loss": 0.1601, + "step": 2960 + }, + { + "epoch": 4.770036246476037, + "grad_norm": 0.20568485058813663, + "learning_rate": 2.4447483736195877e-07, + "loss": 0.1345, + "step": 2961 + }, + { + "epoch": 4.771647200966573, + "grad_norm": 0.20709133915444555, + "learning_rate": 2.40976922743148e-07, + "loss": 0.1491, + "step": 2962 + }, + { + "epoch": 4.773258155457109, + "grad_norm": 0.22004597986350832, + "learning_rate": 2.3750406095260469e-07, + "loss": 0.145, + "step": 2963 + }, + { + "epoch": 4.774869109947644, + "grad_norm": 0.21770693112333073, + "learning_rate": 2.340562563936355e-07, + "loss": 0.1523, + "step": 2964 + }, + { + "epoch": 4.7764800644381795, + "grad_norm": 0.20635080684518206, + "learning_rate": 2.3063351343777241e-07, + "loss": 0.1487, + "step": 2965 + }, + { + "epoch": 4.778091018928715, + "grad_norm": 0.2116532190342573, + "learning_rate": 2.272358364247773e-07, + "loss": 0.1403, + "step": 2966 + }, + { + "epoch": 4.779701973419251, + "grad_norm": 0.22159722188901532, + "learning_rate": 2.238632296626242e-07, + "loss": 0.1546, + "step": 2967 + }, + { + "epoch": 4.781312927909786, + "grad_norm": 0.21591067485783777, + "learning_rate": 2.205156974275058e-07, + "loss": 0.1511, + "step": 2968 + }, + { + "epoch": 4.782923882400322, + "grad_norm": 0.2191530577323583, + "learning_rate": 2.1719324396381802e-07, + "loss": 0.151, + "step": 2969 + }, + { + "epoch": 4.784534836890858, + "grad_norm": 0.21158599130781444, + "learning_rate": 2.138958734841623e-07, + "loss": 0.1434, + "step": 2970 + }, + { + "epoch": 4.786145791381394, + "grad_norm": 0.22489697455911142, + "learning_rate": 2.106235901693321e-07, + "loss": 0.1453, + "step": 2971 + }, + { + "epoch": 4.7877567458719295, + "grad_norm": 0.20816987525634345, + "learning_rate": 2.0737639816831967e-07, + "loss": 0.1417, + "step": 2972 + }, + { + "epoch": 4.789367700362464, + "grad_norm": 0.2208234962177496, + "learning_rate": 2.0415430159829608e-07, + "loss": 0.1449, + "step": 2973 + }, + { + "epoch": 4.790978654853, + "grad_norm": 0.21872410598917383, + "learning_rate": 2.0095730454461781e-07, + "loss": 0.1539, + "step": 2974 + }, + { + "epoch": 4.792589609343536, + "grad_norm": 0.2368138202783192, + "learning_rate": 1.9778541106081572e-07, + "loss": 0.144, + "step": 2975 + }, + { + "epoch": 4.794200563834072, + "grad_norm": 0.2517246270809971, + "learning_rate": 1.9463862516859277e-07, + "loss": 0.1639, + "step": 2976 + }, + { + "epoch": 4.795811518324607, + "grad_norm": 0.20392676903290943, + "learning_rate": 1.915169508578174e-07, + "loss": 0.1362, + "step": 2977 + }, + { + "epoch": 4.797422472815143, + "grad_norm": 0.2251301182098052, + "learning_rate": 1.8842039208651685e-07, + "loss": 0.1366, + "step": 2978 + }, + { + "epoch": 4.7990334273056785, + "grad_norm": 0.21438203052030705, + "learning_rate": 1.8534895278087272e-07, + "loss": 0.1443, + "step": 2979 + }, + { + "epoch": 4.800644381796214, + "grad_norm": 0.21536055584983677, + "learning_rate": 1.823026368352232e-07, + "loss": 0.1456, + "step": 2980 + }, + { + "epoch": 4.80225533628675, + "grad_norm": 0.21399263776091337, + "learning_rate": 1.792814481120453e-07, + "loss": 0.1454, + "step": 2981 + }, + { + "epoch": 4.803866290777286, + "grad_norm": 0.2215447311821123, + "learning_rate": 1.7628539044195924e-07, + "loss": 0.1538, + "step": 2982 + }, + { + "epoch": 4.805477245267821, + "grad_norm": 0.22845562785090617, + "learning_rate": 1.7331446762372638e-07, + "loss": 0.1379, + "step": 2983 + }, + { + "epoch": 4.807088199758357, + "grad_norm": 0.21877133621326636, + "learning_rate": 1.7036868342422687e-07, + "loss": 0.1466, + "step": 2984 + }, + { + "epoch": 4.808699154248893, + "grad_norm": 0.22197790124470415, + "learning_rate": 1.6744804157848183e-07, + "loss": 0.1648, + "step": 2985 + }, + { + "epoch": 4.810310108739428, + "grad_norm": 0.21974675453737633, + "learning_rate": 1.6455254578962243e-07, + "loss": 0.1332, + "step": 2986 + }, + { + "epoch": 4.811921063229963, + "grad_norm": 0.21753406916106324, + "learning_rate": 1.6168219972890087e-07, + "loss": 0.146, + "step": 2987 + }, + { + "epoch": 4.813532017720499, + "grad_norm": 0.20824467291862445, + "learning_rate": 1.5883700703568373e-07, + "loss": 0.1513, + "step": 2988 + }, + { + "epoch": 4.815142972211035, + "grad_norm": 0.21940558640107516, + "learning_rate": 1.5601697131744308e-07, + "loss": 0.1349, + "step": 2989 + }, + { + "epoch": 4.816753926701571, + "grad_norm": 0.2169053830515682, + "learning_rate": 1.5322209614975214e-07, + "loss": 0.1427, + "step": 2990 + }, + { + "epoch": 4.818364881192107, + "grad_norm": 0.20841544497570255, + "learning_rate": 1.5045238507628513e-07, + "loss": 0.1293, + "step": 2991 + }, + { + "epoch": 4.819975835682642, + "grad_norm": 0.22675681327150446, + "learning_rate": 1.477078416088107e-07, + "loss": 0.1549, + "step": 2992 + }, + { + "epoch": 4.8215867901731775, + "grad_norm": 0.2147630928023925, + "learning_rate": 1.44988469227183e-07, + "loss": 0.1446, + "step": 2993 + }, + { + "epoch": 4.823197744663713, + "grad_norm": 0.22130327588900567, + "learning_rate": 1.422942713793485e-07, + "loss": 0.1624, + "step": 2994 + }, + { + "epoch": 4.824808699154249, + "grad_norm": 0.21320882618830075, + "learning_rate": 1.396252514813279e-07, + "loss": 0.1432, + "step": 2995 + }, + { + "epoch": 4.826419653644784, + "grad_norm": 0.2086230672911769, + "learning_rate": 1.369814129172209e-07, + "loss": 0.144, + "step": 2996 + }, + { + "epoch": 4.82803060813532, + "grad_norm": 0.2134036368549228, + "learning_rate": 1.3436275903919716e-07, + "loss": 0.1468, + "step": 2997 + }, + { + "epoch": 4.829641562625856, + "grad_norm": 0.21195574971999337, + "learning_rate": 1.3176929316749632e-07, + "loss": 0.131, + "step": 2998 + }, + { + "epoch": 4.831252517116392, + "grad_norm": 0.21601750704641073, + "learning_rate": 1.2920101859042578e-07, + "loss": 0.1539, + "step": 2999 + }, + { + "epoch": 4.832863471606927, + "grad_norm": 0.22404313051428473, + "learning_rate": 1.2665793856434516e-07, + "loss": 0.1334, + "step": 3000 + }, + { + "epoch": 4.834474426097462, + "grad_norm": 0.21836677236601115, + "learning_rate": 1.2414005631366855e-07, + "loss": 0.1498, + "step": 3001 + }, + { + "epoch": 4.836085380587998, + "grad_norm": 0.21793674127632842, + "learning_rate": 1.2164737503087108e-07, + "loss": 0.1456, + "step": 3002 + }, + { + "epoch": 4.837696335078534, + "grad_norm": 0.20502370809178963, + "learning_rate": 1.1917989787646689e-07, + "loss": 0.145, + "step": 3003 + }, + { + "epoch": 4.83930728956907, + "grad_norm": 0.21438747414467008, + "learning_rate": 1.1673762797901334e-07, + "loss": 0.1404, + "step": 3004 + }, + { + "epoch": 4.840918244059606, + "grad_norm": 0.21658308626209727, + "learning_rate": 1.1432056843511342e-07, + "loss": 0.1366, + "step": 3005 + }, + { + "epoch": 4.842529198550141, + "grad_norm": 0.2146719093301986, + "learning_rate": 1.1192872230939789e-07, + "loss": 0.1513, + "step": 3006 + }, + { + "epoch": 4.8441401530406765, + "grad_norm": 0.24537221608744508, + "learning_rate": 1.0956209263453421e-07, + "loss": 0.1413, + "step": 3007 + }, + { + "epoch": 4.845751107531212, + "grad_norm": 0.22340926795790925, + "learning_rate": 1.0722068241121319e-07, + "loss": 0.1458, + "step": 3008 + }, + { + "epoch": 4.847362062021748, + "grad_norm": 0.22379919725594086, + "learning_rate": 1.0490449460815788e-07, + "loss": 0.1499, + "step": 3009 + }, + { + "epoch": 4.848973016512284, + "grad_norm": 0.2221902130763464, + "learning_rate": 1.0261353216209691e-07, + "loss": 0.1473, + "step": 3010 + }, + { + "epoch": 4.850583971002819, + "grad_norm": 0.22992060085331859, + "learning_rate": 1.0034779797778893e-07, + "loss": 0.1421, + "step": 3011 + }, + { + "epoch": 4.852194925493355, + "grad_norm": 0.22018812958227876, + "learning_rate": 9.810729492800042e-08, + "loss": 0.1456, + "step": 3012 + }, + { + "epoch": 4.853805879983891, + "grad_norm": 0.21391699346938456, + "learning_rate": 9.589202585350565e-08, + "loss": 0.1488, + "step": 3013 + }, + { + "epoch": 4.855416834474426, + "grad_norm": 0.21652663225257696, + "learning_rate": 9.370199356308229e-08, + "loss": 0.1547, + "step": 3014 + }, + { + "epoch": 4.857027788964961, + "grad_norm": 0.2075965000591131, + "learning_rate": 9.153720083351358e-08, + "loss": 0.1432, + "step": 3015 + }, + { + "epoch": 4.858638743455497, + "grad_norm": 0.2162147197513201, + "learning_rate": 8.939765040958392e-08, + "loss": 0.1352, + "step": 3016 + }, + { + "epoch": 4.860249697946033, + "grad_norm": 0.21261429233585386, + "learning_rate": 8.728334500406332e-08, + "loss": 0.1489, + "step": 3017 + }, + { + "epoch": 4.861860652436569, + "grad_norm": 0.22990524788122235, + "learning_rate": 8.519428729772072e-08, + "loss": 0.1419, + "step": 3018 + }, + { + "epoch": 4.863471606927105, + "grad_norm": 0.22341316580801132, + "learning_rate": 8.313047993931067e-08, + "loss": 0.1434, + "step": 3019 + }, + { + "epoch": 4.86508256141764, + "grad_norm": 0.22386199673054813, + "learning_rate": 8.109192554557333e-08, + "loss": 0.1401, + "step": 3020 + }, + { + "epoch": 4.8666935159081754, + "grad_norm": 0.22098592505813894, + "learning_rate": 7.907862670122557e-08, + "loss": 0.1294, + "step": 3021 + }, + { + "epoch": 4.868304470398711, + "grad_norm": 0.21149506248274522, + "learning_rate": 7.709058595897213e-08, + "loss": 0.146, + "step": 3022 + }, + { + "epoch": 4.869915424889247, + "grad_norm": 0.2120571099622338, + "learning_rate": 7.51278058394811e-08, + "loss": 0.1644, + "step": 3023 + }, + { + "epoch": 4.871526379379782, + "grad_norm": 0.21984813243178256, + "learning_rate": 7.319028883139956e-08, + "loss": 0.1435, + "step": 3024 + }, + { + "epoch": 4.873137333870318, + "grad_norm": 0.2046339895959685, + "learning_rate": 7.12780373913402e-08, + "loss": 0.1479, + "step": 3025 + }, + { + "epoch": 4.874748288360854, + "grad_norm": 0.21213665620935812, + "learning_rate": 6.939105394388356e-08, + "loss": 0.1558, + "step": 3026 + }, + { + "epoch": 4.8763592428513896, + "grad_norm": 0.20974836104729822, + "learning_rate": 6.752934088156693e-08, + "loss": 0.1607, + "step": 3027 + }, + { + "epoch": 4.877970197341925, + "grad_norm": 0.21367805851277302, + "learning_rate": 6.569290056489542e-08, + "loss": 0.1412, + "step": 3028 + }, + { + "epoch": 4.879581151832461, + "grad_norm": 0.21328781532029567, + "learning_rate": 6.3881735322322e-08, + "loss": 0.1393, + "step": 3029 + }, + { + "epoch": 4.881192106322996, + "grad_norm": 0.2147863974760739, + "learning_rate": 6.209584745025643e-08, + "loss": 0.1507, + "step": 3030 + }, + { + "epoch": 4.882803060813532, + "grad_norm": 0.2116750037903575, + "learning_rate": 6.033523921306072e-08, + "loss": 0.1423, + "step": 3031 + }, + { + "epoch": 4.884414015304068, + "grad_norm": 0.21829717085107075, + "learning_rate": 5.859991284303812e-08, + "loss": 0.1416, + "step": 3032 + }, + { + "epoch": 4.886024969794604, + "grad_norm": 0.2175366973660451, + "learning_rate": 5.688987054044637e-08, + "loss": 0.1472, + "step": 3033 + }, + { + "epoch": 4.887635924285139, + "grad_norm": 0.20843713478870315, + "learning_rate": 5.520511447347776e-08, + "loss": 0.1381, + "step": 3034 + }, + { + "epoch": 4.889246878775674, + "grad_norm": 0.2123862342443856, + "learning_rate": 5.3545646778263575e-08, + "loss": 0.1511, + "step": 3035 + }, + { + "epoch": 4.89085783326621, + "grad_norm": 0.2219417417741511, + "learning_rate": 5.191146955887405e-08, + "loss": 0.1403, + "step": 3036 + }, + { + "epoch": 4.892468787756746, + "grad_norm": 0.21174959404262084, + "learning_rate": 5.0302584887313986e-08, + "loss": 0.1451, + "step": 3037 + }, + { + "epoch": 4.894079742247282, + "grad_norm": 0.21072496789865433, + "learning_rate": 4.871899480351605e-08, + "loss": 0.1449, + "step": 3038 + }, + { + "epoch": 4.895690696737817, + "grad_norm": 0.22806728378504748, + "learning_rate": 4.7160701315343e-08, + "loss": 0.1472, + "step": 3039 + }, + { + "epoch": 4.897301651228353, + "grad_norm": 0.2168054676106727, + "learning_rate": 4.562770639858549e-08, + "loss": 0.1575, + "step": 3040 + }, + { + "epoch": 4.8989126057188885, + "grad_norm": 0.2168197173485377, + "learning_rate": 4.412001199695537e-08, + "loss": 0.1482, + "step": 3041 + }, + { + "epoch": 4.900523560209424, + "grad_norm": 0.2250248057225666, + "learning_rate": 4.2637620022085715e-08, + "loss": 0.1377, + "step": 3042 + }, + { + "epoch": 4.902134514699959, + "grad_norm": 0.21418687912333567, + "learning_rate": 4.118053235352859e-08, + "loss": 0.135, + "step": 3043 + }, + { + "epoch": 4.903745469190495, + "grad_norm": 0.21982028957248173, + "learning_rate": 3.974875083875285e-08, + "loss": 0.1459, + "step": 3044 + }, + { + "epoch": 4.905356423681031, + "grad_norm": 0.21288538105643232, + "learning_rate": 3.834227729313966e-08, + "loss": 0.1442, + "step": 3045 + }, + { + "epoch": 4.906967378171567, + "grad_norm": 0.24690110457527772, + "learning_rate": 3.696111349998255e-08, + "loss": 0.1438, + "step": 3046 + }, + { + "epoch": 4.908578332662103, + "grad_norm": 0.21048986451072557, + "learning_rate": 3.5605261210485134e-08, + "loss": 0.1472, + "step": 3047 + }, + { + "epoch": 4.910189287152638, + "grad_norm": 0.21299506119550873, + "learning_rate": 3.427472214375671e-08, + "loss": 0.1595, + "step": 3048 + }, + { + "epoch": 4.911800241643173, + "grad_norm": 0.2098004986312955, + "learning_rate": 3.296949798681226e-08, + "loss": 0.1418, + "step": 3049 + }, + { + "epoch": 4.913411196133709, + "grad_norm": 0.2111424848272005, + "learning_rate": 3.1689590394570204e-08, + "loss": 0.1478, + "step": 3050 + }, + { + "epoch": 4.915022150624245, + "grad_norm": 0.2056760445046121, + "learning_rate": 3.0435000989850194e-08, + "loss": 0.1522, + "step": 3051 + }, + { + "epoch": 4.91663310511478, + "grad_norm": 0.22222100807383482, + "learning_rate": 2.9205731363364244e-08, + "loss": 0.1467, + "step": 3052 + }, + { + "epoch": 4.918244059605316, + "grad_norm": 0.22694945838983843, + "learning_rate": 2.8001783073732248e-08, + "loss": 0.1368, + "step": 3053 + }, + { + "epoch": 4.919855014095852, + "grad_norm": 0.20514202688175226, + "learning_rate": 2.6823157647457577e-08, + "loss": 0.1377, + "step": 3054 + }, + { + "epoch": 4.9214659685863875, + "grad_norm": 0.21345732647144894, + "learning_rate": 2.566985657894483e-08, + "loss": 0.1485, + "step": 3055 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 0.21597379148933313, + "learning_rate": 2.4541881330482075e-08, + "loss": 0.1424, + "step": 3056 + }, + { + "epoch": 4.924687877567459, + "grad_norm": 0.20890314593204515, + "learning_rate": 2.3439233332251953e-08, + "loss": 0.1628, + "step": 3057 + }, + { + "epoch": 4.926298832057994, + "grad_norm": 0.21635021856559442, + "learning_rate": 2.236191398232057e-08, + "loss": 0.1445, + "step": 3058 + }, + { + "epoch": 4.92790978654853, + "grad_norm": 0.2202683990747119, + "learning_rate": 2.1309924646641945e-08, + "loss": 0.1482, + "step": 3059 + }, + { + "epoch": 4.929520741039066, + "grad_norm": 0.21390221711743562, + "learning_rate": 2.0283266659051338e-08, + "loss": 0.1388, + "step": 3060 + }, + { + "epoch": 4.931131695529602, + "grad_norm": 0.22450839818366922, + "learning_rate": 1.9281941321271925e-08, + "loss": 0.1565, + "step": 3061 + }, + { + "epoch": 4.932742650020137, + "grad_norm": 0.223216962644783, + "learning_rate": 1.8305949902897026e-08, + "loss": 0.1492, + "step": 3062 + }, + { + "epoch": 4.934353604510672, + "grad_norm": 0.2005760944353557, + "learning_rate": 1.7355293641405647e-08, + "loss": 0.1545, + "step": 3063 + }, + { + "epoch": 4.935964559001208, + "grad_norm": 0.22213102975484492, + "learning_rate": 1.6429973742153606e-08, + "loss": 0.1418, + "step": 3064 + }, + { + "epoch": 4.937575513491744, + "grad_norm": 0.22253914887075765, + "learning_rate": 1.552999137836908e-08, + "loss": 0.136, + "step": 3065 + }, + { + "epoch": 4.93918646798228, + "grad_norm": 0.22754585273340733, + "learning_rate": 1.4655347691159283e-08, + "loss": 0.1514, + "step": 3066 + }, + { + "epoch": 4.940797422472815, + "grad_norm": 0.20712918988601425, + "learning_rate": 1.3806043789497126e-08, + "loss": 0.143, + "step": 3067 + }, + { + "epoch": 4.942408376963351, + "grad_norm": 0.22069265881378214, + "learning_rate": 1.2982080750234549e-08, + "loss": 0.1491, + "step": 3068 + }, + { + "epoch": 4.9440193314538865, + "grad_norm": 0.20959463552056892, + "learning_rate": 1.2183459618084759e-08, + "loss": 0.1523, + "step": 3069 + }, + { + "epoch": 4.945630285944422, + "grad_norm": 0.21612258928364242, + "learning_rate": 1.1410181405639986e-08, + "loss": 0.1442, + "step": 3070 + }, + { + "epoch": 4.947241240434957, + "grad_norm": 0.2162006333759751, + "learning_rate": 1.0662247093349287e-08, + "loss": 0.1244, + "step": 3071 + }, + { + "epoch": 4.948852194925493, + "grad_norm": 0.22621189321655785, + "learning_rate": 9.939657629534083e-09, + "loss": 0.1624, + "step": 3072 + }, + { + "epoch": 4.950463149416029, + "grad_norm": 0.22325164035423106, + "learning_rate": 9.242413930377059e-09, + "loss": 0.1501, + "step": 3073 + }, + { + "epoch": 4.952074103906565, + "grad_norm": 0.20908742596922356, + "learning_rate": 8.570516879928826e-09, + "loss": 0.1313, + "step": 3074 + }, + { + "epoch": 4.953685058397101, + "grad_norm": 0.22585183848954468, + "learning_rate": 7.923967330099036e-09, + "loss": 0.1474, + "step": 3075 + }, + { + "epoch": 4.9552960128876355, + "grad_norm": 0.21204595764120757, + "learning_rate": 7.3027661006586095e-09, + "loss": 0.1531, + "step": 3076 + }, + { + "epoch": 4.956906967378171, + "grad_norm": 0.20138290136485443, + "learning_rate": 6.706913979241947e-09, + "loss": 0.1603, + "step": 3077 + }, + { + "epoch": 4.958517921868707, + "grad_norm": 0.21351586165159536, + "learning_rate": 6.136411721340274e-09, + "loss": 0.1383, + "step": 3078 + }, + { + "epoch": 4.960128876359243, + "grad_norm": 0.21376007659935434, + "learning_rate": 5.5912600503038594e-09, + "loss": 0.1445, + "step": 3079 + }, + { + "epoch": 4.961739830849779, + "grad_norm": 0.21489003486773706, + "learning_rate": 5.071459657339794e-09, + "loss": 0.1388, + "step": 3080 + }, + { + "epoch": 4.963350785340314, + "grad_norm": 0.21755924068751195, + "learning_rate": 4.577011201511994e-09, + "loss": 0.1456, + "step": 3081 + }, + { + "epoch": 4.96496173983085, + "grad_norm": 0.20180190976203755, + "learning_rate": 4.107915309743416e-09, + "loss": 0.146, + "step": 3082 + }, + { + "epoch": 4.9665726943213855, + "grad_norm": 0.21855543222434343, + "learning_rate": 3.664172576807179e-09, + "loss": 0.1405, + "step": 3083 + }, + { + "epoch": 4.968183648811921, + "grad_norm": 0.215264107680761, + "learning_rate": 3.2457835653332272e-09, + "loss": 0.1456, + "step": 3084 + }, + { + "epoch": 4.969794603302457, + "grad_norm": 0.203470431035806, + "learning_rate": 2.8527488058038844e-09, + "loss": 0.1432, + "step": 3085 + }, + { + "epoch": 4.971405557792992, + "grad_norm": 0.21276675372989584, + "learning_rate": 2.485068796556078e-09, + "loss": 0.1577, + "step": 3086 + }, + { + "epoch": 4.973016512283528, + "grad_norm": 0.21088797347840574, + "learning_rate": 2.142744003779118e-09, + "loss": 0.1421, + "step": 3087 + }, + { + "epoch": 4.974627466774064, + "grad_norm": 0.2246105636966214, + "learning_rate": 1.8257748615102545e-09, + "loss": 0.1351, + "step": 3088 + }, + { + "epoch": 4.9762384212646, + "grad_norm": 0.21752395162393956, + "learning_rate": 1.5341617716435608e-09, + "loss": 0.1498, + "step": 3089 + }, + { + "epoch": 4.9778493757551345, + "grad_norm": 0.21715767148751064, + "learning_rate": 1.2679051039188317e-09, + "loss": 0.1431, + "step": 3090 + }, + { + "epoch": 4.97946033024567, + "grad_norm": 0.200604318655284, + "learning_rate": 1.0270051959282434e-09, + "loss": 0.1356, + "step": 3091 + }, + { + "epoch": 4.981071284736206, + "grad_norm": 0.22114376968996727, + "learning_rate": 8.114623531119137e-10, + "loss": 0.1494, + "step": 3092 + }, + { + "epoch": 4.982682239226742, + "grad_norm": 0.21912817007954846, + "learning_rate": 6.212768487623422e-10, + "loss": 0.147, + "step": 3093 + }, + { + "epoch": 4.984293193717278, + "grad_norm": 0.21734907735157796, + "learning_rate": 4.564489240177494e-10, + "loss": 0.1427, + "step": 3094 + }, + { + "epoch": 4.985904148207813, + "grad_norm": 0.2122215913186033, + "learning_rate": 3.1697878786873804e-10, + "loss": 0.1379, + "step": 3095 + }, + { + "epoch": 4.987515102698349, + "grad_norm": 0.22045381925803034, + "learning_rate": 2.0286661714941092e-10, + "loss": 0.154, + "step": 3096 + }, + { + "epoch": 4.9891260571888845, + "grad_norm": 0.21230236722633794, + "learning_rate": 1.1411255654625309e-10, + "loss": 0.1398, + "step": 3097 + }, + { + "epoch": 4.99073701167942, + "grad_norm": 0.2155863280622798, + "learning_rate": 5.0716718591470313e-11, + "loss": 0.1415, + "step": 3098 + }, + { + "epoch": 4.992347966169955, + "grad_norm": 0.21790703900678665, + "learning_rate": 1.267918366743004e-11, + "loss": 0.1517, + "step": 3099 + }, + { + "epoch": 4.993958920660491, + "grad_norm": 0.21710189361301535, + "learning_rate": 0.0, + "loss": 0.1494, + "step": 3100 + }, + { + "epoch": 4.993958920660491, + "step": 3100, + "total_flos": 6.310856434702221e+18, + "train_loss": 0.2839509548799646, + "train_runtime": 73731.2238, + "train_samples_per_second": 5.387, + "train_steps_per_second": 0.042 + } + ], + "logging_steps": 1.0, + "max_steps": 3100, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.310856434702221e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}