diff --git "a/models/checkpoint-10000/trainer_state.json" "b/models/checkpoint-10000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/models/checkpoint-10000/trainer_state.json" @@ -0,0 +1,70113 @@ +{ + "best_metric": 2.5555005073547363, + "best_model_checkpoint": "./out/checkpoint-10000", + "epoch": 0.9052231375033946, + "eval_steps": 1000, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.052231375033946e-05, + "grad_norm": 6.121739864349365, + "learning_rate": 2.0000000000000003e-06, + "loss": 8.7363, + "step": 1 + }, + { + "epoch": 0.0001810446275006789, + "grad_norm": 6.012884140014648, + "learning_rate": 4.000000000000001e-06, + "loss": 8.7408, + "step": 2 + }, + { + "epoch": 0.00027156694125101837, + "grad_norm": 5.92803955078125, + "learning_rate": 6e-06, + "loss": 8.7133, + "step": 3 + }, + { + "epoch": 0.0003620892550013578, + "grad_norm": 5.670593738555908, + "learning_rate": 8.000000000000001e-06, + "loss": 8.6591, + "step": 4 + }, + { + "epoch": 0.0004526115687516973, + "grad_norm": 4.359646320343018, + "learning_rate": 1e-05, + "loss": 8.6018, + "step": 5 + }, + { + "epoch": 0.0005431338825020367, + "grad_norm": 3.6603660583496094, + "learning_rate": 1.2e-05, + "loss": 8.5488, + "step": 6 + }, + { + "epoch": 0.0006336561962523762, + "grad_norm": 3.5484490394592285, + "learning_rate": 1.4000000000000001e-05, + "loss": 8.4825, + "step": 7 + }, + { + "epoch": 0.0007241785100027156, + "grad_norm": 2.9020848274230957, + "learning_rate": 1.6000000000000003e-05, + "loss": 8.4122, + "step": 8 + }, + { + "epoch": 0.0008147008237530552, + "grad_norm": 2.7795889377593994, + "learning_rate": 1.8e-05, + "loss": 8.3877, + "step": 9 + }, + { + "epoch": 0.0009052231375033946, + "grad_norm": 2.3020100593566895, + "learning_rate": 2e-05, + "loss": 8.3544, + "step": 10 + }, + { + "epoch": 0.000995745451253734, + "grad_norm": 2.105527400970459, + "learning_rate": 2.2000000000000003e-05, + "loss": 8.3211, + "step": 11 + }, + { + "epoch": 0.0010862677650040735, + "grad_norm": 2.0495691299438477, + "learning_rate": 2.4e-05, + "loss": 8.3053, + "step": 12 + }, + { + "epoch": 0.001176790078754413, + "grad_norm": 1.9920520782470703, + "learning_rate": 2.6000000000000002e-05, + "loss": 8.2805, + "step": 13 + }, + { + "epoch": 0.0012673123925047525, + "grad_norm": 1.8438405990600586, + "learning_rate": 2.8000000000000003e-05, + "loss": 8.2162, + "step": 14 + }, + { + "epoch": 0.0013578347062550918, + "grad_norm": 1.7026909589767456, + "learning_rate": 3e-05, + "loss": 8.2363, + "step": 15 + }, + { + "epoch": 0.0014483570200054313, + "grad_norm": 1.6654070615768433, + "learning_rate": 3.2000000000000005e-05, + "loss": 8.1967, + "step": 16 + }, + { + "epoch": 0.0015388793337557708, + "grad_norm": 1.606818437576294, + "learning_rate": 3.4000000000000007e-05, + "loss": 8.1724, + "step": 17 + }, + { + "epoch": 0.0016294016475061103, + "grad_norm": 1.5517821311950684, + "learning_rate": 3.6e-05, + "loss": 8.13, + "step": 18 + }, + { + "epoch": 0.0017199239612564498, + "grad_norm": 1.511502981185913, + "learning_rate": 3.8e-05, + "loss": 8.0888, + "step": 19 + }, + { + "epoch": 0.001810446275006789, + "grad_norm": 1.5508986711502075, + "learning_rate": 4e-05, + "loss": 8.0653, + "step": 20 + }, + { + "epoch": 0.0019009685887571286, + "grad_norm": 1.5409382581710815, + "learning_rate": 4.2e-05, + "loss": 8.0266, + "step": 21 + }, + { + "epoch": 0.001991490902507468, + "grad_norm": 1.6338551044464111, + "learning_rate": 4.4000000000000006e-05, + "loss": 7.995, + "step": 22 + }, + { + "epoch": 0.0020820132162578074, + "grad_norm": 1.7471034526824951, + "learning_rate": 4.600000000000001e-05, + "loss": 7.9505, + "step": 23 + }, + { + "epoch": 0.002172535530008147, + "grad_norm": 1.6112545728683472, + "learning_rate": 4.8e-05, + "loss": 7.8981, + "step": 24 + }, + { + "epoch": 0.0022630578437584864, + "grad_norm": 1.7129489183425903, + "learning_rate": 5e-05, + "loss": 7.8488, + "step": 25 + }, + { + "epoch": 0.002353580157508826, + "grad_norm": 1.6461535692214966, + "learning_rate": 5.2000000000000004e-05, + "loss": 7.8213, + "step": 26 + }, + { + "epoch": 0.0024441024712591655, + "grad_norm": 1.9870665073394775, + "learning_rate": 5.4000000000000005e-05, + "loss": 7.7754, + "step": 27 + }, + { + "epoch": 0.002534624785009505, + "grad_norm": 1.6220543384552002, + "learning_rate": 5.6000000000000006e-05, + "loss": 7.6909, + "step": 28 + }, + { + "epoch": 0.0026251470987598445, + "grad_norm": 4.334130764007568, + "learning_rate": 5.8e-05, + "loss": 7.6899, + "step": 29 + }, + { + "epoch": 0.0027156694125101835, + "grad_norm": 3.4958887100219727, + "learning_rate": 6e-05, + "loss": 7.5933, + "step": 30 + }, + { + "epoch": 0.002806191726260523, + "grad_norm": 2.5281577110290527, + "learning_rate": 6.2e-05, + "loss": 7.546, + "step": 31 + }, + { + "epoch": 0.0028967140400108626, + "grad_norm": 4.500778675079346, + "learning_rate": 6.400000000000001e-05, + "loss": 7.5493, + "step": 32 + }, + { + "epoch": 0.002987236353761202, + "grad_norm": 2.2724337577819824, + "learning_rate": 6.6e-05, + "loss": 7.4798, + "step": 33 + }, + { + "epoch": 0.0030777586675115416, + "grad_norm": 4.736296653747559, + "learning_rate": 6.800000000000001e-05, + "loss": 7.4648, + "step": 34 + }, + { + "epoch": 0.003168280981261881, + "grad_norm": 2.454152822494507, + "learning_rate": 7e-05, + "loss": 7.3821, + "step": 35 + }, + { + "epoch": 0.0032588032950122206, + "grad_norm": 16.982067108154297, + "learning_rate": 7.2e-05, + "loss": 7.4612, + "step": 36 + }, + { + "epoch": 0.00334932560876256, + "grad_norm": 4.89740514755249, + "learning_rate": 7.4e-05, + "loss": 7.3308, + "step": 37 + }, + { + "epoch": 0.0034398479225128996, + "grad_norm": 3.0148491859436035, + "learning_rate": 7.6e-05, + "loss": 7.2499, + "step": 38 + }, + { + "epoch": 0.0035303702362632387, + "grad_norm": 3.8699262142181396, + "learning_rate": 7.800000000000001e-05, + "loss": 7.2579, + "step": 39 + }, + { + "epoch": 0.003620892550013578, + "grad_norm": 2.45534086227417, + "learning_rate": 8e-05, + "loss": 7.1558, + "step": 40 + }, + { + "epoch": 0.0037114148637639177, + "grad_norm": 5.5353827476501465, + "learning_rate": 8.2e-05, + "loss": 7.1483, + "step": 41 + }, + { + "epoch": 0.0038019371775142572, + "grad_norm": 2.3498878479003906, + "learning_rate": 8.4e-05, + "loss": 7.0903, + "step": 42 + }, + { + "epoch": 0.0038924594912645967, + "grad_norm": 3.148735284805298, + "learning_rate": 8.6e-05, + "loss": 7.0197, + "step": 43 + }, + { + "epoch": 0.003982981805014936, + "grad_norm": 3.3806612491607666, + "learning_rate": 8.800000000000001e-05, + "loss": 6.9801, + "step": 44 + }, + { + "epoch": 0.004073504118765276, + "grad_norm": 4.305522441864014, + "learning_rate": 9e-05, + "loss": 6.9286, + "step": 45 + }, + { + "epoch": 0.004164026432515615, + "grad_norm": 3.0494394302368164, + "learning_rate": 9.200000000000001e-05, + "loss": 6.9058, + "step": 46 + }, + { + "epoch": 0.004254548746265955, + "grad_norm": 1.9974802732467651, + "learning_rate": 9.4e-05, + "loss": 6.8774, + "step": 47 + }, + { + "epoch": 0.004345071060016294, + "grad_norm": 7.055619239807129, + "learning_rate": 9.6e-05, + "loss": 6.8402, + "step": 48 + }, + { + "epoch": 0.004435593373766634, + "grad_norm": 2.6249217987060547, + "learning_rate": 9.8e-05, + "loss": 6.7974, + "step": 49 + }, + { + "epoch": 0.004526115687516973, + "grad_norm": 2.5299758911132812, + "learning_rate": 0.0001, + "loss": 6.7687, + "step": 50 + }, + { + "epoch": 0.004616638001267313, + "grad_norm": 2.7093379497528076, + "learning_rate": 0.00010200000000000001, + "loss": 6.705, + "step": 51 + }, + { + "epoch": 0.004707160315017652, + "grad_norm": 2.7266974449157715, + "learning_rate": 0.00010400000000000001, + "loss": 6.5869, + "step": 52 + }, + { + "epoch": 0.004797682628767991, + "grad_norm": 2.071244239807129, + "learning_rate": 0.00010600000000000002, + "loss": 6.5817, + "step": 53 + }, + { + "epoch": 0.004888204942518331, + "grad_norm": 2.33720064163208, + "learning_rate": 0.00010800000000000001, + "loss": 6.582, + "step": 54 + }, + { + "epoch": 0.00497872725626867, + "grad_norm": 2.151409387588501, + "learning_rate": 0.00011000000000000002, + "loss": 6.5736, + "step": 55 + }, + { + "epoch": 0.00506924957001901, + "grad_norm": 3.529722213745117, + "learning_rate": 0.00011200000000000001, + "loss": 6.4519, + "step": 56 + }, + { + "epoch": 0.005159771883769349, + "grad_norm": 3.7185935974121094, + "learning_rate": 0.00011399999999999999, + "loss": 6.3824, + "step": 57 + }, + { + "epoch": 0.005250294197519689, + "grad_norm": 3.911526918411255, + "learning_rate": 0.000116, + "loss": 6.3547, + "step": 58 + }, + { + "epoch": 0.005340816511270028, + "grad_norm": 3.760375499725342, + "learning_rate": 0.000118, + "loss": 6.3178, + "step": 59 + }, + { + "epoch": 0.005431338825020367, + "grad_norm": 3.210420608520508, + "learning_rate": 0.00012, + "loss": 6.3338, + "step": 60 + }, + { + "epoch": 0.005521861138770707, + "grad_norm": 2.1828818321228027, + "learning_rate": 0.000122, + "loss": 6.2572, + "step": 61 + }, + { + "epoch": 0.005612383452521046, + "grad_norm": 2.459007740020752, + "learning_rate": 0.000124, + "loss": 6.2562, + "step": 62 + }, + { + "epoch": 0.005702905766271386, + "grad_norm": 2.0643117427825928, + "learning_rate": 0.000126, + "loss": 6.126, + "step": 63 + }, + { + "epoch": 0.005793428080021725, + "grad_norm": 3.102482557296753, + "learning_rate": 0.00012800000000000002, + "loss": 6.1061, + "step": 64 + }, + { + "epoch": 0.005883950393772065, + "grad_norm": 2.2519826889038086, + "learning_rate": 0.00013000000000000002, + "loss": 6.1004, + "step": 65 + }, + { + "epoch": 0.005974472707522404, + "grad_norm": 1.8878542184829712, + "learning_rate": 0.000132, + "loss": 6.0569, + "step": 66 + }, + { + "epoch": 0.006064995021272744, + "grad_norm": 1.7067312002182007, + "learning_rate": 0.000134, + "loss": 6.0232, + "step": 67 + }, + { + "epoch": 0.006155517335023083, + "grad_norm": 2.0737111568450928, + "learning_rate": 0.00013600000000000003, + "loss": 5.9652, + "step": 68 + }, + { + "epoch": 0.006246039648773422, + "grad_norm": 2.1083405017852783, + "learning_rate": 0.000138, + "loss": 5.9704, + "step": 69 + }, + { + "epoch": 0.006336561962523762, + "grad_norm": 1.5221211910247803, + "learning_rate": 0.00014, + "loss": 5.8621, + "step": 70 + }, + { + "epoch": 0.006427084276274101, + "grad_norm": 2.04142427444458, + "learning_rate": 0.000142, + "loss": 5.8764, + "step": 71 + }, + { + "epoch": 0.006517606590024441, + "grad_norm": 2.7947962284088135, + "learning_rate": 0.000144, + "loss": 5.8276, + "step": 72 + }, + { + "epoch": 0.00660812890377478, + "grad_norm": 2.515359878540039, + "learning_rate": 0.000146, + "loss": 5.7222, + "step": 73 + }, + { + "epoch": 0.00669865121752512, + "grad_norm": 1.4865795373916626, + "learning_rate": 0.000148, + "loss": 5.7296, + "step": 74 + }, + { + "epoch": 0.006789173531275459, + "grad_norm": 1.521973729133606, + "learning_rate": 0.00015000000000000001, + "loss": 5.6688, + "step": 75 + }, + { + "epoch": 0.006879695845025799, + "grad_norm": 1.8078877925872803, + "learning_rate": 0.000152, + "loss": 5.7169, + "step": 76 + }, + { + "epoch": 0.006970218158776138, + "grad_norm": 1.7064664363861084, + "learning_rate": 0.000154, + "loss": 5.6, + "step": 77 + }, + { + "epoch": 0.007060740472526477, + "grad_norm": 1.5587918758392334, + "learning_rate": 0.00015600000000000002, + "loss": 5.6158, + "step": 78 + }, + { + "epoch": 0.007151262786276817, + "grad_norm": 1.5954521894454956, + "learning_rate": 0.00015800000000000002, + "loss": 5.5937, + "step": 79 + }, + { + "epoch": 0.007241785100027156, + "grad_norm": 1.7314285039901733, + "learning_rate": 0.00016, + "loss": 5.6023, + "step": 80 + }, + { + "epoch": 0.007332307413777496, + "grad_norm": 1.9381115436553955, + "learning_rate": 0.000162, + "loss": 5.4748, + "step": 81 + }, + { + "epoch": 0.007422829727527835, + "grad_norm": 1.2395066022872925, + "learning_rate": 0.000164, + "loss": 5.4938, + "step": 82 + }, + { + "epoch": 0.007513352041278175, + "grad_norm": 1.3130717277526855, + "learning_rate": 0.000166, + "loss": 5.5345, + "step": 83 + }, + { + "epoch": 0.0076038743550285144, + "grad_norm": 1.5142767429351807, + "learning_rate": 0.000168, + "loss": 5.4502, + "step": 84 + }, + { + "epoch": 0.007694396668778854, + "grad_norm": 1.2253237962722778, + "learning_rate": 0.00017, + "loss": 5.3557, + "step": 85 + }, + { + "epoch": 0.0077849189825291935, + "grad_norm": 1.3859381675720215, + "learning_rate": 0.000172, + "loss": 5.3861, + "step": 86 + }, + { + "epoch": 0.007875441296279533, + "grad_norm": 1.8688181638717651, + "learning_rate": 0.000174, + "loss": 5.4363, + "step": 87 + }, + { + "epoch": 0.007965963610029872, + "grad_norm": 1.328377604484558, + "learning_rate": 0.00017600000000000002, + "loss": 5.3266, + "step": 88 + }, + { + "epoch": 0.008056485923780212, + "grad_norm": 1.5786259174346924, + "learning_rate": 0.00017800000000000002, + "loss": 5.3736, + "step": 89 + }, + { + "epoch": 0.008147008237530552, + "grad_norm": 1.2112221717834473, + "learning_rate": 0.00018, + "loss": 5.2945, + "step": 90 + }, + { + "epoch": 0.00823753055128089, + "grad_norm": 1.210732340812683, + "learning_rate": 0.000182, + "loss": 5.2812, + "step": 91 + }, + { + "epoch": 0.00832805286503123, + "grad_norm": 1.3312040567398071, + "learning_rate": 0.00018400000000000003, + "loss": 5.2365, + "step": 92 + }, + { + "epoch": 0.00841857517878157, + "grad_norm": 1.0963982343673706, + "learning_rate": 0.00018600000000000002, + "loss": 5.243, + "step": 93 + }, + { + "epoch": 0.00850909749253191, + "grad_norm": 1.202478051185608, + "learning_rate": 0.000188, + "loss": 5.2121, + "step": 94 + }, + { + "epoch": 0.008599619806282249, + "grad_norm": 1.3619744777679443, + "learning_rate": 0.00019, + "loss": 5.186, + "step": 95 + }, + { + "epoch": 0.008690142120032588, + "grad_norm": 1.3738235235214233, + "learning_rate": 0.000192, + "loss": 5.2303, + "step": 96 + }, + { + "epoch": 0.008780664433782927, + "grad_norm": 1.2414801120758057, + "learning_rate": 0.000194, + "loss": 5.222, + "step": 97 + }, + { + "epoch": 0.008871186747533268, + "grad_norm": 1.2269741296768188, + "learning_rate": 0.000196, + "loss": 5.1837, + "step": 98 + }, + { + "epoch": 0.008961709061283607, + "grad_norm": 1.3814342021942139, + "learning_rate": 0.00019800000000000002, + "loss": 5.1261, + "step": 99 + }, + { + "epoch": 0.009052231375033946, + "grad_norm": 1.3333874940872192, + "learning_rate": 0.0002, + "loss": 5.135, + "step": 100 + }, + { + "epoch": 0.009142753688784285, + "grad_norm": 0.9502299427986145, + "learning_rate": 0.00019999999496500136, + "loss": 5.167, + "step": 101 + }, + { + "epoch": 0.009233276002534626, + "grad_norm": 1.2078323364257812, + "learning_rate": 0.00019999997986000598, + "loss": 5.1009, + "step": 102 + }, + { + "epoch": 0.009323798316284965, + "grad_norm": 1.2079190015792847, + "learning_rate": 0.00019999995468501536, + "loss": 5.1678, + "step": 103 + }, + { + "epoch": 0.009414320630035304, + "grad_norm": 1.362955927848816, + "learning_rate": 0.00019999991944003202, + "loss": 5.0459, + "step": 104 + }, + { + "epoch": 0.009504842943785643, + "grad_norm": 1.3274837732315063, + "learning_rate": 0.00019999987412505955, + "loss": 5.1548, + "step": 105 + }, + { + "epoch": 0.009595365257535982, + "grad_norm": 1.1168432235717773, + "learning_rate": 0.00019999981874010248, + "loss": 5.0671, + "step": 106 + }, + { + "epoch": 0.009685887571286323, + "grad_norm": 1.6353976726531982, + "learning_rate": 0.00019999975328516642, + "loss": 5.0539, + "step": 107 + }, + { + "epoch": 0.009776409885036662, + "grad_norm": 1.2709882259368896, + "learning_rate": 0.00019999967776025793, + "loss": 5.0878, + "step": 108 + }, + { + "epoch": 0.009866932198787001, + "grad_norm": 1.3899245262145996, + "learning_rate": 0.00019999959216538462, + "loss": 4.9687, + "step": 109 + }, + { + "epoch": 0.00995745451253734, + "grad_norm": 1.2088505029678345, + "learning_rate": 0.00019999949650055513, + "loss": 5.0465, + "step": 110 + }, + { + "epoch": 0.01004797682628768, + "grad_norm": 0.9921075105667114, + "learning_rate": 0.00019999939076577905, + "loss": 5.1357, + "step": 111 + }, + { + "epoch": 0.01013849914003802, + "grad_norm": 1.2015072107315063, + "learning_rate": 0.00019999927496106707, + "loss": 5.0771, + "step": 112 + }, + { + "epoch": 0.010229021453788359, + "grad_norm": 1.0013004541397095, + "learning_rate": 0.00019999914908643086, + "loss": 5.0229, + "step": 113 + }, + { + "epoch": 0.010319543767538698, + "grad_norm": 1.2613677978515625, + "learning_rate": 0.00019999901314188303, + "loss": 4.9829, + "step": 114 + }, + { + "epoch": 0.010410066081289037, + "grad_norm": 1.47295081615448, + "learning_rate": 0.00019999886712743732, + "loss": 4.9887, + "step": 115 + }, + { + "epoch": 0.010500588395039378, + "grad_norm": 1.5374642610549927, + "learning_rate": 0.00019999871104310846, + "loss": 5.0556, + "step": 116 + }, + { + "epoch": 0.010591110708789717, + "grad_norm": 1.3752161264419556, + "learning_rate": 0.00019999854488891213, + "loss": 5.0298, + "step": 117 + }, + { + "epoch": 0.010681633022540056, + "grad_norm": 1.0788917541503906, + "learning_rate": 0.00019999836866486503, + "loss": 4.9835, + "step": 118 + }, + { + "epoch": 0.010772155336290395, + "grad_norm": 1.0625488758087158, + "learning_rate": 0.00019999818237098496, + "loss": 4.9575, + "step": 119 + }, + { + "epoch": 0.010862677650040734, + "grad_norm": 1.0351204872131348, + "learning_rate": 0.00019999798600729064, + "loss": 4.9703, + "step": 120 + }, + { + "epoch": 0.010953199963791075, + "grad_norm": 0.9874302744865417, + "learning_rate": 0.0001999977795738019, + "loss": 5.0153, + "step": 121 + }, + { + "epoch": 0.011043722277541414, + "grad_norm": 1.0721620321273804, + "learning_rate": 0.00019999756307053948, + "loss": 4.9537, + "step": 122 + }, + { + "epoch": 0.011134244591291753, + "grad_norm": 1.1902644634246826, + "learning_rate": 0.0001999973364975252, + "loss": 5.0332, + "step": 123 + }, + { + "epoch": 0.011224766905042092, + "grad_norm": 0.9976598620414734, + "learning_rate": 0.00019999709985478188, + "loss": 4.8953, + "step": 124 + }, + { + "epoch": 0.011315289218792433, + "grad_norm": 1.1079658269882202, + "learning_rate": 0.0001999968531423333, + "loss": 4.9752, + "step": 125 + }, + { + "epoch": 0.011405811532542772, + "grad_norm": 1.037431240081787, + "learning_rate": 0.0001999965963602044, + "loss": 4.9653, + "step": 126 + }, + { + "epoch": 0.011496333846293111, + "grad_norm": 1.1416285037994385, + "learning_rate": 0.00019999632950842093, + "loss": 4.926, + "step": 127 + }, + { + "epoch": 0.01158685616004345, + "grad_norm": 1.3808517456054688, + "learning_rate": 0.00019999605258700983, + "loss": 4.9866, + "step": 128 + }, + { + "epoch": 0.01167737847379379, + "grad_norm": 1.0173901319503784, + "learning_rate": 0.00019999576559599902, + "loss": 5.002, + "step": 129 + }, + { + "epoch": 0.01176790078754413, + "grad_norm": 1.1562535762786865, + "learning_rate": 0.0001999954685354173, + "loss": 4.9624, + "step": 130 + }, + { + "epoch": 0.01185842310129447, + "grad_norm": 1.1456613540649414, + "learning_rate": 0.00019999516140529463, + "loss": 4.9042, + "step": 131 + }, + { + "epoch": 0.011948945415044808, + "grad_norm": 1.322360634803772, + "learning_rate": 0.00019999484420566197, + "loss": 4.804, + "step": 132 + }, + { + "epoch": 0.012039467728795147, + "grad_norm": 1.1253564357757568, + "learning_rate": 0.00019999451693655123, + "loss": 4.8905, + "step": 133 + }, + { + "epoch": 0.012129990042545488, + "grad_norm": 1.1110708713531494, + "learning_rate": 0.00019999417959799535, + "loss": 4.9735, + "step": 134 + }, + { + "epoch": 0.012220512356295827, + "grad_norm": 1.0920100212097168, + "learning_rate": 0.00019999383219002835, + "loss": 4.9384, + "step": 135 + }, + { + "epoch": 0.012311034670046166, + "grad_norm": 0.9289610981941223, + "learning_rate": 0.00019999347471268516, + "loss": 4.9417, + "step": 136 + }, + { + "epoch": 0.012401556983796505, + "grad_norm": 1.009425163269043, + "learning_rate": 0.0001999931071660018, + "loss": 4.9405, + "step": 137 + }, + { + "epoch": 0.012492079297546844, + "grad_norm": 1.14154851436615, + "learning_rate": 0.00019999272955001528, + "loss": 4.9205, + "step": 138 + }, + { + "epoch": 0.012582601611297185, + "grad_norm": 0.9523165822029114, + "learning_rate": 0.00019999234186476365, + "loss": 4.8967, + "step": 139 + }, + { + "epoch": 0.012673123925047524, + "grad_norm": 1.2908835411071777, + "learning_rate": 0.00019999194411028594, + "loss": 4.9088, + "step": 140 + }, + { + "epoch": 0.012763646238797863, + "grad_norm": 0.9540936946868896, + "learning_rate": 0.00019999153628662218, + "loss": 4.9449, + "step": 141 + }, + { + "epoch": 0.012854168552548203, + "grad_norm": 0.9404425621032715, + "learning_rate": 0.00019999111839381345, + "loss": 4.9067, + "step": 142 + }, + { + "epoch": 0.012944690866298543, + "grad_norm": 0.8981451988220215, + "learning_rate": 0.00019999069043190185, + "loss": 4.8862, + "step": 143 + }, + { + "epoch": 0.013035213180048882, + "grad_norm": 0.8868217468261719, + "learning_rate": 0.00019999025240093044, + "loss": 4.8235, + "step": 144 + }, + { + "epoch": 0.013125735493799221, + "grad_norm": 0.8762542605400085, + "learning_rate": 0.00019998980430094334, + "loss": 4.8599, + "step": 145 + }, + { + "epoch": 0.01321625780754956, + "grad_norm": 1.0478554964065552, + "learning_rate": 0.0001999893461319857, + "loss": 4.8078, + "step": 146 + }, + { + "epoch": 0.0133067801212999, + "grad_norm": 1.216299295425415, + "learning_rate": 0.00019998887789410364, + "loss": 4.8147, + "step": 147 + }, + { + "epoch": 0.01339730243505024, + "grad_norm": 0.9078794717788696, + "learning_rate": 0.0001999883995873443, + "loss": 4.8386, + "step": 148 + }, + { + "epoch": 0.01348782474880058, + "grad_norm": 0.9007813930511475, + "learning_rate": 0.0001999879112117559, + "loss": 4.8378, + "step": 149 + }, + { + "epoch": 0.013578347062550919, + "grad_norm": 0.8915342092514038, + "learning_rate": 0.00019998741276738754, + "loss": 4.8782, + "step": 150 + }, + { + "epoch": 0.013668869376301258, + "grad_norm": 0.9013955593109131, + "learning_rate": 0.00019998690425428943, + "loss": 4.8299, + "step": 151 + }, + { + "epoch": 0.013759391690051598, + "grad_norm": 0.9909464120864868, + "learning_rate": 0.0001999863856725128, + "loss": 4.7892, + "step": 152 + }, + { + "epoch": 0.013849914003801938, + "grad_norm": 1.0322654247283936, + "learning_rate": 0.0001999858570221099, + "loss": 4.7991, + "step": 153 + }, + { + "epoch": 0.013940436317552277, + "grad_norm": 0.9434685111045837, + "learning_rate": 0.00019998531830313395, + "loss": 4.8662, + "step": 154 + }, + { + "epoch": 0.014030958631302616, + "grad_norm": 0.8822855353355408, + "learning_rate": 0.00019998476951563915, + "loss": 4.808, + "step": 155 + }, + { + "epoch": 0.014121480945052955, + "grad_norm": 1.0918142795562744, + "learning_rate": 0.00019998421065968078, + "loss": 4.9409, + "step": 156 + }, + { + "epoch": 0.014212003258803296, + "grad_norm": 1.1329283714294434, + "learning_rate": 0.00019998364173531513, + "loss": 4.8035, + "step": 157 + }, + { + "epoch": 0.014302525572553635, + "grad_norm": 1.6153861284255981, + "learning_rate": 0.00019998306274259954, + "loss": 4.8325, + "step": 158 + }, + { + "epoch": 0.014393047886303974, + "grad_norm": 0.9851101636886597, + "learning_rate": 0.00019998247368159224, + "loss": 4.7919, + "step": 159 + }, + { + "epoch": 0.014483570200054313, + "grad_norm": 0.9160235524177551, + "learning_rate": 0.0001999818745523526, + "loss": 4.8134, + "step": 160 + }, + { + "epoch": 0.014574092513804654, + "grad_norm": 1.10393488407135, + "learning_rate": 0.00019998126535494088, + "loss": 4.8546, + "step": 161 + }, + { + "epoch": 0.014664614827554993, + "grad_norm": 1.088984489440918, + "learning_rate": 0.0001999806460894185, + "loss": 4.7318, + "step": 162 + }, + { + "epoch": 0.014755137141305332, + "grad_norm": 1.0326741933822632, + "learning_rate": 0.00019998001675584781, + "loss": 4.8165, + "step": 163 + }, + { + "epoch": 0.01484565945505567, + "grad_norm": 0.8954424262046814, + "learning_rate": 0.00019997937735429219, + "loss": 4.8473, + "step": 164 + }, + { + "epoch": 0.01493618176880601, + "grad_norm": 0.9650543332099915, + "learning_rate": 0.00019997872788481594, + "loss": 4.8017, + "step": 165 + }, + { + "epoch": 0.01502670408255635, + "grad_norm": 1.042865514755249, + "learning_rate": 0.00019997806834748456, + "loss": 4.8427, + "step": 166 + }, + { + "epoch": 0.01511722639630669, + "grad_norm": 0.8964381814002991, + "learning_rate": 0.00019997739874236443, + "loss": 4.7021, + "step": 167 + }, + { + "epoch": 0.015207748710057029, + "grad_norm": 1.068221092224121, + "learning_rate": 0.00019997671906952298, + "loss": 4.8155, + "step": 168 + }, + { + "epoch": 0.015298271023807368, + "grad_norm": 1.0170791149139404, + "learning_rate": 0.00019997602932902866, + "loss": 4.8067, + "step": 169 + }, + { + "epoch": 0.015388793337557709, + "grad_norm": 1.1236660480499268, + "learning_rate": 0.00019997532952095094, + "loss": 4.8021, + "step": 170 + }, + { + "epoch": 0.015479315651308048, + "grad_norm": 0.9010448455810547, + "learning_rate": 0.00019997461964536023, + "loss": 4.8208, + "step": 171 + }, + { + "epoch": 0.015569837965058387, + "grad_norm": 1.1579195261001587, + "learning_rate": 0.0001999738997023281, + "loss": 4.8075, + "step": 172 + }, + { + "epoch": 0.015660360278808728, + "grad_norm": 0.8007659912109375, + "learning_rate": 0.00019997316969192698, + "loss": 4.791, + "step": 173 + }, + { + "epoch": 0.015750882592559065, + "grad_norm": 0.9970986247062683, + "learning_rate": 0.00019997242961423042, + "loss": 4.711, + "step": 174 + }, + { + "epoch": 0.015841404906309406, + "grad_norm": 0.7657274603843689, + "learning_rate": 0.0001999716794693129, + "loss": 4.7906, + "step": 175 + }, + { + "epoch": 0.015931927220059743, + "grad_norm": 0.899833083152771, + "learning_rate": 0.00019997091925725004, + "loss": 4.8504, + "step": 176 + }, + { + "epoch": 0.016022449533810084, + "grad_norm": 1.044193983078003, + "learning_rate": 0.00019997014897811833, + "loss": 4.8014, + "step": 177 + }, + { + "epoch": 0.016112971847560425, + "grad_norm": 1.321266531944275, + "learning_rate": 0.00019996936863199535, + "loss": 4.807, + "step": 178 + }, + { + "epoch": 0.016203494161310762, + "grad_norm": 0.8726624250411987, + "learning_rate": 0.00019996857821895966, + "loss": 4.7457, + "step": 179 + }, + { + "epoch": 0.016294016475061103, + "grad_norm": 0.9373568892478943, + "learning_rate": 0.00019996777773909093, + "loss": 4.7909, + "step": 180 + }, + { + "epoch": 0.01638453878881144, + "grad_norm": 0.933677613735199, + "learning_rate": 0.00019996696719246969, + "loss": 4.7476, + "step": 181 + }, + { + "epoch": 0.01647506110256178, + "grad_norm": 0.8615386486053467, + "learning_rate": 0.00019996614657917758, + "loss": 4.757, + "step": 182 + }, + { + "epoch": 0.016565583416312122, + "grad_norm": 0.9267805218696594, + "learning_rate": 0.00019996531589929725, + "loss": 4.8215, + "step": 183 + }, + { + "epoch": 0.01665610573006246, + "grad_norm": 0.9681649804115295, + "learning_rate": 0.00019996447515291233, + "loss": 4.7463, + "step": 184 + }, + { + "epoch": 0.0167466280438128, + "grad_norm": 0.9593618512153625, + "learning_rate": 0.00019996362434010752, + "loss": 4.7061, + "step": 185 + }, + { + "epoch": 0.01683715035756314, + "grad_norm": 1.017175555229187, + "learning_rate": 0.00019996276346096847, + "loss": 4.7243, + "step": 186 + }, + { + "epoch": 0.01692767267131348, + "grad_norm": 1.036476731300354, + "learning_rate": 0.00019996189251558189, + "loss": 4.7767, + "step": 187 + }, + { + "epoch": 0.01701819498506382, + "grad_norm": 0.892395555973053, + "learning_rate": 0.00019996101150403543, + "loss": 4.745, + "step": 188 + }, + { + "epoch": 0.017108717298814156, + "grad_norm": 1.1248540878295898, + "learning_rate": 0.00019996012042641786, + "loss": 4.7538, + "step": 189 + }, + { + "epoch": 0.017199239612564497, + "grad_norm": 0.9392964243888855, + "learning_rate": 0.00019995921928281894, + "loss": 4.7655, + "step": 190 + }, + { + "epoch": 0.017289761926314838, + "grad_norm": 1.0880126953125, + "learning_rate": 0.00019995830807332932, + "loss": 4.7164, + "step": 191 + }, + { + "epoch": 0.017380284240065175, + "grad_norm": 0.9286646246910095, + "learning_rate": 0.00019995738679804085, + "loss": 4.7107, + "step": 192 + }, + { + "epoch": 0.017470806553815516, + "grad_norm": 1.0103776454925537, + "learning_rate": 0.00019995645545704623, + "loss": 4.6853, + "step": 193 + }, + { + "epoch": 0.017561328867565854, + "grad_norm": 1.0643168687820435, + "learning_rate": 0.00019995551405043932, + "loss": 4.7065, + "step": 194 + }, + { + "epoch": 0.017651851181316194, + "grad_norm": 0.9227851629257202, + "learning_rate": 0.00019995456257831484, + "loss": 4.7221, + "step": 195 + }, + { + "epoch": 0.017742373495066535, + "grad_norm": 0.8938180208206177, + "learning_rate": 0.00019995360104076867, + "loss": 4.7293, + "step": 196 + }, + { + "epoch": 0.017832895808816873, + "grad_norm": 1.2211713790893555, + "learning_rate": 0.0001999526294378976, + "loss": 4.707, + "step": 197 + }, + { + "epoch": 0.017923418122567213, + "grad_norm": 0.8459523916244507, + "learning_rate": 0.0001999516477697995, + "loss": 4.6857, + "step": 198 + }, + { + "epoch": 0.01801394043631755, + "grad_norm": 0.9301251769065857, + "learning_rate": 0.00019995065603657316, + "loss": 4.7267, + "step": 199 + }, + { + "epoch": 0.01810446275006789, + "grad_norm": 0.8725582957267761, + "learning_rate": 0.00019994965423831854, + "loss": 4.6945, + "step": 200 + }, + { + "epoch": 0.018194985063818232, + "grad_norm": 1.0342596769332886, + "learning_rate": 0.00019994864237513643, + "loss": 4.707, + "step": 201 + }, + { + "epoch": 0.01828550737756857, + "grad_norm": 0.8990219235420227, + "learning_rate": 0.0001999476204471288, + "loss": 4.7324, + "step": 202 + }, + { + "epoch": 0.01837602969131891, + "grad_norm": 0.9822607040405273, + "learning_rate": 0.00019994658845439852, + "loss": 4.7214, + "step": 203 + }, + { + "epoch": 0.01846655200506925, + "grad_norm": 1.0090265274047852, + "learning_rate": 0.0001999455463970495, + "loss": 4.6733, + "step": 204 + }, + { + "epoch": 0.01855707431881959, + "grad_norm": 0.8931706547737122, + "learning_rate": 0.0001999444942751867, + "loss": 4.673, + "step": 205 + }, + { + "epoch": 0.01864759663256993, + "grad_norm": 0.975825309753418, + "learning_rate": 0.00019994343208891606, + "loss": 4.6323, + "step": 206 + }, + { + "epoch": 0.018738118946320267, + "grad_norm": 0.967452347278595, + "learning_rate": 0.00019994235983834455, + "loss": 4.6975, + "step": 207 + }, + { + "epoch": 0.018828641260070608, + "grad_norm": 0.8899520039558411, + "learning_rate": 0.00019994127752358013, + "loss": 4.6753, + "step": 208 + }, + { + "epoch": 0.01891916357382095, + "grad_norm": 1.09125816822052, + "learning_rate": 0.00019994018514473183, + "loss": 4.644, + "step": 209 + }, + { + "epoch": 0.019009685887571286, + "grad_norm": 0.9174689054489136, + "learning_rate": 0.0001999390827019096, + "loss": 4.6589, + "step": 210 + }, + { + "epoch": 0.019100208201321626, + "grad_norm": 0.8886850476264954, + "learning_rate": 0.00019993797019522447, + "loss": 4.6273, + "step": 211 + }, + { + "epoch": 0.019190730515071964, + "grad_norm": 0.8056244850158691, + "learning_rate": 0.00019993684762478845, + "loss": 4.6332, + "step": 212 + }, + { + "epoch": 0.019281252828822305, + "grad_norm": 0.7783074975013733, + "learning_rate": 0.00019993571499071465, + "loss": 4.6484, + "step": 213 + }, + { + "epoch": 0.019371775142572645, + "grad_norm": 0.8422152400016785, + "learning_rate": 0.00019993457229311708, + "loss": 4.6529, + "step": 214 + }, + { + "epoch": 0.019462297456322983, + "grad_norm": 0.8490102887153625, + "learning_rate": 0.0001999334195321108, + "loss": 4.6784, + "step": 215 + }, + { + "epoch": 0.019552819770073324, + "grad_norm": 0.931522786617279, + "learning_rate": 0.0001999322567078119, + "loss": 4.6701, + "step": 216 + }, + { + "epoch": 0.01964334208382366, + "grad_norm": 0.825369119644165, + "learning_rate": 0.0001999310838203375, + "loss": 4.64, + "step": 217 + }, + { + "epoch": 0.019733864397574002, + "grad_norm": 1.099587082862854, + "learning_rate": 0.0001999299008698057, + "loss": 4.6228, + "step": 218 + }, + { + "epoch": 0.019824386711324343, + "grad_norm": 0.8843775391578674, + "learning_rate": 0.00019992870785633563, + "loss": 4.6485, + "step": 219 + }, + { + "epoch": 0.01991490902507468, + "grad_norm": 0.8883227705955505, + "learning_rate": 0.00019992750478004738, + "loss": 4.6917, + "step": 220 + }, + { + "epoch": 0.02000543133882502, + "grad_norm": 1.078590989112854, + "learning_rate": 0.0001999262916410621, + "loss": 4.6544, + "step": 221 + }, + { + "epoch": 0.02009595365257536, + "grad_norm": 1.051282286643982, + "learning_rate": 0.00019992506843950208, + "loss": 4.7503, + "step": 222 + }, + { + "epoch": 0.0201864759663257, + "grad_norm": 0.8993271589279175, + "learning_rate": 0.0001999238351754903, + "loss": 4.6718, + "step": 223 + }, + { + "epoch": 0.02027699828007604, + "grad_norm": 0.9291291832923889, + "learning_rate": 0.00019992259184915115, + "loss": 4.6306, + "step": 224 + }, + { + "epoch": 0.020367520593826377, + "grad_norm": 0.9018997550010681, + "learning_rate": 0.00019992133846060968, + "loss": 4.6095, + "step": 225 + }, + { + "epoch": 0.020458042907576718, + "grad_norm": 0.9196507930755615, + "learning_rate": 0.00019992007500999214, + "loss": 4.6376, + "step": 226 + }, + { + "epoch": 0.02054856522132706, + "grad_norm": 1.0362221002578735, + "learning_rate": 0.00019991880149742582, + "loss": 4.6297, + "step": 227 + }, + { + "epoch": 0.020639087535077396, + "grad_norm": 0.9928797483444214, + "learning_rate": 0.00019991751792303893, + "loss": 4.6679, + "step": 228 + }, + { + "epoch": 0.020729609848827737, + "grad_norm": 1.0466713905334473, + "learning_rate": 0.00019991622428696068, + "loss": 4.6161, + "step": 229 + }, + { + "epoch": 0.020820132162578074, + "grad_norm": 1.0954371690750122, + "learning_rate": 0.00019991492058932142, + "loss": 4.6043, + "step": 230 + }, + { + "epoch": 0.020910654476328415, + "grad_norm": 0.9638803601264954, + "learning_rate": 0.00019991360683025238, + "loss": 4.6289, + "step": 231 + }, + { + "epoch": 0.021001176790078756, + "grad_norm": 0.8331062197685242, + "learning_rate": 0.00019991228300988585, + "loss": 4.6163, + "step": 232 + }, + { + "epoch": 0.021091699103829093, + "grad_norm": 0.9988797307014465, + "learning_rate": 0.00019991094912835515, + "loss": 4.6584, + "step": 233 + }, + { + "epoch": 0.021182221417579434, + "grad_norm": 0.9852668046951294, + "learning_rate": 0.0001999096051857946, + "loss": 4.6209, + "step": 234 + }, + { + "epoch": 0.02127274373132977, + "grad_norm": 1.0406630039215088, + "learning_rate": 0.00019990825118233957, + "loss": 4.578, + "step": 235 + }, + { + "epoch": 0.021363266045080112, + "grad_norm": 0.97223299741745, + "learning_rate": 0.0001999068871181264, + "loss": 4.7119, + "step": 236 + }, + { + "epoch": 0.021453788358830453, + "grad_norm": 1.0562740564346313, + "learning_rate": 0.00019990551299329238, + "loss": 4.6474, + "step": 237 + }, + { + "epoch": 0.02154431067258079, + "grad_norm": 0.8278136849403381, + "learning_rate": 0.00019990412880797597, + "loss": 4.6211, + "step": 238 + }, + { + "epoch": 0.02163483298633113, + "grad_norm": 0.9848172664642334, + "learning_rate": 0.0001999027345623165, + "loss": 4.6811, + "step": 239 + }, + { + "epoch": 0.02172535530008147, + "grad_norm": 0.9529289603233337, + "learning_rate": 0.0001999013302564544, + "loss": 4.6545, + "step": 240 + }, + { + "epoch": 0.02181587761383181, + "grad_norm": 0.993384063243866, + "learning_rate": 0.00019989991589053108, + "loss": 4.6565, + "step": 241 + }, + { + "epoch": 0.02190639992758215, + "grad_norm": 0.9394677877426147, + "learning_rate": 0.00019989849146468896, + "loss": 4.5875, + "step": 242 + }, + { + "epoch": 0.021996922241332487, + "grad_norm": 0.9237437844276428, + "learning_rate": 0.00019989705697907149, + "loss": 4.5709, + "step": 243 + }, + { + "epoch": 0.022087444555082828, + "grad_norm": 0.8978863954544067, + "learning_rate": 0.00019989561243382312, + "loss": 4.6205, + "step": 244 + }, + { + "epoch": 0.02217796686883317, + "grad_norm": 0.8756672143936157, + "learning_rate": 0.00019989415782908928, + "loss": 4.6204, + "step": 245 + }, + { + "epoch": 0.022268489182583506, + "grad_norm": 0.9668368101119995, + "learning_rate": 0.0001998926931650165, + "loss": 4.5963, + "step": 246 + }, + { + "epoch": 0.022359011496333847, + "grad_norm": 0.9980696439743042, + "learning_rate": 0.00019989121844175224, + "loss": 4.5992, + "step": 247 + }, + { + "epoch": 0.022449533810084184, + "grad_norm": 1.0099190473556519, + "learning_rate": 0.00019988973365944507, + "loss": 4.5303, + "step": 248 + }, + { + "epoch": 0.022540056123834525, + "grad_norm": 0.9659672975540161, + "learning_rate": 0.00019988823881824438, + "loss": 4.5601, + "step": 249 + }, + { + "epoch": 0.022630578437584866, + "grad_norm": 1.0651359558105469, + "learning_rate": 0.0001998867339183008, + "loss": 4.609, + "step": 250 + }, + { + "epoch": 0.022721100751335203, + "grad_norm": 0.9525238275527954, + "learning_rate": 0.00019988521895976586, + "loss": 4.6038, + "step": 251 + }, + { + "epoch": 0.022811623065085544, + "grad_norm": 0.859809398651123, + "learning_rate": 0.0001998836939427921, + "loss": 4.6071, + "step": 252 + }, + { + "epoch": 0.02290214537883588, + "grad_norm": 1.0387052297592163, + "learning_rate": 0.00019988215886753308, + "loss": 4.6451, + "step": 253 + }, + { + "epoch": 0.022992667692586222, + "grad_norm": 0.9915367960929871, + "learning_rate": 0.0001998806137341434, + "loss": 4.589, + "step": 254 + }, + { + "epoch": 0.023083190006336563, + "grad_norm": 0.8341922163963318, + "learning_rate": 0.00019987905854277867, + "loss": 4.5568, + "step": 255 + }, + { + "epoch": 0.0231737123200869, + "grad_norm": 0.8853479027748108, + "learning_rate": 0.00019987749329359548, + "loss": 4.5839, + "step": 256 + }, + { + "epoch": 0.02326423463383724, + "grad_norm": 0.9336071610450745, + "learning_rate": 0.0001998759179867514, + "loss": 4.6152, + "step": 257 + }, + { + "epoch": 0.02335475694758758, + "grad_norm": 1.0642181634902954, + "learning_rate": 0.00019987433262240516, + "loss": 4.5539, + "step": 258 + }, + { + "epoch": 0.02344527926133792, + "grad_norm": 1.0101536512374878, + "learning_rate": 0.00019987273720071632, + "loss": 4.6234, + "step": 259 + }, + { + "epoch": 0.02353580157508826, + "grad_norm": 0.9692165851593018, + "learning_rate": 0.00019987113172184563, + "loss": 4.5625, + "step": 260 + }, + { + "epoch": 0.023626323888838598, + "grad_norm": 1.0215392112731934, + "learning_rate": 0.00019986951618595464, + "loss": 4.6085, + "step": 261 + }, + { + "epoch": 0.02371684620258894, + "grad_norm": 1.2795698642730713, + "learning_rate": 0.00019986789059320615, + "loss": 4.5254, + "step": 262 + }, + { + "epoch": 0.02380736851633928, + "grad_norm": 0.8494666814804077, + "learning_rate": 0.0001998662549437638, + "loss": 4.5586, + "step": 263 + }, + { + "epoch": 0.023897890830089617, + "grad_norm": 1.0478988885879517, + "learning_rate": 0.0001998646092377923, + "loss": 4.5894, + "step": 264 + }, + { + "epoch": 0.023988413143839957, + "grad_norm": 0.8754821419715881, + "learning_rate": 0.0001998629534754574, + "loss": 4.5618, + "step": 265 + }, + { + "epoch": 0.024078935457590295, + "grad_norm": 1.0455201864242554, + "learning_rate": 0.0001998612876569258, + "loss": 4.5031, + "step": 266 + }, + { + "epoch": 0.024169457771340636, + "grad_norm": 0.9330147504806519, + "learning_rate": 0.00019985961178236529, + "loss": 4.5686, + "step": 267 + }, + { + "epoch": 0.024259980085090976, + "grad_norm": 0.933779239654541, + "learning_rate": 0.00019985792585194457, + "loss": 4.5083, + "step": 268 + }, + { + "epoch": 0.024350502398841314, + "grad_norm": 0.9001979231834412, + "learning_rate": 0.00019985622986583347, + "loss": 4.4943, + "step": 269 + }, + { + "epoch": 0.024441024712591655, + "grad_norm": 1.0643647909164429, + "learning_rate": 0.00019985452382420275, + "loss": 4.5722, + "step": 270 + }, + { + "epoch": 0.024531547026341992, + "grad_norm": 0.876860499382019, + "learning_rate": 0.00019985280772722423, + "loss": 4.5179, + "step": 271 + }, + { + "epoch": 0.024622069340092333, + "grad_norm": 1.0777976512908936, + "learning_rate": 0.00019985108157507067, + "loss": 4.5369, + "step": 272 + }, + { + "epoch": 0.024712591653842673, + "grad_norm": 0.962979257106781, + "learning_rate": 0.00019984934536791595, + "loss": 4.6008, + "step": 273 + }, + { + "epoch": 0.02480311396759301, + "grad_norm": 0.8431717157363892, + "learning_rate": 0.00019984759910593488, + "loss": 4.4974, + "step": 274 + }, + { + "epoch": 0.02489363628134335, + "grad_norm": 0.915391206741333, + "learning_rate": 0.0001998458427893033, + "loss": 4.4514, + "step": 275 + }, + { + "epoch": 0.02498415859509369, + "grad_norm": 0.8736391067504883, + "learning_rate": 0.00019984407641819812, + "loss": 4.5691, + "step": 276 + }, + { + "epoch": 0.02507468090884403, + "grad_norm": 0.9402247667312622, + "learning_rate": 0.00019984229999279714, + "loss": 4.406, + "step": 277 + }, + { + "epoch": 0.02516520322259437, + "grad_norm": 0.9595934152603149, + "learning_rate": 0.0001998405135132793, + "loss": 4.5847, + "step": 278 + }, + { + "epoch": 0.025255725536344708, + "grad_norm": 0.8316798210144043, + "learning_rate": 0.00019983871697982445, + "loss": 4.4519, + "step": 279 + }, + { + "epoch": 0.02534624785009505, + "grad_norm": 0.8516654968261719, + "learning_rate": 0.00019983691039261357, + "loss": 4.5498, + "step": 280 + }, + { + "epoch": 0.02543677016384539, + "grad_norm": 0.915367603302002, + "learning_rate": 0.00019983509375182855, + "loss": 4.5475, + "step": 281 + }, + { + "epoch": 0.025527292477595727, + "grad_norm": 0.9939991235733032, + "learning_rate": 0.0001998332670576523, + "loss": 4.5124, + "step": 282 + }, + { + "epoch": 0.025617814791346068, + "grad_norm": 1.2104617357254028, + "learning_rate": 0.00019983143031026879, + "loss": 4.4694, + "step": 283 + }, + { + "epoch": 0.025708337105096405, + "grad_norm": 1.1700996160507202, + "learning_rate": 0.00019982958350986296, + "loss": 4.5415, + "step": 284 + }, + { + "epoch": 0.025798859418846746, + "grad_norm": 1.249987006187439, + "learning_rate": 0.00019982772665662083, + "loss": 4.5698, + "step": 285 + }, + { + "epoch": 0.025889381732597087, + "grad_norm": 0.9590091109275818, + "learning_rate": 0.00019982585975072937, + "loss": 4.567, + "step": 286 + }, + { + "epoch": 0.025979904046347424, + "grad_norm": 1.1153589487075806, + "learning_rate": 0.00019982398279237655, + "loss": 4.4925, + "step": 287 + }, + { + "epoch": 0.026070426360097765, + "grad_norm": 0.9221207499504089, + "learning_rate": 0.00019982209578175137, + "loss": 4.5124, + "step": 288 + }, + { + "epoch": 0.026160948673848102, + "grad_norm": 0.9073323607444763, + "learning_rate": 0.00019982019871904393, + "loss": 4.5402, + "step": 289 + }, + { + "epoch": 0.026251470987598443, + "grad_norm": 0.9508566856384277, + "learning_rate": 0.00019981829160444514, + "loss": 4.5094, + "step": 290 + }, + { + "epoch": 0.026341993301348784, + "grad_norm": 0.963015615940094, + "learning_rate": 0.00019981637443814717, + "loss": 4.5076, + "step": 291 + }, + { + "epoch": 0.02643251561509912, + "grad_norm": 1.0233488082885742, + "learning_rate": 0.000199814447220343, + "loss": 4.4913, + "step": 292 + }, + { + "epoch": 0.026523037928849462, + "grad_norm": 0.9522859454154968, + "learning_rate": 0.00019981250995122673, + "loss": 4.4579, + "step": 293 + }, + { + "epoch": 0.0266135602425998, + "grad_norm": 0.8754635453224182, + "learning_rate": 0.00019981056263099344, + "loss": 4.4501, + "step": 294 + }, + { + "epoch": 0.02670408255635014, + "grad_norm": 0.9538990259170532, + "learning_rate": 0.00019980860525983923, + "loss": 4.47, + "step": 295 + }, + { + "epoch": 0.02679460487010048, + "grad_norm": 0.9469854235649109, + "learning_rate": 0.00019980663783796118, + "loss": 4.4804, + "step": 296 + }, + { + "epoch": 0.026885127183850818, + "grad_norm": 0.8881922364234924, + "learning_rate": 0.00019980466036555744, + "loss": 4.437, + "step": 297 + }, + { + "epoch": 0.02697564949760116, + "grad_norm": 0.9970735311508179, + "learning_rate": 0.00019980267284282717, + "loss": 4.5339, + "step": 298 + }, + { + "epoch": 0.0270661718113515, + "grad_norm": 1.0509942770004272, + "learning_rate": 0.00019980067526997045, + "loss": 4.5365, + "step": 299 + }, + { + "epoch": 0.027156694125101837, + "grad_norm": 0.9530550241470337, + "learning_rate": 0.00019979866764718843, + "loss": 4.4565, + "step": 300 + }, + { + "epoch": 0.027247216438852178, + "grad_norm": 1.178865909576416, + "learning_rate": 0.00019979664997468337, + "loss": 4.5071, + "step": 301 + }, + { + "epoch": 0.027337738752602515, + "grad_norm": 0.932168185710907, + "learning_rate": 0.00019979462225265833, + "loss": 4.4988, + "step": 302 + }, + { + "epoch": 0.027428261066352856, + "grad_norm": 0.9907435774803162, + "learning_rate": 0.00019979258448131758, + "loss": 4.4837, + "step": 303 + }, + { + "epoch": 0.027518783380103197, + "grad_norm": 0.9297950267791748, + "learning_rate": 0.00019979053666086634, + "loss": 4.4823, + "step": 304 + }, + { + "epoch": 0.027609305693853534, + "grad_norm": 0.9497891664505005, + "learning_rate": 0.00019978847879151076, + "loss": 4.4843, + "step": 305 + }, + { + "epoch": 0.027699828007603875, + "grad_norm": 0.9658074975013733, + "learning_rate": 0.00019978641087345806, + "loss": 4.3628, + "step": 306 + }, + { + "epoch": 0.027790350321354212, + "grad_norm": 0.9158378839492798, + "learning_rate": 0.00019978433290691655, + "loss": 4.4123, + "step": 307 + }, + { + "epoch": 0.027880872635104553, + "grad_norm": 1.1265052556991577, + "learning_rate": 0.00019978224489209545, + "loss": 4.471, + "step": 308 + }, + { + "epoch": 0.027971394948854894, + "grad_norm": 0.8981154561042786, + "learning_rate": 0.000199780146829205, + "loss": 4.4544, + "step": 309 + }, + { + "epoch": 0.02806191726260523, + "grad_norm": 0.9838968515396118, + "learning_rate": 0.0001997780387184565, + "loss": 4.5014, + "step": 310 + }, + { + "epoch": 0.028152439576355572, + "grad_norm": 0.991176962852478, + "learning_rate": 0.00019977592056006228, + "loss": 4.3644, + "step": 311 + }, + { + "epoch": 0.02824296189010591, + "grad_norm": 0.986817479133606, + "learning_rate": 0.00019977379235423551, + "loss": 4.4363, + "step": 312 + }, + { + "epoch": 0.02833348420385625, + "grad_norm": 1.0609725713729858, + "learning_rate": 0.00019977165410119062, + "loss": 4.4627, + "step": 313 + }, + { + "epoch": 0.02842400651760659, + "grad_norm": 1.1476916074752808, + "learning_rate": 0.0001997695058011429, + "loss": 4.4855, + "step": 314 + }, + { + "epoch": 0.02851452883135693, + "grad_norm": 1.1370539665222168, + "learning_rate": 0.00019976734745430868, + "loss": 4.4234, + "step": 315 + }, + { + "epoch": 0.02860505114510727, + "grad_norm": 1.0598125457763672, + "learning_rate": 0.00019976517906090529, + "loss": 4.4257, + "step": 316 + }, + { + "epoch": 0.028695573458857607, + "grad_norm": 1.014866590499878, + "learning_rate": 0.0001997630006211511, + "loss": 4.3799, + "step": 317 + }, + { + "epoch": 0.028786095772607947, + "grad_norm": 0.9023067951202393, + "learning_rate": 0.00019976081213526545, + "loss": 4.4157, + "step": 318 + }, + { + "epoch": 0.02887661808635829, + "grad_norm": 1.0376299619674683, + "learning_rate": 0.00019975861360346876, + "loss": 4.403, + "step": 319 + }, + { + "epoch": 0.028967140400108626, + "grad_norm": 1.077791452407837, + "learning_rate": 0.00019975640502598244, + "loss": 4.3543, + "step": 320 + }, + { + "epoch": 0.029057662713858966, + "grad_norm": 0.9885241389274597, + "learning_rate": 0.00019975418640302882, + "loss": 4.3259, + "step": 321 + }, + { + "epoch": 0.029148185027609307, + "grad_norm": 1.0007898807525635, + "learning_rate": 0.0001997519577348314, + "loss": 4.4491, + "step": 322 + }, + { + "epoch": 0.029238707341359645, + "grad_norm": 0.9583142995834351, + "learning_rate": 0.00019974971902161453, + "loss": 4.4285, + "step": 323 + }, + { + "epoch": 0.029329229655109985, + "grad_norm": 1.1063108444213867, + "learning_rate": 0.00019974747026360373, + "loss": 4.3511, + "step": 324 + }, + { + "epoch": 0.029419751968860323, + "grad_norm": 1.0260143280029297, + "learning_rate": 0.00019974521146102537, + "loss": 4.3798, + "step": 325 + }, + { + "epoch": 0.029510274282610664, + "grad_norm": 0.9445636868476868, + "learning_rate": 0.00019974294261410695, + "loss": 4.4577, + "step": 326 + }, + { + "epoch": 0.029600796596361004, + "grad_norm": 0.9906353950500488, + "learning_rate": 0.00019974066372307694, + "loss": 4.427, + "step": 327 + }, + { + "epoch": 0.02969131891011134, + "grad_norm": 1.0205063819885254, + "learning_rate": 0.0001997383747881648, + "loss": 4.4002, + "step": 328 + }, + { + "epoch": 0.029781841223861683, + "grad_norm": 1.0043867826461792, + "learning_rate": 0.00019973607580960108, + "loss": 4.3951, + "step": 329 + }, + { + "epoch": 0.02987236353761202, + "grad_norm": 0.9098749160766602, + "learning_rate": 0.00019973376678761724, + "loss": 4.3715, + "step": 330 + }, + { + "epoch": 0.02996288585136236, + "grad_norm": 1.0196176767349243, + "learning_rate": 0.00019973144772244582, + "loss": 4.5046, + "step": 331 + }, + { + "epoch": 0.0300534081651127, + "grad_norm": 0.970935583114624, + "learning_rate": 0.00019972911861432032, + "loss": 4.4171, + "step": 332 + }, + { + "epoch": 0.03014393047886304, + "grad_norm": 1.073278546333313, + "learning_rate": 0.00019972677946347535, + "loss": 4.4271, + "step": 333 + }, + { + "epoch": 0.03023445279261338, + "grad_norm": 0.9405482411384583, + "learning_rate": 0.0001997244302701464, + "loss": 4.3853, + "step": 334 + }, + { + "epoch": 0.030324975106363717, + "grad_norm": 1.1856549978256226, + "learning_rate": 0.00019972207103457001, + "loss": 4.4458, + "step": 335 + }, + { + "epoch": 0.030415497420114058, + "grad_norm": 1.0638178586959839, + "learning_rate": 0.00019971970175698385, + "loss": 4.4305, + "step": 336 + }, + { + "epoch": 0.0305060197338644, + "grad_norm": 0.9577354788780212, + "learning_rate": 0.00019971732243762643, + "loss": 4.3614, + "step": 337 + }, + { + "epoch": 0.030596542047614736, + "grad_norm": 0.867456316947937, + "learning_rate": 0.00019971493307673735, + "loss": 4.4327, + "step": 338 + }, + { + "epoch": 0.030687064361365077, + "grad_norm": 0.9249234199523926, + "learning_rate": 0.00019971253367455727, + "loss": 4.3964, + "step": 339 + }, + { + "epoch": 0.030777586675115418, + "grad_norm": 0.9764379262924194, + "learning_rate": 0.00019971012423132775, + "loss": 4.3969, + "step": 340 + }, + { + "epoch": 0.030868108988865755, + "grad_norm": 0.9793526530265808, + "learning_rate": 0.00019970770474729145, + "loss": 4.3077, + "step": 341 + }, + { + "epoch": 0.030958631302616096, + "grad_norm": 0.996056318283081, + "learning_rate": 0.00019970527522269205, + "loss": 4.3773, + "step": 342 + }, + { + "epoch": 0.031049153616366433, + "grad_norm": 0.9756794571876526, + "learning_rate": 0.0001997028356577741, + "loss": 4.3676, + "step": 343 + }, + { + "epoch": 0.031139675930116774, + "grad_norm": 0.9667348265647888, + "learning_rate": 0.00019970038605278338, + "loss": 4.3651, + "step": 344 + }, + { + "epoch": 0.031230198243867115, + "grad_norm": 1.0161426067352295, + "learning_rate": 0.0001996979264079665, + "loss": 4.4087, + "step": 345 + }, + { + "epoch": 0.031320720557617455, + "grad_norm": 0.9502101540565491, + "learning_rate": 0.00019969545672357116, + "loss": 4.3836, + "step": 346 + }, + { + "epoch": 0.03141124287136779, + "grad_norm": 1.0001097917556763, + "learning_rate": 0.00019969297699984603, + "loss": 4.3798, + "step": 347 + }, + { + "epoch": 0.03150176518511813, + "grad_norm": 0.9770258069038391, + "learning_rate": 0.00019969048723704087, + "loss": 4.4348, + "step": 348 + }, + { + "epoch": 0.03159228749886847, + "grad_norm": 0.9103361964225769, + "learning_rate": 0.00019968798743540635, + "loss": 4.4196, + "step": 349 + }, + { + "epoch": 0.03168280981261881, + "grad_norm": 1.0442837476730347, + "learning_rate": 0.00019968547759519425, + "loss": 4.3155, + "step": 350 + }, + { + "epoch": 0.03177333212636915, + "grad_norm": 0.9110909700393677, + "learning_rate": 0.00019968295771665727, + "loss": 4.2968, + "step": 351 + }, + { + "epoch": 0.031863854440119486, + "grad_norm": 1.1041204929351807, + "learning_rate": 0.00019968042780004917, + "loss": 4.3275, + "step": 352 + }, + { + "epoch": 0.03195437675386983, + "grad_norm": 1.1094104051589966, + "learning_rate": 0.00019967788784562473, + "loss": 4.3965, + "step": 353 + }, + { + "epoch": 0.03204489906762017, + "grad_norm": 1.0326623916625977, + "learning_rate": 0.0001996753378536397, + "loss": 4.3922, + "step": 354 + }, + { + "epoch": 0.03213542138137051, + "grad_norm": 1.1457520723342896, + "learning_rate": 0.00019967277782435088, + "loss": 4.3337, + "step": 355 + }, + { + "epoch": 0.03222594369512085, + "grad_norm": 1.0908020734786987, + "learning_rate": 0.00019967020775801605, + "loss": 4.3584, + "step": 356 + }, + { + "epoch": 0.032316466008871184, + "grad_norm": 1.1035650968551636, + "learning_rate": 0.00019966762765489407, + "loss": 4.3424, + "step": 357 + }, + { + "epoch": 0.032406988322621524, + "grad_norm": 0.9557312726974487, + "learning_rate": 0.00019966503751524465, + "loss": 4.3228, + "step": 358 + }, + { + "epoch": 0.032497510636371865, + "grad_norm": 1.026123046875, + "learning_rate": 0.00019966243733932873, + "loss": 4.3184, + "step": 359 + }, + { + "epoch": 0.032588032950122206, + "grad_norm": 0.9995486736297607, + "learning_rate": 0.00019965982712740808, + "loss": 4.3935, + "step": 360 + }, + { + "epoch": 0.03267855526387255, + "grad_norm": 1.0720714330673218, + "learning_rate": 0.00019965720687974554, + "loss": 4.3373, + "step": 361 + }, + { + "epoch": 0.03276907757762288, + "grad_norm": 0.9495123028755188, + "learning_rate": 0.00019965457659660503, + "loss": 4.3709, + "step": 362 + }, + { + "epoch": 0.03285959989137322, + "grad_norm": 0.9384881854057312, + "learning_rate": 0.00019965193627825137, + "loss": 4.3316, + "step": 363 + }, + { + "epoch": 0.03295012220512356, + "grad_norm": 0.9817916750907898, + "learning_rate": 0.00019964928592495045, + "loss": 4.3621, + "step": 364 + }, + { + "epoch": 0.0330406445188739, + "grad_norm": 0.9114745855331421, + "learning_rate": 0.00019964662553696917, + "loss": 4.3202, + "step": 365 + }, + { + "epoch": 0.033131166832624244, + "grad_norm": 0.9784656763076782, + "learning_rate": 0.0001996439551145754, + "loss": 4.3443, + "step": 366 + }, + { + "epoch": 0.033221689146374585, + "grad_norm": 1.0070286989212036, + "learning_rate": 0.00019964127465803812, + "loss": 4.3719, + "step": 367 + }, + { + "epoch": 0.03331221146012492, + "grad_norm": 0.9459866285324097, + "learning_rate": 0.00019963858416762717, + "loss": 4.3708, + "step": 368 + }, + { + "epoch": 0.03340273377387526, + "grad_norm": 0.937214732170105, + "learning_rate": 0.00019963588364361353, + "loss": 4.353, + "step": 369 + }, + { + "epoch": 0.0334932560876256, + "grad_norm": 0.8671116828918457, + "learning_rate": 0.00019963317308626914, + "loss": 4.3555, + "step": 370 + }, + { + "epoch": 0.03358377840137594, + "grad_norm": 0.9910116791725159, + "learning_rate": 0.00019963045249586693, + "loss": 4.375, + "step": 371 + }, + { + "epoch": 0.03367430071512628, + "grad_norm": 0.9444873332977295, + "learning_rate": 0.00019962772187268093, + "loss": 4.3282, + "step": 372 + }, + { + "epoch": 0.033764823028876616, + "grad_norm": 0.8867988586425781, + "learning_rate": 0.000199624981216986, + "loss": 4.3491, + "step": 373 + }, + { + "epoch": 0.03385534534262696, + "grad_norm": 1.096407413482666, + "learning_rate": 0.0001996222305290582, + "loss": 4.3438, + "step": 374 + }, + { + "epoch": 0.0339458676563773, + "grad_norm": 0.9576050043106079, + "learning_rate": 0.00019961946980917456, + "loss": 4.3186, + "step": 375 + }, + { + "epoch": 0.03403638997012764, + "grad_norm": 1.0348565578460693, + "learning_rate": 0.00019961669905761302, + "loss": 4.2565, + "step": 376 + }, + { + "epoch": 0.03412691228387798, + "grad_norm": 0.8637972474098206, + "learning_rate": 0.0001996139182746526, + "loss": 4.4194, + "step": 377 + }, + { + "epoch": 0.03421743459762831, + "grad_norm": 0.9136695265769958, + "learning_rate": 0.0001996111274605733, + "loss": 4.3234, + "step": 378 + }, + { + "epoch": 0.034307956911378654, + "grad_norm": 1.0527623891830444, + "learning_rate": 0.00019960832661565622, + "loss": 4.3159, + "step": 379 + }, + { + "epoch": 0.034398479225128994, + "grad_norm": 0.9086984992027283, + "learning_rate": 0.0001996055157401834, + "loss": 4.3534, + "step": 380 + }, + { + "epoch": 0.034489001538879335, + "grad_norm": 0.9098812341690063, + "learning_rate": 0.00019960269483443782, + "loss": 4.3152, + "step": 381 + }, + { + "epoch": 0.034579523852629676, + "grad_norm": 0.9199385046958923, + "learning_rate": 0.00019959986389870364, + "loss": 4.313, + "step": 382 + }, + { + "epoch": 0.03467004616638001, + "grad_norm": 1.0625910758972168, + "learning_rate": 0.00019959702293326586, + "loss": 4.305, + "step": 383 + }, + { + "epoch": 0.03476056848013035, + "grad_norm": 1.0181784629821777, + "learning_rate": 0.00019959417193841063, + "loss": 4.3, + "step": 384 + }, + { + "epoch": 0.03485109079388069, + "grad_norm": 0.9671763181686401, + "learning_rate": 0.00019959131091442495, + "loss": 4.3007, + "step": 385 + }, + { + "epoch": 0.03494161310763103, + "grad_norm": 0.9672999978065491, + "learning_rate": 0.00019958843986159704, + "loss": 4.3513, + "step": 386 + }, + { + "epoch": 0.03503213542138137, + "grad_norm": 0.9663201570510864, + "learning_rate": 0.00019958555878021595, + "loss": 4.3138, + "step": 387 + }, + { + "epoch": 0.03512265773513171, + "grad_norm": 1.073101282119751, + "learning_rate": 0.0001995826676705718, + "loss": 4.3399, + "step": 388 + }, + { + "epoch": 0.03521318004888205, + "grad_norm": 0.956693708896637, + "learning_rate": 0.00019957976653295576, + "loss": 4.3208, + "step": 389 + }, + { + "epoch": 0.03530370236263239, + "grad_norm": 0.952547550201416, + "learning_rate": 0.00019957685536765995, + "loss": 4.3082, + "step": 390 + }, + { + "epoch": 0.03539422467638273, + "grad_norm": 0.9408294558525085, + "learning_rate": 0.0001995739341749775, + "loss": 4.3202, + "step": 391 + }, + { + "epoch": 0.03548474699013307, + "grad_norm": 1.056509017944336, + "learning_rate": 0.00019957100295520266, + "loss": 4.258, + "step": 392 + }, + { + "epoch": 0.035575269303883404, + "grad_norm": 1.1849064826965332, + "learning_rate": 0.00019956806170863049, + "loss": 4.2442, + "step": 393 + }, + { + "epoch": 0.035665791617633745, + "grad_norm": 0.8983389139175415, + "learning_rate": 0.00019956511043555728, + "loss": 4.2728, + "step": 394 + }, + { + "epoch": 0.035756313931384086, + "grad_norm": 1.0108262300491333, + "learning_rate": 0.00019956214913628015, + "loss": 4.249, + "step": 395 + }, + { + "epoch": 0.03584683624513443, + "grad_norm": 0.9574976563453674, + "learning_rate": 0.0001995591778110973, + "loss": 4.2356, + "step": 396 + }, + { + "epoch": 0.03593735855888477, + "grad_norm": 1.0257916450500488, + "learning_rate": 0.00019955619646030802, + "loss": 4.2957, + "step": 397 + }, + { + "epoch": 0.0360278808726351, + "grad_norm": 1.0345457792282104, + "learning_rate": 0.00019955320508421243, + "loss": 4.2462, + "step": 398 + }, + { + "epoch": 0.03611840318638544, + "grad_norm": 0.9709088802337646, + "learning_rate": 0.00019955020368311183, + "loss": 4.2942, + "step": 399 + }, + { + "epoch": 0.03620892550013578, + "grad_norm": 0.9114799499511719, + "learning_rate": 0.00019954719225730847, + "loss": 4.3045, + "step": 400 + }, + { + "epoch": 0.036299447813886124, + "grad_norm": 1.0316181182861328, + "learning_rate": 0.00019954417080710556, + "loss": 4.3309, + "step": 401 + }, + { + "epoch": 0.036389970127636465, + "grad_norm": 0.9053332209587097, + "learning_rate": 0.00019954113933280736, + "loss": 4.2553, + "step": 402 + }, + { + "epoch": 0.0364804924413868, + "grad_norm": 0.9040462970733643, + "learning_rate": 0.00019953809783471917, + "loss": 4.2645, + "step": 403 + }, + { + "epoch": 0.03657101475513714, + "grad_norm": 0.8616097569465637, + "learning_rate": 0.0001995350463131472, + "loss": 4.2518, + "step": 404 + }, + { + "epoch": 0.03666153706888748, + "grad_norm": 0.8178855180740356, + "learning_rate": 0.00019953198476839888, + "loss": 4.2194, + "step": 405 + }, + { + "epoch": 0.03675205938263782, + "grad_norm": 0.8645557761192322, + "learning_rate": 0.00019952891320078236, + "loss": 4.1956, + "step": 406 + }, + { + "epoch": 0.03684258169638816, + "grad_norm": 0.9388260841369629, + "learning_rate": 0.000199525831610607, + "loss": 4.2618, + "step": 407 + }, + { + "epoch": 0.0369331040101385, + "grad_norm": 0.992255687713623, + "learning_rate": 0.0001995227399981831, + "loss": 4.2326, + "step": 408 + }, + { + "epoch": 0.037023626323888836, + "grad_norm": 0.964103639125824, + "learning_rate": 0.00019951963836382206, + "loss": 4.3077, + "step": 409 + }, + { + "epoch": 0.03711414863763918, + "grad_norm": 0.9777807593345642, + "learning_rate": 0.00019951652670783615, + "loss": 4.2155, + "step": 410 + }, + { + "epoch": 0.03720467095138952, + "grad_norm": 0.9376230835914612, + "learning_rate": 0.00019951340503053872, + "loss": 4.2368, + "step": 411 + }, + { + "epoch": 0.03729519326513986, + "grad_norm": 1.0110722780227661, + "learning_rate": 0.0001995102733322441, + "loss": 4.2638, + "step": 412 + }, + { + "epoch": 0.0373857155788902, + "grad_norm": 1.151977300643921, + "learning_rate": 0.00019950713161326767, + "loss": 4.2924, + "step": 413 + }, + { + "epoch": 0.03747623789264053, + "grad_norm": 0.9603147506713867, + "learning_rate": 0.00019950397987392586, + "loss": 4.2671, + "step": 414 + }, + { + "epoch": 0.037566760206390874, + "grad_norm": 1.1345927715301514, + "learning_rate": 0.00019950081811453597, + "loss": 4.2101, + "step": 415 + }, + { + "epoch": 0.037657282520141215, + "grad_norm": 1.1134638786315918, + "learning_rate": 0.00019949764633541643, + "loss": 4.282, + "step": 416 + }, + { + "epoch": 0.037747804833891556, + "grad_norm": 1.0932708978652954, + "learning_rate": 0.00019949446453688662, + "loss": 4.3003, + "step": 417 + }, + { + "epoch": 0.0378383271476419, + "grad_norm": 0.9736213088035583, + "learning_rate": 0.00019949127271926695, + "loss": 4.328, + "step": 418 + }, + { + "epoch": 0.03792884946139223, + "grad_norm": 1.0969462394714355, + "learning_rate": 0.00019948807088287883, + "loss": 4.2475, + "step": 419 + }, + { + "epoch": 0.03801937177514257, + "grad_norm": 1.046892523765564, + "learning_rate": 0.0001994848590280447, + "loss": 4.1855, + "step": 420 + }, + { + "epoch": 0.03810989408889291, + "grad_norm": 1.1222968101501465, + "learning_rate": 0.00019948163715508802, + "loss": 4.2414, + "step": 421 + }, + { + "epoch": 0.03820041640264325, + "grad_norm": 1.0552254915237427, + "learning_rate": 0.00019947840526433314, + "loss": 4.2269, + "step": 422 + }, + { + "epoch": 0.038290938716393594, + "grad_norm": 1.12909996509552, + "learning_rate": 0.00019947516335610562, + "loss": 4.2008, + "step": 423 + }, + { + "epoch": 0.03838146103014393, + "grad_norm": 1.0824707746505737, + "learning_rate": 0.00019947191143073186, + "loss": 4.2587, + "step": 424 + }, + { + "epoch": 0.03847198334389427, + "grad_norm": 1.0686942338943481, + "learning_rate": 0.00019946864948853935, + "loss": 4.2781, + "step": 425 + }, + { + "epoch": 0.03856250565764461, + "grad_norm": 0.9873172640800476, + "learning_rate": 0.0001994653775298565, + "loss": 4.2174, + "step": 426 + }, + { + "epoch": 0.03865302797139495, + "grad_norm": 0.9392846822738647, + "learning_rate": 0.00019946209555501293, + "loss": 4.2374, + "step": 427 + }, + { + "epoch": 0.03874355028514529, + "grad_norm": 1.0531303882598877, + "learning_rate": 0.00019945880356433903, + "loss": 4.2903, + "step": 428 + }, + { + "epoch": 0.038834072598895625, + "grad_norm": 0.963618814945221, + "learning_rate": 0.00019945550155816633, + "loss": 4.2779, + "step": 429 + }, + { + "epoch": 0.038924594912645966, + "grad_norm": 0.9659363031387329, + "learning_rate": 0.00019945218953682734, + "loss": 4.1951, + "step": 430 + }, + { + "epoch": 0.039015117226396306, + "grad_norm": 0.9671744704246521, + "learning_rate": 0.00019944886750065558, + "loss": 4.2058, + "step": 431 + }, + { + "epoch": 0.03910563954014665, + "grad_norm": 1.0616800785064697, + "learning_rate": 0.00019944553544998562, + "loss": 4.1887, + "step": 432 + }, + { + "epoch": 0.03919616185389699, + "grad_norm": 0.9300423264503479, + "learning_rate": 0.00019944219338515292, + "loss": 4.1821, + "step": 433 + }, + { + "epoch": 0.03928668416764732, + "grad_norm": 1.0894814729690552, + "learning_rate": 0.00019943884130649409, + "loss": 4.2175, + "step": 434 + }, + { + "epoch": 0.03937720648139766, + "grad_norm": 0.9679089188575745, + "learning_rate": 0.00019943547921434665, + "loss": 4.2315, + "step": 435 + }, + { + "epoch": 0.039467728795148004, + "grad_norm": 1.0963354110717773, + "learning_rate": 0.00019943210710904918, + "loss": 4.2163, + "step": 436 + }, + { + "epoch": 0.039558251108898344, + "grad_norm": 1.0342007875442505, + "learning_rate": 0.00019942872499094123, + "loss": 4.1901, + "step": 437 + }, + { + "epoch": 0.039648773422648685, + "grad_norm": 0.9861835837364197, + "learning_rate": 0.0001994253328603634, + "loss": 4.183, + "step": 438 + }, + { + "epoch": 0.03973929573639902, + "grad_norm": 1.081486701965332, + "learning_rate": 0.0001994219307176573, + "loss": 4.1647, + "step": 439 + }, + { + "epoch": 0.03982981805014936, + "grad_norm": 1.042413592338562, + "learning_rate": 0.00019941851856316548, + "loss": 4.1848, + "step": 440 + }, + { + "epoch": 0.0399203403638997, + "grad_norm": 0.9466920495033264, + "learning_rate": 0.00019941509639723155, + "loss": 4.1853, + "step": 441 + }, + { + "epoch": 0.04001086267765004, + "grad_norm": 1.0339183807373047, + "learning_rate": 0.00019941166422020014, + "loss": 4.271, + "step": 442 + }, + { + "epoch": 0.04010138499140038, + "grad_norm": 1.083583116531372, + "learning_rate": 0.00019940822203241685, + "loss": 4.2329, + "step": 443 + }, + { + "epoch": 0.04019190730515072, + "grad_norm": 0.9255849123001099, + "learning_rate": 0.00019940476983422832, + "loss": 4.1842, + "step": 444 + }, + { + "epoch": 0.04028242961890106, + "grad_norm": 1.0079110860824585, + "learning_rate": 0.00019940130762598223, + "loss": 4.2266, + "step": 445 + }, + { + "epoch": 0.0403729519326514, + "grad_norm": 1.0702378749847412, + "learning_rate": 0.00019939783540802715, + "loss": 4.2801, + "step": 446 + }, + { + "epoch": 0.04046347424640174, + "grad_norm": 1.0222079753875732, + "learning_rate": 0.00019939435318071277, + "loss": 4.1854, + "step": 447 + }, + { + "epoch": 0.04055399656015208, + "grad_norm": 1.028741478919983, + "learning_rate": 0.00019939086094438975, + "loss": 4.2361, + "step": 448 + }, + { + "epoch": 0.04064451887390242, + "grad_norm": 1.023998498916626, + "learning_rate": 0.00019938735869940973, + "loss": 4.2604, + "step": 449 + }, + { + "epoch": 0.040735041187652754, + "grad_norm": 0.9994665384292603, + "learning_rate": 0.00019938384644612543, + "loss": 4.2244, + "step": 450 + }, + { + "epoch": 0.040825563501403095, + "grad_norm": 0.9782562851905823, + "learning_rate": 0.0001993803241848905, + "loss": 4.1538, + "step": 451 + }, + { + "epoch": 0.040916085815153436, + "grad_norm": 0.9541926383972168, + "learning_rate": 0.00019937679191605963, + "loss": 4.2414, + "step": 452 + }, + { + "epoch": 0.041006608128903776, + "grad_norm": 0.9733009934425354, + "learning_rate": 0.0001993732496399886, + "loss": 4.1626, + "step": 453 + }, + { + "epoch": 0.04109713044265412, + "grad_norm": 1.0683079957962036, + "learning_rate": 0.00019936969735703396, + "loss": 4.1847, + "step": 454 + }, + { + "epoch": 0.04118765275640445, + "grad_norm": 0.8915784955024719, + "learning_rate": 0.00019936613506755358, + "loss": 4.2098, + "step": 455 + }, + { + "epoch": 0.04127817507015479, + "grad_norm": 0.8858823776245117, + "learning_rate": 0.00019936256277190608, + "loss": 4.1784, + "step": 456 + }, + { + "epoch": 0.04136869738390513, + "grad_norm": 0.9432275891304016, + "learning_rate": 0.00019935898047045126, + "loss": 4.2451, + "step": 457 + }, + { + "epoch": 0.041459219697655474, + "grad_norm": 0.9765615463256836, + "learning_rate": 0.0001993553881635498, + "loss": 4.1778, + "step": 458 + }, + { + "epoch": 0.041549742011405814, + "grad_norm": 0.993130624294281, + "learning_rate": 0.00019935178585156347, + "loss": 4.1452, + "step": 459 + }, + { + "epoch": 0.04164026432515615, + "grad_norm": 0.9554489850997925, + "learning_rate": 0.00019934817353485501, + "loss": 4.2302, + "step": 460 + }, + { + "epoch": 0.04173078663890649, + "grad_norm": 0.972508430480957, + "learning_rate": 0.00019934455121378824, + "loss": 4.211, + "step": 461 + }, + { + "epoch": 0.04182130895265683, + "grad_norm": 0.9439429640769958, + "learning_rate": 0.00019934091888872786, + "loss": 4.1795, + "step": 462 + }, + { + "epoch": 0.04191183126640717, + "grad_norm": 1.0175976753234863, + "learning_rate": 0.00019933727656003963, + "loss": 4.1867, + "step": 463 + }, + { + "epoch": 0.04200235358015751, + "grad_norm": 0.9793443083763123, + "learning_rate": 0.0001993336242280904, + "loss": 4.1704, + "step": 464 + }, + { + "epoch": 0.042092875893907845, + "grad_norm": 1.0228900909423828, + "learning_rate": 0.00019932996189324796, + "loss": 4.1865, + "step": 465 + }, + { + "epoch": 0.042183398207658186, + "grad_norm": 0.9703587889671326, + "learning_rate": 0.00019932628955588103, + "loss": 4.1667, + "step": 466 + }, + { + "epoch": 0.04227392052140853, + "grad_norm": 1.0352518558502197, + "learning_rate": 0.00019932260721635946, + "loss": 4.1628, + "step": 467 + }, + { + "epoch": 0.04236444283515887, + "grad_norm": 0.9684962034225464, + "learning_rate": 0.0001993189148750541, + "loss": 4.1845, + "step": 468 + }, + { + "epoch": 0.04245496514890921, + "grad_norm": 1.0994378328323364, + "learning_rate": 0.00019931521253233669, + "loss": 4.2044, + "step": 469 + }, + { + "epoch": 0.04254548746265954, + "grad_norm": 0.972074568271637, + "learning_rate": 0.00019931150018858012, + "loss": 4.1366, + "step": 470 + }, + { + "epoch": 0.04263600977640988, + "grad_norm": 1.0492547750473022, + "learning_rate": 0.0001993077778441582, + "loss": 4.1926, + "step": 471 + }, + { + "epoch": 0.042726532090160224, + "grad_norm": 0.9738761186599731, + "learning_rate": 0.00019930404549944574, + "loss": 4.1006, + "step": 472 + }, + { + "epoch": 0.042817054403910565, + "grad_norm": 1.095558524131775, + "learning_rate": 0.00019930030315481862, + "loss": 4.1846, + "step": 473 + }, + { + "epoch": 0.042907576717660906, + "grad_norm": 1.0250678062438965, + "learning_rate": 0.0001992965508106537, + "loss": 4.1583, + "step": 474 + }, + { + "epoch": 0.04299809903141124, + "grad_norm": 1.0323116779327393, + "learning_rate": 0.00019929278846732884, + "loss": 4.1364, + "step": 475 + }, + { + "epoch": 0.04308862134516158, + "grad_norm": 0.9422488808631897, + "learning_rate": 0.00019928901612522288, + "loss": 4.1598, + "step": 476 + }, + { + "epoch": 0.04317914365891192, + "grad_norm": 1.027329683303833, + "learning_rate": 0.00019928523378471573, + "loss": 4.1284, + "step": 477 + }, + { + "epoch": 0.04326966597266226, + "grad_norm": 1.0527349710464478, + "learning_rate": 0.00019928144144618824, + "loss": 4.1378, + "step": 478 + }, + { + "epoch": 0.0433601882864126, + "grad_norm": 1.0269211530685425, + "learning_rate": 0.00019927763911002232, + "loss": 4.1888, + "step": 479 + }, + { + "epoch": 0.04345071060016294, + "grad_norm": 1.0055499076843262, + "learning_rate": 0.00019927382677660088, + "loss": 4.1982, + "step": 480 + }, + { + "epoch": 0.04354123291391328, + "grad_norm": 1.147194504737854, + "learning_rate": 0.00019927000444630775, + "loss": 4.1557, + "step": 481 + }, + { + "epoch": 0.04363175522766362, + "grad_norm": 1.041730284690857, + "learning_rate": 0.0001992661721195279, + "loss": 4.1445, + "step": 482 + }, + { + "epoch": 0.04372227754141396, + "grad_norm": 1.1168657541275024, + "learning_rate": 0.00019926232979664728, + "loss": 4.1335, + "step": 483 + }, + { + "epoch": 0.0438127998551643, + "grad_norm": 0.983162522315979, + "learning_rate": 0.00019925847747805274, + "loss": 4.2185, + "step": 484 + }, + { + "epoch": 0.04390332216891464, + "grad_norm": 0.9528363347053528, + "learning_rate": 0.00019925461516413223, + "loss": 4.1842, + "step": 485 + }, + { + "epoch": 0.043993844482664975, + "grad_norm": 1.0722746849060059, + "learning_rate": 0.00019925074285527468, + "loss": 4.1321, + "step": 486 + }, + { + "epoch": 0.044084366796415315, + "grad_norm": 1.1250028610229492, + "learning_rate": 0.00019924686055187003, + "loss": 4.075, + "step": 487 + }, + { + "epoch": 0.044174889110165656, + "grad_norm": 1.0873173475265503, + "learning_rate": 0.00019924296825430925, + "loss": 4.1771, + "step": 488 + }, + { + "epoch": 0.044265411423916, + "grad_norm": 1.05666983127594, + "learning_rate": 0.0001992390659629843, + "loss": 4.1521, + "step": 489 + }, + { + "epoch": 0.04435593373766634, + "grad_norm": 0.9849108457565308, + "learning_rate": 0.0001992351536782881, + "loss": 4.1375, + "step": 490 + }, + { + "epoch": 0.04444645605141667, + "grad_norm": 1.069494605064392, + "learning_rate": 0.00019923123140061467, + "loss": 4.1763, + "step": 491 + }, + { + "epoch": 0.04453697836516701, + "grad_norm": 0.9202231168746948, + "learning_rate": 0.00019922729913035895, + "loss": 4.1439, + "step": 492 + }, + { + "epoch": 0.04462750067891735, + "grad_norm": 0.9625434279441833, + "learning_rate": 0.0001992233568679169, + "loss": 4.1377, + "step": 493 + }, + { + "epoch": 0.044718022992667694, + "grad_norm": 0.9912998676300049, + "learning_rate": 0.0001992194046136855, + "loss": 4.1928, + "step": 494 + }, + { + "epoch": 0.044808545306418035, + "grad_norm": 0.9711489081382751, + "learning_rate": 0.00019921544236806282, + "loss": 4.1072, + "step": 495 + }, + { + "epoch": 0.04489906762016837, + "grad_norm": 0.9541346430778503, + "learning_rate": 0.0001992114701314478, + "loss": 4.1439, + "step": 496 + }, + { + "epoch": 0.04498958993391871, + "grad_norm": 0.9553717374801636, + "learning_rate": 0.00019920748790424044, + "loss": 4.1336, + "step": 497 + }, + { + "epoch": 0.04508011224766905, + "grad_norm": 0.9162443280220032, + "learning_rate": 0.0001992034956868418, + "loss": 4.1435, + "step": 498 + }, + { + "epoch": 0.04517063456141939, + "grad_norm": 1.0411958694458008, + "learning_rate": 0.0001991994934796538, + "loss": 4.1484, + "step": 499 + }, + { + "epoch": 0.04526115687516973, + "grad_norm": 0.9627156257629395, + "learning_rate": 0.00019919548128307954, + "loss": 4.1246, + "step": 500 + }, + { + "epoch": 0.045351679188920066, + "grad_norm": 0.952808678150177, + "learning_rate": 0.00019919145909752303, + "loss": 4.0516, + "step": 501 + }, + { + "epoch": 0.04544220150267041, + "grad_norm": 0.9976837635040283, + "learning_rate": 0.00019918742692338933, + "loss": 4.093, + "step": 502 + }, + { + "epoch": 0.04553272381642075, + "grad_norm": 1.005296230316162, + "learning_rate": 0.0001991833847610844, + "loss": 4.1368, + "step": 503 + }, + { + "epoch": 0.04562324613017109, + "grad_norm": 0.9490670561790466, + "learning_rate": 0.0001991793326110154, + "loss": 4.1738, + "step": 504 + }, + { + "epoch": 0.04571376844392143, + "grad_norm": 1.0245599746704102, + "learning_rate": 0.00019917527047359028, + "loss": 4.1499, + "step": 505 + }, + { + "epoch": 0.04580429075767176, + "grad_norm": 1.0409306287765503, + "learning_rate": 0.00019917119834921815, + "loss": 4.1139, + "step": 506 + }, + { + "epoch": 0.045894813071422104, + "grad_norm": 1.1501810550689697, + "learning_rate": 0.00019916711623830903, + "loss": 4.1677, + "step": 507 + }, + { + "epoch": 0.045985335385172445, + "grad_norm": 0.9567729234695435, + "learning_rate": 0.00019916302414127408, + "loss": 4.1223, + "step": 508 + }, + { + "epoch": 0.046075857698922786, + "grad_norm": 0.9898899793624878, + "learning_rate": 0.00019915892205852527, + "loss": 4.085, + "step": 509 + }, + { + "epoch": 0.046166380012673126, + "grad_norm": 0.962177038192749, + "learning_rate": 0.00019915480999047573, + "loss": 4.0921, + "step": 510 + }, + { + "epoch": 0.04625690232642346, + "grad_norm": 1.188584566116333, + "learning_rate": 0.00019915068793753952, + "loss": 4.0904, + "step": 511 + }, + { + "epoch": 0.0463474246401738, + "grad_norm": 1.13594651222229, + "learning_rate": 0.00019914655590013176, + "loss": 4.1522, + "step": 512 + }, + { + "epoch": 0.04643794695392414, + "grad_norm": 1.0383785963058472, + "learning_rate": 0.00019914241387866856, + "loss": 4.1267, + "step": 513 + }, + { + "epoch": 0.04652846926767448, + "grad_norm": 1.0647755861282349, + "learning_rate": 0.00019913826187356696, + "loss": 4.1404, + "step": 514 + }, + { + "epoch": 0.04661899158142482, + "grad_norm": 0.9425531029701233, + "learning_rate": 0.00019913409988524513, + "loss": 4.1329, + "step": 515 + }, + { + "epoch": 0.04670951389517516, + "grad_norm": 1.001288890838623, + "learning_rate": 0.00019912992791412212, + "loss": 4.116, + "step": 516 + }, + { + "epoch": 0.0468000362089255, + "grad_norm": 1.1774042844772339, + "learning_rate": 0.0001991257459606181, + "loss": 4.1825, + "step": 517 + }, + { + "epoch": 0.04689055852267584, + "grad_norm": 1.0480451583862305, + "learning_rate": 0.00019912155402515417, + "loss": 4.0484, + "step": 518 + }, + { + "epoch": 0.04698108083642618, + "grad_norm": 1.034772515296936, + "learning_rate": 0.0001991173521081525, + "loss": 4.0266, + "step": 519 + }, + { + "epoch": 0.04707160315017652, + "grad_norm": 1.3288160562515259, + "learning_rate": 0.00019911314021003613, + "loss": 4.0988, + "step": 520 + }, + { + "epoch": 0.04716212546392686, + "grad_norm": 1.2328402996063232, + "learning_rate": 0.00019910891833122926, + "loss": 4.1125, + "step": 521 + }, + { + "epoch": 0.047252647777677195, + "grad_norm": 1.0993627309799194, + "learning_rate": 0.00019910468647215706, + "loss": 4.1599, + "step": 522 + }, + { + "epoch": 0.047343170091427536, + "grad_norm": 1.1606861352920532, + "learning_rate": 0.00019910044463324563, + "loss": 4.1601, + "step": 523 + }, + { + "epoch": 0.04743369240517788, + "grad_norm": 0.9817080497741699, + "learning_rate": 0.0001990961928149221, + "loss": 4.0921, + "step": 524 + }, + { + "epoch": 0.04752421471892822, + "grad_norm": 1.0352342128753662, + "learning_rate": 0.0001990919310176147, + "loss": 4.0273, + "step": 525 + }, + { + "epoch": 0.04761473703267856, + "grad_norm": 1.0226386785507202, + "learning_rate": 0.00019908765924175258, + "loss": 4.1009, + "step": 526 + }, + { + "epoch": 0.04770525934642889, + "grad_norm": 0.9978080987930298, + "learning_rate": 0.00019908337748776584, + "loss": 4.1242, + "step": 527 + }, + { + "epoch": 0.04779578166017923, + "grad_norm": 1.0166219472885132, + "learning_rate": 0.00019907908575608573, + "loss": 4.085, + "step": 528 + }, + { + "epoch": 0.047886303973929574, + "grad_norm": 1.0522500276565552, + "learning_rate": 0.00019907478404714436, + "loss": 4.1089, + "step": 529 + }, + { + "epoch": 0.047976826287679915, + "grad_norm": 0.9937741756439209, + "learning_rate": 0.00019907047236137498, + "loss": 4.1281, + "step": 530 + }, + { + "epoch": 0.048067348601430256, + "grad_norm": 0.9463094472885132, + "learning_rate": 0.00019906615069921173, + "loss": 4.0034, + "step": 531 + }, + { + "epoch": 0.04815787091518059, + "grad_norm": 1.009852647781372, + "learning_rate": 0.00019906181906108984, + "loss": 4.1578, + "step": 532 + }, + { + "epoch": 0.04824839322893093, + "grad_norm": 1.0535956621170044, + "learning_rate": 0.00019905747744744545, + "loss": 4.1425, + "step": 533 + }, + { + "epoch": 0.04833891554268127, + "grad_norm": 1.0467532873153687, + "learning_rate": 0.0001990531258587158, + "loss": 4.1166, + "step": 534 + }, + { + "epoch": 0.04842943785643161, + "grad_norm": 0.9849286079406738, + "learning_rate": 0.0001990487642953391, + "loss": 4.0509, + "step": 535 + }, + { + "epoch": 0.04851996017018195, + "grad_norm": 1.15632963180542, + "learning_rate": 0.00019904439275775452, + "loss": 4.1081, + "step": 536 + }, + { + "epoch": 0.04861048248393229, + "grad_norm": 1.0920475721359253, + "learning_rate": 0.0001990400112464023, + "loss": 4.0632, + "step": 537 + }, + { + "epoch": 0.04870100479768263, + "grad_norm": 1.0463149547576904, + "learning_rate": 0.00019903561976172368, + "loss": 4.0839, + "step": 538 + }, + { + "epoch": 0.04879152711143297, + "grad_norm": 1.1025164127349854, + "learning_rate": 0.00019903121830416084, + "loss": 4.0504, + "step": 539 + }, + { + "epoch": 0.04888204942518331, + "grad_norm": 1.0772355794906616, + "learning_rate": 0.00019902680687415705, + "loss": 4.125, + "step": 540 + }, + { + "epoch": 0.04897257173893365, + "grad_norm": 1.1654671430587769, + "learning_rate": 0.0001990223854721565, + "loss": 4.0562, + "step": 541 + }, + { + "epoch": 0.049063094052683984, + "grad_norm": 1.0062533617019653, + "learning_rate": 0.00019901795409860444, + "loss": 3.9895, + "step": 542 + }, + { + "epoch": 0.049153616366434325, + "grad_norm": 1.0412124395370483, + "learning_rate": 0.0001990135127539471, + "loss": 4.0952, + "step": 543 + }, + { + "epoch": 0.049244138680184665, + "grad_norm": 0.9877052903175354, + "learning_rate": 0.0001990090614386318, + "loss": 4.0693, + "step": 544 + }, + { + "epoch": 0.049334660993935006, + "grad_norm": 0.978701114654541, + "learning_rate": 0.00019900460015310665, + "loss": 3.9831, + "step": 545 + }, + { + "epoch": 0.04942518330768535, + "grad_norm": 1.1422040462493896, + "learning_rate": 0.00019900012889782098, + "loss": 4.1191, + "step": 546 + }, + { + "epoch": 0.04951570562143568, + "grad_norm": 1.0867984294891357, + "learning_rate": 0.00019899564767322507, + "loss": 4.1175, + "step": 547 + }, + { + "epoch": 0.04960622793518602, + "grad_norm": 1.0576975345611572, + "learning_rate": 0.00019899115647977015, + "loss": 4.0439, + "step": 548 + }, + { + "epoch": 0.04969675024893636, + "grad_norm": 1.073725700378418, + "learning_rate": 0.00019898665531790845, + "loss": 4.1013, + "step": 549 + }, + { + "epoch": 0.0497872725626867, + "grad_norm": 0.9969006180763245, + "learning_rate": 0.0001989821441880933, + "loss": 4.0446, + "step": 550 + }, + { + "epoch": 0.049877794876437044, + "grad_norm": 1.0461211204528809, + "learning_rate": 0.0001989776230907789, + "loss": 4.0519, + "step": 551 + }, + { + "epoch": 0.04996831719018738, + "grad_norm": 0.9178494215011597, + "learning_rate": 0.0001989730920264206, + "loss": 3.9729, + "step": 552 + }, + { + "epoch": 0.05005883950393772, + "grad_norm": 1.0089956521987915, + "learning_rate": 0.0001989685509954746, + "loss": 4.0438, + "step": 553 + }, + { + "epoch": 0.05014936181768806, + "grad_norm": 0.9153216481208801, + "learning_rate": 0.00019896399999839827, + "loss": 4.0706, + "step": 554 + }, + { + "epoch": 0.0502398841314384, + "grad_norm": 1.0593777894973755, + "learning_rate": 0.0001989594390356498, + "loss": 4.0807, + "step": 555 + }, + { + "epoch": 0.05033040644518874, + "grad_norm": 1.0582592487335205, + "learning_rate": 0.00019895486810768856, + "loss": 3.9998, + "step": 556 + }, + { + "epoch": 0.050420928758939075, + "grad_norm": 0.9722828269004822, + "learning_rate": 0.00019895028721497482, + "loss": 4.0521, + "step": 557 + }, + { + "epoch": 0.050511451072689416, + "grad_norm": 1.1836525201797485, + "learning_rate": 0.00019894569635796984, + "loss": 4.0514, + "step": 558 + }, + { + "epoch": 0.05060197338643976, + "grad_norm": 1.1598135232925415, + "learning_rate": 0.00019894109553713596, + "loss": 4.0248, + "step": 559 + }, + { + "epoch": 0.0506924957001901, + "grad_norm": 0.9481819868087769, + "learning_rate": 0.00019893648475293648, + "loss": 4.0499, + "step": 560 + }, + { + "epoch": 0.05078301801394044, + "grad_norm": 1.014451503753662, + "learning_rate": 0.00019893186400583566, + "loss": 4.0659, + "step": 561 + }, + { + "epoch": 0.05087354032769078, + "grad_norm": 0.9883650541305542, + "learning_rate": 0.00019892723329629887, + "loss": 4.0131, + "step": 562 + }, + { + "epoch": 0.05096406264144111, + "grad_norm": 0.9952863454818726, + "learning_rate": 0.00019892259262479237, + "loss": 4.0082, + "step": 563 + }, + { + "epoch": 0.051054584955191454, + "grad_norm": 1.0756782293319702, + "learning_rate": 0.0001989179419917835, + "loss": 4.0799, + "step": 564 + }, + { + "epoch": 0.051145107268941795, + "grad_norm": 1.066792368888855, + "learning_rate": 0.00019891328139774056, + "loss": 4.0465, + "step": 565 + }, + { + "epoch": 0.051235629582692135, + "grad_norm": 1.0339174270629883, + "learning_rate": 0.00019890861084313294, + "loss": 4.0085, + "step": 566 + }, + { + "epoch": 0.051326151896442476, + "grad_norm": 0.9658034443855286, + "learning_rate": 0.00019890393032843086, + "loss": 4.025, + "step": 567 + }, + { + "epoch": 0.05141667421019281, + "grad_norm": 1.0180926322937012, + "learning_rate": 0.00019889923985410576, + "loss": 4.0619, + "step": 568 + }, + { + "epoch": 0.05150719652394315, + "grad_norm": 0.9875718951225281, + "learning_rate": 0.00019889453942062985, + "loss": 4.0292, + "step": 569 + }, + { + "epoch": 0.05159771883769349, + "grad_norm": 0.9706630706787109, + "learning_rate": 0.00019888982902847656, + "loss": 4.0822, + "step": 570 + }, + { + "epoch": 0.05168824115144383, + "grad_norm": 0.981421172618866, + "learning_rate": 0.0001988851086781202, + "loss": 4.0864, + "step": 571 + }, + { + "epoch": 0.05177876346519417, + "grad_norm": 1.0634344816207886, + "learning_rate": 0.00019888037837003612, + "loss": 4.0156, + "step": 572 + }, + { + "epoch": 0.05186928577894451, + "grad_norm": 1.0097788572311401, + "learning_rate": 0.0001988756381047006, + "loss": 4.0723, + "step": 573 + }, + { + "epoch": 0.05195980809269485, + "grad_norm": 0.9124920964241028, + "learning_rate": 0.00019887088788259102, + "loss": 3.9745, + "step": 574 + }, + { + "epoch": 0.05205033040644519, + "grad_norm": 1.0202012062072754, + "learning_rate": 0.00019886612770418578, + "loss": 4.0043, + "step": 575 + }, + { + "epoch": 0.05214085272019553, + "grad_norm": 1.0590636730194092, + "learning_rate": 0.0001988613575699642, + "loss": 3.9986, + "step": 576 + }, + { + "epoch": 0.05223137503394587, + "grad_norm": 0.9344479441642761, + "learning_rate": 0.00019885657748040653, + "loss": 4.0404, + "step": 577 + }, + { + "epoch": 0.052321897347696204, + "grad_norm": 0.9335837364196777, + "learning_rate": 0.00019885178743599428, + "loss": 4.0202, + "step": 578 + }, + { + "epoch": 0.052412419661446545, + "grad_norm": 0.8614705204963684, + "learning_rate": 0.00019884698743720974, + "loss": 4.0153, + "step": 579 + }, + { + "epoch": 0.052502941975196886, + "grad_norm": 0.9567529559135437, + "learning_rate": 0.00019884217748453623, + "loss": 4.0822, + "step": 580 + }, + { + "epoch": 0.05259346428894723, + "grad_norm": 0.9799798130989075, + "learning_rate": 0.0001988373575784582, + "loss": 4.0124, + "step": 581 + }, + { + "epoch": 0.05268398660269757, + "grad_norm": 1.0360461473464966, + "learning_rate": 0.00019883252771946095, + "loss": 4.0128, + "step": 582 + }, + { + "epoch": 0.0527745089164479, + "grad_norm": 1.0541273355484009, + "learning_rate": 0.00019882768790803086, + "loss": 3.9391, + "step": 583 + }, + { + "epoch": 0.05286503123019824, + "grad_norm": 1.0942133665084839, + "learning_rate": 0.0001988228381446553, + "loss": 3.9941, + "step": 584 + }, + { + "epoch": 0.05295555354394858, + "grad_norm": 0.9243170619010925, + "learning_rate": 0.00019881797842982263, + "loss": 3.9886, + "step": 585 + }, + { + "epoch": 0.053046075857698924, + "grad_norm": 0.9738331437110901, + "learning_rate": 0.00019881310876402223, + "loss": 3.9795, + "step": 586 + }, + { + "epoch": 0.053136598171449265, + "grad_norm": 0.932464599609375, + "learning_rate": 0.00019880822914774453, + "loss": 3.9952, + "step": 587 + }, + { + "epoch": 0.0532271204851996, + "grad_norm": 1.0746235847473145, + "learning_rate": 0.00019880333958148083, + "loss": 3.9263, + "step": 588 + }, + { + "epoch": 0.05331764279894994, + "grad_norm": 0.9642815589904785, + "learning_rate": 0.00019879844006572353, + "loss": 4.0313, + "step": 589 + }, + { + "epoch": 0.05340816511270028, + "grad_norm": 1.041780710220337, + "learning_rate": 0.00019879353060096603, + "loss": 3.9453, + "step": 590 + }, + { + "epoch": 0.05349868742645062, + "grad_norm": 0.9562025666236877, + "learning_rate": 0.0001987886111877027, + "loss": 3.9492, + "step": 591 + }, + { + "epoch": 0.05358920974020096, + "grad_norm": 0.9852890372276306, + "learning_rate": 0.0001987836818264289, + "loss": 3.9696, + "step": 592 + }, + { + "epoch": 0.053679732053951296, + "grad_norm": 1.0013340711593628, + "learning_rate": 0.00019877874251764105, + "loss": 3.9991, + "step": 593 + }, + { + "epoch": 0.053770254367701636, + "grad_norm": 1.1419621706008911, + "learning_rate": 0.00019877379326183655, + "loss": 4.0529, + "step": 594 + }, + { + "epoch": 0.05386077668145198, + "grad_norm": 1.0771965980529785, + "learning_rate": 0.00019876883405951377, + "loss": 4.094, + "step": 595 + }, + { + "epoch": 0.05395129899520232, + "grad_norm": 1.0541069507598877, + "learning_rate": 0.0001987638649111721, + "loss": 3.9807, + "step": 596 + }, + { + "epoch": 0.05404182130895266, + "grad_norm": 0.9966290593147278, + "learning_rate": 0.00019875888581731194, + "loss": 3.9641, + "step": 597 + }, + { + "epoch": 0.054132343622703, + "grad_norm": 1.0405066013336182, + "learning_rate": 0.0001987538967784347, + "loss": 4.0511, + "step": 598 + }, + { + "epoch": 0.054222865936453334, + "grad_norm": 0.9865603446960449, + "learning_rate": 0.00019874889779504274, + "loss": 3.9883, + "step": 599 + }, + { + "epoch": 0.054313388250203674, + "grad_norm": 1.097095251083374, + "learning_rate": 0.00019874388886763944, + "loss": 3.9818, + "step": 600 + }, + { + "epoch": 0.054403910563954015, + "grad_norm": 0.9284229278564453, + "learning_rate": 0.00019873886999672926, + "loss": 4.0203, + "step": 601 + }, + { + "epoch": 0.054494432877704356, + "grad_norm": 1.0191222429275513, + "learning_rate": 0.00019873384118281756, + "loss": 3.9996, + "step": 602 + }, + { + "epoch": 0.0545849551914547, + "grad_norm": 1.0141578912734985, + "learning_rate": 0.00019872880242641078, + "loss": 3.996, + "step": 603 + }, + { + "epoch": 0.05467547750520503, + "grad_norm": 1.0445934534072876, + "learning_rate": 0.0001987237537280163, + "loss": 3.9988, + "step": 604 + }, + { + "epoch": 0.05476599981895537, + "grad_norm": 1.0781155824661255, + "learning_rate": 0.0001987186950881425, + "loss": 4.0494, + "step": 605 + }, + { + "epoch": 0.05485652213270571, + "grad_norm": 0.9800688028335571, + "learning_rate": 0.0001987136265072988, + "loss": 4.0039, + "step": 606 + }, + { + "epoch": 0.05494704444645605, + "grad_norm": 0.9078426361083984, + "learning_rate": 0.00019870854798599563, + "loss": 3.954, + "step": 607 + }, + { + "epoch": 0.055037566760206394, + "grad_norm": 0.9343809485435486, + "learning_rate": 0.00019870345952474437, + "loss": 3.947, + "step": 608 + }, + { + "epoch": 0.05512808907395673, + "grad_norm": 1.0472208261489868, + "learning_rate": 0.00019869836112405742, + "loss": 4.0662, + "step": 609 + }, + { + "epoch": 0.05521861138770707, + "grad_norm": 0.9884335994720459, + "learning_rate": 0.00019869325278444824, + "loss": 3.9495, + "step": 610 + }, + { + "epoch": 0.05530913370145741, + "grad_norm": 0.9580947160720825, + "learning_rate": 0.00019868813450643117, + "loss": 4.0143, + "step": 611 + }, + { + "epoch": 0.05539965601520775, + "grad_norm": 1.0103482007980347, + "learning_rate": 0.0001986830062905217, + "loss": 3.9516, + "step": 612 + }, + { + "epoch": 0.05549017832895809, + "grad_norm": 0.9422419667243958, + "learning_rate": 0.00019867786813723614, + "loss": 4.0485, + "step": 613 + }, + { + "epoch": 0.055580700642708425, + "grad_norm": 1.042916178703308, + "learning_rate": 0.00019867272004709202, + "loss": 3.9367, + "step": 614 + }, + { + "epoch": 0.055671222956458766, + "grad_norm": 1.0430114269256592, + "learning_rate": 0.00019866756202060762, + "loss": 3.9835, + "step": 615 + }, + { + "epoch": 0.05576174527020911, + "grad_norm": 0.9182488322257996, + "learning_rate": 0.00019866239405830248, + "loss": 3.9871, + "step": 616 + }, + { + "epoch": 0.05585226758395945, + "grad_norm": 1.0735499858856201, + "learning_rate": 0.00019865721616069696, + "loss": 4.0137, + "step": 617 + }, + { + "epoch": 0.05594278989770979, + "grad_norm": 0.9811666011810303, + "learning_rate": 0.00019865202832831244, + "loss": 4.0045, + "step": 618 + }, + { + "epoch": 0.05603331221146012, + "grad_norm": 1.1695924997329712, + "learning_rate": 0.00019864683056167138, + "loss": 4.0147, + "step": 619 + }, + { + "epoch": 0.05612383452521046, + "grad_norm": 1.052254557609558, + "learning_rate": 0.0001986416228612972, + "loss": 4.0263, + "step": 620 + }, + { + "epoch": 0.056214356838960804, + "grad_norm": 1.2805979251861572, + "learning_rate": 0.00019863640522771429, + "loss": 3.9631, + "step": 621 + }, + { + "epoch": 0.056304879152711144, + "grad_norm": 1.036574363708496, + "learning_rate": 0.00019863117766144806, + "loss": 4.0074, + "step": 622 + }, + { + "epoch": 0.056395401466461485, + "grad_norm": 1.0256383419036865, + "learning_rate": 0.00019862594016302492, + "loss": 4.0937, + "step": 623 + }, + { + "epoch": 0.05648592378021182, + "grad_norm": 0.9368764758110046, + "learning_rate": 0.00019862069273297232, + "loss": 3.9692, + "step": 624 + }, + { + "epoch": 0.05657644609396216, + "grad_norm": 1.1771879196166992, + "learning_rate": 0.00019861543537181867, + "loss": 4.0685, + "step": 625 + }, + { + "epoch": 0.0566669684077125, + "grad_norm": 1.032165288925171, + "learning_rate": 0.00019861016808009335, + "loss": 3.8764, + "step": 626 + }, + { + "epoch": 0.05675749072146284, + "grad_norm": 0.970867395401001, + "learning_rate": 0.00019860489085832684, + "loss": 3.9917, + "step": 627 + }, + { + "epoch": 0.05684801303521318, + "grad_norm": 1.1051703691482544, + "learning_rate": 0.0001985996037070505, + "loss": 3.9425, + "step": 628 + }, + { + "epoch": 0.056938535348963516, + "grad_norm": 0.953762948513031, + "learning_rate": 0.00019859430662679678, + "loss": 3.912, + "step": 629 + }, + { + "epoch": 0.05702905766271386, + "grad_norm": 1.0868688821792603, + "learning_rate": 0.00019858899961809905, + "loss": 3.9425, + "step": 630 + }, + { + "epoch": 0.0571195799764642, + "grad_norm": 1.01759672164917, + "learning_rate": 0.0001985836826814918, + "loss": 4.0169, + "step": 631 + }, + { + "epoch": 0.05721010229021454, + "grad_norm": 0.9279404878616333, + "learning_rate": 0.00019857835581751037, + "loss": 3.9645, + "step": 632 + }, + { + "epoch": 0.05730062460396488, + "grad_norm": 0.9583117961883545, + "learning_rate": 0.00019857301902669118, + "loss": 4.0205, + "step": 633 + }, + { + "epoch": 0.05739114691771521, + "grad_norm": 1.0274845361709595, + "learning_rate": 0.00019856767230957173, + "loss": 4.0191, + "step": 634 + }, + { + "epoch": 0.057481669231465554, + "grad_norm": 0.970229983329773, + "learning_rate": 0.00019856231566669035, + "loss": 4.0247, + "step": 635 + }, + { + "epoch": 0.057572191545215895, + "grad_norm": 0.9355469346046448, + "learning_rate": 0.00019855694909858644, + "loss": 3.937, + "step": 636 + }, + { + "epoch": 0.057662713858966236, + "grad_norm": 1.0804182291030884, + "learning_rate": 0.0001985515726058005, + "loss": 3.9142, + "step": 637 + }, + { + "epoch": 0.05775323617271658, + "grad_norm": 0.9826698303222656, + "learning_rate": 0.0001985461861888739, + "loss": 3.9708, + "step": 638 + }, + { + "epoch": 0.05784375848646692, + "grad_norm": 1.0979808568954468, + "learning_rate": 0.00019854078984834903, + "loss": 3.9984, + "step": 639 + }, + { + "epoch": 0.05793428080021725, + "grad_norm": 1.0105253458023071, + "learning_rate": 0.00019853538358476932, + "loss": 3.9522, + "step": 640 + }, + { + "epoch": 0.05802480311396759, + "grad_norm": 0.9418004155158997, + "learning_rate": 0.00019852996739867917, + "loss": 3.9531, + "step": 641 + }, + { + "epoch": 0.05811532542771793, + "grad_norm": 0.9501124024391174, + "learning_rate": 0.000198524541290624, + "loss": 4.0334, + "step": 642 + }, + { + "epoch": 0.058205847741468274, + "grad_norm": 0.9833979606628418, + "learning_rate": 0.00019851910526115022, + "loss": 3.9814, + "step": 643 + }, + { + "epoch": 0.058296370055218615, + "grad_norm": 0.9670637249946594, + "learning_rate": 0.00019851365931080527, + "loss": 3.9314, + "step": 644 + }, + { + "epoch": 0.05838689236896895, + "grad_norm": 0.9459611773490906, + "learning_rate": 0.00019850820344013748, + "loss": 3.901, + "step": 645 + }, + { + "epoch": 0.05847741468271929, + "grad_norm": 1.0001896619796753, + "learning_rate": 0.00019850273764969632, + "loss": 3.9213, + "step": 646 + }, + { + "epoch": 0.05856793699646963, + "grad_norm": 0.947214663028717, + "learning_rate": 0.00019849726194003214, + "loss": 3.9845, + "step": 647 + }, + { + "epoch": 0.05865845931021997, + "grad_norm": 0.9455295205116272, + "learning_rate": 0.00019849177631169643, + "loss": 3.9425, + "step": 648 + }, + { + "epoch": 0.05874898162397031, + "grad_norm": 0.9829345345497131, + "learning_rate": 0.00019848628076524148, + "loss": 3.847, + "step": 649 + }, + { + "epoch": 0.058839503937720646, + "grad_norm": 0.9669824838638306, + "learning_rate": 0.00019848077530122083, + "loss": 3.9601, + "step": 650 + }, + { + "epoch": 0.058930026251470986, + "grad_norm": 0.9459212422370911, + "learning_rate": 0.00019847525992018877, + "loss": 3.9733, + "step": 651 + }, + { + "epoch": 0.05902054856522133, + "grad_norm": 1.0247117280960083, + "learning_rate": 0.0001984697346227007, + "loss": 3.9652, + "step": 652 + }, + { + "epoch": 0.05911107087897167, + "grad_norm": 0.9490055441856384, + "learning_rate": 0.00019846419940931304, + "loss": 4.0009, + "step": 653 + }, + { + "epoch": 0.05920159319272201, + "grad_norm": 1.0625797510147095, + "learning_rate": 0.00019845865428058326, + "loss": 3.9344, + "step": 654 + }, + { + "epoch": 0.05929211550647234, + "grad_norm": 0.9928398132324219, + "learning_rate": 0.00019845309923706962, + "loss": 3.9198, + "step": 655 + }, + { + "epoch": 0.05938263782022268, + "grad_norm": 0.9566262364387512, + "learning_rate": 0.00019844753427933164, + "loss": 3.9347, + "step": 656 + }, + { + "epoch": 0.059473160133973024, + "grad_norm": 0.9817463755607605, + "learning_rate": 0.0001984419594079296, + "loss": 3.9768, + "step": 657 + }, + { + "epoch": 0.059563682447723365, + "grad_norm": 0.9232942461967468, + "learning_rate": 0.00019843637462342497, + "loss": 3.9482, + "step": 658 + }, + { + "epoch": 0.059654204761473706, + "grad_norm": 1.021917462348938, + "learning_rate": 0.00019843077992638008, + "loss": 4.0121, + "step": 659 + }, + { + "epoch": 0.05974472707522404, + "grad_norm": 0.8719213604927063, + "learning_rate": 0.00019842517531735838, + "loss": 3.932, + "step": 660 + }, + { + "epoch": 0.05983524938897438, + "grad_norm": 0.9565935134887695, + "learning_rate": 0.0001984195607969242, + "loss": 3.8937, + "step": 661 + }, + { + "epoch": 0.05992577170272472, + "grad_norm": 0.9811224341392517, + "learning_rate": 0.00019841393636564294, + "loss": 3.9254, + "step": 662 + }, + { + "epoch": 0.06001629401647506, + "grad_norm": 0.9084984064102173, + "learning_rate": 0.000198408302024081, + "loss": 3.9867, + "step": 663 + }, + { + "epoch": 0.0601068163302254, + "grad_norm": 1.0378906726837158, + "learning_rate": 0.0001984026577728057, + "loss": 3.8996, + "step": 664 + }, + { + "epoch": 0.06019733864397574, + "grad_norm": 1.0800020694732666, + "learning_rate": 0.0001983970036123855, + "loss": 3.9582, + "step": 665 + }, + { + "epoch": 0.06028786095772608, + "grad_norm": 1.086745023727417, + "learning_rate": 0.0001983913395433897, + "loss": 3.8442, + "step": 666 + }, + { + "epoch": 0.06037838327147642, + "grad_norm": 1.1331897974014282, + "learning_rate": 0.0001983856655663887, + "loss": 3.9351, + "step": 667 + }, + { + "epoch": 0.06046890558522676, + "grad_norm": 1.074208378791809, + "learning_rate": 0.0001983799816819539, + "loss": 3.9473, + "step": 668 + }, + { + "epoch": 0.0605594278989771, + "grad_norm": 0.9405530691146851, + "learning_rate": 0.0001983742878906576, + "loss": 3.9648, + "step": 669 + }, + { + "epoch": 0.060649950212727434, + "grad_norm": 0.9946943521499634, + "learning_rate": 0.00019836858419307324, + "loss": 3.9814, + "step": 670 + }, + { + "epoch": 0.060740472526477775, + "grad_norm": 0.952600359916687, + "learning_rate": 0.0001983628705897751, + "loss": 3.986, + "step": 671 + }, + { + "epoch": 0.060830994840228116, + "grad_norm": 0.9927718639373779, + "learning_rate": 0.00019835714708133862, + "loss": 3.9474, + "step": 672 + }, + { + "epoch": 0.060921517153978456, + "grad_norm": 0.9685267806053162, + "learning_rate": 0.00019835141366834007, + "loss": 3.8756, + "step": 673 + }, + { + "epoch": 0.0610120394677288, + "grad_norm": 1.0644733905792236, + "learning_rate": 0.0001983456703513569, + "loss": 4.029, + "step": 674 + }, + { + "epoch": 0.06110256178147914, + "grad_norm": 0.8898302316665649, + "learning_rate": 0.0001983399171309674, + "loss": 3.9162, + "step": 675 + }, + { + "epoch": 0.06119308409522947, + "grad_norm": 1.0014737844467163, + "learning_rate": 0.00019833415400775093, + "loss": 3.8812, + "step": 676 + }, + { + "epoch": 0.06128360640897981, + "grad_norm": 0.9743993878364563, + "learning_rate": 0.00019832838098228785, + "loss": 3.9309, + "step": 677 + }, + { + "epoch": 0.061374128722730154, + "grad_norm": 0.9850934743881226, + "learning_rate": 0.0001983225980551595, + "loss": 4.0093, + "step": 678 + }, + { + "epoch": 0.061464651036480494, + "grad_norm": 1.0402182340621948, + "learning_rate": 0.00019831680522694822, + "loss": 4.009, + "step": 679 + }, + { + "epoch": 0.061555173350230835, + "grad_norm": 1.0211234092712402, + "learning_rate": 0.00019831100249823733, + "loss": 4.0221, + "step": 680 + }, + { + "epoch": 0.06164569566398117, + "grad_norm": 1.0483235120773315, + "learning_rate": 0.00019830518986961118, + "loss": 3.9018, + "step": 681 + }, + { + "epoch": 0.06173621797773151, + "grad_norm": 0.9993516802787781, + "learning_rate": 0.0001982993673416551, + "loss": 3.9225, + "step": 682 + }, + { + "epoch": 0.06182674029148185, + "grad_norm": 0.9728137254714966, + "learning_rate": 0.00019829353491495545, + "loss": 3.9108, + "step": 683 + }, + { + "epoch": 0.06191726260523219, + "grad_norm": 1.0375216007232666, + "learning_rate": 0.00019828769259009948, + "loss": 3.9321, + "step": 684 + }, + { + "epoch": 0.06200778491898253, + "grad_norm": 1.0920323133468628, + "learning_rate": 0.00019828184036767556, + "loss": 3.9325, + "step": 685 + }, + { + "epoch": 0.062098307232732866, + "grad_norm": 0.940028965473175, + "learning_rate": 0.00019827597824827303, + "loss": 3.8847, + "step": 686 + }, + { + "epoch": 0.06218882954648321, + "grad_norm": 1.0867862701416016, + "learning_rate": 0.00019827010623248216, + "loss": 3.8561, + "step": 687 + }, + { + "epoch": 0.06227935186023355, + "grad_norm": 1.0442867279052734, + "learning_rate": 0.0001982642243208943, + "loss": 3.9314, + "step": 688 + }, + { + "epoch": 0.06236987417398389, + "grad_norm": 0.9575579762458801, + "learning_rate": 0.00019825833251410174, + "loss": 3.9118, + "step": 689 + }, + { + "epoch": 0.06246039648773423, + "grad_norm": 0.9611737132072449, + "learning_rate": 0.00019825243081269774, + "loss": 3.9184, + "step": 690 + }, + { + "epoch": 0.06255091880148457, + "grad_norm": 1.049651026725769, + "learning_rate": 0.0001982465192172767, + "loss": 3.949, + "step": 691 + }, + { + "epoch": 0.06264144111523491, + "grad_norm": 1.005272388458252, + "learning_rate": 0.0001982405977284338, + "loss": 3.8511, + "step": 692 + }, + { + "epoch": 0.06273196342898525, + "grad_norm": 1.052721381187439, + "learning_rate": 0.00019823466634676543, + "loss": 3.8426, + "step": 693 + }, + { + "epoch": 0.06282248574273558, + "grad_norm": 0.9251368641853333, + "learning_rate": 0.0001982287250728689, + "loss": 3.9242, + "step": 694 + }, + { + "epoch": 0.06291300805648592, + "grad_norm": 1.048956274986267, + "learning_rate": 0.00019822277390734239, + "loss": 3.8853, + "step": 695 + }, + { + "epoch": 0.06300353037023626, + "grad_norm": 1.1769357919692993, + "learning_rate": 0.00019821681285078522, + "loss": 3.9212, + "step": 696 + }, + { + "epoch": 0.0630940526839866, + "grad_norm": 0.9820570349693298, + "learning_rate": 0.0001982108419037977, + "loss": 3.8646, + "step": 697 + }, + { + "epoch": 0.06318457499773694, + "grad_norm": 1.104885220527649, + "learning_rate": 0.00019820486106698112, + "loss": 3.9699, + "step": 698 + }, + { + "epoch": 0.06327509731148728, + "grad_norm": 0.9958544969558716, + "learning_rate": 0.00019819887034093768, + "loss": 3.8866, + "step": 699 + }, + { + "epoch": 0.06336561962523762, + "grad_norm": 1.0173392295837402, + "learning_rate": 0.00019819286972627066, + "loss": 3.9072, + "step": 700 + }, + { + "epoch": 0.06345614193898796, + "grad_norm": 0.9573622345924377, + "learning_rate": 0.0001981868592235844, + "loss": 3.932, + "step": 701 + }, + { + "epoch": 0.0635466642527383, + "grad_norm": 0.9838647246360779, + "learning_rate": 0.0001981808388334841, + "loss": 3.851, + "step": 702 + }, + { + "epoch": 0.06363718656648865, + "grad_norm": 0.9603915810585022, + "learning_rate": 0.00019817480855657599, + "loss": 3.9591, + "step": 703 + }, + { + "epoch": 0.06372770888023897, + "grad_norm": 1.0218600034713745, + "learning_rate": 0.00019816876839346735, + "loss": 3.8491, + "step": 704 + }, + { + "epoch": 0.06381823119398931, + "grad_norm": 1.0183830261230469, + "learning_rate": 0.00019816271834476642, + "loss": 3.8674, + "step": 705 + }, + { + "epoch": 0.06390875350773965, + "grad_norm": 1.0954585075378418, + "learning_rate": 0.00019815665841108241, + "loss": 3.9141, + "step": 706 + }, + { + "epoch": 0.06399927582149, + "grad_norm": 1.0194724798202515, + "learning_rate": 0.00019815058859302563, + "loss": 3.9022, + "step": 707 + }, + { + "epoch": 0.06408979813524034, + "grad_norm": 0.9629772901535034, + "learning_rate": 0.00019814450889120723, + "loss": 3.8725, + "step": 708 + }, + { + "epoch": 0.06418032044899068, + "grad_norm": 0.9155080914497375, + "learning_rate": 0.0001981384193062395, + "loss": 3.9373, + "step": 709 + }, + { + "epoch": 0.06427084276274102, + "grad_norm": 0.9478293657302856, + "learning_rate": 0.0001981323198387356, + "loss": 3.9601, + "step": 710 + }, + { + "epoch": 0.06436136507649136, + "grad_norm": 0.9674055576324463, + "learning_rate": 0.0001981262104893098, + "loss": 3.8731, + "step": 711 + }, + { + "epoch": 0.0644518873902417, + "grad_norm": 0.898500382900238, + "learning_rate": 0.00019812009125857728, + "loss": 3.8238, + "step": 712 + }, + { + "epoch": 0.06454240970399204, + "grad_norm": 0.8977572321891785, + "learning_rate": 0.00019811396214715427, + "loss": 3.8915, + "step": 713 + }, + { + "epoch": 0.06463293201774237, + "grad_norm": 0.8710629940032959, + "learning_rate": 0.00019810782315565794, + "loss": 3.8397, + "step": 714 + }, + { + "epoch": 0.06472345433149271, + "grad_norm": 0.9772562980651855, + "learning_rate": 0.00019810167428470653, + "loss": 3.8439, + "step": 715 + }, + { + "epoch": 0.06481397664524305, + "grad_norm": 0.8889133930206299, + "learning_rate": 0.00019809551553491916, + "loss": 3.8663, + "step": 716 + }, + { + "epoch": 0.06490449895899339, + "grad_norm": 0.9468355774879456, + "learning_rate": 0.0001980893469069161, + "loss": 3.8705, + "step": 717 + }, + { + "epoch": 0.06499502127274373, + "grad_norm": 0.9171603322029114, + "learning_rate": 0.00019808316840131846, + "loss": 3.8796, + "step": 718 + }, + { + "epoch": 0.06508554358649407, + "grad_norm": 0.943007230758667, + "learning_rate": 0.00019807698001874846, + "loss": 3.8835, + "step": 719 + }, + { + "epoch": 0.06517606590024441, + "grad_norm": 0.8981984257698059, + "learning_rate": 0.00019807078175982924, + "loss": 3.9003, + "step": 720 + }, + { + "epoch": 0.06526658821399475, + "grad_norm": 0.9815030694007874, + "learning_rate": 0.000198064573625185, + "loss": 3.8903, + "step": 721 + }, + { + "epoch": 0.0653571105277451, + "grad_norm": 0.9398093819618225, + "learning_rate": 0.00019805835561544086, + "loss": 3.8726, + "step": 722 + }, + { + "epoch": 0.06544763284149543, + "grad_norm": 0.9546122550964355, + "learning_rate": 0.00019805212773122303, + "loss": 3.8834, + "step": 723 + }, + { + "epoch": 0.06553815515524576, + "grad_norm": 0.9206355810165405, + "learning_rate": 0.00019804588997315858, + "loss": 3.8714, + "step": 724 + }, + { + "epoch": 0.0656286774689961, + "grad_norm": 1.0589860677719116, + "learning_rate": 0.0001980396423418757, + "loss": 3.8971, + "step": 725 + }, + { + "epoch": 0.06571919978274644, + "grad_norm": 0.9480148553848267, + "learning_rate": 0.00019803338483800353, + "loss": 3.8883, + "step": 726 + }, + { + "epoch": 0.06580972209649678, + "grad_norm": 1.0164626836776733, + "learning_rate": 0.00019802711746217218, + "loss": 3.8837, + "step": 727 + }, + { + "epoch": 0.06590024441024712, + "grad_norm": 0.9303282499313354, + "learning_rate": 0.0001980208402150128, + "loss": 3.8288, + "step": 728 + }, + { + "epoch": 0.06599076672399747, + "grad_norm": 0.9358525276184082, + "learning_rate": 0.00019801455309715748, + "loss": 3.8526, + "step": 729 + }, + { + "epoch": 0.0660812890377478, + "grad_norm": 0.9523688554763794, + "learning_rate": 0.00019800825610923934, + "loss": 3.8298, + "step": 730 + }, + { + "epoch": 0.06617181135149815, + "grad_norm": 0.9664517045021057, + "learning_rate": 0.0001980019492518925, + "loss": 3.8322, + "step": 731 + }, + { + "epoch": 0.06626233366524849, + "grad_norm": 0.9282121062278748, + "learning_rate": 0.00019799563252575206, + "loss": 3.8764, + "step": 732 + }, + { + "epoch": 0.06635285597899883, + "grad_norm": 0.9822096824645996, + "learning_rate": 0.0001979893059314541, + "loss": 3.8923, + "step": 733 + }, + { + "epoch": 0.06644337829274917, + "grad_norm": 1.003895878791809, + "learning_rate": 0.0001979829694696357, + "loss": 3.8908, + "step": 734 + }, + { + "epoch": 0.0665339006064995, + "grad_norm": 0.9121044874191284, + "learning_rate": 0.00019797662314093497, + "loss": 3.8674, + "step": 735 + }, + { + "epoch": 0.06662442292024984, + "grad_norm": 0.898923933506012, + "learning_rate": 0.00019797026694599098, + "loss": 3.8893, + "step": 736 + }, + { + "epoch": 0.06671494523400018, + "grad_norm": 0.9216980338096619, + "learning_rate": 0.00019796390088544377, + "loss": 3.7977, + "step": 737 + }, + { + "epoch": 0.06680546754775052, + "grad_norm": 0.9738581776618958, + "learning_rate": 0.0001979575249599344, + "loss": 3.8256, + "step": 738 + }, + { + "epoch": 0.06689598986150086, + "grad_norm": 0.8819999098777771, + "learning_rate": 0.000197951139170105, + "loss": 3.8756, + "step": 739 + }, + { + "epoch": 0.0669865121752512, + "grad_norm": 0.925317108631134, + "learning_rate": 0.00019794474351659852, + "loss": 3.8616, + "step": 740 + }, + { + "epoch": 0.06707703448900154, + "grad_norm": 0.9750755429267883, + "learning_rate": 0.00019793833800005908, + "loss": 3.8383, + "step": 741 + }, + { + "epoch": 0.06716755680275188, + "grad_norm": 0.9731130599975586, + "learning_rate": 0.00019793192262113166, + "loss": 3.8245, + "step": 742 + }, + { + "epoch": 0.06725807911650222, + "grad_norm": 0.9129505753517151, + "learning_rate": 0.00019792549738046234, + "loss": 3.8026, + "step": 743 + }, + { + "epoch": 0.06734860143025256, + "grad_norm": 0.971178412437439, + "learning_rate": 0.00019791906227869808, + "loss": 3.8382, + "step": 744 + }, + { + "epoch": 0.06743912374400289, + "grad_norm": 1.011927843093872, + "learning_rate": 0.00019791261731648695, + "loss": 3.8173, + "step": 745 + }, + { + "epoch": 0.06752964605775323, + "grad_norm": 0.9691061973571777, + "learning_rate": 0.00019790616249447794, + "loss": 3.8659, + "step": 746 + }, + { + "epoch": 0.06762016837150357, + "grad_norm": 0.9729804992675781, + "learning_rate": 0.000197899697813321, + "loss": 3.8626, + "step": 747 + }, + { + "epoch": 0.06771069068525391, + "grad_norm": 0.9716980457305908, + "learning_rate": 0.00019789322327366723, + "loss": 3.9086, + "step": 748 + }, + { + "epoch": 0.06780121299900425, + "grad_norm": 1.1275818347930908, + "learning_rate": 0.0001978867388761685, + "loss": 3.8068, + "step": 749 + }, + { + "epoch": 0.0678917353127546, + "grad_norm": 0.9165233969688416, + "learning_rate": 0.00019788024462147788, + "loss": 3.9002, + "step": 750 + }, + { + "epoch": 0.06798225762650494, + "grad_norm": 0.9921157956123352, + "learning_rate": 0.0001978737405102493, + "loss": 3.833, + "step": 751 + }, + { + "epoch": 0.06807277994025528, + "grad_norm": 0.9720535278320312, + "learning_rate": 0.00019786722654313772, + "loss": 3.8524, + "step": 752 + }, + { + "epoch": 0.06816330225400562, + "grad_norm": 0.9478955864906311, + "learning_rate": 0.00019786070272079912, + "loss": 3.8483, + "step": 753 + }, + { + "epoch": 0.06825382456775596, + "grad_norm": 1.005020260810852, + "learning_rate": 0.00019785416904389042, + "loss": 3.9172, + "step": 754 + }, + { + "epoch": 0.06834434688150628, + "grad_norm": 0.9765123128890991, + "learning_rate": 0.00019784762551306955, + "loss": 3.8409, + "step": 755 + }, + { + "epoch": 0.06843486919525663, + "grad_norm": 1.0998058319091797, + "learning_rate": 0.00019784107212899552, + "loss": 3.8233, + "step": 756 + }, + { + "epoch": 0.06852539150900697, + "grad_norm": 0.9845994710922241, + "learning_rate": 0.00019783450889232818, + "loss": 3.9168, + "step": 757 + }, + { + "epoch": 0.06861591382275731, + "grad_norm": 1.118572473526001, + "learning_rate": 0.00019782793580372848, + "loss": 3.9189, + "step": 758 + }, + { + "epoch": 0.06870643613650765, + "grad_norm": 1.011348009109497, + "learning_rate": 0.0001978213528638583, + "loss": 3.8837, + "step": 759 + }, + { + "epoch": 0.06879695845025799, + "grad_norm": 1.0644543170928955, + "learning_rate": 0.00019781476007338058, + "loss": 3.9323, + "step": 760 + }, + { + "epoch": 0.06888748076400833, + "grad_norm": 0.9996188282966614, + "learning_rate": 0.0001978081574329592, + "loss": 3.8424, + "step": 761 + }, + { + "epoch": 0.06897800307775867, + "grad_norm": 1.0625057220458984, + "learning_rate": 0.00019780154494325903, + "loss": 3.8176, + "step": 762 + }, + { + "epoch": 0.06906852539150901, + "grad_norm": 1.0283875465393066, + "learning_rate": 0.00019779492260494594, + "loss": 3.8401, + "step": 763 + }, + { + "epoch": 0.06915904770525935, + "grad_norm": 1.0000511407852173, + "learning_rate": 0.00019778829041868689, + "loss": 3.8379, + "step": 764 + }, + { + "epoch": 0.06924957001900968, + "grad_norm": 1.0903455018997192, + "learning_rate": 0.00019778164838514962, + "loss": 3.8621, + "step": 765 + }, + { + "epoch": 0.06934009233276002, + "grad_norm": 1.013375163078308, + "learning_rate": 0.000197774996505003, + "loss": 3.797, + "step": 766 + }, + { + "epoch": 0.06943061464651036, + "grad_norm": 1.1123466491699219, + "learning_rate": 0.00019776833477891696, + "loss": 3.8484, + "step": 767 + }, + { + "epoch": 0.0695211369602607, + "grad_norm": 0.9966886043548584, + "learning_rate": 0.00019776166320756227, + "loss": 3.8394, + "step": 768 + }, + { + "epoch": 0.06961165927401104, + "grad_norm": 1.0038275718688965, + "learning_rate": 0.00019775498179161076, + "loss": 3.8286, + "step": 769 + }, + { + "epoch": 0.06970218158776138, + "grad_norm": 1.0068305730819702, + "learning_rate": 0.00019774829053173526, + "loss": 3.918, + "step": 770 + }, + { + "epoch": 0.06979270390151172, + "grad_norm": 1.0238758325576782, + "learning_rate": 0.0001977415894286096, + "loss": 3.801, + "step": 771 + }, + { + "epoch": 0.06988322621526206, + "grad_norm": 1.0598841905593872, + "learning_rate": 0.00019773487848290854, + "loss": 3.8412, + "step": 772 + }, + { + "epoch": 0.0699737485290124, + "grad_norm": 1.0061222314834595, + "learning_rate": 0.0001977281576953079, + "loss": 3.8693, + "step": 773 + }, + { + "epoch": 0.07006427084276275, + "grad_norm": 0.9197167754173279, + "learning_rate": 0.00019772142706648443, + "loss": 3.8136, + "step": 774 + }, + { + "epoch": 0.07015479315651309, + "grad_norm": 0.9932407140731812, + "learning_rate": 0.00019771468659711595, + "loss": 3.8804, + "step": 775 + }, + { + "epoch": 0.07024531547026341, + "grad_norm": 0.91663658618927, + "learning_rate": 0.00019770793628788122, + "loss": 3.9397, + "step": 776 + }, + { + "epoch": 0.07033583778401375, + "grad_norm": 0.9589143991470337, + "learning_rate": 0.00019770117613945995, + "loss": 3.8571, + "step": 777 + }, + { + "epoch": 0.0704263600977641, + "grad_norm": 0.9023710489273071, + "learning_rate": 0.00019769440615253293, + "loss": 3.861, + "step": 778 + }, + { + "epoch": 0.07051688241151444, + "grad_norm": 0.9534058570861816, + "learning_rate": 0.00019768762632778187, + "loss": 3.875, + "step": 779 + }, + { + "epoch": 0.07060740472526478, + "grad_norm": 0.9815419316291809, + "learning_rate": 0.00019768083666588953, + "loss": 3.7904, + "step": 780 + }, + { + "epoch": 0.07069792703901512, + "grad_norm": 1.0916591882705688, + "learning_rate": 0.00019767403716753959, + "loss": 3.8464, + "step": 781 + }, + { + "epoch": 0.07078844935276546, + "grad_norm": 0.9650158882141113, + "learning_rate": 0.0001976672278334168, + "loss": 3.8239, + "step": 782 + }, + { + "epoch": 0.0708789716665158, + "grad_norm": 0.9654297232627869, + "learning_rate": 0.00019766040866420683, + "loss": 3.9072, + "step": 783 + }, + { + "epoch": 0.07096949398026614, + "grad_norm": 0.9741345643997192, + "learning_rate": 0.00019765357966059638, + "loss": 3.8536, + "step": 784 + }, + { + "epoch": 0.07106001629401648, + "grad_norm": 0.9063356518745422, + "learning_rate": 0.0001976467408232731, + "loss": 3.8186, + "step": 785 + }, + { + "epoch": 0.07115053860776681, + "grad_norm": 0.9848807454109192, + "learning_rate": 0.00019763989215292575, + "loss": 3.866, + "step": 786 + }, + { + "epoch": 0.07124106092151715, + "grad_norm": 1.0605283975601196, + "learning_rate": 0.0001976330336502439, + "loss": 3.9509, + "step": 787 + }, + { + "epoch": 0.07133158323526749, + "grad_norm": 0.9561129808425903, + "learning_rate": 0.00019762616531591824, + "loss": 3.8792, + "step": 788 + }, + { + "epoch": 0.07142210554901783, + "grad_norm": 0.9112474322319031, + "learning_rate": 0.0001976192871506404, + "loss": 3.8499, + "step": 789 + }, + { + "epoch": 0.07151262786276817, + "grad_norm": 1.0188720226287842, + "learning_rate": 0.00019761239915510302, + "loss": 3.7913, + "step": 790 + }, + { + "epoch": 0.07160315017651851, + "grad_norm": 0.914110541343689, + "learning_rate": 0.0001976055013299997, + "loss": 3.8441, + "step": 791 + }, + { + "epoch": 0.07169367249026885, + "grad_norm": 1.0345866680145264, + "learning_rate": 0.0001975985936760251, + "loss": 3.7915, + "step": 792 + }, + { + "epoch": 0.0717841948040192, + "grad_norm": 0.9036014676094055, + "learning_rate": 0.00019759167619387476, + "loss": 3.8819, + "step": 793 + }, + { + "epoch": 0.07187471711776953, + "grad_norm": 0.8982127904891968, + "learning_rate": 0.0001975847488842453, + "loss": 3.7775, + "step": 794 + }, + { + "epoch": 0.07196523943151988, + "grad_norm": 0.8711851835250854, + "learning_rate": 0.00019757781174783428, + "loss": 3.8353, + "step": 795 + }, + { + "epoch": 0.0720557617452702, + "grad_norm": 0.9486883878707886, + "learning_rate": 0.0001975708647853403, + "loss": 3.8049, + "step": 796 + }, + { + "epoch": 0.07214628405902054, + "grad_norm": 0.9494708776473999, + "learning_rate": 0.00019756390799746294, + "loss": 3.8221, + "step": 797 + }, + { + "epoch": 0.07223680637277088, + "grad_norm": 1.008435845375061, + "learning_rate": 0.0001975569413849027, + "loss": 3.8172, + "step": 798 + }, + { + "epoch": 0.07232732868652123, + "grad_norm": 0.9401419758796692, + "learning_rate": 0.0001975499649483611, + "loss": 3.8145, + "step": 799 + }, + { + "epoch": 0.07241785100027157, + "grad_norm": 0.8914586305618286, + "learning_rate": 0.00019754297868854073, + "loss": 3.7788, + "step": 800 + }, + { + "epoch": 0.0725083733140219, + "grad_norm": 0.9457998275756836, + "learning_rate": 0.00019753598260614506, + "loss": 3.8508, + "step": 801 + }, + { + "epoch": 0.07259889562777225, + "grad_norm": 0.9615713953971863, + "learning_rate": 0.0001975289767018786, + "loss": 3.8725, + "step": 802 + }, + { + "epoch": 0.07268941794152259, + "grad_norm": 0.942954421043396, + "learning_rate": 0.00019752196097644686, + "loss": 3.8043, + "step": 803 + }, + { + "epoch": 0.07277994025527293, + "grad_norm": 0.9813474416732788, + "learning_rate": 0.00019751493543055632, + "loss": 3.8567, + "step": 804 + }, + { + "epoch": 0.07287046256902327, + "grad_norm": 1.0663015842437744, + "learning_rate": 0.00019750790006491448, + "loss": 3.8372, + "step": 805 + }, + { + "epoch": 0.0729609848827736, + "grad_norm": 0.9190868139266968, + "learning_rate": 0.00019750085488022973, + "loss": 3.8005, + "step": 806 + }, + { + "epoch": 0.07305150719652394, + "grad_norm": 0.9721860289573669, + "learning_rate": 0.00019749379987721159, + "loss": 3.7956, + "step": 807 + }, + { + "epoch": 0.07314202951027428, + "grad_norm": 1.0076109170913696, + "learning_rate": 0.00019748673505657046, + "loss": 3.8849, + "step": 808 + }, + { + "epoch": 0.07323255182402462, + "grad_norm": 1.0171688795089722, + "learning_rate": 0.00019747966041901776, + "loss": 3.8125, + "step": 809 + }, + { + "epoch": 0.07332307413777496, + "grad_norm": 0.9697157740592957, + "learning_rate": 0.00019747257596526593, + "loss": 3.7726, + "step": 810 + }, + { + "epoch": 0.0734135964515253, + "grad_norm": 1.0516706705093384, + "learning_rate": 0.0001974654816960284, + "loss": 3.7536, + "step": 811 + }, + { + "epoch": 0.07350411876527564, + "grad_norm": 0.9496332406997681, + "learning_rate": 0.00019745837761201947, + "loss": 3.8467, + "step": 812 + }, + { + "epoch": 0.07359464107902598, + "grad_norm": 1.0452812910079956, + "learning_rate": 0.00019745126371395464, + "loss": 3.7487, + "step": 813 + }, + { + "epoch": 0.07368516339277632, + "grad_norm": 1.0293980836868286, + "learning_rate": 0.0001974441400025502, + "loss": 3.8284, + "step": 814 + }, + { + "epoch": 0.07377568570652666, + "grad_norm": 0.9970689415931702, + "learning_rate": 0.00019743700647852354, + "loss": 3.8618, + "step": 815 + }, + { + "epoch": 0.073866208020277, + "grad_norm": 0.9802175760269165, + "learning_rate": 0.00019742986314259299, + "loss": 3.8989, + "step": 816 + }, + { + "epoch": 0.07395673033402733, + "grad_norm": 0.945270299911499, + "learning_rate": 0.00019742270999547788, + "loss": 3.8061, + "step": 817 + }, + { + "epoch": 0.07404725264777767, + "grad_norm": 0.9834704399108887, + "learning_rate": 0.00019741554703789853, + "loss": 3.8592, + "step": 818 + }, + { + "epoch": 0.07413777496152801, + "grad_norm": 0.889618992805481, + "learning_rate": 0.00019740837427057625, + "loss": 3.7997, + "step": 819 + }, + { + "epoch": 0.07422829727527835, + "grad_norm": 0.9406912326812744, + "learning_rate": 0.00019740119169423337, + "loss": 3.827, + "step": 820 + }, + { + "epoch": 0.0743188195890287, + "grad_norm": 0.8791980147361755, + "learning_rate": 0.00019739399930959314, + "loss": 3.7941, + "step": 821 + }, + { + "epoch": 0.07440934190277904, + "grad_norm": 0.89346843957901, + "learning_rate": 0.00019738679711737983, + "loss": 3.7829, + "step": 822 + }, + { + "epoch": 0.07449986421652938, + "grad_norm": 0.9298461675643921, + "learning_rate": 0.00019737958511831875, + "loss": 3.8339, + "step": 823 + }, + { + "epoch": 0.07459038653027972, + "grad_norm": 0.9051544666290283, + "learning_rate": 0.00019737236331313608, + "loss": 3.8614, + "step": 824 + }, + { + "epoch": 0.07468090884403006, + "grad_norm": 1.0275598764419556, + "learning_rate": 0.00019736513170255911, + "loss": 3.8188, + "step": 825 + }, + { + "epoch": 0.0747714311577804, + "grad_norm": 1.0261467695236206, + "learning_rate": 0.00019735789028731604, + "loss": 3.7921, + "step": 826 + }, + { + "epoch": 0.07486195347153073, + "grad_norm": 0.916692852973938, + "learning_rate": 0.00019735063906813606, + "loss": 3.7529, + "step": 827 + }, + { + "epoch": 0.07495247578528107, + "grad_norm": 1.012143850326538, + "learning_rate": 0.00019734337804574943, + "loss": 3.8495, + "step": 828 + }, + { + "epoch": 0.07504299809903141, + "grad_norm": 0.9389269351959229, + "learning_rate": 0.00019733610722088725, + "loss": 3.8664, + "step": 829 + }, + { + "epoch": 0.07513352041278175, + "grad_norm": 0.9384870529174805, + "learning_rate": 0.00019732882659428177, + "loss": 3.7599, + "step": 830 + }, + { + "epoch": 0.07522404272653209, + "grad_norm": 0.9308289885520935, + "learning_rate": 0.00019732153616666608, + "loss": 3.877, + "step": 831 + }, + { + "epoch": 0.07531456504028243, + "grad_norm": 0.9849106669425964, + "learning_rate": 0.0001973142359387744, + "loss": 3.7972, + "step": 832 + }, + { + "epoch": 0.07540508735403277, + "grad_norm": 0.9762073159217834, + "learning_rate": 0.0001973069259113418, + "loss": 3.8222, + "step": 833 + }, + { + "epoch": 0.07549560966778311, + "grad_norm": 0.946532130241394, + "learning_rate": 0.00019729960608510445, + "loss": 3.8143, + "step": 834 + }, + { + "epoch": 0.07558613198153345, + "grad_norm": 0.9300776124000549, + "learning_rate": 0.00019729227646079938, + "loss": 3.7953, + "step": 835 + }, + { + "epoch": 0.0756766542952838, + "grad_norm": 0.9871339797973633, + "learning_rate": 0.00019728493703916478, + "loss": 3.7921, + "step": 836 + }, + { + "epoch": 0.07576717660903412, + "grad_norm": 0.932281494140625, + "learning_rate": 0.00019727758782093967, + "loss": 3.8262, + "step": 837 + }, + { + "epoch": 0.07585769892278446, + "grad_norm": 0.9992408752441406, + "learning_rate": 0.00019727022880686412, + "loss": 3.8204, + "step": 838 + }, + { + "epoch": 0.0759482212365348, + "grad_norm": 0.9957219958305359, + "learning_rate": 0.00019726285999767919, + "loss": 3.8209, + "step": 839 + }, + { + "epoch": 0.07603874355028514, + "grad_norm": 0.9381322264671326, + "learning_rate": 0.00019725548139412692, + "loss": 3.7818, + "step": 840 + }, + { + "epoch": 0.07612926586403548, + "grad_norm": 0.8691635727882385, + "learning_rate": 0.00019724809299695033, + "loss": 3.7704, + "step": 841 + }, + { + "epoch": 0.07621978817778582, + "grad_norm": 0.9367995858192444, + "learning_rate": 0.00019724069480689345, + "loss": 3.768, + "step": 842 + }, + { + "epoch": 0.07631031049153617, + "grad_norm": 0.9435257911682129, + "learning_rate": 0.00019723328682470125, + "loss": 3.7615, + "step": 843 + }, + { + "epoch": 0.0764008328052865, + "grad_norm": 0.9820596575737, + "learning_rate": 0.00019722586905111976, + "loss": 3.823, + "step": 844 + }, + { + "epoch": 0.07649135511903685, + "grad_norm": 1.0243360996246338, + "learning_rate": 0.00019721844148689588, + "loss": 3.8064, + "step": 845 + }, + { + "epoch": 0.07658187743278719, + "grad_norm": 1.0168771743774414, + "learning_rate": 0.00019721100413277762, + "loss": 3.856, + "step": 846 + }, + { + "epoch": 0.07667239974653753, + "grad_norm": 0.9951126575469971, + "learning_rate": 0.00019720355698951388, + "loss": 3.8397, + "step": 847 + }, + { + "epoch": 0.07676292206028786, + "grad_norm": 0.9725316166877747, + "learning_rate": 0.00019719610005785465, + "loss": 3.8152, + "step": 848 + }, + { + "epoch": 0.0768534443740382, + "grad_norm": 0.9000321626663208, + "learning_rate": 0.0001971886333385508, + "loss": 3.8176, + "step": 849 + }, + { + "epoch": 0.07694396668778854, + "grad_norm": 0.9348044991493225, + "learning_rate": 0.00019718115683235417, + "loss": 3.7787, + "step": 850 + }, + { + "epoch": 0.07703448900153888, + "grad_norm": 0.9273238778114319, + "learning_rate": 0.00019717367054001775, + "loss": 3.8014, + "step": 851 + }, + { + "epoch": 0.07712501131528922, + "grad_norm": 0.9387201070785522, + "learning_rate": 0.00019716617446229536, + "loss": 3.7383, + "step": 852 + }, + { + "epoch": 0.07721553362903956, + "grad_norm": 0.9373703002929688, + "learning_rate": 0.0001971586685999419, + "loss": 3.8054, + "step": 853 + }, + { + "epoch": 0.0773060559427899, + "grad_norm": 0.9412992596626282, + "learning_rate": 0.00019715115295371313, + "loss": 3.7198, + "step": 854 + }, + { + "epoch": 0.07739657825654024, + "grad_norm": 1.0220283269882202, + "learning_rate": 0.0001971436275243659, + "loss": 3.7441, + "step": 855 + }, + { + "epoch": 0.07748710057029058, + "grad_norm": 0.9344440698623657, + "learning_rate": 0.00019713609231265805, + "loss": 3.8045, + "step": 856 + }, + { + "epoch": 0.07757762288404092, + "grad_norm": 0.9369484186172485, + "learning_rate": 0.00019712854731934837, + "loss": 3.765, + "step": 857 + }, + { + "epoch": 0.07766814519779125, + "grad_norm": 0.8779152631759644, + "learning_rate": 0.00019712099254519665, + "loss": 3.7957, + "step": 858 + }, + { + "epoch": 0.07775866751154159, + "grad_norm": 0.9569523930549622, + "learning_rate": 0.00019711342799096361, + "loss": 3.7851, + "step": 859 + }, + { + "epoch": 0.07784918982529193, + "grad_norm": 0.930668294429779, + "learning_rate": 0.00019710585365741103, + "loss": 3.8031, + "step": 860 + }, + { + "epoch": 0.07793971213904227, + "grad_norm": 0.9617711305618286, + "learning_rate": 0.00019709826954530168, + "loss": 3.8352, + "step": 861 + }, + { + "epoch": 0.07803023445279261, + "grad_norm": 0.9281615018844604, + "learning_rate": 0.0001970906756553992, + "loss": 3.8318, + "step": 862 + }, + { + "epoch": 0.07812075676654295, + "grad_norm": 0.9399791359901428, + "learning_rate": 0.00019708307198846835, + "loss": 3.7991, + "step": 863 + }, + { + "epoch": 0.0782112790802933, + "grad_norm": 0.965398371219635, + "learning_rate": 0.0001970754585452748, + "loss": 3.7721, + "step": 864 + }, + { + "epoch": 0.07830180139404364, + "grad_norm": 0.9370290040969849, + "learning_rate": 0.00019706783532658526, + "loss": 3.8555, + "step": 865 + }, + { + "epoch": 0.07839232370779398, + "grad_norm": 0.9139227867126465, + "learning_rate": 0.00019706020233316735, + "loss": 3.7587, + "step": 866 + }, + { + "epoch": 0.07848284602154432, + "grad_norm": 0.9634401202201843, + "learning_rate": 0.00019705255956578972, + "loss": 3.7673, + "step": 867 + }, + { + "epoch": 0.07857336833529464, + "grad_norm": 1.0112007856369019, + "learning_rate": 0.00019704490702522197, + "loss": 3.7562, + "step": 868 + }, + { + "epoch": 0.07866389064904498, + "grad_norm": 0.9345536231994629, + "learning_rate": 0.00019703724471223475, + "loss": 3.8501, + "step": 869 + }, + { + "epoch": 0.07875441296279533, + "grad_norm": 0.9797864556312561, + "learning_rate": 0.00019702957262759965, + "loss": 3.7076, + "step": 870 + }, + { + "epoch": 0.07884493527654567, + "grad_norm": 0.9007889032363892, + "learning_rate": 0.00019702189077208922, + "loss": 3.7771, + "step": 871 + }, + { + "epoch": 0.07893545759029601, + "grad_norm": 0.9339497089385986, + "learning_rate": 0.0001970141991464771, + "loss": 3.8068, + "step": 872 + }, + { + "epoch": 0.07902597990404635, + "grad_norm": 0.9866631627082825, + "learning_rate": 0.00019700649775153774, + "loss": 3.733, + "step": 873 + }, + { + "epoch": 0.07911650221779669, + "grad_norm": 0.936284065246582, + "learning_rate": 0.00019699878658804672, + "loss": 3.7971, + "step": 874 + }, + { + "epoch": 0.07920702453154703, + "grad_norm": 1.0367450714111328, + "learning_rate": 0.0001969910656567805, + "loss": 3.8082, + "step": 875 + }, + { + "epoch": 0.07929754684529737, + "grad_norm": 0.9137488603591919, + "learning_rate": 0.0001969833349585167, + "loss": 3.779, + "step": 876 + }, + { + "epoch": 0.07938806915904771, + "grad_norm": 1.0584319829940796, + "learning_rate": 0.00019697559449403368, + "loss": 3.756, + "step": 877 + }, + { + "epoch": 0.07947859147279804, + "grad_norm": 0.9855899810791016, + "learning_rate": 0.00019696784426411097, + "loss": 3.774, + "step": 878 + }, + { + "epoch": 0.07956911378654838, + "grad_norm": 0.9397339820861816, + "learning_rate": 0.00019696008426952897, + "loss": 3.7419, + "step": 879 + }, + { + "epoch": 0.07965963610029872, + "grad_norm": 1.0694619417190552, + "learning_rate": 0.00019695231451106912, + "loss": 3.839, + "step": 880 + }, + { + "epoch": 0.07975015841404906, + "grad_norm": 0.9818793535232544, + "learning_rate": 0.0001969445349895139, + "loss": 3.8308, + "step": 881 + }, + { + "epoch": 0.0798406807277994, + "grad_norm": 1.0380256175994873, + "learning_rate": 0.00019693674570564663, + "loss": 3.7835, + "step": 882 + }, + { + "epoch": 0.07993120304154974, + "grad_norm": 0.9677836894989014, + "learning_rate": 0.00019692894666025176, + "loss": 3.7801, + "step": 883 + }, + { + "epoch": 0.08002172535530008, + "grad_norm": 0.9173345565795898, + "learning_rate": 0.00019692113785411456, + "loss": 3.6993, + "step": 884 + }, + { + "epoch": 0.08011224766905042, + "grad_norm": 0.9473771452903748, + "learning_rate": 0.00019691331928802143, + "loss": 3.7979, + "step": 885 + }, + { + "epoch": 0.08020276998280076, + "grad_norm": 1.0071412324905396, + "learning_rate": 0.00019690549096275972, + "loss": 3.8488, + "step": 886 + }, + { + "epoch": 0.0802932922965511, + "grad_norm": 0.9691148400306702, + "learning_rate": 0.00019689765287911773, + "loss": 3.7602, + "step": 887 + }, + { + "epoch": 0.08038381461030145, + "grad_norm": 0.9631767868995667, + "learning_rate": 0.00019688980503788475, + "loss": 3.764, + "step": 888 + }, + { + "epoch": 0.08047433692405177, + "grad_norm": 1.0345791578292847, + "learning_rate": 0.00019688194743985103, + "loss": 3.7938, + "step": 889 + }, + { + "epoch": 0.08056485923780211, + "grad_norm": 0.9610744714736938, + "learning_rate": 0.00019687408008580784, + "loss": 3.7792, + "step": 890 + }, + { + "epoch": 0.08065538155155245, + "grad_norm": 1.063826560974121, + "learning_rate": 0.00019686620297654748, + "loss": 3.6855, + "step": 891 + }, + { + "epoch": 0.0807459038653028, + "grad_norm": 0.9013906121253967, + "learning_rate": 0.0001968583161128631, + "loss": 3.7464, + "step": 892 + }, + { + "epoch": 0.08083642617905314, + "grad_norm": 0.9041231274604797, + "learning_rate": 0.00019685041949554896, + "loss": 3.7574, + "step": 893 + }, + { + "epoch": 0.08092694849280348, + "grad_norm": 0.9630007743835449, + "learning_rate": 0.00019684251312540024, + "loss": 3.7775, + "step": 894 + }, + { + "epoch": 0.08101747080655382, + "grad_norm": 0.9465706944465637, + "learning_rate": 0.00019683459700321304, + "loss": 3.7591, + "step": 895 + }, + { + "epoch": 0.08110799312030416, + "grad_norm": 0.8408172726631165, + "learning_rate": 0.00019682667112978463, + "loss": 3.7055, + "step": 896 + }, + { + "epoch": 0.0811985154340545, + "grad_norm": 0.9609735608100891, + "learning_rate": 0.00019681873550591306, + "loss": 3.6991, + "step": 897 + }, + { + "epoch": 0.08128903774780484, + "grad_norm": 0.9495016932487488, + "learning_rate": 0.00019681079013239748, + "loss": 3.7452, + "step": 898 + }, + { + "epoch": 0.08137956006155517, + "grad_norm": 1.0434441566467285, + "learning_rate": 0.00019680283501003797, + "loss": 3.7501, + "step": 899 + }, + { + "epoch": 0.08147008237530551, + "grad_norm": 0.9953208565711975, + "learning_rate": 0.00019679487013963564, + "loss": 3.8621, + "step": 900 + }, + { + "epoch": 0.08156060468905585, + "grad_norm": 0.9249243140220642, + "learning_rate": 0.0001967868955219925, + "loss": 3.6988, + "step": 901 + }, + { + "epoch": 0.08165112700280619, + "grad_norm": 1.0066379308700562, + "learning_rate": 0.0001967789111579117, + "loss": 3.7775, + "step": 902 + }, + { + "epoch": 0.08174164931655653, + "grad_norm": 1.0791577100753784, + "learning_rate": 0.00019677091704819715, + "loss": 3.7848, + "step": 903 + }, + { + "epoch": 0.08183217163030687, + "grad_norm": 1.0767844915390015, + "learning_rate": 0.00019676291319365387, + "loss": 3.7873, + "step": 904 + }, + { + "epoch": 0.08192269394405721, + "grad_norm": 0.9516428112983704, + "learning_rate": 0.00019675489959508792, + "loss": 3.8179, + "step": 905 + }, + { + "epoch": 0.08201321625780755, + "grad_norm": 1.0235987901687622, + "learning_rate": 0.00019674687625330625, + "loss": 3.7205, + "step": 906 + }, + { + "epoch": 0.0821037385715579, + "grad_norm": 0.9475753307342529, + "learning_rate": 0.00019673884316911673, + "loss": 3.7991, + "step": 907 + }, + { + "epoch": 0.08219426088530823, + "grad_norm": 1.0284664630889893, + "learning_rate": 0.0001967308003433284, + "loss": 3.7888, + "step": 908 + }, + { + "epoch": 0.08228478319905856, + "grad_norm": 0.9725825190544128, + "learning_rate": 0.0001967227477767511, + "loss": 3.7334, + "step": 909 + }, + { + "epoch": 0.0823753055128089, + "grad_norm": 1.0133692026138306, + "learning_rate": 0.00019671468547019573, + "loss": 3.6991, + "step": 910 + }, + { + "epoch": 0.08246582782655924, + "grad_norm": 0.9631583094596863, + "learning_rate": 0.0001967066134244742, + "loss": 3.7733, + "step": 911 + }, + { + "epoch": 0.08255635014030958, + "grad_norm": 0.8878318667411804, + "learning_rate": 0.00019669853164039933, + "loss": 3.7456, + "step": 912 + }, + { + "epoch": 0.08264687245405992, + "grad_norm": 0.9535197615623474, + "learning_rate": 0.00019669044011878497, + "loss": 3.7512, + "step": 913 + }, + { + "epoch": 0.08273739476781027, + "grad_norm": 0.9916483759880066, + "learning_rate": 0.00019668233886044597, + "loss": 3.7768, + "step": 914 + }, + { + "epoch": 0.0828279170815606, + "grad_norm": 0.8816608786582947, + "learning_rate": 0.00019667422786619806, + "loss": 3.7579, + "step": 915 + }, + { + "epoch": 0.08291843939531095, + "grad_norm": 0.9805126786231995, + "learning_rate": 0.00019666610713685802, + "loss": 3.6392, + "step": 916 + }, + { + "epoch": 0.08300896170906129, + "grad_norm": 1.065384864807129, + "learning_rate": 0.0001966579766732437, + "loss": 3.8084, + "step": 917 + }, + { + "epoch": 0.08309948402281163, + "grad_norm": 0.9200580716133118, + "learning_rate": 0.00019664983647617375, + "loss": 3.6934, + "step": 918 + }, + { + "epoch": 0.08319000633656196, + "grad_norm": 0.911810040473938, + "learning_rate": 0.00019664168654646787, + "loss": 3.7117, + "step": 919 + }, + { + "epoch": 0.0832805286503123, + "grad_norm": 0.9342407584190369, + "learning_rate": 0.00019663352688494684, + "loss": 3.8123, + "step": 920 + }, + { + "epoch": 0.08337105096406264, + "grad_norm": 0.9220612645149231, + "learning_rate": 0.00019662535749243233, + "loss": 3.7095, + "step": 921 + }, + { + "epoch": 0.08346157327781298, + "grad_norm": 0.9277602434158325, + "learning_rate": 0.0001966171783697469, + "loss": 3.7442, + "step": 922 + }, + { + "epoch": 0.08355209559156332, + "grad_norm": 0.9696083068847656, + "learning_rate": 0.0001966089895177143, + "loss": 3.6917, + "step": 923 + }, + { + "epoch": 0.08364261790531366, + "grad_norm": 0.9408513307571411, + "learning_rate": 0.00019660079093715906, + "loss": 3.7603, + "step": 924 + }, + { + "epoch": 0.083733140219064, + "grad_norm": 0.9134141802787781, + "learning_rate": 0.00019659258262890683, + "loss": 3.7554, + "step": 925 + }, + { + "epoch": 0.08382366253281434, + "grad_norm": 1.1565461158752441, + "learning_rate": 0.0001965843645937842, + "loss": 3.7358, + "step": 926 + }, + { + "epoch": 0.08391418484656468, + "grad_norm": 0.9286847114562988, + "learning_rate": 0.00019657613683261865, + "loss": 3.7097, + "step": 927 + }, + { + "epoch": 0.08400470716031502, + "grad_norm": 0.9701700806617737, + "learning_rate": 0.00019656789934623881, + "loss": 3.6998, + "step": 928 + }, + { + "epoch": 0.08409522947406536, + "grad_norm": 0.9360522627830505, + "learning_rate": 0.0001965596521354741, + "loss": 3.8173, + "step": 929 + }, + { + "epoch": 0.08418575178781569, + "grad_norm": 0.9045924544334412, + "learning_rate": 0.0001965513952011551, + "loss": 3.7291, + "step": 930 + }, + { + "epoch": 0.08427627410156603, + "grad_norm": 1.012802243232727, + "learning_rate": 0.00019654312854411324, + "loss": 3.767, + "step": 931 + }, + { + "epoch": 0.08436679641531637, + "grad_norm": 1.0733472108840942, + "learning_rate": 0.00019653485216518095, + "loss": 3.727, + "step": 932 + }, + { + "epoch": 0.08445731872906671, + "grad_norm": 0.977625846862793, + "learning_rate": 0.00019652656606519172, + "loss": 3.7632, + "step": 933 + }, + { + "epoch": 0.08454784104281705, + "grad_norm": 0.9703370928764343, + "learning_rate": 0.0001965182702449799, + "loss": 3.7985, + "step": 934 + }, + { + "epoch": 0.0846383633565674, + "grad_norm": 1.000142216682434, + "learning_rate": 0.0001965099647053809, + "loss": 3.737, + "step": 935 + }, + { + "epoch": 0.08472888567031774, + "grad_norm": 0.9989868402481079, + "learning_rate": 0.00019650164944723115, + "loss": 3.7328, + "step": 936 + }, + { + "epoch": 0.08481940798406808, + "grad_norm": 1.0792580842971802, + "learning_rate": 0.0001964933244713679, + "loss": 3.7435, + "step": 937 + }, + { + "epoch": 0.08490993029781842, + "grad_norm": 0.9847734570503235, + "learning_rate": 0.00019648498977862952, + "loss": 3.7951, + "step": 938 + }, + { + "epoch": 0.08500045261156876, + "grad_norm": 1.0123600959777832, + "learning_rate": 0.00019647664536985536, + "loss": 3.7984, + "step": 939 + }, + { + "epoch": 0.08509097492531909, + "grad_norm": 0.9911985993385315, + "learning_rate": 0.0001964682912458856, + "loss": 3.7909, + "step": 940 + }, + { + "epoch": 0.08518149723906943, + "grad_norm": 0.9254298210144043, + "learning_rate": 0.00019645992740756154, + "loss": 3.8021, + "step": 941 + }, + { + "epoch": 0.08527201955281977, + "grad_norm": 0.912542462348938, + "learning_rate": 0.00019645155385572544, + "loss": 3.7413, + "step": 942 + }, + { + "epoch": 0.08536254186657011, + "grad_norm": 0.9094882607460022, + "learning_rate": 0.00019644317059122051, + "loss": 3.7391, + "step": 943 + }, + { + "epoch": 0.08545306418032045, + "grad_norm": 0.9295530319213867, + "learning_rate": 0.00019643477761489096, + "loss": 3.7333, + "step": 944 + }, + { + "epoch": 0.08554358649407079, + "grad_norm": 0.9964175224304199, + "learning_rate": 0.0001964263749275819, + "loss": 3.7959, + "step": 945 + }, + { + "epoch": 0.08563410880782113, + "grad_norm": 0.9350283145904541, + "learning_rate": 0.00019641796253013958, + "loss": 3.807, + "step": 946 + }, + { + "epoch": 0.08572463112157147, + "grad_norm": 0.9795020222663879, + "learning_rate": 0.00019640954042341103, + "loss": 3.6858, + "step": 947 + }, + { + "epoch": 0.08581515343532181, + "grad_norm": 0.9330901503562927, + "learning_rate": 0.0001964011086082444, + "loss": 3.8028, + "step": 948 + }, + { + "epoch": 0.08590567574907215, + "grad_norm": 0.9229695200920105, + "learning_rate": 0.00019639266708548878, + "loss": 3.6575, + "step": 949 + }, + { + "epoch": 0.08599619806282248, + "grad_norm": 0.99271160364151, + "learning_rate": 0.00019638421585599423, + "loss": 3.7792, + "step": 950 + }, + { + "epoch": 0.08608672037657282, + "grad_norm": 0.9463202357292175, + "learning_rate": 0.00019637575492061174, + "loss": 3.8157, + "step": 951 + }, + { + "epoch": 0.08617724269032316, + "grad_norm": 1.0383622646331787, + "learning_rate": 0.0001963672842801934, + "loss": 3.6997, + "step": 952 + }, + { + "epoch": 0.0862677650040735, + "grad_norm": 1.0432873964309692, + "learning_rate": 0.0001963588039355922, + "loss": 3.7557, + "step": 953 + }, + { + "epoch": 0.08635828731782384, + "grad_norm": 0.914127767086029, + "learning_rate": 0.00019635031388766204, + "loss": 3.7724, + "step": 954 + }, + { + "epoch": 0.08644880963157418, + "grad_norm": 0.897434651851654, + "learning_rate": 0.0001963418141372579, + "loss": 3.7032, + "step": 955 + }, + { + "epoch": 0.08653933194532452, + "grad_norm": 1.049588918685913, + "learning_rate": 0.00019633330468523573, + "loss": 3.7061, + "step": 956 + }, + { + "epoch": 0.08662985425907486, + "grad_norm": 0.9720751047134399, + "learning_rate": 0.00019632478553245243, + "loss": 3.7529, + "step": 957 + }, + { + "epoch": 0.0867203765728252, + "grad_norm": 0.8728967905044556, + "learning_rate": 0.00019631625667976583, + "loss": 3.6846, + "step": 958 + }, + { + "epoch": 0.08681089888657555, + "grad_norm": 0.9369397759437561, + "learning_rate": 0.00019630771812803482, + "loss": 3.7338, + "step": 959 + }, + { + "epoch": 0.08690142120032587, + "grad_norm": 0.9343727231025696, + "learning_rate": 0.00019629916987811926, + "loss": 3.7064, + "step": 960 + }, + { + "epoch": 0.08699194351407621, + "grad_norm": 0.8479828238487244, + "learning_rate": 0.0001962906119308799, + "loss": 3.6929, + "step": 961 + }, + { + "epoch": 0.08708246582782656, + "grad_norm": 0.8664740920066833, + "learning_rate": 0.00019628204428717857, + "loss": 3.7699, + "step": 962 + }, + { + "epoch": 0.0871729881415769, + "grad_norm": 0.9612951278686523, + "learning_rate": 0.00019627346694787797, + "loss": 3.7328, + "step": 963 + }, + { + "epoch": 0.08726351045532724, + "grad_norm": 0.9078434109687805, + "learning_rate": 0.00019626487991384196, + "loss": 3.7624, + "step": 964 + }, + { + "epoch": 0.08735403276907758, + "grad_norm": 1.0075331926345825, + "learning_rate": 0.00019625628318593514, + "loss": 3.7806, + "step": 965 + }, + { + "epoch": 0.08744455508282792, + "grad_norm": 0.8739338517189026, + "learning_rate": 0.0001962476767650232, + "loss": 3.7445, + "step": 966 + }, + { + "epoch": 0.08753507739657826, + "grad_norm": 0.9157695174217224, + "learning_rate": 0.00019623906065197288, + "loss": 3.7081, + "step": 967 + }, + { + "epoch": 0.0876255997103286, + "grad_norm": 0.9016348719596863, + "learning_rate": 0.0001962304348476518, + "loss": 3.7984, + "step": 968 + }, + { + "epoch": 0.08771612202407894, + "grad_norm": 0.9907736778259277, + "learning_rate": 0.00019622179935292855, + "loss": 3.7199, + "step": 969 + }, + { + "epoch": 0.08780664433782928, + "grad_norm": 0.8860751390457153, + "learning_rate": 0.00019621315416867274, + "loss": 3.6944, + "step": 970 + }, + { + "epoch": 0.08789716665157961, + "grad_norm": 0.8661206960678101, + "learning_rate": 0.00019620449929575492, + "loss": 3.6799, + "step": 971 + }, + { + "epoch": 0.08798768896532995, + "grad_norm": 0.8809290528297424, + "learning_rate": 0.00019619583473504668, + "loss": 3.7001, + "step": 972 + }, + { + "epoch": 0.08807821127908029, + "grad_norm": 0.8480473160743713, + "learning_rate": 0.00019618716048742048, + "loss": 3.7303, + "step": 973 + }, + { + "epoch": 0.08816873359283063, + "grad_norm": 0.9732685685157776, + "learning_rate": 0.0001961784765537499, + "loss": 3.7068, + "step": 974 + }, + { + "epoch": 0.08825925590658097, + "grad_norm": 0.9032775163650513, + "learning_rate": 0.0001961697829349093, + "loss": 3.6811, + "step": 975 + }, + { + "epoch": 0.08834977822033131, + "grad_norm": 0.9440922141075134, + "learning_rate": 0.00019616107963177425, + "loss": 3.7028, + "step": 976 + }, + { + "epoch": 0.08844030053408165, + "grad_norm": 0.967886745929718, + "learning_rate": 0.00019615236664522105, + "loss": 3.6888, + "step": 977 + }, + { + "epoch": 0.088530822847832, + "grad_norm": 0.9971091151237488, + "learning_rate": 0.00019614364397612722, + "loss": 3.7256, + "step": 978 + }, + { + "epoch": 0.08862134516158233, + "grad_norm": 0.9110148549079895, + "learning_rate": 0.00019613491162537105, + "loss": 3.7195, + "step": 979 + }, + { + "epoch": 0.08871186747533268, + "grad_norm": 0.9255290627479553, + "learning_rate": 0.0001961261695938319, + "loss": 3.7545, + "step": 980 + }, + { + "epoch": 0.088802389789083, + "grad_norm": 0.9252705574035645, + "learning_rate": 0.00019611741788239012, + "loss": 3.7555, + "step": 981 + }, + { + "epoch": 0.08889291210283334, + "grad_norm": 0.9513722062110901, + "learning_rate": 0.00019610865649192697, + "loss": 3.7356, + "step": 982 + }, + { + "epoch": 0.08898343441658368, + "grad_norm": 0.9082736372947693, + "learning_rate": 0.00019609988542332473, + "loss": 3.7165, + "step": 983 + }, + { + "epoch": 0.08907395673033403, + "grad_norm": 0.9556236267089844, + "learning_rate": 0.00019609110467746666, + "loss": 3.6943, + "step": 984 + }, + { + "epoch": 0.08916447904408437, + "grad_norm": 0.9560606479644775, + "learning_rate": 0.00019608231425523702, + "loss": 3.7364, + "step": 985 + }, + { + "epoch": 0.0892550013578347, + "grad_norm": 0.9406256675720215, + "learning_rate": 0.00019607351415752095, + "loss": 3.6523, + "step": 986 + }, + { + "epoch": 0.08934552367158505, + "grad_norm": 0.8736352324485779, + "learning_rate": 0.00019606470438520462, + "loss": 3.6718, + "step": 987 + }, + { + "epoch": 0.08943604598533539, + "grad_norm": 0.9579623341560364, + "learning_rate": 0.00019605588493917518, + "loss": 3.6735, + "step": 988 + }, + { + "epoch": 0.08952656829908573, + "grad_norm": 1.038737177848816, + "learning_rate": 0.00019604705582032078, + "loss": 3.7648, + "step": 989 + }, + { + "epoch": 0.08961709061283607, + "grad_norm": 0.8953343033790588, + "learning_rate": 0.00019603821702953046, + "loss": 3.7876, + "step": 990 + }, + { + "epoch": 0.0897076129265864, + "grad_norm": 0.9059159159660339, + "learning_rate": 0.0001960293685676943, + "loss": 3.7621, + "step": 991 + }, + { + "epoch": 0.08979813524033674, + "grad_norm": 0.9297139644622803, + "learning_rate": 0.0001960205104357034, + "loss": 3.7517, + "step": 992 + }, + { + "epoch": 0.08988865755408708, + "grad_norm": 1.069353461265564, + "learning_rate": 0.0001960116426344497, + "loss": 3.6651, + "step": 993 + }, + { + "epoch": 0.08997917986783742, + "grad_norm": 0.9858116507530212, + "learning_rate": 0.00019600276516482622, + "loss": 3.7341, + "step": 994 + }, + { + "epoch": 0.09006970218158776, + "grad_norm": 0.93048495054245, + "learning_rate": 0.0001959938780277269, + "loss": 3.7052, + "step": 995 + }, + { + "epoch": 0.0901602244953381, + "grad_norm": 0.8646350502967834, + "learning_rate": 0.00019598498122404672, + "loss": 3.6858, + "step": 996 + }, + { + "epoch": 0.09025074680908844, + "grad_norm": 0.9945192337036133, + "learning_rate": 0.00019597607475468153, + "loss": 3.6827, + "step": 997 + }, + { + "epoch": 0.09034126912283878, + "grad_norm": 0.9069668054580688, + "learning_rate": 0.00019596715862052824, + "loss": 3.7084, + "step": 998 + }, + { + "epoch": 0.09043179143658912, + "grad_norm": 0.8991177082061768, + "learning_rate": 0.00019595823282248472, + "loss": 3.6903, + "step": 999 + }, + { + "epoch": 0.09052231375033946, + "grad_norm": 0.8807759284973145, + "learning_rate": 0.00019594929736144976, + "loss": 3.6824, + "step": 1000 + }, + { + "epoch": 0.09052231375033946, + "eval_loss": 3.6549718379974365, + "eval_runtime": 71.8769, + "eval_samples_per_second": 37.606, + "eval_steps_per_second": 3.144, + "step": 1000 + }, + { + "epoch": 0.0906128360640898, + "grad_norm": 0.952815055847168, + "learning_rate": 0.0001959403522383232, + "loss": 3.6511, + "step": 1001 + }, + { + "epoch": 0.09070335837784013, + "grad_norm": 0.9049227237701416, + "learning_rate": 0.00019593139745400576, + "loss": 3.6608, + "step": 1002 + }, + { + "epoch": 0.09079388069159047, + "grad_norm": 0.9202782511711121, + "learning_rate": 0.00019592243300939926, + "loss": 3.7317, + "step": 1003 + }, + { + "epoch": 0.09088440300534081, + "grad_norm": 0.9427291750907898, + "learning_rate": 0.00019591345890540636, + "loss": 3.7335, + "step": 1004 + }, + { + "epoch": 0.09097492531909115, + "grad_norm": 0.8800294399261475, + "learning_rate": 0.00019590447514293078, + "loss": 3.6614, + "step": 1005 + }, + { + "epoch": 0.0910654476328415, + "grad_norm": 0.9174665212631226, + "learning_rate": 0.00019589548172287719, + "loss": 3.6391, + "step": 1006 + }, + { + "epoch": 0.09115596994659184, + "grad_norm": 0.8996450901031494, + "learning_rate": 0.00019588647864615118, + "loss": 3.7422, + "step": 1007 + }, + { + "epoch": 0.09124649226034218, + "grad_norm": 0.9029880166053772, + "learning_rate": 0.00019587746591365941, + "loss": 3.6324, + "step": 1008 + }, + { + "epoch": 0.09133701457409252, + "grad_norm": 0.9739851355552673, + "learning_rate": 0.00019586844352630943, + "loss": 3.7409, + "step": 1009 + }, + { + "epoch": 0.09142753688784286, + "grad_norm": 0.9311057925224304, + "learning_rate": 0.00019585941148500985, + "loss": 3.7517, + "step": 1010 + }, + { + "epoch": 0.0915180592015932, + "grad_norm": 0.9630916714668274, + "learning_rate": 0.00019585036979067013, + "loss": 3.6949, + "step": 1011 + }, + { + "epoch": 0.09160858151534353, + "grad_norm": 1.0906858444213867, + "learning_rate": 0.00019584131844420082, + "loss": 3.644, + "step": 1012 + }, + { + "epoch": 0.09169910382909387, + "grad_norm": 1.0149617195129395, + "learning_rate": 0.00019583225744651333, + "loss": 3.6535, + "step": 1013 + }, + { + "epoch": 0.09178962614284421, + "grad_norm": 1.0162272453308105, + "learning_rate": 0.00019582318679852017, + "loss": 3.7233, + "step": 1014 + }, + { + "epoch": 0.09188014845659455, + "grad_norm": 0.9386601448059082, + "learning_rate": 0.0001958141065011347, + "loss": 3.7117, + "step": 1015 + }, + { + "epoch": 0.09197067077034489, + "grad_norm": 1.0775753259658813, + "learning_rate": 0.00019580501655527133, + "loss": 3.7124, + "step": 1016 + }, + { + "epoch": 0.09206119308409523, + "grad_norm": 1.0624293088912964, + "learning_rate": 0.0001957959169618454, + "loss": 3.7581, + "step": 1017 + }, + { + "epoch": 0.09215171539784557, + "grad_norm": 0.9643511176109314, + "learning_rate": 0.00019578680772177327, + "loss": 3.6738, + "step": 1018 + }, + { + "epoch": 0.09224223771159591, + "grad_norm": 0.9284974336624146, + "learning_rate": 0.00019577768883597224, + "loss": 3.7531, + "step": 1019 + }, + { + "epoch": 0.09233276002534625, + "grad_norm": 0.9831129908561707, + "learning_rate": 0.00019576856030536054, + "loss": 3.738, + "step": 1020 + }, + { + "epoch": 0.0924232823390966, + "grad_norm": 1.0086435079574585, + "learning_rate": 0.00019575942213085744, + "loss": 3.6976, + "step": 1021 + }, + { + "epoch": 0.09251380465284692, + "grad_norm": 1.039759874343872, + "learning_rate": 0.00019575027431338314, + "loss": 3.7301, + "step": 1022 + }, + { + "epoch": 0.09260432696659726, + "grad_norm": 1.0100873708724976, + "learning_rate": 0.00019574111685385887, + "loss": 3.6884, + "step": 1023 + }, + { + "epoch": 0.0926948492803476, + "grad_norm": 1.0146422386169434, + "learning_rate": 0.00019573194975320673, + "loss": 3.6353, + "step": 1024 + }, + { + "epoch": 0.09278537159409794, + "grad_norm": 1.0001540184020996, + "learning_rate": 0.00019572277301234986, + "loss": 3.7716, + "step": 1025 + }, + { + "epoch": 0.09287589390784828, + "grad_norm": 0.8770668506622314, + "learning_rate": 0.0001957135866322124, + "loss": 3.7044, + "step": 1026 + }, + { + "epoch": 0.09296641622159862, + "grad_norm": 0.9398157000541687, + "learning_rate": 0.00019570439061371937, + "loss": 3.7006, + "step": 1027 + }, + { + "epoch": 0.09305693853534897, + "grad_norm": 0.9291334748268127, + "learning_rate": 0.00019569518495779682, + "loss": 3.6211, + "step": 1028 + }, + { + "epoch": 0.0931474608490993, + "grad_norm": 0.9487553238868713, + "learning_rate": 0.00019568596966537175, + "loss": 3.7016, + "step": 1029 + }, + { + "epoch": 0.09323798316284965, + "grad_norm": 0.9963234066963196, + "learning_rate": 0.00019567674473737218, + "loss": 3.6843, + "step": 1030 + }, + { + "epoch": 0.09332850547659999, + "grad_norm": 0.9393078088760376, + "learning_rate": 0.00019566751017472702, + "loss": 3.7114, + "step": 1031 + }, + { + "epoch": 0.09341902779035031, + "grad_norm": 0.978520393371582, + "learning_rate": 0.0001956582659783662, + "loss": 3.6871, + "step": 1032 + }, + { + "epoch": 0.09350955010410066, + "grad_norm": 1.07383131980896, + "learning_rate": 0.00019564901214922062, + "loss": 3.7456, + "step": 1033 + }, + { + "epoch": 0.093600072417851, + "grad_norm": 0.89312744140625, + "learning_rate": 0.00019563974868822212, + "loss": 3.7178, + "step": 1034 + }, + { + "epoch": 0.09369059473160134, + "grad_norm": 0.9418454766273499, + "learning_rate": 0.00019563047559630357, + "loss": 3.6676, + "step": 1035 + }, + { + "epoch": 0.09378111704535168, + "grad_norm": 0.9237220287322998, + "learning_rate": 0.00019562119287439873, + "loss": 3.6844, + "step": 1036 + }, + { + "epoch": 0.09387163935910202, + "grad_norm": 0.8427319526672363, + "learning_rate": 0.00019561190052344238, + "loss": 3.6416, + "step": 1037 + }, + { + "epoch": 0.09396216167285236, + "grad_norm": 0.9645209312438965, + "learning_rate": 0.00019560259854437025, + "loss": 3.6922, + "step": 1038 + }, + { + "epoch": 0.0940526839866027, + "grad_norm": 0.9387515783309937, + "learning_rate": 0.00019559328693811908, + "loss": 3.6764, + "step": 1039 + }, + { + "epoch": 0.09414320630035304, + "grad_norm": 1.045303225517273, + "learning_rate": 0.0001955839657056265, + "loss": 3.6943, + "step": 1040 + }, + { + "epoch": 0.09423372861410338, + "grad_norm": 0.9981943964958191, + "learning_rate": 0.00019557463484783124, + "loss": 3.6776, + "step": 1041 + }, + { + "epoch": 0.09432425092785372, + "grad_norm": 0.9997328519821167, + "learning_rate": 0.00019556529436567287, + "loss": 3.6776, + "step": 1042 + }, + { + "epoch": 0.09441477324160405, + "grad_norm": 0.9236479997634888, + "learning_rate": 0.00019555594426009192, + "loss": 3.7411, + "step": 1043 + }, + { + "epoch": 0.09450529555535439, + "grad_norm": 0.9315744638442993, + "learning_rate": 0.00019554658453203004, + "loss": 3.6653, + "step": 1044 + }, + { + "epoch": 0.09459581786910473, + "grad_norm": 0.9356442093849182, + "learning_rate": 0.00019553721518242968, + "loss": 3.736, + "step": 1045 + }, + { + "epoch": 0.09468634018285507, + "grad_norm": 0.9334856271743774, + "learning_rate": 0.00019552783621223436, + "loss": 3.7031, + "step": 1046 + }, + { + "epoch": 0.09477686249660541, + "grad_norm": 1.04819655418396, + "learning_rate": 0.00019551844762238858, + "loss": 3.6478, + "step": 1047 + }, + { + "epoch": 0.09486738481035575, + "grad_norm": 1.0170056819915771, + "learning_rate": 0.00019550904941383773, + "loss": 3.6761, + "step": 1048 + }, + { + "epoch": 0.0949579071241061, + "grad_norm": 0.9866304397583008, + "learning_rate": 0.00019549964158752822, + "loss": 3.6145, + "step": 1049 + }, + { + "epoch": 0.09504842943785644, + "grad_norm": 0.9720956087112427, + "learning_rate": 0.0001954902241444074, + "loss": 3.6766, + "step": 1050 + }, + { + "epoch": 0.09513895175160678, + "grad_norm": 0.9859123229980469, + "learning_rate": 0.00019548079708542364, + "loss": 3.7507, + "step": 1051 + }, + { + "epoch": 0.09522947406535712, + "grad_norm": 1.0696841478347778, + "learning_rate": 0.0001954713604115262, + "loss": 3.7064, + "step": 1052 + }, + { + "epoch": 0.09531999637910744, + "grad_norm": 1.0893000364303589, + "learning_rate": 0.00019546191412366543, + "loss": 3.7356, + "step": 1053 + }, + { + "epoch": 0.09541051869285778, + "grad_norm": 0.9062834978103638, + "learning_rate": 0.00019545245822279243, + "loss": 3.7005, + "step": 1054 + }, + { + "epoch": 0.09550104100660813, + "grad_norm": 1.0514686107635498, + "learning_rate": 0.00019544299270985956, + "loss": 3.7077, + "step": 1055 + }, + { + "epoch": 0.09559156332035847, + "grad_norm": 0.9726833701133728, + "learning_rate": 0.00019543351758581994, + "loss": 3.6598, + "step": 1056 + }, + { + "epoch": 0.09568208563410881, + "grad_norm": 0.9796419143676758, + "learning_rate": 0.0001954240328516277, + "loss": 3.5648, + "step": 1057 + }, + { + "epoch": 0.09577260794785915, + "grad_norm": 0.9635905623435974, + "learning_rate": 0.00019541453850823793, + "loss": 3.669, + "step": 1058 + }, + { + "epoch": 0.09586313026160949, + "grad_norm": 0.8129740953445435, + "learning_rate": 0.0001954050345566068, + "loss": 3.6587, + "step": 1059 + }, + { + "epoch": 0.09595365257535983, + "grad_norm": 0.9021315574645996, + "learning_rate": 0.00019539552099769126, + "loss": 3.6621, + "step": 1060 + }, + { + "epoch": 0.09604417488911017, + "grad_norm": 0.948771595954895, + "learning_rate": 0.00019538599783244939, + "loss": 3.6992, + "step": 1061 + }, + { + "epoch": 0.09613469720286051, + "grad_norm": 0.9002098441123962, + "learning_rate": 0.00019537646506184015, + "loss": 3.6613, + "step": 1062 + }, + { + "epoch": 0.09622521951661084, + "grad_norm": 0.9096206426620483, + "learning_rate": 0.00019536692268682347, + "loss": 3.7313, + "step": 1063 + }, + { + "epoch": 0.09631574183036118, + "grad_norm": 0.9365320205688477, + "learning_rate": 0.00019535737070836028, + "loss": 3.6228, + "step": 1064 + }, + { + "epoch": 0.09640626414411152, + "grad_norm": 0.9058887362480164, + "learning_rate": 0.0001953478091274125, + "loss": 3.703, + "step": 1065 + }, + { + "epoch": 0.09649678645786186, + "grad_norm": 0.8754643797874451, + "learning_rate": 0.00019533823794494292, + "loss": 3.643, + "step": 1066 + }, + { + "epoch": 0.0965873087716122, + "grad_norm": 0.9260498881340027, + "learning_rate": 0.0001953286571619154, + "loss": 3.6299, + "step": 1067 + }, + { + "epoch": 0.09667783108536254, + "grad_norm": 0.9123169779777527, + "learning_rate": 0.0001953190667792947, + "loss": 3.7424, + "step": 1068 + }, + { + "epoch": 0.09676835339911288, + "grad_norm": 0.9547523856163025, + "learning_rate": 0.0001953094667980466, + "loss": 3.6272, + "step": 1069 + }, + { + "epoch": 0.09685887571286322, + "grad_norm": 0.8996102809906006, + "learning_rate": 0.00019529985721913778, + "loss": 3.6572, + "step": 1070 + }, + { + "epoch": 0.09694939802661356, + "grad_norm": 0.9247338771820068, + "learning_rate": 0.00019529023804353596, + "loss": 3.7018, + "step": 1071 + }, + { + "epoch": 0.0970399203403639, + "grad_norm": 0.9439358711242676, + "learning_rate": 0.0001952806092722098, + "loss": 3.641, + "step": 1072 + }, + { + "epoch": 0.09713044265411423, + "grad_norm": 0.9879926443099976, + "learning_rate": 0.00019527097090612887, + "loss": 3.7011, + "step": 1073 + }, + { + "epoch": 0.09722096496786457, + "grad_norm": 0.8817611932754517, + "learning_rate": 0.0001952613229462638, + "loss": 3.6628, + "step": 1074 + }, + { + "epoch": 0.09731148728161491, + "grad_norm": 0.9118003845214844, + "learning_rate": 0.00019525166539358606, + "loss": 3.6269, + "step": 1075 + }, + { + "epoch": 0.09740200959536525, + "grad_norm": 0.9752653241157532, + "learning_rate": 0.00019524199824906826, + "loss": 3.6148, + "step": 1076 + }, + { + "epoch": 0.0974925319091156, + "grad_norm": 0.8902285099029541, + "learning_rate": 0.00019523232151368383, + "loss": 3.6231, + "step": 1077 + }, + { + "epoch": 0.09758305422286594, + "grad_norm": 1.024625301361084, + "learning_rate": 0.0001952226351884072, + "loss": 3.7061, + "step": 1078 + }, + { + "epoch": 0.09767357653661628, + "grad_norm": 0.9171874523162842, + "learning_rate": 0.00019521293927421388, + "loss": 3.7018, + "step": 1079 + }, + { + "epoch": 0.09776409885036662, + "grad_norm": 0.9678434133529663, + "learning_rate": 0.00019520323377208017, + "loss": 3.6325, + "step": 1080 + }, + { + "epoch": 0.09785462116411696, + "grad_norm": 0.923358142375946, + "learning_rate": 0.00019519351868298336, + "loss": 3.5903, + "step": 1081 + }, + { + "epoch": 0.0979451434778673, + "grad_norm": 0.8904416561126709, + "learning_rate": 0.00019518379400790189, + "loss": 3.6347, + "step": 1082 + }, + { + "epoch": 0.09803566579161764, + "grad_norm": 0.9834840893745422, + "learning_rate": 0.00019517405974781493, + "loss": 3.7031, + "step": 1083 + }, + { + "epoch": 0.09812618810536797, + "grad_norm": 0.9426645040512085, + "learning_rate": 0.00019516431590370278, + "loss": 3.675, + "step": 1084 + }, + { + "epoch": 0.09821671041911831, + "grad_norm": 0.8758911490440369, + "learning_rate": 0.0001951545624765466, + "loss": 3.7114, + "step": 1085 + }, + { + "epoch": 0.09830723273286865, + "grad_norm": 0.9593923687934875, + "learning_rate": 0.0001951447994673286, + "loss": 3.6828, + "step": 1086 + }, + { + "epoch": 0.09839775504661899, + "grad_norm": 0.8801664113998413, + "learning_rate": 0.00019513502687703189, + "loss": 3.6278, + "step": 1087 + }, + { + "epoch": 0.09848827736036933, + "grad_norm": 0.9123901128768921, + "learning_rate": 0.00019512524470664057, + "loss": 3.5551, + "step": 1088 + }, + { + "epoch": 0.09857879967411967, + "grad_norm": 0.8368750810623169, + "learning_rate": 0.00019511545295713972, + "loss": 3.6159, + "step": 1089 + }, + { + "epoch": 0.09866932198787001, + "grad_norm": 0.8892086148262024, + "learning_rate": 0.00019510565162951537, + "loss": 3.669, + "step": 1090 + }, + { + "epoch": 0.09875984430162035, + "grad_norm": 0.8537901639938354, + "learning_rate": 0.0001950958407247545, + "loss": 3.5952, + "step": 1091 + }, + { + "epoch": 0.0988503666153707, + "grad_norm": 0.8904717564582825, + "learning_rate": 0.00019508602024384506, + "loss": 3.7375, + "step": 1092 + }, + { + "epoch": 0.09894088892912103, + "grad_norm": 0.8963021636009216, + "learning_rate": 0.00019507619018777596, + "loss": 3.6784, + "step": 1093 + }, + { + "epoch": 0.09903141124287136, + "grad_norm": 0.9099161028862, + "learning_rate": 0.00019506635055753716, + "loss": 3.6208, + "step": 1094 + }, + { + "epoch": 0.0991219335566217, + "grad_norm": 0.9092404842376709, + "learning_rate": 0.00019505650135411942, + "loss": 3.6476, + "step": 1095 + }, + { + "epoch": 0.09921245587037204, + "grad_norm": 0.9447798728942871, + "learning_rate": 0.0001950466425785146, + "loss": 3.636, + "step": 1096 + }, + { + "epoch": 0.09930297818412238, + "grad_norm": 0.8991760611534119, + "learning_rate": 0.0001950367742317155, + "loss": 3.6676, + "step": 1097 + }, + { + "epoch": 0.09939350049787272, + "grad_norm": 0.8840291500091553, + "learning_rate": 0.00019502689631471583, + "loss": 3.635, + "step": 1098 + }, + { + "epoch": 0.09948402281162307, + "grad_norm": 0.9281356930732727, + "learning_rate": 0.0001950170088285103, + "loss": 3.612, + "step": 1099 + }, + { + "epoch": 0.0995745451253734, + "grad_norm": 0.8732534646987915, + "learning_rate": 0.00019500711177409454, + "loss": 3.6033, + "step": 1100 + }, + { + "epoch": 0.09966506743912375, + "grad_norm": 0.9408723711967468, + "learning_rate": 0.00019499720515246525, + "loss": 3.599, + "step": 1101 + }, + { + "epoch": 0.09975558975287409, + "grad_norm": 0.9561828970909119, + "learning_rate": 0.00019498728896462, + "loss": 3.6431, + "step": 1102 + }, + { + "epoch": 0.09984611206662443, + "grad_norm": 0.8556196093559265, + "learning_rate": 0.00019497736321155736, + "loss": 3.6944, + "step": 1103 + }, + { + "epoch": 0.09993663438037476, + "grad_norm": 0.895474910736084, + "learning_rate": 0.00019496742789427683, + "loss": 3.6039, + "step": 1104 + }, + { + "epoch": 0.1000271566941251, + "grad_norm": 0.908662736415863, + "learning_rate": 0.00019495748301377895, + "loss": 3.5707, + "step": 1105 + }, + { + "epoch": 0.10011767900787544, + "grad_norm": 0.9285642504692078, + "learning_rate": 0.0001949475285710651, + "loss": 3.6242, + "step": 1106 + }, + { + "epoch": 0.10020820132162578, + "grad_norm": 0.8637263178825378, + "learning_rate": 0.0001949375645671377, + "loss": 3.7278, + "step": 1107 + }, + { + "epoch": 0.10029872363537612, + "grad_norm": 0.9085043668746948, + "learning_rate": 0.00019492759100300019, + "loss": 3.6507, + "step": 1108 + }, + { + "epoch": 0.10038924594912646, + "grad_norm": 0.8560227155685425, + "learning_rate": 0.0001949176078796568, + "loss": 3.5991, + "step": 1109 + }, + { + "epoch": 0.1004797682628768, + "grad_norm": 0.845382034778595, + "learning_rate": 0.00019490761519811293, + "loss": 3.6392, + "step": 1110 + }, + { + "epoch": 0.10057029057662714, + "grad_norm": 0.9317412376403809, + "learning_rate": 0.00019489761295937483, + "loss": 3.687, + "step": 1111 + }, + { + "epoch": 0.10066081289037748, + "grad_norm": 0.8906388878822327, + "learning_rate": 0.00019488760116444966, + "loss": 3.6313, + "step": 1112 + }, + { + "epoch": 0.10075133520412782, + "grad_norm": 0.8965174555778503, + "learning_rate": 0.00019487757981434566, + "loss": 3.6316, + "step": 1113 + }, + { + "epoch": 0.10084185751787815, + "grad_norm": 0.8226318359375, + "learning_rate": 0.00019486754891007198, + "loss": 3.6078, + "step": 1114 + }, + { + "epoch": 0.10093237983162849, + "grad_norm": 0.8582698702812195, + "learning_rate": 0.00019485750845263874, + "loss": 3.6597, + "step": 1115 + }, + { + "epoch": 0.10102290214537883, + "grad_norm": 0.8765236139297485, + "learning_rate": 0.00019484745844305695, + "loss": 3.5721, + "step": 1116 + }, + { + "epoch": 0.10111342445912917, + "grad_norm": 0.9732540249824524, + "learning_rate": 0.00019483739888233873, + "loss": 3.6898, + "step": 1117 + }, + { + "epoch": 0.10120394677287951, + "grad_norm": 0.8816714286804199, + "learning_rate": 0.000194827329771497, + "loss": 3.6785, + "step": 1118 + }, + { + "epoch": 0.10129446908662985, + "grad_norm": 1.0106911659240723, + "learning_rate": 0.00019481725111154577, + "loss": 3.6438, + "step": 1119 + }, + { + "epoch": 0.1013849914003802, + "grad_norm": 0.943163275718689, + "learning_rate": 0.00019480716290349995, + "loss": 3.6332, + "step": 1120 + }, + { + "epoch": 0.10147551371413054, + "grad_norm": 0.9958202242851257, + "learning_rate": 0.00019479706514837544, + "loss": 3.7277, + "step": 1121 + }, + { + "epoch": 0.10156603602788088, + "grad_norm": 0.985831618309021, + "learning_rate": 0.00019478695784718903, + "loss": 3.6826, + "step": 1122 + }, + { + "epoch": 0.10165655834163122, + "grad_norm": 0.9224241971969604, + "learning_rate": 0.0001947768410009586, + "loss": 3.6623, + "step": 1123 + }, + { + "epoch": 0.10174708065538156, + "grad_norm": 1.0890220403671265, + "learning_rate": 0.00019476671461070283, + "loss": 3.6679, + "step": 1124 + }, + { + "epoch": 0.10183760296913189, + "grad_norm": 0.901580274105072, + "learning_rate": 0.0001947565786774415, + "loss": 3.6733, + "step": 1125 + }, + { + "epoch": 0.10192812528288223, + "grad_norm": 1.0238951444625854, + "learning_rate": 0.00019474643320219532, + "loss": 3.705, + "step": 1126 + }, + { + "epoch": 0.10201864759663257, + "grad_norm": 0.9749259948730469, + "learning_rate": 0.00019473627818598587, + "loss": 3.6251, + "step": 1127 + }, + { + "epoch": 0.10210916991038291, + "grad_norm": 1.0571074485778809, + "learning_rate": 0.0001947261136298358, + "loss": 3.5952, + "step": 1128 + }, + { + "epoch": 0.10219969222413325, + "grad_norm": 1.082695722579956, + "learning_rate": 0.00019471593953476873, + "loss": 3.6309, + "step": 1129 + }, + { + "epoch": 0.10229021453788359, + "grad_norm": 1.031955361366272, + "learning_rate": 0.0001947057559018091, + "loss": 3.6201, + "step": 1130 + }, + { + "epoch": 0.10238073685163393, + "grad_norm": 0.928398072719574, + "learning_rate": 0.00019469556273198245, + "loss": 3.6031, + "step": 1131 + }, + { + "epoch": 0.10247125916538427, + "grad_norm": 0.9015419483184814, + "learning_rate": 0.0001946853600263152, + "loss": 3.6516, + "step": 1132 + }, + { + "epoch": 0.10256178147913461, + "grad_norm": 0.9696022272109985, + "learning_rate": 0.00019467514778583484, + "loss": 3.6614, + "step": 1133 + }, + { + "epoch": 0.10265230379288495, + "grad_norm": 0.9159278273582458, + "learning_rate": 0.00019466492601156966, + "loss": 3.6942, + "step": 1134 + }, + { + "epoch": 0.10274282610663528, + "grad_norm": 0.9715399742126465, + "learning_rate": 0.000194654694704549, + "loss": 3.6107, + "step": 1135 + }, + { + "epoch": 0.10283334842038562, + "grad_norm": 0.9022889137268066, + "learning_rate": 0.0001946444538658032, + "loss": 3.6543, + "step": 1136 + }, + { + "epoch": 0.10292387073413596, + "grad_norm": 0.9374133944511414, + "learning_rate": 0.00019463420349636348, + "loss": 3.6396, + "step": 1137 + }, + { + "epoch": 0.1030143930478863, + "grad_norm": 0.9193755984306335, + "learning_rate": 0.00019462394359726206, + "loss": 3.6156, + "step": 1138 + }, + { + "epoch": 0.10310491536163664, + "grad_norm": 0.9619964361190796, + "learning_rate": 0.00019461367416953208, + "loss": 3.7139, + "step": 1139 + }, + { + "epoch": 0.10319543767538698, + "grad_norm": 0.9839112162590027, + "learning_rate": 0.00019460339521420772, + "loss": 3.6013, + "step": 1140 + }, + { + "epoch": 0.10328595998913732, + "grad_norm": 0.9973909258842468, + "learning_rate": 0.00019459310673232408, + "loss": 3.6059, + "step": 1141 + }, + { + "epoch": 0.10337648230288767, + "grad_norm": 0.8987120985984802, + "learning_rate": 0.00019458280872491714, + "loss": 3.6464, + "step": 1142 + }, + { + "epoch": 0.103467004616638, + "grad_norm": 1.0310770273208618, + "learning_rate": 0.00019457250119302394, + "loss": 3.6645, + "step": 1143 + }, + { + "epoch": 0.10355752693038835, + "grad_norm": 0.913856565952301, + "learning_rate": 0.0001945621841376825, + "loss": 3.6143, + "step": 1144 + }, + { + "epoch": 0.10364804924413867, + "grad_norm": 1.0820924043655396, + "learning_rate": 0.0001945518575599317, + "loss": 3.6924, + "step": 1145 + }, + { + "epoch": 0.10373857155788901, + "grad_norm": 0.9650067687034607, + "learning_rate": 0.00019454152146081141, + "loss": 3.6596, + "step": 1146 + }, + { + "epoch": 0.10382909387163936, + "grad_norm": 0.8672732710838318, + "learning_rate": 0.0001945311758413625, + "loss": 3.6083, + "step": 1147 + }, + { + "epoch": 0.1039196161853897, + "grad_norm": 1.0733089447021484, + "learning_rate": 0.0001945208207026268, + "loss": 3.7095, + "step": 1148 + }, + { + "epoch": 0.10401013849914004, + "grad_norm": 0.9187482595443726, + "learning_rate": 0.000194510456045647, + "loss": 3.6309, + "step": 1149 + }, + { + "epoch": 0.10410066081289038, + "grad_norm": 1.012186050415039, + "learning_rate": 0.00019450008187146684, + "loss": 3.6064, + "step": 1150 + }, + { + "epoch": 0.10419118312664072, + "grad_norm": 0.9701893329620361, + "learning_rate": 0.00019448969818113107, + "loss": 3.6653, + "step": 1151 + }, + { + "epoch": 0.10428170544039106, + "grad_norm": 0.9445899128913879, + "learning_rate": 0.00019447930497568528, + "loss": 3.6369, + "step": 1152 + }, + { + "epoch": 0.1043722277541414, + "grad_norm": 0.9546499848365784, + "learning_rate": 0.00019446890225617604, + "loss": 3.6069, + "step": 1153 + }, + { + "epoch": 0.10446275006789174, + "grad_norm": 0.9554566144943237, + "learning_rate": 0.00019445849002365094, + "loss": 3.6144, + "step": 1154 + }, + { + "epoch": 0.10455327238164208, + "grad_norm": 0.8900935649871826, + "learning_rate": 0.00019444806827915846, + "loss": 3.6298, + "step": 1155 + }, + { + "epoch": 0.10464379469539241, + "grad_norm": 1.0609067678451538, + "learning_rate": 0.00019443763702374812, + "loss": 3.6561, + "step": 1156 + }, + { + "epoch": 0.10473431700914275, + "grad_norm": 0.8893911838531494, + "learning_rate": 0.0001944271962584703, + "loss": 3.56, + "step": 1157 + }, + { + "epoch": 0.10482483932289309, + "grad_norm": 1.1273530721664429, + "learning_rate": 0.0001944167459843764, + "loss": 3.6363, + "step": 1158 + }, + { + "epoch": 0.10491536163664343, + "grad_norm": 0.8835474848747253, + "learning_rate": 0.00019440628620251874, + "loss": 3.5974, + "step": 1159 + }, + { + "epoch": 0.10500588395039377, + "grad_norm": 0.9661706686019897, + "learning_rate": 0.00019439581691395067, + "loss": 3.685, + "step": 1160 + }, + { + "epoch": 0.10509640626414411, + "grad_norm": 0.9183404445648193, + "learning_rate": 0.00019438533811972643, + "loss": 3.6391, + "step": 1161 + }, + { + "epoch": 0.10518692857789445, + "grad_norm": 0.9571143984794617, + "learning_rate": 0.0001943748498209012, + "loss": 3.5659, + "step": 1162 + }, + { + "epoch": 0.1052774508916448, + "grad_norm": 0.9471140503883362, + "learning_rate": 0.0001943643520185312, + "loss": 3.6097, + "step": 1163 + }, + { + "epoch": 0.10536797320539514, + "grad_norm": 1.0031734704971313, + "learning_rate": 0.0001943538447136735, + "loss": 3.5766, + "step": 1164 + }, + { + "epoch": 0.10545849551914548, + "grad_norm": 0.9944052696228027, + "learning_rate": 0.00019434332790738625, + "loss": 3.6271, + "step": 1165 + }, + { + "epoch": 0.1055490178328958, + "grad_norm": 0.9067218899726868, + "learning_rate": 0.00019433280160072847, + "loss": 3.6163, + "step": 1166 + }, + { + "epoch": 0.10563954014664614, + "grad_norm": 0.9322523474693298, + "learning_rate": 0.0001943222657947601, + "loss": 3.6253, + "step": 1167 + }, + { + "epoch": 0.10573006246039648, + "grad_norm": 1.0816961526870728, + "learning_rate": 0.0001943117204905422, + "loss": 3.6209, + "step": 1168 + }, + { + "epoch": 0.10582058477414683, + "grad_norm": 0.9161190986633301, + "learning_rate": 0.00019430116568913656, + "loss": 3.6201, + "step": 1169 + }, + { + "epoch": 0.10591110708789717, + "grad_norm": 1.024245023727417, + "learning_rate": 0.00019429060139160618, + "loss": 3.5892, + "step": 1170 + }, + { + "epoch": 0.1060016294016475, + "grad_norm": 0.9940409660339355, + "learning_rate": 0.00019428002759901478, + "loss": 3.5953, + "step": 1171 + }, + { + "epoch": 0.10609215171539785, + "grad_norm": 1.0768675804138184, + "learning_rate": 0.0001942694443124272, + "loss": 3.6088, + "step": 1172 + }, + { + "epoch": 0.10618267402914819, + "grad_norm": 0.9526625275611877, + "learning_rate": 0.00019425885153290918, + "loss": 3.6164, + "step": 1173 + }, + { + "epoch": 0.10627319634289853, + "grad_norm": 1.0202966928482056, + "learning_rate": 0.00019424824926152735, + "loss": 3.6156, + "step": 1174 + }, + { + "epoch": 0.10636371865664887, + "grad_norm": 1.0075091123580933, + "learning_rate": 0.0001942376374993494, + "loss": 3.6492, + "step": 1175 + }, + { + "epoch": 0.1064542409703992, + "grad_norm": 1.0465952157974243, + "learning_rate": 0.00019422701624744395, + "loss": 3.6663, + "step": 1176 + }, + { + "epoch": 0.10654476328414954, + "grad_norm": 0.9357006549835205, + "learning_rate": 0.00019421638550688055, + "loss": 3.5978, + "step": 1177 + }, + { + "epoch": 0.10663528559789988, + "grad_norm": 0.9215185642242432, + "learning_rate": 0.00019420574527872968, + "loss": 3.5934, + "step": 1178 + }, + { + "epoch": 0.10672580791165022, + "grad_norm": 1.01131272315979, + "learning_rate": 0.00019419509556406285, + "loss": 3.6031, + "step": 1179 + }, + { + "epoch": 0.10681633022540056, + "grad_norm": 0.9412872195243835, + "learning_rate": 0.00019418443636395248, + "loss": 3.6491, + "step": 1180 + }, + { + "epoch": 0.1069068525391509, + "grad_norm": 0.9983108639717102, + "learning_rate": 0.00019417376767947192, + "loss": 3.7015, + "step": 1181 + }, + { + "epoch": 0.10699737485290124, + "grad_norm": 0.9529833793640137, + "learning_rate": 0.00019416308951169557, + "loss": 3.6213, + "step": 1182 + }, + { + "epoch": 0.10708789716665158, + "grad_norm": 1.0248433351516724, + "learning_rate": 0.00019415240186169866, + "loss": 3.6436, + "step": 1183 + }, + { + "epoch": 0.10717841948040192, + "grad_norm": 0.9561635255813599, + "learning_rate": 0.00019414170473055746, + "loss": 3.6078, + "step": 1184 + }, + { + "epoch": 0.10726894179415226, + "grad_norm": 0.8160126209259033, + "learning_rate": 0.00019413099811934917, + "loss": 3.6372, + "step": 1185 + }, + { + "epoch": 0.10735946410790259, + "grad_norm": 0.9989318251609802, + "learning_rate": 0.00019412028202915198, + "loss": 3.6229, + "step": 1186 + }, + { + "epoch": 0.10744998642165293, + "grad_norm": 0.8841098546981812, + "learning_rate": 0.00019410955646104492, + "loss": 3.5736, + "step": 1187 + }, + { + "epoch": 0.10754050873540327, + "grad_norm": 0.9628979563713074, + "learning_rate": 0.00019409882141610812, + "loss": 3.587, + "step": 1188 + }, + { + "epoch": 0.10763103104915361, + "grad_norm": 0.873678982257843, + "learning_rate": 0.00019408807689542257, + "loss": 3.6213, + "step": 1189 + }, + { + "epoch": 0.10772155336290395, + "grad_norm": 0.8769716024398804, + "learning_rate": 0.00019407732290007023, + "loss": 3.6219, + "step": 1190 + }, + { + "epoch": 0.1078120756766543, + "grad_norm": 0.9059527516365051, + "learning_rate": 0.00019406655943113408, + "loss": 3.6575, + "step": 1191 + }, + { + "epoch": 0.10790259799040464, + "grad_norm": 0.878751277923584, + "learning_rate": 0.00019405578648969796, + "loss": 3.6348, + "step": 1192 + }, + { + "epoch": 0.10799312030415498, + "grad_norm": 1.0593430995941162, + "learning_rate": 0.00019404500407684672, + "loss": 3.5751, + "step": 1193 + }, + { + "epoch": 0.10808364261790532, + "grad_norm": 0.9103216528892517, + "learning_rate": 0.00019403421219366611, + "loss": 3.577, + "step": 1194 + }, + { + "epoch": 0.10817416493165566, + "grad_norm": 1.0272597074508667, + "learning_rate": 0.00019402341084124295, + "loss": 3.6078, + "step": 1195 + }, + { + "epoch": 0.108264687245406, + "grad_norm": 0.8711608648300171, + "learning_rate": 0.00019401260002066488, + "loss": 3.6503, + "step": 1196 + }, + { + "epoch": 0.10835520955915633, + "grad_norm": 1.018958330154419, + "learning_rate": 0.00019400177973302054, + "loss": 3.6311, + "step": 1197 + }, + { + "epoch": 0.10844573187290667, + "grad_norm": 0.8852933049201965, + "learning_rate": 0.00019399094997939957, + "loss": 3.67, + "step": 1198 + }, + { + "epoch": 0.10853625418665701, + "grad_norm": 0.939679741859436, + "learning_rate": 0.00019398011076089252, + "loss": 3.5849, + "step": 1199 + }, + { + "epoch": 0.10862677650040735, + "grad_norm": 0.8842020630836487, + "learning_rate": 0.00019396926207859084, + "loss": 3.6535, + "step": 1200 + }, + { + "epoch": 0.10871729881415769, + "grad_norm": 0.9720060229301453, + "learning_rate": 0.00019395840393358706, + "loss": 3.636, + "step": 1201 + }, + { + "epoch": 0.10880782112790803, + "grad_norm": 0.930729329586029, + "learning_rate": 0.00019394753632697462, + "loss": 3.5714, + "step": 1202 + }, + { + "epoch": 0.10889834344165837, + "grad_norm": 0.9698287844657898, + "learning_rate": 0.0001939366592598478, + "loss": 3.5511, + "step": 1203 + }, + { + "epoch": 0.10898886575540871, + "grad_norm": 0.9683051109313965, + "learning_rate": 0.00019392577273330197, + "loss": 3.6155, + "step": 1204 + }, + { + "epoch": 0.10907938806915905, + "grad_norm": 0.9001908302307129, + "learning_rate": 0.00019391487674843338, + "loss": 3.5123, + "step": 1205 + }, + { + "epoch": 0.1091699103829094, + "grad_norm": 0.9388008713722229, + "learning_rate": 0.0001939039713063393, + "loss": 3.56, + "step": 1206 + }, + { + "epoch": 0.10926043269665972, + "grad_norm": 1.036388635635376, + "learning_rate": 0.0001938930564081179, + "loss": 3.5918, + "step": 1207 + }, + { + "epoch": 0.10935095501041006, + "grad_norm": 1.0771197080612183, + "learning_rate": 0.00019388213205486822, + "loss": 3.6138, + "step": 1208 + }, + { + "epoch": 0.1094414773241604, + "grad_norm": 0.922707736492157, + "learning_rate": 0.00019387119824769045, + "loss": 3.5791, + "step": 1209 + }, + { + "epoch": 0.10953199963791074, + "grad_norm": 1.0649242401123047, + "learning_rate": 0.00019386025498768558, + "loss": 3.5486, + "step": 1210 + }, + { + "epoch": 0.10962252195166108, + "grad_norm": 0.9694133400917053, + "learning_rate": 0.0001938493022759556, + "loss": 3.5445, + "step": 1211 + }, + { + "epoch": 0.10971304426541142, + "grad_norm": 0.9994919896125793, + "learning_rate": 0.00019383834011360346, + "loss": 3.577, + "step": 1212 + }, + { + "epoch": 0.10980356657916177, + "grad_norm": 0.8897203803062439, + "learning_rate": 0.00019382736850173302, + "loss": 3.5406, + "step": 1213 + }, + { + "epoch": 0.1098940888929121, + "grad_norm": 0.9498850107192993, + "learning_rate": 0.00019381638744144915, + "loss": 3.5708, + "step": 1214 + }, + { + "epoch": 0.10998461120666245, + "grad_norm": 0.9750722050666809, + "learning_rate": 0.00019380539693385763, + "loss": 3.6341, + "step": 1215 + }, + { + "epoch": 0.11007513352041279, + "grad_norm": 0.979321300983429, + "learning_rate": 0.0001937943969800652, + "loss": 3.6495, + "step": 1216 + }, + { + "epoch": 0.11016565583416311, + "grad_norm": 0.9199374914169312, + "learning_rate": 0.00019378338758117958, + "loss": 3.5429, + "step": 1217 + }, + { + "epoch": 0.11025617814791346, + "grad_norm": 0.9436404705047607, + "learning_rate": 0.00019377236873830938, + "loss": 3.5996, + "step": 1218 + }, + { + "epoch": 0.1103467004616638, + "grad_norm": 0.9604935646057129, + "learning_rate": 0.00019376134045256423, + "loss": 3.6583, + "step": 1219 + }, + { + "epoch": 0.11043722277541414, + "grad_norm": 0.9841674566268921, + "learning_rate": 0.00019375030272505463, + "loss": 3.6077, + "step": 1220 + }, + { + "epoch": 0.11052774508916448, + "grad_norm": 0.919344425201416, + "learning_rate": 0.00019373925555689212, + "loss": 3.5181, + "step": 1221 + }, + { + "epoch": 0.11061826740291482, + "grad_norm": 0.9568752646446228, + "learning_rate": 0.00019372819894918915, + "loss": 3.5464, + "step": 1222 + }, + { + "epoch": 0.11070878971666516, + "grad_norm": 0.9117941856384277, + "learning_rate": 0.0001937171329030591, + "loss": 3.6367, + "step": 1223 + }, + { + "epoch": 0.1107993120304155, + "grad_norm": 0.9444876909255981, + "learning_rate": 0.00019370605741961635, + "loss": 3.5982, + "step": 1224 + }, + { + "epoch": 0.11088983434416584, + "grad_norm": 1.0101433992385864, + "learning_rate": 0.0001936949724999762, + "loss": 3.5492, + "step": 1225 + }, + { + "epoch": 0.11098035665791618, + "grad_norm": 0.9878042340278625, + "learning_rate": 0.00019368387814525483, + "loss": 3.6045, + "step": 1226 + }, + { + "epoch": 0.11107087897166651, + "grad_norm": 0.9536324143409729, + "learning_rate": 0.00019367277435656952, + "loss": 3.5784, + "step": 1227 + }, + { + "epoch": 0.11116140128541685, + "grad_norm": 1.0027134418487549, + "learning_rate": 0.0001936616611350384, + "loss": 3.5433, + "step": 1228 + }, + { + "epoch": 0.11125192359916719, + "grad_norm": 0.9173308610916138, + "learning_rate": 0.00019365053848178056, + "loss": 3.6148, + "step": 1229 + }, + { + "epoch": 0.11134244591291753, + "grad_norm": 1.0498443841934204, + "learning_rate": 0.00019363940639791606, + "loss": 3.686, + "step": 1230 + }, + { + "epoch": 0.11143296822666787, + "grad_norm": 1.124347448348999, + "learning_rate": 0.00019362826488456588, + "loss": 3.613, + "step": 1231 + }, + { + "epoch": 0.11152349054041821, + "grad_norm": 0.9275368452072144, + "learning_rate": 0.00019361711394285202, + "loss": 3.6151, + "step": 1232 + }, + { + "epoch": 0.11161401285416855, + "grad_norm": 1.0720618963241577, + "learning_rate": 0.00019360595357389735, + "loss": 3.5815, + "step": 1233 + }, + { + "epoch": 0.1117045351679189, + "grad_norm": 1.006804347038269, + "learning_rate": 0.00019359478377882567, + "loss": 3.6022, + "step": 1234 + }, + { + "epoch": 0.11179505748166924, + "grad_norm": 0.9970232248306274, + "learning_rate": 0.00019358360455876188, + "loss": 3.5799, + "step": 1235 + }, + { + "epoch": 0.11188557979541958, + "grad_norm": 0.9625207185745239, + "learning_rate": 0.00019357241591483164, + "loss": 3.5385, + "step": 1236 + }, + { + "epoch": 0.11197610210916992, + "grad_norm": 1.0641515254974365, + "learning_rate": 0.00019356121784816166, + "loss": 3.6034, + "step": 1237 + }, + { + "epoch": 0.11206662442292024, + "grad_norm": 0.9369845390319824, + "learning_rate": 0.00019355001035987966, + "loss": 3.5279, + "step": 1238 + }, + { + "epoch": 0.11215714673667058, + "grad_norm": 1.0742141008377075, + "learning_rate": 0.00019353879345111413, + "loss": 3.6396, + "step": 1239 + }, + { + "epoch": 0.11224766905042093, + "grad_norm": 1.0029163360595703, + "learning_rate": 0.00019352756712299468, + "loss": 3.5808, + "step": 1240 + }, + { + "epoch": 0.11233819136417127, + "grad_norm": 0.9248932003974915, + "learning_rate": 0.00019351633137665175, + "loss": 3.6924, + "step": 1241 + }, + { + "epoch": 0.11242871367792161, + "grad_norm": 0.9621601700782776, + "learning_rate": 0.00019350508621321683, + "loss": 3.6178, + "step": 1242 + }, + { + "epoch": 0.11251923599167195, + "grad_norm": 0.9053172469139099, + "learning_rate": 0.00019349383163382228, + "loss": 3.5506, + "step": 1243 + }, + { + "epoch": 0.11260975830542229, + "grad_norm": 0.9937805533409119, + "learning_rate": 0.00019348256763960145, + "loss": 3.6734, + "step": 1244 + }, + { + "epoch": 0.11270028061917263, + "grad_norm": 0.9606733918190002, + "learning_rate": 0.00019347129423168862, + "loss": 3.5821, + "step": 1245 + }, + { + "epoch": 0.11279080293292297, + "grad_norm": 0.8962657451629639, + "learning_rate": 0.000193460011411219, + "loss": 3.5363, + "step": 1246 + }, + { + "epoch": 0.11288132524667331, + "grad_norm": 0.9315593838691711, + "learning_rate": 0.00019344871917932884, + "loss": 3.5507, + "step": 1247 + }, + { + "epoch": 0.11297184756042364, + "grad_norm": 0.899619996547699, + "learning_rate": 0.00019343741753715516, + "loss": 3.4878, + "step": 1248 + }, + { + "epoch": 0.11306236987417398, + "grad_norm": 0.9632608294487, + "learning_rate": 0.00019342610648583611, + "loss": 3.593, + "step": 1249 + }, + { + "epoch": 0.11315289218792432, + "grad_norm": 0.879357635974884, + "learning_rate": 0.00019341478602651069, + "loss": 3.5508, + "step": 1250 + }, + { + "epoch": 0.11324341450167466, + "grad_norm": 0.9848529696464539, + "learning_rate": 0.00019340345616031886, + "loss": 3.5655, + "step": 1251 + }, + { + "epoch": 0.113333936815425, + "grad_norm": 0.8929630517959595, + "learning_rate": 0.00019339211688840157, + "loss": 3.5551, + "step": 1252 + }, + { + "epoch": 0.11342445912917534, + "grad_norm": 0.863125741481781, + "learning_rate": 0.00019338076821190066, + "loss": 3.6034, + "step": 1253 + }, + { + "epoch": 0.11351498144292568, + "grad_norm": 0.8554606437683105, + "learning_rate": 0.00019336941013195892, + "loss": 3.5429, + "step": 1254 + }, + { + "epoch": 0.11360550375667602, + "grad_norm": 1.0379880666732788, + "learning_rate": 0.00019335804264972018, + "loss": 3.6066, + "step": 1255 + }, + { + "epoch": 0.11369602607042636, + "grad_norm": 0.9469103217124939, + "learning_rate": 0.00019334666576632906, + "loss": 3.6111, + "step": 1256 + }, + { + "epoch": 0.1137865483841767, + "grad_norm": 0.9332228899002075, + "learning_rate": 0.00019333527948293128, + "loss": 3.6062, + "step": 1257 + }, + { + "epoch": 0.11387707069792703, + "grad_norm": 0.8677222728729248, + "learning_rate": 0.0001933238838006734, + "loss": 3.579, + "step": 1258 + }, + { + "epoch": 0.11396759301167737, + "grad_norm": 1.0136536359786987, + "learning_rate": 0.000193312478720703, + "loss": 3.511, + "step": 1259 + }, + { + "epoch": 0.11405811532542771, + "grad_norm": 0.87325119972229, + "learning_rate": 0.00019330106424416852, + "loss": 3.5735, + "step": 1260 + }, + { + "epoch": 0.11414863763917805, + "grad_norm": 0.9630944132804871, + "learning_rate": 0.00019328964037221942, + "loss": 3.554, + "step": 1261 + }, + { + "epoch": 0.1142391599529284, + "grad_norm": 0.946215808391571, + "learning_rate": 0.0001932782071060061, + "loss": 3.5589, + "step": 1262 + }, + { + "epoch": 0.11432968226667874, + "grad_norm": 0.9987102150917053, + "learning_rate": 0.00019326676444667986, + "loss": 3.5423, + "step": 1263 + }, + { + "epoch": 0.11442020458042908, + "grad_norm": 1.0245448350906372, + "learning_rate": 0.000193255312395393, + "loss": 3.5212, + "step": 1264 + }, + { + "epoch": 0.11451072689417942, + "grad_norm": 1.0864923000335693, + "learning_rate": 0.00019324385095329874, + "loss": 3.5383, + "step": 1265 + }, + { + "epoch": 0.11460124920792976, + "grad_norm": 0.9032666683197021, + "learning_rate": 0.00019323238012155123, + "loss": 3.5436, + "step": 1266 + }, + { + "epoch": 0.1146917715216801, + "grad_norm": 0.9453108906745911, + "learning_rate": 0.0001932208999013056, + "loss": 3.5882, + "step": 1267 + }, + { + "epoch": 0.11478229383543043, + "grad_norm": 1.0362881422042847, + "learning_rate": 0.0001932094102937179, + "loss": 3.5696, + "step": 1268 + }, + { + "epoch": 0.11487281614918077, + "grad_norm": 1.04246187210083, + "learning_rate": 0.00019319791129994513, + "loss": 3.6289, + "step": 1269 + }, + { + "epoch": 0.11496333846293111, + "grad_norm": 0.9332128167152405, + "learning_rate": 0.00019318640292114524, + "loss": 3.5249, + "step": 1270 + }, + { + "epoch": 0.11505386077668145, + "grad_norm": 0.9713038802146912, + "learning_rate": 0.00019317488515847715, + "loss": 3.5822, + "step": 1271 + }, + { + "epoch": 0.11514438309043179, + "grad_norm": 0.96926349401474, + "learning_rate": 0.00019316335801310063, + "loss": 3.545, + "step": 1272 + }, + { + "epoch": 0.11523490540418213, + "grad_norm": 1.0021302700042725, + "learning_rate": 0.00019315182148617655, + "loss": 3.6455, + "step": 1273 + }, + { + "epoch": 0.11532542771793247, + "grad_norm": 0.9882180690765381, + "learning_rate": 0.00019314027557886657, + "loss": 3.4918, + "step": 1274 + }, + { + "epoch": 0.11541595003168281, + "grad_norm": 0.9814496040344238, + "learning_rate": 0.00019312872029233339, + "loss": 3.5967, + "step": 1275 + }, + { + "epoch": 0.11550647234543315, + "grad_norm": 0.9681494235992432, + "learning_rate": 0.00019311715562774062, + "loss": 3.5891, + "step": 1276 + }, + { + "epoch": 0.1155969946591835, + "grad_norm": 1.08894681930542, + "learning_rate": 0.00019310558158625285, + "loss": 3.5718, + "step": 1277 + }, + { + "epoch": 0.11568751697293383, + "grad_norm": 0.9990349411964417, + "learning_rate": 0.00019309399816903552, + "loss": 3.5909, + "step": 1278 + }, + { + "epoch": 0.11577803928668416, + "grad_norm": 0.9717885851860046, + "learning_rate": 0.00019308240537725517, + "loss": 3.4866, + "step": 1279 + }, + { + "epoch": 0.1158685616004345, + "grad_norm": 0.9531372785568237, + "learning_rate": 0.00019307080321207912, + "loss": 3.6393, + "step": 1280 + }, + { + "epoch": 0.11595908391418484, + "grad_norm": 1.0083506107330322, + "learning_rate": 0.00019305919167467573, + "loss": 3.5573, + "step": 1281 + }, + { + "epoch": 0.11604960622793518, + "grad_norm": 0.9667612910270691, + "learning_rate": 0.0001930475707662143, + "loss": 3.5548, + "step": 1282 + }, + { + "epoch": 0.11614012854168553, + "grad_norm": 0.9590164422988892, + "learning_rate": 0.00019303594048786503, + "loss": 3.5589, + "step": 1283 + }, + { + "epoch": 0.11623065085543587, + "grad_norm": 0.9013192057609558, + "learning_rate": 0.0001930243008407991, + "loss": 3.5273, + "step": 1284 + }, + { + "epoch": 0.1163211731691862, + "grad_norm": 1.077648639678955, + "learning_rate": 0.00019301265182618862, + "loss": 3.6351, + "step": 1285 + }, + { + "epoch": 0.11641169548293655, + "grad_norm": 0.9391737580299377, + "learning_rate": 0.00019300099344520666, + "loss": 3.6289, + "step": 1286 + }, + { + "epoch": 0.11650221779668689, + "grad_norm": 0.9416689276695251, + "learning_rate": 0.00019298932569902717, + "loss": 3.5554, + "step": 1287 + }, + { + "epoch": 0.11659274011043723, + "grad_norm": 0.9104056358337402, + "learning_rate": 0.00019297764858882514, + "loss": 3.477, + "step": 1288 + }, + { + "epoch": 0.11668326242418756, + "grad_norm": 0.9194529056549072, + "learning_rate": 0.00019296596211577645, + "loss": 3.4682, + "step": 1289 + }, + { + "epoch": 0.1167737847379379, + "grad_norm": 0.9856456518173218, + "learning_rate": 0.00019295426628105792, + "loss": 3.4858, + "step": 1290 + }, + { + "epoch": 0.11686430705168824, + "grad_norm": 0.9236043691635132, + "learning_rate": 0.0001929425610858473, + "loss": 3.6041, + "step": 1291 + }, + { + "epoch": 0.11695482936543858, + "grad_norm": 1.005852460861206, + "learning_rate": 0.00019293084653132335, + "loss": 3.5722, + "step": 1292 + }, + { + "epoch": 0.11704535167918892, + "grad_norm": 0.9348894357681274, + "learning_rate": 0.00019291912261866568, + "loss": 3.6171, + "step": 1293 + }, + { + "epoch": 0.11713587399293926, + "grad_norm": 1.0413389205932617, + "learning_rate": 0.00019290738934905492, + "loss": 3.5894, + "step": 1294 + }, + { + "epoch": 0.1172263963066896, + "grad_norm": 1.0090665817260742, + "learning_rate": 0.00019289564672367258, + "loss": 3.5964, + "step": 1295 + }, + { + "epoch": 0.11731691862043994, + "grad_norm": 1.0007750988006592, + "learning_rate": 0.00019288389474370117, + "loss": 3.5692, + "step": 1296 + }, + { + "epoch": 0.11740744093419028, + "grad_norm": 0.9936386942863464, + "learning_rate": 0.0001928721334103241, + "loss": 3.5052, + "step": 1297 + }, + { + "epoch": 0.11749796324794062, + "grad_norm": 0.9362757802009583, + "learning_rate": 0.0001928603627247257, + "loss": 3.497, + "step": 1298 + }, + { + "epoch": 0.11758848556169095, + "grad_norm": 1.0437519550323486, + "learning_rate": 0.00019284858268809137, + "loss": 3.5627, + "step": 1299 + }, + { + "epoch": 0.11767900787544129, + "grad_norm": 1.045013666152954, + "learning_rate": 0.00019283679330160726, + "loss": 3.5357, + "step": 1300 + }, + { + "epoch": 0.11776953018919163, + "grad_norm": 1.0569078922271729, + "learning_rate": 0.00019282499456646065, + "loss": 3.5811, + "step": 1301 + }, + { + "epoch": 0.11786005250294197, + "grad_norm": 0.9556296467781067, + "learning_rate": 0.00019281318648383956, + "loss": 3.5919, + "step": 1302 + }, + { + "epoch": 0.11795057481669231, + "grad_norm": 0.9206773042678833, + "learning_rate": 0.0001928013690549332, + "loss": 3.5818, + "step": 1303 + }, + { + "epoch": 0.11804109713044265, + "grad_norm": 1.0458245277404785, + "learning_rate": 0.00019278954228093146, + "loss": 3.5062, + "step": 1304 + }, + { + "epoch": 0.118131619444193, + "grad_norm": 0.9117056727409363, + "learning_rate": 0.0001927777061630254, + "loss": 3.4908, + "step": 1305 + }, + { + "epoch": 0.11822214175794334, + "grad_norm": 0.9894657135009766, + "learning_rate": 0.00019276586070240682, + "loss": 3.5476, + "step": 1306 + }, + { + "epoch": 0.11831266407169368, + "grad_norm": 1.0137649774551392, + "learning_rate": 0.00019275400590026864, + "loss": 3.5573, + "step": 1307 + }, + { + "epoch": 0.11840318638544402, + "grad_norm": 0.9838030934333801, + "learning_rate": 0.00019274214175780458, + "loss": 3.5925, + "step": 1308 + }, + { + "epoch": 0.11849370869919436, + "grad_norm": 0.970636785030365, + "learning_rate": 0.00019273026827620941, + "loss": 3.5876, + "step": 1309 + }, + { + "epoch": 0.11858423101294469, + "grad_norm": 1.0284696817398071, + "learning_rate": 0.00019271838545667876, + "loss": 3.508, + "step": 1310 + }, + { + "epoch": 0.11867475332669503, + "grad_norm": 0.9239933490753174, + "learning_rate": 0.0001927064933004092, + "loss": 3.5244, + "step": 1311 + }, + { + "epoch": 0.11876527564044537, + "grad_norm": 0.9314912557601929, + "learning_rate": 0.0001926945918085983, + "loss": 3.5688, + "step": 1312 + }, + { + "epoch": 0.11885579795419571, + "grad_norm": 0.9962323307991028, + "learning_rate": 0.0001926826809824446, + "loss": 3.5586, + "step": 1313 + }, + { + "epoch": 0.11894632026794605, + "grad_norm": 0.8930354714393616, + "learning_rate": 0.0001926707608231474, + "loss": 3.5189, + "step": 1314 + }, + { + "epoch": 0.11903684258169639, + "grad_norm": 1.0008631944656372, + "learning_rate": 0.00019265883133190713, + "loss": 3.5565, + "step": 1315 + }, + { + "epoch": 0.11912736489544673, + "grad_norm": 0.9283084869384766, + "learning_rate": 0.0001926468925099251, + "loss": 3.51, + "step": 1316 + }, + { + "epoch": 0.11921788720919707, + "grad_norm": 0.9283885955810547, + "learning_rate": 0.00019263494435840355, + "loss": 3.5087, + "step": 1317 + }, + { + "epoch": 0.11930840952294741, + "grad_norm": 0.9244400858879089, + "learning_rate": 0.0001926229868785456, + "loss": 3.4833, + "step": 1318 + }, + { + "epoch": 0.11939893183669775, + "grad_norm": 0.9127993583679199, + "learning_rate": 0.0001926110200715554, + "loss": 3.5401, + "step": 1319 + }, + { + "epoch": 0.11948945415044808, + "grad_norm": 0.8905998468399048, + "learning_rate": 0.00019259904393863802, + "loss": 3.5208, + "step": 1320 + }, + { + "epoch": 0.11957997646419842, + "grad_norm": 0.915275514125824, + "learning_rate": 0.0001925870584809995, + "loss": 3.5384, + "step": 1321 + }, + { + "epoch": 0.11967049877794876, + "grad_norm": 0.9308586716651917, + "learning_rate": 0.00019257506369984667, + "loss": 3.522, + "step": 1322 + }, + { + "epoch": 0.1197610210916991, + "grad_norm": 0.981001079082489, + "learning_rate": 0.00019256305959638748, + "loss": 3.559, + "step": 1323 + }, + { + "epoch": 0.11985154340544944, + "grad_norm": 0.9434746503829956, + "learning_rate": 0.0001925510461718307, + "loss": 3.5701, + "step": 1324 + }, + { + "epoch": 0.11994206571919978, + "grad_norm": 0.897885799407959, + "learning_rate": 0.0001925390234273861, + "loss": 3.5868, + "step": 1325 + }, + { + "epoch": 0.12003258803295012, + "grad_norm": 0.9865405559539795, + "learning_rate": 0.0001925269913642644, + "loss": 3.5309, + "step": 1326 + }, + { + "epoch": 0.12012311034670047, + "grad_norm": 0.9099435806274414, + "learning_rate": 0.0001925149499836772, + "loss": 3.5296, + "step": 1327 + }, + { + "epoch": 0.1202136326604508, + "grad_norm": 1.001774787902832, + "learning_rate": 0.00019250289928683705, + "loss": 3.4915, + "step": 1328 + }, + { + "epoch": 0.12030415497420115, + "grad_norm": 0.9014139175415039, + "learning_rate": 0.00019249083927495747, + "loss": 3.555, + "step": 1329 + }, + { + "epoch": 0.12039467728795147, + "grad_norm": 1.0934901237487793, + "learning_rate": 0.00019247876994925292, + "loss": 3.5932, + "step": 1330 + }, + { + "epoch": 0.12048519960170181, + "grad_norm": 0.9719903469085693, + "learning_rate": 0.00019246669131093875, + "loss": 3.5252, + "step": 1331 + }, + { + "epoch": 0.12057572191545216, + "grad_norm": 1.0894511938095093, + "learning_rate": 0.00019245460336123134, + "loss": 3.5852, + "step": 1332 + }, + { + "epoch": 0.1206662442292025, + "grad_norm": 1.0272449254989624, + "learning_rate": 0.00019244250610134787, + "loss": 3.5173, + "step": 1333 + }, + { + "epoch": 0.12075676654295284, + "grad_norm": 1.0155720710754395, + "learning_rate": 0.00019243039953250655, + "loss": 3.6185, + "step": 1334 + }, + { + "epoch": 0.12084728885670318, + "grad_norm": 1.3178904056549072, + "learning_rate": 0.00019241828365592655, + "loss": 3.5569, + "step": 1335 + }, + { + "epoch": 0.12093781117045352, + "grad_norm": 0.923801600933075, + "learning_rate": 0.00019240615847282788, + "loss": 3.5147, + "step": 1336 + }, + { + "epoch": 0.12102833348420386, + "grad_norm": 0.9957558512687683, + "learning_rate": 0.0001923940239844316, + "loss": 3.6609, + "step": 1337 + }, + { + "epoch": 0.1211188557979542, + "grad_norm": 1.0449210405349731, + "learning_rate": 0.00019238188019195964, + "loss": 3.4863, + "step": 1338 + }, + { + "epoch": 0.12120937811170454, + "grad_norm": 0.9299371242523193, + "learning_rate": 0.00019236972709663487, + "loss": 3.5952, + "step": 1339 + }, + { + "epoch": 0.12129990042545487, + "grad_norm": 1.1433054208755493, + "learning_rate": 0.0001923575646996811, + "loss": 3.5433, + "step": 1340 + }, + { + "epoch": 0.12139042273920521, + "grad_norm": 1.0358242988586426, + "learning_rate": 0.00019234539300232314, + "loss": 3.5435, + "step": 1341 + }, + { + "epoch": 0.12148094505295555, + "grad_norm": 0.9494446516036987, + "learning_rate": 0.0001923332120057866, + "loss": 3.5099, + "step": 1342 + }, + { + "epoch": 0.12157146736670589, + "grad_norm": 1.068947196006775, + "learning_rate": 0.00019232102171129811, + "loss": 3.5744, + "step": 1343 + }, + { + "epoch": 0.12166198968045623, + "grad_norm": 0.9469134211540222, + "learning_rate": 0.00019230882212008528, + "loss": 3.562, + "step": 1344 + }, + { + "epoch": 0.12175251199420657, + "grad_norm": 0.9132084846496582, + "learning_rate": 0.00019229661323337657, + "loss": 3.5147, + "step": 1345 + }, + { + "epoch": 0.12184303430795691, + "grad_norm": 0.9203004837036133, + "learning_rate": 0.00019228439505240147, + "loss": 3.5774, + "step": 1346 + }, + { + "epoch": 0.12193355662170725, + "grad_norm": 0.9643454551696777, + "learning_rate": 0.00019227216757839027, + "loss": 3.5466, + "step": 1347 + }, + { + "epoch": 0.1220240789354576, + "grad_norm": 0.9245633482933044, + "learning_rate": 0.00019225993081257436, + "loss": 3.5785, + "step": 1348 + }, + { + "epoch": 0.12211460124920794, + "grad_norm": 0.9338047504425049, + "learning_rate": 0.0001922476847561859, + "loss": 3.4997, + "step": 1349 + }, + { + "epoch": 0.12220512356295828, + "grad_norm": 0.948529839515686, + "learning_rate": 0.00019223542941045817, + "loss": 3.4982, + "step": 1350 + }, + { + "epoch": 0.1222956458767086, + "grad_norm": 0.8959832787513733, + "learning_rate": 0.00019222316477662517, + "loss": 3.5094, + "step": 1351 + }, + { + "epoch": 0.12238616819045894, + "grad_norm": 0.8953176736831665, + "learning_rate": 0.00019221089085592202, + "loss": 3.5108, + "step": 1352 + }, + { + "epoch": 0.12247669050420928, + "grad_norm": 0.9416437745094299, + "learning_rate": 0.00019219860764958465, + "loss": 3.4638, + "step": 1353 + }, + { + "epoch": 0.12256721281795963, + "grad_norm": 0.9912828803062439, + "learning_rate": 0.00019218631515885006, + "loss": 3.5506, + "step": 1354 + }, + { + "epoch": 0.12265773513170997, + "grad_norm": 0.9638131856918335, + "learning_rate": 0.00019217401338495605, + "loss": 3.5068, + "step": 1355 + }, + { + "epoch": 0.12274825744546031, + "grad_norm": 0.9816718697547913, + "learning_rate": 0.0001921617023291414, + "loss": 3.4654, + "step": 1356 + }, + { + "epoch": 0.12283877975921065, + "grad_norm": 0.9565276503562927, + "learning_rate": 0.00019214938199264581, + "loss": 3.495, + "step": 1357 + }, + { + "epoch": 0.12292930207296099, + "grad_norm": 0.9229384660720825, + "learning_rate": 0.00019213705237671006, + "loss": 3.5001, + "step": 1358 + }, + { + "epoch": 0.12301982438671133, + "grad_norm": 0.9728844165802002, + "learning_rate": 0.00019212471348257562, + "loss": 3.4909, + "step": 1359 + }, + { + "epoch": 0.12311034670046167, + "grad_norm": 0.9265088438987732, + "learning_rate": 0.000192112365311485, + "loss": 3.4382, + "step": 1360 + }, + { + "epoch": 0.123200869014212, + "grad_norm": 1.0222772359848022, + "learning_rate": 0.00019210000786468179, + "loss": 3.5261, + "step": 1361 + }, + { + "epoch": 0.12329139132796234, + "grad_norm": 0.9751477837562561, + "learning_rate": 0.00019208764114341027, + "loss": 3.5201, + "step": 1362 + }, + { + "epoch": 0.12338191364171268, + "grad_norm": 0.9564617872238159, + "learning_rate": 0.0001920752651489158, + "loss": 3.4582, + "step": 1363 + }, + { + "epoch": 0.12347243595546302, + "grad_norm": 0.923428475856781, + "learning_rate": 0.00019206287988244466, + "loss": 3.4767, + "step": 1364 + }, + { + "epoch": 0.12356295826921336, + "grad_norm": 0.9109767079353333, + "learning_rate": 0.00019205048534524406, + "loss": 3.5975, + "step": 1365 + }, + { + "epoch": 0.1236534805829637, + "grad_norm": 0.8991703987121582, + "learning_rate": 0.00019203808153856207, + "loss": 3.5281, + "step": 1366 + }, + { + "epoch": 0.12374400289671404, + "grad_norm": 0.9747706651687622, + "learning_rate": 0.0001920256684636478, + "loss": 3.6229, + "step": 1367 + }, + { + "epoch": 0.12383452521046438, + "grad_norm": 0.9236463308334351, + "learning_rate": 0.00019201324612175123, + "loss": 3.6039, + "step": 1368 + }, + { + "epoch": 0.12392504752421472, + "grad_norm": 0.970248281955719, + "learning_rate": 0.00019200081451412328, + "loss": 3.4691, + "step": 1369 + }, + { + "epoch": 0.12401556983796506, + "grad_norm": 0.8713831305503845, + "learning_rate": 0.00019198837364201585, + "loss": 3.4869, + "step": 1370 + }, + { + "epoch": 0.12410609215171539, + "grad_norm": 0.9409332871437073, + "learning_rate": 0.0001919759235066817, + "loss": 3.4984, + "step": 1371 + }, + { + "epoch": 0.12419661446546573, + "grad_norm": 0.9115921854972839, + "learning_rate": 0.00019196346410937457, + "loss": 3.5122, + "step": 1372 + }, + { + "epoch": 0.12428713677921607, + "grad_norm": 0.9855613112449646, + "learning_rate": 0.00019195099545134912, + "loss": 3.5239, + "step": 1373 + }, + { + "epoch": 0.12437765909296641, + "grad_norm": 0.8684623837471008, + "learning_rate": 0.00019193851753386093, + "loss": 3.5421, + "step": 1374 + }, + { + "epoch": 0.12446818140671675, + "grad_norm": 0.9192137122154236, + "learning_rate": 0.00019192603035816656, + "loss": 3.5047, + "step": 1375 + }, + { + "epoch": 0.1245587037204671, + "grad_norm": 0.8745288848876953, + "learning_rate": 0.00019191353392552344, + "loss": 3.5405, + "step": 1376 + }, + { + "epoch": 0.12464922603421744, + "grad_norm": 0.8868715763092041, + "learning_rate": 0.00019190102823718998, + "loss": 3.4262, + "step": 1377 + }, + { + "epoch": 0.12473974834796778, + "grad_norm": 0.8993757963180542, + "learning_rate": 0.00019188851329442547, + "loss": 3.5437, + "step": 1378 + }, + { + "epoch": 0.12483027066171812, + "grad_norm": 0.8323911428451538, + "learning_rate": 0.0001918759890984902, + "loss": 3.5279, + "step": 1379 + }, + { + "epoch": 0.12492079297546846, + "grad_norm": 0.9292891025543213, + "learning_rate": 0.00019186345565064535, + "loss": 3.5397, + "step": 1380 + }, + { + "epoch": 0.1250113152892188, + "grad_norm": 0.9611436724662781, + "learning_rate": 0.00019185091295215303, + "loss": 3.5072, + "step": 1381 + }, + { + "epoch": 0.12510183760296914, + "grad_norm": 0.918283998966217, + "learning_rate": 0.00019183836100427628, + "loss": 3.4667, + "step": 1382 + }, + { + "epoch": 0.12519235991671948, + "grad_norm": 1.1492328643798828, + "learning_rate": 0.00019182579980827908, + "loss": 3.5358, + "step": 1383 + }, + { + "epoch": 0.12528288223046982, + "grad_norm": 0.8876953125, + "learning_rate": 0.00019181322936542635, + "loss": 3.5318, + "step": 1384 + }, + { + "epoch": 0.12537340454422016, + "grad_norm": 0.9587846398353577, + "learning_rate": 0.00019180064967698398, + "loss": 3.5055, + "step": 1385 + }, + { + "epoch": 0.1254639268579705, + "grad_norm": 0.9714934229850769, + "learning_rate": 0.00019178806074421866, + "loss": 3.536, + "step": 1386 + }, + { + "epoch": 0.12555444917172082, + "grad_norm": 0.9751466512680054, + "learning_rate": 0.00019177546256839812, + "loss": 3.4873, + "step": 1387 + }, + { + "epoch": 0.12564497148547116, + "grad_norm": 1.019853115081787, + "learning_rate": 0.00019176285515079102, + "loss": 3.5236, + "step": 1388 + }, + { + "epoch": 0.1257354937992215, + "grad_norm": 0.9810512661933899, + "learning_rate": 0.00019175023849266697, + "loss": 3.5788, + "step": 1389 + }, + { + "epoch": 0.12582601611297184, + "grad_norm": 1.1488577127456665, + "learning_rate": 0.00019173761259529633, + "loss": 3.5419, + "step": 1390 + }, + { + "epoch": 0.12591653842672218, + "grad_norm": 1.1745120286941528, + "learning_rate": 0.00019172497745995065, + "loss": 3.5204, + "step": 1391 + }, + { + "epoch": 0.12600706074047252, + "grad_norm": 1.1298781633377075, + "learning_rate": 0.00019171233308790225, + "loss": 3.5831, + "step": 1392 + }, + { + "epoch": 0.12609758305422286, + "grad_norm": 1.0139622688293457, + "learning_rate": 0.00019169967948042444, + "loss": 3.5082, + "step": 1393 + }, + { + "epoch": 0.1261881053679732, + "grad_norm": 0.9887393116950989, + "learning_rate": 0.0001916870166387914, + "loss": 3.4952, + "step": 1394 + }, + { + "epoch": 0.12627862768172354, + "grad_norm": 1.0016969442367554, + "learning_rate": 0.00019167434456427828, + "loss": 3.444, + "step": 1395 + }, + { + "epoch": 0.12636914999547388, + "grad_norm": 0.8871667385101318, + "learning_rate": 0.00019166166325816118, + "loss": 3.5377, + "step": 1396 + }, + { + "epoch": 0.12645967230922422, + "grad_norm": 0.9886758923530579, + "learning_rate": 0.0001916489727217171, + "loss": 3.5954, + "step": 1397 + }, + { + "epoch": 0.12655019462297457, + "grad_norm": 1.1002118587493896, + "learning_rate": 0.00019163627295622397, + "loss": 3.6343, + "step": 1398 + }, + { + "epoch": 0.1266407169367249, + "grad_norm": 1.1045169830322266, + "learning_rate": 0.00019162356396296067, + "loss": 3.51, + "step": 1399 + }, + { + "epoch": 0.12673123925047525, + "grad_norm": 1.0856785774230957, + "learning_rate": 0.00019161084574320696, + "loss": 3.4861, + "step": 1400 + }, + { + "epoch": 0.1268217615642256, + "grad_norm": 0.9421377182006836, + "learning_rate": 0.00019159811829824364, + "loss": 3.4557, + "step": 1401 + }, + { + "epoch": 0.12691228387797593, + "grad_norm": 1.1086941957473755, + "learning_rate": 0.00019158538162935225, + "loss": 3.5516, + "step": 1402 + }, + { + "epoch": 0.12700280619172627, + "grad_norm": 0.9888288378715515, + "learning_rate": 0.0001915726357378155, + "loss": 3.5028, + "step": 1403 + }, + { + "epoch": 0.1270933285054766, + "grad_norm": 1.0222643613815308, + "learning_rate": 0.0001915598806249168, + "loss": 3.5185, + "step": 1404 + }, + { + "epoch": 0.12718385081922695, + "grad_norm": 1.073885202407837, + "learning_rate": 0.00019154711629194062, + "loss": 3.4822, + "step": 1405 + }, + { + "epoch": 0.1272743731329773, + "grad_norm": 1.0235117673873901, + "learning_rate": 0.00019153434274017236, + "loss": 3.4721, + "step": 1406 + }, + { + "epoch": 0.1273648954467276, + "grad_norm": 0.8984697461128235, + "learning_rate": 0.00019152155997089823, + "loss": 3.4577, + "step": 1407 + }, + { + "epoch": 0.12745541776047795, + "grad_norm": 1.0705209970474243, + "learning_rate": 0.0001915087679854056, + "loss": 3.6095, + "step": 1408 + }, + { + "epoch": 0.1275459400742283, + "grad_norm": 0.9822291135787964, + "learning_rate": 0.0001914959667849825, + "loss": 3.5591, + "step": 1409 + }, + { + "epoch": 0.12763646238797863, + "grad_norm": 0.9398566484451294, + "learning_rate": 0.00019148315637091803, + "loss": 3.53, + "step": 1410 + }, + { + "epoch": 0.12772698470172897, + "grad_norm": 0.9197871088981628, + "learning_rate": 0.00019147033674450225, + "loss": 3.4776, + "step": 1411 + }, + { + "epoch": 0.1278175070154793, + "grad_norm": 0.966709315776825, + "learning_rate": 0.00019145750790702603, + "loss": 3.4558, + "step": 1412 + }, + { + "epoch": 0.12790802932922965, + "grad_norm": 0.8962054252624512, + "learning_rate": 0.00019144466985978128, + "loss": 3.4839, + "step": 1413 + }, + { + "epoch": 0.12799855164298, + "grad_norm": 0.9782243371009827, + "learning_rate": 0.0001914318226040608, + "loss": 3.5515, + "step": 1414 + }, + { + "epoch": 0.12808907395673033, + "grad_norm": 0.97163987159729, + "learning_rate": 0.00019141896614115824, + "loss": 3.491, + "step": 1415 + }, + { + "epoch": 0.12817959627048067, + "grad_norm": 0.8906465768814087, + "learning_rate": 0.00019140610047236833, + "loss": 3.4513, + "step": 1416 + }, + { + "epoch": 0.128270118584231, + "grad_norm": 0.9411565661430359, + "learning_rate": 0.00019139322559898658, + "loss": 3.5149, + "step": 1417 + }, + { + "epoch": 0.12836064089798135, + "grad_norm": 1.0057343244552612, + "learning_rate": 0.00019138034152230954, + "loss": 3.5091, + "step": 1418 + }, + { + "epoch": 0.1284511632117317, + "grad_norm": 0.9373046159744263, + "learning_rate": 0.0001913674482436346, + "loss": 3.5215, + "step": 1419 + }, + { + "epoch": 0.12854168552548204, + "grad_norm": 0.9086987972259521, + "learning_rate": 0.0001913545457642601, + "loss": 3.5094, + "step": 1420 + }, + { + "epoch": 0.12863220783923238, + "grad_norm": 1.0151550769805908, + "learning_rate": 0.00019134163408548535, + "loss": 3.5051, + "step": 1421 + }, + { + "epoch": 0.12872273015298272, + "grad_norm": 0.9737538695335388, + "learning_rate": 0.00019132871320861057, + "loss": 3.5201, + "step": 1422 + }, + { + "epoch": 0.12881325246673306, + "grad_norm": 0.8971673250198364, + "learning_rate": 0.00019131578313493684, + "loss": 3.5268, + "step": 1423 + }, + { + "epoch": 0.1289037747804834, + "grad_norm": 0.9950623512268066, + "learning_rate": 0.00019130284386576624, + "loss": 3.5047, + "step": 1424 + }, + { + "epoch": 0.12899429709423374, + "grad_norm": 0.9624702334403992, + "learning_rate": 0.00019128989540240178, + "loss": 3.4666, + "step": 1425 + }, + { + "epoch": 0.12908481940798408, + "grad_norm": 0.9711193442344666, + "learning_rate": 0.00019127693774614738, + "loss": 3.5022, + "step": 1426 + }, + { + "epoch": 0.12917534172173442, + "grad_norm": 1.091729998588562, + "learning_rate": 0.0001912639708983078, + "loss": 3.4588, + "step": 1427 + }, + { + "epoch": 0.12926586403548473, + "grad_norm": 1.0445271730422974, + "learning_rate": 0.00019125099486018887, + "loss": 3.4819, + "step": 1428 + }, + { + "epoch": 0.12935638634923508, + "grad_norm": 0.9670456051826477, + "learning_rate": 0.00019123800963309723, + "loss": 3.5112, + "step": 1429 + }, + { + "epoch": 0.12944690866298542, + "grad_norm": 1.0354905128479004, + "learning_rate": 0.00019122501521834053, + "loss": 3.5502, + "step": 1430 + }, + { + "epoch": 0.12953743097673576, + "grad_norm": 0.9035726189613342, + "learning_rate": 0.0001912120116172273, + "loss": 3.5438, + "step": 1431 + }, + { + "epoch": 0.1296279532904861, + "grad_norm": 1.0219416618347168, + "learning_rate": 0.000191198998831067, + "loss": 3.4761, + "step": 1432 + }, + { + "epoch": 0.12971847560423644, + "grad_norm": 0.9801256060600281, + "learning_rate": 0.00019118597686117002, + "loss": 3.4326, + "step": 1433 + }, + { + "epoch": 0.12980899791798678, + "grad_norm": 0.9733026027679443, + "learning_rate": 0.00019117294570884764, + "loss": 3.5113, + "step": 1434 + }, + { + "epoch": 0.12989952023173712, + "grad_norm": 0.966882586479187, + "learning_rate": 0.00019115990537541216, + "loss": 3.5104, + "step": 1435 + }, + { + "epoch": 0.12999004254548746, + "grad_norm": 0.934974730014801, + "learning_rate": 0.00019114685586217665, + "loss": 3.5122, + "step": 1436 + }, + { + "epoch": 0.1300805648592378, + "grad_norm": 0.9903215765953064, + "learning_rate": 0.00019113379717045529, + "loss": 3.471, + "step": 1437 + }, + { + "epoch": 0.13017108717298814, + "grad_norm": 1.0421541929244995, + "learning_rate": 0.00019112072930156302, + "loss": 3.5033, + "step": 1438 + }, + { + "epoch": 0.13026160948673848, + "grad_norm": 1.0076266527175903, + "learning_rate": 0.00019110765225681582, + "loss": 3.4898, + "step": 1439 + }, + { + "epoch": 0.13035213180048882, + "grad_norm": 0.9797717928886414, + "learning_rate": 0.0001910945660375305, + "loss": 3.4908, + "step": 1440 + }, + { + "epoch": 0.13044265411423916, + "grad_norm": 1.0033999681472778, + "learning_rate": 0.00019108147064502494, + "loss": 3.4859, + "step": 1441 + }, + { + "epoch": 0.1305331764279895, + "grad_norm": 0.9333057403564453, + "learning_rate": 0.00019106836608061772, + "loss": 3.4984, + "step": 1442 + }, + { + "epoch": 0.13062369874173985, + "grad_norm": 1.0323694944381714, + "learning_rate": 0.00019105525234562856, + "loss": 3.4998, + "step": 1443 + }, + { + "epoch": 0.1307142210554902, + "grad_norm": 0.9501261115074158, + "learning_rate": 0.00019104212944137796, + "loss": 3.4632, + "step": 1444 + }, + { + "epoch": 0.13080474336924053, + "grad_norm": 0.9542381763458252, + "learning_rate": 0.0001910289973691874, + "loss": 3.4215, + "step": 1445 + }, + { + "epoch": 0.13089526568299087, + "grad_norm": 1.1503580808639526, + "learning_rate": 0.00019101585613037934, + "loss": 3.53, + "step": 1446 + }, + { + "epoch": 0.1309857879967412, + "grad_norm": 0.8851205706596375, + "learning_rate": 0.00019100270572627704, + "loss": 3.5032, + "step": 1447 + }, + { + "epoch": 0.13107631031049152, + "grad_norm": 1.1074649095535278, + "learning_rate": 0.00019098954615820476, + "loss": 3.5025, + "step": 1448 + }, + { + "epoch": 0.13116683262424186, + "grad_norm": 1.0411019325256348, + "learning_rate": 0.00019097637742748766, + "loss": 3.5763, + "step": 1449 + }, + { + "epoch": 0.1312573549379922, + "grad_norm": 0.9861066937446594, + "learning_rate": 0.00019096319953545185, + "loss": 3.5056, + "step": 1450 + }, + { + "epoch": 0.13134787725174255, + "grad_norm": 1.028899073600769, + "learning_rate": 0.00019095001248342435, + "loss": 3.5157, + "step": 1451 + }, + { + "epoch": 0.13143839956549289, + "grad_norm": 0.9953543543815613, + "learning_rate": 0.00019093681627273306, + "loss": 3.5027, + "step": 1452 + }, + { + "epoch": 0.13152892187924323, + "grad_norm": 0.969194233417511, + "learning_rate": 0.00019092361090470688, + "loss": 3.4318, + "step": 1453 + }, + { + "epoch": 0.13161944419299357, + "grad_norm": 0.9358369708061218, + "learning_rate": 0.00019091039638067554, + "loss": 3.488, + "step": 1454 + }, + { + "epoch": 0.1317099665067439, + "grad_norm": 1.0051780939102173, + "learning_rate": 0.00019089717270196982, + "loss": 3.4563, + "step": 1455 + }, + { + "epoch": 0.13180048882049425, + "grad_norm": 0.9302476048469543, + "learning_rate": 0.00019088393986992124, + "loss": 3.4359, + "step": 1456 + }, + { + "epoch": 0.1318910111342446, + "grad_norm": 1.03059720993042, + "learning_rate": 0.00019087069788586243, + "loss": 3.4482, + "step": 1457 + }, + { + "epoch": 0.13198153344799493, + "grad_norm": 0.8484632968902588, + "learning_rate": 0.00019085744675112682, + "loss": 3.5185, + "step": 1458 + }, + { + "epoch": 0.13207205576174527, + "grad_norm": 0.9625447988510132, + "learning_rate": 0.00019084418646704882, + "loss": 3.4287, + "step": 1459 + }, + { + "epoch": 0.1321625780754956, + "grad_norm": 0.9219020009040833, + "learning_rate": 0.0001908309170349637, + "loss": 3.4854, + "step": 1460 + }, + { + "epoch": 0.13225310038924595, + "grad_norm": 0.8788402676582336, + "learning_rate": 0.00019081763845620777, + "loss": 3.4869, + "step": 1461 + }, + { + "epoch": 0.1323436227029963, + "grad_norm": 0.9385944604873657, + "learning_rate": 0.0001908043507321181, + "loss": 3.4948, + "step": 1462 + }, + { + "epoch": 0.13243414501674663, + "grad_norm": 0.9940325021743774, + "learning_rate": 0.00019079105386403283, + "loss": 3.4377, + "step": 1463 + }, + { + "epoch": 0.13252466733049698, + "grad_norm": 0.9103172421455383, + "learning_rate": 0.00019077774785329087, + "loss": 3.4933, + "step": 1464 + }, + { + "epoch": 0.13261518964424732, + "grad_norm": 0.9533010721206665, + "learning_rate": 0.00019076443270123222, + "loss": 3.4554, + "step": 1465 + }, + { + "epoch": 0.13270571195799766, + "grad_norm": 0.9723507761955261, + "learning_rate": 0.00019075110840919765, + "loss": 3.5272, + "step": 1466 + }, + { + "epoch": 0.132796234271748, + "grad_norm": 0.9802050590515137, + "learning_rate": 0.00019073777497852898, + "loss": 3.5226, + "step": 1467 + }, + { + "epoch": 0.13288675658549834, + "grad_norm": 1.023480772972107, + "learning_rate": 0.00019072443241056883, + "loss": 3.4561, + "step": 1468 + }, + { + "epoch": 0.13297727889924865, + "grad_norm": 1.0592081546783447, + "learning_rate": 0.00019071108070666082, + "loss": 3.5974, + "step": 1469 + }, + { + "epoch": 0.133067801212999, + "grad_norm": 1.0560920238494873, + "learning_rate": 0.00019069771986814947, + "loss": 3.4346, + "step": 1470 + }, + { + "epoch": 0.13315832352674933, + "grad_norm": 0.8262521624565125, + "learning_rate": 0.00019068434989638023, + "loss": 3.3677, + "step": 1471 + }, + { + "epoch": 0.13324884584049967, + "grad_norm": 1.0982695817947388, + "learning_rate": 0.00019067097079269942, + "loss": 3.4826, + "step": 1472 + }, + { + "epoch": 0.13333936815425002, + "grad_norm": 0.9667245149612427, + "learning_rate": 0.00019065758255845433, + "loss": 3.358, + "step": 1473 + }, + { + "epoch": 0.13342989046800036, + "grad_norm": 0.8388450741767883, + "learning_rate": 0.00019064418519499317, + "loss": 3.4301, + "step": 1474 + }, + { + "epoch": 0.1335204127817507, + "grad_norm": 0.9396435618400574, + "learning_rate": 0.000190630778703665, + "loss": 3.4448, + "step": 1475 + }, + { + "epoch": 0.13361093509550104, + "grad_norm": 0.8738861083984375, + "learning_rate": 0.00019061736308581995, + "loss": 3.544, + "step": 1476 + }, + { + "epoch": 0.13370145740925138, + "grad_norm": 0.9134858250617981, + "learning_rate": 0.00019060393834280888, + "loss": 3.4979, + "step": 1477 + }, + { + "epoch": 0.13379197972300172, + "grad_norm": 1.056847095489502, + "learning_rate": 0.0001905905044759837, + "loss": 3.4604, + "step": 1478 + }, + { + "epoch": 0.13388250203675206, + "grad_norm": 0.9403647184371948, + "learning_rate": 0.0001905770614866972, + "loss": 3.5034, + "step": 1479 + }, + { + "epoch": 0.1339730243505024, + "grad_norm": 1.0279815196990967, + "learning_rate": 0.0001905636093763031, + "loss": 3.4622, + "step": 1480 + }, + { + "epoch": 0.13406354666425274, + "grad_norm": 0.8829526305198669, + "learning_rate": 0.00019055014814615598, + "loss": 3.4812, + "step": 1481 + }, + { + "epoch": 0.13415406897800308, + "grad_norm": 0.9637207388877869, + "learning_rate": 0.00019053667779761146, + "loss": 3.5299, + "step": 1482 + }, + { + "epoch": 0.13424459129175342, + "grad_norm": 1.0193819999694824, + "learning_rate": 0.00019052319833202597, + "loss": 3.4526, + "step": 1483 + }, + { + "epoch": 0.13433511360550376, + "grad_norm": 0.8556128144264221, + "learning_rate": 0.00019050970975075684, + "loss": 3.4781, + "step": 1484 + }, + { + "epoch": 0.1344256359192541, + "grad_norm": 0.9707963466644287, + "learning_rate": 0.00019049621205516242, + "loss": 3.503, + "step": 1485 + }, + { + "epoch": 0.13451615823300445, + "grad_norm": 0.9474601745605469, + "learning_rate": 0.00019048270524660196, + "loss": 3.4621, + "step": 1486 + }, + { + "epoch": 0.1346066805467548, + "grad_norm": 0.9763953685760498, + "learning_rate": 0.00019046918932643555, + "loss": 3.4991, + "step": 1487 + }, + { + "epoch": 0.13469720286050513, + "grad_norm": 0.8932214975357056, + "learning_rate": 0.00019045566429602424, + "loss": 3.4698, + "step": 1488 + }, + { + "epoch": 0.13478772517425544, + "grad_norm": 0.9706962704658508, + "learning_rate": 0.00019044213015673, + "loss": 3.3941, + "step": 1489 + }, + { + "epoch": 0.13487824748800578, + "grad_norm": 0.9907398223876953, + "learning_rate": 0.00019042858690991574, + "loss": 3.4483, + "step": 1490 + }, + { + "epoch": 0.13496876980175612, + "grad_norm": 1.020414113998413, + "learning_rate": 0.00019041503455694524, + "loss": 3.4209, + "step": 1491 + }, + { + "epoch": 0.13505929211550646, + "grad_norm": 0.9534351229667664, + "learning_rate": 0.00019040147309918326, + "loss": 3.442, + "step": 1492 + }, + { + "epoch": 0.1351498144292568, + "grad_norm": 0.9675535559654236, + "learning_rate": 0.00019038790253799535, + "loss": 3.568, + "step": 1493 + }, + { + "epoch": 0.13524033674300714, + "grad_norm": 0.9924291372299194, + "learning_rate": 0.0001903743228747482, + "loss": 3.4718, + "step": 1494 + }, + { + "epoch": 0.13533085905675749, + "grad_norm": 0.9964470863342285, + "learning_rate": 0.00019036073411080916, + "loss": 3.5284, + "step": 1495 + }, + { + "epoch": 0.13542138137050783, + "grad_norm": 1.0707465410232544, + "learning_rate": 0.00019034713624754672, + "loss": 3.5124, + "step": 1496 + }, + { + "epoch": 0.13551190368425817, + "grad_norm": 1.0072213411331177, + "learning_rate": 0.0001903335292863301, + "loss": 3.5582, + "step": 1497 + }, + { + "epoch": 0.1356024259980085, + "grad_norm": 1.0074089765548706, + "learning_rate": 0.00019031991322852955, + "loss": 3.5065, + "step": 1498 + }, + { + "epoch": 0.13569294831175885, + "grad_norm": 1.06528902053833, + "learning_rate": 0.0001903062880755162, + "loss": 3.5091, + "step": 1499 + }, + { + "epoch": 0.1357834706255092, + "grad_norm": 1.0103434324264526, + "learning_rate": 0.00019029265382866214, + "loss": 3.4023, + "step": 1500 + }, + { + "epoch": 0.13587399293925953, + "grad_norm": 1.0442228317260742, + "learning_rate": 0.00019027901048934028, + "loss": 3.3906, + "step": 1501 + }, + { + "epoch": 0.13596451525300987, + "grad_norm": 0.9370065927505493, + "learning_rate": 0.00019026535805892456, + "loss": 3.5163, + "step": 1502 + }, + { + "epoch": 0.1360550375667602, + "grad_norm": 0.9656304717063904, + "learning_rate": 0.00019025169653878973, + "loss": 3.5767, + "step": 1503 + }, + { + "epoch": 0.13614555988051055, + "grad_norm": 1.059404730796814, + "learning_rate": 0.00019023802593031154, + "loss": 3.3782, + "step": 1504 + }, + { + "epoch": 0.1362360821942609, + "grad_norm": 0.9082018136978149, + "learning_rate": 0.0001902243462348666, + "loss": 3.4719, + "step": 1505 + }, + { + "epoch": 0.13632660450801123, + "grad_norm": 0.9529256820678711, + "learning_rate": 0.0001902106574538325, + "loss": 3.4799, + "step": 1506 + }, + { + "epoch": 0.13641712682176158, + "grad_norm": 1.0089010000228882, + "learning_rate": 0.0001901969595885876, + "loss": 3.4298, + "step": 1507 + }, + { + "epoch": 0.13650764913551192, + "grad_norm": 0.9658178687095642, + "learning_rate": 0.0001901832526405114, + "loss": 3.4837, + "step": 1508 + }, + { + "epoch": 0.13659817144926226, + "grad_norm": 0.8518688678741455, + "learning_rate": 0.0001901695366109841, + "loss": 3.4015, + "step": 1509 + }, + { + "epoch": 0.13668869376301257, + "grad_norm": 1.0004554986953735, + "learning_rate": 0.00019015581150138693, + "loss": 3.4847, + "step": 1510 + }, + { + "epoch": 0.1367792160767629, + "grad_norm": 1.018133521080017, + "learning_rate": 0.00019014207731310202, + "loss": 3.4175, + "step": 1511 + }, + { + "epoch": 0.13686973839051325, + "grad_norm": 0.9288983941078186, + "learning_rate": 0.00019012833404751235, + "loss": 3.4276, + "step": 1512 + }, + { + "epoch": 0.1369602607042636, + "grad_norm": 1.074337124824524, + "learning_rate": 0.00019011458170600193, + "loss": 3.4433, + "step": 1513 + }, + { + "epoch": 0.13705078301801393, + "grad_norm": 0.8855864405632019, + "learning_rate": 0.0001901008202899556, + "loss": 3.4224, + "step": 1514 + }, + { + "epoch": 0.13714130533176427, + "grad_norm": 1.0131200551986694, + "learning_rate": 0.00019008704980075916, + "loss": 3.4568, + "step": 1515 + }, + { + "epoch": 0.13723182764551461, + "grad_norm": 0.971107006072998, + "learning_rate": 0.00019007327023979923, + "loss": 3.4019, + "step": 1516 + }, + { + "epoch": 0.13732234995926496, + "grad_norm": 0.9484931230545044, + "learning_rate": 0.00019005948160846344, + "loss": 3.4329, + "step": 1517 + }, + { + "epoch": 0.1374128722730153, + "grad_norm": 1.075060486793518, + "learning_rate": 0.00019004568390814037, + "loss": 3.4524, + "step": 1518 + }, + { + "epoch": 0.13750339458676564, + "grad_norm": 1.008601427078247, + "learning_rate": 0.00019003187714021938, + "loss": 3.5014, + "step": 1519 + }, + { + "epoch": 0.13759391690051598, + "grad_norm": 0.9233218431472778, + "learning_rate": 0.0001900180613060908, + "loss": 3.4856, + "step": 1520 + }, + { + "epoch": 0.13768443921426632, + "grad_norm": 1.0939658880233765, + "learning_rate": 0.00019000423640714594, + "loss": 3.4835, + "step": 1521 + }, + { + "epoch": 0.13777496152801666, + "grad_norm": 0.95887291431427, + "learning_rate": 0.0001899904024447769, + "loss": 3.4504, + "step": 1522 + }, + { + "epoch": 0.137865483841767, + "grad_norm": 1.1146246194839478, + "learning_rate": 0.0001899765594203768, + "loss": 3.5101, + "step": 1523 + }, + { + "epoch": 0.13795600615551734, + "grad_norm": 0.9778677225112915, + "learning_rate": 0.00018996270733533964, + "loss": 3.4009, + "step": 1524 + }, + { + "epoch": 0.13804652846926768, + "grad_norm": 0.945473849773407, + "learning_rate": 0.00018994884619106031, + "loss": 3.4917, + "step": 1525 + }, + { + "epoch": 0.13813705078301802, + "grad_norm": 1.044701099395752, + "learning_rate": 0.00018993497598893466, + "loss": 3.3803, + "step": 1526 + }, + { + "epoch": 0.13822757309676836, + "grad_norm": 1.0342203378677368, + "learning_rate": 0.00018992109673035936, + "loss": 3.435, + "step": 1527 + }, + { + "epoch": 0.1383180954105187, + "grad_norm": 1.0020346641540527, + "learning_rate": 0.00018990720841673207, + "loss": 3.4686, + "step": 1528 + }, + { + "epoch": 0.13840861772426905, + "grad_norm": 0.9971665740013123, + "learning_rate": 0.00018989331104945137, + "loss": 3.4112, + "step": 1529 + }, + { + "epoch": 0.13849914003801936, + "grad_norm": 0.9247247576713562, + "learning_rate": 0.0001898794046299167, + "loss": 3.4628, + "step": 1530 + }, + { + "epoch": 0.1385896623517697, + "grad_norm": 0.9933388233184814, + "learning_rate": 0.00018986548915952845, + "loss": 3.4662, + "step": 1531 + }, + { + "epoch": 0.13868018466552004, + "grad_norm": 0.9920099377632141, + "learning_rate": 0.0001898515646396879, + "loss": 3.4703, + "step": 1532 + }, + { + "epoch": 0.13877070697927038, + "grad_norm": 0.9866044521331787, + "learning_rate": 0.00018983763107179727, + "loss": 3.4612, + "step": 1533 + }, + { + "epoch": 0.13886122929302072, + "grad_norm": 0.9920278191566467, + "learning_rate": 0.0001898236884572596, + "loss": 3.4442, + "step": 1534 + }, + { + "epoch": 0.13895175160677106, + "grad_norm": 0.9837521910667419, + "learning_rate": 0.00018980973679747897, + "loss": 3.4224, + "step": 1535 + }, + { + "epoch": 0.1390422739205214, + "grad_norm": 0.9028905034065247, + "learning_rate": 0.00018979577609386033, + "loss": 3.3831, + "step": 1536 + }, + { + "epoch": 0.13913279623427174, + "grad_norm": 1.0407545566558838, + "learning_rate": 0.0001897818063478095, + "loss": 3.4652, + "step": 1537 + }, + { + "epoch": 0.13922331854802208, + "grad_norm": 1.0021555423736572, + "learning_rate": 0.0001897678275607332, + "loss": 3.5014, + "step": 1538 + }, + { + "epoch": 0.13931384086177243, + "grad_norm": 1.2132344245910645, + "learning_rate": 0.00018975383973403914, + "loss": 3.4108, + "step": 1539 + }, + { + "epoch": 0.13940436317552277, + "grad_norm": 1.033569574356079, + "learning_rate": 0.00018973984286913584, + "loss": 3.4222, + "step": 1540 + }, + { + "epoch": 0.1394948854892731, + "grad_norm": 0.9325762391090393, + "learning_rate": 0.00018972583696743285, + "loss": 3.4622, + "step": 1541 + }, + { + "epoch": 0.13958540780302345, + "grad_norm": 0.9818862080574036, + "learning_rate": 0.0001897118220303405, + "loss": 3.4873, + "step": 1542 + }, + { + "epoch": 0.1396759301167738, + "grad_norm": 1.0688445568084717, + "learning_rate": 0.00018969779805927018, + "loss": 3.459, + "step": 1543 + }, + { + "epoch": 0.13976645243052413, + "grad_norm": 0.9991565346717834, + "learning_rate": 0.000189683765055634, + "loss": 3.4183, + "step": 1544 + }, + { + "epoch": 0.13985697474427447, + "grad_norm": 1.1223208904266357, + "learning_rate": 0.00018966972302084516, + "loss": 3.4349, + "step": 1545 + }, + { + "epoch": 0.1399474970580248, + "grad_norm": 1.0523415803909302, + "learning_rate": 0.00018965567195631765, + "loss": 3.4106, + "step": 1546 + }, + { + "epoch": 0.14003801937177515, + "grad_norm": 1.0131930112838745, + "learning_rate": 0.00018964161186346646, + "loss": 3.3923, + "step": 1547 + }, + { + "epoch": 0.1401285416855255, + "grad_norm": 0.9429630637168884, + "learning_rate": 0.0001896275427437074, + "loss": 3.3949, + "step": 1548 + }, + { + "epoch": 0.14021906399927583, + "grad_norm": 1.0264955759048462, + "learning_rate": 0.00018961346459845722, + "loss": 3.4737, + "step": 1549 + }, + { + "epoch": 0.14030958631302617, + "grad_norm": 1.078149437904358, + "learning_rate": 0.00018959937742913359, + "loss": 3.4399, + "step": 1550 + }, + { + "epoch": 0.1404001086267765, + "grad_norm": 0.9134674072265625, + "learning_rate": 0.00018958528123715513, + "loss": 3.4693, + "step": 1551 + }, + { + "epoch": 0.14049063094052683, + "grad_norm": 1.0741221904754639, + "learning_rate": 0.0001895711760239413, + "loss": 3.415, + "step": 1552 + }, + { + "epoch": 0.14058115325427717, + "grad_norm": 0.9441749453544617, + "learning_rate": 0.00018955706179091248, + "loss": 3.4556, + "step": 1553 + }, + { + "epoch": 0.1406716755680275, + "grad_norm": 1.1377722024917603, + "learning_rate": 0.00018954293853949, + "loss": 3.4593, + "step": 1554 + }, + { + "epoch": 0.14076219788177785, + "grad_norm": 1.0758979320526123, + "learning_rate": 0.00018952880627109606, + "loss": 3.3585, + "step": 1555 + }, + { + "epoch": 0.1408527201955282, + "grad_norm": 0.9907482266426086, + "learning_rate": 0.00018951466498715378, + "loss": 3.4805, + "step": 1556 + }, + { + "epoch": 0.14094324250927853, + "grad_norm": 0.977868378162384, + "learning_rate": 0.0001895005146890872, + "loss": 3.4679, + "step": 1557 + }, + { + "epoch": 0.14103376482302887, + "grad_norm": 0.994621753692627, + "learning_rate": 0.0001894863553783212, + "loss": 3.4203, + "step": 1558 + }, + { + "epoch": 0.14112428713677921, + "grad_norm": 0.9056159853935242, + "learning_rate": 0.00018947218705628167, + "loss": 3.3717, + "step": 1559 + }, + { + "epoch": 0.14121480945052955, + "grad_norm": 0.9723846316337585, + "learning_rate": 0.00018945800972439538, + "loss": 3.4537, + "step": 1560 + }, + { + "epoch": 0.1413053317642799, + "grad_norm": 1.0195595026016235, + "learning_rate": 0.00018944382338408994, + "loss": 3.4347, + "step": 1561 + }, + { + "epoch": 0.14139585407803024, + "grad_norm": 0.9769614934921265, + "learning_rate": 0.00018942962803679395, + "loss": 3.4331, + "step": 1562 + }, + { + "epoch": 0.14148637639178058, + "grad_norm": 0.9903321862220764, + "learning_rate": 0.0001894154236839368, + "loss": 3.3952, + "step": 1563 + }, + { + "epoch": 0.14157689870553092, + "grad_norm": 1.066959261894226, + "learning_rate": 0.00018940121032694898, + "loss": 3.4105, + "step": 1564 + }, + { + "epoch": 0.14166742101928126, + "grad_norm": 0.9177101850509644, + "learning_rate": 0.00018938698796726176, + "loss": 3.4389, + "step": 1565 + }, + { + "epoch": 0.1417579433330316, + "grad_norm": 0.9230473041534424, + "learning_rate": 0.00018937275660630727, + "loss": 3.4248, + "step": 1566 + }, + { + "epoch": 0.14184846564678194, + "grad_norm": 1.0110989809036255, + "learning_rate": 0.00018935851624551862, + "loss": 3.3903, + "step": 1567 + }, + { + "epoch": 0.14193898796053228, + "grad_norm": 0.9214289784431458, + "learning_rate": 0.00018934426688632986, + "loss": 3.4403, + "step": 1568 + }, + { + "epoch": 0.14202951027428262, + "grad_norm": 0.9641263484954834, + "learning_rate": 0.0001893300085301758, + "loss": 3.4886, + "step": 1569 + }, + { + "epoch": 0.14212003258803296, + "grad_norm": 1.065324306488037, + "learning_rate": 0.0001893157411784924, + "loss": 3.4602, + "step": 1570 + }, + { + "epoch": 0.14221055490178328, + "grad_norm": 1.0880188941955566, + "learning_rate": 0.00018930146483271627, + "loss": 3.4612, + "step": 1571 + }, + { + "epoch": 0.14230107721553362, + "grad_norm": 1.1627023220062256, + "learning_rate": 0.00018928717949428508, + "loss": 3.46, + "step": 1572 + }, + { + "epoch": 0.14239159952928396, + "grad_norm": 0.9831439256668091, + "learning_rate": 0.00018927288516463737, + "loss": 3.5054, + "step": 1573 + }, + { + "epoch": 0.1424821218430343, + "grad_norm": 1.106811761856079, + "learning_rate": 0.00018925858184521256, + "loss": 3.3911, + "step": 1574 + }, + { + "epoch": 0.14257264415678464, + "grad_norm": 0.993094265460968, + "learning_rate": 0.000189244269537451, + "loss": 3.4724, + "step": 1575 + }, + { + "epoch": 0.14266316647053498, + "grad_norm": 1.1177440881729126, + "learning_rate": 0.00018922994824279395, + "loss": 3.4473, + "step": 1576 + }, + { + "epoch": 0.14275368878428532, + "grad_norm": 0.9967995882034302, + "learning_rate": 0.00018921561796268353, + "loss": 3.4442, + "step": 1577 + }, + { + "epoch": 0.14284421109803566, + "grad_norm": 0.9261888265609741, + "learning_rate": 0.00018920127869856282, + "loss": 3.488, + "step": 1578 + }, + { + "epoch": 0.142934733411786, + "grad_norm": 0.9250870943069458, + "learning_rate": 0.0001891869304518758, + "loss": 3.4093, + "step": 1579 + }, + { + "epoch": 0.14302525572553634, + "grad_norm": 0.9998716115951538, + "learning_rate": 0.00018917257322406734, + "loss": 3.4525, + "step": 1580 + }, + { + "epoch": 0.14311577803928668, + "grad_norm": 0.977238118648529, + "learning_rate": 0.0001891582070165832, + "loss": 3.3836, + "step": 1581 + }, + { + "epoch": 0.14320630035303702, + "grad_norm": 0.9520090222358704, + "learning_rate": 0.00018914383183087002, + "loss": 3.4572, + "step": 1582 + }, + { + "epoch": 0.14329682266678737, + "grad_norm": 1.0415148735046387, + "learning_rate": 0.00018912944766837542, + "loss": 3.4392, + "step": 1583 + }, + { + "epoch": 0.1433873449805377, + "grad_norm": 0.9595130681991577, + "learning_rate": 0.00018911505453054786, + "loss": 3.4015, + "step": 1584 + }, + { + "epoch": 0.14347786729428805, + "grad_norm": 0.9924153685569763, + "learning_rate": 0.0001891006524188368, + "loss": 3.3964, + "step": 1585 + }, + { + "epoch": 0.1435683896080384, + "grad_norm": 0.8610692024230957, + "learning_rate": 0.00018908624133469245, + "loss": 3.3727, + "step": 1586 + }, + { + "epoch": 0.14365891192178873, + "grad_norm": 1.0391387939453125, + "learning_rate": 0.00018907182127956605, + "loss": 3.5463, + "step": 1587 + }, + { + "epoch": 0.14374943423553907, + "grad_norm": 0.894811749458313, + "learning_rate": 0.00018905739225490967, + "loss": 3.465, + "step": 1588 + }, + { + "epoch": 0.1438399565492894, + "grad_norm": 0.9358307719230652, + "learning_rate": 0.00018904295426217633, + "loss": 3.4852, + "step": 1589 + }, + { + "epoch": 0.14393047886303975, + "grad_norm": 0.880241334438324, + "learning_rate": 0.00018902850730281992, + "loss": 3.3983, + "step": 1590 + }, + { + "epoch": 0.1440210011767901, + "grad_norm": 0.9443881511688232, + "learning_rate": 0.00018901405137829527, + "loss": 3.4337, + "step": 1591 + }, + { + "epoch": 0.1441115234905404, + "grad_norm": 0.9192736744880676, + "learning_rate": 0.0001889995864900581, + "loss": 3.4605, + "step": 1592 + }, + { + "epoch": 0.14420204580429075, + "grad_norm": 1.04524827003479, + "learning_rate": 0.000188985112639565, + "loss": 3.425, + "step": 1593 + }, + { + "epoch": 0.1442925681180411, + "grad_norm": 0.9558252096176147, + "learning_rate": 0.00018897062982827344, + "loss": 3.4396, + "step": 1594 + }, + { + "epoch": 0.14438309043179143, + "grad_norm": 0.8985421657562256, + "learning_rate": 0.00018895613805764195, + "loss": 3.399, + "step": 1595 + }, + { + "epoch": 0.14447361274554177, + "grad_norm": 0.9235852956771851, + "learning_rate": 0.00018894163732912977, + "loss": 3.4708, + "step": 1596 + }, + { + "epoch": 0.1445641350592921, + "grad_norm": 1.1525856256484985, + "learning_rate": 0.00018892712764419716, + "loss": 3.4202, + "step": 1597 + }, + { + "epoch": 0.14465465737304245, + "grad_norm": 0.9813726544380188, + "learning_rate": 0.0001889126090043052, + "loss": 3.4341, + "step": 1598 + }, + { + "epoch": 0.1447451796867928, + "grad_norm": 0.9528858065605164, + "learning_rate": 0.00018889808141091597, + "loss": 3.4332, + "step": 1599 + }, + { + "epoch": 0.14483570200054313, + "grad_norm": 0.964176595211029, + "learning_rate": 0.00018888354486549237, + "loss": 3.427, + "step": 1600 + }, + { + "epoch": 0.14492622431429347, + "grad_norm": 0.9885776042938232, + "learning_rate": 0.00018886899936949823, + "loss": 3.4011, + "step": 1601 + }, + { + "epoch": 0.1450167466280438, + "grad_norm": 0.9918528199195862, + "learning_rate": 0.00018885444492439829, + "loss": 3.4817, + "step": 1602 + }, + { + "epoch": 0.14510726894179415, + "grad_norm": 0.9491215944290161, + "learning_rate": 0.00018883988153165813, + "loss": 3.3567, + "step": 1603 + }, + { + "epoch": 0.1451977912555445, + "grad_norm": 0.9561818242073059, + "learning_rate": 0.00018882530919274443, + "loss": 3.3902, + "step": 1604 + }, + { + "epoch": 0.14528831356929484, + "grad_norm": 0.9810082316398621, + "learning_rate": 0.00018881072790912445, + "loss": 3.3979, + "step": 1605 + }, + { + "epoch": 0.14537883588304518, + "grad_norm": 0.985683023929596, + "learning_rate": 0.0001887961376822666, + "loss": 3.3853, + "step": 1606 + }, + { + "epoch": 0.14546935819679552, + "grad_norm": 0.9815475940704346, + "learning_rate": 0.00018878153851364013, + "loss": 3.4275, + "step": 1607 + }, + { + "epoch": 0.14555988051054586, + "grad_norm": 1.012288212776184, + "learning_rate": 0.00018876693040471517, + "loss": 3.367, + "step": 1608 + }, + { + "epoch": 0.1456504028242962, + "grad_norm": 0.8871361613273621, + "learning_rate": 0.00018875231335696276, + "loss": 3.4282, + "step": 1609 + }, + { + "epoch": 0.14574092513804654, + "grad_norm": 0.9624701738357544, + "learning_rate": 0.0001887376873718548, + "loss": 3.4779, + "step": 1610 + }, + { + "epoch": 0.14583144745179688, + "grad_norm": 1.0587818622589111, + "learning_rate": 0.00018872305245086414, + "loss": 3.3712, + "step": 1611 + }, + { + "epoch": 0.1459219697655472, + "grad_norm": 0.9896037578582764, + "learning_rate": 0.00018870840859546456, + "loss": 3.4185, + "step": 1612 + }, + { + "epoch": 0.14601249207929753, + "grad_norm": 0.9666143655776978, + "learning_rate": 0.00018869375580713062, + "loss": 3.4198, + "step": 1613 + }, + { + "epoch": 0.14610301439304788, + "grad_norm": 0.9257593154907227, + "learning_rate": 0.00018867909408733794, + "loss": 3.3719, + "step": 1614 + }, + { + "epoch": 0.14619353670679822, + "grad_norm": 0.8875561356544495, + "learning_rate": 0.00018866442343756288, + "loss": 3.4509, + "step": 1615 + }, + { + "epoch": 0.14628405902054856, + "grad_norm": 0.9463050365447998, + "learning_rate": 0.00018864974385928283, + "loss": 3.3477, + "step": 1616 + }, + { + "epoch": 0.1463745813342989, + "grad_norm": 0.9890984296798706, + "learning_rate": 0.00018863505535397597, + "loss": 3.4441, + "step": 1617 + }, + { + "epoch": 0.14646510364804924, + "grad_norm": 1.125440001487732, + "learning_rate": 0.00018862035792312147, + "loss": 3.3712, + "step": 1618 + }, + { + "epoch": 0.14655562596179958, + "grad_norm": 1.0327622890472412, + "learning_rate": 0.00018860565156819935, + "loss": 3.4126, + "step": 1619 + }, + { + "epoch": 0.14664614827554992, + "grad_norm": 0.9664136171340942, + "learning_rate": 0.00018859093629069058, + "loss": 3.3958, + "step": 1620 + }, + { + "epoch": 0.14673667058930026, + "grad_norm": 1.0481597185134888, + "learning_rate": 0.00018857621209207692, + "loss": 3.4617, + "step": 1621 + }, + { + "epoch": 0.1468271929030506, + "grad_norm": 1.0612270832061768, + "learning_rate": 0.0001885614789738411, + "loss": 3.429, + "step": 1622 + }, + { + "epoch": 0.14691771521680094, + "grad_norm": 1.0658665895462036, + "learning_rate": 0.0001885467369374668, + "loss": 3.4412, + "step": 1623 + }, + { + "epoch": 0.14700823753055128, + "grad_norm": 1.0005717277526855, + "learning_rate": 0.00018853198598443852, + "loss": 3.4411, + "step": 1624 + }, + { + "epoch": 0.14709875984430162, + "grad_norm": 0.9699280858039856, + "learning_rate": 0.00018851722611624164, + "loss": 3.3876, + "step": 1625 + }, + { + "epoch": 0.14718928215805197, + "grad_norm": 1.086168646812439, + "learning_rate": 0.00018850245733436255, + "loss": 3.3776, + "step": 1626 + }, + { + "epoch": 0.1472798044718023, + "grad_norm": 1.1531916856765747, + "learning_rate": 0.00018848767964028845, + "loss": 3.4189, + "step": 1627 + }, + { + "epoch": 0.14737032678555265, + "grad_norm": 1.0791008472442627, + "learning_rate": 0.00018847289303550738, + "loss": 3.4613, + "step": 1628 + }, + { + "epoch": 0.147460849099303, + "grad_norm": 1.0842652320861816, + "learning_rate": 0.0001884580975215084, + "loss": 3.424, + "step": 1629 + }, + { + "epoch": 0.14755137141305333, + "grad_norm": 1.0409084558486938, + "learning_rate": 0.00018844329309978145, + "loss": 3.4281, + "step": 1630 + }, + { + "epoch": 0.14764189372680367, + "grad_norm": 1.0114672183990479, + "learning_rate": 0.0001884284797718173, + "loss": 3.4209, + "step": 1631 + }, + { + "epoch": 0.147732416040554, + "grad_norm": 0.9554311037063599, + "learning_rate": 0.00018841365753910765, + "loss": 3.4625, + "step": 1632 + }, + { + "epoch": 0.14782293835430432, + "grad_norm": 1.119260311126709, + "learning_rate": 0.0001883988264031451, + "loss": 3.4175, + "step": 1633 + }, + { + "epoch": 0.14791346066805466, + "grad_norm": 0.9265503883361816, + "learning_rate": 0.00018838398636542312, + "loss": 3.4327, + "step": 1634 + }, + { + "epoch": 0.148003982981805, + "grad_norm": 0.9685906767845154, + "learning_rate": 0.00018836913742743618, + "loss": 3.3286, + "step": 1635 + }, + { + "epoch": 0.14809450529555535, + "grad_norm": 0.9059945940971375, + "learning_rate": 0.0001883542795906795, + "loss": 3.3457, + "step": 1636 + }, + { + "epoch": 0.1481850276093057, + "grad_norm": 1.1037979125976562, + "learning_rate": 0.0001883394128566493, + "loss": 3.3818, + "step": 1637 + }, + { + "epoch": 0.14827554992305603, + "grad_norm": 0.9228719472885132, + "learning_rate": 0.00018832453722684262, + "loss": 3.3509, + "step": 1638 + }, + { + "epoch": 0.14836607223680637, + "grad_norm": 1.0875881910324097, + "learning_rate": 0.00018830965270275746, + "loss": 3.4402, + "step": 1639 + }, + { + "epoch": 0.1484565945505567, + "grad_norm": 1.1249608993530273, + "learning_rate": 0.00018829475928589271, + "loss": 3.4133, + "step": 1640 + }, + { + "epoch": 0.14854711686430705, + "grad_norm": 1.0222041606903076, + "learning_rate": 0.00018827985697774811, + "loss": 3.3476, + "step": 1641 + }, + { + "epoch": 0.1486376391780574, + "grad_norm": 0.9464477896690369, + "learning_rate": 0.00018826494577982433, + "loss": 3.3605, + "step": 1642 + }, + { + "epoch": 0.14872816149180773, + "grad_norm": 1.1207705736160278, + "learning_rate": 0.0001882500256936229, + "loss": 3.3664, + "step": 1643 + }, + { + "epoch": 0.14881868380555807, + "grad_norm": 0.8890235424041748, + "learning_rate": 0.0001882350967206463, + "loss": 3.4928, + "step": 1644 + }, + { + "epoch": 0.1489092061193084, + "grad_norm": 1.437223196029663, + "learning_rate": 0.00018822015886239788, + "loss": 3.4072, + "step": 1645 + }, + { + "epoch": 0.14899972843305875, + "grad_norm": 1.0502667427062988, + "learning_rate": 0.00018820521212038188, + "loss": 3.3892, + "step": 1646 + }, + { + "epoch": 0.1490902507468091, + "grad_norm": 1.1289931535720825, + "learning_rate": 0.00018819025649610344, + "loss": 3.4068, + "step": 1647 + }, + { + "epoch": 0.14918077306055944, + "grad_norm": 0.9635894298553467, + "learning_rate": 0.0001881752919910686, + "loss": 3.4564, + "step": 1648 + }, + { + "epoch": 0.14927129537430978, + "grad_norm": 1.047222375869751, + "learning_rate": 0.00018816031860678423, + "loss": 3.3389, + "step": 1649 + }, + { + "epoch": 0.14936181768806012, + "grad_norm": 1.1027660369873047, + "learning_rate": 0.00018814533634475822, + "loss": 3.392, + "step": 1650 + }, + { + "epoch": 0.14945234000181046, + "grad_norm": 0.9586750268936157, + "learning_rate": 0.0001881303452064992, + "loss": 3.4108, + "step": 1651 + }, + { + "epoch": 0.1495428623155608, + "grad_norm": 1.0597426891326904, + "learning_rate": 0.0001881153451935169, + "loss": 3.3901, + "step": 1652 + }, + { + "epoch": 0.1496333846293111, + "grad_norm": 1.084460973739624, + "learning_rate": 0.00018810033630732172, + "loss": 3.3989, + "step": 1653 + }, + { + "epoch": 0.14972390694306145, + "grad_norm": 1.0946100950241089, + "learning_rate": 0.0001880853185494251, + "loss": 3.4507, + "step": 1654 + }, + { + "epoch": 0.1498144292568118, + "grad_norm": 0.8299400806427002, + "learning_rate": 0.00018807029192133927, + "loss": 3.3557, + "step": 1655 + }, + { + "epoch": 0.14990495157056213, + "grad_norm": 1.1521278619766235, + "learning_rate": 0.0001880552564245775, + "loss": 3.4572, + "step": 1656 + }, + { + "epoch": 0.14999547388431247, + "grad_norm": 0.9992600679397583, + "learning_rate": 0.00018804021206065378, + "loss": 3.3996, + "step": 1657 + }, + { + "epoch": 0.15008599619806282, + "grad_norm": 1.0150352716445923, + "learning_rate": 0.00018802515883108314, + "loss": 3.3961, + "step": 1658 + }, + { + "epoch": 0.15017651851181316, + "grad_norm": 0.9606907963752747, + "learning_rate": 0.00018801009673738138, + "loss": 3.4012, + "step": 1659 + }, + { + "epoch": 0.1502670408255635, + "grad_norm": 0.9377285242080688, + "learning_rate": 0.00018799502578106534, + "loss": 3.3552, + "step": 1660 + }, + { + "epoch": 0.15035756313931384, + "grad_norm": 0.909257709980011, + "learning_rate": 0.00018797994596365255, + "loss": 3.3984, + "step": 1661 + }, + { + "epoch": 0.15044808545306418, + "grad_norm": 1.0251672267913818, + "learning_rate": 0.00018796485728666165, + "loss": 3.4319, + "step": 1662 + }, + { + "epoch": 0.15053860776681452, + "grad_norm": 0.857364296913147, + "learning_rate": 0.00018794975975161205, + "loss": 3.2863, + "step": 1663 + }, + { + "epoch": 0.15062913008056486, + "grad_norm": 1.043169379234314, + "learning_rate": 0.000187934653360024, + "loss": 3.4156, + "step": 1664 + }, + { + "epoch": 0.1507196523943152, + "grad_norm": 0.9212275743484497, + "learning_rate": 0.00018791953811341877, + "loss": 3.4092, + "step": 1665 + }, + { + "epoch": 0.15081017470806554, + "grad_norm": 1.0773905515670776, + "learning_rate": 0.00018790441401331847, + "loss": 3.4174, + "step": 1666 + }, + { + "epoch": 0.15090069702181588, + "grad_norm": 0.9565873146057129, + "learning_rate": 0.00018788928106124608, + "loss": 3.3829, + "step": 1667 + }, + { + "epoch": 0.15099121933556622, + "grad_norm": 1.0591626167297363, + "learning_rate": 0.0001878741392587255, + "loss": 3.4282, + "step": 1668 + }, + { + "epoch": 0.15108174164931656, + "grad_norm": 0.9780889749526978, + "learning_rate": 0.00018785898860728152, + "loss": 3.4514, + "step": 1669 + }, + { + "epoch": 0.1511722639630669, + "grad_norm": 0.9237357974052429, + "learning_rate": 0.00018784382910843976, + "loss": 3.3146, + "step": 1670 + }, + { + "epoch": 0.15126278627681725, + "grad_norm": 0.9899992942810059, + "learning_rate": 0.00018782866076372682, + "loss": 3.3556, + "step": 1671 + }, + { + "epoch": 0.1513533085905676, + "grad_norm": 1.0460927486419678, + "learning_rate": 0.00018781348357467013, + "loss": 3.4256, + "step": 1672 + }, + { + "epoch": 0.15144383090431793, + "grad_norm": 0.9416075944900513, + "learning_rate": 0.00018779829754279805, + "loss": 3.3589, + "step": 1673 + }, + { + "epoch": 0.15153435321806824, + "grad_norm": 1.208129644393921, + "learning_rate": 0.00018778310266963987, + "loss": 3.3374, + "step": 1674 + }, + { + "epoch": 0.15162487553181858, + "grad_norm": 0.9150928258895874, + "learning_rate": 0.00018776789895672558, + "loss": 3.3576, + "step": 1675 + }, + { + "epoch": 0.15171539784556892, + "grad_norm": 1.0763602256774902, + "learning_rate": 0.0001877526864055863, + "loss": 3.3803, + "step": 1676 + }, + { + "epoch": 0.15180592015931926, + "grad_norm": 0.9067040085792542, + "learning_rate": 0.00018773746501775387, + "loss": 3.404, + "step": 1677 + }, + { + "epoch": 0.1518964424730696, + "grad_norm": 0.9757846593856812, + "learning_rate": 0.00018772223479476114, + "loss": 3.4245, + "step": 1678 + }, + { + "epoch": 0.15198696478681994, + "grad_norm": 0.9587690234184265, + "learning_rate": 0.00018770699573814176, + "loss": 3.3856, + "step": 1679 + }, + { + "epoch": 0.15207748710057029, + "grad_norm": 1.2289484739303589, + "learning_rate": 0.0001876917478494303, + "loss": 3.3734, + "step": 1680 + }, + { + "epoch": 0.15216800941432063, + "grad_norm": 0.899712860584259, + "learning_rate": 0.00018767649113016224, + "loss": 3.4229, + "step": 1681 + }, + { + "epoch": 0.15225853172807097, + "grad_norm": 1.2586640119552612, + "learning_rate": 0.00018766122558187395, + "loss": 3.3438, + "step": 1682 + }, + { + "epoch": 0.1523490540418213, + "grad_norm": 0.8806917667388916, + "learning_rate": 0.00018764595120610258, + "loss": 3.3843, + "step": 1683 + }, + { + "epoch": 0.15243957635557165, + "grad_norm": 1.154797077178955, + "learning_rate": 0.00018763066800438636, + "loss": 3.4531, + "step": 1684 + }, + { + "epoch": 0.152530098669322, + "grad_norm": 0.9956049919128418, + "learning_rate": 0.00018761537597826425, + "loss": 3.4306, + "step": 1685 + }, + { + "epoch": 0.15262062098307233, + "grad_norm": 1.1457597017288208, + "learning_rate": 0.00018760007512927623, + "loss": 3.4051, + "step": 1686 + }, + { + "epoch": 0.15271114329682267, + "grad_norm": 0.9226441383361816, + "learning_rate": 0.000187584765458963, + "loss": 3.4094, + "step": 1687 + }, + { + "epoch": 0.152801665610573, + "grad_norm": 1.1360063552856445, + "learning_rate": 0.0001875694469688663, + "loss": 3.3587, + "step": 1688 + }, + { + "epoch": 0.15289218792432335, + "grad_norm": 0.8890431523323059, + "learning_rate": 0.00018755411966052865, + "loss": 3.3562, + "step": 1689 + }, + { + "epoch": 0.1529827102380737, + "grad_norm": 1.0502040386199951, + "learning_rate": 0.00018753878353549357, + "loss": 3.4775, + "step": 1690 + }, + { + "epoch": 0.15307323255182403, + "grad_norm": 0.9252305030822754, + "learning_rate": 0.00018752343859530538, + "loss": 3.3852, + "step": 1691 + }, + { + "epoch": 0.15316375486557438, + "grad_norm": 1.039414405822754, + "learning_rate": 0.00018750808484150935, + "loss": 3.3441, + "step": 1692 + }, + { + "epoch": 0.15325427717932472, + "grad_norm": 1.0017343759536743, + "learning_rate": 0.00018749272227565152, + "loss": 3.4088, + "step": 1693 + }, + { + "epoch": 0.15334479949307506, + "grad_norm": 0.9906306266784668, + "learning_rate": 0.00018747735089927897, + "loss": 3.3515, + "step": 1694 + }, + { + "epoch": 0.15343532180682537, + "grad_norm": 1.0180130004882812, + "learning_rate": 0.00018746197071393958, + "loss": 3.4191, + "step": 1695 + }, + { + "epoch": 0.1535258441205757, + "grad_norm": 0.9270363450050354, + "learning_rate": 0.00018744658172118215, + "loss": 3.3679, + "step": 1696 + }, + { + "epoch": 0.15361636643432605, + "grad_norm": 1.0553703308105469, + "learning_rate": 0.0001874311839225563, + "loss": 3.4235, + "step": 1697 + }, + { + "epoch": 0.1537068887480764, + "grad_norm": 0.9562894105911255, + "learning_rate": 0.00018741577731961267, + "loss": 3.382, + "step": 1698 + }, + { + "epoch": 0.15379741106182673, + "grad_norm": 1.0290777683258057, + "learning_rate": 0.0001874003619139026, + "loss": 3.2955, + "step": 1699 + }, + { + "epoch": 0.15388793337557707, + "grad_norm": 1.0647125244140625, + "learning_rate": 0.00018738493770697852, + "loss": 3.4676, + "step": 1700 + }, + { + "epoch": 0.15397845568932741, + "grad_norm": 0.975814700126648, + "learning_rate": 0.00018736950470039357, + "loss": 3.3495, + "step": 1701 + }, + { + "epoch": 0.15406897800307776, + "grad_norm": 1.3109288215637207, + "learning_rate": 0.00018735406289570192, + "loss": 3.3745, + "step": 1702 + }, + { + "epoch": 0.1541595003168281, + "grad_norm": 0.8713765740394592, + "learning_rate": 0.0001873386122944585, + "loss": 3.3465, + "step": 1703 + }, + { + "epoch": 0.15425002263057844, + "grad_norm": 0.9419022798538208, + "learning_rate": 0.00018732315289821921, + "loss": 3.4305, + "step": 1704 + }, + { + "epoch": 0.15434054494432878, + "grad_norm": 1.0088133811950684, + "learning_rate": 0.00018730768470854084, + "loss": 3.3117, + "step": 1705 + }, + { + "epoch": 0.15443106725807912, + "grad_norm": 0.9172151684761047, + "learning_rate": 0.00018729220772698097, + "loss": 3.4693, + "step": 1706 + }, + { + "epoch": 0.15452158957182946, + "grad_norm": 1.0178710222244263, + "learning_rate": 0.0001872767219550982, + "loss": 3.3724, + "step": 1707 + }, + { + "epoch": 0.1546121118855798, + "grad_norm": 0.9283190965652466, + "learning_rate": 0.0001872612273944519, + "loss": 3.3333, + "step": 1708 + }, + { + "epoch": 0.15470263419933014, + "grad_norm": 0.9490437507629395, + "learning_rate": 0.0001872457240466024, + "loss": 3.3637, + "step": 1709 + }, + { + "epoch": 0.15479315651308048, + "grad_norm": 0.9215477705001831, + "learning_rate": 0.0001872302119131109, + "loss": 3.3062, + "step": 1710 + }, + { + "epoch": 0.15488367882683082, + "grad_norm": 0.8653420805931091, + "learning_rate": 0.0001872146909955394, + "loss": 3.3447, + "step": 1711 + }, + { + "epoch": 0.15497420114058116, + "grad_norm": 0.9787905216217041, + "learning_rate": 0.00018719916129545093, + "loss": 3.4323, + "step": 1712 + }, + { + "epoch": 0.1550647234543315, + "grad_norm": 0.8503413200378418, + "learning_rate": 0.00018718362281440929, + "loss": 3.3701, + "step": 1713 + }, + { + "epoch": 0.15515524576808185, + "grad_norm": 1.0395936965942383, + "learning_rate": 0.0001871680755539792, + "loss": 3.3697, + "step": 1714 + }, + { + "epoch": 0.15524576808183216, + "grad_norm": 0.9945054650306702, + "learning_rate": 0.00018715251951572634, + "loss": 3.4051, + "step": 1715 + }, + { + "epoch": 0.1553362903955825, + "grad_norm": 0.9538306593894958, + "learning_rate": 0.00018713695470121714, + "loss": 3.3483, + "step": 1716 + }, + { + "epoch": 0.15542681270933284, + "grad_norm": 0.9405638575553894, + "learning_rate": 0.00018712138111201895, + "loss": 3.3741, + "step": 1717 + }, + { + "epoch": 0.15551733502308318, + "grad_norm": 1.037540316581726, + "learning_rate": 0.00018710579874970008, + "loss": 3.2944, + "step": 1718 + }, + { + "epoch": 0.15560785733683352, + "grad_norm": 0.9655166268348694, + "learning_rate": 0.00018709020761582967, + "loss": 3.4324, + "step": 1719 + }, + { + "epoch": 0.15569837965058386, + "grad_norm": 0.9335793256759644, + "learning_rate": 0.00018707460771197774, + "loss": 3.3129, + "step": 1720 + }, + { + "epoch": 0.1557889019643342, + "grad_norm": 0.9587303996086121, + "learning_rate": 0.00018705899903971517, + "loss": 3.3023, + "step": 1721 + }, + { + "epoch": 0.15587942427808454, + "grad_norm": 1.0624523162841797, + "learning_rate": 0.0001870433816006138, + "loss": 3.3677, + "step": 1722 + }, + { + "epoch": 0.15596994659183488, + "grad_norm": 0.9460078477859497, + "learning_rate": 0.0001870277553962463, + "loss": 3.4654, + "step": 1723 + }, + { + "epoch": 0.15606046890558523, + "grad_norm": 0.9337052702903748, + "learning_rate": 0.00018701212042818616, + "loss": 3.3522, + "step": 1724 + }, + { + "epoch": 0.15615099121933557, + "grad_norm": 1.0163861513137817, + "learning_rate": 0.0001869964766980079, + "loss": 3.3387, + "step": 1725 + }, + { + "epoch": 0.1562415135330859, + "grad_norm": 0.8806196451187134, + "learning_rate": 0.00018698082420728684, + "loss": 3.3855, + "step": 1726 + }, + { + "epoch": 0.15633203584683625, + "grad_norm": 0.9415055513381958, + "learning_rate": 0.00018696516295759914, + "loss": 3.3961, + "step": 1727 + }, + { + "epoch": 0.1564225581605866, + "grad_norm": 1.0619957447052002, + "learning_rate": 0.0001869494929505219, + "loss": 3.4081, + "step": 1728 + }, + { + "epoch": 0.15651308047433693, + "grad_norm": 1.1068556308746338, + "learning_rate": 0.0001869338141876331, + "loss": 3.3572, + "step": 1729 + }, + { + "epoch": 0.15660360278808727, + "grad_norm": 0.9381948709487915, + "learning_rate": 0.00018691812667051162, + "loss": 3.3883, + "step": 1730 + }, + { + "epoch": 0.1566941251018376, + "grad_norm": 1.0782184600830078, + "learning_rate": 0.00018690243040073713, + "loss": 3.3126, + "step": 1731 + }, + { + "epoch": 0.15678464741558795, + "grad_norm": 0.9588199853897095, + "learning_rate": 0.0001868867253798903, + "loss": 3.3538, + "step": 1732 + }, + { + "epoch": 0.1568751697293383, + "grad_norm": 1.112703561782837, + "learning_rate": 0.00018687101160955261, + "loss": 3.4172, + "step": 1733 + }, + { + "epoch": 0.15696569204308863, + "grad_norm": 1.008054256439209, + "learning_rate": 0.0001868552890913064, + "loss": 3.3506, + "step": 1734 + }, + { + "epoch": 0.15705621435683897, + "grad_norm": 1.138614535331726, + "learning_rate": 0.00018683955782673498, + "loss": 3.3616, + "step": 1735 + }, + { + "epoch": 0.1571467366705893, + "grad_norm": 0.9115349054336548, + "learning_rate": 0.00018682381781742245, + "loss": 3.3093, + "step": 1736 + }, + { + "epoch": 0.15723725898433963, + "grad_norm": 1.0118136405944824, + "learning_rate": 0.00018680806906495382, + "loss": 3.3777, + "step": 1737 + }, + { + "epoch": 0.15732778129808997, + "grad_norm": 0.9229224920272827, + "learning_rate": 0.00018679231157091506, + "loss": 3.4082, + "step": 1738 + }, + { + "epoch": 0.1574183036118403, + "grad_norm": 0.9689410924911499, + "learning_rate": 0.00018677654533689287, + "loss": 3.2666, + "step": 1739 + }, + { + "epoch": 0.15750882592559065, + "grad_norm": 1.0013811588287354, + "learning_rate": 0.00018676077036447494, + "loss": 3.4077, + "step": 1740 + }, + { + "epoch": 0.157599348239341, + "grad_norm": 0.9101596474647522, + "learning_rate": 0.0001867449866552498, + "loss": 3.3659, + "step": 1741 + }, + { + "epoch": 0.15768987055309133, + "grad_norm": 0.9114277958869934, + "learning_rate": 0.00018672919421080687, + "loss": 3.3438, + "step": 1742 + }, + { + "epoch": 0.15778039286684167, + "grad_norm": 1.0535948276519775, + "learning_rate": 0.00018671339303273648, + "loss": 3.4323, + "step": 1743 + }, + { + "epoch": 0.15787091518059201, + "grad_norm": 0.9524382948875427, + "learning_rate": 0.00018669758312262976, + "loss": 3.4158, + "step": 1744 + }, + { + "epoch": 0.15796143749434235, + "grad_norm": 1.006758213043213, + "learning_rate": 0.0001866817644820788, + "loss": 3.3891, + "step": 1745 + }, + { + "epoch": 0.1580519598080927, + "grad_norm": 0.8595851063728333, + "learning_rate": 0.00018666593711267658, + "loss": 3.2898, + "step": 1746 + }, + { + "epoch": 0.15814248212184304, + "grad_norm": 0.932120680809021, + "learning_rate": 0.0001866501010160168, + "loss": 3.2822, + "step": 1747 + }, + { + "epoch": 0.15823300443559338, + "grad_norm": 0.9294149279594421, + "learning_rate": 0.00018663425619369425, + "loss": 3.4403, + "step": 1748 + }, + { + "epoch": 0.15832352674934372, + "grad_norm": 1.0117833614349365, + "learning_rate": 0.00018661840264730444, + "loss": 3.3145, + "step": 1749 + }, + { + "epoch": 0.15841404906309406, + "grad_norm": 0.9287446141242981, + "learning_rate": 0.00018660254037844388, + "loss": 3.3256, + "step": 1750 + }, + { + "epoch": 0.1585045713768444, + "grad_norm": 1.0927577018737793, + "learning_rate": 0.00018658666938870986, + "loss": 3.3843, + "step": 1751 + }, + { + "epoch": 0.15859509369059474, + "grad_norm": 1.0403542518615723, + "learning_rate": 0.00018657078967970062, + "loss": 3.3488, + "step": 1752 + }, + { + "epoch": 0.15868561600434508, + "grad_norm": 0.9642882347106934, + "learning_rate": 0.00018655490125301521, + "loss": 3.4212, + "step": 1753 + }, + { + "epoch": 0.15877613831809542, + "grad_norm": 0.9047889709472656, + "learning_rate": 0.0001865390041102536, + "loss": 3.3679, + "step": 1754 + }, + { + "epoch": 0.15886666063184576, + "grad_norm": 0.9774285554885864, + "learning_rate": 0.0001865230982530167, + "loss": 3.2796, + "step": 1755 + }, + { + "epoch": 0.15895718294559608, + "grad_norm": 0.944681704044342, + "learning_rate": 0.0001865071836829061, + "loss": 3.3673, + "step": 1756 + }, + { + "epoch": 0.15904770525934642, + "grad_norm": 0.9720564484596252, + "learning_rate": 0.00018649126040152454, + "loss": 3.3176, + "step": 1757 + }, + { + "epoch": 0.15913822757309676, + "grad_norm": 1.0847887992858887, + "learning_rate": 0.00018647532841047537, + "loss": 3.3372, + "step": 1758 + }, + { + "epoch": 0.1592287498868471, + "grad_norm": 1.0074677467346191, + "learning_rate": 0.00018645938771136303, + "loss": 3.4203, + "step": 1759 + }, + { + "epoch": 0.15931927220059744, + "grad_norm": 1.074242115020752, + "learning_rate": 0.0001864434383057927, + "loss": 3.3453, + "step": 1760 + }, + { + "epoch": 0.15940979451434778, + "grad_norm": 0.9605053067207336, + "learning_rate": 0.0001864274801953705, + "loss": 3.3927, + "step": 1761 + }, + { + "epoch": 0.15950031682809812, + "grad_norm": 1.0063918828964233, + "learning_rate": 0.0001864115133817034, + "loss": 3.392, + "step": 1762 + }, + { + "epoch": 0.15959083914184846, + "grad_norm": 1.0003013610839844, + "learning_rate": 0.00018639553786639924, + "loss": 3.3505, + "step": 1763 + }, + { + "epoch": 0.1596813614555988, + "grad_norm": 1.046702265739441, + "learning_rate": 0.00018637955365106683, + "loss": 3.3625, + "step": 1764 + }, + { + "epoch": 0.15977188376934914, + "grad_norm": 0.8922204375267029, + "learning_rate": 0.0001863635607373157, + "loss": 3.3603, + "step": 1765 + }, + { + "epoch": 0.15986240608309948, + "grad_norm": 0.9413361549377441, + "learning_rate": 0.00018634755912675638, + "loss": 3.3674, + "step": 1766 + }, + { + "epoch": 0.15995292839684982, + "grad_norm": 0.9285749793052673, + "learning_rate": 0.00018633154882100023, + "loss": 3.4742, + "step": 1767 + }, + { + "epoch": 0.16004345071060017, + "grad_norm": 0.9485217332839966, + "learning_rate": 0.00018631552982165944, + "loss": 3.2931, + "step": 1768 + }, + { + "epoch": 0.1601339730243505, + "grad_norm": 0.9633521437644958, + "learning_rate": 0.00018629950213034722, + "loss": 3.4031, + "step": 1769 + }, + { + "epoch": 0.16022449533810085, + "grad_norm": 0.984691321849823, + "learning_rate": 0.00018628346574867745, + "loss": 3.3443, + "step": 1770 + }, + { + "epoch": 0.1603150176518512, + "grad_norm": 1.0759167671203613, + "learning_rate": 0.00018626742067826506, + "loss": 3.407, + "step": 1771 + }, + { + "epoch": 0.16040553996560153, + "grad_norm": 1.0845353603363037, + "learning_rate": 0.00018625136692072575, + "loss": 3.4139, + "step": 1772 + }, + { + "epoch": 0.16049606227935187, + "grad_norm": 1.1085989475250244, + "learning_rate": 0.00018623530447767615, + "loss": 3.4532, + "step": 1773 + }, + { + "epoch": 0.1605865845931022, + "grad_norm": 0.9502083659172058, + "learning_rate": 0.00018621923335073376, + "loss": 3.3118, + "step": 1774 + }, + { + "epoch": 0.16067710690685255, + "grad_norm": 1.1858634948730469, + "learning_rate": 0.00018620315354151695, + "loss": 3.3605, + "step": 1775 + }, + { + "epoch": 0.1607676292206029, + "grad_norm": 1.0164425373077393, + "learning_rate": 0.0001861870650516449, + "loss": 3.3414, + "step": 1776 + }, + { + "epoch": 0.1608581515343532, + "grad_norm": 1.6314200162887573, + "learning_rate": 0.00018617096788273778, + "loss": 3.3694, + "step": 1777 + }, + { + "epoch": 0.16094867384810355, + "grad_norm": 1.19080650806427, + "learning_rate": 0.00018615486203641654, + "loss": 3.3305, + "step": 1778 + }, + { + "epoch": 0.1610391961618539, + "grad_norm": 1.3236929178237915, + "learning_rate": 0.00018613874751430306, + "loss": 3.4197, + "step": 1779 + }, + { + "epoch": 0.16112971847560423, + "grad_norm": 1.0939617156982422, + "learning_rate": 0.00018612262431802007, + "loss": 3.3496, + "step": 1780 + }, + { + "epoch": 0.16122024078935457, + "grad_norm": 0.9928971529006958, + "learning_rate": 0.00018610649244919114, + "loss": 3.3846, + "step": 1781 + }, + { + "epoch": 0.1613107631031049, + "grad_norm": 1.2084952592849731, + "learning_rate": 0.0001860903519094408, + "loss": 3.3893, + "step": 1782 + }, + { + "epoch": 0.16140128541685525, + "grad_norm": 1.0836037397384644, + "learning_rate": 0.0001860742027003944, + "loss": 3.3599, + "step": 1783 + }, + { + "epoch": 0.1614918077306056, + "grad_norm": 1.016389012336731, + "learning_rate": 0.00018605804482367807, + "loss": 3.4159, + "step": 1784 + }, + { + "epoch": 0.16158233004435593, + "grad_norm": 0.9981551766395569, + "learning_rate": 0.00018604187828091906, + "loss": 3.391, + "step": 1785 + }, + { + "epoch": 0.16167285235810627, + "grad_norm": 0.9092329740524292, + "learning_rate": 0.0001860257030737452, + "loss": 3.3708, + "step": 1786 + }, + { + "epoch": 0.1617633746718566, + "grad_norm": 0.9498738646507263, + "learning_rate": 0.00018600951920378543, + "loss": 3.325, + "step": 1787 + }, + { + "epoch": 0.16185389698560695, + "grad_norm": 1.0748759508132935, + "learning_rate": 0.00018599332667266943, + "loss": 3.3166, + "step": 1788 + }, + { + "epoch": 0.1619444192993573, + "grad_norm": 1.01380455493927, + "learning_rate": 0.00018597712548202778, + "loss": 3.4486, + "step": 1789 + }, + { + "epoch": 0.16203494161310764, + "grad_norm": 1.2337299585342407, + "learning_rate": 0.00018596091563349192, + "loss": 3.3372, + "step": 1790 + }, + { + "epoch": 0.16212546392685798, + "grad_norm": 0.9750309586524963, + "learning_rate": 0.00018594469712869424, + "loss": 3.4084, + "step": 1791 + }, + { + "epoch": 0.16221598624060832, + "grad_norm": 1.2485698461532593, + "learning_rate": 0.00018592846996926793, + "loss": 3.3383, + "step": 1792 + }, + { + "epoch": 0.16230650855435866, + "grad_norm": 1.1208914518356323, + "learning_rate": 0.000185912234156847, + "loss": 3.388, + "step": 1793 + }, + { + "epoch": 0.162397030868109, + "grad_norm": 0.8985668420791626, + "learning_rate": 0.00018589598969306645, + "loss": 3.3437, + "step": 1794 + }, + { + "epoch": 0.16248755318185934, + "grad_norm": 1.1364415884017944, + "learning_rate": 0.00018587973657956211, + "loss": 3.3177, + "step": 1795 + }, + { + "epoch": 0.16257807549560968, + "grad_norm": 1.0473604202270508, + "learning_rate": 0.00018586347481797062, + "loss": 3.4173, + "step": 1796 + }, + { + "epoch": 0.16266859780936, + "grad_norm": 0.9839944839477539, + "learning_rate": 0.00018584720440992957, + "loss": 3.3088, + "step": 1797 + }, + { + "epoch": 0.16275912012311033, + "grad_norm": 1.1791011095046997, + "learning_rate": 0.0001858309253570774, + "loss": 3.3331, + "step": 1798 + }, + { + "epoch": 0.16284964243686068, + "grad_norm": 1.0361721515655518, + "learning_rate": 0.0001858146376610534, + "loss": 3.3211, + "step": 1799 + }, + { + "epoch": 0.16294016475061102, + "grad_norm": 0.9734877943992615, + "learning_rate": 0.00018579834132349772, + "loss": 3.3124, + "step": 1800 + }, + { + "epoch": 0.16303068706436136, + "grad_norm": 1.0755244493484497, + "learning_rate": 0.0001857820363460514, + "loss": 3.439, + "step": 1801 + }, + { + "epoch": 0.1631212093781117, + "grad_norm": 0.9513584971427917, + "learning_rate": 0.0001857657227303564, + "loss": 3.3119, + "step": 1802 + }, + { + "epoch": 0.16321173169186204, + "grad_norm": 0.9032912850379944, + "learning_rate": 0.00018574940047805548, + "loss": 3.3228, + "step": 1803 + }, + { + "epoch": 0.16330225400561238, + "grad_norm": 1.1157050132751465, + "learning_rate": 0.0001857330695907922, + "loss": 3.2961, + "step": 1804 + }, + { + "epoch": 0.16339277631936272, + "grad_norm": 1.0433858633041382, + "learning_rate": 0.00018571673007021123, + "loss": 3.3587, + "step": 1805 + }, + { + "epoch": 0.16348329863311306, + "grad_norm": 0.9127818942070007, + "learning_rate": 0.00018570038191795786, + "loss": 3.425, + "step": 1806 + }, + { + "epoch": 0.1635738209468634, + "grad_norm": 0.9649263024330139, + "learning_rate": 0.0001856840251356784, + "loss": 3.3319, + "step": 1807 + }, + { + "epoch": 0.16366434326061374, + "grad_norm": 0.983040988445282, + "learning_rate": 0.00018566765972501993, + "loss": 3.3198, + "step": 1808 + }, + { + "epoch": 0.16375486557436408, + "grad_norm": 0.9087714552879333, + "learning_rate": 0.00018565128568763048, + "loss": 3.2742, + "step": 1809 + }, + { + "epoch": 0.16384538788811442, + "grad_norm": 0.9332515597343445, + "learning_rate": 0.0001856349030251589, + "loss": 3.3727, + "step": 1810 + }, + { + "epoch": 0.16393591020186477, + "grad_norm": 0.8810479640960693, + "learning_rate": 0.00018561851173925495, + "loss": 3.2996, + "step": 1811 + }, + { + "epoch": 0.1640264325156151, + "grad_norm": 0.9994482398033142, + "learning_rate": 0.00018560211183156917, + "loss": 3.325, + "step": 1812 + }, + { + "epoch": 0.16411695482936545, + "grad_norm": 0.9770562052726746, + "learning_rate": 0.0001855857033037531, + "loss": 3.335, + "step": 1813 + }, + { + "epoch": 0.1642074771431158, + "grad_norm": 1.2734119892120361, + "learning_rate": 0.00018556928615745904, + "loss": 3.3162, + "step": 1814 + }, + { + "epoch": 0.16429799945686613, + "grad_norm": 0.9887378215789795, + "learning_rate": 0.0001855528603943402, + "loss": 3.2748, + "step": 1815 + }, + { + "epoch": 0.16438852177061647, + "grad_norm": 1.0900452136993408, + "learning_rate": 0.00018553642601605068, + "loss": 3.32, + "step": 1816 + }, + { + "epoch": 0.1644790440843668, + "grad_norm": 0.8632405996322632, + "learning_rate": 0.00018551998302424538, + "loss": 3.3532, + "step": 1817 + }, + { + "epoch": 0.16456956639811712, + "grad_norm": 0.945196807384491, + "learning_rate": 0.00018550353142058015, + "loss": 3.3202, + "step": 1818 + }, + { + "epoch": 0.16466008871186746, + "grad_norm": 0.9298883080482483, + "learning_rate": 0.0001854870712067116, + "loss": 3.3807, + "step": 1819 + }, + { + "epoch": 0.1647506110256178, + "grad_norm": 0.828575849533081, + "learning_rate": 0.00018547060238429736, + "loss": 3.2874, + "step": 1820 + }, + { + "epoch": 0.16484113333936815, + "grad_norm": 1.0545551776885986, + "learning_rate": 0.00018545412495499578, + "loss": 3.3441, + "step": 1821 + }, + { + "epoch": 0.1649316556531185, + "grad_norm": 0.9633010625839233, + "learning_rate": 0.00018543763892046617, + "loss": 3.3648, + "step": 1822 + }, + { + "epoch": 0.16502217796686883, + "grad_norm": 0.9645661115646362, + "learning_rate": 0.00018542114428236864, + "loss": 3.3397, + "step": 1823 + }, + { + "epoch": 0.16511270028061917, + "grad_norm": 0.9543652534484863, + "learning_rate": 0.00018540464104236425, + "loss": 3.289, + "step": 1824 + }, + { + "epoch": 0.1652032225943695, + "grad_norm": 0.8952108025550842, + "learning_rate": 0.0001853881292021148, + "loss": 3.3568, + "step": 1825 + }, + { + "epoch": 0.16529374490811985, + "grad_norm": 0.911792516708374, + "learning_rate": 0.00018537160876328312, + "loss": 3.3482, + "step": 1826 + }, + { + "epoch": 0.1653842672218702, + "grad_norm": 0.9409090876579285, + "learning_rate": 0.00018535507972753274, + "loss": 3.3525, + "step": 1827 + }, + { + "epoch": 0.16547478953562053, + "grad_norm": 0.9712650179862976, + "learning_rate": 0.00018533854209652818, + "loss": 3.3596, + "step": 1828 + }, + { + "epoch": 0.16556531184937087, + "grad_norm": 0.9322774410247803, + "learning_rate": 0.00018532199587193479, + "loss": 3.358, + "step": 1829 + }, + { + "epoch": 0.1656558341631212, + "grad_norm": 1.0519828796386719, + "learning_rate": 0.00018530544105541872, + "loss": 3.2452, + "step": 1830 + }, + { + "epoch": 0.16574635647687155, + "grad_norm": 0.9365170001983643, + "learning_rate": 0.0001852888776486471, + "loss": 3.3644, + "step": 1831 + }, + { + "epoch": 0.1658368787906219, + "grad_norm": 1.0104777812957764, + "learning_rate": 0.00018527230565328778, + "loss": 3.2377, + "step": 1832 + }, + { + "epoch": 0.16592740110437224, + "grad_norm": 0.9820732474327087, + "learning_rate": 0.00018525572507100964, + "loss": 3.3633, + "step": 1833 + }, + { + "epoch": 0.16601792341812258, + "grad_norm": 1.0812543630599976, + "learning_rate": 0.0001852391359034823, + "loss": 3.3757, + "step": 1834 + }, + { + "epoch": 0.16610844573187292, + "grad_norm": 0.9869741201400757, + "learning_rate": 0.00018522253815237636, + "loss": 3.2189, + "step": 1835 + }, + { + "epoch": 0.16619896804562326, + "grad_norm": 1.1047897338867188, + "learning_rate": 0.00018520593181936312, + "loss": 3.4102, + "step": 1836 + }, + { + "epoch": 0.1662894903593736, + "grad_norm": 1.1179860830307007, + "learning_rate": 0.0001851893169061149, + "loss": 3.2613, + "step": 1837 + }, + { + "epoch": 0.1663800126731239, + "grad_norm": 0.9361429214477539, + "learning_rate": 0.00018517269341430476, + "loss": 3.3489, + "step": 1838 + }, + { + "epoch": 0.16647053498687425, + "grad_norm": 1.0511845350265503, + "learning_rate": 0.00018515606134560675, + "loss": 3.294, + "step": 1839 + }, + { + "epoch": 0.1665610573006246, + "grad_norm": 0.9614317417144775, + "learning_rate": 0.0001851394207016957, + "loss": 3.3574, + "step": 1840 + }, + { + "epoch": 0.16665157961437493, + "grad_norm": 0.8911113142967224, + "learning_rate": 0.0001851227714842473, + "loss": 3.3029, + "step": 1841 + }, + { + "epoch": 0.16674210192812527, + "grad_norm": 1.0619572401046753, + "learning_rate": 0.00018510611369493816, + "loss": 3.3396, + "step": 1842 + }, + { + "epoch": 0.16683262424187562, + "grad_norm": 1.2229690551757812, + "learning_rate": 0.0001850894473354457, + "loss": 3.381, + "step": 1843 + }, + { + "epoch": 0.16692314655562596, + "grad_norm": 1.0011261701583862, + "learning_rate": 0.0001850727724074482, + "loss": 3.3347, + "step": 1844 + }, + { + "epoch": 0.1670136688693763, + "grad_norm": 1.1166538000106812, + "learning_rate": 0.00018505608891262486, + "loss": 3.3346, + "step": 1845 + }, + { + "epoch": 0.16710419118312664, + "grad_norm": 0.881597638130188, + "learning_rate": 0.00018503939685265568, + "loss": 3.2962, + "step": 1846 + }, + { + "epoch": 0.16719471349687698, + "grad_norm": 0.9117144346237183, + "learning_rate": 0.00018502269622922154, + "loss": 3.3041, + "step": 1847 + }, + { + "epoch": 0.16728523581062732, + "grad_norm": 0.9797277450561523, + "learning_rate": 0.00018500598704400428, + "loss": 3.3918, + "step": 1848 + }, + { + "epoch": 0.16737575812437766, + "grad_norm": 0.9083805680274963, + "learning_rate": 0.00018498926929868642, + "loss": 3.3226, + "step": 1849 + }, + { + "epoch": 0.167466280438128, + "grad_norm": 1.0031794309616089, + "learning_rate": 0.00018497254299495146, + "loss": 3.3247, + "step": 1850 + }, + { + "epoch": 0.16755680275187834, + "grad_norm": 0.9302388429641724, + "learning_rate": 0.00018495580813448372, + "loss": 3.3599, + "step": 1851 + }, + { + "epoch": 0.16764732506562868, + "grad_norm": 0.9099136590957642, + "learning_rate": 0.00018493906471896848, + "loss": 3.3358, + "step": 1852 + }, + { + "epoch": 0.16773784737937902, + "grad_norm": 0.8923931121826172, + "learning_rate": 0.00018492231275009174, + "loss": 3.329, + "step": 1853 + }, + { + "epoch": 0.16782836969312936, + "grad_norm": 1.0236018896102905, + "learning_rate": 0.00018490555222954038, + "loss": 3.3552, + "step": 1854 + }, + { + "epoch": 0.1679188920068797, + "grad_norm": 0.9347558617591858, + "learning_rate": 0.00018488878315900227, + "loss": 3.3169, + "step": 1855 + }, + { + "epoch": 0.16800941432063005, + "grad_norm": 1.0411633253097534, + "learning_rate": 0.00018487200554016602, + "loss": 3.331, + "step": 1856 + }, + { + "epoch": 0.1680999366343804, + "grad_norm": 0.9088875651359558, + "learning_rate": 0.00018485521937472113, + "loss": 3.3023, + "step": 1857 + }, + { + "epoch": 0.16819045894813073, + "grad_norm": 0.9790009260177612, + "learning_rate": 0.00018483842466435798, + "loss": 3.3666, + "step": 1858 + }, + { + "epoch": 0.16828098126188104, + "grad_norm": 1.0297186374664307, + "learning_rate": 0.00018482162141076778, + "loss": 3.3311, + "step": 1859 + }, + { + "epoch": 0.16837150357563138, + "grad_norm": 0.9576818346977234, + "learning_rate": 0.0001848048096156426, + "loss": 3.2539, + "step": 1860 + }, + { + "epoch": 0.16846202588938172, + "grad_norm": 0.9573496580123901, + "learning_rate": 0.00018478798928067542, + "loss": 3.3519, + "step": 1861 + }, + { + "epoch": 0.16855254820313206, + "grad_norm": 0.907659649848938, + "learning_rate": 0.00018477116040756006, + "loss": 3.3051, + "step": 1862 + }, + { + "epoch": 0.1686430705168824, + "grad_norm": 0.9441897869110107, + "learning_rate": 0.00018475432299799117, + "loss": 3.2564, + "step": 1863 + }, + { + "epoch": 0.16873359283063274, + "grad_norm": 1.1152738332748413, + "learning_rate": 0.00018473747705366426, + "loss": 3.2523, + "step": 1864 + }, + { + "epoch": 0.16882411514438309, + "grad_norm": 1.0354543924331665, + "learning_rate": 0.00018472062257627575, + "loss": 3.3501, + "step": 1865 + }, + { + "epoch": 0.16891463745813343, + "grad_norm": 0.9793465733528137, + "learning_rate": 0.00018470375956752285, + "loss": 3.2576, + "step": 1866 + }, + { + "epoch": 0.16900515977188377, + "grad_norm": 0.9896705746650696, + "learning_rate": 0.0001846868880291037, + "loss": 3.3021, + "step": 1867 + }, + { + "epoch": 0.1690956820856341, + "grad_norm": 0.9611893892288208, + "learning_rate": 0.0001846700079627172, + "loss": 3.2927, + "step": 1868 + }, + { + "epoch": 0.16918620439938445, + "grad_norm": 0.9868037104606628, + "learning_rate": 0.00018465311937006321, + "loss": 3.2828, + "step": 1869 + }, + { + "epoch": 0.1692767267131348, + "grad_norm": 0.9744787812232971, + "learning_rate": 0.00018463622225284242, + "loss": 3.3267, + "step": 1870 + }, + { + "epoch": 0.16936724902688513, + "grad_norm": 1.002983570098877, + "learning_rate": 0.00018461931661275643, + "loss": 3.2383, + "step": 1871 + }, + { + "epoch": 0.16945777134063547, + "grad_norm": 0.876952052116394, + "learning_rate": 0.0001846024024515075, + "loss": 3.2725, + "step": 1872 + }, + { + "epoch": 0.1695482936543858, + "grad_norm": 1.1321675777435303, + "learning_rate": 0.000184585479770799, + "loss": 3.3552, + "step": 1873 + }, + { + "epoch": 0.16963881596813615, + "grad_norm": 0.8953356742858887, + "learning_rate": 0.00018456854857233496, + "loss": 3.3455, + "step": 1874 + }, + { + "epoch": 0.1697293382818865, + "grad_norm": 1.0338746309280396, + "learning_rate": 0.00018455160885782045, + "loss": 3.3038, + "step": 1875 + }, + { + "epoch": 0.16981986059563683, + "grad_norm": 0.9554142951965332, + "learning_rate": 0.0001845346606289612, + "loss": 3.3413, + "step": 1876 + }, + { + "epoch": 0.16991038290938718, + "grad_norm": 0.9484089016914368, + "learning_rate": 0.00018451770388746398, + "loss": 3.3453, + "step": 1877 + }, + { + "epoch": 0.17000090522313752, + "grad_norm": 0.9068911075592041, + "learning_rate": 0.00018450073863503623, + "loss": 3.329, + "step": 1878 + }, + { + "epoch": 0.17009142753688783, + "grad_norm": 0.9792476296424866, + "learning_rate": 0.00018448376487338646, + "loss": 3.3212, + "step": 1879 + }, + { + "epoch": 0.17018194985063817, + "grad_norm": 0.9379374384880066, + "learning_rate": 0.00018446678260422385, + "loss": 3.3014, + "step": 1880 + }, + { + "epoch": 0.1702724721643885, + "grad_norm": 0.9665486216545105, + "learning_rate": 0.00018444979182925854, + "loss": 3.337, + "step": 1881 + }, + { + "epoch": 0.17036299447813885, + "grad_norm": 1.1156326532363892, + "learning_rate": 0.00018443279255020152, + "loss": 3.3711, + "step": 1882 + }, + { + "epoch": 0.1704535167918892, + "grad_norm": 1.0622475147247314, + "learning_rate": 0.0001844157847687646, + "loss": 3.3345, + "step": 1883 + }, + { + "epoch": 0.17054403910563953, + "grad_norm": 1.058219313621521, + "learning_rate": 0.00018439876848666046, + "loss": 3.3102, + "step": 1884 + }, + { + "epoch": 0.17063456141938987, + "grad_norm": 1.2012033462524414, + "learning_rate": 0.00018438174370560264, + "loss": 3.3606, + "step": 1885 + }, + { + "epoch": 0.17072508373314021, + "grad_norm": 1.0893925428390503, + "learning_rate": 0.00018436471042730554, + "loss": 3.3498, + "step": 1886 + }, + { + "epoch": 0.17081560604689056, + "grad_norm": 1.0066337585449219, + "learning_rate": 0.0001843476686534844, + "loss": 3.3289, + "step": 1887 + }, + { + "epoch": 0.1709061283606409, + "grad_norm": 1.0413285493850708, + "learning_rate": 0.00018433061838585534, + "loss": 3.3485, + "step": 1888 + }, + { + "epoch": 0.17099665067439124, + "grad_norm": 1.0871161222457886, + "learning_rate": 0.00018431355962613536, + "loss": 3.3714, + "step": 1889 + }, + { + "epoch": 0.17108717298814158, + "grad_norm": 0.9766511917114258, + "learning_rate": 0.00018429649237604217, + "loss": 3.309, + "step": 1890 + }, + { + "epoch": 0.17117769530189192, + "grad_norm": 1.0211293697357178, + "learning_rate": 0.00018427941663729455, + "loss": 3.2944, + "step": 1891 + }, + { + "epoch": 0.17126821761564226, + "grad_norm": 1.0084211826324463, + "learning_rate": 0.00018426233241161191, + "loss": 3.3315, + "step": 1892 + }, + { + "epoch": 0.1713587399293926, + "grad_norm": 0.87370765209198, + "learning_rate": 0.00018424523970071477, + "loss": 3.3272, + "step": 1893 + }, + { + "epoch": 0.17144926224314294, + "grad_norm": 0.9169411063194275, + "learning_rate": 0.0001842281385063243, + "loss": 3.295, + "step": 1894 + }, + { + "epoch": 0.17153978455689328, + "grad_norm": 1.0767097473144531, + "learning_rate": 0.00018421102883016253, + "loss": 3.3453, + "step": 1895 + }, + { + "epoch": 0.17163030687064362, + "grad_norm": 1.1827503442764282, + "learning_rate": 0.00018419391067395248, + "loss": 3.2742, + "step": 1896 + }, + { + "epoch": 0.17172082918439396, + "grad_norm": 0.9097943305969238, + "learning_rate": 0.00018417678403941795, + "loss": 3.2716, + "step": 1897 + }, + { + "epoch": 0.1718113514981443, + "grad_norm": 0.9012870788574219, + "learning_rate": 0.00018415964892828357, + "loss": 3.3063, + "step": 1898 + }, + { + "epoch": 0.17190187381189465, + "grad_norm": 1.1969982385635376, + "learning_rate": 0.00018414250534227485, + "loss": 3.2844, + "step": 1899 + }, + { + "epoch": 0.17199239612564496, + "grad_norm": 0.9854839444160461, + "learning_rate": 0.00018412535328311814, + "loss": 3.3285, + "step": 1900 + }, + { + "epoch": 0.1720829184393953, + "grad_norm": 1.0452015399932861, + "learning_rate": 0.00018410819275254062, + "loss": 3.2647, + "step": 1901 + }, + { + "epoch": 0.17217344075314564, + "grad_norm": 0.9425656199455261, + "learning_rate": 0.00018409102375227044, + "loss": 3.358, + "step": 1902 + }, + { + "epoch": 0.17226396306689598, + "grad_norm": 0.9737415313720703, + "learning_rate": 0.00018407384628403643, + "loss": 3.3233, + "step": 1903 + }, + { + "epoch": 0.17235448538064632, + "grad_norm": 1.0962470769882202, + "learning_rate": 0.00018405666034956844, + "loss": 3.3832, + "step": 1904 + }, + { + "epoch": 0.17244500769439666, + "grad_norm": 1.1687324047088623, + "learning_rate": 0.00018403946595059703, + "loss": 3.3107, + "step": 1905 + }, + { + "epoch": 0.172535530008147, + "grad_norm": 0.9966987371444702, + "learning_rate": 0.00018402226308885368, + "loss": 3.3198, + "step": 1906 + }, + { + "epoch": 0.17262605232189734, + "grad_norm": 0.9648373126983643, + "learning_rate": 0.00018400505176607078, + "loss": 3.2521, + "step": 1907 + }, + { + "epoch": 0.17271657463564768, + "grad_norm": 1.113847255706787, + "learning_rate": 0.00018398783198398145, + "loss": 3.363, + "step": 1908 + }, + { + "epoch": 0.17280709694939803, + "grad_norm": 1.0072240829467773, + "learning_rate": 0.00018397060374431972, + "loss": 3.3407, + "step": 1909 + }, + { + "epoch": 0.17289761926314837, + "grad_norm": 0.9911056160926819, + "learning_rate": 0.0001839533670488205, + "loss": 3.2513, + "step": 1910 + }, + { + "epoch": 0.1729881415768987, + "grad_norm": 1.049584984779358, + "learning_rate": 0.00018393612189921953, + "loss": 3.3049, + "step": 1911 + }, + { + "epoch": 0.17307866389064905, + "grad_norm": 0.9908673763275146, + "learning_rate": 0.00018391886829725334, + "loss": 3.3312, + "step": 1912 + }, + { + "epoch": 0.1731691862043994, + "grad_norm": 1.1561472415924072, + "learning_rate": 0.00018390160624465944, + "loss": 3.3125, + "step": 1913 + }, + { + "epoch": 0.17325970851814973, + "grad_norm": 1.0068906545639038, + "learning_rate": 0.0001838843357431761, + "loss": 3.3257, + "step": 1914 + }, + { + "epoch": 0.17335023083190007, + "grad_norm": 0.9671674370765686, + "learning_rate": 0.00018386705679454242, + "loss": 3.3392, + "step": 1915 + }, + { + "epoch": 0.1734407531456504, + "grad_norm": 0.8658766150474548, + "learning_rate": 0.0001838497694004984, + "loss": 3.2903, + "step": 1916 + }, + { + "epoch": 0.17353127545940075, + "grad_norm": 1.09023118019104, + "learning_rate": 0.00018383247356278494, + "loss": 3.3672, + "step": 1917 + }, + { + "epoch": 0.1736217977731511, + "grad_norm": 0.9814264178276062, + "learning_rate": 0.00018381516928314367, + "loss": 3.3352, + "step": 1918 + }, + { + "epoch": 0.17371232008690143, + "grad_norm": 1.116402268409729, + "learning_rate": 0.00018379785656331713, + "loss": 3.3452, + "step": 1919 + }, + { + "epoch": 0.17380284240065175, + "grad_norm": 1.029109001159668, + "learning_rate": 0.00018378053540504873, + "loss": 3.3507, + "step": 1920 + }, + { + "epoch": 0.1738933647144021, + "grad_norm": 0.9994332194328308, + "learning_rate": 0.00018376320581008272, + "loss": 3.3412, + "step": 1921 + }, + { + "epoch": 0.17398388702815243, + "grad_norm": 0.9789600372314453, + "learning_rate": 0.00018374586778016418, + "loss": 3.2994, + "step": 1922 + }, + { + "epoch": 0.17407440934190277, + "grad_norm": 1.317229151725769, + "learning_rate": 0.00018372852131703904, + "loss": 3.3474, + "step": 1923 + }, + { + "epoch": 0.1741649316556531, + "grad_norm": 1.0556979179382324, + "learning_rate": 0.00018371116642245408, + "loss": 3.3084, + "step": 1924 + }, + { + "epoch": 0.17425545396940345, + "grad_norm": 1.17436683177948, + "learning_rate": 0.00018369380309815698, + "loss": 3.2949, + "step": 1925 + }, + { + "epoch": 0.1743459762831538, + "grad_norm": 1.2278040647506714, + "learning_rate": 0.00018367643134589617, + "loss": 3.2857, + "step": 1926 + }, + { + "epoch": 0.17443649859690413, + "grad_norm": 1.1209522485733032, + "learning_rate": 0.00018365905116742105, + "loss": 3.2659, + "step": 1927 + }, + { + "epoch": 0.17452702091065447, + "grad_norm": 1.10820472240448, + "learning_rate": 0.00018364166256448173, + "loss": 3.2915, + "step": 1928 + }, + { + "epoch": 0.17461754322440481, + "grad_norm": 0.9124554395675659, + "learning_rate": 0.0001836242655388293, + "loss": 3.2319, + "step": 1929 + }, + { + "epoch": 0.17470806553815516, + "grad_norm": 1.0809296369552612, + "learning_rate": 0.0001836068600922156, + "loss": 3.3081, + "step": 1930 + }, + { + "epoch": 0.1747985878519055, + "grad_norm": 0.9386884570121765, + "learning_rate": 0.00018358944622639338, + "loss": 3.3148, + "step": 1931 + }, + { + "epoch": 0.17488911016565584, + "grad_norm": 0.9475247263908386, + "learning_rate": 0.00018357202394311624, + "loss": 3.2621, + "step": 1932 + }, + { + "epoch": 0.17497963247940618, + "grad_norm": 1.2415645122528076, + "learning_rate": 0.00018355459324413852, + "loss": 3.3089, + "step": 1933 + }, + { + "epoch": 0.17507015479315652, + "grad_norm": 0.897971510887146, + "learning_rate": 0.0001835371541312156, + "loss": 3.2613, + "step": 1934 + }, + { + "epoch": 0.17516067710690686, + "grad_norm": 0.8742809891700745, + "learning_rate": 0.0001835197066061035, + "loss": 3.2324, + "step": 1935 + }, + { + "epoch": 0.1752511994206572, + "grad_norm": 0.9875279068946838, + "learning_rate": 0.00018350225067055925, + "loss": 3.2903, + "step": 1936 + }, + { + "epoch": 0.17534172173440754, + "grad_norm": 0.8936052918434143, + "learning_rate": 0.00018348478632634066, + "loss": 3.2961, + "step": 1937 + }, + { + "epoch": 0.17543224404815788, + "grad_norm": 0.9748427271842957, + "learning_rate": 0.00018346731357520637, + "loss": 3.3293, + "step": 1938 + }, + { + "epoch": 0.17552276636190822, + "grad_norm": 1.0512956380844116, + "learning_rate": 0.00018344983241891586, + "loss": 3.2362, + "step": 1939 + }, + { + "epoch": 0.17561328867565856, + "grad_norm": 0.950809121131897, + "learning_rate": 0.00018343234285922953, + "loss": 3.27, + "step": 1940 + }, + { + "epoch": 0.17570381098940888, + "grad_norm": 1.1645820140838623, + "learning_rate": 0.00018341484489790854, + "loss": 3.3205, + "step": 1941 + }, + { + "epoch": 0.17579433330315922, + "grad_norm": 1.1008210182189941, + "learning_rate": 0.00018339733853671496, + "loss": 3.2902, + "step": 1942 + }, + { + "epoch": 0.17588485561690956, + "grad_norm": 1.1445972919464111, + "learning_rate": 0.00018337982377741166, + "loss": 3.319, + "step": 1943 + }, + { + "epoch": 0.1759753779306599, + "grad_norm": 1.0789272785186768, + "learning_rate": 0.00018336230062176244, + "loss": 3.2692, + "step": 1944 + }, + { + "epoch": 0.17606590024441024, + "grad_norm": 1.190246820449829, + "learning_rate": 0.00018334476907153177, + "loss": 3.337, + "step": 1945 + }, + { + "epoch": 0.17615642255816058, + "grad_norm": 1.0693939924240112, + "learning_rate": 0.00018332722912848515, + "loss": 3.1858, + "step": 1946 + }, + { + "epoch": 0.17624694487191092, + "grad_norm": 1.0451300144195557, + "learning_rate": 0.00018330968079438887, + "loss": 3.2647, + "step": 1947 + }, + { + "epoch": 0.17633746718566126, + "grad_norm": 0.9115802645683289, + "learning_rate": 0.00018329212407100994, + "loss": 3.2782, + "step": 1948 + }, + { + "epoch": 0.1764279894994116, + "grad_norm": 1.0351324081420898, + "learning_rate": 0.00018327455896011645, + "loss": 3.3408, + "step": 1949 + }, + { + "epoch": 0.17651851181316194, + "grad_norm": 0.8210245370864868, + "learning_rate": 0.00018325698546347715, + "loss": 3.3034, + "step": 1950 + }, + { + "epoch": 0.17660903412691228, + "grad_norm": 0.9735020399093628, + "learning_rate": 0.00018323940358286168, + "loss": 3.3017, + "step": 1951 + }, + { + "epoch": 0.17669955644066263, + "grad_norm": 0.914596676826477, + "learning_rate": 0.00018322181332004056, + "loss": 3.2619, + "step": 1952 + }, + { + "epoch": 0.17679007875441297, + "grad_norm": 0.8887321949005127, + "learning_rate": 0.00018320421467678507, + "loss": 3.2938, + "step": 1953 + }, + { + "epoch": 0.1768806010681633, + "grad_norm": 0.851830005645752, + "learning_rate": 0.00018318660765486748, + "loss": 3.2433, + "step": 1954 + }, + { + "epoch": 0.17697112338191365, + "grad_norm": 0.9544591903686523, + "learning_rate": 0.00018316899225606078, + "loss": 3.2891, + "step": 1955 + }, + { + "epoch": 0.177061645695664, + "grad_norm": 0.9891889691352844, + "learning_rate": 0.00018315136848213884, + "loss": 3.2969, + "step": 1956 + }, + { + "epoch": 0.17715216800941433, + "grad_norm": 0.9165775179862976, + "learning_rate": 0.00018313373633487633, + "loss": 3.185, + "step": 1957 + }, + { + "epoch": 0.17724269032316467, + "grad_norm": 0.8629693388938904, + "learning_rate": 0.00018311609581604887, + "loss": 3.3367, + "step": 1958 + }, + { + "epoch": 0.177333212636915, + "grad_norm": 0.9266963601112366, + "learning_rate": 0.00018309844692743283, + "loss": 3.3069, + "step": 1959 + }, + { + "epoch": 0.17742373495066535, + "grad_norm": 0.865626335144043, + "learning_rate": 0.00018308078967080546, + "loss": 3.2557, + "step": 1960 + }, + { + "epoch": 0.17751425726441566, + "grad_norm": 0.9198414087295532, + "learning_rate": 0.00018306312404794487, + "loss": 3.2345, + "step": 1961 + }, + { + "epoch": 0.177604779578166, + "grad_norm": 1.0007754564285278, + "learning_rate": 0.0001830454500606299, + "loss": 3.3282, + "step": 1962 + }, + { + "epoch": 0.17769530189191635, + "grad_norm": 0.9740175604820251, + "learning_rate": 0.00018302776771064044, + "loss": 3.3237, + "step": 1963 + }, + { + "epoch": 0.1777858242056667, + "grad_norm": 0.9359791278839111, + "learning_rate": 0.00018301007699975702, + "loss": 3.281, + "step": 1964 + }, + { + "epoch": 0.17787634651941703, + "grad_norm": 1.0587137937545776, + "learning_rate": 0.00018299237792976112, + "loss": 3.3509, + "step": 1965 + }, + { + "epoch": 0.17796686883316737, + "grad_norm": 0.9631637334823608, + "learning_rate": 0.00018297467050243501, + "loss": 3.3646, + "step": 1966 + }, + { + "epoch": 0.1780573911469177, + "grad_norm": 0.9102620482444763, + "learning_rate": 0.00018295695471956189, + "loss": 3.3086, + "step": 1967 + }, + { + "epoch": 0.17814791346066805, + "grad_norm": 0.9048988819122314, + "learning_rate": 0.0001829392305829257, + "loss": 3.2488, + "step": 1968 + }, + { + "epoch": 0.1782384357744184, + "grad_norm": 0.9322419166564941, + "learning_rate": 0.00018292149809431122, + "loss": 3.242, + "step": 1969 + }, + { + "epoch": 0.17832895808816873, + "grad_norm": 1.042884349822998, + "learning_rate": 0.00018290375725550417, + "loss": 3.2302, + "step": 1970 + }, + { + "epoch": 0.17841948040191907, + "grad_norm": 0.9759350419044495, + "learning_rate": 0.00018288600806829104, + "loss": 3.2822, + "step": 1971 + }, + { + "epoch": 0.1785100027156694, + "grad_norm": 0.9325250387191772, + "learning_rate": 0.00018286825053445918, + "loss": 3.2232, + "step": 1972 + }, + { + "epoch": 0.17860052502941975, + "grad_norm": 0.9702731370925903, + "learning_rate": 0.00018285048465579672, + "loss": 3.2642, + "step": 1973 + }, + { + "epoch": 0.1786910473431701, + "grad_norm": 0.9540538787841797, + "learning_rate": 0.00018283271043409273, + "loss": 3.2451, + "step": 1974 + }, + { + "epoch": 0.17878156965692044, + "grad_norm": 0.9841457605361938, + "learning_rate": 0.00018281492787113708, + "loss": 3.3384, + "step": 1975 + }, + { + "epoch": 0.17887209197067078, + "grad_norm": 0.9560434222221375, + "learning_rate": 0.00018279713696872047, + "loss": 3.372, + "step": 1976 + }, + { + "epoch": 0.17896261428442112, + "grad_norm": 1.0200730562210083, + "learning_rate": 0.0001827793377286344, + "loss": 3.3252, + "step": 1977 + }, + { + "epoch": 0.17905313659817146, + "grad_norm": 0.9988753795623779, + "learning_rate": 0.00018276153015267134, + "loss": 3.2427, + "step": 1978 + }, + { + "epoch": 0.1791436589119218, + "grad_norm": 0.9856618642807007, + "learning_rate": 0.0001827437142426244, + "loss": 3.3344, + "step": 1979 + }, + { + "epoch": 0.17923418122567214, + "grad_norm": 0.9482274055480957, + "learning_rate": 0.00018272589000028772, + "loss": 3.2712, + "step": 1980 + }, + { + "epoch": 0.17932470353942248, + "grad_norm": 1.0072088241577148, + "learning_rate": 0.00018270805742745617, + "loss": 3.2766, + "step": 1981 + }, + { + "epoch": 0.1794152258531728, + "grad_norm": 1.0293302536010742, + "learning_rate": 0.00018269021652592553, + "loss": 3.3456, + "step": 1982 + }, + { + "epoch": 0.17950574816692313, + "grad_norm": 1.1364372968673706, + "learning_rate": 0.00018267236729749232, + "loss": 3.3901, + "step": 1983 + }, + { + "epoch": 0.17959627048067348, + "grad_norm": 1.0293986797332764, + "learning_rate": 0.00018265450974395403, + "loss": 3.2848, + "step": 1984 + }, + { + "epoch": 0.17968679279442382, + "grad_norm": 0.8650600910186768, + "learning_rate": 0.00018263664386710882, + "loss": 3.2819, + "step": 1985 + }, + { + "epoch": 0.17977731510817416, + "grad_norm": 0.9513899087905884, + "learning_rate": 0.00018261876966875584, + "loss": 3.3258, + "step": 1986 + }, + { + "epoch": 0.1798678374219245, + "grad_norm": 0.9329274892807007, + "learning_rate": 0.00018260088715069505, + "loss": 3.2609, + "step": 1987 + }, + { + "epoch": 0.17995835973567484, + "grad_norm": 1.0387474298477173, + "learning_rate": 0.00018258299631472715, + "loss": 3.2725, + "step": 1988 + }, + { + "epoch": 0.18004888204942518, + "grad_norm": 0.8893908858299255, + "learning_rate": 0.0001825650971626538, + "loss": 3.2557, + "step": 1989 + }, + { + "epoch": 0.18013940436317552, + "grad_norm": 1.0605710744857788, + "learning_rate": 0.0001825471896962774, + "loss": 3.1661, + "step": 1990 + }, + { + "epoch": 0.18022992667692586, + "grad_norm": 1.0651538372039795, + "learning_rate": 0.00018252927391740128, + "loss": 3.305, + "step": 1991 + }, + { + "epoch": 0.1803204489906762, + "grad_norm": 1.0231541395187378, + "learning_rate": 0.00018251134982782952, + "loss": 3.3347, + "step": 1992 + }, + { + "epoch": 0.18041097130442654, + "grad_norm": 1.2418352365493774, + "learning_rate": 0.0001824934174293671, + "loss": 3.284, + "step": 1993 + }, + { + "epoch": 0.18050149361817688, + "grad_norm": 1.0233439207077026, + "learning_rate": 0.0001824754767238198, + "loss": 3.3628, + "step": 1994 + }, + { + "epoch": 0.18059201593192722, + "grad_norm": 1.1309494972229004, + "learning_rate": 0.00018245752771299425, + "loss": 3.2799, + "step": 1995 + }, + { + "epoch": 0.18068253824567757, + "grad_norm": 1.152944564819336, + "learning_rate": 0.0001824395703986979, + "loss": 3.271, + "step": 1996 + }, + { + "epoch": 0.1807730605594279, + "grad_norm": 0.96519535779953, + "learning_rate": 0.00018242160478273907, + "loss": 3.2316, + "step": 1997 + }, + { + "epoch": 0.18086358287317825, + "grad_norm": 0.9933448433876038, + "learning_rate": 0.00018240363086692693, + "loss": 3.3146, + "step": 1998 + }, + { + "epoch": 0.1809541051869286, + "grad_norm": 0.9097755551338196, + "learning_rate": 0.00018238564865307138, + "loss": 3.3065, + "step": 1999 + }, + { + "epoch": 0.18104462750067893, + "grad_norm": 1.1705602407455444, + "learning_rate": 0.0001823676581429833, + "loss": 3.2106, + "step": 2000 + }, + { + "epoch": 0.18104462750067893, + "eval_loss": 3.209501028060913, + "eval_runtime": 71.6327, + "eval_samples_per_second": 37.734, + "eval_steps_per_second": 3.155, + "step": 2000 + }, + { + "epoch": 0.18113514981442927, + "grad_norm": 1.0002061128616333, + "learning_rate": 0.00018234965933847428, + "loss": 3.2278, + "step": 2001 + }, + { + "epoch": 0.1812256721281796, + "grad_norm": 1.1755338907241821, + "learning_rate": 0.00018233165224135678, + "loss": 3.2653, + "step": 2002 + }, + { + "epoch": 0.18131619444192992, + "grad_norm": 0.9064604043960571, + "learning_rate": 0.0001823136368534442, + "loss": 3.2967, + "step": 2003 + }, + { + "epoch": 0.18140671675568026, + "grad_norm": 0.9214131236076355, + "learning_rate": 0.00018229561317655062, + "loss": 3.3147, + "step": 2004 + }, + { + "epoch": 0.1814972390694306, + "grad_norm": 0.8722012042999268, + "learning_rate": 0.00018227758121249106, + "loss": 3.2075, + "step": 2005 + }, + { + "epoch": 0.18158776138318095, + "grad_norm": 0.9610263705253601, + "learning_rate": 0.00018225954096308131, + "loss": 3.2718, + "step": 2006 + }, + { + "epoch": 0.1816782836969313, + "grad_norm": 0.9243883490562439, + "learning_rate": 0.00018224149243013804, + "loss": 3.2456, + "step": 2007 + }, + { + "epoch": 0.18176880601068163, + "grad_norm": 0.9212303757667542, + "learning_rate": 0.00018222343561547874, + "loss": 3.2107, + "step": 2008 + }, + { + "epoch": 0.18185932832443197, + "grad_norm": 0.8368549942970276, + "learning_rate": 0.0001822053705209217, + "loss": 3.2671, + "step": 2009 + }, + { + "epoch": 0.1819498506381823, + "grad_norm": 0.9258269667625427, + "learning_rate": 0.00018218729714828612, + "loss": 3.2758, + "step": 2010 + }, + { + "epoch": 0.18204037295193265, + "grad_norm": 0.9486406445503235, + "learning_rate": 0.00018216921549939197, + "loss": 3.2163, + "step": 2011 + }, + { + "epoch": 0.182130895265683, + "grad_norm": 0.9499784111976624, + "learning_rate": 0.00018215112557606005, + "loss": 3.2586, + "step": 2012 + }, + { + "epoch": 0.18222141757943333, + "grad_norm": 0.9287562370300293, + "learning_rate": 0.00018213302738011203, + "loss": 3.3807, + "step": 2013 + }, + { + "epoch": 0.18231193989318367, + "grad_norm": 1.0922387838363647, + "learning_rate": 0.00018211492091337042, + "loss": 3.2507, + "step": 2014 + }, + { + "epoch": 0.182402462206934, + "grad_norm": 0.9361944198608398, + "learning_rate": 0.00018209680617765848, + "loss": 3.3209, + "step": 2015 + }, + { + "epoch": 0.18249298452068435, + "grad_norm": 0.9226075410842896, + "learning_rate": 0.00018207868317480046, + "loss": 3.2744, + "step": 2016 + }, + { + "epoch": 0.1825835068344347, + "grad_norm": 0.9788509607315063, + "learning_rate": 0.00018206055190662126, + "loss": 3.2184, + "step": 2017 + }, + { + "epoch": 0.18267402914818504, + "grad_norm": 1.0086486339569092, + "learning_rate": 0.00018204241237494673, + "loss": 3.3117, + "step": 2018 + }, + { + "epoch": 0.18276455146193538, + "grad_norm": 1.0230251550674438, + "learning_rate": 0.00018202426458160354, + "loss": 3.3178, + "step": 2019 + }, + { + "epoch": 0.18285507377568572, + "grad_norm": 0.9822840690612793, + "learning_rate": 0.00018200610852841913, + "loss": 3.309, + "step": 2020 + }, + { + "epoch": 0.18294559608943606, + "grad_norm": 1.0047194957733154, + "learning_rate": 0.00018198794421722182, + "loss": 3.222, + "step": 2021 + }, + { + "epoch": 0.1830361184031864, + "grad_norm": 0.923944354057312, + "learning_rate": 0.0001819697716498408, + "loss": 3.2258, + "step": 2022 + }, + { + "epoch": 0.1831266407169367, + "grad_norm": 0.9885908961296082, + "learning_rate": 0.000181951590828106, + "loss": 3.2468, + "step": 2023 + }, + { + "epoch": 0.18321716303068705, + "grad_norm": 0.9848500490188599, + "learning_rate": 0.00018193340175384824, + "loss": 3.2564, + "step": 2024 + }, + { + "epoch": 0.1833076853444374, + "grad_norm": 0.9301328659057617, + "learning_rate": 0.0001819152044288992, + "loss": 3.2512, + "step": 2025 + }, + { + "epoch": 0.18339820765818773, + "grad_norm": 0.9676833152770996, + "learning_rate": 0.00018189699885509127, + "loss": 3.3275, + "step": 2026 + }, + { + "epoch": 0.18348872997193807, + "grad_norm": 1.0562924146652222, + "learning_rate": 0.00018187878503425782, + "loss": 3.2531, + "step": 2027 + }, + { + "epoch": 0.18357925228568842, + "grad_norm": 0.8997370600700378, + "learning_rate": 0.00018186056296823298, + "loss": 3.2492, + "step": 2028 + }, + { + "epoch": 0.18366977459943876, + "grad_norm": 1.132249355316162, + "learning_rate": 0.00018184233265885165, + "loss": 3.263, + "step": 2029 + }, + { + "epoch": 0.1837602969131891, + "grad_norm": 0.9985945224761963, + "learning_rate": 0.00018182409410794968, + "loss": 3.2512, + "step": 2030 + }, + { + "epoch": 0.18385081922693944, + "grad_norm": 0.9604045748710632, + "learning_rate": 0.00018180584731736365, + "loss": 3.3035, + "step": 2031 + }, + { + "epoch": 0.18394134154068978, + "grad_norm": 0.9413614273071289, + "learning_rate": 0.00018178759228893108, + "loss": 3.2909, + "step": 2032 + }, + { + "epoch": 0.18403186385444012, + "grad_norm": 1.2012258768081665, + "learning_rate": 0.00018176932902449016, + "loss": 3.2334, + "step": 2033 + }, + { + "epoch": 0.18412238616819046, + "grad_norm": 0.8994251489639282, + "learning_rate": 0.00018175105752588007, + "loss": 3.2546, + "step": 2034 + }, + { + "epoch": 0.1842129084819408, + "grad_norm": 0.9620054960250854, + "learning_rate": 0.0001817327777949407, + "loss": 3.3143, + "step": 2035 + }, + { + "epoch": 0.18430343079569114, + "grad_norm": 0.9758648872375488, + "learning_rate": 0.00018171448983351284, + "loss": 3.2648, + "step": 2036 + }, + { + "epoch": 0.18439395310944148, + "grad_norm": 0.9170195460319519, + "learning_rate": 0.00018169619364343813, + "loss": 3.2282, + "step": 2037 + }, + { + "epoch": 0.18448447542319182, + "grad_norm": 0.9589044451713562, + "learning_rate": 0.00018167788922655894, + "loss": 3.2508, + "step": 2038 + }, + { + "epoch": 0.18457499773694216, + "grad_norm": 0.948966383934021, + "learning_rate": 0.00018165957658471853, + "loss": 3.2873, + "step": 2039 + }, + { + "epoch": 0.1846655200506925, + "grad_norm": 1.168324589729309, + "learning_rate": 0.00018164125571976098, + "loss": 3.3223, + "step": 2040 + }, + { + "epoch": 0.18475604236444285, + "grad_norm": 0.9419066905975342, + "learning_rate": 0.00018162292663353124, + "loss": 3.2272, + "step": 2041 + }, + { + "epoch": 0.1848465646781932, + "grad_norm": 0.9817954301834106, + "learning_rate": 0.000181604589327875, + "loss": 3.1878, + "step": 2042 + }, + { + "epoch": 0.18493708699194353, + "grad_norm": 0.8973686099052429, + "learning_rate": 0.00018158624380463888, + "loss": 3.2144, + "step": 2043 + }, + { + "epoch": 0.18502760930569384, + "grad_norm": 1.1221078634262085, + "learning_rate": 0.0001815678900656702, + "loss": 3.2616, + "step": 2044 + }, + { + "epoch": 0.18511813161944418, + "grad_norm": 0.8626366853713989, + "learning_rate": 0.00018154952811281724, + "loss": 3.2119, + "step": 2045 + }, + { + "epoch": 0.18520865393319452, + "grad_norm": 0.9677149653434753, + "learning_rate": 0.00018153115794792904, + "loss": 3.2749, + "step": 2046 + }, + { + "epoch": 0.18529917624694486, + "grad_norm": 0.9512866139411926, + "learning_rate": 0.00018151277957285543, + "loss": 3.2477, + "step": 2047 + }, + { + "epoch": 0.1853896985606952, + "grad_norm": 1.0640791654586792, + "learning_rate": 0.00018149439298944717, + "loss": 3.198, + "step": 2048 + }, + { + "epoch": 0.18548022087444554, + "grad_norm": 1.0518122911453247, + "learning_rate": 0.0001814759981995558, + "loss": 3.3341, + "step": 2049 + }, + { + "epoch": 0.18557074318819589, + "grad_norm": 0.9965335130691528, + "learning_rate": 0.00018145759520503358, + "loss": 3.2755, + "step": 2050 + }, + { + "epoch": 0.18566126550194623, + "grad_norm": 0.8979808688163757, + "learning_rate": 0.00018143918400773376, + "loss": 3.2551, + "step": 2051 + }, + { + "epoch": 0.18575178781569657, + "grad_norm": 0.93203204870224, + "learning_rate": 0.0001814207646095104, + "loss": 3.2214, + "step": 2052 + }, + { + "epoch": 0.1858423101294469, + "grad_norm": 0.9714310169219971, + "learning_rate": 0.0001814023370122182, + "loss": 3.3333, + "step": 2053 + }, + { + "epoch": 0.18593283244319725, + "grad_norm": 0.8723704218864441, + "learning_rate": 0.00018138390121771295, + "loss": 3.2398, + "step": 2054 + }, + { + "epoch": 0.1860233547569476, + "grad_norm": 0.9395724534988403, + "learning_rate": 0.00018136545722785102, + "loss": 3.3092, + "step": 2055 + }, + { + "epoch": 0.18611387707069793, + "grad_norm": 0.9963365793228149, + "learning_rate": 0.0001813470050444898, + "loss": 3.2367, + "step": 2056 + }, + { + "epoch": 0.18620439938444827, + "grad_norm": 0.9642789363861084, + "learning_rate": 0.0001813285446694874, + "loss": 3.2365, + "step": 2057 + }, + { + "epoch": 0.1862949216981986, + "grad_norm": 1.004480242729187, + "learning_rate": 0.00018131007610470276, + "loss": 3.2573, + "step": 2058 + }, + { + "epoch": 0.18638544401194895, + "grad_norm": 0.8809323310852051, + "learning_rate": 0.00018129159935199572, + "loss": 3.1922, + "step": 2059 + }, + { + "epoch": 0.1864759663256993, + "grad_norm": 1.2615585327148438, + "learning_rate": 0.0001812731144132268, + "loss": 3.2372, + "step": 2060 + }, + { + "epoch": 0.18656648863944963, + "grad_norm": 0.8767595887184143, + "learning_rate": 0.00018125462129025752, + "loss": 3.1661, + "step": 2061 + }, + { + "epoch": 0.18665701095319998, + "grad_norm": 1.0361719131469727, + "learning_rate": 0.00018123611998495007, + "loss": 3.2573, + "step": 2062 + }, + { + "epoch": 0.18674753326695032, + "grad_norm": 0.9326725006103516, + "learning_rate": 0.0001812176104991676, + "loss": 3.1669, + "step": 2063 + }, + { + "epoch": 0.18683805558070063, + "grad_norm": 1.1481561660766602, + "learning_rate": 0.00018119909283477394, + "loss": 3.2874, + "step": 2064 + }, + { + "epoch": 0.18692857789445097, + "grad_norm": 0.9084659218788147, + "learning_rate": 0.00018118056699363387, + "loss": 3.3139, + "step": 2065 + }, + { + "epoch": 0.1870191002082013, + "grad_norm": 0.9566446542739868, + "learning_rate": 0.00018116203297761292, + "loss": 3.318, + "step": 2066 + }, + { + "epoch": 0.18710962252195165, + "grad_norm": 0.8745667338371277, + "learning_rate": 0.00018114349078857746, + "loss": 3.1519, + "step": 2067 + }, + { + "epoch": 0.187200144835702, + "grad_norm": 0.954801082611084, + "learning_rate": 0.0001811249404283947, + "loss": 3.221, + "step": 2068 + }, + { + "epoch": 0.18729066714945233, + "grad_norm": 0.8902572989463806, + "learning_rate": 0.00018110638189893267, + "loss": 3.2083, + "step": 2069 + }, + { + "epoch": 0.18738118946320267, + "grad_norm": 0.880317211151123, + "learning_rate": 0.0001810878152020602, + "loss": 3.1926, + "step": 2070 + }, + { + "epoch": 0.18747171177695302, + "grad_norm": 1.0060982704162598, + "learning_rate": 0.00018106924033964696, + "loss": 3.2319, + "step": 2071 + }, + { + "epoch": 0.18756223409070336, + "grad_norm": 0.9899522662162781, + "learning_rate": 0.00018105065731356343, + "loss": 3.1814, + "step": 2072 + }, + { + "epoch": 0.1876527564044537, + "grad_norm": 0.8588231205940247, + "learning_rate": 0.00018103206612568094, + "loss": 3.21, + "step": 2073 + }, + { + "epoch": 0.18774327871820404, + "grad_norm": 0.9145846962928772, + "learning_rate": 0.00018101346677787156, + "loss": 3.2141, + "step": 2074 + }, + { + "epoch": 0.18783380103195438, + "grad_norm": 0.9708083271980286, + "learning_rate": 0.00018099485927200836, + "loss": 3.2744, + "step": 2075 + }, + { + "epoch": 0.18792432334570472, + "grad_norm": 0.9268299341201782, + "learning_rate": 0.000180976243609965, + "loss": 3.1685, + "step": 2076 + }, + { + "epoch": 0.18801484565945506, + "grad_norm": 0.9239932894706726, + "learning_rate": 0.00018095761979361615, + "loss": 3.2955, + "step": 2077 + }, + { + "epoch": 0.1881053679732054, + "grad_norm": 0.9097129702568054, + "learning_rate": 0.00018093898782483718, + "loss": 3.3611, + "step": 2078 + }, + { + "epoch": 0.18819589028695574, + "grad_norm": 1.1154252290725708, + "learning_rate": 0.00018092034770550436, + "loss": 3.2398, + "step": 2079 + }, + { + "epoch": 0.18828641260070608, + "grad_norm": 0.9370711445808411, + "learning_rate": 0.00018090169943749476, + "loss": 3.2075, + "step": 2080 + }, + { + "epoch": 0.18837693491445642, + "grad_norm": 0.9421517252922058, + "learning_rate": 0.00018088304302268623, + "loss": 3.1681, + "step": 2081 + }, + { + "epoch": 0.18846745722820676, + "grad_norm": 0.8839324712753296, + "learning_rate": 0.00018086437846295746, + "loss": 3.2357, + "step": 2082 + }, + { + "epoch": 0.1885579795419571, + "grad_norm": 1.0185707807540894, + "learning_rate": 0.000180845705760188, + "loss": 3.2913, + "step": 2083 + }, + { + "epoch": 0.18864850185570745, + "grad_norm": 0.9895525574684143, + "learning_rate": 0.0001808270249162582, + "loss": 3.3165, + "step": 2084 + }, + { + "epoch": 0.18873902416945776, + "grad_norm": 0.9668818712234497, + "learning_rate": 0.00018080833593304917, + "loss": 3.2791, + "step": 2085 + }, + { + "epoch": 0.1888295464832081, + "grad_norm": 1.1404991149902344, + "learning_rate": 0.00018078963881244296, + "loss": 3.3115, + "step": 2086 + }, + { + "epoch": 0.18892006879695844, + "grad_norm": 0.9569199085235596, + "learning_rate": 0.0001807709335563223, + "loss": 3.265, + "step": 2087 + }, + { + "epoch": 0.18901059111070878, + "grad_norm": 0.8879489302635193, + "learning_rate": 0.00018075222016657088, + "loss": 3.3034, + "step": 2088 + }, + { + "epoch": 0.18910111342445912, + "grad_norm": 1.0276618003845215, + "learning_rate": 0.00018073349864507305, + "loss": 3.2423, + "step": 2089 + }, + { + "epoch": 0.18919163573820946, + "grad_norm": 1.0222197771072388, + "learning_rate": 0.00018071476899371414, + "loss": 3.2373, + "step": 2090 + }, + { + "epoch": 0.1892821580519598, + "grad_norm": 0.9289442896842957, + "learning_rate": 0.00018069603121438022, + "loss": 3.173, + "step": 2091 + }, + { + "epoch": 0.18937268036571014, + "grad_norm": 0.9327288866043091, + "learning_rate": 0.00018067728530895815, + "loss": 3.2297, + "step": 2092 + }, + { + "epoch": 0.18946320267946049, + "grad_norm": 0.9465458393096924, + "learning_rate": 0.00018065853127933562, + "loss": 3.225, + "step": 2093 + }, + { + "epoch": 0.18955372499321083, + "grad_norm": 0.9022896885871887, + "learning_rate": 0.00018063976912740124, + "loss": 3.2729, + "step": 2094 + }, + { + "epoch": 0.18964424730696117, + "grad_norm": 0.9528231024742126, + "learning_rate": 0.0001806209988550443, + "loss": 3.268, + "step": 2095 + }, + { + "epoch": 0.1897347696207115, + "grad_norm": 0.9072818756103516, + "learning_rate": 0.000180602220464155, + "loss": 3.2786, + "step": 2096 + }, + { + "epoch": 0.18982529193446185, + "grad_norm": 0.8678036332130432, + "learning_rate": 0.00018058343395662427, + "loss": 3.2, + "step": 2097 + }, + { + "epoch": 0.1899158142482122, + "grad_norm": 0.9147240519523621, + "learning_rate": 0.00018056463933434398, + "loss": 3.2329, + "step": 2098 + }, + { + "epoch": 0.19000633656196253, + "grad_norm": 0.9179824590682983, + "learning_rate": 0.00018054583659920669, + "loss": 3.2009, + "step": 2099 + }, + { + "epoch": 0.19009685887571287, + "grad_norm": 0.9133867621421814, + "learning_rate": 0.00018052702575310588, + "loss": 3.1839, + "step": 2100 + }, + { + "epoch": 0.1901873811894632, + "grad_norm": 0.9742360711097717, + "learning_rate": 0.00018050820679793575, + "loss": 3.1951, + "step": 2101 + }, + { + "epoch": 0.19027790350321355, + "grad_norm": 0.9586870670318604, + "learning_rate": 0.0001804893797355914, + "loss": 3.276, + "step": 2102 + }, + { + "epoch": 0.1903684258169639, + "grad_norm": 1.0903187990188599, + "learning_rate": 0.00018047054456796877, + "loss": 3.1827, + "step": 2103 + }, + { + "epoch": 0.19045894813071423, + "grad_norm": 1.017938494682312, + "learning_rate": 0.0001804517012969644, + "loss": 3.2418, + "step": 2104 + }, + { + "epoch": 0.19054947044446455, + "grad_norm": 0.997882604598999, + "learning_rate": 0.00018043284992447602, + "loss": 3.2032, + "step": 2105 + }, + { + "epoch": 0.1906399927582149, + "grad_norm": 1.032028079032898, + "learning_rate": 0.00018041399045240183, + "loss": 3.2286, + "step": 2106 + }, + { + "epoch": 0.19073051507196523, + "grad_norm": 1.2459874153137207, + "learning_rate": 0.00018039512288264096, + "loss": 3.2662, + "step": 2107 + }, + { + "epoch": 0.19082103738571557, + "grad_norm": 1.0763884782791138, + "learning_rate": 0.00018037624721709342, + "loss": 3.2846, + "step": 2108 + }, + { + "epoch": 0.1909115596994659, + "grad_norm": 1.4135953187942505, + "learning_rate": 0.00018035736345765999, + "loss": 3.1769, + "step": 2109 + }, + { + "epoch": 0.19100208201321625, + "grad_norm": 1.0524734258651733, + "learning_rate": 0.00018033847160624225, + "loss": 3.2379, + "step": 2110 + }, + { + "epoch": 0.1910926043269666, + "grad_norm": 1.238844633102417, + "learning_rate": 0.00018031957166474264, + "loss": 3.2223, + "step": 2111 + }, + { + "epoch": 0.19118312664071693, + "grad_norm": 1.1036057472229004, + "learning_rate": 0.00018030066363506437, + "loss": 3.3338, + "step": 2112 + }, + { + "epoch": 0.19127364895446727, + "grad_norm": 1.2267645597457886, + "learning_rate": 0.00018028174751911146, + "loss": 3.2718, + "step": 2113 + }, + { + "epoch": 0.19136417126821761, + "grad_norm": 1.2532908916473389, + "learning_rate": 0.00018026282331878876, + "loss": 3.2186, + "step": 2114 + }, + { + "epoch": 0.19145469358196796, + "grad_norm": 0.913084089756012, + "learning_rate": 0.00018024389103600197, + "loss": 3.2445, + "step": 2115 + }, + { + "epoch": 0.1915452158957183, + "grad_norm": 1.1660557985305786, + "learning_rate": 0.00018022495067265753, + "loss": 3.1916, + "step": 2116 + }, + { + "epoch": 0.19163573820946864, + "grad_norm": 0.9609684348106384, + "learning_rate": 0.00018020600223066276, + "loss": 3.2422, + "step": 2117 + }, + { + "epoch": 0.19172626052321898, + "grad_norm": 0.9922468066215515, + "learning_rate": 0.00018018704571192573, + "loss": 3.2267, + "step": 2118 + }, + { + "epoch": 0.19181678283696932, + "grad_norm": 1.00340735912323, + "learning_rate": 0.00018016808111835544, + "loss": 3.2951, + "step": 2119 + }, + { + "epoch": 0.19190730515071966, + "grad_norm": 0.9352942109107971, + "learning_rate": 0.00018014910845186153, + "loss": 3.2118, + "step": 2120 + }, + { + "epoch": 0.19199782746447, + "grad_norm": 0.9789060950279236, + "learning_rate": 0.0001801301277143546, + "loss": 3.3368, + "step": 2121 + }, + { + "epoch": 0.19208834977822034, + "grad_norm": 0.9243131279945374, + "learning_rate": 0.00018011113890774603, + "loss": 3.2403, + "step": 2122 + }, + { + "epoch": 0.19217887209197068, + "grad_norm": 0.943941593170166, + "learning_rate": 0.00018009214203394794, + "loss": 3.2127, + "step": 2123 + }, + { + "epoch": 0.19226939440572102, + "grad_norm": 0.9229180216789246, + "learning_rate": 0.00018007313709487334, + "loss": 3.2846, + "step": 2124 + }, + { + "epoch": 0.19235991671947136, + "grad_norm": 1.020640254020691, + "learning_rate": 0.00018005412409243606, + "loss": 3.2412, + "step": 2125 + }, + { + "epoch": 0.19245043903322168, + "grad_norm": 0.8972303867340088, + "learning_rate": 0.0001800351030285506, + "loss": 3.2709, + "step": 2126 + }, + { + "epoch": 0.19254096134697202, + "grad_norm": 0.9157093167304993, + "learning_rate": 0.00018001607390513255, + "loss": 3.2852, + "step": 2127 + }, + { + "epoch": 0.19263148366072236, + "grad_norm": 0.9684386849403381, + "learning_rate": 0.000179997036724098, + "loss": 3.2637, + "step": 2128 + }, + { + "epoch": 0.1927220059744727, + "grad_norm": 1.0669679641723633, + "learning_rate": 0.00017997799148736403, + "loss": 3.1715, + "step": 2129 + }, + { + "epoch": 0.19281252828822304, + "grad_norm": 0.9132657051086426, + "learning_rate": 0.0001799589381968485, + "loss": 3.2373, + "step": 2130 + }, + { + "epoch": 0.19290305060197338, + "grad_norm": 0.9766040444374084, + "learning_rate": 0.00017993987685447008, + "loss": 3.2287, + "step": 2131 + }, + { + "epoch": 0.19299357291572372, + "grad_norm": 0.9845170974731445, + "learning_rate": 0.0001799208074621483, + "loss": 3.2386, + "step": 2132 + }, + { + "epoch": 0.19308409522947406, + "grad_norm": 0.9120230674743652, + "learning_rate": 0.00017990173002180334, + "loss": 3.2479, + "step": 2133 + }, + { + "epoch": 0.1931746175432244, + "grad_norm": 0.8494027256965637, + "learning_rate": 0.0001798826445353564, + "loss": 3.1777, + "step": 2134 + }, + { + "epoch": 0.19326513985697474, + "grad_norm": 0.945387065410614, + "learning_rate": 0.00017986355100472928, + "loss": 3.2252, + "step": 2135 + }, + { + "epoch": 0.19335566217072508, + "grad_norm": 1.0038387775421143, + "learning_rate": 0.0001798444494318448, + "loss": 3.1945, + "step": 2136 + }, + { + "epoch": 0.19344618448447543, + "grad_norm": 0.9529719948768616, + "learning_rate": 0.00017982533981862644, + "loss": 3.1815, + "step": 2137 + }, + { + "epoch": 0.19353670679822577, + "grad_norm": 0.9867908954620361, + "learning_rate": 0.00017980622216699855, + "loss": 3.2314, + "step": 2138 + }, + { + "epoch": 0.1936272291119761, + "grad_norm": 0.9234095215797424, + "learning_rate": 0.0001797870964788863, + "loss": 3.1586, + "step": 2139 + }, + { + "epoch": 0.19371775142572645, + "grad_norm": 0.9278466105461121, + "learning_rate": 0.00017976796275621555, + "loss": 3.2559, + "step": 2140 + }, + { + "epoch": 0.1938082737394768, + "grad_norm": 0.9761223196983337, + "learning_rate": 0.00017974882100091317, + "loss": 3.2429, + "step": 2141 + }, + { + "epoch": 0.19389879605322713, + "grad_norm": 0.9403523206710815, + "learning_rate": 0.00017972967121490668, + "loss": 3.2106, + "step": 2142 + }, + { + "epoch": 0.19398931836697747, + "grad_norm": 0.9349404573440552, + "learning_rate": 0.0001797105134001245, + "loss": 3.2218, + "step": 2143 + }, + { + "epoch": 0.1940798406807278, + "grad_norm": 0.8340486884117126, + "learning_rate": 0.0001796913475584958, + "loss": 3.1981, + "step": 2144 + }, + { + "epoch": 0.19417036299447815, + "grad_norm": 0.920889139175415, + "learning_rate": 0.00017967217369195058, + "loss": 3.1651, + "step": 2145 + }, + { + "epoch": 0.19426088530822846, + "grad_norm": 0.8778406381607056, + "learning_rate": 0.00017965299180241963, + "loss": 3.2923, + "step": 2146 + }, + { + "epoch": 0.1943514076219788, + "grad_norm": 0.9380999207496643, + "learning_rate": 0.0001796338018918346, + "loss": 3.2636, + "step": 2147 + }, + { + "epoch": 0.19444192993572915, + "grad_norm": 0.9442136883735657, + "learning_rate": 0.0001796146039621279, + "loss": 3.2431, + "step": 2148 + }, + { + "epoch": 0.1945324522494795, + "grad_norm": 0.8796325922012329, + "learning_rate": 0.00017959539801523278, + "loss": 3.1596, + "step": 2149 + }, + { + "epoch": 0.19462297456322983, + "grad_norm": 1.0169458389282227, + "learning_rate": 0.00017957618405308324, + "loss": 3.146, + "step": 2150 + }, + { + "epoch": 0.19471349687698017, + "grad_norm": 0.987337589263916, + "learning_rate": 0.00017955696207761412, + "loss": 3.1754, + "step": 2151 + }, + { + "epoch": 0.1948040191907305, + "grad_norm": 1.0014207363128662, + "learning_rate": 0.0001795377320907611, + "loss": 3.2415, + "step": 2152 + }, + { + "epoch": 0.19489454150448085, + "grad_norm": 0.9423050880432129, + "learning_rate": 0.00017951849409446067, + "loss": 3.1492, + "step": 2153 + }, + { + "epoch": 0.1949850638182312, + "grad_norm": 1.1418346166610718, + "learning_rate": 0.00017949924809065003, + "loss": 3.2466, + "step": 2154 + }, + { + "epoch": 0.19507558613198153, + "grad_norm": 0.8354297876358032, + "learning_rate": 0.0001794799940812673, + "loss": 3.227, + "step": 2155 + }, + { + "epoch": 0.19516610844573187, + "grad_norm": 1.1176152229309082, + "learning_rate": 0.00017946073206825134, + "loss": 3.2342, + "step": 2156 + }, + { + "epoch": 0.1952566307594822, + "grad_norm": 0.9996242523193359, + "learning_rate": 0.00017944146205354182, + "loss": 3.3404, + "step": 2157 + }, + { + "epoch": 0.19534715307323255, + "grad_norm": 1.025205373764038, + "learning_rate": 0.00017942218403907924, + "loss": 3.1818, + "step": 2158 + }, + { + "epoch": 0.1954376753869829, + "grad_norm": 0.8781141638755798, + "learning_rate": 0.0001794028980268049, + "loss": 3.1567, + "step": 2159 + }, + { + "epoch": 0.19552819770073324, + "grad_norm": 0.9099708199501038, + "learning_rate": 0.00017938360401866093, + "loss": 3.2146, + "step": 2160 + }, + { + "epoch": 0.19561872001448358, + "grad_norm": 0.9119500517845154, + "learning_rate": 0.0001793643020165902, + "loss": 3.2271, + "step": 2161 + }, + { + "epoch": 0.19570924232823392, + "grad_norm": 0.8782615065574646, + "learning_rate": 0.00017934499202253642, + "loss": 3.1913, + "step": 2162 + }, + { + "epoch": 0.19579976464198426, + "grad_norm": 0.9160125255584717, + "learning_rate": 0.0001793256740384441, + "loss": 3.2224, + "step": 2163 + }, + { + "epoch": 0.1958902869557346, + "grad_norm": 0.8718566298484802, + "learning_rate": 0.0001793063480662586, + "loss": 3.1911, + "step": 2164 + }, + { + "epoch": 0.19598080926948494, + "grad_norm": 0.9492814540863037, + "learning_rate": 0.00017928701410792602, + "loss": 3.1833, + "step": 2165 + }, + { + "epoch": 0.19607133158323528, + "grad_norm": 0.8263920545578003, + "learning_rate": 0.0001792676721653933, + "loss": 3.3056, + "step": 2166 + }, + { + "epoch": 0.1961618538969856, + "grad_norm": 0.9434691667556763, + "learning_rate": 0.0001792483222406081, + "loss": 3.2539, + "step": 2167 + }, + { + "epoch": 0.19625237621073593, + "grad_norm": 0.8706825375556946, + "learning_rate": 0.00017922896433551907, + "loss": 3.1788, + "step": 2168 + }, + { + "epoch": 0.19634289852448628, + "grad_norm": 0.9813132882118225, + "learning_rate": 0.0001792095984520755, + "loss": 3.2355, + "step": 2169 + }, + { + "epoch": 0.19643342083823662, + "grad_norm": 0.9587005972862244, + "learning_rate": 0.00017919022459222752, + "loss": 3.2232, + "step": 2170 + }, + { + "epoch": 0.19652394315198696, + "grad_norm": 0.9128755927085876, + "learning_rate": 0.0001791708427579261, + "loss": 3.1844, + "step": 2171 + }, + { + "epoch": 0.1966144654657373, + "grad_norm": 0.89957195520401, + "learning_rate": 0.00017915145295112296, + "loss": 3.1824, + "step": 2172 + }, + { + "epoch": 0.19670498777948764, + "grad_norm": 0.8944728374481201, + "learning_rate": 0.0001791320551737707, + "loss": 3.2404, + "step": 2173 + }, + { + "epoch": 0.19679551009323798, + "grad_norm": 1.0140613317489624, + "learning_rate": 0.00017911264942782262, + "loss": 3.2424, + "step": 2174 + }, + { + "epoch": 0.19688603240698832, + "grad_norm": 0.9342913627624512, + "learning_rate": 0.00017909323571523294, + "loss": 3.2279, + "step": 2175 + }, + { + "epoch": 0.19697655472073866, + "grad_norm": 0.9111455678939819, + "learning_rate": 0.00017907381403795656, + "loss": 3.2082, + "step": 2176 + }, + { + "epoch": 0.197067077034489, + "grad_norm": 0.9179486036300659, + "learning_rate": 0.00017905438439794929, + "loss": 3.1214, + "step": 2177 + }, + { + "epoch": 0.19715759934823934, + "grad_norm": 0.9826908111572266, + "learning_rate": 0.00017903494679716767, + "loss": 3.228, + "step": 2178 + }, + { + "epoch": 0.19724812166198968, + "grad_norm": 0.9230121374130249, + "learning_rate": 0.00017901550123756906, + "loss": 3.1306, + "step": 2179 + }, + { + "epoch": 0.19733864397574002, + "grad_norm": 0.9603817462921143, + "learning_rate": 0.00017899604772111163, + "loss": 3.2442, + "step": 2180 + }, + { + "epoch": 0.19742916628949037, + "grad_norm": 0.9203359484672546, + "learning_rate": 0.00017897658624975438, + "loss": 3.1501, + "step": 2181 + }, + { + "epoch": 0.1975196886032407, + "grad_norm": 0.9163203835487366, + "learning_rate": 0.00017895711682545704, + "loss": 3.2631, + "step": 2182 + }, + { + "epoch": 0.19761021091699105, + "grad_norm": 0.8634765148162842, + "learning_rate": 0.0001789376394501802, + "loss": 3.2008, + "step": 2183 + }, + { + "epoch": 0.1977007332307414, + "grad_norm": 0.9373166561126709, + "learning_rate": 0.00017891815412588523, + "loss": 3.1603, + "step": 2184 + }, + { + "epoch": 0.19779125554449173, + "grad_norm": 1.0072497129440308, + "learning_rate": 0.00017889866085453427, + "loss": 3.2015, + "step": 2185 + }, + { + "epoch": 0.19788177785824207, + "grad_norm": 0.8856438398361206, + "learning_rate": 0.00017887915963809038, + "loss": 3.2296, + "step": 2186 + }, + { + "epoch": 0.19797230017199238, + "grad_norm": 0.9211776852607727, + "learning_rate": 0.0001788596504785172, + "loss": 3.2378, + "step": 2187 + }, + { + "epoch": 0.19806282248574272, + "grad_norm": 0.9734625220298767, + "learning_rate": 0.00017884013337777943, + "loss": 3.1718, + "step": 2188 + }, + { + "epoch": 0.19815334479949306, + "grad_norm": 0.9697534441947937, + "learning_rate": 0.00017882060833784237, + "loss": 3.2205, + "step": 2189 + }, + { + "epoch": 0.1982438671132434, + "grad_norm": 0.9940907955169678, + "learning_rate": 0.00017880107536067218, + "loss": 3.2506, + "step": 2190 + }, + { + "epoch": 0.19833438942699375, + "grad_norm": 0.9850450158119202, + "learning_rate": 0.00017878153444823592, + "loss": 3.2165, + "step": 2191 + }, + { + "epoch": 0.1984249117407441, + "grad_norm": 0.9790923595428467, + "learning_rate": 0.00017876198560250127, + "loss": 3.1925, + "step": 2192 + }, + { + "epoch": 0.19851543405449443, + "grad_norm": 0.9181186556816101, + "learning_rate": 0.0001787424288254368, + "loss": 3.196, + "step": 2193 + }, + { + "epoch": 0.19860595636824477, + "grad_norm": 0.924216628074646, + "learning_rate": 0.00017872286411901191, + "loss": 3.1773, + "step": 2194 + }, + { + "epoch": 0.1986964786819951, + "grad_norm": 0.9113030433654785, + "learning_rate": 0.00017870329148519676, + "loss": 3.1877, + "step": 2195 + }, + { + "epoch": 0.19878700099574545, + "grad_norm": 1.0087077617645264, + "learning_rate": 0.00017868371092596232, + "loss": 3.3046, + "step": 2196 + }, + { + "epoch": 0.1988775233094958, + "grad_norm": 0.9603467583656311, + "learning_rate": 0.00017866412244328035, + "loss": 3.2462, + "step": 2197 + }, + { + "epoch": 0.19896804562324613, + "grad_norm": 1.048497200012207, + "learning_rate": 0.00017864452603912339, + "loss": 3.1842, + "step": 2198 + }, + { + "epoch": 0.19905856793699647, + "grad_norm": 0.9084252715110779, + "learning_rate": 0.00017862492171546478, + "loss": 3.1238, + "step": 2199 + }, + { + "epoch": 0.1991490902507468, + "grad_norm": 1.100671410560608, + "learning_rate": 0.00017860530947427875, + "loss": 3.2618, + "step": 2200 + }, + { + "epoch": 0.19923961256449715, + "grad_norm": 1.1458168029785156, + "learning_rate": 0.0001785856893175402, + "loss": 3.185, + "step": 2201 + }, + { + "epoch": 0.1993301348782475, + "grad_norm": 1.094252109527588, + "learning_rate": 0.0001785660612472249, + "loss": 3.2558, + "step": 2202 + }, + { + "epoch": 0.19942065719199784, + "grad_norm": 0.8893122673034668, + "learning_rate": 0.00017854642526530934, + "loss": 3.1477, + "step": 2203 + }, + { + "epoch": 0.19951117950574818, + "grad_norm": 1.0326915979385376, + "learning_rate": 0.00017852678137377092, + "loss": 3.2328, + "step": 2204 + }, + { + "epoch": 0.19960170181949852, + "grad_norm": 1.028706431388855, + "learning_rate": 0.00017850712957458774, + "loss": 3.1817, + "step": 2205 + }, + { + "epoch": 0.19969222413324886, + "grad_norm": 0.9901148080825806, + "learning_rate": 0.00017848746986973883, + "loss": 3.2454, + "step": 2206 + }, + { + "epoch": 0.1997827464469992, + "grad_norm": 0.8898138403892517, + "learning_rate": 0.0001784678022612038, + "loss": 3.1329, + "step": 2207 + }, + { + "epoch": 0.1998732687607495, + "grad_norm": 0.9994850158691406, + "learning_rate": 0.00017844812675096324, + "loss": 3.2728, + "step": 2208 + }, + { + "epoch": 0.19996379107449985, + "grad_norm": 0.9238694906234741, + "learning_rate": 0.00017842844334099847, + "loss": 3.2762, + "step": 2209 + }, + { + "epoch": 0.2000543133882502, + "grad_norm": 1.0947091579437256, + "learning_rate": 0.0001784087520332916, + "loss": 3.254, + "step": 2210 + }, + { + "epoch": 0.20014483570200053, + "grad_norm": 0.8965323567390442, + "learning_rate": 0.00017838905282982552, + "loss": 3.2118, + "step": 2211 + }, + { + "epoch": 0.20023535801575088, + "grad_norm": 0.9534854292869568, + "learning_rate": 0.000178369345732584, + "loss": 3.2671, + "step": 2212 + }, + { + "epoch": 0.20032588032950122, + "grad_norm": 0.9523543119430542, + "learning_rate": 0.0001783496307435515, + "loss": 3.1687, + "step": 2213 + }, + { + "epoch": 0.20041640264325156, + "grad_norm": 0.9066437482833862, + "learning_rate": 0.0001783299078647133, + "loss": 3.2303, + "step": 2214 + }, + { + "epoch": 0.2005069249570019, + "grad_norm": 1.025066614151001, + "learning_rate": 0.00017831017709805556, + "loss": 3.2281, + "step": 2215 + }, + { + "epoch": 0.20059744727075224, + "grad_norm": 0.8147643208503723, + "learning_rate": 0.0001782904384455651, + "loss": 3.2593, + "step": 2216 + }, + { + "epoch": 0.20068796958450258, + "grad_norm": 1.0472511053085327, + "learning_rate": 0.00017827069190922966, + "loss": 3.1703, + "step": 2217 + }, + { + "epoch": 0.20077849189825292, + "grad_norm": 0.9242010712623596, + "learning_rate": 0.00017825093749103765, + "loss": 3.1703, + "step": 2218 + }, + { + "epoch": 0.20086901421200326, + "grad_norm": 0.8766939640045166, + "learning_rate": 0.0001782311751929784, + "loss": 3.1491, + "step": 2219 + }, + { + "epoch": 0.2009595365257536, + "grad_norm": 0.9397275447845459, + "learning_rate": 0.00017821140501704194, + "loss": 3.2021, + "step": 2220 + }, + { + "epoch": 0.20105005883950394, + "grad_norm": 0.8902556300163269, + "learning_rate": 0.00017819162696521915, + "loss": 3.2027, + "step": 2221 + }, + { + "epoch": 0.20114058115325428, + "grad_norm": 0.9259093999862671, + "learning_rate": 0.00017817184103950163, + "loss": 3.2105, + "step": 2222 + }, + { + "epoch": 0.20123110346700462, + "grad_norm": 1.0207278728485107, + "learning_rate": 0.00017815204724188187, + "loss": 3.2432, + "step": 2223 + }, + { + "epoch": 0.20132162578075496, + "grad_norm": 0.9795804619789124, + "learning_rate": 0.00017813224557435312, + "loss": 3.2357, + "step": 2224 + }, + { + "epoch": 0.2014121480945053, + "grad_norm": 0.9147193431854248, + "learning_rate": 0.00017811243603890934, + "loss": 3.2376, + "step": 2225 + }, + { + "epoch": 0.20150267040825565, + "grad_norm": 0.8962690234184265, + "learning_rate": 0.00017809261863754544, + "loss": 3.1848, + "step": 2226 + }, + { + "epoch": 0.201593192722006, + "grad_norm": 0.907362163066864, + "learning_rate": 0.00017807279337225693, + "loss": 3.1929, + "step": 2227 + }, + { + "epoch": 0.2016837150357563, + "grad_norm": 0.8899882435798645, + "learning_rate": 0.00017805296024504024, + "loss": 3.317, + "step": 2228 + }, + { + "epoch": 0.20177423734950664, + "grad_norm": 0.934199869632721, + "learning_rate": 0.00017803311925789265, + "loss": 3.2336, + "step": 2229 + }, + { + "epoch": 0.20186475966325698, + "grad_norm": 0.8858028650283813, + "learning_rate": 0.00017801327041281207, + "loss": 3.1753, + "step": 2230 + }, + { + "epoch": 0.20195528197700732, + "grad_norm": 0.9815089702606201, + "learning_rate": 0.00017799341371179728, + "loss": 3.1928, + "step": 2231 + }, + { + "epoch": 0.20204580429075766, + "grad_norm": 1.0124167203903198, + "learning_rate": 0.0001779735491568479, + "loss": 3.2052, + "step": 2232 + }, + { + "epoch": 0.202136326604508, + "grad_norm": 0.9589294791221619, + "learning_rate": 0.0001779536767499642, + "loss": 3.2347, + "step": 2233 + }, + { + "epoch": 0.20222684891825835, + "grad_norm": 1.084632158279419, + "learning_rate": 0.00017793379649314744, + "loss": 3.269, + "step": 2234 + }, + { + "epoch": 0.20231737123200869, + "grad_norm": 1.1935856342315674, + "learning_rate": 0.00017791390838839944, + "loss": 3.1992, + "step": 2235 + }, + { + "epoch": 0.20240789354575903, + "grad_norm": 1.022628664970398, + "learning_rate": 0.00017789401243772305, + "loss": 3.2459, + "step": 2236 + }, + { + "epoch": 0.20249841585950937, + "grad_norm": 1.017520785331726, + "learning_rate": 0.00017787410864312172, + "loss": 3.2321, + "step": 2237 + }, + { + "epoch": 0.2025889381732597, + "grad_norm": 0.9736983776092529, + "learning_rate": 0.0001778541970065998, + "loss": 3.2685, + "step": 2238 + }, + { + "epoch": 0.20267946048701005, + "grad_norm": 0.9741368293762207, + "learning_rate": 0.00017783427753016232, + "loss": 3.2179, + "step": 2239 + }, + { + "epoch": 0.2027699828007604, + "grad_norm": 1.198488712310791, + "learning_rate": 0.00017781435021581527, + "loss": 3.2421, + "step": 2240 + }, + { + "epoch": 0.20286050511451073, + "grad_norm": 0.9209917783737183, + "learning_rate": 0.00017779441506556528, + "loss": 3.1841, + "step": 2241 + }, + { + "epoch": 0.20295102742826107, + "grad_norm": 0.995261549949646, + "learning_rate": 0.0001777744720814198, + "loss": 3.2257, + "step": 2242 + }, + { + "epoch": 0.2030415497420114, + "grad_norm": 1.0581942796707153, + "learning_rate": 0.0001777545212653871, + "loss": 3.131, + "step": 2243 + }, + { + "epoch": 0.20313207205576175, + "grad_norm": 0.9594766497612, + "learning_rate": 0.00017773456261947627, + "loss": 3.2499, + "step": 2244 + }, + { + "epoch": 0.2032225943695121, + "grad_norm": 1.0768234729766846, + "learning_rate": 0.0001777145961456971, + "loss": 3.2079, + "step": 2245 + }, + { + "epoch": 0.20331311668326243, + "grad_norm": 0.952381432056427, + "learning_rate": 0.00017769462184606024, + "loss": 3.1935, + "step": 2246 + }, + { + "epoch": 0.20340363899701278, + "grad_norm": 0.895455002784729, + "learning_rate": 0.00017767463972257706, + "loss": 3.2608, + "step": 2247 + }, + { + "epoch": 0.20349416131076312, + "grad_norm": 0.9744084477424622, + "learning_rate": 0.00017765464977725977, + "loss": 3.1806, + "step": 2248 + }, + { + "epoch": 0.20358468362451343, + "grad_norm": 0.9829551577568054, + "learning_rate": 0.0001776346520121214, + "loss": 3.2598, + "step": 2249 + }, + { + "epoch": 0.20367520593826377, + "grad_norm": 1.0411384105682373, + "learning_rate": 0.0001776146464291757, + "loss": 3.1889, + "step": 2250 + }, + { + "epoch": 0.2037657282520141, + "grad_norm": 1.1049972772598267, + "learning_rate": 0.00017759463303043723, + "loss": 3.1938, + "step": 2251 + }, + { + "epoch": 0.20385625056576445, + "grad_norm": 0.9213403463363647, + "learning_rate": 0.0001775746118179213, + "loss": 3.1873, + "step": 2252 + }, + { + "epoch": 0.2039467728795148, + "grad_norm": 0.97609943151474, + "learning_rate": 0.00017755458279364411, + "loss": 3.1772, + "step": 2253 + }, + { + "epoch": 0.20403729519326513, + "grad_norm": 1.166957139968872, + "learning_rate": 0.00017753454595962256, + "loss": 3.2645, + "step": 2254 + }, + { + "epoch": 0.20412781750701547, + "grad_norm": 0.9787424802780151, + "learning_rate": 0.00017751450131787435, + "loss": 3.1883, + "step": 2255 + }, + { + "epoch": 0.20421833982076582, + "grad_norm": 0.8625527620315552, + "learning_rate": 0.00017749444887041799, + "loss": 3.1512, + "step": 2256 + }, + { + "epoch": 0.20430886213451616, + "grad_norm": 0.8910434246063232, + "learning_rate": 0.00017747438861927272, + "loss": 3.135, + "step": 2257 + }, + { + "epoch": 0.2043993844482665, + "grad_norm": 0.8338545560836792, + "learning_rate": 0.00017745432056645864, + "loss": 3.2219, + "step": 2258 + }, + { + "epoch": 0.20448990676201684, + "grad_norm": 1.1126137971878052, + "learning_rate": 0.00017743424471399662, + "loss": 3.2227, + "step": 2259 + }, + { + "epoch": 0.20458042907576718, + "grad_norm": 1.1436206102371216, + "learning_rate": 0.00017741416106390826, + "loss": 3.2326, + "step": 2260 + }, + { + "epoch": 0.20467095138951752, + "grad_norm": 1.0938067436218262, + "learning_rate": 0.00017739406961821602, + "loss": 3.1486, + "step": 2261 + }, + { + "epoch": 0.20476147370326786, + "grad_norm": 1.6595628261566162, + "learning_rate": 0.00017737397037894304, + "loss": 3.2298, + "step": 2262 + }, + { + "epoch": 0.2048519960170182, + "grad_norm": 0.9511428475379944, + "learning_rate": 0.00017735386334811343, + "loss": 3.2617, + "step": 2263 + }, + { + "epoch": 0.20494251833076854, + "grad_norm": 1.0140020847320557, + "learning_rate": 0.00017733374852775183, + "loss": 3.2335, + "step": 2264 + }, + { + "epoch": 0.20503304064451888, + "grad_norm": 0.9424805045127869, + "learning_rate": 0.0001773136259198839, + "loss": 3.1668, + "step": 2265 + }, + { + "epoch": 0.20512356295826922, + "grad_norm": 1.0711252689361572, + "learning_rate": 0.00017729349552653595, + "loss": 3.1674, + "step": 2266 + }, + { + "epoch": 0.20521408527201956, + "grad_norm": 0.9161908626556396, + "learning_rate": 0.00017727335734973512, + "loss": 3.1823, + "step": 2267 + }, + { + "epoch": 0.2053046075857699, + "grad_norm": 0.8851282000541687, + "learning_rate": 0.0001772532113915093, + "loss": 3.1713, + "step": 2268 + }, + { + "epoch": 0.20539512989952022, + "grad_norm": 0.9802316427230835, + "learning_rate": 0.00017723305765388725, + "loss": 3.1771, + "step": 2269 + }, + { + "epoch": 0.20548565221327056, + "grad_norm": 0.8938327431678772, + "learning_rate": 0.00017721289613889835, + "loss": 3.1648, + "step": 2270 + }, + { + "epoch": 0.2055761745270209, + "grad_norm": 1.0969438552856445, + "learning_rate": 0.00017719272684857295, + "loss": 3.2681, + "step": 2271 + }, + { + "epoch": 0.20566669684077124, + "grad_norm": 0.9800640940666199, + "learning_rate": 0.0001771725497849421, + "loss": 3.2137, + "step": 2272 + }, + { + "epoch": 0.20575721915452158, + "grad_norm": 0.9212649464607239, + "learning_rate": 0.00017715236495003756, + "loss": 3.1973, + "step": 2273 + }, + { + "epoch": 0.20584774146827192, + "grad_norm": 0.9442010521888733, + "learning_rate": 0.000177132172345892, + "loss": 3.2673, + "step": 2274 + }, + { + "epoch": 0.20593826378202226, + "grad_norm": 0.9160739779472351, + "learning_rate": 0.00017711197197453878, + "loss": 3.2638, + "step": 2275 + }, + { + "epoch": 0.2060287860957726, + "grad_norm": 0.9333342909812927, + "learning_rate": 0.0001770917638380121, + "loss": 3.2041, + "step": 2276 + }, + { + "epoch": 0.20611930840952294, + "grad_norm": 0.9469937086105347, + "learning_rate": 0.0001770715479383469, + "loss": 3.1967, + "step": 2277 + }, + { + "epoch": 0.20620983072327329, + "grad_norm": 1.072282075881958, + "learning_rate": 0.00017705132427757895, + "loss": 3.2015, + "step": 2278 + }, + { + "epoch": 0.20630035303702363, + "grad_norm": 1.0128763914108276, + "learning_rate": 0.00017703109285774473, + "loss": 3.1652, + "step": 2279 + }, + { + "epoch": 0.20639087535077397, + "grad_norm": 0.9716585278511047, + "learning_rate": 0.00017701085368088156, + "loss": 3.2273, + "step": 2280 + }, + { + "epoch": 0.2064813976645243, + "grad_norm": 0.9625109434127808, + "learning_rate": 0.00017699060674902756, + "loss": 3.241, + "step": 2281 + }, + { + "epoch": 0.20657191997827465, + "grad_norm": 0.9801328778266907, + "learning_rate": 0.00017697035206422156, + "loss": 3.1929, + "step": 2282 + }, + { + "epoch": 0.206662442292025, + "grad_norm": 1.0571147203445435, + "learning_rate": 0.0001769500896285032, + "loss": 3.2106, + "step": 2283 + }, + { + "epoch": 0.20675296460577533, + "grad_norm": 0.8661196231842041, + "learning_rate": 0.00017692981944391294, + "loss": 3.1964, + "step": 2284 + }, + { + "epoch": 0.20684348691952567, + "grad_norm": 1.0298678874969482, + "learning_rate": 0.00017690954151249196, + "loss": 3.1557, + "step": 2285 + }, + { + "epoch": 0.206934009233276, + "grad_norm": 0.9180775880813599, + "learning_rate": 0.00017688925583628223, + "loss": 3.1943, + "step": 2286 + }, + { + "epoch": 0.20702453154702635, + "grad_norm": 1.0359828472137451, + "learning_rate": 0.00017686896241732658, + "loss": 3.1889, + "step": 2287 + }, + { + "epoch": 0.2071150538607767, + "grad_norm": 1.0745766162872314, + "learning_rate": 0.00017684866125766853, + "loss": 3.2023, + "step": 2288 + }, + { + "epoch": 0.20720557617452703, + "grad_norm": 1.0356098413467407, + "learning_rate": 0.00017682835235935236, + "loss": 3.1698, + "step": 2289 + }, + { + "epoch": 0.20729609848827735, + "grad_norm": 0.9535381197929382, + "learning_rate": 0.00017680803572442318, + "loss": 3.1851, + "step": 2290 + }, + { + "epoch": 0.2073866208020277, + "grad_norm": 0.9482605457305908, + "learning_rate": 0.00017678771135492695, + "loss": 3.2179, + "step": 2291 + }, + { + "epoch": 0.20747714311577803, + "grad_norm": 1.3931949138641357, + "learning_rate": 0.00017676737925291028, + "loss": 3.2253, + "step": 2292 + }, + { + "epoch": 0.20756766542952837, + "grad_norm": 0.9733486175537109, + "learning_rate": 0.00017674703942042062, + "loss": 3.1814, + "step": 2293 + }, + { + "epoch": 0.2076581877432787, + "grad_norm": 1.0330384969711304, + "learning_rate": 0.0001767266918595062, + "loss": 3.2122, + "step": 2294 + }, + { + "epoch": 0.20774871005702905, + "grad_norm": 1.0049928426742554, + "learning_rate": 0.000176706336572216, + "loss": 3.1779, + "step": 2295 + }, + { + "epoch": 0.2078392323707794, + "grad_norm": 0.9967108368873596, + "learning_rate": 0.00017668597356059978, + "loss": 3.1691, + "step": 2296 + }, + { + "epoch": 0.20792975468452973, + "grad_norm": 1.0939202308654785, + "learning_rate": 0.00017666560282670815, + "loss": 3.1512, + "step": 2297 + }, + { + "epoch": 0.20802027699828007, + "grad_norm": 0.9251821637153625, + "learning_rate": 0.0001766452243725924, + "loss": 3.2136, + "step": 2298 + }, + { + "epoch": 0.20811079931203041, + "grad_norm": 1.0332478284835815, + "learning_rate": 0.00017662483820030466, + "loss": 3.185, + "step": 2299 + }, + { + "epoch": 0.20820132162578076, + "grad_norm": 0.8579204082489014, + "learning_rate": 0.0001766044443118978, + "loss": 3.1962, + "step": 2300 + }, + { + "epoch": 0.2082918439395311, + "grad_norm": 0.9324372410774231, + "learning_rate": 0.00017658404270942552, + "loss": 3.1525, + "step": 2301 + }, + { + "epoch": 0.20838236625328144, + "grad_norm": 0.8616467118263245, + "learning_rate": 0.0001765636333949422, + "loss": 3.1899, + "step": 2302 + }, + { + "epoch": 0.20847288856703178, + "grad_norm": 0.8347539901733398, + "learning_rate": 0.00017654321637050312, + "loss": 3.2244, + "step": 2303 + }, + { + "epoch": 0.20856341088078212, + "grad_norm": 0.8825692534446716, + "learning_rate": 0.00017652279163816423, + "loss": 3.1876, + "step": 2304 + }, + { + "epoch": 0.20865393319453246, + "grad_norm": 0.9334546327590942, + "learning_rate": 0.00017650235919998232, + "loss": 3.1664, + "step": 2305 + }, + { + "epoch": 0.2087444555082828, + "grad_norm": 0.8983275890350342, + "learning_rate": 0.00017648191905801492, + "loss": 3.1204, + "step": 2306 + }, + { + "epoch": 0.20883497782203314, + "grad_norm": 0.9988447427749634, + "learning_rate": 0.0001764614712143204, + "loss": 3.2325, + "step": 2307 + }, + { + "epoch": 0.20892550013578348, + "grad_norm": 0.883084774017334, + "learning_rate": 0.00017644101567095778, + "loss": 3.1446, + "step": 2308 + }, + { + "epoch": 0.20901602244953382, + "grad_norm": 1.0438385009765625, + "learning_rate": 0.000176420552429987, + "loss": 3.1989, + "step": 2309 + }, + { + "epoch": 0.20910654476328416, + "grad_norm": 0.9093191623687744, + "learning_rate": 0.00017640008149346866, + "loss": 3.1986, + "step": 2310 + }, + { + "epoch": 0.20919706707703448, + "grad_norm": 0.9882534146308899, + "learning_rate": 0.00017637960286346425, + "loss": 3.1743, + "step": 2311 + }, + { + "epoch": 0.20928758939078482, + "grad_norm": 1.0348436832427979, + "learning_rate": 0.00017635911654203588, + "loss": 3.195, + "step": 2312 + }, + { + "epoch": 0.20937811170453516, + "grad_norm": 0.8533861637115479, + "learning_rate": 0.00017633862253124655, + "loss": 3.1846, + "step": 2313 + }, + { + "epoch": 0.2094686340182855, + "grad_norm": 0.973780632019043, + "learning_rate": 0.00017631812083316003, + "loss": 3.2228, + "step": 2314 + }, + { + "epoch": 0.20955915633203584, + "grad_norm": 0.9459713697433472, + "learning_rate": 0.00017629761144984088, + "loss": 3.1582, + "step": 2315 + }, + { + "epoch": 0.20964967864578618, + "grad_norm": 0.9357503652572632, + "learning_rate": 0.00017627709438335426, + "loss": 3.1782, + "step": 2316 + }, + { + "epoch": 0.20974020095953652, + "grad_norm": 0.9655158519744873, + "learning_rate": 0.0001762565696357664, + "loss": 3.1712, + "step": 2317 + }, + { + "epoch": 0.20983072327328686, + "grad_norm": 0.8718851208686829, + "learning_rate": 0.00017623603720914402, + "loss": 3.1737, + "step": 2318 + }, + { + "epoch": 0.2099212455870372, + "grad_norm": 0.9923802614212036, + "learning_rate": 0.0001762154971055548, + "loss": 3.1181, + "step": 2319 + }, + { + "epoch": 0.21001176790078754, + "grad_norm": 0.8967884182929993, + "learning_rate": 0.0001761949493270671, + "loss": 3.2443, + "step": 2320 + }, + { + "epoch": 0.21010229021453788, + "grad_norm": 0.9012006521224976, + "learning_rate": 0.00017617439387575007, + "loss": 3.201, + "step": 2321 + }, + { + "epoch": 0.21019281252828823, + "grad_norm": 0.8997471332550049, + "learning_rate": 0.0001761538307536737, + "loss": 3.1972, + "step": 2322 + }, + { + "epoch": 0.21028333484203857, + "grad_norm": 0.9365257024765015, + "learning_rate": 0.00017613325996290862, + "loss": 3.265, + "step": 2323 + }, + { + "epoch": 0.2103738571557889, + "grad_norm": 1.175702691078186, + "learning_rate": 0.00017611268150552635, + "loss": 3.2106, + "step": 2324 + }, + { + "epoch": 0.21046437946953925, + "grad_norm": 0.9600235819816589, + "learning_rate": 0.00017609209538359917, + "loss": 3.1665, + "step": 2325 + }, + { + "epoch": 0.2105549017832896, + "grad_norm": 1.3065840005874634, + "learning_rate": 0.0001760715015992, + "loss": 3.1699, + "step": 2326 + }, + { + "epoch": 0.21064542409703993, + "grad_norm": 0.8768181204795837, + "learning_rate": 0.00017605090015440275, + "loss": 3.2304, + "step": 2327 + }, + { + "epoch": 0.21073594641079027, + "grad_norm": 1.052873134613037, + "learning_rate": 0.00017603029105128193, + "loss": 3.1579, + "step": 2328 + }, + { + "epoch": 0.2108264687245406, + "grad_norm": 1.34178626537323, + "learning_rate": 0.0001760096742919129, + "loss": 3.23, + "step": 2329 + }, + { + "epoch": 0.21091699103829095, + "grad_norm": 1.0743076801300049, + "learning_rate": 0.0001759890498783717, + "loss": 3.2031, + "step": 2330 + }, + { + "epoch": 0.21100751335204126, + "grad_norm": 1.1340997219085693, + "learning_rate": 0.0001759684178127353, + "loss": 3.2033, + "step": 2331 + }, + { + "epoch": 0.2110980356657916, + "grad_norm": 0.9930472373962402, + "learning_rate": 0.00017594777809708126, + "loss": 3.0984, + "step": 2332 + }, + { + "epoch": 0.21118855797954195, + "grad_norm": 1.214159607887268, + "learning_rate": 0.00017592713073348807, + "loss": 3.2372, + "step": 2333 + }, + { + "epoch": 0.2112790802932923, + "grad_norm": 0.992418646812439, + "learning_rate": 0.0001759064757240349, + "loss": 3.2189, + "step": 2334 + }, + { + "epoch": 0.21136960260704263, + "grad_norm": 1.2041786909103394, + "learning_rate": 0.00017588581307080169, + "loss": 3.1819, + "step": 2335 + }, + { + "epoch": 0.21146012492079297, + "grad_norm": 1.1528639793395996, + "learning_rate": 0.0001758651427758692, + "loss": 3.2333, + "step": 2336 + }, + { + "epoch": 0.2115506472345433, + "grad_norm": 1.0668143033981323, + "learning_rate": 0.0001758444648413189, + "loss": 3.1998, + "step": 2337 + }, + { + "epoch": 0.21164116954829365, + "grad_norm": 1.164767861366272, + "learning_rate": 0.00017582377926923305, + "loss": 3.1914, + "step": 2338 + }, + { + "epoch": 0.211731691862044, + "grad_norm": 1.0460022687911987, + "learning_rate": 0.00017580308606169473, + "loss": 3.1727, + "step": 2339 + }, + { + "epoch": 0.21182221417579433, + "grad_norm": 1.0179146528244019, + "learning_rate": 0.0001757823852207877, + "loss": 3.1831, + "step": 2340 + }, + { + "epoch": 0.21191273648954467, + "grad_norm": 1.1723949909210205, + "learning_rate": 0.00017576167674859656, + "loss": 3.1325, + "step": 2341 + }, + { + "epoch": 0.212003258803295, + "grad_norm": 1.135374903678894, + "learning_rate": 0.00017574096064720663, + "loss": 3.1349, + "step": 2342 + }, + { + "epoch": 0.21209378111704535, + "grad_norm": 0.9510069489479065, + "learning_rate": 0.00017572023691870403, + "loss": 3.093, + "step": 2343 + }, + { + "epoch": 0.2121843034307957, + "grad_norm": 1.171178936958313, + "learning_rate": 0.00017569950556517566, + "loss": 3.1266, + "step": 2344 + }, + { + "epoch": 0.21227482574454604, + "grad_norm": 0.9744029641151428, + "learning_rate": 0.00017567876658870914, + "loss": 3.169, + "step": 2345 + }, + { + "epoch": 0.21236534805829638, + "grad_norm": 0.8958362936973572, + "learning_rate": 0.0001756580199913929, + "loss": 3.1228, + "step": 2346 + }, + { + "epoch": 0.21245587037204672, + "grad_norm": 0.9392367601394653, + "learning_rate": 0.00017563726577531612, + "loss": 3.126, + "step": 2347 + }, + { + "epoch": 0.21254639268579706, + "grad_norm": 0.9655486941337585, + "learning_rate": 0.00017561650394256874, + "loss": 3.1772, + "step": 2348 + }, + { + "epoch": 0.2126369149995474, + "grad_norm": 0.8699467778205872, + "learning_rate": 0.0001755957344952415, + "loss": 3.1088, + "step": 2349 + }, + { + "epoch": 0.21272743731329774, + "grad_norm": 0.9144679307937622, + "learning_rate": 0.00017557495743542585, + "loss": 3.1993, + "step": 2350 + }, + { + "epoch": 0.21281795962704808, + "grad_norm": 0.919262707233429, + "learning_rate": 0.00017555417276521405, + "loss": 3.1578, + "step": 2351 + }, + { + "epoch": 0.2129084819407984, + "grad_norm": 1.0309118032455444, + "learning_rate": 0.00017553338048669913, + "loss": 3.167, + "step": 2352 + }, + { + "epoch": 0.21299900425454873, + "grad_norm": 0.9247647523880005, + "learning_rate": 0.00017551258060197485, + "loss": 3.1737, + "step": 2353 + }, + { + "epoch": 0.21308952656829908, + "grad_norm": 0.9028778076171875, + "learning_rate": 0.00017549177311313577, + "loss": 3.1385, + "step": 2354 + }, + { + "epoch": 0.21318004888204942, + "grad_norm": 0.9501022100448608, + "learning_rate": 0.00017547095802227723, + "loss": 3.1866, + "step": 2355 + }, + { + "epoch": 0.21327057119579976, + "grad_norm": 0.8799710273742676, + "learning_rate": 0.00017545013533149523, + "loss": 3.1837, + "step": 2356 + }, + { + "epoch": 0.2133610935095501, + "grad_norm": 0.9339806437492371, + "learning_rate": 0.0001754293050428867, + "loss": 3.2179, + "step": 2357 + }, + { + "epoch": 0.21345161582330044, + "grad_norm": 0.891981840133667, + "learning_rate": 0.00017540846715854923, + "loss": 3.2324, + "step": 2358 + }, + { + "epoch": 0.21354213813705078, + "grad_norm": 0.8680137395858765, + "learning_rate": 0.00017538762168058117, + "loss": 3.1399, + "step": 2359 + }, + { + "epoch": 0.21363266045080112, + "grad_norm": 0.9303855299949646, + "learning_rate": 0.00017536676861108164, + "loss": 3.1699, + "step": 2360 + }, + { + "epoch": 0.21372318276455146, + "grad_norm": 0.9293773770332336, + "learning_rate": 0.0001753459079521506, + "loss": 3.203, + "step": 2361 + }, + { + "epoch": 0.2138137050783018, + "grad_norm": 1.0480334758758545, + "learning_rate": 0.0001753250397058887, + "loss": 3.2587, + "step": 2362 + }, + { + "epoch": 0.21390422739205214, + "grad_norm": 0.8495811223983765, + "learning_rate": 0.00017530416387439733, + "loss": 3.1403, + "step": 2363 + }, + { + "epoch": 0.21399474970580248, + "grad_norm": 1.0046278238296509, + "learning_rate": 0.00017528328045977876, + "loss": 3.1598, + "step": 2364 + }, + { + "epoch": 0.21408527201955282, + "grad_norm": 0.9166176915168762, + "learning_rate": 0.00017526238946413589, + "loss": 3.1074, + "step": 2365 + }, + { + "epoch": 0.21417579433330317, + "grad_norm": 0.816504716873169, + "learning_rate": 0.00017524149088957245, + "loss": 3.1762, + "step": 2366 + }, + { + "epoch": 0.2142663166470535, + "grad_norm": 0.9491996765136719, + "learning_rate": 0.00017522058473819294, + "loss": 3.199, + "step": 2367 + }, + { + "epoch": 0.21435683896080385, + "grad_norm": 0.9550940990447998, + "learning_rate": 0.0001751996710121026, + "loss": 3.2461, + "step": 2368 + }, + { + "epoch": 0.2144473612745542, + "grad_norm": 0.9655570983886719, + "learning_rate": 0.00017517874971340747, + "loss": 3.1665, + "step": 2369 + }, + { + "epoch": 0.21453788358830453, + "grad_norm": 0.899738073348999, + "learning_rate": 0.00017515782084421427, + "loss": 3.1383, + "step": 2370 + }, + { + "epoch": 0.21462840590205487, + "grad_norm": 0.9755668044090271, + "learning_rate": 0.0001751368844066306, + "loss": 3.0888, + "step": 2371 + }, + { + "epoch": 0.21471892821580518, + "grad_norm": 0.9048589468002319, + "learning_rate": 0.00017511594040276469, + "loss": 3.1741, + "step": 2372 + }, + { + "epoch": 0.21480945052955552, + "grad_norm": 0.9476847648620605, + "learning_rate": 0.00017509498883472565, + "loss": 3.2199, + "step": 2373 + }, + { + "epoch": 0.21489997284330586, + "grad_norm": 0.8622844219207764, + "learning_rate": 0.0001750740297046233, + "loss": 3.1287, + "step": 2374 + }, + { + "epoch": 0.2149904951570562, + "grad_norm": 0.9325116276741028, + "learning_rate": 0.00017505306301456822, + "loss": 3.1792, + "step": 2375 + }, + { + "epoch": 0.21508101747080655, + "grad_norm": 0.9339794516563416, + "learning_rate": 0.00017503208876667173, + "loss": 3.1535, + "step": 2376 + }, + { + "epoch": 0.2151715397845569, + "grad_norm": 0.8598697185516357, + "learning_rate": 0.00017501110696304596, + "loss": 3.1139, + "step": 2377 + }, + { + "epoch": 0.21526206209830723, + "grad_norm": 0.8950153589248657, + "learning_rate": 0.00017499011760580376, + "loss": 3.1469, + "step": 2378 + }, + { + "epoch": 0.21535258441205757, + "grad_norm": 0.9081557989120483, + "learning_rate": 0.0001749691206970588, + "loss": 3.1602, + "step": 2379 + }, + { + "epoch": 0.2154431067258079, + "grad_norm": 0.8921812176704407, + "learning_rate": 0.0001749481162389254, + "loss": 3.187, + "step": 2380 + }, + { + "epoch": 0.21553362903955825, + "grad_norm": 0.8708139061927795, + "learning_rate": 0.00017492710423351878, + "loss": 3.1596, + "step": 2381 + }, + { + "epoch": 0.2156241513533086, + "grad_norm": 0.8599114418029785, + "learning_rate": 0.0001749060846829548, + "loss": 3.1284, + "step": 2382 + }, + { + "epoch": 0.21571467366705893, + "grad_norm": 0.9189428687095642, + "learning_rate": 0.00017488505758935015, + "loss": 3.0794, + "step": 2383 + }, + { + "epoch": 0.21580519598080927, + "grad_norm": 0.8451887965202332, + "learning_rate": 0.00017486402295482223, + "loss": 3.1743, + "step": 2384 + }, + { + "epoch": 0.2158957182945596, + "grad_norm": 0.8397995829582214, + "learning_rate": 0.00017484298078148926, + "loss": 3.1973, + "step": 2385 + }, + { + "epoch": 0.21598624060830995, + "grad_norm": 0.889971911907196, + "learning_rate": 0.00017482193107147014, + "loss": 3.1543, + "step": 2386 + }, + { + "epoch": 0.2160767629220603, + "grad_norm": 0.8662976622581482, + "learning_rate": 0.00017480087382688465, + "loss": 3.1575, + "step": 2387 + }, + { + "epoch": 0.21616728523581064, + "grad_norm": 0.8261364698410034, + "learning_rate": 0.0001747798090498532, + "loss": 3.1776, + "step": 2388 + }, + { + "epoch": 0.21625780754956098, + "grad_norm": 0.8544386029243469, + "learning_rate": 0.00017475873674249704, + "loss": 3.1272, + "step": 2389 + }, + { + "epoch": 0.21634832986331132, + "grad_norm": 0.8943440914154053, + "learning_rate": 0.0001747376569069381, + "loss": 3.1176, + "step": 2390 + }, + { + "epoch": 0.21643885217706166, + "grad_norm": 0.8617627620697021, + "learning_rate": 0.0001747165695452992, + "loss": 3.0768, + "step": 2391 + }, + { + "epoch": 0.216529374490812, + "grad_norm": 1.0215213298797607, + "learning_rate": 0.00017469547465970373, + "loss": 3.1451, + "step": 2392 + }, + { + "epoch": 0.2166198968045623, + "grad_norm": 0.8882916569709778, + "learning_rate": 0.00017467437225227606, + "loss": 3.1673, + "step": 2393 + }, + { + "epoch": 0.21671041911831265, + "grad_norm": 0.8575624227523804, + "learning_rate": 0.0001746532623251411, + "loss": 3.1365, + "step": 2394 + }, + { + "epoch": 0.216800941432063, + "grad_norm": 0.9205721616744995, + "learning_rate": 0.00017463214488042472, + "loss": 3.1569, + "step": 2395 + }, + { + "epoch": 0.21689146374581333, + "grad_norm": 0.9491685628890991, + "learning_rate": 0.00017461101992025335, + "loss": 3.1019, + "step": 2396 + }, + { + "epoch": 0.21698198605956368, + "grad_norm": 0.8524163365364075, + "learning_rate": 0.00017458988744675433, + "loss": 3.1516, + "step": 2397 + }, + { + "epoch": 0.21707250837331402, + "grad_norm": 0.9187124967575073, + "learning_rate": 0.00017456874746205568, + "loss": 3.186, + "step": 2398 + }, + { + "epoch": 0.21716303068706436, + "grad_norm": 0.9027589559555054, + "learning_rate": 0.00017454759996828623, + "loss": 3.1449, + "step": 2399 + }, + { + "epoch": 0.2172535530008147, + "grad_norm": 0.9175606966018677, + "learning_rate": 0.0001745264449675755, + "loss": 3.1573, + "step": 2400 + }, + { + "epoch": 0.21734407531456504, + "grad_norm": 0.9693629741668701, + "learning_rate": 0.00017450528246205376, + "loss": 3.0802, + "step": 2401 + }, + { + "epoch": 0.21743459762831538, + "grad_norm": 0.867482602596283, + "learning_rate": 0.00017448411245385214, + "loss": 3.176, + "step": 2402 + }, + { + "epoch": 0.21752511994206572, + "grad_norm": 1.0069620609283447, + "learning_rate": 0.00017446293494510244, + "loss": 3.1445, + "step": 2403 + }, + { + "epoch": 0.21761564225581606, + "grad_norm": 0.9470597505569458, + "learning_rate": 0.0001744417499379372, + "loss": 3.1548, + "step": 2404 + }, + { + "epoch": 0.2177061645695664, + "grad_norm": 0.9411137700080872, + "learning_rate": 0.0001744205574344898, + "loss": 3.1723, + "step": 2405 + }, + { + "epoch": 0.21779668688331674, + "grad_norm": 0.9758121967315674, + "learning_rate": 0.00017439935743689431, + "loss": 3.2164, + "step": 2406 + }, + { + "epoch": 0.21788720919706708, + "grad_norm": 0.9710676670074463, + "learning_rate": 0.00017437814994728553, + "loss": 3.1502, + "step": 2407 + }, + { + "epoch": 0.21797773151081742, + "grad_norm": 0.9743636250495911, + "learning_rate": 0.00017435693496779911, + "loss": 3.1949, + "step": 2408 + }, + { + "epoch": 0.21806825382456776, + "grad_norm": 0.9133598208427429, + "learning_rate": 0.0001743357125005714, + "loss": 3.1189, + "step": 2409 + }, + { + "epoch": 0.2181587761383181, + "grad_norm": 0.8851455450057983, + "learning_rate": 0.00017431448254773944, + "loss": 3.1698, + "step": 2410 + }, + { + "epoch": 0.21824929845206845, + "grad_norm": 0.873092770576477, + "learning_rate": 0.0001742932451114411, + "loss": 3.156, + "step": 2411 + }, + { + "epoch": 0.2183398207658188, + "grad_norm": 0.956868052482605, + "learning_rate": 0.00017427200019381504, + "loss": 3.202, + "step": 2412 + }, + { + "epoch": 0.2184303430795691, + "grad_norm": 0.9415931701660156, + "learning_rate": 0.00017425074779700063, + "loss": 3.1744, + "step": 2413 + }, + { + "epoch": 0.21852086539331944, + "grad_norm": 0.9146011471748352, + "learning_rate": 0.00017422948792313788, + "loss": 3.1592, + "step": 2414 + }, + { + "epoch": 0.21861138770706978, + "grad_norm": 1.0343977212905884, + "learning_rate": 0.00017420822057436777, + "loss": 3.1817, + "step": 2415 + }, + { + "epoch": 0.21870191002082012, + "grad_norm": 1.1891487836837769, + "learning_rate": 0.00017418694575283185, + "loss": 3.2632, + "step": 2416 + }, + { + "epoch": 0.21879243233457046, + "grad_norm": 0.9265338182449341, + "learning_rate": 0.00017416566346067255, + "loss": 3.1894, + "step": 2417 + }, + { + "epoch": 0.2188829546483208, + "grad_norm": 0.8732950687408447, + "learning_rate": 0.00017414437370003293, + "loss": 3.1959, + "step": 2418 + }, + { + "epoch": 0.21897347696207115, + "grad_norm": 0.9065073132514954, + "learning_rate": 0.00017412307647305696, + "loss": 3.1425, + "step": 2419 + }, + { + "epoch": 0.21906399927582149, + "grad_norm": 0.9114652872085571, + "learning_rate": 0.00017410177178188918, + "loss": 3.2211, + "step": 2420 + }, + { + "epoch": 0.21915452158957183, + "grad_norm": 1.0082124471664429, + "learning_rate": 0.000174080459628675, + "loss": 3.2108, + "step": 2421 + }, + { + "epoch": 0.21924504390332217, + "grad_norm": 1.0025688409805298, + "learning_rate": 0.0001740591400155606, + "loss": 3.144, + "step": 2422 + }, + { + "epoch": 0.2193355662170725, + "grad_norm": 0.8575987815856934, + "learning_rate": 0.0001740378129446928, + "loss": 3.1099, + "step": 2423 + }, + { + "epoch": 0.21942608853082285, + "grad_norm": 0.9605771899223328, + "learning_rate": 0.00017401647841821928, + "loss": 3.0751, + "step": 2424 + }, + { + "epoch": 0.2195166108445732, + "grad_norm": 1.021604061126709, + "learning_rate": 0.0001739951364382884, + "loss": 3.1323, + "step": 2425 + }, + { + "epoch": 0.21960713315832353, + "grad_norm": 0.9601428508758545, + "learning_rate": 0.0001739737870070493, + "loss": 3.3042, + "step": 2426 + }, + { + "epoch": 0.21969765547207387, + "grad_norm": 1.1104557514190674, + "learning_rate": 0.00017395243012665188, + "loss": 3.2057, + "step": 2427 + }, + { + "epoch": 0.2197881777858242, + "grad_norm": 0.9566591382026672, + "learning_rate": 0.00017393106579924677, + "loss": 3.1467, + "step": 2428 + }, + { + "epoch": 0.21987870009957455, + "grad_norm": 0.9779771566390991, + "learning_rate": 0.00017390969402698535, + "loss": 3.1151, + "step": 2429 + }, + { + "epoch": 0.2199692224133249, + "grad_norm": 0.9599485993385315, + "learning_rate": 0.00017388831481201977, + "loss": 3.1216, + "step": 2430 + }, + { + "epoch": 0.22005974472707523, + "grad_norm": 1.0270799398422241, + "learning_rate": 0.0001738669281565029, + "loss": 3.258, + "step": 2431 + }, + { + "epoch": 0.22015026704082558, + "grad_norm": 0.8845586180686951, + "learning_rate": 0.00017384553406258842, + "loss": 3.0924, + "step": 2432 + }, + { + "epoch": 0.22024078935457592, + "grad_norm": 0.826064944267273, + "learning_rate": 0.0001738241325324306, + "loss": 3.1836, + "step": 2433 + }, + { + "epoch": 0.22033131166832623, + "grad_norm": 0.9534816145896912, + "learning_rate": 0.00017380272356818473, + "loss": 3.2086, + "step": 2434 + }, + { + "epoch": 0.22042183398207657, + "grad_norm": 0.9598084688186646, + "learning_rate": 0.0001737813071720066, + "loss": 3.1166, + "step": 2435 + }, + { + "epoch": 0.2205123562958269, + "grad_norm": 1.0776300430297852, + "learning_rate": 0.00017375988334605285, + "loss": 3.176, + "step": 2436 + }, + { + "epoch": 0.22060287860957725, + "grad_norm": 0.8963593244552612, + "learning_rate": 0.00017373845209248088, + "loss": 3.1777, + "step": 2437 + }, + { + "epoch": 0.2206934009233276, + "grad_norm": 0.9389859437942505, + "learning_rate": 0.00017371701341344878, + "loss": 3.1094, + "step": 2438 + }, + { + "epoch": 0.22078392323707793, + "grad_norm": 0.9606500267982483, + "learning_rate": 0.00017369556731111545, + "loss": 3.131, + "step": 2439 + }, + { + "epoch": 0.22087444555082827, + "grad_norm": 0.8964757323265076, + "learning_rate": 0.0001736741137876405, + "loss": 3.1318, + "step": 2440 + }, + { + "epoch": 0.22096496786457862, + "grad_norm": 0.9549975395202637, + "learning_rate": 0.00017365265284518432, + "loss": 3.1401, + "step": 2441 + }, + { + "epoch": 0.22105549017832896, + "grad_norm": 0.9945970773696899, + "learning_rate": 0.00017363118448590801, + "loss": 3.1488, + "step": 2442 + }, + { + "epoch": 0.2211460124920793, + "grad_norm": 0.9644862413406372, + "learning_rate": 0.00017360970871197346, + "loss": 3.1731, + "step": 2443 + }, + { + "epoch": 0.22123653480582964, + "grad_norm": 0.8978458046913147, + "learning_rate": 0.00017358822552554322, + "loss": 3.2147, + "step": 2444 + }, + { + "epoch": 0.22132705711957998, + "grad_norm": 0.97486811876297, + "learning_rate": 0.0001735667349287807, + "loss": 3.1623, + "step": 2445 + }, + { + "epoch": 0.22141757943333032, + "grad_norm": 0.9704816937446594, + "learning_rate": 0.00017354523692385, + "loss": 3.1984, + "step": 2446 + }, + { + "epoch": 0.22150810174708066, + "grad_norm": 1.077000379562378, + "learning_rate": 0.00017352373151291591, + "loss": 3.1177, + "step": 2447 + }, + { + "epoch": 0.221598624060831, + "grad_norm": 0.8644207715988159, + "learning_rate": 0.00017350221869814411, + "loss": 3.1528, + "step": 2448 + }, + { + "epoch": 0.22168914637458134, + "grad_norm": 0.8725171089172363, + "learning_rate": 0.00017348069848170086, + "loss": 3.159, + "step": 2449 + }, + { + "epoch": 0.22177966868833168, + "grad_norm": 1.0187084674835205, + "learning_rate": 0.00017345917086575332, + "loss": 3.1971, + "step": 2450 + }, + { + "epoch": 0.22187019100208202, + "grad_norm": 0.8878188133239746, + "learning_rate": 0.00017343763585246928, + "loss": 3.1588, + "step": 2451 + }, + { + "epoch": 0.22196071331583236, + "grad_norm": 0.9226207137107849, + "learning_rate": 0.0001734160934440173, + "loss": 3.1223, + "step": 2452 + }, + { + "epoch": 0.2220512356295827, + "grad_norm": 0.9456731677055359, + "learning_rate": 0.00017339454364256676, + "loss": 3.223, + "step": 2453 + }, + { + "epoch": 0.22214175794333302, + "grad_norm": 0.9416071176528931, + "learning_rate": 0.00017337298645028764, + "loss": 3.2436, + "step": 2454 + }, + { + "epoch": 0.22223228025708336, + "grad_norm": 0.8809584379196167, + "learning_rate": 0.00017335142186935081, + "loss": 3.1287, + "step": 2455 + }, + { + "epoch": 0.2223228025708337, + "grad_norm": 0.905727744102478, + "learning_rate": 0.00017332984990192779, + "loss": 3.2025, + "step": 2456 + }, + { + "epoch": 0.22241332488458404, + "grad_norm": 1.0547999143600464, + "learning_rate": 0.00017330827055019092, + "loss": 3.1311, + "step": 2457 + }, + { + "epoch": 0.22250384719833438, + "grad_norm": 0.8692690134048462, + "learning_rate": 0.00017328668381631318, + "loss": 3.0641, + "step": 2458 + }, + { + "epoch": 0.22259436951208472, + "grad_norm": 1.1819257736206055, + "learning_rate": 0.0001732650897024684, + "loss": 3.1454, + "step": 2459 + }, + { + "epoch": 0.22268489182583506, + "grad_norm": 1.016374111175537, + "learning_rate": 0.0001732434882108311, + "loss": 3.194, + "step": 2460 + }, + { + "epoch": 0.2227754141395854, + "grad_norm": 1.5239908695220947, + "learning_rate": 0.0001732218793435765, + "loss": 3.1774, + "step": 2461 + }, + { + "epoch": 0.22286593645333574, + "grad_norm": 0.9903870820999146, + "learning_rate": 0.00017320026310288072, + "loss": 3.2263, + "step": 2462 + }, + { + "epoch": 0.22295645876708609, + "grad_norm": 1.1110303401947021, + "learning_rate": 0.0001731786394909204, + "loss": 3.1876, + "step": 2463 + }, + { + "epoch": 0.22304698108083643, + "grad_norm": 1.109635829925537, + "learning_rate": 0.0001731570085098731, + "loss": 3.1677, + "step": 2464 + }, + { + "epoch": 0.22313750339458677, + "grad_norm": 0.9838115572929382, + "learning_rate": 0.00017313537016191706, + "loss": 3.127, + "step": 2465 + }, + { + "epoch": 0.2232280257083371, + "grad_norm": 1.3351515531539917, + "learning_rate": 0.00017311372444923123, + "loss": 3.1621, + "step": 2466 + }, + { + "epoch": 0.22331854802208745, + "grad_norm": 0.900549590587616, + "learning_rate": 0.00017309207137399535, + "loss": 3.0983, + "step": 2467 + }, + { + "epoch": 0.2234090703358378, + "grad_norm": 1.293603777885437, + "learning_rate": 0.00017307041093838986, + "loss": 3.1548, + "step": 2468 + }, + { + "epoch": 0.22349959264958813, + "grad_norm": 0.9529392123222351, + "learning_rate": 0.00017304874314459602, + "loss": 3.1299, + "step": 2469 + }, + { + "epoch": 0.22359011496333847, + "grad_norm": 1.1548309326171875, + "learning_rate": 0.00017302706799479574, + "loss": 3.1252, + "step": 2470 + }, + { + "epoch": 0.2236806372770888, + "grad_norm": 1.0050303936004639, + "learning_rate": 0.0001730053854911717, + "loss": 3.1417, + "step": 2471 + }, + { + "epoch": 0.22377115959083915, + "grad_norm": 1.0455986261367798, + "learning_rate": 0.00017298369563590734, + "loss": 3.1961, + "step": 2472 + }, + { + "epoch": 0.2238616819045895, + "grad_norm": 0.999272346496582, + "learning_rate": 0.00017296199843118683, + "loss": 3.1329, + "step": 2473 + }, + { + "epoch": 0.22395220421833983, + "grad_norm": 0.886278510093689, + "learning_rate": 0.00017294029387919508, + "loss": 3.1698, + "step": 2474 + }, + { + "epoch": 0.22404272653209015, + "grad_norm": 1.007181167602539, + "learning_rate": 0.00017291858198211773, + "loss": 3.1676, + "step": 2475 + }, + { + "epoch": 0.2241332488458405, + "grad_norm": 0.8566113114356995, + "learning_rate": 0.00017289686274214118, + "loss": 3.1895, + "step": 2476 + }, + { + "epoch": 0.22422377115959083, + "grad_norm": 0.9515550136566162, + "learning_rate": 0.0001728751361614525, + "loss": 3.1546, + "step": 2477 + }, + { + "epoch": 0.22431429347334117, + "grad_norm": 0.8973231911659241, + "learning_rate": 0.00017285340224223965, + "loss": 3.1885, + "step": 2478 + }, + { + "epoch": 0.2244048157870915, + "grad_norm": 0.9097982048988342, + "learning_rate": 0.00017283166098669116, + "loss": 3.1111, + "step": 2479 + }, + { + "epoch": 0.22449533810084185, + "grad_norm": 0.9288604855537415, + "learning_rate": 0.00017280991239699642, + "loss": 3.1735, + "step": 2480 + }, + { + "epoch": 0.2245858604145922, + "grad_norm": 0.8434594869613647, + "learning_rate": 0.00017278815647534548, + "loss": 3.0955, + "step": 2481 + }, + { + "epoch": 0.22467638272834253, + "grad_norm": 0.8894537687301636, + "learning_rate": 0.00017276639322392917, + "loss": 3.0957, + "step": 2482 + }, + { + "epoch": 0.22476690504209287, + "grad_norm": 0.9819609522819519, + "learning_rate": 0.0001727446226449391, + "loss": 3.1211, + "step": 2483 + }, + { + "epoch": 0.22485742735584321, + "grad_norm": 0.8595069646835327, + "learning_rate": 0.00017272284474056744, + "loss": 3.1588, + "step": 2484 + }, + { + "epoch": 0.22494794966959356, + "grad_norm": 0.9514359831809998, + "learning_rate": 0.00017270105951300738, + "loss": 3.1498, + "step": 2485 + }, + { + "epoch": 0.2250384719833439, + "grad_norm": 0.878564178943634, + "learning_rate": 0.00017267926696445257, + "loss": 3.1062, + "step": 2486 + }, + { + "epoch": 0.22512899429709424, + "grad_norm": 0.9188651442527771, + "learning_rate": 0.0001726574670970976, + "loss": 3.1394, + "step": 2487 + }, + { + "epoch": 0.22521951661084458, + "grad_norm": 0.9147782325744629, + "learning_rate": 0.00017263565991313765, + "loss": 3.1907, + "step": 2488 + }, + { + "epoch": 0.22531003892459492, + "grad_norm": 0.8864535093307495, + "learning_rate": 0.00017261384541476877, + "loss": 3.195, + "step": 2489 + }, + { + "epoch": 0.22540056123834526, + "grad_norm": 0.8890230655670166, + "learning_rate": 0.00017259202360418762, + "loss": 3.1051, + "step": 2490 + }, + { + "epoch": 0.2254910835520956, + "grad_norm": 0.9365044832229614, + "learning_rate": 0.00017257019448359168, + "loss": 3.1838, + "step": 2491 + }, + { + "epoch": 0.22558160586584594, + "grad_norm": 1.0051701068878174, + "learning_rate": 0.00017254835805517914, + "loss": 3.1432, + "step": 2492 + }, + { + "epoch": 0.22567212817959628, + "grad_norm": 0.9916554689407349, + "learning_rate": 0.00017252651432114892, + "loss": 3.1628, + "step": 2493 + }, + { + "epoch": 0.22576265049334662, + "grad_norm": 0.9559274911880493, + "learning_rate": 0.0001725046632837007, + "loss": 3.1104, + "step": 2494 + }, + { + "epoch": 0.22585317280709694, + "grad_norm": 1.0775694847106934, + "learning_rate": 0.00017248280494503487, + "loss": 3.2151, + "step": 2495 + }, + { + "epoch": 0.22594369512084728, + "grad_norm": 1.055009365081787, + "learning_rate": 0.0001724609393073526, + "loss": 3.1756, + "step": 2496 + }, + { + "epoch": 0.22603421743459762, + "grad_norm": 1.210542917251587, + "learning_rate": 0.00017243906637285568, + "loss": 3.1387, + "step": 2497 + }, + { + "epoch": 0.22612473974834796, + "grad_norm": 1.0770033597946167, + "learning_rate": 0.00017241718614374678, + "loss": 3.2614, + "step": 2498 + }, + { + "epoch": 0.2262152620620983, + "grad_norm": 1.004407525062561, + "learning_rate": 0.00017239529862222918, + "loss": 3.1391, + "step": 2499 + }, + { + "epoch": 0.22630578437584864, + "grad_norm": 1.0340214967727661, + "learning_rate": 0.00017237340381050703, + "loss": 3.2116, + "step": 2500 + }, + { + "epoch": 0.22639630668959898, + "grad_norm": 1.049316167831421, + "learning_rate": 0.00017235150171078506, + "loss": 3.1887, + "step": 2501 + }, + { + "epoch": 0.22648682900334932, + "grad_norm": 0.9701083898544312, + "learning_rate": 0.0001723295923252689, + "loss": 3.1859, + "step": 2502 + }, + { + "epoch": 0.22657735131709966, + "grad_norm": 1.031803011894226, + "learning_rate": 0.00017230767565616474, + "loss": 3.2023, + "step": 2503 + }, + { + "epoch": 0.22666787363085, + "grad_norm": 0.998855710029602, + "learning_rate": 0.00017228575170567963, + "loss": 3.1229, + "step": 2504 + }, + { + "epoch": 0.22675839594460034, + "grad_norm": 0.975975513458252, + "learning_rate": 0.00017226382047602127, + "loss": 3.1604, + "step": 2505 + }, + { + "epoch": 0.22684891825835068, + "grad_norm": 0.9017311334609985, + "learning_rate": 0.00017224188196939818, + "loss": 3.1677, + "step": 2506 + }, + { + "epoch": 0.22693944057210103, + "grad_norm": 0.9536845088005066, + "learning_rate": 0.0001722199361880196, + "loss": 3.0564, + "step": 2507 + }, + { + "epoch": 0.22702996288585137, + "grad_norm": 0.8992807865142822, + "learning_rate": 0.00017219798313409537, + "loss": 3.0938, + "step": 2508 + }, + { + "epoch": 0.2271204851996017, + "grad_norm": 0.9468969702720642, + "learning_rate": 0.00017217602280983623, + "loss": 3.2082, + "step": 2509 + }, + { + "epoch": 0.22721100751335205, + "grad_norm": 0.9722668528556824, + "learning_rate": 0.00017215405521745357, + "loss": 3.2445, + "step": 2510 + }, + { + "epoch": 0.2273015298271024, + "grad_norm": 0.8341018557548523, + "learning_rate": 0.0001721320803591595, + "loss": 3.099, + "step": 2511 + }, + { + "epoch": 0.22739205214085273, + "grad_norm": 1.0530873537063599, + "learning_rate": 0.00017211009823716694, + "loss": 3.181, + "step": 2512 + }, + { + "epoch": 0.22748257445460307, + "grad_norm": 0.890476405620575, + "learning_rate": 0.00017208810885368947, + "loss": 3.1118, + "step": 2513 + }, + { + "epoch": 0.2275730967683534, + "grad_norm": 1.012795329093933, + "learning_rate": 0.0001720661122109414, + "loss": 3.1775, + "step": 2514 + }, + { + "epoch": 0.22766361908210375, + "grad_norm": 0.9841723442077637, + "learning_rate": 0.00017204410831113778, + "loss": 3.1364, + "step": 2515 + }, + { + "epoch": 0.22775414139585407, + "grad_norm": 0.9760757684707642, + "learning_rate": 0.00017202209715649444, + "loss": 3.15, + "step": 2516 + }, + { + "epoch": 0.2278446637096044, + "grad_norm": 1.123109221458435, + "learning_rate": 0.00017200007874922788, + "loss": 3.1347, + "step": 2517 + }, + { + "epoch": 0.22793518602335475, + "grad_norm": 0.9273945093154907, + "learning_rate": 0.00017197805309155536, + "loss": 3.1127, + "step": 2518 + }, + { + "epoch": 0.2280257083371051, + "grad_norm": 1.2223721742630005, + "learning_rate": 0.00017195602018569488, + "loss": 3.1295, + "step": 2519 + }, + { + "epoch": 0.22811623065085543, + "grad_norm": 0.9610428214073181, + "learning_rate": 0.0001719339800338651, + "loss": 3.153, + "step": 2520 + }, + { + "epoch": 0.22820675296460577, + "grad_norm": 1.200606107711792, + "learning_rate": 0.00017191193263828554, + "loss": 3.1055, + "step": 2521 + }, + { + "epoch": 0.2282972752783561, + "grad_norm": 1.0138325691223145, + "learning_rate": 0.00017188987800117634, + "loss": 3.1634, + "step": 2522 + }, + { + "epoch": 0.22838779759210645, + "grad_norm": 0.8626307845115662, + "learning_rate": 0.00017186781612475835, + "loss": 3.0981, + "step": 2523 + }, + { + "epoch": 0.2284783199058568, + "grad_norm": 1.167111873626709, + "learning_rate": 0.00017184574701125326, + "loss": 3.1991, + "step": 2524 + }, + { + "epoch": 0.22856884221960713, + "grad_norm": 0.8812128901481628, + "learning_rate": 0.00017182367066288342, + "loss": 3.1453, + "step": 2525 + }, + { + "epoch": 0.22865936453335747, + "grad_norm": 0.9116116166114807, + "learning_rate": 0.00017180158708187188, + "loss": 3.1386, + "step": 2526 + }, + { + "epoch": 0.22874988684710781, + "grad_norm": 1.0221889019012451, + "learning_rate": 0.00017177949627044252, + "loss": 3.2163, + "step": 2527 + }, + { + "epoch": 0.22884040916085815, + "grad_norm": 0.9080731272697449, + "learning_rate": 0.00017175739823081985, + "loss": 3.0455, + "step": 2528 + }, + { + "epoch": 0.2289309314746085, + "grad_norm": 0.9786624908447266, + "learning_rate": 0.00017173529296522912, + "loss": 3.1194, + "step": 2529 + }, + { + "epoch": 0.22902145378835884, + "grad_norm": 0.8339390158653259, + "learning_rate": 0.00017171318047589637, + "loss": 3.1286, + "step": 2530 + }, + { + "epoch": 0.22911197610210918, + "grad_norm": 1.001883625984192, + "learning_rate": 0.0001716910607650483, + "loss": 3.1536, + "step": 2531 + }, + { + "epoch": 0.22920249841585952, + "grad_norm": 0.9073264002799988, + "learning_rate": 0.00017166893383491235, + "loss": 3.1435, + "step": 2532 + }, + { + "epoch": 0.22929302072960986, + "grad_norm": 0.913053035736084, + "learning_rate": 0.00017164679968771675, + "loss": 3.1389, + "step": 2533 + }, + { + "epoch": 0.2293835430433602, + "grad_norm": 0.9608319401741028, + "learning_rate": 0.00017162465832569037, + "loss": 3.1763, + "step": 2534 + }, + { + "epoch": 0.22947406535711054, + "grad_norm": 0.8648624420166016, + "learning_rate": 0.00017160250975106287, + "loss": 3.1384, + "step": 2535 + }, + { + "epoch": 0.22956458767086085, + "grad_norm": 1.0078084468841553, + "learning_rate": 0.00017158035396606458, + "loss": 3.1459, + "step": 2536 + }, + { + "epoch": 0.2296551099846112, + "grad_norm": 0.9229320883750916, + "learning_rate": 0.0001715581909729266, + "loss": 3.1257, + "step": 2537 + }, + { + "epoch": 0.22974563229836154, + "grad_norm": 0.9432445168495178, + "learning_rate": 0.0001715360207738808, + "loss": 3.0445, + "step": 2538 + }, + { + "epoch": 0.22983615461211188, + "grad_norm": 0.9014883041381836, + "learning_rate": 0.00017151384337115962, + "loss": 3.1565, + "step": 2539 + }, + { + "epoch": 0.22992667692586222, + "grad_norm": 1.0331895351409912, + "learning_rate": 0.00017149165876699635, + "loss": 3.128, + "step": 2540 + }, + { + "epoch": 0.23001719923961256, + "grad_norm": 0.9233134388923645, + "learning_rate": 0.00017146946696362502, + "loss": 3.1212, + "step": 2541 + }, + { + "epoch": 0.2301077215533629, + "grad_norm": 0.9826732277870178, + "learning_rate": 0.00017144726796328034, + "loss": 3.1716, + "step": 2542 + }, + { + "epoch": 0.23019824386711324, + "grad_norm": 0.9416763782501221, + "learning_rate": 0.0001714250617681977, + "loss": 3.1447, + "step": 2543 + }, + { + "epoch": 0.23028876618086358, + "grad_norm": 0.9064895510673523, + "learning_rate": 0.00017140284838061334, + "loss": 3.0929, + "step": 2544 + }, + { + "epoch": 0.23037928849461392, + "grad_norm": 0.9274477362632751, + "learning_rate": 0.00017138062780276404, + "loss": 3.1231, + "step": 2545 + }, + { + "epoch": 0.23046981080836426, + "grad_norm": 1.0109010934829712, + "learning_rate": 0.00017135840003688752, + "loss": 3.0921, + "step": 2546 + }, + { + "epoch": 0.2305603331221146, + "grad_norm": 0.9265955090522766, + "learning_rate": 0.00017133616508522206, + "loss": 3.0887, + "step": 2547 + }, + { + "epoch": 0.23065085543586494, + "grad_norm": 0.9467498064041138, + "learning_rate": 0.00017131392295000674, + "loss": 3.1189, + "step": 2548 + }, + { + "epoch": 0.23074137774961528, + "grad_norm": 0.9402139782905579, + "learning_rate": 0.0001712916736334813, + "loss": 3.1551, + "step": 2549 + }, + { + "epoch": 0.23083190006336562, + "grad_norm": 0.9011145830154419, + "learning_rate": 0.00017126941713788632, + "loss": 3.108, + "step": 2550 + }, + { + "epoch": 0.23092242237711597, + "grad_norm": 1.010998249053955, + "learning_rate": 0.00017124715346546293, + "loss": 3.1518, + "step": 2551 + }, + { + "epoch": 0.2310129446908663, + "grad_norm": 0.8391900062561035, + "learning_rate": 0.00017122488261845316, + "loss": 3.0762, + "step": 2552 + }, + { + "epoch": 0.23110346700461665, + "grad_norm": 0.9358366131782532, + "learning_rate": 0.00017120260459909967, + "loss": 3.12, + "step": 2553 + }, + { + "epoch": 0.231193989318367, + "grad_norm": 0.9052395820617676, + "learning_rate": 0.00017118031940964584, + "loss": 3.0629, + "step": 2554 + }, + { + "epoch": 0.23128451163211733, + "grad_norm": 0.8716477751731873, + "learning_rate": 0.00017115802705233577, + "loss": 3.1671, + "step": 2555 + }, + { + "epoch": 0.23137503394586767, + "grad_norm": 0.8970102667808533, + "learning_rate": 0.00017113572752941434, + "loss": 3.1503, + "step": 2556 + }, + { + "epoch": 0.23146555625961798, + "grad_norm": 0.9600470066070557, + "learning_rate": 0.00017111342084312708, + "loss": 3.1936, + "step": 2557 + }, + { + "epoch": 0.23155607857336832, + "grad_norm": 0.9360811114311218, + "learning_rate": 0.0001710911069957203, + "loss": 3.1651, + "step": 2558 + }, + { + "epoch": 0.23164660088711866, + "grad_norm": 0.9788040518760681, + "learning_rate": 0.000171068785989441, + "loss": 3.1569, + "step": 2559 + }, + { + "epoch": 0.231737123200869, + "grad_norm": 0.9954851269721985, + "learning_rate": 0.0001710464578265369, + "loss": 3.117, + "step": 2560 + }, + { + "epoch": 0.23182764551461935, + "grad_norm": 0.9106504321098328, + "learning_rate": 0.0001710241225092564, + "loss": 3.0862, + "step": 2561 + }, + { + "epoch": 0.2319181678283697, + "grad_norm": 0.9232210516929626, + "learning_rate": 0.00017100178003984874, + "loss": 3.1645, + "step": 2562 + }, + { + "epoch": 0.23200869014212003, + "grad_norm": 0.880970299243927, + "learning_rate": 0.00017097943042056378, + "loss": 3.1417, + "step": 2563 + }, + { + "epoch": 0.23209921245587037, + "grad_norm": 0.9681224226951599, + "learning_rate": 0.0001709570736536521, + "loss": 3.1376, + "step": 2564 + }, + { + "epoch": 0.2321897347696207, + "grad_norm": 0.9339620471000671, + "learning_rate": 0.00017093470974136504, + "loss": 3.1156, + "step": 2565 + }, + { + "epoch": 0.23228025708337105, + "grad_norm": 0.9068838357925415, + "learning_rate": 0.00017091233868595467, + "loss": 3.1433, + "step": 2566 + }, + { + "epoch": 0.2323707793971214, + "grad_norm": 0.8875805139541626, + "learning_rate": 0.00017088996048967374, + "loss": 3.1451, + "step": 2567 + }, + { + "epoch": 0.23246130171087173, + "grad_norm": 0.8893206119537354, + "learning_rate": 0.0001708675751547757, + "loss": 3.0724, + "step": 2568 + }, + { + "epoch": 0.23255182402462207, + "grad_norm": 0.8865343928337097, + "learning_rate": 0.00017084518268351478, + "loss": 3.0983, + "step": 2569 + }, + { + "epoch": 0.2326423463383724, + "grad_norm": 0.8660908937454224, + "learning_rate": 0.00017082278307814592, + "loss": 3.0848, + "step": 2570 + }, + { + "epoch": 0.23273286865212275, + "grad_norm": 0.8525618314743042, + "learning_rate": 0.00017080037634092476, + "loss": 3.0969, + "step": 2571 + }, + { + "epoch": 0.2328233909658731, + "grad_norm": 0.9192307591438293, + "learning_rate": 0.0001707779624741076, + "loss": 3.223, + "step": 2572 + }, + { + "epoch": 0.23291391327962344, + "grad_norm": 0.9319369792938232, + "learning_rate": 0.0001707555414799516, + "loss": 3.075, + "step": 2573 + }, + { + "epoch": 0.23300443559337378, + "grad_norm": 0.9295352697372437, + "learning_rate": 0.00017073311336071446, + "loss": 3.1054, + "step": 2574 + }, + { + "epoch": 0.23309495790712412, + "grad_norm": 1.1705827713012695, + "learning_rate": 0.00017071067811865476, + "loss": 3.1209, + "step": 2575 + }, + { + "epoch": 0.23318548022087446, + "grad_norm": 0.8612735271453857, + "learning_rate": 0.00017068823575603172, + "loss": 3.0827, + "step": 2576 + }, + { + "epoch": 0.23327600253462477, + "grad_norm": 0.9167778491973877, + "learning_rate": 0.00017066578627510527, + "loss": 3.0788, + "step": 2577 + }, + { + "epoch": 0.2333665248483751, + "grad_norm": 1.0142779350280762, + "learning_rate": 0.00017064332967813605, + "loss": 3.1004, + "step": 2578 + }, + { + "epoch": 0.23345704716212545, + "grad_norm": 0.8765705823898315, + "learning_rate": 0.0001706208659673855, + "loss": 3.0819, + "step": 2579 + }, + { + "epoch": 0.2335475694758758, + "grad_norm": 0.9593830704689026, + "learning_rate": 0.00017059839514511565, + "loss": 3.1851, + "step": 2580 + }, + { + "epoch": 0.23363809178962613, + "grad_norm": 1.0137742757797241, + "learning_rate": 0.00017057591721358935, + "loss": 3.1235, + "step": 2581 + }, + { + "epoch": 0.23372861410337648, + "grad_norm": 0.873552680015564, + "learning_rate": 0.00017055343217507014, + "loss": 3.1555, + "step": 2582 + }, + { + "epoch": 0.23381913641712682, + "grad_norm": 0.8927232623100281, + "learning_rate": 0.0001705309400318222, + "loss": 3.1304, + "step": 2583 + }, + { + "epoch": 0.23390965873087716, + "grad_norm": 0.9595664739608765, + "learning_rate": 0.00017050844078611056, + "loss": 3.0944, + "step": 2584 + }, + { + "epoch": 0.2340001810446275, + "grad_norm": 0.9072505831718445, + "learning_rate": 0.00017048593444020084, + "loss": 3.2111, + "step": 2585 + }, + { + "epoch": 0.23409070335837784, + "grad_norm": 0.8766895532608032, + "learning_rate": 0.00017046342099635948, + "loss": 3.1421, + "step": 2586 + }, + { + "epoch": 0.23418122567212818, + "grad_norm": 0.8676466941833496, + "learning_rate": 0.00017044090045685353, + "loss": 3.14, + "step": 2587 + }, + { + "epoch": 0.23427174798587852, + "grad_norm": 0.9251097440719604, + "learning_rate": 0.00017041837282395084, + "loss": 3.0815, + "step": 2588 + }, + { + "epoch": 0.23436227029962886, + "grad_norm": 0.8791188597679138, + "learning_rate": 0.00017039583809991992, + "loss": 3.1301, + "step": 2589 + }, + { + "epoch": 0.2344527926133792, + "grad_norm": 0.9609639048576355, + "learning_rate": 0.00017037329628703004, + "loss": 3.1222, + "step": 2590 + }, + { + "epoch": 0.23454331492712954, + "grad_norm": 0.937321662902832, + "learning_rate": 0.00017035074738755115, + "loss": 3.0724, + "step": 2591 + }, + { + "epoch": 0.23463383724087988, + "grad_norm": 0.9102678894996643, + "learning_rate": 0.00017032819140375393, + "loss": 3.1259, + "step": 2592 + }, + { + "epoch": 0.23472435955463022, + "grad_norm": 0.9216590523719788, + "learning_rate": 0.00017030562833790977, + "loss": 3.1505, + "step": 2593 + }, + { + "epoch": 0.23481488186838056, + "grad_norm": 1.007556676864624, + "learning_rate": 0.00017028305819229074, + "loss": 3.1326, + "step": 2594 + }, + { + "epoch": 0.2349054041821309, + "grad_norm": 0.9005347490310669, + "learning_rate": 0.0001702604809691697, + "loss": 3.1449, + "step": 2595 + }, + { + "epoch": 0.23499592649588125, + "grad_norm": 1.139877438545227, + "learning_rate": 0.00017023789667082012, + "loss": 3.0806, + "step": 2596 + }, + { + "epoch": 0.2350864488096316, + "grad_norm": 0.910658597946167, + "learning_rate": 0.00017021530529951625, + "loss": 3.1299, + "step": 2597 + }, + { + "epoch": 0.2351769711233819, + "grad_norm": 0.9529627561569214, + "learning_rate": 0.0001701927068575331, + "loss": 3.1053, + "step": 2598 + }, + { + "epoch": 0.23526749343713224, + "grad_norm": 0.892433226108551, + "learning_rate": 0.00017017010134714626, + "loss": 3.0509, + "step": 2599 + }, + { + "epoch": 0.23535801575088258, + "grad_norm": 0.9055562019348145, + "learning_rate": 0.00017014748877063214, + "loss": 3.169, + "step": 2600 + }, + { + "epoch": 0.23544853806463292, + "grad_norm": 1.1402655839920044, + "learning_rate": 0.00017012486913026782, + "loss": 3.1448, + "step": 2601 + }, + { + "epoch": 0.23553906037838326, + "grad_norm": 0.8739327192306519, + "learning_rate": 0.0001701022424283311, + "loss": 3.1233, + "step": 2602 + }, + { + "epoch": 0.2356295826921336, + "grad_norm": 0.8963400721549988, + "learning_rate": 0.00017007960866710048, + "loss": 3.1718, + "step": 2603 + }, + { + "epoch": 0.23572010500588395, + "grad_norm": 1.0158209800720215, + "learning_rate": 0.00017005696784885518, + "loss": 3.1502, + "step": 2604 + }, + { + "epoch": 0.2358106273196343, + "grad_norm": 0.9023421406745911, + "learning_rate": 0.00017003431997587515, + "loss": 3.1557, + "step": 2605 + }, + { + "epoch": 0.23590114963338463, + "grad_norm": 0.8342739343643188, + "learning_rate": 0.00017001166505044104, + "loss": 3.1859, + "step": 2606 + }, + { + "epoch": 0.23599167194713497, + "grad_norm": 0.8725590705871582, + "learning_rate": 0.00016998900307483412, + "loss": 3.0903, + "step": 2607 + }, + { + "epoch": 0.2360821942608853, + "grad_norm": 0.9000881314277649, + "learning_rate": 0.00016996633405133655, + "loss": 3.1529, + "step": 2608 + }, + { + "epoch": 0.23617271657463565, + "grad_norm": 0.8588462471961975, + "learning_rate": 0.00016994365798223102, + "loss": 3.11, + "step": 2609 + }, + { + "epoch": 0.236263238888386, + "grad_norm": 0.8472119569778442, + "learning_rate": 0.00016992097486980107, + "loss": 3.093, + "step": 2610 + }, + { + "epoch": 0.23635376120213633, + "grad_norm": 0.881607711315155, + "learning_rate": 0.00016989828471633084, + "loss": 3.0996, + "step": 2611 + }, + { + "epoch": 0.23644428351588667, + "grad_norm": 0.9433134198188782, + "learning_rate": 0.0001698755875241053, + "loss": 3.1756, + "step": 2612 + }, + { + "epoch": 0.236534805829637, + "grad_norm": 0.8859644532203674, + "learning_rate": 0.00016985288329540997, + "loss": 3.0946, + "step": 2613 + }, + { + "epoch": 0.23662532814338735, + "grad_norm": 0.845602810382843, + "learning_rate": 0.00016983017203253122, + "loss": 3.1056, + "step": 2614 + }, + { + "epoch": 0.2367158504571377, + "grad_norm": 0.9098728895187378, + "learning_rate": 0.00016980745373775604, + "loss": 3.1132, + "step": 2615 + }, + { + "epoch": 0.23680637277088804, + "grad_norm": 1.0199395418167114, + "learning_rate": 0.00016978472841337218, + "loss": 3.1743, + "step": 2616 + }, + { + "epoch": 0.23689689508463838, + "grad_norm": 1.2085365056991577, + "learning_rate": 0.00016976199606166809, + "loss": 3.0876, + "step": 2617 + }, + { + "epoch": 0.23698741739838872, + "grad_norm": 0.9905280470848083, + "learning_rate": 0.0001697392566849329, + "loss": 3.1126, + "step": 2618 + }, + { + "epoch": 0.23707793971213903, + "grad_norm": 1.0612152814865112, + "learning_rate": 0.00016971651028545648, + "loss": 3.1126, + "step": 2619 + }, + { + "epoch": 0.23716846202588937, + "grad_norm": 0.8954588770866394, + "learning_rate": 0.00016969375686552937, + "loss": 3.1352, + "step": 2620 + }, + { + "epoch": 0.2372589843396397, + "grad_norm": 1.0112584829330444, + "learning_rate": 0.00016967099642744283, + "loss": 3.1017, + "step": 2621 + }, + { + "epoch": 0.23734950665339005, + "grad_norm": 0.9944405555725098, + "learning_rate": 0.0001696482289734889, + "loss": 3.1267, + "step": 2622 + }, + { + "epoch": 0.2374400289671404, + "grad_norm": 0.9250493049621582, + "learning_rate": 0.00016962545450596017, + "loss": 3.1542, + "step": 2623 + }, + { + "epoch": 0.23753055128089073, + "grad_norm": 1.0268713235855103, + "learning_rate": 0.0001696026730271501, + "loss": 3.1766, + "step": 2624 + }, + { + "epoch": 0.23762107359464107, + "grad_norm": 0.9544373750686646, + "learning_rate": 0.00016957988453935276, + "loss": 3.0965, + "step": 2625 + }, + { + "epoch": 0.23771159590839142, + "grad_norm": 0.8758040070533752, + "learning_rate": 0.00016955708904486296, + "loss": 3.1607, + "step": 2626 + }, + { + "epoch": 0.23780211822214176, + "grad_norm": 0.9426255822181702, + "learning_rate": 0.00016953428654597618, + "loss": 3.1236, + "step": 2627 + }, + { + "epoch": 0.2378926405358921, + "grad_norm": 0.8743308782577515, + "learning_rate": 0.0001695114770449887, + "loss": 3.0598, + "step": 2628 + }, + { + "epoch": 0.23798316284964244, + "grad_norm": 0.9559704065322876, + "learning_rate": 0.00016948866054419733, + "loss": 3.0916, + "step": 2629 + }, + { + "epoch": 0.23807368516339278, + "grad_norm": 0.899918258190155, + "learning_rate": 0.00016946583704589973, + "loss": 3.153, + "step": 2630 + }, + { + "epoch": 0.23816420747714312, + "grad_norm": 1.040543556213379, + "learning_rate": 0.00016944300655239429, + "loss": 3.193, + "step": 2631 + }, + { + "epoch": 0.23825472979089346, + "grad_norm": 0.8428203463554382, + "learning_rate": 0.00016942016906597995, + "loss": 3.0544, + "step": 2632 + }, + { + "epoch": 0.2383452521046438, + "grad_norm": 0.9760965704917908, + "learning_rate": 0.0001693973245889565, + "loss": 3.0425, + "step": 2633 + }, + { + "epoch": 0.23843577441839414, + "grad_norm": 0.881698727607727, + "learning_rate": 0.00016937447312362437, + "loss": 3.0975, + "step": 2634 + }, + { + "epoch": 0.23852629673214448, + "grad_norm": 0.934918224811554, + "learning_rate": 0.00016935161467228465, + "loss": 3.1498, + "step": 2635 + }, + { + "epoch": 0.23861681904589482, + "grad_norm": 0.9894874691963196, + "learning_rate": 0.00016932874923723928, + "loss": 3.171, + "step": 2636 + }, + { + "epoch": 0.23870734135964516, + "grad_norm": 0.9783918857574463, + "learning_rate": 0.00016930587682079077, + "loss": 3.1563, + "step": 2637 + }, + { + "epoch": 0.2387978636733955, + "grad_norm": 0.9620823860168457, + "learning_rate": 0.00016928299742524234, + "loss": 3.1405, + "step": 2638 + }, + { + "epoch": 0.23888838598714582, + "grad_norm": 0.8668125867843628, + "learning_rate": 0.00016926011105289796, + "loss": 3.1277, + "step": 2639 + }, + { + "epoch": 0.23897890830089616, + "grad_norm": 1.424202799797058, + "learning_rate": 0.00016923721770606228, + "loss": 3.1993, + "step": 2640 + }, + { + "epoch": 0.2390694306146465, + "grad_norm": 0.874039351940155, + "learning_rate": 0.0001692143173870407, + "loss": 3.1476, + "step": 2641 + }, + { + "epoch": 0.23915995292839684, + "grad_norm": 0.8891192674636841, + "learning_rate": 0.00016919141009813924, + "loss": 3.1399, + "step": 2642 + }, + { + "epoch": 0.23925047524214718, + "grad_norm": 0.954897940158844, + "learning_rate": 0.00016916849584166466, + "loss": 3.1055, + "step": 2643 + }, + { + "epoch": 0.23934099755589752, + "grad_norm": 0.9279910326004028, + "learning_rate": 0.00016914557461992447, + "loss": 3.1847, + "step": 2644 + }, + { + "epoch": 0.23943151986964786, + "grad_norm": 0.9511407017707825, + "learning_rate": 0.00016912264643522678, + "loss": 3.1576, + "step": 2645 + }, + { + "epoch": 0.2395220421833982, + "grad_norm": 0.9269322752952576, + "learning_rate": 0.00016909971128988049, + "loss": 3.1004, + "step": 2646 + }, + { + "epoch": 0.23961256449714854, + "grad_norm": 0.9328409433364868, + "learning_rate": 0.0001690767691861952, + "loss": 3.1329, + "step": 2647 + }, + { + "epoch": 0.23970308681089889, + "grad_norm": 0.8925078511238098, + "learning_rate": 0.00016905382012648109, + "loss": 3.1537, + "step": 2648 + }, + { + "epoch": 0.23979360912464923, + "grad_norm": 0.9077745079994202, + "learning_rate": 0.00016903086411304923, + "loss": 3.168, + "step": 2649 + }, + { + "epoch": 0.23988413143839957, + "grad_norm": 0.9630613327026367, + "learning_rate": 0.00016900790114821122, + "loss": 3.1688, + "step": 2650 + }, + { + "epoch": 0.2399746537521499, + "grad_norm": 0.8002099990844727, + "learning_rate": 0.00016898493123427945, + "loss": 3.07, + "step": 2651 + }, + { + "epoch": 0.24006517606590025, + "grad_norm": 0.8331252336502075, + "learning_rate": 0.000168961954373567, + "loss": 3.0506, + "step": 2652 + }, + { + "epoch": 0.2401556983796506, + "grad_norm": 0.8587708473205566, + "learning_rate": 0.0001689389705683876, + "loss": 3.1304, + "step": 2653 + }, + { + "epoch": 0.24024622069340093, + "grad_norm": 0.9123610854148865, + "learning_rate": 0.0001689159798210558, + "loss": 3.1164, + "step": 2654 + }, + { + "epoch": 0.24033674300715127, + "grad_norm": 1.0066545009613037, + "learning_rate": 0.0001688929821338867, + "loss": 3.1512, + "step": 2655 + }, + { + "epoch": 0.2404272653209016, + "grad_norm": 0.8197920918464661, + "learning_rate": 0.00016886997750919619, + "loss": 3.1139, + "step": 2656 + }, + { + "epoch": 0.24051778763465195, + "grad_norm": 1.0735701322555542, + "learning_rate": 0.00016884696594930077, + "loss": 3.0909, + "step": 2657 + }, + { + "epoch": 0.2406083099484023, + "grad_norm": 1.0315901041030884, + "learning_rate": 0.00016882394745651783, + "loss": 3.0775, + "step": 2658 + }, + { + "epoch": 0.24069883226215263, + "grad_norm": 0.9101348519325256, + "learning_rate": 0.00016880092203316527, + "loss": 3.067, + "step": 2659 + }, + { + "epoch": 0.24078935457590295, + "grad_norm": 0.9687630534172058, + "learning_rate": 0.0001687778896815617, + "loss": 3.1138, + "step": 2660 + }, + { + "epoch": 0.2408798768896533, + "grad_norm": 0.8488638997077942, + "learning_rate": 0.00016875485040402654, + "loss": 3.0299, + "step": 2661 + }, + { + "epoch": 0.24097039920340363, + "grad_norm": 0.9105443358421326, + "learning_rate": 0.0001687318042028798, + "loss": 3.0773, + "step": 2662 + }, + { + "epoch": 0.24106092151715397, + "grad_norm": 1.0244542360305786, + "learning_rate": 0.0001687087510804423, + "loss": 3.1329, + "step": 2663 + }, + { + "epoch": 0.2411514438309043, + "grad_norm": 0.9051873683929443, + "learning_rate": 0.00016868569103903544, + "loss": 3.1798, + "step": 2664 + }, + { + "epoch": 0.24124196614465465, + "grad_norm": 0.916400134563446, + "learning_rate": 0.00016866262408098134, + "loss": 3.11, + "step": 2665 + }, + { + "epoch": 0.241332488458405, + "grad_norm": 1.0660948753356934, + "learning_rate": 0.0001686395502086029, + "loss": 3.1874, + "step": 2666 + }, + { + "epoch": 0.24142301077215533, + "grad_norm": 1.2193211317062378, + "learning_rate": 0.00016861646942422365, + "loss": 3.1298, + "step": 2667 + }, + { + "epoch": 0.24151353308590567, + "grad_norm": 0.9132906198501587, + "learning_rate": 0.0001685933817301678, + "loss": 3.1635, + "step": 2668 + }, + { + "epoch": 0.24160405539965601, + "grad_norm": 1.070044994354248, + "learning_rate": 0.00016857028712876024, + "loss": 3.1155, + "step": 2669 + }, + { + "epoch": 0.24169457771340636, + "grad_norm": 0.8624181747436523, + "learning_rate": 0.00016854718562232668, + "loss": 3.1299, + "step": 2670 + }, + { + "epoch": 0.2417851000271567, + "grad_norm": 0.9248359203338623, + "learning_rate": 0.0001685240772131934, + "loss": 3.086, + "step": 2671 + }, + { + "epoch": 0.24187562234090704, + "grad_norm": 0.9360045194625854, + "learning_rate": 0.0001685009619036874, + "loss": 3.0809, + "step": 2672 + }, + { + "epoch": 0.24196614465465738, + "grad_norm": 0.9229385852813721, + "learning_rate": 0.00016847783969613643, + "loss": 3.0644, + "step": 2673 + }, + { + "epoch": 0.24205666696840772, + "grad_norm": 1.0891914367675781, + "learning_rate": 0.00016845471059286887, + "loss": 3.1425, + "step": 2674 + }, + { + "epoch": 0.24214718928215806, + "grad_norm": 0.9290924668312073, + "learning_rate": 0.00016843157459621384, + "loss": 3.1359, + "step": 2675 + }, + { + "epoch": 0.2422377115959084, + "grad_norm": 1.0456758737564087, + "learning_rate": 0.0001684084317085011, + "loss": 3.1505, + "step": 2676 + }, + { + "epoch": 0.24232823390965874, + "grad_norm": 0.8679757714271545, + "learning_rate": 0.00016838528193206117, + "loss": 3.082, + "step": 2677 + }, + { + "epoch": 0.24241875622340908, + "grad_norm": 1.1940603256225586, + "learning_rate": 0.00016836212526922522, + "loss": 3.1754, + "step": 2678 + }, + { + "epoch": 0.24250927853715942, + "grad_norm": 0.8902338743209839, + "learning_rate": 0.00016833896172232512, + "loss": 3.0364, + "step": 2679 + }, + { + "epoch": 0.24259980085090974, + "grad_norm": 1.1721466779708862, + "learning_rate": 0.00016831579129369346, + "loss": 3.0953, + "step": 2680 + }, + { + "epoch": 0.24269032316466008, + "grad_norm": 0.9444583058357239, + "learning_rate": 0.00016829261398566348, + "loss": 3.1059, + "step": 2681 + }, + { + "epoch": 0.24278084547841042, + "grad_norm": 0.9138522744178772, + "learning_rate": 0.00016826942980056913, + "loss": 3.1372, + "step": 2682 + }, + { + "epoch": 0.24287136779216076, + "grad_norm": 0.9610081911087036, + "learning_rate": 0.00016824623874074507, + "loss": 3.0693, + "step": 2683 + }, + { + "epoch": 0.2429618901059111, + "grad_norm": 0.8005548715591431, + "learning_rate": 0.00016822304080852662, + "loss": 3.1198, + "step": 2684 + }, + { + "epoch": 0.24305241241966144, + "grad_norm": 1.113952875137329, + "learning_rate": 0.00016819983600624986, + "loss": 3.1332, + "step": 2685 + }, + { + "epoch": 0.24314293473341178, + "grad_norm": 0.8378565311431885, + "learning_rate": 0.00016817662433625148, + "loss": 3.0195, + "step": 2686 + }, + { + "epoch": 0.24323345704716212, + "grad_norm": 0.9602024555206299, + "learning_rate": 0.00016815340580086886, + "loss": 3.066, + "step": 2687 + }, + { + "epoch": 0.24332397936091246, + "grad_norm": 0.8759976029396057, + "learning_rate": 0.00016813018040244017, + "loss": 3.0734, + "step": 2688 + }, + { + "epoch": 0.2434145016746628, + "grad_norm": 0.8833613991737366, + "learning_rate": 0.00016810694814330416, + "loss": 3.1273, + "step": 2689 + }, + { + "epoch": 0.24350502398841314, + "grad_norm": 0.8197110295295715, + "learning_rate": 0.00016808370902580036, + "loss": 3.0442, + "step": 2690 + }, + { + "epoch": 0.24359554630216348, + "grad_norm": 0.891551673412323, + "learning_rate": 0.0001680604630522689, + "loss": 3.0359, + "step": 2691 + }, + { + "epoch": 0.24368606861591383, + "grad_norm": 0.8699698448181152, + "learning_rate": 0.00016803721022505067, + "loss": 3.1161, + "step": 2692 + }, + { + "epoch": 0.24377659092966417, + "grad_norm": 0.8947954177856445, + "learning_rate": 0.00016801395054648723, + "loss": 3.0821, + "step": 2693 + }, + { + "epoch": 0.2438671132434145, + "grad_norm": 0.9444195032119751, + "learning_rate": 0.00016799068401892087, + "loss": 3.0483, + "step": 2694 + }, + { + "epoch": 0.24395763555716485, + "grad_norm": 0.8962039351463318, + "learning_rate": 0.00016796741064469445, + "loss": 3.1249, + "step": 2695 + }, + { + "epoch": 0.2440481578709152, + "grad_norm": 0.8785931468009949, + "learning_rate": 0.00016794413042615168, + "loss": 3.141, + "step": 2696 + }, + { + "epoch": 0.24413868018466553, + "grad_norm": 0.9811086654663086, + "learning_rate": 0.00016792084336563677, + "loss": 3.0502, + "step": 2697 + }, + { + "epoch": 0.24422920249841587, + "grad_norm": 0.8483330607414246, + "learning_rate": 0.00016789754946549485, + "loss": 3.0703, + "step": 2698 + }, + { + "epoch": 0.2443197248121662, + "grad_norm": 1.045301079750061, + "learning_rate": 0.00016787424872807152, + "loss": 3.0932, + "step": 2699 + }, + { + "epoch": 0.24441024712591655, + "grad_norm": 0.9416995048522949, + "learning_rate": 0.00016785094115571322, + "loss": 3.1171, + "step": 2700 + }, + { + "epoch": 0.24450076943966687, + "grad_norm": 0.8743078112602234, + "learning_rate": 0.00016782762675076699, + "loss": 3.1714, + "step": 2701 + }, + { + "epoch": 0.2445912917534172, + "grad_norm": 0.9239590764045715, + "learning_rate": 0.00016780430551558063, + "loss": 3.064, + "step": 2702 + }, + { + "epoch": 0.24468181406716755, + "grad_norm": 0.8741731643676758, + "learning_rate": 0.00016778097745250253, + "loss": 3.1444, + "step": 2703 + }, + { + "epoch": 0.2447723363809179, + "grad_norm": 0.9072793126106262, + "learning_rate": 0.00016775764256388186, + "loss": 3.0598, + "step": 2704 + }, + { + "epoch": 0.24486285869466823, + "grad_norm": 0.8954857587814331, + "learning_rate": 0.00016773430085206847, + "loss": 3.0361, + "step": 2705 + }, + { + "epoch": 0.24495338100841857, + "grad_norm": 0.8716298341751099, + "learning_rate": 0.0001677109523194128, + "loss": 3.0546, + "step": 2706 + }, + { + "epoch": 0.2450439033221689, + "grad_norm": 1.0035932064056396, + "learning_rate": 0.00016768759696826608, + "loss": 3.1424, + "step": 2707 + }, + { + "epoch": 0.24513442563591925, + "grad_norm": 0.8431956768035889, + "learning_rate": 0.0001676642348009802, + "loss": 3.0365, + "step": 2708 + }, + { + "epoch": 0.2452249479496696, + "grad_norm": 0.8482758402824402, + "learning_rate": 0.00016764086581990775, + "loss": 3.109, + "step": 2709 + }, + { + "epoch": 0.24531547026341993, + "grad_norm": 0.9053434729576111, + "learning_rate": 0.00016761749002740193, + "loss": 3.056, + "step": 2710 + }, + { + "epoch": 0.24540599257717027, + "grad_norm": 0.9073513746261597, + "learning_rate": 0.00016759410742581677, + "loss": 3.2316, + "step": 2711 + }, + { + "epoch": 0.24549651489092061, + "grad_norm": 0.9571587443351746, + "learning_rate": 0.00016757071801750678, + "loss": 3.0679, + "step": 2712 + }, + { + "epoch": 0.24558703720467095, + "grad_norm": 0.9908713698387146, + "learning_rate": 0.0001675473218048274, + "loss": 3.0663, + "step": 2713 + }, + { + "epoch": 0.2456775595184213, + "grad_norm": 0.8795186877250671, + "learning_rate": 0.0001675239187901345, + "loss": 2.9681, + "step": 2714 + }, + { + "epoch": 0.24576808183217164, + "grad_norm": 0.8643364906311035, + "learning_rate": 0.00016750050897578484, + "loss": 3.1057, + "step": 2715 + }, + { + "epoch": 0.24585860414592198, + "grad_norm": 0.9430257678031921, + "learning_rate": 0.0001674770923641358, + "loss": 3.1365, + "step": 2716 + }, + { + "epoch": 0.24594912645967232, + "grad_norm": 0.9766716957092285, + "learning_rate": 0.0001674536689575454, + "loss": 3.0803, + "step": 2717 + }, + { + "epoch": 0.24603964877342266, + "grad_norm": 0.9355599284172058, + "learning_rate": 0.00016743023875837233, + "loss": 3.1598, + "step": 2718 + }, + { + "epoch": 0.246130171087173, + "grad_norm": 0.8610727190971375, + "learning_rate": 0.00016740680176897615, + "loss": 3.0801, + "step": 2719 + }, + { + "epoch": 0.24622069340092334, + "grad_norm": 1.0121809244155884, + "learning_rate": 0.00016738335799171682, + "loss": 3.0124, + "step": 2720 + }, + { + "epoch": 0.24631121571467365, + "grad_norm": 0.8912957310676575, + "learning_rate": 0.00016735990742895517, + "loss": 3.0346, + "step": 2721 + }, + { + "epoch": 0.246401738028424, + "grad_norm": 0.9681625962257385, + "learning_rate": 0.00016733645008305272, + "loss": 3.0641, + "step": 2722 + }, + { + "epoch": 0.24649226034217434, + "grad_norm": 0.9645764827728271, + "learning_rate": 0.00016731298595637156, + "loss": 3.1159, + "step": 2723 + }, + { + "epoch": 0.24658278265592468, + "grad_norm": 0.8538256883621216, + "learning_rate": 0.00016728951505127457, + "loss": 3.0658, + "step": 2724 + }, + { + "epoch": 0.24667330496967502, + "grad_norm": 0.8959582448005676, + "learning_rate": 0.00016726603737012529, + "loss": 3.0969, + "step": 2725 + }, + { + "epoch": 0.24676382728342536, + "grad_norm": 0.9084468483924866, + "learning_rate": 0.0001672425529152878, + "loss": 3.0568, + "step": 2726 + }, + { + "epoch": 0.2468543495971757, + "grad_norm": 0.8262922763824463, + "learning_rate": 0.00016721906168912717, + "loss": 3.1008, + "step": 2727 + }, + { + "epoch": 0.24694487191092604, + "grad_norm": 1.1768760681152344, + "learning_rate": 0.0001671955636940088, + "loss": 3.0462, + "step": 2728 + }, + { + "epoch": 0.24703539422467638, + "grad_norm": 0.8710028529167175, + "learning_rate": 0.00016717205893229903, + "loss": 3.0722, + "step": 2729 + }, + { + "epoch": 0.24712591653842672, + "grad_norm": 1.082719326019287, + "learning_rate": 0.00016714854740636478, + "loss": 3.0794, + "step": 2730 + }, + { + "epoch": 0.24721643885217706, + "grad_norm": 0.9447751641273499, + "learning_rate": 0.0001671250291185736, + "loss": 3.0471, + "step": 2731 + }, + { + "epoch": 0.2473069611659274, + "grad_norm": 1.1959116458892822, + "learning_rate": 0.00016710150407129386, + "loss": 3.2106, + "step": 2732 + }, + { + "epoch": 0.24739748347967774, + "grad_norm": 0.9239301085472107, + "learning_rate": 0.00016707797226689448, + "loss": 3.0975, + "step": 2733 + }, + { + "epoch": 0.24748800579342808, + "grad_norm": 0.8619546890258789, + "learning_rate": 0.00016705443370774515, + "loss": 3.2133, + "step": 2734 + }, + { + "epoch": 0.24757852810717842, + "grad_norm": 1.0291749238967896, + "learning_rate": 0.00016703088839621613, + "loss": 3.0778, + "step": 2735 + }, + { + "epoch": 0.24766905042092877, + "grad_norm": 0.9417911767959595, + "learning_rate": 0.00016700733633467853, + "loss": 3.0914, + "step": 2736 + }, + { + "epoch": 0.2477595727346791, + "grad_norm": 1.1757731437683105, + "learning_rate": 0.000166983777525504, + "loss": 3.0828, + "step": 2737 + }, + { + "epoch": 0.24785009504842945, + "grad_norm": 0.9673495888710022, + "learning_rate": 0.00016696021197106487, + "loss": 3.0929, + "step": 2738 + }, + { + "epoch": 0.2479406173621798, + "grad_norm": 0.9365146160125732, + "learning_rate": 0.00016693663967373422, + "loss": 3.0604, + "step": 2739 + }, + { + "epoch": 0.24803113967593013, + "grad_norm": 0.9483555555343628, + "learning_rate": 0.00016691306063588583, + "loss": 3.0947, + "step": 2740 + }, + { + "epoch": 0.24812166198968047, + "grad_norm": 0.8077372312545776, + "learning_rate": 0.00016688947485989406, + "loss": 3.0894, + "step": 2741 + }, + { + "epoch": 0.24821218430343078, + "grad_norm": 0.9072548747062683, + "learning_rate": 0.000166865882348134, + "loss": 3.0464, + "step": 2742 + }, + { + "epoch": 0.24830270661718112, + "grad_norm": 0.8407996892929077, + "learning_rate": 0.00016684228310298138, + "loss": 3.0608, + "step": 2743 + }, + { + "epoch": 0.24839322893093146, + "grad_norm": 0.9338646531105042, + "learning_rate": 0.0001668186771268127, + "loss": 3.1346, + "step": 2744 + }, + { + "epoch": 0.2484837512446818, + "grad_norm": 0.9149847626686096, + "learning_rate": 0.00016679506442200505, + "loss": 3.0997, + "step": 2745 + }, + { + "epoch": 0.24857427355843215, + "grad_norm": 0.8203171491622925, + "learning_rate": 0.00016677144499093626, + "loss": 3.0163, + "step": 2746 + }, + { + "epoch": 0.2486647958721825, + "grad_norm": 0.8672574758529663, + "learning_rate": 0.00016674781883598477, + "loss": 3.0743, + "step": 2747 + }, + { + "epoch": 0.24875531818593283, + "grad_norm": 0.9464472532272339, + "learning_rate": 0.0001667241859595298, + "loss": 3.1096, + "step": 2748 + }, + { + "epoch": 0.24884584049968317, + "grad_norm": 0.8735107779502869, + "learning_rate": 0.00016670054636395108, + "loss": 3.1216, + "step": 2749 + }, + { + "epoch": 0.2489363628134335, + "grad_norm": 1.1609880924224854, + "learning_rate": 0.00016667690005162916, + "loss": 3.0835, + "step": 2750 + }, + { + "epoch": 0.24902688512718385, + "grad_norm": 0.8689505457878113, + "learning_rate": 0.00016665324702494524, + "loss": 3.1091, + "step": 2751 + }, + { + "epoch": 0.2491174074409342, + "grad_norm": 0.8796523809432983, + "learning_rate": 0.0001666295872862812, + "loss": 3.0843, + "step": 2752 + }, + { + "epoch": 0.24920792975468453, + "grad_norm": 0.9133955836296082, + "learning_rate": 0.0001666059208380195, + "loss": 3.1447, + "step": 2753 + }, + { + "epoch": 0.24929845206843487, + "grad_norm": 0.8879693150520325, + "learning_rate": 0.00016658224768254342, + "loss": 3.047, + "step": 2754 + }, + { + "epoch": 0.2493889743821852, + "grad_norm": 0.9240202307701111, + "learning_rate": 0.00016655856782223682, + "loss": 3.0566, + "step": 2755 + }, + { + "epoch": 0.24947949669593555, + "grad_norm": 0.9515302777290344, + "learning_rate": 0.00016653488125948425, + "loss": 3.1346, + "step": 2756 + }, + { + "epoch": 0.2495700190096859, + "grad_norm": 0.8590241074562073, + "learning_rate": 0.00016651118799667096, + "loss": 3.0785, + "step": 2757 + }, + { + "epoch": 0.24966054132343624, + "grad_norm": 0.9613081216812134, + "learning_rate": 0.00016648748803618286, + "loss": 3.1146, + "step": 2758 + }, + { + "epoch": 0.24975106363718658, + "grad_norm": 0.8211583495140076, + "learning_rate": 0.00016646378138040655, + "loss": 3.0034, + "step": 2759 + }, + { + "epoch": 0.24984158595093692, + "grad_norm": 0.9760122299194336, + "learning_rate": 0.00016644006803172924, + "loss": 3.0653, + "step": 2760 + }, + { + "epoch": 0.24993210826468726, + "grad_norm": 0.8821545243263245, + "learning_rate": 0.0001664163479925389, + "loss": 3.1247, + "step": 2761 + }, + { + "epoch": 0.2500226305784376, + "grad_norm": 1.0329774618148804, + "learning_rate": 0.00016639262126522418, + "loss": 3.0699, + "step": 2762 + }, + { + "epoch": 0.25011315289218794, + "grad_norm": 0.9172666668891907, + "learning_rate": 0.00016636888785217426, + "loss": 3.1346, + "step": 2763 + }, + { + "epoch": 0.2502036752059383, + "grad_norm": 0.9607560634613037, + "learning_rate": 0.0001663451477557792, + "loss": 3.1161, + "step": 2764 + }, + { + "epoch": 0.2502941975196886, + "grad_norm": 0.8451930284500122, + "learning_rate": 0.00016632140097842953, + "loss": 3.0687, + "step": 2765 + }, + { + "epoch": 0.25038471983343896, + "grad_norm": 1.0101604461669922, + "learning_rate": 0.00016629764752251665, + "loss": 3.1201, + "step": 2766 + }, + { + "epoch": 0.2504752421471893, + "grad_norm": 0.8747375011444092, + "learning_rate": 0.00016627388739043242, + "loss": 3.1586, + "step": 2767 + }, + { + "epoch": 0.25056576446093964, + "grad_norm": 0.87981778383255, + "learning_rate": 0.00016625012058456957, + "loss": 3.0545, + "step": 2768 + }, + { + "epoch": 0.25065628677469, + "grad_norm": 1.0310460329055786, + "learning_rate": 0.00016622634710732138, + "loss": 3.0795, + "step": 2769 + }, + { + "epoch": 0.2507468090884403, + "grad_norm": 0.8559186458587646, + "learning_rate": 0.00016620256696108188, + "loss": 3.0805, + "step": 2770 + }, + { + "epoch": 0.25083733140219067, + "grad_norm": 0.988170325756073, + "learning_rate": 0.00016617878014824563, + "loss": 3.1125, + "step": 2771 + }, + { + "epoch": 0.250927853715941, + "grad_norm": 0.917720377445221, + "learning_rate": 0.0001661549866712081, + "loss": 3.0562, + "step": 2772 + }, + { + "epoch": 0.2510183760296913, + "grad_norm": 0.9410594701766968, + "learning_rate": 0.00016613118653236518, + "loss": 3.1102, + "step": 2773 + }, + { + "epoch": 0.25110889834344163, + "grad_norm": 0.9203336834907532, + "learning_rate": 0.00016610737973411362, + "loss": 3.1359, + "step": 2774 + }, + { + "epoch": 0.251199420657192, + "grad_norm": 0.9632644057273865, + "learning_rate": 0.00016608356627885072, + "loss": 3.0328, + "step": 2775 + }, + { + "epoch": 0.2512899429709423, + "grad_norm": 0.8281729817390442, + "learning_rate": 0.00016605974616897449, + "loss": 3.0756, + "step": 2776 + }, + { + "epoch": 0.25138046528469266, + "grad_norm": 0.9495511651039124, + "learning_rate": 0.00016603591940688364, + "loss": 3.0499, + "step": 2777 + }, + { + "epoch": 0.251470987598443, + "grad_norm": 0.8314706683158875, + "learning_rate": 0.00016601208599497752, + "loss": 3.036, + "step": 2778 + }, + { + "epoch": 0.25156150991219334, + "grad_norm": 1.015807032585144, + "learning_rate": 0.0001659882459356561, + "loss": 3.0966, + "step": 2779 + }, + { + "epoch": 0.2516520322259437, + "grad_norm": 0.9160824418067932, + "learning_rate": 0.00016596439923132017, + "loss": 3.0643, + "step": 2780 + }, + { + "epoch": 0.251742554539694, + "grad_norm": 1.1869463920593262, + "learning_rate": 0.00016594054588437102, + "loss": 3.0994, + "step": 2781 + }, + { + "epoch": 0.25183307685344436, + "grad_norm": 1.000400185585022, + "learning_rate": 0.0001659166858972107, + "loss": 3.0737, + "step": 2782 + }, + { + "epoch": 0.2519235991671947, + "grad_norm": 1.0648994445800781, + "learning_rate": 0.00016589281927224193, + "loss": 3.0411, + "step": 2783 + }, + { + "epoch": 0.25201412148094504, + "grad_norm": 0.989122748374939, + "learning_rate": 0.00016586894601186805, + "loss": 3.089, + "step": 2784 + }, + { + "epoch": 0.2521046437946954, + "grad_norm": 0.8601987361907959, + "learning_rate": 0.0001658450661184931, + "loss": 3.0815, + "step": 2785 + }, + { + "epoch": 0.2521951661084457, + "grad_norm": 1.0197594165802002, + "learning_rate": 0.00016582117959452184, + "loss": 3.129, + "step": 2786 + }, + { + "epoch": 0.25228568842219606, + "grad_norm": 0.9159774780273438, + "learning_rate": 0.00016579728644235955, + "loss": 3.1056, + "step": 2787 + }, + { + "epoch": 0.2523762107359464, + "grad_norm": 0.8355146050453186, + "learning_rate": 0.00016577338666441232, + "loss": 3.1091, + "step": 2788 + }, + { + "epoch": 0.25246673304969675, + "grad_norm": 0.92014080286026, + "learning_rate": 0.00016574948026308687, + "loss": 3.1115, + "step": 2789 + }, + { + "epoch": 0.2525572553634471, + "grad_norm": 0.9154512882232666, + "learning_rate": 0.00016572556724079056, + "loss": 3.1172, + "step": 2790 + }, + { + "epoch": 0.2526477776771974, + "grad_norm": 0.859870970249176, + "learning_rate": 0.00016570164759993142, + "loss": 3.0903, + "step": 2791 + }, + { + "epoch": 0.25273829999094777, + "grad_norm": 0.946029782295227, + "learning_rate": 0.00016567772134291816, + "loss": 3.0855, + "step": 2792 + }, + { + "epoch": 0.2528288223046981, + "grad_norm": 0.9203336834907532, + "learning_rate": 0.00016565378847216016, + "loss": 3.0622, + "step": 2793 + }, + { + "epoch": 0.25291934461844845, + "grad_norm": 0.9952797293663025, + "learning_rate": 0.00016562984899006744, + "loss": 3.0854, + "step": 2794 + }, + { + "epoch": 0.2530098669321988, + "grad_norm": 0.849673330783844, + "learning_rate": 0.00016560590289905073, + "loss": 3.0319, + "step": 2795 + }, + { + "epoch": 0.25310038924594913, + "grad_norm": 0.857610821723938, + "learning_rate": 0.0001655819502015214, + "loss": 3.0494, + "step": 2796 + }, + { + "epoch": 0.25319091155969947, + "grad_norm": 1.0628398656845093, + "learning_rate": 0.0001655579908998915, + "loss": 3.1293, + "step": 2797 + }, + { + "epoch": 0.2532814338734498, + "grad_norm": 0.8674313426017761, + "learning_rate": 0.0001655340249965737, + "loss": 3.0552, + "step": 2798 + }, + { + "epoch": 0.25337195618720015, + "grad_norm": 0.9114541411399841, + "learning_rate": 0.00016551005249398134, + "loss": 3.0008, + "step": 2799 + }, + { + "epoch": 0.2534624785009505, + "grad_norm": 0.9368698000907898, + "learning_rate": 0.00016548607339452853, + "loss": 3.0419, + "step": 2800 + }, + { + "epoch": 0.25355300081470084, + "grad_norm": 0.9874829649925232, + "learning_rate": 0.0001654620877006299, + "loss": 3.0858, + "step": 2801 + }, + { + "epoch": 0.2536435231284512, + "grad_norm": 0.9258869290351868, + "learning_rate": 0.00016543809541470084, + "loss": 3.0617, + "step": 2802 + }, + { + "epoch": 0.2537340454422015, + "grad_norm": 0.924634575843811, + "learning_rate": 0.00016541409653915735, + "loss": 3.0395, + "step": 2803 + }, + { + "epoch": 0.25382456775595186, + "grad_norm": 0.8764649033546448, + "learning_rate": 0.00016539009107641612, + "loss": 3.0184, + "step": 2804 + }, + { + "epoch": 0.2539150900697022, + "grad_norm": 0.8432656526565552, + "learning_rate": 0.00016536607902889453, + "loss": 3.0731, + "step": 2805 + }, + { + "epoch": 0.25400561238345254, + "grad_norm": 0.8797190189361572, + "learning_rate": 0.00016534206039901057, + "loss": 3.1247, + "step": 2806 + }, + { + "epoch": 0.2540961346972029, + "grad_norm": 0.9437577128410339, + "learning_rate": 0.00016531803518918289, + "loss": 3.0625, + "step": 2807 + }, + { + "epoch": 0.2541866570109532, + "grad_norm": 0.8880354166030884, + "learning_rate": 0.00016529400340183087, + "loss": 3.0751, + "step": 2808 + }, + { + "epoch": 0.25427717932470356, + "grad_norm": 0.8828173875808716, + "learning_rate": 0.00016526996503937448, + "loss": 3.0696, + "step": 2809 + }, + { + "epoch": 0.2543677016384539, + "grad_norm": 0.9315621852874756, + "learning_rate": 0.00016524592010423443, + "loss": 3.0851, + "step": 2810 + }, + { + "epoch": 0.25445822395220424, + "grad_norm": 0.8461422324180603, + "learning_rate": 0.000165221868598832, + "loss": 3.0212, + "step": 2811 + }, + { + "epoch": 0.2545487462659546, + "grad_norm": 0.902062714099884, + "learning_rate": 0.00016519781052558917, + "loss": 3.0147, + "step": 2812 + }, + { + "epoch": 0.2546392685797049, + "grad_norm": 0.9265572428703308, + "learning_rate": 0.0001651737458869286, + "loss": 3.0852, + "step": 2813 + }, + { + "epoch": 0.2547297908934552, + "grad_norm": 1.1529960632324219, + "learning_rate": 0.00016514967468527365, + "loss": 3.057, + "step": 2814 + }, + { + "epoch": 0.25482031320720555, + "grad_norm": 0.894597589969635, + "learning_rate": 0.0001651255969230482, + "loss": 2.9886, + "step": 2815 + }, + { + "epoch": 0.2549108355209559, + "grad_norm": 0.9490203261375427, + "learning_rate": 0.00016510151260267693, + "loss": 3.0935, + "step": 2816 + }, + { + "epoch": 0.25500135783470623, + "grad_norm": 0.8473650813102722, + "learning_rate": 0.0001650774217265851, + "loss": 3.1135, + "step": 2817 + }, + { + "epoch": 0.2550918801484566, + "grad_norm": 0.913254976272583, + "learning_rate": 0.0001650533242971987, + "loss": 3.1171, + "step": 2818 + }, + { + "epoch": 0.2551824024622069, + "grad_norm": 0.9239524006843567, + "learning_rate": 0.00016502922031694434, + "loss": 3.0479, + "step": 2819 + }, + { + "epoch": 0.25527292477595726, + "grad_norm": 0.9490577578544617, + "learning_rate": 0.00016500510978824926, + "loss": 3.0795, + "step": 2820 + }, + { + "epoch": 0.2553634470897076, + "grad_norm": 1.005797028541565, + "learning_rate": 0.0001649809927135414, + "loss": 3.0178, + "step": 2821 + }, + { + "epoch": 0.25545396940345794, + "grad_norm": 0.8729204535484314, + "learning_rate": 0.00016495686909524934, + "loss": 3.113, + "step": 2822 + }, + { + "epoch": 0.2555444917172083, + "grad_norm": 0.9689465165138245, + "learning_rate": 0.00016493273893580237, + "loss": 3.1293, + "step": 2823 + }, + { + "epoch": 0.2556350140309586, + "grad_norm": 0.9394911527633667, + "learning_rate": 0.00016490860223763036, + "loss": 2.996, + "step": 2824 + }, + { + "epoch": 0.25572553634470896, + "grad_norm": 1.1007540225982666, + "learning_rate": 0.00016488445900316386, + "loss": 3.0747, + "step": 2825 + }, + { + "epoch": 0.2558160586584593, + "grad_norm": 0.8632012009620667, + "learning_rate": 0.00016486030923483413, + "loss": 3.0662, + "step": 2826 + }, + { + "epoch": 0.25590658097220964, + "grad_norm": 0.9733211994171143, + "learning_rate": 0.00016483615293507304, + "loss": 3.0925, + "step": 2827 + }, + { + "epoch": 0.25599710328596, + "grad_norm": 0.9308933019638062, + "learning_rate": 0.0001648119901063131, + "loss": 3.0849, + "step": 2828 + }, + { + "epoch": 0.2560876255997103, + "grad_norm": 1.0638294219970703, + "learning_rate": 0.00016478782075098755, + "loss": 3.0717, + "step": 2829 + }, + { + "epoch": 0.25617814791346066, + "grad_norm": 0.9067211151123047, + "learning_rate": 0.00016476364487153023, + "loss": 3.0821, + "step": 2830 + }, + { + "epoch": 0.256268670227211, + "grad_norm": 0.9581461548805237, + "learning_rate": 0.00016473946247037562, + "loss": 3.0636, + "step": 2831 + }, + { + "epoch": 0.25635919254096134, + "grad_norm": 0.9932011365890503, + "learning_rate": 0.0001647152735499589, + "loss": 3.0906, + "step": 2832 + }, + { + "epoch": 0.2564497148547117, + "grad_norm": 0.9417665004730225, + "learning_rate": 0.00016469107811271593, + "loss": 3.0517, + "step": 2833 + }, + { + "epoch": 0.256540237168462, + "grad_norm": 1.0941578149795532, + "learning_rate": 0.00016466687616108316, + "loss": 3.0603, + "step": 2834 + }, + { + "epoch": 0.25663075948221237, + "grad_norm": 0.9311975836753845, + "learning_rate": 0.00016464266769749774, + "loss": 3.0018, + "step": 2835 + }, + { + "epoch": 0.2567212817959627, + "grad_norm": 1.0369527339935303, + "learning_rate": 0.00016461845272439741, + "loss": 3.0921, + "step": 2836 + }, + { + "epoch": 0.25681180410971305, + "grad_norm": 0.8587122559547424, + "learning_rate": 0.00016459423124422072, + "loss": 3.0498, + "step": 2837 + }, + { + "epoch": 0.2569023264234634, + "grad_norm": 0.8985700011253357, + "learning_rate": 0.00016457000325940667, + "loss": 3.088, + "step": 2838 + }, + { + "epoch": 0.25699284873721373, + "grad_norm": 0.8981794118881226, + "learning_rate": 0.00016454576877239507, + "loss": 3.0898, + "step": 2839 + }, + { + "epoch": 0.25708337105096407, + "grad_norm": 0.9229661226272583, + "learning_rate": 0.0001645215277856263, + "loss": 3.0688, + "step": 2840 + }, + { + "epoch": 0.2571738933647144, + "grad_norm": 0.9163200259208679, + "learning_rate": 0.00016449728030154147, + "loss": 3.0581, + "step": 2841 + }, + { + "epoch": 0.25726441567846475, + "grad_norm": 0.8850761651992798, + "learning_rate": 0.0001644730263225823, + "loss": 3.002, + "step": 2842 + }, + { + "epoch": 0.2573549379922151, + "grad_norm": 0.9454545378684998, + "learning_rate": 0.00016444876585119112, + "loss": 3.056, + "step": 2843 + }, + { + "epoch": 0.25744546030596543, + "grad_norm": 0.9575306177139282, + "learning_rate": 0.00016442449888981098, + "loss": 3.0659, + "step": 2844 + }, + { + "epoch": 0.2575359826197158, + "grad_norm": 0.8770989775657654, + "learning_rate": 0.00016440022544088553, + "loss": 3.0354, + "step": 2845 + }, + { + "epoch": 0.2576265049334661, + "grad_norm": 0.899128258228302, + "learning_rate": 0.0001643759455068592, + "loss": 3.1051, + "step": 2846 + }, + { + "epoch": 0.25771702724721646, + "grad_norm": 0.9273167252540588, + "learning_rate": 0.00016435165909017688, + "loss": 3.062, + "step": 2847 + }, + { + "epoch": 0.2578075495609668, + "grad_norm": 0.8725229501724243, + "learning_rate": 0.00016432736619328425, + "loss": 3.0726, + "step": 2848 + }, + { + "epoch": 0.25789807187471714, + "grad_norm": 0.8743504285812378, + "learning_rate": 0.00016430306681862765, + "loss": 3.0016, + "step": 2849 + }, + { + "epoch": 0.2579885941884675, + "grad_norm": 0.894152820110321, + "learning_rate": 0.00016427876096865394, + "loss": 3.1186, + "step": 2850 + }, + { + "epoch": 0.2580791165022178, + "grad_norm": 0.8980875015258789, + "learning_rate": 0.00016425444864581076, + "loss": 3.1181, + "step": 2851 + }, + { + "epoch": 0.25816963881596816, + "grad_norm": 1.2219386100769043, + "learning_rate": 0.00016423012985254638, + "loss": 3.1049, + "step": 2852 + }, + { + "epoch": 0.2582601611297185, + "grad_norm": 0.9017558693885803, + "learning_rate": 0.00016420580459130965, + "loss": 3.055, + "step": 2853 + }, + { + "epoch": 0.25835068344346884, + "grad_norm": 0.9594643115997314, + "learning_rate": 0.0001641814728645502, + "loss": 3.0854, + "step": 2854 + }, + { + "epoch": 0.25844120575721913, + "grad_norm": 0.8878849148750305, + "learning_rate": 0.00016415713467471816, + "loss": 3.0405, + "step": 2855 + }, + { + "epoch": 0.25853172807096947, + "grad_norm": 0.8501585721969604, + "learning_rate": 0.0001641327900242644, + "loss": 3.0943, + "step": 2856 + }, + { + "epoch": 0.2586222503847198, + "grad_norm": 0.8996968865394592, + "learning_rate": 0.00016410843891564048, + "loss": 3.0927, + "step": 2857 + }, + { + "epoch": 0.25871277269847015, + "grad_norm": 0.9761808514595032, + "learning_rate": 0.0001640840813512985, + "loss": 3.1048, + "step": 2858 + }, + { + "epoch": 0.2588032950122205, + "grad_norm": 0.8404366970062256, + "learning_rate": 0.00016405971733369126, + "loss": 3.0006, + "step": 2859 + }, + { + "epoch": 0.25889381732597083, + "grad_norm": 0.8861578702926636, + "learning_rate": 0.00016403534686527225, + "loss": 3.0375, + "step": 2860 + }, + { + "epoch": 0.2589843396397212, + "grad_norm": 0.8492283225059509, + "learning_rate": 0.00016401096994849557, + "loss": 3.0891, + "step": 2861 + }, + { + "epoch": 0.2590748619534715, + "grad_norm": 0.9136780500411987, + "learning_rate": 0.00016398658658581594, + "loss": 3.0869, + "step": 2862 + }, + { + "epoch": 0.25916538426722185, + "grad_norm": 0.903567910194397, + "learning_rate": 0.00016396219677968882, + "loss": 3.0415, + "step": 2863 + }, + { + "epoch": 0.2592559065809722, + "grad_norm": 0.8297886848449707, + "learning_rate": 0.0001639378005325702, + "loss": 3.0239, + "step": 2864 + }, + { + "epoch": 0.25934642889472254, + "grad_norm": 0.8206044435501099, + "learning_rate": 0.00016391339784691686, + "loss": 3.0838, + "step": 2865 + }, + { + "epoch": 0.2594369512084729, + "grad_norm": 0.8900150656700134, + "learning_rate": 0.0001638889887251861, + "loss": 3.0946, + "step": 2866 + }, + { + "epoch": 0.2595274735222232, + "grad_norm": 0.8471210598945618, + "learning_rate": 0.0001638645731698359, + "loss": 2.9901, + "step": 2867 + }, + { + "epoch": 0.25961799583597356, + "grad_norm": 0.8738295435905457, + "learning_rate": 0.00016384015118332495, + "loss": 3.0481, + "step": 2868 + }, + { + "epoch": 0.2597085181497239, + "grad_norm": 0.805209219455719, + "learning_rate": 0.00016381572276811252, + "loss": 3.0732, + "step": 2869 + }, + { + "epoch": 0.25979904046347424, + "grad_norm": 0.8056511282920837, + "learning_rate": 0.00016379128792665855, + "loss": 3.0464, + "step": 2870 + }, + { + "epoch": 0.2598895627772246, + "grad_norm": 0.8855249881744385, + "learning_rate": 0.00016376684666142362, + "loss": 3.0513, + "step": 2871 + }, + { + "epoch": 0.2599800850909749, + "grad_norm": 0.8531777858734131, + "learning_rate": 0.000163742398974869, + "loss": 3.0807, + "step": 2872 + }, + { + "epoch": 0.26007060740472526, + "grad_norm": 0.8292708396911621, + "learning_rate": 0.0001637179448694565, + "loss": 3.1046, + "step": 2873 + }, + { + "epoch": 0.2601611297184756, + "grad_norm": 0.8251423239707947, + "learning_rate": 0.00016369348434764876, + "loss": 3.0695, + "step": 2874 + }, + { + "epoch": 0.26025165203222594, + "grad_norm": 0.938856303691864, + "learning_rate": 0.00016366901741190882, + "loss": 3.0725, + "step": 2875 + }, + { + "epoch": 0.2603421743459763, + "grad_norm": 0.9786975979804993, + "learning_rate": 0.00016364454406470063, + "loss": 3.0873, + "step": 2876 + }, + { + "epoch": 0.2604326966597266, + "grad_norm": 0.8554605841636658, + "learning_rate": 0.00016362006430848856, + "loss": 3.1021, + "step": 2877 + }, + { + "epoch": 0.26052321897347697, + "grad_norm": 1.1426806449890137, + "learning_rate": 0.00016359557814573777, + "loss": 3.0669, + "step": 2878 + }, + { + "epoch": 0.2606137412872273, + "grad_norm": 0.8070796728134155, + "learning_rate": 0.00016357108557891401, + "loss": 3.1003, + "step": 2879 + }, + { + "epoch": 0.26070426360097765, + "grad_norm": 1.416195034980774, + "learning_rate": 0.00016354658661048364, + "loss": 3.0877, + "step": 2880 + }, + { + "epoch": 0.260794785914728, + "grad_norm": 0.86333167552948, + "learning_rate": 0.00016352208124291375, + "loss": 3.0707, + "step": 2881 + }, + { + "epoch": 0.26088530822847833, + "grad_norm": 0.8207772374153137, + "learning_rate": 0.000163497569478672, + "loss": 3.09, + "step": 2882 + }, + { + "epoch": 0.26097583054222867, + "grad_norm": 0.9560832381248474, + "learning_rate": 0.00016347305132022677, + "loss": 3.0405, + "step": 2883 + }, + { + "epoch": 0.261066352855979, + "grad_norm": 0.9158850312232971, + "learning_rate": 0.000163448526770047, + "loss": 3.1062, + "step": 2884 + }, + { + "epoch": 0.26115687516972935, + "grad_norm": 0.9456655383110046, + "learning_rate": 0.00016342399583060233, + "loss": 3.0498, + "step": 2885 + }, + { + "epoch": 0.2612473974834797, + "grad_norm": 0.9813029766082764, + "learning_rate": 0.000163399458504363, + "loss": 3.1253, + "step": 2886 + }, + { + "epoch": 0.26133791979723003, + "grad_norm": 0.9460070133209229, + "learning_rate": 0.00016337491479379994, + "loss": 3.1175, + "step": 2887 + }, + { + "epoch": 0.2614284421109804, + "grad_norm": 0.9098150730133057, + "learning_rate": 0.0001633503647013847, + "loss": 3.0655, + "step": 2888 + }, + { + "epoch": 0.2615189644247307, + "grad_norm": 0.9258887767791748, + "learning_rate": 0.00016332580822958946, + "loss": 3.0744, + "step": 2889 + }, + { + "epoch": 0.26160948673848106, + "grad_norm": 0.8709380626678467, + "learning_rate": 0.00016330124538088705, + "loss": 3.0591, + "step": 2890 + }, + { + "epoch": 0.2617000090522314, + "grad_norm": 0.8492916226387024, + "learning_rate": 0.00016327667615775098, + "loss": 3.0236, + "step": 2891 + }, + { + "epoch": 0.26179053136598174, + "grad_norm": 0.9202527403831482, + "learning_rate": 0.00016325210056265536, + "loss": 2.9596, + "step": 2892 + }, + { + "epoch": 0.2618810536797321, + "grad_norm": 0.8801531195640564, + "learning_rate": 0.0001632275185980749, + "loss": 3.0367, + "step": 2893 + }, + { + "epoch": 0.2619715759934824, + "grad_norm": 0.9154375195503235, + "learning_rate": 0.0001632029302664851, + "loss": 3.0255, + "step": 2894 + }, + { + "epoch": 0.26206209830723276, + "grad_norm": 0.9271922707557678, + "learning_rate": 0.00016317833557036193, + "loss": 3.033, + "step": 2895 + }, + { + "epoch": 0.26215262062098305, + "grad_norm": 1.0529191493988037, + "learning_rate": 0.0001631537345121821, + "loss": 3.0615, + "step": 2896 + }, + { + "epoch": 0.2622431429347334, + "grad_norm": 1.1552090644836426, + "learning_rate": 0.00016312912709442293, + "loss": 3.0434, + "step": 2897 + }, + { + "epoch": 0.2623336652484837, + "grad_norm": 1.091846227645874, + "learning_rate": 0.00016310451331956238, + "loss": 2.9952, + "step": 2898 + }, + { + "epoch": 0.26242418756223407, + "grad_norm": 1.0549595355987549, + "learning_rate": 0.00016307989319007908, + "loss": 3.0755, + "step": 2899 + }, + { + "epoch": 0.2625147098759844, + "grad_norm": 1.1350163221359253, + "learning_rate": 0.00016305526670845226, + "loss": 3.071, + "step": 2900 + }, + { + "epoch": 0.26260523218973475, + "grad_norm": 0.9817744493484497, + "learning_rate": 0.0001630306338771618, + "loss": 3.0365, + "step": 2901 + }, + { + "epoch": 0.2626957545034851, + "grad_norm": 1.173923134803772, + "learning_rate": 0.00016300599469868825, + "loss": 3.0985, + "step": 2902 + }, + { + "epoch": 0.26278627681723543, + "grad_norm": 0.8843302130699158, + "learning_rate": 0.00016298134917551273, + "loss": 2.9988, + "step": 2903 + }, + { + "epoch": 0.26287679913098577, + "grad_norm": 0.9259403347969055, + "learning_rate": 0.00016295669731011708, + "loss": 3.0112, + "step": 2904 + }, + { + "epoch": 0.2629673214447361, + "grad_norm": 0.9056614637374878, + "learning_rate": 0.00016293203910498376, + "loss": 3.0118, + "step": 2905 + }, + { + "epoch": 0.26305784375848645, + "grad_norm": 0.919861912727356, + "learning_rate": 0.0001629073745625958, + "loss": 3.0115, + "step": 2906 + }, + { + "epoch": 0.2631483660722368, + "grad_norm": 0.8886492848396301, + "learning_rate": 0.00016288270368543698, + "loss": 3.075, + "step": 2907 + }, + { + "epoch": 0.26323888838598714, + "grad_norm": 0.842409610748291, + "learning_rate": 0.00016285802647599156, + "loss": 3.08, + "step": 2908 + }, + { + "epoch": 0.2633294106997375, + "grad_norm": 0.838261067867279, + "learning_rate": 0.00016283334293674463, + "loss": 3.0069, + "step": 2909 + }, + { + "epoch": 0.2634199330134878, + "grad_norm": 0.8960436582565308, + "learning_rate": 0.00016280865307018177, + "loss": 3.0596, + "step": 2910 + }, + { + "epoch": 0.26351045532723816, + "grad_norm": 0.9022462964057922, + "learning_rate": 0.00016278395687878926, + "loss": 3.0169, + "step": 2911 + }, + { + "epoch": 0.2636009776409885, + "grad_norm": 1.000325322151184, + "learning_rate": 0.000162759254365054, + "loss": 3.051, + "step": 2912 + }, + { + "epoch": 0.26369149995473884, + "grad_norm": 0.8697109222412109, + "learning_rate": 0.00016273454553146358, + "loss": 3.046, + "step": 2913 + }, + { + "epoch": 0.2637820222684892, + "grad_norm": 0.9372190833091736, + "learning_rate": 0.00016270983038050614, + "loss": 2.9794, + "step": 2914 + }, + { + "epoch": 0.2638725445822395, + "grad_norm": 0.8693639039993286, + "learning_rate": 0.00016268510891467045, + "loss": 3.0849, + "step": 2915 + }, + { + "epoch": 0.26396306689598986, + "grad_norm": 0.8379819393157959, + "learning_rate": 0.00016266038113644607, + "loss": 3.0528, + "step": 2916 + }, + { + "epoch": 0.2640535892097402, + "grad_norm": 0.9772222638130188, + "learning_rate": 0.00016263564704832297, + "loss": 3.0461, + "step": 2917 + }, + { + "epoch": 0.26414411152349054, + "grad_norm": 0.941190242767334, + "learning_rate": 0.00016261090665279198, + "loss": 3.0726, + "step": 2918 + }, + { + "epoch": 0.2642346338372409, + "grad_norm": 0.8939282894134521, + "learning_rate": 0.00016258615995234433, + "loss": 3.0199, + "step": 2919 + }, + { + "epoch": 0.2643251561509912, + "grad_norm": 0.8429410457611084, + "learning_rate": 0.00016256140694947217, + "loss": 3.0208, + "step": 2920 + }, + { + "epoch": 0.26441567846474157, + "grad_norm": 0.8292998671531677, + "learning_rate": 0.00016253664764666797, + "loss": 3.0057, + "step": 2921 + }, + { + "epoch": 0.2645062007784919, + "grad_norm": 0.8581786751747131, + "learning_rate": 0.0001625118820464251, + "loss": 3.11, + "step": 2922 + }, + { + "epoch": 0.26459672309224225, + "grad_norm": 0.8744654059410095, + "learning_rate": 0.00016248711015123742, + "loss": 3.0704, + "step": 2923 + }, + { + "epoch": 0.2646872454059926, + "grad_norm": 0.9087191820144653, + "learning_rate": 0.00016246233196359942, + "loss": 3.0857, + "step": 2924 + }, + { + "epoch": 0.26477776771974293, + "grad_norm": 0.8340759873390198, + "learning_rate": 0.00016243754748600635, + "loss": 3.0451, + "step": 2925 + }, + { + "epoch": 0.26486829003349327, + "grad_norm": 0.8676270246505737, + "learning_rate": 0.00016241275672095395, + "loss": 2.9594, + "step": 2926 + }, + { + "epoch": 0.2649588123472436, + "grad_norm": 0.9136918187141418, + "learning_rate": 0.00016238795967093864, + "loss": 3.0149, + "step": 2927 + }, + { + "epoch": 0.26504933466099395, + "grad_norm": 0.9369771480560303, + "learning_rate": 0.00016236315633845748, + "loss": 3.131, + "step": 2928 + }, + { + "epoch": 0.2651398569747443, + "grad_norm": 0.8263919353485107, + "learning_rate": 0.00016233834672600822, + "loss": 2.9978, + "step": 2929 + }, + { + "epoch": 0.26523037928849463, + "grad_norm": 0.8877750635147095, + "learning_rate": 0.00016231353083608912, + "loss": 3.0539, + "step": 2930 + }, + { + "epoch": 0.265320901602245, + "grad_norm": 0.860744059085846, + "learning_rate": 0.00016228870867119919, + "loss": 3.0041, + "step": 2931 + }, + { + "epoch": 0.2654114239159953, + "grad_norm": 0.9063569903373718, + "learning_rate": 0.000162263880233838, + "loss": 3.0642, + "step": 2932 + }, + { + "epoch": 0.26550194622974566, + "grad_norm": 0.8420007228851318, + "learning_rate": 0.00016223904552650575, + "loss": 3.0962, + "step": 2933 + }, + { + "epoch": 0.265592468543496, + "grad_norm": 0.939404308795929, + "learning_rate": 0.00016221420455170336, + "loss": 3.0918, + "step": 2934 + }, + { + "epoch": 0.26568299085724634, + "grad_norm": 0.9313684105873108, + "learning_rate": 0.00016218935731193224, + "loss": 3.1075, + "step": 2935 + }, + { + "epoch": 0.2657735131709967, + "grad_norm": 0.8911711573600769, + "learning_rate": 0.00016216450380969455, + "loss": 3.0071, + "step": 2936 + }, + { + "epoch": 0.26586403548474696, + "grad_norm": 0.8708754181861877, + "learning_rate": 0.00016213964404749302, + "loss": 3.0248, + "step": 2937 + }, + { + "epoch": 0.2659545577984973, + "grad_norm": 0.8700071573257446, + "learning_rate": 0.00016211477802783103, + "loss": 3.0178, + "step": 2938 + }, + { + "epoch": 0.26604508011224764, + "grad_norm": 0.8768590688705444, + "learning_rate": 0.00016208990575321264, + "loss": 3.0197, + "step": 2939 + }, + { + "epoch": 0.266135602425998, + "grad_norm": 0.9197354912757874, + "learning_rate": 0.00016206502722614238, + "loss": 2.9952, + "step": 2940 + }, + { + "epoch": 0.2662261247397483, + "grad_norm": 0.8466536998748779, + "learning_rate": 0.00016204014244912562, + "loss": 2.9759, + "step": 2941 + }, + { + "epoch": 0.26631664705349867, + "grad_norm": 0.9619039297103882, + "learning_rate": 0.00016201525142466817, + "loss": 3.0296, + "step": 2942 + }, + { + "epoch": 0.266407169367249, + "grad_norm": 0.9434160590171814, + "learning_rate": 0.00016199035415527662, + "loss": 3.0937, + "step": 2943 + }, + { + "epoch": 0.26649769168099935, + "grad_norm": 0.861297070980072, + "learning_rate": 0.00016196545064345812, + "loss": 3.0342, + "step": 2944 + }, + { + "epoch": 0.2665882139947497, + "grad_norm": 1.038414478302002, + "learning_rate": 0.00016194054089172042, + "loss": 3.0606, + "step": 2945 + }, + { + "epoch": 0.26667873630850003, + "grad_norm": 0.9113991856575012, + "learning_rate": 0.00016191562490257196, + "loss": 3.1049, + "step": 2946 + }, + { + "epoch": 0.26676925862225037, + "grad_norm": 0.9192466735839844, + "learning_rate": 0.00016189070267852174, + "loss": 3.0546, + "step": 2947 + }, + { + "epoch": 0.2668597809360007, + "grad_norm": 1.0234183073043823, + "learning_rate": 0.00016186577422207946, + "loss": 3.0497, + "step": 2948 + }, + { + "epoch": 0.26695030324975105, + "grad_norm": 0.896352231502533, + "learning_rate": 0.0001618408395357554, + "loss": 3.0133, + "step": 2949 + }, + { + "epoch": 0.2670408255635014, + "grad_norm": 1.0323007106781006, + "learning_rate": 0.00016181589862206052, + "loss": 3.1426, + "step": 2950 + }, + { + "epoch": 0.26713134787725173, + "grad_norm": 0.8639165163040161, + "learning_rate": 0.00016179095148350633, + "loss": 3.0012, + "step": 2951 + }, + { + "epoch": 0.2672218701910021, + "grad_norm": 0.9051254987716675, + "learning_rate": 0.00016176599812260501, + "loss": 3.0118, + "step": 2952 + }, + { + "epoch": 0.2673123925047524, + "grad_norm": 0.844688892364502, + "learning_rate": 0.0001617410385418694, + "loss": 2.9979, + "step": 2953 + }, + { + "epoch": 0.26740291481850276, + "grad_norm": 0.9484454989433289, + "learning_rate": 0.00016171607274381285, + "loss": 3.0809, + "step": 2954 + }, + { + "epoch": 0.2674934371322531, + "grad_norm": 0.8286004662513733, + "learning_rate": 0.00016169110073094948, + "loss": 3.0456, + "step": 2955 + }, + { + "epoch": 0.26758395944600344, + "grad_norm": 0.8905054926872253, + "learning_rate": 0.00016166612250579395, + "loss": 3.1231, + "step": 2956 + }, + { + "epoch": 0.2676744817597538, + "grad_norm": 0.9593135118484497, + "learning_rate": 0.0001616411380708616, + "loss": 3.0714, + "step": 2957 + }, + { + "epoch": 0.2677650040735041, + "grad_norm": 0.9497634172439575, + "learning_rate": 0.00016161614742866832, + "loss": 3.0306, + "step": 2958 + }, + { + "epoch": 0.26785552638725446, + "grad_norm": 1.0392967462539673, + "learning_rate": 0.00016159115058173064, + "loss": 2.9926, + "step": 2959 + }, + { + "epoch": 0.2679460487010048, + "grad_norm": 0.874346137046814, + "learning_rate": 0.0001615661475325658, + "loss": 3.0302, + "step": 2960 + }, + { + "epoch": 0.26803657101475514, + "grad_norm": 1.0402164459228516, + "learning_rate": 0.00016154113828369164, + "loss": 3.0318, + "step": 2961 + }, + { + "epoch": 0.2681270933285055, + "grad_norm": 0.9201018810272217, + "learning_rate": 0.00016151612283762652, + "loss": 3.1107, + "step": 2962 + }, + { + "epoch": 0.2682176156422558, + "grad_norm": 0.9608359932899475, + "learning_rate": 0.00016149110119688948, + "loss": 3.0355, + "step": 2963 + }, + { + "epoch": 0.26830813795600617, + "grad_norm": 0.9042795896530151, + "learning_rate": 0.00016146607336400023, + "loss": 3.0887, + "step": 2964 + }, + { + "epoch": 0.2683986602697565, + "grad_norm": 0.9694705009460449, + "learning_rate": 0.0001614410393414791, + "loss": 2.9801, + "step": 2965 + }, + { + "epoch": 0.26848918258350685, + "grad_norm": 0.831889271736145, + "learning_rate": 0.00016141599913184699, + "loss": 3.0268, + "step": 2966 + }, + { + "epoch": 0.2685797048972572, + "grad_norm": 1.1728935241699219, + "learning_rate": 0.00016139095273762544, + "loss": 3.1047, + "step": 2967 + }, + { + "epoch": 0.26867022721100753, + "grad_norm": 0.8189383745193481, + "learning_rate": 0.00016136590016133662, + "loss": 3.0057, + "step": 2968 + }, + { + "epoch": 0.26876074952475787, + "grad_norm": 1.0662305355072021, + "learning_rate": 0.00016134084140550335, + "loss": 3.0184, + "step": 2969 + }, + { + "epoch": 0.2688512718385082, + "grad_norm": 0.9276952743530273, + "learning_rate": 0.00016131577647264902, + "loss": 3.0401, + "step": 2970 + }, + { + "epoch": 0.26894179415225855, + "grad_norm": 0.9908543825149536, + "learning_rate": 0.00016129070536529766, + "loss": 3.0723, + "step": 2971 + }, + { + "epoch": 0.2690323164660089, + "grad_norm": 1.025470495223999, + "learning_rate": 0.00016126562808597397, + "loss": 3.0269, + "step": 2972 + }, + { + "epoch": 0.26912283877975923, + "grad_norm": 0.9223266243934631, + "learning_rate": 0.0001612405446372032, + "loss": 3.1187, + "step": 2973 + }, + { + "epoch": 0.2692133610935096, + "grad_norm": 0.924764096736908, + "learning_rate": 0.00016121545502151125, + "loss": 3.0623, + "step": 2974 + }, + { + "epoch": 0.2693038834072599, + "grad_norm": 0.8277813792228699, + "learning_rate": 0.00016119035924142466, + "loss": 3.027, + "step": 2975 + }, + { + "epoch": 0.26939440572101025, + "grad_norm": 0.8711262345314026, + "learning_rate": 0.00016116525729947058, + "loss": 3.039, + "step": 2976 + }, + { + "epoch": 0.2694849280347606, + "grad_norm": 0.9595316052436829, + "learning_rate": 0.00016114014919817678, + "loss": 3.0453, + "step": 2977 + }, + { + "epoch": 0.2695754503485109, + "grad_norm": 0.9323618412017822, + "learning_rate": 0.0001611150349400716, + "loss": 3.0467, + "step": 2978 + }, + { + "epoch": 0.2696659726622612, + "grad_norm": 0.9700012803077698, + "learning_rate": 0.00016108991452768408, + "loss": 3.027, + "step": 2979 + }, + { + "epoch": 0.26975649497601156, + "grad_norm": 1.0660173892974854, + "learning_rate": 0.00016106478796354382, + "loss": 3.1133, + "step": 2980 + }, + { + "epoch": 0.2698470172897619, + "grad_norm": 0.8464480638504028, + "learning_rate": 0.00016103965525018113, + "loss": 3.0249, + "step": 2981 + }, + { + "epoch": 0.26993753960351224, + "grad_norm": 1.045445203781128, + "learning_rate": 0.0001610145163901268, + "loss": 3.0867, + "step": 2982 + }, + { + "epoch": 0.2700280619172626, + "grad_norm": 0.9046902656555176, + "learning_rate": 0.0001609893713859123, + "loss": 3.0284, + "step": 2983 + }, + { + "epoch": 0.2701185842310129, + "grad_norm": 0.8621628880500793, + "learning_rate": 0.00016096422024006982, + "loss": 3.0301, + "step": 2984 + }, + { + "epoch": 0.27020910654476327, + "grad_norm": 1.079614520072937, + "learning_rate": 0.00016093906295513202, + "loss": 3.0707, + "step": 2985 + }, + { + "epoch": 0.2702996288585136, + "grad_norm": 0.805829644203186, + "learning_rate": 0.00016091389953363226, + "loss": 3.0931, + "step": 2986 + }, + { + "epoch": 0.27039015117226395, + "grad_norm": 0.9469945430755615, + "learning_rate": 0.00016088872997810445, + "loss": 3.121, + "step": 2987 + }, + { + "epoch": 0.2704806734860143, + "grad_norm": 0.8542191386222839, + "learning_rate": 0.0001608635542910832, + "loss": 3.0177, + "step": 2988 + }, + { + "epoch": 0.27057119579976463, + "grad_norm": 0.8838629126548767, + "learning_rate": 0.00016083837247510375, + "loss": 3.0072, + "step": 2989 + }, + { + "epoch": 0.27066171811351497, + "grad_norm": 1.1052781343460083, + "learning_rate": 0.0001608131845327018, + "loss": 3.0693, + "step": 2990 + }, + { + "epoch": 0.2707522404272653, + "grad_norm": 0.8787950277328491, + "learning_rate": 0.00016078799046641385, + "loss": 3.0323, + "step": 2991 + }, + { + "epoch": 0.27084276274101565, + "grad_norm": 0.9619665741920471, + "learning_rate": 0.0001607627902787769, + "loss": 3.019, + "step": 2992 + }, + { + "epoch": 0.270933285054766, + "grad_norm": 0.943647027015686, + "learning_rate": 0.00016073758397232868, + "loss": 3.0532, + "step": 2993 + }, + { + "epoch": 0.27102380736851633, + "grad_norm": 0.9765943884849548, + "learning_rate": 0.00016071237154960738, + "loss": 3.0638, + "step": 2994 + }, + { + "epoch": 0.2711143296822667, + "grad_norm": 0.8682699799537659, + "learning_rate": 0.00016068715301315197, + "loss": 3.0934, + "step": 2995 + }, + { + "epoch": 0.271204851996017, + "grad_norm": 0.9122071862220764, + "learning_rate": 0.00016066192836550184, + "loss": 3.0495, + "step": 2996 + }, + { + "epoch": 0.27129537430976736, + "grad_norm": 0.935530960559845, + "learning_rate": 0.00016063669760919727, + "loss": 3.0794, + "step": 2997 + }, + { + "epoch": 0.2713858966235177, + "grad_norm": 0.9900686740875244, + "learning_rate": 0.00016061146074677885, + "loss": 3.0998, + "step": 2998 + }, + { + "epoch": 0.27147641893726804, + "grad_norm": 0.9189563393592834, + "learning_rate": 0.000160586217780788, + "loss": 3.0788, + "step": 2999 + }, + { + "epoch": 0.2715669412510184, + "grad_norm": 0.8959607481956482, + "learning_rate": 0.00016056096871376667, + "loss": 2.97, + "step": 3000 + }, + { + "epoch": 0.2715669412510184, + "eval_loss": 2.978346109390259, + "eval_runtime": 71.5356, + "eval_samples_per_second": 37.785, + "eval_steps_per_second": 3.159, + "step": 3000 + }, + { + "epoch": 0.2716574635647687, + "grad_norm": 0.8900837302207947, + "learning_rate": 0.00016053571354825746, + "loss": 2.992, + "step": 3001 + }, + { + "epoch": 0.27174798587851906, + "grad_norm": 0.8718365430831909, + "learning_rate": 0.00016051045228680354, + "loss": 3.0593, + "step": 3002 + }, + { + "epoch": 0.2718385081922694, + "grad_norm": 0.9835093021392822, + "learning_rate": 0.00016048518493194878, + "loss": 3.0406, + "step": 3003 + }, + { + "epoch": 0.27192903050601974, + "grad_norm": 0.8958598971366882, + "learning_rate": 0.0001604599114862375, + "loss": 3.0482, + "step": 3004 + }, + { + "epoch": 0.2720195528197701, + "grad_norm": 0.9504435658454895, + "learning_rate": 0.0001604346319522148, + "loss": 3.0606, + "step": 3005 + }, + { + "epoch": 0.2721100751335204, + "grad_norm": 0.911404013633728, + "learning_rate": 0.00016040934633242634, + "loss": 3.1059, + "step": 3006 + }, + { + "epoch": 0.27220059744727076, + "grad_norm": 0.8670641183853149, + "learning_rate": 0.00016038405462941833, + "loss": 3.0696, + "step": 3007 + }, + { + "epoch": 0.2722911197610211, + "grad_norm": 0.8918884992599487, + "learning_rate": 0.00016035875684573765, + "loss": 3.0457, + "step": 3008 + }, + { + "epoch": 0.27238164207477145, + "grad_norm": 1.0598082542419434, + "learning_rate": 0.00016033345298393186, + "loss": 3.03, + "step": 3009 + }, + { + "epoch": 0.2724721643885218, + "grad_norm": 0.8834877014160156, + "learning_rate": 0.00016030814304654895, + "loss": 2.9908, + "step": 3010 + }, + { + "epoch": 0.27256268670227213, + "grad_norm": 0.9708331823348999, + "learning_rate": 0.00016028282703613772, + "loss": 3.0751, + "step": 3011 + }, + { + "epoch": 0.27265320901602247, + "grad_norm": 0.9399593472480774, + "learning_rate": 0.00016025750495524745, + "loss": 3.0301, + "step": 3012 + }, + { + "epoch": 0.2727437313297728, + "grad_norm": 0.8679122924804688, + "learning_rate": 0.0001602321768064281, + "loss": 3.0653, + "step": 3013 + }, + { + "epoch": 0.27283425364352315, + "grad_norm": 0.9808764457702637, + "learning_rate": 0.00016020684259223015, + "loss": 3.0514, + "step": 3014 + }, + { + "epoch": 0.2729247759572735, + "grad_norm": 0.8745496273040771, + "learning_rate": 0.00016018150231520486, + "loss": 3.0555, + "step": 3015 + }, + { + "epoch": 0.27301529827102383, + "grad_norm": 1.0140714645385742, + "learning_rate": 0.00016015615597790388, + "loss": 3.0285, + "step": 3016 + }, + { + "epoch": 0.2731058205847742, + "grad_norm": 0.9872329831123352, + "learning_rate": 0.00016013080358287963, + "loss": 3.0468, + "step": 3017 + }, + { + "epoch": 0.2731963428985245, + "grad_norm": 0.8958715200424194, + "learning_rate": 0.00016010544513268515, + "loss": 3.0561, + "step": 3018 + }, + { + "epoch": 0.2732868652122748, + "grad_norm": 0.9180740118026733, + "learning_rate": 0.00016008008062987398, + "loss": 3.0743, + "step": 3019 + }, + { + "epoch": 0.27337738752602514, + "grad_norm": 0.8599140048027039, + "learning_rate": 0.00016005471007700031, + "loss": 3.0579, + "step": 3020 + }, + { + "epoch": 0.2734679098397755, + "grad_norm": 0.9585762023925781, + "learning_rate": 0.00016002933347661902, + "loss": 3.0443, + "step": 3021 + }, + { + "epoch": 0.2735584321535258, + "grad_norm": 0.8619949221611023, + "learning_rate": 0.00016000395083128543, + "loss": 3.0021, + "step": 3022 + }, + { + "epoch": 0.27364895446727616, + "grad_norm": 0.9174975156784058, + "learning_rate": 0.00015997856214355565, + "loss": 2.9539, + "step": 3023 + }, + { + "epoch": 0.2737394767810265, + "grad_norm": 1.008433222770691, + "learning_rate": 0.00015995316741598633, + "loss": 3.0251, + "step": 3024 + }, + { + "epoch": 0.27382999909477684, + "grad_norm": 0.8638203740119934, + "learning_rate": 0.0001599277666511347, + "loss": 3.0041, + "step": 3025 + }, + { + "epoch": 0.2739205214085272, + "grad_norm": 0.9356272220611572, + "learning_rate": 0.0001599023598515586, + "loss": 3.0161, + "step": 3026 + }, + { + "epoch": 0.2740110437222775, + "grad_norm": 0.875857412815094, + "learning_rate": 0.00015987694701981647, + "loss": 3.049, + "step": 3027 + }, + { + "epoch": 0.27410156603602787, + "grad_norm": 0.9533547759056091, + "learning_rate": 0.00015985152815846745, + "loss": 3.0531, + "step": 3028 + }, + { + "epoch": 0.2741920883497782, + "grad_norm": 0.9074561595916748, + "learning_rate": 0.00015982610327007116, + "loss": 2.9959, + "step": 3029 + }, + { + "epoch": 0.27428261066352855, + "grad_norm": 0.8548593521118164, + "learning_rate": 0.00015980067235718792, + "loss": 3.0434, + "step": 3030 + }, + { + "epoch": 0.2743731329772789, + "grad_norm": 0.8892874121665955, + "learning_rate": 0.00015977523542237863, + "loss": 2.9971, + "step": 3031 + }, + { + "epoch": 0.27446365529102923, + "grad_norm": 0.9240937829017639, + "learning_rate": 0.0001597497924682047, + "loss": 3.1156, + "step": 3032 + }, + { + "epoch": 0.27455417760477957, + "grad_norm": 1.205907940864563, + "learning_rate": 0.0001597243434972284, + "loss": 2.9952, + "step": 3033 + }, + { + "epoch": 0.2746446999185299, + "grad_norm": 0.9608234763145447, + "learning_rate": 0.00015969888851201226, + "loss": 3.0614, + "step": 3034 + }, + { + "epoch": 0.27473522223228025, + "grad_norm": 0.9994444251060486, + "learning_rate": 0.0001596734275151197, + "loss": 3.0317, + "step": 3035 + }, + { + "epoch": 0.2748257445460306, + "grad_norm": 1.1104655265808105, + "learning_rate": 0.00015964796050911467, + "loss": 3.0698, + "step": 3036 + }, + { + "epoch": 0.27491626685978093, + "grad_norm": 1.007348656654358, + "learning_rate": 0.0001596224874965616, + "loss": 2.9821, + "step": 3037 + }, + { + "epoch": 0.2750067891735313, + "grad_norm": 1.0268911123275757, + "learning_rate": 0.00015959700848002567, + "loss": 3.0238, + "step": 3038 + }, + { + "epoch": 0.2750973114872816, + "grad_norm": 1.0308053493499756, + "learning_rate": 0.00015957152346207263, + "loss": 3.0165, + "step": 3039 + }, + { + "epoch": 0.27518783380103196, + "grad_norm": 0.9716582894325256, + "learning_rate": 0.0001595460324452688, + "loss": 3.0735, + "step": 3040 + }, + { + "epoch": 0.2752783561147823, + "grad_norm": 1.2407675981521606, + "learning_rate": 0.00015952053543218114, + "loss": 3.0039, + "step": 3041 + }, + { + "epoch": 0.27536887842853264, + "grad_norm": 0.8391728401184082, + "learning_rate": 0.00015949503242537717, + "loss": 3.0505, + "step": 3042 + }, + { + "epoch": 0.275459400742283, + "grad_norm": 1.2026934623718262, + "learning_rate": 0.0001594695234274251, + "loss": 3.0462, + "step": 3043 + }, + { + "epoch": 0.2755499230560333, + "grad_norm": 1.088293433189392, + "learning_rate": 0.00015944400844089364, + "loss": 3.0565, + "step": 3044 + }, + { + "epoch": 0.27564044536978366, + "grad_norm": 0.9235910177230835, + "learning_rate": 0.00015941848746835215, + "loss": 3.0061, + "step": 3045 + }, + { + "epoch": 0.275730967683534, + "grad_norm": 0.9839001893997192, + "learning_rate": 0.0001593929605123706, + "loss": 3.0349, + "step": 3046 + }, + { + "epoch": 0.27582148999728434, + "grad_norm": 0.8767859935760498, + "learning_rate": 0.00015936742757551955, + "loss": 3.0455, + "step": 3047 + }, + { + "epoch": 0.2759120123110347, + "grad_norm": 0.973345160484314, + "learning_rate": 0.00015934188866037016, + "loss": 3.065, + "step": 3048 + }, + { + "epoch": 0.276002534624785, + "grad_norm": 0.9066287875175476, + "learning_rate": 0.0001593163437694942, + "loss": 3.0872, + "step": 3049 + }, + { + "epoch": 0.27609305693853536, + "grad_norm": 0.9278755784034729, + "learning_rate": 0.00015929079290546408, + "loss": 2.9691, + "step": 3050 + }, + { + "epoch": 0.2761835792522857, + "grad_norm": 0.88511061668396, + "learning_rate": 0.00015926523607085269, + "loss": 3.0507, + "step": 3051 + }, + { + "epoch": 0.27627410156603605, + "grad_norm": 0.9436579346656799, + "learning_rate": 0.00015923967326823368, + "loss": 2.9823, + "step": 3052 + }, + { + "epoch": 0.2763646238797864, + "grad_norm": 0.9194360971450806, + "learning_rate": 0.00015921410450018118, + "loss": 2.9886, + "step": 3053 + }, + { + "epoch": 0.2764551461935367, + "grad_norm": 0.8538716435432434, + "learning_rate": 0.00015918852976926998, + "loss": 2.9554, + "step": 3054 + }, + { + "epoch": 0.27654566850728707, + "grad_norm": 0.9728320837020874, + "learning_rate": 0.00015916294907807545, + "loss": 3.02, + "step": 3055 + }, + { + "epoch": 0.2766361908210374, + "grad_norm": 0.8909928202629089, + "learning_rate": 0.00015913736242917356, + "loss": 3.0283, + "step": 3056 + }, + { + "epoch": 0.27672671313478775, + "grad_norm": 0.8910848498344421, + "learning_rate": 0.0001591117698251409, + "loss": 3.0063, + "step": 3057 + }, + { + "epoch": 0.2768172354485381, + "grad_norm": 0.8911659717559814, + "learning_rate": 0.00015908617126855466, + "loss": 2.9937, + "step": 3058 + }, + { + "epoch": 0.27690775776228843, + "grad_norm": 0.8785828351974487, + "learning_rate": 0.00015906056676199255, + "loss": 3.0193, + "step": 3059 + }, + { + "epoch": 0.2769982800760387, + "grad_norm": 0.9020123481750488, + "learning_rate": 0.000159034956308033, + "loss": 2.9334, + "step": 3060 + }, + { + "epoch": 0.27708880238978906, + "grad_norm": 0.8372553586959839, + "learning_rate": 0.00015900933990925498, + "loss": 3.0273, + "step": 3061 + }, + { + "epoch": 0.2771793247035394, + "grad_norm": 0.9668348431587219, + "learning_rate": 0.000158983717568238, + "loss": 3.0453, + "step": 3062 + }, + { + "epoch": 0.27726984701728974, + "grad_norm": 0.8780279755592346, + "learning_rate": 0.0001589580892875623, + "loss": 2.9765, + "step": 3063 + }, + { + "epoch": 0.2773603693310401, + "grad_norm": 0.8655260801315308, + "learning_rate": 0.00015893245506980866, + "loss": 3.0367, + "step": 3064 + }, + { + "epoch": 0.2774508916447904, + "grad_norm": 0.9587757587432861, + "learning_rate": 0.00015890681491755837, + "loss": 3.0454, + "step": 3065 + }, + { + "epoch": 0.27754141395854076, + "grad_norm": 0.8964729905128479, + "learning_rate": 0.0001588811688333934, + "loss": 3.0603, + "step": 3066 + }, + { + "epoch": 0.2776319362722911, + "grad_norm": 0.8597284555435181, + "learning_rate": 0.0001588555168198964, + "loss": 3.0304, + "step": 3067 + }, + { + "epoch": 0.27772245858604144, + "grad_norm": 0.9043926000595093, + "learning_rate": 0.00015882985887965043, + "loss": 3.0292, + "step": 3068 + }, + { + "epoch": 0.2778129808997918, + "grad_norm": 0.9580448865890503, + "learning_rate": 0.00015880419501523927, + "loss": 3.1074, + "step": 3069 + }, + { + "epoch": 0.2779035032135421, + "grad_norm": 0.9260460138320923, + "learning_rate": 0.00015877852522924732, + "loss": 3.0452, + "step": 3070 + }, + { + "epoch": 0.27799402552729247, + "grad_norm": 1.195544958114624, + "learning_rate": 0.0001587528495242595, + "loss": 3.0491, + "step": 3071 + }, + { + "epoch": 0.2780845478410428, + "grad_norm": 1.0550506114959717, + "learning_rate": 0.0001587271679028613, + "loss": 2.9788, + "step": 3072 + }, + { + "epoch": 0.27817507015479315, + "grad_norm": 1.0724434852600098, + "learning_rate": 0.0001587014803676389, + "loss": 2.9796, + "step": 3073 + }, + { + "epoch": 0.2782655924685435, + "grad_norm": 0.8458510637283325, + "learning_rate": 0.000158675786921179, + "loss": 3.053, + "step": 3074 + }, + { + "epoch": 0.27835611478229383, + "grad_norm": 1.1424552202224731, + "learning_rate": 0.00015865008756606904, + "loss": 3.0551, + "step": 3075 + }, + { + "epoch": 0.27844663709604417, + "grad_norm": 0.9233999252319336, + "learning_rate": 0.0001586243823048968, + "loss": 2.9895, + "step": 3076 + }, + { + "epoch": 0.2785371594097945, + "grad_norm": 0.8394502401351929, + "learning_rate": 0.0001585986711402509, + "loss": 3.0804, + "step": 3077 + }, + { + "epoch": 0.27862768172354485, + "grad_norm": 0.9870647192001343, + "learning_rate": 0.00015857295407472046, + "loss": 3.0542, + "step": 3078 + }, + { + "epoch": 0.2787182040372952, + "grad_norm": 0.8402102589607239, + "learning_rate": 0.00015854723111089508, + "loss": 3.0197, + "step": 3079 + }, + { + "epoch": 0.27880872635104553, + "grad_norm": 0.8542654514312744, + "learning_rate": 0.00015852150225136518, + "loss": 3.0423, + "step": 3080 + }, + { + "epoch": 0.2788992486647959, + "grad_norm": 0.9618365168571472, + "learning_rate": 0.00015849576749872157, + "loss": 3.0611, + "step": 3081 + }, + { + "epoch": 0.2789897709785462, + "grad_norm": 0.8424764275550842, + "learning_rate": 0.00015847002685555578, + "loss": 3.0326, + "step": 3082 + }, + { + "epoch": 0.27908029329229656, + "grad_norm": 0.8958133459091187, + "learning_rate": 0.00015844428032445987, + "loss": 3.0293, + "step": 3083 + }, + { + "epoch": 0.2791708156060469, + "grad_norm": 0.9073579907417297, + "learning_rate": 0.00015841852790802652, + "loss": 2.996, + "step": 3084 + }, + { + "epoch": 0.27926133791979724, + "grad_norm": 0.9596017599105835, + "learning_rate": 0.00015839276960884905, + "loss": 2.999, + "step": 3085 + }, + { + "epoch": 0.2793518602335476, + "grad_norm": 0.985481858253479, + "learning_rate": 0.00015836700542952128, + "loss": 3.0469, + "step": 3086 + }, + { + "epoch": 0.2794423825472979, + "grad_norm": 0.9854786992073059, + "learning_rate": 0.00015834123537263763, + "loss": 3.0807, + "step": 3087 + }, + { + "epoch": 0.27953290486104826, + "grad_norm": 0.9631576538085938, + "learning_rate": 0.0001583154594407932, + "loss": 2.9977, + "step": 3088 + }, + { + "epoch": 0.2796234271747986, + "grad_norm": 1.4733103513717651, + "learning_rate": 0.00015828967763658358, + "loss": 3.0604, + "step": 3089 + }, + { + "epoch": 0.27971394948854894, + "grad_norm": 0.9754955768585205, + "learning_rate": 0.00015826388996260503, + "loss": 3.0514, + "step": 3090 + }, + { + "epoch": 0.2798044718022993, + "grad_norm": 0.918458104133606, + "learning_rate": 0.00015823809642145434, + "loss": 2.9837, + "step": 3091 + }, + { + "epoch": 0.2798949941160496, + "grad_norm": 0.9138178825378418, + "learning_rate": 0.00015821229701572896, + "loss": 3.0117, + "step": 3092 + }, + { + "epoch": 0.27998551642979996, + "grad_norm": 0.9586968421936035, + "learning_rate": 0.00015818649174802684, + "loss": 3.0233, + "step": 3093 + }, + { + "epoch": 0.2800760387435503, + "grad_norm": 0.9427858591079712, + "learning_rate": 0.0001581606806209466, + "loss": 3.0529, + "step": 3094 + }, + { + "epoch": 0.28016656105730064, + "grad_norm": 0.891553521156311, + "learning_rate": 0.0001581348636370874, + "loss": 3.0154, + "step": 3095 + }, + { + "epoch": 0.280257083371051, + "grad_norm": 0.8784413933753967, + "learning_rate": 0.00015810904079904904, + "loss": 3.0261, + "step": 3096 + }, + { + "epoch": 0.2803476056848013, + "grad_norm": 0.9564874172210693, + "learning_rate": 0.00015808321210943188, + "loss": 3.0236, + "step": 3097 + }, + { + "epoch": 0.28043812799855167, + "grad_norm": 0.9087296724319458, + "learning_rate": 0.00015805737757083681, + "loss": 3.094, + "step": 3098 + }, + { + "epoch": 0.280528650312302, + "grad_norm": 0.9095321893692017, + "learning_rate": 0.00015803153718586544, + "loss": 2.9883, + "step": 3099 + }, + { + "epoch": 0.28061917262605235, + "grad_norm": 0.9614866971969604, + "learning_rate": 0.00015800569095711982, + "loss": 3.003, + "step": 3100 + }, + { + "epoch": 0.28070969493980263, + "grad_norm": 0.8452001810073853, + "learning_rate": 0.00015797983888720274, + "loss": 2.9554, + "step": 3101 + }, + { + "epoch": 0.280800217253553, + "grad_norm": 0.8980777263641357, + "learning_rate": 0.00015795398097871748, + "loss": 3.0503, + "step": 3102 + }, + { + "epoch": 0.2808907395673033, + "grad_norm": 0.8974297046661377, + "learning_rate": 0.0001579281172342679, + "loss": 2.9996, + "step": 3103 + }, + { + "epoch": 0.28098126188105366, + "grad_norm": 1.022741675376892, + "learning_rate": 0.00015790224765645849, + "loss": 2.9941, + "step": 3104 + }, + { + "epoch": 0.281071784194804, + "grad_norm": 0.9084989428520203, + "learning_rate": 0.00015787637224789434, + "loss": 3.0378, + "step": 3105 + }, + { + "epoch": 0.28116230650855434, + "grad_norm": 1.0861493349075317, + "learning_rate": 0.0001578504910111811, + "loss": 3.0639, + "step": 3106 + }, + { + "epoch": 0.2812528288223047, + "grad_norm": 0.9200314283370972, + "learning_rate": 0.00015782460394892498, + "loss": 2.9762, + "step": 3107 + }, + { + "epoch": 0.281343351136055, + "grad_norm": 1.0727646350860596, + "learning_rate": 0.00015779871106373283, + "loss": 3.0473, + "step": 3108 + }, + { + "epoch": 0.28143387344980536, + "grad_norm": 0.9366647601127625, + "learning_rate": 0.00015777281235821206, + "loss": 3.0291, + "step": 3109 + }, + { + "epoch": 0.2815243957635557, + "grad_norm": 1.0757999420166016, + "learning_rate": 0.00015774690783497067, + "loss": 3.0211, + "step": 3110 + }, + { + "epoch": 0.28161491807730604, + "grad_norm": 0.8640356659889221, + "learning_rate": 0.00015772099749661723, + "loss": 3.0259, + "step": 3111 + }, + { + "epoch": 0.2817054403910564, + "grad_norm": 0.8835778832435608, + "learning_rate": 0.00015769508134576095, + "loss": 2.9957, + "step": 3112 + }, + { + "epoch": 0.2817959627048067, + "grad_norm": 0.9119789600372314, + "learning_rate": 0.00015766915938501152, + "loss": 3.0188, + "step": 3113 + }, + { + "epoch": 0.28188648501855706, + "grad_norm": 0.8268768787384033, + "learning_rate": 0.00015764323161697935, + "loss": 2.9249, + "step": 3114 + }, + { + "epoch": 0.2819770073323074, + "grad_norm": 0.8696132302284241, + "learning_rate": 0.0001576172980442753, + "loss": 2.9569, + "step": 3115 + }, + { + "epoch": 0.28206752964605775, + "grad_norm": 0.8999311923980713, + "learning_rate": 0.00015759135866951092, + "loss": 3.024, + "step": 3116 + }, + { + "epoch": 0.2821580519598081, + "grad_norm": 0.8366394639015198, + "learning_rate": 0.00015756541349529833, + "loss": 2.9927, + "step": 3117 + }, + { + "epoch": 0.28224857427355843, + "grad_norm": 0.9423478841781616, + "learning_rate": 0.00015753946252425013, + "loss": 3.0575, + "step": 3118 + }, + { + "epoch": 0.28233909658730877, + "grad_norm": 0.8939400911331177, + "learning_rate": 0.00015751350575897964, + "loss": 3.0443, + "step": 3119 + }, + { + "epoch": 0.2824296189010591, + "grad_norm": 0.8804583549499512, + "learning_rate": 0.00015748754320210072, + "loss": 3.0482, + "step": 3120 + }, + { + "epoch": 0.28252014121480945, + "grad_norm": 0.9358223676681519, + "learning_rate": 0.00015746157485622777, + "loss": 3.0573, + "step": 3121 + }, + { + "epoch": 0.2826106635285598, + "grad_norm": 0.8463422060012817, + "learning_rate": 0.00015743560072397578, + "loss": 3.0755, + "step": 3122 + }, + { + "epoch": 0.28270118584231013, + "grad_norm": 0.9778035879135132, + "learning_rate": 0.0001574096208079604, + "loss": 3.059, + "step": 3123 + }, + { + "epoch": 0.2827917081560605, + "grad_norm": 0.8894894123077393, + "learning_rate": 0.00015738363511079776, + "loss": 3.0353, + "step": 3124 + }, + { + "epoch": 0.2828822304698108, + "grad_norm": 0.9795463681221008, + "learning_rate": 0.0001573576436351046, + "loss": 3.0643, + "step": 3125 + }, + { + "epoch": 0.28297275278356115, + "grad_norm": 0.9977430105209351, + "learning_rate": 0.00015733164638349835, + "loss": 3.0345, + "step": 3126 + }, + { + "epoch": 0.2830632750973115, + "grad_norm": 1.0255120992660522, + "learning_rate": 0.00015730564335859684, + "loss": 3.0646, + "step": 3127 + }, + { + "epoch": 0.28315379741106184, + "grad_norm": 0.920333981513977, + "learning_rate": 0.0001572796345630186, + "loss": 3.0291, + "step": 3128 + }, + { + "epoch": 0.2832443197248122, + "grad_norm": 0.8356728553771973, + "learning_rate": 0.00015725361999938277, + "loss": 3.0165, + "step": 3129 + }, + { + "epoch": 0.2833348420385625, + "grad_norm": 0.9003295302391052, + "learning_rate": 0.00015722759967030898, + "loss": 2.9575, + "step": 3130 + }, + { + "epoch": 0.28342536435231286, + "grad_norm": 0.9911743998527527, + "learning_rate": 0.00015720157357841744, + "loss": 3.0059, + "step": 3131 + }, + { + "epoch": 0.2835158866660632, + "grad_norm": 0.8708944320678711, + "learning_rate": 0.00015717554172632904, + "loss": 2.9358, + "step": 3132 + }, + { + "epoch": 0.28360640897981354, + "grad_norm": 0.9072350859642029, + "learning_rate": 0.0001571495041166651, + "loss": 3.0155, + "step": 3133 + }, + { + "epoch": 0.2836969312935639, + "grad_norm": 0.9872345924377441, + "learning_rate": 0.00015712346075204766, + "loss": 3.0427, + "step": 3134 + }, + { + "epoch": 0.2837874536073142, + "grad_norm": 1.0371378660202026, + "learning_rate": 0.00015709741163509933, + "loss": 3.0016, + "step": 3135 + }, + { + "epoch": 0.28387797592106456, + "grad_norm": 1.0638636350631714, + "learning_rate": 0.0001570713567684432, + "loss": 3.0165, + "step": 3136 + }, + { + "epoch": 0.2839684982348149, + "grad_norm": 0.879398763179779, + "learning_rate": 0.00015704529615470295, + "loss": 3.0285, + "step": 3137 + }, + { + "epoch": 0.28405902054856524, + "grad_norm": 1.0027191638946533, + "learning_rate": 0.000157019229796503, + "loss": 3.019, + "step": 3138 + }, + { + "epoch": 0.2841495428623156, + "grad_norm": 0.9104477763175964, + "learning_rate": 0.00015699315769646815, + "loss": 3.0091, + "step": 3139 + }, + { + "epoch": 0.2842400651760659, + "grad_norm": 0.9437276721000671, + "learning_rate": 0.0001569670798572239, + "loss": 3.0236, + "step": 3140 + }, + { + "epoch": 0.28433058748981627, + "grad_norm": 1.2169289588928223, + "learning_rate": 0.00015694099628139622, + "loss": 3.0437, + "step": 3141 + }, + { + "epoch": 0.28442110980356655, + "grad_norm": 0.9517204761505127, + "learning_rate": 0.00015691490697161182, + "loss": 3.016, + "step": 3142 + }, + { + "epoch": 0.2845116321173169, + "grad_norm": 1.0102424621582031, + "learning_rate": 0.00015688881193049782, + "loss": 2.9728, + "step": 3143 + }, + { + "epoch": 0.28460215443106723, + "grad_norm": 0.8501645922660828, + "learning_rate": 0.000156862711160682, + "loss": 2.9421, + "step": 3144 + }, + { + "epoch": 0.2846926767448176, + "grad_norm": 0.870749831199646, + "learning_rate": 0.00015683660466479275, + "loss": 2.9931, + "step": 3145 + }, + { + "epoch": 0.2847831990585679, + "grad_norm": 0.9530490636825562, + "learning_rate": 0.000156810492445459, + "loss": 3.0254, + "step": 3146 + }, + { + "epoch": 0.28487372137231826, + "grad_norm": 0.902142345905304, + "learning_rate": 0.00015678437450531013, + "loss": 3.0355, + "step": 3147 + }, + { + "epoch": 0.2849642436860686, + "grad_norm": 0.9047490954399109, + "learning_rate": 0.00015675825084697636, + "loss": 3.0706, + "step": 3148 + }, + { + "epoch": 0.28505476599981894, + "grad_norm": 0.8961873650550842, + "learning_rate": 0.00015673212147308826, + "loss": 3.0495, + "step": 3149 + }, + { + "epoch": 0.2851452883135693, + "grad_norm": 0.8525515198707581, + "learning_rate": 0.00015670598638627706, + "loss": 3.0943, + "step": 3150 + }, + { + "epoch": 0.2852358106273196, + "grad_norm": 0.9663558602333069, + "learning_rate": 0.00015667984558917464, + "loss": 3.0162, + "step": 3151 + }, + { + "epoch": 0.28532633294106996, + "grad_norm": 0.9535746574401855, + "learning_rate": 0.00015665369908441328, + "loss": 3.0412, + "step": 3152 + }, + { + "epoch": 0.2854168552548203, + "grad_norm": 0.8646260499954224, + "learning_rate": 0.00015662754687462596, + "loss": 2.9971, + "step": 3153 + }, + { + "epoch": 0.28550737756857064, + "grad_norm": 0.9250417351722717, + "learning_rate": 0.00015660138896244624, + "loss": 2.9493, + "step": 3154 + }, + { + "epoch": 0.285597899882321, + "grad_norm": 0.9591944217681885, + "learning_rate": 0.00015657522535050822, + "loss": 3.0685, + "step": 3155 + }, + { + "epoch": 0.2856884221960713, + "grad_norm": 0.9319788217544556, + "learning_rate": 0.00015654905604144657, + "loss": 3.0092, + "step": 3156 + }, + { + "epoch": 0.28577894450982166, + "grad_norm": 0.8835229873657227, + "learning_rate": 0.00015652288103789648, + "loss": 2.9717, + "step": 3157 + }, + { + "epoch": 0.285869466823572, + "grad_norm": 0.9241860508918762, + "learning_rate": 0.0001564967003424938, + "loss": 2.9682, + "step": 3158 + }, + { + "epoch": 0.28595998913732235, + "grad_norm": 0.8293352127075195, + "learning_rate": 0.00015647051395787498, + "loss": 2.9572, + "step": 3159 + }, + { + "epoch": 0.2860505114510727, + "grad_norm": 0.8276857137680054, + "learning_rate": 0.00015644432188667695, + "loss": 3.0033, + "step": 3160 + }, + { + "epoch": 0.286141033764823, + "grad_norm": 0.891465425491333, + "learning_rate": 0.00015641812413153724, + "loss": 3.0832, + "step": 3161 + }, + { + "epoch": 0.28623155607857337, + "grad_norm": 0.823231041431427, + "learning_rate": 0.00015639192069509395, + "loss": 3.0404, + "step": 3162 + }, + { + "epoch": 0.2863220783923237, + "grad_norm": 0.8683503270149231, + "learning_rate": 0.0001563657115799858, + "loss": 2.9718, + "step": 3163 + }, + { + "epoch": 0.28641260070607405, + "grad_norm": 0.8113049864768982, + "learning_rate": 0.00015633949678885208, + "loss": 3.0384, + "step": 3164 + }, + { + "epoch": 0.2865031230198244, + "grad_norm": 0.8468487858772278, + "learning_rate": 0.0001563132763243325, + "loss": 3.0317, + "step": 3165 + }, + { + "epoch": 0.28659364533357473, + "grad_norm": 0.8657675385475159, + "learning_rate": 0.0001562870501890676, + "loss": 3.048, + "step": 3166 + }, + { + "epoch": 0.28668416764732507, + "grad_norm": 0.8631026148796082, + "learning_rate": 0.00015626081838569826, + "loss": 3.0143, + "step": 3167 + }, + { + "epoch": 0.2867746899610754, + "grad_norm": 0.8840932846069336, + "learning_rate": 0.00015623458091686604, + "loss": 3.0532, + "step": 3168 + }, + { + "epoch": 0.28686521227482575, + "grad_norm": 0.8777768611907959, + "learning_rate": 0.00015620833778521307, + "loss": 3.0652, + "step": 3169 + }, + { + "epoch": 0.2869557345885761, + "grad_norm": 1.0639394521713257, + "learning_rate": 0.00015618208899338202, + "loss": 3.0654, + "step": 3170 + }, + { + "epoch": 0.28704625690232644, + "grad_norm": 0.9420127868652344, + "learning_rate": 0.00015615583454401617, + "loss": 2.967, + "step": 3171 + }, + { + "epoch": 0.2871367792160768, + "grad_norm": 0.9539990425109863, + "learning_rate": 0.0001561295744397593, + "loss": 3.0091, + "step": 3172 + }, + { + "epoch": 0.2872273015298271, + "grad_norm": 0.9690334796905518, + "learning_rate": 0.0001561033086832558, + "loss": 3.0722, + "step": 3173 + }, + { + "epoch": 0.28731782384357746, + "grad_norm": 0.9461049437522888, + "learning_rate": 0.00015607703727715065, + "loss": 3.0282, + "step": 3174 + }, + { + "epoch": 0.2874083461573278, + "grad_norm": 1.0628281831741333, + "learning_rate": 0.0001560507602240894, + "loss": 3.0082, + "step": 3175 + }, + { + "epoch": 0.28749886847107814, + "grad_norm": 0.8840575814247131, + "learning_rate": 0.00015602447752671814, + "loss": 3.0105, + "step": 3176 + }, + { + "epoch": 0.2875893907848285, + "grad_norm": 0.9166979789733887, + "learning_rate": 0.00015599818918768352, + "loss": 2.9886, + "step": 3177 + }, + { + "epoch": 0.2876799130985788, + "grad_norm": 0.9768393039703369, + "learning_rate": 0.00015597189520963277, + "loss": 2.9479, + "step": 3178 + }, + { + "epoch": 0.28777043541232916, + "grad_norm": 0.9496955871582031, + "learning_rate": 0.0001559455955952137, + "loss": 3.0099, + "step": 3179 + }, + { + "epoch": 0.2878609577260795, + "grad_norm": 0.886669933795929, + "learning_rate": 0.0001559192903470747, + "loss": 2.9841, + "step": 3180 + }, + { + "epoch": 0.28795148003982984, + "grad_norm": 0.8499637842178345, + "learning_rate": 0.00015589297946786467, + "loss": 2.9788, + "step": 3181 + }, + { + "epoch": 0.2880420023535802, + "grad_norm": 0.917888879776001, + "learning_rate": 0.00015586666296023315, + "loss": 3.0087, + "step": 3182 + }, + { + "epoch": 0.28813252466733047, + "grad_norm": 0.8253499269485474, + "learning_rate": 0.00015584034082683022, + "loss": 3.009, + "step": 3183 + }, + { + "epoch": 0.2882230469810808, + "grad_norm": 0.7821881771087646, + "learning_rate": 0.00015581401307030647, + "loss": 3.0094, + "step": 3184 + }, + { + "epoch": 0.28831356929483115, + "grad_norm": 0.9027956128120422, + "learning_rate": 0.00015578767969331314, + "loss": 3.0757, + "step": 3185 + }, + { + "epoch": 0.2884040916085815, + "grad_norm": 0.8098450303077698, + "learning_rate": 0.000155761340698502, + "loss": 2.9402, + "step": 3186 + }, + { + "epoch": 0.28849461392233183, + "grad_norm": 0.8672653436660767, + "learning_rate": 0.00015573499608852537, + "loss": 3.0249, + "step": 3187 + }, + { + "epoch": 0.2885851362360822, + "grad_norm": 0.9086789488792419, + "learning_rate": 0.00015570864586603612, + "loss": 3.0921, + "step": 3188 + }, + { + "epoch": 0.2886756585498325, + "grad_norm": 0.8213782906532288, + "learning_rate": 0.0001556822900336878, + "loss": 2.9316, + "step": 3189 + }, + { + "epoch": 0.28876618086358286, + "grad_norm": 0.8321839570999146, + "learning_rate": 0.0001556559285941344, + "loss": 3.0399, + "step": 3190 + }, + { + "epoch": 0.2888567031773332, + "grad_norm": 0.8627842664718628, + "learning_rate": 0.0001556295615500305, + "loss": 2.9849, + "step": 3191 + }, + { + "epoch": 0.28894722549108354, + "grad_norm": 0.8597263693809509, + "learning_rate": 0.00015560318890403126, + "loss": 2.9951, + "step": 3192 + }, + { + "epoch": 0.2890377478048339, + "grad_norm": 0.8256116509437561, + "learning_rate": 0.00015557681065879244, + "loss": 2.9524, + "step": 3193 + }, + { + "epoch": 0.2891282701185842, + "grad_norm": 0.9126301407814026, + "learning_rate": 0.0001555504268169703, + "loss": 2.949, + "step": 3194 + }, + { + "epoch": 0.28921879243233456, + "grad_norm": 0.8662191033363342, + "learning_rate": 0.0001555240373812217, + "loss": 3.0159, + "step": 3195 + }, + { + "epoch": 0.2893093147460849, + "grad_norm": 0.852206289768219, + "learning_rate": 0.00015549764235420405, + "loss": 3.0154, + "step": 3196 + }, + { + "epoch": 0.28939983705983524, + "grad_norm": 0.8456076383590698, + "learning_rate": 0.00015547124173857535, + "loss": 3.0452, + "step": 3197 + }, + { + "epoch": 0.2894903593735856, + "grad_norm": 0.8650831580162048, + "learning_rate": 0.00015544483553699408, + "loss": 3.0385, + "step": 3198 + }, + { + "epoch": 0.2895808816873359, + "grad_norm": 0.8533116579055786, + "learning_rate": 0.00015541842375211946, + "loss": 3.0309, + "step": 3199 + }, + { + "epoch": 0.28967140400108626, + "grad_norm": 0.8483850955963135, + "learning_rate": 0.00015539200638661104, + "loss": 3.029, + "step": 3200 + }, + { + "epoch": 0.2897619263148366, + "grad_norm": 0.7920590043067932, + "learning_rate": 0.00015536558344312907, + "loss": 2.9799, + "step": 3201 + }, + { + "epoch": 0.28985244862858695, + "grad_norm": 0.8692976832389832, + "learning_rate": 0.00015533915492433443, + "loss": 3.0048, + "step": 3202 + }, + { + "epoch": 0.2899429709423373, + "grad_norm": 0.8859113454818726, + "learning_rate": 0.00015531272083288834, + "loss": 3.0178, + "step": 3203 + }, + { + "epoch": 0.2900334932560876, + "grad_norm": 0.8331505060195923, + "learning_rate": 0.0001552862811714528, + "loss": 3.0049, + "step": 3204 + }, + { + "epoch": 0.29012401556983797, + "grad_norm": 0.8357052206993103, + "learning_rate": 0.00015525983594269027, + "loss": 2.9647, + "step": 3205 + }, + { + "epoch": 0.2902145378835883, + "grad_norm": 0.9645798802375793, + "learning_rate": 0.00015523338514926376, + "loss": 2.9726, + "step": 3206 + }, + { + "epoch": 0.29030506019733865, + "grad_norm": 0.8409217000007629, + "learning_rate": 0.00015520692879383686, + "loss": 2.9647, + "step": 3207 + }, + { + "epoch": 0.290395582511089, + "grad_norm": 1.0742449760437012, + "learning_rate": 0.00015518046687907377, + "loss": 3.0654, + "step": 3208 + }, + { + "epoch": 0.29048610482483933, + "grad_norm": 0.9031012654304504, + "learning_rate": 0.00015515399940763916, + "loss": 2.9865, + "step": 3209 + }, + { + "epoch": 0.29057662713858967, + "grad_norm": 0.9780822396278381, + "learning_rate": 0.00015512752638219835, + "loss": 3.0079, + "step": 3210 + }, + { + "epoch": 0.29066714945234, + "grad_norm": 1.170419692993164, + "learning_rate": 0.0001551010478054171, + "loss": 3.0254, + "step": 3211 + }, + { + "epoch": 0.29075767176609035, + "grad_norm": 0.9002463817596436, + "learning_rate": 0.00015507456367996184, + "loss": 3.0676, + "step": 3212 + }, + { + "epoch": 0.2908481940798407, + "grad_norm": 1.1873091459274292, + "learning_rate": 0.00015504807400849958, + "loss": 2.9774, + "step": 3213 + }, + { + "epoch": 0.29093871639359103, + "grad_norm": 0.9433950185775757, + "learning_rate": 0.0001550215787936977, + "loss": 2.9299, + "step": 3214 + }, + { + "epoch": 0.2910292387073414, + "grad_norm": 1.0682939291000366, + "learning_rate": 0.0001549950780382244, + "loss": 3.013, + "step": 3215 + }, + { + "epoch": 0.2911197610210917, + "grad_norm": 0.9073317646980286, + "learning_rate": 0.00015496857174474824, + "loss": 3.016, + "step": 3216 + }, + { + "epoch": 0.29121028333484206, + "grad_norm": 0.9233403205871582, + "learning_rate": 0.0001549420599159384, + "loss": 3.0042, + "step": 3217 + }, + { + "epoch": 0.2913008056485924, + "grad_norm": 0.9346694350242615, + "learning_rate": 0.00015491554255446462, + "loss": 3.0543, + "step": 3218 + }, + { + "epoch": 0.29139132796234274, + "grad_norm": 0.8480331897735596, + "learning_rate": 0.0001548890196629972, + "loss": 3.0001, + "step": 3219 + }, + { + "epoch": 0.2914818502760931, + "grad_norm": 0.9132717251777649, + "learning_rate": 0.000154862491244207, + "loss": 2.9961, + "step": 3220 + }, + { + "epoch": 0.2915723725898434, + "grad_norm": 0.9685938358306885, + "learning_rate": 0.00015483595730076546, + "loss": 2.9909, + "step": 3221 + }, + { + "epoch": 0.29166289490359376, + "grad_norm": 0.8925626277923584, + "learning_rate": 0.00015480941783534448, + "loss": 3.0059, + "step": 3222 + }, + { + "epoch": 0.2917534172173441, + "grad_norm": 0.8972873687744141, + "learning_rate": 0.00015478287285061666, + "loss": 2.99, + "step": 3223 + }, + { + "epoch": 0.2918439395310944, + "grad_norm": 0.8935641646385193, + "learning_rate": 0.00015475632234925504, + "loss": 2.9713, + "step": 3224 + }, + { + "epoch": 0.29193446184484473, + "grad_norm": 0.8276620507240295, + "learning_rate": 0.00015472976633393326, + "loss": 2.9518, + "step": 3225 + }, + { + "epoch": 0.29202498415859507, + "grad_norm": 0.9621675610542297, + "learning_rate": 0.0001547032048073255, + "loss": 3.0256, + "step": 3226 + }, + { + "epoch": 0.2921155064723454, + "grad_norm": 0.996496856212616, + "learning_rate": 0.0001546766377721065, + "loss": 2.9528, + "step": 3227 + }, + { + "epoch": 0.29220602878609575, + "grad_norm": 0.8234094381332397, + "learning_rate": 0.00015465006523095157, + "loss": 3.024, + "step": 3228 + }, + { + "epoch": 0.2922965510998461, + "grad_norm": 0.9537985920906067, + "learning_rate": 0.00015462348718653658, + "loss": 3.0105, + "step": 3229 + }, + { + "epoch": 0.29238707341359643, + "grad_norm": 1.0219292640686035, + "learning_rate": 0.0001545969036415379, + "loss": 3.0254, + "step": 3230 + }, + { + "epoch": 0.2924775957273468, + "grad_norm": 0.8733369708061218, + "learning_rate": 0.00015457031459863258, + "loss": 3.0066, + "step": 3231 + }, + { + "epoch": 0.2925681180410971, + "grad_norm": 0.8705121874809265, + "learning_rate": 0.00015454372006049803, + "loss": 3.016, + "step": 3232 + }, + { + "epoch": 0.29265864035484745, + "grad_norm": 0.9468245506286621, + "learning_rate": 0.00015451712002981235, + "loss": 2.9881, + "step": 3233 + }, + { + "epoch": 0.2927491626685978, + "grad_norm": 0.8147585391998291, + "learning_rate": 0.0001544905145092542, + "loss": 3.0279, + "step": 3234 + }, + { + "epoch": 0.29283968498234814, + "grad_norm": 0.8886904716491699, + "learning_rate": 0.00015446390350150273, + "loss": 2.9563, + "step": 3235 + }, + { + "epoch": 0.2929302072960985, + "grad_norm": 0.8884991407394409, + "learning_rate": 0.00015443728700923766, + "loss": 3.0518, + "step": 3236 + }, + { + "epoch": 0.2930207296098488, + "grad_norm": 0.8598198294639587, + "learning_rate": 0.00015441066503513927, + "loss": 3.0198, + "step": 3237 + }, + { + "epoch": 0.29311125192359916, + "grad_norm": 0.8423158526420593, + "learning_rate": 0.0001543840375818884, + "loss": 3.0396, + "step": 3238 + }, + { + "epoch": 0.2932017742373495, + "grad_norm": 0.9256920218467712, + "learning_rate": 0.00015435740465216644, + "loss": 3.0247, + "step": 3239 + }, + { + "epoch": 0.29329229655109984, + "grad_norm": 0.862433910369873, + "learning_rate": 0.00015433076624865531, + "loss": 3.0077, + "step": 3240 + }, + { + "epoch": 0.2933828188648502, + "grad_norm": 0.930212140083313, + "learning_rate": 0.00015430412237403752, + "loss": 3.0163, + "step": 3241 + }, + { + "epoch": 0.2934733411786005, + "grad_norm": 0.8511956334114075, + "learning_rate": 0.0001542774730309961, + "loss": 2.8893, + "step": 3242 + }, + { + "epoch": 0.29356386349235086, + "grad_norm": 0.8834841847419739, + "learning_rate": 0.0001542508182222146, + "loss": 2.9634, + "step": 3243 + }, + { + "epoch": 0.2936543858061012, + "grad_norm": 0.9784421920776367, + "learning_rate": 0.0001542241579503772, + "loss": 2.9846, + "step": 3244 + }, + { + "epoch": 0.29374490811985154, + "grad_norm": 0.8660554885864258, + "learning_rate": 0.00015419749221816858, + "loss": 2.9127, + "step": 3245 + }, + { + "epoch": 0.2938354304336019, + "grad_norm": 0.9871695637702942, + "learning_rate": 0.000154170821028274, + "loss": 2.9124, + "step": 3246 + }, + { + "epoch": 0.2939259527473522, + "grad_norm": 0.8965557217597961, + "learning_rate": 0.0001541441443833792, + "loss": 2.9299, + "step": 3247 + }, + { + "epoch": 0.29401647506110257, + "grad_norm": 0.8073522448539734, + "learning_rate": 0.00015411746228617055, + "loss": 2.9716, + "step": 3248 + }, + { + "epoch": 0.2941069973748529, + "grad_norm": 0.9049869775772095, + "learning_rate": 0.00015409077473933492, + "loss": 3.0341, + "step": 3249 + }, + { + "epoch": 0.29419751968860325, + "grad_norm": 0.9170584082603455, + "learning_rate": 0.00015406408174555976, + "loss": 3.0301, + "step": 3250 + }, + { + "epoch": 0.2942880420023536, + "grad_norm": 0.9323590397834778, + "learning_rate": 0.00015403738330753306, + "loss": 3.0336, + "step": 3251 + }, + { + "epoch": 0.29437856431610393, + "grad_norm": 0.9274752736091614, + "learning_rate": 0.00015401067942794332, + "loss": 3.0528, + "step": 3252 + }, + { + "epoch": 0.29446908662985427, + "grad_norm": 0.8868407011032104, + "learning_rate": 0.00015398397010947965, + "loss": 2.9815, + "step": 3253 + }, + { + "epoch": 0.2945596089436046, + "grad_norm": 0.9790063500404358, + "learning_rate": 0.00015395725535483168, + "loss": 3.0448, + "step": 3254 + }, + { + "epoch": 0.29465013125735495, + "grad_norm": 0.8878175020217896, + "learning_rate": 0.00015393053516668954, + "loss": 2.9678, + "step": 3255 + }, + { + "epoch": 0.2947406535711053, + "grad_norm": 0.897851824760437, + "learning_rate": 0.000153903809547744, + "loss": 3.0092, + "step": 3256 + }, + { + "epoch": 0.29483117588485563, + "grad_norm": 0.9687259793281555, + "learning_rate": 0.0001538770785006863, + "loss": 2.9965, + "step": 3257 + }, + { + "epoch": 0.294921698198606, + "grad_norm": 0.9195958971977234, + "learning_rate": 0.0001538503420282083, + "loss": 3.0088, + "step": 3258 + }, + { + "epoch": 0.2950122205123563, + "grad_norm": 0.9068818092346191, + "learning_rate": 0.00015382360013300233, + "loss": 2.9513, + "step": 3259 + }, + { + "epoch": 0.29510274282610666, + "grad_norm": 0.8815194368362427, + "learning_rate": 0.00015379685281776125, + "loss": 2.9852, + "step": 3260 + }, + { + "epoch": 0.295193265139857, + "grad_norm": 0.8915935754776001, + "learning_rate": 0.0001537701000851786, + "loss": 2.9612, + "step": 3261 + }, + { + "epoch": 0.29528378745360734, + "grad_norm": 0.9282931685447693, + "learning_rate": 0.00015374334193794838, + "loss": 3.0762, + "step": 3262 + }, + { + "epoch": 0.2953743097673577, + "grad_norm": 0.9892238974571228, + "learning_rate": 0.00015371657837876502, + "loss": 3.0449, + "step": 3263 + }, + { + "epoch": 0.295464832081108, + "grad_norm": 0.9499996304512024, + "learning_rate": 0.00015368980941032372, + "loss": 2.9896, + "step": 3264 + }, + { + "epoch": 0.2955553543948583, + "grad_norm": 0.8943758606910706, + "learning_rate": 0.00015366303503532009, + "loss": 3.0555, + "step": 3265 + }, + { + "epoch": 0.29564587670860865, + "grad_norm": 0.926461935043335, + "learning_rate": 0.0001536362552564503, + "loss": 2.9596, + "step": 3266 + }, + { + "epoch": 0.295736399022359, + "grad_norm": 0.8656926155090332, + "learning_rate": 0.00015360947007641106, + "loss": 2.9488, + "step": 3267 + }, + { + "epoch": 0.2958269213361093, + "grad_norm": 0.9003334641456604, + "learning_rate": 0.00015358267949789966, + "loss": 3.0083, + "step": 3268 + }, + { + "epoch": 0.29591744364985967, + "grad_norm": 0.9856387376785278, + "learning_rate": 0.0001535558835236139, + "loss": 3.0598, + "step": 3269 + }, + { + "epoch": 0.29600796596361, + "grad_norm": 0.8337308764457703, + "learning_rate": 0.00015352908215625214, + "loss": 2.9763, + "step": 3270 + }, + { + "epoch": 0.29609848827736035, + "grad_norm": 0.8456469774246216, + "learning_rate": 0.00015350227539851326, + "loss": 2.9611, + "step": 3271 + }, + { + "epoch": 0.2961890105911107, + "grad_norm": 0.9669084548950195, + "learning_rate": 0.00015347546325309672, + "loss": 2.9944, + "step": 3272 + }, + { + "epoch": 0.29627953290486103, + "grad_norm": 0.9577633738517761, + "learning_rate": 0.0001534486457227025, + "loss": 3.0196, + "step": 3273 + }, + { + "epoch": 0.2963700552186114, + "grad_norm": 0.8991074562072754, + "learning_rate": 0.00015342182281003112, + "loss": 2.9924, + "step": 3274 + }, + { + "epoch": 0.2964605775323617, + "grad_norm": 1.0249013900756836, + "learning_rate": 0.00015339499451778364, + "loss": 2.9924, + "step": 3275 + }, + { + "epoch": 0.29655109984611205, + "grad_norm": 1.0314909219741821, + "learning_rate": 0.00015336816084866165, + "loss": 3.0581, + "step": 3276 + }, + { + "epoch": 0.2966416221598624, + "grad_norm": 0.785307765007019, + "learning_rate": 0.00015334132180536733, + "loss": 2.9663, + "step": 3277 + }, + { + "epoch": 0.29673214447361274, + "grad_norm": 0.9974069595336914, + "learning_rate": 0.00015331447739060338, + "loss": 2.9589, + "step": 3278 + }, + { + "epoch": 0.2968226667873631, + "grad_norm": 0.9813283681869507, + "learning_rate": 0.000153287627607073, + "loss": 3.0866, + "step": 3279 + }, + { + "epoch": 0.2969131891011134, + "grad_norm": 0.8992034792900085, + "learning_rate": 0.00015326077245747999, + "loss": 3.0273, + "step": 3280 + }, + { + "epoch": 0.29700371141486376, + "grad_norm": 1.013922095298767, + "learning_rate": 0.00015323391194452864, + "loss": 3.0097, + "step": 3281 + }, + { + "epoch": 0.2970942337286141, + "grad_norm": 0.950112521648407, + "learning_rate": 0.00015320704607092384, + "loss": 2.9898, + "step": 3282 + }, + { + "epoch": 0.29718475604236444, + "grad_norm": 0.9233756065368652, + "learning_rate": 0.00015318017483937092, + "loss": 2.9998, + "step": 3283 + }, + { + "epoch": 0.2972752783561148, + "grad_norm": 0.9420918226242065, + "learning_rate": 0.0001531532982525759, + "loss": 2.9409, + "step": 3284 + }, + { + "epoch": 0.2973658006698651, + "grad_norm": 0.8628808856010437, + "learning_rate": 0.00015312641631324515, + "loss": 3.0076, + "step": 3285 + }, + { + "epoch": 0.29745632298361546, + "grad_norm": 0.869685709476471, + "learning_rate": 0.00015309952902408576, + "loss": 3.0232, + "step": 3286 + }, + { + "epoch": 0.2975468452973658, + "grad_norm": 0.8513570427894592, + "learning_rate": 0.00015307263638780522, + "loss": 2.9768, + "step": 3287 + }, + { + "epoch": 0.29763736761111614, + "grad_norm": 0.9360085129737854, + "learning_rate": 0.00015304573840711167, + "loss": 3.0254, + "step": 3288 + }, + { + "epoch": 0.2977278899248665, + "grad_norm": 1.0327681303024292, + "learning_rate": 0.00015301883508471372, + "loss": 3.078, + "step": 3289 + }, + { + "epoch": 0.2978184122386168, + "grad_norm": 0.8258141279220581, + "learning_rate": 0.0001529919264233205, + "loss": 2.9808, + "step": 3290 + }, + { + "epoch": 0.29790893455236717, + "grad_norm": 0.9610850811004639, + "learning_rate": 0.00015296501242564173, + "loss": 3.0336, + "step": 3291 + }, + { + "epoch": 0.2979994568661175, + "grad_norm": 0.9272431135177612, + "learning_rate": 0.00015293809309438773, + "loss": 3.072, + "step": 3292 + }, + { + "epoch": 0.29808997917986785, + "grad_norm": 0.9149249792098999, + "learning_rate": 0.00015291116843226915, + "loss": 2.9522, + "step": 3293 + }, + { + "epoch": 0.2981805014936182, + "grad_norm": 0.9025709629058838, + "learning_rate": 0.00015288423844199734, + "loss": 2.9639, + "step": 3294 + }, + { + "epoch": 0.29827102380736853, + "grad_norm": 0.8880616426467896, + "learning_rate": 0.0001528573031262842, + "loss": 3.0719, + "step": 3295 + }, + { + "epoch": 0.29836154612111887, + "grad_norm": 0.8158171772956848, + "learning_rate": 0.00015283036248784208, + "loss": 2.9746, + "step": 3296 + }, + { + "epoch": 0.2984520684348692, + "grad_norm": 0.8735789060592651, + "learning_rate": 0.0001528034165293839, + "loss": 3.0311, + "step": 3297 + }, + { + "epoch": 0.29854259074861955, + "grad_norm": 0.8365547060966492, + "learning_rate": 0.0001527764652536231, + "loss": 2.9659, + "step": 3298 + }, + { + "epoch": 0.2986331130623699, + "grad_norm": 0.8127195239067078, + "learning_rate": 0.0001527495086632737, + "loss": 3.0082, + "step": 3299 + }, + { + "epoch": 0.29872363537612023, + "grad_norm": 1.0759278535842896, + "learning_rate": 0.00015272254676105025, + "loss": 3.0071, + "step": 3300 + }, + { + "epoch": 0.2988141576898706, + "grad_norm": 0.8265242576599121, + "learning_rate": 0.00015269557954966778, + "loss": 2.9862, + "step": 3301 + }, + { + "epoch": 0.2989046800036209, + "grad_norm": 0.8545429110527039, + "learning_rate": 0.00015266860703184188, + "loss": 2.998, + "step": 3302 + }, + { + "epoch": 0.29899520231737126, + "grad_norm": 0.9787036776542664, + "learning_rate": 0.00015264162921028865, + "loss": 2.9452, + "step": 3303 + }, + { + "epoch": 0.2990857246311216, + "grad_norm": 0.8430793881416321, + "learning_rate": 0.00015261464608772488, + "loss": 2.9786, + "step": 3304 + }, + { + "epoch": 0.29917624694487194, + "grad_norm": 0.9696670770645142, + "learning_rate": 0.00015258765766686761, + "loss": 3.0475, + "step": 3305 + }, + { + "epoch": 0.2992667692586222, + "grad_norm": 0.8284124135971069, + "learning_rate": 0.00015256066395043471, + "loss": 2.9923, + "step": 3306 + }, + { + "epoch": 0.29935729157237256, + "grad_norm": 0.8875390291213989, + "learning_rate": 0.00015253366494114435, + "loss": 2.9976, + "step": 3307 + }, + { + "epoch": 0.2994478138861229, + "grad_norm": 0.9056249856948853, + "learning_rate": 0.00015250666064171538, + "loss": 2.9238, + "step": 3308 + }, + { + "epoch": 0.29953833619987325, + "grad_norm": 0.885613739490509, + "learning_rate": 0.00015247965105486712, + "loss": 2.9875, + "step": 3309 + }, + { + "epoch": 0.2996288585136236, + "grad_norm": 0.9627782106399536, + "learning_rate": 0.00015245263618331945, + "loss": 3.0419, + "step": 3310 + }, + { + "epoch": 0.2997193808273739, + "grad_norm": 0.9762522578239441, + "learning_rate": 0.00015242561602979275, + "loss": 2.9858, + "step": 3311 + }, + { + "epoch": 0.29980990314112427, + "grad_norm": 0.9357553720474243, + "learning_rate": 0.00015239859059700794, + "loss": 2.9092, + "step": 3312 + }, + { + "epoch": 0.2999004254548746, + "grad_norm": 1.0730036497116089, + "learning_rate": 0.00015237155988768647, + "loss": 2.9937, + "step": 3313 + }, + { + "epoch": 0.29999094776862495, + "grad_norm": 0.917066752910614, + "learning_rate": 0.00015234452390455036, + "loss": 3.0989, + "step": 3314 + }, + { + "epoch": 0.3000814700823753, + "grad_norm": 1.0417499542236328, + "learning_rate": 0.00015231748265032215, + "loss": 2.9708, + "step": 3315 + }, + { + "epoch": 0.30017199239612563, + "grad_norm": 0.9708789587020874, + "learning_rate": 0.00015229043612772486, + "loss": 2.9939, + "step": 3316 + }, + { + "epoch": 0.30026251470987597, + "grad_norm": 0.9104913473129272, + "learning_rate": 0.0001522633843394821, + "loss": 2.9814, + "step": 3317 + }, + { + "epoch": 0.3003530370236263, + "grad_norm": 0.9186818599700928, + "learning_rate": 0.0001522363272883179, + "loss": 2.9769, + "step": 3318 + }, + { + "epoch": 0.30044355933737665, + "grad_norm": 0.9663811922073364, + "learning_rate": 0.00015220926497695704, + "loss": 3.058, + "step": 3319 + }, + { + "epoch": 0.300534081651127, + "grad_norm": 0.9680636525154114, + "learning_rate": 0.0001521821974081246, + "loss": 2.9354, + "step": 3320 + }, + { + "epoch": 0.30062460396487733, + "grad_norm": 0.9134761095046997, + "learning_rate": 0.00015215512458454633, + "loss": 2.9999, + "step": 3321 + }, + { + "epoch": 0.3007151262786277, + "grad_norm": 0.9421324729919434, + "learning_rate": 0.0001521280465089484, + "loss": 3.0146, + "step": 3322 + }, + { + "epoch": 0.300805648592378, + "grad_norm": 0.9263170957565308, + "learning_rate": 0.00015210096318405767, + "loss": 2.972, + "step": 3323 + }, + { + "epoch": 0.30089617090612836, + "grad_norm": 0.9598850011825562, + "learning_rate": 0.00015207387461260133, + "loss": 3.07, + "step": 3324 + }, + { + "epoch": 0.3009866932198787, + "grad_norm": 0.8545998334884644, + "learning_rate": 0.00015204678079730724, + "loss": 3.0162, + "step": 3325 + }, + { + "epoch": 0.30107721553362904, + "grad_norm": 0.9190890192985535, + "learning_rate": 0.00015201968174090373, + "loss": 2.9696, + "step": 3326 + }, + { + "epoch": 0.3011677378473794, + "grad_norm": 0.942925751209259, + "learning_rate": 0.00015199257744611977, + "loss": 3.0451, + "step": 3327 + }, + { + "epoch": 0.3012582601611297, + "grad_norm": 0.8846598267555237, + "learning_rate": 0.0001519654679156846, + "loss": 2.996, + "step": 3328 + }, + { + "epoch": 0.30134878247488006, + "grad_norm": 0.8187833428382874, + "learning_rate": 0.00015193835315232827, + "loss": 2.9353, + "step": 3329 + }, + { + "epoch": 0.3014393047886304, + "grad_norm": 0.8995598554611206, + "learning_rate": 0.00015191123315878123, + "loss": 2.992, + "step": 3330 + }, + { + "epoch": 0.30152982710238074, + "grad_norm": 0.856484055519104, + "learning_rate": 0.00015188410793777436, + "loss": 2.963, + "step": 3331 + }, + { + "epoch": 0.3016203494161311, + "grad_norm": 0.9061743021011353, + "learning_rate": 0.00015185697749203926, + "loss": 2.9959, + "step": 3332 + }, + { + "epoch": 0.3017108717298814, + "grad_norm": 0.9021839499473572, + "learning_rate": 0.000151829841824308, + "loss": 2.9489, + "step": 3333 + }, + { + "epoch": 0.30180139404363177, + "grad_norm": 0.9534750580787659, + "learning_rate": 0.00015180270093731303, + "loss": 3.0352, + "step": 3334 + }, + { + "epoch": 0.3018919163573821, + "grad_norm": 0.8187622427940369, + "learning_rate": 0.00015177555483378752, + "loss": 3.0018, + "step": 3335 + }, + { + "epoch": 0.30198243867113245, + "grad_norm": 0.8136058449745178, + "learning_rate": 0.00015174840351646503, + "loss": 2.9523, + "step": 3336 + }, + { + "epoch": 0.3020729609848828, + "grad_norm": 0.8742151260375977, + "learning_rate": 0.00015172124698807973, + "loss": 2.9309, + "step": 3337 + }, + { + "epoch": 0.30216348329863313, + "grad_norm": 0.9198973774909973, + "learning_rate": 0.0001516940852513663, + "loss": 2.9981, + "step": 3338 + }, + { + "epoch": 0.30225400561238347, + "grad_norm": 0.8850584030151367, + "learning_rate": 0.00015166691830905988, + "loss": 3.0065, + "step": 3339 + }, + { + "epoch": 0.3023445279261338, + "grad_norm": 0.9615797400474548, + "learning_rate": 0.0001516397461638962, + "loss": 3.0168, + "step": 3340 + }, + { + "epoch": 0.30243505023988415, + "grad_norm": 0.8681991696357727, + "learning_rate": 0.0001516125688186115, + "loss": 3.05, + "step": 3341 + }, + { + "epoch": 0.3025255725536345, + "grad_norm": 0.8902552723884583, + "learning_rate": 0.00015158538627594253, + "loss": 3.0239, + "step": 3342 + }, + { + "epoch": 0.30261609486738483, + "grad_norm": 0.9143476486206055, + "learning_rate": 0.00015155819853862655, + "loss": 2.9265, + "step": 3343 + }, + { + "epoch": 0.3027066171811352, + "grad_norm": 0.8717308044433594, + "learning_rate": 0.00015153100560940142, + "loss": 2.9903, + "step": 3344 + }, + { + "epoch": 0.3027971394948855, + "grad_norm": 0.8563142418861389, + "learning_rate": 0.00015150380749100545, + "loss": 3.0039, + "step": 3345 + }, + { + "epoch": 0.30288766180863586, + "grad_norm": 0.9677398800849915, + "learning_rate": 0.00015147660418617743, + "loss": 2.9603, + "step": 3346 + }, + { + "epoch": 0.30297818412238614, + "grad_norm": 0.8721147775650024, + "learning_rate": 0.0001514493956976568, + "loss": 2.9895, + "step": 3347 + }, + { + "epoch": 0.3030687064361365, + "grad_norm": 0.9216533899307251, + "learning_rate": 0.00015142218202818342, + "loss": 3.0475, + "step": 3348 + }, + { + "epoch": 0.3031592287498868, + "grad_norm": 0.9659701585769653, + "learning_rate": 0.00015139496318049774, + "loss": 2.9786, + "step": 3349 + }, + { + "epoch": 0.30324975106363716, + "grad_norm": 0.8679906129837036, + "learning_rate": 0.00015136773915734066, + "loss": 3.0225, + "step": 3350 + }, + { + "epoch": 0.3033402733773875, + "grad_norm": 0.9853136539459229, + "learning_rate": 0.00015134050996145362, + "loss": 2.9994, + "step": 3351 + }, + { + "epoch": 0.30343079569113784, + "grad_norm": 1.0005536079406738, + "learning_rate": 0.00015131327559557867, + "loss": 3.0669, + "step": 3352 + }, + { + "epoch": 0.3035213180048882, + "grad_norm": 0.9564321041107178, + "learning_rate": 0.0001512860360624583, + "loss": 2.9842, + "step": 3353 + }, + { + "epoch": 0.3036118403186385, + "grad_norm": 0.9644863605499268, + "learning_rate": 0.00015125879136483543, + "loss": 2.9576, + "step": 3354 + }, + { + "epoch": 0.30370236263238887, + "grad_norm": 1.0262359380722046, + "learning_rate": 0.00015123154150545372, + "loss": 3.0161, + "step": 3355 + }, + { + "epoch": 0.3037928849461392, + "grad_norm": 0.8304927349090576, + "learning_rate": 0.00015120428648705717, + "loss": 2.9595, + "step": 3356 + }, + { + "epoch": 0.30388340725988955, + "grad_norm": 0.9257547855377197, + "learning_rate": 0.00015117702631239037, + "loss": 2.97, + "step": 3357 + }, + { + "epoch": 0.3039739295736399, + "grad_norm": 0.8537124395370483, + "learning_rate": 0.00015114976098419842, + "loss": 2.9598, + "step": 3358 + }, + { + "epoch": 0.30406445188739023, + "grad_norm": 0.8638425469398499, + "learning_rate": 0.00015112249050522694, + "loss": 2.9879, + "step": 3359 + }, + { + "epoch": 0.30415497420114057, + "grad_norm": 0.8485719561576843, + "learning_rate": 0.00015109521487822206, + "loss": 2.9358, + "step": 3360 + }, + { + "epoch": 0.3042454965148909, + "grad_norm": 0.8621719479560852, + "learning_rate": 0.00015106793410593045, + "loss": 3.004, + "step": 3361 + }, + { + "epoch": 0.30433601882864125, + "grad_norm": 0.8881914019584656, + "learning_rate": 0.00015104064819109926, + "loss": 3.0266, + "step": 3362 + }, + { + "epoch": 0.3044265411423916, + "grad_norm": 0.9412158727645874, + "learning_rate": 0.0001510133571364762, + "loss": 2.9925, + "step": 3363 + }, + { + "epoch": 0.30451706345614193, + "grad_norm": 0.8518620729446411, + "learning_rate": 0.00015098606094480948, + "loss": 2.9884, + "step": 3364 + }, + { + "epoch": 0.3046075857698923, + "grad_norm": 0.8724182844161987, + "learning_rate": 0.0001509587596188478, + "loss": 2.9559, + "step": 3365 + }, + { + "epoch": 0.3046981080836426, + "grad_norm": 0.8881733417510986, + "learning_rate": 0.00015093145316134044, + "loss": 2.9055, + "step": 3366 + }, + { + "epoch": 0.30478863039739296, + "grad_norm": 0.8453825116157532, + "learning_rate": 0.00015090414157503714, + "loss": 2.9716, + "step": 3367 + }, + { + "epoch": 0.3048791527111433, + "grad_norm": 1.0025969743728638, + "learning_rate": 0.00015087682486268817, + "loss": 2.9664, + "step": 3368 + }, + { + "epoch": 0.30496967502489364, + "grad_norm": 1.0106847286224365, + "learning_rate": 0.00015084950302704433, + "loss": 2.9872, + "step": 3369 + }, + { + "epoch": 0.305060197338644, + "grad_norm": 0.9144858717918396, + "learning_rate": 0.00015082217607085692, + "loss": 2.9294, + "step": 3370 + }, + { + "epoch": 0.3051507196523943, + "grad_norm": 0.943499743938446, + "learning_rate": 0.00015079484399687778, + "loss": 2.9154, + "step": 3371 + }, + { + "epoch": 0.30524124196614466, + "grad_norm": 0.9145709276199341, + "learning_rate": 0.00015076750680785922, + "loss": 2.9809, + "step": 3372 + }, + { + "epoch": 0.305331764279895, + "grad_norm": 1.0750885009765625, + "learning_rate": 0.00015074016450655411, + "loss": 2.9281, + "step": 3373 + }, + { + "epoch": 0.30542228659364534, + "grad_norm": 0.8634659051895142, + "learning_rate": 0.00015071281709571585, + "loss": 2.9821, + "step": 3374 + }, + { + "epoch": 0.3055128089073957, + "grad_norm": 0.9092392325401306, + "learning_rate": 0.0001506854645780983, + "loss": 2.9494, + "step": 3375 + }, + { + "epoch": 0.305603331221146, + "grad_norm": 0.9126806259155273, + "learning_rate": 0.00015065810695645584, + "loss": 2.964, + "step": 3376 + }, + { + "epoch": 0.30569385353489636, + "grad_norm": 0.9306450486183167, + "learning_rate": 0.00015063074423354338, + "loss": 2.9637, + "step": 3377 + }, + { + "epoch": 0.3057843758486467, + "grad_norm": 1.1827573776245117, + "learning_rate": 0.00015060337641211637, + "loss": 3.0492, + "step": 3378 + }, + { + "epoch": 0.30587489816239705, + "grad_norm": 0.8481791615486145, + "learning_rate": 0.00015057600349493077, + "loss": 2.9656, + "step": 3379 + }, + { + "epoch": 0.3059654204761474, + "grad_norm": 0.951055109500885, + "learning_rate": 0.000150548625484743, + "loss": 3.0363, + "step": 3380 + }, + { + "epoch": 0.30605594278989773, + "grad_norm": 0.8536085486412048, + "learning_rate": 0.00015052124238430995, + "loss": 2.9393, + "step": 3381 + }, + { + "epoch": 0.30614646510364807, + "grad_norm": 0.8343144655227661, + "learning_rate": 0.00015049385419638926, + "loss": 3.0001, + "step": 3382 + }, + { + "epoch": 0.3062369874173984, + "grad_norm": 0.9116124510765076, + "learning_rate": 0.00015046646092373883, + "loss": 3.0328, + "step": 3383 + }, + { + "epoch": 0.30632750973114875, + "grad_norm": 0.8201098442077637, + "learning_rate": 0.0001504390625691171, + "loss": 3.0223, + "step": 3384 + }, + { + "epoch": 0.3064180320448991, + "grad_norm": 0.8883117437362671, + "learning_rate": 0.0001504116591352832, + "loss": 3.0032, + "step": 3385 + }, + { + "epoch": 0.30650855435864943, + "grad_norm": 0.883526086807251, + "learning_rate": 0.0001503842506249966, + "loss": 3.0075, + "step": 3386 + }, + { + "epoch": 0.3065990766723998, + "grad_norm": 0.8869072198867798, + "learning_rate": 0.00015035683704101732, + "loss": 2.8875, + "step": 3387 + }, + { + "epoch": 0.3066895989861501, + "grad_norm": 0.8745526075363159, + "learning_rate": 0.00015032941838610597, + "loss": 3.0136, + "step": 3388 + }, + { + "epoch": 0.3067801212999004, + "grad_norm": 0.8355185389518738, + "learning_rate": 0.00015030199466302353, + "loss": 2.9815, + "step": 3389 + }, + { + "epoch": 0.30687064361365074, + "grad_norm": 0.8745567202568054, + "learning_rate": 0.0001502745658745316, + "loss": 2.9707, + "step": 3390 + }, + { + "epoch": 0.3069611659274011, + "grad_norm": 0.813612699508667, + "learning_rate": 0.0001502471320233923, + "loss": 2.9419, + "step": 3391 + }, + { + "epoch": 0.3070516882411514, + "grad_norm": 0.9350422024726868, + "learning_rate": 0.00015021969311236814, + "loss": 2.9618, + "step": 3392 + }, + { + "epoch": 0.30714221055490176, + "grad_norm": 0.888590931892395, + "learning_rate": 0.00015019224914422226, + "loss": 2.976, + "step": 3393 + }, + { + "epoch": 0.3072327328686521, + "grad_norm": 0.88670414686203, + "learning_rate": 0.00015016480012171828, + "loss": 3.0039, + "step": 3394 + }, + { + "epoch": 0.30732325518240244, + "grad_norm": 0.8592669367790222, + "learning_rate": 0.0001501373460476203, + "loss": 2.9485, + "step": 3395 + }, + { + "epoch": 0.3074137774961528, + "grad_norm": 0.9061694741249084, + "learning_rate": 0.00015010988692469295, + "loss": 2.9608, + "step": 3396 + }, + { + "epoch": 0.3075042998099031, + "grad_norm": 0.8457714319229126, + "learning_rate": 0.00015008242275570133, + "loss": 2.9619, + "step": 3397 + }, + { + "epoch": 0.30759482212365347, + "grad_norm": 0.8951268196105957, + "learning_rate": 0.00015005495354341114, + "loss": 2.9305, + "step": 3398 + }, + { + "epoch": 0.3076853444374038, + "grad_norm": 0.8539029359817505, + "learning_rate": 0.00015002747929058848, + "loss": 3.001, + "step": 3399 + }, + { + "epoch": 0.30777586675115415, + "grad_norm": 0.8860955238342285, + "learning_rate": 0.00015000000000000001, + "loss": 2.9903, + "step": 3400 + }, + { + "epoch": 0.3078663890649045, + "grad_norm": 0.8125213980674744, + "learning_rate": 0.00014997251567441293, + "loss": 2.9353, + "step": 3401 + }, + { + "epoch": 0.30795691137865483, + "grad_norm": 0.8126906156539917, + "learning_rate": 0.0001499450263165949, + "loss": 2.9851, + "step": 3402 + }, + { + "epoch": 0.30804743369240517, + "grad_norm": 0.8433899283409119, + "learning_rate": 0.00014991753192931405, + "loss": 2.9667, + "step": 3403 + }, + { + "epoch": 0.3081379560061555, + "grad_norm": 0.8762634992599487, + "learning_rate": 0.00014989003251533912, + "loss": 2.9581, + "step": 3404 + }, + { + "epoch": 0.30822847831990585, + "grad_norm": 0.891649067401886, + "learning_rate": 0.0001498625280774393, + "loss": 2.911, + "step": 3405 + }, + { + "epoch": 0.3083190006336562, + "grad_norm": 0.9227769374847412, + "learning_rate": 0.00014983501861838425, + "loss": 2.9672, + "step": 3406 + }, + { + "epoch": 0.30840952294740653, + "grad_norm": 0.9137094020843506, + "learning_rate": 0.00014980750414094417, + "loss": 2.9415, + "step": 3407 + }, + { + "epoch": 0.3085000452611569, + "grad_norm": 0.878205418586731, + "learning_rate": 0.00014977998464788984, + "loss": 2.9455, + "step": 3408 + }, + { + "epoch": 0.3085905675749072, + "grad_norm": 0.8257315754890442, + "learning_rate": 0.00014975246014199237, + "loss": 2.9471, + "step": 3409 + }, + { + "epoch": 0.30868108988865756, + "grad_norm": 0.8512585163116455, + "learning_rate": 0.00014972493062602354, + "loss": 3.0183, + "step": 3410 + }, + { + "epoch": 0.3087716122024079, + "grad_norm": 0.9373469352722168, + "learning_rate": 0.00014969739610275556, + "loss": 2.9832, + "step": 3411 + }, + { + "epoch": 0.30886213451615824, + "grad_norm": 0.8257470726966858, + "learning_rate": 0.00014966985657496114, + "loss": 2.9597, + "step": 3412 + }, + { + "epoch": 0.3089526568299086, + "grad_norm": 0.8489750027656555, + "learning_rate": 0.00014964231204541353, + "loss": 3.0061, + "step": 3413 + }, + { + "epoch": 0.3090431791436589, + "grad_norm": 0.8522926568984985, + "learning_rate": 0.00014961476251688645, + "loss": 2.9881, + "step": 3414 + }, + { + "epoch": 0.30913370145740926, + "grad_norm": 0.901701807975769, + "learning_rate": 0.00014958720799215414, + "loss": 3.0353, + "step": 3415 + }, + { + "epoch": 0.3092242237711596, + "grad_norm": 0.8479433655738831, + "learning_rate": 0.00014955964847399137, + "loss": 2.9602, + "step": 3416 + }, + { + "epoch": 0.30931474608490994, + "grad_norm": 0.9179433584213257, + "learning_rate": 0.00014953208396517332, + "loss": 3.0362, + "step": 3417 + }, + { + "epoch": 0.3094052683986603, + "grad_norm": 0.8743851780891418, + "learning_rate": 0.00014950451446847578, + "loss": 2.9541, + "step": 3418 + }, + { + "epoch": 0.3094957907124106, + "grad_norm": 0.8852507472038269, + "learning_rate": 0.00014947693998667498, + "loss": 3.0251, + "step": 3419 + }, + { + "epoch": 0.30958631302616096, + "grad_norm": 1.0363515615463257, + "learning_rate": 0.0001494493605225477, + "loss": 2.9889, + "step": 3420 + }, + { + "epoch": 0.3096768353399113, + "grad_norm": 0.7981639504432678, + "learning_rate": 0.00014942177607887115, + "loss": 2.9148, + "step": 3421 + }, + { + "epoch": 0.30976735765366165, + "grad_norm": 0.962907075881958, + "learning_rate": 0.0001493941866584231, + "loss": 2.9303, + "step": 3422 + }, + { + "epoch": 0.309857879967412, + "grad_norm": 0.8996175527572632, + "learning_rate": 0.00014936659226398181, + "loss": 2.994, + "step": 3423 + }, + { + "epoch": 0.3099484022811623, + "grad_norm": 0.9445270299911499, + "learning_rate": 0.00014933899289832603, + "loss": 2.9394, + "step": 3424 + }, + { + "epoch": 0.31003892459491267, + "grad_norm": 0.9439915418624878, + "learning_rate": 0.00014931138856423502, + "loss": 2.9735, + "step": 3425 + }, + { + "epoch": 0.310129446908663, + "grad_norm": 0.813269853591919, + "learning_rate": 0.00014928377926448855, + "loss": 2.9808, + "step": 3426 + }, + { + "epoch": 0.31021996922241335, + "grad_norm": 0.861720621585846, + "learning_rate": 0.0001492561650018668, + "loss": 3.0155, + "step": 3427 + }, + { + "epoch": 0.3103104915361637, + "grad_norm": 1.0294265747070312, + "learning_rate": 0.00014922854577915064, + "loss": 2.9706, + "step": 3428 + }, + { + "epoch": 0.31040101384991403, + "grad_norm": 0.9049494862556458, + "learning_rate": 0.00014920092159912126, + "loss": 2.96, + "step": 3429 + }, + { + "epoch": 0.3104915361636643, + "grad_norm": 1.1088683605194092, + "learning_rate": 0.0001491732924645604, + "loss": 3.0453, + "step": 3430 + }, + { + "epoch": 0.31058205847741466, + "grad_norm": 0.9710911512374878, + "learning_rate": 0.00014914565837825032, + "loss": 2.9452, + "step": 3431 + }, + { + "epoch": 0.310672580791165, + "grad_norm": 0.9654075503349304, + "learning_rate": 0.00014911801934297381, + "loss": 2.9662, + "step": 3432 + }, + { + "epoch": 0.31076310310491534, + "grad_norm": 0.8625348806381226, + "learning_rate": 0.00014909037536151409, + "loss": 2.9671, + "step": 3433 + }, + { + "epoch": 0.3108536254186657, + "grad_norm": 0.9648244976997375, + "learning_rate": 0.0001490627264366549, + "loss": 2.9945, + "step": 3434 + }, + { + "epoch": 0.310944147732416, + "grad_norm": 0.937522828578949, + "learning_rate": 0.00014903507257118054, + "loss": 2.919, + "step": 3435 + }, + { + "epoch": 0.31103467004616636, + "grad_norm": 0.8191124200820923, + "learning_rate": 0.0001490074137678757, + "loss": 2.8935, + "step": 3436 + }, + { + "epoch": 0.3111251923599167, + "grad_norm": 0.9017074704170227, + "learning_rate": 0.0001489797500295256, + "loss": 3.0027, + "step": 3437 + }, + { + "epoch": 0.31121571467366704, + "grad_norm": 0.9712589383125305, + "learning_rate": 0.00014895208135891604, + "loss": 2.9805, + "step": 3438 + }, + { + "epoch": 0.3113062369874174, + "grad_norm": 0.9253582954406738, + "learning_rate": 0.00014892440775883322, + "loss": 3.0006, + "step": 3439 + }, + { + "epoch": 0.3113967593011677, + "grad_norm": 0.9078521132469177, + "learning_rate": 0.0001488967292320639, + "loss": 2.9813, + "step": 3440 + }, + { + "epoch": 0.31148728161491807, + "grad_norm": 0.8458201289176941, + "learning_rate": 0.00014886904578139526, + "loss": 3.0009, + "step": 3441 + }, + { + "epoch": 0.3115778039286684, + "grad_norm": 0.9842881560325623, + "learning_rate": 0.00014884135740961504, + "loss": 3.0177, + "step": 3442 + }, + { + "epoch": 0.31166832624241875, + "grad_norm": 0.8262262940406799, + "learning_rate": 0.00014881366411951147, + "loss": 2.9914, + "step": 3443 + }, + { + "epoch": 0.3117588485561691, + "grad_norm": 0.8502604961395264, + "learning_rate": 0.0001487859659138733, + "loss": 2.9886, + "step": 3444 + }, + { + "epoch": 0.31184937086991943, + "grad_norm": 1.1262938976287842, + "learning_rate": 0.00014875826279548963, + "loss": 3.0346, + "step": 3445 + }, + { + "epoch": 0.31193989318366977, + "grad_norm": 1.0363510847091675, + "learning_rate": 0.0001487305547671503, + "loss": 2.9927, + "step": 3446 + }, + { + "epoch": 0.3120304154974201, + "grad_norm": 1.2626910209655762, + "learning_rate": 0.00014870284183164537, + "loss": 2.9711, + "step": 3447 + }, + { + "epoch": 0.31212093781117045, + "grad_norm": 1.3493363857269287, + "learning_rate": 0.00014867512399176563, + "loss": 3.0042, + "step": 3448 + }, + { + "epoch": 0.3122114601249208, + "grad_norm": 0.9481455087661743, + "learning_rate": 0.00014864740125030226, + "loss": 3.0049, + "step": 3449 + }, + { + "epoch": 0.31230198243867113, + "grad_norm": 1.1836961507797241, + "learning_rate": 0.00014861967361004687, + "loss": 2.9519, + "step": 3450 + }, + { + "epoch": 0.3123925047524215, + "grad_norm": 0.9749188423156738, + "learning_rate": 0.0001485919410737917, + "loss": 2.9565, + "step": 3451 + }, + { + "epoch": 0.3124830270661718, + "grad_norm": 0.8708198070526123, + "learning_rate": 0.0001485642036443294, + "loss": 3.0315, + "step": 3452 + }, + { + "epoch": 0.31257354937992216, + "grad_norm": 1.0292015075683594, + "learning_rate": 0.00014853646132445311, + "loss": 2.9577, + "step": 3453 + }, + { + "epoch": 0.3126640716936725, + "grad_norm": 0.9962379336357117, + "learning_rate": 0.0001485087141169565, + "loss": 2.9129, + "step": 3454 + }, + { + "epoch": 0.31275459400742284, + "grad_norm": 0.8681389093399048, + "learning_rate": 0.00014848096202463372, + "loss": 2.9596, + "step": 3455 + }, + { + "epoch": 0.3128451163211732, + "grad_norm": 0.9183157682418823, + "learning_rate": 0.00014845320505027938, + "loss": 2.9673, + "step": 3456 + }, + { + "epoch": 0.3129356386349235, + "grad_norm": 0.9543888568878174, + "learning_rate": 0.0001484254431966886, + "loss": 3.0068, + "step": 3457 + }, + { + "epoch": 0.31302616094867386, + "grad_norm": 0.8184690475463867, + "learning_rate": 0.000148397676466657, + "loss": 2.9318, + "step": 3458 + }, + { + "epoch": 0.3131166832624242, + "grad_norm": 0.9538193941116333, + "learning_rate": 0.00014836990486298077, + "loss": 3.0353, + "step": 3459 + }, + { + "epoch": 0.31320720557617454, + "grad_norm": 0.9387879371643066, + "learning_rate": 0.00014834212838845637, + "loss": 3.0407, + "step": 3460 + }, + { + "epoch": 0.3132977278899249, + "grad_norm": 0.9548669457435608, + "learning_rate": 0.00014831434704588102, + "loss": 2.9379, + "step": 3461 + }, + { + "epoch": 0.3133882502036752, + "grad_norm": 0.9051340222358704, + "learning_rate": 0.0001482865608380522, + "loss": 2.9099, + "step": 3462 + }, + { + "epoch": 0.31347877251742556, + "grad_norm": 0.8997891545295715, + "learning_rate": 0.00014825876976776807, + "loss": 2.979, + "step": 3463 + }, + { + "epoch": 0.3135692948311759, + "grad_norm": 0.9646556377410889, + "learning_rate": 0.00014823097383782715, + "loss": 3.0219, + "step": 3464 + }, + { + "epoch": 0.31365981714492625, + "grad_norm": 0.8742055296897888, + "learning_rate": 0.0001482031730510284, + "loss": 2.9746, + "step": 3465 + }, + { + "epoch": 0.3137503394586766, + "grad_norm": 0.8171219229698181, + "learning_rate": 0.00014817536741017152, + "loss": 2.9259, + "step": 3466 + }, + { + "epoch": 0.3138408617724269, + "grad_norm": 0.9034594893455505, + "learning_rate": 0.00014814755691805646, + "loss": 2.9577, + "step": 3467 + }, + { + "epoch": 0.31393138408617727, + "grad_norm": 0.8481171727180481, + "learning_rate": 0.00014811974157748372, + "loss": 2.9841, + "step": 3468 + }, + { + "epoch": 0.3140219063999276, + "grad_norm": 1.0891640186309814, + "learning_rate": 0.0001480919213912543, + "loss": 2.9582, + "step": 3469 + }, + { + "epoch": 0.31411242871367795, + "grad_norm": 0.9174221754074097, + "learning_rate": 0.00014806409636216973, + "loss": 2.9967, + "step": 3470 + }, + { + "epoch": 0.31420295102742823, + "grad_norm": 0.9914358258247375, + "learning_rate": 0.00014803626649303198, + "loss": 3.0575, + "step": 3471 + }, + { + "epoch": 0.3142934733411786, + "grad_norm": 0.8691591620445251, + "learning_rate": 0.0001480084317866435, + "loss": 2.9155, + "step": 3472 + }, + { + "epoch": 0.3143839956549289, + "grad_norm": 1.0562714338302612, + "learning_rate": 0.00014798059224580725, + "loss": 2.954, + "step": 3473 + }, + { + "epoch": 0.31447451796867926, + "grad_norm": 0.9047598242759705, + "learning_rate": 0.00014795274787332666, + "loss": 3.0319, + "step": 3474 + }, + { + "epoch": 0.3145650402824296, + "grad_norm": 0.9135189652442932, + "learning_rate": 0.0001479248986720057, + "loss": 2.9636, + "step": 3475 + }, + { + "epoch": 0.31465556259617994, + "grad_norm": 0.8667458891868591, + "learning_rate": 0.00014789704464464873, + "loss": 2.9684, + "step": 3476 + }, + { + "epoch": 0.3147460849099303, + "grad_norm": 0.9025455713272095, + "learning_rate": 0.0001478691857940607, + "loss": 2.9914, + "step": 3477 + }, + { + "epoch": 0.3148366072236806, + "grad_norm": 0.8761173486709595, + "learning_rate": 0.00014784132212304694, + "loss": 2.908, + "step": 3478 + }, + { + "epoch": 0.31492712953743096, + "grad_norm": 0.946856677532196, + "learning_rate": 0.0001478134536344134, + "loss": 2.9857, + "step": 3479 + }, + { + "epoch": 0.3150176518511813, + "grad_norm": 0.9196282029151917, + "learning_rate": 0.00014778558033096633, + "loss": 2.9855, + "step": 3480 + }, + { + "epoch": 0.31510817416493164, + "grad_norm": 0.9204413890838623, + "learning_rate": 0.00014775770221551264, + "loss": 3.0459, + "step": 3481 + }, + { + "epoch": 0.315198696478682, + "grad_norm": 0.971332848072052, + "learning_rate": 0.00014772981929085964, + "loss": 3.0045, + "step": 3482 + }, + { + "epoch": 0.3152892187924323, + "grad_norm": 0.8783162236213684, + "learning_rate": 0.0001477019315598152, + "loss": 2.9549, + "step": 3483 + }, + { + "epoch": 0.31537974110618266, + "grad_norm": 0.9069380164146423, + "learning_rate": 0.0001476740390251875, + "loss": 2.9589, + "step": 3484 + }, + { + "epoch": 0.315470263419933, + "grad_norm": 1.194395899772644, + "learning_rate": 0.0001476461416897854, + "loss": 3.0065, + "step": 3485 + }, + { + "epoch": 0.31556078573368335, + "grad_norm": 0.8641033172607422, + "learning_rate": 0.00014761823955641813, + "loss": 3.0169, + "step": 3486 + }, + { + "epoch": 0.3156513080474337, + "grad_norm": 1.0002996921539307, + "learning_rate": 0.00014759033262789543, + "loss": 2.9833, + "step": 3487 + }, + { + "epoch": 0.31574183036118403, + "grad_norm": 0.9187266230583191, + "learning_rate": 0.00014756242090702756, + "loss": 2.956, + "step": 3488 + }, + { + "epoch": 0.31583235267493437, + "grad_norm": 0.9133673906326294, + "learning_rate": 0.00014753450439662516, + "loss": 3.0216, + "step": 3489 + }, + { + "epoch": 0.3159228749886847, + "grad_norm": 0.9382879734039307, + "learning_rate": 0.0001475065830994995, + "loss": 2.9997, + "step": 3490 + }, + { + "epoch": 0.31601339730243505, + "grad_norm": 1.1736315488815308, + "learning_rate": 0.00014747865701846222, + "loss": 3.2377, + "step": 3491 + }, + { + "epoch": 0.3161039196161854, + "grad_norm": 0.9480122327804565, + "learning_rate": 0.00014745072615632548, + "loss": 2.9493, + "step": 3492 + }, + { + "epoch": 0.31619444192993573, + "grad_norm": 0.9033775925636292, + "learning_rate": 0.0001474227905159019, + "loss": 2.9793, + "step": 3493 + }, + { + "epoch": 0.3162849642436861, + "grad_norm": 0.9714937210083008, + "learning_rate": 0.00014739485010000465, + "loss": 2.9726, + "step": 3494 + }, + { + "epoch": 0.3163754865574364, + "grad_norm": 1.0207247734069824, + "learning_rate": 0.00014736690491144725, + "loss": 2.986, + "step": 3495 + }, + { + "epoch": 0.31646600887118675, + "grad_norm": 0.9080100655555725, + "learning_rate": 0.0001473389549530438, + "loss": 2.9687, + "step": 3496 + }, + { + "epoch": 0.3165565311849371, + "grad_norm": 0.9480652809143066, + "learning_rate": 0.00014731100022760893, + "loss": 2.9902, + "step": 3497 + }, + { + "epoch": 0.31664705349868744, + "grad_norm": 0.9133635759353638, + "learning_rate": 0.00014728304073795764, + "loss": 2.9709, + "step": 3498 + }, + { + "epoch": 0.3167375758124378, + "grad_norm": 0.9235483407974243, + "learning_rate": 0.00014725507648690543, + "loss": 3.0183, + "step": 3499 + }, + { + "epoch": 0.3168280981261881, + "grad_norm": 0.9264606833457947, + "learning_rate": 0.0001472271074772683, + "loss": 2.9804, + "step": 3500 + }, + { + "epoch": 0.31691862043993846, + "grad_norm": 0.8027808666229248, + "learning_rate": 0.00014719913371186275, + "loss": 2.9111, + "step": 3501 + }, + { + "epoch": 0.3170091427536888, + "grad_norm": 0.8905692100524902, + "learning_rate": 0.00014717115519350567, + "loss": 2.9191, + "step": 3502 + }, + { + "epoch": 0.31709966506743914, + "grad_norm": 1.0128405094146729, + "learning_rate": 0.00014714317192501461, + "loss": 2.9664, + "step": 3503 + }, + { + "epoch": 0.3171901873811895, + "grad_norm": 0.8433849811553955, + "learning_rate": 0.00014711518390920743, + "loss": 2.9312, + "step": 3504 + }, + { + "epoch": 0.3172807096949398, + "grad_norm": 0.9650546908378601, + "learning_rate": 0.00014708719114890252, + "loss": 2.9772, + "step": 3505 + }, + { + "epoch": 0.31737123200869016, + "grad_norm": 0.8981730341911316, + "learning_rate": 0.0001470591936469187, + "loss": 2.986, + "step": 3506 + }, + { + "epoch": 0.3174617543224405, + "grad_norm": 0.8478082418441772, + "learning_rate": 0.00014703119140607542, + "loss": 3.0154, + "step": 3507 + }, + { + "epoch": 0.31755227663619084, + "grad_norm": 0.9572350382804871, + "learning_rate": 0.00014700318442919242, + "loss": 3.0304, + "step": 3508 + }, + { + "epoch": 0.3176427989499412, + "grad_norm": 0.8729496598243713, + "learning_rate": 0.00014697517271909007, + "loss": 2.9723, + "step": 3509 + }, + { + "epoch": 0.3177333212636915, + "grad_norm": 0.9278503060340881, + "learning_rate": 0.00014694715627858908, + "loss": 2.9353, + "step": 3510 + }, + { + "epoch": 0.31782384357744187, + "grad_norm": 0.9096142649650574, + "learning_rate": 0.00014691913511051077, + "loss": 2.9696, + "step": 3511 + }, + { + "epoch": 0.31791436589119215, + "grad_norm": 1.0005139112472534, + "learning_rate": 0.0001468911092176768, + "loss": 2.9708, + "step": 3512 + }, + { + "epoch": 0.3180048882049425, + "grad_norm": 0.8990221619606018, + "learning_rate": 0.00014686307860290945, + "loss": 2.9606, + "step": 3513 + }, + { + "epoch": 0.31809541051869283, + "grad_norm": 0.8603647351264954, + "learning_rate": 0.00014683504326903134, + "loss": 3.0335, + "step": 3514 + }, + { + "epoch": 0.3181859328324432, + "grad_norm": 0.9636383652687073, + "learning_rate": 0.00014680700321886568, + "loss": 2.9187, + "step": 3515 + }, + { + "epoch": 0.3182764551461935, + "grad_norm": 0.9207581877708435, + "learning_rate": 0.00014677895845523607, + "loss": 2.9667, + "step": 3516 + }, + { + "epoch": 0.31836697745994386, + "grad_norm": 0.8631317019462585, + "learning_rate": 0.0001467509089809666, + "loss": 2.9814, + "step": 3517 + }, + { + "epoch": 0.3184574997736942, + "grad_norm": 0.836303174495697, + "learning_rate": 0.0001467228547988819, + "loss": 2.9347, + "step": 3518 + }, + { + "epoch": 0.31854802208744454, + "grad_norm": 0.8611830472946167, + "learning_rate": 0.00014669479591180695, + "loss": 2.9619, + "step": 3519 + }, + { + "epoch": 0.3186385444011949, + "grad_norm": 0.8925729990005493, + "learning_rate": 0.00014666673232256738, + "loss": 2.9209, + "step": 3520 + }, + { + "epoch": 0.3187290667149452, + "grad_norm": 0.8913902640342712, + "learning_rate": 0.00014663866403398913, + "loss": 2.9936, + "step": 3521 + }, + { + "epoch": 0.31881958902869556, + "grad_norm": 0.8865054249763489, + "learning_rate": 0.00014661059104889866, + "loss": 3.0011, + "step": 3522 + }, + { + "epoch": 0.3189101113424459, + "grad_norm": 0.8239423632621765, + "learning_rate": 0.000146582513370123, + "loss": 2.9246, + "step": 3523 + }, + { + "epoch": 0.31900063365619624, + "grad_norm": 0.9283320307731628, + "learning_rate": 0.00014655443100048952, + "loss": 2.9693, + "step": 3524 + }, + { + "epoch": 0.3190911559699466, + "grad_norm": 0.88014155626297, + "learning_rate": 0.00014652634394282608, + "loss": 2.8746, + "step": 3525 + }, + { + "epoch": 0.3191816782836969, + "grad_norm": 1.0238566398620605, + "learning_rate": 0.00014649825219996106, + "loss": 2.9485, + "step": 3526 + }, + { + "epoch": 0.31927220059744726, + "grad_norm": 0.8470163941383362, + "learning_rate": 0.00014647015577472333, + "loss": 2.9672, + "step": 3527 + }, + { + "epoch": 0.3193627229111976, + "grad_norm": 0.8591601252555847, + "learning_rate": 0.0001464420546699422, + "loss": 2.8937, + "step": 3528 + }, + { + "epoch": 0.31945324522494795, + "grad_norm": 0.9024593830108643, + "learning_rate": 0.00014641394888844744, + "loss": 2.9542, + "step": 3529 + }, + { + "epoch": 0.3195437675386983, + "grad_norm": 0.901153564453125, + "learning_rate": 0.00014638583843306927, + "loss": 3.0101, + "step": 3530 + }, + { + "epoch": 0.3196342898524486, + "grad_norm": 0.8467450141906738, + "learning_rate": 0.0001463577233066385, + "loss": 2.9043, + "step": 3531 + }, + { + "epoch": 0.31972481216619897, + "grad_norm": 1.0799063444137573, + "learning_rate": 0.00014632960351198618, + "loss": 2.9837, + "step": 3532 + }, + { + "epoch": 0.3198153344799493, + "grad_norm": 0.9393935799598694, + "learning_rate": 0.0001463014790519441, + "loss": 2.9321, + "step": 3533 + }, + { + "epoch": 0.31990585679369965, + "grad_norm": 0.9979540109634399, + "learning_rate": 0.0001462733499293443, + "loss": 3.016, + "step": 3534 + }, + { + "epoch": 0.31999637910745, + "grad_norm": 1.284339189529419, + "learning_rate": 0.0001462452161470195, + "loss": 2.9151, + "step": 3535 + }, + { + "epoch": 0.32008690142120033, + "grad_norm": 0.9282693862915039, + "learning_rate": 0.00014621707770780268, + "loss": 2.9544, + "step": 3536 + }, + { + "epoch": 0.3201774237349507, + "grad_norm": 0.9534181356430054, + "learning_rate": 0.00014618893461452735, + "loss": 2.9185, + "step": 3537 + }, + { + "epoch": 0.320267946048701, + "grad_norm": 0.9282907247543335, + "learning_rate": 0.0001461607868700276, + "loss": 2.8928, + "step": 3538 + }, + { + "epoch": 0.32035846836245135, + "grad_norm": 0.8647146224975586, + "learning_rate": 0.0001461326344771379, + "loss": 2.9864, + "step": 3539 + }, + { + "epoch": 0.3204489906762017, + "grad_norm": 0.9271777868270874, + "learning_rate": 0.00014610447743869314, + "loss": 2.9487, + "step": 3540 + }, + { + "epoch": 0.32053951298995204, + "grad_norm": 0.9051327109336853, + "learning_rate": 0.00014607631575752877, + "loss": 2.8988, + "step": 3541 + }, + { + "epoch": 0.3206300353037024, + "grad_norm": 0.9088444113731384, + "learning_rate": 0.00014604814943648065, + "loss": 2.9941, + "step": 3542 + }, + { + "epoch": 0.3207205576174527, + "grad_norm": 0.8992854952812195, + "learning_rate": 0.00014601997847838518, + "loss": 2.9576, + "step": 3543 + }, + { + "epoch": 0.32081107993120306, + "grad_norm": 0.9406589865684509, + "learning_rate": 0.0001459918028860791, + "loss": 2.9269, + "step": 3544 + }, + { + "epoch": 0.3209016022449534, + "grad_norm": 0.821314811706543, + "learning_rate": 0.00014596362266239973, + "loss": 2.9641, + "step": 3545 + }, + { + "epoch": 0.32099212455870374, + "grad_norm": 0.873015284538269, + "learning_rate": 0.00014593543781018484, + "loss": 2.9098, + "step": 3546 + }, + { + "epoch": 0.3210826468724541, + "grad_norm": 0.8742039203643799, + "learning_rate": 0.0001459072483322726, + "loss": 3.0273, + "step": 3547 + }, + { + "epoch": 0.3211731691862044, + "grad_norm": 0.861594021320343, + "learning_rate": 0.0001458790542315017, + "loss": 2.9947, + "step": 3548 + }, + { + "epoch": 0.32126369149995476, + "grad_norm": 0.8861513733863831, + "learning_rate": 0.00014585085551071134, + "loss": 2.967, + "step": 3549 + }, + { + "epoch": 0.3213542138137051, + "grad_norm": 0.8987154960632324, + "learning_rate": 0.00014582265217274104, + "loss": 2.9136, + "step": 3550 + }, + { + "epoch": 0.32144473612745544, + "grad_norm": 0.8634107708930969, + "learning_rate": 0.00014579444422043093, + "loss": 2.9147, + "step": 3551 + }, + { + "epoch": 0.3215352584412058, + "grad_norm": 0.9557790160179138, + "learning_rate": 0.00014576623165662157, + "loss": 3.0121, + "step": 3552 + }, + { + "epoch": 0.32162578075495607, + "grad_norm": 0.8912983536720276, + "learning_rate": 0.00014573801448415389, + "loss": 2.9428, + "step": 3553 + }, + { + "epoch": 0.3217163030687064, + "grad_norm": 0.8403054475784302, + "learning_rate": 0.00014570979270586945, + "loss": 2.917, + "step": 3554 + }, + { + "epoch": 0.32180682538245675, + "grad_norm": 0.8585752844810486, + "learning_rate": 0.00014568156632461008, + "loss": 2.9762, + "step": 3555 + }, + { + "epoch": 0.3218973476962071, + "grad_norm": 0.9195684790611267, + "learning_rate": 0.00014565333534321826, + "loss": 2.8837, + "step": 3556 + }, + { + "epoch": 0.32198787000995743, + "grad_norm": 0.8550272583961487, + "learning_rate": 0.00014562509976453683, + "loss": 2.9835, + "step": 3557 + }, + { + "epoch": 0.3220783923237078, + "grad_norm": 0.8897809982299805, + "learning_rate": 0.00014559685959140907, + "loss": 2.8815, + "step": 3558 + }, + { + "epoch": 0.3221689146374581, + "grad_norm": 0.8674122095108032, + "learning_rate": 0.00014556861482667883, + "loss": 2.9212, + "step": 3559 + }, + { + "epoch": 0.32225943695120846, + "grad_norm": 0.8612362146377563, + "learning_rate": 0.00014554036547319033, + "loss": 2.9564, + "step": 3560 + }, + { + "epoch": 0.3223499592649588, + "grad_norm": 0.8690678477287292, + "learning_rate": 0.00014551211153378824, + "loss": 3.0016, + "step": 3561 + }, + { + "epoch": 0.32244048157870914, + "grad_norm": 0.9328231811523438, + "learning_rate": 0.0001454838530113178, + "loss": 3.0187, + "step": 3562 + }, + { + "epoch": 0.3225310038924595, + "grad_norm": 0.8388855457305908, + "learning_rate": 0.00014545558990862456, + "loss": 2.9387, + "step": 3563 + }, + { + "epoch": 0.3226215262062098, + "grad_norm": 0.806565523147583, + "learning_rate": 0.00014542732222855469, + "loss": 2.9359, + "step": 3564 + }, + { + "epoch": 0.32271204851996016, + "grad_norm": 0.892017126083374, + "learning_rate": 0.00014539904997395468, + "loss": 2.8981, + "step": 3565 + }, + { + "epoch": 0.3228025708337105, + "grad_norm": 0.837681770324707, + "learning_rate": 0.00014537077314767162, + "loss": 2.9683, + "step": 3566 + }, + { + "epoch": 0.32289309314746084, + "grad_norm": 0.8649284839630127, + "learning_rate": 0.00014534249175255295, + "loss": 2.9524, + "step": 3567 + }, + { + "epoch": 0.3229836154612112, + "grad_norm": 0.842377245426178, + "learning_rate": 0.00014531420579144656, + "loss": 2.9824, + "step": 3568 + }, + { + "epoch": 0.3230741377749615, + "grad_norm": 0.8362056016921997, + "learning_rate": 0.0001452859152672009, + "loss": 2.9602, + "step": 3569 + }, + { + "epoch": 0.32316466008871186, + "grad_norm": 0.8706377148628235, + "learning_rate": 0.00014525762018266483, + "loss": 3.0274, + "step": 3570 + }, + { + "epoch": 0.3232551824024622, + "grad_norm": 0.8478174805641174, + "learning_rate": 0.00014522932054068766, + "loss": 2.9577, + "step": 3571 + }, + { + "epoch": 0.32334570471621255, + "grad_norm": 0.8531762957572937, + "learning_rate": 0.0001452010163441191, + "loss": 2.9567, + "step": 3572 + }, + { + "epoch": 0.3234362270299629, + "grad_norm": 0.9084886908531189, + "learning_rate": 0.00014517270759580947, + "loss": 2.9484, + "step": 3573 + }, + { + "epoch": 0.3235267493437132, + "grad_norm": 0.803013026714325, + "learning_rate": 0.00014514439429860943, + "loss": 2.9553, + "step": 3574 + }, + { + "epoch": 0.32361727165746357, + "grad_norm": 0.8305002450942993, + "learning_rate": 0.0001451160764553701, + "loss": 2.9813, + "step": 3575 + }, + { + "epoch": 0.3237077939712139, + "grad_norm": 0.8824440240859985, + "learning_rate": 0.00014508775406894307, + "loss": 3.0241, + "step": 3576 + }, + { + "epoch": 0.32379831628496425, + "grad_norm": 0.8341369032859802, + "learning_rate": 0.0001450594271421805, + "loss": 2.9415, + "step": 3577 + }, + { + "epoch": 0.3238888385987146, + "grad_norm": 0.8482344150543213, + "learning_rate": 0.00014503109567793481, + "loss": 2.9265, + "step": 3578 + }, + { + "epoch": 0.32397936091246493, + "grad_norm": 0.8310097455978394, + "learning_rate": 0.00014500275967905903, + "loss": 2.9018, + "step": 3579 + }, + { + "epoch": 0.32406988322621527, + "grad_norm": 0.8658933639526367, + "learning_rate": 0.0001449744191484066, + "loss": 3.0183, + "step": 3580 + }, + { + "epoch": 0.3241604055399656, + "grad_norm": 0.8318408131599426, + "learning_rate": 0.00014494607408883136, + "loss": 2.9002, + "step": 3581 + }, + { + "epoch": 0.32425092785371595, + "grad_norm": 0.9183574914932251, + "learning_rate": 0.00014491772450318772, + "loss": 2.9481, + "step": 3582 + }, + { + "epoch": 0.3243414501674663, + "grad_norm": 0.9491264820098877, + "learning_rate": 0.0001448893703943304, + "loss": 2.9774, + "step": 3583 + }, + { + "epoch": 0.32443197248121664, + "grad_norm": 0.9588972330093384, + "learning_rate": 0.00014486101176511474, + "loss": 2.9391, + "step": 3584 + }, + { + "epoch": 0.324522494794967, + "grad_norm": 0.8931008577346802, + "learning_rate": 0.00014483264861839644, + "loss": 2.9383, + "step": 3585 + }, + { + "epoch": 0.3246130171087173, + "grad_norm": 1.0485310554504395, + "learning_rate": 0.00014480428095703165, + "loss": 2.9579, + "step": 3586 + }, + { + "epoch": 0.32470353942246766, + "grad_norm": 0.9095520973205566, + "learning_rate": 0.00014477590878387696, + "loss": 2.9845, + "step": 3587 + }, + { + "epoch": 0.324794061736218, + "grad_norm": 0.9263955950737, + "learning_rate": 0.00014474753210178953, + "loss": 2.9714, + "step": 3588 + }, + { + "epoch": 0.32488458404996834, + "grad_norm": 0.8608377575874329, + "learning_rate": 0.00014471915091362683, + "loss": 2.9234, + "step": 3589 + }, + { + "epoch": 0.3249751063637187, + "grad_norm": 0.8173804879188538, + "learning_rate": 0.0001446907652222468, + "loss": 2.9445, + "step": 3590 + }, + { + "epoch": 0.325065628677469, + "grad_norm": 0.9364265203475952, + "learning_rate": 0.000144662375030508, + "loss": 2.9693, + "step": 3591 + }, + { + "epoch": 0.32515615099121936, + "grad_norm": 0.8866042494773865, + "learning_rate": 0.0001446339803412692, + "loss": 2.9631, + "step": 3592 + }, + { + "epoch": 0.3252466733049697, + "grad_norm": 0.9168328642845154, + "learning_rate": 0.00014460558115738986, + "loss": 2.9383, + "step": 3593 + }, + { + "epoch": 0.32533719561872, + "grad_norm": 1.059483528137207, + "learning_rate": 0.0001445771774817297, + "loss": 2.9121, + "step": 3594 + }, + { + "epoch": 0.32542771793247033, + "grad_norm": 0.8932848572731018, + "learning_rate": 0.00014454876931714895, + "loss": 2.8651, + "step": 3595 + }, + { + "epoch": 0.32551824024622067, + "grad_norm": 1.0029690265655518, + "learning_rate": 0.0001445203566665084, + "loss": 2.9751, + "step": 3596 + }, + { + "epoch": 0.325608762559971, + "grad_norm": 0.8788452744483948, + "learning_rate": 0.00014449193953266912, + "loss": 2.9459, + "step": 3597 + }, + { + "epoch": 0.32569928487372135, + "grad_norm": 1.2407371997833252, + "learning_rate": 0.00014446351791849276, + "loss": 3.045, + "step": 3598 + }, + { + "epoch": 0.3257898071874717, + "grad_norm": 0.8738275766372681, + "learning_rate": 0.00014443509182684135, + "loss": 2.9144, + "step": 3599 + }, + { + "epoch": 0.32588032950122203, + "grad_norm": 1.4616142511367798, + "learning_rate": 0.00014440666126057744, + "loss": 2.9866, + "step": 3600 + }, + { + "epoch": 0.3259708518149724, + "grad_norm": 0.9846639633178711, + "learning_rate": 0.00014437822622256392, + "loss": 2.8977, + "step": 3601 + }, + { + "epoch": 0.3260613741287227, + "grad_norm": 0.8873237371444702, + "learning_rate": 0.00014434978671566424, + "loss": 2.944, + "step": 3602 + }, + { + "epoch": 0.32615189644247305, + "grad_norm": 1.063632607460022, + "learning_rate": 0.00014432134274274224, + "loss": 2.9393, + "step": 3603 + }, + { + "epoch": 0.3262424187562234, + "grad_norm": 0.8502988815307617, + "learning_rate": 0.00014429289430666227, + "loss": 2.9085, + "step": 3604 + }, + { + "epoch": 0.32633294106997374, + "grad_norm": 0.8328250646591187, + "learning_rate": 0.00014426444141028906, + "loss": 2.892, + "step": 3605 + }, + { + "epoch": 0.3264234633837241, + "grad_norm": 0.9032594561576843, + "learning_rate": 0.00014423598405648776, + "loss": 2.9523, + "step": 3606 + }, + { + "epoch": 0.3265139856974744, + "grad_norm": 0.8867715001106262, + "learning_rate": 0.0001442075222481241, + "loss": 2.913, + "step": 3607 + }, + { + "epoch": 0.32660450801122476, + "grad_norm": 0.9769021272659302, + "learning_rate": 0.0001441790559880642, + "loss": 3.0404, + "step": 3608 + }, + { + "epoch": 0.3266950303249751, + "grad_norm": 0.9831045866012573, + "learning_rate": 0.00014415058527917452, + "loss": 2.949, + "step": 3609 + }, + { + "epoch": 0.32678555263872544, + "grad_norm": 0.8027966618537903, + "learning_rate": 0.00014412211012432212, + "loss": 2.9648, + "step": 3610 + }, + { + "epoch": 0.3268760749524758, + "grad_norm": 0.9449635744094849, + "learning_rate": 0.00014409363052637447, + "loss": 2.9716, + "step": 3611 + }, + { + "epoch": 0.3269665972662261, + "grad_norm": 0.9377633929252625, + "learning_rate": 0.00014406514648819942, + "loss": 2.9289, + "step": 3612 + }, + { + "epoch": 0.32705711957997646, + "grad_norm": 0.971062183380127, + "learning_rate": 0.0001440366580126653, + "loss": 3.0385, + "step": 3613 + }, + { + "epoch": 0.3271476418937268, + "grad_norm": 0.9378071427345276, + "learning_rate": 0.00014400816510264092, + "loss": 2.9413, + "step": 3614 + }, + { + "epoch": 0.32723816420747714, + "grad_norm": 0.8906871676445007, + "learning_rate": 0.00014397966776099556, + "loss": 2.9619, + "step": 3615 + }, + { + "epoch": 0.3273286865212275, + "grad_norm": 0.8794304728507996, + "learning_rate": 0.0001439511659905988, + "loss": 2.9532, + "step": 3616 + }, + { + "epoch": 0.3274192088349778, + "grad_norm": 0.9370628595352173, + "learning_rate": 0.0001439226597943209, + "loss": 2.978, + "step": 3617 + }, + { + "epoch": 0.32750973114872817, + "grad_norm": 0.9121912717819214, + "learning_rate": 0.0001438941491750323, + "loss": 2.938, + "step": 3618 + }, + { + "epoch": 0.3276002534624785, + "grad_norm": 0.8118641376495361, + "learning_rate": 0.00014386563413560413, + "loss": 2.986, + "step": 3619 + }, + { + "epoch": 0.32769077577622885, + "grad_norm": 0.9029416441917419, + "learning_rate": 0.00014383711467890774, + "loss": 2.9635, + "step": 3620 + }, + { + "epoch": 0.3277812980899792, + "grad_norm": 0.8763449192047119, + "learning_rate": 0.00014380859080781515, + "loss": 2.9133, + "step": 3621 + }, + { + "epoch": 0.32787182040372953, + "grad_norm": 0.881456732749939, + "learning_rate": 0.00014378006252519865, + "loss": 2.9245, + "step": 3622 + }, + { + "epoch": 0.32796234271747987, + "grad_norm": 0.8821005821228027, + "learning_rate": 0.00014375152983393106, + "loss": 2.9826, + "step": 3623 + }, + { + "epoch": 0.3280528650312302, + "grad_norm": 0.9180141091346741, + "learning_rate": 0.00014372299273688558, + "loss": 2.9819, + "step": 3624 + }, + { + "epoch": 0.32814338734498055, + "grad_norm": 0.9080764651298523, + "learning_rate": 0.00014369445123693596, + "loss": 3.0095, + "step": 3625 + }, + { + "epoch": 0.3282339096587309, + "grad_norm": 0.903130054473877, + "learning_rate": 0.0001436659053369563, + "loss": 2.902, + "step": 3626 + }, + { + "epoch": 0.32832443197248123, + "grad_norm": 0.8723325729370117, + "learning_rate": 0.00014363735503982114, + "loss": 2.9421, + "step": 3627 + }, + { + "epoch": 0.3284149542862316, + "grad_norm": 0.9515482187271118, + "learning_rate": 0.00014360880034840554, + "loss": 2.9796, + "step": 3628 + }, + { + "epoch": 0.3285054765999819, + "grad_norm": 0.8937515616416931, + "learning_rate": 0.00014358024126558493, + "loss": 2.9437, + "step": 3629 + }, + { + "epoch": 0.32859599891373226, + "grad_norm": 0.8733628392219543, + "learning_rate": 0.00014355167779423524, + "loss": 2.9753, + "step": 3630 + }, + { + "epoch": 0.3286865212274826, + "grad_norm": 0.8773425221443176, + "learning_rate": 0.00014352310993723277, + "loss": 2.9253, + "step": 3631 + }, + { + "epoch": 0.32877704354123294, + "grad_norm": 0.8728827238082886, + "learning_rate": 0.0001434945376974543, + "loss": 2.9234, + "step": 3632 + }, + { + "epoch": 0.3288675658549833, + "grad_norm": 0.8740129470825195, + "learning_rate": 0.0001434659610777771, + "loss": 2.98, + "step": 3633 + }, + { + "epoch": 0.3289580881687336, + "grad_norm": 0.831460177898407, + "learning_rate": 0.0001434373800810788, + "loss": 2.9551, + "step": 3634 + }, + { + "epoch": 0.3290486104824839, + "grad_norm": 0.8963636755943298, + "learning_rate": 0.0001434087947102375, + "loss": 2.9985, + "step": 3635 + }, + { + "epoch": 0.32913913279623425, + "grad_norm": 0.9177209138870239, + "learning_rate": 0.0001433802049681318, + "loss": 2.9544, + "step": 3636 + }, + { + "epoch": 0.3292296551099846, + "grad_norm": 0.8798718452453613, + "learning_rate": 0.00014335161085764062, + "loss": 2.9751, + "step": 3637 + }, + { + "epoch": 0.3293201774237349, + "grad_norm": 0.8863023519515991, + "learning_rate": 0.00014332301238164342, + "loss": 2.8911, + "step": 3638 + }, + { + "epoch": 0.32941069973748527, + "grad_norm": 0.8351691961288452, + "learning_rate": 0.00014329440954302005, + "loss": 2.9239, + "step": 3639 + }, + { + "epoch": 0.3295012220512356, + "grad_norm": 0.8428190350532532, + "learning_rate": 0.00014326580234465085, + "loss": 2.9803, + "step": 3640 + }, + { + "epoch": 0.32959174436498595, + "grad_norm": 0.8503945469856262, + "learning_rate": 0.0001432371907894165, + "loss": 2.9194, + "step": 3641 + }, + { + "epoch": 0.3296822666787363, + "grad_norm": 0.9099516868591309, + "learning_rate": 0.00014320857488019824, + "loss": 2.9307, + "step": 3642 + }, + { + "epoch": 0.32977278899248663, + "grad_norm": 0.9085094928741455, + "learning_rate": 0.00014317995461987767, + "loss": 2.9746, + "step": 3643 + }, + { + "epoch": 0.329863311306237, + "grad_norm": 0.9462329149246216, + "learning_rate": 0.00014315133001133686, + "loss": 2.9018, + "step": 3644 + }, + { + "epoch": 0.3299538336199873, + "grad_norm": 0.9889357686042786, + "learning_rate": 0.0001431227010574583, + "loss": 3.0249, + "step": 3645 + }, + { + "epoch": 0.33004435593373765, + "grad_norm": 0.8533375859260559, + "learning_rate": 0.0001430940677611249, + "loss": 2.8802, + "step": 3646 + }, + { + "epoch": 0.330134878247488, + "grad_norm": 0.9745096564292908, + "learning_rate": 0.00014306543012522005, + "loss": 2.9591, + "step": 3647 + }, + { + "epoch": 0.33022540056123834, + "grad_norm": 0.843706488609314, + "learning_rate": 0.0001430367881526276, + "loss": 3.0582, + "step": 3648 + }, + { + "epoch": 0.3303159228749887, + "grad_norm": 1.0291938781738281, + "learning_rate": 0.00014300814184623172, + "loss": 2.9939, + "step": 3649 + }, + { + "epoch": 0.330406445188739, + "grad_norm": 1.0679194927215576, + "learning_rate": 0.00014297949120891718, + "loss": 2.9759, + "step": 3650 + }, + { + "epoch": 0.33049696750248936, + "grad_norm": 0.8248839974403381, + "learning_rate": 0.00014295083624356903, + "loss": 2.9712, + "step": 3651 + }, + { + "epoch": 0.3305874898162397, + "grad_norm": 1.0173978805541992, + "learning_rate": 0.00014292217695307285, + "loss": 2.9606, + "step": 3652 + }, + { + "epoch": 0.33067801212999004, + "grad_norm": 0.9261618852615356, + "learning_rate": 0.0001428935133403146, + "loss": 2.9399, + "step": 3653 + }, + { + "epoch": 0.3307685344437404, + "grad_norm": 1.108538031578064, + "learning_rate": 0.00014286484540818078, + "loss": 3.0039, + "step": 3654 + }, + { + "epoch": 0.3308590567574907, + "grad_norm": 0.9498209953308105, + "learning_rate": 0.00014283617315955814, + "loss": 2.9224, + "step": 3655 + }, + { + "epoch": 0.33094957907124106, + "grad_norm": 0.8508312106132507, + "learning_rate": 0.00014280749659733406, + "loss": 2.8872, + "step": 3656 + }, + { + "epoch": 0.3310401013849914, + "grad_norm": 0.9958304762840271, + "learning_rate": 0.00014277881572439627, + "loss": 2.9212, + "step": 3657 + }, + { + "epoch": 0.33113062369874174, + "grad_norm": 0.8798844814300537, + "learning_rate": 0.00014275013054363287, + "loss": 2.9737, + "step": 3658 + }, + { + "epoch": 0.3312211460124921, + "grad_norm": 0.913406252861023, + "learning_rate": 0.0001427214410579325, + "loss": 2.8793, + "step": 3659 + }, + { + "epoch": 0.3313116683262424, + "grad_norm": 0.8229892253875732, + "learning_rate": 0.0001426927472701842, + "loss": 3.0067, + "step": 3660 + }, + { + "epoch": 0.33140219063999277, + "grad_norm": 0.9121760725975037, + "learning_rate": 0.00014266404918327743, + "loss": 2.9986, + "step": 3661 + }, + { + "epoch": 0.3314927129537431, + "grad_norm": 0.875041127204895, + "learning_rate": 0.00014263534680010204, + "loss": 2.934, + "step": 3662 + }, + { + "epoch": 0.33158323526749345, + "grad_norm": 0.8997043967247009, + "learning_rate": 0.00014260664012354842, + "loss": 2.9247, + "step": 3663 + }, + { + "epoch": 0.3316737575812438, + "grad_norm": 0.8942994475364685, + "learning_rate": 0.00014257792915650728, + "loss": 2.9651, + "step": 3664 + }, + { + "epoch": 0.33176427989499413, + "grad_norm": 0.8819488286972046, + "learning_rate": 0.00014254921390186986, + "loss": 2.9339, + "step": 3665 + }, + { + "epoch": 0.33185480220874447, + "grad_norm": 0.8902394771575928, + "learning_rate": 0.00014252049436252776, + "loss": 2.8851, + "step": 3666 + }, + { + "epoch": 0.3319453245224948, + "grad_norm": 0.876203179359436, + "learning_rate": 0.00014249177054137305, + "loss": 2.9535, + "step": 3667 + }, + { + "epoch": 0.33203584683624515, + "grad_norm": 0.8925275802612305, + "learning_rate": 0.00014246304244129818, + "loss": 2.9907, + "step": 3668 + }, + { + "epoch": 0.3321263691499955, + "grad_norm": 0.8686583042144775, + "learning_rate": 0.00014243431006519613, + "loss": 2.9287, + "step": 3669 + }, + { + "epoch": 0.33221689146374583, + "grad_norm": 0.939652681350708, + "learning_rate": 0.00014240557341596018, + "loss": 2.9866, + "step": 3670 + }, + { + "epoch": 0.3323074137774962, + "grad_norm": 1.0006009340286255, + "learning_rate": 0.00014237683249648418, + "loss": 3.0263, + "step": 3671 + }, + { + "epoch": 0.3323979360912465, + "grad_norm": 0.8614761233329773, + "learning_rate": 0.0001423480873096623, + "loss": 2.9568, + "step": 3672 + }, + { + "epoch": 0.33248845840499686, + "grad_norm": 0.9231300950050354, + "learning_rate": 0.00014231933785838918, + "loss": 2.9747, + "step": 3673 + }, + { + "epoch": 0.3325789807187472, + "grad_norm": 0.9635558128356934, + "learning_rate": 0.0001422905841455599, + "loss": 2.927, + "step": 3674 + }, + { + "epoch": 0.33266950303249754, + "grad_norm": 0.8838689923286438, + "learning_rate": 0.00014226182617406996, + "loss": 2.9381, + "step": 3675 + }, + { + "epoch": 0.3327600253462478, + "grad_norm": 0.9083494544029236, + "learning_rate": 0.00014223306394681528, + "loss": 2.9369, + "step": 3676 + }, + { + "epoch": 0.33285054765999816, + "grad_norm": 0.9969063997268677, + "learning_rate": 0.00014220429746669222, + "loss": 2.9549, + "step": 3677 + }, + { + "epoch": 0.3329410699737485, + "grad_norm": 0.8796403408050537, + "learning_rate": 0.00014217552673659754, + "loss": 2.9092, + "step": 3678 + }, + { + "epoch": 0.33303159228749885, + "grad_norm": 0.8336253762245178, + "learning_rate": 0.00014214675175942847, + "loss": 2.9268, + "step": 3679 + }, + { + "epoch": 0.3331221146012492, + "grad_norm": 0.9689157009124756, + "learning_rate": 0.00014211797253808268, + "loss": 2.971, + "step": 3680 + }, + { + "epoch": 0.3332126369149995, + "grad_norm": 0.8374845385551453, + "learning_rate": 0.00014208918907545817, + "loss": 2.9561, + "step": 3681 + }, + { + "epoch": 0.33330315922874987, + "grad_norm": 0.9589225053787231, + "learning_rate": 0.00014206040137445348, + "loss": 2.9519, + "step": 3682 + }, + { + "epoch": 0.3333936815425002, + "grad_norm": 0.977763295173645, + "learning_rate": 0.00014203160943796755, + "loss": 2.9803, + "step": 3683 + }, + { + "epoch": 0.33348420385625055, + "grad_norm": 0.8360752463340759, + "learning_rate": 0.0001420028132688997, + "loss": 2.9539, + "step": 3684 + }, + { + "epoch": 0.3335747261700009, + "grad_norm": 0.9216536283493042, + "learning_rate": 0.0001419740128701497, + "loss": 2.8518, + "step": 3685 + }, + { + "epoch": 0.33366524848375123, + "grad_norm": 0.9173371195793152, + "learning_rate": 0.00014194520824461771, + "loss": 3.0058, + "step": 3686 + }, + { + "epoch": 0.33375577079750157, + "grad_norm": 0.841460645198822, + "learning_rate": 0.00014191639939520442, + "loss": 2.9655, + "step": 3687 + }, + { + "epoch": 0.3338462931112519, + "grad_norm": 1.026754379272461, + "learning_rate": 0.0001418875863248109, + "loss": 2.9974, + "step": 3688 + }, + { + "epoch": 0.33393681542500225, + "grad_norm": 0.8212361335754395, + "learning_rate": 0.0001418587690363385, + "loss": 2.8847, + "step": 3689 + }, + { + "epoch": 0.3340273377387526, + "grad_norm": 0.8820496201515198, + "learning_rate": 0.00014182994753268927, + "loss": 2.938, + "step": 3690 + }, + { + "epoch": 0.33411786005250294, + "grad_norm": 1.0030783414840698, + "learning_rate": 0.0001418011218167655, + "loss": 2.9654, + "step": 3691 + }, + { + "epoch": 0.3342083823662533, + "grad_norm": 0.9057952165603638, + "learning_rate": 0.00014177229189146984, + "loss": 2.8773, + "step": 3692 + }, + { + "epoch": 0.3342989046800036, + "grad_norm": 0.8676903247833252, + "learning_rate": 0.00014174345775970552, + "loss": 2.9207, + "step": 3693 + }, + { + "epoch": 0.33438942699375396, + "grad_norm": 0.9278954863548279, + "learning_rate": 0.0001417146194243762, + "loss": 2.9738, + "step": 3694 + }, + { + "epoch": 0.3344799493075043, + "grad_norm": 0.8062622547149658, + "learning_rate": 0.0001416857768883858, + "loss": 2.8546, + "step": 3695 + }, + { + "epoch": 0.33457047162125464, + "grad_norm": 0.8296931385993958, + "learning_rate": 0.00014165693015463885, + "loss": 2.9236, + "step": 3696 + }, + { + "epoch": 0.334660993935005, + "grad_norm": 0.8529160022735596, + "learning_rate": 0.00014162807922604012, + "loss": 2.9099, + "step": 3697 + }, + { + "epoch": 0.3347515162487553, + "grad_norm": 0.876424252986908, + "learning_rate": 0.00014159922410549497, + "loss": 2.958, + "step": 3698 + }, + { + "epoch": 0.33484203856250566, + "grad_norm": 0.920254647731781, + "learning_rate": 0.00014157036479590913, + "loss": 2.9123, + "step": 3699 + }, + { + "epoch": 0.334932560876256, + "grad_norm": 0.9651416540145874, + "learning_rate": 0.00014154150130018866, + "loss": 2.9118, + "step": 3700 + }, + { + "epoch": 0.33502308319000634, + "grad_norm": 0.8106426000595093, + "learning_rate": 0.00014151263362124013, + "loss": 2.9436, + "step": 3701 + }, + { + "epoch": 0.3351136055037567, + "grad_norm": 0.9680935144424438, + "learning_rate": 0.00014148376176197056, + "loss": 3.0007, + "step": 3702 + }, + { + "epoch": 0.335204127817507, + "grad_norm": 0.831698477268219, + "learning_rate": 0.0001414548857252873, + "loss": 2.8877, + "step": 3703 + }, + { + "epoch": 0.33529465013125737, + "grad_norm": 0.8796342015266418, + "learning_rate": 0.00014142600551409818, + "loss": 2.9198, + "step": 3704 + }, + { + "epoch": 0.3353851724450077, + "grad_norm": 0.9696304798126221, + "learning_rate": 0.00014139712113131144, + "loss": 2.9221, + "step": 3705 + }, + { + "epoch": 0.33547569475875805, + "grad_norm": 0.8577775955200195, + "learning_rate": 0.00014136823257983577, + "loss": 2.9389, + "step": 3706 + }, + { + "epoch": 0.3355662170725084, + "grad_norm": 0.9382533431053162, + "learning_rate": 0.00014133933986258017, + "loss": 2.8726, + "step": 3707 + }, + { + "epoch": 0.33565673938625873, + "grad_norm": 1.0489027500152588, + "learning_rate": 0.0001413104429824542, + "loss": 2.9656, + "step": 3708 + }, + { + "epoch": 0.33574726170000907, + "grad_norm": 0.8827459216117859, + "learning_rate": 0.0001412815419423677, + "loss": 2.9129, + "step": 3709 + }, + { + "epoch": 0.3358377840137594, + "grad_norm": 1.0175361633300781, + "learning_rate": 0.00014125263674523114, + "loss": 2.9383, + "step": 3710 + }, + { + "epoch": 0.33592830632750975, + "grad_norm": 1.0674699544906616, + "learning_rate": 0.00014122372739395513, + "loss": 2.9685, + "step": 3711 + }, + { + "epoch": 0.3360188286412601, + "grad_norm": 0.9076451659202576, + "learning_rate": 0.0001411948138914509, + "loss": 2.942, + "step": 3712 + }, + { + "epoch": 0.33610935095501043, + "grad_norm": 0.9945937395095825, + "learning_rate": 0.00014116589624063006, + "loss": 2.9611, + "step": 3713 + }, + { + "epoch": 0.3361998732687608, + "grad_norm": 1.0060627460479736, + "learning_rate": 0.0001411369744444046, + "loss": 2.9929, + "step": 3714 + }, + { + "epoch": 0.3362903955825111, + "grad_norm": 0.8711893558502197, + "learning_rate": 0.0001411080485056869, + "loss": 2.9307, + "step": 3715 + }, + { + "epoch": 0.33638091789626146, + "grad_norm": 0.9552099704742432, + "learning_rate": 0.00014107911842738987, + "loss": 2.961, + "step": 3716 + }, + { + "epoch": 0.33647144021001174, + "grad_norm": 1.0096842050552368, + "learning_rate": 0.00014105018421242674, + "loss": 2.8867, + "step": 3717 + }, + { + "epoch": 0.3365619625237621, + "grad_norm": 0.9284431338310242, + "learning_rate": 0.0001410212458637112, + "loss": 2.9696, + "step": 3718 + }, + { + "epoch": 0.3366524848375124, + "grad_norm": 0.9499095678329468, + "learning_rate": 0.00014099230338415728, + "loss": 2.9788, + "step": 3719 + }, + { + "epoch": 0.33674300715126276, + "grad_norm": 0.8471880555152893, + "learning_rate": 0.00014096335677667954, + "loss": 2.9692, + "step": 3720 + }, + { + "epoch": 0.3368335294650131, + "grad_norm": 0.8633290529251099, + "learning_rate": 0.00014093440604419293, + "loss": 2.9557, + "step": 3721 + }, + { + "epoch": 0.33692405177876344, + "grad_norm": 0.855199933052063, + "learning_rate": 0.00014090545118961272, + "loss": 2.8918, + "step": 3722 + }, + { + "epoch": 0.3370145740925138, + "grad_norm": 0.9451314806938171, + "learning_rate": 0.0001408764922158547, + "loss": 2.9487, + "step": 3723 + }, + { + "epoch": 0.3371050964062641, + "grad_norm": 0.8324365615844727, + "learning_rate": 0.00014084752912583504, + "loss": 2.894, + "step": 3724 + }, + { + "epoch": 0.33719561872001447, + "grad_norm": 0.9008909463882446, + "learning_rate": 0.00014081856192247033, + "loss": 2.9572, + "step": 3725 + }, + { + "epoch": 0.3372861410337648, + "grad_norm": 0.8862847089767456, + "learning_rate": 0.0001407895906086775, + "loss": 2.9045, + "step": 3726 + }, + { + "epoch": 0.33737666334751515, + "grad_norm": 0.8587580919265747, + "learning_rate": 0.00014076061518737405, + "loss": 2.9436, + "step": 3727 + }, + { + "epoch": 0.3374671856612655, + "grad_norm": 0.8281906843185425, + "learning_rate": 0.00014073163566147775, + "loss": 2.9422, + "step": 3728 + }, + { + "epoch": 0.33755770797501583, + "grad_norm": 0.9020146727561951, + "learning_rate": 0.00014070265203390686, + "loss": 2.9748, + "step": 3729 + }, + { + "epoch": 0.33764823028876617, + "grad_norm": 0.9331706166267395, + "learning_rate": 0.00014067366430758004, + "loss": 2.9166, + "step": 3730 + }, + { + "epoch": 0.3377387526025165, + "grad_norm": 0.8349210023880005, + "learning_rate": 0.00014064467248541633, + "loss": 2.8796, + "step": 3731 + }, + { + "epoch": 0.33782927491626685, + "grad_norm": 0.9263269305229187, + "learning_rate": 0.00014061567657033518, + "loss": 2.9703, + "step": 3732 + }, + { + "epoch": 0.3379197972300172, + "grad_norm": 0.9281939268112183, + "learning_rate": 0.00014058667656525654, + "loss": 2.8827, + "step": 3733 + }, + { + "epoch": 0.33801031954376753, + "grad_norm": 0.9000682234764099, + "learning_rate": 0.0001405576724731007, + "loss": 2.915, + "step": 3734 + }, + { + "epoch": 0.3381008418575179, + "grad_norm": 0.8951552510261536, + "learning_rate": 0.00014052866429678832, + "loss": 2.9386, + "step": 3735 + }, + { + "epoch": 0.3381913641712682, + "grad_norm": 0.8973798155784607, + "learning_rate": 0.00014049965203924054, + "loss": 2.9231, + "step": 3736 + }, + { + "epoch": 0.33828188648501856, + "grad_norm": 0.9233877658843994, + "learning_rate": 0.00014047063570337895, + "loss": 2.9131, + "step": 3737 + }, + { + "epoch": 0.3383724087987689, + "grad_norm": 0.7996513843536377, + "learning_rate": 0.00014044161529212543, + "loss": 2.9451, + "step": 3738 + }, + { + "epoch": 0.33846293111251924, + "grad_norm": 0.9365185499191284, + "learning_rate": 0.00014041259080840236, + "loss": 2.9335, + "step": 3739 + }, + { + "epoch": 0.3385534534262696, + "grad_norm": 0.8419017791748047, + "learning_rate": 0.00014038356225513248, + "loss": 2.9522, + "step": 3740 + }, + { + "epoch": 0.3386439757400199, + "grad_norm": 0.8480751514434814, + "learning_rate": 0.00014035452963523902, + "loss": 2.9226, + "step": 3741 + }, + { + "epoch": 0.33873449805377026, + "grad_norm": 0.8409445881843567, + "learning_rate": 0.00014032549295164552, + "loss": 2.9347, + "step": 3742 + }, + { + "epoch": 0.3388250203675206, + "grad_norm": 0.8422466516494751, + "learning_rate": 0.00014029645220727595, + "loss": 2.8973, + "step": 3743 + }, + { + "epoch": 0.33891554268127094, + "grad_norm": 0.7969680428504944, + "learning_rate": 0.0001402674074050548, + "loss": 2.8855, + "step": 3744 + }, + { + "epoch": 0.3390060649950213, + "grad_norm": 0.8092168569564819, + "learning_rate": 0.0001402383585479068, + "loss": 2.94, + "step": 3745 + }, + { + "epoch": 0.3390965873087716, + "grad_norm": 0.8328232765197754, + "learning_rate": 0.0001402093056387572, + "loss": 2.8787, + "step": 3746 + }, + { + "epoch": 0.33918710962252197, + "grad_norm": 0.8564672470092773, + "learning_rate": 0.0001401802486805316, + "loss": 2.9456, + "step": 3747 + }, + { + "epoch": 0.3392776319362723, + "grad_norm": 0.8365373015403748, + "learning_rate": 0.00014015118767615606, + "loss": 2.933, + "step": 3748 + }, + { + "epoch": 0.33936815425002265, + "grad_norm": 0.8401426076889038, + "learning_rate": 0.00014012212262855706, + "loss": 2.8474, + "step": 3749 + }, + { + "epoch": 0.339458676563773, + "grad_norm": 0.8071068525314331, + "learning_rate": 0.00014009305354066137, + "loss": 2.9493, + "step": 3750 + }, + { + "epoch": 0.33954919887752333, + "grad_norm": 0.8695021867752075, + "learning_rate": 0.0001400639804153963, + "loss": 2.9483, + "step": 3751 + }, + { + "epoch": 0.33963972119127367, + "grad_norm": 0.81166672706604, + "learning_rate": 0.00014003490325568954, + "loss": 2.9202, + "step": 3752 + }, + { + "epoch": 0.339730243505024, + "grad_norm": 0.8853387832641602, + "learning_rate": 0.00014000582206446905, + "loss": 2.976, + "step": 3753 + }, + { + "epoch": 0.33982076581877435, + "grad_norm": 0.8278989791870117, + "learning_rate": 0.0001399767368446634, + "loss": 2.9156, + "step": 3754 + }, + { + "epoch": 0.3399112881325247, + "grad_norm": 0.8867419958114624, + "learning_rate": 0.00013994764759920142, + "loss": 2.9845, + "step": 3755 + }, + { + "epoch": 0.34000181044627503, + "grad_norm": 0.8510578870773315, + "learning_rate": 0.00013991855433101246, + "loss": 2.95, + "step": 3756 + }, + { + "epoch": 0.3400923327600254, + "grad_norm": 0.8585345149040222, + "learning_rate": 0.00013988945704302615, + "loss": 2.8936, + "step": 3757 + }, + { + "epoch": 0.34018285507377566, + "grad_norm": 0.854981005191803, + "learning_rate": 0.0001398603557381726, + "loss": 2.9409, + "step": 3758 + }, + { + "epoch": 0.340273377387526, + "grad_norm": 0.8987625241279602, + "learning_rate": 0.00013983125041938232, + "loss": 2.9962, + "step": 3759 + }, + { + "epoch": 0.34036389970127634, + "grad_norm": 0.9021677374839783, + "learning_rate": 0.00013980214108958624, + "loss": 2.8906, + "step": 3760 + }, + { + "epoch": 0.3404544220150267, + "grad_norm": 0.9355793595314026, + "learning_rate": 0.0001397730277517156, + "loss": 2.9823, + "step": 3761 + }, + { + "epoch": 0.340544944328777, + "grad_norm": 0.8212774395942688, + "learning_rate": 0.00013974391040870218, + "loss": 2.927, + "step": 3762 + }, + { + "epoch": 0.34063546664252736, + "grad_norm": 0.9005094170570374, + "learning_rate": 0.00013971478906347806, + "loss": 2.9531, + "step": 3763 + }, + { + "epoch": 0.3407259889562777, + "grad_norm": 1.0443588495254517, + "learning_rate": 0.0001396856637189758, + "loss": 2.9933, + "step": 3764 + }, + { + "epoch": 0.34081651127002804, + "grad_norm": 0.8536207675933838, + "learning_rate": 0.00013965653437812825, + "loss": 2.9121, + "step": 3765 + }, + { + "epoch": 0.3409070335837784, + "grad_norm": 0.8683392405509949, + "learning_rate": 0.00013962740104386876, + "loss": 2.9358, + "step": 3766 + }, + { + "epoch": 0.3409975558975287, + "grad_norm": 0.8488772511482239, + "learning_rate": 0.00013959826371913111, + "loss": 2.9209, + "step": 3767 + }, + { + "epoch": 0.34108807821127907, + "grad_norm": 0.9025128483772278, + "learning_rate": 0.0001395691224068494, + "loss": 2.9691, + "step": 3768 + }, + { + "epoch": 0.3411786005250294, + "grad_norm": 0.878285825252533, + "learning_rate": 0.0001395399771099581, + "loss": 2.9542, + "step": 3769 + }, + { + "epoch": 0.34126912283877975, + "grad_norm": 0.8517298698425293, + "learning_rate": 0.0001395108278313922, + "loss": 2.9952, + "step": 3770 + }, + { + "epoch": 0.3413596451525301, + "grad_norm": 0.919712483882904, + "learning_rate": 0.00013948167457408702, + "loss": 3.0014, + "step": 3771 + }, + { + "epoch": 0.34145016746628043, + "grad_norm": 0.8718852400779724, + "learning_rate": 0.00013945251734097828, + "loss": 2.9527, + "step": 3772 + }, + { + "epoch": 0.34154068978003077, + "grad_norm": 0.8577459454536438, + "learning_rate": 0.00013942335613500214, + "loss": 2.918, + "step": 3773 + }, + { + "epoch": 0.3416312120937811, + "grad_norm": 0.8327313661575317, + "learning_rate": 0.00013939419095909512, + "loss": 2.9399, + "step": 3774 + }, + { + "epoch": 0.34172173440753145, + "grad_norm": 0.9073922038078308, + "learning_rate": 0.00013936502181619416, + "loss": 2.9598, + "step": 3775 + }, + { + "epoch": 0.3418122567212818, + "grad_norm": 0.8576645851135254, + "learning_rate": 0.00013933584870923655, + "loss": 2.9237, + "step": 3776 + }, + { + "epoch": 0.34190277903503213, + "grad_norm": 0.8886789679527283, + "learning_rate": 0.00013930667164116004, + "loss": 2.958, + "step": 3777 + }, + { + "epoch": 0.3419933013487825, + "grad_norm": 0.8276694416999817, + "learning_rate": 0.0001392774906149028, + "loss": 2.9362, + "step": 3778 + }, + { + "epoch": 0.3420838236625328, + "grad_norm": 0.8300824761390686, + "learning_rate": 0.00013924830563340334, + "loss": 2.9551, + "step": 3779 + }, + { + "epoch": 0.34217434597628316, + "grad_norm": 0.8122133612632751, + "learning_rate": 0.00013921911669960055, + "loss": 2.9439, + "step": 3780 + }, + { + "epoch": 0.3422648682900335, + "grad_norm": 0.8210098743438721, + "learning_rate": 0.0001391899238164338, + "loss": 2.9274, + "step": 3781 + }, + { + "epoch": 0.34235539060378384, + "grad_norm": 0.8149298429489136, + "learning_rate": 0.00013916072698684278, + "loss": 2.9039, + "step": 3782 + }, + { + "epoch": 0.3424459129175342, + "grad_norm": 0.8439550399780273, + "learning_rate": 0.00013913152621376765, + "loss": 2.9896, + "step": 3783 + }, + { + "epoch": 0.3425364352312845, + "grad_norm": 0.8266587257385254, + "learning_rate": 0.00013910232150014885, + "loss": 2.9002, + "step": 3784 + }, + { + "epoch": 0.34262695754503486, + "grad_norm": 0.8575969934463501, + "learning_rate": 0.00013907311284892736, + "loss": 2.9798, + "step": 3785 + }, + { + "epoch": 0.3427174798587852, + "grad_norm": 0.8559625148773193, + "learning_rate": 0.00013904390026304452, + "loss": 2.9167, + "step": 3786 + }, + { + "epoch": 0.34280800217253554, + "grad_norm": 0.8668503761291504, + "learning_rate": 0.00013901468374544196, + "loss": 2.8825, + "step": 3787 + }, + { + "epoch": 0.3428985244862859, + "grad_norm": 0.8820399045944214, + "learning_rate": 0.00013898546329906182, + "loss": 2.9226, + "step": 3788 + }, + { + "epoch": 0.3429890468000362, + "grad_norm": 0.9047626256942749, + "learning_rate": 0.00013895623892684657, + "loss": 2.9645, + "step": 3789 + }, + { + "epoch": 0.34307956911378656, + "grad_norm": 0.9389455318450928, + "learning_rate": 0.00013892701063173918, + "loss": 2.8754, + "step": 3790 + }, + { + "epoch": 0.3431700914275369, + "grad_norm": 0.8222225904464722, + "learning_rate": 0.00013889777841668283, + "loss": 2.8993, + "step": 3791 + }, + { + "epoch": 0.34326061374128725, + "grad_norm": 0.8395286798477173, + "learning_rate": 0.00013886854228462126, + "loss": 2.9675, + "step": 3792 + }, + { + "epoch": 0.3433511360550376, + "grad_norm": 0.8453222513198853, + "learning_rate": 0.00013883930223849858, + "loss": 2.8964, + "step": 3793 + }, + { + "epoch": 0.3434416583687879, + "grad_norm": 0.8446892499923706, + "learning_rate": 0.00013881005828125922, + "loss": 2.8557, + "step": 3794 + }, + { + "epoch": 0.34353218068253827, + "grad_norm": 0.8886775970458984, + "learning_rate": 0.00013878081041584803, + "loss": 2.9072, + "step": 3795 + }, + { + "epoch": 0.3436227029962886, + "grad_norm": 0.8726206421852112, + "learning_rate": 0.0001387515586452103, + "loss": 2.9519, + "step": 3796 + }, + { + "epoch": 0.34371322531003895, + "grad_norm": 0.8880523443222046, + "learning_rate": 0.00013872230297229168, + "loss": 2.88, + "step": 3797 + }, + { + "epoch": 0.3438037476237893, + "grad_norm": 0.8805081248283386, + "learning_rate": 0.0001386930434000382, + "loss": 2.8936, + "step": 3798 + }, + { + "epoch": 0.3438942699375396, + "grad_norm": 0.8521276712417603, + "learning_rate": 0.00013866377993139635, + "loss": 2.9465, + "step": 3799 + }, + { + "epoch": 0.3439847922512899, + "grad_norm": 0.9059352874755859, + "learning_rate": 0.00013863451256931287, + "loss": 2.89, + "step": 3800 + }, + { + "epoch": 0.34407531456504026, + "grad_norm": 0.8595585823059082, + "learning_rate": 0.00013860524131673505, + "loss": 2.9224, + "step": 3801 + }, + { + "epoch": 0.3441658368787906, + "grad_norm": 0.8836684823036194, + "learning_rate": 0.00013857596617661047, + "loss": 2.9381, + "step": 3802 + }, + { + "epoch": 0.34425635919254094, + "grad_norm": 0.8365030884742737, + "learning_rate": 0.0001385466871518872, + "loss": 2.9124, + "step": 3803 + }, + { + "epoch": 0.3443468815062913, + "grad_norm": 0.8696910738945007, + "learning_rate": 0.0001385174042455135, + "loss": 2.8779, + "step": 3804 + }, + { + "epoch": 0.3444374038200416, + "grad_norm": 0.8183446526527405, + "learning_rate": 0.00013848811746043835, + "loss": 2.8944, + "step": 3805 + }, + { + "epoch": 0.34452792613379196, + "grad_norm": 0.8655415177345276, + "learning_rate": 0.00013845882679961076, + "loss": 2.9028, + "step": 3806 + }, + { + "epoch": 0.3446184484475423, + "grad_norm": 0.9962408542633057, + "learning_rate": 0.00013842953226598037, + "loss": 2.9101, + "step": 3807 + }, + { + "epoch": 0.34470897076129264, + "grad_norm": 0.9712062478065491, + "learning_rate": 0.00013840023386249713, + "loss": 2.9491, + "step": 3808 + }, + { + "epoch": 0.344799493075043, + "grad_norm": 0.9880498051643372, + "learning_rate": 0.00013837093159211144, + "loss": 2.9419, + "step": 3809 + }, + { + "epoch": 0.3448900153887933, + "grad_norm": 0.8662852048873901, + "learning_rate": 0.00013834162545777395, + "loss": 2.8891, + "step": 3810 + }, + { + "epoch": 0.34498053770254367, + "grad_norm": 0.9303768277168274, + "learning_rate": 0.00013831231546243582, + "loss": 2.9551, + "step": 3811 + }, + { + "epoch": 0.345071060016294, + "grad_norm": 0.8257996439933777, + "learning_rate": 0.00013828300160904856, + "loss": 2.9602, + "step": 3812 + }, + { + "epoch": 0.34516158233004435, + "grad_norm": 0.9260730147361755, + "learning_rate": 0.00013825368390056416, + "loss": 2.9734, + "step": 3813 + }, + { + "epoch": 0.3452521046437947, + "grad_norm": 0.8937355875968933, + "learning_rate": 0.00013822436233993475, + "loss": 2.9492, + "step": 3814 + }, + { + "epoch": 0.34534262695754503, + "grad_norm": 0.8791353106498718, + "learning_rate": 0.00013819503693011313, + "loss": 2.863, + "step": 3815 + }, + { + "epoch": 0.34543314927129537, + "grad_norm": 0.8687493205070496, + "learning_rate": 0.00013816570767405232, + "loss": 2.8727, + "step": 3816 + }, + { + "epoch": 0.3455236715850457, + "grad_norm": 0.9130697250366211, + "learning_rate": 0.00013813637457470583, + "loss": 2.9826, + "step": 3817 + }, + { + "epoch": 0.34561419389879605, + "grad_norm": 0.9329674243927002, + "learning_rate": 0.00013810703763502744, + "loss": 2.9252, + "step": 3818 + }, + { + "epoch": 0.3457047162125464, + "grad_norm": 0.9105332493782043, + "learning_rate": 0.00013807769685797138, + "loss": 2.897, + "step": 3819 + }, + { + "epoch": 0.34579523852629673, + "grad_norm": 0.9339108467102051, + "learning_rate": 0.0001380483522464923, + "loss": 2.8901, + "step": 3820 + }, + { + "epoch": 0.3458857608400471, + "grad_norm": 0.9092152118682861, + "learning_rate": 0.00013801900380354526, + "loss": 2.9041, + "step": 3821 + }, + { + "epoch": 0.3459762831537974, + "grad_norm": 1.0351412296295166, + "learning_rate": 0.0001379896515320855, + "loss": 2.9424, + "step": 3822 + }, + { + "epoch": 0.34606680546754776, + "grad_norm": 0.8615590333938599, + "learning_rate": 0.00013796029543506887, + "loss": 2.8604, + "step": 3823 + }, + { + "epoch": 0.3461573277812981, + "grad_norm": 0.9683560132980347, + "learning_rate": 0.00013793093551545154, + "loss": 2.8971, + "step": 3824 + }, + { + "epoch": 0.34624785009504844, + "grad_norm": 0.9004538655281067, + "learning_rate": 0.00013790157177619004, + "loss": 2.9622, + "step": 3825 + }, + { + "epoch": 0.3463383724087988, + "grad_norm": 0.9129707217216492, + "learning_rate": 0.00013787220422024134, + "loss": 2.9685, + "step": 3826 + }, + { + "epoch": 0.3464288947225491, + "grad_norm": 0.9671385288238525, + "learning_rate": 0.00013784283285056268, + "loss": 2.9015, + "step": 3827 + }, + { + "epoch": 0.34651941703629946, + "grad_norm": 0.9019637107849121, + "learning_rate": 0.00013781345767011177, + "loss": 2.9677, + "step": 3828 + }, + { + "epoch": 0.3466099393500498, + "grad_norm": 0.8544565439224243, + "learning_rate": 0.00013778407868184672, + "loss": 2.966, + "step": 3829 + }, + { + "epoch": 0.34670046166380014, + "grad_norm": 0.9372018575668335, + "learning_rate": 0.000137754695888726, + "loss": 2.9562, + "step": 3830 + }, + { + "epoch": 0.3467909839775505, + "grad_norm": 1.1464779376983643, + "learning_rate": 0.0001377253092937084, + "loss": 2.9491, + "step": 3831 + }, + { + "epoch": 0.3468815062913008, + "grad_norm": 1.0488883256912231, + "learning_rate": 0.0001376959188997532, + "loss": 2.8893, + "step": 3832 + }, + { + "epoch": 0.34697202860505116, + "grad_norm": 0.9740673899650574, + "learning_rate": 0.00013766652470982002, + "loss": 2.9507, + "step": 3833 + }, + { + "epoch": 0.3470625509188015, + "grad_norm": 0.8552302718162537, + "learning_rate": 0.00013763712672686885, + "loss": 2.9664, + "step": 3834 + }, + { + "epoch": 0.34715307323255185, + "grad_norm": 1.0639338493347168, + "learning_rate": 0.00013760772495385998, + "loss": 3.0053, + "step": 3835 + }, + { + "epoch": 0.3472435955463022, + "grad_norm": 1.0619330406188965, + "learning_rate": 0.0001375783193937543, + "loss": 2.9125, + "step": 3836 + }, + { + "epoch": 0.3473341178600525, + "grad_norm": 1.0379544496536255, + "learning_rate": 0.00013754891004951286, + "loss": 2.8785, + "step": 3837 + }, + { + "epoch": 0.34742464017380287, + "grad_norm": 0.863757848739624, + "learning_rate": 0.00013751949692409718, + "loss": 2.8689, + "step": 3838 + }, + { + "epoch": 0.3475151624875532, + "grad_norm": 0.9642022252082825, + "learning_rate": 0.00013749008002046921, + "loss": 2.9416, + "step": 3839 + }, + { + "epoch": 0.3476056848013035, + "grad_norm": 1.0625447034835815, + "learning_rate": 0.00013746065934159123, + "loss": 2.927, + "step": 3840 + }, + { + "epoch": 0.34769620711505383, + "grad_norm": 0.9343215227127075, + "learning_rate": 0.00013743123489042586, + "loss": 2.865, + "step": 3841 + }, + { + "epoch": 0.3477867294288042, + "grad_norm": 1.0552362203598022, + "learning_rate": 0.00013740180666993614, + "loss": 2.9259, + "step": 3842 + }, + { + "epoch": 0.3478772517425545, + "grad_norm": 0.9143028855323792, + "learning_rate": 0.00013737237468308553, + "loss": 2.9009, + "step": 3843 + }, + { + "epoch": 0.34796777405630486, + "grad_norm": 0.9504222273826599, + "learning_rate": 0.00013734293893283783, + "loss": 2.9042, + "step": 3844 + }, + { + "epoch": 0.3480582963700552, + "grad_norm": 0.8866637349128723, + "learning_rate": 0.00013731349942215716, + "loss": 2.9043, + "step": 3845 + }, + { + "epoch": 0.34814881868380554, + "grad_norm": 0.9600756764411926, + "learning_rate": 0.00013728405615400814, + "loss": 2.9725, + "step": 3846 + }, + { + "epoch": 0.3482393409975559, + "grad_norm": 1.0086690187454224, + "learning_rate": 0.00013725460913135572, + "loss": 2.9477, + "step": 3847 + }, + { + "epoch": 0.3483298633113062, + "grad_norm": 0.8588940501213074, + "learning_rate": 0.00013722515835716515, + "loss": 2.9101, + "step": 3848 + }, + { + "epoch": 0.34842038562505656, + "grad_norm": 0.8638690710067749, + "learning_rate": 0.00013719570383440211, + "loss": 2.9316, + "step": 3849 + }, + { + "epoch": 0.3485109079388069, + "grad_norm": 0.9232494235038757, + "learning_rate": 0.00013716624556603274, + "loss": 2.9143, + "step": 3850 + }, + { + "epoch": 0.34860143025255724, + "grad_norm": 0.9864001274108887, + "learning_rate": 0.00013713678355502351, + "loss": 2.9226, + "step": 3851 + }, + { + "epoch": 0.3486919525663076, + "grad_norm": 0.898056149482727, + "learning_rate": 0.00013710731780434114, + "loss": 2.8942, + "step": 3852 + }, + { + "epoch": 0.3487824748800579, + "grad_norm": 0.8752807378768921, + "learning_rate": 0.0001370778483169529, + "loss": 2.8418, + "step": 3853 + }, + { + "epoch": 0.34887299719380827, + "grad_norm": 0.9617156982421875, + "learning_rate": 0.00013704837509582633, + "loss": 2.9151, + "step": 3854 + }, + { + "epoch": 0.3489635195075586, + "grad_norm": 0.8508132100105286, + "learning_rate": 0.00013701889814392944, + "loss": 2.9283, + "step": 3855 + }, + { + "epoch": 0.34905404182130895, + "grad_norm": 0.8842729330062866, + "learning_rate": 0.00013698941746423046, + "loss": 2.9183, + "step": 3856 + }, + { + "epoch": 0.3491445641350593, + "grad_norm": 0.8900924324989319, + "learning_rate": 0.0001369599330596982, + "loss": 2.8878, + "step": 3857 + }, + { + "epoch": 0.34923508644880963, + "grad_norm": 0.8917263150215149, + "learning_rate": 0.00013693044493330166, + "loss": 2.8979, + "step": 3858 + }, + { + "epoch": 0.34932560876255997, + "grad_norm": 1.1581900119781494, + "learning_rate": 0.00013690095308801036, + "loss": 2.9183, + "step": 3859 + }, + { + "epoch": 0.3494161310763103, + "grad_norm": 0.900983452796936, + "learning_rate": 0.0001368714575267941, + "loss": 2.8954, + "step": 3860 + }, + { + "epoch": 0.34950665339006065, + "grad_norm": 0.8715013265609741, + "learning_rate": 0.00013684195825262302, + "loss": 2.8939, + "step": 3861 + }, + { + "epoch": 0.349597175703811, + "grad_norm": 1.0625563859939575, + "learning_rate": 0.00013681245526846783, + "loss": 2.9139, + "step": 3862 + }, + { + "epoch": 0.34968769801756133, + "grad_norm": 0.8163450956344604, + "learning_rate": 0.00013678294857729934, + "loss": 2.8957, + "step": 3863 + }, + { + "epoch": 0.3497782203313117, + "grad_norm": 1.280600905418396, + "learning_rate": 0.00013675343818208896, + "loss": 2.9895, + "step": 3864 + }, + { + "epoch": 0.349868742645062, + "grad_norm": 0.9801856875419617, + "learning_rate": 0.00013672392408580833, + "loss": 2.9835, + "step": 3865 + }, + { + "epoch": 0.34995926495881235, + "grad_norm": 0.8751368522644043, + "learning_rate": 0.0001366944062914296, + "loss": 2.9536, + "step": 3866 + }, + { + "epoch": 0.3500497872725627, + "grad_norm": 0.8888764977455139, + "learning_rate": 0.00013666488480192513, + "loss": 2.8834, + "step": 3867 + }, + { + "epoch": 0.35014030958631304, + "grad_norm": 0.8316412568092346, + "learning_rate": 0.00013663535962026778, + "loss": 2.8587, + "step": 3868 + }, + { + "epoch": 0.3502308319000634, + "grad_norm": 0.8390792608261108, + "learning_rate": 0.00013660583074943068, + "loss": 2.9432, + "step": 3869 + }, + { + "epoch": 0.3503213542138137, + "grad_norm": 0.8946274518966675, + "learning_rate": 0.00013657629819238746, + "loss": 2.9448, + "step": 3870 + }, + { + "epoch": 0.35041187652756406, + "grad_norm": 0.8634069561958313, + "learning_rate": 0.000136546761952112, + "loss": 2.9023, + "step": 3871 + }, + { + "epoch": 0.3505023988413144, + "grad_norm": 0.932811975479126, + "learning_rate": 0.00013651722203157862, + "loss": 2.9542, + "step": 3872 + }, + { + "epoch": 0.35059292115506474, + "grad_norm": 0.9042550921440125, + "learning_rate": 0.00013648767843376196, + "loss": 2.978, + "step": 3873 + }, + { + "epoch": 0.3506834434688151, + "grad_norm": 0.8619733452796936, + "learning_rate": 0.00013645813116163713, + "loss": 2.9299, + "step": 3874 + }, + { + "epoch": 0.3507739657825654, + "grad_norm": 0.9075789451599121, + "learning_rate": 0.00013642858021817943, + "loss": 2.8829, + "step": 3875 + }, + { + "epoch": 0.35086448809631576, + "grad_norm": 0.8590002655982971, + "learning_rate": 0.00013639902560636473, + "loss": 2.8802, + "step": 3876 + }, + { + "epoch": 0.3509550104100661, + "grad_norm": 0.9196003079414368, + "learning_rate": 0.00013636946732916916, + "loss": 2.9321, + "step": 3877 + }, + { + "epoch": 0.35104553272381644, + "grad_norm": 0.9296711087226868, + "learning_rate": 0.0001363399053895692, + "loss": 2.9494, + "step": 3878 + }, + { + "epoch": 0.3511360550375668, + "grad_norm": 1.036049723625183, + "learning_rate": 0.00013631033979054176, + "loss": 2.9524, + "step": 3879 + }, + { + "epoch": 0.3512265773513171, + "grad_norm": 0.806705892086029, + "learning_rate": 0.0001362807705350641, + "loss": 2.9233, + "step": 3880 + }, + { + "epoch": 0.3513170996650674, + "grad_norm": 0.9473732709884644, + "learning_rate": 0.00013625119762611387, + "loss": 2.9387, + "step": 3881 + }, + { + "epoch": 0.35140762197881775, + "grad_norm": 0.8431051969528198, + "learning_rate": 0.000136221621066669, + "loss": 2.9026, + "step": 3882 + }, + { + "epoch": 0.3514981442925681, + "grad_norm": 0.784507155418396, + "learning_rate": 0.00013619204085970788, + "loss": 2.8671, + "step": 3883 + }, + { + "epoch": 0.35158866660631843, + "grad_norm": 0.8995726704597473, + "learning_rate": 0.00013616245700820922, + "loss": 2.877, + "step": 3884 + }, + { + "epoch": 0.3516791889200688, + "grad_norm": 0.8044691681861877, + "learning_rate": 0.00013613286951515217, + "loss": 2.926, + "step": 3885 + }, + { + "epoch": 0.3517697112338191, + "grad_norm": 0.886650025844574, + "learning_rate": 0.00013610327838351613, + "loss": 2.8995, + "step": 3886 + }, + { + "epoch": 0.35186023354756946, + "grad_norm": 0.8361202478408813, + "learning_rate": 0.00013607368361628094, + "loss": 2.8801, + "step": 3887 + }, + { + "epoch": 0.3519507558613198, + "grad_norm": 0.8228411078453064, + "learning_rate": 0.0001360440852164268, + "loss": 2.8873, + "step": 3888 + }, + { + "epoch": 0.35204127817507014, + "grad_norm": 0.8641404509544373, + "learning_rate": 0.0001360144831869343, + "loss": 2.9192, + "step": 3889 + }, + { + "epoch": 0.3521318004888205, + "grad_norm": 0.862514078617096, + "learning_rate": 0.00013598487753078425, + "loss": 3.0059, + "step": 3890 + }, + { + "epoch": 0.3522223228025708, + "grad_norm": 0.8352610468864441, + "learning_rate": 0.0001359552682509581, + "loss": 2.9455, + "step": 3891 + }, + { + "epoch": 0.35231284511632116, + "grad_norm": 0.8321881890296936, + "learning_rate": 0.00013592565535043737, + "loss": 2.874, + "step": 3892 + }, + { + "epoch": 0.3524033674300715, + "grad_norm": 0.9299854040145874, + "learning_rate": 0.00013589603883220415, + "loss": 2.9172, + "step": 3893 + }, + { + "epoch": 0.35249388974382184, + "grad_norm": 0.8699597716331482, + "learning_rate": 0.00013586641869924078, + "loss": 2.8922, + "step": 3894 + }, + { + "epoch": 0.3525844120575722, + "grad_norm": 0.9263263940811157, + "learning_rate": 0.00013583679495453, + "loss": 2.9072, + "step": 3895 + }, + { + "epoch": 0.3526749343713225, + "grad_norm": 0.9022011160850525, + "learning_rate": 0.00013580716760105502, + "loss": 2.9152, + "step": 3896 + }, + { + "epoch": 0.35276545668507286, + "grad_norm": 0.8681530356407166, + "learning_rate": 0.0001357775366417992, + "loss": 2.9911, + "step": 3897 + }, + { + "epoch": 0.3528559789988232, + "grad_norm": 0.9377198815345764, + "learning_rate": 0.00013574790207974646, + "loss": 2.8987, + "step": 3898 + }, + { + "epoch": 0.35294650131257355, + "grad_norm": 0.9530036449432373, + "learning_rate": 0.00013571826391788093, + "loss": 2.9039, + "step": 3899 + }, + { + "epoch": 0.3530370236263239, + "grad_norm": 1.0229028463363647, + "learning_rate": 0.00013568862215918717, + "loss": 2.9019, + "step": 3900 + }, + { + "epoch": 0.35312754594007423, + "grad_norm": 1.012978434562683, + "learning_rate": 0.0001356589768066502, + "loss": 2.8668, + "step": 3901 + }, + { + "epoch": 0.35321806825382457, + "grad_norm": 1.0030837059020996, + "learning_rate": 0.00013562932786325522, + "loss": 2.9775, + "step": 3902 + }, + { + "epoch": 0.3533085905675749, + "grad_norm": 1.00313401222229, + "learning_rate": 0.00013559967533198787, + "loss": 2.909, + "step": 3903 + }, + { + "epoch": 0.35339911288132525, + "grad_norm": 1.044822335243225, + "learning_rate": 0.0001355700192158342, + "loss": 2.938, + "step": 3904 + }, + { + "epoch": 0.3534896351950756, + "grad_norm": 1.1762657165527344, + "learning_rate": 0.00013554035951778058, + "loss": 2.9599, + "step": 3905 + }, + { + "epoch": 0.35358015750882593, + "grad_norm": 0.9761765599250793, + "learning_rate": 0.0001355106962408137, + "loss": 2.9664, + "step": 3906 + }, + { + "epoch": 0.3536706798225763, + "grad_norm": 0.9626485109329224, + "learning_rate": 0.00013548102938792074, + "loss": 2.9368, + "step": 3907 + }, + { + "epoch": 0.3537612021363266, + "grad_norm": 1.0166648626327515, + "learning_rate": 0.00013545135896208906, + "loss": 2.9386, + "step": 3908 + }, + { + "epoch": 0.35385172445007695, + "grad_norm": 0.9419061541557312, + "learning_rate": 0.00013542168496630648, + "loss": 2.9433, + "step": 3909 + }, + { + "epoch": 0.3539422467638273, + "grad_norm": 0.870576024055481, + "learning_rate": 0.00013539200740356118, + "loss": 2.9445, + "step": 3910 + }, + { + "epoch": 0.35403276907757764, + "grad_norm": 0.9303431510925293, + "learning_rate": 0.00013536232627684176, + "loss": 2.8962, + "step": 3911 + }, + { + "epoch": 0.354123291391328, + "grad_norm": 1.0295835733413696, + "learning_rate": 0.00013533264158913704, + "loss": 2.9648, + "step": 3912 + }, + { + "epoch": 0.3542138137050783, + "grad_norm": 0.9527624249458313, + "learning_rate": 0.00013530295334343624, + "loss": 2.9421, + "step": 3913 + }, + { + "epoch": 0.35430433601882866, + "grad_norm": 1.1770457029342651, + "learning_rate": 0.000135273261542729, + "loss": 2.9074, + "step": 3914 + }, + { + "epoch": 0.354394858332579, + "grad_norm": 0.8796162605285645, + "learning_rate": 0.00013524356619000534, + "loss": 2.956, + "step": 3915 + }, + { + "epoch": 0.35448538064632934, + "grad_norm": 0.9670255780220032, + "learning_rate": 0.0001352138672882555, + "loss": 2.8812, + "step": 3916 + }, + { + "epoch": 0.3545759029600797, + "grad_norm": 0.9039412140846252, + "learning_rate": 0.00013518416484047018, + "loss": 2.9025, + "step": 3917 + }, + { + "epoch": 0.35466642527383, + "grad_norm": 0.8382848501205444, + "learning_rate": 0.00013515445884964045, + "loss": 2.914, + "step": 3918 + }, + { + "epoch": 0.35475694758758036, + "grad_norm": 0.9838911890983582, + "learning_rate": 0.00013512474931875765, + "loss": 2.9671, + "step": 3919 + }, + { + "epoch": 0.3548474699013307, + "grad_norm": 0.9354199171066284, + "learning_rate": 0.00013509503625081358, + "loss": 2.9004, + "step": 3920 + }, + { + "epoch": 0.35493799221508104, + "grad_norm": 0.8873734474182129, + "learning_rate": 0.0001350653196488003, + "loss": 2.879, + "step": 3921 + }, + { + "epoch": 0.35502851452883133, + "grad_norm": 1.0136916637420654, + "learning_rate": 0.0001350355995157103, + "loss": 2.914, + "step": 3922 + }, + { + "epoch": 0.35511903684258167, + "grad_norm": 0.8554311394691467, + "learning_rate": 0.00013500587585453638, + "loss": 2.9463, + "step": 3923 + }, + { + "epoch": 0.355209559156332, + "grad_norm": 0.8193634748458862, + "learning_rate": 0.00013497614866827173, + "loss": 2.9058, + "step": 3924 + }, + { + "epoch": 0.35530008147008235, + "grad_norm": 0.9600344300270081, + "learning_rate": 0.00013494641795990986, + "loss": 2.873, + "step": 3925 + }, + { + "epoch": 0.3553906037838327, + "grad_norm": 0.928473949432373, + "learning_rate": 0.00013491668373244464, + "loss": 2.8798, + "step": 3926 + }, + { + "epoch": 0.35548112609758303, + "grad_norm": 0.8732498288154602, + "learning_rate": 0.00013488694598887035, + "loss": 2.861, + "step": 3927 + }, + { + "epoch": 0.3555716484113334, + "grad_norm": 0.9893165230751038, + "learning_rate": 0.00013485720473218154, + "loss": 2.8755, + "step": 3928 + }, + { + "epoch": 0.3556621707250837, + "grad_norm": 0.9542164206504822, + "learning_rate": 0.00013482745996537315, + "loss": 2.9289, + "step": 3929 + }, + { + "epoch": 0.35575269303883406, + "grad_norm": 0.8913797736167908, + "learning_rate": 0.0001347977116914405, + "loss": 2.8931, + "step": 3930 + }, + { + "epoch": 0.3558432153525844, + "grad_norm": 1.016385555267334, + "learning_rate": 0.0001347679599133793, + "loss": 2.8436, + "step": 3931 + }, + { + "epoch": 0.35593373766633474, + "grad_norm": 1.0167441368103027, + "learning_rate": 0.0001347382046341854, + "loss": 2.9751, + "step": 3932 + }, + { + "epoch": 0.3560242599800851, + "grad_norm": 0.8540645241737366, + "learning_rate": 0.0001347084458568553, + "loss": 2.95, + "step": 3933 + }, + { + "epoch": 0.3561147822938354, + "grad_norm": 0.943660318851471, + "learning_rate": 0.00013467868358438563, + "loss": 2.8982, + "step": 3934 + }, + { + "epoch": 0.35620530460758576, + "grad_norm": 0.9221616983413696, + "learning_rate": 0.00013464891781977349, + "loss": 2.9524, + "step": 3935 + }, + { + "epoch": 0.3562958269213361, + "grad_norm": 0.9135646224021912, + "learning_rate": 0.00013461914856601625, + "loss": 2.8953, + "step": 3936 + }, + { + "epoch": 0.35638634923508644, + "grad_norm": 1.021881103515625, + "learning_rate": 0.0001345893758261117, + "loss": 2.994, + "step": 3937 + }, + { + "epoch": 0.3564768715488368, + "grad_norm": 0.8870673775672913, + "learning_rate": 0.00013455959960305798, + "loss": 2.8441, + "step": 3938 + }, + { + "epoch": 0.3565673938625871, + "grad_norm": 0.9238022565841675, + "learning_rate": 0.00013452981989985348, + "loss": 2.8884, + "step": 3939 + }, + { + "epoch": 0.35665791617633746, + "grad_norm": 1.0125572681427002, + "learning_rate": 0.00013450003671949706, + "loss": 2.9548, + "step": 3940 + }, + { + "epoch": 0.3567484384900878, + "grad_norm": 0.8192005157470703, + "learning_rate": 0.00013447025006498793, + "loss": 2.9126, + "step": 3941 + }, + { + "epoch": 0.35683896080383815, + "grad_norm": 1.4117205142974854, + "learning_rate": 0.00013444045993932555, + "loss": 2.9368, + "step": 3942 + }, + { + "epoch": 0.3569294831175885, + "grad_norm": 0.9625622630119324, + "learning_rate": 0.00013441066634550978, + "loss": 2.8696, + "step": 3943 + }, + { + "epoch": 0.3570200054313388, + "grad_norm": 0.7912752032279968, + "learning_rate": 0.00013438086928654086, + "loss": 2.9093, + "step": 3944 + }, + { + "epoch": 0.35711052774508917, + "grad_norm": 1.269979476928711, + "learning_rate": 0.00013435106876541933, + "loss": 2.9174, + "step": 3945 + }, + { + "epoch": 0.3572010500588395, + "grad_norm": 1.0200188159942627, + "learning_rate": 0.00013432126478514614, + "loss": 2.8732, + "step": 3946 + }, + { + "epoch": 0.35729157237258985, + "grad_norm": 0.8661880493164062, + "learning_rate": 0.0001342914573487225, + "loss": 2.8182, + "step": 3947 + }, + { + "epoch": 0.3573820946863402, + "grad_norm": 1.0338268280029297, + "learning_rate": 0.00013426164645915004, + "loss": 2.9328, + "step": 3948 + }, + { + "epoch": 0.35747261700009053, + "grad_norm": 1.0458835363388062, + "learning_rate": 0.00013423183211943074, + "loss": 2.8545, + "step": 3949 + }, + { + "epoch": 0.35756313931384087, + "grad_norm": 0.9032859206199646, + "learning_rate": 0.00013420201433256689, + "loss": 2.8888, + "step": 3950 + }, + { + "epoch": 0.3576536616275912, + "grad_norm": 0.9008709788322449, + "learning_rate": 0.0001341721931015611, + "loss": 2.8591, + "step": 3951 + }, + { + "epoch": 0.35774418394134155, + "grad_norm": 0.9279823899269104, + "learning_rate": 0.00013414236842941644, + "loss": 2.8552, + "step": 3952 + }, + { + "epoch": 0.3578347062550919, + "grad_norm": 0.9996717572212219, + "learning_rate": 0.00013411254031913615, + "loss": 2.9944, + "step": 3953 + }, + { + "epoch": 0.35792522856884224, + "grad_norm": 0.8940361738204956, + "learning_rate": 0.00013408270877372406, + "loss": 2.8701, + "step": 3954 + }, + { + "epoch": 0.3580157508825926, + "grad_norm": 0.96931391954422, + "learning_rate": 0.0001340528737961841, + "loss": 2.9365, + "step": 3955 + }, + { + "epoch": 0.3581062731963429, + "grad_norm": 0.9654762148857117, + "learning_rate": 0.00013402303538952065, + "loss": 2.854, + "step": 3956 + }, + { + "epoch": 0.35819679551009326, + "grad_norm": 0.8370428681373596, + "learning_rate": 0.0001339931935567385, + "loss": 2.9161, + "step": 3957 + }, + { + "epoch": 0.3582873178238436, + "grad_norm": 0.8343283534049988, + "learning_rate": 0.0001339633483008427, + "loss": 2.8579, + "step": 3958 + }, + { + "epoch": 0.35837784013759394, + "grad_norm": 0.9394232630729675, + "learning_rate": 0.0001339334996248386, + "loss": 2.9178, + "step": 3959 + }, + { + "epoch": 0.3584683624513443, + "grad_norm": 0.946890652179718, + "learning_rate": 0.00013390364753173206, + "loss": 2.9488, + "step": 3960 + }, + { + "epoch": 0.3585588847650946, + "grad_norm": 0.8705549836158752, + "learning_rate": 0.00013387379202452917, + "loss": 2.8761, + "step": 3961 + }, + { + "epoch": 0.35864940707884496, + "grad_norm": 0.836525559425354, + "learning_rate": 0.0001338439331062363, + "loss": 2.8907, + "step": 3962 + }, + { + "epoch": 0.35873992939259525, + "grad_norm": 0.9729019403457642, + "learning_rate": 0.0001338140707798603, + "loss": 2.9337, + "step": 3963 + }, + { + "epoch": 0.3588304517063456, + "grad_norm": 0.8781609535217285, + "learning_rate": 0.00013378420504840828, + "loss": 3.0256, + "step": 3964 + }, + { + "epoch": 0.35892097402009593, + "grad_norm": 0.8746907114982605, + "learning_rate": 0.0001337543359148878, + "loss": 2.9021, + "step": 3965 + }, + { + "epoch": 0.35901149633384627, + "grad_norm": 0.8925855755805969, + "learning_rate": 0.00013372446338230656, + "loss": 2.9353, + "step": 3966 + }, + { + "epoch": 0.3591020186475966, + "grad_norm": 0.8417982459068298, + "learning_rate": 0.0001336945874536728, + "loss": 2.8828, + "step": 3967 + }, + { + "epoch": 0.35919254096134695, + "grad_norm": 0.8790850043296814, + "learning_rate": 0.000133664708131995, + "loss": 2.8381, + "step": 3968 + }, + { + "epoch": 0.3592830632750973, + "grad_norm": 0.9135075807571411, + "learning_rate": 0.000133634825420282, + "loss": 2.9152, + "step": 3969 + }, + { + "epoch": 0.35937358558884763, + "grad_norm": 0.939116358757019, + "learning_rate": 0.00013360493932154302, + "loss": 2.9089, + "step": 3970 + }, + { + "epoch": 0.359464107902598, + "grad_norm": 0.8897899985313416, + "learning_rate": 0.00013357504983878754, + "loss": 2.902, + "step": 3971 + }, + { + "epoch": 0.3595546302163483, + "grad_norm": 0.9135044813156128, + "learning_rate": 0.00013354515697502553, + "loss": 2.8635, + "step": 3972 + }, + { + "epoch": 0.35964515253009866, + "grad_norm": 0.8595693111419678, + "learning_rate": 0.00013351526073326707, + "loss": 2.9217, + "step": 3973 + }, + { + "epoch": 0.359735674843849, + "grad_norm": 0.9174153804779053, + "learning_rate": 0.00013348536111652278, + "loss": 2.8597, + "step": 3974 + }, + { + "epoch": 0.35982619715759934, + "grad_norm": 0.9224886894226074, + "learning_rate": 0.00013345545812780353, + "loss": 2.9499, + "step": 3975 + }, + { + "epoch": 0.3599167194713497, + "grad_norm": 0.8565012216567993, + "learning_rate": 0.0001334255517701206, + "loss": 2.9246, + "step": 3976 + }, + { + "epoch": 0.3600072417851, + "grad_norm": 0.8404529690742493, + "learning_rate": 0.0001333956420464855, + "loss": 2.8516, + "step": 3977 + }, + { + "epoch": 0.36009776409885036, + "grad_norm": 0.8397718071937561, + "learning_rate": 0.00013336572895991016, + "loss": 2.9095, + "step": 3978 + }, + { + "epoch": 0.3601882864126007, + "grad_norm": 0.8200170993804932, + "learning_rate": 0.0001333358125134068, + "loss": 2.9085, + "step": 3979 + }, + { + "epoch": 0.36027880872635104, + "grad_norm": 0.8960346579551697, + "learning_rate": 0.00013330589270998808, + "loss": 2.9098, + "step": 3980 + }, + { + "epoch": 0.3603693310401014, + "grad_norm": 0.8814414143562317, + "learning_rate": 0.00013327596955266686, + "loss": 2.8328, + "step": 3981 + }, + { + "epoch": 0.3604598533538517, + "grad_norm": 0.9030823707580566, + "learning_rate": 0.0001332460430444564, + "loss": 2.9183, + "step": 3982 + }, + { + "epoch": 0.36055037566760206, + "grad_norm": 0.9058015942573547, + "learning_rate": 0.00013321611318837032, + "loss": 2.9164, + "step": 3983 + }, + { + "epoch": 0.3606408979813524, + "grad_norm": 1.0026699304580688, + "learning_rate": 0.0001331861799874226, + "loss": 2.8641, + "step": 3984 + }, + { + "epoch": 0.36073142029510274, + "grad_norm": 0.8512104749679565, + "learning_rate": 0.0001331562434446274, + "loss": 2.8295, + "step": 3985 + }, + { + "epoch": 0.3608219426088531, + "grad_norm": 0.7946496605873108, + "learning_rate": 0.00013312630356299943, + "loss": 2.8758, + "step": 3986 + }, + { + "epoch": 0.3609124649226034, + "grad_norm": 0.9297618865966797, + "learning_rate": 0.00013309636034555355, + "loss": 2.8679, + "step": 3987 + }, + { + "epoch": 0.36100298723635377, + "grad_norm": 0.9128022789955139, + "learning_rate": 0.00013306641379530514, + "loss": 2.8875, + "step": 3988 + }, + { + "epoch": 0.3610935095501041, + "grad_norm": 0.9625574946403503, + "learning_rate": 0.00013303646391526973, + "loss": 2.9534, + "step": 3989 + }, + { + "epoch": 0.36118403186385445, + "grad_norm": 0.8782124519348145, + "learning_rate": 0.00013300651070846333, + "loss": 2.9566, + "step": 3990 + }, + { + "epoch": 0.3612745541776048, + "grad_norm": 0.8106698989868164, + "learning_rate": 0.0001329765541779022, + "loss": 2.8592, + "step": 3991 + }, + { + "epoch": 0.36136507649135513, + "grad_norm": 0.9018152356147766, + "learning_rate": 0.00013294659432660296, + "loss": 2.9032, + "step": 3992 + }, + { + "epoch": 0.36145559880510547, + "grad_norm": 0.8355795741081238, + "learning_rate": 0.0001329166311575826, + "loss": 2.9498, + "step": 3993 + }, + { + "epoch": 0.3615461211188558, + "grad_norm": 0.862778902053833, + "learning_rate": 0.00013288666467385833, + "loss": 2.8544, + "step": 3994 + }, + { + "epoch": 0.36163664343260615, + "grad_norm": 0.8779953718185425, + "learning_rate": 0.00013285669487844785, + "loss": 2.9377, + "step": 3995 + }, + { + "epoch": 0.3617271657463565, + "grad_norm": 0.9221465587615967, + "learning_rate": 0.00013282672177436912, + "loss": 2.8477, + "step": 3996 + }, + { + "epoch": 0.36181768806010683, + "grad_norm": 0.908944308757782, + "learning_rate": 0.00013279674536464036, + "loss": 2.8751, + "step": 3997 + }, + { + "epoch": 0.3619082103738572, + "grad_norm": 0.9321480989456177, + "learning_rate": 0.00013276676565228027, + "loss": 2.9282, + "step": 3998 + }, + { + "epoch": 0.3619987326876075, + "grad_norm": 0.9838185906410217, + "learning_rate": 0.00013273678264030778, + "loss": 2.905, + "step": 3999 + }, + { + "epoch": 0.36208925500135786, + "grad_norm": 0.9170008301734924, + "learning_rate": 0.00013270679633174218, + "loss": 2.9405, + "step": 4000 + }, + { + "epoch": 0.36208925500135786, + "eval_loss": 2.8359169960021973, + "eval_runtime": 71.472, + "eval_samples_per_second": 37.819, + "eval_steps_per_second": 3.162, + "step": 4000 + }, + { + "epoch": 0.3621797773151082, + "grad_norm": 0.8533981442451477, + "learning_rate": 0.00013267680672960307, + "loss": 2.9021, + "step": 4001 + }, + { + "epoch": 0.36227029962885854, + "grad_norm": 0.9296990633010864, + "learning_rate": 0.0001326468138369104, + "loss": 2.9398, + "step": 4002 + }, + { + "epoch": 0.3623608219426089, + "grad_norm": 0.8591863512992859, + "learning_rate": 0.00013261681765668453, + "loss": 2.8718, + "step": 4003 + }, + { + "epoch": 0.3624513442563592, + "grad_norm": 0.9215024709701538, + "learning_rate": 0.00013258681819194594, + "loss": 2.9158, + "step": 4004 + }, + { + "epoch": 0.3625418665701095, + "grad_norm": 0.9291088581085205, + "learning_rate": 0.00013255681544571568, + "loss": 2.8731, + "step": 4005 + }, + { + "epoch": 0.36263238888385985, + "grad_norm": 0.9838840365409851, + "learning_rate": 0.000132526809421015, + "loss": 2.8812, + "step": 4006 + }, + { + "epoch": 0.3627229111976102, + "grad_norm": 0.8778908848762512, + "learning_rate": 0.0001324968001208655, + "loss": 2.8826, + "step": 4007 + }, + { + "epoch": 0.36281343351136053, + "grad_norm": 0.9058883190155029, + "learning_rate": 0.00013246678754828912, + "loss": 2.9113, + "step": 4008 + }, + { + "epoch": 0.36290395582511087, + "grad_norm": 0.9569624662399292, + "learning_rate": 0.0001324367717063081, + "loss": 2.9263, + "step": 4009 + }, + { + "epoch": 0.3629944781388612, + "grad_norm": 0.839088499546051, + "learning_rate": 0.00013240675259794507, + "loss": 2.8705, + "step": 4010 + }, + { + "epoch": 0.36308500045261155, + "grad_norm": 0.906187891960144, + "learning_rate": 0.00013237673022622296, + "loss": 2.842, + "step": 4011 + }, + { + "epoch": 0.3631755227663619, + "grad_norm": 0.8538724780082703, + "learning_rate": 0.00013234670459416498, + "loss": 2.8928, + "step": 4012 + }, + { + "epoch": 0.36326604508011223, + "grad_norm": 0.933245062828064, + "learning_rate": 0.00013231667570479474, + "loss": 2.9416, + "step": 4013 + }, + { + "epoch": 0.3633565673938626, + "grad_norm": 0.82557213306427, + "learning_rate": 0.00013228664356113612, + "loss": 2.8834, + "step": 4014 + }, + { + "epoch": 0.3634470897076129, + "grad_norm": 0.8684479594230652, + "learning_rate": 0.0001322566081662134, + "loss": 2.9387, + "step": 4015 + }, + { + "epoch": 0.36353761202136325, + "grad_norm": 0.8461321592330933, + "learning_rate": 0.00013222656952305113, + "loss": 2.9157, + "step": 4016 + }, + { + "epoch": 0.3636281343351136, + "grad_norm": 0.815130352973938, + "learning_rate": 0.00013219652763467416, + "loss": 2.831, + "step": 4017 + }, + { + "epoch": 0.36371865664886394, + "grad_norm": 0.835340142250061, + "learning_rate": 0.00013216648250410776, + "loss": 2.8644, + "step": 4018 + }, + { + "epoch": 0.3638091789626143, + "grad_norm": 0.8132588267326355, + "learning_rate": 0.00013213643413437746, + "loss": 2.9148, + "step": 4019 + }, + { + "epoch": 0.3638997012763646, + "grad_norm": 0.8714550733566284, + "learning_rate": 0.00013210638252850908, + "loss": 2.8805, + "step": 4020 + }, + { + "epoch": 0.36399022359011496, + "grad_norm": 0.8507872819900513, + "learning_rate": 0.0001320763276895289, + "loss": 2.901, + "step": 4021 + }, + { + "epoch": 0.3640807459038653, + "grad_norm": 0.8836880922317505, + "learning_rate": 0.0001320462696204634, + "loss": 2.9043, + "step": 4022 + }, + { + "epoch": 0.36417126821761564, + "grad_norm": 0.9315512776374817, + "learning_rate": 0.0001320162083243394, + "loss": 2.8655, + "step": 4023 + }, + { + "epoch": 0.364261790531366, + "grad_norm": 0.828289806842804, + "learning_rate": 0.00013198614380418412, + "loss": 2.8794, + "step": 4024 + }, + { + "epoch": 0.3643523128451163, + "grad_norm": 0.8746485114097595, + "learning_rate": 0.00013195607606302501, + "loss": 2.9045, + "step": 4025 + }, + { + "epoch": 0.36444283515886666, + "grad_norm": 0.8498311638832092, + "learning_rate": 0.00013192600510388992, + "loss": 2.8249, + "step": 4026 + }, + { + "epoch": 0.364533357472617, + "grad_norm": 0.8305221796035767, + "learning_rate": 0.00013189593092980702, + "loss": 2.8825, + "step": 4027 + }, + { + "epoch": 0.36462387978636734, + "grad_norm": 0.7950791716575623, + "learning_rate": 0.0001318658535438047, + "loss": 2.9084, + "step": 4028 + }, + { + "epoch": 0.3647144021001177, + "grad_norm": 0.888029158115387, + "learning_rate": 0.00013183577294891184, + "loss": 2.8292, + "step": 4029 + }, + { + "epoch": 0.364804924413868, + "grad_norm": 0.8210163712501526, + "learning_rate": 0.00013180568914815752, + "loss": 2.888, + "step": 4030 + }, + { + "epoch": 0.36489544672761837, + "grad_norm": 0.8236469030380249, + "learning_rate": 0.00013177560214457115, + "loss": 2.8584, + "step": 4031 + }, + { + "epoch": 0.3649859690413687, + "grad_norm": 0.8504825234413147, + "learning_rate": 0.0001317455119411825, + "loss": 2.8539, + "step": 4032 + }, + { + "epoch": 0.36507649135511905, + "grad_norm": 0.825000524520874, + "learning_rate": 0.0001317154185410217, + "loss": 2.8158, + "step": 4033 + }, + { + "epoch": 0.3651670136688694, + "grad_norm": 0.8093627095222473, + "learning_rate": 0.0001316853219471191, + "loss": 2.8809, + "step": 4034 + }, + { + "epoch": 0.36525753598261973, + "grad_norm": 0.9785566926002502, + "learning_rate": 0.00013165522216250543, + "loss": 2.9101, + "step": 4035 + }, + { + "epoch": 0.36534805829637007, + "grad_norm": 0.9431535005569458, + "learning_rate": 0.00013162511919021178, + "loss": 2.8807, + "step": 4036 + }, + { + "epoch": 0.3654385806101204, + "grad_norm": 0.8769155144691467, + "learning_rate": 0.0001315950130332695, + "loss": 2.9477, + "step": 4037 + }, + { + "epoch": 0.36552910292387075, + "grad_norm": 0.9615750908851624, + "learning_rate": 0.00013156490369471027, + "loss": 2.8857, + "step": 4038 + }, + { + "epoch": 0.3656196252376211, + "grad_norm": 0.8426896333694458, + "learning_rate": 0.00013153479117756608, + "loss": 2.8606, + "step": 4039 + }, + { + "epoch": 0.36571014755137143, + "grad_norm": 1.0978776216506958, + "learning_rate": 0.0001315046754848693, + "loss": 2.8393, + "step": 4040 + }, + { + "epoch": 0.3658006698651218, + "grad_norm": 0.8376330137252808, + "learning_rate": 0.00013147455661965255, + "loss": 2.9575, + "step": 4041 + }, + { + "epoch": 0.3658911921788721, + "grad_norm": 0.8953753709793091, + "learning_rate": 0.00013144443458494882, + "loss": 2.8782, + "step": 4042 + }, + { + "epoch": 0.36598171449262246, + "grad_norm": 0.9642663598060608, + "learning_rate": 0.00013141430938379134, + "loss": 2.8789, + "step": 4043 + }, + { + "epoch": 0.3660722368063728, + "grad_norm": 1.0093460083007812, + "learning_rate": 0.00013138418101921385, + "loss": 2.9142, + "step": 4044 + }, + { + "epoch": 0.36616275912012314, + "grad_norm": 0.83012455701828, + "learning_rate": 0.00013135404949425015, + "loss": 2.8987, + "step": 4045 + }, + { + "epoch": 0.3662532814338734, + "grad_norm": 1.1322907209396362, + "learning_rate": 0.00013132391481193451, + "loss": 2.8769, + "step": 4046 + }, + { + "epoch": 0.36634380374762376, + "grad_norm": 1.0075737237930298, + "learning_rate": 0.0001312937769753015, + "loss": 2.8329, + "step": 4047 + }, + { + "epoch": 0.3664343260613741, + "grad_norm": 1.0125104188919067, + "learning_rate": 0.00013126363598738603, + "loss": 2.8341, + "step": 4048 + }, + { + "epoch": 0.36652484837512445, + "grad_norm": 0.9636220932006836, + "learning_rate": 0.00013123349185122327, + "loss": 2.8841, + "step": 4049 + }, + { + "epoch": 0.3666153706888748, + "grad_norm": 0.9219857454299927, + "learning_rate": 0.0001312033445698487, + "loss": 2.9358, + "step": 4050 + }, + { + "epoch": 0.3667058930026251, + "grad_norm": 0.8947569727897644, + "learning_rate": 0.00013117319414629824, + "loss": 2.9103, + "step": 4051 + }, + { + "epoch": 0.36679641531637547, + "grad_norm": 0.9168105125427246, + "learning_rate": 0.00013114304058360797, + "loss": 2.8693, + "step": 4052 + }, + { + "epoch": 0.3668869376301258, + "grad_norm": 0.8657991886138916, + "learning_rate": 0.00013111288388481434, + "loss": 2.9258, + "step": 4053 + }, + { + "epoch": 0.36697745994387615, + "grad_norm": 0.8736631274223328, + "learning_rate": 0.00013108272405295415, + "loss": 2.8122, + "step": 4054 + }, + { + "epoch": 0.3670679822576265, + "grad_norm": 0.8283593654632568, + "learning_rate": 0.0001310525610910645, + "loss": 2.9052, + "step": 4055 + }, + { + "epoch": 0.36715850457137683, + "grad_norm": 0.8502643704414368, + "learning_rate": 0.00013102239500218282, + "loss": 2.892, + "step": 4056 + }, + { + "epoch": 0.36724902688512717, + "grad_norm": 0.976543128490448, + "learning_rate": 0.0001309922257893468, + "loss": 2.9142, + "step": 4057 + }, + { + "epoch": 0.3673395491988775, + "grad_norm": 0.8439890742301941, + "learning_rate": 0.00013096205345559448, + "loss": 2.9241, + "step": 4058 + }, + { + "epoch": 0.36743007151262785, + "grad_norm": 0.8744709491729736, + "learning_rate": 0.00013093187800396425, + "loss": 2.8804, + "step": 4059 + }, + { + "epoch": 0.3675205938263782, + "grad_norm": 0.8625809550285339, + "learning_rate": 0.00013090169943749476, + "loss": 2.8871, + "step": 4060 + }, + { + "epoch": 0.36761111614012854, + "grad_norm": 0.9283736348152161, + "learning_rate": 0.00013087151775922494, + "loss": 2.9417, + "step": 4061 + }, + { + "epoch": 0.3677016384538789, + "grad_norm": 0.8103830814361572, + "learning_rate": 0.00013084133297219417, + "loss": 2.7827, + "step": 4062 + }, + { + "epoch": 0.3677921607676292, + "grad_norm": 0.8377107381820679, + "learning_rate": 0.00013081114507944197, + "loss": 2.8471, + "step": 4063 + }, + { + "epoch": 0.36788268308137956, + "grad_norm": 0.8858683705329895, + "learning_rate": 0.00013078095408400835, + "loss": 2.8529, + "step": 4064 + }, + { + "epoch": 0.3679732053951299, + "grad_norm": 0.8910707235336304, + "learning_rate": 0.00013075075998893345, + "loss": 2.9123, + "step": 4065 + }, + { + "epoch": 0.36806372770888024, + "grad_norm": 0.8349747657775879, + "learning_rate": 0.00013072056279725788, + "loss": 2.837, + "step": 4066 + }, + { + "epoch": 0.3681542500226306, + "grad_norm": 1.0403574705123901, + "learning_rate": 0.0001306903625120225, + "loss": 2.898, + "step": 4067 + }, + { + "epoch": 0.3682447723363809, + "grad_norm": 0.9942634105682373, + "learning_rate": 0.00013066015913626843, + "loss": 2.8599, + "step": 4068 + }, + { + "epoch": 0.36833529465013126, + "grad_norm": 0.812485933303833, + "learning_rate": 0.0001306299526730372, + "loss": 2.8962, + "step": 4069 + }, + { + "epoch": 0.3684258169638816, + "grad_norm": 1.0433251857757568, + "learning_rate": 0.00013059974312537053, + "loss": 2.918, + "step": 4070 + }, + { + "epoch": 0.36851633927763194, + "grad_norm": 0.9210599064826965, + "learning_rate": 0.00013056953049631057, + "loss": 2.8613, + "step": 4071 + }, + { + "epoch": 0.3686068615913823, + "grad_norm": 0.8464783430099487, + "learning_rate": 0.00013053931478889975, + "loss": 2.8794, + "step": 4072 + }, + { + "epoch": 0.3686973839051326, + "grad_norm": 1.1169415712356567, + "learning_rate": 0.00013050909600618077, + "loss": 2.9114, + "step": 4073 + }, + { + "epoch": 0.36878790621888297, + "grad_norm": 0.8666006922721863, + "learning_rate": 0.00013047887415119664, + "loss": 2.8093, + "step": 4074 + }, + { + "epoch": 0.3688784285326333, + "grad_norm": 0.8614941835403442, + "learning_rate": 0.0001304486492269907, + "loss": 2.9002, + "step": 4075 + }, + { + "epoch": 0.36896895084638365, + "grad_norm": 0.951631486415863, + "learning_rate": 0.00013041842123660667, + "loss": 2.8685, + "step": 4076 + }, + { + "epoch": 0.369059473160134, + "grad_norm": 0.920071542263031, + "learning_rate": 0.0001303881901830884, + "loss": 2.8821, + "step": 4077 + }, + { + "epoch": 0.36914999547388433, + "grad_norm": 0.8674535751342773, + "learning_rate": 0.00013035795606948023, + "loss": 2.9443, + "step": 4078 + }, + { + "epoch": 0.36924051778763467, + "grad_norm": 1.0542681217193604, + "learning_rate": 0.00013032771889882674, + "loss": 2.9529, + "step": 4079 + }, + { + "epoch": 0.369331040101385, + "grad_norm": 0.8281850218772888, + "learning_rate": 0.00013029747867417276, + "loss": 2.8641, + "step": 4080 + }, + { + "epoch": 0.36942156241513535, + "grad_norm": 0.850652277469635, + "learning_rate": 0.00013026723539856348, + "loss": 2.8656, + "step": 4081 + }, + { + "epoch": 0.3695120847288857, + "grad_norm": 0.8725227117538452, + "learning_rate": 0.00013023698907504446, + "loss": 2.9565, + "step": 4082 + }, + { + "epoch": 0.36960260704263603, + "grad_norm": 0.832707405090332, + "learning_rate": 0.0001302067397066615, + "loss": 2.8305, + "step": 4083 + }, + { + "epoch": 0.3696931293563864, + "grad_norm": 0.8224778771400452, + "learning_rate": 0.0001301764872964606, + "loss": 2.9179, + "step": 4084 + }, + { + "epoch": 0.3697836516701367, + "grad_norm": 0.8339827060699463, + "learning_rate": 0.0001301462318474883, + "loss": 2.8407, + "step": 4085 + }, + { + "epoch": 0.36987417398388706, + "grad_norm": 0.8362826108932495, + "learning_rate": 0.00013011597336279126, + "loss": 2.7989, + "step": 4086 + }, + { + "epoch": 0.36996469629763734, + "grad_norm": 0.8587050437927246, + "learning_rate": 0.00013008571184541657, + "loss": 2.8354, + "step": 4087 + }, + { + "epoch": 0.3700552186113877, + "grad_norm": 0.885046660900116, + "learning_rate": 0.00013005544729841146, + "loss": 2.8989, + "step": 4088 + }, + { + "epoch": 0.370145740925138, + "grad_norm": 0.9273785948753357, + "learning_rate": 0.00013002517972482368, + "loss": 2.8913, + "step": 4089 + }, + { + "epoch": 0.37023626323888836, + "grad_norm": 0.8214293122291565, + "learning_rate": 0.00012999490912770107, + "loss": 2.8843, + "step": 4090 + }, + { + "epoch": 0.3703267855526387, + "grad_norm": 0.9375723004341125, + "learning_rate": 0.00012996463551009197, + "loss": 2.8452, + "step": 4091 + }, + { + "epoch": 0.37041730786638905, + "grad_norm": 0.8647713661193848, + "learning_rate": 0.00012993435887504487, + "loss": 2.9241, + "step": 4092 + }, + { + "epoch": 0.3705078301801394, + "grad_norm": 0.8443884253501892, + "learning_rate": 0.00012990407922560868, + "loss": 2.8765, + "step": 4093 + }, + { + "epoch": 0.3705983524938897, + "grad_norm": 0.9264103770256042, + "learning_rate": 0.0001298737965648325, + "loss": 2.8917, + "step": 4094 + }, + { + "epoch": 0.37068887480764007, + "grad_norm": 0.7921240925788879, + "learning_rate": 0.00012984351089576586, + "loss": 2.7979, + "step": 4095 + }, + { + "epoch": 0.3707793971213904, + "grad_norm": 0.9128064513206482, + "learning_rate": 0.00012981322222145846, + "loss": 2.9056, + "step": 4096 + }, + { + "epoch": 0.37086991943514075, + "grad_norm": 1.012534499168396, + "learning_rate": 0.0001297829305449604, + "loss": 3.0145, + "step": 4097 + }, + { + "epoch": 0.3709604417488911, + "grad_norm": 0.9095678925514221, + "learning_rate": 0.00012975263586932208, + "loss": 2.8566, + "step": 4098 + }, + { + "epoch": 0.37105096406264143, + "grad_norm": 0.8481354713439941, + "learning_rate": 0.0001297223381975941, + "loss": 2.8792, + "step": 4099 + }, + { + "epoch": 0.37114148637639177, + "grad_norm": 0.9588367938995361, + "learning_rate": 0.0001296920375328275, + "loss": 2.8702, + "step": 4100 + }, + { + "epoch": 0.3712320086901421, + "grad_norm": 0.9041178822517395, + "learning_rate": 0.0001296617338780735, + "loss": 2.8923, + "step": 4101 + }, + { + "epoch": 0.37132253100389245, + "grad_norm": 0.9123795628547668, + "learning_rate": 0.00012963142723638378, + "loss": 2.9847, + "step": 4102 + }, + { + "epoch": 0.3714130533176428, + "grad_norm": 0.8965758085250854, + "learning_rate": 0.00012960111761081008, + "loss": 2.8985, + "step": 4103 + }, + { + "epoch": 0.37150357563139313, + "grad_norm": 0.8802927732467651, + "learning_rate": 0.00012957080500440468, + "loss": 2.8288, + "step": 4104 + }, + { + "epoch": 0.3715940979451435, + "grad_norm": 0.8850862383842468, + "learning_rate": 0.00012954048942022002, + "loss": 2.878, + "step": 4105 + }, + { + "epoch": 0.3716846202588938, + "grad_norm": 0.8778537511825562, + "learning_rate": 0.0001295101708613089, + "loss": 2.8459, + "step": 4106 + }, + { + "epoch": 0.37177514257264416, + "grad_norm": 0.8377190232276917, + "learning_rate": 0.00012947984933072435, + "loss": 2.9462, + "step": 4107 + }, + { + "epoch": 0.3718656648863945, + "grad_norm": 1.0077134370803833, + "learning_rate": 0.00012944952483151978, + "loss": 2.8496, + "step": 4108 + }, + { + "epoch": 0.37195618720014484, + "grad_norm": 0.8665797114372253, + "learning_rate": 0.00012941919736674887, + "loss": 2.8963, + "step": 4109 + }, + { + "epoch": 0.3720467095138952, + "grad_norm": 0.8284353017807007, + "learning_rate": 0.0001293888669394656, + "loss": 2.9595, + "step": 4110 + }, + { + "epoch": 0.3721372318276455, + "grad_norm": 0.8614863753318787, + "learning_rate": 0.00012935853355272424, + "loss": 2.878, + "step": 4111 + }, + { + "epoch": 0.37222775414139586, + "grad_norm": 0.8768258690834045, + "learning_rate": 0.00012932819720957935, + "loss": 2.8506, + "step": 4112 + }, + { + "epoch": 0.3723182764551462, + "grad_norm": 0.8659483194351196, + "learning_rate": 0.0001292978579130858, + "loss": 2.883, + "step": 4113 + }, + { + "epoch": 0.37240879876889654, + "grad_norm": 0.8106630444526672, + "learning_rate": 0.00012926751566629875, + "loss": 2.8823, + "step": 4114 + }, + { + "epoch": 0.3724993210826469, + "grad_norm": 0.8505522012710571, + "learning_rate": 0.00012923717047227368, + "loss": 2.878, + "step": 4115 + }, + { + "epoch": 0.3725898433963972, + "grad_norm": 0.8515750765800476, + "learning_rate": 0.00012920682233406633, + "loss": 2.8721, + "step": 4116 + }, + { + "epoch": 0.37268036571014757, + "grad_norm": 0.893912672996521, + "learning_rate": 0.0001291764712547328, + "loss": 2.9565, + "step": 4117 + }, + { + "epoch": 0.3727708880238979, + "grad_norm": 0.937900185585022, + "learning_rate": 0.00012914611723732942, + "loss": 2.8862, + "step": 4118 + }, + { + "epoch": 0.37286141033764825, + "grad_norm": 0.921802818775177, + "learning_rate": 0.00012911576028491278, + "loss": 2.9014, + "step": 4119 + }, + { + "epoch": 0.3729519326513986, + "grad_norm": 0.8836219310760498, + "learning_rate": 0.0001290854004005399, + "loss": 2.9419, + "step": 4120 + }, + { + "epoch": 0.37304245496514893, + "grad_norm": 0.8305163383483887, + "learning_rate": 0.00012905503758726803, + "loss": 2.8919, + "step": 4121 + }, + { + "epoch": 0.37313297727889927, + "grad_norm": 0.992817759513855, + "learning_rate": 0.00012902467184815463, + "loss": 2.8757, + "step": 4122 + }, + { + "epoch": 0.3732234995926496, + "grad_norm": 0.8104541897773743, + "learning_rate": 0.00012899430318625757, + "loss": 2.8196, + "step": 4123 + }, + { + "epoch": 0.37331402190639995, + "grad_norm": 0.8181300163269043, + "learning_rate": 0.000128963931604635, + "loss": 2.8211, + "step": 4124 + }, + { + "epoch": 0.3734045442201503, + "grad_norm": 0.8287342190742493, + "learning_rate": 0.0001289335571063453, + "loss": 2.8883, + "step": 4125 + }, + { + "epoch": 0.37349506653390063, + "grad_norm": 0.8907531499862671, + "learning_rate": 0.00012890317969444716, + "loss": 2.926, + "step": 4126 + }, + { + "epoch": 0.373585588847651, + "grad_norm": 0.8196613192558289, + "learning_rate": 0.00012887279937199962, + "loss": 2.8536, + "step": 4127 + }, + { + "epoch": 0.37367611116140126, + "grad_norm": 0.84673011302948, + "learning_rate": 0.00012884241614206202, + "loss": 2.8441, + "step": 4128 + }, + { + "epoch": 0.3737666334751516, + "grad_norm": 0.9478344321250916, + "learning_rate": 0.00012881203000769388, + "loss": 2.8454, + "step": 4129 + }, + { + "epoch": 0.37385715578890194, + "grad_norm": 0.8046672344207764, + "learning_rate": 0.0001287816409719551, + "loss": 2.8328, + "step": 4130 + }, + { + "epoch": 0.3739476781026523, + "grad_norm": 0.924982488155365, + "learning_rate": 0.00012875124903790586, + "loss": 2.9235, + "step": 4131 + }, + { + "epoch": 0.3740382004164026, + "grad_norm": 0.8310782313346863, + "learning_rate": 0.00012872085420860665, + "loss": 2.9017, + "step": 4132 + }, + { + "epoch": 0.37412872273015296, + "grad_norm": 0.8438023328781128, + "learning_rate": 0.00012869045648711817, + "loss": 2.931, + "step": 4133 + }, + { + "epoch": 0.3742192450439033, + "grad_norm": 0.8599631786346436, + "learning_rate": 0.0001286600558765015, + "loss": 2.8513, + "step": 4134 + }, + { + "epoch": 0.37430976735765364, + "grad_norm": 0.9266738891601562, + "learning_rate": 0.00012862965237981803, + "loss": 2.9311, + "step": 4135 + }, + { + "epoch": 0.374400289671404, + "grad_norm": 0.918306827545166, + "learning_rate": 0.00012859924600012936, + "loss": 2.8969, + "step": 4136 + }, + { + "epoch": 0.3744908119851543, + "grad_norm": 0.8407638072967529, + "learning_rate": 0.00012856883674049736, + "loss": 2.8794, + "step": 4137 + }, + { + "epoch": 0.37458133429890467, + "grad_norm": 0.8753742575645447, + "learning_rate": 0.00012853842460398428, + "loss": 2.8764, + "step": 4138 + }, + { + "epoch": 0.374671856612655, + "grad_norm": 0.9728417992591858, + "learning_rate": 0.00012850800959365266, + "loss": 2.9482, + "step": 4139 + }, + { + "epoch": 0.37476237892640535, + "grad_norm": 0.832207977771759, + "learning_rate": 0.00012847759171256523, + "loss": 2.9202, + "step": 4140 + }, + { + "epoch": 0.3748529012401557, + "grad_norm": 0.8750010132789612, + "learning_rate": 0.00012844717096378508, + "loss": 2.9017, + "step": 4141 + }, + { + "epoch": 0.37494342355390603, + "grad_norm": 0.9617355465888977, + "learning_rate": 0.0001284167473503756, + "loss": 2.8529, + "step": 4142 + }, + { + "epoch": 0.37503394586765637, + "grad_norm": 0.8699533939361572, + "learning_rate": 0.00012838632087540047, + "loss": 2.8743, + "step": 4143 + }, + { + "epoch": 0.3751244681814067, + "grad_norm": 0.9251249432563782, + "learning_rate": 0.00012835589154192357, + "loss": 2.819, + "step": 4144 + }, + { + "epoch": 0.37521499049515705, + "grad_norm": 0.9185225963592529, + "learning_rate": 0.0001283254593530092, + "loss": 2.8883, + "step": 4145 + }, + { + "epoch": 0.3753055128089074, + "grad_norm": 0.92988520860672, + "learning_rate": 0.0001282950243117218, + "loss": 2.9043, + "step": 4146 + }, + { + "epoch": 0.37539603512265773, + "grad_norm": 0.8868668675422668, + "learning_rate": 0.0001282645864211263, + "loss": 2.9247, + "step": 4147 + }, + { + "epoch": 0.3754865574364081, + "grad_norm": 0.9574105739593506, + "learning_rate": 0.00012823414568428768, + "loss": 2.8973, + "step": 4148 + }, + { + "epoch": 0.3755770797501584, + "grad_norm": 0.936896026134491, + "learning_rate": 0.00012820370210427133, + "loss": 2.9021, + "step": 4149 + }, + { + "epoch": 0.37566760206390876, + "grad_norm": 0.8531725406646729, + "learning_rate": 0.00012817325568414297, + "loss": 2.7698, + "step": 4150 + }, + { + "epoch": 0.3757581243776591, + "grad_norm": 0.9539023041725159, + "learning_rate": 0.00012814280642696856, + "loss": 2.9243, + "step": 4151 + }, + { + "epoch": 0.37584864669140944, + "grad_norm": 0.8933930397033691, + "learning_rate": 0.00012811235433581427, + "loss": 2.8733, + "step": 4152 + }, + { + "epoch": 0.3759391690051598, + "grad_norm": 0.9392674565315247, + "learning_rate": 0.00012808189941374668, + "loss": 2.8682, + "step": 4153 + }, + { + "epoch": 0.3760296913189101, + "grad_norm": 0.8223789930343628, + "learning_rate": 0.00012805144166383263, + "loss": 2.861, + "step": 4154 + }, + { + "epoch": 0.37612021363266046, + "grad_norm": 0.9169121980667114, + "learning_rate": 0.00012802098108913914, + "loss": 2.8804, + "step": 4155 + }, + { + "epoch": 0.3762107359464108, + "grad_norm": 0.9235851168632507, + "learning_rate": 0.00012799051769273362, + "loss": 2.8841, + "step": 4156 + }, + { + "epoch": 0.37630125826016114, + "grad_norm": 0.8986383080482483, + "learning_rate": 0.00012796005147768376, + "loss": 2.8593, + "step": 4157 + }, + { + "epoch": 0.3763917805739115, + "grad_norm": 0.9925276637077332, + "learning_rate": 0.00012792958244705745, + "loss": 2.9139, + "step": 4158 + }, + { + "epoch": 0.3764823028876618, + "grad_norm": 0.8893771171569824, + "learning_rate": 0.00012789911060392294, + "loss": 2.7976, + "step": 4159 + }, + { + "epoch": 0.37657282520141216, + "grad_norm": 0.8771906495094299, + "learning_rate": 0.0001278686359513488, + "loss": 2.8583, + "step": 4160 + }, + { + "epoch": 0.3766633475151625, + "grad_norm": 0.9729670882225037, + "learning_rate": 0.00012783815849240374, + "loss": 2.8408, + "step": 4161 + }, + { + "epoch": 0.37675386982891285, + "grad_norm": 0.9033991098403931, + "learning_rate": 0.0001278076782301569, + "loss": 2.8587, + "step": 4162 + }, + { + "epoch": 0.3768443921426632, + "grad_norm": 0.9147695302963257, + "learning_rate": 0.0001277771951676776, + "loss": 2.8858, + "step": 4163 + }, + { + "epoch": 0.37693491445641353, + "grad_norm": 0.849720299243927, + "learning_rate": 0.00012774670930803554, + "loss": 2.8747, + "step": 4164 + }, + { + "epoch": 0.37702543677016387, + "grad_norm": 0.8087278604507446, + "learning_rate": 0.0001277162206543006, + "loss": 2.8786, + "step": 4165 + }, + { + "epoch": 0.3771159590839142, + "grad_norm": 0.8310896158218384, + "learning_rate": 0.00012768572920954298, + "loss": 2.9031, + "step": 4166 + }, + { + "epoch": 0.37720648139766455, + "grad_norm": 0.8340770602226257, + "learning_rate": 0.00012765523497683322, + "loss": 2.8748, + "step": 4167 + }, + { + "epoch": 0.3772970037114149, + "grad_norm": 0.8991544842720032, + "learning_rate": 0.00012762473795924204, + "loss": 2.9399, + "step": 4168 + }, + { + "epoch": 0.3773875260251652, + "grad_norm": 0.9005395174026489, + "learning_rate": 0.0001275942381598405, + "loss": 2.9311, + "step": 4169 + }, + { + "epoch": 0.3774780483389155, + "grad_norm": 0.9556332230567932, + "learning_rate": 0.0001275637355816999, + "loss": 2.8791, + "step": 4170 + }, + { + "epoch": 0.37756857065266586, + "grad_norm": 0.8282430768013, + "learning_rate": 0.00012753323022789194, + "loss": 2.9248, + "step": 4171 + }, + { + "epoch": 0.3776590929664162, + "grad_norm": 0.8877475261688232, + "learning_rate": 0.00012750272210148843, + "loss": 2.8638, + "step": 4172 + }, + { + "epoch": 0.37774961528016654, + "grad_norm": 0.8167067170143127, + "learning_rate": 0.00012747221120556153, + "loss": 2.9017, + "step": 4173 + }, + { + "epoch": 0.3778401375939169, + "grad_norm": 0.8058724403381348, + "learning_rate": 0.00012744169754318375, + "loss": 2.8457, + "step": 4174 + }, + { + "epoch": 0.3779306599076672, + "grad_norm": 0.812492847442627, + "learning_rate": 0.00012741118111742777, + "loss": 2.8472, + "step": 4175 + }, + { + "epoch": 0.37802118222141756, + "grad_norm": 0.9392409920692444, + "learning_rate": 0.0001273806619313666, + "loss": 2.851, + "step": 4176 + }, + { + "epoch": 0.3781117045351679, + "grad_norm": 0.877270519733429, + "learning_rate": 0.0001273501399880735, + "loss": 2.8653, + "step": 4177 + }, + { + "epoch": 0.37820222684891824, + "grad_norm": 0.9208534359931946, + "learning_rate": 0.00012731961529062211, + "loss": 2.8362, + "step": 4178 + }, + { + "epoch": 0.3782927491626686, + "grad_norm": 0.9426068663597107, + "learning_rate": 0.00012728908784208616, + "loss": 2.8833, + "step": 4179 + }, + { + "epoch": 0.3783832714764189, + "grad_norm": 0.9093818068504333, + "learning_rate": 0.0001272585576455398, + "loss": 2.9297, + "step": 4180 + }, + { + "epoch": 0.37847379379016927, + "grad_norm": 0.953253984451294, + "learning_rate": 0.00012722802470405744, + "loss": 2.837, + "step": 4181 + }, + { + "epoch": 0.3785643161039196, + "grad_norm": 0.9242717623710632, + "learning_rate": 0.00012719748902071374, + "loss": 2.8074, + "step": 4182 + }, + { + "epoch": 0.37865483841766995, + "grad_norm": 0.8428967595100403, + "learning_rate": 0.00012716695059858366, + "loss": 2.8661, + "step": 4183 + }, + { + "epoch": 0.3787453607314203, + "grad_norm": 1.0141724348068237, + "learning_rate": 0.00012713640944074238, + "loss": 2.8884, + "step": 4184 + }, + { + "epoch": 0.37883588304517063, + "grad_norm": 0.8837010860443115, + "learning_rate": 0.0001271058655502654, + "loss": 2.903, + "step": 4185 + }, + { + "epoch": 0.37892640535892097, + "grad_norm": 0.8637431859970093, + "learning_rate": 0.00012707531893022854, + "loss": 2.8396, + "step": 4186 + }, + { + "epoch": 0.3790169276726713, + "grad_norm": 0.9034766554832458, + "learning_rate": 0.00012704476958370778, + "loss": 2.8339, + "step": 4187 + }, + { + "epoch": 0.37910744998642165, + "grad_norm": 0.8125141859054565, + "learning_rate": 0.00012701421751377945, + "loss": 2.8675, + "step": 4188 + }, + { + "epoch": 0.379197972300172, + "grad_norm": 0.9025378227233887, + "learning_rate": 0.00012698366272352019, + "loss": 2.8319, + "step": 4189 + }, + { + "epoch": 0.37928849461392233, + "grad_norm": 0.9068765044212341, + "learning_rate": 0.0001269531052160068, + "loss": 2.8712, + "step": 4190 + }, + { + "epoch": 0.3793790169276727, + "grad_norm": 0.9077470302581787, + "learning_rate": 0.00012692254499431646, + "loss": 2.8898, + "step": 4191 + }, + { + "epoch": 0.379469539241423, + "grad_norm": 0.8503580093383789, + "learning_rate": 0.00012689198206152657, + "loss": 2.8703, + "step": 4192 + }, + { + "epoch": 0.37956006155517336, + "grad_norm": 0.8890337347984314, + "learning_rate": 0.00012686141642071486, + "loss": 2.8733, + "step": 4193 + }, + { + "epoch": 0.3796505838689237, + "grad_norm": 0.7975422739982605, + "learning_rate": 0.0001268308480749592, + "loss": 2.8605, + "step": 4194 + }, + { + "epoch": 0.37974110618267404, + "grad_norm": 0.8244346380233765, + "learning_rate": 0.0001268002770273379, + "loss": 2.8681, + "step": 4195 + }, + { + "epoch": 0.3798316284964244, + "grad_norm": 0.8913033604621887, + "learning_rate": 0.00012676970328092944, + "loss": 2.9361, + "step": 4196 + }, + { + "epoch": 0.3799221508101747, + "grad_norm": 0.8553683161735535, + "learning_rate": 0.0001267391268388126, + "loss": 2.8772, + "step": 4197 + }, + { + "epoch": 0.38001267312392506, + "grad_norm": 0.8335919976234436, + "learning_rate": 0.0001267085477040664, + "loss": 2.8831, + "step": 4198 + }, + { + "epoch": 0.3801031954376754, + "grad_norm": 0.8713265061378479, + "learning_rate": 0.00012667796587977016, + "loss": 2.9399, + "step": 4199 + }, + { + "epoch": 0.38019371775142574, + "grad_norm": 0.94398033618927, + "learning_rate": 0.00012664738136900348, + "loss": 2.8853, + "step": 4200 + }, + { + "epoch": 0.3802842400651761, + "grad_norm": 0.7876652479171753, + "learning_rate": 0.0001266167941748463, + "loss": 2.8678, + "step": 4201 + }, + { + "epoch": 0.3803747623789264, + "grad_norm": 0.8114630579948425, + "learning_rate": 0.00012658620430037863, + "loss": 2.8559, + "step": 4202 + }, + { + "epoch": 0.38046528469267676, + "grad_norm": 0.9380217790603638, + "learning_rate": 0.00012655561174868088, + "loss": 2.8233, + "step": 4203 + }, + { + "epoch": 0.3805558070064271, + "grad_norm": 0.8938210606575012, + "learning_rate": 0.00012652501652283377, + "loss": 2.8636, + "step": 4204 + }, + { + "epoch": 0.38064632932017745, + "grad_norm": 0.9160013794898987, + "learning_rate": 0.00012649441862591827, + "loss": 2.8823, + "step": 4205 + }, + { + "epoch": 0.3807368516339278, + "grad_norm": 1.035874605178833, + "learning_rate": 0.0001264638180610155, + "loss": 2.9009, + "step": 4206 + }, + { + "epoch": 0.3808273739476781, + "grad_norm": 0.8780242204666138, + "learning_rate": 0.00012643321483120698, + "loss": 2.8823, + "step": 4207 + }, + { + "epoch": 0.38091789626142847, + "grad_norm": 0.9666605591773987, + "learning_rate": 0.00012640260893957445, + "loss": 2.8469, + "step": 4208 + }, + { + "epoch": 0.3810084185751788, + "grad_norm": 0.8629251718521118, + "learning_rate": 0.0001263720003891999, + "loss": 2.8137, + "step": 4209 + }, + { + "epoch": 0.3810989408889291, + "grad_norm": 0.9518691301345825, + "learning_rate": 0.00012634138918316568, + "loss": 2.9227, + "step": 4210 + }, + { + "epoch": 0.38118946320267943, + "grad_norm": 0.8704745769500732, + "learning_rate": 0.00012631077532455426, + "loss": 2.8615, + "step": 4211 + }, + { + "epoch": 0.3812799855164298, + "grad_norm": 0.9694190621376038, + "learning_rate": 0.0001262801588164485, + "loss": 2.9558, + "step": 4212 + }, + { + "epoch": 0.3813705078301801, + "grad_norm": 1.1124874353408813, + "learning_rate": 0.00012624953966193145, + "loss": 2.9274, + "step": 4213 + }, + { + "epoch": 0.38146103014393046, + "grad_norm": 0.9309592247009277, + "learning_rate": 0.00012621891786408648, + "loss": 2.8492, + "step": 4214 + }, + { + "epoch": 0.3815515524576808, + "grad_norm": 0.910110592842102, + "learning_rate": 0.00012618829342599718, + "loss": 2.8612, + "step": 4215 + }, + { + "epoch": 0.38164207477143114, + "grad_norm": 0.9568766355514526, + "learning_rate": 0.0001261576663507475, + "loss": 2.8755, + "step": 4216 + }, + { + "epoch": 0.3817325970851815, + "grad_norm": 0.9147237539291382, + "learning_rate": 0.0001261270366414215, + "loss": 2.8692, + "step": 4217 + }, + { + "epoch": 0.3818231193989318, + "grad_norm": 0.9557003378868103, + "learning_rate": 0.0001260964043011036, + "loss": 2.9074, + "step": 4218 + }, + { + "epoch": 0.38191364171268216, + "grad_norm": 0.9502111077308655, + "learning_rate": 0.00012606576933287854, + "loss": 2.9124, + "step": 4219 + }, + { + "epoch": 0.3820041640264325, + "grad_norm": 0.9677018523216248, + "learning_rate": 0.0001260351317398312, + "loss": 2.8416, + "step": 4220 + }, + { + "epoch": 0.38209468634018284, + "grad_norm": 0.9683780074119568, + "learning_rate": 0.00012600449152504683, + "loss": 2.9383, + "step": 4221 + }, + { + "epoch": 0.3821852086539332, + "grad_norm": 0.8879417777061462, + "learning_rate": 0.00012597384869161084, + "loss": 2.9172, + "step": 4222 + }, + { + "epoch": 0.3822757309676835, + "grad_norm": 0.9438839554786682, + "learning_rate": 0.00012594320324260905, + "loss": 2.8511, + "step": 4223 + }, + { + "epoch": 0.38236625328143387, + "grad_norm": 0.9300707578659058, + "learning_rate": 0.00012591255518112736, + "loss": 2.8671, + "step": 4224 + }, + { + "epoch": 0.3824567755951842, + "grad_norm": 0.9284319877624512, + "learning_rate": 0.00012588190451025207, + "loss": 2.9136, + "step": 4225 + }, + { + "epoch": 0.38254729790893455, + "grad_norm": 0.8524740934371948, + "learning_rate": 0.0001258512512330697, + "loss": 2.8853, + "step": 4226 + }, + { + "epoch": 0.3826378202226849, + "grad_norm": 0.935153603553772, + "learning_rate": 0.00012582059535266708, + "loss": 2.9154, + "step": 4227 + }, + { + "epoch": 0.38272834253643523, + "grad_norm": 0.9207292199134827, + "learning_rate": 0.00012578993687213118, + "loss": 2.8153, + "step": 4228 + }, + { + "epoch": 0.38281886485018557, + "grad_norm": 0.8508001565933228, + "learning_rate": 0.00012575927579454936, + "loss": 2.8845, + "step": 4229 + }, + { + "epoch": 0.3829093871639359, + "grad_norm": 0.9135047793388367, + "learning_rate": 0.00012572861212300918, + "loss": 2.9142, + "step": 4230 + }, + { + "epoch": 0.38299990947768625, + "grad_norm": 0.8793505430221558, + "learning_rate": 0.00012569794586059845, + "loss": 2.9614, + "step": 4231 + }, + { + "epoch": 0.3830904317914366, + "grad_norm": 0.8126338124275208, + "learning_rate": 0.00012566727701040527, + "loss": 2.859, + "step": 4232 + }, + { + "epoch": 0.38318095410518693, + "grad_norm": 0.8644139170646667, + "learning_rate": 0.000125636605575518, + "loss": 2.875, + "step": 4233 + }, + { + "epoch": 0.3832714764189373, + "grad_norm": 0.9225925207138062, + "learning_rate": 0.00012560593155902522, + "loss": 2.9098, + "step": 4234 + }, + { + "epoch": 0.3833619987326876, + "grad_norm": 0.8944104909896851, + "learning_rate": 0.00012557525496401589, + "loss": 2.8909, + "step": 4235 + }, + { + "epoch": 0.38345252104643796, + "grad_norm": 0.8773167729377747, + "learning_rate": 0.00012554457579357905, + "loss": 2.8494, + "step": 4236 + }, + { + "epoch": 0.3835430433601883, + "grad_norm": 0.9156031012535095, + "learning_rate": 0.00012551389405080415, + "loss": 2.8644, + "step": 4237 + }, + { + "epoch": 0.38363356567393864, + "grad_norm": 0.9683679342269897, + "learning_rate": 0.0001254832097387808, + "loss": 2.8617, + "step": 4238 + }, + { + "epoch": 0.383724087987689, + "grad_norm": 0.8678101301193237, + "learning_rate": 0.0001254525228605989, + "loss": 2.8842, + "step": 4239 + }, + { + "epoch": 0.3838146103014393, + "grad_norm": 0.8760818243026733, + "learning_rate": 0.00012542183341934872, + "loss": 2.8221, + "step": 4240 + }, + { + "epoch": 0.38390513261518966, + "grad_norm": 0.957080066204071, + "learning_rate": 0.00012539114141812057, + "loss": 2.8962, + "step": 4241 + }, + { + "epoch": 0.38399565492894, + "grad_norm": 0.8777123093605042, + "learning_rate": 0.00012536044686000514, + "loss": 2.8909, + "step": 4242 + }, + { + "epoch": 0.38408617724269034, + "grad_norm": 0.8769468665122986, + "learning_rate": 0.00012532974974809345, + "loss": 2.8519, + "step": 4243 + }, + { + "epoch": 0.3841766995564407, + "grad_norm": 0.9063210487365723, + "learning_rate": 0.0001252990500854766, + "loss": 2.8714, + "step": 4244 + }, + { + "epoch": 0.384267221870191, + "grad_norm": 0.9434245228767395, + "learning_rate": 0.00012526834787524615, + "loss": 2.8069, + "step": 4245 + }, + { + "epoch": 0.38435774418394136, + "grad_norm": 1.06622314453125, + "learning_rate": 0.00012523764312049376, + "loss": 2.8203, + "step": 4246 + }, + { + "epoch": 0.3844482664976917, + "grad_norm": 0.9174593687057495, + "learning_rate": 0.0001252069358243114, + "loss": 2.879, + "step": 4247 + }, + { + "epoch": 0.38453878881144204, + "grad_norm": 1.1018455028533936, + "learning_rate": 0.00012517622598979126, + "loss": 2.8449, + "step": 4248 + }, + { + "epoch": 0.3846293111251924, + "grad_norm": 0.8385401368141174, + "learning_rate": 0.00012514551362002586, + "loss": 2.7949, + "step": 4249 + }, + { + "epoch": 0.3847198334389427, + "grad_norm": 0.9275259971618652, + "learning_rate": 0.0001251147987181079, + "loss": 2.8635, + "step": 4250 + }, + { + "epoch": 0.384810355752693, + "grad_norm": 0.9716927409172058, + "learning_rate": 0.00012508408128713048, + "loss": 2.9095, + "step": 4251 + }, + { + "epoch": 0.38490087806644335, + "grad_norm": 0.8956915140151978, + "learning_rate": 0.00012505336133018672, + "loss": 2.8683, + "step": 4252 + }, + { + "epoch": 0.3849914003801937, + "grad_norm": 0.9020968675613403, + "learning_rate": 0.00012502263885037014, + "loss": 2.8835, + "step": 4253 + }, + { + "epoch": 0.38508192269394403, + "grad_norm": 1.0122654438018799, + "learning_rate": 0.0001249919138507745, + "loss": 2.9904, + "step": 4254 + }, + { + "epoch": 0.3851724450076944, + "grad_norm": 0.9834968447685242, + "learning_rate": 0.00012496118633449385, + "loss": 2.8318, + "step": 4255 + }, + { + "epoch": 0.3852629673214447, + "grad_norm": 0.9706962704658508, + "learning_rate": 0.0001249304563046224, + "loss": 2.9138, + "step": 4256 + }, + { + "epoch": 0.38535348963519506, + "grad_norm": 1.0231425762176514, + "learning_rate": 0.0001248997237642547, + "loss": 2.9679, + "step": 4257 + }, + { + "epoch": 0.3854440119489454, + "grad_norm": 0.9518309831619263, + "learning_rate": 0.0001248689887164855, + "loss": 2.891, + "step": 4258 + }, + { + "epoch": 0.38553453426269574, + "grad_norm": 0.9990224242210388, + "learning_rate": 0.00012483825116440983, + "loss": 2.8797, + "step": 4259 + }, + { + "epoch": 0.3856250565764461, + "grad_norm": 0.8865232467651367, + "learning_rate": 0.0001248075111111229, + "loss": 2.8957, + "step": 4260 + }, + { + "epoch": 0.3857155788901964, + "grad_norm": 0.9017773270606995, + "learning_rate": 0.0001247767685597203, + "loss": 2.9159, + "step": 4261 + }, + { + "epoch": 0.38580610120394676, + "grad_norm": 0.8915837407112122, + "learning_rate": 0.0001247460235132978, + "loss": 2.9103, + "step": 4262 + }, + { + "epoch": 0.3858966235176971, + "grad_norm": 0.9616152048110962, + "learning_rate": 0.00012471527597495138, + "loss": 2.8551, + "step": 4263 + }, + { + "epoch": 0.38598714583144744, + "grad_norm": 0.9576265811920166, + "learning_rate": 0.00012468452594777737, + "loss": 2.9263, + "step": 4264 + }, + { + "epoch": 0.3860776681451978, + "grad_norm": 0.9476226568222046, + "learning_rate": 0.00012465377343487226, + "loss": 2.957, + "step": 4265 + }, + { + "epoch": 0.3861681904589481, + "grad_norm": 0.8279786705970764, + "learning_rate": 0.00012462301843933285, + "loss": 2.8588, + "step": 4266 + }, + { + "epoch": 0.38625871277269846, + "grad_norm": 0.8010810017585754, + "learning_rate": 0.00012459226096425615, + "loss": 2.8308, + "step": 4267 + }, + { + "epoch": 0.3863492350864488, + "grad_norm": 0.7839717268943787, + "learning_rate": 0.00012456150101273944, + "loss": 2.8188, + "step": 4268 + }, + { + "epoch": 0.38643975740019915, + "grad_norm": 0.8168207406997681, + "learning_rate": 0.00012453073858788026, + "loss": 2.8888, + "step": 4269 + }, + { + "epoch": 0.3865302797139495, + "grad_norm": 0.8193057179450989, + "learning_rate": 0.0001244999736927764, + "loss": 2.8527, + "step": 4270 + }, + { + "epoch": 0.38662080202769983, + "grad_norm": 0.8369550704956055, + "learning_rate": 0.00012446920633052582, + "loss": 2.8992, + "step": 4271 + }, + { + "epoch": 0.38671132434145017, + "grad_norm": 0.8519963026046753, + "learning_rate": 0.00012443843650422685, + "loss": 2.8817, + "step": 4272 + }, + { + "epoch": 0.3868018466552005, + "grad_norm": 0.8299965262413025, + "learning_rate": 0.00012440766421697799, + "loss": 2.9291, + "step": 4273 + }, + { + "epoch": 0.38689236896895085, + "grad_norm": 0.826339066028595, + "learning_rate": 0.000124376889471878, + "loss": 2.8891, + "step": 4274 + }, + { + "epoch": 0.3869828912827012, + "grad_norm": 0.9041489958763123, + "learning_rate": 0.0001243461122720259, + "loss": 2.9446, + "step": 4275 + }, + { + "epoch": 0.38707341359645153, + "grad_norm": 0.8152336478233337, + "learning_rate": 0.00012431533262052098, + "loss": 2.9069, + "step": 4276 + }, + { + "epoch": 0.3871639359102019, + "grad_norm": 1.0291637182235718, + "learning_rate": 0.00012428455052046272, + "loss": 2.8704, + "step": 4277 + }, + { + "epoch": 0.3872544582239522, + "grad_norm": 0.8456107378005981, + "learning_rate": 0.0001242537659749509, + "loss": 2.9149, + "step": 4278 + }, + { + "epoch": 0.38734498053770255, + "grad_norm": 0.8081295490264893, + "learning_rate": 0.0001242229789870855, + "loss": 2.8514, + "step": 4279 + }, + { + "epoch": 0.3874355028514529, + "grad_norm": 0.9080233573913574, + "learning_rate": 0.00012419218955996676, + "loss": 2.8538, + "step": 4280 + }, + { + "epoch": 0.38752602516520324, + "grad_norm": 0.862369954586029, + "learning_rate": 0.00012416139769669523, + "loss": 2.8268, + "step": 4281 + }, + { + "epoch": 0.3876165474789536, + "grad_norm": 0.8651420474052429, + "learning_rate": 0.00012413060340037163, + "loss": 2.8342, + "step": 4282 + }, + { + "epoch": 0.3877070697927039, + "grad_norm": 0.8810906410217285, + "learning_rate": 0.00012409980667409687, + "loss": 2.8656, + "step": 4283 + }, + { + "epoch": 0.38779759210645426, + "grad_norm": 0.8607074022293091, + "learning_rate": 0.00012406900752097227, + "loss": 2.8539, + "step": 4284 + }, + { + "epoch": 0.3878881144202046, + "grad_norm": 0.9719976186752319, + "learning_rate": 0.00012403820594409924, + "loss": 2.9161, + "step": 4285 + }, + { + "epoch": 0.38797863673395494, + "grad_norm": 0.8851466178894043, + "learning_rate": 0.00012400740194657957, + "loss": 2.8657, + "step": 4286 + }, + { + "epoch": 0.3880691590477053, + "grad_norm": 0.853900671005249, + "learning_rate": 0.00012397659553151514, + "loss": 2.8479, + "step": 4287 + }, + { + "epoch": 0.3881596813614556, + "grad_norm": 0.9867987036705017, + "learning_rate": 0.00012394578670200826, + "loss": 2.8907, + "step": 4288 + }, + { + "epoch": 0.38825020367520596, + "grad_norm": 0.9488958716392517, + "learning_rate": 0.00012391497546116126, + "loss": 2.8455, + "step": 4289 + }, + { + "epoch": 0.3883407259889563, + "grad_norm": 0.9296489953994751, + "learning_rate": 0.0001238841618120769, + "loss": 2.8706, + "step": 4290 + }, + { + "epoch": 0.38843124830270664, + "grad_norm": 0.8813881874084473, + "learning_rate": 0.0001238533457578581, + "loss": 2.9245, + "step": 4291 + }, + { + "epoch": 0.38852177061645693, + "grad_norm": 0.8357471823692322, + "learning_rate": 0.00012382252730160803, + "loss": 2.8462, + "step": 4292 + }, + { + "epoch": 0.38861229293020727, + "grad_norm": 0.8444204330444336, + "learning_rate": 0.00012379170644643013, + "loss": 2.8553, + "step": 4293 + }, + { + "epoch": 0.3887028152439576, + "grad_norm": 0.8784762024879456, + "learning_rate": 0.000123760883195428, + "loss": 2.8585, + "step": 4294 + }, + { + "epoch": 0.38879333755770795, + "grad_norm": 0.8281036019325256, + "learning_rate": 0.00012373005755170563, + "loss": 2.8572, + "step": 4295 + }, + { + "epoch": 0.3888838598714583, + "grad_norm": 0.8708269596099854, + "learning_rate": 0.0001236992295183671, + "loss": 2.8737, + "step": 4296 + }, + { + "epoch": 0.38897438218520863, + "grad_norm": 0.9682866930961609, + "learning_rate": 0.0001236683990985168, + "loss": 2.8974, + "step": 4297 + }, + { + "epoch": 0.389064904498959, + "grad_norm": 0.8646484017372131, + "learning_rate": 0.00012363756629525937, + "loss": 2.8173, + "step": 4298 + }, + { + "epoch": 0.3891554268127093, + "grad_norm": 0.8460427522659302, + "learning_rate": 0.00012360673111169965, + "loss": 2.8546, + "step": 4299 + }, + { + "epoch": 0.38924594912645966, + "grad_norm": 0.8345630168914795, + "learning_rate": 0.00012357589355094275, + "loss": 2.8514, + "step": 4300 + }, + { + "epoch": 0.38933647144021, + "grad_norm": 0.8849084377288818, + "learning_rate": 0.00012354505361609398, + "loss": 2.8739, + "step": 4301 + }, + { + "epoch": 0.38942699375396034, + "grad_norm": 0.825874388217926, + "learning_rate": 0.000123514211310259, + "loss": 2.8724, + "step": 4302 + }, + { + "epoch": 0.3895175160677107, + "grad_norm": 0.8098484873771667, + "learning_rate": 0.00012348336663654358, + "loss": 2.8817, + "step": 4303 + }, + { + "epoch": 0.389608038381461, + "grad_norm": 0.8967568874359131, + "learning_rate": 0.0001234525195980538, + "loss": 2.9141, + "step": 4304 + }, + { + "epoch": 0.38969856069521136, + "grad_norm": 0.8785738348960876, + "learning_rate": 0.0001234216701978959, + "loss": 2.8451, + "step": 4305 + }, + { + "epoch": 0.3897890830089617, + "grad_norm": 0.9527652263641357, + "learning_rate": 0.00012339081843917645, + "loss": 2.796, + "step": 4306 + }, + { + "epoch": 0.38987960532271204, + "grad_norm": 0.8243290781974792, + "learning_rate": 0.00012335996432500228, + "loss": 2.7992, + "step": 4307 + }, + { + "epoch": 0.3899701276364624, + "grad_norm": 0.8251200318336487, + "learning_rate": 0.0001233291078584803, + "loss": 2.839, + "step": 4308 + }, + { + "epoch": 0.3900606499502127, + "grad_norm": 0.8345006108283997, + "learning_rate": 0.00012329824904271777, + "loss": 2.854, + "step": 4309 + }, + { + "epoch": 0.39015117226396306, + "grad_norm": 0.8241994976997375, + "learning_rate": 0.00012326738788082223, + "loss": 2.8613, + "step": 4310 + }, + { + "epoch": 0.3902416945777134, + "grad_norm": 0.7824759483337402, + "learning_rate": 0.00012323652437590138, + "loss": 2.8107, + "step": 4311 + }, + { + "epoch": 0.39033221689146375, + "grad_norm": 0.8252853751182556, + "learning_rate": 0.00012320565853106316, + "loss": 2.8466, + "step": 4312 + }, + { + "epoch": 0.3904227392052141, + "grad_norm": 0.8508025407791138, + "learning_rate": 0.00012317479034941573, + "loss": 2.8644, + "step": 4313 + }, + { + "epoch": 0.3905132615189644, + "grad_norm": 0.8455524444580078, + "learning_rate": 0.00012314391983406757, + "loss": 2.8706, + "step": 4314 + }, + { + "epoch": 0.39060378383271477, + "grad_norm": 0.8711053729057312, + "learning_rate": 0.0001231130469881273, + "loss": 2.8698, + "step": 4315 + }, + { + "epoch": 0.3906943061464651, + "grad_norm": 0.9837364554405212, + "learning_rate": 0.00012308217181470385, + "loss": 2.9033, + "step": 4316 + }, + { + "epoch": 0.39078482846021545, + "grad_norm": 0.8791324496269226, + "learning_rate": 0.00012305129431690632, + "loss": 2.8562, + "step": 4317 + }, + { + "epoch": 0.3908753507739658, + "grad_norm": 0.8958924412727356, + "learning_rate": 0.00012302041449784409, + "loss": 2.8738, + "step": 4318 + }, + { + "epoch": 0.39096587308771613, + "grad_norm": 0.8958149552345276, + "learning_rate": 0.00012298953236062677, + "loss": 2.9199, + "step": 4319 + }, + { + "epoch": 0.3910563954014665, + "grad_norm": 0.7985171675682068, + "learning_rate": 0.0001229586479083641, + "loss": 2.8096, + "step": 4320 + }, + { + "epoch": 0.3911469177152168, + "grad_norm": 0.8733648061752319, + "learning_rate": 0.00012292776114416627, + "loss": 2.8864, + "step": 4321 + }, + { + "epoch": 0.39123744002896715, + "grad_norm": 0.8304138779640198, + "learning_rate": 0.0001228968720711435, + "loss": 2.8483, + "step": 4322 + }, + { + "epoch": 0.3913279623427175, + "grad_norm": 0.9201784729957581, + "learning_rate": 0.00012286598069240635, + "loss": 2.9073, + "step": 4323 + }, + { + "epoch": 0.39141848465646784, + "grad_norm": 0.9191954135894775, + "learning_rate": 0.00012283508701106557, + "loss": 2.8897, + "step": 4324 + }, + { + "epoch": 0.3915090069702182, + "grad_norm": 0.8933663368225098, + "learning_rate": 0.00012280419103023217, + "loss": 2.8218, + "step": 4325 + }, + { + "epoch": 0.3915995292839685, + "grad_norm": 0.8663362860679626, + "learning_rate": 0.00012277329275301732, + "loss": 2.8768, + "step": 4326 + }, + { + "epoch": 0.39169005159771886, + "grad_norm": 0.8382437229156494, + "learning_rate": 0.00012274239218253255, + "loss": 2.8738, + "step": 4327 + }, + { + "epoch": 0.3917805739114692, + "grad_norm": 0.8278548121452332, + "learning_rate": 0.0001227114893218895, + "loss": 2.8751, + "step": 4328 + }, + { + "epoch": 0.39187109622521954, + "grad_norm": 0.8287359476089478, + "learning_rate": 0.00012268058417420007, + "loss": 2.8383, + "step": 4329 + }, + { + "epoch": 0.3919616185389699, + "grad_norm": 0.8964946866035461, + "learning_rate": 0.00012264967674257646, + "loss": 2.8526, + "step": 4330 + }, + { + "epoch": 0.3920521408527202, + "grad_norm": 0.829495370388031, + "learning_rate": 0.00012261876703013102, + "loss": 2.8759, + "step": 4331 + }, + { + "epoch": 0.39214266316647056, + "grad_norm": 0.8812410235404968, + "learning_rate": 0.00012258785503997633, + "loss": 2.8668, + "step": 4332 + }, + { + "epoch": 0.39223318548022085, + "grad_norm": 0.8432778120040894, + "learning_rate": 0.00012255694077522528, + "loss": 2.857, + "step": 4333 + }, + { + "epoch": 0.3923237077939712, + "grad_norm": 0.8417193293571472, + "learning_rate": 0.0001225260242389909, + "loss": 2.8395, + "step": 4334 + }, + { + "epoch": 0.39241423010772153, + "grad_norm": 0.8582254648208618, + "learning_rate": 0.0001224951054343865, + "loss": 2.9089, + "step": 4335 + }, + { + "epoch": 0.39250475242147187, + "grad_norm": 0.8694718480110168, + "learning_rate": 0.00012246418436452562, + "loss": 2.8468, + "step": 4336 + }, + { + "epoch": 0.3925952747352222, + "grad_norm": 0.8391682505607605, + "learning_rate": 0.00012243326103252198, + "loss": 2.9174, + "step": 4337 + }, + { + "epoch": 0.39268579704897255, + "grad_norm": 0.8766013979911804, + "learning_rate": 0.00012240233544148955, + "loss": 2.7995, + "step": 4338 + }, + { + "epoch": 0.3927763193627229, + "grad_norm": 0.8532915711402893, + "learning_rate": 0.00012237140759454256, + "loss": 2.8328, + "step": 4339 + }, + { + "epoch": 0.39286684167647323, + "grad_norm": 0.8749273419380188, + "learning_rate": 0.00012234047749479544, + "loss": 2.8936, + "step": 4340 + }, + { + "epoch": 0.3929573639902236, + "grad_norm": 0.8936702013015747, + "learning_rate": 0.00012230954514536285, + "loss": 2.7884, + "step": 4341 + }, + { + "epoch": 0.3930478863039739, + "grad_norm": 0.7897341847419739, + "learning_rate": 0.00012227861054935968, + "loss": 2.8163, + "step": 4342 + }, + { + "epoch": 0.39313840861772426, + "grad_norm": 0.8324590921401978, + "learning_rate": 0.00012224767370990101, + "loss": 2.9101, + "step": 4343 + }, + { + "epoch": 0.3932289309314746, + "grad_norm": 0.787476122379303, + "learning_rate": 0.00012221673463010224, + "loss": 2.8887, + "step": 4344 + }, + { + "epoch": 0.39331945324522494, + "grad_norm": 0.8571562767028809, + "learning_rate": 0.00012218579331307888, + "loss": 2.8645, + "step": 4345 + }, + { + "epoch": 0.3934099755589753, + "grad_norm": 0.87290358543396, + "learning_rate": 0.00012215484976194676, + "loss": 2.8824, + "step": 4346 + }, + { + "epoch": 0.3935004978727256, + "grad_norm": 0.8542999029159546, + "learning_rate": 0.00012212390397982185, + "loss": 2.8801, + "step": 4347 + }, + { + "epoch": 0.39359102018647596, + "grad_norm": 0.9487765431404114, + "learning_rate": 0.00012209295596982042, + "loss": 2.8601, + "step": 4348 + }, + { + "epoch": 0.3936815425002263, + "grad_norm": 0.85524982213974, + "learning_rate": 0.00012206200573505895, + "loss": 2.8454, + "step": 4349 + }, + { + "epoch": 0.39377206481397664, + "grad_norm": 0.8825935125350952, + "learning_rate": 0.00012203105327865407, + "loss": 2.9284, + "step": 4350 + }, + { + "epoch": 0.393862587127727, + "grad_norm": 0.8699039816856384, + "learning_rate": 0.00012200009860372275, + "loss": 2.8432, + "step": 4351 + }, + { + "epoch": 0.3939531094414773, + "grad_norm": 0.8540199398994446, + "learning_rate": 0.0001219691417133821, + "loss": 2.9114, + "step": 4352 + }, + { + "epoch": 0.39404363175522766, + "grad_norm": 0.9271214604377747, + "learning_rate": 0.00012193818261074949, + "loss": 2.8886, + "step": 4353 + }, + { + "epoch": 0.394134154068978, + "grad_norm": 0.8362212777137756, + "learning_rate": 0.00012190722129894248, + "loss": 2.8184, + "step": 4354 + }, + { + "epoch": 0.39422467638272835, + "grad_norm": 0.8742707371711731, + "learning_rate": 0.0001218762577810789, + "loss": 2.9139, + "step": 4355 + }, + { + "epoch": 0.3943151986964787, + "grad_norm": 0.8451969027519226, + "learning_rate": 0.00012184529206027673, + "loss": 2.8399, + "step": 4356 + }, + { + "epoch": 0.394405721010229, + "grad_norm": 0.8662227988243103, + "learning_rate": 0.00012181432413965428, + "loss": 2.8809, + "step": 4357 + }, + { + "epoch": 0.39449624332397937, + "grad_norm": 0.8754814267158508, + "learning_rate": 0.00012178335402232996, + "loss": 2.8007, + "step": 4358 + }, + { + "epoch": 0.3945867656377297, + "grad_norm": 0.7949616312980652, + "learning_rate": 0.00012175238171142249, + "loss": 2.8371, + "step": 4359 + }, + { + "epoch": 0.39467728795148005, + "grad_norm": 0.8271093368530273, + "learning_rate": 0.00012172140721005079, + "loss": 2.7895, + "step": 4360 + }, + { + "epoch": 0.3947678102652304, + "grad_norm": 0.8187628388404846, + "learning_rate": 0.00012169043052133398, + "loss": 2.7996, + "step": 4361 + }, + { + "epoch": 0.39485833257898073, + "grad_norm": 0.9346163868904114, + "learning_rate": 0.00012165945164839138, + "loss": 2.8735, + "step": 4362 + }, + { + "epoch": 0.39494885489273107, + "grad_norm": 0.9251669645309448, + "learning_rate": 0.00012162847059434261, + "loss": 2.8397, + "step": 4363 + }, + { + "epoch": 0.3950393772064814, + "grad_norm": 0.818914532661438, + "learning_rate": 0.00012159748736230747, + "loss": 2.8434, + "step": 4364 + }, + { + "epoch": 0.39512989952023175, + "grad_norm": 0.8839664459228516, + "learning_rate": 0.0001215665019554059, + "loss": 2.8257, + "step": 4365 + }, + { + "epoch": 0.3952204218339821, + "grad_norm": 0.8695526719093323, + "learning_rate": 0.00012153551437675821, + "loss": 2.8094, + "step": 4366 + }, + { + "epoch": 0.39531094414773243, + "grad_norm": 0.8801332712173462, + "learning_rate": 0.00012150452462948479, + "loss": 2.832, + "step": 4367 + }, + { + "epoch": 0.3954014664614828, + "grad_norm": 0.8525098562240601, + "learning_rate": 0.00012147353271670634, + "loss": 2.8797, + "step": 4368 + }, + { + "epoch": 0.3954919887752331, + "grad_norm": 1.0056629180908203, + "learning_rate": 0.00012144253864154372, + "loss": 2.9493, + "step": 4369 + }, + { + "epoch": 0.39558251108898346, + "grad_norm": 0.8409388065338135, + "learning_rate": 0.00012141154240711805, + "loss": 2.8124, + "step": 4370 + }, + { + "epoch": 0.3956730334027338, + "grad_norm": 0.8684418201446533, + "learning_rate": 0.00012138054401655063, + "loss": 2.7768, + "step": 4371 + }, + { + "epoch": 0.39576355571648414, + "grad_norm": 0.8791725039482117, + "learning_rate": 0.00012134954347296305, + "loss": 2.8593, + "step": 4372 + }, + { + "epoch": 0.3958540780302345, + "grad_norm": 0.9349090456962585, + "learning_rate": 0.00012131854077947699, + "loss": 2.8747, + "step": 4373 + }, + { + "epoch": 0.39594460034398477, + "grad_norm": 0.8536260724067688, + "learning_rate": 0.00012128753593921447, + "loss": 2.9138, + "step": 4374 + }, + { + "epoch": 0.3960351226577351, + "grad_norm": 0.8024949431419373, + "learning_rate": 0.00012125652895529766, + "loss": 2.8145, + "step": 4375 + }, + { + "epoch": 0.39612564497148545, + "grad_norm": 0.876518726348877, + "learning_rate": 0.000121225519830849, + "loss": 2.8944, + "step": 4376 + }, + { + "epoch": 0.3962161672852358, + "grad_norm": 0.9107688069343567, + "learning_rate": 0.00012119450856899104, + "loss": 2.8592, + "step": 4377 + }, + { + "epoch": 0.39630668959898613, + "grad_norm": 0.8297701478004456, + "learning_rate": 0.00012116349517284665, + "loss": 2.8506, + "step": 4378 + }, + { + "epoch": 0.39639721191273647, + "grad_norm": 0.9151343703269958, + "learning_rate": 0.00012113247964553888, + "loss": 2.8746, + "step": 4379 + }, + { + "epoch": 0.3964877342264868, + "grad_norm": 0.9112572073936462, + "learning_rate": 0.000121101461990191, + "loss": 2.8462, + "step": 4380 + }, + { + "epoch": 0.39657825654023715, + "grad_norm": 0.879729151725769, + "learning_rate": 0.00012107044220992642, + "loss": 2.9074, + "step": 4381 + }, + { + "epoch": 0.3966687788539875, + "grad_norm": 0.9959706664085388, + "learning_rate": 0.00012103942030786894, + "loss": 2.8299, + "step": 4382 + }, + { + "epoch": 0.39675930116773783, + "grad_norm": 0.869488000869751, + "learning_rate": 0.0001210083962871424, + "loss": 2.8547, + "step": 4383 + }, + { + "epoch": 0.3968498234814882, + "grad_norm": 0.826572597026825, + "learning_rate": 0.00012097737015087094, + "loss": 2.8806, + "step": 4384 + }, + { + "epoch": 0.3969403457952385, + "grad_norm": 0.8823446035385132, + "learning_rate": 0.00012094634190217887, + "loss": 2.8588, + "step": 4385 + }, + { + "epoch": 0.39703086810898885, + "grad_norm": 0.9061761498451233, + "learning_rate": 0.00012091531154419072, + "loss": 2.8667, + "step": 4386 + }, + { + "epoch": 0.3971213904227392, + "grad_norm": 0.8355569243431091, + "learning_rate": 0.0001208842790800313, + "loss": 2.8116, + "step": 4387 + }, + { + "epoch": 0.39721191273648954, + "grad_norm": 0.8613497018814087, + "learning_rate": 0.00012085324451282553, + "loss": 2.8725, + "step": 4388 + }, + { + "epoch": 0.3973024350502399, + "grad_norm": 0.9011744856834412, + "learning_rate": 0.00012082220784569862, + "loss": 2.9125, + "step": 4389 + }, + { + "epoch": 0.3973929573639902, + "grad_norm": 0.8703922629356384, + "learning_rate": 0.00012079116908177593, + "loss": 2.845, + "step": 4390 + }, + { + "epoch": 0.39748347967774056, + "grad_norm": 0.8401715159416199, + "learning_rate": 0.00012076012822418312, + "loss": 2.8521, + "step": 4391 + }, + { + "epoch": 0.3975740019914909, + "grad_norm": 0.8488914370536804, + "learning_rate": 0.00012072908527604594, + "loss": 2.8931, + "step": 4392 + }, + { + "epoch": 0.39766452430524124, + "grad_norm": 0.9117351770401001, + "learning_rate": 0.00012069804024049043, + "loss": 2.7976, + "step": 4393 + }, + { + "epoch": 0.3977550466189916, + "grad_norm": 0.94156813621521, + "learning_rate": 0.00012066699312064286, + "loss": 2.9198, + "step": 4394 + }, + { + "epoch": 0.3978455689327419, + "grad_norm": 0.8953226804733276, + "learning_rate": 0.00012063594391962962, + "loss": 2.8474, + "step": 4395 + }, + { + "epoch": 0.39793609124649226, + "grad_norm": 0.8929262161254883, + "learning_rate": 0.00012060489264057742, + "loss": 2.8395, + "step": 4396 + }, + { + "epoch": 0.3980266135602426, + "grad_norm": 0.8781452178955078, + "learning_rate": 0.00012057383928661308, + "loss": 2.853, + "step": 4397 + }, + { + "epoch": 0.39811713587399294, + "grad_norm": 0.9336954951286316, + "learning_rate": 0.00012054278386086368, + "loss": 2.8811, + "step": 4398 + }, + { + "epoch": 0.3982076581877433, + "grad_norm": 0.8699139952659607, + "learning_rate": 0.00012051172636645654, + "loss": 2.9717, + "step": 4399 + }, + { + "epoch": 0.3982981805014936, + "grad_norm": 0.8598851561546326, + "learning_rate": 0.00012048066680651908, + "loss": 2.8222, + "step": 4400 + }, + { + "epoch": 0.39838870281524397, + "grad_norm": 0.8180475831031799, + "learning_rate": 0.00012044960518417903, + "loss": 2.8072, + "step": 4401 + }, + { + "epoch": 0.3984792251289943, + "grad_norm": 0.9748538732528687, + "learning_rate": 0.00012041854150256433, + "loss": 2.8898, + "step": 4402 + }, + { + "epoch": 0.39856974744274465, + "grad_norm": 0.8849080204963684, + "learning_rate": 0.00012038747576480303, + "loss": 2.8782, + "step": 4403 + }, + { + "epoch": 0.398660269756495, + "grad_norm": 1.0537378787994385, + "learning_rate": 0.0001203564079740235, + "loss": 2.8096, + "step": 4404 + }, + { + "epoch": 0.39875079207024533, + "grad_norm": 0.8559945225715637, + "learning_rate": 0.00012032533813335423, + "loss": 2.8918, + "step": 4405 + }, + { + "epoch": 0.39884131438399567, + "grad_norm": 0.9511855244636536, + "learning_rate": 0.00012029426624592397, + "loss": 2.9183, + "step": 4406 + }, + { + "epoch": 0.398931836697746, + "grad_norm": 0.9862476587295532, + "learning_rate": 0.00012026319231486165, + "loss": 2.8062, + "step": 4407 + }, + { + "epoch": 0.39902235901149635, + "grad_norm": 0.9083656668663025, + "learning_rate": 0.00012023211634329643, + "loss": 2.8313, + "step": 4408 + }, + { + "epoch": 0.3991128813252467, + "grad_norm": 0.9601117372512817, + "learning_rate": 0.00012020103833435766, + "loss": 2.8391, + "step": 4409 + }, + { + "epoch": 0.39920340363899703, + "grad_norm": 0.9977763295173645, + "learning_rate": 0.00012016995829117488, + "loss": 2.8204, + "step": 4410 + }, + { + "epoch": 0.3992939259527474, + "grad_norm": 0.8471393585205078, + "learning_rate": 0.00012013887621687783, + "loss": 2.905, + "step": 4411 + }, + { + "epoch": 0.3993844482664977, + "grad_norm": 0.9744412302970886, + "learning_rate": 0.00012010779211459648, + "loss": 2.8808, + "step": 4412 + }, + { + "epoch": 0.39947497058024806, + "grad_norm": 1.0381581783294678, + "learning_rate": 0.00012007670598746103, + "loss": 2.8895, + "step": 4413 + }, + { + "epoch": 0.3995654928939984, + "grad_norm": 0.8495534658432007, + "learning_rate": 0.00012004561783860186, + "loss": 2.8401, + "step": 4414 + }, + { + "epoch": 0.3996560152077487, + "grad_norm": 1.0044671297073364, + "learning_rate": 0.00012001452767114951, + "loss": 2.918, + "step": 4415 + }, + { + "epoch": 0.399746537521499, + "grad_norm": 0.9183343648910522, + "learning_rate": 0.00011998343548823474, + "loss": 2.8259, + "step": 4416 + }, + { + "epoch": 0.39983705983524936, + "grad_norm": 0.8820471167564392, + "learning_rate": 0.0001199523412929886, + "loss": 2.8592, + "step": 4417 + }, + { + "epoch": 0.3999275821489997, + "grad_norm": 0.9422193169593811, + "learning_rate": 0.00011992124508854223, + "loss": 2.8703, + "step": 4418 + }, + { + "epoch": 0.40001810446275005, + "grad_norm": 0.8565012216567993, + "learning_rate": 0.00011989014687802703, + "loss": 2.8063, + "step": 4419 + }, + { + "epoch": 0.4001086267765004, + "grad_norm": 0.8324585556983948, + "learning_rate": 0.00011985904666457455, + "loss": 2.8611, + "step": 4420 + }, + { + "epoch": 0.4001991490902507, + "grad_norm": 0.8732492923736572, + "learning_rate": 0.00011982794445131665, + "loss": 2.8615, + "step": 4421 + }, + { + "epoch": 0.40028967140400107, + "grad_norm": 0.7857765555381775, + "learning_rate": 0.00011979684024138529, + "loss": 2.8034, + "step": 4422 + }, + { + "epoch": 0.4003801937177514, + "grad_norm": 0.8531179428100586, + "learning_rate": 0.00011976573403791262, + "loss": 2.9058, + "step": 4423 + }, + { + "epoch": 0.40047071603150175, + "grad_norm": 0.9699780941009521, + "learning_rate": 0.0001197346258440311, + "loss": 2.8425, + "step": 4424 + }, + { + "epoch": 0.4005612383452521, + "grad_norm": 0.8353390097618103, + "learning_rate": 0.00011970351566287333, + "loss": 2.8082, + "step": 4425 + }, + { + "epoch": 0.40065176065900243, + "grad_norm": 0.9401875138282776, + "learning_rate": 0.00011967240349757203, + "loss": 2.9017, + "step": 4426 + }, + { + "epoch": 0.4007422829727528, + "grad_norm": 0.9973408579826355, + "learning_rate": 0.00011964128935126023, + "loss": 2.8623, + "step": 4427 + }, + { + "epoch": 0.4008328052865031, + "grad_norm": 0.8002395629882812, + "learning_rate": 0.00011961017322707113, + "loss": 2.8808, + "step": 4428 + }, + { + "epoch": 0.40092332760025345, + "grad_norm": 0.9130750298500061, + "learning_rate": 0.00011957905512813817, + "loss": 2.8638, + "step": 4429 + }, + { + "epoch": 0.4010138499140038, + "grad_norm": 0.9972096085548401, + "learning_rate": 0.00011954793505759483, + "loss": 2.9469, + "step": 4430 + }, + { + "epoch": 0.40110437222775414, + "grad_norm": 0.8289200663566589, + "learning_rate": 0.000119516813018575, + "loss": 2.7791, + "step": 4431 + }, + { + "epoch": 0.4011948945415045, + "grad_norm": 1.0694494247436523, + "learning_rate": 0.00011948568901421263, + "loss": 2.8447, + "step": 4432 + }, + { + "epoch": 0.4012854168552548, + "grad_norm": 1.0092542171478271, + "learning_rate": 0.00011945456304764192, + "loss": 2.8501, + "step": 4433 + }, + { + "epoch": 0.40137593916900516, + "grad_norm": 0.955417275428772, + "learning_rate": 0.0001194234351219972, + "loss": 2.8781, + "step": 4434 + }, + { + "epoch": 0.4014664614827555, + "grad_norm": 1.1501630544662476, + "learning_rate": 0.00011939230524041314, + "loss": 2.841, + "step": 4435 + }, + { + "epoch": 0.40155698379650584, + "grad_norm": 0.8584458827972412, + "learning_rate": 0.00011936117340602444, + "loss": 2.8188, + "step": 4436 + }, + { + "epoch": 0.4016475061102562, + "grad_norm": 0.9481405019760132, + "learning_rate": 0.00011933003962196613, + "loss": 2.8674, + "step": 4437 + }, + { + "epoch": 0.4017380284240065, + "grad_norm": 0.9717084169387817, + "learning_rate": 0.00011929890389137337, + "loss": 2.857, + "step": 4438 + }, + { + "epoch": 0.40182855073775686, + "grad_norm": 0.8797189593315125, + "learning_rate": 0.00011926776621738149, + "loss": 2.7974, + "step": 4439 + }, + { + "epoch": 0.4019190730515072, + "grad_norm": 0.9222303628921509, + "learning_rate": 0.00011923662660312611, + "loss": 2.8468, + "step": 4440 + }, + { + "epoch": 0.40200959536525754, + "grad_norm": 0.8810079097747803, + "learning_rate": 0.00011920548505174293, + "loss": 2.8549, + "step": 4441 + }, + { + "epoch": 0.4021001176790079, + "grad_norm": 0.9079646468162537, + "learning_rate": 0.00011917434156636793, + "loss": 2.7933, + "step": 4442 + }, + { + "epoch": 0.4021906399927582, + "grad_norm": 0.8858142495155334, + "learning_rate": 0.00011914319615013726, + "loss": 2.7967, + "step": 4443 + }, + { + "epoch": 0.40228116230650857, + "grad_norm": 0.8796921372413635, + "learning_rate": 0.00011911204880618729, + "loss": 2.8341, + "step": 4444 + }, + { + "epoch": 0.4023716846202589, + "grad_norm": 0.9303306937217712, + "learning_rate": 0.00011908089953765449, + "loss": 2.8224, + "step": 4445 + }, + { + "epoch": 0.40246220693400925, + "grad_norm": 0.9221446514129639, + "learning_rate": 0.00011904974834767563, + "loss": 2.8236, + "step": 4446 + }, + { + "epoch": 0.4025527292477596, + "grad_norm": 0.9513319730758667, + "learning_rate": 0.00011901859523938765, + "loss": 2.8596, + "step": 4447 + }, + { + "epoch": 0.40264325156150993, + "grad_norm": 0.7914127707481384, + "learning_rate": 0.00011898744021592767, + "loss": 2.8026, + "step": 4448 + }, + { + "epoch": 0.40273377387526027, + "grad_norm": 0.8371176719665527, + "learning_rate": 0.00011895628328043296, + "loss": 2.8296, + "step": 4449 + }, + { + "epoch": 0.4028242961890106, + "grad_norm": 0.9562647938728333, + "learning_rate": 0.00011892512443604102, + "loss": 2.8599, + "step": 4450 + }, + { + "epoch": 0.40291481850276095, + "grad_norm": 0.9588335156440735, + "learning_rate": 0.00011889396368588958, + "loss": 2.8484, + "step": 4451 + }, + { + "epoch": 0.4030053408165113, + "grad_norm": 0.9410818219184875, + "learning_rate": 0.00011886280103311656, + "loss": 2.8455, + "step": 4452 + }, + { + "epoch": 0.40309586313026163, + "grad_norm": 1.0550609827041626, + "learning_rate": 0.00011883163648085993, + "loss": 2.8471, + "step": 4453 + }, + { + "epoch": 0.403186385444012, + "grad_norm": 0.8768680095672607, + "learning_rate": 0.00011880047003225804, + "loss": 2.8065, + "step": 4454 + }, + { + "epoch": 0.4032769077577623, + "grad_norm": 0.815466582775116, + "learning_rate": 0.00011876930169044936, + "loss": 2.8442, + "step": 4455 + }, + { + "epoch": 0.4033674300715126, + "grad_norm": 0.9968241453170776, + "learning_rate": 0.00011873813145857249, + "loss": 2.9243, + "step": 4456 + }, + { + "epoch": 0.40345795238526294, + "grad_norm": 0.8512164354324341, + "learning_rate": 0.00011870695933976628, + "loss": 2.8586, + "step": 4457 + }, + { + "epoch": 0.4035484746990133, + "grad_norm": 0.846886396408081, + "learning_rate": 0.00011867578533716979, + "loss": 2.8505, + "step": 4458 + }, + { + "epoch": 0.4036389970127636, + "grad_norm": 0.8526726365089417, + "learning_rate": 0.00011864460945392224, + "loss": 2.8297, + "step": 4459 + }, + { + "epoch": 0.40372951932651396, + "grad_norm": 0.8981117606163025, + "learning_rate": 0.00011861343169316301, + "loss": 2.8628, + "step": 4460 + }, + { + "epoch": 0.4038200416402643, + "grad_norm": 0.8755601644515991, + "learning_rate": 0.00011858225205803171, + "loss": 2.8063, + "step": 4461 + }, + { + "epoch": 0.40391056395401465, + "grad_norm": 0.9109624624252319, + "learning_rate": 0.00011855107055166814, + "loss": 2.8223, + "step": 4462 + }, + { + "epoch": 0.404001086267765, + "grad_norm": 0.8895991444587708, + "learning_rate": 0.00011851988717721232, + "loss": 2.8655, + "step": 4463 + }, + { + "epoch": 0.4040916085815153, + "grad_norm": 0.8829487562179565, + "learning_rate": 0.00011848870193780433, + "loss": 2.8666, + "step": 4464 + }, + { + "epoch": 0.40418213089526567, + "grad_norm": 0.9633542895317078, + "learning_rate": 0.00011845751483658453, + "loss": 2.8261, + "step": 4465 + }, + { + "epoch": 0.404272653209016, + "grad_norm": 0.8148611187934875, + "learning_rate": 0.00011842632587669353, + "loss": 2.8781, + "step": 4466 + }, + { + "epoch": 0.40436317552276635, + "grad_norm": 0.8730365633964539, + "learning_rate": 0.00011839513506127203, + "loss": 2.8309, + "step": 4467 + }, + { + "epoch": 0.4044536978365167, + "grad_norm": 0.90838623046875, + "learning_rate": 0.00011836394239346091, + "loss": 2.8738, + "step": 4468 + }, + { + "epoch": 0.40454422015026703, + "grad_norm": 0.8492640256881714, + "learning_rate": 0.00011833274787640129, + "loss": 2.8809, + "step": 4469 + }, + { + "epoch": 0.40463474246401737, + "grad_norm": 0.8511492013931274, + "learning_rate": 0.00011830155151323446, + "loss": 2.8329, + "step": 4470 + }, + { + "epoch": 0.4047252647777677, + "grad_norm": 0.8511828780174255, + "learning_rate": 0.00011827035330710192, + "loss": 2.9125, + "step": 4471 + }, + { + "epoch": 0.40481578709151805, + "grad_norm": 0.9062504172325134, + "learning_rate": 0.00011823915326114527, + "loss": 2.8957, + "step": 4472 + }, + { + "epoch": 0.4049063094052684, + "grad_norm": 0.927947461605072, + "learning_rate": 0.0001182079513785064, + "loss": 2.8272, + "step": 4473 + }, + { + "epoch": 0.40499683171901874, + "grad_norm": 0.8176203966140747, + "learning_rate": 0.00011817674766232734, + "loss": 2.8658, + "step": 4474 + }, + { + "epoch": 0.4050873540327691, + "grad_norm": 0.86082524061203, + "learning_rate": 0.00011814554211575027, + "loss": 2.8829, + "step": 4475 + }, + { + "epoch": 0.4051778763465194, + "grad_norm": 0.8323209881782532, + "learning_rate": 0.0001181143347419176, + "loss": 2.834, + "step": 4476 + }, + { + "epoch": 0.40526839866026976, + "grad_norm": 0.8598990440368652, + "learning_rate": 0.00011808312554397192, + "loss": 2.8747, + "step": 4477 + }, + { + "epoch": 0.4053589209740201, + "grad_norm": 0.887123167514801, + "learning_rate": 0.00011805191452505602, + "loss": 2.9156, + "step": 4478 + }, + { + "epoch": 0.40544944328777044, + "grad_norm": 0.9374668598175049, + "learning_rate": 0.00011802070168831279, + "loss": 2.9036, + "step": 4479 + }, + { + "epoch": 0.4055399656015208, + "grad_norm": 0.9349672198295593, + "learning_rate": 0.00011798948703688539, + "loss": 2.8519, + "step": 4480 + }, + { + "epoch": 0.4056304879152711, + "grad_norm": 0.8681858777999878, + "learning_rate": 0.00011795827057391715, + "loss": 2.8797, + "step": 4481 + }, + { + "epoch": 0.40572101022902146, + "grad_norm": 0.8855987191200256, + "learning_rate": 0.00011792705230255157, + "loss": 2.8729, + "step": 4482 + }, + { + "epoch": 0.4058115325427718, + "grad_norm": 0.8382616639137268, + "learning_rate": 0.00011789583222593227, + "loss": 2.7793, + "step": 4483 + }, + { + "epoch": 0.40590205485652214, + "grad_norm": 0.8915740251541138, + "learning_rate": 0.00011786461034720319, + "loss": 2.8429, + "step": 4484 + }, + { + "epoch": 0.4059925771702725, + "grad_norm": 0.9631752967834473, + "learning_rate": 0.00011783338666950831, + "loss": 2.899, + "step": 4485 + }, + { + "epoch": 0.4060830994840228, + "grad_norm": 0.8699368238449097, + "learning_rate": 0.00011780216119599192, + "loss": 2.8685, + "step": 4486 + }, + { + "epoch": 0.40617362179777317, + "grad_norm": 0.9649322032928467, + "learning_rate": 0.00011777093392979837, + "loss": 2.822, + "step": 4487 + }, + { + "epoch": 0.4062641441115235, + "grad_norm": 0.9097427725791931, + "learning_rate": 0.00011773970487407224, + "loss": 2.8564, + "step": 4488 + }, + { + "epoch": 0.40635466642527385, + "grad_norm": 0.8963183164596558, + "learning_rate": 0.00011770847403195834, + "loss": 2.8812, + "step": 4489 + }, + { + "epoch": 0.4064451887390242, + "grad_norm": 0.9345114827156067, + "learning_rate": 0.00011767724140660157, + "loss": 2.8541, + "step": 4490 + }, + { + "epoch": 0.40653571105277453, + "grad_norm": 0.9190236926078796, + "learning_rate": 0.00011764600700114708, + "loss": 2.9107, + "step": 4491 + }, + { + "epoch": 0.40662623336652487, + "grad_norm": 0.8682000041007996, + "learning_rate": 0.00011761477081874015, + "loss": 2.9114, + "step": 4492 + }, + { + "epoch": 0.4067167556802752, + "grad_norm": 0.9088363647460938, + "learning_rate": 0.00011758353286252632, + "loss": 2.8547, + "step": 4493 + }, + { + "epoch": 0.40680727799402555, + "grad_norm": 0.8862866759300232, + "learning_rate": 0.0001175522931356512, + "loss": 2.8347, + "step": 4494 + }, + { + "epoch": 0.4068978003077759, + "grad_norm": 0.8324484825134277, + "learning_rate": 0.00011752105164126062, + "loss": 2.8227, + "step": 4495 + }, + { + "epoch": 0.40698832262152623, + "grad_norm": 0.9196053743362427, + "learning_rate": 0.00011748980838250065, + "loss": 2.793, + "step": 4496 + }, + { + "epoch": 0.4070788449352765, + "grad_norm": 0.8978742361068726, + "learning_rate": 0.00011745856336251742, + "loss": 2.8878, + "step": 4497 + }, + { + "epoch": 0.40716936724902686, + "grad_norm": 0.8683122992515564, + "learning_rate": 0.00011742731658445738, + "loss": 2.855, + "step": 4498 + }, + { + "epoch": 0.4072598895627772, + "grad_norm": 0.8661417365074158, + "learning_rate": 0.00011739606805146701, + "loss": 2.8421, + "step": 4499 + }, + { + "epoch": 0.40735041187652754, + "grad_norm": 0.9605647921562195, + "learning_rate": 0.00011736481776669306, + "loss": 2.803, + "step": 4500 + }, + { + "epoch": 0.4074409341902779, + "grad_norm": 0.8836671710014343, + "learning_rate": 0.00011733356573328244, + "loss": 2.8395, + "step": 4501 + }, + { + "epoch": 0.4075314565040282, + "grad_norm": 0.9318768382072449, + "learning_rate": 0.00011730231195438222, + "loss": 2.9127, + "step": 4502 + }, + { + "epoch": 0.40762197881777856, + "grad_norm": 0.8878833651542664, + "learning_rate": 0.00011727105643313966, + "loss": 2.8762, + "step": 4503 + }, + { + "epoch": 0.4077125011315289, + "grad_norm": 0.8291527032852173, + "learning_rate": 0.00011723979917270218, + "loss": 2.8568, + "step": 4504 + }, + { + "epoch": 0.40780302344527924, + "grad_norm": 0.9366280436515808, + "learning_rate": 0.00011720854017621743, + "loss": 2.8106, + "step": 4505 + }, + { + "epoch": 0.4078935457590296, + "grad_norm": 0.8883543610572815, + "learning_rate": 0.00011717727944683313, + "loss": 2.8194, + "step": 4506 + }, + { + "epoch": 0.4079840680727799, + "grad_norm": 0.9248679280281067, + "learning_rate": 0.00011714601698769725, + "loss": 2.8272, + "step": 4507 + }, + { + "epoch": 0.40807459038653027, + "grad_norm": 0.9774189591407776, + "learning_rate": 0.00011711475280195794, + "loss": 2.8291, + "step": 4508 + }, + { + "epoch": 0.4081651127002806, + "grad_norm": 0.9434395432472229, + "learning_rate": 0.00011708348689276351, + "loss": 2.8495, + "step": 4509 + }, + { + "epoch": 0.40825563501403095, + "grad_norm": 0.8033177256584167, + "learning_rate": 0.0001170522192632624, + "loss": 2.8239, + "step": 4510 + }, + { + "epoch": 0.4083461573277813, + "grad_norm": 0.8647253513336182, + "learning_rate": 0.00011702094991660326, + "loss": 2.8936, + "step": 4511 + }, + { + "epoch": 0.40843667964153163, + "grad_norm": 0.8673940896987915, + "learning_rate": 0.00011698967885593495, + "loss": 2.7856, + "step": 4512 + }, + { + "epoch": 0.40852720195528197, + "grad_norm": 0.8677223920822144, + "learning_rate": 0.00011695840608440644, + "loss": 2.8896, + "step": 4513 + }, + { + "epoch": 0.4086177242690323, + "grad_norm": 0.996747612953186, + "learning_rate": 0.00011692713160516691, + "loss": 2.88, + "step": 4514 + }, + { + "epoch": 0.40870824658278265, + "grad_norm": 0.9442813992500305, + "learning_rate": 0.00011689585542136567, + "loss": 2.9284, + "step": 4515 + }, + { + "epoch": 0.408798768896533, + "grad_norm": 0.9169313311576843, + "learning_rate": 0.00011686457753615228, + "loss": 2.8345, + "step": 4516 + }, + { + "epoch": 0.40888929121028333, + "grad_norm": 0.847822904586792, + "learning_rate": 0.00011683329795267636, + "loss": 2.8906, + "step": 4517 + }, + { + "epoch": 0.4089798135240337, + "grad_norm": 0.9691492915153503, + "learning_rate": 0.00011680201667408783, + "loss": 2.8347, + "step": 4518 + }, + { + "epoch": 0.409070335837784, + "grad_norm": 0.8392035961151123, + "learning_rate": 0.00011677073370353666, + "loss": 2.8626, + "step": 4519 + }, + { + "epoch": 0.40916085815153436, + "grad_norm": 0.8107256293296814, + "learning_rate": 0.00011673944904417308, + "loss": 2.8532, + "step": 4520 + }, + { + "epoch": 0.4092513804652847, + "grad_norm": 0.9076669812202454, + "learning_rate": 0.00011670816269914742, + "loss": 2.8716, + "step": 4521 + }, + { + "epoch": 0.40934190277903504, + "grad_norm": 0.8376779556274414, + "learning_rate": 0.00011667687467161024, + "loss": 2.8887, + "step": 4522 + }, + { + "epoch": 0.4094324250927854, + "grad_norm": 0.759242594242096, + "learning_rate": 0.00011664558496471223, + "loss": 2.84, + "step": 4523 + }, + { + "epoch": 0.4095229474065357, + "grad_norm": 0.815267026424408, + "learning_rate": 0.00011661429358160429, + "loss": 2.8227, + "step": 4524 + }, + { + "epoch": 0.40961346972028606, + "grad_norm": 0.854684591293335, + "learning_rate": 0.00011658300052543742, + "loss": 2.8768, + "step": 4525 + }, + { + "epoch": 0.4097039920340364, + "grad_norm": 0.8627126216888428, + "learning_rate": 0.00011655170579936286, + "loss": 2.8762, + "step": 4526 + }, + { + "epoch": 0.40979451434778674, + "grad_norm": 0.8167665004730225, + "learning_rate": 0.00011652040940653197, + "loss": 2.8455, + "step": 4527 + }, + { + "epoch": 0.4098850366615371, + "grad_norm": 0.9664933681488037, + "learning_rate": 0.00011648911135009634, + "loss": 2.8866, + "step": 4528 + }, + { + "epoch": 0.4099755589752874, + "grad_norm": 0.8510946035385132, + "learning_rate": 0.00011645781163320761, + "loss": 2.8484, + "step": 4529 + }, + { + "epoch": 0.41006608128903776, + "grad_norm": 0.9158300757408142, + "learning_rate": 0.00011642651025901772, + "loss": 2.9048, + "step": 4530 + }, + { + "epoch": 0.4101566036027881, + "grad_norm": 0.8029270172119141, + "learning_rate": 0.00011639520723067868, + "loss": 2.8317, + "step": 4531 + }, + { + "epoch": 0.41024712591653845, + "grad_norm": 0.962932288646698, + "learning_rate": 0.00011636390255134273, + "loss": 2.8754, + "step": 4532 + }, + { + "epoch": 0.4103376482302888, + "grad_norm": 0.9180269241333008, + "learning_rate": 0.00011633259622416224, + "loss": 2.869, + "step": 4533 + }, + { + "epoch": 0.41042817054403913, + "grad_norm": 0.9610112905502319, + "learning_rate": 0.00011630128825228974, + "loss": 2.878, + "step": 4534 + }, + { + "epoch": 0.41051869285778947, + "grad_norm": 0.8697507381439209, + "learning_rate": 0.000116269978638878, + "loss": 2.8602, + "step": 4535 + }, + { + "epoch": 0.4106092151715398, + "grad_norm": 0.9531928300857544, + "learning_rate": 0.00011623866738707983, + "loss": 2.8178, + "step": 4536 + }, + { + "epoch": 0.41069973748529015, + "grad_norm": 0.870388388633728, + "learning_rate": 0.00011620735450004829, + "loss": 2.8233, + "step": 4537 + }, + { + "epoch": 0.41079025979904044, + "grad_norm": 0.8597344160079956, + "learning_rate": 0.0001161760399809366, + "loss": 2.8641, + "step": 4538 + }, + { + "epoch": 0.4108807821127908, + "grad_norm": 0.8699856400489807, + "learning_rate": 0.00011614472383289814, + "loss": 2.8854, + "step": 4539 + }, + { + "epoch": 0.4109713044265411, + "grad_norm": 0.8118836283683777, + "learning_rate": 0.00011611340605908642, + "loss": 2.818, + "step": 4540 + }, + { + "epoch": 0.41106182674029146, + "grad_norm": 0.8394683003425598, + "learning_rate": 0.00011608208666265514, + "loss": 2.8377, + "step": 4541 + }, + { + "epoch": 0.4111523490540418, + "grad_norm": 0.9108496904373169, + "learning_rate": 0.00011605076564675818, + "loss": 2.8582, + "step": 4542 + }, + { + "epoch": 0.41124287136779214, + "grad_norm": 0.9241228699684143, + "learning_rate": 0.0001160194430145496, + "loss": 2.8557, + "step": 4543 + }, + { + "epoch": 0.4113333936815425, + "grad_norm": 0.8194548487663269, + "learning_rate": 0.0001159881187691835, + "loss": 2.8321, + "step": 4544 + }, + { + "epoch": 0.4114239159952928, + "grad_norm": 0.8807789087295532, + "learning_rate": 0.00011595679291381429, + "loss": 2.7702, + "step": 4545 + }, + { + "epoch": 0.41151443830904316, + "grad_norm": 0.9182827472686768, + "learning_rate": 0.00011592546545159645, + "loss": 2.8463, + "step": 4546 + }, + { + "epoch": 0.4116049606227935, + "grad_norm": 0.8201344013214111, + "learning_rate": 0.00011589413638568471, + "loss": 2.8441, + "step": 4547 + }, + { + "epoch": 0.41169548293654384, + "grad_norm": 0.9747979640960693, + "learning_rate": 0.00011586280571923384, + "loss": 2.8377, + "step": 4548 + }, + { + "epoch": 0.4117860052502942, + "grad_norm": 0.9699158072471619, + "learning_rate": 0.00011583147345539889, + "loss": 2.8487, + "step": 4549 + }, + { + "epoch": 0.4118765275640445, + "grad_norm": 0.8967488408088684, + "learning_rate": 0.000115800139597335, + "loss": 2.8408, + "step": 4550 + }, + { + "epoch": 0.41196704987779487, + "grad_norm": 0.9237414598464966, + "learning_rate": 0.00011576880414819749, + "loss": 2.8475, + "step": 4551 + }, + { + "epoch": 0.4120575721915452, + "grad_norm": 0.8308104276657104, + "learning_rate": 0.00011573746711114179, + "loss": 2.869, + "step": 4552 + }, + { + "epoch": 0.41214809450529555, + "grad_norm": 0.9393368363380432, + "learning_rate": 0.00011570612848932361, + "loss": 2.8297, + "step": 4553 + }, + { + "epoch": 0.4122386168190459, + "grad_norm": 0.8871431946754456, + "learning_rate": 0.00011567478828589875, + "loss": 2.7945, + "step": 4554 + }, + { + "epoch": 0.41232913913279623, + "grad_norm": 0.9611302018165588, + "learning_rate": 0.0001156434465040231, + "loss": 2.8664, + "step": 4555 + }, + { + "epoch": 0.41241966144654657, + "grad_norm": 0.9113096594810486, + "learning_rate": 0.00011561210314685282, + "loss": 2.8138, + "step": 4556 + }, + { + "epoch": 0.4125101837602969, + "grad_norm": 0.8560320138931274, + "learning_rate": 0.00011558075821754417, + "loss": 2.8107, + "step": 4557 + }, + { + "epoch": 0.41260070607404725, + "grad_norm": 0.8099222183227539, + "learning_rate": 0.00011554941171925365, + "loss": 2.8509, + "step": 4558 + }, + { + "epoch": 0.4126912283877976, + "grad_norm": 0.9044903516769409, + "learning_rate": 0.00011551806365513773, + "loss": 2.8588, + "step": 4559 + }, + { + "epoch": 0.41278175070154793, + "grad_norm": 0.9626808762550354, + "learning_rate": 0.00011548671402835325, + "loss": 2.87, + "step": 4560 + }, + { + "epoch": 0.4128722730152983, + "grad_norm": 0.8645593523979187, + "learning_rate": 0.00011545536284205707, + "loss": 2.8122, + "step": 4561 + }, + { + "epoch": 0.4129627953290486, + "grad_norm": 0.92280113697052, + "learning_rate": 0.00011542401009940631, + "loss": 2.8386, + "step": 4562 + }, + { + "epoch": 0.41305331764279896, + "grad_norm": 0.8630074858665466, + "learning_rate": 0.00011539265580355814, + "loss": 2.7993, + "step": 4563 + }, + { + "epoch": 0.4131438399565493, + "grad_norm": 0.9118307828903198, + "learning_rate": 0.00011536129995766996, + "loss": 2.8264, + "step": 4564 + }, + { + "epoch": 0.41323436227029964, + "grad_norm": 0.9462095499038696, + "learning_rate": 0.00011532994256489927, + "loss": 2.8627, + "step": 4565 + }, + { + "epoch": 0.41332488458405, + "grad_norm": 0.9295179843902588, + "learning_rate": 0.00011529858362840382, + "loss": 2.8083, + "step": 4566 + }, + { + "epoch": 0.4134154068978003, + "grad_norm": 0.8588746190071106, + "learning_rate": 0.00011526722315134139, + "loss": 2.8716, + "step": 4567 + }, + { + "epoch": 0.41350592921155066, + "grad_norm": 0.9106554388999939, + "learning_rate": 0.00011523586113687001, + "loss": 2.9148, + "step": 4568 + }, + { + "epoch": 0.413596451525301, + "grad_norm": 0.8623448610305786, + "learning_rate": 0.00011520449758814784, + "loss": 2.8142, + "step": 4569 + }, + { + "epoch": 0.41368697383905134, + "grad_norm": 0.898600161075592, + "learning_rate": 0.00011517313250833317, + "loss": 2.887, + "step": 4570 + }, + { + "epoch": 0.4137774961528017, + "grad_norm": 0.8837737441062927, + "learning_rate": 0.00011514176590058448, + "loss": 2.7951, + "step": 4571 + }, + { + "epoch": 0.413868018466552, + "grad_norm": 0.8727512955665588, + "learning_rate": 0.00011511039776806037, + "loss": 2.868, + "step": 4572 + }, + { + "epoch": 0.41395854078030236, + "grad_norm": 0.9692105650901794, + "learning_rate": 0.00011507902811391961, + "loss": 2.8471, + "step": 4573 + }, + { + "epoch": 0.4140490630940527, + "grad_norm": 0.8482305407524109, + "learning_rate": 0.00011504765694132116, + "loss": 2.8946, + "step": 4574 + }, + { + "epoch": 0.41413958540780305, + "grad_norm": 0.7920590043067932, + "learning_rate": 0.00011501628425342403, + "loss": 2.8343, + "step": 4575 + }, + { + "epoch": 0.4142301077215534, + "grad_norm": 0.8491135835647583, + "learning_rate": 0.0001149849100533875, + "loss": 2.8269, + "step": 4576 + }, + { + "epoch": 0.4143206300353037, + "grad_norm": 0.8065090775489807, + "learning_rate": 0.00011495353434437098, + "loss": 2.8108, + "step": 4577 + }, + { + "epoch": 0.41441115234905407, + "grad_norm": 0.8313660025596619, + "learning_rate": 0.0001149221571295339, + "loss": 2.8532, + "step": 4578 + }, + { + "epoch": 0.41450167466280435, + "grad_norm": 0.8295189142227173, + "learning_rate": 0.00011489077841203602, + "loss": 2.8218, + "step": 4579 + }, + { + "epoch": 0.4145921969765547, + "grad_norm": 0.8401261568069458, + "learning_rate": 0.00011485939819503717, + "loss": 2.848, + "step": 4580 + }, + { + "epoch": 0.41468271929030504, + "grad_norm": 0.9519902467727661, + "learning_rate": 0.00011482801648169736, + "loss": 2.8726, + "step": 4581 + }, + { + "epoch": 0.4147732416040554, + "grad_norm": 1.0146515369415283, + "learning_rate": 0.00011479663327517667, + "loss": 2.8943, + "step": 4582 + }, + { + "epoch": 0.4148637639178057, + "grad_norm": 0.8038744926452637, + "learning_rate": 0.0001147652485786354, + "loss": 2.8749, + "step": 4583 + }, + { + "epoch": 0.41495428623155606, + "grad_norm": 0.9129198789596558, + "learning_rate": 0.000114733862395234, + "loss": 2.8427, + "step": 4584 + }, + { + "epoch": 0.4150448085453064, + "grad_norm": 0.8790321350097656, + "learning_rate": 0.00011470247472813309, + "loss": 2.8366, + "step": 4585 + }, + { + "epoch": 0.41513533085905674, + "grad_norm": 0.8764024972915649, + "learning_rate": 0.00011467108558049335, + "loss": 2.8418, + "step": 4586 + }, + { + "epoch": 0.4152258531728071, + "grad_norm": 0.8494692444801331, + "learning_rate": 0.0001146396949554757, + "loss": 2.8391, + "step": 4587 + }, + { + "epoch": 0.4153163754865574, + "grad_norm": 0.874064028263092, + "learning_rate": 0.00011460830285624118, + "loss": 2.7919, + "step": 4588 + }, + { + "epoch": 0.41540689780030776, + "grad_norm": 0.8704847693443298, + "learning_rate": 0.00011457690928595095, + "loss": 2.8265, + "step": 4589 + }, + { + "epoch": 0.4154974201140581, + "grad_norm": 0.8909250497817993, + "learning_rate": 0.00011454551424776637, + "loss": 2.7961, + "step": 4590 + }, + { + "epoch": 0.41558794242780844, + "grad_norm": 0.8759780526161194, + "learning_rate": 0.00011451411774484889, + "loss": 2.804, + "step": 4591 + }, + { + "epoch": 0.4156784647415588, + "grad_norm": 0.8733645081520081, + "learning_rate": 0.00011448271978036013, + "loss": 2.8749, + "step": 4592 + }, + { + "epoch": 0.4157689870553091, + "grad_norm": 0.9054057002067566, + "learning_rate": 0.0001144513203574619, + "loss": 2.8899, + "step": 4593 + }, + { + "epoch": 0.41585950936905947, + "grad_norm": 0.8773120641708374, + "learning_rate": 0.00011441991947931612, + "loss": 2.8509, + "step": 4594 + }, + { + "epoch": 0.4159500316828098, + "grad_norm": 0.9054686427116394, + "learning_rate": 0.00011438851714908483, + "loss": 2.8504, + "step": 4595 + }, + { + "epoch": 0.41604055399656015, + "grad_norm": 0.9160197377204895, + "learning_rate": 0.00011435711336993027, + "loss": 2.872, + "step": 4596 + }, + { + "epoch": 0.4161310763103105, + "grad_norm": 0.8991233706474304, + "learning_rate": 0.00011432570814501478, + "loss": 2.8094, + "step": 4597 + }, + { + "epoch": 0.41622159862406083, + "grad_norm": 0.9077282547950745, + "learning_rate": 0.00011429430147750087, + "loss": 2.9541, + "step": 4598 + }, + { + "epoch": 0.41631212093781117, + "grad_norm": 1.050952434539795, + "learning_rate": 0.00011426289337055119, + "loss": 2.9243, + "step": 4599 + }, + { + "epoch": 0.4164026432515615, + "grad_norm": 1.1110472679138184, + "learning_rate": 0.00011423148382732853, + "loss": 2.876, + "step": 4600 + }, + { + "epoch": 0.41649316556531185, + "grad_norm": 0.9317214488983154, + "learning_rate": 0.00011420007285099585, + "loss": 2.8084, + "step": 4601 + }, + { + "epoch": 0.4165836878790622, + "grad_norm": 0.9028974771499634, + "learning_rate": 0.00011416866044471622, + "loss": 2.859, + "step": 4602 + }, + { + "epoch": 0.41667421019281253, + "grad_norm": 0.9547843337059021, + "learning_rate": 0.00011413724661165286, + "loss": 2.8426, + "step": 4603 + }, + { + "epoch": 0.4167647325065629, + "grad_norm": 0.8694342970848083, + "learning_rate": 0.00011410583135496917, + "loss": 2.8205, + "step": 4604 + }, + { + "epoch": 0.4168552548203132, + "grad_norm": 0.872846245765686, + "learning_rate": 0.00011407441467782865, + "loss": 2.8493, + "step": 4605 + }, + { + "epoch": 0.41694577713406356, + "grad_norm": 1.0523523092269897, + "learning_rate": 0.00011404299658339493, + "loss": 2.8408, + "step": 4606 + }, + { + "epoch": 0.4170362994478139, + "grad_norm": 0.9270108342170715, + "learning_rate": 0.00011401157707483185, + "loss": 2.8591, + "step": 4607 + }, + { + "epoch": 0.41712682176156424, + "grad_norm": 0.9646519422531128, + "learning_rate": 0.00011398015615530337, + "loss": 2.8092, + "step": 4608 + }, + { + "epoch": 0.4172173440753146, + "grad_norm": 0.8431654572486877, + "learning_rate": 0.00011394873382797351, + "loss": 2.8707, + "step": 4609 + }, + { + "epoch": 0.4173078663890649, + "grad_norm": 1.0408530235290527, + "learning_rate": 0.00011391731009600654, + "loss": 2.7969, + "step": 4610 + }, + { + "epoch": 0.41739838870281526, + "grad_norm": 0.906665563583374, + "learning_rate": 0.00011388588496256685, + "loss": 2.861, + "step": 4611 + }, + { + "epoch": 0.4174889110165656, + "grad_norm": 0.9940811395645142, + "learning_rate": 0.00011385445843081892, + "loss": 2.8514, + "step": 4612 + }, + { + "epoch": 0.41757943333031594, + "grad_norm": 0.8722427487373352, + "learning_rate": 0.0001138230305039274, + "loss": 2.7887, + "step": 4613 + }, + { + "epoch": 0.4176699556440663, + "grad_norm": 0.9708684682846069, + "learning_rate": 0.00011379160118505705, + "loss": 2.8326, + "step": 4614 + }, + { + "epoch": 0.4177604779578166, + "grad_norm": 1.0897921323776245, + "learning_rate": 0.00011376017047737291, + "loss": 2.8131, + "step": 4615 + }, + { + "epoch": 0.41785100027156696, + "grad_norm": 0.9895443320274353, + "learning_rate": 0.00011372873838403993, + "loss": 2.8639, + "step": 4616 + }, + { + "epoch": 0.4179415225853173, + "grad_norm": 1.0044602155685425, + "learning_rate": 0.00011369730490822336, + "loss": 2.8073, + "step": 4617 + }, + { + "epoch": 0.41803204489906765, + "grad_norm": 1.0245896577835083, + "learning_rate": 0.00011366587005308858, + "loss": 2.8415, + "step": 4618 + }, + { + "epoch": 0.418122567212818, + "grad_norm": 0.8707718849182129, + "learning_rate": 0.00011363443382180108, + "loss": 2.8101, + "step": 4619 + }, + { + "epoch": 0.4182130895265683, + "grad_norm": 0.8844798803329468, + "learning_rate": 0.00011360299621752644, + "loss": 2.8005, + "step": 4620 + }, + { + "epoch": 0.4183036118403186, + "grad_norm": 0.9705588817596436, + "learning_rate": 0.00011357155724343045, + "loss": 2.8566, + "step": 4621 + }, + { + "epoch": 0.41839413415406895, + "grad_norm": 0.8912902474403381, + "learning_rate": 0.00011354011690267903, + "loss": 2.8557, + "step": 4622 + }, + { + "epoch": 0.4184846564678193, + "grad_norm": 0.8974612951278687, + "learning_rate": 0.00011350867519843821, + "loss": 2.9134, + "step": 4623 + }, + { + "epoch": 0.41857517878156963, + "grad_norm": 0.8711275458335876, + "learning_rate": 0.00011347723213387416, + "loss": 2.7833, + "step": 4624 + }, + { + "epoch": 0.41866570109532, + "grad_norm": 0.876578152179718, + "learning_rate": 0.00011344578771215319, + "loss": 2.889, + "step": 4625 + }, + { + "epoch": 0.4187562234090703, + "grad_norm": 0.8725136518478394, + "learning_rate": 0.00011341434193644178, + "loss": 2.8994, + "step": 4626 + }, + { + "epoch": 0.41884674572282066, + "grad_norm": 0.9713298082351685, + "learning_rate": 0.00011338289480990652, + "loss": 2.8326, + "step": 4627 + }, + { + "epoch": 0.418937268036571, + "grad_norm": 0.9306853413581848, + "learning_rate": 0.00011335144633571411, + "loss": 2.8872, + "step": 4628 + }, + { + "epoch": 0.41902779035032134, + "grad_norm": 0.8581621646881104, + "learning_rate": 0.00011331999651703139, + "loss": 2.8281, + "step": 4629 + }, + { + "epoch": 0.4191183126640717, + "grad_norm": 0.8599483966827393, + "learning_rate": 0.00011328854535702543, + "loss": 2.8116, + "step": 4630 + }, + { + "epoch": 0.419208834977822, + "grad_norm": 0.8819389939308167, + "learning_rate": 0.00011325709285886328, + "loss": 2.824, + "step": 4631 + }, + { + "epoch": 0.41929935729157236, + "grad_norm": 0.8218708038330078, + "learning_rate": 0.00011322563902571226, + "loss": 2.8347, + "step": 4632 + }, + { + "epoch": 0.4193898796053227, + "grad_norm": 0.768627405166626, + "learning_rate": 0.00011319418386073972, + "loss": 2.783, + "step": 4633 + }, + { + "epoch": 0.41948040191907304, + "grad_norm": 0.8319827914237976, + "learning_rate": 0.00011316272736711329, + "loss": 2.8694, + "step": 4634 + }, + { + "epoch": 0.4195709242328234, + "grad_norm": 0.8202218413352966, + "learning_rate": 0.00011313126954800052, + "loss": 2.8062, + "step": 4635 + }, + { + "epoch": 0.4196614465465737, + "grad_norm": 0.8292239904403687, + "learning_rate": 0.0001130998104065693, + "loss": 2.9159, + "step": 4636 + }, + { + "epoch": 0.41975196886032407, + "grad_norm": 0.8331717848777771, + "learning_rate": 0.0001130683499459875, + "loss": 2.731, + "step": 4637 + }, + { + "epoch": 0.4198424911740744, + "grad_norm": 0.7963569164276123, + "learning_rate": 0.00011303688816942324, + "loss": 2.8067, + "step": 4638 + }, + { + "epoch": 0.41993301348782475, + "grad_norm": 0.8360899090766907, + "learning_rate": 0.00011300542508004468, + "loss": 2.872, + "step": 4639 + }, + { + "epoch": 0.4200235358015751, + "grad_norm": 0.8733784556388855, + "learning_rate": 0.00011297396068102017, + "loss": 2.8585, + "step": 4640 + }, + { + "epoch": 0.42011405811532543, + "grad_norm": 0.8134546875953674, + "learning_rate": 0.00011294249497551819, + "loss": 2.8574, + "step": 4641 + }, + { + "epoch": 0.42020458042907577, + "grad_norm": 0.7982373833656311, + "learning_rate": 0.00011291102796670734, + "loss": 2.8462, + "step": 4642 + }, + { + "epoch": 0.4202951027428261, + "grad_norm": 0.8143455982208252, + "learning_rate": 0.0001128795596577563, + "loss": 2.815, + "step": 4643 + }, + { + "epoch": 0.42038562505657645, + "grad_norm": 0.8223409056663513, + "learning_rate": 0.00011284809005183394, + "loss": 2.8082, + "step": 4644 + }, + { + "epoch": 0.4204761473703268, + "grad_norm": 0.9110050201416016, + "learning_rate": 0.0001128166191521093, + "loss": 2.8725, + "step": 4645 + }, + { + "epoch": 0.42056666968407713, + "grad_norm": 0.8185941576957703, + "learning_rate": 0.00011278514696175144, + "loss": 2.8505, + "step": 4646 + }, + { + "epoch": 0.4206571919978275, + "grad_norm": 0.9211212396621704, + "learning_rate": 0.00011275367348392962, + "loss": 2.8217, + "step": 4647 + }, + { + "epoch": 0.4207477143115778, + "grad_norm": 0.8780548572540283, + "learning_rate": 0.00011272219872181322, + "loss": 2.8891, + "step": 4648 + }, + { + "epoch": 0.42083823662532815, + "grad_norm": 0.9037941098213196, + "learning_rate": 0.00011269072267857182, + "loss": 2.8489, + "step": 4649 + }, + { + "epoch": 0.4209287589390785, + "grad_norm": 0.8477228283882141, + "learning_rate": 0.00011265924535737493, + "loss": 2.8404, + "step": 4650 + }, + { + "epoch": 0.42101928125282884, + "grad_norm": 0.8942874073982239, + "learning_rate": 0.0001126277667613924, + "loss": 2.7807, + "step": 4651 + }, + { + "epoch": 0.4211098035665792, + "grad_norm": 0.8547981381416321, + "learning_rate": 0.00011259628689379411, + "loss": 2.8997, + "step": 4652 + }, + { + "epoch": 0.4212003258803295, + "grad_norm": 0.8350889682769775, + "learning_rate": 0.00011256480575775007, + "loss": 2.8157, + "step": 4653 + }, + { + "epoch": 0.42129084819407986, + "grad_norm": 1.0040391683578491, + "learning_rate": 0.00011253332335643043, + "loss": 2.8049, + "step": 4654 + }, + { + "epoch": 0.4213813705078302, + "grad_norm": 0.8473368287086487, + "learning_rate": 0.00011250183969300548, + "loss": 2.8186, + "step": 4655 + }, + { + "epoch": 0.42147189282158054, + "grad_norm": 0.8104838728904724, + "learning_rate": 0.0001124703547706456, + "loss": 2.7801, + "step": 4656 + }, + { + "epoch": 0.4215624151353309, + "grad_norm": 0.8853965401649475, + "learning_rate": 0.00011243886859252135, + "loss": 2.7825, + "step": 4657 + }, + { + "epoch": 0.4216529374490812, + "grad_norm": 0.9112235307693481, + "learning_rate": 0.00011240738116180336, + "loss": 2.8799, + "step": 4658 + }, + { + "epoch": 0.42174345976283156, + "grad_norm": 0.8312155604362488, + "learning_rate": 0.00011237589248166242, + "loss": 2.8449, + "step": 4659 + }, + { + "epoch": 0.4218339820765819, + "grad_norm": 0.8818202018737793, + "learning_rate": 0.00011234440255526948, + "loss": 2.8088, + "step": 4660 + }, + { + "epoch": 0.42192450439033224, + "grad_norm": 0.9247568249702454, + "learning_rate": 0.00011231291138579552, + "loss": 2.7613, + "step": 4661 + }, + { + "epoch": 0.42201502670408253, + "grad_norm": 0.8678056597709656, + "learning_rate": 0.00011228141897641172, + "loss": 2.8426, + "step": 4662 + }, + { + "epoch": 0.42210554901783287, + "grad_norm": 0.8642814755439758, + "learning_rate": 0.00011224992533028935, + "loss": 2.7944, + "step": 4663 + }, + { + "epoch": 0.4221960713315832, + "grad_norm": 0.9176373481750488, + "learning_rate": 0.00011221843045059988, + "loss": 2.7694, + "step": 4664 + }, + { + "epoch": 0.42228659364533355, + "grad_norm": 0.8463912010192871, + "learning_rate": 0.00011218693434051475, + "loss": 2.8044, + "step": 4665 + }, + { + "epoch": 0.4223771159590839, + "grad_norm": 0.9299211502075195, + "learning_rate": 0.0001121554370032057, + "loss": 2.8042, + "step": 4666 + }, + { + "epoch": 0.42246763827283423, + "grad_norm": 0.9610387682914734, + "learning_rate": 0.00011212393844184445, + "loss": 2.8076, + "step": 4667 + }, + { + "epoch": 0.4225581605865846, + "grad_norm": 0.8744280338287354, + "learning_rate": 0.00011209243865960296, + "loss": 2.807, + "step": 4668 + }, + { + "epoch": 0.4226486829003349, + "grad_norm": 0.8455687761306763, + "learning_rate": 0.0001120609376596532, + "loss": 2.7714, + "step": 4669 + }, + { + "epoch": 0.42273920521408526, + "grad_norm": 0.8594902753829956, + "learning_rate": 0.00011202943544516736, + "loss": 2.7968, + "step": 4670 + }, + { + "epoch": 0.4228297275278356, + "grad_norm": 0.8992725610733032, + "learning_rate": 0.00011199793201931767, + "loss": 2.8628, + "step": 4671 + }, + { + "epoch": 0.42292024984158594, + "grad_norm": 0.9300136566162109, + "learning_rate": 0.00011196642738527659, + "loss": 2.8668, + "step": 4672 + }, + { + "epoch": 0.4230107721553363, + "grad_norm": 0.8369343876838684, + "learning_rate": 0.0001119349215462166, + "loss": 2.8436, + "step": 4673 + }, + { + "epoch": 0.4231012944690866, + "grad_norm": 0.8958376049995422, + "learning_rate": 0.00011190341450531033, + "loss": 2.8824, + "step": 4674 + }, + { + "epoch": 0.42319181678283696, + "grad_norm": 0.8437206149101257, + "learning_rate": 0.00011187190626573052, + "loss": 2.7803, + "step": 4675 + }, + { + "epoch": 0.4232823390965873, + "grad_norm": 0.8179407119750977, + "learning_rate": 0.00011184039683065013, + "loss": 2.8214, + "step": 4676 + }, + { + "epoch": 0.42337286141033764, + "grad_norm": 0.8458150625228882, + "learning_rate": 0.00011180888620324205, + "loss": 2.8447, + "step": 4677 + }, + { + "epoch": 0.423463383724088, + "grad_norm": 0.9066711664199829, + "learning_rate": 0.00011177737438667948, + "loss": 2.7854, + "step": 4678 + }, + { + "epoch": 0.4235539060378383, + "grad_norm": 0.8020719289779663, + "learning_rate": 0.0001117458613841356, + "loss": 2.7944, + "step": 4679 + }, + { + "epoch": 0.42364442835158866, + "grad_norm": 0.8214885592460632, + "learning_rate": 0.00011171434719878384, + "loss": 2.8372, + "step": 4680 + }, + { + "epoch": 0.423734950665339, + "grad_norm": 0.8697049021720886, + "learning_rate": 0.00011168283183379762, + "loss": 2.8341, + "step": 4681 + }, + { + "epoch": 0.42382547297908935, + "grad_norm": 0.85956871509552, + "learning_rate": 0.00011165131529235056, + "loss": 2.7766, + "step": 4682 + }, + { + "epoch": 0.4239159952928397, + "grad_norm": 0.8642212748527527, + "learning_rate": 0.00011161979757761634, + "loss": 2.8029, + "step": 4683 + }, + { + "epoch": 0.42400651760659, + "grad_norm": 0.864029586315155, + "learning_rate": 0.00011158827869276887, + "loss": 2.8399, + "step": 4684 + }, + { + "epoch": 0.42409703992034037, + "grad_norm": 0.9287242293357849, + "learning_rate": 0.00011155675864098201, + "loss": 2.854, + "step": 4685 + }, + { + "epoch": 0.4241875622340907, + "grad_norm": 0.9051283001899719, + "learning_rate": 0.00011152523742542989, + "loss": 2.8412, + "step": 4686 + }, + { + "epoch": 0.42427808454784105, + "grad_norm": 0.9301141500473022, + "learning_rate": 0.00011149371504928668, + "loss": 2.8457, + "step": 4687 + }, + { + "epoch": 0.4243686068615914, + "grad_norm": 0.8932237029075623, + "learning_rate": 0.00011146219151572668, + "loss": 2.8585, + "step": 4688 + }, + { + "epoch": 0.42445912917534173, + "grad_norm": 0.844142496585846, + "learning_rate": 0.0001114306668279243, + "loss": 2.8338, + "step": 4689 + }, + { + "epoch": 0.4245496514890921, + "grad_norm": 0.8861607909202576, + "learning_rate": 0.00011139914098905406, + "loss": 2.7525, + "step": 4690 + }, + { + "epoch": 0.4246401738028424, + "grad_norm": 0.8443228602409363, + "learning_rate": 0.0001113676140022907, + "loss": 2.825, + "step": 4691 + }, + { + "epoch": 0.42473069611659275, + "grad_norm": 0.804202675819397, + "learning_rate": 0.00011133608587080886, + "loss": 2.8037, + "step": 4692 + }, + { + "epoch": 0.4248212184303431, + "grad_norm": 0.8334986567497253, + "learning_rate": 0.00011130455659778349, + "loss": 2.8234, + "step": 4693 + }, + { + "epoch": 0.42491174074409344, + "grad_norm": 0.8185308575630188, + "learning_rate": 0.0001112730261863896, + "loss": 2.8714, + "step": 4694 + }, + { + "epoch": 0.4250022630578438, + "grad_norm": 0.863042950630188, + "learning_rate": 0.00011124149463980229, + "loss": 2.8481, + "step": 4695 + }, + { + "epoch": 0.4250927853715941, + "grad_norm": 0.8510686755180359, + "learning_rate": 0.00011120996196119675, + "loss": 2.8536, + "step": 4696 + }, + { + "epoch": 0.42518330768534446, + "grad_norm": 0.9181560277938843, + "learning_rate": 0.00011117842815374835, + "loss": 2.8783, + "step": 4697 + }, + { + "epoch": 0.4252738299990948, + "grad_norm": 0.8466564416885376, + "learning_rate": 0.00011114689322063255, + "loss": 2.8045, + "step": 4698 + }, + { + "epoch": 0.42536435231284514, + "grad_norm": 0.9046276211738586, + "learning_rate": 0.00011111535716502493, + "loss": 2.8196, + "step": 4699 + }, + { + "epoch": 0.4254548746265955, + "grad_norm": 1.0147709846496582, + "learning_rate": 0.00011108381999010111, + "loss": 2.8999, + "step": 4700 + }, + { + "epoch": 0.4255453969403458, + "grad_norm": 0.8731804490089417, + "learning_rate": 0.00011105228169903695, + "loss": 2.7696, + "step": 4701 + }, + { + "epoch": 0.42563591925409616, + "grad_norm": 0.8598802089691162, + "learning_rate": 0.0001110207422950083, + "loss": 2.8292, + "step": 4702 + }, + { + "epoch": 0.42572644156784645, + "grad_norm": 0.8652425408363342, + "learning_rate": 0.0001109892017811912, + "loss": 2.7647, + "step": 4703 + }, + { + "epoch": 0.4258169638815968, + "grad_norm": 0.8402916193008423, + "learning_rate": 0.0001109576601607618, + "loss": 2.8358, + "step": 4704 + }, + { + "epoch": 0.42590748619534713, + "grad_norm": 0.843143105506897, + "learning_rate": 0.00011092611743689631, + "loss": 2.7721, + "step": 4705 + }, + { + "epoch": 0.42599800850909747, + "grad_norm": 0.8784768581390381, + "learning_rate": 0.00011089457361277113, + "loss": 2.8367, + "step": 4706 + }, + { + "epoch": 0.4260885308228478, + "grad_norm": 0.9330942034721375, + "learning_rate": 0.00011086302869156265, + "loss": 2.8551, + "step": 4707 + }, + { + "epoch": 0.42617905313659815, + "grad_norm": 0.8752138614654541, + "learning_rate": 0.00011083148267644747, + "loss": 2.876, + "step": 4708 + }, + { + "epoch": 0.4262695754503485, + "grad_norm": 0.8569071292877197, + "learning_rate": 0.0001107999355706023, + "loss": 2.7731, + "step": 4709 + }, + { + "epoch": 0.42636009776409883, + "grad_norm": 0.9110112190246582, + "learning_rate": 0.00011076838737720392, + "loss": 2.8479, + "step": 4710 + }, + { + "epoch": 0.4264506200778492, + "grad_norm": 0.8108072280883789, + "learning_rate": 0.0001107368380994292, + "loss": 2.8059, + "step": 4711 + }, + { + "epoch": 0.4265411423915995, + "grad_norm": 0.8465254902839661, + "learning_rate": 0.00011070528774045519, + "loss": 2.86, + "step": 4712 + }, + { + "epoch": 0.42663166470534986, + "grad_norm": 0.8941924571990967, + "learning_rate": 0.00011067373630345899, + "loss": 2.8472, + "step": 4713 + }, + { + "epoch": 0.4267221870191002, + "grad_norm": 0.8601610064506531, + "learning_rate": 0.00011064218379161786, + "loss": 2.8781, + "step": 4714 + }, + { + "epoch": 0.42681270933285054, + "grad_norm": 0.9912343621253967, + "learning_rate": 0.00011061063020810909, + "loss": 2.8762, + "step": 4715 + }, + { + "epoch": 0.4269032316466009, + "grad_norm": 0.81369948387146, + "learning_rate": 0.00011057907555611016, + "loss": 2.8324, + "step": 4716 + }, + { + "epoch": 0.4269937539603512, + "grad_norm": 0.8867669701576233, + "learning_rate": 0.00011054751983879859, + "loss": 2.8066, + "step": 4717 + }, + { + "epoch": 0.42708427627410156, + "grad_norm": 0.8661606907844543, + "learning_rate": 0.0001105159630593521, + "loss": 2.8492, + "step": 4718 + }, + { + "epoch": 0.4271747985878519, + "grad_norm": 0.9250156283378601, + "learning_rate": 0.00011048440522094838, + "loss": 2.8745, + "step": 4719 + }, + { + "epoch": 0.42726532090160224, + "grad_norm": 0.9255236983299255, + "learning_rate": 0.00011045284632676536, + "loss": 2.855, + "step": 4720 + }, + { + "epoch": 0.4273558432153526, + "grad_norm": 0.9331503510475159, + "learning_rate": 0.00011042128637998099, + "loss": 2.8058, + "step": 4721 + }, + { + "epoch": 0.4274463655291029, + "grad_norm": 0.8795275092124939, + "learning_rate": 0.0001103897253837734, + "loss": 2.814, + "step": 4722 + }, + { + "epoch": 0.42753688784285326, + "grad_norm": 0.9849618077278137, + "learning_rate": 0.0001103581633413207, + "loss": 2.8216, + "step": 4723 + }, + { + "epoch": 0.4276274101566036, + "grad_norm": 0.8891952037811279, + "learning_rate": 0.00011032660025580125, + "loss": 2.8081, + "step": 4724 + }, + { + "epoch": 0.42771793247035395, + "grad_norm": 0.9136685729026794, + "learning_rate": 0.00011029503613039346, + "loss": 2.9267, + "step": 4725 + }, + { + "epoch": 0.4278084547841043, + "grad_norm": 0.8363056778907776, + "learning_rate": 0.00011026347096827578, + "loss": 2.8177, + "step": 4726 + }, + { + "epoch": 0.4278989770978546, + "grad_norm": 0.9110442399978638, + "learning_rate": 0.00011023190477262688, + "loss": 2.7913, + "step": 4727 + }, + { + "epoch": 0.42798949941160497, + "grad_norm": 0.7873501777648926, + "learning_rate": 0.00011020033754662543, + "loss": 2.7982, + "step": 4728 + }, + { + "epoch": 0.4280800217253553, + "grad_norm": 0.8280324339866638, + "learning_rate": 0.0001101687692934503, + "loss": 2.8528, + "step": 4729 + }, + { + "epoch": 0.42817054403910565, + "grad_norm": 0.9280153512954712, + "learning_rate": 0.00011013720001628035, + "loss": 2.881, + "step": 4730 + }, + { + "epoch": 0.428261066352856, + "grad_norm": 0.8496368527412415, + "learning_rate": 0.00011010562971829463, + "loss": 2.8228, + "step": 4731 + }, + { + "epoch": 0.42835158866660633, + "grad_norm": 0.8465971350669861, + "learning_rate": 0.00011007405840267228, + "loss": 2.8523, + "step": 4732 + }, + { + "epoch": 0.42844211098035667, + "grad_norm": 1.1676387786865234, + "learning_rate": 0.00011004248607259255, + "loss": 2.8543, + "step": 4733 + }, + { + "epoch": 0.428532633294107, + "grad_norm": 0.855336606502533, + "learning_rate": 0.00011001091273123473, + "loss": 2.8102, + "step": 4734 + }, + { + "epoch": 0.42862315560785735, + "grad_norm": 0.8129889965057373, + "learning_rate": 0.00010997933838177827, + "loss": 2.8427, + "step": 4735 + }, + { + "epoch": 0.4287136779216077, + "grad_norm": 0.8779901266098022, + "learning_rate": 0.0001099477630274027, + "loss": 2.8498, + "step": 4736 + }, + { + "epoch": 0.42880420023535804, + "grad_norm": 0.8442177176475525, + "learning_rate": 0.00010991618667128769, + "loss": 2.8006, + "step": 4737 + }, + { + "epoch": 0.4288947225491084, + "grad_norm": 0.8123629093170166, + "learning_rate": 0.00010988460931661295, + "loss": 2.7864, + "step": 4738 + }, + { + "epoch": 0.4289852448628587, + "grad_norm": 0.8855149149894714, + "learning_rate": 0.0001098530309665583, + "loss": 2.8527, + "step": 4739 + }, + { + "epoch": 0.42907576717660906, + "grad_norm": 0.9084986448287964, + "learning_rate": 0.00010982145162430373, + "loss": 2.8764, + "step": 4740 + }, + { + "epoch": 0.4291662894903594, + "grad_norm": 0.8852027654647827, + "learning_rate": 0.00010978987129302922, + "loss": 2.8407, + "step": 4741 + }, + { + "epoch": 0.42925681180410974, + "grad_norm": 0.8793681263923645, + "learning_rate": 0.00010975828997591495, + "loss": 2.8483, + "step": 4742 + }, + { + "epoch": 0.4293473341178601, + "grad_norm": 0.9163318276405334, + "learning_rate": 0.00010972670767614114, + "loss": 2.8332, + "step": 4743 + }, + { + "epoch": 0.42943785643161037, + "grad_norm": 0.8697690963745117, + "learning_rate": 0.00010969512439688816, + "loss": 2.8413, + "step": 4744 + }, + { + "epoch": 0.4295283787453607, + "grad_norm": 0.8311997056007385, + "learning_rate": 0.0001096635401413364, + "loss": 2.7779, + "step": 4745 + }, + { + "epoch": 0.42961890105911105, + "grad_norm": 0.891202449798584, + "learning_rate": 0.0001096319549126664, + "loss": 2.8442, + "step": 4746 + }, + { + "epoch": 0.4297094233728614, + "grad_norm": 0.9648249745368958, + "learning_rate": 0.0001096003687140588, + "loss": 2.8438, + "step": 4747 + }, + { + "epoch": 0.42979994568661173, + "grad_norm": 0.8829810619354248, + "learning_rate": 0.00010956878154869437, + "loss": 2.8257, + "step": 4748 + }, + { + "epoch": 0.42989046800036207, + "grad_norm": 0.9004414677619934, + "learning_rate": 0.00010953719341975388, + "loss": 2.7777, + "step": 4749 + }, + { + "epoch": 0.4299809903141124, + "grad_norm": 0.892963171005249, + "learning_rate": 0.00010950560433041826, + "loss": 2.8466, + "step": 4750 + }, + { + "epoch": 0.43007151262786275, + "grad_norm": 1.0104292631149292, + "learning_rate": 0.00010947401428386856, + "loss": 2.8534, + "step": 4751 + }, + { + "epoch": 0.4301620349416131, + "grad_norm": 0.90214604139328, + "learning_rate": 0.0001094424232832859, + "loss": 2.8257, + "step": 4752 + }, + { + "epoch": 0.43025255725536343, + "grad_norm": 0.9649103283882141, + "learning_rate": 0.00010941083133185146, + "loss": 2.867, + "step": 4753 + }, + { + "epoch": 0.4303430795691138, + "grad_norm": 0.9712350964546204, + "learning_rate": 0.00010937923843274654, + "loss": 2.8453, + "step": 4754 + }, + { + "epoch": 0.4304336018828641, + "grad_norm": 1.0164681673049927, + "learning_rate": 0.00010934764458915258, + "loss": 2.8726, + "step": 4755 + }, + { + "epoch": 0.43052412419661445, + "grad_norm": 0.9008257389068604, + "learning_rate": 0.00010931604980425108, + "loss": 2.7884, + "step": 4756 + }, + { + "epoch": 0.4306146465103648, + "grad_norm": 0.9697628021240234, + "learning_rate": 0.00010928445408122361, + "loss": 2.7969, + "step": 4757 + }, + { + "epoch": 0.43070516882411514, + "grad_norm": 1.0028095245361328, + "learning_rate": 0.00010925285742325184, + "loss": 2.8293, + "step": 4758 + }, + { + "epoch": 0.4307956911378655, + "grad_norm": 0.8799826502799988, + "learning_rate": 0.0001092212598335176, + "loss": 2.8643, + "step": 4759 + }, + { + "epoch": 0.4308862134516158, + "grad_norm": 0.8830612897872925, + "learning_rate": 0.00010918966131520277, + "loss": 2.7758, + "step": 4760 + }, + { + "epoch": 0.43097673576536616, + "grad_norm": 0.9633815288543701, + "learning_rate": 0.00010915806187148927, + "loss": 2.8098, + "step": 4761 + }, + { + "epoch": 0.4310672580791165, + "grad_norm": 0.9045613408088684, + "learning_rate": 0.00010912646150555919, + "loss": 2.8306, + "step": 4762 + }, + { + "epoch": 0.43115778039286684, + "grad_norm": 0.7916074395179749, + "learning_rate": 0.0001090948602205947, + "loss": 2.8245, + "step": 4763 + }, + { + "epoch": 0.4312483027066172, + "grad_norm": 0.8476104736328125, + "learning_rate": 0.00010906325801977804, + "loss": 2.7464, + "step": 4764 + }, + { + "epoch": 0.4313388250203675, + "grad_norm": 0.9538005590438843, + "learning_rate": 0.00010903165490629153, + "loss": 2.8615, + "step": 4765 + }, + { + "epoch": 0.43142934733411786, + "grad_norm": 0.8170968294143677, + "learning_rate": 0.00010900005088331761, + "loss": 2.7865, + "step": 4766 + }, + { + "epoch": 0.4315198696478682, + "grad_norm": 0.8634709715843201, + "learning_rate": 0.00010896844595403881, + "loss": 2.8317, + "step": 4767 + }, + { + "epoch": 0.43161039196161854, + "grad_norm": 0.9003289341926575, + "learning_rate": 0.00010893684012163779, + "loss": 2.7993, + "step": 4768 + }, + { + "epoch": 0.4317009142753689, + "grad_norm": 0.8575468063354492, + "learning_rate": 0.00010890523338929719, + "loss": 2.7796, + "step": 4769 + }, + { + "epoch": 0.4317914365891192, + "grad_norm": 0.7894264459609985, + "learning_rate": 0.00010887362576019981, + "loss": 2.7963, + "step": 4770 + }, + { + "epoch": 0.43188195890286957, + "grad_norm": 0.9664896130561829, + "learning_rate": 0.00010884201723752861, + "loss": 2.7932, + "step": 4771 + }, + { + "epoch": 0.4319724812166199, + "grad_norm": 0.9773446917533875, + "learning_rate": 0.00010881040782446645, + "loss": 2.8373, + "step": 4772 + }, + { + "epoch": 0.43206300353037025, + "grad_norm": 1.0005125999450684, + "learning_rate": 0.00010877879752419651, + "loss": 2.8478, + "step": 4773 + }, + { + "epoch": 0.4321535258441206, + "grad_norm": 0.886049747467041, + "learning_rate": 0.00010874718633990189, + "loss": 2.7862, + "step": 4774 + }, + { + "epoch": 0.43224404815787093, + "grad_norm": 0.9218837022781372, + "learning_rate": 0.00010871557427476583, + "loss": 2.8233, + "step": 4775 + }, + { + "epoch": 0.43233457047162127, + "grad_norm": 0.8934755921363831, + "learning_rate": 0.0001086839613319717, + "loss": 2.8197, + "step": 4776 + }, + { + "epoch": 0.4324250927853716, + "grad_norm": 0.9310414791107178, + "learning_rate": 0.00010865234751470288, + "loss": 2.8478, + "step": 4777 + }, + { + "epoch": 0.43251561509912195, + "grad_norm": 0.8048056960105896, + "learning_rate": 0.00010862073282614291, + "loss": 2.8203, + "step": 4778 + }, + { + "epoch": 0.4326061374128723, + "grad_norm": 1.0201817750930786, + "learning_rate": 0.00010858911726947541, + "loss": 2.852, + "step": 4779 + }, + { + "epoch": 0.43269665972662263, + "grad_norm": 0.8759087920188904, + "learning_rate": 0.00010855750084788398, + "loss": 2.9003, + "step": 4780 + }, + { + "epoch": 0.432787182040373, + "grad_norm": 0.83902508020401, + "learning_rate": 0.00010852588356455249, + "loss": 2.7608, + "step": 4781 + }, + { + "epoch": 0.4328777043541233, + "grad_norm": 1.0640679597854614, + "learning_rate": 0.00010849426542266474, + "loss": 2.8459, + "step": 4782 + }, + { + "epoch": 0.43296822666787366, + "grad_norm": 0.893929660320282, + "learning_rate": 0.0001084626464254047, + "loss": 2.8119, + "step": 4783 + }, + { + "epoch": 0.433058748981624, + "grad_norm": 0.8554607629776001, + "learning_rate": 0.00010843102657595641, + "loss": 2.8471, + "step": 4784 + }, + { + "epoch": 0.4331492712953743, + "grad_norm": 0.989777684211731, + "learning_rate": 0.00010839940587750395, + "loss": 2.8205, + "step": 4785 + }, + { + "epoch": 0.4332397936091246, + "grad_norm": 0.926990807056427, + "learning_rate": 0.00010836778433323158, + "loss": 2.8905, + "step": 4786 + }, + { + "epoch": 0.43333031592287496, + "grad_norm": 0.9434003829956055, + "learning_rate": 0.00010833616194632353, + "loss": 2.8383, + "step": 4787 + }, + { + "epoch": 0.4334208382366253, + "grad_norm": 0.9875321388244629, + "learning_rate": 0.0001083045387199642, + "loss": 2.7727, + "step": 4788 + }, + { + "epoch": 0.43351136055037565, + "grad_norm": 0.8352621793746948, + "learning_rate": 0.00010827291465733804, + "loss": 2.812, + "step": 4789 + }, + { + "epoch": 0.433601882864126, + "grad_norm": 0.959829568862915, + "learning_rate": 0.00010824128976162964, + "loss": 2.899, + "step": 4790 + }, + { + "epoch": 0.43369240517787633, + "grad_norm": 0.8951857686042786, + "learning_rate": 0.00010820966403602355, + "loss": 2.817, + "step": 4791 + }, + { + "epoch": 0.43378292749162667, + "grad_norm": 0.8995950222015381, + "learning_rate": 0.00010817803748370452, + "loss": 2.8097, + "step": 4792 + }, + { + "epoch": 0.433873449805377, + "grad_norm": 0.9037355780601501, + "learning_rate": 0.00010814641010785733, + "loss": 2.8683, + "step": 4793 + }, + { + "epoch": 0.43396397211912735, + "grad_norm": 0.9580223560333252, + "learning_rate": 0.00010811478191166691, + "loss": 2.7849, + "step": 4794 + }, + { + "epoch": 0.4340544944328777, + "grad_norm": 0.8089432716369629, + "learning_rate": 0.00010808315289831814, + "loss": 2.8267, + "step": 4795 + }, + { + "epoch": 0.43414501674662803, + "grad_norm": 0.8201754689216614, + "learning_rate": 0.00010805152307099609, + "loss": 2.8147, + "step": 4796 + }, + { + "epoch": 0.4342355390603784, + "grad_norm": 0.9184460043907166, + "learning_rate": 0.00010801989243288589, + "loss": 2.7641, + "step": 4797 + }, + { + "epoch": 0.4343260613741287, + "grad_norm": 0.9155360460281372, + "learning_rate": 0.00010798826098717276, + "loss": 2.8113, + "step": 4798 + }, + { + "epoch": 0.43441658368787905, + "grad_norm": 0.834479808807373, + "learning_rate": 0.00010795662873704196, + "loss": 2.8262, + "step": 4799 + }, + { + "epoch": 0.4345071060016294, + "grad_norm": 0.8895245790481567, + "learning_rate": 0.00010792499568567884, + "loss": 2.8121, + "step": 4800 + }, + { + "epoch": 0.43459762831537974, + "grad_norm": 0.9145119190216064, + "learning_rate": 0.00010789336183626892, + "loss": 2.8212, + "step": 4801 + }, + { + "epoch": 0.4346881506291301, + "grad_norm": 0.9751086235046387, + "learning_rate": 0.00010786172719199763, + "loss": 2.767, + "step": 4802 + }, + { + "epoch": 0.4347786729428804, + "grad_norm": 0.8262462615966797, + "learning_rate": 0.00010783009175605065, + "loss": 2.7572, + "step": 4803 + }, + { + "epoch": 0.43486919525663076, + "grad_norm": 1.0077941417694092, + "learning_rate": 0.00010779845553161362, + "loss": 2.8179, + "step": 4804 + }, + { + "epoch": 0.4349597175703811, + "grad_norm": 0.9216823577880859, + "learning_rate": 0.00010776681852187238, + "loss": 2.7986, + "step": 4805 + }, + { + "epoch": 0.43505023988413144, + "grad_norm": 0.8575187921524048, + "learning_rate": 0.0001077351807300127, + "loss": 2.8272, + "step": 4806 + }, + { + "epoch": 0.4351407621978818, + "grad_norm": 1.0988225936889648, + "learning_rate": 0.00010770354215922052, + "loss": 2.9174, + "step": 4807 + }, + { + "epoch": 0.4352312845116321, + "grad_norm": 0.8845909833908081, + "learning_rate": 0.00010767190281268187, + "loss": 2.8137, + "step": 4808 + }, + { + "epoch": 0.43532180682538246, + "grad_norm": 0.9869620203971863, + "learning_rate": 0.00010764026269358282, + "loss": 2.8202, + "step": 4809 + }, + { + "epoch": 0.4354123291391328, + "grad_norm": 0.9068284034729004, + "learning_rate": 0.00010760862180510951, + "loss": 2.8021, + "step": 4810 + }, + { + "epoch": 0.43550285145288314, + "grad_norm": 1.0142253637313843, + "learning_rate": 0.00010757698015044822, + "loss": 2.9227, + "step": 4811 + }, + { + "epoch": 0.4355933737666335, + "grad_norm": 0.8755457997322083, + "learning_rate": 0.0001075453377327852, + "loss": 2.8365, + "step": 4812 + }, + { + "epoch": 0.4356838960803838, + "grad_norm": 1.2936592102050781, + "learning_rate": 0.0001075136945553069, + "loss": 2.9145, + "step": 4813 + }, + { + "epoch": 0.43577441839413417, + "grad_norm": 0.9290542006492615, + "learning_rate": 0.00010748205062119976, + "loss": 2.8088, + "step": 4814 + }, + { + "epoch": 0.4358649407078845, + "grad_norm": 0.8115638494491577, + "learning_rate": 0.00010745040593365032, + "loss": 2.7689, + "step": 4815 + }, + { + "epoch": 0.43595546302163485, + "grad_norm": 0.93105548620224, + "learning_rate": 0.00010741876049584523, + "loss": 2.8144, + "step": 4816 + }, + { + "epoch": 0.4360459853353852, + "grad_norm": 0.9035531282424927, + "learning_rate": 0.00010738711431097112, + "loss": 2.825, + "step": 4817 + }, + { + "epoch": 0.43613650764913553, + "grad_norm": 0.8488040566444397, + "learning_rate": 0.00010735546738221483, + "loss": 2.8077, + "step": 4818 + }, + { + "epoch": 0.43622702996288587, + "grad_norm": 0.9691907167434692, + "learning_rate": 0.00010732381971276318, + "loss": 2.8493, + "step": 4819 + }, + { + "epoch": 0.4363175522766362, + "grad_norm": 0.7879188060760498, + "learning_rate": 0.0001072921713058031, + "loss": 2.8162, + "step": 4820 + }, + { + "epoch": 0.43640807459038655, + "grad_norm": 1.0249758958816528, + "learning_rate": 0.00010726052216452151, + "loss": 2.8505, + "step": 4821 + }, + { + "epoch": 0.4364985969041369, + "grad_norm": 0.9572840332984924, + "learning_rate": 0.00010722887229210557, + "loss": 2.7912, + "step": 4822 + }, + { + "epoch": 0.43658911921788723, + "grad_norm": 0.8638061285018921, + "learning_rate": 0.00010719722169174238, + "loss": 2.7941, + "step": 4823 + }, + { + "epoch": 0.4366796415316376, + "grad_norm": 0.8792514801025391, + "learning_rate": 0.00010716557036661918, + "loss": 2.79, + "step": 4824 + }, + { + "epoch": 0.4367701638453879, + "grad_norm": 0.8902309536933899, + "learning_rate": 0.00010713391831992323, + "loss": 2.8306, + "step": 4825 + }, + { + "epoch": 0.4368606861591382, + "grad_norm": 1.0273114442825317, + "learning_rate": 0.0001071022655548419, + "loss": 2.8273, + "step": 4826 + }, + { + "epoch": 0.43695120847288854, + "grad_norm": 0.8674315214157104, + "learning_rate": 0.00010707061207456264, + "loss": 2.8312, + "step": 4827 + }, + { + "epoch": 0.4370417307866389, + "grad_norm": 0.846809446811676, + "learning_rate": 0.00010703895788227292, + "loss": 2.829, + "step": 4828 + }, + { + "epoch": 0.4371322531003892, + "grad_norm": 0.8628326654434204, + "learning_rate": 0.00010700730298116033, + "loss": 2.7867, + "step": 4829 + }, + { + "epoch": 0.43722277541413956, + "grad_norm": 0.8731932640075684, + "learning_rate": 0.00010697564737441252, + "loss": 2.7932, + "step": 4830 + }, + { + "epoch": 0.4373132977278899, + "grad_norm": 0.8147916197776794, + "learning_rate": 0.00010694399106521722, + "loss": 2.8206, + "step": 4831 + }, + { + "epoch": 0.43740382004164025, + "grad_norm": 0.861187219619751, + "learning_rate": 0.00010691233405676221, + "loss": 2.8254, + "step": 4832 + }, + { + "epoch": 0.4374943423553906, + "grad_norm": 0.8892345428466797, + "learning_rate": 0.00010688067635223536, + "loss": 2.819, + "step": 4833 + }, + { + "epoch": 0.4375848646691409, + "grad_norm": 0.8232401609420776, + "learning_rate": 0.00010684901795482456, + "loss": 2.8206, + "step": 4834 + }, + { + "epoch": 0.43767538698289127, + "grad_norm": 0.899243950843811, + "learning_rate": 0.00010681735886771788, + "loss": 2.8571, + "step": 4835 + }, + { + "epoch": 0.4377659092966416, + "grad_norm": 0.8655483722686768, + "learning_rate": 0.00010678569909410331, + "loss": 2.779, + "step": 4836 + }, + { + "epoch": 0.43785643161039195, + "grad_norm": 0.8032087683677673, + "learning_rate": 0.00010675403863716907, + "loss": 2.7326, + "step": 4837 + }, + { + "epoch": 0.4379469539241423, + "grad_norm": 0.8202515840530396, + "learning_rate": 0.00010672237750010329, + "loss": 2.7479, + "step": 4838 + }, + { + "epoch": 0.43803747623789263, + "grad_norm": 0.8518462181091309, + "learning_rate": 0.00010669071568609427, + "loss": 2.8267, + "step": 4839 + }, + { + "epoch": 0.43812799855164297, + "grad_norm": 0.8580194711685181, + "learning_rate": 0.00010665905319833041, + "loss": 2.7765, + "step": 4840 + }, + { + "epoch": 0.4382185208653933, + "grad_norm": 0.8346232771873474, + "learning_rate": 0.00010662739004000005, + "loss": 2.8643, + "step": 4841 + }, + { + "epoch": 0.43830904317914365, + "grad_norm": 0.8472459316253662, + "learning_rate": 0.00010659572621429166, + "loss": 2.7977, + "step": 4842 + }, + { + "epoch": 0.438399565492894, + "grad_norm": 0.8689875602722168, + "learning_rate": 0.00010656406172439386, + "loss": 2.7833, + "step": 4843 + }, + { + "epoch": 0.43849008780664434, + "grad_norm": 0.8687725067138672, + "learning_rate": 0.00010653239657349524, + "loss": 2.8504, + "step": 4844 + }, + { + "epoch": 0.4385806101203947, + "grad_norm": 0.8304334282875061, + "learning_rate": 0.00010650073076478442, + "loss": 2.7971, + "step": 4845 + }, + { + "epoch": 0.438671132434145, + "grad_norm": 0.8737621307373047, + "learning_rate": 0.00010646906430145018, + "loss": 2.8455, + "step": 4846 + }, + { + "epoch": 0.43876165474789536, + "grad_norm": 0.7990383505821228, + "learning_rate": 0.0001064373971866814, + "loss": 2.7958, + "step": 4847 + }, + { + "epoch": 0.4388521770616457, + "grad_norm": 0.8296318650245667, + "learning_rate": 0.00010640572942366686, + "loss": 2.8039, + "step": 4848 + }, + { + "epoch": 0.43894269937539604, + "grad_norm": 0.8771649599075317, + "learning_rate": 0.00010637406101559555, + "loss": 2.8536, + "step": 4849 + }, + { + "epoch": 0.4390332216891464, + "grad_norm": 0.885762095451355, + "learning_rate": 0.00010634239196565646, + "loss": 2.823, + "step": 4850 + }, + { + "epoch": 0.4391237440028967, + "grad_norm": 0.7665767073631287, + "learning_rate": 0.0001063107222770387, + "loss": 2.7319, + "step": 4851 + }, + { + "epoch": 0.43921426631664706, + "grad_norm": 0.8268359303474426, + "learning_rate": 0.00010627905195293135, + "loss": 2.8446, + "step": 4852 + }, + { + "epoch": 0.4393047886303974, + "grad_norm": 0.9912901520729065, + "learning_rate": 0.00010624738099652365, + "loss": 2.857, + "step": 4853 + }, + { + "epoch": 0.43939531094414774, + "grad_norm": 0.8394913673400879, + "learning_rate": 0.00010621570941100485, + "loss": 2.7842, + "step": 4854 + }, + { + "epoch": 0.4394858332578981, + "grad_norm": 0.9693658351898193, + "learning_rate": 0.00010618403719956431, + "loss": 2.8659, + "step": 4855 + }, + { + "epoch": 0.4395763555716484, + "grad_norm": 0.8589224815368652, + "learning_rate": 0.00010615236436539137, + "loss": 2.8498, + "step": 4856 + }, + { + "epoch": 0.43966687788539877, + "grad_norm": 0.8704410195350647, + "learning_rate": 0.00010612069091167551, + "loss": 2.8244, + "step": 4857 + }, + { + "epoch": 0.4397574001991491, + "grad_norm": 0.9131534099578857, + "learning_rate": 0.00010608901684160624, + "loss": 2.7766, + "step": 4858 + }, + { + "epoch": 0.43984792251289945, + "grad_norm": 0.9123629927635193, + "learning_rate": 0.00010605734215837316, + "loss": 2.756, + "step": 4859 + }, + { + "epoch": 0.4399384448266498, + "grad_norm": 0.8991694450378418, + "learning_rate": 0.00010602566686516586, + "loss": 2.8209, + "step": 4860 + }, + { + "epoch": 0.44002896714040013, + "grad_norm": 1.0187561511993408, + "learning_rate": 0.00010599399096517408, + "loss": 2.8028, + "step": 4861 + }, + { + "epoch": 0.44011948945415047, + "grad_norm": 0.87259441614151, + "learning_rate": 0.0001059623144615876, + "loss": 2.9245, + "step": 4862 + }, + { + "epoch": 0.4402100117679008, + "grad_norm": 0.8925728797912598, + "learning_rate": 0.00010593063735759618, + "loss": 2.7712, + "step": 4863 + }, + { + "epoch": 0.44030053408165115, + "grad_norm": 1.0046817064285278, + "learning_rate": 0.00010589895965638976, + "loss": 2.807, + "step": 4864 + }, + { + "epoch": 0.4403910563954015, + "grad_norm": 0.8938895463943481, + "learning_rate": 0.00010586728136115824, + "loss": 2.7988, + "step": 4865 + }, + { + "epoch": 0.44048157870915183, + "grad_norm": 0.9129030108451843, + "learning_rate": 0.0001058356024750917, + "loss": 2.8089, + "step": 4866 + }, + { + "epoch": 0.4405721010229021, + "grad_norm": 1.0655113458633423, + "learning_rate": 0.00010580392300138011, + "loss": 2.8057, + "step": 4867 + }, + { + "epoch": 0.44066262333665246, + "grad_norm": 0.9552054405212402, + "learning_rate": 0.00010577224294321362, + "loss": 2.7412, + "step": 4868 + }, + { + "epoch": 0.4407531456504028, + "grad_norm": 0.8570512533187866, + "learning_rate": 0.00010574056230378245, + "loss": 2.7769, + "step": 4869 + }, + { + "epoch": 0.44084366796415314, + "grad_norm": 1.06633722782135, + "learning_rate": 0.00010570888108627681, + "loss": 2.8406, + "step": 4870 + }, + { + "epoch": 0.4409341902779035, + "grad_norm": 0.9155492782592773, + "learning_rate": 0.000105677199293887, + "loss": 2.8326, + "step": 4871 + }, + { + "epoch": 0.4410247125916538, + "grad_norm": 0.8963524699211121, + "learning_rate": 0.00010564551692980337, + "loss": 2.8356, + "step": 4872 + }, + { + "epoch": 0.44111523490540416, + "grad_norm": 0.9572543501853943, + "learning_rate": 0.00010561383399721633, + "loss": 2.787, + "step": 4873 + }, + { + "epoch": 0.4412057572191545, + "grad_norm": 0.8590235114097595, + "learning_rate": 0.00010558215049931638, + "loss": 2.7589, + "step": 4874 + }, + { + "epoch": 0.44129627953290484, + "grad_norm": 0.8373232483863831, + "learning_rate": 0.00010555046643929403, + "loss": 2.766, + "step": 4875 + }, + { + "epoch": 0.4413868018466552, + "grad_norm": 0.82489013671875, + "learning_rate": 0.00010551878182033985, + "loss": 2.7575, + "step": 4876 + }, + { + "epoch": 0.4414773241604055, + "grad_norm": 0.9751967787742615, + "learning_rate": 0.00010548709664564449, + "loss": 2.8225, + "step": 4877 + }, + { + "epoch": 0.44156784647415587, + "grad_norm": 0.8528650999069214, + "learning_rate": 0.00010545541091839867, + "loss": 2.7976, + "step": 4878 + }, + { + "epoch": 0.4416583687879062, + "grad_norm": 0.9051427841186523, + "learning_rate": 0.00010542372464179309, + "loss": 2.8601, + "step": 4879 + }, + { + "epoch": 0.44174889110165655, + "grad_norm": 0.8459887504577637, + "learning_rate": 0.00010539203781901861, + "loss": 2.8074, + "step": 4880 + }, + { + "epoch": 0.4418394134154069, + "grad_norm": 0.7828327417373657, + "learning_rate": 0.00010536035045326606, + "loss": 2.8364, + "step": 4881 + }, + { + "epoch": 0.44192993572915723, + "grad_norm": 0.881540060043335, + "learning_rate": 0.00010532866254772638, + "loss": 2.8323, + "step": 4882 + }, + { + "epoch": 0.44202045804290757, + "grad_norm": 0.8367167711257935, + "learning_rate": 0.00010529697410559049, + "loss": 2.7518, + "step": 4883 + }, + { + "epoch": 0.4421109803566579, + "grad_norm": 0.7621573805809021, + "learning_rate": 0.00010526528513004947, + "loss": 2.795, + "step": 4884 + }, + { + "epoch": 0.44220150267040825, + "grad_norm": 0.8117220997810364, + "learning_rate": 0.0001052335956242944, + "loss": 2.8407, + "step": 4885 + }, + { + "epoch": 0.4422920249841586, + "grad_norm": 0.7706013321876526, + "learning_rate": 0.00010520190559151637, + "loss": 2.8473, + "step": 4886 + }, + { + "epoch": 0.44238254729790893, + "grad_norm": 0.8366645574569702, + "learning_rate": 0.0001051702150349066, + "loss": 2.822, + "step": 4887 + }, + { + "epoch": 0.4424730696116593, + "grad_norm": 0.8487375378608704, + "learning_rate": 0.00010513852395765631, + "loss": 2.8013, + "step": 4888 + }, + { + "epoch": 0.4425635919254096, + "grad_norm": 0.7707353234291077, + "learning_rate": 0.00010510683236295682, + "loss": 2.8227, + "step": 4889 + }, + { + "epoch": 0.44265411423915996, + "grad_norm": 0.850206196308136, + "learning_rate": 0.00010507514025399943, + "loss": 2.7895, + "step": 4890 + }, + { + "epoch": 0.4427446365529103, + "grad_norm": 0.9814115166664124, + "learning_rate": 0.00010504344763397556, + "loss": 2.8261, + "step": 4891 + }, + { + "epoch": 0.44283515886666064, + "grad_norm": 0.9729773998260498, + "learning_rate": 0.00010501175450607664, + "loss": 2.8185, + "step": 4892 + }, + { + "epoch": 0.442925681180411, + "grad_norm": 1.0523028373718262, + "learning_rate": 0.00010498006087349421, + "loss": 2.8141, + "step": 4893 + }, + { + "epoch": 0.4430162034941613, + "grad_norm": 0.9811918139457703, + "learning_rate": 0.00010494836673941977, + "loss": 2.8219, + "step": 4894 + }, + { + "epoch": 0.44310672580791166, + "grad_norm": 0.8823708891868591, + "learning_rate": 0.00010491667210704493, + "loss": 2.7874, + "step": 4895 + }, + { + "epoch": 0.443197248121662, + "grad_norm": 0.9686410427093506, + "learning_rate": 0.00010488497697956135, + "loss": 2.8533, + "step": 4896 + }, + { + "epoch": 0.44328777043541234, + "grad_norm": 0.9977758526802063, + "learning_rate": 0.00010485328136016071, + "loss": 2.8345, + "step": 4897 + }, + { + "epoch": 0.4433782927491627, + "grad_norm": 0.8284408450126648, + "learning_rate": 0.00010482158525203476, + "loss": 2.7972, + "step": 4898 + }, + { + "epoch": 0.443468815062913, + "grad_norm": 0.9240431785583496, + "learning_rate": 0.00010478988865837531, + "loss": 2.7735, + "step": 4899 + }, + { + "epoch": 0.44355933737666337, + "grad_norm": 0.9728517532348633, + "learning_rate": 0.00010475819158237425, + "loss": 2.8262, + "step": 4900 + }, + { + "epoch": 0.4436498596904137, + "grad_norm": 0.9056679010391235, + "learning_rate": 0.00010472649402722339, + "loss": 2.8424, + "step": 4901 + }, + { + "epoch": 0.44374038200416405, + "grad_norm": 0.9230318665504456, + "learning_rate": 0.0001046947959961147, + "loss": 2.8233, + "step": 4902 + }, + { + "epoch": 0.4438309043179144, + "grad_norm": 0.8907014727592468, + "learning_rate": 0.00010466309749224019, + "loss": 2.7827, + "step": 4903 + }, + { + "epoch": 0.44392142663166473, + "grad_norm": 0.8172028660774231, + "learning_rate": 0.00010463139851879191, + "loss": 2.8291, + "step": 4904 + }, + { + "epoch": 0.44401194894541507, + "grad_norm": 0.9084460139274597, + "learning_rate": 0.00010459969907896192, + "loss": 2.7821, + "step": 4905 + }, + { + "epoch": 0.4441024712591654, + "grad_norm": 0.8679286241531372, + "learning_rate": 0.00010456799917594233, + "loss": 2.8449, + "step": 4906 + }, + { + "epoch": 0.44419299357291575, + "grad_norm": 0.9083068370819092, + "learning_rate": 0.00010453629881292538, + "loss": 2.7829, + "step": 4907 + }, + { + "epoch": 0.44428351588666604, + "grad_norm": 0.877331018447876, + "learning_rate": 0.00010450459799310326, + "loss": 2.7828, + "step": 4908 + }, + { + "epoch": 0.4443740382004164, + "grad_norm": 0.9195173978805542, + "learning_rate": 0.00010447289671966824, + "loss": 2.7903, + "step": 4909 + }, + { + "epoch": 0.4444645605141667, + "grad_norm": 0.8775318264961243, + "learning_rate": 0.00010444119499581261, + "loss": 2.8409, + "step": 4910 + }, + { + "epoch": 0.44455508282791706, + "grad_norm": 0.9003087282180786, + "learning_rate": 0.00010440949282472883, + "loss": 2.8174, + "step": 4911 + }, + { + "epoch": 0.4446456051416674, + "grad_norm": 0.8533322215080261, + "learning_rate": 0.00010437779020960921, + "loss": 2.7998, + "step": 4912 + }, + { + "epoch": 0.44473612745541774, + "grad_norm": 0.8779058456420898, + "learning_rate": 0.00010434608715364622, + "loss": 2.856, + "step": 4913 + }, + { + "epoch": 0.4448266497691681, + "grad_norm": 0.8800071477890015, + "learning_rate": 0.00010431438366003239, + "loss": 2.8337, + "step": 4914 + }, + { + "epoch": 0.4449171720829184, + "grad_norm": 0.875268816947937, + "learning_rate": 0.00010428267973196027, + "loss": 2.8456, + "step": 4915 + }, + { + "epoch": 0.44500769439666876, + "grad_norm": 0.9144801497459412, + "learning_rate": 0.00010425097537262236, + "loss": 2.8311, + "step": 4916 + }, + { + "epoch": 0.4450982167104191, + "grad_norm": 0.9461082220077515, + "learning_rate": 0.00010421927058521137, + "loss": 2.8462, + "step": 4917 + }, + { + "epoch": 0.44518873902416944, + "grad_norm": 0.9008672833442688, + "learning_rate": 0.00010418756537291996, + "loss": 2.7521, + "step": 4918 + }, + { + "epoch": 0.4452792613379198, + "grad_norm": 0.8437387943267822, + "learning_rate": 0.00010415585973894084, + "loss": 2.7871, + "step": 4919 + }, + { + "epoch": 0.4453697836516701, + "grad_norm": 0.8589797616004944, + "learning_rate": 0.00010412415368646673, + "loss": 2.7782, + "step": 4920 + }, + { + "epoch": 0.44546030596542047, + "grad_norm": 0.7951464653015137, + "learning_rate": 0.00010409244721869047, + "loss": 2.7879, + "step": 4921 + }, + { + "epoch": 0.4455508282791708, + "grad_norm": 0.8469287753105164, + "learning_rate": 0.00010406074033880491, + "loss": 2.7961, + "step": 4922 + }, + { + "epoch": 0.44564135059292115, + "grad_norm": 0.83460533618927, + "learning_rate": 0.00010402903305000287, + "loss": 2.8391, + "step": 4923 + }, + { + "epoch": 0.4457318729066715, + "grad_norm": 0.9511914253234863, + "learning_rate": 0.00010399732535547734, + "loss": 2.8332, + "step": 4924 + }, + { + "epoch": 0.44582239522042183, + "grad_norm": 0.8237845301628113, + "learning_rate": 0.00010396561725842124, + "loss": 2.7943, + "step": 4925 + }, + { + "epoch": 0.44591291753417217, + "grad_norm": 0.850771963596344, + "learning_rate": 0.0001039339087620276, + "loss": 2.8397, + "step": 4926 + }, + { + "epoch": 0.4460034398479225, + "grad_norm": 0.7968292236328125, + "learning_rate": 0.00010390219986948946, + "loss": 2.8291, + "step": 4927 + }, + { + "epoch": 0.44609396216167285, + "grad_norm": 0.9098146557807922, + "learning_rate": 0.00010387049058399989, + "loss": 2.7927, + "step": 4928 + }, + { + "epoch": 0.4461844844754232, + "grad_norm": 0.878772497177124, + "learning_rate": 0.00010383878090875201, + "loss": 2.8495, + "step": 4929 + }, + { + "epoch": 0.44627500678917353, + "grad_norm": 0.8625677227973938, + "learning_rate": 0.00010380707084693901, + "loss": 2.8378, + "step": 4930 + }, + { + "epoch": 0.4463655291029239, + "grad_norm": 0.8826537132263184, + "learning_rate": 0.00010377536040175408, + "loss": 2.8177, + "step": 4931 + }, + { + "epoch": 0.4464560514166742, + "grad_norm": 0.865592360496521, + "learning_rate": 0.00010374364957639047, + "loss": 2.8599, + "step": 4932 + }, + { + "epoch": 0.44654657373042456, + "grad_norm": 0.8049623966217041, + "learning_rate": 0.00010371193837404143, + "loss": 2.7947, + "step": 4933 + }, + { + "epoch": 0.4466370960441749, + "grad_norm": 0.868293821811676, + "learning_rate": 0.00010368022679790031, + "loss": 2.7748, + "step": 4934 + }, + { + "epoch": 0.44672761835792524, + "grad_norm": 0.9875622987747192, + "learning_rate": 0.00010364851485116048, + "loss": 2.8251, + "step": 4935 + }, + { + "epoch": 0.4468181406716756, + "grad_norm": 0.8716400265693665, + "learning_rate": 0.00010361680253701527, + "loss": 2.7795, + "step": 4936 + }, + { + "epoch": 0.4469086629854259, + "grad_norm": 0.808600902557373, + "learning_rate": 0.00010358508985865813, + "loss": 2.8153, + "step": 4937 + }, + { + "epoch": 0.44699918529917626, + "grad_norm": 0.8896029591560364, + "learning_rate": 0.00010355337681928254, + "loss": 2.7848, + "step": 4938 + }, + { + "epoch": 0.4470897076129266, + "grad_norm": 0.8998689651489258, + "learning_rate": 0.00010352166342208205, + "loss": 2.856, + "step": 4939 + }, + { + "epoch": 0.44718022992667694, + "grad_norm": 0.8900119066238403, + "learning_rate": 0.00010348994967025012, + "loss": 2.7832, + "step": 4940 + }, + { + "epoch": 0.4472707522404273, + "grad_norm": 0.921204686164856, + "learning_rate": 0.00010345823556698034, + "loss": 2.8575, + "step": 4941 + }, + { + "epoch": 0.4473612745541776, + "grad_norm": 0.8895449042320251, + "learning_rate": 0.00010342652111546635, + "loss": 2.7814, + "step": 4942 + }, + { + "epoch": 0.44745179686792796, + "grad_norm": 0.8308514952659607, + "learning_rate": 0.00010339480631890178, + "loss": 2.8368, + "step": 4943 + }, + { + "epoch": 0.4475423191816783, + "grad_norm": 0.8498168587684631, + "learning_rate": 0.00010336309118048029, + "loss": 2.8192, + "step": 4944 + }, + { + "epoch": 0.44763284149542865, + "grad_norm": 0.8537622690200806, + "learning_rate": 0.00010333137570339562, + "loss": 2.8891, + "step": 4945 + }, + { + "epoch": 0.447723363809179, + "grad_norm": 0.8644798994064331, + "learning_rate": 0.00010329965989084152, + "loss": 2.8316, + "step": 4946 + }, + { + "epoch": 0.4478138861229293, + "grad_norm": 0.8171406388282776, + "learning_rate": 0.00010326794374601174, + "loss": 2.7997, + "step": 4947 + }, + { + "epoch": 0.44790440843667967, + "grad_norm": 0.8642208576202393, + "learning_rate": 0.00010323622727210012, + "loss": 2.7823, + "step": 4948 + }, + { + "epoch": 0.44799493075042995, + "grad_norm": 0.8446072340011597, + "learning_rate": 0.00010320451047230051, + "loss": 2.8498, + "step": 4949 + }, + { + "epoch": 0.4480854530641803, + "grad_norm": 0.8191630840301514, + "learning_rate": 0.00010317279334980678, + "loss": 2.8523, + "step": 4950 + }, + { + "epoch": 0.44817597537793064, + "grad_norm": 0.8264406323432922, + "learning_rate": 0.00010314107590781284, + "loss": 2.7282, + "step": 4951 + }, + { + "epoch": 0.448266497691681, + "grad_norm": 0.7761468887329102, + "learning_rate": 0.00010310935814951264, + "loss": 2.7908, + "step": 4952 + }, + { + "epoch": 0.4483570200054313, + "grad_norm": 0.8397097587585449, + "learning_rate": 0.00010307764007810017, + "loss": 2.7509, + "step": 4953 + }, + { + "epoch": 0.44844754231918166, + "grad_norm": 0.8186659812927246, + "learning_rate": 0.00010304592169676943, + "loss": 2.7241, + "step": 4954 + }, + { + "epoch": 0.448538064632932, + "grad_norm": 0.8386641144752502, + "learning_rate": 0.00010301420300871445, + "loss": 2.7638, + "step": 4955 + }, + { + "epoch": 0.44862858694668234, + "grad_norm": 0.8329310417175293, + "learning_rate": 0.00010298248401712931, + "loss": 2.8275, + "step": 4956 + }, + { + "epoch": 0.4487191092604327, + "grad_norm": 0.8720096349716187, + "learning_rate": 0.00010295076472520812, + "loss": 2.753, + "step": 4957 + }, + { + "epoch": 0.448809631574183, + "grad_norm": 0.8473954200744629, + "learning_rate": 0.00010291904513614499, + "loss": 2.843, + "step": 4958 + }, + { + "epoch": 0.44890015388793336, + "grad_norm": 0.8658212423324585, + "learning_rate": 0.0001028873252531341, + "loss": 2.7285, + "step": 4959 + }, + { + "epoch": 0.4489906762016837, + "grad_norm": 0.8777493238449097, + "learning_rate": 0.00010285560507936961, + "loss": 2.7865, + "step": 4960 + }, + { + "epoch": 0.44908119851543404, + "grad_norm": 0.9246544241905212, + "learning_rate": 0.00010282388461804584, + "loss": 2.7525, + "step": 4961 + }, + { + "epoch": 0.4491717208291844, + "grad_norm": 0.8622031211853027, + "learning_rate": 0.0001027921638723569, + "loss": 2.9143, + "step": 4962 + }, + { + "epoch": 0.4492622431429347, + "grad_norm": 0.9040536880493164, + "learning_rate": 0.00010276044284549716, + "loss": 2.8462, + "step": 4963 + }, + { + "epoch": 0.44935276545668507, + "grad_norm": 0.8143455386161804, + "learning_rate": 0.00010272872154066089, + "loss": 2.8701, + "step": 4964 + }, + { + "epoch": 0.4494432877704354, + "grad_norm": 0.8502079248428345, + "learning_rate": 0.00010269699996104246, + "loss": 2.8012, + "step": 4965 + }, + { + "epoch": 0.44953381008418575, + "grad_norm": 0.9193055629730225, + "learning_rate": 0.00010266527810983617, + "loss": 2.8005, + "step": 4966 + }, + { + "epoch": 0.4496243323979361, + "grad_norm": 0.8001332879066467, + "learning_rate": 0.00010263355599023645, + "loss": 2.7171, + "step": 4967 + }, + { + "epoch": 0.44971485471168643, + "grad_norm": 0.830764889717102, + "learning_rate": 0.00010260183360543773, + "loss": 2.8556, + "step": 4968 + }, + { + "epoch": 0.44980537702543677, + "grad_norm": 0.8240771889686584, + "learning_rate": 0.00010257011095863444, + "loss": 2.805, + "step": 4969 + }, + { + "epoch": 0.4498958993391871, + "grad_norm": 0.8989068269729614, + "learning_rate": 0.00010253838805302104, + "loss": 2.8138, + "step": 4970 + }, + { + "epoch": 0.44998642165293745, + "grad_norm": 0.7964334487915039, + "learning_rate": 0.00010250666489179204, + "loss": 2.7718, + "step": 4971 + }, + { + "epoch": 0.4500769439666878, + "grad_norm": 0.8724195957183838, + "learning_rate": 0.00010247494147814196, + "loss": 2.809, + "step": 4972 + }, + { + "epoch": 0.45016746628043813, + "grad_norm": 0.9154969453811646, + "learning_rate": 0.00010244321781526533, + "loss": 2.8357, + "step": 4973 + }, + { + "epoch": 0.4502579885941885, + "grad_norm": 0.898012101650238, + "learning_rate": 0.00010241149390635674, + "loss": 2.744, + "step": 4974 + }, + { + "epoch": 0.4503485109079388, + "grad_norm": 1.0105397701263428, + "learning_rate": 0.00010237976975461075, + "loss": 2.8061, + "step": 4975 + }, + { + "epoch": 0.45043903322168916, + "grad_norm": 0.8304058313369751, + "learning_rate": 0.00010234804536322204, + "loss": 2.816, + "step": 4976 + }, + { + "epoch": 0.4505295555354395, + "grad_norm": 0.9987416863441467, + "learning_rate": 0.00010231632073538522, + "loss": 2.8618, + "step": 4977 + }, + { + "epoch": 0.45062007784918984, + "grad_norm": 0.8492810726165771, + "learning_rate": 0.00010228459587429497, + "loss": 2.7893, + "step": 4978 + }, + { + "epoch": 0.4507106001629402, + "grad_norm": 0.8335104584693909, + "learning_rate": 0.00010225287078314596, + "loss": 2.8049, + "step": 4979 + }, + { + "epoch": 0.4508011224766905, + "grad_norm": 0.881281852722168, + "learning_rate": 0.00010222114546513295, + "loss": 2.8618, + "step": 4980 + }, + { + "epoch": 0.45089164479044086, + "grad_norm": 0.9159513115882874, + "learning_rate": 0.00010218941992345063, + "loss": 2.8276, + "step": 4981 + }, + { + "epoch": 0.4509821671041912, + "grad_norm": 0.846339762210846, + "learning_rate": 0.0001021576941612938, + "loss": 2.7684, + "step": 4982 + }, + { + "epoch": 0.45107268941794154, + "grad_norm": 0.8389536142349243, + "learning_rate": 0.0001021259681818572, + "loss": 2.8375, + "step": 4983 + }, + { + "epoch": 0.4511632117316919, + "grad_norm": 0.8507980108261108, + "learning_rate": 0.0001020942419883357, + "loss": 2.8065, + "step": 4984 + }, + { + "epoch": 0.4512537340454422, + "grad_norm": 0.8403035998344421, + "learning_rate": 0.00010206251558392408, + "loss": 2.8151, + "step": 4985 + }, + { + "epoch": 0.45134425635919256, + "grad_norm": 0.8249539732933044, + "learning_rate": 0.00010203078897181717, + "loss": 2.7943, + "step": 4986 + }, + { + "epoch": 0.4514347786729429, + "grad_norm": 0.8872669339179993, + "learning_rate": 0.00010199906215520989, + "loss": 2.8219, + "step": 4987 + }, + { + "epoch": 0.45152530098669325, + "grad_norm": 0.9296691417694092, + "learning_rate": 0.0001019673351372971, + "loss": 2.8051, + "step": 4988 + }, + { + "epoch": 0.4516158233004436, + "grad_norm": 0.9113702774047852, + "learning_rate": 0.00010193560792127372, + "loss": 2.8503, + "step": 4989 + }, + { + "epoch": 0.45170634561419387, + "grad_norm": 0.9052483439445496, + "learning_rate": 0.00010190388051033466, + "loss": 2.8591, + "step": 4990 + }, + { + "epoch": 0.4517968679279442, + "grad_norm": 0.844837486743927, + "learning_rate": 0.00010187215290767491, + "loss": 2.8988, + "step": 4991 + }, + { + "epoch": 0.45188739024169455, + "grad_norm": 0.8985702395439148, + "learning_rate": 0.00010184042511648942, + "loss": 2.8286, + "step": 4992 + }, + { + "epoch": 0.4519779125554449, + "grad_norm": 0.9050378203392029, + "learning_rate": 0.00010180869713997312, + "loss": 2.7684, + "step": 4993 + }, + { + "epoch": 0.45206843486919523, + "grad_norm": 0.8572180867195129, + "learning_rate": 0.0001017769689813211, + "loss": 2.7869, + "step": 4994 + }, + { + "epoch": 0.4521589571829456, + "grad_norm": 0.8583084940910339, + "learning_rate": 0.00010174524064372837, + "loss": 2.839, + "step": 4995 + }, + { + "epoch": 0.4522494794966959, + "grad_norm": 0.7894691228866577, + "learning_rate": 0.00010171351213038993, + "loss": 2.7837, + "step": 4996 + }, + { + "epoch": 0.45234000181044626, + "grad_norm": 0.843013346195221, + "learning_rate": 0.00010168178344450086, + "loss": 2.7827, + "step": 4997 + }, + { + "epoch": 0.4524305241241966, + "grad_norm": 0.9390647411346436, + "learning_rate": 0.00010165005458925626, + "loss": 2.7951, + "step": 4998 + }, + { + "epoch": 0.45252104643794694, + "grad_norm": 0.8602046966552734, + "learning_rate": 0.0001016183255678512, + "loss": 2.77, + "step": 4999 + }, + { + "epoch": 0.4526115687516973, + "grad_norm": 0.9842708110809326, + "learning_rate": 0.00010158659638348081, + "loss": 2.8039, + "step": 5000 + }, + { + "epoch": 0.4526115687516973, + "eval_loss": 2.7390329837799072, + "eval_runtime": 71.5613, + "eval_samples_per_second": 37.772, + "eval_steps_per_second": 3.158, + "step": 5000 + }, + { + "epoch": 0.4527020910654476, + "grad_norm": 0.8009204864501953, + "learning_rate": 0.00010155486703934018, + "loss": 2.8257, + "step": 5001 + }, + { + "epoch": 0.45279261337919796, + "grad_norm": 0.8216099143028259, + "learning_rate": 0.0001015231375386245, + "loss": 2.7155, + "step": 5002 + }, + { + "epoch": 0.4528831356929483, + "grad_norm": 0.879997193813324, + "learning_rate": 0.00010149140788452894, + "loss": 2.8294, + "step": 5003 + }, + { + "epoch": 0.45297365800669864, + "grad_norm": 0.7909499406814575, + "learning_rate": 0.00010145967808024859, + "loss": 2.7564, + "step": 5004 + }, + { + "epoch": 0.453064180320449, + "grad_norm": 0.8433207869529724, + "learning_rate": 0.00010142794812897873, + "loss": 2.8164, + "step": 5005 + }, + { + "epoch": 0.4531547026341993, + "grad_norm": 0.8329892754554749, + "learning_rate": 0.00010139621803391455, + "loss": 2.8001, + "step": 5006 + }, + { + "epoch": 0.45324522494794967, + "grad_norm": 0.9415286779403687, + "learning_rate": 0.0001013644877982512, + "loss": 2.8395, + "step": 5007 + }, + { + "epoch": 0.4533357472617, + "grad_norm": 0.8327991366386414, + "learning_rate": 0.00010133275742518403, + "loss": 2.8404, + "step": 5008 + }, + { + "epoch": 0.45342626957545035, + "grad_norm": 0.8747456073760986, + "learning_rate": 0.0001013010269179082, + "loss": 2.8421, + "step": 5009 + }, + { + "epoch": 0.4535167918892007, + "grad_norm": 0.8911159038543701, + "learning_rate": 0.00010126929627961896, + "loss": 2.7713, + "step": 5010 + }, + { + "epoch": 0.45360731420295103, + "grad_norm": 0.7843373417854309, + "learning_rate": 0.00010123756551351166, + "loss": 2.806, + "step": 5011 + }, + { + "epoch": 0.45369783651670137, + "grad_norm": 0.8249711990356445, + "learning_rate": 0.00010120583462278152, + "loss": 2.7439, + "step": 5012 + }, + { + "epoch": 0.4537883588304517, + "grad_norm": 0.8340107202529907, + "learning_rate": 0.00010117410361062387, + "loss": 2.7893, + "step": 5013 + }, + { + "epoch": 0.45387888114420205, + "grad_norm": 0.8179864287376404, + "learning_rate": 0.00010114237248023404, + "loss": 2.7871, + "step": 5014 + }, + { + "epoch": 0.4539694034579524, + "grad_norm": 0.8432429432868958, + "learning_rate": 0.00010111064123480733, + "loss": 2.8294, + "step": 5015 + }, + { + "epoch": 0.45405992577170273, + "grad_norm": 0.8422669768333435, + "learning_rate": 0.00010107890987753905, + "loss": 2.8206, + "step": 5016 + }, + { + "epoch": 0.4541504480854531, + "grad_norm": 0.8166409134864807, + "learning_rate": 0.00010104717841162458, + "loss": 2.81, + "step": 5017 + }, + { + "epoch": 0.4542409703992034, + "grad_norm": 0.8468937277793884, + "learning_rate": 0.00010101544684025931, + "loss": 2.7166, + "step": 5018 + }, + { + "epoch": 0.45433149271295376, + "grad_norm": 0.8151309490203857, + "learning_rate": 0.00010098371516663853, + "loss": 2.8096, + "step": 5019 + }, + { + "epoch": 0.4544220150267041, + "grad_norm": 0.8079573512077332, + "learning_rate": 0.00010095198339395769, + "loss": 2.7564, + "step": 5020 + }, + { + "epoch": 0.45451253734045444, + "grad_norm": 0.8220385313034058, + "learning_rate": 0.00010092025152541211, + "loss": 2.7707, + "step": 5021 + }, + { + "epoch": 0.4546030596542048, + "grad_norm": 0.7748722434043884, + "learning_rate": 0.00010088851956419728, + "loss": 2.7977, + "step": 5022 + }, + { + "epoch": 0.4546935819679551, + "grad_norm": 0.8827757835388184, + "learning_rate": 0.0001008567875135085, + "loss": 2.8557, + "step": 5023 + }, + { + "epoch": 0.45478410428170546, + "grad_norm": 0.852816104888916, + "learning_rate": 0.00010082505537654128, + "loss": 2.8645, + "step": 5024 + }, + { + "epoch": 0.4548746265954558, + "grad_norm": 0.845237672328949, + "learning_rate": 0.00010079332315649097, + "loss": 2.805, + "step": 5025 + }, + { + "epoch": 0.45496514890920614, + "grad_norm": 0.7926275134086609, + "learning_rate": 0.00010076159085655308, + "loss": 2.7563, + "step": 5026 + }, + { + "epoch": 0.4550556712229565, + "grad_norm": 0.8070298433303833, + "learning_rate": 0.000100729858479923, + "loss": 2.8283, + "step": 5027 + }, + { + "epoch": 0.4551461935367068, + "grad_norm": 0.8690354824066162, + "learning_rate": 0.00010069812602979615, + "loss": 2.8512, + "step": 5028 + }, + { + "epoch": 0.45523671585045716, + "grad_norm": 0.8458876013755798, + "learning_rate": 0.00010066639350936806, + "loss": 2.8317, + "step": 5029 + }, + { + "epoch": 0.4553272381642075, + "grad_norm": 0.8781901001930237, + "learning_rate": 0.0001006346609218342, + "loss": 2.7928, + "step": 5030 + }, + { + "epoch": 0.4554177604779578, + "grad_norm": 0.8437893986701965, + "learning_rate": 0.00010060292827038995, + "loss": 2.7485, + "step": 5031 + }, + { + "epoch": 0.45550828279170813, + "grad_norm": 0.8205193281173706, + "learning_rate": 0.00010057119555823085, + "loss": 2.8012, + "step": 5032 + }, + { + "epoch": 0.45559880510545847, + "grad_norm": 0.8345540165901184, + "learning_rate": 0.00010053946278855237, + "loss": 2.7236, + "step": 5033 + }, + { + "epoch": 0.4556893274192088, + "grad_norm": 0.8251396417617798, + "learning_rate": 0.00010050772996455002, + "loss": 2.8524, + "step": 5034 + }, + { + "epoch": 0.45577984973295915, + "grad_norm": 0.9152898192405701, + "learning_rate": 0.00010047599708941927, + "loss": 2.7933, + "step": 5035 + }, + { + "epoch": 0.4558703720467095, + "grad_norm": 0.8356764316558838, + "learning_rate": 0.00010044426416635562, + "loss": 2.7767, + "step": 5036 + }, + { + "epoch": 0.45596089436045983, + "grad_norm": 0.8549394607543945, + "learning_rate": 0.0001004125311985546, + "loss": 2.7789, + "step": 5037 + }, + { + "epoch": 0.4560514166742102, + "grad_norm": 0.8408095240592957, + "learning_rate": 0.00010038079818921166, + "loss": 2.7851, + "step": 5038 + }, + { + "epoch": 0.4561419389879605, + "grad_norm": 0.8904051184654236, + "learning_rate": 0.00010034906514152238, + "loss": 2.7786, + "step": 5039 + }, + { + "epoch": 0.45623246130171086, + "grad_norm": 0.8291855454444885, + "learning_rate": 0.00010031733205868224, + "loss": 2.7598, + "step": 5040 + }, + { + "epoch": 0.4563229836154612, + "grad_norm": 0.8298647403717041, + "learning_rate": 0.0001002855989438868, + "loss": 2.7359, + "step": 5041 + }, + { + "epoch": 0.45641350592921154, + "grad_norm": 0.8610008358955383, + "learning_rate": 0.00010025386580033151, + "loss": 2.7943, + "step": 5042 + }, + { + "epoch": 0.4565040282429619, + "grad_norm": 0.8803578615188599, + "learning_rate": 0.00010022213263121197, + "loss": 2.7389, + "step": 5043 + }, + { + "epoch": 0.4565945505567122, + "grad_norm": 0.8574979901313782, + "learning_rate": 0.00010019039943972366, + "loss": 2.8107, + "step": 5044 + }, + { + "epoch": 0.45668507287046256, + "grad_norm": 0.8162574172019958, + "learning_rate": 0.00010015866622906217, + "loss": 2.843, + "step": 5045 + }, + { + "epoch": 0.4567755951842129, + "grad_norm": 0.9479469060897827, + "learning_rate": 0.00010012693300242296, + "loss": 2.7705, + "step": 5046 + }, + { + "epoch": 0.45686611749796324, + "grad_norm": 0.8440346121788025, + "learning_rate": 0.00010009519976300159, + "loss": 2.8239, + "step": 5047 + }, + { + "epoch": 0.4569566398117136, + "grad_norm": 0.8232409358024597, + "learning_rate": 0.00010006346651399363, + "loss": 2.7954, + "step": 5048 + }, + { + "epoch": 0.4570471621254639, + "grad_norm": 0.8686602115631104, + "learning_rate": 0.00010003173325859461, + "loss": 2.8405, + "step": 5049 + }, + { + "epoch": 0.45713768443921426, + "grad_norm": 0.8840144872665405, + "learning_rate": 0.0001, + "loss": 2.8045, + "step": 5050 + }, + { + "epoch": 0.4572282067529646, + "grad_norm": 0.8932952880859375, + "learning_rate": 9.996826674140544e-05, + "loss": 2.778, + "step": 5051 + }, + { + "epoch": 0.45731872906671495, + "grad_norm": 0.874193012714386, + "learning_rate": 9.99365334860064e-05, + "loss": 2.7877, + "step": 5052 + }, + { + "epoch": 0.4574092513804653, + "grad_norm": 0.7909197807312012, + "learning_rate": 9.990480023699845e-05, + "loss": 2.7235, + "step": 5053 + }, + { + "epoch": 0.45749977369421563, + "grad_norm": 0.8901601433753967, + "learning_rate": 9.987306699757707e-05, + "loss": 2.7525, + "step": 5054 + }, + { + "epoch": 0.45759029600796597, + "grad_norm": 0.8555707931518555, + "learning_rate": 9.984133377093789e-05, + "loss": 2.7974, + "step": 5055 + }, + { + "epoch": 0.4576808183217163, + "grad_norm": 0.8725164532661438, + "learning_rate": 9.980960056027636e-05, + "loss": 2.7639, + "step": 5056 + }, + { + "epoch": 0.45777134063546665, + "grad_norm": 0.8749620318412781, + "learning_rate": 9.977786736878808e-05, + "loss": 2.766, + "step": 5057 + }, + { + "epoch": 0.457861862949217, + "grad_norm": 0.8529465794563293, + "learning_rate": 9.974613419966851e-05, + "loss": 2.7849, + "step": 5058 + }, + { + "epoch": 0.45795238526296733, + "grad_norm": 0.8112756609916687, + "learning_rate": 9.971440105611325e-05, + "loss": 2.8221, + "step": 5059 + }, + { + "epoch": 0.4580429075767177, + "grad_norm": 0.8644946217536926, + "learning_rate": 9.968266794131777e-05, + "loss": 2.8586, + "step": 5060 + }, + { + "epoch": 0.458133429890468, + "grad_norm": 0.819271981716156, + "learning_rate": 9.965093485847767e-05, + "loss": 2.7961, + "step": 5061 + }, + { + "epoch": 0.45822395220421835, + "grad_norm": 0.8047432899475098, + "learning_rate": 9.961920181078835e-05, + "loss": 2.824, + "step": 5062 + }, + { + "epoch": 0.4583144745179687, + "grad_norm": 0.9896413087844849, + "learning_rate": 9.958746880144545e-05, + "loss": 2.887, + "step": 5063 + }, + { + "epoch": 0.45840499683171904, + "grad_norm": 0.8702191114425659, + "learning_rate": 9.95557358336444e-05, + "loss": 2.7741, + "step": 5064 + }, + { + "epoch": 0.4584955191454694, + "grad_norm": 0.8362398147583008, + "learning_rate": 9.952400291058077e-05, + "loss": 2.7465, + "step": 5065 + }, + { + "epoch": 0.4585860414592197, + "grad_norm": 0.8557057976722717, + "learning_rate": 9.949227003545e-05, + "loss": 2.7277, + "step": 5066 + }, + { + "epoch": 0.45867656377297006, + "grad_norm": 0.9606072902679443, + "learning_rate": 9.946053721144765e-05, + "loss": 2.7846, + "step": 5067 + }, + { + "epoch": 0.4587670860867204, + "grad_norm": 0.8010448217391968, + "learning_rate": 9.942880444176918e-05, + "loss": 2.7601, + "step": 5068 + }, + { + "epoch": 0.45885760840047074, + "grad_norm": 0.833601176738739, + "learning_rate": 9.93970717296101e-05, + "loss": 2.8154, + "step": 5069 + }, + { + "epoch": 0.4589481307142211, + "grad_norm": 0.8944669961929321, + "learning_rate": 9.936533907816584e-05, + "loss": 2.7353, + "step": 5070 + }, + { + "epoch": 0.4590386530279714, + "grad_norm": 0.9217827916145325, + "learning_rate": 9.933360649063195e-05, + "loss": 2.8062, + "step": 5071 + }, + { + "epoch": 0.4591291753417217, + "grad_norm": 0.7689042091369629, + "learning_rate": 9.930187397020386e-05, + "loss": 2.6995, + "step": 5072 + }, + { + "epoch": 0.45921969765547205, + "grad_norm": 0.8688762187957764, + "learning_rate": 9.927014152007706e-05, + "loss": 2.7928, + "step": 5073 + }, + { + "epoch": 0.4593102199692224, + "grad_norm": 0.8136226534843445, + "learning_rate": 9.923840914344695e-05, + "loss": 2.786, + "step": 5074 + }, + { + "epoch": 0.45940074228297273, + "grad_norm": 0.8458419442176819, + "learning_rate": 9.920667684350905e-05, + "loss": 2.7491, + "step": 5075 + }, + { + "epoch": 0.45949126459672307, + "grad_norm": 0.8307839035987854, + "learning_rate": 9.917494462345876e-05, + "loss": 2.7433, + "step": 5076 + }, + { + "epoch": 0.4595817869104734, + "grad_norm": 0.8397489786148071, + "learning_rate": 9.914321248649153e-05, + "loss": 2.8396, + "step": 5077 + }, + { + "epoch": 0.45967230922422375, + "grad_norm": 0.8725177645683289, + "learning_rate": 9.911148043580275e-05, + "loss": 2.802, + "step": 5078 + }, + { + "epoch": 0.4597628315379741, + "grad_norm": 0.9256505370140076, + "learning_rate": 9.907974847458791e-05, + "loss": 2.8105, + "step": 5079 + }, + { + "epoch": 0.45985335385172443, + "grad_norm": 0.8496103286743164, + "learning_rate": 9.904801660604234e-05, + "loss": 2.8376, + "step": 5080 + }, + { + "epoch": 0.4599438761654748, + "grad_norm": 0.8865105509757996, + "learning_rate": 9.90162848333615e-05, + "loss": 2.842, + "step": 5081 + }, + { + "epoch": 0.4600343984792251, + "grad_norm": 0.862686038017273, + "learning_rate": 9.898455315974071e-05, + "loss": 2.7353, + "step": 5082 + }, + { + "epoch": 0.46012492079297546, + "grad_norm": 0.8391594290733337, + "learning_rate": 9.895282158837545e-05, + "loss": 2.8188, + "step": 5083 + }, + { + "epoch": 0.4602154431067258, + "grad_norm": 0.8564516305923462, + "learning_rate": 9.892109012246096e-05, + "loss": 2.8319, + "step": 5084 + }, + { + "epoch": 0.46030596542047614, + "grad_norm": 0.8370792269706726, + "learning_rate": 9.888935876519272e-05, + "loss": 2.8322, + "step": 5085 + }, + { + "epoch": 0.4603964877342265, + "grad_norm": 0.8662410974502563, + "learning_rate": 9.8857627519766e-05, + "loss": 2.7647, + "step": 5086 + }, + { + "epoch": 0.4604870100479768, + "grad_norm": 0.8782781362533569, + "learning_rate": 9.882589638937615e-05, + "loss": 2.8176, + "step": 5087 + }, + { + "epoch": 0.46057753236172716, + "grad_norm": 0.878341794013977, + "learning_rate": 9.87941653772185e-05, + "loss": 2.8341, + "step": 5088 + }, + { + "epoch": 0.4606680546754775, + "grad_norm": 0.8614301085472107, + "learning_rate": 9.876243448648839e-05, + "loss": 2.8229, + "step": 5089 + }, + { + "epoch": 0.46075857698922784, + "grad_norm": 0.9689937233924866, + "learning_rate": 9.873070372038105e-05, + "loss": 2.8517, + "step": 5090 + }, + { + "epoch": 0.4608490993029782, + "grad_norm": 0.8726419806480408, + "learning_rate": 9.869897308209186e-05, + "loss": 2.8126, + "step": 5091 + }, + { + "epoch": 0.4609396216167285, + "grad_norm": 0.8292773365974426, + "learning_rate": 9.8667242574816e-05, + "loss": 2.7537, + "step": 5092 + }, + { + "epoch": 0.46103014393047886, + "grad_norm": 0.8113095760345459, + "learning_rate": 9.863551220174881e-05, + "loss": 2.7456, + "step": 5093 + }, + { + "epoch": 0.4611206662442292, + "grad_norm": 0.8891957402229309, + "learning_rate": 9.860378196608549e-05, + "loss": 2.8136, + "step": 5094 + }, + { + "epoch": 0.46121118855797955, + "grad_norm": 0.8564567565917969, + "learning_rate": 9.85720518710213e-05, + "loss": 2.8245, + "step": 5095 + }, + { + "epoch": 0.4613017108717299, + "grad_norm": 0.8329167366027832, + "learning_rate": 9.85403219197514e-05, + "loss": 2.766, + "step": 5096 + }, + { + "epoch": 0.4613922331854802, + "grad_norm": 0.824367105960846, + "learning_rate": 9.85085921154711e-05, + "loss": 2.7998, + "step": 5097 + }, + { + "epoch": 0.46148275549923057, + "grad_norm": 0.8718504309654236, + "learning_rate": 9.847686246137551e-05, + "loss": 2.7728, + "step": 5098 + }, + { + "epoch": 0.4615732778129809, + "grad_norm": 0.8713845610618591, + "learning_rate": 9.844513296065984e-05, + "loss": 2.8486, + "step": 5099 + }, + { + "epoch": 0.46166380012673125, + "grad_norm": 0.7742902040481567, + "learning_rate": 9.84134036165192e-05, + "loss": 2.8051, + "step": 5100 + }, + { + "epoch": 0.4617543224404816, + "grad_norm": 0.8396895527839661, + "learning_rate": 9.838167443214882e-05, + "loss": 2.803, + "step": 5101 + }, + { + "epoch": 0.46184484475423193, + "grad_norm": 0.8684070110321045, + "learning_rate": 9.834994541074375e-05, + "loss": 2.8388, + "step": 5102 + }, + { + "epoch": 0.46193536706798227, + "grad_norm": 0.8808414936065674, + "learning_rate": 9.831821655549916e-05, + "loss": 2.792, + "step": 5103 + }, + { + "epoch": 0.4620258893817326, + "grad_norm": 0.8871559500694275, + "learning_rate": 9.828648786961008e-05, + "loss": 2.8197, + "step": 5104 + }, + { + "epoch": 0.46211641169548295, + "grad_norm": 0.8918201923370361, + "learning_rate": 9.825475935627165e-05, + "loss": 2.8327, + "step": 5105 + }, + { + "epoch": 0.4622069340092333, + "grad_norm": 0.9378277659416199, + "learning_rate": 9.82230310186789e-05, + "loss": 2.7894, + "step": 5106 + }, + { + "epoch": 0.46229745632298364, + "grad_norm": 0.8530616760253906, + "learning_rate": 9.819130286002689e-05, + "loss": 2.8486, + "step": 5107 + }, + { + "epoch": 0.462387978636734, + "grad_norm": 0.8263341784477234, + "learning_rate": 9.81595748835106e-05, + "loss": 2.7534, + "step": 5108 + }, + { + "epoch": 0.4624785009504843, + "grad_norm": 0.9433355927467346, + "learning_rate": 9.81278470923251e-05, + "loss": 2.833, + "step": 5109 + }, + { + "epoch": 0.46256902326423466, + "grad_norm": 0.8154783844947815, + "learning_rate": 9.809611948966533e-05, + "loss": 2.8349, + "step": 5110 + }, + { + "epoch": 0.462659545577985, + "grad_norm": 0.8896995782852173, + "learning_rate": 9.80643920787263e-05, + "loss": 2.7907, + "step": 5111 + }, + { + "epoch": 0.46275006789173534, + "grad_norm": 0.8480448126792908, + "learning_rate": 9.803266486270288e-05, + "loss": 2.7842, + "step": 5112 + }, + { + "epoch": 0.4628405902054856, + "grad_norm": 0.844628095626831, + "learning_rate": 9.800093784479013e-05, + "loss": 2.8267, + "step": 5113 + }, + { + "epoch": 0.46293111251923597, + "grad_norm": 0.9172288775444031, + "learning_rate": 9.796921102818281e-05, + "loss": 2.8163, + "step": 5114 + }, + { + "epoch": 0.4630216348329863, + "grad_norm": 0.8828997015953064, + "learning_rate": 9.793748441607594e-05, + "loss": 2.7847, + "step": 5115 + }, + { + "epoch": 0.46311215714673665, + "grad_norm": 0.9007895588874817, + "learning_rate": 9.790575801166432e-05, + "loss": 2.8427, + "step": 5116 + }, + { + "epoch": 0.463202679460487, + "grad_norm": 0.8572633862495422, + "learning_rate": 9.787403181814281e-05, + "loss": 2.7562, + "step": 5117 + }, + { + "epoch": 0.46329320177423733, + "grad_norm": 0.8919517993927002, + "learning_rate": 9.784230583870621e-05, + "loss": 2.8008, + "step": 5118 + }, + { + "epoch": 0.46338372408798767, + "grad_norm": 0.8707318305969238, + "learning_rate": 9.781058007654939e-05, + "loss": 2.8122, + "step": 5119 + }, + { + "epoch": 0.463474246401738, + "grad_norm": 0.8310428857803345, + "learning_rate": 9.777885453486706e-05, + "loss": 2.807, + "step": 5120 + }, + { + "epoch": 0.46356476871548835, + "grad_norm": 0.8464776277542114, + "learning_rate": 9.774712921685407e-05, + "loss": 2.751, + "step": 5121 + }, + { + "epoch": 0.4636552910292387, + "grad_norm": 0.9811182618141174, + "learning_rate": 9.771540412570504e-05, + "loss": 2.8484, + "step": 5122 + }, + { + "epoch": 0.46374581334298903, + "grad_norm": 0.9233271479606628, + "learning_rate": 9.768367926461479e-05, + "loss": 2.7754, + "step": 5123 + }, + { + "epoch": 0.4638363356567394, + "grad_norm": 0.9555431604385376, + "learning_rate": 9.765195463677797e-05, + "loss": 2.8653, + "step": 5124 + }, + { + "epoch": 0.4639268579704897, + "grad_norm": 1.0181915760040283, + "learning_rate": 9.762023024538926e-05, + "loss": 2.8013, + "step": 5125 + }, + { + "epoch": 0.46401738028424006, + "grad_norm": 0.8504894971847534, + "learning_rate": 9.758850609364327e-05, + "loss": 2.7465, + "step": 5126 + }, + { + "epoch": 0.4641079025979904, + "grad_norm": 0.9759464263916016, + "learning_rate": 9.755678218473469e-05, + "loss": 2.8684, + "step": 5127 + }, + { + "epoch": 0.46419842491174074, + "grad_norm": 0.870422899723053, + "learning_rate": 9.752505852185805e-05, + "loss": 2.7721, + "step": 5128 + }, + { + "epoch": 0.4642889472254911, + "grad_norm": 0.8897386789321899, + "learning_rate": 9.749333510820798e-05, + "loss": 2.7581, + "step": 5129 + }, + { + "epoch": 0.4643794695392414, + "grad_norm": 0.8926811218261719, + "learning_rate": 9.746161194697895e-05, + "loss": 2.7956, + "step": 5130 + }, + { + "epoch": 0.46446999185299176, + "grad_norm": 0.797377347946167, + "learning_rate": 9.742988904136557e-05, + "loss": 2.8038, + "step": 5131 + }, + { + "epoch": 0.4645605141667421, + "grad_norm": 0.8459729552268982, + "learning_rate": 9.739816639456228e-05, + "loss": 2.7561, + "step": 5132 + }, + { + "epoch": 0.46465103648049244, + "grad_norm": 0.8637222051620483, + "learning_rate": 9.736644400976357e-05, + "loss": 2.772, + "step": 5133 + }, + { + "epoch": 0.4647415587942428, + "grad_norm": 0.7574160695075989, + "learning_rate": 9.733472189016383e-05, + "loss": 2.7673, + "step": 5134 + }, + { + "epoch": 0.4648320811079931, + "grad_norm": 0.8637992739677429, + "learning_rate": 9.730300003895758e-05, + "loss": 2.7775, + "step": 5135 + }, + { + "epoch": 0.46492260342174346, + "grad_norm": 0.8622542023658752, + "learning_rate": 9.727127845933915e-05, + "loss": 2.7724, + "step": 5136 + }, + { + "epoch": 0.4650131257354938, + "grad_norm": 0.9396885633468628, + "learning_rate": 9.723955715450287e-05, + "loss": 2.7755, + "step": 5137 + }, + { + "epoch": 0.46510364804924414, + "grad_norm": 0.8237531781196594, + "learning_rate": 9.720783612764314e-05, + "loss": 2.7245, + "step": 5138 + }, + { + "epoch": 0.4651941703629945, + "grad_norm": 1.1125627756118774, + "learning_rate": 9.717611538195419e-05, + "loss": 2.8233, + "step": 5139 + }, + { + "epoch": 0.4652846926767448, + "grad_norm": 0.836974024772644, + "learning_rate": 9.71443949206304e-05, + "loss": 2.7418, + "step": 5140 + }, + { + "epoch": 0.46537521499049517, + "grad_norm": 0.9694908857345581, + "learning_rate": 9.711267474686591e-05, + "loss": 2.8141, + "step": 5141 + }, + { + "epoch": 0.4654657373042455, + "grad_norm": 0.894636332988739, + "learning_rate": 9.708095486385504e-05, + "loss": 2.7566, + "step": 5142 + }, + { + "epoch": 0.46555625961799585, + "grad_norm": 0.8978986144065857, + "learning_rate": 9.704923527479189e-05, + "loss": 2.8253, + "step": 5143 + }, + { + "epoch": 0.4656467819317462, + "grad_norm": 0.8937349915504456, + "learning_rate": 9.701751598287074e-05, + "loss": 2.7733, + "step": 5144 + }, + { + "epoch": 0.46573730424549653, + "grad_norm": 0.8157208561897278, + "learning_rate": 9.698579699128557e-05, + "loss": 2.7197, + "step": 5145 + }, + { + "epoch": 0.46582782655924687, + "grad_norm": 0.8707994818687439, + "learning_rate": 9.69540783032306e-05, + "loss": 2.7368, + "step": 5146 + }, + { + "epoch": 0.4659183488729972, + "grad_norm": 0.8988774418830872, + "learning_rate": 9.692235992189985e-05, + "loss": 2.783, + "step": 5147 + }, + { + "epoch": 0.46600887118674755, + "grad_norm": 0.9029574990272522, + "learning_rate": 9.68906418504874e-05, + "loss": 2.8309, + "step": 5148 + }, + { + "epoch": 0.4660993935004979, + "grad_norm": 0.846811056137085, + "learning_rate": 9.685892409218717e-05, + "loss": 2.8838, + "step": 5149 + }, + { + "epoch": 0.46618991581424823, + "grad_norm": 0.8255429863929749, + "learning_rate": 9.682720665019325e-05, + "loss": 2.7979, + "step": 5150 + }, + { + "epoch": 0.4662804381279986, + "grad_norm": 0.8471197485923767, + "learning_rate": 9.679548952769953e-05, + "loss": 2.7293, + "step": 5151 + }, + { + "epoch": 0.4663709604417489, + "grad_norm": 0.7674946188926697, + "learning_rate": 9.676377272789992e-05, + "loss": 2.7723, + "step": 5152 + }, + { + "epoch": 0.46646148275549926, + "grad_norm": 0.809223473072052, + "learning_rate": 9.673205625398827e-05, + "loss": 2.8182, + "step": 5153 + }, + { + "epoch": 0.46655200506924954, + "grad_norm": 0.9060366153717041, + "learning_rate": 9.670034010915852e-05, + "loss": 2.7695, + "step": 5154 + }, + { + "epoch": 0.4666425273829999, + "grad_norm": 0.8269774317741394, + "learning_rate": 9.66686242966044e-05, + "loss": 2.7963, + "step": 5155 + }, + { + "epoch": 0.4667330496967502, + "grad_norm": 0.8438358902931213, + "learning_rate": 9.663690881951975e-05, + "loss": 2.7485, + "step": 5156 + }, + { + "epoch": 0.46682357201050056, + "grad_norm": 0.9002498984336853, + "learning_rate": 9.660519368109823e-05, + "loss": 2.7802, + "step": 5157 + }, + { + "epoch": 0.4669140943242509, + "grad_norm": 0.8270025849342346, + "learning_rate": 9.657347888453367e-05, + "loss": 2.8007, + "step": 5158 + }, + { + "epoch": 0.46700461663800125, + "grad_norm": 0.892013669013977, + "learning_rate": 9.654176443301967e-05, + "loss": 2.8027, + "step": 5159 + }, + { + "epoch": 0.4670951389517516, + "grad_norm": 0.9057855606079102, + "learning_rate": 9.651005032974994e-05, + "loss": 2.7702, + "step": 5160 + }, + { + "epoch": 0.46718566126550193, + "grad_norm": 0.786109209060669, + "learning_rate": 9.647833657791797e-05, + "loss": 2.846, + "step": 5161 + }, + { + "epoch": 0.46727618357925227, + "grad_norm": 0.9512817859649658, + "learning_rate": 9.644662318071747e-05, + "loss": 2.8294, + "step": 5162 + }, + { + "epoch": 0.4673667058930026, + "grad_norm": 0.967814028263092, + "learning_rate": 9.641491014134189e-05, + "loss": 2.7562, + "step": 5163 + }, + { + "epoch": 0.46745722820675295, + "grad_norm": 0.8379111289978027, + "learning_rate": 9.638319746298478e-05, + "loss": 2.7692, + "step": 5164 + }, + { + "epoch": 0.4675477505205033, + "grad_norm": 0.8625251054763794, + "learning_rate": 9.635148514883956e-05, + "loss": 2.7676, + "step": 5165 + }, + { + "epoch": 0.46763827283425363, + "grad_norm": 0.921722412109375, + "learning_rate": 9.631977320209972e-05, + "loss": 2.8362, + "step": 5166 + }, + { + "epoch": 0.467728795148004, + "grad_norm": 0.9456198215484619, + "learning_rate": 9.628806162595858e-05, + "loss": 2.8482, + "step": 5167 + }, + { + "epoch": 0.4678193174617543, + "grad_norm": 0.9429085850715637, + "learning_rate": 9.625635042360957e-05, + "loss": 2.7669, + "step": 5168 + }, + { + "epoch": 0.46790983977550465, + "grad_norm": 1.4454035758972168, + "learning_rate": 9.622463959824593e-05, + "loss": 2.8269, + "step": 5169 + }, + { + "epoch": 0.468000362089255, + "grad_norm": 0.8234190940856934, + "learning_rate": 9.619292915306101e-05, + "loss": 2.8078, + "step": 5170 + }, + { + "epoch": 0.46809088440300534, + "grad_norm": 0.9734144806861877, + "learning_rate": 9.616121909124801e-05, + "loss": 2.7734, + "step": 5171 + }, + { + "epoch": 0.4681814067167557, + "grad_norm": 0.934136688709259, + "learning_rate": 9.612950941600016e-05, + "loss": 2.8085, + "step": 5172 + }, + { + "epoch": 0.468271929030506, + "grad_norm": 0.9146413207054138, + "learning_rate": 9.609780013051057e-05, + "loss": 2.7461, + "step": 5173 + }, + { + "epoch": 0.46836245134425636, + "grad_norm": 0.9391007423400879, + "learning_rate": 9.606609123797243e-05, + "loss": 2.8535, + "step": 5174 + }, + { + "epoch": 0.4684529736580067, + "grad_norm": 0.9421570301055908, + "learning_rate": 9.603438274157878e-05, + "loss": 2.8389, + "step": 5175 + }, + { + "epoch": 0.46854349597175704, + "grad_norm": 0.7810022830963135, + "learning_rate": 9.60026746445227e-05, + "loss": 2.7927, + "step": 5176 + }, + { + "epoch": 0.4686340182855074, + "grad_norm": 0.8332025408744812, + "learning_rate": 9.597096694999715e-05, + "loss": 2.769, + "step": 5177 + }, + { + "epoch": 0.4687245405992577, + "grad_norm": 0.841300368309021, + "learning_rate": 9.593925966119515e-05, + "loss": 2.8227, + "step": 5178 + }, + { + "epoch": 0.46881506291300806, + "grad_norm": 0.9895218014717102, + "learning_rate": 9.590755278130952e-05, + "loss": 2.8348, + "step": 5179 + }, + { + "epoch": 0.4689055852267584, + "grad_norm": 0.9044789671897888, + "learning_rate": 9.587584631353329e-05, + "loss": 2.8038, + "step": 5180 + }, + { + "epoch": 0.46899610754050874, + "grad_norm": 0.8770217299461365, + "learning_rate": 9.584414026105918e-05, + "loss": 2.8169, + "step": 5181 + }, + { + "epoch": 0.4690866298542591, + "grad_norm": 0.9499712586402893, + "learning_rate": 9.581243462708006e-05, + "loss": 2.8173, + "step": 5182 + }, + { + "epoch": 0.4691771521680094, + "grad_norm": 0.8335294127464294, + "learning_rate": 9.578072941478861e-05, + "loss": 2.7457, + "step": 5183 + }, + { + "epoch": 0.46926767448175977, + "grad_norm": 0.9775024056434631, + "learning_rate": 9.574902462737765e-05, + "loss": 2.8145, + "step": 5184 + }, + { + "epoch": 0.4693581967955101, + "grad_norm": 0.9454213380813599, + "learning_rate": 9.571732026803977e-05, + "loss": 2.766, + "step": 5185 + }, + { + "epoch": 0.46944871910926045, + "grad_norm": 0.9457453489303589, + "learning_rate": 9.568561633996763e-05, + "loss": 2.8457, + "step": 5186 + }, + { + "epoch": 0.4695392414230108, + "grad_norm": 0.8910146355628967, + "learning_rate": 9.565391284635378e-05, + "loss": 2.8096, + "step": 5187 + }, + { + "epoch": 0.46962976373676113, + "grad_norm": 0.8416481614112854, + "learning_rate": 9.562220979039082e-05, + "loss": 2.8409, + "step": 5188 + }, + { + "epoch": 0.46972028605051147, + "grad_norm": 0.9077867865562439, + "learning_rate": 9.559050717527119e-05, + "loss": 2.7308, + "step": 5189 + }, + { + "epoch": 0.4698108083642618, + "grad_norm": 0.8774727582931519, + "learning_rate": 9.55588050041874e-05, + "loss": 2.7031, + "step": 5190 + }, + { + "epoch": 0.46990133067801215, + "grad_norm": 0.9110966324806213, + "learning_rate": 9.552710328033176e-05, + "loss": 2.7335, + "step": 5191 + }, + { + "epoch": 0.4699918529917625, + "grad_norm": 0.9437413811683655, + "learning_rate": 9.549540200689676e-05, + "loss": 2.8008, + "step": 5192 + }, + { + "epoch": 0.47008237530551283, + "grad_norm": 0.9005468487739563, + "learning_rate": 9.546370118707463e-05, + "loss": 2.8006, + "step": 5193 + }, + { + "epoch": 0.4701728976192632, + "grad_norm": 0.9098567962646484, + "learning_rate": 9.543200082405768e-05, + "loss": 2.8465, + "step": 5194 + }, + { + "epoch": 0.47026341993301346, + "grad_norm": 1.0034912824630737, + "learning_rate": 9.540030092103809e-05, + "loss": 2.8063, + "step": 5195 + }, + { + "epoch": 0.4703539422467638, + "grad_norm": 0.873590350151062, + "learning_rate": 9.536860148120811e-05, + "loss": 2.7531, + "step": 5196 + }, + { + "epoch": 0.47044446456051414, + "grad_norm": 0.8259015679359436, + "learning_rate": 9.53369025077598e-05, + "loss": 2.8364, + "step": 5197 + }, + { + "epoch": 0.4705349868742645, + "grad_norm": 0.9385213851928711, + "learning_rate": 9.530520400388531e-05, + "loss": 2.7638, + "step": 5198 + }, + { + "epoch": 0.4706255091880148, + "grad_norm": 0.8609074354171753, + "learning_rate": 9.527350597277661e-05, + "loss": 2.7279, + "step": 5199 + }, + { + "epoch": 0.47071603150176516, + "grad_norm": 0.7989048957824707, + "learning_rate": 9.524180841762577e-05, + "loss": 2.7767, + "step": 5200 + }, + { + "epoch": 0.4708065538155155, + "grad_norm": 0.8445473313331604, + "learning_rate": 9.521011134162467e-05, + "loss": 2.7703, + "step": 5201 + }, + { + "epoch": 0.47089707612926585, + "grad_norm": 0.9142276048660278, + "learning_rate": 9.517841474796526e-05, + "loss": 2.7968, + "step": 5202 + }, + { + "epoch": 0.4709875984430162, + "grad_norm": 0.8804418444633484, + "learning_rate": 9.51467186398393e-05, + "loss": 2.7839, + "step": 5203 + }, + { + "epoch": 0.4710781207567665, + "grad_norm": 0.9042705297470093, + "learning_rate": 9.511502302043868e-05, + "loss": 2.7856, + "step": 5204 + }, + { + "epoch": 0.47116864307051687, + "grad_norm": 0.8807315826416016, + "learning_rate": 9.508332789295508e-05, + "loss": 2.8576, + "step": 5205 + }, + { + "epoch": 0.4712591653842672, + "grad_norm": 0.9314857721328735, + "learning_rate": 9.505163326058027e-05, + "loss": 2.7471, + "step": 5206 + }, + { + "epoch": 0.47134968769801755, + "grad_norm": 0.8590176105499268, + "learning_rate": 9.50199391265058e-05, + "loss": 2.8074, + "step": 5207 + }, + { + "epoch": 0.4714402100117679, + "grad_norm": 0.8539712429046631, + "learning_rate": 9.498824549392337e-05, + "loss": 2.7981, + "step": 5208 + }, + { + "epoch": 0.47153073232551823, + "grad_norm": 0.8064674139022827, + "learning_rate": 9.495655236602444e-05, + "loss": 2.7815, + "step": 5209 + }, + { + "epoch": 0.4716212546392686, + "grad_norm": 0.8869786262512207, + "learning_rate": 9.492485974600059e-05, + "loss": 2.8347, + "step": 5210 + }, + { + "epoch": 0.4717117769530189, + "grad_norm": 0.8497887253761292, + "learning_rate": 9.48931676370432e-05, + "loss": 2.7493, + "step": 5211 + }, + { + "epoch": 0.47180229926676925, + "grad_norm": 0.8447049856185913, + "learning_rate": 9.486147604234371e-05, + "loss": 2.7629, + "step": 5212 + }, + { + "epoch": 0.4718928215805196, + "grad_norm": 0.8657582402229309, + "learning_rate": 9.482978496509341e-05, + "loss": 2.7943, + "step": 5213 + }, + { + "epoch": 0.47198334389426994, + "grad_norm": 0.8378434777259827, + "learning_rate": 9.479809440848364e-05, + "loss": 2.7565, + "step": 5214 + }, + { + "epoch": 0.4720738662080203, + "grad_norm": 0.8710640668869019, + "learning_rate": 9.476640437570562e-05, + "loss": 2.8118, + "step": 5215 + }, + { + "epoch": 0.4721643885217706, + "grad_norm": 0.866980254650116, + "learning_rate": 9.473471486995055e-05, + "loss": 2.794, + "step": 5216 + }, + { + "epoch": 0.47225491083552096, + "grad_norm": 0.9196212887763977, + "learning_rate": 9.470302589440952e-05, + "loss": 2.8236, + "step": 5217 + }, + { + "epoch": 0.4723454331492713, + "grad_norm": 0.9319663047790527, + "learning_rate": 9.467133745227366e-05, + "loss": 2.7992, + "step": 5218 + }, + { + "epoch": 0.47243595546302164, + "grad_norm": 0.8404010534286499, + "learning_rate": 9.463964954673396e-05, + "loss": 2.8101, + "step": 5219 + }, + { + "epoch": 0.472526477776772, + "grad_norm": 0.9603475332260132, + "learning_rate": 9.460796218098143e-05, + "loss": 2.7364, + "step": 5220 + }, + { + "epoch": 0.4726170000905223, + "grad_norm": 0.8324125409126282, + "learning_rate": 9.457627535820695e-05, + "loss": 2.7999, + "step": 5221 + }, + { + "epoch": 0.47270752240427266, + "grad_norm": 0.9238003492355347, + "learning_rate": 9.454458908160135e-05, + "loss": 2.7564, + "step": 5222 + }, + { + "epoch": 0.472798044718023, + "grad_norm": 0.866795003414154, + "learning_rate": 9.451290335435553e-05, + "loss": 2.7623, + "step": 5223 + }, + { + "epoch": 0.47288856703177334, + "grad_norm": 0.8267090320587158, + "learning_rate": 9.448121817966018e-05, + "loss": 2.7862, + "step": 5224 + }, + { + "epoch": 0.4729790893455237, + "grad_norm": 0.8153204917907715, + "learning_rate": 9.444953356070601e-05, + "loss": 2.7832, + "step": 5225 + }, + { + "epoch": 0.473069611659274, + "grad_norm": 0.8457561731338501, + "learning_rate": 9.441784950068362e-05, + "loss": 2.8002, + "step": 5226 + }, + { + "epoch": 0.47316013397302437, + "grad_norm": 0.8408893346786499, + "learning_rate": 9.438616600278368e-05, + "loss": 2.7474, + "step": 5227 + }, + { + "epoch": 0.4732506562867747, + "grad_norm": 0.9174901843070984, + "learning_rate": 9.435448307019665e-05, + "loss": 2.828, + "step": 5228 + }, + { + "epoch": 0.47334117860052505, + "grad_norm": 0.8532944917678833, + "learning_rate": 9.432280070611305e-05, + "loss": 2.7933, + "step": 5229 + }, + { + "epoch": 0.4734317009142754, + "grad_norm": 0.8048722743988037, + "learning_rate": 9.42911189137232e-05, + "loss": 2.7738, + "step": 5230 + }, + { + "epoch": 0.47352222322802573, + "grad_norm": 0.8400250673294067, + "learning_rate": 9.425943769621757e-05, + "loss": 2.784, + "step": 5231 + }, + { + "epoch": 0.47361274554177607, + "grad_norm": 0.8464124798774719, + "learning_rate": 9.422775705678639e-05, + "loss": 2.8168, + "step": 5232 + }, + { + "epoch": 0.4737032678555264, + "grad_norm": 0.8276224136352539, + "learning_rate": 9.419607699861994e-05, + "loss": 2.8354, + "step": 5233 + }, + { + "epoch": 0.47379379016927675, + "grad_norm": 0.8101887702941895, + "learning_rate": 9.416439752490834e-05, + "loss": 2.7846, + "step": 5234 + }, + { + "epoch": 0.4738843124830271, + "grad_norm": 0.863121747970581, + "learning_rate": 9.413271863884178e-05, + "loss": 2.8169, + "step": 5235 + }, + { + "epoch": 0.47397483479677743, + "grad_norm": 0.89903724193573, + "learning_rate": 9.410104034361027e-05, + "loss": 2.7933, + "step": 5236 + }, + { + "epoch": 0.4740653571105277, + "grad_norm": 0.8720853924751282, + "learning_rate": 9.406936264240386e-05, + "loss": 2.8064, + "step": 5237 + }, + { + "epoch": 0.47415587942427806, + "grad_norm": 0.9143301844596863, + "learning_rate": 9.403768553841244e-05, + "loss": 2.8193, + "step": 5238 + }, + { + "epoch": 0.4742464017380284, + "grad_norm": 0.8418611288070679, + "learning_rate": 9.400600903482597e-05, + "loss": 2.7703, + "step": 5239 + }, + { + "epoch": 0.47433692405177874, + "grad_norm": 0.8874722719192505, + "learning_rate": 9.397433313483416e-05, + "loss": 2.8001, + "step": 5240 + }, + { + "epoch": 0.4744274463655291, + "grad_norm": 0.9456201195716858, + "learning_rate": 9.394265784162689e-05, + "loss": 2.7166, + "step": 5241 + }, + { + "epoch": 0.4745179686792794, + "grad_norm": 0.8349551558494568, + "learning_rate": 9.39109831583938e-05, + "loss": 2.8146, + "step": 5242 + }, + { + "epoch": 0.47460849099302976, + "grad_norm": 0.8971189856529236, + "learning_rate": 9.387930908832454e-05, + "loss": 2.7563, + "step": 5243 + }, + { + "epoch": 0.4746990133067801, + "grad_norm": 0.8368891477584839, + "learning_rate": 9.384763563460865e-05, + "loss": 2.8169, + "step": 5244 + }, + { + "epoch": 0.47478953562053045, + "grad_norm": 0.7798503637313843, + "learning_rate": 9.381596280043573e-05, + "loss": 2.7759, + "step": 5245 + }, + { + "epoch": 0.4748800579342808, + "grad_norm": 0.8371593356132507, + "learning_rate": 9.378429058899516e-05, + "loss": 2.7602, + "step": 5246 + }, + { + "epoch": 0.4749705802480311, + "grad_norm": 0.9390305280685425, + "learning_rate": 9.37526190034764e-05, + "loss": 2.7563, + "step": 5247 + }, + { + "epoch": 0.47506110256178147, + "grad_norm": 0.8491203784942627, + "learning_rate": 9.372094804706867e-05, + "loss": 2.8111, + "step": 5248 + }, + { + "epoch": 0.4751516248755318, + "grad_norm": 0.9118536114692688, + "learning_rate": 9.368927772296134e-05, + "loss": 2.764, + "step": 5249 + }, + { + "epoch": 0.47524214718928215, + "grad_norm": 0.9842036962509155, + "learning_rate": 9.365760803434355e-05, + "loss": 2.8288, + "step": 5250 + }, + { + "epoch": 0.4753326695030325, + "grad_norm": 1.0022794008255005, + "learning_rate": 9.36259389844045e-05, + "loss": 2.8153, + "step": 5251 + }, + { + "epoch": 0.47542319181678283, + "grad_norm": 0.8196097016334534, + "learning_rate": 9.359427057633316e-05, + "loss": 2.8021, + "step": 5252 + }, + { + "epoch": 0.47551371413053317, + "grad_norm": 0.839787483215332, + "learning_rate": 9.356260281331864e-05, + "loss": 2.7805, + "step": 5253 + }, + { + "epoch": 0.4756042364442835, + "grad_norm": 0.8965675234794617, + "learning_rate": 9.353093569854983e-05, + "loss": 2.7889, + "step": 5254 + }, + { + "epoch": 0.47569475875803385, + "grad_norm": 0.8917959928512573, + "learning_rate": 9.349926923521563e-05, + "loss": 2.8078, + "step": 5255 + }, + { + "epoch": 0.4757852810717842, + "grad_norm": 0.8598231077194214, + "learning_rate": 9.34676034265048e-05, + "loss": 2.7312, + "step": 5256 + }, + { + "epoch": 0.47587580338553453, + "grad_norm": 0.8513514995574951, + "learning_rate": 9.343593827560617e-05, + "loss": 2.787, + "step": 5257 + }, + { + "epoch": 0.4759663256992849, + "grad_norm": 0.81255704164505, + "learning_rate": 9.340427378570835e-05, + "loss": 2.7823, + "step": 5258 + }, + { + "epoch": 0.4760568480130352, + "grad_norm": 0.8444796800613403, + "learning_rate": 9.337260996000002e-05, + "loss": 2.7665, + "step": 5259 + }, + { + "epoch": 0.47614737032678556, + "grad_norm": 0.9492307901382446, + "learning_rate": 9.334094680166962e-05, + "loss": 2.7264, + "step": 5260 + }, + { + "epoch": 0.4762378926405359, + "grad_norm": 0.809389591217041, + "learning_rate": 9.330928431390575e-05, + "loss": 2.7932, + "step": 5261 + }, + { + "epoch": 0.47632841495428624, + "grad_norm": 0.8459622859954834, + "learning_rate": 9.327762249989673e-05, + "loss": 2.765, + "step": 5262 + }, + { + "epoch": 0.4764189372680366, + "grad_norm": 0.8855199217796326, + "learning_rate": 9.324596136283097e-05, + "loss": 2.7769, + "step": 5263 + }, + { + "epoch": 0.4765094595817869, + "grad_norm": 0.8350811004638672, + "learning_rate": 9.321430090589668e-05, + "loss": 2.736, + "step": 5264 + }, + { + "epoch": 0.47659998189553726, + "grad_norm": 0.8843063116073608, + "learning_rate": 9.318264113228214e-05, + "loss": 2.7474, + "step": 5265 + }, + { + "epoch": 0.4766905042092876, + "grad_norm": 0.8612480759620667, + "learning_rate": 9.315098204517543e-05, + "loss": 2.8543, + "step": 5266 + }, + { + "epoch": 0.47678102652303794, + "grad_norm": 0.8599689602851868, + "learning_rate": 9.311932364776466e-05, + "loss": 2.8309, + "step": 5267 + }, + { + "epoch": 0.4768715488367883, + "grad_norm": 0.7801754474639893, + "learning_rate": 9.308766594323778e-05, + "loss": 2.7796, + "step": 5268 + }, + { + "epoch": 0.4769620711505386, + "grad_norm": 0.853583037853241, + "learning_rate": 9.30560089347828e-05, + "loss": 2.7741, + "step": 5269 + }, + { + "epoch": 0.47705259346428897, + "grad_norm": 0.9441990852355957, + "learning_rate": 9.302435262558747e-05, + "loss": 2.7768, + "step": 5270 + }, + { + "epoch": 0.4771431157780393, + "grad_norm": 0.9938909411430359, + "learning_rate": 9.299269701883968e-05, + "loss": 2.8211, + "step": 5271 + }, + { + "epoch": 0.47723363809178965, + "grad_norm": 0.8555877804756165, + "learning_rate": 9.296104211772709e-05, + "loss": 2.7621, + "step": 5272 + }, + { + "epoch": 0.47732416040554, + "grad_norm": 0.938060998916626, + "learning_rate": 9.29293879254374e-05, + "loss": 2.7601, + "step": 5273 + }, + { + "epoch": 0.47741468271929033, + "grad_norm": 0.8729612827301025, + "learning_rate": 9.28977344451581e-05, + "loss": 2.7921, + "step": 5274 + }, + { + "epoch": 0.47750520503304067, + "grad_norm": 0.87042635679245, + "learning_rate": 9.286608168007678e-05, + "loss": 2.8061, + "step": 5275 + }, + { + "epoch": 0.477595727346791, + "grad_norm": 0.8272691965103149, + "learning_rate": 9.283442963338083e-05, + "loss": 2.7974, + "step": 5276 + }, + { + "epoch": 0.47768624966054135, + "grad_norm": 0.826310932636261, + "learning_rate": 9.280277830825763e-05, + "loss": 2.7249, + "step": 5277 + }, + { + "epoch": 0.47777677197429164, + "grad_norm": 0.8631619215011597, + "learning_rate": 9.277112770789443e-05, + "loss": 2.7685, + "step": 5278 + }, + { + "epoch": 0.477867294288042, + "grad_norm": 0.8807775974273682, + "learning_rate": 9.273947783547851e-05, + "loss": 2.8475, + "step": 5279 + }, + { + "epoch": 0.4779578166017923, + "grad_norm": 0.8230133056640625, + "learning_rate": 9.270782869419694e-05, + "loss": 2.7196, + "step": 5280 + }, + { + "epoch": 0.47804833891554266, + "grad_norm": 0.9338566064834595, + "learning_rate": 9.267618028723686e-05, + "loss": 2.8041, + "step": 5281 + }, + { + "epoch": 0.478138861229293, + "grad_norm": 0.8943211436271667, + "learning_rate": 9.264453261778517e-05, + "loss": 2.7414, + "step": 5282 + }, + { + "epoch": 0.47822938354304334, + "grad_norm": 0.8780472278594971, + "learning_rate": 9.261288568902889e-05, + "loss": 2.829, + "step": 5283 + }, + { + "epoch": 0.4783199058567937, + "grad_norm": 0.8478779792785645, + "learning_rate": 9.258123950415479e-05, + "loss": 2.8234, + "step": 5284 + }, + { + "epoch": 0.478410428170544, + "grad_norm": 0.841843843460083, + "learning_rate": 9.25495940663497e-05, + "loss": 2.7386, + "step": 5285 + }, + { + "epoch": 0.47850095048429436, + "grad_norm": 1.0312539339065552, + "learning_rate": 9.251794937880023e-05, + "loss": 2.8354, + "step": 5286 + }, + { + "epoch": 0.4785914727980447, + "grad_norm": 0.8610813617706299, + "learning_rate": 9.24863054446931e-05, + "loss": 2.769, + "step": 5287 + }, + { + "epoch": 0.47868199511179504, + "grad_norm": 0.8196991086006165, + "learning_rate": 9.24546622672148e-05, + "loss": 2.7291, + "step": 5288 + }, + { + "epoch": 0.4787725174255454, + "grad_norm": 0.8547167181968689, + "learning_rate": 9.242301984955182e-05, + "loss": 2.7811, + "step": 5289 + }, + { + "epoch": 0.4788630397392957, + "grad_norm": 0.8360316753387451, + "learning_rate": 9.239137819489047e-05, + "loss": 2.795, + "step": 5290 + }, + { + "epoch": 0.47895356205304607, + "grad_norm": 0.831993579864502, + "learning_rate": 9.235973730641719e-05, + "loss": 2.7939, + "step": 5291 + }, + { + "epoch": 0.4790440843667964, + "grad_norm": 0.8065147995948792, + "learning_rate": 9.232809718731814e-05, + "loss": 2.8107, + "step": 5292 + }, + { + "epoch": 0.47913460668054675, + "grad_norm": 0.8437579274177551, + "learning_rate": 9.229645784077949e-05, + "loss": 2.8158, + "step": 5293 + }, + { + "epoch": 0.4792251289942971, + "grad_norm": 0.876053512096405, + "learning_rate": 9.22648192699873e-05, + "loss": 2.8033, + "step": 5294 + }, + { + "epoch": 0.47931565130804743, + "grad_norm": 0.8478907346725464, + "learning_rate": 9.223318147812763e-05, + "loss": 2.8328, + "step": 5295 + }, + { + "epoch": 0.47940617362179777, + "grad_norm": 0.818863570690155, + "learning_rate": 9.220154446838637e-05, + "loss": 2.7338, + "step": 5296 + }, + { + "epoch": 0.4794966959355481, + "grad_norm": 0.8235446810722351, + "learning_rate": 9.216990824394937e-05, + "loss": 2.8377, + "step": 5297 + }, + { + "epoch": 0.47958721824929845, + "grad_norm": 0.9009406566619873, + "learning_rate": 9.213827280800236e-05, + "loss": 2.7701, + "step": 5298 + }, + { + "epoch": 0.4796777405630488, + "grad_norm": 0.8477178812026978, + "learning_rate": 9.21066381637311e-05, + "loss": 2.773, + "step": 5299 + }, + { + "epoch": 0.47976826287679913, + "grad_norm": 0.8886308073997498, + "learning_rate": 9.207500431432115e-05, + "loss": 2.7909, + "step": 5300 + }, + { + "epoch": 0.4798587851905495, + "grad_norm": 0.8153219223022461, + "learning_rate": 9.204337126295807e-05, + "loss": 2.7347, + "step": 5301 + }, + { + "epoch": 0.4799493075042998, + "grad_norm": 0.8049747347831726, + "learning_rate": 9.201173901282724e-05, + "loss": 2.7898, + "step": 5302 + }, + { + "epoch": 0.48003982981805016, + "grad_norm": 0.8145269155502319, + "learning_rate": 9.198010756711412e-05, + "loss": 2.7629, + "step": 5303 + }, + { + "epoch": 0.4801303521318005, + "grad_norm": 0.909950315952301, + "learning_rate": 9.194847692900391e-05, + "loss": 2.8089, + "step": 5304 + }, + { + "epoch": 0.48022087444555084, + "grad_norm": 0.8205299377441406, + "learning_rate": 9.191684710168188e-05, + "loss": 2.7675, + "step": 5305 + }, + { + "epoch": 0.4803113967593012, + "grad_norm": 0.9104087352752686, + "learning_rate": 9.188521808833311e-05, + "loss": 2.824, + "step": 5306 + }, + { + "epoch": 0.4804019190730515, + "grad_norm": 0.956626296043396, + "learning_rate": 9.185358989214268e-05, + "loss": 2.7594, + "step": 5307 + }, + { + "epoch": 0.48049244138680186, + "grad_norm": 0.8074916005134583, + "learning_rate": 9.182196251629552e-05, + "loss": 2.8271, + "step": 5308 + }, + { + "epoch": 0.4805829637005522, + "grad_norm": 0.8834424018859863, + "learning_rate": 9.179033596397647e-05, + "loss": 2.8061, + "step": 5309 + }, + { + "epoch": 0.48067348601430254, + "grad_norm": 0.8030386567115784, + "learning_rate": 9.175871023837042e-05, + "loss": 2.7642, + "step": 5310 + }, + { + "epoch": 0.4807640083280529, + "grad_norm": 0.8542213439941406, + "learning_rate": 9.172708534266197e-05, + "loss": 2.8134, + "step": 5311 + }, + { + "epoch": 0.4808545306418032, + "grad_norm": 0.8158377408981323, + "learning_rate": 9.169546128003585e-05, + "loss": 2.7797, + "step": 5312 + }, + { + "epoch": 0.48094505295555356, + "grad_norm": 0.7623111605644226, + "learning_rate": 9.16638380536765e-05, + "loss": 2.7451, + "step": 5313 + }, + { + "epoch": 0.4810355752693039, + "grad_norm": 0.8422583937644958, + "learning_rate": 9.163221566676847e-05, + "loss": 2.8154, + "step": 5314 + }, + { + "epoch": 0.48112609758305425, + "grad_norm": 0.9185038805007935, + "learning_rate": 9.160059412249608e-05, + "loss": 2.7935, + "step": 5315 + }, + { + "epoch": 0.4812166198968046, + "grad_norm": 0.8019948601722717, + "learning_rate": 9.156897342404364e-05, + "loss": 2.7404, + "step": 5316 + }, + { + "epoch": 0.48130714221055493, + "grad_norm": 0.8030464053153992, + "learning_rate": 9.15373535745953e-05, + "loss": 2.7628, + "step": 5317 + }, + { + "epoch": 0.48139766452430527, + "grad_norm": 0.8811296820640564, + "learning_rate": 9.150573457733529e-05, + "loss": 2.8164, + "step": 5318 + }, + { + "epoch": 0.48148818683805555, + "grad_norm": 0.8141608834266663, + "learning_rate": 9.147411643544755e-05, + "loss": 2.826, + "step": 5319 + }, + { + "epoch": 0.4815787091518059, + "grad_norm": 0.8477288484573364, + "learning_rate": 9.144249915211605e-05, + "loss": 2.7852, + "step": 5320 + }, + { + "epoch": 0.48166923146555624, + "grad_norm": 0.811499297618866, + "learning_rate": 9.141088273052463e-05, + "loss": 2.7776, + "step": 5321 + }, + { + "epoch": 0.4817597537793066, + "grad_norm": 0.8425334095954895, + "learning_rate": 9.137926717385711e-05, + "loss": 2.797, + "step": 5322 + }, + { + "epoch": 0.4818502760930569, + "grad_norm": 0.9187545776367188, + "learning_rate": 9.134765248529715e-05, + "loss": 2.7419, + "step": 5323 + }, + { + "epoch": 0.48194079840680726, + "grad_norm": 0.8210468888282776, + "learning_rate": 9.131603866802835e-05, + "loss": 2.7954, + "step": 5324 + }, + { + "epoch": 0.4820313207205576, + "grad_norm": 0.844838559627533, + "learning_rate": 9.128442572523417e-05, + "loss": 2.7602, + "step": 5325 + }, + { + "epoch": 0.48212184303430794, + "grad_norm": 0.8497430086135864, + "learning_rate": 9.125281366009815e-05, + "loss": 2.7525, + "step": 5326 + }, + { + "epoch": 0.4822123653480583, + "grad_norm": 0.8585840463638306, + "learning_rate": 9.122120247580351e-05, + "loss": 2.803, + "step": 5327 + }, + { + "epoch": 0.4823028876618086, + "grad_norm": 0.9494702219963074, + "learning_rate": 9.118959217553358e-05, + "loss": 2.7816, + "step": 5328 + }, + { + "epoch": 0.48239340997555896, + "grad_norm": 0.9262629151344299, + "learning_rate": 9.115798276247143e-05, + "loss": 2.7696, + "step": 5329 + }, + { + "epoch": 0.4824839322893093, + "grad_norm": 0.8653173446655273, + "learning_rate": 9.112637423980021e-05, + "loss": 2.8015, + "step": 5330 + }, + { + "epoch": 0.48257445460305964, + "grad_norm": 0.841497540473938, + "learning_rate": 9.109476661070285e-05, + "loss": 2.7352, + "step": 5331 + }, + { + "epoch": 0.48266497691681, + "grad_norm": 0.8958382606506348, + "learning_rate": 9.106315987836225e-05, + "loss": 2.7476, + "step": 5332 + }, + { + "epoch": 0.4827554992305603, + "grad_norm": 0.8004329204559326, + "learning_rate": 9.10315540459612e-05, + "loss": 2.7025, + "step": 5333 + }, + { + "epoch": 0.48284602154431067, + "grad_norm": 0.8242617845535278, + "learning_rate": 9.099994911668242e-05, + "loss": 2.7859, + "step": 5334 + }, + { + "epoch": 0.482936543858061, + "grad_norm": 0.8296079635620117, + "learning_rate": 9.096834509370849e-05, + "loss": 2.7148, + "step": 5335 + }, + { + "epoch": 0.48302706617181135, + "grad_norm": 0.8059534430503845, + "learning_rate": 9.093674198022201e-05, + "loss": 2.7773, + "step": 5336 + }, + { + "epoch": 0.4831175884855617, + "grad_norm": 0.8140705227851868, + "learning_rate": 9.090513977940532e-05, + "loss": 2.7571, + "step": 5337 + }, + { + "epoch": 0.48320811079931203, + "grad_norm": 0.9303644299507141, + "learning_rate": 9.087353849444085e-05, + "loss": 2.839, + "step": 5338 + }, + { + "epoch": 0.48329863311306237, + "grad_norm": 0.8803795576095581, + "learning_rate": 9.084193812851075e-05, + "loss": 2.8175, + "step": 5339 + }, + { + "epoch": 0.4833891554268127, + "grad_norm": 0.8591445088386536, + "learning_rate": 9.081033868479727e-05, + "loss": 2.7367, + "step": 5340 + }, + { + "epoch": 0.48347967774056305, + "grad_norm": 0.819056510925293, + "learning_rate": 9.077874016648243e-05, + "loss": 2.784, + "step": 5341 + }, + { + "epoch": 0.4835702000543134, + "grad_norm": 0.8365514874458313, + "learning_rate": 9.07471425767482e-05, + "loss": 2.7453, + "step": 5342 + }, + { + "epoch": 0.48366072236806373, + "grad_norm": 0.8705887794494629, + "learning_rate": 9.071554591877643e-05, + "loss": 2.836, + "step": 5343 + }, + { + "epoch": 0.4837512446818141, + "grad_norm": 0.8518171310424805, + "learning_rate": 9.068395019574897e-05, + "loss": 2.827, + "step": 5344 + }, + { + "epoch": 0.4838417669955644, + "grad_norm": 0.8364830613136292, + "learning_rate": 9.065235541084744e-05, + "loss": 2.7149, + "step": 5345 + }, + { + "epoch": 0.48393228930931476, + "grad_norm": 0.8858799338340759, + "learning_rate": 9.06207615672535e-05, + "loss": 2.7244, + "step": 5346 + }, + { + "epoch": 0.4840228116230651, + "grad_norm": 0.8261039853096008, + "learning_rate": 9.058916866814858e-05, + "loss": 2.7948, + "step": 5347 + }, + { + "epoch": 0.48411333393681544, + "grad_norm": 0.8314043283462524, + "learning_rate": 9.055757671671413e-05, + "loss": 2.729, + "step": 5348 + }, + { + "epoch": 0.4842038562505658, + "grad_norm": 0.8006643652915955, + "learning_rate": 9.052598571613144e-05, + "loss": 2.7492, + "step": 5349 + }, + { + "epoch": 0.4842943785643161, + "grad_norm": 0.8151100277900696, + "learning_rate": 9.049439566958175e-05, + "loss": 2.7673, + "step": 5350 + }, + { + "epoch": 0.48438490087806646, + "grad_norm": 0.9020787477493286, + "learning_rate": 9.046280658024612e-05, + "loss": 2.7503, + "step": 5351 + }, + { + "epoch": 0.4844754231918168, + "grad_norm": 0.8793193101882935, + "learning_rate": 9.043121845130564e-05, + "loss": 2.7668, + "step": 5352 + }, + { + "epoch": 0.48456594550556714, + "grad_norm": 0.7916908860206604, + "learning_rate": 9.039963128594119e-05, + "loss": 2.7605, + "step": 5353 + }, + { + "epoch": 0.4846564678193175, + "grad_norm": 0.8754737377166748, + "learning_rate": 9.036804508733362e-05, + "loss": 2.7812, + "step": 5354 + }, + { + "epoch": 0.4847469901330678, + "grad_norm": 0.9203606843948364, + "learning_rate": 9.03364598586636e-05, + "loss": 2.7841, + "step": 5355 + }, + { + "epoch": 0.48483751244681816, + "grad_norm": 0.8462159633636475, + "learning_rate": 9.030487560311186e-05, + "loss": 2.784, + "step": 5356 + }, + { + "epoch": 0.4849280347605685, + "grad_norm": 0.8361531496047974, + "learning_rate": 9.027329232385887e-05, + "loss": 2.7941, + "step": 5357 + }, + { + "epoch": 0.48501855707431885, + "grad_norm": 0.8178485035896301, + "learning_rate": 9.024171002408506e-05, + "loss": 2.733, + "step": 5358 + }, + { + "epoch": 0.4851090793880692, + "grad_norm": 0.9467344284057617, + "learning_rate": 9.021012870697077e-05, + "loss": 2.8265, + "step": 5359 + }, + { + "epoch": 0.48519960170181947, + "grad_norm": 0.8394115567207336, + "learning_rate": 9.01785483756963e-05, + "loss": 2.8171, + "step": 5360 + }, + { + "epoch": 0.4852901240155698, + "grad_norm": 0.869340717792511, + "learning_rate": 9.014696903344169e-05, + "loss": 2.7675, + "step": 5361 + }, + { + "epoch": 0.48538064632932015, + "grad_norm": 0.8268486261367798, + "learning_rate": 9.011539068338708e-05, + "loss": 2.8036, + "step": 5362 + }, + { + "epoch": 0.4854711686430705, + "grad_norm": 0.7936989665031433, + "learning_rate": 9.008381332871229e-05, + "loss": 2.7107, + "step": 5363 + }, + { + "epoch": 0.48556169095682084, + "grad_norm": 0.849716067314148, + "learning_rate": 9.005223697259732e-05, + "loss": 2.7853, + "step": 5364 + }, + { + "epoch": 0.4856522132705712, + "grad_norm": 0.8900785446166992, + "learning_rate": 9.002066161822172e-05, + "loss": 2.7911, + "step": 5365 + }, + { + "epoch": 0.4857427355843215, + "grad_norm": 0.7993195652961731, + "learning_rate": 8.998908726876529e-05, + "loss": 2.7532, + "step": 5366 + }, + { + "epoch": 0.48583325789807186, + "grad_norm": 0.9737759828567505, + "learning_rate": 8.995751392740746e-05, + "loss": 2.7393, + "step": 5367 + }, + { + "epoch": 0.4859237802118222, + "grad_norm": 0.8641330599784851, + "learning_rate": 8.992594159732774e-05, + "loss": 2.7719, + "step": 5368 + }, + { + "epoch": 0.48601430252557254, + "grad_norm": 0.8608652353286743, + "learning_rate": 8.989437028170537e-05, + "loss": 2.7101, + "step": 5369 + }, + { + "epoch": 0.4861048248393229, + "grad_norm": 0.9764891862869263, + "learning_rate": 8.986279998371966e-05, + "loss": 2.7968, + "step": 5370 + }, + { + "epoch": 0.4861953471530732, + "grad_norm": 0.9630980491638184, + "learning_rate": 8.983123070654973e-05, + "loss": 2.7807, + "step": 5371 + }, + { + "epoch": 0.48628586946682356, + "grad_norm": 0.8285449147224426, + "learning_rate": 8.979966245337458e-05, + "loss": 2.7832, + "step": 5372 + }, + { + "epoch": 0.4863763917805739, + "grad_norm": 0.8406493067741394, + "learning_rate": 8.976809522737313e-05, + "loss": 2.8093, + "step": 5373 + }, + { + "epoch": 0.48646691409432424, + "grad_norm": 0.9877937436103821, + "learning_rate": 8.973652903172423e-05, + "loss": 2.7104, + "step": 5374 + }, + { + "epoch": 0.4865574364080746, + "grad_norm": 0.8676995038986206, + "learning_rate": 8.970496386960656e-05, + "loss": 2.8185, + "step": 5375 + }, + { + "epoch": 0.4866479587218249, + "grad_norm": 0.8179798126220703, + "learning_rate": 8.967339974419877e-05, + "loss": 2.812, + "step": 5376 + }, + { + "epoch": 0.48673848103557527, + "grad_norm": 0.8636631965637207, + "learning_rate": 8.96418366586793e-05, + "loss": 2.7825, + "step": 5377 + }, + { + "epoch": 0.4868290033493256, + "grad_norm": 0.9385567307472229, + "learning_rate": 8.961027461622664e-05, + "loss": 2.7539, + "step": 5378 + }, + { + "epoch": 0.48691952566307595, + "grad_norm": 0.8954267501831055, + "learning_rate": 8.957871362001902e-05, + "loss": 2.7582, + "step": 5379 + }, + { + "epoch": 0.4870100479768263, + "grad_norm": 0.869686484336853, + "learning_rate": 8.954715367323468e-05, + "loss": 2.8182, + "step": 5380 + }, + { + "epoch": 0.48710057029057663, + "grad_norm": 0.8564013838768005, + "learning_rate": 8.951559477905161e-05, + "loss": 2.7847, + "step": 5381 + }, + { + "epoch": 0.48719109260432697, + "grad_norm": 0.8536160588264465, + "learning_rate": 8.948403694064793e-05, + "loss": 2.7588, + "step": 5382 + }, + { + "epoch": 0.4872816149180773, + "grad_norm": 0.8505567312240601, + "learning_rate": 8.945248016120142e-05, + "loss": 2.7828, + "step": 5383 + }, + { + "epoch": 0.48737213723182765, + "grad_norm": 0.9134613275527954, + "learning_rate": 8.942092444388988e-05, + "loss": 2.7442, + "step": 5384 + }, + { + "epoch": 0.487462659545578, + "grad_norm": 0.9947112798690796, + "learning_rate": 8.938936979189091e-05, + "loss": 2.8192, + "step": 5385 + }, + { + "epoch": 0.48755318185932833, + "grad_norm": 0.7951664328575134, + "learning_rate": 8.935781620838216e-05, + "loss": 2.7398, + "step": 5386 + }, + { + "epoch": 0.4876437041730787, + "grad_norm": 0.8587552905082703, + "learning_rate": 8.932626369654101e-05, + "loss": 2.7777, + "step": 5387 + }, + { + "epoch": 0.487734226486829, + "grad_norm": 0.9508317708969116, + "learning_rate": 8.929471225954484e-05, + "loss": 2.7739, + "step": 5388 + }, + { + "epoch": 0.48782474880057936, + "grad_norm": 0.8943755030632019, + "learning_rate": 8.92631619005708e-05, + "loss": 2.8702, + "step": 5389 + }, + { + "epoch": 0.4879152711143297, + "grad_norm": 0.8466609716415405, + "learning_rate": 8.92316126227961e-05, + "loss": 2.7681, + "step": 5390 + }, + { + "epoch": 0.48800579342808004, + "grad_norm": 0.9310498833656311, + "learning_rate": 8.920006442939772e-05, + "loss": 2.7856, + "step": 5391 + }, + { + "epoch": 0.4880963157418304, + "grad_norm": 1.0089055299758911, + "learning_rate": 8.916851732355255e-05, + "loss": 2.8008, + "step": 5392 + }, + { + "epoch": 0.4881868380555807, + "grad_norm": 0.9262751936912537, + "learning_rate": 8.91369713084374e-05, + "loss": 2.7587, + "step": 5393 + }, + { + "epoch": 0.48827736036933106, + "grad_norm": 0.9704766869544983, + "learning_rate": 8.910542638722889e-05, + "loss": 2.7551, + "step": 5394 + }, + { + "epoch": 0.4883678826830814, + "grad_norm": 0.8155854344367981, + "learning_rate": 8.907388256310373e-05, + "loss": 2.7697, + "step": 5395 + }, + { + "epoch": 0.48845840499683174, + "grad_norm": 0.9788278341293335, + "learning_rate": 8.90423398392382e-05, + "loss": 2.8081, + "step": 5396 + }, + { + "epoch": 0.4885489273105821, + "grad_norm": 0.9229268431663513, + "learning_rate": 8.901079821880882e-05, + "loss": 2.7874, + "step": 5397 + }, + { + "epoch": 0.4886394496243324, + "grad_norm": 0.8876495361328125, + "learning_rate": 8.897925770499174e-05, + "loss": 2.7961, + "step": 5398 + }, + { + "epoch": 0.48872997193808276, + "grad_norm": 0.9457216262817383, + "learning_rate": 8.894771830096312e-05, + "loss": 2.7652, + "step": 5399 + }, + { + "epoch": 0.4888204942518331, + "grad_norm": 0.8485191464424133, + "learning_rate": 8.891618000989891e-05, + "loss": 2.7648, + "step": 5400 + }, + { + "epoch": 0.4889110165655834, + "grad_norm": 0.8234026432037354, + "learning_rate": 8.888464283497512e-05, + "loss": 2.731, + "step": 5401 + }, + { + "epoch": 0.48900153887933373, + "grad_norm": 0.9092859625816345, + "learning_rate": 8.885310677936746e-05, + "loss": 2.7226, + "step": 5402 + }, + { + "epoch": 0.48909206119308407, + "grad_norm": 0.8186323642730713, + "learning_rate": 8.882157184625169e-05, + "loss": 2.8257, + "step": 5403 + }, + { + "epoch": 0.4891825835068344, + "grad_norm": 0.7837226986885071, + "learning_rate": 8.879003803880326e-05, + "loss": 2.7632, + "step": 5404 + }, + { + "epoch": 0.48927310582058475, + "grad_norm": 0.953040599822998, + "learning_rate": 8.875850536019775e-05, + "loss": 2.804, + "step": 5405 + }, + { + "epoch": 0.4893636281343351, + "grad_norm": 0.921772301197052, + "learning_rate": 8.872697381361043e-05, + "loss": 2.8101, + "step": 5406 + }, + { + "epoch": 0.48945415044808543, + "grad_norm": 0.8325876593589783, + "learning_rate": 8.869544340221654e-05, + "loss": 2.7615, + "step": 5407 + }, + { + "epoch": 0.4895446727618358, + "grad_norm": 0.8860905170440674, + "learning_rate": 8.866391412919117e-05, + "loss": 2.7417, + "step": 5408 + }, + { + "epoch": 0.4896351950755861, + "grad_norm": 0.8813610672950745, + "learning_rate": 8.863238599770936e-05, + "loss": 2.7837, + "step": 5409 + }, + { + "epoch": 0.48972571738933646, + "grad_norm": 0.878919243812561, + "learning_rate": 8.860085901094595e-05, + "loss": 2.7313, + "step": 5410 + }, + { + "epoch": 0.4898162397030868, + "grad_norm": 0.884091854095459, + "learning_rate": 8.856933317207576e-05, + "loss": 2.7618, + "step": 5411 + }, + { + "epoch": 0.48990676201683714, + "grad_norm": 0.848335862159729, + "learning_rate": 8.853780848427334e-05, + "loss": 2.7053, + "step": 5412 + }, + { + "epoch": 0.4899972843305875, + "grad_norm": 0.8469763994216919, + "learning_rate": 8.850628495071336e-05, + "loss": 2.7734, + "step": 5413 + }, + { + "epoch": 0.4900878066443378, + "grad_norm": 0.8686950206756592, + "learning_rate": 8.847476257457013e-05, + "loss": 2.8046, + "step": 5414 + }, + { + "epoch": 0.49017832895808816, + "grad_norm": 1.0004031658172607, + "learning_rate": 8.844324135901802e-05, + "loss": 2.7847, + "step": 5415 + }, + { + "epoch": 0.4902688512718385, + "grad_norm": 0.8817363381385803, + "learning_rate": 8.841172130723115e-05, + "loss": 2.7698, + "step": 5416 + }, + { + "epoch": 0.49035937358558884, + "grad_norm": 0.8560882806777954, + "learning_rate": 8.838020242238367e-05, + "loss": 2.7825, + "step": 5417 + }, + { + "epoch": 0.4904498958993392, + "grad_norm": 0.8987289071083069, + "learning_rate": 8.834868470764948e-05, + "loss": 2.8044, + "step": 5418 + }, + { + "epoch": 0.4905404182130895, + "grad_norm": 0.8843113780021667, + "learning_rate": 8.831716816620243e-05, + "loss": 2.7462, + "step": 5419 + }, + { + "epoch": 0.49063094052683986, + "grad_norm": 0.9686959385871887, + "learning_rate": 8.828565280121617e-05, + "loss": 2.7268, + "step": 5420 + }, + { + "epoch": 0.4907214628405902, + "grad_norm": 0.8693787455558777, + "learning_rate": 8.825413861586442e-05, + "loss": 2.7575, + "step": 5421 + }, + { + "epoch": 0.49081198515434055, + "grad_norm": 0.8458731770515442, + "learning_rate": 8.822262561332056e-05, + "loss": 2.8095, + "step": 5422 + }, + { + "epoch": 0.4909025074680909, + "grad_norm": 0.8771583437919617, + "learning_rate": 8.819111379675798e-05, + "loss": 2.7571, + "step": 5423 + }, + { + "epoch": 0.49099302978184123, + "grad_norm": 0.8951244950294495, + "learning_rate": 8.81596031693499e-05, + "loss": 2.8171, + "step": 5424 + }, + { + "epoch": 0.49108355209559157, + "grad_norm": 0.9194262623786926, + "learning_rate": 8.812809373426951e-05, + "loss": 2.7763, + "step": 5425 + }, + { + "epoch": 0.4911740744093419, + "grad_norm": 1.0411819219589233, + "learning_rate": 8.80965854946897e-05, + "loss": 2.8314, + "step": 5426 + }, + { + "epoch": 0.49126459672309225, + "grad_norm": 0.8811046481132507, + "learning_rate": 8.806507845378343e-05, + "loss": 2.7847, + "step": 5427 + }, + { + "epoch": 0.4913551190368426, + "grad_norm": 0.9744120836257935, + "learning_rate": 8.803357261472343e-05, + "loss": 2.7625, + "step": 5428 + }, + { + "epoch": 0.49144564135059293, + "grad_norm": 0.8727533221244812, + "learning_rate": 8.800206798068235e-05, + "loss": 2.823, + "step": 5429 + }, + { + "epoch": 0.4915361636643433, + "grad_norm": 0.8588023781776428, + "learning_rate": 8.797056455483266e-05, + "loss": 2.7709, + "step": 5430 + }, + { + "epoch": 0.4916266859780936, + "grad_norm": 0.9703391790390015, + "learning_rate": 8.793906234034683e-05, + "loss": 2.8147, + "step": 5431 + }, + { + "epoch": 0.49171720829184395, + "grad_norm": 0.8462010622024536, + "learning_rate": 8.790756134039708e-05, + "loss": 2.7482, + "step": 5432 + }, + { + "epoch": 0.4918077306055943, + "grad_norm": 0.8772428631782532, + "learning_rate": 8.787606155815558e-05, + "loss": 2.7578, + "step": 5433 + }, + { + "epoch": 0.49189825291934464, + "grad_norm": 0.8778265118598938, + "learning_rate": 8.784456299679432e-05, + "loss": 2.7482, + "step": 5434 + }, + { + "epoch": 0.491988775233095, + "grad_norm": 0.8943318724632263, + "learning_rate": 8.781306565948528e-05, + "loss": 2.7438, + "step": 5435 + }, + { + "epoch": 0.4920792975468453, + "grad_norm": 0.8805944323539734, + "learning_rate": 8.778156954940016e-05, + "loss": 2.7882, + "step": 5436 + }, + { + "epoch": 0.49216981986059566, + "grad_norm": 1.0474109649658203, + "learning_rate": 8.775007466971067e-05, + "loss": 2.7527, + "step": 5437 + }, + { + "epoch": 0.492260342174346, + "grad_norm": 0.8754709362983704, + "learning_rate": 8.771858102358828e-05, + "loss": 2.7693, + "step": 5438 + }, + { + "epoch": 0.49235086448809634, + "grad_norm": 0.8586017489433289, + "learning_rate": 8.76870886142045e-05, + "loss": 2.7932, + "step": 5439 + }, + { + "epoch": 0.4924413868018467, + "grad_norm": 0.9212132692337036, + "learning_rate": 8.765559744473053e-05, + "loss": 2.7714, + "step": 5440 + }, + { + "epoch": 0.492531909115597, + "grad_norm": 0.9377496838569641, + "learning_rate": 8.762410751833759e-05, + "loss": 2.8411, + "step": 5441 + }, + { + "epoch": 0.4926224314293473, + "grad_norm": 0.8290513157844543, + "learning_rate": 8.759261883819664e-05, + "loss": 2.7639, + "step": 5442 + }, + { + "epoch": 0.49271295374309765, + "grad_norm": 0.8210857510566711, + "learning_rate": 8.756113140747867e-05, + "loss": 2.7487, + "step": 5443 + }, + { + "epoch": 0.492803476056848, + "grad_norm": 0.9162151217460632, + "learning_rate": 8.752964522935441e-05, + "loss": 2.8344, + "step": 5444 + }, + { + "epoch": 0.49289399837059833, + "grad_norm": 0.8273245096206665, + "learning_rate": 8.749816030699456e-05, + "loss": 2.7405, + "step": 5445 + }, + { + "epoch": 0.49298452068434867, + "grad_norm": 0.8682637214660645, + "learning_rate": 8.746667664356956e-05, + "loss": 2.7461, + "step": 5446 + }, + { + "epoch": 0.493075042998099, + "grad_norm": 0.8884209990501404, + "learning_rate": 8.743519424224995e-05, + "loss": 2.8145, + "step": 5447 + }, + { + "epoch": 0.49316556531184935, + "grad_norm": 0.9339919090270996, + "learning_rate": 8.74037131062059e-05, + "loss": 2.7779, + "step": 5448 + }, + { + "epoch": 0.4932560876255997, + "grad_norm": 0.9694758653640747, + "learning_rate": 8.737223323860762e-05, + "loss": 2.7847, + "step": 5449 + }, + { + "epoch": 0.49334660993935003, + "grad_norm": 0.7979462146759033, + "learning_rate": 8.734075464262507e-05, + "loss": 2.7319, + "step": 5450 + }, + { + "epoch": 0.4934371322531004, + "grad_norm": 0.8110317587852478, + "learning_rate": 8.730927732142822e-05, + "loss": 2.6897, + "step": 5451 + }, + { + "epoch": 0.4935276545668507, + "grad_norm": 0.8965287208557129, + "learning_rate": 8.727780127818677e-05, + "loss": 2.6903, + "step": 5452 + }, + { + "epoch": 0.49361817688060106, + "grad_norm": 0.9517734050750732, + "learning_rate": 8.72463265160704e-05, + "loss": 2.8256, + "step": 5453 + }, + { + "epoch": 0.4937086991943514, + "grad_norm": 0.892761766910553, + "learning_rate": 8.721485303824857e-05, + "loss": 2.6702, + "step": 5454 + }, + { + "epoch": 0.49379922150810174, + "grad_norm": 0.9610890746116638, + "learning_rate": 8.718338084789072e-05, + "loss": 2.7717, + "step": 5455 + }, + { + "epoch": 0.4938897438218521, + "grad_norm": 0.9610509872436523, + "learning_rate": 8.715190994816607e-05, + "loss": 2.7269, + "step": 5456 + }, + { + "epoch": 0.4939802661356024, + "grad_norm": 0.9475899338722229, + "learning_rate": 8.712044034224374e-05, + "loss": 2.8009, + "step": 5457 + }, + { + "epoch": 0.49407078844935276, + "grad_norm": 0.93819659948349, + "learning_rate": 8.708897203329267e-05, + "loss": 2.785, + "step": 5458 + }, + { + "epoch": 0.4941613107631031, + "grad_norm": 0.957547128200531, + "learning_rate": 8.705750502448183e-05, + "loss": 2.7372, + "step": 5459 + }, + { + "epoch": 0.49425183307685344, + "grad_norm": 1.0170553922653198, + "learning_rate": 8.702603931897982e-05, + "loss": 2.7857, + "step": 5460 + }, + { + "epoch": 0.4943423553906038, + "grad_norm": 0.9468273520469666, + "learning_rate": 8.699457491995534e-05, + "loss": 2.8072, + "step": 5461 + }, + { + "epoch": 0.4944328777043541, + "grad_norm": 0.9994382858276367, + "learning_rate": 8.696311183057678e-05, + "loss": 2.7599, + "step": 5462 + }, + { + "epoch": 0.49452340001810446, + "grad_norm": 0.8759012818336487, + "learning_rate": 8.693165005401254e-05, + "loss": 2.7841, + "step": 5463 + }, + { + "epoch": 0.4946139223318548, + "grad_norm": 0.894015908241272, + "learning_rate": 8.690018959343072e-05, + "loss": 2.7601, + "step": 5464 + }, + { + "epoch": 0.49470444464560515, + "grad_norm": 0.8741127848625183, + "learning_rate": 8.68687304519995e-05, + "loss": 2.7864, + "step": 5465 + }, + { + "epoch": 0.4947949669593555, + "grad_norm": 0.8510367274284363, + "learning_rate": 8.683727263288673e-05, + "loss": 2.737, + "step": 5466 + }, + { + "epoch": 0.4948854892731058, + "grad_norm": 0.8362459540367126, + "learning_rate": 8.680581613926029e-05, + "loss": 2.8298, + "step": 5467 + }, + { + "epoch": 0.49497601158685617, + "grad_norm": 0.9130518436431885, + "learning_rate": 8.677436097428775e-05, + "loss": 2.7198, + "step": 5468 + }, + { + "epoch": 0.4950665339006065, + "grad_norm": 0.9414628148078918, + "learning_rate": 8.674290714113675e-05, + "loss": 2.8174, + "step": 5469 + }, + { + "epoch": 0.49515705621435685, + "grad_norm": 0.8726707696914673, + "learning_rate": 8.67114546429746e-05, + "loss": 2.6835, + "step": 5470 + }, + { + "epoch": 0.4952475785281072, + "grad_norm": 0.8614273071289062, + "learning_rate": 8.668000348296862e-05, + "loss": 2.7743, + "step": 5471 + }, + { + "epoch": 0.49533810084185753, + "grad_norm": 0.8497290015220642, + "learning_rate": 8.66485536642859e-05, + "loss": 2.7286, + "step": 5472 + }, + { + "epoch": 0.4954286231556079, + "grad_norm": 0.8182339072227478, + "learning_rate": 8.661710519009351e-05, + "loss": 2.7595, + "step": 5473 + }, + { + "epoch": 0.4955191454693582, + "grad_norm": 0.8943324089050293, + "learning_rate": 8.658565806355823e-05, + "loss": 2.7802, + "step": 5474 + }, + { + "epoch": 0.49560966778310855, + "grad_norm": 0.8168202042579651, + "learning_rate": 8.655421228784683e-05, + "loss": 2.7348, + "step": 5475 + }, + { + "epoch": 0.4957001900968589, + "grad_norm": 0.9240425825119019, + "learning_rate": 8.652276786612584e-05, + "loss": 2.7805, + "step": 5476 + }, + { + "epoch": 0.49579071241060924, + "grad_norm": 0.9528402090072632, + "learning_rate": 8.649132480156181e-05, + "loss": 2.7187, + "step": 5477 + }, + { + "epoch": 0.4958812347243596, + "grad_norm": 0.835676908493042, + "learning_rate": 8.6459883097321e-05, + "loss": 2.7786, + "step": 5478 + }, + { + "epoch": 0.4959717570381099, + "grad_norm": 0.9240402579307556, + "learning_rate": 8.642844275656957e-05, + "loss": 2.8179, + "step": 5479 + }, + { + "epoch": 0.49606227935186026, + "grad_norm": 1.0338637828826904, + "learning_rate": 8.639700378247361e-05, + "loss": 2.7696, + "step": 5480 + }, + { + "epoch": 0.4961528016656106, + "grad_norm": 0.9106727242469788, + "learning_rate": 8.636556617819896e-05, + "loss": 2.7819, + "step": 5481 + }, + { + "epoch": 0.49624332397936094, + "grad_norm": 0.9276056289672852, + "learning_rate": 8.633412994691144e-05, + "loss": 2.7896, + "step": 5482 + }, + { + "epoch": 0.4963338462931112, + "grad_norm": 0.92255699634552, + "learning_rate": 8.630269509177666e-05, + "loss": 2.7433, + "step": 5483 + }, + { + "epoch": 0.49642436860686157, + "grad_norm": 0.8409753441810608, + "learning_rate": 8.627126161596012e-05, + "loss": 2.7577, + "step": 5484 + }, + { + "epoch": 0.4965148909206119, + "grad_norm": 0.9331403374671936, + "learning_rate": 8.623982952262713e-05, + "loss": 2.7238, + "step": 5485 + }, + { + "epoch": 0.49660541323436225, + "grad_norm": 0.8792486190795898, + "learning_rate": 8.620839881494296e-05, + "loss": 2.7781, + "step": 5486 + }, + { + "epoch": 0.4966959355481126, + "grad_norm": 0.9217527508735657, + "learning_rate": 8.617696949607264e-05, + "loss": 2.7072, + "step": 5487 + }, + { + "epoch": 0.49678645786186293, + "grad_norm": 0.8589668273925781, + "learning_rate": 8.614554156918113e-05, + "loss": 2.6873, + "step": 5488 + }, + { + "epoch": 0.49687698017561327, + "grad_norm": 0.879298210144043, + "learning_rate": 8.611411503743319e-05, + "loss": 2.7492, + "step": 5489 + }, + { + "epoch": 0.4969675024893636, + "grad_norm": 0.8220292329788208, + "learning_rate": 8.608268990399349e-05, + "loss": 2.7402, + "step": 5490 + }, + { + "epoch": 0.49705802480311395, + "grad_norm": 0.840372622013092, + "learning_rate": 8.60512661720265e-05, + "loss": 2.7803, + "step": 5491 + }, + { + "epoch": 0.4971485471168643, + "grad_norm": 0.9626206755638123, + "learning_rate": 8.601984384469668e-05, + "loss": 2.6962, + "step": 5492 + }, + { + "epoch": 0.49723906943061463, + "grad_norm": 0.8860694169998169, + "learning_rate": 8.598842292516817e-05, + "loss": 2.7389, + "step": 5493 + }, + { + "epoch": 0.497329591744365, + "grad_norm": 0.9730173945426941, + "learning_rate": 8.595700341660511e-05, + "loss": 2.8225, + "step": 5494 + }, + { + "epoch": 0.4974201140581153, + "grad_norm": 0.8305181264877319, + "learning_rate": 8.592558532217139e-05, + "loss": 2.8373, + "step": 5495 + }, + { + "epoch": 0.49751063637186566, + "grad_norm": 0.8341726660728455, + "learning_rate": 8.589416864503087e-05, + "loss": 2.7348, + "step": 5496 + }, + { + "epoch": 0.497601158685616, + "grad_norm": 0.8894588351249695, + "learning_rate": 8.586275338834718e-05, + "loss": 2.7598, + "step": 5497 + }, + { + "epoch": 0.49769168099936634, + "grad_norm": 0.857745885848999, + "learning_rate": 8.583133955528383e-05, + "loss": 2.7502, + "step": 5498 + }, + { + "epoch": 0.4977822033131167, + "grad_norm": 0.8845468759536743, + "learning_rate": 8.579992714900417e-05, + "loss": 2.7651, + "step": 5499 + }, + { + "epoch": 0.497872725626867, + "grad_norm": 0.8758352398872375, + "learning_rate": 8.57685161726715e-05, + "loss": 2.7849, + "step": 5500 + }, + { + "epoch": 0.49796324794061736, + "grad_norm": 0.9248115420341492, + "learning_rate": 8.573710662944885e-05, + "loss": 2.7983, + "step": 5501 + }, + { + "epoch": 0.4980537702543677, + "grad_norm": 0.873688817024231, + "learning_rate": 8.57056985224992e-05, + "loss": 2.7343, + "step": 5502 + }, + { + "epoch": 0.49814429256811804, + "grad_norm": 1.0141209363937378, + "learning_rate": 8.567429185498524e-05, + "loss": 2.7702, + "step": 5503 + }, + { + "epoch": 0.4982348148818684, + "grad_norm": 0.8634735941886902, + "learning_rate": 8.564288663006977e-05, + "loss": 2.7663, + "step": 5504 + }, + { + "epoch": 0.4983253371956187, + "grad_norm": 1.0201530456542969, + "learning_rate": 8.56114828509152e-05, + "loss": 2.732, + "step": 5505 + }, + { + "epoch": 0.49841585950936906, + "grad_norm": 1.0170044898986816, + "learning_rate": 8.558008052068392e-05, + "loss": 2.7937, + "step": 5506 + }, + { + "epoch": 0.4985063818231194, + "grad_norm": 0.8283863663673401, + "learning_rate": 8.55486796425381e-05, + "loss": 2.7991, + "step": 5507 + }, + { + "epoch": 0.49859690413686975, + "grad_norm": 0.8377839922904968, + "learning_rate": 8.551728021963989e-05, + "loss": 2.741, + "step": 5508 + }, + { + "epoch": 0.4986874264506201, + "grad_norm": 0.9424071311950684, + "learning_rate": 8.548588225515115e-05, + "loss": 2.6981, + "step": 5509 + }, + { + "epoch": 0.4987779487643704, + "grad_norm": 1.026139259338379, + "learning_rate": 8.545448575223368e-05, + "loss": 2.7906, + "step": 5510 + }, + { + "epoch": 0.49886847107812077, + "grad_norm": 0.9049829840660095, + "learning_rate": 8.542309071404906e-05, + "loss": 2.7607, + "step": 5511 + }, + { + "epoch": 0.4989589933918711, + "grad_norm": 0.8872949481010437, + "learning_rate": 8.539169714375885e-05, + "loss": 2.7427, + "step": 5512 + }, + { + "epoch": 0.49904951570562145, + "grad_norm": 0.9735966324806213, + "learning_rate": 8.536030504452432e-05, + "loss": 2.7462, + "step": 5513 + }, + { + "epoch": 0.4991400380193718, + "grad_norm": 0.8961998820304871, + "learning_rate": 8.532891441950667e-05, + "loss": 2.7231, + "step": 5514 + }, + { + "epoch": 0.49923056033312213, + "grad_norm": 0.8910137414932251, + "learning_rate": 8.529752527186693e-05, + "loss": 2.7965, + "step": 5515 + }, + { + "epoch": 0.49932108264687247, + "grad_norm": 0.8779480457305908, + "learning_rate": 8.526613760476602e-05, + "loss": 2.7419, + "step": 5516 + }, + { + "epoch": 0.4994116049606228, + "grad_norm": 0.9542471766471863, + "learning_rate": 8.523475142136463e-05, + "loss": 2.7206, + "step": 5517 + }, + { + "epoch": 0.49950212727437315, + "grad_norm": 0.8974640369415283, + "learning_rate": 8.520336672482338e-05, + "loss": 2.7817, + "step": 5518 + }, + { + "epoch": 0.4995926495881235, + "grad_norm": 0.9307314157485962, + "learning_rate": 8.517198351830265e-05, + "loss": 2.7546, + "step": 5519 + }, + { + "epoch": 0.49968317190187383, + "grad_norm": 0.8345010876655579, + "learning_rate": 8.514060180496285e-05, + "loss": 2.7418, + "step": 5520 + }, + { + "epoch": 0.4997736942156242, + "grad_norm": 0.8099145293235779, + "learning_rate": 8.510922158796398e-05, + "loss": 2.6789, + "step": 5521 + }, + { + "epoch": 0.4998642165293745, + "grad_norm": 0.9911144375801086, + "learning_rate": 8.507784287046612e-05, + "loss": 2.7712, + "step": 5522 + }, + { + "epoch": 0.49995473884312486, + "grad_norm": 0.8634917140007019, + "learning_rate": 8.504646565562906e-05, + "loss": 2.762, + "step": 5523 + }, + { + "epoch": 0.5000452611568752, + "grad_norm": 0.8461886644363403, + "learning_rate": 8.501508994661251e-05, + "loss": 2.7835, + "step": 5524 + }, + { + "epoch": 0.5001357834706255, + "grad_norm": 0.9123673439025879, + "learning_rate": 8.498371574657596e-05, + "loss": 2.7487, + "step": 5525 + }, + { + "epoch": 0.5002263057843759, + "grad_norm": 0.9670429825782776, + "learning_rate": 8.495234305867887e-05, + "loss": 2.712, + "step": 5526 + }, + { + "epoch": 0.5003168280981262, + "grad_norm": 0.7825371026992798, + "learning_rate": 8.49209718860804e-05, + "loss": 2.7777, + "step": 5527 + }, + { + "epoch": 0.5004073504118766, + "grad_norm": 0.8448870778083801, + "learning_rate": 8.488960223193967e-05, + "loss": 2.7192, + "step": 5528 + }, + { + "epoch": 0.5004978727256268, + "grad_norm": 0.9148772358894348, + "learning_rate": 8.485823409941553e-05, + "loss": 2.7715, + "step": 5529 + }, + { + "epoch": 0.5005883950393772, + "grad_norm": 0.8517109751701355, + "learning_rate": 8.482686749166686e-05, + "loss": 2.7457, + "step": 5530 + }, + { + "epoch": 0.5006789173531275, + "grad_norm": 0.9205785393714905, + "learning_rate": 8.479550241185218e-05, + "loss": 2.7768, + "step": 5531 + }, + { + "epoch": 0.5007694396668779, + "grad_norm": 0.8705542087554932, + "learning_rate": 8.476413886313001e-05, + "loss": 2.8299, + "step": 5532 + }, + { + "epoch": 0.5008599619806282, + "grad_norm": 0.8568559885025024, + "learning_rate": 8.473277684865862e-05, + "loss": 2.7363, + "step": 5533 + }, + { + "epoch": 0.5009504842943786, + "grad_norm": 0.8830341696739197, + "learning_rate": 8.47014163715962e-05, + "loss": 2.8005, + "step": 5534 + }, + { + "epoch": 0.5010410066081289, + "grad_norm": 0.8620105981826782, + "learning_rate": 8.467005743510073e-05, + "loss": 2.7425, + "step": 5535 + }, + { + "epoch": 0.5011315289218793, + "grad_norm": 0.8455480933189392, + "learning_rate": 8.463870004233008e-05, + "loss": 2.7436, + "step": 5536 + }, + { + "epoch": 0.5012220512356296, + "grad_norm": 0.8549380898475647, + "learning_rate": 8.460734419644185e-05, + "loss": 2.7569, + "step": 5537 + }, + { + "epoch": 0.50131257354938, + "grad_norm": 0.8382590413093567, + "learning_rate": 8.45759899005937e-05, + "loss": 2.7851, + "step": 5538 + }, + { + "epoch": 0.5014030958631303, + "grad_norm": 0.9025892615318298, + "learning_rate": 8.454463715794292e-05, + "loss": 2.7553, + "step": 5539 + }, + { + "epoch": 0.5014936181768807, + "grad_norm": 0.9311736226081848, + "learning_rate": 8.451328597164679e-05, + "loss": 2.8024, + "step": 5540 + }, + { + "epoch": 0.5015841404906309, + "grad_norm": 0.8533101677894592, + "learning_rate": 8.448193634486226e-05, + "loss": 2.716, + "step": 5541 + }, + { + "epoch": 0.5016746628043813, + "grad_norm": 0.8735119104385376, + "learning_rate": 8.445058828074639e-05, + "loss": 2.7105, + "step": 5542 + }, + { + "epoch": 0.5017651851181316, + "grad_norm": 0.9033412337303162, + "learning_rate": 8.441924178245581e-05, + "loss": 2.7299, + "step": 5543 + }, + { + "epoch": 0.501855707431882, + "grad_norm": 0.8257886171340942, + "learning_rate": 8.438789685314721e-05, + "loss": 2.7265, + "step": 5544 + }, + { + "epoch": 0.5019462297456323, + "grad_norm": 0.9109305739402771, + "learning_rate": 8.435655349597689e-05, + "loss": 2.7337, + "step": 5545 + }, + { + "epoch": 0.5020367520593826, + "grad_norm": 0.8924078345298767, + "learning_rate": 8.432521171410128e-05, + "loss": 2.7947, + "step": 5546 + }, + { + "epoch": 0.502127274373133, + "grad_norm": 0.9318864345550537, + "learning_rate": 8.429387151067638e-05, + "loss": 2.7178, + "step": 5547 + }, + { + "epoch": 0.5022177966868833, + "grad_norm": 0.8161680102348328, + "learning_rate": 8.426253288885822e-05, + "loss": 2.7857, + "step": 5548 + }, + { + "epoch": 0.5023083190006337, + "grad_norm": 0.7949011921882629, + "learning_rate": 8.423119585180252e-05, + "loss": 2.7455, + "step": 5549 + }, + { + "epoch": 0.502398841314384, + "grad_norm": 0.8968280553817749, + "learning_rate": 8.4199860402665e-05, + "loss": 2.7814, + "step": 5550 + }, + { + "epoch": 0.5024893636281343, + "grad_norm": 0.8965038061141968, + "learning_rate": 8.41685265446011e-05, + "loss": 2.7532, + "step": 5551 + }, + { + "epoch": 0.5025798859418846, + "grad_norm": 0.8199223875999451, + "learning_rate": 8.413719428076617e-05, + "loss": 2.7675, + "step": 5552 + }, + { + "epoch": 0.502670408255635, + "grad_norm": 0.8846849203109741, + "learning_rate": 8.410586361431528e-05, + "loss": 2.7393, + "step": 5553 + }, + { + "epoch": 0.5027609305693853, + "grad_norm": 0.895946741104126, + "learning_rate": 8.407453454840357e-05, + "loss": 2.8227, + "step": 5554 + }, + { + "epoch": 0.5028514528831357, + "grad_norm": 0.8147122859954834, + "learning_rate": 8.404320708618572e-05, + "loss": 2.7743, + "step": 5555 + }, + { + "epoch": 0.502941975196886, + "grad_norm": 0.7956191897392273, + "learning_rate": 8.401188123081653e-05, + "loss": 2.7715, + "step": 5556 + }, + { + "epoch": 0.5030324975106364, + "grad_norm": 0.9651154279708862, + "learning_rate": 8.398055698545043e-05, + "loss": 2.7298, + "step": 5557 + }, + { + "epoch": 0.5031230198243867, + "grad_norm": 0.9466992020606995, + "learning_rate": 8.394923435324183e-05, + "loss": 2.7663, + "step": 5558 + }, + { + "epoch": 0.5032135421381371, + "grad_norm": 0.9539802074432373, + "learning_rate": 8.391791333734484e-05, + "loss": 2.7655, + "step": 5559 + }, + { + "epoch": 0.5033040644518874, + "grad_norm": 0.9526113271713257, + "learning_rate": 8.38865939409136e-05, + "loss": 2.6986, + "step": 5560 + }, + { + "epoch": 0.5033945867656378, + "grad_norm": 0.8260806798934937, + "learning_rate": 8.385527616710188e-05, + "loss": 2.7494, + "step": 5561 + }, + { + "epoch": 0.503485109079388, + "grad_norm": 0.8653164505958557, + "learning_rate": 8.382396001906343e-05, + "loss": 2.7554, + "step": 5562 + }, + { + "epoch": 0.5035756313931384, + "grad_norm": 0.9280028939247131, + "learning_rate": 8.379264549995176e-05, + "loss": 2.776, + "step": 5563 + }, + { + "epoch": 0.5036661537068887, + "grad_norm": 0.8204590678215027, + "learning_rate": 8.37613326129202e-05, + "loss": 2.7261, + "step": 5564 + }, + { + "epoch": 0.5037566760206391, + "grad_norm": 0.8759326934814453, + "learning_rate": 8.373002136112205e-05, + "loss": 2.7027, + "step": 5565 + }, + { + "epoch": 0.5038471983343894, + "grad_norm": 0.8595099449157715, + "learning_rate": 8.369871174771027e-05, + "loss": 2.7887, + "step": 5566 + }, + { + "epoch": 0.5039377206481398, + "grad_norm": 0.8577147722244263, + "learning_rate": 8.366740377583781e-05, + "loss": 2.7618, + "step": 5567 + }, + { + "epoch": 0.5040282429618901, + "grad_norm": 0.8850293159484863, + "learning_rate": 8.363609744865728e-05, + "loss": 2.7329, + "step": 5568 + }, + { + "epoch": 0.5041187652756405, + "grad_norm": 0.8491239547729492, + "learning_rate": 8.360479276932135e-05, + "loss": 2.6908, + "step": 5569 + }, + { + "epoch": 0.5042092875893908, + "grad_norm": 0.9252224564552307, + "learning_rate": 8.357348974098231e-05, + "loss": 2.7958, + "step": 5570 + }, + { + "epoch": 0.5042998099031412, + "grad_norm": 0.8544613122940063, + "learning_rate": 8.354218836679243e-05, + "loss": 2.7999, + "step": 5571 + }, + { + "epoch": 0.5043903322168914, + "grad_norm": 0.9366018176078796, + "learning_rate": 8.351088864990368e-05, + "loss": 2.8178, + "step": 5572 + }, + { + "epoch": 0.5044808545306418, + "grad_norm": 0.8889404535293579, + "learning_rate": 8.347959059346804e-05, + "loss": 2.7446, + "step": 5573 + }, + { + "epoch": 0.5045713768443921, + "grad_norm": 0.8375549912452698, + "learning_rate": 8.344829420063715e-05, + "loss": 2.7979, + "step": 5574 + }, + { + "epoch": 0.5046618991581425, + "grad_norm": 0.8302408456802368, + "learning_rate": 8.34169994745626e-05, + "loss": 2.7855, + "step": 5575 + }, + { + "epoch": 0.5047524214718928, + "grad_norm": 0.8453435301780701, + "learning_rate": 8.338570641839573e-05, + "loss": 2.775, + "step": 5576 + }, + { + "epoch": 0.5048429437856432, + "grad_norm": 0.8431276082992554, + "learning_rate": 8.33544150352878e-05, + "loss": 2.7928, + "step": 5577 + }, + { + "epoch": 0.5049334660993935, + "grad_norm": 0.7774648666381836, + "learning_rate": 8.332312532838978e-05, + "loss": 2.7188, + "step": 5578 + }, + { + "epoch": 0.5050239884131439, + "grad_norm": 1.0299315452575684, + "learning_rate": 8.329183730085262e-05, + "loss": 2.6806, + "step": 5579 + }, + { + "epoch": 0.5051145107268942, + "grad_norm": 0.9037138819694519, + "learning_rate": 8.326055095582694e-05, + "loss": 2.7802, + "step": 5580 + }, + { + "epoch": 0.5052050330406446, + "grad_norm": 0.9699219465255737, + "learning_rate": 8.32292662964634e-05, + "loss": 2.7911, + "step": 5581 + }, + { + "epoch": 0.5052955553543949, + "grad_norm": 0.9145121574401855, + "learning_rate": 8.31979833259122e-05, + "loss": 2.7354, + "step": 5582 + }, + { + "epoch": 0.5053860776681453, + "grad_norm": 0.854351282119751, + "learning_rate": 8.316670204732366e-05, + "loss": 2.7455, + "step": 5583 + }, + { + "epoch": 0.5054765999818955, + "grad_norm": 0.9479194283485413, + "learning_rate": 8.313542246384775e-05, + "loss": 2.8164, + "step": 5584 + }, + { + "epoch": 0.5055671222956459, + "grad_norm": 0.8744751811027527, + "learning_rate": 8.310414457863437e-05, + "loss": 2.7129, + "step": 5585 + }, + { + "epoch": 0.5056576446093962, + "grad_norm": 0.8344112634658813, + "learning_rate": 8.307286839483311e-05, + "loss": 2.6956, + "step": 5586 + }, + { + "epoch": 0.5057481669231465, + "grad_norm": 0.8224936127662659, + "learning_rate": 8.304159391559359e-05, + "loss": 2.7589, + "step": 5587 + }, + { + "epoch": 0.5058386892368969, + "grad_norm": 0.8344510793685913, + "learning_rate": 8.301032114406507e-05, + "loss": 2.7799, + "step": 5588 + }, + { + "epoch": 0.5059292115506472, + "grad_norm": 0.8643276691436768, + "learning_rate": 8.297905008339677e-05, + "loss": 2.7249, + "step": 5589 + }, + { + "epoch": 0.5060197338643976, + "grad_norm": 0.8124856352806091, + "learning_rate": 8.294778073673762e-05, + "loss": 2.8106, + "step": 5590 + }, + { + "epoch": 0.5061102561781479, + "grad_norm": 0.869785487651825, + "learning_rate": 8.291651310723654e-05, + "loss": 2.7463, + "step": 5591 + }, + { + "epoch": 0.5062007784918983, + "grad_norm": 0.9260883927345276, + "learning_rate": 8.288524719804208e-05, + "loss": 2.7447, + "step": 5592 + }, + { + "epoch": 0.5062913008056485, + "grad_norm": 0.9370576739311218, + "learning_rate": 8.285398301230278e-05, + "loss": 2.6753, + "step": 5593 + }, + { + "epoch": 0.5063818231193989, + "grad_norm": 0.7781187891960144, + "learning_rate": 8.282272055316688e-05, + "loss": 2.7359, + "step": 5594 + }, + { + "epoch": 0.5064723454331492, + "grad_norm": 0.8413085341453552, + "learning_rate": 8.279145982378261e-05, + "loss": 2.7524, + "step": 5595 + }, + { + "epoch": 0.5065628677468996, + "grad_norm": 0.852820634841919, + "learning_rate": 8.276020082729783e-05, + "loss": 2.734, + "step": 5596 + }, + { + "epoch": 0.5066533900606499, + "grad_norm": 0.8400428891181946, + "learning_rate": 8.272894356686039e-05, + "loss": 2.7109, + "step": 5597 + }, + { + "epoch": 0.5067439123744003, + "grad_norm": 0.9511899352073669, + "learning_rate": 8.269768804561779e-05, + "loss": 2.7996, + "step": 5598 + }, + { + "epoch": 0.5068344346881506, + "grad_norm": 0.8015035390853882, + "learning_rate": 8.26664342667176e-05, + "loss": 2.7499, + "step": 5599 + }, + { + "epoch": 0.506924957001901, + "grad_norm": 0.83297199010849, + "learning_rate": 8.263518223330697e-05, + "loss": 2.7384, + "step": 5600 + }, + { + "epoch": 0.5070154793156513, + "grad_norm": 0.9492606520652771, + "learning_rate": 8.260393194853304e-05, + "loss": 2.7537, + "step": 5601 + }, + { + "epoch": 0.5071060016294017, + "grad_norm": 0.8790477514266968, + "learning_rate": 8.257268341554264e-05, + "loss": 2.7606, + "step": 5602 + }, + { + "epoch": 0.507196523943152, + "grad_norm": 0.8750600814819336, + "learning_rate": 8.25414366374826e-05, + "loss": 2.7937, + "step": 5603 + }, + { + "epoch": 0.5072870462569024, + "grad_norm": 0.8970506191253662, + "learning_rate": 8.251019161749938e-05, + "loss": 2.82, + "step": 5604 + }, + { + "epoch": 0.5073775685706526, + "grad_norm": 0.9380742311477661, + "learning_rate": 8.24789483587394e-05, + "loss": 2.7346, + "step": 5605 + }, + { + "epoch": 0.507468090884403, + "grad_norm": 0.9393888711929321, + "learning_rate": 8.244770686434881e-05, + "loss": 2.7351, + "step": 5606 + }, + { + "epoch": 0.5075586131981533, + "grad_norm": 0.7890758514404297, + "learning_rate": 8.24164671374737e-05, + "loss": 2.7012, + "step": 5607 + }, + { + "epoch": 0.5076491355119037, + "grad_norm": 0.881951630115509, + "learning_rate": 8.238522918125983e-05, + "loss": 2.8086, + "step": 5608 + }, + { + "epoch": 0.507739657825654, + "grad_norm": 1.0359519720077515, + "learning_rate": 8.235399299885293e-05, + "loss": 2.7548, + "step": 5609 + }, + { + "epoch": 0.5078301801394044, + "grad_norm": 0.8619797825813293, + "learning_rate": 8.232275859339841e-05, + "loss": 2.7076, + "step": 5610 + }, + { + "epoch": 0.5079207024531547, + "grad_norm": 0.887519359588623, + "learning_rate": 8.229152596804168e-05, + "loss": 2.7639, + "step": 5611 + }, + { + "epoch": 0.5080112247669051, + "grad_norm": 0.9172592759132385, + "learning_rate": 8.226029512592777e-05, + "loss": 2.7452, + "step": 5612 + }, + { + "epoch": 0.5081017470806554, + "grad_norm": 0.8905028104782104, + "learning_rate": 8.222906607020165e-05, + "loss": 2.806, + "step": 5613 + }, + { + "epoch": 0.5081922693944058, + "grad_norm": 0.8358117341995239, + "learning_rate": 8.219783880400808e-05, + "loss": 2.7684, + "step": 5614 + }, + { + "epoch": 0.508282791708156, + "grad_norm": 0.9052249789237976, + "learning_rate": 8.21666133304917e-05, + "loss": 2.8095, + "step": 5615 + }, + { + "epoch": 0.5083733140219064, + "grad_norm": 0.8885976672172546, + "learning_rate": 8.213538965279682e-05, + "loss": 2.7564, + "step": 5616 + }, + { + "epoch": 0.5084638363356567, + "grad_norm": 0.9161496162414551, + "learning_rate": 8.210416777406774e-05, + "loss": 2.7672, + "step": 5617 + }, + { + "epoch": 0.5085543586494071, + "grad_norm": 0.8568997979164124, + "learning_rate": 8.207294769744847e-05, + "loss": 2.748, + "step": 5618 + }, + { + "epoch": 0.5086448809631574, + "grad_norm": 0.8771458864212036, + "learning_rate": 8.204172942608289e-05, + "loss": 2.7699, + "step": 5619 + }, + { + "epoch": 0.5087354032769078, + "grad_norm": 0.9072162508964539, + "learning_rate": 8.201051296311462e-05, + "loss": 2.7902, + "step": 5620 + }, + { + "epoch": 0.5088259255906581, + "grad_norm": 0.8902173638343811, + "learning_rate": 8.197929831168724e-05, + "loss": 2.7359, + "step": 5621 + }, + { + "epoch": 0.5089164479044085, + "grad_norm": 0.8940975666046143, + "learning_rate": 8.194808547494401e-05, + "loss": 2.8079, + "step": 5622 + }, + { + "epoch": 0.5090069702181588, + "grad_norm": 0.8261607885360718, + "learning_rate": 8.19168744560281e-05, + "loss": 2.7931, + "step": 5623 + }, + { + "epoch": 0.5090974925319092, + "grad_norm": 0.8398805260658264, + "learning_rate": 8.18856652580824e-05, + "loss": 2.7469, + "step": 5624 + }, + { + "epoch": 0.5091880148456595, + "grad_norm": 0.8998643755912781, + "learning_rate": 8.185445788424974e-05, + "loss": 2.723, + "step": 5625 + }, + { + "epoch": 0.5092785371594098, + "grad_norm": 0.9138926863670349, + "learning_rate": 8.182325233767267e-05, + "loss": 2.7699, + "step": 5626 + }, + { + "epoch": 0.5093690594731601, + "grad_norm": 0.8297197818756104, + "learning_rate": 8.179204862149361e-05, + "loss": 2.7137, + "step": 5627 + }, + { + "epoch": 0.5094595817869104, + "grad_norm": 0.8896348476409912, + "learning_rate": 8.176084673885471e-05, + "loss": 2.7489, + "step": 5628 + }, + { + "epoch": 0.5095501041006608, + "grad_norm": 0.9025987386703491, + "learning_rate": 8.17296466928981e-05, + "loss": 2.7342, + "step": 5629 + }, + { + "epoch": 0.5096406264144111, + "grad_norm": 0.890731692314148, + "learning_rate": 8.169844848676554e-05, + "loss": 2.7439, + "step": 5630 + }, + { + "epoch": 0.5097311487281615, + "grad_norm": 0.8023970127105713, + "learning_rate": 8.166725212359873e-05, + "loss": 2.794, + "step": 5631 + }, + { + "epoch": 0.5098216710419118, + "grad_norm": 0.8444201946258545, + "learning_rate": 8.16360576065391e-05, + "loss": 2.7278, + "step": 5632 + }, + { + "epoch": 0.5099121933556622, + "grad_norm": 0.8420068621635437, + "learning_rate": 8.160486493872798e-05, + "loss": 2.7529, + "step": 5633 + }, + { + "epoch": 0.5100027156694125, + "grad_norm": 0.8325995206832886, + "learning_rate": 8.157367412330647e-05, + "loss": 2.7908, + "step": 5634 + }, + { + "epoch": 0.5100932379831629, + "grad_norm": 0.9310572743415833, + "learning_rate": 8.154248516341548e-05, + "loss": 2.7331, + "step": 5635 + }, + { + "epoch": 0.5101837602969131, + "grad_norm": 0.8454694747924805, + "learning_rate": 8.151129806219568e-05, + "loss": 2.7153, + "step": 5636 + }, + { + "epoch": 0.5102742826106635, + "grad_norm": 0.842123806476593, + "learning_rate": 8.148011282278772e-05, + "loss": 2.79, + "step": 5637 + }, + { + "epoch": 0.5103648049244138, + "grad_norm": 0.9366497993469238, + "learning_rate": 8.144892944833184e-05, + "loss": 2.7858, + "step": 5638 + }, + { + "epoch": 0.5104553272381642, + "grad_norm": 0.8815160393714905, + "learning_rate": 8.14177479419683e-05, + "loss": 2.7287, + "step": 5639 + }, + { + "epoch": 0.5105458495519145, + "grad_norm": 0.8443790078163147, + "learning_rate": 8.1386568306837e-05, + "loss": 2.7358, + "step": 5640 + }, + { + "epoch": 0.5106363718656649, + "grad_norm": 0.8315573930740356, + "learning_rate": 8.135539054607779e-05, + "loss": 2.7245, + "step": 5641 + }, + { + "epoch": 0.5107268941794152, + "grad_norm": 0.9345365762710571, + "learning_rate": 8.132421466283022e-05, + "loss": 2.7289, + "step": 5642 + }, + { + "epoch": 0.5108174164931656, + "grad_norm": 0.9540194869041443, + "learning_rate": 8.129304066023375e-05, + "loss": 2.7142, + "step": 5643 + }, + { + "epoch": 0.5109079388069159, + "grad_norm": 0.9122259020805359, + "learning_rate": 8.126186854142752e-05, + "loss": 2.8238, + "step": 5644 + }, + { + "epoch": 0.5109984611206663, + "grad_norm": 0.8844560980796814, + "learning_rate": 8.123069830955067e-05, + "loss": 2.7545, + "step": 5645 + }, + { + "epoch": 0.5110889834344166, + "grad_norm": 0.9123935699462891, + "learning_rate": 8.119952996774197e-05, + "loss": 2.74, + "step": 5646 + }, + { + "epoch": 0.511179505748167, + "grad_norm": 0.9042823314666748, + "learning_rate": 8.116836351914009e-05, + "loss": 2.7527, + "step": 5647 + }, + { + "epoch": 0.5112700280619172, + "grad_norm": 0.9140527248382568, + "learning_rate": 8.11371989668835e-05, + "loss": 2.7742, + "step": 5648 + }, + { + "epoch": 0.5113605503756676, + "grad_norm": 0.8494327664375305, + "learning_rate": 8.110603631411043e-05, + "loss": 2.7112, + "step": 5649 + }, + { + "epoch": 0.5114510726894179, + "grad_norm": 0.8976174592971802, + "learning_rate": 8.107487556395901e-05, + "loss": 2.7035, + "step": 5650 + }, + { + "epoch": 0.5115415950031683, + "grad_norm": 0.9411106109619141, + "learning_rate": 8.104371671956707e-05, + "loss": 2.7734, + "step": 5651 + }, + { + "epoch": 0.5116321173169186, + "grad_norm": 0.8943339586257935, + "learning_rate": 8.101255978407237e-05, + "loss": 2.759, + "step": 5652 + }, + { + "epoch": 0.511722639630669, + "grad_norm": 0.8546929955482483, + "learning_rate": 8.098140476061236e-05, + "loss": 2.7437, + "step": 5653 + }, + { + "epoch": 0.5118131619444193, + "grad_norm": 0.8101754784584045, + "learning_rate": 8.095025165232439e-05, + "loss": 2.7083, + "step": 5654 + }, + { + "epoch": 0.5119036842581697, + "grad_norm": 0.8320404291152954, + "learning_rate": 8.091910046234552e-05, + "loss": 2.7549, + "step": 5655 + }, + { + "epoch": 0.51199420657192, + "grad_norm": 0.8851386308670044, + "learning_rate": 8.088795119381276e-05, + "loss": 2.755, + "step": 5656 + }, + { + "epoch": 0.5120847288856704, + "grad_norm": 0.8043539524078369, + "learning_rate": 8.085680384986276e-05, + "loss": 2.6904, + "step": 5657 + }, + { + "epoch": 0.5121752511994206, + "grad_norm": 0.8275747895240784, + "learning_rate": 8.08256584336321e-05, + "loss": 2.7432, + "step": 5658 + }, + { + "epoch": 0.512265773513171, + "grad_norm": 0.8125655055046082, + "learning_rate": 8.079451494825709e-05, + "loss": 2.7733, + "step": 5659 + }, + { + "epoch": 0.5123562958269213, + "grad_norm": 0.8181120157241821, + "learning_rate": 8.076337339687394e-05, + "loss": 2.7406, + "step": 5660 + }, + { + "epoch": 0.5124468181406717, + "grad_norm": 0.8543720841407776, + "learning_rate": 8.073223378261852e-05, + "loss": 2.7117, + "step": 5661 + }, + { + "epoch": 0.512537340454422, + "grad_norm": 0.8811604976654053, + "learning_rate": 8.070109610862668e-05, + "loss": 2.8013, + "step": 5662 + }, + { + "epoch": 0.5126278627681724, + "grad_norm": 0.8130089640617371, + "learning_rate": 8.066996037803388e-05, + "loss": 2.7559, + "step": 5663 + }, + { + "epoch": 0.5127183850819227, + "grad_norm": 0.779761016368866, + "learning_rate": 8.063882659397557e-05, + "loss": 2.7497, + "step": 5664 + }, + { + "epoch": 0.5128089073956731, + "grad_norm": 0.8280618786811829, + "learning_rate": 8.060769475958689e-05, + "loss": 2.7415, + "step": 5665 + }, + { + "epoch": 0.5128994297094234, + "grad_norm": 0.8324514031410217, + "learning_rate": 8.057656487800282e-05, + "loss": 2.7464, + "step": 5666 + }, + { + "epoch": 0.5129899520231738, + "grad_norm": 0.794349193572998, + "learning_rate": 8.054543695235811e-05, + "loss": 2.7231, + "step": 5667 + }, + { + "epoch": 0.513080474336924, + "grad_norm": 0.8479178547859192, + "learning_rate": 8.051431098578741e-05, + "loss": 2.7078, + "step": 5668 + }, + { + "epoch": 0.5131709966506743, + "grad_norm": 0.864479660987854, + "learning_rate": 8.048318698142503e-05, + "loss": 2.743, + "step": 5669 + }, + { + "epoch": 0.5132615189644247, + "grad_norm": 0.8360306024551392, + "learning_rate": 8.045206494240521e-05, + "loss": 2.7561, + "step": 5670 + }, + { + "epoch": 0.513352041278175, + "grad_norm": 0.8737164735794067, + "learning_rate": 8.042094487186187e-05, + "loss": 2.7586, + "step": 5671 + }, + { + "epoch": 0.5134425635919254, + "grad_norm": 0.8805273175239563, + "learning_rate": 8.038982677292889e-05, + "loss": 2.7258, + "step": 5672 + }, + { + "epoch": 0.5135330859056757, + "grad_norm": 0.8032810688018799, + "learning_rate": 8.03587106487398e-05, + "loss": 2.6993, + "step": 5673 + }, + { + "epoch": 0.5136236082194261, + "grad_norm": 0.813574492931366, + "learning_rate": 8.032759650242802e-05, + "loss": 2.7844, + "step": 5674 + }, + { + "epoch": 0.5137141305331764, + "grad_norm": 0.8926960825920105, + "learning_rate": 8.029648433712671e-05, + "loss": 2.7381, + "step": 5675 + }, + { + "epoch": 0.5138046528469268, + "grad_norm": 0.8661999702453613, + "learning_rate": 8.026537415596895e-05, + "loss": 2.732, + "step": 5676 + }, + { + "epoch": 0.5138951751606771, + "grad_norm": 0.7916553616523743, + "learning_rate": 8.023426596208739e-05, + "loss": 2.7387, + "step": 5677 + }, + { + "epoch": 0.5139856974744275, + "grad_norm": 0.8442259430885315, + "learning_rate": 8.020315975861476e-05, + "loss": 2.7264, + "step": 5678 + }, + { + "epoch": 0.5140762197881777, + "grad_norm": 0.8810858130455017, + "learning_rate": 8.017205554868337e-05, + "loss": 2.7055, + "step": 5679 + }, + { + "epoch": 0.5141667421019281, + "grad_norm": 0.8070834279060364, + "learning_rate": 8.014095333542548e-05, + "loss": 2.7906, + "step": 5680 + }, + { + "epoch": 0.5142572644156784, + "grad_norm": 0.8607498407363892, + "learning_rate": 8.0109853121973e-05, + "loss": 2.7554, + "step": 5681 + }, + { + "epoch": 0.5143477867294288, + "grad_norm": 0.8190198540687561, + "learning_rate": 8.007875491145781e-05, + "loss": 2.7636, + "step": 5682 + }, + { + "epoch": 0.5144383090431791, + "grad_norm": 0.832055389881134, + "learning_rate": 8.004765870701144e-05, + "loss": 2.7015, + "step": 5683 + }, + { + "epoch": 0.5145288313569295, + "grad_norm": 0.8482311964035034, + "learning_rate": 8.00165645117653e-05, + "loss": 2.762, + "step": 5684 + }, + { + "epoch": 0.5146193536706798, + "grad_norm": 0.8550591468811035, + "learning_rate": 7.998547232885052e-05, + "loss": 2.7264, + "step": 5685 + }, + { + "epoch": 0.5147098759844302, + "grad_norm": 0.8633624315261841, + "learning_rate": 7.995438216139818e-05, + "loss": 2.7324, + "step": 5686 + }, + { + "epoch": 0.5148003982981805, + "grad_norm": 0.8387537598609924, + "learning_rate": 7.992329401253898e-05, + "loss": 2.7605, + "step": 5687 + }, + { + "epoch": 0.5148909206119309, + "grad_norm": 0.7846046686172485, + "learning_rate": 7.989220788540355e-05, + "loss": 2.7169, + "step": 5688 + }, + { + "epoch": 0.5149814429256812, + "grad_norm": 0.8761247396469116, + "learning_rate": 7.986112378312221e-05, + "loss": 2.7233, + "step": 5689 + }, + { + "epoch": 0.5150719652394316, + "grad_norm": 0.888114333152771, + "learning_rate": 7.983004170882518e-05, + "loss": 2.7698, + "step": 5690 + }, + { + "epoch": 0.5151624875531818, + "grad_norm": 0.8965131044387817, + "learning_rate": 7.979896166564237e-05, + "loss": 2.8113, + "step": 5691 + }, + { + "epoch": 0.5152530098669322, + "grad_norm": 0.8232806921005249, + "learning_rate": 7.97678836567036e-05, + "loss": 2.7355, + "step": 5692 + }, + { + "epoch": 0.5153435321806825, + "grad_norm": 0.8814472556114197, + "learning_rate": 7.973680768513835e-05, + "loss": 2.7442, + "step": 5693 + }, + { + "epoch": 0.5154340544944329, + "grad_norm": 0.8496522307395935, + "learning_rate": 7.970573375407604e-05, + "loss": 2.703, + "step": 5694 + }, + { + "epoch": 0.5155245768081832, + "grad_norm": 0.9013231992721558, + "learning_rate": 7.967466186664578e-05, + "loss": 2.7507, + "step": 5695 + }, + { + "epoch": 0.5156150991219336, + "grad_norm": 0.9093582034111023, + "learning_rate": 7.964359202597654e-05, + "loss": 2.753, + "step": 5696 + }, + { + "epoch": 0.5157056214356839, + "grad_norm": 0.8807817697525024, + "learning_rate": 7.961252423519696e-05, + "loss": 2.7535, + "step": 5697 + }, + { + "epoch": 0.5157961437494343, + "grad_norm": 0.8731459379196167, + "learning_rate": 7.958145849743569e-05, + "loss": 2.79, + "step": 5698 + }, + { + "epoch": 0.5158866660631846, + "grad_norm": 0.8574411273002625, + "learning_rate": 7.955039481582097e-05, + "loss": 2.6913, + "step": 5699 + }, + { + "epoch": 0.515977188376935, + "grad_norm": 0.8288585543632507, + "learning_rate": 7.951933319348095e-05, + "loss": 2.7044, + "step": 5700 + }, + { + "epoch": 0.5160677106906852, + "grad_norm": 0.8653036952018738, + "learning_rate": 7.948827363354347e-05, + "loss": 2.6882, + "step": 5701 + }, + { + "epoch": 0.5161582330044356, + "grad_norm": 0.818445086479187, + "learning_rate": 7.945721613913633e-05, + "loss": 2.7755, + "step": 5702 + }, + { + "epoch": 0.5162487553181859, + "grad_norm": 0.7889580726623535, + "learning_rate": 7.942616071338693e-05, + "loss": 2.786, + "step": 5703 + }, + { + "epoch": 0.5163392776319363, + "grad_norm": 0.875316858291626, + "learning_rate": 7.939510735942262e-05, + "loss": 2.7799, + "step": 5704 + }, + { + "epoch": 0.5164297999456866, + "grad_norm": 0.8405930995941162, + "learning_rate": 7.936405608037036e-05, + "loss": 2.7441, + "step": 5705 + }, + { + "epoch": 0.516520322259437, + "grad_norm": 0.9210299253463745, + "learning_rate": 7.933300687935716e-05, + "loss": 2.7433, + "step": 5706 + }, + { + "epoch": 0.5166108445731873, + "grad_norm": 0.8364388346672058, + "learning_rate": 7.930195975950957e-05, + "loss": 2.7355, + "step": 5707 + }, + { + "epoch": 0.5167013668869377, + "grad_norm": 0.8821870684623718, + "learning_rate": 7.92709147239541e-05, + "loss": 2.7608, + "step": 5708 + }, + { + "epoch": 0.516791889200688, + "grad_norm": 0.9231151342391968, + "learning_rate": 7.923987177581689e-05, + "loss": 2.7756, + "step": 5709 + }, + { + "epoch": 0.5168824115144383, + "grad_norm": 0.8480895161628723, + "learning_rate": 7.920883091822408e-05, + "loss": 2.7919, + "step": 5710 + }, + { + "epoch": 0.5169729338281887, + "grad_norm": 0.8738788962364197, + "learning_rate": 7.917779215430139e-05, + "loss": 2.7691, + "step": 5711 + }, + { + "epoch": 0.5170634561419389, + "grad_norm": 0.8850705027580261, + "learning_rate": 7.914675548717449e-05, + "loss": 2.7242, + "step": 5712 + }, + { + "epoch": 0.5171539784556893, + "grad_norm": 0.9246595501899719, + "learning_rate": 7.911572091996873e-05, + "loss": 2.7948, + "step": 5713 + }, + { + "epoch": 0.5172445007694396, + "grad_norm": 0.8189140558242798, + "learning_rate": 7.90846884558093e-05, + "loss": 2.717, + "step": 5714 + }, + { + "epoch": 0.51733502308319, + "grad_norm": 0.9362362027168274, + "learning_rate": 7.905365809782114e-05, + "loss": 2.7988, + "step": 5715 + }, + { + "epoch": 0.5174255453969403, + "grad_norm": 0.9175571799278259, + "learning_rate": 7.902262984912909e-05, + "loss": 2.7469, + "step": 5716 + }, + { + "epoch": 0.5175160677106907, + "grad_norm": 0.8594390749931335, + "learning_rate": 7.899160371285761e-05, + "loss": 2.7248, + "step": 5717 + }, + { + "epoch": 0.517606590024441, + "grad_norm": 0.8442773818969727, + "learning_rate": 7.896057969213109e-05, + "loss": 2.7009, + "step": 5718 + }, + { + "epoch": 0.5176971123381914, + "grad_norm": 0.8920677900314331, + "learning_rate": 7.892955779007356e-05, + "loss": 2.7349, + "step": 5719 + }, + { + "epoch": 0.5177876346519417, + "grad_norm": 0.86868816614151, + "learning_rate": 7.889853800980904e-05, + "loss": 2.7457, + "step": 5720 + }, + { + "epoch": 0.5178781569656921, + "grad_norm": 0.9317598938941956, + "learning_rate": 7.886752035446114e-05, + "loss": 2.811, + "step": 5721 + }, + { + "epoch": 0.5179686792794423, + "grad_norm": 1.1110806465148926, + "learning_rate": 7.883650482715338e-05, + "loss": 2.8067, + "step": 5722 + }, + { + "epoch": 0.5180592015931927, + "grad_norm": 0.8412470817565918, + "learning_rate": 7.880549143100897e-05, + "loss": 2.7069, + "step": 5723 + }, + { + "epoch": 0.518149723906943, + "grad_norm": 0.9267826676368713, + "learning_rate": 7.877448016915104e-05, + "loss": 2.7886, + "step": 5724 + }, + { + "epoch": 0.5182402462206934, + "grad_norm": 1.0773874521255493, + "learning_rate": 7.874347104470234e-05, + "loss": 2.7465, + "step": 5725 + }, + { + "epoch": 0.5183307685344437, + "grad_norm": 1.025495171546936, + "learning_rate": 7.871246406078554e-05, + "loss": 2.7204, + "step": 5726 + }, + { + "epoch": 0.5184212908481941, + "grad_norm": 0.8501183390617371, + "learning_rate": 7.8681459220523e-05, + "loss": 2.7032, + "step": 5727 + }, + { + "epoch": 0.5185118131619444, + "grad_norm": 1.1093471050262451, + "learning_rate": 7.865045652703697e-05, + "loss": 2.7576, + "step": 5728 + }, + { + "epoch": 0.5186023354756948, + "grad_norm": 0.8723694086074829, + "learning_rate": 7.861945598344936e-05, + "loss": 2.7593, + "step": 5729 + }, + { + "epoch": 0.5186928577894451, + "grad_norm": 1.0160908699035645, + "learning_rate": 7.858845759288198e-05, + "loss": 2.7928, + "step": 5730 + }, + { + "epoch": 0.5187833801031955, + "grad_norm": 0.7631496787071228, + "learning_rate": 7.855746135845628e-05, + "loss": 2.7705, + "step": 5731 + }, + { + "epoch": 0.5188739024169458, + "grad_norm": 0.882847785949707, + "learning_rate": 7.852646728329368e-05, + "loss": 2.7306, + "step": 5732 + }, + { + "epoch": 0.5189644247306961, + "grad_norm": 0.9211302995681763, + "learning_rate": 7.849547537051525e-05, + "loss": 2.7593, + "step": 5733 + }, + { + "epoch": 0.5190549470444464, + "grad_norm": 0.9857428073883057, + "learning_rate": 7.846448562324183e-05, + "loss": 2.7279, + "step": 5734 + }, + { + "epoch": 0.5191454693581968, + "grad_norm": 0.8712624311447144, + "learning_rate": 7.843349804459412e-05, + "loss": 2.7363, + "step": 5735 + }, + { + "epoch": 0.5192359916719471, + "grad_norm": 1.004382848739624, + "learning_rate": 7.840251263769255e-05, + "loss": 2.7636, + "step": 5736 + }, + { + "epoch": 0.5193265139856975, + "grad_norm": 0.9310490489006042, + "learning_rate": 7.837152940565741e-05, + "loss": 2.779, + "step": 5737 + }, + { + "epoch": 0.5194170362994478, + "grad_norm": 0.8835846781730652, + "learning_rate": 7.834054835160864e-05, + "loss": 2.7261, + "step": 5738 + }, + { + "epoch": 0.5195075586131982, + "grad_norm": 1.0388543605804443, + "learning_rate": 7.830956947866607e-05, + "loss": 2.7124, + "step": 5739 + }, + { + "epoch": 0.5195980809269485, + "grad_norm": 0.8679820895195007, + "learning_rate": 7.827859278994925e-05, + "loss": 2.7933, + "step": 5740 + }, + { + "epoch": 0.5196886032406989, + "grad_norm": 1.0058894157409668, + "learning_rate": 7.824761828857755e-05, + "loss": 2.8184, + "step": 5741 + }, + { + "epoch": 0.5197791255544492, + "grad_norm": 0.9288240671157837, + "learning_rate": 7.821664597767006e-05, + "loss": 2.7192, + "step": 5742 + }, + { + "epoch": 0.5198696478681996, + "grad_norm": 0.8325408697128296, + "learning_rate": 7.818567586034577e-05, + "loss": 2.6912, + "step": 5743 + }, + { + "epoch": 0.5199601701819498, + "grad_norm": 0.9072100520133972, + "learning_rate": 7.815470793972331e-05, + "loss": 2.7642, + "step": 5744 + }, + { + "epoch": 0.5200506924957002, + "grad_norm": 0.8883663415908813, + "learning_rate": 7.812374221892115e-05, + "loss": 2.7666, + "step": 5745 + }, + { + "epoch": 0.5201412148094505, + "grad_norm": 0.8535317778587341, + "learning_rate": 7.809277870105753e-05, + "loss": 2.7037, + "step": 5746 + }, + { + "epoch": 0.5202317371232009, + "grad_norm": 0.8594459295272827, + "learning_rate": 7.806181738925055e-05, + "loss": 2.7063, + "step": 5747 + }, + { + "epoch": 0.5203222594369512, + "grad_norm": 0.9233312606811523, + "learning_rate": 7.803085828661792e-05, + "loss": 2.7687, + "step": 5748 + }, + { + "epoch": 0.5204127817507016, + "grad_norm": 0.8871405124664307, + "learning_rate": 7.799990139627729e-05, + "loss": 2.7712, + "step": 5749 + }, + { + "epoch": 0.5205033040644519, + "grad_norm": 0.8980066776275635, + "learning_rate": 7.796894672134594e-05, + "loss": 2.7603, + "step": 5750 + }, + { + "epoch": 0.5205938263782022, + "grad_norm": 0.8637656569480896, + "learning_rate": 7.79379942649411e-05, + "loss": 2.7187, + "step": 5751 + }, + { + "epoch": 0.5206843486919526, + "grad_norm": 0.8965839147567749, + "learning_rate": 7.79070440301796e-05, + "loss": 2.7725, + "step": 5752 + }, + { + "epoch": 0.5207748710057029, + "grad_norm": 0.7956546545028687, + "learning_rate": 7.78760960201782e-05, + "loss": 2.6925, + "step": 5753 + }, + { + "epoch": 0.5208653933194533, + "grad_norm": 0.9585323333740234, + "learning_rate": 7.784515023805328e-05, + "loss": 2.8031, + "step": 5754 + }, + { + "epoch": 0.5209559156332035, + "grad_norm": 0.935881495475769, + "learning_rate": 7.781420668692116e-05, + "loss": 2.7467, + "step": 5755 + }, + { + "epoch": 0.5210464379469539, + "grad_norm": 0.8071610927581787, + "learning_rate": 7.778326536989779e-05, + "loss": 2.7042, + "step": 5756 + }, + { + "epoch": 0.5211369602607042, + "grad_norm": 0.9618384838104248, + "learning_rate": 7.775232629009904e-05, + "loss": 2.7429, + "step": 5757 + }, + { + "epoch": 0.5212274825744546, + "grad_norm": 0.9782554507255554, + "learning_rate": 7.772138945064035e-05, + "loss": 2.7997, + "step": 5758 + }, + { + "epoch": 0.5213180048882049, + "grad_norm": 0.9291982054710388, + "learning_rate": 7.769045485463719e-05, + "loss": 2.7496, + "step": 5759 + }, + { + "epoch": 0.5214085272019553, + "grad_norm": 0.8440256118774414, + "learning_rate": 7.765952250520459e-05, + "loss": 2.7811, + "step": 5760 + }, + { + "epoch": 0.5214990495157056, + "grad_norm": 0.8678810596466064, + "learning_rate": 7.762859240545748e-05, + "loss": 2.7768, + "step": 5761 + }, + { + "epoch": 0.521589571829456, + "grad_norm": 0.8405066132545471, + "learning_rate": 7.759766455851046e-05, + "loss": 2.7836, + "step": 5762 + }, + { + "epoch": 0.5216800941432063, + "grad_norm": 0.8501881957054138, + "learning_rate": 7.756673896747805e-05, + "loss": 2.7211, + "step": 5763 + }, + { + "epoch": 0.5217706164569567, + "grad_norm": 0.9259394407272339, + "learning_rate": 7.753581563547441e-05, + "loss": 2.7372, + "step": 5764 + }, + { + "epoch": 0.521861138770707, + "grad_norm": 0.8363810181617737, + "learning_rate": 7.750489456561352e-05, + "loss": 2.7835, + "step": 5765 + }, + { + "epoch": 0.5219516610844573, + "grad_norm": 0.8172920942306519, + "learning_rate": 7.74739757610091e-05, + "loss": 2.8003, + "step": 5766 + }, + { + "epoch": 0.5220421833982076, + "grad_norm": 0.8599033355712891, + "learning_rate": 7.744305922477474e-05, + "loss": 2.7674, + "step": 5767 + }, + { + "epoch": 0.522132705711958, + "grad_norm": 0.853209376335144, + "learning_rate": 7.741214496002368e-05, + "loss": 2.7579, + "step": 5768 + }, + { + "epoch": 0.5222232280257083, + "grad_norm": 0.8500555753707886, + "learning_rate": 7.738123296986903e-05, + "loss": 2.7062, + "step": 5769 + }, + { + "epoch": 0.5223137503394587, + "grad_norm": 0.8476354479789734, + "learning_rate": 7.735032325742355e-05, + "loss": 2.818, + "step": 5770 + }, + { + "epoch": 0.522404272653209, + "grad_norm": 0.8359866738319397, + "learning_rate": 7.731941582579998e-05, + "loss": 2.7028, + "step": 5771 + }, + { + "epoch": 0.5224947949669594, + "grad_norm": 0.8658446073532104, + "learning_rate": 7.728851067811054e-05, + "loss": 2.7431, + "step": 5772 + }, + { + "epoch": 0.5225853172807097, + "grad_norm": 0.9646530151367188, + "learning_rate": 7.72576078174675e-05, + "loss": 2.705, + "step": 5773 + }, + { + "epoch": 0.5226758395944601, + "grad_norm": 0.8622293472290039, + "learning_rate": 7.72267072469827e-05, + "loss": 2.7561, + "step": 5774 + }, + { + "epoch": 0.5227663619082104, + "grad_norm": 0.7764016389846802, + "learning_rate": 7.719580896976788e-05, + "loss": 2.75, + "step": 5775 + }, + { + "epoch": 0.5228568842219607, + "grad_norm": 0.8442264199256897, + "learning_rate": 7.716491298893442e-05, + "loss": 2.6812, + "step": 5776 + }, + { + "epoch": 0.522947406535711, + "grad_norm": 0.9320610165596008, + "learning_rate": 7.713401930759365e-05, + "loss": 2.6775, + "step": 5777 + }, + { + "epoch": 0.5230379288494614, + "grad_norm": 0.8457524180412292, + "learning_rate": 7.71031279288565e-05, + "loss": 2.7454, + "step": 5778 + }, + { + "epoch": 0.5231284511632117, + "grad_norm": 0.8799865245819092, + "learning_rate": 7.707223885583375e-05, + "loss": 2.7764, + "step": 5779 + }, + { + "epoch": 0.5232189734769621, + "grad_norm": 0.8141556978225708, + "learning_rate": 7.704135209163589e-05, + "loss": 2.7949, + "step": 5780 + }, + { + "epoch": 0.5233094957907124, + "grad_norm": 0.8758169412612915, + "learning_rate": 7.701046763937328e-05, + "loss": 2.7202, + "step": 5781 + }, + { + "epoch": 0.5234000181044628, + "grad_norm": 0.8458023071289062, + "learning_rate": 7.697958550215592e-05, + "loss": 2.756, + "step": 5782 + }, + { + "epoch": 0.5234905404182131, + "grad_norm": 0.8262035846710205, + "learning_rate": 7.69487056830937e-05, + "loss": 2.7523, + "step": 5783 + }, + { + "epoch": 0.5235810627319635, + "grad_norm": 0.7745006680488586, + "learning_rate": 7.691782818529615e-05, + "loss": 2.7067, + "step": 5784 + }, + { + "epoch": 0.5236715850457138, + "grad_norm": 0.825136661529541, + "learning_rate": 7.68869530118727e-05, + "loss": 2.7196, + "step": 5785 + }, + { + "epoch": 0.5237621073594642, + "grad_norm": 0.8253481984138489, + "learning_rate": 7.685608016593244e-05, + "loss": 2.7117, + "step": 5786 + }, + { + "epoch": 0.5238526296732144, + "grad_norm": 0.8600527048110962, + "learning_rate": 7.682520965058428e-05, + "loss": 2.7612, + "step": 5787 + }, + { + "epoch": 0.5239431519869648, + "grad_norm": 0.7747024297714233, + "learning_rate": 7.679434146893685e-05, + "loss": 2.6913, + "step": 5788 + }, + { + "epoch": 0.5240336743007151, + "grad_norm": 0.8048658967018127, + "learning_rate": 7.676347562409863e-05, + "loss": 2.8, + "step": 5789 + }, + { + "epoch": 0.5241241966144655, + "grad_norm": 0.8808863162994385, + "learning_rate": 7.673261211917776e-05, + "loss": 2.7518, + "step": 5790 + }, + { + "epoch": 0.5242147189282158, + "grad_norm": 0.8000421524047852, + "learning_rate": 7.670175095728224e-05, + "loss": 2.7402, + "step": 5791 + }, + { + "epoch": 0.5243052412419661, + "grad_norm": 0.8699004054069519, + "learning_rate": 7.667089214151971e-05, + "loss": 2.8246, + "step": 5792 + }, + { + "epoch": 0.5243957635557165, + "grad_norm": 0.8132240176200867, + "learning_rate": 7.664003567499776e-05, + "loss": 2.7355, + "step": 5793 + }, + { + "epoch": 0.5244862858694668, + "grad_norm": 0.8190984725952148, + "learning_rate": 7.660918156082354e-05, + "loss": 2.6704, + "step": 5794 + }, + { + "epoch": 0.5245768081832172, + "grad_norm": 0.9430919885635376, + "learning_rate": 7.657832980210412e-05, + "loss": 2.7914, + "step": 5795 + }, + { + "epoch": 0.5246673304969675, + "grad_norm": 0.8266918659210205, + "learning_rate": 7.654748040194622e-05, + "loss": 2.7768, + "step": 5796 + }, + { + "epoch": 0.5247578528107179, + "grad_norm": 0.9308954477310181, + "learning_rate": 7.651663336345642e-05, + "loss": 2.7125, + "step": 5797 + }, + { + "epoch": 0.5248483751244681, + "grad_norm": 0.9079810380935669, + "learning_rate": 7.6485788689741e-05, + "loss": 2.761, + "step": 5798 + }, + { + "epoch": 0.5249388974382185, + "grad_norm": 0.9260097742080688, + "learning_rate": 7.645494638390603e-05, + "loss": 2.7153, + "step": 5799 + }, + { + "epoch": 0.5250294197519688, + "grad_norm": 0.8457943797111511, + "learning_rate": 7.642410644905726e-05, + "loss": 2.7854, + "step": 5800 + }, + { + "epoch": 0.5251199420657192, + "grad_norm": 0.8552992343902588, + "learning_rate": 7.639326888830039e-05, + "loss": 2.7727, + "step": 5801 + }, + { + "epoch": 0.5252104643794695, + "grad_norm": 0.9081202745437622, + "learning_rate": 7.636243370474066e-05, + "loss": 2.7699, + "step": 5802 + }, + { + "epoch": 0.5253009866932199, + "grad_norm": 0.9340787529945374, + "learning_rate": 7.633160090148323e-05, + "loss": 2.7689, + "step": 5803 + }, + { + "epoch": 0.5253915090069702, + "grad_norm": 0.8104767799377441, + "learning_rate": 7.630077048163292e-05, + "loss": 2.7085, + "step": 5804 + }, + { + "epoch": 0.5254820313207206, + "grad_norm": 0.907471776008606, + "learning_rate": 7.626994244829441e-05, + "loss": 2.7188, + "step": 5805 + }, + { + "epoch": 0.5255725536344709, + "grad_norm": 0.8070109486579895, + "learning_rate": 7.623911680457198e-05, + "loss": 2.6972, + "step": 5806 + }, + { + "epoch": 0.5256630759482213, + "grad_norm": 0.7986403107643127, + "learning_rate": 7.620829355356989e-05, + "loss": 2.7255, + "step": 5807 + }, + { + "epoch": 0.5257535982619715, + "grad_norm": 0.893733561038971, + "learning_rate": 7.617747269839198e-05, + "loss": 2.7295, + "step": 5808 + }, + { + "epoch": 0.5258441205757219, + "grad_norm": 1.015689730644226, + "learning_rate": 7.614665424214193e-05, + "loss": 2.7273, + "step": 5809 + }, + { + "epoch": 0.5259346428894722, + "grad_norm": 0.8221648931503296, + "learning_rate": 7.611583818792311e-05, + "loss": 2.7565, + "step": 5810 + }, + { + "epoch": 0.5260251652032226, + "grad_norm": 0.8100350499153137, + "learning_rate": 7.608502453883876e-05, + "loss": 2.7337, + "step": 5811 + }, + { + "epoch": 0.5261156875169729, + "grad_norm": 0.9273995161056519, + "learning_rate": 7.605421329799176e-05, + "loss": 2.7955, + "step": 5812 + }, + { + "epoch": 0.5262062098307233, + "grad_norm": 0.8550669550895691, + "learning_rate": 7.602340446848486e-05, + "loss": 2.7722, + "step": 5813 + }, + { + "epoch": 0.5262967321444736, + "grad_norm": 0.9372648596763611, + "learning_rate": 7.599259805342043e-05, + "loss": 2.7734, + "step": 5814 + }, + { + "epoch": 0.526387254458224, + "grad_norm": 0.8486389517784119, + "learning_rate": 7.596179405590076e-05, + "loss": 2.7522, + "step": 5815 + }, + { + "epoch": 0.5264777767719743, + "grad_norm": 0.9426712393760681, + "learning_rate": 7.593099247902775e-05, + "loss": 2.7529, + "step": 5816 + }, + { + "epoch": 0.5265682990857247, + "grad_norm": 0.8096632361412048, + "learning_rate": 7.590019332590315e-05, + "loss": 2.7477, + "step": 5817 + }, + { + "epoch": 0.526658821399475, + "grad_norm": 0.89849454164505, + "learning_rate": 7.58693965996284e-05, + "loss": 2.7255, + "step": 5818 + }, + { + "epoch": 0.5267493437132253, + "grad_norm": 0.8777953386306763, + "learning_rate": 7.583860230330478e-05, + "loss": 2.7483, + "step": 5819 + }, + { + "epoch": 0.5268398660269756, + "grad_norm": 0.8368728160858154, + "learning_rate": 7.580781044003324e-05, + "loss": 2.7536, + "step": 5820 + }, + { + "epoch": 0.526930388340726, + "grad_norm": 0.9228582978248596, + "learning_rate": 7.577702101291453e-05, + "loss": 2.7164, + "step": 5821 + }, + { + "epoch": 0.5270209106544763, + "grad_norm": 0.8236722350120544, + "learning_rate": 7.574623402504914e-05, + "loss": 2.7076, + "step": 5822 + }, + { + "epoch": 0.5271114329682267, + "grad_norm": 0.8082314133644104, + "learning_rate": 7.571544947953728e-05, + "loss": 2.7056, + "step": 5823 + }, + { + "epoch": 0.527201955281977, + "grad_norm": 0.9416561722755432, + "learning_rate": 7.568466737947905e-05, + "loss": 2.785, + "step": 5824 + }, + { + "epoch": 0.5272924775957274, + "grad_norm": 0.8790504336357117, + "learning_rate": 7.565388772797411e-05, + "loss": 2.767, + "step": 5825 + }, + { + "epoch": 0.5273829999094777, + "grad_norm": 0.875694215297699, + "learning_rate": 7.562311052812206e-05, + "loss": 2.7041, + "step": 5826 + }, + { + "epoch": 0.5274735222232281, + "grad_norm": 0.8280487656593323, + "learning_rate": 7.559233578302204e-05, + "loss": 2.6991, + "step": 5827 + }, + { + "epoch": 0.5275640445369784, + "grad_norm": 0.9365737438201904, + "learning_rate": 7.556156349577319e-05, + "loss": 2.8038, + "step": 5828 + }, + { + "epoch": 0.5276545668507288, + "grad_norm": 0.911470890045166, + "learning_rate": 7.553079366947421e-05, + "loss": 2.7747, + "step": 5829 + }, + { + "epoch": 0.527745089164479, + "grad_norm": 0.802963137626648, + "learning_rate": 7.550002630722366e-05, + "loss": 2.6946, + "step": 5830 + }, + { + "epoch": 0.5278356114782294, + "grad_norm": 0.9103678464889526, + "learning_rate": 7.546926141211974e-05, + "loss": 2.7042, + "step": 5831 + }, + { + "epoch": 0.5279261337919797, + "grad_norm": 0.9274582862854004, + "learning_rate": 7.54384989872606e-05, + "loss": 2.7517, + "step": 5832 + }, + { + "epoch": 0.52801665610573, + "grad_norm": 0.8614877462387085, + "learning_rate": 7.540773903574386e-05, + "loss": 2.7453, + "step": 5833 + }, + { + "epoch": 0.5281071784194804, + "grad_norm": 0.888545036315918, + "learning_rate": 7.537698156066718e-05, + "loss": 2.6807, + "step": 5834 + }, + { + "epoch": 0.5281977007332307, + "grad_norm": 0.8485361337661743, + "learning_rate": 7.534622656512776e-05, + "loss": 2.7216, + "step": 5835 + }, + { + "epoch": 0.5282882230469811, + "grad_norm": 0.947659432888031, + "learning_rate": 7.531547405222268e-05, + "loss": 2.794, + "step": 5836 + }, + { + "epoch": 0.5283787453607314, + "grad_norm": 0.8418532609939575, + "learning_rate": 7.528472402504862e-05, + "loss": 2.7598, + "step": 5837 + }, + { + "epoch": 0.5284692676744818, + "grad_norm": 0.8268518447875977, + "learning_rate": 7.525397648670225e-05, + "loss": 2.7805, + "step": 5838 + }, + { + "epoch": 0.528559789988232, + "grad_norm": 0.8547778129577637, + "learning_rate": 7.522323144027972e-05, + "loss": 2.7813, + "step": 5839 + }, + { + "epoch": 0.5286503123019825, + "grad_norm": 0.8886696696281433, + "learning_rate": 7.519248888887716e-05, + "loss": 2.736, + "step": 5840 + }, + { + "epoch": 0.5287408346157327, + "grad_norm": 0.8827240467071533, + "learning_rate": 7.516174883559022e-05, + "loss": 2.6714, + "step": 5841 + }, + { + "epoch": 0.5288313569294831, + "grad_norm": 0.8961451649665833, + "learning_rate": 7.513101128351454e-05, + "loss": 2.7882, + "step": 5842 + }, + { + "epoch": 0.5289218792432334, + "grad_norm": 0.8667140007019043, + "learning_rate": 7.510027623574532e-05, + "loss": 2.7755, + "step": 5843 + }, + { + "epoch": 0.5290124015569838, + "grad_norm": 0.810696005821228, + "learning_rate": 7.506954369537763e-05, + "loss": 2.6993, + "step": 5844 + }, + { + "epoch": 0.5291029238707341, + "grad_norm": 0.8313196301460266, + "learning_rate": 7.503881366550617e-05, + "loss": 2.6605, + "step": 5845 + }, + { + "epoch": 0.5291934461844845, + "grad_norm": 0.8745372295379639, + "learning_rate": 7.500808614922551e-05, + "loss": 2.7422, + "step": 5846 + }, + { + "epoch": 0.5292839684982348, + "grad_norm": 0.8211199641227722, + "learning_rate": 7.497736114962989e-05, + "loss": 2.7424, + "step": 5847 + }, + { + "epoch": 0.5293744908119852, + "grad_norm": 0.8483652472496033, + "learning_rate": 7.494663866981335e-05, + "loss": 2.7724, + "step": 5848 + }, + { + "epoch": 0.5294650131257355, + "grad_norm": 0.8949604630470276, + "learning_rate": 7.491591871286954e-05, + "loss": 2.7255, + "step": 5849 + }, + { + "epoch": 0.5295555354394859, + "grad_norm": 0.8604280948638916, + "learning_rate": 7.488520128189209e-05, + "loss": 2.7332, + "step": 5850 + }, + { + "epoch": 0.5296460577532361, + "grad_norm": 0.8617332577705383, + "learning_rate": 7.485448637997416e-05, + "loss": 2.7666, + "step": 5851 + }, + { + "epoch": 0.5297365800669865, + "grad_norm": 0.8203973770141602, + "learning_rate": 7.482377401020879e-05, + "loss": 2.7467, + "step": 5852 + }, + { + "epoch": 0.5298271023807368, + "grad_norm": 0.8294515013694763, + "learning_rate": 7.479306417568864e-05, + "loss": 2.7485, + "step": 5853 + }, + { + "epoch": 0.5299176246944872, + "grad_norm": 0.8514267802238464, + "learning_rate": 7.476235687950628e-05, + "loss": 2.7631, + "step": 5854 + }, + { + "epoch": 0.5300081470082375, + "grad_norm": 0.909512460231781, + "learning_rate": 7.473165212475386e-05, + "loss": 2.7867, + "step": 5855 + }, + { + "epoch": 0.5300986693219879, + "grad_norm": 0.8771156072616577, + "learning_rate": 7.470094991452341e-05, + "loss": 2.7723, + "step": 5856 + }, + { + "epoch": 0.5301891916357382, + "grad_norm": 0.9345921277999878, + "learning_rate": 7.467025025190657e-05, + "loss": 2.7522, + "step": 5857 + }, + { + "epoch": 0.5302797139494886, + "grad_norm": 0.858391523361206, + "learning_rate": 7.463955313999488e-05, + "loss": 2.7519, + "step": 5858 + }, + { + "epoch": 0.5303702362632389, + "grad_norm": 0.8921976685523987, + "learning_rate": 7.460885858187948e-05, + "loss": 2.7519, + "step": 5859 + }, + { + "epoch": 0.5304607585769893, + "grad_norm": 0.8551515340805054, + "learning_rate": 7.457816658065134e-05, + "loss": 2.6766, + "step": 5860 + }, + { + "epoch": 0.5305512808907396, + "grad_norm": 0.8405719995498657, + "learning_rate": 7.454747713940107e-05, + "loss": 2.6896, + "step": 5861 + }, + { + "epoch": 0.53064180320449, + "grad_norm": 0.8666146397590637, + "learning_rate": 7.451679026121922e-05, + "loss": 2.7501, + "step": 5862 + }, + { + "epoch": 0.5307323255182402, + "grad_norm": 0.8197934031486511, + "learning_rate": 7.448610594919586e-05, + "loss": 2.7125, + "step": 5863 + }, + { + "epoch": 0.5308228478319906, + "grad_norm": 0.8025767803192139, + "learning_rate": 7.445542420642097e-05, + "loss": 2.772, + "step": 5864 + }, + { + "epoch": 0.5309133701457409, + "grad_norm": 0.8161811232566833, + "learning_rate": 7.442474503598411e-05, + "loss": 2.7286, + "step": 5865 + }, + { + "epoch": 0.5310038924594913, + "grad_norm": 0.8732130527496338, + "learning_rate": 7.439406844097479e-05, + "loss": 2.6955, + "step": 5866 + }, + { + "epoch": 0.5310944147732416, + "grad_norm": 0.8568199872970581, + "learning_rate": 7.436339442448201e-05, + "loss": 2.7652, + "step": 5867 + }, + { + "epoch": 0.531184937086992, + "grad_norm": 0.8628947138786316, + "learning_rate": 7.433272298959475e-05, + "loss": 2.7113, + "step": 5868 + }, + { + "epoch": 0.5312754594007423, + "grad_norm": 0.8824062347412109, + "learning_rate": 7.430205413940156e-05, + "loss": 2.726, + "step": 5869 + }, + { + "epoch": 0.5313659817144927, + "grad_norm": 0.8852974772453308, + "learning_rate": 7.427138787699086e-05, + "loss": 2.7462, + "step": 5870 + }, + { + "epoch": 0.531456504028243, + "grad_norm": 0.8664782047271729, + "learning_rate": 7.424072420545063e-05, + "loss": 2.7654, + "step": 5871 + }, + { + "epoch": 0.5315470263419934, + "grad_norm": 0.8074523210525513, + "learning_rate": 7.421006312786883e-05, + "loss": 2.6719, + "step": 5872 + }, + { + "epoch": 0.5316375486557436, + "grad_norm": 0.8614087104797363, + "learning_rate": 7.417940464733293e-05, + "loss": 2.7299, + "step": 5873 + }, + { + "epoch": 0.5317280709694939, + "grad_norm": 0.8256407380104065, + "learning_rate": 7.41487487669303e-05, + "loss": 2.7517, + "step": 5874 + }, + { + "epoch": 0.5318185932832443, + "grad_norm": 0.8972731232643127, + "learning_rate": 7.411809548974792e-05, + "loss": 2.7187, + "step": 5875 + }, + { + "epoch": 0.5319091155969946, + "grad_norm": 0.875149130821228, + "learning_rate": 7.408744481887266e-05, + "loss": 2.7454, + "step": 5876 + }, + { + "epoch": 0.531999637910745, + "grad_norm": 0.8577212691307068, + "learning_rate": 7.405679675739096e-05, + "loss": 2.7422, + "step": 5877 + }, + { + "epoch": 0.5320901602244953, + "grad_norm": 0.8802106380462646, + "learning_rate": 7.402615130838917e-05, + "loss": 2.7643, + "step": 5878 + }, + { + "epoch": 0.5321806825382457, + "grad_norm": 0.9183444976806641, + "learning_rate": 7.399550847495318e-05, + "loss": 2.7689, + "step": 5879 + }, + { + "epoch": 0.532271204851996, + "grad_norm": 0.7971477508544922, + "learning_rate": 7.39648682601688e-05, + "loss": 2.7532, + "step": 5880 + }, + { + "epoch": 0.5323617271657464, + "grad_norm": 0.9035381078720093, + "learning_rate": 7.393423066712146e-05, + "loss": 2.7816, + "step": 5881 + }, + { + "epoch": 0.5324522494794967, + "grad_norm": 0.9476202130317688, + "learning_rate": 7.39035956988964e-05, + "loss": 2.7672, + "step": 5882 + }, + { + "epoch": 0.532542771793247, + "grad_norm": 0.8284057378768921, + "learning_rate": 7.387296335857851e-05, + "loss": 2.6749, + "step": 5883 + }, + { + "epoch": 0.5326332941069973, + "grad_norm": 0.8201116919517517, + "learning_rate": 7.384233364925253e-05, + "loss": 2.695, + "step": 5884 + }, + { + "epoch": 0.5327238164207477, + "grad_norm": 0.8144286274909973, + "learning_rate": 7.38117065740028e-05, + "loss": 2.7469, + "step": 5885 + }, + { + "epoch": 0.532814338734498, + "grad_norm": 0.8197963237762451, + "learning_rate": 7.378108213591355e-05, + "loss": 2.7029, + "step": 5886 + }, + { + "epoch": 0.5329048610482484, + "grad_norm": 0.8558396100997925, + "learning_rate": 7.375046033806854e-05, + "loss": 2.6771, + "step": 5887 + }, + { + "epoch": 0.5329953833619987, + "grad_norm": 0.8030669093132019, + "learning_rate": 7.371984118355152e-05, + "loss": 2.7112, + "step": 5888 + }, + { + "epoch": 0.5330859056757491, + "grad_norm": 0.8504447937011719, + "learning_rate": 7.368922467544575e-05, + "loss": 2.7953, + "step": 5889 + }, + { + "epoch": 0.5331764279894994, + "grad_norm": 0.8708440065383911, + "learning_rate": 7.365861081683433e-05, + "loss": 2.774, + "step": 5890 + }, + { + "epoch": 0.5332669503032498, + "grad_norm": 0.9638001322746277, + "learning_rate": 7.362799961080006e-05, + "loss": 2.8029, + "step": 5891 + }, + { + "epoch": 0.5333574726170001, + "grad_norm": 0.8533604145050049, + "learning_rate": 7.359739106042556e-05, + "loss": 2.7299, + "step": 5892 + }, + { + "epoch": 0.5334479949307505, + "grad_norm": 0.8586344718933105, + "learning_rate": 7.356678516879303e-05, + "loss": 2.769, + "step": 5893 + }, + { + "epoch": 0.5335385172445007, + "grad_norm": 0.8530272841453552, + "learning_rate": 7.353618193898453e-05, + "loss": 2.7365, + "step": 5894 + }, + { + "epoch": 0.5336290395582511, + "grad_norm": 0.8625940680503845, + "learning_rate": 7.350558137408174e-05, + "loss": 2.7369, + "step": 5895 + }, + { + "epoch": 0.5337195618720014, + "grad_norm": 0.8640052676200867, + "learning_rate": 7.347498347716624e-05, + "loss": 2.7248, + "step": 5896 + }, + { + "epoch": 0.5338100841857518, + "grad_norm": 0.8243191242218018, + "learning_rate": 7.344438825131911e-05, + "loss": 2.7137, + "step": 5897 + }, + { + "epoch": 0.5339006064995021, + "grad_norm": 0.8464127779006958, + "learning_rate": 7.341379569962141e-05, + "loss": 2.7535, + "step": 5898 + }, + { + "epoch": 0.5339911288132525, + "grad_norm": 0.884215235710144, + "learning_rate": 7.338320582515374e-05, + "loss": 2.7813, + "step": 5899 + }, + { + "epoch": 0.5340816511270028, + "grad_norm": 0.8700332641601562, + "learning_rate": 7.335261863099651e-05, + "loss": 2.7107, + "step": 5900 + }, + { + "epoch": 0.5341721734407532, + "grad_norm": 0.8220349550247192, + "learning_rate": 7.332203412022985e-05, + "loss": 2.6694, + "step": 5901 + }, + { + "epoch": 0.5342626957545035, + "grad_norm": 0.806977391242981, + "learning_rate": 7.329145229593364e-05, + "loss": 2.7473, + "step": 5902 + }, + { + "epoch": 0.5343532180682539, + "grad_norm": 0.8836818337440491, + "learning_rate": 7.326087316118744e-05, + "loss": 2.7675, + "step": 5903 + }, + { + "epoch": 0.5344437403820042, + "grad_norm": 0.8241246342658997, + "learning_rate": 7.323029671907061e-05, + "loss": 2.7076, + "step": 5904 + }, + { + "epoch": 0.5345342626957545, + "grad_norm": 0.876545250415802, + "learning_rate": 7.319972297266214e-05, + "loss": 2.7178, + "step": 5905 + }, + { + "epoch": 0.5346247850095048, + "grad_norm": 0.8125167489051819, + "learning_rate": 7.316915192504082e-05, + "loss": 2.7232, + "step": 5906 + }, + { + "epoch": 0.5347153073232552, + "grad_norm": 0.8703387379646301, + "learning_rate": 7.313858357928519e-05, + "loss": 2.7011, + "step": 5907 + }, + { + "epoch": 0.5348058296370055, + "grad_norm": 0.8583438396453857, + "learning_rate": 7.310801793847344e-05, + "loss": 2.7476, + "step": 5908 + }, + { + "epoch": 0.5348963519507559, + "grad_norm": 0.8597928285598755, + "learning_rate": 7.307745500568358e-05, + "loss": 2.751, + "step": 5909 + }, + { + "epoch": 0.5349868742645062, + "grad_norm": 0.8204509615898132, + "learning_rate": 7.304689478399323e-05, + "loss": 2.7064, + "step": 5910 + }, + { + "epoch": 0.5350773965782566, + "grad_norm": 0.8251175880432129, + "learning_rate": 7.301633727647985e-05, + "loss": 2.8144, + "step": 5911 + }, + { + "epoch": 0.5351679188920069, + "grad_norm": 0.7730387449264526, + "learning_rate": 7.298578248622056e-05, + "loss": 2.7158, + "step": 5912 + }, + { + "epoch": 0.5352584412057573, + "grad_norm": 0.918010413646698, + "learning_rate": 7.295523041629227e-05, + "loss": 2.7091, + "step": 5913 + }, + { + "epoch": 0.5353489635195076, + "grad_norm": 0.8758893013000488, + "learning_rate": 7.292468106977148e-05, + "loss": 2.6953, + "step": 5914 + }, + { + "epoch": 0.5354394858332578, + "grad_norm": 0.8721521496772766, + "learning_rate": 7.289413444973461e-05, + "loss": 2.7949, + "step": 5915 + }, + { + "epoch": 0.5355300081470082, + "grad_norm": 0.9021723866462708, + "learning_rate": 7.286359055925765e-05, + "loss": 2.783, + "step": 5916 + }, + { + "epoch": 0.5356205304607585, + "grad_norm": 0.8621231913566589, + "learning_rate": 7.283304940141637e-05, + "loss": 2.793, + "step": 5917 + }, + { + "epoch": 0.5357110527745089, + "grad_norm": 0.7841774225234985, + "learning_rate": 7.280251097928625e-05, + "loss": 2.7259, + "step": 5918 + }, + { + "epoch": 0.5358015750882592, + "grad_norm": 0.8686859607696533, + "learning_rate": 7.277197529594257e-05, + "loss": 2.7544, + "step": 5919 + }, + { + "epoch": 0.5358920974020096, + "grad_norm": 0.8585182428359985, + "learning_rate": 7.274144235446023e-05, + "loss": 2.7618, + "step": 5920 + }, + { + "epoch": 0.5359826197157599, + "grad_norm": 0.907319962978363, + "learning_rate": 7.27109121579139e-05, + "loss": 2.7342, + "step": 5921 + }, + { + "epoch": 0.5360731420295103, + "grad_norm": 0.81803959608078, + "learning_rate": 7.268038470937792e-05, + "loss": 2.6816, + "step": 5922 + }, + { + "epoch": 0.5361636643432606, + "grad_norm": 0.861193835735321, + "learning_rate": 7.26498600119265e-05, + "loss": 2.672, + "step": 5923 + }, + { + "epoch": 0.536254186657011, + "grad_norm": 0.8426436185836792, + "learning_rate": 7.261933806863342e-05, + "loss": 2.7048, + "step": 5924 + }, + { + "epoch": 0.5363447089707613, + "grad_norm": 0.8128862977027893, + "learning_rate": 7.258881888257227e-05, + "loss": 2.6564, + "step": 5925 + }, + { + "epoch": 0.5364352312845116, + "grad_norm": 0.8285736441612244, + "learning_rate": 7.255830245681626e-05, + "loss": 2.6867, + "step": 5926 + }, + { + "epoch": 0.5365257535982619, + "grad_norm": 0.8216138482093811, + "learning_rate": 7.252778879443849e-05, + "loss": 2.7369, + "step": 5927 + }, + { + "epoch": 0.5366162759120123, + "grad_norm": 0.7723904848098755, + "learning_rate": 7.249727789851158e-05, + "loss": 2.7313, + "step": 5928 + }, + { + "epoch": 0.5367067982257626, + "grad_norm": 0.8136193752288818, + "learning_rate": 7.24667697721081e-05, + "loss": 2.7245, + "step": 5929 + }, + { + "epoch": 0.536797320539513, + "grad_norm": 0.8116896748542786, + "learning_rate": 7.243626441830009e-05, + "loss": 2.699, + "step": 5930 + }, + { + "epoch": 0.5368878428532633, + "grad_norm": 0.7944446206092834, + "learning_rate": 7.240576184015954e-05, + "loss": 2.7374, + "step": 5931 + }, + { + "epoch": 0.5369783651670137, + "grad_norm": 0.8052989840507507, + "learning_rate": 7.237526204075797e-05, + "loss": 2.6733, + "step": 5932 + }, + { + "epoch": 0.537068887480764, + "grad_norm": 0.92603600025177, + "learning_rate": 7.234476502316682e-05, + "loss": 2.7779, + "step": 5933 + }, + { + "epoch": 0.5371594097945144, + "grad_norm": 0.9119099974632263, + "learning_rate": 7.231427079045703e-05, + "loss": 2.7242, + "step": 5934 + }, + { + "epoch": 0.5372499321082647, + "grad_norm": 0.8775193095207214, + "learning_rate": 7.228377934569945e-05, + "loss": 2.7134, + "step": 5935 + }, + { + "epoch": 0.5373404544220151, + "grad_norm": 0.8331400156021118, + "learning_rate": 7.225329069196448e-05, + "loss": 2.7262, + "step": 5936 + }, + { + "epoch": 0.5374309767357653, + "grad_norm": 0.9593919515609741, + "learning_rate": 7.222280483232242e-05, + "loss": 2.7241, + "step": 5937 + }, + { + "epoch": 0.5375214990495157, + "grad_norm": 0.8055837750434875, + "learning_rate": 7.219232176984314e-05, + "loss": 2.7373, + "step": 5938 + }, + { + "epoch": 0.537612021363266, + "grad_norm": 0.8983293175697327, + "learning_rate": 7.21618415075963e-05, + "loss": 2.7519, + "step": 5939 + }, + { + "epoch": 0.5377025436770164, + "grad_norm": 0.8614658117294312, + "learning_rate": 7.213136404865124e-05, + "loss": 2.7133, + "step": 5940 + }, + { + "epoch": 0.5377930659907667, + "grad_norm": 0.8453959822654724, + "learning_rate": 7.210088939607708e-05, + "loss": 2.7399, + "step": 5941 + }, + { + "epoch": 0.5378835883045171, + "grad_norm": 0.8339153528213501, + "learning_rate": 7.207041755294259e-05, + "loss": 2.7262, + "step": 5942 + }, + { + "epoch": 0.5379741106182674, + "grad_norm": 0.8798535466194153, + "learning_rate": 7.20399485223163e-05, + "loss": 2.6945, + "step": 5943 + }, + { + "epoch": 0.5380646329320178, + "grad_norm": 0.8549126982688904, + "learning_rate": 7.200948230726639e-05, + "loss": 2.7325, + "step": 5944 + }, + { + "epoch": 0.5381551552457681, + "grad_norm": 0.9284637570381165, + "learning_rate": 7.19790189108609e-05, + "loss": 2.7726, + "step": 5945 + }, + { + "epoch": 0.5382456775595185, + "grad_norm": 0.8560376167297363, + "learning_rate": 7.194855833616739e-05, + "loss": 2.7303, + "step": 5946 + }, + { + "epoch": 0.5383361998732688, + "grad_norm": 0.8566827178001404, + "learning_rate": 7.191810058625333e-05, + "loss": 2.7196, + "step": 5947 + }, + { + "epoch": 0.5384267221870191, + "grad_norm": 0.8862281441688538, + "learning_rate": 7.188764566418572e-05, + "loss": 2.7211, + "step": 5948 + }, + { + "epoch": 0.5385172445007694, + "grad_norm": 0.8478960990905762, + "learning_rate": 7.185719357303146e-05, + "loss": 2.7088, + "step": 5949 + }, + { + "epoch": 0.5386077668145198, + "grad_norm": 0.8626987934112549, + "learning_rate": 7.182674431585704e-05, + "loss": 2.7561, + "step": 5950 + }, + { + "epoch": 0.5386982891282701, + "grad_norm": 0.9624312520027161, + "learning_rate": 7.17962978957287e-05, + "loss": 2.7622, + "step": 5951 + }, + { + "epoch": 0.5387888114420205, + "grad_norm": 0.8094488382339478, + "learning_rate": 7.176585431571235e-05, + "loss": 2.6762, + "step": 5952 + }, + { + "epoch": 0.5388793337557708, + "grad_norm": 0.8915351033210754, + "learning_rate": 7.173541357887375e-05, + "loss": 2.7593, + "step": 5953 + }, + { + "epoch": 0.5389698560695212, + "grad_norm": 0.9364251494407654, + "learning_rate": 7.170497568827818e-05, + "loss": 2.7553, + "step": 5954 + }, + { + "epoch": 0.5390603783832715, + "grad_norm": 0.850823163986206, + "learning_rate": 7.167454064699083e-05, + "loss": 2.7318, + "step": 5955 + }, + { + "epoch": 0.5391509006970218, + "grad_norm": 0.8671566843986511, + "learning_rate": 7.16441084580764e-05, + "loss": 2.7674, + "step": 5956 + }, + { + "epoch": 0.5392414230107722, + "grad_norm": 0.8717043995857239, + "learning_rate": 7.161367912459954e-05, + "loss": 2.7091, + "step": 5957 + }, + { + "epoch": 0.5393319453245224, + "grad_norm": 0.8433679938316345, + "learning_rate": 7.158325264962438e-05, + "loss": 2.7505, + "step": 5958 + }, + { + "epoch": 0.5394224676382728, + "grad_norm": 0.8027213215827942, + "learning_rate": 7.155282903621492e-05, + "loss": 2.7469, + "step": 5959 + }, + { + "epoch": 0.5395129899520231, + "grad_norm": 0.823722779750824, + "learning_rate": 7.152240828743477e-05, + "loss": 2.7318, + "step": 5960 + }, + { + "epoch": 0.5396035122657735, + "grad_norm": 0.8178278803825378, + "learning_rate": 7.149199040634737e-05, + "loss": 2.7116, + "step": 5961 + }, + { + "epoch": 0.5396940345795238, + "grad_norm": 0.8535356521606445, + "learning_rate": 7.14615753960157e-05, + "loss": 2.7306, + "step": 5962 + }, + { + "epoch": 0.5397845568932742, + "grad_norm": 0.8493401408195496, + "learning_rate": 7.143116325950265e-05, + "loss": 2.7386, + "step": 5963 + }, + { + "epoch": 0.5398750792070245, + "grad_norm": 0.7935865521430969, + "learning_rate": 7.140075399987068e-05, + "loss": 2.7214, + "step": 5964 + }, + { + "epoch": 0.5399656015207749, + "grad_norm": 0.9051488041877747, + "learning_rate": 7.137034762018198e-05, + "loss": 2.7434, + "step": 5965 + }, + { + "epoch": 0.5400561238345252, + "grad_norm": 0.8879326581954956, + "learning_rate": 7.133994412349847e-05, + "loss": 2.7408, + "step": 5966 + }, + { + "epoch": 0.5401466461482756, + "grad_norm": 0.9167154431343079, + "learning_rate": 7.130954351288184e-05, + "loss": 2.7936, + "step": 5967 + }, + { + "epoch": 0.5402371684620259, + "grad_norm": 0.9298393726348877, + "learning_rate": 7.127914579139338e-05, + "loss": 2.7735, + "step": 5968 + }, + { + "epoch": 0.5403276907757762, + "grad_norm": 0.845268964767456, + "learning_rate": 7.124875096209416e-05, + "loss": 2.7469, + "step": 5969 + }, + { + "epoch": 0.5404182130895265, + "grad_norm": 0.9079629182815552, + "learning_rate": 7.12183590280449e-05, + "loss": 2.7276, + "step": 5970 + }, + { + "epoch": 0.5405087354032769, + "grad_norm": 0.8613841533660889, + "learning_rate": 7.118796999230614e-05, + "loss": 2.7374, + "step": 5971 + }, + { + "epoch": 0.5405992577170272, + "grad_norm": 0.8585695028305054, + "learning_rate": 7.115758385793799e-05, + "loss": 2.7661, + "step": 5972 + }, + { + "epoch": 0.5406897800307776, + "grad_norm": 0.8777322173118591, + "learning_rate": 7.112720062800038e-05, + "loss": 2.7605, + "step": 5973 + }, + { + "epoch": 0.5407803023445279, + "grad_norm": 0.8847030997276306, + "learning_rate": 7.109682030555283e-05, + "loss": 2.714, + "step": 5974 + }, + { + "epoch": 0.5408708246582783, + "grad_norm": 0.8644956350326538, + "learning_rate": 7.106644289365473e-05, + "loss": 2.7303, + "step": 5975 + }, + { + "epoch": 0.5409613469720286, + "grad_norm": 0.8250646591186523, + "learning_rate": 7.103606839536502e-05, + "loss": 2.7247, + "step": 5976 + }, + { + "epoch": 0.541051869285779, + "grad_norm": 0.8640917539596558, + "learning_rate": 7.100569681374245e-05, + "loss": 2.7778, + "step": 5977 + }, + { + "epoch": 0.5411423915995293, + "grad_norm": 0.864972710609436, + "learning_rate": 7.097532815184538e-05, + "loss": 2.6894, + "step": 5978 + }, + { + "epoch": 0.5412329139132797, + "grad_norm": 0.8219044804573059, + "learning_rate": 7.094496241273199e-05, + "loss": 2.7022, + "step": 5979 + }, + { + "epoch": 0.5413234362270299, + "grad_norm": 0.8397749066352844, + "learning_rate": 7.09145995994601e-05, + "loss": 2.7996, + "step": 5980 + }, + { + "epoch": 0.5414139585407803, + "grad_norm": 0.8366591930389404, + "learning_rate": 7.088423971508724e-05, + "loss": 2.7352, + "step": 5981 + }, + { + "epoch": 0.5415044808545306, + "grad_norm": 0.8768446445465088, + "learning_rate": 7.08538827626706e-05, + "loss": 2.7229, + "step": 5982 + }, + { + "epoch": 0.541595003168281, + "grad_norm": 0.8540074825286865, + "learning_rate": 7.082352874526721e-05, + "loss": 2.7058, + "step": 5983 + }, + { + "epoch": 0.5416855254820313, + "grad_norm": 0.8582448363304138, + "learning_rate": 7.079317766593365e-05, + "loss": 2.7219, + "step": 5984 + }, + { + "epoch": 0.5417760477957817, + "grad_norm": 0.9049850106239319, + "learning_rate": 7.076282952772633e-05, + "loss": 2.7804, + "step": 5985 + }, + { + "epoch": 0.541866570109532, + "grad_norm": 0.8414204716682434, + "learning_rate": 7.073248433370124e-05, + "loss": 2.7502, + "step": 5986 + }, + { + "epoch": 0.5419570924232824, + "grad_norm": 0.9655411839485168, + "learning_rate": 7.070214208691422e-05, + "loss": 2.7796, + "step": 5987 + }, + { + "epoch": 0.5420476147370327, + "grad_norm": 0.9256438612937927, + "learning_rate": 7.067180279042066e-05, + "loss": 2.7149, + "step": 5988 + }, + { + "epoch": 0.5421381370507831, + "grad_norm": 0.8467999696731567, + "learning_rate": 7.064146644727579e-05, + "loss": 2.6974, + "step": 5989 + }, + { + "epoch": 0.5422286593645333, + "grad_norm": 0.883070170879364, + "learning_rate": 7.061113306053443e-05, + "loss": 2.745, + "step": 5990 + }, + { + "epoch": 0.5423191816782837, + "grad_norm": 0.9323094487190247, + "learning_rate": 7.058080263325115e-05, + "loss": 2.8148, + "step": 5991 + }, + { + "epoch": 0.542409703992034, + "grad_norm": 0.8571617007255554, + "learning_rate": 7.055047516848025e-05, + "loss": 2.7616, + "step": 5992 + }, + { + "epoch": 0.5425002263057844, + "grad_norm": 0.911531925201416, + "learning_rate": 7.052015066927568e-05, + "loss": 2.7568, + "step": 5993 + }, + { + "epoch": 0.5425907486195347, + "grad_norm": 0.8240039944648743, + "learning_rate": 7.048982913869116e-05, + "loss": 2.7021, + "step": 5994 + }, + { + "epoch": 0.5426812709332851, + "grad_norm": 0.8227641582489014, + "learning_rate": 7.045951057978e-05, + "loss": 2.7268, + "step": 5995 + }, + { + "epoch": 0.5427717932470354, + "grad_norm": 0.8382290601730347, + "learning_rate": 7.042919499559537e-05, + "loss": 2.7026, + "step": 5996 + }, + { + "epoch": 0.5428623155607857, + "grad_norm": 0.9181439876556396, + "learning_rate": 7.039888238918993e-05, + "loss": 2.7422, + "step": 5997 + }, + { + "epoch": 0.5429528378745361, + "grad_norm": 0.8383064866065979, + "learning_rate": 7.036857276361627e-05, + "loss": 2.7658, + "step": 5998 + }, + { + "epoch": 0.5430433601882864, + "grad_norm": 1.049307942390442, + "learning_rate": 7.03382661219265e-05, + "loss": 2.7369, + "step": 5999 + }, + { + "epoch": 0.5431338825020368, + "grad_norm": 0.8651420474052429, + "learning_rate": 7.030796246717255e-05, + "loss": 2.6735, + "step": 6000 + }, + { + "epoch": 0.5431338825020368, + "eval_loss": 2.6711881160736084, + "eval_runtime": 71.67, + "eval_samples_per_second": 37.715, + "eval_steps_per_second": 3.153, + "step": 6000 + }, + { + "epoch": 0.543224404815787, + "grad_norm": 0.8109533786773682, + "learning_rate": 7.027766180240592e-05, + "loss": 2.7178, + "step": 6001 + }, + { + "epoch": 0.5433149271295374, + "grad_norm": 0.845350980758667, + "learning_rate": 7.024736413067796e-05, + "loss": 2.7571, + "step": 6002 + }, + { + "epoch": 0.5434054494432877, + "grad_norm": 0.82073575258255, + "learning_rate": 7.02170694550396e-05, + "loss": 2.7131, + "step": 6003 + }, + { + "epoch": 0.5434959717570381, + "grad_norm": 0.8369652032852173, + "learning_rate": 7.018677777854157e-05, + "loss": 2.7398, + "step": 6004 + }, + { + "epoch": 0.5435864940707884, + "grad_norm": 0.8532238602638245, + "learning_rate": 7.015648910423416e-05, + "loss": 2.7892, + "step": 6005 + }, + { + "epoch": 0.5436770163845388, + "grad_norm": 0.7869599461555481, + "learning_rate": 7.012620343516751e-05, + "loss": 2.6773, + "step": 6006 + }, + { + "epoch": 0.5437675386982891, + "grad_norm": 0.820217490196228, + "learning_rate": 7.009592077439134e-05, + "loss": 2.7094, + "step": 6007 + }, + { + "epoch": 0.5438580610120395, + "grad_norm": 0.8736022710800171, + "learning_rate": 7.006564112495516e-05, + "loss": 2.7607, + "step": 6008 + }, + { + "epoch": 0.5439485833257898, + "grad_norm": 0.8576371669769287, + "learning_rate": 7.003536448990804e-05, + "loss": 2.7191, + "step": 6009 + }, + { + "epoch": 0.5440391056395402, + "grad_norm": 0.8501553535461426, + "learning_rate": 7.000509087229895e-05, + "loss": 2.7255, + "step": 6010 + }, + { + "epoch": 0.5441296279532905, + "grad_norm": 0.8708006143569946, + "learning_rate": 6.997482027517637e-05, + "loss": 2.7492, + "step": 6011 + }, + { + "epoch": 0.5442201502670408, + "grad_norm": 0.9946036338806152, + "learning_rate": 6.994455270158858e-05, + "loss": 2.7547, + "step": 6012 + }, + { + "epoch": 0.5443106725807911, + "grad_norm": 0.8790095448493958, + "learning_rate": 6.991428815458347e-05, + "loss": 2.8004, + "step": 6013 + }, + { + "epoch": 0.5444011948945415, + "grad_norm": 0.8968866467475891, + "learning_rate": 6.988402663720877e-05, + "loss": 2.7249, + "step": 6014 + }, + { + "epoch": 0.5444917172082918, + "grad_norm": 0.847731351852417, + "learning_rate": 6.985376815251173e-05, + "loss": 2.6742, + "step": 6015 + }, + { + "epoch": 0.5445822395220422, + "grad_norm": 0.9313018918037415, + "learning_rate": 6.982351270353944e-05, + "loss": 2.6773, + "step": 6016 + }, + { + "epoch": 0.5446727618357925, + "grad_norm": 0.8626921772956848, + "learning_rate": 6.979326029333855e-05, + "loss": 2.6568, + "step": 6017 + }, + { + "epoch": 0.5447632841495429, + "grad_norm": 0.873760461807251, + "learning_rate": 6.976301092495556e-05, + "loss": 2.7645, + "step": 6018 + }, + { + "epoch": 0.5448538064632932, + "grad_norm": 0.8684285283088684, + "learning_rate": 6.973276460143653e-05, + "loss": 2.7056, + "step": 6019 + }, + { + "epoch": 0.5449443287770436, + "grad_norm": 0.8305872082710266, + "learning_rate": 6.970252132582728e-05, + "loss": 2.7385, + "step": 6020 + }, + { + "epoch": 0.5450348510907939, + "grad_norm": 0.8480159640312195, + "learning_rate": 6.967228110117328e-05, + "loss": 2.6805, + "step": 6021 + }, + { + "epoch": 0.5451253734045443, + "grad_norm": 0.8201212286949158, + "learning_rate": 6.964204393051981e-05, + "loss": 2.7454, + "step": 6022 + }, + { + "epoch": 0.5452158957182945, + "grad_norm": 0.8432485461235046, + "learning_rate": 6.961180981691162e-05, + "loss": 2.7223, + "step": 6023 + }, + { + "epoch": 0.5453064180320449, + "grad_norm": 0.9415342211723328, + "learning_rate": 6.958157876339337e-05, + "loss": 2.7548, + "step": 6024 + }, + { + "epoch": 0.5453969403457952, + "grad_norm": 0.8638390302658081, + "learning_rate": 6.955135077300931e-05, + "loss": 2.755, + "step": 6025 + }, + { + "epoch": 0.5454874626595456, + "grad_norm": 0.9359387159347534, + "learning_rate": 6.952112584880341e-05, + "loss": 2.7935, + "step": 6026 + }, + { + "epoch": 0.5455779849732959, + "grad_norm": 0.8699626326560974, + "learning_rate": 6.949090399381925e-05, + "loss": 2.7137, + "step": 6027 + }, + { + "epoch": 0.5456685072870463, + "grad_norm": 0.8389036655426025, + "learning_rate": 6.946068521110028e-05, + "loss": 2.7469, + "step": 6028 + }, + { + "epoch": 0.5457590296007966, + "grad_norm": 0.8703330159187317, + "learning_rate": 6.943046950368944e-05, + "loss": 2.7258, + "step": 6029 + }, + { + "epoch": 0.545849551914547, + "grad_norm": 0.8876437544822693, + "learning_rate": 6.940025687462952e-05, + "loss": 2.7863, + "step": 6030 + }, + { + "epoch": 0.5459400742282973, + "grad_norm": 0.8767717480659485, + "learning_rate": 6.937004732696285e-05, + "loss": 2.7403, + "step": 6031 + }, + { + "epoch": 0.5460305965420477, + "grad_norm": 0.8453347086906433, + "learning_rate": 6.93398408637316e-05, + "loss": 2.7127, + "step": 6032 + }, + { + "epoch": 0.546121118855798, + "grad_norm": 0.8377524018287659, + "learning_rate": 6.930963748797754e-05, + "loss": 2.7375, + "step": 6033 + }, + { + "epoch": 0.5462116411695483, + "grad_norm": 0.8833092451095581, + "learning_rate": 6.927943720274215e-05, + "loss": 2.7563, + "step": 6034 + }, + { + "epoch": 0.5463021634832986, + "grad_norm": 0.8684247732162476, + "learning_rate": 6.924924001106654e-05, + "loss": 2.7916, + "step": 6035 + }, + { + "epoch": 0.546392685797049, + "grad_norm": 0.8185412287712097, + "learning_rate": 6.921904591599169e-05, + "loss": 2.6799, + "step": 6036 + }, + { + "epoch": 0.5464832081107993, + "grad_norm": 0.8553128838539124, + "learning_rate": 6.918885492055803e-05, + "loss": 2.6648, + "step": 6037 + }, + { + "epoch": 0.5465737304245496, + "grad_norm": 0.8351742625236511, + "learning_rate": 6.915866702780586e-05, + "loss": 2.7457, + "step": 6038 + }, + { + "epoch": 0.5466642527383, + "grad_norm": 0.8689993619918823, + "learning_rate": 6.912848224077506e-05, + "loss": 2.691, + "step": 6039 + }, + { + "epoch": 0.5467547750520503, + "grad_norm": 0.788202166557312, + "learning_rate": 6.909830056250527e-05, + "loss": 2.6985, + "step": 6040 + }, + { + "epoch": 0.5468452973658007, + "grad_norm": 0.8690296411514282, + "learning_rate": 6.906812199603575e-05, + "loss": 2.7368, + "step": 6041 + }, + { + "epoch": 0.546935819679551, + "grad_norm": 0.8578805327415466, + "learning_rate": 6.903794654440551e-05, + "loss": 2.7074, + "step": 6042 + }, + { + "epoch": 0.5470263419933014, + "grad_norm": 0.8935035467147827, + "learning_rate": 6.900777421065319e-05, + "loss": 2.7643, + "step": 6043 + }, + { + "epoch": 0.5471168643070516, + "grad_norm": 0.7811352610588074, + "learning_rate": 6.897760499781718e-05, + "loss": 2.6852, + "step": 6044 + }, + { + "epoch": 0.547207386620802, + "grad_norm": 0.7939018607139587, + "learning_rate": 6.894743890893548e-05, + "loss": 2.7098, + "step": 6045 + }, + { + "epoch": 0.5472979089345523, + "grad_norm": 0.8887366056442261, + "learning_rate": 6.891727594704587e-05, + "loss": 2.7744, + "step": 6046 + }, + { + "epoch": 0.5473884312483027, + "grad_norm": 0.8419579267501831, + "learning_rate": 6.888711611518568e-05, + "loss": 2.7, + "step": 6047 + }, + { + "epoch": 0.547478953562053, + "grad_norm": 0.771739661693573, + "learning_rate": 6.885695941639207e-05, + "loss": 2.6992, + "step": 6048 + }, + { + "epoch": 0.5475694758758034, + "grad_norm": 0.9229654669761658, + "learning_rate": 6.882680585370178e-05, + "loss": 2.7491, + "step": 6049 + }, + { + "epoch": 0.5476599981895537, + "grad_norm": 0.8693364858627319, + "learning_rate": 6.87966554301513e-05, + "loss": 2.7429, + "step": 6050 + }, + { + "epoch": 0.5477505205033041, + "grad_norm": 0.8253883719444275, + "learning_rate": 6.876650814877674e-05, + "loss": 2.6733, + "step": 6051 + }, + { + "epoch": 0.5478410428170544, + "grad_norm": 0.8329223394393921, + "learning_rate": 6.873636401261401e-05, + "loss": 2.7187, + "step": 6052 + }, + { + "epoch": 0.5479315651308048, + "grad_norm": 0.8911111354827881, + "learning_rate": 6.87062230246985e-05, + "loss": 2.7515, + "step": 6053 + }, + { + "epoch": 0.548022087444555, + "grad_norm": 0.8329285979270935, + "learning_rate": 6.867608518806552e-05, + "loss": 2.7343, + "step": 6054 + }, + { + "epoch": 0.5481126097583054, + "grad_norm": 0.8729497194290161, + "learning_rate": 6.864595050574989e-05, + "loss": 2.7164, + "step": 6055 + }, + { + "epoch": 0.5482031320720557, + "grad_norm": 0.9158942699432373, + "learning_rate": 6.861581898078619e-05, + "loss": 2.7189, + "step": 6056 + }, + { + "epoch": 0.5482936543858061, + "grad_norm": 0.8531440496444702, + "learning_rate": 6.858569061620862e-05, + "loss": 2.6761, + "step": 6057 + }, + { + "epoch": 0.5483841766995564, + "grad_norm": 0.8701982498168945, + "learning_rate": 6.855556541505122e-05, + "loss": 2.7406, + "step": 6058 + }, + { + "epoch": 0.5484746990133068, + "grad_norm": 0.8322367668151855, + "learning_rate": 6.852544338034748e-05, + "loss": 2.7331, + "step": 6059 + }, + { + "epoch": 0.5485652213270571, + "grad_norm": 0.8104520440101624, + "learning_rate": 6.849532451513074e-05, + "loss": 2.6839, + "step": 6060 + }, + { + "epoch": 0.5486557436408075, + "grad_norm": 0.8525737524032593, + "learning_rate": 6.846520882243393e-05, + "loss": 2.6964, + "step": 6061 + }, + { + "epoch": 0.5487462659545578, + "grad_norm": 0.8493409156799316, + "learning_rate": 6.843509630528977e-05, + "loss": 2.756, + "step": 6062 + }, + { + "epoch": 0.5488367882683082, + "grad_norm": 0.829964816570282, + "learning_rate": 6.840498696673053e-05, + "loss": 2.7348, + "step": 6063 + }, + { + "epoch": 0.5489273105820585, + "grad_norm": 0.8463034629821777, + "learning_rate": 6.837488080978824e-05, + "loss": 2.6979, + "step": 6064 + }, + { + "epoch": 0.5490178328958089, + "grad_norm": 0.8787871599197388, + "learning_rate": 6.834477783749455e-05, + "loss": 2.7413, + "step": 6065 + }, + { + "epoch": 0.5491083552095591, + "grad_norm": 0.8388311862945557, + "learning_rate": 6.831467805288092e-05, + "loss": 2.6772, + "step": 6066 + }, + { + "epoch": 0.5491988775233095, + "grad_norm": 0.8327917456626892, + "learning_rate": 6.828458145897832e-05, + "loss": 2.6996, + "step": 6067 + }, + { + "epoch": 0.5492893998370598, + "grad_norm": 0.9191029071807861, + "learning_rate": 6.825448805881752e-05, + "loss": 2.7829, + "step": 6068 + }, + { + "epoch": 0.5493799221508102, + "grad_norm": 0.7828680872917175, + "learning_rate": 6.822439785542886e-05, + "loss": 2.7264, + "step": 6069 + }, + { + "epoch": 0.5494704444645605, + "grad_norm": 0.8679816722869873, + "learning_rate": 6.819431085184251e-05, + "loss": 2.689, + "step": 6070 + }, + { + "epoch": 0.5495609667783109, + "grad_norm": 0.913976788520813, + "learning_rate": 6.816422705108817e-05, + "loss": 2.7366, + "step": 6071 + }, + { + "epoch": 0.5496514890920612, + "grad_norm": 0.7898628115653992, + "learning_rate": 6.813414645619531e-05, + "loss": 2.6953, + "step": 6072 + }, + { + "epoch": 0.5497420114058116, + "grad_norm": 0.8191077709197998, + "learning_rate": 6.8104069070193e-05, + "loss": 2.6626, + "step": 6073 + }, + { + "epoch": 0.5498325337195619, + "grad_norm": 0.8287845849990845, + "learning_rate": 6.807399489611009e-05, + "loss": 2.6873, + "step": 6074 + }, + { + "epoch": 0.5499230560333123, + "grad_norm": 0.8519953489303589, + "learning_rate": 6.804392393697502e-05, + "loss": 2.7353, + "step": 6075 + }, + { + "epoch": 0.5500135783470625, + "grad_norm": 0.9047031402587891, + "learning_rate": 6.801385619581592e-05, + "loss": 2.6962, + "step": 6076 + }, + { + "epoch": 0.550104100660813, + "grad_norm": 0.8621162176132202, + "learning_rate": 6.798379167566064e-05, + "loss": 2.6707, + "step": 6077 + }, + { + "epoch": 0.5501946229745632, + "grad_norm": 0.8716885447502136, + "learning_rate": 6.795373037953663e-05, + "loss": 2.7579, + "step": 6078 + }, + { + "epoch": 0.5502851452883135, + "grad_norm": 0.777885377407074, + "learning_rate": 6.792367231047112e-05, + "loss": 2.6918, + "step": 6079 + }, + { + "epoch": 0.5503756676020639, + "grad_norm": 0.9372280836105347, + "learning_rate": 6.789361747149093e-05, + "loss": 2.7738, + "step": 6080 + }, + { + "epoch": 0.5504661899158142, + "grad_norm": 0.8034177422523499, + "learning_rate": 6.786356586562259e-05, + "loss": 2.6903, + "step": 6081 + }, + { + "epoch": 0.5505567122295646, + "grad_norm": 0.7964674234390259, + "learning_rate": 6.783351749589225e-05, + "loss": 2.6633, + "step": 6082 + }, + { + "epoch": 0.5506472345433149, + "grad_norm": 0.8511160612106323, + "learning_rate": 6.780347236532588e-05, + "loss": 2.7443, + "step": 6083 + }, + { + "epoch": 0.5507377568570653, + "grad_norm": 0.8763010501861572, + "learning_rate": 6.77734304769489e-05, + "loss": 2.7221, + "step": 6084 + }, + { + "epoch": 0.5508282791708156, + "grad_norm": 0.8387385606765747, + "learning_rate": 6.774339183378663e-05, + "loss": 2.7448, + "step": 6085 + }, + { + "epoch": 0.550918801484566, + "grad_norm": 0.8287498950958252, + "learning_rate": 6.771335643886389e-05, + "loss": 2.6757, + "step": 6086 + }, + { + "epoch": 0.5510093237983162, + "grad_norm": 0.8377882242202759, + "learning_rate": 6.768332429520532e-05, + "loss": 2.6849, + "step": 6087 + }, + { + "epoch": 0.5510998461120666, + "grad_norm": 0.8893755078315735, + "learning_rate": 6.765329540583504e-05, + "loss": 2.7471, + "step": 6088 + }, + { + "epoch": 0.5511903684258169, + "grad_norm": 1.009269118309021, + "learning_rate": 6.762326977377709e-05, + "loss": 2.6596, + "step": 6089 + }, + { + "epoch": 0.5512808907395673, + "grad_norm": 0.9541810154914856, + "learning_rate": 6.759324740205495e-05, + "loss": 2.7035, + "step": 6090 + }, + { + "epoch": 0.5513714130533176, + "grad_norm": 0.808933436870575, + "learning_rate": 6.756322829369195e-05, + "loss": 2.7264, + "step": 6091 + }, + { + "epoch": 0.551461935367068, + "grad_norm": 0.9128525257110596, + "learning_rate": 6.753321245171092e-05, + "loss": 2.7286, + "step": 6092 + }, + { + "epoch": 0.5515524576808183, + "grad_norm": 0.9354084134101868, + "learning_rate": 6.750319987913453e-05, + "loss": 2.7369, + "step": 6093 + }, + { + "epoch": 0.5516429799945687, + "grad_norm": 0.8727840781211853, + "learning_rate": 6.747319057898503e-05, + "loss": 2.7014, + "step": 6094 + }, + { + "epoch": 0.551733502308319, + "grad_norm": 0.8667216897010803, + "learning_rate": 6.744318455428436e-05, + "loss": 2.7354, + "step": 6095 + }, + { + "epoch": 0.5518240246220694, + "grad_norm": 1.0342938899993896, + "learning_rate": 6.741318180805407e-05, + "loss": 2.7443, + "step": 6096 + }, + { + "epoch": 0.5519145469358196, + "grad_norm": 0.7890323400497437, + "learning_rate": 6.738318234331554e-05, + "loss": 2.6711, + "step": 6097 + }, + { + "epoch": 0.55200506924957, + "grad_norm": 0.8307895064353943, + "learning_rate": 6.735318616308961e-05, + "loss": 2.6776, + "step": 6098 + }, + { + "epoch": 0.5520955915633203, + "grad_norm": 0.8395517468452454, + "learning_rate": 6.732319327039698e-05, + "loss": 2.6839, + "step": 6099 + }, + { + "epoch": 0.5521861138770707, + "grad_norm": 0.855904221534729, + "learning_rate": 6.729320366825784e-05, + "loss": 2.7223, + "step": 6100 + }, + { + "epoch": 0.552276636190821, + "grad_norm": 0.8447253108024597, + "learning_rate": 6.726321735969223e-05, + "loss": 2.7251, + "step": 6101 + }, + { + "epoch": 0.5523671585045714, + "grad_norm": 0.8225886225700378, + "learning_rate": 6.723323434771974e-05, + "loss": 2.7384, + "step": 6102 + }, + { + "epoch": 0.5524576808183217, + "grad_norm": 0.869172990322113, + "learning_rate": 6.720325463535966e-05, + "loss": 2.7413, + "step": 6103 + }, + { + "epoch": 0.5525482031320721, + "grad_norm": 0.8755947947502136, + "learning_rate": 6.717327822563089e-05, + "loss": 2.7926, + "step": 6104 + }, + { + "epoch": 0.5526387254458224, + "grad_norm": 0.8883076310157776, + "learning_rate": 6.714330512155216e-05, + "loss": 2.7569, + "step": 6105 + }, + { + "epoch": 0.5527292477595728, + "grad_norm": 0.8901398181915283, + "learning_rate": 6.711333532614168e-05, + "loss": 2.7178, + "step": 6106 + }, + { + "epoch": 0.5528197700733231, + "grad_norm": 0.856147050857544, + "learning_rate": 6.708336884241746e-05, + "loss": 2.6511, + "step": 6107 + }, + { + "epoch": 0.5529102923870735, + "grad_norm": 0.834158718585968, + "learning_rate": 6.705340567339704e-05, + "loss": 2.6735, + "step": 6108 + }, + { + "epoch": 0.5530008147008237, + "grad_norm": 0.8236615657806396, + "learning_rate": 6.702344582209782e-05, + "loss": 2.7029, + "step": 6109 + }, + { + "epoch": 0.5530913370145741, + "grad_norm": 0.8336913585662842, + "learning_rate": 6.699348929153668e-05, + "loss": 2.6636, + "step": 6110 + }, + { + "epoch": 0.5531818593283244, + "grad_norm": 0.9094150066375732, + "learning_rate": 6.69635360847303e-05, + "loss": 2.7634, + "step": 6111 + }, + { + "epoch": 0.5532723816420748, + "grad_norm": 0.9049180150032043, + "learning_rate": 6.693358620469487e-05, + "loss": 2.7459, + "step": 6112 + }, + { + "epoch": 0.5533629039558251, + "grad_norm": 0.8091766834259033, + "learning_rate": 6.690363965444646e-05, + "loss": 2.7064, + "step": 6113 + }, + { + "epoch": 0.5534534262695755, + "grad_norm": 0.8895435929298401, + "learning_rate": 6.68736964370006e-05, + "loss": 2.7459, + "step": 6114 + }, + { + "epoch": 0.5535439485833258, + "grad_norm": 0.8452686071395874, + "learning_rate": 6.684375655537263e-05, + "loss": 2.7731, + "step": 6115 + }, + { + "epoch": 0.5536344708970762, + "grad_norm": 0.8960869312286377, + "learning_rate": 6.681382001257744e-05, + "loss": 2.6751, + "step": 6116 + }, + { + "epoch": 0.5537249932108265, + "grad_norm": 0.8500857949256897, + "learning_rate": 6.67838868116297e-05, + "loss": 2.7669, + "step": 6117 + }, + { + "epoch": 0.5538155155245769, + "grad_norm": 0.9451707005500793, + "learning_rate": 6.675395695554359e-05, + "loss": 2.6578, + "step": 6118 + }, + { + "epoch": 0.5539060378383271, + "grad_norm": 0.8644303679466248, + "learning_rate": 6.672403044733317e-05, + "loss": 2.7596, + "step": 6119 + }, + { + "epoch": 0.5539965601520774, + "grad_norm": 0.7932984232902527, + "learning_rate": 6.669410729001193e-05, + "loss": 2.7395, + "step": 6120 + }, + { + "epoch": 0.5540870824658278, + "grad_norm": 0.9047853946685791, + "learning_rate": 6.666418748659321e-05, + "loss": 2.777, + "step": 6121 + }, + { + "epoch": 0.5541776047795781, + "grad_norm": 0.8457450866699219, + "learning_rate": 6.663427104008984e-05, + "loss": 2.7113, + "step": 6122 + }, + { + "epoch": 0.5542681270933285, + "grad_norm": 0.9217113256454468, + "learning_rate": 6.660435795351451e-05, + "loss": 2.7712, + "step": 6123 + }, + { + "epoch": 0.5543586494070788, + "grad_norm": 1.0403680801391602, + "learning_rate": 6.657444822987942e-05, + "loss": 2.7665, + "step": 6124 + }, + { + "epoch": 0.5544491717208292, + "grad_norm": 0.7791826128959656, + "learning_rate": 6.654454187219649e-05, + "loss": 2.6847, + "step": 6125 + }, + { + "epoch": 0.5545396940345795, + "grad_norm": 0.8778238296508789, + "learning_rate": 6.651463888347722e-05, + "loss": 2.7482, + "step": 6126 + }, + { + "epoch": 0.5546302163483299, + "grad_norm": 0.7843742370605469, + "learning_rate": 6.648473926673295e-05, + "loss": 2.744, + "step": 6127 + }, + { + "epoch": 0.5547207386620802, + "grad_norm": 0.959306538105011, + "learning_rate": 6.64548430249745e-05, + "loss": 2.7685, + "step": 6128 + }, + { + "epoch": 0.5548112609758306, + "grad_norm": 0.8497962355613708, + "learning_rate": 6.642495016121246e-05, + "loss": 2.731, + "step": 6129 + }, + { + "epoch": 0.5549017832895808, + "grad_norm": 0.7902641892433167, + "learning_rate": 6.639506067845697e-05, + "loss": 2.7404, + "step": 6130 + }, + { + "epoch": 0.5549923056033312, + "grad_norm": 0.910959780216217, + "learning_rate": 6.636517457971801e-05, + "loss": 2.7076, + "step": 6131 + }, + { + "epoch": 0.5550828279170815, + "grad_norm": 0.9271765351295471, + "learning_rate": 6.633529186800502e-05, + "loss": 2.7268, + "step": 6132 + }, + { + "epoch": 0.5551733502308319, + "grad_norm": 0.8239401578903198, + "learning_rate": 6.630541254632724e-05, + "loss": 2.7182, + "step": 6133 + }, + { + "epoch": 0.5552638725445822, + "grad_norm": 0.852472186088562, + "learning_rate": 6.627553661769345e-05, + "loss": 2.6915, + "step": 6134 + }, + { + "epoch": 0.5553543948583326, + "grad_norm": 0.8380687832832336, + "learning_rate": 6.624566408511223e-05, + "loss": 2.6643, + "step": 6135 + }, + { + "epoch": 0.5554449171720829, + "grad_norm": 0.7780315279960632, + "learning_rate": 6.621579495159171e-05, + "loss": 2.6831, + "step": 6136 + }, + { + "epoch": 0.5555354394858333, + "grad_norm": 0.9226695895195007, + "learning_rate": 6.618592922013973e-05, + "loss": 2.6992, + "step": 6137 + }, + { + "epoch": 0.5556259617995836, + "grad_norm": 0.894949734210968, + "learning_rate": 6.61560668937637e-05, + "loss": 2.7078, + "step": 6138 + }, + { + "epoch": 0.555716484113334, + "grad_norm": 0.911466658115387, + "learning_rate": 6.612620797547087e-05, + "loss": 2.7357, + "step": 6139 + }, + { + "epoch": 0.5558070064270842, + "grad_norm": 0.9253066778182983, + "learning_rate": 6.609635246826794e-05, + "loss": 2.7053, + "step": 6140 + }, + { + "epoch": 0.5558975287408346, + "grad_norm": 0.8521486520767212, + "learning_rate": 6.60665003751614e-05, + "loss": 2.76, + "step": 6141 + }, + { + "epoch": 0.5559880510545849, + "grad_norm": 0.7954798340797424, + "learning_rate": 6.603665169915732e-05, + "loss": 2.6759, + "step": 6142 + }, + { + "epoch": 0.5560785733683353, + "grad_norm": 0.9189724326133728, + "learning_rate": 6.600680644326151e-05, + "loss": 2.7218, + "step": 6143 + }, + { + "epoch": 0.5561690956820856, + "grad_norm": 0.8937711119651794, + "learning_rate": 6.597696461047934e-05, + "loss": 2.7618, + "step": 6144 + }, + { + "epoch": 0.556259617995836, + "grad_norm": 0.8732416033744812, + "learning_rate": 6.594712620381594e-05, + "loss": 2.7768, + "step": 6145 + }, + { + "epoch": 0.5563501403095863, + "grad_norm": 0.8561906218528748, + "learning_rate": 6.591729122627595e-05, + "loss": 2.7299, + "step": 6146 + }, + { + "epoch": 0.5564406626233367, + "grad_norm": 0.959690272808075, + "learning_rate": 6.588745968086385e-05, + "loss": 2.7445, + "step": 6147 + }, + { + "epoch": 0.556531184937087, + "grad_norm": 0.8490232229232788, + "learning_rate": 6.585763157058358e-05, + "loss": 2.7148, + "step": 6148 + }, + { + "epoch": 0.5566217072508374, + "grad_norm": 0.9139682650566101, + "learning_rate": 6.58278068984389e-05, + "loss": 2.676, + "step": 6149 + }, + { + "epoch": 0.5567122295645877, + "grad_norm": 0.8509264588356018, + "learning_rate": 6.579798566743314e-05, + "loss": 2.7449, + "step": 6150 + }, + { + "epoch": 0.556802751878338, + "grad_norm": 0.8683857917785645, + "learning_rate": 6.576816788056928e-05, + "loss": 2.7018, + "step": 6151 + }, + { + "epoch": 0.5568932741920883, + "grad_norm": 0.9115766882896423, + "learning_rate": 6.573835354084996e-05, + "loss": 2.7474, + "step": 6152 + }, + { + "epoch": 0.5569837965058387, + "grad_norm": 0.9348102807998657, + "learning_rate": 6.570854265127753e-05, + "loss": 2.7245, + "step": 6153 + }, + { + "epoch": 0.557074318819589, + "grad_norm": 0.8374444842338562, + "learning_rate": 6.567873521485389e-05, + "loss": 2.6585, + "step": 6154 + }, + { + "epoch": 0.5571648411333394, + "grad_norm": 0.9902597665786743, + "learning_rate": 6.564893123458069e-05, + "loss": 2.7539, + "step": 6155 + }, + { + "epoch": 0.5572553634470897, + "grad_norm": 0.9274411201477051, + "learning_rate": 6.561913071345915e-05, + "loss": 2.6765, + "step": 6156 + }, + { + "epoch": 0.5573458857608401, + "grad_norm": 0.8324682712554932, + "learning_rate": 6.558933365449025e-05, + "loss": 2.711, + "step": 6157 + }, + { + "epoch": 0.5574364080745904, + "grad_norm": 0.8855867385864258, + "learning_rate": 6.555954006067447e-05, + "loss": 2.6812, + "step": 6158 + }, + { + "epoch": 0.5575269303883408, + "grad_norm": 0.8500581979751587, + "learning_rate": 6.552974993501211e-05, + "loss": 2.7387, + "step": 6159 + }, + { + "epoch": 0.5576174527020911, + "grad_norm": 0.8339815735816956, + "learning_rate": 6.549996328050296e-05, + "loss": 2.7213, + "step": 6160 + }, + { + "epoch": 0.5577079750158414, + "grad_norm": 0.9083566069602966, + "learning_rate": 6.547018010014654e-05, + "loss": 2.7218, + "step": 6161 + }, + { + "epoch": 0.5577984973295917, + "grad_norm": 0.8667166829109192, + "learning_rate": 6.544040039694208e-05, + "loss": 2.7232, + "step": 6162 + }, + { + "epoch": 0.557889019643342, + "grad_norm": 0.8151534795761108, + "learning_rate": 6.541062417388833e-05, + "loss": 2.6772, + "step": 6163 + }, + { + "epoch": 0.5579795419570924, + "grad_norm": 0.8102704286575317, + "learning_rate": 6.53808514339838e-05, + "loss": 2.6715, + "step": 6164 + }, + { + "epoch": 0.5580700642708427, + "grad_norm": 0.9075278043746948, + "learning_rate": 6.535108218022654e-05, + "loss": 2.7122, + "step": 6165 + }, + { + "epoch": 0.5581605865845931, + "grad_norm": 0.8931668996810913, + "learning_rate": 6.53213164156144e-05, + "loss": 2.7641, + "step": 6166 + }, + { + "epoch": 0.5582511088983434, + "grad_norm": 0.8440109491348267, + "learning_rate": 6.529155414314472e-05, + "loss": 2.7849, + "step": 6167 + }, + { + "epoch": 0.5583416312120938, + "grad_norm": 0.8298215270042419, + "learning_rate": 6.526179536581463e-05, + "loss": 2.7624, + "step": 6168 + }, + { + "epoch": 0.5584321535258441, + "grad_norm": 0.9021025896072388, + "learning_rate": 6.523204008662074e-05, + "loss": 2.7398, + "step": 6169 + }, + { + "epoch": 0.5585226758395945, + "grad_norm": 0.8510945439338684, + "learning_rate": 6.52022883085595e-05, + "loss": 2.7218, + "step": 6170 + }, + { + "epoch": 0.5586131981533448, + "grad_norm": 0.8765257596969604, + "learning_rate": 6.517254003462686e-05, + "loss": 2.7187, + "step": 6171 + }, + { + "epoch": 0.5587037204670952, + "grad_norm": 0.8824365735054016, + "learning_rate": 6.51427952678185e-05, + "loss": 2.7178, + "step": 6172 + }, + { + "epoch": 0.5587942427808454, + "grad_norm": 0.8940009474754333, + "learning_rate": 6.511305401112967e-05, + "loss": 2.7676, + "step": 6173 + }, + { + "epoch": 0.5588847650945958, + "grad_norm": 0.8891744613647461, + "learning_rate": 6.508331626755538e-05, + "loss": 2.6589, + "step": 6174 + }, + { + "epoch": 0.5589752874083461, + "grad_norm": 0.8407256603240967, + "learning_rate": 6.505358204009017e-05, + "loss": 2.7429, + "step": 6175 + }, + { + "epoch": 0.5590658097220965, + "grad_norm": 0.8492648005485535, + "learning_rate": 6.502385133172833e-05, + "loss": 2.7259, + "step": 6176 + }, + { + "epoch": 0.5591563320358468, + "grad_norm": 1.0108580589294434, + "learning_rate": 6.499412414546362e-05, + "loss": 2.7615, + "step": 6177 + }, + { + "epoch": 0.5592468543495972, + "grad_norm": 0.8850505948066711, + "learning_rate": 6.496440048428976e-05, + "loss": 2.723, + "step": 6178 + }, + { + "epoch": 0.5593373766633475, + "grad_norm": 0.8815732598304749, + "learning_rate": 6.493468035119972e-05, + "loss": 2.6943, + "step": 6179 + }, + { + "epoch": 0.5594278989770979, + "grad_norm": 0.8840451240539551, + "learning_rate": 6.490496374918647e-05, + "loss": 2.714, + "step": 6180 + }, + { + "epoch": 0.5595184212908482, + "grad_norm": 0.8313421010971069, + "learning_rate": 6.487525068124237e-05, + "loss": 2.7207, + "step": 6181 + }, + { + "epoch": 0.5596089436045986, + "grad_norm": 0.8717589378356934, + "learning_rate": 6.48455411503596e-05, + "loss": 2.7393, + "step": 6182 + }, + { + "epoch": 0.5596994659183488, + "grad_norm": 0.9375311732292175, + "learning_rate": 6.481583515952983e-05, + "loss": 2.7661, + "step": 6183 + }, + { + "epoch": 0.5597899882320992, + "grad_norm": 0.9299476146697998, + "learning_rate": 6.478613271174453e-05, + "loss": 2.7136, + "step": 6184 + }, + { + "epoch": 0.5598805105458495, + "grad_norm": 0.9196488857269287, + "learning_rate": 6.475643380999468e-05, + "loss": 2.7096, + "step": 6185 + }, + { + "epoch": 0.5599710328595999, + "grad_norm": 0.852522075176239, + "learning_rate": 6.472673845727102e-05, + "loss": 2.6871, + "step": 6186 + }, + { + "epoch": 0.5600615551733502, + "grad_norm": 0.9577264785766602, + "learning_rate": 6.469704665656378e-05, + "loss": 2.8258, + "step": 6187 + }, + { + "epoch": 0.5601520774871006, + "grad_norm": 0.9313653111457825, + "learning_rate": 6.466735841086302e-05, + "loss": 2.69, + "step": 6188 + }, + { + "epoch": 0.5602425998008509, + "grad_norm": 0.8006998300552368, + "learning_rate": 6.463767372315827e-05, + "loss": 2.6518, + "step": 6189 + }, + { + "epoch": 0.5603331221146013, + "grad_norm": 0.8366522789001465, + "learning_rate": 6.460799259643884e-05, + "loss": 2.7591, + "step": 6190 + }, + { + "epoch": 0.5604236444283516, + "grad_norm": 0.8412538170814514, + "learning_rate": 6.457831503369354e-05, + "loss": 2.6933, + "step": 6191 + }, + { + "epoch": 0.560514166742102, + "grad_norm": 0.8227540254592896, + "learning_rate": 6.454864103791099e-05, + "loss": 2.7545, + "step": 6192 + }, + { + "epoch": 0.5606046890558523, + "grad_norm": 0.8566422462463379, + "learning_rate": 6.45189706120793e-05, + "loss": 2.7272, + "step": 6193 + }, + { + "epoch": 0.5606952113696027, + "grad_norm": 0.8530976176261902, + "learning_rate": 6.448930375918631e-05, + "loss": 2.6771, + "step": 6194 + }, + { + "epoch": 0.5607857336833529, + "grad_norm": 0.8847705125808716, + "learning_rate": 6.445964048221944e-05, + "loss": 2.7028, + "step": 6195 + }, + { + "epoch": 0.5608762559971033, + "grad_norm": 0.9251405000686646, + "learning_rate": 6.442998078416583e-05, + "loss": 2.7475, + "step": 6196 + }, + { + "epoch": 0.5609667783108536, + "grad_norm": 0.8360767364501953, + "learning_rate": 6.440032466801215e-05, + "loss": 2.7187, + "step": 6197 + }, + { + "epoch": 0.561057300624604, + "grad_norm": 0.901405930519104, + "learning_rate": 6.437067213674483e-05, + "loss": 2.6693, + "step": 6198 + }, + { + "epoch": 0.5611478229383543, + "grad_norm": 0.9669973850250244, + "learning_rate": 6.434102319334981e-05, + "loss": 2.7424, + "step": 6199 + }, + { + "epoch": 0.5612383452521047, + "grad_norm": 0.8233455419540405, + "learning_rate": 6.431137784081282e-05, + "loss": 2.6649, + "step": 6200 + }, + { + "epoch": 0.561328867565855, + "grad_norm": 0.9279999732971191, + "learning_rate": 6.428173608211909e-05, + "loss": 2.7304, + "step": 6201 + }, + { + "epoch": 0.5614193898796053, + "grad_norm": 0.88908451795578, + "learning_rate": 6.425209792025358e-05, + "loss": 2.7849, + "step": 6202 + }, + { + "epoch": 0.5615099121933557, + "grad_norm": 0.9120383858680725, + "learning_rate": 6.422246335820079e-05, + "loss": 2.7655, + "step": 6203 + }, + { + "epoch": 0.561600434507106, + "grad_norm": 0.8763618469238281, + "learning_rate": 6.419283239894499e-05, + "loss": 2.7167, + "step": 6204 + }, + { + "epoch": 0.5616909568208563, + "grad_norm": 0.8769980669021606, + "learning_rate": 6.416320504546997e-05, + "loss": 2.7123, + "step": 6205 + }, + { + "epoch": 0.5617814791346066, + "grad_norm": 0.8798347115516663, + "learning_rate": 6.413358130075925e-05, + "loss": 2.7843, + "step": 6206 + }, + { + "epoch": 0.561872001448357, + "grad_norm": 0.8606103658676147, + "learning_rate": 6.410396116779587e-05, + "loss": 2.6994, + "step": 6207 + }, + { + "epoch": 0.5619625237621073, + "grad_norm": 0.8350090980529785, + "learning_rate": 6.407434464956266e-05, + "loss": 2.7007, + "step": 6208 + }, + { + "epoch": 0.5620530460758577, + "grad_norm": 0.841839075088501, + "learning_rate": 6.404473174904193e-05, + "loss": 2.6919, + "step": 6209 + }, + { + "epoch": 0.562143568389608, + "grad_norm": 0.9548243880271912, + "learning_rate": 6.401512246921576e-05, + "loss": 2.7083, + "step": 6210 + }, + { + "epoch": 0.5622340907033584, + "grad_norm": 0.8236305117607117, + "learning_rate": 6.398551681306575e-05, + "loss": 2.7162, + "step": 6211 + }, + { + "epoch": 0.5623246130171087, + "grad_norm": 0.8622593879699707, + "learning_rate": 6.395591478357324e-05, + "loss": 2.7296, + "step": 6212 + }, + { + "epoch": 0.5624151353308591, + "grad_norm": 0.8808903694152832, + "learning_rate": 6.392631638371907e-05, + "loss": 2.7442, + "step": 6213 + }, + { + "epoch": 0.5625056576446094, + "grad_norm": 0.84144526720047, + "learning_rate": 6.389672161648389e-05, + "loss": 2.7346, + "step": 6214 + }, + { + "epoch": 0.5625961799583598, + "grad_norm": 0.8263358473777771, + "learning_rate": 6.386713048484785e-05, + "loss": 2.7278, + "step": 6215 + }, + { + "epoch": 0.56268670227211, + "grad_norm": 0.8334097266197205, + "learning_rate": 6.383754299179079e-05, + "loss": 2.7624, + "step": 6216 + }, + { + "epoch": 0.5627772245858604, + "grad_norm": 0.8723674416542053, + "learning_rate": 6.380795914029213e-05, + "loss": 2.6862, + "step": 6217 + }, + { + "epoch": 0.5628677468996107, + "grad_norm": 0.8597888946533203, + "learning_rate": 6.377837893333103e-05, + "loss": 2.7343, + "step": 6218 + }, + { + "epoch": 0.5629582692133611, + "grad_norm": 0.819015383720398, + "learning_rate": 6.374880237388616e-05, + "loss": 2.6803, + "step": 6219 + }, + { + "epoch": 0.5630487915271114, + "grad_norm": 0.9522851705551147, + "learning_rate": 6.371922946493591e-05, + "loss": 2.693, + "step": 6220 + }, + { + "epoch": 0.5631393138408618, + "grad_norm": 0.8460712432861328, + "learning_rate": 6.368966020945824e-05, + "loss": 2.8005, + "step": 6221 + }, + { + "epoch": 0.5632298361546121, + "grad_norm": 0.8678927421569824, + "learning_rate": 6.366009461043083e-05, + "loss": 2.7346, + "step": 6222 + }, + { + "epoch": 0.5633203584683625, + "grad_norm": 0.8084684014320374, + "learning_rate": 6.363053267083086e-05, + "loss": 2.6989, + "step": 6223 + }, + { + "epoch": 0.5634108807821128, + "grad_norm": 0.8358949422836304, + "learning_rate": 6.360097439363529e-05, + "loss": 2.7444, + "step": 6224 + }, + { + "epoch": 0.5635014030958632, + "grad_norm": 0.8385283350944519, + "learning_rate": 6.357141978182056e-05, + "loss": 2.7455, + "step": 6225 + }, + { + "epoch": 0.5635919254096134, + "grad_norm": 0.8100994229316711, + "learning_rate": 6.35418688383629e-05, + "loss": 2.6919, + "step": 6226 + }, + { + "epoch": 0.5636824477233638, + "grad_norm": 0.9156104326248169, + "learning_rate": 6.351232156623803e-05, + "loss": 2.7142, + "step": 6227 + }, + { + "epoch": 0.5637729700371141, + "grad_norm": 0.8520402312278748, + "learning_rate": 6.348277796842141e-05, + "loss": 2.7497, + "step": 6228 + }, + { + "epoch": 0.5638634923508645, + "grad_norm": 0.8622889518737793, + "learning_rate": 6.345323804788799e-05, + "loss": 2.6753, + "step": 6229 + }, + { + "epoch": 0.5639540146646148, + "grad_norm": 0.8826088309288025, + "learning_rate": 6.342370180761256e-05, + "loss": 2.7352, + "step": 6230 + }, + { + "epoch": 0.5640445369783652, + "grad_norm": 0.8279819488525391, + "learning_rate": 6.339416925056933e-05, + "loss": 2.6723, + "step": 6231 + }, + { + "epoch": 0.5641350592921155, + "grad_norm": 0.8228965401649475, + "learning_rate": 6.336464037973226e-05, + "loss": 2.7167, + "step": 6232 + }, + { + "epoch": 0.5642255816058659, + "grad_norm": 0.921067476272583, + "learning_rate": 6.333511519807488e-05, + "loss": 2.7505, + "step": 6233 + }, + { + "epoch": 0.5643161039196162, + "grad_norm": 0.8844332695007324, + "learning_rate": 6.330559370857043e-05, + "loss": 2.6383, + "step": 6234 + }, + { + "epoch": 0.5644066262333666, + "grad_norm": 0.9329710602760315, + "learning_rate": 6.327607591419166e-05, + "loss": 2.7016, + "step": 6235 + }, + { + "epoch": 0.5644971485471169, + "grad_norm": 0.8243615627288818, + "learning_rate": 6.324656181791108e-05, + "loss": 2.7539, + "step": 6236 + }, + { + "epoch": 0.5645876708608673, + "grad_norm": 0.8301962018013, + "learning_rate": 6.321705142270067e-05, + "loss": 2.6993, + "step": 6237 + }, + { + "epoch": 0.5646781931746175, + "grad_norm": 0.8087690472602844, + "learning_rate": 6.318754473153221e-05, + "loss": 2.7268, + "step": 6238 + }, + { + "epoch": 0.5647687154883679, + "grad_norm": 0.8300209641456604, + "learning_rate": 6.315804174737697e-05, + "loss": 2.688, + "step": 6239 + }, + { + "epoch": 0.5648592378021182, + "grad_norm": 0.8376254439353943, + "learning_rate": 6.312854247320595e-05, + "loss": 2.7136, + "step": 6240 + }, + { + "epoch": 0.5649497601158686, + "grad_norm": 0.8355643153190613, + "learning_rate": 6.309904691198962e-05, + "loss": 2.6754, + "step": 6241 + }, + { + "epoch": 0.5650402824296189, + "grad_norm": 0.8193901181221008, + "learning_rate": 6.306955506669836e-05, + "loss": 2.7298, + "step": 6242 + }, + { + "epoch": 0.5651308047433692, + "grad_norm": 0.8663204908370972, + "learning_rate": 6.30400669403018e-05, + "loss": 2.7622, + "step": 6243 + }, + { + "epoch": 0.5652213270571196, + "grad_norm": 0.8417595028877258, + "learning_rate": 6.301058253576955e-05, + "loss": 2.7069, + "step": 6244 + }, + { + "epoch": 0.5653118493708699, + "grad_norm": 0.8490182757377625, + "learning_rate": 6.298110185607063e-05, + "loss": 2.7275, + "step": 6245 + }, + { + "epoch": 0.5654023716846203, + "grad_norm": 0.8474552035331726, + "learning_rate": 6.29516249041737e-05, + "loss": 2.6955, + "step": 6246 + }, + { + "epoch": 0.5654928939983705, + "grad_norm": 0.7780411839485168, + "learning_rate": 6.292215168304716e-05, + "loss": 2.7219, + "step": 6247 + }, + { + "epoch": 0.565583416312121, + "grad_norm": 0.8314462900161743, + "learning_rate": 6.28926821956589e-05, + "loss": 2.7067, + "step": 6248 + }, + { + "epoch": 0.5656739386258712, + "grad_norm": 0.851689875125885, + "learning_rate": 6.286321644497655e-05, + "loss": 2.7218, + "step": 6249 + }, + { + "epoch": 0.5657644609396216, + "grad_norm": 0.841330349445343, + "learning_rate": 6.283375443396726e-05, + "loss": 2.7342, + "step": 6250 + }, + { + "epoch": 0.5658549832533719, + "grad_norm": 0.8683754801750183, + "learning_rate": 6.280429616559792e-05, + "loss": 2.7366, + "step": 6251 + }, + { + "epoch": 0.5659455055671223, + "grad_norm": 0.7926754355430603, + "learning_rate": 6.277484164283489e-05, + "loss": 2.7295, + "step": 6252 + }, + { + "epoch": 0.5660360278808726, + "grad_norm": 0.9038828611373901, + "learning_rate": 6.274539086864433e-05, + "loss": 2.7524, + "step": 6253 + }, + { + "epoch": 0.566126550194623, + "grad_norm": 0.8178255558013916, + "learning_rate": 6.271594384599188e-05, + "loss": 2.6828, + "step": 6254 + }, + { + "epoch": 0.5662170725083733, + "grad_norm": 0.7750669121742249, + "learning_rate": 6.268650057784288e-05, + "loss": 2.6739, + "step": 6255 + }, + { + "epoch": 0.5663075948221237, + "grad_norm": 0.8675525188446045, + "learning_rate": 6.26570610671622e-05, + "loss": 2.7148, + "step": 6256 + }, + { + "epoch": 0.566398117135874, + "grad_norm": 0.928869366645813, + "learning_rate": 6.262762531691451e-05, + "loss": 2.7006, + "step": 6257 + }, + { + "epoch": 0.5664886394496244, + "grad_norm": 0.8617798686027527, + "learning_rate": 6.259819333006388e-05, + "loss": 2.745, + "step": 6258 + }, + { + "epoch": 0.5665791617633746, + "grad_norm": 0.8284051418304443, + "learning_rate": 6.25687651095742e-05, + "loss": 2.6976, + "step": 6259 + }, + { + "epoch": 0.566669684077125, + "grad_norm": 0.9877228736877441, + "learning_rate": 6.25393406584088e-05, + "loss": 2.7422, + "step": 6260 + }, + { + "epoch": 0.5667602063908753, + "grad_norm": 0.889123260974884, + "learning_rate": 6.250991997953082e-05, + "loss": 2.7538, + "step": 6261 + }, + { + "epoch": 0.5668507287046257, + "grad_norm": 0.8846951723098755, + "learning_rate": 6.248050307590283e-05, + "loss": 2.7475, + "step": 6262 + }, + { + "epoch": 0.566941251018376, + "grad_norm": 0.8666096925735474, + "learning_rate": 6.245108995048719e-05, + "loss": 2.7457, + "step": 6263 + }, + { + "epoch": 0.5670317733321264, + "grad_norm": 0.8434032201766968, + "learning_rate": 6.242168060624572e-05, + "loss": 2.7108, + "step": 6264 + }, + { + "epoch": 0.5671222956458767, + "grad_norm": 0.8461436629295349, + "learning_rate": 6.239227504614003e-05, + "loss": 2.6978, + "step": 6265 + }, + { + "epoch": 0.5672128179596271, + "grad_norm": 0.8737865686416626, + "learning_rate": 6.23628732731312e-05, + "loss": 2.6715, + "step": 6266 + }, + { + "epoch": 0.5673033402733774, + "grad_norm": 0.9114745855331421, + "learning_rate": 6.233347529018e-05, + "loss": 2.6919, + "step": 6267 + }, + { + "epoch": 0.5673938625871278, + "grad_norm": 0.8154740929603577, + "learning_rate": 6.230408110024679e-05, + "loss": 2.6827, + "step": 6268 + }, + { + "epoch": 0.567484384900878, + "grad_norm": 0.8688656091690063, + "learning_rate": 6.227469070629162e-05, + "loss": 2.6951, + "step": 6269 + }, + { + "epoch": 0.5675749072146284, + "grad_norm": 0.8917716145515442, + "learning_rate": 6.224530411127403e-05, + "loss": 2.749, + "step": 6270 + }, + { + "epoch": 0.5676654295283787, + "grad_norm": 0.8169562816619873, + "learning_rate": 6.22159213181533e-05, + "loss": 2.7254, + "step": 6271 + }, + { + "epoch": 0.5677559518421291, + "grad_norm": 0.8753547072410583, + "learning_rate": 6.218654232988824e-05, + "loss": 2.7687, + "step": 6272 + }, + { + "epoch": 0.5678464741558794, + "grad_norm": 0.8617856502532959, + "learning_rate": 6.215716714943738e-05, + "loss": 2.7017, + "step": 6273 + }, + { + "epoch": 0.5679369964696298, + "grad_norm": 0.8434569239616394, + "learning_rate": 6.21277957797587e-05, + "loss": 2.674, + "step": 6274 + }, + { + "epoch": 0.5680275187833801, + "grad_norm": 0.8649165034294128, + "learning_rate": 6.209842822380998e-05, + "loss": 2.7317, + "step": 6275 + }, + { + "epoch": 0.5681180410971305, + "grad_norm": 0.8554753661155701, + "learning_rate": 6.206906448454848e-05, + "loss": 2.7445, + "step": 6276 + }, + { + "epoch": 0.5682085634108808, + "grad_norm": 0.9278080463409424, + "learning_rate": 6.203970456493118e-05, + "loss": 2.7859, + "step": 6277 + }, + { + "epoch": 0.5682990857246312, + "grad_norm": 0.8859151005744934, + "learning_rate": 6.201034846791454e-05, + "loss": 2.6952, + "step": 6278 + }, + { + "epoch": 0.5683896080383815, + "grad_norm": 0.9116597771644592, + "learning_rate": 6.198099619645481e-05, + "loss": 2.7023, + "step": 6279 + }, + { + "epoch": 0.5684801303521319, + "grad_norm": 0.850883424282074, + "learning_rate": 6.19516477535077e-05, + "loss": 2.6934, + "step": 6280 + }, + { + "epoch": 0.5685706526658821, + "grad_norm": 0.9140669703483582, + "learning_rate": 6.192230314202864e-05, + "loss": 2.6676, + "step": 6281 + }, + { + "epoch": 0.5686611749796325, + "grad_norm": 0.8315000534057617, + "learning_rate": 6.18929623649726e-05, + "loss": 2.6754, + "step": 6282 + }, + { + "epoch": 0.5687516972933828, + "grad_norm": 0.8378522396087646, + "learning_rate": 6.186362542529421e-05, + "loss": 2.7073, + "step": 6283 + }, + { + "epoch": 0.5688422196071331, + "grad_norm": 0.8570544123649597, + "learning_rate": 6.18342923259477e-05, + "loss": 2.6984, + "step": 6284 + }, + { + "epoch": 0.5689327419208835, + "grad_norm": 0.9210760593414307, + "learning_rate": 6.180496306988692e-05, + "loss": 2.7113, + "step": 6285 + }, + { + "epoch": 0.5690232642346338, + "grad_norm": 0.8755303025245667, + "learning_rate": 6.177563766006526e-05, + "loss": 2.6953, + "step": 6286 + }, + { + "epoch": 0.5691137865483842, + "grad_norm": 0.9134300351142883, + "learning_rate": 6.174631609943591e-05, + "loss": 2.7662, + "step": 6287 + }, + { + "epoch": 0.5692043088621345, + "grad_norm": 0.8612111210823059, + "learning_rate": 6.171699839095144e-05, + "loss": 2.7175, + "step": 6288 + }, + { + "epoch": 0.5692948311758849, + "grad_norm": 0.8268765211105347, + "learning_rate": 6.168768453756421e-05, + "loss": 2.685, + "step": 6289 + }, + { + "epoch": 0.5693853534896351, + "grad_norm": 0.9528706669807434, + "learning_rate": 6.165837454222608e-05, + "loss": 2.7083, + "step": 6290 + }, + { + "epoch": 0.5694758758033855, + "grad_norm": 0.9075449705123901, + "learning_rate": 6.16290684078886e-05, + "loss": 2.6614, + "step": 6291 + }, + { + "epoch": 0.5695663981171358, + "grad_norm": 0.8905490636825562, + "learning_rate": 6.159976613750286e-05, + "loss": 2.7241, + "step": 6292 + }, + { + "epoch": 0.5696569204308862, + "grad_norm": 0.8218395113945007, + "learning_rate": 6.157046773401964e-05, + "loss": 2.6705, + "step": 6293 + }, + { + "epoch": 0.5697474427446365, + "grad_norm": 0.9417014122009277, + "learning_rate": 6.154117320038924e-05, + "loss": 2.7265, + "step": 6294 + }, + { + "epoch": 0.5698379650583869, + "grad_norm": 0.8464024066925049, + "learning_rate": 6.151188253956168e-05, + "loss": 2.7003, + "step": 6295 + }, + { + "epoch": 0.5699284873721372, + "grad_norm": 0.9145004153251648, + "learning_rate": 6.148259575448647e-05, + "loss": 2.7112, + "step": 6296 + }, + { + "epoch": 0.5700190096858876, + "grad_norm": 0.8569707870483398, + "learning_rate": 6.145331284811285e-05, + "loss": 2.7289, + "step": 6297 + }, + { + "epoch": 0.5701095319996379, + "grad_norm": 0.9021281003952026, + "learning_rate": 6.142403382338951e-05, + "loss": 2.6766, + "step": 6298 + }, + { + "epoch": 0.5702000543133883, + "grad_norm": 0.8854073286056519, + "learning_rate": 6.139475868326496e-05, + "loss": 2.7415, + "step": 6299 + }, + { + "epoch": 0.5702905766271386, + "grad_norm": 0.816065788269043, + "learning_rate": 6.136548743068713e-05, + "loss": 2.6619, + "step": 6300 + }, + { + "epoch": 0.570381098940889, + "grad_norm": 0.8643266558647156, + "learning_rate": 6.133622006860369e-05, + "loss": 2.7427, + "step": 6301 + }, + { + "epoch": 0.5704716212546392, + "grad_norm": 0.8317883014678955, + "learning_rate": 6.130695659996179e-05, + "loss": 2.651, + "step": 6302 + }, + { + "epoch": 0.5705621435683896, + "grad_norm": 0.8347657322883606, + "learning_rate": 6.127769702770834e-05, + "loss": 2.6842, + "step": 6303 + }, + { + "epoch": 0.5706526658821399, + "grad_norm": 0.8077671527862549, + "learning_rate": 6.12484413547897e-05, + "loss": 2.6599, + "step": 6304 + }, + { + "epoch": 0.5707431881958903, + "grad_norm": 0.8410141468048096, + "learning_rate": 6.121918958415199e-05, + "loss": 2.7244, + "step": 6305 + }, + { + "epoch": 0.5708337105096406, + "grad_norm": 0.8797140121459961, + "learning_rate": 6.118994171874082e-05, + "loss": 2.7024, + "step": 6306 + }, + { + "epoch": 0.570924232823391, + "grad_norm": 0.8294859528541565, + "learning_rate": 6.116069776150147e-05, + "loss": 2.7675, + "step": 6307 + }, + { + "epoch": 0.5710147551371413, + "grad_norm": 0.8064338564872742, + "learning_rate": 6.113145771537874e-05, + "loss": 2.7055, + "step": 6308 + }, + { + "epoch": 0.5711052774508917, + "grad_norm": 0.7874312996864319, + "learning_rate": 6.11022215833172e-05, + "loss": 2.6885, + "step": 6309 + }, + { + "epoch": 0.571195799764642, + "grad_norm": 0.8688129186630249, + "learning_rate": 6.107298936826086e-05, + "loss": 2.6763, + "step": 6310 + }, + { + "epoch": 0.5712863220783924, + "grad_norm": 0.9120535850524902, + "learning_rate": 6.104376107315345e-05, + "loss": 2.6554, + "step": 6311 + }, + { + "epoch": 0.5713768443921426, + "grad_norm": 0.869500994682312, + "learning_rate": 6.10145367009382e-05, + "loss": 2.7581, + "step": 6312 + }, + { + "epoch": 0.571467366705893, + "grad_norm": 0.8980798721313477, + "learning_rate": 6.098531625455806e-05, + "loss": 2.663, + "step": 6313 + }, + { + "epoch": 0.5715578890196433, + "grad_norm": 0.8309452533721924, + "learning_rate": 6.0956099736955494e-05, + "loss": 2.7424, + "step": 6314 + }, + { + "epoch": 0.5716484113333937, + "grad_norm": 0.8424989581108093, + "learning_rate": 6.092688715107264e-05, + "loss": 2.7321, + "step": 6315 + }, + { + "epoch": 0.571738933647144, + "grad_norm": 0.8888978362083435, + "learning_rate": 6.089767849985114e-05, + "loss": 2.7492, + "step": 6316 + }, + { + "epoch": 0.5718294559608944, + "grad_norm": 0.8150400519371033, + "learning_rate": 6.0868473786232395e-05, + "loss": 2.7341, + "step": 6317 + }, + { + "epoch": 0.5719199782746447, + "grad_norm": 0.8388065695762634, + "learning_rate": 6.083927301315724e-05, + "loss": 2.7131, + "step": 6318 + }, + { + "epoch": 0.5720105005883951, + "grad_norm": 0.9214149117469788, + "learning_rate": 6.081007618356623e-05, + "loss": 2.7402, + "step": 6319 + }, + { + "epoch": 0.5721010229021454, + "grad_norm": 0.8912999629974365, + "learning_rate": 6.078088330039945e-05, + "loss": 2.7239, + "step": 6320 + }, + { + "epoch": 0.5721915452158958, + "grad_norm": 0.7984560132026672, + "learning_rate": 6.075169436659669e-05, + "loss": 2.6878, + "step": 6321 + }, + { + "epoch": 0.572282067529646, + "grad_norm": 0.8435370326042175, + "learning_rate": 6.0722509385097205e-05, + "loss": 2.7153, + "step": 6322 + }, + { + "epoch": 0.5723725898433965, + "grad_norm": 0.8220704793930054, + "learning_rate": 6.0693328358839966e-05, + "loss": 2.658, + "step": 6323 + }, + { + "epoch": 0.5724631121571467, + "grad_norm": 0.8204720616340637, + "learning_rate": 6.066415129076346e-05, + "loss": 2.6718, + "step": 6324 + }, + { + "epoch": 0.572553634470897, + "grad_norm": 0.8146944046020508, + "learning_rate": 6.063497818380587e-05, + "loss": 2.6801, + "step": 6325 + }, + { + "epoch": 0.5726441567846474, + "grad_norm": 0.7801823019981384, + "learning_rate": 6.0605809040904894e-05, + "loss": 2.6909, + "step": 6326 + }, + { + "epoch": 0.5727346790983977, + "grad_norm": 0.8066073656082153, + "learning_rate": 6.0576643864997875e-05, + "loss": 2.7067, + "step": 6327 + }, + { + "epoch": 0.5728252014121481, + "grad_norm": 0.8099483847618103, + "learning_rate": 6.0547482659021706e-05, + "loss": 2.6446, + "step": 6328 + }, + { + "epoch": 0.5729157237258984, + "grad_norm": 0.841600775718689, + "learning_rate": 6.051832542591299e-05, + "loss": 2.7368, + "step": 6329 + }, + { + "epoch": 0.5730062460396488, + "grad_norm": 0.8247181177139282, + "learning_rate": 6.048917216860781e-05, + "loss": 2.6526, + "step": 6330 + }, + { + "epoch": 0.5730967683533991, + "grad_norm": 0.8284602165222168, + "learning_rate": 6.0460022890041934e-05, + "loss": 2.6957, + "step": 6331 + }, + { + "epoch": 0.5731872906671495, + "grad_norm": 0.8628219366073608, + "learning_rate": 6.043087759315066e-05, + "loss": 2.7116, + "step": 6332 + }, + { + "epoch": 0.5732778129808997, + "grad_norm": 0.754366934299469, + "learning_rate": 6.0401736280868895e-05, + "loss": 2.6362, + "step": 6333 + }, + { + "epoch": 0.5733683352946501, + "grad_norm": 0.8926199674606323, + "learning_rate": 6.0372598956131265e-05, + "loss": 2.7344, + "step": 6334 + }, + { + "epoch": 0.5734588576084004, + "grad_norm": 0.8918818235397339, + "learning_rate": 6.0343465621871776e-05, + "loss": 2.7467, + "step": 6335 + }, + { + "epoch": 0.5735493799221508, + "grad_norm": 0.8268258571624756, + "learning_rate": 6.031433628102425e-05, + "loss": 2.6649, + "step": 6336 + }, + { + "epoch": 0.5736399022359011, + "grad_norm": 0.8441937565803528, + "learning_rate": 6.0285210936521955e-05, + "loss": 2.7086, + "step": 6337 + }, + { + "epoch": 0.5737304245496515, + "grad_norm": 0.873117208480835, + "learning_rate": 6.025608959129785e-05, + "loss": 2.6915, + "step": 6338 + }, + { + "epoch": 0.5738209468634018, + "grad_norm": 0.8519162535667419, + "learning_rate": 6.02269722482844e-05, + "loss": 2.7296, + "step": 6339 + }, + { + "epoch": 0.5739114691771522, + "grad_norm": 0.9300065040588379, + "learning_rate": 6.019785891041381e-05, + "loss": 2.7292, + "step": 6340 + }, + { + "epoch": 0.5740019914909025, + "grad_norm": 0.8156430125236511, + "learning_rate": 6.0168749580617686e-05, + "loss": 2.7259, + "step": 6341 + }, + { + "epoch": 0.5740925138046529, + "grad_norm": 0.9231059551239014, + "learning_rate": 6.0139644261827435e-05, + "loss": 2.7137, + "step": 6342 + }, + { + "epoch": 0.5741830361184032, + "grad_norm": 0.8606845140457153, + "learning_rate": 6.011054295697387e-05, + "loss": 2.6815, + "step": 6343 + }, + { + "epoch": 0.5742735584321536, + "grad_norm": 0.8757749795913696, + "learning_rate": 6.008144566898758e-05, + "loss": 2.6852, + "step": 6344 + }, + { + "epoch": 0.5743640807459038, + "grad_norm": 0.8971512317657471, + "learning_rate": 6.005235240079859e-05, + "loss": 2.6847, + "step": 6345 + }, + { + "epoch": 0.5744546030596542, + "grad_norm": 0.8060917854309082, + "learning_rate": 6.002326315533665e-05, + "loss": 2.6772, + "step": 6346 + }, + { + "epoch": 0.5745451253734045, + "grad_norm": 0.9402631521224976, + "learning_rate": 5.999417793553097e-05, + "loss": 2.745, + "step": 6347 + }, + { + "epoch": 0.5746356476871549, + "grad_norm": 0.8727978467941284, + "learning_rate": 5.9965096744310526e-05, + "loss": 2.6546, + "step": 6348 + }, + { + "epoch": 0.5747261700009052, + "grad_norm": 0.9258029460906982, + "learning_rate": 5.993601958460371e-05, + "loss": 2.7069, + "step": 6349 + }, + { + "epoch": 0.5748166923146556, + "grad_norm": 0.8712645769119263, + "learning_rate": 5.9906946459338656e-05, + "loss": 2.7036, + "step": 6350 + }, + { + "epoch": 0.5749072146284059, + "grad_norm": 1.0271838903427124, + "learning_rate": 5.987787737144296e-05, + "loss": 2.717, + "step": 6351 + }, + { + "epoch": 0.5749977369421563, + "grad_norm": 0.8358034491539001, + "learning_rate": 5.984881232384394e-05, + "loss": 2.665, + "step": 6352 + }, + { + "epoch": 0.5750882592559066, + "grad_norm": 0.9175279140472412, + "learning_rate": 5.981975131946842e-05, + "loss": 2.6991, + "step": 6353 + }, + { + "epoch": 0.575178781569657, + "grad_norm": 0.9423436522483826, + "learning_rate": 5.979069436124285e-05, + "loss": 2.698, + "step": 6354 + }, + { + "epoch": 0.5752693038834072, + "grad_norm": 0.9727945923805237, + "learning_rate": 5.976164145209322e-05, + "loss": 2.7172, + "step": 6355 + }, + { + "epoch": 0.5753598261971576, + "grad_norm": 0.8513864874839783, + "learning_rate": 5.973259259494524e-05, + "loss": 2.6729, + "step": 6356 + }, + { + "epoch": 0.5754503485109079, + "grad_norm": 1.0204179286956787, + "learning_rate": 5.9703547792724045e-05, + "loss": 2.7242, + "step": 6357 + }, + { + "epoch": 0.5755408708246583, + "grad_norm": 0.9762052893638611, + "learning_rate": 5.967450704835452e-05, + "loss": 2.7628, + "step": 6358 + }, + { + "epoch": 0.5756313931384086, + "grad_norm": 0.8357113003730774, + "learning_rate": 5.964547036476099e-05, + "loss": 2.7648, + "step": 6359 + }, + { + "epoch": 0.575721915452159, + "grad_norm": 0.8610087633132935, + "learning_rate": 5.9616437744867535e-05, + "loss": 2.6764, + "step": 6360 + }, + { + "epoch": 0.5758124377659093, + "grad_norm": 0.916714608669281, + "learning_rate": 5.958740919159766e-05, + "loss": 2.6536, + "step": 6361 + }, + { + "epoch": 0.5759029600796597, + "grad_norm": 0.7818869352340698, + "learning_rate": 5.95583847078746e-05, + "loss": 2.7121, + "step": 6362 + }, + { + "epoch": 0.57599348239341, + "grad_norm": 0.9010102152824402, + "learning_rate": 5.952936429662106e-05, + "loss": 2.7471, + "step": 6363 + }, + { + "epoch": 0.5760840047071604, + "grad_norm": 0.8215236663818359, + "learning_rate": 5.950034796075947e-05, + "loss": 2.6007, + "step": 6364 + }, + { + "epoch": 0.5761745270209107, + "grad_norm": 0.8552883267402649, + "learning_rate": 5.94713357032117e-05, + "loss": 2.7062, + "step": 6365 + }, + { + "epoch": 0.5762650493346609, + "grad_norm": 0.7977161407470703, + "learning_rate": 5.944232752689936e-05, + "loss": 2.7083, + "step": 6366 + }, + { + "epoch": 0.5763555716484113, + "grad_norm": 0.8528048396110535, + "learning_rate": 5.941332343474349e-05, + "loss": 2.715, + "step": 6367 + }, + { + "epoch": 0.5764460939621616, + "grad_norm": 0.9308705925941467, + "learning_rate": 5.9384323429664845e-05, + "loss": 2.6994, + "step": 6368 + }, + { + "epoch": 0.576536616275912, + "grad_norm": 0.9029669761657715, + "learning_rate": 5.93553275145837e-05, + "loss": 2.7165, + "step": 6369 + }, + { + "epoch": 0.5766271385896623, + "grad_norm": 0.791476309299469, + "learning_rate": 5.9326335692419995e-05, + "loss": 2.7014, + "step": 6370 + }, + { + "epoch": 0.5767176609034127, + "grad_norm": 0.8183833956718445, + "learning_rate": 5.929734796609315e-05, + "loss": 2.742, + "step": 6371 + }, + { + "epoch": 0.576808183217163, + "grad_norm": 0.9724903702735901, + "learning_rate": 5.926836433852227e-05, + "loss": 2.6981, + "step": 6372 + }, + { + "epoch": 0.5768987055309134, + "grad_norm": 0.8855651021003723, + "learning_rate": 5.9239384812625964e-05, + "loss": 2.6728, + "step": 6373 + }, + { + "epoch": 0.5769892278446637, + "grad_norm": 0.7812449336051941, + "learning_rate": 5.9210409391322516e-05, + "loss": 2.6848, + "step": 6374 + }, + { + "epoch": 0.5770797501584141, + "grad_norm": 0.9259555339813232, + "learning_rate": 5.918143807752972e-05, + "loss": 2.7506, + "step": 6375 + }, + { + "epoch": 0.5771702724721643, + "grad_norm": 0.9594706296920776, + "learning_rate": 5.9152470874165e-05, + "loss": 2.6974, + "step": 6376 + }, + { + "epoch": 0.5772607947859147, + "grad_norm": 0.9384106397628784, + "learning_rate": 5.912350778414531e-05, + "loss": 2.6748, + "step": 6377 + }, + { + "epoch": 0.577351317099665, + "grad_norm": 0.9170209765434265, + "learning_rate": 5.909454881038729e-05, + "loss": 2.664, + "step": 6378 + }, + { + "epoch": 0.5774418394134154, + "grad_norm": 0.915307343006134, + "learning_rate": 5.906559395580709e-05, + "loss": 2.7388, + "step": 6379 + }, + { + "epoch": 0.5775323617271657, + "grad_norm": 0.8271397352218628, + "learning_rate": 5.9036643223320475e-05, + "loss": 2.6895, + "step": 6380 + }, + { + "epoch": 0.5776228840409161, + "grad_norm": 0.7941808700561523, + "learning_rate": 5.900769661584272e-05, + "loss": 2.6932, + "step": 6381 + }, + { + "epoch": 0.5777134063546664, + "grad_norm": 0.9167299270629883, + "learning_rate": 5.897875413628884e-05, + "loss": 2.7084, + "step": 6382 + }, + { + "epoch": 0.5778039286684168, + "grad_norm": 0.8044254183769226, + "learning_rate": 5.894981578757327e-05, + "loss": 2.7183, + "step": 6383 + }, + { + "epoch": 0.5778944509821671, + "grad_norm": 0.9017865061759949, + "learning_rate": 5.892088157261015e-05, + "loss": 2.7275, + "step": 6384 + }, + { + "epoch": 0.5779849732959175, + "grad_norm": 0.8614157438278198, + "learning_rate": 5.889195149431309e-05, + "loss": 2.7007, + "step": 6385 + }, + { + "epoch": 0.5780754956096678, + "grad_norm": 0.9102115631103516, + "learning_rate": 5.886302555559543e-05, + "loss": 2.7378, + "step": 6386 + }, + { + "epoch": 0.5781660179234182, + "grad_norm": 0.8719278573989868, + "learning_rate": 5.883410375936995e-05, + "loss": 2.7231, + "step": 6387 + }, + { + "epoch": 0.5782565402371684, + "grad_norm": 0.9065318703651428, + "learning_rate": 5.8805186108549114e-05, + "loss": 2.707, + "step": 6388 + }, + { + "epoch": 0.5783470625509188, + "grad_norm": 0.8618296980857849, + "learning_rate": 5.877627260604488e-05, + "loss": 2.7146, + "step": 6389 + }, + { + "epoch": 0.5784375848646691, + "grad_norm": 0.9378622770309448, + "learning_rate": 5.8747363254768894e-05, + "loss": 2.7278, + "step": 6390 + }, + { + "epoch": 0.5785281071784195, + "grad_norm": 0.9272425770759583, + "learning_rate": 5.8718458057632286e-05, + "loss": 2.6517, + "step": 6391 + }, + { + "epoch": 0.5786186294921698, + "grad_norm": 0.8620819449424744, + "learning_rate": 5.868955701754584e-05, + "loss": 2.7031, + "step": 6392 + }, + { + "epoch": 0.5787091518059202, + "grad_norm": 0.8210526704788208, + "learning_rate": 5.866066013741983e-05, + "loss": 2.6852, + "step": 6393 + }, + { + "epoch": 0.5787996741196705, + "grad_norm": 0.8317428827285767, + "learning_rate": 5.863176742016425e-05, + "loss": 2.7107, + "step": 6394 + }, + { + "epoch": 0.5788901964334209, + "grad_norm": 0.854906439781189, + "learning_rate": 5.860287886868855e-05, + "loss": 2.7202, + "step": 6395 + }, + { + "epoch": 0.5789807187471712, + "grad_norm": 0.8338930606842041, + "learning_rate": 5.857399448590183e-05, + "loss": 2.6836, + "step": 6396 + }, + { + "epoch": 0.5790712410609216, + "grad_norm": 0.8774193525314331, + "learning_rate": 5.8545114274712695e-05, + "loss": 2.7342, + "step": 6397 + }, + { + "epoch": 0.5791617633746718, + "grad_norm": 0.912401020526886, + "learning_rate": 5.8516238238029476e-05, + "loss": 2.683, + "step": 6398 + }, + { + "epoch": 0.5792522856884222, + "grad_norm": 0.835411548614502, + "learning_rate": 5.848736637875987e-05, + "loss": 2.7199, + "step": 6399 + }, + { + "epoch": 0.5793428080021725, + "grad_norm": 0.9073437452316284, + "learning_rate": 5.845849869981137e-05, + "loss": 2.734, + "step": 6400 + }, + { + "epoch": 0.5794333303159229, + "grad_norm": 0.8086467385292053, + "learning_rate": 5.8429635204090904e-05, + "loss": 2.7081, + "step": 6401 + }, + { + "epoch": 0.5795238526296732, + "grad_norm": 0.8387646675109863, + "learning_rate": 5.840077589450505e-05, + "loss": 2.7258, + "step": 6402 + }, + { + "epoch": 0.5796143749434236, + "grad_norm": 0.870087742805481, + "learning_rate": 5.83719207739599e-05, + "loss": 2.7826, + "step": 6403 + }, + { + "epoch": 0.5797048972571739, + "grad_norm": 0.8552343249320984, + "learning_rate": 5.83430698453612e-05, + "loss": 2.7115, + "step": 6404 + }, + { + "epoch": 0.5797954195709243, + "grad_norm": 0.8185401558876038, + "learning_rate": 5.831422311161421e-05, + "loss": 2.6749, + "step": 6405 + }, + { + "epoch": 0.5798859418846746, + "grad_norm": 0.8774237036705017, + "learning_rate": 5.8285380575623826e-05, + "loss": 2.6766, + "step": 6406 + }, + { + "epoch": 0.5799764641984249, + "grad_norm": 0.9418449401855469, + "learning_rate": 5.825654224029447e-05, + "loss": 2.6728, + "step": 6407 + }, + { + "epoch": 0.5800669865121753, + "grad_norm": 0.8073645830154419, + "learning_rate": 5.822770810853022e-05, + "loss": 2.6885, + "step": 6408 + }, + { + "epoch": 0.5801575088259255, + "grad_norm": 0.8460543751716614, + "learning_rate": 5.819887818323453e-05, + "loss": 2.7333, + "step": 6409 + }, + { + "epoch": 0.5802480311396759, + "grad_norm": 0.9544316530227661, + "learning_rate": 5.817005246731073e-05, + "loss": 2.689, + "step": 6410 + }, + { + "epoch": 0.5803385534534262, + "grad_norm": 0.8486777544021606, + "learning_rate": 5.814123096366148e-05, + "loss": 2.7374, + "step": 6411 + }, + { + "epoch": 0.5804290757671766, + "grad_norm": 0.8409209251403809, + "learning_rate": 5.811241367518914e-05, + "loss": 2.7153, + "step": 6412 + }, + { + "epoch": 0.5805195980809269, + "grad_norm": 0.8512220978736877, + "learning_rate": 5.808360060479557e-05, + "loss": 2.708, + "step": 6413 + }, + { + "epoch": 0.5806101203946773, + "grad_norm": 0.9078315496444702, + "learning_rate": 5.805479175538229e-05, + "loss": 2.7111, + "step": 6414 + }, + { + "epoch": 0.5807006427084276, + "grad_norm": 0.8165913224220276, + "learning_rate": 5.802598712985032e-05, + "loss": 2.6777, + "step": 6415 + }, + { + "epoch": 0.580791165022178, + "grad_norm": 0.890654444694519, + "learning_rate": 5.799718673110035e-05, + "loss": 2.7412, + "step": 6416 + }, + { + "epoch": 0.5808816873359283, + "grad_norm": 0.8421646356582642, + "learning_rate": 5.796839056203247e-05, + "loss": 2.7217, + "step": 6417 + }, + { + "epoch": 0.5809722096496787, + "grad_norm": 0.8950380086898804, + "learning_rate": 5.793959862554652e-05, + "loss": 2.6949, + "step": 6418 + }, + { + "epoch": 0.581062731963429, + "grad_norm": 0.8083726167678833, + "learning_rate": 5.7910810924541844e-05, + "loss": 2.6173, + "step": 6419 + }, + { + "epoch": 0.5811532542771793, + "grad_norm": 0.9236308336257935, + "learning_rate": 5.788202746191734e-05, + "loss": 2.7524, + "step": 6420 + }, + { + "epoch": 0.5812437765909296, + "grad_norm": 0.8910950422286987, + "learning_rate": 5.785324824057157e-05, + "loss": 2.6792, + "step": 6421 + }, + { + "epoch": 0.58133429890468, + "grad_norm": 0.798999547958374, + "learning_rate": 5.782447326340247e-05, + "loss": 2.6614, + "step": 6422 + }, + { + "epoch": 0.5814248212184303, + "grad_norm": 0.8792107701301575, + "learning_rate": 5.779570253330784e-05, + "loss": 2.6563, + "step": 6423 + }, + { + "epoch": 0.5815153435321807, + "grad_norm": 0.9009785652160645, + "learning_rate": 5.776693605318476e-05, + "loss": 2.7166, + "step": 6424 + }, + { + "epoch": 0.581605865845931, + "grad_norm": 0.8150326013565063, + "learning_rate": 5.773817382593008e-05, + "loss": 2.738, + "step": 6425 + }, + { + "epoch": 0.5816963881596814, + "grad_norm": 0.8171740770339966, + "learning_rate": 5.7709415854440116e-05, + "loss": 2.6949, + "step": 6426 + }, + { + "epoch": 0.5817869104734317, + "grad_norm": 0.9186320900917053, + "learning_rate": 5.768066214161087e-05, + "loss": 2.686, + "step": 6427 + }, + { + "epoch": 0.5818774327871821, + "grad_norm": 0.8507105708122253, + "learning_rate": 5.7651912690337716e-05, + "loss": 2.7283, + "step": 6428 + }, + { + "epoch": 0.5819679551009324, + "grad_norm": 0.8721489906311035, + "learning_rate": 5.762316750351586e-05, + "loss": 2.6998, + "step": 6429 + }, + { + "epoch": 0.5820584774146828, + "grad_norm": 0.9127447009086609, + "learning_rate": 5.759442658403985e-05, + "loss": 2.696, + "step": 6430 + }, + { + "epoch": 0.582148999728433, + "grad_norm": 0.8901234865188599, + "learning_rate": 5.756568993480391e-05, + "loss": 2.6944, + "step": 6431 + }, + { + "epoch": 0.5822395220421834, + "grad_norm": 0.8674662113189697, + "learning_rate": 5.753695755870185e-05, + "loss": 2.6569, + "step": 6432 + }, + { + "epoch": 0.5823300443559337, + "grad_norm": 0.8261517882347107, + "learning_rate": 5.7508229458627e-05, + "loss": 2.6845, + "step": 6433 + }, + { + "epoch": 0.5824205666696841, + "grad_norm": 0.8022430539131165, + "learning_rate": 5.7479505637472264e-05, + "loss": 2.6961, + "step": 6434 + }, + { + "epoch": 0.5825110889834344, + "grad_norm": 0.9176723957061768, + "learning_rate": 5.7450786098130194e-05, + "loss": 2.6622, + "step": 6435 + }, + { + "epoch": 0.5826016112971848, + "grad_norm": 0.9195401072502136, + "learning_rate": 5.7422070843492734e-05, + "loss": 2.7736, + "step": 6436 + }, + { + "epoch": 0.5826921336109351, + "grad_norm": 0.9636011719703674, + "learning_rate": 5.7393359876451634e-05, + "loss": 2.7399, + "step": 6437 + }, + { + "epoch": 0.5827826559246855, + "grad_norm": 0.870815098285675, + "learning_rate": 5.7364653199898e-05, + "loss": 2.7254, + "step": 6438 + }, + { + "epoch": 0.5828731782384358, + "grad_norm": 0.881637692451477, + "learning_rate": 5.733595081672263e-05, + "loss": 2.6513, + "step": 6439 + }, + { + "epoch": 0.5829637005521862, + "grad_norm": 0.9924260973930359, + "learning_rate": 5.7307252729815833e-05, + "loss": 2.7274, + "step": 6440 + }, + { + "epoch": 0.5830542228659364, + "grad_norm": 0.853918731212616, + "learning_rate": 5.727855894206752e-05, + "loss": 2.7194, + "step": 6441 + }, + { + "epoch": 0.5831447451796868, + "grad_norm": 0.907598078250885, + "learning_rate": 5.7249869456367146e-05, + "loss": 2.7265, + "step": 6442 + }, + { + "epoch": 0.5832352674934371, + "grad_norm": 0.8251694440841675, + "learning_rate": 5.722118427560379e-05, + "loss": 2.7336, + "step": 6443 + }, + { + "epoch": 0.5833257898071875, + "grad_norm": 0.8961870670318604, + "learning_rate": 5.7192503402665944e-05, + "loss": 2.7209, + "step": 6444 + }, + { + "epoch": 0.5834163121209378, + "grad_norm": 0.8750089406967163, + "learning_rate": 5.71638268404419e-05, + "loss": 2.7938, + "step": 6445 + }, + { + "epoch": 0.5835068344346882, + "grad_norm": 0.8658246397972107, + "learning_rate": 5.7135154591819287e-05, + "loss": 2.6955, + "step": 6446 + }, + { + "epoch": 0.5835973567484385, + "grad_norm": 0.8144567608833313, + "learning_rate": 5.710648665968543e-05, + "loss": 2.6923, + "step": 6447 + }, + { + "epoch": 0.5836878790621888, + "grad_norm": 0.8991824388504028, + "learning_rate": 5.707782304692719e-05, + "loss": 2.694, + "step": 6448 + }, + { + "epoch": 0.5837784013759392, + "grad_norm": 1.0232534408569336, + "learning_rate": 5.704916375643099e-05, + "loss": 2.7462, + "step": 6449 + }, + { + "epoch": 0.5838689236896895, + "grad_norm": 0.8977211117744446, + "learning_rate": 5.702050879108284e-05, + "loss": 2.74, + "step": 6450 + }, + { + "epoch": 0.5839594460034399, + "grad_norm": 0.8499026894569397, + "learning_rate": 5.6991858153768305e-05, + "loss": 2.697, + "step": 6451 + }, + { + "epoch": 0.5840499683171901, + "grad_norm": 0.8971026539802551, + "learning_rate": 5.696321184737241e-05, + "loss": 2.6834, + "step": 6452 + }, + { + "epoch": 0.5841404906309405, + "grad_norm": 0.9016530513763428, + "learning_rate": 5.693456987477996e-05, + "loss": 2.7218, + "step": 6453 + }, + { + "epoch": 0.5842310129446908, + "grad_norm": 0.8605112433433533, + "learning_rate": 5.6905932238875123e-05, + "loss": 2.7338, + "step": 6454 + }, + { + "epoch": 0.5843215352584412, + "grad_norm": 0.8572389483451843, + "learning_rate": 5.6877298942541746e-05, + "loss": 2.7093, + "step": 6455 + }, + { + "epoch": 0.5844120575721915, + "grad_norm": 0.8466581702232361, + "learning_rate": 5.684866998866316e-05, + "loss": 2.6993, + "step": 6456 + }, + { + "epoch": 0.5845025798859419, + "grad_norm": 0.8294211030006409, + "learning_rate": 5.682004538012233e-05, + "loss": 2.6486, + "step": 6457 + }, + { + "epoch": 0.5845931021996922, + "grad_norm": 0.7687167525291443, + "learning_rate": 5.679142511980175e-05, + "loss": 2.6596, + "step": 6458 + }, + { + "epoch": 0.5846836245134426, + "grad_norm": 0.7967802882194519, + "learning_rate": 5.6762809210583534e-05, + "loss": 2.7049, + "step": 6459 + }, + { + "epoch": 0.5847741468271929, + "grad_norm": 0.905829906463623, + "learning_rate": 5.6734197655349156e-05, + "loss": 2.6424, + "step": 6460 + }, + { + "epoch": 0.5848646691409433, + "grad_norm": 0.7958777546882629, + "learning_rate": 5.670559045697996e-05, + "loss": 2.664, + "step": 6461 + }, + { + "epoch": 0.5849551914546935, + "grad_norm": 0.8610337376594543, + "learning_rate": 5.66769876183566e-05, + "loss": 2.7594, + "step": 6462 + }, + { + "epoch": 0.5850457137684439, + "grad_norm": 0.9278115034103394, + "learning_rate": 5.664838914235939e-05, + "loss": 2.7304, + "step": 6463 + }, + { + "epoch": 0.5851362360821942, + "grad_norm": 0.9422511458396912, + "learning_rate": 5.661979503186821e-05, + "loss": 2.7364, + "step": 6464 + }, + { + "epoch": 0.5852267583959446, + "grad_norm": 0.8577379584312439, + "learning_rate": 5.659120528976252e-05, + "loss": 2.8038, + "step": 6465 + }, + { + "epoch": 0.5853172807096949, + "grad_norm": 0.8961200714111328, + "learning_rate": 5.65626199189212e-05, + "loss": 2.6653, + "step": 6466 + }, + { + "epoch": 0.5854078030234453, + "grad_norm": 0.8905138373374939, + "learning_rate": 5.653403892222293e-05, + "loss": 2.7148, + "step": 6467 + }, + { + "epoch": 0.5854983253371956, + "grad_norm": 0.9227715134620667, + "learning_rate": 5.650546230254572e-05, + "loss": 2.73, + "step": 6468 + }, + { + "epoch": 0.585588847650946, + "grad_norm": 0.8112932443618774, + "learning_rate": 5.647689006276726e-05, + "loss": 2.6512, + "step": 6469 + }, + { + "epoch": 0.5856793699646963, + "grad_norm": 0.8007726073265076, + "learning_rate": 5.6448322205764794e-05, + "loss": 2.6787, + "step": 6470 + }, + { + "epoch": 0.5857698922784467, + "grad_norm": 0.9057304859161377, + "learning_rate": 5.6419758734415075e-05, + "loss": 2.721, + "step": 6471 + }, + { + "epoch": 0.585860414592197, + "grad_norm": 0.8815562725067139, + "learning_rate": 5.639119965159446e-05, + "loss": 2.6815, + "step": 6472 + }, + { + "epoch": 0.5859509369059474, + "grad_norm": 0.8501884341239929, + "learning_rate": 5.636264496017889e-05, + "loss": 2.6578, + "step": 6473 + }, + { + "epoch": 0.5860414592196976, + "grad_norm": 0.9638636708259583, + "learning_rate": 5.63340946630437e-05, + "loss": 2.7108, + "step": 6474 + }, + { + "epoch": 0.586131981533448, + "grad_norm": 0.8438670635223389, + "learning_rate": 5.630554876306407e-05, + "loss": 2.6919, + "step": 6475 + }, + { + "epoch": 0.5862225038471983, + "grad_norm": 0.8943795561790466, + "learning_rate": 5.6277007263114437e-05, + "loss": 2.6687, + "step": 6476 + }, + { + "epoch": 0.5863130261609487, + "grad_norm": 0.8650156259536743, + "learning_rate": 5.624847016606898e-05, + "loss": 2.7242, + "step": 6477 + }, + { + "epoch": 0.586403548474699, + "grad_norm": 0.949901819229126, + "learning_rate": 5.6219937474801366e-05, + "loss": 2.7432, + "step": 6478 + }, + { + "epoch": 0.5864940707884494, + "grad_norm": 0.8957086205482483, + "learning_rate": 5.619140919218486e-05, + "loss": 2.7215, + "step": 6479 + }, + { + "epoch": 0.5865845931021997, + "grad_norm": 0.8338333964347839, + "learning_rate": 5.616288532109225e-05, + "loss": 2.7067, + "step": 6480 + }, + { + "epoch": 0.5866751154159501, + "grad_norm": 0.7819648385047913, + "learning_rate": 5.613436586439593e-05, + "loss": 2.6851, + "step": 6481 + }, + { + "epoch": 0.5867656377297004, + "grad_norm": 0.9758778214454651, + "learning_rate": 5.610585082496769e-05, + "loss": 2.7379, + "step": 6482 + }, + { + "epoch": 0.5868561600434508, + "grad_norm": 0.9074897766113281, + "learning_rate": 5.607734020567914e-05, + "loss": 2.7016, + "step": 6483 + }, + { + "epoch": 0.586946682357201, + "grad_norm": 0.8512108325958252, + "learning_rate": 5.6048834009401196e-05, + "loss": 2.7295, + "step": 6484 + }, + { + "epoch": 0.5870372046709514, + "grad_norm": 0.8494038581848145, + "learning_rate": 5.602033223900447e-05, + "loss": 2.694, + "step": 6485 + }, + { + "epoch": 0.5871277269847017, + "grad_norm": 0.8115109205245972, + "learning_rate": 5.599183489735907e-05, + "loss": 2.6858, + "step": 6486 + }, + { + "epoch": 0.5872182492984521, + "grad_norm": 0.8463485836982727, + "learning_rate": 5.5963341987334704e-05, + "loss": 2.6405, + "step": 6487 + }, + { + "epoch": 0.5873087716122024, + "grad_norm": 0.7917320728302002, + "learning_rate": 5.593485351180059e-05, + "loss": 2.674, + "step": 6488 + }, + { + "epoch": 0.5873992939259527, + "grad_norm": 0.819711446762085, + "learning_rate": 5.590636947362557e-05, + "loss": 2.811, + "step": 6489 + }, + { + "epoch": 0.5874898162397031, + "grad_norm": 0.92168128490448, + "learning_rate": 5.5877889875677845e-05, + "loss": 2.6875, + "step": 6490 + }, + { + "epoch": 0.5875803385534534, + "grad_norm": 0.864594042301178, + "learning_rate": 5.584941472082549e-05, + "loss": 2.6684, + "step": 6491 + }, + { + "epoch": 0.5876708608672038, + "grad_norm": 0.8601884245872498, + "learning_rate": 5.5820944011935826e-05, + "loss": 2.713, + "step": 6492 + }, + { + "epoch": 0.5877613831809541, + "grad_norm": 0.8170925378799438, + "learning_rate": 5.5792477751875886e-05, + "loss": 2.7317, + "step": 6493 + }, + { + "epoch": 0.5878519054947045, + "grad_norm": 0.9108325839042664, + "learning_rate": 5.576401594351223e-05, + "loss": 2.765, + "step": 6494 + }, + { + "epoch": 0.5879424278084547, + "grad_norm": 0.8704808354377747, + "learning_rate": 5.5735558589711e-05, + "loss": 2.7455, + "step": 6495 + }, + { + "epoch": 0.5880329501222051, + "grad_norm": 0.868759036064148, + "learning_rate": 5.570710569333772e-05, + "loss": 2.7448, + "step": 6496 + }, + { + "epoch": 0.5881234724359554, + "grad_norm": 0.884606122970581, + "learning_rate": 5.567865725725777e-05, + "loss": 2.6955, + "step": 6497 + }, + { + "epoch": 0.5882139947497058, + "grad_norm": 0.891055703163147, + "learning_rate": 5.565021328433578e-05, + "loss": 2.6881, + "step": 6498 + }, + { + "epoch": 0.5883045170634561, + "grad_norm": 0.8647257089614868, + "learning_rate": 5.56217737774361e-05, + "loss": 2.745, + "step": 6499 + }, + { + "epoch": 0.5883950393772065, + "grad_norm": 0.8522151112556458, + "learning_rate": 5.559333873942259e-05, + "loss": 2.758, + "step": 6500 + }, + { + "epoch": 0.5884855616909568, + "grad_norm": 0.8541638851165771, + "learning_rate": 5.556490817315865e-05, + "loss": 2.6753, + "step": 6501 + }, + { + "epoch": 0.5885760840047072, + "grad_norm": 0.8651673197746277, + "learning_rate": 5.553648208150728e-05, + "loss": 2.6963, + "step": 6502 + }, + { + "epoch": 0.5886666063184575, + "grad_norm": 0.8872950077056885, + "learning_rate": 5.5508060467330915e-05, + "loss": 2.7789, + "step": 6503 + }, + { + "epoch": 0.5887571286322079, + "grad_norm": 0.8444436192512512, + "learning_rate": 5.547964333349164e-05, + "loss": 2.7032, + "step": 6504 + }, + { + "epoch": 0.5888476509459581, + "grad_norm": 0.835922122001648, + "learning_rate": 5.545123068285105e-05, + "loss": 2.7042, + "step": 6505 + }, + { + "epoch": 0.5889381732597085, + "grad_norm": 0.8999233245849609, + "learning_rate": 5.542282251827034e-05, + "loss": 2.7133, + "step": 6506 + }, + { + "epoch": 0.5890286955734588, + "grad_norm": 0.8679043650627136, + "learning_rate": 5.539441884261016e-05, + "loss": 2.6582, + "step": 6507 + }, + { + "epoch": 0.5891192178872092, + "grad_norm": 0.8555617928504944, + "learning_rate": 5.5366019658730825e-05, + "loss": 2.6486, + "step": 6508 + }, + { + "epoch": 0.5892097402009595, + "grad_norm": 0.8898964524269104, + "learning_rate": 5.5337624969492006e-05, + "loss": 2.7339, + "step": 6509 + }, + { + "epoch": 0.5893002625147099, + "grad_norm": 0.890902042388916, + "learning_rate": 5.530923477775323e-05, + "loss": 2.6978, + "step": 6510 + }, + { + "epoch": 0.5893907848284602, + "grad_norm": 0.8486526012420654, + "learning_rate": 5.528084908637323e-05, + "loss": 2.6369, + "step": 6511 + }, + { + "epoch": 0.5894813071422106, + "grad_norm": 0.8361667990684509, + "learning_rate": 5.5252467898210505e-05, + "loss": 2.692, + "step": 6512 + }, + { + "epoch": 0.5895718294559609, + "grad_norm": 0.8661167025566101, + "learning_rate": 5.522409121612304e-05, + "loss": 2.7287, + "step": 6513 + }, + { + "epoch": 0.5896623517697113, + "grad_norm": 0.9223380088806152, + "learning_rate": 5.5195719042968365e-05, + "loss": 2.7243, + "step": 6514 + }, + { + "epoch": 0.5897528740834616, + "grad_norm": 0.8186379075050354, + "learning_rate": 5.516735138160356e-05, + "loss": 2.6714, + "step": 6515 + }, + { + "epoch": 0.589843396397212, + "grad_norm": 1.0045249462127686, + "learning_rate": 5.513898823488528e-05, + "loss": 2.7635, + "step": 6516 + }, + { + "epoch": 0.5899339187109622, + "grad_norm": 0.8674926161766052, + "learning_rate": 5.5110629605669574e-05, + "loss": 2.7173, + "step": 6517 + }, + { + "epoch": 0.5900244410247126, + "grad_norm": 0.8774318695068359, + "learning_rate": 5.508227549681233e-05, + "loss": 2.7164, + "step": 6518 + }, + { + "epoch": 0.5901149633384629, + "grad_norm": 0.972149133682251, + "learning_rate": 5.505392591116867e-05, + "loss": 2.7337, + "step": 6519 + }, + { + "epoch": 0.5902054856522133, + "grad_norm": 0.847661018371582, + "learning_rate": 5.5025580851593436e-05, + "loss": 2.7372, + "step": 6520 + }, + { + "epoch": 0.5902960079659636, + "grad_norm": 0.9635895490646362, + "learning_rate": 5.499724032094098e-05, + "loss": 2.7392, + "step": 6521 + }, + { + "epoch": 0.590386530279714, + "grad_norm": 0.8678410649299622, + "learning_rate": 5.496890432206524e-05, + "loss": 2.6731, + "step": 6522 + }, + { + "epoch": 0.5904770525934643, + "grad_norm": 0.9082297682762146, + "learning_rate": 5.4940572857819516e-05, + "loss": 2.7296, + "step": 6523 + }, + { + "epoch": 0.5905675749072147, + "grad_norm": 0.9244065880775452, + "learning_rate": 5.491224593105695e-05, + "loss": 2.7483, + "step": 6524 + }, + { + "epoch": 0.590658097220965, + "grad_norm": 0.8604749441146851, + "learning_rate": 5.4883923544629955e-05, + "loss": 2.7175, + "step": 6525 + }, + { + "epoch": 0.5907486195347154, + "grad_norm": 0.9151113033294678, + "learning_rate": 5.485560570139061e-05, + "loss": 2.6912, + "step": 6526 + }, + { + "epoch": 0.5908391418484656, + "grad_norm": 0.8470368981361389, + "learning_rate": 5.482729240419056e-05, + "loss": 2.7286, + "step": 6527 + }, + { + "epoch": 0.590929664162216, + "grad_norm": 0.7878344058990479, + "learning_rate": 5.4798983655880896e-05, + "loss": 2.6645, + "step": 6528 + }, + { + "epoch": 0.5910201864759663, + "grad_norm": 0.8447884917259216, + "learning_rate": 5.477067945931237e-05, + "loss": 2.6488, + "step": 6529 + }, + { + "epoch": 0.5911107087897166, + "grad_norm": 0.8558040857315063, + "learning_rate": 5.474237981733521e-05, + "loss": 2.6722, + "step": 6530 + }, + { + "epoch": 0.591201231103467, + "grad_norm": 0.9111878871917725, + "learning_rate": 5.4714084732799084e-05, + "loss": 2.7111, + "step": 6531 + }, + { + "epoch": 0.5912917534172173, + "grad_norm": 0.8919588923454285, + "learning_rate": 5.4685794208553465e-05, + "loss": 2.7136, + "step": 6532 + }, + { + "epoch": 0.5913822757309677, + "grad_norm": 0.8634358048439026, + "learning_rate": 5.465750824744711e-05, + "loss": 2.6038, + "step": 6533 + }, + { + "epoch": 0.591472798044718, + "grad_norm": 0.8753179311752319, + "learning_rate": 5.462922685232841e-05, + "loss": 2.6767, + "step": 6534 + }, + { + "epoch": 0.5915633203584684, + "grad_norm": 0.8791088461875916, + "learning_rate": 5.4600950026045326e-05, + "loss": 2.7181, + "step": 6535 + }, + { + "epoch": 0.5916538426722187, + "grad_norm": 0.8607524633407593, + "learning_rate": 5.4572677771445344e-05, + "loss": 2.6828, + "step": 6536 + }, + { + "epoch": 0.591744364985969, + "grad_norm": 0.9010187983512878, + "learning_rate": 5.4544410091375454e-05, + "loss": 2.6991, + "step": 6537 + }, + { + "epoch": 0.5918348872997193, + "grad_norm": 0.8918096423149109, + "learning_rate": 5.4516146988682285e-05, + "loss": 2.7083, + "step": 6538 + }, + { + "epoch": 0.5919254096134697, + "grad_norm": 0.8350089192390442, + "learning_rate": 5.448788846621178e-05, + "loss": 2.7102, + "step": 6539 + }, + { + "epoch": 0.59201593192722, + "grad_norm": 0.8323737382888794, + "learning_rate": 5.445963452680973e-05, + "loss": 2.7275, + "step": 6540 + }, + { + "epoch": 0.5921064542409704, + "grad_norm": 0.8640821576118469, + "learning_rate": 5.4431385173321204e-05, + "loss": 2.7141, + "step": 6541 + }, + { + "epoch": 0.5921969765547207, + "grad_norm": 0.8797996044158936, + "learning_rate": 5.440314040859094e-05, + "loss": 2.7086, + "step": 6542 + }, + { + "epoch": 0.5922874988684711, + "grad_norm": 0.809884250164032, + "learning_rate": 5.43749002354632e-05, + "loss": 2.5982, + "step": 6543 + }, + { + "epoch": 0.5923780211822214, + "grad_norm": 0.9129722714424133, + "learning_rate": 5.434666465678175e-05, + "loss": 2.8024, + "step": 6544 + }, + { + "epoch": 0.5924685434959718, + "grad_norm": 0.9209685921669006, + "learning_rate": 5.431843367538992e-05, + "loss": 2.7221, + "step": 6545 + }, + { + "epoch": 0.5925590658097221, + "grad_norm": 0.8171425461769104, + "learning_rate": 5.4290207294130615e-05, + "loss": 2.7129, + "step": 6546 + }, + { + "epoch": 0.5926495881234725, + "grad_norm": 0.8546719551086426, + "learning_rate": 5.426198551584609e-05, + "loss": 2.6868, + "step": 6547 + }, + { + "epoch": 0.5927401104372227, + "grad_norm": 0.8493045568466187, + "learning_rate": 5.423376834337848e-05, + "loss": 2.6751, + "step": 6548 + }, + { + "epoch": 0.5928306327509731, + "grad_norm": 0.8081749677658081, + "learning_rate": 5.4205555779569075e-05, + "loss": 2.6307, + "step": 6549 + }, + { + "epoch": 0.5929211550647234, + "grad_norm": 0.9289596676826477, + "learning_rate": 5.417734782725896e-05, + "loss": 2.6709, + "step": 6550 + }, + { + "epoch": 0.5930116773784738, + "grad_norm": 0.844437837600708, + "learning_rate": 5.414914448928867e-05, + "loss": 2.7338, + "step": 6551 + }, + { + "epoch": 0.5931021996922241, + "grad_norm": 0.905487060546875, + "learning_rate": 5.412094576849829e-05, + "loss": 2.6682, + "step": 6552 + }, + { + "epoch": 0.5931927220059745, + "grad_norm": 1.018079400062561, + "learning_rate": 5.409275166772738e-05, + "loss": 2.7137, + "step": 6553 + }, + { + "epoch": 0.5932832443197248, + "grad_norm": 0.9324988126754761, + "learning_rate": 5.406456218981519e-05, + "loss": 2.7833, + "step": 6554 + }, + { + "epoch": 0.5933737666334752, + "grad_norm": 0.8467705249786377, + "learning_rate": 5.403637733760024e-05, + "loss": 2.6998, + "step": 6555 + }, + { + "epoch": 0.5934642889472255, + "grad_norm": 0.878297746181488, + "learning_rate": 5.400819711392091e-05, + "loss": 2.6542, + "step": 6556 + }, + { + "epoch": 0.5935548112609759, + "grad_norm": 1.018547773361206, + "learning_rate": 5.398002152161484e-05, + "loss": 2.753, + "step": 6557 + }, + { + "epoch": 0.5936453335747262, + "grad_norm": 0.8300045132637024, + "learning_rate": 5.395185056351935e-05, + "loss": 2.6834, + "step": 6558 + }, + { + "epoch": 0.5937358558884765, + "grad_norm": 0.8211262822151184, + "learning_rate": 5.392368424247123e-05, + "loss": 2.6606, + "step": 6559 + }, + { + "epoch": 0.5938263782022268, + "grad_norm": 0.936378538608551, + "learning_rate": 5.38955225613069e-05, + "loss": 2.6774, + "step": 6560 + }, + { + "epoch": 0.5939169005159772, + "grad_norm": 0.8802645206451416, + "learning_rate": 5.38673655228621e-05, + "loss": 2.6995, + "step": 6561 + }, + { + "epoch": 0.5940074228297275, + "grad_norm": 0.8793949484825134, + "learning_rate": 5.383921312997242e-05, + "loss": 2.7237, + "step": 6562 + }, + { + "epoch": 0.5940979451434779, + "grad_norm": 0.8559528589248657, + "learning_rate": 5.381106538547266e-05, + "loss": 2.6689, + "step": 6563 + }, + { + "epoch": 0.5941884674572282, + "grad_norm": 0.9179282188415527, + "learning_rate": 5.3782922292197354e-05, + "loss": 2.6752, + "step": 6564 + }, + { + "epoch": 0.5942789897709786, + "grad_norm": 0.9603455662727356, + "learning_rate": 5.375478385298052e-05, + "loss": 2.7392, + "step": 6565 + }, + { + "epoch": 0.5943695120847289, + "grad_norm": 0.8355276584625244, + "learning_rate": 5.372665007065567e-05, + "loss": 2.6832, + "step": 6566 + }, + { + "epoch": 0.5944600343984793, + "grad_norm": 0.8501160740852356, + "learning_rate": 5.3698520948055906e-05, + "loss": 2.6979, + "step": 6567 + }, + { + "epoch": 0.5945505567122296, + "grad_norm": 0.8548684120178223, + "learning_rate": 5.3670396488013854e-05, + "loss": 2.67, + "step": 6568 + }, + { + "epoch": 0.59464107902598, + "grad_norm": 0.8284228444099426, + "learning_rate": 5.364227669336153e-05, + "loss": 2.6429, + "step": 6569 + }, + { + "epoch": 0.5947316013397302, + "grad_norm": 0.8854268193244934, + "learning_rate": 5.361416156693075e-05, + "loss": 2.6237, + "step": 6570 + }, + { + "epoch": 0.5948221236534805, + "grad_norm": 0.9293057322502136, + "learning_rate": 5.358605111155259e-05, + "loss": 2.704, + "step": 6571 + }, + { + "epoch": 0.5949126459672309, + "grad_norm": 0.917051374912262, + "learning_rate": 5.3557945330057813e-05, + "loss": 2.7112, + "step": 6572 + }, + { + "epoch": 0.5950031682809812, + "grad_norm": 0.8866673111915588, + "learning_rate": 5.352984422527667e-05, + "loss": 2.6689, + "step": 6573 + }, + { + "epoch": 0.5950936905947316, + "grad_norm": 0.8097630143165588, + "learning_rate": 5.3501747800038934e-05, + "loss": 2.6382, + "step": 6574 + }, + { + "epoch": 0.5951842129084819, + "grad_norm": 0.8683230876922607, + "learning_rate": 5.347365605717394e-05, + "loss": 2.7184, + "step": 6575 + }, + { + "epoch": 0.5952747352222323, + "grad_norm": 0.9342178106307983, + "learning_rate": 5.344556899951054e-05, + "loss": 2.7366, + "step": 6576 + }, + { + "epoch": 0.5953652575359826, + "grad_norm": 0.838149905204773, + "learning_rate": 5.3417486629876986e-05, + "loss": 2.737, + "step": 6577 + }, + { + "epoch": 0.595455779849733, + "grad_norm": 0.8259788751602173, + "learning_rate": 5.3389408951101336e-05, + "loss": 2.7081, + "step": 6578 + }, + { + "epoch": 0.5955463021634833, + "grad_norm": 0.8724661469459534, + "learning_rate": 5.33613359660109e-05, + "loss": 2.6943, + "step": 6579 + }, + { + "epoch": 0.5956368244772337, + "grad_norm": 0.8654960989952087, + "learning_rate": 5.333326767743263e-05, + "loss": 2.7003, + "step": 6580 + }, + { + "epoch": 0.5957273467909839, + "grad_norm": 0.9523176550865173, + "learning_rate": 5.330520408819304e-05, + "loss": 2.6237, + "step": 6581 + }, + { + "epoch": 0.5958178691047343, + "grad_norm": 0.9099210500717163, + "learning_rate": 5.327714520111813e-05, + "loss": 2.6642, + "step": 6582 + }, + { + "epoch": 0.5959083914184846, + "grad_norm": 0.8548341393470764, + "learning_rate": 5.324909101903341e-05, + "loss": 2.6689, + "step": 6583 + }, + { + "epoch": 0.595998913732235, + "grad_norm": 0.9076147675514221, + "learning_rate": 5.322104154476399e-05, + "loss": 2.687, + "step": 6584 + }, + { + "epoch": 0.5960894360459853, + "grad_norm": 0.8491636514663696, + "learning_rate": 5.319299678113432e-05, + "loss": 2.7021, + "step": 6585 + }, + { + "epoch": 0.5961799583597357, + "grad_norm": 0.8565483689308167, + "learning_rate": 5.316495673096869e-05, + "loss": 2.7048, + "step": 6586 + }, + { + "epoch": 0.596270480673486, + "grad_norm": 0.8949451446533203, + "learning_rate": 5.313692139709061e-05, + "loss": 2.7275, + "step": 6587 + }, + { + "epoch": 0.5963610029872364, + "grad_norm": 0.8468038439750671, + "learning_rate": 5.31088907823232e-05, + "loss": 2.6339, + "step": 6588 + }, + { + "epoch": 0.5964515253009867, + "grad_norm": 0.8995200991630554, + "learning_rate": 5.308086488948929e-05, + "loss": 2.6561, + "step": 6589 + }, + { + "epoch": 0.5965420476147371, + "grad_norm": 0.8418900370597839, + "learning_rate": 5.305284372141095e-05, + "loss": 2.7061, + "step": 6590 + }, + { + "epoch": 0.5966325699284873, + "grad_norm": 0.8721002340316772, + "learning_rate": 5.302482728090996e-05, + "loss": 2.727, + "step": 6591 + }, + { + "epoch": 0.5967230922422377, + "grad_norm": 0.9773480296134949, + "learning_rate": 5.299681557080759e-05, + "loss": 2.7723, + "step": 6592 + }, + { + "epoch": 0.596813614555988, + "grad_norm": 0.8725921511650085, + "learning_rate": 5.2968808593924604e-05, + "loss": 2.6763, + "step": 6593 + }, + { + "epoch": 0.5969041368697384, + "grad_norm": 0.8329956531524658, + "learning_rate": 5.2940806353081296e-05, + "loss": 2.6554, + "step": 6594 + }, + { + "epoch": 0.5969946591834887, + "grad_norm": 0.7967681288719177, + "learning_rate": 5.291280885109755e-05, + "loss": 2.7187, + "step": 6595 + }, + { + "epoch": 0.5970851814972391, + "grad_norm": 0.861714780330658, + "learning_rate": 5.288481609079259e-05, + "loss": 2.6465, + "step": 6596 + }, + { + "epoch": 0.5971757038109894, + "grad_norm": 0.9225656390190125, + "learning_rate": 5.285682807498542e-05, + "loss": 2.6922, + "step": 6597 + }, + { + "epoch": 0.5972662261247398, + "grad_norm": 0.8872184753417969, + "learning_rate": 5.282884480649435e-05, + "loss": 2.7618, + "step": 6598 + }, + { + "epoch": 0.5973567484384901, + "grad_norm": 0.8705012798309326, + "learning_rate": 5.280086628813731e-05, + "loss": 2.7644, + "step": 6599 + }, + { + "epoch": 0.5974472707522405, + "grad_norm": 0.8825952410697937, + "learning_rate": 5.277289252273174e-05, + "loss": 2.7019, + "step": 6600 + }, + { + "epoch": 0.5975377930659908, + "grad_norm": 0.8400018811225891, + "learning_rate": 5.274492351309461e-05, + "loss": 2.654, + "step": 6601 + }, + { + "epoch": 0.5976283153797411, + "grad_norm": 0.8624444007873535, + "learning_rate": 5.2716959262042386e-05, + "loss": 2.6893, + "step": 6602 + }, + { + "epoch": 0.5977188376934914, + "grad_norm": 0.84315425157547, + "learning_rate": 5.26889997723911e-05, + "loss": 2.6576, + "step": 6603 + }, + { + "epoch": 0.5978093600072418, + "grad_norm": 0.960757315158844, + "learning_rate": 5.266104504695617e-05, + "loss": 2.66, + "step": 6604 + }, + { + "epoch": 0.5978998823209921, + "grad_norm": 0.8555940985679626, + "learning_rate": 5.26330950885528e-05, + "loss": 2.6179, + "step": 6605 + }, + { + "epoch": 0.5979904046347425, + "grad_norm": 0.9326249361038208, + "learning_rate": 5.2605149899995406e-05, + "loss": 2.6908, + "step": 6606 + }, + { + "epoch": 0.5980809269484928, + "grad_norm": 0.9245107769966125, + "learning_rate": 5.257720948409812e-05, + "loss": 2.7202, + "step": 6607 + }, + { + "epoch": 0.5981714492622432, + "grad_norm": 0.9167315363883972, + "learning_rate": 5.254927384367454e-05, + "loss": 2.6924, + "step": 6608 + }, + { + "epoch": 0.5982619715759935, + "grad_norm": 0.8900579214096069, + "learning_rate": 5.252134298153779e-05, + "loss": 2.6718, + "step": 6609 + }, + { + "epoch": 0.5983524938897439, + "grad_norm": 0.8357805609703064, + "learning_rate": 5.249341690050051e-05, + "loss": 2.6777, + "step": 6610 + }, + { + "epoch": 0.5984430162034942, + "grad_norm": 0.8500257134437561, + "learning_rate": 5.246549560337488e-05, + "loss": 2.669, + "step": 6611 + }, + { + "epoch": 0.5985335385172444, + "grad_norm": 0.9223446846008301, + "learning_rate": 5.243757909297247e-05, + "loss": 2.7136, + "step": 6612 + }, + { + "epoch": 0.5986240608309948, + "grad_norm": 0.9422861337661743, + "learning_rate": 5.2409667372104607e-05, + "loss": 2.7072, + "step": 6613 + }, + { + "epoch": 0.5987145831447451, + "grad_norm": 0.841312050819397, + "learning_rate": 5.2381760443581916e-05, + "loss": 2.6323, + "step": 6614 + }, + { + "epoch": 0.5988051054584955, + "grad_norm": 0.9254672527313232, + "learning_rate": 5.2353858310214635e-05, + "loss": 2.7042, + "step": 6615 + }, + { + "epoch": 0.5988956277722458, + "grad_norm": 0.8144631385803223, + "learning_rate": 5.232596097481251e-05, + "loss": 2.6905, + "step": 6616 + }, + { + "epoch": 0.5989861500859962, + "grad_norm": 0.8707183003425598, + "learning_rate": 5.229806844018487e-05, + "loss": 2.7542, + "step": 6617 + }, + { + "epoch": 0.5990766723997465, + "grad_norm": 0.9004245400428772, + "learning_rate": 5.227018070914034e-05, + "loss": 2.7192, + "step": 6618 + }, + { + "epoch": 0.5991671947134969, + "grad_norm": 0.8687030673027039, + "learning_rate": 5.2242297784487395e-05, + "loss": 2.6497, + "step": 6619 + }, + { + "epoch": 0.5992577170272472, + "grad_norm": 0.8232517242431641, + "learning_rate": 5.221441966903371e-05, + "loss": 2.7078, + "step": 6620 + }, + { + "epoch": 0.5993482393409976, + "grad_norm": 0.7970709204673767, + "learning_rate": 5.218654636558666e-05, + "loss": 2.7088, + "step": 6621 + }, + { + "epoch": 0.5994387616547479, + "grad_norm": 0.8401699662208557, + "learning_rate": 5.2158677876953075e-05, + "loss": 2.6806, + "step": 6622 + }, + { + "epoch": 0.5995292839684982, + "grad_norm": 0.8412536978721619, + "learning_rate": 5.213081420593933e-05, + "loss": 2.7362, + "step": 6623 + }, + { + "epoch": 0.5996198062822485, + "grad_norm": 0.835806131362915, + "learning_rate": 5.2102955355351276e-05, + "loss": 2.7277, + "step": 6624 + }, + { + "epoch": 0.5997103285959989, + "grad_norm": 0.7974480986595154, + "learning_rate": 5.207510132799436e-05, + "loss": 2.6583, + "step": 6625 + }, + { + "epoch": 0.5998008509097492, + "grad_norm": 0.8804001212120056, + "learning_rate": 5.204725212667334e-05, + "loss": 2.7269, + "step": 6626 + }, + { + "epoch": 0.5998913732234996, + "grad_norm": 0.8101835250854492, + "learning_rate": 5.201940775419278e-05, + "loss": 2.6732, + "step": 6627 + }, + { + "epoch": 0.5999818955372499, + "grad_norm": 0.815082311630249, + "learning_rate": 5.199156821335653e-05, + "loss": 2.6888, + "step": 6628 + }, + { + "epoch": 0.6000724178510003, + "grad_norm": 0.8119527101516724, + "learning_rate": 5.1963733506968046e-05, + "loss": 2.5854, + "step": 6629 + }, + { + "epoch": 0.6001629401647506, + "grad_norm": 0.841860830783844, + "learning_rate": 5.193590363783028e-05, + "loss": 2.6658, + "step": 6630 + }, + { + "epoch": 0.600253462478501, + "grad_norm": 0.8963244557380676, + "learning_rate": 5.1908078608745705e-05, + "loss": 2.7245, + "step": 6631 + }, + { + "epoch": 0.6003439847922513, + "grad_norm": 0.8063508868217468, + "learning_rate": 5.1880258422516294e-05, + "loss": 2.6822, + "step": 6632 + }, + { + "epoch": 0.6004345071060017, + "grad_norm": 0.830461859703064, + "learning_rate": 5.185244308194358e-05, + "loss": 2.6834, + "step": 6633 + }, + { + "epoch": 0.6005250294197519, + "grad_norm": 0.8261478543281555, + "learning_rate": 5.182463258982846e-05, + "loss": 2.6629, + "step": 6634 + }, + { + "epoch": 0.6006155517335023, + "grad_norm": 0.8018027544021606, + "learning_rate": 5.179682694897159e-05, + "loss": 2.6857, + "step": 6635 + }, + { + "epoch": 0.6007060740472526, + "grad_norm": 0.8765165209770203, + "learning_rate": 5.17690261621729e-05, + "loss": 2.6925, + "step": 6636 + }, + { + "epoch": 0.600796596361003, + "grad_norm": 0.8018073439598083, + "learning_rate": 5.174123023223194e-05, + "loss": 2.7014, + "step": 6637 + }, + { + "epoch": 0.6008871186747533, + "grad_norm": 0.8557896018028259, + "learning_rate": 5.1713439161947785e-05, + "loss": 2.6468, + "step": 6638 + }, + { + "epoch": 0.6009776409885037, + "grad_norm": 0.8501036167144775, + "learning_rate": 5.168565295411898e-05, + "loss": 2.6903, + "step": 6639 + }, + { + "epoch": 0.601068163302254, + "grad_norm": 0.8637775182723999, + "learning_rate": 5.1657871611543605e-05, + "loss": 2.6627, + "step": 6640 + }, + { + "epoch": 0.6011586856160044, + "grad_norm": 0.8541369438171387, + "learning_rate": 5.163009513701929e-05, + "loss": 2.7018, + "step": 6641 + }, + { + "epoch": 0.6012492079297547, + "grad_norm": 0.8165152668952942, + "learning_rate": 5.1602323533342964e-05, + "loss": 2.6724, + "step": 6642 + }, + { + "epoch": 0.6013397302435051, + "grad_norm": 0.965965747833252, + "learning_rate": 5.157455680331142e-05, + "loss": 2.6428, + "step": 6643 + }, + { + "epoch": 0.6014302525572554, + "grad_norm": 0.8963990211486816, + "learning_rate": 5.154679494972066e-05, + "loss": 2.68, + "step": 6644 + }, + { + "epoch": 0.6015207748710057, + "grad_norm": 0.8216471672058105, + "learning_rate": 5.15190379753663e-05, + "loss": 2.6938, + "step": 6645 + }, + { + "epoch": 0.601611297184756, + "grad_norm": 0.8308603167533875, + "learning_rate": 5.149128588304351e-05, + "loss": 2.6742, + "step": 6646 + }, + { + "epoch": 0.6017018194985064, + "grad_norm": 0.9278542995452881, + "learning_rate": 5.146353867554688e-05, + "loss": 2.7318, + "step": 6647 + }, + { + "epoch": 0.6017923418122567, + "grad_norm": 0.9083507061004639, + "learning_rate": 5.1435796355670594e-05, + "loss": 2.7105, + "step": 6648 + }, + { + "epoch": 0.6018828641260071, + "grad_norm": 0.865098237991333, + "learning_rate": 5.140805892620833e-05, + "loss": 2.6751, + "step": 6649 + }, + { + "epoch": 0.6019733864397574, + "grad_norm": 0.813866913318634, + "learning_rate": 5.138032638995315e-05, + "loss": 2.6353, + "step": 6650 + }, + { + "epoch": 0.6020639087535078, + "grad_norm": 0.970386803150177, + "learning_rate": 5.135259874969778e-05, + "loss": 2.6791, + "step": 6651 + }, + { + "epoch": 0.6021544310672581, + "grad_norm": 0.9106497764587402, + "learning_rate": 5.132487600823438e-05, + "loss": 2.7343, + "step": 6652 + }, + { + "epoch": 0.6022449533810084, + "grad_norm": 0.8499709367752075, + "learning_rate": 5.129715816835463e-05, + "loss": 2.733, + "step": 6653 + }, + { + "epoch": 0.6023354756947588, + "grad_norm": 0.9132952094078064, + "learning_rate": 5.1269445232849734e-05, + "loss": 2.707, + "step": 6654 + }, + { + "epoch": 0.602425998008509, + "grad_norm": 0.9442603588104248, + "learning_rate": 5.1241737204510395e-05, + "loss": 2.7212, + "step": 6655 + }, + { + "epoch": 0.6025165203222594, + "grad_norm": 1.0592482089996338, + "learning_rate": 5.121403408612672e-05, + "loss": 2.7101, + "step": 6656 + }, + { + "epoch": 0.6026070426360097, + "grad_norm": 0.831420361995697, + "learning_rate": 5.1186335880488544e-05, + "loss": 2.6751, + "step": 6657 + }, + { + "epoch": 0.6026975649497601, + "grad_norm": 0.8643193244934082, + "learning_rate": 5.115864259038498e-05, + "loss": 2.71, + "step": 6658 + }, + { + "epoch": 0.6027880872635104, + "grad_norm": 0.8968473076820374, + "learning_rate": 5.1130954218604767e-05, + "loss": 2.6981, + "step": 6659 + }, + { + "epoch": 0.6028786095772608, + "grad_norm": 0.8939312696456909, + "learning_rate": 5.110327076793613e-05, + "loss": 2.7011, + "step": 6660 + }, + { + "epoch": 0.6029691318910111, + "grad_norm": 0.8195379376411438, + "learning_rate": 5.107559224116678e-05, + "loss": 2.6823, + "step": 6661 + }, + { + "epoch": 0.6030596542047615, + "grad_norm": 0.8442776799201965, + "learning_rate": 5.104791864108396e-05, + "loss": 2.696, + "step": 6662 + }, + { + "epoch": 0.6031501765185118, + "grad_norm": 0.9072765707969666, + "learning_rate": 5.102024997047443e-05, + "loss": 2.7176, + "step": 6663 + }, + { + "epoch": 0.6032406988322622, + "grad_norm": 0.8840109705924988, + "learning_rate": 5.099258623212431e-05, + "loss": 2.6998, + "step": 6664 + }, + { + "epoch": 0.6033312211460125, + "grad_norm": 0.8407348990440369, + "learning_rate": 5.096492742881949e-05, + "loss": 2.6591, + "step": 6665 + }, + { + "epoch": 0.6034217434597628, + "grad_norm": 0.8747027516365051, + "learning_rate": 5.09372735633451e-05, + "loss": 2.6933, + "step": 6666 + }, + { + "epoch": 0.6035122657735131, + "grad_norm": 0.8869914412498474, + "learning_rate": 5.090962463848592e-05, + "loss": 2.7229, + "step": 6667 + }, + { + "epoch": 0.6036027880872635, + "grad_norm": 0.8956210613250732, + "learning_rate": 5.0881980657026195e-05, + "loss": 2.6556, + "step": 6668 + }, + { + "epoch": 0.6036933104010138, + "grad_norm": 0.8411816358566284, + "learning_rate": 5.085434162174968e-05, + "loss": 2.707, + "step": 6669 + }, + { + "epoch": 0.6037838327147642, + "grad_norm": 0.8487591743469238, + "learning_rate": 5.082670753543961e-05, + "loss": 2.754, + "step": 6670 + }, + { + "epoch": 0.6038743550285145, + "grad_norm": 0.8434279561042786, + "learning_rate": 5.0799078400878786e-05, + "loss": 2.666, + "step": 6671 + }, + { + "epoch": 0.6039648773422649, + "grad_norm": 0.8222647309303284, + "learning_rate": 5.077145422084939e-05, + "loss": 2.7008, + "step": 6672 + }, + { + "epoch": 0.6040553996560152, + "grad_norm": 0.9104819893836975, + "learning_rate": 5.0743834998133196e-05, + "loss": 2.6976, + "step": 6673 + }, + { + "epoch": 0.6041459219697656, + "grad_norm": 0.829352617263794, + "learning_rate": 5.071622073551148e-05, + "loss": 2.6813, + "step": 6674 + }, + { + "epoch": 0.6042364442835159, + "grad_norm": 0.8321340680122375, + "learning_rate": 5.0688611435764975e-05, + "loss": 2.6406, + "step": 6675 + }, + { + "epoch": 0.6043269665972663, + "grad_norm": 0.8086342215538025, + "learning_rate": 5.066100710167401e-05, + "loss": 2.6696, + "step": 6676 + }, + { + "epoch": 0.6044174889110165, + "grad_norm": 0.8482884764671326, + "learning_rate": 5.063340773601818e-05, + "loss": 2.7286, + "step": 6677 + }, + { + "epoch": 0.6045080112247669, + "grad_norm": 0.857356607913971, + "learning_rate": 5.0605813341576924e-05, + "loss": 2.6962, + "step": 6678 + }, + { + "epoch": 0.6045985335385172, + "grad_norm": 0.9065026640892029, + "learning_rate": 5.0578223921128875e-05, + "loss": 2.7226, + "step": 6679 + }, + { + "epoch": 0.6046890558522676, + "grad_norm": 0.8281168937683105, + "learning_rate": 5.055063947745233e-05, + "loss": 2.6442, + "step": 6680 + }, + { + "epoch": 0.6047795781660179, + "grad_norm": 0.8826650977134705, + "learning_rate": 5.052306001332503e-05, + "loss": 2.7138, + "step": 6681 + }, + { + "epoch": 0.6048701004797683, + "grad_norm": 0.8792107105255127, + "learning_rate": 5.049548553152428e-05, + "loss": 2.7541, + "step": 6682 + }, + { + "epoch": 0.6049606227935186, + "grad_norm": 0.7904659509658813, + "learning_rate": 5.0467916034826704e-05, + "loss": 2.7115, + "step": 6683 + }, + { + "epoch": 0.605051145107269, + "grad_norm": 0.891960084438324, + "learning_rate": 5.044035152600869e-05, + "loss": 2.6897, + "step": 6684 + }, + { + "epoch": 0.6051416674210193, + "grad_norm": 0.9381482005119324, + "learning_rate": 5.04127920078459e-05, + "loss": 2.7353, + "step": 6685 + }, + { + "epoch": 0.6052321897347697, + "grad_norm": 0.8143098950386047, + "learning_rate": 5.038523748311359e-05, + "loss": 2.6349, + "step": 6686 + }, + { + "epoch": 0.60532271204852, + "grad_norm": 0.837303102016449, + "learning_rate": 5.0357687954586506e-05, + "loss": 2.6685, + "step": 6687 + }, + { + "epoch": 0.6054132343622703, + "grad_norm": 0.8285949230194092, + "learning_rate": 5.033014342503889e-05, + "loss": 2.6775, + "step": 6688 + }, + { + "epoch": 0.6055037566760206, + "grad_norm": 0.8302776217460632, + "learning_rate": 5.0302603897244474e-05, + "loss": 2.7217, + "step": 6689 + }, + { + "epoch": 0.605594278989771, + "grad_norm": 0.7810454368591309, + "learning_rate": 5.027506937397652e-05, + "loss": 2.6596, + "step": 6690 + }, + { + "epoch": 0.6056848013035213, + "grad_norm": 0.8588391542434692, + "learning_rate": 5.0247539858007644e-05, + "loss": 2.6877, + "step": 6691 + }, + { + "epoch": 0.6057753236172717, + "grad_norm": 0.9216744899749756, + "learning_rate": 5.022001535211022e-05, + "loss": 2.7234, + "step": 6692 + }, + { + "epoch": 0.605865845931022, + "grad_norm": 0.8668200373649597, + "learning_rate": 5.019249585905585e-05, + "loss": 2.6811, + "step": 6693 + }, + { + "epoch": 0.6059563682447723, + "grad_norm": 0.7832801938056946, + "learning_rate": 5.0164981381615786e-05, + "loss": 2.654, + "step": 6694 + }, + { + "epoch": 0.6060468905585227, + "grad_norm": 0.8727904558181763, + "learning_rate": 5.0137471922560734e-05, + "loss": 2.6633, + "step": 6695 + }, + { + "epoch": 0.606137412872273, + "grad_norm": 0.8319174647331238, + "learning_rate": 5.010996748466088e-05, + "loss": 2.733, + "step": 6696 + }, + { + "epoch": 0.6062279351860234, + "grad_norm": 0.8686563372612, + "learning_rate": 5.008246807068595e-05, + "loss": 2.7043, + "step": 6697 + }, + { + "epoch": 0.6063184574997736, + "grad_norm": 0.8815221786499023, + "learning_rate": 5.005497368340516e-05, + "loss": 2.7486, + "step": 6698 + }, + { + "epoch": 0.606408979813524, + "grad_norm": 0.8359525799751282, + "learning_rate": 5.002748432558707e-05, + "loss": 2.6408, + "step": 6699 + }, + { + "epoch": 0.6064995021272743, + "grad_norm": 0.8514461517333984, + "learning_rate": 5.000000000000002e-05, + "loss": 2.6483, + "step": 6700 + }, + { + "epoch": 0.6065900244410247, + "grad_norm": 0.890376091003418, + "learning_rate": 4.9972520709411565e-05, + "loss": 2.6174, + "step": 6701 + }, + { + "epoch": 0.606680546754775, + "grad_norm": 0.8972718715667725, + "learning_rate": 4.9945046456588905e-05, + "loss": 2.7029, + "step": 6702 + }, + { + "epoch": 0.6067710690685254, + "grad_norm": 0.8866444230079651, + "learning_rate": 4.9917577244298696e-05, + "loss": 2.718, + "step": 6703 + }, + { + "epoch": 0.6068615913822757, + "grad_norm": 0.8488792777061462, + "learning_rate": 4.9890113075307085e-05, + "loss": 2.7348, + "step": 6704 + }, + { + "epoch": 0.6069521136960261, + "grad_norm": 0.8102442622184753, + "learning_rate": 4.986265395237972e-05, + "loss": 2.6668, + "step": 6705 + }, + { + "epoch": 0.6070426360097764, + "grad_norm": 0.8155598640441895, + "learning_rate": 4.9835199878281765e-05, + "loss": 2.6679, + "step": 6706 + }, + { + "epoch": 0.6071331583235268, + "grad_norm": 0.8324066400527954, + "learning_rate": 4.980775085577775e-05, + "loss": 2.7047, + "step": 6707 + }, + { + "epoch": 0.607223680637277, + "grad_norm": 0.8296021223068237, + "learning_rate": 4.978030688763191e-05, + "loss": 2.6714, + "step": 6708 + }, + { + "epoch": 0.6073142029510274, + "grad_norm": 0.8644400238990784, + "learning_rate": 4.975286797660775e-05, + "loss": 2.7341, + "step": 6709 + }, + { + "epoch": 0.6074047252647777, + "grad_norm": 0.8654029369354248, + "learning_rate": 4.972543412546842e-05, + "loss": 2.6972, + "step": 6710 + }, + { + "epoch": 0.6074952475785281, + "grad_norm": 0.848343551158905, + "learning_rate": 4.969800533697649e-05, + "loss": 2.6639, + "step": 6711 + }, + { + "epoch": 0.6075857698922784, + "grad_norm": 0.8281153440475464, + "learning_rate": 4.9670581613894094e-05, + "loss": 2.6206, + "step": 6712 + }, + { + "epoch": 0.6076762922060288, + "grad_norm": 0.8595371246337891, + "learning_rate": 4.964316295898268e-05, + "loss": 2.6815, + "step": 6713 + }, + { + "epoch": 0.6077668145197791, + "grad_norm": 0.8331053256988525, + "learning_rate": 4.961574937500344e-05, + "loss": 2.6783, + "step": 6714 + }, + { + "epoch": 0.6078573368335295, + "grad_norm": 0.8491067290306091, + "learning_rate": 4.958834086471683e-05, + "loss": 2.7056, + "step": 6715 + }, + { + "epoch": 0.6079478591472798, + "grad_norm": 0.8933209776878357, + "learning_rate": 4.956093743088291e-05, + "loss": 2.7263, + "step": 6716 + }, + { + "epoch": 0.6080383814610302, + "grad_norm": 0.883208155632019, + "learning_rate": 4.953353907626122e-05, + "loss": 2.6177, + "step": 6717 + }, + { + "epoch": 0.6081289037747805, + "grad_norm": 0.8465838432312012, + "learning_rate": 4.950614580361076e-05, + "loss": 2.6835, + "step": 6718 + }, + { + "epoch": 0.6082194260885309, + "grad_norm": 0.8147262334823608, + "learning_rate": 4.947875761569003e-05, + "loss": 2.6652, + "step": 6719 + }, + { + "epoch": 0.6083099484022811, + "grad_norm": 0.9500305652618408, + "learning_rate": 4.945137451525707e-05, + "loss": 2.6889, + "step": 6720 + }, + { + "epoch": 0.6084004707160315, + "grad_norm": 0.9218015074729919, + "learning_rate": 4.942399650506923e-05, + "loss": 2.6836, + "step": 6721 + }, + { + "epoch": 0.6084909930297818, + "grad_norm": 0.7956727743148804, + "learning_rate": 4.939662358788364e-05, + "loss": 2.6653, + "step": 6722 + }, + { + "epoch": 0.6085815153435322, + "grad_norm": 0.9122270345687866, + "learning_rate": 4.936925576645663e-05, + "loss": 2.7115, + "step": 6723 + }, + { + "epoch": 0.6086720376572825, + "grad_norm": 0.8649951219558716, + "learning_rate": 4.9341893043544185e-05, + "loss": 2.7205, + "step": 6724 + }, + { + "epoch": 0.6087625599710329, + "grad_norm": 0.9105194211006165, + "learning_rate": 4.9314535421901716e-05, + "loss": 2.7186, + "step": 6725 + }, + { + "epoch": 0.6088530822847832, + "grad_norm": 0.8381279706954956, + "learning_rate": 4.9287182904284143e-05, + "loss": 2.7746, + "step": 6726 + }, + { + "epoch": 0.6089436045985336, + "grad_norm": 0.8129174113273621, + "learning_rate": 4.925983549344587e-05, + "loss": 2.6485, + "step": 6727 + }, + { + "epoch": 0.6090341269122839, + "grad_norm": 0.8500168323516846, + "learning_rate": 4.9232493192140806e-05, + "loss": 2.6848, + "step": 6728 + }, + { + "epoch": 0.6091246492260343, + "grad_norm": 0.9311719536781311, + "learning_rate": 4.920515600312222e-05, + "loss": 2.7047, + "step": 6729 + }, + { + "epoch": 0.6092151715397846, + "grad_norm": 0.7978945970535278, + "learning_rate": 4.9177823929143106e-05, + "loss": 2.6726, + "step": 6730 + }, + { + "epoch": 0.609305693853535, + "grad_norm": 0.8584246635437012, + "learning_rate": 4.9150496972955696e-05, + "loss": 2.6859, + "step": 6731 + }, + { + "epoch": 0.6093962161672852, + "grad_norm": 0.8876283764839172, + "learning_rate": 4.912317513731185e-05, + "loss": 2.6616, + "step": 6732 + }, + { + "epoch": 0.6094867384810356, + "grad_norm": 0.8028571605682373, + "learning_rate": 4.909585842496287e-05, + "loss": 2.7145, + "step": 6733 + }, + { + "epoch": 0.6095772607947859, + "grad_norm": 0.87955641746521, + "learning_rate": 4.906854683865956e-05, + "loss": 2.7036, + "step": 6734 + }, + { + "epoch": 0.6096677831085362, + "grad_norm": 0.8874965906143188, + "learning_rate": 4.9041240381152185e-05, + "loss": 2.725, + "step": 6735 + }, + { + "epoch": 0.6097583054222866, + "grad_norm": 0.9333867430686951, + "learning_rate": 4.901393905519055e-05, + "loss": 2.7052, + "step": 6736 + }, + { + "epoch": 0.6098488277360369, + "grad_norm": 0.8766831755638123, + "learning_rate": 4.898664286352378e-05, + "loss": 2.7184, + "step": 6737 + }, + { + "epoch": 0.6099393500497873, + "grad_norm": 0.8419327139854431, + "learning_rate": 4.8959351808900754e-05, + "loss": 2.7346, + "step": 6738 + }, + { + "epoch": 0.6100298723635376, + "grad_norm": 0.9690167307853699, + "learning_rate": 4.893206589406957e-05, + "loss": 2.8024, + "step": 6739 + }, + { + "epoch": 0.610120394677288, + "grad_norm": 0.8833349347114563, + "learning_rate": 4.890478512177795e-05, + "loss": 2.7829, + "step": 6740 + }, + { + "epoch": 0.6102109169910382, + "grad_norm": 0.8960972428321838, + "learning_rate": 4.8877509494773065e-05, + "loss": 2.7061, + "step": 6741 + }, + { + "epoch": 0.6103014393047886, + "grad_norm": 0.9377439022064209, + "learning_rate": 4.8850239015801625e-05, + "loss": 2.6516, + "step": 6742 + }, + { + "epoch": 0.6103919616185389, + "grad_norm": 0.9206840395927429, + "learning_rate": 4.882297368760963e-05, + "loss": 2.6613, + "step": 6743 + }, + { + "epoch": 0.6104824839322893, + "grad_norm": 0.9181665778160095, + "learning_rate": 4.8795713512942865e-05, + "loss": 2.6805, + "step": 6744 + }, + { + "epoch": 0.6105730062460396, + "grad_norm": 0.9120600819587708, + "learning_rate": 4.876845849454631e-05, + "loss": 2.6996, + "step": 6745 + }, + { + "epoch": 0.61066352855979, + "grad_norm": 0.8618726134300232, + "learning_rate": 4.8741208635164585e-05, + "loss": 2.7314, + "step": 6746 + }, + { + "epoch": 0.6107540508735403, + "grad_norm": 0.8103480935096741, + "learning_rate": 4.871396393754174e-05, + "loss": 2.6864, + "step": 6747 + }, + { + "epoch": 0.6108445731872907, + "grad_norm": 0.8761424422264099, + "learning_rate": 4.868672440442134e-05, + "loss": 2.6805, + "step": 6748 + }, + { + "epoch": 0.610935095501041, + "grad_norm": 0.8975646495819092, + "learning_rate": 4.865949003854637e-05, + "loss": 2.7368, + "step": 6749 + }, + { + "epoch": 0.6110256178147914, + "grad_norm": 0.9692808985710144, + "learning_rate": 4.8632260842659393e-05, + "loss": 2.7351, + "step": 6750 + }, + { + "epoch": 0.6111161401285417, + "grad_norm": 0.8636605739593506, + "learning_rate": 4.860503681950227e-05, + "loss": 2.653, + "step": 6751 + }, + { + "epoch": 0.611206662442292, + "grad_norm": 0.8828364014625549, + "learning_rate": 4.857781797181661e-05, + "loss": 2.7143, + "step": 6752 + }, + { + "epoch": 0.6112971847560423, + "grad_norm": 0.9609152674674988, + "learning_rate": 4.8550604302343227e-05, + "loss": 2.6827, + "step": 6753 + }, + { + "epoch": 0.6113877070697927, + "grad_norm": 1.0201287269592285, + "learning_rate": 4.852339581382258e-05, + "loss": 2.7189, + "step": 6754 + }, + { + "epoch": 0.611478229383543, + "grad_norm": 0.9233447313308716, + "learning_rate": 4.8496192508994576e-05, + "loss": 2.7395, + "step": 6755 + }, + { + "epoch": 0.6115687516972934, + "grad_norm": 0.8440586924552917, + "learning_rate": 4.8468994390598574e-05, + "loss": 2.6691, + "step": 6756 + }, + { + "epoch": 0.6116592740110437, + "grad_norm": 0.9626230001449585, + "learning_rate": 4.844180146137347e-05, + "loss": 2.6837, + "step": 6757 + }, + { + "epoch": 0.6117497963247941, + "grad_norm": 0.9260591268539429, + "learning_rate": 4.841461372405751e-05, + "loss": 2.7157, + "step": 6758 + }, + { + "epoch": 0.6118403186385444, + "grad_norm": 0.8617759943008423, + "learning_rate": 4.838743118138853e-05, + "loss": 2.6938, + "step": 6759 + }, + { + "epoch": 0.6119308409522948, + "grad_norm": 0.8547298908233643, + "learning_rate": 4.836025383610382e-05, + "loss": 2.705, + "step": 6760 + }, + { + "epoch": 0.6120213632660451, + "grad_norm": 0.8187577724456787, + "learning_rate": 4.833308169094014e-05, + "loss": 2.7203, + "step": 6761 + }, + { + "epoch": 0.6121118855797955, + "grad_norm": 0.8286332488059998, + "learning_rate": 4.830591474863372e-05, + "loss": 2.6601, + "step": 6762 + }, + { + "epoch": 0.6122024078935457, + "grad_norm": 0.8664364218711853, + "learning_rate": 4.8278753011920295e-05, + "loss": 2.7084, + "step": 6763 + }, + { + "epoch": 0.6122929302072961, + "grad_norm": 0.8383306860923767, + "learning_rate": 4.8251596483534965e-05, + "loss": 2.6873, + "step": 6764 + }, + { + "epoch": 0.6123834525210464, + "grad_norm": 0.8713318109512329, + "learning_rate": 4.822444516621252e-05, + "loss": 2.6643, + "step": 6765 + }, + { + "epoch": 0.6124739748347968, + "grad_norm": 0.8678194284439087, + "learning_rate": 4.8197299062686995e-05, + "loss": 2.6886, + "step": 6766 + }, + { + "epoch": 0.6125644971485471, + "grad_norm": 0.7932306528091431, + "learning_rate": 4.8170158175692035e-05, + "loss": 2.6389, + "step": 6767 + }, + { + "epoch": 0.6126550194622975, + "grad_norm": 0.8153578042984009, + "learning_rate": 4.8143022507960735e-05, + "loss": 2.6928, + "step": 6768 + }, + { + "epoch": 0.6127455417760478, + "grad_norm": 0.8641914129257202, + "learning_rate": 4.811589206222565e-05, + "loss": 2.715, + "step": 6769 + }, + { + "epoch": 0.6128360640897982, + "grad_norm": 0.8290460109710693, + "learning_rate": 4.808876684121881e-05, + "loss": 2.7175, + "step": 6770 + }, + { + "epoch": 0.6129265864035485, + "grad_norm": 0.8863869309425354, + "learning_rate": 4.806164684767177e-05, + "loss": 2.6914, + "step": 6771 + }, + { + "epoch": 0.6130171087172989, + "grad_norm": 0.8532030582427979, + "learning_rate": 4.80345320843154e-05, + "loss": 2.7317, + "step": 6772 + }, + { + "epoch": 0.6131076310310491, + "grad_norm": 0.8034926652908325, + "learning_rate": 4.800742255388029e-05, + "loss": 2.7586, + "step": 6773 + }, + { + "epoch": 0.6131981533447995, + "grad_norm": 0.8824958801269531, + "learning_rate": 4.798031825909627e-05, + "loss": 2.7163, + "step": 6774 + }, + { + "epoch": 0.6132886756585498, + "grad_norm": 0.8216482400894165, + "learning_rate": 4.795321920269279e-05, + "loss": 2.7146, + "step": 6775 + }, + { + "epoch": 0.6133791979723002, + "grad_norm": 0.8607343435287476, + "learning_rate": 4.79261253873987e-05, + "loss": 2.6636, + "step": 6776 + }, + { + "epoch": 0.6134697202860505, + "grad_norm": 0.9348889589309692, + "learning_rate": 4.78990368159424e-05, + "loss": 2.7435, + "step": 6777 + }, + { + "epoch": 0.6135602425998008, + "grad_norm": 0.8563113212585449, + "learning_rate": 4.787195349105159e-05, + "loss": 2.6861, + "step": 6778 + }, + { + "epoch": 0.6136507649135512, + "grad_norm": 0.8089884519577026, + "learning_rate": 4.7844875415453714e-05, + "loss": 2.7004, + "step": 6779 + }, + { + "epoch": 0.6137412872273015, + "grad_norm": 0.8360724449157715, + "learning_rate": 4.7817802591875426e-05, + "loss": 2.6961, + "step": 6780 + }, + { + "epoch": 0.6138318095410519, + "grad_norm": 0.7992052435874939, + "learning_rate": 4.779073502304299e-05, + "loss": 2.6625, + "step": 6781 + }, + { + "epoch": 0.6139223318548022, + "grad_norm": 0.7994864583015442, + "learning_rate": 4.776367271168209e-05, + "loss": 2.6471, + "step": 6782 + }, + { + "epoch": 0.6140128541685526, + "grad_norm": 0.8618261814117432, + "learning_rate": 4.773661566051793e-05, + "loss": 2.707, + "step": 6783 + }, + { + "epoch": 0.6141033764823028, + "grad_norm": 0.8256996273994446, + "learning_rate": 4.770956387227515e-05, + "loss": 2.6674, + "step": 6784 + }, + { + "epoch": 0.6141938987960532, + "grad_norm": 0.7925830483436584, + "learning_rate": 4.768251734967789e-05, + "loss": 2.7334, + "step": 6785 + }, + { + "epoch": 0.6142844211098035, + "grad_norm": 0.9798775315284729, + "learning_rate": 4.765547609544963e-05, + "loss": 2.6846, + "step": 6786 + }, + { + "epoch": 0.6143749434235539, + "grad_norm": 0.8514744639396667, + "learning_rate": 4.7628440112313556e-05, + "loss": 2.6665, + "step": 6787 + }, + { + "epoch": 0.6144654657373042, + "grad_norm": 0.9284042716026306, + "learning_rate": 4.7601409402992106e-05, + "loss": 2.6741, + "step": 6788 + }, + { + "epoch": 0.6145559880510546, + "grad_norm": 0.8871949315071106, + "learning_rate": 4.757438397020729e-05, + "loss": 2.7161, + "step": 6789 + }, + { + "epoch": 0.6146465103648049, + "grad_norm": 0.8882738351821899, + "learning_rate": 4.754736381668057e-05, + "loss": 2.7017, + "step": 6790 + }, + { + "epoch": 0.6147370326785553, + "grad_norm": 0.849459707736969, + "learning_rate": 4.7520348945132886e-05, + "loss": 2.7023, + "step": 6791 + }, + { + "epoch": 0.6148275549923056, + "grad_norm": 0.9086245894432068, + "learning_rate": 4.7493339358284625e-05, + "loss": 2.7293, + "step": 6792 + }, + { + "epoch": 0.614918077306056, + "grad_norm": 0.8342201709747314, + "learning_rate": 4.746633505885569e-05, + "loss": 2.6367, + "step": 6793 + }, + { + "epoch": 0.6150085996198063, + "grad_norm": 0.8293510675430298, + "learning_rate": 4.7439336049565295e-05, + "loss": 2.6908, + "step": 6794 + }, + { + "epoch": 0.6150991219335566, + "grad_norm": 0.831994354724884, + "learning_rate": 4.741234233313241e-05, + "loss": 2.6787, + "step": 6795 + }, + { + "epoch": 0.6151896442473069, + "grad_norm": 0.9759984016418457, + "learning_rate": 4.7385353912275165e-05, + "loss": 2.7383, + "step": 6796 + }, + { + "epoch": 0.6152801665610573, + "grad_norm": 0.92172771692276, + "learning_rate": 4.735837078971135e-05, + "loss": 2.702, + "step": 6797 + }, + { + "epoch": 0.6153706888748076, + "grad_norm": 0.892575204372406, + "learning_rate": 4.733139296815815e-05, + "loss": 2.6274, + "step": 6798 + }, + { + "epoch": 0.615461211188558, + "grad_norm": 0.8499800562858582, + "learning_rate": 4.7304420450332244e-05, + "loss": 2.6803, + "step": 6799 + }, + { + "epoch": 0.6155517335023083, + "grad_norm": 0.8400810360908508, + "learning_rate": 4.727745323894976e-05, + "loss": 2.6503, + "step": 6800 + }, + { + "epoch": 0.6156422558160587, + "grad_norm": 0.8385573625564575, + "learning_rate": 4.725049133672631e-05, + "loss": 2.718, + "step": 6801 + }, + { + "epoch": 0.615732778129809, + "grad_norm": 0.9191269278526306, + "learning_rate": 4.7223534746376884e-05, + "loss": 2.6797, + "step": 6802 + }, + { + "epoch": 0.6158233004435594, + "grad_norm": 1.0587358474731445, + "learning_rate": 4.719658347061613e-05, + "loss": 2.7238, + "step": 6803 + }, + { + "epoch": 0.6159138227573097, + "grad_norm": 0.9422360062599182, + "learning_rate": 4.716963751215794e-05, + "loss": 2.6894, + "step": 6804 + }, + { + "epoch": 0.61600434507106, + "grad_norm": 0.9692339301109314, + "learning_rate": 4.714269687371581e-05, + "loss": 2.6718, + "step": 6805 + }, + { + "epoch": 0.6160948673848103, + "grad_norm": 0.8721240162849426, + "learning_rate": 4.7115761558002646e-05, + "loss": 2.714, + "step": 6806 + }, + { + "epoch": 0.6161853896985607, + "grad_norm": 0.9523304104804993, + "learning_rate": 4.7088831567730894e-05, + "loss": 2.6986, + "step": 6807 + }, + { + "epoch": 0.616275912012311, + "grad_norm": 0.8966914415359497, + "learning_rate": 4.706190690561228e-05, + "loss": 2.671, + "step": 6808 + }, + { + "epoch": 0.6163664343260614, + "grad_norm": 0.8884119391441345, + "learning_rate": 4.7034987574358266e-05, + "loss": 2.6862, + "step": 6809 + }, + { + "epoch": 0.6164569566398117, + "grad_norm": 0.9186428785324097, + "learning_rate": 4.700807357667952e-05, + "loss": 2.6717, + "step": 6810 + }, + { + "epoch": 0.6165474789535621, + "grad_norm": 0.8419829607009888, + "learning_rate": 4.698116491528631e-05, + "loss": 2.6886, + "step": 6811 + }, + { + "epoch": 0.6166380012673124, + "grad_norm": 0.9079512357711792, + "learning_rate": 4.695426159288835e-05, + "loss": 2.6787, + "step": 6812 + }, + { + "epoch": 0.6167285235810628, + "grad_norm": 0.8453792333602905, + "learning_rate": 4.692736361219478e-05, + "loss": 2.6798, + "step": 6813 + }, + { + "epoch": 0.6168190458948131, + "grad_norm": 0.9105737805366516, + "learning_rate": 4.690047097591427e-05, + "loss": 2.6833, + "step": 6814 + }, + { + "epoch": 0.6169095682085635, + "grad_norm": 0.9034150242805481, + "learning_rate": 4.68735836867549e-05, + "loss": 2.7098, + "step": 6815 + }, + { + "epoch": 0.6170000905223137, + "grad_norm": 0.8991881608963013, + "learning_rate": 4.684670174742412e-05, + "loss": 2.6973, + "step": 6816 + }, + { + "epoch": 0.6170906128360641, + "grad_norm": 0.9135326743125916, + "learning_rate": 4.6819825160629096e-05, + "loss": 2.7298, + "step": 6817 + }, + { + "epoch": 0.6171811351498144, + "grad_norm": 0.8568701148033142, + "learning_rate": 4.679295392907619e-05, + "loss": 2.6618, + "step": 6818 + }, + { + "epoch": 0.6172716574635647, + "grad_norm": 0.8539561629295349, + "learning_rate": 4.676608805547137e-05, + "loss": 2.7173, + "step": 6819 + }, + { + "epoch": 0.6173621797773151, + "grad_norm": 0.8731486797332764, + "learning_rate": 4.673922754252002e-05, + "loss": 2.6987, + "step": 6820 + }, + { + "epoch": 0.6174527020910654, + "grad_norm": 0.8331453800201416, + "learning_rate": 4.6712372392927e-05, + "loss": 2.672, + "step": 6821 + }, + { + "epoch": 0.6175432244048158, + "grad_norm": 0.8664054870605469, + "learning_rate": 4.6685522609396626e-05, + "loss": 2.681, + "step": 6822 + }, + { + "epoch": 0.6176337467185661, + "grad_norm": 0.7990508079528809, + "learning_rate": 4.66586781946327e-05, + "loss": 2.6322, + "step": 6823 + }, + { + "epoch": 0.6177242690323165, + "grad_norm": 0.8614292740821838, + "learning_rate": 4.663183915133834e-05, + "loss": 2.6739, + "step": 6824 + }, + { + "epoch": 0.6178147913460668, + "grad_norm": 0.8222824931144714, + "learning_rate": 4.6605005482216405e-05, + "loss": 2.7179, + "step": 6825 + }, + { + "epoch": 0.6179053136598172, + "grad_norm": 0.8004710078239441, + "learning_rate": 4.657817718996891e-05, + "loss": 2.665, + "step": 6826 + }, + { + "epoch": 0.6179958359735674, + "grad_norm": 0.9650024771690369, + "learning_rate": 4.655135427729752e-05, + "loss": 2.6428, + "step": 6827 + }, + { + "epoch": 0.6180863582873178, + "grad_norm": 0.8087639212608337, + "learning_rate": 4.652453674690328e-05, + "loss": 2.6029, + "step": 6828 + }, + { + "epoch": 0.6181768806010681, + "grad_norm": 0.8862298130989075, + "learning_rate": 4.6497724601486735e-05, + "loss": 2.6962, + "step": 6829 + }, + { + "epoch": 0.6182674029148185, + "grad_norm": 0.8443248271942139, + "learning_rate": 4.647091784374785e-05, + "loss": 2.6529, + "step": 6830 + }, + { + "epoch": 0.6183579252285688, + "grad_norm": 0.7951116561889648, + "learning_rate": 4.6444116476386124e-05, + "loss": 2.655, + "step": 6831 + }, + { + "epoch": 0.6184484475423192, + "grad_norm": 0.9196255207061768, + "learning_rate": 4.6417320502100316e-05, + "loss": 2.6802, + "step": 6832 + }, + { + "epoch": 0.6185389698560695, + "grad_norm": 0.9127574563026428, + "learning_rate": 4.6390529923588946e-05, + "loss": 2.6822, + "step": 6833 + }, + { + "epoch": 0.6186294921698199, + "grad_norm": 0.8869885802268982, + "learning_rate": 4.636374474354971e-05, + "loss": 2.73, + "step": 6834 + }, + { + "epoch": 0.6187200144835702, + "grad_norm": 0.8392091989517212, + "learning_rate": 4.633696496467991e-05, + "loss": 2.6734, + "step": 6835 + }, + { + "epoch": 0.6188105367973206, + "grad_norm": 0.8784205317497253, + "learning_rate": 4.631019058967627e-05, + "loss": 2.6801, + "step": 6836 + }, + { + "epoch": 0.6189010591110709, + "grad_norm": 0.9287988543510437, + "learning_rate": 4.628342162123501e-05, + "loss": 2.6924, + "step": 6837 + }, + { + "epoch": 0.6189915814248212, + "grad_norm": 0.8669235110282898, + "learning_rate": 4.625665806205164e-05, + "loss": 2.7482, + "step": 6838 + }, + { + "epoch": 0.6190821037385715, + "grad_norm": 0.89461350440979, + "learning_rate": 4.6229899914821414e-05, + "loss": 2.7027, + "step": 6839 + }, + { + "epoch": 0.6191726260523219, + "grad_norm": 0.9218267798423767, + "learning_rate": 4.620314718223876e-05, + "loss": 2.6773, + "step": 6840 + }, + { + "epoch": 0.6192631483660722, + "grad_norm": 0.8262689113616943, + "learning_rate": 4.617639986699771e-05, + "loss": 2.7483, + "step": 6841 + }, + { + "epoch": 0.6193536706798226, + "grad_norm": 0.8751800060272217, + "learning_rate": 4.614965797179172e-05, + "loss": 2.6881, + "step": 6842 + }, + { + "epoch": 0.6194441929935729, + "grad_norm": 0.9504587650299072, + "learning_rate": 4.612292149931369e-05, + "loss": 2.6459, + "step": 6843 + }, + { + "epoch": 0.6195347153073233, + "grad_norm": 0.8248963952064514, + "learning_rate": 4.609619045225604e-05, + "loss": 2.6531, + "step": 6844 + }, + { + "epoch": 0.6196252376210736, + "grad_norm": 0.851895272731781, + "learning_rate": 4.60694648333105e-05, + "loss": 2.6741, + "step": 6845 + }, + { + "epoch": 0.619715759934824, + "grad_norm": 0.8876405358314514, + "learning_rate": 4.6042744645168366e-05, + "loss": 2.7042, + "step": 6846 + }, + { + "epoch": 0.6198062822485743, + "grad_norm": 0.8736615180969238, + "learning_rate": 4.601602989052037e-05, + "loss": 2.7016, + "step": 6847 + }, + { + "epoch": 0.6198968045623247, + "grad_norm": 0.8399230241775513, + "learning_rate": 4.5989320572056696e-05, + "loss": 2.6293, + "step": 6848 + }, + { + "epoch": 0.6199873268760749, + "grad_norm": 0.9112479090690613, + "learning_rate": 4.5962616692466954e-05, + "loss": 2.6717, + "step": 6849 + }, + { + "epoch": 0.6200778491898253, + "grad_norm": 0.8878874778747559, + "learning_rate": 4.593591825444028e-05, + "loss": 2.6787, + "step": 6850 + }, + { + "epoch": 0.6201683715035756, + "grad_norm": 0.9500158429145813, + "learning_rate": 4.590922526066508e-05, + "loss": 2.7017, + "step": 6851 + }, + { + "epoch": 0.620258893817326, + "grad_norm": 1.0439828634262085, + "learning_rate": 4.588253771382949e-05, + "loss": 2.6546, + "step": 6852 + }, + { + "epoch": 0.6203494161310763, + "grad_norm": 0.9312981367111206, + "learning_rate": 4.585585561662082e-05, + "loss": 2.6252, + "step": 6853 + }, + { + "epoch": 0.6204399384448267, + "grad_norm": 0.8428036570549011, + "learning_rate": 4.582917897172603e-05, + "loss": 2.6421, + "step": 6854 + }, + { + "epoch": 0.620530460758577, + "grad_norm": 0.9673554301261902, + "learning_rate": 4.580250778183143e-05, + "loss": 2.6584, + "step": 6855 + }, + { + "epoch": 0.6206209830723274, + "grad_norm": 0.9068723320960999, + "learning_rate": 4.5775842049622806e-05, + "loss": 2.7353, + "step": 6856 + }, + { + "epoch": 0.6207115053860777, + "grad_norm": 0.8398150205612183, + "learning_rate": 4.574918177778541e-05, + "loss": 2.7008, + "step": 6857 + }, + { + "epoch": 0.6208020276998281, + "grad_norm": 0.8431996703147888, + "learning_rate": 4.5722526969003956e-05, + "loss": 2.6987, + "step": 6858 + }, + { + "epoch": 0.6208925500135783, + "grad_norm": 0.9009360074996948, + "learning_rate": 4.569587762596248e-05, + "loss": 2.6668, + "step": 6859 + }, + { + "epoch": 0.6209830723273286, + "grad_norm": 0.857540488243103, + "learning_rate": 4.566923375134472e-05, + "loss": 2.6374, + "step": 6860 + }, + { + "epoch": 0.621073594641079, + "grad_norm": 0.83136385679245, + "learning_rate": 4.5642595347833594e-05, + "loss": 2.652, + "step": 6861 + }, + { + "epoch": 0.6211641169548293, + "grad_norm": 0.8541494011878967, + "learning_rate": 4.5615962418111625e-05, + "loss": 2.6612, + "step": 6862 + }, + { + "epoch": 0.6212546392685797, + "grad_norm": 0.8624999523162842, + "learning_rate": 4.558933496486075e-05, + "loss": 2.6716, + "step": 6863 + }, + { + "epoch": 0.62134516158233, + "grad_norm": 0.9279386401176453, + "learning_rate": 4.5562712990762366e-05, + "loss": 2.7097, + "step": 6864 + }, + { + "epoch": 0.6214356838960804, + "grad_norm": 0.8684672713279724, + "learning_rate": 4.5536096498497295e-05, + "loss": 2.6706, + "step": 6865 + }, + { + "epoch": 0.6215262062098307, + "grad_norm": 0.8466289639472961, + "learning_rate": 4.550948549074584e-05, + "loss": 2.6747, + "step": 6866 + }, + { + "epoch": 0.6216167285235811, + "grad_norm": 0.7903226613998413, + "learning_rate": 4.548287997018764e-05, + "loss": 2.7077, + "step": 6867 + }, + { + "epoch": 0.6217072508373314, + "grad_norm": 0.86106938123703, + "learning_rate": 4.545627993950201e-05, + "loss": 2.6402, + "step": 6868 + }, + { + "epoch": 0.6217977731510818, + "grad_norm": 0.9263157248497009, + "learning_rate": 4.542968540136746e-05, + "loss": 2.7607, + "step": 6869 + }, + { + "epoch": 0.621888295464832, + "grad_norm": 0.9493169784545898, + "learning_rate": 4.5403096358462095e-05, + "loss": 2.7068, + "step": 6870 + }, + { + "epoch": 0.6219788177785824, + "grad_norm": 0.8483140468597412, + "learning_rate": 4.5376512813463434e-05, + "loss": 2.7293, + "step": 6871 + }, + { + "epoch": 0.6220693400923327, + "grad_norm": 0.8495675325393677, + "learning_rate": 4.534993476904848e-05, + "loss": 2.6592, + "step": 6872 + }, + { + "epoch": 0.6221598624060831, + "grad_norm": 0.8462209105491638, + "learning_rate": 4.532336222789352e-05, + "loss": 2.7176, + "step": 6873 + }, + { + "epoch": 0.6222503847198334, + "grad_norm": 0.9456688165664673, + "learning_rate": 4.529679519267456e-05, + "loss": 2.6491, + "step": 6874 + }, + { + "epoch": 0.6223409070335838, + "grad_norm": 0.843177080154419, + "learning_rate": 4.527023366606679e-05, + "loss": 2.6578, + "step": 6875 + }, + { + "epoch": 0.6224314293473341, + "grad_norm": 0.9310163855552673, + "learning_rate": 4.524367765074499e-05, + "loss": 2.7034, + "step": 6876 + }, + { + "epoch": 0.6225219516610845, + "grad_norm": 0.8866987228393555, + "learning_rate": 4.521712714938335e-05, + "loss": 2.6532, + "step": 6877 + }, + { + "epoch": 0.6226124739748348, + "grad_norm": 0.9527557492256165, + "learning_rate": 4.5190582164655525e-05, + "loss": 2.6735, + "step": 6878 + }, + { + "epoch": 0.6227029962885852, + "grad_norm": 0.9127796292304993, + "learning_rate": 4.5164042699234566e-05, + "loss": 2.6247, + "step": 6879 + }, + { + "epoch": 0.6227935186023354, + "grad_norm": 0.8668599724769592, + "learning_rate": 4.513750875579303e-05, + "loss": 2.6347, + "step": 6880 + }, + { + "epoch": 0.6228840409160858, + "grad_norm": 0.7990663647651672, + "learning_rate": 4.5110980337002806e-05, + "loss": 2.687, + "step": 6881 + }, + { + "epoch": 0.6229745632298361, + "grad_norm": 0.8905903697013855, + "learning_rate": 4.5084457445535435e-05, + "loss": 2.6849, + "step": 6882 + }, + { + "epoch": 0.6230650855435865, + "grad_norm": 0.9093679189682007, + "learning_rate": 4.505794008406165e-05, + "loss": 2.6596, + "step": 6883 + }, + { + "epoch": 0.6231556078573368, + "grad_norm": 0.8659344911575317, + "learning_rate": 4.5031428255251793e-05, + "loss": 2.7382, + "step": 6884 + }, + { + "epoch": 0.6232461301710872, + "grad_norm": 0.7988776564598083, + "learning_rate": 4.500492196177561e-05, + "loss": 2.6526, + "step": 6885 + }, + { + "epoch": 0.6233366524848375, + "grad_norm": 0.8617873787879944, + "learning_rate": 4.497842120630229e-05, + "loss": 2.7033, + "step": 6886 + }, + { + "epoch": 0.6234271747985879, + "grad_norm": 0.8665861487388611, + "learning_rate": 4.495192599150044e-05, + "loss": 2.6881, + "step": 6887 + }, + { + "epoch": 0.6235176971123382, + "grad_norm": 0.8218960762023926, + "learning_rate": 4.492543632003817e-05, + "loss": 2.6161, + "step": 6888 + }, + { + "epoch": 0.6236082194260886, + "grad_norm": 0.8584741950035095, + "learning_rate": 4.489895219458289e-05, + "loss": 2.7239, + "step": 6889 + }, + { + "epoch": 0.6236987417398389, + "grad_norm": 0.8423812389373779, + "learning_rate": 4.487247361780169e-05, + "loss": 2.679, + "step": 6890 + }, + { + "epoch": 0.6237892640535893, + "grad_norm": 0.8747096061706543, + "learning_rate": 4.484600059236085e-05, + "loss": 2.6855, + "step": 6891 + }, + { + "epoch": 0.6238797863673395, + "grad_norm": 0.8690003156661987, + "learning_rate": 4.4819533120926236e-05, + "loss": 2.6958, + "step": 6892 + }, + { + "epoch": 0.6239703086810899, + "grad_norm": 0.802766740322113, + "learning_rate": 4.479307120616313e-05, + "loss": 2.6509, + "step": 6893 + }, + { + "epoch": 0.6240608309948402, + "grad_norm": 0.8117000460624695, + "learning_rate": 4.476661485073624e-05, + "loss": 2.7068, + "step": 6894 + }, + { + "epoch": 0.6241513533085906, + "grad_norm": 0.8532359600067139, + "learning_rate": 4.474016405730973e-05, + "loss": 2.68, + "step": 6895 + }, + { + "epoch": 0.6242418756223409, + "grad_norm": 0.8388345241546631, + "learning_rate": 4.471371882854723e-05, + "loss": 2.7115, + "step": 6896 + }, + { + "epoch": 0.6243323979360913, + "grad_norm": 0.9178318381309509, + "learning_rate": 4.468727916711164e-05, + "loss": 2.6154, + "step": 6897 + }, + { + "epoch": 0.6244229202498416, + "grad_norm": 0.8816856145858765, + "learning_rate": 4.46608450756656e-05, + "loss": 2.7028, + "step": 6898 + }, + { + "epoch": 0.624513442563592, + "grad_norm": 0.8467722535133362, + "learning_rate": 4.463441655687092e-05, + "loss": 2.6352, + "step": 6899 + }, + { + "epoch": 0.6246039648773423, + "grad_norm": 0.8618403077125549, + "learning_rate": 4.4607993613388976e-05, + "loss": 2.695, + "step": 6900 + }, + { + "epoch": 0.6246944871910926, + "grad_norm": 0.9370056986808777, + "learning_rate": 4.458157624788056e-05, + "loss": 2.6996, + "step": 6901 + }, + { + "epoch": 0.624785009504843, + "grad_norm": 0.9463796019554138, + "learning_rate": 4.455516446300593e-05, + "loss": 2.6751, + "step": 6902 + }, + { + "epoch": 0.6248755318185932, + "grad_norm": 0.9075630307197571, + "learning_rate": 4.4528758261424655e-05, + "loss": 2.6395, + "step": 6903 + }, + { + "epoch": 0.6249660541323436, + "grad_norm": 0.9862688183784485, + "learning_rate": 4.4502357645795976e-05, + "loss": 2.6984, + "step": 6904 + }, + { + "epoch": 0.6250565764460939, + "grad_norm": 0.8373948931694031, + "learning_rate": 4.447596261877832e-05, + "loss": 2.6353, + "step": 6905 + }, + { + "epoch": 0.6251470987598443, + "grad_norm": 0.86262047290802, + "learning_rate": 4.444957318302973e-05, + "loss": 2.6511, + "step": 6906 + }, + { + "epoch": 0.6252376210735946, + "grad_norm": 0.8128232359886169, + "learning_rate": 4.442318934120758e-05, + "loss": 2.6396, + "step": 6907 + }, + { + "epoch": 0.625328143387345, + "grad_norm": 0.9029171466827393, + "learning_rate": 4.439681109596875e-05, + "loss": 2.6639, + "step": 6908 + }, + { + "epoch": 0.6254186657010953, + "grad_norm": 0.877526581287384, + "learning_rate": 4.437043844996952e-05, + "loss": 2.6135, + "step": 6909 + }, + { + "epoch": 0.6255091880148457, + "grad_norm": 0.8649809956550598, + "learning_rate": 4.434407140586565e-05, + "loss": 2.7198, + "step": 6910 + }, + { + "epoch": 0.625599710328596, + "grad_norm": 0.8621827363967896, + "learning_rate": 4.431770996631219e-05, + "loss": 2.6379, + "step": 6911 + }, + { + "epoch": 0.6256902326423464, + "grad_norm": 0.8444892764091492, + "learning_rate": 4.429135413396389e-05, + "loss": 2.71, + "step": 6912 + }, + { + "epoch": 0.6257807549560966, + "grad_norm": 0.8484147191047668, + "learning_rate": 4.426500391147468e-05, + "loss": 2.7082, + "step": 6913 + }, + { + "epoch": 0.625871277269847, + "grad_norm": 0.8146012425422668, + "learning_rate": 4.4238659301498045e-05, + "loss": 2.7241, + "step": 6914 + }, + { + "epoch": 0.6259617995835973, + "grad_norm": 0.866885244846344, + "learning_rate": 4.4212320306686884e-05, + "loss": 2.6802, + "step": 6915 + }, + { + "epoch": 0.6260523218973477, + "grad_norm": 0.8107602000236511, + "learning_rate": 4.4185986929693546e-05, + "loss": 2.7391, + "step": 6916 + }, + { + "epoch": 0.626142844211098, + "grad_norm": 0.9229167103767395, + "learning_rate": 4.41596591731698e-05, + "loss": 2.6994, + "step": 6917 + }, + { + "epoch": 0.6262333665248484, + "grad_norm": 0.9321455359458923, + "learning_rate": 4.4133337039766877e-05, + "loss": 2.7259, + "step": 6918 + }, + { + "epoch": 0.6263238888385987, + "grad_norm": 0.9236816167831421, + "learning_rate": 4.4107020532135314e-05, + "loss": 2.6672, + "step": 6919 + }, + { + "epoch": 0.6264144111523491, + "grad_norm": 0.831727147102356, + "learning_rate": 4.4080709652925336e-05, + "loss": 2.6458, + "step": 6920 + }, + { + "epoch": 0.6265049334660994, + "grad_norm": 0.8836252093315125, + "learning_rate": 4.405440440478632e-05, + "loss": 2.6761, + "step": 6921 + }, + { + "epoch": 0.6265954557798498, + "grad_norm": 0.8219419121742249, + "learning_rate": 4.402810479036725e-05, + "loss": 2.6527, + "step": 6922 + }, + { + "epoch": 0.6266859780936, + "grad_norm": 0.8386253118515015, + "learning_rate": 4.40018108123165e-05, + "loss": 2.7052, + "step": 6923 + }, + { + "epoch": 0.6267765004073504, + "grad_norm": 0.8826115727424622, + "learning_rate": 4.3975522473281875e-05, + "loss": 2.6718, + "step": 6924 + }, + { + "epoch": 0.6268670227211007, + "grad_norm": 0.8514438271522522, + "learning_rate": 4.394923977591059e-05, + "loss": 2.6277, + "step": 6925 + }, + { + "epoch": 0.6269575450348511, + "grad_norm": 0.8475186824798584, + "learning_rate": 4.392296272284937e-05, + "loss": 2.692, + "step": 6926 + }, + { + "epoch": 0.6270480673486014, + "grad_norm": 0.891557514667511, + "learning_rate": 4.389669131674419e-05, + "loss": 2.7062, + "step": 6927 + }, + { + "epoch": 0.6271385896623518, + "grad_norm": 0.832285463809967, + "learning_rate": 4.387042556024074e-05, + "loss": 2.6905, + "step": 6928 + }, + { + "epoch": 0.6272291119761021, + "grad_norm": 0.9494576454162598, + "learning_rate": 4.38441654559839e-05, + "loss": 2.6781, + "step": 6929 + }, + { + "epoch": 0.6273196342898525, + "grad_norm": 0.8865106701850891, + "learning_rate": 4.3817911006617986e-05, + "loss": 2.7055, + "step": 6930 + }, + { + "epoch": 0.6274101566036028, + "grad_norm": 0.8264738321304321, + "learning_rate": 4.379166221478697e-05, + "loss": 2.6233, + "step": 6931 + }, + { + "epoch": 0.6275006789173532, + "grad_norm": 0.8542665243148804, + "learning_rate": 4.3765419083134e-05, + "loss": 2.6594, + "step": 6932 + }, + { + "epoch": 0.6275912012311035, + "grad_norm": 0.8898385763168335, + "learning_rate": 4.3739181614301786e-05, + "loss": 2.6302, + "step": 6933 + }, + { + "epoch": 0.6276817235448539, + "grad_norm": 0.7890973687171936, + "learning_rate": 4.371294981093244e-05, + "loss": 2.6175, + "step": 6934 + }, + { + "epoch": 0.6277722458586041, + "grad_norm": 0.8115043640136719, + "learning_rate": 4.368672367566751e-05, + "loss": 2.6913, + "step": 6935 + }, + { + "epoch": 0.6278627681723545, + "grad_norm": 0.8468533158302307, + "learning_rate": 4.366050321114796e-05, + "loss": 2.6607, + "step": 6936 + }, + { + "epoch": 0.6279532904861048, + "grad_norm": 0.8599989414215088, + "learning_rate": 4.363428842001425e-05, + "loss": 2.6724, + "step": 6937 + }, + { + "epoch": 0.6280438127998552, + "grad_norm": 0.8746944069862366, + "learning_rate": 4.360807930490606e-05, + "loss": 2.7038, + "step": 6938 + }, + { + "epoch": 0.6281343351136055, + "grad_norm": 0.8745467066764832, + "learning_rate": 4.3581875868462815e-05, + "loss": 2.7207, + "step": 6939 + }, + { + "epoch": 0.6282248574273559, + "grad_norm": 0.8731328845024109, + "learning_rate": 4.355567811332311e-05, + "loss": 2.6887, + "step": 6940 + }, + { + "epoch": 0.6283153797411062, + "grad_norm": 0.8534673452377319, + "learning_rate": 4.3529486042125055e-05, + "loss": 2.6319, + "step": 6941 + }, + { + "epoch": 0.6284059020548565, + "grad_norm": 0.8535872101783752, + "learning_rate": 4.350329965750621e-05, + "loss": 2.6605, + "step": 6942 + }, + { + "epoch": 0.6284964243686069, + "grad_norm": 0.8605890870094299, + "learning_rate": 4.347711896210356e-05, + "loss": 2.7461, + "step": 6943 + }, + { + "epoch": 0.6285869466823572, + "grad_norm": 0.7937997579574585, + "learning_rate": 4.3450943958553466e-05, + "loss": 2.6408, + "step": 6944 + }, + { + "epoch": 0.6286774689961075, + "grad_norm": 0.8514191508293152, + "learning_rate": 4.342477464949182e-05, + "loss": 2.7152, + "step": 6945 + }, + { + "epoch": 0.6287679913098578, + "grad_norm": 0.8694384694099426, + "learning_rate": 4.339861103755374e-05, + "loss": 2.6926, + "step": 6946 + }, + { + "epoch": 0.6288585136236082, + "grad_norm": 0.9281476140022278, + "learning_rate": 4.3372453125374046e-05, + "loss": 2.6812, + "step": 6947 + }, + { + "epoch": 0.6289490359373585, + "grad_norm": 0.8273919224739075, + "learning_rate": 4.334630091558675e-05, + "loss": 2.6989, + "step": 6948 + }, + { + "epoch": 0.6290395582511089, + "grad_norm": 0.9093640446662903, + "learning_rate": 4.3320154410825386e-05, + "loss": 2.707, + "step": 6949 + }, + { + "epoch": 0.6291300805648592, + "grad_norm": 0.8211041688919067, + "learning_rate": 4.329401361372294e-05, + "loss": 2.661, + "step": 6950 + }, + { + "epoch": 0.6292206028786096, + "grad_norm": 0.860684871673584, + "learning_rate": 4.326787852691175e-05, + "loss": 2.7215, + "step": 6951 + }, + { + "epoch": 0.6293111251923599, + "grad_norm": 0.9397785663604736, + "learning_rate": 4.324174915302366e-05, + "loss": 2.7168, + "step": 6952 + }, + { + "epoch": 0.6294016475061103, + "grad_norm": 0.8350105285644531, + "learning_rate": 4.32156254946899e-05, + "loss": 2.6404, + "step": 6953 + }, + { + "epoch": 0.6294921698198606, + "grad_norm": 0.8832926750183105, + "learning_rate": 4.318950755454103e-05, + "loss": 2.6015, + "step": 6954 + }, + { + "epoch": 0.629582692133611, + "grad_norm": 0.8618656992912292, + "learning_rate": 4.3163395335207265e-05, + "loss": 2.6702, + "step": 6955 + }, + { + "epoch": 0.6296732144473612, + "grad_norm": 0.8379136323928833, + "learning_rate": 4.3137288839318014e-05, + "loss": 2.6714, + "step": 6956 + }, + { + "epoch": 0.6297637367611116, + "grad_norm": 0.8382007479667664, + "learning_rate": 4.3111188069502206e-05, + "loss": 2.6437, + "step": 6957 + }, + { + "epoch": 0.6298542590748619, + "grad_norm": 0.8624709844589233, + "learning_rate": 4.30850930283882e-05, + "loss": 2.7487, + "step": 6958 + }, + { + "epoch": 0.6299447813886123, + "grad_norm": 0.8321948647499084, + "learning_rate": 4.3059003718603776e-05, + "loss": 2.6971, + "step": 6959 + }, + { + "epoch": 0.6300353037023626, + "grad_norm": 0.8173766732215881, + "learning_rate": 4.3032920142776125e-05, + "loss": 2.6913, + "step": 6960 + }, + { + "epoch": 0.630125826016113, + "grad_norm": 0.8754605650901794, + "learning_rate": 4.300684230353188e-05, + "loss": 2.6797, + "step": 6961 + }, + { + "epoch": 0.6302163483298633, + "grad_norm": 0.8446570634841919, + "learning_rate": 4.2980770203497035e-05, + "loss": 2.6248, + "step": 6962 + }, + { + "epoch": 0.6303068706436137, + "grad_norm": 0.8704685568809509, + "learning_rate": 4.295470384529705e-05, + "loss": 2.6948, + "step": 6963 + }, + { + "epoch": 0.630397392957364, + "grad_norm": 0.8113224506378174, + "learning_rate": 4.2928643231556844e-05, + "loss": 2.6748, + "step": 6964 + }, + { + "epoch": 0.6304879152711144, + "grad_norm": 0.8907050490379333, + "learning_rate": 4.2902588364900695e-05, + "loss": 2.6532, + "step": 6965 + }, + { + "epoch": 0.6305784375848646, + "grad_norm": 0.8262181878089905, + "learning_rate": 4.287653924795234e-05, + "loss": 2.658, + "step": 6966 + }, + { + "epoch": 0.630668959898615, + "grad_norm": 0.8925417065620422, + "learning_rate": 4.285049588333495e-05, + "loss": 2.7295, + "step": 6967 + }, + { + "epoch": 0.6307594822123653, + "grad_norm": 0.8073951005935669, + "learning_rate": 4.2824458273671e-05, + "loss": 2.6385, + "step": 6968 + }, + { + "epoch": 0.6308500045261157, + "grad_norm": 0.9392690658569336, + "learning_rate": 4.2798426421582594e-05, + "loss": 2.6744, + "step": 6969 + }, + { + "epoch": 0.630940526839866, + "grad_norm": 0.8945513963699341, + "learning_rate": 4.277240032969105e-05, + "loss": 2.6396, + "step": 6970 + }, + { + "epoch": 0.6310310491536164, + "grad_norm": 0.8523789644241333, + "learning_rate": 4.2746380000617236e-05, + "loss": 2.6597, + "step": 6971 + }, + { + "epoch": 0.6311215714673667, + "grad_norm": 0.8517530560493469, + "learning_rate": 4.272036543698138e-05, + "loss": 2.6104, + "step": 6972 + }, + { + "epoch": 0.6312120937811171, + "grad_norm": 0.8112363815307617, + "learning_rate": 4.2694356641403165e-05, + "loss": 2.6842, + "step": 6973 + }, + { + "epoch": 0.6313026160948674, + "grad_norm": 0.8543645739555359, + "learning_rate": 4.2668353616501666e-05, + "loss": 2.6565, + "step": 6974 + }, + { + "epoch": 0.6313931384086178, + "grad_norm": 0.8598533272743225, + "learning_rate": 4.264235636489542e-05, + "loss": 2.6996, + "step": 6975 + }, + { + "epoch": 0.6314836607223681, + "grad_norm": 0.8872503042221069, + "learning_rate": 4.2616364889202254e-05, + "loss": 2.6053, + "step": 6976 + }, + { + "epoch": 0.6315741830361185, + "grad_norm": 0.8489211797714233, + "learning_rate": 4.259037919203964e-05, + "loss": 2.6973, + "step": 6977 + }, + { + "epoch": 0.6316647053498687, + "grad_norm": 0.8446736335754395, + "learning_rate": 4.256439927602424e-05, + "loss": 2.699, + "step": 6978 + }, + { + "epoch": 0.6317552276636191, + "grad_norm": 0.9012982845306396, + "learning_rate": 4.2538425143772256e-05, + "loss": 2.6917, + "step": 6979 + }, + { + "epoch": 0.6318457499773694, + "grad_norm": 0.8830162882804871, + "learning_rate": 4.251245679789928e-05, + "loss": 2.681, + "step": 6980 + }, + { + "epoch": 0.6319362722911198, + "grad_norm": 0.8840652108192444, + "learning_rate": 4.248649424102035e-05, + "loss": 2.7276, + "step": 6981 + }, + { + "epoch": 0.6320267946048701, + "grad_norm": 0.9230279922485352, + "learning_rate": 4.2460537475749864e-05, + "loss": 2.6295, + "step": 6982 + }, + { + "epoch": 0.6321173169186204, + "grad_norm": 0.8687030673027039, + "learning_rate": 4.243458650470171e-05, + "loss": 2.6694, + "step": 6983 + }, + { + "epoch": 0.6322078392323708, + "grad_norm": 0.8421883583068848, + "learning_rate": 4.2408641330489066e-05, + "loss": 2.6714, + "step": 6984 + }, + { + "epoch": 0.6322983615461211, + "grad_norm": 0.8303890824317932, + "learning_rate": 4.238270195572472e-05, + "loss": 2.6576, + "step": 6985 + }, + { + "epoch": 0.6323888838598715, + "grad_norm": 0.8518853783607483, + "learning_rate": 4.235676838302068e-05, + "loss": 2.666, + "step": 6986 + }, + { + "epoch": 0.6324794061736217, + "grad_norm": 0.9766391515731812, + "learning_rate": 4.2330840614988496e-05, + "loss": 2.7746, + "step": 6987 + }, + { + "epoch": 0.6325699284873721, + "grad_norm": 0.8131774663925171, + "learning_rate": 4.230491865423908e-05, + "loss": 2.7201, + "step": 6988 + }, + { + "epoch": 0.6326604508011224, + "grad_norm": 0.8058497905731201, + "learning_rate": 4.227900250338277e-05, + "loss": 2.6695, + "step": 6989 + }, + { + "epoch": 0.6327509731148728, + "grad_norm": 0.8577495217323303, + "learning_rate": 4.225309216502933e-05, + "loss": 2.6565, + "step": 6990 + }, + { + "epoch": 0.6328414954286231, + "grad_norm": 0.8953158259391785, + "learning_rate": 4.2227187641787966e-05, + "loss": 2.6916, + "step": 6991 + }, + { + "epoch": 0.6329320177423735, + "grad_norm": 0.918773889541626, + "learning_rate": 4.220128893626715e-05, + "loss": 2.7069, + "step": 6992 + }, + { + "epoch": 0.6330225400561238, + "grad_norm": 0.8627886772155762, + "learning_rate": 4.2175396051075035e-05, + "loss": 2.6335, + "step": 6993 + }, + { + "epoch": 0.6331130623698742, + "grad_norm": 0.8618264198303223, + "learning_rate": 4.214950898881892e-05, + "loss": 2.6784, + "step": 6994 + }, + { + "epoch": 0.6332035846836245, + "grad_norm": 0.8480239510536194, + "learning_rate": 4.2123627752105665e-05, + "loss": 2.6699, + "step": 6995 + }, + { + "epoch": 0.6332941069973749, + "grad_norm": 0.9663839340209961, + "learning_rate": 4.209775234354151e-05, + "loss": 2.6057, + "step": 6996 + }, + { + "epoch": 0.6333846293111252, + "grad_norm": 0.8858630657196045, + "learning_rate": 4.207188276573214e-05, + "loss": 2.6634, + "step": 6997 + }, + { + "epoch": 0.6334751516248756, + "grad_norm": 0.9681085348129272, + "learning_rate": 4.204601902128253e-05, + "loss": 2.7401, + "step": 6998 + }, + { + "epoch": 0.6335656739386258, + "grad_norm": 0.853465735912323, + "learning_rate": 4.2020161112797285e-05, + "loss": 2.6457, + "step": 6999 + }, + { + "epoch": 0.6336561962523762, + "grad_norm": 0.9195594191551208, + "learning_rate": 4.19943090428802e-05, + "loss": 2.6566, + "step": 7000 + }, + { + "epoch": 0.6336561962523762, + "eval_loss": 2.6202468872070312, + "eval_runtime": 71.5629, + "eval_samples_per_second": 37.771, + "eval_steps_per_second": 3.158, + "step": 7000 + }, + { + "epoch": 0.6337467185661265, + "grad_norm": 0.8611754775047302, + "learning_rate": 4.19684628141346e-05, + "loss": 2.6688, + "step": 7001 + }, + { + "epoch": 0.6338372408798769, + "grad_norm": 0.8615222573280334, + "learning_rate": 4.19426224291632e-05, + "loss": 2.6729, + "step": 7002 + }, + { + "epoch": 0.6339277631936272, + "grad_norm": 0.8649547696113586, + "learning_rate": 4.1916787890568146e-05, + "loss": 2.6855, + "step": 7003 + }, + { + "epoch": 0.6340182855073776, + "grad_norm": 0.874506413936615, + "learning_rate": 4.189095920095096e-05, + "loss": 2.6496, + "step": 7004 + }, + { + "epoch": 0.6341088078211279, + "grad_norm": 0.811741054058075, + "learning_rate": 4.186513636291263e-05, + "loss": 2.6205, + "step": 7005 + }, + { + "epoch": 0.6341993301348783, + "grad_norm": 0.8108596801757812, + "learning_rate": 4.18393193790534e-05, + "loss": 2.6642, + "step": 7006 + }, + { + "epoch": 0.6342898524486286, + "grad_norm": 0.8476105332374573, + "learning_rate": 4.181350825197319e-05, + "loss": 2.6937, + "step": 7007 + }, + { + "epoch": 0.634380374762379, + "grad_norm": 0.9545331001281738, + "learning_rate": 4.1787702984271074e-05, + "loss": 2.6579, + "step": 7008 + }, + { + "epoch": 0.6344708970761292, + "grad_norm": 0.8565835356712341, + "learning_rate": 4.176190357854567e-05, + "loss": 2.6092, + "step": 7009 + }, + { + "epoch": 0.6345614193898796, + "grad_norm": 0.8987926840782166, + "learning_rate": 4.173611003739498e-05, + "loss": 2.7727, + "step": 7010 + }, + { + "epoch": 0.6346519417036299, + "grad_norm": 1.0171281099319458, + "learning_rate": 4.171032236341642e-05, + "loss": 2.6363, + "step": 7011 + }, + { + "epoch": 0.6347424640173803, + "grad_norm": 0.918834388256073, + "learning_rate": 4.168454055920681e-05, + "loss": 2.6419, + "step": 7012 + }, + { + "epoch": 0.6348329863311306, + "grad_norm": 0.8396235108375549, + "learning_rate": 4.165876462736239e-05, + "loss": 2.6763, + "step": 7013 + }, + { + "epoch": 0.634923508644881, + "grad_norm": 0.8260104656219482, + "learning_rate": 4.1632994570478746e-05, + "loss": 2.6594, + "step": 7014 + }, + { + "epoch": 0.6350140309586313, + "grad_norm": 0.9070295691490173, + "learning_rate": 4.160723039115095e-05, + "loss": 2.6776, + "step": 7015 + }, + { + "epoch": 0.6351045532723817, + "grad_norm": 0.8622607588768005, + "learning_rate": 4.158147209197347e-05, + "loss": 2.6578, + "step": 7016 + }, + { + "epoch": 0.635195075586132, + "grad_norm": 0.8886837959289551, + "learning_rate": 4.155571967554014e-05, + "loss": 2.6415, + "step": 7017 + }, + { + "epoch": 0.6352855978998824, + "grad_norm": 0.9302876591682434, + "learning_rate": 4.152997314444428e-05, + "loss": 2.7068, + "step": 7018 + }, + { + "epoch": 0.6353761202136327, + "grad_norm": 0.8932064175605774, + "learning_rate": 4.150423250127845e-05, + "loss": 2.6586, + "step": 7019 + }, + { + "epoch": 0.635466642527383, + "grad_norm": 0.842445433139801, + "learning_rate": 4.147849774863488e-05, + "loss": 2.6829, + "step": 7020 + }, + { + "epoch": 0.6355571648411333, + "grad_norm": 0.8959222435951233, + "learning_rate": 4.145276888910494e-05, + "loss": 2.6554, + "step": 7021 + }, + { + "epoch": 0.6356476871548837, + "grad_norm": 0.8270236253738403, + "learning_rate": 4.1427045925279586e-05, + "loss": 2.6167, + "step": 7022 + }, + { + "epoch": 0.635738209468634, + "grad_norm": 0.8666673302650452, + "learning_rate": 4.14013288597491e-05, + "loss": 2.7031, + "step": 7023 + }, + { + "epoch": 0.6358287317823843, + "grad_norm": 0.8924165368080139, + "learning_rate": 4.137561769510322e-05, + "loss": 2.651, + "step": 7024 + }, + { + "epoch": 0.6359192540961347, + "grad_norm": 0.8522528409957886, + "learning_rate": 4.134991243393097e-05, + "loss": 2.686, + "step": 7025 + }, + { + "epoch": 0.636009776409885, + "grad_norm": 0.8556380271911621, + "learning_rate": 4.1324213078821005e-05, + "loss": 2.6777, + "step": 7026 + }, + { + "epoch": 0.6361002987236354, + "grad_norm": 0.8803753852844238, + "learning_rate": 4.1298519632361144e-05, + "loss": 2.6631, + "step": 7027 + }, + { + "epoch": 0.6361908210373857, + "grad_norm": 0.8770917654037476, + "learning_rate": 4.1272832097138744e-05, + "loss": 2.6452, + "step": 7028 + }, + { + "epoch": 0.6362813433511361, + "grad_norm": 0.8415400385856628, + "learning_rate": 4.124715047574055e-05, + "loss": 2.6616, + "step": 7029 + }, + { + "epoch": 0.6363718656648863, + "grad_norm": 0.8721527457237244, + "learning_rate": 4.12214747707527e-05, + "loss": 2.6991, + "step": 7030 + }, + { + "epoch": 0.6364623879786367, + "grad_norm": 0.9462018609046936, + "learning_rate": 4.119580498476072e-05, + "loss": 2.6648, + "step": 7031 + }, + { + "epoch": 0.636552910292387, + "grad_norm": 0.8758288621902466, + "learning_rate": 4.117014112034961e-05, + "loss": 2.6952, + "step": 7032 + }, + { + "epoch": 0.6366434326061374, + "grad_norm": 0.8675450086593628, + "learning_rate": 4.114448318010361e-05, + "loss": 2.6529, + "step": 7033 + }, + { + "epoch": 0.6367339549198877, + "grad_norm": 0.8605788946151733, + "learning_rate": 4.111883116660662e-05, + "loss": 2.6896, + "step": 7034 + }, + { + "epoch": 0.6368244772336381, + "grad_norm": 0.8481203317642212, + "learning_rate": 4.1093185082441677e-05, + "loss": 2.6641, + "step": 7035 + }, + { + "epoch": 0.6369149995473884, + "grad_norm": 0.8389714360237122, + "learning_rate": 4.106754493019138e-05, + "loss": 2.7555, + "step": 7036 + }, + { + "epoch": 0.6370055218611388, + "grad_norm": 0.857469916343689, + "learning_rate": 4.10419107124377e-05, + "loss": 2.6379, + "step": 7037 + }, + { + "epoch": 0.6370960441748891, + "grad_norm": 0.8377165794372559, + "learning_rate": 4.101628243176201e-05, + "loss": 2.656, + "step": 7038 + }, + { + "epoch": 0.6371865664886395, + "grad_norm": 0.8693411350250244, + "learning_rate": 4.099066009074505e-05, + "loss": 2.6445, + "step": 7039 + }, + { + "epoch": 0.6372770888023898, + "grad_norm": 0.9868283867835999, + "learning_rate": 4.096504369196704e-05, + "loss": 2.6139, + "step": 7040 + }, + { + "epoch": 0.6373676111161402, + "grad_norm": 0.8638676404953003, + "learning_rate": 4.093943323800745e-05, + "loss": 2.6561, + "step": 7041 + }, + { + "epoch": 0.6374581334298904, + "grad_norm": 0.844467043876648, + "learning_rate": 4.091382873144539e-05, + "loss": 2.6476, + "step": 7042 + }, + { + "epoch": 0.6375486557436408, + "grad_norm": 0.8906952738761902, + "learning_rate": 4.088823017485912e-05, + "loss": 2.6715, + "step": 7043 + }, + { + "epoch": 0.6376391780573911, + "grad_norm": 0.9263741374015808, + "learning_rate": 4.086263757082646e-05, + "loss": 2.6712, + "step": 7044 + }, + { + "epoch": 0.6377297003711415, + "grad_norm": 0.9978516101837158, + "learning_rate": 4.083705092192457e-05, + "loss": 2.6592, + "step": 7045 + }, + { + "epoch": 0.6378202226848918, + "grad_norm": 0.8400484323501587, + "learning_rate": 4.0811470230730045e-05, + "loss": 2.6687, + "step": 7046 + }, + { + "epoch": 0.6379107449986422, + "grad_norm": 0.812984049320221, + "learning_rate": 4.078589549981884e-05, + "loss": 2.65, + "step": 7047 + }, + { + "epoch": 0.6380012673123925, + "grad_norm": 0.8772035241127014, + "learning_rate": 4.0760326731766374e-05, + "loss": 2.6881, + "step": 7048 + }, + { + "epoch": 0.6380917896261429, + "grad_norm": 0.8191560506820679, + "learning_rate": 4.073476392914731e-05, + "loss": 2.6028, + "step": 7049 + }, + { + "epoch": 0.6381823119398932, + "grad_norm": 0.8499199748039246, + "learning_rate": 4.070920709453597e-05, + "loss": 2.6397, + "step": 7050 + }, + { + "epoch": 0.6382728342536436, + "grad_norm": 0.896375834941864, + "learning_rate": 4.068365623050582e-05, + "loss": 2.6906, + "step": 7051 + }, + { + "epoch": 0.6383633565673938, + "grad_norm": 0.8490253686904907, + "learning_rate": 4.065811133962987e-05, + "loss": 2.6626, + "step": 7052 + }, + { + "epoch": 0.6384538788811442, + "grad_norm": 0.8479554057121277, + "learning_rate": 4.0632572424480476e-05, + "loss": 2.6799, + "step": 7053 + }, + { + "epoch": 0.6385444011948945, + "grad_norm": 0.8941323161125183, + "learning_rate": 4.060703948762945e-05, + "loss": 2.7183, + "step": 7054 + }, + { + "epoch": 0.6386349235086449, + "grad_norm": 0.8600932955741882, + "learning_rate": 4.0581512531647857e-05, + "loss": 2.6362, + "step": 7055 + }, + { + "epoch": 0.6387254458223952, + "grad_norm": 0.8809225559234619, + "learning_rate": 4.055599155910639e-05, + "loss": 2.7255, + "step": 7056 + }, + { + "epoch": 0.6388159681361456, + "grad_norm": 0.8225449919700623, + "learning_rate": 4.0530476572574915e-05, + "loss": 2.6802, + "step": 7057 + }, + { + "epoch": 0.6389064904498959, + "grad_norm": 0.8979951739311218, + "learning_rate": 4.0504967574622824e-05, + "loss": 2.7077, + "step": 7058 + }, + { + "epoch": 0.6389970127636463, + "grad_norm": 0.8487021923065186, + "learning_rate": 4.047946456781887e-05, + "loss": 2.6969, + "step": 7059 + }, + { + "epoch": 0.6390875350773966, + "grad_norm": 0.8609391450881958, + "learning_rate": 4.045396755473121e-05, + "loss": 2.6395, + "step": 7060 + }, + { + "epoch": 0.639178057391147, + "grad_norm": 0.8995999693870544, + "learning_rate": 4.042847653792737e-05, + "loss": 2.6665, + "step": 7061 + }, + { + "epoch": 0.6392685797048973, + "grad_norm": 0.94828200340271, + "learning_rate": 4.0402991519974364e-05, + "loss": 2.6555, + "step": 7062 + }, + { + "epoch": 0.6393591020186477, + "grad_norm": 0.8435460329055786, + "learning_rate": 4.037751250343841e-05, + "loss": 2.6601, + "step": 7063 + }, + { + "epoch": 0.6394496243323979, + "grad_norm": 0.8438577055931091, + "learning_rate": 4.0352039490885384e-05, + "loss": 2.6738, + "step": 7064 + }, + { + "epoch": 0.6395401466461482, + "grad_norm": 0.8616018295288086, + "learning_rate": 4.032657248488031e-05, + "loss": 2.6628, + "step": 7065 + }, + { + "epoch": 0.6396306689598986, + "grad_norm": 0.8819381594657898, + "learning_rate": 4.030111148798775e-05, + "loss": 2.704, + "step": 7066 + }, + { + "epoch": 0.6397211912736489, + "grad_norm": 0.8637552261352539, + "learning_rate": 4.027565650277164e-05, + "loss": 2.7014, + "step": 7067 + }, + { + "epoch": 0.6398117135873993, + "grad_norm": 0.8700119853019714, + "learning_rate": 4.025020753179528e-05, + "loss": 2.6311, + "step": 7068 + }, + { + "epoch": 0.6399022359011496, + "grad_norm": 0.8614180684089661, + "learning_rate": 4.0224764577621396e-05, + "loss": 2.6712, + "step": 7069 + }, + { + "epoch": 0.6399927582149, + "grad_norm": 0.8416438698768616, + "learning_rate": 4.019932764281211e-05, + "loss": 2.6397, + "step": 7070 + }, + { + "epoch": 0.6400832805286503, + "grad_norm": 0.849120557308197, + "learning_rate": 4.017389672992883e-05, + "loss": 2.6249, + "step": 7071 + }, + { + "epoch": 0.6401738028424007, + "grad_norm": 0.91354900598526, + "learning_rate": 4.014847184153258e-05, + "loss": 2.6493, + "step": 7072 + }, + { + "epoch": 0.640264325156151, + "grad_norm": 0.8874666690826416, + "learning_rate": 4.012305298018354e-05, + "loss": 2.6636, + "step": 7073 + }, + { + "epoch": 0.6403548474699013, + "grad_norm": 0.9202023148536682, + "learning_rate": 4.009764014844143e-05, + "loss": 2.6334, + "step": 7074 + }, + { + "epoch": 0.6404453697836516, + "grad_norm": 0.8148588538169861, + "learning_rate": 4.007223334886531e-05, + "loss": 2.6099, + "step": 7075 + }, + { + "epoch": 0.640535892097402, + "grad_norm": 0.8374625444412231, + "learning_rate": 4.004683258401366e-05, + "loss": 2.6621, + "step": 7076 + }, + { + "epoch": 0.6406264144111523, + "grad_norm": 0.8390886187553406, + "learning_rate": 4.002143785644432e-05, + "loss": 2.6802, + "step": 7077 + }, + { + "epoch": 0.6407169367249027, + "grad_norm": 0.908154308795929, + "learning_rate": 3.9996049168714586e-05, + "loss": 2.6487, + "step": 7078 + }, + { + "epoch": 0.640807459038653, + "grad_norm": 0.8716943860054016, + "learning_rate": 3.9970666523380994e-05, + "loss": 2.6942, + "step": 7079 + }, + { + "epoch": 0.6408979813524034, + "grad_norm": 0.8854706287384033, + "learning_rate": 3.994528992299971e-05, + "loss": 2.6846, + "step": 7080 + }, + { + "epoch": 0.6409885036661537, + "grad_norm": 1.0154533386230469, + "learning_rate": 3.9919919370126045e-05, + "loss": 2.6979, + "step": 7081 + }, + { + "epoch": 0.6410790259799041, + "grad_norm": 0.9278495907783508, + "learning_rate": 3.9894554867314856e-05, + "loss": 2.7322, + "step": 7082 + }, + { + "epoch": 0.6411695482936544, + "grad_norm": 0.881550133228302, + "learning_rate": 3.9869196417120355e-05, + "loss": 2.6633, + "step": 7083 + }, + { + "epoch": 0.6412600706074048, + "grad_norm": 0.8752927184104919, + "learning_rate": 3.9843844022096135e-05, + "loss": 2.6351, + "step": 7084 + }, + { + "epoch": 0.641350592921155, + "grad_norm": 0.8580919504165649, + "learning_rate": 3.981849768479517e-05, + "loss": 2.6034, + "step": 7085 + }, + { + "epoch": 0.6414411152349054, + "grad_norm": 0.9341000318527222, + "learning_rate": 3.9793157407769866e-05, + "loss": 2.7307, + "step": 7086 + }, + { + "epoch": 0.6415316375486557, + "grad_norm": 0.892649233341217, + "learning_rate": 3.9767823193571896e-05, + "loss": 2.6646, + "step": 7087 + }, + { + "epoch": 0.6416221598624061, + "grad_norm": 0.8354327082633972, + "learning_rate": 3.974249504475256e-05, + "loss": 2.6757, + "step": 7088 + }, + { + "epoch": 0.6417126821761564, + "grad_norm": 0.90390545129776, + "learning_rate": 3.9717172963862294e-05, + "loss": 2.6837, + "step": 7089 + }, + { + "epoch": 0.6418032044899068, + "grad_norm": 0.8547748923301697, + "learning_rate": 3.969185695345105e-05, + "loss": 2.6614, + "step": 7090 + }, + { + "epoch": 0.6418937268036571, + "grad_norm": 0.9019080996513367, + "learning_rate": 3.9666547016068167e-05, + "loss": 2.6596, + "step": 7091 + }, + { + "epoch": 0.6419842491174075, + "grad_norm": 0.9135777950286865, + "learning_rate": 3.964124315426237e-05, + "loss": 2.6858, + "step": 7092 + }, + { + "epoch": 0.6420747714311578, + "grad_norm": 0.8872252106666565, + "learning_rate": 3.961594537058169e-05, + "loss": 2.6434, + "step": 7093 + }, + { + "epoch": 0.6421652937449082, + "grad_norm": 0.948832094669342, + "learning_rate": 3.959065366757371e-05, + "loss": 2.6778, + "step": 7094 + }, + { + "epoch": 0.6422558160586584, + "grad_norm": 0.9001719951629639, + "learning_rate": 3.956536804778522e-05, + "loss": 2.7093, + "step": 7095 + }, + { + "epoch": 0.6423463383724088, + "grad_norm": 0.7865238189697266, + "learning_rate": 3.954008851376252e-05, + "loss": 2.6818, + "step": 7096 + }, + { + "epoch": 0.6424368606861591, + "grad_norm": 0.8909227252006531, + "learning_rate": 3.951481506805125e-05, + "loss": 2.6662, + "step": 7097 + }, + { + "epoch": 0.6425273829999095, + "grad_norm": 0.8204286098480225, + "learning_rate": 3.948954771319645e-05, + "loss": 2.6512, + "step": 7098 + }, + { + "epoch": 0.6426179053136598, + "grad_norm": 0.8769195079803467, + "learning_rate": 3.946428645174257e-05, + "loss": 2.6931, + "step": 7099 + }, + { + "epoch": 0.6427084276274102, + "grad_norm": 0.814003050327301, + "learning_rate": 3.943903128623335e-05, + "loss": 2.6869, + "step": 7100 + }, + { + "epoch": 0.6427989499411605, + "grad_norm": 0.8625674247741699, + "learning_rate": 3.9413782219212035e-05, + "loss": 2.6778, + "step": 7101 + }, + { + "epoch": 0.6428894722549109, + "grad_norm": 0.9159715175628662, + "learning_rate": 3.938853925322118e-05, + "loss": 2.6754, + "step": 7102 + }, + { + "epoch": 0.6429799945686612, + "grad_norm": 0.8626770377159119, + "learning_rate": 3.936330239080277e-05, + "loss": 2.6311, + "step": 7103 + }, + { + "epoch": 0.6430705168824116, + "grad_norm": 0.833588182926178, + "learning_rate": 3.933807163449815e-05, + "loss": 2.6601, + "step": 7104 + }, + { + "epoch": 0.6431610391961619, + "grad_norm": 0.8459279537200928, + "learning_rate": 3.9312846986848095e-05, + "loss": 2.7138, + "step": 7105 + }, + { + "epoch": 0.6432515615099121, + "grad_norm": 0.837768018245697, + "learning_rate": 3.928762845039261e-05, + "loss": 2.6789, + "step": 7106 + }, + { + "epoch": 0.6433420838236625, + "grad_norm": 0.8656929135322571, + "learning_rate": 3.9262416027671356e-05, + "loss": 2.694, + "step": 7107 + }, + { + "epoch": 0.6434326061374128, + "grad_norm": 0.8893898129463196, + "learning_rate": 3.923720972122311e-05, + "loss": 2.6351, + "step": 7108 + }, + { + "epoch": 0.6435231284511632, + "grad_norm": 0.7833218574523926, + "learning_rate": 3.921200953358618e-05, + "loss": 2.6247, + "step": 7109 + }, + { + "epoch": 0.6436136507649135, + "grad_norm": 0.8954211473464966, + "learning_rate": 3.918681546729822e-05, + "loss": 2.6661, + "step": 7110 + }, + { + "epoch": 0.6437041730786639, + "grad_norm": 0.822925865650177, + "learning_rate": 3.91616275248963e-05, + "loss": 2.6223, + "step": 7111 + }, + { + "epoch": 0.6437946953924142, + "grad_norm": 0.9061444401741028, + "learning_rate": 3.91364457089168e-05, + "loss": 2.7069, + "step": 7112 + }, + { + "epoch": 0.6438852177061646, + "grad_norm": 0.8984582424163818, + "learning_rate": 3.9111270021895596e-05, + "loss": 2.6528, + "step": 7113 + }, + { + "epoch": 0.6439757400199149, + "grad_norm": 0.8554080128669739, + "learning_rate": 3.908610046636776e-05, + "loss": 2.6859, + "step": 7114 + }, + { + "epoch": 0.6440662623336653, + "grad_norm": 0.8965380191802979, + "learning_rate": 3.906093704486802e-05, + "loss": 2.7286, + "step": 7115 + }, + { + "epoch": 0.6441567846474155, + "grad_norm": 0.8609764575958252, + "learning_rate": 3.903577975993021e-05, + "loss": 2.6072, + "step": 7116 + }, + { + "epoch": 0.6442473069611659, + "grad_norm": 0.8662620782852173, + "learning_rate": 3.9010628614087705e-05, + "loss": 2.6947, + "step": 7117 + }, + { + "epoch": 0.6443378292749162, + "grad_norm": 0.880328893661499, + "learning_rate": 3.8985483609873244e-05, + "loss": 2.636, + "step": 7118 + }, + { + "epoch": 0.6444283515886666, + "grad_norm": 0.897362470626831, + "learning_rate": 3.896034474981894e-05, + "loss": 2.6112, + "step": 7119 + }, + { + "epoch": 0.6445188739024169, + "grad_norm": 0.9112858176231384, + "learning_rate": 3.893521203645618e-05, + "loss": 2.7018, + "step": 7120 + }, + { + "epoch": 0.6446093962161673, + "grad_norm": 0.8464901447296143, + "learning_rate": 3.891008547231597e-05, + "loss": 2.6105, + "step": 7121 + }, + { + "epoch": 0.6446999185299176, + "grad_norm": 0.888416051864624, + "learning_rate": 3.8884965059928444e-05, + "loss": 2.7111, + "step": 7122 + }, + { + "epoch": 0.644790440843668, + "grad_norm": 0.8662939667701721, + "learning_rate": 3.885985080182327e-05, + "loss": 2.6843, + "step": 7123 + }, + { + "epoch": 0.6448809631574183, + "grad_norm": 0.9020508527755737, + "learning_rate": 3.8834742700529435e-05, + "loss": 2.7249, + "step": 7124 + }, + { + "epoch": 0.6449714854711687, + "grad_norm": 0.8338090777397156, + "learning_rate": 3.880964075857535e-05, + "loss": 2.6608, + "step": 7125 + }, + { + "epoch": 0.645062007784919, + "grad_norm": 0.7852875590324402, + "learning_rate": 3.8784544978488756e-05, + "loss": 2.6479, + "step": 7126 + }, + { + "epoch": 0.6451525300986694, + "grad_norm": 0.7826331853866577, + "learning_rate": 3.8759455362796846e-05, + "loss": 2.6882, + "step": 7127 + }, + { + "epoch": 0.6452430524124196, + "grad_norm": 0.9392991065979004, + "learning_rate": 3.8734371914026044e-05, + "loss": 2.7536, + "step": 7128 + }, + { + "epoch": 0.64533357472617, + "grad_norm": 0.8982208371162415, + "learning_rate": 3.8709294634702376e-05, + "loss": 2.7042, + "step": 7129 + }, + { + "epoch": 0.6454240970399203, + "grad_norm": 0.9267832040786743, + "learning_rate": 3.8684223527351025e-05, + "loss": 2.6595, + "step": 7130 + }, + { + "epoch": 0.6455146193536707, + "grad_norm": 0.8747368454933167, + "learning_rate": 3.865915859449669e-05, + "loss": 2.6925, + "step": 7131 + }, + { + "epoch": 0.645605141667421, + "grad_norm": 0.8629845380783081, + "learning_rate": 3.863409983866341e-05, + "loss": 2.711, + "step": 7132 + }, + { + "epoch": 0.6456956639811714, + "grad_norm": 0.8262534141540527, + "learning_rate": 3.8609047262374586e-05, + "loss": 2.6492, + "step": 7133 + }, + { + "epoch": 0.6457861862949217, + "grad_norm": 0.8800877332687378, + "learning_rate": 3.8584000868153025e-05, + "loss": 2.7027, + "step": 7134 + }, + { + "epoch": 0.6458767086086721, + "grad_norm": 0.9538904428482056, + "learning_rate": 3.855896065852094e-05, + "loss": 2.6911, + "step": 7135 + }, + { + "epoch": 0.6459672309224224, + "grad_norm": 0.8761887550354004, + "learning_rate": 3.853392663599976e-05, + "loss": 2.6475, + "step": 7136 + }, + { + "epoch": 0.6460577532361728, + "grad_norm": 0.9193413257598877, + "learning_rate": 3.850889880311055e-05, + "loss": 2.7227, + "step": 7137 + }, + { + "epoch": 0.646148275549923, + "grad_norm": 0.8643597960472107, + "learning_rate": 3.848387716237353e-05, + "loss": 2.6666, + "step": 7138 + }, + { + "epoch": 0.6462387978636734, + "grad_norm": 0.8625378012657166, + "learning_rate": 3.845886171630838e-05, + "loss": 2.6604, + "step": 7139 + }, + { + "epoch": 0.6463293201774237, + "grad_norm": 0.8720336556434631, + "learning_rate": 3.843385246743417e-05, + "loss": 2.6764, + "step": 7140 + }, + { + "epoch": 0.6464198424911741, + "grad_norm": 0.8405011296272278, + "learning_rate": 3.8408849418269346e-05, + "loss": 2.685, + "step": 7141 + }, + { + "epoch": 0.6465103648049244, + "grad_norm": 0.882945716381073, + "learning_rate": 3.838385257133169e-05, + "loss": 2.7478, + "step": 7142 + }, + { + "epoch": 0.6466008871186748, + "grad_norm": 0.9414501786231995, + "learning_rate": 3.8358861929138436e-05, + "loss": 2.6697, + "step": 7143 + }, + { + "epoch": 0.6466914094324251, + "grad_norm": 0.8411595821380615, + "learning_rate": 3.833387749420603e-05, + "loss": 2.6961, + "step": 7144 + }, + { + "epoch": 0.6467819317461755, + "grad_norm": 0.8170534372329712, + "learning_rate": 3.8308899269050535e-05, + "loss": 2.6046, + "step": 7145 + }, + { + "epoch": 0.6468724540599258, + "grad_norm": 0.8441464900970459, + "learning_rate": 3.828392725618717e-05, + "loss": 2.6861, + "step": 7146 + }, + { + "epoch": 0.6469629763736761, + "grad_norm": 0.8740844130516052, + "learning_rate": 3.8258961458130625e-05, + "loss": 2.6563, + "step": 7147 + }, + { + "epoch": 0.6470534986874265, + "grad_norm": 0.8880472183227539, + "learning_rate": 3.823400187739499e-05, + "loss": 2.6645, + "step": 7148 + }, + { + "epoch": 0.6471440210011767, + "grad_norm": 0.854893684387207, + "learning_rate": 3.8209048516493706e-05, + "loss": 2.7327, + "step": 7149 + }, + { + "epoch": 0.6472345433149271, + "grad_norm": 0.8700079321861267, + "learning_rate": 3.8184101377939476e-05, + "loss": 2.6862, + "step": 7150 + }, + { + "epoch": 0.6473250656286774, + "grad_norm": 0.934057354927063, + "learning_rate": 3.8159160464244606e-05, + "loss": 2.6973, + "step": 7151 + }, + { + "epoch": 0.6474155879424278, + "grad_norm": 0.8303758502006531, + "learning_rate": 3.813422577792056e-05, + "loss": 2.6229, + "step": 7152 + }, + { + "epoch": 0.6475061102561781, + "grad_norm": 0.9065086245536804, + "learning_rate": 3.810929732147829e-05, + "loss": 2.6548, + "step": 7153 + }, + { + "epoch": 0.6475966325699285, + "grad_norm": 0.9324973225593567, + "learning_rate": 3.808437509742807e-05, + "loss": 2.6761, + "step": 7154 + }, + { + "epoch": 0.6476871548836788, + "grad_norm": 0.8872169256210327, + "learning_rate": 3.8059459108279594e-05, + "loss": 2.681, + "step": 7155 + }, + { + "epoch": 0.6477776771974292, + "grad_norm": 0.815177321434021, + "learning_rate": 3.8034549356541894e-05, + "loss": 2.6889, + "step": 7156 + }, + { + "epoch": 0.6478681995111795, + "grad_norm": 0.8200878500938416, + "learning_rate": 3.8009645844723406e-05, + "loss": 2.5912, + "step": 7157 + }, + { + "epoch": 0.6479587218249299, + "grad_norm": 0.8385193943977356, + "learning_rate": 3.798474857533182e-05, + "loss": 2.6814, + "step": 7158 + }, + { + "epoch": 0.6480492441386801, + "grad_norm": 0.8454488515853882, + "learning_rate": 3.795985755087442e-05, + "loss": 2.6542, + "step": 7159 + }, + { + "epoch": 0.6481397664524305, + "grad_norm": 0.8543698787689209, + "learning_rate": 3.7934972773857634e-05, + "loss": 2.6238, + "step": 7160 + }, + { + "epoch": 0.6482302887661808, + "grad_norm": 0.8137564659118652, + "learning_rate": 3.79100942467874e-05, + "loss": 2.6324, + "step": 7161 + }, + { + "epoch": 0.6483208110799312, + "grad_norm": 1.0308117866516113, + "learning_rate": 3.788522197216897e-05, + "loss": 2.646, + "step": 7162 + }, + { + "epoch": 0.6484113333936815, + "grad_norm": 0.9226447939872742, + "learning_rate": 3.7860355952506985e-05, + "loss": 2.6695, + "step": 7163 + }, + { + "epoch": 0.6485018557074319, + "grad_norm": 0.8795346021652222, + "learning_rate": 3.7835496190305463e-05, + "loss": 2.6895, + "step": 7164 + }, + { + "epoch": 0.6485923780211822, + "grad_norm": 0.8603231310844421, + "learning_rate": 3.7810642688067796e-05, + "loss": 2.5942, + "step": 7165 + }, + { + "epoch": 0.6486829003349326, + "grad_norm": 0.8747546672821045, + "learning_rate": 3.7785795448296646e-05, + "loss": 2.6818, + "step": 7166 + }, + { + "epoch": 0.6487734226486829, + "grad_norm": 0.9285270571708679, + "learning_rate": 3.776095447349427e-05, + "loss": 2.7052, + "step": 7167 + }, + { + "epoch": 0.6488639449624333, + "grad_norm": 0.8840459585189819, + "learning_rate": 3.773611976616203e-05, + "loss": 2.7358, + "step": 7168 + }, + { + "epoch": 0.6489544672761836, + "grad_norm": 0.7858835458755493, + "learning_rate": 3.7711291328800824e-05, + "loss": 2.6761, + "step": 7169 + }, + { + "epoch": 0.649044989589934, + "grad_norm": 0.8488116264343262, + "learning_rate": 3.7686469163910885e-05, + "loss": 2.6549, + "step": 7170 + }, + { + "epoch": 0.6491355119036842, + "grad_norm": 0.8749338388442993, + "learning_rate": 3.766165327399179e-05, + "loss": 2.6001, + "step": 7171 + }, + { + "epoch": 0.6492260342174346, + "grad_norm": 0.9855179190635681, + "learning_rate": 3.7636843661542506e-05, + "loss": 2.731, + "step": 7172 + }, + { + "epoch": 0.6493165565311849, + "grad_norm": 0.8725718855857849, + "learning_rate": 3.7612040329061405e-05, + "loss": 2.593, + "step": 7173 + }, + { + "epoch": 0.6494070788449353, + "grad_norm": 0.9173427224159241, + "learning_rate": 3.7587243279046056e-05, + "loss": 2.6827, + "step": 7174 + }, + { + "epoch": 0.6494976011586856, + "grad_norm": 0.9711499810218811, + "learning_rate": 3.7562452513993674e-05, + "loss": 2.6605, + "step": 7175 + }, + { + "epoch": 0.649588123472436, + "grad_norm": 0.8643674254417419, + "learning_rate": 3.7537668036400574e-05, + "loss": 2.6727, + "step": 7176 + }, + { + "epoch": 0.6496786457861863, + "grad_norm": 0.9335436224937439, + "learning_rate": 3.75128898487626e-05, + "loss": 2.6548, + "step": 7177 + }, + { + "epoch": 0.6497691680999367, + "grad_norm": 0.8806255459785461, + "learning_rate": 3.748811795357491e-05, + "loss": 2.5545, + "step": 7178 + }, + { + "epoch": 0.649859690413687, + "grad_norm": 0.8108535408973694, + "learning_rate": 3.746335235333203e-05, + "loss": 2.6547, + "step": 7179 + }, + { + "epoch": 0.6499502127274374, + "grad_norm": 0.9339863657951355, + "learning_rate": 3.7438593050527845e-05, + "loss": 2.6247, + "step": 7180 + }, + { + "epoch": 0.6500407350411876, + "grad_norm": 0.8718104958534241, + "learning_rate": 3.7413840047655666e-05, + "loss": 2.6577, + "step": 7181 + }, + { + "epoch": 0.650131257354938, + "grad_norm": 0.8415114879608154, + "learning_rate": 3.7389093347208026e-05, + "loss": 2.735, + "step": 7182 + }, + { + "epoch": 0.6502217796686883, + "grad_norm": 0.8771581649780273, + "learning_rate": 3.736435295167704e-05, + "loss": 2.7182, + "step": 7183 + }, + { + "epoch": 0.6503123019824387, + "grad_norm": 0.828473687171936, + "learning_rate": 3.733961886355398e-05, + "loss": 2.6797, + "step": 7184 + }, + { + "epoch": 0.650402824296189, + "grad_norm": 0.8189043402671814, + "learning_rate": 3.731489108532954e-05, + "loss": 2.6999, + "step": 7185 + }, + { + "epoch": 0.6504933466099394, + "grad_norm": 0.913808286190033, + "learning_rate": 3.729016961949391e-05, + "loss": 2.6483, + "step": 7186 + }, + { + "epoch": 0.6505838689236897, + "grad_norm": 0.8551123738288879, + "learning_rate": 3.7265454468536456e-05, + "loss": 2.6946, + "step": 7187 + }, + { + "epoch": 0.65067439123744, + "grad_norm": 0.8683720827102661, + "learning_rate": 3.724074563494601e-05, + "loss": 2.7036, + "step": 7188 + }, + { + "epoch": 0.6507649135511904, + "grad_norm": 0.8683645725250244, + "learning_rate": 3.721604312121076e-05, + "loss": 2.6839, + "step": 7189 + }, + { + "epoch": 0.6508554358649407, + "grad_norm": 0.8150107860565186, + "learning_rate": 3.719134692981826e-05, + "loss": 2.6815, + "step": 7190 + }, + { + "epoch": 0.650945958178691, + "grad_norm": 0.8053891062736511, + "learning_rate": 3.71666570632554e-05, + "loss": 2.5942, + "step": 7191 + }, + { + "epoch": 0.6510364804924413, + "grad_norm": 0.8532904982566833, + "learning_rate": 3.714197352400849e-05, + "loss": 2.6201, + "step": 7192 + }, + { + "epoch": 0.6511270028061917, + "grad_norm": 0.8892574310302734, + "learning_rate": 3.711729631456305e-05, + "loss": 2.7435, + "step": 7193 + }, + { + "epoch": 0.651217525119942, + "grad_norm": 0.8301617503166199, + "learning_rate": 3.7092625437404236e-05, + "loss": 2.6718, + "step": 7194 + }, + { + "epoch": 0.6513080474336924, + "grad_norm": 0.801541268825531, + "learning_rate": 3.7067960895016275e-05, + "loss": 2.6546, + "step": 7195 + }, + { + "epoch": 0.6513985697474427, + "grad_norm": 0.8979970216751099, + "learning_rate": 3.704330268988293e-05, + "loss": 2.6777, + "step": 7196 + }, + { + "epoch": 0.6514890920611931, + "grad_norm": 0.8236854672431946, + "learning_rate": 3.7018650824487277e-05, + "loss": 2.6671, + "step": 7197 + }, + { + "epoch": 0.6515796143749434, + "grad_norm": 0.8789283633232117, + "learning_rate": 3.6994005301311777e-05, + "loss": 2.6919, + "step": 7198 + }, + { + "epoch": 0.6516701366886938, + "grad_norm": 0.8241438865661621, + "learning_rate": 3.6969366122838215e-05, + "loss": 2.6186, + "step": 7199 + }, + { + "epoch": 0.6517606590024441, + "grad_norm": 0.8289199471473694, + "learning_rate": 3.694473329154778e-05, + "loss": 2.6487, + "step": 7200 + }, + { + "epoch": 0.6518511813161945, + "grad_norm": 0.8322336077690125, + "learning_rate": 3.692010680992092e-05, + "loss": 2.6941, + "step": 7201 + }, + { + "epoch": 0.6519417036299447, + "grad_norm": 0.8611689805984497, + "learning_rate": 3.689548668043764e-05, + "loss": 2.6761, + "step": 7202 + }, + { + "epoch": 0.6520322259436951, + "grad_norm": 0.8806263208389282, + "learning_rate": 3.68708729055771e-05, + "loss": 2.6751, + "step": 7203 + }, + { + "epoch": 0.6521227482574454, + "grad_norm": 0.836117148399353, + "learning_rate": 3.684626548781792e-05, + "loss": 2.6244, + "step": 7204 + }, + { + "epoch": 0.6522132705711958, + "grad_norm": 0.9649540185928345, + "learning_rate": 3.682166442963809e-05, + "loss": 2.7419, + "step": 7205 + }, + { + "epoch": 0.6523037928849461, + "grad_norm": 0.8877153396606445, + "learning_rate": 3.679706973351491e-05, + "loss": 2.6511, + "step": 7206 + }, + { + "epoch": 0.6523943151986965, + "grad_norm": 0.9087035655975342, + "learning_rate": 3.67724814019251e-05, + "loss": 2.6311, + "step": 7207 + }, + { + "epoch": 0.6524848375124468, + "grad_norm": 0.9013026356697083, + "learning_rate": 3.674789943734469e-05, + "loss": 2.6748, + "step": 7208 + }, + { + "epoch": 0.6525753598261972, + "grad_norm": 0.8409872651100159, + "learning_rate": 3.6723323842249025e-05, + "loss": 2.7069, + "step": 7209 + }, + { + "epoch": 0.6526658821399475, + "grad_norm": 0.8533176183700562, + "learning_rate": 3.669875461911297e-05, + "loss": 2.6412, + "step": 7210 + }, + { + "epoch": 0.6527564044536979, + "grad_norm": 0.8591048717498779, + "learning_rate": 3.667419177041058e-05, + "loss": 2.7495, + "step": 7211 + }, + { + "epoch": 0.6528469267674482, + "grad_norm": 0.8098570108413696, + "learning_rate": 3.664963529861534e-05, + "loss": 2.6755, + "step": 7212 + }, + { + "epoch": 0.6529374490811986, + "grad_norm": 0.871677815914154, + "learning_rate": 3.662508520620008e-05, + "loss": 2.6289, + "step": 7213 + }, + { + "epoch": 0.6530279713949488, + "grad_norm": 0.895744800567627, + "learning_rate": 3.6600541495637055e-05, + "loss": 2.6593, + "step": 7214 + }, + { + "epoch": 0.6531184937086992, + "grad_norm": 0.836062490940094, + "learning_rate": 3.657600416939768e-05, + "loss": 2.6574, + "step": 7215 + }, + { + "epoch": 0.6532090160224495, + "grad_norm": 0.8524973392486572, + "learning_rate": 3.6551473229953037e-05, + "loss": 2.6401, + "step": 7216 + }, + { + "epoch": 0.6532995383361999, + "grad_norm": 0.8783817291259766, + "learning_rate": 3.6526948679773257e-05, + "loss": 2.6905, + "step": 7217 + }, + { + "epoch": 0.6533900606499502, + "grad_norm": 1.0054057836532593, + "learning_rate": 3.650243052132801e-05, + "loss": 2.6963, + "step": 7218 + }, + { + "epoch": 0.6534805829637006, + "grad_norm": 0.8324257135391235, + "learning_rate": 3.647791875708628e-05, + "loss": 2.6178, + "step": 7219 + }, + { + "epoch": 0.6535711052774509, + "grad_norm": 0.8131170272827148, + "learning_rate": 3.645341338951639e-05, + "loss": 2.6597, + "step": 7220 + }, + { + "epoch": 0.6536616275912013, + "grad_norm": 0.879077136516571, + "learning_rate": 3.642891442108602e-05, + "loss": 2.6612, + "step": 7221 + }, + { + "epoch": 0.6537521499049516, + "grad_norm": 0.9463702440261841, + "learning_rate": 3.640442185426228e-05, + "loss": 2.7035, + "step": 7222 + }, + { + "epoch": 0.653842672218702, + "grad_norm": 0.8919106721878052, + "learning_rate": 3.6379935691511447e-05, + "loss": 2.6789, + "step": 7223 + }, + { + "epoch": 0.6539331945324522, + "grad_norm": 0.8371802568435669, + "learning_rate": 3.635545593529941e-05, + "loss": 2.6433, + "step": 7224 + }, + { + "epoch": 0.6540237168462026, + "grad_norm": 0.8284990191459656, + "learning_rate": 3.6330982588091186e-05, + "loss": 2.6303, + "step": 7225 + }, + { + "epoch": 0.6541142391599529, + "grad_norm": 0.9169434905052185, + "learning_rate": 3.630651565235128e-05, + "loss": 2.7317, + "step": 7226 + }, + { + "epoch": 0.6542047614737033, + "grad_norm": 0.8090481758117676, + "learning_rate": 3.6282055130543505e-05, + "loss": 2.6255, + "step": 7227 + }, + { + "epoch": 0.6542952837874536, + "grad_norm": 0.9667547941207886, + "learning_rate": 3.6257601025131026e-05, + "loss": 2.6606, + "step": 7228 + }, + { + "epoch": 0.6543858061012039, + "grad_norm": 0.868475079536438, + "learning_rate": 3.6233153338576384e-05, + "loss": 2.6585, + "step": 7229 + }, + { + "epoch": 0.6544763284149543, + "grad_norm": 0.8772515058517456, + "learning_rate": 3.62087120733415e-05, + "loss": 2.6808, + "step": 7230 + }, + { + "epoch": 0.6545668507287046, + "grad_norm": 0.8411134481430054, + "learning_rate": 3.618427723188749e-05, + "loss": 2.6591, + "step": 7231 + }, + { + "epoch": 0.654657373042455, + "grad_norm": 0.8320329189300537, + "learning_rate": 3.615984881667508e-05, + "loss": 2.6187, + "step": 7232 + }, + { + "epoch": 0.6547478953562053, + "grad_norm": 0.8759403824806213, + "learning_rate": 3.613542683016411e-05, + "loss": 2.7017, + "step": 7233 + }, + { + "epoch": 0.6548384176699557, + "grad_norm": 0.9727622270584106, + "learning_rate": 3.611101127481392e-05, + "loss": 2.6601, + "step": 7234 + }, + { + "epoch": 0.6549289399837059, + "grad_norm": 0.8831741213798523, + "learning_rate": 3.6086602153083146e-05, + "loss": 2.7137, + "step": 7235 + }, + { + "epoch": 0.6550194622974563, + "grad_norm": 0.8495936393737793, + "learning_rate": 3.606219946742978e-05, + "loss": 2.6793, + "step": 7236 + }, + { + "epoch": 0.6551099846112066, + "grad_norm": 0.8561792373657227, + "learning_rate": 3.6037803220311187e-05, + "loss": 2.6837, + "step": 7237 + }, + { + "epoch": 0.655200506924957, + "grad_norm": 0.8705282211303711, + "learning_rate": 3.601341341418408e-05, + "loss": 2.6161, + "step": 7238 + }, + { + "epoch": 0.6552910292387073, + "grad_norm": 0.8599856495857239, + "learning_rate": 3.5989030051504434e-05, + "loss": 2.7134, + "step": 7239 + }, + { + "epoch": 0.6553815515524577, + "grad_norm": 0.8800930380821228, + "learning_rate": 3.5964653134727776e-05, + "loss": 2.6341, + "step": 7240 + }, + { + "epoch": 0.655472073866208, + "grad_norm": 0.8483147621154785, + "learning_rate": 3.594028266630877e-05, + "loss": 2.6875, + "step": 7241 + }, + { + "epoch": 0.6555625961799584, + "grad_norm": 0.952504575252533, + "learning_rate": 3.5915918648701526e-05, + "loss": 2.6613, + "step": 7242 + }, + { + "epoch": 0.6556531184937087, + "grad_norm": 0.8933284878730774, + "learning_rate": 3.5891561084359535e-05, + "loss": 2.6707, + "step": 7243 + }, + { + "epoch": 0.6557436408074591, + "grad_norm": 0.9218231439590454, + "learning_rate": 3.5867209975735625e-05, + "loss": 2.6409, + "step": 7244 + }, + { + "epoch": 0.6558341631212093, + "grad_norm": 0.9200209379196167, + "learning_rate": 3.584286532528184e-05, + "loss": 2.6412, + "step": 7245 + }, + { + "epoch": 0.6559246854349597, + "grad_norm": 0.8185088634490967, + "learning_rate": 3.581852713544983e-05, + "loss": 2.6865, + "step": 7246 + }, + { + "epoch": 0.65601520774871, + "grad_norm": 0.8688246011734009, + "learning_rate": 3.579419540869036e-05, + "loss": 2.7028, + "step": 7247 + }, + { + "epoch": 0.6561057300624604, + "grad_norm": 0.9082852602005005, + "learning_rate": 3.5769870147453646e-05, + "loss": 2.6682, + "step": 7248 + }, + { + "epoch": 0.6561962523762107, + "grad_norm": 0.9646705985069275, + "learning_rate": 3.5745551354189244e-05, + "loss": 2.7612, + "step": 7249 + }, + { + "epoch": 0.6562867746899611, + "grad_norm": 0.9219226241111755, + "learning_rate": 3.5721239031346066e-05, + "loss": 2.573, + "step": 7250 + }, + { + "epoch": 0.6563772970037114, + "grad_norm": 0.8789383769035339, + "learning_rate": 3.5696933181372364e-05, + "loss": 2.7032, + "step": 7251 + }, + { + "epoch": 0.6564678193174618, + "grad_norm": 0.9892781376838684, + "learning_rate": 3.567263380671576e-05, + "loss": 2.6416, + "step": 7252 + }, + { + "epoch": 0.6565583416312121, + "grad_norm": 0.8261721730232239, + "learning_rate": 3.5648340909823105e-05, + "loss": 2.6554, + "step": 7253 + }, + { + "epoch": 0.6566488639449625, + "grad_norm": 0.8541987538337708, + "learning_rate": 3.562405449314084e-05, + "loss": 2.7041, + "step": 7254 + }, + { + "epoch": 0.6567393862587128, + "grad_norm": 0.886324942111969, + "learning_rate": 3.5599774559114475e-05, + "loss": 2.6688, + "step": 7255 + }, + { + "epoch": 0.6568299085724632, + "grad_norm": 0.891233503818512, + "learning_rate": 3.557550111018906e-05, + "loss": 2.703, + "step": 7256 + }, + { + "epoch": 0.6569204308862134, + "grad_norm": 0.8668513894081116, + "learning_rate": 3.555123414880891e-05, + "loss": 2.6485, + "step": 7257 + }, + { + "epoch": 0.6570109531999638, + "grad_norm": 0.8383379578590393, + "learning_rate": 3.552697367741772e-05, + "loss": 2.6134, + "step": 7258 + }, + { + "epoch": 0.6571014755137141, + "grad_norm": 0.8313611149787903, + "learning_rate": 3.5502719698458517e-05, + "loss": 2.6524, + "step": 7259 + }, + { + "epoch": 0.6571919978274645, + "grad_norm": 0.9203676581382751, + "learning_rate": 3.547847221437372e-05, + "loss": 2.7042, + "step": 7260 + }, + { + "epoch": 0.6572825201412148, + "grad_norm": 0.8398598432540894, + "learning_rate": 3.545423122760493e-05, + "loss": 2.6851, + "step": 7261 + }, + { + "epoch": 0.6573730424549652, + "grad_norm": 1.0203299522399902, + "learning_rate": 3.542999674059335e-05, + "loss": 2.6645, + "step": 7262 + }, + { + "epoch": 0.6574635647687155, + "grad_norm": 0.8423504829406738, + "learning_rate": 3.5405768755779315e-05, + "loss": 2.6734, + "step": 7263 + }, + { + "epoch": 0.6575540870824659, + "grad_norm": 0.8030686378479004, + "learning_rate": 3.538154727560259e-05, + "loss": 2.6363, + "step": 7264 + }, + { + "epoch": 0.6576446093962162, + "grad_norm": 0.8241353034973145, + "learning_rate": 3.5357332302502274e-05, + "loss": 2.6287, + "step": 7265 + }, + { + "epoch": 0.6577351317099666, + "grad_norm": 0.8681092262268066, + "learning_rate": 3.5333123838916845e-05, + "loss": 2.6093, + "step": 7266 + }, + { + "epoch": 0.6578256540237168, + "grad_norm": 0.8543620705604553, + "learning_rate": 3.530892188728406e-05, + "loss": 2.6196, + "step": 7267 + }, + { + "epoch": 0.6579161763374672, + "grad_norm": 0.8001599311828613, + "learning_rate": 3.5284726450041116e-05, + "loss": 2.6523, + "step": 7268 + }, + { + "epoch": 0.6580066986512175, + "grad_norm": 0.8121172785758972, + "learning_rate": 3.526053752962441e-05, + "loss": 2.6775, + "step": 7269 + }, + { + "epoch": 0.6580972209649678, + "grad_norm": 0.8860118389129639, + "learning_rate": 3.523635512846981e-05, + "loss": 2.6323, + "step": 7270 + }, + { + "epoch": 0.6581877432787182, + "grad_norm": 0.937731921672821, + "learning_rate": 3.521217924901248e-05, + "loss": 2.675, + "step": 7271 + }, + { + "epoch": 0.6582782655924685, + "grad_norm": 0.868818461894989, + "learning_rate": 3.518800989368691e-05, + "loss": 2.6533, + "step": 7272 + }, + { + "epoch": 0.6583687879062189, + "grad_norm": 0.8203585147857666, + "learning_rate": 3.516384706492701e-05, + "loss": 2.6489, + "step": 7273 + }, + { + "epoch": 0.6584593102199692, + "grad_norm": 0.9456077814102173, + "learning_rate": 3.5139690765165876e-05, + "loss": 2.7232, + "step": 7274 + }, + { + "epoch": 0.6585498325337196, + "grad_norm": 0.8109578490257263, + "learning_rate": 3.5115540996836174e-05, + "loss": 2.6335, + "step": 7275 + }, + { + "epoch": 0.6586403548474699, + "grad_norm": 0.8843202590942383, + "learning_rate": 3.509139776236967e-05, + "loss": 2.6194, + "step": 7276 + }, + { + "epoch": 0.6587308771612203, + "grad_norm": 0.9179278612136841, + "learning_rate": 3.506726106419765e-05, + "loss": 2.6024, + "step": 7277 + }, + { + "epoch": 0.6588213994749705, + "grad_norm": 0.9402636289596558, + "learning_rate": 3.5043130904750665e-05, + "loss": 2.6987, + "step": 7278 + }, + { + "epoch": 0.6589119217887209, + "grad_norm": 0.8278437852859497, + "learning_rate": 3.501900728645865e-05, + "loss": 2.624, + "step": 7279 + }, + { + "epoch": 0.6590024441024712, + "grad_norm": 0.8150365352630615, + "learning_rate": 3.4994890211750754e-05, + "loss": 2.6596, + "step": 7280 + }, + { + "epoch": 0.6590929664162216, + "grad_norm": 0.9417485594749451, + "learning_rate": 3.49707796830557e-05, + "loss": 2.6667, + "step": 7281 + }, + { + "epoch": 0.6591834887299719, + "grad_norm": 0.8826394081115723, + "learning_rate": 3.494667570280132e-05, + "loss": 2.6328, + "step": 7282 + }, + { + "epoch": 0.6592740110437223, + "grad_norm": 0.8603037595748901, + "learning_rate": 3.492257827341492e-05, + "loss": 2.727, + "step": 7283 + }, + { + "epoch": 0.6593645333574726, + "grad_norm": 0.8361364603042603, + "learning_rate": 3.489848739732311e-05, + "loss": 2.6093, + "step": 7284 + }, + { + "epoch": 0.659455055671223, + "grad_norm": 0.8012417554855347, + "learning_rate": 3.487440307695183e-05, + "loss": 2.6427, + "step": 7285 + }, + { + "epoch": 0.6595455779849733, + "grad_norm": 0.8624984622001648, + "learning_rate": 3.485032531472638e-05, + "loss": 2.6505, + "step": 7286 + }, + { + "epoch": 0.6596361002987237, + "grad_norm": 0.9410037398338318, + "learning_rate": 3.4826254113071434e-05, + "loss": 2.637, + "step": 7287 + }, + { + "epoch": 0.659726622612474, + "grad_norm": 0.862177848815918, + "learning_rate": 3.480218947441083e-05, + "loss": 2.6587, + "step": 7288 + }, + { + "epoch": 0.6598171449262243, + "grad_norm": 0.82010418176651, + "learning_rate": 3.477813140116805e-05, + "loss": 2.6426, + "step": 7289 + }, + { + "epoch": 0.6599076672399746, + "grad_norm": 0.9039086699485779, + "learning_rate": 3.47540798957656e-05, + "loss": 2.6614, + "step": 7290 + }, + { + "epoch": 0.659998189553725, + "grad_norm": 0.8869426250457764, + "learning_rate": 3.473003496062552e-05, + "loss": 2.5854, + "step": 7291 + }, + { + "epoch": 0.6600887118674753, + "grad_norm": 0.8362395763397217, + "learning_rate": 3.470599659816914e-05, + "loss": 2.6311, + "step": 7292 + }, + { + "epoch": 0.6601792341812257, + "grad_norm": 0.8808051943778992, + "learning_rate": 3.4681964810817124e-05, + "loss": 2.6611, + "step": 7293 + }, + { + "epoch": 0.660269756494976, + "grad_norm": 0.8643760085105896, + "learning_rate": 3.465793960098945e-05, + "loss": 2.6705, + "step": 7294 + }, + { + "epoch": 0.6603602788087264, + "grad_norm": 0.8985621333122253, + "learning_rate": 3.4633920971105515e-05, + "loss": 2.6765, + "step": 7295 + }, + { + "epoch": 0.6604508011224767, + "grad_norm": 0.9751450419425964, + "learning_rate": 3.460990892358388e-05, + "loss": 2.6361, + "step": 7296 + }, + { + "epoch": 0.6605413234362271, + "grad_norm": 0.8282762169837952, + "learning_rate": 3.458590346084269e-05, + "loss": 2.6246, + "step": 7297 + }, + { + "epoch": 0.6606318457499774, + "grad_norm": 0.8377431631088257, + "learning_rate": 3.45619045852992e-05, + "loss": 2.6649, + "step": 7298 + }, + { + "epoch": 0.6607223680637277, + "grad_norm": 0.9260903596878052, + "learning_rate": 3.4537912299370134e-05, + "loss": 2.7063, + "step": 7299 + }, + { + "epoch": 0.660812890377478, + "grad_norm": 0.9289246797561646, + "learning_rate": 3.45139266054715e-05, + "loss": 2.7076, + "step": 7300 + }, + { + "epoch": 0.6609034126912284, + "grad_norm": 0.8479653596878052, + "learning_rate": 3.4489947506018674e-05, + "loss": 2.6206, + "step": 7301 + }, + { + "epoch": 0.6609939350049787, + "grad_norm": 0.8131391406059265, + "learning_rate": 3.446597500342633e-05, + "loss": 2.6383, + "step": 7302 + }, + { + "epoch": 0.6610844573187291, + "grad_norm": 0.8845452666282654, + "learning_rate": 3.444200910010854e-05, + "loss": 2.6501, + "step": 7303 + }, + { + "epoch": 0.6611749796324794, + "grad_norm": 0.8882754445075989, + "learning_rate": 3.441804979847859e-05, + "loss": 2.6433, + "step": 7304 + }, + { + "epoch": 0.6612655019462298, + "grad_norm": 0.8090992569923401, + "learning_rate": 3.439409710094929e-05, + "loss": 2.6514, + "step": 7305 + }, + { + "epoch": 0.6613560242599801, + "grad_norm": 0.9277315735816956, + "learning_rate": 3.4370151009932584e-05, + "loss": 2.6344, + "step": 7306 + }, + { + "epoch": 0.6614465465737305, + "grad_norm": 0.8132951259613037, + "learning_rate": 3.434621152783987e-05, + "loss": 2.6609, + "step": 7307 + }, + { + "epoch": 0.6615370688874808, + "grad_norm": 0.8669840097427368, + "learning_rate": 3.4322278657081867e-05, + "loss": 2.6665, + "step": 7308 + }, + { + "epoch": 0.6616275912012312, + "grad_norm": 0.8841760754585266, + "learning_rate": 3.4298352400068635e-05, + "loss": 2.6327, + "step": 7309 + }, + { + "epoch": 0.6617181135149814, + "grad_norm": 0.8753495812416077, + "learning_rate": 3.4274432759209453e-05, + "loss": 2.6814, + "step": 7310 + }, + { + "epoch": 0.6618086358287317, + "grad_norm": 0.8575315475463867, + "learning_rate": 3.4250519736913154e-05, + "loss": 2.6455, + "step": 7311 + }, + { + "epoch": 0.6618991581424821, + "grad_norm": 0.874495804309845, + "learning_rate": 3.4226613335587695e-05, + "loss": 2.6424, + "step": 7312 + }, + { + "epoch": 0.6619896804562324, + "grad_norm": 0.8438037633895874, + "learning_rate": 3.420271355764047e-05, + "loss": 2.7271, + "step": 7313 + }, + { + "epoch": 0.6620802027699828, + "grad_norm": 0.8370341658592224, + "learning_rate": 3.417882040547818e-05, + "loss": 2.7052, + "step": 7314 + }, + { + "epoch": 0.6621707250837331, + "grad_norm": 0.8512766361236572, + "learning_rate": 3.415493388150689e-05, + "loss": 2.6608, + "step": 7315 + }, + { + "epoch": 0.6622612473974835, + "grad_norm": 0.804831326007843, + "learning_rate": 3.413105398813195e-05, + "loss": 2.7129, + "step": 7316 + }, + { + "epoch": 0.6623517697112338, + "grad_norm": 0.8848623633384705, + "learning_rate": 3.41071807277581e-05, + "loss": 2.6712, + "step": 7317 + }, + { + "epoch": 0.6624422920249842, + "grad_norm": 0.8708617687225342, + "learning_rate": 3.408331410278929e-05, + "loss": 2.6331, + "step": 7318 + }, + { + "epoch": 0.6625328143387345, + "grad_norm": 0.8301901817321777, + "learning_rate": 3.4059454115629005e-05, + "loss": 2.6331, + "step": 7319 + }, + { + "epoch": 0.6626233366524849, + "grad_norm": 0.8876309394836426, + "learning_rate": 3.4035600768679855e-05, + "loss": 2.7056, + "step": 7320 + }, + { + "epoch": 0.6627138589662351, + "grad_norm": 0.8508949875831604, + "learning_rate": 3.40117540643439e-05, + "loss": 2.7127, + "step": 7321 + }, + { + "epoch": 0.6628043812799855, + "grad_norm": 0.8443604707717896, + "learning_rate": 3.398791400502251e-05, + "loss": 2.6863, + "step": 7322 + }, + { + "epoch": 0.6628949035937358, + "grad_norm": 0.8200173377990723, + "learning_rate": 3.396408059311638e-05, + "loss": 2.6272, + "step": 7323 + }, + { + "epoch": 0.6629854259074862, + "grad_norm": 0.8505333662033081, + "learning_rate": 3.394025383102552e-05, + "loss": 2.6465, + "step": 7324 + }, + { + "epoch": 0.6630759482212365, + "grad_norm": 0.8726922273635864, + "learning_rate": 3.391643372114932e-05, + "loss": 2.6408, + "step": 7325 + }, + { + "epoch": 0.6631664705349869, + "grad_norm": 0.9224769473075867, + "learning_rate": 3.3892620265886376e-05, + "loss": 2.6597, + "step": 7326 + }, + { + "epoch": 0.6632569928487372, + "grad_norm": 0.9343586564064026, + "learning_rate": 3.386881346763483e-05, + "loss": 2.6887, + "step": 7327 + }, + { + "epoch": 0.6633475151624876, + "grad_norm": 0.8156290650367737, + "learning_rate": 3.384501332879192e-05, + "loss": 2.608, + "step": 7328 + }, + { + "epoch": 0.6634380374762379, + "grad_norm": 0.8800485730171204, + "learning_rate": 3.382121985175436e-05, + "loss": 2.7136, + "step": 7329 + }, + { + "epoch": 0.6635285597899883, + "grad_norm": 0.8803322315216064, + "learning_rate": 3.379743303891815e-05, + "loss": 2.6477, + "step": 7330 + }, + { + "epoch": 0.6636190821037385, + "grad_norm": 0.8415948152542114, + "learning_rate": 3.377365289267862e-05, + "loss": 2.6389, + "step": 7331 + }, + { + "epoch": 0.6637096044174889, + "grad_norm": 0.9095073938369751, + "learning_rate": 3.374987941543043e-05, + "loss": 2.6926, + "step": 7332 + }, + { + "epoch": 0.6638001267312392, + "grad_norm": 0.8726220726966858, + "learning_rate": 3.372611260956761e-05, + "loss": 2.6237, + "step": 7333 + }, + { + "epoch": 0.6638906490449896, + "grad_norm": 0.8335959315299988, + "learning_rate": 3.370235247748337e-05, + "loss": 2.6272, + "step": 7334 + }, + { + "epoch": 0.6639811713587399, + "grad_norm": 0.7942869663238525, + "learning_rate": 3.367859902157048e-05, + "loss": 2.5965, + "step": 7335 + }, + { + "epoch": 0.6640716936724903, + "grad_norm": 0.9050523042678833, + "learning_rate": 3.3654852244220826e-05, + "loss": 2.6389, + "step": 7336 + }, + { + "epoch": 0.6641622159862406, + "grad_norm": 0.8640265464782715, + "learning_rate": 3.363111214782574e-05, + "loss": 2.701, + "step": 7337 + }, + { + "epoch": 0.664252738299991, + "grad_norm": 0.8449823260307312, + "learning_rate": 3.360737873477584e-05, + "loss": 2.6787, + "step": 7338 + }, + { + "epoch": 0.6643432606137413, + "grad_norm": 0.885446310043335, + "learning_rate": 3.358365200746112e-05, + "loss": 2.6858, + "step": 7339 + }, + { + "epoch": 0.6644337829274917, + "grad_norm": 0.9015811085700989, + "learning_rate": 3.3559931968270753e-05, + "loss": 2.6176, + "step": 7340 + }, + { + "epoch": 0.664524305241242, + "grad_norm": 0.8146497011184692, + "learning_rate": 3.3536218619593496e-05, + "loss": 2.6218, + "step": 7341 + }, + { + "epoch": 0.6646148275549923, + "grad_norm": 0.7753192186355591, + "learning_rate": 3.351251196381716e-05, + "loss": 2.6111, + "step": 7342 + }, + { + "epoch": 0.6647053498687426, + "grad_norm": 0.8494303822517395, + "learning_rate": 3.348881200332905e-05, + "loss": 2.7045, + "step": 7343 + }, + { + "epoch": 0.664795872182493, + "grad_norm": 0.8519464731216431, + "learning_rate": 3.3465118740515764e-05, + "loss": 2.6458, + "step": 7344 + }, + { + "epoch": 0.6648863944962433, + "grad_norm": 0.7856410145759583, + "learning_rate": 3.344143217776319e-05, + "loss": 2.6726, + "step": 7345 + }, + { + "epoch": 0.6649769168099937, + "grad_norm": 0.8536087870597839, + "learning_rate": 3.3417752317456584e-05, + "loss": 2.6002, + "step": 7346 + }, + { + "epoch": 0.665067439123744, + "grad_norm": 0.8146572709083557, + "learning_rate": 3.339407916198052e-05, + "loss": 2.6429, + "step": 7347 + }, + { + "epoch": 0.6651579614374944, + "grad_norm": 0.8779633045196533, + "learning_rate": 3.33704127137188e-05, + "loss": 2.6807, + "step": 7348 + }, + { + "epoch": 0.6652484837512447, + "grad_norm": 0.8361524343490601, + "learning_rate": 3.334675297505476e-05, + "loss": 2.5803, + "step": 7349 + }, + { + "epoch": 0.6653390060649951, + "grad_norm": 0.8956053256988525, + "learning_rate": 3.332309994837085e-05, + "loss": 2.6896, + "step": 7350 + }, + { + "epoch": 0.6654295283787454, + "grad_norm": 0.7889111638069153, + "learning_rate": 3.329945363604894e-05, + "loss": 2.6373, + "step": 7351 + }, + { + "epoch": 0.6655200506924956, + "grad_norm": 0.8257021307945251, + "learning_rate": 3.3275814040470234e-05, + "loss": 2.5832, + "step": 7352 + }, + { + "epoch": 0.665610573006246, + "grad_norm": 0.8282142877578735, + "learning_rate": 3.325218116401522e-05, + "loss": 2.65, + "step": 7353 + }, + { + "epoch": 0.6657010953199963, + "grad_norm": 0.8529660105705261, + "learning_rate": 3.322855500906373e-05, + "loss": 2.6685, + "step": 7354 + }, + { + "epoch": 0.6657916176337467, + "grad_norm": 0.9197294116020203, + "learning_rate": 3.320493557799497e-05, + "loss": 2.6648, + "step": 7355 + }, + { + "epoch": 0.665882139947497, + "grad_norm": 0.8231920003890991, + "learning_rate": 3.3181322873187326e-05, + "loss": 2.6398, + "step": 7356 + }, + { + "epoch": 0.6659726622612474, + "grad_norm": 0.8430575728416443, + "learning_rate": 3.3157716897018645e-05, + "loss": 2.6812, + "step": 7357 + }, + { + "epoch": 0.6660631845749977, + "grad_norm": 0.8932907581329346, + "learning_rate": 3.313411765186605e-05, + "loss": 2.6575, + "step": 7358 + }, + { + "epoch": 0.6661537068887481, + "grad_norm": 0.823968768119812, + "learning_rate": 3.311052514010596e-05, + "loss": 2.6438, + "step": 7359 + }, + { + "epoch": 0.6662442292024984, + "grad_norm": 0.8624204397201538, + "learning_rate": 3.308693936411421e-05, + "loss": 2.6686, + "step": 7360 + }, + { + "epoch": 0.6663347515162488, + "grad_norm": 0.7924514412879944, + "learning_rate": 3.306336032626576e-05, + "loss": 2.6259, + "step": 7361 + }, + { + "epoch": 0.666425273829999, + "grad_norm": 0.843051552772522, + "learning_rate": 3.3039788028935156e-05, + "loss": 2.6942, + "step": 7362 + }, + { + "epoch": 0.6665157961437495, + "grad_norm": 0.8649163842201233, + "learning_rate": 3.301622247449604e-05, + "loss": 2.6264, + "step": 7363 + }, + { + "epoch": 0.6666063184574997, + "grad_norm": 0.8684132695198059, + "learning_rate": 3.299266366532149e-05, + "loss": 2.66, + "step": 7364 + }, + { + "epoch": 0.6666968407712501, + "grad_norm": 0.8766152858734131, + "learning_rate": 3.2969111603783875e-05, + "loss": 2.6308, + "step": 7365 + }, + { + "epoch": 0.6667873630850004, + "grad_norm": 0.9793428778648376, + "learning_rate": 3.294556629225488e-05, + "loss": 2.726, + "step": 7366 + }, + { + "epoch": 0.6668778853987508, + "grad_norm": 0.8449831008911133, + "learning_rate": 3.292202773310553e-05, + "loss": 2.6433, + "step": 7367 + }, + { + "epoch": 0.6669684077125011, + "grad_norm": 0.7769423723220825, + "learning_rate": 3.2898495928706185e-05, + "loss": 2.6087, + "step": 7368 + }, + { + "epoch": 0.6670589300262515, + "grad_norm": 0.8507257699966431, + "learning_rate": 3.2874970881426426e-05, + "loss": 2.6934, + "step": 7369 + }, + { + "epoch": 0.6671494523400018, + "grad_norm": 0.8462991714477539, + "learning_rate": 3.2851452593635266e-05, + "loss": 2.7214, + "step": 7370 + }, + { + "epoch": 0.6672399746537522, + "grad_norm": 0.8643592000007629, + "learning_rate": 3.2827941067700996e-05, + "loss": 2.7112, + "step": 7371 + }, + { + "epoch": 0.6673304969675025, + "grad_norm": 0.9072657823562622, + "learning_rate": 3.2804436305991214e-05, + "loss": 2.6819, + "step": 7372 + }, + { + "epoch": 0.6674210192812529, + "grad_norm": 0.8652128577232361, + "learning_rate": 3.278093831087287e-05, + "loss": 2.6554, + "step": 7373 + }, + { + "epoch": 0.6675115415950031, + "grad_norm": 0.8562641739845276, + "learning_rate": 3.275744708471222e-05, + "loss": 2.6745, + "step": 7374 + }, + { + "epoch": 0.6676020639087535, + "grad_norm": 0.8770109415054321, + "learning_rate": 3.273396262987475e-05, + "loss": 2.6677, + "step": 7375 + }, + { + "epoch": 0.6676925862225038, + "grad_norm": 0.8314611315727234, + "learning_rate": 3.271048494872546e-05, + "loss": 2.6635, + "step": 7376 + }, + { + "epoch": 0.6677831085362542, + "grad_norm": 0.9163419604301453, + "learning_rate": 3.268701404362847e-05, + "loss": 2.634, + "step": 7377 + }, + { + "epoch": 0.6678736308500045, + "grad_norm": 0.8832092881202698, + "learning_rate": 3.266354991694732e-05, + "loss": 2.6737, + "step": 7378 + }, + { + "epoch": 0.6679641531637549, + "grad_norm": 0.9241970777511597, + "learning_rate": 3.264009257104486e-05, + "loss": 2.6664, + "step": 7379 + }, + { + "epoch": 0.6680546754775052, + "grad_norm": 0.9200668334960938, + "learning_rate": 3.2616642008283213e-05, + "loss": 2.6741, + "step": 7380 + }, + { + "epoch": 0.6681451977912556, + "grad_norm": 0.8832535743713379, + "learning_rate": 3.259319823102389e-05, + "loss": 2.6622, + "step": 7381 + }, + { + "epoch": 0.6682357201050059, + "grad_norm": 0.8708347678184509, + "learning_rate": 3.2569761241627696e-05, + "loss": 2.6535, + "step": 7382 + }, + { + "epoch": 0.6683262424187563, + "grad_norm": 0.8515925407409668, + "learning_rate": 3.254633104245463e-05, + "loss": 2.6247, + "step": 7383 + }, + { + "epoch": 0.6684167647325066, + "grad_norm": 0.9260343909263611, + "learning_rate": 3.2522907635864244e-05, + "loss": 2.6386, + "step": 7384 + }, + { + "epoch": 0.668507287046257, + "grad_norm": 0.8982704877853394, + "learning_rate": 3.249949102421518e-05, + "loss": 2.697, + "step": 7385 + }, + { + "epoch": 0.6685978093600072, + "grad_norm": 0.8327007293701172, + "learning_rate": 3.247608120986552e-05, + "loss": 2.6658, + "step": 7386 + }, + { + "epoch": 0.6686883316737576, + "grad_norm": 0.8208869695663452, + "learning_rate": 3.2452678195172635e-05, + "loss": 2.6131, + "step": 7387 + }, + { + "epoch": 0.6687788539875079, + "grad_norm": 0.830223023891449, + "learning_rate": 3.242928198249322e-05, + "loss": 2.6519, + "step": 7388 + }, + { + "epoch": 0.6688693763012583, + "grad_norm": 0.8725573420524597, + "learning_rate": 3.240589257418325e-05, + "loss": 2.6519, + "step": 7389 + }, + { + "epoch": 0.6689598986150086, + "grad_norm": 0.8340583443641663, + "learning_rate": 3.238250997259808e-05, + "loss": 2.7087, + "step": 7390 + }, + { + "epoch": 0.669050420928759, + "grad_norm": 0.9096596837043762, + "learning_rate": 3.235913418009224e-05, + "loss": 2.6359, + "step": 7391 + }, + { + "epoch": 0.6691409432425093, + "grad_norm": 0.7934027314186096, + "learning_rate": 3.233576519901981e-05, + "loss": 2.6322, + "step": 7392 + }, + { + "epoch": 0.6692314655562596, + "grad_norm": 0.8696547746658325, + "learning_rate": 3.231240303173394e-05, + "loss": 2.639, + "step": 7393 + }, + { + "epoch": 0.66932198787001, + "grad_norm": 0.8256217837333679, + "learning_rate": 3.228904768058722e-05, + "loss": 2.619, + "step": 7394 + }, + { + "epoch": 0.6694125101837602, + "grad_norm": 0.8816400170326233, + "learning_rate": 3.226569914793156e-05, + "loss": 2.6671, + "step": 7395 + }, + { + "epoch": 0.6695030324975106, + "grad_norm": 0.8656291365623474, + "learning_rate": 3.224235743611814e-05, + "loss": 2.6264, + "step": 7396 + }, + { + "epoch": 0.6695935548112609, + "grad_norm": 0.8678957223892212, + "learning_rate": 3.221902254749747e-05, + "loss": 2.7066, + "step": 7397 + }, + { + "epoch": 0.6696840771250113, + "grad_norm": 0.870672345161438, + "learning_rate": 3.21956944844194e-05, + "loss": 2.6839, + "step": 7398 + }, + { + "epoch": 0.6697745994387616, + "grad_norm": 0.8486281633377075, + "learning_rate": 3.217237324923299e-05, + "loss": 2.7099, + "step": 7399 + }, + { + "epoch": 0.669865121752512, + "grad_norm": 0.8596231937408447, + "learning_rate": 3.21490588442868e-05, + "loss": 2.6821, + "step": 7400 + }, + { + "epoch": 0.6699556440662623, + "grad_norm": 0.8599048852920532, + "learning_rate": 3.2125751271928485e-05, + "loss": 2.6727, + "step": 7401 + }, + { + "epoch": 0.6700461663800127, + "grad_norm": 0.8863310217857361, + "learning_rate": 3.210245053450517e-05, + "loss": 2.6249, + "step": 7402 + }, + { + "epoch": 0.670136688693763, + "grad_norm": 0.8507751822471619, + "learning_rate": 3.207915663436322e-05, + "loss": 2.6963, + "step": 7403 + }, + { + "epoch": 0.6702272110075134, + "grad_norm": 0.8587643504142761, + "learning_rate": 3.205586957384838e-05, + "loss": 2.6832, + "step": 7404 + }, + { + "epoch": 0.6703177333212637, + "grad_norm": 0.8385748267173767, + "learning_rate": 3.203258935530554e-05, + "loss": 2.6764, + "step": 7405 + }, + { + "epoch": 0.670408255635014, + "grad_norm": 0.8625391721725464, + "learning_rate": 3.200931598107916e-05, + "loss": 2.6947, + "step": 7406 + }, + { + "epoch": 0.6704987779487643, + "grad_norm": 0.9186429977416992, + "learning_rate": 3.1986049453512766e-05, + "loss": 2.6733, + "step": 7407 + }, + { + "epoch": 0.6705893002625147, + "grad_norm": 0.8060020804405212, + "learning_rate": 3.196278977494934e-05, + "loss": 2.6097, + "step": 7408 + }, + { + "epoch": 0.670679822576265, + "grad_norm": 0.8401286005973816, + "learning_rate": 3.193953694773112e-05, + "loss": 2.6956, + "step": 7409 + }, + { + "epoch": 0.6707703448900154, + "grad_norm": 0.8545077443122864, + "learning_rate": 3.191629097419966e-05, + "loss": 2.6702, + "step": 7410 + }, + { + "epoch": 0.6708608672037657, + "grad_norm": 0.9147730469703674, + "learning_rate": 3.1893051856695846e-05, + "loss": 2.6265, + "step": 7411 + }, + { + "epoch": 0.6709513895175161, + "grad_norm": 0.8632221221923828, + "learning_rate": 3.186981959755987e-05, + "loss": 2.6475, + "step": 7412 + }, + { + "epoch": 0.6710419118312664, + "grad_norm": 0.846584141254425, + "learning_rate": 3.184659419913113e-05, + "loss": 2.6488, + "step": 7413 + }, + { + "epoch": 0.6711324341450168, + "grad_norm": 0.8977507948875427, + "learning_rate": 3.182337566374856e-05, + "loss": 2.5776, + "step": 7414 + }, + { + "epoch": 0.6712229564587671, + "grad_norm": 0.8261081576347351, + "learning_rate": 3.1800163993750166e-05, + "loss": 2.6674, + "step": 7415 + }, + { + "epoch": 0.6713134787725175, + "grad_norm": 0.8828268647193909, + "learning_rate": 3.177695919147339e-05, + "loss": 2.6563, + "step": 7416 + }, + { + "epoch": 0.6714040010862677, + "grad_norm": 0.8706898093223572, + "learning_rate": 3.1753761259254946e-05, + "loss": 2.651, + "step": 7417 + }, + { + "epoch": 0.6714945234000181, + "grad_norm": 0.8818956613540649, + "learning_rate": 3.173057019943089e-05, + "loss": 2.6796, + "step": 7418 + }, + { + "epoch": 0.6715850457137684, + "grad_norm": 0.8012150526046753, + "learning_rate": 3.1707386014336537e-05, + "loss": 2.6349, + "step": 7419 + }, + { + "epoch": 0.6716755680275188, + "grad_norm": 0.8736044764518738, + "learning_rate": 3.1684208706306574e-05, + "loss": 2.6447, + "step": 7420 + }, + { + "epoch": 0.6717660903412691, + "grad_norm": 0.8982002139091492, + "learning_rate": 3.166103827767487e-05, + "loss": 2.7035, + "step": 7421 + }, + { + "epoch": 0.6718566126550195, + "grad_norm": 0.816577136516571, + "learning_rate": 3.1637874730774795e-05, + "loss": 2.6144, + "step": 7422 + }, + { + "epoch": 0.6719471349687698, + "grad_norm": 0.8612717986106873, + "learning_rate": 3.161471806793884e-05, + "loss": 2.6686, + "step": 7423 + }, + { + "epoch": 0.6720376572825202, + "grad_norm": 0.8321888446807861, + "learning_rate": 3.1591568291498904e-05, + "loss": 2.5793, + "step": 7424 + }, + { + "epoch": 0.6721281795962705, + "grad_norm": 0.8054267168045044, + "learning_rate": 3.156842540378617e-05, + "loss": 2.6141, + "step": 7425 + }, + { + "epoch": 0.6722187019100209, + "grad_norm": 0.8201277852058411, + "learning_rate": 3.154528940713113e-05, + "loss": 2.6211, + "step": 7426 + }, + { + "epoch": 0.6723092242237712, + "grad_norm": 0.8517640233039856, + "learning_rate": 3.152216030386357e-05, + "loss": 2.6474, + "step": 7427 + }, + { + "epoch": 0.6723997465375215, + "grad_norm": 0.9056238532066345, + "learning_rate": 3.1499038096312615e-05, + "loss": 2.6526, + "step": 7428 + }, + { + "epoch": 0.6724902688512718, + "grad_norm": 0.9206411838531494, + "learning_rate": 3.14759227868066e-05, + "loss": 2.6644, + "step": 7429 + }, + { + "epoch": 0.6725807911650222, + "grad_norm": 0.8964048027992249, + "learning_rate": 3.1452814377673346e-05, + "loss": 2.6055, + "step": 7430 + }, + { + "epoch": 0.6726713134787725, + "grad_norm": 0.8579918146133423, + "learning_rate": 3.142971287123977e-05, + "loss": 2.6294, + "step": 7431 + }, + { + "epoch": 0.6727618357925229, + "grad_norm": 0.8341345191001892, + "learning_rate": 3.140661826983223e-05, + "loss": 2.6064, + "step": 7432 + }, + { + "epoch": 0.6728523581062732, + "grad_norm": 0.8820474147796631, + "learning_rate": 3.138353057577636e-05, + "loss": 2.6981, + "step": 7433 + }, + { + "epoch": 0.6729428804200235, + "grad_norm": 0.8326157331466675, + "learning_rate": 3.136044979139712e-05, + "loss": 2.6532, + "step": 7434 + }, + { + "epoch": 0.6730334027337739, + "grad_norm": 0.8076948523521423, + "learning_rate": 3.133737591901864e-05, + "loss": 2.6392, + "step": 7435 + }, + { + "epoch": 0.6731239250475242, + "grad_norm": 0.809867799282074, + "learning_rate": 3.131430896096459e-05, + "loss": 2.6554, + "step": 7436 + }, + { + "epoch": 0.6732144473612746, + "grad_norm": 0.8507243990898132, + "learning_rate": 3.129124891955771e-05, + "loss": 2.6328, + "step": 7437 + }, + { + "epoch": 0.6733049696750248, + "grad_norm": 0.8950903415679932, + "learning_rate": 3.1268195797120195e-05, + "loss": 2.7115, + "step": 7438 + }, + { + "epoch": 0.6733954919887752, + "grad_norm": 0.9369960427284241, + "learning_rate": 3.124514959597346e-05, + "loss": 2.6328, + "step": 7439 + }, + { + "epoch": 0.6734860143025255, + "grad_norm": 0.8768920302391052, + "learning_rate": 3.1222110318438304e-05, + "loss": 2.6721, + "step": 7440 + }, + { + "epoch": 0.6735765366162759, + "grad_norm": 0.8693579435348511, + "learning_rate": 3.119907796683479e-05, + "loss": 2.6859, + "step": 7441 + }, + { + "epoch": 0.6736670589300262, + "grad_norm": 0.847797155380249, + "learning_rate": 3.1176052543482194e-05, + "loss": 2.6447, + "step": 7442 + }, + { + "epoch": 0.6737575812437766, + "grad_norm": 0.819843053817749, + "learning_rate": 3.115303405069922e-05, + "loss": 2.6963, + "step": 7443 + }, + { + "epoch": 0.6738481035575269, + "grad_norm": 0.8983978033065796, + "learning_rate": 3.113002249080386e-05, + "loss": 2.6088, + "step": 7444 + }, + { + "epoch": 0.6739386258712773, + "grad_norm": 0.8833874464035034, + "learning_rate": 3.110701786611333e-05, + "loss": 2.6925, + "step": 7445 + }, + { + "epoch": 0.6740291481850276, + "grad_norm": 0.7997896075248718, + "learning_rate": 3.108402017894422e-05, + "loss": 2.5894, + "step": 7446 + }, + { + "epoch": 0.674119670498778, + "grad_norm": 0.867850661277771, + "learning_rate": 3.106102943161242e-05, + "loss": 2.5973, + "step": 7447 + }, + { + "epoch": 0.6742101928125283, + "grad_norm": 0.7908462882041931, + "learning_rate": 3.103804562643302e-05, + "loss": 2.6414, + "step": 7448 + }, + { + "epoch": 0.6743007151262786, + "grad_norm": 0.8645251393318176, + "learning_rate": 3.101506876572059e-05, + "loss": 2.6771, + "step": 7449 + }, + { + "epoch": 0.6743912374400289, + "grad_norm": 0.8117453455924988, + "learning_rate": 3.099209885178882e-05, + "loss": 2.6452, + "step": 7450 + }, + { + "epoch": 0.6744817597537793, + "grad_norm": 0.8113788962364197, + "learning_rate": 3.096913588695081e-05, + "loss": 2.6444, + "step": 7451 + }, + { + "epoch": 0.6745722820675296, + "grad_norm": 0.871590256690979, + "learning_rate": 3.094617987351892e-05, + "loss": 2.6551, + "step": 7452 + }, + { + "epoch": 0.67466280438128, + "grad_norm": 0.8343399167060852, + "learning_rate": 3.0923230813804825e-05, + "loss": 2.6521, + "step": 7453 + }, + { + "epoch": 0.6747533266950303, + "grad_norm": 0.9126983880996704, + "learning_rate": 3.0900288710119504e-05, + "loss": 2.6849, + "step": 7454 + }, + { + "epoch": 0.6748438490087807, + "grad_norm": 0.8040212392807007, + "learning_rate": 3.087735356477326e-05, + "loss": 2.6112, + "step": 7455 + }, + { + "epoch": 0.674934371322531, + "grad_norm": 0.8733232021331787, + "learning_rate": 3.0854425380075544e-05, + "loss": 2.6742, + "step": 7456 + }, + { + "epoch": 0.6750248936362814, + "grad_norm": 0.8318789601325989, + "learning_rate": 3.083150415833537e-05, + "loss": 2.6729, + "step": 7457 + }, + { + "epoch": 0.6751154159500317, + "grad_norm": 0.884968638420105, + "learning_rate": 3.08085899018608e-05, + "loss": 2.6462, + "step": 7458 + }, + { + "epoch": 0.6752059382637821, + "grad_norm": 0.7779831886291504, + "learning_rate": 3.078568261295933e-05, + "loss": 2.6402, + "step": 7459 + }, + { + "epoch": 0.6752964605775323, + "grad_norm": 0.8294646143913269, + "learning_rate": 3.076278229393773e-05, + "loss": 2.6428, + "step": 7460 + }, + { + "epoch": 0.6753869828912827, + "grad_norm": 0.8570337295532227, + "learning_rate": 3.073988894710209e-05, + "loss": 2.6938, + "step": 7461 + }, + { + "epoch": 0.675477505205033, + "grad_norm": 0.87303227186203, + "learning_rate": 3.071700257475768e-05, + "loss": 2.7059, + "step": 7462 + }, + { + "epoch": 0.6755680275187834, + "grad_norm": 0.808883011341095, + "learning_rate": 3.0694123179209266e-05, + "loss": 2.6511, + "step": 7463 + }, + { + "epoch": 0.6756585498325337, + "grad_norm": 0.9272127151489258, + "learning_rate": 3.067125076276073e-05, + "loss": 2.6945, + "step": 7464 + }, + { + "epoch": 0.6757490721462841, + "grad_norm": 0.974225640296936, + "learning_rate": 3.064838532771535e-05, + "loss": 2.6187, + "step": 7465 + }, + { + "epoch": 0.6758395944600344, + "grad_norm": 0.8074182868003845, + "learning_rate": 3.062552687637566e-05, + "loss": 2.6598, + "step": 7466 + }, + { + "epoch": 0.6759301167737848, + "grad_norm": 0.8622375130653381, + "learning_rate": 3.0602675411043516e-05, + "loss": 2.6804, + "step": 7467 + }, + { + "epoch": 0.6760206390875351, + "grad_norm": 0.8563747406005859, + "learning_rate": 3.0579830934020057e-05, + "loss": 2.6641, + "step": 7468 + }, + { + "epoch": 0.6761111614012855, + "grad_norm": 0.8611782193183899, + "learning_rate": 3.0556993447605765e-05, + "loss": 2.653, + "step": 7469 + }, + { + "epoch": 0.6762016837150358, + "grad_norm": 0.8958120942115784, + "learning_rate": 3.053416295410026e-05, + "loss": 2.6745, + "step": 7470 + }, + { + "epoch": 0.6762922060287861, + "grad_norm": 0.883176326751709, + "learning_rate": 3.0511339455802724e-05, + "loss": 2.6106, + "step": 7471 + }, + { + "epoch": 0.6763827283425364, + "grad_norm": 0.8690735101699829, + "learning_rate": 3.0488522955011357e-05, + "loss": 2.6656, + "step": 7472 + }, + { + "epoch": 0.6764732506562868, + "grad_norm": 0.8291492462158203, + "learning_rate": 3.0465713454023826e-05, + "loss": 2.6225, + "step": 7473 + }, + { + "epoch": 0.6765637729700371, + "grad_norm": 0.7962046265602112, + "learning_rate": 3.044291095513705e-05, + "loss": 2.6375, + "step": 7474 + }, + { + "epoch": 0.6766542952837874, + "grad_norm": 0.9379552602767944, + "learning_rate": 3.0420115460647235e-05, + "loss": 2.7519, + "step": 7475 + }, + { + "epoch": 0.6767448175975378, + "grad_norm": 0.8300173878669739, + "learning_rate": 3.0397326972849892e-05, + "loss": 2.6709, + "step": 7476 + }, + { + "epoch": 0.6768353399112881, + "grad_norm": 0.9035888314247131, + "learning_rate": 3.0374545494039852e-05, + "loss": 2.6305, + "step": 7477 + }, + { + "epoch": 0.6769258622250385, + "grad_norm": 0.8968276381492615, + "learning_rate": 3.0351771026511122e-05, + "loss": 2.6216, + "step": 7478 + }, + { + "epoch": 0.6770163845387888, + "grad_norm": 0.847926139831543, + "learning_rate": 3.032900357255719e-05, + "loss": 2.6709, + "step": 7479 + }, + { + "epoch": 0.6771069068525392, + "grad_norm": 1.1153925657272339, + "learning_rate": 3.030624313447067e-05, + "loss": 2.6113, + "step": 7480 + }, + { + "epoch": 0.6771974291662894, + "grad_norm": 0.9465881586074829, + "learning_rate": 3.0283489714543556e-05, + "loss": 2.6132, + "step": 7481 + }, + { + "epoch": 0.6772879514800398, + "grad_norm": 0.8538044691085815, + "learning_rate": 3.026074331506712e-05, + "loss": 2.6816, + "step": 7482 + }, + { + "epoch": 0.6773784737937901, + "grad_norm": 0.8334993720054626, + "learning_rate": 3.0238003938331927e-05, + "loss": 2.655, + "step": 7483 + }, + { + "epoch": 0.6774689961075405, + "grad_norm": 1.0370806455612183, + "learning_rate": 3.0215271586627825e-05, + "loss": 2.6343, + "step": 7484 + }, + { + "epoch": 0.6775595184212908, + "grad_norm": 0.8620277047157288, + "learning_rate": 3.019254626224399e-05, + "loss": 2.6227, + "step": 7485 + }, + { + "epoch": 0.6776500407350412, + "grad_norm": 0.8354658484458923, + "learning_rate": 3.016982796746879e-05, + "loss": 2.6554, + "step": 7486 + }, + { + "epoch": 0.6777405630487915, + "grad_norm": 0.8558822274208069, + "learning_rate": 3.014711670459005e-05, + "loss": 2.643, + "step": 7487 + }, + { + "epoch": 0.6778310853625419, + "grad_norm": 0.8511578440666199, + "learning_rate": 3.0124412475894726e-05, + "loss": 2.6077, + "step": 7488 + }, + { + "epoch": 0.6779216076762922, + "grad_norm": 0.9501864910125732, + "learning_rate": 3.0101715283669153e-05, + "loss": 2.6687, + "step": 7489 + }, + { + "epoch": 0.6780121299900426, + "grad_norm": 0.838162362575531, + "learning_rate": 3.0079025130198935e-05, + "loss": 2.6471, + "step": 7490 + }, + { + "epoch": 0.6781026523037929, + "grad_norm": 0.837769091129303, + "learning_rate": 3.005634201776897e-05, + "loss": 2.6728, + "step": 7491 + }, + { + "epoch": 0.6781931746175432, + "grad_norm": 0.8764926195144653, + "learning_rate": 3.0033665948663448e-05, + "loss": 2.67, + "step": 7492 + }, + { + "epoch": 0.6782836969312935, + "grad_norm": 0.9114204049110413, + "learning_rate": 3.0010996925165892e-05, + "loss": 2.6779, + "step": 7493 + }, + { + "epoch": 0.6783742192450439, + "grad_norm": 0.8063619136810303, + "learning_rate": 2.998833494955896e-05, + "loss": 2.5825, + "step": 7494 + }, + { + "epoch": 0.6784647415587942, + "grad_norm": 0.8676156401634216, + "learning_rate": 2.9965680024124852e-05, + "loss": 2.6501, + "step": 7495 + }, + { + "epoch": 0.6785552638725446, + "grad_norm": 0.8409841060638428, + "learning_rate": 2.9943032151144812e-05, + "loss": 2.6461, + "step": 7496 + }, + { + "epoch": 0.6786457861862949, + "grad_norm": 0.8916147947311401, + "learning_rate": 2.992039133289952e-05, + "loss": 2.6078, + "step": 7497 + }, + { + "epoch": 0.6787363085000453, + "grad_norm": 0.8280127644538879, + "learning_rate": 2.9897757571668905e-05, + "loss": 2.6796, + "step": 7498 + }, + { + "epoch": 0.6788268308137956, + "grad_norm": 0.881831705570221, + "learning_rate": 2.9875130869732214e-05, + "loss": 2.6024, + "step": 7499 + }, + { + "epoch": 0.678917353127546, + "grad_norm": 0.8717087507247925, + "learning_rate": 2.9852511229367865e-05, + "loss": 2.6138, + "step": 7500 + }, + { + "epoch": 0.6790078754412963, + "grad_norm": 0.8997431397438049, + "learning_rate": 2.9829898652853773e-05, + "loss": 2.6541, + "step": 7501 + }, + { + "epoch": 0.6790983977550467, + "grad_norm": 0.9234827756881714, + "learning_rate": 2.980729314246694e-05, + "loss": 2.672, + "step": 7502 + }, + { + "epoch": 0.6791889200687969, + "grad_norm": 0.8099880814552307, + "learning_rate": 2.9784694700483762e-05, + "loss": 2.6478, + "step": 7503 + }, + { + "epoch": 0.6792794423825473, + "grad_norm": 0.8408997058868408, + "learning_rate": 2.9762103329179913e-05, + "loss": 2.6768, + "step": 7504 + }, + { + "epoch": 0.6793699646962976, + "grad_norm": 0.8334234952926636, + "learning_rate": 2.9739519030830333e-05, + "loss": 2.6251, + "step": 7505 + }, + { + "epoch": 0.679460487010048, + "grad_norm": 0.8718681335449219, + "learning_rate": 2.9716941807709265e-05, + "loss": 2.6532, + "step": 7506 + }, + { + "epoch": 0.6795510093237983, + "grad_norm": 0.9016917943954468, + "learning_rate": 2.969437166209027e-05, + "loss": 2.6318, + "step": 7507 + }, + { + "epoch": 0.6796415316375487, + "grad_norm": 0.9001185297966003, + "learning_rate": 2.9671808596246053e-05, + "loss": 2.6418, + "step": 7508 + }, + { + "epoch": 0.679732053951299, + "grad_norm": 0.8468513488769531, + "learning_rate": 2.9649252612448863e-05, + "loss": 2.6544, + "step": 7509 + }, + { + "epoch": 0.6798225762650494, + "grad_norm": 0.8203318119049072, + "learning_rate": 2.962670371296996e-05, + "loss": 2.644, + "step": 7510 + }, + { + "epoch": 0.6799130985787997, + "grad_norm": 0.8951826691627502, + "learning_rate": 2.9604161900080087e-05, + "loss": 2.6286, + "step": 7511 + }, + { + "epoch": 0.6800036208925501, + "grad_norm": 0.8559239506721497, + "learning_rate": 2.9581627176049166e-05, + "loss": 2.6228, + "step": 7512 + }, + { + "epoch": 0.6800941432063004, + "grad_norm": 0.880177915096283, + "learning_rate": 2.9559099543146474e-05, + "loss": 2.6389, + "step": 7513 + }, + { + "epoch": 0.6801846655200507, + "grad_norm": 0.9628912210464478, + "learning_rate": 2.953657900364053e-05, + "loss": 2.6487, + "step": 7514 + }, + { + "epoch": 0.680275187833801, + "grad_norm": 0.7840589284896851, + "learning_rate": 2.9514065559799176e-05, + "loss": 2.642, + "step": 7515 + }, + { + "epoch": 0.6803657101475513, + "grad_norm": 0.9465866088867188, + "learning_rate": 2.949155921388943e-05, + "loss": 2.6898, + "step": 7516 + }, + { + "epoch": 0.6804562324613017, + "grad_norm": 0.8815156817436218, + "learning_rate": 2.9469059968177815e-05, + "loss": 2.699, + "step": 7517 + }, + { + "epoch": 0.680546754775052, + "grad_norm": 0.9435743093490601, + "learning_rate": 2.9446567824929883e-05, + "loss": 2.6925, + "step": 7518 + }, + { + "epoch": 0.6806372770888024, + "grad_norm": 0.8814501166343689, + "learning_rate": 2.9424082786410656e-05, + "loss": 2.7095, + "step": 7519 + }, + { + "epoch": 0.6807277994025527, + "grad_norm": 0.7933807969093323, + "learning_rate": 2.9401604854884357e-05, + "loss": 2.6263, + "step": 7520 + }, + { + "epoch": 0.6808183217163031, + "grad_norm": 0.8809409141540527, + "learning_rate": 2.9379134032614507e-05, + "loss": 2.6298, + "step": 7521 + }, + { + "epoch": 0.6809088440300534, + "grad_norm": 0.8568894267082214, + "learning_rate": 2.9356670321863942e-05, + "loss": 2.6771, + "step": 7522 + }, + { + "epoch": 0.6809993663438038, + "grad_norm": 0.8187510371208191, + "learning_rate": 2.9334213724894767e-05, + "loss": 2.6327, + "step": 7523 + }, + { + "epoch": 0.681089888657554, + "grad_norm": 0.8523328900337219, + "learning_rate": 2.9311764243968275e-05, + "loss": 2.6121, + "step": 7524 + }, + { + "epoch": 0.6811804109713044, + "grad_norm": 0.9264262914657593, + "learning_rate": 2.9289321881345254e-05, + "loss": 2.7426, + "step": 7525 + }, + { + "epoch": 0.6812709332850547, + "grad_norm": 0.9365399479866028, + "learning_rate": 2.9266886639285585e-05, + "loss": 2.6904, + "step": 7526 + }, + { + "epoch": 0.6813614555988051, + "grad_norm": 0.8703701496124268, + "learning_rate": 2.9244458520048434e-05, + "loss": 2.6645, + "step": 7527 + }, + { + "epoch": 0.6814519779125554, + "grad_norm": 0.8455237150192261, + "learning_rate": 2.922203752589243e-05, + "loss": 2.602, + "step": 7528 + }, + { + "epoch": 0.6815425002263058, + "grad_norm": 0.8652321696281433, + "learning_rate": 2.9199623659075283e-05, + "loss": 2.6605, + "step": 7529 + }, + { + "epoch": 0.6816330225400561, + "grad_norm": 0.8773972988128662, + "learning_rate": 2.91772169218541e-05, + "loss": 2.6237, + "step": 7530 + }, + { + "epoch": 0.6817235448538065, + "grad_norm": 0.8873440623283386, + "learning_rate": 2.915481731648523e-05, + "loss": 2.6625, + "step": 7531 + }, + { + "epoch": 0.6818140671675568, + "grad_norm": 0.7923961877822876, + "learning_rate": 2.9132424845224317e-05, + "loss": 2.681, + "step": 7532 + }, + { + "epoch": 0.6819045894813072, + "grad_norm": 0.8112161755561829, + "learning_rate": 2.9110039510326293e-05, + "loss": 2.5817, + "step": 7533 + }, + { + "epoch": 0.6819951117950575, + "grad_norm": 0.874810516834259, + "learning_rate": 2.9087661314045366e-05, + "loss": 2.6135, + "step": 7534 + }, + { + "epoch": 0.6820856341088078, + "grad_norm": 0.7897319793701172, + "learning_rate": 2.9065290258634958e-05, + "loss": 2.6112, + "step": 7535 + }, + { + "epoch": 0.6821761564225581, + "grad_norm": 0.8886232972145081, + "learning_rate": 2.904292634634793e-05, + "loss": 2.6189, + "step": 7536 + }, + { + "epoch": 0.6822666787363085, + "grad_norm": 0.8394643068313599, + "learning_rate": 2.9020569579436252e-05, + "loss": 2.6385, + "step": 7537 + }, + { + "epoch": 0.6823572010500588, + "grad_norm": 0.8300806879997253, + "learning_rate": 2.8998219960151275e-05, + "loss": 2.6521, + "step": 7538 + }, + { + "epoch": 0.6824477233638092, + "grad_norm": 0.8635507822036743, + "learning_rate": 2.8975877490743607e-05, + "loss": 2.6268, + "step": 7539 + }, + { + "epoch": 0.6825382456775595, + "grad_norm": 0.7847825884819031, + "learning_rate": 2.8953542173463133e-05, + "loss": 2.6358, + "step": 7540 + }, + { + "epoch": 0.6826287679913099, + "grad_norm": 0.9233072400093079, + "learning_rate": 2.8931214010559004e-05, + "loss": 2.6771, + "step": 7541 + }, + { + "epoch": 0.6827192903050602, + "grad_norm": 0.8705090880393982, + "learning_rate": 2.8908893004279723e-05, + "loss": 2.6798, + "step": 7542 + }, + { + "epoch": 0.6828098126188106, + "grad_norm": 0.8056427836418152, + "learning_rate": 2.8886579156872916e-05, + "loss": 2.6228, + "step": 7543 + }, + { + "epoch": 0.6829003349325609, + "grad_norm": 0.8417764902114868, + "learning_rate": 2.8864272470585696e-05, + "loss": 2.6361, + "step": 7544 + }, + { + "epoch": 0.6829908572463113, + "grad_norm": 0.9244621992111206, + "learning_rate": 2.8841972947664252e-05, + "loss": 2.6495, + "step": 7545 + }, + { + "epoch": 0.6830813795600615, + "grad_norm": 0.8148440718650818, + "learning_rate": 2.8819680590354202e-05, + "loss": 2.6521, + "step": 7546 + }, + { + "epoch": 0.6831719018738119, + "grad_norm": 0.8853485584259033, + "learning_rate": 2.879739540090036e-05, + "loss": 2.6394, + "step": 7547 + }, + { + "epoch": 0.6832624241875622, + "grad_norm": 0.8543866276741028, + "learning_rate": 2.8775117381546857e-05, + "loss": 2.6207, + "step": 7548 + }, + { + "epoch": 0.6833529465013126, + "grad_norm": 0.8939554691314697, + "learning_rate": 2.875284653453708e-05, + "loss": 2.653, + "step": 7549 + }, + { + "epoch": 0.6834434688150629, + "grad_norm": 0.8086675405502319, + "learning_rate": 2.8730582862113742e-05, + "loss": 2.6412, + "step": 7550 + }, + { + "epoch": 0.6835339911288133, + "grad_norm": 0.8328844308853149, + "learning_rate": 2.870832636651869e-05, + "loss": 2.6295, + "step": 7551 + }, + { + "epoch": 0.6836245134425636, + "grad_norm": 0.8223507404327393, + "learning_rate": 2.8686077049993287e-05, + "loss": 2.6705, + "step": 7552 + }, + { + "epoch": 0.683715035756314, + "grad_norm": 0.8664432764053345, + "learning_rate": 2.8663834914777952e-05, + "loss": 2.6218, + "step": 7553 + }, + { + "epoch": 0.6838055580700643, + "grad_norm": 0.8756895661354065, + "learning_rate": 2.8641599963112488e-05, + "loss": 2.6909, + "step": 7554 + }, + { + "epoch": 0.6838960803838147, + "grad_norm": 0.8878546357154846, + "learning_rate": 2.861937219723595e-05, + "loss": 2.6313, + "step": 7555 + }, + { + "epoch": 0.683986602697565, + "grad_norm": 0.8262296319007874, + "learning_rate": 2.8597151619386707e-05, + "loss": 2.6287, + "step": 7556 + }, + { + "epoch": 0.6840771250113152, + "grad_norm": 0.8517236709594727, + "learning_rate": 2.8574938231802283e-05, + "loss": 2.6161, + "step": 7557 + }, + { + "epoch": 0.6841676473250656, + "grad_norm": 0.8476995229721069, + "learning_rate": 2.8552732036719687e-05, + "loss": 2.6321, + "step": 7558 + }, + { + "epoch": 0.6842581696388159, + "grad_norm": 0.8412792086601257, + "learning_rate": 2.8530533036374984e-05, + "loss": 2.6756, + "step": 7559 + }, + { + "epoch": 0.6843486919525663, + "grad_norm": 0.9044947624206543, + "learning_rate": 2.8508341233003654e-05, + "loss": 2.6733, + "step": 7560 + }, + { + "epoch": 0.6844392142663166, + "grad_norm": 0.8056501150131226, + "learning_rate": 2.848615662884041e-05, + "loss": 2.6212, + "step": 7561 + }, + { + "epoch": 0.684529736580067, + "grad_norm": 0.8448612689971924, + "learning_rate": 2.846397922611923e-05, + "loss": 2.6485, + "step": 7562 + }, + { + "epoch": 0.6846202588938173, + "grad_norm": 0.8401530385017395, + "learning_rate": 2.8441809027073386e-05, + "loss": 2.6439, + "step": 7563 + }, + { + "epoch": 0.6847107812075677, + "grad_norm": 0.846659243106842, + "learning_rate": 2.8419646033935444e-05, + "loss": 2.5858, + "step": 7564 + }, + { + "epoch": 0.684801303521318, + "grad_norm": 0.8078282475471497, + "learning_rate": 2.839749024893713e-05, + "loss": 2.6221, + "step": 7565 + }, + { + "epoch": 0.6848918258350684, + "grad_norm": 0.833645761013031, + "learning_rate": 2.8375341674309653e-05, + "loss": 2.6456, + "step": 7566 + }, + { + "epoch": 0.6849823481488186, + "grad_norm": 0.7956631779670715, + "learning_rate": 2.835320031228328e-05, + "loss": 2.6156, + "step": 7567 + }, + { + "epoch": 0.685072870462569, + "grad_norm": 0.8895859718322754, + "learning_rate": 2.833106616508766e-05, + "loss": 2.7153, + "step": 7568 + }, + { + "epoch": 0.6851633927763193, + "grad_norm": 0.8952557444572449, + "learning_rate": 2.8308939234951726e-05, + "loss": 2.6639, + "step": 7569 + }, + { + "epoch": 0.6852539150900697, + "grad_norm": 0.8234478235244751, + "learning_rate": 2.828681952410366e-05, + "loss": 2.6604, + "step": 7570 + }, + { + "epoch": 0.68534443740382, + "grad_norm": 0.7875887751579285, + "learning_rate": 2.826470703477089e-05, + "loss": 2.5946, + "step": 7571 + }, + { + "epoch": 0.6854349597175704, + "grad_norm": 0.7900510430335999, + "learning_rate": 2.82426017691802e-05, + "loss": 2.6325, + "step": 7572 + }, + { + "epoch": 0.6855254820313207, + "grad_norm": 0.8922678828239441, + "learning_rate": 2.822050372955748e-05, + "loss": 2.5736, + "step": 7573 + }, + { + "epoch": 0.6856160043450711, + "grad_norm": 0.8456141948699951, + "learning_rate": 2.8198412918128126e-05, + "loss": 2.7264, + "step": 7574 + }, + { + "epoch": 0.6857065266588214, + "grad_norm": 0.8549056053161621, + "learning_rate": 2.81763293371166e-05, + "loss": 2.6174, + "step": 7575 + }, + { + "epoch": 0.6857970489725718, + "grad_norm": 0.8470386266708374, + "learning_rate": 2.8154252988746755e-05, + "loss": 2.6436, + "step": 7576 + }, + { + "epoch": 0.685887571286322, + "grad_norm": 0.8629388809204102, + "learning_rate": 2.8132183875241657e-05, + "loss": 2.6735, + "step": 7577 + }, + { + "epoch": 0.6859780936000724, + "grad_norm": 0.9061052203178406, + "learning_rate": 2.811012199882368e-05, + "loss": 2.6579, + "step": 7578 + }, + { + "epoch": 0.6860686159138227, + "grad_norm": 0.8674964308738708, + "learning_rate": 2.8088067361714455e-05, + "loss": 2.6735, + "step": 7579 + }, + { + "epoch": 0.6861591382275731, + "grad_norm": 0.8112437725067139, + "learning_rate": 2.8066019966134904e-05, + "loss": 2.6401, + "step": 7580 + }, + { + "epoch": 0.6862496605413234, + "grad_norm": 0.8892813324928284, + "learning_rate": 2.8043979814305123e-05, + "loss": 2.6504, + "step": 7581 + }, + { + "epoch": 0.6863401828550738, + "grad_norm": 0.8235838413238525, + "learning_rate": 2.802194690844465e-05, + "loss": 2.6437, + "step": 7582 + }, + { + "epoch": 0.6864307051688241, + "grad_norm": 0.8243260383605957, + "learning_rate": 2.799992125077213e-05, + "loss": 2.6016, + "step": 7583 + }, + { + "epoch": 0.6865212274825745, + "grad_norm": 0.8342082500457764, + "learning_rate": 2.7977902843505578e-05, + "loss": 2.6386, + "step": 7584 + }, + { + "epoch": 0.6866117497963248, + "grad_norm": 0.8364400267601013, + "learning_rate": 2.7955891688862236e-05, + "loss": 2.6256, + "step": 7585 + }, + { + "epoch": 0.6867022721100752, + "grad_norm": 0.9203447103500366, + "learning_rate": 2.7933887789058622e-05, + "loss": 2.6949, + "step": 7586 + }, + { + "epoch": 0.6867927944238255, + "grad_norm": 0.9918256998062134, + "learning_rate": 2.7911891146310544e-05, + "loss": 2.6685, + "step": 7587 + }, + { + "epoch": 0.6868833167375759, + "grad_norm": 0.8644828796386719, + "learning_rate": 2.7889901762833083e-05, + "loss": 2.5957, + "step": 7588 + }, + { + "epoch": 0.6869738390513261, + "grad_norm": 0.8854060173034668, + "learning_rate": 2.786791964084048e-05, + "loss": 2.604, + "step": 7589 + }, + { + "epoch": 0.6870643613650765, + "grad_norm": 0.8530220985412598, + "learning_rate": 2.7845944782546453e-05, + "loss": 2.678, + "step": 7590 + }, + { + "epoch": 0.6871548836788268, + "grad_norm": 0.8707152009010315, + "learning_rate": 2.7823977190163786e-05, + "loss": 2.6487, + "step": 7591 + }, + { + "epoch": 0.6872454059925772, + "grad_norm": 0.8770884275436401, + "learning_rate": 2.7802016865904644e-05, + "loss": 2.6149, + "step": 7592 + }, + { + "epoch": 0.6873359283063275, + "grad_norm": 0.9286378026008606, + "learning_rate": 2.7780063811980417e-05, + "loss": 2.6784, + "step": 7593 + }, + { + "epoch": 0.6874264506200779, + "grad_norm": 0.8385622501373291, + "learning_rate": 2.775811803060183e-05, + "loss": 2.642, + "step": 7594 + }, + { + "epoch": 0.6875169729338282, + "grad_norm": 0.9218869805335999, + "learning_rate": 2.773617952397871e-05, + "loss": 2.6173, + "step": 7595 + }, + { + "epoch": 0.6876074952475786, + "grad_norm": 0.8965048789978027, + "learning_rate": 2.771424829432041e-05, + "loss": 2.6545, + "step": 7596 + }, + { + "epoch": 0.6876980175613289, + "grad_norm": 0.9724282622337341, + "learning_rate": 2.7692324343835286e-05, + "loss": 2.7086, + "step": 7597 + }, + { + "epoch": 0.6877885398750792, + "grad_norm": 0.9105857014656067, + "learning_rate": 2.7670407674731126e-05, + "loss": 2.6421, + "step": 7598 + }, + { + "epoch": 0.6878790621888295, + "grad_norm": 0.8620057106018066, + "learning_rate": 2.764849828921493e-05, + "loss": 2.6495, + "step": 7599 + }, + { + "epoch": 0.6879695845025798, + "grad_norm": 0.8677911162376404, + "learning_rate": 2.7626596189492983e-05, + "loss": 2.6579, + "step": 7600 + }, + { + "epoch": 0.6880601068163302, + "grad_norm": 0.8879269361495972, + "learning_rate": 2.7604701377770816e-05, + "loss": 2.6584, + "step": 7601 + }, + { + "epoch": 0.6881506291300805, + "grad_norm": 0.8561745882034302, + "learning_rate": 2.7582813856253275e-05, + "loss": 2.6416, + "step": 7602 + }, + { + "epoch": 0.6882411514438309, + "grad_norm": 0.8197829723358154, + "learning_rate": 2.7560933627144325e-05, + "loss": 2.6112, + "step": 7603 + }, + { + "epoch": 0.6883316737575812, + "grad_norm": 0.8608514666557312, + "learning_rate": 2.7539060692647446e-05, + "loss": 2.6194, + "step": 7604 + }, + { + "epoch": 0.6884221960713316, + "grad_norm": 0.8288963437080383, + "learning_rate": 2.7517195054965138e-05, + "loss": 2.5699, + "step": 7605 + }, + { + "epoch": 0.6885127183850819, + "grad_norm": 0.8938080072402954, + "learning_rate": 2.7495336716299313e-05, + "loss": 2.7042, + "step": 7606 + }, + { + "epoch": 0.6886032406988323, + "grad_norm": 0.869457483291626, + "learning_rate": 2.7473485678851086e-05, + "loss": 2.6793, + "step": 7607 + }, + { + "epoch": 0.6886937630125826, + "grad_norm": 0.8588059544563293, + "learning_rate": 2.745164194482087e-05, + "loss": 2.6752, + "step": 7608 + }, + { + "epoch": 0.688784285326333, + "grad_norm": 0.8931800127029419, + "learning_rate": 2.7429805516408337e-05, + "loss": 2.6063, + "step": 7609 + }, + { + "epoch": 0.6888748076400832, + "grad_norm": 0.9317516684532166, + "learning_rate": 2.7407976395812418e-05, + "loss": 2.6434, + "step": 7610 + }, + { + "epoch": 0.6889653299538336, + "grad_norm": 0.9000864028930664, + "learning_rate": 2.738615458523126e-05, + "loss": 2.646, + "step": 7611 + }, + { + "epoch": 0.6890558522675839, + "grad_norm": 0.8780214190483093, + "learning_rate": 2.736434008686235e-05, + "loss": 2.5954, + "step": 7612 + }, + { + "epoch": 0.6891463745813343, + "grad_norm": 0.8571871519088745, + "learning_rate": 2.734253290290242e-05, + "loss": 2.6304, + "step": 7613 + }, + { + "epoch": 0.6892368968950846, + "grad_norm": 0.8072302937507629, + "learning_rate": 2.732073303554742e-05, + "loss": 2.6923, + "step": 7614 + }, + { + "epoch": 0.689327419208835, + "grad_norm": 0.8645291924476624, + "learning_rate": 2.729894048699265e-05, + "loss": 2.7113, + "step": 7615 + }, + { + "epoch": 0.6894179415225853, + "grad_norm": 0.8110138773918152, + "learning_rate": 2.727715525943253e-05, + "loss": 2.6095, + "step": 7616 + }, + { + "epoch": 0.6895084638363357, + "grad_norm": 0.8943793773651123, + "learning_rate": 2.725537735506094e-05, + "loss": 2.6108, + "step": 7617 + }, + { + "epoch": 0.689598986150086, + "grad_norm": 0.844049334526062, + "learning_rate": 2.723360677607083e-05, + "loss": 2.6082, + "step": 7618 + }, + { + "epoch": 0.6896895084638364, + "grad_norm": 0.808815062046051, + "learning_rate": 2.7211843524654535e-05, + "loss": 2.6504, + "step": 7619 + }, + { + "epoch": 0.6897800307775867, + "grad_norm": 0.8212969899177551, + "learning_rate": 2.719008760300359e-05, + "loss": 2.6516, + "step": 7620 + }, + { + "epoch": 0.689870553091337, + "grad_norm": 0.8447639346122742, + "learning_rate": 2.7168339013308875e-05, + "loss": 2.6996, + "step": 7621 + }, + { + "epoch": 0.6899610754050873, + "grad_norm": 0.9643140435218811, + "learning_rate": 2.714659775776036e-05, + "loss": 2.6779, + "step": 7622 + }, + { + "epoch": 0.6900515977188377, + "grad_norm": 0.8468511700630188, + "learning_rate": 2.712486383854752e-05, + "loss": 2.6389, + "step": 7623 + }, + { + "epoch": 0.690142120032588, + "grad_norm": 0.8627493977546692, + "learning_rate": 2.7103137257858868e-05, + "loss": 2.6873, + "step": 7624 + }, + { + "epoch": 0.6902326423463384, + "grad_norm": 0.9205848574638367, + "learning_rate": 2.70814180178823e-05, + "loss": 2.6157, + "step": 7625 + }, + { + "epoch": 0.6903231646600887, + "grad_norm": 0.857008695602417, + "learning_rate": 2.705970612080494e-05, + "loss": 2.6498, + "step": 7626 + }, + { + "epoch": 0.6904136869738391, + "grad_norm": 0.8849743604660034, + "learning_rate": 2.7038001568813175e-05, + "loss": 2.6083, + "step": 7627 + }, + { + "epoch": 0.6905042092875894, + "grad_norm": 0.8480933904647827, + "learning_rate": 2.701630436409267e-05, + "loss": 2.6904, + "step": 7628 + }, + { + "epoch": 0.6905947316013398, + "grad_norm": 0.8617257475852966, + "learning_rate": 2.699461450882833e-05, + "loss": 2.6551, + "step": 7629 + }, + { + "epoch": 0.6906852539150901, + "grad_norm": 0.9328704476356506, + "learning_rate": 2.6972932005204267e-05, + "loss": 2.6988, + "step": 7630 + }, + { + "epoch": 0.6907757762288405, + "grad_norm": 0.8302156329154968, + "learning_rate": 2.6951256855404005e-05, + "loss": 2.574, + "step": 7631 + }, + { + "epoch": 0.6908662985425907, + "grad_norm": 0.8471900820732117, + "learning_rate": 2.692958906161015e-05, + "loss": 2.662, + "step": 7632 + }, + { + "epoch": 0.6909568208563411, + "grad_norm": 0.8453875184059143, + "learning_rate": 2.6907928626004674e-05, + "loss": 2.6366, + "step": 7633 + }, + { + "epoch": 0.6910473431700914, + "grad_norm": 0.8545379638671875, + "learning_rate": 2.688627555076879e-05, + "loss": 2.5942, + "step": 7634 + }, + { + "epoch": 0.6911378654838418, + "grad_norm": 0.8129159808158875, + "learning_rate": 2.6864629838082956e-05, + "loss": 2.6947, + "step": 7635 + }, + { + "epoch": 0.6912283877975921, + "grad_norm": 0.8285556435585022, + "learning_rate": 2.68429914901269e-05, + "loss": 2.6357, + "step": 7636 + }, + { + "epoch": 0.6913189101113425, + "grad_norm": 0.8780727386474609, + "learning_rate": 2.682136050907962e-05, + "loss": 2.642, + "step": 7637 + }, + { + "epoch": 0.6914094324250928, + "grad_norm": 0.9794698357582092, + "learning_rate": 2.6799736897119278e-05, + "loss": 2.7138, + "step": 7638 + }, + { + "epoch": 0.6914999547388431, + "grad_norm": 0.815741240978241, + "learning_rate": 2.677812065642349e-05, + "loss": 2.6004, + "step": 7639 + }, + { + "epoch": 0.6915904770525935, + "grad_norm": 0.84134441614151, + "learning_rate": 2.6756511789168925e-05, + "loss": 2.6911, + "step": 7640 + }, + { + "epoch": 0.6916809993663438, + "grad_norm": 1.0214581489562988, + "learning_rate": 2.6734910297531613e-05, + "loss": 2.5947, + "step": 7641 + }, + { + "epoch": 0.6917715216800941, + "grad_norm": 0.9078264832496643, + "learning_rate": 2.671331618368682e-05, + "loss": 2.6658, + "step": 7642 + }, + { + "epoch": 0.6918620439938444, + "grad_norm": 0.8683854341506958, + "learning_rate": 2.6691729449809088e-05, + "loss": 2.6305, + "step": 7643 + }, + { + "epoch": 0.6919525663075948, + "grad_norm": 0.9419749975204468, + "learning_rate": 2.66701500980722e-05, + "loss": 2.6673, + "step": 7644 + }, + { + "epoch": 0.6920430886213451, + "grad_norm": 0.9228997230529785, + "learning_rate": 2.6648578130649215e-05, + "loss": 2.6446, + "step": 7645 + }, + { + "epoch": 0.6921336109350955, + "grad_norm": 0.839359700679779, + "learning_rate": 2.6627013549712355e-05, + "loss": 2.633, + "step": 7646 + }, + { + "epoch": 0.6922241332488458, + "grad_norm": 0.8543779850006104, + "learning_rate": 2.6605456357433268e-05, + "loss": 2.619, + "step": 7647 + }, + { + "epoch": 0.6923146555625962, + "grad_norm": 0.9176219701766968, + "learning_rate": 2.6583906555982697e-05, + "loss": 2.6784, + "step": 7648 + }, + { + "epoch": 0.6924051778763465, + "grad_norm": 0.8997385501861572, + "learning_rate": 2.6562364147530727e-05, + "loss": 2.7032, + "step": 7649 + }, + { + "epoch": 0.6924957001900969, + "grad_norm": 0.868236243724823, + "learning_rate": 2.654082913424668e-05, + "loss": 2.6488, + "step": 7650 + }, + { + "epoch": 0.6925862225038472, + "grad_norm": 0.8935613036155701, + "learning_rate": 2.6519301518299157e-05, + "loss": 2.6575, + "step": 7651 + }, + { + "epoch": 0.6926767448175976, + "grad_norm": 0.895734965801239, + "learning_rate": 2.6497781301855895e-05, + "loss": 2.6495, + "step": 7652 + }, + { + "epoch": 0.6927672671313478, + "grad_norm": 0.9116073250770569, + "learning_rate": 2.647626848708411e-05, + "loss": 2.6608, + "step": 7653 + }, + { + "epoch": 0.6928577894450982, + "grad_norm": 0.9018348455429077, + "learning_rate": 2.6454763076150046e-05, + "loss": 2.7109, + "step": 7654 + }, + { + "epoch": 0.6929483117588485, + "grad_norm": 0.83049076795578, + "learning_rate": 2.6433265071219327e-05, + "loss": 2.6045, + "step": 7655 + }, + { + "epoch": 0.6930388340725989, + "grad_norm": 0.8695515394210815, + "learning_rate": 2.6411774474456797e-05, + "loss": 2.6627, + "step": 7656 + }, + { + "epoch": 0.6931293563863492, + "grad_norm": 0.8901898860931396, + "learning_rate": 2.639029128802657e-05, + "loss": 2.5905, + "step": 7657 + }, + { + "epoch": 0.6932198787000996, + "grad_norm": 0.8204820156097412, + "learning_rate": 2.6368815514091992e-05, + "loss": 2.6049, + "step": 7658 + }, + { + "epoch": 0.6933104010138499, + "grad_norm": 0.8440626263618469, + "learning_rate": 2.6347347154815715e-05, + "loss": 2.6364, + "step": 7659 + }, + { + "epoch": 0.6934009233276003, + "grad_norm": 0.8276053071022034, + "learning_rate": 2.6325886212359498e-05, + "loss": 2.61, + "step": 7660 + }, + { + "epoch": 0.6934914456413506, + "grad_norm": 0.8775001764297485, + "learning_rate": 2.6304432688884584e-05, + "loss": 2.691, + "step": 7661 + }, + { + "epoch": 0.693581967955101, + "grad_norm": 0.7714978456497192, + "learning_rate": 2.6282986586551262e-05, + "loss": 2.6292, + "step": 7662 + }, + { + "epoch": 0.6936724902688512, + "grad_norm": 0.8272567987442017, + "learning_rate": 2.6261547907519158e-05, + "loss": 2.6549, + "step": 7663 + }, + { + "epoch": 0.6937630125826016, + "grad_norm": 0.8909174203872681, + "learning_rate": 2.6240116653947166e-05, + "loss": 2.5959, + "step": 7664 + }, + { + "epoch": 0.6938535348963519, + "grad_norm": 0.8319750428199768, + "learning_rate": 2.6218692827993417e-05, + "loss": 2.6633, + "step": 7665 + }, + { + "epoch": 0.6939440572101023, + "grad_norm": 0.836725652217865, + "learning_rate": 2.6197276431815277e-05, + "loss": 2.6459, + "step": 7666 + }, + { + "epoch": 0.6940345795238526, + "grad_norm": 0.8594474196434021, + "learning_rate": 2.6175867467569405e-05, + "loss": 2.6501, + "step": 7667 + }, + { + "epoch": 0.694125101837603, + "grad_norm": 0.8767529129981995, + "learning_rate": 2.615446593741161e-05, + "loss": 2.5987, + "step": 7668 + }, + { + "epoch": 0.6942156241513533, + "grad_norm": 0.8602820038795471, + "learning_rate": 2.6133071843497126e-05, + "loss": 2.6252, + "step": 7669 + }, + { + "epoch": 0.6943061464651037, + "grad_norm": 0.9225953221321106, + "learning_rate": 2.6111685187980262e-05, + "loss": 2.6669, + "step": 7670 + }, + { + "epoch": 0.694396668778854, + "grad_norm": 0.8112370371818542, + "learning_rate": 2.6090305973014672e-05, + "loss": 2.6329, + "step": 7671 + }, + { + "epoch": 0.6944871910926044, + "grad_norm": 0.8147202134132385, + "learning_rate": 2.606893420075325e-05, + "loss": 2.6265, + "step": 7672 + }, + { + "epoch": 0.6945777134063547, + "grad_norm": 0.8251135349273682, + "learning_rate": 2.6047569873348133e-05, + "loss": 2.6144, + "step": 7673 + }, + { + "epoch": 0.694668235720105, + "grad_norm": 0.850727915763855, + "learning_rate": 2.6026212992950704e-05, + "loss": 2.6623, + "step": 7674 + }, + { + "epoch": 0.6947587580338553, + "grad_norm": 0.9017343521118164, + "learning_rate": 2.6004863561711635e-05, + "loss": 2.6588, + "step": 7675 + }, + { + "epoch": 0.6948492803476057, + "grad_norm": 0.8735215067863464, + "learning_rate": 2.5983521581780724e-05, + "loss": 2.6361, + "step": 7676 + }, + { + "epoch": 0.694939802661356, + "grad_norm": 0.7911974787712097, + "learning_rate": 2.5962187055307218e-05, + "loss": 2.5933, + "step": 7677 + }, + { + "epoch": 0.6950303249751064, + "grad_norm": 0.9292235970497131, + "learning_rate": 2.5940859984439424e-05, + "loss": 2.6769, + "step": 7678 + }, + { + "epoch": 0.6951208472888567, + "grad_norm": 0.8577764630317688, + "learning_rate": 2.5919540371325e-05, + "loss": 2.6119, + "step": 7679 + }, + { + "epoch": 0.695211369602607, + "grad_norm": 0.8263047933578491, + "learning_rate": 2.589822821811083e-05, + "loss": 2.6231, + "step": 7680 + }, + { + "epoch": 0.6953018919163574, + "grad_norm": 0.825162947177887, + "learning_rate": 2.5876923526943055e-05, + "loss": 2.6699, + "step": 7681 + }, + { + "epoch": 0.6953924142301077, + "grad_norm": 0.8402916193008423, + "learning_rate": 2.585562629996705e-05, + "loss": 2.6313, + "step": 7682 + }, + { + "epoch": 0.6954829365438581, + "grad_norm": 0.832701563835144, + "learning_rate": 2.5834336539327486e-05, + "loss": 2.6451, + "step": 7683 + }, + { + "epoch": 0.6955734588576084, + "grad_norm": 0.8593753576278687, + "learning_rate": 2.5813054247168167e-05, + "loss": 2.6064, + "step": 7684 + }, + { + "epoch": 0.6956639811713587, + "grad_norm": 0.8329028487205505, + "learning_rate": 2.5791779425632255e-05, + "loss": 2.6614, + "step": 7685 + }, + { + "epoch": 0.695754503485109, + "grad_norm": 0.8595302700996399, + "learning_rate": 2.577051207686213e-05, + "loss": 2.5816, + "step": 7686 + }, + { + "epoch": 0.6958450257988594, + "grad_norm": 0.8601256012916565, + "learning_rate": 2.5749252202999408e-05, + "loss": 2.6177, + "step": 7687 + }, + { + "epoch": 0.6959355481126097, + "grad_norm": 0.8946163654327393, + "learning_rate": 2.5727999806184945e-05, + "loss": 2.6212, + "step": 7688 + }, + { + "epoch": 0.6960260704263601, + "grad_norm": 0.8692400455474854, + "learning_rate": 2.5706754888558914e-05, + "loss": 2.612, + "step": 7689 + }, + { + "epoch": 0.6961165927401104, + "grad_norm": 0.8570036888122559, + "learning_rate": 2.5685517452260567e-05, + "loss": 2.6734, + "step": 7690 + }, + { + "epoch": 0.6962071150538608, + "grad_norm": 0.854727566242218, + "learning_rate": 2.5664287499428642e-05, + "loss": 2.6159, + "step": 7691 + }, + { + "epoch": 0.6962976373676111, + "grad_norm": 0.8037676215171814, + "learning_rate": 2.5643065032200895e-05, + "loss": 2.6414, + "step": 7692 + }, + { + "epoch": 0.6963881596813615, + "grad_norm": 0.8172332644462585, + "learning_rate": 2.5621850052714468e-05, + "loss": 2.6629, + "step": 7693 + }, + { + "epoch": 0.6964786819951118, + "grad_norm": 0.9130670428276062, + "learning_rate": 2.5600642563105703e-05, + "loss": 2.6594, + "step": 7694 + }, + { + "epoch": 0.6965692043088622, + "grad_norm": 0.8906755447387695, + "learning_rate": 2.5579442565510203e-05, + "loss": 2.6459, + "step": 7695 + }, + { + "epoch": 0.6966597266226124, + "grad_norm": 0.8938328623771667, + "learning_rate": 2.5558250062062828e-05, + "loss": 2.664, + "step": 7696 + }, + { + "epoch": 0.6967502489363628, + "grad_norm": 1.0191668272018433, + "learning_rate": 2.5537065054897602e-05, + "loss": 2.6972, + "step": 7697 + }, + { + "epoch": 0.6968407712501131, + "grad_norm": 0.921165943145752, + "learning_rate": 2.5515887546147886e-05, + "loss": 2.6141, + "step": 7698 + }, + { + "epoch": 0.6969312935638635, + "grad_norm": 0.8833820819854736, + "learning_rate": 2.5494717537946256e-05, + "loss": 2.6725, + "step": 7699 + }, + { + "epoch": 0.6970218158776138, + "grad_norm": 0.8970410227775574, + "learning_rate": 2.5473555032424533e-05, + "loss": 2.6716, + "step": 7700 + }, + { + "epoch": 0.6971123381913642, + "grad_norm": 0.8503276705741882, + "learning_rate": 2.5452400031713785e-05, + "loss": 2.6465, + "step": 7701 + }, + { + "epoch": 0.6972028605051145, + "grad_norm": 0.9117843508720398, + "learning_rate": 2.543125253794434e-05, + "loss": 2.6583, + "step": 7702 + }, + { + "epoch": 0.6972933828188649, + "grad_norm": 0.8610855937004089, + "learning_rate": 2.5410112553245657e-05, + "loss": 2.5539, + "step": 7703 + }, + { + "epoch": 0.6973839051326152, + "grad_norm": 0.8850502371788025, + "learning_rate": 2.538898007974667e-05, + "loss": 2.6254, + "step": 7704 + }, + { + "epoch": 0.6974744274463656, + "grad_norm": 0.9232295751571655, + "learning_rate": 2.536785511957531e-05, + "loss": 2.6897, + "step": 7705 + }, + { + "epoch": 0.6975649497601158, + "grad_norm": 0.8219690918922424, + "learning_rate": 2.53467376748589e-05, + "loss": 2.6201, + "step": 7706 + }, + { + "epoch": 0.6976554720738662, + "grad_norm": 0.8805753588676453, + "learning_rate": 2.532562774772397e-05, + "loss": 2.5956, + "step": 7707 + }, + { + "epoch": 0.6977459943876165, + "grad_norm": 0.8470893502235413, + "learning_rate": 2.530452534029627e-05, + "loss": 2.6556, + "step": 7708 + }, + { + "epoch": 0.6978365167013669, + "grad_norm": 0.8256682753562927, + "learning_rate": 2.528343045470083e-05, + "loss": 2.6603, + "step": 7709 + }, + { + "epoch": 0.6979270390151172, + "grad_norm": 0.8292689323425293, + "learning_rate": 2.5262343093061936e-05, + "loss": 2.6241, + "step": 7710 + }, + { + "epoch": 0.6980175613288676, + "grad_norm": 0.8731260299682617, + "learning_rate": 2.5241263257502977e-05, + "loss": 2.6633, + "step": 7711 + }, + { + "epoch": 0.6981080836426179, + "grad_norm": 0.9130814075469971, + "learning_rate": 2.5220190950146827e-05, + "loss": 2.6724, + "step": 7712 + }, + { + "epoch": 0.6981986059563683, + "grad_norm": 0.8607717752456665, + "learning_rate": 2.5199126173115372e-05, + "loss": 2.6518, + "step": 7713 + }, + { + "epoch": 0.6982891282701186, + "grad_norm": 0.8219232559204102, + "learning_rate": 2.5178068928529864e-05, + "loss": 2.647, + "step": 7714 + }, + { + "epoch": 0.698379650583869, + "grad_norm": 0.8676030039787292, + "learning_rate": 2.515701921851077e-05, + "loss": 2.6465, + "step": 7715 + }, + { + "epoch": 0.6984701728976193, + "grad_norm": 0.8603833317756653, + "learning_rate": 2.5135977045177815e-05, + "loss": 2.6908, + "step": 7716 + }, + { + "epoch": 0.6985606952113697, + "grad_norm": 0.8424780368804932, + "learning_rate": 2.5114942410649878e-05, + "loss": 2.6428, + "step": 7717 + }, + { + "epoch": 0.6986512175251199, + "grad_norm": 0.8137457370758057, + "learning_rate": 2.5093915317045237e-05, + "loss": 2.723, + "step": 7718 + }, + { + "epoch": 0.6987417398388703, + "grad_norm": 0.8049132227897644, + "learning_rate": 2.5072895766481254e-05, + "loss": 2.668, + "step": 7719 + }, + { + "epoch": 0.6988322621526206, + "grad_norm": 0.8493908047676086, + "learning_rate": 2.5051883761074614e-05, + "loss": 2.5841, + "step": 7720 + }, + { + "epoch": 0.6989227844663709, + "grad_norm": 0.949332058429718, + "learning_rate": 2.5030879302941222e-05, + "loss": 2.6522, + "step": 7721 + }, + { + "epoch": 0.6990133067801213, + "grad_norm": 0.8372514247894287, + "learning_rate": 2.500988239419624e-05, + "loss": 2.569, + "step": 7722 + }, + { + "epoch": 0.6991038290938716, + "grad_norm": 0.9176880717277527, + "learning_rate": 2.4988893036954043e-05, + "loss": 2.6696, + "step": 7723 + }, + { + "epoch": 0.699194351407622, + "grad_norm": 0.7949821352958679, + "learning_rate": 2.4967911233328302e-05, + "loss": 2.6849, + "step": 7724 + }, + { + "epoch": 0.6992848737213723, + "grad_norm": 0.9147369861602783, + "learning_rate": 2.494693698543179e-05, + "loss": 2.6272, + "step": 7725 + }, + { + "epoch": 0.6993753960351227, + "grad_norm": 0.8446216583251953, + "learning_rate": 2.4925970295376722e-05, + "loss": 2.6242, + "step": 7726 + }, + { + "epoch": 0.699465918348873, + "grad_norm": 0.8607931137084961, + "learning_rate": 2.4905011165274363e-05, + "loss": 2.6809, + "step": 7727 + }, + { + "epoch": 0.6995564406626233, + "grad_norm": 0.9009124636650085, + "learning_rate": 2.488405959723532e-05, + "loss": 2.6458, + "step": 7728 + }, + { + "epoch": 0.6996469629763736, + "grad_norm": 0.8539814949035645, + "learning_rate": 2.4863115593369423e-05, + "loss": 2.6098, + "step": 7729 + }, + { + "epoch": 0.699737485290124, + "grad_norm": 0.9067366123199463, + "learning_rate": 2.4842179155785737e-05, + "loss": 2.5878, + "step": 7730 + }, + { + "epoch": 0.6998280076038743, + "grad_norm": 0.8943397998809814, + "learning_rate": 2.4821250286592546e-05, + "loss": 2.6324, + "step": 7731 + }, + { + "epoch": 0.6999185299176247, + "grad_norm": 0.8534390926361084, + "learning_rate": 2.4800328987897427e-05, + "loss": 2.6712, + "step": 7732 + }, + { + "epoch": 0.700009052231375, + "grad_norm": 0.8815813064575195, + "learning_rate": 2.477941526180706e-05, + "loss": 2.732, + "step": 7733 + }, + { + "epoch": 0.7000995745451254, + "grad_norm": 0.8730425834655762, + "learning_rate": 2.4758509110427575e-05, + "loss": 2.6302, + "step": 7734 + }, + { + "epoch": 0.7001900968588757, + "grad_norm": 0.7939863204956055, + "learning_rate": 2.473761053586414e-05, + "loss": 2.578, + "step": 7735 + }, + { + "epoch": 0.7002806191726261, + "grad_norm": 0.8331583738327026, + "learning_rate": 2.4716719540221268e-05, + "loss": 2.663, + "step": 7736 + }, + { + "epoch": 0.7003711414863764, + "grad_norm": 0.8007463216781616, + "learning_rate": 2.4695836125602667e-05, + "loss": 2.5627, + "step": 7737 + }, + { + "epoch": 0.7004616638001268, + "grad_norm": 0.7924402952194214, + "learning_rate": 2.4674960294111314e-05, + "loss": 2.6149, + "step": 7738 + }, + { + "epoch": 0.700552186113877, + "grad_norm": 0.8849868178367615, + "learning_rate": 2.46540920478494e-05, + "loss": 2.6322, + "step": 7739 + }, + { + "epoch": 0.7006427084276274, + "grad_norm": 0.9710038900375366, + "learning_rate": 2.4633231388918378e-05, + "loss": 2.614, + "step": 7740 + }, + { + "epoch": 0.7007332307413777, + "grad_norm": 0.8998742699623108, + "learning_rate": 2.461237831941884e-05, + "loss": 2.6947, + "step": 7741 + }, + { + "epoch": 0.7008237530551281, + "grad_norm": 0.7999228239059448, + "learning_rate": 2.4591532841450804e-05, + "loss": 2.6198, + "step": 7742 + }, + { + "epoch": 0.7009142753688784, + "grad_norm": 0.9255106449127197, + "learning_rate": 2.4570694957113304e-05, + "loss": 2.6402, + "step": 7743 + }, + { + "epoch": 0.7010047976826288, + "grad_norm": 0.8379217982292175, + "learning_rate": 2.4549864668504774e-05, + "loss": 2.6122, + "step": 7744 + }, + { + "epoch": 0.7010953199963791, + "grad_norm": 0.8677035570144653, + "learning_rate": 2.45290419777228e-05, + "loss": 2.6151, + "step": 7745 + }, + { + "epoch": 0.7011858423101295, + "grad_norm": 0.888058602809906, + "learning_rate": 2.450822688686426e-05, + "loss": 2.6339, + "step": 7746 + }, + { + "epoch": 0.7012763646238798, + "grad_norm": 0.825615644454956, + "learning_rate": 2.4487419398025157e-05, + "loss": 2.6411, + "step": 7747 + }, + { + "epoch": 0.7013668869376302, + "grad_norm": 0.8349601626396179, + "learning_rate": 2.4466619513300915e-05, + "loss": 2.6204, + "step": 7748 + }, + { + "epoch": 0.7014574092513804, + "grad_norm": 0.8351013660430908, + "learning_rate": 2.444582723478598e-05, + "loss": 2.6293, + "step": 7749 + }, + { + "epoch": 0.7015479315651308, + "grad_norm": 0.8708988428115845, + "learning_rate": 2.4425042564574184e-05, + "loss": 2.6312, + "step": 7750 + }, + { + "epoch": 0.7016384538788811, + "grad_norm": 0.8427244424819946, + "learning_rate": 2.4404265504758528e-05, + "loss": 2.6313, + "step": 7751 + }, + { + "epoch": 0.7017289761926315, + "grad_norm": 0.821071445941925, + "learning_rate": 2.4383496057431267e-05, + "loss": 2.6344, + "step": 7752 + }, + { + "epoch": 0.7018194985063818, + "grad_norm": 0.8288323283195496, + "learning_rate": 2.436273422468389e-05, + "loss": 2.6508, + "step": 7753 + }, + { + "epoch": 0.7019100208201322, + "grad_norm": 0.8814200758934021, + "learning_rate": 2.4341980008607136e-05, + "loss": 2.635, + "step": 7754 + }, + { + "epoch": 0.7020005431338825, + "grad_norm": 0.9194662570953369, + "learning_rate": 2.4321233411290868e-05, + "loss": 2.5983, + "step": 7755 + }, + { + "epoch": 0.7020910654476329, + "grad_norm": 0.820624589920044, + "learning_rate": 2.4300494434824373e-05, + "loss": 2.6291, + "step": 7756 + }, + { + "epoch": 0.7021815877613832, + "grad_norm": 0.8468902707099915, + "learning_rate": 2.4279763081295993e-05, + "loss": 2.6783, + "step": 7757 + }, + { + "epoch": 0.7022721100751336, + "grad_norm": 0.8257344365119934, + "learning_rate": 2.42590393527934e-05, + "loss": 2.5901, + "step": 7758 + }, + { + "epoch": 0.7023626323888839, + "grad_norm": 0.8577067852020264, + "learning_rate": 2.4238323251403473e-05, + "loss": 2.6071, + "step": 7759 + }, + { + "epoch": 0.7024531547026343, + "grad_norm": 0.8569862246513367, + "learning_rate": 2.4217614779212315e-05, + "loss": 2.6367, + "step": 7760 + }, + { + "epoch": 0.7025436770163845, + "grad_norm": 0.9399639964103699, + "learning_rate": 2.419691393830529e-05, + "loss": 2.5941, + "step": 7761 + }, + { + "epoch": 0.7026341993301348, + "grad_norm": 0.8270482420921326, + "learning_rate": 2.4176220730766974e-05, + "loss": 2.6653, + "step": 7762 + }, + { + "epoch": 0.7027247216438852, + "grad_norm": 0.8636314868927002, + "learning_rate": 2.4155535158681098e-05, + "loss": 2.693, + "step": 7763 + }, + { + "epoch": 0.7028152439576355, + "grad_norm": 0.9459649920463562, + "learning_rate": 2.4134857224130814e-05, + "loss": 2.648, + "step": 7764 + }, + { + "epoch": 0.7029057662713859, + "grad_norm": 0.9734020829200745, + "learning_rate": 2.4114186929198313e-05, + "loss": 2.6417, + "step": 7765 + }, + { + "epoch": 0.7029962885851362, + "grad_norm": 0.8456811904907227, + "learning_rate": 2.4093524275965097e-05, + "loss": 2.6391, + "step": 7766 + }, + { + "epoch": 0.7030868108988866, + "grad_norm": 0.8550230860710144, + "learning_rate": 2.407286926651192e-05, + "loss": 2.6139, + "step": 7767 + }, + { + "epoch": 0.7031773332126369, + "grad_norm": 0.8369646668434143, + "learning_rate": 2.4052221902918725e-05, + "loss": 2.6357, + "step": 7768 + }, + { + "epoch": 0.7032678555263873, + "grad_norm": 0.8603420853614807, + "learning_rate": 2.4031582187264713e-05, + "loss": 2.6607, + "step": 7769 + }, + { + "epoch": 0.7033583778401375, + "grad_norm": 0.9332559108734131, + "learning_rate": 2.4010950121628318e-05, + "loss": 2.6348, + "step": 7770 + }, + { + "epoch": 0.703448900153888, + "grad_norm": 0.7975991368293762, + "learning_rate": 2.3990325708087114e-05, + "loss": 2.621, + "step": 7771 + }, + { + "epoch": 0.7035394224676382, + "grad_norm": 0.816978931427002, + "learning_rate": 2.3969708948718084e-05, + "loss": 2.5824, + "step": 7772 + }, + { + "epoch": 0.7036299447813886, + "grad_norm": 0.9127275347709656, + "learning_rate": 2.3949099845597255e-05, + "loss": 2.6732, + "step": 7773 + }, + { + "epoch": 0.7037204670951389, + "grad_norm": 0.8123887181282043, + "learning_rate": 2.3928498400799993e-05, + "loss": 2.6204, + "step": 7774 + }, + { + "epoch": 0.7038109894088893, + "grad_norm": 0.871410608291626, + "learning_rate": 2.3907904616400854e-05, + "loss": 2.6454, + "step": 7775 + }, + { + "epoch": 0.7039015117226396, + "grad_norm": 0.8844181895256042, + "learning_rate": 2.3887318494473677e-05, + "loss": 2.6197, + "step": 7776 + }, + { + "epoch": 0.70399203403639, + "grad_norm": 0.845853865146637, + "learning_rate": 2.3866740037091386e-05, + "loss": 2.6927, + "step": 7777 + }, + { + "epoch": 0.7040825563501403, + "grad_norm": 0.8261255621910095, + "learning_rate": 2.3846169246326343e-05, + "loss": 2.666, + "step": 7778 + }, + { + "epoch": 0.7041730786638907, + "grad_norm": 0.8859426379203796, + "learning_rate": 2.3825606124249942e-05, + "loss": 2.5974, + "step": 7779 + }, + { + "epoch": 0.704263600977641, + "grad_norm": 0.8192964792251587, + "learning_rate": 2.3805050672932928e-05, + "loss": 2.6081, + "step": 7780 + }, + { + "epoch": 0.7043541232913914, + "grad_norm": 0.7949223518371582, + "learning_rate": 2.3784502894445214e-05, + "loss": 2.6564, + "step": 7781 + }, + { + "epoch": 0.7044446456051416, + "grad_norm": 0.8724743723869324, + "learning_rate": 2.376396279085599e-05, + "loss": 2.5728, + "step": 7782 + }, + { + "epoch": 0.704535167918892, + "grad_norm": 0.881767749786377, + "learning_rate": 2.3743430364233643e-05, + "loss": 2.6532, + "step": 7783 + }, + { + "epoch": 0.7046256902326423, + "grad_norm": 0.8147710561752319, + "learning_rate": 2.3722905616645752e-05, + "loss": 2.601, + "step": 7784 + }, + { + "epoch": 0.7047162125463927, + "grad_norm": 0.9021417498588562, + "learning_rate": 2.370238855015917e-05, + "loss": 2.6329, + "step": 7785 + }, + { + "epoch": 0.704806734860143, + "grad_norm": 0.8347455263137817, + "learning_rate": 2.368187916683997e-05, + "loss": 2.5916, + "step": 7786 + }, + { + "epoch": 0.7048972571738934, + "grad_norm": 0.8550873398780823, + "learning_rate": 2.3661377468753465e-05, + "loss": 2.6595, + "step": 7787 + }, + { + "epoch": 0.7049877794876437, + "grad_norm": 0.8691414594650269, + "learning_rate": 2.3640883457964148e-05, + "loss": 2.6136, + "step": 7788 + }, + { + "epoch": 0.7050783018013941, + "grad_norm": 0.8594337105751038, + "learning_rate": 2.362039713653581e-05, + "loss": 2.6274, + "step": 7789 + }, + { + "epoch": 0.7051688241151444, + "grad_norm": 0.9055542349815369, + "learning_rate": 2.3599918506531337e-05, + "loss": 2.657, + "step": 7790 + }, + { + "epoch": 0.7052593464288948, + "grad_norm": 0.9172478914260864, + "learning_rate": 2.3579447570013026e-05, + "loss": 2.7091, + "step": 7791 + }, + { + "epoch": 0.705349868742645, + "grad_norm": 0.896195650100708, + "learning_rate": 2.3558984329042233e-05, + "loss": 2.6851, + "step": 7792 + }, + { + "epoch": 0.7054403910563954, + "grad_norm": 0.8319085836410522, + "learning_rate": 2.3538528785679625e-05, + "loss": 2.5979, + "step": 7793 + }, + { + "epoch": 0.7055309133701457, + "grad_norm": 0.8286898732185364, + "learning_rate": 2.3518080941985076e-05, + "loss": 2.6195, + "step": 7794 + }, + { + "epoch": 0.7056214356838961, + "grad_norm": 0.8543409705162048, + "learning_rate": 2.3497640800017685e-05, + "loss": 2.6256, + "step": 7795 + }, + { + "epoch": 0.7057119579976464, + "grad_norm": 0.8427583575248718, + "learning_rate": 2.347720836183578e-05, + "loss": 2.6666, + "step": 7796 + }, + { + "epoch": 0.7058024803113968, + "grad_norm": 0.8994350433349609, + "learning_rate": 2.3456783629496915e-05, + "loss": 2.641, + "step": 7797 + }, + { + "epoch": 0.7058930026251471, + "grad_norm": 0.862696647644043, + "learning_rate": 2.343636660505779e-05, + "loss": 2.6623, + "step": 7798 + }, + { + "epoch": 0.7059835249388975, + "grad_norm": 0.9399603605270386, + "learning_rate": 2.341595729057451e-05, + "loss": 2.6399, + "step": 7799 + }, + { + "epoch": 0.7060740472526478, + "grad_norm": 0.8926723599433899, + "learning_rate": 2.339555568810221e-05, + "loss": 2.5605, + "step": 7800 + }, + { + "epoch": 0.7061645695663982, + "grad_norm": 0.8574604392051697, + "learning_rate": 2.3375161799695355e-05, + "loss": 2.6727, + "step": 7801 + }, + { + "epoch": 0.7062550918801485, + "grad_norm": 0.82745361328125, + "learning_rate": 2.335477562740761e-05, + "loss": 2.5692, + "step": 7802 + }, + { + "epoch": 0.7063456141938987, + "grad_norm": 0.8518375754356384, + "learning_rate": 2.3334397173291855e-05, + "loss": 2.6753, + "step": 7803 + }, + { + "epoch": 0.7064361365076491, + "grad_norm": 0.8378230333328247, + "learning_rate": 2.3314026439400217e-05, + "loss": 2.6279, + "step": 7804 + }, + { + "epoch": 0.7065266588213994, + "grad_norm": 0.850548505783081, + "learning_rate": 2.329366342778404e-05, + "loss": 2.6488, + "step": 7805 + }, + { + "epoch": 0.7066171811351498, + "grad_norm": 0.8931477069854736, + "learning_rate": 2.327330814049381e-05, + "loss": 2.7017, + "step": 7806 + }, + { + "epoch": 0.7067077034489001, + "grad_norm": 0.8398114442825317, + "learning_rate": 2.3252960579579397e-05, + "loss": 2.6148, + "step": 7807 + }, + { + "epoch": 0.7067982257626505, + "grad_norm": 0.9468042254447937, + "learning_rate": 2.3232620747089727e-05, + "loss": 2.6562, + "step": 7808 + }, + { + "epoch": 0.7068887480764008, + "grad_norm": 0.8379910588264465, + "learning_rate": 2.3212288645073054e-05, + "loss": 2.675, + "step": 7809 + }, + { + "epoch": 0.7069792703901512, + "grad_norm": 0.9158812165260315, + "learning_rate": 2.3191964275576805e-05, + "loss": 2.6877, + "step": 7810 + }, + { + "epoch": 0.7070697927039015, + "grad_norm": 0.8671376705169678, + "learning_rate": 2.3171647640647687e-05, + "loss": 2.5982, + "step": 7811 + }, + { + "epoch": 0.7071603150176519, + "grad_norm": 0.8869242668151855, + "learning_rate": 2.3151338742331496e-05, + "loss": 2.5712, + "step": 7812 + }, + { + "epoch": 0.7072508373314021, + "grad_norm": 0.7991974353790283, + "learning_rate": 2.3131037582673443e-05, + "loss": 2.6408, + "step": 7813 + }, + { + "epoch": 0.7073413596451525, + "grad_norm": 0.853305995464325, + "learning_rate": 2.3110744163717766e-05, + "loss": 2.6105, + "step": 7814 + }, + { + "epoch": 0.7074318819589028, + "grad_norm": 0.8013371825218201, + "learning_rate": 2.309045848750806e-05, + "loss": 2.6922, + "step": 7815 + }, + { + "epoch": 0.7075224042726532, + "grad_norm": 0.9263652563095093, + "learning_rate": 2.3070180556087074e-05, + "loss": 2.65, + "step": 7816 + }, + { + "epoch": 0.7076129265864035, + "grad_norm": 0.8769465088844299, + "learning_rate": 2.3049910371496797e-05, + "loss": 2.5925, + "step": 7817 + }, + { + "epoch": 0.7077034489001539, + "grad_norm": 0.7965158820152283, + "learning_rate": 2.302964793577844e-05, + "loss": 2.5404, + "step": 7818 + }, + { + "epoch": 0.7077939712139042, + "grad_norm": 0.8006182909011841, + "learning_rate": 2.3009393250972465e-05, + "loss": 2.6565, + "step": 7819 + }, + { + "epoch": 0.7078844935276546, + "grad_norm": 0.8500511050224304, + "learning_rate": 2.2989146319118425e-05, + "loss": 2.6867, + "step": 7820 + }, + { + "epoch": 0.7079750158414049, + "grad_norm": 0.9117669463157654, + "learning_rate": 2.2968907142255303e-05, + "loss": 2.658, + "step": 7821 + }, + { + "epoch": 0.7080655381551553, + "grad_norm": 0.9068896174430847, + "learning_rate": 2.2948675722421086e-05, + "loss": 2.6569, + "step": 7822 + }, + { + "epoch": 0.7081560604689056, + "grad_norm": 0.8441785573959351, + "learning_rate": 2.2928452061653127e-05, + "loss": 2.6124, + "step": 7823 + }, + { + "epoch": 0.708246582782656, + "grad_norm": 0.8266540169715881, + "learning_rate": 2.290823616198793e-05, + "loss": 2.6104, + "step": 7824 + }, + { + "epoch": 0.7083371050964062, + "grad_norm": 0.8758473992347717, + "learning_rate": 2.288802802546124e-05, + "loss": 2.5885, + "step": 7825 + }, + { + "epoch": 0.7084276274101566, + "grad_norm": 0.8110272884368896, + "learning_rate": 2.286782765410802e-05, + "loss": 2.6885, + "step": 7826 + }, + { + "epoch": 0.7085181497239069, + "grad_norm": 0.860407829284668, + "learning_rate": 2.284763504996248e-05, + "loss": 2.7112, + "step": 7827 + }, + { + "epoch": 0.7086086720376573, + "grad_norm": 0.8285262584686279, + "learning_rate": 2.282745021505791e-05, + "loss": 2.6778, + "step": 7828 + }, + { + "epoch": 0.7086991943514076, + "grad_norm": 0.8299581408500671, + "learning_rate": 2.2807273151427055e-05, + "loss": 2.5845, + "step": 7829 + }, + { + "epoch": 0.708789716665158, + "grad_norm": 0.808203935623169, + "learning_rate": 2.2787103861101655e-05, + "loss": 2.6503, + "step": 7830 + }, + { + "epoch": 0.7088802389789083, + "grad_norm": 0.9187443256378174, + "learning_rate": 2.2766942346112786e-05, + "loss": 2.6364, + "step": 7831 + }, + { + "epoch": 0.7089707612926587, + "grad_norm": 0.8716665506362915, + "learning_rate": 2.27467886084907e-05, + "loss": 2.6456, + "step": 7832 + }, + { + "epoch": 0.709061283606409, + "grad_norm": 0.8400218486785889, + "learning_rate": 2.2726642650264895e-05, + "loss": 2.6181, + "step": 7833 + }, + { + "epoch": 0.7091518059201594, + "grad_norm": 0.9854234457015991, + "learning_rate": 2.2706504473464063e-05, + "loss": 2.6259, + "step": 7834 + }, + { + "epoch": 0.7092423282339096, + "grad_norm": 1.016037106513977, + "learning_rate": 2.2686374080116136e-05, + "loss": 2.6556, + "step": 7835 + }, + { + "epoch": 0.70933285054766, + "grad_norm": 0.8779817819595337, + "learning_rate": 2.266625147224817e-05, + "loss": 2.655, + "step": 7836 + }, + { + "epoch": 0.7094233728614103, + "grad_norm": 0.8986814022064209, + "learning_rate": 2.2646136651886617e-05, + "loss": 2.6691, + "step": 7837 + }, + { + "epoch": 0.7095138951751607, + "grad_norm": 0.9211919903755188, + "learning_rate": 2.2626029621056967e-05, + "loss": 2.6743, + "step": 7838 + }, + { + "epoch": 0.709604417488911, + "grad_norm": 0.9180659651756287, + "learning_rate": 2.2605930381784013e-05, + "loss": 2.6589, + "step": 7839 + }, + { + "epoch": 0.7096949398026614, + "grad_norm": 0.8097158670425415, + "learning_rate": 2.2585838936091754e-05, + "loss": 2.5478, + "step": 7840 + }, + { + "epoch": 0.7097854621164117, + "grad_norm": 0.8261932134628296, + "learning_rate": 2.256575528600342e-05, + "loss": 2.6354, + "step": 7841 + }, + { + "epoch": 0.7098759844301621, + "grad_norm": 0.903597354888916, + "learning_rate": 2.2545679433541355e-05, + "loss": 2.6254, + "step": 7842 + }, + { + "epoch": 0.7099665067439124, + "grad_norm": 0.9137170910835266, + "learning_rate": 2.2525611380727307e-05, + "loss": 2.6587, + "step": 7843 + }, + { + "epoch": 0.7100570290576627, + "grad_norm": 0.8651295304298401, + "learning_rate": 2.2505551129582047e-05, + "loss": 2.6701, + "step": 7844 + }, + { + "epoch": 0.710147551371413, + "grad_norm": 0.8841049075126648, + "learning_rate": 2.2485498682125673e-05, + "loss": 2.6473, + "step": 7845 + }, + { + "epoch": 0.7102380736851633, + "grad_norm": 0.8099082708358765, + "learning_rate": 2.2465454040377455e-05, + "loss": 2.6234, + "step": 7846 + }, + { + "epoch": 0.7103285959989137, + "grad_norm": 0.8708997368812561, + "learning_rate": 2.24454172063559e-05, + "loss": 2.5894, + "step": 7847 + }, + { + "epoch": 0.710419118312664, + "grad_norm": 0.8280975222587585, + "learning_rate": 2.2425388182078698e-05, + "loss": 2.5893, + "step": 7848 + }, + { + "epoch": 0.7105096406264144, + "grad_norm": 0.79764723777771, + "learning_rate": 2.2405366969562823e-05, + "loss": 2.6345, + "step": 7849 + }, + { + "epoch": 0.7106001629401647, + "grad_norm": 0.8236714005470276, + "learning_rate": 2.2385353570824308e-05, + "loss": 2.5931, + "step": 7850 + }, + { + "epoch": 0.7106906852539151, + "grad_norm": 0.8521032929420471, + "learning_rate": 2.2365347987878625e-05, + "loss": 2.6221, + "step": 7851 + }, + { + "epoch": 0.7107812075676654, + "grad_norm": 0.8951077461242676, + "learning_rate": 2.2345350222740247e-05, + "loss": 2.6962, + "step": 7852 + }, + { + "epoch": 0.7108717298814158, + "grad_norm": 0.8242161273956299, + "learning_rate": 2.2325360277422967e-05, + "loss": 2.5837, + "step": 7853 + }, + { + "epoch": 0.7109622521951661, + "grad_norm": 0.8123531341552734, + "learning_rate": 2.2305378153939793e-05, + "loss": 2.6381, + "step": 7854 + }, + { + "epoch": 0.7110527745089165, + "grad_norm": 0.8669412732124329, + "learning_rate": 2.2285403854302912e-05, + "loss": 2.6961, + "step": 7855 + }, + { + "epoch": 0.7111432968226667, + "grad_norm": 0.8449344635009766, + "learning_rate": 2.2265437380523734e-05, + "loss": 2.6054, + "step": 7856 + }, + { + "epoch": 0.7112338191364171, + "grad_norm": 0.8434821963310242, + "learning_rate": 2.2245478734612914e-05, + "loss": 2.6581, + "step": 7857 + }, + { + "epoch": 0.7113243414501674, + "grad_norm": 0.8311582207679749, + "learning_rate": 2.2225527918580204e-05, + "loss": 2.6345, + "step": 7858 + }, + { + "epoch": 0.7114148637639178, + "grad_norm": 0.8839373588562012, + "learning_rate": 2.2205584934434753e-05, + "loss": 2.63, + "step": 7859 + }, + { + "epoch": 0.7115053860776681, + "grad_norm": 0.8330554366111755, + "learning_rate": 2.2185649784184746e-05, + "loss": 2.6272, + "step": 7860 + }, + { + "epoch": 0.7115959083914185, + "grad_norm": 0.8182799220085144, + "learning_rate": 2.216572246983768e-05, + "loss": 2.655, + "step": 7861 + }, + { + "epoch": 0.7116864307051688, + "grad_norm": 0.9063050746917725, + "learning_rate": 2.2145802993400223e-05, + "loss": 2.6717, + "step": 7862 + }, + { + "epoch": 0.7117769530189192, + "grad_norm": 0.8399935960769653, + "learning_rate": 2.212589135687828e-05, + "loss": 2.677, + "step": 7863 + }, + { + "epoch": 0.7118674753326695, + "grad_norm": 0.9330717921257019, + "learning_rate": 2.2105987562276953e-05, + "loss": 2.7948, + "step": 7864 + }, + { + "epoch": 0.7119579976464199, + "grad_norm": 0.8137609362602234, + "learning_rate": 2.2086091611600567e-05, + "loss": 2.6701, + "step": 7865 + }, + { + "epoch": 0.7120485199601702, + "grad_norm": 0.7958082556724548, + "learning_rate": 2.2066203506852566e-05, + "loss": 2.572, + "step": 7866 + }, + { + "epoch": 0.7121390422739206, + "grad_norm": 0.850689172744751, + "learning_rate": 2.2046323250035804e-05, + "loss": 2.6466, + "step": 7867 + }, + { + "epoch": 0.7122295645876708, + "grad_norm": 0.8957266807556152, + "learning_rate": 2.2026450843152157e-05, + "loss": 2.656, + "step": 7868 + }, + { + "epoch": 0.7123200869014212, + "grad_norm": 0.867348849773407, + "learning_rate": 2.200658628820271e-05, + "loss": 2.6563, + "step": 7869 + }, + { + "epoch": 0.7124106092151715, + "grad_norm": 0.833674967288971, + "learning_rate": 2.198672958718796e-05, + "loss": 2.6242, + "step": 7870 + }, + { + "epoch": 0.7125011315289219, + "grad_norm": 0.8591665625572205, + "learning_rate": 2.1966880742107364e-05, + "loss": 2.6653, + "step": 7871 + }, + { + "epoch": 0.7125916538426722, + "grad_norm": 0.8057499527931213, + "learning_rate": 2.1947039754959753e-05, + "loss": 2.5693, + "step": 7872 + }, + { + "epoch": 0.7126821761564226, + "grad_norm": 0.9254628419876099, + "learning_rate": 2.1927206627743102e-05, + "loss": 2.702, + "step": 7873 + }, + { + "epoch": 0.7127726984701729, + "grad_norm": 0.8389781713485718, + "learning_rate": 2.1907381362454605e-05, + "loss": 2.644, + "step": 7874 + }, + { + "epoch": 0.7128632207839233, + "grad_norm": 0.792872428894043, + "learning_rate": 2.1887563961090663e-05, + "loss": 2.6184, + "step": 7875 + }, + { + "epoch": 0.7129537430976736, + "grad_norm": 0.8513414859771729, + "learning_rate": 2.1867754425646926e-05, + "loss": 2.6076, + "step": 7876 + }, + { + "epoch": 0.713044265411424, + "grad_norm": 0.9116446375846863, + "learning_rate": 2.1847952758118117e-05, + "loss": 2.7136, + "step": 7877 + }, + { + "epoch": 0.7131347877251742, + "grad_norm": 0.8806061744689941, + "learning_rate": 2.182815896049839e-05, + "loss": 2.6, + "step": 7878 + }, + { + "epoch": 0.7132253100389246, + "grad_norm": 0.8659895658493042, + "learning_rate": 2.1808373034780893e-05, + "loss": 2.693, + "step": 7879 + }, + { + "epoch": 0.7133158323526749, + "grad_norm": 0.7976323962211609, + "learning_rate": 2.178859498295809e-05, + "loss": 2.6586, + "step": 7880 + }, + { + "epoch": 0.7134063546664253, + "grad_norm": 0.8185864686965942, + "learning_rate": 2.1768824807021624e-05, + "loss": 2.6354, + "step": 7881 + }, + { + "epoch": 0.7134968769801756, + "grad_norm": 0.8875325918197632, + "learning_rate": 2.174906250896237e-05, + "loss": 2.7474, + "step": 7882 + }, + { + "epoch": 0.713587399293926, + "grad_norm": 1.010949730873108, + "learning_rate": 2.1729308090770372e-05, + "loss": 2.6001, + "step": 7883 + }, + { + "epoch": 0.7136779216076763, + "grad_norm": 0.8385566473007202, + "learning_rate": 2.1709561554434933e-05, + "loss": 2.676, + "step": 7884 + }, + { + "epoch": 0.7137684439214266, + "grad_norm": 0.8849919438362122, + "learning_rate": 2.1689822901944457e-05, + "loss": 2.6793, + "step": 7885 + }, + { + "epoch": 0.713858966235177, + "grad_norm": 0.8859854340553284, + "learning_rate": 2.1670092135286723e-05, + "loss": 2.6753, + "step": 7886 + }, + { + "epoch": 0.7139494885489273, + "grad_norm": 0.8255252242088318, + "learning_rate": 2.1650369256448543e-05, + "loss": 2.6519, + "step": 7887 + }, + { + "epoch": 0.7140400108626777, + "grad_norm": 0.8009936809539795, + "learning_rate": 2.163065426741603e-05, + "loss": 2.6148, + "step": 7888 + }, + { + "epoch": 0.7141305331764279, + "grad_norm": 0.85800701379776, + "learning_rate": 2.1610947170174488e-05, + "loss": 2.617, + "step": 7889 + }, + { + "epoch": 0.7142210554901783, + "grad_norm": 0.8138502836227417, + "learning_rate": 2.159124796670843e-05, + "loss": 2.6566, + "step": 7890 + }, + { + "epoch": 0.7143115778039286, + "grad_norm": 0.8225932121276855, + "learning_rate": 2.1571556659001545e-05, + "loss": 2.6542, + "step": 7891 + }, + { + "epoch": 0.714402100117679, + "grad_norm": 0.9008784890174866, + "learning_rate": 2.1551873249036793e-05, + "loss": 2.6692, + "step": 7892 + }, + { + "epoch": 0.7144926224314293, + "grad_norm": 0.8721801042556763, + "learning_rate": 2.15321977387962e-05, + "loss": 2.71, + "step": 7893 + }, + { + "epoch": 0.7145831447451797, + "grad_norm": 0.8188298940658569, + "learning_rate": 2.151253013026121e-05, + "loss": 2.6062, + "step": 7894 + }, + { + "epoch": 0.71467366705893, + "grad_norm": 0.8610367178916931, + "learning_rate": 2.149287042541225e-05, + "loss": 2.7082, + "step": 7895 + }, + { + "epoch": 0.7147641893726804, + "grad_norm": 0.8750115633010864, + "learning_rate": 2.1473218626229095e-05, + "loss": 2.7165, + "step": 7896 + }, + { + "epoch": 0.7148547116864307, + "grad_norm": 0.827516496181488, + "learning_rate": 2.1453574734690672e-05, + "loss": 2.5671, + "step": 7897 + }, + { + "epoch": 0.7149452340001811, + "grad_norm": 0.9428842067718506, + "learning_rate": 2.143393875277513e-05, + "loss": 2.6636, + "step": 7898 + }, + { + "epoch": 0.7150357563139313, + "grad_norm": 0.8270500898361206, + "learning_rate": 2.1414310682459802e-05, + "loss": 2.6905, + "step": 7899 + }, + { + "epoch": 0.7151262786276817, + "grad_norm": 0.7811481356620789, + "learning_rate": 2.139469052572127e-05, + "loss": 2.6164, + "step": 7900 + }, + { + "epoch": 0.715216800941432, + "grad_norm": 0.8671733140945435, + "learning_rate": 2.13750782845352e-05, + "loss": 2.671, + "step": 7901 + }, + { + "epoch": 0.7153073232551824, + "grad_norm": 0.8288303017616272, + "learning_rate": 2.1355473960876637e-05, + "loss": 2.6416, + "step": 7902 + }, + { + "epoch": 0.7153978455689327, + "grad_norm": 0.8316097259521484, + "learning_rate": 2.1335877556719676e-05, + "loss": 2.6305, + "step": 7903 + }, + { + "epoch": 0.7154883678826831, + "grad_norm": 0.9389979243278503, + "learning_rate": 2.1316289074037687e-05, + "loss": 2.6453, + "step": 7904 + }, + { + "epoch": 0.7155788901964334, + "grad_norm": 0.9156227707862854, + "learning_rate": 2.1296708514803245e-05, + "loss": 2.6353, + "step": 7905 + }, + { + "epoch": 0.7156694125101838, + "grad_norm": 0.8820056319236755, + "learning_rate": 2.127713588098812e-05, + "loss": 2.6453, + "step": 7906 + }, + { + "epoch": 0.7157599348239341, + "grad_norm": 0.9267116785049438, + "learning_rate": 2.1257571174563206e-05, + "loss": 2.6122, + "step": 7907 + }, + { + "epoch": 0.7158504571376845, + "grad_norm": 0.8101881742477417, + "learning_rate": 2.1238014397498773e-05, + "loss": 2.6201, + "step": 7908 + }, + { + "epoch": 0.7159409794514348, + "grad_norm": 0.8202838897705078, + "learning_rate": 2.121846555176411e-05, + "loss": 2.6305, + "step": 7909 + }, + { + "epoch": 0.7160315017651852, + "grad_norm": 0.8424105048179626, + "learning_rate": 2.119892463932781e-05, + "loss": 2.6367, + "step": 7910 + }, + { + "epoch": 0.7161220240789354, + "grad_norm": 0.8372321724891663, + "learning_rate": 2.1179391662157643e-05, + "loss": 2.6069, + "step": 7911 + }, + { + "epoch": 0.7162125463926858, + "grad_norm": 0.8347082138061523, + "learning_rate": 2.115986662222058e-05, + "loss": 2.6203, + "step": 7912 + }, + { + "epoch": 0.7163030687064361, + "grad_norm": 0.9294868111610413, + "learning_rate": 2.1140349521482795e-05, + "loss": 2.6633, + "step": 7913 + }, + { + "epoch": 0.7163935910201865, + "grad_norm": 0.8243364691734314, + "learning_rate": 2.1120840361909676e-05, + "loss": 2.6449, + "step": 7914 + }, + { + "epoch": 0.7164841133339368, + "grad_norm": 0.8228632211685181, + "learning_rate": 2.1101339145465726e-05, + "loss": 2.6706, + "step": 7915 + }, + { + "epoch": 0.7165746356476872, + "grad_norm": 0.8205071687698364, + "learning_rate": 2.1081845874114815e-05, + "loss": 2.6565, + "step": 7916 + }, + { + "epoch": 0.7166651579614375, + "grad_norm": 0.9841361045837402, + "learning_rate": 2.1062360549819837e-05, + "loss": 2.7053, + "step": 7917 + }, + { + "epoch": 0.7167556802751879, + "grad_norm": 0.8670927882194519, + "learning_rate": 2.1042883174542992e-05, + "loss": 2.6442, + "step": 7918 + }, + { + "epoch": 0.7168462025889382, + "grad_norm": 0.8424184918403625, + "learning_rate": 2.1023413750245645e-05, + "loss": 2.6044, + "step": 7919 + }, + { + "epoch": 0.7169367249026886, + "grad_norm": 0.8506214618682861, + "learning_rate": 2.1003952278888382e-05, + "loss": 2.6469, + "step": 7920 + }, + { + "epoch": 0.7170272472164388, + "grad_norm": 0.8821026682853699, + "learning_rate": 2.098449876243096e-05, + "loss": 2.55, + "step": 7921 + }, + { + "epoch": 0.7171177695301892, + "grad_norm": 0.7802221179008484, + "learning_rate": 2.0965053202832375e-05, + "loss": 2.607, + "step": 7922 + }, + { + "epoch": 0.7172082918439395, + "grad_norm": 0.8895224928855896, + "learning_rate": 2.0945615602050716e-05, + "loss": 2.5846, + "step": 7923 + }, + { + "epoch": 0.7172988141576899, + "grad_norm": 0.8102734684944153, + "learning_rate": 2.0926185962043466e-05, + "loss": 2.5796, + "step": 7924 + }, + { + "epoch": 0.7173893364714402, + "grad_norm": 0.8378505706787109, + "learning_rate": 2.090676428476709e-05, + "loss": 2.5913, + "step": 7925 + }, + { + "epoch": 0.7174798587851905, + "grad_norm": 0.8973132371902466, + "learning_rate": 2.088735057217739e-05, + "loss": 2.6591, + "step": 7926 + }, + { + "epoch": 0.7175703810989409, + "grad_norm": 0.8435284495353699, + "learning_rate": 2.0867944826229324e-05, + "loss": 2.5978, + "step": 7927 + }, + { + "epoch": 0.7176609034126912, + "grad_norm": 0.9157330989837646, + "learning_rate": 2.0848547048877044e-05, + "loss": 2.6221, + "step": 7928 + }, + { + "epoch": 0.7177514257264416, + "grad_norm": 0.9741874933242798, + "learning_rate": 2.0829157242073915e-05, + "loss": 2.6642, + "step": 7929 + }, + { + "epoch": 0.7178419480401919, + "grad_norm": 0.8608724474906921, + "learning_rate": 2.0809775407772503e-05, + "loss": 2.629, + "step": 7930 + }, + { + "epoch": 0.7179324703539423, + "grad_norm": 0.8521125316619873, + "learning_rate": 2.0790401547924498e-05, + "loss": 2.6475, + "step": 7931 + }, + { + "epoch": 0.7180229926676925, + "grad_norm": 0.8497169017791748, + "learning_rate": 2.0771035664480942e-05, + "loss": 2.6299, + "step": 7932 + }, + { + "epoch": 0.7181135149814429, + "grad_norm": 0.8564878106117249, + "learning_rate": 2.0751677759391895e-05, + "loss": 2.7099, + "step": 7933 + }, + { + "epoch": 0.7182040372951932, + "grad_norm": 0.8833389282226562, + "learning_rate": 2.0732327834606735e-05, + "loss": 2.6199, + "step": 7934 + }, + { + "epoch": 0.7182945596089436, + "grad_norm": 0.9143378138542175, + "learning_rate": 2.0712985892073987e-05, + "loss": 2.6486, + "step": 7935 + }, + { + "epoch": 0.7183850819226939, + "grad_norm": 0.8474334478378296, + "learning_rate": 2.069365193374142e-05, + "loss": 2.636, + "step": 7936 + }, + { + "epoch": 0.7184756042364443, + "grad_norm": 0.8107338547706604, + "learning_rate": 2.0674325961555872e-05, + "loss": 2.6333, + "step": 7937 + }, + { + "epoch": 0.7185661265501946, + "grad_norm": 0.8914929032325745, + "learning_rate": 2.065500797746359e-05, + "loss": 2.6174, + "step": 7938 + }, + { + "epoch": 0.718656648863945, + "grad_norm": 0.8551090359687805, + "learning_rate": 2.0635697983409817e-05, + "loss": 2.6287, + "step": 7939 + }, + { + "epoch": 0.7187471711776953, + "grad_norm": 0.8404701352119446, + "learning_rate": 2.0616395981339075e-05, + "loss": 2.5802, + "step": 7940 + }, + { + "epoch": 0.7188376934914457, + "grad_norm": 0.8328677415847778, + "learning_rate": 2.059710197319509e-05, + "loss": 2.6099, + "step": 7941 + }, + { + "epoch": 0.718928215805196, + "grad_norm": 0.8687942028045654, + "learning_rate": 2.0577815960920754e-05, + "loss": 2.6319, + "step": 7942 + }, + { + "epoch": 0.7190187381189463, + "grad_norm": 0.8320063948631287, + "learning_rate": 2.0558537946458177e-05, + "loss": 2.6597, + "step": 7943 + }, + { + "epoch": 0.7191092604326966, + "grad_norm": 0.835174560546875, + "learning_rate": 2.0539267931748695e-05, + "loss": 2.6207, + "step": 7944 + }, + { + "epoch": 0.719199782746447, + "grad_norm": 0.7959899306297302, + "learning_rate": 2.0520005918732698e-05, + "loss": 2.5987, + "step": 7945 + }, + { + "epoch": 0.7192903050601973, + "grad_norm": 0.8572633862495422, + "learning_rate": 2.050075190934998e-05, + "loss": 2.6462, + "step": 7946 + }, + { + "epoch": 0.7193808273739477, + "grad_norm": 0.8058256506919861, + "learning_rate": 2.0481505905539357e-05, + "loss": 2.6276, + "step": 7947 + }, + { + "epoch": 0.719471349687698, + "grad_norm": 0.8863130807876587, + "learning_rate": 2.0462267909238896e-05, + "loss": 2.6207, + "step": 7948 + }, + { + "epoch": 0.7195618720014484, + "grad_norm": 0.808664083480835, + "learning_rate": 2.0443037922385888e-05, + "loss": 2.6078, + "step": 7949 + }, + { + "epoch": 0.7196523943151987, + "grad_norm": 0.9259856343269348, + "learning_rate": 2.042381594691678e-05, + "loss": 2.672, + "step": 7950 + }, + { + "epoch": 0.7197429166289491, + "grad_norm": 0.8126853704452515, + "learning_rate": 2.0404601984767234e-05, + "loss": 2.6178, + "step": 7951 + }, + { + "epoch": 0.7198334389426994, + "grad_norm": 0.8391700387001038, + "learning_rate": 2.038539603787212e-05, + "loss": 2.608, + "step": 7952 + }, + { + "epoch": 0.7199239612564498, + "grad_norm": 0.8788531422615051, + "learning_rate": 2.0366198108165412e-05, + "loss": 2.6365, + "step": 7953 + }, + { + "epoch": 0.7200144835702, + "grad_norm": 0.8273787498474121, + "learning_rate": 2.0347008197580374e-05, + "loss": 2.6462, + "step": 7954 + }, + { + "epoch": 0.7201050058839504, + "grad_norm": 0.8949300646781921, + "learning_rate": 2.0327826308049447e-05, + "loss": 2.5884, + "step": 7955 + }, + { + "epoch": 0.7201955281977007, + "grad_norm": 0.8484796285629272, + "learning_rate": 2.0308652441504217e-05, + "loss": 2.6097, + "step": 7956 + }, + { + "epoch": 0.7202860505114511, + "grad_norm": 0.8365808129310608, + "learning_rate": 2.0289486599875528e-05, + "loss": 2.6805, + "step": 7957 + }, + { + "epoch": 0.7203765728252014, + "grad_norm": 0.8657910823822021, + "learning_rate": 2.027032878509332e-05, + "loss": 2.6297, + "step": 7958 + }, + { + "epoch": 0.7204670951389518, + "grad_norm": 0.8437427878379822, + "learning_rate": 2.025117899908686e-05, + "loss": 2.6447, + "step": 7959 + }, + { + "epoch": 0.7205576174527021, + "grad_norm": 0.8482447266578674, + "learning_rate": 2.0232037243784475e-05, + "loss": 2.6771, + "step": 7960 + }, + { + "epoch": 0.7206481397664525, + "grad_norm": 0.8186309933662415, + "learning_rate": 2.0212903521113757e-05, + "loss": 2.6535, + "step": 7961 + }, + { + "epoch": 0.7207386620802028, + "grad_norm": 0.8866466283798218, + "learning_rate": 2.019377783300147e-05, + "loss": 2.6354, + "step": 7962 + }, + { + "epoch": 0.7208291843939532, + "grad_norm": 0.8010830283164978, + "learning_rate": 2.0174660181373595e-05, + "loss": 2.6233, + "step": 7963 + }, + { + "epoch": 0.7209197067077034, + "grad_norm": 0.9101557731628418, + "learning_rate": 2.01555505681552e-05, + "loss": 2.7038, + "step": 7964 + }, + { + "epoch": 0.7210102290214538, + "grad_norm": 0.8546535968780518, + "learning_rate": 2.013644899527074e-05, + "loss": 2.6985, + "step": 7965 + }, + { + "epoch": 0.7211007513352041, + "grad_norm": 0.8595011234283447, + "learning_rate": 2.011735546464365e-05, + "loss": 2.6894, + "step": 7966 + }, + { + "epoch": 0.7211912736489544, + "grad_norm": 0.8850213289260864, + "learning_rate": 2.0098269978196672e-05, + "loss": 2.6243, + "step": 7967 + }, + { + "epoch": 0.7212817959627048, + "grad_norm": 0.8743082284927368, + "learning_rate": 2.0079192537851733e-05, + "loss": 2.6324, + "step": 7968 + }, + { + "epoch": 0.7213723182764551, + "grad_norm": 0.8616811633110046, + "learning_rate": 2.0060123145529907e-05, + "loss": 2.6398, + "step": 7969 + }, + { + "epoch": 0.7214628405902055, + "grad_norm": 0.901737630367279, + "learning_rate": 2.0041061803151508e-05, + "loss": 2.6913, + "step": 7970 + }, + { + "epoch": 0.7215533629039558, + "grad_norm": 0.9253664612770081, + "learning_rate": 2.0022008512636016e-05, + "loss": 2.6259, + "step": 7971 + }, + { + "epoch": 0.7216438852177062, + "grad_norm": 0.8385868072509766, + "learning_rate": 2.000296327590202e-05, + "loss": 2.636, + "step": 7972 + }, + { + "epoch": 0.7217344075314565, + "grad_norm": 0.8327115178108215, + "learning_rate": 1.9983926094867488e-05, + "loss": 2.5869, + "step": 7973 + }, + { + "epoch": 0.7218249298452069, + "grad_norm": 0.8412270545959473, + "learning_rate": 1.9964896971449387e-05, + "loss": 2.6467, + "step": 7974 + }, + { + "epoch": 0.7219154521589571, + "grad_norm": 0.8899946808815002, + "learning_rate": 1.994587590756397e-05, + "loss": 2.6581, + "step": 7975 + }, + { + "epoch": 0.7220059744727075, + "grad_norm": 0.8444313406944275, + "learning_rate": 1.9926862905126665e-05, + "loss": 2.5909, + "step": 7976 + }, + { + "epoch": 0.7220964967864578, + "grad_norm": 0.8593295812606812, + "learning_rate": 1.9907857966052056e-05, + "loss": 2.6534, + "step": 7977 + }, + { + "epoch": 0.7221870191002082, + "grad_norm": 0.811881422996521, + "learning_rate": 1.9888861092253975e-05, + "loss": 2.6254, + "step": 7978 + }, + { + "epoch": 0.7222775414139585, + "grad_norm": 0.8473266363143921, + "learning_rate": 1.9869872285645406e-05, + "loss": 2.6193, + "step": 7979 + }, + { + "epoch": 0.7223680637277089, + "grad_norm": 0.8891638517379761, + "learning_rate": 1.985089154813846e-05, + "loss": 2.5915, + "step": 7980 + }, + { + "epoch": 0.7224585860414592, + "grad_norm": 0.8436593413352966, + "learning_rate": 1.9831918881644595e-05, + "loss": 2.6799, + "step": 7981 + }, + { + "epoch": 0.7225491083552096, + "grad_norm": 0.9255457520484924, + "learning_rate": 1.981295428807428e-05, + "loss": 2.6251, + "step": 7982 + }, + { + "epoch": 0.7226396306689599, + "grad_norm": 0.8497655987739563, + "learning_rate": 1.979399776933727e-05, + "loss": 2.603, + "step": 7983 + }, + { + "epoch": 0.7227301529827103, + "grad_norm": 0.8996449708938599, + "learning_rate": 1.9775049327342486e-05, + "loss": 2.7178, + "step": 7984 + }, + { + "epoch": 0.7228206752964605, + "grad_norm": 0.8296543955802917, + "learning_rate": 1.9756108963998054e-05, + "loss": 2.6355, + "step": 7985 + }, + { + "epoch": 0.7229111976102109, + "grad_norm": 0.843795120716095, + "learning_rate": 1.9737176681211244e-05, + "loss": 2.6516, + "step": 7986 + }, + { + "epoch": 0.7230017199239612, + "grad_norm": 0.8656344413757324, + "learning_rate": 1.9718252480888566e-05, + "loss": 2.577, + "step": 7987 + }, + { + "epoch": 0.7230922422377116, + "grad_norm": 0.8967494964599609, + "learning_rate": 1.9699336364935615e-05, + "loss": 2.623, + "step": 7988 + }, + { + "epoch": 0.7231827645514619, + "grad_norm": 0.8973367810249329, + "learning_rate": 1.968042833525736e-05, + "loss": 2.6147, + "step": 7989 + }, + { + "epoch": 0.7232732868652123, + "grad_norm": 0.831631064414978, + "learning_rate": 1.9661528393757744e-05, + "loss": 2.6482, + "step": 7990 + }, + { + "epoch": 0.7233638091789626, + "grad_norm": 0.8482058048248291, + "learning_rate": 1.964263654234001e-05, + "loss": 2.5898, + "step": 7991 + }, + { + "epoch": 0.723454331492713, + "grad_norm": 0.816674530506134, + "learning_rate": 1.962375278290659e-05, + "loss": 2.6284, + "step": 7992 + }, + { + "epoch": 0.7235448538064633, + "grad_norm": 0.9323936104774475, + "learning_rate": 1.9604877117359054e-05, + "loss": 2.6328, + "step": 7993 + }, + { + "epoch": 0.7236353761202137, + "grad_norm": 0.8940310478210449, + "learning_rate": 1.9586009547598205e-05, + "loss": 2.6816, + "step": 7994 + }, + { + "epoch": 0.723725898433964, + "grad_norm": 0.965698778629303, + "learning_rate": 1.956715007552401e-05, + "loss": 2.5628, + "step": 7995 + }, + { + "epoch": 0.7238164207477144, + "grad_norm": 0.8234290480613708, + "learning_rate": 1.954829870303555e-05, + "loss": 2.6205, + "step": 7996 + }, + { + "epoch": 0.7239069430614646, + "grad_norm": 0.8232055902481079, + "learning_rate": 1.9529455432031273e-05, + "loss": 2.6669, + "step": 7997 + }, + { + "epoch": 0.723997465375215, + "grad_norm": 0.8574815392494202, + "learning_rate": 1.9510620264408596e-05, + "loss": 2.6661, + "step": 7998 + }, + { + "epoch": 0.7240879876889653, + "grad_norm": 0.880766749382019, + "learning_rate": 1.9491793202064256e-05, + "loss": 2.6692, + "step": 7999 + }, + { + "epoch": 0.7241785100027157, + "grad_norm": 0.8221156597137451, + "learning_rate": 1.947297424689414e-05, + "loss": 2.6569, + "step": 8000 + }, + { + "epoch": 0.7241785100027157, + "eval_loss": 2.581277847290039, + "eval_runtime": 71.4998, + "eval_samples_per_second": 37.804, + "eval_steps_per_second": 3.161, + "step": 8000 + }, + { + "epoch": 0.724269032316466, + "grad_norm": 0.9000765085220337, + "learning_rate": 1.9454163400793334e-05, + "loss": 2.5595, + "step": 8001 + }, + { + "epoch": 0.7243595546302164, + "grad_norm": 0.8212129473686218, + "learning_rate": 1.943536066565603e-05, + "loss": 2.604, + "step": 8002 + }, + { + "epoch": 0.7244500769439667, + "grad_norm": 0.8661413788795471, + "learning_rate": 1.941656604337575e-05, + "loss": 2.6305, + "step": 8003 + }, + { + "epoch": 0.7245405992577171, + "grad_norm": 0.860085666179657, + "learning_rate": 1.9397779535845028e-05, + "loss": 2.6943, + "step": 8004 + }, + { + "epoch": 0.7246311215714674, + "grad_norm": 0.8990339636802673, + "learning_rate": 1.937900114495571e-05, + "loss": 2.6932, + "step": 8005 + }, + { + "epoch": 0.7247216438852178, + "grad_norm": 0.8864555358886719, + "learning_rate": 1.9360230872598773e-05, + "loss": 2.5889, + "step": 8006 + }, + { + "epoch": 0.724812166198968, + "grad_norm": 0.8880107998847961, + "learning_rate": 1.934146872066438e-05, + "loss": 2.6804, + "step": 8007 + }, + { + "epoch": 0.7249026885127184, + "grad_norm": 0.8222184777259827, + "learning_rate": 1.9322714691041878e-05, + "loss": 2.625, + "step": 8008 + }, + { + "epoch": 0.7249932108264687, + "grad_norm": 0.8501119613647461, + "learning_rate": 1.930396878561983e-05, + "loss": 2.6189, + "step": 8009 + }, + { + "epoch": 0.725083733140219, + "grad_norm": 0.8509238958358765, + "learning_rate": 1.9285231006285853e-05, + "loss": 2.5565, + "step": 8010 + }, + { + "epoch": 0.7251742554539694, + "grad_norm": 0.8645397424697876, + "learning_rate": 1.926650135492697e-05, + "loss": 2.6501, + "step": 8011 + }, + { + "epoch": 0.7252647777677197, + "grad_norm": 0.8589481115341187, + "learning_rate": 1.924777983342916e-05, + "loss": 2.6458, + "step": 8012 + }, + { + "epoch": 0.7253553000814701, + "grad_norm": 0.8989269137382507, + "learning_rate": 1.9229066443677713e-05, + "loss": 2.6505, + "step": 8013 + }, + { + "epoch": 0.7254458223952204, + "grad_norm": 0.924936830997467, + "learning_rate": 1.9210361187557057e-05, + "loss": 2.6867, + "step": 8014 + }, + { + "epoch": 0.7255363447089708, + "grad_norm": 0.8466819524765015, + "learning_rate": 1.9191664066950832e-05, + "loss": 2.6915, + "step": 8015 + }, + { + "epoch": 0.7256268670227211, + "grad_norm": 0.8410524129867554, + "learning_rate": 1.9172975083741817e-05, + "loss": 2.5731, + "step": 8016 + }, + { + "epoch": 0.7257173893364715, + "grad_norm": 0.8567287921905518, + "learning_rate": 1.915429423981202e-05, + "loss": 2.6707, + "step": 8017 + }, + { + "epoch": 0.7258079116502217, + "grad_norm": 0.8804045915603638, + "learning_rate": 1.9135621537042535e-05, + "loss": 2.6704, + "step": 8018 + }, + { + "epoch": 0.7258984339639721, + "grad_norm": 0.8276041746139526, + "learning_rate": 1.9116956977313804e-05, + "loss": 2.6637, + "step": 8019 + }, + { + "epoch": 0.7259889562777224, + "grad_norm": 0.8281861543655396, + "learning_rate": 1.9098300562505266e-05, + "loss": 2.5908, + "step": 8020 + }, + { + "epoch": 0.7260794785914728, + "grad_norm": 0.8218677639961243, + "learning_rate": 1.907965229449564e-05, + "loss": 2.5981, + "step": 8021 + }, + { + "epoch": 0.7261700009052231, + "grad_norm": 0.7891271710395813, + "learning_rate": 1.9061012175162817e-05, + "loss": 2.6288, + "step": 8022 + }, + { + "epoch": 0.7262605232189735, + "grad_norm": 0.8426888585090637, + "learning_rate": 1.9042380206383858e-05, + "loss": 2.6242, + "step": 8023 + }, + { + "epoch": 0.7263510455327238, + "grad_norm": 0.8271638751029968, + "learning_rate": 1.9023756390034998e-05, + "loss": 2.6407, + "step": 8024 + }, + { + "epoch": 0.7264415678464742, + "grad_norm": 0.8045911192893982, + "learning_rate": 1.9005140727991677e-05, + "loss": 2.6306, + "step": 8025 + }, + { + "epoch": 0.7265320901602245, + "grad_norm": 0.8195641040802002, + "learning_rate": 1.8986533222128413e-05, + "loss": 2.587, + "step": 8026 + }, + { + "epoch": 0.7266226124739749, + "grad_norm": 0.8664358258247375, + "learning_rate": 1.896793387431909e-05, + "loss": 2.6334, + "step": 8027 + }, + { + "epoch": 0.7267131347877251, + "grad_norm": 0.8703563809394836, + "learning_rate": 1.8949342686436588e-05, + "loss": 2.6234, + "step": 8028 + }, + { + "epoch": 0.7268036571014755, + "grad_norm": 0.8076162934303284, + "learning_rate": 1.8930759660353048e-05, + "loss": 2.6312, + "step": 8029 + }, + { + "epoch": 0.7268941794152258, + "grad_norm": 0.8560411930084229, + "learning_rate": 1.8912184797939803e-05, + "loss": 2.6338, + "step": 8030 + }, + { + "epoch": 0.7269847017289762, + "grad_norm": 0.8122009038925171, + "learning_rate": 1.8893618101067355e-05, + "loss": 2.6406, + "step": 8031 + }, + { + "epoch": 0.7270752240427265, + "grad_norm": 0.8404006361961365, + "learning_rate": 1.8875059571605293e-05, + "loss": 2.6284, + "step": 8032 + }, + { + "epoch": 0.7271657463564769, + "grad_norm": 0.8854060769081116, + "learning_rate": 1.8856509211422557e-05, + "loss": 2.5995, + "step": 8033 + }, + { + "epoch": 0.7272562686702272, + "grad_norm": 0.8898727297782898, + "learning_rate": 1.8837967022387102e-05, + "loss": 2.611, + "step": 8034 + }, + { + "epoch": 0.7273467909839776, + "grad_norm": 0.9046933054924011, + "learning_rate": 1.881943300636615e-05, + "loss": 2.6472, + "step": 8035 + }, + { + "epoch": 0.7274373132977279, + "grad_norm": 0.8621687889099121, + "learning_rate": 1.8800907165226066e-05, + "loss": 2.6788, + "step": 8036 + }, + { + "epoch": 0.7275278356114783, + "grad_norm": 0.8203818798065186, + "learning_rate": 1.878238950083242e-05, + "loss": 2.6216, + "step": 8037 + }, + { + "epoch": 0.7276183579252286, + "grad_norm": 0.8922222852706909, + "learning_rate": 1.876388001504995e-05, + "loss": 2.6475, + "step": 8038 + }, + { + "epoch": 0.727708880238979, + "grad_norm": 0.8815889358520508, + "learning_rate": 1.874537870974251e-05, + "loss": 2.7283, + "step": 8039 + }, + { + "epoch": 0.7277994025527292, + "grad_norm": 0.8744126558303833, + "learning_rate": 1.8726885586773212e-05, + "loss": 2.6958, + "step": 8040 + }, + { + "epoch": 0.7278899248664796, + "grad_norm": 0.8782918453216553, + "learning_rate": 1.870840064800431e-05, + "loss": 2.6935, + "step": 8041 + }, + { + "epoch": 0.7279804471802299, + "grad_norm": 0.859200119972229, + "learning_rate": 1.8689923895297245e-05, + "loss": 2.6295, + "step": 8042 + }, + { + "epoch": 0.7280709694939803, + "grad_norm": 0.8936563730239868, + "learning_rate": 1.867145533051261e-05, + "loss": 2.6756, + "step": 8043 + }, + { + "epoch": 0.7281614918077306, + "grad_norm": 0.8279479742050171, + "learning_rate": 1.8652994955510227e-05, + "loss": 2.5914, + "step": 8044 + }, + { + "epoch": 0.728252014121481, + "grad_norm": 0.8420711159706116, + "learning_rate": 1.8634542772148978e-05, + "loss": 2.6247, + "step": 8045 + }, + { + "epoch": 0.7283425364352313, + "grad_norm": 0.8849563598632812, + "learning_rate": 1.8616098782287095e-05, + "loss": 2.5771, + "step": 8046 + }, + { + "epoch": 0.7284330587489817, + "grad_norm": 0.8260254263877869, + "learning_rate": 1.8597662987781816e-05, + "loss": 2.65, + "step": 8047 + }, + { + "epoch": 0.728523581062732, + "grad_norm": 0.8107094764709473, + "learning_rate": 1.8579235390489635e-05, + "loss": 2.6522, + "step": 8048 + }, + { + "epoch": 0.7286141033764824, + "grad_norm": 0.9501773715019226, + "learning_rate": 1.8560815992266234e-05, + "loss": 2.6627, + "step": 8049 + }, + { + "epoch": 0.7287046256902326, + "grad_norm": 0.8340609073638916, + "learning_rate": 1.854240479496643e-05, + "loss": 2.593, + "step": 8050 + }, + { + "epoch": 0.7287951480039829, + "grad_norm": 0.8562996983528137, + "learning_rate": 1.8524001800444235e-05, + "loss": 2.6483, + "step": 8051 + }, + { + "epoch": 0.7288856703177333, + "grad_norm": 0.8556627035140991, + "learning_rate": 1.8505607010552862e-05, + "loss": 2.6394, + "step": 8052 + }, + { + "epoch": 0.7289761926314836, + "grad_norm": 0.8138837218284607, + "learning_rate": 1.848722042714457e-05, + "loss": 2.6216, + "step": 8053 + }, + { + "epoch": 0.729066714945234, + "grad_norm": 0.8794299960136414, + "learning_rate": 1.846884205207101e-05, + "loss": 2.6405, + "step": 8054 + }, + { + "epoch": 0.7291572372589843, + "grad_norm": 0.8482553958892822, + "learning_rate": 1.8450471887182796e-05, + "loss": 2.648, + "step": 8055 + }, + { + "epoch": 0.7292477595727347, + "grad_norm": 0.8459094166755676, + "learning_rate": 1.8432109934329834e-05, + "loss": 2.6691, + "step": 8056 + }, + { + "epoch": 0.729338281886485, + "grad_norm": 0.8724743723869324, + "learning_rate": 1.8413756195361166e-05, + "loss": 2.5916, + "step": 8057 + }, + { + "epoch": 0.7294288042002354, + "grad_norm": 0.8487122654914856, + "learning_rate": 1.8395410672125035e-05, + "loss": 2.6292, + "step": 8058 + }, + { + "epoch": 0.7295193265139857, + "grad_norm": 0.8307660818099976, + "learning_rate": 1.8377073366468778e-05, + "loss": 2.5968, + "step": 8059 + }, + { + "epoch": 0.729609848827736, + "grad_norm": 0.8485986590385437, + "learning_rate": 1.835874428023905e-05, + "loss": 2.5854, + "step": 8060 + }, + { + "epoch": 0.7297003711414863, + "grad_norm": 0.8574190139770508, + "learning_rate": 1.8340423415281505e-05, + "loss": 2.6206, + "step": 8061 + }, + { + "epoch": 0.7297908934552367, + "grad_norm": 0.8343666791915894, + "learning_rate": 1.832211077344109e-05, + "loss": 2.6166, + "step": 8062 + }, + { + "epoch": 0.729881415768987, + "grad_norm": 0.8841292262077332, + "learning_rate": 1.8303806356561893e-05, + "loss": 2.6292, + "step": 8063 + }, + { + "epoch": 0.7299719380827374, + "grad_norm": 0.8868502974510193, + "learning_rate": 1.8285510166487152e-05, + "loss": 2.6101, + "step": 8064 + }, + { + "epoch": 0.7300624603964877, + "grad_norm": 0.8825874328613281, + "learning_rate": 1.8267222205059308e-05, + "loss": 2.6845, + "step": 8065 + }, + { + "epoch": 0.7301529827102381, + "grad_norm": 0.9012864828109741, + "learning_rate": 1.824894247411998e-05, + "loss": 2.6119, + "step": 8066 + }, + { + "epoch": 0.7302435050239884, + "grad_norm": 0.8223727345466614, + "learning_rate": 1.8230670975509844e-05, + "loss": 2.6667, + "step": 8067 + }, + { + "epoch": 0.7303340273377388, + "grad_norm": 0.8057619333267212, + "learning_rate": 1.8212407711068958e-05, + "loss": 2.6175, + "step": 8068 + }, + { + "epoch": 0.7304245496514891, + "grad_norm": 0.7814914584159851, + "learning_rate": 1.8194152682636356e-05, + "loss": 2.6271, + "step": 8069 + }, + { + "epoch": 0.7305150719652395, + "grad_norm": 0.8347715735435486, + "learning_rate": 1.817590589205035e-05, + "loss": 2.6295, + "step": 8070 + }, + { + "epoch": 0.7306055942789897, + "grad_norm": 0.8338614106178284, + "learning_rate": 1.8157667341148364e-05, + "loss": 2.612, + "step": 8071 + }, + { + "epoch": 0.7306961165927401, + "grad_norm": 0.8309128880500793, + "learning_rate": 1.8139437031767048e-05, + "loss": 2.6255, + "step": 8072 + }, + { + "epoch": 0.7307866389064904, + "grad_norm": 0.8655341863632202, + "learning_rate": 1.8121214965742182e-05, + "loss": 2.6858, + "step": 8073 + }, + { + "epoch": 0.7308771612202408, + "grad_norm": 0.8365732431411743, + "learning_rate": 1.810300114490875e-05, + "loss": 2.6609, + "step": 8074 + }, + { + "epoch": 0.7309676835339911, + "grad_norm": 0.9555655717849731, + "learning_rate": 1.808479557110081e-05, + "loss": 2.6157, + "step": 8075 + }, + { + "epoch": 0.7310582058477415, + "grad_norm": 0.834373950958252, + "learning_rate": 1.8066598246151768e-05, + "loss": 2.6308, + "step": 8076 + }, + { + "epoch": 0.7311487281614918, + "grad_norm": 0.9408546686172485, + "learning_rate": 1.804840917189402e-05, + "loss": 2.612, + "step": 8077 + }, + { + "epoch": 0.7312392504752422, + "grad_norm": 0.8463700413703918, + "learning_rate": 1.8030228350159228e-05, + "loss": 2.6008, + "step": 8078 + }, + { + "epoch": 0.7313297727889925, + "grad_norm": 0.880274772644043, + "learning_rate": 1.8012055782778194e-05, + "loss": 2.614, + "step": 8079 + }, + { + "epoch": 0.7314202951027429, + "grad_norm": 0.8945399522781372, + "learning_rate": 1.7993891471580893e-05, + "loss": 2.6185, + "step": 8080 + }, + { + "epoch": 0.7315108174164932, + "grad_norm": 0.8566452860832214, + "learning_rate": 1.797573541839648e-05, + "loss": 2.6112, + "step": 8081 + }, + { + "epoch": 0.7316013397302435, + "grad_norm": 0.8027698397636414, + "learning_rate": 1.7957587625053296e-05, + "loss": 2.6642, + "step": 8082 + }, + { + "epoch": 0.7316918620439938, + "grad_norm": 0.7911441922187805, + "learning_rate": 1.7939448093378743e-05, + "loss": 2.5205, + "step": 8083 + }, + { + "epoch": 0.7317823843577442, + "grad_norm": 0.9017260670661926, + "learning_rate": 1.7921316825199563e-05, + "loss": 2.5994, + "step": 8084 + }, + { + "epoch": 0.7318729066714945, + "grad_norm": 0.8945018649101257, + "learning_rate": 1.7903193822341512e-05, + "loss": 2.5853, + "step": 8085 + }, + { + "epoch": 0.7319634289852449, + "grad_norm": 0.8400217890739441, + "learning_rate": 1.78850790866296e-05, + "loss": 2.6382, + "step": 8086 + }, + { + "epoch": 0.7320539512989952, + "grad_norm": 0.8900841474533081, + "learning_rate": 1.7866972619887978e-05, + "loss": 2.6085, + "step": 8087 + }, + { + "epoch": 0.7321444736127456, + "grad_norm": 0.8915272355079651, + "learning_rate": 1.7848874423939966e-05, + "loss": 2.7206, + "step": 8088 + }, + { + "epoch": 0.7322349959264959, + "grad_norm": 0.832134485244751, + "learning_rate": 1.7830784500608044e-05, + "loss": 2.6791, + "step": 8089 + }, + { + "epoch": 0.7323255182402463, + "grad_norm": 0.8645135164260864, + "learning_rate": 1.7812702851713904e-05, + "loss": 2.5735, + "step": 8090 + }, + { + "epoch": 0.7324160405539966, + "grad_norm": 0.7954042553901672, + "learning_rate": 1.7794629479078318e-05, + "loss": 2.6569, + "step": 8091 + }, + { + "epoch": 0.7325065628677468, + "grad_norm": 0.8583080172538757, + "learning_rate": 1.777656438452129e-05, + "loss": 2.6162, + "step": 8092 + }, + { + "epoch": 0.7325970851814972, + "grad_norm": 0.8749803304672241, + "learning_rate": 1.7758507569861983e-05, + "loss": 2.6316, + "step": 8093 + }, + { + "epoch": 0.7326876074952475, + "grad_norm": 0.8284852504730225, + "learning_rate": 1.77404590369187e-05, + "loss": 2.5988, + "step": 8094 + }, + { + "epoch": 0.7327781298089979, + "grad_norm": 0.8743882775306702, + "learning_rate": 1.7722418787508955e-05, + "loss": 2.6676, + "step": 8095 + }, + { + "epoch": 0.7328686521227482, + "grad_norm": 0.8623575568199158, + "learning_rate": 1.7704386823449403e-05, + "loss": 2.6689, + "step": 8096 + }, + { + "epoch": 0.7329591744364986, + "grad_norm": 0.8969399333000183, + "learning_rate": 1.7686363146555805e-05, + "loss": 2.6124, + "step": 8097 + }, + { + "epoch": 0.7330496967502489, + "grad_norm": 0.8351315259933472, + "learning_rate": 1.7668347758643233e-05, + "loss": 2.5901, + "step": 8098 + }, + { + "epoch": 0.7331402190639993, + "grad_norm": 0.811880886554718, + "learning_rate": 1.7650340661525754e-05, + "loss": 2.6038, + "step": 8099 + }, + { + "epoch": 0.7332307413777496, + "grad_norm": 0.810198187828064, + "learning_rate": 1.763234185701673e-05, + "loss": 2.6428, + "step": 8100 + }, + { + "epoch": 0.7333212636915, + "grad_norm": 0.8915790319442749, + "learning_rate": 1.7614351346928627e-05, + "loss": 2.6327, + "step": 8101 + }, + { + "epoch": 0.7334117860052503, + "grad_norm": 1.0429893732070923, + "learning_rate": 1.7596369133073076e-05, + "loss": 2.6487, + "step": 8102 + }, + { + "epoch": 0.7335023083190007, + "grad_norm": 0.8270307779312134, + "learning_rate": 1.7578395217260912e-05, + "loss": 2.6318, + "step": 8103 + }, + { + "epoch": 0.7335928306327509, + "grad_norm": 0.8245958089828491, + "learning_rate": 1.756042960130212e-05, + "loss": 2.6318, + "step": 8104 + }, + { + "epoch": 0.7336833529465013, + "grad_norm": 0.8989834785461426, + "learning_rate": 1.754247228700575e-05, + "loss": 2.6562, + "step": 8105 + }, + { + "epoch": 0.7337738752602516, + "grad_norm": 0.8361373543739319, + "learning_rate": 1.7524523276180226e-05, + "loss": 2.5975, + "step": 8106 + }, + { + "epoch": 0.733864397574002, + "grad_norm": 0.8043052554130554, + "learning_rate": 1.7506582570632914e-05, + "loss": 2.5758, + "step": 8107 + }, + { + "epoch": 0.7339549198877523, + "grad_norm": 0.8632999658584595, + "learning_rate": 1.7488650172170496e-05, + "loss": 2.6127, + "step": 8108 + }, + { + "epoch": 0.7340454422015027, + "grad_norm": 0.8198879361152649, + "learning_rate": 1.7470726082598733e-05, + "loss": 2.6815, + "step": 8109 + }, + { + "epoch": 0.734135964515253, + "grad_norm": 0.8820613026618958, + "learning_rate": 1.74528103037226e-05, + "loss": 2.6031, + "step": 8110 + }, + { + "epoch": 0.7342264868290034, + "grad_norm": 0.8991879224777222, + "learning_rate": 1.743490283734621e-05, + "loss": 2.5671, + "step": 8111 + }, + { + "epoch": 0.7343170091427537, + "grad_norm": 0.7861236929893494, + "learning_rate": 1.741700368527288e-05, + "loss": 2.6053, + "step": 8112 + }, + { + "epoch": 0.7344075314565041, + "grad_norm": 0.8448014259338379, + "learning_rate": 1.7399112849304954e-05, + "loss": 2.6049, + "step": 8113 + }, + { + "epoch": 0.7344980537702543, + "grad_norm": 0.8373123407363892, + "learning_rate": 1.7381230331244168e-05, + "loss": 2.6315, + "step": 8114 + }, + { + "epoch": 0.7345885760840047, + "grad_norm": 0.8829507827758789, + "learning_rate": 1.7363356132891196e-05, + "loss": 2.6234, + "step": 8115 + }, + { + "epoch": 0.734679098397755, + "grad_norm": 0.9009426236152649, + "learning_rate": 1.7345490256045993e-05, + "loss": 2.6165, + "step": 8116 + }, + { + "epoch": 0.7347696207115054, + "grad_norm": 0.8082904815673828, + "learning_rate": 1.7327632702507667e-05, + "loss": 2.6422, + "step": 8117 + }, + { + "epoch": 0.7348601430252557, + "grad_norm": 0.8886256217956543, + "learning_rate": 1.730978347407447e-05, + "loss": 2.6248, + "step": 8118 + }, + { + "epoch": 0.7349506653390061, + "grad_norm": 0.8580721616744995, + "learning_rate": 1.7291942572543807e-05, + "loss": 2.6861, + "step": 8119 + }, + { + "epoch": 0.7350411876527564, + "grad_norm": 0.911753237247467, + "learning_rate": 1.7274109999712295e-05, + "loss": 2.6194, + "step": 8120 + }, + { + "epoch": 0.7351317099665068, + "grad_norm": 0.9159942269325256, + "learning_rate": 1.725628575737559e-05, + "loss": 2.5499, + "step": 8121 + }, + { + "epoch": 0.7352222322802571, + "grad_norm": 0.8776478171348572, + "learning_rate": 1.7238469847328697e-05, + "loss": 2.659, + "step": 8122 + }, + { + "epoch": 0.7353127545940075, + "grad_norm": 0.8453285098075867, + "learning_rate": 1.7220662271365618e-05, + "loss": 2.6071, + "step": 8123 + }, + { + "epoch": 0.7354032769077578, + "grad_norm": 0.9019317030906677, + "learning_rate": 1.7202863031279537e-05, + "loss": 2.6502, + "step": 8124 + }, + { + "epoch": 0.7354937992215081, + "grad_norm": 0.8425202965736389, + "learning_rate": 1.7185072128862933e-05, + "loss": 2.5551, + "step": 8125 + }, + { + "epoch": 0.7355843215352584, + "grad_norm": 0.8753564357757568, + "learning_rate": 1.7167289565907287e-05, + "loss": 2.6372, + "step": 8126 + }, + { + "epoch": 0.7356748438490088, + "grad_norm": 0.847804069519043, + "learning_rate": 1.7149515344203304e-05, + "loss": 2.5992, + "step": 8127 + }, + { + "epoch": 0.7357653661627591, + "grad_norm": 0.8290091156959534, + "learning_rate": 1.713174946554086e-05, + "loss": 2.6184, + "step": 8128 + }, + { + "epoch": 0.7358558884765095, + "grad_norm": 0.8304346203804016, + "learning_rate": 1.7113991931708972e-05, + "loss": 2.6377, + "step": 8129 + }, + { + "epoch": 0.7359464107902598, + "grad_norm": 0.881280243396759, + "learning_rate": 1.7096242744495837e-05, + "loss": 2.6182, + "step": 8130 + }, + { + "epoch": 0.7360369331040102, + "grad_norm": 0.846674382686615, + "learning_rate": 1.7078501905688805e-05, + "loss": 2.6258, + "step": 8131 + }, + { + "epoch": 0.7361274554177605, + "grad_norm": 0.860022783279419, + "learning_rate": 1.7060769417074317e-05, + "loss": 2.6807, + "step": 8132 + }, + { + "epoch": 0.7362179777315108, + "grad_norm": 0.9033744931221008, + "learning_rate": 1.704304528043813e-05, + "loss": 2.7004, + "step": 8133 + }, + { + "epoch": 0.7363085000452612, + "grad_norm": 0.8882016539573669, + "learning_rate": 1.702532949756499e-05, + "loss": 2.6703, + "step": 8134 + }, + { + "epoch": 0.7363990223590114, + "grad_norm": 0.880939245223999, + "learning_rate": 1.7007622070238906e-05, + "loss": 2.6365, + "step": 8135 + }, + { + "epoch": 0.7364895446727618, + "grad_norm": 0.847804844379425, + "learning_rate": 1.6989923000243e-05, + "loss": 2.6706, + "step": 8136 + }, + { + "epoch": 0.7365800669865121, + "grad_norm": 0.8592808246612549, + "learning_rate": 1.697223228935958e-05, + "loss": 2.6047, + "step": 8137 + }, + { + "epoch": 0.7366705893002625, + "grad_norm": 0.8524011969566345, + "learning_rate": 1.695454993937009e-05, + "loss": 2.6969, + "step": 8138 + }, + { + "epoch": 0.7367611116140128, + "grad_norm": 0.8169214725494385, + "learning_rate": 1.6936875952055175e-05, + "loss": 2.607, + "step": 8139 + }, + { + "epoch": 0.7368516339277632, + "grad_norm": 0.8056317567825317, + "learning_rate": 1.6919210329194533e-05, + "loss": 2.6452, + "step": 8140 + }, + { + "epoch": 0.7369421562415135, + "grad_norm": 0.8107497096061707, + "learning_rate": 1.690155307256719e-05, + "loss": 2.5957, + "step": 8141 + }, + { + "epoch": 0.7370326785552639, + "grad_norm": 0.9261689186096191, + "learning_rate": 1.688390418395115e-05, + "loss": 2.6895, + "step": 8142 + }, + { + "epoch": 0.7371232008690142, + "grad_norm": 0.8380181789398193, + "learning_rate": 1.6866263665123684e-05, + "loss": 2.6552, + "step": 8143 + }, + { + "epoch": 0.7372137231827646, + "grad_norm": 0.8535093069076538, + "learning_rate": 1.684863151786119e-05, + "loss": 2.6757, + "step": 8144 + }, + { + "epoch": 0.7373042454965149, + "grad_norm": 0.8620217442512512, + "learning_rate": 1.683100774393923e-05, + "loss": 2.6247, + "step": 8145 + }, + { + "epoch": 0.7373947678102653, + "grad_norm": 0.9317750930786133, + "learning_rate": 1.6813392345132518e-05, + "loss": 2.5605, + "step": 8146 + }, + { + "epoch": 0.7374852901240155, + "grad_norm": 0.8641749024391174, + "learning_rate": 1.679578532321494e-05, + "loss": 2.5937, + "step": 8147 + }, + { + "epoch": 0.7375758124377659, + "grad_norm": 0.8631426692008972, + "learning_rate": 1.6778186679959452e-05, + "loss": 2.7071, + "step": 8148 + }, + { + "epoch": 0.7376663347515162, + "grad_norm": 0.816089391708374, + "learning_rate": 1.6760596417138342e-05, + "loss": 2.6578, + "step": 8149 + }, + { + "epoch": 0.7377568570652666, + "grad_norm": 0.8169116377830505, + "learning_rate": 1.6743014536522873e-05, + "loss": 2.5996, + "step": 8150 + }, + { + "epoch": 0.7378473793790169, + "grad_norm": 0.8737078905105591, + "learning_rate": 1.672544103988356e-05, + "loss": 2.6331, + "step": 8151 + }, + { + "epoch": 0.7379379016927673, + "grad_norm": 0.8869850039482117, + "learning_rate": 1.6707875928990058e-05, + "loss": 2.7173, + "step": 8152 + }, + { + "epoch": 0.7380284240065176, + "grad_norm": 0.8866559863090515, + "learning_rate": 1.6690319205611194e-05, + "loss": 2.5917, + "step": 8153 + }, + { + "epoch": 0.738118946320268, + "grad_norm": 0.8431488871574402, + "learning_rate": 1.6672770871514854e-05, + "loss": 2.5925, + "step": 8154 + }, + { + "epoch": 0.7382094686340183, + "grad_norm": 0.8199314475059509, + "learning_rate": 1.6655230928468258e-05, + "loss": 2.6326, + "step": 8155 + }, + { + "epoch": 0.7382999909477687, + "grad_norm": 0.8784447312355042, + "learning_rate": 1.6637699378237605e-05, + "loss": 2.6156, + "step": 8156 + }, + { + "epoch": 0.7383905132615189, + "grad_norm": 0.7935834527015686, + "learning_rate": 1.6620176222588334e-05, + "loss": 2.6223, + "step": 8157 + }, + { + "epoch": 0.7384810355752693, + "grad_norm": 0.8735072016716003, + "learning_rate": 1.660266146328504e-05, + "loss": 2.6237, + "step": 8158 + }, + { + "epoch": 0.7385715578890196, + "grad_norm": 0.8265995383262634, + "learning_rate": 1.6585155102091467e-05, + "loss": 2.5732, + "step": 8159 + }, + { + "epoch": 0.73866208020277, + "grad_norm": 0.8630474209785461, + "learning_rate": 1.6567657140770475e-05, + "loss": 2.5965, + "step": 8160 + }, + { + "epoch": 0.7387526025165203, + "grad_norm": 0.8460192680358887, + "learning_rate": 1.6550167581084164e-05, + "loss": 2.701, + "step": 8161 + }, + { + "epoch": 0.7388431248302707, + "grad_norm": 0.8023285865783691, + "learning_rate": 1.6532686424793644e-05, + "loss": 2.6347, + "step": 8162 + }, + { + "epoch": 0.738933647144021, + "grad_norm": 0.8689784407615662, + "learning_rate": 1.6515213673659357e-05, + "loss": 2.6865, + "step": 8163 + }, + { + "epoch": 0.7390241694577714, + "grad_norm": 0.8490846157073975, + "learning_rate": 1.649774932944075e-05, + "loss": 2.6641, + "step": 8164 + }, + { + "epoch": 0.7391146917715217, + "grad_norm": 0.8280508518218994, + "learning_rate": 1.6480293393896507e-05, + "loss": 2.5669, + "step": 8165 + }, + { + "epoch": 0.7392052140852721, + "grad_norm": 0.9088148474693298, + "learning_rate": 1.6462845868784425e-05, + "loss": 2.7244, + "step": 8166 + }, + { + "epoch": 0.7392957363990224, + "grad_norm": 0.7922699451446533, + "learning_rate": 1.6445406755861482e-05, + "loss": 2.607, + "step": 8167 + }, + { + "epoch": 0.7393862587127727, + "grad_norm": 0.8359072804450989, + "learning_rate": 1.642797605688379e-05, + "loss": 2.6218, + "step": 8168 + }, + { + "epoch": 0.739476781026523, + "grad_norm": 0.8865196704864502, + "learning_rate": 1.6410553773606653e-05, + "loss": 2.6453, + "step": 8169 + }, + { + "epoch": 0.7395673033402734, + "grad_norm": 0.8127596378326416, + "learning_rate": 1.6393139907784404e-05, + "loss": 2.6738, + "step": 8170 + }, + { + "epoch": 0.7396578256540237, + "grad_norm": 0.8192434906959534, + "learning_rate": 1.6375734461170733e-05, + "loss": 2.6035, + "step": 8171 + }, + { + "epoch": 0.7397483479677741, + "grad_norm": 0.8248426914215088, + "learning_rate": 1.635833743551829e-05, + "loss": 2.6281, + "step": 8172 + }, + { + "epoch": 0.7398388702815244, + "grad_norm": 0.8829620480537415, + "learning_rate": 1.6340948832578983e-05, + "loss": 2.6225, + "step": 8173 + }, + { + "epoch": 0.7399293925952747, + "grad_norm": 0.8189470767974854, + "learning_rate": 1.632356865410384e-05, + "loss": 2.669, + "step": 8174 + }, + { + "epoch": 0.7400199149090251, + "grad_norm": 0.872611403465271, + "learning_rate": 1.630619690184303e-05, + "loss": 2.6179, + "step": 8175 + }, + { + "epoch": 0.7401104372227754, + "grad_norm": 0.8715521097183228, + "learning_rate": 1.6288833577545914e-05, + "loss": 2.6397, + "step": 8176 + }, + { + "epoch": 0.7402009595365258, + "grad_norm": 0.9405277371406555, + "learning_rate": 1.6271478682960993e-05, + "loss": 2.6259, + "step": 8177 + }, + { + "epoch": 0.740291481850276, + "grad_norm": 0.8795056939125061, + "learning_rate": 1.6254132219835816e-05, + "loss": 2.6157, + "step": 8178 + }, + { + "epoch": 0.7403820041640264, + "grad_norm": 0.8261298537254333, + "learning_rate": 1.6236794189917293e-05, + "loss": 2.5853, + "step": 8179 + }, + { + "epoch": 0.7404725264777767, + "grad_norm": 0.9246150255203247, + "learning_rate": 1.621946459495127e-05, + "loss": 2.601, + "step": 8180 + }, + { + "epoch": 0.7405630487915271, + "grad_norm": 0.803239107131958, + "learning_rate": 1.6202143436682882e-05, + "loss": 2.6329, + "step": 8181 + }, + { + "epoch": 0.7406535711052774, + "grad_norm": 0.8190500736236572, + "learning_rate": 1.6184830716856347e-05, + "loss": 2.6416, + "step": 8182 + }, + { + "epoch": 0.7407440934190278, + "grad_norm": 0.945702075958252, + "learning_rate": 1.6167526437215095e-05, + "loss": 2.6504, + "step": 8183 + }, + { + "epoch": 0.7408346157327781, + "grad_norm": 0.8268640041351318, + "learning_rate": 1.6150230599501594e-05, + "loss": 2.6327, + "step": 8184 + }, + { + "epoch": 0.7409251380465285, + "grad_norm": 0.8651500344276428, + "learning_rate": 1.6132943205457606e-05, + "loss": 2.5888, + "step": 8185 + }, + { + "epoch": 0.7410156603602788, + "grad_norm": 0.8950450420379639, + "learning_rate": 1.6115664256823937e-05, + "loss": 2.6752, + "step": 8186 + }, + { + "epoch": 0.7411061826740292, + "grad_norm": 0.8721691370010376, + "learning_rate": 1.609839375534058e-05, + "loss": 2.6091, + "step": 8187 + }, + { + "epoch": 0.7411967049877795, + "grad_norm": 0.8327284455299377, + "learning_rate": 1.6081131702746667e-05, + "loss": 2.6489, + "step": 8188 + }, + { + "epoch": 0.7412872273015298, + "grad_norm": 0.7996222376823425, + "learning_rate": 1.6063878100780506e-05, + "loss": 2.6007, + "step": 8189 + }, + { + "epoch": 0.7413777496152801, + "grad_norm": 0.8431447148323059, + "learning_rate": 1.6046632951179508e-05, + "loss": 2.6216, + "step": 8190 + }, + { + "epoch": 0.7414682719290305, + "grad_norm": 0.8708969950675964, + "learning_rate": 1.6029396255680305e-05, + "loss": 2.614, + "step": 8191 + }, + { + "epoch": 0.7415587942427808, + "grad_norm": 0.8860566020011902, + "learning_rate": 1.601216801601857e-05, + "loss": 2.6083, + "step": 8192 + }, + { + "epoch": 0.7416493165565312, + "grad_norm": 0.8327545523643494, + "learning_rate": 1.5994948233929253e-05, + "loss": 2.6623, + "step": 8193 + }, + { + "epoch": 0.7417398388702815, + "grad_norm": 0.8494716286659241, + "learning_rate": 1.5977736911146324e-05, + "loss": 2.6268, + "step": 8194 + }, + { + "epoch": 0.7418303611840319, + "grad_norm": 0.8706437945365906, + "learning_rate": 1.5960534049402987e-05, + "loss": 2.6315, + "step": 8195 + }, + { + "epoch": 0.7419208834977822, + "grad_norm": 0.7802254557609558, + "learning_rate": 1.5943339650431576e-05, + "loss": 2.6167, + "step": 8196 + }, + { + "epoch": 0.7420114058115326, + "grad_norm": 0.8174785375595093, + "learning_rate": 1.5926153715963565e-05, + "loss": 2.5987, + "step": 8197 + }, + { + "epoch": 0.7421019281252829, + "grad_norm": 0.8315815329551697, + "learning_rate": 1.590897624772957e-05, + "loss": 2.6454, + "step": 8198 + }, + { + "epoch": 0.7421924504390333, + "grad_norm": 0.8264350295066833, + "learning_rate": 1.589180724745939e-05, + "loss": 2.6312, + "step": 8199 + }, + { + "epoch": 0.7422829727527835, + "grad_norm": 0.7961425185203552, + "learning_rate": 1.587464671688187e-05, + "loss": 2.5814, + "step": 8200 + }, + { + "epoch": 0.7423734950665339, + "grad_norm": 0.7933376431465149, + "learning_rate": 1.5857494657725182e-05, + "loss": 2.6176, + "step": 8201 + }, + { + "epoch": 0.7424640173802842, + "grad_norm": 0.8313885927200317, + "learning_rate": 1.584035107171644e-05, + "loss": 2.6577, + "step": 8202 + }, + { + "epoch": 0.7425545396940346, + "grad_norm": 0.8346498012542725, + "learning_rate": 1.582321596058205e-05, + "loss": 2.5666, + "step": 8203 + }, + { + "epoch": 0.7426450620077849, + "grad_norm": 0.8698704838752747, + "learning_rate": 1.5806089326047512e-05, + "loss": 2.5968, + "step": 8204 + }, + { + "epoch": 0.7427355843215353, + "grad_norm": 0.792100727558136, + "learning_rate": 1.5788971169837473e-05, + "loss": 2.6165, + "step": 8205 + }, + { + "epoch": 0.7428261066352856, + "grad_norm": 0.8512359857559204, + "learning_rate": 1.5771861493675732e-05, + "loss": 2.6735, + "step": 8206 + }, + { + "epoch": 0.742916628949036, + "grad_norm": 0.9061763286590576, + "learning_rate": 1.5754760299285252e-05, + "loss": 2.6143, + "step": 8207 + }, + { + "epoch": 0.7430071512627863, + "grad_norm": 0.8766939043998718, + "learning_rate": 1.5737667588388093e-05, + "loss": 2.6372, + "step": 8208 + }, + { + "epoch": 0.7430976735765367, + "grad_norm": 0.9132415056228638, + "learning_rate": 1.5720583362705498e-05, + "loss": 2.6132, + "step": 8209 + }, + { + "epoch": 0.743188195890287, + "grad_norm": 0.7986286282539368, + "learning_rate": 1.5703507623957848e-05, + "loss": 2.6779, + "step": 8210 + }, + { + "epoch": 0.7432787182040373, + "grad_norm": 0.8528510332107544, + "learning_rate": 1.5686440373864675e-05, + "loss": 2.6198, + "step": 8211 + }, + { + "epoch": 0.7433692405177876, + "grad_norm": 0.8899943828582764, + "learning_rate": 1.5669381614144685e-05, + "loss": 2.6477, + "step": 8212 + }, + { + "epoch": 0.743459762831538, + "grad_norm": 0.8294568657875061, + "learning_rate": 1.5652331346515592e-05, + "loss": 2.6188, + "step": 8213 + }, + { + "epoch": 0.7435502851452883, + "grad_norm": 0.7675567269325256, + "learning_rate": 1.563528957269449e-05, + "loss": 2.5777, + "step": 8214 + }, + { + "epoch": 0.7436408074590386, + "grad_norm": 0.8410840034484863, + "learning_rate": 1.5618256294397382e-05, + "loss": 2.6778, + "step": 8215 + }, + { + "epoch": 0.743731329772789, + "grad_norm": 0.8316123485565186, + "learning_rate": 1.5601231513339565e-05, + "loss": 2.6331, + "step": 8216 + }, + { + "epoch": 0.7438218520865393, + "grad_norm": 0.7986269593238831, + "learning_rate": 1.558421523123542e-05, + "loss": 2.642, + "step": 8217 + }, + { + "epoch": 0.7439123744002897, + "grad_norm": 0.862214207649231, + "learning_rate": 1.5567207449798515e-05, + "loss": 2.6645, + "step": 8218 + }, + { + "epoch": 0.74400289671404, + "grad_norm": 0.8295678496360779, + "learning_rate": 1.5550208170741463e-05, + "loss": 2.628, + "step": 8219 + }, + { + "epoch": 0.7440934190277904, + "grad_norm": 0.8058247566223145, + "learning_rate": 1.553321739577619e-05, + "loss": 2.6442, + "step": 8220 + }, + { + "epoch": 0.7441839413415406, + "grad_norm": 0.8965185284614563, + "learning_rate": 1.551623512661359e-05, + "loss": 2.6656, + "step": 8221 + }, + { + "epoch": 0.744274463655291, + "grad_norm": 0.8854925036430359, + "learning_rate": 1.5499261364963792e-05, + "loss": 2.6364, + "step": 8222 + }, + { + "epoch": 0.7443649859690413, + "grad_norm": 0.8158112168312073, + "learning_rate": 1.5482296112536067e-05, + "loss": 2.5834, + "step": 8223 + }, + { + "epoch": 0.7444555082827917, + "grad_norm": 0.9737561345100403, + "learning_rate": 1.546533937103881e-05, + "loss": 2.6548, + "step": 8224 + }, + { + "epoch": 0.744546030596542, + "grad_norm": 0.8352574110031128, + "learning_rate": 1.5448391142179574e-05, + "loss": 2.6044, + "step": 8225 + }, + { + "epoch": 0.7446365529102924, + "grad_norm": 0.8467913269996643, + "learning_rate": 1.5431451427665057e-05, + "loss": 2.6274, + "step": 8226 + }, + { + "epoch": 0.7447270752240427, + "grad_norm": 0.867701530456543, + "learning_rate": 1.541452022920101e-05, + "loss": 2.6024, + "step": 8227 + }, + { + "epoch": 0.7448175975377931, + "grad_norm": 0.8936637043952942, + "learning_rate": 1.5397597548492514e-05, + "loss": 2.6748, + "step": 8228 + }, + { + "epoch": 0.7449081198515434, + "grad_norm": 0.8338824510574341, + "learning_rate": 1.538068338724361e-05, + "loss": 2.6225, + "step": 8229 + }, + { + "epoch": 0.7449986421652938, + "grad_norm": 0.8347210884094238, + "learning_rate": 1.5363777747157572e-05, + "loss": 2.6783, + "step": 8230 + }, + { + "epoch": 0.745089164479044, + "grad_norm": 0.8179384469985962, + "learning_rate": 1.53468806299368e-05, + "loss": 2.615, + "step": 8231 + }, + { + "epoch": 0.7451796867927944, + "grad_norm": 0.8636762499809265, + "learning_rate": 1.5329992037282825e-05, + "loss": 2.685, + "step": 8232 + }, + { + "epoch": 0.7452702091065447, + "grad_norm": 0.793683648109436, + "learning_rate": 1.5313111970896345e-05, + "loss": 2.607, + "step": 8233 + }, + { + "epoch": 0.7453607314202951, + "grad_norm": 0.8522347211837769, + "learning_rate": 1.5296240432477195e-05, + "loss": 2.6216, + "step": 8234 + }, + { + "epoch": 0.7454512537340454, + "grad_norm": 0.8533903360366821, + "learning_rate": 1.5279377423724263e-05, + "loss": 2.5714, + "step": 8235 + }, + { + "epoch": 0.7455417760477958, + "grad_norm": 0.8542166948318481, + "learning_rate": 1.5262522946335755e-05, + "loss": 2.6102, + "step": 8236 + }, + { + "epoch": 0.7456322983615461, + "grad_norm": 0.9041294455528259, + "learning_rate": 1.524567700200884e-05, + "loss": 2.5996, + "step": 8237 + }, + { + "epoch": 0.7457228206752965, + "grad_norm": 0.8735194206237793, + "learning_rate": 1.5228839592439947e-05, + "loss": 2.6817, + "step": 8238 + }, + { + "epoch": 0.7458133429890468, + "grad_norm": 0.8800003528594971, + "learning_rate": 1.5212010719324577e-05, + "loss": 2.6788, + "step": 8239 + }, + { + "epoch": 0.7459038653027972, + "grad_norm": 0.791961133480072, + "learning_rate": 1.5195190384357404e-05, + "loss": 2.6099, + "step": 8240 + }, + { + "epoch": 0.7459943876165475, + "grad_norm": 0.8906918168067932, + "learning_rate": 1.5178378589232246e-05, + "loss": 2.6168, + "step": 8241 + }, + { + "epoch": 0.7460849099302979, + "grad_norm": 0.8241778612136841, + "learning_rate": 1.5161575335642064e-05, + "loss": 2.6929, + "step": 8242 + }, + { + "epoch": 0.7461754322440481, + "grad_norm": 0.8503902554512024, + "learning_rate": 1.5144780625278876e-05, + "loss": 2.5645, + "step": 8243 + }, + { + "epoch": 0.7462659545577985, + "grad_norm": 0.8449828028678894, + "learning_rate": 1.5127994459834006e-05, + "loss": 2.6398, + "step": 8244 + }, + { + "epoch": 0.7463564768715488, + "grad_norm": 0.8013359904289246, + "learning_rate": 1.5111216840997743e-05, + "loss": 2.6211, + "step": 8245 + }, + { + "epoch": 0.7464469991852992, + "grad_norm": 0.85307776927948, + "learning_rate": 1.509444777045963e-05, + "loss": 2.6471, + "step": 8246 + }, + { + "epoch": 0.7465375214990495, + "grad_norm": 0.829259991645813, + "learning_rate": 1.5077687249908302e-05, + "loss": 2.6416, + "step": 8247 + }, + { + "epoch": 0.7466280438127999, + "grad_norm": 0.8358555436134338, + "learning_rate": 1.5060935281031563e-05, + "loss": 2.6283, + "step": 8248 + }, + { + "epoch": 0.7467185661265502, + "grad_norm": 0.8651643991470337, + "learning_rate": 1.5044191865516266e-05, + "loss": 2.5934, + "step": 8249 + }, + { + "epoch": 0.7468090884403006, + "grad_norm": 0.828653872013092, + "learning_rate": 1.5027457005048573e-05, + "loss": 2.6596, + "step": 8250 + }, + { + "epoch": 0.7468996107540509, + "grad_norm": 0.8189446926116943, + "learning_rate": 1.5010730701313625e-05, + "loss": 2.6267, + "step": 8251 + }, + { + "epoch": 0.7469901330678013, + "grad_norm": 0.8088429570198059, + "learning_rate": 1.499401295599575e-05, + "loss": 2.6345, + "step": 8252 + }, + { + "epoch": 0.7470806553815516, + "grad_norm": 0.8520417213439941, + "learning_rate": 1.4977303770778462e-05, + "loss": 2.627, + "step": 8253 + }, + { + "epoch": 0.747171177695302, + "grad_norm": 0.7819222211837769, + "learning_rate": 1.4960603147344343e-05, + "loss": 2.5788, + "step": 8254 + }, + { + "epoch": 0.7472617000090522, + "grad_norm": 0.8902117609977722, + "learning_rate": 1.4943911087375173e-05, + "loss": 2.605, + "step": 8255 + }, + { + "epoch": 0.7473522223228025, + "grad_norm": 0.814746081829071, + "learning_rate": 1.492722759255184e-05, + "loss": 2.6328, + "step": 8256 + }, + { + "epoch": 0.7474427446365529, + "grad_norm": 0.8802046775817871, + "learning_rate": 1.4910552664554323e-05, + "loss": 2.6181, + "step": 8257 + }, + { + "epoch": 0.7475332669503032, + "grad_norm": 0.8508935570716858, + "learning_rate": 1.4893886305061865e-05, + "loss": 2.6653, + "step": 8258 + }, + { + "epoch": 0.7476237892640536, + "grad_norm": 0.8923085927963257, + "learning_rate": 1.4877228515752717e-05, + "loss": 2.6272, + "step": 8259 + }, + { + "epoch": 0.7477143115778039, + "grad_norm": 0.897159218788147, + "learning_rate": 1.4860579298304312e-05, + "loss": 2.5451, + "step": 8260 + }, + { + "epoch": 0.7478048338915543, + "grad_norm": 0.8598203659057617, + "learning_rate": 1.4843938654393253e-05, + "loss": 2.6218, + "step": 8261 + }, + { + "epoch": 0.7478953562053046, + "grad_norm": 0.8585354089736938, + "learning_rate": 1.4827306585695234e-05, + "loss": 2.6232, + "step": 8262 + }, + { + "epoch": 0.747985878519055, + "grad_norm": 0.8278530836105347, + "learning_rate": 1.4810683093885126e-05, + "loss": 2.6651, + "step": 8263 + }, + { + "epoch": 0.7480764008328052, + "grad_norm": 0.8540686964988708, + "learning_rate": 1.479406818063691e-05, + "loss": 2.6294, + "step": 8264 + }, + { + "epoch": 0.7481669231465556, + "grad_norm": 0.8882604837417603, + "learning_rate": 1.4777461847623653e-05, + "loss": 2.6104, + "step": 8265 + }, + { + "epoch": 0.7482574454603059, + "grad_norm": 1.0102626085281372, + "learning_rate": 1.4760864096517701e-05, + "loss": 2.6443, + "step": 8266 + }, + { + "epoch": 0.7483479677740563, + "grad_norm": 0.8543476462364197, + "learning_rate": 1.4744274928990376e-05, + "loss": 2.647, + "step": 8267 + }, + { + "epoch": 0.7484384900878066, + "grad_norm": 0.8158116340637207, + "learning_rate": 1.472769434671224e-05, + "loss": 2.6096, + "step": 8268 + }, + { + "epoch": 0.748529012401557, + "grad_norm": 0.8607696890830994, + "learning_rate": 1.4711122351352947e-05, + "loss": 2.6002, + "step": 8269 + }, + { + "epoch": 0.7486195347153073, + "grad_norm": 0.8293615579605103, + "learning_rate": 1.4694558944581293e-05, + "loss": 2.6072, + "step": 8270 + }, + { + "epoch": 0.7487100570290577, + "grad_norm": 0.8597177863121033, + "learning_rate": 1.4678004128065225e-05, + "loss": 2.6456, + "step": 8271 + }, + { + "epoch": 0.748800579342808, + "grad_norm": 0.7982829213142395, + "learning_rate": 1.466145790347183e-05, + "loss": 2.5905, + "step": 8272 + }, + { + "epoch": 0.7488911016565584, + "grad_norm": 0.8462602496147156, + "learning_rate": 1.4644920272467244e-05, + "loss": 2.619, + "step": 8273 + }, + { + "epoch": 0.7489816239703087, + "grad_norm": 0.8661282658576965, + "learning_rate": 1.4628391236716899e-05, + "loss": 2.6635, + "step": 8274 + }, + { + "epoch": 0.749072146284059, + "grad_norm": 0.838843822479248, + "learning_rate": 1.4611870797885197e-05, + "loss": 2.6534, + "step": 8275 + }, + { + "epoch": 0.7491626685978093, + "grad_norm": 0.984654426574707, + "learning_rate": 1.4595358957635763e-05, + "loss": 2.6257, + "step": 8276 + }, + { + "epoch": 0.7492531909115597, + "grad_norm": 0.8167694807052612, + "learning_rate": 1.4578855717631357e-05, + "loss": 2.5533, + "step": 8277 + }, + { + "epoch": 0.74934371322531, + "grad_norm": 0.8685299754142761, + "learning_rate": 1.4562361079533849e-05, + "loss": 2.556, + "step": 8278 + }, + { + "epoch": 0.7494342355390604, + "grad_norm": 0.8326447606086731, + "learning_rate": 1.454587504500421e-05, + "loss": 2.6453, + "step": 8279 + }, + { + "epoch": 0.7495247578528107, + "grad_norm": 0.8826172947883606, + "learning_rate": 1.4529397615702656e-05, + "loss": 2.6536, + "step": 8280 + }, + { + "epoch": 0.7496152801665611, + "grad_norm": 0.8852538466453552, + "learning_rate": 1.4512928793288405e-05, + "loss": 2.6287, + "step": 8281 + }, + { + "epoch": 0.7497058024803114, + "grad_norm": 0.8458629846572876, + "learning_rate": 1.4496468579419886e-05, + "loss": 2.5647, + "step": 8282 + }, + { + "epoch": 0.7497963247940618, + "grad_norm": 0.9038055539131165, + "learning_rate": 1.4480016975754629e-05, + "loss": 2.6955, + "step": 8283 + }, + { + "epoch": 0.7498868471078121, + "grad_norm": 0.823957085609436, + "learning_rate": 1.4463573983949341e-05, + "loss": 2.6589, + "step": 8284 + }, + { + "epoch": 0.7499773694215625, + "grad_norm": 0.8328465223312378, + "learning_rate": 1.4447139605659799e-05, + "loss": 2.6246, + "step": 8285 + }, + { + "epoch": 0.7500678917353127, + "grad_norm": 0.8769001364707947, + "learning_rate": 1.4430713842540989e-05, + "loss": 2.6395, + "step": 8286 + }, + { + "epoch": 0.7501584140490631, + "grad_norm": 0.8344104290008545, + "learning_rate": 1.44142966962469e-05, + "loss": 2.6723, + "step": 8287 + }, + { + "epoch": 0.7502489363628134, + "grad_norm": 0.8472191095352173, + "learning_rate": 1.439788816843084e-05, + "loss": 2.6124, + "step": 8288 + }, + { + "epoch": 0.7503394586765638, + "grad_norm": 0.8723897933959961, + "learning_rate": 1.438148826074508e-05, + "loss": 2.672, + "step": 8289 + }, + { + "epoch": 0.7504299809903141, + "grad_norm": 0.822022020816803, + "learning_rate": 1.4365096974841108e-05, + "loss": 2.6048, + "step": 8290 + }, + { + "epoch": 0.7505205033040645, + "grad_norm": 0.8745297193527222, + "learning_rate": 1.4348714312369527e-05, + "loss": 2.6106, + "step": 8291 + }, + { + "epoch": 0.7506110256178148, + "grad_norm": 0.8441516160964966, + "learning_rate": 1.4332340274980072e-05, + "loss": 2.5962, + "step": 8292 + }, + { + "epoch": 0.7507015479315652, + "grad_norm": 0.8232734799385071, + "learning_rate": 1.4315974864321602e-05, + "loss": 2.6471, + "step": 8293 + }, + { + "epoch": 0.7507920702453155, + "grad_norm": 0.9545968770980835, + "learning_rate": 1.4299618082042143e-05, + "loss": 2.6994, + "step": 8294 + }, + { + "epoch": 0.7508825925590659, + "grad_norm": 0.8935458064079285, + "learning_rate": 1.4283269929788779e-05, + "loss": 2.6443, + "step": 8295 + }, + { + "epoch": 0.7509731148728161, + "grad_norm": 0.8546053767204285, + "learning_rate": 1.4266930409207791e-05, + "loss": 2.615, + "step": 8296 + }, + { + "epoch": 0.7510636371865664, + "grad_norm": 0.8507965803146362, + "learning_rate": 1.4250599521944563e-05, + "loss": 2.6345, + "step": 8297 + }, + { + "epoch": 0.7511541595003168, + "grad_norm": 0.799853503704071, + "learning_rate": 1.4234277269643614e-05, + "loss": 2.6021, + "step": 8298 + }, + { + "epoch": 0.7512446818140671, + "grad_norm": 0.8891720771789551, + "learning_rate": 1.4217963653948618e-05, + "loss": 2.5909, + "step": 8299 + }, + { + "epoch": 0.7513352041278175, + "grad_norm": 0.8445886969566345, + "learning_rate": 1.4201658676502294e-05, + "loss": 2.6488, + "step": 8300 + }, + { + "epoch": 0.7514257264415678, + "grad_norm": 0.8394327759742737, + "learning_rate": 1.418536233894664e-05, + "loss": 2.6577, + "step": 8301 + }, + { + "epoch": 0.7515162487553182, + "grad_norm": 0.7951192259788513, + "learning_rate": 1.416907464292262e-05, + "loss": 2.5963, + "step": 8302 + }, + { + "epoch": 0.7516067710690685, + "grad_norm": 0.8702280521392822, + "learning_rate": 1.4152795590070445e-05, + "loss": 2.6286, + "step": 8303 + }, + { + "epoch": 0.7516972933828189, + "grad_norm": 0.8904082179069519, + "learning_rate": 1.4136525182029392e-05, + "loss": 2.6705, + "step": 8304 + }, + { + "epoch": 0.7517878156965692, + "grad_norm": 0.8519523739814758, + "learning_rate": 1.4120263420437919e-05, + "loss": 2.5251, + "step": 8305 + }, + { + "epoch": 0.7518783380103196, + "grad_norm": 0.8387498259544373, + "learning_rate": 1.4104010306933557e-05, + "loss": 2.6455, + "step": 8306 + }, + { + "epoch": 0.7519688603240698, + "grad_norm": 0.8888024091720581, + "learning_rate": 1.4087765843153033e-05, + "loss": 2.6558, + "step": 8307 + }, + { + "epoch": 0.7520593826378202, + "grad_norm": 0.8241297006607056, + "learning_rate": 1.4071530030732095e-05, + "loss": 2.62, + "step": 8308 + }, + { + "epoch": 0.7521499049515705, + "grad_norm": 0.8831976652145386, + "learning_rate": 1.4055302871305775e-05, + "loss": 2.6161, + "step": 8309 + }, + { + "epoch": 0.7522404272653209, + "grad_norm": 0.9573201537132263, + "learning_rate": 1.4039084366508092e-05, + "loss": 2.6222, + "step": 8310 + }, + { + "epoch": 0.7523309495790712, + "grad_norm": 0.8343157768249512, + "learning_rate": 1.4022874517972251e-05, + "loss": 2.6234, + "step": 8311 + }, + { + "epoch": 0.7524214718928216, + "grad_norm": 0.8954187631607056, + "learning_rate": 1.4006673327330589e-05, + "loss": 2.66, + "step": 8312 + }, + { + "epoch": 0.7525119942065719, + "grad_norm": 0.8994336128234863, + "learning_rate": 1.39904807962146e-05, + "loss": 2.6723, + "step": 8313 + }, + { + "epoch": 0.7526025165203223, + "grad_norm": 0.8243401050567627, + "learning_rate": 1.3974296926254794e-05, + "loss": 2.6384, + "step": 8314 + }, + { + "epoch": 0.7526930388340726, + "grad_norm": 0.8115041851997375, + "learning_rate": 1.3958121719080986e-05, + "loss": 2.6071, + "step": 8315 + }, + { + "epoch": 0.752783561147823, + "grad_norm": 0.8584572076797485, + "learning_rate": 1.394195517632193e-05, + "loss": 2.6322, + "step": 8316 + }, + { + "epoch": 0.7528740834615733, + "grad_norm": 0.8973844647407532, + "learning_rate": 1.3925797299605647e-05, + "loss": 2.622, + "step": 8317 + }, + { + "epoch": 0.7529646057753236, + "grad_norm": 0.8394388556480408, + "learning_rate": 1.3909648090559212e-05, + "loss": 2.5859, + "step": 8318 + }, + { + "epoch": 0.7530551280890739, + "grad_norm": 0.8554968237876892, + "learning_rate": 1.389350755080886e-05, + "loss": 2.6101, + "step": 8319 + }, + { + "epoch": 0.7531456504028243, + "grad_norm": 0.8254625797271729, + "learning_rate": 1.3877375681979943e-05, + "loss": 2.609, + "step": 8320 + }, + { + "epoch": 0.7532361727165746, + "grad_norm": 0.9236030578613281, + "learning_rate": 1.3861252485696963e-05, + "loss": 2.7006, + "step": 8321 + }, + { + "epoch": 0.753326695030325, + "grad_norm": 0.9001591801643372, + "learning_rate": 1.3845137963583454e-05, + "loss": 2.5643, + "step": 8322 + }, + { + "epoch": 0.7534172173440753, + "grad_norm": 0.844906210899353, + "learning_rate": 1.3829032117262242e-05, + "loss": 2.6529, + "step": 8323 + }, + { + "epoch": 0.7535077396578257, + "grad_norm": 0.87331622838974, + "learning_rate": 1.3812934948355116e-05, + "loss": 2.6026, + "step": 8324 + }, + { + "epoch": 0.753598261971576, + "grad_norm": 0.814003586769104, + "learning_rate": 1.379684645848307e-05, + "loss": 2.6305, + "step": 8325 + }, + { + "epoch": 0.7536887842853264, + "grad_norm": 0.8583101034164429, + "learning_rate": 1.3780766649266242e-05, + "loss": 2.5818, + "step": 8326 + }, + { + "epoch": 0.7537793065990767, + "grad_norm": 0.792962908744812, + "learning_rate": 1.3764695522323845e-05, + "loss": 2.6312, + "step": 8327 + }, + { + "epoch": 0.7538698289128271, + "grad_norm": 0.8592725396156311, + "learning_rate": 1.3748633079274253e-05, + "loss": 2.546, + "step": 8328 + }, + { + "epoch": 0.7539603512265773, + "grad_norm": 0.873782753944397, + "learning_rate": 1.3732579321734972e-05, + "loss": 2.6973, + "step": 8329 + }, + { + "epoch": 0.7540508735403277, + "grad_norm": 0.8384394645690918, + "learning_rate": 1.3716534251322544e-05, + "loss": 2.6215, + "step": 8330 + }, + { + "epoch": 0.754141395854078, + "grad_norm": 0.9007170796394348, + "learning_rate": 1.3700497869652807e-05, + "loss": 2.6467, + "step": 8331 + }, + { + "epoch": 0.7542319181678284, + "grad_norm": 0.7979040145874023, + "learning_rate": 1.3684470178340548e-05, + "loss": 2.6331, + "step": 8332 + }, + { + "epoch": 0.7543224404815787, + "grad_norm": 0.8360491991043091, + "learning_rate": 1.3668451178999775e-05, + "loss": 2.5763, + "step": 8333 + }, + { + "epoch": 0.7544129627953291, + "grad_norm": 0.8180165886878967, + "learning_rate": 1.3652440873243622e-05, + "loss": 2.5277, + "step": 8334 + }, + { + "epoch": 0.7545034851090794, + "grad_norm": 0.9115409255027771, + "learning_rate": 1.3636439262684298e-05, + "loss": 2.616, + "step": 8335 + }, + { + "epoch": 0.7545940074228298, + "grad_norm": 0.8859723806381226, + "learning_rate": 1.362044634893318e-05, + "loss": 2.6118, + "step": 8336 + }, + { + "epoch": 0.7546845297365801, + "grad_norm": 0.8769333958625793, + "learning_rate": 1.360446213360077e-05, + "loss": 2.6341, + "step": 8337 + }, + { + "epoch": 0.7547750520503304, + "grad_norm": 0.8497744202613831, + "learning_rate": 1.3588486618296615e-05, + "loss": 2.6316, + "step": 8338 + }, + { + "epoch": 0.7548655743640807, + "grad_norm": 0.8281365633010864, + "learning_rate": 1.3572519804629536e-05, + "loss": 2.6588, + "step": 8339 + }, + { + "epoch": 0.754956096677831, + "grad_norm": 0.888231635093689, + "learning_rate": 1.3556561694207338e-05, + "loss": 2.6359, + "step": 8340 + }, + { + "epoch": 0.7550466189915814, + "grad_norm": 0.893904447555542, + "learning_rate": 1.3540612288637e-05, + "loss": 2.5951, + "step": 8341 + }, + { + "epoch": 0.7551371413053317, + "grad_norm": 0.8344972729682922, + "learning_rate": 1.352467158952464e-05, + "loss": 2.5994, + "step": 8342 + }, + { + "epoch": 0.7552276636190821, + "grad_norm": 0.8195839524269104, + "learning_rate": 1.3508739598475506e-05, + "loss": 2.6477, + "step": 8343 + }, + { + "epoch": 0.7553181859328324, + "grad_norm": 0.8574609160423279, + "learning_rate": 1.3492816317093893e-05, + "loss": 2.7042, + "step": 8344 + }, + { + "epoch": 0.7554087082465828, + "grad_norm": 0.846380352973938, + "learning_rate": 1.3476901746983351e-05, + "loss": 2.5925, + "step": 8345 + }, + { + "epoch": 0.7554992305603331, + "grad_norm": 0.8791022300720215, + "learning_rate": 1.3460995889746408e-05, + "loss": 2.6581, + "step": 8346 + }, + { + "epoch": 0.7555897528740835, + "grad_norm": 0.8232809901237488, + "learning_rate": 1.3445098746984808e-05, + "loss": 2.6401, + "step": 8347 + }, + { + "epoch": 0.7556802751878338, + "grad_norm": 0.814947247505188, + "learning_rate": 1.3429210320299401e-05, + "loss": 2.6589, + "step": 8348 + }, + { + "epoch": 0.7557707975015842, + "grad_norm": 0.8401846885681152, + "learning_rate": 1.3413330611290154e-05, + "loss": 2.6416, + "step": 8349 + }, + { + "epoch": 0.7558613198153344, + "grad_norm": 0.8400335907936096, + "learning_rate": 1.339745962155613e-05, + "loss": 2.6249, + "step": 8350 + }, + { + "epoch": 0.7559518421290848, + "grad_norm": 0.8805189728736877, + "learning_rate": 1.3381597352695586e-05, + "loss": 2.635, + "step": 8351 + }, + { + "epoch": 0.7560423644428351, + "grad_norm": 0.8336710929870605, + "learning_rate": 1.3365743806305774e-05, + "loss": 2.6311, + "step": 8352 + }, + { + "epoch": 0.7561328867565855, + "grad_norm": 0.9214432835578918, + "learning_rate": 1.3349898983983233e-05, + "loss": 2.6312, + "step": 8353 + }, + { + "epoch": 0.7562234090703358, + "grad_norm": 0.8631122708320618, + "learning_rate": 1.3334062887323461e-05, + "loss": 2.628, + "step": 8354 + }, + { + "epoch": 0.7563139313840862, + "grad_norm": 0.8307778835296631, + "learning_rate": 1.3318235517921196e-05, + "loss": 2.6126, + "step": 8355 + }, + { + "epoch": 0.7564044536978365, + "grad_norm": 0.902026355266571, + "learning_rate": 1.3302416877370239e-05, + "loss": 2.7022, + "step": 8356 + }, + { + "epoch": 0.7564949760115869, + "grad_norm": 0.8334587812423706, + "learning_rate": 1.3286606967263537e-05, + "loss": 2.6638, + "step": 8357 + }, + { + "epoch": 0.7565854983253372, + "grad_norm": 0.8361995816230774, + "learning_rate": 1.327080578919313e-05, + "loss": 2.6521, + "step": 8358 + }, + { + "epoch": 0.7566760206390876, + "grad_norm": 0.8585364818572998, + "learning_rate": 1.3255013344750233e-05, + "loss": 2.6088, + "step": 8359 + }, + { + "epoch": 0.7567665429528379, + "grad_norm": 0.8744394779205322, + "learning_rate": 1.3239229635525074e-05, + "loss": 2.6148, + "step": 8360 + }, + { + "epoch": 0.7568570652665882, + "grad_norm": 0.8813425898551941, + "learning_rate": 1.3223454663107172e-05, + "loss": 2.6994, + "step": 8361 + }, + { + "epoch": 0.7569475875803385, + "grad_norm": 0.8385540843009949, + "learning_rate": 1.3207688429084974e-05, + "loss": 2.6528, + "step": 8362 + }, + { + "epoch": 0.7570381098940889, + "grad_norm": 0.852018415927887, + "learning_rate": 1.3191930935046181e-05, + "loss": 2.6398, + "step": 8363 + }, + { + "epoch": 0.7571286322078392, + "grad_norm": 0.8620157241821289, + "learning_rate": 1.3176182182577567e-05, + "loss": 2.624, + "step": 8364 + }, + { + "epoch": 0.7572191545215896, + "grad_norm": 0.8437519669532776, + "learning_rate": 1.3160442173265031e-05, + "loss": 2.6425, + "step": 8365 + }, + { + "epoch": 0.7573096768353399, + "grad_norm": 1.0246024131774902, + "learning_rate": 1.3144710908693602e-05, + "loss": 2.6969, + "step": 8366 + }, + { + "epoch": 0.7574001991490903, + "grad_norm": 0.8751848340034485, + "learning_rate": 1.3128988390447416e-05, + "loss": 2.6285, + "step": 8367 + }, + { + "epoch": 0.7574907214628406, + "grad_norm": 0.8701299428939819, + "learning_rate": 1.3113274620109684e-05, + "loss": 2.6615, + "step": 8368 + }, + { + "epoch": 0.757581243776591, + "grad_norm": 0.9776567816734314, + "learning_rate": 1.3097569599262871e-05, + "loss": 2.6625, + "step": 8369 + }, + { + "epoch": 0.7576717660903413, + "grad_norm": 0.9123702049255371, + "learning_rate": 1.3081873329488392e-05, + "loss": 2.6172, + "step": 8370 + }, + { + "epoch": 0.7577622884040917, + "grad_norm": 0.8895208835601807, + "learning_rate": 1.3066185812366883e-05, + "loss": 2.6311, + "step": 8371 + }, + { + "epoch": 0.7578528107178419, + "grad_norm": 0.841701328754425, + "learning_rate": 1.30505070494781e-05, + "loss": 2.6267, + "step": 8372 + }, + { + "epoch": 0.7579433330315923, + "grad_norm": 0.8388078808784485, + "learning_rate": 1.3034837042400894e-05, + "loss": 2.6393, + "step": 8373 + }, + { + "epoch": 0.7580338553453426, + "grad_norm": 0.9033432602882385, + "learning_rate": 1.3019175792713167e-05, + "loss": 2.6579, + "step": 8374 + }, + { + "epoch": 0.758124377659093, + "grad_norm": 0.8371830582618713, + "learning_rate": 1.3003523301992104e-05, + "loss": 2.6308, + "step": 8375 + }, + { + "epoch": 0.7582148999728433, + "grad_norm": 0.8309195041656494, + "learning_rate": 1.2987879571813854e-05, + "loss": 2.6421, + "step": 8376 + }, + { + "epoch": 0.7583054222865937, + "grad_norm": 0.8373768329620361, + "learning_rate": 1.2972244603753736e-05, + "loss": 2.6414, + "step": 8377 + }, + { + "epoch": 0.758395944600344, + "grad_norm": 0.8145757913589478, + "learning_rate": 1.295661839938621e-05, + "loss": 2.6132, + "step": 8378 + }, + { + "epoch": 0.7584864669140943, + "grad_norm": 0.8609750866889954, + "learning_rate": 1.2941000960284832e-05, + "loss": 2.6015, + "step": 8379 + }, + { + "epoch": 0.7585769892278447, + "grad_norm": 0.8729307055473328, + "learning_rate": 1.2925392288022298e-05, + "loss": 2.6121, + "step": 8380 + }, + { + "epoch": 0.758667511541595, + "grad_norm": 0.9552813768386841, + "learning_rate": 1.290979238417035e-05, + "loss": 2.6482, + "step": 8381 + }, + { + "epoch": 0.7587580338553453, + "grad_norm": 0.8189707398414612, + "learning_rate": 1.289420125029993e-05, + "loss": 2.6284, + "step": 8382 + }, + { + "epoch": 0.7588485561690956, + "grad_norm": 0.8567572832107544, + "learning_rate": 1.2878618887981064e-05, + "loss": 2.642, + "step": 8383 + }, + { + "epoch": 0.758939078482846, + "grad_norm": 0.8377892374992371, + "learning_rate": 1.2863045298782895e-05, + "loss": 2.6071, + "step": 8384 + }, + { + "epoch": 0.7590296007965963, + "grad_norm": 0.903947114944458, + "learning_rate": 1.2847480484273666e-05, + "loss": 2.6266, + "step": 8385 + }, + { + "epoch": 0.7591201231103467, + "grad_norm": 0.8137821555137634, + "learning_rate": 1.2831924446020805e-05, + "loss": 2.6096, + "step": 8386 + }, + { + "epoch": 0.759210645424097, + "grad_norm": 0.935057520866394, + "learning_rate": 1.2816377185590711e-05, + "loss": 2.5789, + "step": 8387 + }, + { + "epoch": 0.7593011677378474, + "grad_norm": 0.9254269003868103, + "learning_rate": 1.28008387045491e-05, + "loss": 2.7207, + "step": 8388 + }, + { + "epoch": 0.7593916900515977, + "grad_norm": 0.8800764679908752, + "learning_rate": 1.2785309004460622e-05, + "loss": 2.6285, + "step": 8389 + }, + { + "epoch": 0.7594822123653481, + "grad_norm": 0.8191325664520264, + "learning_rate": 1.2769788086889134e-05, + "loss": 2.6175, + "step": 8390 + }, + { + "epoch": 0.7595727346790984, + "grad_norm": 0.9105868935585022, + "learning_rate": 1.275427595339761e-05, + "loss": 2.7079, + "step": 8391 + }, + { + "epoch": 0.7596632569928488, + "grad_norm": 0.90646892786026, + "learning_rate": 1.27387726055481e-05, + "loss": 2.6461, + "step": 8392 + }, + { + "epoch": 0.759753779306599, + "grad_norm": 0.8635348677635193, + "learning_rate": 1.272327804490181e-05, + "loss": 2.6116, + "step": 8393 + }, + { + "epoch": 0.7598443016203494, + "grad_norm": 0.8347007036209106, + "learning_rate": 1.2707792273019048e-05, + "loss": 2.6086, + "step": 8394 + }, + { + "epoch": 0.7599348239340997, + "grad_norm": 0.8188871741294861, + "learning_rate": 1.2692315291459178e-05, + "loss": 2.6129, + "step": 8395 + }, + { + "epoch": 0.7600253462478501, + "grad_norm": 0.8775956034660339, + "learning_rate": 1.267684710178081e-05, + "loss": 2.6255, + "step": 8396 + }, + { + "epoch": 0.7601158685616004, + "grad_norm": 0.8294050097465515, + "learning_rate": 1.2661387705541528e-05, + "loss": 2.6402, + "step": 8397 + }, + { + "epoch": 0.7602063908753508, + "grad_norm": 0.8578444123268127, + "learning_rate": 1.2645937104298111e-05, + "loss": 2.6559, + "step": 8398 + }, + { + "epoch": 0.7602969131891011, + "grad_norm": 0.8342133164405823, + "learning_rate": 1.2630495299606448e-05, + "loss": 2.6107, + "step": 8399 + }, + { + "epoch": 0.7603874355028515, + "grad_norm": 0.8250716328620911, + "learning_rate": 1.2615062293021507e-05, + "loss": 2.6742, + "step": 8400 + }, + { + "epoch": 0.7604779578166018, + "grad_norm": 0.8013415932655334, + "learning_rate": 1.2599638086097399e-05, + "loss": 2.5804, + "step": 8401 + }, + { + "epoch": 0.7605684801303522, + "grad_norm": 0.9561328291893005, + "learning_rate": 1.2584222680387381e-05, + "loss": 2.5898, + "step": 8402 + }, + { + "epoch": 0.7606590024441025, + "grad_norm": 0.8616392016410828, + "learning_rate": 1.2568816077443712e-05, + "loss": 2.5695, + "step": 8403 + }, + { + "epoch": 0.7607495247578528, + "grad_norm": 0.8710083365440369, + "learning_rate": 1.2553418278817874e-05, + "loss": 2.6597, + "step": 8404 + }, + { + "epoch": 0.7608400470716031, + "grad_norm": 0.926058292388916, + "learning_rate": 1.2538029286060426e-05, + "loss": 2.6538, + "step": 8405 + }, + { + "epoch": 0.7609305693853535, + "grad_norm": 0.9008700251579285, + "learning_rate": 1.252264910072104e-05, + "loss": 2.6652, + "step": 8406 + }, + { + "epoch": 0.7610210916991038, + "grad_norm": 0.8355394601821899, + "learning_rate": 1.2507277724348487e-05, + "loss": 2.5824, + "step": 8407 + }, + { + "epoch": 0.7611116140128542, + "grad_norm": 0.9210795760154724, + "learning_rate": 1.2491915158490697e-05, + "loss": 2.5588, + "step": 8408 + }, + { + "epoch": 0.7612021363266045, + "grad_norm": 0.8426117897033691, + "learning_rate": 1.247656140469462e-05, + "loss": 2.6513, + "step": 8409 + }, + { + "epoch": 0.7612926586403549, + "grad_norm": 0.8535647988319397, + "learning_rate": 1.2461216464506454e-05, + "loss": 2.6681, + "step": 8410 + }, + { + "epoch": 0.7613831809541052, + "grad_norm": 0.9873753786087036, + "learning_rate": 1.2445880339471372e-05, + "loss": 2.615, + "step": 8411 + }, + { + "epoch": 0.7614737032678556, + "grad_norm": 0.8945762515068054, + "learning_rate": 1.243055303113374e-05, + "loss": 2.6129, + "step": 8412 + }, + { + "epoch": 0.7615642255816059, + "grad_norm": 0.9072584509849548, + "learning_rate": 1.2415234541037024e-05, + "loss": 2.573, + "step": 8413 + }, + { + "epoch": 0.7616547478953563, + "grad_norm": 0.8211448192596436, + "learning_rate": 1.23999248707238e-05, + "loss": 2.5749, + "step": 8414 + }, + { + "epoch": 0.7617452702091065, + "grad_norm": 0.9077050685882568, + "learning_rate": 1.2384624021735735e-05, + "loss": 2.692, + "step": 8415 + }, + { + "epoch": 0.7618357925228569, + "grad_norm": 0.8252149224281311, + "learning_rate": 1.2369331995613665e-05, + "loss": 2.5867, + "step": 8416 + }, + { + "epoch": 0.7619263148366072, + "grad_norm": 0.9161756634712219, + "learning_rate": 1.235404879389741e-05, + "loss": 2.6664, + "step": 8417 + }, + { + "epoch": 0.7620168371503576, + "grad_norm": 0.9396390318870544, + "learning_rate": 1.2338774418126098e-05, + "loss": 2.6342, + "step": 8418 + }, + { + "epoch": 0.7621073594641079, + "grad_norm": 0.8113436102867126, + "learning_rate": 1.2323508869837775e-05, + "loss": 2.6522, + "step": 8419 + }, + { + "epoch": 0.7621978817778582, + "grad_norm": 0.9393036961555481, + "learning_rate": 1.230825215056971e-05, + "loss": 2.6139, + "step": 8420 + }, + { + "epoch": 0.7622884040916086, + "grad_norm": 0.8151003122329712, + "learning_rate": 1.2293004261858254e-05, + "loss": 2.6219, + "step": 8421 + }, + { + "epoch": 0.7623789264053589, + "grad_norm": 0.865507960319519, + "learning_rate": 1.2277765205238879e-05, + "loss": 2.6003, + "step": 8422 + }, + { + "epoch": 0.7624694487191093, + "grad_norm": 0.8855223059654236, + "learning_rate": 1.2262534982246132e-05, + "loss": 2.6697, + "step": 8423 + }, + { + "epoch": 0.7625599710328596, + "grad_norm": 0.9732133150100708, + "learning_rate": 1.2247313594413746e-05, + "loss": 2.5585, + "step": 8424 + }, + { + "epoch": 0.76265049334661, + "grad_norm": 0.8758246898651123, + "learning_rate": 1.2232101043274436e-05, + "loss": 2.6185, + "step": 8425 + }, + { + "epoch": 0.7627410156603602, + "grad_norm": 0.8588250279426575, + "learning_rate": 1.2216897330360178e-05, + "loss": 2.6275, + "step": 8426 + }, + { + "epoch": 0.7628315379741106, + "grad_norm": 0.9134284853935242, + "learning_rate": 1.2201702457201947e-05, + "loss": 2.7113, + "step": 8427 + }, + { + "epoch": 0.7629220602878609, + "grad_norm": 0.8985430598258972, + "learning_rate": 1.2186516425329874e-05, + "loss": 2.6369, + "step": 8428 + }, + { + "epoch": 0.7630125826016113, + "grad_norm": 0.8405027389526367, + "learning_rate": 1.2171339236273205e-05, + "loss": 2.613, + "step": 8429 + }, + { + "epoch": 0.7631031049153616, + "grad_norm": 0.8441949486732483, + "learning_rate": 1.2156170891560258e-05, + "loss": 2.6233, + "step": 8430 + }, + { + "epoch": 0.763193627229112, + "grad_norm": 0.84303218126297, + "learning_rate": 1.2141011392718504e-05, + "loss": 2.5981, + "step": 8431 + }, + { + "epoch": 0.7632841495428623, + "grad_norm": 0.9115005731582642, + "learning_rate": 1.212586074127452e-05, + "loss": 2.6523, + "step": 8432 + }, + { + "epoch": 0.7633746718566127, + "grad_norm": 0.8431458473205566, + "learning_rate": 1.2110718938753907e-05, + "loss": 2.6375, + "step": 8433 + }, + { + "epoch": 0.763465194170363, + "grad_norm": 0.9723434448242188, + "learning_rate": 1.2095585986681535e-05, + "loss": 2.6754, + "step": 8434 + }, + { + "epoch": 0.7635557164841134, + "grad_norm": 0.8683786392211914, + "learning_rate": 1.208046188658124e-05, + "loss": 2.6303, + "step": 8435 + }, + { + "epoch": 0.7636462387978636, + "grad_norm": 0.8498546481132507, + "learning_rate": 1.2065346639976016e-05, + "loss": 2.5578, + "step": 8436 + }, + { + "epoch": 0.763736761111614, + "grad_norm": 0.9147982001304626, + "learning_rate": 1.2050240248387978e-05, + "loss": 2.6561, + "step": 8437 + }, + { + "epoch": 0.7638272834253643, + "grad_norm": 0.8597583174705505, + "learning_rate": 1.2035142713338366e-05, + "loss": 2.5931, + "step": 8438 + }, + { + "epoch": 0.7639178057391147, + "grad_norm": 0.9011256694793701, + "learning_rate": 1.2020054036347439e-05, + "loss": 2.5613, + "step": 8439 + }, + { + "epoch": 0.764008328052865, + "grad_norm": 0.8712127804756165, + "learning_rate": 1.2004974218934695e-05, + "loss": 2.6386, + "step": 8440 + }, + { + "epoch": 0.7640988503666154, + "grad_norm": 0.8568911552429199, + "learning_rate": 1.1989903262618628e-05, + "loss": 2.6678, + "step": 8441 + }, + { + "epoch": 0.7641893726803657, + "grad_norm": 0.8371545076370239, + "learning_rate": 1.1974841168916884e-05, + "loss": 2.6751, + "step": 8442 + }, + { + "epoch": 0.7642798949941161, + "grad_norm": 0.8238829374313354, + "learning_rate": 1.1959787939346235e-05, + "loss": 2.6016, + "step": 8443 + }, + { + "epoch": 0.7643704173078664, + "grad_norm": 0.868220329284668, + "learning_rate": 1.1944743575422524e-05, + "loss": 2.5867, + "step": 8444 + }, + { + "epoch": 0.7644609396216168, + "grad_norm": 0.8624018430709839, + "learning_rate": 1.1929708078660728e-05, + "loss": 2.6194, + "step": 8445 + }, + { + "epoch": 0.764551461935367, + "grad_norm": 0.8850862979888916, + "learning_rate": 1.1914681450574949e-05, + "loss": 2.619, + "step": 8446 + }, + { + "epoch": 0.7646419842491174, + "grad_norm": 0.8665874600410461, + "learning_rate": 1.1899663692678287e-05, + "loss": 2.5906, + "step": 8447 + }, + { + "epoch": 0.7647325065628677, + "grad_norm": 0.8597021698951721, + "learning_rate": 1.188465480648312e-05, + "loss": 2.6345, + "step": 8448 + }, + { + "epoch": 0.7648230288766181, + "grad_norm": 0.8657194972038269, + "learning_rate": 1.1869654793500784e-05, + "loss": 2.6515, + "step": 8449 + }, + { + "epoch": 0.7649135511903684, + "grad_norm": 0.8500573635101318, + "learning_rate": 1.1854663655241805e-05, + "loss": 2.7176, + "step": 8450 + }, + { + "epoch": 0.7650040735041188, + "grad_norm": 0.8615239262580872, + "learning_rate": 1.1839681393215773e-05, + "loss": 2.56, + "step": 8451 + }, + { + "epoch": 0.7650945958178691, + "grad_norm": 0.7915595769882202, + "learning_rate": 1.1824708008931418e-05, + "loss": 2.5786, + "step": 8452 + }, + { + "epoch": 0.7651851181316195, + "grad_norm": 0.8926154971122742, + "learning_rate": 1.1809743503896554e-05, + "loss": 2.5991, + "step": 8453 + }, + { + "epoch": 0.7652756404453698, + "grad_norm": 0.8740158677101135, + "learning_rate": 1.179478787961813e-05, + "loss": 2.624, + "step": 8454 + }, + { + "epoch": 0.7653661627591202, + "grad_norm": 0.8418128490447998, + "learning_rate": 1.177984113760211e-05, + "loss": 2.5846, + "step": 8455 + }, + { + "epoch": 0.7654566850728705, + "grad_norm": 0.8274219632148743, + "learning_rate": 1.1764903279353712e-05, + "loss": 2.5793, + "step": 8456 + }, + { + "epoch": 0.7655472073866209, + "grad_norm": 0.8460656404495239, + "learning_rate": 1.1749974306377121e-05, + "loss": 2.5991, + "step": 8457 + }, + { + "epoch": 0.7656377297003711, + "grad_norm": 0.8882750868797302, + "learning_rate": 1.1735054220175711e-05, + "loss": 2.6413, + "step": 8458 + }, + { + "epoch": 0.7657282520141215, + "grad_norm": 0.895829975605011, + "learning_rate": 1.1720143022251917e-05, + "loss": 2.6272, + "step": 8459 + }, + { + "epoch": 0.7658187743278718, + "grad_norm": 0.8516156673431396, + "learning_rate": 1.1705240714107302e-05, + "loss": 2.635, + "step": 8460 + }, + { + "epoch": 0.7659092966416221, + "grad_norm": 0.8761586546897888, + "learning_rate": 1.1690347297242532e-05, + "loss": 2.6001, + "step": 8461 + }, + { + "epoch": 0.7659998189553725, + "grad_norm": 0.8189490437507629, + "learning_rate": 1.1675462773157398e-05, + "loss": 2.6178, + "step": 8462 + }, + { + "epoch": 0.7660903412691228, + "grad_norm": 0.8238328695297241, + "learning_rate": 1.166058714335071e-05, + "loss": 2.6499, + "step": 8463 + }, + { + "epoch": 0.7661808635828732, + "grad_norm": 0.8749627470970154, + "learning_rate": 1.1645720409320504e-05, + "loss": 2.6602, + "step": 8464 + }, + { + "epoch": 0.7662713858966235, + "grad_norm": 0.8527346253395081, + "learning_rate": 1.163086257256385e-05, + "loss": 2.6257, + "step": 8465 + }, + { + "epoch": 0.7663619082103739, + "grad_norm": 0.7932024002075195, + "learning_rate": 1.1616013634576873e-05, + "loss": 2.5665, + "step": 8466 + }, + { + "epoch": 0.7664524305241242, + "grad_norm": 0.8431447148323059, + "learning_rate": 1.1601173596854942e-05, + "loss": 2.698, + "step": 8467 + }, + { + "epoch": 0.7665429528378745, + "grad_norm": 0.8905431628227234, + "learning_rate": 1.1586342460892396e-05, + "loss": 2.5765, + "step": 8468 + }, + { + "epoch": 0.7666334751516248, + "grad_norm": 0.8475296497344971, + "learning_rate": 1.1571520228182742e-05, + "loss": 2.622, + "step": 8469 + }, + { + "epoch": 0.7667239974653752, + "grad_norm": 0.8639915585517883, + "learning_rate": 1.1556706900218572e-05, + "loss": 2.6325, + "step": 8470 + }, + { + "epoch": 0.7668145197791255, + "grad_norm": 0.8727537989616394, + "learning_rate": 1.1541902478491606e-05, + "loss": 2.6259, + "step": 8471 + }, + { + "epoch": 0.7669050420928759, + "grad_norm": 0.8499577045440674, + "learning_rate": 1.152710696449264e-05, + "loss": 2.627, + "step": 8472 + }, + { + "epoch": 0.7669955644066262, + "grad_norm": 0.7941082715988159, + "learning_rate": 1.1512320359711604e-05, + "loss": 2.6087, + "step": 8473 + }, + { + "epoch": 0.7670860867203766, + "grad_norm": 0.873528242111206, + "learning_rate": 1.1497542665637451e-05, + "loss": 2.5909, + "step": 8474 + }, + { + "epoch": 0.7671766090341269, + "grad_norm": 0.820582926273346, + "learning_rate": 1.1482773883758357e-05, + "loss": 2.6254, + "step": 8475 + }, + { + "epoch": 0.7672671313478773, + "grad_norm": 0.9067497253417969, + "learning_rate": 1.14680140155615e-05, + "loss": 2.6378, + "step": 8476 + }, + { + "epoch": 0.7673576536616276, + "grad_norm": 0.804765522480011, + "learning_rate": 1.1453263062533203e-05, + "loss": 2.6179, + "step": 8477 + }, + { + "epoch": 0.767448175975378, + "grad_norm": 0.8286309838294983, + "learning_rate": 1.14385210261589e-05, + "loss": 2.6059, + "step": 8478 + }, + { + "epoch": 0.7675386982891282, + "grad_norm": 0.8492158055305481, + "learning_rate": 1.14237879079231e-05, + "loss": 2.6408, + "step": 8479 + }, + { + "epoch": 0.7676292206028786, + "grad_norm": 0.8288713097572327, + "learning_rate": 1.1409063709309442e-05, + "loss": 2.5898, + "step": 8480 + }, + { + "epoch": 0.7677197429166289, + "grad_norm": 0.842309832572937, + "learning_rate": 1.139434843180066e-05, + "loss": 2.6221, + "step": 8481 + }, + { + "epoch": 0.7678102652303793, + "grad_norm": 0.9057137370109558, + "learning_rate": 1.1379642076878527e-05, + "loss": 2.6263, + "step": 8482 + }, + { + "epoch": 0.7679007875441296, + "grad_norm": 0.8420442938804626, + "learning_rate": 1.1364944646024045e-05, + "loss": 2.6164, + "step": 8483 + }, + { + "epoch": 0.76799130985788, + "grad_norm": 0.8523637056350708, + "learning_rate": 1.1350256140717197e-05, + "loss": 2.5767, + "step": 8484 + }, + { + "epoch": 0.7680818321716303, + "grad_norm": 0.9101349115371704, + "learning_rate": 1.1335576562437134e-05, + "loss": 2.6628, + "step": 8485 + }, + { + "epoch": 0.7681723544853807, + "grad_norm": 0.8948091268539429, + "learning_rate": 1.1320905912662084e-05, + "loss": 2.6112, + "step": 8486 + }, + { + "epoch": 0.768262876799131, + "grad_norm": 0.8722454905509949, + "learning_rate": 1.1306244192869386e-05, + "loss": 2.6569, + "step": 8487 + }, + { + "epoch": 0.7683533991128814, + "grad_norm": 0.8330963253974915, + "learning_rate": 1.1291591404535462e-05, + "loss": 2.6174, + "step": 8488 + }, + { + "epoch": 0.7684439214266316, + "grad_norm": 0.8809168934822083, + "learning_rate": 1.1276947549135875e-05, + "loss": 2.6485, + "step": 8489 + }, + { + "epoch": 0.768534443740382, + "grad_norm": 0.8144750595092773, + "learning_rate": 1.126231262814521e-05, + "loss": 2.608, + "step": 8490 + }, + { + "epoch": 0.7686249660541323, + "grad_norm": 0.8324765563011169, + "learning_rate": 1.124768664303727e-05, + "loss": 2.6431, + "step": 8491 + }, + { + "epoch": 0.7687154883678827, + "grad_norm": 0.8306261301040649, + "learning_rate": 1.123306959528484e-05, + "loss": 2.6201, + "step": 8492 + }, + { + "epoch": 0.768806010681633, + "grad_norm": 0.8392279148101807, + "learning_rate": 1.1218461486359877e-05, + "loss": 2.6706, + "step": 8493 + }, + { + "epoch": 0.7688965329953834, + "grad_norm": 0.8646116256713867, + "learning_rate": 1.1203862317733404e-05, + "loss": 2.6032, + "step": 8494 + }, + { + "epoch": 0.7689870553091337, + "grad_norm": 0.8765877485275269, + "learning_rate": 1.1189272090875591e-05, + "loss": 2.6159, + "step": 8495 + }, + { + "epoch": 0.7690775776228841, + "grad_norm": 0.8648416996002197, + "learning_rate": 1.1174690807255605e-05, + "loss": 2.6345, + "step": 8496 + }, + { + "epoch": 0.7691680999366344, + "grad_norm": 0.8127090334892273, + "learning_rate": 1.1160118468341863e-05, + "loss": 2.6969, + "step": 8497 + }, + { + "epoch": 0.7692586222503848, + "grad_norm": 0.9276617169380188, + "learning_rate": 1.1145555075601743e-05, + "loss": 2.6525, + "step": 8498 + }, + { + "epoch": 0.7693491445641351, + "grad_norm": 0.8718293905258179, + "learning_rate": 1.1131000630501797e-05, + "loss": 2.6579, + "step": 8499 + }, + { + "epoch": 0.7694396668778855, + "grad_norm": 0.8407883048057556, + "learning_rate": 1.1116455134507664e-05, + "loss": 2.6185, + "step": 8500 + }, + { + "epoch": 0.7695301891916357, + "grad_norm": 0.8548136353492737, + "learning_rate": 1.110191858908406e-05, + "loss": 2.6078, + "step": 8501 + }, + { + "epoch": 0.769620711505386, + "grad_norm": 0.8696662783622742, + "learning_rate": 1.1087390995694813e-05, + "loss": 2.6446, + "step": 8502 + }, + { + "epoch": 0.7697112338191364, + "grad_norm": 0.8202139139175415, + "learning_rate": 1.1072872355802888e-05, + "loss": 2.5884, + "step": 8503 + }, + { + "epoch": 0.7698017561328867, + "grad_norm": 0.7680681347846985, + "learning_rate": 1.1058362670870249e-05, + "loss": 2.5989, + "step": 8504 + }, + { + "epoch": 0.7698922784466371, + "grad_norm": 0.8208710551261902, + "learning_rate": 1.104386194235808e-05, + "loss": 2.613, + "step": 8505 + }, + { + "epoch": 0.7699828007603874, + "grad_norm": 0.9006401300430298, + "learning_rate": 1.1029370171726571e-05, + "loss": 2.6549, + "step": 8506 + }, + { + "epoch": 0.7700733230741378, + "grad_norm": 0.8165719509124756, + "learning_rate": 1.1014887360435045e-05, + "loss": 2.6377, + "step": 8507 + }, + { + "epoch": 0.7701638453878881, + "grad_norm": 0.8723152875900269, + "learning_rate": 1.1000413509941931e-05, + "loss": 2.6185, + "step": 8508 + }, + { + "epoch": 0.7702543677016385, + "grad_norm": 0.8101234436035156, + "learning_rate": 1.0985948621704745e-05, + "loss": 2.6072, + "step": 8509 + }, + { + "epoch": 0.7703448900153888, + "grad_norm": 0.8769212961196899, + "learning_rate": 1.0971492697180096e-05, + "loss": 2.6771, + "step": 8510 + }, + { + "epoch": 0.7704354123291391, + "grad_norm": 0.9129467606544495, + "learning_rate": 1.0957045737823712e-05, + "loss": 2.6916, + "step": 8511 + }, + { + "epoch": 0.7705259346428894, + "grad_norm": 0.828231155872345, + "learning_rate": 1.094260774509035e-05, + "loss": 2.5834, + "step": 8512 + }, + { + "epoch": 0.7706164569566398, + "grad_norm": 0.9102683663368225, + "learning_rate": 1.092817872043399e-05, + "loss": 2.6544, + "step": 8513 + }, + { + "epoch": 0.7707069792703901, + "grad_norm": 0.9233009219169617, + "learning_rate": 1.0913758665307572e-05, + "loss": 2.6324, + "step": 8514 + }, + { + "epoch": 0.7707975015841405, + "grad_norm": 0.8687834143638611, + "learning_rate": 1.0899347581163221e-05, + "loss": 2.6166, + "step": 8515 + }, + { + "epoch": 0.7708880238978908, + "grad_norm": 0.8975607752799988, + "learning_rate": 1.0884945469452124e-05, + "loss": 2.6041, + "step": 8516 + }, + { + "epoch": 0.7709785462116412, + "grad_norm": 0.9307120442390442, + "learning_rate": 1.0870552331624594e-05, + "loss": 2.5894, + "step": 8517 + }, + { + "epoch": 0.7710690685253915, + "grad_norm": 0.7821042537689209, + "learning_rate": 1.085616816913e-05, + "loss": 2.5775, + "step": 8518 + }, + { + "epoch": 0.7711595908391419, + "grad_norm": 0.8686316609382629, + "learning_rate": 1.0841792983416842e-05, + "loss": 2.6168, + "step": 8519 + }, + { + "epoch": 0.7712501131528922, + "grad_norm": 0.8629463315010071, + "learning_rate": 1.0827426775932658e-05, + "loss": 2.6129, + "step": 8520 + }, + { + "epoch": 0.7713406354666426, + "grad_norm": 0.8287545442581177, + "learning_rate": 1.081306954812421e-05, + "loss": 2.6252, + "step": 8521 + }, + { + "epoch": 0.7714311577803928, + "grad_norm": 0.8263203501701355, + "learning_rate": 1.0798721301437187e-05, + "loss": 2.6089, + "step": 8522 + }, + { + "epoch": 0.7715216800941432, + "grad_norm": 0.8079046607017517, + "learning_rate": 1.0784382037316487e-05, + "loss": 2.5681, + "step": 8523 + }, + { + "epoch": 0.7716122024078935, + "grad_norm": 0.8208576440811157, + "learning_rate": 1.0770051757206079e-05, + "loss": 2.5862, + "step": 8524 + }, + { + "epoch": 0.7717027247216439, + "grad_norm": 0.8792392611503601, + "learning_rate": 1.0755730462549007e-05, + "loss": 2.6873, + "step": 8525 + }, + { + "epoch": 0.7717932470353942, + "grad_norm": 0.8938965797424316, + "learning_rate": 1.0741418154787442e-05, + "loss": 2.5957, + "step": 8526 + }, + { + "epoch": 0.7718837693491446, + "grad_norm": 0.850585401058197, + "learning_rate": 1.072711483536265e-05, + "loss": 2.5936, + "step": 8527 + }, + { + "epoch": 0.7719742916628949, + "grad_norm": 0.8643617630004883, + "learning_rate": 1.0712820505714916e-05, + "loss": 2.6053, + "step": 8528 + }, + { + "epoch": 0.7720648139766453, + "grad_norm": 0.8208019733428955, + "learning_rate": 1.069853516728374e-05, + "loss": 2.621, + "step": 8529 + }, + { + "epoch": 0.7721553362903956, + "grad_norm": 0.820443868637085, + "learning_rate": 1.068425882150762e-05, + "loss": 2.6543, + "step": 8530 + }, + { + "epoch": 0.772245858604146, + "grad_norm": 0.810172438621521, + "learning_rate": 1.0669991469824192e-05, + "loss": 2.5902, + "step": 8531 + }, + { + "epoch": 0.7723363809178962, + "grad_norm": 0.8411741256713867, + "learning_rate": 1.0655733113670174e-05, + "loss": 2.5858, + "step": 8532 + }, + { + "epoch": 0.7724269032316466, + "grad_norm": 0.8169674873352051, + "learning_rate": 1.0641483754481408e-05, + "loss": 2.6641, + "step": 8533 + }, + { + "epoch": 0.7725174255453969, + "grad_norm": 0.7835273146629333, + "learning_rate": 1.0627243393692743e-05, + "loss": 2.5762, + "step": 8534 + }, + { + "epoch": 0.7726079478591473, + "grad_norm": 0.8190977573394775, + "learning_rate": 1.0613012032738268e-05, + "loss": 2.6175, + "step": 8535 + }, + { + "epoch": 0.7726984701728976, + "grad_norm": 0.8712911605834961, + "learning_rate": 1.0598789673051014e-05, + "loss": 2.6005, + "step": 8536 + }, + { + "epoch": 0.772788992486648, + "grad_norm": 0.8913556933403015, + "learning_rate": 1.0584576316063188e-05, + "loss": 2.6676, + "step": 8537 + }, + { + "epoch": 0.7728795148003983, + "grad_norm": 0.8455060720443726, + "learning_rate": 1.0570371963206083e-05, + "loss": 2.6582, + "step": 8538 + }, + { + "epoch": 0.7729700371141487, + "grad_norm": 0.801141619682312, + "learning_rate": 1.0556176615910074e-05, + "loss": 2.6232, + "step": 8539 + }, + { + "epoch": 0.773060559427899, + "grad_norm": 0.8068770170211792, + "learning_rate": 1.054199027560463e-05, + "loss": 2.5629, + "step": 8540 + }, + { + "epoch": 0.7731510817416494, + "grad_norm": 0.7819279432296753, + "learning_rate": 1.052781294371834e-05, + "loss": 2.632, + "step": 8541 + }, + { + "epoch": 0.7732416040553997, + "grad_norm": 0.863209068775177, + "learning_rate": 1.051364462167881e-05, + "loss": 2.5913, + "step": 8542 + }, + { + "epoch": 0.7733321263691499, + "grad_norm": 0.8289285898208618, + "learning_rate": 1.0499485310912838e-05, + "loss": 2.5594, + "step": 8543 + }, + { + "epoch": 0.7734226486829003, + "grad_norm": 0.8970001935958862, + "learning_rate": 1.0485335012846242e-05, + "loss": 2.6596, + "step": 8544 + }, + { + "epoch": 0.7735131709966506, + "grad_norm": 0.8289165496826172, + "learning_rate": 1.047119372890395e-05, + "loss": 2.5846, + "step": 8545 + }, + { + "epoch": 0.773603693310401, + "grad_norm": 0.877833366394043, + "learning_rate": 1.045706146051001e-05, + "loss": 2.6525, + "step": 8546 + }, + { + "epoch": 0.7736942156241513, + "grad_norm": 0.8716258406639099, + "learning_rate": 1.0442938209087517e-05, + "loss": 2.5987, + "step": 8547 + }, + { + "epoch": 0.7737847379379017, + "grad_norm": 0.8302717804908752, + "learning_rate": 1.042882397605871e-05, + "loss": 2.5771, + "step": 8548 + }, + { + "epoch": 0.773875260251652, + "grad_norm": 0.8333648443222046, + "learning_rate": 1.0414718762844889e-05, + "loss": 2.62, + "step": 8549 + }, + { + "epoch": 0.7739657825654024, + "grad_norm": 0.8761650919914246, + "learning_rate": 1.0400622570866425e-05, + "loss": 2.6661, + "step": 8550 + }, + { + "epoch": 0.7740563048791527, + "grad_norm": 0.8990418910980225, + "learning_rate": 1.0386535401542808e-05, + "loss": 2.6772, + "step": 8551 + }, + { + "epoch": 0.7741468271929031, + "grad_norm": 0.85703045129776, + "learning_rate": 1.0372457256292634e-05, + "loss": 2.6688, + "step": 8552 + }, + { + "epoch": 0.7742373495066533, + "grad_norm": 0.8957107067108154, + "learning_rate": 1.0358388136533549e-05, + "loss": 2.5837, + "step": 8553 + }, + { + "epoch": 0.7743278718204037, + "grad_norm": 0.840560793876648, + "learning_rate": 1.034432804368235e-05, + "loss": 2.5957, + "step": 8554 + }, + { + "epoch": 0.774418394134154, + "grad_norm": 0.8776541948318481, + "learning_rate": 1.033027697915483e-05, + "loss": 2.6229, + "step": 8555 + }, + { + "epoch": 0.7745089164479044, + "grad_norm": 0.8002888560295105, + "learning_rate": 1.0316234944365999e-05, + "loss": 2.6228, + "step": 8556 + }, + { + "epoch": 0.7745994387616547, + "grad_norm": 0.896776020526886, + "learning_rate": 1.0302201940729838e-05, + "loss": 2.6482, + "step": 8557 + }, + { + "epoch": 0.7746899610754051, + "grad_norm": 0.8228409290313721, + "learning_rate": 1.0288177969659485e-05, + "loss": 2.6539, + "step": 8558 + }, + { + "epoch": 0.7747804833891554, + "grad_norm": 0.8731130361557007, + "learning_rate": 1.0274163032567163e-05, + "loss": 2.5874, + "step": 8559 + }, + { + "epoch": 0.7748710057029058, + "grad_norm": 0.8286214470863342, + "learning_rate": 1.026015713086418e-05, + "loss": 2.647, + "step": 8560 + }, + { + "epoch": 0.7749615280166561, + "grad_norm": 0.8495842814445496, + "learning_rate": 1.024616026596088e-05, + "loss": 2.624, + "step": 8561 + }, + { + "epoch": 0.7750520503304065, + "grad_norm": 0.9225957989692688, + "learning_rate": 1.023217243926683e-05, + "loss": 2.6714, + "step": 8562 + }, + { + "epoch": 0.7751425726441568, + "grad_norm": 0.807245135307312, + "learning_rate": 1.0218193652190533e-05, + "loss": 2.6377, + "step": 8563 + }, + { + "epoch": 0.7752330949579072, + "grad_norm": 0.8943997025489807, + "learning_rate": 1.0204223906139686e-05, + "loss": 2.5683, + "step": 8564 + }, + { + "epoch": 0.7753236172716574, + "grad_norm": 0.8158625960350037, + "learning_rate": 1.0190263202521033e-05, + "loss": 2.6212, + "step": 8565 + }, + { + "epoch": 0.7754141395854078, + "grad_norm": 0.8700400590896606, + "learning_rate": 1.0176311542740413e-05, + "loss": 2.6376, + "step": 8566 + }, + { + "epoch": 0.7755046618991581, + "grad_norm": 0.8914387226104736, + "learning_rate": 1.016236892820276e-05, + "loss": 2.6217, + "step": 8567 + }, + { + "epoch": 0.7755951842129085, + "grad_norm": 0.8653732538223267, + "learning_rate": 1.0148435360312125e-05, + "loss": 2.5857, + "step": 8568 + }, + { + "epoch": 0.7756857065266588, + "grad_norm": 0.8333502411842346, + "learning_rate": 1.0134510840471556e-05, + "loss": 2.5917, + "step": 8569 + }, + { + "epoch": 0.7757762288404092, + "grad_norm": 0.9215497970581055, + "learning_rate": 1.0120595370083318e-05, + "loss": 2.6399, + "step": 8570 + }, + { + "epoch": 0.7758667511541595, + "grad_norm": 0.8121035695075989, + "learning_rate": 1.0106688950548649e-05, + "loss": 2.5964, + "step": 8571 + }, + { + "epoch": 0.7759572734679099, + "grad_norm": 0.848892867565155, + "learning_rate": 1.0092791583267936e-05, + "loss": 2.6607, + "step": 8572 + }, + { + "epoch": 0.7760477957816602, + "grad_norm": 0.8766636848449707, + "learning_rate": 1.0078903269640661e-05, + "loss": 2.6587, + "step": 8573 + }, + { + "epoch": 0.7761383180954106, + "grad_norm": 0.8555773496627808, + "learning_rate": 1.006502401106536e-05, + "loss": 2.5938, + "step": 8574 + }, + { + "epoch": 0.7762288404091608, + "grad_norm": 0.819665253162384, + "learning_rate": 1.0051153808939685e-05, + "loss": 2.6605, + "step": 8575 + }, + { + "epoch": 0.7763193627229112, + "grad_norm": 0.797521710395813, + "learning_rate": 1.003729266466038e-05, + "loss": 2.5949, + "step": 8576 + }, + { + "epoch": 0.7764098850366615, + "grad_norm": 0.8572009801864624, + "learning_rate": 1.00234405796232e-05, + "loss": 2.6371, + "step": 8577 + }, + { + "epoch": 0.7765004073504119, + "grad_norm": 0.8093275427818298, + "learning_rate": 1.0009597555223128e-05, + "loss": 2.6094, + "step": 8578 + }, + { + "epoch": 0.7765909296641622, + "grad_norm": 0.9169244170188904, + "learning_rate": 9.995763592854102e-06, + "loss": 2.6016, + "step": 8579 + }, + { + "epoch": 0.7766814519779126, + "grad_norm": 0.9339399933815002, + "learning_rate": 9.98193869390922e-06, + "loss": 2.6263, + "step": 8580 + }, + { + "epoch": 0.7767719742916629, + "grad_norm": 0.8368563055992126, + "learning_rate": 9.968122859780648e-06, + "loss": 2.6526, + "step": 8581 + }, + { + "epoch": 0.7768624966054133, + "grad_norm": 0.9302917122840881, + "learning_rate": 9.95431609185965e-06, + "loss": 2.6532, + "step": 8582 + }, + { + "epoch": 0.7769530189191636, + "grad_norm": 0.8470931649208069, + "learning_rate": 9.940518391536547e-06, + "loss": 2.705, + "step": 8583 + }, + { + "epoch": 0.7770435412329139, + "grad_norm": 0.9310446977615356, + "learning_rate": 9.926729760200803e-06, + "loss": 2.6525, + "step": 8584 + }, + { + "epoch": 0.7771340635466643, + "grad_norm": 0.8245245218276978, + "learning_rate": 9.912950199240867e-06, + "loss": 2.5873, + "step": 8585 + }, + { + "epoch": 0.7772245858604145, + "grad_norm": 0.8588077425956726, + "learning_rate": 9.899179710044415e-06, + "loss": 2.6191, + "step": 8586 + }, + { + "epoch": 0.7773151081741649, + "grad_norm": 0.8226948380470276, + "learning_rate": 9.885418293998083e-06, + "loss": 2.5726, + "step": 8587 + }, + { + "epoch": 0.7774056304879152, + "grad_norm": 0.8949288725852966, + "learning_rate": 9.871665952487663e-06, + "loss": 2.587, + "step": 8588 + }, + { + "epoch": 0.7774961528016656, + "grad_norm": 0.8486374616622925, + "learning_rate": 9.857922686898024e-06, + "loss": 2.5791, + "step": 8589 + }, + { + "epoch": 0.7775866751154159, + "grad_norm": 0.8344716429710388, + "learning_rate": 9.844188498613116e-06, + "loss": 2.6232, + "step": 8590 + }, + { + "epoch": 0.7776771974291663, + "grad_norm": 0.8915365934371948, + "learning_rate": 9.830463389015921e-06, + "loss": 2.6402, + "step": 8591 + }, + { + "epoch": 0.7777677197429166, + "grad_norm": 0.863557755947113, + "learning_rate": 9.816747359488632e-06, + "loss": 2.6397, + "step": 8592 + }, + { + "epoch": 0.777858242056667, + "grad_norm": 0.8383275866508484, + "learning_rate": 9.8030404114124e-06, + "loss": 2.6956, + "step": 8593 + }, + { + "epoch": 0.7779487643704173, + "grad_norm": 0.8920031189918518, + "learning_rate": 9.789342546167535e-06, + "loss": 2.6436, + "step": 8594 + }, + { + "epoch": 0.7780392866841677, + "grad_norm": 0.8083052039146423, + "learning_rate": 9.775653765133396e-06, + "loss": 2.6028, + "step": 8595 + }, + { + "epoch": 0.778129808997918, + "grad_norm": 0.8552556037902832, + "learning_rate": 9.761974069688461e-06, + "loss": 2.6034, + "step": 8596 + }, + { + "epoch": 0.7782203313116683, + "grad_norm": 0.9225404262542725, + "learning_rate": 9.748303461210273e-06, + "loss": 2.5879, + "step": 8597 + }, + { + "epoch": 0.7783108536254186, + "grad_norm": 0.8443930149078369, + "learning_rate": 9.734641941075474e-06, + "loss": 2.613, + "step": 8598 + }, + { + "epoch": 0.778401375939169, + "grad_norm": 0.8860672116279602, + "learning_rate": 9.720989510659717e-06, + "loss": 2.6089, + "step": 8599 + }, + { + "epoch": 0.7784918982529193, + "grad_norm": 0.9268253445625305, + "learning_rate": 9.707346171337894e-06, + "loss": 2.6889, + "step": 8600 + }, + { + "epoch": 0.7785824205666697, + "grad_norm": 0.8402400016784668, + "learning_rate": 9.693711924483817e-06, + "loss": 2.6401, + "step": 8601 + }, + { + "epoch": 0.77867294288042, + "grad_norm": 0.865181028842926, + "learning_rate": 9.680086771470475e-06, + "loss": 2.6647, + "step": 8602 + }, + { + "epoch": 0.7787634651941704, + "grad_norm": 0.893332839012146, + "learning_rate": 9.666470713669918e-06, + "loss": 2.6299, + "step": 8603 + }, + { + "epoch": 0.7788539875079207, + "grad_norm": 0.7875241637229919, + "learning_rate": 9.6528637524533e-06, + "loss": 2.57, + "step": 8604 + }, + { + "epoch": 0.7789445098216711, + "grad_norm": 0.7910863161087036, + "learning_rate": 9.63926588919083e-06, + "loss": 2.6203, + "step": 8605 + }, + { + "epoch": 0.7790350321354214, + "grad_norm": 0.8562816381454468, + "learning_rate": 9.625677125251831e-06, + "loss": 2.6552, + "step": 8606 + }, + { + "epoch": 0.7791255544491718, + "grad_norm": 0.8345043659210205, + "learning_rate": 9.612097462004622e-06, + "loss": 2.6441, + "step": 8607 + }, + { + "epoch": 0.779216076762922, + "grad_norm": 0.786302924156189, + "learning_rate": 9.598526900816774e-06, + "loss": 2.6094, + "step": 8608 + }, + { + "epoch": 0.7793065990766724, + "grad_norm": 0.8758160471916199, + "learning_rate": 9.584965443054772e-06, + "loss": 2.5929, + "step": 8609 + }, + { + "epoch": 0.7793971213904227, + "grad_norm": 0.8279112577438354, + "learning_rate": 9.57141309008428e-06, + "loss": 2.6303, + "step": 8610 + }, + { + "epoch": 0.7794876437041731, + "grad_norm": 0.845248281955719, + "learning_rate": 9.557869843270018e-06, + "loss": 2.6165, + "step": 8611 + }, + { + "epoch": 0.7795781660179234, + "grad_norm": 0.8265160322189331, + "learning_rate": 9.544335703975783e-06, + "loss": 2.6468, + "step": 8612 + }, + { + "epoch": 0.7796686883316738, + "grad_norm": 0.8631579279899597, + "learning_rate": 9.530810673564461e-06, + "loss": 2.6689, + "step": 8613 + }, + { + "epoch": 0.7797592106454241, + "grad_norm": 0.7934325933456421, + "learning_rate": 9.517294753398064e-06, + "loss": 2.6113, + "step": 8614 + }, + { + "epoch": 0.7798497329591745, + "grad_norm": 0.8415437340736389, + "learning_rate": 9.503787944837561e-06, + "loss": 2.6356, + "step": 8615 + }, + { + "epoch": 0.7799402552729248, + "grad_norm": 0.861663281917572, + "learning_rate": 9.490290249243172e-06, + "loss": 2.6316, + "step": 8616 + }, + { + "epoch": 0.7800307775866752, + "grad_norm": 0.7900444865226746, + "learning_rate": 9.476801667974067e-06, + "loss": 2.5699, + "step": 8617 + }, + { + "epoch": 0.7801212999004254, + "grad_norm": 0.8477349281311035, + "learning_rate": 9.463322202388546e-06, + "loss": 2.6082, + "step": 8618 + }, + { + "epoch": 0.7802118222141758, + "grad_norm": 0.8869505524635315, + "learning_rate": 9.449851853844005e-06, + "loss": 2.6699, + "step": 8619 + }, + { + "epoch": 0.7803023445279261, + "grad_norm": 0.8460425138473511, + "learning_rate": 9.436390623696911e-06, + "loss": 2.5868, + "step": 8620 + }, + { + "epoch": 0.7803928668416765, + "grad_norm": 0.8156192302703857, + "learning_rate": 9.422938513302804e-06, + "loss": 2.5906, + "step": 8621 + }, + { + "epoch": 0.7804833891554268, + "grad_norm": 0.9294385313987732, + "learning_rate": 9.409495524016321e-06, + "loss": 2.6262, + "step": 8622 + }, + { + "epoch": 0.7805739114691772, + "grad_norm": 0.8695874810218811, + "learning_rate": 9.396061657191135e-06, + "loss": 2.6308, + "step": 8623 + }, + { + "epoch": 0.7806644337829275, + "grad_norm": 0.8546766042709351, + "learning_rate": 9.382636914180087e-06, + "loss": 2.6711, + "step": 8624 + }, + { + "epoch": 0.7807549560966778, + "grad_norm": 0.8264554738998413, + "learning_rate": 9.369221296335006e-06, + "loss": 2.5733, + "step": 8625 + }, + { + "epoch": 0.7808454784104282, + "grad_norm": 0.8439817428588867, + "learning_rate": 9.355814805006858e-06, + "loss": 2.5912, + "step": 8626 + }, + { + "epoch": 0.7809360007241785, + "grad_norm": 0.8114175796508789, + "learning_rate": 9.342417441545692e-06, + "loss": 2.6588, + "step": 8627 + }, + { + "epoch": 0.7810265230379289, + "grad_norm": 0.8043602705001831, + "learning_rate": 9.32902920730061e-06, + "loss": 2.617, + "step": 8628 + }, + { + "epoch": 0.7811170453516791, + "grad_norm": 0.8502074480056763, + "learning_rate": 9.315650103619778e-06, + "loss": 2.646, + "step": 8629 + }, + { + "epoch": 0.7812075676654295, + "grad_norm": 0.9189439415931702, + "learning_rate": 9.302280131850539e-06, + "loss": 2.5986, + "step": 8630 + }, + { + "epoch": 0.7812980899791798, + "grad_norm": 0.9692003726959229, + "learning_rate": 9.288919293339182e-06, + "loss": 2.5921, + "step": 8631 + }, + { + "epoch": 0.7813886122929302, + "grad_norm": 0.86298668384552, + "learning_rate": 9.275567589431178e-06, + "loss": 2.6563, + "step": 8632 + }, + { + "epoch": 0.7814791346066805, + "grad_norm": 0.8018830418586731, + "learning_rate": 9.262225021471038e-06, + "loss": 2.6627, + "step": 8633 + }, + { + "epoch": 0.7815696569204309, + "grad_norm": 0.8384698033332825, + "learning_rate": 9.248891590802344e-06, + "loss": 2.5806, + "step": 8634 + }, + { + "epoch": 0.7816601792341812, + "grad_norm": 0.8785240650177002, + "learning_rate": 9.23556729876781e-06, + "loss": 2.7003, + "step": 8635 + }, + { + "epoch": 0.7817507015479316, + "grad_norm": 0.8848211765289307, + "learning_rate": 9.222252146709142e-06, + "loss": 2.7151, + "step": 8636 + }, + { + "epoch": 0.7818412238616819, + "grad_norm": 0.8771471381187439, + "learning_rate": 9.208946135967211e-06, + "loss": 2.6156, + "step": 8637 + }, + { + "epoch": 0.7819317461754323, + "grad_norm": 0.8841015100479126, + "learning_rate": 9.195649267881911e-06, + "loss": 2.6746, + "step": 8638 + }, + { + "epoch": 0.7820222684891825, + "grad_norm": 0.822411835193634, + "learning_rate": 9.182361543792239e-06, + "loss": 2.625, + "step": 8639 + }, + { + "epoch": 0.782112790802933, + "grad_norm": 0.8606818318367004, + "learning_rate": 9.16908296503628e-06, + "loss": 2.5483, + "step": 8640 + }, + { + "epoch": 0.7822033131166832, + "grad_norm": 0.8249856233596802, + "learning_rate": 9.155813532951208e-06, + "loss": 2.6611, + "step": 8641 + }, + { + "epoch": 0.7822938354304336, + "grad_norm": 0.8641714453697205, + "learning_rate": 9.142553248873175e-06, + "loss": 2.5893, + "step": 8642 + }, + { + "epoch": 0.7823843577441839, + "grad_norm": 0.8504626154899597, + "learning_rate": 9.129302114137594e-06, + "loss": 2.6912, + "step": 8643 + }, + { + "epoch": 0.7824748800579343, + "grad_norm": 0.8407044410705566, + "learning_rate": 9.116060130078775e-06, + "loss": 2.6577, + "step": 8644 + }, + { + "epoch": 0.7825654023716846, + "grad_norm": 0.8601648211479187, + "learning_rate": 9.102827298030226e-06, + "loss": 2.6203, + "step": 8645 + }, + { + "epoch": 0.782655924685435, + "grad_norm": 0.8728191256523132, + "learning_rate": 9.089603619324472e-06, + "loss": 2.6375, + "step": 8646 + }, + { + "epoch": 0.7827464469991853, + "grad_norm": 0.8072683215141296, + "learning_rate": 9.076389095293148e-06, + "loss": 2.6768, + "step": 8647 + }, + { + "epoch": 0.7828369693129357, + "grad_norm": 0.8349645137786865, + "learning_rate": 9.063183727266956e-06, + "loss": 2.6109, + "step": 8648 + }, + { + "epoch": 0.782927491626686, + "grad_norm": 0.903693675994873, + "learning_rate": 9.049987516575687e-06, + "loss": 2.6401, + "step": 8649 + }, + { + "epoch": 0.7830180139404364, + "grad_norm": 0.9026902914047241, + "learning_rate": 9.036800464548157e-06, + "loss": 2.5926, + "step": 8650 + }, + { + "epoch": 0.7831085362541866, + "grad_norm": 0.8069325685501099, + "learning_rate": 9.023622572512369e-06, + "loss": 2.617, + "step": 8651 + }, + { + "epoch": 0.783199058567937, + "grad_norm": 0.8999775648117065, + "learning_rate": 9.010453841795274e-06, + "loss": 2.5992, + "step": 8652 + }, + { + "epoch": 0.7832895808816873, + "grad_norm": 0.8389754891395569, + "learning_rate": 8.99729427372299e-06, + "loss": 2.6275, + "step": 8653 + }, + { + "epoch": 0.7833801031954377, + "grad_norm": 0.8419077396392822, + "learning_rate": 8.984143869620687e-06, + "loss": 2.6742, + "step": 8654 + }, + { + "epoch": 0.783470625509188, + "grad_norm": 0.8308020830154419, + "learning_rate": 8.971002630812619e-06, + "loss": 2.5836, + "step": 8655 + }, + { + "epoch": 0.7835611478229384, + "grad_norm": 0.8611428737640381, + "learning_rate": 8.957870558622061e-06, + "loss": 2.6362, + "step": 8656 + }, + { + "epoch": 0.7836516701366887, + "grad_norm": 0.8711309432983398, + "learning_rate": 8.944747654371478e-06, + "loss": 2.6592, + "step": 8657 + }, + { + "epoch": 0.7837421924504391, + "grad_norm": 0.8325426578521729, + "learning_rate": 8.931633919382298e-06, + "loss": 2.6889, + "step": 8658 + }, + { + "epoch": 0.7838327147641894, + "grad_norm": 0.8667068481445312, + "learning_rate": 8.918529354975091e-06, + "loss": 2.5751, + "step": 8659 + }, + { + "epoch": 0.7839232370779398, + "grad_norm": 0.8186068534851074, + "learning_rate": 8.905433962469489e-06, + "loss": 2.6027, + "step": 8660 + }, + { + "epoch": 0.78401375939169, + "grad_norm": 0.8946523070335388, + "learning_rate": 8.892347743184192e-06, + "loss": 2.6405, + "step": 8661 + }, + { + "epoch": 0.7841042817054404, + "grad_norm": 0.7675448060035706, + "learning_rate": 8.879270698436993e-06, + "loss": 2.6046, + "step": 8662 + }, + { + "epoch": 0.7841948040191907, + "grad_norm": 0.841367781162262, + "learning_rate": 8.866202829544745e-06, + "loss": 2.5573, + "step": 8663 + }, + { + "epoch": 0.7842853263329411, + "grad_norm": 0.939943253993988, + "learning_rate": 8.853144137823355e-06, + "loss": 2.697, + "step": 8664 + }, + { + "epoch": 0.7843758486466914, + "grad_norm": 0.8685693144798279, + "learning_rate": 8.840094624587891e-06, + "loss": 2.6366, + "step": 8665 + }, + { + "epoch": 0.7844663709604417, + "grad_norm": 0.8320167064666748, + "learning_rate": 8.827054291152381e-06, + "loss": 2.5981, + "step": 8666 + }, + { + "epoch": 0.7845568932741921, + "grad_norm": 0.8161466717720032, + "learning_rate": 8.814023138830008e-06, + "loss": 2.5679, + "step": 8667 + }, + { + "epoch": 0.7846474155879424, + "grad_norm": 0.8341578841209412, + "learning_rate": 8.80100116893301e-06, + "loss": 2.5952, + "step": 8668 + }, + { + "epoch": 0.7847379379016928, + "grad_norm": 0.8965204358100891, + "learning_rate": 8.787988382772705e-06, + "loss": 2.5905, + "step": 8669 + }, + { + "epoch": 0.7848284602154431, + "grad_norm": 0.8038343191146851, + "learning_rate": 8.774984781659467e-06, + "loss": 2.5923, + "step": 8670 + }, + { + "epoch": 0.7849189825291935, + "grad_norm": 0.8061921000480652, + "learning_rate": 8.761990366902783e-06, + "loss": 2.6416, + "step": 8671 + }, + { + "epoch": 0.7850095048429437, + "grad_norm": 0.9051434993743896, + "learning_rate": 8.749005139811139e-06, + "loss": 2.6485, + "step": 8672 + }, + { + "epoch": 0.7851000271566941, + "grad_norm": 0.8930591344833374, + "learning_rate": 8.736029101692223e-06, + "loss": 2.612, + "step": 8673 + }, + { + "epoch": 0.7851905494704444, + "grad_norm": 0.9820253849029541, + "learning_rate": 8.723062253852654e-06, + "loss": 2.6247, + "step": 8674 + }, + { + "epoch": 0.7852810717841948, + "grad_norm": 0.8085618615150452, + "learning_rate": 8.710104597598223e-06, + "loss": 2.5768, + "step": 8675 + }, + { + "epoch": 0.7853715940979451, + "grad_norm": 0.8512108325958252, + "learning_rate": 8.697156134233763e-06, + "loss": 2.6114, + "step": 8676 + }, + { + "epoch": 0.7854621164116955, + "grad_norm": 0.9089962244033813, + "learning_rate": 8.684216865063178e-06, + "loss": 2.6616, + "step": 8677 + }, + { + "epoch": 0.7855526387254458, + "grad_norm": 0.8274604082107544, + "learning_rate": 8.671286791389455e-06, + "loss": 2.6086, + "step": 8678 + }, + { + "epoch": 0.7856431610391962, + "grad_norm": 0.872837483882904, + "learning_rate": 8.658365914514665e-06, + "loss": 2.5432, + "step": 8679 + }, + { + "epoch": 0.7857336833529465, + "grad_norm": 0.7837662696838379, + "learning_rate": 8.645454235739903e-06, + "loss": 2.6133, + "step": 8680 + }, + { + "epoch": 0.7858242056666969, + "grad_norm": 0.8648548126220703, + "learning_rate": 8.632551756365436e-06, + "loss": 2.632, + "step": 8681 + }, + { + "epoch": 0.7859147279804471, + "grad_norm": 0.8463937640190125, + "learning_rate": 8.619658477690485e-06, + "loss": 2.6317, + "step": 8682 + }, + { + "epoch": 0.7860052502941975, + "grad_norm": 0.8059602975845337, + "learning_rate": 8.606774401013418e-06, + "loss": 2.6053, + "step": 8683 + }, + { + "epoch": 0.7860957726079478, + "grad_norm": 0.8308817744255066, + "learning_rate": 8.593899527631676e-06, + "loss": 2.6569, + "step": 8684 + }, + { + "epoch": 0.7861862949216982, + "grad_norm": 0.7859178185462952, + "learning_rate": 8.581033858841769e-06, + "loss": 2.6233, + "step": 8685 + }, + { + "epoch": 0.7862768172354485, + "grad_norm": 0.7964602708816528, + "learning_rate": 8.568177395939215e-06, + "loss": 2.6029, + "step": 8686 + }, + { + "epoch": 0.7863673395491989, + "grad_norm": 0.8171819448471069, + "learning_rate": 8.555330140218732e-06, + "loss": 2.6731, + "step": 8687 + }, + { + "epoch": 0.7864578618629492, + "grad_norm": 0.8279446959495544, + "learning_rate": 8.542492092973987e-06, + "loss": 2.5903, + "step": 8688 + }, + { + "epoch": 0.7865483841766996, + "grad_norm": 0.7885525822639465, + "learning_rate": 8.529663255497778e-06, + "loss": 2.6323, + "step": 8689 + }, + { + "epoch": 0.7866389064904499, + "grad_norm": 0.8916425704956055, + "learning_rate": 8.516843629081984e-06, + "loss": 2.586, + "step": 8690 + }, + { + "epoch": 0.7867294288042003, + "grad_norm": 0.8177568316459656, + "learning_rate": 8.504033215017527e-06, + "loss": 2.6363, + "step": 8691 + }, + { + "epoch": 0.7868199511179506, + "grad_norm": 0.8604661226272583, + "learning_rate": 8.491232014594418e-06, + "loss": 2.6184, + "step": 8692 + }, + { + "epoch": 0.786910473431701, + "grad_norm": 0.784534215927124, + "learning_rate": 8.478440029101764e-06, + "loss": 2.567, + "step": 8693 + }, + { + "epoch": 0.7870009957454512, + "grad_norm": 0.8089743852615356, + "learning_rate": 8.465657259827663e-06, + "loss": 2.608, + "step": 8694 + }, + { + "epoch": 0.7870915180592016, + "grad_norm": 0.8101866245269775, + "learning_rate": 8.4528837080594e-06, + "loss": 2.6037, + "step": 8695 + }, + { + "epoch": 0.7871820403729519, + "grad_norm": 0.8513261675834656, + "learning_rate": 8.440119375083232e-06, + "loss": 2.6164, + "step": 8696 + }, + { + "epoch": 0.7872725626867023, + "grad_norm": 0.8743961453437805, + "learning_rate": 8.427364262184534e-06, + "loss": 2.6142, + "step": 8697 + }, + { + "epoch": 0.7873630850004526, + "grad_norm": 0.8597943186759949, + "learning_rate": 8.414618370647753e-06, + "loss": 2.6462, + "step": 8698 + }, + { + "epoch": 0.787453607314203, + "grad_norm": 0.795477569103241, + "learning_rate": 8.401881701756386e-06, + "loss": 2.5958, + "step": 8699 + }, + { + "epoch": 0.7875441296279533, + "grad_norm": 0.8336818218231201, + "learning_rate": 8.38915425679304e-06, + "loss": 2.613, + "step": 8700 + }, + { + "epoch": 0.7876346519417037, + "grad_norm": 0.8857749104499817, + "learning_rate": 8.376436037039358e-06, + "loss": 2.6521, + "step": 8701 + }, + { + "epoch": 0.787725174255454, + "grad_norm": 0.81220942735672, + "learning_rate": 8.363727043776038e-06, + "loss": 2.6256, + "step": 8702 + }, + { + "epoch": 0.7878156965692044, + "grad_norm": 0.8130623698234558, + "learning_rate": 8.35102727828292e-06, + "loss": 2.5793, + "step": 8703 + }, + { + "epoch": 0.7879062188829546, + "grad_norm": 0.8743788599967957, + "learning_rate": 8.338336741838838e-06, + "loss": 2.6113, + "step": 8704 + }, + { + "epoch": 0.787996741196705, + "grad_norm": 0.8230921626091003, + "learning_rate": 8.325655435721735e-06, + "loss": 2.6658, + "step": 8705 + }, + { + "epoch": 0.7880872635104553, + "grad_norm": 0.8547943830490112, + "learning_rate": 8.312983361208615e-06, + "loss": 2.6389, + "step": 8706 + }, + { + "epoch": 0.7881777858242056, + "grad_norm": 0.8661034107208252, + "learning_rate": 8.300320519575567e-06, + "loss": 2.6049, + "step": 8707 + }, + { + "epoch": 0.788268308137956, + "grad_norm": 0.8751810193061829, + "learning_rate": 8.287666912097736e-06, + "loss": 2.6397, + "step": 8708 + }, + { + "epoch": 0.7883588304517063, + "grad_norm": 0.8370258212089539, + "learning_rate": 8.27502254004936e-06, + "loss": 2.6364, + "step": 8709 + }, + { + "epoch": 0.7884493527654567, + "grad_norm": 0.8904893398284912, + "learning_rate": 8.262387404703653e-06, + "loss": 2.634, + "step": 8710 + }, + { + "epoch": 0.788539875079207, + "grad_norm": 0.8433648943901062, + "learning_rate": 8.249761507333076e-06, + "loss": 2.6143, + "step": 8711 + }, + { + "epoch": 0.7886303973929574, + "grad_norm": 0.8583589792251587, + "learning_rate": 8.237144849208978e-06, + "loss": 2.5688, + "step": 8712 + }, + { + "epoch": 0.7887209197067077, + "grad_norm": 0.8374467492103577, + "learning_rate": 8.224537431601886e-06, + "loss": 2.6051, + "step": 8713 + }, + { + "epoch": 0.788811442020458, + "grad_norm": 0.9252846240997314, + "learning_rate": 8.211939255781365e-06, + "loss": 2.6603, + "step": 8714 + }, + { + "epoch": 0.7889019643342083, + "grad_norm": 0.8124175071716309, + "learning_rate": 8.199350323016041e-06, + "loss": 2.6242, + "step": 8715 + }, + { + "epoch": 0.7889924866479587, + "grad_norm": 0.8528426885604858, + "learning_rate": 8.186770634573637e-06, + "loss": 2.6286, + "step": 8716 + }, + { + "epoch": 0.789083008961709, + "grad_norm": 0.8937164545059204, + "learning_rate": 8.174200191720938e-06, + "loss": 2.5274, + "step": 8717 + }, + { + "epoch": 0.7891735312754594, + "grad_norm": 0.8798384666442871, + "learning_rate": 8.161638995723742e-06, + "loss": 2.6026, + "step": 8718 + }, + { + "epoch": 0.7892640535892097, + "grad_norm": 0.9604594111442566, + "learning_rate": 8.149087047846981e-06, + "loss": 2.6316, + "step": 8719 + }, + { + "epoch": 0.7893545759029601, + "grad_norm": 0.8138752579689026, + "learning_rate": 8.13654434935467e-06, + "loss": 2.5715, + "step": 8720 + }, + { + "epoch": 0.7894450982167104, + "grad_norm": 0.8585542440414429, + "learning_rate": 8.124010901509793e-06, + "loss": 2.6234, + "step": 8721 + }, + { + "epoch": 0.7895356205304608, + "grad_norm": 0.8834009766578674, + "learning_rate": 8.111486705574534e-06, + "loss": 2.6046, + "step": 8722 + }, + { + "epoch": 0.7896261428442111, + "grad_norm": 0.940254807472229, + "learning_rate": 8.098971762810038e-06, + "loss": 2.6024, + "step": 8723 + }, + { + "epoch": 0.7897166651579615, + "grad_norm": 0.8321104645729065, + "learning_rate": 8.086466074476563e-06, + "loss": 2.6294, + "step": 8724 + }, + { + "epoch": 0.7898071874717117, + "grad_norm": 0.82854163646698, + "learning_rate": 8.073969641833445e-06, + "loss": 2.6374, + "step": 8725 + }, + { + "epoch": 0.7898977097854621, + "grad_norm": 0.8564167618751526, + "learning_rate": 8.06148246613907e-06, + "loss": 2.5884, + "step": 8726 + }, + { + "epoch": 0.7899882320992124, + "grad_norm": 0.7891023755073547, + "learning_rate": 8.049004548650895e-06, + "loss": 2.6236, + "step": 8727 + }, + { + "epoch": 0.7900787544129628, + "grad_norm": 0.8628188967704773, + "learning_rate": 8.036535890625463e-06, + "loss": 2.5974, + "step": 8728 + }, + { + "epoch": 0.7901692767267131, + "grad_norm": 0.9000905156135559, + "learning_rate": 8.024076493318312e-06, + "loss": 2.6091, + "step": 8729 + }, + { + "epoch": 0.7902597990404635, + "grad_norm": 0.8640795946121216, + "learning_rate": 8.011626357984181e-06, + "loss": 2.6086, + "step": 8730 + }, + { + "epoch": 0.7903503213542138, + "grad_norm": 0.8639902472496033, + "learning_rate": 7.999185485876737e-06, + "loss": 2.6104, + "step": 8731 + }, + { + "epoch": 0.7904408436679642, + "grad_norm": 0.8826909065246582, + "learning_rate": 7.986753878248799e-06, + "loss": 2.5812, + "step": 8732 + }, + { + "epoch": 0.7905313659817145, + "grad_norm": 0.8098333477973938, + "learning_rate": 7.97433153635222e-06, + "loss": 2.6244, + "step": 8733 + }, + { + "epoch": 0.7906218882954649, + "grad_norm": 0.8974426984786987, + "learning_rate": 7.961918461437946e-06, + "loss": 2.6268, + "step": 8734 + }, + { + "epoch": 0.7907124106092152, + "grad_norm": 0.8217973709106445, + "learning_rate": 7.949514654755962e-06, + "loss": 2.6167, + "step": 8735 + }, + { + "epoch": 0.7908029329229656, + "grad_norm": 0.8261401057243347, + "learning_rate": 7.937120117555363e-06, + "loss": 2.6662, + "step": 8736 + }, + { + "epoch": 0.7908934552367158, + "grad_norm": 0.8707113265991211, + "learning_rate": 7.924734851084203e-06, + "loss": 2.5881, + "step": 8737 + }, + { + "epoch": 0.7909839775504662, + "grad_norm": 0.836617648601532, + "learning_rate": 7.912358856589751e-06, + "loss": 2.6031, + "step": 8738 + }, + { + "epoch": 0.7910744998642165, + "grad_norm": 0.8788150548934937, + "learning_rate": 7.899992135318235e-06, + "loss": 2.7125, + "step": 8739 + }, + { + "epoch": 0.7911650221779669, + "grad_norm": 0.9116982817649841, + "learning_rate": 7.887634688515e-06, + "loss": 2.549, + "step": 8740 + }, + { + "epoch": 0.7912555444917172, + "grad_norm": 0.9610945582389832, + "learning_rate": 7.875286517424418e-06, + "loss": 2.6044, + "step": 8741 + }, + { + "epoch": 0.7913460668054676, + "grad_norm": 0.8556106686592102, + "learning_rate": 7.862947623289962e-06, + "loss": 2.6286, + "step": 8742 + }, + { + "epoch": 0.7914365891192179, + "grad_norm": 0.8233311772346497, + "learning_rate": 7.850618007354172e-06, + "loss": 2.622, + "step": 8743 + }, + { + "epoch": 0.7915271114329683, + "grad_norm": 0.8949629068374634, + "learning_rate": 7.83829767085863e-06, + "loss": 2.5996, + "step": 8744 + }, + { + "epoch": 0.7916176337467186, + "grad_norm": 0.8206738233566284, + "learning_rate": 7.825986615043968e-06, + "loss": 2.5791, + "step": 8745 + }, + { + "epoch": 0.791708156060469, + "grad_norm": 0.8221138119697571, + "learning_rate": 7.81368484114996e-06, + "loss": 2.6101, + "step": 8746 + }, + { + "epoch": 0.7917986783742192, + "grad_norm": 0.838756799697876, + "learning_rate": 7.80139235041535e-06, + "loss": 2.6361, + "step": 8747 + }, + { + "epoch": 0.7918892006879695, + "grad_norm": 0.8324940800666809, + "learning_rate": 7.789109144078e-06, + "loss": 2.5672, + "step": 8748 + }, + { + "epoch": 0.7919797230017199, + "grad_norm": 0.8779529333114624, + "learning_rate": 7.776835223374845e-06, + "loss": 2.6992, + "step": 8749 + }, + { + "epoch": 0.7920702453154702, + "grad_norm": 0.8620442152023315, + "learning_rate": 7.764570589541875e-06, + "loss": 2.6591, + "step": 8750 + }, + { + "epoch": 0.7921607676292206, + "grad_norm": 0.883399486541748, + "learning_rate": 7.75231524381409e-06, + "loss": 2.6385, + "step": 8751 + }, + { + "epoch": 0.7922512899429709, + "grad_norm": 0.874740719795227, + "learning_rate": 7.74006918742567e-06, + "loss": 2.6073, + "step": 8752 + }, + { + "epoch": 0.7923418122567213, + "grad_norm": 0.8114060759544373, + "learning_rate": 7.727832421609738e-06, + "loss": 2.5893, + "step": 8753 + }, + { + "epoch": 0.7924323345704716, + "grad_norm": 0.7919555306434631, + "learning_rate": 7.715604947598564e-06, + "loss": 2.5594, + "step": 8754 + }, + { + "epoch": 0.792522856884222, + "grad_norm": 0.831693708896637, + "learning_rate": 7.703386766623444e-06, + "loss": 2.6075, + "step": 8755 + }, + { + "epoch": 0.7926133791979723, + "grad_norm": 0.803679347038269, + "learning_rate": 7.691177879914747e-06, + "loss": 2.635, + "step": 8756 + }, + { + "epoch": 0.7927039015117227, + "grad_norm": 0.8092373609542847, + "learning_rate": 7.67897828870191e-06, + "loss": 2.565, + "step": 8757 + }, + { + "epoch": 0.7927944238254729, + "grad_norm": 0.8561666011810303, + "learning_rate": 7.666787994213453e-06, + "loss": 2.6256, + "step": 8758 + }, + { + "epoch": 0.7928849461392233, + "grad_norm": 0.8929632306098938, + "learning_rate": 7.654606997676884e-06, + "loss": 2.6277, + "step": 8759 + }, + { + "epoch": 0.7929754684529736, + "grad_norm": 0.8307953476905823, + "learning_rate": 7.642435300318907e-06, + "loss": 2.6541, + "step": 8760 + }, + { + "epoch": 0.793065990766724, + "grad_norm": 0.8383509516716003, + "learning_rate": 7.630272903365131e-06, + "loss": 2.5554, + "step": 8761 + }, + { + "epoch": 0.7931565130804743, + "grad_norm": 0.8645136952400208, + "learning_rate": 7.618119808040369e-06, + "loss": 2.6302, + "step": 8762 + }, + { + "epoch": 0.7932470353942247, + "grad_norm": 0.9244964122772217, + "learning_rate": 7.605976015568394e-06, + "loss": 2.6323, + "step": 8763 + }, + { + "epoch": 0.793337557707975, + "grad_norm": 0.8274905681610107, + "learning_rate": 7.593841527172118e-06, + "loss": 2.6271, + "step": 8764 + }, + { + "epoch": 0.7934280800217254, + "grad_norm": 0.8871122598648071, + "learning_rate": 7.581716344073475e-06, + "loss": 2.5937, + "step": 8765 + }, + { + "epoch": 0.7935186023354757, + "grad_norm": 0.8496264219284058, + "learning_rate": 7.569600467493476e-06, + "loss": 2.5983, + "step": 8766 + }, + { + "epoch": 0.7936091246492261, + "grad_norm": 0.8048467636108398, + "learning_rate": 7.557493898652157e-06, + "loss": 2.6305, + "step": 8767 + }, + { + "epoch": 0.7936996469629763, + "grad_norm": 0.8628296256065369, + "learning_rate": 7.545396638768698e-06, + "loss": 2.5695, + "step": 8768 + }, + { + "epoch": 0.7937901692767267, + "grad_norm": 0.9485136866569519, + "learning_rate": 7.533308689061247e-06, + "loss": 2.6363, + "step": 8769 + }, + { + "epoch": 0.793880691590477, + "grad_norm": 0.8617843985557556, + "learning_rate": 7.521230050747086e-06, + "loss": 2.6322, + "step": 8770 + }, + { + "epoch": 0.7939712139042274, + "grad_norm": 0.8456613421440125, + "learning_rate": 7.50916072504253e-06, + "loss": 2.5722, + "step": 8771 + }, + { + "epoch": 0.7940617362179777, + "grad_norm": 0.8934623003005981, + "learning_rate": 7.497100713162963e-06, + "loss": 2.6712, + "step": 8772 + }, + { + "epoch": 0.7941522585317281, + "grad_norm": 0.8487347960472107, + "learning_rate": 7.485050016322814e-06, + "loss": 2.6064, + "step": 8773 + }, + { + "epoch": 0.7942427808454784, + "grad_norm": 0.8945321440696716, + "learning_rate": 7.473008635735623e-06, + "loss": 2.6911, + "step": 8774 + }, + { + "epoch": 0.7943333031592288, + "grad_norm": 0.9113092422485352, + "learning_rate": 7.460976572613887e-06, + "loss": 2.6164, + "step": 8775 + }, + { + "epoch": 0.7944238254729791, + "grad_norm": 0.840405285358429, + "learning_rate": 7.448953828169314e-06, + "loss": 2.6195, + "step": 8776 + }, + { + "epoch": 0.7945143477867295, + "grad_norm": 0.8640242218971252, + "learning_rate": 7.436940403612547e-06, + "loss": 2.5575, + "step": 8777 + }, + { + "epoch": 0.7946048701004798, + "grad_norm": 0.8765424489974976, + "learning_rate": 7.42493630015334e-06, + "loss": 2.6034, + "step": 8778 + }, + { + "epoch": 0.7946953924142302, + "grad_norm": 0.8792320489883423, + "learning_rate": 7.412941519000527e-06, + "loss": 2.6519, + "step": 8779 + }, + { + "epoch": 0.7947859147279804, + "grad_norm": 0.8563889265060425, + "learning_rate": 7.400956061361974e-06, + "loss": 2.6077, + "step": 8780 + }, + { + "epoch": 0.7948764370417308, + "grad_norm": 0.9053653478622437, + "learning_rate": 7.388979928444584e-06, + "loss": 2.6558, + "step": 8781 + }, + { + "epoch": 0.7949669593554811, + "grad_norm": 0.8070489764213562, + "learning_rate": 7.377013121454412e-06, + "loss": 2.6156, + "step": 8782 + }, + { + "epoch": 0.7950574816692315, + "grad_norm": 0.8767279386520386, + "learning_rate": 7.365055641596474e-06, + "loss": 2.6594, + "step": 8783 + }, + { + "epoch": 0.7951480039829818, + "grad_norm": 0.8439067602157593, + "learning_rate": 7.353107490074895e-06, + "loss": 2.7274, + "step": 8784 + }, + { + "epoch": 0.7952385262967322, + "grad_norm": 0.8305009007453918, + "learning_rate": 7.341168668092857e-06, + "loss": 2.6025, + "step": 8785 + }, + { + "epoch": 0.7953290486104825, + "grad_norm": 0.8626893162727356, + "learning_rate": 7.329239176852598e-06, + "loss": 2.64, + "step": 8786 + }, + { + "epoch": 0.7954195709242329, + "grad_norm": 0.8474772572517395, + "learning_rate": 7.317319017555413e-06, + "loss": 2.636, + "step": 8787 + }, + { + "epoch": 0.7955100932379832, + "grad_norm": 0.9051727652549744, + "learning_rate": 7.305408191401697e-06, + "loss": 2.6179, + "step": 8788 + }, + { + "epoch": 0.7956006155517334, + "grad_norm": 0.8000675439834595, + "learning_rate": 7.293506699590791e-06, + "loss": 2.6536, + "step": 8789 + }, + { + "epoch": 0.7956911378654838, + "grad_norm": 0.8184769749641418, + "learning_rate": 7.281614543321269e-06, + "loss": 2.6171, + "step": 8790 + }, + { + "epoch": 0.7957816601792341, + "grad_norm": 0.8048648834228516, + "learning_rate": 7.269731723790607e-06, + "loss": 2.646, + "step": 8791 + }, + { + "epoch": 0.7958721824929845, + "grad_norm": 0.850342333316803, + "learning_rate": 7.257858242195414e-06, + "loss": 2.5731, + "step": 8792 + }, + { + "epoch": 0.7959627048067348, + "grad_norm": 0.8059483170509338, + "learning_rate": 7.245994099731368e-06, + "loss": 2.631, + "step": 8793 + }, + { + "epoch": 0.7960532271204852, + "grad_norm": 0.9179108738899231, + "learning_rate": 7.2341392975931785e-06, + "loss": 2.6504, + "step": 8794 + }, + { + "epoch": 0.7961437494342355, + "grad_norm": 0.8052546381950378, + "learning_rate": 7.222293836974614e-06, + "loss": 2.6354, + "step": 8795 + }, + { + "epoch": 0.7962342717479859, + "grad_norm": 0.8943376541137695, + "learning_rate": 7.210457719068542e-06, + "loss": 2.6342, + "step": 8796 + }, + { + "epoch": 0.7963247940617362, + "grad_norm": 0.9096117615699768, + "learning_rate": 7.198630945066809e-06, + "loss": 2.6045, + "step": 8797 + }, + { + "epoch": 0.7964153163754866, + "grad_norm": 0.8474493026733398, + "learning_rate": 7.18681351616044e-06, + "loss": 2.5857, + "step": 8798 + }, + { + "epoch": 0.7965058386892369, + "grad_norm": 0.9806895852088928, + "learning_rate": 7.175005433539383e-06, + "loss": 2.6353, + "step": 8799 + }, + { + "epoch": 0.7965963610029873, + "grad_norm": 0.879974365234375, + "learning_rate": 7.163206698392744e-06, + "loss": 2.6562, + "step": 8800 + }, + { + "epoch": 0.7966868833167375, + "grad_norm": 0.8434916734695435, + "learning_rate": 7.151417311908648e-06, + "loss": 2.6339, + "step": 8801 + }, + { + "epoch": 0.7967774056304879, + "grad_norm": 0.8419424295425415, + "learning_rate": 7.139637275274291e-06, + "loss": 2.6013, + "step": 8802 + }, + { + "epoch": 0.7968679279442382, + "grad_norm": 0.8428929448127747, + "learning_rate": 7.1278665896759135e-06, + "loss": 2.6455, + "step": 8803 + }, + { + "epoch": 0.7969584502579886, + "grad_norm": 0.8516901731491089, + "learning_rate": 7.116105256298844e-06, + "loss": 2.6677, + "step": 8804 + }, + { + "epoch": 0.7970489725717389, + "grad_norm": 0.8902682662010193, + "learning_rate": 7.104353276327413e-06, + "loss": 2.669, + "step": 8805 + }, + { + "epoch": 0.7971394948854893, + "grad_norm": 0.8300895094871521, + "learning_rate": 7.092610650945086e-06, + "loss": 2.6027, + "step": 8806 + }, + { + "epoch": 0.7972300171992396, + "grad_norm": 0.8032069206237793, + "learning_rate": 7.0808773813343165e-06, + "loss": 2.5721, + "step": 8807 + }, + { + "epoch": 0.79732053951299, + "grad_norm": 0.8614211082458496, + "learning_rate": 7.069153468676648e-06, + "loss": 2.5475, + "step": 8808 + }, + { + "epoch": 0.7974110618267403, + "grad_norm": 0.893402099609375, + "learning_rate": 7.057438914152703e-06, + "loss": 2.6342, + "step": 8809 + }, + { + "epoch": 0.7975015841404907, + "grad_norm": 0.8314564228057861, + "learning_rate": 7.045733718942094e-06, + "loss": 2.5609, + "step": 8810 + }, + { + "epoch": 0.797592106454241, + "grad_norm": 0.9000319242477417, + "learning_rate": 7.0340378842235656e-06, + "loss": 2.6909, + "step": 8811 + }, + { + "epoch": 0.7976826287679913, + "grad_norm": 0.8975379467010498, + "learning_rate": 7.022351411174866e-06, + "loss": 2.6104, + "step": 8812 + }, + { + "epoch": 0.7977731510817416, + "grad_norm": 0.8559728264808655, + "learning_rate": 7.010674300972842e-06, + "loss": 2.6397, + "step": 8813 + }, + { + "epoch": 0.797863673395492, + "grad_norm": 0.8419605493545532, + "learning_rate": 6.999006554793374e-06, + "loss": 2.589, + "step": 8814 + }, + { + "epoch": 0.7979541957092423, + "grad_norm": 0.8505333662033081, + "learning_rate": 6.9873481738114145e-06, + "loss": 2.5767, + "step": 8815 + }, + { + "epoch": 0.7980447180229927, + "grad_norm": 0.7990020513534546, + "learning_rate": 6.97569915920091e-06, + "loss": 2.6203, + "step": 8816 + }, + { + "epoch": 0.798135240336743, + "grad_norm": 0.8316020965576172, + "learning_rate": 6.964059512135001e-06, + "loss": 2.6493, + "step": 8817 + }, + { + "epoch": 0.7982257626504934, + "grad_norm": 0.9669241309165955, + "learning_rate": 6.952429233785729e-06, + "loss": 2.6498, + "step": 8818 + }, + { + "epoch": 0.7983162849642437, + "grad_norm": 0.8680517673492432, + "learning_rate": 6.940808325324288e-06, + "loss": 2.6225, + "step": 8819 + }, + { + "epoch": 0.7984068072779941, + "grad_norm": 0.8824862837791443, + "learning_rate": 6.929196787920899e-06, + "loss": 2.676, + "step": 8820 + }, + { + "epoch": 0.7984973295917444, + "grad_norm": 0.900398850440979, + "learning_rate": 6.917594622744849e-06, + "loss": 2.6275, + "step": 8821 + }, + { + "epoch": 0.7985878519054947, + "grad_norm": 0.8329018354415894, + "learning_rate": 6.906001830964481e-06, + "loss": 2.659, + "step": 8822 + }, + { + "epoch": 0.798678374219245, + "grad_norm": 0.8776055574417114, + "learning_rate": 6.894418413747183e-06, + "loss": 2.623, + "step": 8823 + }, + { + "epoch": 0.7987688965329954, + "grad_norm": 0.7747718095779419, + "learning_rate": 6.8828443722593784e-06, + "loss": 2.5839, + "step": 8824 + }, + { + "epoch": 0.7988594188467457, + "grad_norm": 0.8642785549163818, + "learning_rate": 6.8712797076666335e-06, + "loss": 2.6411, + "step": 8825 + }, + { + "epoch": 0.7989499411604961, + "grad_norm": 0.962983250617981, + "learning_rate": 6.859724421133462e-06, + "loss": 2.6914, + "step": 8826 + }, + { + "epoch": 0.7990404634742464, + "grad_norm": 0.831626832485199, + "learning_rate": 6.848178513823478e-06, + "loss": 2.5524, + "step": 8827 + }, + { + "epoch": 0.7991309857879968, + "grad_norm": 0.9346736073493958, + "learning_rate": 6.836641986899383e-06, + "loss": 2.5418, + "step": 8828 + }, + { + "epoch": 0.7992215081017471, + "grad_norm": 0.8697760701179504, + "learning_rate": 6.825114841522884e-06, + "loss": 2.663, + "step": 8829 + }, + { + "epoch": 0.7993120304154974, + "grad_norm": 0.9198794364929199, + "learning_rate": 6.813597078854772e-06, + "loss": 2.5732, + "step": 8830 + }, + { + "epoch": 0.7994025527292478, + "grad_norm": 0.7944701313972473, + "learning_rate": 6.8020887000548985e-06, + "loss": 2.5865, + "step": 8831 + }, + { + "epoch": 0.799493075042998, + "grad_norm": 0.7876954078674316, + "learning_rate": 6.790589706282113e-06, + "loss": 2.6271, + "step": 8832 + }, + { + "epoch": 0.7995835973567484, + "grad_norm": 0.8922056555747986, + "learning_rate": 6.779100098694424e-06, + "loss": 2.6208, + "step": 8833 + }, + { + "epoch": 0.7996741196704987, + "grad_norm": 0.8670765161514282, + "learning_rate": 6.767619878448783e-06, + "loss": 2.5881, + "step": 8834 + }, + { + "epoch": 0.7997646419842491, + "grad_norm": 0.8833421468734741, + "learning_rate": 6.756149046701277e-06, + "loss": 2.6605, + "step": 8835 + }, + { + "epoch": 0.7998551642979994, + "grad_norm": 0.8624097108840942, + "learning_rate": 6.744687604607014e-06, + "loss": 2.5801, + "step": 8836 + }, + { + "epoch": 0.7999456866117498, + "grad_norm": 0.8142250776290894, + "learning_rate": 6.7332355533201495e-06, + "loss": 2.6091, + "step": 8837 + }, + { + "epoch": 0.8000362089255001, + "grad_norm": 0.835433304309845, + "learning_rate": 6.721792893993917e-06, + "loss": 2.6392, + "step": 8838 + }, + { + "epoch": 0.8001267312392505, + "grad_norm": 0.8677368760108948, + "learning_rate": 6.710359627780605e-06, + "loss": 2.6255, + "step": 8839 + }, + { + "epoch": 0.8002172535530008, + "grad_norm": 1.044707179069519, + "learning_rate": 6.698935755831492e-06, + "loss": 2.621, + "step": 8840 + }, + { + "epoch": 0.8003077758667512, + "grad_norm": 0.8238329291343689, + "learning_rate": 6.687521279297026e-06, + "loss": 2.5222, + "step": 8841 + }, + { + "epoch": 0.8003982981805015, + "grad_norm": 0.7824071049690247, + "learning_rate": 6.6761161993265985e-06, + "loss": 2.6161, + "step": 8842 + }, + { + "epoch": 0.8004888204942519, + "grad_norm": 0.8748955726623535, + "learning_rate": 6.664720517068723e-06, + "loss": 2.5776, + "step": 8843 + }, + { + "epoch": 0.8005793428080021, + "grad_norm": 0.8659281730651855, + "learning_rate": 6.653334233670927e-06, + "loss": 2.5945, + "step": 8844 + }, + { + "epoch": 0.8006698651217525, + "grad_norm": 0.8498925566673279, + "learning_rate": 6.6419573502798374e-06, + "loss": 2.6343, + "step": 8845 + }, + { + "epoch": 0.8007603874355028, + "grad_norm": 0.809684157371521, + "learning_rate": 6.63058986804106e-06, + "loss": 2.5778, + "step": 8846 + }, + { + "epoch": 0.8008509097492532, + "grad_norm": 0.8012062907218933, + "learning_rate": 6.619231788099356e-06, + "loss": 2.6529, + "step": 8847 + }, + { + "epoch": 0.8009414320630035, + "grad_norm": 0.8674015998840332, + "learning_rate": 6.607883111598445e-06, + "loss": 2.594, + "step": 8848 + }, + { + "epoch": 0.8010319543767539, + "grad_norm": 0.8957664966583252, + "learning_rate": 6.596543839681146e-06, + "loss": 2.6222, + "step": 8849 + }, + { + "epoch": 0.8011224766905042, + "grad_norm": 0.8477498292922974, + "learning_rate": 6.585213973489335e-06, + "loss": 2.6276, + "step": 8850 + }, + { + "epoch": 0.8012129990042546, + "grad_norm": 0.8417879939079285, + "learning_rate": 6.573893514163909e-06, + "loss": 2.6097, + "step": 8851 + }, + { + "epoch": 0.8013035213180049, + "grad_norm": 0.8202842473983765, + "learning_rate": 6.5625824628448575e-06, + "loss": 2.6, + "step": 8852 + }, + { + "epoch": 0.8013940436317553, + "grad_norm": 0.8281559944152832, + "learning_rate": 6.551280820671201e-06, + "loss": 2.5725, + "step": 8853 + }, + { + "epoch": 0.8014845659455055, + "grad_norm": 1.8357462882995605, + "learning_rate": 6.539988588780988e-06, + "loss": 2.8031, + "step": 8854 + }, + { + "epoch": 0.8015750882592559, + "grad_norm": 0.9159720540046692, + "learning_rate": 6.528705768311394e-06, + "loss": 2.6284, + "step": 8855 + }, + { + "epoch": 0.8016656105730062, + "grad_norm": 0.8227183222770691, + "learning_rate": 6.517432360398556e-06, + "loss": 2.6235, + "step": 8856 + }, + { + "epoch": 0.8017561328867566, + "grad_norm": 0.828522801399231, + "learning_rate": 6.506168366177723e-06, + "loss": 2.5962, + "step": 8857 + }, + { + "epoch": 0.8018466552005069, + "grad_norm": 0.836940586566925, + "learning_rate": 6.494913786783185e-06, + "loss": 2.6592, + "step": 8858 + }, + { + "epoch": 0.8019371775142573, + "grad_norm": 0.9068986177444458, + "learning_rate": 6.48366862334826e-06, + "loss": 2.6665, + "step": 8859 + }, + { + "epoch": 0.8020276998280076, + "grad_norm": 0.8480276465415955, + "learning_rate": 6.472432877005341e-06, + "loss": 2.6148, + "step": 8860 + }, + { + "epoch": 0.802118222141758, + "grad_norm": 0.8470519185066223, + "learning_rate": 6.4612065488859004e-06, + "loss": 2.6465, + "step": 8861 + }, + { + "epoch": 0.8022087444555083, + "grad_norm": 0.853524923324585, + "learning_rate": 6.449989640120368e-06, + "loss": 2.6106, + "step": 8862 + }, + { + "epoch": 0.8022992667692587, + "grad_norm": 0.8823791146278381, + "learning_rate": 6.438782151838341e-06, + "loss": 2.5735, + "step": 8863 + }, + { + "epoch": 0.802389789083009, + "grad_norm": 0.8680602312088013, + "learning_rate": 6.427584085168381e-06, + "loss": 2.6031, + "step": 8864 + }, + { + "epoch": 0.8024803113967593, + "grad_norm": 0.820585310459137, + "learning_rate": 6.416395441238143e-06, + "loss": 2.6024, + "step": 8865 + }, + { + "epoch": 0.8025708337105096, + "grad_norm": 0.8056566715240479, + "learning_rate": 6.405216221174326e-06, + "loss": 2.613, + "step": 8866 + }, + { + "epoch": 0.80266135602426, + "grad_norm": 0.8219767808914185, + "learning_rate": 6.394046426102674e-06, + "loss": 2.6172, + "step": 8867 + }, + { + "epoch": 0.8027518783380103, + "grad_norm": 0.8536273241043091, + "learning_rate": 6.382886057147985e-06, + "loss": 2.6725, + "step": 8868 + }, + { + "epoch": 0.8028424006517607, + "grad_norm": 0.8024052381515503, + "learning_rate": 6.371735115434119e-06, + "loss": 2.6119, + "step": 8869 + }, + { + "epoch": 0.802932922965511, + "grad_norm": 0.8551531434059143, + "learning_rate": 6.360593602083942e-06, + "loss": 2.6366, + "step": 8870 + }, + { + "epoch": 0.8030234452792613, + "grad_norm": 0.8735834956169128, + "learning_rate": 6.349461518219446e-06, + "loss": 2.6373, + "step": 8871 + }, + { + "epoch": 0.8031139675930117, + "grad_norm": 0.8546817302703857, + "learning_rate": 6.338338864961612e-06, + "loss": 2.525, + "step": 8872 + }, + { + "epoch": 0.803204489906762, + "grad_norm": 0.839828372001648, + "learning_rate": 6.327225643430479e-06, + "loss": 2.572, + "step": 8873 + }, + { + "epoch": 0.8032950122205124, + "grad_norm": 0.7989261746406555, + "learning_rate": 6.3161218547451716e-06, + "loss": 2.574, + "step": 8874 + }, + { + "epoch": 0.8033855345342626, + "grad_norm": 0.7898634672164917, + "learning_rate": 6.3050275000238414e-06, + "loss": 2.5904, + "step": 8875 + }, + { + "epoch": 0.803476056848013, + "grad_norm": 0.87517911195755, + "learning_rate": 6.293942580383649e-06, + "loss": 2.6511, + "step": 8876 + }, + { + "epoch": 0.8035665791617633, + "grad_norm": 0.8695262670516968, + "learning_rate": 6.282867096940903e-06, + "loss": 2.5497, + "step": 8877 + }, + { + "epoch": 0.8036571014755137, + "grad_norm": 0.8134534358978271, + "learning_rate": 6.2718010508108545e-06, + "loss": 2.6483, + "step": 8878 + }, + { + "epoch": 0.803747623789264, + "grad_norm": 0.8465479016304016, + "learning_rate": 6.26074444310788e-06, + "loss": 2.6658, + "step": 8879 + }, + { + "epoch": 0.8038381461030144, + "grad_norm": 0.8220434188842773, + "learning_rate": 6.2496972749453766e-06, + "loss": 2.6187, + "step": 8880 + }, + { + "epoch": 0.8039286684167647, + "grad_norm": 0.8323163986206055, + "learning_rate": 6.238659547435799e-06, + "loss": 2.669, + "step": 8881 + }, + { + "epoch": 0.8040191907305151, + "grad_norm": 0.851899266242981, + "learning_rate": 6.227631261690636e-06, + "loss": 2.6631, + "step": 8882 + }, + { + "epoch": 0.8041097130442654, + "grad_norm": 0.8882586359977722, + "learning_rate": 6.2166124188204444e-06, + "loss": 2.6435, + "step": 8883 + }, + { + "epoch": 0.8042002353580158, + "grad_norm": 0.8469085693359375, + "learning_rate": 6.205603019934791e-06, + "loss": 2.5918, + "step": 8884 + }, + { + "epoch": 0.804290757671766, + "grad_norm": 0.7756091356277466, + "learning_rate": 6.194603066142379e-06, + "loss": 2.5813, + "step": 8885 + }, + { + "epoch": 0.8043812799855165, + "grad_norm": 0.8727147579193115, + "learning_rate": 6.183612558550867e-06, + "loss": 2.6397, + "step": 8886 + }, + { + "epoch": 0.8044718022992667, + "grad_norm": 0.810825526714325, + "learning_rate": 6.1726314982669896e-06, + "loss": 2.5986, + "step": 8887 + }, + { + "epoch": 0.8045623246130171, + "grad_norm": 0.8421832323074341, + "learning_rate": 6.161659886396554e-06, + "loss": 2.626, + "step": 8888 + }, + { + "epoch": 0.8046528469267674, + "grad_norm": 0.8276161551475525, + "learning_rate": 6.1506977240444074e-06, + "loss": 2.6342, + "step": 8889 + }, + { + "epoch": 0.8047433692405178, + "grad_norm": 0.8567266464233398, + "learning_rate": 6.139745012314424e-06, + "loss": 2.653, + "step": 8890 + }, + { + "epoch": 0.8048338915542681, + "grad_norm": 0.8193711638450623, + "learning_rate": 6.128801752309565e-06, + "loss": 2.5809, + "step": 8891 + }, + { + "epoch": 0.8049244138680185, + "grad_norm": 0.8752370476722717, + "learning_rate": 6.1178679451317945e-06, + "loss": 2.6181, + "step": 8892 + }, + { + "epoch": 0.8050149361817688, + "grad_norm": 0.9479153752326965, + "learning_rate": 6.106943591882153e-06, + "loss": 2.7413, + "step": 8893 + }, + { + "epoch": 0.8051054584955192, + "grad_norm": 0.8616109490394592, + "learning_rate": 6.096028693660716e-06, + "loss": 2.5879, + "step": 8894 + }, + { + "epoch": 0.8051959808092695, + "grad_norm": 0.8333169221878052, + "learning_rate": 6.085123251566616e-06, + "loss": 2.5674, + "step": 8895 + }, + { + "epoch": 0.8052865031230199, + "grad_norm": 0.8511015176773071, + "learning_rate": 6.0742272666980625e-06, + "loss": 2.6387, + "step": 8896 + }, + { + "epoch": 0.8053770254367701, + "grad_norm": 0.778052568435669, + "learning_rate": 6.063340740152213e-06, + "loss": 2.6394, + "step": 8897 + }, + { + "epoch": 0.8054675477505205, + "grad_norm": 0.8331275582313538, + "learning_rate": 6.05246367302541e-06, + "loss": 2.6225, + "step": 8898 + }, + { + "epoch": 0.8055580700642708, + "grad_norm": 0.8217616081237793, + "learning_rate": 6.041596066412936e-06, + "loss": 2.6483, + "step": 8899 + }, + { + "epoch": 0.8056485923780212, + "grad_norm": 0.8371033668518066, + "learning_rate": 6.030737921409169e-06, + "loss": 2.5995, + "step": 8900 + }, + { + "epoch": 0.8057391146917715, + "grad_norm": 0.770965039730072, + "learning_rate": 6.019889239107512e-06, + "loss": 2.638, + "step": 8901 + }, + { + "epoch": 0.8058296370055219, + "grad_norm": 0.8548250198364258, + "learning_rate": 6.009050020600459e-06, + "loss": 2.6621, + "step": 8902 + }, + { + "epoch": 0.8059201593192722, + "grad_norm": 0.8284469842910767, + "learning_rate": 5.998220266979471e-06, + "loss": 2.6562, + "step": 8903 + }, + { + "epoch": 0.8060106816330226, + "grad_norm": 0.8728049397468567, + "learning_rate": 5.987399979335151e-06, + "loss": 2.6278, + "step": 8904 + }, + { + "epoch": 0.8061012039467729, + "grad_norm": 0.8274415731430054, + "learning_rate": 5.976589158757073e-06, + "loss": 2.6045, + "step": 8905 + }, + { + "epoch": 0.8061917262605233, + "grad_norm": 0.9814711809158325, + "learning_rate": 5.965787806333889e-06, + "loss": 2.6218, + "step": 8906 + }, + { + "epoch": 0.8062822485742736, + "grad_norm": 0.882437527179718, + "learning_rate": 5.954995923153306e-06, + "loss": 2.6554, + "step": 8907 + }, + { + "epoch": 0.806372770888024, + "grad_norm": 0.9071118235588074, + "learning_rate": 5.944213510302054e-06, + "loss": 2.5828, + "step": 8908 + }, + { + "epoch": 0.8064632932017742, + "grad_norm": 0.8062431812286377, + "learning_rate": 5.933440568865933e-06, + "loss": 2.6099, + "step": 8909 + }, + { + "epoch": 0.8065538155155246, + "grad_norm": 0.8858148455619812, + "learning_rate": 5.922677099929786e-06, + "loss": 2.5753, + "step": 8910 + }, + { + "epoch": 0.8066443378292749, + "grad_norm": 0.8028451800346375, + "learning_rate": 5.911923104577455e-06, + "loss": 2.5819, + "step": 8911 + }, + { + "epoch": 0.8067348601430252, + "grad_norm": 0.843224048614502, + "learning_rate": 5.901178583891909e-06, + "loss": 2.6249, + "step": 8912 + }, + { + "epoch": 0.8068253824567756, + "grad_norm": 0.8451189994812012, + "learning_rate": 5.890443538955104e-06, + "loss": 2.6, + "step": 8913 + }, + { + "epoch": 0.8069159047705259, + "grad_norm": 0.8016216158866882, + "learning_rate": 5.879717970848053e-06, + "loss": 2.6323, + "step": 8914 + }, + { + "epoch": 0.8070064270842763, + "grad_norm": 0.8591460585594177, + "learning_rate": 5.869001880650826e-06, + "loss": 2.6226, + "step": 8915 + }, + { + "epoch": 0.8070969493980266, + "grad_norm": 0.8800707459449768, + "learning_rate": 5.8582952694425374e-06, + "loss": 2.6134, + "step": 8916 + }, + { + "epoch": 0.807187471711777, + "grad_norm": 0.8347895741462708, + "learning_rate": 5.847598138301347e-06, + "loss": 2.6334, + "step": 8917 + }, + { + "epoch": 0.8072779940255272, + "grad_norm": 0.845406711101532, + "learning_rate": 5.836910488304448e-06, + "loss": 2.6321, + "step": 8918 + }, + { + "epoch": 0.8073685163392776, + "grad_norm": 0.8826831579208374, + "learning_rate": 5.826232320528069e-06, + "loss": 2.6547, + "step": 8919 + }, + { + "epoch": 0.8074590386530279, + "grad_norm": 0.8191585540771484, + "learning_rate": 5.8155636360475385e-06, + "loss": 2.5802, + "step": 8920 + }, + { + "epoch": 0.8075495609667783, + "grad_norm": 0.8471720218658447, + "learning_rate": 5.804904435937164e-06, + "loss": 2.6293, + "step": 8921 + }, + { + "epoch": 0.8076400832805286, + "grad_norm": 0.84287029504776, + "learning_rate": 5.7942547212703315e-06, + "loss": 2.6357, + "step": 8922 + }, + { + "epoch": 0.807730605594279, + "grad_norm": 0.8456634283065796, + "learning_rate": 5.783614493119471e-06, + "loss": 2.5488, + "step": 8923 + }, + { + "epoch": 0.8078211279080293, + "grad_norm": 0.8603562116622925, + "learning_rate": 5.77298375255606e-06, + "loss": 2.6228, + "step": 8924 + }, + { + "epoch": 0.8079116502217797, + "grad_norm": 0.8196355104446411, + "learning_rate": 5.7623625006505975e-06, + "loss": 2.6893, + "step": 8925 + }, + { + "epoch": 0.80800217253553, + "grad_norm": 0.8516672253608704, + "learning_rate": 5.751750738472672e-06, + "loss": 2.6273, + "step": 8926 + }, + { + "epoch": 0.8080926948492804, + "grad_norm": 0.8056585788726807, + "learning_rate": 5.74114846709084e-06, + "loss": 2.6184, + "step": 8927 + }, + { + "epoch": 0.8081832171630307, + "grad_norm": 0.9525690078735352, + "learning_rate": 5.730555687572803e-06, + "loss": 2.5773, + "step": 8928 + }, + { + "epoch": 0.808273739476781, + "grad_norm": 0.8798109889030457, + "learning_rate": 5.719972400985219e-06, + "loss": 2.6305, + "step": 8929 + }, + { + "epoch": 0.8083642617905313, + "grad_norm": 0.9028758406639099, + "learning_rate": 5.709398608393835e-06, + "loss": 2.6212, + "step": 8930 + }, + { + "epoch": 0.8084547841042817, + "grad_norm": 0.8423622250556946, + "learning_rate": 5.698834310863432e-06, + "loss": 2.6162, + "step": 8931 + }, + { + "epoch": 0.808545306418032, + "grad_norm": 0.8932363390922546, + "learning_rate": 5.688279509457828e-06, + "loss": 2.6524, + "step": 8932 + }, + { + "epoch": 0.8086358287317824, + "grad_norm": 0.787690281867981, + "learning_rate": 5.6777342052399045e-06, + "loss": 2.5985, + "step": 8933 + }, + { + "epoch": 0.8087263510455327, + "grad_norm": 0.8639971613883972, + "learning_rate": 5.667198399271567e-06, + "loss": 2.6242, + "step": 8934 + }, + { + "epoch": 0.8088168733592831, + "grad_norm": 0.7899857759475708, + "learning_rate": 5.656672092613757e-06, + "loss": 2.6458, + "step": 8935 + }, + { + "epoch": 0.8089073956730334, + "grad_norm": 0.8420873880386353, + "learning_rate": 5.646155286326504e-06, + "loss": 2.624, + "step": 8936 + }, + { + "epoch": 0.8089979179867838, + "grad_norm": 0.8478801250457764, + "learning_rate": 5.635647981468817e-06, + "loss": 2.6155, + "step": 8937 + }, + { + "epoch": 0.8090884403005341, + "grad_norm": 0.845515787601471, + "learning_rate": 5.625150179098804e-06, + "loss": 2.649, + "step": 8938 + }, + { + "epoch": 0.8091789626142845, + "grad_norm": 0.8689040541648865, + "learning_rate": 5.614661880273575e-06, + "loss": 2.6855, + "step": 8939 + }, + { + "epoch": 0.8092694849280347, + "grad_norm": 0.857174277305603, + "learning_rate": 5.604183086049342e-06, + "loss": 2.5821, + "step": 8940 + }, + { + "epoch": 0.8093600072417851, + "grad_norm": 0.7921069264411926, + "learning_rate": 5.593713797481248e-06, + "loss": 2.5377, + "step": 8941 + }, + { + "epoch": 0.8094505295555354, + "grad_norm": 0.8913686275482178, + "learning_rate": 5.583254015623618e-06, + "loss": 2.6787, + "step": 8942 + }, + { + "epoch": 0.8095410518692858, + "grad_norm": 0.8169851303100586, + "learning_rate": 5.572803741529719e-06, + "loss": 2.5624, + "step": 8943 + }, + { + "epoch": 0.8096315741830361, + "grad_norm": 0.8717596530914307, + "learning_rate": 5.562362976251901e-06, + "loss": 2.5997, + "step": 8944 + }, + { + "epoch": 0.8097220964967865, + "grad_norm": 0.8305097222328186, + "learning_rate": 5.551931720841541e-06, + "loss": 2.6424, + "step": 8945 + }, + { + "epoch": 0.8098126188105368, + "grad_norm": 0.8397768139839172, + "learning_rate": 5.54150997634908e-06, + "loss": 2.6499, + "step": 8946 + }, + { + "epoch": 0.8099031411242872, + "grad_norm": 0.8684827089309692, + "learning_rate": 5.531097743823965e-06, + "loss": 2.5971, + "step": 8947 + }, + { + "epoch": 0.8099936634380375, + "grad_norm": 0.8356045484542847, + "learning_rate": 5.520695024314748e-06, + "loss": 2.6024, + "step": 8948 + }, + { + "epoch": 0.8100841857517879, + "grad_norm": 0.8520987629890442, + "learning_rate": 5.510301818868935e-06, + "loss": 2.6148, + "step": 8949 + }, + { + "epoch": 0.8101747080655382, + "grad_norm": 0.8510938882827759, + "learning_rate": 5.499918128533155e-06, + "loss": 2.654, + "step": 8950 + }, + { + "epoch": 0.8102652303792885, + "grad_norm": 0.8644112944602966, + "learning_rate": 5.489543954353027e-06, + "loss": 2.5872, + "step": 8951 + }, + { + "epoch": 0.8103557526930388, + "grad_norm": 0.7893914580345154, + "learning_rate": 5.479179297373238e-06, + "loss": 2.6502, + "step": 8952 + }, + { + "epoch": 0.8104462750067891, + "grad_norm": 0.8841590881347656, + "learning_rate": 5.468824158637509e-06, + "loss": 2.6004, + "step": 8953 + }, + { + "epoch": 0.8105367973205395, + "grad_norm": 0.8805847764015198, + "learning_rate": 5.458478539188605e-06, + "loss": 2.5836, + "step": 8954 + }, + { + "epoch": 0.8106273196342898, + "grad_norm": 0.8868323564529419, + "learning_rate": 5.448142440068316e-06, + "loss": 2.6562, + "step": 8955 + }, + { + "epoch": 0.8107178419480402, + "grad_norm": 0.8586719632148743, + "learning_rate": 5.437815862317519e-06, + "loss": 2.6409, + "step": 8956 + }, + { + "epoch": 0.8108083642617905, + "grad_norm": 0.8426293134689331, + "learning_rate": 5.427498806976039e-06, + "loss": 2.584, + "step": 8957 + }, + { + "epoch": 0.8108988865755409, + "grad_norm": 0.8411415219306946, + "learning_rate": 5.417191275082878e-06, + "loss": 2.6113, + "step": 8958 + }, + { + "epoch": 0.8109894088892912, + "grad_norm": 0.8480225801467896, + "learning_rate": 5.40689326767595e-06, + "loss": 2.5914, + "step": 8959 + }, + { + "epoch": 0.8110799312030416, + "grad_norm": 0.8543084263801575, + "learning_rate": 5.396604785792281e-06, + "loss": 2.6394, + "step": 8960 + }, + { + "epoch": 0.8111704535167918, + "grad_norm": 0.8379021883010864, + "learning_rate": 5.38632583046792e-06, + "loss": 2.6218, + "step": 8961 + }, + { + "epoch": 0.8112609758305422, + "grad_norm": 0.8481666445732117, + "learning_rate": 5.3760564027379615e-06, + "loss": 2.614, + "step": 8962 + }, + { + "epoch": 0.8113514981442925, + "grad_norm": 0.8670530319213867, + "learning_rate": 5.3657965036365335e-06, + "loss": 2.6602, + "step": 8963 + }, + { + "epoch": 0.8114420204580429, + "grad_norm": 0.8096442818641663, + "learning_rate": 5.3555461341968205e-06, + "loss": 2.6109, + "step": 8964 + }, + { + "epoch": 0.8115325427717932, + "grad_norm": 0.8703358173370361, + "learning_rate": 5.345305295450997e-06, + "loss": 2.6763, + "step": 8965 + }, + { + "epoch": 0.8116230650855436, + "grad_norm": 0.8103675842285156, + "learning_rate": 5.335073988430372e-06, + "loss": 2.5879, + "step": 8966 + }, + { + "epoch": 0.8117135873992939, + "grad_norm": 0.8090990781784058, + "learning_rate": 5.324852214165188e-06, + "loss": 2.5768, + "step": 8967 + }, + { + "epoch": 0.8118041097130443, + "grad_norm": 0.8572064638137817, + "learning_rate": 5.314639973684787e-06, + "loss": 2.5946, + "step": 8968 + }, + { + "epoch": 0.8118946320267946, + "grad_norm": 0.8513726592063904, + "learning_rate": 5.3044372680175595e-06, + "loss": 2.5761, + "step": 8969 + }, + { + "epoch": 0.811985154340545, + "grad_norm": 0.8478163480758667, + "learning_rate": 5.294244098190926e-06, + "loss": 2.6156, + "step": 8970 + }, + { + "epoch": 0.8120756766542953, + "grad_norm": 0.9168113470077515, + "learning_rate": 5.284060465231288e-06, + "loss": 2.6562, + "step": 8971 + }, + { + "epoch": 0.8121661989680456, + "grad_norm": 0.8734987378120422, + "learning_rate": 5.273886370164194e-06, + "loss": 2.6128, + "step": 8972 + }, + { + "epoch": 0.8122567212817959, + "grad_norm": 0.8242517709732056, + "learning_rate": 5.2637218140141444e-06, + "loss": 2.5921, + "step": 8973 + }, + { + "epoch": 0.8123472435955463, + "grad_norm": 0.8394028544425964, + "learning_rate": 5.25356679780471e-06, + "loss": 2.5752, + "step": 8974 + }, + { + "epoch": 0.8124377659092966, + "grad_norm": 0.8377310037612915, + "learning_rate": 5.243421322558506e-06, + "loss": 2.6753, + "step": 8975 + }, + { + "epoch": 0.812528288223047, + "grad_norm": 0.8019624352455139, + "learning_rate": 5.233285389297182e-06, + "loss": 2.5654, + "step": 8976 + }, + { + "epoch": 0.8126188105367973, + "grad_norm": 0.8036606311798096, + "learning_rate": 5.223158999041444e-06, + "loss": 2.5713, + "step": 8977 + }, + { + "epoch": 0.8127093328505477, + "grad_norm": 0.8323128819465637, + "learning_rate": 5.213042152810988e-06, + "loss": 2.6211, + "step": 8978 + }, + { + "epoch": 0.812799855164298, + "grad_norm": 0.8095520734786987, + "learning_rate": 5.202934851624586e-06, + "loss": 2.5408, + "step": 8979 + }, + { + "epoch": 0.8128903774780484, + "grad_norm": 0.8063791394233704, + "learning_rate": 5.192837096500058e-06, + "loss": 2.6181, + "step": 8980 + }, + { + "epoch": 0.8129808997917987, + "grad_norm": 0.8308850526809692, + "learning_rate": 5.1827488884542365e-06, + "loss": 2.5909, + "step": 8981 + }, + { + "epoch": 0.8130714221055491, + "grad_norm": 0.804786741733551, + "learning_rate": 5.172670228503007e-06, + "loss": 2.6033, + "step": 8982 + }, + { + "epoch": 0.8131619444192993, + "grad_norm": 0.8006044030189514, + "learning_rate": 5.162601117661314e-06, + "loss": 2.5865, + "step": 8983 + }, + { + "epoch": 0.8132524667330497, + "grad_norm": 0.8728293180465698, + "learning_rate": 5.152541556943058e-06, + "loss": 2.6456, + "step": 8984 + }, + { + "epoch": 0.8133429890468, + "grad_norm": 0.8906390070915222, + "learning_rate": 5.1424915473612945e-06, + "loss": 2.5856, + "step": 8985 + }, + { + "epoch": 0.8134335113605504, + "grad_norm": 0.8838881254196167, + "learning_rate": 5.132451089928026e-06, + "loss": 2.652, + "step": 8986 + }, + { + "epoch": 0.8135240336743007, + "grad_norm": 0.7686001062393188, + "learning_rate": 5.1224201856543416e-06, + "loss": 2.5519, + "step": 8987 + }, + { + "epoch": 0.8136145559880511, + "grad_norm": 0.8629806637763977, + "learning_rate": 5.1123988355503475e-06, + "loss": 2.6648, + "step": 8988 + }, + { + "epoch": 0.8137050783018014, + "grad_norm": 0.841788649559021, + "learning_rate": 5.1023870406252e-06, + "loss": 2.6663, + "step": 8989 + }, + { + "epoch": 0.8137956006155518, + "grad_norm": 0.8157702684402466, + "learning_rate": 5.092384801887074e-06, + "loss": 2.6437, + "step": 8990 + }, + { + "epoch": 0.8138861229293021, + "grad_norm": 0.8443762063980103, + "learning_rate": 5.082392120343215e-06, + "loss": 2.5729, + "step": 8991 + }, + { + "epoch": 0.8139766452430525, + "grad_norm": 0.8237950801849365, + "learning_rate": 5.072408996999844e-06, + "loss": 2.6317, + "step": 8992 + }, + { + "epoch": 0.8140671675568028, + "grad_norm": 0.8749138712882996, + "learning_rate": 5.0624354328623225e-06, + "loss": 2.6091, + "step": 8993 + }, + { + "epoch": 0.814157689870553, + "grad_norm": 0.8353228569030762, + "learning_rate": 5.052471428934935e-06, + "loss": 2.6076, + "step": 8994 + }, + { + "epoch": 0.8142482121843034, + "grad_norm": 0.8347374796867371, + "learning_rate": 5.04251698622108e-06, + "loss": 2.611, + "step": 8995 + }, + { + "epoch": 0.8143387344980537, + "grad_norm": 0.8166795969009399, + "learning_rate": 5.032572105723165e-06, + "loss": 2.6171, + "step": 8996 + }, + { + "epoch": 0.8144292568118041, + "grad_norm": 0.846067488193512, + "learning_rate": 5.022636788442659e-06, + "loss": 2.6073, + "step": 8997 + }, + { + "epoch": 0.8145197791255544, + "grad_norm": 0.8584829568862915, + "learning_rate": 5.0127110353799915e-06, + "loss": 2.6213, + "step": 8998 + }, + { + "epoch": 0.8146103014393048, + "grad_norm": 0.8557177186012268, + "learning_rate": 5.002794847534764e-06, + "loss": 2.6213, + "step": 8999 + }, + { + "epoch": 0.8147008237530551, + "grad_norm": 0.9503716230392456, + "learning_rate": 4.992888225905468e-06, + "loss": 2.6209, + "step": 9000 + }, + { + "epoch": 0.8147008237530551, + "eval_loss": 2.5586037635803223, + "eval_runtime": 71.4633, + "eval_samples_per_second": 37.824, + "eval_steps_per_second": 3.162, + "step": 9000 + }, + { + "epoch": 0.8147913460668055, + "grad_norm": 0.8258231282234192, + "learning_rate": 4.982991171489737e-06, + "loss": 2.638, + "step": 9001 + }, + { + "epoch": 0.8148818683805558, + "grad_norm": 0.9333593845367432, + "learning_rate": 4.973103685284197e-06, + "loss": 2.6165, + "step": 9002 + }, + { + "epoch": 0.8149723906943062, + "grad_norm": 0.8705248236656189, + "learning_rate": 4.963225768284507e-06, + "loss": 2.6147, + "step": 9003 + }, + { + "epoch": 0.8150629130080564, + "grad_norm": 0.8360617756843567, + "learning_rate": 4.953357421485394e-06, + "loss": 2.6, + "step": 9004 + }, + { + "epoch": 0.8151534353218068, + "grad_norm": 0.8686841726303101, + "learning_rate": 4.943498645880595e-06, + "loss": 2.5953, + "step": 9005 + }, + { + "epoch": 0.8152439576355571, + "grad_norm": 0.8963146805763245, + "learning_rate": 4.933649442462851e-06, + "loss": 2.5785, + "step": 9006 + }, + { + "epoch": 0.8153344799493075, + "grad_norm": 0.853178858757019, + "learning_rate": 4.923809812224045e-06, + "loss": 2.6126, + "step": 9007 + }, + { + "epoch": 0.8154250022630578, + "grad_norm": 0.8550891876220703, + "learning_rate": 4.913979756154963e-06, + "loss": 2.6948, + "step": 9008 + }, + { + "epoch": 0.8155155245768082, + "grad_norm": 0.815525233745575, + "learning_rate": 4.904159275245524e-06, + "loss": 2.6115, + "step": 9009 + }, + { + "epoch": 0.8156060468905585, + "grad_norm": 0.8543317914009094, + "learning_rate": 4.8943483704846475e-06, + "loss": 2.6508, + "step": 9010 + }, + { + "epoch": 0.8156965692043089, + "grad_norm": 0.7768651843070984, + "learning_rate": 4.884547042860288e-06, + "loss": 2.6331, + "step": 9011 + }, + { + "epoch": 0.8157870915180592, + "grad_norm": 0.805082380771637, + "learning_rate": 4.874755293359434e-06, + "loss": 2.6249, + "step": 9012 + }, + { + "epoch": 0.8158776138318096, + "grad_norm": 0.8653188943862915, + "learning_rate": 4.86497312296813e-06, + "loss": 2.6314, + "step": 9013 + }, + { + "epoch": 0.8159681361455599, + "grad_norm": 0.8116848468780518, + "learning_rate": 4.8552005326714e-06, + "loss": 2.6135, + "step": 9014 + }, + { + "epoch": 0.8160586584593102, + "grad_norm": 0.8101407885551453, + "learning_rate": 4.845437523453411e-06, + "loss": 2.5825, + "step": 9015 + }, + { + "epoch": 0.8161491807730605, + "grad_norm": 0.8034658432006836, + "learning_rate": 4.835684096297244e-06, + "loss": 2.575, + "step": 9016 + }, + { + "epoch": 0.8162397030868109, + "grad_norm": 0.9002031087875366, + "learning_rate": 4.82594025218508e-06, + "loss": 2.6026, + "step": 9017 + }, + { + "epoch": 0.8163302254005612, + "grad_norm": 0.952223002910614, + "learning_rate": 4.8162059920981324e-06, + "loss": 2.6649, + "step": 9018 + }, + { + "epoch": 0.8164207477143116, + "grad_norm": 0.9982672333717346, + "learning_rate": 4.806481317016631e-06, + "loss": 2.6041, + "step": 9019 + }, + { + "epoch": 0.8165112700280619, + "grad_norm": 0.8889972567558289, + "learning_rate": 4.796766227919857e-06, + "loss": 2.5816, + "step": 9020 + }, + { + "epoch": 0.8166017923418123, + "grad_norm": 0.8761163353919983, + "learning_rate": 4.7870607257861415e-06, + "loss": 2.6037, + "step": 9021 + }, + { + "epoch": 0.8166923146555626, + "grad_norm": 0.8455398678779602, + "learning_rate": 4.777364811592766e-06, + "loss": 2.6092, + "step": 9022 + }, + { + "epoch": 0.816782836969313, + "grad_norm": 0.9293164610862732, + "learning_rate": 4.767678486316185e-06, + "loss": 2.6139, + "step": 9023 + }, + { + "epoch": 0.8168733592830633, + "grad_norm": 0.8474072217941284, + "learning_rate": 4.758001750931762e-06, + "loss": 2.591, + "step": 9024 + }, + { + "epoch": 0.8169638815968137, + "grad_norm": 0.8196378350257874, + "learning_rate": 4.748334606413951e-06, + "loss": 2.5984, + "step": 9025 + }, + { + "epoch": 0.8170544039105639, + "grad_norm": 0.8233461976051331, + "learning_rate": 4.73867705373624e-06, + "loss": 2.5775, + "step": 9026 + }, + { + "epoch": 0.8171449262243143, + "grad_norm": 0.8707271218299866, + "learning_rate": 4.729029093871151e-06, + "loss": 2.5932, + "step": 9027 + }, + { + "epoch": 0.8172354485380646, + "grad_norm": 0.865720272064209, + "learning_rate": 4.719390727790218e-06, + "loss": 2.6235, + "step": 9028 + }, + { + "epoch": 0.817325970851815, + "grad_norm": 0.8621955513954163, + "learning_rate": 4.709761956464043e-06, + "loss": 2.6102, + "step": 9029 + }, + { + "epoch": 0.8174164931655653, + "grad_norm": 0.8045375347137451, + "learning_rate": 4.700142780862205e-06, + "loss": 2.6161, + "step": 9030 + }, + { + "epoch": 0.8175070154793157, + "grad_norm": 0.8365339040756226, + "learning_rate": 4.690533201953407e-06, + "loss": 2.6275, + "step": 9031 + }, + { + "epoch": 0.817597537793066, + "grad_norm": 0.872138261795044, + "learning_rate": 4.680933220705308e-06, + "loss": 2.5851, + "step": 9032 + }, + { + "epoch": 0.8176880601068164, + "grad_norm": 0.7883930802345276, + "learning_rate": 4.671342838084614e-06, + "loss": 2.5809, + "step": 9033 + }, + { + "epoch": 0.8177785824205667, + "grad_norm": 0.8309215307235718, + "learning_rate": 4.661762055057084e-06, + "loss": 2.6601, + "step": 9034 + }, + { + "epoch": 0.817869104734317, + "grad_norm": 0.875511109828949, + "learning_rate": 4.652190872587525e-06, + "loss": 2.5819, + "step": 9035 + }, + { + "epoch": 0.8179596270480674, + "grad_norm": 0.9092219471931458, + "learning_rate": 4.6426292916397105e-06, + "loss": 2.6716, + "step": 9036 + }, + { + "epoch": 0.8180501493618176, + "grad_norm": 0.8753784894943237, + "learning_rate": 4.633077313176548e-06, + "loss": 2.5837, + "step": 9037 + }, + { + "epoch": 0.818140671675568, + "grad_norm": 0.8911800384521484, + "learning_rate": 4.623534938159868e-06, + "loss": 2.5886, + "step": 9038 + }, + { + "epoch": 0.8182311939893183, + "grad_norm": 0.8206343054771423, + "learning_rate": 4.614002167550613e-06, + "loss": 2.6194, + "step": 9039 + }, + { + "epoch": 0.8183217163030687, + "grad_norm": 0.9463024735450745, + "learning_rate": 4.604479002308737e-06, + "loss": 2.5975, + "step": 9040 + }, + { + "epoch": 0.818412238616819, + "grad_norm": 0.8719370365142822, + "learning_rate": 4.594965443393206e-06, + "loss": 2.6995, + "step": 9041 + }, + { + "epoch": 0.8185027609305694, + "grad_norm": 0.8398632407188416, + "learning_rate": 4.585461491762044e-06, + "loss": 2.5941, + "step": 9042 + }, + { + "epoch": 0.8185932832443197, + "grad_norm": 0.8209207653999329, + "learning_rate": 4.575967148372317e-06, + "loss": 2.6073, + "step": 9043 + }, + { + "epoch": 0.8186838055580701, + "grad_norm": 0.9303069710731506, + "learning_rate": 4.5664824141800624e-06, + "loss": 2.6029, + "step": 9044 + }, + { + "epoch": 0.8187743278718204, + "grad_norm": 0.8226931095123291, + "learning_rate": 4.557007290140447e-06, + "loss": 2.697, + "step": 9045 + }, + { + "epoch": 0.8188648501855708, + "grad_norm": 0.9183313250541687, + "learning_rate": 4.547541777207565e-06, + "loss": 2.6763, + "step": 9046 + }, + { + "epoch": 0.818955372499321, + "grad_norm": 0.8149845004081726, + "learning_rate": 4.538085876334619e-06, + "loss": 2.6456, + "step": 9047 + }, + { + "epoch": 0.8190458948130714, + "grad_norm": 0.8685399293899536, + "learning_rate": 4.528639588473804e-06, + "loss": 2.5925, + "step": 9048 + }, + { + "epoch": 0.8191364171268217, + "grad_norm": 0.830268144607544, + "learning_rate": 4.519202914576381e-06, + "loss": 2.5512, + "step": 9049 + }, + { + "epoch": 0.8192269394405721, + "grad_norm": 0.8395816683769226, + "learning_rate": 4.509775855592613e-06, + "loss": 2.6046, + "step": 9050 + }, + { + "epoch": 0.8193174617543224, + "grad_norm": 0.8282546401023865, + "learning_rate": 4.5003584124718055e-06, + "loss": 2.6571, + "step": 9051 + }, + { + "epoch": 0.8194079840680728, + "grad_norm": 0.8863653540611267, + "learning_rate": 4.490950586162279e-06, + "loss": 2.6142, + "step": 9052 + }, + { + "epoch": 0.8194985063818231, + "grad_norm": 0.8952196836471558, + "learning_rate": 4.481552377611431e-06, + "loss": 2.6454, + "step": 9053 + }, + { + "epoch": 0.8195890286955735, + "grad_norm": 0.845206081867218, + "learning_rate": 4.4721637877656375e-06, + "loss": 2.6916, + "step": 9054 + }, + { + "epoch": 0.8196795510093238, + "grad_norm": 0.8556833863258362, + "learning_rate": 4.462784817570331e-06, + "loss": 2.6503, + "step": 9055 + }, + { + "epoch": 0.8197700733230742, + "grad_norm": 0.7939481735229492, + "learning_rate": 4.45341546796999e-06, + "loss": 2.6606, + "step": 9056 + }, + { + "epoch": 0.8198605956368245, + "grad_norm": 0.8105207085609436, + "learning_rate": 4.444055739908082e-06, + "loss": 2.6122, + "step": 9057 + }, + { + "epoch": 0.8199511179505748, + "grad_norm": 0.8216555714607239, + "learning_rate": 4.434705634327163e-06, + "loss": 2.5746, + "step": 9058 + }, + { + "epoch": 0.8200416402643251, + "grad_norm": 0.9625558853149414, + "learning_rate": 4.42536515216877e-06, + "loss": 2.6698, + "step": 9059 + }, + { + "epoch": 0.8201321625780755, + "grad_norm": 0.9395487308502197, + "learning_rate": 4.416034294373472e-06, + "loss": 2.6948, + "step": 9060 + }, + { + "epoch": 0.8202226848918258, + "grad_norm": 0.8587431907653809, + "learning_rate": 4.406713061880941e-06, + "loss": 2.5942, + "step": 9061 + }, + { + "epoch": 0.8203132072055762, + "grad_norm": 0.9051746129989624, + "learning_rate": 4.3974014556297685e-06, + "loss": 2.6551, + "step": 9062 + }, + { + "epoch": 0.8204037295193265, + "grad_norm": 0.8491767644882202, + "learning_rate": 4.388099476557639e-06, + "loss": 2.5966, + "step": 9063 + }, + { + "epoch": 0.8204942518330769, + "grad_norm": 0.877068042755127, + "learning_rate": 4.378807125601303e-06, + "loss": 2.6041, + "step": 9064 + }, + { + "epoch": 0.8205847741468272, + "grad_norm": 0.8690078258514404, + "learning_rate": 4.369524403696457e-06, + "loss": 2.6046, + "step": 9065 + }, + { + "epoch": 0.8206752964605776, + "grad_norm": 0.8469387292861938, + "learning_rate": 4.360251311777896e-06, + "loss": 2.6946, + "step": 9066 + }, + { + "epoch": 0.8207658187743279, + "grad_norm": 0.8528283834457397, + "learning_rate": 4.3509878507793975e-06, + "loss": 2.5908, + "step": 9067 + }, + { + "epoch": 0.8208563410880783, + "grad_norm": 0.8180561065673828, + "learning_rate": 4.341734021633814e-06, + "loss": 2.6254, + "step": 9068 + }, + { + "epoch": 0.8209468634018285, + "grad_norm": 0.8620453476905823, + "learning_rate": 4.33248982527299e-06, + "loss": 2.6312, + "step": 9069 + }, + { + "epoch": 0.8210373857155789, + "grad_norm": 0.8564789295196533, + "learning_rate": 4.323255262627846e-06, + "loss": 2.5788, + "step": 9070 + }, + { + "epoch": 0.8211279080293292, + "grad_norm": 0.8219857215881348, + "learning_rate": 4.314030334628249e-06, + "loss": 2.6346, + "step": 9071 + }, + { + "epoch": 0.8212184303430796, + "grad_norm": 0.8396607637405396, + "learning_rate": 4.304815042203203e-06, + "loss": 2.6268, + "step": 9072 + }, + { + "epoch": 0.8213089526568299, + "grad_norm": 0.8367118239402771, + "learning_rate": 4.2956093862806525e-06, + "loss": 2.5875, + "step": 9073 + }, + { + "epoch": 0.8213994749705803, + "grad_norm": 0.8491257429122925, + "learning_rate": 4.286413367787612e-06, + "loss": 2.653, + "step": 9074 + }, + { + "epoch": 0.8214899972843306, + "grad_norm": 0.8212398886680603, + "learning_rate": 4.277226987650129e-06, + "loss": 2.597, + "step": 9075 + }, + { + "epoch": 0.8215805195980809, + "grad_norm": 0.857515811920166, + "learning_rate": 4.268050246793276e-06, + "loss": 2.6104, + "step": 9076 + }, + { + "epoch": 0.8216710419118313, + "grad_norm": 0.8828648328781128, + "learning_rate": 4.258883146141135e-06, + "loss": 2.6992, + "step": 9077 + }, + { + "epoch": 0.8217615642255816, + "grad_norm": 0.8497927188873291, + "learning_rate": 4.249725686616867e-06, + "loss": 2.6917, + "step": 9078 + }, + { + "epoch": 0.821852086539332, + "grad_norm": 0.921812117099762, + "learning_rate": 4.24057786914257e-06, + "loss": 2.6864, + "step": 9079 + }, + { + "epoch": 0.8219426088530822, + "grad_norm": 0.901273787021637, + "learning_rate": 4.231439694639483e-06, + "loss": 2.6079, + "step": 9080 + }, + { + "epoch": 0.8220331311668326, + "grad_norm": 0.863243043422699, + "learning_rate": 4.2223111640277815e-06, + "loss": 2.596, + "step": 9081 + }, + { + "epoch": 0.8221236534805829, + "grad_norm": 0.8987923860549927, + "learning_rate": 4.2131922782267405e-06, + "loss": 2.6031, + "step": 9082 + }, + { + "epoch": 0.8222141757943333, + "grad_norm": 0.8319920897483826, + "learning_rate": 4.2040830381546045e-06, + "loss": 2.6171, + "step": 9083 + }, + { + "epoch": 0.8223046981080836, + "grad_norm": 0.8061295747756958, + "learning_rate": 4.194983444728684e-06, + "loss": 2.5421, + "step": 9084 + }, + { + "epoch": 0.822395220421834, + "grad_norm": 0.8673340082168579, + "learning_rate": 4.185893498865323e-06, + "loss": 2.6519, + "step": 9085 + }, + { + "epoch": 0.8224857427355843, + "grad_norm": 0.8772735595703125, + "learning_rate": 4.176813201479867e-06, + "loss": 2.6119, + "step": 9086 + }, + { + "epoch": 0.8225762650493347, + "grad_norm": 0.9388923048973083, + "learning_rate": 4.167742553486675e-06, + "loss": 2.6075, + "step": 9087 + }, + { + "epoch": 0.822666787363085, + "grad_norm": 0.8103634715080261, + "learning_rate": 4.158681555799204e-06, + "loss": 2.5887, + "step": 9088 + }, + { + "epoch": 0.8227573096768354, + "grad_norm": 0.8761494159698486, + "learning_rate": 4.14963020932988e-06, + "loss": 2.6375, + "step": 9089 + }, + { + "epoch": 0.8228478319905856, + "grad_norm": 0.8322147130966187, + "learning_rate": 4.140588514990162e-06, + "loss": 2.6086, + "step": 9090 + }, + { + "epoch": 0.822938354304336, + "grad_norm": 0.823414146900177, + "learning_rate": 4.131556473690557e-06, + "loss": 2.6468, + "step": 9091 + }, + { + "epoch": 0.8230288766180863, + "grad_norm": 0.8451811075210571, + "learning_rate": 4.122534086340612e-06, + "loss": 2.6127, + "step": 9092 + }, + { + "epoch": 0.8231193989318367, + "grad_norm": 0.7874065637588501, + "learning_rate": 4.113521353848826e-06, + "loss": 2.6115, + "step": 9093 + }, + { + "epoch": 0.823209921245587, + "grad_norm": 0.9588502645492554, + "learning_rate": 4.104518277122848e-06, + "loss": 2.6193, + "step": 9094 + }, + { + "epoch": 0.8233004435593374, + "grad_norm": 0.8688252568244934, + "learning_rate": 4.095524857069244e-06, + "loss": 2.6243, + "step": 9095 + }, + { + "epoch": 0.8233909658730877, + "grad_norm": 0.8532291650772095, + "learning_rate": 4.086541094593666e-06, + "loss": 2.5685, + "step": 9096 + }, + { + "epoch": 0.8234814881868381, + "grad_norm": 0.866753876209259, + "learning_rate": 4.077566990600767e-06, + "loss": 2.5997, + "step": 9097 + }, + { + "epoch": 0.8235720105005884, + "grad_norm": 0.8232748508453369, + "learning_rate": 4.068602545994249e-06, + "loss": 2.623, + "step": 9098 + }, + { + "epoch": 0.8236625328143388, + "grad_norm": 0.8832443952560425, + "learning_rate": 4.059647761676833e-06, + "loss": 2.6458, + "step": 9099 + }, + { + "epoch": 0.823753055128089, + "grad_norm": 0.845489501953125, + "learning_rate": 4.050702638550275e-06, + "loss": 2.5738, + "step": 9100 + }, + { + "epoch": 0.8238435774418394, + "grad_norm": 0.8758887052536011, + "learning_rate": 4.04176717751531e-06, + "loss": 2.6469, + "step": 9101 + }, + { + "epoch": 0.8239340997555897, + "grad_norm": 0.8528258800506592, + "learning_rate": 4.0328413794717855e-06, + "loss": 2.6441, + "step": 9102 + }, + { + "epoch": 0.8240246220693401, + "grad_norm": 0.8793032765388489, + "learning_rate": 4.023925245318494e-06, + "loss": 2.64, + "step": 9103 + }, + { + "epoch": 0.8241151443830904, + "grad_norm": 0.8211391568183899, + "learning_rate": 4.015018775953317e-06, + "loss": 2.608, + "step": 9104 + }, + { + "epoch": 0.8242056666968408, + "grad_norm": 0.842514157295227, + "learning_rate": 4.006121972273113e-06, + "loss": 2.6211, + "step": 9105 + }, + { + "epoch": 0.8242961890105911, + "grad_norm": 0.7817758917808533, + "learning_rate": 3.997234835173802e-06, + "loss": 2.5894, + "step": 9106 + }, + { + "epoch": 0.8243867113243415, + "grad_norm": 0.8630059361457825, + "learning_rate": 3.9883573655503106e-06, + "loss": 2.609, + "step": 9107 + }, + { + "epoch": 0.8244772336380918, + "grad_norm": 0.871557354927063, + "learning_rate": 3.979489564296624e-06, + "loss": 2.6673, + "step": 9108 + }, + { + "epoch": 0.8245677559518422, + "grad_norm": 0.9041063189506531, + "learning_rate": 3.970631432305694e-06, + "loss": 2.627, + "step": 9109 + }, + { + "epoch": 0.8246582782655925, + "grad_norm": 0.9307136535644531, + "learning_rate": 3.961782970469563e-06, + "loss": 2.6341, + "step": 9110 + }, + { + "epoch": 0.8247488005793429, + "grad_norm": 0.8669264316558838, + "learning_rate": 3.952944179679252e-06, + "loss": 2.5751, + "step": 9111 + }, + { + "epoch": 0.8248393228930931, + "grad_norm": 0.8764011263847351, + "learning_rate": 3.944115060824826e-06, + "loss": 2.6495, + "step": 9112 + }, + { + "epoch": 0.8249298452068435, + "grad_norm": 0.8889691829681396, + "learning_rate": 3.935295614795398e-06, + "loss": 2.6362, + "step": 9113 + }, + { + "epoch": 0.8250203675205938, + "grad_norm": 0.8333461284637451, + "learning_rate": 3.926485842479066e-06, + "loss": 2.5991, + "step": 9114 + }, + { + "epoch": 0.8251108898343442, + "grad_norm": 0.796550989151001, + "learning_rate": 3.917685744762989e-06, + "loss": 2.6287, + "step": 9115 + }, + { + "epoch": 0.8252014121480945, + "grad_norm": 0.8666393756866455, + "learning_rate": 3.908895322533335e-06, + "loss": 2.6685, + "step": 9116 + }, + { + "epoch": 0.8252919344618448, + "grad_norm": 0.8646209836006165, + "learning_rate": 3.900114576675262e-06, + "loss": 2.6127, + "step": 9117 + }, + { + "epoch": 0.8253824567755952, + "grad_norm": 0.8975469470024109, + "learning_rate": 3.891343508073053e-06, + "loss": 2.5959, + "step": 9118 + }, + { + "epoch": 0.8254729790893455, + "grad_norm": 0.8263643980026245, + "learning_rate": 3.882582117609912e-06, + "loss": 2.6019, + "step": 9119 + }, + { + "epoch": 0.8255635014030959, + "grad_norm": 0.8185175061225891, + "learning_rate": 3.873830406168111e-06, + "loss": 2.6134, + "step": 9120 + }, + { + "epoch": 0.8256540237168462, + "grad_norm": 0.8060017824172974, + "learning_rate": 3.865088374628967e-06, + "loss": 2.5985, + "step": 9121 + }, + { + "epoch": 0.8257445460305965, + "grad_norm": 0.812060534954071, + "learning_rate": 3.856356023872798e-06, + "loss": 2.58, + "step": 9122 + }, + { + "epoch": 0.8258350683443468, + "grad_norm": 0.8128660917282104, + "learning_rate": 3.847633354778935e-06, + "loss": 2.612, + "step": 9123 + }, + { + "epoch": 0.8259255906580972, + "grad_norm": 0.8915673494338989, + "learning_rate": 3.838920368225784e-06, + "loss": 2.5464, + "step": 9124 + }, + { + "epoch": 0.8260161129718475, + "grad_norm": 0.802297830581665, + "learning_rate": 3.830217065090702e-06, + "loss": 2.5796, + "step": 9125 + }, + { + "epoch": 0.8261066352855979, + "grad_norm": 0.8497213125228882, + "learning_rate": 3.821523446250142e-06, + "loss": 2.7114, + "step": 9126 + }, + { + "epoch": 0.8261971575993482, + "grad_norm": 0.8551464676856995, + "learning_rate": 3.8128395125795246e-06, + "loss": 2.6046, + "step": 9127 + }, + { + "epoch": 0.8262876799130986, + "grad_norm": 0.8229303359985352, + "learning_rate": 3.8041652649533412e-06, + "loss": 2.6217, + "step": 9128 + }, + { + "epoch": 0.8263782022268489, + "grad_norm": 0.8738320469856262, + "learning_rate": 3.795500704245092e-06, + "loss": 2.6069, + "step": 9129 + }, + { + "epoch": 0.8264687245405993, + "grad_norm": 0.815537691116333, + "learning_rate": 3.7868458313272904e-06, + "loss": 2.6014, + "step": 9130 + }, + { + "epoch": 0.8265592468543496, + "grad_norm": 0.8061794638633728, + "learning_rate": 3.7782006470714616e-06, + "loss": 2.6608, + "step": 9131 + }, + { + "epoch": 0.8266497691681, + "grad_norm": 0.8272706866264343, + "learning_rate": 3.76956515234822e-06, + "loss": 2.65, + "step": 9132 + }, + { + "epoch": 0.8267402914818502, + "grad_norm": 0.8943510055541992, + "learning_rate": 3.760939348027115e-06, + "loss": 2.6195, + "step": 9133 + }, + { + "epoch": 0.8268308137956006, + "grad_norm": 0.8860402703285217, + "learning_rate": 3.7523232349767957e-06, + "loss": 2.5961, + "step": 9134 + }, + { + "epoch": 0.8269213361093509, + "grad_norm": 0.8261328935623169, + "learning_rate": 3.7437168140648904e-06, + "loss": 2.57, + "step": 9135 + }, + { + "epoch": 0.8270118584231013, + "grad_norm": 0.8775939345359802, + "learning_rate": 3.7351200861580617e-06, + "loss": 2.6482, + "step": 9136 + }, + { + "epoch": 0.8271023807368516, + "grad_norm": 0.8148878216743469, + "learning_rate": 3.7265330521220056e-06, + "loss": 2.6488, + "step": 9137 + }, + { + "epoch": 0.827192903050602, + "grad_norm": 0.791273295879364, + "learning_rate": 3.7179557128214526e-06, + "loss": 2.6304, + "step": 9138 + }, + { + "epoch": 0.8272834253643523, + "grad_norm": 0.8352766633033752, + "learning_rate": 3.709388069120101e-06, + "loss": 2.6171, + "step": 9139 + }, + { + "epoch": 0.8273739476781027, + "grad_norm": 0.8089897036552429, + "learning_rate": 3.7008301218807716e-06, + "loss": 2.5845, + "step": 9140 + }, + { + "epoch": 0.827464469991853, + "grad_norm": 0.8363292217254639, + "learning_rate": 3.692281871965186e-06, + "loss": 2.6664, + "step": 9141 + }, + { + "epoch": 0.8275549923056034, + "grad_norm": 0.8899134993553162, + "learning_rate": 3.68374332023419e-06, + "loss": 2.6082, + "step": 9142 + }, + { + "epoch": 0.8276455146193537, + "grad_norm": 0.8281762003898621, + "learning_rate": 3.675214467547594e-06, + "loss": 2.6403, + "step": 9143 + }, + { + "epoch": 0.827736036933104, + "grad_norm": 0.795846700668335, + "learning_rate": 3.6666953147642792e-06, + "loss": 2.6201, + "step": 9144 + }, + { + "epoch": 0.8278265592468543, + "grad_norm": 0.8525198698043823, + "learning_rate": 3.6581858627421027e-06, + "loss": 2.6095, + "step": 9145 + }, + { + "epoch": 0.8279170815606047, + "grad_norm": 0.8217182159423828, + "learning_rate": 3.649686112337991e-06, + "loss": 2.6264, + "step": 9146 + }, + { + "epoch": 0.828007603874355, + "grad_norm": 0.8377102613449097, + "learning_rate": 3.6411960644078367e-06, + "loss": 2.6227, + "step": 9147 + }, + { + "epoch": 0.8280981261881054, + "grad_norm": 0.8462782502174377, + "learning_rate": 3.632715719806601e-06, + "loss": 2.6041, + "step": 9148 + }, + { + "epoch": 0.8281886485018557, + "grad_norm": 0.8169267773628235, + "learning_rate": 3.6242450793882555e-06, + "loss": 2.6211, + "step": 9149 + }, + { + "epoch": 0.8282791708156061, + "grad_norm": 0.8321042060852051, + "learning_rate": 3.615784144005796e-06, + "loss": 2.6057, + "step": 9150 + }, + { + "epoch": 0.8283696931293564, + "grad_norm": 0.8683964014053345, + "learning_rate": 3.607332914511241e-06, + "loss": 2.5884, + "step": 9151 + }, + { + "epoch": 0.8284602154431068, + "grad_norm": 0.8136768937110901, + "learning_rate": 3.59889139175561e-06, + "loss": 2.5525, + "step": 9152 + }, + { + "epoch": 0.8285507377568571, + "grad_norm": 0.8591272234916687, + "learning_rate": 3.5904595765890005e-06, + "loss": 2.6401, + "step": 9153 + }, + { + "epoch": 0.8286412600706075, + "grad_norm": 0.7958996295928955, + "learning_rate": 3.5820374698604555e-06, + "loss": 2.6246, + "step": 9154 + }, + { + "epoch": 0.8287317823843577, + "grad_norm": 0.8030251860618591, + "learning_rate": 3.5736250724180966e-06, + "loss": 2.6206, + "step": 9155 + }, + { + "epoch": 0.8288223046981081, + "grad_norm": 0.904325544834137, + "learning_rate": 3.565222385109068e-06, + "loss": 2.5609, + "step": 9156 + }, + { + "epoch": 0.8289128270118584, + "grad_norm": 0.8910101056098938, + "learning_rate": 3.5568294087795053e-06, + "loss": 2.6318, + "step": 9157 + }, + { + "epoch": 0.8290033493256087, + "grad_norm": 0.8450800776481628, + "learning_rate": 3.5484461442745643e-06, + "loss": 2.621, + "step": 9158 + }, + { + "epoch": 0.8290938716393591, + "grad_norm": 0.8555982708930969, + "learning_rate": 3.5400725924384813e-06, + "loss": 2.6358, + "step": 9159 + }, + { + "epoch": 0.8291843939531094, + "grad_norm": 0.8615543842315674, + "learning_rate": 3.5317087541144377e-06, + "loss": 2.63, + "step": 9160 + }, + { + "epoch": 0.8292749162668598, + "grad_norm": 0.83702552318573, + "learning_rate": 3.5233546301446817e-06, + "loss": 2.5729, + "step": 9161 + }, + { + "epoch": 0.8293654385806101, + "grad_norm": 0.8201742768287659, + "learning_rate": 3.515010221370474e-06, + "loss": 2.5653, + "step": 9162 + }, + { + "epoch": 0.8294559608943605, + "grad_norm": 0.8219901323318481, + "learning_rate": 3.5066755286320975e-06, + "loss": 2.5807, + "step": 9163 + }, + { + "epoch": 0.8295464832081108, + "grad_norm": 0.8451256155967712, + "learning_rate": 3.4983505527688586e-06, + "loss": 2.6275, + "step": 9164 + }, + { + "epoch": 0.8296370055218611, + "grad_norm": 0.8181697130203247, + "learning_rate": 3.4900352946190874e-06, + "loss": 2.5696, + "step": 9165 + }, + { + "epoch": 0.8297275278356114, + "grad_norm": 0.8448930382728577, + "learning_rate": 3.4817297550200913e-06, + "loss": 2.6381, + "step": 9166 + }, + { + "epoch": 0.8298180501493618, + "grad_norm": 0.9296396374702454, + "learning_rate": 3.4734339348083012e-06, + "loss": 2.6578, + "step": 9167 + }, + { + "epoch": 0.8299085724631121, + "grad_norm": 0.8755658268928528, + "learning_rate": 3.4651478348190603e-06, + "loss": 2.6097, + "step": 9168 + }, + { + "epoch": 0.8299990947768625, + "grad_norm": 0.8138020038604736, + "learning_rate": 3.45687145588679e-06, + "loss": 2.6322, + "step": 9169 + }, + { + "epoch": 0.8300896170906128, + "grad_norm": 0.821542501449585, + "learning_rate": 3.448604798844912e-06, + "loss": 2.602, + "step": 9170 + }, + { + "epoch": 0.8301801394043632, + "grad_norm": 0.8439465165138245, + "learning_rate": 3.4403478645259057e-06, + "loss": 2.6411, + "step": 9171 + }, + { + "epoch": 0.8302706617181135, + "grad_norm": 0.8149715662002563, + "learning_rate": 3.4321006537612165e-06, + "loss": 2.5726, + "step": 9172 + }, + { + "epoch": 0.8303611840318639, + "grad_norm": 0.8409690856933594, + "learning_rate": 3.4238631673813582e-06, + "loss": 2.6176, + "step": 9173 + }, + { + "epoch": 0.8304517063456142, + "grad_norm": 0.8181410431861877, + "learning_rate": 3.4156354062158226e-06, + "loss": 2.595, + "step": 9174 + }, + { + "epoch": 0.8305422286593646, + "grad_norm": 0.8303077220916748, + "learning_rate": 3.40741737109318e-06, + "loss": 2.561, + "step": 9175 + }, + { + "epoch": 0.8306327509731148, + "grad_norm": 0.8174563050270081, + "learning_rate": 3.3992090628409466e-06, + "loss": 2.6311, + "step": 9176 + }, + { + "epoch": 0.8307232732868652, + "grad_norm": 0.8620786666870117, + "learning_rate": 3.391010482285728e-06, + "loss": 2.6014, + "step": 9177 + }, + { + "epoch": 0.8308137956006155, + "grad_norm": 0.9524500370025635, + "learning_rate": 3.3828216302531078e-06, + "loss": 2.6493, + "step": 9178 + }, + { + "epoch": 0.8309043179143659, + "grad_norm": 0.8139269948005676, + "learning_rate": 3.3746425075676933e-06, + "loss": 2.6455, + "step": 9179 + }, + { + "epoch": 0.8309948402281162, + "grad_norm": 0.8140713572502136, + "learning_rate": 3.3664731150531482e-06, + "loss": 2.5223, + "step": 9180 + }, + { + "epoch": 0.8310853625418666, + "grad_norm": 0.8157796859741211, + "learning_rate": 3.358313453532125e-06, + "loss": 2.6053, + "step": 9181 + }, + { + "epoch": 0.8311758848556169, + "grad_norm": 0.8004946112632751, + "learning_rate": 3.3501635238262663e-06, + "loss": 2.6019, + "step": 9182 + }, + { + "epoch": 0.8312664071693673, + "grad_norm": 0.8402578830718994, + "learning_rate": 3.3420233267563272e-06, + "loss": 2.5942, + "step": 9183 + }, + { + "epoch": 0.8313569294831176, + "grad_norm": 0.8298675417900085, + "learning_rate": 3.333892863141974e-06, + "loss": 2.6567, + "step": 9184 + }, + { + "epoch": 0.831447451796868, + "grad_norm": 0.8852568864822388, + "learning_rate": 3.325772133801963e-06, + "loss": 2.6207, + "step": 9185 + }, + { + "epoch": 0.8315379741106182, + "grad_norm": 0.8615192770957947, + "learning_rate": 3.3176611395540626e-06, + "loss": 2.5879, + "step": 9186 + }, + { + "epoch": 0.8316284964243686, + "grad_norm": 0.9066726565361023, + "learning_rate": 3.3095598812150406e-06, + "loss": 2.634, + "step": 9187 + }, + { + "epoch": 0.8317190187381189, + "grad_norm": 0.8623664975166321, + "learning_rate": 3.301468359600679e-06, + "loss": 2.6659, + "step": 9188 + }, + { + "epoch": 0.8318095410518693, + "grad_norm": 0.7866799831390381, + "learning_rate": 3.293386575525825e-06, + "loss": 2.6064, + "step": 9189 + }, + { + "epoch": 0.8319000633656196, + "grad_norm": 0.8399327993392944, + "learning_rate": 3.2853145298042953e-06, + "loss": 2.6452, + "step": 9190 + }, + { + "epoch": 0.83199058567937, + "grad_norm": 0.8688633441925049, + "learning_rate": 3.2772522232489387e-06, + "loss": 2.687, + "step": 9191 + }, + { + "epoch": 0.8320811079931203, + "grad_norm": 0.8381837606430054, + "learning_rate": 3.269199656671629e-06, + "loss": 2.6317, + "step": 9192 + }, + { + "epoch": 0.8321716303068707, + "grad_norm": 0.8940417170524597, + "learning_rate": 3.261156830883283e-06, + "loss": 2.6023, + "step": 9193 + }, + { + "epoch": 0.832262152620621, + "grad_norm": 0.8516998291015625, + "learning_rate": 3.253123746693787e-06, + "loss": 2.6418, + "step": 9194 + }, + { + "epoch": 0.8323526749343714, + "grad_norm": 0.8648186922073364, + "learning_rate": 3.245100404912094e-06, + "loss": 2.6774, + "step": 9195 + }, + { + "epoch": 0.8324431972481217, + "grad_norm": 0.9397299289703369, + "learning_rate": 3.2370868063461236e-06, + "loss": 2.6327, + "step": 9196 + }, + { + "epoch": 0.832533719561872, + "grad_norm": 0.8416707515716553, + "learning_rate": 3.2290829518028862e-06, + "loss": 2.6227, + "step": 9197 + }, + { + "epoch": 0.8326242418756223, + "grad_norm": 0.9227898716926575, + "learning_rate": 3.221088842088338e-06, + "loss": 2.6949, + "step": 9198 + }, + { + "epoch": 0.8327147641893726, + "grad_norm": 0.8900390863418579, + "learning_rate": 3.213104478007489e-06, + "loss": 2.5867, + "step": 9199 + }, + { + "epoch": 0.832805286503123, + "grad_norm": 0.8820727467536926, + "learning_rate": 3.2051298603643753e-06, + "loss": 2.5983, + "step": 9200 + }, + { + "epoch": 0.8328958088168733, + "grad_norm": 0.8546134829521179, + "learning_rate": 3.1971649899620317e-06, + "loss": 2.5818, + "step": 9201 + }, + { + "epoch": 0.8329863311306237, + "grad_norm": 0.8514634966850281, + "learning_rate": 3.1892098676025274e-06, + "loss": 2.6578, + "step": 9202 + }, + { + "epoch": 0.833076853444374, + "grad_norm": 0.9180089235305786, + "learning_rate": 3.1812644940869552e-06, + "loss": 2.5797, + "step": 9203 + }, + { + "epoch": 0.8331673757581244, + "grad_norm": 0.8001142144203186, + "learning_rate": 3.173328870215375e-06, + "loss": 2.574, + "step": 9204 + }, + { + "epoch": 0.8332578980718747, + "grad_norm": 0.9282077550888062, + "learning_rate": 3.165402996786948e-06, + "loss": 2.5877, + "step": 9205 + }, + { + "epoch": 0.8333484203856251, + "grad_norm": 0.8573855757713318, + "learning_rate": 3.157486874599791e-06, + "loss": 2.607, + "step": 9206 + }, + { + "epoch": 0.8334389426993754, + "grad_norm": 0.8349403142929077, + "learning_rate": 3.149580504451044e-06, + "loss": 2.6184, + "step": 9207 + }, + { + "epoch": 0.8335294650131257, + "grad_norm": 0.8084670305252075, + "learning_rate": 3.1416838871368924e-06, + "loss": 2.623, + "step": 9208 + }, + { + "epoch": 0.833619987326876, + "grad_norm": 0.8632099032402039, + "learning_rate": 3.1337970234525226e-06, + "loss": 2.6386, + "step": 9209 + }, + { + "epoch": 0.8337105096406264, + "grad_norm": 0.8786114454269409, + "learning_rate": 3.1259199141921435e-06, + "loss": 2.5755, + "step": 9210 + }, + { + "epoch": 0.8338010319543767, + "grad_norm": 0.8364348411560059, + "learning_rate": 3.1180525601489873e-06, + "loss": 2.6017, + "step": 9211 + }, + { + "epoch": 0.8338915542681271, + "grad_norm": 0.8469467163085938, + "learning_rate": 3.1101949621152647e-06, + "loss": 2.6735, + "step": 9212 + }, + { + "epoch": 0.8339820765818774, + "grad_norm": 0.8727800846099854, + "learning_rate": 3.1023471208822765e-06, + "loss": 2.5964, + "step": 9213 + }, + { + "epoch": 0.8340725988956278, + "grad_norm": 0.9073167443275452, + "learning_rate": 3.0945090372402785e-06, + "loss": 2.6447, + "step": 9214 + }, + { + "epoch": 0.8341631212093781, + "grad_norm": 0.8227028250694275, + "learning_rate": 3.0866807119785734e-06, + "loss": 2.6402, + "step": 9215 + }, + { + "epoch": 0.8342536435231285, + "grad_norm": 0.8529277443885803, + "learning_rate": 3.0788621458854527e-06, + "loss": 2.6093, + "step": 9216 + }, + { + "epoch": 0.8343441658368788, + "grad_norm": 0.8378677368164062, + "learning_rate": 3.0710533397482866e-06, + "loss": 2.654, + "step": 9217 + }, + { + "epoch": 0.8344346881506292, + "grad_norm": 0.8516073226928711, + "learning_rate": 3.063254294353368e-06, + "loss": 2.5382, + "step": 9218 + }, + { + "epoch": 0.8345252104643794, + "grad_norm": 0.831871509552002, + "learning_rate": 3.0554650104861136e-06, + "loss": 2.6137, + "step": 9219 + }, + { + "epoch": 0.8346157327781298, + "grad_norm": 0.8696051836013794, + "learning_rate": 3.047685488930874e-06, + "loss": 2.5878, + "step": 9220 + }, + { + "epoch": 0.8347062550918801, + "grad_norm": 0.8664183020591736, + "learning_rate": 3.0399157304710545e-06, + "loss": 2.6619, + "step": 9221 + }, + { + "epoch": 0.8347967774056305, + "grad_norm": 0.7911030650138855, + "learning_rate": 3.032155735889053e-06, + "loss": 2.5723, + "step": 9222 + }, + { + "epoch": 0.8348872997193808, + "grad_norm": 0.9079735279083252, + "learning_rate": 3.024405505966332e-06, + "loss": 2.7033, + "step": 9223 + }, + { + "epoch": 0.8349778220331312, + "grad_norm": 0.8091928958892822, + "learning_rate": 3.016665041483313e-06, + "loss": 2.5664, + "step": 9224 + }, + { + "epoch": 0.8350683443468815, + "grad_norm": 0.9017247557640076, + "learning_rate": 3.0089343432194826e-06, + "loss": 2.6433, + "step": 9225 + }, + { + "epoch": 0.8351588666606319, + "grad_norm": 0.8352522253990173, + "learning_rate": 3.0012134119532964e-06, + "loss": 2.6499, + "step": 9226 + }, + { + "epoch": 0.8352493889743822, + "grad_norm": 0.8612746596336365, + "learning_rate": 2.9935022484622766e-06, + "loss": 2.6483, + "step": 9227 + }, + { + "epoch": 0.8353399112881326, + "grad_norm": 0.9562451243400574, + "learning_rate": 2.985800853522924e-06, + "loss": 2.6012, + "step": 9228 + }, + { + "epoch": 0.8354304336018828, + "grad_norm": 0.8161827325820923, + "learning_rate": 2.9781092279107637e-06, + "loss": 2.5909, + "step": 9229 + }, + { + "epoch": 0.8355209559156332, + "grad_norm": 0.9282371401786804, + "learning_rate": 2.970427372400353e-06, + "loss": 2.5909, + "step": 9230 + }, + { + "epoch": 0.8356114782293835, + "grad_norm": 0.8111222386360168, + "learning_rate": 2.962755287765251e-06, + "loss": 2.6167, + "step": 9231 + }, + { + "epoch": 0.8357020005431339, + "grad_norm": 0.7850275635719299, + "learning_rate": 2.955092974778051e-06, + "loss": 2.5852, + "step": 9232 + }, + { + "epoch": 0.8357925228568842, + "grad_norm": 0.8214910626411438, + "learning_rate": 2.9474404342103135e-06, + "loss": 2.643, + "step": 9233 + }, + { + "epoch": 0.8358830451706346, + "grad_norm": 0.8709685802459717, + "learning_rate": 2.939797666832678e-06, + "loss": 2.5581, + "step": 9234 + }, + { + "epoch": 0.8359735674843849, + "grad_norm": 0.8599022030830383, + "learning_rate": 2.9321646734147502e-06, + "loss": 2.6064, + "step": 9235 + }, + { + "epoch": 0.8360640897981353, + "grad_norm": 0.9196804761886597, + "learning_rate": 2.9245414547251935e-06, + "loss": 2.7081, + "step": 9236 + }, + { + "epoch": 0.8361546121118856, + "grad_norm": 0.8670064210891724, + "learning_rate": 2.91692801153165e-06, + "loss": 2.5799, + "step": 9237 + }, + { + "epoch": 0.836245134425636, + "grad_norm": 0.8151206970214844, + "learning_rate": 2.9093243446008166e-06, + "loss": 2.6471, + "step": 9238 + }, + { + "epoch": 0.8363356567393863, + "grad_norm": 0.7814372181892395, + "learning_rate": 2.901730454698337e-06, + "loss": 2.6122, + "step": 9239 + }, + { + "epoch": 0.8364261790531367, + "grad_norm": 0.8052501678466797, + "learning_rate": 2.894146342588977e-06, + "loss": 2.5885, + "step": 9240 + }, + { + "epoch": 0.8365167013668869, + "grad_norm": 0.830434262752533, + "learning_rate": 2.8865720090364034e-06, + "loss": 2.657, + "step": 9241 + }, + { + "epoch": 0.8366072236806372, + "grad_norm": 0.855962336063385, + "learning_rate": 2.879007454803373e-06, + "loss": 2.61, + "step": 9242 + }, + { + "epoch": 0.8366977459943876, + "grad_norm": 0.8213224411010742, + "learning_rate": 2.8714526806516316e-06, + "loss": 2.6103, + "step": 9243 + }, + { + "epoch": 0.8367882683081379, + "grad_norm": 0.8698161840438843, + "learning_rate": 2.863907687341949e-06, + "loss": 2.5893, + "step": 9244 + }, + { + "epoch": 0.8368787906218883, + "grad_norm": 0.8918118476867676, + "learning_rate": 2.856372475634106e-06, + "loss": 2.595, + "step": 9245 + }, + { + "epoch": 0.8369693129356386, + "grad_norm": 0.8260166645050049, + "learning_rate": 2.8488470462869065e-06, + "loss": 2.5987, + "step": 9246 + }, + { + "epoch": 0.837059835249389, + "grad_norm": 0.8333005309104919, + "learning_rate": 2.8413314000581338e-06, + "loss": 2.5971, + "step": 9247 + }, + { + "epoch": 0.8371503575631393, + "grad_norm": 0.8795336484909058, + "learning_rate": 2.8338255377046375e-06, + "loss": 2.5984, + "step": 9248 + }, + { + "epoch": 0.8372408798768897, + "grad_norm": 0.7897392511367798, + "learning_rate": 2.8263294599822577e-06, + "loss": 2.6101, + "step": 9249 + }, + { + "epoch": 0.83733140219064, + "grad_norm": 0.7922962307929993, + "learning_rate": 2.818843167645835e-06, + "loss": 2.6432, + "step": 9250 + }, + { + "epoch": 0.8374219245043903, + "grad_norm": 0.8887566328048706, + "learning_rate": 2.811366661449244e-06, + "loss": 2.5893, + "step": 9251 + }, + { + "epoch": 0.8375124468181406, + "grad_norm": 0.8131504654884338, + "learning_rate": 2.8038999421453826e-06, + "loss": 2.6805, + "step": 9252 + }, + { + "epoch": 0.837602969131891, + "grad_norm": 0.8904218673706055, + "learning_rate": 2.7964430104861274e-06, + "loss": 2.6164, + "step": 9253 + }, + { + "epoch": 0.8376934914456413, + "grad_norm": 0.8426781892776489, + "learning_rate": 2.7889958672224104e-06, + "loss": 2.6167, + "step": 9254 + }, + { + "epoch": 0.8377840137593917, + "grad_norm": 0.7983620762825012, + "learning_rate": 2.7815585131041435e-06, + "loss": 2.5987, + "step": 9255 + }, + { + "epoch": 0.837874536073142, + "grad_norm": 0.8963645100593567, + "learning_rate": 2.7741309488802712e-06, + "loss": 2.7137, + "step": 9256 + }, + { + "epoch": 0.8379650583868924, + "grad_norm": 0.8122845888137817, + "learning_rate": 2.7667131752987517e-06, + "loss": 2.5951, + "step": 9257 + }, + { + "epoch": 0.8380555807006427, + "grad_norm": 0.83088618516922, + "learning_rate": 2.759305193106554e-06, + "loss": 2.663, + "step": 9258 + }, + { + "epoch": 0.8381461030143931, + "grad_norm": 0.8195231556892395, + "learning_rate": 2.7519070030496695e-06, + "loss": 2.59, + "step": 9259 + }, + { + "epoch": 0.8382366253281434, + "grad_norm": 0.8513022065162659, + "learning_rate": 2.744518605873092e-06, + "loss": 2.6079, + "step": 9260 + }, + { + "epoch": 0.8383271476418938, + "grad_norm": 0.830094039440155, + "learning_rate": 2.737140002320815e-06, + "loss": 2.668, + "step": 9261 + }, + { + "epoch": 0.838417669955644, + "grad_norm": 0.8253921270370483, + "learning_rate": 2.7297711931358993e-06, + "loss": 2.6399, + "step": 9262 + }, + { + "epoch": 0.8385081922693944, + "grad_norm": 0.9423378705978394, + "learning_rate": 2.7224121790603517e-06, + "loss": 2.6048, + "step": 9263 + }, + { + "epoch": 0.8385987145831447, + "grad_norm": 0.8792223930358887, + "learning_rate": 2.715062960835235e-06, + "loss": 2.626, + "step": 9264 + }, + { + "epoch": 0.8386892368968951, + "grad_norm": 0.8084904551506042, + "learning_rate": 2.707723539200613e-06, + "loss": 2.5398, + "step": 9265 + }, + { + "epoch": 0.8387797592106454, + "grad_norm": 0.8560845255851746, + "learning_rate": 2.7003939148955715e-06, + "loss": 2.6488, + "step": 9266 + }, + { + "epoch": 0.8388702815243958, + "grad_norm": 0.9024186730384827, + "learning_rate": 2.6930740886581985e-06, + "loss": 2.6038, + "step": 9267 + }, + { + "epoch": 0.8389608038381461, + "grad_norm": 0.8976023197174072, + "learning_rate": 2.685764061225615e-06, + "loss": 2.5994, + "step": 9268 + }, + { + "epoch": 0.8390513261518965, + "grad_norm": 0.8648386001586914, + "learning_rate": 2.678463833333911e-06, + "loss": 2.5852, + "step": 9269 + }, + { + "epoch": 0.8391418484656468, + "grad_norm": 0.8111081123352051, + "learning_rate": 2.6711734057182415e-06, + "loss": 2.6155, + "step": 9270 + }, + { + "epoch": 0.8392323707793972, + "grad_norm": 0.808134138584137, + "learning_rate": 2.6638927791127534e-06, + "loss": 2.5815, + "step": 9271 + }, + { + "epoch": 0.8393228930931474, + "grad_norm": 0.8514514565467834, + "learning_rate": 2.6566219542505933e-06, + "loss": 2.6199, + "step": 9272 + }, + { + "epoch": 0.8394134154068978, + "grad_norm": 0.8413440585136414, + "learning_rate": 2.649360931863942e-06, + "loss": 2.584, + "step": 9273 + }, + { + "epoch": 0.8395039377206481, + "grad_norm": 0.8079906105995178, + "learning_rate": 2.6421097126839712e-06, + "loss": 2.5947, + "step": 9274 + }, + { + "epoch": 0.8395944600343985, + "grad_norm": 0.8232433199882507, + "learning_rate": 2.6348682974408955e-06, + "loss": 2.6103, + "step": 9275 + }, + { + "epoch": 0.8396849823481488, + "grad_norm": 0.9182198643684387, + "learning_rate": 2.6276366868639325e-06, + "loss": 2.6223, + "step": 9276 + }, + { + "epoch": 0.8397755046618992, + "grad_norm": 0.8715531826019287, + "learning_rate": 2.6204148816812657e-06, + "loss": 2.671, + "step": 9277 + }, + { + "epoch": 0.8398660269756495, + "grad_norm": 0.8018813729286194, + "learning_rate": 2.6132028826201695e-06, + "loss": 2.6102, + "step": 9278 + }, + { + "epoch": 0.8399565492893999, + "grad_norm": 0.8041639924049377, + "learning_rate": 2.6060006904068844e-06, + "loss": 2.5943, + "step": 9279 + }, + { + "epoch": 0.8400470716031502, + "grad_norm": 0.9004766345024109, + "learning_rate": 2.5988083057666533e-06, + "loss": 2.5938, + "step": 9280 + }, + { + "epoch": 0.8401375939169006, + "grad_norm": 0.8807133436203003, + "learning_rate": 2.5916257294237633e-06, + "loss": 2.6039, + "step": 9281 + }, + { + "epoch": 0.8402281162306509, + "grad_norm": 0.8423158526420593, + "learning_rate": 2.5844529621015023e-06, + "loss": 2.5367, + "step": 9282 + }, + { + "epoch": 0.8403186385444011, + "grad_norm": 0.8630332946777344, + "learning_rate": 2.577290004522148e-06, + "loss": 2.5719, + "step": 9283 + }, + { + "epoch": 0.8404091608581515, + "grad_norm": 0.8227906823158264, + "learning_rate": 2.5701368574070463e-06, + "loss": 2.6222, + "step": 9284 + }, + { + "epoch": 0.8404996831719018, + "grad_norm": 0.8309725522994995, + "learning_rate": 2.5629935214764865e-06, + "loss": 2.6203, + "step": 9285 + }, + { + "epoch": 0.8405902054856522, + "grad_norm": 0.8114734888076782, + "learning_rate": 2.5558599974498054e-06, + "loss": 2.5701, + "step": 9286 + }, + { + "epoch": 0.8406807277994025, + "grad_norm": 0.9079375267028809, + "learning_rate": 2.548736286045361e-06, + "loss": 2.5715, + "step": 9287 + }, + { + "epoch": 0.8407712501131529, + "grad_norm": 0.9126922488212585, + "learning_rate": 2.5416223879805134e-06, + "loss": 2.6241, + "step": 9288 + }, + { + "epoch": 0.8408617724269032, + "grad_norm": 0.8237440586090088, + "learning_rate": 2.5345183039716112e-06, + "loss": 2.6061, + "step": 9289 + }, + { + "epoch": 0.8409522947406536, + "grad_norm": 0.8415271043777466, + "learning_rate": 2.5274240347340717e-06, + "loss": 2.5721, + "step": 9290 + }, + { + "epoch": 0.8410428170544039, + "grad_norm": 0.8436958193778992, + "learning_rate": 2.520339580982234e-06, + "loss": 2.6532, + "step": 9291 + }, + { + "epoch": 0.8411333393681543, + "grad_norm": 0.911311686038971, + "learning_rate": 2.5132649434295606e-06, + "loss": 2.6995, + "step": 9292 + }, + { + "epoch": 0.8412238616819046, + "grad_norm": 0.9336774349212646, + "learning_rate": 2.506200122788427e-06, + "loss": 2.6893, + "step": 9293 + }, + { + "epoch": 0.841314383995655, + "grad_norm": 0.8583858013153076, + "learning_rate": 2.499145119770274e-06, + "loss": 2.5763, + "step": 9294 + }, + { + "epoch": 0.8414049063094052, + "grad_norm": 0.8368796706199646, + "learning_rate": 2.4920999350855458e-06, + "loss": 2.5478, + "step": 9295 + }, + { + "epoch": 0.8414954286231556, + "grad_norm": 0.8354606628417969, + "learning_rate": 2.4850645694436736e-06, + "loss": 2.655, + "step": 9296 + }, + { + "epoch": 0.8415859509369059, + "grad_norm": 0.8495997786521912, + "learning_rate": 2.478039023553136e-06, + "loss": 2.6892, + "step": 9297 + }, + { + "epoch": 0.8416764732506563, + "grad_norm": 0.8512294292449951, + "learning_rate": 2.471023298121422e-06, + "loss": 2.6317, + "step": 9298 + }, + { + "epoch": 0.8417669955644066, + "grad_norm": 0.8798067569732666, + "learning_rate": 2.464017393854956e-06, + "loss": 2.6381, + "step": 9299 + }, + { + "epoch": 0.841857517878157, + "grad_norm": 0.8893240690231323, + "learning_rate": 2.4570213114592954e-06, + "loss": 2.625, + "step": 9300 + }, + { + "epoch": 0.8419480401919073, + "grad_norm": 0.7912008166313171, + "learning_rate": 2.45003505163891e-06, + "loss": 2.5689, + "step": 9301 + }, + { + "epoch": 0.8420385625056577, + "grad_norm": 0.850443959236145, + "learning_rate": 2.443058615097338e-06, + "loss": 2.6325, + "step": 9302 + }, + { + "epoch": 0.842129084819408, + "grad_norm": 0.857284665107727, + "learning_rate": 2.4360920025370827e-06, + "loss": 2.679, + "step": 9303 + }, + { + "epoch": 0.8422196071331584, + "grad_norm": 0.8806108832359314, + "learning_rate": 2.4291352146596945e-06, + "loss": 2.6189, + "step": 9304 + }, + { + "epoch": 0.8423101294469086, + "grad_norm": 0.8154246211051941, + "learning_rate": 2.4221882521657136e-06, + "loss": 2.5786, + "step": 9305 + }, + { + "epoch": 0.842400651760659, + "grad_norm": 0.8218053579330444, + "learning_rate": 2.4152511157547244e-06, + "loss": 2.6728, + "step": 9306 + }, + { + "epoch": 0.8424911740744093, + "grad_norm": 0.8055351376533508, + "learning_rate": 2.4083238061252567e-06, + "loss": 2.6068, + "step": 9307 + }, + { + "epoch": 0.8425816963881597, + "grad_norm": 0.9141035676002502, + "learning_rate": 2.401406323974931e-06, + "loss": 2.6306, + "step": 9308 + }, + { + "epoch": 0.84267221870191, + "grad_norm": 0.8625081777572632, + "learning_rate": 2.3944986700003004e-06, + "loss": 2.5936, + "step": 9309 + }, + { + "epoch": 0.8427627410156604, + "grad_norm": 0.8520241975784302, + "learning_rate": 2.3876008448969976e-06, + "loss": 2.5859, + "step": 9310 + }, + { + "epoch": 0.8428532633294107, + "grad_norm": 0.8802344799041748, + "learning_rate": 2.380712849359612e-06, + "loss": 2.5918, + "step": 9311 + }, + { + "epoch": 0.8429437856431611, + "grad_norm": 0.9308227300643921, + "learning_rate": 2.3738346840817773e-06, + "loss": 2.6696, + "step": 9312 + }, + { + "epoch": 0.8430343079569114, + "grad_norm": 0.782535195350647, + "learning_rate": 2.366966349756106e-06, + "loss": 2.6541, + "step": 9313 + }, + { + "epoch": 0.8431248302706618, + "grad_norm": 0.7898374795913696, + "learning_rate": 2.3601078470742664e-06, + "loss": 2.6019, + "step": 9314 + }, + { + "epoch": 0.843215352584412, + "grad_norm": 0.8801725506782532, + "learning_rate": 2.3532591767268853e-06, + "loss": 2.605, + "step": 9315 + }, + { + "epoch": 0.8433058748981624, + "grad_norm": 0.8103786706924438, + "learning_rate": 2.3464203394036322e-06, + "loss": 2.5527, + "step": 9316 + }, + { + "epoch": 0.8433963972119127, + "grad_norm": 0.8946208357810974, + "learning_rate": 2.3395913357931786e-06, + "loss": 2.5804, + "step": 9317 + }, + { + "epoch": 0.8434869195256631, + "grad_norm": 0.8092337846755981, + "learning_rate": 2.332772166583208e-06, + "loss": 2.6068, + "step": 9318 + }, + { + "epoch": 0.8435774418394134, + "grad_norm": 0.8034257292747498, + "learning_rate": 2.3259628324604155e-06, + "loss": 2.6107, + "step": 9319 + }, + { + "epoch": 0.8436679641531638, + "grad_norm": 0.9613232016563416, + "learning_rate": 2.3191633341104856e-06, + "loss": 2.6176, + "step": 9320 + }, + { + "epoch": 0.8437584864669141, + "grad_norm": 0.8717933297157288, + "learning_rate": 2.3123736722181375e-06, + "loss": 2.6105, + "step": 9321 + }, + { + "epoch": 0.8438490087806645, + "grad_norm": 0.8280078172683716, + "learning_rate": 2.3055938474670915e-06, + "loss": 2.5993, + "step": 9322 + }, + { + "epoch": 0.8439395310944148, + "grad_norm": 0.8411900401115417, + "learning_rate": 2.298823860540056e-06, + "loss": 2.6247, + "step": 9323 + }, + { + "epoch": 0.8440300534081651, + "grad_norm": 0.8409061431884766, + "learning_rate": 2.292063712118797e-06, + "loss": 2.5859, + "step": 9324 + }, + { + "epoch": 0.8441205757219155, + "grad_norm": 0.8324682116508484, + "learning_rate": 2.2853134028840594e-06, + "loss": 2.6155, + "step": 9325 + }, + { + "epoch": 0.8442110980356657, + "grad_norm": 0.8437857627868652, + "learning_rate": 2.2785729335155657e-06, + "loss": 2.6489, + "step": 9326 + }, + { + "epoch": 0.8443016203494161, + "grad_norm": 0.8383326530456543, + "learning_rate": 2.271842304692118e-06, + "loss": 2.6049, + "step": 9327 + }, + { + "epoch": 0.8443921426631664, + "grad_norm": 0.8164210915565491, + "learning_rate": 2.265121517091473e-06, + "loss": 2.6312, + "step": 9328 + }, + { + "epoch": 0.8444826649769168, + "grad_norm": 0.865288496017456, + "learning_rate": 2.2584105713904125e-06, + "loss": 2.6321, + "step": 9329 + }, + { + "epoch": 0.8445731872906671, + "grad_norm": 0.8106584548950195, + "learning_rate": 2.2517094682647397e-06, + "loss": 2.6074, + "step": 9330 + }, + { + "epoch": 0.8446637096044175, + "grad_norm": 0.8246323466300964, + "learning_rate": 2.245018208389249e-06, + "loss": 2.5897, + "step": 9331 + }, + { + "epoch": 0.8447542319181678, + "grad_norm": 0.8730551600456238, + "learning_rate": 2.2383367924377452e-06, + "loss": 2.6694, + "step": 9332 + }, + { + "epoch": 0.8448447542319182, + "grad_norm": 0.8684254288673401, + "learning_rate": 2.2316652210830568e-06, + "loss": 2.5865, + "step": 9333 + }, + { + "epoch": 0.8449352765456685, + "grad_norm": 0.860093891620636, + "learning_rate": 2.2250034949969913e-06, + "loss": 2.6367, + "step": 9334 + }, + { + "epoch": 0.8450257988594189, + "grad_norm": 0.878279447555542, + "learning_rate": 2.2183516148504226e-06, + "loss": 2.6596, + "step": 9335 + }, + { + "epoch": 0.8451163211731691, + "grad_norm": 0.864924967288971, + "learning_rate": 2.2117095813131484e-06, + "loss": 2.6327, + "step": 9336 + }, + { + "epoch": 0.8452068434869195, + "grad_norm": 0.8816613554954529, + "learning_rate": 2.205077395054056e-06, + "loss": 2.6484, + "step": 9337 + }, + { + "epoch": 0.8452973658006698, + "grad_norm": 0.8233687877655029, + "learning_rate": 2.198455056740989e-06, + "loss": 2.5487, + "step": 9338 + }, + { + "epoch": 0.8453878881144202, + "grad_norm": 0.9007396697998047, + "learning_rate": 2.1918425670408247e-06, + "loss": 2.6456, + "step": 9339 + }, + { + "epoch": 0.8454784104281705, + "grad_norm": 0.8072501420974731, + "learning_rate": 2.1852399266194314e-06, + "loss": 2.5781, + "step": 9340 + }, + { + "epoch": 0.8455689327419209, + "grad_norm": 0.879115104675293, + "learning_rate": 2.1786471361417206e-06, + "loss": 2.5904, + "step": 9341 + }, + { + "epoch": 0.8456594550556712, + "grad_norm": 0.9695716500282288, + "learning_rate": 2.1720641962715393e-06, + "loss": 2.6004, + "step": 9342 + }, + { + "epoch": 0.8457499773694216, + "grad_norm": 0.8915279507637024, + "learning_rate": 2.1654911076718355e-06, + "loss": 2.5751, + "step": 9343 + }, + { + "epoch": 0.8458404996831719, + "grad_norm": 0.8442407846450806, + "learning_rate": 2.1589278710045013e-06, + "loss": 2.6255, + "step": 9344 + }, + { + "epoch": 0.8459310219969223, + "grad_norm": 0.8827381134033203, + "learning_rate": 2.152374486930442e-06, + "loss": 2.6682, + "step": 9345 + }, + { + "epoch": 0.8460215443106726, + "grad_norm": 0.8156737089157104, + "learning_rate": 2.145830956109596e-06, + "loss": 2.6366, + "step": 9346 + }, + { + "epoch": 0.846112066624423, + "grad_norm": 0.7898461222648621, + "learning_rate": 2.1392972792009026e-06, + "loss": 2.5952, + "step": 9347 + }, + { + "epoch": 0.8462025889381732, + "grad_norm": 0.8535096645355225, + "learning_rate": 2.1327734568622914e-06, + "loss": 2.6272, + "step": 9348 + }, + { + "epoch": 0.8462931112519236, + "grad_norm": 0.8594303131103516, + "learning_rate": 2.126259489750715e-06, + "loss": 2.6145, + "step": 9349 + }, + { + "epoch": 0.8463836335656739, + "grad_norm": 0.8821359276771545, + "learning_rate": 2.119755378522137e-06, + "loss": 2.6136, + "step": 9350 + }, + { + "epoch": 0.8464741558794243, + "grad_norm": 0.817482590675354, + "learning_rate": 2.1132611238315003e-06, + "loss": 2.6069, + "step": 9351 + }, + { + "epoch": 0.8465646781931746, + "grad_norm": 0.8469284772872925, + "learning_rate": 2.1067767263327933e-06, + "loss": 2.6322, + "step": 9352 + }, + { + "epoch": 0.846655200506925, + "grad_norm": 0.7845486998558044, + "learning_rate": 2.100302186679004e-06, + "loss": 2.6096, + "step": 9353 + }, + { + "epoch": 0.8467457228206753, + "grad_norm": 0.7727720737457275, + "learning_rate": 2.0938375055220893e-06, + "loss": 2.6145, + "step": 9354 + }, + { + "epoch": 0.8468362451344257, + "grad_norm": 0.7955880165100098, + "learning_rate": 2.087382683513073e-06, + "loss": 2.6062, + "step": 9355 + }, + { + "epoch": 0.846926767448176, + "grad_norm": 0.9472482204437256, + "learning_rate": 2.0809377213019234e-06, + "loss": 2.6432, + "step": 9356 + }, + { + "epoch": 0.8470172897619264, + "grad_norm": 0.8384003639221191, + "learning_rate": 2.074502619537688e-06, + "loss": 2.6339, + "step": 9357 + }, + { + "epoch": 0.8471078120756766, + "grad_norm": 0.9159187078475952, + "learning_rate": 2.0680773788683494e-06, + "loss": 2.5945, + "step": 9358 + }, + { + "epoch": 0.847198334389427, + "grad_norm": 0.8477405309677124, + "learning_rate": 2.0616619999409337e-06, + "loss": 2.6826, + "step": 9359 + }, + { + "epoch": 0.8472888567031773, + "grad_norm": 0.8061385750770569, + "learning_rate": 2.05525648340148e-06, + "loss": 2.5887, + "step": 9360 + }, + { + "epoch": 0.8473793790169277, + "grad_norm": 0.7954801917076111, + "learning_rate": 2.048860829895016e-06, + "loss": 2.6384, + "step": 9361 + }, + { + "epoch": 0.847469901330678, + "grad_norm": 0.8151822090148926, + "learning_rate": 2.0424750400655947e-06, + "loss": 2.6229, + "step": 9362 + }, + { + "epoch": 0.8475604236444284, + "grad_norm": 0.8000811338424683, + "learning_rate": 2.0360991145562557e-06, + "loss": 2.5853, + "step": 9363 + }, + { + "epoch": 0.8476509459581787, + "grad_norm": 0.8454170823097229, + "learning_rate": 2.029733054009042e-06, + "loss": 2.6348, + "step": 9364 + }, + { + "epoch": 0.847741468271929, + "grad_norm": 0.8597866892814636, + "learning_rate": 2.0233768590650403e-06, + "loss": 2.5944, + "step": 9365 + }, + { + "epoch": 0.8478319905856794, + "grad_norm": 0.8464025259017944, + "learning_rate": 2.0170305303643054e-06, + "loss": 2.5851, + "step": 9366 + }, + { + "epoch": 0.8479225128994297, + "grad_norm": 0.8239722847938538, + "learning_rate": 2.0106940685459154e-06, + "loss": 2.6316, + "step": 9367 + }, + { + "epoch": 0.84801303521318, + "grad_norm": 0.8130668997764587, + "learning_rate": 2.004367474247948e-06, + "loss": 2.615, + "step": 9368 + }, + { + "epoch": 0.8481035575269303, + "grad_norm": 0.8077114820480347, + "learning_rate": 1.998050748107505e-06, + "loss": 2.6375, + "step": 9369 + }, + { + "epoch": 0.8481940798406807, + "grad_norm": 0.8757891058921814, + "learning_rate": 1.9917438907606556e-06, + "loss": 2.5772, + "step": 9370 + }, + { + "epoch": 0.848284602154431, + "grad_norm": 0.8093799948692322, + "learning_rate": 1.985446902842536e-06, + "loss": 2.6391, + "step": 9371 + }, + { + "epoch": 0.8483751244681814, + "grad_norm": 0.8270487785339355, + "learning_rate": 1.9791597849872057e-06, + "loss": 2.5764, + "step": 9372 + }, + { + "epoch": 0.8484656467819317, + "grad_norm": 0.9486851692199707, + "learning_rate": 1.9728825378278246e-06, + "loss": 2.6505, + "step": 9373 + }, + { + "epoch": 0.8485561690956821, + "grad_norm": 0.9480065107345581, + "learning_rate": 1.966615161996477e-06, + "loss": 2.5907, + "step": 9374 + }, + { + "epoch": 0.8486466914094324, + "grad_norm": 0.8524114489555359, + "learning_rate": 1.960357658124301e-06, + "loss": 2.6317, + "step": 9375 + }, + { + "epoch": 0.8487372137231828, + "grad_norm": 0.908987820148468, + "learning_rate": 1.954110026841427e-06, + "loss": 2.5713, + "step": 9376 + }, + { + "epoch": 0.8488277360369331, + "grad_norm": 0.8268985152244568, + "learning_rate": 1.947872268776996e-06, + "loss": 2.6577, + "step": 9377 + }, + { + "epoch": 0.8489182583506835, + "grad_norm": 0.7865447998046875, + "learning_rate": 1.941644384559138e-06, + "loss": 2.5872, + "step": 9378 + }, + { + "epoch": 0.8490087806644337, + "grad_norm": 0.8748701810836792, + "learning_rate": 1.935426374815008e-06, + "loss": 2.6353, + "step": 9379 + }, + { + "epoch": 0.8490993029781841, + "grad_norm": 0.8506055474281311, + "learning_rate": 1.9292182401707603e-06, + "loss": 2.6137, + "step": 9380 + }, + { + "epoch": 0.8491898252919344, + "grad_norm": 0.7830395698547363, + "learning_rate": 1.9230199812515616e-06, + "loss": 2.5969, + "step": 9381 + }, + { + "epoch": 0.8492803476056848, + "grad_norm": 0.7881369590759277, + "learning_rate": 1.9168315986815567e-06, + "loss": 2.6085, + "step": 9382 + }, + { + "epoch": 0.8493708699194351, + "grad_norm": 0.891779363155365, + "learning_rate": 1.910653093083925e-06, + "loss": 2.6918, + "step": 9383 + }, + { + "epoch": 0.8494613922331855, + "grad_norm": 0.819579541683197, + "learning_rate": 1.904484465080847e-06, + "loss": 2.5821, + "step": 9384 + }, + { + "epoch": 0.8495519145469358, + "grad_norm": 0.8453068137168884, + "learning_rate": 1.898325715293503e-06, + "loss": 2.5648, + "step": 9385 + }, + { + "epoch": 0.8496424368606862, + "grad_norm": 0.8719899654388428, + "learning_rate": 1.8921768443420528e-06, + "loss": 2.6316, + "step": 9386 + }, + { + "epoch": 0.8497329591744365, + "grad_norm": 0.9381957054138184, + "learning_rate": 1.8860378528457456e-06, + "loss": 2.6796, + "step": 9387 + }, + { + "epoch": 0.8498234814881869, + "grad_norm": 0.8671666979789734, + "learning_rate": 1.87990874142272e-06, + "loss": 2.6189, + "step": 9388 + }, + { + "epoch": 0.8499140038019372, + "grad_norm": 0.8676027655601501, + "learning_rate": 1.8737895106902048e-06, + "loss": 2.5648, + "step": 9389 + }, + { + "epoch": 0.8500045261156876, + "grad_norm": 0.8236902356147766, + "learning_rate": 1.8676801612643957e-06, + "loss": 2.5913, + "step": 9390 + }, + { + "epoch": 0.8500950484294378, + "grad_norm": 0.769631028175354, + "learning_rate": 1.8615806937605118e-06, + "loss": 2.6084, + "step": 9391 + }, + { + "epoch": 0.8501855707431882, + "grad_norm": 0.8675589561462402, + "learning_rate": 1.8554911087927618e-06, + "loss": 2.6249, + "step": 9392 + }, + { + "epoch": 0.8502760930569385, + "grad_norm": 0.8161866664886475, + "learning_rate": 1.8494114069743885e-06, + "loss": 2.6269, + "step": 9393 + }, + { + "epoch": 0.8503666153706889, + "grad_norm": 0.8564107418060303, + "learning_rate": 1.8433415889175799e-06, + "loss": 2.6545, + "step": 9394 + }, + { + "epoch": 0.8504571376844392, + "grad_norm": 0.8710647225379944, + "learning_rate": 1.8372816552336026e-06, + "loss": 2.6265, + "step": 9395 + }, + { + "epoch": 0.8505476599981896, + "grad_norm": 0.8734665513038635, + "learning_rate": 1.8312316065326796e-06, + "loss": 2.6363, + "step": 9396 + }, + { + "epoch": 0.8506381823119399, + "grad_norm": 0.8522061109542847, + "learning_rate": 1.8251914434240347e-06, + "loss": 2.5858, + "step": 9397 + }, + { + "epoch": 0.8507287046256903, + "grad_norm": 0.838045060634613, + "learning_rate": 1.819161166515937e-06, + "loss": 2.6064, + "step": 9398 + }, + { + "epoch": 0.8508192269394406, + "grad_norm": 0.8066017031669617, + "learning_rate": 1.8131407764156117e-06, + "loss": 2.6138, + "step": 9399 + }, + { + "epoch": 0.850909749253191, + "grad_norm": 0.8111808896064758, + "learning_rate": 1.8071302737293295e-06, + "loss": 2.5855, + "step": 9400 + }, + { + "epoch": 0.8510002715669412, + "grad_norm": 0.8490340709686279, + "learning_rate": 1.8011296590623506e-06, + "loss": 2.5868, + "step": 9401 + }, + { + "epoch": 0.8510907938806916, + "grad_norm": 0.8828288316726685, + "learning_rate": 1.7951389330189029e-06, + "loss": 2.6405, + "step": 9402 + }, + { + "epoch": 0.8511813161944419, + "grad_norm": 0.918430507183075, + "learning_rate": 1.7891580962023036e-06, + "loss": 2.6249, + "step": 9403 + }, + { + "epoch": 0.8512718385081923, + "grad_norm": 0.7756317257881165, + "learning_rate": 1.7831871492147934e-06, + "loss": 2.6389, + "step": 9404 + }, + { + "epoch": 0.8513623608219426, + "grad_norm": 0.9137489795684814, + "learning_rate": 1.7772260926576356e-06, + "loss": 2.6336, + "step": 9405 + }, + { + "epoch": 0.8514528831356929, + "grad_norm": 0.827333390712738, + "learning_rate": 1.771274927131139e-06, + "loss": 2.5928, + "step": 9406 + }, + { + "epoch": 0.8515434054494433, + "grad_norm": 0.8530473113059998, + "learning_rate": 1.7653336532345577e-06, + "loss": 2.6192, + "step": 9407 + }, + { + "epoch": 0.8516339277631936, + "grad_norm": 0.8667789101600647, + "learning_rate": 1.7594022715661906e-06, + "loss": 2.6038, + "step": 9408 + }, + { + "epoch": 0.851724450076944, + "grad_norm": 0.8074167370796204, + "learning_rate": 1.753480782723338e-06, + "loss": 2.6365, + "step": 9409 + }, + { + "epoch": 0.8518149723906943, + "grad_norm": 0.8859494924545288, + "learning_rate": 1.747569187302267e-06, + "loss": 2.6868, + "step": 9410 + }, + { + "epoch": 0.8519054947044447, + "grad_norm": 0.8398774862289429, + "learning_rate": 1.7416674858983018e-06, + "loss": 2.7059, + "step": 9411 + }, + { + "epoch": 0.8519960170181949, + "grad_norm": 0.9021377563476562, + "learning_rate": 1.7357756791057334e-06, + "loss": 2.6566, + "step": 9412 + }, + { + "epoch": 0.8520865393319453, + "grad_norm": 0.8221820592880249, + "learning_rate": 1.7298937675178428e-06, + "loss": 2.6093, + "step": 9413 + }, + { + "epoch": 0.8521770616456956, + "grad_norm": 0.9722798466682434, + "learning_rate": 1.7240217517269897e-06, + "loss": 2.6304, + "step": 9414 + }, + { + "epoch": 0.852267583959446, + "grad_norm": 0.820933997631073, + "learning_rate": 1.7181596323244454e-06, + "loss": 2.6182, + "step": 9415 + }, + { + "epoch": 0.8523581062731963, + "grad_norm": 0.914440393447876, + "learning_rate": 1.7123074099005377e-06, + "loss": 2.6131, + "step": 9416 + }, + { + "epoch": 0.8524486285869467, + "grad_norm": 0.9035472273826599, + "learning_rate": 1.706465085044584e-06, + "loss": 2.5809, + "step": 9417 + }, + { + "epoch": 0.852539150900697, + "grad_norm": 0.9354585409164429, + "learning_rate": 1.7006326583449029e-06, + "loss": 2.6365, + "step": 9418 + }, + { + "epoch": 0.8526296732144474, + "grad_norm": 0.8287830948829651, + "learning_rate": 1.6948101303888243e-06, + "loss": 2.6085, + "step": 9419 + }, + { + "epoch": 0.8527201955281977, + "grad_norm": 0.8384172320365906, + "learning_rate": 1.6889975017626903e-06, + "loss": 2.6083, + "step": 9420 + }, + { + "epoch": 0.8528107178419481, + "grad_norm": 0.817358136177063, + "learning_rate": 1.6831947730517883e-06, + "loss": 2.6818, + "step": 9421 + }, + { + "epoch": 0.8529012401556983, + "grad_norm": 0.8479032516479492, + "learning_rate": 1.6774019448405064e-06, + "loss": 2.5817, + "step": 9422 + }, + { + "epoch": 0.8529917624694487, + "grad_norm": 0.8063598275184631, + "learning_rate": 1.671619017712156e-06, + "loss": 2.5778, + "step": 9423 + }, + { + "epoch": 0.853082284783199, + "grad_norm": 0.840354323387146, + "learning_rate": 1.665845992249071e-06, + "loss": 2.5911, + "step": 9424 + }, + { + "epoch": 0.8531728070969494, + "grad_norm": 0.8489820957183838, + "learning_rate": 1.6600828690326087e-06, + "loss": 2.6492, + "step": 9425 + }, + { + "epoch": 0.8532633294106997, + "grad_norm": 0.8331730961799622, + "learning_rate": 1.6543296486431159e-06, + "loss": 2.6017, + "step": 9426 + }, + { + "epoch": 0.8533538517244501, + "grad_norm": 0.8832200169563293, + "learning_rate": 1.6485863316599294e-06, + "loss": 2.5786, + "step": 9427 + }, + { + "epoch": 0.8534443740382004, + "grad_norm": 0.8762169480323792, + "learning_rate": 1.6428529186614195e-06, + "loss": 2.6089, + "step": 9428 + }, + { + "epoch": 0.8535348963519508, + "grad_norm": 0.8387589454650879, + "learning_rate": 1.6371294102249134e-06, + "loss": 2.6026, + "step": 9429 + }, + { + "epoch": 0.8536254186657011, + "grad_norm": 0.8397504687309265, + "learning_rate": 1.6314158069267948e-06, + "loss": 2.5771, + "step": 9430 + }, + { + "epoch": 0.8537159409794515, + "grad_norm": 0.8257327675819397, + "learning_rate": 1.6257121093424033e-06, + "loss": 2.5911, + "step": 9431 + }, + { + "epoch": 0.8538064632932018, + "grad_norm": 0.836053729057312, + "learning_rate": 1.6200183180461236e-06, + "loss": 2.6332, + "step": 9432 + }, + { + "epoch": 0.8538969856069522, + "grad_norm": 0.8341732621192932, + "learning_rate": 1.6143344336112976e-06, + "loss": 2.5628, + "step": 9433 + }, + { + "epoch": 0.8539875079207024, + "grad_norm": 0.8637101054191589, + "learning_rate": 1.6086604566103002e-06, + "loss": 2.5738, + "step": 9434 + }, + { + "epoch": 0.8540780302344528, + "grad_norm": 0.9812245965003967, + "learning_rate": 1.6029963876145083e-06, + "loss": 2.6578, + "step": 9435 + }, + { + "epoch": 0.8541685525482031, + "grad_norm": 0.8124945163726807, + "learning_rate": 1.5973422271942985e-06, + "loss": 2.6427, + "step": 9436 + }, + { + "epoch": 0.8542590748619535, + "grad_norm": 0.8565968871116638, + "learning_rate": 1.5916979759190264e-06, + "loss": 2.6265, + "step": 9437 + }, + { + "epoch": 0.8543495971757038, + "grad_norm": 0.8052038550376892, + "learning_rate": 1.5860636343570711e-06, + "loss": 2.6221, + "step": 9438 + }, + { + "epoch": 0.8544401194894542, + "grad_norm": 0.9391239881515503, + "learning_rate": 1.580439203075812e-06, + "loss": 2.64, + "step": 9439 + }, + { + "epoch": 0.8545306418032045, + "grad_norm": 0.8188319206237793, + "learning_rate": 1.574824682641629e-06, + "loss": 2.5393, + "step": 9440 + }, + { + "epoch": 0.8546211641169549, + "grad_norm": 0.8291501402854919, + "learning_rate": 1.5692200736199148e-06, + "loss": 2.6161, + "step": 9441 + }, + { + "epoch": 0.8547116864307052, + "grad_norm": 0.9298816919326782, + "learning_rate": 1.5636253765750508e-06, + "loss": 2.6896, + "step": 9442 + }, + { + "epoch": 0.8548022087444556, + "grad_norm": 0.8298754692077637, + "learning_rate": 1.5580405920704088e-06, + "loss": 2.6438, + "step": 9443 + }, + { + "epoch": 0.8548927310582058, + "grad_norm": 0.9584684371948242, + "learning_rate": 1.5524657206683947e-06, + "loss": 2.6297, + "step": 9444 + }, + { + "epoch": 0.8549832533719562, + "grad_norm": 0.8696407675743103, + "learning_rate": 1.5469007629303812e-06, + "loss": 2.6289, + "step": 9445 + }, + { + "epoch": 0.8550737756857065, + "grad_norm": 0.8324605822563171, + "learning_rate": 1.5413457194167757e-06, + "loss": 2.5645, + "step": 9446 + }, + { + "epoch": 0.8551642979994568, + "grad_norm": 0.8564126491546631, + "learning_rate": 1.5358005906869532e-06, + "loss": 2.6157, + "step": 9447 + }, + { + "epoch": 0.8552548203132072, + "grad_norm": 0.8332357406616211, + "learning_rate": 1.5302653772993225e-06, + "loss": 2.5958, + "step": 9448 + }, + { + "epoch": 0.8553453426269575, + "grad_norm": 0.8862999677658081, + "learning_rate": 1.5247400798112598e-06, + "loss": 2.5999, + "step": 9449 + }, + { + "epoch": 0.8554358649407079, + "grad_norm": 0.8509512543678284, + "learning_rate": 1.5192246987791981e-06, + "loss": 2.6174, + "step": 9450 + }, + { + "epoch": 0.8555263872544582, + "grad_norm": 0.7944133877754211, + "learning_rate": 1.513719234758504e-06, + "loss": 2.564, + "step": 9451 + }, + { + "epoch": 0.8556169095682086, + "grad_norm": 0.8499475121498108, + "learning_rate": 1.5082236883035895e-06, + "loss": 2.6325, + "step": 9452 + }, + { + "epoch": 0.8557074318819589, + "grad_norm": 0.8515414595603943, + "learning_rate": 1.5027380599678564e-06, + "loss": 2.6154, + "step": 9453 + }, + { + "epoch": 0.8557979541957093, + "grad_norm": 0.9373891949653625, + "learning_rate": 1.4972623503036965e-06, + "loss": 2.6027, + "step": 9454 + }, + { + "epoch": 0.8558884765094595, + "grad_norm": 0.8476731777191162, + "learning_rate": 1.4917965598625349e-06, + "loss": 2.6462, + "step": 9455 + }, + { + "epoch": 0.8559789988232099, + "grad_norm": 0.9011625051498413, + "learning_rate": 1.486340689194754e-06, + "loss": 2.6312, + "step": 9456 + }, + { + "epoch": 0.8560695211369602, + "grad_norm": 0.8020340800285339, + "learning_rate": 1.4808947388497808e-06, + "loss": 2.5642, + "step": 9457 + }, + { + "epoch": 0.8561600434507106, + "grad_norm": 0.8751192092895508, + "learning_rate": 1.4754587093760097e-06, + "loss": 2.6344, + "step": 9458 + }, + { + "epoch": 0.8562505657644609, + "grad_norm": 0.8079261183738708, + "learning_rate": 1.4700326013208365e-06, + "loss": 2.6598, + "step": 9459 + }, + { + "epoch": 0.8563410880782113, + "grad_norm": 0.8567526340484619, + "learning_rate": 1.4646164152307018e-06, + "loss": 2.6714, + "step": 9460 + }, + { + "epoch": 0.8564316103919616, + "grad_norm": 0.8911241888999939, + "learning_rate": 1.4592101516509914e-06, + "loss": 2.6807, + "step": 9461 + }, + { + "epoch": 0.856522132705712, + "grad_norm": 0.8518723249435425, + "learning_rate": 1.4538138111261257e-06, + "loss": 2.6163, + "step": 9462 + }, + { + "epoch": 0.8566126550194623, + "grad_norm": 0.8393850922584534, + "learning_rate": 1.448427394199503e-06, + "loss": 2.6404, + "step": 9463 + }, + { + "epoch": 0.8567031773332127, + "grad_norm": 0.8219265341758728, + "learning_rate": 1.4430509014135563e-06, + "loss": 2.6566, + "step": 9464 + }, + { + "epoch": 0.856793699646963, + "grad_norm": 0.8629975914955139, + "learning_rate": 1.4376843333096747e-06, + "loss": 2.6258, + "step": 9465 + }, + { + "epoch": 0.8568842219607133, + "grad_norm": 0.8664360046386719, + "learning_rate": 1.4323276904283034e-06, + "loss": 2.5728, + "step": 9466 + }, + { + "epoch": 0.8569747442744636, + "grad_norm": 0.7899994254112244, + "learning_rate": 1.4269809733088112e-06, + "loss": 2.6007, + "step": 9467 + }, + { + "epoch": 0.857065266588214, + "grad_norm": 0.8480337858200073, + "learning_rate": 1.4216441824896565e-06, + "loss": 2.6015, + "step": 9468 + }, + { + "epoch": 0.8571557889019643, + "grad_norm": 0.8864121437072754, + "learning_rate": 1.4163173185082312e-06, + "loss": 2.5481, + "step": 9469 + }, + { + "epoch": 0.8572463112157147, + "grad_norm": 0.8953933119773865, + "learning_rate": 1.411000381900951e-06, + "loss": 2.6592, + "step": 9470 + }, + { + "epoch": 0.857336833529465, + "grad_norm": 0.8673043251037598, + "learning_rate": 1.405693373203243e-06, + "loss": 2.6351, + "step": 9471 + }, + { + "epoch": 0.8574273558432154, + "grad_norm": 0.8806537389755249, + "learning_rate": 1.400396292949513e-06, + "loss": 2.6091, + "step": 9472 + }, + { + "epoch": 0.8575178781569657, + "grad_norm": 0.8178626298904419, + "learning_rate": 1.3951091416731676e-06, + "loss": 2.6218, + "step": 9473 + }, + { + "epoch": 0.8576084004707161, + "grad_norm": 0.8824848532676697, + "learning_rate": 1.3898319199066478e-06, + "loss": 2.6469, + "step": 9474 + }, + { + "epoch": 0.8576989227844664, + "grad_norm": 0.8050047755241394, + "learning_rate": 1.3845646281813507e-06, + "loss": 2.636, + "step": 9475 + }, + { + "epoch": 0.8577894450982168, + "grad_norm": 0.936113178730011, + "learning_rate": 1.3793072670276851e-06, + "loss": 2.5698, + "step": 9476 + }, + { + "epoch": 0.857879967411967, + "grad_norm": 0.8579187393188477, + "learning_rate": 1.3740598369750945e-06, + "loss": 2.6198, + "step": 9477 + }, + { + "epoch": 0.8579704897257174, + "grad_norm": 0.8076322674751282, + "learning_rate": 1.3688223385519672e-06, + "loss": 2.6443, + "step": 9478 + }, + { + "epoch": 0.8580610120394677, + "grad_norm": 0.8612182140350342, + "learning_rate": 1.3635947722857367e-06, + "loss": 2.6371, + "step": 9479 + }, + { + "epoch": 0.8581515343532181, + "grad_norm": 0.8125132918357849, + "learning_rate": 1.3583771387028265e-06, + "loss": 2.5982, + "step": 9480 + }, + { + "epoch": 0.8582420566669684, + "grad_norm": 0.824117124080658, + "learning_rate": 1.3531694383286165e-06, + "loss": 2.572, + "step": 9481 + }, + { + "epoch": 0.8583325789807188, + "grad_norm": 0.8419950008392334, + "learning_rate": 1.3479716716875645e-06, + "loss": 2.5942, + "step": 9482 + }, + { + "epoch": 0.8584231012944691, + "grad_norm": 0.8302912712097168, + "learning_rate": 1.3427838393030633e-06, + "loss": 2.5714, + "step": 9483 + }, + { + "epoch": 0.8585136236082195, + "grad_norm": 0.8524641990661621, + "learning_rate": 1.3376059416975172e-06, + "loss": 2.6421, + "step": 9484 + }, + { + "epoch": 0.8586041459219698, + "grad_norm": 0.8621329069137573, + "learning_rate": 1.3324379793923647e-06, + "loss": 2.5743, + "step": 9485 + }, + { + "epoch": 0.8586946682357202, + "grad_norm": 0.9065933227539062, + "learning_rate": 1.3272799529080004e-06, + "loss": 2.6904, + "step": 9486 + }, + { + "epoch": 0.8587851905494704, + "grad_norm": 0.8736260533332825, + "learning_rate": 1.3221318627638534e-06, + "loss": 2.5828, + "step": 9487 + }, + { + "epoch": 0.8588757128632207, + "grad_norm": 0.8250309824943542, + "learning_rate": 1.3169937094783314e-06, + "loss": 2.5656, + "step": 9488 + }, + { + "epoch": 0.8589662351769711, + "grad_norm": 0.9094919562339783, + "learning_rate": 1.3118654935688313e-06, + "loss": 2.6435, + "step": 9489 + }, + { + "epoch": 0.8590567574907214, + "grad_norm": 0.857205331325531, + "learning_rate": 1.3067472155517735e-06, + "loss": 2.5497, + "step": 9490 + }, + { + "epoch": 0.8591472798044718, + "grad_norm": 0.844194769859314, + "learning_rate": 1.301638875942579e-06, + "loss": 2.6107, + "step": 9491 + }, + { + "epoch": 0.8592378021182221, + "grad_norm": 0.8576357960700989, + "learning_rate": 1.2965404752556476e-06, + "loss": 2.5993, + "step": 9492 + }, + { + "epoch": 0.8593283244319725, + "grad_norm": 0.9494067430496216, + "learning_rate": 1.2914520140043906e-06, + "loss": 2.6781, + "step": 9493 + }, + { + "epoch": 0.8594188467457228, + "grad_norm": 0.8041919469833374, + "learning_rate": 1.2863734927012095e-06, + "loss": 2.5825, + "step": 9494 + }, + { + "epoch": 0.8595093690594732, + "grad_norm": 0.8312264680862427, + "learning_rate": 1.2813049118575282e-06, + "loss": 2.5765, + "step": 9495 + }, + { + "epoch": 0.8595998913732235, + "grad_norm": 0.9145568013191223, + "learning_rate": 1.2762462719837275e-06, + "loss": 2.7119, + "step": 9496 + }, + { + "epoch": 0.8596904136869739, + "grad_norm": 0.8223174810409546, + "learning_rate": 1.2711975735892334e-06, + "loss": 2.5868, + "step": 9497 + }, + { + "epoch": 0.8597809360007241, + "grad_norm": 0.7928852438926697, + "learning_rate": 1.266158817182439e-06, + "loss": 2.5961, + "step": 9498 + }, + { + "epoch": 0.8598714583144745, + "grad_norm": 0.8287642598152161, + "learning_rate": 1.2611300032707496e-06, + "loss": 2.6338, + "step": 9499 + }, + { + "epoch": 0.8599619806282248, + "grad_norm": 0.8964071869850159, + "learning_rate": 1.2561111323605712e-06, + "loss": 2.5713, + "step": 9500 + }, + { + "epoch": 0.8600525029419752, + "grad_norm": 0.7847150564193726, + "learning_rate": 1.2511022049572996e-06, + "loss": 2.6324, + "step": 9501 + }, + { + "epoch": 0.8601430252557255, + "grad_norm": 0.8039739727973938, + "learning_rate": 1.2461032215653311e-06, + "loss": 2.5915, + "step": 9502 + }, + { + "epoch": 0.8602335475694759, + "grad_norm": 0.8426544666290283, + "learning_rate": 1.2411141826880636e-06, + "loss": 2.6034, + "step": 9503 + }, + { + "epoch": 0.8603240698832262, + "grad_norm": 0.8430858254432678, + "learning_rate": 1.2361350888279056e-06, + "loss": 2.543, + "step": 9504 + }, + { + "epoch": 0.8604145921969766, + "grad_norm": 0.8529922366142273, + "learning_rate": 1.231165940486234e-06, + "loss": 2.6163, + "step": 9505 + }, + { + "epoch": 0.8605051145107269, + "grad_norm": 0.8032079339027405, + "learning_rate": 1.226206738163449e-06, + "loss": 2.5339, + "step": 9506 + }, + { + "epoch": 0.8605956368244773, + "grad_norm": 0.9089851379394531, + "learning_rate": 1.2212574823589507e-06, + "loss": 2.6063, + "step": 9507 + }, + { + "epoch": 0.8606861591382275, + "grad_norm": 0.7995228171348572, + "learning_rate": 1.2163181735711072e-06, + "loss": 2.6312, + "step": 9508 + }, + { + "epoch": 0.8607766814519779, + "grad_norm": 0.8813963532447815, + "learning_rate": 1.2113888122973317e-06, + "loss": 2.6057, + "step": 9509 + }, + { + "epoch": 0.8608672037657282, + "grad_norm": 0.8219525814056396, + "learning_rate": 1.2064693990339936e-06, + "loss": 2.6038, + "step": 9510 + }, + { + "epoch": 0.8609577260794786, + "grad_norm": 0.8565325140953064, + "learning_rate": 1.2015599342764862e-06, + "loss": 2.623, + "step": 9511 + }, + { + "epoch": 0.8610482483932289, + "grad_norm": 0.8099278211593628, + "learning_rate": 1.196660418519191e-06, + "loss": 2.6101, + "step": 9512 + }, + { + "epoch": 0.8611387707069793, + "grad_norm": 0.9025592803955078, + "learning_rate": 1.1917708522554915e-06, + "loss": 2.6319, + "step": 9513 + }, + { + "epoch": 0.8612292930207296, + "grad_norm": 0.7932490706443787, + "learning_rate": 1.1868912359777607e-06, + "loss": 2.6129, + "step": 9514 + }, + { + "epoch": 0.86131981533448, + "grad_norm": 0.8897316455841064, + "learning_rate": 1.1820215701773829e-06, + "loss": 2.6638, + "step": 9515 + }, + { + "epoch": 0.8614103376482303, + "grad_norm": 0.8247643709182739, + "learning_rate": 1.1771618553447216e-06, + "loss": 2.6295, + "step": 9516 + }, + { + "epoch": 0.8615008599619807, + "grad_norm": 0.8464657068252563, + "learning_rate": 1.1723120919691634e-06, + "loss": 2.631, + "step": 9517 + }, + { + "epoch": 0.861591382275731, + "grad_norm": 0.8722254037857056, + "learning_rate": 1.1674722805390726e-06, + "loss": 2.6014, + "step": 9518 + }, + { + "epoch": 0.8616819045894814, + "grad_norm": 0.9596865177154541, + "learning_rate": 1.1626424215418151e-06, + "loss": 2.6208, + "step": 9519 + }, + { + "epoch": 0.8617724269032316, + "grad_norm": 0.816199779510498, + "learning_rate": 1.157822515463758e-06, + "loss": 2.5962, + "step": 9520 + }, + { + "epoch": 0.861862949216982, + "grad_norm": 0.8612105250358582, + "learning_rate": 1.1530125627902788e-06, + "loss": 2.6487, + "step": 9521 + }, + { + "epoch": 0.8619534715307323, + "grad_norm": 0.9207258820533752, + "learning_rate": 1.148212564005724e-06, + "loss": 2.6646, + "step": 9522 + }, + { + "epoch": 0.8620439938444827, + "grad_norm": 0.8424425721168518, + "learning_rate": 1.1434225195934622e-06, + "loss": 2.6143, + "step": 9523 + }, + { + "epoch": 0.862134516158233, + "grad_norm": 0.8309099078178406, + "learning_rate": 1.138642430035841e-06, + "loss": 2.6148, + "step": 9524 + }, + { + "epoch": 0.8622250384719834, + "grad_norm": 0.839776873588562, + "learning_rate": 1.1338722958142312e-06, + "loss": 2.6355, + "step": 9525 + }, + { + "epoch": 0.8623155607857337, + "grad_norm": 0.8244553208351135, + "learning_rate": 1.1291121174089703e-06, + "loss": 2.6388, + "step": 9526 + }, + { + "epoch": 0.8624060830994841, + "grad_norm": 0.8162111639976501, + "learning_rate": 1.1243618952994195e-06, + "loss": 2.5933, + "step": 9527 + }, + { + "epoch": 0.8624966054132344, + "grad_norm": 0.7930101156234741, + "learning_rate": 1.1196216299639072e-06, + "loss": 2.5523, + "step": 9528 + }, + { + "epoch": 0.8625871277269846, + "grad_norm": 0.8353174328804016, + "learning_rate": 1.1148913218798075e-06, + "loss": 2.6007, + "step": 9529 + }, + { + "epoch": 0.862677650040735, + "grad_norm": 0.8778246641159058, + "learning_rate": 1.1101709715234386e-06, + "loss": 2.602, + "step": 9530 + }, + { + "epoch": 0.8627681723544853, + "grad_norm": 0.8321190476417542, + "learning_rate": 1.1054605793701545e-06, + "loss": 2.6676, + "step": 9531 + }, + { + "epoch": 0.8628586946682357, + "grad_norm": 0.7573698163032532, + "learning_rate": 1.1007601458942752e-06, + "loss": 2.563, + "step": 9532 + }, + { + "epoch": 0.862949216981986, + "grad_norm": 0.8202477097511292, + "learning_rate": 1.0960696715691443e-06, + "loss": 2.5636, + "step": 9533 + }, + { + "epoch": 0.8630397392957364, + "grad_norm": 0.7851460576057434, + "learning_rate": 1.0913891568670842e-06, + "loss": 2.6107, + "step": 9534 + }, + { + "epoch": 0.8631302616094867, + "grad_norm": 0.8061966896057129, + "learning_rate": 1.08671860225944e-06, + "loss": 2.5635, + "step": 9535 + }, + { + "epoch": 0.8632207839232371, + "grad_norm": 0.8170107007026672, + "learning_rate": 1.0820580082165244e-06, + "loss": 2.577, + "step": 9536 + }, + { + "epoch": 0.8633113062369874, + "grad_norm": 0.8730764985084534, + "learning_rate": 1.0774073752076618e-06, + "loss": 2.6012, + "step": 9537 + }, + { + "epoch": 0.8634018285507378, + "grad_norm": 0.8004407286643982, + "learning_rate": 1.0727667037011668e-06, + "loss": 2.6438, + "step": 9538 + }, + { + "epoch": 0.8634923508644881, + "grad_norm": 0.8690145611763, + "learning_rate": 1.0681359941643654e-06, + "loss": 2.6304, + "step": 9539 + }, + { + "epoch": 0.8635828731782385, + "grad_norm": 0.8464412689208984, + "learning_rate": 1.0635152470635512e-06, + "loss": 2.6322, + "step": 9540 + }, + { + "epoch": 0.8636733954919887, + "grad_norm": 0.86949622631073, + "learning_rate": 1.0589044628640522e-06, + "loss": 2.6291, + "step": 9541 + }, + { + "epoch": 0.8637639178057391, + "grad_norm": 0.8569595217704773, + "learning_rate": 1.0543036420301634e-06, + "loss": 2.5688, + "step": 9542 + }, + { + "epoch": 0.8638544401194894, + "grad_norm": 0.817255437374115, + "learning_rate": 1.049712785025192e-06, + "loss": 2.6486, + "step": 9543 + }, + { + "epoch": 0.8639449624332398, + "grad_norm": 0.8158665299415588, + "learning_rate": 1.045131892311435e-06, + "loss": 2.579, + "step": 9544 + }, + { + "epoch": 0.8640354847469901, + "grad_norm": 0.783174455165863, + "learning_rate": 1.0405609643501902e-06, + "loss": 2.5935, + "step": 9545 + }, + { + "epoch": 0.8641260070607405, + "grad_norm": 0.7993530631065369, + "learning_rate": 1.0360000016017445e-06, + "loss": 2.5839, + "step": 9546 + }, + { + "epoch": 0.8642165293744908, + "grad_norm": 0.8038840889930725, + "learning_rate": 1.0314490045253977e-06, + "loss": 2.5646, + "step": 9547 + }, + { + "epoch": 0.8643070516882412, + "grad_norm": 0.7852585315704346, + "learning_rate": 1.0269079735794164e-06, + "loss": 2.5571, + "step": 9548 + }, + { + "epoch": 0.8643975740019915, + "grad_norm": 0.8642681837081909, + "learning_rate": 1.0223769092211012e-06, + "loss": 2.6293, + "step": 9549 + }, + { + "epoch": 0.8644880963157419, + "grad_norm": 0.820342481136322, + "learning_rate": 1.0178558119067315e-06, + "loss": 2.59, + "step": 9550 + }, + { + "epoch": 0.8645786186294921, + "grad_norm": 0.8889754414558411, + "learning_rate": 1.013344682091555e-06, + "loss": 2.7136, + "step": 9551 + }, + { + "epoch": 0.8646691409432425, + "grad_norm": 0.8811084032058716, + "learning_rate": 1.008843520229874e-06, + "loss": 2.659, + "step": 9552 + }, + { + "epoch": 0.8647596632569928, + "grad_norm": 0.7807613015174866, + "learning_rate": 1.0043523267749489e-06, + "loss": 2.5764, + "step": 9553 + }, + { + "epoch": 0.8648501855707432, + "grad_norm": 0.8308903574943542, + "learning_rate": 9.998711021790174e-07, + "loss": 2.6319, + "step": 9554 + }, + { + "epoch": 0.8649407078844935, + "grad_norm": 0.8310360312461853, + "learning_rate": 9.953998468933634e-07, + "loss": 2.6007, + "step": 9555 + }, + { + "epoch": 0.8650312301982439, + "grad_norm": 0.7842338681221008, + "learning_rate": 9.909385613682375e-07, + "loss": 2.6078, + "step": 9556 + }, + { + "epoch": 0.8651217525119942, + "grad_norm": 0.8118513226509094, + "learning_rate": 9.864872460528918e-07, + "loss": 2.5622, + "step": 9557 + }, + { + "epoch": 0.8652122748257446, + "grad_norm": 0.7811709642410278, + "learning_rate": 9.820459013955674e-07, + "loss": 2.5342, + "step": 9558 + }, + { + "epoch": 0.8653027971394949, + "grad_norm": 0.8299935460090637, + "learning_rate": 9.776145278435178e-07, + "loss": 2.5751, + "step": 9559 + }, + { + "epoch": 0.8653933194532453, + "grad_norm": 0.8128529191017151, + "learning_rate": 9.731931258429638e-07, + "loss": 2.6198, + "step": 9560 + }, + { + "epoch": 0.8654838417669956, + "grad_norm": 0.8885383009910583, + "learning_rate": 9.687816958391716e-07, + "loss": 2.5648, + "step": 9561 + }, + { + "epoch": 0.865574364080746, + "grad_norm": 0.8305163383483887, + "learning_rate": 9.6438023827633e-07, + "loss": 2.586, + "step": 9562 + }, + { + "epoch": 0.8656648863944962, + "grad_norm": 0.8077706098556519, + "learning_rate": 9.59988753597707e-07, + "loss": 2.5863, + "step": 9563 + }, + { + "epoch": 0.8657554087082466, + "grad_norm": 0.8040973544120789, + "learning_rate": 9.55607242245493e-07, + "loss": 2.6233, + "step": 9564 + }, + { + "epoch": 0.8658459310219969, + "grad_norm": 0.8089603781700134, + "learning_rate": 9.512357046609244e-07, + "loss": 2.6533, + "step": 9565 + }, + { + "epoch": 0.8659364533357473, + "grad_norm": 0.7722705006599426, + "learning_rate": 9.468741412842152e-07, + "loss": 2.5909, + "step": 9566 + }, + { + "epoch": 0.8660269756494976, + "grad_norm": 0.8577929735183716, + "learning_rate": 9.4252255255457e-07, + "loss": 2.5763, + "step": 9567 + }, + { + "epoch": 0.866117497963248, + "grad_norm": 0.848434567451477, + "learning_rate": 9.381809389101825e-07, + "loss": 2.6147, + "step": 9568 + }, + { + "epoch": 0.8662080202769983, + "grad_norm": 0.8390416502952576, + "learning_rate": 9.338493007882809e-07, + "loss": 2.6285, + "step": 9569 + }, + { + "epoch": 0.8662985425907486, + "grad_norm": 0.8751125931739807, + "learning_rate": 9.295276386250274e-07, + "loss": 2.6566, + "step": 9570 + }, + { + "epoch": 0.866389064904499, + "grad_norm": 0.8329317569732666, + "learning_rate": 9.252159528556403e-07, + "loss": 2.6553, + "step": 9571 + }, + { + "epoch": 0.8664795872182492, + "grad_norm": 0.9101200103759766, + "learning_rate": 9.209142439142948e-07, + "loss": 2.6609, + "step": 9572 + }, + { + "epoch": 0.8665701095319996, + "grad_norm": 0.8253322243690491, + "learning_rate": 9.166225122341665e-07, + "loss": 2.606, + "step": 9573 + }, + { + "epoch": 0.8666606318457499, + "grad_norm": 0.8649923205375671, + "learning_rate": 9.123407582474541e-07, + "loss": 2.6011, + "step": 9574 + }, + { + "epoch": 0.8667511541595003, + "grad_norm": 0.9202775955200195, + "learning_rate": 9.080689823853017e-07, + "loss": 2.6167, + "step": 9575 + }, + { + "epoch": 0.8668416764732506, + "grad_norm": 0.8605530261993408, + "learning_rate": 9.038071850778984e-07, + "loss": 2.6093, + "step": 9576 + }, + { + "epoch": 0.866932198787001, + "grad_norm": 0.8165022730827332, + "learning_rate": 8.99555366754401e-07, + "loss": 2.5998, + "step": 9577 + }, + { + "epoch": 0.8670227211007513, + "grad_norm": 0.832669198513031, + "learning_rate": 8.953135278429558e-07, + "loss": 2.6118, + "step": 9578 + }, + { + "epoch": 0.8671132434145017, + "grad_norm": 0.8316287398338318, + "learning_rate": 8.910816687707435e-07, + "loss": 2.6511, + "step": 9579 + }, + { + "epoch": 0.867203765728252, + "grad_norm": 0.8216667771339417, + "learning_rate": 8.868597899638898e-07, + "loss": 2.6116, + "step": 9580 + }, + { + "epoch": 0.8672942880420024, + "grad_norm": 0.8596590757369995, + "learning_rate": 8.826478918475323e-07, + "loss": 2.6065, + "step": 9581 + }, + { + "epoch": 0.8673848103557527, + "grad_norm": 0.8234314918518066, + "learning_rate": 8.784459748458318e-07, + "loss": 2.6449, + "step": 9582 + }, + { + "epoch": 0.867475332669503, + "grad_norm": 0.8860653042793274, + "learning_rate": 8.742540393819054e-07, + "loss": 2.6161, + "step": 9583 + }, + { + "epoch": 0.8675658549832533, + "grad_norm": 0.8453433513641357, + "learning_rate": 8.700720858778821e-07, + "loss": 2.633, + "step": 9584 + }, + { + "epoch": 0.8676563772970037, + "grad_norm": 0.8935437202453613, + "learning_rate": 8.659001147548918e-07, + "loss": 2.6248, + "step": 9585 + }, + { + "epoch": 0.867746899610754, + "grad_norm": 0.8853086233139038, + "learning_rate": 8.617381264330426e-07, + "loss": 2.7061, + "step": 9586 + }, + { + "epoch": 0.8678374219245044, + "grad_norm": 0.8435954451560974, + "learning_rate": 8.575861213314551e-07, + "loss": 2.6205, + "step": 9587 + }, + { + "epoch": 0.8679279442382547, + "grad_norm": 0.8238980770111084, + "learning_rate": 8.534440998682391e-07, + "loss": 2.5941, + "step": 9588 + }, + { + "epoch": 0.8680184665520051, + "grad_norm": 0.784957766532898, + "learning_rate": 8.493120624604833e-07, + "loss": 2.5959, + "step": 9589 + }, + { + "epoch": 0.8681089888657554, + "grad_norm": 1.014724612236023, + "learning_rate": 8.451900095242881e-07, + "loss": 2.6458, + "step": 9590 + }, + { + "epoch": 0.8681995111795058, + "grad_norm": 0.7878062725067139, + "learning_rate": 8.410779414747439e-07, + "loss": 2.6508, + "step": 9591 + }, + { + "epoch": 0.8682900334932561, + "grad_norm": 0.9261922836303711, + "learning_rate": 8.369758587259413e-07, + "loss": 2.6285, + "step": 9592 + }, + { + "epoch": 0.8683805558070065, + "grad_norm": 0.8362093567848206, + "learning_rate": 8.328837616909613e-07, + "loss": 2.5912, + "step": 9593 + }, + { + "epoch": 0.8684710781207567, + "grad_norm": 0.8119001984596252, + "learning_rate": 8.288016507818742e-07, + "loss": 2.6457, + "step": 9594 + }, + { + "epoch": 0.8685616004345071, + "grad_norm": 0.7739616632461548, + "learning_rate": 8.247295264097288e-07, + "loss": 2.6102, + "step": 9595 + }, + { + "epoch": 0.8686521227482574, + "grad_norm": 0.8285651206970215, + "learning_rate": 8.206673889846306e-07, + "loss": 2.6539, + "step": 9596 + }, + { + "epoch": 0.8687426450620078, + "grad_norm": 0.8595246076583862, + "learning_rate": 8.166152389155967e-07, + "loss": 2.5955, + "step": 9597 + }, + { + "epoch": 0.8688331673757581, + "grad_norm": 0.8347133994102478, + "learning_rate": 8.125730766107009e-07, + "loss": 2.645, + "step": 9598 + }, + { + "epoch": 0.8689236896895085, + "grad_norm": 0.8522047996520996, + "learning_rate": 8.08540902476973e-07, + "loss": 2.591, + "step": 9599 + }, + { + "epoch": 0.8690142120032588, + "grad_norm": 0.8081803917884827, + "learning_rate": 8.04518716920466e-07, + "loss": 2.5815, + "step": 9600 + }, + { + "epoch": 0.8691047343170092, + "grad_norm": 0.8424069881439209, + "learning_rate": 8.005065203462225e-07, + "loss": 2.6368, + "step": 9601 + }, + { + "epoch": 0.8691952566307595, + "grad_norm": 0.8203180432319641, + "learning_rate": 7.965043131582529e-07, + "loss": 2.5874, + "step": 9602 + }, + { + "epoch": 0.8692857789445099, + "grad_norm": 0.7955147624015808, + "learning_rate": 7.925120957595678e-07, + "loss": 2.6271, + "step": 9603 + }, + { + "epoch": 0.8693763012582602, + "grad_norm": 0.8208197355270386, + "learning_rate": 7.885298685522235e-07, + "loss": 2.5695, + "step": 9604 + }, + { + "epoch": 0.8694668235720105, + "grad_norm": 0.8572072982788086, + "learning_rate": 7.845576319371883e-07, + "loss": 2.5843, + "step": 9605 + }, + { + "epoch": 0.8695573458857608, + "grad_norm": 0.9009573459625244, + "learning_rate": 7.805953863144977e-07, + "loss": 2.5855, + "step": 9606 + }, + { + "epoch": 0.8696478681995112, + "grad_norm": 0.8166123032569885, + "learning_rate": 7.766431320831324e-07, + "loss": 2.6133, + "step": 9607 + }, + { + "epoch": 0.8697383905132615, + "grad_norm": 0.831098198890686, + "learning_rate": 7.727008696410854e-07, + "loss": 2.6089, + "step": 9608 + }, + { + "epoch": 0.8698289128270119, + "grad_norm": 0.8021360039710999, + "learning_rate": 7.6876859938535e-07, + "loss": 2.6223, + "step": 9609 + }, + { + "epoch": 0.8699194351407622, + "grad_norm": 0.8092429041862488, + "learning_rate": 7.648463217118984e-07, + "loss": 2.5619, + "step": 9610 + }, + { + "epoch": 0.8700099574545125, + "grad_norm": 0.8074584603309631, + "learning_rate": 7.609340370157037e-07, + "loss": 2.6306, + "step": 9611 + }, + { + "epoch": 0.8701004797682629, + "grad_norm": 0.898284912109375, + "learning_rate": 7.570317456907506e-07, + "loss": 2.6498, + "step": 9612 + }, + { + "epoch": 0.8701910020820132, + "grad_norm": 0.8530883193016052, + "learning_rate": 7.531394481299691e-07, + "loss": 2.6105, + "step": 9613 + }, + { + "epoch": 0.8702815243957636, + "grad_norm": 0.8278710842132568, + "learning_rate": 7.49257144725346e-07, + "loss": 2.6035, + "step": 9614 + }, + { + "epoch": 0.8703720467095138, + "grad_norm": 0.8579922914505005, + "learning_rate": 7.453848358678017e-07, + "loss": 2.6313, + "step": 9615 + }, + { + "epoch": 0.8704625690232642, + "grad_norm": 0.831552267074585, + "learning_rate": 7.415225219472799e-07, + "loss": 2.6281, + "step": 9616 + }, + { + "epoch": 0.8705530913370145, + "grad_norm": 0.8504630327224731, + "learning_rate": 7.37670203352736e-07, + "loss": 2.6271, + "step": 9617 + }, + { + "epoch": 0.8706436136507649, + "grad_norm": 0.8295220732688904, + "learning_rate": 7.338278804720933e-07, + "loss": 2.583, + "step": 9618 + }, + { + "epoch": 0.8707341359645152, + "grad_norm": 0.8532193303108215, + "learning_rate": 7.299955536922531e-07, + "loss": 2.6143, + "step": 9619 + }, + { + "epoch": 0.8708246582782656, + "grad_norm": 0.8124157786369324, + "learning_rate": 7.261732233991513e-07, + "loss": 2.6051, + "step": 9620 + }, + { + "epoch": 0.8709151805920159, + "grad_norm": 0.8071900010108948, + "learning_rate": 7.223608899776912e-07, + "loss": 2.5306, + "step": 9621 + }, + { + "epoch": 0.8710057029057663, + "grad_norm": 0.8964101672172546, + "learning_rate": 7.185585538117657e-07, + "loss": 2.5625, + "step": 9622 + }, + { + "epoch": 0.8710962252195166, + "grad_norm": 0.8295537829399109, + "learning_rate": 7.147662152842794e-07, + "loss": 2.6173, + "step": 9623 + }, + { + "epoch": 0.871186747533267, + "grad_norm": 0.835732638835907, + "learning_rate": 7.109838747771269e-07, + "loss": 2.6326, + "step": 9624 + }, + { + "epoch": 0.8712772698470173, + "grad_norm": 0.8687270879745483, + "learning_rate": 7.072115326711704e-07, + "loss": 2.5738, + "step": 9625 + }, + { + "epoch": 0.8713677921607677, + "grad_norm": 0.8733930587768555, + "learning_rate": 7.034491893463058e-07, + "loss": 2.6143, + "step": 9626 + }, + { + "epoch": 0.8714583144745179, + "grad_norm": 0.8681638240814209, + "learning_rate": 6.996968451813857e-07, + "loss": 2.6106, + "step": 9627 + }, + { + "epoch": 0.8715488367882683, + "grad_norm": 0.8479701280593872, + "learning_rate": 6.959545005542744e-07, + "loss": 2.6332, + "step": 9628 + }, + { + "epoch": 0.8716393591020186, + "grad_norm": 0.7846943140029907, + "learning_rate": 6.922221558418263e-07, + "loss": 2.6283, + "step": 9629 + }, + { + "epoch": 0.871729881415769, + "grad_norm": 0.8460354208946228, + "learning_rate": 6.884998114198959e-07, + "loss": 2.6348, + "step": 9630 + }, + { + "epoch": 0.8718204037295193, + "grad_norm": 0.9053749442100525, + "learning_rate": 6.847874676633171e-07, + "loss": 2.6853, + "step": 9631 + }, + { + "epoch": 0.8719109260432697, + "grad_norm": 0.8776060938835144, + "learning_rate": 6.810851249459238e-07, + "loss": 2.6069, + "step": 9632 + }, + { + "epoch": 0.87200144835702, + "grad_norm": 0.843860924243927, + "learning_rate": 6.773927836405291e-07, + "loss": 2.6399, + "step": 9633 + }, + { + "epoch": 0.8720919706707704, + "grad_norm": 0.8224048614501953, + "learning_rate": 6.737104441189801e-07, + "loss": 2.6078, + "step": 9634 + }, + { + "epoch": 0.8721824929845207, + "grad_norm": 0.8930675983428955, + "learning_rate": 6.700381067520578e-07, + "loss": 2.6118, + "step": 9635 + }, + { + "epoch": 0.8722730152982711, + "grad_norm": 0.8419080376625061, + "learning_rate": 6.663757719095886e-07, + "loss": 2.5938, + "step": 9636 + }, + { + "epoch": 0.8723635376120213, + "grad_norm": 0.8541986346244812, + "learning_rate": 6.627234399603555e-07, + "loss": 2.6475, + "step": 9637 + }, + { + "epoch": 0.8724540599257717, + "grad_norm": 0.8377965688705444, + "learning_rate": 6.59081111272164e-07, + "loss": 2.6409, + "step": 9638 + }, + { + "epoch": 0.872544582239522, + "grad_norm": 0.8927517533302307, + "learning_rate": 6.554487862117764e-07, + "loss": 2.6301, + "step": 9639 + }, + { + "epoch": 0.8726351045532724, + "grad_norm": 0.8720734119415283, + "learning_rate": 6.518264651449779e-07, + "loss": 2.6155, + "step": 9640 + }, + { + "epoch": 0.8727256268670227, + "grad_norm": 0.8335932493209839, + "learning_rate": 6.482141484365434e-07, + "loss": 2.6052, + "step": 9641 + }, + { + "epoch": 0.8728161491807731, + "grad_norm": 0.7936855554580688, + "learning_rate": 6.446118364502152e-07, + "loss": 2.6495, + "step": 9642 + }, + { + "epoch": 0.8729066714945234, + "grad_norm": 0.8104913234710693, + "learning_rate": 6.410195295487698e-07, + "loss": 2.571, + "step": 9643 + }, + { + "epoch": 0.8729971938082738, + "grad_norm": 0.8593425154685974, + "learning_rate": 6.374372280939289e-07, + "loss": 2.5517, + "step": 9644 + }, + { + "epoch": 0.8730877161220241, + "grad_norm": 0.8080833554267883, + "learning_rate": 6.338649324464374e-07, + "loss": 2.617, + "step": 9645 + }, + { + "epoch": 0.8731782384357745, + "grad_norm": 0.8711926341056824, + "learning_rate": 6.303026429660408e-07, + "loss": 2.627, + "step": 9646 + }, + { + "epoch": 0.8732687607495248, + "grad_norm": 0.8949486017227173, + "learning_rate": 6.267503600114411e-07, + "loss": 2.6759, + "step": 9647 + }, + { + "epoch": 0.8733592830632751, + "grad_norm": 0.8196110129356384, + "learning_rate": 6.232080839403631e-07, + "loss": 2.6175, + "step": 9648 + }, + { + "epoch": 0.8734498053770254, + "grad_norm": 0.8647054433822632, + "learning_rate": 6.196758151095106e-07, + "loss": 2.6176, + "step": 9649 + }, + { + "epoch": 0.8735403276907758, + "grad_norm": 0.7463350296020508, + "learning_rate": 6.161535538745878e-07, + "loss": 2.586, + "step": 9650 + }, + { + "epoch": 0.8736308500045261, + "grad_norm": 0.8691142797470093, + "learning_rate": 6.126413005902775e-07, + "loss": 2.6337, + "step": 9651 + }, + { + "epoch": 0.8737213723182764, + "grad_norm": 0.8492142558097839, + "learning_rate": 6.09139055610275e-07, + "loss": 2.5886, + "step": 9652 + }, + { + "epoch": 0.8738118946320268, + "grad_norm": 0.8391124606132507, + "learning_rate": 6.056468192872422e-07, + "loss": 2.6409, + "step": 9653 + }, + { + "epoch": 0.8739024169457771, + "grad_norm": 0.8021348714828491, + "learning_rate": 6.021645919728647e-07, + "loss": 2.5925, + "step": 9654 + }, + { + "epoch": 0.8739929392595275, + "grad_norm": 0.7921338081359863, + "learning_rate": 5.986923740177842e-07, + "loss": 2.609, + "step": 9655 + }, + { + "epoch": 0.8740834615732778, + "grad_norm": 0.8511329889297485, + "learning_rate": 5.952301657716652e-07, + "loss": 2.6105, + "step": 9656 + }, + { + "epoch": 0.8741739838870282, + "grad_norm": 0.8938495516777039, + "learning_rate": 5.917779675831403e-07, + "loss": 2.5969, + "step": 9657 + }, + { + "epoch": 0.8742645062007784, + "grad_norm": 0.9117132425308228, + "learning_rate": 5.883357797998757e-07, + "loss": 2.6351, + "step": 9658 + }, + { + "epoch": 0.8743550285145288, + "grad_norm": 0.8959320187568665, + "learning_rate": 5.849036027684606e-07, + "loss": 2.5781, + "step": 9659 + }, + { + "epoch": 0.8744455508282791, + "grad_norm": 0.8768128156661987, + "learning_rate": 5.814814368345412e-07, + "loss": 2.6333, + "step": 9660 + }, + { + "epoch": 0.8745360731420295, + "grad_norm": 0.809253454208374, + "learning_rate": 5.780692823427191e-07, + "loss": 2.5255, + "step": 9661 + }, + { + "epoch": 0.8746265954557798, + "grad_norm": 0.9544652104377747, + "learning_rate": 5.746671396365977e-07, + "loss": 2.6322, + "step": 9662 + }, + { + "epoch": 0.8747171177695302, + "grad_norm": 0.8581130504608154, + "learning_rate": 5.712750090587693e-07, + "loss": 2.6322, + "step": 9663 + }, + { + "epoch": 0.8748076400832805, + "grad_norm": 0.8829495906829834, + "learning_rate": 5.678928909508275e-07, + "loss": 2.586, + "step": 9664 + }, + { + "epoch": 0.8748981623970309, + "grad_norm": 0.7909978032112122, + "learning_rate": 5.645207856533552e-07, + "loss": 2.6083, + "step": 9665 + }, + { + "epoch": 0.8749886847107812, + "grad_norm": 0.861017644405365, + "learning_rate": 5.611586935059255e-07, + "loss": 2.6235, + "step": 9666 + }, + { + "epoch": 0.8750792070245316, + "grad_norm": 0.8640226125717163, + "learning_rate": 5.578066148470895e-07, + "loss": 2.5969, + "step": 9667 + }, + { + "epoch": 0.8751697293382819, + "grad_norm": 0.8930851221084595, + "learning_rate": 5.544645500143997e-07, + "loss": 2.5674, + "step": 9668 + }, + { + "epoch": 0.8752602516520323, + "grad_norm": 0.8401358127593994, + "learning_rate": 5.511324993444201e-07, + "loss": 2.6385, + "step": 9669 + }, + { + "epoch": 0.8753507739657825, + "grad_norm": 0.8609418869018555, + "learning_rate": 5.478104631726711e-07, + "loss": 2.6145, + "step": 9670 + }, + { + "epoch": 0.8754412962795329, + "grad_norm": 0.8679646253585815, + "learning_rate": 5.444984418336852e-07, + "loss": 2.6633, + "step": 9671 + }, + { + "epoch": 0.8755318185932832, + "grad_norm": 0.8041927218437195, + "learning_rate": 5.411964356609845e-07, + "loss": 2.6179, + "step": 9672 + }, + { + "epoch": 0.8756223409070336, + "grad_norm": 0.8592411279678345, + "learning_rate": 5.379044449870807e-07, + "loss": 2.598, + "step": 9673 + }, + { + "epoch": 0.8757128632207839, + "grad_norm": 0.8296803832054138, + "learning_rate": 5.346224701434866e-07, + "loss": 2.6365, + "step": 9674 + }, + { + "epoch": 0.8758033855345343, + "grad_norm": 0.8249205946922302, + "learning_rate": 5.31350511460682e-07, + "loss": 2.613, + "step": 9675 + }, + { + "epoch": 0.8758939078482846, + "grad_norm": 0.80314040184021, + "learning_rate": 5.280885692681592e-07, + "loss": 2.5824, + "step": 9676 + }, + { + "epoch": 0.875984430162035, + "grad_norm": 0.8617088794708252, + "learning_rate": 5.248366438943996e-07, + "loss": 2.5894, + "step": 9677 + }, + { + "epoch": 0.8760749524757853, + "grad_norm": 0.8568413257598877, + "learning_rate": 5.215947356668638e-07, + "loss": 2.7115, + "step": 9678 + }, + { + "epoch": 0.8761654747895357, + "grad_norm": 0.8277208209037781, + "learning_rate": 5.183628449120126e-07, + "loss": 2.6778, + "step": 9679 + }, + { + "epoch": 0.8762559971032859, + "grad_norm": 0.9010827541351318, + "learning_rate": 5.151409719553079e-07, + "loss": 2.6317, + "step": 9680 + }, + { + "epoch": 0.8763465194170363, + "grad_norm": 0.8497141599655151, + "learning_rate": 5.119291171211793e-07, + "loss": 2.6076, + "step": 9681 + }, + { + "epoch": 0.8764370417307866, + "grad_norm": 0.8685660362243652, + "learning_rate": 5.087272807330679e-07, + "loss": 2.6325, + "step": 9682 + }, + { + "epoch": 0.876527564044537, + "grad_norm": 0.804347813129425, + "learning_rate": 5.055354631134047e-07, + "loss": 2.6702, + "step": 9683 + }, + { + "epoch": 0.8766180863582873, + "grad_norm": 0.8076506853103638, + "learning_rate": 5.02353664583588e-07, + "loss": 2.6483, + "step": 9684 + }, + { + "epoch": 0.8767086086720377, + "grad_norm": 0.790969967842102, + "learning_rate": 4.991818854640395e-07, + "loss": 2.5682, + "step": 9685 + }, + { + "epoch": 0.876799130985788, + "grad_norm": 0.8944148421287537, + "learning_rate": 4.960201260741593e-07, + "loss": 2.605, + "step": 9686 + }, + { + "epoch": 0.8768896532995384, + "grad_norm": 0.8538900017738342, + "learning_rate": 4.928683867323147e-07, + "loss": 2.6175, + "step": 9687 + }, + { + "epoch": 0.8769801756132887, + "grad_norm": 0.8444143533706665, + "learning_rate": 4.897266677559187e-07, + "loss": 2.6092, + "step": 9688 + }, + { + "epoch": 0.8770706979270391, + "grad_norm": 0.8742196559906006, + "learning_rate": 4.865949694613181e-07, + "loss": 2.6217, + "step": 9689 + }, + { + "epoch": 0.8771612202407894, + "grad_norm": 0.8844677805900574, + "learning_rate": 4.834732921638719e-07, + "loss": 2.5721, + "step": 9690 + }, + { + "epoch": 0.8772517425545397, + "grad_norm": 0.8656468391418457, + "learning_rate": 4.803616361779506e-07, + "loss": 2.6361, + "step": 9691 + }, + { + "epoch": 0.87734226486829, + "grad_norm": 0.8689473271369934, + "learning_rate": 4.772600018168816e-07, + "loss": 2.6226, + "step": 9692 + }, + { + "epoch": 0.8774327871820403, + "grad_norm": 0.8834279179573059, + "learning_rate": 4.7416838939301487e-07, + "loss": 2.5908, + "step": 9693 + }, + { + "epoch": 0.8775233094957907, + "grad_norm": 0.8247548937797546, + "learning_rate": 4.710867992176682e-07, + "loss": 2.6217, + "step": 9694 + }, + { + "epoch": 0.877613831809541, + "grad_norm": 0.8003450036048889, + "learning_rate": 4.680152316011488e-07, + "loss": 2.5869, + "step": 9695 + }, + { + "epoch": 0.8777043541232914, + "grad_norm": 0.8029552698135376, + "learning_rate": 4.649536868527871e-07, + "loss": 2.6065, + "step": 9696 + }, + { + "epoch": 0.8777948764370417, + "grad_norm": 0.8596997857093811, + "learning_rate": 4.619021652808586e-07, + "loss": 2.7067, + "step": 9697 + }, + { + "epoch": 0.8778853987507921, + "grad_norm": 0.7986694574356079, + "learning_rate": 4.58860667192651e-07, + "loss": 2.5868, + "step": 9698 + }, + { + "epoch": 0.8779759210645424, + "grad_norm": 0.8311411142349243, + "learning_rate": 4.5582919289446357e-07, + "loss": 2.5162, + "step": 9699 + }, + { + "epoch": 0.8780664433782928, + "grad_norm": 0.7954468131065369, + "learning_rate": 4.5280774269154115e-07, + "loss": 2.5697, + "step": 9700 + }, + { + "epoch": 0.878156965692043, + "grad_norm": 0.9038304090499878, + "learning_rate": 4.497963168881625e-07, + "loss": 2.5705, + "step": 9701 + }, + { + "epoch": 0.8782474880057934, + "grad_norm": 0.856782078742981, + "learning_rate": 4.4679491578756285e-07, + "loss": 2.6178, + "step": 9702 + }, + { + "epoch": 0.8783380103195437, + "grad_norm": 0.8561943769454956, + "learning_rate": 4.438035396920004e-07, + "loss": 2.6096, + "step": 9703 + }, + { + "epoch": 0.8784285326332941, + "grad_norm": 0.8387544751167297, + "learning_rate": 4.408221889026898e-07, + "loss": 2.5912, + "step": 9704 + }, + { + "epoch": 0.8785190549470444, + "grad_norm": 0.8454294204711914, + "learning_rate": 4.378508637198686e-07, + "loss": 2.5926, + "step": 9705 + }, + { + "epoch": 0.8786095772607948, + "grad_norm": 0.9286566972732544, + "learning_rate": 4.348895644427309e-07, + "loss": 2.6156, + "step": 9706 + }, + { + "epoch": 0.8787000995745451, + "grad_norm": 0.8351259231567383, + "learning_rate": 4.319382913695047e-07, + "loss": 2.651, + "step": 9707 + }, + { + "epoch": 0.8787906218882955, + "grad_norm": 0.8249180912971497, + "learning_rate": 4.289970447973635e-07, + "loss": 2.5576, + "step": 9708 + }, + { + "epoch": 0.8788811442020458, + "grad_norm": 0.8286994695663452, + "learning_rate": 4.2606582502248137e-07, + "loss": 2.6025, + "step": 9709 + }, + { + "epoch": 0.8789716665157962, + "grad_norm": 0.8584800362586975, + "learning_rate": 4.2314463234005565e-07, + "loss": 2.5787, + "step": 9710 + }, + { + "epoch": 0.8790621888295465, + "grad_norm": 0.8871709108352661, + "learning_rate": 4.2023346704425094e-07, + "loss": 2.6003, + "step": 9711 + }, + { + "epoch": 0.8791527111432968, + "grad_norm": 0.8227541446685791, + "learning_rate": 4.173323294281994e-07, + "loss": 2.5901, + "step": 9712 + }, + { + "epoch": 0.8792432334570471, + "grad_norm": 0.8498722910881042, + "learning_rate": 4.144412197840564e-07, + "loss": 2.5865, + "step": 9713 + }, + { + "epoch": 0.8793337557707975, + "grad_norm": 0.8699567317962646, + "learning_rate": 4.115601384029666e-07, + "loss": 2.6844, + "step": 9714 + }, + { + "epoch": 0.8794242780845478, + "grad_norm": 0.9470655918121338, + "learning_rate": 4.0868908557504247e-07, + "loss": 2.643, + "step": 9715 + }, + { + "epoch": 0.8795148003982982, + "grad_norm": 0.8927464485168457, + "learning_rate": 4.058280615893972e-07, + "loss": 2.5728, + "step": 9716 + }, + { + "epoch": 0.8796053227120485, + "grad_norm": 0.8503594398498535, + "learning_rate": 4.029770667341448e-07, + "loss": 2.6019, + "step": 9717 + }, + { + "epoch": 0.8796958450257989, + "grad_norm": 0.885911226272583, + "learning_rate": 4.001361012963778e-07, + "loss": 2.6893, + "step": 9718 + }, + { + "epoch": 0.8797863673395492, + "grad_norm": 0.7960912585258484, + "learning_rate": 3.9730516556217845e-07, + "loss": 2.5933, + "step": 9719 + }, + { + "epoch": 0.8798768896532996, + "grad_norm": 0.8120215535163879, + "learning_rate": 3.9448425981661876e-07, + "loss": 2.6384, + "step": 9720 + }, + { + "epoch": 0.8799674119670499, + "grad_norm": 0.8336339592933655, + "learning_rate": 3.916733843437825e-07, + "loss": 2.6157, + "step": 9721 + }, + { + "epoch": 0.8800579342808003, + "grad_norm": 0.8533289432525635, + "learning_rate": 3.88872539426699e-07, + "loss": 2.5811, + "step": 9722 + }, + { + "epoch": 0.8801484565945505, + "grad_norm": 0.8294474482536316, + "learning_rate": 3.8608172534743135e-07, + "loss": 2.6509, + "step": 9723 + }, + { + "epoch": 0.8802389789083009, + "grad_norm": 0.9152242541313171, + "learning_rate": 3.833009423869993e-07, + "loss": 2.6402, + "step": 9724 + }, + { + "epoch": 0.8803295012220512, + "grad_norm": 1.01529061794281, + "learning_rate": 3.805301908254455e-07, + "loss": 2.6434, + "step": 9725 + }, + { + "epoch": 0.8804200235358016, + "grad_norm": 0.8746420741081238, + "learning_rate": 3.7776947094176893e-07, + "loss": 2.6068, + "step": 9726 + }, + { + "epoch": 0.8805105458495519, + "grad_norm": 0.875525712966919, + "learning_rate": 3.750187830139917e-07, + "loss": 2.5965, + "step": 9727 + }, + { + "epoch": 0.8806010681633023, + "grad_norm": 0.9152287244796753, + "learning_rate": 3.7227812731909227e-07, + "loss": 2.5523, + "step": 9728 + }, + { + "epoch": 0.8806915904770526, + "grad_norm": 0.7987388968467712, + "learning_rate": 3.6954750413306096e-07, + "loss": 2.6045, + "step": 9729 + }, + { + "epoch": 0.880782112790803, + "grad_norm": 0.8325386047363281, + "learning_rate": 3.6682691373086665e-07, + "loss": 2.6529, + "step": 9730 + }, + { + "epoch": 0.8808726351045533, + "grad_norm": 0.7871389985084534, + "learning_rate": 3.64116356386468e-07, + "loss": 2.5837, + "step": 9731 + }, + { + "epoch": 0.8809631574183037, + "grad_norm": 0.83063805103302, + "learning_rate": 3.614158323728356e-07, + "loss": 2.607, + "step": 9732 + }, + { + "epoch": 0.881053679732054, + "grad_norm": 0.859025239944458, + "learning_rate": 3.587253419619074e-07, + "loss": 2.6318, + "step": 9733 + }, + { + "epoch": 0.8811442020458042, + "grad_norm": 0.8289056420326233, + "learning_rate": 3.5604488542460014e-07, + "loss": 2.5765, + "step": 9734 + }, + { + "epoch": 0.8812347243595546, + "grad_norm": 0.826295793056488, + "learning_rate": 3.533744630308533e-07, + "loss": 2.6251, + "step": 9735 + }, + { + "epoch": 0.8813252466733049, + "grad_norm": 0.8544880151748657, + "learning_rate": 3.50714075049563e-07, + "loss": 2.5817, + "step": 9736 + }, + { + "epoch": 0.8814157689870553, + "grad_norm": 0.8992463946342468, + "learning_rate": 3.480637217486482e-07, + "loss": 2.7122, + "step": 9737 + }, + { + "epoch": 0.8815062913008056, + "grad_norm": 0.9246089458465576, + "learning_rate": 3.4542340339498437e-07, + "loss": 2.6458, + "step": 9738 + }, + { + "epoch": 0.881596813614556, + "grad_norm": 0.7926107048988342, + "learning_rate": 3.4279312025445876e-07, + "loss": 2.6255, + "step": 9739 + }, + { + "epoch": 0.8816873359283063, + "grad_norm": 0.8976580500602722, + "learning_rate": 3.401728725919373e-07, + "loss": 2.6193, + "step": 9740 + }, + { + "epoch": 0.8817778582420567, + "grad_norm": 0.8773987889289856, + "learning_rate": 3.375626606712867e-07, + "loss": 2.6014, + "step": 9741 + }, + { + "epoch": 0.881868380555807, + "grad_norm": 0.8479148149490356, + "learning_rate": 3.349624847553412e-07, + "loss": 2.6462, + "step": 9742 + }, + { + "epoch": 0.8819589028695574, + "grad_norm": 0.8767992854118347, + "learning_rate": 3.3237234510595793e-07, + "loss": 2.5869, + "step": 9743 + }, + { + "epoch": 0.8820494251833076, + "grad_norm": 0.8235431909561157, + "learning_rate": 3.2979224198395055e-07, + "loss": 2.6434, + "step": 9744 + }, + { + "epoch": 0.882139947497058, + "grad_norm": 0.8444194197654724, + "learning_rate": 3.2722217564912226e-07, + "loss": 2.6202, + "step": 9745 + }, + { + "epoch": 0.8822304698108083, + "grad_norm": 0.8630514144897461, + "learning_rate": 3.2466214636031056e-07, + "loss": 2.5635, + "step": 9746 + }, + { + "epoch": 0.8823209921245587, + "grad_norm": 0.8077291250228882, + "learning_rate": 3.2211215437528694e-07, + "loss": 2.6431, + "step": 9747 + }, + { + "epoch": 0.882411514438309, + "grad_norm": 0.9250985383987427, + "learning_rate": 3.195721999508461e-07, + "loss": 2.6194, + "step": 9748 + }, + { + "epoch": 0.8825020367520594, + "grad_norm": 0.8408756256103516, + "learning_rate": 3.1704228334273887e-07, + "loss": 2.6216, + "step": 9749 + }, + { + "epoch": 0.8825925590658097, + "grad_norm": 0.8261474967002869, + "learning_rate": 3.145224048057727e-07, + "loss": 2.6363, + "step": 9750 + }, + { + "epoch": 0.8826830813795601, + "grad_norm": 0.8455504775047302, + "learning_rate": 3.1201256459365557e-07, + "loss": 2.6032, + "step": 9751 + }, + { + "epoch": 0.8827736036933104, + "grad_norm": 0.7715721130371094, + "learning_rate": 3.095127629591521e-07, + "loss": 2.5847, + "step": 9752 + }, + { + "epoch": 0.8828641260070608, + "grad_norm": 0.8211907744407654, + "learning_rate": 3.070230001539831e-07, + "loss": 2.6066, + "step": 9753 + }, + { + "epoch": 0.882954648320811, + "grad_norm": 0.8626493215560913, + "learning_rate": 3.045432764288703e-07, + "loss": 2.6688, + "step": 9754 + }, + { + "epoch": 0.8830451706345614, + "grad_norm": 0.8915566205978394, + "learning_rate": 3.020735920335138e-07, + "loss": 2.663, + "step": 9755 + }, + { + "epoch": 0.8831356929483117, + "grad_norm": 0.8275890946388245, + "learning_rate": 2.9961394721663703e-07, + "loss": 2.6331, + "step": 9756 + }, + { + "epoch": 0.8832262152620621, + "grad_norm": 0.837154746055603, + "learning_rate": 2.971643422258974e-07, + "loss": 2.6798, + "step": 9757 + }, + { + "epoch": 0.8833167375758124, + "grad_norm": 0.9589747190475464, + "learning_rate": 2.947247773079753e-07, + "loss": 2.5884, + "step": 9758 + }, + { + "epoch": 0.8834072598895628, + "grad_norm": 0.8292232751846313, + "learning_rate": 2.922952527085521e-07, + "loss": 2.6457, + "step": 9759 + }, + { + "epoch": 0.8834977822033131, + "grad_norm": 0.8505760431289673, + "learning_rate": 2.898757686722542e-07, + "loss": 2.5983, + "step": 9760 + }, + { + "epoch": 0.8835883045170635, + "grad_norm": 0.7981782555580139, + "learning_rate": 2.874663254427534e-07, + "loss": 2.6458, + "step": 9761 + }, + { + "epoch": 0.8836788268308138, + "grad_norm": 0.85322105884552, + "learning_rate": 2.850669232626557e-07, + "loss": 2.6481, + "step": 9762 + }, + { + "epoch": 0.8837693491445642, + "grad_norm": 0.9375588297843933, + "learning_rate": 2.8267756237359e-07, + "loss": 2.642, + "step": 9763 + }, + { + "epoch": 0.8838598714583145, + "grad_norm": 0.8446762561798096, + "learning_rate": 2.802982430161749e-07, + "loss": 2.6055, + "step": 9764 + }, + { + "epoch": 0.8839503937720649, + "grad_norm": 0.8564532399177551, + "learning_rate": 2.779289654299855e-07, + "loss": 2.6305, + "step": 9765 + }, + { + "epoch": 0.8840409160858151, + "grad_norm": 0.8674356341362, + "learning_rate": 2.7556972985363085e-07, + "loss": 2.639, + "step": 9766 + }, + { + "epoch": 0.8841314383995655, + "grad_norm": 0.8495703935623169, + "learning_rate": 2.732205365246654e-07, + "loss": 2.6171, + "step": 9767 + }, + { + "epoch": 0.8842219607133158, + "grad_norm": 0.828662633895874, + "learning_rate": 2.708813856796666e-07, + "loss": 2.6002, + "step": 9768 + }, + { + "epoch": 0.8843124830270662, + "grad_norm": 0.8758851289749146, + "learning_rate": 2.685522775541904e-07, + "loss": 2.6089, + "step": 9769 + }, + { + "epoch": 0.8844030053408165, + "grad_norm": 0.8468634486198425, + "learning_rate": 2.6623321238277157e-07, + "loss": 2.6449, + "step": 9770 + }, + { + "epoch": 0.8844935276545669, + "grad_norm": 0.8085419535636902, + "learning_rate": 2.6392419039892314e-07, + "loss": 2.6476, + "step": 9771 + }, + { + "epoch": 0.8845840499683172, + "grad_norm": 0.9288700222969055, + "learning_rate": 2.616252118352036e-07, + "loss": 2.6527, + "step": 9772 + }, + { + "epoch": 0.8846745722820676, + "grad_norm": 0.796964704990387, + "learning_rate": 2.5933627692308336e-07, + "loss": 2.6187, + "step": 9773 + }, + { + "epoch": 0.8847650945958179, + "grad_norm": 0.8337318897247314, + "learning_rate": 2.5705738589306696e-07, + "loss": 2.5765, + "step": 9774 + }, + { + "epoch": 0.8848556169095682, + "grad_norm": 0.9105563163757324, + "learning_rate": 2.547885389746485e-07, + "loss": 2.5931, + "step": 9775 + }, + { + "epoch": 0.8849461392233186, + "grad_norm": 0.8183169364929199, + "learning_rate": 2.5252973639628976e-07, + "loss": 2.5773, + "step": 9776 + }, + { + "epoch": 0.8850366615370688, + "grad_norm": 0.8156883120536804, + "learning_rate": 2.5028097838546425e-07, + "loss": 2.6185, + "step": 9777 + }, + { + "epoch": 0.8851271838508192, + "grad_norm": 0.9505560398101807, + "learning_rate": 2.480422651686132e-07, + "loss": 2.5851, + "step": 9778 + }, + { + "epoch": 0.8852177061645695, + "grad_norm": 0.9275203943252563, + "learning_rate": 2.4581359697116724e-07, + "loss": 2.6243, + "step": 9779 + }, + { + "epoch": 0.8853082284783199, + "grad_norm": 0.9110391139984131, + "learning_rate": 2.4359497401758024e-07, + "loss": 2.6535, + "step": 9780 + }, + { + "epoch": 0.8853987507920702, + "grad_norm": 0.8156226873397827, + "learning_rate": 2.413863965312402e-07, + "loss": 2.5498, + "step": 9781 + }, + { + "epoch": 0.8854892731058206, + "grad_norm": 0.8637019991874695, + "learning_rate": 2.3918786473455803e-07, + "loss": 2.6342, + "step": 9782 + }, + { + "epoch": 0.8855797954195709, + "grad_norm": 0.7892611622810364, + "learning_rate": 2.3699937884892332e-07, + "loss": 2.6476, + "step": 9783 + }, + { + "epoch": 0.8856703177333213, + "grad_norm": 0.8169649243354797, + "learning_rate": 2.3482093909473756e-07, + "loss": 2.6645, + "step": 9784 + }, + { + "epoch": 0.8857608400470716, + "grad_norm": 0.8641905188560486, + "learning_rate": 2.3265254569133644e-07, + "loss": 2.65, + "step": 9785 + }, + { + "epoch": 0.885851362360822, + "grad_norm": 0.9224991798400879, + "learning_rate": 2.3049419885711187e-07, + "loss": 2.5947, + "step": 9786 + }, + { + "epoch": 0.8859418846745722, + "grad_norm": 0.8599008321762085, + "learning_rate": 2.2834589880937894e-07, + "loss": 2.6237, + "step": 9787 + }, + { + "epoch": 0.8860324069883226, + "grad_norm": 0.8519898653030396, + "learning_rate": 2.2620764576448683e-07, + "loss": 2.6718, + "step": 9788 + }, + { + "epoch": 0.8861229293020729, + "grad_norm": 0.8302574157714844, + "learning_rate": 2.2407943993775214e-07, + "loss": 2.5672, + "step": 9789 + }, + { + "epoch": 0.8862134516158233, + "grad_norm": 0.8853892087936401, + "learning_rate": 2.219612815434924e-07, + "loss": 2.6739, + "step": 9790 + }, + { + "epoch": 0.8863039739295736, + "grad_norm": 0.81326824426651, + "learning_rate": 2.1985317079500356e-07, + "loss": 2.6567, + "step": 9791 + }, + { + "epoch": 0.886394496243324, + "grad_norm": 0.8466541767120361, + "learning_rate": 2.1775510790456033e-07, + "loss": 2.6393, + "step": 9792 + }, + { + "epoch": 0.8864850185570743, + "grad_norm": 0.8405859470367432, + "learning_rate": 2.1566709308346034e-07, + "loss": 2.6045, + "step": 9793 + }, + { + "epoch": 0.8865755408708247, + "grad_norm": 0.867979109287262, + "learning_rate": 2.135891265419465e-07, + "loss": 2.6394, + "step": 9794 + }, + { + "epoch": 0.886666063184575, + "grad_norm": 0.8394297361373901, + "learning_rate": 2.115212084892737e-07, + "loss": 2.5794, + "step": 9795 + }, + { + "epoch": 0.8867565854983254, + "grad_norm": 0.9299885034561157, + "learning_rate": 2.0946333913368643e-07, + "loss": 2.6022, + "step": 9796 + }, + { + "epoch": 0.8868471078120757, + "grad_norm": 0.8341772556304932, + "learning_rate": 2.0741551868241893e-07, + "loss": 2.6508, + "step": 9797 + }, + { + "epoch": 0.886937630125826, + "grad_norm": 0.8111425042152405, + "learning_rate": 2.0537774734167293e-07, + "loss": 2.6191, + "step": 9798 + }, + { + "epoch": 0.8870281524395763, + "grad_norm": 0.9102740287780762, + "learning_rate": 2.0335002531665092e-07, + "loss": 2.5984, + "step": 9799 + }, + { + "epoch": 0.8871186747533267, + "grad_norm": 0.8335270881652832, + "learning_rate": 2.0133235281156736e-07, + "loss": 2.535, + "step": 9800 + }, + { + "epoch": 0.887209197067077, + "grad_norm": 0.8242570161819458, + "learning_rate": 1.9932473002957087e-07, + "loss": 2.6613, + "step": 9801 + }, + { + "epoch": 0.8872997193808274, + "grad_norm": 0.8383133411407471, + "learning_rate": 1.973271571728441e-07, + "loss": 2.6395, + "step": 9802 + }, + { + "epoch": 0.8873902416945777, + "grad_norm": 0.8448271751403809, + "learning_rate": 1.9533963444254843e-07, + "loss": 2.6522, + "step": 9803 + }, + { + "epoch": 0.8874807640083281, + "grad_norm": 0.84769207239151, + "learning_rate": 1.9336216203881262e-07, + "loss": 2.6113, + "step": 9804 + }, + { + "epoch": 0.8875712863220784, + "grad_norm": 0.8611244559288025, + "learning_rate": 1.913947401607774e-07, + "loss": 2.5958, + "step": 9805 + }, + { + "epoch": 0.8876618086358288, + "grad_norm": 0.8302865624427795, + "learning_rate": 1.8943736900656206e-07, + "loss": 2.6314, + "step": 9806 + }, + { + "epoch": 0.8877523309495791, + "grad_norm": 0.8019070029258728, + "learning_rate": 1.8749004877327558e-07, + "loss": 2.648, + "step": 9807 + }, + { + "epoch": 0.8878428532633295, + "grad_norm": 0.856889009475708, + "learning_rate": 1.8555277965701668e-07, + "loss": 2.6345, + "step": 9808 + }, + { + "epoch": 0.8879333755770797, + "grad_norm": 0.8450315594673157, + "learning_rate": 1.836255618528515e-07, + "loss": 2.6494, + "step": 9809 + }, + { + "epoch": 0.8880238978908301, + "grad_norm": 0.8036577701568604, + "learning_rate": 1.817083955548693e-07, + "loss": 2.6248, + "step": 9810 + }, + { + "epoch": 0.8881144202045804, + "grad_norm": 0.8193379044532776, + "learning_rate": 1.798012809561045e-07, + "loss": 2.5855, + "step": 9811 + }, + { + "epoch": 0.8882049425183308, + "grad_norm": 0.7942414879798889, + "learning_rate": 1.7790421824863678e-07, + "loss": 2.591, + "step": 9812 + }, + { + "epoch": 0.8882954648320811, + "grad_norm": 0.9019108414649963, + "learning_rate": 1.7601720762346897e-07, + "loss": 2.6131, + "step": 9813 + }, + { + "epoch": 0.8883859871458315, + "grad_norm": 0.9382195472717285, + "learning_rate": 1.7414024927064897e-07, + "loss": 2.6733, + "step": 9814 + }, + { + "epoch": 0.8884765094595818, + "grad_norm": 0.8000298142433167, + "learning_rate": 1.7227334337917011e-07, + "loss": 2.5882, + "step": 9815 + }, + { + "epoch": 0.8885670317733321, + "grad_norm": 0.7768236994743347, + "learning_rate": 1.7041649013703753e-07, + "loss": 2.6274, + "step": 9816 + }, + { + "epoch": 0.8886575540870825, + "grad_norm": 0.9013504981994629, + "learning_rate": 1.685696897312239e-07, + "loss": 2.5695, + "step": 9817 + }, + { + "epoch": 0.8887480764008328, + "grad_norm": 0.8952003121376038, + "learning_rate": 1.6673294234771374e-07, + "loss": 2.5901, + "step": 9818 + }, + { + "epoch": 0.8888385987145832, + "grad_norm": 0.8011465072631836, + "learning_rate": 1.6490624817147026e-07, + "loss": 2.6135, + "step": 9819 + }, + { + "epoch": 0.8889291210283334, + "grad_norm": 0.7969688773155212, + "learning_rate": 1.630896073864352e-07, + "loss": 2.6103, + "step": 9820 + }, + { + "epoch": 0.8890196433420838, + "grad_norm": 0.853882372379303, + "learning_rate": 1.6128302017553998e-07, + "loss": 2.6324, + "step": 9821 + }, + { + "epoch": 0.8891101656558341, + "grad_norm": 0.8941799402236938, + "learning_rate": 1.5948648672071687e-07, + "loss": 2.5876, + "step": 9822 + }, + { + "epoch": 0.8892006879695845, + "grad_norm": 0.8067328929901123, + "learning_rate": 1.577000072028656e-07, + "loss": 2.5852, + "step": 9823 + }, + { + "epoch": 0.8892912102833348, + "grad_norm": 0.83470618724823, + "learning_rate": 1.5592358180189782e-07, + "loss": 2.6015, + "step": 9824 + }, + { + "epoch": 0.8893817325970852, + "grad_norm": 0.8342416882514954, + "learning_rate": 1.5415721069669264e-07, + "loss": 2.5621, + "step": 9825 + }, + { + "epoch": 0.8894722549108355, + "grad_norm": 0.7842913866043091, + "learning_rate": 1.5240089406513003e-07, + "loss": 2.6539, + "step": 9826 + }, + { + "epoch": 0.8895627772245859, + "grad_norm": 0.8189299702644348, + "learning_rate": 1.506546320840574e-07, + "loss": 2.586, + "step": 9827 + }, + { + "epoch": 0.8896532995383362, + "grad_norm": 0.8747230768203735, + "learning_rate": 1.4891842492933404e-07, + "loss": 2.6314, + "step": 9828 + }, + { + "epoch": 0.8897438218520866, + "grad_norm": 0.8976212739944458, + "learning_rate": 1.4719227277578685e-07, + "loss": 2.5779, + "step": 9829 + }, + { + "epoch": 0.8898343441658368, + "grad_norm": 0.9021840691566467, + "learning_rate": 1.4547617579725449e-07, + "loss": 2.5662, + "step": 9830 + }, + { + "epoch": 0.8899248664795872, + "grad_norm": 0.8414490222930908, + "learning_rate": 1.4377013416654317e-07, + "loss": 2.5859, + "step": 9831 + }, + { + "epoch": 0.8900153887933375, + "grad_norm": 0.8306783437728882, + "learning_rate": 1.4207414805543774e-07, + "loss": 2.6371, + "step": 9832 + }, + { + "epoch": 0.8901059111070879, + "grad_norm": 0.8770350813865662, + "learning_rate": 1.4038821763473485e-07, + "loss": 2.6405, + "step": 9833 + }, + { + "epoch": 0.8901964334208382, + "grad_norm": 0.8089725971221924, + "learning_rate": 1.3871234307420989e-07, + "loss": 2.5898, + "step": 9834 + }, + { + "epoch": 0.8902869557345886, + "grad_norm": 0.8233959078788757, + "learning_rate": 1.3704652454261668e-07, + "loss": 2.62, + "step": 9835 + }, + { + "epoch": 0.8903774780483389, + "grad_norm": 0.828133761882782, + "learning_rate": 1.3539076220769887e-07, + "loss": 2.6081, + "step": 9836 + }, + { + "epoch": 0.8904680003620893, + "grad_norm": 0.8399420976638794, + "learning_rate": 1.3374505623621192e-07, + "loss": 2.5688, + "step": 9837 + }, + { + "epoch": 0.8905585226758396, + "grad_norm": 0.8425469398498535, + "learning_rate": 1.3210940679385664e-07, + "loss": 2.6351, + "step": 9838 + }, + { + "epoch": 0.89064904498959, + "grad_norm": 0.8105515241622925, + "learning_rate": 1.3048381404535682e-07, + "loss": 2.5983, + "step": 9839 + }, + { + "epoch": 0.8907395673033403, + "grad_norm": 0.8184119462966919, + "learning_rate": 1.2886827815440372e-07, + "loss": 2.6031, + "step": 9840 + }, + { + "epoch": 0.8908300896170906, + "grad_norm": 0.8745843768119812, + "learning_rate": 1.2726279928367835e-07, + "loss": 2.6651, + "step": 9841 + }, + { + "epoch": 0.8909206119308409, + "grad_norm": 0.8466340899467468, + "learning_rate": 1.2566737759485136e-07, + "loss": 2.6278, + "step": 9842 + }, + { + "epoch": 0.8910111342445913, + "grad_norm": 0.8394578695297241, + "learning_rate": 1.2408201324859425e-07, + "loss": 2.5863, + "step": 9843 + }, + { + "epoch": 0.8911016565583416, + "grad_norm": 0.8000689148902893, + "learning_rate": 1.2250670640454597e-07, + "loss": 2.6167, + "step": 9844 + }, + { + "epoch": 0.891192178872092, + "grad_norm": 0.784951388835907, + "learning_rate": 1.2094145722134632e-07, + "loss": 2.5898, + "step": 9845 + }, + { + "epoch": 0.8912827011858423, + "grad_norm": 0.8008831739425659, + "learning_rate": 1.193862658566025e-07, + "loss": 2.6132, + "step": 9846 + }, + { + "epoch": 0.8913732234995927, + "grad_norm": 0.8564389944076538, + "learning_rate": 1.1784113246692263e-07, + "loss": 2.6358, + "step": 9847 + }, + { + "epoch": 0.891463745813343, + "grad_norm": 0.8283603191375732, + "learning_rate": 1.1630605720791554e-07, + "loss": 2.59, + "step": 9848 + }, + { + "epoch": 0.8915542681270934, + "grad_norm": 0.796347439289093, + "learning_rate": 1.1478104023414648e-07, + "loss": 2.5929, + "step": 9849 + }, + { + "epoch": 0.8916447904408437, + "grad_norm": 0.7919137477874756, + "learning_rate": 1.1326608169920372e-07, + "loss": 2.5491, + "step": 9850 + }, + { + "epoch": 0.8917353127545941, + "grad_norm": 0.9107193946838379, + "learning_rate": 1.1176118175563188e-07, + "loss": 2.6295, + "step": 9851 + }, + { + "epoch": 0.8918258350683443, + "grad_norm": 0.7748007774353027, + "learning_rate": 1.1026634055497642e-07, + "loss": 2.572, + "step": 9852 + }, + { + "epoch": 0.8919163573820947, + "grad_norm": 0.852328896522522, + "learning_rate": 1.0878155824776137e-07, + "loss": 2.6392, + "step": 9853 + }, + { + "epoch": 0.892006879695845, + "grad_norm": 0.7925384044647217, + "learning_rate": 1.0730683498351157e-07, + "loss": 2.5562, + "step": 9854 + }, + { + "epoch": 0.8920974020095954, + "grad_norm": 0.8247081637382507, + "learning_rate": 1.0584217091073046e-07, + "loss": 2.6082, + "step": 9855 + }, + { + "epoch": 0.8921879243233457, + "grad_norm": 0.8525282740592957, + "learning_rate": 1.0438756617691115e-07, + "loss": 2.622, + "step": 9856 + }, + { + "epoch": 0.892278446637096, + "grad_norm": 0.8287916779518127, + "learning_rate": 1.0294302092853647e-07, + "loss": 2.6699, + "step": 9857 + }, + { + "epoch": 0.8923689689508464, + "grad_norm": 0.8627311587333679, + "learning_rate": 1.0150853531105675e-07, + "loss": 2.6384, + "step": 9858 + }, + { + "epoch": 0.8924594912645967, + "grad_norm": 0.8527313470840454, + "learning_rate": 1.000841094689342e-07, + "loss": 2.5526, + "step": 9859 + }, + { + "epoch": 0.8925500135783471, + "grad_norm": 0.9218466281890869, + "learning_rate": 9.866974354560965e-08, + "loss": 2.6543, + "step": 9860 + }, + { + "epoch": 0.8926405358920974, + "grad_norm": 0.8103916049003601, + "learning_rate": 9.72654376835136e-08, + "loss": 2.5987, + "step": 9861 + }, + { + "epoch": 0.8927310582058477, + "grad_norm": 0.7997682094573975, + "learning_rate": 9.587119202405515e-08, + "loss": 2.6328, + "step": 9862 + }, + { + "epoch": 0.892821580519598, + "grad_norm": 0.9294595122337341, + "learning_rate": 9.4487006707622e-08, + "loss": 2.6306, + "step": 9863 + }, + { + "epoch": 0.8929121028333484, + "grad_norm": 0.8661568760871887, + "learning_rate": 9.311288187362488e-08, + "loss": 2.6281, + "step": 9864 + }, + { + "epoch": 0.8930026251470987, + "grad_norm": 0.8452597856521606, + "learning_rate": 9.174881766043087e-08, + "loss": 2.6432, + "step": 9865 + }, + { + "epoch": 0.8930931474608491, + "grad_norm": 0.8144384622573853, + "learning_rate": 9.039481420538565e-08, + "loss": 2.6192, + "step": 9866 + }, + { + "epoch": 0.8931836697745994, + "grad_norm": 0.8570119738578796, + "learning_rate": 8.905087164485793e-08, + "loss": 2.6034, + "step": 9867 + }, + { + "epoch": 0.8932741920883498, + "grad_norm": 0.8554793000221252, + "learning_rate": 8.771699011416168e-08, + "loss": 2.6499, + "step": 9868 + }, + { + "epoch": 0.8933647144021001, + "grad_norm": 0.8595912456512451, + "learning_rate": 8.6393169747645e-08, + "loss": 2.6522, + "step": 9869 + }, + { + "epoch": 0.8934552367158505, + "grad_norm": 0.7925248146057129, + "learning_rate": 8.507941067859016e-08, + "loss": 2.6335, + "step": 9870 + }, + { + "epoch": 0.8935457590296008, + "grad_norm": 0.9258356690406799, + "learning_rate": 8.377571303931353e-08, + "loss": 2.6616, + "step": 9871 + }, + { + "epoch": 0.8936362813433512, + "grad_norm": 0.8388485908508301, + "learning_rate": 8.24820769610879e-08, + "loss": 2.5649, + "step": 9872 + }, + { + "epoch": 0.8937268036571014, + "grad_norm": 0.8156565427780151, + "learning_rate": 8.119850257417571e-08, + "loss": 2.6408, + "step": 9873 + }, + { + "epoch": 0.8938173259708518, + "grad_norm": 0.8444782495498657, + "learning_rate": 7.992499000785136e-08, + "loss": 2.6462, + "step": 9874 + }, + { + "epoch": 0.8939078482846021, + "grad_norm": 0.7647959589958191, + "learning_rate": 7.866153939033449e-08, + "loss": 2.5655, + "step": 9875 + }, + { + "epoch": 0.8939983705983525, + "grad_norm": 0.9548771381378174, + "learning_rate": 7.740815084887887e-08, + "loss": 2.6601, + "step": 9876 + }, + { + "epoch": 0.8940888929121028, + "grad_norm": 0.8189795613288879, + "learning_rate": 7.616482450968354e-08, + "loss": 2.6318, + "step": 9877 + }, + { + "epoch": 0.8941794152258532, + "grad_norm": 0.9351743459701538, + "learning_rate": 7.493156049794836e-08, + "loss": 2.5698, + "step": 9878 + }, + { + "epoch": 0.8942699375396035, + "grad_norm": 0.8982877135276794, + "learning_rate": 7.370835893788508e-08, + "loss": 2.6731, + "step": 9879 + }, + { + "epoch": 0.8943604598533539, + "grad_norm": 0.9087199568748474, + "learning_rate": 7.249521995263964e-08, + "loss": 2.6435, + "step": 9880 + }, + { + "epoch": 0.8944509821671042, + "grad_norm": 0.8815861940383911, + "learning_rate": 7.129214366440317e-08, + "loss": 2.6479, + "step": 9881 + }, + { + "epoch": 0.8945415044808546, + "grad_norm": 0.8453705906867981, + "learning_rate": 7.009913019431213e-08, + "loss": 2.6292, + "step": 9882 + }, + { + "epoch": 0.8946320267946049, + "grad_norm": 0.8382077813148499, + "learning_rate": 6.891617966250374e-08, + "loss": 2.6051, + "step": 9883 + }, + { + "epoch": 0.8947225491083552, + "grad_norm": 0.8215992450714111, + "learning_rate": 6.774329218810493e-08, + "loss": 2.6244, + "step": 9884 + }, + { + "epoch": 0.8948130714221055, + "grad_norm": 0.9238488078117371, + "learning_rate": 6.658046788921012e-08, + "loss": 2.6096, + "step": 9885 + }, + { + "epoch": 0.8949035937358559, + "grad_norm": 0.8317505717277527, + "learning_rate": 6.542770688293676e-08, + "loss": 2.5954, + "step": 9886 + }, + { + "epoch": 0.8949941160496062, + "grad_norm": 0.8946189284324646, + "learning_rate": 6.428500928535864e-08, + "loss": 2.6525, + "step": 9887 + }, + { + "epoch": 0.8950846383633566, + "grad_norm": 0.8472013473510742, + "learning_rate": 6.315237521155038e-08, + "loss": 2.5973, + "step": 9888 + }, + { + "epoch": 0.8951751606771069, + "grad_norm": 0.8401451706886292, + "learning_rate": 6.202980477555408e-08, + "loss": 2.6031, + "step": 9889 + }, + { + "epoch": 0.8952656829908573, + "grad_norm": 0.8763166666030884, + "learning_rate": 6.09172980904238e-08, + "loss": 2.6654, + "step": 9890 + }, + { + "epoch": 0.8953562053046076, + "grad_norm": 0.8638229370117188, + "learning_rate": 5.981485526819208e-08, + "loss": 2.6553, + "step": 9891 + }, + { + "epoch": 0.895446727618358, + "grad_norm": 0.8521655797958374, + "learning_rate": 5.872247641987016e-08, + "loss": 2.579, + "step": 9892 + }, + { + "epoch": 0.8955372499321083, + "grad_norm": 0.8707503080368042, + "learning_rate": 5.764016165545894e-08, + "loss": 2.592, + "step": 9893 + }, + { + "epoch": 0.8956277722458587, + "grad_norm": 0.7827308177947998, + "learning_rate": 5.6567911083937883e-08, + "loss": 2.5608, + "step": 9894 + }, + { + "epoch": 0.8957182945596089, + "grad_norm": 0.8894588351249695, + "learning_rate": 5.5505724813309514e-08, + "loss": 2.6634, + "step": 9895 + }, + { + "epoch": 0.8958088168733593, + "grad_norm": 0.8418501019477844, + "learning_rate": 5.4453602950510494e-08, + "loss": 2.5637, + "step": 9896 + }, + { + "epoch": 0.8958993391871096, + "grad_norm": 0.8499705791473389, + "learning_rate": 5.341154560150052e-08, + "loss": 2.6132, + "step": 9897 + }, + { + "epoch": 0.8959898615008599, + "grad_norm": 0.871951699256897, + "learning_rate": 5.2379552871217875e-08, + "loss": 2.5509, + "step": 9898 + }, + { + "epoch": 0.8960803838146103, + "grad_norm": 0.8677495121955872, + "learning_rate": 5.1357624863579426e-08, + "loss": 2.6282, + "step": 9899 + }, + { + "epoch": 0.8961709061283606, + "grad_norm": 0.8658422827720642, + "learning_rate": 5.0345761681491746e-08, + "loss": 2.6401, + "step": 9900 + }, + { + "epoch": 0.896261428442111, + "grad_norm": 0.8718020915985107, + "learning_rate": 4.934396342684e-08, + "loss": 2.5931, + "step": 9901 + }, + { + "epoch": 0.8963519507558613, + "grad_norm": 0.8454814553260803, + "learning_rate": 4.8352230200532366e-08, + "loss": 2.5815, + "step": 9902 + }, + { + "epoch": 0.8964424730696117, + "grad_norm": 0.8844153881072998, + "learning_rate": 4.73705621024112e-08, + "loss": 2.6454, + "step": 9903 + }, + { + "epoch": 0.896532995383362, + "grad_norm": 0.7848690748214722, + "learning_rate": 4.639895923134185e-08, + "loss": 2.5818, + "step": 9904 + }, + { + "epoch": 0.8966235176971123, + "grad_norm": 0.8615793585777283, + "learning_rate": 4.543742168516829e-08, + "loss": 2.6557, + "step": 9905 + }, + { + "epoch": 0.8967140400108626, + "grad_norm": 0.924583375453949, + "learning_rate": 4.4485949560701955e-08, + "loss": 2.6373, + "step": 9906 + }, + { + "epoch": 0.896804562324613, + "grad_norm": 0.8576784133911133, + "learning_rate": 4.3544542953766196e-08, + "loss": 2.6288, + "step": 9907 + }, + { + "epoch": 0.8968950846383633, + "grad_norm": 0.9127756953239441, + "learning_rate": 4.2613201959162964e-08, + "loss": 2.6422, + "step": 9908 + }, + { + "epoch": 0.8969856069521137, + "grad_norm": 0.8672379851341248, + "learning_rate": 4.169192667067279e-08, + "loss": 2.5805, + "step": 9909 + }, + { + "epoch": 0.897076129265864, + "grad_norm": 0.8124669194221497, + "learning_rate": 4.078071718107701e-08, + "loss": 2.5549, + "step": 9910 + }, + { + "epoch": 0.8971666515796144, + "grad_norm": 0.9412040114402771, + "learning_rate": 3.987957358212446e-08, + "loss": 2.62, + "step": 9911 + }, + { + "epoch": 0.8972571738933647, + "grad_norm": 0.8409717679023743, + "learning_rate": 3.898849596456478e-08, + "loss": 2.6287, + "step": 9912 + }, + { + "epoch": 0.8973476962071151, + "grad_norm": 0.8697944283485413, + "learning_rate": 3.810748441812617e-08, + "loss": 2.6004, + "step": 9913 + }, + { + "epoch": 0.8974382185208654, + "grad_norm": 0.8681595921516418, + "learning_rate": 3.723653903152657e-08, + "loss": 2.5771, + "step": 9914 + }, + { + "epoch": 0.8975287408346158, + "grad_norm": 0.8452757000923157, + "learning_rate": 3.63756598924736e-08, + "loss": 2.6115, + "step": 9915 + }, + { + "epoch": 0.897619263148366, + "grad_norm": 0.8613131642341614, + "learning_rate": 3.552484708766457e-08, + "loss": 2.6312, + "step": 9916 + }, + { + "epoch": 0.8977097854621164, + "grad_norm": 0.8760083913803101, + "learning_rate": 3.468410070276429e-08, + "loss": 2.6234, + "step": 9917 + }, + { + "epoch": 0.8978003077758667, + "grad_norm": 0.8526800274848938, + "learning_rate": 3.3853420822438367e-08, + "loss": 2.5904, + "step": 9918 + }, + { + "epoch": 0.8978908300896171, + "grad_norm": 0.7996740937232971, + "learning_rate": 3.3032807530331e-08, + "loss": 2.5986, + "step": 9919 + }, + { + "epoch": 0.8979813524033674, + "grad_norm": 0.865635335445404, + "learning_rate": 3.2222260909087196e-08, + "loss": 2.5428, + "step": 9920 + }, + { + "epoch": 0.8980718747171178, + "grad_norm": 0.8549889922142029, + "learning_rate": 3.1421781040330555e-08, + "loss": 2.6553, + "step": 9921 + }, + { + "epoch": 0.8981623970308681, + "grad_norm": 0.8013193011283875, + "learning_rate": 3.0631368004663263e-08, + "loss": 2.5791, + "step": 9922 + }, + { + "epoch": 0.8982529193446185, + "grad_norm": 0.8656326532363892, + "learning_rate": 2.985102188168831e-08, + "loss": 2.6103, + "step": 9923 + }, + { + "epoch": 0.8983434416583688, + "grad_norm": 0.8901556730270386, + "learning_rate": 2.9080742749976188e-08, + "loss": 2.6084, + "step": 9924 + }, + { + "epoch": 0.8984339639721192, + "grad_norm": 0.7727611064910889, + "learning_rate": 2.8320530687098166e-08, + "loss": 2.5956, + "step": 9925 + }, + { + "epoch": 0.8985244862858695, + "grad_norm": 0.9226417541503906, + "learning_rate": 2.7570385769604135e-08, + "loss": 2.6375, + "step": 9926 + }, + { + "epoch": 0.8986150085996198, + "grad_norm": 0.8745355010032654, + "learning_rate": 2.683030807303366e-08, + "loss": 2.6177, + "step": 9927 + }, + { + "epoch": 0.8987055309133701, + "grad_norm": 0.8516587615013123, + "learning_rate": 2.610029767191602e-08, + "loss": 2.6181, + "step": 9928 + }, + { + "epoch": 0.8987960532271205, + "grad_norm": 0.8466259241104126, + "learning_rate": 2.5380354639770176e-08, + "loss": 2.6352, + "step": 9929 + }, + { + "epoch": 0.8988865755408708, + "grad_norm": 0.8286177515983582, + "learning_rate": 2.4670479049082597e-08, + "loss": 2.5683, + "step": 9930 + }, + { + "epoch": 0.8989770978546212, + "grad_norm": 0.9245983958244324, + "learning_rate": 2.397067097135164e-08, + "loss": 2.6485, + "step": 9931 + }, + { + "epoch": 0.8990676201683715, + "grad_norm": 0.8311428427696228, + "learning_rate": 2.3280930477032058e-08, + "loss": 2.6075, + "step": 9932 + }, + { + "epoch": 0.8991581424821219, + "grad_norm": 0.8732925057411194, + "learning_rate": 2.2601257635579408e-08, + "loss": 2.6051, + "step": 9933 + }, + { + "epoch": 0.8992486647958722, + "grad_norm": 0.8483322858810425, + "learning_rate": 2.193165251545004e-08, + "loss": 2.5679, + "step": 9934 + }, + { + "epoch": 0.8993391871096226, + "grad_norm": 0.8530345559120178, + "learning_rate": 2.1272115184067797e-08, + "loss": 2.5918, + "step": 9935 + }, + { + "epoch": 0.8994297094233729, + "grad_norm": 0.8855475783348083, + "learning_rate": 2.0622645707857326e-08, + "loss": 2.6783, + "step": 9936 + }, + { + "epoch": 0.8995202317371233, + "grad_norm": 0.9028958082199097, + "learning_rate": 1.9983244152199654e-08, + "loss": 2.6063, + "step": 9937 + }, + { + "epoch": 0.8996107540508735, + "grad_norm": 0.846435010433197, + "learning_rate": 1.9353910581498826e-08, + "loss": 2.5958, + "step": 9938 + }, + { + "epoch": 0.8997012763646238, + "grad_norm": 0.9395351409912109, + "learning_rate": 1.8734645059115265e-08, + "loss": 2.6846, + "step": 9939 + }, + { + "epoch": 0.8997917986783742, + "grad_norm": 0.7862410545349121, + "learning_rate": 1.81254476474213e-08, + "loss": 2.5467, + "step": 9940 + }, + { + "epoch": 0.8998823209921245, + "grad_norm": 0.8430321216583252, + "learning_rate": 1.752631840776786e-08, + "loss": 2.6722, + "step": 9941 + }, + { + "epoch": 0.8999728433058749, + "grad_norm": 0.8460858464241028, + "learning_rate": 1.693725740046226e-08, + "loss": 2.5722, + "step": 9942 + }, + { + "epoch": 0.9000633656196252, + "grad_norm": 0.7821448445320129, + "learning_rate": 1.6358264684857016e-08, + "loss": 2.5701, + "step": 9943 + }, + { + "epoch": 0.9001538879333756, + "grad_norm": 0.8697470426559448, + "learning_rate": 1.578934031922774e-08, + "loss": 2.6128, + "step": 9944 + }, + { + "epoch": 0.9002444102471259, + "grad_norm": 0.8176562190055847, + "learning_rate": 1.5230484360873044e-08, + "loss": 2.5756, + "step": 9945 + }, + { + "epoch": 0.9003349325608763, + "grad_norm": 0.8593572974205017, + "learning_rate": 1.4681696866081229e-08, + "loss": 2.6, + "step": 9946 + }, + { + "epoch": 0.9004254548746266, + "grad_norm": 0.7897228002548218, + "learning_rate": 1.41429778901081e-08, + "loss": 2.6036, + "step": 9947 + }, + { + "epoch": 0.900515977188377, + "grad_norm": 0.851230800151825, + "learning_rate": 1.361432748718805e-08, + "loss": 2.5607, + "step": 9948 + }, + { + "epoch": 0.9006064995021272, + "grad_norm": 0.855710506439209, + "learning_rate": 1.3095745710578477e-08, + "loss": 2.636, + "step": 9949 + }, + { + "epoch": 0.9006970218158776, + "grad_norm": 0.796705961227417, + "learning_rate": 1.2587232612493172e-08, + "loss": 2.5435, + "step": 9950 + }, + { + "epoch": 0.9007875441296279, + "grad_norm": 0.7984212040901184, + "learning_rate": 1.2088788244135618e-08, + "loss": 2.5907, + "step": 9951 + }, + { + "epoch": 0.9008780664433783, + "grad_norm": 0.8091149926185608, + "learning_rate": 1.1600412655710102e-08, + "loss": 2.6241, + "step": 9952 + }, + { + "epoch": 0.9009685887571286, + "grad_norm": 0.828718900680542, + "learning_rate": 1.1122105896377299e-08, + "loss": 2.6062, + "step": 9953 + }, + { + "epoch": 0.901059111070879, + "grad_norm": 0.8733603358268738, + "learning_rate": 1.0653868014309786e-08, + "loss": 2.5837, + "step": 9954 + }, + { + "epoch": 0.9011496333846293, + "grad_norm": 0.9175240993499756, + "learning_rate": 1.0195699056669838e-08, + "loss": 2.6383, + "step": 9955 + }, + { + "epoch": 0.9012401556983797, + "grad_norm": 0.8264015913009644, + "learning_rate": 9.747599069576119e-09, + "loss": 2.6036, + "step": 9956 + }, + { + "epoch": 0.90133067801213, + "grad_norm": 0.8627322912216187, + "learning_rate": 9.309568098170296e-09, + "loss": 2.5881, + "step": 9957 + }, + { + "epoch": 0.9014212003258804, + "grad_norm": 0.8578746914863586, + "learning_rate": 8.881606186561531e-09, + "loss": 2.6102, + "step": 9958 + }, + { + "epoch": 0.9015117226396306, + "grad_norm": 0.8336324095726013, + "learning_rate": 8.463713377826476e-09, + "loss": 2.6608, + "step": 9959 + }, + { + "epoch": 0.901602244953381, + "grad_norm": 0.8321331143379211, + "learning_rate": 8.055889714064791e-09, + "loss": 2.5883, + "step": 9960 + }, + { + "epoch": 0.9016927672671313, + "grad_norm": 0.822594165802002, + "learning_rate": 7.658135236343623e-09, + "loss": 2.6386, + "step": 9961 + }, + { + "epoch": 0.9017832895808817, + "grad_norm": 0.9133470058441162, + "learning_rate": 7.270449984708716e-09, + "loss": 2.6727, + "step": 9962 + }, + { + "epoch": 0.901873811894632, + "grad_norm": 0.8246998190879822, + "learning_rate": 6.892833998206616e-09, + "loss": 2.6284, + "step": 9963 + }, + { + "epoch": 0.9019643342083824, + "grad_norm": 0.9324564337730408, + "learning_rate": 6.525287314851358e-09, + "loss": 2.698, + "step": 9964 + }, + { + "epoch": 0.9020548565221327, + "grad_norm": 0.9247385859489441, + "learning_rate": 6.167809971668881e-09, + "loss": 2.6434, + "step": 9965 + }, + { + "epoch": 0.9021453788358831, + "grad_norm": 0.8682430982589722, + "learning_rate": 5.820402004652614e-09, + "loss": 2.6659, + "step": 9966 + }, + { + "epoch": 0.9022359011496334, + "grad_norm": 0.8353280425071716, + "learning_rate": 5.483063448785686e-09, + "loss": 2.6197, + "step": 9967 + }, + { + "epoch": 0.9023264234633838, + "grad_norm": 0.8058872222900391, + "learning_rate": 5.15579433804092e-09, + "loss": 2.5601, + "step": 9968 + }, + { + "epoch": 0.902416945777134, + "grad_norm": 0.9338617324829102, + "learning_rate": 4.838594705369737e-09, + "loss": 2.6476, + "step": 9969 + }, + { + "epoch": 0.9025074680908844, + "grad_norm": 0.8994864821434021, + "learning_rate": 4.531464582713252e-09, + "loss": 2.66, + "step": 9970 + }, + { + "epoch": 0.9025979904046347, + "grad_norm": 0.8840702772140503, + "learning_rate": 4.234404001002279e-09, + "loss": 2.578, + "step": 9971 + }, + { + "epoch": 0.9026885127183851, + "grad_norm": 0.9012619256973267, + "learning_rate": 3.947412990157329e-09, + "loss": 2.6063, + "step": 9972 + }, + { + "epoch": 0.9027790350321354, + "grad_norm": 0.8753988742828369, + "learning_rate": 3.670491579066404e-09, + "loss": 2.6057, + "step": 9973 + }, + { + "epoch": 0.9028695573458858, + "grad_norm": 0.8586761951446533, + "learning_rate": 3.4036397956183076e-09, + "loss": 2.6562, + "step": 9974 + }, + { + "epoch": 0.9029600796596361, + "grad_norm": 0.7856614589691162, + "learning_rate": 3.1468576666915383e-09, + "loss": 2.6084, + "step": 9975 + }, + { + "epoch": 0.9030506019733865, + "grad_norm": 0.837960958480835, + "learning_rate": 2.900145218143191e-09, + "loss": 2.648, + "step": 9976 + }, + { + "epoch": 0.9031411242871368, + "grad_norm": 0.9321572184562683, + "learning_rate": 2.6635024748089545e-09, + "loss": 2.6036, + "step": 9977 + }, + { + "epoch": 0.9032316466008872, + "grad_norm": 0.8802992701530457, + "learning_rate": 2.4369294605253166e-09, + "loss": 2.5803, + "step": 9978 + }, + { + "epoch": 0.9033221689146375, + "grad_norm": 0.8097015619277954, + "learning_rate": 2.2204261981073615e-09, + "loss": 2.6154, + "step": 9979 + }, + { + "epoch": 0.9034126912283877, + "grad_norm": 0.8596858978271484, + "learning_rate": 2.0139927093487664e-09, + "loss": 2.6293, + "step": 9980 + }, + { + "epoch": 0.9035032135421381, + "grad_norm": 0.8549774289131165, + "learning_rate": 1.8176290150551112e-09, + "loss": 2.6386, + "step": 9981 + }, + { + "epoch": 0.9035937358558884, + "grad_norm": 0.9319025278091431, + "learning_rate": 1.6313351349883655e-09, + "loss": 2.6025, + "step": 9982 + }, + { + "epoch": 0.9036842581696388, + "grad_norm": 0.8225411772727966, + "learning_rate": 1.455111087900196e-09, + "loss": 2.5595, + "step": 9983 + }, + { + "epoch": 0.9037747804833891, + "grad_norm": 0.8542320132255554, + "learning_rate": 1.2889568915541717e-09, + "loss": 2.6098, + "step": 9984 + }, + { + "epoch": 0.9038653027971395, + "grad_norm": 0.9284166693687439, + "learning_rate": 1.132872562681353e-09, + "loss": 2.6031, + "step": 9985 + }, + { + "epoch": 0.9039558251108898, + "grad_norm": 0.8331528902053833, + "learning_rate": 9.868581169802938e-10, + "loss": 2.6571, + "step": 9986 + }, + { + "epoch": 0.9040463474246402, + "grad_norm": 0.7966681122779846, + "learning_rate": 8.509135691725512e-10, + "loss": 2.6511, + "step": 9987 + }, + { + "epoch": 0.9041368697383905, + "grad_norm": 0.8282911777496338, + "learning_rate": 7.250389329471751e-10, + "loss": 2.5352, + "step": 9988 + }, + { + "epoch": 0.9042273920521409, + "grad_norm": 0.9159926176071167, + "learning_rate": 6.092342209607083e-10, + "loss": 2.5998, + "step": 9989 + }, + { + "epoch": 0.9043179143658912, + "grad_norm": 0.8755436539649963, + "learning_rate": 5.034994448926967e-10, + "loss": 2.6352, + "step": 9990 + }, + { + "epoch": 0.9044084366796415, + "grad_norm": 0.81312096118927, + "learning_rate": 4.078346153901791e-10, + "loss": 2.6202, + "step": 9991 + }, + { + "epoch": 0.9044989589933918, + "grad_norm": 0.7991079688072205, + "learning_rate": 3.2223974207878925e-10, + "loss": 2.6106, + "step": 9992 + }, + { + "epoch": 0.9045894813071422, + "grad_norm": 0.7968497276306152, + "learning_rate": 2.4671483358496007e-10, + "loss": 2.5344, + "step": 9993 + }, + { + "epoch": 0.9046800036208925, + "grad_norm": 0.7982305884361267, + "learning_rate": 1.812598975137192e-10, + "loss": 2.6351, + "step": 9994 + }, + { + "epoch": 0.9047705259346429, + "grad_norm": 0.8206318020820618, + "learning_rate": 1.258749404486892e-10, + "loss": 2.6411, + "step": 9995 + }, + { + "epoch": 0.9048610482483932, + "grad_norm": 0.7778028249740601, + "learning_rate": 8.055996797429188e-11, + "loss": 2.6404, + "step": 9996 + }, + { + "epoch": 0.9049515705621436, + "grad_norm": 0.8508470058441162, + "learning_rate": 4.531498464244166e-11, + "loss": 2.5897, + "step": 9997 + }, + { + "epoch": 0.9050420928758939, + "grad_norm": 0.8271351456642151, + "learning_rate": 2.0139994016954434e-11, + "loss": 2.5587, + "step": 9998 + }, + { + "epoch": 0.9051326151896443, + "grad_norm": 0.8943343758583069, + "learning_rate": 5.034998629138698e-12, + "loss": 2.5531, + "step": 9999 + }, + { + "epoch": 0.9052231375033946, + "grad_norm": 0.8616811633110046, + "learning_rate": 0.0, + "loss": 2.6833, + "step": 10000 + }, + { + "epoch": 0.9052231375033946, + "eval_loss": 2.5555005073547363, + "eval_runtime": 71.4851, + "eval_samples_per_second": 37.812, + "eval_steps_per_second": 3.161, + "step": 10000 + } + ], + "logging_steps": 1, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1494808064e+17, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}