diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,37485 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003589053387169, + "eval_steps": 500, + "global_step": 5352, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00014954389113204725, + "grad_norm": 35.95169344376715, + "learning_rate": 4.975124378109453e-08, + "loss": 1.1911, + "step": 1 + }, + { + "epoch": 0.0002990877822640945, + "grad_norm": 29.047342527504238, + "learning_rate": 9.950248756218906e-08, + "loss": 1.4707, + "step": 2 + }, + { + "epoch": 0.00044863167339614175, + "grad_norm": 24.718727160032117, + "learning_rate": 1.4925373134328358e-07, + "loss": 0.9534, + "step": 3 + }, + { + "epoch": 0.000598175564528189, + "grad_norm": 32.87218994198639, + "learning_rate": 1.9900497512437812e-07, + "loss": 1.2192, + "step": 4 + }, + { + "epoch": 0.0007477194556602363, + "grad_norm": 25.398344980222138, + "learning_rate": 2.4875621890547267e-07, + "loss": 1.1835, + "step": 5 + }, + { + "epoch": 0.0008972633467922835, + "grad_norm": 30.48079389364258, + "learning_rate": 2.9850746268656716e-07, + "loss": 1.0024, + "step": 6 + }, + { + "epoch": 0.0010468072379243307, + "grad_norm": 27.780032565686206, + "learning_rate": 3.4825870646766175e-07, + "loss": 1.1796, + "step": 7 + }, + { + "epoch": 0.001196351129056378, + "grad_norm": 33.19634259772052, + "learning_rate": 3.9800995024875624e-07, + "loss": 0.9585, + "step": 8 + }, + { + "epoch": 0.0013458950201884253, + "grad_norm": 32.92097675417938, + "learning_rate": 4.4776119402985074e-07, + "loss": 1.1831, + "step": 9 + }, + { + "epoch": 0.0014954389113204726, + "grad_norm": 31.267461918177617, + "learning_rate": 4.975124378109453e-07, + "loss": 0.9208, + "step": 10 + }, + { + "epoch": 0.0016449828024525197, + "grad_norm": 31.652990928454088, + "learning_rate": 5.472636815920398e-07, + "loss": 0.8882, + "step": 11 + }, + { + "epoch": 0.001794526693584567, + "grad_norm": 33.800482625732165, + "learning_rate": 5.970149253731343e-07, + "loss": 1.2138, + "step": 12 + }, + { + "epoch": 0.0019440705847166143, + "grad_norm": 30.753216086819556, + "learning_rate": 6.467661691542289e-07, + "loss": 0.9896, + "step": 13 + }, + { + "epoch": 0.0020936144758486614, + "grad_norm": 32.57679525538582, + "learning_rate": 6.965174129353235e-07, + "loss": 0.9195, + "step": 14 + }, + { + "epoch": 0.0022431583669807087, + "grad_norm": 25.334089702892793, + "learning_rate": 7.462686567164179e-07, + "loss": 0.7515, + "step": 15 + }, + { + "epoch": 0.002392702258112756, + "grad_norm": 22.2961872211284, + "learning_rate": 7.960199004975125e-07, + "loss": 0.6638, + "step": 16 + }, + { + "epoch": 0.0025422461492448033, + "grad_norm": 24.245556768411276, + "learning_rate": 8.457711442786071e-07, + "loss": 0.7704, + "step": 17 + }, + { + "epoch": 0.0026917900403768506, + "grad_norm": 19.23412917202397, + "learning_rate": 8.955223880597015e-07, + "loss": 0.7354, + "step": 18 + }, + { + "epoch": 0.002841333931508898, + "grad_norm": 18.58051317424024, + "learning_rate": 9.452736318407961e-07, + "loss": 0.5749, + "step": 19 + }, + { + "epoch": 0.0029908778226409452, + "grad_norm": 11.242228896944281, + "learning_rate": 9.950248756218907e-07, + "loss": 0.4914, + "step": 20 + }, + { + "epoch": 0.0031404217137729925, + "grad_norm": 11.163527479225325, + "learning_rate": 1.044776119402985e-06, + "loss": 0.5823, + "step": 21 + }, + { + "epoch": 0.0032899656049050394, + "grad_norm": 9.100766388616314, + "learning_rate": 1.0945273631840796e-06, + "loss": 0.6887, + "step": 22 + }, + { + "epoch": 0.0034395094960370867, + "grad_norm": 9.371427313022828, + "learning_rate": 1.1442786069651742e-06, + "loss": 0.3365, + "step": 23 + }, + { + "epoch": 0.003589053387169134, + "grad_norm": 6.591365654298028, + "learning_rate": 1.1940298507462686e-06, + "loss": 0.4092, + "step": 24 + }, + { + "epoch": 0.0037385972783011813, + "grad_norm": 6.692920733889971, + "learning_rate": 1.2437810945273632e-06, + "loss": 0.4459, + "step": 25 + }, + { + "epoch": 0.0038881411694332286, + "grad_norm": 6.609492289627464, + "learning_rate": 1.2935323383084578e-06, + "loss": 0.4577, + "step": 26 + }, + { + "epoch": 0.004037685060565276, + "grad_norm": 4.9115623336358, + "learning_rate": 1.3432835820895524e-06, + "loss": 0.5349, + "step": 27 + }, + { + "epoch": 0.004187228951697323, + "grad_norm": 5.117676678055004, + "learning_rate": 1.393034825870647e-06, + "loss": 0.5483, + "step": 28 + }, + { + "epoch": 0.0043367728428293706, + "grad_norm": 5.263481949191207, + "learning_rate": 1.4427860696517414e-06, + "loss": 0.5991, + "step": 29 + }, + { + "epoch": 0.004486316733961417, + "grad_norm": 6.131569220022702, + "learning_rate": 1.4925373134328358e-06, + "loss": 0.3908, + "step": 30 + }, + { + "epoch": 0.004635860625093465, + "grad_norm": 5.928579435490833, + "learning_rate": 1.5422885572139304e-06, + "loss": 0.2084, + "step": 31 + }, + { + "epoch": 0.004785404516225512, + "grad_norm": 5.916757088180695, + "learning_rate": 1.592039800995025e-06, + "loss": 0.3858, + "step": 32 + }, + { + "epoch": 0.00493494840735756, + "grad_norm": 8.20423570651997, + "learning_rate": 1.6417910447761196e-06, + "loss": 0.2901, + "step": 33 + }, + { + "epoch": 0.005084492298489607, + "grad_norm": 8.219360009824356, + "learning_rate": 1.6915422885572142e-06, + "loss": 0.3919, + "step": 34 + }, + { + "epoch": 0.005234036189621654, + "grad_norm": 5.998450714995048, + "learning_rate": 1.7412935323383088e-06, + "loss": 0.2445, + "step": 35 + }, + { + "epoch": 0.005383580080753701, + "grad_norm": 4.267389037528284, + "learning_rate": 1.791044776119403e-06, + "loss": 0.2062, + "step": 36 + }, + { + "epoch": 0.005533123971885748, + "grad_norm": 5.463746992191978, + "learning_rate": 1.8407960199004975e-06, + "loss": 0.5357, + "step": 37 + }, + { + "epoch": 0.005682667863017796, + "grad_norm": 4.306281637510176, + "learning_rate": 1.8905472636815921e-06, + "loss": 0.1867, + "step": 38 + }, + { + "epoch": 0.005832211754149843, + "grad_norm": 6.551059942168939, + "learning_rate": 1.9402985074626867e-06, + "loss": 0.5944, + "step": 39 + }, + { + "epoch": 0.0059817556452818905, + "grad_norm": 6.110559490141819, + "learning_rate": 1.9900497512437813e-06, + "loss": 0.6173, + "step": 40 + }, + { + "epoch": 0.006131299536413937, + "grad_norm": 4.577457366278138, + "learning_rate": 2.0398009950248755e-06, + "loss": 0.3634, + "step": 41 + }, + { + "epoch": 0.006280843427545985, + "grad_norm": 6.020057986889502, + "learning_rate": 2.08955223880597e-06, + "loss": 0.5398, + "step": 42 + }, + { + "epoch": 0.006430387318678032, + "grad_norm": 12.119213807947853, + "learning_rate": 2.1393034825870647e-06, + "loss": 0.2376, + "step": 43 + }, + { + "epoch": 0.006579931209810079, + "grad_norm": 4.977979102095054, + "learning_rate": 2.1890547263681593e-06, + "loss": 0.2455, + "step": 44 + }, + { + "epoch": 0.006729475100942127, + "grad_norm": 3.4274663141099166, + "learning_rate": 2.238805970149254e-06, + "loss": 0.2356, + "step": 45 + }, + { + "epoch": 0.0068790189920741734, + "grad_norm": 4.552279062958819, + "learning_rate": 2.2885572139303485e-06, + "loss": 0.1681, + "step": 46 + }, + { + "epoch": 0.007028562883206221, + "grad_norm": 2.9323320786902496, + "learning_rate": 2.338308457711443e-06, + "loss": 0.2303, + "step": 47 + }, + { + "epoch": 0.007178106774338268, + "grad_norm": 4.623033466327724, + "learning_rate": 2.3880597014925373e-06, + "loss": 0.2404, + "step": 48 + }, + { + "epoch": 0.007327650665470316, + "grad_norm": 5.05007020882628, + "learning_rate": 2.437810945273632e-06, + "loss": 0.4128, + "step": 49 + }, + { + "epoch": 0.007477194556602363, + "grad_norm": 2.5237349934200273, + "learning_rate": 2.4875621890547264e-06, + "loss": 0.2196, + "step": 50 + }, + { + "epoch": 0.00762673844773441, + "grad_norm": 3.7483142878646594, + "learning_rate": 2.537313432835821e-06, + "loss": 0.1725, + "step": 51 + }, + { + "epoch": 0.007776282338866457, + "grad_norm": 4.032155563605261, + "learning_rate": 2.5870646766169156e-06, + "loss": 0.3821, + "step": 52 + }, + { + "epoch": 0.007925826229998505, + "grad_norm": 3.7782327104964333, + "learning_rate": 2.6368159203980102e-06, + "loss": 0.2207, + "step": 53 + }, + { + "epoch": 0.008075370121130552, + "grad_norm": 4.816720331969929, + "learning_rate": 2.686567164179105e-06, + "loss": 0.2265, + "step": 54 + }, + { + "epoch": 0.008224914012262599, + "grad_norm": 2.8481845548797478, + "learning_rate": 2.736318407960199e-06, + "loss": 0.2174, + "step": 55 + }, + { + "epoch": 0.008374457903394646, + "grad_norm": 4.501151176073331, + "learning_rate": 2.786069651741294e-06, + "loss": 0.2306, + "step": 56 + }, + { + "epoch": 0.008524001794526694, + "grad_norm": 4.326693136186164, + "learning_rate": 2.835820895522388e-06, + "loss": 0.4023, + "step": 57 + }, + { + "epoch": 0.008673545685658741, + "grad_norm": 4.061925818141106, + "learning_rate": 2.885572139303483e-06, + "loss": 0.7602, + "step": 58 + }, + { + "epoch": 0.008823089576790788, + "grad_norm": 6.144988240043741, + "learning_rate": 2.9353233830845774e-06, + "loss": 0.4451, + "step": 59 + }, + { + "epoch": 0.008972633467922835, + "grad_norm": 4.985549166627373, + "learning_rate": 2.9850746268656716e-06, + "loss": 0.4621, + "step": 60 + }, + { + "epoch": 0.009122177359054883, + "grad_norm": 3.192079125281125, + "learning_rate": 3.0348258706467666e-06, + "loss": 0.3694, + "step": 61 + }, + { + "epoch": 0.00927172125018693, + "grad_norm": 4.653619400771914, + "learning_rate": 3.0845771144278608e-06, + "loss": 0.2416, + "step": 62 + }, + { + "epoch": 0.009421265141318977, + "grad_norm": 3.4214006556775156, + "learning_rate": 3.1343283582089558e-06, + "loss": 0.4755, + "step": 63 + }, + { + "epoch": 0.009570809032451024, + "grad_norm": 3.0809019894250613, + "learning_rate": 3.18407960199005e-06, + "loss": 0.4154, + "step": 64 + }, + { + "epoch": 0.009720352923583071, + "grad_norm": 4.190290076677796, + "learning_rate": 3.233830845771145e-06, + "loss": 0.4362, + "step": 65 + }, + { + "epoch": 0.00986989681471512, + "grad_norm": 3.1777725686355356, + "learning_rate": 3.283582089552239e-06, + "loss": 0.3635, + "step": 66 + }, + { + "epoch": 0.010019440705847166, + "grad_norm": 2.592442539170553, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1739, + "step": 67 + }, + { + "epoch": 0.010168984596979213, + "grad_norm": 4.610893839801018, + "learning_rate": 3.3830845771144283e-06, + "loss": 0.3845, + "step": 68 + }, + { + "epoch": 0.01031852848811126, + "grad_norm": 2.941030939381248, + "learning_rate": 3.4328358208955225e-06, + "loss": 0.226, + "step": 69 + }, + { + "epoch": 0.010468072379243309, + "grad_norm": 2.641062959772403, + "learning_rate": 3.4825870646766175e-06, + "loss": 0.2083, + "step": 70 + }, + { + "epoch": 0.010617616270375356, + "grad_norm": 4.573399002022637, + "learning_rate": 3.5323383084577117e-06, + "loss": 0.3639, + "step": 71 + }, + { + "epoch": 0.010767160161507403, + "grad_norm": 3.811597787697304, + "learning_rate": 3.582089552238806e-06, + "loss": 0.2046, + "step": 72 + }, + { + "epoch": 0.01091670405263945, + "grad_norm": 7.593654702612937, + "learning_rate": 3.631840796019901e-06, + "loss": 0.3831, + "step": 73 + }, + { + "epoch": 0.011066247943771496, + "grad_norm": 2.6372126137968013, + "learning_rate": 3.681592039800995e-06, + "loss": 0.2155, + "step": 74 + }, + { + "epoch": 0.011215791834903545, + "grad_norm": 3.401033168780161, + "learning_rate": 3.73134328358209e-06, + "loss": 0.2439, + "step": 75 + }, + { + "epoch": 0.011365335726035592, + "grad_norm": 2.8172647382036047, + "learning_rate": 3.7810945273631843e-06, + "loss": 0.1614, + "step": 76 + }, + { + "epoch": 0.011514879617167639, + "grad_norm": 3.525793180439174, + "learning_rate": 3.8308457711442784e-06, + "loss": 0.2176, + "step": 77 + }, + { + "epoch": 0.011664423508299685, + "grad_norm": 2.4029805525684527, + "learning_rate": 3.8805970149253735e-06, + "loss": 0.1893, + "step": 78 + }, + { + "epoch": 0.011813967399431732, + "grad_norm": 5.727795685387504, + "learning_rate": 3.930348258706468e-06, + "loss": 0.5702, + "step": 79 + }, + { + "epoch": 0.011963511290563781, + "grad_norm": 4.021893784746645, + "learning_rate": 3.980099502487563e-06, + "loss": 0.4027, + "step": 80 + }, + { + "epoch": 0.012113055181695828, + "grad_norm": 2.7773808558650535, + "learning_rate": 4.029850746268657e-06, + "loss": 0.2963, + "step": 81 + }, + { + "epoch": 0.012262599072827875, + "grad_norm": 3.4349426033049992, + "learning_rate": 4.079601990049751e-06, + "loss": 0.2211, + "step": 82 + }, + { + "epoch": 0.012412142963959922, + "grad_norm": 4.127258766074891, + "learning_rate": 4.129353233830846e-06, + "loss": 0.2516, + "step": 83 + }, + { + "epoch": 0.01256168685509197, + "grad_norm": 3.551977981988865, + "learning_rate": 4.17910447761194e-06, + "loss": 0.2206, + "step": 84 + }, + { + "epoch": 0.012711230746224017, + "grad_norm": 2.988554589230421, + "learning_rate": 4.228855721393035e-06, + "loss": 0.366, + "step": 85 + }, + { + "epoch": 0.012860774637356064, + "grad_norm": 3.256233912334862, + "learning_rate": 4.278606965174129e-06, + "loss": 0.341, + "step": 86 + }, + { + "epoch": 0.01301031852848811, + "grad_norm": 3.917242635149468, + "learning_rate": 4.3283582089552236e-06, + "loss": 0.281, + "step": 87 + }, + { + "epoch": 0.013159862419620158, + "grad_norm": 3.8372869351661247, + "learning_rate": 4.378109452736319e-06, + "loss": 0.1933, + "step": 88 + }, + { + "epoch": 0.013309406310752206, + "grad_norm": 4.03192980896834, + "learning_rate": 4.427860696517413e-06, + "loss": 0.184, + "step": 89 + }, + { + "epoch": 0.013458950201884253, + "grad_norm": 4.944440623197377, + "learning_rate": 4.477611940298508e-06, + "loss": 0.2406, + "step": 90 + }, + { + "epoch": 0.0136084940930163, + "grad_norm": 3.2771345760625916, + "learning_rate": 4.527363184079602e-06, + "loss": 0.3635, + "step": 91 + }, + { + "epoch": 0.013758037984148347, + "grad_norm": 2.5552685161479913, + "learning_rate": 4.577114427860697e-06, + "loss": 0.3581, + "step": 92 + }, + { + "epoch": 0.013907581875280395, + "grad_norm": 3.825258197515859, + "learning_rate": 4.626865671641791e-06, + "loss": 0.2157, + "step": 93 + }, + { + "epoch": 0.014057125766412442, + "grad_norm": 3.820006828326968, + "learning_rate": 4.676616915422886e-06, + "loss": 0.401, + "step": 94 + }, + { + "epoch": 0.01420666965754449, + "grad_norm": 3.4269639891084056, + "learning_rate": 4.72636815920398e-06, + "loss": 0.21, + "step": 95 + }, + { + "epoch": 0.014356213548676536, + "grad_norm": 3.614177044324435, + "learning_rate": 4.7761194029850745e-06, + "loss": 0.2305, + "step": 96 + }, + { + "epoch": 0.014505757439808583, + "grad_norm": 2.8474787904051633, + "learning_rate": 4.8258706467661695e-06, + "loss": 0.2002, + "step": 97 + }, + { + "epoch": 0.014655301330940632, + "grad_norm": 3.1529185682156333, + "learning_rate": 4.875621890547264e-06, + "loss": 0.3126, + "step": 98 + }, + { + "epoch": 0.014804845222072678, + "grad_norm": 2.805579699726101, + "learning_rate": 4.925373134328359e-06, + "loss": 0.3977, + "step": 99 + }, + { + "epoch": 0.014954389113204725, + "grad_norm": 2.5072872378288134, + "learning_rate": 4.975124378109453e-06, + "loss": 0.1986, + "step": 100 + }, + { + "epoch": 0.015103933004336772, + "grad_norm": 2.8773082972301816, + "learning_rate": 5.024875621890548e-06, + "loss": 0.2421, + "step": 101 + }, + { + "epoch": 0.01525347689546882, + "grad_norm": 2.3650776175631765, + "learning_rate": 5.074626865671642e-06, + "loss": 0.1864, + "step": 102 + }, + { + "epoch": 0.015403020786600868, + "grad_norm": 4.721891286027898, + "learning_rate": 5.124378109452737e-06, + "loss": 0.2939, + "step": 103 + }, + { + "epoch": 0.015552564677732915, + "grad_norm": 2.6753396233648705, + "learning_rate": 5.174129353233831e-06, + "loss": 0.2558, + "step": 104 + }, + { + "epoch": 0.01570210856886496, + "grad_norm": 3.149876968312327, + "learning_rate": 5.2238805970149255e-06, + "loss": 0.3405, + "step": 105 + }, + { + "epoch": 0.01585165245999701, + "grad_norm": 1.6322197066205648, + "learning_rate": 5.2736318407960205e-06, + "loss": 0.1453, + "step": 106 + }, + { + "epoch": 0.016001196351129055, + "grad_norm": 3.3492234789043236, + "learning_rate": 5.323383084577115e-06, + "loss": 0.404, + "step": 107 + }, + { + "epoch": 0.016150740242261104, + "grad_norm": 2.2518951047915157, + "learning_rate": 5.37313432835821e-06, + "loss": 0.2278, + "step": 108 + }, + { + "epoch": 0.016300284133393152, + "grad_norm": 3.0471913491370404, + "learning_rate": 5.422885572139304e-06, + "loss": 0.265, + "step": 109 + }, + { + "epoch": 0.016449828024525198, + "grad_norm": 1.6928519222295142, + "learning_rate": 5.472636815920398e-06, + "loss": 0.2169, + "step": 110 + }, + { + "epoch": 0.016599371915657246, + "grad_norm": 3.265018826674296, + "learning_rate": 5.522388059701493e-06, + "loss": 0.429, + "step": 111 + }, + { + "epoch": 0.01674891580678929, + "grad_norm": 2.637671664378066, + "learning_rate": 5.572139303482588e-06, + "loss": 0.2762, + "step": 112 + }, + { + "epoch": 0.01689845969792134, + "grad_norm": 3.1617986987096134, + "learning_rate": 5.621890547263682e-06, + "loss": 0.4272, + "step": 113 + }, + { + "epoch": 0.01704800358905339, + "grad_norm": 3.0132316717807175, + "learning_rate": 5.671641791044776e-06, + "loss": 0.3644, + "step": 114 + }, + { + "epoch": 0.017197547480185434, + "grad_norm": 2.2850314864309813, + "learning_rate": 5.721393034825871e-06, + "loss": 0.1967, + "step": 115 + }, + { + "epoch": 0.017347091371317482, + "grad_norm": 3.0835871860462314, + "learning_rate": 5.771144278606966e-06, + "loss": 0.2322, + "step": 116 + }, + { + "epoch": 0.017496635262449527, + "grad_norm": 3.5275796788122893, + "learning_rate": 5.820895522388061e-06, + "loss": 0.3543, + "step": 117 + }, + { + "epoch": 0.017646179153581576, + "grad_norm": 3.1301356173345494, + "learning_rate": 5.870646766169155e-06, + "loss": 0.5064, + "step": 118 + }, + { + "epoch": 0.017795723044713625, + "grad_norm": 3.9689250366780313, + "learning_rate": 5.920398009950249e-06, + "loss": 0.8428, + "step": 119 + }, + { + "epoch": 0.01794526693584567, + "grad_norm": 2.6992548320472984, + "learning_rate": 5.970149253731343e-06, + "loss": 0.2727, + "step": 120 + }, + { + "epoch": 0.01809481082697772, + "grad_norm": 2.8823271138601414, + "learning_rate": 6.019900497512439e-06, + "loss": 0.3301, + "step": 121 + }, + { + "epoch": 0.018244354718109767, + "grad_norm": 2.652199321292131, + "learning_rate": 6.069651741293533e-06, + "loss": 0.234, + "step": 122 + }, + { + "epoch": 0.018393898609241812, + "grad_norm": 4.008459949806747, + "learning_rate": 6.119402985074627e-06, + "loss": 0.5713, + "step": 123 + }, + { + "epoch": 0.01854344250037386, + "grad_norm": 2.8867543983581236, + "learning_rate": 6.1691542288557215e-06, + "loss": 0.2146, + "step": 124 + }, + { + "epoch": 0.018692986391505906, + "grad_norm": 2.379666412119815, + "learning_rate": 6.218905472636816e-06, + "loss": 0.3812, + "step": 125 + }, + { + "epoch": 0.018842530282637954, + "grad_norm": 2.8364015730213716, + "learning_rate": 6.2686567164179116e-06, + "loss": 0.3729, + "step": 126 + }, + { + "epoch": 0.018992074173770003, + "grad_norm": 2.9731590306978957, + "learning_rate": 6.318407960199006e-06, + "loss": 0.3922, + "step": 127 + }, + { + "epoch": 0.019141618064902048, + "grad_norm": 2.431931443805707, + "learning_rate": 6.3681592039801e-06, + "loss": 0.2316, + "step": 128 + }, + { + "epoch": 0.019291161956034097, + "grad_norm": 2.5964092588685594, + "learning_rate": 6.417910447761194e-06, + "loss": 0.2129, + "step": 129 + }, + { + "epoch": 0.019440705847166142, + "grad_norm": 4.241711858566103, + "learning_rate": 6.46766169154229e-06, + "loss": 0.2677, + "step": 130 + }, + { + "epoch": 0.01959024973829819, + "grad_norm": 3.743763522090278, + "learning_rate": 6.517412935323384e-06, + "loss": 0.7324, + "step": 131 + }, + { + "epoch": 0.01973979362943024, + "grad_norm": 2.325325226468886, + "learning_rate": 6.567164179104478e-06, + "loss": 0.2282, + "step": 132 + }, + { + "epoch": 0.019889337520562284, + "grad_norm": 2.187485810642544, + "learning_rate": 6.6169154228855725e-06, + "loss": 0.3479, + "step": 133 + }, + { + "epoch": 0.020038881411694333, + "grad_norm": 2.555235252803596, + "learning_rate": 6.666666666666667e-06, + "loss": 0.3084, + "step": 134 + }, + { + "epoch": 0.020188425302826378, + "grad_norm": 2.1409254211343405, + "learning_rate": 6.7164179104477625e-06, + "loss": 0.2413, + "step": 135 + }, + { + "epoch": 0.020337969193958427, + "grad_norm": 2.9475030013466292, + "learning_rate": 6.766169154228857e-06, + "loss": 0.5899, + "step": 136 + }, + { + "epoch": 0.020487513085090475, + "grad_norm": 3.161190387153201, + "learning_rate": 6.815920398009951e-06, + "loss": 0.2722, + "step": 137 + }, + { + "epoch": 0.02063705697622252, + "grad_norm": 3.4231688087143786, + "learning_rate": 6.865671641791045e-06, + "loss": 0.25, + "step": 138 + }, + { + "epoch": 0.02078660086735457, + "grad_norm": 2.891852432700459, + "learning_rate": 6.915422885572139e-06, + "loss": 0.5206, + "step": 139 + }, + { + "epoch": 0.020936144758486618, + "grad_norm": 2.4149596821734645, + "learning_rate": 6.965174129353235e-06, + "loss": 0.2792, + "step": 140 + }, + { + "epoch": 0.021085688649618663, + "grad_norm": 2.737327253049286, + "learning_rate": 7.014925373134329e-06, + "loss": 0.1785, + "step": 141 + }, + { + "epoch": 0.02123523254075071, + "grad_norm": 2.271710572333297, + "learning_rate": 7.064676616915423e-06, + "loss": 0.2216, + "step": 142 + }, + { + "epoch": 0.021384776431882756, + "grad_norm": 3.123818135886555, + "learning_rate": 7.114427860696518e-06, + "loss": 0.5292, + "step": 143 + }, + { + "epoch": 0.021534320323014805, + "grad_norm": 3.4353230085188775, + "learning_rate": 7.164179104477612e-06, + "loss": 0.257, + "step": 144 + }, + { + "epoch": 0.021683864214146854, + "grad_norm": 3.292198842322858, + "learning_rate": 7.213930348258708e-06, + "loss": 0.4413, + "step": 145 + }, + { + "epoch": 0.0218334081052789, + "grad_norm": 2.408669543365234, + "learning_rate": 7.263681592039802e-06, + "loss": 0.4034, + "step": 146 + }, + { + "epoch": 0.021982951996410947, + "grad_norm": 2.918318139010717, + "learning_rate": 7.313432835820896e-06, + "loss": 0.1789, + "step": 147 + }, + { + "epoch": 0.022132495887542993, + "grad_norm": 2.016064943310167, + "learning_rate": 7.36318407960199e-06, + "loss": 0.2454, + "step": 148 + }, + { + "epoch": 0.02228203977867504, + "grad_norm": 3.375282717272202, + "learning_rate": 7.412935323383084e-06, + "loss": 0.5047, + "step": 149 + }, + { + "epoch": 0.02243158366980709, + "grad_norm": 2.747548142801912, + "learning_rate": 7.46268656716418e-06, + "loss": 0.3193, + "step": 150 + }, + { + "epoch": 0.022581127560939135, + "grad_norm": 5.014531999850111, + "learning_rate": 7.512437810945274e-06, + "loss": 0.5367, + "step": 151 + }, + { + "epoch": 0.022730671452071183, + "grad_norm": 1.7396197448467992, + "learning_rate": 7.5621890547263685e-06, + "loss": 0.1602, + "step": 152 + }, + { + "epoch": 0.02288021534320323, + "grad_norm": 3.9271159318267452, + "learning_rate": 7.611940298507463e-06, + "loss": 0.2763, + "step": 153 + }, + { + "epoch": 0.023029759234335277, + "grad_norm": 2.093726492507833, + "learning_rate": 7.661691542288557e-06, + "loss": 0.169, + "step": 154 + }, + { + "epoch": 0.023179303125467326, + "grad_norm": 1.5357011381308088, + "learning_rate": 7.711442786069654e-06, + "loss": 0.1619, + "step": 155 + }, + { + "epoch": 0.02332884701659937, + "grad_norm": 2.3824458230974863, + "learning_rate": 7.761194029850747e-06, + "loss": 0.2094, + "step": 156 + }, + { + "epoch": 0.02347839090773142, + "grad_norm": 2.8236663879690784, + "learning_rate": 7.810945273631842e-06, + "loss": 0.3426, + "step": 157 + }, + { + "epoch": 0.023627934798863465, + "grad_norm": 3.1375695638809815, + "learning_rate": 7.860696517412935e-06, + "loss": 0.5518, + "step": 158 + }, + { + "epoch": 0.023777478689995513, + "grad_norm": 3.2182906468856105, + "learning_rate": 7.91044776119403e-06, + "loss": 0.1995, + "step": 159 + }, + { + "epoch": 0.023927022581127562, + "grad_norm": 14.749841980168513, + "learning_rate": 7.960199004975125e-06, + "loss": 0.5578, + "step": 160 + }, + { + "epoch": 0.024076566472259607, + "grad_norm": 3.0100123201004045, + "learning_rate": 8.00995024875622e-06, + "loss": 0.5091, + "step": 161 + }, + { + "epoch": 0.024226110363391656, + "grad_norm": 3.5091520525666433, + "learning_rate": 8.059701492537314e-06, + "loss": 0.5357, + "step": 162 + }, + { + "epoch": 0.024375654254523704, + "grad_norm": 2.934851375582722, + "learning_rate": 8.109452736318409e-06, + "loss": 0.2267, + "step": 163 + }, + { + "epoch": 0.02452519814565575, + "grad_norm": 2.5911339240383544, + "learning_rate": 8.159203980099502e-06, + "loss": 0.1782, + "step": 164 + }, + { + "epoch": 0.024674742036787798, + "grad_norm": 2.847206263316536, + "learning_rate": 8.208955223880599e-06, + "loss": 0.2252, + "step": 165 + }, + { + "epoch": 0.024824285927919843, + "grad_norm": 3.5380431553535976, + "learning_rate": 8.258706467661692e-06, + "loss": 0.4295, + "step": 166 + }, + { + "epoch": 0.024973829819051892, + "grad_norm": 3.150492354924513, + "learning_rate": 8.308457711442787e-06, + "loss": 0.3276, + "step": 167 + }, + { + "epoch": 0.02512337371018394, + "grad_norm": 3.114695975436696, + "learning_rate": 8.35820895522388e-06, + "loss": 0.5181, + "step": 168 + }, + { + "epoch": 0.025272917601315985, + "grad_norm": 2.6180846619509355, + "learning_rate": 8.407960199004975e-06, + "loss": 0.2577, + "step": 169 + }, + { + "epoch": 0.025422461492448034, + "grad_norm": 1.859950631659999, + "learning_rate": 8.45771144278607e-06, + "loss": 0.1838, + "step": 170 + }, + { + "epoch": 0.02557200538358008, + "grad_norm": 4.092195798232618, + "learning_rate": 8.507462686567165e-06, + "loss": 0.2676, + "step": 171 + }, + { + "epoch": 0.025721549274712128, + "grad_norm": 2.0820308098425766, + "learning_rate": 8.557213930348259e-06, + "loss": 0.2528, + "step": 172 + }, + { + "epoch": 0.025871093165844176, + "grad_norm": 2.8153771201369087, + "learning_rate": 8.606965174129354e-06, + "loss": 0.3374, + "step": 173 + }, + { + "epoch": 0.02602063705697622, + "grad_norm": 2.6417342231989114, + "learning_rate": 8.656716417910447e-06, + "loss": 0.4309, + "step": 174 + }, + { + "epoch": 0.02617018094810827, + "grad_norm": 3.3553357791865825, + "learning_rate": 8.706467661691544e-06, + "loss": 0.279, + "step": 175 + }, + { + "epoch": 0.026319724839240315, + "grad_norm": 2.5896987414147707, + "learning_rate": 8.756218905472637e-06, + "loss": 0.2505, + "step": 176 + }, + { + "epoch": 0.026469268730372364, + "grad_norm": 15.917959164107543, + "learning_rate": 8.805970149253732e-06, + "loss": 0.3903, + "step": 177 + }, + { + "epoch": 0.026618812621504413, + "grad_norm": 1.897502276352634, + "learning_rate": 8.855721393034826e-06, + "loss": 0.3051, + "step": 178 + }, + { + "epoch": 0.026768356512636458, + "grad_norm": 3.498345426750877, + "learning_rate": 8.905472636815922e-06, + "loss": 0.8122, + "step": 179 + }, + { + "epoch": 0.026917900403768506, + "grad_norm": 3.2270107650642297, + "learning_rate": 8.955223880597016e-06, + "loss": 0.2312, + "step": 180 + }, + { + "epoch": 0.027067444294900555, + "grad_norm": 2.373617987334166, + "learning_rate": 9.00497512437811e-06, + "loss": 0.3553, + "step": 181 + }, + { + "epoch": 0.0272169881860326, + "grad_norm": 2.022495433415561, + "learning_rate": 9.054726368159204e-06, + "loss": 0.3372, + "step": 182 + }, + { + "epoch": 0.02736653207716465, + "grad_norm": 2.471303542690233, + "learning_rate": 9.104477611940299e-06, + "loss": 0.2764, + "step": 183 + }, + { + "epoch": 0.027516075968296694, + "grad_norm": 2.170550660433261, + "learning_rate": 9.154228855721394e-06, + "loss": 0.2429, + "step": 184 + }, + { + "epoch": 0.027665619859428742, + "grad_norm": 1.7750572924031363, + "learning_rate": 9.203980099502489e-06, + "loss": 0.1749, + "step": 185 + }, + { + "epoch": 0.02781516375056079, + "grad_norm": 1.9803173977955488, + "learning_rate": 9.253731343283582e-06, + "loss": 0.3061, + "step": 186 + }, + { + "epoch": 0.027964707641692836, + "grad_norm": 2.686793479118654, + "learning_rate": 9.303482587064677e-06, + "loss": 0.2704, + "step": 187 + }, + { + "epoch": 0.028114251532824885, + "grad_norm": 3.0095995560762088, + "learning_rate": 9.353233830845772e-06, + "loss": 0.3935, + "step": 188 + }, + { + "epoch": 0.02826379542395693, + "grad_norm": 3.296780241377357, + "learning_rate": 9.402985074626867e-06, + "loss": 0.4349, + "step": 189 + }, + { + "epoch": 0.02841333931508898, + "grad_norm": 2.0473844316492262, + "learning_rate": 9.45273631840796e-06, + "loss": 0.3594, + "step": 190 + }, + { + "epoch": 0.028562883206221027, + "grad_norm": 2.6746439974295986, + "learning_rate": 9.502487562189056e-06, + "loss": 0.2507, + "step": 191 + }, + { + "epoch": 0.028712427097353072, + "grad_norm": 2.171372767224107, + "learning_rate": 9.552238805970149e-06, + "loss": 0.4442, + "step": 192 + }, + { + "epoch": 0.02886197098848512, + "grad_norm": 3.412610878033882, + "learning_rate": 9.601990049751244e-06, + "loss": 0.5065, + "step": 193 + }, + { + "epoch": 0.029011514879617166, + "grad_norm": 2.5249672849820843, + "learning_rate": 9.651741293532339e-06, + "loss": 0.2775, + "step": 194 + }, + { + "epoch": 0.029161058770749215, + "grad_norm": 1.9244063665371054, + "learning_rate": 9.701492537313434e-06, + "loss": 0.2501, + "step": 195 + }, + { + "epoch": 0.029310602661881263, + "grad_norm": 2.2928756876943788, + "learning_rate": 9.751243781094527e-06, + "loss": 0.391, + "step": 196 + }, + { + "epoch": 0.02946014655301331, + "grad_norm": 3.2090175671059464, + "learning_rate": 9.800995024875622e-06, + "loss": 0.355, + "step": 197 + }, + { + "epoch": 0.029609690444145357, + "grad_norm": 2.564275054094989, + "learning_rate": 9.850746268656717e-06, + "loss": 0.3824, + "step": 198 + }, + { + "epoch": 0.029759234335277406, + "grad_norm": 2.2612313847384473, + "learning_rate": 9.900497512437812e-06, + "loss": 0.255, + "step": 199 + }, + { + "epoch": 0.02990877822640945, + "grad_norm": 2.867410801811384, + "learning_rate": 9.950248756218906e-06, + "loss": 0.2321, + "step": 200 + }, + { + "epoch": 0.0300583221175415, + "grad_norm": 2.7017080308625316, + "learning_rate": 1e-05, + "loss": 0.5355, + "step": 201 + }, + { + "epoch": 0.030207866008673544, + "grad_norm": 1.7563631058650533, + "learning_rate": 9.999999413475907e-06, + "loss": 0.2366, + "step": 202 + }, + { + "epoch": 0.030357409899805593, + "grad_norm": 2.7923486514729134, + "learning_rate": 9.999997653903764e-06, + "loss": 0.5735, + "step": 203 + }, + { + "epoch": 0.03050695379093764, + "grad_norm": 2.5477270678585935, + "learning_rate": 9.999994721283985e-06, + "loss": 0.2316, + "step": 204 + }, + { + "epoch": 0.030656497682069687, + "grad_norm": 1.6435827637040603, + "learning_rate": 9.99999061561726e-06, + "loss": 0.1958, + "step": 205 + }, + { + "epoch": 0.030806041573201735, + "grad_norm": 4.225438559077688, + "learning_rate": 9.999985336904546e-06, + "loss": 0.6052, + "step": 206 + }, + { + "epoch": 0.03095558546433378, + "grad_norm": 2.384218907777814, + "learning_rate": 9.999978885147086e-06, + "loss": 0.382, + "step": 207 + }, + { + "epoch": 0.03110512935546583, + "grad_norm": 3.082533240684358, + "learning_rate": 9.999971260346394e-06, + "loss": 0.4615, + "step": 208 + }, + { + "epoch": 0.03125467324659788, + "grad_norm": 2.126341746782405, + "learning_rate": 9.999962462504259e-06, + "loss": 0.3489, + "step": 209 + }, + { + "epoch": 0.03140421713772992, + "grad_norm": 2.3157719584793974, + "learning_rate": 9.99995249162274e-06, + "loss": 0.351, + "step": 210 + }, + { + "epoch": 0.03155376102886197, + "grad_norm": 3.2569828989709046, + "learning_rate": 9.999941347704183e-06, + "loss": 0.5452, + "step": 211 + }, + { + "epoch": 0.03170330491999402, + "grad_norm": 2.4010549422177747, + "learning_rate": 9.999929030751199e-06, + "loss": 0.5511, + "step": 212 + }, + { + "epoch": 0.031852848811126065, + "grad_norm": 2.2021354319659956, + "learning_rate": 9.999915540766679e-06, + "loss": 0.409, + "step": 213 + }, + { + "epoch": 0.03200239270225811, + "grad_norm": 2.7467598032746467, + "learning_rate": 9.999900877753786e-06, + "loss": 0.2769, + "step": 214 + }, + { + "epoch": 0.03215193659339016, + "grad_norm": 2.250991470386846, + "learning_rate": 9.99988504171596e-06, + "loss": 0.4243, + "step": 215 + }, + { + "epoch": 0.03230148048452221, + "grad_norm": 7.389570164962262, + "learning_rate": 9.999868032656921e-06, + "loss": 0.5661, + "step": 216 + }, + { + "epoch": 0.03245102437565425, + "grad_norm": 2.3232325152419904, + "learning_rate": 9.999849850580653e-06, + "loss": 0.3622, + "step": 217 + }, + { + "epoch": 0.032600568266786305, + "grad_norm": 2.8448629192721153, + "learning_rate": 9.999830495491425e-06, + "loss": 0.5013, + "step": 218 + }, + { + "epoch": 0.03275011215791835, + "grad_norm": 1.9203985094095042, + "learning_rate": 9.99980996739378e-06, + "loss": 0.2597, + "step": 219 + }, + { + "epoch": 0.032899656049050395, + "grad_norm": 2.1343351176097705, + "learning_rate": 9.99978826629253e-06, + "loss": 0.333, + "step": 220 + }, + { + "epoch": 0.03304919994018244, + "grad_norm": 2.675496675158128, + "learning_rate": 9.999765392192766e-06, + "loss": 0.4679, + "step": 221 + }, + { + "epoch": 0.03319874383131449, + "grad_norm": 2.954897252892918, + "learning_rate": 9.99974134509986e-06, + "loss": 0.5779, + "step": 222 + }, + { + "epoch": 0.03334828772244654, + "grad_norm": 3.164155125145253, + "learning_rate": 9.999716125019448e-06, + "loss": 0.5192, + "step": 223 + }, + { + "epoch": 0.03349783161357858, + "grad_norm": 2.9422429580445377, + "learning_rate": 9.99968973195745e-06, + "loss": 0.3514, + "step": 224 + }, + { + "epoch": 0.033647375504710635, + "grad_norm": 2.016818218277119, + "learning_rate": 9.999662165920056e-06, + "loss": 0.3657, + "step": 225 + }, + { + "epoch": 0.03379691939584268, + "grad_norm": 2.805692301474297, + "learning_rate": 9.999633426913733e-06, + "loss": 0.1912, + "step": 226 + }, + { + "epoch": 0.033946463286974725, + "grad_norm": 2.205403428118743, + "learning_rate": 9.999603514945227e-06, + "loss": 0.234, + "step": 227 + }, + { + "epoch": 0.03409600717810678, + "grad_norm": 2.013271573198516, + "learning_rate": 9.999572430021553e-06, + "loss": 0.464, + "step": 228 + }, + { + "epoch": 0.03424555106923882, + "grad_norm": 3.033803346792209, + "learning_rate": 9.999540172150005e-06, + "loss": 0.2599, + "step": 229 + }, + { + "epoch": 0.03439509496037087, + "grad_norm": 2.854186400231596, + "learning_rate": 9.99950674133815e-06, + "loss": 0.6431, + "step": 230 + }, + { + "epoch": 0.03454463885150292, + "grad_norm": 2.162434347622467, + "learning_rate": 9.999472137593829e-06, + "loss": 0.4779, + "step": 231 + }, + { + "epoch": 0.034694182742634964, + "grad_norm": 1.4691335020169023, + "learning_rate": 9.999436360925165e-06, + "loss": 0.1827, + "step": 232 + }, + { + "epoch": 0.03484372663376701, + "grad_norm": 1.6955188606947214, + "learning_rate": 9.99939941134055e-06, + "loss": 0.2336, + "step": 233 + }, + { + "epoch": 0.034993270524899055, + "grad_norm": 2.0710606069082167, + "learning_rate": 9.99936128884865e-06, + "loss": 0.3671, + "step": 234 + }, + { + "epoch": 0.03514281441603111, + "grad_norm": 2.128464465717484, + "learning_rate": 9.999321993458411e-06, + "loss": 0.2928, + "step": 235 + }, + { + "epoch": 0.03529235830716315, + "grad_norm": 1.9685227247781487, + "learning_rate": 9.999281525179054e-06, + "loss": 0.185, + "step": 236 + }, + { + "epoch": 0.0354419021982952, + "grad_norm": 2.3203573768463115, + "learning_rate": 9.99923988402007e-06, + "loss": 0.3733, + "step": 237 + }, + { + "epoch": 0.03559144608942725, + "grad_norm": 2.2161639851963457, + "learning_rate": 9.99919706999123e-06, + "loss": 0.4, + "step": 238 + }, + { + "epoch": 0.035740989980559294, + "grad_norm": 1.551687214387557, + "learning_rate": 9.99915308310258e-06, + "loss": 0.1723, + "step": 239 + }, + { + "epoch": 0.03589053387169134, + "grad_norm": 1.9544776771870587, + "learning_rate": 9.999107923364436e-06, + "loss": 0.2587, + "step": 240 + }, + { + "epoch": 0.03604007776282339, + "grad_norm": 2.1986380601508375, + "learning_rate": 9.999061590787394e-06, + "loss": 0.544, + "step": 241 + }, + { + "epoch": 0.03618962165395544, + "grad_norm": 2.5816888510040457, + "learning_rate": 9.999014085382326e-06, + "loss": 0.4619, + "step": 242 + }, + { + "epoch": 0.03633916554508748, + "grad_norm": 1.8291845348661409, + "learning_rate": 9.998965407160377e-06, + "loss": 0.2052, + "step": 243 + }, + { + "epoch": 0.036488709436219534, + "grad_norm": 3.167062575704647, + "learning_rate": 9.998915556132966e-06, + "loss": 0.6123, + "step": 244 + }, + { + "epoch": 0.03663825332735158, + "grad_norm": 1.8628898225455814, + "learning_rate": 9.99886453231179e-06, + "loss": 0.3634, + "step": 245 + }, + { + "epoch": 0.036787797218483624, + "grad_norm": 1.7903762911789451, + "learning_rate": 9.998812335708818e-06, + "loss": 0.2162, + "step": 246 + }, + { + "epoch": 0.03693734110961567, + "grad_norm": 1.3282642487848175, + "learning_rate": 9.998758966336296e-06, + "loss": 0.1875, + "step": 247 + }, + { + "epoch": 0.03708688500074772, + "grad_norm": 1.8364953512469955, + "learning_rate": 9.998704424206747e-06, + "loss": 0.208, + "step": 248 + }, + { + "epoch": 0.037236428891879766, + "grad_norm": 1.3941303606582691, + "learning_rate": 9.998648709332965e-06, + "loss": 0.1737, + "step": 249 + }, + { + "epoch": 0.03738597278301181, + "grad_norm": 1.7239196409011197, + "learning_rate": 9.998591821728022e-06, + "loss": 0.2339, + "step": 250 + }, + { + "epoch": 0.037535516674143864, + "grad_norm": 2.623262386600702, + "learning_rate": 9.998533761405265e-06, + "loss": 0.3988, + "step": 251 + }, + { + "epoch": 0.03768506056527591, + "grad_norm": 3.0417113736320354, + "learning_rate": 9.998474528378315e-06, + "loss": 0.3998, + "step": 252 + }, + { + "epoch": 0.037834604456407954, + "grad_norm": 2.3389769972346532, + "learning_rate": 9.998414122661066e-06, + "loss": 0.2157, + "step": 253 + }, + { + "epoch": 0.037984148347540006, + "grad_norm": 2.776666496961099, + "learning_rate": 9.998352544267696e-06, + "loss": 0.5598, + "step": 254 + }, + { + "epoch": 0.03813369223867205, + "grad_norm": 2.1472401976055746, + "learning_rate": 9.998289793212645e-06, + "loss": 0.2375, + "step": 255 + }, + { + "epoch": 0.038283236129804096, + "grad_norm": 2.258529852719024, + "learning_rate": 9.99822586951064e-06, + "loss": 0.257, + "step": 256 + }, + { + "epoch": 0.03843278002093614, + "grad_norm": 2.234662282588329, + "learning_rate": 9.998160773176676e-06, + "loss": 0.2513, + "step": 257 + }, + { + "epoch": 0.038582323912068194, + "grad_norm": 1.557075634748184, + "learning_rate": 9.998094504226025e-06, + "loss": 0.2154, + "step": 258 + }, + { + "epoch": 0.03873186780320024, + "grad_norm": 1.2782097805836874, + "learning_rate": 9.998027062674236e-06, + "loss": 0.1997, + "step": 259 + }, + { + "epoch": 0.038881411694332284, + "grad_norm": 1.5754692941437902, + "learning_rate": 9.997958448537129e-06, + "loss": 0.2271, + "step": 260 + }, + { + "epoch": 0.039030955585464336, + "grad_norm": 2.3273358127526516, + "learning_rate": 9.997888661830803e-06, + "loss": 0.4129, + "step": 261 + }, + { + "epoch": 0.03918049947659638, + "grad_norm": 2.5932478274973705, + "learning_rate": 9.997817702571631e-06, + "loss": 0.2762, + "step": 262 + }, + { + "epoch": 0.039330043367728426, + "grad_norm": 1.7415819067090217, + "learning_rate": 9.99774557077626e-06, + "loss": 0.2677, + "step": 263 + }, + { + "epoch": 0.03947958725886048, + "grad_norm": 2.1983315861883974, + "learning_rate": 9.997672266461613e-06, + "loss": 0.3412, + "step": 264 + }, + { + "epoch": 0.03962913114999252, + "grad_norm": 2.8445138272257666, + "learning_rate": 9.997597789644889e-06, + "loss": 0.3471, + "step": 265 + }, + { + "epoch": 0.03977867504112457, + "grad_norm": 2.6658347323464575, + "learning_rate": 9.997522140343558e-06, + "loss": 0.3785, + "step": 266 + }, + { + "epoch": 0.03992821893225662, + "grad_norm": 1.2913669477506569, + "learning_rate": 9.997445318575371e-06, + "loss": 0.2089, + "step": 267 + }, + { + "epoch": 0.040077762823388666, + "grad_norm": 2.440102551085522, + "learning_rate": 9.99736732435835e-06, + "loss": 0.5639, + "step": 268 + }, + { + "epoch": 0.04022730671452071, + "grad_norm": 2.252623935384866, + "learning_rate": 9.997288157710795e-06, + "loss": 0.447, + "step": 269 + }, + { + "epoch": 0.040376850605652756, + "grad_norm": 1.9038309319538977, + "learning_rate": 9.997207818651273e-06, + "loss": 0.2784, + "step": 270 + }, + { + "epoch": 0.04052639449678481, + "grad_norm": 2.05316637395224, + "learning_rate": 9.99712630719864e-06, + "loss": 0.3874, + "step": 271 + }, + { + "epoch": 0.04067593838791685, + "grad_norm": 4.663034399257074, + "learning_rate": 9.997043623372016e-06, + "loss": 0.3558, + "step": 272 + }, + { + "epoch": 0.0408254822790489, + "grad_norm": 2.0324793909935375, + "learning_rate": 9.996959767190799e-06, + "loss": 0.3884, + "step": 273 + }, + { + "epoch": 0.04097502617018095, + "grad_norm": 2.1897027573531003, + "learning_rate": 9.996874738674663e-06, + "loss": 0.2372, + "step": 274 + }, + { + "epoch": 0.041124570061312996, + "grad_norm": 1.9410471939157525, + "learning_rate": 9.996788537843558e-06, + "loss": 0.3478, + "step": 275 + }, + { + "epoch": 0.04127411395244504, + "grad_norm": 3.650983914269082, + "learning_rate": 9.996701164717704e-06, + "loss": 0.4213, + "step": 276 + }, + { + "epoch": 0.04142365784357709, + "grad_norm": 3.067988013237884, + "learning_rate": 9.996612619317602e-06, + "loss": 0.7209, + "step": 277 + }, + { + "epoch": 0.04157320173470914, + "grad_norm": 2.5863303551652033, + "learning_rate": 9.996522901664028e-06, + "loss": 0.5418, + "step": 278 + }, + { + "epoch": 0.04172274562584118, + "grad_norm": 2.1885641779249476, + "learning_rate": 9.996432011778026e-06, + "loss": 0.371, + "step": 279 + }, + { + "epoch": 0.041872289516973235, + "grad_norm": 2.398824728854803, + "learning_rate": 9.99633994968092e-06, + "loss": 0.5508, + "step": 280 + }, + { + "epoch": 0.04202183340810528, + "grad_norm": 1.5732032420608302, + "learning_rate": 9.996246715394314e-06, + "loss": 0.2468, + "step": 281 + }, + { + "epoch": 0.042171377299237325, + "grad_norm": 2.8532279807617944, + "learning_rate": 9.996152308940075e-06, + "loss": 0.5503, + "step": 282 + }, + { + "epoch": 0.04232092119036937, + "grad_norm": 2.4502727303222733, + "learning_rate": 9.996056730340356e-06, + "loss": 0.4046, + "step": 283 + }, + { + "epoch": 0.04247046508150142, + "grad_norm": 1.9272098426705169, + "learning_rate": 9.995959979617578e-06, + "loss": 0.3906, + "step": 284 + }, + { + "epoch": 0.04262000897263347, + "grad_norm": 2.290690335549339, + "learning_rate": 9.995862056794441e-06, + "loss": 0.2464, + "step": 285 + }, + { + "epoch": 0.04276955286376551, + "grad_norm": 1.656564250859485, + "learning_rate": 9.99576296189392e-06, + "loss": 0.1996, + "step": 286 + }, + { + "epoch": 0.042919096754897565, + "grad_norm": 2.1259148220336965, + "learning_rate": 9.995662694939262e-06, + "loss": 0.3994, + "step": 287 + }, + { + "epoch": 0.04306864064602961, + "grad_norm": 2.286901143642134, + "learning_rate": 9.99556125595399e-06, + "loss": 0.4047, + "step": 288 + }, + { + "epoch": 0.043218184537161655, + "grad_norm": 1.3559455912309712, + "learning_rate": 9.995458644961902e-06, + "loss": 0.2228, + "step": 289 + }, + { + "epoch": 0.04336772842829371, + "grad_norm": 2.285750924681825, + "learning_rate": 9.995354861987075e-06, + "loss": 0.2367, + "step": 290 + }, + { + "epoch": 0.04351727231942575, + "grad_norm": 1.923824453592428, + "learning_rate": 9.995249907053854e-06, + "loss": 0.3951, + "step": 291 + }, + { + "epoch": 0.0436668162105578, + "grad_norm": 1.968047953500074, + "learning_rate": 9.995143780186865e-06, + "loss": 0.2149, + "step": 292 + }, + { + "epoch": 0.04381636010168984, + "grad_norm": 2.3975790519132074, + "learning_rate": 9.995036481411005e-06, + "loss": 0.5312, + "step": 293 + }, + { + "epoch": 0.043965903992821895, + "grad_norm": 1.9664546058841197, + "learning_rate": 9.994928010751447e-06, + "loss": 0.4832, + "step": 294 + }, + { + "epoch": 0.04411544788395394, + "grad_norm": 2.1609011533249785, + "learning_rate": 9.994818368233639e-06, + "loss": 0.571, + "step": 295 + }, + { + "epoch": 0.044264991775085985, + "grad_norm": 1.2099666806993736, + "learning_rate": 9.994707553883305e-06, + "loss": 0.1801, + "step": 296 + }, + { + "epoch": 0.04441453566621804, + "grad_norm": 1.8811137964659612, + "learning_rate": 9.994595567726444e-06, + "loss": 0.2708, + "step": 297 + }, + { + "epoch": 0.04456407955735008, + "grad_norm": 1.6387011737954997, + "learning_rate": 9.994482409789329e-06, + "loss": 0.245, + "step": 298 + }, + { + "epoch": 0.04471362344848213, + "grad_norm": 2.4061797367092486, + "learning_rate": 9.994368080098505e-06, + "loss": 0.204, + "step": 299 + }, + { + "epoch": 0.04486316733961418, + "grad_norm": 2.555264958903577, + "learning_rate": 9.994252578680796e-06, + "loss": 0.5251, + "step": 300 + }, + { + "epoch": 0.045012711230746225, + "grad_norm": 3.1965886018503897, + "learning_rate": 9.994135905563302e-06, + "loss": 0.4353, + "step": 301 + }, + { + "epoch": 0.04516225512187827, + "grad_norm": 2.390530599961774, + "learning_rate": 9.994018060773396e-06, + "loss": 0.4199, + "step": 302 + }, + { + "epoch": 0.04531179901301032, + "grad_norm": 2.694731420269419, + "learning_rate": 9.993899044338722e-06, + "loss": 0.4029, + "step": 303 + }, + { + "epoch": 0.04546134290414237, + "grad_norm": 2.5518583518075437, + "learning_rate": 9.993778856287205e-06, + "loss": 0.3712, + "step": 304 + }, + { + "epoch": 0.04561088679527441, + "grad_norm": 1.958382495979976, + "learning_rate": 9.99365749664704e-06, + "loss": 0.3617, + "step": 305 + }, + { + "epoch": 0.04576043068640646, + "grad_norm": 2.299652220902115, + "learning_rate": 9.993534965446701e-06, + "loss": 0.4059, + "step": 306 + }, + { + "epoch": 0.04590997457753851, + "grad_norm": 4.086258301258261, + "learning_rate": 9.993411262714934e-06, + "loss": 0.2774, + "step": 307 + }, + { + "epoch": 0.046059518468670554, + "grad_norm": 2.0081624141767156, + "learning_rate": 9.993286388480763e-06, + "loss": 0.2724, + "step": 308 + }, + { + "epoch": 0.0462090623598026, + "grad_norm": 2.388037596587926, + "learning_rate": 9.993160342773483e-06, + "loss": 0.2706, + "step": 309 + }, + { + "epoch": 0.04635860625093465, + "grad_norm": 1.5868739255084185, + "learning_rate": 9.993033125622665e-06, + "loss": 0.256, + "step": 310 + }, + { + "epoch": 0.0465081501420667, + "grad_norm": 1.8286822342955051, + "learning_rate": 9.992904737058157e-06, + "loss": 0.209, + "step": 311 + }, + { + "epoch": 0.04665769403319874, + "grad_norm": 2.2060332987484306, + "learning_rate": 9.992775177110078e-06, + "loss": 0.4253, + "step": 312 + }, + { + "epoch": 0.046807237924330794, + "grad_norm": 1.39628419375001, + "learning_rate": 9.992644445808826e-06, + "loss": 0.1693, + "step": 313 + }, + { + "epoch": 0.04695678181546284, + "grad_norm": 1.5668060198088787, + "learning_rate": 9.99251254318507e-06, + "loss": 0.24, + "step": 314 + }, + { + "epoch": 0.047106325706594884, + "grad_norm": 1.998270389587923, + "learning_rate": 9.992379469269758e-06, + "loss": 0.2519, + "step": 315 + }, + { + "epoch": 0.04725586959772693, + "grad_norm": 1.9609810436779118, + "learning_rate": 9.99224522409411e-06, + "loss": 0.2023, + "step": 316 + }, + { + "epoch": 0.04740541348885898, + "grad_norm": 1.4580736241239847, + "learning_rate": 9.992109807689619e-06, + "loss": 0.2387, + "step": 317 + }, + { + "epoch": 0.04755495737999103, + "grad_norm": 2.710681694340303, + "learning_rate": 9.991973220088057e-06, + "loss": 0.6738, + "step": 318 + }, + { + "epoch": 0.04770450127112307, + "grad_norm": 1.2469776099691643, + "learning_rate": 9.991835461321466e-06, + "loss": 0.2013, + "step": 319 + }, + { + "epoch": 0.047854045162255124, + "grad_norm": 2.128896128779159, + "learning_rate": 9.99169653142217e-06, + "loss": 0.3432, + "step": 320 + }, + { + "epoch": 0.04800358905338717, + "grad_norm": 1.6053097848087672, + "learning_rate": 9.991556430422759e-06, + "loss": 0.2301, + "step": 321 + }, + { + "epoch": 0.048153132944519214, + "grad_norm": 1.7774787600035602, + "learning_rate": 9.991415158356106e-06, + "loss": 0.2535, + "step": 322 + }, + { + "epoch": 0.048302676835651266, + "grad_norm": 1.449815289318445, + "learning_rate": 9.991272715255351e-06, + "loss": 0.1878, + "step": 323 + }, + { + "epoch": 0.04845222072678331, + "grad_norm": 1.5118547669168991, + "learning_rate": 9.991129101153916e-06, + "loss": 0.3186, + "step": 324 + }, + { + "epoch": 0.048601764617915356, + "grad_norm": 1.461388444407636, + "learning_rate": 9.99098431608549e-06, + "loss": 0.1747, + "step": 325 + }, + { + "epoch": 0.04875130850904741, + "grad_norm": 2.3912366570769974, + "learning_rate": 9.990838360084045e-06, + "loss": 0.5325, + "step": 326 + }, + { + "epoch": 0.048900852400179454, + "grad_norm": 2.5611474084390937, + "learning_rate": 9.990691233183823e-06, + "loss": 0.2606, + "step": 327 + }, + { + "epoch": 0.0490503962913115, + "grad_norm": 2.21899436894442, + "learning_rate": 9.990542935419341e-06, + "loss": 0.4253, + "step": 328 + }, + { + "epoch": 0.049199940182443544, + "grad_norm": 1.6883179263006298, + "learning_rate": 9.99039346682539e-06, + "loss": 0.1768, + "step": 329 + }, + { + "epoch": 0.049349484073575596, + "grad_norm": 3.2358870266119006, + "learning_rate": 9.990242827437036e-06, + "loss": 0.7866, + "step": 330 + }, + { + "epoch": 0.04949902796470764, + "grad_norm": 2.0627143054944153, + "learning_rate": 9.990091017289623e-06, + "loss": 0.3286, + "step": 331 + }, + { + "epoch": 0.049648571855839686, + "grad_norm": 2.1246533005850523, + "learning_rate": 9.989938036418766e-06, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.04979811574697174, + "grad_norm": 2.6250279686209828, + "learning_rate": 9.989783884860355e-06, + "loss": 0.5058, + "step": 333 + }, + { + "epoch": 0.049947659638103784, + "grad_norm": 2.3409062617647627, + "learning_rate": 9.989628562650558e-06, + "loss": 0.2589, + "step": 334 + }, + { + "epoch": 0.05009720352923583, + "grad_norm": 1.835901073337933, + "learning_rate": 9.989472069825811e-06, + "loss": 0.3493, + "step": 335 + }, + { + "epoch": 0.05024674742036788, + "grad_norm": 2.2454393810241298, + "learning_rate": 9.989314406422835e-06, + "loss": 0.4113, + "step": 336 + }, + { + "epoch": 0.050396291311499926, + "grad_norm": 2.2906853778474674, + "learning_rate": 9.989155572478611e-06, + "loss": 0.5289, + "step": 337 + }, + { + "epoch": 0.05054583520263197, + "grad_norm": 2.3899442476389665, + "learning_rate": 9.98899556803041e-06, + "loss": 0.2174, + "step": 338 + }, + { + "epoch": 0.05069537909376402, + "grad_norm": 1.3681982854338133, + "learning_rate": 9.988834393115768e-06, + "loss": 0.2021, + "step": 339 + }, + { + "epoch": 0.05084492298489607, + "grad_norm": 1.5118760155287632, + "learning_rate": 9.988672047772497e-06, + "loss": 0.1927, + "step": 340 + }, + { + "epoch": 0.05099446687602811, + "grad_norm": 2.1144895431001105, + "learning_rate": 9.988508532038685e-06, + "loss": 0.3325, + "step": 341 + }, + { + "epoch": 0.05114401076716016, + "grad_norm": 1.8616803287346595, + "learning_rate": 9.988343845952697e-06, + "loss": 0.3018, + "step": 342 + }, + { + "epoch": 0.05129355465829221, + "grad_norm": 2.787967616575242, + "learning_rate": 9.988177989553167e-06, + "loss": 0.4641, + "step": 343 + }, + { + "epoch": 0.051443098549424256, + "grad_norm": 2.2905797584406242, + "learning_rate": 9.98801096287901e-06, + "loss": 0.5336, + "step": 344 + }, + { + "epoch": 0.0515926424405563, + "grad_norm": 1.769311364935245, + "learning_rate": 9.987842765969408e-06, + "loss": 0.2843, + "step": 345 + }, + { + "epoch": 0.05174218633168835, + "grad_norm": 1.7122732613639495, + "learning_rate": 9.987673398863824e-06, + "loss": 0.2272, + "step": 346 + }, + { + "epoch": 0.0518917302228204, + "grad_norm": 2.328359950454365, + "learning_rate": 9.987502861601991e-06, + "loss": 0.2645, + "step": 347 + }, + { + "epoch": 0.05204127411395244, + "grad_norm": 2.208277642399548, + "learning_rate": 9.987331154223922e-06, + "loss": 0.5877, + "step": 348 + }, + { + "epoch": 0.052190818005084495, + "grad_norm": 2.154817789687723, + "learning_rate": 9.9871582767699e-06, + "loss": 0.3414, + "step": 349 + }, + { + "epoch": 0.05234036189621654, + "grad_norm": 2.0510314098551814, + "learning_rate": 9.986984229280483e-06, + "loss": 0.3981, + "step": 350 + }, + { + "epoch": 0.052489905787348586, + "grad_norm": 2.346735661125246, + "learning_rate": 9.986809011796503e-06, + "loss": 0.6596, + "step": 351 + }, + { + "epoch": 0.05263944967848063, + "grad_norm": 1.641693244293744, + "learning_rate": 9.98663262435907e-06, + "loss": 0.3657, + "step": 352 + }, + { + "epoch": 0.05278899356961268, + "grad_norm": 2.240226359797858, + "learning_rate": 9.986455067009566e-06, + "loss": 0.3706, + "step": 353 + }, + { + "epoch": 0.05293853746074473, + "grad_norm": 2.3791485993411357, + "learning_rate": 9.986276339789648e-06, + "loss": 0.5428, + "step": 354 + }, + { + "epoch": 0.05308808135187677, + "grad_norm": 1.7806897327965683, + "learning_rate": 9.986096442741241e-06, + "loss": 0.2336, + "step": 355 + }, + { + "epoch": 0.053237625243008825, + "grad_norm": 1.8563417208131827, + "learning_rate": 9.98591537590656e-06, + "loss": 0.2129, + "step": 356 + }, + { + "epoch": 0.05338716913414087, + "grad_norm": 2.2115041121315895, + "learning_rate": 9.98573313932808e-06, + "loss": 0.5232, + "step": 357 + }, + { + "epoch": 0.053536713025272915, + "grad_norm": 1.3693709893910027, + "learning_rate": 9.985549733048556e-06, + "loss": 0.3524, + "step": 358 + }, + { + "epoch": 0.05368625691640497, + "grad_norm": 2.033727598383455, + "learning_rate": 9.985365157111017e-06, + "loss": 0.3987, + "step": 359 + }, + { + "epoch": 0.05383580080753701, + "grad_norm": 2.3258255541409505, + "learning_rate": 9.985179411558767e-06, + "loss": 0.5489, + "step": 360 + }, + { + "epoch": 0.05398534469866906, + "grad_norm": 2.0805855861837057, + "learning_rate": 9.984992496435383e-06, + "loss": 0.3982, + "step": 361 + }, + { + "epoch": 0.05413488858980111, + "grad_norm": 1.4938394292792039, + "learning_rate": 9.984804411784717e-06, + "loss": 0.2279, + "step": 362 + }, + { + "epoch": 0.054284432480933155, + "grad_norm": 1.935765339737269, + "learning_rate": 9.984615157650896e-06, + "loss": 0.2208, + "step": 363 + }, + { + "epoch": 0.0544339763720652, + "grad_norm": 2.294825440673555, + "learning_rate": 9.98442473407832e-06, + "loss": 0.4006, + "step": 364 + }, + { + "epoch": 0.054583520263197245, + "grad_norm": 1.7404498428206792, + "learning_rate": 9.984233141111663e-06, + "loss": 0.3859, + "step": 365 + }, + { + "epoch": 0.0547330641543293, + "grad_norm": 2.382616866788976, + "learning_rate": 9.984040378795879e-06, + "loss": 0.5393, + "step": 366 + }, + { + "epoch": 0.05488260804546134, + "grad_norm": 2.121310368782044, + "learning_rate": 9.983846447176186e-06, + "loss": 0.3808, + "step": 367 + }, + { + "epoch": 0.05503215193659339, + "grad_norm": 1.4327836947551182, + "learning_rate": 9.983651346298089e-06, + "loss": 0.21, + "step": 368 + }, + { + "epoch": 0.05518169582772544, + "grad_norm": 1.8551217286702022, + "learning_rate": 9.983455076207353e-06, + "loss": 0.3611, + "step": 369 + }, + { + "epoch": 0.055331239718857485, + "grad_norm": 1.1962615317465979, + "learning_rate": 9.983257636950032e-06, + "loss": 0.1632, + "step": 370 + }, + { + "epoch": 0.05548078360998953, + "grad_norm": 2.210937603202386, + "learning_rate": 9.983059028572443e-06, + "loss": 0.2054, + "step": 371 + }, + { + "epoch": 0.05563032750112158, + "grad_norm": 1.3676870965949202, + "learning_rate": 9.982859251121183e-06, + "loss": 0.2257, + "step": 372 + }, + { + "epoch": 0.05577987139225363, + "grad_norm": 1.877238753038072, + "learning_rate": 9.98265830464312e-06, + "loss": 0.3069, + "step": 373 + }, + { + "epoch": 0.05592941528338567, + "grad_norm": 2.6215120058588743, + "learning_rate": 9.9824561891854e-06, + "loss": 0.3812, + "step": 374 + }, + { + "epoch": 0.056078959174517724, + "grad_norm": 1.5353869053774183, + "learning_rate": 9.982252904795437e-06, + "loss": 0.3038, + "step": 375 + }, + { + "epoch": 0.05622850306564977, + "grad_norm": 1.5387274188562523, + "learning_rate": 9.98204845152093e-06, + "loss": 0.1784, + "step": 376 + }, + { + "epoch": 0.056378046956781815, + "grad_norm": 2.3221296907492444, + "learning_rate": 9.981842829409842e-06, + "loss": 0.4253, + "step": 377 + }, + { + "epoch": 0.05652759084791386, + "grad_norm": 1.8464138105889263, + "learning_rate": 9.981636038510414e-06, + "loss": 0.2137, + "step": 378 + }, + { + "epoch": 0.05667713473904591, + "grad_norm": 1.9213502252741161, + "learning_rate": 9.98142807887116e-06, + "loss": 0.2652, + "step": 379 + }, + { + "epoch": 0.05682667863017796, + "grad_norm": 1.7697460473662174, + "learning_rate": 9.981218950540874e-06, + "loss": 0.2525, + "step": 380 + }, + { + "epoch": 0.05697622252131, + "grad_norm": 2.001502054151958, + "learning_rate": 9.981008653568613e-06, + "loss": 0.3749, + "step": 381 + }, + { + "epoch": 0.057125766412442054, + "grad_norm": 1.7507480997796745, + "learning_rate": 9.98079718800372e-06, + "loss": 0.3293, + "step": 382 + }, + { + "epoch": 0.0572753103035741, + "grad_norm": 1.8995856376763527, + "learning_rate": 9.980584553895805e-06, + "loss": 0.2595, + "step": 383 + }, + { + "epoch": 0.057424854194706144, + "grad_norm": 1.6960817341003291, + "learning_rate": 9.980370751294754e-06, + "loss": 0.3214, + "step": 384 + }, + { + "epoch": 0.057574398085838197, + "grad_norm": 2.747620756274178, + "learning_rate": 9.980155780250728e-06, + "loss": 0.4678, + "step": 385 + }, + { + "epoch": 0.05772394197697024, + "grad_norm": 1.429295181164985, + "learning_rate": 9.979939640814158e-06, + "loss": 0.3417, + "step": 386 + }, + { + "epoch": 0.05787348586810229, + "grad_norm": 1.546941524577904, + "learning_rate": 9.979722333035757e-06, + "loss": 0.3017, + "step": 387 + }, + { + "epoch": 0.05802302975923433, + "grad_norm": 2.3243262803022753, + "learning_rate": 9.979503856966504e-06, + "loss": 0.3906, + "step": 388 + }, + { + "epoch": 0.058172573650366384, + "grad_norm": 1.5367077444523152, + "learning_rate": 9.979284212657658e-06, + "loss": 0.2735, + "step": 389 + }, + { + "epoch": 0.05832211754149843, + "grad_norm": 1.0259751361449947, + "learning_rate": 9.979063400160747e-06, + "loss": 0.1788, + "step": 390 + }, + { + "epoch": 0.058471661432630474, + "grad_norm": 1.7811616961442123, + "learning_rate": 9.97884141952758e-06, + "loss": 0.2071, + "step": 391 + }, + { + "epoch": 0.058621205323762526, + "grad_norm": 2.347009922116326, + "learning_rate": 9.978618270810229e-06, + "loss": 0.4248, + "step": 392 + }, + { + "epoch": 0.05877074921489457, + "grad_norm": 1.3076474084417338, + "learning_rate": 9.978393954061052e-06, + "loss": 0.1771, + "step": 393 + }, + { + "epoch": 0.05892029310602662, + "grad_norm": 2.4165379692755455, + "learning_rate": 9.978168469332677e-06, + "loss": 0.4913, + "step": 394 + }, + { + "epoch": 0.05906983699715867, + "grad_norm": 1.6584516839965744, + "learning_rate": 9.977941816678e-06, + "loss": 0.2292, + "step": 395 + }, + { + "epoch": 0.059219380888290714, + "grad_norm": 1.3323879687206615, + "learning_rate": 9.9777139961502e-06, + "loss": 0.2042, + "step": 396 + }, + { + "epoch": 0.05936892477942276, + "grad_norm": 1.242996863833067, + "learning_rate": 9.977485007802725e-06, + "loss": 0.1759, + "step": 397 + }, + { + "epoch": 0.05951846867055481, + "grad_norm": 2.0289613301318057, + "learning_rate": 9.977254851689297e-06, + "loss": 0.3391, + "step": 398 + }, + { + "epoch": 0.059668012561686856, + "grad_norm": 1.7111890076718022, + "learning_rate": 9.977023527863913e-06, + "loss": 0.318, + "step": 399 + }, + { + "epoch": 0.0598175564528189, + "grad_norm": 2.360289838407607, + "learning_rate": 9.976791036380844e-06, + "loss": 0.7436, + "step": 400 + }, + { + "epoch": 0.059967100343950946, + "grad_norm": 1.6556682149662436, + "learning_rate": 9.976557377294634e-06, + "loss": 0.3579, + "step": 401 + }, + { + "epoch": 0.060116644235083, + "grad_norm": 1.9472299876725607, + "learning_rate": 9.976322550660103e-06, + "loss": 0.3939, + "step": 402 + }, + { + "epoch": 0.060266188126215044, + "grad_norm": 1.2625006623785717, + "learning_rate": 9.976086556532343e-06, + "loss": 0.1777, + "step": 403 + }, + { + "epoch": 0.06041573201734709, + "grad_norm": 2.142440158571368, + "learning_rate": 9.975849394966721e-06, + "loss": 0.4728, + "step": 404 + }, + { + "epoch": 0.06056527590847914, + "grad_norm": 1.3109446375337697, + "learning_rate": 9.975611066018876e-06, + "loss": 0.2035, + "step": 405 + }, + { + "epoch": 0.060714819799611186, + "grad_norm": 1.473069250695052, + "learning_rate": 9.975371569744723e-06, + "loss": 0.2502, + "step": 406 + }, + { + "epoch": 0.06086436369074323, + "grad_norm": 1.4147256960977963, + "learning_rate": 9.975130906200453e-06, + "loss": 0.1861, + "step": 407 + }, + { + "epoch": 0.06101390758187528, + "grad_norm": 1.5107559691714745, + "learning_rate": 9.97488907544252e-06, + "loss": 0.2309, + "step": 408 + }, + { + "epoch": 0.06116345147300733, + "grad_norm": 1.5467720756101462, + "learning_rate": 9.97464607752767e-06, + "loss": 0.235, + "step": 409 + }, + { + "epoch": 0.061312995364139374, + "grad_norm": 1.2901444374034334, + "learning_rate": 9.974401912512905e-06, + "loss": 0.1877, + "step": 410 + }, + { + "epoch": 0.061462539255271426, + "grad_norm": 1.8751659558285558, + "learning_rate": 9.974156580455512e-06, + "loss": 0.2941, + "step": 411 + }, + { + "epoch": 0.06161208314640347, + "grad_norm": 1.2187366523072891, + "learning_rate": 9.973910081413048e-06, + "loss": 0.2, + "step": 412 + }, + { + "epoch": 0.061761627037535516, + "grad_norm": 2.56665763030278, + "learning_rate": 9.973662415443342e-06, + "loss": 0.4259, + "step": 413 + }, + { + "epoch": 0.06191117092866756, + "grad_norm": 1.5201509236946156, + "learning_rate": 9.973413582604502e-06, + "loss": 0.2098, + "step": 414 + }, + { + "epoch": 0.06206071481979961, + "grad_norm": 2.2299268067487183, + "learning_rate": 9.973163582954903e-06, + "loss": 0.5054, + "step": 415 + }, + { + "epoch": 0.06221025871093166, + "grad_norm": 2.195400724979985, + "learning_rate": 9.972912416553202e-06, + "loss": 0.3856, + "step": 416 + }, + { + "epoch": 0.0623598026020637, + "grad_norm": 2.3196273331545876, + "learning_rate": 9.972660083458321e-06, + "loss": 0.5608, + "step": 417 + }, + { + "epoch": 0.06250934649319576, + "grad_norm": 1.6815269422927719, + "learning_rate": 9.97240658372946e-06, + "loss": 0.3682, + "step": 418 + }, + { + "epoch": 0.0626588903843278, + "grad_norm": 1.7582779956751238, + "learning_rate": 9.972151917426095e-06, + "loss": 0.2256, + "step": 419 + }, + { + "epoch": 0.06280843427545985, + "grad_norm": 1.9523974169697056, + "learning_rate": 9.97189608460797e-06, + "loss": 0.2303, + "step": 420 + }, + { + "epoch": 0.06295797816659189, + "grad_norm": 2.120409254412015, + "learning_rate": 9.97163908533511e-06, + "loss": 0.2198, + "step": 421 + }, + { + "epoch": 0.06310752205772394, + "grad_norm": 1.7213130956608376, + "learning_rate": 9.971380919667806e-06, + "loss": 0.3355, + "step": 422 + }, + { + "epoch": 0.063257065948856, + "grad_norm": 1.6609701125154137, + "learning_rate": 9.971121587666627e-06, + "loss": 0.2354, + "step": 423 + }, + { + "epoch": 0.06340660983998804, + "grad_norm": 1.2809919353271448, + "learning_rate": 9.970861089392415e-06, + "loss": 0.2043, + "step": 424 + }, + { + "epoch": 0.06355615373112009, + "grad_norm": 1.137987748410028, + "learning_rate": 9.970599424906285e-06, + "loss": 0.1714, + "step": 425 + }, + { + "epoch": 0.06370569762225213, + "grad_norm": 2.241505455994119, + "learning_rate": 9.970336594269627e-06, + "loss": 0.559, + "step": 426 + }, + { + "epoch": 0.06385524151338418, + "grad_norm": 1.8145782296174282, + "learning_rate": 9.970072597544102e-06, + "loss": 0.4695, + "step": 427 + }, + { + "epoch": 0.06400478540451622, + "grad_norm": 2.6609160560733924, + "learning_rate": 9.96980743479165e-06, + "loss": 0.3927, + "step": 428 + }, + { + "epoch": 0.06415432929564828, + "grad_norm": 1.5902127205656447, + "learning_rate": 9.969541106074477e-06, + "loss": 0.3221, + "step": 429 + }, + { + "epoch": 0.06430387318678032, + "grad_norm": 1.354440824254012, + "learning_rate": 9.969273611455066e-06, + "loss": 0.1982, + "step": 430 + }, + { + "epoch": 0.06445341707791237, + "grad_norm": 2.1796464676908682, + "learning_rate": 9.969004950996175e-06, + "loss": 0.5947, + "step": 431 + }, + { + "epoch": 0.06460296096904442, + "grad_norm": 1.6772295444343943, + "learning_rate": 9.968735124760834e-06, + "loss": 0.3567, + "step": 432 + }, + { + "epoch": 0.06475250486017646, + "grad_norm": 2.326608368656497, + "learning_rate": 9.968464132812348e-06, + "loss": 0.3934, + "step": 433 + }, + { + "epoch": 0.0649020487513085, + "grad_norm": 1.9737750855760885, + "learning_rate": 9.968191975214293e-06, + "loss": 0.3936, + "step": 434 + }, + { + "epoch": 0.06505159264244055, + "grad_norm": 2.09687169461338, + "learning_rate": 9.967918652030522e-06, + "loss": 0.3644, + "step": 435 + }, + { + "epoch": 0.06520113653357261, + "grad_norm": 2.1122151786614967, + "learning_rate": 9.967644163325157e-06, + "loss": 0.2169, + "step": 436 + }, + { + "epoch": 0.06535068042470465, + "grad_norm": 1.8368706867911107, + "learning_rate": 9.967368509162595e-06, + "loss": 0.3956, + "step": 437 + }, + { + "epoch": 0.0655002243158367, + "grad_norm": 1.7823169737575542, + "learning_rate": 9.96709168960751e-06, + "loss": 0.232, + "step": 438 + }, + { + "epoch": 0.06564976820696874, + "grad_norm": 2.1565508943507194, + "learning_rate": 9.966813704724844e-06, + "loss": 0.2228, + "step": 439 + }, + { + "epoch": 0.06579931209810079, + "grad_norm": 2.2075342060994414, + "learning_rate": 9.966534554579816e-06, + "loss": 0.204, + "step": 440 + }, + { + "epoch": 0.06594885598923284, + "grad_norm": 2.0929887441012602, + "learning_rate": 9.966254239237917e-06, + "loss": 0.3946, + "step": 441 + }, + { + "epoch": 0.06609839988036488, + "grad_norm": 2.0382287962872834, + "learning_rate": 9.965972758764912e-06, + "loss": 0.4633, + "step": 442 + }, + { + "epoch": 0.06624794377149694, + "grad_norm": 1.2772439274586147, + "learning_rate": 9.96569011322684e-06, + "loss": 0.1784, + "step": 443 + }, + { + "epoch": 0.06639748766262898, + "grad_norm": 1.1024457344648066, + "learning_rate": 9.965406302690011e-06, + "loss": 0.1625, + "step": 444 + }, + { + "epoch": 0.06654703155376103, + "grad_norm": 1.2184559623271476, + "learning_rate": 9.965121327221007e-06, + "loss": 0.1959, + "step": 445 + }, + { + "epoch": 0.06669657544489307, + "grad_norm": 1.9215235980087064, + "learning_rate": 9.964835186886692e-06, + "loss": 0.2493, + "step": 446 + }, + { + "epoch": 0.06684611933602512, + "grad_norm": 2.1443052954533974, + "learning_rate": 9.964547881754194e-06, + "loss": 0.3611, + "step": 447 + }, + { + "epoch": 0.06699566322715717, + "grad_norm": 2.6967138020110712, + "learning_rate": 9.964259411890918e-06, + "loss": 0.5427, + "step": 448 + }, + { + "epoch": 0.06714520711828922, + "grad_norm": 1.688779610685555, + "learning_rate": 9.96396977736454e-06, + "loss": 0.2569, + "step": 449 + }, + { + "epoch": 0.06729475100942127, + "grad_norm": 2.1241026975378694, + "learning_rate": 9.963678978243014e-06, + "loss": 0.3863, + "step": 450 + }, + { + "epoch": 0.06744429490055331, + "grad_norm": 1.9388647656441462, + "learning_rate": 9.96338701459456e-06, + "loss": 0.2726, + "step": 451 + }, + { + "epoch": 0.06759383879168536, + "grad_norm": 1.4657993620125664, + "learning_rate": 9.963093886487683e-06, + "loss": 0.2338, + "step": 452 + }, + { + "epoch": 0.0677433826828174, + "grad_norm": 2.307173509923502, + "learning_rate": 9.962799593991146e-06, + "loss": 0.8039, + "step": 453 + }, + { + "epoch": 0.06789292657394945, + "grad_norm": 1.2669540134016812, + "learning_rate": 9.962504137173997e-06, + "loss": 0.169, + "step": 454 + }, + { + "epoch": 0.0680424704650815, + "grad_norm": 1.5981790001004936, + "learning_rate": 9.962207516105552e-06, + "loss": 0.2019, + "step": 455 + }, + { + "epoch": 0.06819201435621355, + "grad_norm": 1.740837427237262, + "learning_rate": 9.9619097308554e-06, + "loss": 0.2116, + "step": 456 + }, + { + "epoch": 0.0683415582473456, + "grad_norm": 1.9511590671787182, + "learning_rate": 9.961610781493407e-06, + "loss": 0.2611, + "step": 457 + }, + { + "epoch": 0.06849110213847764, + "grad_norm": 1.9814713665794252, + "learning_rate": 9.961310668089708e-06, + "loss": 0.3714, + "step": 458 + }, + { + "epoch": 0.06864064602960969, + "grad_norm": 2.755804773731971, + "learning_rate": 9.96100939071471e-06, + "loss": 0.5178, + "step": 459 + }, + { + "epoch": 0.06879018992074173, + "grad_norm": 2.5378159735000225, + "learning_rate": 9.960706949439101e-06, + "loss": 0.7334, + "step": 460 + }, + { + "epoch": 0.06893973381187378, + "grad_norm": 2.3557582569765003, + "learning_rate": 9.960403344333832e-06, + "loss": 0.5763, + "step": 461 + }, + { + "epoch": 0.06908927770300584, + "grad_norm": 1.6501148783544786, + "learning_rate": 9.960098575470131e-06, + "loss": 0.3681, + "step": 462 + }, + { + "epoch": 0.06923882159413788, + "grad_norm": 1.3521314881367383, + "learning_rate": 9.959792642919505e-06, + "loss": 0.216, + "step": 463 + }, + { + "epoch": 0.06938836548526993, + "grad_norm": 1.9967115308447656, + "learning_rate": 9.959485546753724e-06, + "loss": 0.4411, + "step": 464 + }, + { + "epoch": 0.06953790937640197, + "grad_norm": 1.6934835527025132, + "learning_rate": 9.959177287044839e-06, + "loss": 0.3013, + "step": 465 + }, + { + "epoch": 0.06968745326753402, + "grad_norm": 2.1881268216288703, + "learning_rate": 9.958867863865168e-06, + "loss": 0.386, + "step": 466 + }, + { + "epoch": 0.06983699715866606, + "grad_norm": 1.746249573857031, + "learning_rate": 9.958557277287307e-06, + "loss": 0.3486, + "step": 467 + }, + { + "epoch": 0.06998654104979811, + "grad_norm": 1.3309239290400467, + "learning_rate": 9.958245527384118e-06, + "loss": 0.2512, + "step": 468 + }, + { + "epoch": 0.07013608494093017, + "grad_norm": 1.780095751208227, + "learning_rate": 9.957932614228746e-06, + "loss": 0.3579, + "step": 469 + }, + { + "epoch": 0.07028562883206221, + "grad_norm": 2.058627302052003, + "learning_rate": 9.957618537894602e-06, + "loss": 0.2234, + "step": 470 + }, + { + "epoch": 0.07043517272319426, + "grad_norm": 2.1643867800571286, + "learning_rate": 9.95730329845537e-06, + "loss": 0.2658, + "step": 471 + }, + { + "epoch": 0.0705847166143263, + "grad_norm": 1.9162877246393155, + "learning_rate": 9.956986895985009e-06, + "loss": 0.3514, + "step": 472 + }, + { + "epoch": 0.07073426050545835, + "grad_norm": 2.0198300655217474, + "learning_rate": 9.95666933055775e-06, + "loss": 0.4191, + "step": 473 + }, + { + "epoch": 0.0708838043965904, + "grad_norm": 1.8174642496449622, + "learning_rate": 9.956350602248095e-06, + "loss": 0.1802, + "step": 474 + }, + { + "epoch": 0.07103334828772245, + "grad_norm": 1.7641599345266465, + "learning_rate": 9.956030711130824e-06, + "loss": 0.2181, + "step": 475 + }, + { + "epoch": 0.0711828921788545, + "grad_norm": 1.5149058769435404, + "learning_rate": 9.955709657280985e-06, + "loss": 0.2068, + "step": 476 + }, + { + "epoch": 0.07133243606998654, + "grad_norm": 2.14267612952952, + "learning_rate": 9.955387440773902e-06, + "loss": 0.2799, + "step": 477 + }, + { + "epoch": 0.07148197996111859, + "grad_norm": 1.8794948861297893, + "learning_rate": 9.955064061685166e-06, + "loss": 0.3437, + "step": 478 + }, + { + "epoch": 0.07163152385225063, + "grad_norm": 1.595856928796192, + "learning_rate": 9.954739520090649e-06, + "loss": 0.1741, + "step": 479 + }, + { + "epoch": 0.07178106774338268, + "grad_norm": 1.4775459266699813, + "learning_rate": 9.95441381606649e-06, + "loss": 0.2009, + "step": 480 + }, + { + "epoch": 0.07193061163451472, + "grad_norm": 1.4624583034603231, + "learning_rate": 9.954086949689102e-06, + "loss": 0.2413, + "step": 481 + }, + { + "epoch": 0.07208015552564678, + "grad_norm": 1.5685428117813849, + "learning_rate": 9.953758921035171e-06, + "loss": 0.2381, + "step": 482 + }, + { + "epoch": 0.07222969941677883, + "grad_norm": 2.0490413587537524, + "learning_rate": 9.953429730181653e-06, + "loss": 0.4092, + "step": 483 + }, + { + "epoch": 0.07237924330791087, + "grad_norm": 2.605633491672469, + "learning_rate": 9.953099377205786e-06, + "loss": 0.56, + "step": 484 + }, + { + "epoch": 0.07252878719904292, + "grad_norm": 1.6836189923086853, + "learning_rate": 9.952767862185071e-06, + "loss": 0.3514, + "step": 485 + }, + { + "epoch": 0.07267833109017496, + "grad_norm": 2.165692386982445, + "learning_rate": 9.952435185197281e-06, + "loss": 0.4363, + "step": 486 + }, + { + "epoch": 0.07282787498130701, + "grad_norm": 2.328987566639375, + "learning_rate": 9.952101346320471e-06, + "loss": 0.5953, + "step": 487 + }, + { + "epoch": 0.07297741887243907, + "grad_norm": 1.857109300243422, + "learning_rate": 9.951766345632957e-06, + "loss": 0.4125, + "step": 488 + }, + { + "epoch": 0.07312696276357111, + "grad_norm": 1.780608988332075, + "learning_rate": 9.951430183213338e-06, + "loss": 0.2793, + "step": 489 + }, + { + "epoch": 0.07327650665470316, + "grad_norm": 1.2718866410706833, + "learning_rate": 9.951092859140479e-06, + "loss": 0.1878, + "step": 490 + }, + { + "epoch": 0.0734260505458352, + "grad_norm": 1.389385388824981, + "learning_rate": 9.95075437349352e-06, + "loss": 0.1922, + "step": 491 + }, + { + "epoch": 0.07357559443696725, + "grad_norm": 1.2364018773804621, + "learning_rate": 9.950414726351873e-06, + "loss": 0.1972, + "step": 492 + }, + { + "epoch": 0.0737251383280993, + "grad_norm": 1.6438922682719497, + "learning_rate": 9.95007391779522e-06, + "loss": 0.3835, + "step": 493 + }, + { + "epoch": 0.07387468221923134, + "grad_norm": 1.9223258334837023, + "learning_rate": 9.949731947903523e-06, + "loss": 0.5421, + "step": 494 + }, + { + "epoch": 0.0740242261103634, + "grad_norm": 2.1294087718057955, + "learning_rate": 9.949388816757009e-06, + "loss": 0.6584, + "step": 495 + }, + { + "epoch": 0.07417377000149544, + "grad_norm": 1.9620720670123732, + "learning_rate": 9.949044524436178e-06, + "loss": 0.3427, + "step": 496 + }, + { + "epoch": 0.07432331389262749, + "grad_norm": 1.8767982308843718, + "learning_rate": 9.948699071021806e-06, + "loss": 0.2221, + "step": 497 + }, + { + "epoch": 0.07447285778375953, + "grad_norm": 1.5717369659821445, + "learning_rate": 9.948352456594938e-06, + "loss": 0.3915, + "step": 498 + }, + { + "epoch": 0.07462240167489158, + "grad_norm": 1.9105988284269253, + "learning_rate": 9.948004681236896e-06, + "loss": 0.4049, + "step": 499 + }, + { + "epoch": 0.07477194556602362, + "grad_norm": 2.051255434710168, + "learning_rate": 9.94765574502927e-06, + "loss": 0.263, + "step": 500 + }, + { + "epoch": 0.07492148945715567, + "grad_norm": 1.1727115808022262, + "learning_rate": 9.947305648053924e-06, + "loss": 0.2061, + "step": 501 + }, + { + "epoch": 0.07507103334828773, + "grad_norm": 2.3851218898633566, + "learning_rate": 9.946954390392995e-06, + "loss": 0.3587, + "step": 502 + }, + { + "epoch": 0.07522057723941977, + "grad_norm": 2.668333899893354, + "learning_rate": 9.94660197212889e-06, + "loss": 0.279, + "step": 503 + }, + { + "epoch": 0.07537012113055182, + "grad_norm": 2.324044177768054, + "learning_rate": 9.946248393344289e-06, + "loss": 0.5219, + "step": 504 + }, + { + "epoch": 0.07551966502168386, + "grad_norm": 2.252535927387564, + "learning_rate": 9.945893654122147e-06, + "loss": 0.4462, + "step": 505 + }, + { + "epoch": 0.07566920891281591, + "grad_norm": 1.2553962948323492, + "learning_rate": 9.945537754545689e-06, + "loss": 0.1829, + "step": 506 + }, + { + "epoch": 0.07581875280394795, + "grad_norm": 2.009514792075129, + "learning_rate": 9.94518069469841e-06, + "loss": 0.334, + "step": 507 + }, + { + "epoch": 0.07596829669508001, + "grad_norm": 1.7045023449590413, + "learning_rate": 9.944822474664082e-06, + "loss": 0.3202, + "step": 508 + }, + { + "epoch": 0.07611784058621206, + "grad_norm": 1.0508191419172128, + "learning_rate": 9.944463094526747e-06, + "loss": 0.205, + "step": 509 + }, + { + "epoch": 0.0762673844773441, + "grad_norm": 1.6097293192900886, + "learning_rate": 9.944102554370718e-06, + "loss": 0.2324, + "step": 510 + }, + { + "epoch": 0.07641692836847615, + "grad_norm": 1.9399148366487866, + "learning_rate": 9.943740854280582e-06, + "loss": 0.4526, + "step": 511 + }, + { + "epoch": 0.07656647225960819, + "grad_norm": 2.0362256511499335, + "learning_rate": 9.943377994341197e-06, + "loss": 0.3979, + "step": 512 + }, + { + "epoch": 0.07671601615074024, + "grad_norm": 1.5296316888698338, + "learning_rate": 9.943013974637693e-06, + "loss": 0.3789, + "step": 513 + }, + { + "epoch": 0.07686556004187228, + "grad_norm": 1.496691000675503, + "learning_rate": 9.942648795255473e-06, + "loss": 0.2497, + "step": 514 + }, + { + "epoch": 0.07701510393300434, + "grad_norm": 1.4146486247851384, + "learning_rate": 9.942282456280212e-06, + "loss": 0.3088, + "step": 515 + }, + { + "epoch": 0.07716464782413639, + "grad_norm": 1.3671722765483707, + "learning_rate": 9.941914957797855e-06, + "loss": 0.2076, + "step": 516 + }, + { + "epoch": 0.07731419171526843, + "grad_norm": 1.8485057563465108, + "learning_rate": 9.941546299894623e-06, + "loss": 0.3676, + "step": 517 + }, + { + "epoch": 0.07746373560640048, + "grad_norm": 2.0438588429845255, + "learning_rate": 9.941176482657005e-06, + "loss": 0.4905, + "step": 518 + }, + { + "epoch": 0.07761327949753252, + "grad_norm": 1.3215533906334498, + "learning_rate": 9.940805506171765e-06, + "loss": 0.2028, + "step": 519 + }, + { + "epoch": 0.07776282338866457, + "grad_norm": 2.499241081917891, + "learning_rate": 9.940433370525937e-06, + "loss": 0.4323, + "step": 520 + }, + { + "epoch": 0.07791236727979663, + "grad_norm": 1.4654220634749195, + "learning_rate": 9.940060075806827e-06, + "loss": 0.1928, + "step": 521 + }, + { + "epoch": 0.07806191117092867, + "grad_norm": 2.32501667334618, + "learning_rate": 9.939685622102013e-06, + "loss": 0.6039, + "step": 522 + }, + { + "epoch": 0.07821145506206072, + "grad_norm": 2.0353313744113644, + "learning_rate": 9.939310009499348e-06, + "loss": 0.434, + "step": 523 + }, + { + "epoch": 0.07836099895319276, + "grad_norm": 1.5916248439200642, + "learning_rate": 9.938933238086952e-06, + "loss": 0.2484, + "step": 524 + }, + { + "epoch": 0.07851054284432481, + "grad_norm": 1.510761606083, + "learning_rate": 9.938555307953221e-06, + "loss": 0.2761, + "step": 525 + }, + { + "epoch": 0.07866008673545685, + "grad_norm": 1.6041562012438388, + "learning_rate": 9.93817621918682e-06, + "loss": 0.3032, + "step": 526 + }, + { + "epoch": 0.0788096306265889, + "grad_norm": 1.5831322947558841, + "learning_rate": 9.937795971876686e-06, + "loss": 0.3486, + "step": 527 + }, + { + "epoch": 0.07895917451772096, + "grad_norm": 2.2247878916503856, + "learning_rate": 9.93741456611203e-06, + "loss": 0.4087, + "step": 528 + }, + { + "epoch": 0.079108718408853, + "grad_norm": 2.152252638423622, + "learning_rate": 9.937032001982334e-06, + "loss": 0.5629, + "step": 529 + }, + { + "epoch": 0.07925826229998505, + "grad_norm": 2.0483514105705525, + "learning_rate": 9.93664827957735e-06, + "loss": 0.5279, + "step": 530 + }, + { + "epoch": 0.07940780619111709, + "grad_norm": 1.2448870158155207, + "learning_rate": 9.936263398987103e-06, + "loss": 0.3744, + "step": 531 + }, + { + "epoch": 0.07955735008224914, + "grad_norm": 0.9489762178863248, + "learning_rate": 9.93587736030189e-06, + "loss": 0.1631, + "step": 532 + }, + { + "epoch": 0.07970689397338118, + "grad_norm": 1.3545590640653586, + "learning_rate": 9.935490163612279e-06, + "loss": 0.1975, + "step": 533 + }, + { + "epoch": 0.07985643786451324, + "grad_norm": 1.3663228011672384, + "learning_rate": 9.93510180900911e-06, + "loss": 0.184, + "step": 534 + }, + { + "epoch": 0.08000598175564529, + "grad_norm": 1.5768436668872405, + "learning_rate": 9.934712296583497e-06, + "loss": 0.3183, + "step": 535 + }, + { + "epoch": 0.08015552564677733, + "grad_norm": 1.926347057489139, + "learning_rate": 9.93432162642682e-06, + "loss": 0.3305, + "step": 536 + }, + { + "epoch": 0.08030506953790938, + "grad_norm": 2.0791782850566474, + "learning_rate": 9.933929798630738e-06, + "loss": 0.5009, + "step": 537 + }, + { + "epoch": 0.08045461342904142, + "grad_norm": 2.1023331544425523, + "learning_rate": 9.933536813287172e-06, + "loss": 0.4292, + "step": 538 + }, + { + "epoch": 0.08060415732017347, + "grad_norm": 2.8605361415271493, + "learning_rate": 9.933142670488324e-06, + "loss": 0.2666, + "step": 539 + }, + { + "epoch": 0.08075370121130551, + "grad_norm": 2.7087693572573968, + "learning_rate": 9.932747370326664e-06, + "loss": 0.2544, + "step": 540 + }, + { + "epoch": 0.08090324510243757, + "grad_norm": 1.5804074183588281, + "learning_rate": 9.932350912894932e-06, + "loss": 0.2089, + "step": 541 + }, + { + "epoch": 0.08105278899356962, + "grad_norm": 1.6448934387271092, + "learning_rate": 9.931953298286141e-06, + "loss": 0.181, + "step": 542 + }, + { + "epoch": 0.08120233288470166, + "grad_norm": 1.373017928034036, + "learning_rate": 9.931554526593576e-06, + "loss": 0.3218, + "step": 543 + }, + { + "epoch": 0.0813518767758337, + "grad_norm": 1.4895748889012388, + "learning_rate": 9.931154597910791e-06, + "loss": 0.2472, + "step": 544 + }, + { + "epoch": 0.08150142066696575, + "grad_norm": 2.064608760225509, + "learning_rate": 9.930753512331615e-06, + "loss": 0.3765, + "step": 545 + }, + { + "epoch": 0.0816509645580978, + "grad_norm": 1.6526846905937504, + "learning_rate": 9.930351269950144e-06, + "loss": 0.3177, + "step": 546 + }, + { + "epoch": 0.08180050844922986, + "grad_norm": 2.047798829134187, + "learning_rate": 9.92994787086075e-06, + "loss": 0.3192, + "step": 547 + }, + { + "epoch": 0.0819500523403619, + "grad_norm": 2.122394373762569, + "learning_rate": 9.929543315158073e-06, + "loss": 0.5554, + "step": 548 + }, + { + "epoch": 0.08209959623149395, + "grad_norm": 2.311960518258969, + "learning_rate": 9.929137602937028e-06, + "loss": 0.3797, + "step": 549 + }, + { + "epoch": 0.08224914012262599, + "grad_norm": 1.8449832380251867, + "learning_rate": 9.928730734292797e-06, + "loss": 0.3894, + "step": 550 + }, + { + "epoch": 0.08239868401375804, + "grad_norm": 1.995255157883457, + "learning_rate": 9.928322709320834e-06, + "loss": 0.3925, + "step": 551 + }, + { + "epoch": 0.08254822790489008, + "grad_norm": 2.755405061449222, + "learning_rate": 9.92791352811687e-06, + "loss": 0.6899, + "step": 552 + }, + { + "epoch": 0.08269777179602213, + "grad_norm": 1.2254981142470793, + "learning_rate": 9.9275031907769e-06, + "loss": 0.2225, + "step": 553 + }, + { + "epoch": 0.08284731568715419, + "grad_norm": 1.9323036995913243, + "learning_rate": 9.927091697397192e-06, + "loss": 0.3865, + "step": 554 + }, + { + "epoch": 0.08299685957828623, + "grad_norm": 2.0962863974348593, + "learning_rate": 9.926679048074289e-06, + "loss": 0.4, + "step": 555 + }, + { + "epoch": 0.08314640346941828, + "grad_norm": 1.5847691098448267, + "learning_rate": 9.926265242904998e-06, + "loss": 0.247, + "step": 556 + }, + { + "epoch": 0.08329594736055032, + "grad_norm": 2.5967594290859903, + "learning_rate": 9.925850281986408e-06, + "loss": 0.2083, + "step": 557 + }, + { + "epoch": 0.08344549125168237, + "grad_norm": 2.0426826933231226, + "learning_rate": 9.925434165415868e-06, + "loss": 0.449, + "step": 558 + }, + { + "epoch": 0.08359503514281441, + "grad_norm": 1.7693278888452375, + "learning_rate": 9.925016893291007e-06, + "loss": 0.2789, + "step": 559 + }, + { + "epoch": 0.08374457903394647, + "grad_norm": 1.6227416269049326, + "learning_rate": 9.924598465709717e-06, + "loss": 0.2209, + "step": 560 + }, + { + "epoch": 0.08389412292507852, + "grad_norm": 1.7055307729140163, + "learning_rate": 9.924178882770166e-06, + "loss": 0.3554, + "step": 561 + }, + { + "epoch": 0.08404366681621056, + "grad_norm": 1.9245436136675982, + "learning_rate": 9.923758144570792e-06, + "loss": 0.5343, + "step": 562 + }, + { + "epoch": 0.0841932107073426, + "grad_norm": 1.3916186974123048, + "learning_rate": 9.923336251210306e-06, + "loss": 0.2328, + "step": 563 + }, + { + "epoch": 0.08434275459847465, + "grad_norm": 1.8724253939088875, + "learning_rate": 9.92291320278769e-06, + "loss": 0.2691, + "step": 564 + }, + { + "epoch": 0.0844922984896067, + "grad_norm": 1.545927153493535, + "learning_rate": 9.922488999402191e-06, + "loss": 0.2049, + "step": 565 + }, + { + "epoch": 0.08464184238073874, + "grad_norm": 2.216312298348258, + "learning_rate": 9.922063641153332e-06, + "loss": 0.5844, + "step": 566 + }, + { + "epoch": 0.0847913862718708, + "grad_norm": 1.2444734652143745, + "learning_rate": 9.921637128140909e-06, + "loss": 0.2872, + "step": 567 + }, + { + "epoch": 0.08494093016300285, + "grad_norm": 2.133851301389792, + "learning_rate": 9.921209460464983e-06, + "loss": 0.2418, + "step": 568 + }, + { + "epoch": 0.08509047405413489, + "grad_norm": 1.5462263702909163, + "learning_rate": 9.92078063822589e-06, + "loss": 0.3438, + "step": 569 + }, + { + "epoch": 0.08524001794526694, + "grad_norm": 2.341879963295622, + "learning_rate": 9.920350661524237e-06, + "loss": 0.5783, + "step": 570 + }, + { + "epoch": 0.08538956183639898, + "grad_norm": 1.7633187330163729, + "learning_rate": 9.919919530460899e-06, + "loss": 0.3503, + "step": 571 + }, + { + "epoch": 0.08553910572753103, + "grad_norm": 2.1676160714531107, + "learning_rate": 9.919487245137024e-06, + "loss": 0.2098, + "step": 572 + }, + { + "epoch": 0.08568864961866307, + "grad_norm": 2.198855334486466, + "learning_rate": 9.919053805654029e-06, + "loss": 0.3876, + "step": 573 + }, + { + "epoch": 0.08583819350979513, + "grad_norm": 1.821472616891953, + "learning_rate": 9.918619212113607e-06, + "loss": 0.391, + "step": 574 + }, + { + "epoch": 0.08598773740092717, + "grad_norm": 1.4553776733520012, + "learning_rate": 9.918183464617714e-06, + "loss": 0.2032, + "step": 575 + }, + { + "epoch": 0.08613728129205922, + "grad_norm": 1.5817735791823646, + "learning_rate": 9.917746563268581e-06, + "loss": 0.2658, + "step": 576 + }, + { + "epoch": 0.08628682518319127, + "grad_norm": 2.255323258805483, + "learning_rate": 9.917308508168712e-06, + "loss": 0.39, + "step": 577 + }, + { + "epoch": 0.08643636907432331, + "grad_norm": 1.699175902078527, + "learning_rate": 9.916869299420875e-06, + "loss": 0.1906, + "step": 578 + }, + { + "epoch": 0.08658591296545536, + "grad_norm": 1.5572993513277051, + "learning_rate": 9.916428937128117e-06, + "loss": 0.3438, + "step": 579 + }, + { + "epoch": 0.08673545685658741, + "grad_norm": 1.5095119263162684, + "learning_rate": 9.915987421393747e-06, + "loss": 0.272, + "step": 580 + }, + { + "epoch": 0.08688500074771946, + "grad_norm": 2.8137128440101735, + "learning_rate": 9.91554475232135e-06, + "loss": 0.3833, + "step": 581 + }, + { + "epoch": 0.0870345446388515, + "grad_norm": 1.845156278788705, + "learning_rate": 9.915100930014786e-06, + "loss": 0.4658, + "step": 582 + }, + { + "epoch": 0.08718408852998355, + "grad_norm": 1.7624433765379017, + "learning_rate": 9.914655954578171e-06, + "loss": 0.3968, + "step": 583 + }, + { + "epoch": 0.0873336324211156, + "grad_norm": 1.7915618837196812, + "learning_rate": 9.914209826115906e-06, + "loss": 0.4901, + "step": 584 + }, + { + "epoch": 0.08748317631224764, + "grad_norm": 1.8335500777788887, + "learning_rate": 9.913762544732654e-06, + "loss": 0.249, + "step": 585 + }, + { + "epoch": 0.08763272020337969, + "grad_norm": 1.5116580783389033, + "learning_rate": 9.913314110533355e-06, + "loss": 0.3999, + "step": 586 + }, + { + "epoch": 0.08778226409451174, + "grad_norm": 1.9828537343745032, + "learning_rate": 9.912864523623214e-06, + "loss": 0.4153, + "step": 587 + }, + { + "epoch": 0.08793180798564379, + "grad_norm": 1.6056147158647165, + "learning_rate": 9.912413784107709e-06, + "loss": 0.357, + "step": 588 + }, + { + "epoch": 0.08808135187677583, + "grad_norm": 1.7642170812152784, + "learning_rate": 9.911961892092587e-06, + "loss": 0.3425, + "step": 589 + }, + { + "epoch": 0.08823089576790788, + "grad_norm": 1.925307511563271, + "learning_rate": 9.911508847683867e-06, + "loss": 0.4476, + "step": 590 + }, + { + "epoch": 0.08838043965903992, + "grad_norm": 1.9824372539957273, + "learning_rate": 9.911054650987837e-06, + "loss": 0.4597, + "step": 591 + }, + { + "epoch": 0.08852998355017197, + "grad_norm": 1.5805088418089035, + "learning_rate": 9.910599302111057e-06, + "loss": 0.1935, + "step": 592 + }, + { + "epoch": 0.08867952744130403, + "grad_norm": 2.157404890931188, + "learning_rate": 9.910142801160355e-06, + "loss": 0.3443, + "step": 593 + }, + { + "epoch": 0.08882907133243607, + "grad_norm": 2.094900000445731, + "learning_rate": 9.909685148242831e-06, + "loss": 0.404, + "step": 594 + }, + { + "epoch": 0.08897861522356812, + "grad_norm": 2.336415519412793, + "learning_rate": 9.909226343465856e-06, + "loss": 0.6382, + "step": 595 + }, + { + "epoch": 0.08912815911470016, + "grad_norm": 2.0552137049182497, + "learning_rate": 9.908766386937067e-06, + "loss": 0.3908, + "step": 596 + }, + { + "epoch": 0.08927770300583221, + "grad_norm": 1.1564393734179468, + "learning_rate": 9.908305278764376e-06, + "loss": 0.2457, + "step": 597 + }, + { + "epoch": 0.08942724689696425, + "grad_norm": 1.8704284289450437, + "learning_rate": 9.907843019055966e-06, + "loss": 0.3604, + "step": 598 + }, + { + "epoch": 0.0895767907880963, + "grad_norm": 1.295042190600909, + "learning_rate": 9.907379607920281e-06, + "loss": 0.2075, + "step": 599 + }, + { + "epoch": 0.08972633467922836, + "grad_norm": 1.8305770820800886, + "learning_rate": 9.90691504546605e-06, + "loss": 0.2698, + "step": 600 + }, + { + "epoch": 0.0898758785703604, + "grad_norm": 1.7240290275544472, + "learning_rate": 9.906449331802256e-06, + "loss": 0.2504, + "step": 601 + }, + { + "epoch": 0.09002542246149245, + "grad_norm": 1.0036789417827203, + "learning_rate": 9.905982467038167e-06, + "loss": 0.195, + "step": 602 + }, + { + "epoch": 0.0901749663526245, + "grad_norm": 1.6777253578130231, + "learning_rate": 9.905514451283308e-06, + "loss": 0.2436, + "step": 603 + }, + { + "epoch": 0.09032451024375654, + "grad_norm": 1.9190873052270145, + "learning_rate": 9.905045284647483e-06, + "loss": 0.4006, + "step": 604 + }, + { + "epoch": 0.09047405413488858, + "grad_norm": 1.77001911452716, + "learning_rate": 9.904574967240764e-06, + "loss": 0.3703, + "step": 605 + }, + { + "epoch": 0.09062359802602064, + "grad_norm": 1.3114492277508998, + "learning_rate": 9.904103499173487e-06, + "loss": 0.2323, + "step": 606 + }, + { + "epoch": 0.09077314191715269, + "grad_norm": 1.6694643051834908, + "learning_rate": 9.90363088055627e-06, + "loss": 0.2881, + "step": 607 + }, + { + "epoch": 0.09092268580828473, + "grad_norm": 1.4448454411512122, + "learning_rate": 9.903157111499988e-06, + "loss": 0.2341, + "step": 608 + }, + { + "epoch": 0.09107222969941678, + "grad_norm": 1.8302982894061834, + "learning_rate": 9.902682192115795e-06, + "loss": 0.3497, + "step": 609 + }, + { + "epoch": 0.09122177359054882, + "grad_norm": 1.4089802820999182, + "learning_rate": 9.902206122515113e-06, + "loss": 0.1565, + "step": 610 + }, + { + "epoch": 0.09137131748168087, + "grad_norm": 2.275670976517465, + "learning_rate": 9.901728902809627e-06, + "loss": 0.482, + "step": 611 + }, + { + "epoch": 0.09152086137281291, + "grad_norm": 2.3916744409549997, + "learning_rate": 9.901250533111301e-06, + "loss": 0.539, + "step": 612 + }, + { + "epoch": 0.09167040526394497, + "grad_norm": 1.110965438282227, + "learning_rate": 9.900771013532367e-06, + "loss": 0.2257, + "step": 613 + }, + { + "epoch": 0.09181994915507702, + "grad_norm": 1.6169969209154105, + "learning_rate": 9.900290344185321e-06, + "loss": 0.2316, + "step": 614 + }, + { + "epoch": 0.09196949304620906, + "grad_norm": 1.390950490331229, + "learning_rate": 9.899808525182935e-06, + "loss": 0.1735, + "step": 615 + }, + { + "epoch": 0.09211903693734111, + "grad_norm": 1.26641152514348, + "learning_rate": 9.899325556638247e-06, + "loss": 0.2269, + "step": 616 + }, + { + "epoch": 0.09226858082847315, + "grad_norm": 1.107259968960053, + "learning_rate": 9.898841438664568e-06, + "loss": 0.2082, + "step": 617 + }, + { + "epoch": 0.0924181247196052, + "grad_norm": 1.6779136428714192, + "learning_rate": 9.898356171375473e-06, + "loss": 0.3744, + "step": 618 + }, + { + "epoch": 0.09256766861073726, + "grad_norm": 1.8012739115801626, + "learning_rate": 9.897869754884816e-06, + "loss": 0.2438, + "step": 619 + }, + { + "epoch": 0.0927172125018693, + "grad_norm": 1.6400812519548655, + "learning_rate": 9.89738218930671e-06, + "loss": 0.3692, + "step": 620 + }, + { + "epoch": 0.09286675639300135, + "grad_norm": 2.7659374426954972, + "learning_rate": 9.896893474755547e-06, + "loss": 0.5873, + "step": 621 + }, + { + "epoch": 0.0930163002841334, + "grad_norm": 3.020452608035097, + "learning_rate": 9.89640361134598e-06, + "loss": 0.4177, + "step": 622 + }, + { + "epoch": 0.09316584417526544, + "grad_norm": 1.4907614824403637, + "learning_rate": 9.895912599192937e-06, + "loss": 0.2516, + "step": 623 + }, + { + "epoch": 0.09331538806639748, + "grad_norm": 1.6636615032724535, + "learning_rate": 9.895420438411616e-06, + "loss": 0.1935, + "step": 624 + }, + { + "epoch": 0.09346493195752953, + "grad_norm": 1.9719905447621995, + "learning_rate": 9.89492712911748e-06, + "loss": 0.2135, + "step": 625 + }, + { + "epoch": 0.09361447584866159, + "grad_norm": 1.3681787330772102, + "learning_rate": 9.894432671426264e-06, + "loss": 0.208, + "step": 626 + }, + { + "epoch": 0.09376401973979363, + "grad_norm": 2.0793649946453043, + "learning_rate": 9.893937065453976e-06, + "loss": 0.3719, + "step": 627 + }, + { + "epoch": 0.09391356363092568, + "grad_norm": 1.685584025343787, + "learning_rate": 9.893440311316887e-06, + "loss": 0.2164, + "step": 628 + }, + { + "epoch": 0.09406310752205772, + "grad_norm": 1.2145425693019332, + "learning_rate": 9.892942409131541e-06, + "loss": 0.1725, + "step": 629 + }, + { + "epoch": 0.09421265141318977, + "grad_norm": 1.1438517718036314, + "learning_rate": 9.892443359014752e-06, + "loss": 0.2367, + "step": 630 + }, + { + "epoch": 0.09436219530432181, + "grad_norm": 1.4416913213257094, + "learning_rate": 9.8919431610836e-06, + "loss": 0.2254, + "step": 631 + }, + { + "epoch": 0.09451173919545386, + "grad_norm": 1.2656296241346114, + "learning_rate": 9.891441815455436e-06, + "loss": 0.2485, + "step": 632 + }, + { + "epoch": 0.09466128308658592, + "grad_norm": 1.4276056880724206, + "learning_rate": 9.890939322247881e-06, + "loss": 0.1908, + "step": 633 + }, + { + "epoch": 0.09481082697771796, + "grad_norm": 1.8185771152087218, + "learning_rate": 9.890435681578827e-06, + "loss": 0.2096, + "step": 634 + }, + { + "epoch": 0.09496037086885001, + "grad_norm": 1.2794518689910337, + "learning_rate": 9.88993089356643e-06, + "loss": 0.2394, + "step": 635 + }, + { + "epoch": 0.09510991475998205, + "grad_norm": 2.0227594086297738, + "learning_rate": 9.88942495832912e-06, + "loss": 0.59, + "step": 636 + }, + { + "epoch": 0.0952594586511141, + "grad_norm": 1.3323082817593526, + "learning_rate": 9.888917875985593e-06, + "loss": 0.2073, + "step": 637 + }, + { + "epoch": 0.09540900254224614, + "grad_norm": 1.7884206661676574, + "learning_rate": 9.888409646654818e-06, + "loss": 0.3897, + "step": 638 + }, + { + "epoch": 0.0955585464333782, + "grad_norm": 2.124144136353745, + "learning_rate": 9.887900270456025e-06, + "loss": 0.5683, + "step": 639 + }, + { + "epoch": 0.09570809032451025, + "grad_norm": 1.4793433841619534, + "learning_rate": 9.887389747508725e-06, + "loss": 0.3727, + "step": 640 + }, + { + "epoch": 0.09585763421564229, + "grad_norm": 1.0661747667222115, + "learning_rate": 9.88687807793269e-06, + "loss": 0.1983, + "step": 641 + }, + { + "epoch": 0.09600717810677434, + "grad_norm": 1.615153009655538, + "learning_rate": 9.886365261847957e-06, + "loss": 0.3675, + "step": 642 + }, + { + "epoch": 0.09615672199790638, + "grad_norm": 1.4963878387365324, + "learning_rate": 9.885851299374844e-06, + "loss": 0.1805, + "step": 643 + }, + { + "epoch": 0.09630626588903843, + "grad_norm": 1.8529323065992462, + "learning_rate": 9.88533619063393e-06, + "loss": 0.391, + "step": 644 + }, + { + "epoch": 0.09645580978017047, + "grad_norm": 2.4764246014732145, + "learning_rate": 9.884819935746063e-06, + "loss": 0.2605, + "step": 645 + }, + { + "epoch": 0.09660535367130253, + "grad_norm": 1.904672440883197, + "learning_rate": 9.884302534832361e-06, + "loss": 0.3935, + "step": 646 + }, + { + "epoch": 0.09675489756243458, + "grad_norm": 1.9431435460380113, + "learning_rate": 9.883783988014216e-06, + "loss": 0.2092, + "step": 647 + }, + { + "epoch": 0.09690444145356662, + "grad_norm": 2.0946695671241553, + "learning_rate": 9.883264295413278e-06, + "loss": 0.3957, + "step": 648 + }, + { + "epoch": 0.09705398534469867, + "grad_norm": 1.0944344711946927, + "learning_rate": 9.882743457151476e-06, + "loss": 0.202, + "step": 649 + }, + { + "epoch": 0.09720352923583071, + "grad_norm": 1.5147259026498003, + "learning_rate": 9.882221473351e-06, + "loss": 0.3029, + "step": 650 + }, + { + "epoch": 0.09735307312696276, + "grad_norm": 1.3452835965457643, + "learning_rate": 9.881698344134316e-06, + "loss": 0.2159, + "step": 651 + }, + { + "epoch": 0.09750261701809482, + "grad_norm": 1.7952640402406481, + "learning_rate": 9.881174069624155e-06, + "loss": 0.4006, + "step": 652 + }, + { + "epoch": 0.09765216090922686, + "grad_norm": 2.468540255171398, + "learning_rate": 9.880648649943515e-06, + "loss": 0.4393, + "step": 653 + }, + { + "epoch": 0.09780170480035891, + "grad_norm": 1.5332585075726441, + "learning_rate": 9.880122085215664e-06, + "loss": 0.2401, + "step": 654 + }, + { + "epoch": 0.09795124869149095, + "grad_norm": 1.5882881108110953, + "learning_rate": 9.87959437556414e-06, + "loss": 0.2078, + "step": 655 + }, + { + "epoch": 0.098100792582623, + "grad_norm": 1.7962702189497488, + "learning_rate": 9.87906552111275e-06, + "loss": 0.4793, + "step": 656 + }, + { + "epoch": 0.09825033647375504, + "grad_norm": 1.860004859316795, + "learning_rate": 9.878535521985568e-06, + "loss": 0.2388, + "step": 657 + }, + { + "epoch": 0.09839988036488709, + "grad_norm": 1.9861019609665855, + "learning_rate": 9.878004378306934e-06, + "loss": 0.3721, + "step": 658 + }, + { + "epoch": 0.09854942425601915, + "grad_norm": 1.5404208138898199, + "learning_rate": 9.877472090201463e-06, + "loss": 0.3534, + "step": 659 + }, + { + "epoch": 0.09869896814715119, + "grad_norm": 3.0119825067072306, + "learning_rate": 9.876938657794036e-06, + "loss": 0.6732, + "step": 660 + }, + { + "epoch": 0.09884851203828324, + "grad_norm": 1.5069735817087104, + "learning_rate": 9.876404081209796e-06, + "loss": 0.4004, + "step": 661 + }, + { + "epoch": 0.09899805592941528, + "grad_norm": 1.6856753387650372, + "learning_rate": 9.875868360574164e-06, + "loss": 0.2942, + "step": 662 + }, + { + "epoch": 0.09914759982054733, + "grad_norm": 1.6896901311725145, + "learning_rate": 9.875331496012822e-06, + "loss": 0.239, + "step": 663 + }, + { + "epoch": 0.09929714371167937, + "grad_norm": 2.2770505228904225, + "learning_rate": 9.87479348765173e-06, + "loss": 0.4755, + "step": 664 + }, + { + "epoch": 0.09944668760281143, + "grad_norm": 1.9016485099179228, + "learning_rate": 9.874254335617102e-06, + "loss": 0.4645, + "step": 665 + }, + { + "epoch": 0.09959623149394348, + "grad_norm": 1.6638896812103354, + "learning_rate": 9.873714040035434e-06, + "loss": 0.2512, + "step": 666 + }, + { + "epoch": 0.09974577538507552, + "grad_norm": 1.7233554952000107, + "learning_rate": 9.873172601033482e-06, + "loss": 0.3958, + "step": 667 + }, + { + "epoch": 0.09989531927620757, + "grad_norm": 1.7250170911584946, + "learning_rate": 9.872630018738271e-06, + "loss": 0.3115, + "step": 668 + }, + { + "epoch": 0.10004486316733961, + "grad_norm": 1.8843746906489027, + "learning_rate": 9.872086293277101e-06, + "loss": 0.3789, + "step": 669 + }, + { + "epoch": 0.10019440705847166, + "grad_norm": 1.943275185299739, + "learning_rate": 9.871541424777534e-06, + "loss": 0.4192, + "step": 670 + }, + { + "epoch": 0.1003439509496037, + "grad_norm": 1.4918005726247283, + "learning_rate": 9.870995413367397e-06, + "loss": 0.2538, + "step": 671 + }, + { + "epoch": 0.10049349484073576, + "grad_norm": 1.6441123648652987, + "learning_rate": 9.870448259174791e-06, + "loss": 0.2295, + "step": 672 + }, + { + "epoch": 0.1006430387318678, + "grad_norm": 1.933429186975597, + "learning_rate": 9.86989996232809e-06, + "loss": 0.4015, + "step": 673 + }, + { + "epoch": 0.10079258262299985, + "grad_norm": 1.8125640882474123, + "learning_rate": 9.869350522955921e-06, + "loss": 0.3807, + "step": 674 + }, + { + "epoch": 0.1009421265141319, + "grad_norm": 1.9369733002230116, + "learning_rate": 9.868799941187193e-06, + "loss": 0.5201, + "step": 675 + }, + { + "epoch": 0.10109167040526394, + "grad_norm": 1.5216959755972845, + "learning_rate": 9.868248217151075e-06, + "loss": 0.3624, + "step": 676 + }, + { + "epoch": 0.10124121429639599, + "grad_norm": 1.532054269025379, + "learning_rate": 9.867695350977009e-06, + "loss": 0.2738, + "step": 677 + }, + { + "epoch": 0.10139075818752805, + "grad_norm": 1.9725714032650388, + "learning_rate": 9.867141342794703e-06, + "loss": 0.5802, + "step": 678 + }, + { + "epoch": 0.10154030207866009, + "grad_norm": 1.9107978616944274, + "learning_rate": 9.86658619273413e-06, + "loss": 0.482, + "step": 679 + }, + { + "epoch": 0.10168984596979214, + "grad_norm": 1.919639496784501, + "learning_rate": 9.866029900925535e-06, + "loss": 0.3558, + "step": 680 + }, + { + "epoch": 0.10183938986092418, + "grad_norm": 2.174841069849439, + "learning_rate": 9.865472467499431e-06, + "loss": 0.6996, + "step": 681 + }, + { + "epoch": 0.10198893375205623, + "grad_norm": 2.2558702972279807, + "learning_rate": 9.864913892586596e-06, + "loss": 0.2397, + "step": 682 + }, + { + "epoch": 0.10213847764318827, + "grad_norm": 2.1196800969183105, + "learning_rate": 9.864354176318076e-06, + "loss": 0.3793, + "step": 683 + }, + { + "epoch": 0.10228802153432032, + "grad_norm": 2.165719475550091, + "learning_rate": 9.863793318825186e-06, + "loss": 0.2154, + "step": 684 + }, + { + "epoch": 0.10243756542545238, + "grad_norm": 1.7513134063770632, + "learning_rate": 9.86323132023951e-06, + "loss": 0.3816, + "step": 685 + }, + { + "epoch": 0.10258710931658442, + "grad_norm": 1.7103742255808732, + "learning_rate": 9.862668180692897e-06, + "loss": 0.2469, + "step": 686 + }, + { + "epoch": 0.10273665320771647, + "grad_norm": 1.9784764768939407, + "learning_rate": 9.862103900317467e-06, + "loss": 0.2279, + "step": 687 + }, + { + "epoch": 0.10288619709884851, + "grad_norm": 2.5430996153598877, + "learning_rate": 9.861538479245603e-06, + "loss": 0.4512, + "step": 688 + }, + { + "epoch": 0.10303574098998056, + "grad_norm": 1.3584315188319882, + "learning_rate": 9.86097191760996e-06, + "loss": 0.2521, + "step": 689 + }, + { + "epoch": 0.1031852848811126, + "grad_norm": 1.8041511333081743, + "learning_rate": 9.860404215543458e-06, + "loss": 0.3794, + "step": 690 + }, + { + "epoch": 0.10333482877224466, + "grad_norm": 2.261581805469511, + "learning_rate": 9.859835373179285e-06, + "loss": 0.5264, + "step": 691 + }, + { + "epoch": 0.1034843726633767, + "grad_norm": 1.4531049528328563, + "learning_rate": 9.859265390650897e-06, + "loss": 0.2069, + "step": 692 + }, + { + "epoch": 0.10363391655450875, + "grad_norm": 1.6530791454319427, + "learning_rate": 9.85869426809202e-06, + "loss": 0.2304, + "step": 693 + }, + { + "epoch": 0.1037834604456408, + "grad_norm": 1.5868398701857311, + "learning_rate": 9.85812200563664e-06, + "loss": 0.3894, + "step": 694 + }, + { + "epoch": 0.10393300433677284, + "grad_norm": 1.4690408418702507, + "learning_rate": 9.857548603419019e-06, + "loss": 0.3383, + "step": 695 + }, + { + "epoch": 0.10408254822790489, + "grad_norm": 1.868386725453433, + "learning_rate": 9.856974061573682e-06, + "loss": 0.4666, + "step": 696 + }, + { + "epoch": 0.10423209211903693, + "grad_norm": 1.3771017197315938, + "learning_rate": 9.856398380235422e-06, + "loss": 0.2285, + "step": 697 + }, + { + "epoch": 0.10438163601016899, + "grad_norm": 2.452990479638216, + "learning_rate": 9.855821559539298e-06, + "loss": 0.7219, + "step": 698 + }, + { + "epoch": 0.10453117990130104, + "grad_norm": 1.8742322224001207, + "learning_rate": 9.85524359962064e-06, + "loss": 0.4803, + "step": 699 + }, + { + "epoch": 0.10468072379243308, + "grad_norm": 1.858692042760981, + "learning_rate": 9.854664500615041e-06, + "loss": 0.2273, + "step": 700 + }, + { + "epoch": 0.10483026768356513, + "grad_norm": 1.1355721780236596, + "learning_rate": 9.854084262658365e-06, + "loss": 0.1947, + "step": 701 + }, + { + "epoch": 0.10497981157469717, + "grad_norm": 1.3464195395769243, + "learning_rate": 9.853502885886738e-06, + "loss": 0.1988, + "step": 702 + }, + { + "epoch": 0.10512935546582922, + "grad_norm": 1.204875080370136, + "learning_rate": 9.852920370436561e-06, + "loss": 0.3027, + "step": 703 + }, + { + "epoch": 0.10527889935696126, + "grad_norm": 1.3557124537174092, + "learning_rate": 9.852336716444496e-06, + "loss": 0.2158, + "step": 704 + }, + { + "epoch": 0.10542844324809332, + "grad_norm": 1.5752529363149261, + "learning_rate": 9.851751924047472e-06, + "loss": 0.3324, + "step": 705 + }, + { + "epoch": 0.10557798713922537, + "grad_norm": 1.7915590890665287, + "learning_rate": 9.85116599338269e-06, + "loss": 0.4936, + "step": 706 + }, + { + "epoch": 0.10572753103035741, + "grad_norm": 1.6842493918087815, + "learning_rate": 9.850578924587614e-06, + "loss": 0.4249, + "step": 707 + }, + { + "epoch": 0.10587707492148946, + "grad_norm": 1.781419189683173, + "learning_rate": 9.849990717799975e-06, + "loss": 0.2615, + "step": 708 + }, + { + "epoch": 0.1060266188126215, + "grad_norm": 1.4202393409091985, + "learning_rate": 9.849401373157772e-06, + "loss": 0.3256, + "step": 709 + }, + { + "epoch": 0.10617616270375355, + "grad_norm": 1.3714522045342281, + "learning_rate": 9.84881089079927e-06, + "loss": 0.219, + "step": 710 + }, + { + "epoch": 0.1063257065948856, + "grad_norm": 1.7391677942386203, + "learning_rate": 9.848219270863005e-06, + "loss": 0.2249, + "step": 711 + }, + { + "epoch": 0.10647525048601765, + "grad_norm": 1.3023890791191592, + "learning_rate": 9.847626513487774e-06, + "loss": 0.3693, + "step": 712 + }, + { + "epoch": 0.1066247943771497, + "grad_norm": 1.7969068078667318, + "learning_rate": 9.847032618812647e-06, + "loss": 0.2298, + "step": 713 + }, + { + "epoch": 0.10677433826828174, + "grad_norm": 2.102291030534645, + "learning_rate": 9.846437586976952e-06, + "loss": 0.4688, + "step": 714 + }, + { + "epoch": 0.10692388215941379, + "grad_norm": 1.072288463866959, + "learning_rate": 9.845841418120295e-06, + "loss": 0.2023, + "step": 715 + }, + { + "epoch": 0.10707342605054583, + "grad_norm": 1.3278088866624802, + "learning_rate": 9.845244112382536e-06, + "loss": 0.3492, + "step": 716 + }, + { + "epoch": 0.10722296994167788, + "grad_norm": 1.3771047197586663, + "learning_rate": 9.844645669903816e-06, + "loss": 0.2152, + "step": 717 + }, + { + "epoch": 0.10737251383280993, + "grad_norm": 1.243148446265919, + "learning_rate": 9.844046090824533e-06, + "loss": 0.2419, + "step": 718 + }, + { + "epoch": 0.10752205772394198, + "grad_norm": 1.3994827626329662, + "learning_rate": 9.843445375285351e-06, + "loss": 0.3578, + "step": 719 + }, + { + "epoch": 0.10767160161507403, + "grad_norm": 2.180600395588636, + "learning_rate": 9.842843523427207e-06, + "loss": 0.4159, + "step": 720 + }, + { + "epoch": 0.10782114550620607, + "grad_norm": 1.853639106134475, + "learning_rate": 9.842240535391301e-06, + "loss": 0.3929, + "step": 721 + }, + { + "epoch": 0.10797068939733812, + "grad_norm": 2.1662710209518306, + "learning_rate": 9.841636411319098e-06, + "loss": 0.298, + "step": 722 + }, + { + "epoch": 0.10812023328847016, + "grad_norm": 1.604340910092426, + "learning_rate": 9.841031151352332e-06, + "loss": 0.2175, + "step": 723 + }, + { + "epoch": 0.10826977717960222, + "grad_norm": 2.486345181702559, + "learning_rate": 9.840424755633002e-06, + "loss": 0.5179, + "step": 724 + }, + { + "epoch": 0.10841932107073426, + "grad_norm": 1.415864057650498, + "learning_rate": 9.83981722430338e-06, + "loss": 0.3539, + "step": 725 + }, + { + "epoch": 0.10856886496186631, + "grad_norm": 1.4949172725362427, + "learning_rate": 9.839208557505989e-06, + "loss": 0.382, + "step": 726 + }, + { + "epoch": 0.10871840885299835, + "grad_norm": 2.2920695398684576, + "learning_rate": 9.838598755383636e-06, + "loss": 0.5086, + "step": 727 + }, + { + "epoch": 0.1088679527441304, + "grad_norm": 1.6667328126020315, + "learning_rate": 9.837987818079382e-06, + "loss": 0.3736, + "step": 728 + }, + { + "epoch": 0.10901749663526245, + "grad_norm": 1.3398213843433537, + "learning_rate": 9.837375745736562e-06, + "loss": 0.1827, + "step": 729 + }, + { + "epoch": 0.10916704052639449, + "grad_norm": 1.9606925081810038, + "learning_rate": 9.83676253849877e-06, + "loss": 0.3992, + "step": 730 + }, + { + "epoch": 0.10931658441752655, + "grad_norm": 1.7774772468148792, + "learning_rate": 9.836148196509875e-06, + "loss": 0.4769, + "step": 731 + }, + { + "epoch": 0.1094661283086586, + "grad_norm": 1.9643639344581283, + "learning_rate": 9.835532719914005e-06, + "loss": 0.6049, + "step": 732 + }, + { + "epoch": 0.10961567219979064, + "grad_norm": 2.205274189695602, + "learning_rate": 9.834916108855557e-06, + "loss": 0.3679, + "step": 733 + }, + { + "epoch": 0.10976521609092268, + "grad_norm": 1.5293962633909763, + "learning_rate": 9.834298363479193e-06, + "loss": 0.3355, + "step": 734 + }, + { + "epoch": 0.10991475998205473, + "grad_norm": 1.985725165029928, + "learning_rate": 9.833679483929846e-06, + "loss": 0.357, + "step": 735 + }, + { + "epoch": 0.11006430387318678, + "grad_norm": 1.938156924090921, + "learning_rate": 9.833059470352705e-06, + "loss": 0.2667, + "step": 736 + }, + { + "epoch": 0.11021384776431883, + "grad_norm": 1.6208704990029696, + "learning_rate": 9.832438322893235e-06, + "loss": 0.2751, + "step": 737 + }, + { + "epoch": 0.11036339165545088, + "grad_norm": 1.6610841289834064, + "learning_rate": 9.831816041697164e-06, + "loss": 0.2319, + "step": 738 + }, + { + "epoch": 0.11051293554658292, + "grad_norm": 2.003195385581558, + "learning_rate": 9.831192626910482e-06, + "loss": 0.3799, + "step": 739 + }, + { + "epoch": 0.11066247943771497, + "grad_norm": 1.8861050802330894, + "learning_rate": 9.83056807867945e-06, + "loss": 0.4804, + "step": 740 + }, + { + "epoch": 0.11081202332884701, + "grad_norm": 1.6483143403386502, + "learning_rate": 9.829942397150593e-06, + "loss": 0.3658, + "step": 741 + }, + { + "epoch": 0.11096156721997906, + "grad_norm": 1.5438600790491723, + "learning_rate": 9.829315582470702e-06, + "loss": 0.2297, + "step": 742 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 1.4215916414139778, + "learning_rate": 9.828687634786834e-06, + "loss": 0.2365, + "step": 743 + }, + { + "epoch": 0.11126065500224316, + "grad_norm": 1.7761192463313074, + "learning_rate": 9.828058554246309e-06, + "loss": 0.3052, + "step": 744 + }, + { + "epoch": 0.11141019889337521, + "grad_norm": 1.6148872971257344, + "learning_rate": 9.82742834099672e-06, + "loss": 0.2199, + "step": 745 + }, + { + "epoch": 0.11155974278450725, + "grad_norm": 2.0162005563823646, + "learning_rate": 9.826796995185916e-06, + "loss": 0.3839, + "step": 746 + }, + { + "epoch": 0.1117092866756393, + "grad_norm": 1.4737452330787222, + "learning_rate": 9.826164516962022e-06, + "loss": 0.1869, + "step": 747 + }, + { + "epoch": 0.11185883056677134, + "grad_norm": 2.102437337141145, + "learning_rate": 9.82553090647342e-06, + "loss": 0.2615, + "step": 748 + }, + { + "epoch": 0.11200837445790339, + "grad_norm": 1.7050095093194846, + "learning_rate": 9.82489616386876e-06, + "loss": 0.2518, + "step": 749 + }, + { + "epoch": 0.11215791834903545, + "grad_norm": 1.5205595562607015, + "learning_rate": 9.824260289296963e-06, + "loss": 0.1792, + "step": 750 + }, + { + "epoch": 0.1123074622401675, + "grad_norm": 1.7480859507023934, + "learning_rate": 9.823623282907207e-06, + "loss": 0.4179, + "step": 751 + }, + { + "epoch": 0.11245700613129954, + "grad_norm": 1.5498394808983003, + "learning_rate": 9.822985144848944e-06, + "loss": 0.3358, + "step": 752 + }, + { + "epoch": 0.11260655002243158, + "grad_norm": 1.6393482758244988, + "learning_rate": 9.822345875271884e-06, + "loss": 0.2149, + "step": 753 + }, + { + "epoch": 0.11275609391356363, + "grad_norm": 1.9754923234031054, + "learning_rate": 9.821705474326006e-06, + "loss": 0.434, + "step": 754 + }, + { + "epoch": 0.11290563780469567, + "grad_norm": 2.2364190645343154, + "learning_rate": 9.821063942161558e-06, + "loss": 0.2228, + "step": 755 + }, + { + "epoch": 0.11305518169582772, + "grad_norm": 1.4307479990009164, + "learning_rate": 9.820421278929045e-06, + "loss": 0.3547, + "step": 756 + }, + { + "epoch": 0.11320472558695978, + "grad_norm": 1.2078809494224174, + "learning_rate": 9.819777484779242e-06, + "loss": 0.2245, + "step": 757 + }, + { + "epoch": 0.11335426947809182, + "grad_norm": 2.6305960032740354, + "learning_rate": 9.819132559863194e-06, + "loss": 0.6771, + "step": 758 + }, + { + "epoch": 0.11350381336922387, + "grad_norm": 1.4792675137281683, + "learning_rate": 9.818486504332203e-06, + "loss": 0.318, + "step": 759 + }, + { + "epoch": 0.11365335726035591, + "grad_norm": 2.127350110714495, + "learning_rate": 9.817839318337839e-06, + "loss": 0.4925, + "step": 760 + }, + { + "epoch": 0.11380290115148796, + "grad_norm": 1.8233415508114148, + "learning_rate": 9.81719100203194e-06, + "loss": 0.2747, + "step": 761 + }, + { + "epoch": 0.11395244504262, + "grad_norm": 2.1187219443475156, + "learning_rate": 9.81654155556661e-06, + "loss": 0.4595, + "step": 762 + }, + { + "epoch": 0.11410198893375205, + "grad_norm": 1.5759792753813915, + "learning_rate": 9.81589097909421e-06, + "loss": 0.3553, + "step": 763 + }, + { + "epoch": 0.11425153282488411, + "grad_norm": 1.708776908270571, + "learning_rate": 9.815239272767373e-06, + "loss": 0.3091, + "step": 764 + }, + { + "epoch": 0.11440107671601615, + "grad_norm": 1.703775430420963, + "learning_rate": 9.814586436738998e-06, + "loss": 0.3728, + "step": 765 + }, + { + "epoch": 0.1145506206071482, + "grad_norm": 1.6198262441466886, + "learning_rate": 9.813932471162245e-06, + "loss": 0.2498, + "step": 766 + }, + { + "epoch": 0.11470016449828024, + "grad_norm": 1.4858642435718663, + "learning_rate": 9.813277376190539e-06, + "loss": 0.2299, + "step": 767 + }, + { + "epoch": 0.11484970838941229, + "grad_norm": 1.8052387881768808, + "learning_rate": 9.812621151977574e-06, + "loss": 0.3834, + "step": 768 + }, + { + "epoch": 0.11499925228054433, + "grad_norm": 1.7902664470941898, + "learning_rate": 9.811963798677306e-06, + "loss": 0.2282, + "step": 769 + }, + { + "epoch": 0.11514879617167639, + "grad_norm": 1.6316784429425562, + "learning_rate": 9.811305316443956e-06, + "loss": 0.2396, + "step": 770 + }, + { + "epoch": 0.11529834006280844, + "grad_norm": 1.414088700773603, + "learning_rate": 9.81064570543201e-06, + "loss": 0.2353, + "step": 771 + }, + { + "epoch": 0.11544788395394048, + "grad_norm": 1.9219176628835946, + "learning_rate": 9.80998496579622e-06, + "loss": 0.3379, + "step": 772 + }, + { + "epoch": 0.11559742784507253, + "grad_norm": 1.113085528787142, + "learning_rate": 9.809323097691602e-06, + "loss": 0.213, + "step": 773 + }, + { + "epoch": 0.11574697173620457, + "grad_norm": 1.6091723360768109, + "learning_rate": 9.808660101273435e-06, + "loss": 0.3457, + "step": 774 + }, + { + "epoch": 0.11589651562733662, + "grad_norm": 1.4626018681006754, + "learning_rate": 9.807995976697267e-06, + "loss": 0.1777, + "step": 775 + }, + { + "epoch": 0.11604605951846866, + "grad_norm": 2.3953869334660522, + "learning_rate": 9.807330724118906e-06, + "loss": 0.449, + "step": 776 + }, + { + "epoch": 0.11619560340960072, + "grad_norm": 1.595059614098865, + "learning_rate": 9.806664343694425e-06, + "loss": 0.3367, + "step": 777 + }, + { + "epoch": 0.11634514730073277, + "grad_norm": 1.1067814732956414, + "learning_rate": 9.805996835580169e-06, + "loss": 0.1828, + "step": 778 + }, + { + "epoch": 0.11649469119186481, + "grad_norm": 1.62643731894747, + "learning_rate": 9.805328199932736e-06, + "loss": 0.2005, + "step": 779 + }, + { + "epoch": 0.11664423508299686, + "grad_norm": 1.5535907543030336, + "learning_rate": 9.804658436908996e-06, + "loss": 0.2635, + "step": 780 + }, + { + "epoch": 0.1167937789741289, + "grad_norm": 1.0891099881486959, + "learning_rate": 9.803987546666083e-06, + "loss": 0.2012, + "step": 781 + }, + { + "epoch": 0.11694332286526095, + "grad_norm": 1.9927493555965012, + "learning_rate": 9.803315529361395e-06, + "loss": 0.5297, + "step": 782 + }, + { + "epoch": 0.11709286675639301, + "grad_norm": 1.6333695435696685, + "learning_rate": 9.802642385152593e-06, + "loss": 0.2959, + "step": 783 + }, + { + "epoch": 0.11724241064752505, + "grad_norm": 2.0905230122366896, + "learning_rate": 9.8019681141976e-06, + "loss": 0.3662, + "step": 784 + }, + { + "epoch": 0.1173919545386571, + "grad_norm": 1.3626106000772258, + "learning_rate": 9.80129271665461e-06, + "loss": 0.2065, + "step": 785 + }, + { + "epoch": 0.11754149842978914, + "grad_norm": 1.7357589724302078, + "learning_rate": 9.800616192682077e-06, + "loss": 0.3269, + "step": 786 + }, + { + "epoch": 0.11769104232092119, + "grad_norm": 1.3095611640218061, + "learning_rate": 9.79993854243872e-06, + "loss": 0.1993, + "step": 787 + }, + { + "epoch": 0.11784058621205323, + "grad_norm": 2.008966146034465, + "learning_rate": 9.799259766083522e-06, + "loss": 0.2346, + "step": 788 + }, + { + "epoch": 0.11799013010318528, + "grad_norm": 1.4166616715548845, + "learning_rate": 9.798579863775733e-06, + "loss": 0.2053, + "step": 789 + }, + { + "epoch": 0.11813967399431734, + "grad_norm": 2.2231308106975742, + "learning_rate": 9.79789883567486e-06, + "loss": 0.3138, + "step": 790 + }, + { + "epoch": 0.11828921788544938, + "grad_norm": 2.104337845030462, + "learning_rate": 9.79721668194068e-06, + "loss": 0.5896, + "step": 791 + }, + { + "epoch": 0.11843876177658143, + "grad_norm": 1.3961164790172769, + "learning_rate": 9.796533402733235e-06, + "loss": 0.2023, + "step": 792 + }, + { + "epoch": 0.11858830566771347, + "grad_norm": 1.9644410315987328, + "learning_rate": 9.79584899821283e-06, + "loss": 0.2389, + "step": 793 + }, + { + "epoch": 0.11873784955884552, + "grad_norm": 1.911243493993425, + "learning_rate": 9.795163468540028e-06, + "loss": 0.2319, + "step": 794 + }, + { + "epoch": 0.11888739344997756, + "grad_norm": 1.5442762653540485, + "learning_rate": 9.794476813875665e-06, + "loss": 0.256, + "step": 795 + }, + { + "epoch": 0.11903693734110962, + "grad_norm": 1.6347185262551664, + "learning_rate": 9.793789034380833e-06, + "loss": 0.3659, + "step": 796 + }, + { + "epoch": 0.11918648123224167, + "grad_norm": 6.93121052791821, + "learning_rate": 9.793100130216895e-06, + "loss": 0.3348, + "step": 797 + }, + { + "epoch": 0.11933602512337371, + "grad_norm": 1.1914924746720745, + "learning_rate": 9.792410101545475e-06, + "loss": 0.2475, + "step": 798 + }, + { + "epoch": 0.11948556901450576, + "grad_norm": 1.4728413245474197, + "learning_rate": 9.791718948528457e-06, + "loss": 0.3569, + "step": 799 + }, + { + "epoch": 0.1196351129056378, + "grad_norm": 2.0173892018585113, + "learning_rate": 9.791026671327996e-06, + "loss": 0.3154, + "step": 800 + }, + { + "epoch": 0.11978465679676985, + "grad_norm": 1.863844432530015, + "learning_rate": 9.790333270106505e-06, + "loss": 0.349, + "step": 801 + }, + { + "epoch": 0.11993420068790189, + "grad_norm": 1.348620907056274, + "learning_rate": 9.789638745026661e-06, + "loss": 0.2553, + "step": 802 + }, + { + "epoch": 0.12008374457903395, + "grad_norm": 1.7207895705367349, + "learning_rate": 9.78894309625141e-06, + "loss": 0.3931, + "step": 803 + }, + { + "epoch": 0.120233288470166, + "grad_norm": 1.3131471894535065, + "learning_rate": 9.788246323943954e-06, + "loss": 0.1473, + "step": 804 + }, + { + "epoch": 0.12038283236129804, + "grad_norm": 1.054327043113717, + "learning_rate": 9.787548428267766e-06, + "loss": 0.1945, + "step": 805 + }, + { + "epoch": 0.12053237625243009, + "grad_norm": 1.9526157668969721, + "learning_rate": 9.786849409386577e-06, + "loss": 0.3906, + "step": 806 + }, + { + "epoch": 0.12068192014356213, + "grad_norm": 1.8267497285704608, + "learning_rate": 9.786149267464382e-06, + "loss": 0.4193, + "step": 807 + }, + { + "epoch": 0.12083146403469418, + "grad_norm": 0.8038770137897923, + "learning_rate": 9.785448002665446e-06, + "loss": 0.2392, + "step": 808 + }, + { + "epoch": 0.12098100792582624, + "grad_norm": 1.5253624272396114, + "learning_rate": 9.784745615154286e-06, + "loss": 0.3366, + "step": 809 + }, + { + "epoch": 0.12113055181695828, + "grad_norm": 2.093861559130543, + "learning_rate": 9.784042105095694e-06, + "loss": 0.4947, + "step": 810 + }, + { + "epoch": 0.12128009570809033, + "grad_norm": 1.4395999315986885, + "learning_rate": 9.78333747265472e-06, + "loss": 0.3721, + "step": 811 + }, + { + "epoch": 0.12142963959922237, + "grad_norm": 1.852326670020495, + "learning_rate": 9.782631717996675e-06, + "loss": 0.4779, + "step": 812 + }, + { + "epoch": 0.12157918349035442, + "grad_norm": 1.6061911802246367, + "learning_rate": 9.781924841287136e-06, + "loss": 0.3634, + "step": 813 + }, + { + "epoch": 0.12172872738148646, + "grad_norm": 1.6480313202927959, + "learning_rate": 9.781216842691945e-06, + "loss": 0.3486, + "step": 814 + }, + { + "epoch": 0.12187827127261851, + "grad_norm": 1.7352908193060639, + "learning_rate": 9.780507722377205e-06, + "loss": 0.2405, + "step": 815 + }, + { + "epoch": 0.12202781516375057, + "grad_norm": 1.819605505850209, + "learning_rate": 9.779797480509281e-06, + "loss": 0.2702, + "step": 816 + }, + { + "epoch": 0.12217735905488261, + "grad_norm": 5.030925574416197, + "learning_rate": 9.779086117254804e-06, + "loss": 0.4802, + "step": 817 + }, + { + "epoch": 0.12232690294601466, + "grad_norm": 1.4714728946101239, + "learning_rate": 9.778373632780665e-06, + "loss": 0.4002, + "step": 818 + }, + { + "epoch": 0.1224764468371467, + "grad_norm": 1.8961195589940971, + "learning_rate": 9.777660027254022e-06, + "loss": 0.5022, + "step": 819 + }, + { + "epoch": 0.12262599072827875, + "grad_norm": 2.1718036722230343, + "learning_rate": 9.776945300842292e-06, + "loss": 0.3274, + "step": 820 + }, + { + "epoch": 0.12277553461941079, + "grad_norm": 1.620505499680087, + "learning_rate": 9.776229453713158e-06, + "loss": 0.2316, + "step": 821 + }, + { + "epoch": 0.12292507851054285, + "grad_norm": 1.6978035349883904, + "learning_rate": 9.775512486034564e-06, + "loss": 0.3388, + "step": 822 + }, + { + "epoch": 0.1230746224016749, + "grad_norm": 1.8097210824887537, + "learning_rate": 9.774794397974715e-06, + "loss": 0.2658, + "step": 823 + }, + { + "epoch": 0.12322416629280694, + "grad_norm": 1.7832381045534218, + "learning_rate": 9.774075189702085e-06, + "loss": 0.236, + "step": 824 + }, + { + "epoch": 0.12337371018393899, + "grad_norm": 1.5481034516154306, + "learning_rate": 9.773354861385408e-06, + "loss": 0.2209, + "step": 825 + }, + { + "epoch": 0.12352325407507103, + "grad_norm": 1.808929914702085, + "learning_rate": 9.772633413193677e-06, + "loss": 0.3936, + "step": 826 + }, + { + "epoch": 0.12367279796620308, + "grad_norm": 1.4632324741175244, + "learning_rate": 9.771910845296151e-06, + "loss": 0.1809, + "step": 827 + }, + { + "epoch": 0.12382234185733512, + "grad_norm": 1.602480536861921, + "learning_rate": 9.771187157862352e-06, + "loss": 0.3631, + "step": 828 + }, + { + "epoch": 0.12397188574846718, + "grad_norm": 1.695314807275002, + "learning_rate": 9.770462351062065e-06, + "loss": 0.3419, + "step": 829 + }, + { + "epoch": 0.12412142963959923, + "grad_norm": 2.2542289621861262, + "learning_rate": 9.769736425065333e-06, + "loss": 0.4292, + "step": 830 + }, + { + "epoch": 0.12427097353073127, + "grad_norm": 1.7697982755032058, + "learning_rate": 9.76900938004247e-06, + "loss": 0.3735, + "step": 831 + }, + { + "epoch": 0.12442051742186332, + "grad_norm": 1.8120887881814032, + "learning_rate": 9.768281216164045e-06, + "loss": 0.3568, + "step": 832 + }, + { + "epoch": 0.12457006131299536, + "grad_norm": 1.581710048140236, + "learning_rate": 9.767551933600896e-06, + "loss": 0.1999, + "step": 833 + }, + { + "epoch": 0.1247196052041274, + "grad_norm": 1.8611636134135094, + "learning_rate": 9.766821532524113e-06, + "loss": 0.4111, + "step": 834 + }, + { + "epoch": 0.12486914909525945, + "grad_norm": 1.3596930734620556, + "learning_rate": 9.76609001310506e-06, + "loss": 0.1893, + "step": 835 + }, + { + "epoch": 0.1250186929863915, + "grad_norm": 1.7562268713789106, + "learning_rate": 9.76535737551536e-06, + "loss": 0.3948, + "step": 836 + }, + { + "epoch": 0.12516823687752354, + "grad_norm": 2.0138359003020136, + "learning_rate": 9.764623619926891e-06, + "loss": 0.2182, + "step": 837 + }, + { + "epoch": 0.1253177807686556, + "grad_norm": 1.447071144741178, + "learning_rate": 9.763888746511804e-06, + "loss": 0.2027, + "step": 838 + }, + { + "epoch": 0.12546732465978766, + "grad_norm": 2.1652568729944734, + "learning_rate": 9.763152755442504e-06, + "loss": 0.6314, + "step": 839 + }, + { + "epoch": 0.1256168685509197, + "grad_norm": 1.8038679466625127, + "learning_rate": 9.762415646891665e-06, + "loss": 0.3578, + "step": 840 + }, + { + "epoch": 0.12576641244205175, + "grad_norm": 1.6058605347195138, + "learning_rate": 9.761677421032218e-06, + "loss": 0.4411, + "step": 841 + }, + { + "epoch": 0.12591595633318378, + "grad_norm": 1.590419871100753, + "learning_rate": 9.760938078037358e-06, + "loss": 0.3562, + "step": 842 + }, + { + "epoch": 0.12606550022431584, + "grad_norm": 2.1851801492136267, + "learning_rate": 9.76019761808054e-06, + "loss": 0.5822, + "step": 843 + }, + { + "epoch": 0.12621504411544787, + "grad_norm": 1.5855983503039581, + "learning_rate": 9.759456041335487e-06, + "loss": 0.2229, + "step": 844 + }, + { + "epoch": 0.12636458800657993, + "grad_norm": 1.370525319712137, + "learning_rate": 9.758713347976179e-06, + "loss": 0.2126, + "step": 845 + }, + { + "epoch": 0.126514131897712, + "grad_norm": 6.394283226949693, + "learning_rate": 9.757969538176856e-06, + "loss": 0.5925, + "step": 846 + }, + { + "epoch": 0.12666367578884402, + "grad_norm": 1.6599084242802136, + "learning_rate": 9.757224612112026e-06, + "loss": 0.2939, + "step": 847 + }, + { + "epoch": 0.12681321967997608, + "grad_norm": 1.7907787465869436, + "learning_rate": 9.756478569956455e-06, + "loss": 0.222, + "step": 848 + }, + { + "epoch": 0.1269627635711081, + "grad_norm": 2.2366122735755707, + "learning_rate": 9.755731411885172e-06, + "loss": 0.6684, + "step": 849 + }, + { + "epoch": 0.12711230746224017, + "grad_norm": 1.2342377688986181, + "learning_rate": 9.754983138073466e-06, + "loss": 0.1731, + "step": 850 + }, + { + "epoch": 0.1272618513533722, + "grad_norm": 1.890953555602396, + "learning_rate": 9.75423374869689e-06, + "loss": 0.3518, + "step": 851 + }, + { + "epoch": 0.12741139524450426, + "grad_norm": 1.6475639435427634, + "learning_rate": 9.75348324393126e-06, + "loss": 0.2398, + "step": 852 + }, + { + "epoch": 0.12756093913563632, + "grad_norm": 1.6270554683226957, + "learning_rate": 9.752731623952647e-06, + "loss": 0.4891, + "step": 853 + }, + { + "epoch": 0.12771048302676835, + "grad_norm": 1.26855312080081, + "learning_rate": 9.751978888937394e-06, + "loss": 0.256, + "step": 854 + }, + { + "epoch": 0.1278600269179004, + "grad_norm": 1.8185769247015318, + "learning_rate": 9.751225039062096e-06, + "loss": 0.4165, + "step": 855 + }, + { + "epoch": 0.12800957080903244, + "grad_norm": 1.751422967721224, + "learning_rate": 9.750470074503616e-06, + "loss": 0.4006, + "step": 856 + }, + { + "epoch": 0.1281591147001645, + "grad_norm": 1.520424463551602, + "learning_rate": 9.749713995439072e-06, + "loss": 0.221, + "step": 857 + }, + { + "epoch": 0.12830865859129656, + "grad_norm": 1.3721869164602227, + "learning_rate": 9.74895680204585e-06, + "loss": 0.2902, + "step": 858 + }, + { + "epoch": 0.1284582024824286, + "grad_norm": 0.9768480150555632, + "learning_rate": 9.748198494501598e-06, + "loss": 0.2115, + "step": 859 + }, + { + "epoch": 0.12860774637356065, + "grad_norm": 1.1411458146693625, + "learning_rate": 9.747439072984217e-06, + "loss": 0.2657, + "step": 860 + }, + { + "epoch": 0.12875729026469268, + "grad_norm": 1.351877109756975, + "learning_rate": 9.746678537671876e-06, + "loss": 0.1998, + "step": 861 + }, + { + "epoch": 0.12890683415582474, + "grad_norm": 1.735519954859839, + "learning_rate": 9.745916888743006e-06, + "loss": 0.3916, + "step": 862 + }, + { + "epoch": 0.12905637804695677, + "grad_norm": 1.5843585668319187, + "learning_rate": 9.745154126376295e-06, + "loss": 0.2412, + "step": 863 + }, + { + "epoch": 0.12920592193808883, + "grad_norm": 1.2682977552751018, + "learning_rate": 9.744390250750694e-06, + "loss": 0.2082, + "step": 864 + }, + { + "epoch": 0.1293554658292209, + "grad_norm": 1.972644277101951, + "learning_rate": 9.74362526204542e-06, + "loss": 0.5327, + "step": 865 + }, + { + "epoch": 0.12950500972035292, + "grad_norm": 1.9379037313358354, + "learning_rate": 9.74285916043994e-06, + "loss": 0.5184, + "step": 866 + }, + { + "epoch": 0.12965455361148498, + "grad_norm": 2.118494372996469, + "learning_rate": 9.742091946113994e-06, + "loss": 0.4367, + "step": 867 + }, + { + "epoch": 0.129804097502617, + "grad_norm": 3.0042960877566904, + "learning_rate": 9.741323619247575e-06, + "loss": 0.2971, + "step": 868 + }, + { + "epoch": 0.12995364139374907, + "grad_norm": 1.4822743195387478, + "learning_rate": 9.740554180020944e-06, + "loss": 0.2324, + "step": 869 + }, + { + "epoch": 0.1301031852848811, + "grad_norm": 1.629265135056018, + "learning_rate": 9.739783628614614e-06, + "loss": 0.3717, + "step": 870 + }, + { + "epoch": 0.13025272917601316, + "grad_norm": 2.663192450334603, + "learning_rate": 9.739011965209366e-06, + "loss": 0.4405, + "step": 871 + }, + { + "epoch": 0.13040227306714522, + "grad_norm": 1.942071044818614, + "learning_rate": 9.738239189986239e-06, + "loss": 0.2141, + "step": 872 + }, + { + "epoch": 0.13055181695827725, + "grad_norm": 3.5555352510097684, + "learning_rate": 9.737465303126533e-06, + "loss": 0.5084, + "step": 873 + }, + { + "epoch": 0.1307013608494093, + "grad_norm": 131.61785973638575, + "learning_rate": 9.736690304811811e-06, + "loss": 0.2431, + "step": 874 + }, + { + "epoch": 0.13085090474054134, + "grad_norm": 1.190310223040302, + "learning_rate": 9.735914195223894e-06, + "loss": 0.1586, + "step": 875 + }, + { + "epoch": 0.1310004486316734, + "grad_norm": 1.8346983968963104, + "learning_rate": 9.735136974544866e-06, + "loss": 0.5247, + "step": 876 + }, + { + "epoch": 0.13114999252280543, + "grad_norm": 1.7905067752668935, + "learning_rate": 9.734358642957068e-06, + "loss": 0.2645, + "step": 877 + }, + { + "epoch": 0.1312995364139375, + "grad_norm": 1.9545038391601572, + "learning_rate": 9.733579200643108e-06, + "loss": 0.3769, + "step": 878 + }, + { + "epoch": 0.13144908030506955, + "grad_norm": 2.1943279379956477, + "learning_rate": 9.732798647785847e-06, + "loss": 0.5142, + "step": 879 + }, + { + "epoch": 0.13159862419620158, + "grad_norm": 1.8055649624971999, + "learning_rate": 9.73201698456841e-06, + "loss": 0.1857, + "step": 880 + }, + { + "epoch": 0.13174816808733364, + "grad_norm": 1.8149442634221358, + "learning_rate": 9.731234211174188e-06, + "loss": 0.2233, + "step": 881 + }, + { + "epoch": 0.13189771197846567, + "grad_norm": 2.080170101944024, + "learning_rate": 9.73045032778682e-06, + "loss": 0.3904, + "step": 882 + }, + { + "epoch": 0.13204725586959773, + "grad_norm": 1.9510038015167501, + "learning_rate": 9.729665334590217e-06, + "loss": 0.3821, + "step": 883 + }, + { + "epoch": 0.13219679976072976, + "grad_norm": 2.1650257376887545, + "learning_rate": 9.728879231768547e-06, + "loss": 0.2357, + "step": 884 + }, + { + "epoch": 0.13234634365186182, + "grad_norm": 1.1711773735000268, + "learning_rate": 9.728092019506233e-06, + "loss": 0.181, + "step": 885 + }, + { + "epoch": 0.13249588754299388, + "grad_norm": 1.3626230838557185, + "learning_rate": 9.727303697987965e-06, + "loss": 0.2283, + "step": 886 + }, + { + "epoch": 0.1326454314341259, + "grad_norm": 1.9510781854659551, + "learning_rate": 9.72651426739869e-06, + "loss": 0.5154, + "step": 887 + }, + { + "epoch": 0.13279497532525797, + "grad_norm": 0.9715289377560907, + "learning_rate": 9.72572372792362e-06, + "loss": 0.248, + "step": 888 + }, + { + "epoch": 0.13294451921639, + "grad_norm": 2.146774938769699, + "learning_rate": 9.724932079748218e-06, + "loss": 0.5735, + "step": 889 + }, + { + "epoch": 0.13309406310752206, + "grad_norm": 1.9362171525305378, + "learning_rate": 9.724139323058213e-06, + "loss": 0.4643, + "step": 890 + }, + { + "epoch": 0.13324360699865412, + "grad_norm": 1.2101289634017103, + "learning_rate": 9.723345458039595e-06, + "loss": 0.2266, + "step": 891 + }, + { + "epoch": 0.13339315088978615, + "grad_norm": 1.5975435725539564, + "learning_rate": 9.722550484878612e-06, + "loss": 0.2212, + "step": 892 + }, + { + "epoch": 0.1335426947809182, + "grad_norm": 2.1131453338181783, + "learning_rate": 9.721754403761773e-06, + "loss": 0.5017, + "step": 893 + }, + { + "epoch": 0.13369223867205024, + "grad_norm": 1.8229015728915987, + "learning_rate": 9.720957214875846e-06, + "loss": 0.3833, + "step": 894 + }, + { + "epoch": 0.1338417825631823, + "grad_norm": 2.2277630697934416, + "learning_rate": 9.720158918407859e-06, + "loss": 0.2482, + "step": 895 + }, + { + "epoch": 0.13399132645431433, + "grad_norm": 2.7493650830526413, + "learning_rate": 9.719359514545097e-06, + "loss": 0.3227, + "step": 896 + }, + { + "epoch": 0.1341408703454464, + "grad_norm": 1.326522393176365, + "learning_rate": 9.718559003475114e-06, + "loss": 0.3694, + "step": 897 + }, + { + "epoch": 0.13429041423657845, + "grad_norm": 2.031851235687534, + "learning_rate": 9.717757385385713e-06, + "loss": 0.4018, + "step": 898 + }, + { + "epoch": 0.13443995812771048, + "grad_norm": 1.8621806446966467, + "learning_rate": 9.716954660464962e-06, + "loss": 0.4906, + "step": 899 + }, + { + "epoch": 0.13458950201884254, + "grad_norm": 1.2770259017691152, + "learning_rate": 9.716150828901189e-06, + "loss": 0.1858, + "step": 900 + }, + { + "epoch": 0.13473904590997457, + "grad_norm": 1.5913545450304718, + "learning_rate": 9.71534589088298e-06, + "loss": 0.197, + "step": 901 + }, + { + "epoch": 0.13488858980110663, + "grad_norm": 1.2063585747949084, + "learning_rate": 9.714539846599183e-06, + "loss": 0.2366, + "step": 902 + }, + { + "epoch": 0.13503813369223866, + "grad_norm": 2.5200659605885036, + "learning_rate": 9.713732696238901e-06, + "loss": 0.5999, + "step": 903 + }, + { + "epoch": 0.13518767758337072, + "grad_norm": 1.9024340551353864, + "learning_rate": 9.7129244399915e-06, + "loss": 0.3618, + "step": 904 + }, + { + "epoch": 0.13533722147450278, + "grad_norm": 2.069806998096416, + "learning_rate": 9.712115078046606e-06, + "loss": 0.2392, + "step": 905 + }, + { + "epoch": 0.1354867653656348, + "grad_norm": 1.9190350188819136, + "learning_rate": 9.711304610594104e-06, + "loss": 0.4096, + "step": 906 + }, + { + "epoch": 0.13563630925676687, + "grad_norm": 1.0348199460008871, + "learning_rate": 9.710493037824133e-06, + "loss": 0.2071, + "step": 907 + }, + { + "epoch": 0.1357858531478989, + "grad_norm": 2.014537240536291, + "learning_rate": 9.709680359927101e-06, + "loss": 0.4374, + "step": 908 + }, + { + "epoch": 0.13593539703903096, + "grad_norm": 1.7241079783150106, + "learning_rate": 9.708866577093665e-06, + "loss": 0.4161, + "step": 909 + }, + { + "epoch": 0.136084940930163, + "grad_norm": 1.3280448342419884, + "learning_rate": 9.70805168951475e-06, + "loss": 0.1967, + "step": 910 + }, + { + "epoch": 0.13623448482129505, + "grad_norm": 1.6364992809413539, + "learning_rate": 9.707235697381536e-06, + "loss": 0.3394, + "step": 911 + }, + { + "epoch": 0.1363840287124271, + "grad_norm": 1.1211253304635729, + "learning_rate": 9.706418600885462e-06, + "loss": 0.3542, + "step": 912 + }, + { + "epoch": 0.13653357260355914, + "grad_norm": 4.417441150249539, + "learning_rate": 9.705600400218227e-06, + "loss": 0.2605, + "step": 913 + }, + { + "epoch": 0.1366831164946912, + "grad_norm": 1.6849430545358892, + "learning_rate": 9.704781095571788e-06, + "loss": 0.3434, + "step": 914 + }, + { + "epoch": 0.13683266038582323, + "grad_norm": 1.0419590452262997, + "learning_rate": 9.703960687138363e-06, + "loss": 0.1759, + "step": 915 + }, + { + "epoch": 0.1369822042769553, + "grad_norm": 2.3605687929632286, + "learning_rate": 9.703139175110425e-06, + "loss": 0.6175, + "step": 916 + }, + { + "epoch": 0.13713174816808735, + "grad_norm": 1.6017722806543409, + "learning_rate": 9.702316559680714e-06, + "loss": 0.2687, + "step": 917 + }, + { + "epoch": 0.13728129205921938, + "grad_norm": 1.3442020598442603, + "learning_rate": 9.701492841042217e-06, + "loss": 0.3801, + "step": 918 + }, + { + "epoch": 0.13743083595035144, + "grad_norm": 1.6758219573938795, + "learning_rate": 9.70066801938819e-06, + "loss": 0.3869, + "step": 919 + }, + { + "epoch": 0.13758037984148347, + "grad_norm": 1.787015495544375, + "learning_rate": 9.699842094912146e-06, + "loss": 0.319, + "step": 920 + }, + { + "epoch": 0.13772992373261553, + "grad_norm": 1.8015526879631494, + "learning_rate": 9.699015067807851e-06, + "loss": 0.493, + "step": 921 + }, + { + "epoch": 0.13787946762374756, + "grad_norm": 0.9887387021286004, + "learning_rate": 9.698186938269334e-06, + "loss": 0.1724, + "step": 922 + }, + { + "epoch": 0.13802901151487962, + "grad_norm": 1.7253102296559673, + "learning_rate": 9.697357706490885e-06, + "loss": 0.5363, + "step": 923 + }, + { + "epoch": 0.13817855540601168, + "grad_norm": 1.5558864293295054, + "learning_rate": 9.696527372667046e-06, + "loss": 0.2863, + "step": 924 + }, + { + "epoch": 0.1383280992971437, + "grad_norm": 1.8210322672031793, + "learning_rate": 9.695695936992624e-06, + "loss": 0.4107, + "step": 925 + }, + { + "epoch": 0.13847764318827577, + "grad_norm": 1.6117992353983686, + "learning_rate": 9.69486339966268e-06, + "loss": 0.2162, + "step": 926 + }, + { + "epoch": 0.1386271870794078, + "grad_norm": 1.7018476473220923, + "learning_rate": 9.694029760872539e-06, + "loss": 0.3609, + "step": 927 + }, + { + "epoch": 0.13877673097053986, + "grad_norm": 1.5673565467226127, + "learning_rate": 9.693195020817776e-06, + "loss": 0.3164, + "step": 928 + }, + { + "epoch": 0.1389262748616719, + "grad_norm": 1.6536061805316273, + "learning_rate": 9.69235917969423e-06, + "loss": 0.5039, + "step": 929 + }, + { + "epoch": 0.13907581875280395, + "grad_norm": 1.4953772716061529, + "learning_rate": 9.691522237698001e-06, + "loss": 0.2073, + "step": 930 + }, + { + "epoch": 0.139225362643936, + "grad_norm": 1.0372555974478648, + "learning_rate": 9.69068419502544e-06, + "loss": 0.1904, + "step": 931 + }, + { + "epoch": 0.13937490653506804, + "grad_norm": 1.2803091164977878, + "learning_rate": 9.689845051873161e-06, + "loss": 0.2085, + "step": 932 + }, + { + "epoch": 0.1395244504262001, + "grad_norm": 1.4758036204854348, + "learning_rate": 9.689004808438036e-06, + "loss": 0.2012, + "step": 933 + }, + { + "epoch": 0.13967399431733213, + "grad_norm": 1.6660973952553224, + "learning_rate": 9.688163464917191e-06, + "loss": 0.3286, + "step": 934 + }, + { + "epoch": 0.1398235382084642, + "grad_norm": 1.1549059427655604, + "learning_rate": 9.687321021508018e-06, + "loss": 0.2267, + "step": 935 + }, + { + "epoch": 0.13997308209959622, + "grad_norm": 1.272574916603474, + "learning_rate": 9.686477478408159e-06, + "loss": 0.1829, + "step": 936 + }, + { + "epoch": 0.14012262599072828, + "grad_norm": 2.754918857840336, + "learning_rate": 9.685632835815519e-06, + "loss": 0.4481, + "step": 937 + }, + { + "epoch": 0.14027216988186034, + "grad_norm": 1.1790985103907, + "learning_rate": 9.684787093928256e-06, + "loss": 0.1814, + "step": 938 + }, + { + "epoch": 0.14042171377299237, + "grad_norm": 1.011660485817637, + "learning_rate": 9.683940252944794e-06, + "loss": 0.1863, + "step": 939 + }, + { + "epoch": 0.14057125766412443, + "grad_norm": 1.3525074345715755, + "learning_rate": 9.68309231306381e-06, + "loss": 0.2084, + "step": 940 + }, + { + "epoch": 0.14072080155525646, + "grad_norm": 1.6719478297190948, + "learning_rate": 9.682243274484231e-06, + "loss": 0.3459, + "step": 941 + }, + { + "epoch": 0.14087034544638852, + "grad_norm": 1.5225980842484328, + "learning_rate": 9.681393137405259e-06, + "loss": 0.3082, + "step": 942 + }, + { + "epoch": 0.14101988933752055, + "grad_norm": 1.4403779528104341, + "learning_rate": 9.680541902026342e-06, + "loss": 0.1952, + "step": 943 + }, + { + "epoch": 0.1411694332286526, + "grad_norm": 1.7704358094140293, + "learning_rate": 9.679689568547184e-06, + "loss": 0.2925, + "step": 944 + }, + { + "epoch": 0.14131897711978467, + "grad_norm": 1.8325825494125016, + "learning_rate": 9.678836137167753e-06, + "loss": 0.2354, + "step": 945 + }, + { + "epoch": 0.1414685210109167, + "grad_norm": 8.228207444568621, + "learning_rate": 9.677981608088274e-06, + "loss": 0.1945, + "step": 946 + }, + { + "epoch": 0.14161806490204876, + "grad_norm": 1.9420821742118657, + "learning_rate": 9.677125981509227e-06, + "loss": 0.3745, + "step": 947 + }, + { + "epoch": 0.1417676087931808, + "grad_norm": 1.4287526091354055, + "learning_rate": 9.676269257631348e-06, + "loss": 0.159, + "step": 948 + }, + { + "epoch": 0.14191715268431285, + "grad_norm": 2.2979804382628726, + "learning_rate": 9.675411436655636e-06, + "loss": 0.6715, + "step": 949 + }, + { + "epoch": 0.1420666965754449, + "grad_norm": 1.2906292785961546, + "learning_rate": 9.67455251878334e-06, + "loss": 0.1863, + "step": 950 + }, + { + "epoch": 0.14221624046657694, + "grad_norm": 1.542985394545003, + "learning_rate": 9.673692504215974e-06, + "loss": 0.276, + "step": 951 + }, + { + "epoch": 0.142365784357709, + "grad_norm": 2.076155049712511, + "learning_rate": 9.672831393155304e-06, + "loss": 0.2878, + "step": 952 + }, + { + "epoch": 0.14251532824884103, + "grad_norm": 3.2389836303118265, + "learning_rate": 9.671969185803357e-06, + "loss": 0.4539, + "step": 953 + }, + { + "epoch": 0.1426648721399731, + "grad_norm": 0.8145950923200616, + "learning_rate": 9.671105882362412e-06, + "loss": 0.1916, + "step": 954 + }, + { + "epoch": 0.14281441603110512, + "grad_norm": 1.7529614161433102, + "learning_rate": 9.67024148303501e-06, + "loss": 0.3852, + "step": 955 + }, + { + "epoch": 0.14296395992223718, + "grad_norm": 1.6140653815156045, + "learning_rate": 9.669375988023947e-06, + "loss": 0.3317, + "step": 956 + }, + { + "epoch": 0.14311350381336924, + "grad_norm": 1.5917328640289674, + "learning_rate": 9.668509397532278e-06, + "loss": 0.205, + "step": 957 + }, + { + "epoch": 0.14326304770450127, + "grad_norm": 1.9053910041720175, + "learning_rate": 9.667641711763311e-06, + "loss": 0.2016, + "step": 958 + }, + { + "epoch": 0.14341259159563333, + "grad_norm": 1.2223818916012819, + "learning_rate": 9.666772930920614e-06, + "loss": 0.1818, + "step": 959 + }, + { + "epoch": 0.14356213548676536, + "grad_norm": 1.4130639929342779, + "learning_rate": 9.665903055208013e-06, + "loss": 0.1776, + "step": 960 + }, + { + "epoch": 0.14371167937789742, + "grad_norm": 2.515402250912616, + "learning_rate": 9.665032084829588e-06, + "loss": 0.7429, + "step": 961 + }, + { + "epoch": 0.14386122326902945, + "grad_norm": 2.118211041321287, + "learning_rate": 9.66416001998968e-06, + "loss": 0.5489, + "step": 962 + }, + { + "epoch": 0.1440107671601615, + "grad_norm": 1.6323921378905693, + "learning_rate": 9.663286860892877e-06, + "loss": 0.3446, + "step": 963 + }, + { + "epoch": 0.14416031105129357, + "grad_norm": 2.4057165427715335, + "learning_rate": 9.662412607744036e-06, + "loss": 0.2152, + "step": 964 + }, + { + "epoch": 0.1443098549424256, + "grad_norm": 1.5942384074329368, + "learning_rate": 9.661537260748264e-06, + "loss": 0.3746, + "step": 965 + }, + { + "epoch": 0.14445939883355766, + "grad_norm": 1.3815463338875527, + "learning_rate": 9.660660820110926e-06, + "loss": 0.2255, + "step": 966 + }, + { + "epoch": 0.1446089427246897, + "grad_norm": 2.071453535788066, + "learning_rate": 9.659783286037643e-06, + "loss": 0.5075, + "step": 967 + }, + { + "epoch": 0.14475848661582175, + "grad_norm": 1.3265840791490535, + "learning_rate": 9.658904658734293e-06, + "loss": 0.2295, + "step": 968 + }, + { + "epoch": 0.14490803050695378, + "grad_norm": 1.5352887928749521, + "learning_rate": 9.658024938407011e-06, + "loss": 0.3484, + "step": 969 + }, + { + "epoch": 0.14505757439808584, + "grad_norm": 1.5831033158953907, + "learning_rate": 9.657144125262186e-06, + "loss": 0.4039, + "step": 970 + }, + { + "epoch": 0.1452071182892179, + "grad_norm": 1.6289190913913172, + "learning_rate": 9.65626221950647e-06, + "loss": 0.2471, + "step": 971 + }, + { + "epoch": 0.14535666218034993, + "grad_norm": 1.6042830876012686, + "learning_rate": 9.655379221346758e-06, + "loss": 0.4886, + "step": 972 + }, + { + "epoch": 0.14550620607148199, + "grad_norm": 1.2790362471165744, + "learning_rate": 9.654495130990218e-06, + "loss": 0.2065, + "step": 973 + }, + { + "epoch": 0.14565574996261402, + "grad_norm": 1.72777953773445, + "learning_rate": 9.653609948644263e-06, + "loss": 0.4006, + "step": 974 + }, + { + "epoch": 0.14580529385374608, + "grad_norm": 1.7629618868465593, + "learning_rate": 9.652723674516566e-06, + "loss": 0.2142, + "step": 975 + }, + { + "epoch": 0.14595483774487814, + "grad_norm": 2.0277477662995174, + "learning_rate": 9.651836308815055e-06, + "loss": 0.5248, + "step": 976 + }, + { + "epoch": 0.14610438163601017, + "grad_norm": 1.972708159415252, + "learning_rate": 9.650947851747913e-06, + "loss": 0.6236, + "step": 977 + }, + { + "epoch": 0.14625392552714223, + "grad_norm": 1.760683770673202, + "learning_rate": 9.650058303523583e-06, + "loss": 0.3418, + "step": 978 + }, + { + "epoch": 0.14640346941827426, + "grad_norm": 1.8429433949445744, + "learning_rate": 9.649167664350762e-06, + "loss": 0.475, + "step": 979 + }, + { + "epoch": 0.14655301330940632, + "grad_norm": 1.9248727992532182, + "learning_rate": 9.6482759344384e-06, + "loss": 0.3368, + "step": 980 + }, + { + "epoch": 0.14670255720053835, + "grad_norm": 1.9573379093599792, + "learning_rate": 9.647383113995707e-06, + "loss": 0.4203, + "step": 981 + }, + { + "epoch": 0.1468521010916704, + "grad_norm": 2.7789135237612803, + "learning_rate": 9.646489203232145e-06, + "loss": 0.4581, + "step": 982 + }, + { + "epoch": 0.14700164498280247, + "grad_norm": 2.5119171064254724, + "learning_rate": 9.645594202357438e-06, + "loss": 0.8202, + "step": 983 + }, + { + "epoch": 0.1471511888739345, + "grad_norm": 1.4269567118604072, + "learning_rate": 9.644698111581562e-06, + "loss": 0.1954, + "step": 984 + }, + { + "epoch": 0.14730073276506656, + "grad_norm": 1.5211432396508893, + "learning_rate": 9.643800931114742e-06, + "loss": 0.207, + "step": 985 + }, + { + "epoch": 0.1474502766561986, + "grad_norm": 1.607901993958729, + "learning_rate": 9.642902661167472e-06, + "loss": 0.3046, + "step": 986 + }, + { + "epoch": 0.14759982054733065, + "grad_norm": 2.0792720647718776, + "learning_rate": 9.642003301950491e-06, + "loss": 0.4314, + "step": 987 + }, + { + "epoch": 0.14774936443846268, + "grad_norm": 1.6967562403841654, + "learning_rate": 9.641102853674799e-06, + "loss": 0.2142, + "step": 988 + }, + { + "epoch": 0.14789890832959474, + "grad_norm": 1.8666018241841429, + "learning_rate": 9.640201316551651e-06, + "loss": 0.4817, + "step": 989 + }, + { + "epoch": 0.1480484522207268, + "grad_norm": 1.6231253521213436, + "learning_rate": 9.639298690792554e-06, + "loss": 0.304, + "step": 990 + }, + { + "epoch": 0.14819799611185883, + "grad_norm": 0.9588780548142521, + "learning_rate": 9.638394976609274e-06, + "loss": 0.1709, + "step": 991 + }, + { + "epoch": 0.14834754000299089, + "grad_norm": 1.7153802262812925, + "learning_rate": 9.637490174213828e-06, + "loss": 0.1959, + "step": 992 + }, + { + "epoch": 0.14849708389412292, + "grad_norm": 1.8524843327507126, + "learning_rate": 9.636584283818496e-06, + "loss": 0.3957, + "step": 993 + }, + { + "epoch": 0.14864662778525498, + "grad_norm": 1.8045411979525945, + "learning_rate": 9.635677305635807e-06, + "loss": 0.2565, + "step": 994 + }, + { + "epoch": 0.148796171676387, + "grad_norm": 2.1506980932575175, + "learning_rate": 9.634769239878545e-06, + "loss": 0.3777, + "step": 995 + }, + { + "epoch": 0.14894571556751907, + "grad_norm": 2.1465696781275563, + "learning_rate": 9.633860086759753e-06, + "loss": 0.6056, + "step": 996 + }, + { + "epoch": 0.14909525945865112, + "grad_norm": 1.3341555017873934, + "learning_rate": 9.632949846492728e-06, + "loss": 0.2219, + "step": 997 + }, + { + "epoch": 0.14924480334978316, + "grad_norm": 1.2276798431588063, + "learning_rate": 9.632038519291017e-06, + "loss": 0.2074, + "step": 998 + }, + { + "epoch": 0.14939434724091522, + "grad_norm": 1.53279693471178, + "learning_rate": 9.63112610536843e-06, + "loss": 0.4373, + "step": 999 + }, + { + "epoch": 0.14954389113204725, + "grad_norm": 1.770529951492653, + "learning_rate": 9.630212604939026e-06, + "loss": 0.2495, + "step": 1000 + }, + { + "epoch": 0.1496934350231793, + "grad_norm": 1.6948398115693124, + "learning_rate": 9.629298018217123e-06, + "loss": 0.467, + "step": 1001 + }, + { + "epoch": 0.14984297891431134, + "grad_norm": 2.0292952301635534, + "learning_rate": 9.628382345417291e-06, + "loss": 0.5509, + "step": 1002 + }, + { + "epoch": 0.1499925228054434, + "grad_norm": 2.4704877435341186, + "learning_rate": 9.627465586754354e-06, + "loss": 0.2214, + "step": 1003 + }, + { + "epoch": 0.15014206669657545, + "grad_norm": 1.6257900509098846, + "learning_rate": 9.626547742443394e-06, + "loss": 0.3365, + "step": 1004 + }, + { + "epoch": 0.15029161058770749, + "grad_norm": 1.8331565532381362, + "learning_rate": 9.625628812699747e-06, + "loss": 0.4435, + "step": 1005 + }, + { + "epoch": 0.15044115447883954, + "grad_norm": 2.977053464112802, + "learning_rate": 9.624708797739002e-06, + "loss": 0.3177, + "step": 1006 + }, + { + "epoch": 0.15059069836997158, + "grad_norm": 1.756417225108672, + "learning_rate": 9.623787697777001e-06, + "loss": 0.3878, + "step": 1007 + }, + { + "epoch": 0.15074024226110364, + "grad_norm": 1.6276234348428167, + "learning_rate": 9.622865513029846e-06, + "loss": 0.1901, + "step": 1008 + }, + { + "epoch": 0.1508897861522357, + "grad_norm": 3.3786401544653253, + "learning_rate": 9.62194224371389e-06, + "loss": 0.2454, + "step": 1009 + }, + { + "epoch": 0.15103933004336773, + "grad_norm": 2.0516682586662456, + "learning_rate": 9.621017890045739e-06, + "loss": 0.7163, + "step": 1010 + }, + { + "epoch": 0.15118887393449978, + "grad_norm": 1.0885572394332461, + "learning_rate": 9.620092452242257e-06, + "loss": 0.2232, + "step": 1011 + }, + { + "epoch": 0.15133841782563182, + "grad_norm": 1.287954715641653, + "learning_rate": 9.61916593052056e-06, + "loss": 0.2124, + "step": 1012 + }, + { + "epoch": 0.15148796171676387, + "grad_norm": 2.089649291216167, + "learning_rate": 9.618238325098021e-06, + "loss": 0.5129, + "step": 1013 + }, + { + "epoch": 0.1516375056078959, + "grad_norm": 1.512994828933595, + "learning_rate": 9.617309636192262e-06, + "loss": 0.2986, + "step": 1014 + }, + { + "epoch": 0.15178704949902797, + "grad_norm": 1.6901350993268085, + "learning_rate": 9.616379864021163e-06, + "loss": 0.1893, + "step": 1015 + }, + { + "epoch": 0.15193659339016002, + "grad_norm": 1.6240769043159409, + "learning_rate": 9.615449008802858e-06, + "loss": 0.1984, + "step": 1016 + }, + { + "epoch": 0.15208613728129206, + "grad_norm": 2.2110380817085815, + "learning_rate": 9.614517070755736e-06, + "loss": 0.4573, + "step": 1017 + }, + { + "epoch": 0.15223568117242411, + "grad_norm": 2.3590384489615452, + "learning_rate": 9.613584050098436e-06, + "loss": 0.536, + "step": 1018 + }, + { + "epoch": 0.15238522506355615, + "grad_norm": 1.1021064926756596, + "learning_rate": 9.612649947049856e-06, + "loss": 0.224, + "step": 1019 + }, + { + "epoch": 0.1525347689546882, + "grad_norm": 1.73759279762283, + "learning_rate": 9.611714761829146e-06, + "loss": 0.2581, + "step": 1020 + }, + { + "epoch": 0.15268431284582024, + "grad_norm": 1.9739121848543864, + "learning_rate": 9.610778494655706e-06, + "loss": 0.2116, + "step": 1021 + }, + { + "epoch": 0.1528338567369523, + "grad_norm": 1.7608676639305771, + "learning_rate": 9.609841145749196e-06, + "loss": 0.2343, + "step": 1022 + }, + { + "epoch": 0.15298340062808435, + "grad_norm": 1.8973842729049986, + "learning_rate": 9.608902715329527e-06, + "loss": 0.416, + "step": 1023 + }, + { + "epoch": 0.15313294451921639, + "grad_norm": 1.232118282013805, + "learning_rate": 9.607963203616862e-06, + "loss": 0.2428, + "step": 1024 + }, + { + "epoch": 0.15328248841034844, + "grad_norm": 1.65077206870404, + "learning_rate": 9.607022610831623e-06, + "loss": 0.395, + "step": 1025 + }, + { + "epoch": 0.15343203230148048, + "grad_norm": 1.8519962462671438, + "learning_rate": 9.606080937194478e-06, + "loss": 0.3225, + "step": 1026 + }, + { + "epoch": 0.15358157619261253, + "grad_norm": 1.5406605876000279, + "learning_rate": 9.605138182926355e-06, + "loss": 0.1962, + "step": 1027 + }, + { + "epoch": 0.15373112008374457, + "grad_norm": 1.68724090741763, + "learning_rate": 9.604194348248432e-06, + "loss": 0.3412, + "step": 1028 + }, + { + "epoch": 0.15388066397487662, + "grad_norm": 1.3196225832605089, + "learning_rate": 9.603249433382145e-06, + "loss": 0.203, + "step": 1029 + }, + { + "epoch": 0.15403020786600868, + "grad_norm": 1.4552647720547889, + "learning_rate": 9.602303438549177e-06, + "loss": 0.2683, + "step": 1030 + }, + { + "epoch": 0.15417975175714072, + "grad_norm": 1.9356496958747662, + "learning_rate": 9.601356363971467e-06, + "loss": 0.4085, + "step": 1031 + }, + { + "epoch": 0.15432929564827277, + "grad_norm": 1.5727583829762162, + "learning_rate": 9.60040820987121e-06, + "loss": 0.2702, + "step": 1032 + }, + { + "epoch": 0.1544788395394048, + "grad_norm": 1.9130911912208222, + "learning_rate": 9.59945897647085e-06, + "loss": 0.3831, + "step": 1033 + }, + { + "epoch": 0.15462838343053686, + "grad_norm": 1.8240341076741502, + "learning_rate": 9.59850866399309e-06, + "loss": 0.4938, + "step": 1034 + }, + { + "epoch": 0.15477792732166892, + "grad_norm": 3.1510439843833953, + "learning_rate": 9.597557272660878e-06, + "loss": 0.4082, + "step": 1035 + }, + { + "epoch": 0.15492747121280095, + "grad_norm": 2.7952488908518998, + "learning_rate": 9.596604802697422e-06, + "loss": 1.0219, + "step": 1036 + }, + { + "epoch": 0.155077015103933, + "grad_norm": 1.5968867223126475, + "learning_rate": 9.595651254326179e-06, + "loss": 0.3575, + "step": 1037 + }, + { + "epoch": 0.15522655899506504, + "grad_norm": 1.7270374421609462, + "learning_rate": 9.594696627770863e-06, + "loss": 0.4184, + "step": 1038 + }, + { + "epoch": 0.1553761028861971, + "grad_norm": 1.623026584906972, + "learning_rate": 9.593740923255437e-06, + "loss": 0.3364, + "step": 1039 + }, + { + "epoch": 0.15552564677732914, + "grad_norm": 1.724702934068898, + "learning_rate": 9.592784141004118e-06, + "loss": 0.5197, + "step": 1040 + }, + { + "epoch": 0.1556751906684612, + "grad_norm": 1.2087417414530044, + "learning_rate": 9.591826281241379e-06, + "loss": 0.3232, + "step": 1041 + }, + { + "epoch": 0.15582473455959325, + "grad_norm": 2.0110371073704716, + "learning_rate": 9.590867344191941e-06, + "loss": 0.4617, + "step": 1042 + }, + { + "epoch": 0.15597427845072528, + "grad_norm": 2.300355596627081, + "learning_rate": 9.58990733008078e-06, + "loss": 0.3483, + "step": 1043 + }, + { + "epoch": 0.15612382234185734, + "grad_norm": 1.9228199791045963, + "learning_rate": 9.588946239133123e-06, + "loss": 0.4723, + "step": 1044 + }, + { + "epoch": 0.15627336623298937, + "grad_norm": 1.3725700472934328, + "learning_rate": 9.587984071574455e-06, + "loss": 0.212, + "step": 1045 + }, + { + "epoch": 0.15642291012412143, + "grad_norm": 1.9304969649682522, + "learning_rate": 9.587020827630507e-06, + "loss": 0.2317, + "step": 1046 + }, + { + "epoch": 0.15657245401525346, + "grad_norm": 1.6204729661048964, + "learning_rate": 9.586056507527266e-06, + "loss": 0.2135, + "step": 1047 + }, + { + "epoch": 0.15672199790638552, + "grad_norm": 1.2120441425361188, + "learning_rate": 9.58509111149097e-06, + "loss": 0.2785, + "step": 1048 + }, + { + "epoch": 0.15687154179751758, + "grad_norm": 2.1856010368938135, + "learning_rate": 9.584124639748114e-06, + "loss": 0.4117, + "step": 1049 + }, + { + "epoch": 0.15702108568864961, + "grad_norm": 1.4004860796245104, + "learning_rate": 9.583157092525435e-06, + "loss": 0.375, + "step": 1050 + }, + { + "epoch": 0.15717062957978167, + "grad_norm": 1.1728395243844014, + "learning_rate": 9.582188470049935e-06, + "loss": 0.2286, + "step": 1051 + }, + { + "epoch": 0.1573201734709137, + "grad_norm": 1.3417244382182179, + "learning_rate": 9.58121877254886e-06, + "loss": 0.2105, + "step": 1052 + }, + { + "epoch": 0.15746971736204576, + "grad_norm": 1.3092141915312672, + "learning_rate": 9.580248000249709e-06, + "loss": 0.253, + "step": 1053 + }, + { + "epoch": 0.1576192612531778, + "grad_norm": 0.8768020427524035, + "learning_rate": 9.579276153380234e-06, + "loss": 0.199, + "step": 1054 + }, + { + "epoch": 0.15776880514430985, + "grad_norm": 1.8273743204900599, + "learning_rate": 9.578303232168442e-06, + "loss": 0.5377, + "step": 1055 + }, + { + "epoch": 0.1579183490354419, + "grad_norm": 1.4617178204858172, + "learning_rate": 9.57732923684259e-06, + "loss": 0.2318, + "step": 1056 + }, + { + "epoch": 0.15806789292657394, + "grad_norm": 1.7897112328181541, + "learning_rate": 9.576354167631186e-06, + "loss": 0.2312, + "step": 1057 + }, + { + "epoch": 0.158217436817706, + "grad_norm": 2.055748147218842, + "learning_rate": 9.575378024762991e-06, + "loss": 0.208, + "step": 1058 + }, + { + "epoch": 0.15836698070883803, + "grad_norm": 1.9825123132080376, + "learning_rate": 9.574400808467015e-06, + "loss": 0.4415, + "step": 1059 + }, + { + "epoch": 0.1585165245999701, + "grad_norm": 1.203644734681587, + "learning_rate": 9.573422518972524e-06, + "loss": 0.1651, + "step": 1060 + }, + { + "epoch": 0.15866606849110212, + "grad_norm": 2.143121544386558, + "learning_rate": 9.572443156509035e-06, + "loss": 0.2813, + "step": 1061 + }, + { + "epoch": 0.15881561238223418, + "grad_norm": 1.7645898839865752, + "learning_rate": 9.571462721306315e-06, + "loss": 0.314, + "step": 1062 + }, + { + "epoch": 0.15896515627336624, + "grad_norm": 1.6678516650591295, + "learning_rate": 9.570481213594385e-06, + "loss": 0.289, + "step": 1063 + }, + { + "epoch": 0.15911470016449827, + "grad_norm": 1.2850170647115653, + "learning_rate": 9.569498633603513e-06, + "loss": 0.2024, + "step": 1064 + }, + { + "epoch": 0.15926424405563033, + "grad_norm": 1.814229267711803, + "learning_rate": 9.568514981564226e-06, + "loss": 0.4606, + "step": 1065 + }, + { + "epoch": 0.15941378794676236, + "grad_norm": 1.243462382180521, + "learning_rate": 9.567530257707294e-06, + "loss": 0.204, + "step": 1066 + }, + { + "epoch": 0.15956333183789442, + "grad_norm": 1.2909371933651599, + "learning_rate": 9.566544462263744e-06, + "loss": 0.2209, + "step": 1067 + }, + { + "epoch": 0.15971287572902648, + "grad_norm": 1.6817484277248902, + "learning_rate": 9.565557595464854e-06, + "loss": 0.2266, + "step": 1068 + }, + { + "epoch": 0.1598624196201585, + "grad_norm": 1.6783737367668008, + "learning_rate": 9.564569657542153e-06, + "loss": 0.3126, + "step": 1069 + }, + { + "epoch": 0.16001196351129057, + "grad_norm": 1.7121174458116146, + "learning_rate": 9.56358064872742e-06, + "loss": 0.4938, + "step": 1070 + }, + { + "epoch": 0.1601615074024226, + "grad_norm": 1.4168299418446093, + "learning_rate": 9.562590569252685e-06, + "loss": 0.1859, + "step": 1071 + }, + { + "epoch": 0.16031105129355466, + "grad_norm": 1.1574720210896832, + "learning_rate": 9.561599419350233e-06, + "loss": 0.2076, + "step": 1072 + }, + { + "epoch": 0.1604605951846867, + "grad_norm": 1.88010212201121, + "learning_rate": 9.560607199252594e-06, + "loss": 0.489, + "step": 1073 + }, + { + "epoch": 0.16061013907581875, + "grad_norm": 1.7557755514977327, + "learning_rate": 9.559613909192553e-06, + "loss": 0.2593, + "step": 1074 + }, + { + "epoch": 0.1607596829669508, + "grad_norm": 2.0354173981462864, + "learning_rate": 9.558619549403148e-06, + "loss": 0.266, + "step": 1075 + }, + { + "epoch": 0.16090922685808284, + "grad_norm": 1.984771028183608, + "learning_rate": 9.557624120117663e-06, + "loss": 0.1823, + "step": 1076 + }, + { + "epoch": 0.1610587707492149, + "grad_norm": 1.2435802855207878, + "learning_rate": 9.556627621569636e-06, + "loss": 0.2133, + "step": 1077 + }, + { + "epoch": 0.16120831464034693, + "grad_norm": 2.1628484109014603, + "learning_rate": 9.555630053992854e-06, + "loss": 0.6313, + "step": 1078 + }, + { + "epoch": 0.161357858531479, + "grad_norm": 1.3025632600056798, + "learning_rate": 9.554631417621358e-06, + "loss": 0.1861, + "step": 1079 + }, + { + "epoch": 0.16150740242261102, + "grad_norm": 1.2033276727813813, + "learning_rate": 9.553631712689437e-06, + "loss": 0.1411, + "step": 1080 + }, + { + "epoch": 0.16165694631374308, + "grad_norm": 1.7192459951334418, + "learning_rate": 9.55263093943163e-06, + "loss": 0.3415, + "step": 1081 + }, + { + "epoch": 0.16180649020487514, + "grad_norm": 2.195446146090599, + "learning_rate": 9.55162909808273e-06, + "loss": 0.5552, + "step": 1082 + }, + { + "epoch": 0.16195603409600717, + "grad_norm": 1.2518961650965623, + "learning_rate": 9.550626188877779e-06, + "loss": 0.1376, + "step": 1083 + }, + { + "epoch": 0.16210557798713923, + "grad_norm": 1.621475542449237, + "learning_rate": 9.549622212052067e-06, + "loss": 0.1918, + "step": 1084 + }, + { + "epoch": 0.16225512187827126, + "grad_norm": 1.6022576009951304, + "learning_rate": 9.548617167841139e-06, + "loss": 0.4754, + "step": 1085 + }, + { + "epoch": 0.16240466576940332, + "grad_norm": 2.140796057625746, + "learning_rate": 9.547611056480785e-06, + "loss": 0.3365, + "step": 1086 + }, + { + "epoch": 0.16255420966053535, + "grad_norm": 2.1352436057574415, + "learning_rate": 9.54660387820705e-06, + "loss": 0.2548, + "step": 1087 + }, + { + "epoch": 0.1627037535516674, + "grad_norm": 1.533857631085752, + "learning_rate": 9.54559563325623e-06, + "loss": 0.2295, + "step": 1088 + }, + { + "epoch": 0.16285329744279947, + "grad_norm": 1.3212471086223223, + "learning_rate": 9.544586321864865e-06, + "loss": 0.2047, + "step": 1089 + }, + { + "epoch": 0.1630028413339315, + "grad_norm": 1.0794095044455165, + "learning_rate": 9.543575944269752e-06, + "loss": 0.2824, + "step": 1090 + }, + { + "epoch": 0.16315238522506356, + "grad_norm": 1.7780256493338729, + "learning_rate": 9.542564500707934e-06, + "loss": 0.3763, + "step": 1091 + }, + { + "epoch": 0.1633019291161956, + "grad_norm": 1.1347608812400427, + "learning_rate": 9.541551991416704e-06, + "loss": 0.2046, + "step": 1092 + }, + { + "epoch": 0.16345147300732765, + "grad_norm": 1.7298153575861623, + "learning_rate": 9.540538416633611e-06, + "loss": 0.3394, + "step": 1093 + }, + { + "epoch": 0.1636010168984597, + "grad_norm": 1.822850678248627, + "learning_rate": 9.539523776596446e-06, + "loss": 0.4254, + "step": 1094 + }, + { + "epoch": 0.16375056078959174, + "grad_norm": 1.762563084427053, + "learning_rate": 9.538508071543253e-06, + "loss": 0.4083, + "step": 1095 + }, + { + "epoch": 0.1639001046807238, + "grad_norm": 1.653109483519857, + "learning_rate": 9.537491301712328e-06, + "loss": 0.3344, + "step": 1096 + }, + { + "epoch": 0.16404964857185583, + "grad_norm": 2.0381344440794713, + "learning_rate": 9.536473467342213e-06, + "loss": 0.6724, + "step": 1097 + }, + { + "epoch": 0.1641991924629879, + "grad_norm": 1.9754898794658875, + "learning_rate": 9.535454568671705e-06, + "loss": 0.5666, + "step": 1098 + }, + { + "epoch": 0.16434873635411992, + "grad_norm": 2.3447226731261344, + "learning_rate": 9.534434605939845e-06, + "loss": 0.3964, + "step": 1099 + }, + { + "epoch": 0.16449828024525198, + "grad_norm": 1.773763244389934, + "learning_rate": 9.533413579385925e-06, + "loss": 0.3883, + "step": 1100 + }, + { + "epoch": 0.16464782413638404, + "grad_norm": 1.8563640201434743, + "learning_rate": 9.532391489249489e-06, + "loss": 0.5295, + "step": 1101 + }, + { + "epoch": 0.16479736802751607, + "grad_norm": 1.610026424473214, + "learning_rate": 9.53136833577033e-06, + "loss": 0.2045, + "step": 1102 + }, + { + "epoch": 0.16494691191864813, + "grad_norm": 1.1595609057345122, + "learning_rate": 9.530344119188489e-06, + "loss": 0.196, + "step": 1103 + }, + { + "epoch": 0.16509645580978016, + "grad_norm": 1.380807492186586, + "learning_rate": 9.529318839744257e-06, + "loss": 0.1665, + "step": 1104 + }, + { + "epoch": 0.16524599970091222, + "grad_norm": 1.224405894645481, + "learning_rate": 9.528292497678175e-06, + "loss": 0.1904, + "step": 1105 + }, + { + "epoch": 0.16539554359204425, + "grad_norm": 1.6321539600343615, + "learning_rate": 9.527265093231031e-06, + "loss": 0.3776, + "step": 1106 + }, + { + "epoch": 0.1655450874831763, + "grad_norm": 1.8242961573055154, + "learning_rate": 9.526236626643867e-06, + "loss": 0.2581, + "step": 1107 + }, + { + "epoch": 0.16569463137430837, + "grad_norm": 1.3475980511137085, + "learning_rate": 9.525207098157968e-06, + "loss": 0.3415, + "step": 1108 + }, + { + "epoch": 0.1658441752654404, + "grad_norm": 1.9882440245015858, + "learning_rate": 9.524176508014873e-06, + "loss": 0.5573, + "step": 1109 + }, + { + "epoch": 0.16599371915657246, + "grad_norm": 1.5816342314828584, + "learning_rate": 9.523144856456367e-06, + "loss": 0.2691, + "step": 1110 + }, + { + "epoch": 0.1661432630477045, + "grad_norm": 1.3850164199795663, + "learning_rate": 9.522112143724489e-06, + "loss": 0.3378, + "step": 1111 + }, + { + "epoch": 0.16629280693883655, + "grad_norm": 1.4909627841603759, + "learning_rate": 9.52107837006152e-06, + "loss": 0.4023, + "step": 1112 + }, + { + "epoch": 0.16644235082996858, + "grad_norm": 1.338411558324243, + "learning_rate": 9.520043535709994e-06, + "loss": 0.225, + "step": 1113 + }, + { + "epoch": 0.16659189472110064, + "grad_norm": 1.5004852001929436, + "learning_rate": 9.519007640912691e-06, + "loss": 0.367, + "step": 1114 + }, + { + "epoch": 0.1667414386122327, + "grad_norm": 1.3522433759924148, + "learning_rate": 9.517970685912648e-06, + "loss": 0.3267, + "step": 1115 + }, + { + "epoch": 0.16689098250336473, + "grad_norm": 1.2771009278550414, + "learning_rate": 9.516932670953137e-06, + "loss": 0.2343, + "step": 1116 + }, + { + "epoch": 0.1670405263944968, + "grad_norm": 1.2558718520533958, + "learning_rate": 9.515893596277692e-06, + "loss": 0.2146, + "step": 1117 + }, + { + "epoch": 0.16719007028562882, + "grad_norm": 1.747427684722942, + "learning_rate": 9.514853462130087e-06, + "loss": 0.3897, + "step": 1118 + }, + { + "epoch": 0.16733961417676088, + "grad_norm": 1.350710739291567, + "learning_rate": 9.51381226875435e-06, + "loss": 0.1962, + "step": 1119 + }, + { + "epoch": 0.16748915806789294, + "grad_norm": 1.7048566484317351, + "learning_rate": 9.512770016394754e-06, + "loss": 0.4704, + "step": 1120 + }, + { + "epoch": 0.16763870195902497, + "grad_norm": 1.1820262059156426, + "learning_rate": 9.51172670529582e-06, + "loss": 0.2269, + "step": 1121 + }, + { + "epoch": 0.16778824585015703, + "grad_norm": 1.4968174934291172, + "learning_rate": 9.51068233570232e-06, + "loss": 0.2122, + "step": 1122 + }, + { + "epoch": 0.16793778974128906, + "grad_norm": 1.7978736217167008, + "learning_rate": 9.50963690785927e-06, + "loss": 0.4423, + "step": 1123 + }, + { + "epoch": 0.16808733363242112, + "grad_norm": 1.9086737935680094, + "learning_rate": 9.508590422011943e-06, + "loss": 0.5837, + "step": 1124 + }, + { + "epoch": 0.16823687752355315, + "grad_norm": 1.809798628211579, + "learning_rate": 9.507542878405853e-06, + "loss": 0.4956, + "step": 1125 + }, + { + "epoch": 0.1683864214146852, + "grad_norm": 2.256680269043329, + "learning_rate": 9.506494277286762e-06, + "loss": 0.3622, + "step": 1126 + }, + { + "epoch": 0.16853596530581727, + "grad_norm": 1.7846176870962676, + "learning_rate": 9.505444618900682e-06, + "loss": 0.3712, + "step": 1127 + }, + { + "epoch": 0.1686855091969493, + "grad_norm": 2.142681380142415, + "learning_rate": 9.504393903493874e-06, + "loss": 0.2523, + "step": 1128 + }, + { + "epoch": 0.16883505308808136, + "grad_norm": 1.993049487456574, + "learning_rate": 9.503342131312847e-06, + "loss": 0.4667, + "step": 1129 + }, + { + "epoch": 0.1689845969792134, + "grad_norm": 1.4929357757100867, + "learning_rate": 9.502289302604355e-06, + "loss": 0.2124, + "step": 1130 + }, + { + "epoch": 0.16913414087034545, + "grad_norm": 1.763381368002964, + "learning_rate": 9.501235417615402e-06, + "loss": 0.2563, + "step": 1131 + }, + { + "epoch": 0.16928368476147748, + "grad_norm": 1.879192084699487, + "learning_rate": 9.50018047659324e-06, + "loss": 0.6, + "step": 1132 + }, + { + "epoch": 0.16943322865260954, + "grad_norm": 1.7482598713089221, + "learning_rate": 9.49912447978537e-06, + "loss": 0.4743, + "step": 1133 + }, + { + "epoch": 0.1695827725437416, + "grad_norm": 2.413100609605461, + "learning_rate": 9.498067427439535e-06, + "loss": 0.5021, + "step": 1134 + }, + { + "epoch": 0.16973231643487363, + "grad_norm": 1.6675332056868586, + "learning_rate": 9.497009319803732e-06, + "loss": 0.3704, + "step": 1135 + }, + { + "epoch": 0.1698818603260057, + "grad_norm": 1.4103198339625334, + "learning_rate": 9.495950157126204e-06, + "loss": 0.3525, + "step": 1136 + }, + { + "epoch": 0.17003140421713772, + "grad_norm": 1.7592756830906924, + "learning_rate": 9.49488993965544e-06, + "loss": 0.2118, + "step": 1137 + }, + { + "epoch": 0.17018094810826978, + "grad_norm": 1.209600322516803, + "learning_rate": 9.493828667640179e-06, + "loss": 0.2156, + "step": 1138 + }, + { + "epoch": 0.1703304919994018, + "grad_norm": 1.8747303848334145, + "learning_rate": 9.492766341329402e-06, + "loss": 0.2998, + "step": 1139 + }, + { + "epoch": 0.17048003589053387, + "grad_norm": 1.9498503355521368, + "learning_rate": 9.491702960972343e-06, + "loss": 0.3723, + "step": 1140 + }, + { + "epoch": 0.17062957978166593, + "grad_norm": 1.6726039919291162, + "learning_rate": 9.490638526818482e-06, + "loss": 0.408, + "step": 1141 + }, + { + "epoch": 0.17077912367279796, + "grad_norm": 3.7262669891973386, + "learning_rate": 9.489573039117543e-06, + "loss": 0.4009, + "step": 1142 + }, + { + "epoch": 0.17092866756393002, + "grad_norm": 1.8934485182919263, + "learning_rate": 9.488506498119502e-06, + "loss": 0.5141, + "step": 1143 + }, + { + "epoch": 0.17107821145506205, + "grad_norm": 1.4710896960429016, + "learning_rate": 9.487438904074581e-06, + "loss": 0.3149, + "step": 1144 + }, + { + "epoch": 0.1712277553461941, + "grad_norm": 0.7650448318548279, + "learning_rate": 9.486370257233244e-06, + "loss": 0.1787, + "step": 1145 + }, + { + "epoch": 0.17137729923732614, + "grad_norm": 1.110992686058532, + "learning_rate": 9.48530055784621e-06, + "loss": 0.2058, + "step": 1146 + }, + { + "epoch": 0.1715268431284582, + "grad_norm": 1.7079056261573908, + "learning_rate": 9.484229806164435e-06, + "loss": 0.2944, + "step": 1147 + }, + { + "epoch": 0.17167638701959026, + "grad_norm": 1.5033153679810882, + "learning_rate": 9.483158002439134e-06, + "loss": 0.2221, + "step": 1148 + }, + { + "epoch": 0.1718259309107223, + "grad_norm": 1.672220461796242, + "learning_rate": 9.48208514692176e-06, + "loss": 0.3671, + "step": 1149 + }, + { + "epoch": 0.17197547480185435, + "grad_norm": 2.0626867491532885, + "learning_rate": 9.481011239864014e-06, + "loss": 0.3915, + "step": 1150 + }, + { + "epoch": 0.17212501869298638, + "grad_norm": 1.6104087564965828, + "learning_rate": 9.479936281517848e-06, + "loss": 0.3084, + "step": 1151 + }, + { + "epoch": 0.17227456258411844, + "grad_norm": 1.2158699033191027, + "learning_rate": 9.478860272135452e-06, + "loss": 0.2074, + "step": 1152 + }, + { + "epoch": 0.1724241064752505, + "grad_norm": 1.5670367135087604, + "learning_rate": 9.477783211969273e-06, + "loss": 0.3216, + "step": 1153 + }, + { + "epoch": 0.17257365036638253, + "grad_norm": 1.1456620290612602, + "learning_rate": 9.476705101272e-06, + "loss": 0.1631, + "step": 1154 + }, + { + "epoch": 0.1727231942575146, + "grad_norm": 2.240627806354505, + "learning_rate": 9.475625940296567e-06, + "loss": 0.3657, + "step": 1155 + }, + { + "epoch": 0.17287273814864662, + "grad_norm": 1.5745880507830639, + "learning_rate": 9.474545729296152e-06, + "loss": 0.2223, + "step": 1156 + }, + { + "epoch": 0.17302228203977868, + "grad_norm": 2.112259044674356, + "learning_rate": 9.47346446852419e-06, + "loss": 0.6911, + "step": 1157 + }, + { + "epoch": 0.1731718259309107, + "grad_norm": 1.5810443743533733, + "learning_rate": 9.472382158234349e-06, + "loss": 0.3099, + "step": 1158 + }, + { + "epoch": 0.17332136982204277, + "grad_norm": 1.2778440299252458, + "learning_rate": 9.47129879868055e-06, + "loss": 0.2136, + "step": 1159 + }, + { + "epoch": 0.17347091371317483, + "grad_norm": 1.4299544830347497, + "learning_rate": 9.470214390116965e-06, + "loss": 0.2145, + "step": 1160 + }, + { + "epoch": 0.17362045760430686, + "grad_norm": 1.3698311545901527, + "learning_rate": 9.469128932798e-06, + "loss": 0.2377, + "step": 1161 + }, + { + "epoch": 0.17377000149543892, + "grad_norm": 1.2952413351897127, + "learning_rate": 9.468042426978319e-06, + "loss": 0.2452, + "step": 1162 + }, + { + "epoch": 0.17391954538657095, + "grad_norm": 1.3794054233492332, + "learning_rate": 9.466954872912823e-06, + "loss": 0.1923, + "step": 1163 + }, + { + "epoch": 0.174069089277703, + "grad_norm": 1.1390407334512478, + "learning_rate": 9.465866270856665e-06, + "loss": 0.251, + "step": 1164 + }, + { + "epoch": 0.17421863316883504, + "grad_norm": 2.280807269259536, + "learning_rate": 9.46477662106524e-06, + "loss": 0.5433, + "step": 1165 + }, + { + "epoch": 0.1743681770599671, + "grad_norm": 1.4481272121275899, + "learning_rate": 9.463685923794191e-06, + "loss": 0.3209, + "step": 1166 + }, + { + "epoch": 0.17451772095109916, + "grad_norm": 1.3194380931708358, + "learning_rate": 9.462594179299408e-06, + "loss": 0.2208, + "step": 1167 + }, + { + "epoch": 0.1746672648422312, + "grad_norm": 2.0036222277986, + "learning_rate": 9.46150138783702e-06, + "loss": 0.2654, + "step": 1168 + }, + { + "epoch": 0.17481680873336325, + "grad_norm": 0.8359301847512149, + "learning_rate": 9.460407549663411e-06, + "loss": 0.2131, + "step": 1169 + }, + { + "epoch": 0.17496635262449528, + "grad_norm": 1.8746705630094216, + "learning_rate": 9.459312665035203e-06, + "loss": 0.2415, + "step": 1170 + }, + { + "epoch": 0.17511589651562734, + "grad_norm": 2.187475783350278, + "learning_rate": 9.458216734209269e-06, + "loss": 0.6378, + "step": 1171 + }, + { + "epoch": 0.17526544040675937, + "grad_norm": 1.3984177897039258, + "learning_rate": 9.457119757442723e-06, + "loss": 0.2034, + "step": 1172 + }, + { + "epoch": 0.17541498429789143, + "grad_norm": 1.776218225609686, + "learning_rate": 9.456021734992928e-06, + "loss": 0.3717, + "step": 1173 + }, + { + "epoch": 0.1755645281890235, + "grad_norm": 1.360440620305923, + "learning_rate": 9.45492266711749e-06, + "loss": 0.2499, + "step": 1174 + }, + { + "epoch": 0.17571407208015552, + "grad_norm": 1.11686981125786, + "learning_rate": 9.453822554074259e-06, + "loss": 0.1718, + "step": 1175 + }, + { + "epoch": 0.17586361597128758, + "grad_norm": 1.7801052355130151, + "learning_rate": 9.452721396121333e-06, + "loss": 0.392, + "step": 1176 + }, + { + "epoch": 0.1760131598624196, + "grad_norm": 1.4238435171417776, + "learning_rate": 9.451619193517057e-06, + "loss": 0.4248, + "step": 1177 + }, + { + "epoch": 0.17616270375355167, + "grad_norm": 1.4639661830539468, + "learning_rate": 9.450515946520016e-06, + "loss": 0.2049, + "step": 1178 + }, + { + "epoch": 0.17631224764468373, + "grad_norm": 1.5499038616483287, + "learning_rate": 9.449411655389042e-06, + "loss": 0.4062, + "step": 1179 + }, + { + "epoch": 0.17646179153581576, + "grad_norm": 1.7136605249554677, + "learning_rate": 9.448306320383215e-06, + "loss": 0.3911, + "step": 1180 + }, + { + "epoch": 0.17661133542694782, + "grad_norm": 1.4594560853866743, + "learning_rate": 9.447199941761852e-06, + "loss": 0.2117, + "step": 1181 + }, + { + "epoch": 0.17676087931807985, + "grad_norm": 1.6116818115006617, + "learning_rate": 9.446092519784525e-06, + "loss": 0.19, + "step": 1182 + }, + { + "epoch": 0.1769104232092119, + "grad_norm": 1.5420971786965234, + "learning_rate": 9.444984054711044e-06, + "loss": 0.3199, + "step": 1183 + }, + { + "epoch": 0.17705996710034394, + "grad_norm": 1.6968645734800356, + "learning_rate": 9.443874546801465e-06, + "loss": 0.3796, + "step": 1184 + }, + { + "epoch": 0.177209510991476, + "grad_norm": 1.3181328359871092, + "learning_rate": 9.442763996316093e-06, + "loss": 0.3955, + "step": 1185 + }, + { + "epoch": 0.17735905488260806, + "grad_norm": 2.8227798204695733, + "learning_rate": 9.44165240351547e-06, + "loss": 0.5894, + "step": 1186 + }, + { + "epoch": 0.1775085987737401, + "grad_norm": 1.9231832633740291, + "learning_rate": 9.440539768660386e-06, + "loss": 0.3844, + "step": 1187 + }, + { + "epoch": 0.17765814266487215, + "grad_norm": 1.4508084994855301, + "learning_rate": 9.439426092011877e-06, + "loss": 0.1983, + "step": 1188 + }, + { + "epoch": 0.17780768655600418, + "grad_norm": 1.838910674129457, + "learning_rate": 9.438311373831224e-06, + "loss": 0.5758, + "step": 1189 + }, + { + "epoch": 0.17795723044713624, + "grad_norm": 1.878745494476723, + "learning_rate": 9.437195614379947e-06, + "loss": 0.4892, + "step": 1190 + }, + { + "epoch": 0.17810677433826827, + "grad_norm": 1.5763295601740275, + "learning_rate": 9.436078813919818e-06, + "loss": 0.4209, + "step": 1191 + }, + { + "epoch": 0.17825631822940033, + "grad_norm": 1.939965363898736, + "learning_rate": 9.434960972712846e-06, + "loss": 0.4915, + "step": 1192 + }, + { + "epoch": 0.1784058621205324, + "grad_norm": 1.777010297411083, + "learning_rate": 9.433842091021287e-06, + "loss": 0.4445, + "step": 1193 + }, + { + "epoch": 0.17855540601166442, + "grad_norm": 1.8576886519979177, + "learning_rate": 9.432722169107647e-06, + "loss": 0.2065, + "step": 1194 + }, + { + "epoch": 0.17870494990279648, + "grad_norm": 1.6414559576032928, + "learning_rate": 9.431601207234663e-06, + "loss": 0.2878, + "step": 1195 + }, + { + "epoch": 0.1788544937939285, + "grad_norm": 1.6433452397947506, + "learning_rate": 9.430479205665329e-06, + "loss": 0.3933, + "step": 1196 + }, + { + "epoch": 0.17900403768506057, + "grad_norm": 1.3219617553631218, + "learning_rate": 9.429356164662872e-06, + "loss": 0.1886, + "step": 1197 + }, + { + "epoch": 0.1791535815761926, + "grad_norm": 1.3673182854086454, + "learning_rate": 9.428232084490774e-06, + "loss": 0.2098, + "step": 1198 + }, + { + "epoch": 0.17930312546732466, + "grad_norm": 1.4932716672657123, + "learning_rate": 9.427106965412752e-06, + "loss": 0.1868, + "step": 1199 + }, + { + "epoch": 0.17945266935845672, + "grad_norm": 1.2835655324809725, + "learning_rate": 9.425980807692771e-06, + "loss": 0.2841, + "step": 1200 + }, + { + "epoch": 0.17960221324958875, + "grad_norm": 1.5229676329003083, + "learning_rate": 9.424853611595037e-06, + "loss": 0.429, + "step": 1201 + }, + { + "epoch": 0.1797517571407208, + "grad_norm": 1.3834763754455093, + "learning_rate": 9.423725377384e-06, + "loss": 0.199, + "step": 1202 + }, + { + "epoch": 0.17990130103185284, + "grad_norm": 1.172574987366, + "learning_rate": 9.42259610532436e-06, + "loss": 0.2422, + "step": 1203 + }, + { + "epoch": 0.1800508449229849, + "grad_norm": 2.3677332220742753, + "learning_rate": 9.421465795681048e-06, + "loss": 0.6703, + "step": 1204 + }, + { + "epoch": 0.18020038881411693, + "grad_norm": 1.743670576433428, + "learning_rate": 9.420334448719251e-06, + "loss": 0.3879, + "step": 1205 + }, + { + "epoch": 0.180349932705249, + "grad_norm": 1.4354052350500734, + "learning_rate": 9.419202064704393e-06, + "loss": 0.2261, + "step": 1206 + }, + { + "epoch": 0.18049947659638105, + "grad_norm": 1.5355684537494616, + "learning_rate": 9.41806864390214e-06, + "loss": 0.2323, + "step": 1207 + }, + { + "epoch": 0.18064902048751308, + "grad_norm": 2.7749706919729067, + "learning_rate": 9.416934186578403e-06, + "loss": 0.2457, + "step": 1208 + }, + { + "epoch": 0.18079856437864514, + "grad_norm": 1.7998786782731084, + "learning_rate": 9.41579869299934e-06, + "loss": 0.5115, + "step": 1209 + }, + { + "epoch": 0.18094810826977717, + "grad_norm": 1.722249547477117, + "learning_rate": 9.414662163431347e-06, + "loss": 0.3978, + "step": 1210 + }, + { + "epoch": 0.18109765216090923, + "grad_norm": 1.470878100530038, + "learning_rate": 9.413524598141065e-06, + "loss": 0.3655, + "step": 1211 + }, + { + "epoch": 0.1812471960520413, + "grad_norm": 1.2321837235938764, + "learning_rate": 9.412385997395377e-06, + "loss": 0.206, + "step": 1212 + }, + { + "epoch": 0.18139673994317332, + "grad_norm": 1.71695513424398, + "learning_rate": 9.41124636146141e-06, + "loss": 0.2988, + "step": 1213 + }, + { + "epoch": 0.18154628383430538, + "grad_norm": 1.5665377801862033, + "learning_rate": 9.410105690606533e-06, + "loss": 0.435, + "step": 1214 + }, + { + "epoch": 0.1816958277254374, + "grad_norm": 1.9574571743722469, + "learning_rate": 9.40896398509836e-06, + "loss": 0.3844, + "step": 1215 + }, + { + "epoch": 0.18184537161656947, + "grad_norm": 1.732474617655161, + "learning_rate": 9.407821245204746e-06, + "loss": 0.4532, + "step": 1216 + }, + { + "epoch": 0.1819949155077015, + "grad_norm": 1.6282505343946028, + "learning_rate": 9.406677471193788e-06, + "loss": 0.346, + "step": 1217 + }, + { + "epoch": 0.18214445939883356, + "grad_norm": 1.7687288700904007, + "learning_rate": 9.405532663333826e-06, + "loss": 0.2398, + "step": 1218 + }, + { + "epoch": 0.18229400328996562, + "grad_norm": 1.710427334466053, + "learning_rate": 9.404386821893442e-06, + "loss": 0.2851, + "step": 1219 + }, + { + "epoch": 0.18244354718109765, + "grad_norm": 1.017825559673437, + "learning_rate": 9.403239947141467e-06, + "loss": 0.1898, + "step": 1220 + }, + { + "epoch": 0.1825930910722297, + "grad_norm": 1.7058191164095473, + "learning_rate": 9.402092039346961e-06, + "loss": 0.3391, + "step": 1221 + }, + { + "epoch": 0.18274263496336174, + "grad_norm": 1.2429292971081916, + "learning_rate": 9.40094309877924e-06, + "loss": 0.247, + "step": 1222 + }, + { + "epoch": 0.1828921788544938, + "grad_norm": 1.26527696992994, + "learning_rate": 9.399793125707853e-06, + "loss": 0.2229, + "step": 1223 + }, + { + "epoch": 0.18304172274562583, + "grad_norm": 1.150282472600963, + "learning_rate": 9.398642120402596e-06, + "loss": 0.2145, + "step": 1224 + }, + { + "epoch": 0.1831912666367579, + "grad_norm": 1.3914149403501497, + "learning_rate": 9.39749008313351e-06, + "loss": 0.231, + "step": 1225 + }, + { + "epoch": 0.18334081052788995, + "grad_norm": 1.3685090802839712, + "learning_rate": 9.396337014170866e-06, + "loss": 0.1872, + "step": 1226 + }, + { + "epoch": 0.18349035441902198, + "grad_norm": 1.6709772065779387, + "learning_rate": 9.395182913785192e-06, + "loss": 0.2055, + "step": 1227 + }, + { + "epoch": 0.18363989831015404, + "grad_norm": 2.0418194880673783, + "learning_rate": 9.394027782247247e-06, + "loss": 0.4888, + "step": 1228 + }, + { + "epoch": 0.18378944220128607, + "grad_norm": 1.5794839342981186, + "learning_rate": 9.392871619828036e-06, + "loss": 0.3355, + "step": 1229 + }, + { + "epoch": 0.18393898609241813, + "grad_norm": 2.365767436986478, + "learning_rate": 9.39171442679881e-06, + "loss": 0.4306, + "step": 1230 + }, + { + "epoch": 0.18408852998355016, + "grad_norm": 1.535735557296357, + "learning_rate": 9.390556203431053e-06, + "loss": 0.3454, + "step": 1231 + }, + { + "epoch": 0.18423807387468222, + "grad_norm": 2.0146640105762, + "learning_rate": 9.3893969499965e-06, + "loss": 0.5002, + "step": 1232 + }, + { + "epoch": 0.18438761776581428, + "grad_norm": 1.0888630229716356, + "learning_rate": 9.388236666767119e-06, + "loss": 0.1717, + "step": 1233 + }, + { + "epoch": 0.1845371616569463, + "grad_norm": 1.471926551369625, + "learning_rate": 9.387075354015125e-06, + "loss": 0.2728, + "step": 1234 + }, + { + "epoch": 0.18468670554807837, + "grad_norm": 1.2418392055984802, + "learning_rate": 9.385913012012972e-06, + "loss": 0.2338, + "step": 1235 + }, + { + "epoch": 0.1848362494392104, + "grad_norm": 1.1326547586847213, + "learning_rate": 9.384749641033358e-06, + "loss": 0.2014, + "step": 1236 + }, + { + "epoch": 0.18498579333034246, + "grad_norm": 1.2625669973249032, + "learning_rate": 9.383585241349223e-06, + "loss": 0.2257, + "step": 1237 + }, + { + "epoch": 0.18513533722147452, + "grad_norm": 2.0231610702160494, + "learning_rate": 9.382419813233741e-06, + "loss": 0.6136, + "step": 1238 + }, + { + "epoch": 0.18528488111260655, + "grad_norm": 2.2265632194384035, + "learning_rate": 9.381253356960339e-06, + "loss": 0.379, + "step": 1239 + }, + { + "epoch": 0.1854344250037386, + "grad_norm": 1.7519589257117685, + "learning_rate": 9.380085872802672e-06, + "loss": 0.4481, + "step": 1240 + }, + { + "epoch": 0.18558396889487064, + "grad_norm": 1.8894247538731719, + "learning_rate": 9.37891736103465e-06, + "loss": 0.2349, + "step": 1241 + }, + { + "epoch": 0.1857335127860027, + "grad_norm": 2.0840730120928153, + "learning_rate": 9.377747821930411e-06, + "loss": 0.386, + "step": 1242 + }, + { + "epoch": 0.18588305667713473, + "grad_norm": 1.2303960642463392, + "learning_rate": 9.376577255764346e-06, + "loss": 0.2138, + "step": 1243 + }, + { + "epoch": 0.1860326005682668, + "grad_norm": 1.0736052443136495, + "learning_rate": 9.375405662811076e-06, + "loss": 0.2919, + "step": 1244 + }, + { + "epoch": 0.18618214445939885, + "grad_norm": 2.1694546083973236, + "learning_rate": 9.37423304334547e-06, + "loss": 0.4716, + "step": 1245 + }, + { + "epoch": 0.18633168835053088, + "grad_norm": 1.7953994969561728, + "learning_rate": 9.373059397642637e-06, + "loss": 0.2303, + "step": 1246 + }, + { + "epoch": 0.18648123224166294, + "grad_norm": 1.1331346308690267, + "learning_rate": 9.371884725977924e-06, + "loss": 0.1681, + "step": 1247 + }, + { + "epoch": 0.18663077613279497, + "grad_norm": 1.8818511328803789, + "learning_rate": 9.370709028626921e-06, + "loss": 0.3736, + "step": 1248 + }, + { + "epoch": 0.18678032002392703, + "grad_norm": 1.080778831083804, + "learning_rate": 9.369532305865459e-06, + "loss": 0.2155, + "step": 1249 + }, + { + "epoch": 0.18692986391505906, + "grad_norm": 2.005915788639095, + "learning_rate": 9.368354557969606e-06, + "loss": 0.4026, + "step": 1250 + }, + { + "epoch": 0.18707940780619112, + "grad_norm": 0.8756048068588032, + "learning_rate": 9.367175785215674e-06, + "loss": 0.183, + "step": 1251 + }, + { + "epoch": 0.18722895169732318, + "grad_norm": 2.100099144522435, + "learning_rate": 9.365995987880216e-06, + "loss": 0.182, + "step": 1252 + }, + { + "epoch": 0.1873784955884552, + "grad_norm": 1.9441741117291806, + "learning_rate": 9.364815166240023e-06, + "loss": 0.3865, + "step": 1253 + }, + { + "epoch": 0.18752803947958727, + "grad_norm": 1.1762687046527927, + "learning_rate": 9.363633320572124e-06, + "loss": 0.2105, + "step": 1254 + }, + { + "epoch": 0.1876775833707193, + "grad_norm": 1.170057500642311, + "learning_rate": 9.362450451153795e-06, + "loss": 0.201, + "step": 1255 + }, + { + "epoch": 0.18782712726185136, + "grad_norm": 1.774823231357248, + "learning_rate": 9.36126655826255e-06, + "loss": 0.2958, + "step": 1256 + }, + { + "epoch": 0.1879766711529834, + "grad_norm": 1.847386943393164, + "learning_rate": 9.360081642176137e-06, + "loss": 0.2783, + "step": 1257 + }, + { + "epoch": 0.18812621504411545, + "grad_norm": 1.2317043700510546, + "learning_rate": 9.358895703172552e-06, + "loss": 0.2237, + "step": 1258 + }, + { + "epoch": 0.1882757589352475, + "grad_norm": 1.4962863598933458, + "learning_rate": 9.357708741530025e-06, + "loss": 0.208, + "step": 1259 + }, + { + "epoch": 0.18842530282637954, + "grad_norm": 1.753807685308467, + "learning_rate": 9.356520757527032e-06, + "loss": 0.513, + "step": 1260 + }, + { + "epoch": 0.1885748467175116, + "grad_norm": 2.457507671022133, + "learning_rate": 9.355331751442284e-06, + "loss": 0.8743, + "step": 1261 + }, + { + "epoch": 0.18872439060864363, + "grad_norm": 1.7444181168119555, + "learning_rate": 9.354141723554734e-06, + "loss": 0.3346, + "step": 1262 + }, + { + "epoch": 0.1888739344997757, + "grad_norm": 1.971056965005781, + "learning_rate": 9.35295067414357e-06, + "loss": 0.2297, + "step": 1263 + }, + { + "epoch": 0.18902347839090772, + "grad_norm": 1.5052086349314306, + "learning_rate": 9.35175860348823e-06, + "loss": 0.2149, + "step": 1264 + }, + { + "epoch": 0.18917302228203978, + "grad_norm": 1.4361302390685748, + "learning_rate": 9.35056551186838e-06, + "loss": 0.3298, + "step": 1265 + }, + { + "epoch": 0.18932256617317184, + "grad_norm": 1.4206462492110938, + "learning_rate": 9.349371399563935e-06, + "loss": 0.1929, + "step": 1266 + }, + { + "epoch": 0.18947211006430387, + "grad_norm": 1.4913953281160535, + "learning_rate": 9.348176266855042e-06, + "loss": 0.2526, + "step": 1267 + }, + { + "epoch": 0.18962165395543593, + "grad_norm": 2.365270322972236, + "learning_rate": 9.346980114022092e-06, + "loss": 0.5066, + "step": 1268 + }, + { + "epoch": 0.18977119784656796, + "grad_norm": 1.8955954404187068, + "learning_rate": 9.345782941345714e-06, + "loss": 0.4404, + "step": 1269 + }, + { + "epoch": 0.18992074173770002, + "grad_norm": 1.1285554099802715, + "learning_rate": 9.344584749106775e-06, + "loss": 0.2001, + "step": 1270 + }, + { + "epoch": 0.19007028562883208, + "grad_norm": 1.6210163029014748, + "learning_rate": 9.343385537586385e-06, + "loss": 0.3274, + "step": 1271 + }, + { + "epoch": 0.1902198295199641, + "grad_norm": 1.4031028340124463, + "learning_rate": 9.342185307065888e-06, + "loss": 0.1922, + "step": 1272 + }, + { + "epoch": 0.19036937341109617, + "grad_norm": 1.5025368685887945, + "learning_rate": 9.340984057826872e-06, + "loss": 0.4106, + "step": 1273 + }, + { + "epoch": 0.1905189173022282, + "grad_norm": 1.3363573267962257, + "learning_rate": 9.339781790151159e-06, + "loss": 0.2906, + "step": 1274 + }, + { + "epoch": 0.19066846119336026, + "grad_norm": 2.2033082601743263, + "learning_rate": 9.338578504320815e-06, + "loss": 0.3913, + "step": 1275 + }, + { + "epoch": 0.1908180050844923, + "grad_norm": 1.5703985946217345, + "learning_rate": 9.337374200618141e-06, + "loss": 0.2363, + "step": 1276 + }, + { + "epoch": 0.19096754897562435, + "grad_norm": 1.8441964019229968, + "learning_rate": 9.336168879325678e-06, + "loss": 0.2193, + "step": 1277 + }, + { + "epoch": 0.1911170928667564, + "grad_norm": 1.9461041413852502, + "learning_rate": 9.334962540726208e-06, + "loss": 0.3327, + "step": 1278 + }, + { + "epoch": 0.19126663675788844, + "grad_norm": 1.987695873853033, + "learning_rate": 9.333755185102747e-06, + "loss": 0.5218, + "step": 1279 + }, + { + "epoch": 0.1914161806490205, + "grad_norm": 1.9784687710756435, + "learning_rate": 9.332546812738555e-06, + "loss": 0.4903, + "step": 1280 + }, + { + "epoch": 0.19156572454015253, + "grad_norm": 1.6276484190964966, + "learning_rate": 9.331337423917126e-06, + "loss": 0.3464, + "step": 1281 + }, + { + "epoch": 0.19171526843128459, + "grad_norm": 1.2032949092333924, + "learning_rate": 9.330127018922195e-06, + "loss": 0.1529, + "step": 1282 + }, + { + "epoch": 0.19186481232241662, + "grad_norm": 1.8200067255136916, + "learning_rate": 9.328915598037733e-06, + "loss": 0.4354, + "step": 1283 + }, + { + "epoch": 0.19201435621354868, + "grad_norm": 1.4794611609702433, + "learning_rate": 9.327703161547952e-06, + "loss": 0.2071, + "step": 1284 + }, + { + "epoch": 0.19216390010468073, + "grad_norm": 1.8032485542741101, + "learning_rate": 9.326489709737303e-06, + "loss": 0.3813, + "step": 1285 + }, + { + "epoch": 0.19231344399581277, + "grad_norm": 1.3455083426481262, + "learning_rate": 9.325275242890472e-06, + "loss": 0.1853, + "step": 1286 + }, + { + "epoch": 0.19246298788694483, + "grad_norm": 2.125860082250447, + "learning_rate": 9.324059761292385e-06, + "loss": 0.71, + "step": 1287 + }, + { + "epoch": 0.19261253177807686, + "grad_norm": 1.8089276946794224, + "learning_rate": 9.322843265228206e-06, + "loss": 0.3672, + "step": 1288 + }, + { + "epoch": 0.19276207566920892, + "grad_norm": 1.8361099385383872, + "learning_rate": 9.321625754983335e-06, + "loss": 0.3484, + "step": 1289 + }, + { + "epoch": 0.19291161956034095, + "grad_norm": 1.6363642315445044, + "learning_rate": 9.320407230843413e-06, + "loss": 0.3042, + "step": 1290 + }, + { + "epoch": 0.193061163451473, + "grad_norm": 2.021061654973304, + "learning_rate": 9.319187693094318e-06, + "loss": 0.5033, + "step": 1291 + }, + { + "epoch": 0.19321070734260506, + "grad_norm": 2.62527535830696, + "learning_rate": 9.317967142022163e-06, + "loss": 0.275, + "step": 1292 + }, + { + "epoch": 0.1933602512337371, + "grad_norm": 1.3806620083144838, + "learning_rate": 9.316745577913304e-06, + "loss": 0.2855, + "step": 1293 + }, + { + "epoch": 0.19350979512486916, + "grad_norm": 1.7655797800670596, + "learning_rate": 9.31552300105433e-06, + "loss": 0.3915, + "step": 1294 + }, + { + "epoch": 0.1936593390160012, + "grad_norm": 1.3465049850252158, + "learning_rate": 9.314299411732069e-06, + "loss": 0.249, + "step": 1295 + }, + { + "epoch": 0.19380888290713325, + "grad_norm": 0.8256201784583667, + "learning_rate": 9.313074810233589e-06, + "loss": 0.1543, + "step": 1296 + }, + { + "epoch": 0.1939584267982653, + "grad_norm": 1.9261313307855723, + "learning_rate": 9.31184919684619e-06, + "loss": 0.6008, + "step": 1297 + }, + { + "epoch": 0.19410797068939734, + "grad_norm": 1.519619673139573, + "learning_rate": 9.310622571857417e-06, + "loss": 0.239, + "step": 1298 + }, + { + "epoch": 0.1942575145805294, + "grad_norm": 2.0122490785681717, + "learning_rate": 9.309394935555042e-06, + "loss": 0.33, + "step": 1299 + }, + { + "epoch": 0.19440705847166143, + "grad_norm": 1.5990433489122537, + "learning_rate": 9.308166288227088e-06, + "loss": 0.4012, + "step": 1300 + }, + { + "epoch": 0.19455660236279348, + "grad_norm": 1.4956175342537672, + "learning_rate": 9.3069366301618e-06, + "loss": 0.5736, + "step": 1301 + }, + { + "epoch": 0.19470614625392552, + "grad_norm": 1.2896015261249874, + "learning_rate": 9.305705961647672e-06, + "loss": 0.1798, + "step": 1302 + }, + { + "epoch": 0.19485569014505758, + "grad_norm": 1.3006798401099697, + "learning_rate": 9.304474282973432e-06, + "loss": 0.3653, + "step": 1303 + }, + { + "epoch": 0.19500523403618963, + "grad_norm": 2.1249741454515054, + "learning_rate": 9.30324159442804e-06, + "loss": 0.7342, + "step": 1304 + }, + { + "epoch": 0.19515477792732167, + "grad_norm": 1.486234854981151, + "learning_rate": 9.302007896300697e-06, + "loss": 0.2874, + "step": 1305 + }, + { + "epoch": 0.19530432181845372, + "grad_norm": 1.486723968689139, + "learning_rate": 9.300773188880843e-06, + "loss": 0.2301, + "step": 1306 + }, + { + "epoch": 0.19545386570958576, + "grad_norm": 1.697087725237096, + "learning_rate": 9.29953747245815e-06, + "loss": 0.3191, + "step": 1307 + }, + { + "epoch": 0.19560340960071781, + "grad_norm": 1.9495679450825656, + "learning_rate": 9.29830074732253e-06, + "loss": 0.3954, + "step": 1308 + }, + { + "epoch": 0.19575295349184985, + "grad_norm": 1.2105140299371033, + "learning_rate": 9.29706301376413e-06, + "loss": 0.2557, + "step": 1309 + }, + { + "epoch": 0.1959024973829819, + "grad_norm": 1.547760007057872, + "learning_rate": 9.295824272073334e-06, + "loss": 0.2865, + "step": 1310 + }, + { + "epoch": 0.19605204127411396, + "grad_norm": 1.6181428409490188, + "learning_rate": 9.294584522540766e-06, + "loss": 0.3332, + "step": 1311 + }, + { + "epoch": 0.196201585165246, + "grad_norm": 1.664852192256293, + "learning_rate": 9.293343765457278e-06, + "loss": 0.3058, + "step": 1312 + }, + { + "epoch": 0.19635112905637805, + "grad_norm": 1.8608018825705885, + "learning_rate": 9.292102001113968e-06, + "loss": 0.3048, + "step": 1313 + }, + { + "epoch": 0.19650067294751009, + "grad_norm": 1.420503009424543, + "learning_rate": 9.290859229802162e-06, + "loss": 0.2283, + "step": 1314 + }, + { + "epoch": 0.19665021683864214, + "grad_norm": 1.996771180524021, + "learning_rate": 9.289615451813428e-06, + "loss": 0.4804, + "step": 1315 + }, + { + "epoch": 0.19679976072977418, + "grad_norm": 1.8295883207210475, + "learning_rate": 9.28837066743957e-06, + "loss": 0.3065, + "step": 1316 + }, + { + "epoch": 0.19694930462090623, + "grad_norm": 1.6508456488855519, + "learning_rate": 9.287124876972625e-06, + "loss": 0.2617, + "step": 1317 + }, + { + "epoch": 0.1970988485120383, + "grad_norm": 1.7646395203323395, + "learning_rate": 9.285878080704866e-06, + "loss": 0.3484, + "step": 1318 + }, + { + "epoch": 0.19724839240317033, + "grad_norm": 1.6976643786387164, + "learning_rate": 9.284630278928805e-06, + "loss": 0.4485, + "step": 1319 + }, + { + "epoch": 0.19739793629430238, + "grad_norm": 1.6867112197107144, + "learning_rate": 9.283381471937188e-06, + "loss": 0.381, + "step": 1320 + }, + { + "epoch": 0.19754748018543442, + "grad_norm": 1.650278888960391, + "learning_rate": 9.282131660022997e-06, + "loss": 0.2289, + "step": 1321 + }, + { + "epoch": 0.19769702407656647, + "grad_norm": 2.0028177208667977, + "learning_rate": 9.28088084347945e-06, + "loss": 0.5132, + "step": 1322 + }, + { + "epoch": 0.19784656796769853, + "grad_norm": 1.030996633782416, + "learning_rate": 9.279629022600002e-06, + "loss": 0.1764, + "step": 1323 + }, + { + "epoch": 0.19799611185883056, + "grad_norm": 1.448960209222983, + "learning_rate": 9.27837619767834e-06, + "loss": 0.2575, + "step": 1324 + }, + { + "epoch": 0.19814565574996262, + "grad_norm": 2.099657510604881, + "learning_rate": 9.27712236900839e-06, + "loss": 0.4117, + "step": 1325 + }, + { + "epoch": 0.19829519964109465, + "grad_norm": 1.1059240433107884, + "learning_rate": 9.27586753688431e-06, + "loss": 0.2064, + "step": 1326 + }, + { + "epoch": 0.19844474353222671, + "grad_norm": 0.9819176984170996, + "learning_rate": 9.274611701600502e-06, + "loss": 0.2357, + "step": 1327 + }, + { + "epoch": 0.19859428742335875, + "grad_norm": 1.5836465571763443, + "learning_rate": 9.273354863451589e-06, + "loss": 0.2478, + "step": 1328 + }, + { + "epoch": 0.1987438313144908, + "grad_norm": 1.8864795778055325, + "learning_rate": 9.272097022732444e-06, + "loss": 0.3705, + "step": 1329 + }, + { + "epoch": 0.19889337520562286, + "grad_norm": 1.5474710953559745, + "learning_rate": 9.270838179738164e-06, + "loss": 0.1888, + "step": 1330 + }, + { + "epoch": 0.1990429190967549, + "grad_norm": 1.6239105754270915, + "learning_rate": 9.269578334764087e-06, + "loss": 0.3698, + "step": 1331 + }, + { + "epoch": 0.19919246298788695, + "grad_norm": 1.033656840947032, + "learning_rate": 9.268317488105787e-06, + "loss": 0.1741, + "step": 1332 + }, + { + "epoch": 0.19934200687901898, + "grad_norm": 1.2644053752133695, + "learning_rate": 9.267055640059068e-06, + "loss": 0.3292, + "step": 1333 + }, + { + "epoch": 0.19949155077015104, + "grad_norm": 2.0362957566742224, + "learning_rate": 9.265792790919972e-06, + "loss": 0.3348, + "step": 1334 + }, + { + "epoch": 0.19964109466128308, + "grad_norm": 1.528100233715919, + "learning_rate": 9.264528940984777e-06, + "loss": 0.2456, + "step": 1335 + }, + { + "epoch": 0.19979063855241513, + "grad_norm": 1.420463259410139, + "learning_rate": 9.263264090549992e-06, + "loss": 0.3396, + "step": 1336 + }, + { + "epoch": 0.1999401824435472, + "grad_norm": 1.7091828149298964, + "learning_rate": 9.261998239912367e-06, + "loss": 0.2596, + "step": 1337 + }, + { + "epoch": 0.20008972633467922, + "grad_norm": 2.259480708170105, + "learning_rate": 9.26073138936888e-06, + "loss": 0.4451, + "step": 1338 + }, + { + "epoch": 0.20023927022581128, + "grad_norm": 1.8137629934472514, + "learning_rate": 9.259463539216746e-06, + "loss": 0.3549, + "step": 1339 + }, + { + "epoch": 0.20038881411694331, + "grad_norm": 1.1443459601678743, + "learning_rate": 9.258194689753417e-06, + "loss": 0.228, + "step": 1340 + }, + { + "epoch": 0.20053835800807537, + "grad_norm": 1.9827124471950097, + "learning_rate": 9.256924841276576e-06, + "loss": 0.2773, + "step": 1341 + }, + { + "epoch": 0.2006879018992074, + "grad_norm": 1.2970841523500378, + "learning_rate": 9.25565399408414e-06, + "loss": 0.2164, + "step": 1342 + }, + { + "epoch": 0.20083744579033946, + "grad_norm": 1.8476680005205972, + "learning_rate": 9.254382148474264e-06, + "loss": 0.3472, + "step": 1343 + }, + { + "epoch": 0.20098698968147152, + "grad_norm": 0.9750041200477407, + "learning_rate": 9.253109304745335e-06, + "loss": 0.1686, + "step": 1344 + }, + { + "epoch": 0.20113653357260355, + "grad_norm": 1.9603557255590418, + "learning_rate": 9.251835463195977e-06, + "loss": 0.5067, + "step": 1345 + }, + { + "epoch": 0.2012860774637356, + "grad_norm": 1.8365618678908586, + "learning_rate": 9.25056062412504e-06, + "loss": 0.3701, + "step": 1346 + }, + { + "epoch": 0.20143562135486764, + "grad_norm": 1.7722339453250509, + "learning_rate": 9.249284787831617e-06, + "loss": 0.4201, + "step": 1347 + }, + { + "epoch": 0.2015851652459997, + "grad_norm": 1.6882388762311373, + "learning_rate": 9.24800795461503e-06, + "loss": 0.3566, + "step": 1348 + }, + { + "epoch": 0.20173470913713173, + "grad_norm": 1.3408931481790511, + "learning_rate": 9.246730124774839e-06, + "loss": 0.3711, + "step": 1349 + }, + { + "epoch": 0.2018842530282638, + "grad_norm": 1.5904902378620338, + "learning_rate": 9.245451298610833e-06, + "loss": 0.2285, + "step": 1350 + }, + { + "epoch": 0.20203379691939585, + "grad_norm": 1.24091223117027, + "learning_rate": 9.244171476423037e-06, + "loss": 0.1831, + "step": 1351 + }, + { + "epoch": 0.20218334081052788, + "grad_norm": 1.8606049117343595, + "learning_rate": 9.24289065851171e-06, + "loss": 0.2355, + "step": 1352 + }, + { + "epoch": 0.20233288470165994, + "grad_norm": 1.3471226340640614, + "learning_rate": 9.241608845177344e-06, + "loss": 0.2296, + "step": 1353 + }, + { + "epoch": 0.20248242859279197, + "grad_norm": 1.8826806706856583, + "learning_rate": 9.240326036720665e-06, + "loss": 0.3794, + "step": 1354 + }, + { + "epoch": 0.20263197248392403, + "grad_norm": 1.6131643098193205, + "learning_rate": 9.239042233442632e-06, + "loss": 0.2487, + "step": 1355 + }, + { + "epoch": 0.2027815163750561, + "grad_norm": 1.250046228507658, + "learning_rate": 9.23775743564444e-06, + "loss": 0.362, + "step": 1356 + }, + { + "epoch": 0.20293106026618812, + "grad_norm": 1.680459140251771, + "learning_rate": 9.236471643627512e-06, + "loss": 0.218, + "step": 1357 + }, + { + "epoch": 0.20308060415732018, + "grad_norm": 3.806087040662127, + "learning_rate": 9.235184857693506e-06, + "loss": 0.3938, + "step": 1358 + }, + { + "epoch": 0.2032301480484522, + "grad_norm": 1.7243993707977876, + "learning_rate": 9.233897078144317e-06, + "loss": 0.2151, + "step": 1359 + }, + { + "epoch": 0.20337969193958427, + "grad_norm": 1.529478064822457, + "learning_rate": 9.23260830528207e-06, + "loss": 0.3384, + "step": 1360 + }, + { + "epoch": 0.2035292358307163, + "grad_norm": 1.4829330919870423, + "learning_rate": 9.231318539409124e-06, + "loss": 0.3183, + "step": 1361 + }, + { + "epoch": 0.20367877972184836, + "grad_norm": 1.332269401180316, + "learning_rate": 9.23002778082807e-06, + "loss": 0.1772, + "step": 1362 + }, + { + "epoch": 0.20382832361298042, + "grad_norm": 1.0915215645001923, + "learning_rate": 9.228736029841732e-06, + "loss": 0.1997, + "step": 1363 + }, + { + "epoch": 0.20397786750411245, + "grad_norm": 1.159153234988241, + "learning_rate": 9.227443286753167e-06, + "loss": 0.1916, + "step": 1364 + }, + { + "epoch": 0.2041274113952445, + "grad_norm": 1.7356076242142402, + "learning_rate": 9.226149551865665e-06, + "loss": 0.3072, + "step": 1365 + }, + { + "epoch": 0.20427695528637654, + "grad_norm": 1.4127677237560496, + "learning_rate": 9.224854825482752e-06, + "loss": 0.2194, + "step": 1366 + }, + { + "epoch": 0.2044264991775086, + "grad_norm": 1.5370329143503136, + "learning_rate": 9.223559107908178e-06, + "loss": 0.3882, + "step": 1367 + }, + { + "epoch": 0.20457604306864063, + "grad_norm": 1.231450996718572, + "learning_rate": 9.222262399445934e-06, + "loss": 0.2097, + "step": 1368 + }, + { + "epoch": 0.2047255869597727, + "grad_norm": 2.132300752161669, + "learning_rate": 9.22096470040024e-06, + "loss": 0.5167, + "step": 1369 + }, + { + "epoch": 0.20487513085090475, + "grad_norm": 1.6056336366949036, + "learning_rate": 9.219666011075548e-06, + "loss": 0.4712, + "step": 1370 + }, + { + "epoch": 0.20502467474203678, + "grad_norm": 1.0274444121008708, + "learning_rate": 9.218366331776543e-06, + "loss": 0.1489, + "step": 1371 + }, + { + "epoch": 0.20517421863316884, + "grad_norm": 1.7166617308295589, + "learning_rate": 9.217065662808143e-06, + "loss": 0.2071, + "step": 1372 + }, + { + "epoch": 0.20532376252430087, + "grad_norm": 1.678765123523581, + "learning_rate": 9.215764004475496e-06, + "loss": 0.3746, + "step": 1373 + }, + { + "epoch": 0.20547330641543293, + "grad_norm": 1.5537365180594542, + "learning_rate": 9.214461357083986e-06, + "loss": 0.1505, + "step": 1374 + }, + { + "epoch": 0.20562285030656496, + "grad_norm": 1.3435785882872704, + "learning_rate": 9.213157720939226e-06, + "loss": 0.1997, + "step": 1375 + }, + { + "epoch": 0.20577239419769702, + "grad_norm": 1.5875355486569163, + "learning_rate": 9.211853096347059e-06, + "loss": 0.1984, + "step": 1376 + }, + { + "epoch": 0.20592193808882908, + "grad_norm": 1.1101394692319788, + "learning_rate": 9.210547483613566e-06, + "loss": 0.1921, + "step": 1377 + }, + { + "epoch": 0.2060714819799611, + "grad_norm": 1.4017863241620425, + "learning_rate": 9.209240883045054e-06, + "loss": 0.2018, + "step": 1378 + }, + { + "epoch": 0.20622102587109317, + "grad_norm": 1.7309276589079257, + "learning_rate": 9.207933294948064e-06, + "loss": 0.2325, + "step": 1379 + }, + { + "epoch": 0.2063705697622252, + "grad_norm": 2.2693408682674647, + "learning_rate": 9.206624719629371e-06, + "loss": 0.6731, + "step": 1380 + }, + { + "epoch": 0.20652011365335726, + "grad_norm": 1.6617142222154033, + "learning_rate": 9.205315157395978e-06, + "loss": 0.2133, + "step": 1381 + }, + { + "epoch": 0.20666965754448932, + "grad_norm": 1.5651100627218977, + "learning_rate": 9.20400460855512e-06, + "loss": 0.3562, + "step": 1382 + }, + { + "epoch": 0.20681920143562135, + "grad_norm": 1.412248222099086, + "learning_rate": 9.202693073414267e-06, + "loss": 0.2117, + "step": 1383 + }, + { + "epoch": 0.2069687453267534, + "grad_norm": 2.058721397249849, + "learning_rate": 9.201380552281114e-06, + "loss": 0.4157, + "step": 1384 + }, + { + "epoch": 0.20711828921788544, + "grad_norm": 1.190322717831582, + "learning_rate": 9.200067045463594e-06, + "loss": 0.1468, + "step": 1385 + }, + { + "epoch": 0.2072678331090175, + "grad_norm": 1.6727715120686848, + "learning_rate": 9.198752553269867e-06, + "loss": 0.3737, + "step": 1386 + }, + { + "epoch": 0.20741737700014953, + "grad_norm": 1.6151445864315066, + "learning_rate": 9.197437076008328e-06, + "loss": 0.3504, + "step": 1387 + }, + { + "epoch": 0.2075669208912816, + "grad_norm": 1.8626136343488717, + "learning_rate": 9.196120613987596e-06, + "loss": 0.1897, + "step": 1388 + }, + { + "epoch": 0.20771646478241365, + "grad_norm": 2.1277783512048165, + "learning_rate": 9.19480316751653e-06, + "loss": 0.5398, + "step": 1389 + }, + { + "epoch": 0.20786600867354568, + "grad_norm": 1.6518405571968426, + "learning_rate": 9.193484736904214e-06, + "loss": 0.3226, + "step": 1390 + }, + { + "epoch": 0.20801555256467774, + "grad_norm": 1.680879037198174, + "learning_rate": 9.192165322459965e-06, + "loss": 0.2825, + "step": 1391 + }, + { + "epoch": 0.20816509645580977, + "grad_norm": 1.7490131840067196, + "learning_rate": 9.19084492449333e-06, + "loss": 0.4683, + "step": 1392 + }, + { + "epoch": 0.20831464034694183, + "grad_norm": 1.3086537105430815, + "learning_rate": 9.189523543314087e-06, + "loss": 0.2554, + "step": 1393 + }, + { + "epoch": 0.20846418423807386, + "grad_norm": 12.365317544881876, + "learning_rate": 9.188201179232243e-06, + "loss": 0.2051, + "step": 1394 + }, + { + "epoch": 0.20861372812920592, + "grad_norm": 1.6315580687346722, + "learning_rate": 9.18687783255804e-06, + "loss": 0.2119, + "step": 1395 + }, + { + "epoch": 0.20876327202033798, + "grad_norm": 1.7045742133633315, + "learning_rate": 9.185553503601948e-06, + "loss": 0.2251, + "step": 1396 + }, + { + "epoch": 0.20891281591147, + "grad_norm": 1.6188256816772855, + "learning_rate": 9.184228192674667e-06, + "loss": 0.2406, + "step": 1397 + }, + { + "epoch": 0.20906235980260207, + "grad_norm": 3.936101301721377, + "learning_rate": 9.182901900087124e-06, + "loss": 0.4307, + "step": 1398 + }, + { + "epoch": 0.2092119036937341, + "grad_norm": 2.5527420452443406, + "learning_rate": 9.181574626150486e-06, + "loss": 0.2308, + "step": 1399 + }, + { + "epoch": 0.20936144758486616, + "grad_norm": 1.6355845634467614, + "learning_rate": 9.180246371176141e-06, + "loss": 0.1983, + "step": 1400 + }, + { + "epoch": 0.2095109914759982, + "grad_norm": 1.6442195705711569, + "learning_rate": 9.17891713547571e-06, + "loss": 0.4733, + "step": 1401 + }, + { + "epoch": 0.20966053536713025, + "grad_norm": 1.9389155277617691, + "learning_rate": 9.177586919361043e-06, + "loss": 0.3677, + "step": 1402 + }, + { + "epoch": 0.2098100792582623, + "grad_norm": 1.4858954046606485, + "learning_rate": 9.176255723144227e-06, + "loss": 0.2122, + "step": 1403 + }, + { + "epoch": 0.20995962314939434, + "grad_norm": 1.8989896403075806, + "learning_rate": 9.17492354713757e-06, + "loss": 0.5022, + "step": 1404 + }, + { + "epoch": 0.2101091670405264, + "grad_norm": 2.032698049116247, + "learning_rate": 9.173590391653612e-06, + "loss": 0.5431, + "step": 1405 + }, + { + "epoch": 0.21025871093165843, + "grad_norm": 1.5701626022875448, + "learning_rate": 9.172256257005127e-06, + "loss": 0.1862, + "step": 1406 + }, + { + "epoch": 0.2104082548227905, + "grad_norm": 1.9707343228520016, + "learning_rate": 9.170921143505114e-06, + "loss": 0.5544, + "step": 1407 + }, + { + "epoch": 0.21055779871392252, + "grad_norm": 1.3116097098782349, + "learning_rate": 9.169585051466804e-06, + "loss": 0.2191, + "step": 1408 + }, + { + "epoch": 0.21070734260505458, + "grad_norm": 1.4155318018990113, + "learning_rate": 9.168247981203657e-06, + "loss": 0.2867, + "step": 1409 + }, + { + "epoch": 0.21085688649618664, + "grad_norm": 1.5783535607698234, + "learning_rate": 9.166909933029365e-06, + "loss": 0.2115, + "step": 1410 + }, + { + "epoch": 0.21100643038731867, + "grad_norm": 2.1761826396642343, + "learning_rate": 9.16557090725784e-06, + "loss": 0.6031, + "step": 1411 + }, + { + "epoch": 0.21115597427845073, + "grad_norm": 1.5953173737260011, + "learning_rate": 9.16423090420324e-06, + "loss": 0.2972, + "step": 1412 + }, + { + "epoch": 0.21130551816958276, + "grad_norm": 1.859606383856977, + "learning_rate": 9.162889924179934e-06, + "loss": 0.3625, + "step": 1413 + }, + { + "epoch": 0.21145506206071482, + "grad_norm": 1.759605112492795, + "learning_rate": 9.161547967502536e-06, + "loss": 0.3425, + "step": 1414 + }, + { + "epoch": 0.21160460595184688, + "grad_norm": 1.2621512719829986, + "learning_rate": 9.160205034485875e-06, + "loss": 0.2218, + "step": 1415 + }, + { + "epoch": 0.2117541498429789, + "grad_norm": 1.4111741267825535, + "learning_rate": 9.158861125445022e-06, + "loss": 0.3193, + "step": 1416 + }, + { + "epoch": 0.21190369373411097, + "grad_norm": 1.2698781251191589, + "learning_rate": 9.157516240695266e-06, + "loss": 0.1909, + "step": 1417 + }, + { + "epoch": 0.212053237625243, + "grad_norm": 1.6960651115905963, + "learning_rate": 9.156170380552134e-06, + "loss": 0.3382, + "step": 1418 + }, + { + "epoch": 0.21220278151637506, + "grad_norm": 2.1154692881317434, + "learning_rate": 9.154823545331376e-06, + "loss": 0.4082, + "step": 1419 + }, + { + "epoch": 0.2123523254075071, + "grad_norm": 2.033684826459006, + "learning_rate": 9.153475735348973e-06, + "loss": 0.3867, + "step": 1420 + }, + { + "epoch": 0.21250186929863915, + "grad_norm": 1.8325048944634288, + "learning_rate": 9.152126950921135e-06, + "loss": 0.2373, + "step": 1421 + }, + { + "epoch": 0.2126514131897712, + "grad_norm": 2.0710673737402256, + "learning_rate": 9.150777192364297e-06, + "loss": 0.356, + "step": 1422 + }, + { + "epoch": 0.21280095708090324, + "grad_norm": 1.3874235860299686, + "learning_rate": 9.149426459995127e-06, + "loss": 0.3691, + "step": 1423 + }, + { + "epoch": 0.2129505009720353, + "grad_norm": 2.1249487008185803, + "learning_rate": 9.14807475413052e-06, + "loss": 0.3657, + "step": 1424 + }, + { + "epoch": 0.21310004486316733, + "grad_norm": 1.6076473156790783, + "learning_rate": 9.146722075087599e-06, + "loss": 0.2973, + "step": 1425 + }, + { + "epoch": 0.2132495887542994, + "grad_norm": 1.7585364821746667, + "learning_rate": 9.145368423183716e-06, + "loss": 0.3684, + "step": 1426 + }, + { + "epoch": 0.21339913264543142, + "grad_norm": 1.5724475968533793, + "learning_rate": 9.144013798736451e-06, + "loss": 0.3614, + "step": 1427 + }, + { + "epoch": 0.21354867653656348, + "grad_norm": 1.5213037170574124, + "learning_rate": 9.142658202063613e-06, + "loss": 0.331, + "step": 1428 + }, + { + "epoch": 0.21369822042769554, + "grad_norm": 1.7462862535903572, + "learning_rate": 9.141301633483233e-06, + "loss": 0.2972, + "step": 1429 + }, + { + "epoch": 0.21384776431882757, + "grad_norm": 1.3980558796390394, + "learning_rate": 9.139944093313582e-06, + "loss": 0.2375, + "step": 1430 + }, + { + "epoch": 0.21399730820995963, + "grad_norm": 1.655121853169513, + "learning_rate": 9.138585581873145e-06, + "loss": 0.1952, + "step": 1431 + }, + { + "epoch": 0.21414685210109166, + "grad_norm": 1.9616282412950032, + "learning_rate": 9.137226099480649e-06, + "loss": 0.3827, + "step": 1432 + }, + { + "epoch": 0.21429639599222372, + "grad_norm": 1.5101599924640094, + "learning_rate": 9.135865646455035e-06, + "loss": 0.2151, + "step": 1433 + }, + { + "epoch": 0.21444593988335575, + "grad_norm": 1.9492564570386166, + "learning_rate": 9.134504223115483e-06, + "loss": 0.5627, + "step": 1434 + }, + { + "epoch": 0.2145954837744878, + "grad_norm": 1.4722879693007511, + "learning_rate": 9.133141829781396e-06, + "loss": 0.3629, + "step": 1435 + }, + { + "epoch": 0.21474502766561987, + "grad_norm": 1.3336058131154513, + "learning_rate": 9.131778466772401e-06, + "loss": 0.2227, + "step": 1436 + }, + { + "epoch": 0.2148945715567519, + "grad_norm": 1.557620867064145, + "learning_rate": 9.130414134408358e-06, + "loss": 0.1922, + "step": 1437 + }, + { + "epoch": 0.21504411544788396, + "grad_norm": 1.5608371994510732, + "learning_rate": 9.129048833009354e-06, + "loss": 0.345, + "step": 1438 + }, + { + "epoch": 0.215193659339016, + "grad_norm": 2.0372244298598026, + "learning_rate": 9.127682562895701e-06, + "loss": 0.1943, + "step": 1439 + }, + { + "epoch": 0.21534320323014805, + "grad_norm": 1.771798287939486, + "learning_rate": 9.126315324387937e-06, + "loss": 0.2201, + "step": 1440 + }, + { + "epoch": 0.2154927471212801, + "grad_norm": 1.320918694554743, + "learning_rate": 9.124947117806833e-06, + "loss": 0.3557, + "step": 1441 + }, + { + "epoch": 0.21564229101241214, + "grad_norm": 1.2144354045037664, + "learning_rate": 9.12357794347338e-06, + "loss": 0.1677, + "step": 1442 + }, + { + "epoch": 0.2157918349035442, + "grad_norm": 1.7291539866895038, + "learning_rate": 9.122207801708802e-06, + "loss": 0.2361, + "step": 1443 + }, + { + "epoch": 0.21594137879467623, + "grad_norm": 1.525451199822209, + "learning_rate": 9.120836692834547e-06, + "loss": 0.2203, + "step": 1444 + }, + { + "epoch": 0.2160909226858083, + "grad_norm": 1.1450235200011314, + "learning_rate": 9.11946461717229e-06, + "loss": 0.2239, + "step": 1445 + }, + { + "epoch": 0.21624046657694032, + "grad_norm": 1.6249409995005226, + "learning_rate": 9.118091575043931e-06, + "loss": 0.2035, + "step": 1446 + }, + { + "epoch": 0.21639001046807238, + "grad_norm": 1.5911197134726236, + "learning_rate": 9.116717566771602e-06, + "loss": 0.3426, + "step": 1447 + }, + { + "epoch": 0.21653955435920444, + "grad_norm": 1.2768942365723772, + "learning_rate": 9.115342592677658e-06, + "loss": 0.3227, + "step": 1448 + }, + { + "epoch": 0.21668909825033647, + "grad_norm": 2.6013675708313646, + "learning_rate": 9.11396665308468e-06, + "loss": 0.876, + "step": 1449 + }, + { + "epoch": 0.21683864214146853, + "grad_norm": 1.5557280944558294, + "learning_rate": 9.112589748315477e-06, + "loss": 0.1862, + "step": 1450 + }, + { + "epoch": 0.21698818603260056, + "grad_norm": 1.9084211867181637, + "learning_rate": 9.111211878693084e-06, + "loss": 0.4744, + "step": 1451 + }, + { + "epoch": 0.21713772992373262, + "grad_norm": 1.9726349404572066, + "learning_rate": 9.109833044540766e-06, + "loss": 0.4964, + "step": 1452 + }, + { + "epoch": 0.21728727381486465, + "grad_norm": 1.7981363223292532, + "learning_rate": 9.108453246182005e-06, + "loss": 0.4617, + "step": 1453 + }, + { + "epoch": 0.2174368177059967, + "grad_norm": 1.4636175949338748, + "learning_rate": 9.10707248394052e-06, + "loss": 0.3562, + "step": 1454 + }, + { + "epoch": 0.21758636159712877, + "grad_norm": 1.5023091294876543, + "learning_rate": 9.105690758140247e-06, + "loss": 0.1974, + "step": 1455 + }, + { + "epoch": 0.2177359054882608, + "grad_norm": 1.5799772562807408, + "learning_rate": 9.104308069105355e-06, + "loss": 0.4225, + "step": 1456 + }, + { + "epoch": 0.21788544937939286, + "grad_norm": 1.697173297157486, + "learning_rate": 9.102924417160235e-06, + "loss": 0.2355, + "step": 1457 + }, + { + "epoch": 0.2180349932705249, + "grad_norm": 1.9831963966350166, + "learning_rate": 9.101539802629506e-06, + "loss": 0.6711, + "step": 1458 + }, + { + "epoch": 0.21818453716165695, + "grad_norm": 1.3049711330279425, + "learning_rate": 9.10015422583801e-06, + "loss": 0.2306, + "step": 1459 + }, + { + "epoch": 0.21833408105278898, + "grad_norm": 1.3485657787858605, + "learning_rate": 9.09876768711082e-06, + "loss": 0.3155, + "step": 1460 + }, + { + "epoch": 0.21848362494392104, + "grad_norm": 1.430084626746487, + "learning_rate": 9.097380186773225e-06, + "loss": 0.1803, + "step": 1461 + }, + { + "epoch": 0.2186331688350531, + "grad_norm": 1.042217037319963, + "learning_rate": 9.095991725150755e-06, + "loss": 0.2157, + "step": 1462 + }, + { + "epoch": 0.21878271272618513, + "grad_norm": 1.6038911196277919, + "learning_rate": 9.094602302569149e-06, + "loss": 0.3647, + "step": 1463 + }, + { + "epoch": 0.2189322566173172, + "grad_norm": 1.761061421005298, + "learning_rate": 9.093211919354384e-06, + "loss": 0.3914, + "step": 1464 + }, + { + "epoch": 0.21908180050844922, + "grad_norm": 1.780140598214583, + "learning_rate": 9.091820575832653e-06, + "loss": 0.5239, + "step": 1465 + }, + { + "epoch": 0.21923134439958128, + "grad_norm": 1.5056065150417166, + "learning_rate": 9.090428272330381e-06, + "loss": 0.3738, + "step": 1466 + }, + { + "epoch": 0.2193808882907133, + "grad_norm": 1.5037400785516828, + "learning_rate": 9.089035009174213e-06, + "loss": 0.3748, + "step": 1467 + }, + { + "epoch": 0.21953043218184537, + "grad_norm": 1.0034616725342143, + "learning_rate": 9.087640786691029e-06, + "loss": 0.1818, + "step": 1468 + }, + { + "epoch": 0.21967997607297743, + "grad_norm": 1.8826776828221814, + "learning_rate": 9.08624560520792e-06, + "loss": 0.5423, + "step": 1469 + }, + { + "epoch": 0.21982951996410946, + "grad_norm": 1.5762662138471795, + "learning_rate": 9.08484946505221e-06, + "loss": 0.362, + "step": 1470 + }, + { + "epoch": 0.21997906385524152, + "grad_norm": 1.459026627139429, + "learning_rate": 9.08345236655145e-06, + "loss": 0.2379, + "step": 1471 + }, + { + "epoch": 0.22012860774637355, + "grad_norm": 1.9767735560822688, + "learning_rate": 9.082054310033412e-06, + "loss": 0.2279, + "step": 1472 + }, + { + "epoch": 0.2202781516375056, + "grad_norm": 1.5033186272325123, + "learning_rate": 9.08065529582609e-06, + "loss": 0.3475, + "step": 1473 + }, + { + "epoch": 0.22042769552863767, + "grad_norm": 1.7347624363190264, + "learning_rate": 9.07925532425771e-06, + "loss": 0.2346, + "step": 1474 + }, + { + "epoch": 0.2205772394197697, + "grad_norm": 1.311368499943375, + "learning_rate": 9.077854395656719e-06, + "loss": 0.2136, + "step": 1475 + }, + { + "epoch": 0.22072678331090176, + "grad_norm": 2.079168631285795, + "learning_rate": 9.076452510351786e-06, + "loss": 0.5518, + "step": 1476 + }, + { + "epoch": 0.2208763272020338, + "grad_norm": 1.7471493510688818, + "learning_rate": 9.075049668671808e-06, + "loss": 0.4685, + "step": 1477 + }, + { + "epoch": 0.22102587109316585, + "grad_norm": 1.7503079479759727, + "learning_rate": 9.073645870945904e-06, + "loss": 0.3369, + "step": 1478 + }, + { + "epoch": 0.22117541498429788, + "grad_norm": 1.6001592489802432, + "learning_rate": 9.07224111750342e-06, + "loss": 0.2451, + "step": 1479 + }, + { + "epoch": 0.22132495887542994, + "grad_norm": 1.5995131942282934, + "learning_rate": 9.070835408673926e-06, + "loss": 0.2199, + "step": 1480 + }, + { + "epoch": 0.221474502766562, + "grad_norm": 1.4586326961701683, + "learning_rate": 9.06942874478721e-06, + "loss": 0.374, + "step": 1481 + }, + { + "epoch": 0.22162404665769403, + "grad_norm": 1.5381391273868525, + "learning_rate": 9.068021126173294e-06, + "loss": 0.2369, + "step": 1482 + }, + { + "epoch": 0.2217735905488261, + "grad_norm": 1.96520452808454, + "learning_rate": 9.066612553162417e-06, + "loss": 0.2214, + "step": 1483 + }, + { + "epoch": 0.22192313443995812, + "grad_norm": 1.3323608756885599, + "learning_rate": 9.065203026085041e-06, + "loss": 0.2335, + "step": 1484 + }, + { + "epoch": 0.22207267833109018, + "grad_norm": 1.725202352638352, + "learning_rate": 9.063792545271859e-06, + "loss": 0.4229, + "step": 1485 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.9584754362518301, + "learning_rate": 9.062381111053781e-06, + "loss": 0.1886, + "step": 1486 + }, + { + "epoch": 0.22237176611335427, + "grad_norm": 1.72245020942535, + "learning_rate": 9.060968723761945e-06, + "loss": 0.3353, + "step": 1487 + }, + { + "epoch": 0.22252131000448633, + "grad_norm": 1.6135392202141645, + "learning_rate": 9.05955538372771e-06, + "loss": 0.1953, + "step": 1488 + }, + { + "epoch": 0.22267085389561836, + "grad_norm": 1.3152021255142217, + "learning_rate": 9.058141091282656e-06, + "loss": 0.1742, + "step": 1489 + }, + { + "epoch": 0.22282039778675042, + "grad_norm": 2.024778297438531, + "learning_rate": 9.056725846758594e-06, + "loss": 0.4014, + "step": 1490 + }, + { + "epoch": 0.22296994167788245, + "grad_norm": 1.6698713908455967, + "learning_rate": 9.055309650487552e-06, + "loss": 0.3407, + "step": 1491 + }, + { + "epoch": 0.2231194855690145, + "grad_norm": 1.934590891729098, + "learning_rate": 9.053892502801783e-06, + "loss": 0.3714, + "step": 1492 + }, + { + "epoch": 0.22326902946014654, + "grad_norm": 1.8737430969121303, + "learning_rate": 9.052474404033764e-06, + "loss": 0.2273, + "step": 1493 + }, + { + "epoch": 0.2234185733512786, + "grad_norm": 2.1916032794360594, + "learning_rate": 9.051055354516195e-06, + "loss": 0.5141, + "step": 1494 + }, + { + "epoch": 0.22356811724241066, + "grad_norm": 1.3808614199352094, + "learning_rate": 9.049635354581998e-06, + "loss": 0.22, + "step": 1495 + }, + { + "epoch": 0.2237176611335427, + "grad_norm": 1.56662054094386, + "learning_rate": 9.048214404564319e-06, + "loss": 0.23, + "step": 1496 + }, + { + "epoch": 0.22386720502467475, + "grad_norm": 1.3046782458999366, + "learning_rate": 9.046792504796526e-06, + "loss": 0.2448, + "step": 1497 + }, + { + "epoch": 0.22401674891580678, + "grad_norm": 1.2799576312369438, + "learning_rate": 9.045369655612212e-06, + "loss": 0.222, + "step": 1498 + }, + { + "epoch": 0.22416629280693884, + "grad_norm": 1.3122547692582682, + "learning_rate": 9.043945857345189e-06, + "loss": 0.2261, + "step": 1499 + }, + { + "epoch": 0.2243158366980709, + "grad_norm": 1.644275160578279, + "learning_rate": 9.042521110329497e-06, + "loss": 0.3581, + "step": 1500 + }, + { + "epoch": 0.22446538058920293, + "grad_norm": 1.3566851036629488, + "learning_rate": 9.04109541489939e-06, + "loss": 0.3989, + "step": 1501 + }, + { + "epoch": 0.224614924480335, + "grad_norm": 1.1272465309050348, + "learning_rate": 9.039668771389356e-06, + "loss": 0.2049, + "step": 1502 + }, + { + "epoch": 0.22476446837146702, + "grad_norm": 1.2545829340424604, + "learning_rate": 9.038241180134095e-06, + "loss": 0.2186, + "step": 1503 + }, + { + "epoch": 0.22491401226259908, + "grad_norm": 2.458444877022837, + "learning_rate": 9.036812641468535e-06, + "loss": 0.9375, + "step": 1504 + }, + { + "epoch": 0.2250635561537311, + "grad_norm": 1.7631503038372196, + "learning_rate": 9.035383155727826e-06, + "loss": 0.5211, + "step": 1505 + }, + { + "epoch": 0.22521310004486317, + "grad_norm": 1.2773941362574963, + "learning_rate": 9.03395272324734e-06, + "loss": 0.1924, + "step": 1506 + }, + { + "epoch": 0.22536264393599523, + "grad_norm": 0.8285607729871721, + "learning_rate": 9.032521344362665e-06, + "loss": 0.1591, + "step": 1507 + }, + { + "epoch": 0.22551218782712726, + "grad_norm": 1.4527549144198617, + "learning_rate": 9.031089019409622e-06, + "loss": 0.3534, + "step": 1508 + }, + { + "epoch": 0.22566173171825932, + "grad_norm": 1.5654074226457095, + "learning_rate": 9.029655748724245e-06, + "loss": 0.3845, + "step": 1509 + }, + { + "epoch": 0.22581127560939135, + "grad_norm": 1.7456103899692976, + "learning_rate": 9.028221532642793e-06, + "loss": 0.3186, + "step": 1510 + }, + { + "epoch": 0.2259608195005234, + "grad_norm": 1.9962184366897087, + "learning_rate": 9.02678637150175e-06, + "loss": 0.4627, + "step": 1511 + }, + { + "epoch": 0.22611036339165544, + "grad_norm": 1.1115007945174045, + "learning_rate": 9.025350265637816e-06, + "loss": 0.2216, + "step": 1512 + }, + { + "epoch": 0.2262599072827875, + "grad_norm": 1.1002289464689736, + "learning_rate": 9.023913215387914e-06, + "loss": 0.26, + "step": 1513 + }, + { + "epoch": 0.22640945117391956, + "grad_norm": 4.707452449710372, + "learning_rate": 9.022475221089194e-06, + "loss": 0.2086, + "step": 1514 + }, + { + "epoch": 0.2265589950650516, + "grad_norm": 1.1907478811651444, + "learning_rate": 9.021036283079022e-06, + "loss": 0.1712, + "step": 1515 + }, + { + "epoch": 0.22670853895618365, + "grad_norm": 1.4943756417337315, + "learning_rate": 9.019596401694984e-06, + "loss": 0.3612, + "step": 1516 + }, + { + "epoch": 0.22685808284731568, + "grad_norm": 1.8328029211172867, + "learning_rate": 9.018155577274891e-06, + "loss": 0.5014, + "step": 1517 + }, + { + "epoch": 0.22700762673844774, + "grad_norm": 1.5769600203549734, + "learning_rate": 9.016713810156777e-06, + "loss": 0.3982, + "step": 1518 + }, + { + "epoch": 0.22715717062957977, + "grad_norm": 1.033998264540829, + "learning_rate": 9.015271100678892e-06, + "loss": 0.1698, + "step": 1519 + }, + { + "epoch": 0.22730671452071183, + "grad_norm": 1.4450333274104865, + "learning_rate": 9.01382744917971e-06, + "loss": 0.2241, + "step": 1520 + }, + { + "epoch": 0.2274562584118439, + "grad_norm": 1.4009561553962124, + "learning_rate": 9.012382855997925e-06, + "loss": 0.318, + "step": 1521 + }, + { + "epoch": 0.22760580230297592, + "grad_norm": 1.6380239076287348, + "learning_rate": 9.010937321472454e-06, + "loss": 0.3649, + "step": 1522 + }, + { + "epoch": 0.22775534619410798, + "grad_norm": 1.8718410186357894, + "learning_rate": 9.009490845942433e-06, + "loss": 0.3845, + "step": 1523 + }, + { + "epoch": 0.22790489008524, + "grad_norm": 1.474697832902752, + "learning_rate": 9.00804342974722e-06, + "loss": 0.3287, + "step": 1524 + }, + { + "epoch": 0.22805443397637207, + "grad_norm": 1.9033242818165663, + "learning_rate": 9.006595073226387e-06, + "loss": 0.3614, + "step": 1525 + }, + { + "epoch": 0.2282039778675041, + "grad_norm": 1.7576573706097778, + "learning_rate": 9.005145776719742e-06, + "loss": 0.4287, + "step": 1526 + }, + { + "epoch": 0.22835352175863616, + "grad_norm": 1.646435656567765, + "learning_rate": 9.003695540567294e-06, + "loss": 0.3619, + "step": 1527 + }, + { + "epoch": 0.22850306564976822, + "grad_norm": 1.6955358560275535, + "learning_rate": 9.00224436510929e-06, + "loss": 0.371, + "step": 1528 + }, + { + "epoch": 0.22865260954090025, + "grad_norm": 1.658825841648886, + "learning_rate": 9.000792250686186e-06, + "loss": 0.3939, + "step": 1529 + }, + { + "epoch": 0.2288021534320323, + "grad_norm": 1.5326543749267825, + "learning_rate": 8.999339197638664e-06, + "loss": 0.2482, + "step": 1530 + }, + { + "epoch": 0.22895169732316434, + "grad_norm": 2.4642466957027174, + "learning_rate": 8.99788520630762e-06, + "loss": 0.4219, + "step": 1531 + }, + { + "epoch": 0.2291012412142964, + "grad_norm": 1.532408533906101, + "learning_rate": 8.99643027703418e-06, + "loss": 0.3121, + "step": 1532 + }, + { + "epoch": 0.22925078510542846, + "grad_norm": 1.068785213809154, + "learning_rate": 8.994974410159682e-06, + "loss": 0.2221, + "step": 1533 + }, + { + "epoch": 0.2294003289965605, + "grad_norm": 1.6812794615422513, + "learning_rate": 8.993517606025686e-06, + "loss": 0.36, + "step": 1534 + }, + { + "epoch": 0.22954987288769255, + "grad_norm": 1.3397673899297329, + "learning_rate": 8.992059864973972e-06, + "loss": 0.1707, + "step": 1535 + }, + { + "epoch": 0.22969941677882458, + "grad_norm": 2.2086842094358445, + "learning_rate": 8.990601187346542e-06, + "loss": 0.2141, + "step": 1536 + }, + { + "epoch": 0.22984896066995664, + "grad_norm": 1.9813196209128292, + "learning_rate": 8.989141573485614e-06, + "loss": 0.3664, + "step": 1537 + }, + { + "epoch": 0.22999850456108867, + "grad_norm": 1.4968536528735594, + "learning_rate": 8.987681023733628e-06, + "loss": 0.212, + "step": 1538 + }, + { + "epoch": 0.23014804845222073, + "grad_norm": 2.005340468193955, + "learning_rate": 8.986219538433243e-06, + "loss": 0.5171, + "step": 1539 + }, + { + "epoch": 0.23029759234335279, + "grad_norm": 1.6056668375146956, + "learning_rate": 8.984757117927337e-06, + "loss": 0.367, + "step": 1540 + }, + { + "epoch": 0.23044713623448482, + "grad_norm": 1.736972659173659, + "learning_rate": 8.983293762559009e-06, + "loss": 0.3054, + "step": 1541 + }, + { + "epoch": 0.23059668012561688, + "grad_norm": 2.3554690220264893, + "learning_rate": 8.981829472671576e-06, + "loss": 0.48, + "step": 1542 + }, + { + "epoch": 0.2307462240167489, + "grad_norm": 2.441414392076267, + "learning_rate": 8.980364248608576e-06, + "loss": 0.5507, + "step": 1543 + }, + { + "epoch": 0.23089576790788097, + "grad_norm": 1.6179923206419418, + "learning_rate": 8.97889809071376e-06, + "loss": 0.1799, + "step": 1544 + }, + { + "epoch": 0.231045311799013, + "grad_norm": 1.3522900238471818, + "learning_rate": 8.977430999331108e-06, + "loss": 0.2205, + "step": 1545 + }, + { + "epoch": 0.23119485569014506, + "grad_norm": 1.5829130211417721, + "learning_rate": 8.97596297480481e-06, + "loss": 0.3255, + "step": 1546 + }, + { + "epoch": 0.23134439958127712, + "grad_norm": 1.5826039702225925, + "learning_rate": 8.974494017479281e-06, + "loss": 0.2593, + "step": 1547 + }, + { + "epoch": 0.23149394347240915, + "grad_norm": 1.498668084288196, + "learning_rate": 8.973024127699152e-06, + "loss": 0.3489, + "step": 1548 + }, + { + "epoch": 0.2316434873635412, + "grad_norm": 1.753546348637437, + "learning_rate": 8.971553305809274e-06, + "loss": 0.1926, + "step": 1549 + }, + { + "epoch": 0.23179303125467324, + "grad_norm": 1.434969286299199, + "learning_rate": 8.970081552154714e-06, + "loss": 0.3408, + "step": 1550 + }, + { + "epoch": 0.2319425751458053, + "grad_norm": 0.8478352471965336, + "learning_rate": 8.968608867080761e-06, + "loss": 0.2296, + "step": 1551 + }, + { + "epoch": 0.23209211903693733, + "grad_norm": 1.5740700313091822, + "learning_rate": 8.967135250932921e-06, + "loss": 0.2804, + "step": 1552 + }, + { + "epoch": 0.2322416629280694, + "grad_norm": 1.5528025294287224, + "learning_rate": 8.96566070405692e-06, + "loss": 0.3515, + "step": 1553 + }, + { + "epoch": 0.23239120681920145, + "grad_norm": 1.6156309602690202, + "learning_rate": 8.964185226798696e-06, + "loss": 0.3846, + "step": 1554 + }, + { + "epoch": 0.23254075071033348, + "grad_norm": 1.900324205764928, + "learning_rate": 8.962708819504415e-06, + "loss": 0.3893, + "step": 1555 + }, + { + "epoch": 0.23269029460146554, + "grad_norm": 1.6871831737787155, + "learning_rate": 8.961231482520456e-06, + "loss": 0.4995, + "step": 1556 + }, + { + "epoch": 0.23283983849259757, + "grad_norm": 1.7659270863655478, + "learning_rate": 8.959753216193415e-06, + "loss": 0.3834, + "step": 1557 + }, + { + "epoch": 0.23298938238372963, + "grad_norm": 1.4355628771765856, + "learning_rate": 8.958274020870107e-06, + "loss": 0.2458, + "step": 1558 + }, + { + "epoch": 0.23313892627486169, + "grad_norm": 1.091740260422094, + "learning_rate": 8.956793896897566e-06, + "loss": 0.1625, + "step": 1559 + }, + { + "epoch": 0.23328847016599372, + "grad_norm": 1.1925103613752697, + "learning_rate": 8.955312844623045e-06, + "loss": 0.1767, + "step": 1560 + }, + { + "epoch": 0.23343801405712578, + "grad_norm": 1.8795430835063092, + "learning_rate": 8.953830864394012e-06, + "loss": 0.3983, + "step": 1561 + }, + { + "epoch": 0.2335875579482578, + "grad_norm": 1.4373449366320268, + "learning_rate": 8.952347956558152e-06, + "loss": 0.2384, + "step": 1562 + }, + { + "epoch": 0.23373710183938987, + "grad_norm": 1.4903754933686846, + "learning_rate": 8.950864121463374e-06, + "loss": 0.2191, + "step": 1563 + }, + { + "epoch": 0.2338866457305219, + "grad_norm": 1.8744944645931436, + "learning_rate": 8.949379359457795e-06, + "loss": 0.4788, + "step": 1564 + }, + { + "epoch": 0.23403618962165396, + "grad_norm": 2.30369719872738, + "learning_rate": 8.947893670889756e-06, + "loss": 0.5508, + "step": 1565 + }, + { + "epoch": 0.23418573351278602, + "grad_norm": 1.5228147309739217, + "learning_rate": 8.946407056107815e-06, + "loss": 0.2744, + "step": 1566 + }, + { + "epoch": 0.23433527740391805, + "grad_norm": 0.9855113226049245, + "learning_rate": 8.944919515460746e-06, + "loss": 0.1762, + "step": 1567 + }, + { + "epoch": 0.2344848212950501, + "grad_norm": 2.0244671473740476, + "learning_rate": 8.943431049297542e-06, + "loss": 0.4977, + "step": 1568 + }, + { + "epoch": 0.23463436518618214, + "grad_norm": 1.9053040838783037, + "learning_rate": 8.941941657967408e-06, + "loss": 0.3878, + "step": 1569 + }, + { + "epoch": 0.2347839090773142, + "grad_norm": 1.2593264871558218, + "learning_rate": 8.94045134181977e-06, + "loss": 0.2085, + "step": 1570 + }, + { + "epoch": 0.23493345296844623, + "grad_norm": 1.4409087053787786, + "learning_rate": 8.938960101204273e-06, + "loss": 0.392, + "step": 1571 + }, + { + "epoch": 0.23508299685957829, + "grad_norm": 0.9533079750475656, + "learning_rate": 8.937467936470773e-06, + "loss": 0.161, + "step": 1572 + }, + { + "epoch": 0.23523254075071034, + "grad_norm": 1.3187507218092258, + "learning_rate": 8.935974847969352e-06, + "loss": 0.228, + "step": 1573 + }, + { + "epoch": 0.23538208464184238, + "grad_norm": 1.4711338301621406, + "learning_rate": 8.934480836050297e-06, + "loss": 0.2065, + "step": 1574 + }, + { + "epoch": 0.23553162853297444, + "grad_norm": 1.6346673332024881, + "learning_rate": 8.93298590106412e-06, + "loss": 0.3639, + "step": 1575 + }, + { + "epoch": 0.23568117242410647, + "grad_norm": 1.519040249043761, + "learning_rate": 8.931490043361546e-06, + "loss": 0.3451, + "step": 1576 + }, + { + "epoch": 0.23583071631523853, + "grad_norm": 1.5939490670813292, + "learning_rate": 8.929993263293519e-06, + "loss": 0.3313, + "step": 1577 + }, + { + "epoch": 0.23598026020637056, + "grad_norm": 2.4862093840935584, + "learning_rate": 8.928495561211199e-06, + "loss": 0.5844, + "step": 1578 + }, + { + "epoch": 0.23612980409750262, + "grad_norm": 0.9227521449231808, + "learning_rate": 8.92699693746596e-06, + "loss": 0.1858, + "step": 1579 + }, + { + "epoch": 0.23627934798863467, + "grad_norm": 1.7904495917281569, + "learning_rate": 8.925497392409392e-06, + "loss": 0.4007, + "step": 1580 + }, + { + "epoch": 0.2364288918797667, + "grad_norm": 1.6104758349435173, + "learning_rate": 8.923996926393306e-06, + "loss": 0.2506, + "step": 1581 + }, + { + "epoch": 0.23657843577089877, + "grad_norm": 1.162466771602011, + "learning_rate": 8.922495539769722e-06, + "loss": 0.2251, + "step": 1582 + }, + { + "epoch": 0.2367279796620308, + "grad_norm": 1.4044418047951504, + "learning_rate": 8.920993232890885e-06, + "loss": 0.3529, + "step": 1583 + }, + { + "epoch": 0.23687752355316286, + "grad_norm": 1.3430242838204132, + "learning_rate": 8.919490006109243e-06, + "loss": 0.213, + "step": 1584 + }, + { + "epoch": 0.23702706744429491, + "grad_norm": 2.047091159617954, + "learning_rate": 8.917985859777477e-06, + "loss": 0.4145, + "step": 1585 + }, + { + "epoch": 0.23717661133542695, + "grad_norm": 2.0721630966321247, + "learning_rate": 8.916480794248465e-06, + "loss": 0.5021, + "step": 1586 + }, + { + "epoch": 0.237326155226559, + "grad_norm": 1.5816352819652573, + "learning_rate": 8.914974809875317e-06, + "loss": 0.3858, + "step": 1587 + }, + { + "epoch": 0.23747569911769104, + "grad_norm": 1.8550669988064394, + "learning_rate": 8.913467907011349e-06, + "loss": 0.4832, + "step": 1588 + }, + { + "epoch": 0.2376252430088231, + "grad_norm": 1.6366187681919886, + "learning_rate": 8.911960086010091e-06, + "loss": 0.3524, + "step": 1589 + }, + { + "epoch": 0.23777478689995513, + "grad_norm": 1.4416460700943408, + "learning_rate": 8.910451347225298e-06, + "loss": 0.3926, + "step": 1590 + }, + { + "epoch": 0.23792433079108719, + "grad_norm": 1.564579376384169, + "learning_rate": 8.908941691010933e-06, + "loss": 0.2312, + "step": 1591 + }, + { + "epoch": 0.23807387468221924, + "grad_norm": 1.2944939224073606, + "learning_rate": 8.907431117721175e-06, + "loss": 0.2245, + "step": 1592 + }, + { + "epoch": 0.23822341857335128, + "grad_norm": 1.6975131715816902, + "learning_rate": 8.905919627710419e-06, + "loss": 0.3363, + "step": 1593 + }, + { + "epoch": 0.23837296246448333, + "grad_norm": 1.596361094662525, + "learning_rate": 8.904407221333275e-06, + "loss": 0.3658, + "step": 1594 + }, + { + "epoch": 0.23852250635561537, + "grad_norm": 1.4858572761749074, + "learning_rate": 8.902893898944571e-06, + "loss": 0.2827, + "step": 1595 + }, + { + "epoch": 0.23867205024674742, + "grad_norm": 2.4979339586019464, + "learning_rate": 8.901379660899343e-06, + "loss": 0.5078, + "step": 1596 + }, + { + "epoch": 0.23882159413787946, + "grad_norm": 1.647641401011452, + "learning_rate": 8.899864507552846e-06, + "loss": 0.3907, + "step": 1597 + }, + { + "epoch": 0.23897113802901152, + "grad_norm": 1.604466675493429, + "learning_rate": 8.898348439260553e-06, + "loss": 0.4351, + "step": 1598 + }, + { + "epoch": 0.23912068192014357, + "grad_norm": 1.158380693426154, + "learning_rate": 8.896831456378148e-06, + "loss": 0.2045, + "step": 1599 + }, + { + "epoch": 0.2392702258112756, + "grad_norm": 1.6498746618436277, + "learning_rate": 8.895313559261525e-06, + "loss": 0.3361, + "step": 1600 + }, + { + "epoch": 0.23941976970240766, + "grad_norm": 1.237277087089687, + "learning_rate": 8.893794748266801e-06, + "loss": 0.2327, + "step": 1601 + }, + { + "epoch": 0.2395693135935397, + "grad_norm": 1.7276131272814355, + "learning_rate": 8.892275023750305e-06, + "loss": 0.3676, + "step": 1602 + }, + { + "epoch": 0.23971885748467175, + "grad_norm": 1.0737115286717451, + "learning_rate": 8.890754386068577e-06, + "loss": 0.1695, + "step": 1603 + }, + { + "epoch": 0.23986840137580379, + "grad_norm": 1.7893197216212124, + "learning_rate": 8.889232835578372e-06, + "loss": 0.2075, + "step": 1604 + }, + { + "epoch": 0.24001794526693584, + "grad_norm": 1.0316754586933763, + "learning_rate": 8.887710372636662e-06, + "loss": 0.1818, + "step": 1605 + }, + { + "epoch": 0.2401674891580679, + "grad_norm": 1.908936676793948, + "learning_rate": 8.886186997600633e-06, + "loss": 0.4014, + "step": 1606 + }, + { + "epoch": 0.24031703304919994, + "grad_norm": 1.8121753558090572, + "learning_rate": 8.884662710827679e-06, + "loss": 0.5739, + "step": 1607 + }, + { + "epoch": 0.240466576940332, + "grad_norm": 1.5081453030204104, + "learning_rate": 8.883137512675417e-06, + "loss": 0.2058, + "step": 1608 + }, + { + "epoch": 0.24061612083146403, + "grad_norm": 1.6968786810383643, + "learning_rate": 8.88161140350167e-06, + "loss": 0.3992, + "step": 1609 + }, + { + "epoch": 0.24076566472259608, + "grad_norm": 1.0441105843798988, + "learning_rate": 8.880084383664481e-06, + "loss": 0.1736, + "step": 1610 + }, + { + "epoch": 0.24091520861372812, + "grad_norm": 1.4531784381064643, + "learning_rate": 8.8785564535221e-06, + "loss": 0.2213, + "step": 1611 + }, + { + "epoch": 0.24106475250486017, + "grad_norm": 1.6179328389513437, + "learning_rate": 8.877027613432997e-06, + "loss": 0.4397, + "step": 1612 + }, + { + "epoch": 0.24121429639599223, + "grad_norm": 1.3779127226717234, + "learning_rate": 8.875497863755851e-06, + "loss": 0.2095, + "step": 1613 + }, + { + "epoch": 0.24136384028712426, + "grad_norm": 1.4488714989705522, + "learning_rate": 8.873967204849556e-06, + "loss": 0.3495, + "step": 1614 + }, + { + "epoch": 0.24151338417825632, + "grad_norm": 1.699118467638372, + "learning_rate": 8.872435637073223e-06, + "loss": 0.2042, + "step": 1615 + }, + { + "epoch": 0.24166292806938836, + "grad_norm": 1.5724198212446476, + "learning_rate": 8.870903160786166e-06, + "loss": 0.358, + "step": 1616 + }, + { + "epoch": 0.24181247196052041, + "grad_norm": 1.8735086152337022, + "learning_rate": 8.869369776347923e-06, + "loss": 0.352, + "step": 1617 + }, + { + "epoch": 0.24196201585165247, + "grad_norm": 2.071130867923683, + "learning_rate": 8.867835484118241e-06, + "loss": 0.369, + "step": 1618 + }, + { + "epoch": 0.2421115597427845, + "grad_norm": 1.00009701768564, + "learning_rate": 8.866300284457078e-06, + "loss": 0.1362, + "step": 1619 + }, + { + "epoch": 0.24226110363391656, + "grad_norm": 1.2342660250726913, + "learning_rate": 8.864764177724607e-06, + "loss": 0.228, + "step": 1620 + }, + { + "epoch": 0.2424106475250486, + "grad_norm": 1.181191216278982, + "learning_rate": 8.863227164281214e-06, + "loss": 0.185, + "step": 1621 + }, + { + "epoch": 0.24256019141618065, + "grad_norm": 1.600687165751174, + "learning_rate": 8.861689244487497e-06, + "loss": 0.198, + "step": 1622 + }, + { + "epoch": 0.24270973530731269, + "grad_norm": 1.8325486581074766, + "learning_rate": 8.860150418704268e-06, + "loss": 0.3872, + "step": 1623 + }, + { + "epoch": 0.24285927919844474, + "grad_norm": 1.8934954141642377, + "learning_rate": 8.858610687292548e-06, + "loss": 0.4214, + "step": 1624 + }, + { + "epoch": 0.2430088230895768, + "grad_norm": 2.2505887638767184, + "learning_rate": 8.857070050613573e-06, + "loss": 0.2427, + "step": 1625 + }, + { + "epoch": 0.24315836698070883, + "grad_norm": 1.7156119494211053, + "learning_rate": 8.855528509028793e-06, + "loss": 0.3129, + "step": 1626 + }, + { + "epoch": 0.2433079108718409, + "grad_norm": 1.1926433487919907, + "learning_rate": 8.853986062899869e-06, + "loss": 0.1944, + "step": 1627 + }, + { + "epoch": 0.24345745476297292, + "grad_norm": 1.4955836838428689, + "learning_rate": 8.852442712588671e-06, + "loss": 0.4259, + "step": 1628 + }, + { + "epoch": 0.24360699865410498, + "grad_norm": 1.5899010145338972, + "learning_rate": 8.850898458457284e-06, + "loss": 0.1922, + "step": 1629 + }, + { + "epoch": 0.24375654254523701, + "grad_norm": 1.7661527081371415, + "learning_rate": 8.849353300868007e-06, + "loss": 0.3413, + "step": 1630 + }, + { + "epoch": 0.24390608643636907, + "grad_norm": 1.0197540346721858, + "learning_rate": 8.847807240183349e-06, + "loss": 0.2392, + "step": 1631 + }, + { + "epoch": 0.24405563032750113, + "grad_norm": 2.304126832944234, + "learning_rate": 8.84626027676603e-06, + "loss": 0.6652, + "step": 1632 + }, + { + "epoch": 0.24420517421863316, + "grad_norm": 1.730517756971552, + "learning_rate": 8.844712410978981e-06, + "loss": 0.2769, + "step": 1633 + }, + { + "epoch": 0.24435471810976522, + "grad_norm": 1.7941803588719758, + "learning_rate": 8.843163643185347e-06, + "loss": 0.5527, + "step": 1634 + }, + { + "epoch": 0.24450426200089725, + "grad_norm": 1.5034412891428997, + "learning_rate": 8.841613973748486e-06, + "loss": 0.2425, + "step": 1635 + }, + { + "epoch": 0.2446538058920293, + "grad_norm": 1.8387720627371986, + "learning_rate": 8.840063403031962e-06, + "loss": 0.4814, + "step": 1636 + }, + { + "epoch": 0.24480334978316134, + "grad_norm": 1.6058048815286128, + "learning_rate": 8.838511931399557e-06, + "loss": 0.205, + "step": 1637 + }, + { + "epoch": 0.2449528936742934, + "grad_norm": 1.6535570636461936, + "learning_rate": 8.83695955921526e-06, + "loss": 0.3842, + "step": 1638 + }, + { + "epoch": 0.24510243756542546, + "grad_norm": 2.0495436648956677, + "learning_rate": 8.83540628684327e-06, + "loss": 0.588, + "step": 1639 + }, + { + "epoch": 0.2452519814565575, + "grad_norm": 1.932080426112819, + "learning_rate": 8.833852114648006e-06, + "loss": 0.2566, + "step": 1640 + }, + { + "epoch": 0.24540152534768955, + "grad_norm": 1.9660531070561371, + "learning_rate": 8.832297042994083e-06, + "loss": 0.5196, + "step": 1641 + }, + { + "epoch": 0.24555106923882158, + "grad_norm": 1.4040557060017893, + "learning_rate": 8.830741072246343e-06, + "loss": 0.2041, + "step": 1642 + }, + { + "epoch": 0.24570061312995364, + "grad_norm": 2.1028771117617637, + "learning_rate": 8.829184202769828e-06, + "loss": 0.2795, + "step": 1643 + }, + { + "epoch": 0.2458501570210857, + "grad_norm": 1.379854611664084, + "learning_rate": 8.827626434929796e-06, + "loss": 0.1597, + "step": 1644 + }, + { + "epoch": 0.24599970091221773, + "grad_norm": 1.5617312208997327, + "learning_rate": 8.826067769091715e-06, + "loss": 0.3839, + "step": 1645 + }, + { + "epoch": 0.2461492448033498, + "grad_norm": 1.7647416850681912, + "learning_rate": 8.824508205621263e-06, + "loss": 0.373, + "step": 1646 + }, + { + "epoch": 0.24629878869448182, + "grad_norm": 2.0755613511333406, + "learning_rate": 8.822947744884326e-06, + "loss": 0.5092, + "step": 1647 + }, + { + "epoch": 0.24644833258561388, + "grad_norm": 2.235044525778246, + "learning_rate": 8.821386387247006e-06, + "loss": 0.6294, + "step": 1648 + }, + { + "epoch": 0.24659787647674591, + "grad_norm": 1.6548207522042184, + "learning_rate": 8.81982413307561e-06, + "loss": 0.2872, + "step": 1649 + }, + { + "epoch": 0.24674742036787797, + "grad_norm": 0.9460460785889475, + "learning_rate": 8.818260982736662e-06, + "loss": 0.2059, + "step": 1650 + }, + { + "epoch": 0.24689696425901003, + "grad_norm": 1.596549339690058, + "learning_rate": 8.816696936596887e-06, + "loss": 0.2146, + "step": 1651 + }, + { + "epoch": 0.24704650815014206, + "grad_norm": 1.7405302714522426, + "learning_rate": 8.815131995023228e-06, + "loss": 0.2358, + "step": 1652 + }, + { + "epoch": 0.24719605204127412, + "grad_norm": 1.71942999566363, + "learning_rate": 8.813566158382835e-06, + "loss": 0.4195, + "step": 1653 + }, + { + "epoch": 0.24734559593240615, + "grad_norm": 1.8118502915807313, + "learning_rate": 8.81199942704307e-06, + "loss": 0.3486, + "step": 1654 + }, + { + "epoch": 0.2474951398235382, + "grad_norm": 1.0992755654244406, + "learning_rate": 8.810431801371501e-06, + "loss": 0.2063, + "step": 1655 + }, + { + "epoch": 0.24764468371467024, + "grad_norm": 3.017300017050166, + "learning_rate": 8.80886328173591e-06, + "loss": 0.4337, + "step": 1656 + }, + { + "epoch": 0.2477942276058023, + "grad_norm": 1.6022895744312384, + "learning_rate": 8.807293868504282e-06, + "loss": 0.3729, + "step": 1657 + }, + { + "epoch": 0.24794377149693436, + "grad_norm": 1.6149891917484116, + "learning_rate": 8.805723562044825e-06, + "loss": 0.4056, + "step": 1658 + }, + { + "epoch": 0.2480933153880664, + "grad_norm": 1.6145150498338217, + "learning_rate": 8.80415236272594e-06, + "loss": 0.3016, + "step": 1659 + }, + { + "epoch": 0.24824285927919845, + "grad_norm": 1.1459038449035153, + "learning_rate": 8.80258027091625e-06, + "loss": 0.2218, + "step": 1660 + }, + { + "epoch": 0.24839240317033048, + "grad_norm": 1.591564655332456, + "learning_rate": 8.801007286984581e-06, + "loss": 0.245, + "step": 1661 + }, + { + "epoch": 0.24854194706146254, + "grad_norm": 1.7569520327773172, + "learning_rate": 8.799433411299971e-06, + "loss": 0.4247, + "step": 1662 + }, + { + "epoch": 0.24869149095259457, + "grad_norm": 1.064692259533788, + "learning_rate": 8.797858644231666e-06, + "loss": 0.2029, + "step": 1663 + }, + { + "epoch": 0.24884103484372663, + "grad_norm": 1.7207168422676138, + "learning_rate": 8.796282986149123e-06, + "loss": 0.4056, + "step": 1664 + }, + { + "epoch": 0.2489905787348587, + "grad_norm": 1.8670534021913596, + "learning_rate": 8.794706437422004e-06, + "loss": 0.4252, + "step": 1665 + }, + { + "epoch": 0.24914012262599072, + "grad_norm": 1.1847022204061899, + "learning_rate": 8.793128998420183e-06, + "loss": 0.2108, + "step": 1666 + }, + { + "epoch": 0.24928966651712278, + "grad_norm": 1.7975419112991038, + "learning_rate": 8.791550669513746e-06, + "loss": 0.335, + "step": 1667 + }, + { + "epoch": 0.2494392104082548, + "grad_norm": 1.5925842023054706, + "learning_rate": 8.789971451072979e-06, + "loss": 0.3249, + "step": 1668 + }, + { + "epoch": 0.24958875429938687, + "grad_norm": 1.6010634947496465, + "learning_rate": 8.788391343468385e-06, + "loss": 0.395, + "step": 1669 + }, + { + "epoch": 0.2497382981905189, + "grad_norm": 1.6989646502396285, + "learning_rate": 8.78681034707067e-06, + "loss": 0.2221, + "step": 1670 + }, + { + "epoch": 0.24988784208165096, + "grad_norm": 1.1982003401309609, + "learning_rate": 8.785228462250755e-06, + "loss": 0.1606, + "step": 1671 + }, + { + "epoch": 0.250037385972783, + "grad_norm": 1.504124635470649, + "learning_rate": 8.783645689379763e-06, + "loss": 0.2759, + "step": 1672 + }, + { + "epoch": 0.25018692986391505, + "grad_norm": 1.1944052488188541, + "learning_rate": 8.782062028829028e-06, + "loss": 0.1768, + "step": 1673 + }, + { + "epoch": 0.2503364737550471, + "grad_norm": 2.329098645094478, + "learning_rate": 8.78047748097009e-06, + "loss": 0.8662, + "step": 1674 + }, + { + "epoch": 0.25048601764617917, + "grad_norm": 1.0556820313003359, + "learning_rate": 8.778892046174703e-06, + "loss": 0.1784, + "step": 1675 + }, + { + "epoch": 0.2506355615373112, + "grad_norm": 2.158513791608413, + "learning_rate": 8.777305724814823e-06, + "loss": 0.2676, + "step": 1676 + }, + { + "epoch": 0.25078510542844323, + "grad_norm": 1.4537227026901378, + "learning_rate": 8.775718517262616e-06, + "loss": 0.3745, + "step": 1677 + }, + { + "epoch": 0.2509346493195753, + "grad_norm": 0.950798082633677, + "learning_rate": 8.774130423890457e-06, + "loss": 0.1824, + "step": 1678 + }, + { + "epoch": 0.25108419321070735, + "grad_norm": 1.545457884426641, + "learning_rate": 8.77254144507093e-06, + "loss": 0.3852, + "step": 1679 + }, + { + "epoch": 0.2512337371018394, + "grad_norm": 1.319903070406757, + "learning_rate": 8.770951581176819e-06, + "loss": 0.1928, + "step": 1680 + }, + { + "epoch": 0.2513832809929714, + "grad_norm": 1.6348677618098524, + "learning_rate": 8.769360832581127e-06, + "loss": 0.2614, + "step": 1681 + }, + { + "epoch": 0.2515328248841035, + "grad_norm": 1.2973642919936317, + "learning_rate": 8.767769199657056e-06, + "loss": 0.1926, + "step": 1682 + }, + { + "epoch": 0.25168236877523553, + "grad_norm": 2.0805063872419214, + "learning_rate": 8.766176682778021e-06, + "loss": 0.6252, + "step": 1683 + }, + { + "epoch": 0.25183191266636756, + "grad_norm": 1.5452777534643873, + "learning_rate": 8.76458328231764e-06, + "loss": 0.3641, + "step": 1684 + }, + { + "epoch": 0.25198145655749965, + "grad_norm": 2.031081999653712, + "learning_rate": 8.76298899864974e-06, + "loss": 0.6777, + "step": 1685 + }, + { + "epoch": 0.2521310004486317, + "grad_norm": 1.4412028187219246, + "learning_rate": 8.761393832148355e-06, + "loss": 0.3482, + "step": 1686 + }, + { + "epoch": 0.2522805443397637, + "grad_norm": 1.756325567561018, + "learning_rate": 8.759797783187728e-06, + "loss": 0.486, + "step": 1687 + }, + { + "epoch": 0.25243008823089574, + "grad_norm": 1.4179914068088109, + "learning_rate": 8.758200852142306e-06, + "loss": 0.3271, + "step": 1688 + }, + { + "epoch": 0.25257963212202783, + "grad_norm": 1.5841241720765276, + "learning_rate": 8.756603039386744e-06, + "loss": 0.2038, + "step": 1689 + }, + { + "epoch": 0.25272917601315986, + "grad_norm": 2.0246116663194687, + "learning_rate": 8.755004345295906e-06, + "loss": 0.5168, + "step": 1690 + }, + { + "epoch": 0.2528787199042919, + "grad_norm": 1.4410735414416371, + "learning_rate": 8.753404770244861e-06, + "loss": 0.2331, + "step": 1691 + }, + { + "epoch": 0.253028263795424, + "grad_norm": 1.6406911200868926, + "learning_rate": 8.751804314608885e-06, + "loss": 0.3777, + "step": 1692 + }, + { + "epoch": 0.253177807686556, + "grad_norm": 1.4803860988544733, + "learning_rate": 8.750202978763455e-06, + "loss": 0.3677, + "step": 1693 + }, + { + "epoch": 0.25332735157768804, + "grad_norm": 1.1771068735919579, + "learning_rate": 8.748600763084267e-06, + "loss": 0.3354, + "step": 1694 + }, + { + "epoch": 0.2534768954688201, + "grad_norm": 1.364747322095615, + "learning_rate": 8.746997667947215e-06, + "loss": 0.2101, + "step": 1695 + }, + { + "epoch": 0.25362643935995216, + "grad_norm": 1.0147659263108189, + "learning_rate": 8.745393693728395e-06, + "loss": 0.2104, + "step": 1696 + }, + { + "epoch": 0.2537759832510842, + "grad_norm": 1.3008901060796498, + "learning_rate": 8.74378884080412e-06, + "loss": 0.2224, + "step": 1697 + }, + { + "epoch": 0.2539255271422162, + "grad_norm": 1.5410622225873059, + "learning_rate": 8.742183109550906e-06, + "loss": 0.3175, + "step": 1698 + }, + { + "epoch": 0.2540750710333483, + "grad_norm": 1.1271133540339682, + "learning_rate": 8.740576500345465e-06, + "loss": 0.2078, + "step": 1699 + }, + { + "epoch": 0.25422461492448034, + "grad_norm": 1.5002212907557977, + "learning_rate": 8.73896901356473e-06, + "loss": 0.3457, + "step": 1700 + }, + { + "epoch": 0.2543741588156124, + "grad_norm": 1.5194450466740133, + "learning_rate": 8.737360649585831e-06, + "loss": 0.2974, + "step": 1701 + }, + { + "epoch": 0.2545237027067444, + "grad_norm": 1.541411347823068, + "learning_rate": 8.735751408786106e-06, + "loss": 0.1909, + "step": 1702 + }, + { + "epoch": 0.2546732465978765, + "grad_norm": 1.0383311616312054, + "learning_rate": 8.734141291543096e-06, + "loss": 0.1968, + "step": 1703 + }, + { + "epoch": 0.2548227904890085, + "grad_norm": 1.1554526724480272, + "learning_rate": 8.732530298234551e-06, + "loss": 0.3622, + "step": 1704 + }, + { + "epoch": 0.25497233438014055, + "grad_norm": 1.5935676507788412, + "learning_rate": 8.730918429238429e-06, + "loss": 0.4022, + "step": 1705 + }, + { + "epoch": 0.25512187827127264, + "grad_norm": 1.7826101439006568, + "learning_rate": 8.729305684932884e-06, + "loss": 0.4635, + "step": 1706 + }, + { + "epoch": 0.25527142216240467, + "grad_norm": 1.9360511884951532, + "learning_rate": 8.727692065696286e-06, + "loss": 0.5279, + "step": 1707 + }, + { + "epoch": 0.2554209660535367, + "grad_norm": 2.2688771091405355, + "learning_rate": 8.726077571907205e-06, + "loss": 0.4807, + "step": 1708 + }, + { + "epoch": 0.25557050994466873, + "grad_norm": 1.1500886235737136, + "learning_rate": 8.724462203944417e-06, + "loss": 0.1742, + "step": 1709 + }, + { + "epoch": 0.2557200538358008, + "grad_norm": 2.1581902257611505, + "learning_rate": 8.7228459621869e-06, + "loss": 0.2518, + "step": 1710 + }, + { + "epoch": 0.25586959772693285, + "grad_norm": 1.5158538301505384, + "learning_rate": 8.721228847013844e-06, + "loss": 0.4283, + "step": 1711 + }, + { + "epoch": 0.2560191416180649, + "grad_norm": 1.1144613603300342, + "learning_rate": 8.719610858804634e-06, + "loss": 0.1865, + "step": 1712 + }, + { + "epoch": 0.25616868550919697, + "grad_norm": 1.712020727078079, + "learning_rate": 8.717991997938872e-06, + "loss": 0.4059, + "step": 1713 + }, + { + "epoch": 0.256318229400329, + "grad_norm": 1.3838237147007102, + "learning_rate": 8.716372264796355e-06, + "loss": 0.3562, + "step": 1714 + }, + { + "epoch": 0.25646777329146103, + "grad_norm": 2.009760245861866, + "learning_rate": 8.71475165975709e-06, + "loss": 0.6606, + "step": 1715 + }, + { + "epoch": 0.2566173171825931, + "grad_norm": 1.4629059301831144, + "learning_rate": 8.713130183201283e-06, + "loss": 0.204, + "step": 1716 + }, + { + "epoch": 0.25676686107372515, + "grad_norm": 1.712741449487023, + "learning_rate": 8.711507835509352e-06, + "loss": 0.3339, + "step": 1717 + }, + { + "epoch": 0.2569164049648572, + "grad_norm": 1.7719128226224596, + "learning_rate": 8.709884617061912e-06, + "loss": 0.4942, + "step": 1718 + }, + { + "epoch": 0.2570659488559892, + "grad_norm": 2.5501496729214645, + "learning_rate": 8.708260528239788e-06, + "loss": 0.1606, + "step": 1719 + }, + { + "epoch": 0.2572154927471213, + "grad_norm": 1.3379881043667554, + "learning_rate": 8.706635569424007e-06, + "loss": 0.1771, + "step": 1720 + }, + { + "epoch": 0.25736503663825333, + "grad_norm": 1.3113682840451848, + "learning_rate": 8.705009740995798e-06, + "loss": 0.1974, + "step": 1721 + }, + { + "epoch": 0.25751458052938536, + "grad_norm": 1.3252114693451582, + "learning_rate": 8.703383043336598e-06, + "loss": 0.1857, + "step": 1722 + }, + { + "epoch": 0.25766412442051745, + "grad_norm": 1.2740861405038273, + "learning_rate": 8.701755476828045e-06, + "loss": 0.179, + "step": 1723 + }, + { + "epoch": 0.2578136683116495, + "grad_norm": 1.106234026109271, + "learning_rate": 8.700127041851983e-06, + "loss": 0.2017, + "step": 1724 + }, + { + "epoch": 0.2579632122027815, + "grad_norm": 1.6172632969033804, + "learning_rate": 8.698497738790456e-06, + "loss": 0.3627, + "step": 1725 + }, + { + "epoch": 0.25811275609391354, + "grad_norm": 1.1499075369100074, + "learning_rate": 8.696867568025715e-06, + "loss": 0.2216, + "step": 1726 + }, + { + "epoch": 0.25826229998504563, + "grad_norm": 1.360793130358255, + "learning_rate": 8.695236529940217e-06, + "loss": 0.2168, + "step": 1727 + }, + { + "epoch": 0.25841184387617766, + "grad_norm": 1.5474318214998557, + "learning_rate": 8.693604624916614e-06, + "loss": 0.3872, + "step": 1728 + }, + { + "epoch": 0.2585613877673097, + "grad_norm": 1.603910943390027, + "learning_rate": 8.691971853337772e-06, + "loss": 0.3828, + "step": 1729 + }, + { + "epoch": 0.2587109316584418, + "grad_norm": 1.6171942954911174, + "learning_rate": 8.69033821558675e-06, + "loss": 0.3922, + "step": 1730 + }, + { + "epoch": 0.2588604755495738, + "grad_norm": 1.2280233394489264, + "learning_rate": 8.68870371204682e-06, + "loss": 0.234, + "step": 1731 + }, + { + "epoch": 0.25901001944070584, + "grad_norm": 1.495515188592798, + "learning_rate": 8.687068343101449e-06, + "loss": 0.3597, + "step": 1732 + }, + { + "epoch": 0.25915956333183787, + "grad_norm": 1.2004438690731185, + "learning_rate": 8.685432109134309e-06, + "loss": 0.3891, + "step": 1733 + }, + { + "epoch": 0.25930910722296996, + "grad_norm": 1.1612103655288195, + "learning_rate": 8.68379501052928e-06, + "loss": 0.1674, + "step": 1734 + }, + { + "epoch": 0.259458651114102, + "grad_norm": 1.5379960884614503, + "learning_rate": 8.682157047670439e-06, + "loss": 0.2165, + "step": 1735 + }, + { + "epoch": 0.259608195005234, + "grad_norm": 1.256481316250901, + "learning_rate": 8.68051822094207e-06, + "loss": 0.1956, + "step": 1736 + }, + { + "epoch": 0.2597577388963661, + "grad_norm": 1.150195060615946, + "learning_rate": 8.678878530728653e-06, + "loss": 0.2046, + "step": 1737 + }, + { + "epoch": 0.25990728278749814, + "grad_norm": 0.9645721264599928, + "learning_rate": 8.677237977414879e-06, + "loss": 0.2169, + "step": 1738 + }, + { + "epoch": 0.26005682667863017, + "grad_norm": 2.055210192087077, + "learning_rate": 8.675596561385637e-06, + "loss": 0.5667, + "step": 1739 + }, + { + "epoch": 0.2602063705697622, + "grad_norm": 1.6323123777977344, + "learning_rate": 8.67395428302602e-06, + "loss": 0.2085, + "step": 1740 + }, + { + "epoch": 0.2603559144608943, + "grad_norm": 2.0705229310389734, + "learning_rate": 8.67231114272132e-06, + "loss": 0.6948, + "step": 1741 + }, + { + "epoch": 0.2605054583520263, + "grad_norm": 1.4253846439937559, + "learning_rate": 8.670667140857034e-06, + "loss": 0.2156, + "step": 1742 + }, + { + "epoch": 0.26065500224315835, + "grad_norm": 1.2432278896167281, + "learning_rate": 8.669022277818861e-06, + "loss": 0.1668, + "step": 1743 + }, + { + "epoch": 0.26080454613429044, + "grad_norm": 2.1626989048511396, + "learning_rate": 8.6673765539927e-06, + "loss": 0.5859, + "step": 1744 + }, + { + "epoch": 0.26095409002542247, + "grad_norm": 1.3212936003748748, + "learning_rate": 8.66572996976466e-06, + "loss": 0.2208, + "step": 1745 + }, + { + "epoch": 0.2611036339165545, + "grad_norm": 1.6949012011313151, + "learning_rate": 8.66408252552104e-06, + "loss": 0.3465, + "step": 1746 + }, + { + "epoch": 0.26125317780768653, + "grad_norm": 1.6334321794556426, + "learning_rate": 8.662434221648344e-06, + "loss": 0.3536, + "step": 1747 + }, + { + "epoch": 0.2614027216988186, + "grad_norm": 1.4554313973278277, + "learning_rate": 8.660785058533288e-06, + "loss": 0.1853, + "step": 1748 + }, + { + "epoch": 0.26155226558995065, + "grad_norm": 1.4889399004698576, + "learning_rate": 8.659135036562774e-06, + "loss": 0.3361, + "step": 1749 + }, + { + "epoch": 0.2617018094810827, + "grad_norm": 1.2820685580565203, + "learning_rate": 8.657484156123916e-06, + "loss": 0.2098, + "step": 1750 + }, + { + "epoch": 0.26185135337221477, + "grad_norm": 1.4528196588291855, + "learning_rate": 8.655832417604028e-06, + "loss": 0.3716, + "step": 1751 + }, + { + "epoch": 0.2620008972633468, + "grad_norm": 1.1976317675054606, + "learning_rate": 8.65417982139062e-06, + "loss": 0.2427, + "step": 1752 + }, + { + "epoch": 0.26215044115447883, + "grad_norm": 1.5994876354129555, + "learning_rate": 8.652526367871412e-06, + "loss": 0.4616, + "step": 1753 + }, + { + "epoch": 0.26229998504561086, + "grad_norm": 1.4225347188398327, + "learning_rate": 8.650872057434316e-06, + "loss": 0.2385, + "step": 1754 + }, + { + "epoch": 0.26244952893674295, + "grad_norm": 1.6636967962332678, + "learning_rate": 8.64921689046745e-06, + "loss": 0.2447, + "step": 1755 + }, + { + "epoch": 0.262599072827875, + "grad_norm": 1.5225788924725037, + "learning_rate": 8.647560867359133e-06, + "loss": 0.3694, + "step": 1756 + }, + { + "epoch": 0.262748616719007, + "grad_norm": 1.2202525703548277, + "learning_rate": 8.645903988497884e-06, + "loss": 0.187, + "step": 1757 + }, + { + "epoch": 0.2628981606101391, + "grad_norm": 1.9853373281678817, + "learning_rate": 8.644246254272423e-06, + "loss": 0.2257, + "step": 1758 + }, + { + "epoch": 0.26304770450127113, + "grad_norm": 1.1859244322025415, + "learning_rate": 8.64258766507167e-06, + "loss": 0.2107, + "step": 1759 + }, + { + "epoch": 0.26319724839240316, + "grad_norm": 1.610055608255526, + "learning_rate": 8.640928221284744e-06, + "loss": 0.3033, + "step": 1760 + }, + { + "epoch": 0.2633467922835352, + "grad_norm": 1.322382799898529, + "learning_rate": 8.63926792330097e-06, + "loss": 0.219, + "step": 1761 + }, + { + "epoch": 0.2634963361746673, + "grad_norm": 1.5892475425535593, + "learning_rate": 8.637606771509868e-06, + "loss": 0.3678, + "step": 1762 + }, + { + "epoch": 0.2636458800657993, + "grad_norm": 1.6038277957567881, + "learning_rate": 8.635944766301158e-06, + "loss": 0.3819, + "step": 1763 + }, + { + "epoch": 0.26379542395693134, + "grad_norm": 1.7817364162394536, + "learning_rate": 8.634281908064767e-06, + "loss": 0.3632, + "step": 1764 + }, + { + "epoch": 0.2639449678480634, + "grad_norm": 1.8816182001166262, + "learning_rate": 8.632618197190817e-06, + "loss": 0.4874, + "step": 1765 + }, + { + "epoch": 0.26409451173919546, + "grad_norm": 1.7637033301301739, + "learning_rate": 8.630953634069627e-06, + "loss": 0.2109, + "step": 1766 + }, + { + "epoch": 0.2642440556303275, + "grad_norm": 1.4820279840766182, + "learning_rate": 8.629288219091722e-06, + "loss": 0.2482, + "step": 1767 + }, + { + "epoch": 0.2643935995214595, + "grad_norm": 1.3257383394968065, + "learning_rate": 8.627621952647825e-06, + "loss": 0.2066, + "step": 1768 + }, + { + "epoch": 0.2645431434125916, + "grad_norm": 1.6804248433426907, + "learning_rate": 8.625954835128856e-06, + "loss": 0.2436, + "step": 1769 + }, + { + "epoch": 0.26469268730372364, + "grad_norm": 1.8362570342532276, + "learning_rate": 8.624286866925938e-06, + "loss": 0.3694, + "step": 1770 + }, + { + "epoch": 0.26484223119485567, + "grad_norm": 1.5661865351400834, + "learning_rate": 8.622618048430391e-06, + "loss": 0.3591, + "step": 1771 + }, + { + "epoch": 0.26499177508598776, + "grad_norm": 2.1673241284131044, + "learning_rate": 8.62094838003374e-06, + "loss": 0.5737, + "step": 1772 + }, + { + "epoch": 0.2651413189771198, + "grad_norm": 1.0699484499241998, + "learning_rate": 8.619277862127702e-06, + "loss": 0.2189, + "step": 1773 + }, + { + "epoch": 0.2652908628682518, + "grad_norm": 1.736379723484864, + "learning_rate": 8.617606495104198e-06, + "loss": 0.4754, + "step": 1774 + }, + { + "epoch": 0.2654404067593839, + "grad_norm": 1.583421637766384, + "learning_rate": 8.615934279355345e-06, + "loss": 0.3366, + "step": 1775 + }, + { + "epoch": 0.26558995065051594, + "grad_norm": 1.6478010096837548, + "learning_rate": 8.614261215273462e-06, + "loss": 0.3672, + "step": 1776 + }, + { + "epoch": 0.26573949454164797, + "grad_norm": 1.9102927551172686, + "learning_rate": 8.612587303251065e-06, + "loss": 0.4308, + "step": 1777 + }, + { + "epoch": 0.26588903843278, + "grad_norm": 1.013647409926638, + "learning_rate": 8.610912543680872e-06, + "loss": 0.2259, + "step": 1778 + }, + { + "epoch": 0.2660385823239121, + "grad_norm": 0.9074429437734283, + "learning_rate": 8.609236936955797e-06, + "loss": 0.1942, + "step": 1779 + }, + { + "epoch": 0.2661881262150441, + "grad_norm": 1.1128832323090567, + "learning_rate": 8.607560483468953e-06, + "loss": 0.2495, + "step": 1780 + }, + { + "epoch": 0.26633767010617615, + "grad_norm": 1.4209004510295011, + "learning_rate": 8.60588318361365e-06, + "loss": 0.2153, + "step": 1781 + }, + { + "epoch": 0.26648721399730824, + "grad_norm": 1.5238417321433448, + "learning_rate": 8.604205037783404e-06, + "loss": 0.1742, + "step": 1782 + }, + { + "epoch": 0.26663675788844027, + "grad_norm": 1.2906114880614232, + "learning_rate": 8.60252604637192e-06, + "loss": 0.3304, + "step": 1783 + }, + { + "epoch": 0.2667863017795723, + "grad_norm": 2.0207728584980726, + "learning_rate": 8.600846209773107e-06, + "loss": 0.7203, + "step": 1784 + }, + { + "epoch": 0.26693584567070433, + "grad_norm": 1.0286990588537865, + "learning_rate": 8.599165528381068e-06, + "loss": 0.3377, + "step": 1785 + }, + { + "epoch": 0.2670853895618364, + "grad_norm": 1.6105436646556477, + "learning_rate": 8.597484002590113e-06, + "loss": 0.3509, + "step": 1786 + }, + { + "epoch": 0.26723493345296845, + "grad_norm": 1.655849264356554, + "learning_rate": 8.595801632794738e-06, + "loss": 0.3233, + "step": 1787 + }, + { + "epoch": 0.2673844773441005, + "grad_norm": 1.411449436450616, + "learning_rate": 8.594118419389648e-06, + "loss": 0.2504, + "step": 1788 + }, + { + "epoch": 0.26753402123523257, + "grad_norm": 1.3172823398923945, + "learning_rate": 8.592434362769738e-06, + "loss": 0.3616, + "step": 1789 + }, + { + "epoch": 0.2676835651263646, + "grad_norm": 1.4726126876884136, + "learning_rate": 8.590749463330105e-06, + "loss": 0.1986, + "step": 1790 + }, + { + "epoch": 0.26783310901749663, + "grad_norm": 1.7217144976129872, + "learning_rate": 8.589063721466041e-06, + "loss": 0.3317, + "step": 1791 + }, + { + "epoch": 0.26798265290862866, + "grad_norm": 1.5253687117024657, + "learning_rate": 8.58737713757304e-06, + "loss": 0.1819, + "step": 1792 + }, + { + "epoch": 0.26813219679976075, + "grad_norm": 1.0839801985857815, + "learning_rate": 8.585689712046792e-06, + "loss": 0.1985, + "step": 1793 + }, + { + "epoch": 0.2682817406908928, + "grad_norm": 1.4726576160050187, + "learning_rate": 8.58400144528318e-06, + "loss": 0.3811, + "step": 1794 + }, + { + "epoch": 0.2684312845820248, + "grad_norm": 1.628036147711573, + "learning_rate": 8.582312337678286e-06, + "loss": 0.4639, + "step": 1795 + }, + { + "epoch": 0.2685808284731569, + "grad_norm": 1.8952018740494194, + "learning_rate": 8.580622389628395e-06, + "loss": 0.3552, + "step": 1796 + }, + { + "epoch": 0.2687303723642889, + "grad_norm": 2.001984029391798, + "learning_rate": 8.578931601529983e-06, + "loss": 0.3726, + "step": 1797 + }, + { + "epoch": 0.26887991625542096, + "grad_norm": 1.2603899746687859, + "learning_rate": 8.577239973779727e-06, + "loss": 0.3207, + "step": 1798 + }, + { + "epoch": 0.269029460146553, + "grad_norm": 1.4854613896182869, + "learning_rate": 8.575547506774498e-06, + "loss": 0.2007, + "step": 1799 + }, + { + "epoch": 0.2691790040376851, + "grad_norm": 1.8906751578747407, + "learning_rate": 8.573854200911365e-06, + "loss": 0.6032, + "step": 1800 + }, + { + "epoch": 0.2693285479288171, + "grad_norm": 1.9613152098406168, + "learning_rate": 8.572160056587592e-06, + "loss": 0.3775, + "step": 1801 + }, + { + "epoch": 0.26947809181994914, + "grad_norm": 0.9889564582361468, + "learning_rate": 8.570465074200645e-06, + "loss": 0.1862, + "step": 1802 + }, + { + "epoch": 0.2696276357110812, + "grad_norm": 2.0931504871597015, + "learning_rate": 8.568769254148182e-06, + "loss": 0.5499, + "step": 1803 + }, + { + "epoch": 0.26977717960221326, + "grad_norm": 1.5128120195103958, + "learning_rate": 8.56707259682806e-06, + "loss": 0.3961, + "step": 1804 + }, + { + "epoch": 0.2699267234933453, + "grad_norm": 1.4113868405534056, + "learning_rate": 8.565375102638327e-06, + "loss": 0.332, + "step": 1805 + }, + { + "epoch": 0.2700762673844773, + "grad_norm": 1.9594938693881467, + "learning_rate": 8.563676771977234e-06, + "loss": 0.5145, + "step": 1806 + }, + { + "epoch": 0.2702258112756094, + "grad_norm": 1.7464884113562893, + "learning_rate": 8.561977605243228e-06, + "loss": 0.3668, + "step": 1807 + }, + { + "epoch": 0.27037535516674144, + "grad_norm": 1.1626732860512075, + "learning_rate": 8.560277602834945e-06, + "loss": 0.168, + "step": 1808 + }, + { + "epoch": 0.27052489905787347, + "grad_norm": 1.6748170212716917, + "learning_rate": 8.558576765151227e-06, + "loss": 0.2433, + "step": 1809 + }, + { + "epoch": 0.27067444294900556, + "grad_norm": 2.025570121175929, + "learning_rate": 8.556875092591104e-06, + "loss": 0.2798, + "step": 1810 + }, + { + "epoch": 0.2708239868401376, + "grad_norm": 2.374954242014533, + "learning_rate": 8.555172585553804e-06, + "loss": 0.4171, + "step": 1811 + }, + { + "epoch": 0.2709735307312696, + "grad_norm": 1.7745787943498124, + "learning_rate": 8.553469244438754e-06, + "loss": 0.3595, + "step": 1812 + }, + { + "epoch": 0.27112307462240165, + "grad_norm": 2.431602477728119, + "learning_rate": 8.551765069645574e-06, + "loss": 0.472, + "step": 1813 + }, + { + "epoch": 0.27127261851353374, + "grad_norm": 1.5428274790156662, + "learning_rate": 8.55006006157408e-06, + "loss": 0.3913, + "step": 1814 + }, + { + "epoch": 0.27142216240466577, + "grad_norm": 1.4868080151451115, + "learning_rate": 8.548354220624278e-06, + "loss": 0.2358, + "step": 1815 + }, + { + "epoch": 0.2715717062957978, + "grad_norm": 1.5637252724169068, + "learning_rate": 8.546647547196383e-06, + "loss": 0.1934, + "step": 1816 + }, + { + "epoch": 0.2717212501869299, + "grad_norm": 1.4784784501314225, + "learning_rate": 8.544940041690792e-06, + "loss": 0.5315, + "step": 1817 + }, + { + "epoch": 0.2718707940780619, + "grad_norm": 1.6049147134883024, + "learning_rate": 8.543231704508102e-06, + "loss": 0.2277, + "step": 1818 + }, + { + "epoch": 0.27202033796919395, + "grad_norm": 1.1639185886904326, + "learning_rate": 8.541522536049108e-06, + "loss": 0.2021, + "step": 1819 + }, + { + "epoch": 0.272169881860326, + "grad_norm": 1.5080421265491435, + "learning_rate": 8.539812536714796e-06, + "loss": 0.1978, + "step": 1820 + }, + { + "epoch": 0.27231942575145807, + "grad_norm": 0.880232094927403, + "learning_rate": 8.538101706906347e-06, + "loss": 0.1639, + "step": 1821 + }, + { + "epoch": 0.2724689696425901, + "grad_norm": 1.4359482262535581, + "learning_rate": 8.536390047025143e-06, + "loss": 0.2078, + "step": 1822 + }, + { + "epoch": 0.27261851353372213, + "grad_norm": 1.3134455382926835, + "learning_rate": 8.53467755747275e-06, + "loss": 0.3189, + "step": 1823 + }, + { + "epoch": 0.2727680574248542, + "grad_norm": 1.4404618987021778, + "learning_rate": 8.532964238650938e-06, + "loss": 0.3408, + "step": 1824 + }, + { + "epoch": 0.27291760131598625, + "grad_norm": 1.3234925937557052, + "learning_rate": 8.531250090961666e-06, + "loss": 0.2465, + "step": 1825 + }, + { + "epoch": 0.2730671452071183, + "grad_norm": 1.678127185102635, + "learning_rate": 8.52953511480709e-06, + "loss": 0.5327, + "step": 1826 + }, + { + "epoch": 0.2732166890982503, + "grad_norm": 1.4928358926152048, + "learning_rate": 8.527819310589564e-06, + "loss": 0.3706, + "step": 1827 + }, + { + "epoch": 0.2733662329893824, + "grad_norm": 1.7549470816483508, + "learning_rate": 8.526102678711626e-06, + "loss": 0.4288, + "step": 1828 + }, + { + "epoch": 0.2735157768805144, + "grad_norm": 1.5327378751610867, + "learning_rate": 8.524385219576019e-06, + "loss": 0.4647, + "step": 1829 + }, + { + "epoch": 0.27366532077164646, + "grad_norm": 1.7197750676087826, + "learning_rate": 8.522666933585672e-06, + "loss": 0.3523, + "step": 1830 + }, + { + "epoch": 0.27381486466277855, + "grad_norm": 1.174817852571183, + "learning_rate": 8.520947821143714e-06, + "loss": 0.2134, + "step": 1831 + }, + { + "epoch": 0.2739644085539106, + "grad_norm": 2.0070860843733302, + "learning_rate": 8.519227882653465e-06, + "loss": 0.4656, + "step": 1832 + }, + { + "epoch": 0.2741139524450426, + "grad_norm": 1.7275106234713773, + "learning_rate": 8.517507118518436e-06, + "loss": 0.5513, + "step": 1833 + }, + { + "epoch": 0.2742634963361747, + "grad_norm": 1.2685207625970671, + "learning_rate": 8.515785529142339e-06, + "loss": 0.2491, + "step": 1834 + }, + { + "epoch": 0.2744130402273067, + "grad_norm": 1.2419112651105808, + "learning_rate": 8.514063114929077e-06, + "loss": 0.1897, + "step": 1835 + }, + { + "epoch": 0.27456258411843876, + "grad_norm": 1.1605015571551742, + "learning_rate": 8.512339876282738e-06, + "loss": 0.2282, + "step": 1836 + }, + { + "epoch": 0.2747121280095708, + "grad_norm": 1.2889897008048643, + "learning_rate": 8.510615813607617e-06, + "loss": 0.3405, + "step": 1837 + }, + { + "epoch": 0.2748616719007029, + "grad_norm": 1.8490938154917704, + "learning_rate": 8.508890927308191e-06, + "loss": 0.3821, + "step": 1838 + }, + { + "epoch": 0.2750112157918349, + "grad_norm": 1.3689509544740683, + "learning_rate": 8.507165217789137e-06, + "loss": 0.3832, + "step": 1839 + }, + { + "epoch": 0.27516075968296694, + "grad_norm": 1.6403361270537955, + "learning_rate": 8.505438685455325e-06, + "loss": 0.3848, + "step": 1840 + }, + { + "epoch": 0.275310303574099, + "grad_norm": 1.8182842779990729, + "learning_rate": 8.503711330711813e-06, + "loss": 0.2549, + "step": 1841 + }, + { + "epoch": 0.27545984746523106, + "grad_norm": 1.7148274136332127, + "learning_rate": 8.501983153963855e-06, + "loss": 0.5105, + "step": 1842 + }, + { + "epoch": 0.2756093913563631, + "grad_norm": 1.537038227068178, + "learning_rate": 8.500254155616901e-06, + "loss": 0.3514, + "step": 1843 + }, + { + "epoch": 0.2757589352474951, + "grad_norm": 1.641983169416554, + "learning_rate": 8.498524336076587e-06, + "loss": 0.6491, + "step": 1844 + }, + { + "epoch": 0.2759084791386272, + "grad_norm": 1.250729132131747, + "learning_rate": 8.49679369574875e-06, + "loss": 0.2051, + "step": 1845 + }, + { + "epoch": 0.27605802302975924, + "grad_norm": 1.579592841261978, + "learning_rate": 8.49506223503941e-06, + "loss": 0.3549, + "step": 1846 + }, + { + "epoch": 0.27620756692089127, + "grad_norm": 1.1166297247129173, + "learning_rate": 8.493329954354788e-06, + "loss": 0.2016, + "step": 1847 + }, + { + "epoch": 0.27635711081202335, + "grad_norm": 1.457594575978762, + "learning_rate": 8.491596854101292e-06, + "loss": 0.3752, + "step": 1848 + }, + { + "epoch": 0.2765066547031554, + "grad_norm": 1.6194515688957392, + "learning_rate": 8.489862934685523e-06, + "loss": 0.2791, + "step": 1849 + }, + { + "epoch": 0.2766561985942874, + "grad_norm": 1.189645094014293, + "learning_rate": 8.488128196514279e-06, + "loss": 0.1892, + "step": 1850 + }, + { + "epoch": 0.27680574248541945, + "grad_norm": 1.6414546776027312, + "learning_rate": 8.486392639994541e-06, + "loss": 0.4263, + "step": 1851 + }, + { + "epoch": 0.27695528637655153, + "grad_norm": 1.1030338687374446, + "learning_rate": 8.48465626553349e-06, + "loss": 0.2174, + "step": 1852 + }, + { + "epoch": 0.27710483026768357, + "grad_norm": 2.062987335024284, + "learning_rate": 8.482919073538498e-06, + "loss": 0.5137, + "step": 1853 + }, + { + "epoch": 0.2772543741588156, + "grad_norm": 1.9265333026489615, + "learning_rate": 8.481181064417124e-06, + "loss": 0.5053, + "step": 1854 + }, + { + "epoch": 0.2774039180499477, + "grad_norm": 1.2857660275288423, + "learning_rate": 8.479442238577123e-06, + "loss": 0.2356, + "step": 1855 + }, + { + "epoch": 0.2775534619410797, + "grad_norm": 1.4308427978938358, + "learning_rate": 8.477702596426441e-06, + "loss": 0.3124, + "step": 1856 + }, + { + "epoch": 0.27770300583221175, + "grad_norm": 2.112210621284666, + "learning_rate": 8.475962138373212e-06, + "loss": 0.5583, + "step": 1857 + }, + { + "epoch": 0.2778525497233438, + "grad_norm": 1.5648504781268902, + "learning_rate": 8.474220864825768e-06, + "loss": 0.2046, + "step": 1858 + }, + { + "epoch": 0.27800209361447586, + "grad_norm": 1.0861658989898926, + "learning_rate": 8.472478776192624e-06, + "loss": 0.1941, + "step": 1859 + }, + { + "epoch": 0.2781516375056079, + "grad_norm": 1.7593139339710724, + "learning_rate": 8.470735872882498e-06, + "loss": 0.3911, + "step": 1860 + }, + { + "epoch": 0.2783011813967399, + "grad_norm": 1.9051060527282064, + "learning_rate": 8.468992155304285e-06, + "loss": 0.3441, + "step": 1861 + }, + { + "epoch": 0.278450725287872, + "grad_norm": 1.2726234923585076, + "learning_rate": 8.46724762386708e-06, + "loss": 0.1966, + "step": 1862 + }, + { + "epoch": 0.27860026917900405, + "grad_norm": 1.6275578962742665, + "learning_rate": 8.465502278980168e-06, + "loss": 0.2015, + "step": 1863 + }, + { + "epoch": 0.2787498130701361, + "grad_norm": 1.1894457179498164, + "learning_rate": 8.463756121053024e-06, + "loss": 0.2064, + "step": 1864 + }, + { + "epoch": 0.2788993569612681, + "grad_norm": 0.8970585928811048, + "learning_rate": 8.462009150495311e-06, + "loss": 0.2077, + "step": 1865 + }, + { + "epoch": 0.2790489008524002, + "grad_norm": 1.4979876938013277, + "learning_rate": 8.460261367716888e-06, + "loss": 0.1868, + "step": 1866 + }, + { + "epoch": 0.2791984447435322, + "grad_norm": 1.5952489152766243, + "learning_rate": 8.458512773127801e-06, + "loss": 0.3924, + "step": 1867 + }, + { + "epoch": 0.27934798863466426, + "grad_norm": 1.3568222258261973, + "learning_rate": 8.456763367138283e-06, + "loss": 0.3057, + "step": 1868 + }, + { + "epoch": 0.27949753252579634, + "grad_norm": 1.430069295266679, + "learning_rate": 8.455013150158767e-06, + "loss": 0.2027, + "step": 1869 + }, + { + "epoch": 0.2796470764169284, + "grad_norm": 1.835468064313153, + "learning_rate": 8.453262122599871e-06, + "loss": 0.203, + "step": 1870 + }, + { + "epoch": 0.2797966203080604, + "grad_norm": 1.16217124036999, + "learning_rate": 8.451510284872397e-06, + "loss": 0.2065, + "step": 1871 + }, + { + "epoch": 0.27994616419919244, + "grad_norm": 1.2439706377831612, + "learning_rate": 8.449757637387349e-06, + "loss": 0.1854, + "step": 1872 + }, + { + "epoch": 0.2800957080903245, + "grad_norm": 1.771165875925341, + "learning_rate": 8.448004180555912e-06, + "loss": 0.2118, + "step": 1873 + }, + { + "epoch": 0.28024525198145656, + "grad_norm": 1.448237639694636, + "learning_rate": 8.446249914789464e-06, + "loss": 0.329, + "step": 1874 + }, + { + "epoch": 0.2803947958725886, + "grad_norm": 1.9446859154931548, + "learning_rate": 8.444494840499573e-06, + "loss": 0.1968, + "step": 1875 + }, + { + "epoch": 0.2805443397637207, + "grad_norm": 1.1282582980944778, + "learning_rate": 8.442738958097998e-06, + "loss": 0.1929, + "step": 1876 + }, + { + "epoch": 0.2806938836548527, + "grad_norm": 0.9295466099898506, + "learning_rate": 8.440982267996683e-06, + "loss": 0.176, + "step": 1877 + }, + { + "epoch": 0.28084342754598474, + "grad_norm": 1.6067454611327383, + "learning_rate": 8.439224770607768e-06, + "loss": 0.2216, + "step": 1878 + }, + { + "epoch": 0.28099297143711677, + "grad_norm": 1.506148146117863, + "learning_rate": 8.437466466343573e-06, + "loss": 0.3125, + "step": 1879 + }, + { + "epoch": 0.28114251532824885, + "grad_norm": 1.1300992074338925, + "learning_rate": 8.43570735561662e-06, + "loss": 0.2173, + "step": 1880 + }, + { + "epoch": 0.2812920592193809, + "grad_norm": 1.0011833423935625, + "learning_rate": 8.43394743883961e-06, + "loss": 0.2908, + "step": 1881 + }, + { + "epoch": 0.2814416031105129, + "grad_norm": 1.8315268271499454, + "learning_rate": 8.432186716425438e-06, + "loss": 0.3737, + "step": 1882 + }, + { + "epoch": 0.281591147001645, + "grad_norm": 1.7995305921177922, + "learning_rate": 8.430425188787181e-06, + "loss": 0.3303, + "step": 1883 + }, + { + "epoch": 0.28174069089277703, + "grad_norm": 1.2710141324651125, + "learning_rate": 8.428662856338116e-06, + "loss": 0.2441, + "step": 1884 + }, + { + "epoch": 0.28189023478390907, + "grad_norm": 1.3946588107946323, + "learning_rate": 8.426899719491703e-06, + "loss": 0.2108, + "step": 1885 + }, + { + "epoch": 0.2820397786750411, + "grad_norm": 1.1149474892337983, + "learning_rate": 8.42513577866159e-06, + "loss": 0.2215, + "step": 1886 + }, + { + "epoch": 0.2821893225661732, + "grad_norm": 2.008735822937365, + "learning_rate": 8.423371034261612e-06, + "loss": 0.4968, + "step": 1887 + }, + { + "epoch": 0.2823388664573052, + "grad_norm": 1.935748092708562, + "learning_rate": 8.4216054867058e-06, + "loss": 0.1889, + "step": 1888 + }, + { + "epoch": 0.28248841034843725, + "grad_norm": 1.7520452919556104, + "learning_rate": 8.419839136408361e-06, + "loss": 0.3447, + "step": 1889 + }, + { + "epoch": 0.28263795423956933, + "grad_norm": 0.8268783166552869, + "learning_rate": 8.418071983783707e-06, + "loss": 0.2028, + "step": 1890 + }, + { + "epoch": 0.28278749813070136, + "grad_norm": 1.2060134973326222, + "learning_rate": 8.416304029246422e-06, + "loss": 0.2033, + "step": 1891 + }, + { + "epoch": 0.2829370420218334, + "grad_norm": 1.081395124056579, + "learning_rate": 8.414535273211286e-06, + "loss": 0.2197, + "step": 1892 + }, + { + "epoch": 0.2830865859129655, + "grad_norm": 1.6424700307942506, + "learning_rate": 8.412765716093273e-06, + "loss": 0.3939, + "step": 1893 + }, + { + "epoch": 0.2832361298040975, + "grad_norm": 1.3786288095772328, + "learning_rate": 8.410995358307528e-06, + "loss": 0.187, + "step": 1894 + }, + { + "epoch": 0.28338567369522955, + "grad_norm": 1.1978351240462277, + "learning_rate": 8.4092242002694e-06, + "loss": 0.2215, + "step": 1895 + }, + { + "epoch": 0.2835352175863616, + "grad_norm": 1.2274171257303508, + "learning_rate": 8.40745224239442e-06, + "loss": 0.2047, + "step": 1896 + }, + { + "epoch": 0.28368476147749366, + "grad_norm": 1.5902928460844958, + "learning_rate": 8.405679485098304e-06, + "loss": 0.3429, + "step": 1897 + }, + { + "epoch": 0.2838343053686257, + "grad_norm": 1.8154747327907406, + "learning_rate": 8.403905928796961e-06, + "loss": 0.4344, + "step": 1898 + }, + { + "epoch": 0.2839838492597577, + "grad_norm": 1.626710770634783, + "learning_rate": 8.402131573906479e-06, + "loss": 0.4395, + "step": 1899 + }, + { + "epoch": 0.2841333931508898, + "grad_norm": 1.2949990843572656, + "learning_rate": 8.400356420843144e-06, + "loss": 0.2326, + "step": 1900 + }, + { + "epoch": 0.28428293704202184, + "grad_norm": 1.5453342232271752, + "learning_rate": 8.398580470023422e-06, + "loss": 0.3238, + "step": 1901 + }, + { + "epoch": 0.2844324809331539, + "grad_norm": 1.4286541357110913, + "learning_rate": 8.396803721863969e-06, + "loss": 0.2427, + "step": 1902 + }, + { + "epoch": 0.2845820248242859, + "grad_norm": 1.830298456304548, + "learning_rate": 8.395026176781627e-06, + "loss": 0.3781, + "step": 1903 + }, + { + "epoch": 0.284731568715418, + "grad_norm": 0.95541693717727, + "learning_rate": 8.393247835193424e-06, + "loss": 0.2069, + "step": 1904 + }, + { + "epoch": 0.28488111260655, + "grad_norm": 1.4752609480755603, + "learning_rate": 8.391468697516575e-06, + "loss": 0.2127, + "step": 1905 + }, + { + "epoch": 0.28503065649768206, + "grad_norm": 1.4768738670643429, + "learning_rate": 8.389688764168487e-06, + "loss": 0.3605, + "step": 1906 + }, + { + "epoch": 0.28518020038881414, + "grad_norm": 1.2979854503198742, + "learning_rate": 8.387908035566747e-06, + "loss": 0.1915, + "step": 1907 + }, + { + "epoch": 0.2853297442799462, + "grad_norm": 0.9902824337524958, + "learning_rate": 8.38612651212913e-06, + "loss": 0.2028, + "step": 1908 + }, + { + "epoch": 0.2854792881710782, + "grad_norm": 1.6167134405609127, + "learning_rate": 8.384344194273602e-06, + "loss": 0.2436, + "step": 1909 + }, + { + "epoch": 0.28562883206221024, + "grad_norm": 1.901370964967397, + "learning_rate": 8.38256108241831e-06, + "loss": 0.5823, + "step": 1910 + }, + { + "epoch": 0.2857783759533423, + "grad_norm": 1.8663326764685, + "learning_rate": 8.380777176981586e-06, + "loss": 0.296, + "step": 1911 + }, + { + "epoch": 0.28592791984447435, + "grad_norm": 2.010032346949138, + "learning_rate": 8.378992478381957e-06, + "loss": 0.3593, + "step": 1912 + }, + { + "epoch": 0.2860774637356064, + "grad_norm": 1.4476840280584171, + "learning_rate": 8.377206987038128e-06, + "loss": 0.2342, + "step": 1913 + }, + { + "epoch": 0.28622700762673847, + "grad_norm": 1.5594572863531158, + "learning_rate": 8.375420703368993e-06, + "loss": 0.3667, + "step": 1914 + }, + { + "epoch": 0.2863765515178705, + "grad_norm": 1.6185881957264394, + "learning_rate": 8.37363362779363e-06, + "loss": 0.3414, + "step": 1915 + }, + { + "epoch": 0.28652609540900253, + "grad_norm": 1.6321798860422365, + "learning_rate": 8.371845760731305e-06, + "loss": 0.4336, + "step": 1916 + }, + { + "epoch": 0.28667563930013457, + "grad_norm": 1.391635511553069, + "learning_rate": 8.370057102601467e-06, + "loss": 0.2641, + "step": 1917 + }, + { + "epoch": 0.28682518319126665, + "grad_norm": 1.6939840429561068, + "learning_rate": 8.368267653823758e-06, + "loss": 0.3579, + "step": 1918 + }, + { + "epoch": 0.2869747270823987, + "grad_norm": 1.6529503491198996, + "learning_rate": 8.366477414817993e-06, + "loss": 0.3893, + "step": 1919 + }, + { + "epoch": 0.2871242709735307, + "grad_norm": 1.750622438533098, + "learning_rate": 8.364686386004184e-06, + "loss": 0.3065, + "step": 1920 + }, + { + "epoch": 0.2872738148646628, + "grad_norm": 1.5641156236161282, + "learning_rate": 8.362894567802522e-06, + "loss": 0.1891, + "step": 1921 + }, + { + "epoch": 0.28742335875579483, + "grad_norm": 1.3418086188856035, + "learning_rate": 8.361101960633384e-06, + "loss": 0.2246, + "step": 1922 + }, + { + "epoch": 0.28757290264692686, + "grad_norm": 1.5465668819355392, + "learning_rate": 8.359308564917335e-06, + "loss": 0.2342, + "step": 1923 + }, + { + "epoch": 0.2877224465380589, + "grad_norm": 1.6114344336379887, + "learning_rate": 8.357514381075123e-06, + "loss": 0.1817, + "step": 1924 + }, + { + "epoch": 0.287871990429191, + "grad_norm": 1.3668429785293057, + "learning_rate": 8.355719409527676e-06, + "loss": 0.3416, + "step": 1925 + }, + { + "epoch": 0.288021534320323, + "grad_norm": 1.0266243878235821, + "learning_rate": 8.353923650696119e-06, + "loss": 0.1685, + "step": 1926 + }, + { + "epoch": 0.28817107821145505, + "grad_norm": 1.7588874793595086, + "learning_rate": 8.352127105001748e-06, + "loss": 0.3868, + "step": 1927 + }, + { + "epoch": 0.28832062210258713, + "grad_norm": 1.8522510535939594, + "learning_rate": 8.350329772866054e-06, + "loss": 0.2917, + "step": 1928 + }, + { + "epoch": 0.28847016599371916, + "grad_norm": 1.7432437696852978, + "learning_rate": 8.348531654710706e-06, + "loss": 0.2314, + "step": 1929 + }, + { + "epoch": 0.2886197098848512, + "grad_norm": 1.8048485192800683, + "learning_rate": 8.34673275095756e-06, + "loss": 0.421, + "step": 1930 + }, + { + "epoch": 0.2887692537759832, + "grad_norm": 1.254687110691794, + "learning_rate": 8.344933062028659e-06, + "loss": 0.3086, + "step": 1931 + }, + { + "epoch": 0.2889187976671153, + "grad_norm": 1.5516562911782632, + "learning_rate": 8.343132588346223e-06, + "loss": 0.243, + "step": 1932 + }, + { + "epoch": 0.28906834155824734, + "grad_norm": 1.5234323648219494, + "learning_rate": 8.341331330332665e-06, + "loss": 0.3708, + "step": 1933 + }, + { + "epoch": 0.2892178854493794, + "grad_norm": 1.6869191874454617, + "learning_rate": 8.339529288410575e-06, + "loss": 0.3664, + "step": 1934 + }, + { + "epoch": 0.28936742934051146, + "grad_norm": 1.6174375460534882, + "learning_rate": 8.337726463002728e-06, + "loss": 0.3548, + "step": 1935 + }, + { + "epoch": 0.2895169732316435, + "grad_norm": 1.3712125139885123, + "learning_rate": 8.335922854532087e-06, + "loss": 0.258, + "step": 1936 + }, + { + "epoch": 0.2896665171227755, + "grad_norm": 1.782853458880192, + "learning_rate": 8.334118463421795e-06, + "loss": 0.549, + "step": 1937 + }, + { + "epoch": 0.28981606101390756, + "grad_norm": 1.620144816801684, + "learning_rate": 8.33231329009518e-06, + "loss": 0.3349, + "step": 1938 + }, + { + "epoch": 0.28996560490503964, + "grad_norm": 1.333232350703901, + "learning_rate": 8.33050733497575e-06, + "loss": 0.2262, + "step": 1939 + }, + { + "epoch": 0.2901151487961717, + "grad_norm": 1.6837289583297483, + "learning_rate": 8.328700598487203e-06, + "loss": 0.4787, + "step": 1940 + }, + { + "epoch": 0.2902646926873037, + "grad_norm": 1.0591641715642133, + "learning_rate": 8.326893081053417e-06, + "loss": 0.2385, + "step": 1941 + }, + { + "epoch": 0.2904142365784358, + "grad_norm": 1.129840571438994, + "learning_rate": 8.325084783098452e-06, + "loss": 0.247, + "step": 1942 + }, + { + "epoch": 0.2905637804695678, + "grad_norm": 1.432313552747029, + "learning_rate": 8.32327570504655e-06, + "loss": 0.191, + "step": 1943 + }, + { + "epoch": 0.29071332436069985, + "grad_norm": 1.5356970591574979, + "learning_rate": 8.32146584732214e-06, + "loss": 0.187, + "step": 1944 + }, + { + "epoch": 0.2908628682518319, + "grad_norm": 1.0595724079445366, + "learning_rate": 8.319655210349832e-06, + "loss": 0.1802, + "step": 1945 + }, + { + "epoch": 0.29101241214296397, + "grad_norm": 1.6123333459960774, + "learning_rate": 8.31784379455442e-06, + "loss": 0.2293, + "step": 1946 + }, + { + "epoch": 0.291161956034096, + "grad_norm": 1.3341178548001948, + "learning_rate": 8.31603160036088e-06, + "loss": 0.2359, + "step": 1947 + }, + { + "epoch": 0.29131149992522803, + "grad_norm": 1.2376522352882886, + "learning_rate": 8.314218628194365e-06, + "loss": 0.1896, + "step": 1948 + }, + { + "epoch": 0.2914610438163601, + "grad_norm": 1.679702490362655, + "learning_rate": 8.312404878480222e-06, + "loss": 0.3223, + "step": 1949 + }, + { + "epoch": 0.29161058770749215, + "grad_norm": 1.2597407375509204, + "learning_rate": 8.31059035164397e-06, + "loss": 0.2175, + "step": 1950 + }, + { + "epoch": 0.2917601315986242, + "grad_norm": 1.624231610744949, + "learning_rate": 8.308775048111318e-06, + "loss": 0.3432, + "step": 1951 + }, + { + "epoch": 0.29190967548975627, + "grad_norm": 1.9330093174874856, + "learning_rate": 8.306958968308152e-06, + "loss": 0.2494, + "step": 1952 + }, + { + "epoch": 0.2920592193808883, + "grad_norm": 1.7103715792411738, + "learning_rate": 8.30514211266054e-06, + "loss": 0.3095, + "step": 1953 + }, + { + "epoch": 0.29220876327202033, + "grad_norm": 1.27605011963153, + "learning_rate": 8.303324481594737e-06, + "loss": 0.2158, + "step": 1954 + }, + { + "epoch": 0.29235830716315236, + "grad_norm": 1.2814850533429023, + "learning_rate": 8.301506075537173e-06, + "loss": 0.2314, + "step": 1955 + }, + { + "epoch": 0.29250785105428445, + "grad_norm": 1.954217300152197, + "learning_rate": 8.299686894914467e-06, + "loss": 0.567, + "step": 1956 + }, + { + "epoch": 0.2926573949454165, + "grad_norm": 0.9650685264754246, + "learning_rate": 8.297866940153416e-06, + "loss": 0.2588, + "step": 1957 + }, + { + "epoch": 0.2928069388365485, + "grad_norm": 1.5166783973503064, + "learning_rate": 8.296046211681e-06, + "loss": 0.369, + "step": 1958 + }, + { + "epoch": 0.2929564827276806, + "grad_norm": 1.7977899668272896, + "learning_rate": 8.294224709924373e-06, + "loss": 0.5176, + "step": 1959 + }, + { + "epoch": 0.29310602661881263, + "grad_norm": 1.5915279488901846, + "learning_rate": 8.292402435310883e-06, + "loss": 0.3614, + "step": 1960 + }, + { + "epoch": 0.29325557050994466, + "grad_norm": 1.8933979183970016, + "learning_rate": 8.290579388268054e-06, + "loss": 0.5366, + "step": 1961 + }, + { + "epoch": 0.2934051144010767, + "grad_norm": 1.8812762399598735, + "learning_rate": 8.288755569223586e-06, + "loss": 0.5443, + "step": 1962 + }, + { + "epoch": 0.2935546582922088, + "grad_norm": 1.6643902182805455, + "learning_rate": 8.286930978605366e-06, + "loss": 0.3375, + "step": 1963 + }, + { + "epoch": 0.2937042021833408, + "grad_norm": 1.762005285741713, + "learning_rate": 8.285105616841463e-06, + "loss": 0.5123, + "step": 1964 + }, + { + "epoch": 0.29385374607447284, + "grad_norm": 1.6916688314394261, + "learning_rate": 8.283279484360119e-06, + "loss": 0.4779, + "step": 1965 + }, + { + "epoch": 0.29400328996560493, + "grad_norm": 1.0908816823199547, + "learning_rate": 8.281452581589768e-06, + "loss": 0.3536, + "step": 1966 + }, + { + "epoch": 0.29415283385673696, + "grad_norm": 1.4434211599007758, + "learning_rate": 8.279624908959018e-06, + "loss": 0.1613, + "step": 1967 + }, + { + "epoch": 0.294302377747869, + "grad_norm": 1.8196704493501261, + "learning_rate": 8.277796466896657e-06, + "loss": 0.4989, + "step": 1968 + }, + { + "epoch": 0.294451921639001, + "grad_norm": 1.3656918969234806, + "learning_rate": 8.275967255831655e-06, + "loss": 0.1918, + "step": 1969 + }, + { + "epoch": 0.2946014655301331, + "grad_norm": 1.4834473723024195, + "learning_rate": 8.274137276193162e-06, + "loss": 0.2105, + "step": 1970 + }, + { + "epoch": 0.29475100942126514, + "grad_norm": 2.0922406506784745, + "learning_rate": 8.272306528410511e-06, + "loss": 0.3878, + "step": 1971 + }, + { + "epoch": 0.2949005533123972, + "grad_norm": 1.6479894472086396, + "learning_rate": 8.270475012913212e-06, + "loss": 0.3405, + "step": 1972 + }, + { + "epoch": 0.29505009720352926, + "grad_norm": 1.816314957488247, + "learning_rate": 8.268642730130956e-06, + "loss": 0.3493, + "step": 1973 + }, + { + "epoch": 0.2951996410946613, + "grad_norm": 1.7886487407400398, + "learning_rate": 8.266809680493615e-06, + "loss": 0.4509, + "step": 1974 + }, + { + "epoch": 0.2953491849857933, + "grad_norm": 1.4476130090504609, + "learning_rate": 8.264975864431239e-06, + "loss": 0.3609, + "step": 1975 + }, + { + "epoch": 0.29549872887692535, + "grad_norm": 1.6461788405344846, + "learning_rate": 8.263141282374058e-06, + "loss": 0.3404, + "step": 1976 + }, + { + "epoch": 0.29564827276805744, + "grad_norm": 1.5013235196228902, + "learning_rate": 8.261305934752486e-06, + "loss": 0.2106, + "step": 1977 + }, + { + "epoch": 0.29579781665918947, + "grad_norm": 1.4062496662139496, + "learning_rate": 8.259469821997111e-06, + "loss": 0.3559, + "step": 1978 + }, + { + "epoch": 0.2959473605503215, + "grad_norm": 1.1293811953986368, + "learning_rate": 8.257632944538704e-06, + "loss": 0.2078, + "step": 1979 + }, + { + "epoch": 0.2960969044414536, + "grad_norm": 1.4191657864487062, + "learning_rate": 8.255795302808212e-06, + "loss": 0.3335, + "step": 1980 + }, + { + "epoch": 0.2962464483325856, + "grad_norm": 1.2818891440117588, + "learning_rate": 8.253956897236765e-06, + "loss": 0.1667, + "step": 1981 + }, + { + "epoch": 0.29639599222371765, + "grad_norm": 2.032954982306054, + "learning_rate": 8.25211772825567e-06, + "loss": 0.3959, + "step": 1982 + }, + { + "epoch": 0.2965455361148497, + "grad_norm": 2.0441581298148606, + "learning_rate": 8.250277796296412e-06, + "loss": 0.5914, + "step": 1983 + }, + { + "epoch": 0.29669508000598177, + "grad_norm": 1.0550678485707448, + "learning_rate": 8.248437101790663e-06, + "loss": 0.2187, + "step": 1984 + }, + { + "epoch": 0.2968446238971138, + "grad_norm": 1.714428768910598, + "learning_rate": 8.24659564517026e-06, + "loss": 0.3521, + "step": 1985 + }, + { + "epoch": 0.29699416778824583, + "grad_norm": 1.495931419848694, + "learning_rate": 8.244753426867233e-06, + "loss": 0.3141, + "step": 1986 + }, + { + "epoch": 0.2971437116793779, + "grad_norm": 1.4595624143758643, + "learning_rate": 8.24291044731378e-06, + "loss": 0.207, + "step": 1987 + }, + { + "epoch": 0.29729325557050995, + "grad_norm": 1.4893368784443541, + "learning_rate": 8.241066706942282e-06, + "loss": 0.2096, + "step": 1988 + }, + { + "epoch": 0.297442799461642, + "grad_norm": 1.5298733994754299, + "learning_rate": 8.239222206185303e-06, + "loss": 0.2963, + "step": 1989 + }, + { + "epoch": 0.297592343352774, + "grad_norm": 1.7110744142837584, + "learning_rate": 8.237376945475573e-06, + "loss": 0.4879, + "step": 1990 + }, + { + "epoch": 0.2977418872439061, + "grad_norm": 1.3646893921129908, + "learning_rate": 8.235530925246013e-06, + "loss": 0.2199, + "step": 1991 + }, + { + "epoch": 0.29789143113503813, + "grad_norm": 1.5519467713508963, + "learning_rate": 8.233684145929714e-06, + "loss": 0.1567, + "step": 1992 + }, + { + "epoch": 0.29804097502617016, + "grad_norm": 1.2022227133419618, + "learning_rate": 8.231836607959953e-06, + "loss": 0.3048, + "step": 1993 + }, + { + "epoch": 0.29819051891730225, + "grad_norm": 1.0528429237271446, + "learning_rate": 8.229988311770176e-06, + "loss": 0.2089, + "step": 1994 + }, + { + "epoch": 0.2983400628084343, + "grad_norm": 2.0589676323841233, + "learning_rate": 8.228139257794012e-06, + "loss": 0.4729, + "step": 1995 + }, + { + "epoch": 0.2984896066995663, + "grad_norm": 1.699439689917369, + "learning_rate": 8.226289446465269e-06, + "loss": 0.197, + "step": 1996 + }, + { + "epoch": 0.29863915059069834, + "grad_norm": 2.1597718626316214, + "learning_rate": 8.224438878217928e-06, + "loss": 0.5298, + "step": 1997 + }, + { + "epoch": 0.29878869448183043, + "grad_norm": 1.5026245898132045, + "learning_rate": 8.22258755348615e-06, + "loss": 0.4905, + "step": 1998 + }, + { + "epoch": 0.29893823837296246, + "grad_norm": 1.5917682855765136, + "learning_rate": 8.220735472704278e-06, + "loss": 0.2223, + "step": 1999 + }, + { + "epoch": 0.2990877822640945, + "grad_norm": 1.4766123425180093, + "learning_rate": 8.218882636306823e-06, + "loss": 0.2059, + "step": 2000 + }, + { + "epoch": 0.2992373261552266, + "grad_norm": 1.0856069277789302, + "learning_rate": 8.217029044728478e-06, + "loss": 0.193, + "step": 2001 + }, + { + "epoch": 0.2993868700463586, + "grad_norm": 1.2329171435469741, + "learning_rate": 8.215174698404118e-06, + "loss": 0.2372, + "step": 2002 + }, + { + "epoch": 0.29953641393749064, + "grad_norm": 1.7452583821562881, + "learning_rate": 8.213319597768785e-06, + "loss": 0.2098, + "step": 2003 + }, + { + "epoch": 0.2996859578286227, + "grad_norm": 1.0905531929281826, + "learning_rate": 8.21146374325771e-06, + "loss": 0.2187, + "step": 2004 + }, + { + "epoch": 0.29983550171975476, + "grad_norm": 1.816609674773371, + "learning_rate": 8.209607135306287e-06, + "loss": 0.3709, + "step": 2005 + }, + { + "epoch": 0.2999850456108868, + "grad_norm": 1.7738216295020472, + "learning_rate": 8.2077497743501e-06, + "loss": 0.4486, + "step": 2006 + }, + { + "epoch": 0.3001345895020188, + "grad_norm": 1.1263320970303174, + "learning_rate": 8.205891660824903e-06, + "loss": 0.2091, + "step": 2007 + }, + { + "epoch": 0.3002841333931509, + "grad_norm": 1.501664539486306, + "learning_rate": 8.204032795166625e-06, + "loss": 0.2009, + "step": 2008 + }, + { + "epoch": 0.30043367728428294, + "grad_norm": 2.6199919066049335, + "learning_rate": 8.202173177811374e-06, + "loss": 0.203, + "step": 2009 + }, + { + "epoch": 0.30058322117541497, + "grad_norm": 1.625937686489783, + "learning_rate": 8.200312809195436e-06, + "loss": 0.3831, + "step": 2010 + }, + { + "epoch": 0.30073276506654706, + "grad_norm": 1.8974675970112806, + "learning_rate": 8.198451689755269e-06, + "loss": 0.5418, + "step": 2011 + }, + { + "epoch": 0.3008823089576791, + "grad_norm": 1.8748455619787359, + "learning_rate": 8.196589819927512e-06, + "loss": 0.4425, + "step": 2012 + }, + { + "epoch": 0.3010318528488111, + "grad_norm": 1.6094667931646591, + "learning_rate": 8.194727200148978e-06, + "loss": 0.3389, + "step": 2013 + }, + { + "epoch": 0.30118139673994315, + "grad_norm": 0.9707433761295292, + "learning_rate": 8.192863830856652e-06, + "loss": 0.1763, + "step": 2014 + }, + { + "epoch": 0.30133094063107524, + "grad_norm": 1.3903941059235092, + "learning_rate": 8.1909997124877e-06, + "loss": 0.2339, + "step": 2015 + }, + { + "epoch": 0.30148048452220727, + "grad_norm": 1.3528795750135822, + "learning_rate": 8.189134845479462e-06, + "loss": 0.2193, + "step": 2016 + }, + { + "epoch": 0.3016300284133393, + "grad_norm": 1.7397719431371887, + "learning_rate": 8.187269230269458e-06, + "loss": 0.4049, + "step": 2017 + }, + { + "epoch": 0.3017795723044714, + "grad_norm": 1.626350301455843, + "learning_rate": 8.185402867295373e-06, + "loss": 0.2327, + "step": 2018 + }, + { + "epoch": 0.3019291161956034, + "grad_norm": 1.3932636681559976, + "learning_rate": 8.183535756995078e-06, + "loss": 0.2575, + "step": 2019 + }, + { + "epoch": 0.30207866008673545, + "grad_norm": 1.7626753702911266, + "learning_rate": 8.181667899806613e-06, + "loss": 0.4597, + "step": 2020 + }, + { + "epoch": 0.3022282039778675, + "grad_norm": 1.684539378891143, + "learning_rate": 8.179799296168194e-06, + "loss": 0.4608, + "step": 2021 + }, + { + "epoch": 0.30237774786899957, + "grad_norm": 1.0640465826224796, + "learning_rate": 8.177929946518217e-06, + "loss": 0.2221, + "step": 2022 + }, + { + "epoch": 0.3025272917601316, + "grad_norm": 1.5147026539291037, + "learning_rate": 8.176059851295248e-06, + "loss": 0.2353, + "step": 2023 + }, + { + "epoch": 0.30267683565126363, + "grad_norm": 1.7530304138656525, + "learning_rate": 8.174189010938028e-06, + "loss": 0.4141, + "step": 2024 + }, + { + "epoch": 0.3028263795423957, + "grad_norm": 1.9415201656454513, + "learning_rate": 8.172317425885477e-06, + "loss": 0.5352, + "step": 2025 + }, + { + "epoch": 0.30297592343352775, + "grad_norm": 1.9383678107900466, + "learning_rate": 8.170445096576683e-06, + "loss": 0.573, + "step": 2026 + }, + { + "epoch": 0.3031254673246598, + "grad_norm": 1.803033239816454, + "learning_rate": 8.168572023450915e-06, + "loss": 0.5225, + "step": 2027 + }, + { + "epoch": 0.3032750112157918, + "grad_norm": 2.1395850335411244, + "learning_rate": 8.166698206947614e-06, + "loss": 0.7007, + "step": 2028 + }, + { + "epoch": 0.3034245551069239, + "grad_norm": 1.5110862268685226, + "learning_rate": 8.164823647506394e-06, + "loss": 0.2269, + "step": 2029 + }, + { + "epoch": 0.30357409899805593, + "grad_norm": 1.6974467146899468, + "learning_rate": 8.162948345567048e-06, + "loss": 0.3921, + "step": 2030 + }, + { + "epoch": 0.30372364288918796, + "grad_norm": 1.3936816440378674, + "learning_rate": 8.161072301569536e-06, + "loss": 0.2049, + "step": 2031 + }, + { + "epoch": 0.30387318678032005, + "grad_norm": 1.4848287127408877, + "learning_rate": 8.159195515953998e-06, + "loss": 0.2062, + "step": 2032 + }, + { + "epoch": 0.3040227306714521, + "grad_norm": 1.2105792130825146, + "learning_rate": 8.157317989160746e-06, + "loss": 0.1788, + "step": 2033 + }, + { + "epoch": 0.3041722745625841, + "grad_norm": 1.3820320839391493, + "learning_rate": 8.155439721630265e-06, + "loss": 0.2331, + "step": 2034 + }, + { + "epoch": 0.30432181845371614, + "grad_norm": 2.150262549914666, + "learning_rate": 8.153560713803215e-06, + "loss": 0.1875, + "step": 2035 + }, + { + "epoch": 0.30447136234484823, + "grad_norm": 1.0906698589335555, + "learning_rate": 8.15168096612043e-06, + "loss": 0.2033, + "step": 2036 + }, + { + "epoch": 0.30462090623598026, + "grad_norm": 1.065733777923844, + "learning_rate": 8.149800479022917e-06, + "loss": 0.2001, + "step": 2037 + }, + { + "epoch": 0.3047704501271123, + "grad_norm": 1.6644796466482528, + "learning_rate": 8.147919252951855e-06, + "loss": 0.5517, + "step": 2038 + }, + { + "epoch": 0.3049199940182444, + "grad_norm": 1.5874527096275228, + "learning_rate": 8.146037288348598e-06, + "loss": 0.3491, + "step": 2039 + }, + { + "epoch": 0.3050695379093764, + "grad_norm": 2.1160201341828513, + "learning_rate": 8.144154585654675e-06, + "loss": 0.4023, + "step": 2040 + }, + { + "epoch": 0.30521908180050844, + "grad_norm": 1.449019528908297, + "learning_rate": 8.142271145311784e-06, + "loss": 0.3529, + "step": 2041 + }, + { + "epoch": 0.30536862569164047, + "grad_norm": 1.0741990035582996, + "learning_rate": 8.1403869677618e-06, + "loss": 0.2042, + "step": 2042 + }, + { + "epoch": 0.30551816958277256, + "grad_norm": 1.6081149900134495, + "learning_rate": 8.138502053446766e-06, + "loss": 0.3908, + "step": 2043 + }, + { + "epoch": 0.3056677134739046, + "grad_norm": 2.0690220211720716, + "learning_rate": 8.136616402808906e-06, + "loss": 0.5508, + "step": 2044 + }, + { + "epoch": 0.3058172573650366, + "grad_norm": 1.6409484884961714, + "learning_rate": 8.13473001629061e-06, + "loss": 0.3161, + "step": 2045 + }, + { + "epoch": 0.3059668012561687, + "grad_norm": 1.858300355296472, + "learning_rate": 8.132842894334438e-06, + "loss": 0.5614, + "step": 2046 + }, + { + "epoch": 0.30611634514730074, + "grad_norm": 1.2324767485914987, + "learning_rate": 8.130955037383132e-06, + "loss": 0.2132, + "step": 2047 + }, + { + "epoch": 0.30626588903843277, + "grad_norm": 1.7272643652056465, + "learning_rate": 8.1290664458796e-06, + "loss": 0.3132, + "step": 2048 + }, + { + "epoch": 0.3064154329295648, + "grad_norm": 1.3637333594365966, + "learning_rate": 8.127177120266926e-06, + "loss": 0.1739, + "step": 2049 + }, + { + "epoch": 0.3065649768206969, + "grad_norm": 1.538728380001006, + "learning_rate": 8.12528706098836e-06, + "loss": 0.4072, + "step": 2050 + }, + { + "epoch": 0.3067145207118289, + "grad_norm": 1.4591751944118272, + "learning_rate": 8.12339626848733e-06, + "loss": 0.3859, + "step": 2051 + }, + { + "epoch": 0.30686406460296095, + "grad_norm": 1.9028261579580017, + "learning_rate": 8.121504743207436e-06, + "loss": 0.4893, + "step": 2052 + }, + { + "epoch": 0.30701360849409304, + "grad_norm": 1.7379512730839208, + "learning_rate": 8.119612485592442e-06, + "loss": 0.3564, + "step": 2053 + }, + { + "epoch": 0.30716315238522507, + "grad_norm": 1.388968630495014, + "learning_rate": 8.117719496086298e-06, + "loss": 0.3179, + "step": 2054 + }, + { + "epoch": 0.3073126962763571, + "grad_norm": 1.403056196180425, + "learning_rate": 8.115825775133112e-06, + "loss": 0.352, + "step": 2055 + }, + { + "epoch": 0.30746224016748913, + "grad_norm": 1.508992427712604, + "learning_rate": 8.113931323177171e-06, + "loss": 0.4021, + "step": 2056 + }, + { + "epoch": 0.3076117840586212, + "grad_norm": 2.1741360894355015, + "learning_rate": 8.112036140662931e-06, + "loss": 0.467, + "step": 2057 + }, + { + "epoch": 0.30776132794975325, + "grad_norm": 1.6144568383492353, + "learning_rate": 8.110140228035022e-06, + "loss": 0.3924, + "step": 2058 + }, + { + "epoch": 0.3079108718408853, + "grad_norm": 1.4893071701125404, + "learning_rate": 8.10824358573824e-06, + "loss": 0.5036, + "step": 2059 + }, + { + "epoch": 0.30806041573201737, + "grad_norm": 1.4815595377852415, + "learning_rate": 8.10634621421756e-06, + "loss": 0.3972, + "step": 2060 + }, + { + "epoch": 0.3082099596231494, + "grad_norm": 1.7353454529628423, + "learning_rate": 8.104448113918118e-06, + "loss": 0.2237, + "step": 2061 + }, + { + "epoch": 0.30835950351428143, + "grad_norm": 1.0247854772257274, + "learning_rate": 8.102549285285233e-06, + "loss": 0.1595, + "step": 2062 + }, + { + "epoch": 0.30850904740541346, + "grad_norm": 1.6101502986170517, + "learning_rate": 8.100649728764382e-06, + "loss": 0.519, + "step": 2063 + }, + { + "epoch": 0.30865859129654555, + "grad_norm": 1.4785633507789944, + "learning_rate": 8.098749444801226e-06, + "loss": 0.3757, + "step": 2064 + }, + { + "epoch": 0.3088081351876776, + "grad_norm": 1.1653508922546973, + "learning_rate": 8.096848433841585e-06, + "loss": 0.1843, + "step": 2065 + }, + { + "epoch": 0.3089576790788096, + "grad_norm": 2.23989146500782, + "learning_rate": 8.094946696331454e-06, + "loss": 0.5717, + "step": 2066 + }, + { + "epoch": 0.3091072229699417, + "grad_norm": 1.4086002710437948, + "learning_rate": 8.093044232717004e-06, + "loss": 0.3305, + "step": 2067 + }, + { + "epoch": 0.30925676686107373, + "grad_norm": 1.3532349541812738, + "learning_rate": 8.091141043444565e-06, + "loss": 0.2031, + "step": 2068 + }, + { + "epoch": 0.30940631075220576, + "grad_norm": 1.5659716733424975, + "learning_rate": 8.08923712896065e-06, + "loss": 0.264, + "step": 2069 + }, + { + "epoch": 0.30955585464333785, + "grad_norm": 1.188194674034748, + "learning_rate": 8.087332489711931e-06, + "loss": 0.2084, + "step": 2070 + }, + { + "epoch": 0.3097053985344699, + "grad_norm": 1.3639153806477138, + "learning_rate": 8.085427126145255e-06, + "loss": 0.2999, + "step": 2071 + }, + { + "epoch": 0.3098549424256019, + "grad_norm": 1.743026958811897, + "learning_rate": 8.083521038707643e-06, + "loss": 0.5313, + "step": 2072 + }, + { + "epoch": 0.31000448631673394, + "grad_norm": 1.1747827948023122, + "learning_rate": 8.081614227846275e-06, + "loss": 0.1753, + "step": 2073 + }, + { + "epoch": 0.310154030207866, + "grad_norm": 1.1941064963550336, + "learning_rate": 8.079706694008512e-06, + "loss": 0.2424, + "step": 2074 + }, + { + "epoch": 0.31030357409899806, + "grad_norm": 1.7186546887175254, + "learning_rate": 8.077798437641878e-06, + "loss": 0.2263, + "step": 2075 + }, + { + "epoch": 0.3104531179901301, + "grad_norm": 1.0317324156623773, + "learning_rate": 8.075889459194069e-06, + "loss": 0.1556, + "step": 2076 + }, + { + "epoch": 0.3106026618812622, + "grad_norm": 1.5907870570782063, + "learning_rate": 8.073979759112949e-06, + "loss": 0.2729, + "step": 2077 + }, + { + "epoch": 0.3107522057723942, + "grad_norm": 1.8674735165445506, + "learning_rate": 8.072069337846554e-06, + "loss": 0.3821, + "step": 2078 + }, + { + "epoch": 0.31090174966352624, + "grad_norm": 1.983553521137882, + "learning_rate": 8.070158195843084e-06, + "loss": 0.6787, + "step": 2079 + }, + { + "epoch": 0.31105129355465827, + "grad_norm": 1.0817414697908496, + "learning_rate": 8.068246333550913e-06, + "loss": 0.1601, + "step": 2080 + }, + { + "epoch": 0.31120083744579036, + "grad_norm": 1.4614119919401352, + "learning_rate": 8.066333751418582e-06, + "loss": 0.2074, + "step": 2081 + }, + { + "epoch": 0.3113503813369224, + "grad_norm": 1.2801507618992622, + "learning_rate": 8.064420449894802e-06, + "loss": 0.2384, + "step": 2082 + }, + { + "epoch": 0.3114999252280544, + "grad_norm": 1.856756460170748, + "learning_rate": 8.062506429428451e-06, + "loss": 0.3569, + "step": 2083 + }, + { + "epoch": 0.3116494691191865, + "grad_norm": 1.5574561714489221, + "learning_rate": 8.060591690468579e-06, + "loss": 0.2001, + "step": 2084 + }, + { + "epoch": 0.31179901301031854, + "grad_norm": 0.8139823838456822, + "learning_rate": 8.0586762334644e-06, + "loss": 0.1493, + "step": 2085 + }, + { + "epoch": 0.31194855690145057, + "grad_norm": 1.8147690398413907, + "learning_rate": 8.056760058865298e-06, + "loss": 0.3726, + "step": 2086 + }, + { + "epoch": 0.3120981007925826, + "grad_norm": 1.3551230470716993, + "learning_rate": 8.054843167120827e-06, + "loss": 0.3161, + "step": 2087 + }, + { + "epoch": 0.3122476446837147, + "grad_norm": 1.423454170150374, + "learning_rate": 8.052925558680708e-06, + "loss": 0.3286, + "step": 2088 + }, + { + "epoch": 0.3123971885748467, + "grad_norm": 1.1629738020950542, + "learning_rate": 8.051007233994833e-06, + "loss": 0.2128, + "step": 2089 + }, + { + "epoch": 0.31254673246597875, + "grad_norm": 1.0328139570935773, + "learning_rate": 8.049088193513257e-06, + "loss": 0.2028, + "step": 2090 + }, + { + "epoch": 0.31269627635711084, + "grad_norm": 0.9561546972801208, + "learning_rate": 8.047168437686204e-06, + "loss": 0.1901, + "step": 2091 + }, + { + "epoch": 0.31284582024824287, + "grad_norm": 1.5085687791327238, + "learning_rate": 8.045247966964069e-06, + "loss": 0.3978, + "step": 2092 + }, + { + "epoch": 0.3129953641393749, + "grad_norm": 1.7990952747958247, + "learning_rate": 8.043326781797413e-06, + "loss": 0.4526, + "step": 2093 + }, + { + "epoch": 0.31314490803050693, + "grad_norm": 1.6047231005366211, + "learning_rate": 8.041404882636964e-06, + "loss": 0.3136, + "step": 2094 + }, + { + "epoch": 0.313294451921639, + "grad_norm": 1.0885180525142448, + "learning_rate": 8.039482269933619e-06, + "loss": 0.1868, + "step": 2095 + }, + { + "epoch": 0.31344399581277105, + "grad_norm": 1.6731122777438763, + "learning_rate": 8.03755894413844e-06, + "loss": 0.253, + "step": 2096 + }, + { + "epoch": 0.3135935397039031, + "grad_norm": 1.7388060302674344, + "learning_rate": 8.03563490570266e-06, + "loss": 0.3781, + "step": 2097 + }, + { + "epoch": 0.31374308359503517, + "grad_norm": 1.3948567328495716, + "learning_rate": 8.033710155077675e-06, + "loss": 0.3065, + "step": 2098 + }, + { + "epoch": 0.3138926274861672, + "grad_norm": 1.7899046169473158, + "learning_rate": 8.03178469271505e-06, + "loss": 0.4497, + "step": 2099 + }, + { + "epoch": 0.31404217137729923, + "grad_norm": 1.1376056341100615, + "learning_rate": 8.029858519066519e-06, + "loss": 0.1793, + "step": 2100 + }, + { + "epoch": 0.31419171526843126, + "grad_norm": 1.4405559368087475, + "learning_rate": 8.027931634583978e-06, + "loss": 0.1984, + "step": 2101 + }, + { + "epoch": 0.31434125915956335, + "grad_norm": 1.7393459445529194, + "learning_rate": 8.026004039719494e-06, + "loss": 0.4943, + "step": 2102 + }, + { + "epoch": 0.3144908030506954, + "grad_norm": 1.6507941300141975, + "learning_rate": 8.024075734925302e-06, + "loss": 0.3702, + "step": 2103 + }, + { + "epoch": 0.3146403469418274, + "grad_norm": 1.175812152126952, + "learning_rate": 8.022146720653797e-06, + "loss": 0.2246, + "step": 2104 + }, + { + "epoch": 0.3147898908329595, + "grad_norm": 1.505696425174867, + "learning_rate": 8.020216997357547e-06, + "loss": 0.3827, + "step": 2105 + }, + { + "epoch": 0.3149394347240915, + "grad_norm": 1.5053350968315022, + "learning_rate": 8.018286565489281e-06, + "loss": 0.3602, + "step": 2106 + }, + { + "epoch": 0.31508897861522356, + "grad_norm": 7.395352954217879, + "learning_rate": 8.016355425501899e-06, + "loss": 0.5292, + "step": 2107 + }, + { + "epoch": 0.3152385225063556, + "grad_norm": 1.4124551721650014, + "learning_rate": 8.014423577848465e-06, + "loss": 0.3058, + "step": 2108 + }, + { + "epoch": 0.3153880663974877, + "grad_norm": 1.7291348054164273, + "learning_rate": 8.012491022982206e-06, + "loss": 0.4039, + "step": 2109 + }, + { + "epoch": 0.3155376102886197, + "grad_norm": 1.3762981669774148, + "learning_rate": 8.010557761356523e-06, + "loss": 0.3214, + "step": 2110 + }, + { + "epoch": 0.31568715417975174, + "grad_norm": 1.9151602263688279, + "learning_rate": 8.008623793424975e-06, + "loss": 0.5339, + "step": 2111 + }, + { + "epoch": 0.3158366980708838, + "grad_norm": 1.6710466712643997, + "learning_rate": 8.006689119641289e-06, + "loss": 0.3143, + "step": 2112 + }, + { + "epoch": 0.31598624196201586, + "grad_norm": 1.2763038771322965, + "learning_rate": 8.00475374045936e-06, + "loss": 0.3106, + "step": 2113 + }, + { + "epoch": 0.3161357858531479, + "grad_norm": 0.9519969897990209, + "learning_rate": 8.002817656333246e-06, + "loss": 0.1709, + "step": 2114 + }, + { + "epoch": 0.3162853297442799, + "grad_norm": 1.0941737648338323, + "learning_rate": 8.000880867717168e-06, + "loss": 0.3558, + "step": 2115 + }, + { + "epoch": 0.316434873635412, + "grad_norm": 1.6804061150860001, + "learning_rate": 7.99894337506552e-06, + "loss": 0.3005, + "step": 2116 + }, + { + "epoch": 0.31658441752654404, + "grad_norm": 1.1783654687222251, + "learning_rate": 7.997005178832853e-06, + "loss": 0.2406, + "step": 2117 + }, + { + "epoch": 0.31673396141767607, + "grad_norm": 1.6348023119949275, + "learning_rate": 7.99506627947389e-06, + "loss": 0.2475, + "step": 2118 + }, + { + "epoch": 0.31688350530880816, + "grad_norm": 1.4962501892290498, + "learning_rate": 7.993126677443513e-06, + "loss": 0.3126, + "step": 2119 + }, + { + "epoch": 0.3170330491999402, + "grad_norm": 1.1269074931566945, + "learning_rate": 7.991186373196771e-06, + "loss": 0.2101, + "step": 2120 + }, + { + "epoch": 0.3171825930910722, + "grad_norm": 1.5451433092999418, + "learning_rate": 7.989245367188877e-06, + "loss": 0.4435, + "step": 2121 + }, + { + "epoch": 0.31733213698220425, + "grad_norm": 1.6954504563414547, + "learning_rate": 7.987303659875212e-06, + "loss": 0.3777, + "step": 2122 + }, + { + "epoch": 0.31748168087333634, + "grad_norm": 1.7046868376726214, + "learning_rate": 7.98536125171132e-06, + "loss": 0.375, + "step": 2123 + }, + { + "epoch": 0.31763122476446837, + "grad_norm": 1.6583768126843783, + "learning_rate": 7.983418143152906e-06, + "loss": 0.1679, + "step": 2124 + }, + { + "epoch": 0.3177807686556004, + "grad_norm": 1.652523256475839, + "learning_rate": 7.981474334655845e-06, + "loss": 0.3821, + "step": 2125 + }, + { + "epoch": 0.3179303125467325, + "grad_norm": 1.6865422789510223, + "learning_rate": 7.979529826676172e-06, + "loss": 0.2312, + "step": 2126 + }, + { + "epoch": 0.3180798564378645, + "grad_norm": 1.318155763232294, + "learning_rate": 7.977584619670084e-06, + "loss": 0.2417, + "step": 2127 + }, + { + "epoch": 0.31822940032899655, + "grad_norm": 0.9387354141491452, + "learning_rate": 7.97563871409395e-06, + "loss": 0.2152, + "step": 2128 + }, + { + "epoch": 0.31837894422012863, + "grad_norm": 1.4740638203723162, + "learning_rate": 7.973692110404295e-06, + "loss": 0.3663, + "step": 2129 + }, + { + "epoch": 0.31852848811126067, + "grad_norm": 1.5147303848676803, + "learning_rate": 7.971744809057815e-06, + "loss": 0.2317, + "step": 2130 + }, + { + "epoch": 0.3186780320023927, + "grad_norm": 1.3220893173215598, + "learning_rate": 7.96979681051136e-06, + "loss": 0.3673, + "step": 2131 + }, + { + "epoch": 0.31882757589352473, + "grad_norm": 1.778728683314461, + "learning_rate": 7.967848115221953e-06, + "loss": 0.3838, + "step": 2132 + }, + { + "epoch": 0.3189771197846568, + "grad_norm": 1.79666907747646, + "learning_rate": 7.965898723646777e-06, + "loss": 0.4668, + "step": 2133 + }, + { + "epoch": 0.31912666367578885, + "grad_norm": 1.1363334854959442, + "learning_rate": 7.963948636243175e-06, + "loss": 0.1881, + "step": 2134 + }, + { + "epoch": 0.3192762075669209, + "grad_norm": 1.2727017471208417, + "learning_rate": 7.96199785346866e-06, + "loss": 0.3583, + "step": 2135 + }, + { + "epoch": 0.31942575145805296, + "grad_norm": 1.6879630689693983, + "learning_rate": 7.960046375780903e-06, + "loss": 0.528, + "step": 2136 + }, + { + "epoch": 0.319575295349185, + "grad_norm": 1.441558984867543, + "learning_rate": 7.958094203637738e-06, + "loss": 0.3621, + "step": 2137 + }, + { + "epoch": 0.319724839240317, + "grad_norm": 0.9760649090504697, + "learning_rate": 7.956141337497166e-06, + "loss": 0.1714, + "step": 2138 + }, + { + "epoch": 0.31987438313144906, + "grad_norm": 0.9469336665426322, + "learning_rate": 7.954187777817345e-06, + "loss": 0.1795, + "step": 2139 + }, + { + "epoch": 0.32002392702258114, + "grad_norm": 1.0476640767317869, + "learning_rate": 7.952233525056603e-06, + "loss": 0.1922, + "step": 2140 + }, + { + "epoch": 0.3201734709137132, + "grad_norm": 1.254945459546166, + "learning_rate": 7.950278579673422e-06, + "loss": 0.2742, + "step": 2141 + }, + { + "epoch": 0.3203230148048452, + "grad_norm": 0.7230488496853383, + "learning_rate": 7.948322942126456e-06, + "loss": 0.1742, + "step": 2142 + }, + { + "epoch": 0.3204725586959773, + "grad_norm": 0.960182470952214, + "learning_rate": 7.946366612874512e-06, + "loss": 0.1851, + "step": 2143 + }, + { + "epoch": 0.3206221025871093, + "grad_norm": 1.1966344295913398, + "learning_rate": 7.944409592376565e-06, + "loss": 0.1877, + "step": 2144 + }, + { + "epoch": 0.32077164647824136, + "grad_norm": 1.9057275611187459, + "learning_rate": 7.942451881091752e-06, + "loss": 0.2618, + "step": 2145 + }, + { + "epoch": 0.3209211903693734, + "grad_norm": 1.2274689028407764, + "learning_rate": 7.94049347947937e-06, + "loss": 0.1885, + "step": 2146 + }, + { + "epoch": 0.3210707342605055, + "grad_norm": 1.678924591333489, + "learning_rate": 7.93853438799888e-06, + "loss": 0.2338, + "step": 2147 + }, + { + "epoch": 0.3212202781516375, + "grad_norm": 1.7464181014190676, + "learning_rate": 7.936574607109901e-06, + "loss": 0.3676, + "step": 2148 + }, + { + "epoch": 0.32136982204276954, + "grad_norm": 1.5398927642089624, + "learning_rate": 7.934614137272218e-06, + "loss": 0.2293, + "step": 2149 + }, + { + "epoch": 0.3215193659339016, + "grad_norm": 1.5519843635324424, + "learning_rate": 7.932652978945779e-06, + "loss": 0.199, + "step": 2150 + }, + { + "epoch": 0.32166890982503366, + "grad_norm": 1.1920599004222825, + "learning_rate": 7.930691132590686e-06, + "loss": 0.1696, + "step": 2151 + }, + { + "epoch": 0.3218184537161657, + "grad_norm": 0.8145849226466982, + "learning_rate": 7.928728598667211e-06, + "loss": 0.1537, + "step": 2152 + }, + { + "epoch": 0.3219679976072977, + "grad_norm": 1.3926444820307884, + "learning_rate": 7.926765377635781e-06, + "loss": 0.1999, + "step": 2153 + }, + { + "epoch": 0.3221175414984298, + "grad_norm": 1.6412886866609875, + "learning_rate": 7.924801469956986e-06, + "loss": 0.2527, + "step": 2154 + }, + { + "epoch": 0.32226708538956184, + "grad_norm": 1.5325273883442905, + "learning_rate": 7.92283687609158e-06, + "loss": 0.2634, + "step": 2155 + }, + { + "epoch": 0.32241662928069387, + "grad_norm": 1.0316905287286056, + "learning_rate": 7.920871596500473e-06, + "loss": 0.174, + "step": 2156 + }, + { + "epoch": 0.32256617317182595, + "grad_norm": 1.1349047741297784, + "learning_rate": 7.91890563164474e-06, + "loss": 0.1721, + "step": 2157 + }, + { + "epoch": 0.322715717062958, + "grad_norm": 2.057460755429332, + "learning_rate": 7.916938981985619e-06, + "loss": 0.6121, + "step": 2158 + }, + { + "epoch": 0.32286526095409, + "grad_norm": 1.236449628888766, + "learning_rate": 7.914971647984494e-06, + "loss": 0.1979, + "step": 2159 + }, + { + "epoch": 0.32301480484522205, + "grad_norm": 1.952661192183289, + "learning_rate": 7.913003630102934e-06, + "loss": 0.4105, + "step": 2160 + }, + { + "epoch": 0.32316434873635413, + "grad_norm": 1.6581077143849778, + "learning_rate": 7.911034928802647e-06, + "loss": 0.4989, + "step": 2161 + }, + { + "epoch": 0.32331389262748617, + "grad_norm": 1.0964604376832452, + "learning_rate": 7.909065544545511e-06, + "loss": 0.2086, + "step": 2162 + }, + { + "epoch": 0.3234634365186182, + "grad_norm": 1.4794895335762217, + "learning_rate": 7.907095477793563e-06, + "loss": 0.2232, + "step": 2163 + }, + { + "epoch": 0.3236129804097503, + "grad_norm": 1.7007378332145011, + "learning_rate": 7.905124729008996e-06, + "loss": 0.3449, + "step": 2164 + }, + { + "epoch": 0.3237625243008823, + "grad_norm": 5.5988236758577, + "learning_rate": 7.903153298654173e-06, + "loss": 0.2263, + "step": 2165 + }, + { + "epoch": 0.32391206819201435, + "grad_norm": 1.7620287916310595, + "learning_rate": 7.901181187191606e-06, + "loss": 0.4116, + "step": 2166 + }, + { + "epoch": 0.3240616120831464, + "grad_norm": 1.6633135381694006, + "learning_rate": 7.899208395083974e-06, + "loss": 0.2297, + "step": 2167 + }, + { + "epoch": 0.32421115597427846, + "grad_norm": 0.9621828860582361, + "learning_rate": 7.897234922794113e-06, + "loss": 0.2098, + "step": 2168 + }, + { + "epoch": 0.3243606998654105, + "grad_norm": 1.3513242795516456, + "learning_rate": 7.895260770785014e-06, + "loss": 0.3103, + "step": 2169 + }, + { + "epoch": 0.3245102437565425, + "grad_norm": 0.9735967195147534, + "learning_rate": 7.893285939519836e-06, + "loss": 0.1699, + "step": 2170 + }, + { + "epoch": 0.3246597876476746, + "grad_norm": 2.0453322690120554, + "learning_rate": 7.891310429461895e-06, + "loss": 0.5192, + "step": 2171 + }, + { + "epoch": 0.32480933153880664, + "grad_norm": 1.4629318475963238, + "learning_rate": 7.889334241074663e-06, + "loss": 0.4312, + "step": 2172 + }, + { + "epoch": 0.3249588754299387, + "grad_norm": 1.5333310991077456, + "learning_rate": 7.887357374821768e-06, + "loss": 0.2691, + "step": 2173 + }, + { + "epoch": 0.3251084193210707, + "grad_norm": 1.6289642332812333, + "learning_rate": 7.88537983116701e-06, + "loss": 0.4631, + "step": 2174 + }, + { + "epoch": 0.3252579632122028, + "grad_norm": 1.166966622363864, + "learning_rate": 7.883401610574338e-06, + "loss": 0.1523, + "step": 2175 + }, + { + "epoch": 0.3254075071033348, + "grad_norm": 1.7643050095516288, + "learning_rate": 7.881422713507857e-06, + "loss": 0.4944, + "step": 2176 + }, + { + "epoch": 0.32555705099446686, + "grad_norm": 2.0778286729797943, + "learning_rate": 7.879443140431837e-06, + "loss": 0.5285, + "step": 2177 + }, + { + "epoch": 0.32570659488559894, + "grad_norm": 1.22732284237921, + "learning_rate": 7.877462891810708e-06, + "loss": 0.2215, + "step": 2178 + }, + { + "epoch": 0.325856138776731, + "grad_norm": 1.794981875573838, + "learning_rate": 7.875481968109052e-06, + "loss": 0.4738, + "step": 2179 + }, + { + "epoch": 0.326005682667863, + "grad_norm": 1.8625636048305199, + "learning_rate": 7.873500369791615e-06, + "loss": 0.5112, + "step": 2180 + }, + { + "epoch": 0.3261552265589951, + "grad_norm": 1.5970308413484968, + "learning_rate": 7.8715180973233e-06, + "loss": 0.3491, + "step": 2181 + }, + { + "epoch": 0.3263047704501271, + "grad_norm": 4.445014172618314, + "learning_rate": 7.869535151169163e-06, + "loss": 0.4128, + "step": 2182 + }, + { + "epoch": 0.32645431434125916, + "grad_norm": 2.199504833687433, + "learning_rate": 7.867551531794427e-06, + "loss": 0.5042, + "step": 2183 + }, + { + "epoch": 0.3266038582323912, + "grad_norm": 1.9133519547070805, + "learning_rate": 7.865567239664463e-06, + "loss": 0.5783, + "step": 2184 + }, + { + "epoch": 0.3267534021235233, + "grad_norm": 1.847425837596219, + "learning_rate": 7.86358227524481e-06, + "loss": 0.5434, + "step": 2185 + }, + { + "epoch": 0.3269029460146553, + "grad_norm": 1.7506088747332584, + "learning_rate": 7.861596639001157e-06, + "loss": 0.3848, + "step": 2186 + }, + { + "epoch": 0.32705248990578734, + "grad_norm": 1.6285208330736813, + "learning_rate": 7.859610331399354e-06, + "loss": 0.2588, + "step": 2187 + }, + { + "epoch": 0.3272020337969194, + "grad_norm": 1.0354818947520257, + "learning_rate": 7.85762335290541e-06, + "loss": 0.1884, + "step": 2188 + }, + { + "epoch": 0.32735157768805145, + "grad_norm": 2.061471227348859, + "learning_rate": 7.855635703985487e-06, + "loss": 0.5502, + "step": 2189 + }, + { + "epoch": 0.3275011215791835, + "grad_norm": 1.0589713974920532, + "learning_rate": 7.853647385105905e-06, + "loss": 0.1894, + "step": 2190 + }, + { + "epoch": 0.3276506654703155, + "grad_norm": 1.27660152324297, + "learning_rate": 7.851658396733148e-06, + "loss": 0.2325, + "step": 2191 + }, + { + "epoch": 0.3278002093614476, + "grad_norm": 1.602286282256986, + "learning_rate": 7.849668739333846e-06, + "loss": 0.1959, + "step": 2192 + }, + { + "epoch": 0.32794975325257963, + "grad_norm": 1.3119010750867572, + "learning_rate": 7.847678413374795e-06, + "loss": 0.1982, + "step": 2193 + }, + { + "epoch": 0.32809929714371167, + "grad_norm": 1.2840343160987466, + "learning_rate": 7.845687419322945e-06, + "loss": 0.2299, + "step": 2194 + }, + { + "epoch": 0.32824884103484375, + "grad_norm": 1.5111699549772195, + "learning_rate": 7.843695757645402e-06, + "loss": 0.2698, + "step": 2195 + }, + { + "epoch": 0.3283983849259758, + "grad_norm": 1.1728180317528463, + "learning_rate": 7.841703428809426e-06, + "loss": 0.1911, + "step": 2196 + }, + { + "epoch": 0.3285479288171078, + "grad_norm": 1.623773790762259, + "learning_rate": 7.839710433282441e-06, + "loss": 0.4539, + "step": 2197 + }, + { + "epoch": 0.32869747270823985, + "grad_norm": 1.3523478321165887, + "learning_rate": 7.83771677153202e-06, + "loss": 0.1695, + "step": 2198 + }, + { + "epoch": 0.32884701659937193, + "grad_norm": 1.669502440977397, + "learning_rate": 7.835722444025898e-06, + "loss": 0.2157, + "step": 2199 + }, + { + "epoch": 0.32899656049050396, + "grad_norm": 1.3304613082395382, + "learning_rate": 7.83372745123196e-06, + "loss": 0.1949, + "step": 2200 + }, + { + "epoch": 0.329146104381636, + "grad_norm": 1.857526998824266, + "learning_rate": 7.831731793618253e-06, + "loss": 0.2801, + "step": 2201 + }, + { + "epoch": 0.3292956482727681, + "grad_norm": 1.4165319093489228, + "learning_rate": 7.829735471652978e-06, + "loss": 0.3415, + "step": 2202 + }, + { + "epoch": 0.3294451921639001, + "grad_norm": 1.193763566127306, + "learning_rate": 7.827738485804488e-06, + "loss": 0.1967, + "step": 2203 + }, + { + "epoch": 0.32959473605503214, + "grad_norm": 1.4097654443065837, + "learning_rate": 7.825740836541299e-06, + "loss": 0.2322, + "step": 2204 + }, + { + "epoch": 0.3297442799461642, + "grad_norm": 0.8446160856119209, + "learning_rate": 7.823742524332074e-06, + "loss": 0.1748, + "step": 2205 + }, + { + "epoch": 0.32989382383729626, + "grad_norm": 1.4283812084385221, + "learning_rate": 7.821743549645642e-06, + "loss": 0.2485, + "step": 2206 + }, + { + "epoch": 0.3300433677284283, + "grad_norm": 1.522841097169646, + "learning_rate": 7.819743912950979e-06, + "loss": 0.3102, + "step": 2207 + }, + { + "epoch": 0.3301929116195603, + "grad_norm": 1.07026655557967, + "learning_rate": 7.817743614717218e-06, + "loss": 0.2192, + "step": 2208 + }, + { + "epoch": 0.3303424555106924, + "grad_norm": 1.5961930831288902, + "learning_rate": 7.815742655413651e-06, + "loss": 0.3224, + "step": 2209 + }, + { + "epoch": 0.33049199940182444, + "grad_norm": 1.4925568208469404, + "learning_rate": 7.813741035509718e-06, + "loss": 0.4544, + "step": 2210 + }, + { + "epoch": 0.3306415432929565, + "grad_norm": 1.0278606288492513, + "learning_rate": 7.811738755475024e-06, + "loss": 0.1586, + "step": 2211 + }, + { + "epoch": 0.3307910871840885, + "grad_norm": 1.6801015432287125, + "learning_rate": 7.80973581577932e-06, + "loss": 0.3744, + "step": 2212 + }, + { + "epoch": 0.3309406310752206, + "grad_norm": 1.7419776704612941, + "learning_rate": 7.807732216892514e-06, + "loss": 0.5292, + "step": 2213 + }, + { + "epoch": 0.3310901749663526, + "grad_norm": 1.2090204199623609, + "learning_rate": 7.80572795928467e-06, + "loss": 0.3566, + "step": 2214 + }, + { + "epoch": 0.33123971885748466, + "grad_norm": 1.0659334336267288, + "learning_rate": 7.803723043426008e-06, + "loss": 0.1775, + "step": 2215 + }, + { + "epoch": 0.33138926274861674, + "grad_norm": 1.2471584103742004, + "learning_rate": 7.8017174697869e-06, + "loss": 0.2591, + "step": 2216 + }, + { + "epoch": 0.3315388066397488, + "grad_norm": 1.2868762000450256, + "learning_rate": 7.799711238837871e-06, + "loss": 0.2203, + "step": 2217 + }, + { + "epoch": 0.3316883505308808, + "grad_norm": 1.597988709645616, + "learning_rate": 7.797704351049604e-06, + "loss": 0.4692, + "step": 2218 + }, + { + "epoch": 0.33183789442201284, + "grad_norm": 1.6497915275322537, + "learning_rate": 7.795696806892936e-06, + "loss": 0.5004, + "step": 2219 + }, + { + "epoch": 0.3319874383131449, + "grad_norm": 1.3728703674339666, + "learning_rate": 7.793688606838852e-06, + "loss": 0.2219, + "step": 2220 + }, + { + "epoch": 0.33213698220427695, + "grad_norm": 2.095794714839839, + "learning_rate": 7.791679751358497e-06, + "loss": 0.3322, + "step": 2221 + }, + { + "epoch": 0.332286526095409, + "grad_norm": 1.5144434482126574, + "learning_rate": 7.789670240923169e-06, + "loss": 0.3003, + "step": 2222 + }, + { + "epoch": 0.33243606998654107, + "grad_norm": 1.5220974726280843, + "learning_rate": 7.787660076004316e-06, + "loss": 0.3285, + "step": 2223 + }, + { + "epoch": 0.3325856138776731, + "grad_norm": 1.440338460328666, + "learning_rate": 7.785649257073544e-06, + "loss": 0.2101, + "step": 2224 + }, + { + "epoch": 0.33273515776880513, + "grad_norm": 1.5213659837860871, + "learning_rate": 7.783637784602608e-06, + "loss": 0.2036, + "step": 2225 + }, + { + "epoch": 0.33288470165993717, + "grad_norm": 2.991037147646805, + "learning_rate": 7.781625659063423e-06, + "loss": 0.6472, + "step": 2226 + }, + { + "epoch": 0.33303424555106925, + "grad_norm": 1.511479617581753, + "learning_rate": 7.779612880928052e-06, + "loss": 0.1741, + "step": 2227 + }, + { + "epoch": 0.3331837894422013, + "grad_norm": 1.4094694160926888, + "learning_rate": 7.777599450668708e-06, + "loss": 0.1864, + "step": 2228 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.9589341253223518, + "learning_rate": 7.775585368757767e-06, + "loss": 0.2585, + "step": 2229 + }, + { + "epoch": 0.3334828772244654, + "grad_norm": 1.5850528260281727, + "learning_rate": 7.773570635667746e-06, + "loss": 0.3359, + "step": 2230 + }, + { + "epoch": 0.33363242111559743, + "grad_norm": 1.928677128362692, + "learning_rate": 7.771555251871326e-06, + "loss": 0.5367, + "step": 2231 + }, + { + "epoch": 0.33378196500672946, + "grad_norm": 1.8147255946503564, + "learning_rate": 7.769539217841333e-06, + "loss": 0.2452, + "step": 2232 + }, + { + "epoch": 0.3339315088978615, + "grad_norm": 1.533478905239483, + "learning_rate": 7.76752253405075e-06, + "loss": 0.3273, + "step": 2233 + }, + { + "epoch": 0.3340810527889936, + "grad_norm": 1.3049158892916473, + "learning_rate": 7.765505200972708e-06, + "loss": 0.2014, + "step": 2234 + }, + { + "epoch": 0.3342305966801256, + "grad_norm": 1.6006461196766226, + "learning_rate": 7.763487219080492e-06, + "loss": 0.3297, + "step": 2235 + }, + { + "epoch": 0.33438014057125764, + "grad_norm": 1.3959531694066092, + "learning_rate": 7.761468588847543e-06, + "loss": 0.1987, + "step": 2236 + }, + { + "epoch": 0.33452968446238973, + "grad_norm": 1.413036480020334, + "learning_rate": 7.75944931074745e-06, + "loss": 0.2507, + "step": 2237 + }, + { + "epoch": 0.33467922835352176, + "grad_norm": 1.6143147896405532, + "learning_rate": 7.757429385253955e-06, + "loss": 0.365, + "step": 2238 + }, + { + "epoch": 0.3348287722446538, + "grad_norm": 1.2552402165493568, + "learning_rate": 7.755408812840952e-06, + "loss": 0.2404, + "step": 2239 + }, + { + "epoch": 0.3349783161357859, + "grad_norm": 1.884570594935436, + "learning_rate": 7.753387593982485e-06, + "loss": 0.4998, + "step": 2240 + }, + { + "epoch": 0.3351278600269179, + "grad_norm": 1.5525626441469005, + "learning_rate": 7.751365729152754e-06, + "loss": 0.1856, + "step": 2241 + }, + { + "epoch": 0.33527740391804994, + "grad_norm": 1.5710541861971588, + "learning_rate": 7.749343218826107e-06, + "loss": 0.1989, + "step": 2242 + }, + { + "epoch": 0.335426947809182, + "grad_norm": 1.9778222905539145, + "learning_rate": 7.747320063477045e-06, + "loss": 0.602, + "step": 2243 + }, + { + "epoch": 0.33557649170031406, + "grad_norm": 1.6193619290401855, + "learning_rate": 7.745296263580218e-06, + "loss": 0.2811, + "step": 2244 + }, + { + "epoch": 0.3357260355914461, + "grad_norm": 1.9654135361350595, + "learning_rate": 7.743271819610432e-06, + "loss": 0.5169, + "step": 2245 + }, + { + "epoch": 0.3358755794825781, + "grad_norm": 1.5452430622711606, + "learning_rate": 7.741246732042638e-06, + "loss": 0.345, + "step": 2246 + }, + { + "epoch": 0.3360251233737102, + "grad_norm": 1.7587162322576044, + "learning_rate": 7.739221001351942e-06, + "loss": 0.4502, + "step": 2247 + }, + { + "epoch": 0.33617466726484224, + "grad_norm": 1.9373589145144738, + "learning_rate": 7.7371946280136e-06, + "loss": 0.3852, + "step": 2248 + }, + { + "epoch": 0.3363242111559743, + "grad_norm": 1.9597929999948922, + "learning_rate": 7.73516761250302e-06, + "loss": 0.3904, + "step": 2249 + }, + { + "epoch": 0.3364737550471063, + "grad_norm": 1.8798765980010916, + "learning_rate": 7.733139955295756e-06, + "loss": 0.5182, + "step": 2250 + }, + { + "epoch": 0.3366232989382384, + "grad_norm": 1.626082298346251, + "learning_rate": 7.73111165686752e-06, + "loss": 0.3692, + "step": 2251 + }, + { + "epoch": 0.3367728428293704, + "grad_norm": 0.9953658155537086, + "learning_rate": 7.72908271769417e-06, + "loss": 0.1844, + "step": 2252 + }, + { + "epoch": 0.33692238672050245, + "grad_norm": 1.3727196538195634, + "learning_rate": 7.727053138251712e-06, + "loss": 0.2179, + "step": 2253 + }, + { + "epoch": 0.33707193061163454, + "grad_norm": 1.4912804935224222, + "learning_rate": 7.725022919016306e-06, + "loss": 0.3038, + "step": 2254 + }, + { + "epoch": 0.33722147450276657, + "grad_norm": 1.5520829591707979, + "learning_rate": 7.722992060464261e-06, + "loss": 0.2481, + "step": 2255 + }, + { + "epoch": 0.3373710183938986, + "grad_norm": 0.9325160815572334, + "learning_rate": 7.720960563072035e-06, + "loss": 0.1387, + "step": 2256 + }, + { + "epoch": 0.33752056228503063, + "grad_norm": 1.1905265030205905, + "learning_rate": 7.718928427316241e-06, + "loss": 0.2595, + "step": 2257 + }, + { + "epoch": 0.3376701061761627, + "grad_norm": 1.0242402171948481, + "learning_rate": 7.716895653673633e-06, + "loss": 0.1728, + "step": 2258 + }, + { + "epoch": 0.33781965006729475, + "grad_norm": 1.9399325651192516, + "learning_rate": 7.714862242621121e-06, + "loss": 0.236, + "step": 2259 + }, + { + "epoch": 0.3379691939584268, + "grad_norm": 1.2589445942870727, + "learning_rate": 7.712828194635762e-06, + "loss": 0.1777, + "step": 2260 + }, + { + "epoch": 0.33811873784955887, + "grad_norm": 1.0189796376895088, + "learning_rate": 7.710793510194765e-06, + "loss": 0.2061, + "step": 2261 + }, + { + "epoch": 0.3382682817406909, + "grad_norm": 1.3110462323223497, + "learning_rate": 7.708758189775485e-06, + "loss": 0.1907, + "step": 2262 + }, + { + "epoch": 0.33841782563182293, + "grad_norm": 1.8977870965152843, + "learning_rate": 7.706722233855428e-06, + "loss": 0.4697, + "step": 2263 + }, + { + "epoch": 0.33856736952295496, + "grad_norm": 1.676037286436981, + "learning_rate": 7.70468564291225e-06, + "loss": 0.5385, + "step": 2264 + }, + { + "epoch": 0.33871691341408705, + "grad_norm": 1.8926850673510127, + "learning_rate": 7.702648417423755e-06, + "loss": 0.6545, + "step": 2265 + }, + { + "epoch": 0.3388664573052191, + "grad_norm": 1.211799579288037, + "learning_rate": 7.700610557867894e-06, + "loss": 0.1933, + "step": 2266 + }, + { + "epoch": 0.3390160011963511, + "grad_norm": 1.3249080152566644, + "learning_rate": 7.69857206472277e-06, + "loss": 0.3369, + "step": 2267 + }, + { + "epoch": 0.3391655450874832, + "grad_norm": 1.197942464661273, + "learning_rate": 7.696532938466631e-06, + "loss": 0.2168, + "step": 2268 + }, + { + "epoch": 0.33931508897861523, + "grad_norm": 1.5336422309655264, + "learning_rate": 7.69449317957788e-06, + "loss": 0.1862, + "step": 2269 + }, + { + "epoch": 0.33946463286974726, + "grad_norm": 1.4755775481717086, + "learning_rate": 7.692452788535058e-06, + "loss": 0.3991, + "step": 2270 + }, + { + "epoch": 0.3396141767608793, + "grad_norm": 1.2003981851028729, + "learning_rate": 7.690411765816864e-06, + "loss": 0.1887, + "step": 2271 + }, + { + "epoch": 0.3397637206520114, + "grad_norm": 1.5232479955721698, + "learning_rate": 7.688370111902141e-06, + "loss": 0.3291, + "step": 2272 + }, + { + "epoch": 0.3399132645431434, + "grad_norm": 1.1630358360895665, + "learning_rate": 7.686327827269883e-06, + "loss": 0.3571, + "step": 2273 + }, + { + "epoch": 0.34006280843427544, + "grad_norm": 1.2180304152897419, + "learning_rate": 7.684284912399227e-06, + "loss": 0.2026, + "step": 2274 + }, + { + "epoch": 0.34021235232540753, + "grad_norm": 1.2838128091625958, + "learning_rate": 7.68224136776946e-06, + "loss": 0.1883, + "step": 2275 + }, + { + "epoch": 0.34036189621653956, + "grad_norm": 1.9043193483249239, + "learning_rate": 7.680197193860019e-06, + "loss": 0.5335, + "step": 2276 + }, + { + "epoch": 0.3405114401076716, + "grad_norm": 1.6719584667012997, + "learning_rate": 7.678152391150488e-06, + "loss": 0.4823, + "step": 2277 + }, + { + "epoch": 0.3406609839988036, + "grad_norm": 1.614799856208538, + "learning_rate": 7.676106960120595e-06, + "loss": 0.3589, + "step": 2278 + }, + { + "epoch": 0.3408105278899357, + "grad_norm": 1.915461278238496, + "learning_rate": 7.674060901250217e-06, + "loss": 0.4702, + "step": 2279 + }, + { + "epoch": 0.34096007178106774, + "grad_norm": 1.80642832565446, + "learning_rate": 7.672014215019382e-06, + "loss": 0.5783, + "step": 2280 + }, + { + "epoch": 0.3411096156721998, + "grad_norm": 1.8185018530729478, + "learning_rate": 7.66996690190826e-06, + "loss": 0.2136, + "step": 2281 + }, + { + "epoch": 0.34125915956333186, + "grad_norm": 1.5750847935557877, + "learning_rate": 7.667918962397172e-06, + "loss": 0.3577, + "step": 2282 + }, + { + "epoch": 0.3414087034544639, + "grad_norm": 1.4052021520161486, + "learning_rate": 7.665870396966582e-06, + "loss": 0.3421, + "step": 2283 + }, + { + "epoch": 0.3415582473455959, + "grad_norm": 2.4796312435140964, + "learning_rate": 7.663821206097106e-06, + "loss": 0.5199, + "step": 2284 + }, + { + "epoch": 0.34170779123672795, + "grad_norm": 1.9135083427877795, + "learning_rate": 7.661771390269506e-06, + "loss": 0.5724, + "step": 2285 + }, + { + "epoch": 0.34185733512786004, + "grad_norm": 1.47040509500929, + "learning_rate": 7.65972094996468e-06, + "loss": 0.3446, + "step": 2286 + }, + { + "epoch": 0.34200687901899207, + "grad_norm": 1.8826362598607522, + "learning_rate": 7.65766988566369e-06, + "loss": 0.4584, + "step": 2287 + }, + { + "epoch": 0.3421564229101241, + "grad_norm": 1.583414431479772, + "learning_rate": 7.65561819784773e-06, + "loss": 0.2571, + "step": 2288 + }, + { + "epoch": 0.3423059668012562, + "grad_norm": 1.1374097342884393, + "learning_rate": 7.653565886998149e-06, + "loss": 0.2772, + "step": 2289 + }, + { + "epoch": 0.3424555106923882, + "grad_norm": 1.6018927652316786, + "learning_rate": 7.651512953596438e-06, + "loss": 0.3834, + "step": 2290 + }, + { + "epoch": 0.34260505458352025, + "grad_norm": 1.5898030999912747, + "learning_rate": 7.649459398124233e-06, + "loss": 0.268, + "step": 2291 + }, + { + "epoch": 0.3427545984746523, + "grad_norm": 1.788880984497509, + "learning_rate": 7.64740522106332e-06, + "loss": 0.4031, + "step": 2292 + }, + { + "epoch": 0.34290414236578437, + "grad_norm": 1.4103716345007062, + "learning_rate": 7.645350422895627e-06, + "loss": 0.34, + "step": 2293 + }, + { + "epoch": 0.3430536862569164, + "grad_norm": 1.455320179410942, + "learning_rate": 7.643295004103232e-06, + "loss": 0.393, + "step": 2294 + }, + { + "epoch": 0.34320323014804843, + "grad_norm": 1.121798327145041, + "learning_rate": 7.641238965168356e-06, + "loss": 0.1951, + "step": 2295 + }, + { + "epoch": 0.3433527740391805, + "grad_norm": 1.959743352728859, + "learning_rate": 7.639182306573362e-06, + "loss": 0.5306, + "step": 2296 + }, + { + "epoch": 0.34350231793031255, + "grad_norm": 1.0222053736742456, + "learning_rate": 7.637125028800765e-06, + "loss": 0.1877, + "step": 2297 + }, + { + "epoch": 0.3436518618214446, + "grad_norm": 1.2126683631983057, + "learning_rate": 7.63506713233322e-06, + "loss": 0.2219, + "step": 2298 + }, + { + "epoch": 0.34380140571257667, + "grad_norm": 1.182330026578451, + "learning_rate": 7.633008617653531e-06, + "loss": 0.2427, + "step": 2299 + }, + { + "epoch": 0.3439509496037087, + "grad_norm": 1.3284239825142252, + "learning_rate": 7.630949485244646e-06, + "loss": 0.3342, + "step": 2300 + }, + { + "epoch": 0.34410049349484073, + "grad_norm": 1.4991905889604522, + "learning_rate": 7.6288897355896565e-06, + "loss": 0.3277, + "step": 2301 + }, + { + "epoch": 0.34425003738597276, + "grad_norm": 1.2105004198602645, + "learning_rate": 7.6268293691718e-06, + "loss": 0.2175, + "step": 2302 + }, + { + "epoch": 0.34439958127710485, + "grad_norm": 1.137201185526714, + "learning_rate": 7.624768386474456e-06, + "loss": 0.1951, + "step": 2303 + }, + { + "epoch": 0.3445491251682369, + "grad_norm": 1.2921483182674967, + "learning_rate": 7.622706787981153e-06, + "loss": 0.2069, + "step": 2304 + }, + { + "epoch": 0.3446986690593689, + "grad_norm": 1.5385416209630458, + "learning_rate": 7.620644574175562e-06, + "loss": 0.3467, + "step": 2305 + }, + { + "epoch": 0.344848212950501, + "grad_norm": 2.272200349391826, + "learning_rate": 7.6185817455414975e-06, + "loss": 0.4775, + "step": 2306 + }, + { + "epoch": 0.34499775684163303, + "grad_norm": 1.7594806067750843, + "learning_rate": 7.61651830256292e-06, + "loss": 0.2168, + "step": 2307 + }, + { + "epoch": 0.34514730073276506, + "grad_norm": 1.9724043128153095, + "learning_rate": 7.614454245723932e-06, + "loss": 0.5726, + "step": 2308 + }, + { + "epoch": 0.3452968446238971, + "grad_norm": 1.4742051107263883, + "learning_rate": 7.612389575508781e-06, + "loss": 0.3032, + "step": 2309 + }, + { + "epoch": 0.3454463885150292, + "grad_norm": 2.0620637201052325, + "learning_rate": 7.610324292401861e-06, + "loss": 0.309, + "step": 2310 + }, + { + "epoch": 0.3455959324061612, + "grad_norm": 1.0544585597609104, + "learning_rate": 7.608258396887702e-06, + "loss": 0.2339, + "step": 2311 + }, + { + "epoch": 0.34574547629729324, + "grad_norm": 1.2388411799285035, + "learning_rate": 7.606191889450989e-06, + "loss": 0.1824, + "step": 2312 + }, + { + "epoch": 0.34589502018842533, + "grad_norm": 1.9886690672129868, + "learning_rate": 7.604124770576539e-06, + "loss": 0.3321, + "step": 2313 + }, + { + "epoch": 0.34604456407955736, + "grad_norm": 1.3774072344684907, + "learning_rate": 7.602057040749325e-06, + "loss": 0.3824, + "step": 2314 + }, + { + "epoch": 0.3461941079706894, + "grad_norm": 0.842321998785562, + "learning_rate": 7.59998870045445e-06, + "loss": 0.1966, + "step": 2315 + }, + { + "epoch": 0.3463436518618214, + "grad_norm": 1.6133223844738274, + "learning_rate": 7.597919750177168e-06, + "loss": 0.3439, + "step": 2316 + }, + { + "epoch": 0.3464931957529535, + "grad_norm": 2.701793044817531, + "learning_rate": 7.595850190402877e-06, + "loss": 0.3649, + "step": 2317 + }, + { + "epoch": 0.34664273964408554, + "grad_norm": 2.042826281041951, + "learning_rate": 7.593780021617115e-06, + "loss": 0.5945, + "step": 2318 + }, + { + "epoch": 0.34679228353521757, + "grad_norm": 1.8226181825863899, + "learning_rate": 7.591709244305561e-06, + "loss": 0.4039, + "step": 2319 + }, + { + "epoch": 0.34694182742634966, + "grad_norm": 1.7612529231819496, + "learning_rate": 7.589637858954041e-06, + "loss": 0.2384, + "step": 2320 + }, + { + "epoch": 0.3470913713174817, + "grad_norm": 1.2495900823088222, + "learning_rate": 7.587565866048523e-06, + "loss": 0.3633, + "step": 2321 + }, + { + "epoch": 0.3472409152086137, + "grad_norm": 1.2456808766471323, + "learning_rate": 7.5854932660751144e-06, + "loss": 0.2318, + "step": 2322 + }, + { + "epoch": 0.34739045909974575, + "grad_norm": 1.772092412043284, + "learning_rate": 7.58342005952007e-06, + "loss": 0.2514, + "step": 2323 + }, + { + "epoch": 0.34754000299087784, + "grad_norm": 1.0718322965745777, + "learning_rate": 7.581346246869781e-06, + "loss": 0.2001, + "step": 2324 + }, + { + "epoch": 0.34768954688200987, + "grad_norm": 1.9749261015333692, + "learning_rate": 7.579271828610786e-06, + "loss": 0.642, + "step": 2325 + }, + { + "epoch": 0.3478390907731419, + "grad_norm": 3.5434908077443183, + "learning_rate": 7.5771968052297605e-06, + "loss": 0.2132, + "step": 2326 + }, + { + "epoch": 0.347988634664274, + "grad_norm": 1.3138111751860577, + "learning_rate": 7.575121177213528e-06, + "loss": 0.2074, + "step": 2327 + }, + { + "epoch": 0.348138178555406, + "grad_norm": 1.4549216960822273, + "learning_rate": 7.573044945049051e-06, + "loss": 0.2213, + "step": 2328 + }, + { + "epoch": 0.34828772244653805, + "grad_norm": 1.9285857186532291, + "learning_rate": 7.5709681092234315e-06, + "loss": 0.5573, + "step": 2329 + }, + { + "epoch": 0.3484372663376701, + "grad_norm": 1.528743992296175, + "learning_rate": 7.568890670223918e-06, + "loss": 0.2517, + "step": 2330 + }, + { + "epoch": 0.34858681022880217, + "grad_norm": 1.3064713646436477, + "learning_rate": 7.566812628537894e-06, + "loss": 0.1867, + "step": 2331 + }, + { + "epoch": 0.3487363541199342, + "grad_norm": 1.3819819935958808, + "learning_rate": 7.56473398465289e-06, + "loss": 0.2354, + "step": 2332 + }, + { + "epoch": 0.34888589801106623, + "grad_norm": 1.90386469705261, + "learning_rate": 7.5626547390565766e-06, + "loss": 0.6091, + "step": 2333 + }, + { + "epoch": 0.3490354419021983, + "grad_norm": 1.1572160857164981, + "learning_rate": 7.5605748922367636e-06, + "loss": 0.1634, + "step": 2334 + }, + { + "epoch": 0.34918498579333035, + "grad_norm": 1.120922647106074, + "learning_rate": 7.558494444681405e-06, + "loss": 0.2191, + "step": 2335 + }, + { + "epoch": 0.3493345296844624, + "grad_norm": 1.8933761063913324, + "learning_rate": 7.556413396878593e-06, + "loss": 0.4644, + "step": 2336 + }, + { + "epoch": 0.3494840735755944, + "grad_norm": 2.2982217544810526, + "learning_rate": 7.554331749316559e-06, + "loss": 0.5312, + "step": 2337 + }, + { + "epoch": 0.3496336174667265, + "grad_norm": 1.4598497201666212, + "learning_rate": 7.552249502483681e-06, + "loss": 0.2322, + "step": 2338 + }, + { + "epoch": 0.34978316135785853, + "grad_norm": 1.0002482253315497, + "learning_rate": 7.550166656868472e-06, + "loss": 0.183, + "step": 2339 + }, + { + "epoch": 0.34993270524899056, + "grad_norm": 1.7931068633163847, + "learning_rate": 7.548083212959588e-06, + "loss": 0.3655, + "step": 2340 + }, + { + "epoch": 0.35008224914012265, + "grad_norm": 1.9088729651038523, + "learning_rate": 7.545999171245826e-06, + "loss": 0.5897, + "step": 2341 + }, + { + "epoch": 0.3502317930312547, + "grad_norm": 1.6014278355250144, + "learning_rate": 7.543914532216121e-06, + "loss": 0.3399, + "step": 2342 + }, + { + "epoch": 0.3503813369223867, + "grad_norm": 1.589397325565207, + "learning_rate": 7.541829296359552e-06, + "loss": 0.5276, + "step": 2343 + }, + { + "epoch": 0.35053088081351874, + "grad_norm": 1.3666145854556788, + "learning_rate": 7.5397434641653325e-06, + "loss": 0.297, + "step": 2344 + }, + { + "epoch": 0.35068042470465083, + "grad_norm": 1.1296429490766309, + "learning_rate": 7.53765703612282e-06, + "loss": 0.1881, + "step": 2345 + }, + { + "epoch": 0.35082996859578286, + "grad_norm": 1.3370573999376023, + "learning_rate": 7.535570012721509e-06, + "loss": 0.1978, + "step": 2346 + }, + { + "epoch": 0.3509795124869149, + "grad_norm": 1.80559813243436, + "learning_rate": 7.533482394451037e-06, + "loss": 0.5293, + "step": 2347 + }, + { + "epoch": 0.351129056378047, + "grad_norm": 1.2601205754536828, + "learning_rate": 7.531394181801182e-06, + "loss": 0.3029, + "step": 2348 + }, + { + "epoch": 0.351278600269179, + "grad_norm": 1.5151339748208972, + "learning_rate": 7.529305375261852e-06, + "loss": 0.354, + "step": 2349 + }, + { + "epoch": 0.35142814416031104, + "grad_norm": 1.1154348924689401, + "learning_rate": 7.52721597532311e-06, + "loss": 0.1877, + "step": 2350 + }, + { + "epoch": 0.35157768805144307, + "grad_norm": 1.6903766996209675, + "learning_rate": 7.525125982475141e-06, + "loss": 0.3838, + "step": 2351 + }, + { + "epoch": 0.35172723194257516, + "grad_norm": 1.5111917986875527, + "learning_rate": 7.523035397208281e-06, + "loss": 0.3664, + "step": 2352 + }, + { + "epoch": 0.3518767758337072, + "grad_norm": 14.577070159201043, + "learning_rate": 7.520944220013002e-06, + "loss": 0.2204, + "step": 2353 + }, + { + "epoch": 0.3520263197248392, + "grad_norm": 1.2720960634080132, + "learning_rate": 7.518852451379914e-06, + "loss": 0.2156, + "step": 2354 + }, + { + "epoch": 0.3521758636159713, + "grad_norm": 1.4752321481872246, + "learning_rate": 7.516760091799766e-06, + "loss": 0.2068, + "step": 2355 + }, + { + "epoch": 0.35232540750710334, + "grad_norm": 1.2757349060760932, + "learning_rate": 7.5146671417634456e-06, + "loss": 0.3031, + "step": 2356 + }, + { + "epoch": 0.35247495139823537, + "grad_norm": 1.1892854387567553, + "learning_rate": 7.512573601761979e-06, + "loss": 0.1938, + "step": 2357 + }, + { + "epoch": 0.35262449528936746, + "grad_norm": 1.1695321760256765, + "learning_rate": 7.5104794722865305e-06, + "loss": 0.1994, + "step": 2358 + }, + { + "epoch": 0.3527740391804995, + "grad_norm": 1.4157720422061164, + "learning_rate": 7.5083847538284025e-06, + "loss": 0.2415, + "step": 2359 + }, + { + "epoch": 0.3529235830716315, + "grad_norm": 1.642650861184269, + "learning_rate": 7.506289446879038e-06, + "loss": 0.3864, + "step": 2360 + }, + { + "epoch": 0.35307312696276355, + "grad_norm": 1.66074120706464, + "learning_rate": 7.504193551930014e-06, + "loss": 0.2436, + "step": 2361 + }, + { + "epoch": 0.35322267085389564, + "grad_norm": 1.6293105897438254, + "learning_rate": 7.502097069473051e-06, + "loss": 0.4727, + "step": 2362 + }, + { + "epoch": 0.35337221474502767, + "grad_norm": 2.1484736491543326, + "learning_rate": 7.500000000000001e-06, + "loss": 0.6399, + "step": 2363 + }, + { + "epoch": 0.3535217586361597, + "grad_norm": 1.998088679617451, + "learning_rate": 7.497902344002858e-06, + "loss": 0.4879, + "step": 2364 + }, + { + "epoch": 0.3536713025272918, + "grad_norm": 1.0466569132461796, + "learning_rate": 7.495804101973751e-06, + "loss": 0.175, + "step": 2365 + }, + { + "epoch": 0.3538208464184238, + "grad_norm": 1.483423152464052, + "learning_rate": 7.49370527440495e-06, + "loss": 0.1817, + "step": 2366 + }, + { + "epoch": 0.35397039030955585, + "grad_norm": 1.181675839490534, + "learning_rate": 7.491605861788856e-06, + "loss": 0.2158, + "step": 2367 + }, + { + "epoch": 0.3541199342006879, + "grad_norm": 1.272613662606653, + "learning_rate": 7.4895058646180165e-06, + "loss": 0.2161, + "step": 2368 + }, + { + "epoch": 0.35426947809181997, + "grad_norm": 2.249706812141276, + "learning_rate": 7.487405283385109e-06, + "loss": 0.4302, + "step": 2369 + }, + { + "epoch": 0.354419021982952, + "grad_norm": 1.790056947576118, + "learning_rate": 7.485304118582949e-06, + "loss": 0.4237, + "step": 2370 + }, + { + "epoch": 0.35456856587408403, + "grad_norm": 1.728469311368499, + "learning_rate": 7.483202370704492e-06, + "loss": 0.232, + "step": 2371 + }, + { + "epoch": 0.3547181097652161, + "grad_norm": 1.7204599933992115, + "learning_rate": 7.481100040242827e-06, + "loss": 0.2945, + "step": 2372 + }, + { + "epoch": 0.35486765365634815, + "grad_norm": 1.7017419956837787, + "learning_rate": 7.478997127691181e-06, + "loss": 0.324, + "step": 2373 + }, + { + "epoch": 0.3550171975474802, + "grad_norm": 1.7242409041398354, + "learning_rate": 7.476893633542917e-06, + "loss": 0.379, + "step": 2374 + }, + { + "epoch": 0.3551667414386122, + "grad_norm": 1.7352420641389674, + "learning_rate": 7.474789558291537e-06, + "loss": 0.3666, + "step": 2375 + }, + { + "epoch": 0.3553162853297443, + "grad_norm": 2.417484284214745, + "learning_rate": 7.472684902430678e-06, + "loss": 0.5313, + "step": 2376 + }, + { + "epoch": 0.35546582922087633, + "grad_norm": 1.9863314858317191, + "learning_rate": 7.470579666454108e-06, + "loss": 0.4731, + "step": 2377 + }, + { + "epoch": 0.35561537311200836, + "grad_norm": 1.3767673556499953, + "learning_rate": 7.46847385085574e-06, + "loss": 0.2309, + "step": 2378 + }, + { + "epoch": 0.35576491700314045, + "grad_norm": 2.1027176103910423, + "learning_rate": 7.466367456129616e-06, + "loss": 0.2419, + "step": 2379 + }, + { + "epoch": 0.3559144608942725, + "grad_norm": 1.3223085016147877, + "learning_rate": 7.464260482769917e-06, + "loss": 0.2214, + "step": 2380 + }, + { + "epoch": 0.3560640047854045, + "grad_norm": 3.308046838444924, + "learning_rate": 7.462152931270961e-06, + "loss": 0.1637, + "step": 2381 + }, + { + "epoch": 0.35621354867653654, + "grad_norm": 1.2529639271363728, + "learning_rate": 7.4600448021271975e-06, + "loss": 0.209, + "step": 2382 + }, + { + "epoch": 0.3563630925676686, + "grad_norm": 1.4685959227889136, + "learning_rate": 7.457936095833216e-06, + "loss": 0.3916, + "step": 2383 + }, + { + "epoch": 0.35651263645880066, + "grad_norm": 1.762730610248373, + "learning_rate": 7.455826812883738e-06, + "loss": 0.2468, + "step": 2384 + }, + { + "epoch": 0.3566621803499327, + "grad_norm": 1.3974489415451616, + "learning_rate": 7.453716953773622e-06, + "loss": 0.2198, + "step": 2385 + }, + { + "epoch": 0.3568117242410648, + "grad_norm": 1.6020912414110884, + "learning_rate": 7.4516065189978625e-06, + "loss": 0.3663, + "step": 2386 + }, + { + "epoch": 0.3569612681321968, + "grad_norm": 1.1698502771825479, + "learning_rate": 7.449495509051584e-06, + "loss": 0.2301, + "step": 2387 + }, + { + "epoch": 0.35711081202332884, + "grad_norm": 1.6443713783244578, + "learning_rate": 7.447383924430055e-06, + "loss": 0.2023, + "step": 2388 + }, + { + "epoch": 0.35726035591446087, + "grad_norm": 1.320656374724735, + "learning_rate": 7.44527176562867e-06, + "loss": 0.1615, + "step": 2389 + }, + { + "epoch": 0.35740989980559296, + "grad_norm": 2.147900342981391, + "learning_rate": 7.4431590331429615e-06, + "loss": 0.2496, + "step": 2390 + }, + { + "epoch": 0.357559443696725, + "grad_norm": 1.0493224753110673, + "learning_rate": 7.441045727468601e-06, + "loss": 0.174, + "step": 2391 + }, + { + "epoch": 0.357708987587857, + "grad_norm": 1.4365731625537776, + "learning_rate": 7.4389318491013855e-06, + "loss": 0.2509, + "step": 2392 + }, + { + "epoch": 0.3578585314789891, + "grad_norm": 1.3570392728095118, + "learning_rate": 7.436817398537253e-06, + "loss": 0.3505, + "step": 2393 + }, + { + "epoch": 0.35800807537012114, + "grad_norm": 0.9836140143478714, + "learning_rate": 7.434702376272275e-06, + "loss": 0.1676, + "step": 2394 + }, + { + "epoch": 0.35815761926125317, + "grad_norm": 1.6386886795873667, + "learning_rate": 7.4325867828026555e-06, + "loss": 0.3311, + "step": 2395 + }, + { + "epoch": 0.3583071631523852, + "grad_norm": 1.7664542023376915, + "learning_rate": 7.4304706186247344e-06, + "loss": 0.4952, + "step": 2396 + }, + { + "epoch": 0.3584567070435173, + "grad_norm": 1.970974898714591, + "learning_rate": 7.42835388423498e-06, + "loss": 0.5721, + "step": 2397 + }, + { + "epoch": 0.3586062509346493, + "grad_norm": 1.234101790360312, + "learning_rate": 7.426236580130004e-06, + "loss": 0.1842, + "step": 2398 + }, + { + "epoch": 0.35875579482578135, + "grad_norm": 0.8359881523865066, + "learning_rate": 7.424118706806543e-06, + "loss": 0.1574, + "step": 2399 + }, + { + "epoch": 0.35890533871691344, + "grad_norm": 1.2167515845771457, + "learning_rate": 7.422000264761471e-06, + "loss": 0.3393, + "step": 2400 + }, + { + "epoch": 0.35905488260804547, + "grad_norm": 1.1652590311365825, + "learning_rate": 7.419881254491794e-06, + "loss": 0.3154, + "step": 2401 + }, + { + "epoch": 0.3592044264991775, + "grad_norm": 1.7451208164751588, + "learning_rate": 7.417761676494654e-06, + "loss": 0.3306, + "step": 2402 + }, + { + "epoch": 0.35935397039030953, + "grad_norm": 1.50819933578857, + "learning_rate": 7.415641531267325e-06, + "loss": 0.3414, + "step": 2403 + }, + { + "epoch": 0.3595035142814416, + "grad_norm": 1.159271590101521, + "learning_rate": 7.4135208193072126e-06, + "loss": 0.1953, + "step": 2404 + }, + { + "epoch": 0.35965305817257365, + "grad_norm": 1.6763078000161467, + "learning_rate": 7.411399541111855e-06, + "loss": 0.3306, + "step": 2405 + }, + { + "epoch": 0.3598026020637057, + "grad_norm": 1.6105048047144768, + "learning_rate": 7.409277697178926e-06, + "loss": 0.2899, + "step": 2406 + }, + { + "epoch": 0.35995214595483777, + "grad_norm": 1.6423579180660441, + "learning_rate": 7.4071552880062295e-06, + "loss": 0.1941, + "step": 2407 + }, + { + "epoch": 0.3601016898459698, + "grad_norm": 1.984850391003558, + "learning_rate": 7.4050323140917035e-06, + "loss": 0.509, + "step": 2408 + }, + { + "epoch": 0.36025123373710183, + "grad_norm": 1.6162723010996198, + "learning_rate": 7.402908775933419e-06, + "loss": 0.3583, + "step": 2409 + }, + { + "epoch": 0.36040077762823386, + "grad_norm": 1.3526627301579437, + "learning_rate": 7.400784674029579e-06, + "loss": 0.2228, + "step": 2410 + }, + { + "epoch": 0.36055032151936595, + "grad_norm": 1.0613997053031348, + "learning_rate": 7.398660008878517e-06, + "loss": 0.1782, + "step": 2411 + }, + { + "epoch": 0.360699865410498, + "grad_norm": 1.4156014777328207, + "learning_rate": 7.396534780978699e-06, + "loss": 0.3675, + "step": 2412 + }, + { + "epoch": 0.36084940930163, + "grad_norm": 2.0634027441697778, + "learning_rate": 7.394408990828726e-06, + "loss": 0.3529, + "step": 2413 + }, + { + "epoch": 0.3609989531927621, + "grad_norm": 1.6015524340522593, + "learning_rate": 7.392282638927326e-06, + "loss": 0.233, + "step": 2414 + }, + { + "epoch": 0.3611484970838941, + "grad_norm": 1.047148380596162, + "learning_rate": 7.390155725773365e-06, + "loss": 0.1861, + "step": 2415 + }, + { + "epoch": 0.36129804097502616, + "grad_norm": 1.6393955983196027, + "learning_rate": 7.388028251865837e-06, + "loss": 0.1914, + "step": 2416 + }, + { + "epoch": 0.36144758486615824, + "grad_norm": 1.3926718614765299, + "learning_rate": 7.385900217703865e-06, + "loss": 0.186, + "step": 2417 + }, + { + "epoch": 0.3615971287572903, + "grad_norm": 1.5958216662832228, + "learning_rate": 7.383771623786709e-06, + "loss": 0.2127, + "step": 2418 + }, + { + "epoch": 0.3617466726484223, + "grad_norm": 1.956943418621964, + "learning_rate": 7.381642470613758e-06, + "loss": 0.5055, + "step": 2419 + }, + { + "epoch": 0.36189621653955434, + "grad_norm": 2.2836322625873855, + "learning_rate": 7.37951275868453e-06, + "loss": 0.6337, + "step": 2420 + }, + { + "epoch": 0.3620457604306864, + "grad_norm": 1.922712275476603, + "learning_rate": 7.3773824884986744e-06, + "loss": 0.4718, + "step": 2421 + }, + { + "epoch": 0.36219530432181846, + "grad_norm": 1.01255124960959, + "learning_rate": 7.375251660555978e-06, + "loss": 0.1985, + "step": 2422 + }, + { + "epoch": 0.3623448482129505, + "grad_norm": 1.6681331401639201, + "learning_rate": 7.373120275356349e-06, + "loss": 0.4116, + "step": 2423 + }, + { + "epoch": 0.3624943921040826, + "grad_norm": 1.3837406537641457, + "learning_rate": 7.370988333399834e-06, + "loss": 0.2207, + "step": 2424 + }, + { + "epoch": 0.3626439359952146, + "grad_norm": 1.559290768311182, + "learning_rate": 7.3688558351866055e-06, + "loss": 0.2715, + "step": 2425 + }, + { + "epoch": 0.36279347988634664, + "grad_norm": 1.305139066591499, + "learning_rate": 7.366722781216968e-06, + "loss": 0.3011, + "step": 2426 + }, + { + "epoch": 0.36294302377747867, + "grad_norm": 1.6209491159652774, + "learning_rate": 7.3645891719913584e-06, + "loss": 0.3261, + "step": 2427 + }, + { + "epoch": 0.36309256766861076, + "grad_norm": 1.4951914962551227, + "learning_rate": 7.3624550080103385e-06, + "loss": 0.3393, + "step": 2428 + }, + { + "epoch": 0.3632421115597428, + "grad_norm": 1.8477503569108042, + "learning_rate": 7.360320289774607e-06, + "loss": 0.3535, + "step": 2429 + }, + { + "epoch": 0.3633916554508748, + "grad_norm": 1.8411787417724799, + "learning_rate": 7.358185017784989e-06, + "loss": 0.3847, + "step": 2430 + }, + { + "epoch": 0.3635411993420069, + "grad_norm": 1.3297346850852803, + "learning_rate": 7.356049192542439e-06, + "loss": 0.1938, + "step": 2431 + }, + { + "epoch": 0.36369074323313894, + "grad_norm": 1.746289398019253, + "learning_rate": 7.353912814548042e-06, + "loss": 0.5313, + "step": 2432 + }, + { + "epoch": 0.36384028712427097, + "grad_norm": 1.4432992935580808, + "learning_rate": 7.351775884303013e-06, + "loss": 0.3349, + "step": 2433 + }, + { + "epoch": 0.363989831015403, + "grad_norm": 0.9649898248590643, + "learning_rate": 7.349638402308696e-06, + "loss": 0.2208, + "step": 2434 + }, + { + "epoch": 0.3641393749065351, + "grad_norm": 1.8449998111001102, + "learning_rate": 7.347500369066567e-06, + "loss": 0.3445, + "step": 2435 + }, + { + "epoch": 0.3642889187976671, + "grad_norm": 1.7052431627521538, + "learning_rate": 7.345361785078227e-06, + "loss": 0.2205, + "step": 2436 + }, + { + "epoch": 0.36443846268879915, + "grad_norm": 1.172361472405274, + "learning_rate": 7.343222650845408e-06, + "loss": 0.217, + "step": 2437 + }, + { + "epoch": 0.36458800657993123, + "grad_norm": 1.6796925766446777, + "learning_rate": 7.341082966869975e-06, + "loss": 0.3589, + "step": 2438 + }, + { + "epoch": 0.36473755047106327, + "grad_norm": 1.7021974172455587, + "learning_rate": 7.3389427336539146e-06, + "loss": 0.3936, + "step": 2439 + }, + { + "epoch": 0.3648870943621953, + "grad_norm": 1.5350766828655338, + "learning_rate": 7.336801951699348e-06, + "loss": 0.4299, + "step": 2440 + }, + { + "epoch": 0.36503663825332733, + "grad_norm": 2.0471270711764524, + "learning_rate": 7.334660621508523e-06, + "loss": 0.6307, + "step": 2441 + }, + { + "epoch": 0.3651861821444594, + "grad_norm": 1.3866667939244877, + "learning_rate": 7.3325187435838145e-06, + "loss": 0.3435, + "step": 2442 + }, + { + "epoch": 0.36533572603559145, + "grad_norm": 1.9710057444965332, + "learning_rate": 7.330376318427731e-06, + "loss": 0.3715, + "step": 2443 + }, + { + "epoch": 0.3654852699267235, + "grad_norm": 1.9432365570364065, + "learning_rate": 7.328233346542906e-06, + "loss": 0.5569, + "step": 2444 + }, + { + "epoch": 0.36563481381785556, + "grad_norm": 1.0580374357411853, + "learning_rate": 7.326089828432097e-06, + "loss": 0.2205, + "step": 2445 + }, + { + "epoch": 0.3657843577089876, + "grad_norm": 1.9944374991832907, + "learning_rate": 7.323945764598198e-06, + "loss": 0.5225, + "step": 2446 + }, + { + "epoch": 0.3659339016001196, + "grad_norm": 1.3286346243167817, + "learning_rate": 7.321801155544227e-06, + "loss": 0.3126, + "step": 2447 + }, + { + "epoch": 0.36608344549125166, + "grad_norm": 1.797949701766733, + "learning_rate": 7.319656001773326e-06, + "loss": 0.5528, + "step": 2448 + }, + { + "epoch": 0.36623298938238374, + "grad_norm": 1.3871718042688361, + "learning_rate": 7.317510303788775e-06, + "loss": 0.1986, + "step": 2449 + }, + { + "epoch": 0.3663825332735158, + "grad_norm": 1.696323576703982, + "learning_rate": 7.31536406209397e-06, + "loss": 0.4646, + "step": 2450 + }, + { + "epoch": 0.3665320771646478, + "grad_norm": 1.112116781879356, + "learning_rate": 7.313217277192441e-06, + "loss": 0.1698, + "step": 2451 + }, + { + "epoch": 0.3666816210557799, + "grad_norm": 1.6253111202173256, + "learning_rate": 7.311069949587849e-06, + "loss": 0.368, + "step": 2452 + }, + { + "epoch": 0.3668311649469119, + "grad_norm": 1.8012549383451812, + "learning_rate": 7.308922079783972e-06, + "loss": 0.4943, + "step": 2453 + }, + { + "epoch": 0.36698070883804396, + "grad_norm": 1.73000381975083, + "learning_rate": 7.306773668284723e-06, + "loss": 0.543, + "step": 2454 + }, + { + "epoch": 0.367130252729176, + "grad_norm": 1.7982349971106675, + "learning_rate": 7.30462471559414e-06, + "loss": 0.3192, + "step": 2455 + }, + { + "epoch": 0.3672797966203081, + "grad_norm": 1.3452770378011227, + "learning_rate": 7.302475222216388e-06, + "loss": 0.3435, + "step": 2456 + }, + { + "epoch": 0.3674293405114401, + "grad_norm": 1.673472364447151, + "learning_rate": 7.300325188655762e-06, + "loss": 0.2812, + "step": 2457 + }, + { + "epoch": 0.36757888440257214, + "grad_norm": 1.5298434482917413, + "learning_rate": 7.298174615416676e-06, + "loss": 0.2351, + "step": 2458 + }, + { + "epoch": 0.3677284282937042, + "grad_norm": 0.9700694807997261, + "learning_rate": 7.2960235030036765e-06, + "loss": 0.1807, + "step": 2459 + }, + { + "epoch": 0.36787797218483625, + "grad_norm": 1.3628120017832577, + "learning_rate": 7.293871851921435e-06, + "loss": 0.1775, + "step": 2460 + }, + { + "epoch": 0.3680275160759683, + "grad_norm": 1.331669281066242, + "learning_rate": 7.29171966267475e-06, + "loss": 0.1953, + "step": 2461 + }, + { + "epoch": 0.3681770599671003, + "grad_norm": 1.7478823536432608, + "learning_rate": 7.2895669357685465e-06, + "loss": 0.402, + "step": 2462 + }, + { + "epoch": 0.3683266038582324, + "grad_norm": 1.0780884237235537, + "learning_rate": 7.287413671707875e-06, + "loss": 0.2042, + "step": 2463 + }, + { + "epoch": 0.36847614774936444, + "grad_norm": 1.39413350641363, + "learning_rate": 7.285259870997911e-06, + "loss": 0.2161, + "step": 2464 + }, + { + "epoch": 0.36862569164049647, + "grad_norm": 1.781457240613985, + "learning_rate": 7.283105534143957e-06, + "loss": 0.4025, + "step": 2465 + }, + { + "epoch": 0.36877523553162855, + "grad_norm": 1.298349295097918, + "learning_rate": 7.280950661651443e-06, + "loss": 0.2216, + "step": 2466 + }, + { + "epoch": 0.3689247794227606, + "grad_norm": 1.6025246428329694, + "learning_rate": 7.278795254025921e-06, + "loss": 0.2132, + "step": 2467 + }, + { + "epoch": 0.3690743233138926, + "grad_norm": 1.9856391732187633, + "learning_rate": 7.276639311773068e-06, + "loss": 0.5661, + "step": 2468 + }, + { + "epoch": 0.36922386720502465, + "grad_norm": 1.901744160559575, + "learning_rate": 7.274482835398695e-06, + "loss": 0.5376, + "step": 2469 + }, + { + "epoch": 0.36937341109615673, + "grad_norm": 1.3567504841626332, + "learning_rate": 7.272325825408728e-06, + "loss": 0.2408, + "step": 2470 + }, + { + "epoch": 0.36952295498728877, + "grad_norm": 2.2451530510094253, + "learning_rate": 7.270168282309222e-06, + "loss": 0.2229, + "step": 2471 + }, + { + "epoch": 0.3696724988784208, + "grad_norm": 1.3887976362795125, + "learning_rate": 7.268010206606361e-06, + "loss": 0.3222, + "step": 2472 + }, + { + "epoch": 0.3698220427695529, + "grad_norm": 1.2797777276840512, + "learning_rate": 7.265851598806446e-06, + "loss": 0.2033, + "step": 2473 + }, + { + "epoch": 0.3699715866606849, + "grad_norm": 1.191177092854937, + "learning_rate": 7.263692459415909e-06, + "loss": 0.2131, + "step": 2474 + }, + { + "epoch": 0.37012113055181695, + "grad_norm": 1.5776822158264978, + "learning_rate": 7.261532788941306e-06, + "loss": 0.3744, + "step": 2475 + }, + { + "epoch": 0.37027067444294903, + "grad_norm": 1.0854274920898752, + "learning_rate": 7.259372587889314e-06, + "loss": 0.233, + "step": 2476 + }, + { + "epoch": 0.37042021833408106, + "grad_norm": 1.2018243893675977, + "learning_rate": 7.25721185676674e-06, + "loss": 0.2363, + "step": 2477 + }, + { + "epoch": 0.3705697622252131, + "grad_norm": 0.9967469112175203, + "learning_rate": 7.25505059608051e-06, + "loss": 0.1983, + "step": 2478 + }, + { + "epoch": 0.3707193061163451, + "grad_norm": 1.2775894695719086, + "learning_rate": 7.252888806337678e-06, + "loss": 0.208, + "step": 2479 + }, + { + "epoch": 0.3708688500074772, + "grad_norm": 1.5952345161170771, + "learning_rate": 7.25072648804542e-06, + "loss": 0.2703, + "step": 2480 + }, + { + "epoch": 0.37101839389860924, + "grad_norm": 1.1681377853119423, + "learning_rate": 7.248563641711036e-06, + "loss": 0.2119, + "step": 2481 + }, + { + "epoch": 0.3711679377897413, + "grad_norm": 1.9271882432441645, + "learning_rate": 7.2464002678419524e-06, + "loss": 0.3501, + "step": 2482 + }, + { + "epoch": 0.37131748168087336, + "grad_norm": 1.4801233084882979, + "learning_rate": 7.244236366945715e-06, + "loss": 0.2263, + "step": 2483 + }, + { + "epoch": 0.3714670255720054, + "grad_norm": 1.464705478500087, + "learning_rate": 7.242071939529999e-06, + "loss": 0.2709, + "step": 2484 + }, + { + "epoch": 0.3716165694631374, + "grad_norm": 1.5487734242577513, + "learning_rate": 7.239906986102598e-06, + "loss": 0.1828, + "step": 2485 + }, + { + "epoch": 0.37176611335426946, + "grad_norm": 1.1511925750155394, + "learning_rate": 7.237741507171432e-06, + "loss": 0.1896, + "step": 2486 + }, + { + "epoch": 0.37191565724540154, + "grad_norm": 1.6178218431333782, + "learning_rate": 7.235575503244542e-06, + "loss": 0.3265, + "step": 2487 + }, + { + "epoch": 0.3720652011365336, + "grad_norm": 1.005983926413353, + "learning_rate": 7.233408974830093e-06, + "loss": 0.173, + "step": 2488 + }, + { + "epoch": 0.3722147450276656, + "grad_norm": 1.0227776759122873, + "learning_rate": 7.231241922436374e-06, + "loss": 0.1948, + "step": 2489 + }, + { + "epoch": 0.3723642889187977, + "grad_norm": 1.6953767597307763, + "learning_rate": 7.229074346571798e-06, + "loss": 0.5005, + "step": 2490 + }, + { + "epoch": 0.3725138328099297, + "grad_norm": 1.5875223115043986, + "learning_rate": 7.226906247744897e-06, + "loss": 0.3688, + "step": 2491 + }, + { + "epoch": 0.37266337670106175, + "grad_norm": 2.27185613487943, + "learning_rate": 7.2247376264643294e-06, + "loss": 0.4033, + "step": 2492 + }, + { + "epoch": 0.3728129205921938, + "grad_norm": 1.6290529057007825, + "learning_rate": 7.2225684832388745e-06, + "loss": 0.3383, + "step": 2493 + }, + { + "epoch": 0.3729624644833259, + "grad_norm": 1.0562741707118548, + "learning_rate": 7.220398818577432e-06, + "loss": 0.182, + "step": 2494 + }, + { + "epoch": 0.3731120083744579, + "grad_norm": 1.3801253740702135, + "learning_rate": 7.21822863298903e-06, + "loss": 0.4305, + "step": 2495 + }, + { + "epoch": 0.37326155226558994, + "grad_norm": 1.3558340102680362, + "learning_rate": 7.216057926982811e-06, + "loss": 0.2011, + "step": 2496 + }, + { + "epoch": 0.373411096156722, + "grad_norm": 1.6070744311623764, + "learning_rate": 7.213886701068047e-06, + "loss": 0.3777, + "step": 2497 + }, + { + "epoch": 0.37356064004785405, + "grad_norm": 1.0945948539403993, + "learning_rate": 7.211714955754125e-06, + "loss": 0.154, + "step": 2498 + }, + { + "epoch": 0.3737101839389861, + "grad_norm": 1.1736153236745563, + "learning_rate": 7.2095426915505605e-06, + "loss": 0.1973, + "step": 2499 + }, + { + "epoch": 0.3738597278301181, + "grad_norm": 1.5381442491891666, + "learning_rate": 7.207369908966987e-06, + "loss": 0.2319, + "step": 2500 + }, + { + "epoch": 0.3740092717212502, + "grad_norm": 1.143218245070331, + "learning_rate": 7.2051966085131584e-06, + "loss": 0.1947, + "step": 2501 + }, + { + "epoch": 0.37415881561238223, + "grad_norm": 1.756151915421153, + "learning_rate": 7.203022790698954e-06, + "loss": 0.4682, + "step": 2502 + }, + { + "epoch": 0.37430835950351427, + "grad_norm": 1.956298123550812, + "learning_rate": 7.20084845603437e-06, + "loss": 0.5118, + "step": 2503 + }, + { + "epoch": 0.37445790339464635, + "grad_norm": 1.9158174915897688, + "learning_rate": 7.198673605029529e-06, + "loss": 0.3117, + "step": 2504 + }, + { + "epoch": 0.3746074472857784, + "grad_norm": 4.295053824375029, + "learning_rate": 7.196498238194672e-06, + "loss": 0.2234, + "step": 2505 + }, + { + "epoch": 0.3747569911769104, + "grad_norm": 1.6941035724235585, + "learning_rate": 7.194322356040159e-06, + "loss": 0.2312, + "step": 2506 + }, + { + "epoch": 0.37490653506804245, + "grad_norm": 1.5640016182855432, + "learning_rate": 7.192145959076474e-06, + "loss": 0.3622, + "step": 2507 + }, + { + "epoch": 0.37505607895917453, + "grad_norm": 1.8466707952598986, + "learning_rate": 7.1899690478142196e-06, + "loss": 0.4891, + "step": 2508 + }, + { + "epoch": 0.37520562285030656, + "grad_norm": 1.4306072393075668, + "learning_rate": 7.18779162276412e-06, + "loss": 0.2988, + "step": 2509 + }, + { + "epoch": 0.3753551667414386, + "grad_norm": 1.4306403617067467, + "learning_rate": 7.185613684437024e-06, + "loss": 0.3184, + "step": 2510 + }, + { + "epoch": 0.3755047106325707, + "grad_norm": 1.6012766699315721, + "learning_rate": 7.183435233343892e-06, + "loss": 0.3621, + "step": 2511 + }, + { + "epoch": 0.3756542545237027, + "grad_norm": 1.2931435832729816, + "learning_rate": 7.181256269995813e-06, + "loss": 0.3318, + "step": 2512 + }, + { + "epoch": 0.37580379841483474, + "grad_norm": 1.422093979772559, + "learning_rate": 7.179076794903991e-06, + "loss": 0.2357, + "step": 2513 + }, + { + "epoch": 0.3759533423059668, + "grad_norm": 0.9257904335966806, + "learning_rate": 7.176896808579752e-06, + "loss": 0.1813, + "step": 2514 + }, + { + "epoch": 0.37610288619709886, + "grad_norm": 2.037815545812073, + "learning_rate": 7.174716311534542e-06, + "loss": 0.2783, + "step": 2515 + }, + { + "epoch": 0.3762524300882309, + "grad_norm": 1.71379595660131, + "learning_rate": 7.172535304279926e-06, + "loss": 0.4839, + "step": 2516 + }, + { + "epoch": 0.3764019739793629, + "grad_norm": 2.00916868636801, + "learning_rate": 7.170353787327593e-06, + "loss": 0.6153, + "step": 2517 + }, + { + "epoch": 0.376551517870495, + "grad_norm": 1.5789476931879063, + "learning_rate": 7.168171761189343e-06, + "loss": 0.2031, + "step": 2518 + }, + { + "epoch": 0.37670106176162704, + "grad_norm": 1.684853217653668, + "learning_rate": 7.165989226377103e-06, + "loss": 0.3816, + "step": 2519 + }, + { + "epoch": 0.3768506056527591, + "grad_norm": 1.9200462619229837, + "learning_rate": 7.163806183402916e-06, + "loss": 0.4962, + "step": 2520 + }, + { + "epoch": 0.3770001495438911, + "grad_norm": 1.8855548807104212, + "learning_rate": 7.161622632778944e-06, + "loss": 0.2185, + "step": 2521 + }, + { + "epoch": 0.3771496934350232, + "grad_norm": 1.6740717226550226, + "learning_rate": 7.159438575017471e-06, + "loss": 0.3777, + "step": 2522 + }, + { + "epoch": 0.3772992373261552, + "grad_norm": 1.397870156480417, + "learning_rate": 7.157254010630896e-06, + "loss": 0.2426, + "step": 2523 + }, + { + "epoch": 0.37744878121728725, + "grad_norm": 1.6145415521158457, + "learning_rate": 7.155068940131741e-06, + "loss": 0.3926, + "step": 2524 + }, + { + "epoch": 0.37759832510841934, + "grad_norm": 1.7042594169803778, + "learning_rate": 7.152883364032644e-06, + "loss": 0.1917, + "step": 2525 + }, + { + "epoch": 0.3777478689995514, + "grad_norm": 1.5289841742678918, + "learning_rate": 7.15069728284636e-06, + "loss": 0.351, + "step": 2526 + }, + { + "epoch": 0.3778974128906834, + "grad_norm": 2.395918097586016, + "learning_rate": 7.148510697085767e-06, + "loss": 0.3126, + "step": 2527 + }, + { + "epoch": 0.37804695678181544, + "grad_norm": 2.703467639529761, + "learning_rate": 7.146323607263859e-06, + "loss": 0.2623, + "step": 2528 + }, + { + "epoch": 0.3781965006729475, + "grad_norm": 1.7671642324701808, + "learning_rate": 7.144136013893745e-06, + "loss": 0.2137, + "step": 2529 + }, + { + "epoch": 0.37834604456407955, + "grad_norm": 1.932948758218732, + "learning_rate": 7.141947917488663e-06, + "loss": 0.3711, + "step": 2530 + }, + { + "epoch": 0.3784955884552116, + "grad_norm": 1.2026669943340598, + "learning_rate": 7.139759318561954e-06, + "loss": 0.171, + "step": 2531 + }, + { + "epoch": 0.37864513234634367, + "grad_norm": 1.7885589140732676, + "learning_rate": 7.137570217627088e-06, + "loss": 0.4937, + "step": 2532 + }, + { + "epoch": 0.3787946762374757, + "grad_norm": 1.6235037343132346, + "learning_rate": 7.135380615197649e-06, + "loss": 0.4338, + "step": 2533 + }, + { + "epoch": 0.37894422012860773, + "grad_norm": 1.4352072567044432, + "learning_rate": 7.133190511787337e-06, + "loss": 0.203, + "step": 2534 + }, + { + "epoch": 0.3790937640197398, + "grad_norm": 2.021523435648341, + "learning_rate": 7.130999907909972e-06, + "loss": 0.4868, + "step": 2535 + }, + { + "epoch": 0.37924330791087185, + "grad_norm": 1.575689088228456, + "learning_rate": 7.128808804079492e-06, + "loss": 0.3587, + "step": 2536 + }, + { + "epoch": 0.3793928518020039, + "grad_norm": 1.538572538393702, + "learning_rate": 7.126617200809951e-06, + "loss": 0.3164, + "step": 2537 + }, + { + "epoch": 0.3795423956931359, + "grad_norm": 2.1498396130276727, + "learning_rate": 7.12442509861552e-06, + "loss": 0.6093, + "step": 2538 + }, + { + "epoch": 0.379691939584268, + "grad_norm": 1.5246107523171812, + "learning_rate": 7.122232498010486e-06, + "loss": 0.2357, + "step": 2539 + }, + { + "epoch": 0.37984148347540003, + "grad_norm": 1.5356177541463252, + "learning_rate": 7.120039399509257e-06, + "loss": 0.2945, + "step": 2540 + }, + { + "epoch": 0.37999102736653206, + "grad_norm": 1.2696297372616192, + "learning_rate": 7.117845803626352e-06, + "loss": 0.2568, + "step": 2541 + }, + { + "epoch": 0.38014057125766415, + "grad_norm": 1.2940629840823208, + "learning_rate": 7.115651710876411e-06, + "loss": 0.2167, + "step": 2542 + }, + { + "epoch": 0.3802901151487962, + "grad_norm": 1.6760687859259409, + "learning_rate": 7.11345712177419e-06, + "loss": 0.4574, + "step": 2543 + }, + { + "epoch": 0.3804396590399282, + "grad_norm": 1.8274306217532401, + "learning_rate": 7.111262036834559e-06, + "loss": 0.1839, + "step": 2544 + }, + { + "epoch": 0.38058920293106024, + "grad_norm": 1.7239605405645464, + "learning_rate": 7.109066456572508e-06, + "loss": 0.3814, + "step": 2545 + }, + { + "epoch": 0.38073874682219233, + "grad_norm": 2.0774382697068146, + "learning_rate": 7.106870381503139e-06, + "loss": 0.3502, + "step": 2546 + }, + { + "epoch": 0.38088829071332436, + "grad_norm": 1.6407126721162997, + "learning_rate": 7.104673812141676e-06, + "loss": 0.3768, + "step": 2547 + }, + { + "epoch": 0.3810378346044564, + "grad_norm": 1.4264932056782313, + "learning_rate": 7.102476749003453e-06, + "loss": 0.2587, + "step": 2548 + }, + { + "epoch": 0.3811873784955885, + "grad_norm": 1.4315314856298393, + "learning_rate": 7.1002791926039204e-06, + "loss": 0.1918, + "step": 2549 + }, + { + "epoch": 0.3813369223867205, + "grad_norm": 2.268263838831847, + "learning_rate": 7.098081143458649e-06, + "loss": 0.2129, + "step": 2550 + }, + { + "epoch": 0.38148646627785254, + "grad_norm": 1.4599724810600025, + "learning_rate": 7.095882602083321e-06, + "loss": 0.3225, + "step": 2551 + }, + { + "epoch": 0.3816360101689846, + "grad_norm": 1.4198436584338292, + "learning_rate": 7.0936835689937366e-06, + "loss": 0.1947, + "step": 2552 + }, + { + "epoch": 0.38178555406011666, + "grad_norm": 1.3581488206510595, + "learning_rate": 7.09148404470581e-06, + "loss": 0.2492, + "step": 2553 + }, + { + "epoch": 0.3819350979512487, + "grad_norm": 1.8329873345541536, + "learning_rate": 7.089284029735568e-06, + "loss": 0.1999, + "step": 2554 + }, + { + "epoch": 0.3820846418423807, + "grad_norm": 1.879696219955982, + "learning_rate": 7.087083524599158e-06, + "loss": 0.2107, + "step": 2555 + }, + { + "epoch": 0.3822341857335128, + "grad_norm": 1.2252237248239084, + "learning_rate": 7.08488252981284e-06, + "loss": 0.1991, + "step": 2556 + }, + { + "epoch": 0.38238372962464484, + "grad_norm": 1.952002278934872, + "learning_rate": 7.082681045892988e-06, + "loss": 0.3916, + "step": 2557 + }, + { + "epoch": 0.3825332735157769, + "grad_norm": 1.7256855312550468, + "learning_rate": 7.08047907335609e-06, + "loss": 0.3116, + "step": 2558 + }, + { + "epoch": 0.3826828174069089, + "grad_norm": 1.587038951947857, + "learning_rate": 7.078276612718752e-06, + "loss": 0.2224, + "step": 2559 + }, + { + "epoch": 0.382832361298041, + "grad_norm": 1.8262578444211246, + "learning_rate": 7.076073664497691e-06, + "loss": 0.3618, + "step": 2560 + }, + { + "epoch": 0.382981905189173, + "grad_norm": 2.0217394092028167, + "learning_rate": 7.07387022920974e-06, + "loss": 0.2429, + "step": 2561 + }, + { + "epoch": 0.38313144908030505, + "grad_norm": 1.9141258813134443, + "learning_rate": 7.071666307371847e-06, + "loss": 0.4741, + "step": 2562 + }, + { + "epoch": 0.38328099297143714, + "grad_norm": 1.432198404107898, + "learning_rate": 7.069461899501073e-06, + "loss": 0.1918, + "step": 2563 + }, + { + "epoch": 0.38343053686256917, + "grad_norm": 1.5928510931135798, + "learning_rate": 7.067257006114593e-06, + "loss": 0.3226, + "step": 2564 + }, + { + "epoch": 0.3835800807537012, + "grad_norm": 1.6585372345576896, + "learning_rate": 7.065051627729698e-06, + "loss": 0.3276, + "step": 2565 + }, + { + "epoch": 0.38372962464483323, + "grad_norm": 1.2734067275418923, + "learning_rate": 7.062845764863787e-06, + "loss": 0.3313, + "step": 2566 + }, + { + "epoch": 0.3838791685359653, + "grad_norm": 1.1374789118122002, + "learning_rate": 7.0606394180343805e-06, + "loss": 0.1907, + "step": 2567 + }, + { + "epoch": 0.38402871242709735, + "grad_norm": 1.549559123650284, + "learning_rate": 7.058432587759107e-06, + "loss": 0.3915, + "step": 2568 + }, + { + "epoch": 0.3841782563182294, + "grad_norm": 1.4708906613930632, + "learning_rate": 7.0562252745557115e-06, + "loss": 0.1992, + "step": 2569 + }, + { + "epoch": 0.38432780020936147, + "grad_norm": 1.6323056255608734, + "learning_rate": 7.054017478942048e-06, + "loss": 0.2221, + "step": 2570 + }, + { + "epoch": 0.3844773441004935, + "grad_norm": 1.5203359282551667, + "learning_rate": 7.0518092014360905e-06, + "loss": 0.3545, + "step": 2571 + }, + { + "epoch": 0.38462688799162553, + "grad_norm": 1.159124982100583, + "learning_rate": 7.0496004425559195e-06, + "loss": 0.1549, + "step": 2572 + }, + { + "epoch": 0.38477643188275756, + "grad_norm": 2.1232683966349586, + "learning_rate": 7.047391202819734e-06, + "loss": 0.6264, + "step": 2573 + }, + { + "epoch": 0.38492597577388965, + "grad_norm": 1.0622764450389663, + "learning_rate": 7.045181482745837e-06, + "loss": 0.2163, + "step": 2574 + }, + { + "epoch": 0.3850755196650217, + "grad_norm": 1.077605292843362, + "learning_rate": 7.042971282852656e-06, + "loss": 0.2079, + "step": 2575 + }, + { + "epoch": 0.3852250635561537, + "grad_norm": 1.0767369768412656, + "learning_rate": 7.040760603658723e-06, + "loss": 0.2035, + "step": 2576 + }, + { + "epoch": 0.3853746074472858, + "grad_norm": 1.5110599625002836, + "learning_rate": 7.038549445682685e-06, + "loss": 0.1561, + "step": 2577 + }, + { + "epoch": 0.38552415133841783, + "grad_norm": 1.6236972538616068, + "learning_rate": 7.036337809443301e-06, + "loss": 0.4143, + "step": 2578 + }, + { + "epoch": 0.38567369522954986, + "grad_norm": 1.6146980603793564, + "learning_rate": 7.0341256954594415e-06, + "loss": 0.332, + "step": 2579 + }, + { + "epoch": 0.3858232391206819, + "grad_norm": 2.047834790133371, + "learning_rate": 7.031913104250091e-06, + "loss": 0.4432, + "step": 2580 + }, + { + "epoch": 0.385972783011814, + "grad_norm": 1.5013503959890904, + "learning_rate": 7.029700036334344e-06, + "loss": 0.344, + "step": 2581 + }, + { + "epoch": 0.386122326902946, + "grad_norm": 1.0672948108215132, + "learning_rate": 7.027486492231407e-06, + "loss": 0.2127, + "step": 2582 + }, + { + "epoch": 0.38627187079407804, + "grad_norm": 1.6877080780242262, + "learning_rate": 7.025272472460598e-06, + "loss": 0.4702, + "step": 2583 + }, + { + "epoch": 0.38642141468521013, + "grad_norm": 1.5687512279976832, + "learning_rate": 7.02305797754135e-06, + "loss": 0.345, + "step": 2584 + }, + { + "epoch": 0.38657095857634216, + "grad_norm": 1.268209431704893, + "learning_rate": 7.020843007993203e-06, + "loss": 0.1817, + "step": 2585 + }, + { + "epoch": 0.3867205024674742, + "grad_norm": 2.1026474658538508, + "learning_rate": 7.018627564335813e-06, + "loss": 0.3595, + "step": 2586 + }, + { + "epoch": 0.3868700463586062, + "grad_norm": 1.283399204038543, + "learning_rate": 7.01641164708894e-06, + "loss": 0.3985, + "step": 2587 + }, + { + "epoch": 0.3870195902497383, + "grad_norm": 1.5628231644223662, + "learning_rate": 7.014195256772462e-06, + "loss": 0.243, + "step": 2588 + }, + { + "epoch": 0.38716913414087034, + "grad_norm": 1.4666711014261535, + "learning_rate": 7.011978393906366e-06, + "loss": 0.3757, + "step": 2589 + }, + { + "epoch": 0.3873186780320024, + "grad_norm": 1.7424029593387464, + "learning_rate": 7.009761059010746e-06, + "loss": 0.3297, + "step": 2590 + }, + { + "epoch": 0.38746822192313446, + "grad_norm": 1.793037388304316, + "learning_rate": 7.007543252605815e-06, + "loss": 0.1902, + "step": 2591 + }, + { + "epoch": 0.3876177658142665, + "grad_norm": 1.3063151548645278, + "learning_rate": 7.005324975211889e-06, + "loss": 0.2154, + "step": 2592 + }, + { + "epoch": 0.3877673097053985, + "grad_norm": 1.5791521412257716, + "learning_rate": 7.003106227349399e-06, + "loss": 0.353, + "step": 2593 + }, + { + "epoch": 0.3879168535965306, + "grad_norm": 1.1004684710498887, + "learning_rate": 7.0008870095388815e-06, + "loss": 0.1875, + "step": 2594 + }, + { + "epoch": 0.38806639748766264, + "grad_norm": 1.5070698124572854, + "learning_rate": 6.998667322300989e-06, + "loss": 0.3211, + "step": 2595 + }, + { + "epoch": 0.38821594137879467, + "grad_norm": 1.5281886376155964, + "learning_rate": 6.9964471661564815e-06, + "loss": 0.2161, + "step": 2596 + }, + { + "epoch": 0.3883654852699267, + "grad_norm": 1.5252156175553138, + "learning_rate": 6.994226541626227e-06, + "loss": 0.4639, + "step": 2597 + }, + { + "epoch": 0.3885150291610588, + "grad_norm": 1.0809213992053146, + "learning_rate": 6.9920054492312086e-06, + "loss": 0.1773, + "step": 2598 + }, + { + "epoch": 0.3886645730521908, + "grad_norm": 1.0167620236333288, + "learning_rate": 6.989783889492512e-06, + "loss": 0.197, + "step": 2599 + }, + { + "epoch": 0.38881411694332285, + "grad_norm": 1.9918245714247895, + "learning_rate": 6.98756186293134e-06, + "loss": 0.2173, + "step": 2600 + }, + { + "epoch": 0.38896366083445494, + "grad_norm": 1.134571833649737, + "learning_rate": 6.9853393700689995e-06, + "loss": 0.2177, + "step": 2601 + }, + { + "epoch": 0.38911320472558697, + "grad_norm": 1.2111914722156627, + "learning_rate": 6.98311641142691e-06, + "loss": 0.189, + "step": 2602 + }, + { + "epoch": 0.389262748616719, + "grad_norm": 1.5006726664035277, + "learning_rate": 6.9808929875265974e-06, + "loss": 0.3254, + "step": 2603 + }, + { + "epoch": 0.38941229250785103, + "grad_norm": 1.6055883887420528, + "learning_rate": 6.9786690988897e-06, + "loss": 0.3344, + "step": 2604 + }, + { + "epoch": 0.3895618363989831, + "grad_norm": 0.9973538599901819, + "learning_rate": 6.9764447460379625e-06, + "loss": 0.1393, + "step": 2605 + }, + { + "epoch": 0.38971138029011515, + "grad_norm": 1.6233042755021416, + "learning_rate": 6.9742199294932415e-06, + "loss": 0.204, + "step": 2606 + }, + { + "epoch": 0.3898609241812472, + "grad_norm": 1.3344019910552745, + "learning_rate": 6.971994649777497e-06, + "loss": 0.1945, + "step": 2607 + }, + { + "epoch": 0.39001046807237927, + "grad_norm": 1.3137908772185043, + "learning_rate": 6.969768907412804e-06, + "loss": 0.1984, + "step": 2608 + }, + { + "epoch": 0.3901600119635113, + "grad_norm": 1.777140683321665, + "learning_rate": 6.9675427029213405e-06, + "loss": 0.3008, + "step": 2609 + }, + { + "epoch": 0.39030955585464333, + "grad_norm": 1.0681843692748616, + "learning_rate": 6.965316036825398e-06, + "loss": 0.2006, + "step": 2610 + }, + { + "epoch": 0.39045909974577536, + "grad_norm": 1.8543912871172343, + "learning_rate": 6.963088909647372e-06, + "loss": 0.5414, + "step": 2611 + }, + { + "epoch": 0.39060864363690745, + "grad_norm": 1.9579644690684397, + "learning_rate": 6.960861321909769e-06, + "loss": 0.2134, + "step": 2612 + }, + { + "epoch": 0.3907581875280395, + "grad_norm": 1.5162323884971123, + "learning_rate": 6.9586332741352025e-06, + "loss": 0.4165, + "step": 2613 + }, + { + "epoch": 0.3909077314191715, + "grad_norm": 1.6022574800937832, + "learning_rate": 6.956404766846394e-06, + "loss": 0.2272, + "step": 2614 + }, + { + "epoch": 0.3910572753103036, + "grad_norm": 1.6770508498447996, + "learning_rate": 6.954175800566172e-06, + "loss": 0.4977, + "step": 2615 + }, + { + "epoch": 0.39120681920143563, + "grad_norm": 1.8009556625869532, + "learning_rate": 6.9519463758174745e-06, + "loss": 0.5158, + "step": 2616 + }, + { + "epoch": 0.39135636309256766, + "grad_norm": 1.3758659710920118, + "learning_rate": 6.949716493123345e-06, + "loss": 0.1789, + "step": 2617 + }, + { + "epoch": 0.3915059069836997, + "grad_norm": 1.3547231618901736, + "learning_rate": 6.947486153006937e-06, + "loss": 0.3534, + "step": 2618 + }, + { + "epoch": 0.3916554508748318, + "grad_norm": 1.5074942912698235, + "learning_rate": 6.945255355991509e-06, + "loss": 0.2497, + "step": 2619 + }, + { + "epoch": 0.3918049947659638, + "grad_norm": 1.2442646683398755, + "learning_rate": 6.943024102600428e-06, + "loss": 0.1861, + "step": 2620 + }, + { + "epoch": 0.39195453865709584, + "grad_norm": 1.5075321060331324, + "learning_rate": 6.940792393357165e-06, + "loss": 0.2194, + "step": 2621 + }, + { + "epoch": 0.39210408254822793, + "grad_norm": 1.683076961752896, + "learning_rate": 6.938560228785304e-06, + "loss": 0.2156, + "step": 2622 + }, + { + "epoch": 0.39225362643935996, + "grad_norm": 1.7112132072394493, + "learning_rate": 6.9363276094085296e-06, + "loss": 0.3958, + "step": 2623 + }, + { + "epoch": 0.392403170330492, + "grad_norm": 0.7048488756753248, + "learning_rate": 6.934094535750638e-06, + "loss": 0.2183, + "step": 2624 + }, + { + "epoch": 0.392552714221624, + "grad_norm": 1.7586652848799478, + "learning_rate": 6.931861008335527e-06, + "loss": 0.469, + "step": 2625 + }, + { + "epoch": 0.3927022581127561, + "grad_norm": 1.1588740664998758, + "learning_rate": 6.929627027687207e-06, + "loss": 0.2168, + "step": 2626 + }, + { + "epoch": 0.39285180200388814, + "grad_norm": 1.548868104243127, + "learning_rate": 6.927392594329789e-06, + "loss": 0.2312, + "step": 2627 + }, + { + "epoch": 0.39300134589502017, + "grad_norm": 1.595689192189307, + "learning_rate": 6.925157708787493e-06, + "loss": 0.4765, + "step": 2628 + }, + { + "epoch": 0.39315088978615226, + "grad_norm": 1.0277723372062315, + "learning_rate": 6.922922371584647e-06, + "loss": 0.1517, + "step": 2629 + }, + { + "epoch": 0.3933004336772843, + "grad_norm": 1.7591622574617085, + "learning_rate": 6.920686583245679e-06, + "loss": 0.3411, + "step": 2630 + }, + { + "epoch": 0.3934499775684163, + "grad_norm": 1.6744764890594985, + "learning_rate": 6.918450344295129e-06, + "loss": 0.2044, + "step": 2631 + }, + { + "epoch": 0.39359952145954835, + "grad_norm": 1.4007607932440016, + "learning_rate": 6.916213655257639e-06, + "loss": 0.3232, + "step": 2632 + }, + { + "epoch": 0.39374906535068044, + "grad_norm": 1.4204862226075206, + "learning_rate": 6.9139765166579576e-06, + "loss": 0.2314, + "step": 2633 + }, + { + "epoch": 0.39389860924181247, + "grad_norm": 1.3405577478235213, + "learning_rate": 6.9117389290209415e-06, + "loss": 0.211, + "step": 2634 + }, + { + "epoch": 0.3940481531329445, + "grad_norm": 1.454026439973042, + "learning_rate": 6.909500892871547e-06, + "loss": 0.33, + "step": 2635 + }, + { + "epoch": 0.3941976970240766, + "grad_norm": 1.3439856339271192, + "learning_rate": 6.907262408734842e-06, + "loss": 0.1815, + "step": 2636 + }, + { + "epoch": 0.3943472409152086, + "grad_norm": 1.6261472549425744, + "learning_rate": 6.905023477135992e-06, + "loss": 0.3739, + "step": 2637 + }, + { + "epoch": 0.39449678480634065, + "grad_norm": 1.576356761161988, + "learning_rate": 6.902784098600277e-06, + "loss": 0.3767, + "step": 2638 + }, + { + "epoch": 0.3946463286974727, + "grad_norm": 1.339777085672714, + "learning_rate": 6.9005442736530745e-06, + "loss": 0.1876, + "step": 2639 + }, + { + "epoch": 0.39479587258860477, + "grad_norm": 1.4248530907151669, + "learning_rate": 6.898304002819869e-06, + "loss": 0.2061, + "step": 2640 + }, + { + "epoch": 0.3949454164797368, + "grad_norm": 1.8979002423907008, + "learning_rate": 6.896063286626251e-06, + "loss": 0.3686, + "step": 2641 + }, + { + "epoch": 0.39509496037086883, + "grad_norm": 1.4615671922332303, + "learning_rate": 6.893822125597911e-06, + "loss": 0.2747, + "step": 2642 + }, + { + "epoch": 0.3952445042620009, + "grad_norm": 1.1050718617792725, + "learning_rate": 6.891580520260649e-06, + "loss": 0.1886, + "step": 2643 + }, + { + "epoch": 0.39539404815313295, + "grad_norm": 1.1930459776952358, + "learning_rate": 6.8893384711403675e-06, + "loss": 0.1788, + "step": 2644 + }, + { + "epoch": 0.395543592044265, + "grad_norm": 1.0036636367374232, + "learning_rate": 6.887095978763072e-06, + "loss": 0.1815, + "step": 2645 + }, + { + "epoch": 0.39569313593539707, + "grad_norm": 1.751476920569849, + "learning_rate": 6.884853043654876e-06, + "loss": 0.3909, + "step": 2646 + }, + { + "epoch": 0.3958426798265291, + "grad_norm": 1.5414130565030935, + "learning_rate": 6.882609666341988e-06, + "loss": 0.383, + "step": 2647 + }, + { + "epoch": 0.39599222371766113, + "grad_norm": 1.7873679635826936, + "learning_rate": 6.88036584735073e-06, + "loss": 0.5987, + "step": 2648 + }, + { + "epoch": 0.39614176760879316, + "grad_norm": 1.4503283763937924, + "learning_rate": 6.878121587207522e-06, + "loss": 0.1782, + "step": 2649 + }, + { + "epoch": 0.39629131149992525, + "grad_norm": 1.1935592615073942, + "learning_rate": 6.875876886438889e-06, + "loss": 0.3002, + "step": 2650 + }, + { + "epoch": 0.3964408553910573, + "grad_norm": 1.471497098484429, + "learning_rate": 6.873631745571461e-06, + "loss": 0.2105, + "step": 2651 + }, + { + "epoch": 0.3965903992821893, + "grad_norm": 1.7129740678391, + "learning_rate": 6.871386165131968e-06, + "loss": 0.2233, + "step": 2652 + }, + { + "epoch": 0.3967399431733214, + "grad_norm": 1.5756431198477898, + "learning_rate": 6.869140145647245e-06, + "loss": 0.4151, + "step": 2653 + }, + { + "epoch": 0.39688948706445343, + "grad_norm": 1.238820133220739, + "learning_rate": 6.866893687644232e-06, + "loss": 0.2, + "step": 2654 + }, + { + "epoch": 0.39703903095558546, + "grad_norm": 1.9264407972582287, + "learning_rate": 6.864646791649966e-06, + "loss": 0.4677, + "step": 2655 + }, + { + "epoch": 0.3971885748467175, + "grad_norm": 1.550112419127813, + "learning_rate": 6.862399458191593e-06, + "loss": 0.3658, + "step": 2656 + }, + { + "epoch": 0.3973381187378496, + "grad_norm": 1.5119124813290858, + "learning_rate": 6.860151687796359e-06, + "loss": 0.2068, + "step": 2657 + }, + { + "epoch": 0.3974876626289816, + "grad_norm": 1.4272874845074974, + "learning_rate": 6.857903480991611e-06, + "loss": 0.2713, + "step": 2658 + }, + { + "epoch": 0.39763720652011364, + "grad_norm": 1.6594720979073507, + "learning_rate": 6.855654838304802e-06, + "loss": 0.3205, + "step": 2659 + }, + { + "epoch": 0.3977867504112457, + "grad_norm": 2.407352823346937, + "learning_rate": 6.853405760263485e-06, + "loss": 0.2088, + "step": 2660 + }, + { + "epoch": 0.39793629430237776, + "grad_norm": 1.7959792599685889, + "learning_rate": 6.851156247395313e-06, + "loss": 0.393, + "step": 2661 + }, + { + "epoch": 0.3980858381935098, + "grad_norm": 1.4851282768283998, + "learning_rate": 6.848906300228047e-06, + "loss": 0.2354, + "step": 2662 + }, + { + "epoch": 0.3982353820846418, + "grad_norm": 1.4691880516424127, + "learning_rate": 6.846655919289543e-06, + "loss": 0.231, + "step": 2663 + }, + { + "epoch": 0.3983849259757739, + "grad_norm": 1.5828677803244833, + "learning_rate": 6.844405105107763e-06, + "loss": 0.1887, + "step": 2664 + }, + { + "epoch": 0.39853446986690594, + "grad_norm": 1.5541799378105448, + "learning_rate": 6.842153858210772e-06, + "loss": 0.3784, + "step": 2665 + }, + { + "epoch": 0.39868401375803797, + "grad_norm": 1.9024949023533122, + "learning_rate": 6.83990217912673e-06, + "loss": 0.4826, + "step": 2666 + }, + { + "epoch": 0.39883355764917006, + "grad_norm": 1.7789769644036557, + "learning_rate": 6.837650068383908e-06, + "loss": 0.4536, + "step": 2667 + }, + { + "epoch": 0.3989831015403021, + "grad_norm": 1.6091866012593252, + "learning_rate": 6.835397526510667e-06, + "loss": 0.4931, + "step": 2668 + }, + { + "epoch": 0.3991326454314341, + "grad_norm": 1.4778294602532251, + "learning_rate": 6.83314455403548e-06, + "loss": 0.3916, + "step": 2669 + }, + { + "epoch": 0.39928218932256615, + "grad_norm": 1.4396363740839195, + "learning_rate": 6.8308911514869125e-06, + "loss": 0.2875, + "step": 2670 + }, + { + "epoch": 0.39943173321369824, + "grad_norm": 1.5237349879970323, + "learning_rate": 6.828637319393636e-06, + "loss": 0.2372, + "step": 2671 + }, + { + "epoch": 0.39958127710483027, + "grad_norm": 1.4821885218044735, + "learning_rate": 6.826383058284421e-06, + "loss": 0.3351, + "step": 2672 + }, + { + "epoch": 0.3997308209959623, + "grad_norm": 1.6760124989546532, + "learning_rate": 6.824128368688139e-06, + "loss": 0.3599, + "step": 2673 + }, + { + "epoch": 0.3998803648870944, + "grad_norm": 1.0076444138222016, + "learning_rate": 6.821873251133764e-06, + "loss": 0.1925, + "step": 2674 + }, + { + "epoch": 0.4000299087782264, + "grad_norm": 1.0371695095164526, + "learning_rate": 6.819617706150363e-06, + "loss": 0.177, + "step": 2675 + }, + { + "epoch": 0.40017945266935845, + "grad_norm": 1.2181986301270449, + "learning_rate": 6.817361734267114e-06, + "loss": 0.2938, + "step": 2676 + }, + { + "epoch": 0.4003289965604905, + "grad_norm": 1.4070959430963377, + "learning_rate": 6.815105336013287e-06, + "loss": 0.1739, + "step": 2677 + }, + { + "epoch": 0.40047854045162257, + "grad_norm": 2.0098290688275737, + "learning_rate": 6.812848511918254e-06, + "loss": 0.3152, + "step": 2678 + }, + { + "epoch": 0.4006280843427546, + "grad_norm": 1.543663585518377, + "learning_rate": 6.810591262511492e-06, + "loss": 0.3398, + "step": 2679 + }, + { + "epoch": 0.40077762823388663, + "grad_norm": 1.231130815440152, + "learning_rate": 6.808333588322569e-06, + "loss": 0.2017, + "step": 2680 + }, + { + "epoch": 0.4009271721250187, + "grad_norm": 1.8122501869118024, + "learning_rate": 6.806075489881158e-06, + "loss": 0.1972, + "step": 2681 + }, + { + "epoch": 0.40107671601615075, + "grad_norm": 1.7484305402942781, + "learning_rate": 6.803816967717031e-06, + "loss": 0.4729, + "step": 2682 + }, + { + "epoch": 0.4012262599072828, + "grad_norm": 1.3000952584298922, + "learning_rate": 6.801558022360061e-06, + "loss": 0.2636, + "step": 2683 + }, + { + "epoch": 0.4013758037984148, + "grad_norm": 1.8438350811417317, + "learning_rate": 6.799298654340215e-06, + "loss": 0.5354, + "step": 2684 + }, + { + "epoch": 0.4015253476895469, + "grad_norm": 2.2848085970399303, + "learning_rate": 6.797038864187564e-06, + "loss": 0.8736, + "step": 2685 + }, + { + "epoch": 0.4016748915806789, + "grad_norm": 1.4866529052233595, + "learning_rate": 6.794778652432278e-06, + "loss": 0.2244, + "step": 2686 + }, + { + "epoch": 0.40182443547181096, + "grad_norm": 1.2521236529684348, + "learning_rate": 6.792518019604624e-06, + "loss": 0.2239, + "step": 2687 + }, + { + "epoch": 0.40197397936294305, + "grad_norm": 1.576212342415195, + "learning_rate": 6.790256966234966e-06, + "loss": 0.2802, + "step": 2688 + }, + { + "epoch": 0.4021235232540751, + "grad_norm": 1.2579155325895004, + "learning_rate": 6.787995492853771e-06, + "loss": 0.2094, + "step": 2689 + }, + { + "epoch": 0.4022730671452071, + "grad_norm": 1.4213546092771336, + "learning_rate": 6.785733599991602e-06, + "loss": 0.1914, + "step": 2690 + }, + { + "epoch": 0.40242261103633914, + "grad_norm": 0.8485344894569877, + "learning_rate": 6.78347128817912e-06, + "loss": 0.224, + "step": 2691 + }, + { + "epoch": 0.4025721549274712, + "grad_norm": 1.6595476897588828, + "learning_rate": 6.781208557947085e-06, + "loss": 0.3873, + "step": 2692 + }, + { + "epoch": 0.40272169881860326, + "grad_norm": 1.0569857811012024, + "learning_rate": 6.778945409826356e-06, + "loss": 0.2516, + "step": 2693 + }, + { + "epoch": 0.4028712427097353, + "grad_norm": 1.2314194821001476, + "learning_rate": 6.776681844347892e-06, + "loss": 0.2237, + "step": 2694 + }, + { + "epoch": 0.4030207866008674, + "grad_norm": 0.9731617862379426, + "learning_rate": 6.7744178620427435e-06, + "loss": 0.2348, + "step": 2695 + }, + { + "epoch": 0.4031703304919994, + "grad_norm": 1.5995980706007595, + "learning_rate": 6.7721534634420635e-06, + "loss": 0.2542, + "step": 2696 + }, + { + "epoch": 0.40331987438313144, + "grad_norm": 1.788090656384711, + "learning_rate": 6.769888649077103e-06, + "loss": 0.2238, + "step": 2697 + }, + { + "epoch": 0.40346941827426347, + "grad_norm": 1.6873214415283535, + "learning_rate": 6.767623419479207e-06, + "loss": 0.4441, + "step": 2698 + }, + { + "epoch": 0.40361896216539556, + "grad_norm": 1.7639180599847581, + "learning_rate": 6.765357775179822e-06, + "loss": 0.4911, + "step": 2699 + }, + { + "epoch": 0.4037685060565276, + "grad_norm": 1.3115613907742758, + "learning_rate": 6.76309171671049e-06, + "loss": 0.3307, + "step": 2700 + }, + { + "epoch": 0.4039180499476596, + "grad_norm": 1.383566483800298, + "learning_rate": 6.760825244602849e-06, + "loss": 0.2378, + "step": 2701 + }, + { + "epoch": 0.4040675938387917, + "grad_norm": 1.0532366239269708, + "learning_rate": 6.7585583593886376e-06, + "loss": 0.3177, + "step": 2702 + }, + { + "epoch": 0.40421713772992374, + "grad_norm": 1.7117686930778293, + "learning_rate": 6.7562910615996854e-06, + "loss": 0.3372, + "step": 2703 + }, + { + "epoch": 0.40436668162105577, + "grad_norm": 1.59208336340781, + "learning_rate": 6.754023351767924e-06, + "loss": 0.2335, + "step": 2704 + }, + { + "epoch": 0.40451622551218785, + "grad_norm": 2.307120871555123, + "learning_rate": 6.7517552304253815e-06, + "loss": 0.4416, + "step": 2705 + }, + { + "epoch": 0.4046657694033199, + "grad_norm": 2.4127825210857075, + "learning_rate": 6.7494866981041784e-06, + "loss": 0.7162, + "step": 2706 + }, + { + "epoch": 0.4048153132944519, + "grad_norm": 1.768470281885931, + "learning_rate": 6.747217755336537e-06, + "loss": 0.3867, + "step": 2707 + }, + { + "epoch": 0.40496485718558395, + "grad_norm": 0.9575195254141551, + "learning_rate": 6.7449484026547705e-06, + "loss": 0.2125, + "step": 2708 + }, + { + "epoch": 0.40511440107671604, + "grad_norm": 1.2201809246818296, + "learning_rate": 6.742678640591293e-06, + "loss": 0.1848, + "step": 2709 + }, + { + "epoch": 0.40526394496784807, + "grad_norm": 1.022034673727343, + "learning_rate": 6.740408469678611e-06, + "loss": 0.1824, + "step": 2710 + }, + { + "epoch": 0.4054134888589801, + "grad_norm": 1.484686409132478, + "learning_rate": 6.738137890449329e-06, + "loss": 0.2005, + "step": 2711 + }, + { + "epoch": 0.4055630327501122, + "grad_norm": 1.5990317632752664, + "learning_rate": 6.735866903436146e-06, + "loss": 0.2366, + "step": 2712 + }, + { + "epoch": 0.4057125766412442, + "grad_norm": 0.840556680462754, + "learning_rate": 6.7335955091718595e-06, + "loss": 0.1896, + "step": 2713 + }, + { + "epoch": 0.40586212053237625, + "grad_norm": 1.9574024358578521, + "learning_rate": 6.731323708189358e-06, + "loss": 0.5026, + "step": 2714 + }, + { + "epoch": 0.4060116644235083, + "grad_norm": 1.0372558946433956, + "learning_rate": 6.7290515010216305e-06, + "loss": 0.2117, + "step": 2715 + }, + { + "epoch": 0.40616120831464037, + "grad_norm": 1.782559024684888, + "learning_rate": 6.726778888201757e-06, + "loss": 0.401, + "step": 2716 + }, + { + "epoch": 0.4063107522057724, + "grad_norm": 1.8957139836111772, + "learning_rate": 6.724505870262915e-06, + "loss": 0.454, + "step": 2717 + }, + { + "epoch": 0.4064602960969044, + "grad_norm": 0.9411673381180247, + "learning_rate": 6.722232447738375e-06, + "loss": 0.1434, + "step": 2718 + }, + { + "epoch": 0.4066098399880365, + "grad_norm": 1.528883853888666, + "learning_rate": 6.719958621161505e-06, + "loss": 0.2069, + "step": 2719 + }, + { + "epoch": 0.40675938387916855, + "grad_norm": 1.0474505826049523, + "learning_rate": 6.717684391065769e-06, + "loss": 0.2319, + "step": 2720 + }, + { + "epoch": 0.4069089277703006, + "grad_norm": 1.608085985877073, + "learning_rate": 6.715409757984718e-06, + "loss": 0.2077, + "step": 2721 + }, + { + "epoch": 0.4070584716614326, + "grad_norm": 1.213620862112801, + "learning_rate": 6.713134722452009e-06, + "loss": 0.2165, + "step": 2722 + }, + { + "epoch": 0.4072080155525647, + "grad_norm": 1.4959997727780725, + "learning_rate": 6.710859285001381e-06, + "loss": 0.2631, + "step": 2723 + }, + { + "epoch": 0.4073575594436967, + "grad_norm": 1.4898775205900512, + "learning_rate": 6.7085834461666765e-06, + "loss": 0.3716, + "step": 2724 + }, + { + "epoch": 0.40750710333482876, + "grad_norm": 2.940723192965302, + "learning_rate": 6.70630720648183e-06, + "loss": 0.2594, + "step": 2725 + }, + { + "epoch": 0.40765664722596084, + "grad_norm": 1.4132307468761955, + "learning_rate": 6.704030566480869e-06, + "loss": 0.174, + "step": 2726 + }, + { + "epoch": 0.4078061911170929, + "grad_norm": 1.1415584187313277, + "learning_rate": 6.701753526697915e-06, + "loss": 0.2006, + "step": 2727 + }, + { + "epoch": 0.4079557350082249, + "grad_norm": 1.4316392497827604, + "learning_rate": 6.699476087667183e-06, + "loss": 0.1804, + "step": 2728 + }, + { + "epoch": 0.40810527889935694, + "grad_norm": 1.828481566061616, + "learning_rate": 6.697198249922981e-06, + "loss": 0.3837, + "step": 2729 + }, + { + "epoch": 0.408254822790489, + "grad_norm": 1.5937472090977185, + "learning_rate": 6.694920013999715e-06, + "loss": 0.4825, + "step": 2730 + }, + { + "epoch": 0.40840436668162106, + "grad_norm": 1.1850332784758328, + "learning_rate": 6.692641380431879e-06, + "loss": 0.2363, + "step": 2731 + }, + { + "epoch": 0.4085539105727531, + "grad_norm": 1.3868683747713086, + "learning_rate": 6.690362349754062e-06, + "loss": 0.2932, + "step": 2732 + }, + { + "epoch": 0.4087034544638852, + "grad_norm": 2.021537411496191, + "learning_rate": 6.6880829225009455e-06, + "loss": 0.6287, + "step": 2733 + }, + { + "epoch": 0.4088529983550172, + "grad_norm": 1.1157967473978323, + "learning_rate": 6.685803099207309e-06, + "loss": 0.2041, + "step": 2734 + }, + { + "epoch": 0.40900254224614924, + "grad_norm": 1.5873384207624706, + "learning_rate": 6.683522880408019e-06, + "loss": 0.3283, + "step": 2735 + }, + { + "epoch": 0.40915208613728127, + "grad_norm": 1.7562023141944056, + "learning_rate": 6.681242266638035e-06, + "loss": 0.2274, + "step": 2736 + }, + { + "epoch": 0.40930163002841335, + "grad_norm": 2.768807471305314, + "learning_rate": 6.6789612584324135e-06, + "loss": 0.5855, + "step": 2737 + }, + { + "epoch": 0.4094511739195454, + "grad_norm": 1.5373476587783403, + "learning_rate": 6.676679856326298e-06, + "loss": 0.3774, + "step": 2738 + }, + { + "epoch": 0.4096007178106774, + "grad_norm": 1.3129213939580835, + "learning_rate": 6.674398060854931e-06, + "loss": 0.3354, + "step": 2739 + }, + { + "epoch": 0.4097502617018095, + "grad_norm": 1.9542337513011827, + "learning_rate": 6.672115872553643e-06, + "loss": 0.6155, + "step": 2740 + }, + { + "epoch": 0.40989980559294154, + "grad_norm": 1.4597570780931846, + "learning_rate": 6.669833291957854e-06, + "loss": 0.1852, + "step": 2741 + }, + { + "epoch": 0.41004934948407357, + "grad_norm": 1.4488787882100997, + "learning_rate": 6.667550319603084e-06, + "loss": 0.2032, + "step": 2742 + }, + { + "epoch": 0.4101988933752056, + "grad_norm": 1.2741107506652722, + "learning_rate": 6.665266956024939e-06, + "loss": 0.1551, + "step": 2743 + }, + { + "epoch": 0.4103484372663377, + "grad_norm": 1.4577006420793812, + "learning_rate": 6.662983201759116e-06, + "loss": 0.2233, + "step": 2744 + }, + { + "epoch": 0.4104979811574697, + "grad_norm": 1.654731508737811, + "learning_rate": 6.660699057341407e-06, + "loss": 0.163, + "step": 2745 + }, + { + "epoch": 0.41064752504860175, + "grad_norm": 1.6694252301810233, + "learning_rate": 6.658414523307694e-06, + "loss": 0.361, + "step": 2746 + }, + { + "epoch": 0.41079706893973383, + "grad_norm": 1.7611241278128056, + "learning_rate": 6.656129600193951e-06, + "loss": 0.3767, + "step": 2747 + }, + { + "epoch": 0.41094661283086587, + "grad_norm": 1.5219042887796945, + "learning_rate": 6.653844288536244e-06, + "loss": 0.2821, + "step": 2748 + }, + { + "epoch": 0.4110961567219979, + "grad_norm": 1.432463624052981, + "learning_rate": 6.651558588870728e-06, + "loss": 0.278, + "step": 2749 + }, + { + "epoch": 0.4112457006131299, + "grad_norm": 1.6511294342102478, + "learning_rate": 6.649272501733651e-06, + "loss": 0.3371, + "step": 2750 + }, + { + "epoch": 0.411395244504262, + "grad_norm": 2.222481645319511, + "learning_rate": 6.646986027661351e-06, + "loss": 0.8056, + "step": 2751 + }, + { + "epoch": 0.41154478839539405, + "grad_norm": 1.7419918403749177, + "learning_rate": 6.644699167190254e-06, + "loss": 0.375, + "step": 2752 + }, + { + "epoch": 0.4116943322865261, + "grad_norm": 1.9346738438100417, + "learning_rate": 6.642411920856882e-06, + "loss": 0.5479, + "step": 2753 + }, + { + "epoch": 0.41184387617765816, + "grad_norm": 0.9808333961270187, + "learning_rate": 6.640124289197845e-06, + "loss": 0.1877, + "step": 2754 + }, + { + "epoch": 0.4119934200687902, + "grad_norm": 2.0713908859757018, + "learning_rate": 6.637836272749846e-06, + "loss": 0.5199, + "step": 2755 + }, + { + "epoch": 0.4121429639599222, + "grad_norm": 1.4679970421838424, + "learning_rate": 6.63554787204967e-06, + "loss": 0.1927, + "step": 2756 + }, + { + "epoch": 0.41229250785105426, + "grad_norm": 1.1272956717609233, + "learning_rate": 6.6332590876342015e-06, + "loss": 0.2071, + "step": 2757 + }, + { + "epoch": 0.41244205174218634, + "grad_norm": 2.53543503085162, + "learning_rate": 6.630969920040411e-06, + "loss": 0.3872, + "step": 2758 + }, + { + "epoch": 0.4125915956333184, + "grad_norm": 1.7338236929187498, + "learning_rate": 6.628680369805359e-06, + "loss": 0.4355, + "step": 2759 + }, + { + "epoch": 0.4127411395244504, + "grad_norm": 1.3212066026939129, + "learning_rate": 6.626390437466197e-06, + "loss": 0.1943, + "step": 2760 + }, + { + "epoch": 0.4128906834155825, + "grad_norm": 1.7339371416327845, + "learning_rate": 6.624100123560164e-06, + "loss": 0.2408, + "step": 2761 + }, + { + "epoch": 0.4130402273067145, + "grad_norm": 1.4122438327200852, + "learning_rate": 6.621809428624588e-06, + "loss": 0.3316, + "step": 2762 + }, + { + "epoch": 0.41318977119784656, + "grad_norm": 1.378447575239376, + "learning_rate": 6.619518353196892e-06, + "loss": 0.1618, + "step": 2763 + }, + { + "epoch": 0.41333931508897864, + "grad_norm": 1.66925577966763, + "learning_rate": 6.617226897814582e-06, + "loss": 0.24, + "step": 2764 + }, + { + "epoch": 0.4134888589801107, + "grad_norm": 1.688913023066291, + "learning_rate": 6.614935063015256e-06, + "loss": 0.1803, + "step": 2765 + }, + { + "epoch": 0.4136384028712427, + "grad_norm": 1.5610098337094958, + "learning_rate": 6.612642849336599e-06, + "loss": 0.2558, + "step": 2766 + }, + { + "epoch": 0.41378794676237474, + "grad_norm": 1.727757709094625, + "learning_rate": 6.610350257316389e-06, + "loss": 0.3811, + "step": 2767 + }, + { + "epoch": 0.4139374906535068, + "grad_norm": 1.6489611874055852, + "learning_rate": 6.608057287492491e-06, + "loss": 0.4092, + "step": 2768 + }, + { + "epoch": 0.41408703454463885, + "grad_norm": 2.3320480486649755, + "learning_rate": 6.605763940402854e-06, + "loss": 0.6742, + "step": 2769 + }, + { + "epoch": 0.4142365784357709, + "grad_norm": 1.4200969362500393, + "learning_rate": 6.6034702165855225e-06, + "loss": 0.2639, + "step": 2770 + }, + { + "epoch": 0.414386122326903, + "grad_norm": 0.9909541939147573, + "learning_rate": 6.601176116578624e-06, + "loss": 0.1167, + "step": 2771 + }, + { + "epoch": 0.414535666218035, + "grad_norm": 1.673456689402228, + "learning_rate": 6.598881640920376e-06, + "loss": 0.3768, + "step": 2772 + }, + { + "epoch": 0.41468521010916704, + "grad_norm": 1.0737793994976543, + "learning_rate": 6.596586790149087e-06, + "loss": 0.2302, + "step": 2773 + }, + { + "epoch": 0.41483475400029907, + "grad_norm": 1.7858247736400699, + "learning_rate": 6.5942915648031516e-06, + "loss": 0.5081, + "step": 2774 + }, + { + "epoch": 0.41498429789143115, + "grad_norm": 1.7377180708768736, + "learning_rate": 6.591995965421049e-06, + "loss": 0.3567, + "step": 2775 + }, + { + "epoch": 0.4151338417825632, + "grad_norm": 1.729058819350988, + "learning_rate": 6.58969999254135e-06, + "loss": 0.3563, + "step": 2776 + }, + { + "epoch": 0.4152833856736952, + "grad_norm": 2.521334430186825, + "learning_rate": 6.5874036467027135e-06, + "loss": 0.8416, + "step": 2777 + }, + { + "epoch": 0.4154329295648273, + "grad_norm": 1.5115105389474632, + "learning_rate": 6.585106928443883e-06, + "loss": 0.3629, + "step": 2778 + }, + { + "epoch": 0.41558247345595933, + "grad_norm": 0.9562243758146778, + "learning_rate": 6.5828098383036895e-06, + "loss": 0.134, + "step": 2779 + }, + { + "epoch": 0.41573201734709136, + "grad_norm": 1.50030075040931, + "learning_rate": 6.580512376821055e-06, + "loss": 0.2136, + "step": 2780 + }, + { + "epoch": 0.4158815612382234, + "grad_norm": 1.3200898857471022, + "learning_rate": 6.5782145445349845e-06, + "loss": 0.3365, + "step": 2781 + }, + { + "epoch": 0.4160311051293555, + "grad_norm": 1.827699208191328, + "learning_rate": 6.5759163419845715e-06, + "loss": 0.4035, + "step": 2782 + }, + { + "epoch": 0.4161806490204875, + "grad_norm": 1.713119928633606, + "learning_rate": 6.573617769708998e-06, + "loss": 0.2414, + "step": 2783 + }, + { + "epoch": 0.41633019291161955, + "grad_norm": 1.46958562232206, + "learning_rate": 6.57131882824753e-06, + "loss": 0.2587, + "step": 2784 + }, + { + "epoch": 0.41647973680275163, + "grad_norm": 1.5831830835473946, + "learning_rate": 6.569019518139521e-06, + "loss": 0.3713, + "step": 2785 + }, + { + "epoch": 0.41662928069388366, + "grad_norm": 1.6336655738252617, + "learning_rate": 6.566719839924412e-06, + "loss": 0.2954, + "step": 2786 + }, + { + "epoch": 0.4167788245850157, + "grad_norm": 1.7291309360657896, + "learning_rate": 6.564419794141729e-06, + "loss": 0.4808, + "step": 2787 + }, + { + "epoch": 0.4169283684761477, + "grad_norm": 1.1926204169727774, + "learning_rate": 6.562119381331087e-06, + "loss": 0.1963, + "step": 2788 + }, + { + "epoch": 0.4170779123672798, + "grad_norm": 1.0629560827548779, + "learning_rate": 6.559818602032182e-06, + "loss": 0.1881, + "step": 2789 + }, + { + "epoch": 0.41722745625841184, + "grad_norm": 1.7790857371942344, + "learning_rate": 6.557517456784801e-06, + "loss": 0.1946, + "step": 2790 + }, + { + "epoch": 0.4173770001495439, + "grad_norm": 1.226190910118689, + "learning_rate": 6.555215946128815e-06, + "loss": 0.2162, + "step": 2791 + }, + { + "epoch": 0.41752654404067596, + "grad_norm": 2.1109476161237204, + "learning_rate": 6.552914070604178e-06, + "loss": 0.5731, + "step": 2792 + }, + { + "epoch": 0.417676087931808, + "grad_norm": 1.7790077510552844, + "learning_rate": 6.550611830750935e-06, + "loss": 0.3273, + "step": 2793 + }, + { + "epoch": 0.41782563182294, + "grad_norm": 1.7552104298119706, + "learning_rate": 6.5483092271092115e-06, + "loss": 0.4436, + "step": 2794 + }, + { + "epoch": 0.41797517571407206, + "grad_norm": 1.7002774108662209, + "learning_rate": 6.546006260219223e-06, + "loss": 0.3526, + "step": 2795 + }, + { + "epoch": 0.41812471960520414, + "grad_norm": 1.4233636661060545, + "learning_rate": 6.543702930621267e-06, + "loss": 0.3524, + "step": 2796 + }, + { + "epoch": 0.4182742634963362, + "grad_norm": 1.8156591047803354, + "learning_rate": 6.541399238855726e-06, + "loss": 0.3947, + "step": 2797 + }, + { + "epoch": 0.4184238073874682, + "grad_norm": 1.2300331934296698, + "learning_rate": 6.539095185463069e-06, + "loss": 0.2044, + "step": 2798 + }, + { + "epoch": 0.4185733512786003, + "grad_norm": 1.2209879703063857, + "learning_rate": 6.5367907709838466e-06, + "loss": 0.1635, + "step": 2799 + }, + { + "epoch": 0.4187228951697323, + "grad_norm": 2.3744121752643044, + "learning_rate": 6.534485995958699e-06, + "loss": 0.7958, + "step": 2800 + }, + { + "epoch": 0.41887243906086435, + "grad_norm": 1.1638106723385093, + "learning_rate": 6.53218086092835e-06, + "loss": 0.3233, + "step": 2801 + }, + { + "epoch": 0.4190219829519964, + "grad_norm": 2.2950286840379124, + "learning_rate": 6.529875366433604e-06, + "loss": 0.7808, + "step": 2802 + }, + { + "epoch": 0.4191715268431285, + "grad_norm": 1.402864714048257, + "learning_rate": 6.527569513015354e-06, + "loss": 0.2022, + "step": 2803 + }, + { + "epoch": 0.4193210707342605, + "grad_norm": 1.4371707658442519, + "learning_rate": 6.5252633012145735e-06, + "loss": 0.187, + "step": 2804 + }, + { + "epoch": 0.41947061462539253, + "grad_norm": 1.2367409752013236, + "learning_rate": 6.522956731572323e-06, + "loss": 0.1904, + "step": 2805 + }, + { + "epoch": 0.4196201585165246, + "grad_norm": 1.5896821279643083, + "learning_rate": 6.520649804629746e-06, + "loss": 0.3862, + "step": 2806 + }, + { + "epoch": 0.41976970240765665, + "grad_norm": 1.8899618435974137, + "learning_rate": 6.5183425209280694e-06, + "loss": 0.3903, + "step": 2807 + }, + { + "epoch": 0.4199192462987887, + "grad_norm": 2.0562104756171364, + "learning_rate": 6.5160348810086065e-06, + "loss": 0.5683, + "step": 2808 + }, + { + "epoch": 0.4200687901899207, + "grad_norm": 0.9348853804908052, + "learning_rate": 6.513726885412748e-06, + "loss": 0.2017, + "step": 2809 + }, + { + "epoch": 0.4202183340810528, + "grad_norm": 1.8140665018134454, + "learning_rate": 6.511418534681974e-06, + "loss": 0.3551, + "step": 2810 + }, + { + "epoch": 0.42036787797218483, + "grad_norm": 1.8148951879922504, + "learning_rate": 6.509109829357847e-06, + "loss": 0.2311, + "step": 2811 + }, + { + "epoch": 0.42051742186331686, + "grad_norm": 1.2263992513762383, + "learning_rate": 6.506800769982009e-06, + "loss": 0.1839, + "step": 2812 + }, + { + "epoch": 0.42066696575444895, + "grad_norm": 1.405409781791798, + "learning_rate": 6.504491357096189e-06, + "loss": 0.2004, + "step": 2813 + }, + { + "epoch": 0.420816509645581, + "grad_norm": 1.280743321393158, + "learning_rate": 6.5021815912421955e-06, + "loss": 0.3081, + "step": 2814 + }, + { + "epoch": 0.420966053536713, + "grad_norm": 1.2542595826019205, + "learning_rate": 6.499871472961926e-06, + "loss": 0.1564, + "step": 2815 + }, + { + "epoch": 0.42111559742784505, + "grad_norm": 2.473171445619144, + "learning_rate": 6.4975610027973545e-06, + "loss": 0.4957, + "step": 2816 + }, + { + "epoch": 0.42126514131897713, + "grad_norm": 1.6907310283363046, + "learning_rate": 6.495250181290538e-06, + "loss": 0.5282, + "step": 2817 + }, + { + "epoch": 0.42141468521010916, + "grad_norm": 1.6101555551733806, + "learning_rate": 6.49293900898362e-06, + "loss": 0.4522, + "step": 2818 + }, + { + "epoch": 0.4215642291012412, + "grad_norm": 1.9979956071229465, + "learning_rate": 6.49062748641882e-06, + "loss": 0.3861, + "step": 2819 + }, + { + "epoch": 0.4217137729923733, + "grad_norm": 1.1590188886249708, + "learning_rate": 6.488315614138447e-06, + "loss": 0.2171, + "step": 2820 + }, + { + "epoch": 0.4218633168835053, + "grad_norm": 1.9821881424242638, + "learning_rate": 6.486003392684889e-06, + "loss": 0.2409, + "step": 2821 + }, + { + "epoch": 0.42201286077463734, + "grad_norm": 1.876013420334667, + "learning_rate": 6.483690822600613e-06, + "loss": 0.3573, + "step": 2822 + }, + { + "epoch": 0.42216240466576943, + "grad_norm": 1.813581563264969, + "learning_rate": 6.481377904428171e-06, + "loss": 0.4781, + "step": 2823 + }, + { + "epoch": 0.42231194855690146, + "grad_norm": 2.0083488534654808, + "learning_rate": 6.479064638710197e-06, + "loss": 0.2441, + "step": 2824 + }, + { + "epoch": 0.4224614924480335, + "grad_norm": 1.3677082786526151, + "learning_rate": 6.476751025989403e-06, + "loss": 0.3431, + "step": 2825 + }, + { + "epoch": 0.4226110363391655, + "grad_norm": 1.0145275734224242, + "learning_rate": 6.474437066808585e-06, + "loss": 0.1819, + "step": 2826 + }, + { + "epoch": 0.4227605802302976, + "grad_norm": 1.490785561817132, + "learning_rate": 6.472122761710624e-06, + "loss": 0.3551, + "step": 2827 + }, + { + "epoch": 0.42291012412142964, + "grad_norm": 1.4108601765445785, + "learning_rate": 6.469808111238475e-06, + "loss": 0.3113, + "step": 2828 + }, + { + "epoch": 0.4230596680125617, + "grad_norm": 1.8058059826416644, + "learning_rate": 6.467493115935179e-06, + "loss": 0.5298, + "step": 2829 + }, + { + "epoch": 0.42320921190369376, + "grad_norm": 1.3575907869698502, + "learning_rate": 6.465177776343854e-06, + "loss": 0.1908, + "step": 2830 + }, + { + "epoch": 0.4233587557948258, + "grad_norm": 1.8817636884787206, + "learning_rate": 6.462862093007702e-06, + "loss": 0.5272, + "step": 2831 + }, + { + "epoch": 0.4235082996859578, + "grad_norm": 1.4050270193232313, + "learning_rate": 6.460546066470006e-06, + "loss": 0.3227, + "step": 2832 + }, + { + "epoch": 0.42365784357708985, + "grad_norm": 1.8465442174063562, + "learning_rate": 6.458229697274125e-06, + "loss": 0.3563, + "step": 2833 + }, + { + "epoch": 0.42380738746822194, + "grad_norm": 1.5798477268932938, + "learning_rate": 6.455912985963505e-06, + "loss": 0.3733, + "step": 2834 + }, + { + "epoch": 0.423956931359354, + "grad_norm": 1.7347800924431869, + "learning_rate": 6.4535959330816666e-06, + "loss": 0.383, + "step": 2835 + }, + { + "epoch": 0.424106475250486, + "grad_norm": 1.9104772036774023, + "learning_rate": 6.451278539172215e-06, + "loss": 0.5334, + "step": 2836 + }, + { + "epoch": 0.4242560191416181, + "grad_norm": 1.4126707638949678, + "learning_rate": 6.448960804778831e-06, + "loss": 0.3081, + "step": 2837 + }, + { + "epoch": 0.4244055630327501, + "grad_norm": 1.594463001904542, + "learning_rate": 6.446642730445278e-06, + "loss": 0.3175, + "step": 2838 + }, + { + "epoch": 0.42455510692388215, + "grad_norm": 1.6332664292262578, + "learning_rate": 6.444324316715398e-06, + "loss": 0.393, + "step": 2839 + }, + { + "epoch": 0.4247046508150142, + "grad_norm": 1.0759394822281065, + "learning_rate": 6.4420055641331135e-06, + "loss": 0.1674, + "step": 2840 + }, + { + "epoch": 0.42485419470614627, + "grad_norm": 1.3272599039432167, + "learning_rate": 6.439686473242428e-06, + "loss": 0.1989, + "step": 2841 + }, + { + "epoch": 0.4250037385972783, + "grad_norm": 0.9431044903567088, + "learning_rate": 6.437367044587419e-06, + "loss": 0.1963, + "step": 2842 + }, + { + "epoch": 0.42515328248841033, + "grad_norm": 1.5321185412479696, + "learning_rate": 6.43504727871225e-06, + "loss": 0.3409, + "step": 2843 + }, + { + "epoch": 0.4253028263795424, + "grad_norm": 1.3126437322786884, + "learning_rate": 6.43272717616116e-06, + "loss": 0.2271, + "step": 2844 + }, + { + "epoch": 0.42545237027067445, + "grad_norm": 1.555272067935412, + "learning_rate": 6.4304067374784665e-06, + "loss": 0.206, + "step": 2845 + }, + { + "epoch": 0.4256019141618065, + "grad_norm": 1.517354875146092, + "learning_rate": 6.428085963208567e-06, + "loss": 0.2726, + "step": 2846 + }, + { + "epoch": 0.4257514580529385, + "grad_norm": 1.468125115279286, + "learning_rate": 6.425764853895937e-06, + "loss": 0.195, + "step": 2847 + }, + { + "epoch": 0.4259010019440706, + "grad_norm": 1.8509310840841346, + "learning_rate": 6.423443410085131e-06, + "loss": 0.5487, + "step": 2848 + }, + { + "epoch": 0.42605054583520263, + "grad_norm": 1.2399783546042413, + "learning_rate": 6.421121632320785e-06, + "loss": 0.2196, + "step": 2849 + }, + { + "epoch": 0.42620008972633466, + "grad_norm": 1.5030129764671514, + "learning_rate": 6.418799521147606e-06, + "loss": 0.22, + "step": 2850 + }, + { + "epoch": 0.42634963361746675, + "grad_norm": 1.0883578935514098, + "learning_rate": 6.416477077110388e-06, + "loss": 0.1915, + "step": 2851 + }, + { + "epoch": 0.4264991775085988, + "grad_norm": 1.162033835127749, + "learning_rate": 6.414154300753995e-06, + "loss": 0.1868, + "step": 2852 + }, + { + "epoch": 0.4266487213997308, + "grad_norm": 0.9782803837081274, + "learning_rate": 6.4118311926233746e-06, + "loss": 0.22, + "step": 2853 + }, + { + "epoch": 0.42679826529086284, + "grad_norm": 1.7110003618772132, + "learning_rate": 6.40950775326355e-06, + "loss": 0.1888, + "step": 2854 + }, + { + "epoch": 0.42694780918199493, + "grad_norm": 1.427911525647864, + "learning_rate": 6.407183983219622e-06, + "loss": 0.2766, + "step": 2855 + }, + { + "epoch": 0.42709735307312696, + "grad_norm": 1.5840868087742603, + "learning_rate": 6.404859883036771e-06, + "loss": 0.1674, + "step": 2856 + }, + { + "epoch": 0.427246896964259, + "grad_norm": 1.8829291374832988, + "learning_rate": 6.4025354532602524e-06, + "loss": 0.4503, + "step": 2857 + }, + { + "epoch": 0.4273964408553911, + "grad_norm": 1.446898713751755, + "learning_rate": 6.400210694435399e-06, + "loss": 0.2135, + "step": 2858 + }, + { + "epoch": 0.4275459847465231, + "grad_norm": 1.3998039538345768, + "learning_rate": 6.397885607107623e-06, + "loss": 0.2949, + "step": 2859 + }, + { + "epoch": 0.42769552863765514, + "grad_norm": 1.4746825648453787, + "learning_rate": 6.395560191822411e-06, + "loss": 0.2925, + "step": 2860 + }, + { + "epoch": 0.4278450725287872, + "grad_norm": 1.4266836184950478, + "learning_rate": 6.3932344491253295e-06, + "loss": 0.2122, + "step": 2861 + }, + { + "epoch": 0.42799461641991926, + "grad_norm": 1.442493331177269, + "learning_rate": 6.3909083795620175e-06, + "loss": 0.2134, + "step": 2862 + }, + { + "epoch": 0.4281441603110513, + "grad_norm": 2.1184696277850064, + "learning_rate": 6.388581983678196e-06, + "loss": 0.4546, + "step": 2863 + }, + { + "epoch": 0.4282937042021833, + "grad_norm": 1.782817853272648, + "learning_rate": 6.3862552620196595e-06, + "loss": 0.4109, + "step": 2864 + }, + { + "epoch": 0.4284432480933154, + "grad_norm": 1.5667820096885934, + "learning_rate": 6.383928215132278e-06, + "loss": 0.2185, + "step": 2865 + }, + { + "epoch": 0.42859279198444744, + "grad_norm": 1.6221826412250884, + "learning_rate": 6.381600843561999e-06, + "loss": 0.3755, + "step": 2866 + }, + { + "epoch": 0.42874233587557947, + "grad_norm": 1.7294106531552245, + "learning_rate": 6.379273147854848e-06, + "loss": 0.2607, + "step": 2867 + }, + { + "epoch": 0.4288918797667115, + "grad_norm": 2.0912433892879214, + "learning_rate": 6.3769451285569226e-06, + "loss": 0.6155, + "step": 2868 + }, + { + "epoch": 0.4290414236578436, + "grad_norm": 1.5578560481589532, + "learning_rate": 6.374616786214402e-06, + "loss": 0.3255, + "step": 2869 + }, + { + "epoch": 0.4291909675489756, + "grad_norm": 2.308644366798669, + "learning_rate": 6.372288121373535e-06, + "loss": 0.4744, + "step": 2870 + }, + { + "epoch": 0.42934051144010765, + "grad_norm": 1.8825047823223349, + "learning_rate": 6.369959134580649e-06, + "loss": 0.4608, + "step": 2871 + }, + { + "epoch": 0.42949005533123974, + "grad_norm": 1.674818620540395, + "learning_rate": 6.367629826382148e-06, + "loss": 0.1994, + "step": 2872 + }, + { + "epoch": 0.42963959922237177, + "grad_norm": 1.608858497846789, + "learning_rate": 6.365300197324509e-06, + "loss": 0.1895, + "step": 2873 + }, + { + "epoch": 0.4297891431135038, + "grad_norm": 1.6863352566219876, + "learning_rate": 6.362970247954285e-06, + "loss": 0.2895, + "step": 2874 + }, + { + "epoch": 0.42993868700463583, + "grad_norm": 1.603267729504892, + "learning_rate": 6.3606399788181065e-06, + "loss": 0.4429, + "step": 2875 + }, + { + "epoch": 0.4300882308957679, + "grad_norm": 1.1309057334924653, + "learning_rate": 6.358309390462675e-06, + "loss": 0.1826, + "step": 2876 + }, + { + "epoch": 0.43023777478689995, + "grad_norm": 1.6293307512813584, + "learning_rate": 6.355978483434772e-06, + "loss": 0.3851, + "step": 2877 + }, + { + "epoch": 0.430387318678032, + "grad_norm": 1.6095937214526117, + "learning_rate": 6.353647258281246e-06, + "loss": 0.1676, + "step": 2878 + }, + { + "epoch": 0.43053686256916407, + "grad_norm": 1.7332943995241852, + "learning_rate": 6.35131571554903e-06, + "loss": 0.3921, + "step": 2879 + }, + { + "epoch": 0.4306864064602961, + "grad_norm": 3.0740209416462285, + "learning_rate": 6.348983855785122e-06, + "loss": 0.2256, + "step": 2880 + }, + { + "epoch": 0.43083595035142813, + "grad_norm": 1.7176886445978465, + "learning_rate": 6.346651679536601e-06, + "loss": 0.2568, + "step": 2881 + }, + { + "epoch": 0.4309854942425602, + "grad_norm": 1.3651054305058985, + "learning_rate": 6.34431918735062e-06, + "loss": 0.302, + "step": 2882 + }, + { + "epoch": 0.43113503813369225, + "grad_norm": 1.9397360315884353, + "learning_rate": 6.341986379774399e-06, + "loss": 0.2014, + "step": 2883 + }, + { + "epoch": 0.4312845820248243, + "grad_norm": 1.930136802042361, + "learning_rate": 6.339653257355243e-06, + "loss": 0.4523, + "step": 2884 + }, + { + "epoch": 0.4314341259159563, + "grad_norm": 1.5662558809247622, + "learning_rate": 6.337319820640519e-06, + "loss": 0.2364, + "step": 2885 + }, + { + "epoch": 0.4315836698070884, + "grad_norm": 1.5689148044077532, + "learning_rate": 6.3349860701776795e-06, + "loss": 0.3208, + "step": 2886 + }, + { + "epoch": 0.43173321369822043, + "grad_norm": 1.0915420487336667, + "learning_rate": 6.332652006514241e-06, + "loss": 0.1883, + "step": 2887 + }, + { + "epoch": 0.43188275758935246, + "grad_norm": 1.4829825194297046, + "learning_rate": 6.330317630197797e-06, + "loss": 0.2005, + "step": 2888 + }, + { + "epoch": 0.43203230148048455, + "grad_norm": 1.681005936318263, + "learning_rate": 6.3279829417760195e-06, + "loss": 0.1915, + "step": 2889 + }, + { + "epoch": 0.4321818453716166, + "grad_norm": 2.026257210929884, + "learning_rate": 6.325647941796644e-06, + "loss": 0.5229, + "step": 2890 + }, + { + "epoch": 0.4323313892627486, + "grad_norm": 2.02662420274482, + "learning_rate": 6.323312630807485e-06, + "loss": 0.5623, + "step": 2891 + }, + { + "epoch": 0.43248093315388064, + "grad_norm": 0.917059657221862, + "learning_rate": 6.3209770093564315e-06, + "loss": 0.166, + "step": 2892 + }, + { + "epoch": 0.43263047704501273, + "grad_norm": 1.7948196473341813, + "learning_rate": 6.318641077991438e-06, + "loss": 0.2167, + "step": 2893 + }, + { + "epoch": 0.43278002093614476, + "grad_norm": 1.5933182870290437, + "learning_rate": 6.316304837260542e-06, + "loss": 0.348, + "step": 2894 + }, + { + "epoch": 0.4329295648272768, + "grad_norm": 1.5675005503133328, + "learning_rate": 6.313968287711844e-06, + "loss": 0.2499, + "step": 2895 + }, + { + "epoch": 0.4330791087184089, + "grad_norm": 2.176887802685468, + "learning_rate": 6.311631429893523e-06, + "loss": 0.2432, + "step": 2896 + }, + { + "epoch": 0.4332286526095409, + "grad_norm": 2.2841410313564903, + "learning_rate": 6.309294264353828e-06, + "loss": 0.366, + "step": 2897 + }, + { + "epoch": 0.43337819650067294, + "grad_norm": 1.584162799649717, + "learning_rate": 6.30695679164108e-06, + "loss": 0.239, + "step": 2898 + }, + { + "epoch": 0.43352774039180497, + "grad_norm": 1.5007265983531852, + "learning_rate": 6.3046190123036745e-06, + "loss": 0.1616, + "step": 2899 + }, + { + "epoch": 0.43367728428293706, + "grad_norm": 2.030068564709013, + "learning_rate": 6.302280926890074e-06, + "loss": 0.1811, + "step": 2900 + }, + { + "epoch": 0.4338268281740691, + "grad_norm": 1.71620144462109, + "learning_rate": 6.299942535948816e-06, + "loss": 0.4858, + "step": 2901 + }, + { + "epoch": 0.4339763720652011, + "grad_norm": 1.6130080927032686, + "learning_rate": 6.2976038400285145e-06, + "loss": 0.4302, + "step": 2902 + }, + { + "epoch": 0.4341259159563332, + "grad_norm": 1.4256957511872124, + "learning_rate": 6.295264839677844e-06, + "loss": 0.2762, + "step": 2903 + }, + { + "epoch": 0.43427545984746524, + "grad_norm": 1.6311831215781756, + "learning_rate": 6.292925535445561e-06, + "loss": 0.2407, + "step": 2904 + }, + { + "epoch": 0.43442500373859727, + "grad_norm": 1.0501034014457853, + "learning_rate": 6.290585927880486e-06, + "loss": 0.1544, + "step": 2905 + }, + { + "epoch": 0.4345745476297293, + "grad_norm": 1.4075211051922059, + "learning_rate": 6.288246017531514e-06, + "loss": 0.1784, + "step": 2906 + }, + { + "epoch": 0.4347240915208614, + "grad_norm": 1.5099442439253756, + "learning_rate": 6.285905804947612e-06, + "loss": 0.3258, + "step": 2907 + }, + { + "epoch": 0.4348736354119934, + "grad_norm": 1.5566923331523632, + "learning_rate": 6.283565290677816e-06, + "loss": 0.2072, + "step": 2908 + }, + { + "epoch": 0.43502317930312545, + "grad_norm": 1.934151158762352, + "learning_rate": 6.281224475271232e-06, + "loss": 0.3316, + "step": 2909 + }, + { + "epoch": 0.43517272319425754, + "grad_norm": 2.0035355877639445, + "learning_rate": 6.278883359277037e-06, + "loss": 0.5767, + "step": 2910 + }, + { + "epoch": 0.43532226708538957, + "grad_norm": 1.3925787643495353, + "learning_rate": 6.276541943244482e-06, + "loss": 0.3198, + "step": 2911 + }, + { + "epoch": 0.4354718109765216, + "grad_norm": 0.9494373556456771, + "learning_rate": 6.274200227722887e-06, + "loss": 0.1884, + "step": 2912 + }, + { + "epoch": 0.43562135486765363, + "grad_norm": 1.4369294236650456, + "learning_rate": 6.271858213261635e-06, + "loss": 0.1916, + "step": 2913 + }, + { + "epoch": 0.4357708987587857, + "grad_norm": 1.248323456170905, + "learning_rate": 6.2695159004101905e-06, + "loss": 0.1616, + "step": 2914 + }, + { + "epoch": 0.43592044264991775, + "grad_norm": 1.1469262780775566, + "learning_rate": 6.267173289718079e-06, + "loss": 0.1893, + "step": 2915 + }, + { + "epoch": 0.4360699865410498, + "grad_norm": 1.5964423980309195, + "learning_rate": 6.264830381734903e-06, + "loss": 0.3121, + "step": 2916 + }, + { + "epoch": 0.43621953043218187, + "grad_norm": 2.019996585766343, + "learning_rate": 6.262487177010331e-06, + "loss": 0.6759, + "step": 2917 + }, + { + "epoch": 0.4363690743233139, + "grad_norm": 1.7460698802434411, + "learning_rate": 6.2601436760940995e-06, + "loss": 0.3921, + "step": 2918 + }, + { + "epoch": 0.43651861821444593, + "grad_norm": 1.6254679162080976, + "learning_rate": 6.2577998795360166e-06, + "loss": 0.2063, + "step": 2919 + }, + { + "epoch": 0.43666816210557796, + "grad_norm": 1.1145785344063608, + "learning_rate": 6.25545578788596e-06, + "loss": 0.1633, + "step": 2920 + }, + { + "epoch": 0.43681770599671005, + "grad_norm": 1.53505082775515, + "learning_rate": 6.2531114016938755e-06, + "loss": 0.3125, + "step": 2921 + }, + { + "epoch": 0.4369672498878421, + "grad_norm": 2.1297043590480085, + "learning_rate": 6.250766721509781e-06, + "loss": 0.3356, + "step": 2922 + }, + { + "epoch": 0.4371167937789741, + "grad_norm": 1.9039440865057902, + "learning_rate": 6.24842174788376e-06, + "loss": 0.2797, + "step": 2923 + }, + { + "epoch": 0.4372663376701062, + "grad_norm": 1.673786518910741, + "learning_rate": 6.246076481365964e-06, + "loss": 0.3678, + "step": 2924 + }, + { + "epoch": 0.43741588156123823, + "grad_norm": 1.1399913425597843, + "learning_rate": 6.243730922506618e-06, + "loss": 0.1963, + "step": 2925 + }, + { + "epoch": 0.43756542545237026, + "grad_norm": 1.9402190325909736, + "learning_rate": 6.24138507185601e-06, + "loss": 0.5599, + "step": 2926 + }, + { + "epoch": 0.4377149693435023, + "grad_norm": 1.7289659635468384, + "learning_rate": 6.2390389299645e-06, + "loss": 0.3456, + "step": 2927 + }, + { + "epoch": 0.4378645132346344, + "grad_norm": 1.6212378839968007, + "learning_rate": 6.236692497382517e-06, + "loss": 0.2003, + "step": 2928 + }, + { + "epoch": 0.4380140571257664, + "grad_norm": 1.3141303268327558, + "learning_rate": 6.234345774660555e-06, + "loss": 0.2093, + "step": 2929 + }, + { + "epoch": 0.43816360101689844, + "grad_norm": 1.4193693342645899, + "learning_rate": 6.231998762349179e-06, + "loss": 0.3721, + "step": 2930 + }, + { + "epoch": 0.4383131449080305, + "grad_norm": 1.5886256685611988, + "learning_rate": 6.22965146099902e-06, + "loss": 0.1997, + "step": 2931 + }, + { + "epoch": 0.43846268879916256, + "grad_norm": 1.8678471944100403, + "learning_rate": 6.227303871160779e-06, + "loss": 0.3806, + "step": 2932 + }, + { + "epoch": 0.4386122326902946, + "grad_norm": 1.4587758386328973, + "learning_rate": 6.22495599338522e-06, + "loss": 0.3052, + "step": 2933 + }, + { + "epoch": 0.4387617765814266, + "grad_norm": 1.4800252577279251, + "learning_rate": 6.222607828223179e-06, + "loss": 0.3585, + "step": 2934 + }, + { + "epoch": 0.4389113204725587, + "grad_norm": 1.8496390423918079, + "learning_rate": 6.22025937622556e-06, + "loss": 0.5058, + "step": 2935 + }, + { + "epoch": 0.43906086436369074, + "grad_norm": 1.5660477130406603, + "learning_rate": 6.217910637943332e-06, + "loss": 0.202, + "step": 2936 + }, + { + "epoch": 0.43921040825482277, + "grad_norm": 2.051942892319578, + "learning_rate": 6.21556161392753e-06, + "loss": 0.346, + "step": 2937 + }, + { + "epoch": 0.43935995214595486, + "grad_norm": 1.1875003670390716, + "learning_rate": 6.213212304729259e-06, + "loss": 0.2106, + "step": 2938 + }, + { + "epoch": 0.4395094960370869, + "grad_norm": 1.5508505327713367, + "learning_rate": 6.210862710899688e-06, + "loss": 0.3414, + "step": 2939 + }, + { + "epoch": 0.4396590399282189, + "grad_norm": 1.4620940174760757, + "learning_rate": 6.208512832990057e-06, + "loss": 0.2326, + "step": 2940 + }, + { + "epoch": 0.439808583819351, + "grad_norm": 1.5695236333564446, + "learning_rate": 6.206162671551666e-06, + "loss": 0.2074, + "step": 2941 + }, + { + "epoch": 0.43995812771048304, + "grad_norm": 1.7058773024979474, + "learning_rate": 6.203812227135891e-06, + "loss": 0.3473, + "step": 2942 + }, + { + "epoch": 0.44010767160161507, + "grad_norm": 1.4295973540897797, + "learning_rate": 6.201461500294165e-06, + "loss": 0.2694, + "step": 2943 + }, + { + "epoch": 0.4402572154927471, + "grad_norm": 1.9125637921122356, + "learning_rate": 6.199110491577991e-06, + "loss": 0.5139, + "step": 2944 + }, + { + "epoch": 0.4404067593838792, + "grad_norm": 2.2653905286807667, + "learning_rate": 6.196759201538941e-06, + "loss": 0.2828, + "step": 2945 + }, + { + "epoch": 0.4405563032750112, + "grad_norm": 1.1078096531641743, + "learning_rate": 6.1944076307286485e-06, + "loss": 0.222, + "step": 2946 + }, + { + "epoch": 0.44070584716614325, + "grad_norm": 1.1756960372209257, + "learning_rate": 6.192055779698813e-06, + "loss": 0.2178, + "step": 2947 + }, + { + "epoch": 0.44085539105727534, + "grad_norm": 1.3681152887792614, + "learning_rate": 6.1897036490012055e-06, + "loss": 0.1979, + "step": 2948 + }, + { + "epoch": 0.44100493494840737, + "grad_norm": 1.7105899375030225, + "learning_rate": 6.187351239187654e-06, + "loss": 0.4343, + "step": 2949 + }, + { + "epoch": 0.4411544788395394, + "grad_norm": 1.3110170942088808, + "learning_rate": 6.184998550810062e-06, + "loss": 0.2351, + "step": 2950 + }, + { + "epoch": 0.44130402273067143, + "grad_norm": 1.6579637186148253, + "learning_rate": 6.1826455844203885e-06, + "loss": 0.3927, + "step": 2951 + }, + { + "epoch": 0.4414535666218035, + "grad_norm": 1.1835524580145198, + "learning_rate": 6.180292340570663e-06, + "loss": 0.1775, + "step": 2952 + }, + { + "epoch": 0.44160311051293555, + "grad_norm": 1.4534094029698128, + "learning_rate": 6.177938819812979e-06, + "loss": 0.3563, + "step": 2953 + }, + { + "epoch": 0.4417526544040676, + "grad_norm": 1.0596251354889077, + "learning_rate": 6.175585022699495e-06, + "loss": 0.1772, + "step": 2954 + }, + { + "epoch": 0.44190219829519967, + "grad_norm": 1.949849754744267, + "learning_rate": 6.1732309497824365e-06, + "loss": 0.5175, + "step": 2955 + }, + { + "epoch": 0.4420517421863317, + "grad_norm": 1.0125156765624894, + "learning_rate": 6.170876601614089e-06, + "loss": 0.1813, + "step": 2956 + }, + { + "epoch": 0.44220128607746373, + "grad_norm": 1.7733454407312104, + "learning_rate": 6.168521978746808e-06, + "loss": 0.4234, + "step": 2957 + }, + { + "epoch": 0.44235082996859576, + "grad_norm": 1.2228581167428354, + "learning_rate": 6.16616708173301e-06, + "loss": 0.3289, + "step": 2958 + }, + { + "epoch": 0.44250037385972785, + "grad_norm": 1.0361601046635946, + "learning_rate": 6.1638119111251735e-06, + "loss": 0.2069, + "step": 2959 + }, + { + "epoch": 0.4426499177508599, + "grad_norm": 1.203904626957853, + "learning_rate": 6.1614564674758495e-06, + "loss": 0.2516, + "step": 2960 + }, + { + "epoch": 0.4427994616419919, + "grad_norm": 1.023639589842975, + "learning_rate": 6.1591007513376425e-06, + "loss": 0.2318, + "step": 2961 + }, + { + "epoch": 0.442949005533124, + "grad_norm": 1.4382415975816416, + "learning_rate": 6.156744763263228e-06, + "loss": 0.2059, + "step": 2962 + }, + { + "epoch": 0.443098549424256, + "grad_norm": 2.5253874416582027, + "learning_rate": 6.1543885038053464e-06, + "loss": 0.5416, + "step": 2963 + }, + { + "epoch": 0.44324809331538806, + "grad_norm": 1.058256199728368, + "learning_rate": 6.152031973516795e-06, + "loss": 0.1221, + "step": 2964 + }, + { + "epoch": 0.4433976372065201, + "grad_norm": 1.4164780696917707, + "learning_rate": 6.149675172950441e-06, + "loss": 0.2538, + "step": 2965 + }, + { + "epoch": 0.4435471810976522, + "grad_norm": 2.030437299566282, + "learning_rate": 6.147318102659211e-06, + "loss": 0.2116, + "step": 2966 + }, + { + "epoch": 0.4436967249887842, + "grad_norm": 0.9394849585455308, + "learning_rate": 6.144960763196097e-06, + "loss": 0.1816, + "step": 2967 + }, + { + "epoch": 0.44384626887991624, + "grad_norm": 1.8329855420116805, + "learning_rate": 6.142603155114153e-06, + "loss": 0.3582, + "step": 2968 + }, + { + "epoch": 0.4439958127710483, + "grad_norm": 1.8735271669425757, + "learning_rate": 6.140245278966498e-06, + "loss": 0.4366, + "step": 2969 + }, + { + "epoch": 0.44414535666218036, + "grad_norm": 1.7892303304786348, + "learning_rate": 6.137887135306312e-06, + "loss": 0.4139, + "step": 2970 + }, + { + "epoch": 0.4442949005533124, + "grad_norm": 1.6664621642434678, + "learning_rate": 6.1355287246868365e-06, + "loss": 0.3187, + "step": 2971 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.5224427839350583, + "learning_rate": 6.133170047661379e-06, + "loss": 0.2085, + "step": 2972 + }, + { + "epoch": 0.4445939883355765, + "grad_norm": 1.6171598547263366, + "learning_rate": 6.130811104783308e-06, + "loss": 0.3763, + "step": 2973 + }, + { + "epoch": 0.44474353222670854, + "grad_norm": 1.662874989058589, + "learning_rate": 6.128451896606054e-06, + "loss": 0.3489, + "step": 2974 + }, + { + "epoch": 0.44489307611784057, + "grad_norm": 1.0363220335087222, + "learning_rate": 6.12609242368311e-06, + "loss": 0.1397, + "step": 2975 + }, + { + "epoch": 0.44504262000897266, + "grad_norm": 1.6660010816827036, + "learning_rate": 6.123732686568029e-06, + "loss": 0.333, + "step": 2976 + }, + { + "epoch": 0.4451921639001047, + "grad_norm": 1.1891794312637884, + "learning_rate": 6.121372685814431e-06, + "loss": 0.2382, + "step": 2977 + }, + { + "epoch": 0.4453417077912367, + "grad_norm": 1.576086623428251, + "learning_rate": 6.119012421975995e-06, + "loss": 0.3202, + "step": 2978 + }, + { + "epoch": 0.44549125168236875, + "grad_norm": 1.4353721174926903, + "learning_rate": 6.116651895606459e-06, + "loss": 0.22, + "step": 2979 + }, + { + "epoch": 0.44564079557350084, + "grad_norm": 1.3832487075650706, + "learning_rate": 6.1142911072596275e-06, + "loss": 0.2093, + "step": 2980 + }, + { + "epoch": 0.44579033946463287, + "grad_norm": 1.777859539603896, + "learning_rate": 6.111930057489364e-06, + "loss": 0.4684, + "step": 2981 + }, + { + "epoch": 0.4459398833557649, + "grad_norm": 1.2630447234683253, + "learning_rate": 6.10956874684959e-06, + "loss": 0.2158, + "step": 2982 + }, + { + "epoch": 0.446089427246897, + "grad_norm": 1.6976973376368785, + "learning_rate": 6.107207175894297e-06, + "loss": 0.3629, + "step": 2983 + }, + { + "epoch": 0.446238971138029, + "grad_norm": 1.3370435246604835, + "learning_rate": 6.1048453451775305e-06, + "loss": 0.178, + "step": 2984 + }, + { + "epoch": 0.44638851502916105, + "grad_norm": 1.4545067247689016, + "learning_rate": 6.102483255253397e-06, + "loss": 0.3052, + "step": 2985 + }, + { + "epoch": 0.4465380589202931, + "grad_norm": 1.4865245714334914, + "learning_rate": 6.100120906676068e-06, + "loss": 0.215, + "step": 2986 + }, + { + "epoch": 0.44668760281142517, + "grad_norm": 1.5606516109454205, + "learning_rate": 6.097758299999771e-06, + "loss": 0.3224, + "step": 2987 + }, + { + "epoch": 0.4468371467025572, + "grad_norm": 1.6831853862543933, + "learning_rate": 6.095395435778797e-06, + "loss": 0.4388, + "step": 2988 + }, + { + "epoch": 0.44698669059368923, + "grad_norm": 1.6134233850855104, + "learning_rate": 6.093032314567498e-06, + "loss": 0.3387, + "step": 2989 + }, + { + "epoch": 0.4471362344848213, + "grad_norm": 1.8479914330951175, + "learning_rate": 6.0906689369202835e-06, + "loss": 0.5229, + "step": 2990 + }, + { + "epoch": 0.44728577837595335, + "grad_norm": 1.697848699434105, + "learning_rate": 6.088305303391625e-06, + "loss": 0.3263, + "step": 2991 + }, + { + "epoch": 0.4474353222670854, + "grad_norm": 2.340324861868485, + "learning_rate": 6.085941414536054e-06, + "loss": 0.5403, + "step": 2992 + }, + { + "epoch": 0.4475848661582174, + "grad_norm": 1.366594482966174, + "learning_rate": 6.083577270908163e-06, + "loss": 0.2498, + "step": 2993 + }, + { + "epoch": 0.4477344100493495, + "grad_norm": 2.003441472279418, + "learning_rate": 6.0812128730626e-06, + "loss": 0.2421, + "step": 2994 + }, + { + "epoch": 0.4478839539404815, + "grad_norm": 1.4812098336709252, + "learning_rate": 6.078848221554077e-06, + "loss": 0.3513, + "step": 2995 + }, + { + "epoch": 0.44803349783161356, + "grad_norm": 1.121351564649134, + "learning_rate": 6.076483316937364e-06, + "loss": 0.1646, + "step": 2996 + }, + { + "epoch": 0.44818304172274565, + "grad_norm": 1.3593452823065253, + "learning_rate": 6.07411815976729e-06, + "loss": 0.3391, + "step": 2997 + }, + { + "epoch": 0.4483325856138777, + "grad_norm": 1.6796848208383883, + "learning_rate": 6.071752750598745e-06, + "loss": 0.5086, + "step": 2998 + }, + { + "epoch": 0.4484821295050097, + "grad_norm": 1.4048595972263043, + "learning_rate": 6.0693870899866756e-06, + "loss": 0.306, + "step": 2999 + }, + { + "epoch": 0.4486316733961418, + "grad_norm": 1.3825869686134655, + "learning_rate": 6.067021178486088e-06, + "loss": 0.207, + "step": 3000 + }, + { + "epoch": 0.4487812172872738, + "grad_norm": 1.5962644544557885, + "learning_rate": 6.06465501665205e-06, + "loss": 0.401, + "step": 3001 + }, + { + "epoch": 0.44893076117840586, + "grad_norm": 1.4371440397490116, + "learning_rate": 6.062288605039683e-06, + "loss": 0.1915, + "step": 3002 + }, + { + "epoch": 0.4490803050695379, + "grad_norm": 1.968364897408132, + "learning_rate": 6.059921944204173e-06, + "loss": 0.4246, + "step": 3003 + }, + { + "epoch": 0.44922984896067, + "grad_norm": 1.8653866690731011, + "learning_rate": 6.057555034700759e-06, + "loss": 0.427, + "step": 3004 + }, + { + "epoch": 0.449379392851802, + "grad_norm": 1.7168140345422174, + "learning_rate": 6.055187877084743e-06, + "loss": 0.3281, + "step": 3005 + }, + { + "epoch": 0.44952893674293404, + "grad_norm": 1.5060475228175942, + "learning_rate": 6.052820471911481e-06, + "loss": 0.3382, + "step": 3006 + }, + { + "epoch": 0.4496784806340661, + "grad_norm": 1.4572399542192809, + "learning_rate": 6.05045281973639e-06, + "loss": 0.326, + "step": 3007 + }, + { + "epoch": 0.44982802452519816, + "grad_norm": 1.9104089940276523, + "learning_rate": 6.048084921114944e-06, + "loss": 0.3776, + "step": 3008 + }, + { + "epoch": 0.4499775684163302, + "grad_norm": 1.6392841422899054, + "learning_rate": 6.045716776602674e-06, + "loss": 0.3508, + "step": 3009 + }, + { + "epoch": 0.4501271123074622, + "grad_norm": 1.795922624052802, + "learning_rate": 6.043348386755172e-06, + "loss": 0.5236, + "step": 3010 + }, + { + "epoch": 0.4502766561985943, + "grad_norm": 1.4038039245527818, + "learning_rate": 6.040979752128082e-06, + "loss": 0.1898, + "step": 3011 + }, + { + "epoch": 0.45042620008972634, + "grad_norm": 1.34690239318641, + "learning_rate": 6.038610873277109e-06, + "loss": 0.322, + "step": 3012 + }, + { + "epoch": 0.45057574398085837, + "grad_norm": 1.938382182478136, + "learning_rate": 6.036241750758018e-06, + "loss": 0.3155, + "step": 3013 + }, + { + "epoch": 0.45072528787199045, + "grad_norm": 1.5315561280490633, + "learning_rate": 6.033872385126625e-06, + "loss": 0.2086, + "step": 3014 + }, + { + "epoch": 0.4508748317631225, + "grad_norm": 1.4664133436532398, + "learning_rate": 6.031502776938806e-06, + "loss": 0.339, + "step": 3015 + }, + { + "epoch": 0.4510243756542545, + "grad_norm": 1.3762760635076952, + "learning_rate": 6.029132926750494e-06, + "loss": 0.2114, + "step": 3016 + }, + { + "epoch": 0.45117391954538655, + "grad_norm": 1.3005324933468017, + "learning_rate": 6.0267628351176796e-06, + "loss": 0.1908, + "step": 3017 + }, + { + "epoch": 0.45132346343651863, + "grad_norm": 1.0685843417404295, + "learning_rate": 6.02439250259641e-06, + "loss": 0.1924, + "step": 3018 + }, + { + "epoch": 0.45147300732765067, + "grad_norm": 1.771550384435644, + "learning_rate": 6.022021929742785e-06, + "loss": 0.2815, + "step": 3019 + }, + { + "epoch": 0.4516225512187827, + "grad_norm": 2.383086764076166, + "learning_rate": 6.019651117112967e-06, + "loss": 0.3847, + "step": 3020 + }, + { + "epoch": 0.4517720951099148, + "grad_norm": 1.4770169366158958, + "learning_rate": 6.0172800652631706e-06, + "loss": 0.3493, + "step": 3021 + }, + { + "epoch": 0.4519216390010468, + "grad_norm": 1.3322870296999558, + "learning_rate": 6.0149087747496645e-06, + "loss": 0.1781, + "step": 3022 + }, + { + "epoch": 0.45207118289217885, + "grad_norm": 1.0795861187632947, + "learning_rate": 6.0125372461287814e-06, + "loss": 0.2391, + "step": 3023 + }, + { + "epoch": 0.4522207267833109, + "grad_norm": 1.8068779784303053, + "learning_rate": 6.010165479956901e-06, + "loss": 0.418, + "step": 3024 + }, + { + "epoch": 0.45237027067444296, + "grad_norm": 1.9286914389868077, + "learning_rate": 6.007793476790463e-06, + "loss": 0.1949, + "step": 3025 + }, + { + "epoch": 0.452519814565575, + "grad_norm": 1.146144637893908, + "learning_rate": 6.005421237185965e-06, + "loss": 0.1524, + "step": 3026 + }, + { + "epoch": 0.452669358456707, + "grad_norm": 1.0973657922867939, + "learning_rate": 6.003048761699953e-06, + "loss": 0.3547, + "step": 3027 + }, + { + "epoch": 0.4528189023478391, + "grad_norm": 1.4581605124819323, + "learning_rate": 6.000676050889036e-06, + "loss": 0.1848, + "step": 3028 + }, + { + "epoch": 0.45296844623897115, + "grad_norm": 1.269034060908923, + "learning_rate": 5.998303105309873e-06, + "loss": 0.199, + "step": 3029 + }, + { + "epoch": 0.4531179901301032, + "grad_norm": 1.5171368496196267, + "learning_rate": 5.995929925519181e-06, + "loss": 0.212, + "step": 3030 + }, + { + "epoch": 0.4532675340212352, + "grad_norm": 1.9339847824352052, + "learning_rate": 5.99355651207373e-06, + "loss": 0.3863, + "step": 3031 + }, + { + "epoch": 0.4534170779123673, + "grad_norm": 1.6847447888231042, + "learning_rate": 5.991182865530346e-06, + "loss": 0.4776, + "step": 3032 + }, + { + "epoch": 0.4535666218034993, + "grad_norm": 1.7472594442925573, + "learning_rate": 5.988808986445911e-06, + "loss": 0.1947, + "step": 3033 + }, + { + "epoch": 0.45371616569463136, + "grad_norm": 1.1215834395483242, + "learning_rate": 5.986434875377356e-06, + "loss": 0.1856, + "step": 3034 + }, + { + "epoch": 0.45386570958576344, + "grad_norm": 1.31004013577431, + "learning_rate": 5.984060532881673e-06, + "loss": 0.1829, + "step": 3035 + }, + { + "epoch": 0.4540152534768955, + "grad_norm": 1.1780108416771637, + "learning_rate": 5.981685959515905e-06, + "loss": 0.2096, + "step": 3036 + }, + { + "epoch": 0.4541647973680275, + "grad_norm": 1.9874221617073415, + "learning_rate": 5.979311155837151e-06, + "loss": 0.3116, + "step": 3037 + }, + { + "epoch": 0.45431434125915954, + "grad_norm": 1.3795207278158967, + "learning_rate": 5.976936122402562e-06, + "loss": 0.1797, + "step": 3038 + }, + { + "epoch": 0.4544638851502916, + "grad_norm": 1.1174423153921418, + "learning_rate": 5.974560859769343e-06, + "loss": 0.1965, + "step": 3039 + }, + { + "epoch": 0.45461342904142366, + "grad_norm": 1.4270442541948576, + "learning_rate": 5.972185368494754e-06, + "loss": 0.194, + "step": 3040 + }, + { + "epoch": 0.4547629729325557, + "grad_norm": 1.6346945251535583, + "learning_rate": 5.9698096491361095e-06, + "loss": 0.4558, + "step": 3041 + }, + { + "epoch": 0.4549125168236878, + "grad_norm": 1.726471907193774, + "learning_rate": 5.967433702250772e-06, + "loss": 0.3279, + "step": 3042 + }, + { + "epoch": 0.4550620607148198, + "grad_norm": 1.4908257407248642, + "learning_rate": 5.965057528396166e-06, + "loss": 0.3607, + "step": 3043 + }, + { + "epoch": 0.45521160460595184, + "grad_norm": 1.9466145535642352, + "learning_rate": 5.962681128129764e-06, + "loss": 0.3927, + "step": 3044 + }, + { + "epoch": 0.45536114849708387, + "grad_norm": 1.790484795898369, + "learning_rate": 5.960304502009091e-06, + "loss": 0.5027, + "step": 3045 + }, + { + "epoch": 0.45551069238821595, + "grad_norm": 2.1466817305191337, + "learning_rate": 5.957927650591727e-06, + "loss": 0.5421, + "step": 3046 + }, + { + "epoch": 0.455660236279348, + "grad_norm": 1.7488963785457814, + "learning_rate": 5.9555505744353045e-06, + "loss": 0.5295, + "step": 3047 + }, + { + "epoch": 0.45580978017048, + "grad_norm": 1.3663500809725344, + "learning_rate": 5.953173274097508e-06, + "loss": 0.3152, + "step": 3048 + }, + { + "epoch": 0.4559593240616121, + "grad_norm": 1.3010976352200727, + "learning_rate": 5.950795750136075e-06, + "loss": 0.2174, + "step": 3049 + }, + { + "epoch": 0.45610886795274413, + "grad_norm": 1.8659509131326897, + "learning_rate": 5.948418003108795e-06, + "loss": 0.418, + "step": 3050 + }, + { + "epoch": 0.45625841184387617, + "grad_norm": 2.351423158034144, + "learning_rate": 5.946040033573514e-06, + "loss": 0.5714, + "step": 3051 + }, + { + "epoch": 0.4564079557350082, + "grad_norm": 1.7203691095571323, + "learning_rate": 5.943661842088122e-06, + "loss": 0.3555, + "step": 3052 + }, + { + "epoch": 0.4565574996261403, + "grad_norm": 1.6103446132123642, + "learning_rate": 5.941283429210568e-06, + "loss": 0.3919, + "step": 3053 + }, + { + "epoch": 0.4567070435172723, + "grad_norm": 1.634616029154534, + "learning_rate": 5.93890479549885e-06, + "loss": 0.4904, + "step": 3054 + }, + { + "epoch": 0.45685658740840435, + "grad_norm": 1.2519583084546646, + "learning_rate": 5.936525941511017e-06, + "loss": 0.2004, + "step": 3055 + }, + { + "epoch": 0.45700613129953643, + "grad_norm": 1.8604210227354285, + "learning_rate": 5.934146867805175e-06, + "loss": 0.4454, + "step": 3056 + }, + { + "epoch": 0.45715567519066846, + "grad_norm": 1.5966577837546947, + "learning_rate": 5.931767574939473e-06, + "loss": 0.3326, + "step": 3057 + }, + { + "epoch": 0.4573052190818005, + "grad_norm": 1.7485226713542816, + "learning_rate": 5.929388063472119e-06, + "loss": 0.3078, + "step": 3058 + }, + { + "epoch": 0.4574547629729326, + "grad_norm": 1.8101199489307283, + "learning_rate": 5.9270083339613695e-06, + "loss": 0.5576, + "step": 3059 + }, + { + "epoch": 0.4576043068640646, + "grad_norm": 1.1197623012447884, + "learning_rate": 5.924628386965529e-06, + "loss": 0.1692, + "step": 3060 + }, + { + "epoch": 0.45775385075519665, + "grad_norm": 1.5496112484027527, + "learning_rate": 5.922248223042959e-06, + "loss": 0.3748, + "step": 3061 + }, + { + "epoch": 0.4579033946463287, + "grad_norm": 1.4213073823646392, + "learning_rate": 5.919867842752067e-06, + "loss": 0.2585, + "step": 3062 + }, + { + "epoch": 0.45805293853746076, + "grad_norm": 1.6766072796105778, + "learning_rate": 5.917487246651314e-06, + "loss": 0.3301, + "step": 3063 + }, + { + "epoch": 0.4582024824285928, + "grad_norm": 1.1877206484440475, + "learning_rate": 5.915106435299212e-06, + "loss": 0.2118, + "step": 3064 + }, + { + "epoch": 0.4583520263197248, + "grad_norm": 1.4732071986791726, + "learning_rate": 5.912725409254321e-06, + "loss": 0.1794, + "step": 3065 + }, + { + "epoch": 0.4585015702108569, + "grad_norm": 1.761728417820423, + "learning_rate": 5.910344169075252e-06, + "loss": 0.3489, + "step": 3066 + }, + { + "epoch": 0.45865111410198894, + "grad_norm": 1.356911951793647, + "learning_rate": 5.907962715320668e-06, + "loss": 0.325, + "step": 3067 + }, + { + "epoch": 0.458800657993121, + "grad_norm": 1.2177589250447785, + "learning_rate": 5.905581048549279e-06, + "loss": 0.1797, + "step": 3068 + }, + { + "epoch": 0.458950201884253, + "grad_norm": 1.683972181197731, + "learning_rate": 5.90319916931985e-06, + "loss": 0.3762, + "step": 3069 + }, + { + "epoch": 0.4590997457753851, + "grad_norm": 1.6629827017110606, + "learning_rate": 5.900817078191191e-06, + "loss": 0.1876, + "step": 3070 + }, + { + "epoch": 0.4592492896665171, + "grad_norm": 1.990952022645022, + "learning_rate": 5.898434775722165e-06, + "loss": 0.4711, + "step": 3071 + }, + { + "epoch": 0.45939883355764916, + "grad_norm": 1.3405923448155683, + "learning_rate": 5.896052262471681e-06, + "loss": 0.3233, + "step": 3072 + }, + { + "epoch": 0.45954837744878124, + "grad_norm": 1.242924374220424, + "learning_rate": 5.893669538998701e-06, + "loss": 0.2268, + "step": 3073 + }, + { + "epoch": 0.4596979213399133, + "grad_norm": 1.6674961385367664, + "learning_rate": 5.891286605862236e-06, + "loss": 0.4254, + "step": 3074 + }, + { + "epoch": 0.4598474652310453, + "grad_norm": 1.9155364958899164, + "learning_rate": 5.888903463621342e-06, + "loss": 0.3741, + "step": 3075 + }, + { + "epoch": 0.45999700912217734, + "grad_norm": 1.467167356716408, + "learning_rate": 5.886520112835128e-06, + "loss": 0.4894, + "step": 3076 + }, + { + "epoch": 0.4601465530133094, + "grad_norm": 1.528239460716899, + "learning_rate": 5.884136554062753e-06, + "loss": 0.2938, + "step": 3077 + }, + { + "epoch": 0.46029609690444145, + "grad_norm": 1.2857242621803937, + "learning_rate": 5.881752787863421e-06, + "loss": 0.3372, + "step": 3078 + }, + { + "epoch": 0.4604456407955735, + "grad_norm": 1.566112761935387, + "learning_rate": 5.879368814796388e-06, + "loss": 0.3513, + "step": 3079 + }, + { + "epoch": 0.46059518468670557, + "grad_norm": 1.829132984093117, + "learning_rate": 5.876984635420956e-06, + "loss": 0.4273, + "step": 3080 + }, + { + "epoch": 0.4607447285778376, + "grad_norm": 1.3703912717781694, + "learning_rate": 5.874600250296477e-06, + "loss": 0.2825, + "step": 3081 + }, + { + "epoch": 0.46089427246896963, + "grad_norm": 1.102997889638414, + "learning_rate": 5.87221565998235e-06, + "loss": 0.182, + "step": 3082 + }, + { + "epoch": 0.46104381636010167, + "grad_norm": 1.2503343656492412, + "learning_rate": 5.869830865038022e-06, + "loss": 0.1864, + "step": 3083 + }, + { + "epoch": 0.46119336025123375, + "grad_norm": 1.3521302194014102, + "learning_rate": 5.867445866022992e-06, + "loss": 0.3524, + "step": 3084 + }, + { + "epoch": 0.4613429041423658, + "grad_norm": 1.4419400149521013, + "learning_rate": 5.865060663496801e-06, + "loss": 0.2945, + "step": 3085 + }, + { + "epoch": 0.4614924480334978, + "grad_norm": 0.8534550219741183, + "learning_rate": 5.862675258019042e-06, + "loss": 0.2017, + "step": 3086 + }, + { + "epoch": 0.4616419919246299, + "grad_norm": 1.1420661007230424, + "learning_rate": 5.860289650149353e-06, + "loss": 0.1907, + "step": 3087 + }, + { + "epoch": 0.46179153581576193, + "grad_norm": 1.2419372462457205, + "learning_rate": 5.857903840447422e-06, + "loss": 0.3804, + "step": 3088 + }, + { + "epoch": 0.46194107970689396, + "grad_norm": 1.0043595882377505, + "learning_rate": 5.855517829472981e-06, + "loss": 0.181, + "step": 3089 + }, + { + "epoch": 0.462090623598026, + "grad_norm": 1.4634097900797467, + "learning_rate": 5.853131617785813e-06, + "loss": 0.3246, + "step": 3090 + }, + { + "epoch": 0.4622401674891581, + "grad_norm": 1.587493020890882, + "learning_rate": 5.850745205945745e-06, + "loss": 0.2042, + "step": 3091 + }, + { + "epoch": 0.4623897113802901, + "grad_norm": 1.7333993057463715, + "learning_rate": 5.848358594512654e-06, + "loss": 0.325, + "step": 3092 + }, + { + "epoch": 0.46253925527142215, + "grad_norm": 1.616159656091411, + "learning_rate": 5.84597178404646e-06, + "loss": 0.2417, + "step": 3093 + }, + { + "epoch": 0.46268879916255423, + "grad_norm": 1.2656140621324898, + "learning_rate": 5.8435847751071325e-06, + "loss": 0.2453, + "step": 3094 + }, + { + "epoch": 0.46283834305368626, + "grad_norm": 1.074466172247766, + "learning_rate": 5.841197568254687e-06, + "loss": 0.1631, + "step": 3095 + }, + { + "epoch": 0.4629878869448183, + "grad_norm": 1.0979861038745287, + "learning_rate": 5.838810164049185e-06, + "loss": 0.1769, + "step": 3096 + }, + { + "epoch": 0.4631374308359503, + "grad_norm": 1.2926739611623306, + "learning_rate": 5.836422563050734e-06, + "loss": 0.2501, + "step": 3097 + }, + { + "epoch": 0.4632869747270824, + "grad_norm": 1.727710286265107, + "learning_rate": 5.834034765819489e-06, + "loss": 0.5367, + "step": 3098 + }, + { + "epoch": 0.46343651861821444, + "grad_norm": 1.4761492716100206, + "learning_rate": 5.831646772915651e-06, + "loss": 0.3464, + "step": 3099 + }, + { + "epoch": 0.4635860625093465, + "grad_norm": 1.2203558008080015, + "learning_rate": 5.829258584899464e-06, + "loss": 0.1696, + "step": 3100 + }, + { + "epoch": 0.46373560640047856, + "grad_norm": 2.010676119341004, + "learning_rate": 5.826870202331221e-06, + "loss": 0.373, + "step": 3101 + }, + { + "epoch": 0.4638851502916106, + "grad_norm": 1.691843238482411, + "learning_rate": 5.824481625771262e-06, + "loss": 0.3301, + "step": 3102 + }, + { + "epoch": 0.4640346941827426, + "grad_norm": 1.1543076074169056, + "learning_rate": 5.822092855779966e-06, + "loss": 0.1873, + "step": 3103 + }, + { + "epoch": 0.46418423807387466, + "grad_norm": 1.2233599414089642, + "learning_rate": 5.819703892917763e-06, + "loss": 0.1865, + "step": 3104 + }, + { + "epoch": 0.46433378196500674, + "grad_norm": 2.210291468184369, + "learning_rate": 5.817314737745127e-06, + "loss": 0.4183, + "step": 3105 + }, + { + "epoch": 0.4644833258561388, + "grad_norm": 1.4383760211249257, + "learning_rate": 5.8149253908225766e-06, + "loss": 0.2193, + "step": 3106 + }, + { + "epoch": 0.4646328697472708, + "grad_norm": 1.3428933717745177, + "learning_rate": 5.812535852710677e-06, + "loss": 0.4662, + "step": 3107 + }, + { + "epoch": 0.4647824136384029, + "grad_norm": 1.523129681520151, + "learning_rate": 5.810146123970034e-06, + "loss": 0.5232, + "step": 3108 + }, + { + "epoch": 0.4649319575295349, + "grad_norm": 1.378941458503838, + "learning_rate": 5.807756205161304e-06, + "loss": 0.1905, + "step": 3109 + }, + { + "epoch": 0.46508150142066695, + "grad_norm": 2.2157282770241844, + "learning_rate": 5.8053660968451815e-06, + "loss": 0.3278, + "step": 3110 + }, + { + "epoch": 0.46523104531179904, + "grad_norm": 2.1780194356833182, + "learning_rate": 5.802975799582413e-06, + "loss": 0.4822, + "step": 3111 + }, + { + "epoch": 0.46538058920293107, + "grad_norm": 1.1575081784389918, + "learning_rate": 5.800585313933784e-06, + "loss": 0.1978, + "step": 3112 + }, + { + "epoch": 0.4655301330940631, + "grad_norm": 1.9490148059207726, + "learning_rate": 5.798194640460124e-06, + "loss": 0.5377, + "step": 3113 + }, + { + "epoch": 0.46567967698519513, + "grad_norm": 1.0308478611703875, + "learning_rate": 5.795803779722309e-06, + "loss": 0.1269, + "step": 3114 + }, + { + "epoch": 0.4658292208763272, + "grad_norm": 1.456905452881718, + "learning_rate": 5.793412732281258e-06, + "loss": 0.2402, + "step": 3115 + }, + { + "epoch": 0.46597876476745925, + "grad_norm": 1.865438890479385, + "learning_rate": 5.791021498697933e-06, + "loss": 0.2259, + "step": 3116 + }, + { + "epoch": 0.4661283086585913, + "grad_norm": 1.5482682061442332, + "learning_rate": 5.7886300795333405e-06, + "loss": 0.2349, + "step": 3117 + }, + { + "epoch": 0.46627785254972337, + "grad_norm": 1.168054204385437, + "learning_rate": 5.786238475348531e-06, + "loss": 0.1822, + "step": 3118 + }, + { + "epoch": 0.4664273964408554, + "grad_norm": 1.5800515492094007, + "learning_rate": 5.7838466867046e-06, + "loss": 0.2249, + "step": 3119 + }, + { + "epoch": 0.46657694033198743, + "grad_norm": 1.585867798029898, + "learning_rate": 5.78145471416268e-06, + "loss": 0.3313, + "step": 3120 + }, + { + "epoch": 0.46672648422311946, + "grad_norm": 1.2912421381268824, + "learning_rate": 5.779062558283954e-06, + "loss": 0.1931, + "step": 3121 + }, + { + "epoch": 0.46687602811425155, + "grad_norm": 1.303872839200776, + "learning_rate": 5.776670219629643e-06, + "loss": 0.2294, + "step": 3122 + }, + { + "epoch": 0.4670255720053836, + "grad_norm": 1.32116962155212, + "learning_rate": 5.7742776987610135e-06, + "loss": 0.2108, + "step": 3123 + }, + { + "epoch": 0.4671751158965156, + "grad_norm": 1.0010399849500764, + "learning_rate": 5.771884996239373e-06, + "loss": 0.154, + "step": 3124 + }, + { + "epoch": 0.4673246597876477, + "grad_norm": 1.4031763925464884, + "learning_rate": 5.7694921126260765e-06, + "loss": 0.2299, + "step": 3125 + }, + { + "epoch": 0.46747420367877973, + "grad_norm": 1.9032263969945602, + "learning_rate": 5.767099048482512e-06, + "loss": 0.2426, + "step": 3126 + }, + { + "epoch": 0.46762374756991176, + "grad_norm": 1.0697134527322825, + "learning_rate": 5.764705804370119e-06, + "loss": 0.2073, + "step": 3127 + }, + { + "epoch": 0.4677732914610438, + "grad_norm": 1.42472795223601, + "learning_rate": 5.762312380850373e-06, + "loss": 0.3811, + "step": 3128 + }, + { + "epoch": 0.4679228353521759, + "grad_norm": 1.312885115809687, + "learning_rate": 5.759918778484797e-06, + "loss": 0.2057, + "step": 3129 + }, + { + "epoch": 0.4680723792433079, + "grad_norm": 1.5752141659312524, + "learning_rate": 5.757524997834951e-06, + "loss": 0.2032, + "step": 3130 + }, + { + "epoch": 0.46822192313443994, + "grad_norm": 1.9755154425978854, + "learning_rate": 5.755131039462441e-06, + "loss": 0.2944, + "step": 3131 + }, + { + "epoch": 0.46837146702557203, + "grad_norm": 1.5857308751132788, + "learning_rate": 5.752736903928912e-06, + "loss": 0.3172, + "step": 3132 + }, + { + "epoch": 0.46852101091670406, + "grad_norm": 1.276569604328609, + "learning_rate": 5.75034259179605e-06, + "loss": 0.1794, + "step": 3133 + }, + { + "epoch": 0.4686705548078361, + "grad_norm": 1.4874225477081564, + "learning_rate": 5.747948103625585e-06, + "loss": 0.3464, + "step": 3134 + }, + { + "epoch": 0.4688200986989681, + "grad_norm": 1.8677226819900095, + "learning_rate": 5.745553439979287e-06, + "loss": 0.3892, + "step": 3135 + }, + { + "epoch": 0.4689696425901002, + "grad_norm": 1.9234557105998888, + "learning_rate": 5.743158601418967e-06, + "loss": 0.3449, + "step": 3136 + }, + { + "epoch": 0.46911918648123224, + "grad_norm": 1.4122630414281268, + "learning_rate": 5.740763588506475e-06, + "loss": 0.2106, + "step": 3137 + }, + { + "epoch": 0.4692687303723643, + "grad_norm": 2.120747227051215, + "learning_rate": 5.738368401803708e-06, + "loss": 0.4917, + "step": 3138 + }, + { + "epoch": 0.46941827426349636, + "grad_norm": 1.5280456344885767, + "learning_rate": 5.735973041872597e-06, + "loss": 0.2088, + "step": 3139 + }, + { + "epoch": 0.4695678181546284, + "grad_norm": 2.1695026435655715, + "learning_rate": 5.733577509275119e-06, + "loss": 0.5766, + "step": 3140 + }, + { + "epoch": 0.4697173620457604, + "grad_norm": 1.9039804361985175, + "learning_rate": 5.731181804573286e-06, + "loss": 0.3801, + "step": 3141 + }, + { + "epoch": 0.46986690593689245, + "grad_norm": 1.6142863728603896, + "learning_rate": 5.728785928329157e-06, + "loss": 0.261, + "step": 3142 + }, + { + "epoch": 0.47001644982802454, + "grad_norm": 1.6235559241321793, + "learning_rate": 5.726389881104824e-06, + "loss": 0.3315, + "step": 3143 + }, + { + "epoch": 0.47016599371915657, + "grad_norm": 2.715507431689836, + "learning_rate": 5.7239936634624225e-06, + "loss": 0.3486, + "step": 3144 + }, + { + "epoch": 0.4703155376102886, + "grad_norm": 1.8605569560890904, + "learning_rate": 5.7215972759641335e-06, + "loss": 0.4886, + "step": 3145 + }, + { + "epoch": 0.4704650815014207, + "grad_norm": 1.410091594154271, + "learning_rate": 5.719200719172168e-06, + "loss": 0.2126, + "step": 3146 + }, + { + "epoch": 0.4706146253925527, + "grad_norm": 1.2071331191053063, + "learning_rate": 5.716803993648784e-06, + "loss": 0.242, + "step": 3147 + }, + { + "epoch": 0.47076416928368475, + "grad_norm": 1.743927253751831, + "learning_rate": 5.7144070999562726e-06, + "loss": 0.4621, + "step": 3148 + }, + { + "epoch": 0.4709137131748168, + "grad_norm": 1.1162822964236923, + "learning_rate": 5.712010038656972e-06, + "loss": 0.1878, + "step": 3149 + }, + { + "epoch": 0.47106325706594887, + "grad_norm": 1.4397665068284078, + "learning_rate": 5.709612810313253e-06, + "loss": 0.244, + "step": 3150 + }, + { + "epoch": 0.4712128009570809, + "grad_norm": 1.2689183560373911, + "learning_rate": 5.707215415487532e-06, + "loss": 0.2142, + "step": 3151 + }, + { + "epoch": 0.47136234484821293, + "grad_norm": 1.2429339367434729, + "learning_rate": 5.704817854742257e-06, + "loss": 0.2077, + "step": 3152 + }, + { + "epoch": 0.471511888739345, + "grad_norm": 1.4973747904307906, + "learning_rate": 5.7024201286399215e-06, + "loss": 0.1525, + "step": 3153 + }, + { + "epoch": 0.47166143263047705, + "grad_norm": 1.5335821278579143, + "learning_rate": 5.700022237743055e-06, + "loss": 0.2315, + "step": 3154 + }, + { + "epoch": 0.4718109765216091, + "grad_norm": 1.8352653937040397, + "learning_rate": 5.697624182614225e-06, + "loss": 0.4395, + "step": 3155 + }, + { + "epoch": 0.4719605204127411, + "grad_norm": 1.9162160008419673, + "learning_rate": 5.695225963816038e-06, + "loss": 0.4633, + "step": 3156 + }, + { + "epoch": 0.4721100643038732, + "grad_norm": 1.3144121886364732, + "learning_rate": 5.692827581911139e-06, + "loss": 0.3179, + "step": 3157 + }, + { + "epoch": 0.47225960819500523, + "grad_norm": 1.6775962097688284, + "learning_rate": 5.690429037462213e-06, + "loss": 0.3079, + "step": 3158 + }, + { + "epoch": 0.47240915208613726, + "grad_norm": 1.5606712917150614, + "learning_rate": 5.688030331031981e-06, + "loss": 0.391, + "step": 3159 + }, + { + "epoch": 0.47255869597726935, + "grad_norm": 1.8951438306712434, + "learning_rate": 5.685631463183204e-06, + "loss": 0.238, + "step": 3160 + }, + { + "epoch": 0.4727082398684014, + "grad_norm": 1.4421516858300434, + "learning_rate": 5.683232434478675e-06, + "loss": 0.3104, + "step": 3161 + }, + { + "epoch": 0.4728577837595334, + "grad_norm": 1.516615298295563, + "learning_rate": 5.680833245481234e-06, + "loss": 0.2329, + "step": 3162 + }, + { + "epoch": 0.47300732765066544, + "grad_norm": 1.4316839755417867, + "learning_rate": 5.6784338967537524e-06, + "loss": 0.1996, + "step": 3163 + }, + { + "epoch": 0.47315687154179753, + "grad_norm": 1.426150093967574, + "learning_rate": 5.67603438885914e-06, + "loss": 0.2305, + "step": 3164 + }, + { + "epoch": 0.47330641543292956, + "grad_norm": 1.9888683194342018, + "learning_rate": 5.673634722360344e-06, + "loss": 0.5061, + "step": 3165 + }, + { + "epoch": 0.4734559593240616, + "grad_norm": 1.364501688300759, + "learning_rate": 5.6712348978203504e-06, + "loss": 0.2131, + "step": 3166 + }, + { + "epoch": 0.4736055032151937, + "grad_norm": 1.7957506725761962, + "learning_rate": 5.66883491580218e-06, + "loss": 0.3892, + "step": 3167 + }, + { + "epoch": 0.4737550471063257, + "grad_norm": 1.6274375743803688, + "learning_rate": 5.666434776868895e-06, + "loss": 0.2701, + "step": 3168 + }, + { + "epoch": 0.47390459099745774, + "grad_norm": 1.4211736786097835, + "learning_rate": 5.6640344815835866e-06, + "loss": 0.1947, + "step": 3169 + }, + { + "epoch": 0.47405413488858983, + "grad_norm": 1.8576338132648311, + "learning_rate": 5.6616340305093905e-06, + "loss": 0.3133, + "step": 3170 + }, + { + "epoch": 0.47420367877972186, + "grad_norm": 1.8516954583504217, + "learning_rate": 5.659233424209474e-06, + "loss": 0.3246, + "step": 3171 + }, + { + "epoch": 0.4743532226708539, + "grad_norm": 1.5765166821357868, + "learning_rate": 5.6568326632470424e-06, + "loss": 0.288, + "step": 3172 + }, + { + "epoch": 0.4745027665619859, + "grad_norm": 1.5049818589318757, + "learning_rate": 5.654431748185339e-06, + "loss": 0.2027, + "step": 3173 + }, + { + "epoch": 0.474652310453118, + "grad_norm": 1.6563418650791573, + "learning_rate": 5.65203067958764e-06, + "loss": 0.3274, + "step": 3174 + }, + { + "epoch": 0.47480185434425004, + "grad_norm": 1.4060135417396495, + "learning_rate": 5.649629458017261e-06, + "loss": 0.1846, + "step": 3175 + }, + { + "epoch": 0.47495139823538207, + "grad_norm": 1.2055453379363343, + "learning_rate": 5.647228084037548e-06, + "loss": 0.1944, + "step": 3176 + }, + { + "epoch": 0.47510094212651416, + "grad_norm": 1.32347197279457, + "learning_rate": 5.644826558211889e-06, + "loss": 0.2127, + "step": 3177 + }, + { + "epoch": 0.4752504860176462, + "grad_norm": 1.847868151137175, + "learning_rate": 5.642424881103706e-06, + "loss": 0.5842, + "step": 3178 + }, + { + "epoch": 0.4754000299087782, + "grad_norm": 1.674408388100392, + "learning_rate": 5.6400230532764545e-06, + "loss": 0.3812, + "step": 3179 + }, + { + "epoch": 0.47554957379991025, + "grad_norm": 1.757199701941517, + "learning_rate": 5.637621075293627e-06, + "loss": 0.4498, + "step": 3180 + }, + { + "epoch": 0.47569911769104234, + "grad_norm": 1.5559941460663576, + "learning_rate": 5.63521894771875e-06, + "loss": 0.2237, + "step": 3181 + }, + { + "epoch": 0.47584866158217437, + "grad_norm": 1.5462633932338483, + "learning_rate": 5.632816671115385e-06, + "loss": 0.2413, + "step": 3182 + }, + { + "epoch": 0.4759982054733064, + "grad_norm": 1.5498398967317741, + "learning_rate": 5.630414246047132e-06, + "loss": 0.3177, + "step": 3183 + }, + { + "epoch": 0.4761477493644385, + "grad_norm": 1.6167447628978575, + "learning_rate": 5.62801167307762e-06, + "loss": 0.2376, + "step": 3184 + }, + { + "epoch": 0.4762972932555705, + "grad_norm": 1.4919899689914684, + "learning_rate": 5.625608952770518e-06, + "loss": 0.1693, + "step": 3185 + }, + { + "epoch": 0.47644683714670255, + "grad_norm": 1.5067704088223868, + "learning_rate": 5.623206085689525e-06, + "loss": 0.2136, + "step": 3186 + }, + { + "epoch": 0.4765963810378346, + "grad_norm": 1.1449557548932223, + "learning_rate": 5.62080307239838e-06, + "loss": 0.1582, + "step": 3187 + }, + { + "epoch": 0.47674592492896667, + "grad_norm": 1.3257476397757155, + "learning_rate": 5.6183999134608514e-06, + "loss": 0.3345, + "step": 3188 + }, + { + "epoch": 0.4768954688200987, + "grad_norm": 1.4360228664127825, + "learning_rate": 5.6159966094407426e-06, + "loss": 0.2616, + "step": 3189 + }, + { + "epoch": 0.47704501271123073, + "grad_norm": 0.9734180829666479, + "learning_rate": 5.613593160901893e-06, + "loss": 0.1593, + "step": 3190 + }, + { + "epoch": 0.4771945566023628, + "grad_norm": 1.4248599328670821, + "learning_rate": 5.611189568408173e-06, + "loss": 0.1703, + "step": 3191 + }, + { + "epoch": 0.47734410049349485, + "grad_norm": 1.0590562637029552, + "learning_rate": 5.60878583252349e-06, + "loss": 0.2125, + "step": 3192 + }, + { + "epoch": 0.4774936443846269, + "grad_norm": 1.4611566756560919, + "learning_rate": 5.606381953811786e-06, + "loss": 0.3466, + "step": 3193 + }, + { + "epoch": 0.4776431882757589, + "grad_norm": 1.3346184917090114, + "learning_rate": 5.60397793283703e-06, + "loss": 0.2076, + "step": 3194 + }, + { + "epoch": 0.477792732166891, + "grad_norm": 1.566654209478883, + "learning_rate": 5.601573770163231e-06, + "loss": 0.3383, + "step": 3195 + }, + { + "epoch": 0.47794227605802303, + "grad_norm": 1.5414225158374537, + "learning_rate": 5.599169466354427e-06, + "loss": 0.3466, + "step": 3196 + }, + { + "epoch": 0.47809181994915506, + "grad_norm": 1.4350966493610382, + "learning_rate": 5.596765021974693e-06, + "loss": 0.2875, + "step": 3197 + }, + { + "epoch": 0.47824136384028715, + "grad_norm": 1.689845874095469, + "learning_rate": 5.594360437588132e-06, + "loss": 0.426, + "step": 3198 + }, + { + "epoch": 0.4783909077314192, + "grad_norm": 1.0977797048520335, + "learning_rate": 5.591955713758885e-06, + "loss": 0.2392, + "step": 3199 + }, + { + "epoch": 0.4785404516225512, + "grad_norm": 1.7612756142198773, + "learning_rate": 5.589550851051123e-06, + "loss": 0.5625, + "step": 3200 + }, + { + "epoch": 0.47868999551368324, + "grad_norm": 1.9558696306956667, + "learning_rate": 5.58714585002905e-06, + "loss": 0.4112, + "step": 3201 + }, + { + "epoch": 0.47883953940481533, + "grad_norm": 1.6349407082918148, + "learning_rate": 5.584740711256901e-06, + "loss": 0.2047, + "step": 3202 + }, + { + "epoch": 0.47898908329594736, + "grad_norm": 0.9418582681860653, + "learning_rate": 5.5823354352989465e-06, + "loss": 0.1597, + "step": 3203 + }, + { + "epoch": 0.4791386271870794, + "grad_norm": 1.4766455127450906, + "learning_rate": 5.579930022719486e-06, + "loss": 0.2793, + "step": 3204 + }, + { + "epoch": 0.4792881710782115, + "grad_norm": 1.654406135220705, + "learning_rate": 5.577524474082853e-06, + "loss": 0.2065, + "step": 3205 + }, + { + "epoch": 0.4794377149693435, + "grad_norm": 1.6127674887270247, + "learning_rate": 5.575118789953414e-06, + "loss": 0.3119, + "step": 3206 + }, + { + "epoch": 0.47958725886047554, + "grad_norm": 1.3677004533072454, + "learning_rate": 5.572712970895563e-06, + "loss": 0.2697, + "step": 3207 + }, + { + "epoch": 0.47973680275160757, + "grad_norm": 1.3293568676356242, + "learning_rate": 5.5703070174737305e-06, + "loss": 0.3024, + "step": 3208 + }, + { + "epoch": 0.47988634664273966, + "grad_norm": 1.5812284057258443, + "learning_rate": 5.567900930252375e-06, + "loss": 0.2966, + "step": 3209 + }, + { + "epoch": 0.4800358905338717, + "grad_norm": 1.3637746072600263, + "learning_rate": 5.565494709795988e-06, + "loss": 0.2399, + "step": 3210 + }, + { + "epoch": 0.4801854344250037, + "grad_norm": 1.2403212239051626, + "learning_rate": 5.5630883566690915e-06, + "loss": 0.2032, + "step": 3211 + }, + { + "epoch": 0.4803349783161358, + "grad_norm": 1.3275458152883233, + "learning_rate": 5.5606818714362406e-06, + "loss": 0.1753, + "step": 3212 + }, + { + "epoch": 0.48048452220726784, + "grad_norm": 1.6827685464628404, + "learning_rate": 5.55827525466202e-06, + "loss": 0.3836, + "step": 3213 + }, + { + "epoch": 0.48063406609839987, + "grad_norm": 1.5010871722356298, + "learning_rate": 5.5558685069110444e-06, + "loss": 0.3594, + "step": 3214 + }, + { + "epoch": 0.4807836099895319, + "grad_norm": 1.7500323777950166, + "learning_rate": 5.5534616287479585e-06, + "loss": 0.4294, + "step": 3215 + }, + { + "epoch": 0.480933153880664, + "grad_norm": 1.3283210890130643, + "learning_rate": 5.5510546207374415e-06, + "loss": 0.26, + "step": 3216 + }, + { + "epoch": 0.481082697771796, + "grad_norm": 1.6922301032567173, + "learning_rate": 5.548647483444199e-06, + "loss": 0.2203, + "step": 3217 + }, + { + "epoch": 0.48123224166292805, + "grad_norm": 1.3062615721477455, + "learning_rate": 5.54624021743297e-06, + "loss": 0.3097, + "step": 3218 + }, + { + "epoch": 0.48138178555406014, + "grad_norm": 1.346469662217694, + "learning_rate": 5.543832823268522e-06, + "loss": 0.2069, + "step": 3219 + }, + { + "epoch": 0.48153132944519217, + "grad_norm": 0.9633365182885122, + "learning_rate": 5.541425301515652e-06, + "loss": 0.1913, + "step": 3220 + }, + { + "epoch": 0.4816808733363242, + "grad_norm": 1.9220717530990858, + "learning_rate": 5.5390176527391895e-06, + "loss": 0.4657, + "step": 3221 + }, + { + "epoch": 0.48183041722745623, + "grad_norm": 1.5111193343359333, + "learning_rate": 5.536609877503989e-06, + "loss": 0.3328, + "step": 3222 + }, + { + "epoch": 0.4819799611185883, + "grad_norm": 1.952680430360637, + "learning_rate": 5.534201976374941e-06, + "loss": 0.5342, + "step": 3223 + }, + { + "epoch": 0.48212950500972035, + "grad_norm": 1.5983344593704398, + "learning_rate": 5.531793949916961e-06, + "loss": 0.2425, + "step": 3224 + }, + { + "epoch": 0.4822790489008524, + "grad_norm": 1.8572599659337972, + "learning_rate": 5.529385798694995e-06, + "loss": 0.5123, + "step": 3225 + }, + { + "epoch": 0.48242859279198447, + "grad_norm": 1.424752198495054, + "learning_rate": 5.526977523274017e-06, + "loss": 0.1894, + "step": 3226 + }, + { + "epoch": 0.4825781366831165, + "grad_norm": 1.3362880994059592, + "learning_rate": 5.524569124219036e-06, + "loss": 0.2178, + "step": 3227 + }, + { + "epoch": 0.48272768057424853, + "grad_norm": 1.5846787140906795, + "learning_rate": 5.522160602095082e-06, + "loss": 0.3475, + "step": 3228 + }, + { + "epoch": 0.4828772244653806, + "grad_norm": 1.9665234251579087, + "learning_rate": 5.5197519574672185e-06, + "loss": 0.5254, + "step": 3229 + }, + { + "epoch": 0.48302676835651265, + "grad_norm": 1.2406935735339304, + "learning_rate": 5.517343190900537e-06, + "loss": 0.196, + "step": 3230 + }, + { + "epoch": 0.4831763122476447, + "grad_norm": 1.2840122911547907, + "learning_rate": 5.514934302960156e-06, + "loss": 0.3584, + "step": 3231 + }, + { + "epoch": 0.4833258561387767, + "grad_norm": 1.7114869197966367, + "learning_rate": 5.512525294211226e-06, + "loss": 0.5087, + "step": 3232 + }, + { + "epoch": 0.4834754000299088, + "grad_norm": 1.6167473170926372, + "learning_rate": 5.5101161652189225e-06, + "loss": 0.3207, + "step": 3233 + }, + { + "epoch": 0.48362494392104083, + "grad_norm": 1.1172175670125728, + "learning_rate": 5.507706916548451e-06, + "loss": 0.197, + "step": 3234 + }, + { + "epoch": 0.48377448781217286, + "grad_norm": 1.0454088270739283, + "learning_rate": 5.505297548765043e-06, + "loss": 0.244, + "step": 3235 + }, + { + "epoch": 0.48392403170330495, + "grad_norm": 2.1427540459127905, + "learning_rate": 5.502888062433961e-06, + "loss": 0.401, + "step": 3236 + }, + { + "epoch": 0.484073575594437, + "grad_norm": 1.483691258814316, + "learning_rate": 5.500478458120493e-06, + "loss": 0.3204, + "step": 3237 + }, + { + "epoch": 0.484223119485569, + "grad_norm": 1.4985182367477778, + "learning_rate": 5.4980687363899545e-06, + "loss": 0.1841, + "step": 3238 + }, + { + "epoch": 0.48437266337670104, + "grad_norm": 1.5306187126370105, + "learning_rate": 5.495658897807691e-06, + "loss": 0.3307, + "step": 3239 + }, + { + "epoch": 0.4845222072678331, + "grad_norm": 1.9059741922872409, + "learning_rate": 5.4932489429390725e-06, + "loss": 0.4811, + "step": 3240 + }, + { + "epoch": 0.48467175115896516, + "grad_norm": 1.5997735336338483, + "learning_rate": 5.4908388723495e-06, + "loss": 0.4974, + "step": 3241 + }, + { + "epoch": 0.4848212950500972, + "grad_norm": 1.1574319053701934, + "learning_rate": 5.488428686604397e-06, + "loss": 0.1849, + "step": 3242 + }, + { + "epoch": 0.4849708389412293, + "grad_norm": 1.7158993652318315, + "learning_rate": 5.486018386269216e-06, + "loss": 0.1957, + "step": 3243 + }, + { + "epoch": 0.4851203828323613, + "grad_norm": 1.3803613849267424, + "learning_rate": 5.4836079719094395e-06, + "loss": 0.229, + "step": 3244 + }, + { + "epoch": 0.48526992672349334, + "grad_norm": 1.40606103461945, + "learning_rate": 5.4811974440905705e-06, + "loss": 0.1864, + "step": 3245 + }, + { + "epoch": 0.48541947061462537, + "grad_norm": 0.9610421038503347, + "learning_rate": 5.478786803378143e-06, + "loss": 0.2373, + "step": 3246 + }, + { + "epoch": 0.48556901450575746, + "grad_norm": 1.415230316232099, + "learning_rate": 5.476376050337719e-06, + "loss": 0.3828, + "step": 3247 + }, + { + "epoch": 0.4857185583968895, + "grad_norm": 2.307432712150402, + "learning_rate": 5.473965185534882e-06, + "loss": 0.3894, + "step": 3248 + }, + { + "epoch": 0.4858681022880215, + "grad_norm": 1.4880888031014317, + "learning_rate": 5.471554209535244e-06, + "loss": 0.2822, + "step": 3249 + }, + { + "epoch": 0.4860176461791536, + "grad_norm": 1.2863532256194963, + "learning_rate": 5.469143122904444e-06, + "loss": 0.2129, + "step": 3250 + }, + { + "epoch": 0.48616719007028564, + "grad_norm": 1.6475188307887112, + "learning_rate": 5.466731926208148e-06, + "loss": 0.2088, + "step": 3251 + }, + { + "epoch": 0.48631673396141767, + "grad_norm": 1.5875387799039424, + "learning_rate": 5.46432062001204e-06, + "loss": 0.3401, + "step": 3252 + }, + { + "epoch": 0.4864662778525497, + "grad_norm": 1.8220640993048132, + "learning_rate": 5.461909204881842e-06, + "loss": 0.2542, + "step": 3253 + }, + { + "epoch": 0.4866158217436818, + "grad_norm": 1.233777671255329, + "learning_rate": 5.459497681383295e-06, + "loss": 0.2139, + "step": 3254 + }, + { + "epoch": 0.4867653656348138, + "grad_norm": 1.734386302292572, + "learning_rate": 5.457086050082161e-06, + "loss": 0.2322, + "step": 3255 + }, + { + "epoch": 0.48691490952594585, + "grad_norm": 1.2129989858188304, + "learning_rate": 5.454674311544236e-06, + "loss": 0.2004, + "step": 3256 + }, + { + "epoch": 0.48706445341707794, + "grad_norm": 1.5232752004392356, + "learning_rate": 5.4522624663353356e-06, + "loss": 0.3088, + "step": 3257 + }, + { + "epoch": 0.48721399730820997, + "grad_norm": 1.0342508423040848, + "learning_rate": 5.4498505150213e-06, + "loss": 0.19, + "step": 3258 + }, + { + "epoch": 0.487363541199342, + "grad_norm": 1.2149206479073549, + "learning_rate": 5.447438458168e-06, + "loss": 0.1917, + "step": 3259 + }, + { + "epoch": 0.48751308509047403, + "grad_norm": 1.8186709488406252, + "learning_rate": 5.445026296341325e-06, + "loss": 0.4124, + "step": 3260 + }, + { + "epoch": 0.4876626289816061, + "grad_norm": 1.4871462802100281, + "learning_rate": 5.442614030107192e-06, + "loss": 0.3458, + "step": 3261 + }, + { + "epoch": 0.48781217287273815, + "grad_norm": 1.3202962648646308, + "learning_rate": 5.440201660031542e-06, + "loss": 0.3653, + "step": 3262 + }, + { + "epoch": 0.4879617167638702, + "grad_norm": 2.336083852527424, + "learning_rate": 5.4377891866803405e-06, + "loss": 0.2729, + "step": 3263 + }, + { + "epoch": 0.48811126065500227, + "grad_norm": 1.2930049790322846, + "learning_rate": 5.435376610619577e-06, + "loss": 0.2439, + "step": 3264 + }, + { + "epoch": 0.4882608045461343, + "grad_norm": 1.557892290174439, + "learning_rate": 5.432963932415264e-06, + "loss": 0.3053, + "step": 3265 + }, + { + "epoch": 0.48841034843726633, + "grad_norm": 1.3905071948011742, + "learning_rate": 5.430551152633438e-06, + "loss": 0.214, + "step": 3266 + }, + { + "epoch": 0.48855989232839836, + "grad_norm": 1.8231907139006023, + "learning_rate": 5.428138271840165e-06, + "loss": 0.4806, + "step": 3267 + }, + { + "epoch": 0.48870943621953045, + "grad_norm": 1.5732546852605194, + "learning_rate": 5.425725290601527e-06, + "loss": 0.3985, + "step": 3268 + }, + { + "epoch": 0.4888589801106625, + "grad_norm": 2.677639259079234, + "learning_rate": 5.423312209483632e-06, + "loss": 0.4894, + "step": 3269 + }, + { + "epoch": 0.4890085240017945, + "grad_norm": 1.3541375970165608, + "learning_rate": 5.420899029052614e-06, + "loss": 0.1714, + "step": 3270 + }, + { + "epoch": 0.4891580678929266, + "grad_norm": 1.7576357519511228, + "learning_rate": 5.418485749874628e-06, + "loss": 0.2243, + "step": 3271 + }, + { + "epoch": 0.4893076117840586, + "grad_norm": 1.4302229998846714, + "learning_rate": 5.41607237251585e-06, + "loss": 0.3575, + "step": 3272 + }, + { + "epoch": 0.48945715567519066, + "grad_norm": 1.357123434068504, + "learning_rate": 5.413658897542483e-06, + "loss": 0.2934, + "step": 3273 + }, + { + "epoch": 0.4896066995663227, + "grad_norm": 1.620224094370209, + "learning_rate": 5.411245325520754e-06, + "loss": 0.3828, + "step": 3274 + }, + { + "epoch": 0.4897562434574548, + "grad_norm": 1.4222644545024612, + "learning_rate": 5.408831657016908e-06, + "loss": 0.2534, + "step": 3275 + }, + { + "epoch": 0.4899057873485868, + "grad_norm": 1.6038088280717635, + "learning_rate": 5.4064178925972155e-06, + "loss": 0.3653, + "step": 3276 + }, + { + "epoch": 0.49005533123971884, + "grad_norm": 6.225442055629013, + "learning_rate": 5.4040040328279684e-06, + "loss": 0.4034, + "step": 3277 + }, + { + "epoch": 0.4902048751308509, + "grad_norm": 1.3073622659475577, + "learning_rate": 5.40159007827548e-06, + "loss": 0.2864, + "step": 3278 + }, + { + "epoch": 0.49035441902198296, + "grad_norm": 2.0634805595333963, + "learning_rate": 5.399176029506091e-06, + "loss": 0.3318, + "step": 3279 + }, + { + "epoch": 0.490503962913115, + "grad_norm": 1.6410728978525977, + "learning_rate": 5.396761887086157e-06, + "loss": 0.3503, + "step": 3280 + }, + { + "epoch": 0.490653506804247, + "grad_norm": 1.1385654609121518, + "learning_rate": 5.394347651582063e-06, + "loss": 0.1922, + "step": 3281 + }, + { + "epoch": 0.4908030506953791, + "grad_norm": 1.7005263723270547, + "learning_rate": 5.391933323560208e-06, + "loss": 0.441, + "step": 3282 + }, + { + "epoch": 0.49095259458651114, + "grad_norm": 1.770272350276855, + "learning_rate": 5.389518903587016e-06, + "loss": 0.3787, + "step": 3283 + }, + { + "epoch": 0.49110213847764317, + "grad_norm": 1.3506689259925078, + "learning_rate": 5.387104392228939e-06, + "loss": 0.2533, + "step": 3284 + }, + { + "epoch": 0.49125168236877526, + "grad_norm": 1.3898273923789393, + "learning_rate": 5.384689790052439e-06, + "loss": 0.3693, + "step": 3285 + }, + { + "epoch": 0.4914012262599073, + "grad_norm": 1.9579535041564384, + "learning_rate": 5.3822750976240065e-06, + "loss": 0.4891, + "step": 3286 + }, + { + "epoch": 0.4915507701510393, + "grad_norm": 1.6436155900647393, + "learning_rate": 5.379860315510153e-06, + "loss": 0.441, + "step": 3287 + }, + { + "epoch": 0.4917003140421714, + "grad_norm": 1.5550148010776086, + "learning_rate": 5.377445444277407e-06, + "loss": 0.3143, + "step": 3288 + }, + { + "epoch": 0.49184985793330344, + "grad_norm": 1.849270778547779, + "learning_rate": 5.375030484492324e-06, + "loss": 0.2501, + "step": 3289 + }, + { + "epoch": 0.49199940182443547, + "grad_norm": 1.5318913479060694, + "learning_rate": 5.372615436721474e-06, + "loss": 0.3657, + "step": 3290 + }, + { + "epoch": 0.4921489457155675, + "grad_norm": 2.1556436681340028, + "learning_rate": 5.370200301531453e-06, + "loss": 0.541, + "step": 3291 + }, + { + "epoch": 0.4922984896066996, + "grad_norm": 1.5606790446050516, + "learning_rate": 5.3677850794888704e-06, + "loss": 0.3051, + "step": 3292 + }, + { + "epoch": 0.4924480334978316, + "grad_norm": 1.384057987332389, + "learning_rate": 5.365369771160366e-06, + "loss": 0.1973, + "step": 3293 + }, + { + "epoch": 0.49259757738896365, + "grad_norm": 1.15291598303946, + "learning_rate": 5.362954377112593e-06, + "loss": 0.1559, + "step": 3294 + }, + { + "epoch": 0.49274712128009573, + "grad_norm": 1.265068756127247, + "learning_rate": 5.360538897912223e-06, + "loss": 0.2327, + "step": 3295 + }, + { + "epoch": 0.49289666517122777, + "grad_norm": 1.6658809938402388, + "learning_rate": 5.358123334125953e-06, + "loss": 0.3179, + "step": 3296 + }, + { + "epoch": 0.4930462090623598, + "grad_norm": 1.8651590421952458, + "learning_rate": 5.355707686320499e-06, + "loss": 0.3796, + "step": 3297 + }, + { + "epoch": 0.49319575295349183, + "grad_norm": 2.0781278430947956, + "learning_rate": 5.353291955062594e-06, + "loss": 0.3463, + "step": 3298 + }, + { + "epoch": 0.4933452968446239, + "grad_norm": 1.3667567419203674, + "learning_rate": 5.350876140918989e-06, + "loss": 0.1763, + "step": 3299 + }, + { + "epoch": 0.49349484073575595, + "grad_norm": 1.5793291775977985, + "learning_rate": 5.348460244456462e-06, + "loss": 0.3639, + "step": 3300 + }, + { + "epoch": 0.493644384626888, + "grad_norm": 1.436284245662927, + "learning_rate": 5.346044266241802e-06, + "loss": 0.2976, + "step": 3301 + }, + { + "epoch": 0.49379392851802006, + "grad_norm": 1.573325582954275, + "learning_rate": 5.3436282068418245e-06, + "loss": 0.3671, + "step": 3302 + }, + { + "epoch": 0.4939434724091521, + "grad_norm": 1.4650670097603558, + "learning_rate": 5.341212066823356e-06, + "loss": 0.197, + "step": 3303 + }, + { + "epoch": 0.4940930163002841, + "grad_norm": 1.9818504468304579, + "learning_rate": 5.338795846753249e-06, + "loss": 0.2051, + "step": 3304 + }, + { + "epoch": 0.49424256019141616, + "grad_norm": 2.0426688110857087, + "learning_rate": 5.336379547198371e-06, + "loss": 0.4497, + "step": 3305 + }, + { + "epoch": 0.49439210408254824, + "grad_norm": 1.9574028012684885, + "learning_rate": 5.3339631687256085e-06, + "loss": 0.4708, + "step": 3306 + }, + { + "epoch": 0.4945416479736803, + "grad_norm": 1.575961759570781, + "learning_rate": 5.331546711901869e-06, + "loss": 0.3804, + "step": 3307 + }, + { + "epoch": 0.4946911918648123, + "grad_norm": 1.414716802354248, + "learning_rate": 5.3291301772940755e-06, + "loss": 0.2485, + "step": 3308 + }, + { + "epoch": 0.4948407357559444, + "grad_norm": 1.240412229265847, + "learning_rate": 5.32671356546917e-06, + "loss": 0.2153, + "step": 3309 + }, + { + "epoch": 0.4949902796470764, + "grad_norm": 1.8834615593768038, + "learning_rate": 5.324296876994115e-06, + "loss": 0.4081, + "step": 3310 + }, + { + "epoch": 0.49513982353820846, + "grad_norm": 2.0275002053515876, + "learning_rate": 5.321880112435885e-06, + "loss": 0.5748, + "step": 3311 + }, + { + "epoch": 0.4952893674293405, + "grad_norm": 1.1895625858020245, + "learning_rate": 5.31946327236148e-06, + "loss": 0.1955, + "step": 3312 + }, + { + "epoch": 0.4954389113204726, + "grad_norm": 1.6692053077282905, + "learning_rate": 5.3170463573379115e-06, + "loss": 0.3985, + "step": 3313 + }, + { + "epoch": 0.4955884552116046, + "grad_norm": 1.1693085573761919, + "learning_rate": 5.314629367932213e-06, + "loss": 0.201, + "step": 3314 + }, + { + "epoch": 0.49573799910273664, + "grad_norm": 1.607561425215323, + "learning_rate": 5.3122123047114334e-06, + "loss": 0.4755, + "step": 3315 + }, + { + "epoch": 0.4958875429938687, + "grad_norm": 1.6244993103305614, + "learning_rate": 5.3097951682426375e-06, + "loss": 0.2829, + "step": 3316 + }, + { + "epoch": 0.49603708688500076, + "grad_norm": 1.335438428761969, + "learning_rate": 5.30737795909291e-06, + "loss": 0.3357, + "step": 3317 + }, + { + "epoch": 0.4961866307761328, + "grad_norm": 1.1870501993360763, + "learning_rate": 5.304960677829349e-06, + "loss": 0.2321, + "step": 3318 + }, + { + "epoch": 0.4963361746672648, + "grad_norm": 1.2459003700531992, + "learning_rate": 5.302543325019075e-06, + "loss": 0.1832, + "step": 3319 + }, + { + "epoch": 0.4964857185583969, + "grad_norm": 1.174928428747797, + "learning_rate": 5.300125901229222e-06, + "loss": 0.1906, + "step": 3320 + }, + { + "epoch": 0.49663526244952894, + "grad_norm": 1.312638962292064, + "learning_rate": 5.297708407026939e-06, + "loss": 0.1821, + "step": 3321 + }, + { + "epoch": 0.49678480634066097, + "grad_norm": 1.8127687148152816, + "learning_rate": 5.295290842979396e-06, + "loss": 0.3657, + "step": 3322 + }, + { + "epoch": 0.49693435023179305, + "grad_norm": 1.5846846945588664, + "learning_rate": 5.292873209653774e-06, + "loss": 0.4235, + "step": 3323 + }, + { + "epoch": 0.4970838941229251, + "grad_norm": 1.4706523678799026, + "learning_rate": 5.290455507617276e-06, + "loss": 0.3334, + "step": 3324 + }, + { + "epoch": 0.4972334380140571, + "grad_norm": 1.703735745140162, + "learning_rate": 5.288037737437115e-06, + "loss": 0.3885, + "step": 3325 + }, + { + "epoch": 0.49738298190518915, + "grad_norm": 1.376769352546737, + "learning_rate": 5.285619899680527e-06, + "loss": 0.3308, + "step": 3326 + }, + { + "epoch": 0.49753252579632123, + "grad_norm": 1.317011044800177, + "learning_rate": 5.283201994914755e-06, + "loss": 0.2099, + "step": 3327 + }, + { + "epoch": 0.49768206968745327, + "grad_norm": 1.681669403343869, + "learning_rate": 5.280784023707067e-06, + "loss": 0.208, + "step": 3328 + }, + { + "epoch": 0.4978316135785853, + "grad_norm": 1.5061732433872248, + "learning_rate": 5.278365986624743e-06, + "loss": 0.1759, + "step": 3329 + }, + { + "epoch": 0.4979811574697174, + "grad_norm": 1.356987051993483, + "learning_rate": 5.275947884235073e-06, + "loss": 0.2116, + "step": 3330 + }, + { + "epoch": 0.4981307013608494, + "grad_norm": 1.307261358026284, + "learning_rate": 5.2735297171053714e-06, + "loss": 0.2082, + "step": 3331 + }, + { + "epoch": 0.49828024525198145, + "grad_norm": 1.799388885073472, + "learning_rate": 5.271111485802962e-06, + "loss": 0.4406, + "step": 3332 + }, + { + "epoch": 0.4984297891431135, + "grad_norm": 1.5618056900457256, + "learning_rate": 5.2686931908951835e-06, + "loss": 0.3651, + "step": 3333 + }, + { + "epoch": 0.49857933303424556, + "grad_norm": 1.6901599045091793, + "learning_rate": 5.266274832949395e-06, + "loss": 0.4992, + "step": 3334 + }, + { + "epoch": 0.4987288769253776, + "grad_norm": 1.4954460375400984, + "learning_rate": 5.263856412532964e-06, + "loss": 0.3267, + "step": 3335 + }, + { + "epoch": 0.4988784208165096, + "grad_norm": 1.2517059338039354, + "learning_rate": 5.261437930213275e-06, + "loss": 0.178, + "step": 3336 + }, + { + "epoch": 0.4990279647076417, + "grad_norm": 1.1344813829644422, + "learning_rate": 5.259019386557729e-06, + "loss": 0.2142, + "step": 3337 + }, + { + "epoch": 0.49917750859877374, + "grad_norm": 2.101616933181502, + "learning_rate": 5.256600782133738e-06, + "loss": 0.5949, + "step": 3338 + }, + { + "epoch": 0.4993270524899058, + "grad_norm": 1.4383620186111616, + "learning_rate": 5.25418211750873e-06, + "loss": 0.349, + "step": 3339 + }, + { + "epoch": 0.4994765963810378, + "grad_norm": 1.699016778277064, + "learning_rate": 5.251763393250149e-06, + "loss": 0.401, + "step": 3340 + }, + { + "epoch": 0.4996261402721699, + "grad_norm": 1.0482102710690397, + "learning_rate": 5.249344609925449e-06, + "loss": 0.1841, + "step": 3341 + }, + { + "epoch": 0.4997756841633019, + "grad_norm": 1.7173296868671082, + "learning_rate": 5.246925768102101e-06, + "loss": 0.4094, + "step": 3342 + }, + { + "epoch": 0.49992522805443396, + "grad_norm": 1.4164669080686887, + "learning_rate": 5.244506868347588e-06, + "loss": 0.2179, + "step": 3343 + }, + { + "epoch": 0.500074771945566, + "grad_norm": 1.531411863063166, + "learning_rate": 5.242087911229405e-06, + "loss": 0.3356, + "step": 3344 + }, + { + "epoch": 0.500224315836698, + "grad_norm": 1.135792810651757, + "learning_rate": 5.239668897315068e-06, + "loss": 0.1927, + "step": 3345 + }, + { + "epoch": 0.5003738597278301, + "grad_norm": 1.6441864050801602, + "learning_rate": 5.237249827172096e-06, + "loss": 0.4575, + "step": 3346 + }, + { + "epoch": 0.5005234036189622, + "grad_norm": 1.162852279035988, + "learning_rate": 5.234830701368029e-06, + "loss": 0.1545, + "step": 3347 + }, + { + "epoch": 0.5006729475100942, + "grad_norm": 1.410269599279374, + "learning_rate": 5.232411520470416e-06, + "loss": 0.2162, + "step": 3348 + }, + { + "epoch": 0.5008224914012263, + "grad_norm": 1.745801598261214, + "learning_rate": 5.2299922850468195e-06, + "loss": 0.5337, + "step": 3349 + }, + { + "epoch": 0.5009720352923583, + "grad_norm": 2.176023856699294, + "learning_rate": 5.227572995664819e-06, + "loss": 0.4114, + "step": 3350 + }, + { + "epoch": 0.5011215791834903, + "grad_norm": 2.3007951519065246, + "learning_rate": 5.225153652891999e-06, + "loss": 0.4568, + "step": 3351 + }, + { + "epoch": 0.5012711230746224, + "grad_norm": 1.8594233682896886, + "learning_rate": 5.222734257295963e-06, + "loss": 0.4813, + "step": 3352 + }, + { + "epoch": 0.5014206669657545, + "grad_norm": 1.4068832031512468, + "learning_rate": 5.220314809444321e-06, + "loss": 0.3328, + "step": 3353 + }, + { + "epoch": 0.5015702108568865, + "grad_norm": 1.67853079379061, + "learning_rate": 5.217895309904703e-06, + "loss": 0.2757, + "step": 3354 + }, + { + "epoch": 0.5017197547480186, + "grad_norm": 1.6234161746374836, + "learning_rate": 5.215475759244746e-06, + "loss": 0.2088, + "step": 3355 + }, + { + "epoch": 0.5018692986391506, + "grad_norm": 1.4803686485569791, + "learning_rate": 5.2130561580321e-06, + "loss": 0.3738, + "step": 3356 + }, + { + "epoch": 0.5020188425302826, + "grad_norm": 1.7103662059610552, + "learning_rate": 5.210636506834425e-06, + "loss": 0.2087, + "step": 3357 + }, + { + "epoch": 0.5021683864214147, + "grad_norm": 1.7934033193420236, + "learning_rate": 5.208216806219395e-06, + "loss": 0.5237, + "step": 3358 + }, + { + "epoch": 0.5023179303125467, + "grad_norm": 2.0124390173788775, + "learning_rate": 5.205797056754696e-06, + "loss": 0.3567, + "step": 3359 + }, + { + "epoch": 0.5024674742036788, + "grad_norm": 1.644508950632731, + "learning_rate": 5.203377259008024e-06, + "loss": 0.3472, + "step": 3360 + }, + { + "epoch": 0.5026170180948109, + "grad_norm": 1.4189985308607627, + "learning_rate": 5.200957413547086e-06, + "loss": 0.1959, + "step": 3361 + }, + { + "epoch": 0.5027665619859428, + "grad_norm": 2.013951064845426, + "learning_rate": 5.1985375209396035e-06, + "loss": 0.5964, + "step": 3362 + }, + { + "epoch": 0.5029161058770749, + "grad_norm": 1.063791871070481, + "learning_rate": 5.196117581753305e-06, + "loss": 0.171, + "step": 3363 + }, + { + "epoch": 0.503065649768207, + "grad_norm": 1.8308256692715812, + "learning_rate": 5.193697596555932e-06, + "loss": 0.44, + "step": 3364 + }, + { + "epoch": 0.503215193659339, + "grad_norm": 1.8386517447639408, + "learning_rate": 5.191277565915236e-06, + "loss": 0.6374, + "step": 3365 + }, + { + "epoch": 0.5033647375504711, + "grad_norm": 1.9183016447353525, + "learning_rate": 5.18885749039898e-06, + "loss": 0.3919, + "step": 3366 + }, + { + "epoch": 0.5035142814416032, + "grad_norm": 2.0616577191442653, + "learning_rate": 5.186437370574935e-06, + "loss": 0.2469, + "step": 3367 + }, + { + "epoch": 0.5036638253327351, + "grad_norm": 1.7802773589123986, + "learning_rate": 5.184017207010887e-06, + "loss": 0.2157, + "step": 3368 + }, + { + "epoch": 0.5038133692238672, + "grad_norm": 1.7787158574348014, + "learning_rate": 5.181597000274628e-06, + "loss": 0.2512, + "step": 3369 + }, + { + "epoch": 0.5039629131149993, + "grad_norm": 1.5473819922931655, + "learning_rate": 5.179176750933965e-06, + "loss": 0.4162, + "step": 3370 + }, + { + "epoch": 0.5041124570061313, + "grad_norm": 2.1562775008894244, + "learning_rate": 5.176756459556709e-06, + "loss": 0.3656, + "step": 3371 + }, + { + "epoch": 0.5042620008972634, + "grad_norm": 1.4364564466383367, + "learning_rate": 5.174336126710683e-06, + "loss": 0.3441, + "step": 3372 + }, + { + "epoch": 0.5044115447883954, + "grad_norm": 1.6456126737151073, + "learning_rate": 5.171915752963721e-06, + "loss": 0.3596, + "step": 3373 + }, + { + "epoch": 0.5045610886795274, + "grad_norm": 1.9483420592597502, + "learning_rate": 5.169495338883666e-06, + "loss": 0.4743, + "step": 3374 + }, + { + "epoch": 0.5047106325706595, + "grad_norm": 1.649657426297314, + "learning_rate": 5.1670748850383734e-06, + "loss": 0.2528, + "step": 3375 + }, + { + "epoch": 0.5048601764617915, + "grad_norm": 1.6332145086519905, + "learning_rate": 5.164654391995702e-06, + "loss": 0.4456, + "step": 3376 + }, + { + "epoch": 0.5050097203529236, + "grad_norm": 1.6522420757205838, + "learning_rate": 5.162233860323523e-06, + "loss": 0.3963, + "step": 3377 + }, + { + "epoch": 0.5051592642440557, + "grad_norm": 1.2741254428541653, + "learning_rate": 5.159813290589717e-06, + "loss": 0.2927, + "step": 3378 + }, + { + "epoch": 0.5053088081351876, + "grad_norm": 1.0038407207877114, + "learning_rate": 5.1573926833621724e-06, + "loss": 0.1635, + "step": 3379 + }, + { + "epoch": 0.5054583520263197, + "grad_norm": 1.1748485985062411, + "learning_rate": 5.1549720392087865e-06, + "loss": 0.1937, + "step": 3380 + }, + { + "epoch": 0.5056078959174518, + "grad_norm": 1.453744768955761, + "learning_rate": 5.152551358697468e-06, + "loss": 0.2786, + "step": 3381 + }, + { + "epoch": 0.5057574398085838, + "grad_norm": 1.4933599639180963, + "learning_rate": 5.150130642396129e-06, + "loss": 0.2105, + "step": 3382 + }, + { + "epoch": 0.5059069836997159, + "grad_norm": 1.2741666356322288, + "learning_rate": 5.147709890872697e-06, + "loss": 0.1731, + "step": 3383 + }, + { + "epoch": 0.506056527590848, + "grad_norm": 2.2491770404898217, + "learning_rate": 5.145289104695101e-06, + "loss": 0.5676, + "step": 3384 + }, + { + "epoch": 0.5062060714819799, + "grad_norm": 1.9408723876207343, + "learning_rate": 5.14286828443128e-06, + "loss": 0.4774, + "step": 3385 + }, + { + "epoch": 0.506355615373112, + "grad_norm": 1.4212514196361943, + "learning_rate": 5.1404474306491816e-06, + "loss": 0.214, + "step": 3386 + }, + { + "epoch": 0.5065051592642441, + "grad_norm": 1.9045071920993695, + "learning_rate": 5.138026543916763e-06, + "loss": 0.3409, + "step": 3387 + }, + { + "epoch": 0.5066547031553761, + "grad_norm": 2.155362762135023, + "learning_rate": 5.135605624801987e-06, + "loss": 0.5308, + "step": 3388 + }, + { + "epoch": 0.5068042470465082, + "grad_norm": 1.7219918801573693, + "learning_rate": 5.133184673872824e-06, + "loss": 0.2495, + "step": 3389 + }, + { + "epoch": 0.5069537909376401, + "grad_norm": 1.1700199480933995, + "learning_rate": 5.130763691697254e-06, + "loss": 0.1942, + "step": 3390 + }, + { + "epoch": 0.5071033348287722, + "grad_norm": 1.7548276948843984, + "learning_rate": 5.128342678843261e-06, + "loss": 0.4727, + "step": 3391 + }, + { + "epoch": 0.5072528787199043, + "grad_norm": 1.4512806488385455, + "learning_rate": 5.125921635878839e-06, + "loss": 0.3694, + "step": 3392 + }, + { + "epoch": 0.5074024226110363, + "grad_norm": 1.4994540711193551, + "learning_rate": 5.123500563371988e-06, + "loss": 0.2164, + "step": 3393 + }, + { + "epoch": 0.5075519665021684, + "grad_norm": 1.5807710125666976, + "learning_rate": 5.1210794618907135e-06, + "loss": 0.3291, + "step": 3394 + }, + { + "epoch": 0.5077015103933005, + "grad_norm": 1.5954515312505237, + "learning_rate": 5.118658332003031e-06, + "loss": 0.4411, + "step": 3395 + }, + { + "epoch": 0.5078510542844324, + "grad_norm": 1.9192671256027398, + "learning_rate": 5.116237174276961e-06, + "loss": 0.5256, + "step": 3396 + }, + { + "epoch": 0.5080005981755645, + "grad_norm": 1.4114290051973426, + "learning_rate": 5.113815989280528e-06, + "loss": 0.218, + "step": 3397 + }, + { + "epoch": 0.5081501420666966, + "grad_norm": 1.8229179727458815, + "learning_rate": 5.111394777581769e-06, + "loss": 0.433, + "step": 3398 + }, + { + "epoch": 0.5082996859578286, + "grad_norm": 1.0778267074582055, + "learning_rate": 5.108973539748721e-06, + "loss": 0.1962, + "step": 3399 + }, + { + "epoch": 0.5084492298489607, + "grad_norm": 1.5454148953863824, + "learning_rate": 5.10655227634943e-06, + "loss": 0.3592, + "step": 3400 + }, + { + "epoch": 0.5085987737400928, + "grad_norm": 1.74859452433122, + "learning_rate": 5.104130987951947e-06, + "loss": 0.496, + "step": 3401 + }, + { + "epoch": 0.5087483176312247, + "grad_norm": 1.362679322662736, + "learning_rate": 5.101709675124332e-06, + "loss": 0.2273, + "step": 3402 + }, + { + "epoch": 0.5088978615223568, + "grad_norm": 2.070075014276669, + "learning_rate": 5.0992883384346485e-06, + "loss": 0.2638, + "step": 3403 + }, + { + "epoch": 0.5090474054134888, + "grad_norm": 1.5938286200934397, + "learning_rate": 5.096866978450962e-06, + "loss": 0.5063, + "step": 3404 + }, + { + "epoch": 0.5091969493046209, + "grad_norm": 1.16516888607397, + "learning_rate": 5.09444559574135e-06, + "loss": 0.2357, + "step": 3405 + }, + { + "epoch": 0.509346493195753, + "grad_norm": 2.029114392825952, + "learning_rate": 5.0920241908738885e-06, + "loss": 0.5142, + "step": 3406 + }, + { + "epoch": 0.509496037086885, + "grad_norm": 1.5198340641323709, + "learning_rate": 5.089602764416667e-06, + "loss": 0.236, + "step": 3407 + }, + { + "epoch": 0.509645580978017, + "grad_norm": 1.7318426021724316, + "learning_rate": 5.087181316937773e-06, + "loss": 0.342, + "step": 3408 + }, + { + "epoch": 0.5097951248691491, + "grad_norm": 1.412619025249756, + "learning_rate": 5.084759849005301e-06, + "loss": 0.1432, + "step": 3409 + }, + { + "epoch": 0.5099446687602811, + "grad_norm": 1.5740314968616134, + "learning_rate": 5.082338361187354e-06, + "loss": 0.3604, + "step": 3410 + }, + { + "epoch": 0.5100942126514132, + "grad_norm": 1.6106092571840667, + "learning_rate": 5.079916854052031e-06, + "loss": 0.3553, + "step": 3411 + }, + { + "epoch": 0.5102437565425453, + "grad_norm": 1.3048853067534962, + "learning_rate": 5.077495328167446e-06, + "loss": 0.2093, + "step": 3412 + }, + { + "epoch": 0.5103933004336773, + "grad_norm": 1.3774154157596905, + "learning_rate": 5.07507378410171e-06, + "loss": 0.3105, + "step": 3413 + }, + { + "epoch": 0.5105428443248093, + "grad_norm": 1.0928212880625816, + "learning_rate": 5.072652222422938e-06, + "loss": 0.1978, + "step": 3414 + }, + { + "epoch": 0.5106923882159414, + "grad_norm": 1.148279407869326, + "learning_rate": 5.0702306436992575e-06, + "loss": 0.1998, + "step": 3415 + }, + { + "epoch": 0.5108419321070734, + "grad_norm": 2.167927897987719, + "learning_rate": 5.0678090484987895e-06, + "loss": 0.3238, + "step": 3416 + }, + { + "epoch": 0.5109914759982055, + "grad_norm": 1.5482664593136313, + "learning_rate": 5.065387437389666e-06, + "loss": 0.183, + "step": 3417 + }, + { + "epoch": 0.5111410198893375, + "grad_norm": 1.0394819657038659, + "learning_rate": 5.06296581094002e-06, + "loss": 0.1603, + "step": 3418 + }, + { + "epoch": 0.5112905637804696, + "grad_norm": 1.106087546826269, + "learning_rate": 5.060544169717987e-06, + "loss": 0.1787, + "step": 3419 + }, + { + "epoch": 0.5114401076716016, + "grad_norm": 1.0109749456896577, + "learning_rate": 5.058122514291709e-06, + "loss": 0.1278, + "step": 3420 + }, + { + "epoch": 0.5115896515627336, + "grad_norm": 1.3052672308820343, + "learning_rate": 5.0557008452293275e-06, + "loss": 0.3621, + "step": 3421 + }, + { + "epoch": 0.5117391954538657, + "grad_norm": 1.0511096136393951, + "learning_rate": 5.053279163098992e-06, + "loss": 0.1874, + "step": 3422 + }, + { + "epoch": 0.5118887393449978, + "grad_norm": 1.5171824078027538, + "learning_rate": 5.050857468468853e-06, + "loss": 0.3404, + "step": 3423 + }, + { + "epoch": 0.5120382832361298, + "grad_norm": 1.050201412025008, + "learning_rate": 5.04843576190706e-06, + "loss": 0.1693, + "step": 3424 + }, + { + "epoch": 0.5121878271272619, + "grad_norm": 1.4748037147038833, + "learning_rate": 5.0460140439817695e-06, + "loss": 0.2263, + "step": 3425 + }, + { + "epoch": 0.5123373710183939, + "grad_norm": 1.3128576316535443, + "learning_rate": 5.043592315261143e-06, + "loss": 0.1389, + "step": 3426 + }, + { + "epoch": 0.5124869149095259, + "grad_norm": 1.284722286802869, + "learning_rate": 5.041170576313338e-06, + "loss": 0.209, + "step": 3427 + }, + { + "epoch": 0.512636458800658, + "grad_norm": 1.1356824496722022, + "learning_rate": 5.0387488277065186e-06, + "loss": 0.3192, + "step": 3428 + }, + { + "epoch": 0.5127860026917901, + "grad_norm": 1.3644018650118583, + "learning_rate": 5.036327070008852e-06, + "loss": 0.229, + "step": 3429 + }, + { + "epoch": 0.5129355465829221, + "grad_norm": 1.2797031965795196, + "learning_rate": 5.0339053037885035e-06, + "loss": 0.2217, + "step": 3430 + }, + { + "epoch": 0.5130850904740542, + "grad_norm": 1.6886018581906976, + "learning_rate": 5.031483529613646e-06, + "loss": 0.4042, + "step": 3431 + }, + { + "epoch": 0.5132346343651862, + "grad_norm": 1.454212457336455, + "learning_rate": 5.029061748052446e-06, + "loss": 0.1886, + "step": 3432 + }, + { + "epoch": 0.5133841782563182, + "grad_norm": 1.4334168078083147, + "learning_rate": 5.026639959673083e-06, + "loss": 0.3052, + "step": 3433 + }, + { + "epoch": 0.5135337221474503, + "grad_norm": 1.745914040348649, + "learning_rate": 5.024218165043726e-06, + "loss": 0.3588, + "step": 3434 + }, + { + "epoch": 0.5136832660385823, + "grad_norm": 2.0781048544287293, + "learning_rate": 5.021796364732554e-06, + "loss": 0.2493, + "step": 3435 + }, + { + "epoch": 0.5138328099297144, + "grad_norm": 1.6255642021510852, + "learning_rate": 5.019374559307747e-06, + "loss": 0.346, + "step": 3436 + }, + { + "epoch": 0.5139823538208464, + "grad_norm": 1.7336607656638219, + "learning_rate": 5.01695274933748e-06, + "loss": 0.203, + "step": 3437 + }, + { + "epoch": 0.5141318977119784, + "grad_norm": 1.6058882608416245, + "learning_rate": 5.014530935389938e-06, + "loss": 0.3645, + "step": 3438 + }, + { + "epoch": 0.5142814416031105, + "grad_norm": 1.4498498807058657, + "learning_rate": 5.012109118033294e-06, + "loss": 0.2795, + "step": 3439 + }, + { + "epoch": 0.5144309854942426, + "grad_norm": 2.0694828386357873, + "learning_rate": 5.009687297835736e-06, + "loss": 0.5495, + "step": 3440 + }, + { + "epoch": 0.5145805293853746, + "grad_norm": 1.5377492094048737, + "learning_rate": 5.0072654753654444e-06, + "loss": 0.4609, + "step": 3441 + }, + { + "epoch": 0.5147300732765067, + "grad_norm": 1.4923232476167383, + "learning_rate": 5.004843651190602e-06, + "loss": 0.314, + "step": 3442 + }, + { + "epoch": 0.5148796171676387, + "grad_norm": 1.023025356909767, + "learning_rate": 5.002421825879394e-06, + "loss": 0.2076, + "step": 3443 + }, + { + "epoch": 0.5150291610587707, + "grad_norm": 1.4703535067143427, + "learning_rate": 5e-06, + "loss": 0.2911, + "step": 3444 + }, + { + "epoch": 0.5151787049499028, + "grad_norm": 1.635494712784053, + "learning_rate": 4.997578174120607e-06, + "loss": 0.1704, + "step": 3445 + }, + { + "epoch": 0.5153282488410349, + "grad_norm": 1.3501580885318492, + "learning_rate": 4.995156348809398e-06, + "loss": 0.3092, + "step": 3446 + }, + { + "epoch": 0.5154777927321669, + "grad_norm": 1.4552065824604172, + "learning_rate": 4.992734524634557e-06, + "loss": 0.3157, + "step": 3447 + }, + { + "epoch": 0.515627336623299, + "grad_norm": 1.6023986192308008, + "learning_rate": 4.990312702164265e-06, + "loss": 0.3258, + "step": 3448 + }, + { + "epoch": 0.5157768805144309, + "grad_norm": 1.135753254332216, + "learning_rate": 4.987890881966707e-06, + "loss": 0.2194, + "step": 3449 + }, + { + "epoch": 0.515926424405563, + "grad_norm": 1.7579710147169378, + "learning_rate": 4.985469064610065e-06, + "loss": 0.379, + "step": 3450 + }, + { + "epoch": 0.5160759682966951, + "grad_norm": 1.2562049164914357, + "learning_rate": 4.983047250662519e-06, + "loss": 0.19, + "step": 3451 + }, + { + "epoch": 0.5162255121878271, + "grad_norm": 1.7306909181736667, + "learning_rate": 4.980625440692255e-06, + "loss": 0.5524, + "step": 3452 + }, + { + "epoch": 0.5163750560789592, + "grad_norm": 1.0200496356880533, + "learning_rate": 4.978203635267447e-06, + "loss": 0.2777, + "step": 3453 + }, + { + "epoch": 0.5165245999700913, + "grad_norm": 1.02643941739448, + "learning_rate": 4.9757818349562755e-06, + "loss": 0.2026, + "step": 3454 + }, + { + "epoch": 0.5166741438612232, + "grad_norm": 1.5225694971242867, + "learning_rate": 4.973360040326919e-06, + "loss": 0.3837, + "step": 3455 + }, + { + "epoch": 0.5168236877523553, + "grad_norm": 1.4438502518414775, + "learning_rate": 4.970938251947554e-06, + "loss": 0.4278, + "step": 3456 + }, + { + "epoch": 0.5169732316434874, + "grad_norm": 1.3218484273904558, + "learning_rate": 4.968516470386357e-06, + "loss": 0.2523, + "step": 3457 + }, + { + "epoch": 0.5171227755346194, + "grad_norm": 1.3619841304443039, + "learning_rate": 4.966094696211498e-06, + "loss": 0.2346, + "step": 3458 + }, + { + "epoch": 0.5172723194257515, + "grad_norm": 0.9202093983071812, + "learning_rate": 4.96367292999115e-06, + "loss": 0.1737, + "step": 3459 + }, + { + "epoch": 0.5174218633168836, + "grad_norm": 1.5024659381360044, + "learning_rate": 4.961251172293482e-06, + "loss": 0.2774, + "step": 3460 + }, + { + "epoch": 0.5175714072080155, + "grad_norm": 1.142291287339441, + "learning_rate": 4.958829423686663e-06, + "loss": 0.2169, + "step": 3461 + }, + { + "epoch": 0.5177209510991476, + "grad_norm": 1.1269059328349746, + "learning_rate": 4.956407684738859e-06, + "loss": 0.2414, + "step": 3462 + }, + { + "epoch": 0.5178704949902796, + "grad_norm": 1.4408048574377068, + "learning_rate": 4.953985956018231e-06, + "loss": 0.3941, + "step": 3463 + }, + { + "epoch": 0.5180200388814117, + "grad_norm": 1.1493374443284174, + "learning_rate": 4.951564238092942e-06, + "loss": 0.1601, + "step": 3464 + }, + { + "epoch": 0.5181695827725438, + "grad_norm": 1.543338492311835, + "learning_rate": 4.949142531531149e-06, + "loss": 0.26, + "step": 3465 + }, + { + "epoch": 0.5183191266636757, + "grad_norm": 1.0704201449260624, + "learning_rate": 4.946720836901008e-06, + "loss": 0.2098, + "step": 3466 + }, + { + "epoch": 0.5184686705548078, + "grad_norm": 1.1149725431271198, + "learning_rate": 4.944299154770673e-06, + "loss": 0.1564, + "step": 3467 + }, + { + "epoch": 0.5186182144459399, + "grad_norm": 1.58868666961587, + "learning_rate": 4.941877485708293e-06, + "loss": 0.4078, + "step": 3468 + }, + { + "epoch": 0.5187677583370719, + "grad_norm": 1.263877473847384, + "learning_rate": 4.939455830282014e-06, + "loss": 0.2109, + "step": 3469 + }, + { + "epoch": 0.518917302228204, + "grad_norm": 2.0767432150699503, + "learning_rate": 4.937034189059981e-06, + "loss": 0.4936, + "step": 3470 + }, + { + "epoch": 0.5190668461193361, + "grad_norm": 1.564988355482826, + "learning_rate": 4.9346125626103344e-06, + "loss": 0.2268, + "step": 3471 + }, + { + "epoch": 0.519216390010468, + "grad_norm": 1.7520264234912677, + "learning_rate": 4.932190951501212e-06, + "loss": 0.3324, + "step": 3472 + }, + { + "epoch": 0.5193659339016001, + "grad_norm": 0.9378784012067424, + "learning_rate": 4.929769356300745e-06, + "loss": 0.1623, + "step": 3473 + }, + { + "epoch": 0.5195154777927322, + "grad_norm": 1.3070459934481593, + "learning_rate": 4.9273477775770634e-06, + "loss": 0.3314, + "step": 3474 + }, + { + "epoch": 0.5196650216838642, + "grad_norm": 1.7416905476980176, + "learning_rate": 4.924926215898292e-06, + "loss": 0.4998, + "step": 3475 + }, + { + "epoch": 0.5198145655749963, + "grad_norm": 1.356929202517075, + "learning_rate": 4.922504671832555e-06, + "loss": 0.2937, + "step": 3476 + }, + { + "epoch": 0.5199641094661283, + "grad_norm": 1.4583002960913412, + "learning_rate": 4.920083145947971e-06, + "loss": 0.323, + "step": 3477 + }, + { + "epoch": 0.5201136533572603, + "grad_norm": 2.0884343537485734, + "learning_rate": 4.917661638812649e-06, + "loss": 0.5929, + "step": 3478 + }, + { + "epoch": 0.5202631972483924, + "grad_norm": 1.9246883502332144, + "learning_rate": 4.9152401509947e-06, + "loss": 0.5366, + "step": 3479 + }, + { + "epoch": 0.5204127411395244, + "grad_norm": 1.7689104275350418, + "learning_rate": 4.912818683062229e-06, + "loss": 0.2301, + "step": 3480 + }, + { + "epoch": 0.5205622850306565, + "grad_norm": 1.6135079839907187, + "learning_rate": 4.9103972355833345e-06, + "loss": 0.2101, + "step": 3481 + }, + { + "epoch": 0.5207118289217886, + "grad_norm": 1.5135656331832419, + "learning_rate": 4.907975809126113e-06, + "loss": 0.3627, + "step": 3482 + }, + { + "epoch": 0.5208613728129206, + "grad_norm": 1.1866081615701136, + "learning_rate": 4.905554404258654e-06, + "loss": 0.3292, + "step": 3483 + }, + { + "epoch": 0.5210109167040526, + "grad_norm": 1.6290902598708927, + "learning_rate": 4.90313302154904e-06, + "loss": 0.3552, + "step": 3484 + }, + { + "epoch": 0.5211604605951847, + "grad_norm": 1.3734572788893116, + "learning_rate": 4.900711661565353e-06, + "loss": 0.215, + "step": 3485 + }, + { + "epoch": 0.5213100044863167, + "grad_norm": 0.9577406608934755, + "learning_rate": 4.898290324875668e-06, + "loss": 0.1676, + "step": 3486 + }, + { + "epoch": 0.5214595483774488, + "grad_norm": 0.9118461606891877, + "learning_rate": 4.895869012048053e-06, + "loss": 0.1516, + "step": 3487 + }, + { + "epoch": 0.5216090922685809, + "grad_norm": 1.34978265933406, + "learning_rate": 4.893447723650572e-06, + "loss": 0.1731, + "step": 3488 + }, + { + "epoch": 0.5217586361597129, + "grad_norm": 1.3337609032683095, + "learning_rate": 4.89102646025128e-06, + "loss": 0.3653, + "step": 3489 + }, + { + "epoch": 0.5219081800508449, + "grad_norm": 1.1283458585985406, + "learning_rate": 4.888605222418232e-06, + "loss": 0.2716, + "step": 3490 + }, + { + "epoch": 0.522057723941977, + "grad_norm": 1.6956120283556566, + "learning_rate": 4.886184010719472e-06, + "loss": 0.4099, + "step": 3491 + }, + { + "epoch": 0.522207267833109, + "grad_norm": 1.63895974569, + "learning_rate": 4.88376282572304e-06, + "loss": 0.3172, + "step": 3492 + }, + { + "epoch": 0.5223568117242411, + "grad_norm": 1.5138032302467108, + "learning_rate": 4.88134166799697e-06, + "loss": 0.2708, + "step": 3493 + }, + { + "epoch": 0.5225063556153731, + "grad_norm": 1.7341059849859075, + "learning_rate": 4.878920538109288e-06, + "loss": 0.3361, + "step": 3494 + }, + { + "epoch": 0.5226558995065052, + "grad_norm": 1.4357879747612492, + "learning_rate": 4.876499436628013e-06, + "loss": 0.2236, + "step": 3495 + }, + { + "epoch": 0.5228054433976372, + "grad_norm": 1.9388442682852385, + "learning_rate": 4.874078364121162e-06, + "loss": 0.3892, + "step": 3496 + }, + { + "epoch": 0.5229549872887692, + "grad_norm": 1.3139109038725212, + "learning_rate": 4.871657321156738e-06, + "loss": 0.2246, + "step": 3497 + }, + { + "epoch": 0.5231045311799013, + "grad_norm": 1.0818048681042511, + "learning_rate": 4.869236308302748e-06, + "loss": 0.2252, + "step": 3498 + }, + { + "epoch": 0.5232540750710334, + "grad_norm": 1.3703939412577448, + "learning_rate": 4.866815326127177e-06, + "loss": 0.1899, + "step": 3499 + }, + { + "epoch": 0.5234036189621654, + "grad_norm": 1.4822403566706586, + "learning_rate": 4.864394375198015e-06, + "loss": 0.3076, + "step": 3500 + }, + { + "epoch": 0.5235531628532974, + "grad_norm": 1.9706263153865453, + "learning_rate": 4.861973456083238e-06, + "loss": 0.4768, + "step": 3501 + }, + { + "epoch": 0.5237027067444295, + "grad_norm": 2.1181556891438085, + "learning_rate": 4.85955256935082e-06, + "loss": 0.2084, + "step": 3502 + }, + { + "epoch": 0.5238522506355615, + "grad_norm": 1.192035696381639, + "learning_rate": 4.857131715568723e-06, + "loss": 0.224, + "step": 3503 + }, + { + "epoch": 0.5240017945266936, + "grad_norm": 0.7719952352600978, + "learning_rate": 4.854710895304902e-06, + "loss": 0.1399, + "step": 3504 + }, + { + "epoch": 0.5241513384178257, + "grad_norm": 1.3107969810050264, + "learning_rate": 4.852290109127304e-06, + "loss": 0.2784, + "step": 3505 + }, + { + "epoch": 0.5243008823089577, + "grad_norm": 1.4710047540471154, + "learning_rate": 4.849869357603871e-06, + "loss": 0.4199, + "step": 3506 + }, + { + "epoch": 0.5244504262000897, + "grad_norm": 1.9363546523939164, + "learning_rate": 4.847448641302532e-06, + "loss": 0.3845, + "step": 3507 + }, + { + "epoch": 0.5245999700912217, + "grad_norm": 1.4299597363726546, + "learning_rate": 4.845027960791215e-06, + "loss": 0.1818, + "step": 3508 + }, + { + "epoch": 0.5247495139823538, + "grad_norm": 1.392803585853321, + "learning_rate": 4.842607316637829e-06, + "loss": 0.1893, + "step": 3509 + }, + { + "epoch": 0.5248990578734859, + "grad_norm": 1.4652214173791756, + "learning_rate": 4.840186709410285e-06, + "loss": 0.3097, + "step": 3510 + }, + { + "epoch": 0.5250486017646179, + "grad_norm": 0.9930788283031162, + "learning_rate": 4.837766139676478e-06, + "loss": 0.1814, + "step": 3511 + }, + { + "epoch": 0.52519814565575, + "grad_norm": 1.4409939401080238, + "learning_rate": 4.835345608004299e-06, + "loss": 0.2391, + "step": 3512 + }, + { + "epoch": 0.525347689546882, + "grad_norm": 1.136343022154102, + "learning_rate": 4.832925114961629e-06, + "loss": 0.3325, + "step": 3513 + }, + { + "epoch": 0.525497233438014, + "grad_norm": 2.351393910634896, + "learning_rate": 4.8305046611163345e-06, + "loss": 0.2251, + "step": 3514 + }, + { + "epoch": 0.5256467773291461, + "grad_norm": 1.5324676085801396, + "learning_rate": 4.828084247036281e-06, + "loss": 0.3119, + "step": 3515 + }, + { + "epoch": 0.5257963212202782, + "grad_norm": 1.8398614526208819, + "learning_rate": 4.825663873289318e-06, + "loss": 0.221, + "step": 3516 + }, + { + "epoch": 0.5259458651114102, + "grad_norm": 1.7561849264734717, + "learning_rate": 4.823243540443292e-06, + "loss": 0.4972, + "step": 3517 + }, + { + "epoch": 0.5260954090025423, + "grad_norm": 1.2526140804123165, + "learning_rate": 4.820823249066038e-06, + "loss": 0.1815, + "step": 3518 + }, + { + "epoch": 0.5262449528936743, + "grad_norm": 1.6554249201864173, + "learning_rate": 4.818402999725373e-06, + "loss": 0.3794, + "step": 3519 + }, + { + "epoch": 0.5263944967848063, + "grad_norm": 1.3316819776233528, + "learning_rate": 4.815982792989114e-06, + "loss": 0.2839, + "step": 3520 + }, + { + "epoch": 0.5265440406759384, + "grad_norm": 1.692015505122212, + "learning_rate": 4.8135626294250655e-06, + "loss": 0.3722, + "step": 3521 + }, + { + "epoch": 0.5266935845670704, + "grad_norm": 1.7190685063797913, + "learning_rate": 4.811142509601022e-06, + "loss": 0.5343, + "step": 3522 + }, + { + "epoch": 0.5268431284582025, + "grad_norm": 1.4706527377105485, + "learning_rate": 4.808722434084766e-06, + "loss": 0.3285, + "step": 3523 + }, + { + "epoch": 0.5269926723493346, + "grad_norm": 1.8774181151216505, + "learning_rate": 4.80630240344407e-06, + "loss": 0.238, + "step": 3524 + }, + { + "epoch": 0.5271422162404665, + "grad_norm": 1.7586866324598942, + "learning_rate": 4.803882418246696e-06, + "loss": 0.4316, + "step": 3525 + }, + { + "epoch": 0.5272917601315986, + "grad_norm": 1.7649578082955628, + "learning_rate": 4.801462479060397e-06, + "loss": 0.4658, + "step": 3526 + }, + { + "epoch": 0.5274413040227307, + "grad_norm": 1.4270224302991061, + "learning_rate": 4.799042586452915e-06, + "loss": 0.1993, + "step": 3527 + }, + { + "epoch": 0.5275908479138627, + "grad_norm": 1.597762726870796, + "learning_rate": 4.796622740991978e-06, + "loss": 0.3639, + "step": 3528 + }, + { + "epoch": 0.5277403918049948, + "grad_norm": 1.5745989361707744, + "learning_rate": 4.794202943245306e-06, + "loss": 0.2338, + "step": 3529 + }, + { + "epoch": 0.5278899356961269, + "grad_norm": 1.6119777317893236, + "learning_rate": 4.791783193780607e-06, + "loss": 0.3549, + "step": 3530 + }, + { + "epoch": 0.5280394795872588, + "grad_norm": 1.448369751737763, + "learning_rate": 4.789363493165577e-06, + "loss": 0.3254, + "step": 3531 + }, + { + "epoch": 0.5281890234783909, + "grad_norm": 1.9783620520561083, + "learning_rate": 4.786943841967901e-06, + "loss": 0.4244, + "step": 3532 + }, + { + "epoch": 0.528338567369523, + "grad_norm": 1.7086111696643427, + "learning_rate": 4.7845242407552564e-06, + "loss": 0.5218, + "step": 3533 + }, + { + "epoch": 0.528488111260655, + "grad_norm": 2.058530122472167, + "learning_rate": 4.782104690095298e-06, + "loss": 0.705, + "step": 3534 + }, + { + "epoch": 0.5286376551517871, + "grad_norm": 1.4083919927576292, + "learning_rate": 4.779685190555681e-06, + "loss": 0.2537, + "step": 3535 + }, + { + "epoch": 0.528787199042919, + "grad_norm": 1.4221251550637304, + "learning_rate": 4.777265742704039e-06, + "loss": 0.214, + "step": 3536 + }, + { + "epoch": 0.5289367429340511, + "grad_norm": 1.5537572598460359, + "learning_rate": 4.774846347108001e-06, + "loss": 0.4211, + "step": 3537 + }, + { + "epoch": 0.5290862868251832, + "grad_norm": 1.4170336569377495, + "learning_rate": 4.772427004335183e-06, + "loss": 0.3499, + "step": 3538 + }, + { + "epoch": 0.5292358307163152, + "grad_norm": 1.4275209656159527, + "learning_rate": 4.770007714953181e-06, + "loss": 0.341, + "step": 3539 + }, + { + "epoch": 0.5293853746074473, + "grad_norm": 1.6285920446138622, + "learning_rate": 4.7675884795295855e-06, + "loss": 0.1867, + "step": 3540 + }, + { + "epoch": 0.5295349184985794, + "grad_norm": 1.430494043188066, + "learning_rate": 4.765169298631973e-06, + "loss": 0.1935, + "step": 3541 + }, + { + "epoch": 0.5296844623897113, + "grad_norm": 1.2416404413194209, + "learning_rate": 4.762750172827905e-06, + "loss": 0.1899, + "step": 3542 + }, + { + "epoch": 0.5298340062808434, + "grad_norm": 1.4246138627274758, + "learning_rate": 4.760331102684934e-06, + "loss": 0.2316, + "step": 3543 + }, + { + "epoch": 0.5299835501719755, + "grad_norm": 1.6367006596769544, + "learning_rate": 4.757912088770596e-06, + "loss": 0.4669, + "step": 3544 + }, + { + "epoch": 0.5301330940631075, + "grad_norm": 1.366581512757693, + "learning_rate": 4.755493131652415e-06, + "loss": 0.1932, + "step": 3545 + }, + { + "epoch": 0.5302826379542396, + "grad_norm": 1.7191803566801642, + "learning_rate": 4.7530742318979e-06, + "loss": 0.484, + "step": 3546 + }, + { + "epoch": 0.5304321818453717, + "grad_norm": 1.552142422679913, + "learning_rate": 4.750655390074552e-06, + "loss": 0.265, + "step": 3547 + }, + { + "epoch": 0.5305817257365036, + "grad_norm": 1.1885077097062553, + "learning_rate": 4.7482366067498526e-06, + "loss": 0.1907, + "step": 3548 + }, + { + "epoch": 0.5307312696276357, + "grad_norm": 1.6116298485523732, + "learning_rate": 4.7458178824912704e-06, + "loss": 0.2614, + "step": 3549 + }, + { + "epoch": 0.5308808135187678, + "grad_norm": 1.6356455691825849, + "learning_rate": 4.743399217866263e-06, + "loss": 0.3475, + "step": 3550 + }, + { + "epoch": 0.5310303574098998, + "grad_norm": 1.4715217386600319, + "learning_rate": 4.740980613442272e-06, + "loss": 0.1646, + "step": 3551 + }, + { + "epoch": 0.5311799013010319, + "grad_norm": 1.416799241645883, + "learning_rate": 4.738562069786725e-06, + "loss": 0.3194, + "step": 3552 + }, + { + "epoch": 0.5313294451921639, + "grad_norm": 1.5866917428028944, + "learning_rate": 4.736143587467039e-06, + "loss": 0.1772, + "step": 3553 + }, + { + "epoch": 0.5314789890832959, + "grad_norm": 1.229115311730451, + "learning_rate": 4.733725167050607e-06, + "loss": 0.1627, + "step": 3554 + }, + { + "epoch": 0.531628532974428, + "grad_norm": 1.211183396876207, + "learning_rate": 4.731306809104818e-06, + "loss": 0.2267, + "step": 3555 + }, + { + "epoch": 0.53177807686556, + "grad_norm": 2.2369509200391455, + "learning_rate": 4.7288885141970396e-06, + "loss": 0.2039, + "step": 3556 + }, + { + "epoch": 0.5319276207566921, + "grad_norm": 1.8028493082061634, + "learning_rate": 4.726470282894629e-06, + "loss": 0.3871, + "step": 3557 + }, + { + "epoch": 0.5320771646478242, + "grad_norm": 1.969525411582806, + "learning_rate": 4.7240521157649295e-06, + "loss": 0.5476, + "step": 3558 + }, + { + "epoch": 0.5322267085389562, + "grad_norm": 1.6776763319782029, + "learning_rate": 4.7216340133752604e-06, + "loss": 0.4609, + "step": 3559 + }, + { + "epoch": 0.5323762524300882, + "grad_norm": 1.220727007823009, + "learning_rate": 4.719215976292934e-06, + "loss": 0.2119, + "step": 3560 + }, + { + "epoch": 0.5325257963212203, + "grad_norm": 1.7235272167160625, + "learning_rate": 4.716798005085246e-06, + "loss": 0.412, + "step": 3561 + }, + { + "epoch": 0.5326753402123523, + "grad_norm": 1.954837902451542, + "learning_rate": 4.714380100319476e-06, + "loss": 0.5354, + "step": 3562 + }, + { + "epoch": 0.5328248841034844, + "grad_norm": 1.5724258743980508, + "learning_rate": 4.711962262562887e-06, + "loss": 0.2358, + "step": 3563 + }, + { + "epoch": 0.5329744279946165, + "grad_norm": 2.0670603634468545, + "learning_rate": 4.709544492382726e-06, + "loss": 0.5506, + "step": 3564 + }, + { + "epoch": 0.5331239718857484, + "grad_norm": 1.657619094892483, + "learning_rate": 4.707126790346227e-06, + "loss": 0.3559, + "step": 3565 + }, + { + "epoch": 0.5332735157768805, + "grad_norm": 1.4867129285160177, + "learning_rate": 4.704709157020606e-06, + "loss": 0.3264, + "step": 3566 + }, + { + "epoch": 0.5334230596680125, + "grad_norm": 1.7017440972223712, + "learning_rate": 4.702291592973062e-06, + "loss": 0.2979, + "step": 3567 + }, + { + "epoch": 0.5335726035591446, + "grad_norm": 1.9673502426761393, + "learning_rate": 4.69987409877078e-06, + "loss": 0.5318, + "step": 3568 + }, + { + "epoch": 0.5337221474502767, + "grad_norm": 1.4977969143854684, + "learning_rate": 4.697456674980926e-06, + "loss": 0.3394, + "step": 3569 + }, + { + "epoch": 0.5338716913414087, + "grad_norm": 1.6784064708619297, + "learning_rate": 4.695039322170652e-06, + "loss": 0.3555, + "step": 3570 + }, + { + "epoch": 0.5340212352325407, + "grad_norm": 1.96234854429619, + "learning_rate": 4.692622040907092e-06, + "loss": 0.6075, + "step": 3571 + }, + { + "epoch": 0.5341707791236728, + "grad_norm": 1.4259119209086994, + "learning_rate": 4.690204831757363e-06, + "loss": 0.1889, + "step": 3572 + }, + { + "epoch": 0.5343203230148048, + "grad_norm": 1.7546548074341575, + "learning_rate": 4.687787695288567e-06, + "loss": 0.3012, + "step": 3573 + }, + { + "epoch": 0.5344698669059369, + "grad_norm": 1.8205519670137686, + "learning_rate": 4.685370632067788e-06, + "loss": 0.4875, + "step": 3574 + }, + { + "epoch": 0.534619410797069, + "grad_norm": 1.3195329317180426, + "learning_rate": 4.682953642662089e-06, + "loss": 0.2789, + "step": 3575 + }, + { + "epoch": 0.534768954688201, + "grad_norm": 1.1371430231198516, + "learning_rate": 4.680536727638522e-06, + "loss": 0.1545, + "step": 3576 + }, + { + "epoch": 0.534918498579333, + "grad_norm": 1.3375712737529661, + "learning_rate": 4.678119887564115e-06, + "loss": 0.1937, + "step": 3577 + }, + { + "epoch": 0.5350680424704651, + "grad_norm": 1.1773100320990963, + "learning_rate": 4.675703123005886e-06, + "loss": 0.2153, + "step": 3578 + }, + { + "epoch": 0.5352175863615971, + "grad_norm": 1.2694167100147624, + "learning_rate": 4.673286434530832e-06, + "loss": 0.3311, + "step": 3579 + }, + { + "epoch": 0.5353671302527292, + "grad_norm": 1.973094778134566, + "learning_rate": 4.670869822705926e-06, + "loss": 0.5673, + "step": 3580 + }, + { + "epoch": 0.5355166741438612, + "grad_norm": 1.4077081167408994, + "learning_rate": 4.668453288098132e-06, + "loss": 0.2774, + "step": 3581 + }, + { + "epoch": 0.5356662180349933, + "grad_norm": 1.516290100892926, + "learning_rate": 4.666036831274392e-06, + "loss": 0.2193, + "step": 3582 + }, + { + "epoch": 0.5358157619261253, + "grad_norm": 1.5345251033260943, + "learning_rate": 4.663620452801631e-06, + "loss": 0.3554, + "step": 3583 + }, + { + "epoch": 0.5359653058172573, + "grad_norm": 1.4266746622095299, + "learning_rate": 4.661204153246753e-06, + "loss": 0.3471, + "step": 3584 + }, + { + "epoch": 0.5361148497083894, + "grad_norm": 1.959088255237877, + "learning_rate": 4.6587879331766465e-06, + "loss": 0.4178, + "step": 3585 + }, + { + "epoch": 0.5362643935995215, + "grad_norm": 1.2738818990475473, + "learning_rate": 4.656371793158178e-06, + "loss": 0.2069, + "step": 3586 + }, + { + "epoch": 0.5364139374906535, + "grad_norm": 1.9050636351962995, + "learning_rate": 4.653955733758198e-06, + "loss": 0.3567, + "step": 3587 + }, + { + "epoch": 0.5365634813817856, + "grad_norm": 1.6779491300436666, + "learning_rate": 4.651539755543539e-06, + "loss": 0.1506, + "step": 3588 + }, + { + "epoch": 0.5367130252729176, + "grad_norm": 1.7278703163893183, + "learning_rate": 4.6491238590810114e-06, + "loss": 0.5122, + "step": 3589 + }, + { + "epoch": 0.5368625691640496, + "grad_norm": 1.285149964140387, + "learning_rate": 4.646708044937408e-06, + "loss": 0.2056, + "step": 3590 + }, + { + "epoch": 0.5370121130551817, + "grad_norm": 1.0933373626532004, + "learning_rate": 4.644292313679502e-06, + "loss": 0.2019, + "step": 3591 + }, + { + "epoch": 0.5371616569463138, + "grad_norm": 1.4578130787462922, + "learning_rate": 4.6418766658740464e-06, + "loss": 0.191, + "step": 3592 + }, + { + "epoch": 0.5373112008374458, + "grad_norm": 1.5712976259860016, + "learning_rate": 4.639461102087777e-06, + "loss": 0.3531, + "step": 3593 + }, + { + "epoch": 0.5374607447285779, + "grad_norm": 1.4827323156288543, + "learning_rate": 4.63704562288741e-06, + "loss": 0.1951, + "step": 3594 + }, + { + "epoch": 0.5376102886197098, + "grad_norm": 1.115006492033478, + "learning_rate": 4.634630228839635e-06, + "loss": 0.2213, + "step": 3595 + }, + { + "epoch": 0.5377598325108419, + "grad_norm": 1.461958976876183, + "learning_rate": 4.63221492051113e-06, + "loss": 0.4051, + "step": 3596 + }, + { + "epoch": 0.537909376401974, + "grad_norm": 1.6258766326897567, + "learning_rate": 4.629799698468549e-06, + "loss": 0.1955, + "step": 3597 + }, + { + "epoch": 0.538058920293106, + "grad_norm": 1.4054556934541154, + "learning_rate": 4.6273845632785256e-06, + "loss": 0.2125, + "step": 3598 + }, + { + "epoch": 0.5382084641842381, + "grad_norm": 1.041762193432224, + "learning_rate": 4.624969515507679e-06, + "loss": 0.1924, + "step": 3599 + }, + { + "epoch": 0.5383580080753702, + "grad_norm": 1.4542501769456762, + "learning_rate": 4.6225545557225945e-06, + "loss": 0.3441, + "step": 3600 + }, + { + "epoch": 0.5385075519665021, + "grad_norm": 1.4919518389528756, + "learning_rate": 4.620139684489849e-06, + "loss": 0.2355, + "step": 3601 + }, + { + "epoch": 0.5386570958576342, + "grad_norm": 1.3783031852299132, + "learning_rate": 4.617724902375994e-06, + "loss": 0.2304, + "step": 3602 + }, + { + "epoch": 0.5388066397487663, + "grad_norm": 1.4168701539968467, + "learning_rate": 4.615310209947562e-06, + "loss": 0.2107, + "step": 3603 + }, + { + "epoch": 0.5389561836398983, + "grad_norm": 1.644250447091797, + "learning_rate": 4.612895607771064e-06, + "loss": 0.3363, + "step": 3604 + }, + { + "epoch": 0.5391057275310304, + "grad_norm": 1.7003203843951784, + "learning_rate": 4.610481096412985e-06, + "loss": 0.2935, + "step": 3605 + }, + { + "epoch": 0.5392552714221625, + "grad_norm": 1.9608275622063933, + "learning_rate": 4.608066676439794e-06, + "loss": 0.5222, + "step": 3606 + }, + { + "epoch": 0.5394048153132944, + "grad_norm": 1.6707441302180261, + "learning_rate": 4.605652348417938e-06, + "loss": 0.2279, + "step": 3607 + }, + { + "epoch": 0.5395543592044265, + "grad_norm": 1.3570608454800357, + "learning_rate": 4.603238112913842e-06, + "loss": 0.2076, + "step": 3608 + }, + { + "epoch": 0.5397039030955586, + "grad_norm": 1.6140789387503762, + "learning_rate": 4.60082397049391e-06, + "loss": 0.2158, + "step": 3609 + }, + { + "epoch": 0.5398534469866906, + "grad_norm": 2.129218227380532, + "learning_rate": 4.59840992172452e-06, + "loss": 0.6327, + "step": 3610 + }, + { + "epoch": 0.5400029908778227, + "grad_norm": 2.1454675155835017, + "learning_rate": 4.595995967172033e-06, + "loss": 0.6113, + "step": 3611 + }, + { + "epoch": 0.5401525347689546, + "grad_norm": 1.7145729893928194, + "learning_rate": 4.593582107402785e-06, + "loss": 0.3341, + "step": 3612 + }, + { + "epoch": 0.5403020786600867, + "grad_norm": 1.9012732049082708, + "learning_rate": 4.591168342983093e-06, + "loss": 0.2404, + "step": 3613 + }, + { + "epoch": 0.5404516225512188, + "grad_norm": 1.4498433671817794, + "learning_rate": 4.5887546744792476e-06, + "loss": 0.2012, + "step": 3614 + }, + { + "epoch": 0.5406011664423508, + "grad_norm": 1.7171612592641192, + "learning_rate": 4.586341102457518e-06, + "loss": 0.3926, + "step": 3615 + }, + { + "epoch": 0.5407507103334829, + "grad_norm": 1.6371246462787574, + "learning_rate": 4.583927627484153e-06, + "loss": 0.2237, + "step": 3616 + }, + { + "epoch": 0.540900254224615, + "grad_norm": 1.7189037373450475, + "learning_rate": 4.581514250125374e-06, + "loss": 0.1945, + "step": 3617 + }, + { + "epoch": 0.5410497981157469, + "grad_norm": 1.621330478342808, + "learning_rate": 4.579100970947387e-06, + "loss": 0.4194, + "step": 3618 + }, + { + "epoch": 0.541199342006879, + "grad_norm": 2.20237075123333, + "learning_rate": 4.57668779051637e-06, + "loss": 0.4388, + "step": 3619 + }, + { + "epoch": 0.5413488858980111, + "grad_norm": 2.5514565197624197, + "learning_rate": 4.574274709398475e-06, + "loss": 0.2575, + "step": 3620 + }, + { + "epoch": 0.5414984297891431, + "grad_norm": 1.5656114450122998, + "learning_rate": 4.571861728159836e-06, + "loss": 0.2278, + "step": 3621 + }, + { + "epoch": 0.5416479736802752, + "grad_norm": 1.9158231714402134, + "learning_rate": 4.5694488473665625e-06, + "loss": 0.241, + "step": 3622 + }, + { + "epoch": 0.5417975175714073, + "grad_norm": 1.8500844874936493, + "learning_rate": 4.567036067584738e-06, + "loss": 0.4013, + "step": 3623 + }, + { + "epoch": 0.5419470614625392, + "grad_norm": 1.536728323367019, + "learning_rate": 4.564623389380426e-06, + "loss": 0.2078, + "step": 3624 + }, + { + "epoch": 0.5420966053536713, + "grad_norm": 1.7953428909943243, + "learning_rate": 4.562210813319661e-06, + "loss": 0.2134, + "step": 3625 + }, + { + "epoch": 0.5422461492448033, + "grad_norm": 2.018110871488534, + "learning_rate": 4.559798339968459e-06, + "loss": 0.3426, + "step": 3626 + }, + { + "epoch": 0.5423956931359354, + "grad_norm": 1.3138686548561962, + "learning_rate": 4.557385969892809e-06, + "loss": 0.2118, + "step": 3627 + }, + { + "epoch": 0.5425452370270675, + "grad_norm": 2.3800526504743544, + "learning_rate": 4.554973703658676e-06, + "loss": 0.1592, + "step": 3628 + }, + { + "epoch": 0.5426947809181994, + "grad_norm": 1.1674207977655517, + "learning_rate": 4.552561541832002e-06, + "loss": 0.1865, + "step": 3629 + }, + { + "epoch": 0.5428443248093315, + "grad_norm": 1.6296355903967825, + "learning_rate": 4.550149484978701e-06, + "loss": 0.2245, + "step": 3630 + }, + { + "epoch": 0.5429938687004636, + "grad_norm": 1.546275603117866, + "learning_rate": 4.547737533664667e-06, + "loss": 0.1881, + "step": 3631 + }, + { + "epoch": 0.5431434125915956, + "grad_norm": 1.8563039566315576, + "learning_rate": 4.545325688455766e-06, + "loss": 0.2158, + "step": 3632 + }, + { + "epoch": 0.5432929564827277, + "grad_norm": 1.909384983273893, + "learning_rate": 4.54291394991784e-06, + "loss": 0.1928, + "step": 3633 + }, + { + "epoch": 0.5434425003738598, + "grad_norm": 1.7306613061519398, + "learning_rate": 4.540502318616708e-06, + "loss": 0.2385, + "step": 3634 + }, + { + "epoch": 0.5435920442649917, + "grad_norm": 1.5487667458820433, + "learning_rate": 4.538090795118159e-06, + "loss": 0.2039, + "step": 3635 + }, + { + "epoch": 0.5437415881561238, + "grad_norm": 1.8407684602188683, + "learning_rate": 4.535679379987961e-06, + "loss": 0.4428, + "step": 3636 + }, + { + "epoch": 0.5438911320472559, + "grad_norm": 1.8693971467850499, + "learning_rate": 4.533268073791854e-06, + "loss": 0.3359, + "step": 3637 + }, + { + "epoch": 0.5440406759383879, + "grad_norm": 1.540410429421942, + "learning_rate": 4.530856877095557e-06, + "loss": 0.3461, + "step": 3638 + }, + { + "epoch": 0.54419021982952, + "grad_norm": 1.4587300754337025, + "learning_rate": 4.5284457904647585e-06, + "loss": 0.3748, + "step": 3639 + }, + { + "epoch": 0.544339763720652, + "grad_norm": 1.574943101324438, + "learning_rate": 4.526034814465121e-06, + "loss": 0.3262, + "step": 3640 + }, + { + "epoch": 0.544489307611784, + "grad_norm": 1.675685128431864, + "learning_rate": 4.523623949662284e-06, + "loss": 0.302, + "step": 3641 + }, + { + "epoch": 0.5446388515029161, + "grad_norm": 1.2926390795199603, + "learning_rate": 4.521213196621858e-06, + "loss": 0.2833, + "step": 3642 + }, + { + "epoch": 0.5447883953940481, + "grad_norm": 1.4062065912540513, + "learning_rate": 4.518802555909432e-06, + "loss": 0.2638, + "step": 3643 + }, + { + "epoch": 0.5449379392851802, + "grad_norm": 1.5376659922043572, + "learning_rate": 4.516392028090564e-06, + "loss": 0.3906, + "step": 3644 + }, + { + "epoch": 0.5450874831763123, + "grad_norm": 1.4973884587866484, + "learning_rate": 4.513981613730786e-06, + "loss": 0.2047, + "step": 3645 + }, + { + "epoch": 0.5452370270674443, + "grad_norm": 1.3383658073421694, + "learning_rate": 4.511571313395605e-06, + "loss": 0.3175, + "step": 3646 + }, + { + "epoch": 0.5453865709585763, + "grad_norm": 1.8556149033656897, + "learning_rate": 4.509161127650502e-06, + "loss": 0.3392, + "step": 3647 + }, + { + "epoch": 0.5455361148497084, + "grad_norm": 1.80199563542535, + "learning_rate": 4.506751057060928e-06, + "loss": 0.2334, + "step": 3648 + }, + { + "epoch": 0.5456856587408404, + "grad_norm": 1.5120205146776637, + "learning_rate": 4.504341102192311e-06, + "loss": 0.2754, + "step": 3649 + }, + { + "epoch": 0.5458352026319725, + "grad_norm": 1.2694153160565378, + "learning_rate": 4.501931263610047e-06, + "loss": 0.1759, + "step": 3650 + }, + { + "epoch": 0.5459847465231046, + "grad_norm": 2.11280085732116, + "learning_rate": 4.499521541879508e-06, + "loss": 0.3389, + "step": 3651 + }, + { + "epoch": 0.5461342904142366, + "grad_norm": 2.684582629406241, + "learning_rate": 4.49711193756604e-06, + "loss": 0.2197, + "step": 3652 + }, + { + "epoch": 0.5462838343053686, + "grad_norm": 2.1375854659143543, + "learning_rate": 4.494702451234958e-06, + "loss": 0.4994, + "step": 3653 + }, + { + "epoch": 0.5464333781965006, + "grad_norm": 1.7511040236402404, + "learning_rate": 4.492293083451551e-06, + "loss": 0.4358, + "step": 3654 + }, + { + "epoch": 0.5465829220876327, + "grad_norm": 1.9004355383149747, + "learning_rate": 4.489883834781079e-06, + "loss": 0.3991, + "step": 3655 + }, + { + "epoch": 0.5467324659787648, + "grad_norm": 1.9508214870790095, + "learning_rate": 4.4874747057887756e-06, + "loss": 0.2123, + "step": 3656 + }, + { + "epoch": 0.5468820098698968, + "grad_norm": 1.393009322340977, + "learning_rate": 4.4850656970398445e-06, + "loss": 0.1935, + "step": 3657 + }, + { + "epoch": 0.5470315537610289, + "grad_norm": 2.2204350802082264, + "learning_rate": 4.4826568090994635e-06, + "loss": 0.5944, + "step": 3658 + }, + { + "epoch": 0.5471810976521609, + "grad_norm": 1.3876428453270266, + "learning_rate": 4.4802480425327815e-06, + "loss": 0.2091, + "step": 3659 + }, + { + "epoch": 0.5473306415432929, + "grad_norm": 1.5091104368877524, + "learning_rate": 4.47783939790492e-06, + "loss": 0.2027, + "step": 3660 + }, + { + "epoch": 0.547480185434425, + "grad_norm": 1.4782934563639556, + "learning_rate": 4.475430875780966e-06, + "loss": 0.1921, + "step": 3661 + }, + { + "epoch": 0.5476297293255571, + "grad_norm": 1.3476666823273606, + "learning_rate": 4.4730224767259835e-06, + "loss": 0.2987, + "step": 3662 + }, + { + "epoch": 0.5477792732166891, + "grad_norm": 1.497176430979183, + "learning_rate": 4.470614201305007e-06, + "loss": 0.1607, + "step": 3663 + }, + { + "epoch": 0.5479288171078212, + "grad_norm": 1.3796593740972627, + "learning_rate": 4.46820605008304e-06, + "loss": 0.1941, + "step": 3664 + }, + { + "epoch": 0.5480783609989532, + "grad_norm": 1.6361789769261383, + "learning_rate": 4.465798023625062e-06, + "loss": 0.2117, + "step": 3665 + }, + { + "epoch": 0.5482279048900852, + "grad_norm": 1.7401255092969485, + "learning_rate": 4.463390122496012e-06, + "loss": 0.3168, + "step": 3666 + }, + { + "epoch": 0.5483774487812173, + "grad_norm": 1.6768308255934834, + "learning_rate": 4.460982347260813e-06, + "loss": 0.4315, + "step": 3667 + }, + { + "epoch": 0.5485269926723494, + "grad_norm": 1.1769237227237928, + "learning_rate": 4.458574698484349e-06, + "loss": 0.1979, + "step": 3668 + }, + { + "epoch": 0.5486765365634814, + "grad_norm": 1.8181887340685052, + "learning_rate": 4.456167176731478e-06, + "loss": 0.3617, + "step": 3669 + }, + { + "epoch": 0.5488260804546135, + "grad_norm": 1.8597276898084434, + "learning_rate": 4.453759782567031e-06, + "loss": 0.4951, + "step": 3670 + }, + { + "epoch": 0.5489756243457454, + "grad_norm": 2.0804206601929747, + "learning_rate": 4.451352516555802e-06, + "loss": 0.3081, + "step": 3671 + }, + { + "epoch": 0.5491251682368775, + "grad_norm": 1.2510363455073479, + "learning_rate": 4.44894537926256e-06, + "loss": 0.1942, + "step": 3672 + }, + { + "epoch": 0.5492747121280096, + "grad_norm": 1.7610349744949918, + "learning_rate": 4.446538371252042e-06, + "loss": 0.3415, + "step": 3673 + }, + { + "epoch": 0.5494242560191416, + "grad_norm": 1.4549183546588127, + "learning_rate": 4.444131493088956e-06, + "loss": 0.3443, + "step": 3674 + }, + { + "epoch": 0.5495737999102737, + "grad_norm": 1.5300081335892495, + "learning_rate": 4.441724745337982e-06, + "loss": 0.1803, + "step": 3675 + }, + { + "epoch": 0.5497233438014058, + "grad_norm": 1.8473890728356732, + "learning_rate": 4.43931812856376e-06, + "loss": 0.4096, + "step": 3676 + }, + { + "epoch": 0.5498728876925377, + "grad_norm": 1.082783619934534, + "learning_rate": 4.436911643330909e-06, + "loss": 0.1938, + "step": 3677 + }, + { + "epoch": 0.5500224315836698, + "grad_norm": 1.8449280983656142, + "learning_rate": 4.434505290204013e-06, + "loss": 0.2759, + "step": 3678 + }, + { + "epoch": 0.5501719754748019, + "grad_norm": 1.8510421150420215, + "learning_rate": 4.432099069747625e-06, + "loss": 0.3467, + "step": 3679 + }, + { + "epoch": 0.5503215193659339, + "grad_norm": 1.8763952805675503, + "learning_rate": 4.429692982526272e-06, + "loss": 0.3869, + "step": 3680 + }, + { + "epoch": 0.550471063257066, + "grad_norm": 1.8434682323650826, + "learning_rate": 4.4272870291044385e-06, + "loss": 0.3162, + "step": 3681 + }, + { + "epoch": 0.550620607148198, + "grad_norm": 1.619398372644164, + "learning_rate": 4.424881210046588e-06, + "loss": 0.3891, + "step": 3682 + }, + { + "epoch": 0.55077015103933, + "grad_norm": 1.0183619154944885, + "learning_rate": 4.4224755259171474e-06, + "loss": 0.2017, + "step": 3683 + }, + { + "epoch": 0.5509196949304621, + "grad_norm": 1.4729547476353326, + "learning_rate": 4.420069977280515e-06, + "loss": 0.182, + "step": 3684 + }, + { + "epoch": 0.5510692388215941, + "grad_norm": 1.5871225167325878, + "learning_rate": 4.417664564701056e-06, + "loss": 0.1883, + "step": 3685 + }, + { + "epoch": 0.5512187827127262, + "grad_norm": 1.6250502184297433, + "learning_rate": 4.415259288743101e-06, + "loss": 0.2383, + "step": 3686 + }, + { + "epoch": 0.5513683266038583, + "grad_norm": 1.1626293846136282, + "learning_rate": 4.412854149970952e-06, + "loss": 0.2129, + "step": 3687 + }, + { + "epoch": 0.5515178704949902, + "grad_norm": 2.12434434593106, + "learning_rate": 4.410449148948878e-06, + "loss": 0.6634, + "step": 3688 + }, + { + "epoch": 0.5516674143861223, + "grad_norm": 1.779837424684205, + "learning_rate": 4.4080442862411155e-06, + "loss": 0.492, + "step": 3689 + }, + { + "epoch": 0.5518169582772544, + "grad_norm": 1.6280486779070482, + "learning_rate": 4.405639562411869e-06, + "loss": 0.3824, + "step": 3690 + }, + { + "epoch": 0.5519665021683864, + "grad_norm": 1.2269128736362505, + "learning_rate": 4.4032349780253094e-06, + "loss": 0.1843, + "step": 3691 + }, + { + "epoch": 0.5521160460595185, + "grad_norm": 1.5707493169747542, + "learning_rate": 4.400830533645575e-06, + "loss": 0.2989, + "step": 3692 + }, + { + "epoch": 0.5522655899506506, + "grad_norm": 1.5702555536980602, + "learning_rate": 4.3984262298367706e-06, + "loss": 0.3377, + "step": 3693 + }, + { + "epoch": 0.5524151338417825, + "grad_norm": 2.039220723411418, + "learning_rate": 4.3960220671629704e-06, + "loss": 0.2134, + "step": 3694 + }, + { + "epoch": 0.5525646777329146, + "grad_norm": 2.02853402713955, + "learning_rate": 4.393618046188217e-06, + "loss": 0.2223, + "step": 3695 + }, + { + "epoch": 0.5527142216240467, + "grad_norm": 2.1312606881172904, + "learning_rate": 4.391214167476511e-06, + "loss": 0.58, + "step": 3696 + }, + { + "epoch": 0.5528637655151787, + "grad_norm": 1.5335152861023273, + "learning_rate": 4.388810431591829e-06, + "loss": 0.3053, + "step": 3697 + }, + { + "epoch": 0.5530133094063108, + "grad_norm": 1.895118001155134, + "learning_rate": 4.386406839098108e-06, + "loss": 0.3881, + "step": 3698 + }, + { + "epoch": 0.5531628532974427, + "grad_norm": 1.3992028246664994, + "learning_rate": 4.384003390559258e-06, + "loss": 0.196, + "step": 3699 + }, + { + "epoch": 0.5533123971885748, + "grad_norm": 1.421234940471731, + "learning_rate": 4.381600086539151e-06, + "loss": 0.1791, + "step": 3700 + }, + { + "epoch": 0.5534619410797069, + "grad_norm": 1.7926872298354135, + "learning_rate": 4.379196927601622e-06, + "loss": 0.4514, + "step": 3701 + }, + { + "epoch": 0.5536114849708389, + "grad_norm": 1.5046636185379967, + "learning_rate": 4.376793914310476e-06, + "loss": 0.3117, + "step": 3702 + }, + { + "epoch": 0.553761028861971, + "grad_norm": 1.4200654672168842, + "learning_rate": 4.3743910472294835e-06, + "loss": 0.147, + "step": 3703 + }, + { + "epoch": 0.5539105727531031, + "grad_norm": 2.124991711432056, + "learning_rate": 4.371988326922381e-06, + "loss": 0.711, + "step": 3704 + }, + { + "epoch": 0.554060116644235, + "grad_norm": 1.2259532651560952, + "learning_rate": 4.3695857539528705e-06, + "loss": 0.167, + "step": 3705 + }, + { + "epoch": 0.5542096605353671, + "grad_norm": 1.71466572734677, + "learning_rate": 4.367183328884616e-06, + "loss": 0.2966, + "step": 3706 + }, + { + "epoch": 0.5543592044264992, + "grad_norm": 1.4687978199012193, + "learning_rate": 4.364781052281252e-06, + "loss": 0.3417, + "step": 3707 + }, + { + "epoch": 0.5545087483176312, + "grad_norm": 1.4014204433835475, + "learning_rate": 4.362378924706374e-06, + "loss": 0.219, + "step": 3708 + }, + { + "epoch": 0.5546582922087633, + "grad_norm": 1.3260680988141613, + "learning_rate": 4.3599769467235455e-06, + "loss": 0.2324, + "step": 3709 + }, + { + "epoch": 0.5548078360998954, + "grad_norm": 1.9733255365162365, + "learning_rate": 4.357575118896296e-06, + "loss": 0.5631, + "step": 3710 + }, + { + "epoch": 0.5549573799910273, + "grad_norm": 1.704368041443617, + "learning_rate": 4.355173441788112e-06, + "loss": 0.2482, + "step": 3711 + }, + { + "epoch": 0.5551069238821594, + "grad_norm": 1.3602084531631606, + "learning_rate": 4.352771915962454e-06, + "loss": 0.2085, + "step": 3712 + }, + { + "epoch": 0.5552564677732914, + "grad_norm": 1.1924270602927478, + "learning_rate": 4.350370541982742e-06, + "loss": 0.1791, + "step": 3713 + }, + { + "epoch": 0.5554060116644235, + "grad_norm": 1.6609565690705255, + "learning_rate": 4.3479693204123604e-06, + "loss": 0.4581, + "step": 3714 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.3086925554693893, + "learning_rate": 4.345568251814663e-06, + "loss": 0.1964, + "step": 3715 + }, + { + "epoch": 0.5557050994466876, + "grad_norm": 1.459973940584078, + "learning_rate": 4.343167336752959e-06, + "loss": 0.1694, + "step": 3716 + }, + { + "epoch": 0.5558546433378196, + "grad_norm": 1.0197965232991946, + "learning_rate": 4.340766575790528e-06, + "loss": 0.1545, + "step": 3717 + }, + { + "epoch": 0.5560041872289517, + "grad_norm": 1.710086881050594, + "learning_rate": 4.338365969490611e-06, + "loss": 0.479, + "step": 3718 + }, + { + "epoch": 0.5561537311200837, + "grad_norm": 1.2458190047063138, + "learning_rate": 4.3359655184164125e-06, + "loss": 0.2591, + "step": 3719 + }, + { + "epoch": 0.5563032750112158, + "grad_norm": 1.7265111518577587, + "learning_rate": 4.333565223131107e-06, + "loss": 0.3026, + "step": 3720 + }, + { + "epoch": 0.5564528189023479, + "grad_norm": 1.7006783672890977, + "learning_rate": 4.331165084197821e-06, + "loss": 0.4155, + "step": 3721 + }, + { + "epoch": 0.5566023627934799, + "grad_norm": 2.24584551559027, + "learning_rate": 4.328765102179651e-06, + "loss": 0.6092, + "step": 3722 + }, + { + "epoch": 0.5567519066846119, + "grad_norm": 1.5686699024208746, + "learning_rate": 4.326365277639657e-06, + "loss": 0.3227, + "step": 3723 + }, + { + "epoch": 0.556901450575744, + "grad_norm": 1.216697878660411, + "learning_rate": 4.323965611140863e-06, + "loss": 0.3495, + "step": 3724 + }, + { + "epoch": 0.557050994466876, + "grad_norm": 2.1454702555477727, + "learning_rate": 4.32156610324625e-06, + "loss": 0.2151, + "step": 3725 + }, + { + "epoch": 0.5572005383580081, + "grad_norm": 1.4848464819357048, + "learning_rate": 4.319166754518768e-06, + "loss": 0.4727, + "step": 3726 + }, + { + "epoch": 0.5573500822491402, + "grad_norm": 3.270072864805568, + "learning_rate": 4.316767565521327e-06, + "loss": 0.2948, + "step": 3727 + }, + { + "epoch": 0.5574996261402722, + "grad_norm": 1.5653364995014118, + "learning_rate": 4.314368536816799e-06, + "loss": 0.4562, + "step": 3728 + }, + { + "epoch": 0.5576491700314042, + "grad_norm": 2.0500993739442412, + "learning_rate": 4.31196966896802e-06, + "loss": 0.3331, + "step": 3729 + }, + { + "epoch": 0.5577987139225362, + "grad_norm": 2.201295915170263, + "learning_rate": 4.3095709625377885e-06, + "loss": 0.3756, + "step": 3730 + }, + { + "epoch": 0.5579482578136683, + "grad_norm": 1.6854473805153678, + "learning_rate": 4.307172418088863e-06, + "loss": 0.353, + "step": 3731 + }, + { + "epoch": 0.5580978017048004, + "grad_norm": 1.911466231230202, + "learning_rate": 4.304774036183964e-06, + "loss": 0.5574, + "step": 3732 + }, + { + "epoch": 0.5582473455959324, + "grad_norm": 1.929179136209855, + "learning_rate": 4.3023758173857764e-06, + "loss": 0.3294, + "step": 3733 + }, + { + "epoch": 0.5583968894870645, + "grad_norm": 1.3667926899468956, + "learning_rate": 4.299977762256946e-06, + "loss": 0.2124, + "step": 3734 + }, + { + "epoch": 0.5585464333781965, + "grad_norm": 2.697037628503975, + "learning_rate": 4.29757987136008e-06, + "loss": 0.2051, + "step": 3735 + }, + { + "epoch": 0.5586959772693285, + "grad_norm": 1.6135252013042591, + "learning_rate": 4.295182145257744e-06, + "loss": 0.3395, + "step": 3736 + }, + { + "epoch": 0.5588455211604606, + "grad_norm": 1.508358238955974, + "learning_rate": 4.292784584512471e-06, + "loss": 0.1759, + "step": 3737 + }, + { + "epoch": 0.5589950650515927, + "grad_norm": 1.8796805295236263, + "learning_rate": 4.290387189686748e-06, + "loss": 0.476, + "step": 3738 + }, + { + "epoch": 0.5591446089427247, + "grad_norm": 1.5709579431840082, + "learning_rate": 4.287989961343029e-06, + "loss": 0.3692, + "step": 3739 + }, + { + "epoch": 0.5592941528338568, + "grad_norm": 1.5271978772058465, + "learning_rate": 4.28559290004373e-06, + "loss": 0.2455, + "step": 3740 + }, + { + "epoch": 0.5594436967249888, + "grad_norm": 1.5965798116355054, + "learning_rate": 4.283196006351219e-06, + "loss": 0.3172, + "step": 3741 + }, + { + "epoch": 0.5595932406161208, + "grad_norm": 1.5747075138534872, + "learning_rate": 4.280799280827833e-06, + "loss": 0.3397, + "step": 3742 + }, + { + "epoch": 0.5597427845072529, + "grad_norm": 2.348694227529921, + "learning_rate": 4.278402724035868e-06, + "loss": 0.221, + "step": 3743 + }, + { + "epoch": 0.5598923283983849, + "grad_norm": 1.459683116901361, + "learning_rate": 4.276006336537577e-06, + "loss": 0.319, + "step": 3744 + }, + { + "epoch": 0.560041872289517, + "grad_norm": 1.759239579641639, + "learning_rate": 4.273610118895178e-06, + "loss": 0.2453, + "step": 3745 + }, + { + "epoch": 0.560191416180649, + "grad_norm": 1.744850048587745, + "learning_rate": 4.2712140716708465e-06, + "loss": 0.3606, + "step": 3746 + }, + { + "epoch": 0.560340960071781, + "grad_norm": 1.7930423330861387, + "learning_rate": 4.268818195426715e-06, + "loss": 0.3549, + "step": 3747 + }, + { + "epoch": 0.5604905039629131, + "grad_norm": 1.8638965502443476, + "learning_rate": 4.266422490724883e-06, + "loss": 0.2108, + "step": 3748 + }, + { + "epoch": 0.5606400478540452, + "grad_norm": 1.6929676082309733, + "learning_rate": 4.264026958127404e-06, + "loss": 0.3148, + "step": 3749 + }, + { + "epoch": 0.5607895917451772, + "grad_norm": 1.7573506914261001, + "learning_rate": 4.261631598196293e-06, + "loss": 0.3503, + "step": 3750 + }, + { + "epoch": 0.5609391356363093, + "grad_norm": 1.2550353673685644, + "learning_rate": 4.259236411493526e-06, + "loss": 0.2267, + "step": 3751 + }, + { + "epoch": 0.5610886795274413, + "grad_norm": 1.396447043820233, + "learning_rate": 4.256841398581035e-06, + "loss": 0.338, + "step": 3752 + }, + { + "epoch": 0.5612382234185733, + "grad_norm": 1.4047858629698926, + "learning_rate": 4.254446560020714e-06, + "loss": 0.1939, + "step": 3753 + }, + { + "epoch": 0.5613877673097054, + "grad_norm": 1.7448713388804253, + "learning_rate": 4.252051896374416e-06, + "loss": 0.2867, + "step": 3754 + }, + { + "epoch": 0.5615373112008375, + "grad_norm": 1.190396991434064, + "learning_rate": 4.24965740820395e-06, + "loss": 0.1743, + "step": 3755 + }, + { + "epoch": 0.5616868550919695, + "grad_norm": 1.4830547911619545, + "learning_rate": 4.2472630960710906e-06, + "loss": 0.2371, + "step": 3756 + }, + { + "epoch": 0.5618363989831016, + "grad_norm": 1.6803677800346988, + "learning_rate": 4.24486896053756e-06, + "loss": 0.4105, + "step": 3757 + }, + { + "epoch": 0.5619859428742335, + "grad_norm": 1.7843402963560555, + "learning_rate": 4.2424750021650505e-06, + "loss": 0.1911, + "step": 3758 + }, + { + "epoch": 0.5621354867653656, + "grad_norm": 1.3070197531860872, + "learning_rate": 4.240081221515203e-06, + "loss": 0.2109, + "step": 3759 + }, + { + "epoch": 0.5622850306564977, + "grad_norm": 1.4646566398646295, + "learning_rate": 4.237687619149627e-06, + "loss": 0.2391, + "step": 3760 + }, + { + "epoch": 0.5624345745476297, + "grad_norm": 1.3749863369219528, + "learning_rate": 4.2352941956298835e-06, + "loss": 0.3234, + "step": 3761 + }, + { + "epoch": 0.5625841184387618, + "grad_norm": 1.6256377087587262, + "learning_rate": 4.23290095151749e-06, + "loss": 0.3059, + "step": 3762 + }, + { + "epoch": 0.5627336623298939, + "grad_norm": 1.0526622135292967, + "learning_rate": 4.230507887373926e-06, + "loss": 0.1609, + "step": 3763 + }, + { + "epoch": 0.5628832062210258, + "grad_norm": 1.1601833699970812, + "learning_rate": 4.228115003760627e-06, + "loss": 0.2103, + "step": 3764 + }, + { + "epoch": 0.5630327501121579, + "grad_norm": 1.3805756715861772, + "learning_rate": 4.225722301238987e-06, + "loss": 0.2133, + "step": 3765 + }, + { + "epoch": 0.56318229400329, + "grad_norm": 1.2285578315653152, + "learning_rate": 4.223329780370359e-06, + "loss": 0.2906, + "step": 3766 + }, + { + "epoch": 0.563331837894422, + "grad_norm": 1.0545194721595736, + "learning_rate": 4.220937441716048e-06, + "loss": 0.2419, + "step": 3767 + }, + { + "epoch": 0.5634813817855541, + "grad_norm": 1.8754558366761425, + "learning_rate": 4.218545285837321e-06, + "loss": 0.3342, + "step": 3768 + }, + { + "epoch": 0.5636309256766862, + "grad_norm": 1.879834063009346, + "learning_rate": 4.216153313295401e-06, + "loss": 0.3649, + "step": 3769 + }, + { + "epoch": 0.5637804695678181, + "grad_norm": 1.373634860925274, + "learning_rate": 4.213761524651469e-06, + "loss": 0.2021, + "step": 3770 + }, + { + "epoch": 0.5639300134589502, + "grad_norm": 1.6925748087720431, + "learning_rate": 4.211369920466661e-06, + "loss": 0.5069, + "step": 3771 + }, + { + "epoch": 0.5640795573500822, + "grad_norm": 1.217939544459708, + "learning_rate": 4.2089785013020686e-06, + "loss": 0.1998, + "step": 3772 + }, + { + "epoch": 0.5642291012412143, + "grad_norm": 1.926661562944481, + "learning_rate": 4.206587267718743e-06, + "loss": 0.4489, + "step": 3773 + }, + { + "epoch": 0.5643786451323464, + "grad_norm": 1.6974792521617525, + "learning_rate": 4.204196220277692e-06, + "loss": 0.3709, + "step": 3774 + }, + { + "epoch": 0.5645281890234783, + "grad_norm": 1.3914955473555144, + "learning_rate": 4.2018053595398766e-06, + "loss": 0.3334, + "step": 3775 + }, + { + "epoch": 0.5646777329146104, + "grad_norm": 1.8360783705539188, + "learning_rate": 4.199414686066219e-06, + "loss": 0.3313, + "step": 3776 + }, + { + "epoch": 0.5648272768057425, + "grad_norm": 1.623276695386809, + "learning_rate": 4.1970242004175884e-06, + "loss": 0.372, + "step": 3777 + }, + { + "epoch": 0.5649768206968745, + "grad_norm": 1.6066517149442552, + "learning_rate": 4.194633903154819e-06, + "loss": 0.1992, + "step": 3778 + }, + { + "epoch": 0.5651263645880066, + "grad_norm": 1.5792435940178833, + "learning_rate": 4.192243794838698e-06, + "loss": 0.4063, + "step": 3779 + }, + { + "epoch": 0.5652759084791387, + "grad_norm": 0.9997425194899473, + "learning_rate": 4.189853876029966e-06, + "loss": 0.3228, + "step": 3780 + }, + { + "epoch": 0.5654254523702706, + "grad_norm": 2.063913929954077, + "learning_rate": 4.187464147289326e-06, + "loss": 0.3574, + "step": 3781 + }, + { + "epoch": 0.5655749962614027, + "grad_norm": 1.635138880368131, + "learning_rate": 4.185074609177425e-06, + "loss": 0.3418, + "step": 3782 + }, + { + "epoch": 0.5657245401525348, + "grad_norm": 1.7099814378347198, + "learning_rate": 4.182685262254875e-06, + "loss": 0.3688, + "step": 3783 + }, + { + "epoch": 0.5658740840436668, + "grad_norm": 1.048172395904991, + "learning_rate": 4.180296107082238e-06, + "loss": 0.1967, + "step": 3784 + }, + { + "epoch": 0.5660236279347989, + "grad_norm": 1.4773704699879262, + "learning_rate": 4.177907144220036e-06, + "loss": 0.2355, + "step": 3785 + }, + { + "epoch": 0.566173171825931, + "grad_norm": 1.7211220022506921, + "learning_rate": 4.1755183742287404e-06, + "loss": 0.4811, + "step": 3786 + }, + { + "epoch": 0.5663227157170629, + "grad_norm": 1.5450919500167295, + "learning_rate": 4.173129797668779e-06, + "loss": 0.3029, + "step": 3787 + }, + { + "epoch": 0.566472259608195, + "grad_norm": 1.3702653273245642, + "learning_rate": 4.1707414151005374e-06, + "loss": 0.2632, + "step": 3788 + }, + { + "epoch": 0.566621803499327, + "grad_norm": 1.6529737223038268, + "learning_rate": 4.1683532270843505e-06, + "loss": 0.4421, + "step": 3789 + }, + { + "epoch": 0.5667713473904591, + "grad_norm": 1.727830283297176, + "learning_rate": 4.1659652341805115e-06, + "loss": 0.4467, + "step": 3790 + }, + { + "epoch": 0.5669208912815912, + "grad_norm": 1.7423370579387567, + "learning_rate": 4.163577436949267e-06, + "loss": 0.344, + "step": 3791 + }, + { + "epoch": 0.5670704351727232, + "grad_norm": 1.839263370923161, + "learning_rate": 4.161189835950816e-06, + "loss": 0.4408, + "step": 3792 + }, + { + "epoch": 0.5672199790638552, + "grad_norm": 1.2231225047916932, + "learning_rate": 4.158802431745314e-06, + "loss": 0.2086, + "step": 3793 + }, + { + "epoch": 0.5673695229549873, + "grad_norm": 1.6834296845774626, + "learning_rate": 4.156415224892868e-06, + "loss": 0.2075, + "step": 3794 + }, + { + "epoch": 0.5675190668461193, + "grad_norm": 1.63695734693028, + "learning_rate": 4.1540282159535405e-06, + "loss": 0.3609, + "step": 3795 + }, + { + "epoch": 0.5676686107372514, + "grad_norm": 4.050254883222083, + "learning_rate": 4.151641405487348e-06, + "loss": 0.4249, + "step": 3796 + }, + { + "epoch": 0.5678181546283835, + "grad_norm": 2.012583096932945, + "learning_rate": 4.149254794054256e-06, + "loss": 0.3465, + "step": 3797 + }, + { + "epoch": 0.5679676985195155, + "grad_norm": 1.3031350746325656, + "learning_rate": 4.146868382214188e-06, + "loss": 0.2125, + "step": 3798 + }, + { + "epoch": 0.5681172424106475, + "grad_norm": 1.132012454693644, + "learning_rate": 4.14448217052702e-06, + "loss": 0.2188, + "step": 3799 + }, + { + "epoch": 0.5682667863017796, + "grad_norm": 1.7236076504434965, + "learning_rate": 4.142096159552578e-06, + "loss": 0.3361, + "step": 3800 + }, + { + "epoch": 0.5684163301929116, + "grad_norm": 1.8633241948391013, + "learning_rate": 4.139710349850649e-06, + "loss": 0.2458, + "step": 3801 + }, + { + "epoch": 0.5685658740840437, + "grad_norm": 1.625642374423236, + "learning_rate": 4.13732474198096e-06, + "loss": 0.287, + "step": 3802 + }, + { + "epoch": 0.5687154179751757, + "grad_norm": 1.3510065589123672, + "learning_rate": 4.134939336503201e-06, + "loss": 0.2177, + "step": 3803 + }, + { + "epoch": 0.5688649618663078, + "grad_norm": 1.3967447851707402, + "learning_rate": 4.13255413397701e-06, + "loss": 0.2053, + "step": 3804 + }, + { + "epoch": 0.5690145057574398, + "grad_norm": 3.731031445311677, + "learning_rate": 4.130169134961979e-06, + "loss": 0.467, + "step": 3805 + }, + { + "epoch": 0.5691640496485718, + "grad_norm": 1.6356233741539357, + "learning_rate": 4.127784340017653e-06, + "loss": 0.367, + "step": 3806 + }, + { + "epoch": 0.5693135935397039, + "grad_norm": 1.6628010853628497, + "learning_rate": 4.125399749703526e-06, + "loss": 0.3327, + "step": 3807 + }, + { + "epoch": 0.569463137430836, + "grad_norm": 1.7948970149563248, + "learning_rate": 4.123015364579046e-06, + "loss": 0.2519, + "step": 3808 + }, + { + "epoch": 0.569612681321968, + "grad_norm": 1.5509060057609694, + "learning_rate": 4.120631185203613e-06, + "loss": 0.3288, + "step": 3809 + }, + { + "epoch": 0.5697622252131, + "grad_norm": 1.9889960830789197, + "learning_rate": 4.118247212136579e-06, + "loss": 0.2358, + "step": 3810 + }, + { + "epoch": 0.5699117691042321, + "grad_norm": 1.4594387383602159, + "learning_rate": 4.11586344593725e-06, + "loss": 0.2483, + "step": 3811 + }, + { + "epoch": 0.5700613129953641, + "grad_norm": 1.505789986482447, + "learning_rate": 4.113479887164873e-06, + "loss": 0.1721, + "step": 3812 + }, + { + "epoch": 0.5702108568864962, + "grad_norm": 1.5260414366564614, + "learning_rate": 4.111096536378661e-06, + "loss": 0.3631, + "step": 3813 + }, + { + "epoch": 0.5703604007776283, + "grad_norm": 1.1589677612121907, + "learning_rate": 4.108713394137766e-06, + "loss": 0.1784, + "step": 3814 + }, + { + "epoch": 0.5705099446687603, + "grad_norm": 1.8625931258889772, + "learning_rate": 4.106330461001299e-06, + "loss": 0.2186, + "step": 3815 + }, + { + "epoch": 0.5706594885598923, + "grad_norm": 1.8225390074850283, + "learning_rate": 4.103947737528321e-06, + "loss": 0.2044, + "step": 3816 + }, + { + "epoch": 0.5708090324510243, + "grad_norm": 1.2260264786541455, + "learning_rate": 4.101565224277837e-06, + "loss": 0.2178, + "step": 3817 + }, + { + "epoch": 0.5709585763421564, + "grad_norm": 2.25246042259685, + "learning_rate": 4.0991829218088104e-06, + "loss": 0.1861, + "step": 3818 + }, + { + "epoch": 0.5711081202332885, + "grad_norm": 1.4584755271889351, + "learning_rate": 4.096800830680151e-06, + "loss": 0.2435, + "step": 3819 + }, + { + "epoch": 0.5712576641244205, + "grad_norm": 1.3471748072584406, + "learning_rate": 4.094418951450721e-06, + "loss": 0.2657, + "step": 3820 + }, + { + "epoch": 0.5714072080155526, + "grad_norm": 1.4912123602040317, + "learning_rate": 4.092037284679335e-06, + "loss": 0.1924, + "step": 3821 + }, + { + "epoch": 0.5715567519066846, + "grad_norm": 2.3296405416523798, + "learning_rate": 4.08965583092475e-06, + "loss": 0.3366, + "step": 3822 + }, + { + "epoch": 0.5717062957978166, + "grad_norm": 1.2984835174705707, + "learning_rate": 4.087274590745681e-06, + "loss": 0.1936, + "step": 3823 + }, + { + "epoch": 0.5718558396889487, + "grad_norm": 1.824810422234852, + "learning_rate": 4.084893564700789e-06, + "loss": 0.2123, + "step": 3824 + }, + { + "epoch": 0.5720053835800808, + "grad_norm": 1.5552749721927202, + "learning_rate": 4.0825127533486865e-06, + "loss": 0.21, + "step": 3825 + }, + { + "epoch": 0.5721549274712128, + "grad_norm": 2.3462920390606934, + "learning_rate": 4.080132157247935e-06, + "loss": 0.5941, + "step": 3826 + }, + { + "epoch": 0.5723044713623449, + "grad_norm": 1.406630107053223, + "learning_rate": 4.077751776957043e-06, + "loss": 0.1833, + "step": 3827 + }, + { + "epoch": 0.5724540152534769, + "grad_norm": 1.6381181183648261, + "learning_rate": 4.075371613034472e-06, + "loss": 0.1843, + "step": 3828 + }, + { + "epoch": 0.5726035591446089, + "grad_norm": 1.7160296586520287, + "learning_rate": 4.072991666038633e-06, + "loss": 0.3517, + "step": 3829 + }, + { + "epoch": 0.572753103035741, + "grad_norm": 1.625254929701419, + "learning_rate": 4.070611936527882e-06, + "loss": 0.3434, + "step": 3830 + }, + { + "epoch": 0.572902646926873, + "grad_norm": 1.1799818297023703, + "learning_rate": 4.068232425060527e-06, + "loss": 0.1565, + "step": 3831 + }, + { + "epoch": 0.5730521908180051, + "grad_norm": 2.2431824234579283, + "learning_rate": 4.065853132194827e-06, + "loss": 0.2081, + "step": 3832 + }, + { + "epoch": 0.5732017347091372, + "grad_norm": 1.3731533378251932, + "learning_rate": 4.0634740584889835e-06, + "loss": 0.1461, + "step": 3833 + }, + { + "epoch": 0.5733512786002691, + "grad_norm": 1.8919618152249174, + "learning_rate": 4.061095204501151e-06, + "loss": 0.4743, + "step": 3834 + }, + { + "epoch": 0.5735008224914012, + "grad_norm": 1.5440897864945764, + "learning_rate": 4.0587165707894326e-06, + "loss": 0.2421, + "step": 3835 + }, + { + "epoch": 0.5736503663825333, + "grad_norm": 2.102210076263177, + "learning_rate": 4.056338157911877e-06, + "loss": 0.216, + "step": 3836 + }, + { + "epoch": 0.5737999102736653, + "grad_norm": 1.480929733210923, + "learning_rate": 4.053959966426488e-06, + "loss": 0.3114, + "step": 3837 + }, + { + "epoch": 0.5739494541647974, + "grad_norm": 1.675352548569913, + "learning_rate": 4.0515819968912055e-06, + "loss": 0.3535, + "step": 3838 + }, + { + "epoch": 0.5740989980559295, + "grad_norm": 1.2596206818226765, + "learning_rate": 4.049204249863926e-06, + "loss": 0.1904, + "step": 3839 + }, + { + "epoch": 0.5742485419470614, + "grad_norm": 2.5280885615982314, + "learning_rate": 4.0468267259024935e-06, + "loss": 0.4747, + "step": 3840 + }, + { + "epoch": 0.5743980858381935, + "grad_norm": 1.4594935661035582, + "learning_rate": 4.044449425564696e-06, + "loss": 0.3341, + "step": 3841 + }, + { + "epoch": 0.5745476297293256, + "grad_norm": 1.7090098221110461, + "learning_rate": 4.042072349408275e-06, + "loss": 0.3826, + "step": 3842 + }, + { + "epoch": 0.5746971736204576, + "grad_norm": 2.0106514775480755, + "learning_rate": 4.0396954979909105e-06, + "loss": 0.2264, + "step": 3843 + }, + { + "epoch": 0.5748467175115897, + "grad_norm": 1.9951662121993903, + "learning_rate": 4.037318871870238e-06, + "loss": 0.2587, + "step": 3844 + }, + { + "epoch": 0.5749962614027218, + "grad_norm": 2.3735689067814882, + "learning_rate": 4.034942471603835e-06, + "loss": 0.6212, + "step": 3845 + }, + { + "epoch": 0.5751458052938537, + "grad_norm": 1.3313805957100329, + "learning_rate": 4.032566297749229e-06, + "loss": 0.1866, + "step": 3846 + }, + { + "epoch": 0.5752953491849858, + "grad_norm": 1.7289313985492631, + "learning_rate": 4.030190350863894e-06, + "loss": 0.3472, + "step": 3847 + }, + { + "epoch": 0.5754448930761178, + "grad_norm": 1.493639708680967, + "learning_rate": 4.027814631505248e-06, + "loss": 0.1904, + "step": 3848 + }, + { + "epoch": 0.5755944369672499, + "grad_norm": 1.822043958390137, + "learning_rate": 4.0254391402306595e-06, + "loss": 0.3634, + "step": 3849 + }, + { + "epoch": 0.575743980858382, + "grad_norm": 1.713070502237203, + "learning_rate": 4.023063877597439e-06, + "loss": 0.4581, + "step": 3850 + }, + { + "epoch": 0.5758935247495139, + "grad_norm": 1.33095048873002, + "learning_rate": 4.02068884416285e-06, + "loss": 0.2208, + "step": 3851 + }, + { + "epoch": 0.576043068640646, + "grad_norm": 1.7910675740169966, + "learning_rate": 4.018314040484096e-06, + "loss": 0.2775, + "step": 3852 + }, + { + "epoch": 0.5761926125317781, + "grad_norm": 1.9328597024089476, + "learning_rate": 4.015939467118328e-06, + "loss": 0.1634, + "step": 3853 + }, + { + "epoch": 0.5763421564229101, + "grad_norm": 2.0376615827356233, + "learning_rate": 4.0135651246226456e-06, + "loss": 0.2157, + "step": 3854 + }, + { + "epoch": 0.5764917003140422, + "grad_norm": 1.680601957815068, + "learning_rate": 4.011191013554091e-06, + "loss": 0.3631, + "step": 3855 + }, + { + "epoch": 0.5766412442051743, + "grad_norm": 2.1132908114007214, + "learning_rate": 4.0088171344696545e-06, + "loss": 0.4247, + "step": 3856 + }, + { + "epoch": 0.5767907880963062, + "grad_norm": 1.5935575986177055, + "learning_rate": 4.006443487926272e-06, + "loss": 0.2774, + "step": 3857 + }, + { + "epoch": 0.5769403319874383, + "grad_norm": 9.754017882344208, + "learning_rate": 4.004070074480821e-06, + "loss": 0.2705, + "step": 3858 + }, + { + "epoch": 0.5770898758785704, + "grad_norm": 1.55564169140264, + "learning_rate": 4.001696894690128e-06, + "loss": 0.3352, + "step": 3859 + }, + { + "epoch": 0.5772394197697024, + "grad_norm": 1.798973114929995, + "learning_rate": 3.999323949110965e-06, + "loss": 0.3104, + "step": 3860 + }, + { + "epoch": 0.5773889636608345, + "grad_norm": 1.2999830259764698, + "learning_rate": 3.9969512383000465e-06, + "loss": 0.1917, + "step": 3861 + }, + { + "epoch": 0.5775385075519665, + "grad_norm": 1.6701086999443087, + "learning_rate": 3.994578762814037e-06, + "loss": 0.2341, + "step": 3862 + }, + { + "epoch": 0.5776880514430985, + "grad_norm": 2.007518390533276, + "learning_rate": 3.992206523209538e-06, + "loss": 0.4862, + "step": 3863 + }, + { + "epoch": 0.5778375953342306, + "grad_norm": 1.6758189519451339, + "learning_rate": 3.989834520043101e-06, + "loss": 0.3451, + "step": 3864 + }, + { + "epoch": 0.5779871392253626, + "grad_norm": 1.4063121728928503, + "learning_rate": 3.98746275387122e-06, + "loss": 0.1555, + "step": 3865 + }, + { + "epoch": 0.5781366831164947, + "grad_norm": 1.4879389391741649, + "learning_rate": 3.985091225250336e-06, + "loss": 0.1553, + "step": 3866 + }, + { + "epoch": 0.5782862270076268, + "grad_norm": 4.347154093050415, + "learning_rate": 3.982719934736832e-06, + "loss": 0.4053, + "step": 3867 + }, + { + "epoch": 0.5784357708987587, + "grad_norm": 1.5443034001101248, + "learning_rate": 3.980348882887035e-06, + "loss": 0.3198, + "step": 3868 + }, + { + "epoch": 0.5785853147898908, + "grad_norm": 1.7950442773636421, + "learning_rate": 3.977978070257216e-06, + "loss": 0.4694, + "step": 3869 + }, + { + "epoch": 0.5787348586810229, + "grad_norm": 2.1973684295340363, + "learning_rate": 3.975607497403592e-06, + "loss": 0.4131, + "step": 3870 + }, + { + "epoch": 0.5788844025721549, + "grad_norm": 1.7267711780843302, + "learning_rate": 3.973237164882321e-06, + "loss": 0.2991, + "step": 3871 + }, + { + "epoch": 0.579033946463287, + "grad_norm": 1.2991893137030361, + "learning_rate": 3.970867073249508e-06, + "loss": 0.2453, + "step": 3872 + }, + { + "epoch": 0.5791834903544191, + "grad_norm": 1.5602478676658988, + "learning_rate": 3.9684972230611956e-06, + "loss": 0.2093, + "step": 3873 + }, + { + "epoch": 0.579333034245551, + "grad_norm": 1.6934982295673262, + "learning_rate": 3.966127614873376e-06, + "loss": 0.2215, + "step": 3874 + }, + { + "epoch": 0.5794825781366831, + "grad_norm": 2.110410563267741, + "learning_rate": 3.963758249241983e-06, + "loss": 0.5277, + "step": 3875 + }, + { + "epoch": 0.5796321220278151, + "grad_norm": 1.8247166919937388, + "learning_rate": 3.96138912672289e-06, + "loss": 0.3023, + "step": 3876 + }, + { + "epoch": 0.5797816659189472, + "grad_norm": 1.3368960784081465, + "learning_rate": 3.9590202478719205e-06, + "loss": 0.2167, + "step": 3877 + }, + { + "epoch": 0.5799312098100793, + "grad_norm": 1.5584675563310413, + "learning_rate": 3.95665161324483e-06, + "loss": 0.3788, + "step": 3878 + }, + { + "epoch": 0.5800807537012113, + "grad_norm": 1.8874134439396686, + "learning_rate": 3.954283223397327e-06, + "loss": 0.2273, + "step": 3879 + }, + { + "epoch": 0.5802302975923433, + "grad_norm": 1.6256776139655844, + "learning_rate": 3.951915078885057e-06, + "loss": 0.3477, + "step": 3880 + }, + { + "epoch": 0.5803798414834754, + "grad_norm": 2.1424342396636025, + "learning_rate": 3.94954718026361e-06, + "loss": 0.2369, + "step": 3881 + }, + { + "epoch": 0.5805293853746074, + "grad_norm": 1.5525493295636965, + "learning_rate": 3.94717952808852e-06, + "loss": 0.2494, + "step": 3882 + }, + { + "epoch": 0.5806789292657395, + "grad_norm": 1.2525644074657651, + "learning_rate": 3.9448121229152585e-06, + "loss": 0.3307, + "step": 3883 + }, + { + "epoch": 0.5808284731568716, + "grad_norm": 1.3489399531915276, + "learning_rate": 3.942444965299242e-06, + "loss": 0.3343, + "step": 3884 + }, + { + "epoch": 0.5809780170480036, + "grad_norm": 1.6971348754950633, + "learning_rate": 3.9400780557958276e-06, + "loss": 0.358, + "step": 3885 + }, + { + "epoch": 0.5811275609391356, + "grad_norm": 1.0598261581438866, + "learning_rate": 3.937711394960318e-06, + "loss": 0.2013, + "step": 3886 + }, + { + "epoch": 0.5812771048302677, + "grad_norm": 1.7799188090141735, + "learning_rate": 3.9353449833479515e-06, + "loss": 0.3384, + "step": 3887 + }, + { + "epoch": 0.5814266487213997, + "grad_norm": 2.17623015999994, + "learning_rate": 3.932978821513913e-06, + "loss": 0.628, + "step": 3888 + }, + { + "epoch": 0.5815761926125318, + "grad_norm": 1.6370721585688572, + "learning_rate": 3.930612910013326e-06, + "loss": 0.4118, + "step": 3889 + }, + { + "epoch": 0.5817257365036638, + "grad_norm": 1.9972246926476178, + "learning_rate": 3.928247249401256e-06, + "loss": 0.3762, + "step": 3890 + }, + { + "epoch": 0.5818752803947959, + "grad_norm": 1.568923848614493, + "learning_rate": 3.925881840232711e-06, + "loss": 0.19, + "step": 3891 + }, + { + "epoch": 0.5820248242859279, + "grad_norm": 1.989967171024415, + "learning_rate": 3.9235166830626375e-06, + "loss": 0.6291, + "step": 3892 + }, + { + "epoch": 0.5821743681770599, + "grad_norm": 1.6320609023822952, + "learning_rate": 3.921151778445925e-06, + "loss": 0.2308, + "step": 3893 + }, + { + "epoch": 0.582323912068192, + "grad_norm": 1.6089921792274722, + "learning_rate": 3.918787126937401e-06, + "loss": 0.3225, + "step": 3894 + }, + { + "epoch": 0.5824734559593241, + "grad_norm": 1.7543181167298438, + "learning_rate": 3.916422729091838e-06, + "loss": 0.2871, + "step": 3895 + }, + { + "epoch": 0.5826229998504561, + "grad_norm": 1.7660217430179217, + "learning_rate": 3.914058585463946e-06, + "loss": 0.3934, + "step": 3896 + }, + { + "epoch": 0.5827725437415882, + "grad_norm": 1.6926823971057354, + "learning_rate": 3.911694696608377e-06, + "loss": 0.2323, + "step": 3897 + }, + { + "epoch": 0.5829220876327202, + "grad_norm": 1.428661667605896, + "learning_rate": 3.909331063079719e-06, + "loss": 0.2164, + "step": 3898 + }, + { + "epoch": 0.5830716315238522, + "grad_norm": 2.2116367730485087, + "learning_rate": 3.906967685432504e-06, + "loss": 0.6146, + "step": 3899 + }, + { + "epoch": 0.5832211754149843, + "grad_norm": 1.4672573042525427, + "learning_rate": 3.904604564221204e-06, + "loss": 0.359, + "step": 3900 + }, + { + "epoch": 0.5833707193061164, + "grad_norm": 1.4136745331637248, + "learning_rate": 3.90224170000023e-06, + "loss": 0.3082, + "step": 3901 + }, + { + "epoch": 0.5835202631972484, + "grad_norm": 1.5604421702220446, + "learning_rate": 3.899879093323935e-06, + "loss": 0.3606, + "step": 3902 + }, + { + "epoch": 0.5836698070883805, + "grad_norm": 1.4826709528011215, + "learning_rate": 3.897516744746605e-06, + "loss": 0.3643, + "step": 3903 + }, + { + "epoch": 0.5838193509795125, + "grad_norm": 1.436442769039814, + "learning_rate": 3.895154654822471e-06, + "loss": 0.3506, + "step": 3904 + }, + { + "epoch": 0.5839688948706445, + "grad_norm": 1.007210202220155, + "learning_rate": 3.892792824105703e-06, + "loss": 0.1583, + "step": 3905 + }, + { + "epoch": 0.5841184387617766, + "grad_norm": 1.6057856448875152, + "learning_rate": 3.89043125315041e-06, + "loss": 0.367, + "step": 3906 + }, + { + "epoch": 0.5842679826529086, + "grad_norm": 1.1873872540306754, + "learning_rate": 3.888069942510639e-06, + "loss": 0.1812, + "step": 3907 + }, + { + "epoch": 0.5844175265440407, + "grad_norm": 1.531148138356171, + "learning_rate": 3.885708892740374e-06, + "loss": 0.2584, + "step": 3908 + }, + { + "epoch": 0.5845670704351728, + "grad_norm": 1.53531282902828, + "learning_rate": 3.883348104393542e-06, + "loss": 0.2089, + "step": 3909 + }, + { + "epoch": 0.5847166143263047, + "grad_norm": 1.2571052242037701, + "learning_rate": 3.880987578024006e-06, + "loss": 0.1618, + "step": 3910 + }, + { + "epoch": 0.5848661582174368, + "grad_norm": 1.685780465923781, + "learning_rate": 3.878627314185569e-06, + "loss": 0.4313, + "step": 3911 + }, + { + "epoch": 0.5850157021085689, + "grad_norm": 1.7418918273189634, + "learning_rate": 3.876267313431973e-06, + "loss": 0.475, + "step": 3912 + }, + { + "epoch": 0.5851652459997009, + "grad_norm": 1.734807587708549, + "learning_rate": 3.873907576316892e-06, + "loss": 0.4154, + "step": 3913 + }, + { + "epoch": 0.585314789890833, + "grad_norm": 1.6406377655622573, + "learning_rate": 3.871548103393947e-06, + "loss": 0.3568, + "step": 3914 + }, + { + "epoch": 0.585464333781965, + "grad_norm": 1.1517941732296657, + "learning_rate": 3.869188895216692e-06, + "loss": 0.1842, + "step": 3915 + }, + { + "epoch": 0.585613877673097, + "grad_norm": 1.4377802648166382, + "learning_rate": 3.866829952338621e-06, + "loss": 0.3479, + "step": 3916 + }, + { + "epoch": 0.5857634215642291, + "grad_norm": 1.114349518595328, + "learning_rate": 3.864471275313164e-06, + "loss": 0.1237, + "step": 3917 + }, + { + "epoch": 0.5859129654553612, + "grad_norm": 1.4127129455946879, + "learning_rate": 3.862112864693691e-06, + "loss": 0.1984, + "step": 3918 + }, + { + "epoch": 0.5860625093464932, + "grad_norm": 1.4417924544505127, + "learning_rate": 3.859754721033504e-06, + "loss": 0.3395, + "step": 3919 + }, + { + "epoch": 0.5862120532376253, + "grad_norm": 1.7189752041055817, + "learning_rate": 3.857396844885849e-06, + "loss": 0.4097, + "step": 3920 + }, + { + "epoch": 0.5863615971287572, + "grad_norm": 1.3073724826843582, + "learning_rate": 3.8550392368039045e-06, + "loss": 0.2553, + "step": 3921 + }, + { + "epoch": 0.5865111410198893, + "grad_norm": 1.4564439495009185, + "learning_rate": 3.85268189734079e-06, + "loss": 0.2909, + "step": 3922 + }, + { + "epoch": 0.5866606849110214, + "grad_norm": 1.5841823741056198, + "learning_rate": 3.850324827049561e-06, + "loss": 0.2065, + "step": 3923 + }, + { + "epoch": 0.5868102288021534, + "grad_norm": 1.0916215110890355, + "learning_rate": 3.847968026483206e-06, + "loss": 0.1892, + "step": 3924 + }, + { + "epoch": 0.5869597726932855, + "grad_norm": 1.9627347229493133, + "learning_rate": 3.845611496194655e-06, + "loss": 0.2363, + "step": 3925 + }, + { + "epoch": 0.5871093165844176, + "grad_norm": 1.4671089891749798, + "learning_rate": 3.843255236736773e-06, + "loss": 0.1938, + "step": 3926 + }, + { + "epoch": 0.5872588604755495, + "grad_norm": 1.5337050885765506, + "learning_rate": 3.840899248662358e-06, + "loss": 0.3077, + "step": 3927 + }, + { + "epoch": 0.5874084043666816, + "grad_norm": 1.1406387988000504, + "learning_rate": 3.838543532524153e-06, + "loss": 0.1899, + "step": 3928 + }, + { + "epoch": 0.5875579482578137, + "grad_norm": 1.1589973791618806, + "learning_rate": 3.836188088874827e-06, + "loss": 0.1882, + "step": 3929 + }, + { + "epoch": 0.5877074921489457, + "grad_norm": 1.8799012061866471, + "learning_rate": 3.833832918266992e-06, + "loss": 0.2755, + "step": 3930 + }, + { + "epoch": 0.5878570360400778, + "grad_norm": 1.8985733996127807, + "learning_rate": 3.831478021253192e-06, + "loss": 0.5369, + "step": 3931 + }, + { + "epoch": 0.5880065799312099, + "grad_norm": 1.1384325773858892, + "learning_rate": 3.829123398385911e-06, + "loss": 0.1673, + "step": 3932 + }, + { + "epoch": 0.5881561238223418, + "grad_norm": 1.2281350484830058, + "learning_rate": 3.826769050217564e-06, + "loss": 0.1814, + "step": 3933 + }, + { + "epoch": 0.5883056677134739, + "grad_norm": 1.6894736624996396, + "learning_rate": 3.824414977300506e-06, + "loss": 0.3602, + "step": 3934 + }, + { + "epoch": 0.5884552116046059, + "grad_norm": 1.4936989174970754, + "learning_rate": 3.822061180187022e-06, + "loss": 0.3309, + "step": 3935 + }, + { + "epoch": 0.588604755495738, + "grad_norm": 1.2978536320218477, + "learning_rate": 3.819707659429339e-06, + "loss": 0.2046, + "step": 3936 + }, + { + "epoch": 0.5887542993868701, + "grad_norm": 1.8284004362578155, + "learning_rate": 3.817354415579612e-06, + "loss": 0.5456, + "step": 3937 + }, + { + "epoch": 0.588903843278002, + "grad_norm": 1.7662708358791046, + "learning_rate": 3.815001449189941e-06, + "loss": 0.3615, + "step": 3938 + }, + { + "epoch": 0.5890533871691341, + "grad_norm": 1.2321195352474157, + "learning_rate": 3.812648760812347e-06, + "loss": 0.1731, + "step": 3939 + }, + { + "epoch": 0.5892029310602662, + "grad_norm": 1.6990341437020333, + "learning_rate": 3.8102963509987965e-06, + "loss": 0.4168, + "step": 3940 + }, + { + "epoch": 0.5893524749513982, + "grad_norm": 1.6729207645316984, + "learning_rate": 3.807944220301188e-06, + "loss": 0.4699, + "step": 3941 + }, + { + "epoch": 0.5895020188425303, + "grad_norm": 1.2458633939386743, + "learning_rate": 3.8055923692713527e-06, + "loss": 0.3579, + "step": 3942 + }, + { + "epoch": 0.5896515627336624, + "grad_norm": 1.043880257216526, + "learning_rate": 3.803240798461061e-06, + "loss": 0.1603, + "step": 3943 + }, + { + "epoch": 0.5898011066247943, + "grad_norm": 0.9729214387193956, + "learning_rate": 3.80088950842201e-06, + "loss": 0.2006, + "step": 3944 + }, + { + "epoch": 0.5899506505159264, + "grad_norm": 1.6414730241820703, + "learning_rate": 3.7985384997058365e-06, + "loss": 0.2049, + "step": 3945 + }, + { + "epoch": 0.5901001944070585, + "grad_norm": 1.6657086768253946, + "learning_rate": 3.79618777286411e-06, + "loss": 0.2827, + "step": 3946 + }, + { + "epoch": 0.5902497382981905, + "grad_norm": 1.6956367490229918, + "learning_rate": 3.7938373284483336e-06, + "loss": 0.4608, + "step": 3947 + }, + { + "epoch": 0.5903992821893226, + "grad_norm": 1.4757058595390375, + "learning_rate": 3.791487167009945e-06, + "loss": 0.3557, + "step": 3948 + }, + { + "epoch": 0.5905488260804546, + "grad_norm": 1.5729666823262434, + "learning_rate": 3.789137289100313e-06, + "loss": 0.184, + "step": 3949 + }, + { + "epoch": 0.5906983699715866, + "grad_norm": 1.633027833675113, + "learning_rate": 3.786787695270743e-06, + "loss": 0.4198, + "step": 3950 + }, + { + "epoch": 0.5908479138627187, + "grad_norm": 1.438486968065632, + "learning_rate": 3.784438386072471e-06, + "loss": 0.2484, + "step": 3951 + }, + { + "epoch": 0.5909974577538507, + "grad_norm": 1.7582663564747014, + "learning_rate": 3.782089362056669e-06, + "loss": 0.2605, + "step": 3952 + }, + { + "epoch": 0.5911470016449828, + "grad_norm": 1.8491750366304591, + "learning_rate": 3.7797406237744406e-06, + "loss": 0.5609, + "step": 3953 + }, + { + "epoch": 0.5912965455361149, + "grad_norm": 2.1940854570062256, + "learning_rate": 3.7773921717768212e-06, + "loss": 0.3145, + "step": 3954 + }, + { + "epoch": 0.5914460894272469, + "grad_norm": 1.758778183907136, + "learning_rate": 3.7750440066147816e-06, + "loss": 0.4234, + "step": 3955 + }, + { + "epoch": 0.5915956333183789, + "grad_norm": 1.0902355030071746, + "learning_rate": 3.7726961288392227e-06, + "loss": 0.158, + "step": 3956 + }, + { + "epoch": 0.591745177209511, + "grad_norm": 1.3875658617841062, + "learning_rate": 3.7703485390009797e-06, + "loss": 0.2265, + "step": 3957 + }, + { + "epoch": 0.591894721100643, + "grad_norm": 1.6204448366336377, + "learning_rate": 3.7680012376508227e-06, + "loss": 0.3166, + "step": 3958 + }, + { + "epoch": 0.5920442649917751, + "grad_norm": 1.3645834401060262, + "learning_rate": 3.765654225339447e-06, + "loss": 0.4263, + "step": 3959 + }, + { + "epoch": 0.5921938088829072, + "grad_norm": 1.3231211927212307, + "learning_rate": 3.763307502617485e-06, + "loss": 0.1809, + "step": 3960 + }, + { + "epoch": 0.5923433527740392, + "grad_norm": 1.247672175483129, + "learning_rate": 3.7609610700355014e-06, + "loss": 0.1774, + "step": 3961 + }, + { + "epoch": 0.5924928966651712, + "grad_norm": 1.1720146064252939, + "learning_rate": 3.7586149281439908e-06, + "loss": 0.1641, + "step": 3962 + }, + { + "epoch": 0.5926424405563033, + "grad_norm": 1.5718680027072247, + "learning_rate": 3.756269077493385e-06, + "loss": 0.3326, + "step": 3963 + }, + { + "epoch": 0.5927919844474353, + "grad_norm": 1.7772158209903743, + "learning_rate": 3.7539235186340378e-06, + "loss": 0.2532, + "step": 3964 + }, + { + "epoch": 0.5929415283385674, + "grad_norm": 1.3316474232901556, + "learning_rate": 3.751578252116242e-06, + "loss": 0.2147, + "step": 3965 + }, + { + "epoch": 0.5930910722296994, + "grad_norm": 1.4220331937170747, + "learning_rate": 3.7492332784902197e-06, + "loss": 0.1649, + "step": 3966 + }, + { + "epoch": 0.5932406161208315, + "grad_norm": 1.4819153735880548, + "learning_rate": 3.7468885983061245e-06, + "loss": 0.2828, + "step": 3967 + }, + { + "epoch": 0.5933901600119635, + "grad_norm": 1.533582987774297, + "learning_rate": 3.7445442121140417e-06, + "loss": 0.3549, + "step": 3968 + }, + { + "epoch": 0.5935397039030955, + "grad_norm": 2.0084478805318846, + "learning_rate": 3.7422001204639855e-06, + "loss": 0.5592, + "step": 3969 + }, + { + "epoch": 0.5936892477942276, + "grad_norm": 1.4582643481241435, + "learning_rate": 3.7398563239059026e-06, + "loss": 0.3025, + "step": 3970 + }, + { + "epoch": 0.5938387916853597, + "grad_norm": 1.4895919525012717, + "learning_rate": 3.7375128229896707e-06, + "loss": 0.1886, + "step": 3971 + }, + { + "epoch": 0.5939883355764917, + "grad_norm": 1.551469791731259, + "learning_rate": 3.735169618265097e-06, + "loss": 0.2539, + "step": 3972 + }, + { + "epoch": 0.5941378794676238, + "grad_norm": 1.3051779904330787, + "learning_rate": 3.732826710281923e-06, + "loss": 0.3184, + "step": 3973 + }, + { + "epoch": 0.5942874233587558, + "grad_norm": 1.2392389402763688, + "learning_rate": 3.7304840995898116e-06, + "loss": 0.1694, + "step": 3974 + }, + { + "epoch": 0.5944369672498878, + "grad_norm": 1.374809357583305, + "learning_rate": 3.7281417867383663e-06, + "loss": 0.2051, + "step": 3975 + }, + { + "epoch": 0.5945865111410199, + "grad_norm": 1.6294109216768207, + "learning_rate": 3.7257997722771157e-06, + "loss": 0.3044, + "step": 3976 + }, + { + "epoch": 0.594736055032152, + "grad_norm": 1.3224369428308862, + "learning_rate": 3.7234580567555177e-06, + "loss": 0.2726, + "step": 3977 + }, + { + "epoch": 0.594885598923284, + "grad_norm": 1.7787635958789316, + "learning_rate": 3.7211166407229647e-06, + "loss": 0.2811, + "step": 3978 + }, + { + "epoch": 0.595035142814416, + "grad_norm": 1.4233934814396993, + "learning_rate": 3.718775524728771e-06, + "loss": 0.2117, + "step": 3979 + }, + { + "epoch": 0.595184686705548, + "grad_norm": 1.329758067841591, + "learning_rate": 3.7164347093221865e-06, + "loss": 0.1894, + "step": 3980 + }, + { + "epoch": 0.5953342305966801, + "grad_norm": 1.8256996890692794, + "learning_rate": 3.714094195052389e-06, + "loss": 0.5129, + "step": 3981 + }, + { + "epoch": 0.5954837744878122, + "grad_norm": 1.1205085695878227, + "learning_rate": 3.7117539824684866e-06, + "loss": 0.1646, + "step": 3982 + }, + { + "epoch": 0.5956333183789442, + "grad_norm": 1.7874489300108056, + "learning_rate": 3.709414072119516e-06, + "loss": 0.2973, + "step": 3983 + }, + { + "epoch": 0.5957828622700763, + "grad_norm": 1.1884738103980468, + "learning_rate": 3.707074464554441e-06, + "loss": 0.2175, + "step": 3984 + }, + { + "epoch": 0.5959324061612084, + "grad_norm": 1.623711079409265, + "learning_rate": 3.704735160322157e-06, + "loss": 0.3532, + "step": 3985 + }, + { + "epoch": 0.5960819500523403, + "grad_norm": 1.4825541420935746, + "learning_rate": 3.7023961599714876e-06, + "loss": 0.3893, + "step": 3986 + }, + { + "epoch": 0.5962314939434724, + "grad_norm": 1.6573795029984477, + "learning_rate": 3.7000574640511843e-06, + "loss": 0.349, + "step": 3987 + }, + { + "epoch": 0.5963810378346045, + "grad_norm": 1.5325568908598715, + "learning_rate": 3.697719073109929e-06, + "loss": 0.217, + "step": 3988 + }, + { + "epoch": 0.5965305817257365, + "grad_norm": 1.1985716673788906, + "learning_rate": 3.695380987696329e-06, + "loss": 0.208, + "step": 3989 + }, + { + "epoch": 0.5966801256168686, + "grad_norm": 1.511940247427596, + "learning_rate": 3.693043208358922e-06, + "loss": 0.3116, + "step": 3990 + }, + { + "epoch": 0.5968296695080006, + "grad_norm": 1.382101797672102, + "learning_rate": 3.6907057356461733e-06, + "loss": 0.4606, + "step": 3991 + }, + { + "epoch": 0.5969792133991326, + "grad_norm": 1.2466801874804314, + "learning_rate": 3.6883685701064774e-06, + "loss": 0.2164, + "step": 3992 + }, + { + "epoch": 0.5971287572902647, + "grad_norm": 1.2514060754927105, + "learning_rate": 3.6860317122881583e-06, + "loss": 0.2169, + "step": 3993 + }, + { + "epoch": 0.5972783011813967, + "grad_norm": 1.633454160231319, + "learning_rate": 3.6836951627394598e-06, + "loss": 0.2053, + "step": 3994 + }, + { + "epoch": 0.5974278450725288, + "grad_norm": 1.5000653948039326, + "learning_rate": 3.681358922008562e-06, + "loss": 0.3564, + "step": 3995 + }, + { + "epoch": 0.5975773889636609, + "grad_norm": 1.7042876877013595, + "learning_rate": 3.6790229906435706e-06, + "loss": 0.2217, + "step": 3996 + }, + { + "epoch": 0.5977269328547928, + "grad_norm": 1.1660551674685846, + "learning_rate": 3.6766873691925155e-06, + "loss": 0.2042, + "step": 3997 + }, + { + "epoch": 0.5978764767459249, + "grad_norm": 1.0016866110140052, + "learning_rate": 3.674352058203359e-06, + "loss": 0.1465, + "step": 3998 + }, + { + "epoch": 0.598026020637057, + "grad_norm": 1.358478565718614, + "learning_rate": 3.672017058223982e-06, + "loss": 0.234, + "step": 3999 + }, + { + "epoch": 0.598175564528189, + "grad_norm": 1.5541836099323216, + "learning_rate": 3.6696823698022034e-06, + "loss": 0.3531, + "step": 4000 + }, + { + "epoch": 0.5983251084193211, + "grad_norm": 1.4965638649664035, + "learning_rate": 3.6673479934857602e-06, + "loss": 0.1734, + "step": 4001 + }, + { + "epoch": 0.5984746523104532, + "grad_norm": 1.2798780853635923, + "learning_rate": 3.665013929822322e-06, + "loss": 0.2227, + "step": 4002 + }, + { + "epoch": 0.5986241962015851, + "grad_norm": 1.5631428969516747, + "learning_rate": 3.6626801793594802e-06, + "loss": 0.3341, + "step": 4003 + }, + { + "epoch": 0.5987737400927172, + "grad_norm": 1.1547611258266666, + "learning_rate": 3.6603467426447596e-06, + "loss": 0.2265, + "step": 4004 + }, + { + "epoch": 0.5989232839838493, + "grad_norm": 1.7661237138941523, + "learning_rate": 3.6580136202256024e-06, + "loss": 0.2831, + "step": 4005 + }, + { + "epoch": 0.5990728278749813, + "grad_norm": 1.608871850044881, + "learning_rate": 3.655680812649382e-06, + "loss": 0.1863, + "step": 4006 + }, + { + "epoch": 0.5992223717661134, + "grad_norm": 1.2120032911627765, + "learning_rate": 3.653348320463399e-06, + "loss": 0.2105, + "step": 4007 + }, + { + "epoch": 0.5993719156572453, + "grad_norm": 1.7713505485144023, + "learning_rate": 3.6510161442148783e-06, + "loss": 0.4625, + "step": 4008 + }, + { + "epoch": 0.5995214595483774, + "grad_norm": 1.5406088685048114, + "learning_rate": 3.6486842844509727e-06, + "loss": 0.2302, + "step": 4009 + }, + { + "epoch": 0.5996710034395095, + "grad_norm": 1.7566777699356078, + "learning_rate": 3.646352741718755e-06, + "loss": 0.4958, + "step": 4010 + }, + { + "epoch": 0.5998205473306415, + "grad_norm": 1.773701442285048, + "learning_rate": 3.6440215165652305e-06, + "loss": 0.1975, + "step": 4011 + }, + { + "epoch": 0.5999700912217736, + "grad_norm": 1.3009283223219021, + "learning_rate": 3.641690609537325e-06, + "loss": 0.292, + "step": 4012 + }, + { + "epoch": 0.6001196351129057, + "grad_norm": 1.4931178313458866, + "learning_rate": 3.639360021181894e-06, + "loss": 0.3095, + "step": 4013 + }, + { + "epoch": 0.6002691790040376, + "grad_norm": 1.5793921746413464, + "learning_rate": 3.637029752045716e-06, + "loss": 0.339, + "step": 4014 + }, + { + "epoch": 0.6004187228951697, + "grad_norm": 2.1241976260965463, + "learning_rate": 3.6346998026754927e-06, + "loss": 0.3532, + "step": 4015 + }, + { + "epoch": 0.6005682667863018, + "grad_norm": 1.0579231943140326, + "learning_rate": 3.6323701736178528e-06, + "loss": 0.1443, + "step": 4016 + }, + { + "epoch": 0.6007178106774338, + "grad_norm": 1.6519105898111532, + "learning_rate": 3.6300408654193515e-06, + "loss": 0.3343, + "step": 4017 + }, + { + "epoch": 0.6008673545685659, + "grad_norm": 1.1836835886912143, + "learning_rate": 3.6277118786264653e-06, + "loss": 0.2084, + "step": 4018 + }, + { + "epoch": 0.601016898459698, + "grad_norm": 1.6124984883515716, + "learning_rate": 3.6253832137856e-06, + "loss": 0.3631, + "step": 4019 + }, + { + "epoch": 0.6011664423508299, + "grad_norm": 1.8682133876991003, + "learning_rate": 3.623054871443078e-06, + "loss": 0.4665, + "step": 4020 + }, + { + "epoch": 0.601315986241962, + "grad_norm": 2.062835003276884, + "learning_rate": 3.620726852145154e-06, + "loss": 0.2746, + "step": 4021 + }, + { + "epoch": 0.6014655301330941, + "grad_norm": 1.583633628610332, + "learning_rate": 3.6183991564380023e-06, + "loss": 0.3516, + "step": 4022 + }, + { + "epoch": 0.6016150740242261, + "grad_norm": 1.8495282333605947, + "learning_rate": 3.616071784867723e-06, + "loss": 0.2359, + "step": 4023 + }, + { + "epoch": 0.6017646179153582, + "grad_norm": 1.577516249529156, + "learning_rate": 3.613744737980343e-06, + "loss": 0.2088, + "step": 4024 + }, + { + "epoch": 0.6019141618064902, + "grad_norm": 1.550353493297501, + "learning_rate": 3.6114180163218056e-06, + "loss": 0.1928, + "step": 4025 + }, + { + "epoch": 0.6020637056976222, + "grad_norm": 1.1352226203791365, + "learning_rate": 3.6090916204379834e-06, + "loss": 0.1816, + "step": 4026 + }, + { + "epoch": 0.6022132495887543, + "grad_norm": 1.5030785772526614, + "learning_rate": 3.6067655508746726e-06, + "loss": 0.2309, + "step": 4027 + }, + { + "epoch": 0.6023627934798863, + "grad_norm": 1.319464371625546, + "learning_rate": 3.60443980817759e-06, + "loss": 0.1861, + "step": 4028 + }, + { + "epoch": 0.6025123373710184, + "grad_norm": 1.3852086589255572, + "learning_rate": 3.6021143928923783e-06, + "loss": 0.1944, + "step": 4029 + }, + { + "epoch": 0.6026618812621505, + "grad_norm": 2.243854952724501, + "learning_rate": 3.599789305564602e-06, + "loss": 0.51, + "step": 4030 + }, + { + "epoch": 0.6028114251532825, + "grad_norm": 1.419391997587051, + "learning_rate": 3.597464546739749e-06, + "loss": 0.3178, + "step": 4031 + }, + { + "epoch": 0.6029609690444145, + "grad_norm": 1.2894199829326205, + "learning_rate": 3.5951401169632293e-06, + "loss": 0.1947, + "step": 4032 + }, + { + "epoch": 0.6031105129355466, + "grad_norm": 1.530880304394482, + "learning_rate": 3.5928160167803784e-06, + "loss": 0.1948, + "step": 4033 + }, + { + "epoch": 0.6032600568266786, + "grad_norm": 1.1347517974588832, + "learning_rate": 3.5904922467364526e-06, + "loss": 0.2253, + "step": 4034 + }, + { + "epoch": 0.6034096007178107, + "grad_norm": 1.232257517693222, + "learning_rate": 3.5881688073766267e-06, + "loss": 0.205, + "step": 4035 + }, + { + "epoch": 0.6035591446089428, + "grad_norm": 1.6963988945541442, + "learning_rate": 3.5858456992460066e-06, + "loss": 0.2107, + "step": 4036 + }, + { + "epoch": 0.6037086885000748, + "grad_norm": 1.1849360298492893, + "learning_rate": 3.5835229228896133e-06, + "loss": 0.1807, + "step": 4037 + }, + { + "epoch": 0.6038582323912068, + "grad_norm": 1.8006880398870282, + "learning_rate": 3.5812004788523933e-06, + "loss": 0.3419, + "step": 4038 + }, + { + "epoch": 0.6040077762823388, + "grad_norm": 1.1218152100667766, + "learning_rate": 3.5788783676792176e-06, + "loss": 0.1963, + "step": 4039 + }, + { + "epoch": 0.6041573201734709, + "grad_norm": 1.4412889174387291, + "learning_rate": 3.5765565899148703e-06, + "loss": 0.1818, + "step": 4040 + }, + { + "epoch": 0.604306864064603, + "grad_norm": 1.6406744631622459, + "learning_rate": 3.5742351461040646e-06, + "loss": 0.1984, + "step": 4041 + }, + { + "epoch": 0.604456407955735, + "grad_norm": 1.7230960345125375, + "learning_rate": 3.571914036791435e-06, + "loss": 0.235, + "step": 4042 + }, + { + "epoch": 0.604605951846867, + "grad_norm": 1.4708145309645435, + "learning_rate": 3.5695932625215347e-06, + "loss": 0.254, + "step": 4043 + }, + { + "epoch": 0.6047554957379991, + "grad_norm": 1.8016752353343, + "learning_rate": 3.567272823838842e-06, + "loss": 0.2005, + "step": 4044 + }, + { + "epoch": 0.6049050396291311, + "grad_norm": 1.1493815123325322, + "learning_rate": 3.564952721287751e-06, + "loss": 0.1713, + "step": 4045 + }, + { + "epoch": 0.6050545835202632, + "grad_norm": 1.5119225059128631, + "learning_rate": 3.5626329554125823e-06, + "loss": 0.182, + "step": 4046 + }, + { + "epoch": 0.6052041274113953, + "grad_norm": 1.8759224212157393, + "learning_rate": 3.5603135267575737e-06, + "loss": 0.5205, + "step": 4047 + }, + { + "epoch": 0.6053536713025273, + "grad_norm": 1.565295827071973, + "learning_rate": 3.5579944358668873e-06, + "loss": 0.3772, + "step": 4048 + }, + { + "epoch": 0.6055032151936593, + "grad_norm": 2.122632657433492, + "learning_rate": 3.555675683284604e-06, + "loss": 0.4978, + "step": 4049 + }, + { + "epoch": 0.6056527590847914, + "grad_norm": 1.9409099996151258, + "learning_rate": 3.553357269554724e-06, + "loss": 0.3612, + "step": 4050 + }, + { + "epoch": 0.6058023029759234, + "grad_norm": 1.8573438786477485, + "learning_rate": 3.551039195221171e-06, + "loss": 0.2926, + "step": 4051 + }, + { + "epoch": 0.6059518468670555, + "grad_norm": 1.6830272916472488, + "learning_rate": 3.5487214608277857e-06, + "loss": 0.344, + "step": 4052 + }, + { + "epoch": 0.6061013907581875, + "grad_norm": 2.544342408333431, + "learning_rate": 3.546404066918333e-06, + "loss": 0.242, + "step": 4053 + }, + { + "epoch": 0.6062509346493196, + "grad_norm": 1.8351824486146582, + "learning_rate": 3.5440870140364967e-06, + "loss": 0.3456, + "step": 4054 + }, + { + "epoch": 0.6064004785404516, + "grad_norm": 1.9733766052564323, + "learning_rate": 3.5417703027258752e-06, + "loss": 0.2553, + "step": 4055 + }, + { + "epoch": 0.6065500224315836, + "grad_norm": 1.4266711606260554, + "learning_rate": 3.539453933529996e-06, + "loss": 0.3237, + "step": 4056 + }, + { + "epoch": 0.6066995663227157, + "grad_norm": 1.6718168939315017, + "learning_rate": 3.5371379069922983e-06, + "loss": 0.3281, + "step": 4057 + }, + { + "epoch": 0.6068491102138478, + "grad_norm": 2.082935916824144, + "learning_rate": 3.5348222236561467e-06, + "loss": 0.5821, + "step": 4058 + }, + { + "epoch": 0.6069986541049798, + "grad_norm": 1.5696132013810573, + "learning_rate": 3.5325068840648243e-06, + "loss": 0.2177, + "step": 4059 + }, + { + "epoch": 0.6071481979961119, + "grad_norm": 1.4087839185461357, + "learning_rate": 3.530191888761527e-06, + "loss": 0.3296, + "step": 4060 + }, + { + "epoch": 0.607297741887244, + "grad_norm": 2.099550365239757, + "learning_rate": 3.5278772382893777e-06, + "loss": 0.3616, + "step": 4061 + }, + { + "epoch": 0.6074472857783759, + "grad_norm": 1.6842257514961676, + "learning_rate": 3.5255629331914153e-06, + "loss": 0.331, + "step": 4062 + }, + { + "epoch": 0.607596829669508, + "grad_norm": 1.8703972469815064, + "learning_rate": 3.523248974010599e-06, + "loss": 0.1758, + "step": 4063 + }, + { + "epoch": 0.6077463735606401, + "grad_norm": 1.524647589128137, + "learning_rate": 3.5209353612898067e-06, + "loss": 0.3307, + "step": 4064 + }, + { + "epoch": 0.6078959174517721, + "grad_norm": 1.3067348714233367, + "learning_rate": 3.518622095571831e-06, + "loss": 0.1462, + "step": 4065 + }, + { + "epoch": 0.6080454613429042, + "grad_norm": 1.7759379098365817, + "learning_rate": 3.516309177399389e-06, + "loss": 0.4481, + "step": 4066 + }, + { + "epoch": 0.6081950052340361, + "grad_norm": 1.5543708215165737, + "learning_rate": 3.513996607315112e-06, + "loss": 0.2023, + "step": 4067 + }, + { + "epoch": 0.6083445491251682, + "grad_norm": 1.911842884824045, + "learning_rate": 3.5116843858615534e-06, + "loss": 0.4787, + "step": 4068 + }, + { + "epoch": 0.6084940930163003, + "grad_norm": 1.9298854332543098, + "learning_rate": 3.509372513581182e-06, + "loss": 0.3521, + "step": 4069 + }, + { + "epoch": 0.6086436369074323, + "grad_norm": 1.5428623983694816, + "learning_rate": 3.5070609910163824e-06, + "loss": 0.3474, + "step": 4070 + }, + { + "epoch": 0.6087931807985644, + "grad_norm": 0.9977909289978434, + "learning_rate": 3.504749818709463e-06, + "loss": 0.174, + "step": 4071 + }, + { + "epoch": 0.6089427246896965, + "grad_norm": 1.1421293763115168, + "learning_rate": 3.5024389972026467e-06, + "loss": 0.2034, + "step": 4072 + }, + { + "epoch": 0.6090922685808284, + "grad_norm": 1.3644097038150713, + "learning_rate": 3.500128527038074e-06, + "loss": 0.1842, + "step": 4073 + }, + { + "epoch": 0.6092418124719605, + "grad_norm": 1.332440548321116, + "learning_rate": 3.497818408757806e-06, + "loss": 0.2728, + "step": 4074 + }, + { + "epoch": 0.6093913563630926, + "grad_norm": 1.9608087269420822, + "learning_rate": 3.495508642903813e-06, + "loss": 0.2712, + "step": 4075 + }, + { + "epoch": 0.6095409002542246, + "grad_norm": 1.1647404978024332, + "learning_rate": 3.493199230017993e-06, + "loss": 0.1906, + "step": 4076 + }, + { + "epoch": 0.6096904441453567, + "grad_norm": 1.7451274126518581, + "learning_rate": 3.4908901706421548e-06, + "loss": 0.5245, + "step": 4077 + }, + { + "epoch": 0.6098399880364888, + "grad_norm": 1.4276907218803567, + "learning_rate": 3.488581465318026e-06, + "loss": 0.2377, + "step": 4078 + }, + { + "epoch": 0.6099895319276207, + "grad_norm": 1.7169738066445221, + "learning_rate": 3.4862731145872548e-06, + "loss": 0.3399, + "step": 4079 + }, + { + "epoch": 0.6101390758187528, + "grad_norm": 1.688093650559266, + "learning_rate": 3.4839651189913965e-06, + "loss": 0.387, + "step": 4080 + }, + { + "epoch": 0.6102886197098849, + "grad_norm": 1.9535517951044004, + "learning_rate": 3.4816574790719322e-06, + "loss": 0.4879, + "step": 4081 + }, + { + "epoch": 0.6104381636010169, + "grad_norm": 1.6770987277973084, + "learning_rate": 3.479350195370256e-06, + "loss": 0.2773, + "step": 4082 + }, + { + "epoch": 0.610587707492149, + "grad_norm": 1.5959756774847607, + "learning_rate": 3.477043268427679e-06, + "loss": 0.2258, + "step": 4083 + }, + { + "epoch": 0.6107372513832809, + "grad_norm": 1.3440527422443669, + "learning_rate": 3.4747366987854294e-06, + "loss": 0.1886, + "step": 4084 + }, + { + "epoch": 0.610886795274413, + "grad_norm": 1.1452652380873023, + "learning_rate": 3.472430486984648e-06, + "loss": 0.1815, + "step": 4085 + }, + { + "epoch": 0.6110363391655451, + "grad_norm": 1.8862381396037677, + "learning_rate": 3.4701246335663973e-06, + "loss": 0.28, + "step": 4086 + }, + { + "epoch": 0.6111858830566771, + "grad_norm": 1.3462017669113007, + "learning_rate": 3.467819139071651e-06, + "loss": 0.2976, + "step": 4087 + }, + { + "epoch": 0.6113354269478092, + "grad_norm": 0.9872082098092432, + "learning_rate": 3.465514004041301e-06, + "loss": 0.1924, + "step": 4088 + }, + { + "epoch": 0.6114849708389413, + "grad_norm": 1.258797674393771, + "learning_rate": 3.4632092290161547e-06, + "loss": 0.139, + "step": 4089 + }, + { + "epoch": 0.6116345147300732, + "grad_norm": 1.5951256051278744, + "learning_rate": 3.460904814536934e-06, + "loss": 0.2237, + "step": 4090 + }, + { + "epoch": 0.6117840586212053, + "grad_norm": 1.3552720149728599, + "learning_rate": 3.458600761144276e-06, + "loss": 0.205, + "step": 4091 + }, + { + "epoch": 0.6119336025123374, + "grad_norm": 1.2855312617700432, + "learning_rate": 3.456297069378734e-06, + "loss": 0.2011, + "step": 4092 + }, + { + "epoch": 0.6120831464034694, + "grad_norm": 1.131526768203718, + "learning_rate": 3.4539937397807765e-06, + "loss": 0.179, + "step": 4093 + }, + { + "epoch": 0.6122326902946015, + "grad_norm": 1.012638619251334, + "learning_rate": 3.4516907728907876e-06, + "loss": 0.1881, + "step": 4094 + }, + { + "epoch": 0.6123822341857336, + "grad_norm": 2.3057997137329433, + "learning_rate": 3.4493881692490676e-06, + "loss": 0.6228, + "step": 4095 + }, + { + "epoch": 0.6125317780768655, + "grad_norm": 1.6590161930605178, + "learning_rate": 3.4470859293958237e-06, + "loss": 0.2417, + "step": 4096 + }, + { + "epoch": 0.6126813219679976, + "grad_norm": 1.8637272203103565, + "learning_rate": 3.444784053871187e-06, + "loss": 0.276, + "step": 4097 + }, + { + "epoch": 0.6128308658591296, + "grad_norm": 1.7540355711160616, + "learning_rate": 3.4424825432151997e-06, + "loss": 0.3656, + "step": 4098 + }, + { + "epoch": 0.6129804097502617, + "grad_norm": 1.2210296377287981, + "learning_rate": 3.4401813979678184e-06, + "loss": 0.1902, + "step": 4099 + }, + { + "epoch": 0.6131299536413938, + "grad_norm": 1.3676262505744117, + "learning_rate": 3.437880618668916e-06, + "loss": 0.1401, + "step": 4100 + }, + { + "epoch": 0.6132794975325258, + "grad_norm": 1.7617762190417448, + "learning_rate": 3.435580205858272e-06, + "loss": 0.3606, + "step": 4101 + }, + { + "epoch": 0.6134290414236578, + "grad_norm": 1.1968515809658866, + "learning_rate": 3.4332801600755895e-06, + "loss": 0.1833, + "step": 4102 + }, + { + "epoch": 0.6135785853147899, + "grad_norm": 1.560412557500599, + "learning_rate": 3.430980481860481e-06, + "loss": 0.1793, + "step": 4103 + }, + { + "epoch": 0.6137281292059219, + "grad_norm": 2.0021038501103727, + "learning_rate": 3.4286811717524713e-06, + "loss": 0.3963, + "step": 4104 + }, + { + "epoch": 0.613877673097054, + "grad_norm": 1.0542430630039104, + "learning_rate": 3.4263822302910046e-06, + "loss": 0.1729, + "step": 4105 + }, + { + "epoch": 0.6140272169881861, + "grad_norm": 1.5902196547544478, + "learning_rate": 3.42408365801543e-06, + "loss": 0.2775, + "step": 4106 + }, + { + "epoch": 0.614176760879318, + "grad_norm": 1.3721418830423344, + "learning_rate": 3.421785455465017e-06, + "loss": 0.2971, + "step": 4107 + }, + { + "epoch": 0.6143263047704501, + "grad_norm": 1.729094375014569, + "learning_rate": 3.4194876231789464e-06, + "loss": 0.3713, + "step": 4108 + }, + { + "epoch": 0.6144758486615822, + "grad_norm": 2.4117639518236174, + "learning_rate": 3.417190161696311e-06, + "loss": 0.3593, + "step": 4109 + }, + { + "epoch": 0.6146253925527142, + "grad_norm": 2.086422185272039, + "learning_rate": 3.414893071556119e-06, + "loss": 0.3511, + "step": 4110 + }, + { + "epoch": 0.6147749364438463, + "grad_norm": 1.5735352312513073, + "learning_rate": 3.4125963532972878e-06, + "loss": 0.2897, + "step": 4111 + }, + { + "epoch": 0.6149244803349783, + "grad_norm": 1.8232983507476865, + "learning_rate": 3.41030000745865e-06, + "loss": 0.2121, + "step": 4112 + }, + { + "epoch": 0.6150740242261103, + "grad_norm": 1.542088897552113, + "learning_rate": 3.4080040345789515e-06, + "loss": 0.2548, + "step": 4113 + }, + { + "epoch": 0.6152235681172424, + "grad_norm": 2.055704862139885, + "learning_rate": 3.4057084351968497e-06, + "loss": 0.2074, + "step": 4114 + }, + { + "epoch": 0.6153731120083744, + "grad_norm": 1.5475598177649545, + "learning_rate": 3.4034132098509143e-06, + "loss": 0.3923, + "step": 4115 + }, + { + "epoch": 0.6155226558995065, + "grad_norm": 1.4512060477133912, + "learning_rate": 3.401118359079625e-06, + "loss": 0.1771, + "step": 4116 + }, + { + "epoch": 0.6156721997906386, + "grad_norm": 1.3040117809635243, + "learning_rate": 3.3988238834213785e-06, + "loss": 0.3118, + "step": 4117 + }, + { + "epoch": 0.6158217436817706, + "grad_norm": 1.193831767033035, + "learning_rate": 3.3965297834144796e-06, + "loss": 0.1846, + "step": 4118 + }, + { + "epoch": 0.6159712875729026, + "grad_norm": 1.2936870504351123, + "learning_rate": 3.394236059597147e-06, + "loss": 0.3206, + "step": 4119 + }, + { + "epoch": 0.6161208314640347, + "grad_norm": 1.9358809574481608, + "learning_rate": 3.3919427125075117e-06, + "loss": 0.4591, + "step": 4120 + }, + { + "epoch": 0.6162703753551667, + "grad_norm": 1.5626339711928905, + "learning_rate": 3.389649742683612e-06, + "loss": 0.3604, + "step": 4121 + }, + { + "epoch": 0.6164199192462988, + "grad_norm": 1.8684296406735175, + "learning_rate": 3.387357150663402e-06, + "loss": 0.3298, + "step": 4122 + }, + { + "epoch": 0.6165694631374309, + "grad_norm": 1.3357412406024458, + "learning_rate": 3.3850649369847455e-06, + "loss": 0.1642, + "step": 4123 + }, + { + "epoch": 0.6167190070285629, + "grad_norm": 2.19771279087013, + "learning_rate": 3.3827731021854194e-06, + "loss": 0.2888, + "step": 4124 + }, + { + "epoch": 0.616868550919695, + "grad_norm": 1.548714911422848, + "learning_rate": 3.38048164680311e-06, + "loss": 0.1871, + "step": 4125 + }, + { + "epoch": 0.6170180948108269, + "grad_norm": 1.159508819902876, + "learning_rate": 3.3781905713754134e-06, + "loss": 0.1897, + "step": 4126 + }, + { + "epoch": 0.617167638701959, + "grad_norm": 1.5115811682102942, + "learning_rate": 3.375899876439838e-06, + "loss": 0.3759, + "step": 4127 + }, + { + "epoch": 0.6173171825930911, + "grad_norm": 1.790849642617947, + "learning_rate": 3.3736095625338043e-06, + "loss": 0.4749, + "step": 4128 + }, + { + "epoch": 0.6174667264842231, + "grad_norm": 1.905917740616368, + "learning_rate": 3.3713196301946415e-06, + "loss": 0.285, + "step": 4129 + }, + { + "epoch": 0.6176162703753552, + "grad_norm": 2.218642923968202, + "learning_rate": 3.36903007995959e-06, + "loss": 0.52, + "step": 4130 + }, + { + "epoch": 0.6177658142664872, + "grad_norm": 2.304237580429699, + "learning_rate": 3.366740912365799e-06, + "loss": 0.5589, + "step": 4131 + }, + { + "epoch": 0.6179153581576192, + "grad_norm": 1.790679696297901, + "learning_rate": 3.3644521279503305e-06, + "loss": 0.3308, + "step": 4132 + }, + { + "epoch": 0.6180649020487513, + "grad_norm": 1.3187679561865975, + "learning_rate": 3.3621637272501555e-06, + "loss": 0.2061, + "step": 4133 + }, + { + "epoch": 0.6182144459398834, + "grad_norm": 1.8718764354464674, + "learning_rate": 3.3598757108021546e-06, + "loss": 0.2996, + "step": 4134 + }, + { + "epoch": 0.6183639898310154, + "grad_norm": 1.784502907359159, + "learning_rate": 3.3575880791431205e-06, + "loss": 0.3875, + "step": 4135 + }, + { + "epoch": 0.6185135337221475, + "grad_norm": 2.0257075474723276, + "learning_rate": 3.3553008328097473e-06, + "loss": 0.3315, + "step": 4136 + }, + { + "epoch": 0.6186630776132795, + "grad_norm": 1.7363579300166394, + "learning_rate": 3.353013972338651e-06, + "loss": 0.1923, + "step": 4137 + }, + { + "epoch": 0.6188126215044115, + "grad_norm": 1.645989344743434, + "learning_rate": 3.3507274982663495e-06, + "loss": 0.404, + "step": 4138 + }, + { + "epoch": 0.6189621653955436, + "grad_norm": 1.6006600046854644, + "learning_rate": 3.3484414111292716e-06, + "loss": 0.2298, + "step": 4139 + }, + { + "epoch": 0.6191117092866757, + "grad_norm": 1.7303842670320941, + "learning_rate": 3.3461557114637575e-06, + "loss": 0.1871, + "step": 4140 + }, + { + "epoch": 0.6192612531778077, + "grad_norm": 1.2683245690225082, + "learning_rate": 3.3438703998060503e-06, + "loss": 0.1994, + "step": 4141 + }, + { + "epoch": 0.6194107970689398, + "grad_norm": 1.6461137510586357, + "learning_rate": 3.3415854766923073e-06, + "loss": 0.1957, + "step": 4142 + }, + { + "epoch": 0.6195603409600717, + "grad_norm": 1.581644802224599, + "learning_rate": 3.339300942658595e-06, + "loss": 0.2005, + "step": 4143 + }, + { + "epoch": 0.6197098848512038, + "grad_norm": 1.821750932071071, + "learning_rate": 3.3370167982408857e-06, + "loss": 0.404, + "step": 4144 + }, + { + "epoch": 0.6198594287423359, + "grad_norm": 1.010226375154919, + "learning_rate": 3.3347330439750634e-06, + "loss": 0.1895, + "step": 4145 + }, + { + "epoch": 0.6200089726334679, + "grad_norm": 1.7213955718892866, + "learning_rate": 3.332449680396917e-06, + "loss": 0.3164, + "step": 4146 + }, + { + "epoch": 0.6201585165246, + "grad_norm": 1.4718996679664604, + "learning_rate": 3.330166708042146e-06, + "loss": 0.3313, + "step": 4147 + }, + { + "epoch": 0.620308060415732, + "grad_norm": 1.1698035624548326, + "learning_rate": 3.3278841274463585e-06, + "loss": 0.1969, + "step": 4148 + }, + { + "epoch": 0.620457604306864, + "grad_norm": 1.5316613724929764, + "learning_rate": 3.3256019391450696e-06, + "loss": 0.3458, + "step": 4149 + }, + { + "epoch": 0.6206071481979961, + "grad_norm": 1.67069813493684, + "learning_rate": 3.3233201436737032e-06, + "loss": 0.2107, + "step": 4150 + }, + { + "epoch": 0.6207566920891282, + "grad_norm": 1.8523375864622236, + "learning_rate": 3.3210387415675894e-06, + "loss": 0.5053, + "step": 4151 + }, + { + "epoch": 0.6209062359802602, + "grad_norm": 1.268125401219981, + "learning_rate": 3.318757733361967e-06, + "loss": 0.178, + "step": 4152 + }, + { + "epoch": 0.6210557798713923, + "grad_norm": 1.2720176937747762, + "learning_rate": 3.3164771195919833e-06, + "loss": 0.2731, + "step": 4153 + }, + { + "epoch": 0.6212053237625244, + "grad_norm": 1.4082416391595145, + "learning_rate": 3.3141969007926917e-06, + "loss": 0.1942, + "step": 4154 + }, + { + "epoch": 0.6213548676536563, + "grad_norm": 0.8960930795176492, + "learning_rate": 3.311917077499056e-06, + "loss": 0.2027, + "step": 4155 + }, + { + "epoch": 0.6215044115447884, + "grad_norm": 1.5829596350349728, + "learning_rate": 3.309637650245941e-06, + "loss": 0.1931, + "step": 4156 + }, + { + "epoch": 0.6216539554359204, + "grad_norm": 1.0024243338014172, + "learning_rate": 3.307358619568123e-06, + "loss": 0.1921, + "step": 4157 + }, + { + "epoch": 0.6218034993270525, + "grad_norm": 1.4575451361421503, + "learning_rate": 3.305079986000286e-06, + "loss": 0.196, + "step": 4158 + }, + { + "epoch": 0.6219530432181846, + "grad_norm": 1.5847817071066232, + "learning_rate": 3.3028017500770188e-06, + "loss": 0.2149, + "step": 4159 + }, + { + "epoch": 0.6221025871093165, + "grad_norm": 0.9644862006701003, + "learning_rate": 3.30052391233282e-06, + "loss": 0.1878, + "step": 4160 + }, + { + "epoch": 0.6222521310004486, + "grad_norm": 1.8148496541499093, + "learning_rate": 3.298246473302087e-06, + "loss": 0.3232, + "step": 4161 + }, + { + "epoch": 0.6224016748915807, + "grad_norm": 1.8222373927506255, + "learning_rate": 3.2959694335191328e-06, + "loss": 0.3742, + "step": 4162 + }, + { + "epoch": 0.6225512187827127, + "grad_norm": 1.5806770368482173, + "learning_rate": 3.293692793518171e-06, + "loss": 0.5245, + "step": 4163 + }, + { + "epoch": 0.6227007626738448, + "grad_norm": 1.9535273557118913, + "learning_rate": 3.2914165538333247e-06, + "loss": 0.6479, + "step": 4164 + }, + { + "epoch": 0.6228503065649769, + "grad_norm": 1.2749728150835529, + "learning_rate": 3.2891407149986223e-06, + "loss": 0.1906, + "step": 4165 + }, + { + "epoch": 0.6229998504561088, + "grad_norm": 2.1068364656942262, + "learning_rate": 3.2868652775479947e-06, + "loss": 0.563, + "step": 4166 + }, + { + "epoch": 0.6231493943472409, + "grad_norm": 1.6644405150545742, + "learning_rate": 3.2845902420152833e-06, + "loss": 0.242, + "step": 4167 + }, + { + "epoch": 0.623298938238373, + "grad_norm": 1.421746094855021, + "learning_rate": 3.2823156089342335e-06, + "loss": 0.2305, + "step": 4168 + }, + { + "epoch": 0.623448482129505, + "grad_norm": 1.7648579864130391, + "learning_rate": 3.2800413788384956e-06, + "loss": 0.4275, + "step": 4169 + }, + { + "epoch": 0.6235980260206371, + "grad_norm": 1.5957084104714234, + "learning_rate": 3.277767552261626e-06, + "loss": 0.2297, + "step": 4170 + }, + { + "epoch": 0.623747569911769, + "grad_norm": 1.61035540032609, + "learning_rate": 3.2754941297370872e-06, + "loss": 0.1548, + "step": 4171 + }, + { + "epoch": 0.6238971138029011, + "grad_norm": 1.717346159952656, + "learning_rate": 3.273221111798245e-06, + "loss": 0.4091, + "step": 4172 + }, + { + "epoch": 0.6240466576940332, + "grad_norm": 1.188823809082235, + "learning_rate": 3.2709484989783708e-06, + "loss": 0.2987, + "step": 4173 + }, + { + "epoch": 0.6241962015851652, + "grad_norm": 2.099313220051725, + "learning_rate": 3.2686762918106425e-06, + "loss": 0.6985, + "step": 4174 + }, + { + "epoch": 0.6243457454762973, + "grad_norm": 1.6757776086941085, + "learning_rate": 3.2664044908281413e-06, + "loss": 0.2769, + "step": 4175 + }, + { + "epoch": 0.6244952893674294, + "grad_norm": 1.6877573797191467, + "learning_rate": 3.2641330965638563e-06, + "loss": 0.2132, + "step": 4176 + }, + { + "epoch": 0.6246448332585613, + "grad_norm": 1.5982412325900388, + "learning_rate": 3.261862109550673e-06, + "loss": 0.437, + "step": 4177 + }, + { + "epoch": 0.6247943771496934, + "grad_norm": 1.3427594465047028, + "learning_rate": 3.2595915303213902e-06, + "loss": 0.3653, + "step": 4178 + }, + { + "epoch": 0.6249439210408255, + "grad_norm": 1.262764121214261, + "learning_rate": 3.2573213594087084e-06, + "loss": 0.179, + "step": 4179 + }, + { + "epoch": 0.6250934649319575, + "grad_norm": 1.1739874666573993, + "learning_rate": 3.2550515973452295e-06, + "loss": 0.2201, + "step": 4180 + }, + { + "epoch": 0.6252430088230896, + "grad_norm": 1.3535281039016394, + "learning_rate": 3.252782244663465e-06, + "loss": 0.1931, + "step": 4181 + }, + { + "epoch": 0.6253925527142217, + "grad_norm": 1.4264461743578645, + "learning_rate": 3.2505133018958224e-06, + "loss": 0.251, + "step": 4182 + }, + { + "epoch": 0.6255420966053536, + "grad_norm": 1.4547018012543345, + "learning_rate": 3.24824476957462e-06, + "loss": 0.2176, + "step": 4183 + }, + { + "epoch": 0.6256916404964857, + "grad_norm": 1.93315832456521, + "learning_rate": 3.2459766482320764e-06, + "loss": 0.5806, + "step": 4184 + }, + { + "epoch": 0.6258411843876177, + "grad_norm": 1.456983023451318, + "learning_rate": 3.243708938400316e-06, + "loss": 0.3683, + "step": 4185 + }, + { + "epoch": 0.6259907282787498, + "grad_norm": 1.2318359907860033, + "learning_rate": 3.2414416406113645e-06, + "loss": 0.176, + "step": 4186 + }, + { + "epoch": 0.6261402721698819, + "grad_norm": 1.8015103632850937, + "learning_rate": 3.239174755397152e-06, + "loss": 0.3032, + "step": 4187 + }, + { + "epoch": 0.6262898160610139, + "grad_norm": 1.3196497543247543, + "learning_rate": 3.2369082832895116e-06, + "loss": 0.2055, + "step": 4188 + }, + { + "epoch": 0.626439359952146, + "grad_norm": 1.4298858348667032, + "learning_rate": 3.2346422248201792e-06, + "loss": 0.3308, + "step": 4189 + }, + { + "epoch": 0.626588903843278, + "grad_norm": 1.3708295866539182, + "learning_rate": 3.232376580520794e-06, + "loss": 0.215, + "step": 4190 + }, + { + "epoch": 0.62673844773441, + "grad_norm": 1.4861225381998133, + "learning_rate": 3.2301113509228992e-06, + "loss": 0.3438, + "step": 4191 + }, + { + "epoch": 0.6268879916255421, + "grad_norm": 1.4461704439901406, + "learning_rate": 3.227846536557938e-06, + "loss": 0.328, + "step": 4192 + }, + { + "epoch": 0.6270375355166742, + "grad_norm": 1.811748443366764, + "learning_rate": 3.225582137957258e-06, + "loss": 0.3549, + "step": 4193 + }, + { + "epoch": 0.6271870794078062, + "grad_norm": 1.205193555796813, + "learning_rate": 3.223318155652109e-06, + "loss": 0.1965, + "step": 4194 + }, + { + "epoch": 0.6273366232989382, + "grad_norm": 1.2317204179453682, + "learning_rate": 3.2210545901736432e-06, + "loss": 0.2889, + "step": 4195 + }, + { + "epoch": 0.6274861671900703, + "grad_norm": 1.4296358807943457, + "learning_rate": 3.2187914420529176e-06, + "loss": 0.3053, + "step": 4196 + }, + { + "epoch": 0.6276357110812023, + "grad_norm": 1.832702074683805, + "learning_rate": 3.216528711820882e-06, + "loss": 0.4491, + "step": 4197 + }, + { + "epoch": 0.6277852549723344, + "grad_norm": 1.3247323137479683, + "learning_rate": 3.2142664000084e-06, + "loss": 0.1753, + "step": 4198 + }, + { + "epoch": 0.6279347988634665, + "grad_norm": 1.4737325221279896, + "learning_rate": 3.2120045071462303e-06, + "loss": 0.1908, + "step": 4199 + }, + { + "epoch": 0.6280843427545985, + "grad_norm": 2.128037805773552, + "learning_rate": 3.2097430337650347e-06, + "loss": 0.4037, + "step": 4200 + }, + { + "epoch": 0.6282338866457305, + "grad_norm": 1.9359432049498655, + "learning_rate": 3.207481980395379e-06, + "loss": 0.3577, + "step": 4201 + }, + { + "epoch": 0.6283834305368625, + "grad_norm": 1.4914189826217994, + "learning_rate": 3.205221347567723e-06, + "loss": 0.3296, + "step": 4202 + }, + { + "epoch": 0.6285329744279946, + "grad_norm": 1.548420456754965, + "learning_rate": 3.202961135812437e-06, + "loss": 0.3748, + "step": 4203 + }, + { + "epoch": 0.6286825183191267, + "grad_norm": 1.9692451823118604, + "learning_rate": 3.2007013456597864e-06, + "loss": 0.1725, + "step": 4204 + }, + { + "epoch": 0.6288320622102587, + "grad_norm": 1.420481431208107, + "learning_rate": 3.1984419776399413e-06, + "loss": 0.2428, + "step": 4205 + }, + { + "epoch": 0.6289816061013908, + "grad_norm": 1.3526682134449342, + "learning_rate": 3.1961830322829707e-06, + "loss": 0.1709, + "step": 4206 + }, + { + "epoch": 0.6291311499925228, + "grad_norm": 1.5002481533459355, + "learning_rate": 3.1939245101188444e-06, + "loss": 0.3167, + "step": 4207 + }, + { + "epoch": 0.6292806938836548, + "grad_norm": 2.3464477589549126, + "learning_rate": 3.1916664116774333e-06, + "loss": 0.3406, + "step": 4208 + }, + { + "epoch": 0.6294302377747869, + "grad_norm": 1.2641045597562264, + "learning_rate": 3.1894087374885095e-06, + "loss": 0.2122, + "step": 4209 + }, + { + "epoch": 0.629579781665919, + "grad_norm": 1.3787624835316412, + "learning_rate": 3.1871514880817454e-06, + "loss": 0.2141, + "step": 4210 + }, + { + "epoch": 0.629729325557051, + "grad_norm": 1.2143198490479026, + "learning_rate": 3.184894663986715e-06, + "loss": 0.2066, + "step": 4211 + }, + { + "epoch": 0.629878869448183, + "grad_norm": 1.5367748497524958, + "learning_rate": 3.1826382657328877e-06, + "loss": 0.2403, + "step": 4212 + }, + { + "epoch": 0.6300284133393151, + "grad_norm": 1.2749341203227855, + "learning_rate": 3.1803822938496377e-06, + "loss": 0.2034, + "step": 4213 + }, + { + "epoch": 0.6301779572304471, + "grad_norm": 1.8350648672932401, + "learning_rate": 3.1781267488662383e-06, + "loss": 0.1805, + "step": 4214 + }, + { + "epoch": 0.6303275011215792, + "grad_norm": 2.0058052776288537, + "learning_rate": 3.175871631311861e-06, + "loss": 0.2047, + "step": 4215 + }, + { + "epoch": 0.6304770450127112, + "grad_norm": 1.7220131761526412, + "learning_rate": 3.1736169417155814e-06, + "loss": 0.4788, + "step": 4216 + }, + { + "epoch": 0.6306265889038433, + "grad_norm": 1.40884208997203, + "learning_rate": 3.171362680606366e-06, + "loss": 0.3919, + "step": 4217 + }, + { + "epoch": 0.6307761327949754, + "grad_norm": 1.4379399445316037, + "learning_rate": 3.1691088485130896e-06, + "loss": 0.2367, + "step": 4218 + }, + { + "epoch": 0.6309256766861073, + "grad_norm": 1.046787066111561, + "learning_rate": 3.1668554459645217e-06, + "loss": 0.1705, + "step": 4219 + }, + { + "epoch": 0.6310752205772394, + "grad_norm": 1.8102251238144302, + "learning_rate": 3.164602473489333e-06, + "loss": 0.2386, + "step": 4220 + }, + { + "epoch": 0.6312247644683715, + "grad_norm": 1.6606923415837127, + "learning_rate": 3.1623499316160955e-06, + "loss": 0.3065, + "step": 4221 + }, + { + "epoch": 0.6313743083595035, + "grad_norm": 1.6277864893202596, + "learning_rate": 3.1600978208732712e-06, + "loss": 0.1967, + "step": 4222 + }, + { + "epoch": 0.6315238522506356, + "grad_norm": 1.5787248038141106, + "learning_rate": 3.15784614178923e-06, + "loss": 0.2251, + "step": 4223 + }, + { + "epoch": 0.6316733961417677, + "grad_norm": 1.2158381326740617, + "learning_rate": 3.1555948948922376e-06, + "loss": 0.2048, + "step": 4224 + }, + { + "epoch": 0.6318229400328996, + "grad_norm": 1.3127440129245087, + "learning_rate": 3.153344080710459e-06, + "loss": 0.2573, + "step": 4225 + }, + { + "epoch": 0.6319724839240317, + "grad_norm": 1.7939040127871422, + "learning_rate": 3.1510936997719557e-06, + "loss": 0.2294, + "step": 4226 + }, + { + "epoch": 0.6321220278151638, + "grad_norm": 1.3974871896214638, + "learning_rate": 3.1488437526046876e-06, + "loss": 0.2024, + "step": 4227 + }, + { + "epoch": 0.6322715717062958, + "grad_norm": 1.6249650116968457, + "learning_rate": 3.1465942397365168e-06, + "loss": 0.2019, + "step": 4228 + }, + { + "epoch": 0.6324211155974279, + "grad_norm": 1.3398096425022026, + "learning_rate": 3.144345161695199e-06, + "loss": 0.1755, + "step": 4229 + }, + { + "epoch": 0.6325706594885598, + "grad_norm": 1.476733344149943, + "learning_rate": 3.142096519008389e-06, + "loss": 0.1771, + "step": 4230 + }, + { + "epoch": 0.6327202033796919, + "grad_norm": 1.471079632513585, + "learning_rate": 3.1398483122036427e-06, + "loss": 0.4238, + "step": 4231 + }, + { + "epoch": 0.632869747270824, + "grad_norm": 1.6381073298660154, + "learning_rate": 3.1376005418084082e-06, + "loss": 0.3599, + "step": 4232 + }, + { + "epoch": 0.633019291161956, + "grad_norm": 1.7979103671376482, + "learning_rate": 3.135353208350035e-06, + "loss": 0.2413, + "step": 4233 + }, + { + "epoch": 0.6331688350530881, + "grad_norm": 1.9747639038112241, + "learning_rate": 3.1331063123557692e-06, + "loss": 0.3426, + "step": 4234 + }, + { + "epoch": 0.6333183789442202, + "grad_norm": 1.6627981863210517, + "learning_rate": 3.130859854352755e-06, + "loss": 0.228, + "step": 4235 + }, + { + "epoch": 0.6334679228353521, + "grad_norm": 1.584328384767029, + "learning_rate": 3.128613834868035e-06, + "loss": 0.1844, + "step": 4236 + }, + { + "epoch": 0.6336174667264842, + "grad_norm": 2.0955618164544694, + "learning_rate": 3.126368254428541e-06, + "loss": 0.5052, + "step": 4237 + }, + { + "epoch": 0.6337670106176163, + "grad_norm": 1.6176100653355505, + "learning_rate": 3.1241231135611116e-06, + "loss": 0.2819, + "step": 4238 + }, + { + "epoch": 0.6339165545087483, + "grad_norm": 2.9551331489114685, + "learning_rate": 3.1218784127924795e-06, + "loss": 0.5355, + "step": 4239 + }, + { + "epoch": 0.6340660983998804, + "grad_norm": 1.8688017618703836, + "learning_rate": 3.1196341526492715e-06, + "loss": 0.2341, + "step": 4240 + }, + { + "epoch": 0.6342156422910125, + "grad_norm": 1.9962730625781477, + "learning_rate": 3.1173903336580146e-06, + "loss": 0.3511, + "step": 4241 + }, + { + "epoch": 0.6343651861821444, + "grad_norm": 1.6150488602773196, + "learning_rate": 3.1151469563451275e-06, + "loss": 0.221, + "step": 4242 + }, + { + "epoch": 0.6345147300732765, + "grad_norm": 2.3530875232138797, + "learning_rate": 3.1129040212369286e-06, + "loss": 0.3598, + "step": 4243 + }, + { + "epoch": 0.6346642739644085, + "grad_norm": 1.2420835811404023, + "learning_rate": 3.1106615288596337e-06, + "loss": 0.2156, + "step": 4244 + }, + { + "epoch": 0.6348138178555406, + "grad_norm": 1.401385423495105, + "learning_rate": 3.108419479739352e-06, + "loss": 0.2142, + "step": 4245 + }, + { + "epoch": 0.6349633617466727, + "grad_norm": 2.1568206778278993, + "learning_rate": 3.1061778744020916e-06, + "loss": 0.2041, + "step": 4246 + }, + { + "epoch": 0.6351129056378046, + "grad_norm": 1.2507736523913935, + "learning_rate": 3.1039367133737517e-06, + "loss": 0.1835, + "step": 4247 + }, + { + "epoch": 0.6352624495289367, + "grad_norm": 1.7213048240141702, + "learning_rate": 3.1016959971801326e-06, + "loss": 0.4563, + "step": 4248 + }, + { + "epoch": 0.6354119934200688, + "grad_norm": 1.2148837576653622, + "learning_rate": 3.0994557263469267e-06, + "loss": 0.1998, + "step": 4249 + }, + { + "epoch": 0.6355615373112008, + "grad_norm": 1.848918443348012, + "learning_rate": 3.097215901399724e-06, + "loss": 0.4183, + "step": 4250 + }, + { + "epoch": 0.6357110812023329, + "grad_norm": 1.447495899079333, + "learning_rate": 3.0949765228640094e-06, + "loss": 0.2844, + "step": 4251 + }, + { + "epoch": 0.635860625093465, + "grad_norm": 1.9257511306790576, + "learning_rate": 3.092737591265161e-06, + "loss": 0.1671, + "step": 4252 + }, + { + "epoch": 0.636010168984597, + "grad_norm": 1.81538166148831, + "learning_rate": 3.0904991071284544e-06, + "loss": 0.3598, + "step": 4253 + }, + { + "epoch": 0.636159712875729, + "grad_norm": 1.840220273117095, + "learning_rate": 3.0882610709790606e-06, + "loss": 0.2887, + "step": 4254 + }, + { + "epoch": 0.6363092567668611, + "grad_norm": 2.161029128850978, + "learning_rate": 3.086023483342043e-06, + "loss": 0.6133, + "step": 4255 + }, + { + "epoch": 0.6364588006579931, + "grad_norm": 1.6267242774433444, + "learning_rate": 3.083786344742362e-06, + "loss": 0.217, + "step": 4256 + }, + { + "epoch": 0.6366083445491252, + "grad_norm": 1.5140885204209047, + "learning_rate": 3.081549655704874e-06, + "loss": 0.3897, + "step": 4257 + }, + { + "epoch": 0.6367578884402573, + "grad_norm": 1.5106568808657759, + "learning_rate": 3.079313416754322e-06, + "loss": 0.1983, + "step": 4258 + }, + { + "epoch": 0.6369074323313892, + "grad_norm": 1.6011885974138298, + "learning_rate": 3.0770776284153544e-06, + "loss": 0.2696, + "step": 4259 + }, + { + "epoch": 0.6370569762225213, + "grad_norm": 1.7578971545709878, + "learning_rate": 3.0748422912125066e-06, + "loss": 0.3563, + "step": 4260 + }, + { + "epoch": 0.6372065201136533, + "grad_norm": 1.4522858152587212, + "learning_rate": 3.072607405670211e-06, + "loss": 0.2182, + "step": 4261 + }, + { + "epoch": 0.6373560640047854, + "grad_norm": 1.601429394098701, + "learning_rate": 3.070372972312795e-06, + "loss": 0.3268, + "step": 4262 + }, + { + "epoch": 0.6375056078959175, + "grad_norm": 1.586472172174774, + "learning_rate": 3.0681389916644745e-06, + "loss": 0.1793, + "step": 4263 + }, + { + "epoch": 0.6376551517870495, + "grad_norm": 1.6949417042764672, + "learning_rate": 3.065905464249364e-06, + "loss": 0.2796, + "step": 4264 + }, + { + "epoch": 0.6378046956781815, + "grad_norm": 1.3534270198383676, + "learning_rate": 3.0636723905914717e-06, + "loss": 0.3281, + "step": 4265 + }, + { + "epoch": 0.6379542395693136, + "grad_norm": 1.3866514082125108, + "learning_rate": 3.0614397712146974e-06, + "loss": 0.2263, + "step": 4266 + }, + { + "epoch": 0.6381037834604456, + "grad_norm": 1.8462458786372813, + "learning_rate": 3.0592076066428367e-06, + "loss": 0.5081, + "step": 4267 + }, + { + "epoch": 0.6382533273515777, + "grad_norm": 1.5215387444826622, + "learning_rate": 3.0569758973995745e-06, + "loss": 0.3697, + "step": 4268 + }, + { + "epoch": 0.6384028712427098, + "grad_norm": 1.572814999435716, + "learning_rate": 3.0547446440084914e-06, + "loss": 0.3098, + "step": 4269 + }, + { + "epoch": 0.6385524151338418, + "grad_norm": 1.3241548494658333, + "learning_rate": 3.052513846993063e-06, + "loss": 0.3048, + "step": 4270 + }, + { + "epoch": 0.6387019590249738, + "grad_norm": 1.5147109741247766, + "learning_rate": 3.0502835068766545e-06, + "loss": 0.3428, + "step": 4271 + }, + { + "epoch": 0.6388515029161059, + "grad_norm": 1.7322061612980246, + "learning_rate": 3.0480536241825263e-06, + "loss": 0.3386, + "step": 4272 + }, + { + "epoch": 0.6390010468072379, + "grad_norm": 1.5645781715890368, + "learning_rate": 3.045824199433829e-06, + "loss": 0.2606, + "step": 4273 + }, + { + "epoch": 0.63915059069837, + "grad_norm": 2.3293893184482797, + "learning_rate": 3.0435952331536072e-06, + "loss": 0.5555, + "step": 4274 + }, + { + "epoch": 0.639300134589502, + "grad_norm": 1.0615012965836783, + "learning_rate": 3.0413667258647983e-06, + "loss": 0.1921, + "step": 4275 + }, + { + "epoch": 0.639449678480634, + "grad_norm": 1.6734758493782524, + "learning_rate": 3.039138678090232e-06, + "loss": 0.2011, + "step": 4276 + }, + { + "epoch": 0.6395992223717661, + "grad_norm": 2.1157051612171163, + "learning_rate": 3.036911090352631e-06, + "loss": 0.4092, + "step": 4277 + }, + { + "epoch": 0.6397487662628981, + "grad_norm": 1.3554481867568882, + "learning_rate": 3.034683963174604e-06, + "loss": 0.1613, + "step": 4278 + }, + { + "epoch": 0.6398983101540302, + "grad_norm": 1.3257952142585847, + "learning_rate": 3.0324572970786607e-06, + "loss": 0.2839, + "step": 4279 + }, + { + "epoch": 0.6400478540451623, + "grad_norm": 1.691758447323851, + "learning_rate": 3.030231092587198e-06, + "loss": 0.4357, + "step": 4280 + }, + { + "epoch": 0.6401973979362943, + "grad_norm": 1.403610467612681, + "learning_rate": 3.028005350222504e-06, + "loss": 0.3382, + "step": 4281 + }, + { + "epoch": 0.6403469418274264, + "grad_norm": 1.2279332452901197, + "learning_rate": 3.0257800705067614e-06, + "loss": 0.2253, + "step": 4282 + }, + { + "epoch": 0.6404964857185584, + "grad_norm": 1.3097407879338883, + "learning_rate": 3.0235552539620384e-06, + "loss": 0.2013, + "step": 4283 + }, + { + "epoch": 0.6406460296096904, + "grad_norm": 1.7716307700503333, + "learning_rate": 3.021330901110301e-06, + "loss": 0.3922, + "step": 4284 + }, + { + "epoch": 0.6407955735008225, + "grad_norm": 1.4417814009504493, + "learning_rate": 3.0191070124734034e-06, + "loss": 0.3302, + "step": 4285 + }, + { + "epoch": 0.6409451173919546, + "grad_norm": 1.3554969696713202, + "learning_rate": 3.016883588573091e-06, + "loss": 0.1897, + "step": 4286 + }, + { + "epoch": 0.6410946612830866, + "grad_norm": 1.6342674280144482, + "learning_rate": 3.014660629931002e-06, + "loss": 0.2236, + "step": 4287 + }, + { + "epoch": 0.6412442051742187, + "grad_norm": 1.2027823026684163, + "learning_rate": 3.0124381370686617e-06, + "loss": 0.188, + "step": 4288 + }, + { + "epoch": 0.6413937490653506, + "grad_norm": 1.0825612297165725, + "learning_rate": 3.010216110507489e-06, + "loss": 0.1893, + "step": 4289 + }, + { + "epoch": 0.6415432929564827, + "grad_norm": 1.8647778656423657, + "learning_rate": 3.007994550768793e-06, + "loss": 0.4889, + "step": 4290 + }, + { + "epoch": 0.6416928368476148, + "grad_norm": 1.3882812293916185, + "learning_rate": 3.005773458373773e-06, + "loss": 0.2079, + "step": 4291 + }, + { + "epoch": 0.6418423807387468, + "grad_norm": 1.3562041151287785, + "learning_rate": 3.0035528338435205e-06, + "loss": 0.2236, + "step": 4292 + }, + { + "epoch": 0.6419919246298789, + "grad_norm": 1.3686846791536245, + "learning_rate": 3.001332677699012e-06, + "loss": 0.2996, + "step": 4293 + }, + { + "epoch": 0.642141468521011, + "grad_norm": 1.8880382962351019, + "learning_rate": 2.9991129904611193e-06, + "loss": 0.3409, + "step": 4294 + }, + { + "epoch": 0.6422910124121429, + "grad_norm": 1.112992761276077, + "learning_rate": 2.996893772650602e-06, + "loss": 0.1865, + "step": 4295 + }, + { + "epoch": 0.642440556303275, + "grad_norm": 2.0473388917881783, + "learning_rate": 2.9946750247881107e-06, + "loss": 0.4307, + "step": 4296 + }, + { + "epoch": 0.6425901001944071, + "grad_norm": 1.3086798312214234, + "learning_rate": 2.9924567473941867e-06, + "loss": 0.1451, + "step": 4297 + }, + { + "epoch": 0.6427396440855391, + "grad_norm": 1.329348124352134, + "learning_rate": 2.9902389409892553e-06, + "loss": 0.3251, + "step": 4298 + }, + { + "epoch": 0.6428891879766712, + "grad_norm": 1.348801188692966, + "learning_rate": 2.9880216060936364e-06, + "loss": 0.1869, + "step": 4299 + }, + { + "epoch": 0.6430387318678032, + "grad_norm": 1.7721779505445987, + "learning_rate": 2.9858047432275387e-06, + "loss": 0.3939, + "step": 4300 + }, + { + "epoch": 0.6431882757589352, + "grad_norm": 2.270411296146692, + "learning_rate": 2.983588352911061e-06, + "loss": 0.1998, + "step": 4301 + }, + { + "epoch": 0.6433378196500673, + "grad_norm": 1.3542833009033532, + "learning_rate": 2.98137243566419e-06, + "loss": 0.1676, + "step": 4302 + }, + { + "epoch": 0.6434873635411994, + "grad_norm": 1.7866702598193438, + "learning_rate": 2.9791569920067975e-06, + "loss": 0.2745, + "step": 4303 + }, + { + "epoch": 0.6436369074323314, + "grad_norm": 1.7819432699718787, + "learning_rate": 2.976942022458651e-06, + "loss": 0.3678, + "step": 4304 + }, + { + "epoch": 0.6437864513234635, + "grad_norm": 1.9658303788785358, + "learning_rate": 2.974727527539403e-06, + "loss": 0.4969, + "step": 4305 + }, + { + "epoch": 0.6439359952145954, + "grad_norm": 1.607317627948274, + "learning_rate": 2.9725135077685946e-06, + "loss": 0.3529, + "step": 4306 + }, + { + "epoch": 0.6440855391057275, + "grad_norm": 1.475033336602126, + "learning_rate": 2.9702999636656584e-06, + "loss": 0.2222, + "step": 4307 + }, + { + "epoch": 0.6442350829968596, + "grad_norm": 1.973299582548237, + "learning_rate": 2.968086895749911e-06, + "loss": 0.2234, + "step": 4308 + }, + { + "epoch": 0.6443846268879916, + "grad_norm": 1.7058778528147858, + "learning_rate": 2.9658743045405593e-06, + "loss": 0.2131, + "step": 4309 + }, + { + "epoch": 0.6445341707791237, + "grad_norm": 0.9302716463605547, + "learning_rate": 2.9636621905566997e-06, + "loss": 0.1485, + "step": 4310 + }, + { + "epoch": 0.6446837146702558, + "grad_norm": 1.1167565995299549, + "learning_rate": 2.9614505543173154e-06, + "loss": 0.2204, + "step": 4311 + }, + { + "epoch": 0.6448332585613877, + "grad_norm": 1.5708854066715467, + "learning_rate": 2.959239396341278e-06, + "loss": 0.2758, + "step": 4312 + }, + { + "epoch": 0.6449828024525198, + "grad_norm": 2.089312733718862, + "learning_rate": 2.957028717147345e-06, + "loss": 0.201, + "step": 4313 + }, + { + "epoch": 0.6451323463436519, + "grad_norm": 2.1281298525302588, + "learning_rate": 2.954818517254164e-06, + "loss": 0.4695, + "step": 4314 + }, + { + "epoch": 0.6452818902347839, + "grad_norm": 1.5959774304509975, + "learning_rate": 2.9526087971802685e-06, + "loss": 0.3108, + "step": 4315 + }, + { + "epoch": 0.645431434125916, + "grad_norm": 1.5718195549000646, + "learning_rate": 2.950399557444081e-06, + "loss": 0.3204, + "step": 4316 + }, + { + "epoch": 0.6455809780170481, + "grad_norm": 1.4213056419991819, + "learning_rate": 2.948190798563912e-06, + "loss": 0.3553, + "step": 4317 + }, + { + "epoch": 0.64573052190818, + "grad_norm": 1.8761753689830492, + "learning_rate": 2.9459825210579534e-06, + "loss": 0.361, + "step": 4318 + }, + { + "epoch": 0.6458800657993121, + "grad_norm": 1.9599100361948998, + "learning_rate": 2.94377472544429e-06, + "loss": 0.5, + "step": 4319 + }, + { + "epoch": 0.6460296096904441, + "grad_norm": 1.452596811980564, + "learning_rate": 2.941567412240893e-06, + "loss": 0.3076, + "step": 4320 + }, + { + "epoch": 0.6461791535815762, + "grad_norm": 1.867725765581649, + "learning_rate": 2.9393605819656203e-06, + "loss": 0.4896, + "step": 4321 + }, + { + "epoch": 0.6463286974727083, + "grad_norm": 2.0028651025738604, + "learning_rate": 2.937154235136215e-06, + "loss": 0.4414, + "step": 4322 + }, + { + "epoch": 0.6464782413638402, + "grad_norm": 1.444525605000369, + "learning_rate": 2.934948372270305e-06, + "loss": 0.1703, + "step": 4323 + }, + { + "epoch": 0.6466277852549723, + "grad_norm": 1.6260674196352805, + "learning_rate": 2.932742993885408e-06, + "loss": 0.2936, + "step": 4324 + }, + { + "epoch": 0.6467773291461044, + "grad_norm": 1.6357422467132101, + "learning_rate": 2.930538100498928e-06, + "loss": 0.1485, + "step": 4325 + }, + { + "epoch": 0.6469268730372364, + "grad_norm": 1.4888503420859003, + "learning_rate": 2.9283336926281535e-06, + "loss": 0.49, + "step": 4326 + }, + { + "epoch": 0.6470764169283685, + "grad_norm": 1.0301201982118935, + "learning_rate": 2.9261297707902614e-06, + "loss": 0.205, + "step": 4327 + }, + { + "epoch": 0.6472259608195006, + "grad_norm": 1.5229938350109378, + "learning_rate": 2.9239263355023104e-06, + "loss": 0.3428, + "step": 4328 + }, + { + "epoch": 0.6473755047106325, + "grad_norm": 1.3838220241069505, + "learning_rate": 2.9217233872812502e-06, + "loss": 0.2269, + "step": 4329 + }, + { + "epoch": 0.6475250486017646, + "grad_norm": 2.00517100675116, + "learning_rate": 2.919520926643911e-06, + "loss": 0.5935, + "step": 4330 + }, + { + "epoch": 0.6476745924928967, + "grad_norm": 1.4602024554784119, + "learning_rate": 2.9173189541070124e-06, + "loss": 0.3243, + "step": 4331 + }, + { + "epoch": 0.6478241363840287, + "grad_norm": 1.160834929528934, + "learning_rate": 2.9151174701871616e-06, + "loss": 0.1662, + "step": 4332 + }, + { + "epoch": 0.6479736802751608, + "grad_norm": 1.2978941922617255, + "learning_rate": 2.9129164754008433e-06, + "loss": 0.2174, + "step": 4333 + }, + { + "epoch": 0.6481232241662928, + "grad_norm": 1.2391057330727242, + "learning_rate": 2.910715970264433e-06, + "loss": 0.182, + "step": 4334 + }, + { + "epoch": 0.6482727680574248, + "grad_norm": 1.1677889159191694, + "learning_rate": 2.908515955294192e-06, + "loss": 0.1465, + "step": 4335 + }, + { + "epoch": 0.6484223119485569, + "grad_norm": 1.8166755640689873, + "learning_rate": 2.9063164310062643e-06, + "loss": 0.2768, + "step": 4336 + }, + { + "epoch": 0.6485718558396889, + "grad_norm": 1.3259310299996228, + "learning_rate": 2.9041173979166813e-06, + "loss": 0.2222, + "step": 4337 + }, + { + "epoch": 0.648721399730821, + "grad_norm": 1.5337321468214244, + "learning_rate": 2.9019188565413535e-06, + "loss": 0.2905, + "step": 4338 + }, + { + "epoch": 0.6488709436219531, + "grad_norm": 1.5311289564286026, + "learning_rate": 2.899720807396082e-06, + "loss": 0.3367, + "step": 4339 + }, + { + "epoch": 0.649020487513085, + "grad_norm": 1.5629391529927086, + "learning_rate": 2.89752325099655e-06, + "loss": 0.2184, + "step": 4340 + }, + { + "epoch": 0.6491700314042171, + "grad_norm": 1.1937759596313804, + "learning_rate": 2.8953261878583263e-06, + "loss": 0.1794, + "step": 4341 + }, + { + "epoch": 0.6493195752953492, + "grad_norm": 1.8427953915887811, + "learning_rate": 2.8931296184968614e-06, + "loss": 0.2087, + "step": 4342 + }, + { + "epoch": 0.6494691191864812, + "grad_norm": 1.3918761481043547, + "learning_rate": 2.8909335434274932e-06, + "loss": 0.2514, + "step": 4343 + }, + { + "epoch": 0.6496186630776133, + "grad_norm": 2.3172764293559864, + "learning_rate": 2.888737963165442e-06, + "loss": 0.394, + "step": 4344 + }, + { + "epoch": 0.6497682069687454, + "grad_norm": 1.34481782125672, + "learning_rate": 2.886542878225811e-06, + "loss": 0.2146, + "step": 4345 + }, + { + "epoch": 0.6499177508598774, + "grad_norm": 1.3706664753686384, + "learning_rate": 2.8843482891235895e-06, + "loss": 0.1911, + "step": 4346 + }, + { + "epoch": 0.6500672947510094, + "grad_norm": 0.9839726564346342, + "learning_rate": 2.8821541963736482e-06, + "loss": 0.1612, + "step": 4347 + }, + { + "epoch": 0.6502168386421414, + "grad_norm": 2.3385887027986003, + "learning_rate": 2.8799606004907452e-06, + "loss": 0.3699, + "step": 4348 + }, + { + "epoch": 0.6503663825332735, + "grad_norm": 0.9434101059503864, + "learning_rate": 2.8777675019895144e-06, + "loss": 0.2116, + "step": 4349 + }, + { + "epoch": 0.6505159264244056, + "grad_norm": 1.4524046063305824, + "learning_rate": 2.8755749013844813e-06, + "loss": 0.2218, + "step": 4350 + }, + { + "epoch": 0.6506654703155376, + "grad_norm": 1.380876290138162, + "learning_rate": 2.873382799190049e-06, + "loss": 0.2275, + "step": 4351 + }, + { + "epoch": 0.6508150142066697, + "grad_norm": 1.2042028565506917, + "learning_rate": 2.871191195920508e-06, + "loss": 0.2236, + "step": 4352 + }, + { + "epoch": 0.6509645580978017, + "grad_norm": 1.561198882536686, + "learning_rate": 2.86900009209003e-06, + "loss": 0.4117, + "step": 4353 + }, + { + "epoch": 0.6511141019889337, + "grad_norm": 1.78117218182217, + "learning_rate": 2.8668094882126658e-06, + "loss": 0.363, + "step": 4354 + }, + { + "epoch": 0.6512636458800658, + "grad_norm": 1.4500223615377268, + "learning_rate": 2.864619384802354e-06, + "loss": 0.2251, + "step": 4355 + }, + { + "epoch": 0.6514131897711979, + "grad_norm": 1.8816120507444667, + "learning_rate": 2.862429782372914e-06, + "loss": 0.5379, + "step": 4356 + }, + { + "epoch": 0.6515627336623299, + "grad_norm": 1.2483361555174965, + "learning_rate": 2.860240681438048e-06, + "loss": 0.2536, + "step": 4357 + }, + { + "epoch": 0.651712277553462, + "grad_norm": 2.434529906670917, + "learning_rate": 2.858052082511339e-06, + "loss": 0.359, + "step": 4358 + }, + { + "epoch": 0.651861821444594, + "grad_norm": 1.3095152101580132, + "learning_rate": 2.8558639861062544e-06, + "loss": 0.3292, + "step": 4359 + }, + { + "epoch": 0.652011365335726, + "grad_norm": 1.0014669324387186, + "learning_rate": 2.853676392736142e-06, + "loss": 0.1774, + "step": 4360 + }, + { + "epoch": 0.6521609092268581, + "grad_norm": 1.107667196103754, + "learning_rate": 2.8514893029142337e-06, + "loss": 0.1987, + "step": 4361 + }, + { + "epoch": 0.6523104531179902, + "grad_norm": 0.8289718211895785, + "learning_rate": 2.8493027171536403e-06, + "loss": 0.1364, + "step": 4362 + }, + { + "epoch": 0.6524599970091222, + "grad_norm": 1.0509461340888442, + "learning_rate": 2.847116635967359e-06, + "loss": 0.1618, + "step": 4363 + }, + { + "epoch": 0.6526095409002542, + "grad_norm": 1.9624907660418935, + "learning_rate": 2.844931059868261e-06, + "loss": 0.5125, + "step": 4364 + }, + { + "epoch": 0.6527590847913862, + "grad_norm": 1.1521614286081878, + "learning_rate": 2.842745989369106e-06, + "loss": 0.2053, + "step": 4365 + }, + { + "epoch": 0.6529086286825183, + "grad_norm": 1.4513340162465773, + "learning_rate": 2.840561424982531e-06, + "loss": 0.3688, + "step": 4366 + }, + { + "epoch": 0.6530581725736504, + "grad_norm": 2.464838230471469, + "learning_rate": 2.838377367221057e-06, + "loss": 0.7219, + "step": 4367 + }, + { + "epoch": 0.6532077164647824, + "grad_norm": 1.297373135966818, + "learning_rate": 2.8361938165970876e-06, + "loss": 0.2473, + "step": 4368 + }, + { + "epoch": 0.6533572603559145, + "grad_norm": 1.5055358776731897, + "learning_rate": 2.8340107736229e-06, + "loss": 0.411, + "step": 4369 + }, + { + "epoch": 0.6535068042470465, + "grad_norm": 1.6739994793740698, + "learning_rate": 2.831828238810658e-06, + "loss": 0.4319, + "step": 4370 + }, + { + "epoch": 0.6536563481381785, + "grad_norm": 1.2825319924370413, + "learning_rate": 2.8296462126724077e-06, + "loss": 0.2329, + "step": 4371 + }, + { + "epoch": 0.6538058920293106, + "grad_norm": 1.5202140426315065, + "learning_rate": 2.8274646957200724e-06, + "loss": 0.2369, + "step": 4372 + }, + { + "epoch": 0.6539554359204427, + "grad_norm": 1.739325259152319, + "learning_rate": 2.8252836884654594e-06, + "loss": 0.3713, + "step": 4373 + }, + { + "epoch": 0.6541049798115747, + "grad_norm": 1.3433364131880747, + "learning_rate": 2.82310319142025e-06, + "loss": 0.2089, + "step": 4374 + }, + { + "epoch": 0.6542545237027068, + "grad_norm": 1.2350274365674798, + "learning_rate": 2.8209232050960105e-06, + "loss": 0.2776, + "step": 4375 + }, + { + "epoch": 0.6544040675938388, + "grad_norm": 1.3867166787790224, + "learning_rate": 2.818743730004188e-06, + "loss": 0.2008, + "step": 4376 + }, + { + "epoch": 0.6545536114849708, + "grad_norm": 0.941825782121814, + "learning_rate": 2.816564766656108e-06, + "loss": 0.1858, + "step": 4377 + }, + { + "epoch": 0.6547031553761029, + "grad_norm": 1.1183426227174296, + "learning_rate": 2.8143863155629793e-06, + "loss": 0.3019, + "step": 4378 + }, + { + "epoch": 0.6548526992672349, + "grad_norm": 1.9187011951738253, + "learning_rate": 2.812208377235881e-06, + "loss": 0.3987, + "step": 4379 + }, + { + "epoch": 0.655002243158367, + "grad_norm": 1.3958417841195268, + "learning_rate": 2.8100309521857825e-06, + "loss": 0.3578, + "step": 4380 + }, + { + "epoch": 0.6551517870494991, + "grad_norm": 1.1711576507342767, + "learning_rate": 2.807854040923529e-06, + "loss": 0.1963, + "step": 4381 + }, + { + "epoch": 0.655301330940631, + "grad_norm": 0.9913151578185128, + "learning_rate": 2.8056776439598433e-06, + "loss": 0.1655, + "step": 4382 + }, + { + "epoch": 0.6554508748317631, + "grad_norm": 1.5681331704171262, + "learning_rate": 2.8035017618053296e-06, + "loss": 0.2992, + "step": 4383 + }, + { + "epoch": 0.6556004187228952, + "grad_norm": 1.4790018156584064, + "learning_rate": 2.8013263949704706e-06, + "loss": 0.3136, + "step": 4384 + }, + { + "epoch": 0.6557499626140272, + "grad_norm": 0.9994189149888982, + "learning_rate": 2.7991515439656297e-06, + "loss": 0.1362, + "step": 4385 + }, + { + "epoch": 0.6558995065051593, + "grad_norm": 1.3602945298003222, + "learning_rate": 2.796977209301047e-06, + "loss": 0.1949, + "step": 4386 + }, + { + "epoch": 0.6560490503962914, + "grad_norm": 1.5519253212439887, + "learning_rate": 2.7948033914868415e-06, + "loss": 0.1664, + "step": 4387 + }, + { + "epoch": 0.6561985942874233, + "grad_norm": 1.3116382312883463, + "learning_rate": 2.792630091033015e-06, + "loss": 0.2667, + "step": 4388 + }, + { + "epoch": 0.6563481381785554, + "grad_norm": 1.5456359505487205, + "learning_rate": 2.7904573084494407e-06, + "loss": 0.3245, + "step": 4389 + }, + { + "epoch": 0.6564976820696875, + "grad_norm": 1.2913211368683502, + "learning_rate": 2.7882850442458755e-06, + "loss": 0.3066, + "step": 4390 + }, + { + "epoch": 0.6566472259608195, + "grad_norm": 1.869518459083425, + "learning_rate": 2.7861132989319544e-06, + "loss": 0.3495, + "step": 4391 + }, + { + "epoch": 0.6567967698519516, + "grad_norm": 1.373328927428082, + "learning_rate": 2.7839420730171895e-06, + "loss": 0.2125, + "step": 4392 + }, + { + "epoch": 0.6569463137430835, + "grad_norm": 1.8085123954255349, + "learning_rate": 2.781771367010973e-06, + "loss": 0.2415, + "step": 4393 + }, + { + "epoch": 0.6570958576342156, + "grad_norm": 1.0139033929676113, + "learning_rate": 2.77960118142257e-06, + "loss": 0.1742, + "step": 4394 + }, + { + "epoch": 0.6572454015253477, + "grad_norm": 1.685995811270678, + "learning_rate": 2.7774315167611276e-06, + "loss": 0.3223, + "step": 4395 + }, + { + "epoch": 0.6573949454164797, + "grad_norm": 1.7868388236948014, + "learning_rate": 2.7752623735356722e-06, + "loss": 0.3727, + "step": 4396 + }, + { + "epoch": 0.6575444893076118, + "grad_norm": 2.171369150655544, + "learning_rate": 2.7730937522551043e-06, + "loss": 0.3697, + "step": 4397 + }, + { + "epoch": 0.6576940331987439, + "grad_norm": 1.9125192931470543, + "learning_rate": 2.7709256534282037e-06, + "loss": 0.2111, + "step": 4398 + }, + { + "epoch": 0.6578435770898758, + "grad_norm": 1.113585534256139, + "learning_rate": 2.768758077563627e-06, + "loss": 0.1945, + "step": 4399 + }, + { + "epoch": 0.6579931209810079, + "grad_norm": 1.2761501807828817, + "learning_rate": 2.7665910251699083e-06, + "loss": 0.1971, + "step": 4400 + }, + { + "epoch": 0.65814266487214, + "grad_norm": 1.4329874019305284, + "learning_rate": 2.7644244967554595e-06, + "loss": 0.3153, + "step": 4401 + }, + { + "epoch": 0.658292208763272, + "grad_norm": 1.3299910395004908, + "learning_rate": 2.762258492828569e-06, + "loss": 0.2046, + "step": 4402 + }, + { + "epoch": 0.6584417526544041, + "grad_norm": 1.6253732014304119, + "learning_rate": 2.7600930138974035e-06, + "loss": 0.2748, + "step": 4403 + }, + { + "epoch": 0.6585912965455362, + "grad_norm": 1.3081383695661915, + "learning_rate": 2.7579280604700025e-06, + "loss": 0.2215, + "step": 4404 + }, + { + "epoch": 0.6587408404366681, + "grad_norm": 1.562999848046207, + "learning_rate": 2.755763633054286e-06, + "loss": 0.1758, + "step": 4405 + }, + { + "epoch": 0.6588903843278002, + "grad_norm": 1.6911611048983322, + "learning_rate": 2.75359973215805e-06, + "loss": 0.3255, + "step": 4406 + }, + { + "epoch": 0.6590399282189322, + "grad_norm": 1.4640913722306848, + "learning_rate": 2.7514363582889653e-06, + "loss": 0.2046, + "step": 4407 + }, + { + "epoch": 0.6591894721100643, + "grad_norm": 1.218210501349125, + "learning_rate": 2.749273511954583e-06, + "loss": 0.2054, + "step": 4408 + }, + { + "epoch": 0.6593390160011964, + "grad_norm": 1.3940094978870614, + "learning_rate": 2.7471111936623253e-06, + "loss": 0.2438, + "step": 4409 + }, + { + "epoch": 0.6594885598923284, + "grad_norm": 1.3026691632055731, + "learning_rate": 2.74494940391949e-06, + "loss": 0.2353, + "step": 4410 + }, + { + "epoch": 0.6596381037834604, + "grad_norm": 1.6382291329843632, + "learning_rate": 2.7427881432332603e-06, + "loss": 0.312, + "step": 4411 + }, + { + "epoch": 0.6597876476745925, + "grad_norm": 1.5603652939172554, + "learning_rate": 2.7406274121106856e-06, + "loss": 0.3006, + "step": 4412 + }, + { + "epoch": 0.6599371915657245, + "grad_norm": 1.090656104231796, + "learning_rate": 2.7384672110586963e-06, + "loss": 0.1889, + "step": 4413 + }, + { + "epoch": 0.6600867354568566, + "grad_norm": 1.436850245454595, + "learning_rate": 2.736307540584092e-06, + "loss": 0.2647, + "step": 4414 + }, + { + "epoch": 0.6602362793479887, + "grad_norm": 1.7597105163684004, + "learning_rate": 2.7341484011935554e-06, + "loss": 0.385, + "step": 4415 + }, + { + "epoch": 0.6603858232391207, + "grad_norm": 1.2090788451524748, + "learning_rate": 2.7319897933936408e-06, + "loss": 0.241, + "step": 4416 + }, + { + "epoch": 0.6605353671302527, + "grad_norm": 1.7319328710330286, + "learning_rate": 2.7298317176907772e-06, + "loss": 0.451, + "step": 4417 + }, + { + "epoch": 0.6606849110213848, + "grad_norm": 1.5724306339844993, + "learning_rate": 2.727674174591274e-06, + "loss": 0.2432, + "step": 4418 + }, + { + "epoch": 0.6608344549125168, + "grad_norm": 1.9853434639918945, + "learning_rate": 2.7255171646013066e-06, + "loss": 0.4608, + "step": 4419 + }, + { + "epoch": 0.6609839988036489, + "grad_norm": 1.7678752206900543, + "learning_rate": 2.7233606882269326e-06, + "loss": 0.3429, + "step": 4420 + }, + { + "epoch": 0.661133542694781, + "grad_norm": 1.9928878413352948, + "learning_rate": 2.721204745974082e-06, + "loss": 0.2974, + "step": 4421 + }, + { + "epoch": 0.661283086585913, + "grad_norm": 1.5540662511476908, + "learning_rate": 2.719049338348559e-06, + "loss": 0.2107, + "step": 4422 + }, + { + "epoch": 0.661432630477045, + "grad_norm": 1.250271025720372, + "learning_rate": 2.7168944658560436e-06, + "loss": 0.1951, + "step": 4423 + }, + { + "epoch": 0.661582174368177, + "grad_norm": 1.8632502883162096, + "learning_rate": 2.7147401290020903e-06, + "loss": 0.4555, + "step": 4424 + }, + { + "epoch": 0.6617317182593091, + "grad_norm": 1.7615241135628605, + "learning_rate": 2.712586328292126e-06, + "loss": 0.2952, + "step": 4425 + }, + { + "epoch": 0.6618812621504412, + "grad_norm": 1.538421076032742, + "learning_rate": 2.7104330642314534e-06, + "loss": 0.2333, + "step": 4426 + }, + { + "epoch": 0.6620308060415732, + "grad_norm": 1.2007569968926926, + "learning_rate": 2.70828033732525e-06, + "loss": 0.208, + "step": 4427 + }, + { + "epoch": 0.6621803499327052, + "grad_norm": 1.3593872319142528, + "learning_rate": 2.7061281480785655e-06, + "loss": 0.1821, + "step": 4428 + }, + { + "epoch": 0.6623298938238373, + "grad_norm": 1.232105994512284, + "learning_rate": 2.703976496996327e-06, + "loss": 0.1709, + "step": 4429 + }, + { + "epoch": 0.6624794377149693, + "grad_norm": 1.45991562993099, + "learning_rate": 2.7018253845833265e-06, + "loss": 0.3415, + "step": 4430 + }, + { + "epoch": 0.6626289816061014, + "grad_norm": 1.8114343832084139, + "learning_rate": 2.6996748113442397e-06, + "loss": 0.4451, + "step": 4431 + }, + { + "epoch": 0.6627785254972335, + "grad_norm": 1.2612949226382904, + "learning_rate": 2.697524777783611e-06, + "loss": 0.1998, + "step": 4432 + }, + { + "epoch": 0.6629280693883655, + "grad_norm": 1.65296698826448, + "learning_rate": 2.69537528440586e-06, + "loss": 0.187, + "step": 4433 + }, + { + "epoch": 0.6630776132794975, + "grad_norm": 1.8179891682696618, + "learning_rate": 2.6932263317152797e-06, + "loss": 0.2954, + "step": 4434 + }, + { + "epoch": 0.6632271571706296, + "grad_norm": 1.3918087600920022, + "learning_rate": 2.691077920216031e-06, + "loss": 0.2249, + "step": 4435 + }, + { + "epoch": 0.6633767010617616, + "grad_norm": 1.181894833041793, + "learning_rate": 2.6889300504121537e-06, + "loss": 0.3181, + "step": 4436 + }, + { + "epoch": 0.6635262449528937, + "grad_norm": 1.5083352946651556, + "learning_rate": 2.6867827228075596e-06, + "loss": 0.1869, + "step": 4437 + }, + { + "epoch": 0.6636757888440257, + "grad_norm": 1.4057723638218287, + "learning_rate": 2.6846359379060317e-06, + "loss": 0.2019, + "step": 4438 + }, + { + "epoch": 0.6638253327351578, + "grad_norm": 1.5144730730196767, + "learning_rate": 2.6824896962112268e-06, + "loss": 0.3392, + "step": 4439 + }, + { + "epoch": 0.6639748766262898, + "grad_norm": 1.8769019176472341, + "learning_rate": 2.680343998226674e-06, + "loss": 0.3426, + "step": 4440 + }, + { + "epoch": 0.6641244205174218, + "grad_norm": 1.5553252335483767, + "learning_rate": 2.678198844455775e-06, + "loss": 0.3165, + "step": 4441 + }, + { + "epoch": 0.6642739644085539, + "grad_norm": 2.146334878090752, + "learning_rate": 2.6760542354018026e-06, + "loss": 0.6555, + "step": 4442 + }, + { + "epoch": 0.664423508299686, + "grad_norm": 1.3461122126317742, + "learning_rate": 2.6739101715679035e-06, + "loss": 0.1645, + "step": 4443 + }, + { + "epoch": 0.664573052190818, + "grad_norm": 2.2230428081308347, + "learning_rate": 2.6717666534570976e-06, + "loss": 0.3223, + "step": 4444 + }, + { + "epoch": 0.6647225960819501, + "grad_norm": 1.786876248438967, + "learning_rate": 2.6696236815722703e-06, + "loss": 0.2194, + "step": 4445 + }, + { + "epoch": 0.6648721399730821, + "grad_norm": 1.563808365448449, + "learning_rate": 2.6674812564161867e-06, + "loss": 0.2264, + "step": 4446 + }, + { + "epoch": 0.6650216838642141, + "grad_norm": 1.665370120699706, + "learning_rate": 2.6653393784914795e-06, + "loss": 0.2036, + "step": 4447 + }, + { + "epoch": 0.6651712277553462, + "grad_norm": 1.5909818001796514, + "learning_rate": 2.663198048300654e-06, + "loss": 0.2692, + "step": 4448 + }, + { + "epoch": 0.6653207716464783, + "grad_norm": 1.3241563095855466, + "learning_rate": 2.6610572663460888e-06, + "loss": 0.2268, + "step": 4449 + }, + { + "epoch": 0.6654703155376103, + "grad_norm": 1.4147371163461582, + "learning_rate": 2.6589170331300284e-06, + "loss": 0.2954, + "step": 4450 + }, + { + "epoch": 0.6656198594287424, + "grad_norm": 1.290890230215134, + "learning_rate": 2.6567773491545916e-06, + "loss": 0.1932, + "step": 4451 + }, + { + "epoch": 0.6657694033198743, + "grad_norm": 1.328536295316399, + "learning_rate": 2.654638214921773e-06, + "loss": 0.2945, + "step": 4452 + }, + { + "epoch": 0.6659189472110064, + "grad_norm": 1.4691191878880958, + "learning_rate": 2.6524996309334326e-06, + "loss": 0.1881, + "step": 4453 + }, + { + "epoch": 0.6660684911021385, + "grad_norm": 1.9846992941067327, + "learning_rate": 2.650361597691305e-06, + "loss": 0.2611, + "step": 4454 + }, + { + "epoch": 0.6662180349932705, + "grad_norm": 1.1002284284226602, + "learning_rate": 2.6482241156969885e-06, + "loss": 0.143, + "step": 4455 + }, + { + "epoch": 0.6663675788844026, + "grad_norm": 1.6582016063094627, + "learning_rate": 2.6460871854519594e-06, + "loss": 0.4064, + "step": 4456 + }, + { + "epoch": 0.6665171227755347, + "grad_norm": 1.9724808152248507, + "learning_rate": 2.643950807457562e-06, + "loss": 0.2726, + "step": 4457 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.6258299157165212, + "learning_rate": 2.6418149822150115e-06, + "loss": 0.3193, + "step": 4458 + }, + { + "epoch": 0.6668162105577987, + "grad_norm": 1.4247436876423234, + "learning_rate": 2.6396797102253944e-06, + "loss": 0.1605, + "step": 4459 + }, + { + "epoch": 0.6669657544489308, + "grad_norm": 1.6721232488411581, + "learning_rate": 2.6375449919896627e-06, + "loss": 0.3464, + "step": 4460 + }, + { + "epoch": 0.6671152983400628, + "grad_norm": 2.9391451297239892, + "learning_rate": 2.6354108280086445e-06, + "loss": 0.2093, + "step": 4461 + }, + { + "epoch": 0.6672648422311949, + "grad_norm": 1.6599264142878403, + "learning_rate": 2.6332772187830337e-06, + "loss": 0.4938, + "step": 4462 + }, + { + "epoch": 0.667414386122327, + "grad_norm": 1.4217088492225147, + "learning_rate": 2.6311441648133962e-06, + "loss": 0.2503, + "step": 4463 + }, + { + "epoch": 0.6675639300134589, + "grad_norm": 1.1416005231658408, + "learning_rate": 2.6290116666001676e-06, + "loss": 0.1838, + "step": 4464 + }, + { + "epoch": 0.667713473904591, + "grad_norm": 1.2704339969898428, + "learning_rate": 2.626879724643652e-06, + "loss": 0.3511, + "step": 4465 + }, + { + "epoch": 0.667863017795723, + "grad_norm": 1.819552430700748, + "learning_rate": 2.6247483394440233e-06, + "loss": 0.2264, + "step": 4466 + }, + { + "epoch": 0.6680125616868551, + "grad_norm": 1.2833183620041992, + "learning_rate": 2.622617511501325e-06, + "loss": 0.3377, + "step": 4467 + }, + { + "epoch": 0.6681621055779872, + "grad_norm": 1.6275093986896139, + "learning_rate": 2.6204872413154714e-06, + "loss": 0.3522, + "step": 4468 + }, + { + "epoch": 0.6683116494691191, + "grad_norm": 1.253423027263163, + "learning_rate": 2.618357529386244e-06, + "loss": 0.2172, + "step": 4469 + }, + { + "epoch": 0.6684611933602512, + "grad_norm": 1.5980094854941072, + "learning_rate": 2.6162283762132923e-06, + "loss": 0.2653, + "step": 4470 + }, + { + "epoch": 0.6686107372513833, + "grad_norm": 1.403119879248096, + "learning_rate": 2.6140997822961358e-06, + "loss": 0.1532, + "step": 4471 + }, + { + "epoch": 0.6687602811425153, + "grad_norm": 1.4562304037028075, + "learning_rate": 2.6119717481341642e-06, + "loss": 0.2866, + "step": 4472 + }, + { + "epoch": 0.6689098250336474, + "grad_norm": 1.6249281060380736, + "learning_rate": 2.609844274226635e-06, + "loss": 0.3281, + "step": 4473 + }, + { + "epoch": 0.6690593689247795, + "grad_norm": 1.5803890747488134, + "learning_rate": 2.6077173610726757e-06, + "loss": 0.3632, + "step": 4474 + }, + { + "epoch": 0.6692089128159114, + "grad_norm": 1.1776060853740318, + "learning_rate": 2.6055910091712772e-06, + "loss": 0.1556, + "step": 4475 + }, + { + "epoch": 0.6693584567070435, + "grad_norm": 1.5261404365716826, + "learning_rate": 2.603465219021304e-06, + "loss": 0.2003, + "step": 4476 + }, + { + "epoch": 0.6695080005981756, + "grad_norm": 1.947084837453816, + "learning_rate": 2.601339991121486e-06, + "loss": 0.4741, + "step": 4477 + }, + { + "epoch": 0.6696575444893076, + "grad_norm": 1.4616592429774347, + "learning_rate": 2.599215325970423e-06, + "loss": 0.2207, + "step": 4478 + }, + { + "epoch": 0.6698070883804397, + "grad_norm": 1.5158163962493627, + "learning_rate": 2.5970912240665815e-06, + "loss": 0.1717, + "step": 4479 + }, + { + "epoch": 0.6699566322715718, + "grad_norm": 1.3272386482349343, + "learning_rate": 2.5949676859082973e-06, + "loss": 0.2796, + "step": 4480 + }, + { + "epoch": 0.6701061761627037, + "grad_norm": 1.1894607731615547, + "learning_rate": 2.5928447119937717e-06, + "loss": 0.1955, + "step": 4481 + }, + { + "epoch": 0.6702557200538358, + "grad_norm": 1.9252345719407216, + "learning_rate": 2.590722302821075e-06, + "loss": 0.4252, + "step": 4482 + }, + { + "epoch": 0.6704052639449678, + "grad_norm": 1.1239347712911045, + "learning_rate": 2.5886004588881452e-06, + "loss": 0.1884, + "step": 4483 + }, + { + "epoch": 0.6705548078360999, + "grad_norm": 1.6982575120283836, + "learning_rate": 2.5864791806927904e-06, + "loss": 0.3321, + "step": 4484 + }, + { + "epoch": 0.670704351727232, + "grad_norm": 1.5015928793800224, + "learning_rate": 2.584358468732676e-06, + "loss": 0.218, + "step": 4485 + }, + { + "epoch": 0.670853895618364, + "grad_norm": 1.4247217029985146, + "learning_rate": 2.5822383235053464e-06, + "loss": 0.3234, + "step": 4486 + }, + { + "epoch": 0.671003439509496, + "grad_norm": 1.3017682120157468, + "learning_rate": 2.5801187455082064e-06, + "loss": 0.2223, + "step": 4487 + }, + { + "epoch": 0.6711529834006281, + "grad_norm": 1.3499705028843545, + "learning_rate": 2.577999735238531e-06, + "loss": 0.2955, + "step": 4488 + }, + { + "epoch": 0.6713025272917601, + "grad_norm": 1.3330957656181952, + "learning_rate": 2.5758812931934606e-06, + "loss": 0.1952, + "step": 4489 + }, + { + "epoch": 0.6714520711828922, + "grad_norm": 1.6714305866356522, + "learning_rate": 2.573763419869999e-06, + "loss": 0.303, + "step": 4490 + }, + { + "epoch": 0.6716016150740243, + "grad_norm": 1.6228094558496275, + "learning_rate": 2.5716461157650217e-06, + "loss": 0.2104, + "step": 4491 + }, + { + "epoch": 0.6717511589651562, + "grad_norm": 1.5380762009085422, + "learning_rate": 2.5695293813752664e-06, + "loss": 0.1895, + "step": 4492 + }, + { + "epoch": 0.6719007028562883, + "grad_norm": 1.615717823601208, + "learning_rate": 2.5674132171973444e-06, + "loss": 0.449, + "step": 4493 + }, + { + "epoch": 0.6720502467474204, + "grad_norm": 1.7981929838198951, + "learning_rate": 2.565297623727726e-06, + "loss": 0.437, + "step": 4494 + }, + { + "epoch": 0.6721997906385524, + "grad_norm": 1.6339389985322836, + "learning_rate": 2.5631826014627483e-06, + "loss": 0.2037, + "step": 4495 + }, + { + "epoch": 0.6723493345296845, + "grad_norm": 1.5686916032689824, + "learning_rate": 2.561068150898616e-06, + "loss": 0.4212, + "step": 4496 + }, + { + "epoch": 0.6724988784208165, + "grad_norm": 1.4778132137047602, + "learning_rate": 2.558954272531401e-06, + "loss": 0.2207, + "step": 4497 + }, + { + "epoch": 0.6726484223119485, + "grad_norm": 2.4730464271902917, + "learning_rate": 2.556840966857038e-06, + "loss": 0.369, + "step": 4498 + }, + { + "epoch": 0.6727979662030806, + "grad_norm": 1.1173307437030382, + "learning_rate": 2.554728234371333e-06, + "loss": 0.1953, + "step": 4499 + }, + { + "epoch": 0.6729475100942126, + "grad_norm": 1.2102968474734468, + "learning_rate": 2.5526160755699476e-06, + "loss": 0.1725, + "step": 4500 + }, + { + "epoch": 0.6730970539853447, + "grad_norm": 1.841005221927744, + "learning_rate": 2.5505044909484172e-06, + "loss": 0.4732, + "step": 4501 + }, + { + "epoch": 0.6732465978764768, + "grad_norm": 1.6515349173791407, + "learning_rate": 2.54839348100214e-06, + "loss": 0.3197, + "step": 4502 + }, + { + "epoch": 0.6733961417676088, + "grad_norm": 1.5072986883603168, + "learning_rate": 2.5462830462263787e-06, + "loss": 0.4328, + "step": 4503 + }, + { + "epoch": 0.6735456856587408, + "grad_norm": 1.880239085336064, + "learning_rate": 2.5441731871162633e-06, + "loss": 0.476, + "step": 4504 + }, + { + "epoch": 0.6736952295498729, + "grad_norm": 1.7936220910991645, + "learning_rate": 2.542063904166785e-06, + "loss": 0.4094, + "step": 4505 + }, + { + "epoch": 0.6738447734410049, + "grad_norm": 1.4488223065722017, + "learning_rate": 2.5399551978728033e-06, + "loss": 0.2158, + "step": 4506 + }, + { + "epoch": 0.673994317332137, + "grad_norm": 1.5085310088771018, + "learning_rate": 2.5378470687290397e-06, + "loss": 0.1753, + "step": 4507 + }, + { + "epoch": 0.6741438612232691, + "grad_norm": 1.517874631059701, + "learning_rate": 2.535739517230083e-06, + "loss": 0.325, + "step": 4508 + }, + { + "epoch": 0.6742934051144011, + "grad_norm": 1.7195395780133773, + "learning_rate": 2.5336325438703863e-06, + "loss": 0.2393, + "step": 4509 + }, + { + "epoch": 0.6744429490055331, + "grad_norm": 1.3387517983694313, + "learning_rate": 2.531526149144262e-06, + "loss": 0.3497, + "step": 4510 + }, + { + "epoch": 0.6745924928966651, + "grad_norm": 1.95315712711614, + "learning_rate": 2.5294203335458935e-06, + "loss": 0.4593, + "step": 4511 + }, + { + "epoch": 0.6747420367877972, + "grad_norm": 0.9559347391135435, + "learning_rate": 2.527315097569324e-06, + "loss": 0.1639, + "step": 4512 + }, + { + "epoch": 0.6748915806789293, + "grad_norm": 1.414032582762148, + "learning_rate": 2.5252104417084626e-06, + "loss": 0.2025, + "step": 4513 + }, + { + "epoch": 0.6750411245700613, + "grad_norm": 1.566845839312385, + "learning_rate": 2.523106366457082e-06, + "loss": 0.1844, + "step": 4514 + }, + { + "epoch": 0.6751906684611934, + "grad_norm": 1.9898672991386208, + "learning_rate": 2.521002872308821e-06, + "loss": 0.4331, + "step": 4515 + }, + { + "epoch": 0.6753402123523254, + "grad_norm": 1.7100230390337947, + "learning_rate": 2.518899959757175e-06, + "loss": 0.1873, + "step": 4516 + }, + { + "epoch": 0.6754897562434574, + "grad_norm": 2.0525844021935935, + "learning_rate": 2.51679762929551e-06, + "loss": 0.2439, + "step": 4517 + }, + { + "epoch": 0.6756393001345895, + "grad_norm": 1.141829150628222, + "learning_rate": 2.514695881417052e-06, + "loss": 0.1951, + "step": 4518 + }, + { + "epoch": 0.6757888440257216, + "grad_norm": 1.8463419499999694, + "learning_rate": 2.5125947166148923e-06, + "loss": 0.3175, + "step": 4519 + }, + { + "epoch": 0.6759383879168536, + "grad_norm": 1.4468038571336173, + "learning_rate": 2.5104941353819847e-06, + "loss": 0.1468, + "step": 4520 + }, + { + "epoch": 0.6760879318079857, + "grad_norm": 1.840830955171049, + "learning_rate": 2.5083941382111442e-06, + "loss": 0.2451, + "step": 4521 + }, + { + "epoch": 0.6762374756991177, + "grad_norm": 1.2525028087295136, + "learning_rate": 2.506294725595052e-06, + "loss": 0.2028, + "step": 4522 + }, + { + "epoch": 0.6763870195902497, + "grad_norm": 1.6825655787031588, + "learning_rate": 2.50419589802625e-06, + "loss": 0.1967, + "step": 4523 + }, + { + "epoch": 0.6765365634813818, + "grad_norm": 2.0206736390111946, + "learning_rate": 2.502097655997143e-06, + "loss": 0.253, + "step": 4524 + }, + { + "epoch": 0.6766861073725138, + "grad_norm": 2.290957099706866, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.2258, + "step": 4525 + }, + { + "epoch": 0.6768356512636459, + "grad_norm": 1.687469367349712, + "learning_rate": 2.4979029305269507e-06, + "loss": 0.4734, + "step": 4526 + }, + { + "epoch": 0.676985195154778, + "grad_norm": 3.698097562618829, + "learning_rate": 2.495806448069987e-06, + "loss": 0.1845, + "step": 4527 + }, + { + "epoch": 0.6771347390459099, + "grad_norm": 1.4094663924466104, + "learning_rate": 2.493710553120963e-06, + "loss": 0.187, + "step": 4528 + }, + { + "epoch": 0.677284282937042, + "grad_norm": 1.5924439969538318, + "learning_rate": 2.4916152461715987e-06, + "loss": 0.3456, + "step": 4529 + }, + { + "epoch": 0.6774338268281741, + "grad_norm": 1.717909780950052, + "learning_rate": 2.4895205277134733e-06, + "loss": 0.1999, + "step": 4530 + }, + { + "epoch": 0.6775833707193061, + "grad_norm": 3.204725423929768, + "learning_rate": 2.4874263982380242e-06, + "loss": 0.2913, + "step": 4531 + }, + { + "epoch": 0.6777329146104382, + "grad_norm": 1.98394915749841, + "learning_rate": 2.485332858236555e-06, + "loss": 0.2699, + "step": 4532 + }, + { + "epoch": 0.6778824585015703, + "grad_norm": 2.1405017009782705, + "learning_rate": 2.483239908200234e-06, + "loss": 0.3382, + "step": 4533 + }, + { + "epoch": 0.6780320023927022, + "grad_norm": 1.733333956096687, + "learning_rate": 2.4811475486200852e-06, + "loss": 0.1584, + "step": 4534 + }, + { + "epoch": 0.6781815462838343, + "grad_norm": 1.2434392026777898, + "learning_rate": 2.479055779986999e-06, + "loss": 0.1823, + "step": 4535 + }, + { + "epoch": 0.6783310901749664, + "grad_norm": 1.7956490190481578, + "learning_rate": 2.47696460279172e-06, + "loss": 0.3989, + "step": 4536 + }, + { + "epoch": 0.6784806340660984, + "grad_norm": 2.310256294816532, + "learning_rate": 2.4748740175248605e-06, + "loss": 0.6357, + "step": 4537 + }, + { + "epoch": 0.6786301779572305, + "grad_norm": 1.2356897547091927, + "learning_rate": 2.472784024676892e-06, + "loss": 0.2037, + "step": 4538 + }, + { + "epoch": 0.6787797218483625, + "grad_norm": 1.4315797654938371, + "learning_rate": 2.4706946247381468e-06, + "loss": 0.2856, + "step": 4539 + }, + { + "epoch": 0.6789292657394945, + "grad_norm": 1.9722002314369849, + "learning_rate": 2.4686058181988208e-06, + "loss": 0.4924, + "step": 4540 + }, + { + "epoch": 0.6790788096306266, + "grad_norm": 1.4626799815033218, + "learning_rate": 2.4665176055489636e-06, + "loss": 0.286, + "step": 4541 + }, + { + "epoch": 0.6792283535217586, + "grad_norm": 1.865826625567713, + "learning_rate": 2.4644299872784927e-06, + "loss": 0.3587, + "step": 4542 + }, + { + "epoch": 0.6793778974128907, + "grad_norm": 1.8507709461020339, + "learning_rate": 2.4623429638771825e-06, + "loss": 0.5021, + "step": 4543 + }, + { + "epoch": 0.6795274413040228, + "grad_norm": 1.8053822274235412, + "learning_rate": 2.4602565358346696e-06, + "loss": 0.3932, + "step": 4544 + }, + { + "epoch": 0.6796769851951547, + "grad_norm": 1.96415152215224, + "learning_rate": 2.4581707036404494e-06, + "loss": 0.2108, + "step": 4545 + }, + { + "epoch": 0.6798265290862868, + "grad_norm": 2.1231248520162427, + "learning_rate": 2.456085467783879e-06, + "loss": 0.3537, + "step": 4546 + }, + { + "epoch": 0.6799760729774189, + "grad_norm": 1.658230950938692, + "learning_rate": 2.4540008287541746e-06, + "loss": 0.3203, + "step": 4547 + }, + { + "epoch": 0.6801256168685509, + "grad_norm": 2.1309611630975507, + "learning_rate": 2.4519167870404126e-06, + "loss": 0.3312, + "step": 4548 + }, + { + "epoch": 0.680275160759683, + "grad_norm": 1.670714314146571, + "learning_rate": 2.4498333431315287e-06, + "loss": 0.1994, + "step": 4549 + }, + { + "epoch": 0.6804247046508151, + "grad_norm": 1.5024268565249808, + "learning_rate": 2.447750497516321e-06, + "loss": 0.2307, + "step": 4550 + }, + { + "epoch": 0.680574248541947, + "grad_norm": 2.242283862233074, + "learning_rate": 2.445668250683443e-06, + "loss": 0.4919, + "step": 4551 + }, + { + "epoch": 0.6807237924330791, + "grad_norm": 1.421126923428181, + "learning_rate": 2.4435866031214095e-06, + "loss": 0.2254, + "step": 4552 + }, + { + "epoch": 0.6808733363242112, + "grad_norm": 1.4927463259013674, + "learning_rate": 2.4415055553185955e-06, + "loss": 0.3069, + "step": 4553 + }, + { + "epoch": 0.6810228802153432, + "grad_norm": 1.2998806696923215, + "learning_rate": 2.4394251077632364e-06, + "loss": 0.1615, + "step": 4554 + }, + { + "epoch": 0.6811724241064753, + "grad_norm": 1.146805911490797, + "learning_rate": 2.4373452609434255e-06, + "loss": 0.1684, + "step": 4555 + }, + { + "epoch": 0.6813219679976072, + "grad_norm": 1.7713961216520178, + "learning_rate": 2.435266015347112e-06, + "loss": 0.4814, + "step": 4556 + }, + { + "epoch": 0.6814715118887393, + "grad_norm": 1.4120882911216142, + "learning_rate": 2.4331873714621084e-06, + "loss": 0.2663, + "step": 4557 + }, + { + "epoch": 0.6816210557798714, + "grad_norm": 1.8178671099844292, + "learning_rate": 2.431109329776085e-06, + "loss": 0.3536, + "step": 4558 + }, + { + "epoch": 0.6817705996710034, + "grad_norm": 1.896015073812396, + "learning_rate": 2.4290318907765698e-06, + "loss": 0.2162, + "step": 4559 + }, + { + "epoch": 0.6819201435621355, + "grad_norm": 1.549976332160772, + "learning_rate": 2.4269550549509504e-06, + "loss": 0.2146, + "step": 4560 + }, + { + "epoch": 0.6820696874532676, + "grad_norm": 1.7333003400526625, + "learning_rate": 2.424878822786473e-06, + "loss": 0.3507, + "step": 4561 + }, + { + "epoch": 0.6822192313443995, + "grad_norm": 2.080118048199713, + "learning_rate": 2.4228031947702404e-06, + "loss": 0.4497, + "step": 4562 + }, + { + "epoch": 0.6823687752355316, + "grad_norm": 1.3144787761798393, + "learning_rate": 2.420728171389216e-06, + "loss": 0.1721, + "step": 4563 + }, + { + "epoch": 0.6825183191266637, + "grad_norm": 1.9989255760998235, + "learning_rate": 2.41865375313022e-06, + "loss": 0.3456, + "step": 4564 + }, + { + "epoch": 0.6826678630177957, + "grad_norm": 2.1096027039438416, + "learning_rate": 2.4165799404799326e-06, + "loss": 0.2227, + "step": 4565 + }, + { + "epoch": 0.6828174069089278, + "grad_norm": 1.7404558488234287, + "learning_rate": 2.4145067339248872e-06, + "loss": 0.1725, + "step": 4566 + }, + { + "epoch": 0.6829669508000599, + "grad_norm": 1.5383538100690357, + "learning_rate": 2.412434133951479e-06, + "loss": 0.1495, + "step": 4567 + }, + { + "epoch": 0.6831164946911918, + "grad_norm": 1.8764531226763614, + "learning_rate": 2.4103621410459605e-06, + "loss": 0.2587, + "step": 4568 + }, + { + "epoch": 0.6832660385823239, + "grad_norm": 1.4443894712316598, + "learning_rate": 2.4082907556944407e-06, + "loss": 0.1567, + "step": 4569 + }, + { + "epoch": 0.6834155824734559, + "grad_norm": 2.045835653356828, + "learning_rate": 2.4062199783828887e-06, + "loss": 0.5029, + "step": 4570 + }, + { + "epoch": 0.683565126364588, + "grad_norm": 1.7592933563003443, + "learning_rate": 2.4041498095971253e-06, + "loss": 0.2877, + "step": 4571 + }, + { + "epoch": 0.6837146702557201, + "grad_norm": 1.8238984261708266, + "learning_rate": 2.4020802498228333e-06, + "loss": 0.5675, + "step": 4572 + }, + { + "epoch": 0.6838642141468521, + "grad_norm": 1.2044384462662545, + "learning_rate": 2.4000112995455505e-06, + "loss": 0.1853, + "step": 4573 + }, + { + "epoch": 0.6840137580379841, + "grad_norm": 1.9777223930621288, + "learning_rate": 2.397942959250676e-06, + "loss": 0.1935, + "step": 4574 + }, + { + "epoch": 0.6841633019291162, + "grad_norm": 1.6845231378556524, + "learning_rate": 2.395875229423461e-06, + "loss": 0.3364, + "step": 4575 + }, + { + "epoch": 0.6843128458202482, + "grad_norm": 2.1285112381986346, + "learning_rate": 2.393808110549013e-06, + "loss": 0.6261, + "step": 4576 + }, + { + "epoch": 0.6844623897113803, + "grad_norm": 1.7401607339435936, + "learning_rate": 2.3917416031122994e-06, + "loss": 0.3315, + "step": 4577 + }, + { + "epoch": 0.6846119336025124, + "grad_norm": 1.6390547094896344, + "learning_rate": 2.3896757075981415e-06, + "loss": 0.3245, + "step": 4578 + }, + { + "epoch": 0.6847614774936444, + "grad_norm": 1.6121215254166528, + "learning_rate": 2.3876104244912197e-06, + "loss": 0.4402, + "step": 4579 + }, + { + "epoch": 0.6849110213847764, + "grad_norm": 1.5197600998932719, + "learning_rate": 2.3855457542760705e-06, + "loss": 0.3134, + "step": 4580 + }, + { + "epoch": 0.6850605652759085, + "grad_norm": 1.365469861766198, + "learning_rate": 2.3834816974370823e-06, + "loss": 0.2025, + "step": 4581 + }, + { + "epoch": 0.6852101091670405, + "grad_norm": 2.206570913193162, + "learning_rate": 2.381418254458504e-06, + "loss": 0.2094, + "step": 4582 + }, + { + "epoch": 0.6853596530581726, + "grad_norm": 2.179403006196204, + "learning_rate": 2.37935542582444e-06, + "loss": 0.2479, + "step": 4583 + }, + { + "epoch": 0.6855091969493046, + "grad_norm": 1.2681576531108862, + "learning_rate": 2.3772932120188486e-06, + "loss": 0.2015, + "step": 4584 + }, + { + "epoch": 0.6856587408404367, + "grad_norm": 1.6301619906028688, + "learning_rate": 2.3752316135255453e-06, + "loss": 0.3426, + "step": 4585 + }, + { + "epoch": 0.6858082847315687, + "grad_norm": 1.8368572690743332, + "learning_rate": 2.373170630828202e-06, + "loss": 0.3368, + "step": 4586 + }, + { + "epoch": 0.6859578286227007, + "grad_norm": 1.5036958968449223, + "learning_rate": 2.3711102644103447e-06, + "loss": 0.2787, + "step": 4587 + }, + { + "epoch": 0.6861073725138328, + "grad_norm": 1.576871903212716, + "learning_rate": 2.3690505147553537e-06, + "loss": 0.326, + "step": 4588 + }, + { + "epoch": 0.6862569164049649, + "grad_norm": 2.1317550217444183, + "learning_rate": 2.3669913823464685e-06, + "loss": 0.3305, + "step": 4589 + }, + { + "epoch": 0.6864064602960969, + "grad_norm": 2.0013226069777748, + "learning_rate": 2.3649328676667817e-06, + "loss": 0.2133, + "step": 4590 + }, + { + "epoch": 0.686556004187229, + "grad_norm": 3.0309540515640054, + "learning_rate": 2.362874971199237e-06, + "loss": 0.5016, + "step": 4591 + }, + { + "epoch": 0.686705548078361, + "grad_norm": 1.698566386079889, + "learning_rate": 2.3608176934266395e-06, + "loss": 0.3094, + "step": 4592 + }, + { + "epoch": 0.686855091969493, + "grad_norm": 1.1218626097364182, + "learning_rate": 2.358761034831646e-06, + "loss": 0.1704, + "step": 4593 + }, + { + "epoch": 0.6870046358606251, + "grad_norm": 1.7480987711613902, + "learning_rate": 2.356704995896768e-06, + "loss": 0.2841, + "step": 4594 + }, + { + "epoch": 0.6871541797517572, + "grad_norm": 1.9320402744451117, + "learning_rate": 2.3546495771043744e-06, + "loss": 0.2722, + "step": 4595 + }, + { + "epoch": 0.6873037236428892, + "grad_norm": 1.5760126092513453, + "learning_rate": 2.352594778936682e-06, + "loss": 0.2132, + "step": 4596 + }, + { + "epoch": 0.6874532675340213, + "grad_norm": 1.504536632845543, + "learning_rate": 2.3505406018757694e-06, + "loss": 0.3034, + "step": 4597 + }, + { + "epoch": 0.6876028114251533, + "grad_norm": 2.0882148205501943, + "learning_rate": 2.348487046403564e-06, + "loss": 0.4912, + "step": 4598 + }, + { + "epoch": 0.6877523553162853, + "grad_norm": 1.6574096421206816, + "learning_rate": 2.3464341130018525e-06, + "loss": 0.1946, + "step": 4599 + }, + { + "epoch": 0.6879018992074174, + "grad_norm": 1.625508448944616, + "learning_rate": 2.344381802152271e-06, + "loss": 0.2829, + "step": 4600 + }, + { + "epoch": 0.6880514430985494, + "grad_norm": 2.006732033965754, + "learning_rate": 2.3423301143363112e-06, + "loss": 0.4434, + "step": 4601 + }, + { + "epoch": 0.6882009869896815, + "grad_norm": 1.2939033760616547, + "learning_rate": 2.34027905003532e-06, + "loss": 0.1808, + "step": 4602 + }, + { + "epoch": 0.6883505308808135, + "grad_norm": 1.3243865635408918, + "learning_rate": 2.3382286097304963e-06, + "loss": 0.17, + "step": 4603 + }, + { + "epoch": 0.6885000747719455, + "grad_norm": 1.999281798990057, + "learning_rate": 2.3361787939028933e-06, + "loss": 0.471, + "step": 4604 + }, + { + "epoch": 0.6886496186630776, + "grad_norm": 1.7744976857477415, + "learning_rate": 2.3341296030334177e-06, + "loss": 0.3231, + "step": 4605 + }, + { + "epoch": 0.6887991625542097, + "grad_norm": 1.6893906333699582, + "learning_rate": 2.3320810376028306e-06, + "loss": 0.1839, + "step": 4606 + }, + { + "epoch": 0.6889487064453417, + "grad_norm": 1.7226376910950774, + "learning_rate": 2.3300330980917423e-06, + "loss": 0.3491, + "step": 4607 + }, + { + "epoch": 0.6890982503364738, + "grad_norm": 1.4118718729748658, + "learning_rate": 2.3279857849806203e-06, + "loss": 0.3047, + "step": 4608 + }, + { + "epoch": 0.6892477942276058, + "grad_norm": 1.6701895872877535, + "learning_rate": 2.325939098749785e-06, + "loss": 0.2104, + "step": 4609 + }, + { + "epoch": 0.6893973381187378, + "grad_norm": 1.1844959534454789, + "learning_rate": 2.3238930398794067e-06, + "loss": 0.1852, + "step": 4610 + }, + { + "epoch": 0.6895468820098699, + "grad_norm": 1.7090626464358218, + "learning_rate": 2.321847608849515e-06, + "loss": 0.2402, + "step": 4611 + }, + { + "epoch": 0.689696425901002, + "grad_norm": 1.7427682977617915, + "learning_rate": 2.3198028061399824e-06, + "loss": 0.3506, + "step": 4612 + }, + { + "epoch": 0.689845969792134, + "grad_norm": 1.8109575667927527, + "learning_rate": 2.317758632230541e-06, + "loss": 0.3346, + "step": 4613 + }, + { + "epoch": 0.6899955136832661, + "grad_norm": 2.2380666713150004, + "learning_rate": 2.315715087600773e-06, + "loss": 0.5128, + "step": 4614 + }, + { + "epoch": 0.690145057574398, + "grad_norm": 1.5943030258047348, + "learning_rate": 2.3136721727301164e-06, + "loss": 0.3733, + "step": 4615 + }, + { + "epoch": 0.6902946014655301, + "grad_norm": 1.609221048306236, + "learning_rate": 2.3116298880978594e-06, + "loss": 0.3455, + "step": 4616 + }, + { + "epoch": 0.6904441453566622, + "grad_norm": 1.6369844098138138, + "learning_rate": 2.309588234183137e-06, + "loss": 0.2209, + "step": 4617 + }, + { + "epoch": 0.6905936892477942, + "grad_norm": 1.2426055048948337, + "learning_rate": 2.3075472114649438e-06, + "loss": 0.1828, + "step": 4618 + }, + { + "epoch": 0.6907432331389263, + "grad_norm": 1.1641756793179936, + "learning_rate": 2.3055068204221226e-06, + "loss": 0.1906, + "step": 4619 + }, + { + "epoch": 0.6908927770300584, + "grad_norm": 1.5002027327407788, + "learning_rate": 2.3034670615333693e-06, + "loss": 0.2173, + "step": 4620 + }, + { + "epoch": 0.6910423209211903, + "grad_norm": 1.7566844796442385, + "learning_rate": 2.301427935277233e-06, + "loss": 0.3071, + "step": 4621 + }, + { + "epoch": 0.6911918648123224, + "grad_norm": 1.1780204836628183, + "learning_rate": 2.299389442132108e-06, + "loss": 0.217, + "step": 4622 + }, + { + "epoch": 0.6913414087034545, + "grad_norm": 1.1814865815819897, + "learning_rate": 2.2973515825762464e-06, + "loss": 0.2224, + "step": 4623 + }, + { + "epoch": 0.6914909525945865, + "grad_norm": 1.6353908218378568, + "learning_rate": 2.2953143570877507e-06, + "loss": 0.1858, + "step": 4624 + }, + { + "epoch": 0.6916404964857186, + "grad_norm": 1.8174800119127352, + "learning_rate": 2.2932777661445726e-06, + "loss": 0.2081, + "step": 4625 + }, + { + "epoch": 0.6917900403768507, + "grad_norm": 1.4736221292865608, + "learning_rate": 2.291241810224516e-06, + "loss": 0.2021, + "step": 4626 + }, + { + "epoch": 0.6919395842679826, + "grad_norm": 1.5436710352914425, + "learning_rate": 2.289206489805236e-06, + "loss": 0.3133, + "step": 4627 + }, + { + "epoch": 0.6920891281591147, + "grad_norm": 1.6060115936001564, + "learning_rate": 2.287171805364238e-06, + "loss": 0.2044, + "step": 4628 + }, + { + "epoch": 0.6922386720502467, + "grad_norm": 1.2741827188445998, + "learning_rate": 2.2851377573788795e-06, + "loss": 0.3343, + "step": 4629 + }, + { + "epoch": 0.6923882159413788, + "grad_norm": 1.2549708146286533, + "learning_rate": 2.2831043463263674e-06, + "loss": 0.1389, + "step": 4630 + }, + { + "epoch": 0.6925377598325109, + "grad_norm": 1.4200924583299743, + "learning_rate": 2.2810715726837613e-06, + "loss": 0.205, + "step": 4631 + }, + { + "epoch": 0.6926873037236428, + "grad_norm": 1.1966057167582587, + "learning_rate": 2.2790394369279657e-06, + "loss": 0.1924, + "step": 4632 + }, + { + "epoch": 0.6928368476147749, + "grad_norm": 2.0421745842756533, + "learning_rate": 2.2770079395357404e-06, + "loss": 0.4616, + "step": 4633 + }, + { + "epoch": 0.692986391505907, + "grad_norm": 1.3679535926488904, + "learning_rate": 2.274977080983695e-06, + "loss": 0.1438, + "step": 4634 + }, + { + "epoch": 0.693135935397039, + "grad_norm": 1.4377322216709112, + "learning_rate": 2.272946861748289e-06, + "loss": 0.369, + "step": 4635 + }, + { + "epoch": 0.6932854792881711, + "grad_norm": 1.7226883891734146, + "learning_rate": 2.270917282305833e-06, + "loss": 0.3433, + "step": 4636 + }, + { + "epoch": 0.6934350231793032, + "grad_norm": 1.2758073680864919, + "learning_rate": 2.2688883431324808e-06, + "loss": 0.229, + "step": 4637 + }, + { + "epoch": 0.6935845670704351, + "grad_norm": 1.2156104155599643, + "learning_rate": 2.266860044704245e-06, + "loss": 0.1971, + "step": 4638 + }, + { + "epoch": 0.6937341109615672, + "grad_norm": 1.7207331660199756, + "learning_rate": 2.2648323874969824e-06, + "loss": 0.423, + "step": 4639 + }, + { + "epoch": 0.6938836548526993, + "grad_norm": 1.3030113714977887, + "learning_rate": 2.262805371986402e-06, + "loss": 0.2408, + "step": 4640 + }, + { + "epoch": 0.6940331987438313, + "grad_norm": 1.4113844940770655, + "learning_rate": 2.26077899864806e-06, + "loss": 0.2106, + "step": 4641 + }, + { + "epoch": 0.6941827426349634, + "grad_norm": 2.0974485482758363, + "learning_rate": 2.2587532679573644e-06, + "loss": 0.4824, + "step": 4642 + }, + { + "epoch": 0.6943322865260954, + "grad_norm": 1.531885185651406, + "learning_rate": 2.2567281803895696e-06, + "loss": 0.3369, + "step": 4643 + }, + { + "epoch": 0.6944818304172274, + "grad_norm": 1.4252330509985323, + "learning_rate": 2.2547037364197825e-06, + "loss": 0.3347, + "step": 4644 + }, + { + "epoch": 0.6946313743083595, + "grad_norm": 1.702316971805034, + "learning_rate": 2.252679936522956e-06, + "loss": 0.3386, + "step": 4645 + }, + { + "epoch": 0.6947809181994915, + "grad_norm": 1.3719010462493337, + "learning_rate": 2.250656781173895e-06, + "loss": 0.3806, + "step": 4646 + }, + { + "epoch": 0.6949304620906236, + "grad_norm": 1.3372620268377342, + "learning_rate": 2.248634270847248e-06, + "loss": 0.3181, + "step": 4647 + }, + { + "epoch": 0.6950800059817557, + "grad_norm": 1.6909543197914825, + "learning_rate": 2.246612406017517e-06, + "loss": 0.3417, + "step": 4648 + }, + { + "epoch": 0.6952295498728877, + "grad_norm": 1.8885107671906933, + "learning_rate": 2.2445911871590507e-06, + "loss": 0.2214, + "step": 4649 + }, + { + "epoch": 0.6953790937640197, + "grad_norm": 1.9377077129756617, + "learning_rate": 2.242570614746047e-06, + "loss": 0.2576, + "step": 4650 + }, + { + "epoch": 0.6955286376551518, + "grad_norm": 1.2269989622068174, + "learning_rate": 2.240550689252553e-06, + "loss": 0.1895, + "step": 4651 + }, + { + "epoch": 0.6956781815462838, + "grad_norm": 1.7420057321674778, + "learning_rate": 2.238531411152459e-06, + "loss": 0.3922, + "step": 4652 + }, + { + "epoch": 0.6958277254374159, + "grad_norm": 1.2689943322625534, + "learning_rate": 2.2365127809195096e-06, + "loss": 0.2144, + "step": 4653 + }, + { + "epoch": 0.695977269328548, + "grad_norm": 1.4521893544900446, + "learning_rate": 2.234494799027293e-06, + "loss": 0.3224, + "step": 4654 + }, + { + "epoch": 0.69612681321968, + "grad_norm": 1.3816305243737939, + "learning_rate": 2.2324774659492505e-06, + "loss": 0.2161, + "step": 4655 + }, + { + "epoch": 0.696276357110812, + "grad_norm": 1.7389754977060294, + "learning_rate": 2.230460782158668e-06, + "loss": 0.3868, + "step": 4656 + }, + { + "epoch": 0.6964259010019441, + "grad_norm": 1.1403573388410169, + "learning_rate": 2.2284447481286747e-06, + "loss": 0.157, + "step": 4657 + }, + { + "epoch": 0.6965754448930761, + "grad_norm": 1.6258223910169618, + "learning_rate": 2.2264293643322544e-06, + "loss": 0.2441, + "step": 4658 + }, + { + "epoch": 0.6967249887842082, + "grad_norm": 1.4946074012223682, + "learning_rate": 2.224414631242235e-06, + "loss": 0.2575, + "step": 4659 + }, + { + "epoch": 0.6968745326753402, + "grad_norm": 1.9173994792778464, + "learning_rate": 2.2224005493312922e-06, + "loss": 0.181, + "step": 4660 + }, + { + "epoch": 0.6970240765664723, + "grad_norm": 1.3932682055651664, + "learning_rate": 2.220387119071951e-06, + "loss": 0.1464, + "step": 4661 + }, + { + "epoch": 0.6971736204576043, + "grad_norm": 1.779921479622985, + "learning_rate": 2.2183743409365786e-06, + "loss": 0.4004, + "step": 4662 + }, + { + "epoch": 0.6973231643487363, + "grad_norm": 1.6183054740781828, + "learning_rate": 2.216362215397393e-06, + "loss": 0.2814, + "step": 4663 + }, + { + "epoch": 0.6974727082398684, + "grad_norm": 1.8066594550723725, + "learning_rate": 2.214350742926459e-06, + "loss": 0.1958, + "step": 4664 + }, + { + "epoch": 0.6976222521310005, + "grad_norm": 1.2700224996999203, + "learning_rate": 2.2123399239956864e-06, + "loss": 0.1792, + "step": 4665 + }, + { + "epoch": 0.6977717960221325, + "grad_norm": 1.31888606535632, + "learning_rate": 2.2103297590768334e-06, + "loss": 0.2179, + "step": 4666 + }, + { + "epoch": 0.6979213399132645, + "grad_norm": 1.4998521334563557, + "learning_rate": 2.2083202486415045e-06, + "loss": 0.4194, + "step": 4667 + }, + { + "epoch": 0.6980708838043966, + "grad_norm": 1.3738896535935858, + "learning_rate": 2.206311393161149e-06, + "loss": 0.1909, + "step": 4668 + }, + { + "epoch": 0.6982204276955286, + "grad_norm": 1.6437393216232281, + "learning_rate": 2.2043031931070652e-06, + "loss": 0.1955, + "step": 4669 + }, + { + "epoch": 0.6983699715866607, + "grad_norm": 1.4681419448028947, + "learning_rate": 2.202295648950395e-06, + "loss": 0.2907, + "step": 4670 + }, + { + "epoch": 0.6985195154777928, + "grad_norm": 1.6154368318665644, + "learning_rate": 2.20028876116213e-06, + "loss": 0.3833, + "step": 4671 + }, + { + "epoch": 0.6986690593689248, + "grad_norm": 1.4243786745580567, + "learning_rate": 2.1982825302131018e-06, + "loss": 0.1742, + "step": 4672 + }, + { + "epoch": 0.6988186032600568, + "grad_norm": 1.6890524438669836, + "learning_rate": 2.1962769565739926e-06, + "loss": 0.2357, + "step": 4673 + }, + { + "epoch": 0.6989681471511888, + "grad_norm": 1.19650494243045, + "learning_rate": 2.1942720407153305e-06, + "loss": 0.1925, + "step": 4674 + }, + { + "epoch": 0.6991176910423209, + "grad_norm": 1.3446356466305722, + "learning_rate": 2.1922677831074875e-06, + "loss": 0.1953, + "step": 4675 + }, + { + "epoch": 0.699267234933453, + "grad_norm": 1.65371932269424, + "learning_rate": 2.1902641842206827e-06, + "loss": 0.1712, + "step": 4676 + }, + { + "epoch": 0.699416778824585, + "grad_norm": 1.6193435228911113, + "learning_rate": 2.1882612445249778e-06, + "loss": 0.1903, + "step": 4677 + }, + { + "epoch": 0.6995663227157171, + "grad_norm": 1.8691890475100623, + "learning_rate": 2.1862589644902825e-06, + "loss": 0.4669, + "step": 4678 + }, + { + "epoch": 0.6997158666068491, + "grad_norm": 3.966671376799878, + "learning_rate": 2.184257344586351e-06, + "loss": 0.3711, + "step": 4679 + }, + { + "epoch": 0.6998654104979811, + "grad_norm": 1.525153768127254, + "learning_rate": 2.1822563852827827e-06, + "loss": 0.2788, + "step": 4680 + }, + { + "epoch": 0.7000149543891132, + "grad_norm": 1.4895632771384457, + "learning_rate": 2.1802560870490226e-06, + "loss": 0.2301, + "step": 4681 + }, + { + "epoch": 0.7001644982802453, + "grad_norm": 1.4192521773579143, + "learning_rate": 2.178256450354359e-06, + "loss": 0.2622, + "step": 4682 + }, + { + "epoch": 0.7003140421713773, + "grad_norm": 1.9599235390448408, + "learning_rate": 2.176257475667926e-06, + "loss": 0.3301, + "step": 4683 + }, + { + "epoch": 0.7004635860625094, + "grad_norm": 1.7477260785398092, + "learning_rate": 2.174259163458703e-06, + "loss": 0.3082, + "step": 4684 + }, + { + "epoch": 0.7006131299536414, + "grad_norm": 1.5200763401354904, + "learning_rate": 2.172261514195513e-06, + "loss": 0.194, + "step": 4685 + }, + { + "epoch": 0.7007626738447734, + "grad_norm": 1.7098216420009527, + "learning_rate": 2.1702645283470238e-06, + "loss": 0.347, + "step": 4686 + }, + { + "epoch": 0.7009122177359055, + "grad_norm": 2.0044185139827784, + "learning_rate": 2.168268206381749e-06, + "loss": 0.4704, + "step": 4687 + }, + { + "epoch": 0.7010617616270375, + "grad_norm": 1.779151039553705, + "learning_rate": 2.1662725487680415e-06, + "loss": 0.1837, + "step": 4688 + }, + { + "epoch": 0.7012113055181696, + "grad_norm": 1.6643292477429161, + "learning_rate": 2.164277555974104e-06, + "loss": 0.1878, + "step": 4689 + }, + { + "epoch": 0.7013608494093017, + "grad_norm": 2.9446504668455784, + "learning_rate": 2.162283228467981e-06, + "loss": 0.2075, + "step": 4690 + }, + { + "epoch": 0.7015103933004336, + "grad_norm": 1.7449397256765202, + "learning_rate": 2.16028956671756e-06, + "loss": 0.2779, + "step": 4691 + }, + { + "epoch": 0.7016599371915657, + "grad_norm": 1.4132965394938541, + "learning_rate": 2.158296571190576e-06, + "loss": 0.2747, + "step": 4692 + }, + { + "epoch": 0.7018094810826978, + "grad_norm": 1.6241527925634025, + "learning_rate": 2.1563042423546014e-06, + "loss": 0.3598, + "step": 4693 + }, + { + "epoch": 0.7019590249738298, + "grad_norm": 1.9481740533296699, + "learning_rate": 2.154312580677057e-06, + "loss": 0.3221, + "step": 4694 + }, + { + "epoch": 0.7021085688649619, + "grad_norm": 2.0935699791166966, + "learning_rate": 2.1523215866252046e-06, + "loss": 0.3407, + "step": 4695 + }, + { + "epoch": 0.702258112756094, + "grad_norm": 1.6559635872881795, + "learning_rate": 2.1503312606661536e-06, + "loss": 0.319, + "step": 4696 + }, + { + "epoch": 0.7024076566472259, + "grad_norm": 1.179897182347415, + "learning_rate": 2.148341603266854e-06, + "loss": 0.1896, + "step": 4697 + }, + { + "epoch": 0.702557200538358, + "grad_norm": 1.4572695161455325, + "learning_rate": 2.146352614894095e-06, + "loss": 0.3342, + "step": 4698 + }, + { + "epoch": 0.7027067444294901, + "grad_norm": 1.5720481959154646, + "learning_rate": 2.1443642960145146e-06, + "loss": 0.3083, + "step": 4699 + }, + { + "epoch": 0.7028562883206221, + "grad_norm": 1.1533343663921902, + "learning_rate": 2.1423766470945904e-06, + "loss": 0.1564, + "step": 4700 + }, + { + "epoch": 0.7030058322117542, + "grad_norm": 1.3098024593490292, + "learning_rate": 2.1403896686006455e-06, + "loss": 0.1676, + "step": 4701 + }, + { + "epoch": 0.7031553761028861, + "grad_norm": 1.6963395027211619, + "learning_rate": 2.1384033609988446e-06, + "loss": 0.3673, + "step": 4702 + }, + { + "epoch": 0.7033049199940182, + "grad_norm": 1.3604451165819607, + "learning_rate": 2.136417724755192e-06, + "loss": 0.1528, + "step": 4703 + }, + { + "epoch": 0.7034544638851503, + "grad_norm": 1.701125090664159, + "learning_rate": 2.1344327603355386e-06, + "loss": 0.4854, + "step": 4704 + }, + { + "epoch": 0.7036040077762823, + "grad_norm": 2.0597156312410765, + "learning_rate": 2.132448468205576e-06, + "loss": 0.3941, + "step": 4705 + }, + { + "epoch": 0.7037535516674144, + "grad_norm": 1.9368210495078715, + "learning_rate": 2.1304648488308383e-06, + "loss": 0.4007, + "step": 4706 + }, + { + "epoch": 0.7039030955585465, + "grad_norm": 1.6767915245759573, + "learning_rate": 2.1284819026767016e-06, + "loss": 0.2381, + "step": 4707 + }, + { + "epoch": 0.7040526394496784, + "grad_norm": 1.3144913337675048, + "learning_rate": 2.126499630208385e-06, + "loss": 0.2306, + "step": 4708 + }, + { + "epoch": 0.7042021833408105, + "grad_norm": 1.5820996540492964, + "learning_rate": 2.1245180318909482e-06, + "loss": 0.2811, + "step": 4709 + }, + { + "epoch": 0.7043517272319426, + "grad_norm": 2.050004263326817, + "learning_rate": 2.1225371081892927e-06, + "loss": 0.5782, + "step": 4710 + }, + { + "epoch": 0.7045012711230746, + "grad_norm": 1.8213512788429163, + "learning_rate": 2.120556859568163e-06, + "loss": 0.5186, + "step": 4711 + }, + { + "epoch": 0.7046508150142067, + "grad_norm": 1.4587002417536927, + "learning_rate": 2.118577286492146e-06, + "loss": 0.2573, + "step": 4712 + }, + { + "epoch": 0.7048003589053388, + "grad_norm": 1.8259926937548316, + "learning_rate": 2.1165983894256647e-06, + "loss": 0.3384, + "step": 4713 + }, + { + "epoch": 0.7049499027964707, + "grad_norm": 1.5961581880480131, + "learning_rate": 2.1146201688329904e-06, + "loss": 0.2016, + "step": 4714 + }, + { + "epoch": 0.7050994466876028, + "grad_norm": 1.23548395103365, + "learning_rate": 2.1126426251782317e-06, + "loss": 0.2576, + "step": 4715 + }, + { + "epoch": 0.7052489905787349, + "grad_norm": 1.5061958421966601, + "learning_rate": 2.1106657589253395e-06, + "loss": 0.185, + "step": 4716 + }, + { + "epoch": 0.7053985344698669, + "grad_norm": 1.8046523829754955, + "learning_rate": 2.1086895705381076e-06, + "loss": 0.5278, + "step": 4717 + }, + { + "epoch": 0.705548078360999, + "grad_norm": 1.4351344755345183, + "learning_rate": 2.106714060480165e-06, + "loss": 0.2583, + "step": 4718 + }, + { + "epoch": 0.705697622252131, + "grad_norm": 1.5501057188835787, + "learning_rate": 2.1047392292149882e-06, + "loss": 0.2195, + "step": 4719 + }, + { + "epoch": 0.705847166143263, + "grad_norm": 1.5774802245738384, + "learning_rate": 2.1027650772058907e-06, + "loss": 0.1787, + "step": 4720 + }, + { + "epoch": 0.7059967100343951, + "grad_norm": 1.8231536892793683, + "learning_rate": 2.100791604916027e-06, + "loss": 0.3842, + "step": 4721 + }, + { + "epoch": 0.7061462539255271, + "grad_norm": 1.6854332833675343, + "learning_rate": 2.098818812808394e-06, + "loss": 0.1943, + "step": 4722 + }, + { + "epoch": 0.7062957978166592, + "grad_norm": 1.6958455069696248, + "learning_rate": 2.0968467013458278e-06, + "loss": 0.3186, + "step": 4723 + }, + { + "epoch": 0.7064453417077913, + "grad_norm": 1.4747719150923775, + "learning_rate": 2.0948752709910035e-06, + "loss": 0.2034, + "step": 4724 + }, + { + "epoch": 0.7065948855989233, + "grad_norm": 1.9794588768390642, + "learning_rate": 2.092904522206439e-06, + "loss": 0.324, + "step": 4725 + }, + { + "epoch": 0.7067444294900553, + "grad_norm": 1.6232444055400037, + "learning_rate": 2.09093445545449e-06, + "loss": 0.3518, + "step": 4726 + }, + { + "epoch": 0.7068939733811874, + "grad_norm": 2.2386200267330305, + "learning_rate": 2.088965071197355e-06, + "loss": 0.2243, + "step": 4727 + }, + { + "epoch": 0.7070435172723194, + "grad_norm": 1.5936315997405437, + "learning_rate": 2.0869963698970676e-06, + "loss": 0.1983, + "step": 4728 + }, + { + "epoch": 0.7071930611634515, + "grad_norm": 1.4775550208581971, + "learning_rate": 2.0850283520155052e-06, + "loss": 0.1618, + "step": 4729 + }, + { + "epoch": 0.7073426050545836, + "grad_norm": 1.9080994069269488, + "learning_rate": 2.0830610180143846e-06, + "loss": 0.4301, + "step": 4730 + }, + { + "epoch": 0.7074921489457155, + "grad_norm": 1.4333880898039384, + "learning_rate": 2.08109436835526e-06, + "loss": 0.3574, + "step": 4731 + }, + { + "epoch": 0.7076416928368476, + "grad_norm": 1.851506015060867, + "learning_rate": 2.0791284034995296e-06, + "loss": 0.418, + "step": 4732 + }, + { + "epoch": 0.7077912367279796, + "grad_norm": 1.9375351779574483, + "learning_rate": 2.0771631239084233e-06, + "loss": 0.4139, + "step": 4733 + }, + { + "epoch": 0.7079407806191117, + "grad_norm": 1.554772005707828, + "learning_rate": 2.075198530043016e-06, + "loss": 0.1786, + "step": 4734 + }, + { + "epoch": 0.7080903245102438, + "grad_norm": 1.75390264769866, + "learning_rate": 2.07323462236422e-06, + "loss": 0.4727, + "step": 4735 + }, + { + "epoch": 0.7082398684013758, + "grad_norm": 1.3694940891653622, + "learning_rate": 2.071271401332789e-06, + "loss": 0.4066, + "step": 4736 + }, + { + "epoch": 0.7083894122925078, + "grad_norm": 3.4800517006814067, + "learning_rate": 2.0693088674093146e-06, + "loss": 0.3289, + "step": 4737 + }, + { + "epoch": 0.7085389561836399, + "grad_norm": 1.7965126916254985, + "learning_rate": 2.0673470210542223e-06, + "loss": 0.4228, + "step": 4738 + }, + { + "epoch": 0.7086885000747719, + "grad_norm": 2.123641224815276, + "learning_rate": 2.0653858627277816e-06, + "loss": 0.2414, + "step": 4739 + }, + { + "epoch": 0.708838043965904, + "grad_norm": 1.7814274017768608, + "learning_rate": 2.0634253928900997e-06, + "loss": 0.1977, + "step": 4740 + }, + { + "epoch": 0.7089875878570361, + "grad_norm": 1.634731670651409, + "learning_rate": 2.0614656120011217e-06, + "loss": 0.3291, + "step": 4741 + }, + { + "epoch": 0.7091371317481681, + "grad_norm": 1.5987346246232053, + "learning_rate": 2.059506520520632e-06, + "loss": 0.4413, + "step": 4742 + }, + { + "epoch": 0.7092866756393001, + "grad_norm": 1.2557267316237954, + "learning_rate": 2.05754811890825e-06, + "loss": 0.1741, + "step": 4743 + }, + { + "epoch": 0.7094362195304322, + "grad_norm": 1.347421034036709, + "learning_rate": 2.055590407623437e-06, + "loss": 0.163, + "step": 4744 + }, + { + "epoch": 0.7095857634215642, + "grad_norm": 1.3672739274091292, + "learning_rate": 2.05363338712549e-06, + "loss": 0.2058, + "step": 4745 + }, + { + "epoch": 0.7097353073126963, + "grad_norm": 1.8020750267925179, + "learning_rate": 2.051677057873546e-06, + "loss": 0.2011, + "step": 4746 + }, + { + "epoch": 0.7098848512038283, + "grad_norm": 1.6621833497920957, + "learning_rate": 2.049721420326578e-06, + "loss": 0.4209, + "step": 4747 + }, + { + "epoch": 0.7100343950949604, + "grad_norm": 1.5679121222959431, + "learning_rate": 2.047766474943398e-06, + "loss": 0.3769, + "step": 4748 + }, + { + "epoch": 0.7101839389860924, + "grad_norm": 1.5967246886278332, + "learning_rate": 2.0458122221826542e-06, + "loss": 0.235, + "step": 4749 + }, + { + "epoch": 0.7103334828772244, + "grad_norm": 1.3136951590796198, + "learning_rate": 2.043858662502834e-06, + "loss": 0.2018, + "step": 4750 + }, + { + "epoch": 0.7104830267683565, + "grad_norm": 1.9399765437775247, + "learning_rate": 2.0419057963622613e-06, + "loss": 0.3376, + "step": 4751 + }, + { + "epoch": 0.7106325706594886, + "grad_norm": 1.3936869392113336, + "learning_rate": 2.039953624219098e-06, + "loss": 0.2123, + "step": 4752 + }, + { + "epoch": 0.7107821145506206, + "grad_norm": 1.486853182232922, + "learning_rate": 2.0380021465313406e-06, + "loss": 0.3239, + "step": 4753 + }, + { + "epoch": 0.7109316584417527, + "grad_norm": 1.7220891594328658, + "learning_rate": 2.0360513637568254e-06, + "loss": 0.2217, + "step": 4754 + }, + { + "epoch": 0.7110812023328847, + "grad_norm": 1.2988328861923717, + "learning_rate": 2.0341012763532243e-06, + "loss": 0.2319, + "step": 4755 + }, + { + "epoch": 0.7112307462240167, + "grad_norm": 1.7196881854807897, + "learning_rate": 2.0321518847780474e-06, + "loss": 0.3627, + "step": 4756 + }, + { + "epoch": 0.7113802901151488, + "grad_norm": 1.679935485808833, + "learning_rate": 2.0302031894886424e-06, + "loss": 0.1865, + "step": 4757 + }, + { + "epoch": 0.7115298340062809, + "grad_norm": 1.0926826923454207, + "learning_rate": 2.0282551909421886e-06, + "loss": 0.2034, + "step": 4758 + }, + { + "epoch": 0.7116793778974129, + "grad_norm": 0.9404517673265307, + "learning_rate": 2.0263078895957065e-06, + "loss": 0.1358, + "step": 4759 + }, + { + "epoch": 0.711828921788545, + "grad_norm": 2.3549408265695786, + "learning_rate": 2.0243612859060526e-06, + "loss": 0.5566, + "step": 4760 + }, + { + "epoch": 0.7119784656796769, + "grad_norm": 1.5894161675694791, + "learning_rate": 2.022415380329918e-06, + "loss": 0.1957, + "step": 4761 + }, + { + "epoch": 0.712128009570809, + "grad_norm": 1.486464360092802, + "learning_rate": 2.020470173323831e-06, + "loss": 0.3143, + "step": 4762 + }, + { + "epoch": 0.7122775534619411, + "grad_norm": 1.4606498955354394, + "learning_rate": 2.0185256653441563e-06, + "loss": 0.2465, + "step": 4763 + }, + { + "epoch": 0.7124270973530731, + "grad_norm": 2.090611910244458, + "learning_rate": 2.016581856847094e-06, + "loss": 0.2162, + "step": 4764 + }, + { + "epoch": 0.7125766412442052, + "grad_norm": 1.7807448909080528, + "learning_rate": 2.0146387482886804e-06, + "loss": 0.3518, + "step": 4765 + }, + { + "epoch": 0.7127261851353373, + "grad_norm": 1.6666354092011324, + "learning_rate": 2.012696340124788e-06, + "loss": 0.3949, + "step": 4766 + }, + { + "epoch": 0.7128757290264692, + "grad_norm": 1.5273494671570687, + "learning_rate": 2.0107546328111235e-06, + "loss": 0.2107, + "step": 4767 + }, + { + "epoch": 0.7130252729176013, + "grad_norm": 1.5632155877868674, + "learning_rate": 2.0088136268032323e-06, + "loss": 0.2155, + "step": 4768 + }, + { + "epoch": 0.7131748168087334, + "grad_norm": 1.4260160070005103, + "learning_rate": 2.0068733225564894e-06, + "loss": 0.3297, + "step": 4769 + }, + { + "epoch": 0.7133243606998654, + "grad_norm": 1.8836676685538953, + "learning_rate": 2.004933720526111e-06, + "loss": 0.4145, + "step": 4770 + }, + { + "epoch": 0.7134739045909975, + "grad_norm": 1.9231850260831183, + "learning_rate": 2.002994821167147e-06, + "loss": 0.6001, + "step": 4771 + }, + { + "epoch": 0.7136234484821296, + "grad_norm": 1.437024286586543, + "learning_rate": 2.00105662493448e-06, + "loss": 0.1722, + "step": 4772 + }, + { + "epoch": 0.7137729923732615, + "grad_norm": 1.6320153713595178, + "learning_rate": 1.9991191322828336e-06, + "loss": 0.3157, + "step": 4773 + }, + { + "epoch": 0.7139225362643936, + "grad_norm": 1.3855744704624602, + "learning_rate": 1.997182343666757e-06, + "loss": 0.1999, + "step": 4774 + }, + { + "epoch": 0.7140720801555257, + "grad_norm": 1.2355394755747025, + "learning_rate": 1.995246259540642e-06, + "loss": 0.1652, + "step": 4775 + }, + { + "epoch": 0.7142216240466577, + "grad_norm": 1.411860252766047, + "learning_rate": 1.9933108803587105e-06, + "loss": 0.2646, + "step": 4776 + }, + { + "epoch": 0.7143711679377898, + "grad_norm": 1.1769599835854019, + "learning_rate": 1.991376206575025e-06, + "loss": 0.1923, + "step": 4777 + }, + { + "epoch": 0.7145207118289217, + "grad_norm": 1.725317793317903, + "learning_rate": 1.989442238643478e-06, + "loss": 0.3052, + "step": 4778 + }, + { + "epoch": 0.7146702557200538, + "grad_norm": 1.2735072479571612, + "learning_rate": 1.987508977017794e-06, + "loss": 0.2956, + "step": 4779 + }, + { + "epoch": 0.7148197996111859, + "grad_norm": 1.3919949320239815, + "learning_rate": 1.9855764221515366e-06, + "loss": 0.2915, + "step": 4780 + }, + { + "epoch": 0.7149693435023179, + "grad_norm": 2.2506955383039444, + "learning_rate": 1.983644574498102e-06, + "loss": 0.6163, + "step": 4781 + }, + { + "epoch": 0.71511888739345, + "grad_norm": 1.6414432755678665, + "learning_rate": 1.9817134345107196e-06, + "loss": 0.366, + "step": 4782 + }, + { + "epoch": 0.7152684312845821, + "grad_norm": 2.14593769293468, + "learning_rate": 1.979783002642456e-06, + "loss": 0.3334, + "step": 4783 + }, + { + "epoch": 0.715417975175714, + "grad_norm": 1.5813323184211674, + "learning_rate": 1.9778532793462046e-06, + "loss": 0.3541, + "step": 4784 + }, + { + "epoch": 0.7155675190668461, + "grad_norm": 2.027730327933588, + "learning_rate": 1.9759242650746992e-06, + "loss": 0.4575, + "step": 4785 + }, + { + "epoch": 0.7157170629579782, + "grad_norm": 1.2625581234592238, + "learning_rate": 1.973995960280506e-06, + "loss": 0.2204, + "step": 4786 + }, + { + "epoch": 0.7158666068491102, + "grad_norm": 1.567565004211618, + "learning_rate": 1.972068365416023e-06, + "loss": 0.3029, + "step": 4787 + }, + { + "epoch": 0.7160161507402423, + "grad_norm": 1.370260561499202, + "learning_rate": 1.9701414809334825e-06, + "loss": 0.1973, + "step": 4788 + }, + { + "epoch": 0.7161656946313744, + "grad_norm": 1.6439443796052537, + "learning_rate": 1.96821530728495e-06, + "loss": 0.2937, + "step": 4789 + }, + { + "epoch": 0.7163152385225063, + "grad_norm": 1.7062807681205492, + "learning_rate": 1.9662898449223254e-06, + "loss": 0.4497, + "step": 4790 + }, + { + "epoch": 0.7164647824136384, + "grad_norm": 1.7415982092251505, + "learning_rate": 1.9643650942973403e-06, + "loss": 0.359, + "step": 4791 + }, + { + "epoch": 0.7166143263047704, + "grad_norm": 1.3681752043113142, + "learning_rate": 1.9624410558615596e-06, + "loss": 0.1842, + "step": 4792 + }, + { + "epoch": 0.7167638701959025, + "grad_norm": 1.2908931102706829, + "learning_rate": 1.960517730066383e-06, + "loss": 0.2988, + "step": 4793 + }, + { + "epoch": 0.7169134140870346, + "grad_norm": 2.4298641226498687, + "learning_rate": 1.9585951173630376e-06, + "loss": 0.3598, + "step": 4794 + }, + { + "epoch": 0.7170629579781665, + "grad_norm": 1.5797060515449204, + "learning_rate": 1.956673218202589e-06, + "loss": 0.3032, + "step": 4795 + }, + { + "epoch": 0.7172125018692986, + "grad_norm": 1.7219811152387416, + "learning_rate": 1.9547520330359327e-06, + "loss": 0.3501, + "step": 4796 + }, + { + "epoch": 0.7173620457604307, + "grad_norm": 1.9552098247974243, + "learning_rate": 1.952831562313798e-06, + "loss": 0.3774, + "step": 4797 + }, + { + "epoch": 0.7175115896515627, + "grad_norm": 1.210272332516529, + "learning_rate": 1.9509118064867467e-06, + "loss": 0.1831, + "step": 4798 + }, + { + "epoch": 0.7176611335426948, + "grad_norm": 1.8071744882668304, + "learning_rate": 1.9489927660051696e-06, + "loss": 0.4436, + "step": 4799 + }, + { + "epoch": 0.7178106774338269, + "grad_norm": 1.2996122015649152, + "learning_rate": 1.9470744413192928e-06, + "loss": 0.1861, + "step": 4800 + }, + { + "epoch": 0.7179602213249588, + "grad_norm": 4.014526497100866, + "learning_rate": 1.945156832879174e-06, + "loss": 0.2961, + "step": 4801 + }, + { + "epoch": 0.7181097652160909, + "grad_norm": 1.6479807487579767, + "learning_rate": 1.9432399411347035e-06, + "loss": 0.1841, + "step": 4802 + }, + { + "epoch": 0.718259309107223, + "grad_norm": 1.0653125198511966, + "learning_rate": 1.9413237665356018e-06, + "loss": 0.201, + "step": 4803 + }, + { + "epoch": 0.718408852998355, + "grad_norm": 1.624087003839024, + "learning_rate": 1.939408309531422e-06, + "loss": 0.3056, + "step": 4804 + }, + { + "epoch": 0.7185583968894871, + "grad_norm": 5.947475483516305, + "learning_rate": 1.9374935705715485e-06, + "loss": 0.5231, + "step": 4805 + }, + { + "epoch": 0.7187079407806191, + "grad_norm": 1.6483056300420804, + "learning_rate": 1.9355795501051984e-06, + "loss": 0.2196, + "step": 4806 + }, + { + "epoch": 0.7188574846717511, + "grad_norm": 1.3894340473073978, + "learning_rate": 1.933666248581418e-06, + "loss": 0.2128, + "step": 4807 + }, + { + "epoch": 0.7190070285628832, + "grad_norm": 1.0253204031014467, + "learning_rate": 1.9317536664490894e-06, + "loss": 0.1502, + "step": 4808 + }, + { + "epoch": 0.7191565724540152, + "grad_norm": 1.6364028006620832, + "learning_rate": 1.9298418041569185e-06, + "loss": 0.3203, + "step": 4809 + }, + { + "epoch": 0.7193061163451473, + "grad_norm": 0.9078633975338446, + "learning_rate": 1.9279306621534488e-06, + "loss": 0.1478, + "step": 4810 + }, + { + "epoch": 0.7194556602362794, + "grad_norm": 1.8363602253036038, + "learning_rate": 1.926020240887052e-06, + "loss": 0.4678, + "step": 4811 + }, + { + "epoch": 0.7196052041274114, + "grad_norm": 1.736846468850028, + "learning_rate": 1.924110540805932e-06, + "loss": 0.3426, + "step": 4812 + }, + { + "epoch": 0.7197547480185434, + "grad_norm": 1.2066282537831992, + "learning_rate": 1.922201562358124e-06, + "loss": 0.2292, + "step": 4813 + }, + { + "epoch": 0.7199042919096755, + "grad_norm": 1.7762661108753415, + "learning_rate": 1.9202933059914904e-06, + "loss": 0.4426, + "step": 4814 + }, + { + "epoch": 0.7200538358008075, + "grad_norm": 1.959880308308892, + "learning_rate": 1.918385772153727e-06, + "loss": 0.3761, + "step": 4815 + }, + { + "epoch": 0.7202033796919396, + "grad_norm": 1.4598193479538135, + "learning_rate": 1.91647896129236e-06, + "loss": 0.1963, + "step": 4816 + }, + { + "epoch": 0.7203529235830717, + "grad_norm": 1.395554848072498, + "learning_rate": 1.9145728738547444e-06, + "loss": 0.3177, + "step": 4817 + }, + { + "epoch": 0.7205024674742037, + "grad_norm": 1.511186762410378, + "learning_rate": 1.912667510288071e-06, + "loss": 0.3569, + "step": 4818 + }, + { + "epoch": 0.7206520113653357, + "grad_norm": 1.3490597700277405, + "learning_rate": 1.9107628710393515e-06, + "loss": 0.2194, + "step": 4819 + }, + { + "epoch": 0.7208015552564677, + "grad_norm": 1.6936008232835473, + "learning_rate": 1.908858956555435e-06, + "loss": 0.2214, + "step": 4820 + }, + { + "epoch": 0.7209510991475998, + "grad_norm": 1.9458510254771038, + "learning_rate": 1.9069557672829974e-06, + "loss": 0.4656, + "step": 4821 + }, + { + "epoch": 0.7211006430387319, + "grad_norm": 1.81801663865579, + "learning_rate": 1.9050533036685458e-06, + "loss": 0.4798, + "step": 4822 + }, + { + "epoch": 0.7212501869298639, + "grad_norm": 1.7906911175356348, + "learning_rate": 1.9031515661584181e-06, + "loss": 0.406, + "step": 4823 + }, + { + "epoch": 0.721399730820996, + "grad_norm": 1.1444791808017973, + "learning_rate": 1.9012505551987764e-06, + "loss": 0.1843, + "step": 4824 + }, + { + "epoch": 0.721549274712128, + "grad_norm": 1.4698689388624135, + "learning_rate": 1.8993502712356188e-06, + "loss": 0.2031, + "step": 4825 + }, + { + "epoch": 0.72169881860326, + "grad_norm": 2.2276819861424375, + "learning_rate": 1.8974507147147692e-06, + "loss": 0.4352, + "step": 4826 + }, + { + "epoch": 0.7218483624943921, + "grad_norm": 1.7631748973597112, + "learning_rate": 1.8955518860818823e-06, + "loss": 0.453, + "step": 4827 + }, + { + "epoch": 0.7219979063855242, + "grad_norm": 1.7994665448141303, + "learning_rate": 1.8936537857824422e-06, + "loss": 0.3077, + "step": 4828 + }, + { + "epoch": 0.7221474502766562, + "grad_norm": 1.8288572589190288, + "learning_rate": 1.89175641426176e-06, + "loss": 0.3802, + "step": 4829 + }, + { + "epoch": 0.7222969941677883, + "grad_norm": 1.7911243468121087, + "learning_rate": 1.889859771964979e-06, + "loss": 0.3058, + "step": 4830 + }, + { + "epoch": 0.7224465380589203, + "grad_norm": 1.8399567741455625, + "learning_rate": 1.8879638593370686e-06, + "loss": 0.459, + "step": 4831 + }, + { + "epoch": 0.7225960819500523, + "grad_norm": 4.102889088039247, + "learning_rate": 1.8860686768228293e-06, + "loss": 0.3128, + "step": 4832 + }, + { + "epoch": 0.7227456258411844, + "grad_norm": 1.865842023204259, + "learning_rate": 1.88417422486689e-06, + "loss": 0.6379, + "step": 4833 + }, + { + "epoch": 0.7228951697323165, + "grad_norm": 1.35548465730419, + "learning_rate": 1.8822805039137038e-06, + "loss": 0.229, + "step": 4834 + }, + { + "epoch": 0.7230447136234485, + "grad_norm": 1.588183686824576, + "learning_rate": 1.8803875144075578e-06, + "loss": 0.3465, + "step": 4835 + }, + { + "epoch": 0.7231942575145806, + "grad_norm": 2.0359646807808205, + "learning_rate": 1.8784952567925662e-06, + "loss": 0.5241, + "step": 4836 + }, + { + "epoch": 0.7233438014057125, + "grad_norm": 2.0625164688782207, + "learning_rate": 1.8766037315126705e-06, + "loss": 0.2283, + "step": 4837 + }, + { + "epoch": 0.7234933452968446, + "grad_norm": 1.4915802204223139, + "learning_rate": 1.8747129390116419e-06, + "loss": 0.3036, + "step": 4838 + }, + { + "epoch": 0.7236428891879767, + "grad_norm": 2.157183804310608, + "learning_rate": 1.872822879733076e-06, + "loss": 0.2216, + "step": 4839 + }, + { + "epoch": 0.7237924330791087, + "grad_norm": 1.4191482750591682, + "learning_rate": 1.8709335541204006e-06, + "loss": 0.2153, + "step": 4840 + }, + { + "epoch": 0.7239419769702408, + "grad_norm": 1.6683750685067826, + "learning_rate": 1.8690449626168688e-06, + "loss": 0.3868, + "step": 4841 + }, + { + "epoch": 0.7240915208613729, + "grad_norm": 1.0627132124708731, + "learning_rate": 1.867157105665563e-06, + "loss": 0.1548, + "step": 4842 + }, + { + "epoch": 0.7242410647525048, + "grad_norm": 1.4481188000664424, + "learning_rate": 1.8652699837093929e-06, + "loss": 0.3344, + "step": 4843 + }, + { + "epoch": 0.7243906086436369, + "grad_norm": 2.015374759547004, + "learning_rate": 1.8633835971910952e-06, + "loss": 0.1807, + "step": 4844 + }, + { + "epoch": 0.724540152534769, + "grad_norm": 1.0388650670740842, + "learning_rate": 1.8614979465532341e-06, + "loss": 0.1624, + "step": 4845 + }, + { + "epoch": 0.724689696425901, + "grad_norm": 1.6403608926999131, + "learning_rate": 1.8596130322382011e-06, + "loss": 0.2447, + "step": 4846 + }, + { + "epoch": 0.7248392403170331, + "grad_norm": 1.6039276522970418, + "learning_rate": 1.8577288546882167e-06, + "loss": 0.1841, + "step": 4847 + }, + { + "epoch": 0.7249887842081651, + "grad_norm": 1.8156004235972893, + "learning_rate": 1.8558454143453276e-06, + "loss": 0.5061, + "step": 4848 + }, + { + "epoch": 0.7251383280992971, + "grad_norm": 1.5083541020940234, + "learning_rate": 1.8539627116514036e-06, + "loss": 0.2733, + "step": 4849 + }, + { + "epoch": 0.7252878719904292, + "grad_norm": 1.245802761649543, + "learning_rate": 1.8520807470481472e-06, + "loss": 0.1944, + "step": 4850 + }, + { + "epoch": 0.7254374158815612, + "grad_norm": 1.8348700402906997, + "learning_rate": 1.8501995209770852e-06, + "loss": 0.4883, + "step": 4851 + }, + { + "epoch": 0.7255869597726933, + "grad_norm": 1.8471514030276397, + "learning_rate": 1.848319033879571e-06, + "loss": 0.4281, + "step": 4852 + }, + { + "epoch": 0.7257365036638254, + "grad_norm": 2.1838526290814144, + "learning_rate": 1.8464392861967857e-06, + "loss": 0.4389, + "step": 4853 + }, + { + "epoch": 0.7258860475549573, + "grad_norm": 1.7698329643054902, + "learning_rate": 1.8445602783697375e-06, + "loss": 0.3224, + "step": 4854 + }, + { + "epoch": 0.7260355914460894, + "grad_norm": 1.926272037635014, + "learning_rate": 1.8426820108392569e-06, + "loss": 0.483, + "step": 4855 + }, + { + "epoch": 0.7261851353372215, + "grad_norm": 1.4306210768559613, + "learning_rate": 1.8408044840460042e-06, + "loss": 0.1788, + "step": 4856 + }, + { + "epoch": 0.7263346792283535, + "grad_norm": 1.391732198372493, + "learning_rate": 1.8389276984304645e-06, + "loss": 0.3414, + "step": 4857 + }, + { + "epoch": 0.7264842231194856, + "grad_norm": 1.5167767238727428, + "learning_rate": 1.8370516544329525e-06, + "loss": 0.2735, + "step": 4858 + }, + { + "epoch": 0.7266337670106177, + "grad_norm": 1.9064808924563106, + "learning_rate": 1.8351763524936068e-06, + "loss": 0.5048, + "step": 4859 + }, + { + "epoch": 0.7267833109017496, + "grad_norm": 1.158958937877205, + "learning_rate": 1.8333017930523872e-06, + "loss": 0.221, + "step": 4860 + }, + { + "epoch": 0.7269328547928817, + "grad_norm": 1.8553248379762768, + "learning_rate": 1.8314279765490861e-06, + "loss": 0.3385, + "step": 4861 + }, + { + "epoch": 0.7270823986840138, + "grad_norm": 1.4634104519416975, + "learning_rate": 1.8295549034233185e-06, + "loss": 0.172, + "step": 4862 + }, + { + "epoch": 0.7272319425751458, + "grad_norm": 1.2767641928380464, + "learning_rate": 1.827682574114525e-06, + "loss": 0.2092, + "step": 4863 + }, + { + "epoch": 0.7273814864662779, + "grad_norm": 1.677784649203382, + "learning_rate": 1.8258109890619741e-06, + "loss": 0.274, + "step": 4864 + }, + { + "epoch": 0.7275310303574098, + "grad_norm": 1.6008734639734468, + "learning_rate": 1.8239401487047542e-06, + "loss": 0.1879, + "step": 4865 + }, + { + "epoch": 0.7276805742485419, + "grad_norm": 2.1299546556280275, + "learning_rate": 1.8220700534817843e-06, + "loss": 0.5216, + "step": 4866 + }, + { + "epoch": 0.727830118139674, + "grad_norm": 1.2805904807692716, + "learning_rate": 1.8202007038318065e-06, + "loss": 0.2046, + "step": 4867 + }, + { + "epoch": 0.727979662030806, + "grad_norm": 1.6699024043147148, + "learning_rate": 1.818332100193389e-06, + "loss": 0.2377, + "step": 4868 + }, + { + "epoch": 0.7281292059219381, + "grad_norm": 2.0378048844203205, + "learning_rate": 1.8164642430049235e-06, + "loss": 0.371, + "step": 4869 + }, + { + "epoch": 0.7282787498130702, + "grad_norm": 1.2611107675280688, + "learning_rate": 1.8145971327046274e-06, + "loss": 0.1743, + "step": 4870 + }, + { + "epoch": 0.7284282937042021, + "grad_norm": 2.8489240171235926, + "learning_rate": 1.812730769730543e-06, + "loss": 0.1427, + "step": 4871 + }, + { + "epoch": 0.7285778375953342, + "grad_norm": 1.8451784710253105, + "learning_rate": 1.8108651545205364e-06, + "loss": 0.209, + "step": 4872 + }, + { + "epoch": 0.7287273814864663, + "grad_norm": 2.050825866940483, + "learning_rate": 1.8090002875122998e-06, + "loss": 0.4463, + "step": 4873 + }, + { + "epoch": 0.7288769253775983, + "grad_norm": 1.525180680093273, + "learning_rate": 1.8071361691433504e-06, + "loss": 0.2058, + "step": 4874 + }, + { + "epoch": 0.7290264692687304, + "grad_norm": 1.8675724634253468, + "learning_rate": 1.8052727998510244e-06, + "loss": 0.2193, + "step": 4875 + }, + { + "epoch": 0.7291760131598625, + "grad_norm": 1.5555303292225289, + "learning_rate": 1.8034101800724891e-06, + "loss": 0.1661, + "step": 4876 + }, + { + "epoch": 0.7293255570509944, + "grad_norm": 2.071699548717637, + "learning_rate": 1.8015483102447313e-06, + "loss": 0.4313, + "step": 4877 + }, + { + "epoch": 0.7294751009421265, + "grad_norm": 2.0316685667135346, + "learning_rate": 1.7996871908045648e-06, + "loss": 0.1906, + "step": 4878 + }, + { + "epoch": 0.7296246448332585, + "grad_norm": 1.5087990868491579, + "learning_rate": 1.797826822188628e-06, + "loss": 0.2367, + "step": 4879 + }, + { + "epoch": 0.7297741887243906, + "grad_norm": 1.3936681026294564, + "learning_rate": 1.7959672048333776e-06, + "loss": 0.3461, + "step": 4880 + }, + { + "epoch": 0.7299237326155227, + "grad_norm": 2.1186030606992894, + "learning_rate": 1.794108339175099e-06, + "loss": 0.3271, + "step": 4881 + }, + { + "epoch": 0.7300732765066547, + "grad_norm": 1.5491715251363514, + "learning_rate": 1.7922502256499002e-06, + "loss": 0.3444, + "step": 4882 + }, + { + "epoch": 0.7302228203977867, + "grad_norm": 1.9455178010759004, + "learning_rate": 1.790392864693713e-06, + "loss": 0.2189, + "step": 4883 + }, + { + "epoch": 0.7303723642889188, + "grad_norm": 2.166131466243641, + "learning_rate": 1.788536256742292e-06, + "loss": 0.5102, + "step": 4884 + }, + { + "epoch": 0.7305219081800508, + "grad_norm": 1.9183208080307343, + "learning_rate": 1.786680402231215e-06, + "loss": 0.5595, + "step": 4885 + }, + { + "epoch": 0.7306714520711829, + "grad_norm": 2.484328983288574, + "learning_rate": 1.784825301595884e-06, + "loss": 0.716, + "step": 4886 + }, + { + "epoch": 0.730820995962315, + "grad_norm": 1.763515816138748, + "learning_rate": 1.7829709552715225e-06, + "loss": 0.3711, + "step": 4887 + }, + { + "epoch": 0.730970539853447, + "grad_norm": 2.1009495277293135, + "learning_rate": 1.781117363693179e-06, + "loss": 0.3544, + "step": 4888 + }, + { + "epoch": 0.731120083744579, + "grad_norm": 1.5866469126094087, + "learning_rate": 1.7792645272957248e-06, + "loss": 0.3558, + "step": 4889 + }, + { + "epoch": 0.7312696276357111, + "grad_norm": 1.7021011459775688, + "learning_rate": 1.7774124465138504e-06, + "loss": 0.3143, + "step": 4890 + }, + { + "epoch": 0.7314191715268431, + "grad_norm": 1.6676742642163374, + "learning_rate": 1.7755611217820735e-06, + "loss": 0.3288, + "step": 4891 + }, + { + "epoch": 0.7315687154179752, + "grad_norm": 1.4418481185682541, + "learning_rate": 1.773710553534732e-06, + "loss": 0.1749, + "step": 4892 + }, + { + "epoch": 0.7317182593091073, + "grad_norm": 1.398991855838437, + "learning_rate": 1.771860742205988e-06, + "loss": 0.2401, + "step": 4893 + }, + { + "epoch": 0.7318678032002393, + "grad_norm": 1.4952873454574098, + "learning_rate": 1.7700116882298263e-06, + "loss": 0.1886, + "step": 4894 + }, + { + "epoch": 0.7320173470913713, + "grad_norm": 1.5264426985774242, + "learning_rate": 1.7681633920400492e-06, + "loss": 0.2927, + "step": 4895 + }, + { + "epoch": 0.7321668909825033, + "grad_norm": 3.238416533180956, + "learning_rate": 1.7663158540702875e-06, + "loss": 0.3544, + "step": 4896 + }, + { + "epoch": 0.7323164348736354, + "grad_norm": 1.8829901876535788, + "learning_rate": 1.7644690747539894e-06, + "loss": 0.237, + "step": 4897 + }, + { + "epoch": 0.7324659787647675, + "grad_norm": 1.8913438943497576, + "learning_rate": 1.7626230545244278e-06, + "loss": 0.4691, + "step": 4898 + }, + { + "epoch": 0.7326155226558995, + "grad_norm": 1.576090948870641, + "learning_rate": 1.7607777938146998e-06, + "loss": 0.247, + "step": 4899 + }, + { + "epoch": 0.7327650665470316, + "grad_norm": 1.6647802985248792, + "learning_rate": 1.758933293057718e-06, + "loss": 0.1818, + "step": 4900 + }, + { + "epoch": 0.7329146104381636, + "grad_norm": 1.6465920925539999, + "learning_rate": 1.7570895526862202e-06, + "loss": 0.3598, + "step": 4901 + }, + { + "epoch": 0.7330641543292956, + "grad_norm": 2.065760448476014, + "learning_rate": 1.7552465731327673e-06, + "loss": 0.4715, + "step": 4902 + }, + { + "epoch": 0.7332136982204277, + "grad_norm": 2.375540972886836, + "learning_rate": 1.7534043548297386e-06, + "loss": 0.226, + "step": 4903 + }, + { + "epoch": 0.7333632421115598, + "grad_norm": 1.2725697263760996, + "learning_rate": 1.751562898209339e-06, + "loss": 0.1696, + "step": 4904 + }, + { + "epoch": 0.7335127860026918, + "grad_norm": 2.233901453855518, + "learning_rate": 1.7497222037035876e-06, + "loss": 0.6218, + "step": 4905 + }, + { + "epoch": 0.7336623298938239, + "grad_norm": 1.9614451537767406, + "learning_rate": 1.7478822717443323e-06, + "loss": 0.3986, + "step": 4906 + }, + { + "epoch": 0.7338118737849559, + "grad_norm": 1.6046003025473952, + "learning_rate": 1.7460431027632368e-06, + "loss": 0.429, + "step": 4907 + }, + { + "epoch": 0.7339614176760879, + "grad_norm": 1.3989763819945016, + "learning_rate": 1.7442046971917898e-06, + "loss": 0.1538, + "step": 4908 + }, + { + "epoch": 0.73411096156722, + "grad_norm": 1.9975116158208546, + "learning_rate": 1.7423670554612976e-06, + "loss": 0.3242, + "step": 4909 + }, + { + "epoch": 0.734260505458352, + "grad_norm": 1.7807636600919978, + "learning_rate": 1.7405301780028893e-06, + "loss": 0.3002, + "step": 4910 + }, + { + "epoch": 0.7344100493494841, + "grad_norm": 1.8439287810205307, + "learning_rate": 1.7386940652475142e-06, + "loss": 0.425, + "step": 4911 + }, + { + "epoch": 0.7345595932406161, + "grad_norm": 1.7255465422592986, + "learning_rate": 1.7368587176259416e-06, + "loss": 0.2988, + "step": 4912 + }, + { + "epoch": 0.7347091371317481, + "grad_norm": 1.1163651100410859, + "learning_rate": 1.735024135568762e-06, + "loss": 0.2057, + "step": 4913 + }, + { + "epoch": 0.7348586810228802, + "grad_norm": 1.9838908798921489, + "learning_rate": 1.7331903195063871e-06, + "loss": 0.4594, + "step": 4914 + }, + { + "epoch": 0.7350082249140123, + "grad_norm": 1.22811541687039, + "learning_rate": 1.7313572698690456e-06, + "loss": 0.184, + "step": 4915 + }, + { + "epoch": 0.7351577688051443, + "grad_norm": 1.130576098248055, + "learning_rate": 1.7295249870867898e-06, + "loss": 0.1796, + "step": 4916 + }, + { + "epoch": 0.7353073126962764, + "grad_norm": 1.3162366435196555, + "learning_rate": 1.72769347158949e-06, + "loss": 0.2159, + "step": 4917 + }, + { + "epoch": 0.7354568565874084, + "grad_norm": 1.3336102127082576, + "learning_rate": 1.7258627238068387e-06, + "loss": 0.2036, + "step": 4918 + }, + { + "epoch": 0.7356064004785404, + "grad_norm": 1.7092416101048638, + "learning_rate": 1.7240327441683486e-06, + "loss": 0.2639, + "step": 4919 + }, + { + "epoch": 0.7357559443696725, + "grad_norm": 1.842857209339753, + "learning_rate": 1.722203533103346e-06, + "loss": 0.4788, + "step": 4920 + }, + { + "epoch": 0.7359054882608046, + "grad_norm": 1.7145975931191868, + "learning_rate": 1.720375091040984e-06, + "loss": 0.4216, + "step": 4921 + }, + { + "epoch": 0.7360550321519366, + "grad_norm": 1.3731269596018154, + "learning_rate": 1.7185474184102329e-06, + "loss": 0.1715, + "step": 4922 + }, + { + "epoch": 0.7362045760430687, + "grad_norm": 1.3050309431583982, + "learning_rate": 1.7167205156398813e-06, + "loss": 0.2078, + "step": 4923 + }, + { + "epoch": 0.7363541199342006, + "grad_norm": 1.25888103507202, + "learning_rate": 1.7148943831585395e-06, + "loss": 0.2148, + "step": 4924 + }, + { + "epoch": 0.7365036638253327, + "grad_norm": 1.4463970574157627, + "learning_rate": 1.7130690213946355e-06, + "loss": 0.2037, + "step": 4925 + }, + { + "epoch": 0.7366532077164648, + "grad_norm": 2.2062906558691804, + "learning_rate": 1.7112444307764159e-06, + "loss": 0.3348, + "step": 4926 + }, + { + "epoch": 0.7368027516075968, + "grad_norm": 1.488056015633072, + "learning_rate": 1.7094206117319479e-06, + "loss": 0.1917, + "step": 4927 + }, + { + "epoch": 0.7369522954987289, + "grad_norm": 1.9098820401759844, + "learning_rate": 1.7075975646891164e-06, + "loss": 0.4147, + "step": 4928 + }, + { + "epoch": 0.737101839389861, + "grad_norm": 1.6911497207877315, + "learning_rate": 1.7057752900756286e-06, + "loss": 0.2547, + "step": 4929 + }, + { + "epoch": 0.7372513832809929, + "grad_norm": 1.74444983156129, + "learning_rate": 1.7039537883190038e-06, + "loss": 0.2635, + "step": 4930 + }, + { + "epoch": 0.737400927172125, + "grad_norm": 1.7734453394376264, + "learning_rate": 1.7021330598465852e-06, + "loss": 0.4908, + "step": 4931 + }, + { + "epoch": 0.7375504710632571, + "grad_norm": 1.6236984629707876, + "learning_rate": 1.7003131050855332e-06, + "loss": 0.3127, + "step": 4932 + }, + { + "epoch": 0.7377000149543891, + "grad_norm": 1.5560907471994876, + "learning_rate": 1.6984939244628274e-06, + "loss": 0.3418, + "step": 4933 + }, + { + "epoch": 0.7378495588455212, + "grad_norm": 1.3794025036066626, + "learning_rate": 1.6966755184052664e-06, + "loss": 0.1274, + "step": 4934 + }, + { + "epoch": 0.7379991027366533, + "grad_norm": 1.9367957142582564, + "learning_rate": 1.6948578873394623e-06, + "loss": 0.5532, + "step": 4935 + }, + { + "epoch": 0.7381486466277852, + "grad_norm": 1.3721298422066388, + "learning_rate": 1.693041031691851e-06, + "loss": 0.1855, + "step": 4936 + }, + { + "epoch": 0.7382981905189173, + "grad_norm": 2.0769708052609626, + "learning_rate": 1.6912249518886837e-06, + "loss": 0.3111, + "step": 4937 + }, + { + "epoch": 0.7384477344100493, + "grad_norm": 2.143921562202975, + "learning_rate": 1.689409648356029e-06, + "loss": 0.2064, + "step": 4938 + }, + { + "epoch": 0.7385972783011814, + "grad_norm": 1.7748062548541788, + "learning_rate": 1.6875951215197779e-06, + "loss": 0.4532, + "step": 4939 + }, + { + "epoch": 0.7387468221923135, + "grad_norm": 1.9049841516003214, + "learning_rate": 1.6857813718056354e-06, + "loss": 0.4454, + "step": 4940 + }, + { + "epoch": 0.7388963660834454, + "grad_norm": 1.3090947430515303, + "learning_rate": 1.6839683996391227e-06, + "loss": 0.2271, + "step": 4941 + }, + { + "epoch": 0.7390459099745775, + "grad_norm": 2.237405229404865, + "learning_rate": 1.6821562054455804e-06, + "loss": 0.4707, + "step": 4942 + }, + { + "epoch": 0.7391954538657096, + "grad_norm": 1.6574169357553463, + "learning_rate": 1.6803447896501679e-06, + "loss": 0.2455, + "step": 4943 + }, + { + "epoch": 0.7393449977568416, + "grad_norm": 1.5698625859485724, + "learning_rate": 1.6785341526778603e-06, + "loss": 0.2139, + "step": 4944 + }, + { + "epoch": 0.7394945416479737, + "grad_norm": 1.1445448389646231, + "learning_rate": 1.6767242949534523e-06, + "loss": 0.1877, + "step": 4945 + }, + { + "epoch": 0.7396440855391058, + "grad_norm": 1.8341193550658388, + "learning_rate": 1.674915216901551e-06, + "loss": 0.3422, + "step": 4946 + }, + { + "epoch": 0.7397936294302377, + "grad_norm": 1.2986018162948336, + "learning_rate": 1.6731069189465843e-06, + "loss": 0.1937, + "step": 4947 + }, + { + "epoch": 0.7399431733213698, + "grad_norm": 3.4458268834752923, + "learning_rate": 1.6712994015127976e-06, + "loss": 0.1899, + "step": 4948 + }, + { + "epoch": 0.7400927172125019, + "grad_norm": 1.6288023419251345, + "learning_rate": 1.6694926650242504e-06, + "loss": 0.2016, + "step": 4949 + }, + { + "epoch": 0.7402422611036339, + "grad_norm": 1.2690199042261172, + "learning_rate": 1.667686709904822e-06, + "loss": 0.3038, + "step": 4950 + }, + { + "epoch": 0.740391804994766, + "grad_norm": 1.8684985693384317, + "learning_rate": 1.665881536578206e-06, + "loss": 0.514, + "step": 4951 + }, + { + "epoch": 0.7405413488858981, + "grad_norm": 1.9533150504153602, + "learning_rate": 1.6640771454679134e-06, + "loss": 0.3309, + "step": 4952 + }, + { + "epoch": 0.74069089277703, + "grad_norm": 1.8484944061522781, + "learning_rate": 1.662273536997272e-06, + "loss": 0.1938, + "step": 4953 + }, + { + "epoch": 0.7408404366681621, + "grad_norm": 2.028078126292207, + "learning_rate": 1.6604707115894259e-06, + "loss": 0.4777, + "step": 4954 + }, + { + "epoch": 0.7409899805592941, + "grad_norm": 1.2667704096878298, + "learning_rate": 1.6586686696673364e-06, + "loss": 0.2216, + "step": 4955 + }, + { + "epoch": 0.7411395244504262, + "grad_norm": 1.4777059517531128, + "learning_rate": 1.6568674116537775e-06, + "loss": 0.2011, + "step": 4956 + }, + { + "epoch": 0.7412890683415583, + "grad_norm": 1.6294110291319974, + "learning_rate": 1.6550669379713425e-06, + "loss": 0.1741, + "step": 4957 + }, + { + "epoch": 0.7414386122326903, + "grad_norm": 1.622994606838209, + "learning_rate": 1.6532672490424406e-06, + "loss": 0.3036, + "step": 4958 + }, + { + "epoch": 0.7415881561238223, + "grad_norm": 2.046538656570282, + "learning_rate": 1.6514683452892955e-06, + "loss": 0.2847, + "step": 4959 + }, + { + "epoch": 0.7417377000149544, + "grad_norm": 1.7786604143906892, + "learning_rate": 1.6496702271339487e-06, + "loss": 0.1677, + "step": 4960 + }, + { + "epoch": 0.7418872439060864, + "grad_norm": 1.2090273342604059, + "learning_rate": 1.6478728949982542e-06, + "loss": 0.1785, + "step": 4961 + }, + { + "epoch": 0.7420367877972185, + "grad_norm": 1.5854642471996911, + "learning_rate": 1.646076349303884e-06, + "loss": 0.2253, + "step": 4962 + }, + { + "epoch": 0.7421863316883506, + "grad_norm": 1.242112373383462, + "learning_rate": 1.6442805904723246e-06, + "loss": 0.3119, + "step": 4963 + }, + { + "epoch": 0.7423358755794826, + "grad_norm": 1.5378831663137362, + "learning_rate": 1.6424856189248794e-06, + "loss": 0.3666, + "step": 4964 + }, + { + "epoch": 0.7424854194706146, + "grad_norm": 1.5142227056599435, + "learning_rate": 1.6406914350826657e-06, + "loss": 0.3037, + "step": 4965 + }, + { + "epoch": 0.7426349633617467, + "grad_norm": 1.8172948735036927, + "learning_rate": 1.638898039366616e-06, + "loss": 0.1961, + "step": 4966 + }, + { + "epoch": 0.7427845072528787, + "grad_norm": 1.2749929231559476, + "learning_rate": 1.637105432197479e-06, + "loss": 0.166, + "step": 4967 + }, + { + "epoch": 0.7429340511440108, + "grad_norm": 2.324535822678269, + "learning_rate": 1.6353136139958164e-06, + "loss": 0.6694, + "step": 4968 + }, + { + "epoch": 0.7430835950351428, + "grad_norm": 1.480140025499841, + "learning_rate": 1.6335225851820068e-06, + "loss": 0.2349, + "step": 4969 + }, + { + "epoch": 0.7432331389262749, + "grad_norm": 4.800174733796218, + "learning_rate": 1.6317323461762447e-06, + "loss": 0.21, + "step": 4970 + }, + { + "epoch": 0.7433826828174069, + "grad_norm": 1.2593497070252637, + "learning_rate": 1.6299428973985332e-06, + "loss": 0.1609, + "step": 4971 + }, + { + "epoch": 0.7435322267085389, + "grad_norm": 1.255074221488897, + "learning_rate": 1.6281542392686967e-06, + "loss": 0.1761, + "step": 4972 + }, + { + "epoch": 0.743681770599671, + "grad_norm": 1.5729841639918394, + "learning_rate": 1.626366372206371e-06, + "loss": 0.3125, + "step": 4973 + }, + { + "epoch": 0.7438313144908031, + "grad_norm": 1.3804863151982847, + "learning_rate": 1.6245792966310081e-06, + "loss": 0.2061, + "step": 4974 + }, + { + "epoch": 0.7439808583819351, + "grad_norm": 1.4988752161362586, + "learning_rate": 1.6227930129618735e-06, + "loss": 0.2357, + "step": 4975 + }, + { + "epoch": 0.7441304022730671, + "grad_norm": 1.1559635335873584, + "learning_rate": 1.6210075216180438e-06, + "loss": 0.2257, + "step": 4976 + }, + { + "epoch": 0.7442799461641992, + "grad_norm": 1.1787528604986488, + "learning_rate": 1.6192228230184143e-06, + "loss": 0.2064, + "step": 4977 + }, + { + "epoch": 0.7444294900553312, + "grad_norm": 1.8395020477405877, + "learning_rate": 1.6174389175816925e-06, + "loss": 0.2721, + "step": 4978 + }, + { + "epoch": 0.7445790339464633, + "grad_norm": 2.2983433448079214, + "learning_rate": 1.6156558057263972e-06, + "loss": 0.3296, + "step": 4979 + }, + { + "epoch": 0.7447285778375954, + "grad_norm": 1.7476169505025692, + "learning_rate": 1.6138734878708701e-06, + "loss": 0.1581, + "step": 4980 + }, + { + "epoch": 0.7448781217287274, + "grad_norm": 1.6880815219258662, + "learning_rate": 1.6120919644332538e-06, + "loss": 0.3554, + "step": 4981 + }, + { + "epoch": 0.7450276656198594, + "grad_norm": 1.5553252000157987, + "learning_rate": 1.6103112358315137e-06, + "loss": 0.3049, + "step": 4982 + }, + { + "epoch": 0.7451772095109914, + "grad_norm": 1.508629076231468, + "learning_rate": 1.6085313024834248e-06, + "loss": 0.1949, + "step": 4983 + }, + { + "epoch": 0.7453267534021235, + "grad_norm": 1.6111729402662986, + "learning_rate": 1.6067521648065775e-06, + "loss": 0.2028, + "step": 4984 + }, + { + "epoch": 0.7454762972932556, + "grad_norm": 1.6882761786162728, + "learning_rate": 1.604973823218376e-06, + "loss": 0.4145, + "step": 4985 + }, + { + "epoch": 0.7456258411843876, + "grad_norm": 1.9886796023882398, + "learning_rate": 1.6031962781360327e-06, + "loss": 0.3631, + "step": 4986 + }, + { + "epoch": 0.7457753850755197, + "grad_norm": 1.0743322277819911, + "learning_rate": 1.6014195299765795e-06, + "loss": 0.2002, + "step": 4987 + }, + { + "epoch": 0.7459249289666517, + "grad_norm": 2.959920703560377, + "learning_rate": 1.599643579156857e-06, + "loss": 0.6486, + "step": 4988 + }, + { + "epoch": 0.7460744728577837, + "grad_norm": 1.7124238982311148, + "learning_rate": 1.5978684260935218e-06, + "loss": 0.3407, + "step": 4989 + }, + { + "epoch": 0.7462240167489158, + "grad_norm": 1.5419789507577413, + "learning_rate": 1.5960940712030415e-06, + "loss": 0.3379, + "step": 4990 + }, + { + "epoch": 0.7463735606400479, + "grad_norm": 2.0568302715565063, + "learning_rate": 1.5943205149016967e-06, + "loss": 0.1542, + "step": 4991 + }, + { + "epoch": 0.7465231045311799, + "grad_norm": 1.1442810084475554, + "learning_rate": 1.5925477576055808e-06, + "loss": 0.2005, + "step": 4992 + }, + { + "epoch": 0.746672648422312, + "grad_norm": 1.0323642072329389, + "learning_rate": 1.5907757997306e-06, + "loss": 0.1558, + "step": 4993 + }, + { + "epoch": 0.746822192313444, + "grad_norm": 1.7151094243880038, + "learning_rate": 1.5890046416924727e-06, + "loss": 0.201, + "step": 4994 + }, + { + "epoch": 0.746971736204576, + "grad_norm": 1.9038837865131188, + "learning_rate": 1.5872342839067305e-06, + "loss": 0.2425, + "step": 4995 + }, + { + "epoch": 0.7471212800957081, + "grad_norm": 1.4737968126049896, + "learning_rate": 1.5854647267887141e-06, + "loss": 0.1929, + "step": 4996 + }, + { + "epoch": 0.7472708239868401, + "grad_norm": 1.9338720487436842, + "learning_rate": 1.5836959707535798e-06, + "loss": 0.4908, + "step": 4997 + }, + { + "epoch": 0.7474203678779722, + "grad_norm": 1.7687296746572252, + "learning_rate": 1.5819280162162948e-06, + "loss": 0.4599, + "step": 4998 + }, + { + "epoch": 0.7475699117691043, + "grad_norm": 1.7216072867885186, + "learning_rate": 1.5801608635916387e-06, + "loss": 0.3506, + "step": 4999 + }, + { + "epoch": 0.7477194556602362, + "grad_norm": 1.105197711513627, + "learning_rate": 1.5783945132942037e-06, + "loss": 0.1791, + "step": 5000 + }, + { + "epoch": 0.7478689995513683, + "grad_norm": 1.8433637537725378, + "learning_rate": 1.5766289657383893e-06, + "loss": 0.2155, + "step": 5001 + }, + { + "epoch": 0.7480185434425004, + "grad_norm": 1.851082703438374, + "learning_rate": 1.574864221338412e-06, + "loss": 0.2261, + "step": 5002 + }, + { + "epoch": 0.7481680873336324, + "grad_norm": 1.1235403154462775, + "learning_rate": 1.5731002805082979e-06, + "loss": 0.1976, + "step": 5003 + }, + { + "epoch": 0.7483176312247645, + "grad_norm": 1.4005311844858053, + "learning_rate": 1.571337143661884e-06, + "loss": 0.1214, + "step": 5004 + }, + { + "epoch": 0.7484671751158966, + "grad_norm": 1.3991927499661123, + "learning_rate": 1.5695748112128195e-06, + "loss": 0.2138, + "step": 5005 + }, + { + "epoch": 0.7486167190070285, + "grad_norm": 1.5742832057053338, + "learning_rate": 1.5678132835745647e-06, + "loss": 0.3268, + "step": 5006 + }, + { + "epoch": 0.7487662628981606, + "grad_norm": 1.7017372496995755, + "learning_rate": 1.5660525611603904e-06, + "loss": 0.3067, + "step": 5007 + }, + { + "epoch": 0.7489158067892927, + "grad_norm": 2.2239030850506607, + "learning_rate": 1.56429264438338e-06, + "loss": 0.4399, + "step": 5008 + }, + { + "epoch": 0.7490653506804247, + "grad_norm": 1.4825358039189538, + "learning_rate": 1.5625335336564262e-06, + "loss": 0.1838, + "step": 5009 + }, + { + "epoch": 0.7492148945715568, + "grad_norm": 1.843373781549283, + "learning_rate": 1.560775229392235e-06, + "loss": 0.5073, + "step": 5010 + }, + { + "epoch": 0.7493644384626889, + "grad_norm": 1.6081766972792195, + "learning_rate": 1.5590177320033184e-06, + "loss": 0.2901, + "step": 5011 + }, + { + "epoch": 0.7495139823538208, + "grad_norm": 1.6372337350507788, + "learning_rate": 1.557261041902004e-06, + "loss": 0.2235, + "step": 5012 + }, + { + "epoch": 0.7496635262449529, + "grad_norm": 1.889672133817125, + "learning_rate": 1.5555051595004279e-06, + "loss": 0.3568, + "step": 5013 + }, + { + "epoch": 0.7498130701360849, + "grad_norm": 1.8353270753658117, + "learning_rate": 1.5537500852105376e-06, + "loss": 0.469, + "step": 5014 + }, + { + "epoch": 0.749962614027217, + "grad_norm": 1.2515052360240535, + "learning_rate": 1.5519958194440914e-06, + "loss": 0.1958, + "step": 5015 + }, + { + "epoch": 0.7501121579183491, + "grad_norm": 1.411832489473013, + "learning_rate": 1.550242362612654e-06, + "loss": 0.1959, + "step": 5016 + }, + { + "epoch": 0.750261701809481, + "grad_norm": 2.304156057863376, + "learning_rate": 1.5484897151276052e-06, + "loss": 0.3776, + "step": 5017 + }, + { + "epoch": 0.7504112457006131, + "grad_norm": 1.2686373160265543, + "learning_rate": 1.5467378774001325e-06, + "loss": 0.1746, + "step": 5018 + }, + { + "epoch": 0.7505607895917452, + "grad_norm": 1.4290150379380038, + "learning_rate": 1.544986849841234e-06, + "loss": 0.2043, + "step": 5019 + }, + { + "epoch": 0.7507103334828772, + "grad_norm": 1.674021514973367, + "learning_rate": 1.5432366328617182e-06, + "loss": 0.3528, + "step": 5020 + }, + { + "epoch": 0.7508598773740093, + "grad_norm": 1.8225088458049317, + "learning_rate": 1.541487226872202e-06, + "loss": 0.3571, + "step": 5021 + }, + { + "epoch": 0.7510094212651414, + "grad_norm": 1.231715373134182, + "learning_rate": 1.5397386322831131e-06, + "loss": 0.196, + "step": 5022 + }, + { + "epoch": 0.7511589651562733, + "grad_norm": 1.2398172172587025, + "learning_rate": 1.5379908495046892e-06, + "loss": 0.2013, + "step": 5023 + }, + { + "epoch": 0.7513085090474054, + "grad_norm": 1.3768389715397804, + "learning_rate": 1.5362438789469768e-06, + "loss": 0.1743, + "step": 5024 + }, + { + "epoch": 0.7514580529385375, + "grad_norm": 1.97450836406249, + "learning_rate": 1.5344977210198313e-06, + "loss": 0.2024, + "step": 5025 + }, + { + "epoch": 0.7516075968296695, + "grad_norm": 1.7489178198059616, + "learning_rate": 1.532752376132921e-06, + "loss": 0.2381, + "step": 5026 + }, + { + "epoch": 0.7517571407208016, + "grad_norm": 1.9006601987299636, + "learning_rate": 1.5310078446957166e-06, + "loss": 0.5114, + "step": 5027 + }, + { + "epoch": 0.7519066846119336, + "grad_norm": 1.9487728533894322, + "learning_rate": 1.5292641271175035e-06, + "loss": 0.3343, + "step": 5028 + }, + { + "epoch": 0.7520562285030656, + "grad_norm": 1.4232534967996326, + "learning_rate": 1.5275212238073755e-06, + "loss": 0.2949, + "step": 5029 + }, + { + "epoch": 0.7522057723941977, + "grad_norm": 2.466386702045916, + "learning_rate": 1.5257791351742335e-06, + "loss": 0.4721, + "step": 5030 + }, + { + "epoch": 0.7523553162853297, + "grad_norm": 1.116081887809245, + "learning_rate": 1.5240378616267887e-06, + "loss": 0.1345, + "step": 5031 + }, + { + "epoch": 0.7525048601764618, + "grad_norm": 1.2001444074048684, + "learning_rate": 1.522297403573561e-06, + "loss": 0.2245, + "step": 5032 + }, + { + "epoch": 0.7526544040675939, + "grad_norm": 1.401524907716876, + "learning_rate": 1.520557761422878e-06, + "loss": 0.326, + "step": 5033 + }, + { + "epoch": 0.7528039479587258, + "grad_norm": 1.6483945434180083, + "learning_rate": 1.518818935582877e-06, + "loss": 0.2961, + "step": 5034 + }, + { + "epoch": 0.7529534918498579, + "grad_norm": 1.7718583609120988, + "learning_rate": 1.5170809264615028e-06, + "loss": 0.3553, + "step": 5035 + }, + { + "epoch": 0.75310303574099, + "grad_norm": 1.5109861506112048, + "learning_rate": 1.5153437344665112e-06, + "loss": 0.2958, + "step": 5036 + }, + { + "epoch": 0.753252579632122, + "grad_norm": 1.8242522161056527, + "learning_rate": 1.5136073600054606e-06, + "loss": 0.3677, + "step": 5037 + }, + { + "epoch": 0.7534021235232541, + "grad_norm": 1.5061857510551848, + "learning_rate": 1.5118718034857238e-06, + "loss": 0.3322, + "step": 5038 + }, + { + "epoch": 0.7535516674143862, + "grad_norm": 1.7516086286447914, + "learning_rate": 1.5101370653144775e-06, + "loss": 0.3317, + "step": 5039 + }, + { + "epoch": 0.7537012113055181, + "grad_norm": 1.9410830253618627, + "learning_rate": 1.5084031458987086e-06, + "loss": 0.477, + "step": 5040 + }, + { + "epoch": 0.7538507551966502, + "grad_norm": 1.624047738522535, + "learning_rate": 1.5066700456452138e-06, + "loss": 0.2883, + "step": 5041 + }, + { + "epoch": 0.7540002990877822, + "grad_norm": 2.032351730576193, + "learning_rate": 1.5049377649605906e-06, + "loss": 0.2289, + "step": 5042 + }, + { + "epoch": 0.7541498429789143, + "grad_norm": 1.9571845552317846, + "learning_rate": 1.5032063042512512e-06, + "loss": 0.4659, + "step": 5043 + }, + { + "epoch": 0.7542993868700464, + "grad_norm": 1.4169043100292917, + "learning_rate": 1.5014756639234123e-06, + "loss": 0.2016, + "step": 5044 + }, + { + "epoch": 0.7544489307611784, + "grad_norm": 1.6271245758464608, + "learning_rate": 1.4997458443830997e-06, + "loss": 0.1912, + "step": 5045 + }, + { + "epoch": 0.7545984746523104, + "grad_norm": 1.730182102995152, + "learning_rate": 1.498016846036145e-06, + "loss": 0.1766, + "step": 5046 + }, + { + "epoch": 0.7547480185434425, + "grad_norm": 1.844207557379883, + "learning_rate": 1.4962886692881884e-06, + "loss": 0.3448, + "step": 5047 + }, + { + "epoch": 0.7548975624345745, + "grad_norm": 1.686654479519285, + "learning_rate": 1.4945613145446764e-06, + "loss": 0.3413, + "step": 5048 + }, + { + "epoch": 0.7550471063257066, + "grad_norm": 1.5689209518146487, + "learning_rate": 1.492834782210863e-06, + "loss": 0.2017, + "step": 5049 + }, + { + "epoch": 0.7551966502168387, + "grad_norm": 1.2979889962686177, + "learning_rate": 1.4911090726918092e-06, + "loss": 0.1642, + "step": 5050 + }, + { + "epoch": 0.7553461941079707, + "grad_norm": 1.509481815444356, + "learning_rate": 1.4893841863923858e-06, + "loss": 0.2808, + "step": 5051 + }, + { + "epoch": 0.7554957379991027, + "grad_norm": 1.2945562913921012, + "learning_rate": 1.4876601237172633e-06, + "loss": 0.1775, + "step": 5052 + }, + { + "epoch": 0.7556452818902348, + "grad_norm": 1.3900598384820892, + "learning_rate": 1.4859368850709256e-06, + "loss": 0.1958, + "step": 5053 + }, + { + "epoch": 0.7557948257813668, + "grad_norm": 1.599403060631151, + "learning_rate": 1.4842144708576606e-06, + "loss": 0.2444, + "step": 5054 + }, + { + "epoch": 0.7559443696724989, + "grad_norm": 1.3957515741240658, + "learning_rate": 1.4824928814815642e-06, + "loss": 0.1523, + "step": 5055 + }, + { + "epoch": 0.7560939135636309, + "grad_norm": 1.8649507157100929, + "learning_rate": 1.4807721173465384e-06, + "loss": 0.6052, + "step": 5056 + }, + { + "epoch": 0.756243457454763, + "grad_norm": 1.3054300810441961, + "learning_rate": 1.479052178856289e-06, + "loss": 0.1867, + "step": 5057 + }, + { + "epoch": 0.756393001345895, + "grad_norm": 1.2409359547056185, + "learning_rate": 1.4773330664143303e-06, + "loss": 0.2161, + "step": 5058 + }, + { + "epoch": 0.756542545237027, + "grad_norm": 1.517794796120005, + "learning_rate": 1.475614780423984e-06, + "loss": 0.3075, + "step": 5059 + }, + { + "epoch": 0.7566920891281591, + "grad_norm": 2.2683584626782114, + "learning_rate": 1.4738973212883744e-06, + "loss": 0.1662, + "step": 5060 + }, + { + "epoch": 0.7568416330192912, + "grad_norm": 1.9968898477136643, + "learning_rate": 1.4721806894104385e-06, + "loss": 0.491, + "step": 5061 + }, + { + "epoch": 0.7569911769104232, + "grad_norm": 1.646432552223488, + "learning_rate": 1.4704648851929104e-06, + "loss": 0.3103, + "step": 5062 + }, + { + "epoch": 0.7571407208015553, + "grad_norm": 1.7629010278592214, + "learning_rate": 1.4687499090383351e-06, + "loss": 0.2925, + "step": 5063 + }, + { + "epoch": 0.7572902646926873, + "grad_norm": 1.9232933811639126, + "learning_rate": 1.4670357613490639e-06, + "loss": 0.1801, + "step": 5064 + }, + { + "epoch": 0.7574398085838193, + "grad_norm": 1.686411431441648, + "learning_rate": 1.4653224425272512e-06, + "loss": 0.2741, + "step": 5065 + }, + { + "epoch": 0.7575893524749514, + "grad_norm": 1.6200170825201825, + "learning_rate": 1.46360995297486e-06, + "loss": 0.4698, + "step": 5066 + }, + { + "epoch": 0.7577388963660835, + "grad_norm": 1.6203826723565258, + "learning_rate": 1.4618982930936532e-06, + "loss": 0.2704, + "step": 5067 + }, + { + "epoch": 0.7578884402572155, + "grad_norm": 2.026413566334365, + "learning_rate": 1.4601874632852053e-06, + "loss": 0.448, + "step": 5068 + }, + { + "epoch": 0.7580379841483476, + "grad_norm": 1.879075603823234, + "learning_rate": 1.4584774639508931e-06, + "loss": 0.5288, + "step": 5069 + }, + { + "epoch": 0.7581875280394796, + "grad_norm": 1.5003273725894806, + "learning_rate": 1.4567682954918988e-06, + "loss": 0.2539, + "step": 5070 + }, + { + "epoch": 0.7583370719306116, + "grad_norm": 1.2553245034283764, + "learning_rate": 1.4550599583092111e-06, + "loss": 0.1335, + "step": 5071 + }, + { + "epoch": 0.7584866158217437, + "grad_norm": 1.5877306530313855, + "learning_rate": 1.453352452803618e-06, + "loss": 0.2332, + "step": 5072 + }, + { + "epoch": 0.7586361597128757, + "grad_norm": 1.7701093390470832, + "learning_rate": 1.4516457793757215e-06, + "loss": 0.3121, + "step": 5073 + }, + { + "epoch": 0.7587857036040078, + "grad_norm": 0.8674066155063845, + "learning_rate": 1.4499399384259222e-06, + "loss": 0.1345, + "step": 5074 + }, + { + "epoch": 0.7589352474951399, + "grad_norm": 1.2117217139497436, + "learning_rate": 1.4482349303544264e-06, + "loss": 0.195, + "step": 5075 + }, + { + "epoch": 0.7590847913862718, + "grad_norm": 2.2219961154832544, + "learning_rate": 1.4465307555612467e-06, + "loss": 0.3763, + "step": 5076 + }, + { + "epoch": 0.7592343352774039, + "grad_norm": 1.5061167634309305, + "learning_rate": 1.4448274144461965e-06, + "loss": 0.1569, + "step": 5077 + }, + { + "epoch": 0.759383879168536, + "grad_norm": 1.8680090274082126, + "learning_rate": 1.4431249074088976e-06, + "loss": 0.2104, + "step": 5078 + }, + { + "epoch": 0.759533423059668, + "grad_norm": 1.9039417503030744, + "learning_rate": 1.441423234848774e-06, + "loss": 0.273, + "step": 5079 + }, + { + "epoch": 0.7596829669508001, + "grad_norm": 1.4514849698536556, + "learning_rate": 1.4397223971650554e-06, + "loss": 0.1839, + "step": 5080 + }, + { + "epoch": 0.7598325108419322, + "grad_norm": 1.5069640247940201, + "learning_rate": 1.438022394756775e-06, + "loss": 0.3132, + "step": 5081 + }, + { + "epoch": 0.7599820547330641, + "grad_norm": 2.243758810711406, + "learning_rate": 1.4363232280227673e-06, + "loss": 0.3157, + "step": 5082 + }, + { + "epoch": 0.7601315986241962, + "grad_norm": 1.3666381604237785, + "learning_rate": 1.434624897361675e-06, + "loss": 0.1897, + "step": 5083 + }, + { + "epoch": 0.7602811425153283, + "grad_norm": 1.5194858603084764, + "learning_rate": 1.4329274031719427e-06, + "loss": 0.3733, + "step": 5084 + }, + { + "epoch": 0.7604306864064603, + "grad_norm": 1.7839247130392213, + "learning_rate": 1.4312307458518187e-06, + "loss": 0.3784, + "step": 5085 + }, + { + "epoch": 0.7605802302975924, + "grad_norm": 1.7923584025943586, + "learning_rate": 1.4295349257993551e-06, + "loss": 0.3015, + "step": 5086 + }, + { + "epoch": 0.7607297741887243, + "grad_norm": 1.6712705441976172, + "learning_rate": 1.427839943412408e-06, + "loss": 0.1954, + "step": 5087 + }, + { + "epoch": 0.7608793180798564, + "grad_norm": 1.2072895353933133, + "learning_rate": 1.4261457990886363e-06, + "loss": 0.1781, + "step": 5088 + }, + { + "epoch": 0.7610288619709885, + "grad_norm": 1.5042497663667336, + "learning_rate": 1.4244524932255026e-06, + "loss": 0.1984, + "step": 5089 + }, + { + "epoch": 0.7611784058621205, + "grad_norm": 2.107355295283857, + "learning_rate": 1.4227600262202735e-06, + "loss": 0.4434, + "step": 5090 + }, + { + "epoch": 0.7613279497532526, + "grad_norm": 1.8766492583764405, + "learning_rate": 1.421068398470018e-06, + "loss": 0.2612, + "step": 5091 + }, + { + "epoch": 0.7614774936443847, + "grad_norm": 1.796138305576019, + "learning_rate": 1.4193776103716067e-06, + "loss": 0.2362, + "step": 5092 + }, + { + "epoch": 0.7616270375355166, + "grad_norm": 2.181925533887432, + "learning_rate": 1.4176876623217155e-06, + "loss": 0.2451, + "step": 5093 + }, + { + "epoch": 0.7617765814266487, + "grad_norm": 1.856890477039263, + "learning_rate": 1.4159985547168231e-06, + "loss": 0.4182, + "step": 5094 + }, + { + "epoch": 0.7619261253177808, + "grad_norm": 2.1785753345566157, + "learning_rate": 1.4143102879532095e-06, + "loss": 0.3683, + "step": 5095 + }, + { + "epoch": 0.7620756692089128, + "grad_norm": 2.39515228059131, + "learning_rate": 1.412622862426961e-06, + "loss": 0.4871, + "step": 5096 + }, + { + "epoch": 0.7622252131000449, + "grad_norm": 1.8122946113103227, + "learning_rate": 1.4109362785339603e-06, + "loss": 0.3102, + "step": 5097 + }, + { + "epoch": 0.762374756991177, + "grad_norm": 2.025031228030744, + "learning_rate": 1.4092505366698977e-06, + "loss": 0.3625, + "step": 5098 + }, + { + "epoch": 0.7625243008823089, + "grad_norm": 1.914219199792331, + "learning_rate": 1.4075656372302643e-06, + "loss": 0.197, + "step": 5099 + }, + { + "epoch": 0.762673844773441, + "grad_norm": 1.9228829981383313, + "learning_rate": 1.4058815806103542e-06, + "loss": 0.3423, + "step": 5100 + }, + { + "epoch": 0.762823388664573, + "grad_norm": 1.4348421326181278, + "learning_rate": 1.404198367205263e-06, + "loss": 0.203, + "step": 5101 + }, + { + "epoch": 0.7629729325557051, + "grad_norm": 2.226462398755096, + "learning_rate": 1.402515997409889e-06, + "loss": 0.5939, + "step": 5102 + }, + { + "epoch": 0.7631224764468372, + "grad_norm": 2.4144455997712044, + "learning_rate": 1.400834471618932e-06, + "loss": 0.5583, + "step": 5103 + }, + { + "epoch": 0.7632720203379691, + "grad_norm": 1.8002819684827804, + "learning_rate": 1.3991537902268949e-06, + "loss": 0.3969, + "step": 5104 + }, + { + "epoch": 0.7634215642291012, + "grad_norm": 1.131912222291808, + "learning_rate": 1.3974739536280813e-06, + "loss": 0.1708, + "step": 5105 + }, + { + "epoch": 0.7635711081202333, + "grad_norm": 1.1518573963105492, + "learning_rate": 1.3957949622165984e-06, + "loss": 0.2141, + "step": 5106 + }, + { + "epoch": 0.7637206520113653, + "grad_norm": 1.261206361936778, + "learning_rate": 1.3941168163863505e-06, + "loss": 0.1754, + "step": 5107 + }, + { + "epoch": 0.7638701959024974, + "grad_norm": 1.483154100959671, + "learning_rate": 1.392439516531049e-06, + "loss": 0.2928, + "step": 5108 + }, + { + "epoch": 0.7640197397936295, + "grad_norm": 2.0259968651342946, + "learning_rate": 1.3907630630442044e-06, + "loss": 0.3666, + "step": 5109 + }, + { + "epoch": 0.7641692836847614, + "grad_norm": 1.643634583406319, + "learning_rate": 1.3890874563191288e-06, + "loss": 0.164, + "step": 5110 + }, + { + "epoch": 0.7643188275758935, + "grad_norm": 2.3198327361640825, + "learning_rate": 1.3874126967489355e-06, + "loss": 0.6347, + "step": 5111 + }, + { + "epoch": 0.7644683714670256, + "grad_norm": 2.0489051011628545, + "learning_rate": 1.3857387847265391e-06, + "loss": 0.1899, + "step": 5112 + }, + { + "epoch": 0.7646179153581576, + "grad_norm": 1.451286296004528, + "learning_rate": 1.3840657206446562e-06, + "loss": 0.2919, + "step": 5113 + }, + { + "epoch": 0.7647674592492897, + "grad_norm": 1.3652283718866698, + "learning_rate": 1.3823935048958026e-06, + "loss": 0.2496, + "step": 5114 + }, + { + "epoch": 0.7649170031404217, + "grad_norm": 1.2708107882464574, + "learning_rate": 1.380722137872298e-06, + "loss": 0.2123, + "step": 5115 + }, + { + "epoch": 0.7650665470315537, + "grad_norm": 1.7482422831337887, + "learning_rate": 1.379051619966259e-06, + "loss": 0.3106, + "step": 5116 + }, + { + "epoch": 0.7652160909226858, + "grad_norm": 2.012034563626334, + "learning_rate": 1.3773819515696085e-06, + "loss": 0.4865, + "step": 5117 + }, + { + "epoch": 0.7653656348138178, + "grad_norm": 1.4340694999731798, + "learning_rate": 1.3757131330740636e-06, + "loss": 0.3106, + "step": 5118 + }, + { + "epoch": 0.7655151787049499, + "grad_norm": 1.8139431075466297, + "learning_rate": 1.3740451648711457e-06, + "loss": 0.2158, + "step": 5119 + }, + { + "epoch": 0.765664722596082, + "grad_norm": 1.6642052674277892, + "learning_rate": 1.3723780473521765e-06, + "loss": 0.3219, + "step": 5120 + }, + { + "epoch": 0.765814266487214, + "grad_norm": 1.7580275785472688, + "learning_rate": 1.3707117809082787e-06, + "loss": 0.2053, + "step": 5121 + }, + { + "epoch": 0.765963810378346, + "grad_norm": 1.6002774083782452, + "learning_rate": 1.369046365930375e-06, + "loss": 0.1749, + "step": 5122 + }, + { + "epoch": 0.7661133542694781, + "grad_norm": 1.4260068897783758, + "learning_rate": 1.367381802809185e-06, + "loss": 0.1994, + "step": 5123 + }, + { + "epoch": 0.7662628981606101, + "grad_norm": 1.2705607099025549, + "learning_rate": 1.3657180919352336e-06, + "loss": 0.1984, + "step": 5124 + }, + { + "epoch": 0.7664124420517422, + "grad_norm": 1.2946783271168116, + "learning_rate": 1.3640552336988421e-06, + "loss": 0.3071, + "step": 5125 + }, + { + "epoch": 0.7665619859428743, + "grad_norm": 2.030415403862267, + "learning_rate": 1.3623932284901342e-06, + "loss": 0.3528, + "step": 5126 + }, + { + "epoch": 0.7667115298340063, + "grad_norm": 2.24138226820361, + "learning_rate": 1.3607320766990311e-06, + "loss": 0.5005, + "step": 5127 + }, + { + "epoch": 0.7668610737251383, + "grad_norm": 1.8442276481420865, + "learning_rate": 1.3590717787152564e-06, + "loss": 0.3878, + "step": 5128 + }, + { + "epoch": 0.7670106176162704, + "grad_norm": 1.1641423978843881, + "learning_rate": 1.3574123349283314e-06, + "loss": 0.1934, + "step": 5129 + }, + { + "epoch": 0.7671601615074024, + "grad_norm": 2.0359668691512005, + "learning_rate": 1.3557537457275772e-06, + "loss": 0.5111, + "step": 5130 + }, + { + "epoch": 0.7673097053985345, + "grad_norm": 1.2486381085511884, + "learning_rate": 1.3540960115021151e-06, + "loss": 0.1556, + "step": 5131 + }, + { + "epoch": 0.7674592492896665, + "grad_norm": 1.5112396289258176, + "learning_rate": 1.3524391326408675e-06, + "loss": 0.1686, + "step": 5132 + }, + { + "epoch": 0.7676087931807986, + "grad_norm": 1.7511319178239615, + "learning_rate": 1.3507831095325507e-06, + "loss": 0.3124, + "step": 5133 + }, + { + "epoch": 0.7677583370719306, + "grad_norm": 1.3527522831391534, + "learning_rate": 1.349127942565685e-06, + "loss": 0.1773, + "step": 5134 + }, + { + "epoch": 0.7679078809630626, + "grad_norm": 1.9345906394726324, + "learning_rate": 1.3474736321285892e-06, + "loss": 0.4342, + "step": 5135 + }, + { + "epoch": 0.7680574248541947, + "grad_norm": 0.9833467274833904, + "learning_rate": 1.3458201786093795e-06, + "loss": 0.1713, + "step": 5136 + }, + { + "epoch": 0.7682069687453268, + "grad_norm": 1.988559163874012, + "learning_rate": 1.3441675823959743e-06, + "loss": 0.2121, + "step": 5137 + }, + { + "epoch": 0.7683565126364588, + "grad_norm": 1.6989904140653285, + "learning_rate": 1.3425158438760854e-06, + "loss": 0.3356, + "step": 5138 + }, + { + "epoch": 0.7685060565275909, + "grad_norm": 1.7597113504624267, + "learning_rate": 1.3408649634372278e-06, + "loss": 0.2969, + "step": 5139 + }, + { + "epoch": 0.7686556004187229, + "grad_norm": 1.5808745547264054, + "learning_rate": 1.3392149414667144e-06, + "loss": 0.3108, + "step": 5140 + }, + { + "epoch": 0.7688051443098549, + "grad_norm": 1.183072326233728, + "learning_rate": 1.337565778351656e-06, + "loss": 0.1785, + "step": 5141 + }, + { + "epoch": 0.768954688200987, + "grad_norm": 1.377181885025977, + "learning_rate": 1.3359174744789627e-06, + "loss": 0.1877, + "step": 5142 + }, + { + "epoch": 0.7691042320921191, + "grad_norm": 1.6390355630703077, + "learning_rate": 1.3342700302353411e-06, + "loss": 0.1863, + "step": 5143 + }, + { + "epoch": 0.7692537759832511, + "grad_norm": 1.4233754017806632, + "learning_rate": 1.332623446007299e-06, + "loss": 0.2292, + "step": 5144 + }, + { + "epoch": 0.7694033198743832, + "grad_norm": 1.5453883598923903, + "learning_rate": 1.3309777221811398e-06, + "loss": 0.1724, + "step": 5145 + }, + { + "epoch": 0.7695528637655151, + "grad_norm": 1.3471592166252704, + "learning_rate": 1.329332859142967e-06, + "loss": 0.1794, + "step": 5146 + }, + { + "epoch": 0.7697024076566472, + "grad_norm": 1.866000528416247, + "learning_rate": 1.3276888572786829e-06, + "loss": 0.3123, + "step": 5147 + }, + { + "epoch": 0.7698519515477793, + "grad_norm": 1.9653238426653945, + "learning_rate": 1.3260457169739821e-06, + "loss": 0.5209, + "step": 5148 + }, + { + "epoch": 0.7700014954389113, + "grad_norm": 1.744041123921355, + "learning_rate": 1.324403438614364e-06, + "loss": 0.2774, + "step": 5149 + }, + { + "epoch": 0.7701510393300434, + "grad_norm": 1.7153767081245714, + "learning_rate": 1.3227620225851218e-06, + "loss": 0.3048, + "step": 5150 + }, + { + "epoch": 0.7703005832211754, + "grad_norm": 2.0578185400732982, + "learning_rate": 1.321121469271348e-06, + "loss": 0.4632, + "step": 5151 + }, + { + "epoch": 0.7704501271123074, + "grad_norm": 1.943828621957741, + "learning_rate": 1.319481779057934e-06, + "loss": 0.3479, + "step": 5152 + }, + { + "epoch": 0.7705996710034395, + "grad_norm": 1.9588943567254984, + "learning_rate": 1.3178429523295615e-06, + "loss": 0.3768, + "step": 5153 + }, + { + "epoch": 0.7707492148945716, + "grad_norm": 1.7862138182310494, + "learning_rate": 1.3162049894707208e-06, + "loss": 0.3491, + "step": 5154 + }, + { + "epoch": 0.7708987587857036, + "grad_norm": 1.9559502499943882, + "learning_rate": 1.314567890865691e-06, + "loss": 0.4638, + "step": 5155 + }, + { + "epoch": 0.7710483026768357, + "grad_norm": 1.6711506968064866, + "learning_rate": 1.3129316568985523e-06, + "loss": 0.2376, + "step": 5156 + }, + { + "epoch": 0.7711978465679677, + "grad_norm": 1.4349317828854198, + "learning_rate": 1.311296287953182e-06, + "loss": 0.2305, + "step": 5157 + }, + { + "epoch": 0.7713473904590997, + "grad_norm": 3.467349895371973, + "learning_rate": 1.3096617844132509e-06, + "loss": 0.2566, + "step": 5158 + }, + { + "epoch": 0.7714969343502318, + "grad_norm": 2.094062732001438, + "learning_rate": 1.3080281466622297e-06, + "loss": 0.3332, + "step": 5159 + }, + { + "epoch": 0.7716464782413638, + "grad_norm": 2.2304024427615894, + "learning_rate": 1.3063953750833863e-06, + "loss": 0.6173, + "step": 5160 + }, + { + "epoch": 0.7717960221324959, + "grad_norm": 1.5226679105199643, + "learning_rate": 1.3047634700597845e-06, + "loss": 0.205, + "step": 5161 + }, + { + "epoch": 0.771945566023628, + "grad_norm": 1.6996586942607637, + "learning_rate": 1.3031324319742862e-06, + "loss": 0.3515, + "step": 5162 + }, + { + "epoch": 0.7720951099147599, + "grad_norm": 1.4850067551054624, + "learning_rate": 1.3015022612095462e-06, + "loss": 0.1903, + "step": 5163 + }, + { + "epoch": 0.772244653805892, + "grad_norm": 1.1231636882236478, + "learning_rate": 1.2998729581480196e-06, + "loss": 0.1425, + "step": 5164 + }, + { + "epoch": 0.7723941976970241, + "grad_norm": 1.7939064424526463, + "learning_rate": 1.2982445231719565e-06, + "loss": 0.3739, + "step": 5165 + }, + { + "epoch": 0.7725437415881561, + "grad_norm": 1.6489311898199002, + "learning_rate": 1.2966169566634029e-06, + "loss": 0.3142, + "step": 5166 + }, + { + "epoch": 0.7726932854792882, + "grad_norm": 1.785459582394644, + "learning_rate": 1.294990259004203e-06, + "loss": 0.2079, + "step": 5167 + }, + { + "epoch": 0.7728428293704203, + "grad_norm": 2.0711860253417345, + "learning_rate": 1.293364430575994e-06, + "loss": 0.6204, + "step": 5168 + }, + { + "epoch": 0.7729923732615522, + "grad_norm": 1.957511424168931, + "learning_rate": 1.2917394717602123e-06, + "loss": 0.3876, + "step": 5169 + }, + { + "epoch": 0.7731419171526843, + "grad_norm": 1.521492202699478, + "learning_rate": 1.290115382938088e-06, + "loss": 0.3246, + "step": 5170 + }, + { + "epoch": 0.7732914610438164, + "grad_norm": 1.3139500725240865, + "learning_rate": 1.288492164490649e-06, + "loss": 0.1721, + "step": 5171 + }, + { + "epoch": 0.7734410049349484, + "grad_norm": 1.5448621697348717, + "learning_rate": 1.286869816798718e-06, + "loss": 0.1732, + "step": 5172 + }, + { + "epoch": 0.7735905488260805, + "grad_norm": 1.3058555765008295, + "learning_rate": 1.2852483402429123e-06, + "loss": 0.1706, + "step": 5173 + }, + { + "epoch": 0.7737400927172124, + "grad_norm": 1.4533480811391561, + "learning_rate": 1.2836277352036458e-06, + "loss": 0.3222, + "step": 5174 + }, + { + "epoch": 0.7738896366083445, + "grad_norm": 1.784889376535602, + "learning_rate": 1.2820080020611287e-06, + "loss": 0.4126, + "step": 5175 + }, + { + "epoch": 0.7740391804994766, + "grad_norm": 1.6998787486087408, + "learning_rate": 1.2803891411953656e-06, + "loss": 0.2804, + "step": 5176 + }, + { + "epoch": 0.7741887243906086, + "grad_norm": 1.593636695723748, + "learning_rate": 1.278771152986159e-06, + "loss": 0.2147, + "step": 5177 + }, + { + "epoch": 0.7743382682817407, + "grad_norm": 1.978272034241196, + "learning_rate": 1.2771540378131015e-06, + "loss": 0.3492, + "step": 5178 + }, + { + "epoch": 0.7744878121728728, + "grad_norm": 1.9360723446212331, + "learning_rate": 1.2755377960555848e-06, + "loss": 0.1768, + "step": 5179 + }, + { + "epoch": 0.7746373560640047, + "grad_norm": 1.7891672511855676, + "learning_rate": 1.2739224280927959e-06, + "loss": 0.2124, + "step": 5180 + }, + { + "epoch": 0.7747868999551368, + "grad_norm": 2.1658464164443925, + "learning_rate": 1.2723079343037143e-06, + "loss": 0.6568, + "step": 5181 + }, + { + "epoch": 0.7749364438462689, + "grad_norm": 1.47056363116747, + "learning_rate": 1.2706943150671163e-06, + "loss": 0.1576, + "step": 5182 + }, + { + "epoch": 0.7750859877374009, + "grad_norm": 2.0532403498373704, + "learning_rate": 1.2690815707615727e-06, + "loss": 0.7706, + "step": 5183 + }, + { + "epoch": 0.775235531628533, + "grad_norm": 1.5654608710902007, + "learning_rate": 1.267469701765449e-06, + "loss": 0.2848, + "step": 5184 + }, + { + "epoch": 0.7753850755196651, + "grad_norm": 1.9066760024915341, + "learning_rate": 1.2658587084569052e-06, + "loss": 0.3584, + "step": 5185 + }, + { + "epoch": 0.775534619410797, + "grad_norm": 1.8489299239183394, + "learning_rate": 1.2642485912138952e-06, + "loss": 0.5046, + "step": 5186 + }, + { + "epoch": 0.7756841633019291, + "grad_norm": 1.853116095909097, + "learning_rate": 1.2626393504141703e-06, + "loss": 0.2301, + "step": 5187 + }, + { + "epoch": 0.7758337071930612, + "grad_norm": 1.6492497580832963, + "learning_rate": 1.261030986435271e-06, + "loss": 0.3376, + "step": 5188 + }, + { + "epoch": 0.7759832510841932, + "grad_norm": 2.0608610085499293, + "learning_rate": 1.2594234996545358e-06, + "loss": 0.4805, + "step": 5189 + }, + { + "epoch": 0.7761327949753253, + "grad_norm": 1.5281935374174347, + "learning_rate": 1.2578168904490961e-06, + "loss": 0.2001, + "step": 5190 + }, + { + "epoch": 0.7762823388664573, + "grad_norm": 1.4752945527057928, + "learning_rate": 1.2562111591958798e-06, + "loss": 0.1915, + "step": 5191 + }, + { + "epoch": 0.7764318827575893, + "grad_norm": 1.2257318903732652, + "learning_rate": 1.2546063062716069e-06, + "loss": 0.1994, + "step": 5192 + }, + { + "epoch": 0.7765814266487214, + "grad_norm": 1.3328539273933164, + "learning_rate": 1.2530023320527885e-06, + "loss": 0.1759, + "step": 5193 + }, + { + "epoch": 0.7767309705398534, + "grad_norm": 2.3149216704783524, + "learning_rate": 1.251399236915733e-06, + "loss": 0.3307, + "step": 5194 + }, + { + "epoch": 0.7768805144309855, + "grad_norm": 2.207985609196849, + "learning_rate": 1.2497970212365445e-06, + "loss": 0.3746, + "step": 5195 + }, + { + "epoch": 0.7770300583221176, + "grad_norm": 1.5457106552731106, + "learning_rate": 1.248195685391117e-06, + "loss": 0.2302, + "step": 5196 + }, + { + "epoch": 0.7771796022132496, + "grad_norm": 1.8337112808430374, + "learning_rate": 1.2465952297551392e-06, + "loss": 0.3605, + "step": 5197 + }, + { + "epoch": 0.7773291461043816, + "grad_norm": 1.1180432616700489, + "learning_rate": 1.2449956547040947e-06, + "loss": 0.1833, + "step": 5198 + }, + { + "epoch": 0.7774786899955137, + "grad_norm": 2.502925484334131, + "learning_rate": 1.2433969606132567e-06, + "loss": 0.2465, + "step": 5199 + }, + { + "epoch": 0.7776282338866457, + "grad_norm": 2.2065069740569, + "learning_rate": 1.241799147857695e-06, + "loss": 0.1764, + "step": 5200 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 1.4519395658790257, + "learning_rate": 1.2402022168122728e-06, + "loss": 0.1469, + "step": 5201 + }, + { + "epoch": 0.7779273216689099, + "grad_norm": 1.861219549064482, + "learning_rate": 1.2386061678516454e-06, + "loss": 0.3527, + "step": 5202 + }, + { + "epoch": 0.7780768655600419, + "grad_norm": 1.9902252706182828, + "learning_rate": 1.2370110013502619e-06, + "loss": 0.2107, + "step": 5203 + }, + { + "epoch": 0.7782264094511739, + "grad_norm": 2.769366718134359, + "learning_rate": 1.2354167176823617e-06, + "loss": 0.3197, + "step": 5204 + }, + { + "epoch": 0.7783759533423059, + "grad_norm": 1.866646748213707, + "learning_rate": 1.23382331722198e-06, + "loss": 0.1758, + "step": 5205 + }, + { + "epoch": 0.778525497233438, + "grad_norm": 1.7292566295186207, + "learning_rate": 1.232230800342944e-06, + "loss": 0.3464, + "step": 5206 + }, + { + "epoch": 0.7786750411245701, + "grad_norm": 1.9534158032411064, + "learning_rate": 1.2306391674188733e-06, + "loss": 0.4686, + "step": 5207 + }, + { + "epoch": 0.7788245850157021, + "grad_norm": 2.059181429892903, + "learning_rate": 1.2290484188231817e-06, + "loss": 0.4902, + "step": 5208 + }, + { + "epoch": 0.7789741289068342, + "grad_norm": 1.641058928007562, + "learning_rate": 1.2274585549290718e-06, + "loss": 0.2876, + "step": 5209 + }, + { + "epoch": 0.7791236727979662, + "grad_norm": 1.6929071960032525, + "learning_rate": 1.225869576109543e-06, + "loss": 0.1776, + "step": 5210 + }, + { + "epoch": 0.7792732166890982, + "grad_norm": 1.6253245698964378, + "learning_rate": 1.2242814827373844e-06, + "loss": 0.3526, + "step": 5211 + }, + { + "epoch": 0.7794227605802303, + "grad_norm": 2.04295837808168, + "learning_rate": 1.2226942751851773e-06, + "loss": 0.5467, + "step": 5212 + }, + { + "epoch": 0.7795723044713624, + "grad_norm": 1.743780870061826, + "learning_rate": 1.2211079538252985e-06, + "loss": 0.334, + "step": 5213 + }, + { + "epoch": 0.7797218483624944, + "grad_norm": 1.9591199498980094, + "learning_rate": 1.2195225190299108e-06, + "loss": 0.4594, + "step": 5214 + }, + { + "epoch": 0.7798713922536264, + "grad_norm": 1.7219054473843514, + "learning_rate": 1.2179379711709738e-06, + "loss": 0.3348, + "step": 5215 + }, + { + "epoch": 0.7800209361447585, + "grad_norm": 2.093883168554664, + "learning_rate": 1.216354310620238e-06, + "loss": 0.2057, + "step": 5216 + }, + { + "epoch": 0.7801704800358905, + "grad_norm": 1.7834362366509595, + "learning_rate": 1.2147715377492453e-06, + "loss": 0.4279, + "step": 5217 + }, + { + "epoch": 0.7803200239270226, + "grad_norm": 1.926663024705307, + "learning_rate": 1.2131896529293307e-06, + "loss": 0.2761, + "step": 5218 + }, + { + "epoch": 0.7804695678181546, + "grad_norm": 1.96588487893358, + "learning_rate": 1.2116086565316172e-06, + "loss": 0.4397, + "step": 5219 + }, + { + "epoch": 0.7806191117092867, + "grad_norm": 1.5321799903925233, + "learning_rate": 1.210028548927023e-06, + "loss": 0.2289, + "step": 5220 + }, + { + "epoch": 0.7807686556004187, + "grad_norm": 1.8998642571797673, + "learning_rate": 1.2084493304862566e-06, + "loss": 0.3277, + "step": 5221 + }, + { + "epoch": 0.7809181994915507, + "grad_norm": 1.9600185581588903, + "learning_rate": 1.2068710015798173e-06, + "loss": 0.2714, + "step": 5222 + }, + { + "epoch": 0.7810677433826828, + "grad_norm": 1.7864009599583153, + "learning_rate": 1.2052935625779971e-06, + "loss": 0.1933, + "step": 5223 + }, + { + "epoch": 0.7812172872738149, + "grad_norm": 1.869397734653961, + "learning_rate": 1.2037170138508785e-06, + "loss": 0.4623, + "step": 5224 + }, + { + "epoch": 0.7813668311649469, + "grad_norm": 1.4205112519649394, + "learning_rate": 1.2021413557683341e-06, + "loss": 0.1947, + "step": 5225 + }, + { + "epoch": 0.781516375056079, + "grad_norm": 2.047271238805902, + "learning_rate": 1.200566588700029e-06, + "loss": 0.4764, + "step": 5226 + }, + { + "epoch": 0.781665918947211, + "grad_norm": 1.7536964843303968, + "learning_rate": 1.1989927130154188e-06, + "loss": 0.1888, + "step": 5227 + }, + { + "epoch": 0.781815462838343, + "grad_norm": 1.3080390665559112, + "learning_rate": 1.1974197290837513e-06, + "loss": 0.214, + "step": 5228 + }, + { + "epoch": 0.7819650067294751, + "grad_norm": 1.8553890211141864, + "learning_rate": 1.1958476372740613e-06, + "loss": 0.4046, + "step": 5229 + }, + { + "epoch": 0.7821145506206072, + "grad_norm": 1.2388166720119576, + "learning_rate": 1.194276437955177e-06, + "loss": 0.2053, + "step": 5230 + }, + { + "epoch": 0.7822640945117392, + "grad_norm": 1.8911900345926427, + "learning_rate": 1.1927061314957173e-06, + "loss": 0.1435, + "step": 5231 + }, + { + "epoch": 0.7824136384028713, + "grad_norm": 1.8591047459231786, + "learning_rate": 1.191136718264092e-06, + "loss": 0.5541, + "step": 5232 + }, + { + "epoch": 0.7825631822940032, + "grad_norm": 2.1194471568246764, + "learning_rate": 1.1895681986285013e-06, + "loss": 0.1896, + "step": 5233 + }, + { + "epoch": 0.7827127261851353, + "grad_norm": 1.5309028231915267, + "learning_rate": 1.1880005729569305e-06, + "loss": 0.2254, + "step": 5234 + }, + { + "epoch": 0.7828622700762674, + "grad_norm": 1.5598996459031256, + "learning_rate": 1.1864338416171645e-06, + "loss": 0.3328, + "step": 5235 + }, + { + "epoch": 0.7830118139673994, + "grad_norm": 1.474528440565962, + "learning_rate": 1.184868004976772e-06, + "loss": 0.2349, + "step": 5236 + }, + { + "epoch": 0.7831613578585315, + "grad_norm": 1.6887170014803405, + "learning_rate": 1.1833030634031133e-06, + "loss": 0.1614, + "step": 5237 + }, + { + "epoch": 0.7833109017496636, + "grad_norm": 1.979797844043359, + "learning_rate": 1.1817390172633402e-06, + "loss": 0.1771, + "step": 5238 + }, + { + "epoch": 0.7834604456407955, + "grad_norm": 2.022927768675501, + "learning_rate": 1.1801758669243906e-06, + "loss": 0.1835, + "step": 5239 + }, + { + "epoch": 0.7836099895319276, + "grad_norm": 1.6051070044772568, + "learning_rate": 1.1786136127529956e-06, + "loss": 0.1793, + "step": 5240 + }, + { + "epoch": 0.7837595334230597, + "grad_norm": 1.5363371968334916, + "learning_rate": 1.1770522551156749e-06, + "loss": 0.179, + "step": 5241 + }, + { + "epoch": 0.7839090773141917, + "grad_norm": 1.3112160726563684, + "learning_rate": 1.175491794378738e-06, + "loss": 0.1864, + "step": 5242 + }, + { + "epoch": 0.7840586212053238, + "grad_norm": 2.0595831619664957, + "learning_rate": 1.1739322309082863e-06, + "loss": 0.329, + "step": 5243 + }, + { + "epoch": 0.7842081650964559, + "grad_norm": 1.7661938552680008, + "learning_rate": 1.172373565070205e-06, + "loss": 0.3183, + "step": 5244 + }, + { + "epoch": 0.7843577089875878, + "grad_norm": 1.789087420294166, + "learning_rate": 1.170815797230173e-06, + "loss": 0.4763, + "step": 5245 + }, + { + "epoch": 0.7845072528787199, + "grad_norm": 2.0256151536550973, + "learning_rate": 1.1692589277536587e-06, + "loss": 0.3072, + "step": 5246 + }, + { + "epoch": 0.784656796769852, + "grad_norm": 1.1749305277181807, + "learning_rate": 1.1677029570059179e-06, + "loss": 0.2655, + "step": 5247 + }, + { + "epoch": 0.784806340660984, + "grad_norm": 1.679136798094466, + "learning_rate": 1.166147885351997e-06, + "loss": 0.1859, + "step": 5248 + }, + { + "epoch": 0.7849558845521161, + "grad_norm": 1.5237961664786974, + "learning_rate": 1.1645937131567303e-06, + "loss": 0.3246, + "step": 5249 + }, + { + "epoch": 0.785105428443248, + "grad_norm": 2.0072470584961386, + "learning_rate": 1.1630404407847412e-06, + "loss": 0.368, + "step": 5250 + }, + { + "epoch": 0.7852549723343801, + "grad_norm": 1.885681017017375, + "learning_rate": 1.1614880686004438e-06, + "loss": 0.5029, + "step": 5251 + }, + { + "epoch": 0.7854045162255122, + "grad_norm": 1.491129645009784, + "learning_rate": 1.1599365969680376e-06, + "loss": 0.2128, + "step": 5252 + }, + { + "epoch": 0.7855540601166442, + "grad_norm": 1.5958841470973304, + "learning_rate": 1.1583860262515162e-06, + "loss": 0.2492, + "step": 5253 + }, + { + "epoch": 0.7857036040077763, + "grad_norm": 1.2571700036739464, + "learning_rate": 1.156836356814654e-06, + "loss": 0.1872, + "step": 5254 + }, + { + "epoch": 0.7858531478989084, + "grad_norm": 1.1067274429193164, + "learning_rate": 1.1552875890210208e-06, + "loss": 0.1585, + "step": 5255 + }, + { + "epoch": 0.7860026917900403, + "grad_norm": 1.925194296846744, + "learning_rate": 1.153739723233972e-06, + "loss": 0.3717, + "step": 5256 + }, + { + "epoch": 0.7861522356811724, + "grad_norm": 1.2686743088868078, + "learning_rate": 1.152192759816652e-06, + "loss": 0.1384, + "step": 5257 + }, + { + "epoch": 0.7863017795723045, + "grad_norm": 1.7201319860534536, + "learning_rate": 1.1506466991319948e-06, + "loss": 0.318, + "step": 5258 + }, + { + "epoch": 0.7864513234634365, + "grad_norm": 2.092071828571842, + "learning_rate": 1.1491015415427176e-06, + "loss": 0.466, + "step": 5259 + }, + { + "epoch": 0.7866008673545686, + "grad_norm": 1.536769538548223, + "learning_rate": 1.1475572874113317e-06, + "loss": 0.3516, + "step": 5260 + }, + { + "epoch": 0.7867504112457007, + "grad_norm": 2.0644681744130726, + "learning_rate": 1.1460139371001339e-06, + "loss": 0.2093, + "step": 5261 + }, + { + "epoch": 0.7868999551368326, + "grad_norm": 1.3289832091207237, + "learning_rate": 1.1444714909712085e-06, + "loss": 0.179, + "step": 5262 + }, + { + "epoch": 0.7870494990279647, + "grad_norm": 2.2028406852534927, + "learning_rate": 1.1429299493864283e-06, + "loss": 0.359, + "step": 5263 + }, + { + "epoch": 0.7871990429190967, + "grad_norm": 1.9810878662502185, + "learning_rate": 1.1413893127074537e-06, + "loss": 0.2074, + "step": 5264 + }, + { + "epoch": 0.7873485868102288, + "grad_norm": 1.6394785644797076, + "learning_rate": 1.1398495812957333e-06, + "loss": 0.326, + "step": 5265 + }, + { + "epoch": 0.7874981307013609, + "grad_norm": 1.8364777318031276, + "learning_rate": 1.1383107555125033e-06, + "loss": 0.2928, + "step": 5266 + }, + { + "epoch": 0.7876476745924929, + "grad_norm": 1.5493249757699845, + "learning_rate": 1.1367728357187863e-06, + "loss": 0.2279, + "step": 5267 + }, + { + "epoch": 0.7877972184836249, + "grad_norm": 1.641815970339515, + "learning_rate": 1.1352358222753944e-06, + "loss": 0.1938, + "step": 5268 + }, + { + "epoch": 0.787946762374757, + "grad_norm": 2.37354184235881, + "learning_rate": 1.1336997155429235e-06, + "loss": 0.1637, + "step": 5269 + }, + { + "epoch": 0.788096306265889, + "grad_norm": 1.7033674653722883, + "learning_rate": 1.1321645158817607e-06, + "loss": 0.1882, + "step": 5270 + }, + { + "epoch": 0.7882458501570211, + "grad_norm": 1.2846955702747882, + "learning_rate": 1.1306302236520778e-06, + "loss": 0.1998, + "step": 5271 + }, + { + "epoch": 0.7883953940481532, + "grad_norm": 1.3745056043797175, + "learning_rate": 1.1290968392138351e-06, + "loss": 0.1856, + "step": 5272 + }, + { + "epoch": 0.7885449379392852, + "grad_norm": 1.8410278810400311, + "learning_rate": 1.1275643629267808e-06, + "loss": 0.354, + "step": 5273 + }, + { + "epoch": 0.7886944818304172, + "grad_norm": 1.5622941167611866, + "learning_rate": 1.126032795150445e-06, + "loss": 0.179, + "step": 5274 + }, + { + "epoch": 0.7888440257215493, + "grad_norm": 1.1947229651920328, + "learning_rate": 1.12450213624415e-06, + "loss": 0.2075, + "step": 5275 + }, + { + "epoch": 0.7889935696126813, + "grad_norm": 1.7705591933910914, + "learning_rate": 1.122972386567004e-06, + "loss": 0.3807, + "step": 5276 + }, + { + "epoch": 0.7891431135038134, + "grad_norm": 1.294351141284705, + "learning_rate": 1.1214435464779006e-06, + "loss": 0.1878, + "step": 5277 + }, + { + "epoch": 0.7892926573949454, + "grad_norm": 1.4514453727762606, + "learning_rate": 1.1199156163355214e-06, + "loss": 0.1872, + "step": 5278 + }, + { + "epoch": 0.7894422012860774, + "grad_norm": 1.6565436809106264, + "learning_rate": 1.1183885964983316e-06, + "loss": 0.3287, + "step": 5279 + }, + { + "epoch": 0.7895917451772095, + "grad_norm": 2.0008845608339176, + "learning_rate": 1.1168624873245848e-06, + "loss": 0.1799, + "step": 5280 + }, + { + "epoch": 0.7897412890683415, + "grad_norm": 2.182714868872967, + "learning_rate": 1.1153372891723225e-06, + "loss": 0.2159, + "step": 5281 + }, + { + "epoch": 0.7898908329594736, + "grad_norm": 1.1527198307124225, + "learning_rate": 1.1138130023993692e-06, + "loss": 0.1631, + "step": 5282 + }, + { + "epoch": 0.7900403768506057, + "grad_norm": 1.7822532798123156, + "learning_rate": 1.1122896273633387e-06, + "loss": 0.299, + "step": 5283 + }, + { + "epoch": 0.7901899207417377, + "grad_norm": 1.1673369724310274, + "learning_rate": 1.1107671644216305e-06, + "loss": 0.1843, + "step": 5284 + }, + { + "epoch": 0.7903394646328697, + "grad_norm": 1.8090909105667299, + "learning_rate": 1.1092456139314257e-06, + "loss": 0.2422, + "step": 5285 + }, + { + "epoch": 0.7904890085240018, + "grad_norm": 2.3212239437253417, + "learning_rate": 1.1077249762496966e-06, + "loss": 0.2279, + "step": 5286 + }, + { + "epoch": 0.7906385524151338, + "grad_norm": 1.7897473069927503, + "learning_rate": 1.1062052517331995e-06, + "loss": 0.484, + "step": 5287 + }, + { + "epoch": 0.7907880963062659, + "grad_norm": 2.015197139247361, + "learning_rate": 1.1046864407384754e-06, + "loss": 0.184, + "step": 5288 + }, + { + "epoch": 0.790937640197398, + "grad_norm": 1.688163150185381, + "learning_rate": 1.1031685436218541e-06, + "loss": 0.3039, + "step": 5289 + }, + { + "epoch": 0.79108718408853, + "grad_norm": 1.9066573434763772, + "learning_rate": 1.1016515607394469e-06, + "loss": 0.3647, + "step": 5290 + }, + { + "epoch": 0.791236727979662, + "grad_norm": 1.8738226015137964, + "learning_rate": 1.1001354924471536e-06, + "loss": 0.4081, + "step": 5291 + }, + { + "epoch": 0.7913862718707941, + "grad_norm": 1.7435080907369735, + "learning_rate": 1.0986203391006584e-06, + "loss": 0.1527, + "step": 5292 + }, + { + "epoch": 0.7915358157619261, + "grad_norm": 1.3645650617332317, + "learning_rate": 1.09710610105543e-06, + "loss": 0.1843, + "step": 5293 + }, + { + "epoch": 0.7916853596530582, + "grad_norm": 2.0670218389079014, + "learning_rate": 1.0955927786667259e-06, + "loss": 0.3774, + "step": 5294 + }, + { + "epoch": 0.7918349035441902, + "grad_norm": 1.8569683078727917, + "learning_rate": 1.0940803722895826e-06, + "loss": 0.2881, + "step": 5295 + }, + { + "epoch": 0.7919844474353223, + "grad_norm": 1.9175298153574774, + "learning_rate": 1.0925688822788266e-06, + "loss": 0.5207, + "step": 5296 + }, + { + "epoch": 0.7921339913264543, + "grad_norm": 1.7749496684527544, + "learning_rate": 1.091058308989068e-06, + "loss": 0.2527, + "step": 5297 + }, + { + "epoch": 0.7922835352175863, + "grad_norm": 1.2217285731938283, + "learning_rate": 1.0895486527747023e-06, + "loss": 0.1754, + "step": 5298 + }, + { + "epoch": 0.7924330791087184, + "grad_norm": 1.6995366117086494, + "learning_rate": 1.0880399139899105e-06, + "loss": 0.1996, + "step": 5299 + }, + { + "epoch": 0.7925826229998505, + "grad_norm": 1.50247326241567, + "learning_rate": 1.0865320929886542e-06, + "loss": 0.135, + "step": 5300 + }, + { + "epoch": 0.7927321668909825, + "grad_norm": 1.635063768588377, + "learning_rate": 1.0850251901246844e-06, + "loss": 0.3197, + "step": 5301 + }, + { + "epoch": 0.7928817107821146, + "grad_norm": 1.8838567884738247, + "learning_rate": 1.0835192057515353e-06, + "loss": 0.2191, + "step": 5302 + }, + { + "epoch": 0.7930312546732466, + "grad_norm": 1.3004964224536268, + "learning_rate": 1.0820141402225253e-06, + "loss": 0.2527, + "step": 5303 + }, + { + "epoch": 0.7931807985643786, + "grad_norm": 1.6665155660163822, + "learning_rate": 1.0805099938907566e-06, + "loss": 0.2872, + "step": 5304 + }, + { + "epoch": 0.7933303424555107, + "grad_norm": 1.903361555409123, + "learning_rate": 1.079006767109117e-06, + "loss": 0.4715, + "step": 5305 + }, + { + "epoch": 0.7934798863466428, + "grad_norm": 1.406430656696842, + "learning_rate": 1.0775044602302781e-06, + "loss": 0.2823, + "step": 5306 + }, + { + "epoch": 0.7936294302377748, + "grad_norm": 1.7277035890924473, + "learning_rate": 1.0760030736066952e-06, + "loss": 0.3272, + "step": 5307 + }, + { + "epoch": 0.7937789741289069, + "grad_norm": 1.1554649336199252, + "learning_rate": 1.0745026075906078e-06, + "loss": 0.1612, + "step": 5308 + }, + { + "epoch": 0.7939285180200388, + "grad_norm": 1.7791123530058242, + "learning_rate": 1.073003062534042e-06, + "loss": 0.308, + "step": 5309 + }, + { + "epoch": 0.7940780619111709, + "grad_norm": 1.9658245876851332, + "learning_rate": 1.0715044387888024e-06, + "loss": 0.2799, + "step": 5310 + }, + { + "epoch": 0.794227605802303, + "grad_norm": 1.6959397258782598, + "learning_rate": 1.0700067367064814e-06, + "loss": 0.2732, + "step": 5311 + }, + { + "epoch": 0.794377149693435, + "grad_norm": 1.81531320314136, + "learning_rate": 1.0685099566384548e-06, + "loss": 0.3395, + "step": 5312 + }, + { + "epoch": 0.7945266935845671, + "grad_norm": 1.847318033631291, + "learning_rate": 1.0670140989358818e-06, + "loss": 0.1702, + "step": 5313 + }, + { + "epoch": 0.7946762374756992, + "grad_norm": 2.0257824488414506, + "learning_rate": 1.0655191639497058e-06, + "loss": 0.3722, + "step": 5314 + }, + { + "epoch": 0.7948257813668311, + "grad_norm": 1.4192797285786958, + "learning_rate": 1.0640251520306493e-06, + "loss": 0.2118, + "step": 5315 + }, + { + "epoch": 0.7949753252579632, + "grad_norm": 1.8984963639477945, + "learning_rate": 1.0625320635292263e-06, + "loss": 0.1579, + "step": 5316 + }, + { + "epoch": 0.7951248691490953, + "grad_norm": 1.5382028977262527, + "learning_rate": 1.0610398987957277e-06, + "loss": 0.2171, + "step": 5317 + }, + { + "epoch": 0.7952744130402273, + "grad_norm": 1.7690670255014513, + "learning_rate": 1.0595486581802305e-06, + "loss": 0.2159, + "step": 5318 + }, + { + "epoch": 0.7954239569313594, + "grad_norm": 1.432027975142127, + "learning_rate": 1.0580583420325946e-06, + "loss": 0.1665, + "step": 5319 + }, + { + "epoch": 0.7955735008224915, + "grad_norm": 1.648537122057397, + "learning_rate": 1.05656895070246e-06, + "loss": 0.2823, + "step": 5320 + }, + { + "epoch": 0.7957230447136234, + "grad_norm": 2.1181607138649907, + "learning_rate": 1.0550804845392542e-06, + "loss": 0.3154, + "step": 5321 + }, + { + "epoch": 0.7958725886047555, + "grad_norm": 1.823107673084047, + "learning_rate": 1.0535929438921854e-06, + "loss": 0.4188, + "step": 5322 + }, + { + "epoch": 0.7960221324958875, + "grad_norm": 1.3206600685401113, + "learning_rate": 1.0521063291102446e-06, + "loss": 0.1963, + "step": 5323 + }, + { + "epoch": 0.7961716763870196, + "grad_norm": 1.6798018571826112, + "learning_rate": 1.050620640542208e-06, + "loss": 0.4739, + "step": 5324 + }, + { + "epoch": 0.7963212202781517, + "grad_norm": 1.9935111342215572, + "learning_rate": 1.049135878536629e-06, + "loss": 0.5629, + "step": 5325 + }, + { + "epoch": 0.7964707641692836, + "grad_norm": 2.406131698127812, + "learning_rate": 1.0476520434418492e-06, + "loss": 0.5238, + "step": 5326 + }, + { + "epoch": 0.7966203080604157, + "grad_norm": 1.413432708140045, + "learning_rate": 1.04616913560599e-06, + "loss": 0.3231, + "step": 5327 + }, + { + "epoch": 0.7967698519515478, + "grad_norm": 1.66376053841891, + "learning_rate": 1.0446871553769562e-06, + "loss": 0.3455, + "step": 5328 + }, + { + "epoch": 0.7969193958426798, + "grad_norm": 1.5649481834229166, + "learning_rate": 1.0432061031024343e-06, + "loss": 0.2248, + "step": 5329 + }, + { + "epoch": 0.7970689397338119, + "grad_norm": 1.7736064981609523, + "learning_rate": 1.041725979129894e-06, + "loss": 0.3487, + "step": 5330 + }, + { + "epoch": 0.797218483624944, + "grad_norm": 1.9953614177398118, + "learning_rate": 1.040246783806586e-06, + "loss": 0.4229, + "step": 5331 + }, + { + "epoch": 0.7973680275160759, + "grad_norm": 1.58227968973177, + "learning_rate": 1.0387685174795443e-06, + "loss": 0.1912, + "step": 5332 + }, + { + "epoch": 0.797517571407208, + "grad_norm": 1.4695702098932486, + "learning_rate": 1.0372911804955844e-06, + "loss": 0.1987, + "step": 5333 + }, + { + "epoch": 0.7976671152983401, + "grad_norm": 1.640238457511369, + "learning_rate": 1.0358147732013046e-06, + "loss": 0.1873, + "step": 5334 + }, + { + "epoch": 0.7978166591894721, + "grad_norm": 1.4936634174296828, + "learning_rate": 1.0343392959430826e-06, + "loss": 0.1916, + "step": 5335 + }, + { + "epoch": 0.7979662030806042, + "grad_norm": 1.2395120514748919, + "learning_rate": 1.0328647490670795e-06, + "loss": 0.1748, + "step": 5336 + }, + { + "epoch": 0.7981157469717362, + "grad_norm": 1.8154926750825382, + "learning_rate": 1.0313911329192393e-06, + "loss": 0.3312, + "step": 5337 + }, + { + "epoch": 0.7982652908628682, + "grad_norm": 2.1376464545398655, + "learning_rate": 1.0299184478452861e-06, + "loss": 0.3098, + "step": 5338 + }, + { + "epoch": 0.7984148347540003, + "grad_norm": 2.4417223184078236, + "learning_rate": 1.0284466941907278e-06, + "loss": 0.4359, + "step": 5339 + }, + { + "epoch": 0.7985643786451323, + "grad_norm": 1.6721365002201523, + "learning_rate": 1.0269758723008487e-06, + "loss": 0.3503, + "step": 5340 + }, + { + "epoch": 0.7987139225362644, + "grad_norm": 1.8985931112931775, + "learning_rate": 1.0255059825207197e-06, + "loss": 0.2176, + "step": 5341 + }, + { + "epoch": 0.7988634664273965, + "grad_norm": 1.8160831656299763, + "learning_rate": 1.024037025195191e-06, + "loss": 0.2219, + "step": 5342 + }, + { + "epoch": 0.7990130103185284, + "grad_norm": 1.910633721773146, + "learning_rate": 1.0225690006688938e-06, + "loss": 0.3561, + "step": 5343 + }, + { + "epoch": 0.7991625542096605, + "grad_norm": 1.5888253964377586, + "learning_rate": 1.0211019092862411e-06, + "loss": 0.1843, + "step": 5344 + }, + { + "epoch": 0.7993120981007926, + "grad_norm": 1.5931030344056767, + "learning_rate": 1.0196357513914261e-06, + "loss": 0.2058, + "step": 5345 + }, + { + "epoch": 0.7994616419919246, + "grad_norm": 2.1545258276730967, + "learning_rate": 1.0181705273284247e-06, + "loss": 0.6205, + "step": 5346 + }, + { + "epoch": 0.7996111858830567, + "grad_norm": 1.5763172805820727, + "learning_rate": 1.0167062374409914e-06, + "loss": 0.2119, + "step": 5347 + }, + { + "epoch": 0.7997607297741888, + "grad_norm": 1.9360623024652444, + "learning_rate": 1.0152428820726635e-06, + "loss": 0.2238, + "step": 5348 + }, + { + "epoch": 0.7999102736653207, + "grad_norm": 1.2466634924719853, + "learning_rate": 1.0137804615667596e-06, + "loss": 0.1847, + "step": 5349 + }, + { + "epoch": 0.8000598175564528, + "grad_norm": 2.260903832607403, + "learning_rate": 1.0123189762663743e-06, + "loss": 0.6352, + "step": 5350 + }, + { + "epoch": 0.8002093614475849, + "grad_norm": 1.6975391900562669, + "learning_rate": 1.0108584265143878e-06, + "loss": 0.2045, + "step": 5351 + }, + { + "epoch": 0.8003589053387169, + "grad_norm": 2.012799448635203, + "learning_rate": 1.0093988126534598e-06, + "loss": 0.3329, + "step": 5352 + } + ], + "logging_steps": 1.0, + "max_steps": 6687, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 669, + "total_flos": 333058895462400.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}