diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4704 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.10004486316733961, + "eval_steps": 500, + "global_step": 669, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00014954389113204725, + "grad_norm": 35.95169344376715, + "learning_rate": 4.975124378109453e-08, + "loss": 1.1911, + "step": 1 + }, + { + "epoch": 0.0002990877822640945, + "grad_norm": 29.047342527504238, + "learning_rate": 9.950248756218906e-08, + "loss": 1.4707, + "step": 2 + }, + { + "epoch": 0.00044863167339614175, + "grad_norm": 24.718727160032117, + "learning_rate": 1.4925373134328358e-07, + "loss": 0.9534, + "step": 3 + }, + { + "epoch": 0.000598175564528189, + "grad_norm": 32.87218994198639, + "learning_rate": 1.9900497512437812e-07, + "loss": 1.2192, + "step": 4 + }, + { + "epoch": 0.0007477194556602363, + "grad_norm": 25.398344980222138, + "learning_rate": 2.4875621890547267e-07, + "loss": 1.1835, + "step": 5 + }, + { + "epoch": 0.0008972633467922835, + "grad_norm": 30.48079389364258, + "learning_rate": 2.9850746268656716e-07, + "loss": 1.0024, + "step": 6 + }, + { + "epoch": 0.0010468072379243307, + "grad_norm": 27.780032565686206, + "learning_rate": 3.4825870646766175e-07, + "loss": 1.1796, + "step": 7 + }, + { + "epoch": 0.001196351129056378, + "grad_norm": 33.19634259772052, + "learning_rate": 3.9800995024875624e-07, + "loss": 0.9585, + "step": 8 + }, + { + "epoch": 0.0013458950201884253, + "grad_norm": 32.92097675417938, + "learning_rate": 4.4776119402985074e-07, + "loss": 1.1831, + "step": 9 + }, + { + "epoch": 0.0014954389113204726, + "grad_norm": 31.267461918177617, + "learning_rate": 4.975124378109453e-07, + "loss": 0.9208, + "step": 10 + }, + { + "epoch": 0.0016449828024525197, + "grad_norm": 31.652990928454088, + "learning_rate": 5.472636815920398e-07, + "loss": 0.8882, + "step": 11 + }, + { + "epoch": 0.001794526693584567, + "grad_norm": 33.800482625732165, + "learning_rate": 5.970149253731343e-07, + "loss": 1.2138, + "step": 12 + }, + { + "epoch": 0.0019440705847166143, + "grad_norm": 30.753216086819556, + "learning_rate": 6.467661691542289e-07, + "loss": 0.9896, + "step": 13 + }, + { + "epoch": 0.0020936144758486614, + "grad_norm": 32.57679525538582, + "learning_rate": 6.965174129353235e-07, + "loss": 0.9195, + "step": 14 + }, + { + "epoch": 0.0022431583669807087, + "grad_norm": 25.334089702892793, + "learning_rate": 7.462686567164179e-07, + "loss": 0.7515, + "step": 15 + }, + { + "epoch": 0.002392702258112756, + "grad_norm": 22.2961872211284, + "learning_rate": 7.960199004975125e-07, + "loss": 0.6638, + "step": 16 + }, + { + "epoch": 0.0025422461492448033, + "grad_norm": 24.245556768411276, + "learning_rate": 8.457711442786071e-07, + "loss": 0.7704, + "step": 17 + }, + { + "epoch": 0.0026917900403768506, + "grad_norm": 19.23412917202397, + "learning_rate": 8.955223880597015e-07, + "loss": 0.7354, + "step": 18 + }, + { + "epoch": 0.002841333931508898, + "grad_norm": 18.58051317424024, + "learning_rate": 9.452736318407961e-07, + "loss": 0.5749, + "step": 19 + }, + { + "epoch": 0.0029908778226409452, + "grad_norm": 11.242228896944281, + "learning_rate": 9.950248756218907e-07, + "loss": 0.4914, + "step": 20 + }, + { + "epoch": 0.0031404217137729925, + "grad_norm": 11.163527479225325, + "learning_rate": 1.044776119402985e-06, + "loss": 0.5823, + "step": 21 + }, + { + "epoch": 0.0032899656049050394, + "grad_norm": 9.100766388616314, + "learning_rate": 1.0945273631840796e-06, + "loss": 0.6887, + "step": 22 + }, + { + "epoch": 0.0034395094960370867, + "grad_norm": 9.371427313022828, + "learning_rate": 1.1442786069651742e-06, + "loss": 0.3365, + "step": 23 + }, + { + "epoch": 0.003589053387169134, + "grad_norm": 6.591365654298028, + "learning_rate": 1.1940298507462686e-06, + "loss": 0.4092, + "step": 24 + }, + { + "epoch": 0.0037385972783011813, + "grad_norm": 6.692920733889971, + "learning_rate": 1.2437810945273632e-06, + "loss": 0.4459, + "step": 25 + }, + { + "epoch": 0.0038881411694332286, + "grad_norm": 6.609492289627464, + "learning_rate": 1.2935323383084578e-06, + "loss": 0.4577, + "step": 26 + }, + { + "epoch": 0.004037685060565276, + "grad_norm": 4.9115623336358, + "learning_rate": 1.3432835820895524e-06, + "loss": 0.5349, + "step": 27 + }, + { + "epoch": 0.004187228951697323, + "grad_norm": 5.117676678055004, + "learning_rate": 1.393034825870647e-06, + "loss": 0.5483, + "step": 28 + }, + { + "epoch": 0.0043367728428293706, + "grad_norm": 5.263481949191207, + "learning_rate": 1.4427860696517414e-06, + "loss": 0.5991, + "step": 29 + }, + { + "epoch": 0.004486316733961417, + "grad_norm": 6.131569220022702, + "learning_rate": 1.4925373134328358e-06, + "loss": 0.3908, + "step": 30 + }, + { + "epoch": 0.004635860625093465, + "grad_norm": 5.928579435490833, + "learning_rate": 1.5422885572139304e-06, + "loss": 0.2084, + "step": 31 + }, + { + "epoch": 0.004785404516225512, + "grad_norm": 5.916757088180695, + "learning_rate": 1.592039800995025e-06, + "loss": 0.3858, + "step": 32 + }, + { + "epoch": 0.00493494840735756, + "grad_norm": 8.20423570651997, + "learning_rate": 1.6417910447761196e-06, + "loss": 0.2901, + "step": 33 + }, + { + "epoch": 0.005084492298489607, + "grad_norm": 8.219360009824356, + "learning_rate": 1.6915422885572142e-06, + "loss": 0.3919, + "step": 34 + }, + { + "epoch": 0.005234036189621654, + "grad_norm": 5.998450714995048, + "learning_rate": 1.7412935323383088e-06, + "loss": 0.2445, + "step": 35 + }, + { + "epoch": 0.005383580080753701, + "grad_norm": 4.267389037528284, + "learning_rate": 1.791044776119403e-06, + "loss": 0.2062, + "step": 36 + }, + { + "epoch": 0.005533123971885748, + "grad_norm": 5.463746992191978, + "learning_rate": 1.8407960199004975e-06, + "loss": 0.5357, + "step": 37 + }, + { + "epoch": 0.005682667863017796, + "grad_norm": 4.306281637510176, + "learning_rate": 1.8905472636815921e-06, + "loss": 0.1867, + "step": 38 + }, + { + "epoch": 0.005832211754149843, + "grad_norm": 6.551059942168939, + "learning_rate": 1.9402985074626867e-06, + "loss": 0.5944, + "step": 39 + }, + { + "epoch": 0.0059817556452818905, + "grad_norm": 6.110559490141819, + "learning_rate": 1.9900497512437813e-06, + "loss": 0.6173, + "step": 40 + }, + { + "epoch": 0.006131299536413937, + "grad_norm": 4.577457366278138, + "learning_rate": 2.0398009950248755e-06, + "loss": 0.3634, + "step": 41 + }, + { + "epoch": 0.006280843427545985, + "grad_norm": 6.020057986889502, + "learning_rate": 2.08955223880597e-06, + "loss": 0.5398, + "step": 42 + }, + { + "epoch": 0.006430387318678032, + "grad_norm": 12.119213807947853, + "learning_rate": 2.1393034825870647e-06, + "loss": 0.2376, + "step": 43 + }, + { + "epoch": 0.006579931209810079, + "grad_norm": 4.977979102095054, + "learning_rate": 2.1890547263681593e-06, + "loss": 0.2455, + "step": 44 + }, + { + "epoch": 0.006729475100942127, + "grad_norm": 3.4274663141099166, + "learning_rate": 2.238805970149254e-06, + "loss": 0.2356, + "step": 45 + }, + { + "epoch": 0.0068790189920741734, + "grad_norm": 4.552279062958819, + "learning_rate": 2.2885572139303485e-06, + "loss": 0.1681, + "step": 46 + }, + { + "epoch": 0.007028562883206221, + "grad_norm": 2.9323320786902496, + "learning_rate": 2.338308457711443e-06, + "loss": 0.2303, + "step": 47 + }, + { + "epoch": 0.007178106774338268, + "grad_norm": 4.623033466327724, + "learning_rate": 2.3880597014925373e-06, + "loss": 0.2404, + "step": 48 + }, + { + "epoch": 0.007327650665470316, + "grad_norm": 5.05007020882628, + "learning_rate": 2.437810945273632e-06, + "loss": 0.4128, + "step": 49 + }, + { + "epoch": 0.007477194556602363, + "grad_norm": 2.5237349934200273, + "learning_rate": 2.4875621890547264e-06, + "loss": 0.2196, + "step": 50 + }, + { + "epoch": 0.00762673844773441, + "grad_norm": 3.7483142878646594, + "learning_rate": 2.537313432835821e-06, + "loss": 0.1725, + "step": 51 + }, + { + "epoch": 0.007776282338866457, + "grad_norm": 4.032155563605261, + "learning_rate": 2.5870646766169156e-06, + "loss": 0.3821, + "step": 52 + }, + { + "epoch": 0.007925826229998505, + "grad_norm": 3.7782327104964333, + "learning_rate": 2.6368159203980102e-06, + "loss": 0.2207, + "step": 53 + }, + { + "epoch": 0.008075370121130552, + "grad_norm": 4.816720331969929, + "learning_rate": 2.686567164179105e-06, + "loss": 0.2265, + "step": 54 + }, + { + "epoch": 0.008224914012262599, + "grad_norm": 2.8481845548797478, + "learning_rate": 2.736318407960199e-06, + "loss": 0.2174, + "step": 55 + }, + { + "epoch": 0.008374457903394646, + "grad_norm": 4.501151176073331, + "learning_rate": 2.786069651741294e-06, + "loss": 0.2306, + "step": 56 + }, + { + "epoch": 0.008524001794526694, + "grad_norm": 4.326693136186164, + "learning_rate": 2.835820895522388e-06, + "loss": 0.4023, + "step": 57 + }, + { + "epoch": 0.008673545685658741, + "grad_norm": 4.061925818141106, + "learning_rate": 2.885572139303483e-06, + "loss": 0.7602, + "step": 58 + }, + { + "epoch": 0.008823089576790788, + "grad_norm": 6.144988240043741, + "learning_rate": 2.9353233830845774e-06, + "loss": 0.4451, + "step": 59 + }, + { + "epoch": 0.008972633467922835, + "grad_norm": 4.985549166627373, + "learning_rate": 2.9850746268656716e-06, + "loss": 0.4621, + "step": 60 + }, + { + "epoch": 0.009122177359054883, + "grad_norm": 3.192079125281125, + "learning_rate": 3.0348258706467666e-06, + "loss": 0.3694, + "step": 61 + }, + { + "epoch": 0.00927172125018693, + "grad_norm": 4.653619400771914, + "learning_rate": 3.0845771144278608e-06, + "loss": 0.2416, + "step": 62 + }, + { + "epoch": 0.009421265141318977, + "grad_norm": 3.4214006556775156, + "learning_rate": 3.1343283582089558e-06, + "loss": 0.4755, + "step": 63 + }, + { + "epoch": 0.009570809032451024, + "grad_norm": 3.0809019894250613, + "learning_rate": 3.18407960199005e-06, + "loss": 0.4154, + "step": 64 + }, + { + "epoch": 0.009720352923583071, + "grad_norm": 4.190290076677796, + "learning_rate": 3.233830845771145e-06, + "loss": 0.4362, + "step": 65 + }, + { + "epoch": 0.00986989681471512, + "grad_norm": 3.1777725686355356, + "learning_rate": 3.283582089552239e-06, + "loss": 0.3635, + "step": 66 + }, + { + "epoch": 0.010019440705847166, + "grad_norm": 2.592442539170553, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1739, + "step": 67 + }, + { + "epoch": 0.010168984596979213, + "grad_norm": 4.610893839801018, + "learning_rate": 3.3830845771144283e-06, + "loss": 0.3845, + "step": 68 + }, + { + "epoch": 0.01031852848811126, + "grad_norm": 2.941030939381248, + "learning_rate": 3.4328358208955225e-06, + "loss": 0.226, + "step": 69 + }, + { + "epoch": 0.010468072379243309, + "grad_norm": 2.641062959772403, + "learning_rate": 3.4825870646766175e-06, + "loss": 0.2083, + "step": 70 + }, + { + "epoch": 0.010617616270375356, + "grad_norm": 4.573399002022637, + "learning_rate": 3.5323383084577117e-06, + "loss": 0.3639, + "step": 71 + }, + { + "epoch": 0.010767160161507403, + "grad_norm": 3.811597787697304, + "learning_rate": 3.582089552238806e-06, + "loss": 0.2046, + "step": 72 + }, + { + "epoch": 0.01091670405263945, + "grad_norm": 7.593654702612937, + "learning_rate": 3.631840796019901e-06, + "loss": 0.3831, + "step": 73 + }, + { + "epoch": 0.011066247943771496, + "grad_norm": 2.6372126137968013, + "learning_rate": 3.681592039800995e-06, + "loss": 0.2155, + "step": 74 + }, + { + "epoch": 0.011215791834903545, + "grad_norm": 3.401033168780161, + "learning_rate": 3.73134328358209e-06, + "loss": 0.2439, + "step": 75 + }, + { + "epoch": 0.011365335726035592, + "grad_norm": 2.8172647382036047, + "learning_rate": 3.7810945273631843e-06, + "loss": 0.1614, + "step": 76 + }, + { + "epoch": 0.011514879617167639, + "grad_norm": 3.525793180439174, + "learning_rate": 3.8308457711442784e-06, + "loss": 0.2176, + "step": 77 + }, + { + "epoch": 0.011664423508299685, + "grad_norm": 2.4029805525684527, + "learning_rate": 3.8805970149253735e-06, + "loss": 0.1893, + "step": 78 + }, + { + "epoch": 0.011813967399431732, + "grad_norm": 5.727795685387504, + "learning_rate": 3.930348258706468e-06, + "loss": 0.5702, + "step": 79 + }, + { + "epoch": 0.011963511290563781, + "grad_norm": 4.021893784746645, + "learning_rate": 3.980099502487563e-06, + "loss": 0.4027, + "step": 80 + }, + { + "epoch": 0.012113055181695828, + "grad_norm": 2.7773808558650535, + "learning_rate": 4.029850746268657e-06, + "loss": 0.2963, + "step": 81 + }, + { + "epoch": 0.012262599072827875, + "grad_norm": 3.4349426033049992, + "learning_rate": 4.079601990049751e-06, + "loss": 0.2211, + "step": 82 + }, + { + "epoch": 0.012412142963959922, + "grad_norm": 4.127258766074891, + "learning_rate": 4.129353233830846e-06, + "loss": 0.2516, + "step": 83 + }, + { + "epoch": 0.01256168685509197, + "grad_norm": 3.551977981988865, + "learning_rate": 4.17910447761194e-06, + "loss": 0.2206, + "step": 84 + }, + { + "epoch": 0.012711230746224017, + "grad_norm": 2.988554589230421, + "learning_rate": 4.228855721393035e-06, + "loss": 0.366, + "step": 85 + }, + { + "epoch": 0.012860774637356064, + "grad_norm": 3.256233912334862, + "learning_rate": 4.278606965174129e-06, + "loss": 0.341, + "step": 86 + }, + { + "epoch": 0.01301031852848811, + "grad_norm": 3.917242635149468, + "learning_rate": 4.3283582089552236e-06, + "loss": 0.281, + "step": 87 + }, + { + "epoch": 0.013159862419620158, + "grad_norm": 3.8372869351661247, + "learning_rate": 4.378109452736319e-06, + "loss": 0.1933, + "step": 88 + }, + { + "epoch": 0.013309406310752206, + "grad_norm": 4.03192980896834, + "learning_rate": 4.427860696517413e-06, + "loss": 0.184, + "step": 89 + }, + { + "epoch": 0.013458950201884253, + "grad_norm": 4.944440623197377, + "learning_rate": 4.477611940298508e-06, + "loss": 0.2406, + "step": 90 + }, + { + "epoch": 0.0136084940930163, + "grad_norm": 3.2771345760625916, + "learning_rate": 4.527363184079602e-06, + "loss": 0.3635, + "step": 91 + }, + { + "epoch": 0.013758037984148347, + "grad_norm": 2.5552685161479913, + "learning_rate": 4.577114427860697e-06, + "loss": 0.3581, + "step": 92 + }, + { + "epoch": 0.013907581875280395, + "grad_norm": 3.825258197515859, + "learning_rate": 4.626865671641791e-06, + "loss": 0.2157, + "step": 93 + }, + { + "epoch": 0.014057125766412442, + "grad_norm": 3.820006828326968, + "learning_rate": 4.676616915422886e-06, + "loss": 0.401, + "step": 94 + }, + { + "epoch": 0.01420666965754449, + "grad_norm": 3.4269639891084056, + "learning_rate": 4.72636815920398e-06, + "loss": 0.21, + "step": 95 + }, + { + "epoch": 0.014356213548676536, + "grad_norm": 3.614177044324435, + "learning_rate": 4.7761194029850745e-06, + "loss": 0.2305, + "step": 96 + }, + { + "epoch": 0.014505757439808583, + "grad_norm": 2.8474787904051633, + "learning_rate": 4.8258706467661695e-06, + "loss": 0.2002, + "step": 97 + }, + { + "epoch": 0.014655301330940632, + "grad_norm": 3.1529185682156333, + "learning_rate": 4.875621890547264e-06, + "loss": 0.3126, + "step": 98 + }, + { + "epoch": 0.014804845222072678, + "grad_norm": 2.805579699726101, + "learning_rate": 4.925373134328359e-06, + "loss": 0.3977, + "step": 99 + }, + { + "epoch": 0.014954389113204725, + "grad_norm": 2.5072872378288134, + "learning_rate": 4.975124378109453e-06, + "loss": 0.1986, + "step": 100 + }, + { + "epoch": 0.015103933004336772, + "grad_norm": 2.8773082972301816, + "learning_rate": 5.024875621890548e-06, + "loss": 0.2421, + "step": 101 + }, + { + "epoch": 0.01525347689546882, + "grad_norm": 2.3650776175631765, + "learning_rate": 5.074626865671642e-06, + "loss": 0.1864, + "step": 102 + }, + { + "epoch": 0.015403020786600868, + "grad_norm": 4.721891286027898, + "learning_rate": 5.124378109452737e-06, + "loss": 0.2939, + "step": 103 + }, + { + "epoch": 0.015552564677732915, + "grad_norm": 2.6753396233648705, + "learning_rate": 5.174129353233831e-06, + "loss": 0.2558, + "step": 104 + }, + { + "epoch": 0.01570210856886496, + "grad_norm": 3.149876968312327, + "learning_rate": 5.2238805970149255e-06, + "loss": 0.3405, + "step": 105 + }, + { + "epoch": 0.01585165245999701, + "grad_norm": 1.6322197066205648, + "learning_rate": 5.2736318407960205e-06, + "loss": 0.1453, + "step": 106 + }, + { + "epoch": 0.016001196351129055, + "grad_norm": 3.3492234789043236, + "learning_rate": 5.323383084577115e-06, + "loss": 0.404, + "step": 107 + }, + { + "epoch": 0.016150740242261104, + "grad_norm": 2.2518951047915157, + "learning_rate": 5.37313432835821e-06, + "loss": 0.2278, + "step": 108 + }, + { + "epoch": 0.016300284133393152, + "grad_norm": 3.0471913491370404, + "learning_rate": 5.422885572139304e-06, + "loss": 0.265, + "step": 109 + }, + { + "epoch": 0.016449828024525198, + "grad_norm": 1.6928519222295142, + "learning_rate": 5.472636815920398e-06, + "loss": 0.2169, + "step": 110 + }, + { + "epoch": 0.016599371915657246, + "grad_norm": 3.265018826674296, + "learning_rate": 5.522388059701493e-06, + "loss": 0.429, + "step": 111 + }, + { + "epoch": 0.01674891580678929, + "grad_norm": 2.637671664378066, + "learning_rate": 5.572139303482588e-06, + "loss": 0.2762, + "step": 112 + }, + { + "epoch": 0.01689845969792134, + "grad_norm": 3.1617986987096134, + "learning_rate": 5.621890547263682e-06, + "loss": 0.4272, + "step": 113 + }, + { + "epoch": 0.01704800358905339, + "grad_norm": 3.0132316717807175, + "learning_rate": 5.671641791044776e-06, + "loss": 0.3644, + "step": 114 + }, + { + "epoch": 0.017197547480185434, + "grad_norm": 2.2850314864309813, + "learning_rate": 5.721393034825871e-06, + "loss": 0.1967, + "step": 115 + }, + { + "epoch": 0.017347091371317482, + "grad_norm": 3.0835871860462314, + "learning_rate": 5.771144278606966e-06, + "loss": 0.2322, + "step": 116 + }, + { + "epoch": 0.017496635262449527, + "grad_norm": 3.5275796788122893, + "learning_rate": 5.820895522388061e-06, + "loss": 0.3543, + "step": 117 + }, + { + "epoch": 0.017646179153581576, + "grad_norm": 3.1301356173345494, + "learning_rate": 5.870646766169155e-06, + "loss": 0.5064, + "step": 118 + }, + { + "epoch": 0.017795723044713625, + "grad_norm": 3.9689250366780313, + "learning_rate": 5.920398009950249e-06, + "loss": 0.8428, + "step": 119 + }, + { + "epoch": 0.01794526693584567, + "grad_norm": 2.6992548320472984, + "learning_rate": 5.970149253731343e-06, + "loss": 0.2727, + "step": 120 + }, + { + "epoch": 0.01809481082697772, + "grad_norm": 2.8823271138601414, + "learning_rate": 6.019900497512439e-06, + "loss": 0.3301, + "step": 121 + }, + { + "epoch": 0.018244354718109767, + "grad_norm": 2.652199321292131, + "learning_rate": 6.069651741293533e-06, + "loss": 0.234, + "step": 122 + }, + { + "epoch": 0.018393898609241812, + "grad_norm": 4.008459949806747, + "learning_rate": 6.119402985074627e-06, + "loss": 0.5713, + "step": 123 + }, + { + "epoch": 0.01854344250037386, + "grad_norm": 2.8867543983581236, + "learning_rate": 6.1691542288557215e-06, + "loss": 0.2146, + "step": 124 + }, + { + "epoch": 0.018692986391505906, + "grad_norm": 2.379666412119815, + "learning_rate": 6.218905472636816e-06, + "loss": 0.3812, + "step": 125 + }, + { + "epoch": 0.018842530282637954, + "grad_norm": 2.8364015730213716, + "learning_rate": 6.2686567164179116e-06, + "loss": 0.3729, + "step": 126 + }, + { + "epoch": 0.018992074173770003, + "grad_norm": 2.9731590306978957, + "learning_rate": 6.318407960199006e-06, + "loss": 0.3922, + "step": 127 + }, + { + "epoch": 0.019141618064902048, + "grad_norm": 2.431931443805707, + "learning_rate": 6.3681592039801e-06, + "loss": 0.2316, + "step": 128 + }, + { + "epoch": 0.019291161956034097, + "grad_norm": 2.5964092588685594, + "learning_rate": 6.417910447761194e-06, + "loss": 0.2129, + "step": 129 + }, + { + "epoch": 0.019440705847166142, + "grad_norm": 4.241711858566103, + "learning_rate": 6.46766169154229e-06, + "loss": 0.2677, + "step": 130 + }, + { + "epoch": 0.01959024973829819, + "grad_norm": 3.743763522090278, + "learning_rate": 6.517412935323384e-06, + "loss": 0.7324, + "step": 131 + }, + { + "epoch": 0.01973979362943024, + "grad_norm": 2.325325226468886, + "learning_rate": 6.567164179104478e-06, + "loss": 0.2282, + "step": 132 + }, + { + "epoch": 0.019889337520562284, + "grad_norm": 2.187485810642544, + "learning_rate": 6.6169154228855725e-06, + "loss": 0.3479, + "step": 133 + }, + { + "epoch": 0.020038881411694333, + "grad_norm": 2.555235252803596, + "learning_rate": 6.666666666666667e-06, + "loss": 0.3084, + "step": 134 + }, + { + "epoch": 0.020188425302826378, + "grad_norm": 2.1409254211343405, + "learning_rate": 6.7164179104477625e-06, + "loss": 0.2413, + "step": 135 + }, + { + "epoch": 0.020337969193958427, + "grad_norm": 2.9475030013466292, + "learning_rate": 6.766169154228857e-06, + "loss": 0.5899, + "step": 136 + }, + { + "epoch": 0.020487513085090475, + "grad_norm": 3.161190387153201, + "learning_rate": 6.815920398009951e-06, + "loss": 0.2722, + "step": 137 + }, + { + "epoch": 0.02063705697622252, + "grad_norm": 3.4231688087143786, + "learning_rate": 6.865671641791045e-06, + "loss": 0.25, + "step": 138 + }, + { + "epoch": 0.02078660086735457, + "grad_norm": 2.891852432700459, + "learning_rate": 6.915422885572139e-06, + "loss": 0.5206, + "step": 139 + }, + { + "epoch": 0.020936144758486618, + "grad_norm": 2.4149596821734645, + "learning_rate": 6.965174129353235e-06, + "loss": 0.2792, + "step": 140 + }, + { + "epoch": 0.021085688649618663, + "grad_norm": 2.737327253049286, + "learning_rate": 7.014925373134329e-06, + "loss": 0.1785, + "step": 141 + }, + { + "epoch": 0.02123523254075071, + "grad_norm": 2.271710572333297, + "learning_rate": 7.064676616915423e-06, + "loss": 0.2216, + "step": 142 + }, + { + "epoch": 0.021384776431882756, + "grad_norm": 3.123818135886555, + "learning_rate": 7.114427860696518e-06, + "loss": 0.5292, + "step": 143 + }, + { + "epoch": 0.021534320323014805, + "grad_norm": 3.4353230085188775, + "learning_rate": 7.164179104477612e-06, + "loss": 0.257, + "step": 144 + }, + { + "epoch": 0.021683864214146854, + "grad_norm": 3.292198842322858, + "learning_rate": 7.213930348258708e-06, + "loss": 0.4413, + "step": 145 + }, + { + "epoch": 0.0218334081052789, + "grad_norm": 2.408669543365234, + "learning_rate": 7.263681592039802e-06, + "loss": 0.4034, + "step": 146 + }, + { + "epoch": 0.021982951996410947, + "grad_norm": 2.918318139010717, + "learning_rate": 7.313432835820896e-06, + "loss": 0.1789, + "step": 147 + }, + { + "epoch": 0.022132495887542993, + "grad_norm": 2.016064943310167, + "learning_rate": 7.36318407960199e-06, + "loss": 0.2454, + "step": 148 + }, + { + "epoch": 0.02228203977867504, + "grad_norm": 3.375282717272202, + "learning_rate": 7.412935323383084e-06, + "loss": 0.5047, + "step": 149 + }, + { + "epoch": 0.02243158366980709, + "grad_norm": 2.747548142801912, + "learning_rate": 7.46268656716418e-06, + "loss": 0.3193, + "step": 150 + }, + { + "epoch": 0.022581127560939135, + "grad_norm": 5.014531999850111, + "learning_rate": 7.512437810945274e-06, + "loss": 0.5367, + "step": 151 + }, + { + "epoch": 0.022730671452071183, + "grad_norm": 1.7396197448467992, + "learning_rate": 7.5621890547263685e-06, + "loss": 0.1602, + "step": 152 + }, + { + "epoch": 0.02288021534320323, + "grad_norm": 3.9271159318267452, + "learning_rate": 7.611940298507463e-06, + "loss": 0.2763, + "step": 153 + }, + { + "epoch": 0.023029759234335277, + "grad_norm": 2.093726492507833, + "learning_rate": 7.661691542288557e-06, + "loss": 0.169, + "step": 154 + }, + { + "epoch": 0.023179303125467326, + "grad_norm": 1.5357011381308088, + "learning_rate": 7.711442786069654e-06, + "loss": 0.1619, + "step": 155 + }, + { + "epoch": 0.02332884701659937, + "grad_norm": 2.3824458230974863, + "learning_rate": 7.761194029850747e-06, + "loss": 0.2094, + "step": 156 + }, + { + "epoch": 0.02347839090773142, + "grad_norm": 2.8236663879690784, + "learning_rate": 7.810945273631842e-06, + "loss": 0.3426, + "step": 157 + }, + { + "epoch": 0.023627934798863465, + "grad_norm": 3.1375695638809815, + "learning_rate": 7.860696517412935e-06, + "loss": 0.5518, + "step": 158 + }, + { + "epoch": 0.023777478689995513, + "grad_norm": 3.2182906468856105, + "learning_rate": 7.91044776119403e-06, + "loss": 0.1995, + "step": 159 + }, + { + "epoch": 0.023927022581127562, + "grad_norm": 14.749841980168513, + "learning_rate": 7.960199004975125e-06, + "loss": 0.5578, + "step": 160 + }, + { + "epoch": 0.024076566472259607, + "grad_norm": 3.0100123201004045, + "learning_rate": 8.00995024875622e-06, + "loss": 0.5091, + "step": 161 + }, + { + "epoch": 0.024226110363391656, + "grad_norm": 3.5091520525666433, + "learning_rate": 8.059701492537314e-06, + "loss": 0.5357, + "step": 162 + }, + { + "epoch": 0.024375654254523704, + "grad_norm": 2.934851375582722, + "learning_rate": 8.109452736318409e-06, + "loss": 0.2267, + "step": 163 + }, + { + "epoch": 0.02452519814565575, + "grad_norm": 2.5911339240383544, + "learning_rate": 8.159203980099502e-06, + "loss": 0.1782, + "step": 164 + }, + { + "epoch": 0.024674742036787798, + "grad_norm": 2.847206263316536, + "learning_rate": 8.208955223880599e-06, + "loss": 0.2252, + "step": 165 + }, + { + "epoch": 0.024824285927919843, + "grad_norm": 3.5380431553535976, + "learning_rate": 8.258706467661692e-06, + "loss": 0.4295, + "step": 166 + }, + { + "epoch": 0.024973829819051892, + "grad_norm": 3.150492354924513, + "learning_rate": 8.308457711442787e-06, + "loss": 0.3276, + "step": 167 + }, + { + "epoch": 0.02512337371018394, + "grad_norm": 3.114695975436696, + "learning_rate": 8.35820895522388e-06, + "loss": 0.5181, + "step": 168 + }, + { + "epoch": 0.025272917601315985, + "grad_norm": 2.6180846619509355, + "learning_rate": 8.407960199004975e-06, + "loss": 0.2577, + "step": 169 + }, + { + "epoch": 0.025422461492448034, + "grad_norm": 1.859950631659999, + "learning_rate": 8.45771144278607e-06, + "loss": 0.1838, + "step": 170 + }, + { + "epoch": 0.02557200538358008, + "grad_norm": 4.092195798232618, + "learning_rate": 8.507462686567165e-06, + "loss": 0.2676, + "step": 171 + }, + { + "epoch": 0.025721549274712128, + "grad_norm": 2.0820308098425766, + "learning_rate": 8.557213930348259e-06, + "loss": 0.2528, + "step": 172 + }, + { + "epoch": 0.025871093165844176, + "grad_norm": 2.8153771201369087, + "learning_rate": 8.606965174129354e-06, + "loss": 0.3374, + "step": 173 + }, + { + "epoch": 0.02602063705697622, + "grad_norm": 2.6417342231989114, + "learning_rate": 8.656716417910447e-06, + "loss": 0.4309, + "step": 174 + }, + { + "epoch": 0.02617018094810827, + "grad_norm": 3.3553357791865825, + "learning_rate": 8.706467661691544e-06, + "loss": 0.279, + "step": 175 + }, + { + "epoch": 0.026319724839240315, + "grad_norm": 2.5896987414147707, + "learning_rate": 8.756218905472637e-06, + "loss": 0.2505, + "step": 176 + }, + { + "epoch": 0.026469268730372364, + "grad_norm": 15.917959164107543, + "learning_rate": 8.805970149253732e-06, + "loss": 0.3903, + "step": 177 + }, + { + "epoch": 0.026618812621504413, + "grad_norm": 1.897502276352634, + "learning_rate": 8.855721393034826e-06, + "loss": 0.3051, + "step": 178 + }, + { + "epoch": 0.026768356512636458, + "grad_norm": 3.498345426750877, + "learning_rate": 8.905472636815922e-06, + "loss": 0.8122, + "step": 179 + }, + { + "epoch": 0.026917900403768506, + "grad_norm": 3.2270107650642297, + "learning_rate": 8.955223880597016e-06, + "loss": 0.2312, + "step": 180 + }, + { + "epoch": 0.027067444294900555, + "grad_norm": 2.373617987334166, + "learning_rate": 9.00497512437811e-06, + "loss": 0.3553, + "step": 181 + }, + { + "epoch": 0.0272169881860326, + "grad_norm": 2.022495433415561, + "learning_rate": 9.054726368159204e-06, + "loss": 0.3372, + "step": 182 + }, + { + "epoch": 0.02736653207716465, + "grad_norm": 2.471303542690233, + "learning_rate": 9.104477611940299e-06, + "loss": 0.2764, + "step": 183 + }, + { + "epoch": 0.027516075968296694, + "grad_norm": 2.170550660433261, + "learning_rate": 9.154228855721394e-06, + "loss": 0.2429, + "step": 184 + }, + { + "epoch": 0.027665619859428742, + "grad_norm": 1.7750572924031363, + "learning_rate": 9.203980099502489e-06, + "loss": 0.1749, + "step": 185 + }, + { + "epoch": 0.02781516375056079, + "grad_norm": 1.9803173977955488, + "learning_rate": 9.253731343283582e-06, + "loss": 0.3061, + "step": 186 + }, + { + "epoch": 0.027964707641692836, + "grad_norm": 2.686793479118654, + "learning_rate": 9.303482587064677e-06, + "loss": 0.2704, + "step": 187 + }, + { + "epoch": 0.028114251532824885, + "grad_norm": 3.0095995560762088, + "learning_rate": 9.353233830845772e-06, + "loss": 0.3935, + "step": 188 + }, + { + "epoch": 0.02826379542395693, + "grad_norm": 3.296780241377357, + "learning_rate": 9.402985074626867e-06, + "loss": 0.4349, + "step": 189 + }, + { + "epoch": 0.02841333931508898, + "grad_norm": 2.0473844316492262, + "learning_rate": 9.45273631840796e-06, + "loss": 0.3594, + "step": 190 + }, + { + "epoch": 0.028562883206221027, + "grad_norm": 2.6746439974295986, + "learning_rate": 9.502487562189056e-06, + "loss": 0.2507, + "step": 191 + }, + { + "epoch": 0.028712427097353072, + "grad_norm": 2.171372767224107, + "learning_rate": 9.552238805970149e-06, + "loss": 0.4442, + "step": 192 + }, + { + "epoch": 0.02886197098848512, + "grad_norm": 3.412610878033882, + "learning_rate": 9.601990049751244e-06, + "loss": 0.5065, + "step": 193 + }, + { + "epoch": 0.029011514879617166, + "grad_norm": 2.5249672849820843, + "learning_rate": 9.651741293532339e-06, + "loss": 0.2775, + "step": 194 + }, + { + "epoch": 0.029161058770749215, + "grad_norm": 1.9244063665371054, + "learning_rate": 9.701492537313434e-06, + "loss": 0.2501, + "step": 195 + }, + { + "epoch": 0.029310602661881263, + "grad_norm": 2.2928756876943788, + "learning_rate": 9.751243781094527e-06, + "loss": 0.391, + "step": 196 + }, + { + "epoch": 0.02946014655301331, + "grad_norm": 3.2090175671059464, + "learning_rate": 9.800995024875622e-06, + "loss": 0.355, + "step": 197 + }, + { + "epoch": 0.029609690444145357, + "grad_norm": 2.564275054094989, + "learning_rate": 9.850746268656717e-06, + "loss": 0.3824, + "step": 198 + }, + { + "epoch": 0.029759234335277406, + "grad_norm": 2.2612313847384473, + "learning_rate": 9.900497512437812e-06, + "loss": 0.255, + "step": 199 + }, + { + "epoch": 0.02990877822640945, + "grad_norm": 2.867410801811384, + "learning_rate": 9.950248756218906e-06, + "loss": 0.2321, + "step": 200 + }, + { + "epoch": 0.0300583221175415, + "grad_norm": 2.7017080308625316, + "learning_rate": 1e-05, + "loss": 0.5355, + "step": 201 + }, + { + "epoch": 0.030207866008673544, + "grad_norm": 1.7563631058650533, + "learning_rate": 9.999999413475907e-06, + "loss": 0.2366, + "step": 202 + }, + { + "epoch": 0.030357409899805593, + "grad_norm": 2.7923486514729134, + "learning_rate": 9.999997653903764e-06, + "loss": 0.5735, + "step": 203 + }, + { + "epoch": 0.03050695379093764, + "grad_norm": 2.5477270678585935, + "learning_rate": 9.999994721283985e-06, + "loss": 0.2316, + "step": 204 + }, + { + "epoch": 0.030656497682069687, + "grad_norm": 1.6435827637040603, + "learning_rate": 9.99999061561726e-06, + "loss": 0.1958, + "step": 205 + }, + { + "epoch": 0.030806041573201735, + "grad_norm": 4.225438559077688, + "learning_rate": 9.999985336904546e-06, + "loss": 0.6052, + "step": 206 + }, + { + "epoch": 0.03095558546433378, + "grad_norm": 2.384218907777814, + "learning_rate": 9.999978885147086e-06, + "loss": 0.382, + "step": 207 + }, + { + "epoch": 0.03110512935546583, + "grad_norm": 3.082533240684358, + "learning_rate": 9.999971260346394e-06, + "loss": 0.4615, + "step": 208 + }, + { + "epoch": 0.03125467324659788, + "grad_norm": 2.126341746782405, + "learning_rate": 9.999962462504259e-06, + "loss": 0.3489, + "step": 209 + }, + { + "epoch": 0.03140421713772992, + "grad_norm": 2.3157719584793974, + "learning_rate": 9.99995249162274e-06, + "loss": 0.351, + "step": 210 + }, + { + "epoch": 0.03155376102886197, + "grad_norm": 3.2569828989709046, + "learning_rate": 9.999941347704183e-06, + "loss": 0.5452, + "step": 211 + }, + { + "epoch": 0.03170330491999402, + "grad_norm": 2.4010549422177747, + "learning_rate": 9.999929030751199e-06, + "loss": 0.5511, + "step": 212 + }, + { + "epoch": 0.031852848811126065, + "grad_norm": 2.2021354319659956, + "learning_rate": 9.999915540766679e-06, + "loss": 0.409, + "step": 213 + }, + { + "epoch": 0.03200239270225811, + "grad_norm": 2.7467598032746467, + "learning_rate": 9.999900877753786e-06, + "loss": 0.2769, + "step": 214 + }, + { + "epoch": 0.03215193659339016, + "grad_norm": 2.250991470386846, + "learning_rate": 9.99988504171596e-06, + "loss": 0.4243, + "step": 215 + }, + { + "epoch": 0.03230148048452221, + "grad_norm": 7.389570164962262, + "learning_rate": 9.999868032656921e-06, + "loss": 0.5661, + "step": 216 + }, + { + "epoch": 0.03245102437565425, + "grad_norm": 2.3232325152419904, + "learning_rate": 9.999849850580653e-06, + "loss": 0.3622, + "step": 217 + }, + { + "epoch": 0.032600568266786305, + "grad_norm": 2.8448629192721153, + "learning_rate": 9.999830495491425e-06, + "loss": 0.5013, + "step": 218 + }, + { + "epoch": 0.03275011215791835, + "grad_norm": 1.9203985094095042, + "learning_rate": 9.99980996739378e-06, + "loss": 0.2597, + "step": 219 + }, + { + "epoch": 0.032899656049050395, + "grad_norm": 2.1343351176097705, + "learning_rate": 9.99978826629253e-06, + "loss": 0.333, + "step": 220 + }, + { + "epoch": 0.03304919994018244, + "grad_norm": 2.675496675158128, + "learning_rate": 9.999765392192766e-06, + "loss": 0.4679, + "step": 221 + }, + { + "epoch": 0.03319874383131449, + "grad_norm": 2.954897252892918, + "learning_rate": 9.99974134509986e-06, + "loss": 0.5779, + "step": 222 + }, + { + "epoch": 0.03334828772244654, + "grad_norm": 3.164155125145253, + "learning_rate": 9.999716125019448e-06, + "loss": 0.5192, + "step": 223 + }, + { + "epoch": 0.03349783161357858, + "grad_norm": 2.9422429580445377, + "learning_rate": 9.99968973195745e-06, + "loss": 0.3514, + "step": 224 + }, + { + "epoch": 0.033647375504710635, + "grad_norm": 2.016818218277119, + "learning_rate": 9.999662165920056e-06, + "loss": 0.3657, + "step": 225 + }, + { + "epoch": 0.03379691939584268, + "grad_norm": 2.805692301474297, + "learning_rate": 9.999633426913733e-06, + "loss": 0.1912, + "step": 226 + }, + { + "epoch": 0.033946463286974725, + "grad_norm": 2.205403428118743, + "learning_rate": 9.999603514945227e-06, + "loss": 0.234, + "step": 227 + }, + { + "epoch": 0.03409600717810678, + "grad_norm": 2.013271573198516, + "learning_rate": 9.999572430021553e-06, + "loss": 0.464, + "step": 228 + }, + { + "epoch": 0.03424555106923882, + "grad_norm": 3.033803346792209, + "learning_rate": 9.999540172150005e-06, + "loss": 0.2599, + "step": 229 + }, + { + "epoch": 0.03439509496037087, + "grad_norm": 2.854186400231596, + "learning_rate": 9.99950674133815e-06, + "loss": 0.6431, + "step": 230 + }, + { + "epoch": 0.03454463885150292, + "grad_norm": 2.162434347622467, + "learning_rate": 9.999472137593829e-06, + "loss": 0.4779, + "step": 231 + }, + { + "epoch": 0.034694182742634964, + "grad_norm": 1.4691335020169023, + "learning_rate": 9.999436360925165e-06, + "loss": 0.1827, + "step": 232 + }, + { + "epoch": 0.03484372663376701, + "grad_norm": 1.6955188606947214, + "learning_rate": 9.99939941134055e-06, + "loss": 0.2336, + "step": 233 + }, + { + "epoch": 0.034993270524899055, + "grad_norm": 2.0710606069082167, + "learning_rate": 9.99936128884865e-06, + "loss": 0.3671, + "step": 234 + }, + { + "epoch": 0.03514281441603111, + "grad_norm": 2.128464465717484, + "learning_rate": 9.999321993458411e-06, + "loss": 0.2928, + "step": 235 + }, + { + "epoch": 0.03529235830716315, + "grad_norm": 1.9685227247781487, + "learning_rate": 9.999281525179054e-06, + "loss": 0.185, + "step": 236 + }, + { + "epoch": 0.0354419021982952, + "grad_norm": 2.3203573768463115, + "learning_rate": 9.99923988402007e-06, + "loss": 0.3733, + "step": 237 + }, + { + "epoch": 0.03559144608942725, + "grad_norm": 2.2161639851963457, + "learning_rate": 9.99919706999123e-06, + "loss": 0.4, + "step": 238 + }, + { + "epoch": 0.035740989980559294, + "grad_norm": 1.551687214387557, + "learning_rate": 9.99915308310258e-06, + "loss": 0.1723, + "step": 239 + }, + { + "epoch": 0.03589053387169134, + "grad_norm": 1.9544776771870587, + "learning_rate": 9.999107923364436e-06, + "loss": 0.2587, + "step": 240 + }, + { + "epoch": 0.03604007776282339, + "grad_norm": 2.1986380601508375, + "learning_rate": 9.999061590787394e-06, + "loss": 0.544, + "step": 241 + }, + { + "epoch": 0.03618962165395544, + "grad_norm": 2.5816888510040457, + "learning_rate": 9.999014085382326e-06, + "loss": 0.4619, + "step": 242 + }, + { + "epoch": 0.03633916554508748, + "grad_norm": 1.8291845348661409, + "learning_rate": 9.998965407160377e-06, + "loss": 0.2052, + "step": 243 + }, + { + "epoch": 0.036488709436219534, + "grad_norm": 3.167062575704647, + "learning_rate": 9.998915556132966e-06, + "loss": 0.6123, + "step": 244 + }, + { + "epoch": 0.03663825332735158, + "grad_norm": 1.8628898225455814, + "learning_rate": 9.99886453231179e-06, + "loss": 0.3634, + "step": 245 + }, + { + "epoch": 0.036787797218483624, + "grad_norm": 1.7903762911789451, + "learning_rate": 9.998812335708818e-06, + "loss": 0.2162, + "step": 246 + }, + { + "epoch": 0.03693734110961567, + "grad_norm": 1.3282642487848175, + "learning_rate": 9.998758966336296e-06, + "loss": 0.1875, + "step": 247 + }, + { + "epoch": 0.03708688500074772, + "grad_norm": 1.8364953512469955, + "learning_rate": 9.998704424206747e-06, + "loss": 0.208, + "step": 248 + }, + { + "epoch": 0.037236428891879766, + "grad_norm": 1.3941303606582691, + "learning_rate": 9.998648709332965e-06, + "loss": 0.1737, + "step": 249 + }, + { + "epoch": 0.03738597278301181, + "grad_norm": 1.7239196409011197, + "learning_rate": 9.998591821728022e-06, + "loss": 0.2339, + "step": 250 + }, + { + "epoch": 0.037535516674143864, + "grad_norm": 2.623262386600702, + "learning_rate": 9.998533761405265e-06, + "loss": 0.3988, + "step": 251 + }, + { + "epoch": 0.03768506056527591, + "grad_norm": 3.0417113736320354, + "learning_rate": 9.998474528378315e-06, + "loss": 0.3998, + "step": 252 + }, + { + "epoch": 0.037834604456407954, + "grad_norm": 2.3389769972346532, + "learning_rate": 9.998414122661066e-06, + "loss": 0.2157, + "step": 253 + }, + { + "epoch": 0.037984148347540006, + "grad_norm": 2.776666496961099, + "learning_rate": 9.998352544267696e-06, + "loss": 0.5598, + "step": 254 + }, + { + "epoch": 0.03813369223867205, + "grad_norm": 2.1472401976055746, + "learning_rate": 9.998289793212645e-06, + "loss": 0.2375, + "step": 255 + }, + { + "epoch": 0.038283236129804096, + "grad_norm": 2.258529852719024, + "learning_rate": 9.99822586951064e-06, + "loss": 0.257, + "step": 256 + }, + { + "epoch": 0.03843278002093614, + "grad_norm": 2.234662282588329, + "learning_rate": 9.998160773176676e-06, + "loss": 0.2513, + "step": 257 + }, + { + "epoch": 0.038582323912068194, + "grad_norm": 1.557075634748184, + "learning_rate": 9.998094504226025e-06, + "loss": 0.2154, + "step": 258 + }, + { + "epoch": 0.03873186780320024, + "grad_norm": 1.2782097805836874, + "learning_rate": 9.998027062674236e-06, + "loss": 0.1997, + "step": 259 + }, + { + "epoch": 0.038881411694332284, + "grad_norm": 1.5754692941437902, + "learning_rate": 9.997958448537129e-06, + "loss": 0.2271, + "step": 260 + }, + { + "epoch": 0.039030955585464336, + "grad_norm": 2.3273358127526516, + "learning_rate": 9.997888661830803e-06, + "loss": 0.4129, + "step": 261 + }, + { + "epoch": 0.03918049947659638, + "grad_norm": 2.5932478274973705, + "learning_rate": 9.997817702571631e-06, + "loss": 0.2762, + "step": 262 + }, + { + "epoch": 0.039330043367728426, + "grad_norm": 1.7415819067090217, + "learning_rate": 9.99774557077626e-06, + "loss": 0.2677, + "step": 263 + }, + { + "epoch": 0.03947958725886048, + "grad_norm": 2.1983315861883974, + "learning_rate": 9.997672266461613e-06, + "loss": 0.3412, + "step": 264 + }, + { + "epoch": 0.03962913114999252, + "grad_norm": 2.8445138272257666, + "learning_rate": 9.997597789644889e-06, + "loss": 0.3471, + "step": 265 + }, + { + "epoch": 0.03977867504112457, + "grad_norm": 2.6658347323464575, + "learning_rate": 9.997522140343558e-06, + "loss": 0.3785, + "step": 266 + }, + { + "epoch": 0.03992821893225662, + "grad_norm": 1.2913669477506569, + "learning_rate": 9.997445318575371e-06, + "loss": 0.2089, + "step": 267 + }, + { + "epoch": 0.040077762823388666, + "grad_norm": 2.440102551085522, + "learning_rate": 9.99736732435835e-06, + "loss": 0.5639, + "step": 268 + }, + { + "epoch": 0.04022730671452071, + "grad_norm": 2.252623935384866, + "learning_rate": 9.997288157710795e-06, + "loss": 0.447, + "step": 269 + }, + { + "epoch": 0.040376850605652756, + "grad_norm": 1.9038309319538977, + "learning_rate": 9.997207818651273e-06, + "loss": 0.2784, + "step": 270 + }, + { + "epoch": 0.04052639449678481, + "grad_norm": 2.05316637395224, + "learning_rate": 9.99712630719864e-06, + "loss": 0.3874, + "step": 271 + }, + { + "epoch": 0.04067593838791685, + "grad_norm": 4.663034399257074, + "learning_rate": 9.997043623372016e-06, + "loss": 0.3558, + "step": 272 + }, + { + "epoch": 0.0408254822790489, + "grad_norm": 2.0324793909935375, + "learning_rate": 9.996959767190799e-06, + "loss": 0.3884, + "step": 273 + }, + { + "epoch": 0.04097502617018095, + "grad_norm": 2.1897027573531003, + "learning_rate": 9.996874738674663e-06, + "loss": 0.2372, + "step": 274 + }, + { + "epoch": 0.041124570061312996, + "grad_norm": 1.9410471939157525, + "learning_rate": 9.996788537843558e-06, + "loss": 0.3478, + "step": 275 + }, + { + "epoch": 0.04127411395244504, + "grad_norm": 3.650983914269082, + "learning_rate": 9.996701164717704e-06, + "loss": 0.4213, + "step": 276 + }, + { + "epoch": 0.04142365784357709, + "grad_norm": 3.067988013237884, + "learning_rate": 9.996612619317602e-06, + "loss": 0.7209, + "step": 277 + }, + { + "epoch": 0.04157320173470914, + "grad_norm": 2.5863303551652033, + "learning_rate": 9.996522901664028e-06, + "loss": 0.5418, + "step": 278 + }, + { + "epoch": 0.04172274562584118, + "grad_norm": 2.1885641779249476, + "learning_rate": 9.996432011778026e-06, + "loss": 0.371, + "step": 279 + }, + { + "epoch": 0.041872289516973235, + "grad_norm": 2.398824728854803, + "learning_rate": 9.99633994968092e-06, + "loss": 0.5508, + "step": 280 + }, + { + "epoch": 0.04202183340810528, + "grad_norm": 1.5732032420608302, + "learning_rate": 9.996246715394314e-06, + "loss": 0.2468, + "step": 281 + }, + { + "epoch": 0.042171377299237325, + "grad_norm": 2.8532279807617944, + "learning_rate": 9.996152308940075e-06, + "loss": 0.5503, + "step": 282 + }, + { + "epoch": 0.04232092119036937, + "grad_norm": 2.4502727303222733, + "learning_rate": 9.996056730340356e-06, + "loss": 0.4046, + "step": 283 + }, + { + "epoch": 0.04247046508150142, + "grad_norm": 1.9272098426705169, + "learning_rate": 9.995959979617578e-06, + "loss": 0.3906, + "step": 284 + }, + { + "epoch": 0.04262000897263347, + "grad_norm": 2.290690335549339, + "learning_rate": 9.995862056794441e-06, + "loss": 0.2464, + "step": 285 + }, + { + "epoch": 0.04276955286376551, + "grad_norm": 1.656564250859485, + "learning_rate": 9.99576296189392e-06, + "loss": 0.1996, + "step": 286 + }, + { + "epoch": 0.042919096754897565, + "grad_norm": 2.1259148220336965, + "learning_rate": 9.995662694939262e-06, + "loss": 0.3994, + "step": 287 + }, + { + "epoch": 0.04306864064602961, + "grad_norm": 2.286901143642134, + "learning_rate": 9.99556125595399e-06, + "loss": 0.4047, + "step": 288 + }, + { + "epoch": 0.043218184537161655, + "grad_norm": 1.3559455912309712, + "learning_rate": 9.995458644961902e-06, + "loss": 0.2228, + "step": 289 + }, + { + "epoch": 0.04336772842829371, + "grad_norm": 2.285750924681825, + "learning_rate": 9.995354861987075e-06, + "loss": 0.2367, + "step": 290 + }, + { + "epoch": 0.04351727231942575, + "grad_norm": 1.923824453592428, + "learning_rate": 9.995249907053854e-06, + "loss": 0.3951, + "step": 291 + }, + { + "epoch": 0.0436668162105578, + "grad_norm": 1.968047953500074, + "learning_rate": 9.995143780186865e-06, + "loss": 0.2149, + "step": 292 + }, + { + "epoch": 0.04381636010168984, + "grad_norm": 2.3975790519132074, + "learning_rate": 9.995036481411005e-06, + "loss": 0.5312, + "step": 293 + }, + { + "epoch": 0.043965903992821895, + "grad_norm": 1.9664546058841197, + "learning_rate": 9.994928010751447e-06, + "loss": 0.4832, + "step": 294 + }, + { + "epoch": 0.04411544788395394, + "grad_norm": 2.1609011533249785, + "learning_rate": 9.994818368233639e-06, + "loss": 0.571, + "step": 295 + }, + { + "epoch": 0.044264991775085985, + "grad_norm": 1.2099666806993736, + "learning_rate": 9.994707553883305e-06, + "loss": 0.1801, + "step": 296 + }, + { + "epoch": 0.04441453566621804, + "grad_norm": 1.8811137964659612, + "learning_rate": 9.994595567726444e-06, + "loss": 0.2708, + "step": 297 + }, + { + "epoch": 0.04456407955735008, + "grad_norm": 1.6387011737954997, + "learning_rate": 9.994482409789329e-06, + "loss": 0.245, + "step": 298 + }, + { + "epoch": 0.04471362344848213, + "grad_norm": 2.4061797367092486, + "learning_rate": 9.994368080098505e-06, + "loss": 0.204, + "step": 299 + }, + { + "epoch": 0.04486316733961418, + "grad_norm": 2.555264958903577, + "learning_rate": 9.994252578680796e-06, + "loss": 0.5251, + "step": 300 + }, + { + "epoch": 0.045012711230746225, + "grad_norm": 3.1965886018503897, + "learning_rate": 9.994135905563302e-06, + "loss": 0.4353, + "step": 301 + }, + { + "epoch": 0.04516225512187827, + "grad_norm": 2.390530599961774, + "learning_rate": 9.994018060773396e-06, + "loss": 0.4199, + "step": 302 + }, + { + "epoch": 0.04531179901301032, + "grad_norm": 2.694731420269419, + "learning_rate": 9.993899044338722e-06, + "loss": 0.4029, + "step": 303 + }, + { + "epoch": 0.04546134290414237, + "grad_norm": 2.5518583518075437, + "learning_rate": 9.993778856287205e-06, + "loss": 0.3712, + "step": 304 + }, + { + "epoch": 0.04561088679527441, + "grad_norm": 1.958382495979976, + "learning_rate": 9.99365749664704e-06, + "loss": 0.3617, + "step": 305 + }, + { + "epoch": 0.04576043068640646, + "grad_norm": 2.299652220902115, + "learning_rate": 9.993534965446701e-06, + "loss": 0.4059, + "step": 306 + }, + { + "epoch": 0.04590997457753851, + "grad_norm": 4.086258301258261, + "learning_rate": 9.993411262714934e-06, + "loss": 0.2774, + "step": 307 + }, + { + "epoch": 0.046059518468670554, + "grad_norm": 2.0081624141767156, + "learning_rate": 9.993286388480763e-06, + "loss": 0.2724, + "step": 308 + }, + { + "epoch": 0.0462090623598026, + "grad_norm": 2.388037596587926, + "learning_rate": 9.993160342773483e-06, + "loss": 0.2706, + "step": 309 + }, + { + "epoch": 0.04635860625093465, + "grad_norm": 1.5868739255084185, + "learning_rate": 9.993033125622665e-06, + "loss": 0.256, + "step": 310 + }, + { + "epoch": 0.0465081501420667, + "grad_norm": 1.8286822342955051, + "learning_rate": 9.992904737058157e-06, + "loss": 0.209, + "step": 311 + }, + { + "epoch": 0.04665769403319874, + "grad_norm": 2.2060332987484306, + "learning_rate": 9.992775177110078e-06, + "loss": 0.4253, + "step": 312 + }, + { + "epoch": 0.046807237924330794, + "grad_norm": 1.39628419375001, + "learning_rate": 9.992644445808826e-06, + "loss": 0.1693, + "step": 313 + }, + { + "epoch": 0.04695678181546284, + "grad_norm": 1.5668060198088787, + "learning_rate": 9.99251254318507e-06, + "loss": 0.24, + "step": 314 + }, + { + "epoch": 0.047106325706594884, + "grad_norm": 1.998270389587923, + "learning_rate": 9.992379469269758e-06, + "loss": 0.2519, + "step": 315 + }, + { + "epoch": 0.04725586959772693, + "grad_norm": 1.9609810436779118, + "learning_rate": 9.99224522409411e-06, + "loss": 0.2023, + "step": 316 + }, + { + "epoch": 0.04740541348885898, + "grad_norm": 1.4580736241239847, + "learning_rate": 9.992109807689619e-06, + "loss": 0.2387, + "step": 317 + }, + { + "epoch": 0.04755495737999103, + "grad_norm": 2.710681694340303, + "learning_rate": 9.991973220088057e-06, + "loss": 0.6738, + "step": 318 + }, + { + "epoch": 0.04770450127112307, + "grad_norm": 1.2469776099691643, + "learning_rate": 9.991835461321466e-06, + "loss": 0.2013, + "step": 319 + }, + { + "epoch": 0.047854045162255124, + "grad_norm": 2.128896128779159, + "learning_rate": 9.99169653142217e-06, + "loss": 0.3432, + "step": 320 + }, + { + "epoch": 0.04800358905338717, + "grad_norm": 1.6053097848087672, + "learning_rate": 9.991556430422759e-06, + "loss": 0.2301, + "step": 321 + }, + { + "epoch": 0.048153132944519214, + "grad_norm": 1.7774787600035602, + "learning_rate": 9.991415158356106e-06, + "loss": 0.2535, + "step": 322 + }, + { + "epoch": 0.048302676835651266, + "grad_norm": 1.449815289318445, + "learning_rate": 9.991272715255351e-06, + "loss": 0.1878, + "step": 323 + }, + { + "epoch": 0.04845222072678331, + "grad_norm": 1.5118547669168991, + "learning_rate": 9.991129101153916e-06, + "loss": 0.3186, + "step": 324 + }, + { + "epoch": 0.048601764617915356, + "grad_norm": 1.461388444407636, + "learning_rate": 9.99098431608549e-06, + "loss": 0.1747, + "step": 325 + }, + { + "epoch": 0.04875130850904741, + "grad_norm": 2.3912366570769974, + "learning_rate": 9.990838360084045e-06, + "loss": 0.5325, + "step": 326 + }, + { + "epoch": 0.048900852400179454, + "grad_norm": 2.5611474084390937, + "learning_rate": 9.990691233183823e-06, + "loss": 0.2606, + "step": 327 + }, + { + "epoch": 0.0490503962913115, + "grad_norm": 2.21899436894442, + "learning_rate": 9.990542935419341e-06, + "loss": 0.4253, + "step": 328 + }, + { + "epoch": 0.049199940182443544, + "grad_norm": 1.6883179263006298, + "learning_rate": 9.99039346682539e-06, + "loss": 0.1768, + "step": 329 + }, + { + "epoch": 0.049349484073575596, + "grad_norm": 3.2358870266119006, + "learning_rate": 9.990242827437036e-06, + "loss": 0.7866, + "step": 330 + }, + { + "epoch": 0.04949902796470764, + "grad_norm": 2.0627143054944153, + "learning_rate": 9.990091017289623e-06, + "loss": 0.3286, + "step": 331 + }, + { + "epoch": 0.049648571855839686, + "grad_norm": 2.1246533005850523, + "learning_rate": 9.989938036418766e-06, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.04979811574697174, + "grad_norm": 2.6250279686209828, + "learning_rate": 9.989783884860355e-06, + "loss": 0.5058, + "step": 333 + }, + { + "epoch": 0.049947659638103784, + "grad_norm": 2.3409062617647627, + "learning_rate": 9.989628562650558e-06, + "loss": 0.2589, + "step": 334 + }, + { + "epoch": 0.05009720352923583, + "grad_norm": 1.835901073337933, + "learning_rate": 9.989472069825811e-06, + "loss": 0.3493, + "step": 335 + }, + { + "epoch": 0.05024674742036788, + "grad_norm": 2.2454393810241298, + "learning_rate": 9.989314406422835e-06, + "loss": 0.4113, + "step": 336 + }, + { + "epoch": 0.050396291311499926, + "grad_norm": 2.2906853778474674, + "learning_rate": 9.989155572478611e-06, + "loss": 0.5289, + "step": 337 + }, + { + "epoch": 0.05054583520263197, + "grad_norm": 2.3899442476389665, + "learning_rate": 9.98899556803041e-06, + "loss": 0.2174, + "step": 338 + }, + { + "epoch": 0.05069537909376402, + "grad_norm": 1.3681982854338133, + "learning_rate": 9.988834393115768e-06, + "loss": 0.2021, + "step": 339 + }, + { + "epoch": 0.05084492298489607, + "grad_norm": 1.5118760155287632, + "learning_rate": 9.988672047772497e-06, + "loss": 0.1927, + "step": 340 + }, + { + "epoch": 0.05099446687602811, + "grad_norm": 2.1144895431001105, + "learning_rate": 9.988508532038685e-06, + "loss": 0.3325, + "step": 341 + }, + { + "epoch": 0.05114401076716016, + "grad_norm": 1.8616803287346595, + "learning_rate": 9.988343845952697e-06, + "loss": 0.3018, + "step": 342 + }, + { + "epoch": 0.05129355465829221, + "grad_norm": 2.787967616575242, + "learning_rate": 9.988177989553167e-06, + "loss": 0.4641, + "step": 343 + }, + { + "epoch": 0.051443098549424256, + "grad_norm": 2.2905797584406242, + "learning_rate": 9.98801096287901e-06, + "loss": 0.5336, + "step": 344 + }, + { + "epoch": 0.0515926424405563, + "grad_norm": 1.769311364935245, + "learning_rate": 9.987842765969408e-06, + "loss": 0.2843, + "step": 345 + }, + { + "epoch": 0.05174218633168835, + "grad_norm": 1.7122732613639495, + "learning_rate": 9.987673398863824e-06, + "loss": 0.2272, + "step": 346 + }, + { + "epoch": 0.0518917302228204, + "grad_norm": 2.328359950454365, + "learning_rate": 9.987502861601991e-06, + "loss": 0.2645, + "step": 347 + }, + { + "epoch": 0.05204127411395244, + "grad_norm": 2.208277642399548, + "learning_rate": 9.987331154223922e-06, + "loss": 0.5877, + "step": 348 + }, + { + "epoch": 0.052190818005084495, + "grad_norm": 2.154817789687723, + "learning_rate": 9.9871582767699e-06, + "loss": 0.3414, + "step": 349 + }, + { + "epoch": 0.05234036189621654, + "grad_norm": 2.0510314098551814, + "learning_rate": 9.986984229280483e-06, + "loss": 0.3981, + "step": 350 + }, + { + "epoch": 0.052489905787348586, + "grad_norm": 2.346735661125246, + "learning_rate": 9.986809011796503e-06, + "loss": 0.6596, + "step": 351 + }, + { + "epoch": 0.05263944967848063, + "grad_norm": 1.641693244293744, + "learning_rate": 9.98663262435907e-06, + "loss": 0.3657, + "step": 352 + }, + { + "epoch": 0.05278899356961268, + "grad_norm": 2.240226359797858, + "learning_rate": 9.986455067009566e-06, + "loss": 0.3706, + "step": 353 + }, + { + "epoch": 0.05293853746074473, + "grad_norm": 2.3791485993411357, + "learning_rate": 9.986276339789648e-06, + "loss": 0.5428, + "step": 354 + }, + { + "epoch": 0.05308808135187677, + "grad_norm": 1.7806897327965683, + "learning_rate": 9.986096442741241e-06, + "loss": 0.2336, + "step": 355 + }, + { + "epoch": 0.053237625243008825, + "grad_norm": 1.8563417208131827, + "learning_rate": 9.98591537590656e-06, + "loss": 0.2129, + "step": 356 + }, + { + "epoch": 0.05338716913414087, + "grad_norm": 2.2115041121315895, + "learning_rate": 9.98573313932808e-06, + "loss": 0.5232, + "step": 357 + }, + { + "epoch": 0.053536713025272915, + "grad_norm": 1.3693709893910027, + "learning_rate": 9.985549733048556e-06, + "loss": 0.3524, + "step": 358 + }, + { + "epoch": 0.05368625691640497, + "grad_norm": 2.033727598383455, + "learning_rate": 9.985365157111017e-06, + "loss": 0.3987, + "step": 359 + }, + { + "epoch": 0.05383580080753701, + "grad_norm": 2.3258255541409505, + "learning_rate": 9.985179411558767e-06, + "loss": 0.5489, + "step": 360 + }, + { + "epoch": 0.05398534469866906, + "grad_norm": 2.0805855861837057, + "learning_rate": 9.984992496435383e-06, + "loss": 0.3982, + "step": 361 + }, + { + "epoch": 0.05413488858980111, + "grad_norm": 1.4938394292792039, + "learning_rate": 9.984804411784717e-06, + "loss": 0.2279, + "step": 362 + }, + { + "epoch": 0.054284432480933155, + "grad_norm": 1.935765339737269, + "learning_rate": 9.984615157650896e-06, + "loss": 0.2208, + "step": 363 + }, + { + "epoch": 0.0544339763720652, + "grad_norm": 2.294825440673555, + "learning_rate": 9.98442473407832e-06, + "loss": 0.4006, + "step": 364 + }, + { + "epoch": 0.054583520263197245, + "grad_norm": 1.7404498428206792, + "learning_rate": 9.984233141111663e-06, + "loss": 0.3859, + "step": 365 + }, + { + "epoch": 0.0547330641543293, + "grad_norm": 2.382616866788976, + "learning_rate": 9.984040378795879e-06, + "loss": 0.5393, + "step": 366 + }, + { + "epoch": 0.05488260804546134, + "grad_norm": 2.121310368782044, + "learning_rate": 9.983846447176186e-06, + "loss": 0.3808, + "step": 367 + }, + { + "epoch": 0.05503215193659339, + "grad_norm": 1.4327836947551182, + "learning_rate": 9.983651346298089e-06, + "loss": 0.21, + "step": 368 + }, + { + "epoch": 0.05518169582772544, + "grad_norm": 1.8551217286702022, + "learning_rate": 9.983455076207353e-06, + "loss": 0.3611, + "step": 369 + }, + { + "epoch": 0.055331239718857485, + "grad_norm": 1.1962615317465979, + "learning_rate": 9.983257636950032e-06, + "loss": 0.1632, + "step": 370 + }, + { + "epoch": 0.05548078360998953, + "grad_norm": 2.210937603202386, + "learning_rate": 9.983059028572443e-06, + "loss": 0.2054, + "step": 371 + }, + { + "epoch": 0.05563032750112158, + "grad_norm": 1.3676870965949202, + "learning_rate": 9.982859251121183e-06, + "loss": 0.2257, + "step": 372 + }, + { + "epoch": 0.05577987139225363, + "grad_norm": 1.877238753038072, + "learning_rate": 9.98265830464312e-06, + "loss": 0.3069, + "step": 373 + }, + { + "epoch": 0.05592941528338567, + "grad_norm": 2.6215120058588743, + "learning_rate": 9.9824561891854e-06, + "loss": 0.3812, + "step": 374 + }, + { + "epoch": 0.056078959174517724, + "grad_norm": 1.5353869053774183, + "learning_rate": 9.982252904795437e-06, + "loss": 0.3038, + "step": 375 + }, + { + "epoch": 0.05622850306564977, + "grad_norm": 1.5387274188562523, + "learning_rate": 9.98204845152093e-06, + "loss": 0.1784, + "step": 376 + }, + { + "epoch": 0.056378046956781815, + "grad_norm": 2.3221296907492444, + "learning_rate": 9.981842829409842e-06, + "loss": 0.4253, + "step": 377 + }, + { + "epoch": 0.05652759084791386, + "grad_norm": 1.8464138105889263, + "learning_rate": 9.981636038510414e-06, + "loss": 0.2137, + "step": 378 + }, + { + "epoch": 0.05667713473904591, + "grad_norm": 1.9213502252741161, + "learning_rate": 9.98142807887116e-06, + "loss": 0.2652, + "step": 379 + }, + { + "epoch": 0.05682667863017796, + "grad_norm": 1.7697460473662174, + "learning_rate": 9.981218950540874e-06, + "loss": 0.2525, + "step": 380 + }, + { + "epoch": 0.05697622252131, + "grad_norm": 2.001502054151958, + "learning_rate": 9.981008653568613e-06, + "loss": 0.3749, + "step": 381 + }, + { + "epoch": 0.057125766412442054, + "grad_norm": 1.7507480997796745, + "learning_rate": 9.98079718800372e-06, + "loss": 0.3293, + "step": 382 + }, + { + "epoch": 0.0572753103035741, + "grad_norm": 1.8995856376763527, + "learning_rate": 9.980584553895805e-06, + "loss": 0.2595, + "step": 383 + }, + { + "epoch": 0.057424854194706144, + "grad_norm": 1.6960817341003291, + "learning_rate": 9.980370751294754e-06, + "loss": 0.3214, + "step": 384 + }, + { + "epoch": 0.057574398085838197, + "grad_norm": 2.747620756274178, + "learning_rate": 9.980155780250728e-06, + "loss": 0.4678, + "step": 385 + }, + { + "epoch": 0.05772394197697024, + "grad_norm": 1.429295181164985, + "learning_rate": 9.979939640814158e-06, + "loss": 0.3417, + "step": 386 + }, + { + "epoch": 0.05787348586810229, + "grad_norm": 1.546941524577904, + "learning_rate": 9.979722333035757e-06, + "loss": 0.3017, + "step": 387 + }, + { + "epoch": 0.05802302975923433, + "grad_norm": 2.3243262803022753, + "learning_rate": 9.979503856966504e-06, + "loss": 0.3906, + "step": 388 + }, + { + "epoch": 0.058172573650366384, + "grad_norm": 1.5367077444523152, + "learning_rate": 9.979284212657658e-06, + "loss": 0.2735, + "step": 389 + }, + { + "epoch": 0.05832211754149843, + "grad_norm": 1.0259751361449947, + "learning_rate": 9.979063400160747e-06, + "loss": 0.1788, + "step": 390 + }, + { + "epoch": 0.058471661432630474, + "grad_norm": 1.7811616961442123, + "learning_rate": 9.97884141952758e-06, + "loss": 0.2071, + "step": 391 + }, + { + "epoch": 0.058621205323762526, + "grad_norm": 2.347009922116326, + "learning_rate": 9.978618270810229e-06, + "loss": 0.4248, + "step": 392 + }, + { + "epoch": 0.05877074921489457, + "grad_norm": 1.3076474084417338, + "learning_rate": 9.978393954061052e-06, + "loss": 0.1771, + "step": 393 + }, + { + "epoch": 0.05892029310602662, + "grad_norm": 2.4165379692755455, + "learning_rate": 9.978168469332677e-06, + "loss": 0.4913, + "step": 394 + }, + { + "epoch": 0.05906983699715867, + "grad_norm": 1.6584516839965744, + "learning_rate": 9.977941816678e-06, + "loss": 0.2292, + "step": 395 + }, + { + "epoch": 0.059219380888290714, + "grad_norm": 1.3323879687206615, + "learning_rate": 9.9777139961502e-06, + "loss": 0.2042, + "step": 396 + }, + { + "epoch": 0.05936892477942276, + "grad_norm": 1.242996863833067, + "learning_rate": 9.977485007802725e-06, + "loss": 0.1759, + "step": 397 + }, + { + "epoch": 0.05951846867055481, + "grad_norm": 2.0289613301318057, + "learning_rate": 9.977254851689297e-06, + "loss": 0.3391, + "step": 398 + }, + { + "epoch": 0.059668012561686856, + "grad_norm": 1.7111890076718022, + "learning_rate": 9.977023527863913e-06, + "loss": 0.318, + "step": 399 + }, + { + "epoch": 0.0598175564528189, + "grad_norm": 2.360289838407607, + "learning_rate": 9.976791036380844e-06, + "loss": 0.7436, + "step": 400 + }, + { + "epoch": 0.059967100343950946, + "grad_norm": 1.6556682149662436, + "learning_rate": 9.976557377294634e-06, + "loss": 0.3579, + "step": 401 + }, + { + "epoch": 0.060116644235083, + "grad_norm": 1.9472299876725607, + "learning_rate": 9.976322550660103e-06, + "loss": 0.3939, + "step": 402 + }, + { + "epoch": 0.060266188126215044, + "grad_norm": 1.2625006623785717, + "learning_rate": 9.976086556532343e-06, + "loss": 0.1777, + "step": 403 + }, + { + "epoch": 0.06041573201734709, + "grad_norm": 2.142440158571368, + "learning_rate": 9.975849394966721e-06, + "loss": 0.4728, + "step": 404 + }, + { + "epoch": 0.06056527590847914, + "grad_norm": 1.3109446375337697, + "learning_rate": 9.975611066018876e-06, + "loss": 0.2035, + "step": 405 + }, + { + "epoch": 0.060714819799611186, + "grad_norm": 1.473069250695052, + "learning_rate": 9.975371569744723e-06, + "loss": 0.2502, + "step": 406 + }, + { + "epoch": 0.06086436369074323, + "grad_norm": 1.4147256960977963, + "learning_rate": 9.975130906200453e-06, + "loss": 0.1861, + "step": 407 + }, + { + "epoch": 0.06101390758187528, + "grad_norm": 1.5107559691714745, + "learning_rate": 9.97488907544252e-06, + "loss": 0.2309, + "step": 408 + }, + { + "epoch": 0.06116345147300733, + "grad_norm": 1.5467720756101462, + "learning_rate": 9.97464607752767e-06, + "loss": 0.235, + "step": 409 + }, + { + "epoch": 0.061312995364139374, + "grad_norm": 1.2901444374034334, + "learning_rate": 9.974401912512905e-06, + "loss": 0.1877, + "step": 410 + }, + { + "epoch": 0.061462539255271426, + "grad_norm": 1.8751659558285558, + "learning_rate": 9.974156580455512e-06, + "loss": 0.2941, + "step": 411 + }, + { + "epoch": 0.06161208314640347, + "grad_norm": 1.2187366523072891, + "learning_rate": 9.973910081413048e-06, + "loss": 0.2, + "step": 412 + }, + { + "epoch": 0.061761627037535516, + "grad_norm": 2.56665763030278, + "learning_rate": 9.973662415443342e-06, + "loss": 0.4259, + "step": 413 + }, + { + "epoch": 0.06191117092866756, + "grad_norm": 1.5201509236946156, + "learning_rate": 9.973413582604502e-06, + "loss": 0.2098, + "step": 414 + }, + { + "epoch": 0.06206071481979961, + "grad_norm": 2.2299268067487183, + "learning_rate": 9.973163582954903e-06, + "loss": 0.5054, + "step": 415 + }, + { + "epoch": 0.06221025871093166, + "grad_norm": 2.195400724979985, + "learning_rate": 9.972912416553202e-06, + "loss": 0.3856, + "step": 416 + }, + { + "epoch": 0.0623598026020637, + "grad_norm": 2.3196273331545876, + "learning_rate": 9.972660083458321e-06, + "loss": 0.5608, + "step": 417 + }, + { + "epoch": 0.06250934649319576, + "grad_norm": 1.6815269422927719, + "learning_rate": 9.97240658372946e-06, + "loss": 0.3682, + "step": 418 + }, + { + "epoch": 0.0626588903843278, + "grad_norm": 1.7582779956751238, + "learning_rate": 9.972151917426095e-06, + "loss": 0.2256, + "step": 419 + }, + { + "epoch": 0.06280843427545985, + "grad_norm": 1.9523974169697056, + "learning_rate": 9.97189608460797e-06, + "loss": 0.2303, + "step": 420 + }, + { + "epoch": 0.06295797816659189, + "grad_norm": 2.120409254412015, + "learning_rate": 9.97163908533511e-06, + "loss": 0.2198, + "step": 421 + }, + { + "epoch": 0.06310752205772394, + "grad_norm": 1.7213130956608376, + "learning_rate": 9.971380919667806e-06, + "loss": 0.3355, + "step": 422 + }, + { + "epoch": 0.063257065948856, + "grad_norm": 1.6609701125154137, + "learning_rate": 9.971121587666627e-06, + "loss": 0.2354, + "step": 423 + }, + { + "epoch": 0.06340660983998804, + "grad_norm": 1.2809919353271448, + "learning_rate": 9.970861089392415e-06, + "loss": 0.2043, + "step": 424 + }, + { + "epoch": 0.06355615373112009, + "grad_norm": 1.137987748410028, + "learning_rate": 9.970599424906285e-06, + "loss": 0.1714, + "step": 425 + }, + { + "epoch": 0.06370569762225213, + "grad_norm": 2.241505455994119, + "learning_rate": 9.970336594269627e-06, + "loss": 0.559, + "step": 426 + }, + { + "epoch": 0.06385524151338418, + "grad_norm": 1.8145782296174282, + "learning_rate": 9.970072597544102e-06, + "loss": 0.4695, + "step": 427 + }, + { + "epoch": 0.06400478540451622, + "grad_norm": 2.6609160560733924, + "learning_rate": 9.96980743479165e-06, + "loss": 0.3927, + "step": 428 + }, + { + "epoch": 0.06415432929564828, + "grad_norm": 1.5902127205656447, + "learning_rate": 9.969541106074477e-06, + "loss": 0.3221, + "step": 429 + }, + { + "epoch": 0.06430387318678032, + "grad_norm": 1.354440824254012, + "learning_rate": 9.969273611455066e-06, + "loss": 0.1982, + "step": 430 + }, + { + "epoch": 0.06445341707791237, + "grad_norm": 2.1796464676908682, + "learning_rate": 9.969004950996175e-06, + "loss": 0.5947, + "step": 431 + }, + { + "epoch": 0.06460296096904442, + "grad_norm": 1.6772295444343943, + "learning_rate": 9.968735124760834e-06, + "loss": 0.3567, + "step": 432 + }, + { + "epoch": 0.06475250486017646, + "grad_norm": 2.326608368656497, + "learning_rate": 9.968464132812348e-06, + "loss": 0.3934, + "step": 433 + }, + { + "epoch": 0.0649020487513085, + "grad_norm": 1.9737750855760885, + "learning_rate": 9.968191975214293e-06, + "loss": 0.3936, + "step": 434 + }, + { + "epoch": 0.06505159264244055, + "grad_norm": 2.09687169461338, + "learning_rate": 9.967918652030522e-06, + "loss": 0.3644, + "step": 435 + }, + { + "epoch": 0.06520113653357261, + "grad_norm": 2.1122151786614967, + "learning_rate": 9.967644163325157e-06, + "loss": 0.2169, + "step": 436 + }, + { + "epoch": 0.06535068042470465, + "grad_norm": 1.8368706867911107, + "learning_rate": 9.967368509162595e-06, + "loss": 0.3956, + "step": 437 + }, + { + "epoch": 0.0655002243158367, + "grad_norm": 1.7823169737575542, + "learning_rate": 9.96709168960751e-06, + "loss": 0.232, + "step": 438 + }, + { + "epoch": 0.06564976820696874, + "grad_norm": 2.1565508943507194, + "learning_rate": 9.966813704724844e-06, + "loss": 0.2228, + "step": 439 + }, + { + "epoch": 0.06579931209810079, + "grad_norm": 2.2075342060994414, + "learning_rate": 9.966534554579816e-06, + "loss": 0.204, + "step": 440 + }, + { + "epoch": 0.06594885598923284, + "grad_norm": 2.0929887441012602, + "learning_rate": 9.966254239237917e-06, + "loss": 0.3946, + "step": 441 + }, + { + "epoch": 0.06609839988036488, + "grad_norm": 2.0382287962872834, + "learning_rate": 9.965972758764912e-06, + "loss": 0.4633, + "step": 442 + }, + { + "epoch": 0.06624794377149694, + "grad_norm": 1.2772439274586147, + "learning_rate": 9.96569011322684e-06, + "loss": 0.1784, + "step": 443 + }, + { + "epoch": 0.06639748766262898, + "grad_norm": 1.1024457344648066, + "learning_rate": 9.965406302690011e-06, + "loss": 0.1625, + "step": 444 + }, + { + "epoch": 0.06654703155376103, + "grad_norm": 1.2184559623271476, + "learning_rate": 9.965121327221007e-06, + "loss": 0.1959, + "step": 445 + }, + { + "epoch": 0.06669657544489307, + "grad_norm": 1.9215235980087064, + "learning_rate": 9.964835186886692e-06, + "loss": 0.2493, + "step": 446 + }, + { + "epoch": 0.06684611933602512, + "grad_norm": 2.1443052954533974, + "learning_rate": 9.964547881754194e-06, + "loss": 0.3611, + "step": 447 + }, + { + "epoch": 0.06699566322715717, + "grad_norm": 2.6967138020110712, + "learning_rate": 9.964259411890918e-06, + "loss": 0.5427, + "step": 448 + }, + { + "epoch": 0.06714520711828922, + "grad_norm": 1.688779610685555, + "learning_rate": 9.96396977736454e-06, + "loss": 0.2569, + "step": 449 + }, + { + "epoch": 0.06729475100942127, + "grad_norm": 2.1241026975378694, + "learning_rate": 9.963678978243014e-06, + "loss": 0.3863, + "step": 450 + }, + { + "epoch": 0.06744429490055331, + "grad_norm": 1.9388647656441462, + "learning_rate": 9.96338701459456e-06, + "loss": 0.2726, + "step": 451 + }, + { + "epoch": 0.06759383879168536, + "grad_norm": 1.4657993620125664, + "learning_rate": 9.963093886487683e-06, + "loss": 0.2338, + "step": 452 + }, + { + "epoch": 0.0677433826828174, + "grad_norm": 2.307173509923502, + "learning_rate": 9.962799593991146e-06, + "loss": 0.8039, + "step": 453 + }, + { + "epoch": 0.06789292657394945, + "grad_norm": 1.2669540134016812, + "learning_rate": 9.962504137173997e-06, + "loss": 0.169, + "step": 454 + }, + { + "epoch": 0.0680424704650815, + "grad_norm": 1.5981790001004936, + "learning_rate": 9.962207516105552e-06, + "loss": 0.2019, + "step": 455 + }, + { + "epoch": 0.06819201435621355, + "grad_norm": 1.740837427237262, + "learning_rate": 9.9619097308554e-06, + "loss": 0.2116, + "step": 456 + }, + { + "epoch": 0.0683415582473456, + "grad_norm": 1.9511590671787182, + "learning_rate": 9.961610781493407e-06, + "loss": 0.2611, + "step": 457 + }, + { + "epoch": 0.06849110213847764, + "grad_norm": 1.9814713665794252, + "learning_rate": 9.961310668089708e-06, + "loss": 0.3714, + "step": 458 + }, + { + "epoch": 0.06864064602960969, + "grad_norm": 2.755804773731971, + "learning_rate": 9.96100939071471e-06, + "loss": 0.5178, + "step": 459 + }, + { + "epoch": 0.06879018992074173, + "grad_norm": 2.5378159735000225, + "learning_rate": 9.960706949439101e-06, + "loss": 0.7334, + "step": 460 + }, + { + "epoch": 0.06893973381187378, + "grad_norm": 2.3557582569765003, + "learning_rate": 9.960403344333832e-06, + "loss": 0.5763, + "step": 461 + }, + { + "epoch": 0.06908927770300584, + "grad_norm": 1.6501148783544786, + "learning_rate": 9.960098575470131e-06, + "loss": 0.3681, + "step": 462 + }, + { + "epoch": 0.06923882159413788, + "grad_norm": 1.3521314881367383, + "learning_rate": 9.959792642919505e-06, + "loss": 0.216, + "step": 463 + }, + { + "epoch": 0.06938836548526993, + "grad_norm": 1.9967115308447656, + "learning_rate": 9.959485546753724e-06, + "loss": 0.4411, + "step": 464 + }, + { + "epoch": 0.06953790937640197, + "grad_norm": 1.6934835527025132, + "learning_rate": 9.959177287044839e-06, + "loss": 0.3013, + "step": 465 + }, + { + "epoch": 0.06968745326753402, + "grad_norm": 2.1881268216288703, + "learning_rate": 9.958867863865168e-06, + "loss": 0.386, + "step": 466 + }, + { + "epoch": 0.06983699715866606, + "grad_norm": 1.746249573857031, + "learning_rate": 9.958557277287307e-06, + "loss": 0.3486, + "step": 467 + }, + { + "epoch": 0.06998654104979811, + "grad_norm": 1.3309239290400467, + "learning_rate": 9.958245527384118e-06, + "loss": 0.2512, + "step": 468 + }, + { + "epoch": 0.07013608494093017, + "grad_norm": 1.780095751208227, + "learning_rate": 9.957932614228746e-06, + "loss": 0.3579, + "step": 469 + }, + { + "epoch": 0.07028562883206221, + "grad_norm": 2.058627302052003, + "learning_rate": 9.957618537894602e-06, + "loss": 0.2234, + "step": 470 + }, + { + "epoch": 0.07043517272319426, + "grad_norm": 2.1643867800571286, + "learning_rate": 9.95730329845537e-06, + "loss": 0.2658, + "step": 471 + }, + { + "epoch": 0.0705847166143263, + "grad_norm": 1.9162877246393155, + "learning_rate": 9.956986895985009e-06, + "loss": 0.3514, + "step": 472 + }, + { + "epoch": 0.07073426050545835, + "grad_norm": 2.0198300655217474, + "learning_rate": 9.95666933055775e-06, + "loss": 0.4191, + "step": 473 + }, + { + "epoch": 0.0708838043965904, + "grad_norm": 1.8174642496449622, + "learning_rate": 9.956350602248095e-06, + "loss": 0.1802, + "step": 474 + }, + { + "epoch": 0.07103334828772245, + "grad_norm": 1.7641599345266465, + "learning_rate": 9.956030711130824e-06, + "loss": 0.2181, + "step": 475 + }, + { + "epoch": 0.0711828921788545, + "grad_norm": 1.5149058769435404, + "learning_rate": 9.955709657280985e-06, + "loss": 0.2068, + "step": 476 + }, + { + "epoch": 0.07133243606998654, + "grad_norm": 2.14267612952952, + "learning_rate": 9.955387440773902e-06, + "loss": 0.2799, + "step": 477 + }, + { + "epoch": 0.07148197996111859, + "grad_norm": 1.8794948861297893, + "learning_rate": 9.955064061685166e-06, + "loss": 0.3437, + "step": 478 + }, + { + "epoch": 0.07163152385225063, + "grad_norm": 1.595856928796192, + "learning_rate": 9.954739520090649e-06, + "loss": 0.1741, + "step": 479 + }, + { + "epoch": 0.07178106774338268, + "grad_norm": 1.4775459266699813, + "learning_rate": 9.95441381606649e-06, + "loss": 0.2009, + "step": 480 + }, + { + "epoch": 0.07193061163451472, + "grad_norm": 1.4624583034603231, + "learning_rate": 9.954086949689102e-06, + "loss": 0.2413, + "step": 481 + }, + { + "epoch": 0.07208015552564678, + "grad_norm": 1.5685428117813849, + "learning_rate": 9.953758921035171e-06, + "loss": 0.2381, + "step": 482 + }, + { + "epoch": 0.07222969941677883, + "grad_norm": 2.0490413587537524, + "learning_rate": 9.953429730181653e-06, + "loss": 0.4092, + "step": 483 + }, + { + "epoch": 0.07237924330791087, + "grad_norm": 2.605633491672469, + "learning_rate": 9.953099377205786e-06, + "loss": 0.56, + "step": 484 + }, + { + "epoch": 0.07252878719904292, + "grad_norm": 1.6836189923086853, + "learning_rate": 9.952767862185071e-06, + "loss": 0.3514, + "step": 485 + }, + { + "epoch": 0.07267833109017496, + "grad_norm": 2.165692386982445, + "learning_rate": 9.952435185197281e-06, + "loss": 0.4363, + "step": 486 + }, + { + "epoch": 0.07282787498130701, + "grad_norm": 2.328987566639375, + "learning_rate": 9.952101346320471e-06, + "loss": 0.5953, + "step": 487 + }, + { + "epoch": 0.07297741887243907, + "grad_norm": 1.857109300243422, + "learning_rate": 9.951766345632957e-06, + "loss": 0.4125, + "step": 488 + }, + { + "epoch": 0.07312696276357111, + "grad_norm": 1.780608988332075, + "learning_rate": 9.951430183213338e-06, + "loss": 0.2793, + "step": 489 + }, + { + "epoch": 0.07327650665470316, + "grad_norm": 1.2718866410706833, + "learning_rate": 9.951092859140479e-06, + "loss": 0.1878, + "step": 490 + }, + { + "epoch": 0.0734260505458352, + "grad_norm": 1.389385388824981, + "learning_rate": 9.95075437349352e-06, + "loss": 0.1922, + "step": 491 + }, + { + "epoch": 0.07357559443696725, + "grad_norm": 1.2364018773804621, + "learning_rate": 9.950414726351873e-06, + "loss": 0.1972, + "step": 492 + }, + { + "epoch": 0.0737251383280993, + "grad_norm": 1.6438922682719497, + "learning_rate": 9.95007391779522e-06, + "loss": 0.3835, + "step": 493 + }, + { + "epoch": 0.07387468221923134, + "grad_norm": 1.9223258334837023, + "learning_rate": 9.949731947903523e-06, + "loss": 0.5421, + "step": 494 + }, + { + "epoch": 0.0740242261103634, + "grad_norm": 2.1294087718057955, + "learning_rate": 9.949388816757009e-06, + "loss": 0.6584, + "step": 495 + }, + { + "epoch": 0.07417377000149544, + "grad_norm": 1.9620720670123732, + "learning_rate": 9.949044524436178e-06, + "loss": 0.3427, + "step": 496 + }, + { + "epoch": 0.07432331389262749, + "grad_norm": 1.8767982308843718, + "learning_rate": 9.948699071021806e-06, + "loss": 0.2221, + "step": 497 + }, + { + "epoch": 0.07447285778375953, + "grad_norm": 1.5717369659821445, + "learning_rate": 9.948352456594938e-06, + "loss": 0.3915, + "step": 498 + }, + { + "epoch": 0.07462240167489158, + "grad_norm": 1.9105988284269253, + "learning_rate": 9.948004681236896e-06, + "loss": 0.4049, + "step": 499 + }, + { + "epoch": 0.07477194556602362, + "grad_norm": 2.051255434710168, + "learning_rate": 9.94765574502927e-06, + "loss": 0.263, + "step": 500 + }, + { + "epoch": 0.07492148945715567, + "grad_norm": 1.1727115808022262, + "learning_rate": 9.947305648053924e-06, + "loss": 0.2061, + "step": 501 + }, + { + "epoch": 0.07507103334828773, + "grad_norm": 2.3851218898633566, + "learning_rate": 9.946954390392995e-06, + "loss": 0.3587, + "step": 502 + }, + { + "epoch": 0.07522057723941977, + "grad_norm": 2.668333899893354, + "learning_rate": 9.94660197212889e-06, + "loss": 0.279, + "step": 503 + }, + { + "epoch": 0.07537012113055182, + "grad_norm": 2.324044177768054, + "learning_rate": 9.946248393344289e-06, + "loss": 0.5219, + "step": 504 + }, + { + "epoch": 0.07551966502168386, + "grad_norm": 2.252535927387564, + "learning_rate": 9.945893654122147e-06, + "loss": 0.4462, + "step": 505 + }, + { + "epoch": 0.07566920891281591, + "grad_norm": 1.2553962948323492, + "learning_rate": 9.945537754545689e-06, + "loss": 0.1829, + "step": 506 + }, + { + "epoch": 0.07581875280394795, + "grad_norm": 2.009514792075129, + "learning_rate": 9.94518069469841e-06, + "loss": 0.334, + "step": 507 + }, + { + "epoch": 0.07596829669508001, + "grad_norm": 1.7045023449590413, + "learning_rate": 9.944822474664082e-06, + "loss": 0.3202, + "step": 508 + }, + { + "epoch": 0.07611784058621206, + "grad_norm": 1.0508191419172128, + "learning_rate": 9.944463094526747e-06, + "loss": 0.205, + "step": 509 + }, + { + "epoch": 0.0762673844773441, + "grad_norm": 1.6097293192900886, + "learning_rate": 9.944102554370718e-06, + "loss": 0.2324, + "step": 510 + }, + { + "epoch": 0.07641692836847615, + "grad_norm": 1.9399148366487866, + "learning_rate": 9.943740854280582e-06, + "loss": 0.4526, + "step": 511 + }, + { + "epoch": 0.07656647225960819, + "grad_norm": 2.0362256511499335, + "learning_rate": 9.943377994341197e-06, + "loss": 0.3979, + "step": 512 + }, + { + "epoch": 0.07671601615074024, + "grad_norm": 1.5296316888698338, + "learning_rate": 9.943013974637693e-06, + "loss": 0.3789, + "step": 513 + }, + { + "epoch": 0.07686556004187228, + "grad_norm": 1.496691000675503, + "learning_rate": 9.942648795255473e-06, + "loss": 0.2497, + "step": 514 + }, + { + "epoch": 0.07701510393300434, + "grad_norm": 1.4146486247851384, + "learning_rate": 9.942282456280212e-06, + "loss": 0.3088, + "step": 515 + }, + { + "epoch": 0.07716464782413639, + "grad_norm": 1.3671722765483707, + "learning_rate": 9.941914957797855e-06, + "loss": 0.2076, + "step": 516 + }, + { + "epoch": 0.07731419171526843, + "grad_norm": 1.8485057563465108, + "learning_rate": 9.941546299894623e-06, + "loss": 0.3676, + "step": 517 + }, + { + "epoch": 0.07746373560640048, + "grad_norm": 2.0438588429845255, + "learning_rate": 9.941176482657005e-06, + "loss": 0.4905, + "step": 518 + }, + { + "epoch": 0.07761327949753252, + "grad_norm": 1.3215533906334498, + "learning_rate": 9.940805506171765e-06, + "loss": 0.2028, + "step": 519 + }, + { + "epoch": 0.07776282338866457, + "grad_norm": 2.499241081917891, + "learning_rate": 9.940433370525937e-06, + "loss": 0.4323, + "step": 520 + }, + { + "epoch": 0.07791236727979663, + "grad_norm": 1.4654220634749195, + "learning_rate": 9.940060075806827e-06, + "loss": 0.1928, + "step": 521 + }, + { + "epoch": 0.07806191117092867, + "grad_norm": 2.32501667334618, + "learning_rate": 9.939685622102013e-06, + "loss": 0.6039, + "step": 522 + }, + { + "epoch": 0.07821145506206072, + "grad_norm": 2.0353313744113644, + "learning_rate": 9.939310009499348e-06, + "loss": 0.434, + "step": 523 + }, + { + "epoch": 0.07836099895319276, + "grad_norm": 1.5916248439200642, + "learning_rate": 9.938933238086952e-06, + "loss": 0.2484, + "step": 524 + }, + { + "epoch": 0.07851054284432481, + "grad_norm": 1.510761606083, + "learning_rate": 9.938555307953221e-06, + "loss": 0.2761, + "step": 525 + }, + { + "epoch": 0.07866008673545685, + "grad_norm": 1.6041562012438388, + "learning_rate": 9.93817621918682e-06, + "loss": 0.3032, + "step": 526 + }, + { + "epoch": 0.0788096306265889, + "grad_norm": 1.5831322947558841, + "learning_rate": 9.937795971876686e-06, + "loss": 0.3486, + "step": 527 + }, + { + "epoch": 0.07895917451772096, + "grad_norm": 2.2247878916503856, + "learning_rate": 9.93741456611203e-06, + "loss": 0.4087, + "step": 528 + }, + { + "epoch": 0.079108718408853, + "grad_norm": 2.152252638423622, + "learning_rate": 9.937032001982334e-06, + "loss": 0.5629, + "step": 529 + }, + { + "epoch": 0.07925826229998505, + "grad_norm": 2.0483514105705525, + "learning_rate": 9.93664827957735e-06, + "loss": 0.5279, + "step": 530 + }, + { + "epoch": 0.07940780619111709, + "grad_norm": 1.2448870158155207, + "learning_rate": 9.936263398987103e-06, + "loss": 0.3744, + "step": 531 + }, + { + "epoch": 0.07955735008224914, + "grad_norm": 0.9489762178863248, + "learning_rate": 9.93587736030189e-06, + "loss": 0.1631, + "step": 532 + }, + { + "epoch": 0.07970689397338118, + "grad_norm": 1.3545590640653586, + "learning_rate": 9.935490163612279e-06, + "loss": 0.1975, + "step": 533 + }, + { + "epoch": 0.07985643786451324, + "grad_norm": 1.3663228011672384, + "learning_rate": 9.93510180900911e-06, + "loss": 0.184, + "step": 534 + }, + { + "epoch": 0.08000598175564529, + "grad_norm": 1.5768436668872405, + "learning_rate": 9.934712296583497e-06, + "loss": 0.3183, + "step": 535 + }, + { + "epoch": 0.08015552564677733, + "grad_norm": 1.926347057489139, + "learning_rate": 9.93432162642682e-06, + "loss": 0.3305, + "step": 536 + }, + { + "epoch": 0.08030506953790938, + "grad_norm": 2.0791782850566474, + "learning_rate": 9.933929798630738e-06, + "loss": 0.5009, + "step": 537 + }, + { + "epoch": 0.08045461342904142, + "grad_norm": 2.1023331544425523, + "learning_rate": 9.933536813287172e-06, + "loss": 0.4292, + "step": 538 + }, + { + "epoch": 0.08060415732017347, + "grad_norm": 2.8605361415271493, + "learning_rate": 9.933142670488324e-06, + "loss": 0.2666, + "step": 539 + }, + { + "epoch": 0.08075370121130551, + "grad_norm": 2.7087693572573968, + "learning_rate": 9.932747370326664e-06, + "loss": 0.2544, + "step": 540 + }, + { + "epoch": 0.08090324510243757, + "grad_norm": 1.5804074183588281, + "learning_rate": 9.932350912894932e-06, + "loss": 0.2089, + "step": 541 + }, + { + "epoch": 0.08105278899356962, + "grad_norm": 1.6448934387271092, + "learning_rate": 9.931953298286141e-06, + "loss": 0.181, + "step": 542 + }, + { + "epoch": 0.08120233288470166, + "grad_norm": 1.373017928034036, + "learning_rate": 9.931554526593576e-06, + "loss": 0.3218, + "step": 543 + }, + { + "epoch": 0.0813518767758337, + "grad_norm": 1.4895748889012388, + "learning_rate": 9.931154597910791e-06, + "loss": 0.2472, + "step": 544 + }, + { + "epoch": 0.08150142066696575, + "grad_norm": 2.064608760225509, + "learning_rate": 9.930753512331615e-06, + "loss": 0.3765, + "step": 545 + }, + { + "epoch": 0.0816509645580978, + "grad_norm": 1.6526846905937504, + "learning_rate": 9.930351269950144e-06, + "loss": 0.3177, + "step": 546 + }, + { + "epoch": 0.08180050844922986, + "grad_norm": 2.047798829134187, + "learning_rate": 9.92994787086075e-06, + "loss": 0.3192, + "step": 547 + }, + { + "epoch": 0.0819500523403619, + "grad_norm": 2.122394373762569, + "learning_rate": 9.929543315158073e-06, + "loss": 0.5554, + "step": 548 + }, + { + "epoch": 0.08209959623149395, + "grad_norm": 2.311960518258969, + "learning_rate": 9.929137602937028e-06, + "loss": 0.3797, + "step": 549 + }, + { + "epoch": 0.08224914012262599, + "grad_norm": 1.8449832380251867, + "learning_rate": 9.928730734292797e-06, + "loss": 0.3894, + "step": 550 + }, + { + "epoch": 0.08239868401375804, + "grad_norm": 1.995255157883457, + "learning_rate": 9.928322709320834e-06, + "loss": 0.3925, + "step": 551 + }, + { + "epoch": 0.08254822790489008, + "grad_norm": 2.755405061449222, + "learning_rate": 9.92791352811687e-06, + "loss": 0.6899, + "step": 552 + }, + { + "epoch": 0.08269777179602213, + "grad_norm": 1.2254981142470793, + "learning_rate": 9.9275031907769e-06, + "loss": 0.2225, + "step": 553 + }, + { + "epoch": 0.08284731568715419, + "grad_norm": 1.9323036995913243, + "learning_rate": 9.927091697397192e-06, + "loss": 0.3865, + "step": 554 + }, + { + "epoch": 0.08299685957828623, + "grad_norm": 2.0962863974348593, + "learning_rate": 9.926679048074289e-06, + "loss": 0.4, + "step": 555 + }, + { + "epoch": 0.08314640346941828, + "grad_norm": 1.5847691098448267, + "learning_rate": 9.926265242904998e-06, + "loss": 0.247, + "step": 556 + }, + { + "epoch": 0.08329594736055032, + "grad_norm": 2.5967594290859903, + "learning_rate": 9.925850281986408e-06, + "loss": 0.2083, + "step": 557 + }, + { + "epoch": 0.08344549125168237, + "grad_norm": 2.0426826933231226, + "learning_rate": 9.925434165415868e-06, + "loss": 0.449, + "step": 558 + }, + { + "epoch": 0.08359503514281441, + "grad_norm": 1.7693278888452375, + "learning_rate": 9.925016893291007e-06, + "loss": 0.2789, + "step": 559 + }, + { + "epoch": 0.08374457903394647, + "grad_norm": 1.6227416269049326, + "learning_rate": 9.924598465709717e-06, + "loss": 0.2209, + "step": 560 + }, + { + "epoch": 0.08389412292507852, + "grad_norm": 1.7055307729140163, + "learning_rate": 9.924178882770166e-06, + "loss": 0.3554, + "step": 561 + }, + { + "epoch": 0.08404366681621056, + "grad_norm": 1.9245436136675982, + "learning_rate": 9.923758144570792e-06, + "loss": 0.5343, + "step": 562 + }, + { + "epoch": 0.0841932107073426, + "grad_norm": 1.3916186974123048, + "learning_rate": 9.923336251210306e-06, + "loss": 0.2328, + "step": 563 + }, + { + "epoch": 0.08434275459847465, + "grad_norm": 1.8724253939088875, + "learning_rate": 9.92291320278769e-06, + "loss": 0.2691, + "step": 564 + }, + { + "epoch": 0.0844922984896067, + "grad_norm": 1.545927153493535, + "learning_rate": 9.922488999402191e-06, + "loss": 0.2049, + "step": 565 + }, + { + "epoch": 0.08464184238073874, + "grad_norm": 2.216312298348258, + "learning_rate": 9.922063641153332e-06, + "loss": 0.5844, + "step": 566 + }, + { + "epoch": 0.0847913862718708, + "grad_norm": 1.2444734652143745, + "learning_rate": 9.921637128140909e-06, + "loss": 0.2872, + "step": 567 + }, + { + "epoch": 0.08494093016300285, + "grad_norm": 2.133851301389792, + "learning_rate": 9.921209460464983e-06, + "loss": 0.2418, + "step": 568 + }, + { + "epoch": 0.08509047405413489, + "grad_norm": 1.5462263702909163, + "learning_rate": 9.92078063822589e-06, + "loss": 0.3438, + "step": 569 + }, + { + "epoch": 0.08524001794526694, + "grad_norm": 2.341879963295622, + "learning_rate": 9.920350661524237e-06, + "loss": 0.5783, + "step": 570 + }, + { + "epoch": 0.08538956183639898, + "grad_norm": 1.7633187330163729, + "learning_rate": 9.919919530460899e-06, + "loss": 0.3503, + "step": 571 + }, + { + "epoch": 0.08553910572753103, + "grad_norm": 2.1676160714531107, + "learning_rate": 9.919487245137024e-06, + "loss": 0.2098, + "step": 572 + }, + { + "epoch": 0.08568864961866307, + "grad_norm": 2.198855334486466, + "learning_rate": 9.919053805654029e-06, + "loss": 0.3876, + "step": 573 + }, + { + "epoch": 0.08583819350979513, + "grad_norm": 1.821472616891953, + "learning_rate": 9.918619212113607e-06, + "loss": 0.391, + "step": 574 + }, + { + "epoch": 0.08598773740092717, + "grad_norm": 1.4553776733520012, + "learning_rate": 9.918183464617714e-06, + "loss": 0.2032, + "step": 575 + }, + { + "epoch": 0.08613728129205922, + "grad_norm": 1.5817735791823646, + "learning_rate": 9.917746563268581e-06, + "loss": 0.2658, + "step": 576 + }, + { + "epoch": 0.08628682518319127, + "grad_norm": 2.255323258805483, + "learning_rate": 9.917308508168712e-06, + "loss": 0.39, + "step": 577 + }, + { + "epoch": 0.08643636907432331, + "grad_norm": 1.699175902078527, + "learning_rate": 9.916869299420875e-06, + "loss": 0.1906, + "step": 578 + }, + { + "epoch": 0.08658591296545536, + "grad_norm": 1.5572993513277051, + "learning_rate": 9.916428937128117e-06, + "loss": 0.3438, + "step": 579 + }, + { + "epoch": 0.08673545685658741, + "grad_norm": 1.5095119263162684, + "learning_rate": 9.915987421393747e-06, + "loss": 0.272, + "step": 580 + }, + { + "epoch": 0.08688500074771946, + "grad_norm": 2.8137128440101735, + "learning_rate": 9.91554475232135e-06, + "loss": 0.3833, + "step": 581 + }, + { + "epoch": 0.0870345446388515, + "grad_norm": 1.845156278788705, + "learning_rate": 9.915100930014786e-06, + "loss": 0.4658, + "step": 582 + }, + { + "epoch": 0.08718408852998355, + "grad_norm": 1.7624433765379017, + "learning_rate": 9.914655954578171e-06, + "loss": 0.3968, + "step": 583 + }, + { + "epoch": 0.0873336324211156, + "grad_norm": 1.7915618837196812, + "learning_rate": 9.914209826115906e-06, + "loss": 0.4901, + "step": 584 + }, + { + "epoch": 0.08748317631224764, + "grad_norm": 1.8335500777788887, + "learning_rate": 9.913762544732654e-06, + "loss": 0.249, + "step": 585 + }, + { + "epoch": 0.08763272020337969, + "grad_norm": 1.5116580783389033, + "learning_rate": 9.913314110533355e-06, + "loss": 0.3999, + "step": 586 + }, + { + "epoch": 0.08778226409451174, + "grad_norm": 1.9828537343745032, + "learning_rate": 9.912864523623214e-06, + "loss": 0.4153, + "step": 587 + }, + { + "epoch": 0.08793180798564379, + "grad_norm": 1.6056147158647165, + "learning_rate": 9.912413784107709e-06, + "loss": 0.357, + "step": 588 + }, + { + "epoch": 0.08808135187677583, + "grad_norm": 1.7642170812152784, + "learning_rate": 9.911961892092587e-06, + "loss": 0.3425, + "step": 589 + }, + { + "epoch": 0.08823089576790788, + "grad_norm": 1.925307511563271, + "learning_rate": 9.911508847683867e-06, + "loss": 0.4476, + "step": 590 + }, + { + "epoch": 0.08838043965903992, + "grad_norm": 1.9824372539957273, + "learning_rate": 9.911054650987837e-06, + "loss": 0.4597, + "step": 591 + }, + { + "epoch": 0.08852998355017197, + "grad_norm": 1.5805088418089035, + "learning_rate": 9.910599302111057e-06, + "loss": 0.1935, + "step": 592 + }, + { + "epoch": 0.08867952744130403, + "grad_norm": 2.157404890931188, + "learning_rate": 9.910142801160355e-06, + "loss": 0.3443, + "step": 593 + }, + { + "epoch": 0.08882907133243607, + "grad_norm": 2.094900000445731, + "learning_rate": 9.909685148242831e-06, + "loss": 0.404, + "step": 594 + }, + { + "epoch": 0.08897861522356812, + "grad_norm": 2.336415519412793, + "learning_rate": 9.909226343465856e-06, + "loss": 0.6382, + "step": 595 + }, + { + "epoch": 0.08912815911470016, + "grad_norm": 2.0552137049182497, + "learning_rate": 9.908766386937067e-06, + "loss": 0.3908, + "step": 596 + }, + { + "epoch": 0.08927770300583221, + "grad_norm": 1.1564393734179468, + "learning_rate": 9.908305278764376e-06, + "loss": 0.2457, + "step": 597 + }, + { + "epoch": 0.08942724689696425, + "grad_norm": 1.8704284289450437, + "learning_rate": 9.907843019055966e-06, + "loss": 0.3604, + "step": 598 + }, + { + "epoch": 0.0895767907880963, + "grad_norm": 1.295042190600909, + "learning_rate": 9.907379607920281e-06, + "loss": 0.2075, + "step": 599 + }, + { + "epoch": 0.08972633467922836, + "grad_norm": 1.8305770820800886, + "learning_rate": 9.90691504546605e-06, + "loss": 0.2698, + "step": 600 + }, + { + "epoch": 0.0898758785703604, + "grad_norm": 1.7240290275544472, + "learning_rate": 9.906449331802256e-06, + "loss": 0.2504, + "step": 601 + }, + { + "epoch": 0.09002542246149245, + "grad_norm": 1.0036789417827203, + "learning_rate": 9.905982467038167e-06, + "loss": 0.195, + "step": 602 + }, + { + "epoch": 0.0901749663526245, + "grad_norm": 1.6777253578130231, + "learning_rate": 9.905514451283308e-06, + "loss": 0.2436, + "step": 603 + }, + { + "epoch": 0.09032451024375654, + "grad_norm": 1.9190873052270145, + "learning_rate": 9.905045284647483e-06, + "loss": 0.4006, + "step": 604 + }, + { + "epoch": 0.09047405413488858, + "grad_norm": 1.77001911452716, + "learning_rate": 9.904574967240764e-06, + "loss": 0.3703, + "step": 605 + }, + { + "epoch": 0.09062359802602064, + "grad_norm": 1.3114492277508998, + "learning_rate": 9.904103499173487e-06, + "loss": 0.2323, + "step": 606 + }, + { + "epoch": 0.09077314191715269, + "grad_norm": 1.6694643051834908, + "learning_rate": 9.90363088055627e-06, + "loss": 0.2881, + "step": 607 + }, + { + "epoch": 0.09092268580828473, + "grad_norm": 1.4448454411512122, + "learning_rate": 9.903157111499988e-06, + "loss": 0.2341, + "step": 608 + }, + { + "epoch": 0.09107222969941678, + "grad_norm": 1.8302982894061834, + "learning_rate": 9.902682192115795e-06, + "loss": 0.3497, + "step": 609 + }, + { + "epoch": 0.09122177359054882, + "grad_norm": 1.4089802820999182, + "learning_rate": 9.902206122515113e-06, + "loss": 0.1565, + "step": 610 + }, + { + "epoch": 0.09137131748168087, + "grad_norm": 2.275670976517465, + "learning_rate": 9.901728902809627e-06, + "loss": 0.482, + "step": 611 + }, + { + "epoch": 0.09152086137281291, + "grad_norm": 2.3916744409549997, + "learning_rate": 9.901250533111301e-06, + "loss": 0.539, + "step": 612 + }, + { + "epoch": 0.09167040526394497, + "grad_norm": 1.110965438282227, + "learning_rate": 9.900771013532367e-06, + "loss": 0.2257, + "step": 613 + }, + { + "epoch": 0.09181994915507702, + "grad_norm": 1.6169969209154105, + "learning_rate": 9.900290344185321e-06, + "loss": 0.2316, + "step": 614 + }, + { + "epoch": 0.09196949304620906, + "grad_norm": 1.390950490331229, + "learning_rate": 9.899808525182935e-06, + "loss": 0.1735, + "step": 615 + }, + { + "epoch": 0.09211903693734111, + "grad_norm": 1.26641152514348, + "learning_rate": 9.899325556638247e-06, + "loss": 0.2269, + "step": 616 + }, + { + "epoch": 0.09226858082847315, + "grad_norm": 1.107259968960053, + "learning_rate": 9.898841438664568e-06, + "loss": 0.2082, + "step": 617 + }, + { + "epoch": 0.0924181247196052, + "grad_norm": 1.6779136428714192, + "learning_rate": 9.898356171375473e-06, + "loss": 0.3744, + "step": 618 + }, + { + "epoch": 0.09256766861073726, + "grad_norm": 1.8012739115801626, + "learning_rate": 9.897869754884816e-06, + "loss": 0.2438, + "step": 619 + }, + { + "epoch": 0.0927172125018693, + "grad_norm": 1.6400812519548655, + "learning_rate": 9.89738218930671e-06, + "loss": 0.3692, + "step": 620 + }, + { + "epoch": 0.09286675639300135, + "grad_norm": 2.7659374426954972, + "learning_rate": 9.896893474755547e-06, + "loss": 0.5873, + "step": 621 + }, + { + "epoch": 0.0930163002841334, + "grad_norm": 3.020452608035097, + "learning_rate": 9.89640361134598e-06, + "loss": 0.4177, + "step": 622 + }, + { + "epoch": 0.09316584417526544, + "grad_norm": 1.4907614824403637, + "learning_rate": 9.895912599192937e-06, + "loss": 0.2516, + "step": 623 + }, + { + "epoch": 0.09331538806639748, + "grad_norm": 1.6636615032724535, + "learning_rate": 9.895420438411616e-06, + "loss": 0.1935, + "step": 624 + }, + { + "epoch": 0.09346493195752953, + "grad_norm": 1.9719905447621995, + "learning_rate": 9.89492712911748e-06, + "loss": 0.2135, + "step": 625 + }, + { + "epoch": 0.09361447584866159, + "grad_norm": 1.3681787330772102, + "learning_rate": 9.894432671426264e-06, + "loss": 0.208, + "step": 626 + }, + { + "epoch": 0.09376401973979363, + "grad_norm": 2.0793649946453043, + "learning_rate": 9.893937065453976e-06, + "loss": 0.3719, + "step": 627 + }, + { + "epoch": 0.09391356363092568, + "grad_norm": 1.685584025343787, + "learning_rate": 9.893440311316887e-06, + "loss": 0.2164, + "step": 628 + }, + { + "epoch": 0.09406310752205772, + "grad_norm": 1.2145425693019332, + "learning_rate": 9.892942409131541e-06, + "loss": 0.1725, + "step": 629 + }, + { + "epoch": 0.09421265141318977, + "grad_norm": 1.1438517718036314, + "learning_rate": 9.892443359014752e-06, + "loss": 0.2367, + "step": 630 + }, + { + "epoch": 0.09436219530432181, + "grad_norm": 1.4416913213257094, + "learning_rate": 9.8919431610836e-06, + "loss": 0.2254, + "step": 631 + }, + { + "epoch": 0.09451173919545386, + "grad_norm": 1.2656296241346114, + "learning_rate": 9.891441815455436e-06, + "loss": 0.2485, + "step": 632 + }, + { + "epoch": 0.09466128308658592, + "grad_norm": 1.4276056880724206, + "learning_rate": 9.890939322247881e-06, + "loss": 0.1908, + "step": 633 + }, + { + "epoch": 0.09481082697771796, + "grad_norm": 1.8185771152087218, + "learning_rate": 9.890435681578827e-06, + "loss": 0.2096, + "step": 634 + }, + { + "epoch": 0.09496037086885001, + "grad_norm": 1.2794518689910337, + "learning_rate": 9.88993089356643e-06, + "loss": 0.2394, + "step": 635 + }, + { + "epoch": 0.09510991475998205, + "grad_norm": 2.0227594086297738, + "learning_rate": 9.88942495832912e-06, + "loss": 0.59, + "step": 636 + }, + { + "epoch": 0.0952594586511141, + "grad_norm": 1.3323082817593526, + "learning_rate": 9.888917875985593e-06, + "loss": 0.2073, + "step": 637 + }, + { + "epoch": 0.09540900254224614, + "grad_norm": 1.7884206661676574, + "learning_rate": 9.888409646654818e-06, + "loss": 0.3897, + "step": 638 + }, + { + "epoch": 0.0955585464333782, + "grad_norm": 2.124144136353745, + "learning_rate": 9.887900270456025e-06, + "loss": 0.5683, + "step": 639 + }, + { + "epoch": 0.09570809032451025, + "grad_norm": 1.4793433841619534, + "learning_rate": 9.887389747508725e-06, + "loss": 0.3727, + "step": 640 + }, + { + "epoch": 0.09585763421564229, + "grad_norm": 1.0661747667222115, + "learning_rate": 9.88687807793269e-06, + "loss": 0.1983, + "step": 641 + }, + { + "epoch": 0.09600717810677434, + "grad_norm": 1.615153009655538, + "learning_rate": 9.886365261847957e-06, + "loss": 0.3675, + "step": 642 + }, + { + "epoch": 0.09615672199790638, + "grad_norm": 1.4963878387365324, + "learning_rate": 9.885851299374844e-06, + "loss": 0.1805, + "step": 643 + }, + { + "epoch": 0.09630626588903843, + "grad_norm": 1.8529323065992462, + "learning_rate": 9.88533619063393e-06, + "loss": 0.391, + "step": 644 + }, + { + "epoch": 0.09645580978017047, + "grad_norm": 2.4764246014732145, + "learning_rate": 9.884819935746063e-06, + "loss": 0.2605, + "step": 645 + }, + { + "epoch": 0.09660535367130253, + "grad_norm": 1.904672440883197, + "learning_rate": 9.884302534832361e-06, + "loss": 0.3935, + "step": 646 + }, + { + "epoch": 0.09675489756243458, + "grad_norm": 1.9431435460380113, + "learning_rate": 9.883783988014216e-06, + "loss": 0.2092, + "step": 647 + }, + { + "epoch": 0.09690444145356662, + "grad_norm": 2.0946695671241553, + "learning_rate": 9.883264295413278e-06, + "loss": 0.3957, + "step": 648 + }, + { + "epoch": 0.09705398534469867, + "grad_norm": 1.0944344711946927, + "learning_rate": 9.882743457151476e-06, + "loss": 0.202, + "step": 649 + }, + { + "epoch": 0.09720352923583071, + "grad_norm": 1.5147259026498003, + "learning_rate": 9.882221473351e-06, + "loss": 0.3029, + "step": 650 + }, + { + "epoch": 0.09735307312696276, + "grad_norm": 1.3452835965457643, + "learning_rate": 9.881698344134316e-06, + "loss": 0.2159, + "step": 651 + }, + { + "epoch": 0.09750261701809482, + "grad_norm": 1.7952640402406481, + "learning_rate": 9.881174069624155e-06, + "loss": 0.4006, + "step": 652 + }, + { + "epoch": 0.09765216090922686, + "grad_norm": 2.468540255171398, + "learning_rate": 9.880648649943515e-06, + "loss": 0.4393, + "step": 653 + }, + { + "epoch": 0.09780170480035891, + "grad_norm": 1.5332585075726441, + "learning_rate": 9.880122085215664e-06, + "loss": 0.2401, + "step": 654 + }, + { + "epoch": 0.09795124869149095, + "grad_norm": 1.5882881108110953, + "learning_rate": 9.87959437556414e-06, + "loss": 0.2078, + "step": 655 + }, + { + "epoch": 0.098100792582623, + "grad_norm": 1.7962702189497488, + "learning_rate": 9.87906552111275e-06, + "loss": 0.4793, + "step": 656 + }, + { + "epoch": 0.09825033647375504, + "grad_norm": 1.860004859316795, + "learning_rate": 9.878535521985568e-06, + "loss": 0.2388, + "step": 657 + }, + { + "epoch": 0.09839988036488709, + "grad_norm": 1.9861019609665855, + "learning_rate": 9.878004378306934e-06, + "loss": 0.3721, + "step": 658 + }, + { + "epoch": 0.09854942425601915, + "grad_norm": 1.5404208138898199, + "learning_rate": 9.877472090201463e-06, + "loss": 0.3534, + "step": 659 + }, + { + "epoch": 0.09869896814715119, + "grad_norm": 3.0119825067072306, + "learning_rate": 9.876938657794036e-06, + "loss": 0.6732, + "step": 660 + }, + { + "epoch": 0.09884851203828324, + "grad_norm": 1.5069735817087104, + "learning_rate": 9.876404081209796e-06, + "loss": 0.4004, + "step": 661 + }, + { + "epoch": 0.09899805592941528, + "grad_norm": 1.6856753387650372, + "learning_rate": 9.875868360574164e-06, + "loss": 0.2942, + "step": 662 + }, + { + "epoch": 0.09914759982054733, + "grad_norm": 1.6896901311725145, + "learning_rate": 9.875331496012822e-06, + "loss": 0.239, + "step": 663 + }, + { + "epoch": 0.09929714371167937, + "grad_norm": 2.2770505228904225, + "learning_rate": 9.87479348765173e-06, + "loss": 0.4755, + "step": 664 + }, + { + "epoch": 0.09944668760281143, + "grad_norm": 1.9016485099179228, + "learning_rate": 9.874254335617102e-06, + "loss": 0.4645, + "step": 665 + }, + { + "epoch": 0.09959623149394348, + "grad_norm": 1.6638896812103354, + "learning_rate": 9.873714040035434e-06, + "loss": 0.2512, + "step": 666 + }, + { + "epoch": 0.09974577538507552, + "grad_norm": 1.7233554952000107, + "learning_rate": 9.873172601033482e-06, + "loss": 0.3958, + "step": 667 + }, + { + "epoch": 0.09989531927620757, + "grad_norm": 1.7250170911584946, + "learning_rate": 9.872630018738271e-06, + "loss": 0.3115, + "step": 668 + }, + { + "epoch": 0.10004486316733961, + "grad_norm": 1.8843746906489027, + "learning_rate": 9.872086293277101e-06, + "loss": 0.3789, + "step": 669 + } + ], + "logging_steps": 1.0, + "max_steps": 6687, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 669, + "total_flos": 41613770170368.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}