{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.6039927404718695, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01161524500907441, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 4.6825, "step": 1, "ts_encoder_learning_rate": 0.0 }, { "epoch": 0.02323049001814882, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 4.6428, "step": 2, "ts_encoder_learning_rate": 0.0 }, { "epoch": 0.03484573502722323, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 4.6696, "step": 3, "ts_encoder_learning_rate": 0.0 }, { "epoch": 0.04646098003629764, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 4.6588, "step": 4, "ts_encoder_learning_rate": 0.0 }, { "epoch": 0.05807622504537205, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 4.7245, "step": 5, "ts_encoder_learning_rate": 0.0 }, { "epoch": 0.06969147005444647, "grad_norm": 152.50412651338738, "learning_rate": 0.0, "loss": 4.6365, "step": 6, "ts_encoder_learning_rate": 1.25e-06 }, { "epoch": 0.08130671506352087, "grad_norm": 153.96358887797103, "learning_rate": 1.25e-06, "loss": 4.6399, "step": 7, "ts_encoder_learning_rate": 2.5e-06 }, { "epoch": 0.09292196007259527, "grad_norm": 119.30991989360129, "learning_rate": 2.5e-06, "loss": 4.0109, "step": 8, "ts_encoder_learning_rate": 3.7500000000000005e-06 }, { "epoch": 0.10453720508166969, "grad_norm": 62.980179337580374, "learning_rate": 3.7500000000000005e-06, "loss": 2.8417, "step": 9, "ts_encoder_learning_rate": 5e-06 }, { "epoch": 0.1161524500907441, "grad_norm": 33.513562064012554, "learning_rate": 5e-06, "loss": 2.1341, "step": 10, "ts_encoder_learning_rate": 6.25e-06 }, { "epoch": 0.1277676950998185, "grad_norm": 14.313077235644876, "learning_rate": 6.25e-06, "loss": 1.7057, "step": 11, "ts_encoder_learning_rate": 7.500000000000001e-06 }, { "epoch": 0.13938294010889293, "grad_norm": 6.369652791543636, "learning_rate": 7.500000000000001e-06, "loss": 1.3565, "step": 12, "ts_encoder_learning_rate": 8.750000000000001e-06 }, { "epoch": 0.15099818511796734, "grad_norm": 3.731701365388006, "learning_rate": 8.750000000000001e-06, "loss": 1.1662, "step": 13, "ts_encoder_learning_rate": 1e-05 }, { "epoch": 0.16261343012704174, "grad_norm": 2.6361585613163587, "learning_rate": 1e-05, "loss": 1.065, "step": 14, "ts_encoder_learning_rate": 9.999839429671632e-06 }, { "epoch": 0.17422867513611615, "grad_norm": 1.7275515554655163, "learning_rate": 9.999839429671632e-06, "loss": 1.0108, "step": 15, "ts_encoder_learning_rate": 9.999357728999657e-06 }, { "epoch": 0.18584392014519055, "grad_norm": 1.3395365357042155, "learning_rate": 9.999357728999657e-06, "loss": 0.9446, "step": 16, "ts_encoder_learning_rate": 9.99855492892281e-06 }, { "epoch": 0.19745916515426498, "grad_norm": 1.2724446763819477, "learning_rate": 9.99855492892281e-06, "loss": 0.8875, "step": 17, "ts_encoder_learning_rate": 9.99743108100344e-06 }, { "epoch": 0.20907441016333939, "grad_norm": 1.1656594933692108, "learning_rate": 9.99743108100344e-06, "loss": 0.8609, "step": 18, "ts_encoder_learning_rate": 9.9959862574242e-06 }, { "epoch": 0.2206896551724138, "grad_norm": 0.8157300015925317, "learning_rate": 9.9959862574242e-06, "loss": 0.8392, "step": 19, "ts_encoder_learning_rate": 9.994220550983404e-06 }, { "epoch": 0.2323049001814882, "grad_norm": 0.8647381094727201, "learning_rate": 9.994220550983404e-06, "loss": 0.8233, "step": 20, "ts_encoder_learning_rate": 9.992134075089085e-06 }, { "epoch": 0.24392014519056263, "grad_norm": 0.7179729634940382, "learning_rate": 9.992134075089085e-06, "loss": 0.7767, "step": 21, "ts_encoder_learning_rate": 9.989726963751683e-06 }, { "epoch": 0.255535390199637, "grad_norm": 0.5610758217332802, "learning_rate": 9.989726963751683e-06, "loss": 0.7525, "step": 22, "ts_encoder_learning_rate": 9.986999371575465e-06 }, { "epoch": 0.2671506352087114, "grad_norm": 0.6140193378709962, "learning_rate": 9.986999371575465e-06, "loss": 0.7503, "step": 23, "ts_encoder_learning_rate": 9.983951473748579e-06 }, { "epoch": 0.27876588021778587, "grad_norm": 0.5463424314841056, "learning_rate": 9.983951473748579e-06, "loss": 0.7361, "step": 24, "ts_encoder_learning_rate": 9.980583466031808e-06 }, { "epoch": 0.29038112522686027, "grad_norm": 0.5456053434098116, "learning_rate": 9.980583466031808e-06, "loss": 0.7445, "step": 25, "ts_encoder_learning_rate": 9.976895564745993e-06 }, { "epoch": 0.3019963702359347, "grad_norm": 0.4724488207315688, "learning_rate": 9.976895564745993e-06, "loss": 0.7258, "step": 26, "ts_encoder_learning_rate": 9.97288800675814e-06 }, { "epoch": 0.3136116152450091, "grad_norm": 0.43473472160091314, "learning_rate": 9.97288800675814e-06, "loss": 0.7131, "step": 27, "ts_encoder_learning_rate": 9.968561049466214e-06 }, { "epoch": 0.3252268602540835, "grad_norm": 0.4651449922676477, "learning_rate": 9.968561049466214e-06, "loss": 0.7125, "step": 28, "ts_encoder_learning_rate": 9.963914970782594e-06 }, { "epoch": 0.3368421052631579, "grad_norm": 0.42304487514126465, "learning_rate": 9.963914970782594e-06, "loss": 0.6958, "step": 29, "ts_encoder_learning_rate": 9.95895006911623e-06 }, { "epoch": 0.3484573502722323, "grad_norm": 0.37512478085470513, "learning_rate": 9.95895006911623e-06, "loss": 0.6815, "step": 30, "ts_encoder_learning_rate": 9.953666663353485e-06 }, { "epoch": 0.3600725952813067, "grad_norm": 0.38681860428776565, "learning_rate": 9.953666663353485e-06, "loss": 0.682, "step": 31, "ts_encoder_learning_rate": 9.948065092837631e-06 }, { "epoch": 0.3716878402903811, "grad_norm": 0.3878313981786008, "learning_rate": 9.948065092837631e-06, "loss": 0.6705, "step": 32, "ts_encoder_learning_rate": 9.942145717347077e-06 }, { "epoch": 0.38330308529945556, "grad_norm": 0.3483493230771332, "learning_rate": 9.942145717347077e-06, "loss": 0.6517, "step": 33, "ts_encoder_learning_rate": 9.935908917072253e-06 }, { "epoch": 0.39491833030852996, "grad_norm": 0.32925851151692315, "learning_rate": 9.935908917072253e-06, "loss": 0.6611, "step": 34, "ts_encoder_learning_rate": 9.92935509259118e-06 }, { "epoch": 0.40653357531760437, "grad_norm": 0.3655692991223213, "learning_rate": 9.92935509259118e-06, "loss": 0.6449, "step": 35, "ts_encoder_learning_rate": 9.922484664843763e-06 }, { "epoch": 0.41814882032667877, "grad_norm": 0.3582656975961708, "learning_rate": 9.922484664843763e-06, "loss": 0.6522, "step": 36, "ts_encoder_learning_rate": 9.915298075104735e-06 }, { "epoch": 0.4297640653357532, "grad_norm": 0.33379075182945417, "learning_rate": 9.915298075104735e-06, "loss": 0.6416, "step": 37, "ts_encoder_learning_rate": 9.907795784955327e-06 }, { "epoch": 0.4413793103448276, "grad_norm": 0.3347132110396014, "learning_rate": 9.907795784955327e-06, "loss": 0.6539, "step": 38, "ts_encoder_learning_rate": 9.899978276253617e-06 }, { "epoch": 0.452994555353902, "grad_norm": 0.3241158793623529, "learning_rate": 9.899978276253617e-06, "loss": 0.6438, "step": 39, "ts_encoder_learning_rate": 9.891846051103578e-06 }, { "epoch": 0.4646098003629764, "grad_norm": 0.3455452567899899, "learning_rate": 9.891846051103578e-06, "loss": 0.6316, "step": 40, "ts_encoder_learning_rate": 9.883399631822836e-06 }, { "epoch": 0.4762250453720508, "grad_norm": 0.31313557573101863, "learning_rate": 9.883399631822836e-06, "loss": 0.6389, "step": 41, "ts_encoder_learning_rate": 9.874639560909118e-06 }, { "epoch": 0.48784029038112525, "grad_norm": 0.3280416621294979, "learning_rate": 9.874639560909118e-06, "loss": 0.6285, "step": 42, "ts_encoder_learning_rate": 9.86556640100541e-06 }, { "epoch": 0.49945553539019966, "grad_norm": 0.3198451666750831, "learning_rate": 9.86556640100541e-06, "loss": 0.6356, "step": 43, "ts_encoder_learning_rate": 9.85618073486382e-06 }, { "epoch": 0.511070780399274, "grad_norm": 0.32065369243985437, "learning_rate": 9.85618073486382e-06, "loss": 0.6301, "step": 44, "ts_encoder_learning_rate": 9.846483165308142e-06 }, { "epoch": 0.5226860254083484, "grad_norm": 0.2985686533559952, "learning_rate": 9.846483165308142e-06, "loss": 0.6094, "step": 45, "ts_encoder_learning_rate": 9.836474315195148e-06 }, { "epoch": 0.5343012704174228, "grad_norm": 0.3043913719441071, "learning_rate": 9.836474315195148e-06, "loss": 0.618, "step": 46, "ts_encoder_learning_rate": 9.826154827374578e-06 }, { "epoch": 0.5459165154264973, "grad_norm": 0.29426029916433744, "learning_rate": 9.826154827374578e-06, "loss": 0.6117, "step": 47, "ts_encoder_learning_rate": 9.815525364647853e-06 }, { "epoch": 0.5575317604355717, "grad_norm": 0.29759373582076726, "learning_rate": 9.815525364647853e-06, "loss": 0.6102, "step": 48, "ts_encoder_learning_rate": 9.804586609725499e-06 }, { "epoch": 0.5691470054446461, "grad_norm": 0.2991170372194726, "learning_rate": 9.804586609725499e-06, "loss": 0.5973, "step": 49, "ts_encoder_learning_rate": 9.793339265183303e-06 }, { "epoch": 0.5807622504537205, "grad_norm": 0.297629927322108, "learning_rate": 9.793339265183303e-06, "loss": 0.5997, "step": 50, "ts_encoder_learning_rate": 9.781784053417192e-06 }, { "epoch": 0.592377495462795, "grad_norm": 0.29559157031475897, "learning_rate": 9.781784053417192e-06, "loss": 0.6012, "step": 51, "ts_encoder_learning_rate": 9.76992171659682e-06 }, { "epoch": 0.6039927404718693, "grad_norm": 0.30135176793549534, "learning_rate": 9.76992171659682e-06, "loss": 0.5997, "step": 52, "ts_encoder_learning_rate": 9.757753016617917e-06 }, { "epoch": 0.6156079854809438, "grad_norm": 0.2830877744764034, "learning_rate": 9.757753016617917e-06, "loss": 0.593, "step": 53, "ts_encoder_learning_rate": 9.745278735053345e-06 }, { "epoch": 0.6272232304900182, "grad_norm": 0.30643259357036984, "learning_rate": 9.745278735053345e-06, "loss": 0.5861, "step": 54, "ts_encoder_learning_rate": 9.732499673102895e-06 }, { "epoch": 0.6388384754990926, "grad_norm": 0.33677739208267987, "learning_rate": 9.732499673102895e-06, "loss": 0.579, "step": 55, "ts_encoder_learning_rate": 9.719416651541839e-06 }, { "epoch": 0.650453720508167, "grad_norm": 0.3205752816564006, "learning_rate": 9.719416651541839e-06, "loss": 0.589, "step": 56, "ts_encoder_learning_rate": 9.706030510668202e-06 }, { "epoch": 0.6620689655172414, "grad_norm": 0.3136856624465887, "learning_rate": 9.706030510668202e-06, "loss": 0.5719, "step": 57, "ts_encoder_learning_rate": 9.692342110248802e-06 }, { "epoch": 0.6736842105263158, "grad_norm": 0.3538385796052594, "learning_rate": 9.692342110248802e-06, "loss": 0.5836, "step": 58, "ts_encoder_learning_rate": 9.678352329464018e-06 }, { "epoch": 0.6852994555353902, "grad_norm": 0.3412039613534201, "learning_rate": 9.678352329464018e-06, "loss": 0.5776, "step": 59, "ts_encoder_learning_rate": 9.664062066851325e-06 }, { "epoch": 0.6969147005444646, "grad_norm": 0.3470773794842348, "learning_rate": 9.664062066851325e-06, "loss": 0.5851, "step": 60, "ts_encoder_learning_rate": 9.649472240247588e-06 }, { "epoch": 0.708529945553539, "grad_norm": 0.3212528977463762, "learning_rate": 9.649472240247588e-06, "loss": 0.5739, "step": 61, "ts_encoder_learning_rate": 9.63458378673011e-06 }, { "epoch": 0.7201451905626134, "grad_norm": 0.3628948373052999, "learning_rate": 9.63458378673011e-06, "loss": 0.5667, "step": 62, "ts_encoder_learning_rate": 9.619397662556434e-06 }, { "epoch": 0.7317604355716878, "grad_norm": 0.35120878235548797, "learning_rate": 9.619397662556434e-06, "loss": 0.5746, "step": 63, "ts_encoder_learning_rate": 9.603914843102941e-06 }, { "epoch": 0.7433756805807622, "grad_norm": 0.34896054004358656, "learning_rate": 9.603914843102941e-06, "loss": 0.5683, "step": 64, "ts_encoder_learning_rate": 9.588136322802194e-06 }, { "epoch": 0.7549909255898367, "grad_norm": 0.34262893904308916, "learning_rate": 9.588136322802194e-06, "loss": 0.5576, "step": 65, "ts_encoder_learning_rate": 9.572063115079063e-06 }, { "epoch": 0.7666061705989111, "grad_norm": 0.3380023238243971, "learning_rate": 9.572063115079063e-06, "loss": 0.553, "step": 66, "ts_encoder_learning_rate": 9.555696252285648e-06 }, { "epoch": 0.7782214156079855, "grad_norm": 0.3875484812249266, "learning_rate": 9.555696252285648e-06, "loss": 0.5596, "step": 67, "ts_encoder_learning_rate": 9.539036785634961e-06 }, { "epoch": 0.7898366606170599, "grad_norm": 0.35930423509307985, "learning_rate": 9.539036785634961e-06, "loss": 0.5545, "step": 68, "ts_encoder_learning_rate": 9.522085785133415e-06 }, { "epoch": 0.8014519056261343, "grad_norm": 0.3791388166119568, "learning_rate": 9.522085785133415e-06, "loss": 0.5513, "step": 69, "ts_encoder_learning_rate": 9.504844339512096e-06 }, { "epoch": 0.8130671506352087, "grad_norm": 0.34685014988411594, "learning_rate": 9.504844339512096e-06, "loss": 0.5516, "step": 70, "ts_encoder_learning_rate": 9.48731355615684e-06 }, { "epoch": 0.8246823956442831, "grad_norm": 0.35909523874401894, "learning_rate": 9.48731355615684e-06, "loss": 0.5425, "step": 71, "ts_encoder_learning_rate": 9.469494561037097e-06 }, { "epoch": 0.8362976406533575, "grad_norm": 0.4403897777700719, "learning_rate": 9.469494561037097e-06, "loss": 0.5329, "step": 72, "ts_encoder_learning_rate": 9.451388498633635e-06 }, { "epoch": 0.847912885662432, "grad_norm": 0.3818885699775511, "learning_rate": 9.451388498633635e-06, "loss": 0.5365, "step": 73, "ts_encoder_learning_rate": 9.432996531865001e-06 }, { "epoch": 0.8595281306715064, "grad_norm": 0.40435312969694975, "learning_rate": 9.432996531865001e-06, "loss": 0.5315, "step": 74, "ts_encoder_learning_rate": 9.414319842012855e-06 }, { "epoch": 0.8711433756805808, "grad_norm": 0.4248553104454013, "learning_rate": 9.414319842012855e-06, "loss": 0.533, "step": 75, "ts_encoder_learning_rate": 9.395359628646087e-06 }, { "epoch": 0.8827586206896552, "grad_norm": 0.39206791521135576, "learning_rate": 9.395359628646087e-06, "loss": 0.5256, "step": 76, "ts_encoder_learning_rate": 9.376117109543769e-06 }, { "epoch": 0.8943738656987296, "grad_norm": 0.36377005854333166, "learning_rate": 9.376117109543769e-06, "loss": 0.5194, "step": 77, "ts_encoder_learning_rate": 9.356593520616948e-06 }, { "epoch": 0.905989110707804, "grad_norm": 0.43146303271047914, "learning_rate": 9.356593520616948e-06, "loss": 0.5267, "step": 78, "ts_encoder_learning_rate": 9.336790115829255e-06 }, { "epoch": 0.9176043557168784, "grad_norm": 0.3621214209550119, "learning_rate": 9.336790115829255e-06, "loss": 0.5218, "step": 79, "ts_encoder_learning_rate": 9.316708167116377e-06 }, { "epoch": 0.9292196007259528, "grad_norm": 0.40377960504482446, "learning_rate": 9.316708167116377e-06, "loss": 0.5214, "step": 80, "ts_encoder_learning_rate": 9.296348964304351e-06 }, { "epoch": 0.9408348457350272, "grad_norm": 0.3806316238680835, "learning_rate": 9.296348964304351e-06, "loss": 0.5102, "step": 81, "ts_encoder_learning_rate": 9.275713815026732e-06 }, { "epoch": 0.9524500907441016, "grad_norm": 0.387322276165842, "learning_rate": 9.275713815026732e-06, "loss": 0.5069, "step": 82, "ts_encoder_learning_rate": 9.254804044640596e-06 }, { "epoch": 0.964065335753176, "grad_norm": 0.46164777708230237, "learning_rate": 9.254804044640596e-06, "loss": 0.4985, "step": 83, "ts_encoder_learning_rate": 9.233620996141421e-06 }, { "epoch": 0.9756805807622505, "grad_norm": 0.4208251582273127, "learning_rate": 9.233620996141421e-06, "loss": 0.5086, "step": 84, "ts_encoder_learning_rate": 9.212166030076832e-06 }, { "epoch": 0.9872958257713249, "grad_norm": 0.43078427861557256, "learning_rate": 9.212166030076832e-06, "loss": 0.525, "step": 85, "ts_encoder_learning_rate": 9.190440524459203e-06 }, { "epoch": 0.9989110707803993, "grad_norm": 0.47136114267604184, "learning_rate": 9.190440524459203e-06, "loss": 0.504, "step": 86, "ts_encoder_learning_rate": 9.168445874677168e-06 }, { "epoch": 1.0, "grad_norm": 0.47136114267604184, "learning_rate": 9.168445874677168e-06, "loss": 0.0433, "step": 87, "ts_encoder_learning_rate": 9.146183493405976e-06 }, { "epoch": 1.0116152450090745, "grad_norm": 0.4459150481878699, "learning_rate": 9.146183493405976e-06, "loss": 0.4937, "step": 88, "ts_encoder_learning_rate": 9.12365481051678e-06 }, { "epoch": 1.0232304900181488, "grad_norm": 0.4648677558952652, "learning_rate": 9.12365481051678e-06, "loss": 0.4814, "step": 89, "ts_encoder_learning_rate": 9.10086127298478e-06 }, { "epoch": 1.0348457350272233, "grad_norm": 0.424833617058968, "learning_rate": 9.10086127298478e-06, "loss": 0.4872, "step": 90, "ts_encoder_learning_rate": 9.077804344796302e-06 }, { "epoch": 1.0464609800362976, "grad_norm": 0.5447833529096903, "learning_rate": 9.077804344796302e-06, "loss": 0.465, "step": 91, "ts_encoder_learning_rate": 9.054485506854756e-06 }, { "epoch": 1.0580762250453721, "grad_norm": 0.5645327960600057, "learning_rate": 9.054485506854756e-06, "loss": 0.465, "step": 92, "ts_encoder_learning_rate": 9.030906256885528e-06 }, { "epoch": 1.0696914700544464, "grad_norm": 0.5707423730743318, "learning_rate": 9.030906256885528e-06, "loss": 0.4664, "step": 93, "ts_encoder_learning_rate": 9.007068109339783e-06 }, { "epoch": 1.081306715063521, "grad_norm": 0.49970416310158533, "learning_rate": 9.007068109339783e-06, "loss": 0.458, "step": 94, "ts_encoder_learning_rate": 8.982972595297195e-06 }, { "epoch": 1.0929219600725952, "grad_norm": 0.4760304595595311, "learning_rate": 8.982972595297195e-06, "loss": 0.4669, "step": 95, "ts_encoder_learning_rate": 8.9586212623676e-06 }, { "epoch": 1.1045372050816697, "grad_norm": 0.6546521338664805, "learning_rate": 8.9586212623676e-06, "loss": 0.4631, "step": 96, "ts_encoder_learning_rate": 8.93401567459161e-06 }, { "epoch": 1.116152450090744, "grad_norm": 0.6820617032835106, "learning_rate": 8.93401567459161e-06, "loss": 0.4556, "step": 97, "ts_encoder_learning_rate": 8.90915741234015e-06 }, { "epoch": 1.1277676950998186, "grad_norm": 0.4791674527928274, "learning_rate": 8.90915741234015e-06, "loss": 0.4461, "step": 98, "ts_encoder_learning_rate": 8.884048072212952e-06 }, { "epoch": 1.1393829401088928, "grad_norm": 0.8342160989342361, "learning_rate": 8.884048072212952e-06, "loss": 0.4665, "step": 99, "ts_encoder_learning_rate": 8.85868926693601e-06 }, { "epoch": 1.1509981851179674, "grad_norm": 0.6077292569121415, "learning_rate": 8.85868926693601e-06, "loss": 0.4492, "step": 100, "ts_encoder_learning_rate": 8.833082625258003e-06 }, { "epoch": 1.1626134301270417, "grad_norm": 0.5647955360914533, "learning_rate": 8.833082625258003e-06, "loss": 0.4435, "step": 101, "ts_encoder_learning_rate": 8.807229791845673e-06 }, { "epoch": 1.1742286751361162, "grad_norm": 0.6044541014322447, "learning_rate": 8.807229791845673e-06, "loss": 0.4427, "step": 102, "ts_encoder_learning_rate": 8.781132427178203e-06 }, { "epoch": 1.1858439201451905, "grad_norm": 0.5437886950515926, "learning_rate": 8.781132427178203e-06, "loss": 0.4427, "step": 103, "ts_encoder_learning_rate": 8.754792207440557e-06 }, { "epoch": 1.197459165154265, "grad_norm": 0.5946421067486096, "learning_rate": 8.754792207440557e-06, "loss": 0.4327, "step": 104, "ts_encoder_learning_rate": 8.728210824415829e-06 }, { "epoch": 1.2090744101633395, "grad_norm": 0.6434751459915273, "learning_rate": 8.728210824415829e-06, "loss": 0.4342, "step": 105, "ts_encoder_learning_rate": 8.701389985376578e-06 }, { "epoch": 1.2206896551724138, "grad_norm": 0.5051510187934748, "learning_rate": 8.701389985376578e-06, "loss": 0.4398, "step": 106, "ts_encoder_learning_rate": 8.674331412975178e-06 }, { "epoch": 1.232304900181488, "grad_norm": 0.5595267159920397, "learning_rate": 8.674331412975178e-06, "loss": 0.442, "step": 107, "ts_encoder_learning_rate": 8.647036845133171e-06 }, { "epoch": 1.2439201451905626, "grad_norm": 0.525092656163426, "learning_rate": 8.647036845133171e-06, "loss": 0.43, "step": 108, "ts_encoder_learning_rate": 8.619508034929646e-06 }, { "epoch": 1.2555353901996371, "grad_norm": 0.5105752951496461, "learning_rate": 8.619508034929646e-06, "loss": 0.4218, "step": 109, "ts_encoder_learning_rate": 8.591746750488639e-06 }, { "epoch": 1.2671506352087114, "grad_norm": 0.5231024334601619, "learning_rate": 8.591746750488639e-06, "loss": 0.4182, "step": 110, "ts_encoder_learning_rate": 8.563754774865574e-06 }, { "epoch": 1.278765880217786, "grad_norm": 0.4819620849275164, "learning_rate": 8.563754774865574e-06, "loss": 0.4246, "step": 111, "ts_encoder_learning_rate": 8.535533905932739e-06 }, { "epoch": 1.2903811252268602, "grad_norm": 0.5106626492387893, "learning_rate": 8.535533905932739e-06, "loss": 0.4156, "step": 112, "ts_encoder_learning_rate": 8.507085956263808e-06 }, { "epoch": 1.3019963702359347, "grad_norm": 0.48163467909413527, "learning_rate": 8.507085956263808e-06, "loss": 0.3944, "step": 113, "ts_encoder_learning_rate": 8.478412753017433e-06 }, { "epoch": 1.313611615245009, "grad_norm": 0.4708287815222732, "learning_rate": 8.478412753017433e-06, "loss": 0.4198, "step": 114, "ts_encoder_learning_rate": 8.449516137819875e-06 }, { "epoch": 1.3252268602540835, "grad_norm": 0.45335927979980417, "learning_rate": 8.449516137819875e-06, "loss": 0.4004, "step": 115, "ts_encoder_learning_rate": 8.420397966646732e-06 }, { "epoch": 1.3368421052631578, "grad_norm": 0.4731479114137599, "learning_rate": 8.420397966646732e-06, "loss": 0.4012, "step": 116, "ts_encoder_learning_rate": 8.391060109703725e-06 }, { "epoch": 1.3484573502722323, "grad_norm": 0.45763071454798787, "learning_rate": 8.391060109703725e-06, "loss": 0.4043, "step": 117, "ts_encoder_learning_rate": 8.361504451306585e-06 }, { "epoch": 1.3600725952813066, "grad_norm": 0.49218004748616023, "learning_rate": 8.361504451306585e-06, "loss": 0.3804, "step": 118, "ts_encoder_learning_rate": 8.331732889760021e-06 }, { "epoch": 1.3716878402903812, "grad_norm": 0.48411439748425, "learning_rate": 8.331732889760021e-06, "loss": 0.3768, "step": 119, "ts_encoder_learning_rate": 8.301747337235798e-06 }, { "epoch": 1.3833030852994557, "grad_norm": 0.5158115467010486, "learning_rate": 8.301747337235798e-06, "loss": 0.3893, "step": 120, "ts_encoder_learning_rate": 8.271549719649923e-06 }, { "epoch": 1.39491833030853, "grad_norm": 0.4717657156486388, "learning_rate": 8.271549719649923e-06, "loss": 0.3929, "step": 121, "ts_encoder_learning_rate": 8.241141976538944e-06 }, { "epoch": 1.4065335753176043, "grad_norm": 0.48449961022029603, "learning_rate": 8.241141976538944e-06, "loss": 0.3903, "step": 122, "ts_encoder_learning_rate": 8.210526060935377e-06 }, { "epoch": 1.4181488203266788, "grad_norm": 0.5590109000920023, "learning_rate": 8.210526060935377e-06, "loss": 0.3731, "step": 123, "ts_encoder_learning_rate": 8.179703939242276e-06 }, { "epoch": 1.4297640653357533, "grad_norm": 0.48980864166559324, "learning_rate": 8.179703939242276e-06, "loss": 0.37, "step": 124, "ts_encoder_learning_rate": 8.148677591106919e-06 }, { "epoch": 1.4413793103448276, "grad_norm": 0.49306252756918445, "learning_rate": 8.148677591106919e-06, "loss": 0.3821, "step": 125, "ts_encoder_learning_rate": 8.117449009293668e-06 }, { "epoch": 1.4529945553539019, "grad_norm": 0.5343587386921387, "learning_rate": 8.117449009293668e-06, "loss": 0.388, "step": 126, "ts_encoder_learning_rate": 8.08602019955598e-06 }, { "epoch": 1.4646098003629764, "grad_norm": 0.4932498699505098, "learning_rate": 8.08602019955598e-06, "loss": 0.3689, "step": 127, "ts_encoder_learning_rate": 8.054393180507572e-06 }, { "epoch": 1.476225045372051, "grad_norm": 0.5115591275832498, "learning_rate": 8.054393180507572e-06, "loss": 0.3705, "step": 128, "ts_encoder_learning_rate": 8.022569983492781e-06 }, { "epoch": 1.4878402903811252, "grad_norm": 0.5293741893039812, "learning_rate": 8.022569983492781e-06, "loss": 0.3757, "step": 129, "ts_encoder_learning_rate": 7.99055265245608e-06 }, { "epoch": 1.4994555353901997, "grad_norm": 0.4820946133813763, "learning_rate": 7.99055265245608e-06, "loss": 0.3774, "step": 130, "ts_encoder_learning_rate": 7.958343243810818e-06 }, { "epoch": 1.511070780399274, "grad_norm": 0.5715571029957693, "learning_rate": 7.958343243810818e-06, "loss": 0.3536, "step": 131, "ts_encoder_learning_rate": 7.925943826307119e-06 }, { "epoch": 1.5226860254083485, "grad_norm": 0.45688383927056603, "learning_rate": 7.925943826307119e-06, "loss": 0.3781, "step": 132, "ts_encoder_learning_rate": 7.89335648089903e-06 }, { "epoch": 1.5343012704174228, "grad_norm": 0.5417387120089402, "learning_rate": 7.89335648089903e-06, "loss": 0.3849, "step": 133, "ts_encoder_learning_rate": 7.860583300610849e-06 }, { "epoch": 1.5459165154264973, "grad_norm": 0.48249798152601187, "learning_rate": 7.860583300610849e-06, "loss": 0.3614, "step": 134, "ts_encoder_learning_rate": 7.827626390402707e-06 }, { "epoch": 1.5575317604355718, "grad_norm": 0.49148409141245863, "learning_rate": 7.827626390402707e-06, "loss": 0.3604, "step": 135, "ts_encoder_learning_rate": 7.794487867035358e-06 }, { "epoch": 1.5691470054446461, "grad_norm": 0.5575426863669594, "learning_rate": 7.794487867035358e-06, "loss": 0.3582, "step": 136, "ts_encoder_learning_rate": 7.761169858934238e-06 }, { "epoch": 1.5807622504537204, "grad_norm": 0.47759386240015317, "learning_rate": 7.761169858934238e-06, "loss": 0.364, "step": 137, "ts_encoder_learning_rate": 7.727674506052744e-06 }, { "epoch": 1.592377495462795, "grad_norm": 0.5295175062956325, "learning_rate": 7.727674506052744e-06, "loss": 0.3543, "step": 138, "ts_encoder_learning_rate": 7.694003959734802e-06 }, { "epoch": 1.6039927404718695, "grad_norm": 0.6604920964139046, "learning_rate": 7.694003959734802e-06, "loss": 0.3607, "step": 139, "ts_encoder_learning_rate": 7.660160382576683e-06 }, { "epoch": 1.6156079854809438, "grad_norm": 0.5412776964498659, "learning_rate": 7.660160382576683e-06, "loss": 0.3494, "step": 140, "ts_encoder_learning_rate": 7.626145948288107e-06 }, { "epoch": 1.627223230490018, "grad_norm": 0.4885017642227708, "learning_rate": 7.626145948288107e-06, "loss": 0.3563, "step": 141, "ts_encoder_learning_rate": 7.591962841552627e-06 }, { "epoch": 1.6388384754990926, "grad_norm": 0.6310454372740532, "learning_rate": 7.591962841552627e-06, "loss": 0.3513, "step": 142, "ts_encoder_learning_rate": 7.55761325788731e-06 }, { "epoch": 1.650453720508167, "grad_norm": 0.44382432912819253, "learning_rate": 7.55761325788731e-06, "loss": 0.3562, "step": 143, "ts_encoder_learning_rate": 7.52309940350173e-06 }, { "epoch": 1.6620689655172414, "grad_norm": 0.5462511565321053, "learning_rate": 7.52309940350173e-06, "loss": 0.3461, "step": 144, "ts_encoder_learning_rate": 7.488423495156258e-06 }, { "epoch": 1.6736842105263157, "grad_norm": 0.4959866346692337, "learning_rate": 7.488423495156258e-06, "loss": 0.3615, "step": 145, "ts_encoder_learning_rate": 7.453587760019691e-06 }, { "epoch": 1.6852994555353902, "grad_norm": 0.481447728937644, "learning_rate": 7.453587760019691e-06, "loss": 0.3532, "step": 146, "ts_encoder_learning_rate": 7.4185944355261996e-06 }, { "epoch": 1.6969147005444647, "grad_norm": 0.4773235171153849, "learning_rate": 7.4185944355261996e-06, "loss": 0.3646, "step": 147, "ts_encoder_learning_rate": 7.383445769231628e-06 }, { "epoch": 1.708529945553539, "grad_norm": 0.4922147267880892, "learning_rate": 7.383445769231628e-06, "loss": 0.349, "step": 148, "ts_encoder_learning_rate": 7.348144018669129e-06 }, { "epoch": 1.7201451905626133, "grad_norm": 0.500323445240717, "learning_rate": 7.348144018669129e-06, "loss": 0.3271, "step": 149, "ts_encoder_learning_rate": 7.312691451204178e-06 }, { "epoch": 1.7317604355716878, "grad_norm": 0.47751019497636976, "learning_rate": 7.312691451204178e-06, "loss": 0.3635, "step": 150, "ts_encoder_learning_rate": 7.277090343888931e-06 }, { "epoch": 1.7433756805807623, "grad_norm": 0.4939172584022833, "learning_rate": 7.277090343888931e-06, "loss": 0.3522, "step": 151, "ts_encoder_learning_rate": 7.241342983315985e-06 }, { "epoch": 1.7549909255898366, "grad_norm": 0.526535710546132, "learning_rate": 7.241342983315985e-06, "loss": 0.3379, "step": 152, "ts_encoder_learning_rate": 7.205451665471515e-06 }, { "epoch": 1.7666061705989111, "grad_norm": 0.48512244017684314, "learning_rate": 7.205451665471515e-06, "loss": 0.3671, "step": 153, "ts_encoder_learning_rate": 7.169418695587791e-06 }, { "epoch": 1.7782214156079856, "grad_norm": 0.5249208108710423, "learning_rate": 7.169418695587791e-06, "loss": 0.353, "step": 154, "ts_encoder_learning_rate": 7.1332463879951404e-06 }, { "epoch": 1.78983666061706, "grad_norm": 0.5939685667617669, "learning_rate": 7.1332463879951404e-06, "loss": 0.3338, "step": 155, "ts_encoder_learning_rate": 7.096937065973285e-06 }, { "epoch": 1.8014519056261342, "grad_norm": 0.46870716209574953, "learning_rate": 7.096937065973285e-06, "loss": 0.3258, "step": 156, "ts_encoder_learning_rate": 7.060493061602128e-06 }, { "epoch": 1.8130671506352087, "grad_norm": 0.5004844023800996, "learning_rate": 7.060493061602128e-06, "loss": 0.3338, "step": 157, "ts_encoder_learning_rate": 7.023916715611969e-06 }, { "epoch": 1.8246823956442833, "grad_norm": 0.6018225120929914, "learning_rate": 7.023916715611969e-06, "loss": 0.3559, "step": 158, "ts_encoder_learning_rate": 6.987210377233165e-06 }, { "epoch": 1.8362976406533575, "grad_norm": 0.5115688533474527, "learning_rate": 6.987210377233165e-06, "loss": 0.3215, "step": 159, "ts_encoder_learning_rate": 6.950376404045235e-06 }, { "epoch": 1.8479128856624318, "grad_norm": 0.5453997597445909, "learning_rate": 6.950376404045235e-06, "loss": 0.3288, "step": 160, "ts_encoder_learning_rate": 6.913417161825449e-06 }, { "epoch": 1.8595281306715064, "grad_norm": 0.5468491048869575, "learning_rate": 6.913417161825449e-06, "loss": 0.3351, "step": 161, "ts_encoder_learning_rate": 6.876335024396872e-06 }, { "epoch": 1.8711433756805809, "grad_norm": 0.5127435043219846, "learning_rate": 6.876335024396872e-06, "loss": 0.3261, "step": 162, "ts_encoder_learning_rate": 6.839132373475894e-06 }, { "epoch": 1.8827586206896552, "grad_norm": 0.5334108745888081, "learning_rate": 6.839132373475894e-06, "loss": 0.315, "step": 163, "ts_encoder_learning_rate": 6.801811598519268e-06 }, { "epoch": 1.8943738656987295, "grad_norm": 0.5154441355243857, "learning_rate": 6.801811598519268e-06, "loss": 0.3427, "step": 164, "ts_encoder_learning_rate": 6.764375096570628e-06 }, { "epoch": 1.905989110707804, "grad_norm": 0.5561507577872592, "learning_rate": 6.764375096570628e-06, "loss": 0.3259, "step": 165, "ts_encoder_learning_rate": 6.726825272106539e-06 }, { "epoch": 1.9176043557168785, "grad_norm": 0.5291334612556954, "learning_rate": 6.726825272106539e-06, "loss": 0.3198, "step": 166, "ts_encoder_learning_rate": 6.689164536882059e-06 }, { "epoch": 1.9292196007259528, "grad_norm": 0.5011785801262693, "learning_rate": 6.689164536882059e-06, "loss": 0.3187, "step": 167, "ts_encoder_learning_rate": 6.651395309775837e-06 }, { "epoch": 1.940834845735027, "grad_norm": 0.5750688070408072, "learning_rate": 6.651395309775837e-06, "loss": 0.2997, "step": 168, "ts_encoder_learning_rate": 6.6135200166347505e-06 }, { "epoch": 1.9524500907441016, "grad_norm": 0.5043052526525891, "learning_rate": 6.6135200166347505e-06, "loss": 0.3145, "step": 169, "ts_encoder_learning_rate": 6.575541090118105e-06 }, { "epoch": 1.964065335753176, "grad_norm": 0.4896874363768986, "learning_rate": 6.575541090118105e-06, "loss": 0.3009, "step": 170, "ts_encoder_learning_rate": 6.537460969541378e-06 }, { "epoch": 1.9756805807622504, "grad_norm": 0.5088309056638338, "learning_rate": 6.537460969541378e-06, "loss": 0.3129, "step": 171, "ts_encoder_learning_rate": 6.499282100719558e-06 }, { "epoch": 1.987295825771325, "grad_norm": 0.49023494845007226, "learning_rate": 6.499282100719558e-06, "loss": 0.3082, "step": 172, "ts_encoder_learning_rate": 6.461006935810048e-06 }, { "epoch": 1.9989110707803994, "grad_norm": 0.5280481482147419, "learning_rate": 6.461006935810048e-06, "loss": 0.302, "step": 173, "ts_encoder_learning_rate": 6.4226379331551625e-06 }, { "epoch": 2.0, "grad_norm": 0.5280481482147419, "learning_rate": 6.4226379331551625e-06, "loss": 0.0333, "step": 174, "ts_encoder_learning_rate": 6.384177557124247e-06 }, { "epoch": 2.0116152450090743, "grad_norm": 0.5645755484448244, "learning_rate": 6.384177557124247e-06, "loss": 0.2784, "step": 175, "ts_encoder_learning_rate": 6.345628277955384e-06 }, { "epoch": 2.023230490018149, "grad_norm": 0.7089731024549738, "learning_rate": 6.345628277955384e-06, "loss": 0.2864, "step": 176, "ts_encoder_learning_rate": 6.306992571596742e-06 }, { "epoch": 2.0348457350272233, "grad_norm": 0.5602666036492625, "learning_rate": 6.306992571596742e-06, "loss": 0.2881, "step": 177, "ts_encoder_learning_rate": 6.268272919547537e-06 }, { "epoch": 2.0464609800362976, "grad_norm": 0.6075921236619318, "learning_rate": 6.268272919547537e-06, "loss": 0.2798, "step": 178, "ts_encoder_learning_rate": 6.229471808698673e-06 }, { "epoch": 2.058076225045372, "grad_norm": 0.6984592942833859, "learning_rate": 6.229471808698673e-06, "loss": 0.2673, "step": 179, "ts_encoder_learning_rate": 6.1905917311729915e-06 }, { "epoch": 2.0696914700544466, "grad_norm": 0.5022810378072105, "learning_rate": 6.1905917311729915e-06, "loss": 0.2849, "step": 180, "ts_encoder_learning_rate": 6.151635184165219e-06 }, { "epoch": 2.081306715063521, "grad_norm": 0.5093282715650761, "learning_rate": 6.151635184165219e-06, "loss": 0.3026, "step": 181, "ts_encoder_learning_rate": 6.112604669781572e-06 }, { "epoch": 2.0929219600725952, "grad_norm": 0.6048006185139588, "learning_rate": 6.112604669781572e-06, "loss": 0.2835, "step": 182, "ts_encoder_learning_rate": 6.073502694879059e-06 }, { "epoch": 2.1045372050816695, "grad_norm": 0.5065083654679191, "learning_rate": 6.073502694879059e-06, "loss": 0.2782, "step": 183, "ts_encoder_learning_rate": 6.034331770904455e-06 }, { "epoch": 2.1161524500907443, "grad_norm": 0.5598494532388394, "learning_rate": 6.034331770904455e-06, "loss": 0.2825, "step": 184, "ts_encoder_learning_rate": 5.9950944137330125e-06 }, { "epoch": 2.1277676950998186, "grad_norm": 0.530982359241071, "learning_rate": 5.9950944137330125e-06, "loss": 0.2702, "step": 185, "ts_encoder_learning_rate": 5.955793143506863e-06 }, { "epoch": 2.139382940108893, "grad_norm": 0.509154289045405, "learning_rate": 5.955793143506863e-06, "loss": 0.2785, "step": 186, "ts_encoder_learning_rate": 5.916430484473149e-06 }, { "epoch": 2.150998185117967, "grad_norm": 0.5119364886674341, "learning_rate": 5.916430484473149e-06, "loss": 0.2895, "step": 187, "ts_encoder_learning_rate": 5.877008964821909e-06 }, { "epoch": 2.162613430127042, "grad_norm": 0.5034054419615417, "learning_rate": 5.877008964821909e-06, "loss": 0.2936, "step": 188, "ts_encoder_learning_rate": 5.837531116523683e-06 }, { "epoch": 2.174228675136116, "grad_norm": 0.5673133335829393, "learning_rate": 5.837531116523683e-06, "loss": 0.2762, "step": 189, "ts_encoder_learning_rate": 5.797999475166897e-06 }, { "epoch": 2.1858439201451905, "grad_norm": 0.5459438366130899, "learning_rate": 5.797999475166897e-06, "loss": 0.2718, "step": 190, "ts_encoder_learning_rate": 5.7584165797950055e-06 }, { "epoch": 2.1974591651542648, "grad_norm": 0.5533315060900448, "learning_rate": 5.7584165797950055e-06, "loss": 0.2754, "step": 191, "ts_encoder_learning_rate": 5.71878497274341e-06 }, { "epoch": 2.2090744101633395, "grad_norm": 0.6022264770129343, "learning_rate": 5.71878497274341e-06, "loss": 0.2743, "step": 192, "ts_encoder_learning_rate": 5.679107199476174e-06 }, { "epoch": 2.220689655172414, "grad_norm": 0.5540881606867427, "learning_rate": 5.679107199476174e-06, "loss": 0.2878, "step": 193, "ts_encoder_learning_rate": 5.6393858084225305e-06 }, { "epoch": 2.232304900181488, "grad_norm": 0.5577935030781893, "learning_rate": 5.6393858084225305e-06, "loss": 0.2677, "step": 194, "ts_encoder_learning_rate": 5.599623350813202e-06 }, { "epoch": 2.243920145190563, "grad_norm": 0.5369265581968106, "learning_rate": 5.599623350813202e-06, "loss": 0.26, "step": 195, "ts_encoder_learning_rate": 5.559822380516539e-06 }, { "epoch": 2.255535390199637, "grad_norm": 0.5350922011535203, "learning_rate": 5.559822380516539e-06, "loss": 0.2936, "step": 196, "ts_encoder_learning_rate": 5.5199854538744905e-06 }, { "epoch": 2.2671506352087114, "grad_norm": 0.534100714512928, "learning_rate": 5.5199854538744905e-06, "loss": 0.2756, "step": 197, "ts_encoder_learning_rate": 5.480115129538409e-06 }, { "epoch": 2.2787658802177857, "grad_norm": 0.5103849200596632, "learning_rate": 5.480115129538409e-06, "loss": 0.2498, "step": 198, "ts_encoder_learning_rate": 5.440213968304728e-06 }, { "epoch": 2.2903811252268604, "grad_norm": 0.5702739710122362, "learning_rate": 5.440213968304728e-06, "loss": 0.2708, "step": 199, "ts_encoder_learning_rate": 5.4002845329504675e-06 }, { "epoch": 2.3019963702359347, "grad_norm": 0.5216996613028344, "learning_rate": 5.4002845329504675e-06, "loss": 0.2668, "step": 200, "ts_encoder_learning_rate": 5.360329388068649e-06 }, { "epoch": 2.313611615245009, "grad_norm": 0.5388651334047538, "learning_rate": 5.360329388068649e-06, "loss": 0.2703, "step": 201, "ts_encoder_learning_rate": 5.320351099903565e-06 }, { "epoch": 2.3252268602540833, "grad_norm": 0.6001939955314459, "learning_rate": 5.320351099903565e-06, "loss": 0.261, "step": 202, "ts_encoder_learning_rate": 5.2803522361859596e-06 }, { "epoch": 2.336842105263158, "grad_norm": 0.4980937517422858, "learning_rate": 5.2803522361859596e-06, "loss": 0.2404, "step": 203, "ts_encoder_learning_rate": 5.240335365968104e-06 }, { "epoch": 2.3484573502722323, "grad_norm": 0.5342465570777747, "learning_rate": 5.240335365968104e-06, "loss": 0.2654, "step": 204, "ts_encoder_learning_rate": 5.2003030594587964e-06 }, { "epoch": 2.3600725952813066, "grad_norm": 0.5621041789253622, "learning_rate": 5.2003030594587964e-06, "loss": 0.2733, "step": 205, "ts_encoder_learning_rate": 5.160257887858278e-06 }, { "epoch": 2.371687840290381, "grad_norm": 0.5582661532374841, "learning_rate": 5.160257887858278e-06, "loss": 0.2597, "step": 206, "ts_encoder_learning_rate": 5.120202423193085e-06 }, { "epoch": 2.3833030852994557, "grad_norm": 0.5783285206586468, "learning_rate": 5.120202423193085e-06, "loss": 0.2525, "step": 207, "ts_encoder_learning_rate": 5.080139238150869e-06 }, { "epoch": 2.39491833030853, "grad_norm": 0.4822046196118616, "learning_rate": 5.080139238150869e-06, "loss": 0.2518, "step": 208, "ts_encoder_learning_rate": 5.040070905915139e-06 }, { "epoch": 2.4065335753176043, "grad_norm": 0.5559233478270967, "learning_rate": 5.040070905915139e-06, "loss": 0.28, "step": 209, "ts_encoder_learning_rate": 5e-06 }, { "epoch": 2.418148820326679, "grad_norm": 0.48901203329897386, "learning_rate": 5e-06, "loss": 0.2515, "step": 210, "ts_encoder_learning_rate": 4.959929094084862e-06 }, { "epoch": 2.4297640653357533, "grad_norm": 0.48746183943137245, "learning_rate": 4.959929094084862e-06, "loss": 0.2518, "step": 211, "ts_encoder_learning_rate": 4.919860761849132e-06 }, { "epoch": 2.4413793103448276, "grad_norm": 0.5009144470099608, "learning_rate": 4.919860761849132e-06, "loss": 0.2532, "step": 212, "ts_encoder_learning_rate": 4.879797576806915e-06 }, { "epoch": 2.452994555353902, "grad_norm": 0.4965510882041085, "learning_rate": 4.879797576806915e-06, "loss": 0.243, "step": 213, "ts_encoder_learning_rate": 4.839742112141725e-06 }, { "epoch": 2.464609800362976, "grad_norm": 0.48351081176855426, "learning_rate": 4.839742112141725e-06, "loss": 0.2596, "step": 214, "ts_encoder_learning_rate": 4.799696940541204e-06 }, { "epoch": 2.476225045372051, "grad_norm": 0.5172008324226596, "learning_rate": 4.799696940541204e-06, "loss": 0.236, "step": 215, "ts_encoder_learning_rate": 4.759664634031897e-06 }, { "epoch": 2.487840290381125, "grad_norm": 0.5571046196144138, "learning_rate": 4.759664634031897e-06, "loss": 0.2479, "step": 216, "ts_encoder_learning_rate": 4.719647763814041e-06 }, { "epoch": 2.4994555353901995, "grad_norm": 0.5421950769241719, "learning_rate": 4.719647763814041e-06, "loss": 0.2691, "step": 217, "ts_encoder_learning_rate": 4.679648900096436e-06 }, { "epoch": 2.5110707803992742, "grad_norm": 0.5330899469762319, "learning_rate": 4.679648900096436e-06, "loss": 0.2579, "step": 218, "ts_encoder_learning_rate": 4.6396706119313526e-06 }, { "epoch": 2.5226860254083485, "grad_norm": 0.5460154625334825, "learning_rate": 4.6396706119313526e-06, "loss": 0.2496, "step": 219, "ts_encoder_learning_rate": 4.599715467049534e-06 }, { "epoch": 2.534301270417423, "grad_norm": 0.5440271431449177, "learning_rate": 4.599715467049534e-06, "loss": 0.2455, "step": 220, "ts_encoder_learning_rate": 4.559786031695275e-06 }, { "epoch": 2.545916515426497, "grad_norm": 0.5133166382378065, "learning_rate": 4.559786031695275e-06, "loss": 0.2636, "step": 221, "ts_encoder_learning_rate": 4.5198848704615915e-06 }, { "epoch": 2.557531760435572, "grad_norm": 0.5366585588218753, "learning_rate": 4.5198848704615915e-06, "loss": 0.247, "step": 222, "ts_encoder_learning_rate": 4.480014546125511e-06 }, { "epoch": 2.569147005444646, "grad_norm": 0.5474900733819053, "learning_rate": 4.480014546125511e-06, "loss": 0.2795, "step": 223, "ts_encoder_learning_rate": 4.4401776194834615e-06 }, { "epoch": 2.5807622504537204, "grad_norm": 0.4975693304306332, "learning_rate": 4.4401776194834615e-06, "loss": 0.2401, "step": 224, "ts_encoder_learning_rate": 4.4003766491867984e-06 }, { "epoch": 2.592377495462795, "grad_norm": 0.5106865786465035, "learning_rate": 4.4003766491867984e-06, "loss": 0.242, "step": 225, "ts_encoder_learning_rate": 4.3606141915774695e-06 }, { "epoch": 2.6039927404718695, "grad_norm": 0.5168549085943859, "learning_rate": 4.3606141915774695e-06, "loss": 0.2335, "step": 226, "ts_encoder_learning_rate": 4.320892800523827e-06 }, { "epoch": 2.6156079854809438, "grad_norm": 0.5211543822385405, "learning_rate": 4.320892800523827e-06, "loss": 0.2493, "step": 227, "ts_encoder_learning_rate": 4.281215027256592e-06 }, { "epoch": 2.627223230490018, "grad_norm": 0.5127128225307483, "learning_rate": 4.281215027256592e-06, "loss": 0.2514, "step": 228, "ts_encoder_learning_rate": 4.241583420204998e-06 }, { "epoch": 2.6388384754990923, "grad_norm": 0.5378693778374266, "learning_rate": 4.241583420204998e-06, "loss": 0.2399, "step": 229, "ts_encoder_learning_rate": 4.2020005248331056e-06 }, { "epoch": 2.650453720508167, "grad_norm": 0.5721336179178363, "learning_rate": 4.2020005248331056e-06, "loss": 0.2558, "step": 230, "ts_encoder_learning_rate": 4.162468883476319e-06 }, { "epoch": 2.6620689655172414, "grad_norm": 0.5507499405880641, "learning_rate": 4.162468883476319e-06, "loss": 0.2423, "step": 231, "ts_encoder_learning_rate": 4.122991035178093e-06 }, { "epoch": 2.6736842105263157, "grad_norm": 0.5191297273890276, "learning_rate": 4.122991035178093e-06, "loss": 0.2387, "step": 232, "ts_encoder_learning_rate": 4.083569515526853e-06 }, { "epoch": 2.6852994555353904, "grad_norm": 0.5056716954679873, "learning_rate": 4.083569515526853e-06, "loss": 0.2379, "step": 233, "ts_encoder_learning_rate": 4.04420685649314e-06 }, { "epoch": 2.6969147005444647, "grad_norm": 0.5020728789197858, "learning_rate": 4.04420685649314e-06, "loss": 0.2341, "step": 234, "ts_encoder_learning_rate": 4.004905586266988e-06 }, { "epoch": 2.708529945553539, "grad_norm": 0.49710632823598544, "learning_rate": 4.004905586266988e-06, "loss": 0.2142, "step": 235, "ts_encoder_learning_rate": 3.965668229095546e-06 }, { "epoch": 2.7201451905626133, "grad_norm": 0.46282201836826814, "learning_rate": 3.965668229095546e-06, "loss": 0.25, "step": 236, "ts_encoder_learning_rate": 3.926497305120943e-06 }, { "epoch": 2.7317604355716876, "grad_norm": 0.5037406395194425, "learning_rate": 3.926497305120943e-06, "loss": 0.2423, "step": 237, "ts_encoder_learning_rate": 3.887395330218429e-06 }, { "epoch": 2.7433756805807623, "grad_norm": 0.5502464701719969, "learning_rate": 3.887395330218429e-06, "loss": 0.2371, "step": 238, "ts_encoder_learning_rate": 3.848364815834782e-06 }, { "epoch": 2.7549909255898366, "grad_norm": 0.488890082077443, "learning_rate": 3.848364815834782e-06, "loss": 0.2367, "step": 239, "ts_encoder_learning_rate": 3.809408268827009e-06 }, { "epoch": 2.7666061705989113, "grad_norm": 0.5122316791585159, "learning_rate": 3.809408268827009e-06, "loss": 0.2506, "step": 240, "ts_encoder_learning_rate": 3.7705281913013286e-06 }, { "epoch": 2.7782214156079856, "grad_norm": 0.4868899262078824, "learning_rate": 3.7705281913013286e-06, "loss": 0.2413, "step": 241, "ts_encoder_learning_rate": 3.731727080452464e-06 }, { "epoch": 2.78983666061706, "grad_norm": 0.5086075968707988, "learning_rate": 3.731727080452464e-06, "loss": 0.2421, "step": 242, "ts_encoder_learning_rate": 3.6930074284032613e-06 }, { "epoch": 2.801451905626134, "grad_norm": 0.49235271981217643, "learning_rate": 3.6930074284032613e-06, "loss": 0.2406, "step": 243, "ts_encoder_learning_rate": 3.654371722044616e-06 }, { "epoch": 2.8130671506352085, "grad_norm": 0.49103994839708176, "learning_rate": 3.654371722044616e-06, "loss": 0.2439, "step": 244, "ts_encoder_learning_rate": 3.6158224428757538e-06 }, { "epoch": 2.8246823956442833, "grad_norm": 0.9993476254292818, "learning_rate": 3.6158224428757538e-06, "loss": 0.2268, "step": 245, "ts_encoder_learning_rate": 3.5773620668448384e-06 }, { "epoch": 2.8362976406533575, "grad_norm": 0.5159214069274143, "learning_rate": 3.5773620668448384e-06, "loss": 0.2388, "step": 246, "ts_encoder_learning_rate": 3.538993064189954e-06 }, { "epoch": 2.847912885662432, "grad_norm": 0.543713795445949, "learning_rate": 3.538993064189954e-06, "loss": 0.2284, "step": 247, "ts_encoder_learning_rate": 3.500717899280442e-06 }, { "epoch": 2.8595281306715066, "grad_norm": 0.5140037192841642, "learning_rate": 3.500717899280442e-06, "loss": 0.2305, "step": 248, "ts_encoder_learning_rate": 3.4625390304586224e-06 }, { "epoch": 2.871143375680581, "grad_norm": 0.49143474790675895, "learning_rate": 3.4625390304586224e-06, "loss": 0.2348, "step": 249, "ts_encoder_learning_rate": 3.424458909881897e-06 }, { "epoch": 2.882758620689655, "grad_norm": 0.5203399617942011, "learning_rate": 3.424458909881897e-06, "loss": 0.2175, "step": 250, "ts_encoder_learning_rate": 3.386479983365251e-06 }, { "epoch": 2.8943738656987295, "grad_norm": 0.5363618954072708, "learning_rate": 3.386479983365251e-06, "loss": 0.2289, "step": 251, "ts_encoder_learning_rate": 3.3486046902241663e-06 }, { "epoch": 2.9059891107078037, "grad_norm": 0.49208512127705756, "learning_rate": 3.3486046902241663e-06, "loss": 0.2386, "step": 252, "ts_encoder_learning_rate": 3.310835463117942e-06 }, { "epoch": 2.9176043557168785, "grad_norm": 0.48789242095969204, "learning_rate": 3.310835463117942e-06, "loss": 0.2392, "step": 253, "ts_encoder_learning_rate": 3.273174727893463e-06 }, { "epoch": 2.9292196007259528, "grad_norm": 0.5163396778042415, "learning_rate": 3.273174727893463e-06, "loss": 0.2392, "step": 254, "ts_encoder_learning_rate": 3.235624903429374e-06 }, { "epoch": 2.940834845735027, "grad_norm": 0.4839363209051733, "learning_rate": 3.235624903429374e-06, "loss": 0.2294, "step": 255, "ts_encoder_learning_rate": 3.198188401480734e-06 }, { "epoch": 2.952450090744102, "grad_norm": 0.5099295694573828, "learning_rate": 3.198188401480734e-06, "loss": 0.214, "step": 256, "ts_encoder_learning_rate": 3.160867626524107e-06 }, { "epoch": 2.964065335753176, "grad_norm": 0.52866992195366, "learning_rate": 3.160867626524107e-06, "loss": 0.232, "step": 257, "ts_encoder_learning_rate": 3.12366497560313e-06 }, { "epoch": 2.9756805807622504, "grad_norm": 0.5016653572033554, "learning_rate": 3.12366497560313e-06, "loss": 0.2477, "step": 258, "ts_encoder_learning_rate": 3.0865828381745515e-06 }, { "epoch": 2.9872958257713247, "grad_norm": 0.5704722771230331, "learning_rate": 3.0865828381745515e-06, "loss": 0.2215, "step": 259, "ts_encoder_learning_rate": 3.049623595954766e-06 }, { "epoch": 2.9989110707803994, "grad_norm": 0.5049693668147675, "learning_rate": 3.049623595954766e-06, "loss": 0.2239, "step": 260, "ts_encoder_learning_rate": 3.0127896227668367e-06 }, { "epoch": 3.0, "grad_norm": 0.5049693668147675, "learning_rate": 3.0127896227668367e-06, "loss": 0.0215, "step": 261, "ts_encoder_learning_rate": 2.976083284388031e-06 }, { "epoch": 3.0116152450090743, "grad_norm": 0.6043291464224838, "learning_rate": 2.976083284388031e-06, "loss": 0.2239, "step": 262, "ts_encoder_learning_rate": 2.9395069383978725e-06 }, { "epoch": 3.023230490018149, "grad_norm": 0.46138071995137575, "learning_rate": 2.9395069383978725e-06, "loss": 0.1959, "step": 263, "ts_encoder_learning_rate": 2.9030629340267165e-06 }, { "epoch": 3.0348457350272233, "grad_norm": 0.6605611111234634, "learning_rate": 2.9030629340267165e-06, "loss": 0.2223, "step": 264, "ts_encoder_learning_rate": 2.8667536120048616e-06 }, { "epoch": 3.0464609800362976, "grad_norm": 0.4886966776948408, "learning_rate": 2.8667536120048616e-06, "loss": 0.2147, "step": 265, "ts_encoder_learning_rate": 2.83058130441221e-06 }, { "epoch": 3.058076225045372, "grad_norm": 0.603343292555356, "learning_rate": 2.83058130441221e-06, "loss": 0.2122, "step": 266, "ts_encoder_learning_rate": 2.794548334528486e-06 }, { "epoch": 3.0696914700544466, "grad_norm": 0.46864057728976466, "learning_rate": 2.794548334528486e-06, "loss": 0.187, "step": 267, "ts_encoder_learning_rate": 2.7586570166840154e-06 }, { "epoch": 3.081306715063521, "grad_norm": 0.5904508040534621, "learning_rate": 2.7586570166840154e-06, "loss": 0.191, "step": 268, "ts_encoder_learning_rate": 2.7229096561110703e-06 }, { "epoch": 3.0929219600725952, "grad_norm": 0.5389347532364875, "learning_rate": 2.7229096561110703e-06, "loss": 0.2183, "step": 269, "ts_encoder_learning_rate": 2.687308548795825e-06 }, { "epoch": 3.1045372050816695, "grad_norm": 0.5130748388642997, "learning_rate": 2.687308548795825e-06, "loss": 0.2239, "step": 270, "ts_encoder_learning_rate": 2.651855981330872e-06 }, { "epoch": 3.1161524500907443, "grad_norm": 0.49543600582732883, "learning_rate": 2.651855981330872e-06, "loss": 0.1958, "step": 271, "ts_encoder_learning_rate": 2.6165542307683744e-06 }, { "epoch": 3.1277676950998186, "grad_norm": 0.517787257194227, "learning_rate": 2.6165542307683744e-06, "loss": 0.2171, "step": 272, "ts_encoder_learning_rate": 2.5814055644738013e-06 }, { "epoch": 3.139382940108893, "grad_norm": 0.5435576653784301, "learning_rate": 2.5814055644738013e-06, "loss": 0.2144, "step": 273, "ts_encoder_learning_rate": 2.5464122399803126e-06 }, { "epoch": 3.150998185117967, "grad_norm": 0.5217681007016235, "learning_rate": 2.5464122399803126e-06, "loss": 0.1963, "step": 274, "ts_encoder_learning_rate": 2.5115765048437445e-06 }, { "epoch": 3.162613430127042, "grad_norm": 0.4918846481089564, "learning_rate": 2.5115765048437445e-06, "loss": 0.2244, "step": 275, "ts_encoder_learning_rate": 2.4769005964982718e-06 }, { "epoch": 3.174228675136116, "grad_norm": 0.4834963808627837, "learning_rate": 2.4769005964982718e-06, "loss": 0.2125, "step": 276, "ts_encoder_learning_rate": 2.4423867421126923e-06 }, { "epoch": 3.1858439201451905, "grad_norm": 0.4937777628741182, "learning_rate": 2.4423867421126923e-06, "loss": 0.2181, "step": 277, "ts_encoder_learning_rate": 2.408037158447375e-06 }, { "epoch": 3.1974591651542648, "grad_norm": 0.46282845264960937, "learning_rate": 2.408037158447375e-06, "loss": 0.213, "step": 278, "ts_encoder_learning_rate": 2.3738540517118953e-06 }, { "epoch": 3.2090744101633395, "grad_norm": 0.5150835355954807, "learning_rate": 2.3738540517118953e-06, "loss": 0.1852, "step": 279, "ts_encoder_learning_rate": 2.339839617423318e-06 }, { "epoch": 3.220689655172414, "grad_norm": 0.49870953616723984, "learning_rate": 2.339839617423318e-06, "loss": 0.207, "step": 280, "ts_encoder_learning_rate": 2.305996040265198e-06 }, { "epoch": 3.232304900181488, "grad_norm": 0.49174431699593274, "learning_rate": 2.305996040265198e-06, "loss": 0.2162, "step": 281, "ts_encoder_learning_rate": 2.272325493947257e-06 }, { "epoch": 3.243920145190563, "grad_norm": 0.4875355155859377, "learning_rate": 2.272325493947257e-06, "loss": 0.2243, "step": 282, "ts_encoder_learning_rate": 2.238830141065765e-06 }, { "epoch": 3.255535390199637, "grad_norm": 0.5107168259211297, "learning_rate": 2.238830141065765e-06, "loss": 0.201, "step": 283, "ts_encoder_learning_rate": 2.2055121329646416e-06 }, { "epoch": 3.2671506352087114, "grad_norm": 0.4819085595049116, "learning_rate": 2.2055121329646416e-06, "loss": 0.1981, "step": 284, "ts_encoder_learning_rate": 2.1723736095972946e-06 }, { "epoch": 3.2787658802177857, "grad_norm": 0.4477936237119145, "learning_rate": 2.1723736095972946e-06, "loss": 0.1959, "step": 285, "ts_encoder_learning_rate": 2.139416699389153e-06 }, { "epoch": 3.2903811252268604, "grad_norm": 0.4701102863028192, "learning_rate": 2.139416699389153e-06, "loss": 0.1936, "step": 286, "ts_encoder_learning_rate": 2.1066435191009717e-06 }, { "epoch": 3.3019963702359347, "grad_norm": 0.5076017279364189, "learning_rate": 2.1066435191009717e-06, "loss": 0.214, "step": 287, "ts_encoder_learning_rate": 2.074056173692881e-06 }, { "epoch": 3.313611615245009, "grad_norm": 0.480769257020111, "learning_rate": 2.074056173692881e-06, "loss": 0.1793, "step": 288, "ts_encoder_learning_rate": 2.041656756189184e-06 }, { "epoch": 3.3252268602540833, "grad_norm": 0.4957168313559018, "learning_rate": 2.041656756189184e-06, "loss": 0.2041, "step": 289, "ts_encoder_learning_rate": 2.00944734754392e-06 }, { "epoch": 3.336842105263158, "grad_norm": 0.4710750376178812, "learning_rate": 2.00944734754392e-06, "loss": 0.1945, "step": 290, "ts_encoder_learning_rate": 1.977430016507222e-06 }, { "epoch": 3.3484573502722323, "grad_norm": 0.45013646064857127, "learning_rate": 1.977430016507222e-06, "loss": 0.1944, "step": 291, "ts_encoder_learning_rate": 1.945606819492429e-06 }, { "epoch": 3.3600725952813066, "grad_norm": 0.4963393211559648, "learning_rate": 1.945606819492429e-06, "loss": 0.2029, "step": 292, "ts_encoder_learning_rate": 1.913979800444021e-06 }, { "epoch": 3.371687840290381, "grad_norm": 0.45636659614066805, "learning_rate": 1.913979800444021e-06, "loss": 0.2042, "step": 293, "ts_encoder_learning_rate": 1.8825509907063328e-06 }, { "epoch": 3.3833030852994557, "grad_norm": 0.48404604144273267, "learning_rate": 1.8825509907063328e-06, "loss": 0.2073, "step": 294, "ts_encoder_learning_rate": 1.8513224088930814e-06 }, { "epoch": 3.39491833030853, "grad_norm": 0.4903949616722962, "learning_rate": 1.8513224088930814e-06, "loss": 0.2117, "step": 295, "ts_encoder_learning_rate": 1.8202960607577246e-06 }, { "epoch": 3.4065335753176043, "grad_norm": 0.4749952038477868, "learning_rate": 1.8202960607577246e-06, "loss": 0.2105, "step": 296, "ts_encoder_learning_rate": 1.7894739390646227e-06 }, { "epoch": 3.418148820326679, "grad_norm": 0.49308201942231306, "learning_rate": 1.7894739390646227e-06, "loss": 0.1854, "step": 297, "ts_encoder_learning_rate": 1.7588580234610592e-06 }, { "epoch": 3.4297640653357533, "grad_norm": 0.44330828627883645, "learning_rate": 1.7588580234610592e-06, "loss": 0.2049, "step": 298, "ts_encoder_learning_rate": 1.728450280350079e-06 }, { "epoch": 3.4413793103448276, "grad_norm": 0.47198428442224044, "learning_rate": 1.728450280350079e-06, "loss": 0.1917, "step": 299, "ts_encoder_learning_rate": 1.6982526627642043e-06 }, { "epoch": 3.452994555353902, "grad_norm": 0.47052296840051827, "learning_rate": 1.6982526627642043e-06, "loss": 0.1966, "step": 300, "ts_encoder_learning_rate": 1.6682671102399806e-06 }, { "epoch": 3.464609800362976, "grad_norm": 0.47469220902280884, "learning_rate": 1.6682671102399806e-06, "loss": 0.1993, "step": 301, "ts_encoder_learning_rate": 1.6384955486934157e-06 }, { "epoch": 3.476225045372051, "grad_norm": 0.5047215872734404, "learning_rate": 1.6384955486934157e-06, "loss": 0.2087, "step": 302, "ts_encoder_learning_rate": 1.6089398902962767e-06 }, { "epoch": 3.487840290381125, "grad_norm": 0.46226600784092325, "learning_rate": 1.6089398902962767e-06, "loss": 0.2223, "step": 303, "ts_encoder_learning_rate": 1.5796020333532696e-06 }, { "epoch": 3.4994555353901995, "grad_norm": 0.49692738160329974, "learning_rate": 1.5796020333532696e-06, "loss": 0.2098, "step": 304, "ts_encoder_learning_rate": 1.5504838621801272e-06 }, { "epoch": 3.5110707803992742, "grad_norm": 0.44096965404662336, "learning_rate": 1.5504838621801272e-06, "loss": 0.1917, "step": 305, "ts_encoder_learning_rate": 1.5215872469825682e-06 }, { "epoch": 3.5226860254083485, "grad_norm": 0.46470354099812156, "learning_rate": 1.5215872469825682e-06, "loss": 0.2024, "step": 306, "ts_encoder_learning_rate": 1.4929140437361916e-06 }, { "epoch": 3.534301270417423, "grad_norm": 0.6245744672781995, "learning_rate": 1.4929140437361916e-06, "loss": 0.1932, "step": 307, "ts_encoder_learning_rate": 1.4644660940672628e-06 }, { "epoch": 3.545916515426497, "grad_norm": 0.4616850273696799, "learning_rate": 1.4644660940672628e-06, "loss": 0.1894, "step": 308, "ts_encoder_learning_rate": 1.4362452251344283e-06 }, { "epoch": 3.557531760435572, "grad_norm": 0.46072956667527437, "learning_rate": 1.4362452251344283e-06, "loss": 0.1939, "step": 309, "ts_encoder_learning_rate": 1.4082532495113627e-06 }, { "epoch": 3.569147005444646, "grad_norm": 0.4622635293176079, "learning_rate": 1.4082532495113627e-06, "loss": 0.1986, "step": 310, "ts_encoder_learning_rate": 1.3804919650703551e-06 }, { "epoch": 3.5807622504537204, "grad_norm": 0.5048261638194425, "learning_rate": 1.3804919650703551e-06, "loss": 0.1999, "step": 311, "ts_encoder_learning_rate": 1.3529631548668298e-06 }, { "epoch": 3.592377495462795, "grad_norm": 0.4265055997084881, "learning_rate": 1.3529631548668298e-06, "loss": 0.207, "step": 312, "ts_encoder_learning_rate": 1.3256685870248227e-06 }, { "epoch": 3.6039927404718695, "grad_norm": 0.43467381019959384, "learning_rate": 1.3256685870248227e-06, "loss": 0.1809, "step": 313, "ts_encoder_learning_rate": 1.298610014623423e-06 }, { "epoch": 3.6156079854809438, "grad_norm": 0.47840223124389564, "learning_rate": 1.298610014623423e-06, "loss": 0.1985, "step": 314, "ts_encoder_learning_rate": 1.2717891755841722e-06 }, { "epoch": 3.627223230490018, "grad_norm": 0.4736005082533798, "learning_rate": 1.2717891755841722e-06, "loss": 0.2029, "step": 315, "ts_encoder_learning_rate": 1.2452077925594435e-06 }, { "epoch": 3.6388384754990923, "grad_norm": 0.44359091955078794, "learning_rate": 1.2452077925594435e-06, "loss": 0.1974, "step": 316, "ts_encoder_learning_rate": 1.2188675728217986e-06 }, { "epoch": 3.650453720508167, "grad_norm": 0.4835930654461023, "learning_rate": 1.2188675728217986e-06, "loss": 0.2004, "step": 317, "ts_encoder_learning_rate": 1.1927702081543279e-06 }, { "epoch": 3.6620689655172414, "grad_norm": 0.4950865344011163, "learning_rate": 1.1927702081543279e-06, "loss": 0.1971, "step": 318, "ts_encoder_learning_rate": 1.166917374742e-06 }, { "epoch": 3.6736842105263157, "grad_norm": 0.4542349082763096, "learning_rate": 1.166917374742e-06, "loss": 0.2029, "step": 319, "ts_encoder_learning_rate": 1.141310733063991e-06 }, { "epoch": 3.6852994555353904, "grad_norm": 0.4722201030579586, "learning_rate": 1.141310733063991e-06, "loss": 0.1871, "step": 320, "ts_encoder_learning_rate": 1.1159519277870507e-06 }, { "epoch": 3.6969147005444647, "grad_norm": 0.44859200333215415, "learning_rate": 1.1159519277870507e-06, "loss": 0.1912, "step": 321, "ts_encoder_learning_rate": 1.0908425876598512e-06 }, { "epoch": 3.708529945553539, "grad_norm": 0.46643463691801845, "learning_rate": 1.0908425876598512e-06, "loss": 0.1845, "step": 322, "ts_encoder_learning_rate": 1.0659843254083919e-06 }, { "epoch": 3.7201451905626133, "grad_norm": 0.4306281262155224, "learning_rate": 1.0659843254083919e-06, "loss": 0.1779, "step": 323, "ts_encoder_learning_rate": 1.041378737632402e-06 }, { "epoch": 3.7317604355716876, "grad_norm": 0.4753156573309627, "learning_rate": 1.041378737632402e-06, "loss": 0.1989, "step": 324, "ts_encoder_learning_rate": 1.0170274047028068e-06 }, { "epoch": 3.7433756805807623, "grad_norm": 0.45875837749866927, "learning_rate": 1.0170274047028068e-06, "loss": 0.1912, "step": 325, "ts_encoder_learning_rate": 9.929318906602176e-07 }, { "epoch": 3.7549909255898366, "grad_norm": 0.4823250266490247, "learning_rate": 9.929318906602176e-07, "loss": 0.194, "step": 326, "ts_encoder_learning_rate": 9.690937431144725e-07 }, { "epoch": 3.7666061705989113, "grad_norm": 0.4354804449964891, "learning_rate": 9.690937431144725e-07, "loss": 0.193, "step": 327, "ts_encoder_learning_rate": 9.455144931452459e-07 }, { "epoch": 3.7782214156079856, "grad_norm": 0.4634865596978523, "learning_rate": 9.455144931452459e-07, "loss": 0.2046, "step": 328, "ts_encoder_learning_rate": 9.221956552036992e-07 }, { "epoch": 3.78983666061706, "grad_norm": 0.455260620824431, "learning_rate": 9.221956552036992e-07, "loss": 0.1939, "step": 329, "ts_encoder_learning_rate": 8.991387270152202e-07 }, { "epoch": 3.801451905626134, "grad_norm": 0.49649814336003645, "learning_rate": 8.991387270152202e-07, "loss": 0.2102, "step": 330, "ts_encoder_learning_rate": 8.76345189483222e-07 }, { "epoch": 3.8130671506352085, "grad_norm": 0.49114892681403455, "learning_rate": 8.76345189483222e-07, "loss": 0.1963, "step": 331, "ts_encoder_learning_rate": 8.538165065940263e-07 }, { "epoch": 3.8246823956442833, "grad_norm": 0.5286315843085168, "learning_rate": 8.538165065940263e-07, "loss": 0.2193, "step": 332, "ts_encoder_learning_rate": 8.315541253228332e-07 }, { "epoch": 3.8362976406533575, "grad_norm": 0.4405444221304671, "learning_rate": 8.315541253228332e-07, "loss": 0.1914, "step": 333, "ts_encoder_learning_rate": 8.095594755407971e-07 }, { "epoch": 3.847912885662432, "grad_norm": 0.4698521600536052, "learning_rate": 8.095594755407971e-07, "loss": 0.2235, "step": 334, "ts_encoder_learning_rate": 7.878339699231702e-07 }, { "epoch": 3.8595281306715066, "grad_norm": 0.4371598229337324, "learning_rate": 7.878339699231702e-07, "loss": 0.1839, "step": 335, "ts_encoder_learning_rate": 7.663790038585794e-07 }, { "epoch": 3.871143375680581, "grad_norm": 0.45962066234037935, "learning_rate": 7.663790038585794e-07, "loss": 0.1995, "step": 336, "ts_encoder_learning_rate": 7.451959553594051e-07 }, { "epoch": 3.882758620689655, "grad_norm": 0.42173091596380397, "learning_rate": 7.451959553594051e-07, "loss": 0.1827, "step": 337, "ts_encoder_learning_rate": 7.242861849732696e-07 }, { "epoch": 3.8943738656987295, "grad_norm": 0.4331430485638045, "learning_rate": 7.242861849732696e-07, "loss": 0.2012, "step": 338, "ts_encoder_learning_rate": 7.036510356956494e-07 }, { "epoch": 3.9059891107078037, "grad_norm": 0.4465736646292548, "learning_rate": 7.036510356956494e-07, "loss": 0.1989, "step": 339, "ts_encoder_learning_rate": 6.832918328836247e-07 }, { "epoch": 3.9176043557168785, "grad_norm": 0.43493507516276353, "learning_rate": 6.832918328836247e-07, "loss": 0.1951, "step": 340, "ts_encoder_learning_rate": 6.632098841707458e-07 }, { "epoch": 3.9292196007259528, "grad_norm": 0.42038926625601386, "learning_rate": 6.632098841707458e-07, "loss": 0.1915, "step": 341, "ts_encoder_learning_rate": 6.43406479383053e-07 }, { "epoch": 3.940834845735027, "grad_norm": 0.46480078485720294, "learning_rate": 6.43406479383053e-07, "loss": 0.1917, "step": 342, "ts_encoder_learning_rate": 6.238828904562316e-07 }, { "epoch": 3.952450090744102, "grad_norm": 0.4490758579669617, "learning_rate": 6.238828904562316e-07, "loss": 0.2059, "step": 343, "ts_encoder_learning_rate": 6.04640371353914e-07 }, { "epoch": 3.964065335753176, "grad_norm": 0.4613515058442109, "learning_rate": 6.04640371353914e-07, "loss": 0.1985, "step": 344, "ts_encoder_learning_rate": 5.856801579871457e-07 }, { "epoch": 3.9756805807622504, "grad_norm": 0.4456006110376394, "learning_rate": 5.856801579871457e-07, "loss": 0.2025, "step": 345, "ts_encoder_learning_rate": 5.670034681349995e-07 }, { "epoch": 3.9872958257713247, "grad_norm": 0.42933039768161857, "learning_rate": 5.670034681349995e-07, "loss": 0.2012, "step": 346, "ts_encoder_learning_rate": 5.486115013663668e-07 }, { "epoch": 3.9989110707803994, "grad_norm": 0.44999338968368285, "learning_rate": 5.486115013663668e-07, "loss": 0.187, "step": 347, "ts_encoder_learning_rate": 5.305054389629022e-07 }, { "epoch": 4.0, "grad_norm": 0.44999338968368285, "learning_rate": 5.305054389629022e-07, "loss": 0.0145, "step": 348, "ts_encoder_learning_rate": 5.126864438431628e-07 }, { "epoch": 4.011615245009074, "grad_norm": 0.42250491238346477, "learning_rate": 5.126864438431628e-07, "loss": 0.1884, "step": 349, "ts_encoder_learning_rate": 4.951556604879049e-07 }, { "epoch": 4.023230490018149, "grad_norm": 0.4230629092251735, "learning_rate": 4.951556604879049e-07, "loss": 0.1905, "step": 350, "ts_encoder_learning_rate": 4.779142148665855e-07 }, { "epoch": 4.034845735027223, "grad_norm": 0.4174529397738527, "learning_rate": 4.779142148665855e-07, "loss": 0.1658, "step": 351, "ts_encoder_learning_rate": 4.6096321436504e-07 }, { "epoch": 4.046460980036298, "grad_norm": 0.43118539472845935, "learning_rate": 4.6096321436504e-07, "loss": 0.184, "step": 352, "ts_encoder_learning_rate": 4.4430374771435245e-07 }, { "epoch": 4.058076225045372, "grad_norm": 0.41001720611475784, "learning_rate": 4.4430374771435245e-07, "loss": 0.1828, "step": 353, "ts_encoder_learning_rate": 4.279368849209381e-07 }, { "epoch": 4.069691470054447, "grad_norm": 0.4308084296921054, "learning_rate": 4.279368849209381e-07, "loss": 0.1964, "step": 354, "ts_encoder_learning_rate": 4.1186367719780737e-07 }, { "epoch": 4.081306715063521, "grad_norm": 0.4407009368252455, "learning_rate": 4.1186367719780737e-07, "loss": 0.1735, "step": 355, "ts_encoder_learning_rate": 3.960851568970586e-07 }, { "epoch": 4.092921960072595, "grad_norm": 0.41256069394964856, "learning_rate": 3.960851568970586e-07, "loss": 0.1911, "step": 356, "ts_encoder_learning_rate": 3.8060233744356634e-07 }, { "epoch": 4.1045372050816695, "grad_norm": 0.4221589175474974, "learning_rate": 3.8060233744356634e-07, "loss": 0.1863, "step": 357, "ts_encoder_learning_rate": 3.6541621326989183e-07 }, { "epoch": 4.116152450090744, "grad_norm": 0.42751127372899456, "learning_rate": 3.6541621326989183e-07, "loss": 0.2028, "step": 358, "ts_encoder_learning_rate": 3.5052775975241203e-07 }, { "epoch": 4.127767695099818, "grad_norm": 0.40851979892771395, "learning_rate": 3.5052775975241203e-07, "loss": 0.1718, "step": 359, "ts_encoder_learning_rate": 3.359379331486762e-07 }, { "epoch": 4.139382940108893, "grad_norm": 0.41178087270431546, "learning_rate": 3.359379331486762e-07, "loss": 0.181, "step": 360, "ts_encoder_learning_rate": 3.216476705359839e-07 }, { "epoch": 4.150998185117968, "grad_norm": 0.4104761665945451, "learning_rate": 3.216476705359839e-07, "loss": 0.1872, "step": 361, "ts_encoder_learning_rate": 3.076578897511978e-07 }, { "epoch": 4.162613430127042, "grad_norm": 0.41012969922765047, "learning_rate": 3.076578897511978e-07, "loss": 0.182, "step": 362, "ts_encoder_learning_rate": 2.939694893317979e-07 }, { "epoch": 4.174228675136116, "grad_norm": 0.41789852835375363, "learning_rate": 2.939694893317979e-07, "loss": 0.1937, "step": 363, "ts_encoder_learning_rate": 2.8058334845816214e-07 }, { "epoch": 4.1858439201451905, "grad_norm": 0.42143167915544566, "learning_rate": 2.8058334845816214e-07, "loss": 0.1838, "step": 364, "ts_encoder_learning_rate": 2.6750032689710604e-07 }, { "epoch": 4.197459165154265, "grad_norm": 0.39213432837711776, "learning_rate": 2.6750032689710604e-07, "loss": 0.1742, "step": 365, "ts_encoder_learning_rate": 2.547212649466568e-07 }, { "epoch": 4.209074410163339, "grad_norm": 0.3958142973041478, "learning_rate": 2.547212649466568e-07, "loss": 0.1919, "step": 366, "ts_encoder_learning_rate": 2.4224698338208344e-07 }, { "epoch": 4.220689655172414, "grad_norm": 0.44213215894104213, "learning_rate": 2.4224698338208344e-07, "loss": 0.1841, "step": 367, "ts_encoder_learning_rate": 2.3007828340318117e-07 }, { "epoch": 4.2323049001814885, "grad_norm": 0.4245037412302445, "learning_rate": 2.3007828340318117e-07, "loss": 0.1891, "step": 368, "ts_encoder_learning_rate": 2.1821594658280932e-07 }, { "epoch": 4.243920145190563, "grad_norm": 0.4132437127742664, "learning_rate": 2.1821594658280932e-07, "loss": 0.1803, "step": 369, "ts_encoder_learning_rate": 2.0666073481669714e-07 }, { "epoch": 4.255535390199637, "grad_norm": 0.411342871117412, "learning_rate": 2.0666073481669714e-07, "loss": 0.1812, "step": 370, "ts_encoder_learning_rate": 1.9541339027450256e-07 }, { "epoch": 4.267150635208711, "grad_norm": 0.42070511786632736, "learning_rate": 1.9541339027450256e-07, "loss": 0.1781, "step": 371, "ts_encoder_learning_rate": 1.8447463535214872e-07 }, { "epoch": 4.278765880217786, "grad_norm": 0.3960054704602751, "learning_rate": 1.8447463535214872e-07, "loss": 0.1872, "step": 372, "ts_encoder_learning_rate": 1.7384517262542255e-07 }, { "epoch": 4.29038112522686, "grad_norm": 0.4146312365888249, "learning_rate": 1.7384517262542255e-07, "loss": 0.1996, "step": 373, "ts_encoder_learning_rate": 1.6352568480485277e-07 }, { "epoch": 4.301996370235934, "grad_norm": 0.4302254069498354, "learning_rate": 1.6352568480485277e-07, "loss": 0.1799, "step": 374, "ts_encoder_learning_rate": 1.5351683469185973e-07 }, { "epoch": 4.3136116152450095, "grad_norm": 0.4048487003041662, "learning_rate": 1.5351683469185973e-07, "loss": 0.1853, "step": 375, "ts_encoder_learning_rate": 1.4381926513618139e-07 }, { "epoch": 4.325226860254084, "grad_norm": 0.4340645587831662, "learning_rate": 1.4381926513618139e-07, "loss": 0.1745, "step": 376, "ts_encoder_learning_rate": 1.3443359899458997e-07 }, { "epoch": 4.336842105263158, "grad_norm": 0.419931335758943, "learning_rate": 1.3443359899458997e-07, "loss": 0.2001, "step": 377, "ts_encoder_learning_rate": 1.253604390908819e-07 }, { "epoch": 4.348457350272232, "grad_norm": 0.41065254305787063, "learning_rate": 1.253604390908819e-07, "loss": 0.1856, "step": 378, "ts_encoder_learning_rate": 1.1660036817716492e-07 }, { "epoch": 4.360072595281307, "grad_norm": 0.4386933347694567, "learning_rate": 1.1660036817716492e-07, "loss": 0.1885, "step": 379, "ts_encoder_learning_rate": 1.0815394889642339e-07 }, { "epoch": 4.371687840290381, "grad_norm": 0.4206901632288436, "learning_rate": 1.0815394889642339e-07, "loss": 0.1855, "step": 380, "ts_encoder_learning_rate": 1.0002172374638519e-07 }, { "epoch": 4.383303085299455, "grad_norm": 0.45345577005415333, "learning_rate": 1.0002172374638519e-07, "loss": 0.201, "step": 381, "ts_encoder_learning_rate": 9.22042150446728e-08 }, { "epoch": 4.3949183303085295, "grad_norm": 0.42825131486091655, "learning_rate": 9.22042150446728e-08, "loss": 0.1977, "step": 382, "ts_encoder_learning_rate": 8.470192489526519e-08 }, { "epoch": 4.406533575317605, "grad_norm": 0.4180892476123211, "learning_rate": 8.470192489526519e-08, "loss": 0.1819, "step": 383, "ts_encoder_learning_rate": 7.7515335156238e-08 }, { "epoch": 4.418148820326679, "grad_norm": 0.3988898186373836, "learning_rate": 7.7515335156238e-08, "loss": 0.1845, "step": 384, "ts_encoder_learning_rate": 7.064490740882057e-08 }, { "epoch": 4.429764065335753, "grad_norm": 0.4224273125164809, "learning_rate": 7.064490740882057e-08, "loss": 0.1831, "step": 385, "ts_encoder_learning_rate": 6.409108292774912e-08 }, { "epoch": 4.441379310344828, "grad_norm": 0.40574161987282553, "learning_rate": 6.409108292774912e-08, "loss": 0.1884, "step": 386, "ts_encoder_learning_rate": 5.785428265292381e-08 }, { "epoch": 4.452994555353902, "grad_norm": 0.4205040886436754, "learning_rate": 5.785428265292381e-08, "loss": 0.1854, "step": 387, "ts_encoder_learning_rate": 5.1934907162370374e-08 }, { "epoch": 4.464609800362976, "grad_norm": 0.41648710747921297, "learning_rate": 5.1934907162370374e-08, "loss": 0.1756, "step": 388, "ts_encoder_learning_rate": 4.63333366465174e-08 }, { "epoch": 4.4762250453720505, "grad_norm": 0.39540267589707684, "learning_rate": 4.63333366465174e-08, "loss": 0.1831, "step": 389, "ts_encoder_learning_rate": 4.104993088376974e-08 }, { "epoch": 4.487840290381126, "grad_norm": 0.3927817907648021, "learning_rate": 4.104993088376974e-08, "loss": 0.1742, "step": 390, "ts_encoder_learning_rate": 3.608502921740753e-08 }, { "epoch": 4.4994555353902, "grad_norm": 0.43059843726946884, "learning_rate": 3.608502921740753e-08, "loss": 0.1922, "step": 391, "ts_encoder_learning_rate": 3.143895053378698e-08 }, { "epoch": 4.511070780399274, "grad_norm": 0.41821997175820497, "learning_rate": 3.143895053378698e-08, "loss": 0.188, "step": 392, "ts_encoder_learning_rate": 2.7111993241860646e-08 }, { "epoch": 4.5226860254083485, "grad_norm": 0.44898297045915464, "learning_rate": 2.7111993241860646e-08, "loss": 0.195, "step": 393, "ts_encoder_learning_rate": 2.3104435254008852e-08 }, { "epoch": 4.534301270417423, "grad_norm": 0.41081766017860594, "learning_rate": 2.3104435254008852e-08, "loss": 0.1764, "step": 394, "ts_encoder_learning_rate": 1.9416533968193428e-08 }, { "epoch": 4.545916515426497, "grad_norm": 0.4267367649318197, "learning_rate": 1.9416533968193428e-08, "loss": 0.178, "step": 395, "ts_encoder_learning_rate": 1.6048526251421502e-08 }, { "epoch": 4.557531760435571, "grad_norm": 0.40492762389862497, "learning_rate": 1.6048526251421502e-08, "loss": 0.1833, "step": 396, "ts_encoder_learning_rate": 1.3000628424535978e-08 }, { "epoch": 4.569147005444647, "grad_norm": 0.41530646634421503, "learning_rate": 1.3000628424535978e-08, "loss": 0.1764, "step": 397, "ts_encoder_learning_rate": 1.0273036248318325e-08 }, { "epoch": 4.580762250453721, "grad_norm": 0.41054541493317387, "learning_rate": 1.0273036248318325e-08, "loss": 0.1884, "step": 398, "ts_encoder_learning_rate": 7.865924910916977e-09 }, { "epoch": 4.592377495462795, "grad_norm": 0.3927284291620028, "learning_rate": 7.865924910916977e-09, "loss": 0.174, "step": 399, "ts_encoder_learning_rate": 5.779449016595773e-09 }, { "epoch": 4.6039927404718695, "grad_norm": 0.40810543942098576, "learning_rate": 5.779449016595773e-09, "loss": 0.1944, "step": 400, "ts_encoder_learning_rate": 4.0137425758018935e-09 }, { "epoch": 4.6039927404718695, "step": 400, "total_flos": 667646607294464.0, "train_loss": 0.43487690573791044, "train_runtime": 29492.4665, "train_samples_per_second": 6.944, "train_steps_per_second": 0.014, "ts_encoder_learning_rate": 4.0137425758018935e-09 } ], "logging_steps": 1.0, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 667646607294464.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }