STReasoner-8B-CoT / trainer_state.json
Time-HD-Anonymous's picture
Upload folder using huggingface_hub
2144e39 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.6039927404718695,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01161524500907441,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 4.6825,
"step": 1,
"ts_encoder_learning_rate": 0.0
},
{
"epoch": 0.02323049001814882,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 4.6428,
"step": 2,
"ts_encoder_learning_rate": 0.0
},
{
"epoch": 0.03484573502722323,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 4.6696,
"step": 3,
"ts_encoder_learning_rate": 0.0
},
{
"epoch": 0.04646098003629764,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 4.6588,
"step": 4,
"ts_encoder_learning_rate": 0.0
},
{
"epoch": 0.05807622504537205,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 4.7245,
"step": 5,
"ts_encoder_learning_rate": 0.0
},
{
"epoch": 0.06969147005444647,
"grad_norm": 152.50412651338738,
"learning_rate": 0.0,
"loss": 4.6365,
"step": 6,
"ts_encoder_learning_rate": 1.25e-06
},
{
"epoch": 0.08130671506352087,
"grad_norm": 153.96358887797103,
"learning_rate": 1.25e-06,
"loss": 4.6399,
"step": 7,
"ts_encoder_learning_rate": 2.5e-06
},
{
"epoch": 0.09292196007259527,
"grad_norm": 119.30991989360129,
"learning_rate": 2.5e-06,
"loss": 4.0109,
"step": 8,
"ts_encoder_learning_rate": 3.7500000000000005e-06
},
{
"epoch": 0.10453720508166969,
"grad_norm": 62.980179337580374,
"learning_rate": 3.7500000000000005e-06,
"loss": 2.8417,
"step": 9,
"ts_encoder_learning_rate": 5e-06
},
{
"epoch": 0.1161524500907441,
"grad_norm": 33.513562064012554,
"learning_rate": 5e-06,
"loss": 2.1341,
"step": 10,
"ts_encoder_learning_rate": 6.25e-06
},
{
"epoch": 0.1277676950998185,
"grad_norm": 14.313077235644876,
"learning_rate": 6.25e-06,
"loss": 1.7057,
"step": 11,
"ts_encoder_learning_rate": 7.500000000000001e-06
},
{
"epoch": 0.13938294010889293,
"grad_norm": 6.369652791543636,
"learning_rate": 7.500000000000001e-06,
"loss": 1.3565,
"step": 12,
"ts_encoder_learning_rate": 8.750000000000001e-06
},
{
"epoch": 0.15099818511796734,
"grad_norm": 3.731701365388006,
"learning_rate": 8.750000000000001e-06,
"loss": 1.1662,
"step": 13,
"ts_encoder_learning_rate": 1e-05
},
{
"epoch": 0.16261343012704174,
"grad_norm": 2.6361585613163587,
"learning_rate": 1e-05,
"loss": 1.065,
"step": 14,
"ts_encoder_learning_rate": 9.999839429671632e-06
},
{
"epoch": 0.17422867513611615,
"grad_norm": 1.7275515554655163,
"learning_rate": 9.999839429671632e-06,
"loss": 1.0108,
"step": 15,
"ts_encoder_learning_rate": 9.999357728999657e-06
},
{
"epoch": 0.18584392014519055,
"grad_norm": 1.3395365357042155,
"learning_rate": 9.999357728999657e-06,
"loss": 0.9446,
"step": 16,
"ts_encoder_learning_rate": 9.99855492892281e-06
},
{
"epoch": 0.19745916515426498,
"grad_norm": 1.2724446763819477,
"learning_rate": 9.99855492892281e-06,
"loss": 0.8875,
"step": 17,
"ts_encoder_learning_rate": 9.99743108100344e-06
},
{
"epoch": 0.20907441016333939,
"grad_norm": 1.1656594933692108,
"learning_rate": 9.99743108100344e-06,
"loss": 0.8609,
"step": 18,
"ts_encoder_learning_rate": 9.9959862574242e-06
},
{
"epoch": 0.2206896551724138,
"grad_norm": 0.8157300015925317,
"learning_rate": 9.9959862574242e-06,
"loss": 0.8392,
"step": 19,
"ts_encoder_learning_rate": 9.994220550983404e-06
},
{
"epoch": 0.2323049001814882,
"grad_norm": 0.8647381094727201,
"learning_rate": 9.994220550983404e-06,
"loss": 0.8233,
"step": 20,
"ts_encoder_learning_rate": 9.992134075089085e-06
},
{
"epoch": 0.24392014519056263,
"grad_norm": 0.7179729634940382,
"learning_rate": 9.992134075089085e-06,
"loss": 0.7767,
"step": 21,
"ts_encoder_learning_rate": 9.989726963751683e-06
},
{
"epoch": 0.255535390199637,
"grad_norm": 0.5610758217332802,
"learning_rate": 9.989726963751683e-06,
"loss": 0.7525,
"step": 22,
"ts_encoder_learning_rate": 9.986999371575465e-06
},
{
"epoch": 0.2671506352087114,
"grad_norm": 0.6140193378709962,
"learning_rate": 9.986999371575465e-06,
"loss": 0.7503,
"step": 23,
"ts_encoder_learning_rate": 9.983951473748579e-06
},
{
"epoch": 0.27876588021778587,
"grad_norm": 0.5463424314841056,
"learning_rate": 9.983951473748579e-06,
"loss": 0.7361,
"step": 24,
"ts_encoder_learning_rate": 9.980583466031808e-06
},
{
"epoch": 0.29038112522686027,
"grad_norm": 0.5456053434098116,
"learning_rate": 9.980583466031808e-06,
"loss": 0.7445,
"step": 25,
"ts_encoder_learning_rate": 9.976895564745993e-06
},
{
"epoch": 0.3019963702359347,
"grad_norm": 0.4724488207315688,
"learning_rate": 9.976895564745993e-06,
"loss": 0.7258,
"step": 26,
"ts_encoder_learning_rate": 9.97288800675814e-06
},
{
"epoch": 0.3136116152450091,
"grad_norm": 0.43473472160091314,
"learning_rate": 9.97288800675814e-06,
"loss": 0.7131,
"step": 27,
"ts_encoder_learning_rate": 9.968561049466214e-06
},
{
"epoch": 0.3252268602540835,
"grad_norm": 0.4651449922676477,
"learning_rate": 9.968561049466214e-06,
"loss": 0.7125,
"step": 28,
"ts_encoder_learning_rate": 9.963914970782594e-06
},
{
"epoch": 0.3368421052631579,
"grad_norm": 0.42304487514126465,
"learning_rate": 9.963914970782594e-06,
"loss": 0.6958,
"step": 29,
"ts_encoder_learning_rate": 9.95895006911623e-06
},
{
"epoch": 0.3484573502722323,
"grad_norm": 0.37512478085470513,
"learning_rate": 9.95895006911623e-06,
"loss": 0.6815,
"step": 30,
"ts_encoder_learning_rate": 9.953666663353485e-06
},
{
"epoch": 0.3600725952813067,
"grad_norm": 0.38681860428776565,
"learning_rate": 9.953666663353485e-06,
"loss": 0.682,
"step": 31,
"ts_encoder_learning_rate": 9.948065092837631e-06
},
{
"epoch": 0.3716878402903811,
"grad_norm": 0.3878313981786008,
"learning_rate": 9.948065092837631e-06,
"loss": 0.6705,
"step": 32,
"ts_encoder_learning_rate": 9.942145717347077e-06
},
{
"epoch": 0.38330308529945556,
"grad_norm": 0.3483493230771332,
"learning_rate": 9.942145717347077e-06,
"loss": 0.6517,
"step": 33,
"ts_encoder_learning_rate": 9.935908917072253e-06
},
{
"epoch": 0.39491833030852996,
"grad_norm": 0.32925851151692315,
"learning_rate": 9.935908917072253e-06,
"loss": 0.6611,
"step": 34,
"ts_encoder_learning_rate": 9.92935509259118e-06
},
{
"epoch": 0.40653357531760437,
"grad_norm": 0.3655692991223213,
"learning_rate": 9.92935509259118e-06,
"loss": 0.6449,
"step": 35,
"ts_encoder_learning_rate": 9.922484664843763e-06
},
{
"epoch": 0.41814882032667877,
"grad_norm": 0.3582656975961708,
"learning_rate": 9.922484664843763e-06,
"loss": 0.6522,
"step": 36,
"ts_encoder_learning_rate": 9.915298075104735e-06
},
{
"epoch": 0.4297640653357532,
"grad_norm": 0.33379075182945417,
"learning_rate": 9.915298075104735e-06,
"loss": 0.6416,
"step": 37,
"ts_encoder_learning_rate": 9.907795784955327e-06
},
{
"epoch": 0.4413793103448276,
"grad_norm": 0.3347132110396014,
"learning_rate": 9.907795784955327e-06,
"loss": 0.6539,
"step": 38,
"ts_encoder_learning_rate": 9.899978276253617e-06
},
{
"epoch": 0.452994555353902,
"grad_norm": 0.3241158793623529,
"learning_rate": 9.899978276253617e-06,
"loss": 0.6438,
"step": 39,
"ts_encoder_learning_rate": 9.891846051103578e-06
},
{
"epoch": 0.4646098003629764,
"grad_norm": 0.3455452567899899,
"learning_rate": 9.891846051103578e-06,
"loss": 0.6316,
"step": 40,
"ts_encoder_learning_rate": 9.883399631822836e-06
},
{
"epoch": 0.4762250453720508,
"grad_norm": 0.31313557573101863,
"learning_rate": 9.883399631822836e-06,
"loss": 0.6389,
"step": 41,
"ts_encoder_learning_rate": 9.874639560909118e-06
},
{
"epoch": 0.48784029038112525,
"grad_norm": 0.3280416621294979,
"learning_rate": 9.874639560909118e-06,
"loss": 0.6285,
"step": 42,
"ts_encoder_learning_rate": 9.86556640100541e-06
},
{
"epoch": 0.49945553539019966,
"grad_norm": 0.3198451666750831,
"learning_rate": 9.86556640100541e-06,
"loss": 0.6356,
"step": 43,
"ts_encoder_learning_rate": 9.85618073486382e-06
},
{
"epoch": 0.511070780399274,
"grad_norm": 0.32065369243985437,
"learning_rate": 9.85618073486382e-06,
"loss": 0.6301,
"step": 44,
"ts_encoder_learning_rate": 9.846483165308142e-06
},
{
"epoch": 0.5226860254083484,
"grad_norm": 0.2985686533559952,
"learning_rate": 9.846483165308142e-06,
"loss": 0.6094,
"step": 45,
"ts_encoder_learning_rate": 9.836474315195148e-06
},
{
"epoch": 0.5343012704174228,
"grad_norm": 0.3043913719441071,
"learning_rate": 9.836474315195148e-06,
"loss": 0.618,
"step": 46,
"ts_encoder_learning_rate": 9.826154827374578e-06
},
{
"epoch": 0.5459165154264973,
"grad_norm": 0.29426029916433744,
"learning_rate": 9.826154827374578e-06,
"loss": 0.6117,
"step": 47,
"ts_encoder_learning_rate": 9.815525364647853e-06
},
{
"epoch": 0.5575317604355717,
"grad_norm": 0.29759373582076726,
"learning_rate": 9.815525364647853e-06,
"loss": 0.6102,
"step": 48,
"ts_encoder_learning_rate": 9.804586609725499e-06
},
{
"epoch": 0.5691470054446461,
"grad_norm": 0.2991170372194726,
"learning_rate": 9.804586609725499e-06,
"loss": 0.5973,
"step": 49,
"ts_encoder_learning_rate": 9.793339265183303e-06
},
{
"epoch": 0.5807622504537205,
"grad_norm": 0.297629927322108,
"learning_rate": 9.793339265183303e-06,
"loss": 0.5997,
"step": 50,
"ts_encoder_learning_rate": 9.781784053417192e-06
},
{
"epoch": 0.592377495462795,
"grad_norm": 0.29559157031475897,
"learning_rate": 9.781784053417192e-06,
"loss": 0.6012,
"step": 51,
"ts_encoder_learning_rate": 9.76992171659682e-06
},
{
"epoch": 0.6039927404718693,
"grad_norm": 0.30135176793549534,
"learning_rate": 9.76992171659682e-06,
"loss": 0.5997,
"step": 52,
"ts_encoder_learning_rate": 9.757753016617917e-06
},
{
"epoch": 0.6156079854809438,
"grad_norm": 0.2830877744764034,
"learning_rate": 9.757753016617917e-06,
"loss": 0.593,
"step": 53,
"ts_encoder_learning_rate": 9.745278735053345e-06
},
{
"epoch": 0.6272232304900182,
"grad_norm": 0.30643259357036984,
"learning_rate": 9.745278735053345e-06,
"loss": 0.5861,
"step": 54,
"ts_encoder_learning_rate": 9.732499673102895e-06
},
{
"epoch": 0.6388384754990926,
"grad_norm": 0.33677739208267987,
"learning_rate": 9.732499673102895e-06,
"loss": 0.579,
"step": 55,
"ts_encoder_learning_rate": 9.719416651541839e-06
},
{
"epoch": 0.650453720508167,
"grad_norm": 0.3205752816564006,
"learning_rate": 9.719416651541839e-06,
"loss": 0.589,
"step": 56,
"ts_encoder_learning_rate": 9.706030510668202e-06
},
{
"epoch": 0.6620689655172414,
"grad_norm": 0.3136856624465887,
"learning_rate": 9.706030510668202e-06,
"loss": 0.5719,
"step": 57,
"ts_encoder_learning_rate": 9.692342110248802e-06
},
{
"epoch": 0.6736842105263158,
"grad_norm": 0.3538385796052594,
"learning_rate": 9.692342110248802e-06,
"loss": 0.5836,
"step": 58,
"ts_encoder_learning_rate": 9.678352329464018e-06
},
{
"epoch": 0.6852994555353902,
"grad_norm": 0.3412039613534201,
"learning_rate": 9.678352329464018e-06,
"loss": 0.5776,
"step": 59,
"ts_encoder_learning_rate": 9.664062066851325e-06
},
{
"epoch": 0.6969147005444646,
"grad_norm": 0.3470773794842348,
"learning_rate": 9.664062066851325e-06,
"loss": 0.5851,
"step": 60,
"ts_encoder_learning_rate": 9.649472240247588e-06
},
{
"epoch": 0.708529945553539,
"grad_norm": 0.3212528977463762,
"learning_rate": 9.649472240247588e-06,
"loss": 0.5739,
"step": 61,
"ts_encoder_learning_rate": 9.63458378673011e-06
},
{
"epoch": 0.7201451905626134,
"grad_norm": 0.3628948373052999,
"learning_rate": 9.63458378673011e-06,
"loss": 0.5667,
"step": 62,
"ts_encoder_learning_rate": 9.619397662556434e-06
},
{
"epoch": 0.7317604355716878,
"grad_norm": 0.35120878235548797,
"learning_rate": 9.619397662556434e-06,
"loss": 0.5746,
"step": 63,
"ts_encoder_learning_rate": 9.603914843102941e-06
},
{
"epoch": 0.7433756805807622,
"grad_norm": 0.34896054004358656,
"learning_rate": 9.603914843102941e-06,
"loss": 0.5683,
"step": 64,
"ts_encoder_learning_rate": 9.588136322802194e-06
},
{
"epoch": 0.7549909255898367,
"grad_norm": 0.34262893904308916,
"learning_rate": 9.588136322802194e-06,
"loss": 0.5576,
"step": 65,
"ts_encoder_learning_rate": 9.572063115079063e-06
},
{
"epoch": 0.7666061705989111,
"grad_norm": 0.3380023238243971,
"learning_rate": 9.572063115079063e-06,
"loss": 0.553,
"step": 66,
"ts_encoder_learning_rate": 9.555696252285648e-06
},
{
"epoch": 0.7782214156079855,
"grad_norm": 0.3875484812249266,
"learning_rate": 9.555696252285648e-06,
"loss": 0.5596,
"step": 67,
"ts_encoder_learning_rate": 9.539036785634961e-06
},
{
"epoch": 0.7898366606170599,
"grad_norm": 0.35930423509307985,
"learning_rate": 9.539036785634961e-06,
"loss": 0.5545,
"step": 68,
"ts_encoder_learning_rate": 9.522085785133415e-06
},
{
"epoch": 0.8014519056261343,
"grad_norm": 0.3791388166119568,
"learning_rate": 9.522085785133415e-06,
"loss": 0.5513,
"step": 69,
"ts_encoder_learning_rate": 9.504844339512096e-06
},
{
"epoch": 0.8130671506352087,
"grad_norm": 0.34685014988411594,
"learning_rate": 9.504844339512096e-06,
"loss": 0.5516,
"step": 70,
"ts_encoder_learning_rate": 9.48731355615684e-06
},
{
"epoch": 0.8246823956442831,
"grad_norm": 0.35909523874401894,
"learning_rate": 9.48731355615684e-06,
"loss": 0.5425,
"step": 71,
"ts_encoder_learning_rate": 9.469494561037097e-06
},
{
"epoch": 0.8362976406533575,
"grad_norm": 0.4403897777700719,
"learning_rate": 9.469494561037097e-06,
"loss": 0.5329,
"step": 72,
"ts_encoder_learning_rate": 9.451388498633635e-06
},
{
"epoch": 0.847912885662432,
"grad_norm": 0.3818885699775511,
"learning_rate": 9.451388498633635e-06,
"loss": 0.5365,
"step": 73,
"ts_encoder_learning_rate": 9.432996531865001e-06
},
{
"epoch": 0.8595281306715064,
"grad_norm": 0.40435312969694975,
"learning_rate": 9.432996531865001e-06,
"loss": 0.5315,
"step": 74,
"ts_encoder_learning_rate": 9.414319842012855e-06
},
{
"epoch": 0.8711433756805808,
"grad_norm": 0.4248553104454013,
"learning_rate": 9.414319842012855e-06,
"loss": 0.533,
"step": 75,
"ts_encoder_learning_rate": 9.395359628646087e-06
},
{
"epoch": 0.8827586206896552,
"grad_norm": 0.39206791521135576,
"learning_rate": 9.395359628646087e-06,
"loss": 0.5256,
"step": 76,
"ts_encoder_learning_rate": 9.376117109543769e-06
},
{
"epoch": 0.8943738656987296,
"grad_norm": 0.36377005854333166,
"learning_rate": 9.376117109543769e-06,
"loss": 0.5194,
"step": 77,
"ts_encoder_learning_rate": 9.356593520616948e-06
},
{
"epoch": 0.905989110707804,
"grad_norm": 0.43146303271047914,
"learning_rate": 9.356593520616948e-06,
"loss": 0.5267,
"step": 78,
"ts_encoder_learning_rate": 9.336790115829255e-06
},
{
"epoch": 0.9176043557168784,
"grad_norm": 0.3621214209550119,
"learning_rate": 9.336790115829255e-06,
"loss": 0.5218,
"step": 79,
"ts_encoder_learning_rate": 9.316708167116377e-06
},
{
"epoch": 0.9292196007259528,
"grad_norm": 0.40377960504482446,
"learning_rate": 9.316708167116377e-06,
"loss": 0.5214,
"step": 80,
"ts_encoder_learning_rate": 9.296348964304351e-06
},
{
"epoch": 0.9408348457350272,
"grad_norm": 0.3806316238680835,
"learning_rate": 9.296348964304351e-06,
"loss": 0.5102,
"step": 81,
"ts_encoder_learning_rate": 9.275713815026732e-06
},
{
"epoch": 0.9524500907441016,
"grad_norm": 0.387322276165842,
"learning_rate": 9.275713815026732e-06,
"loss": 0.5069,
"step": 82,
"ts_encoder_learning_rate": 9.254804044640596e-06
},
{
"epoch": 0.964065335753176,
"grad_norm": 0.46164777708230237,
"learning_rate": 9.254804044640596e-06,
"loss": 0.4985,
"step": 83,
"ts_encoder_learning_rate": 9.233620996141421e-06
},
{
"epoch": 0.9756805807622505,
"grad_norm": 0.4208251582273127,
"learning_rate": 9.233620996141421e-06,
"loss": 0.5086,
"step": 84,
"ts_encoder_learning_rate": 9.212166030076832e-06
},
{
"epoch": 0.9872958257713249,
"grad_norm": 0.43078427861557256,
"learning_rate": 9.212166030076832e-06,
"loss": 0.525,
"step": 85,
"ts_encoder_learning_rate": 9.190440524459203e-06
},
{
"epoch": 0.9989110707803993,
"grad_norm": 0.47136114267604184,
"learning_rate": 9.190440524459203e-06,
"loss": 0.504,
"step": 86,
"ts_encoder_learning_rate": 9.168445874677168e-06
},
{
"epoch": 1.0,
"grad_norm": 0.47136114267604184,
"learning_rate": 9.168445874677168e-06,
"loss": 0.0433,
"step": 87,
"ts_encoder_learning_rate": 9.146183493405976e-06
},
{
"epoch": 1.0116152450090745,
"grad_norm": 0.4459150481878699,
"learning_rate": 9.146183493405976e-06,
"loss": 0.4937,
"step": 88,
"ts_encoder_learning_rate": 9.12365481051678e-06
},
{
"epoch": 1.0232304900181488,
"grad_norm": 0.4648677558952652,
"learning_rate": 9.12365481051678e-06,
"loss": 0.4814,
"step": 89,
"ts_encoder_learning_rate": 9.10086127298478e-06
},
{
"epoch": 1.0348457350272233,
"grad_norm": 0.424833617058968,
"learning_rate": 9.10086127298478e-06,
"loss": 0.4872,
"step": 90,
"ts_encoder_learning_rate": 9.077804344796302e-06
},
{
"epoch": 1.0464609800362976,
"grad_norm": 0.5447833529096903,
"learning_rate": 9.077804344796302e-06,
"loss": 0.465,
"step": 91,
"ts_encoder_learning_rate": 9.054485506854756e-06
},
{
"epoch": 1.0580762250453721,
"grad_norm": 0.5645327960600057,
"learning_rate": 9.054485506854756e-06,
"loss": 0.465,
"step": 92,
"ts_encoder_learning_rate": 9.030906256885528e-06
},
{
"epoch": 1.0696914700544464,
"grad_norm": 0.5707423730743318,
"learning_rate": 9.030906256885528e-06,
"loss": 0.4664,
"step": 93,
"ts_encoder_learning_rate": 9.007068109339783e-06
},
{
"epoch": 1.081306715063521,
"grad_norm": 0.49970416310158533,
"learning_rate": 9.007068109339783e-06,
"loss": 0.458,
"step": 94,
"ts_encoder_learning_rate": 8.982972595297195e-06
},
{
"epoch": 1.0929219600725952,
"grad_norm": 0.4760304595595311,
"learning_rate": 8.982972595297195e-06,
"loss": 0.4669,
"step": 95,
"ts_encoder_learning_rate": 8.9586212623676e-06
},
{
"epoch": 1.1045372050816697,
"grad_norm": 0.6546521338664805,
"learning_rate": 8.9586212623676e-06,
"loss": 0.4631,
"step": 96,
"ts_encoder_learning_rate": 8.93401567459161e-06
},
{
"epoch": 1.116152450090744,
"grad_norm": 0.6820617032835106,
"learning_rate": 8.93401567459161e-06,
"loss": 0.4556,
"step": 97,
"ts_encoder_learning_rate": 8.90915741234015e-06
},
{
"epoch": 1.1277676950998186,
"grad_norm": 0.4791674527928274,
"learning_rate": 8.90915741234015e-06,
"loss": 0.4461,
"step": 98,
"ts_encoder_learning_rate": 8.884048072212952e-06
},
{
"epoch": 1.1393829401088928,
"grad_norm": 0.8342160989342361,
"learning_rate": 8.884048072212952e-06,
"loss": 0.4665,
"step": 99,
"ts_encoder_learning_rate": 8.85868926693601e-06
},
{
"epoch": 1.1509981851179674,
"grad_norm": 0.6077292569121415,
"learning_rate": 8.85868926693601e-06,
"loss": 0.4492,
"step": 100,
"ts_encoder_learning_rate": 8.833082625258003e-06
},
{
"epoch": 1.1626134301270417,
"grad_norm": 0.5647955360914533,
"learning_rate": 8.833082625258003e-06,
"loss": 0.4435,
"step": 101,
"ts_encoder_learning_rate": 8.807229791845673e-06
},
{
"epoch": 1.1742286751361162,
"grad_norm": 0.6044541014322447,
"learning_rate": 8.807229791845673e-06,
"loss": 0.4427,
"step": 102,
"ts_encoder_learning_rate": 8.781132427178203e-06
},
{
"epoch": 1.1858439201451905,
"grad_norm": 0.5437886950515926,
"learning_rate": 8.781132427178203e-06,
"loss": 0.4427,
"step": 103,
"ts_encoder_learning_rate": 8.754792207440557e-06
},
{
"epoch": 1.197459165154265,
"grad_norm": 0.5946421067486096,
"learning_rate": 8.754792207440557e-06,
"loss": 0.4327,
"step": 104,
"ts_encoder_learning_rate": 8.728210824415829e-06
},
{
"epoch": 1.2090744101633395,
"grad_norm": 0.6434751459915273,
"learning_rate": 8.728210824415829e-06,
"loss": 0.4342,
"step": 105,
"ts_encoder_learning_rate": 8.701389985376578e-06
},
{
"epoch": 1.2206896551724138,
"grad_norm": 0.5051510187934748,
"learning_rate": 8.701389985376578e-06,
"loss": 0.4398,
"step": 106,
"ts_encoder_learning_rate": 8.674331412975178e-06
},
{
"epoch": 1.232304900181488,
"grad_norm": 0.5595267159920397,
"learning_rate": 8.674331412975178e-06,
"loss": 0.442,
"step": 107,
"ts_encoder_learning_rate": 8.647036845133171e-06
},
{
"epoch": 1.2439201451905626,
"grad_norm": 0.525092656163426,
"learning_rate": 8.647036845133171e-06,
"loss": 0.43,
"step": 108,
"ts_encoder_learning_rate": 8.619508034929646e-06
},
{
"epoch": 1.2555353901996371,
"grad_norm": 0.5105752951496461,
"learning_rate": 8.619508034929646e-06,
"loss": 0.4218,
"step": 109,
"ts_encoder_learning_rate": 8.591746750488639e-06
},
{
"epoch": 1.2671506352087114,
"grad_norm": 0.5231024334601619,
"learning_rate": 8.591746750488639e-06,
"loss": 0.4182,
"step": 110,
"ts_encoder_learning_rate": 8.563754774865574e-06
},
{
"epoch": 1.278765880217786,
"grad_norm": 0.4819620849275164,
"learning_rate": 8.563754774865574e-06,
"loss": 0.4246,
"step": 111,
"ts_encoder_learning_rate": 8.535533905932739e-06
},
{
"epoch": 1.2903811252268602,
"grad_norm": 0.5106626492387893,
"learning_rate": 8.535533905932739e-06,
"loss": 0.4156,
"step": 112,
"ts_encoder_learning_rate": 8.507085956263808e-06
},
{
"epoch": 1.3019963702359347,
"grad_norm": 0.48163467909413527,
"learning_rate": 8.507085956263808e-06,
"loss": 0.3944,
"step": 113,
"ts_encoder_learning_rate": 8.478412753017433e-06
},
{
"epoch": 1.313611615245009,
"grad_norm": 0.4708287815222732,
"learning_rate": 8.478412753017433e-06,
"loss": 0.4198,
"step": 114,
"ts_encoder_learning_rate": 8.449516137819875e-06
},
{
"epoch": 1.3252268602540835,
"grad_norm": 0.45335927979980417,
"learning_rate": 8.449516137819875e-06,
"loss": 0.4004,
"step": 115,
"ts_encoder_learning_rate": 8.420397966646732e-06
},
{
"epoch": 1.3368421052631578,
"grad_norm": 0.4731479114137599,
"learning_rate": 8.420397966646732e-06,
"loss": 0.4012,
"step": 116,
"ts_encoder_learning_rate": 8.391060109703725e-06
},
{
"epoch": 1.3484573502722323,
"grad_norm": 0.45763071454798787,
"learning_rate": 8.391060109703725e-06,
"loss": 0.4043,
"step": 117,
"ts_encoder_learning_rate": 8.361504451306585e-06
},
{
"epoch": 1.3600725952813066,
"grad_norm": 0.49218004748616023,
"learning_rate": 8.361504451306585e-06,
"loss": 0.3804,
"step": 118,
"ts_encoder_learning_rate": 8.331732889760021e-06
},
{
"epoch": 1.3716878402903812,
"grad_norm": 0.48411439748425,
"learning_rate": 8.331732889760021e-06,
"loss": 0.3768,
"step": 119,
"ts_encoder_learning_rate": 8.301747337235798e-06
},
{
"epoch": 1.3833030852994557,
"grad_norm": 0.5158115467010486,
"learning_rate": 8.301747337235798e-06,
"loss": 0.3893,
"step": 120,
"ts_encoder_learning_rate": 8.271549719649923e-06
},
{
"epoch": 1.39491833030853,
"grad_norm": 0.4717657156486388,
"learning_rate": 8.271549719649923e-06,
"loss": 0.3929,
"step": 121,
"ts_encoder_learning_rate": 8.241141976538944e-06
},
{
"epoch": 1.4065335753176043,
"grad_norm": 0.48449961022029603,
"learning_rate": 8.241141976538944e-06,
"loss": 0.3903,
"step": 122,
"ts_encoder_learning_rate": 8.210526060935377e-06
},
{
"epoch": 1.4181488203266788,
"grad_norm": 0.5590109000920023,
"learning_rate": 8.210526060935377e-06,
"loss": 0.3731,
"step": 123,
"ts_encoder_learning_rate": 8.179703939242276e-06
},
{
"epoch": 1.4297640653357533,
"grad_norm": 0.48980864166559324,
"learning_rate": 8.179703939242276e-06,
"loss": 0.37,
"step": 124,
"ts_encoder_learning_rate": 8.148677591106919e-06
},
{
"epoch": 1.4413793103448276,
"grad_norm": 0.49306252756918445,
"learning_rate": 8.148677591106919e-06,
"loss": 0.3821,
"step": 125,
"ts_encoder_learning_rate": 8.117449009293668e-06
},
{
"epoch": 1.4529945553539019,
"grad_norm": 0.5343587386921387,
"learning_rate": 8.117449009293668e-06,
"loss": 0.388,
"step": 126,
"ts_encoder_learning_rate": 8.08602019955598e-06
},
{
"epoch": 1.4646098003629764,
"grad_norm": 0.4932498699505098,
"learning_rate": 8.08602019955598e-06,
"loss": 0.3689,
"step": 127,
"ts_encoder_learning_rate": 8.054393180507572e-06
},
{
"epoch": 1.476225045372051,
"grad_norm": 0.5115591275832498,
"learning_rate": 8.054393180507572e-06,
"loss": 0.3705,
"step": 128,
"ts_encoder_learning_rate": 8.022569983492781e-06
},
{
"epoch": 1.4878402903811252,
"grad_norm": 0.5293741893039812,
"learning_rate": 8.022569983492781e-06,
"loss": 0.3757,
"step": 129,
"ts_encoder_learning_rate": 7.99055265245608e-06
},
{
"epoch": 1.4994555353901997,
"grad_norm": 0.4820946133813763,
"learning_rate": 7.99055265245608e-06,
"loss": 0.3774,
"step": 130,
"ts_encoder_learning_rate": 7.958343243810818e-06
},
{
"epoch": 1.511070780399274,
"grad_norm": 0.5715571029957693,
"learning_rate": 7.958343243810818e-06,
"loss": 0.3536,
"step": 131,
"ts_encoder_learning_rate": 7.925943826307119e-06
},
{
"epoch": 1.5226860254083485,
"grad_norm": 0.45688383927056603,
"learning_rate": 7.925943826307119e-06,
"loss": 0.3781,
"step": 132,
"ts_encoder_learning_rate": 7.89335648089903e-06
},
{
"epoch": 1.5343012704174228,
"grad_norm": 0.5417387120089402,
"learning_rate": 7.89335648089903e-06,
"loss": 0.3849,
"step": 133,
"ts_encoder_learning_rate": 7.860583300610849e-06
},
{
"epoch": 1.5459165154264973,
"grad_norm": 0.48249798152601187,
"learning_rate": 7.860583300610849e-06,
"loss": 0.3614,
"step": 134,
"ts_encoder_learning_rate": 7.827626390402707e-06
},
{
"epoch": 1.5575317604355718,
"grad_norm": 0.49148409141245863,
"learning_rate": 7.827626390402707e-06,
"loss": 0.3604,
"step": 135,
"ts_encoder_learning_rate": 7.794487867035358e-06
},
{
"epoch": 1.5691470054446461,
"grad_norm": 0.5575426863669594,
"learning_rate": 7.794487867035358e-06,
"loss": 0.3582,
"step": 136,
"ts_encoder_learning_rate": 7.761169858934238e-06
},
{
"epoch": 1.5807622504537204,
"grad_norm": 0.47759386240015317,
"learning_rate": 7.761169858934238e-06,
"loss": 0.364,
"step": 137,
"ts_encoder_learning_rate": 7.727674506052744e-06
},
{
"epoch": 1.592377495462795,
"grad_norm": 0.5295175062956325,
"learning_rate": 7.727674506052744e-06,
"loss": 0.3543,
"step": 138,
"ts_encoder_learning_rate": 7.694003959734802e-06
},
{
"epoch": 1.6039927404718695,
"grad_norm": 0.6604920964139046,
"learning_rate": 7.694003959734802e-06,
"loss": 0.3607,
"step": 139,
"ts_encoder_learning_rate": 7.660160382576683e-06
},
{
"epoch": 1.6156079854809438,
"grad_norm": 0.5412776964498659,
"learning_rate": 7.660160382576683e-06,
"loss": 0.3494,
"step": 140,
"ts_encoder_learning_rate": 7.626145948288107e-06
},
{
"epoch": 1.627223230490018,
"grad_norm": 0.4885017642227708,
"learning_rate": 7.626145948288107e-06,
"loss": 0.3563,
"step": 141,
"ts_encoder_learning_rate": 7.591962841552627e-06
},
{
"epoch": 1.6388384754990926,
"grad_norm": 0.6310454372740532,
"learning_rate": 7.591962841552627e-06,
"loss": 0.3513,
"step": 142,
"ts_encoder_learning_rate": 7.55761325788731e-06
},
{
"epoch": 1.650453720508167,
"grad_norm": 0.44382432912819253,
"learning_rate": 7.55761325788731e-06,
"loss": 0.3562,
"step": 143,
"ts_encoder_learning_rate": 7.52309940350173e-06
},
{
"epoch": 1.6620689655172414,
"grad_norm": 0.5462511565321053,
"learning_rate": 7.52309940350173e-06,
"loss": 0.3461,
"step": 144,
"ts_encoder_learning_rate": 7.488423495156258e-06
},
{
"epoch": 1.6736842105263157,
"grad_norm": 0.4959866346692337,
"learning_rate": 7.488423495156258e-06,
"loss": 0.3615,
"step": 145,
"ts_encoder_learning_rate": 7.453587760019691e-06
},
{
"epoch": 1.6852994555353902,
"grad_norm": 0.481447728937644,
"learning_rate": 7.453587760019691e-06,
"loss": 0.3532,
"step": 146,
"ts_encoder_learning_rate": 7.4185944355261996e-06
},
{
"epoch": 1.6969147005444647,
"grad_norm": 0.4773235171153849,
"learning_rate": 7.4185944355261996e-06,
"loss": 0.3646,
"step": 147,
"ts_encoder_learning_rate": 7.383445769231628e-06
},
{
"epoch": 1.708529945553539,
"grad_norm": 0.4922147267880892,
"learning_rate": 7.383445769231628e-06,
"loss": 0.349,
"step": 148,
"ts_encoder_learning_rate": 7.348144018669129e-06
},
{
"epoch": 1.7201451905626133,
"grad_norm": 0.500323445240717,
"learning_rate": 7.348144018669129e-06,
"loss": 0.3271,
"step": 149,
"ts_encoder_learning_rate": 7.312691451204178e-06
},
{
"epoch": 1.7317604355716878,
"grad_norm": 0.47751019497636976,
"learning_rate": 7.312691451204178e-06,
"loss": 0.3635,
"step": 150,
"ts_encoder_learning_rate": 7.277090343888931e-06
},
{
"epoch": 1.7433756805807623,
"grad_norm": 0.4939172584022833,
"learning_rate": 7.277090343888931e-06,
"loss": 0.3522,
"step": 151,
"ts_encoder_learning_rate": 7.241342983315985e-06
},
{
"epoch": 1.7549909255898366,
"grad_norm": 0.526535710546132,
"learning_rate": 7.241342983315985e-06,
"loss": 0.3379,
"step": 152,
"ts_encoder_learning_rate": 7.205451665471515e-06
},
{
"epoch": 1.7666061705989111,
"grad_norm": 0.48512244017684314,
"learning_rate": 7.205451665471515e-06,
"loss": 0.3671,
"step": 153,
"ts_encoder_learning_rate": 7.169418695587791e-06
},
{
"epoch": 1.7782214156079856,
"grad_norm": 0.5249208108710423,
"learning_rate": 7.169418695587791e-06,
"loss": 0.353,
"step": 154,
"ts_encoder_learning_rate": 7.1332463879951404e-06
},
{
"epoch": 1.78983666061706,
"grad_norm": 0.5939685667617669,
"learning_rate": 7.1332463879951404e-06,
"loss": 0.3338,
"step": 155,
"ts_encoder_learning_rate": 7.096937065973285e-06
},
{
"epoch": 1.8014519056261342,
"grad_norm": 0.46870716209574953,
"learning_rate": 7.096937065973285e-06,
"loss": 0.3258,
"step": 156,
"ts_encoder_learning_rate": 7.060493061602128e-06
},
{
"epoch": 1.8130671506352087,
"grad_norm": 0.5004844023800996,
"learning_rate": 7.060493061602128e-06,
"loss": 0.3338,
"step": 157,
"ts_encoder_learning_rate": 7.023916715611969e-06
},
{
"epoch": 1.8246823956442833,
"grad_norm": 0.6018225120929914,
"learning_rate": 7.023916715611969e-06,
"loss": 0.3559,
"step": 158,
"ts_encoder_learning_rate": 6.987210377233165e-06
},
{
"epoch": 1.8362976406533575,
"grad_norm": 0.5115688533474527,
"learning_rate": 6.987210377233165e-06,
"loss": 0.3215,
"step": 159,
"ts_encoder_learning_rate": 6.950376404045235e-06
},
{
"epoch": 1.8479128856624318,
"grad_norm": 0.5453997597445909,
"learning_rate": 6.950376404045235e-06,
"loss": 0.3288,
"step": 160,
"ts_encoder_learning_rate": 6.913417161825449e-06
},
{
"epoch": 1.8595281306715064,
"grad_norm": 0.5468491048869575,
"learning_rate": 6.913417161825449e-06,
"loss": 0.3351,
"step": 161,
"ts_encoder_learning_rate": 6.876335024396872e-06
},
{
"epoch": 1.8711433756805809,
"grad_norm": 0.5127435043219846,
"learning_rate": 6.876335024396872e-06,
"loss": 0.3261,
"step": 162,
"ts_encoder_learning_rate": 6.839132373475894e-06
},
{
"epoch": 1.8827586206896552,
"grad_norm": 0.5334108745888081,
"learning_rate": 6.839132373475894e-06,
"loss": 0.315,
"step": 163,
"ts_encoder_learning_rate": 6.801811598519268e-06
},
{
"epoch": 1.8943738656987295,
"grad_norm": 0.5154441355243857,
"learning_rate": 6.801811598519268e-06,
"loss": 0.3427,
"step": 164,
"ts_encoder_learning_rate": 6.764375096570628e-06
},
{
"epoch": 1.905989110707804,
"grad_norm": 0.5561507577872592,
"learning_rate": 6.764375096570628e-06,
"loss": 0.3259,
"step": 165,
"ts_encoder_learning_rate": 6.726825272106539e-06
},
{
"epoch": 1.9176043557168785,
"grad_norm": 0.5291334612556954,
"learning_rate": 6.726825272106539e-06,
"loss": 0.3198,
"step": 166,
"ts_encoder_learning_rate": 6.689164536882059e-06
},
{
"epoch": 1.9292196007259528,
"grad_norm": 0.5011785801262693,
"learning_rate": 6.689164536882059e-06,
"loss": 0.3187,
"step": 167,
"ts_encoder_learning_rate": 6.651395309775837e-06
},
{
"epoch": 1.940834845735027,
"grad_norm": 0.5750688070408072,
"learning_rate": 6.651395309775837e-06,
"loss": 0.2997,
"step": 168,
"ts_encoder_learning_rate": 6.6135200166347505e-06
},
{
"epoch": 1.9524500907441016,
"grad_norm": 0.5043052526525891,
"learning_rate": 6.6135200166347505e-06,
"loss": 0.3145,
"step": 169,
"ts_encoder_learning_rate": 6.575541090118105e-06
},
{
"epoch": 1.964065335753176,
"grad_norm": 0.4896874363768986,
"learning_rate": 6.575541090118105e-06,
"loss": 0.3009,
"step": 170,
"ts_encoder_learning_rate": 6.537460969541378e-06
},
{
"epoch": 1.9756805807622504,
"grad_norm": 0.5088309056638338,
"learning_rate": 6.537460969541378e-06,
"loss": 0.3129,
"step": 171,
"ts_encoder_learning_rate": 6.499282100719558e-06
},
{
"epoch": 1.987295825771325,
"grad_norm": 0.49023494845007226,
"learning_rate": 6.499282100719558e-06,
"loss": 0.3082,
"step": 172,
"ts_encoder_learning_rate": 6.461006935810048e-06
},
{
"epoch": 1.9989110707803994,
"grad_norm": 0.5280481482147419,
"learning_rate": 6.461006935810048e-06,
"loss": 0.302,
"step": 173,
"ts_encoder_learning_rate": 6.4226379331551625e-06
},
{
"epoch": 2.0,
"grad_norm": 0.5280481482147419,
"learning_rate": 6.4226379331551625e-06,
"loss": 0.0333,
"step": 174,
"ts_encoder_learning_rate": 6.384177557124247e-06
},
{
"epoch": 2.0116152450090743,
"grad_norm": 0.5645755484448244,
"learning_rate": 6.384177557124247e-06,
"loss": 0.2784,
"step": 175,
"ts_encoder_learning_rate": 6.345628277955384e-06
},
{
"epoch": 2.023230490018149,
"grad_norm": 0.7089731024549738,
"learning_rate": 6.345628277955384e-06,
"loss": 0.2864,
"step": 176,
"ts_encoder_learning_rate": 6.306992571596742e-06
},
{
"epoch": 2.0348457350272233,
"grad_norm": 0.5602666036492625,
"learning_rate": 6.306992571596742e-06,
"loss": 0.2881,
"step": 177,
"ts_encoder_learning_rate": 6.268272919547537e-06
},
{
"epoch": 2.0464609800362976,
"grad_norm": 0.6075921236619318,
"learning_rate": 6.268272919547537e-06,
"loss": 0.2798,
"step": 178,
"ts_encoder_learning_rate": 6.229471808698673e-06
},
{
"epoch": 2.058076225045372,
"grad_norm": 0.6984592942833859,
"learning_rate": 6.229471808698673e-06,
"loss": 0.2673,
"step": 179,
"ts_encoder_learning_rate": 6.1905917311729915e-06
},
{
"epoch": 2.0696914700544466,
"grad_norm": 0.5022810378072105,
"learning_rate": 6.1905917311729915e-06,
"loss": 0.2849,
"step": 180,
"ts_encoder_learning_rate": 6.151635184165219e-06
},
{
"epoch": 2.081306715063521,
"grad_norm": 0.5093282715650761,
"learning_rate": 6.151635184165219e-06,
"loss": 0.3026,
"step": 181,
"ts_encoder_learning_rate": 6.112604669781572e-06
},
{
"epoch": 2.0929219600725952,
"grad_norm": 0.6048006185139588,
"learning_rate": 6.112604669781572e-06,
"loss": 0.2835,
"step": 182,
"ts_encoder_learning_rate": 6.073502694879059e-06
},
{
"epoch": 2.1045372050816695,
"grad_norm": 0.5065083654679191,
"learning_rate": 6.073502694879059e-06,
"loss": 0.2782,
"step": 183,
"ts_encoder_learning_rate": 6.034331770904455e-06
},
{
"epoch": 2.1161524500907443,
"grad_norm": 0.5598494532388394,
"learning_rate": 6.034331770904455e-06,
"loss": 0.2825,
"step": 184,
"ts_encoder_learning_rate": 5.9950944137330125e-06
},
{
"epoch": 2.1277676950998186,
"grad_norm": 0.530982359241071,
"learning_rate": 5.9950944137330125e-06,
"loss": 0.2702,
"step": 185,
"ts_encoder_learning_rate": 5.955793143506863e-06
},
{
"epoch": 2.139382940108893,
"grad_norm": 0.509154289045405,
"learning_rate": 5.955793143506863e-06,
"loss": 0.2785,
"step": 186,
"ts_encoder_learning_rate": 5.916430484473149e-06
},
{
"epoch": 2.150998185117967,
"grad_norm": 0.5119364886674341,
"learning_rate": 5.916430484473149e-06,
"loss": 0.2895,
"step": 187,
"ts_encoder_learning_rate": 5.877008964821909e-06
},
{
"epoch": 2.162613430127042,
"grad_norm": 0.5034054419615417,
"learning_rate": 5.877008964821909e-06,
"loss": 0.2936,
"step": 188,
"ts_encoder_learning_rate": 5.837531116523683e-06
},
{
"epoch": 2.174228675136116,
"grad_norm": 0.5673133335829393,
"learning_rate": 5.837531116523683e-06,
"loss": 0.2762,
"step": 189,
"ts_encoder_learning_rate": 5.797999475166897e-06
},
{
"epoch": 2.1858439201451905,
"grad_norm": 0.5459438366130899,
"learning_rate": 5.797999475166897e-06,
"loss": 0.2718,
"step": 190,
"ts_encoder_learning_rate": 5.7584165797950055e-06
},
{
"epoch": 2.1974591651542648,
"grad_norm": 0.5533315060900448,
"learning_rate": 5.7584165797950055e-06,
"loss": 0.2754,
"step": 191,
"ts_encoder_learning_rate": 5.71878497274341e-06
},
{
"epoch": 2.2090744101633395,
"grad_norm": 0.6022264770129343,
"learning_rate": 5.71878497274341e-06,
"loss": 0.2743,
"step": 192,
"ts_encoder_learning_rate": 5.679107199476174e-06
},
{
"epoch": 2.220689655172414,
"grad_norm": 0.5540881606867427,
"learning_rate": 5.679107199476174e-06,
"loss": 0.2878,
"step": 193,
"ts_encoder_learning_rate": 5.6393858084225305e-06
},
{
"epoch": 2.232304900181488,
"grad_norm": 0.5577935030781893,
"learning_rate": 5.6393858084225305e-06,
"loss": 0.2677,
"step": 194,
"ts_encoder_learning_rate": 5.599623350813202e-06
},
{
"epoch": 2.243920145190563,
"grad_norm": 0.5369265581968106,
"learning_rate": 5.599623350813202e-06,
"loss": 0.26,
"step": 195,
"ts_encoder_learning_rate": 5.559822380516539e-06
},
{
"epoch": 2.255535390199637,
"grad_norm": 0.5350922011535203,
"learning_rate": 5.559822380516539e-06,
"loss": 0.2936,
"step": 196,
"ts_encoder_learning_rate": 5.5199854538744905e-06
},
{
"epoch": 2.2671506352087114,
"grad_norm": 0.534100714512928,
"learning_rate": 5.5199854538744905e-06,
"loss": 0.2756,
"step": 197,
"ts_encoder_learning_rate": 5.480115129538409e-06
},
{
"epoch": 2.2787658802177857,
"grad_norm": 0.5103849200596632,
"learning_rate": 5.480115129538409e-06,
"loss": 0.2498,
"step": 198,
"ts_encoder_learning_rate": 5.440213968304728e-06
},
{
"epoch": 2.2903811252268604,
"grad_norm": 0.5702739710122362,
"learning_rate": 5.440213968304728e-06,
"loss": 0.2708,
"step": 199,
"ts_encoder_learning_rate": 5.4002845329504675e-06
},
{
"epoch": 2.3019963702359347,
"grad_norm": 0.5216996613028344,
"learning_rate": 5.4002845329504675e-06,
"loss": 0.2668,
"step": 200,
"ts_encoder_learning_rate": 5.360329388068649e-06
},
{
"epoch": 2.313611615245009,
"grad_norm": 0.5388651334047538,
"learning_rate": 5.360329388068649e-06,
"loss": 0.2703,
"step": 201,
"ts_encoder_learning_rate": 5.320351099903565e-06
},
{
"epoch": 2.3252268602540833,
"grad_norm": 0.6001939955314459,
"learning_rate": 5.320351099903565e-06,
"loss": 0.261,
"step": 202,
"ts_encoder_learning_rate": 5.2803522361859596e-06
},
{
"epoch": 2.336842105263158,
"grad_norm": 0.4980937517422858,
"learning_rate": 5.2803522361859596e-06,
"loss": 0.2404,
"step": 203,
"ts_encoder_learning_rate": 5.240335365968104e-06
},
{
"epoch": 2.3484573502722323,
"grad_norm": 0.5342465570777747,
"learning_rate": 5.240335365968104e-06,
"loss": 0.2654,
"step": 204,
"ts_encoder_learning_rate": 5.2003030594587964e-06
},
{
"epoch": 2.3600725952813066,
"grad_norm": 0.5621041789253622,
"learning_rate": 5.2003030594587964e-06,
"loss": 0.2733,
"step": 205,
"ts_encoder_learning_rate": 5.160257887858278e-06
},
{
"epoch": 2.371687840290381,
"grad_norm": 0.5582661532374841,
"learning_rate": 5.160257887858278e-06,
"loss": 0.2597,
"step": 206,
"ts_encoder_learning_rate": 5.120202423193085e-06
},
{
"epoch": 2.3833030852994557,
"grad_norm": 0.5783285206586468,
"learning_rate": 5.120202423193085e-06,
"loss": 0.2525,
"step": 207,
"ts_encoder_learning_rate": 5.080139238150869e-06
},
{
"epoch": 2.39491833030853,
"grad_norm": 0.4822046196118616,
"learning_rate": 5.080139238150869e-06,
"loss": 0.2518,
"step": 208,
"ts_encoder_learning_rate": 5.040070905915139e-06
},
{
"epoch": 2.4065335753176043,
"grad_norm": 0.5559233478270967,
"learning_rate": 5.040070905915139e-06,
"loss": 0.28,
"step": 209,
"ts_encoder_learning_rate": 5e-06
},
{
"epoch": 2.418148820326679,
"grad_norm": 0.48901203329897386,
"learning_rate": 5e-06,
"loss": 0.2515,
"step": 210,
"ts_encoder_learning_rate": 4.959929094084862e-06
},
{
"epoch": 2.4297640653357533,
"grad_norm": 0.48746183943137245,
"learning_rate": 4.959929094084862e-06,
"loss": 0.2518,
"step": 211,
"ts_encoder_learning_rate": 4.919860761849132e-06
},
{
"epoch": 2.4413793103448276,
"grad_norm": 0.5009144470099608,
"learning_rate": 4.919860761849132e-06,
"loss": 0.2532,
"step": 212,
"ts_encoder_learning_rate": 4.879797576806915e-06
},
{
"epoch": 2.452994555353902,
"grad_norm": 0.4965510882041085,
"learning_rate": 4.879797576806915e-06,
"loss": 0.243,
"step": 213,
"ts_encoder_learning_rate": 4.839742112141725e-06
},
{
"epoch": 2.464609800362976,
"grad_norm": 0.48351081176855426,
"learning_rate": 4.839742112141725e-06,
"loss": 0.2596,
"step": 214,
"ts_encoder_learning_rate": 4.799696940541204e-06
},
{
"epoch": 2.476225045372051,
"grad_norm": 0.5172008324226596,
"learning_rate": 4.799696940541204e-06,
"loss": 0.236,
"step": 215,
"ts_encoder_learning_rate": 4.759664634031897e-06
},
{
"epoch": 2.487840290381125,
"grad_norm": 0.5571046196144138,
"learning_rate": 4.759664634031897e-06,
"loss": 0.2479,
"step": 216,
"ts_encoder_learning_rate": 4.719647763814041e-06
},
{
"epoch": 2.4994555353901995,
"grad_norm": 0.5421950769241719,
"learning_rate": 4.719647763814041e-06,
"loss": 0.2691,
"step": 217,
"ts_encoder_learning_rate": 4.679648900096436e-06
},
{
"epoch": 2.5110707803992742,
"grad_norm": 0.5330899469762319,
"learning_rate": 4.679648900096436e-06,
"loss": 0.2579,
"step": 218,
"ts_encoder_learning_rate": 4.6396706119313526e-06
},
{
"epoch": 2.5226860254083485,
"grad_norm": 0.5460154625334825,
"learning_rate": 4.6396706119313526e-06,
"loss": 0.2496,
"step": 219,
"ts_encoder_learning_rate": 4.599715467049534e-06
},
{
"epoch": 2.534301270417423,
"grad_norm": 0.5440271431449177,
"learning_rate": 4.599715467049534e-06,
"loss": 0.2455,
"step": 220,
"ts_encoder_learning_rate": 4.559786031695275e-06
},
{
"epoch": 2.545916515426497,
"grad_norm": 0.5133166382378065,
"learning_rate": 4.559786031695275e-06,
"loss": 0.2636,
"step": 221,
"ts_encoder_learning_rate": 4.5198848704615915e-06
},
{
"epoch": 2.557531760435572,
"grad_norm": 0.5366585588218753,
"learning_rate": 4.5198848704615915e-06,
"loss": 0.247,
"step": 222,
"ts_encoder_learning_rate": 4.480014546125511e-06
},
{
"epoch": 2.569147005444646,
"grad_norm": 0.5474900733819053,
"learning_rate": 4.480014546125511e-06,
"loss": 0.2795,
"step": 223,
"ts_encoder_learning_rate": 4.4401776194834615e-06
},
{
"epoch": 2.5807622504537204,
"grad_norm": 0.4975693304306332,
"learning_rate": 4.4401776194834615e-06,
"loss": 0.2401,
"step": 224,
"ts_encoder_learning_rate": 4.4003766491867984e-06
},
{
"epoch": 2.592377495462795,
"grad_norm": 0.5106865786465035,
"learning_rate": 4.4003766491867984e-06,
"loss": 0.242,
"step": 225,
"ts_encoder_learning_rate": 4.3606141915774695e-06
},
{
"epoch": 2.6039927404718695,
"grad_norm": 0.5168549085943859,
"learning_rate": 4.3606141915774695e-06,
"loss": 0.2335,
"step": 226,
"ts_encoder_learning_rate": 4.320892800523827e-06
},
{
"epoch": 2.6156079854809438,
"grad_norm": 0.5211543822385405,
"learning_rate": 4.320892800523827e-06,
"loss": 0.2493,
"step": 227,
"ts_encoder_learning_rate": 4.281215027256592e-06
},
{
"epoch": 2.627223230490018,
"grad_norm": 0.5127128225307483,
"learning_rate": 4.281215027256592e-06,
"loss": 0.2514,
"step": 228,
"ts_encoder_learning_rate": 4.241583420204998e-06
},
{
"epoch": 2.6388384754990923,
"grad_norm": 0.5378693778374266,
"learning_rate": 4.241583420204998e-06,
"loss": 0.2399,
"step": 229,
"ts_encoder_learning_rate": 4.2020005248331056e-06
},
{
"epoch": 2.650453720508167,
"grad_norm": 0.5721336179178363,
"learning_rate": 4.2020005248331056e-06,
"loss": 0.2558,
"step": 230,
"ts_encoder_learning_rate": 4.162468883476319e-06
},
{
"epoch": 2.6620689655172414,
"grad_norm": 0.5507499405880641,
"learning_rate": 4.162468883476319e-06,
"loss": 0.2423,
"step": 231,
"ts_encoder_learning_rate": 4.122991035178093e-06
},
{
"epoch": 2.6736842105263157,
"grad_norm": 0.5191297273890276,
"learning_rate": 4.122991035178093e-06,
"loss": 0.2387,
"step": 232,
"ts_encoder_learning_rate": 4.083569515526853e-06
},
{
"epoch": 2.6852994555353904,
"grad_norm": 0.5056716954679873,
"learning_rate": 4.083569515526853e-06,
"loss": 0.2379,
"step": 233,
"ts_encoder_learning_rate": 4.04420685649314e-06
},
{
"epoch": 2.6969147005444647,
"grad_norm": 0.5020728789197858,
"learning_rate": 4.04420685649314e-06,
"loss": 0.2341,
"step": 234,
"ts_encoder_learning_rate": 4.004905586266988e-06
},
{
"epoch": 2.708529945553539,
"grad_norm": 0.49710632823598544,
"learning_rate": 4.004905586266988e-06,
"loss": 0.2142,
"step": 235,
"ts_encoder_learning_rate": 3.965668229095546e-06
},
{
"epoch": 2.7201451905626133,
"grad_norm": 0.46282201836826814,
"learning_rate": 3.965668229095546e-06,
"loss": 0.25,
"step": 236,
"ts_encoder_learning_rate": 3.926497305120943e-06
},
{
"epoch": 2.7317604355716876,
"grad_norm": 0.5037406395194425,
"learning_rate": 3.926497305120943e-06,
"loss": 0.2423,
"step": 237,
"ts_encoder_learning_rate": 3.887395330218429e-06
},
{
"epoch": 2.7433756805807623,
"grad_norm": 0.5502464701719969,
"learning_rate": 3.887395330218429e-06,
"loss": 0.2371,
"step": 238,
"ts_encoder_learning_rate": 3.848364815834782e-06
},
{
"epoch": 2.7549909255898366,
"grad_norm": 0.488890082077443,
"learning_rate": 3.848364815834782e-06,
"loss": 0.2367,
"step": 239,
"ts_encoder_learning_rate": 3.809408268827009e-06
},
{
"epoch": 2.7666061705989113,
"grad_norm": 0.5122316791585159,
"learning_rate": 3.809408268827009e-06,
"loss": 0.2506,
"step": 240,
"ts_encoder_learning_rate": 3.7705281913013286e-06
},
{
"epoch": 2.7782214156079856,
"grad_norm": 0.4868899262078824,
"learning_rate": 3.7705281913013286e-06,
"loss": 0.2413,
"step": 241,
"ts_encoder_learning_rate": 3.731727080452464e-06
},
{
"epoch": 2.78983666061706,
"grad_norm": 0.5086075968707988,
"learning_rate": 3.731727080452464e-06,
"loss": 0.2421,
"step": 242,
"ts_encoder_learning_rate": 3.6930074284032613e-06
},
{
"epoch": 2.801451905626134,
"grad_norm": 0.49235271981217643,
"learning_rate": 3.6930074284032613e-06,
"loss": 0.2406,
"step": 243,
"ts_encoder_learning_rate": 3.654371722044616e-06
},
{
"epoch": 2.8130671506352085,
"grad_norm": 0.49103994839708176,
"learning_rate": 3.654371722044616e-06,
"loss": 0.2439,
"step": 244,
"ts_encoder_learning_rate": 3.6158224428757538e-06
},
{
"epoch": 2.8246823956442833,
"grad_norm": 0.9993476254292818,
"learning_rate": 3.6158224428757538e-06,
"loss": 0.2268,
"step": 245,
"ts_encoder_learning_rate": 3.5773620668448384e-06
},
{
"epoch": 2.8362976406533575,
"grad_norm": 0.5159214069274143,
"learning_rate": 3.5773620668448384e-06,
"loss": 0.2388,
"step": 246,
"ts_encoder_learning_rate": 3.538993064189954e-06
},
{
"epoch": 2.847912885662432,
"grad_norm": 0.543713795445949,
"learning_rate": 3.538993064189954e-06,
"loss": 0.2284,
"step": 247,
"ts_encoder_learning_rate": 3.500717899280442e-06
},
{
"epoch": 2.8595281306715066,
"grad_norm": 0.5140037192841642,
"learning_rate": 3.500717899280442e-06,
"loss": 0.2305,
"step": 248,
"ts_encoder_learning_rate": 3.4625390304586224e-06
},
{
"epoch": 2.871143375680581,
"grad_norm": 0.49143474790675895,
"learning_rate": 3.4625390304586224e-06,
"loss": 0.2348,
"step": 249,
"ts_encoder_learning_rate": 3.424458909881897e-06
},
{
"epoch": 2.882758620689655,
"grad_norm": 0.5203399617942011,
"learning_rate": 3.424458909881897e-06,
"loss": 0.2175,
"step": 250,
"ts_encoder_learning_rate": 3.386479983365251e-06
},
{
"epoch": 2.8943738656987295,
"grad_norm": 0.5363618954072708,
"learning_rate": 3.386479983365251e-06,
"loss": 0.2289,
"step": 251,
"ts_encoder_learning_rate": 3.3486046902241663e-06
},
{
"epoch": 2.9059891107078037,
"grad_norm": 0.49208512127705756,
"learning_rate": 3.3486046902241663e-06,
"loss": 0.2386,
"step": 252,
"ts_encoder_learning_rate": 3.310835463117942e-06
},
{
"epoch": 2.9176043557168785,
"grad_norm": 0.48789242095969204,
"learning_rate": 3.310835463117942e-06,
"loss": 0.2392,
"step": 253,
"ts_encoder_learning_rate": 3.273174727893463e-06
},
{
"epoch": 2.9292196007259528,
"grad_norm": 0.5163396778042415,
"learning_rate": 3.273174727893463e-06,
"loss": 0.2392,
"step": 254,
"ts_encoder_learning_rate": 3.235624903429374e-06
},
{
"epoch": 2.940834845735027,
"grad_norm": 0.4839363209051733,
"learning_rate": 3.235624903429374e-06,
"loss": 0.2294,
"step": 255,
"ts_encoder_learning_rate": 3.198188401480734e-06
},
{
"epoch": 2.952450090744102,
"grad_norm": 0.5099295694573828,
"learning_rate": 3.198188401480734e-06,
"loss": 0.214,
"step": 256,
"ts_encoder_learning_rate": 3.160867626524107e-06
},
{
"epoch": 2.964065335753176,
"grad_norm": 0.52866992195366,
"learning_rate": 3.160867626524107e-06,
"loss": 0.232,
"step": 257,
"ts_encoder_learning_rate": 3.12366497560313e-06
},
{
"epoch": 2.9756805807622504,
"grad_norm": 0.5016653572033554,
"learning_rate": 3.12366497560313e-06,
"loss": 0.2477,
"step": 258,
"ts_encoder_learning_rate": 3.0865828381745515e-06
},
{
"epoch": 2.9872958257713247,
"grad_norm": 0.5704722771230331,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.2215,
"step": 259,
"ts_encoder_learning_rate": 3.049623595954766e-06
},
{
"epoch": 2.9989110707803994,
"grad_norm": 0.5049693668147675,
"learning_rate": 3.049623595954766e-06,
"loss": 0.2239,
"step": 260,
"ts_encoder_learning_rate": 3.0127896227668367e-06
},
{
"epoch": 3.0,
"grad_norm": 0.5049693668147675,
"learning_rate": 3.0127896227668367e-06,
"loss": 0.0215,
"step": 261,
"ts_encoder_learning_rate": 2.976083284388031e-06
},
{
"epoch": 3.0116152450090743,
"grad_norm": 0.6043291464224838,
"learning_rate": 2.976083284388031e-06,
"loss": 0.2239,
"step": 262,
"ts_encoder_learning_rate": 2.9395069383978725e-06
},
{
"epoch": 3.023230490018149,
"grad_norm": 0.46138071995137575,
"learning_rate": 2.9395069383978725e-06,
"loss": 0.1959,
"step": 263,
"ts_encoder_learning_rate": 2.9030629340267165e-06
},
{
"epoch": 3.0348457350272233,
"grad_norm": 0.6605611111234634,
"learning_rate": 2.9030629340267165e-06,
"loss": 0.2223,
"step": 264,
"ts_encoder_learning_rate": 2.8667536120048616e-06
},
{
"epoch": 3.0464609800362976,
"grad_norm": 0.4886966776948408,
"learning_rate": 2.8667536120048616e-06,
"loss": 0.2147,
"step": 265,
"ts_encoder_learning_rate": 2.83058130441221e-06
},
{
"epoch": 3.058076225045372,
"grad_norm": 0.603343292555356,
"learning_rate": 2.83058130441221e-06,
"loss": 0.2122,
"step": 266,
"ts_encoder_learning_rate": 2.794548334528486e-06
},
{
"epoch": 3.0696914700544466,
"grad_norm": 0.46864057728976466,
"learning_rate": 2.794548334528486e-06,
"loss": 0.187,
"step": 267,
"ts_encoder_learning_rate": 2.7586570166840154e-06
},
{
"epoch": 3.081306715063521,
"grad_norm": 0.5904508040534621,
"learning_rate": 2.7586570166840154e-06,
"loss": 0.191,
"step": 268,
"ts_encoder_learning_rate": 2.7229096561110703e-06
},
{
"epoch": 3.0929219600725952,
"grad_norm": 0.5389347532364875,
"learning_rate": 2.7229096561110703e-06,
"loss": 0.2183,
"step": 269,
"ts_encoder_learning_rate": 2.687308548795825e-06
},
{
"epoch": 3.1045372050816695,
"grad_norm": 0.5130748388642997,
"learning_rate": 2.687308548795825e-06,
"loss": 0.2239,
"step": 270,
"ts_encoder_learning_rate": 2.651855981330872e-06
},
{
"epoch": 3.1161524500907443,
"grad_norm": 0.49543600582732883,
"learning_rate": 2.651855981330872e-06,
"loss": 0.1958,
"step": 271,
"ts_encoder_learning_rate": 2.6165542307683744e-06
},
{
"epoch": 3.1277676950998186,
"grad_norm": 0.517787257194227,
"learning_rate": 2.6165542307683744e-06,
"loss": 0.2171,
"step": 272,
"ts_encoder_learning_rate": 2.5814055644738013e-06
},
{
"epoch": 3.139382940108893,
"grad_norm": 0.5435576653784301,
"learning_rate": 2.5814055644738013e-06,
"loss": 0.2144,
"step": 273,
"ts_encoder_learning_rate": 2.5464122399803126e-06
},
{
"epoch": 3.150998185117967,
"grad_norm": 0.5217681007016235,
"learning_rate": 2.5464122399803126e-06,
"loss": 0.1963,
"step": 274,
"ts_encoder_learning_rate": 2.5115765048437445e-06
},
{
"epoch": 3.162613430127042,
"grad_norm": 0.4918846481089564,
"learning_rate": 2.5115765048437445e-06,
"loss": 0.2244,
"step": 275,
"ts_encoder_learning_rate": 2.4769005964982718e-06
},
{
"epoch": 3.174228675136116,
"grad_norm": 0.4834963808627837,
"learning_rate": 2.4769005964982718e-06,
"loss": 0.2125,
"step": 276,
"ts_encoder_learning_rate": 2.4423867421126923e-06
},
{
"epoch": 3.1858439201451905,
"grad_norm": 0.4937777628741182,
"learning_rate": 2.4423867421126923e-06,
"loss": 0.2181,
"step": 277,
"ts_encoder_learning_rate": 2.408037158447375e-06
},
{
"epoch": 3.1974591651542648,
"grad_norm": 0.46282845264960937,
"learning_rate": 2.408037158447375e-06,
"loss": 0.213,
"step": 278,
"ts_encoder_learning_rate": 2.3738540517118953e-06
},
{
"epoch": 3.2090744101633395,
"grad_norm": 0.5150835355954807,
"learning_rate": 2.3738540517118953e-06,
"loss": 0.1852,
"step": 279,
"ts_encoder_learning_rate": 2.339839617423318e-06
},
{
"epoch": 3.220689655172414,
"grad_norm": 0.49870953616723984,
"learning_rate": 2.339839617423318e-06,
"loss": 0.207,
"step": 280,
"ts_encoder_learning_rate": 2.305996040265198e-06
},
{
"epoch": 3.232304900181488,
"grad_norm": 0.49174431699593274,
"learning_rate": 2.305996040265198e-06,
"loss": 0.2162,
"step": 281,
"ts_encoder_learning_rate": 2.272325493947257e-06
},
{
"epoch": 3.243920145190563,
"grad_norm": 0.4875355155859377,
"learning_rate": 2.272325493947257e-06,
"loss": 0.2243,
"step": 282,
"ts_encoder_learning_rate": 2.238830141065765e-06
},
{
"epoch": 3.255535390199637,
"grad_norm": 0.5107168259211297,
"learning_rate": 2.238830141065765e-06,
"loss": 0.201,
"step": 283,
"ts_encoder_learning_rate": 2.2055121329646416e-06
},
{
"epoch": 3.2671506352087114,
"grad_norm": 0.4819085595049116,
"learning_rate": 2.2055121329646416e-06,
"loss": 0.1981,
"step": 284,
"ts_encoder_learning_rate": 2.1723736095972946e-06
},
{
"epoch": 3.2787658802177857,
"grad_norm": 0.4477936237119145,
"learning_rate": 2.1723736095972946e-06,
"loss": 0.1959,
"step": 285,
"ts_encoder_learning_rate": 2.139416699389153e-06
},
{
"epoch": 3.2903811252268604,
"grad_norm": 0.4701102863028192,
"learning_rate": 2.139416699389153e-06,
"loss": 0.1936,
"step": 286,
"ts_encoder_learning_rate": 2.1066435191009717e-06
},
{
"epoch": 3.3019963702359347,
"grad_norm": 0.5076017279364189,
"learning_rate": 2.1066435191009717e-06,
"loss": 0.214,
"step": 287,
"ts_encoder_learning_rate": 2.074056173692881e-06
},
{
"epoch": 3.313611615245009,
"grad_norm": 0.480769257020111,
"learning_rate": 2.074056173692881e-06,
"loss": 0.1793,
"step": 288,
"ts_encoder_learning_rate": 2.041656756189184e-06
},
{
"epoch": 3.3252268602540833,
"grad_norm": 0.4957168313559018,
"learning_rate": 2.041656756189184e-06,
"loss": 0.2041,
"step": 289,
"ts_encoder_learning_rate": 2.00944734754392e-06
},
{
"epoch": 3.336842105263158,
"grad_norm": 0.4710750376178812,
"learning_rate": 2.00944734754392e-06,
"loss": 0.1945,
"step": 290,
"ts_encoder_learning_rate": 1.977430016507222e-06
},
{
"epoch": 3.3484573502722323,
"grad_norm": 0.45013646064857127,
"learning_rate": 1.977430016507222e-06,
"loss": 0.1944,
"step": 291,
"ts_encoder_learning_rate": 1.945606819492429e-06
},
{
"epoch": 3.3600725952813066,
"grad_norm": 0.4963393211559648,
"learning_rate": 1.945606819492429e-06,
"loss": 0.2029,
"step": 292,
"ts_encoder_learning_rate": 1.913979800444021e-06
},
{
"epoch": 3.371687840290381,
"grad_norm": 0.45636659614066805,
"learning_rate": 1.913979800444021e-06,
"loss": 0.2042,
"step": 293,
"ts_encoder_learning_rate": 1.8825509907063328e-06
},
{
"epoch": 3.3833030852994557,
"grad_norm": 0.48404604144273267,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.2073,
"step": 294,
"ts_encoder_learning_rate": 1.8513224088930814e-06
},
{
"epoch": 3.39491833030853,
"grad_norm": 0.4903949616722962,
"learning_rate": 1.8513224088930814e-06,
"loss": 0.2117,
"step": 295,
"ts_encoder_learning_rate": 1.8202960607577246e-06
},
{
"epoch": 3.4065335753176043,
"grad_norm": 0.4749952038477868,
"learning_rate": 1.8202960607577246e-06,
"loss": 0.2105,
"step": 296,
"ts_encoder_learning_rate": 1.7894739390646227e-06
},
{
"epoch": 3.418148820326679,
"grad_norm": 0.49308201942231306,
"learning_rate": 1.7894739390646227e-06,
"loss": 0.1854,
"step": 297,
"ts_encoder_learning_rate": 1.7588580234610592e-06
},
{
"epoch": 3.4297640653357533,
"grad_norm": 0.44330828627883645,
"learning_rate": 1.7588580234610592e-06,
"loss": 0.2049,
"step": 298,
"ts_encoder_learning_rate": 1.728450280350079e-06
},
{
"epoch": 3.4413793103448276,
"grad_norm": 0.47198428442224044,
"learning_rate": 1.728450280350079e-06,
"loss": 0.1917,
"step": 299,
"ts_encoder_learning_rate": 1.6982526627642043e-06
},
{
"epoch": 3.452994555353902,
"grad_norm": 0.47052296840051827,
"learning_rate": 1.6982526627642043e-06,
"loss": 0.1966,
"step": 300,
"ts_encoder_learning_rate": 1.6682671102399806e-06
},
{
"epoch": 3.464609800362976,
"grad_norm": 0.47469220902280884,
"learning_rate": 1.6682671102399806e-06,
"loss": 0.1993,
"step": 301,
"ts_encoder_learning_rate": 1.6384955486934157e-06
},
{
"epoch": 3.476225045372051,
"grad_norm": 0.5047215872734404,
"learning_rate": 1.6384955486934157e-06,
"loss": 0.2087,
"step": 302,
"ts_encoder_learning_rate": 1.6089398902962767e-06
},
{
"epoch": 3.487840290381125,
"grad_norm": 0.46226600784092325,
"learning_rate": 1.6089398902962767e-06,
"loss": 0.2223,
"step": 303,
"ts_encoder_learning_rate": 1.5796020333532696e-06
},
{
"epoch": 3.4994555353901995,
"grad_norm": 0.49692738160329974,
"learning_rate": 1.5796020333532696e-06,
"loss": 0.2098,
"step": 304,
"ts_encoder_learning_rate": 1.5504838621801272e-06
},
{
"epoch": 3.5110707803992742,
"grad_norm": 0.44096965404662336,
"learning_rate": 1.5504838621801272e-06,
"loss": 0.1917,
"step": 305,
"ts_encoder_learning_rate": 1.5215872469825682e-06
},
{
"epoch": 3.5226860254083485,
"grad_norm": 0.46470354099812156,
"learning_rate": 1.5215872469825682e-06,
"loss": 0.2024,
"step": 306,
"ts_encoder_learning_rate": 1.4929140437361916e-06
},
{
"epoch": 3.534301270417423,
"grad_norm": 0.6245744672781995,
"learning_rate": 1.4929140437361916e-06,
"loss": 0.1932,
"step": 307,
"ts_encoder_learning_rate": 1.4644660940672628e-06
},
{
"epoch": 3.545916515426497,
"grad_norm": 0.4616850273696799,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.1894,
"step": 308,
"ts_encoder_learning_rate": 1.4362452251344283e-06
},
{
"epoch": 3.557531760435572,
"grad_norm": 0.46072956667527437,
"learning_rate": 1.4362452251344283e-06,
"loss": 0.1939,
"step": 309,
"ts_encoder_learning_rate": 1.4082532495113627e-06
},
{
"epoch": 3.569147005444646,
"grad_norm": 0.4622635293176079,
"learning_rate": 1.4082532495113627e-06,
"loss": 0.1986,
"step": 310,
"ts_encoder_learning_rate": 1.3804919650703551e-06
},
{
"epoch": 3.5807622504537204,
"grad_norm": 0.5048261638194425,
"learning_rate": 1.3804919650703551e-06,
"loss": 0.1999,
"step": 311,
"ts_encoder_learning_rate": 1.3529631548668298e-06
},
{
"epoch": 3.592377495462795,
"grad_norm": 0.4265055997084881,
"learning_rate": 1.3529631548668298e-06,
"loss": 0.207,
"step": 312,
"ts_encoder_learning_rate": 1.3256685870248227e-06
},
{
"epoch": 3.6039927404718695,
"grad_norm": 0.43467381019959384,
"learning_rate": 1.3256685870248227e-06,
"loss": 0.1809,
"step": 313,
"ts_encoder_learning_rate": 1.298610014623423e-06
},
{
"epoch": 3.6156079854809438,
"grad_norm": 0.47840223124389564,
"learning_rate": 1.298610014623423e-06,
"loss": 0.1985,
"step": 314,
"ts_encoder_learning_rate": 1.2717891755841722e-06
},
{
"epoch": 3.627223230490018,
"grad_norm": 0.4736005082533798,
"learning_rate": 1.2717891755841722e-06,
"loss": 0.2029,
"step": 315,
"ts_encoder_learning_rate": 1.2452077925594435e-06
},
{
"epoch": 3.6388384754990923,
"grad_norm": 0.44359091955078794,
"learning_rate": 1.2452077925594435e-06,
"loss": 0.1974,
"step": 316,
"ts_encoder_learning_rate": 1.2188675728217986e-06
},
{
"epoch": 3.650453720508167,
"grad_norm": 0.4835930654461023,
"learning_rate": 1.2188675728217986e-06,
"loss": 0.2004,
"step": 317,
"ts_encoder_learning_rate": 1.1927702081543279e-06
},
{
"epoch": 3.6620689655172414,
"grad_norm": 0.4950865344011163,
"learning_rate": 1.1927702081543279e-06,
"loss": 0.1971,
"step": 318,
"ts_encoder_learning_rate": 1.166917374742e-06
},
{
"epoch": 3.6736842105263157,
"grad_norm": 0.4542349082763096,
"learning_rate": 1.166917374742e-06,
"loss": 0.2029,
"step": 319,
"ts_encoder_learning_rate": 1.141310733063991e-06
},
{
"epoch": 3.6852994555353904,
"grad_norm": 0.4722201030579586,
"learning_rate": 1.141310733063991e-06,
"loss": 0.1871,
"step": 320,
"ts_encoder_learning_rate": 1.1159519277870507e-06
},
{
"epoch": 3.6969147005444647,
"grad_norm": 0.44859200333215415,
"learning_rate": 1.1159519277870507e-06,
"loss": 0.1912,
"step": 321,
"ts_encoder_learning_rate": 1.0908425876598512e-06
},
{
"epoch": 3.708529945553539,
"grad_norm": 0.46643463691801845,
"learning_rate": 1.0908425876598512e-06,
"loss": 0.1845,
"step": 322,
"ts_encoder_learning_rate": 1.0659843254083919e-06
},
{
"epoch": 3.7201451905626133,
"grad_norm": 0.4306281262155224,
"learning_rate": 1.0659843254083919e-06,
"loss": 0.1779,
"step": 323,
"ts_encoder_learning_rate": 1.041378737632402e-06
},
{
"epoch": 3.7317604355716876,
"grad_norm": 0.4753156573309627,
"learning_rate": 1.041378737632402e-06,
"loss": 0.1989,
"step": 324,
"ts_encoder_learning_rate": 1.0170274047028068e-06
},
{
"epoch": 3.7433756805807623,
"grad_norm": 0.45875837749866927,
"learning_rate": 1.0170274047028068e-06,
"loss": 0.1912,
"step": 325,
"ts_encoder_learning_rate": 9.929318906602176e-07
},
{
"epoch": 3.7549909255898366,
"grad_norm": 0.4823250266490247,
"learning_rate": 9.929318906602176e-07,
"loss": 0.194,
"step": 326,
"ts_encoder_learning_rate": 9.690937431144725e-07
},
{
"epoch": 3.7666061705989113,
"grad_norm": 0.4354804449964891,
"learning_rate": 9.690937431144725e-07,
"loss": 0.193,
"step": 327,
"ts_encoder_learning_rate": 9.455144931452459e-07
},
{
"epoch": 3.7782214156079856,
"grad_norm": 0.4634865596978523,
"learning_rate": 9.455144931452459e-07,
"loss": 0.2046,
"step": 328,
"ts_encoder_learning_rate": 9.221956552036992e-07
},
{
"epoch": 3.78983666061706,
"grad_norm": 0.455260620824431,
"learning_rate": 9.221956552036992e-07,
"loss": 0.1939,
"step": 329,
"ts_encoder_learning_rate": 8.991387270152202e-07
},
{
"epoch": 3.801451905626134,
"grad_norm": 0.49649814336003645,
"learning_rate": 8.991387270152202e-07,
"loss": 0.2102,
"step": 330,
"ts_encoder_learning_rate": 8.76345189483222e-07
},
{
"epoch": 3.8130671506352085,
"grad_norm": 0.49114892681403455,
"learning_rate": 8.76345189483222e-07,
"loss": 0.1963,
"step": 331,
"ts_encoder_learning_rate": 8.538165065940263e-07
},
{
"epoch": 3.8246823956442833,
"grad_norm": 0.5286315843085168,
"learning_rate": 8.538165065940263e-07,
"loss": 0.2193,
"step": 332,
"ts_encoder_learning_rate": 8.315541253228332e-07
},
{
"epoch": 3.8362976406533575,
"grad_norm": 0.4405444221304671,
"learning_rate": 8.315541253228332e-07,
"loss": 0.1914,
"step": 333,
"ts_encoder_learning_rate": 8.095594755407971e-07
},
{
"epoch": 3.847912885662432,
"grad_norm": 0.4698521600536052,
"learning_rate": 8.095594755407971e-07,
"loss": 0.2235,
"step": 334,
"ts_encoder_learning_rate": 7.878339699231702e-07
},
{
"epoch": 3.8595281306715066,
"grad_norm": 0.4371598229337324,
"learning_rate": 7.878339699231702e-07,
"loss": 0.1839,
"step": 335,
"ts_encoder_learning_rate": 7.663790038585794e-07
},
{
"epoch": 3.871143375680581,
"grad_norm": 0.45962066234037935,
"learning_rate": 7.663790038585794e-07,
"loss": 0.1995,
"step": 336,
"ts_encoder_learning_rate": 7.451959553594051e-07
},
{
"epoch": 3.882758620689655,
"grad_norm": 0.42173091596380397,
"learning_rate": 7.451959553594051e-07,
"loss": 0.1827,
"step": 337,
"ts_encoder_learning_rate": 7.242861849732696e-07
},
{
"epoch": 3.8943738656987295,
"grad_norm": 0.4331430485638045,
"learning_rate": 7.242861849732696e-07,
"loss": 0.2012,
"step": 338,
"ts_encoder_learning_rate": 7.036510356956494e-07
},
{
"epoch": 3.9059891107078037,
"grad_norm": 0.4465736646292548,
"learning_rate": 7.036510356956494e-07,
"loss": 0.1989,
"step": 339,
"ts_encoder_learning_rate": 6.832918328836247e-07
},
{
"epoch": 3.9176043557168785,
"grad_norm": 0.43493507516276353,
"learning_rate": 6.832918328836247e-07,
"loss": 0.1951,
"step": 340,
"ts_encoder_learning_rate": 6.632098841707458e-07
},
{
"epoch": 3.9292196007259528,
"grad_norm": 0.42038926625601386,
"learning_rate": 6.632098841707458e-07,
"loss": 0.1915,
"step": 341,
"ts_encoder_learning_rate": 6.43406479383053e-07
},
{
"epoch": 3.940834845735027,
"grad_norm": 0.46480078485720294,
"learning_rate": 6.43406479383053e-07,
"loss": 0.1917,
"step": 342,
"ts_encoder_learning_rate": 6.238828904562316e-07
},
{
"epoch": 3.952450090744102,
"grad_norm": 0.4490758579669617,
"learning_rate": 6.238828904562316e-07,
"loss": 0.2059,
"step": 343,
"ts_encoder_learning_rate": 6.04640371353914e-07
},
{
"epoch": 3.964065335753176,
"grad_norm": 0.4613515058442109,
"learning_rate": 6.04640371353914e-07,
"loss": 0.1985,
"step": 344,
"ts_encoder_learning_rate": 5.856801579871457e-07
},
{
"epoch": 3.9756805807622504,
"grad_norm": 0.4456006110376394,
"learning_rate": 5.856801579871457e-07,
"loss": 0.2025,
"step": 345,
"ts_encoder_learning_rate": 5.670034681349995e-07
},
{
"epoch": 3.9872958257713247,
"grad_norm": 0.42933039768161857,
"learning_rate": 5.670034681349995e-07,
"loss": 0.2012,
"step": 346,
"ts_encoder_learning_rate": 5.486115013663668e-07
},
{
"epoch": 3.9989110707803994,
"grad_norm": 0.44999338968368285,
"learning_rate": 5.486115013663668e-07,
"loss": 0.187,
"step": 347,
"ts_encoder_learning_rate": 5.305054389629022e-07
},
{
"epoch": 4.0,
"grad_norm": 0.44999338968368285,
"learning_rate": 5.305054389629022e-07,
"loss": 0.0145,
"step": 348,
"ts_encoder_learning_rate": 5.126864438431628e-07
},
{
"epoch": 4.011615245009074,
"grad_norm": 0.42250491238346477,
"learning_rate": 5.126864438431628e-07,
"loss": 0.1884,
"step": 349,
"ts_encoder_learning_rate": 4.951556604879049e-07
},
{
"epoch": 4.023230490018149,
"grad_norm": 0.4230629092251735,
"learning_rate": 4.951556604879049e-07,
"loss": 0.1905,
"step": 350,
"ts_encoder_learning_rate": 4.779142148665855e-07
},
{
"epoch": 4.034845735027223,
"grad_norm": 0.4174529397738527,
"learning_rate": 4.779142148665855e-07,
"loss": 0.1658,
"step": 351,
"ts_encoder_learning_rate": 4.6096321436504e-07
},
{
"epoch": 4.046460980036298,
"grad_norm": 0.43118539472845935,
"learning_rate": 4.6096321436504e-07,
"loss": 0.184,
"step": 352,
"ts_encoder_learning_rate": 4.4430374771435245e-07
},
{
"epoch": 4.058076225045372,
"grad_norm": 0.41001720611475784,
"learning_rate": 4.4430374771435245e-07,
"loss": 0.1828,
"step": 353,
"ts_encoder_learning_rate": 4.279368849209381e-07
},
{
"epoch": 4.069691470054447,
"grad_norm": 0.4308084296921054,
"learning_rate": 4.279368849209381e-07,
"loss": 0.1964,
"step": 354,
"ts_encoder_learning_rate": 4.1186367719780737e-07
},
{
"epoch": 4.081306715063521,
"grad_norm": 0.4407009368252455,
"learning_rate": 4.1186367719780737e-07,
"loss": 0.1735,
"step": 355,
"ts_encoder_learning_rate": 3.960851568970586e-07
},
{
"epoch": 4.092921960072595,
"grad_norm": 0.41256069394964856,
"learning_rate": 3.960851568970586e-07,
"loss": 0.1911,
"step": 356,
"ts_encoder_learning_rate": 3.8060233744356634e-07
},
{
"epoch": 4.1045372050816695,
"grad_norm": 0.4221589175474974,
"learning_rate": 3.8060233744356634e-07,
"loss": 0.1863,
"step": 357,
"ts_encoder_learning_rate": 3.6541621326989183e-07
},
{
"epoch": 4.116152450090744,
"grad_norm": 0.42751127372899456,
"learning_rate": 3.6541621326989183e-07,
"loss": 0.2028,
"step": 358,
"ts_encoder_learning_rate": 3.5052775975241203e-07
},
{
"epoch": 4.127767695099818,
"grad_norm": 0.40851979892771395,
"learning_rate": 3.5052775975241203e-07,
"loss": 0.1718,
"step": 359,
"ts_encoder_learning_rate": 3.359379331486762e-07
},
{
"epoch": 4.139382940108893,
"grad_norm": 0.41178087270431546,
"learning_rate": 3.359379331486762e-07,
"loss": 0.181,
"step": 360,
"ts_encoder_learning_rate": 3.216476705359839e-07
},
{
"epoch": 4.150998185117968,
"grad_norm": 0.4104761665945451,
"learning_rate": 3.216476705359839e-07,
"loss": 0.1872,
"step": 361,
"ts_encoder_learning_rate": 3.076578897511978e-07
},
{
"epoch": 4.162613430127042,
"grad_norm": 0.41012969922765047,
"learning_rate": 3.076578897511978e-07,
"loss": 0.182,
"step": 362,
"ts_encoder_learning_rate": 2.939694893317979e-07
},
{
"epoch": 4.174228675136116,
"grad_norm": 0.41789852835375363,
"learning_rate": 2.939694893317979e-07,
"loss": 0.1937,
"step": 363,
"ts_encoder_learning_rate": 2.8058334845816214e-07
},
{
"epoch": 4.1858439201451905,
"grad_norm": 0.42143167915544566,
"learning_rate": 2.8058334845816214e-07,
"loss": 0.1838,
"step": 364,
"ts_encoder_learning_rate": 2.6750032689710604e-07
},
{
"epoch": 4.197459165154265,
"grad_norm": 0.39213432837711776,
"learning_rate": 2.6750032689710604e-07,
"loss": 0.1742,
"step": 365,
"ts_encoder_learning_rate": 2.547212649466568e-07
},
{
"epoch": 4.209074410163339,
"grad_norm": 0.3958142973041478,
"learning_rate": 2.547212649466568e-07,
"loss": 0.1919,
"step": 366,
"ts_encoder_learning_rate": 2.4224698338208344e-07
},
{
"epoch": 4.220689655172414,
"grad_norm": 0.44213215894104213,
"learning_rate": 2.4224698338208344e-07,
"loss": 0.1841,
"step": 367,
"ts_encoder_learning_rate": 2.3007828340318117e-07
},
{
"epoch": 4.2323049001814885,
"grad_norm": 0.4245037412302445,
"learning_rate": 2.3007828340318117e-07,
"loss": 0.1891,
"step": 368,
"ts_encoder_learning_rate": 2.1821594658280932e-07
},
{
"epoch": 4.243920145190563,
"grad_norm": 0.4132437127742664,
"learning_rate": 2.1821594658280932e-07,
"loss": 0.1803,
"step": 369,
"ts_encoder_learning_rate": 2.0666073481669714e-07
},
{
"epoch": 4.255535390199637,
"grad_norm": 0.411342871117412,
"learning_rate": 2.0666073481669714e-07,
"loss": 0.1812,
"step": 370,
"ts_encoder_learning_rate": 1.9541339027450256e-07
},
{
"epoch": 4.267150635208711,
"grad_norm": 0.42070511786632736,
"learning_rate": 1.9541339027450256e-07,
"loss": 0.1781,
"step": 371,
"ts_encoder_learning_rate": 1.8447463535214872e-07
},
{
"epoch": 4.278765880217786,
"grad_norm": 0.3960054704602751,
"learning_rate": 1.8447463535214872e-07,
"loss": 0.1872,
"step": 372,
"ts_encoder_learning_rate": 1.7384517262542255e-07
},
{
"epoch": 4.29038112522686,
"grad_norm": 0.4146312365888249,
"learning_rate": 1.7384517262542255e-07,
"loss": 0.1996,
"step": 373,
"ts_encoder_learning_rate": 1.6352568480485277e-07
},
{
"epoch": 4.301996370235934,
"grad_norm": 0.4302254069498354,
"learning_rate": 1.6352568480485277e-07,
"loss": 0.1799,
"step": 374,
"ts_encoder_learning_rate": 1.5351683469185973e-07
},
{
"epoch": 4.3136116152450095,
"grad_norm": 0.4048487003041662,
"learning_rate": 1.5351683469185973e-07,
"loss": 0.1853,
"step": 375,
"ts_encoder_learning_rate": 1.4381926513618139e-07
},
{
"epoch": 4.325226860254084,
"grad_norm": 0.4340645587831662,
"learning_rate": 1.4381926513618139e-07,
"loss": 0.1745,
"step": 376,
"ts_encoder_learning_rate": 1.3443359899458997e-07
},
{
"epoch": 4.336842105263158,
"grad_norm": 0.419931335758943,
"learning_rate": 1.3443359899458997e-07,
"loss": 0.2001,
"step": 377,
"ts_encoder_learning_rate": 1.253604390908819e-07
},
{
"epoch": 4.348457350272232,
"grad_norm": 0.41065254305787063,
"learning_rate": 1.253604390908819e-07,
"loss": 0.1856,
"step": 378,
"ts_encoder_learning_rate": 1.1660036817716492e-07
},
{
"epoch": 4.360072595281307,
"grad_norm": 0.4386933347694567,
"learning_rate": 1.1660036817716492e-07,
"loss": 0.1885,
"step": 379,
"ts_encoder_learning_rate": 1.0815394889642339e-07
},
{
"epoch": 4.371687840290381,
"grad_norm": 0.4206901632288436,
"learning_rate": 1.0815394889642339e-07,
"loss": 0.1855,
"step": 380,
"ts_encoder_learning_rate": 1.0002172374638519e-07
},
{
"epoch": 4.383303085299455,
"grad_norm": 0.45345577005415333,
"learning_rate": 1.0002172374638519e-07,
"loss": 0.201,
"step": 381,
"ts_encoder_learning_rate": 9.22042150446728e-08
},
{
"epoch": 4.3949183303085295,
"grad_norm": 0.42825131486091655,
"learning_rate": 9.22042150446728e-08,
"loss": 0.1977,
"step": 382,
"ts_encoder_learning_rate": 8.470192489526519e-08
},
{
"epoch": 4.406533575317605,
"grad_norm": 0.4180892476123211,
"learning_rate": 8.470192489526519e-08,
"loss": 0.1819,
"step": 383,
"ts_encoder_learning_rate": 7.7515335156238e-08
},
{
"epoch": 4.418148820326679,
"grad_norm": 0.3988898186373836,
"learning_rate": 7.7515335156238e-08,
"loss": 0.1845,
"step": 384,
"ts_encoder_learning_rate": 7.064490740882057e-08
},
{
"epoch": 4.429764065335753,
"grad_norm": 0.4224273125164809,
"learning_rate": 7.064490740882057e-08,
"loss": 0.1831,
"step": 385,
"ts_encoder_learning_rate": 6.409108292774912e-08
},
{
"epoch": 4.441379310344828,
"grad_norm": 0.40574161987282553,
"learning_rate": 6.409108292774912e-08,
"loss": 0.1884,
"step": 386,
"ts_encoder_learning_rate": 5.785428265292381e-08
},
{
"epoch": 4.452994555353902,
"grad_norm": 0.4205040886436754,
"learning_rate": 5.785428265292381e-08,
"loss": 0.1854,
"step": 387,
"ts_encoder_learning_rate": 5.1934907162370374e-08
},
{
"epoch": 4.464609800362976,
"grad_norm": 0.41648710747921297,
"learning_rate": 5.1934907162370374e-08,
"loss": 0.1756,
"step": 388,
"ts_encoder_learning_rate": 4.63333366465174e-08
},
{
"epoch": 4.4762250453720505,
"grad_norm": 0.39540267589707684,
"learning_rate": 4.63333366465174e-08,
"loss": 0.1831,
"step": 389,
"ts_encoder_learning_rate": 4.104993088376974e-08
},
{
"epoch": 4.487840290381126,
"grad_norm": 0.3927817907648021,
"learning_rate": 4.104993088376974e-08,
"loss": 0.1742,
"step": 390,
"ts_encoder_learning_rate": 3.608502921740753e-08
},
{
"epoch": 4.4994555353902,
"grad_norm": 0.43059843726946884,
"learning_rate": 3.608502921740753e-08,
"loss": 0.1922,
"step": 391,
"ts_encoder_learning_rate": 3.143895053378698e-08
},
{
"epoch": 4.511070780399274,
"grad_norm": 0.41821997175820497,
"learning_rate": 3.143895053378698e-08,
"loss": 0.188,
"step": 392,
"ts_encoder_learning_rate": 2.7111993241860646e-08
},
{
"epoch": 4.5226860254083485,
"grad_norm": 0.44898297045915464,
"learning_rate": 2.7111993241860646e-08,
"loss": 0.195,
"step": 393,
"ts_encoder_learning_rate": 2.3104435254008852e-08
},
{
"epoch": 4.534301270417423,
"grad_norm": 0.41081766017860594,
"learning_rate": 2.3104435254008852e-08,
"loss": 0.1764,
"step": 394,
"ts_encoder_learning_rate": 1.9416533968193428e-08
},
{
"epoch": 4.545916515426497,
"grad_norm": 0.4267367649318197,
"learning_rate": 1.9416533968193428e-08,
"loss": 0.178,
"step": 395,
"ts_encoder_learning_rate": 1.6048526251421502e-08
},
{
"epoch": 4.557531760435571,
"grad_norm": 0.40492762389862497,
"learning_rate": 1.6048526251421502e-08,
"loss": 0.1833,
"step": 396,
"ts_encoder_learning_rate": 1.3000628424535978e-08
},
{
"epoch": 4.569147005444647,
"grad_norm": 0.41530646634421503,
"learning_rate": 1.3000628424535978e-08,
"loss": 0.1764,
"step": 397,
"ts_encoder_learning_rate": 1.0273036248318325e-08
},
{
"epoch": 4.580762250453721,
"grad_norm": 0.41054541493317387,
"learning_rate": 1.0273036248318325e-08,
"loss": 0.1884,
"step": 398,
"ts_encoder_learning_rate": 7.865924910916977e-09
},
{
"epoch": 4.592377495462795,
"grad_norm": 0.3927284291620028,
"learning_rate": 7.865924910916977e-09,
"loss": 0.174,
"step": 399,
"ts_encoder_learning_rate": 5.779449016595773e-09
},
{
"epoch": 4.6039927404718695,
"grad_norm": 0.40810543942098576,
"learning_rate": 5.779449016595773e-09,
"loss": 0.1944,
"step": 400,
"ts_encoder_learning_rate": 4.0137425758018935e-09
},
{
"epoch": 4.6039927404718695,
"step": 400,
"total_flos": 667646607294464.0,
"train_loss": 0.43487690573791044,
"train_runtime": 29492.4665,
"train_samples_per_second": 6.944,
"train_steps_per_second": 0.014,
"ts_encoder_learning_rate": 4.0137425758018935e-09
}
],
"logging_steps": 1.0,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 667646607294464.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}