random_gr4r28O45vbB0n3v / trainer_state.json
cutelemonlili's picture
Add files using upload-large-folder tool
97daa97 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 782,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025575447570332483,
"grad_norm": 23.002245937737467,
"learning_rate": 9.999959651660741e-06,
"loss": 0.7701,
"step": 1
},
{
"epoch": 0.005115089514066497,
"grad_norm": 19.79462448378255,
"learning_rate": 9.999838607294157e-06,
"loss": 0.6713,
"step": 2
},
{
"epoch": 0.0076726342710997444,
"grad_norm": 55.54404299524967,
"learning_rate": 9.999636868853824e-06,
"loss": 0.4213,
"step": 3
},
{
"epoch": 0.010230179028132993,
"grad_norm": 26.73299552059074,
"learning_rate": 9.999354439595668e-06,
"loss": 0.448,
"step": 4
},
{
"epoch": 0.01278772378516624,
"grad_norm": 50.03710815295853,
"learning_rate": 9.998991324077906e-06,
"loss": 0.3936,
"step": 5
},
{
"epoch": 0.015345268542199489,
"grad_norm": 5.992119789491709,
"learning_rate": 9.998547528160987e-06,
"loss": 0.4206,
"step": 6
},
{
"epoch": 0.017902813299232736,
"grad_norm": 4.816601147153798,
"learning_rate": 9.998023059007477e-06,
"loss": 0.3167,
"step": 7
},
{
"epoch": 0.020460358056265986,
"grad_norm": 5.01117985790271,
"learning_rate": 9.997417925081963e-06,
"loss": 0.3277,
"step": 8
},
{
"epoch": 0.023017902813299233,
"grad_norm": 4.645354592299978,
"learning_rate": 9.996732136150902e-06,
"loss": 0.3238,
"step": 9
},
{
"epoch": 0.02557544757033248,
"grad_norm": 3.8492784258791737,
"learning_rate": 9.995965703282472e-06,
"loss": 0.2923,
"step": 10
},
{
"epoch": 0.028132992327365727,
"grad_norm": 5.08776547587877,
"learning_rate": 9.995118638846394e-06,
"loss": 0.3566,
"step": 11
},
{
"epoch": 0.030690537084398978,
"grad_norm": 5.890303471165605,
"learning_rate": 9.99419095651372e-06,
"loss": 0.3514,
"step": 12
},
{
"epoch": 0.03324808184143223,
"grad_norm": 4.820498331793851,
"learning_rate": 9.993182671256633e-06,
"loss": 0.3659,
"step": 13
},
{
"epoch": 0.03580562659846547,
"grad_norm": 5.443835157723716,
"learning_rate": 9.992093799348182e-06,
"loss": 0.3363,
"step": 14
},
{
"epoch": 0.03836317135549872,
"grad_norm": 31.850268661359863,
"learning_rate": 9.990924358362037e-06,
"loss": 0.2992,
"step": 15
},
{
"epoch": 0.04092071611253197,
"grad_norm": 4.020803137085757,
"learning_rate": 9.9896743671722e-06,
"loss": 0.3472,
"step": 16
},
{
"epoch": 0.043478260869565216,
"grad_norm": 4.704323903966358,
"learning_rate": 9.988343845952697e-06,
"loss": 0.3549,
"step": 17
},
{
"epoch": 0.04603580562659847,
"grad_norm": 4.016905077737534,
"learning_rate": 9.986932816177258e-06,
"loss": 0.2994,
"step": 18
},
{
"epoch": 0.04859335038363171,
"grad_norm": 5.055643436808016,
"learning_rate": 9.985441300618966e-06,
"loss": 0.3179,
"step": 19
},
{
"epoch": 0.05115089514066496,
"grad_norm": 4.296200137910896,
"learning_rate": 9.98386932334989e-06,
"loss": 0.3163,
"step": 20
},
{
"epoch": 0.05370843989769821,
"grad_norm": 3.6283871609346217,
"learning_rate": 9.982216909740703e-06,
"loss": 0.2571,
"step": 21
},
{
"epoch": 0.056265984654731455,
"grad_norm": 3.6813787133878213,
"learning_rate": 9.980484086460258e-06,
"loss": 0.3126,
"step": 22
},
{
"epoch": 0.058823529411764705,
"grad_norm": 4.358055860374409,
"learning_rate": 9.978670881475173e-06,
"loss": 0.2937,
"step": 23
},
{
"epoch": 0.061381074168797956,
"grad_norm": 4.477520167271448,
"learning_rate": 9.976777324049374e-06,
"loss": 0.3215,
"step": 24
},
{
"epoch": 0.0639386189258312,
"grad_norm": 3.7668725752663277,
"learning_rate": 9.974803444743617e-06,
"loss": 0.2998,
"step": 25
},
{
"epoch": 0.06649616368286446,
"grad_norm": 4.0809484824780435,
"learning_rate": 9.972749275415005e-06,
"loss": 0.3044,
"step": 26
},
{
"epoch": 0.06905370843989769,
"grad_norm": 5.413718732497824,
"learning_rate": 9.970614849216465e-06,
"loss": 0.2632,
"step": 27
},
{
"epoch": 0.07161125319693094,
"grad_norm": 3.730536919122114,
"learning_rate": 9.96840020059622e-06,
"loss": 0.3246,
"step": 28
},
{
"epoch": 0.0741687979539642,
"grad_norm": 3.731019179851398,
"learning_rate": 9.966105365297226e-06,
"loss": 0.276,
"step": 29
},
{
"epoch": 0.07672634271099744,
"grad_norm": 4.234204618475807,
"learning_rate": 9.963730380356599e-06,
"loss": 0.2954,
"step": 30
},
{
"epoch": 0.0792838874680307,
"grad_norm": 4.462805286038281,
"learning_rate": 9.96127528410502e-06,
"loss": 0.3379,
"step": 31
},
{
"epoch": 0.08184143222506395,
"grad_norm": 3.9662958792274665,
"learning_rate": 9.958740116166113e-06,
"loss": 0.2945,
"step": 32
},
{
"epoch": 0.08439897698209718,
"grad_norm": 3.571374464706402,
"learning_rate": 9.9561249174558e-06,
"loss": 0.261,
"step": 33
},
{
"epoch": 0.08695652173913043,
"grad_norm": 3.981988792102028,
"learning_rate": 9.953429730181653e-06,
"loss": 0.3081,
"step": 34
},
{
"epoch": 0.08951406649616368,
"grad_norm": 4.03802779192331,
"learning_rate": 9.950654597842209e-06,
"loss": 0.285,
"step": 35
},
{
"epoch": 0.09207161125319693,
"grad_norm": 3.5198570607375625,
"learning_rate": 9.947799565226253e-06,
"loss": 0.2842,
"step": 36
},
{
"epoch": 0.09462915601023018,
"grad_norm": 4.510788445189442,
"learning_rate": 9.944864678412118e-06,
"loss": 0.2977,
"step": 37
},
{
"epoch": 0.09718670076726342,
"grad_norm": 3.459042036170383,
"learning_rate": 9.94184998476693e-06,
"loss": 0.2482,
"step": 38
},
{
"epoch": 0.09974424552429667,
"grad_norm": 3.6104760454517026,
"learning_rate": 9.938755532945838e-06,
"loss": 0.2911,
"step": 39
},
{
"epoch": 0.10230179028132992,
"grad_norm": 3.2264924647067885,
"learning_rate": 9.93558137289124e-06,
"loss": 0.2823,
"step": 40
},
{
"epoch": 0.10485933503836317,
"grad_norm": 3.050087024580229,
"learning_rate": 9.932327555831972e-06,
"loss": 0.1963,
"step": 41
},
{
"epoch": 0.10741687979539642,
"grad_norm": 3.0298519282035836,
"learning_rate": 9.928994134282477e-06,
"loss": 0.2329,
"step": 42
},
{
"epoch": 0.10997442455242967,
"grad_norm": 3.489339177075647,
"learning_rate": 9.925581162041967e-06,
"loss": 0.2361,
"step": 43
},
{
"epoch": 0.11253196930946291,
"grad_norm": 3.173939549803925,
"learning_rate": 9.922088694193546e-06,
"loss": 0.2317,
"step": 44
},
{
"epoch": 0.11508951406649616,
"grad_norm": 2.7796494028422525,
"learning_rate": 9.918516787103322e-06,
"loss": 0.2345,
"step": 45
},
{
"epoch": 0.11764705882352941,
"grad_norm": 3.528817318108321,
"learning_rate": 9.91486549841951e-06,
"loss": 0.2543,
"step": 46
},
{
"epoch": 0.12020460358056266,
"grad_norm": 3.535704208599901,
"learning_rate": 9.911134887071477e-06,
"loss": 0.2612,
"step": 47
},
{
"epoch": 0.12276214833759591,
"grad_norm": 3.4658842882423158,
"learning_rate": 9.907325013268816e-06,
"loss": 0.2748,
"step": 48
},
{
"epoch": 0.12531969309462915,
"grad_norm": 2.630457541489405,
"learning_rate": 9.903435938500356e-06,
"loss": 0.1958,
"step": 49
},
{
"epoch": 0.1278772378516624,
"grad_norm": 2.915891246747101,
"learning_rate": 9.899467725533181e-06,
"loss": 0.2338,
"step": 50
},
{
"epoch": 0.13043478260869565,
"grad_norm": 3.40288133819665,
"learning_rate": 9.895420438411616e-06,
"loss": 0.2501,
"step": 51
},
{
"epoch": 0.1329923273657289,
"grad_norm": 3.2304039819864148,
"learning_rate": 9.89129414245618e-06,
"loss": 0.222,
"step": 52
},
{
"epoch": 0.13554987212276215,
"grad_norm": 2.911477652143155,
"learning_rate": 9.887088904262557e-06,
"loss": 0.2285,
"step": 53
},
{
"epoch": 0.13810741687979539,
"grad_norm": 3.042507664568314,
"learning_rate": 9.882804791700488e-06,
"loss": 0.2582,
"step": 54
},
{
"epoch": 0.14066496163682865,
"grad_norm": 2.980463931027759,
"learning_rate": 9.878441873912712e-06,
"loss": 0.2438,
"step": 55
},
{
"epoch": 0.1432225063938619,
"grad_norm": 3.8619552465100853,
"learning_rate": 9.87400022131382e-06,
"loss": 0.2897,
"step": 56
},
{
"epoch": 0.14578005115089515,
"grad_norm": 2.69396158196004,
"learning_rate": 9.869479905589136e-06,
"loss": 0.2023,
"step": 57
},
{
"epoch": 0.1483375959079284,
"grad_norm": 2.935987135845637,
"learning_rate": 9.864880999693551e-06,
"loss": 0.1895,
"step": 58
},
{
"epoch": 0.15089514066496162,
"grad_norm": 3.2452196053990705,
"learning_rate": 9.860203577850353e-06,
"loss": 0.2361,
"step": 59
},
{
"epoch": 0.1534526854219949,
"grad_norm": 2.87111398999677,
"learning_rate": 9.855447715550024e-06,
"loss": 0.2274,
"step": 60
},
{
"epoch": 0.15601023017902813,
"grad_norm": 2.4291351635075635,
"learning_rate": 9.850613489549018e-06,
"loss": 0.1846,
"step": 61
},
{
"epoch": 0.1585677749360614,
"grad_norm": 3.488870929695702,
"learning_rate": 9.845700977868536e-06,
"loss": 0.2709,
"step": 62
},
{
"epoch": 0.16112531969309463,
"grad_norm": 2.85759392571804,
"learning_rate": 9.840710259793251e-06,
"loss": 0.199,
"step": 63
},
{
"epoch": 0.1636828644501279,
"grad_norm": 3.7624802921070772,
"learning_rate": 9.835641415870038e-06,
"loss": 0.2974,
"step": 64
},
{
"epoch": 0.16624040920716113,
"grad_norm": 3.1043894363694924,
"learning_rate": 9.830494527906671e-06,
"loss": 0.1943,
"step": 65
},
{
"epoch": 0.16879795396419436,
"grad_norm": 3.0241718749527813,
"learning_rate": 9.825269678970502e-06,
"loss": 0.2257,
"step": 66
},
{
"epoch": 0.17135549872122763,
"grad_norm": 3.043220225169984,
"learning_rate": 9.819966953387122e-06,
"loss": 0.2404,
"step": 67
},
{
"epoch": 0.17391304347826086,
"grad_norm": 3.326803312632149,
"learning_rate": 9.814586436738998e-06,
"loss": 0.2373,
"step": 68
},
{
"epoch": 0.17647058823529413,
"grad_norm": 3.3830308282373807,
"learning_rate": 9.809128215864096e-06,
"loss": 0.2878,
"step": 69
},
{
"epoch": 0.17902813299232737,
"grad_norm": 2.6361004206605534,
"learning_rate": 9.803592378854476e-06,
"loss": 0.2244,
"step": 70
},
{
"epoch": 0.1815856777493606,
"grad_norm": 2.813271069940707,
"learning_rate": 9.797979015054868e-06,
"loss": 0.2411,
"step": 71
},
{
"epoch": 0.18414322250639387,
"grad_norm": 3.135685284217803,
"learning_rate": 9.792288215061237e-06,
"loss": 0.2383,
"step": 72
},
{
"epoch": 0.1867007672634271,
"grad_norm": 3.6498597735648017,
"learning_rate": 9.786520070719313e-06,
"loss": 0.2021,
"step": 73
},
{
"epoch": 0.18925831202046037,
"grad_norm": 2.96750507093361,
"learning_rate": 9.780674675123113e-06,
"loss": 0.2093,
"step": 74
},
{
"epoch": 0.1918158567774936,
"grad_norm": 3.2483917783877003,
"learning_rate": 9.77475212261344e-06,
"loss": 0.2647,
"step": 75
},
{
"epoch": 0.19437340153452684,
"grad_norm": 3.2378425660345065,
"learning_rate": 9.768752508776358e-06,
"loss": 0.1871,
"step": 76
},
{
"epoch": 0.1969309462915601,
"grad_norm": 2.755630840313636,
"learning_rate": 9.762675930441647e-06,
"loss": 0.2015,
"step": 77
},
{
"epoch": 0.19948849104859334,
"grad_norm": 2.8181426454401493,
"learning_rate": 9.756522485681247e-06,
"loss": 0.2403,
"step": 78
},
{
"epoch": 0.2020460358056266,
"grad_norm": 3.632918000696892,
"learning_rate": 9.750292273807666e-06,
"loss": 0.2978,
"step": 79
},
{
"epoch": 0.20460358056265984,
"grad_norm": 3.893813012751235,
"learning_rate": 9.743985395372387e-06,
"loss": 0.2615,
"step": 80
},
{
"epoch": 0.2071611253196931,
"grad_norm": 3.7796636445556815,
"learning_rate": 9.737601952164238e-06,
"loss": 0.2785,
"step": 81
},
{
"epoch": 0.20971867007672634,
"grad_norm": 3.1590261591104243,
"learning_rate": 9.73114204720775e-06,
"loss": 0.2465,
"step": 82
},
{
"epoch": 0.21227621483375958,
"grad_norm": 2.593757039460282,
"learning_rate": 9.724605784761501e-06,
"loss": 0.2303,
"step": 83
},
{
"epoch": 0.21483375959079284,
"grad_norm": 2.9633378595540463,
"learning_rate": 9.717993270316421e-06,
"loss": 0.1924,
"step": 84
},
{
"epoch": 0.21739130434782608,
"grad_norm": 2.834487770952517,
"learning_rate": 9.711304610594104e-06,
"loss": 0.2601,
"step": 85
},
{
"epoch": 0.21994884910485935,
"grad_norm": 2.393543544620746,
"learning_rate": 9.704539913545073e-06,
"loss": 0.2051,
"step": 86
},
{
"epoch": 0.22250639386189258,
"grad_norm": 2.900171504184875,
"learning_rate": 9.697699288347043e-06,
"loss": 0.1965,
"step": 87
},
{
"epoch": 0.22506393861892582,
"grad_norm": 2.384624513559865,
"learning_rate": 9.690782845403164e-06,
"loss": 0.1861,
"step": 88
},
{
"epoch": 0.22762148337595908,
"grad_norm": 2.8687734099612183,
"learning_rate": 9.683790696340229e-06,
"loss": 0.2277,
"step": 89
},
{
"epoch": 0.23017902813299232,
"grad_norm": 3.1994663958961325,
"learning_rate": 9.676722954006878e-06,
"loss": 0.2639,
"step": 90
},
{
"epoch": 0.23273657289002558,
"grad_norm": 3.5946497721000803,
"learning_rate": 9.669579732471779e-06,
"loss": 0.3063,
"step": 91
},
{
"epoch": 0.23529411764705882,
"grad_norm": 3.2126140147347058,
"learning_rate": 9.66236114702178e-06,
"loss": 0.2311,
"step": 92
},
{
"epoch": 0.23785166240409208,
"grad_norm": 2.898350923062013,
"learning_rate": 9.655067314160058e-06,
"loss": 0.2296,
"step": 93
},
{
"epoch": 0.24040920716112532,
"grad_norm": 3.377946545735314,
"learning_rate": 9.647698351604227e-06,
"loss": 0.3008,
"step": 94
},
{
"epoch": 0.24296675191815856,
"grad_norm": 3.299627018131192,
"learning_rate": 9.640254378284447e-06,
"loss": 0.2697,
"step": 95
},
{
"epoch": 0.24552429667519182,
"grad_norm": 3.11855005777831,
"learning_rate": 9.632735514341508e-06,
"loss": 0.267,
"step": 96
},
{
"epoch": 0.24808184143222506,
"grad_norm": 3.6830270996696823,
"learning_rate": 9.625141881124874e-06,
"loss": 0.3026,
"step": 97
},
{
"epoch": 0.2506393861892583,
"grad_norm": 3.908203185558239,
"learning_rate": 9.617473601190743e-06,
"loss": 0.206,
"step": 98
},
{
"epoch": 0.2531969309462916,
"grad_norm": 2.743300056729194,
"learning_rate": 9.609730798300056e-06,
"loss": 0.1799,
"step": 99
},
{
"epoch": 0.2557544757033248,
"grad_norm": 4.814431714040076,
"learning_rate": 9.601913597416513e-06,
"loss": 0.211,
"step": 100
},
{
"epoch": 0.25831202046035806,
"grad_norm": 4.268877383945361,
"learning_rate": 9.594022124704541e-06,
"loss": 0.345,
"step": 101
},
{
"epoch": 0.2608695652173913,
"grad_norm": 3.383521599612797,
"learning_rate": 9.586056507527266e-06,
"loss": 0.2137,
"step": 102
},
{
"epoch": 0.26342710997442453,
"grad_norm": 3.4109070658867338,
"learning_rate": 9.578016874444459e-06,
"loss": 0.2096,
"step": 103
},
{
"epoch": 0.2659846547314578,
"grad_norm": 4.44984138610315,
"learning_rate": 9.569903355210457e-06,
"loss": 0.181,
"step": 104
},
{
"epoch": 0.26854219948849106,
"grad_norm": 3.14837415888387,
"learning_rate": 9.561716080772072e-06,
"loss": 0.2379,
"step": 105
},
{
"epoch": 0.2710997442455243,
"grad_norm": 3.538274906585293,
"learning_rate": 9.55345518326647e-06,
"loss": 0.2866,
"step": 106
},
{
"epoch": 0.27365728900255754,
"grad_norm": 4.12006215133929,
"learning_rate": 9.545120796019056e-06,
"loss": 0.2089,
"step": 107
},
{
"epoch": 0.27621483375959077,
"grad_norm": 3.7367349991712984,
"learning_rate": 9.5367130535413e-06,
"loss": 0.2824,
"step": 108
},
{
"epoch": 0.27877237851662406,
"grad_norm": 24.188026787197614,
"learning_rate": 9.528232091528578e-06,
"loss": 0.2343,
"step": 109
},
{
"epoch": 0.2813299232736573,
"grad_norm": 5.518174949495067,
"learning_rate": 9.519678046857987e-06,
"loss": 0.2359,
"step": 110
},
{
"epoch": 0.28388746803069054,
"grad_norm": 3.1735804164552444,
"learning_rate": 9.511051057586125e-06,
"loss": 0.2662,
"step": 111
},
{
"epoch": 0.2864450127877238,
"grad_norm": 2.6308622708865803,
"learning_rate": 9.502351262946865e-06,
"loss": 0.2315,
"step": 112
},
{
"epoch": 0.289002557544757,
"grad_norm": 3.545647642738695,
"learning_rate": 9.493578803349117e-06,
"loss": 0.1944,
"step": 113
},
{
"epoch": 0.2915601023017903,
"grad_norm": 3.5521955511759944,
"learning_rate": 9.48473382037455e-06,
"loss": 0.2626,
"step": 114
},
{
"epoch": 0.29411764705882354,
"grad_norm": 4.054758472454413,
"learning_rate": 9.475816456775313e-06,
"loss": 0.2714,
"step": 115
},
{
"epoch": 0.2966751918158568,
"grad_norm": 8.230492979964824,
"learning_rate": 9.466826856471728e-06,
"loss": 0.2479,
"step": 116
},
{
"epoch": 0.29923273657289,
"grad_norm": 3.6339322967957037,
"learning_rate": 9.457765164549979e-06,
"loss": 0.2345,
"step": 117
},
{
"epoch": 0.30179028132992325,
"grad_norm": 297.39275307922753,
"learning_rate": 9.448631527259749e-06,
"loss": 0.3842,
"step": 118
},
{
"epoch": 0.30434782608695654,
"grad_norm": 5.3234836238777,
"learning_rate": 9.439426092011877e-06,
"loss": 0.1926,
"step": 119
},
{
"epoch": 0.3069053708439898,
"grad_norm": 46.01851594690418,
"learning_rate": 9.430149007375974e-06,
"loss": 0.2565,
"step": 120
},
{
"epoch": 0.309462915601023,
"grad_norm": 5.302182848361836,
"learning_rate": 9.42080042307802e-06,
"loss": 0.2408,
"step": 121
},
{
"epoch": 0.31202046035805625,
"grad_norm": 2.578340943631881,
"learning_rate": 9.411380489997962e-06,
"loss": 0.1934,
"step": 122
},
{
"epoch": 0.3145780051150895,
"grad_norm": 3.9625100814656786,
"learning_rate": 9.401889360167256e-06,
"loss": 0.2653,
"step": 123
},
{
"epoch": 0.3171355498721228,
"grad_norm": 2.767266370495417,
"learning_rate": 9.392327186766434e-06,
"loss": 0.2236,
"step": 124
},
{
"epoch": 0.319693094629156,
"grad_norm": 2.366592695821496,
"learning_rate": 9.382694124122624e-06,
"loss": 0.2089,
"step": 125
},
{
"epoch": 0.32225063938618925,
"grad_norm": 2.366756458486761,
"learning_rate": 9.372990327707057e-06,
"loss": 0.184,
"step": 126
},
{
"epoch": 0.3248081841432225,
"grad_norm": 2.7874992191290815,
"learning_rate": 9.36321595413256e-06,
"loss": 0.2171,
"step": 127
},
{
"epoch": 0.3273657289002558,
"grad_norm": 2.8285353939596978,
"learning_rate": 9.353371161151032e-06,
"loss": 0.2968,
"step": 128
},
{
"epoch": 0.329923273657289,
"grad_norm": 2.614287807796315,
"learning_rate": 9.34345610765089e-06,
"loss": 0.1734,
"step": 129
},
{
"epoch": 0.33248081841432225,
"grad_norm": 3.3261759665055326,
"learning_rate": 9.333470953654513e-06,
"loss": 0.2976,
"step": 130
},
{
"epoch": 0.3350383631713555,
"grad_norm": 2.6928328036310862,
"learning_rate": 9.32341586031565e-06,
"loss": 0.2781,
"step": 131
},
{
"epoch": 0.3375959079283887,
"grad_norm": 3.251358150140968,
"learning_rate": 9.31329098991683e-06,
"loss": 0.2767,
"step": 132
},
{
"epoch": 0.340153452685422,
"grad_norm": 3.1871135167369418,
"learning_rate": 9.303096505866734e-06,
"loss": 0.2442,
"step": 133
},
{
"epoch": 0.34271099744245526,
"grad_norm": 2.5681474052955444,
"learning_rate": 9.292832572697566e-06,
"loss": 0.2291,
"step": 134
},
{
"epoch": 0.3452685421994885,
"grad_norm": 2.3640353419986506,
"learning_rate": 9.282499356062385e-06,
"loss": 0.2271,
"step": 135
},
{
"epoch": 0.34782608695652173,
"grad_norm": 2.8099850692384134,
"learning_rate": 9.272097022732444e-06,
"loss": 0.2093,
"step": 136
},
{
"epoch": 0.35038363171355497,
"grad_norm": 3.049978602373718,
"learning_rate": 9.261625740594494e-06,
"loss": 0.2372,
"step": 137
},
{
"epoch": 0.35294117647058826,
"grad_norm": 3.137496790956829,
"learning_rate": 9.251085678648072e-06,
"loss": 0.2506,
"step": 138
},
{
"epoch": 0.3554987212276215,
"grad_norm": 2.867267066152733,
"learning_rate": 9.240477007002777e-06,
"loss": 0.2147,
"step": 139
},
{
"epoch": 0.35805626598465473,
"grad_norm": 3.3365434423871125,
"learning_rate": 9.22979989687552e-06,
"loss": 0.3041,
"step": 140
},
{
"epoch": 0.36061381074168797,
"grad_norm": 2.9940933133119563,
"learning_rate": 9.219054520587766e-06,
"loss": 0.1847,
"step": 141
},
{
"epoch": 0.3631713554987212,
"grad_norm": 2.4723991968002976,
"learning_rate": 9.208241051562753e-06,
"loss": 0.228,
"step": 142
},
{
"epoch": 0.3657289002557545,
"grad_norm": 3.305464271619873,
"learning_rate": 9.197359664322684e-06,
"loss": 0.2376,
"step": 143
},
{
"epoch": 0.36828644501278773,
"grad_norm": 2.8768687198709935,
"learning_rate": 9.186410534485924e-06,
"loss": 0.2564,
"step": 144
},
{
"epoch": 0.37084398976982097,
"grad_norm": 2.478708936880423,
"learning_rate": 9.175393838764153e-06,
"loss": 0.1982,
"step": 145
},
{
"epoch": 0.3734015345268542,
"grad_norm": 2.3943362655534215,
"learning_rate": 9.164309754959523e-06,
"loss": 0.2107,
"step": 146
},
{
"epoch": 0.37595907928388744,
"grad_norm": 2.6390423103142604,
"learning_rate": 9.153158461961782e-06,
"loss": 0.2024,
"step": 147
},
{
"epoch": 0.37851662404092073,
"grad_norm": 2.5371647794977874,
"learning_rate": 9.14194013974539e-06,
"loss": 0.1916,
"step": 148
},
{
"epoch": 0.38107416879795397,
"grad_norm": 2.458057040535335,
"learning_rate": 9.130654969366619e-06,
"loss": 0.2228,
"step": 149
},
{
"epoch": 0.3836317135549872,
"grad_norm": 2.5637807307196736,
"learning_rate": 9.11930313296062e-06,
"loss": 0.212,
"step": 150
},
{
"epoch": 0.38618925831202044,
"grad_norm": 3.123041884855774,
"learning_rate": 9.107884813738492e-06,
"loss": 0.2739,
"step": 151
},
{
"epoch": 0.3887468030690537,
"grad_norm": 2.4409380929261904,
"learning_rate": 9.096400195984322e-06,
"loss": 0.1773,
"step": 152
},
{
"epoch": 0.391304347826087,
"grad_norm": 3.0511733344744982,
"learning_rate": 9.08484946505221e-06,
"loss": 0.2202,
"step": 153
},
{
"epoch": 0.3938618925831202,
"grad_norm": 2.881737174801456,
"learning_rate": 9.073232807363283e-06,
"loss": 0.2198,
"step": 154
},
{
"epoch": 0.39641943734015345,
"grad_norm": 3.5525444573087652,
"learning_rate": 9.061550410402677e-06,
"loss": 0.2939,
"step": 155
},
{
"epoch": 0.3989769820971867,
"grad_norm": 3.3680781528749604,
"learning_rate": 9.049802462716521e-06,
"loss": 0.2434,
"step": 156
},
{
"epoch": 0.40153452685422,
"grad_norm": 3.939196043495901,
"learning_rate": 9.037989153908882e-06,
"loss": 0.2628,
"step": 157
},
{
"epoch": 0.4040920716112532,
"grad_norm": 2.5079359353983155,
"learning_rate": 9.026110674638722e-06,
"loss": 0.2171,
"step": 158
},
{
"epoch": 0.40664961636828645,
"grad_norm": 2.593975182068212,
"learning_rate": 9.0141672166168e-06,
"loss": 0.1617,
"step": 159
},
{
"epoch": 0.4092071611253197,
"grad_norm": 3.0574104006787604,
"learning_rate": 9.002158972602599e-06,
"loss": 0.3072,
"step": 160
},
{
"epoch": 0.4117647058823529,
"grad_norm": 2.948058496064816,
"learning_rate": 8.990086136401199e-06,
"loss": 0.2385,
"step": 161
},
{
"epoch": 0.4143222506393862,
"grad_norm": 3.702960642790453,
"learning_rate": 8.977948902860154e-06,
"loss": 0.2847,
"step": 162
},
{
"epoch": 0.41687979539641945,
"grad_norm": 2.669781467285976,
"learning_rate": 8.965747467866355e-06,
"loss": 0.1822,
"step": 163
},
{
"epoch": 0.4194373401534527,
"grad_norm": 2.6806911897560295,
"learning_rate": 8.953482028342853e-06,
"loss": 0.2796,
"step": 164
},
{
"epoch": 0.4219948849104859,
"grad_norm": 2.5442421084289255,
"learning_rate": 8.9411527822457e-06,
"loss": 0.1958,
"step": 165
},
{
"epoch": 0.42455242966751916,
"grad_norm": 2.9096327300999345,
"learning_rate": 8.92875992856073e-06,
"loss": 0.2634,
"step": 166
},
{
"epoch": 0.42710997442455245,
"grad_norm": 2.285111815505967,
"learning_rate": 8.916303667300373e-06,
"loss": 0.1844,
"step": 167
},
{
"epoch": 0.4296675191815857,
"grad_norm": 3.143641298577265,
"learning_rate": 8.903784199500412e-06,
"loss": 0.22,
"step": 168
},
{
"epoch": 0.4322250639386189,
"grad_norm": 3.1734443032455126,
"learning_rate": 8.89120172721674e-06,
"loss": 0.2289,
"step": 169
},
{
"epoch": 0.43478260869565216,
"grad_norm": 3.236825673549631,
"learning_rate": 8.8785564535221e-06,
"loss": 0.2206,
"step": 170
},
{
"epoch": 0.4373401534526854,
"grad_norm": 2.993422962262724,
"learning_rate": 8.86584858250281e-06,
"loss": 0.2355,
"step": 171
},
{
"epoch": 0.4398976982097187,
"grad_norm": 2.8378682738193826,
"learning_rate": 8.853078319255466e-06,
"loss": 0.2477,
"step": 172
},
{
"epoch": 0.4424552429667519,
"grad_norm": 2.6712684943657403,
"learning_rate": 8.840245869883635e-06,
"loss": 0.2457,
"step": 173
},
{
"epoch": 0.44501278772378516,
"grad_norm": 2.171829542476822,
"learning_rate": 8.827351441494525e-06,
"loss": 0.2035,
"step": 174
},
{
"epoch": 0.4475703324808184,
"grad_norm": 3.227921740305447,
"learning_rate": 8.814395242195642e-06,
"loss": 0.2979,
"step": 175
},
{
"epoch": 0.45012787723785164,
"grad_norm": 2.7461113116814926,
"learning_rate": 8.80137748109144e-06,
"loss": 0.2195,
"step": 176
},
{
"epoch": 0.45268542199488493,
"grad_norm": 1.9489364216184313,
"learning_rate": 8.78829836827993e-06,
"loss": 0.1745,
"step": 177
},
{
"epoch": 0.45524296675191817,
"grad_norm": 2.390111931670749,
"learning_rate": 8.77515811484931e-06,
"loss": 0.2287,
"step": 178
},
{
"epoch": 0.4578005115089514,
"grad_norm": 2.32280460729142,
"learning_rate": 8.761956932874539e-06,
"loss": 0.1771,
"step": 179
},
{
"epoch": 0.46035805626598464,
"grad_norm": 2.7594731738765237,
"learning_rate": 8.748695035413925e-06,
"loss": 0.2272,
"step": 180
},
{
"epoch": 0.4629156010230179,
"grad_norm": 2.2666034081466737,
"learning_rate": 8.735372636505681e-06,
"loss": 0.2167,
"step": 181
},
{
"epoch": 0.46547314578005117,
"grad_norm": 2.427582658847167,
"learning_rate": 8.72198995116448e-06,
"loss": 0.24,
"step": 182
},
{
"epoch": 0.4680306905370844,
"grad_norm": 2.5568780604163326,
"learning_rate": 8.708547195377968e-06,
"loss": 0.3001,
"step": 183
},
{
"epoch": 0.47058823529411764,
"grad_norm": 2.8819289669178163,
"learning_rate": 8.695044586103297e-06,
"loss": 0.2397,
"step": 184
},
{
"epoch": 0.4731457800511509,
"grad_norm": 3.867315677781016,
"learning_rate": 8.68148234126361e-06,
"loss": 0.2899,
"step": 185
},
{
"epoch": 0.47570332480818417,
"grad_norm": 2.804180003869186,
"learning_rate": 8.667860679744529e-06,
"loss": 0.2094,
"step": 186
},
{
"epoch": 0.4782608695652174,
"grad_norm": 2.7034209359291315,
"learning_rate": 8.65417982139062e-06,
"loss": 0.2324,
"step": 187
},
{
"epoch": 0.48081841432225064,
"grad_norm": 2.2812365119673874,
"learning_rate": 8.640439987001855e-06,
"loss": 0.2029,
"step": 188
},
{
"epoch": 0.4833759590792839,
"grad_norm": 2.6299858026479517,
"learning_rate": 8.626641398330027e-06,
"loss": 0.2137,
"step": 189
},
{
"epoch": 0.4859335038363171,
"grad_norm": 2.1221936568568465,
"learning_rate": 8.612784278075195e-06,
"loss": 0.2144,
"step": 190
},
{
"epoch": 0.4884910485933504,
"grad_norm": 2.6431227938147783,
"learning_rate": 8.598868849882074e-06,
"loss": 0.2368,
"step": 191
},
{
"epoch": 0.49104859335038364,
"grad_norm": 2.4185335606377945,
"learning_rate": 8.58489533833643e-06,
"loss": 0.2053,
"step": 192
},
{
"epoch": 0.4936061381074169,
"grad_norm": 2.205243127504733,
"learning_rate": 8.570863968961456e-06,
"loss": 0.1629,
"step": 193
},
{
"epoch": 0.4961636828644501,
"grad_norm": 1.9725752515599342,
"learning_rate": 8.556774968214134e-06,
"loss": 0.2101,
"step": 194
},
{
"epoch": 0.49872122762148335,
"grad_norm": 2.5299942102683035,
"learning_rate": 8.542628563481577e-06,
"loss": 0.2333,
"step": 195
},
{
"epoch": 0.5012787723785166,
"grad_norm": 2.41472732044959,
"learning_rate": 8.52842498307736e-06,
"loss": 0.2526,
"step": 196
},
{
"epoch": 0.5038363171355499,
"grad_norm": 2.8335636174145593,
"learning_rate": 8.514164456237835e-06,
"loss": 0.25,
"step": 197
},
{
"epoch": 0.5063938618925832,
"grad_norm": 2.793873822554608,
"learning_rate": 8.499847213118431e-06,
"loss": 0.2622,
"step": 198
},
{
"epoch": 0.5089514066496164,
"grad_norm": 2.7012530682897378,
"learning_rate": 8.485473484789944e-06,
"loss": 0.2666,
"step": 199
},
{
"epoch": 0.5115089514066496,
"grad_norm": 2.2375978558847387,
"learning_rate": 8.471043503234796e-06,
"loss": 0.2344,
"step": 200
},
{
"epoch": 0.5140664961636828,
"grad_norm": 3.1799564884853146,
"learning_rate": 8.45655750134331e-06,
"loss": 0.2335,
"step": 201
},
{
"epoch": 0.5166240409207161,
"grad_norm": 2.0962041202170654,
"learning_rate": 8.442015712909926e-06,
"loss": 0.1948,
"step": 202
},
{
"epoch": 0.5191815856777494,
"grad_norm": 2.372975255887274,
"learning_rate": 8.427418372629456e-06,
"loss": 0.2071,
"step": 203
},
{
"epoch": 0.5217391304347826,
"grad_norm": 2.555304358266548,
"learning_rate": 8.412765716093273e-06,
"loss": 0.2152,
"step": 204
},
{
"epoch": 0.5242966751918159,
"grad_norm": 1.9638554898256244,
"learning_rate": 8.398057979785515e-06,
"loss": 0.1731,
"step": 205
},
{
"epoch": 0.5268542199488491,
"grad_norm": 2.722593380051442,
"learning_rate": 8.383295401079284e-06,
"loss": 0.2152,
"step": 206
},
{
"epoch": 0.5294117647058824,
"grad_norm": 2.718713160656891,
"learning_rate": 8.368478218232787e-06,
"loss": 0.2458,
"step": 207
},
{
"epoch": 0.5319693094629157,
"grad_norm": 2.205180512995315,
"learning_rate": 8.353606670385514e-06,
"loss": 0.1922,
"step": 208
},
{
"epoch": 0.5345268542199488,
"grad_norm": 2.099261923690752,
"learning_rate": 8.338680997554372e-06,
"loss": 0.1998,
"step": 209
},
{
"epoch": 0.5370843989769821,
"grad_norm": 2.132007011749811,
"learning_rate": 8.3237014406298e-06,
"loss": 0.2043,
"step": 210
},
{
"epoch": 0.5396419437340153,
"grad_norm": 2.290560761638665,
"learning_rate": 8.308668241371897e-06,
"loss": 0.1652,
"step": 211
},
{
"epoch": 0.5421994884910486,
"grad_norm": 2.12644563499644,
"learning_rate": 8.293581642406517e-06,
"loss": 0.1843,
"step": 212
},
{
"epoch": 0.5447570332480819,
"grad_norm": 3.723287563479075,
"learning_rate": 8.278441887221338e-06,
"loss": 0.3019,
"step": 213
},
{
"epoch": 0.5473145780051151,
"grad_norm": 2.2912570816048126,
"learning_rate": 8.263249220161957e-06,
"loss": 0.1796,
"step": 214
},
{
"epoch": 0.5498721227621484,
"grad_norm": 2.260603019533916,
"learning_rate": 8.248003886427927e-06,
"loss": 0.1961,
"step": 215
},
{
"epoch": 0.5524296675191815,
"grad_norm": 2.257255113410867,
"learning_rate": 8.232706132068806e-06,
"loss": 0.1275,
"step": 216
},
{
"epoch": 0.5549872122762148,
"grad_norm": 3.0673877778681113,
"learning_rate": 8.217356203980187e-06,
"loss": 0.1877,
"step": 217
},
{
"epoch": 0.5575447570332481,
"grad_norm": 2.1489605272466634,
"learning_rate": 8.201954349899712e-06,
"loss": 0.2175,
"step": 218
},
{
"epoch": 0.5601023017902813,
"grad_norm": 2.1580989041266676,
"learning_rate": 8.186500818403076e-06,
"loss": 0.1388,
"step": 219
},
{
"epoch": 0.5626598465473146,
"grad_norm": 2.943597219795099,
"learning_rate": 8.17099585890001e-06,
"loss": 0.2643,
"step": 220
},
{
"epoch": 0.5652173913043478,
"grad_norm": 1.9908772203041976,
"learning_rate": 8.155439721630265e-06,
"loss": 0.1636,
"step": 221
},
{
"epoch": 0.5677749360613811,
"grad_norm": 2.4027039208022916,
"learning_rate": 8.139832657659557e-06,
"loss": 0.2044,
"step": 222
},
{
"epoch": 0.5703324808184144,
"grad_norm": 2.651295860517237,
"learning_rate": 8.124174918875532e-06,
"loss": 0.2978,
"step": 223
},
{
"epoch": 0.5728900255754475,
"grad_norm": 2.7727762126746756,
"learning_rate": 8.108466757983695e-06,
"loss": 0.2093,
"step": 224
},
{
"epoch": 0.5754475703324808,
"grad_norm": 2.2636479281802044,
"learning_rate": 8.092708428503324e-06,
"loss": 0.1748,
"step": 225
},
{
"epoch": 0.578005115089514,
"grad_norm": 3.511479308858075,
"learning_rate": 8.076900184763394e-06,
"loss": 0.2064,
"step": 226
},
{
"epoch": 0.5805626598465473,
"grad_norm": 2.3586737252559793,
"learning_rate": 8.061042281898453e-06,
"loss": 0.2045,
"step": 227
},
{
"epoch": 0.5831202046035806,
"grad_norm": 2.62506739236279,
"learning_rate": 8.04513497584452e-06,
"loss": 0.2069,
"step": 228
},
{
"epoch": 0.5856777493606138,
"grad_norm": 2.804637992821702,
"learning_rate": 8.02917852333495e-06,
"loss": 0.2787,
"step": 229
},
{
"epoch": 0.5882352941176471,
"grad_norm": 2.5840154931492387,
"learning_rate": 8.013173181896283e-06,
"loss": 0.2566,
"step": 230
},
{
"epoch": 0.5907928388746803,
"grad_norm": 2.384157388224994,
"learning_rate": 7.9971192098441e-06,
"loss": 0.1297,
"step": 231
},
{
"epoch": 0.5933503836317136,
"grad_norm": 3.616384836402104,
"learning_rate": 7.981016866278843e-06,
"loss": 0.2089,
"step": 232
},
{
"epoch": 0.5959079283887468,
"grad_norm": 2.557601987276289,
"learning_rate": 7.964866411081645e-06,
"loss": 0.2178,
"step": 233
},
{
"epoch": 0.59846547314578,
"grad_norm": 2.3704655232441154,
"learning_rate": 7.94866810491012e-06,
"loss": 0.2102,
"step": 234
},
{
"epoch": 0.6010230179028133,
"grad_norm": 2.5523092935514566,
"learning_rate": 7.93242220919417e-06,
"loss": 0.2189,
"step": 235
},
{
"epoch": 0.6035805626598465,
"grad_norm": 2.1709054016859493,
"learning_rate": 7.916128986131761e-06,
"loss": 0.1908,
"step": 236
},
{
"epoch": 0.6061381074168798,
"grad_norm": 2.068222190576718,
"learning_rate": 7.899788698684687e-06,
"loss": 0.1975,
"step": 237
},
{
"epoch": 0.6086956521739131,
"grad_norm": 2.836960021200559,
"learning_rate": 7.883401610574338e-06,
"loss": 0.255,
"step": 238
},
{
"epoch": 0.6112531969309463,
"grad_norm": 1.8883272553824537,
"learning_rate": 7.866967986277423e-06,
"loss": 0.1455,
"step": 239
},
{
"epoch": 0.6138107416879796,
"grad_norm": 2.4755576248259192,
"learning_rate": 7.850488091021726e-06,
"loss": 0.2442,
"step": 240
},
{
"epoch": 0.6163682864450127,
"grad_norm": 2.802725658457541,
"learning_rate": 7.833962190781809e-06,
"loss": 0.2283,
"step": 241
},
{
"epoch": 0.618925831202046,
"grad_norm": 1.998218824904067,
"learning_rate": 7.817390552274721e-06,
"loss": 0.1389,
"step": 242
},
{
"epoch": 0.6214833759590793,
"grad_norm": 2.6781876390883768,
"learning_rate": 7.800773442955703e-06,
"loss": 0.2238,
"step": 243
},
{
"epoch": 0.6240409207161125,
"grad_norm": 2.564654638673255,
"learning_rate": 7.784111131013858e-06,
"loss": 0.2071,
"step": 244
},
{
"epoch": 0.6265984654731458,
"grad_norm": 2.3286664497026903,
"learning_rate": 7.767403885367832e-06,
"loss": 0.1797,
"step": 245
},
{
"epoch": 0.629156010230179,
"grad_norm": 2.940961657565362,
"learning_rate": 7.750651975661471e-06,
"loss": 0.2773,
"step": 246
},
{
"epoch": 0.6317135549872123,
"grad_norm": 2.39717318483795,
"learning_rate": 7.733855672259472e-06,
"loss": 0.2242,
"step": 247
},
{
"epoch": 0.6342710997442456,
"grad_norm": 2.018378796967046,
"learning_rate": 7.717015246243012e-06,
"loss": 0.1585,
"step": 248
},
{
"epoch": 0.6368286445012787,
"grad_norm": 3.043878287662867,
"learning_rate": 7.700130969405377e-06,
"loss": 0.223,
"step": 249
},
{
"epoch": 0.639386189258312,
"grad_norm": 2.8728908638133195,
"learning_rate": 7.683203114247587e-06,
"loss": 0.2143,
"step": 250
},
{
"epoch": 0.6419437340153452,
"grad_norm": 2.6954559773855706,
"learning_rate": 7.66623195397397e-06,
"loss": 0.2254,
"step": 251
},
{
"epoch": 0.6445012787723785,
"grad_norm": 2.7134657682436236,
"learning_rate": 7.649217762487786e-06,
"loss": 0.2367,
"step": 252
},
{
"epoch": 0.6470588235294118,
"grad_norm": 2.3389717040321525,
"learning_rate": 7.63216081438678e-06,
"loss": 0.1965,
"step": 253
},
{
"epoch": 0.649616368286445,
"grad_norm": 1.9874680601076997,
"learning_rate": 7.615061384958764e-06,
"loss": 0.2062,
"step": 254
},
{
"epoch": 0.6521739130434783,
"grad_norm": 2.9050691223424474,
"learning_rate": 7.597919750177168e-06,
"loss": 0.226,
"step": 255
},
{
"epoch": 0.6547314578005116,
"grad_norm": 2.304447009536863,
"learning_rate": 7.580736186696593e-06,
"loss": 0.1913,
"step": 256
},
{
"epoch": 0.6572890025575447,
"grad_norm": 3.149415595365879,
"learning_rate": 7.563510971848339e-06,
"loss": 0.2187,
"step": 257
},
{
"epoch": 0.659846547314578,
"grad_norm": 2.3690500597631843,
"learning_rate": 7.546244383635929e-06,
"loss": 0.25,
"step": 258
},
{
"epoch": 0.6624040920716112,
"grad_norm": 2.430504175852079,
"learning_rate": 7.528936700730627e-06,
"loss": 0.2277,
"step": 259
},
{
"epoch": 0.6649616368286445,
"grad_norm": 2.270735698643305,
"learning_rate": 7.5115882024669375e-06,
"loss": 0.1974,
"step": 260
},
{
"epoch": 0.6675191815856778,
"grad_norm": 2.4619767379298296,
"learning_rate": 7.494199168838099e-06,
"loss": 0.2642,
"step": 261
},
{
"epoch": 0.670076726342711,
"grad_norm": 2.7725591682853543,
"learning_rate": 7.476769880491561e-06,
"loss": 0.2511,
"step": 262
},
{
"epoch": 0.6726342710997443,
"grad_norm": 2.334392166881636,
"learning_rate": 7.459300618724462e-06,
"loss": 0.1837,
"step": 263
},
{
"epoch": 0.6751918158567775,
"grad_norm": 3.6092024144297405,
"learning_rate": 7.44179166547908e-06,
"loss": 0.2718,
"step": 264
},
{
"epoch": 0.6777493606138107,
"grad_norm": 2.5798015356358266,
"learning_rate": 7.42424330333829e-06,
"loss": 0.2475,
"step": 265
},
{
"epoch": 0.680306905370844,
"grad_norm": 2.33242353641567,
"learning_rate": 7.406655815520998e-06,
"loss": 0.2052,
"step": 266
},
{
"epoch": 0.6828644501278772,
"grad_norm": 2.8947250508158984,
"learning_rate": 7.389029485877577e-06,
"loss": 0.23,
"step": 267
},
{
"epoch": 0.6854219948849105,
"grad_norm": 2.0825711672647826,
"learning_rate": 7.371364598885276e-06,
"loss": 0.1965,
"step": 268
},
{
"epoch": 0.6879795396419437,
"grad_norm": 1.9740063711309666,
"learning_rate": 7.353661439643638e-06,
"loss": 0.1678,
"step": 269
},
{
"epoch": 0.690537084398977,
"grad_norm": 2.0678614348256974,
"learning_rate": 7.335920293869891e-06,
"loss": 0.1817,
"step": 270
},
{
"epoch": 0.6930946291560103,
"grad_norm": 1.9715896584301256,
"learning_rate": 7.318141447894344e-06,
"loss": 0.1317,
"step": 271
},
{
"epoch": 0.6956521739130435,
"grad_norm": 2.260924051051695,
"learning_rate": 7.300325188655762e-06,
"loss": 0.2277,
"step": 272
},
{
"epoch": 0.6982097186700768,
"grad_norm": 2.6190647544478796,
"learning_rate": 7.28247180369673e-06,
"loss": 0.2308,
"step": 273
},
{
"epoch": 0.7007672634271099,
"grad_norm": 3.044526709385895,
"learning_rate": 7.264581581159024e-06,
"loss": 0.2258,
"step": 274
},
{
"epoch": 0.7033248081841432,
"grad_norm": 2.4284330910741954,
"learning_rate": 7.246654809778951e-06,
"loss": 0.2381,
"step": 275
},
{
"epoch": 0.7058823529411765,
"grad_norm": 2.2299016723949716,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.1775,
"step": 276
},
{
"epoch": 0.7084398976982097,
"grad_norm": 1.9829661983833986,
"learning_rate": 7.210692778381634e-06,
"loss": 0.1973,
"step": 277
},
{
"epoch": 0.710997442455243,
"grad_norm": 3.083532290054799,
"learning_rate": 7.192658098767686e-06,
"loss": 0.2442,
"step": 278
},
{
"epoch": 0.7135549872122762,
"grad_norm": 2.138475355654292,
"learning_rate": 7.174588031108598e-06,
"loss": 0.192,
"step": 279
},
{
"epoch": 0.7161125319693095,
"grad_norm": 2.5991226258353817,
"learning_rate": 7.1564828670432595e-06,
"loss": 0.2268,
"step": 280
},
{
"epoch": 0.7186700767263428,
"grad_norm": 2.891848075907018,
"learning_rate": 7.138342898776989e-06,
"loss": 0.2115,
"step": 281
},
{
"epoch": 0.7212276214833759,
"grad_norm": 2.726057165848505,
"learning_rate": 7.120168419076825e-06,
"loss": 0.229,
"step": 282
},
{
"epoch": 0.7237851662404092,
"grad_norm": 2.534815026313142,
"learning_rate": 7.101959721266798e-06,
"loss": 0.2645,
"step": 283
},
{
"epoch": 0.7263427109974424,
"grad_norm": 2.825513673241371,
"learning_rate": 7.083717099223192e-06,
"loss": 0.242,
"step": 284
},
{
"epoch": 0.7289002557544757,
"grad_norm": 1.9732517778995824,
"learning_rate": 7.0654408473698084e-06,
"loss": 0.1711,
"step": 285
},
{
"epoch": 0.731457800511509,
"grad_norm": 2.211705086269456,
"learning_rate": 7.047131260673214e-06,
"loss": 0.163,
"step": 286
},
{
"epoch": 0.7340153452685422,
"grad_norm": 2.697861006533922,
"learning_rate": 7.0287886346379755e-06,
"loss": 0.2455,
"step": 287
},
{
"epoch": 0.7365728900255755,
"grad_norm": 2.8082724007729123,
"learning_rate": 7.010413265301888e-06,
"loss": 0.2302,
"step": 288
},
{
"epoch": 0.7391304347826086,
"grad_norm": 2.365367868240213,
"learning_rate": 6.9920054492312086e-06,
"loss": 0.2454,
"step": 289
},
{
"epoch": 0.7416879795396419,
"grad_norm": 2.523482715187105,
"learning_rate": 6.97356548351586e-06,
"loss": 0.2117,
"step": 290
},
{
"epoch": 0.7442455242966752,
"grad_norm": 2.432296205013838,
"learning_rate": 6.9550936657646386e-06,
"loss": 0.2008,
"step": 291
},
{
"epoch": 0.7468030690537084,
"grad_norm": 2.5596157723702575,
"learning_rate": 6.936590294100414e-06,
"loss": 0.1696,
"step": 292
},
{
"epoch": 0.7493606138107417,
"grad_norm": 2.5875832384332074,
"learning_rate": 6.918055667155311e-06,
"loss": 0.1861,
"step": 293
},
{
"epoch": 0.7519181585677749,
"grad_norm": 2.3852937224082167,
"learning_rate": 6.899490084065897e-06,
"loss": 0.1736,
"step": 294
},
{
"epoch": 0.7544757033248082,
"grad_norm": 2.7399772311401422,
"learning_rate": 6.8808938444683505e-06,
"loss": 0.1942,
"step": 295
},
{
"epoch": 0.7570332480818415,
"grad_norm": 2.7081558169763524,
"learning_rate": 6.862267248493624e-06,
"loss": 0.1894,
"step": 296
},
{
"epoch": 0.7595907928388747,
"grad_norm": 2.6345675146303975,
"learning_rate": 6.843610596762606e-06,
"loss": 0.2092,
"step": 297
},
{
"epoch": 0.7621483375959079,
"grad_norm": 2.4002544312203176,
"learning_rate": 6.824924190381257e-06,
"loss": 0.1646,
"step": 298
},
{
"epoch": 0.7647058823529411,
"grad_norm": 2.903014156552671,
"learning_rate": 6.806208330935766e-06,
"loss": 0.2274,
"step": 299
},
{
"epoch": 0.7672634271099744,
"grad_norm": 2.3818958805093318,
"learning_rate": 6.7874633204876705e-06,
"loss": 0.1511,
"step": 300
},
{
"epoch": 0.7698209718670077,
"grad_norm": 2.1217119638619923,
"learning_rate": 6.768689461568987e-06,
"loss": 0.1818,
"step": 301
},
{
"epoch": 0.7723785166240409,
"grad_norm": 1.9802995962813212,
"learning_rate": 6.7498870571773275e-06,
"loss": 0.1871,
"step": 302
},
{
"epoch": 0.7749360613810742,
"grad_norm": 2.130497009697558,
"learning_rate": 6.731056410771008e-06,
"loss": 0.1929,
"step": 303
},
{
"epoch": 0.7774936061381074,
"grad_norm": 2.6390968570982007,
"learning_rate": 6.712197826264154e-06,
"loss": 0.209,
"step": 304
},
{
"epoch": 0.7800511508951407,
"grad_norm": 2.5486828023341763,
"learning_rate": 6.69331160802179e-06,
"loss": 0.1995,
"step": 305
},
{
"epoch": 0.782608695652174,
"grad_norm": 2.1146363749572554,
"learning_rate": 6.674398060854931e-06,
"loss": 0.1411,
"step": 306
},
{
"epoch": 0.7851662404092071,
"grad_norm": 2.327617229567959,
"learning_rate": 6.655457490015667e-06,
"loss": 0.2067,
"step": 307
},
{
"epoch": 0.7877237851662404,
"grad_norm": 1.9607440916181138,
"learning_rate": 6.636490201192229e-06,
"loss": 0.1866,
"step": 308
},
{
"epoch": 0.7902813299232737,
"grad_norm": 1.7057345078559258,
"learning_rate": 6.617496500504056e-06,
"loss": 0.0958,
"step": 309
},
{
"epoch": 0.7928388746803069,
"grad_norm": 2.3104568865848334,
"learning_rate": 6.5984766944968636e-06,
"loss": 0.1813,
"step": 310
},
{
"epoch": 0.7953964194373402,
"grad_norm": 2.6850823821758647,
"learning_rate": 6.579431090137681e-06,
"loss": 0.1997,
"step": 311
},
{
"epoch": 0.7979539641943734,
"grad_norm": 2.8109972215894006,
"learning_rate": 6.560359994809916e-06,
"loss": 0.3141,
"step": 312
},
{
"epoch": 0.8005115089514067,
"grad_norm": 2.450236301729314,
"learning_rate": 6.541263716308375e-06,
"loss": 0.2208,
"step": 313
},
{
"epoch": 0.80306905370844,
"grad_norm": 3.050425633867066,
"learning_rate": 6.522142562834307e-06,
"loss": 0.2554,
"step": 314
},
{
"epoch": 0.8056265984654731,
"grad_norm": 2.0059660873591283,
"learning_rate": 6.502996842990431e-06,
"loss": 0.1905,
"step": 315
},
{
"epoch": 0.8081841432225064,
"grad_norm": 2.6797087837722504,
"learning_rate": 6.483826865775941e-06,
"loss": 0.236,
"step": 316
},
{
"epoch": 0.8107416879795396,
"grad_norm": 2.282759943678343,
"learning_rate": 6.46463294058154e-06,
"loss": 0.1792,
"step": 317
},
{
"epoch": 0.8132992327365729,
"grad_norm": 2.46733554686103,
"learning_rate": 6.445415377184427e-06,
"loss": 0.2151,
"step": 318
},
{
"epoch": 0.8158567774936062,
"grad_norm": 2.3671420545552286,
"learning_rate": 6.426174485743309e-06,
"loss": 0.1765,
"step": 319
},
{
"epoch": 0.8184143222506394,
"grad_norm": 1.9629970579121538,
"learning_rate": 6.4069105767933944e-06,
"loss": 0.1911,
"step": 320
},
{
"epoch": 0.8209718670076727,
"grad_norm": 3.6762465445985693,
"learning_rate": 6.387623961241375e-06,
"loss": 0.2756,
"step": 321
},
{
"epoch": 0.8235294117647058,
"grad_norm": 2.532439314856185,
"learning_rate": 6.368314950360416e-06,
"loss": 0.1765,
"step": 322
},
{
"epoch": 0.8260869565217391,
"grad_norm": 2.3922839492822003,
"learning_rate": 6.348983855785122e-06,
"loss": 0.1455,
"step": 323
},
{
"epoch": 0.8286445012787724,
"grad_norm": 2.1577159803546437,
"learning_rate": 6.3296309895065215e-06,
"loss": 0.187,
"step": 324
},
{
"epoch": 0.8312020460358056,
"grad_norm": 2.5897751675293565,
"learning_rate": 6.310256663867019e-06,
"loss": 0.1813,
"step": 325
},
{
"epoch": 0.8337595907928389,
"grad_norm": 2.9751298040801397,
"learning_rate": 6.290861191555359e-06,
"loss": 0.1861,
"step": 326
},
{
"epoch": 0.8363171355498721,
"grad_norm": 3.0389364710917413,
"learning_rate": 6.271444885601583e-06,
"loss": 0.2479,
"step": 327
},
{
"epoch": 0.8388746803069054,
"grad_norm": 2.6853416543912845,
"learning_rate": 6.252008059371968e-06,
"loss": 0.2179,
"step": 328
},
{
"epoch": 0.8414322250639387,
"grad_norm": 1.804371167405513,
"learning_rate": 6.2325510265639785e-06,
"loss": 0.1489,
"step": 329
},
{
"epoch": 0.8439897698209718,
"grad_norm": 2.687996264497428,
"learning_rate": 6.213074101201202e-06,
"loss": 0.1877,
"step": 330
},
{
"epoch": 0.8465473145780051,
"grad_norm": 2.2595021879894284,
"learning_rate": 6.193577597628268e-06,
"loss": 0.1835,
"step": 331
},
{
"epoch": 0.8491048593350383,
"grad_norm": 1.9454030240892142,
"learning_rate": 6.174061830505801e-06,
"loss": 0.1744,
"step": 332
},
{
"epoch": 0.8516624040920716,
"grad_norm": 2.7487602756044653,
"learning_rate": 6.154527114805312e-06,
"loss": 0.2756,
"step": 333
},
{
"epoch": 0.8542199488491049,
"grad_norm": 2.1891761636099485,
"learning_rate": 6.1349737658041385e-06,
"loss": 0.2182,
"step": 334
},
{
"epoch": 0.8567774936061381,
"grad_norm": 2.2860716470090625,
"learning_rate": 6.115402099080345e-06,
"loss": 0.1623,
"step": 335
},
{
"epoch": 0.8593350383631714,
"grad_norm": 2.144126721212396,
"learning_rate": 6.095812430507627e-06,
"loss": 0.1309,
"step": 336
},
{
"epoch": 0.8618925831202046,
"grad_norm": 2.92551680405092,
"learning_rate": 6.076205076250227e-06,
"loss": 0.2107,
"step": 337
},
{
"epoch": 0.8644501278772379,
"grad_norm": 2.763974310218809,
"learning_rate": 6.056580352757813e-06,
"loss": 0.2358,
"step": 338
},
{
"epoch": 0.8670076726342711,
"grad_norm": 2.2522928679519714,
"learning_rate": 6.036938576760388e-06,
"loss": 0.1657,
"step": 339
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.8110497901458547,
"learning_rate": 6.0172800652631706e-06,
"loss": 0.1458,
"step": 340
},
{
"epoch": 0.8721227621483376,
"grad_norm": 2.632061983015455,
"learning_rate": 5.997605135541472e-06,
"loss": 0.1803,
"step": 341
},
{
"epoch": 0.8746803069053708,
"grad_norm": 2.518578410021301,
"learning_rate": 5.977914105135594e-06,
"loss": 0.2311,
"step": 342
},
{
"epoch": 0.8772378516624041,
"grad_norm": 1.9196947260190371,
"learning_rate": 5.9582072918456805e-06,
"loss": 0.1388,
"step": 343
},
{
"epoch": 0.8797953964194374,
"grad_norm": 2.3343184448027303,
"learning_rate": 5.938485013726612e-06,
"loss": 0.2101,
"step": 344
},
{
"epoch": 0.8823529411764706,
"grad_norm": 2.107250405421706,
"learning_rate": 5.918747589082853e-06,
"loss": 0.1753,
"step": 345
},
{
"epoch": 0.8849104859335039,
"grad_norm": 1.8161156075732543,
"learning_rate": 5.898995336463326e-06,
"loss": 0.1707,
"step": 346
},
{
"epoch": 0.887468030690537,
"grad_norm": 2.0187878577288765,
"learning_rate": 5.879228574656269e-06,
"loss": 0.1289,
"step": 347
},
{
"epoch": 0.8900255754475703,
"grad_norm": 3.2101942717767025,
"learning_rate": 5.859447622684084e-06,
"loss": 0.2727,
"step": 348
},
{
"epoch": 0.8925831202046036,
"grad_norm": 2.1882518208143673,
"learning_rate": 5.839652799798197e-06,
"loss": 0.194,
"step": 349
},
{
"epoch": 0.8951406649616368,
"grad_norm": 3.0670778313064044,
"learning_rate": 5.819844425473899e-06,
"loss": 0.2662,
"step": 350
},
{
"epoch": 0.8976982097186701,
"grad_norm": 2.263174260591853,
"learning_rate": 5.800022819405194e-06,
"loss": 0.1937,
"step": 351
},
{
"epoch": 0.9002557544757033,
"grad_norm": 2.436527033057964,
"learning_rate": 5.780188301499636e-06,
"loss": 0.2269,
"step": 352
},
{
"epoch": 0.9028132992327366,
"grad_norm": 3.0292476861844166,
"learning_rate": 5.760341191873167e-06,
"loss": 0.2593,
"step": 353
},
{
"epoch": 0.9053708439897699,
"grad_norm": 2.7791662962631607,
"learning_rate": 5.740481810844952e-06,
"loss": 0.2198,
"step": 354
},
{
"epoch": 0.907928388746803,
"grad_norm": 2.1999927858559145,
"learning_rate": 5.720610478932211e-06,
"loss": 0.164,
"step": 355
},
{
"epoch": 0.9104859335038363,
"grad_norm": 2.5794977388267166,
"learning_rate": 5.700727516845038e-06,
"loss": 0.1773,
"step": 356
},
{
"epoch": 0.9130434782608695,
"grad_norm": 2.147804700886211,
"learning_rate": 5.680833245481234e-06,
"loss": 0.2114,
"step": 357
},
{
"epoch": 0.9156010230179028,
"grad_norm": 2.606878063368861,
"learning_rate": 5.660927985921122e-06,
"loss": 0.2173,
"step": 358
},
{
"epoch": 0.9181585677749361,
"grad_norm": 2.340221113756964,
"learning_rate": 5.641012059422369e-06,
"loss": 0.2223,
"step": 359
},
{
"epoch": 0.9207161125319693,
"grad_norm": 2.4961312290411994,
"learning_rate": 5.621085787414799e-06,
"loss": 0.2255,
"step": 360
},
{
"epoch": 0.9232736572890026,
"grad_norm": 2.5269182037727895,
"learning_rate": 5.601149491495206e-06,
"loss": 0.2135,
"step": 361
},
{
"epoch": 0.9258312020460358,
"grad_norm": 2.0417402035129317,
"learning_rate": 5.581203493422161e-06,
"loss": 0.2078,
"step": 362
},
{
"epoch": 0.928388746803069,
"grad_norm": 2.0727546621065227,
"learning_rate": 5.561248115110822e-06,
"loss": 0.1661,
"step": 363
},
{
"epoch": 0.9309462915601023,
"grad_norm": 2.7807300008729348,
"learning_rate": 5.541283678627742e-06,
"loss": 0.1794,
"step": 364
},
{
"epoch": 0.9335038363171355,
"grad_norm": 1.9508561433458118,
"learning_rate": 5.521310506185661e-06,
"loss": 0.1466,
"step": 365
},
{
"epoch": 0.9360613810741688,
"grad_norm": 2.3405765286165603,
"learning_rate": 5.501328920138314e-06,
"loss": 0.2208,
"step": 366
},
{
"epoch": 0.9386189258312021,
"grad_norm": 2.469209204439655,
"learning_rate": 5.481339242975227e-06,
"loss": 0.1634,
"step": 367
},
{
"epoch": 0.9411764705882353,
"grad_norm": 2.40778706172643,
"learning_rate": 5.46134179731651e-06,
"loss": 0.1898,
"step": 368
},
{
"epoch": 0.9437340153452686,
"grad_norm": 2.2778636718221965,
"learning_rate": 5.441336905907653e-06,
"loss": 0.1926,
"step": 369
},
{
"epoch": 0.9462915601023018,
"grad_norm": 2.7625812295253422,
"learning_rate": 5.421324891614312e-06,
"loss": 0.1973,
"step": 370
},
{
"epoch": 0.948849104859335,
"grad_norm": 1.7446912165051116,
"learning_rate": 5.4013060774171055e-06,
"loss": 0.1638,
"step": 371
},
{
"epoch": 0.9514066496163683,
"grad_norm": 2.5444352315152066,
"learning_rate": 5.3812807864063946e-06,
"loss": 0.2318,
"step": 372
},
{
"epoch": 0.9539641943734015,
"grad_norm": 2.1992426433820587,
"learning_rate": 5.361249341777075e-06,
"loss": 0.1843,
"step": 373
},
{
"epoch": 0.9565217391304348,
"grad_norm": 2.5302908183096964,
"learning_rate": 5.341212066823356e-06,
"loss": 0.2005,
"step": 374
},
{
"epoch": 0.959079283887468,
"grad_norm": 2.6580305183975748,
"learning_rate": 5.321169284933543e-06,
"loss": 0.2198,
"step": 375
},
{
"epoch": 0.9616368286445013,
"grad_norm": 2.2117215566531723,
"learning_rate": 5.3011213195848245e-06,
"loss": 0.2427,
"step": 376
},
{
"epoch": 0.9641943734015346,
"grad_norm": 2.725818072341103,
"learning_rate": 5.281068494338039e-06,
"loss": 0.1718,
"step": 377
},
{
"epoch": 0.9667519181585678,
"grad_norm": 2.2292620609222067,
"learning_rate": 5.26101113283247e-06,
"loss": 0.1534,
"step": 378
},
{
"epoch": 0.969309462915601,
"grad_norm": 2.6330854799034813,
"learning_rate": 5.240949558780605e-06,
"loss": 0.2033,
"step": 379
},
{
"epoch": 0.9718670076726342,
"grad_norm": 2.650742969398919,
"learning_rate": 5.220884095962924e-06,
"loss": 0.226,
"step": 380
},
{
"epoch": 0.9744245524296675,
"grad_norm": 2.6523001115826332,
"learning_rate": 5.200815068222666e-06,
"loss": 0.193,
"step": 381
},
{
"epoch": 0.9769820971867008,
"grad_norm": 2.009859598152987,
"learning_rate": 5.1807427994606065e-06,
"loss": 0.1309,
"step": 382
},
{
"epoch": 0.979539641943734,
"grad_norm": 3.0187208603458187,
"learning_rate": 5.1606676136298305e-06,
"loss": 0.1944,
"step": 383
},
{
"epoch": 0.9820971867007673,
"grad_norm": 2.16488916376895,
"learning_rate": 5.140589834730503e-06,
"loss": 0.1758,
"step": 384
},
{
"epoch": 0.9846547314578005,
"grad_norm": 2.26726877183814,
"learning_rate": 5.120509786804635e-06,
"loss": 0.2036,
"step": 385
},
{
"epoch": 0.9872122762148338,
"grad_norm": 2.5667448076446515,
"learning_rate": 5.100427793930862e-06,
"loss": 0.2137,
"step": 386
},
{
"epoch": 0.989769820971867,
"grad_norm": 3.175490751374496,
"learning_rate": 5.08034418021921e-06,
"loss": 0.218,
"step": 387
},
{
"epoch": 0.9923273657289002,
"grad_norm": 2.071152150237306,
"learning_rate": 5.06025926980586e-06,
"loss": 0.144,
"step": 388
},
{
"epoch": 0.9948849104859335,
"grad_norm": 1.9502478990435759,
"learning_rate": 5.040173386847926e-06,
"loss": 0.1662,
"step": 389
},
{
"epoch": 0.9974424552429667,
"grad_norm": 2.3496611393315554,
"learning_rate": 5.0200868555182155e-06,
"loss": 0.1997,
"step": 390
},
{
"epoch": 1.0,
"grad_norm": 2.1406113822102846,
"learning_rate": 5e-06,
"loss": 0.1649,
"step": 391
},
{
"epoch": 1.0025575447570332,
"grad_norm": 1.7341758333987307,
"learning_rate": 4.979913144481785e-06,
"loss": 0.0982,
"step": 392
},
{
"epoch": 1.0051150895140666,
"grad_norm": 1.6626061824434262,
"learning_rate": 4.959826613152074e-06,
"loss": 0.0868,
"step": 393
},
{
"epoch": 1.0076726342710998,
"grad_norm": 1.420000749929643,
"learning_rate": 4.939740730194141e-06,
"loss": 0.0911,
"step": 394
},
{
"epoch": 1.010230179028133,
"grad_norm": 1.697363619049948,
"learning_rate": 4.919655819780792e-06,
"loss": 0.076,
"step": 395
},
{
"epoch": 1.0127877237851663,
"grad_norm": 1.4002681848958676,
"learning_rate": 4.899572206069138e-06,
"loss": 0.0805,
"step": 396
},
{
"epoch": 1.0153452685421995,
"grad_norm": 1.4426210835750999,
"learning_rate": 4.879490213195366e-06,
"loss": 0.0879,
"step": 397
},
{
"epoch": 1.0179028132992327,
"grad_norm": 2.1433788397954636,
"learning_rate": 4.8594101652694996e-06,
"loss": 0.1065,
"step": 398
},
{
"epoch": 1.020460358056266,
"grad_norm": 1.6010853002979815,
"learning_rate": 4.839332386370171e-06,
"loss": 0.0743,
"step": 399
},
{
"epoch": 1.0230179028132993,
"grad_norm": 1.5836077570503444,
"learning_rate": 4.819257200539394e-06,
"loss": 0.0832,
"step": 400
},
{
"epoch": 1.0255754475703325,
"grad_norm": 1.5997813293419636,
"learning_rate": 4.799184931777337e-06,
"loss": 0.0936,
"step": 401
},
{
"epoch": 1.0281329923273657,
"grad_norm": 1.7521575539423502,
"learning_rate": 4.779115904037079e-06,
"loss": 0.0723,
"step": 402
},
{
"epoch": 1.030690537084399,
"grad_norm": 1.7850328818891048,
"learning_rate": 4.759050441219395e-06,
"loss": 0.0664,
"step": 403
},
{
"epoch": 1.0332480818414322,
"grad_norm": 1.4608104273507212,
"learning_rate": 4.738988867167531e-06,
"loss": 0.0573,
"step": 404
},
{
"epoch": 1.0358056265984654,
"grad_norm": 1.8270185513924122,
"learning_rate": 4.718931505661961e-06,
"loss": 0.0681,
"step": 405
},
{
"epoch": 1.0383631713554988,
"grad_norm": 1.8634214035735037,
"learning_rate": 4.698878680415176e-06,
"loss": 0.0717,
"step": 406
},
{
"epoch": 1.040920716112532,
"grad_norm": 1.748667561404042,
"learning_rate": 4.678830715066458e-06,
"loss": 0.0683,
"step": 407
},
{
"epoch": 1.0434782608695652,
"grad_norm": 2.014253739409776,
"learning_rate": 4.6587879331766465e-06,
"loss": 0.0827,
"step": 408
},
{
"epoch": 1.0460358056265984,
"grad_norm": 1.7966173689760938,
"learning_rate": 4.638750658222927e-06,
"loss": 0.0981,
"step": 409
},
{
"epoch": 1.0485933503836318,
"grad_norm": 1.6525905355592523,
"learning_rate": 4.618719213593605e-06,
"loss": 0.077,
"step": 410
},
{
"epoch": 1.051150895140665,
"grad_norm": 1.4997606657164488,
"learning_rate": 4.598693922582896e-06,
"loss": 0.0702,
"step": 411
},
{
"epoch": 1.0537084398976981,
"grad_norm": 2.022120534248971,
"learning_rate": 4.5786751083856895e-06,
"loss": 0.0814,
"step": 412
},
{
"epoch": 1.0562659846547315,
"grad_norm": 1.9127347409400652,
"learning_rate": 4.558663094092348e-06,
"loss": 0.096,
"step": 413
},
{
"epoch": 1.0588235294117647,
"grad_norm": 2.541069985292318,
"learning_rate": 4.53865820268349e-06,
"loss": 0.0974,
"step": 414
},
{
"epoch": 1.061381074168798,
"grad_norm": 1.9801185106989778,
"learning_rate": 4.518660757024774e-06,
"loss": 0.0649,
"step": 415
},
{
"epoch": 1.0639386189258313,
"grad_norm": 1.9483856548126361,
"learning_rate": 4.498671079861686e-06,
"loss": 0.0881,
"step": 416
},
{
"epoch": 1.0664961636828645,
"grad_norm": 2.107304538718503,
"learning_rate": 4.478689493814341e-06,
"loss": 0.1178,
"step": 417
},
{
"epoch": 1.0690537084398977,
"grad_norm": 2.072047135460629,
"learning_rate": 4.4587163213722595e-06,
"loss": 0.0971,
"step": 418
},
{
"epoch": 1.0716112531969308,
"grad_norm": 1.6567001187673083,
"learning_rate": 4.438751884889179e-06,
"loss": 0.0729,
"step": 419
},
{
"epoch": 1.0741687979539642,
"grad_norm": 1.6640746116359595,
"learning_rate": 4.41879650657784e-06,
"loss": 0.0637,
"step": 420
},
{
"epoch": 1.0767263427109974,
"grad_norm": 1.7205043208443138,
"learning_rate": 4.398850508504795e-06,
"loss": 0.0633,
"step": 421
},
{
"epoch": 1.0792838874680306,
"grad_norm": 1.3625064664087017,
"learning_rate": 4.3789142125852015e-06,
"loss": 0.0652,
"step": 422
},
{
"epoch": 1.081841432225064,
"grad_norm": 1.6440483798308387,
"learning_rate": 4.358987940577631e-06,
"loss": 0.0893,
"step": 423
},
{
"epoch": 1.0843989769820972,
"grad_norm": 1.988437435997785,
"learning_rate": 4.339072014078879e-06,
"loss": 0.0745,
"step": 424
},
{
"epoch": 1.0869565217391304,
"grad_norm": 1.9394483469983206,
"learning_rate": 4.319166754518768e-06,
"loss": 0.0699,
"step": 425
},
{
"epoch": 1.0895140664961638,
"grad_norm": 1.6091679987577228,
"learning_rate": 4.299272483154963e-06,
"loss": 0.0834,
"step": 426
},
{
"epoch": 1.092071611253197,
"grad_norm": 2.1504108683482728,
"learning_rate": 4.27938952106779e-06,
"loss": 0.0975,
"step": 427
},
{
"epoch": 1.0946291560102301,
"grad_norm": 1.2129263595961266,
"learning_rate": 4.259518189155049e-06,
"loss": 0.056,
"step": 428
},
{
"epoch": 1.0971867007672633,
"grad_norm": 1.5136569144328267,
"learning_rate": 4.2396588081268355e-06,
"loss": 0.0583,
"step": 429
},
{
"epoch": 1.0997442455242967,
"grad_norm": 2.0636857667444883,
"learning_rate": 4.219811698500365e-06,
"loss": 0.1227,
"step": 430
},
{
"epoch": 1.10230179028133,
"grad_norm": 1.555373954241444,
"learning_rate": 4.199977180594807e-06,
"loss": 0.0618,
"step": 431
},
{
"epoch": 1.104859335038363,
"grad_norm": 2.234721374673342,
"learning_rate": 4.1801555745261025e-06,
"loss": 0.0719,
"step": 432
},
{
"epoch": 1.1074168797953965,
"grad_norm": 1.4891492181079513,
"learning_rate": 4.160347200201804e-06,
"loss": 0.0848,
"step": 433
},
{
"epoch": 1.1099744245524297,
"grad_norm": 1.7086838274547151,
"learning_rate": 4.140552377315918e-06,
"loss": 0.085,
"step": 434
},
{
"epoch": 1.1125319693094629,
"grad_norm": 1.7588027037354494,
"learning_rate": 4.120771425343733e-06,
"loss": 0.0599,
"step": 435
},
{
"epoch": 1.1150895140664963,
"grad_norm": 1.9369520926526567,
"learning_rate": 4.101004663536675e-06,
"loss": 0.0757,
"step": 436
},
{
"epoch": 1.1176470588235294,
"grad_norm": 1.5573730150751601,
"learning_rate": 4.081252410917148e-06,
"loss": 0.0727,
"step": 437
},
{
"epoch": 1.1202046035805626,
"grad_norm": 1.7157376858170956,
"learning_rate": 4.061514986273391e-06,
"loss": 0.0716,
"step": 438
},
{
"epoch": 1.1227621483375958,
"grad_norm": 2.3412942307866293,
"learning_rate": 4.041792708154321e-06,
"loss": 0.1166,
"step": 439
},
{
"epoch": 1.1253196930946292,
"grad_norm": 1.3827537076323062,
"learning_rate": 4.022085894864408e-06,
"loss": 0.0542,
"step": 440
},
{
"epoch": 1.1278772378516624,
"grad_norm": 1.4810971648201303,
"learning_rate": 4.0023948644585294e-06,
"loss": 0.0627,
"step": 441
},
{
"epoch": 1.1304347826086956,
"grad_norm": 2.475105933705708,
"learning_rate": 3.982719934736832e-06,
"loss": 0.0747,
"step": 442
},
{
"epoch": 1.132992327365729,
"grad_norm": 6.967435074371647,
"learning_rate": 3.963061423239612e-06,
"loss": 0.0885,
"step": 443
},
{
"epoch": 1.1355498721227621,
"grad_norm": 2.362944377096876,
"learning_rate": 3.943419647242189e-06,
"loss": 0.0927,
"step": 444
},
{
"epoch": 1.1381074168797953,
"grad_norm": 3.417514515542525,
"learning_rate": 3.923794923749775e-06,
"loss": 0.1051,
"step": 445
},
{
"epoch": 1.1406649616368287,
"grad_norm": 1.5628055684744868,
"learning_rate": 3.904187569492373e-06,
"loss": 0.0609,
"step": 446
},
{
"epoch": 1.143222506393862,
"grad_norm": 1.853834244292138,
"learning_rate": 3.884597900919656e-06,
"loss": 0.0743,
"step": 447
},
{
"epoch": 1.145780051150895,
"grad_norm": 1.5370022269889596,
"learning_rate": 3.865026234195863e-06,
"loss": 0.0645,
"step": 448
},
{
"epoch": 1.1483375959079285,
"grad_norm": 1.7852407621323894,
"learning_rate": 3.8454728851946885e-06,
"loss": 0.074,
"step": 449
},
{
"epoch": 1.1508951406649617,
"grad_norm": 1.5201740396511454,
"learning_rate": 3.8259381694942e-06,
"loss": 0.0711,
"step": 450
},
{
"epoch": 1.1534526854219949,
"grad_norm": 1.5688527808393755,
"learning_rate": 3.806422402371733e-06,
"loss": 0.0658,
"step": 451
},
{
"epoch": 1.156010230179028,
"grad_norm": 1.854516547291142,
"learning_rate": 3.786925898798801e-06,
"loss": 0.0864,
"step": 452
},
{
"epoch": 1.1585677749360614,
"grad_norm": 1.4304724295860949,
"learning_rate": 3.767448973436021e-06,
"loss": 0.0701,
"step": 453
},
{
"epoch": 1.1611253196930946,
"grad_norm": 2.124867526109046,
"learning_rate": 3.7479919406280334e-06,
"loss": 0.0687,
"step": 454
},
{
"epoch": 1.1636828644501278,
"grad_norm": 1.9407356546402628,
"learning_rate": 3.728555114398419e-06,
"loss": 0.0693,
"step": 455
},
{
"epoch": 1.1662404092071612,
"grad_norm": 2.038902868728145,
"learning_rate": 3.709138808444641e-06,
"loss": 0.075,
"step": 456
},
{
"epoch": 1.1687979539641944,
"grad_norm": 1.4775851732019487,
"learning_rate": 3.689743336132982e-06,
"loss": 0.0548,
"step": 457
},
{
"epoch": 1.1713554987212276,
"grad_norm": 1.780023396964712,
"learning_rate": 3.6703690104934806e-06,
"loss": 0.0597,
"step": 458
},
{
"epoch": 1.1739130434782608,
"grad_norm": 1.9428570066179016,
"learning_rate": 3.6510161442148783e-06,
"loss": 0.0922,
"step": 459
},
{
"epoch": 1.1764705882352942,
"grad_norm": 1.4660436947208233,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.0702,
"step": 460
},
{
"epoch": 1.1790281329923273,
"grad_norm": 1.492258925362867,
"learning_rate": 3.6123760387586265e-06,
"loss": 0.0533,
"step": 461
},
{
"epoch": 1.1815856777493605,
"grad_norm": 1.6080846234395203,
"learning_rate": 3.5930894232066072e-06,
"loss": 0.0613,
"step": 462
},
{
"epoch": 1.184143222506394,
"grad_norm": 1.8762491421887413,
"learning_rate": 3.5738255142566912e-06,
"loss": 0.111,
"step": 463
},
{
"epoch": 1.186700767263427,
"grad_norm": 1.6721148093003684,
"learning_rate": 3.5545846228155743e-06,
"loss": 0.0695,
"step": 464
},
{
"epoch": 1.1892583120204603,
"grad_norm": 1.6557906824838298,
"learning_rate": 3.5353670594184623e-06,
"loss": 0.0648,
"step": 465
},
{
"epoch": 1.1918158567774937,
"grad_norm": 1.6970610387683387,
"learning_rate": 3.516173134224059e-06,
"loss": 0.0808,
"step": 466
},
{
"epoch": 1.1943734015345269,
"grad_norm": 1.7210699030954981,
"learning_rate": 3.4970031570095707e-06,
"loss": 0.0623,
"step": 467
},
{
"epoch": 1.19693094629156,
"grad_norm": 1.540393787825596,
"learning_rate": 3.477857437165694e-06,
"loss": 0.0664,
"step": 468
},
{
"epoch": 1.1994884910485935,
"grad_norm": 1.8901582305083606,
"learning_rate": 3.458736283691626e-06,
"loss": 0.0834,
"step": 469
},
{
"epoch": 1.2020460358056266,
"grad_norm": 1.4585029838366994,
"learning_rate": 3.4396400051900846e-06,
"loss": 0.0547,
"step": 470
},
{
"epoch": 1.2046035805626598,
"grad_norm": 1.2256677190585374,
"learning_rate": 3.4205689098623195e-06,
"loss": 0.0479,
"step": 471
},
{
"epoch": 1.207161125319693,
"grad_norm": 1.055212675096316,
"learning_rate": 3.401523305503139e-06,
"loss": 0.0349,
"step": 472
},
{
"epoch": 1.2097186700767264,
"grad_norm": 2.29047957899524,
"learning_rate": 3.3825034994959445e-06,
"loss": 0.1002,
"step": 473
},
{
"epoch": 1.2122762148337596,
"grad_norm": 1.5122951574759327,
"learning_rate": 3.3635097988077724e-06,
"loss": 0.0697,
"step": 474
},
{
"epoch": 1.2148337595907928,
"grad_norm": 1.8598398658091704,
"learning_rate": 3.3445425099843343e-06,
"loss": 0.0759,
"step": 475
},
{
"epoch": 1.2173913043478262,
"grad_norm": 4.06734950441362,
"learning_rate": 3.3256019391450696e-06,
"loss": 0.064,
"step": 476
},
{
"epoch": 1.2199488491048593,
"grad_norm": 1.8126115952101831,
"learning_rate": 3.3066883919782116e-06,
"loss": 0.0905,
"step": 477
},
{
"epoch": 1.2225063938618925,
"grad_norm": 1.9695485499932877,
"learning_rate": 3.287802173735848e-06,
"loss": 0.073,
"step": 478
},
{
"epoch": 1.2250639386189257,
"grad_norm": 1.8734705052479046,
"learning_rate": 3.268943589228992e-06,
"loss": 0.0765,
"step": 479
},
{
"epoch": 1.227621483375959,
"grad_norm": 1.8227153781564451,
"learning_rate": 3.250112942822673e-06,
"loss": 0.0647,
"step": 480
},
{
"epoch": 1.2301790281329923,
"grad_norm": 1.927223581889169,
"learning_rate": 3.231310538431015e-06,
"loss": 0.1003,
"step": 481
},
{
"epoch": 1.2327365728900257,
"grad_norm": 1.8013350113471298,
"learning_rate": 3.212536679512332e-06,
"loss": 0.0733,
"step": 482
},
{
"epoch": 1.2352941176470589,
"grad_norm": 1.5008823838511212,
"learning_rate": 3.1937916690642356e-06,
"loss": 0.0594,
"step": 483
},
{
"epoch": 1.237851662404092,
"grad_norm": 2.1113680142657314,
"learning_rate": 3.1750758096187446e-06,
"loss": 0.0968,
"step": 484
},
{
"epoch": 1.2404092071611252,
"grad_norm": 1.6422275231634988,
"learning_rate": 3.1563894032373977e-06,
"loss": 0.0648,
"step": 485
},
{
"epoch": 1.2429667519181586,
"grad_norm": 1.5640327931569575,
"learning_rate": 3.137732751506376e-06,
"loss": 0.0614,
"step": 486
},
{
"epoch": 1.2455242966751918,
"grad_norm": 1.931470233335117,
"learning_rate": 3.1191061555316503e-06,
"loss": 0.0752,
"step": 487
},
{
"epoch": 1.248081841432225,
"grad_norm": 1.5026655446391683,
"learning_rate": 3.1005099159341044e-06,
"loss": 0.0672,
"step": 488
},
{
"epoch": 1.2506393861892584,
"grad_norm": 1.7505740735081963,
"learning_rate": 3.08194433284469e-06,
"loss": 0.0914,
"step": 489
},
{
"epoch": 1.2531969309462916,
"grad_norm": 1.4586747212773692,
"learning_rate": 3.0634097058995877e-06,
"loss": 0.0579,
"step": 490
},
{
"epoch": 1.2557544757033248,
"grad_norm": 2.6271299593430375,
"learning_rate": 3.0449063342353635e-06,
"loss": 0.0833,
"step": 491
},
{
"epoch": 1.258312020460358,
"grad_norm": 1.9675276646986675,
"learning_rate": 3.0264345164841426e-06,
"loss": 0.0803,
"step": 492
},
{
"epoch": 1.2608695652173914,
"grad_norm": 1.7237694658208633,
"learning_rate": 3.007994550768793e-06,
"loss": 0.0652,
"step": 493
},
{
"epoch": 1.2634271099744245,
"grad_norm": 1.7092819026417294,
"learning_rate": 2.989586734698113e-06,
"loss": 0.068,
"step": 494
},
{
"epoch": 1.265984654731458,
"grad_norm": 1.4116147134742372,
"learning_rate": 2.971211365362028e-06,
"loss": 0.061,
"step": 495
},
{
"epoch": 1.2685421994884911,
"grad_norm": 1.3575667916154275,
"learning_rate": 2.9528687393267865e-06,
"loss": 0.0685,
"step": 496
},
{
"epoch": 1.2710997442455243,
"grad_norm": 1.802170523220636,
"learning_rate": 2.934559152630192e-06,
"loss": 0.0753,
"step": 497
},
{
"epoch": 1.2736572890025575,
"grad_norm": 1.6262195523928797,
"learning_rate": 2.9162829007768103e-06,
"loss": 0.0941,
"step": 498
},
{
"epoch": 1.2762148337595907,
"grad_norm": 1.3516369552001424,
"learning_rate": 2.898040278733203e-06,
"loss": 0.0683,
"step": 499
},
{
"epoch": 1.278772378516624,
"grad_norm": 1.477326290875936,
"learning_rate": 2.879831580923176e-06,
"loss": 0.0629,
"step": 500
},
{
"epoch": 1.278772378516624,
"eval_loss": 0.21294504404067993,
"eval_runtime": 4.6492,
"eval_samples_per_second": 6.883,
"eval_steps_per_second": 1.721,
"step": 500
},
{
"epoch": 1.2813299232736572,
"grad_norm": 1.7566143836444967,
"learning_rate": 2.8616571012230134e-06,
"loss": 0.0858,
"step": 501
},
{
"epoch": 1.2838874680306906,
"grad_norm": 1.6822488345485334,
"learning_rate": 2.843517132956742e-06,
"loss": 0.0858,
"step": 502
},
{
"epoch": 1.2864450127877238,
"grad_norm": 1.5083738241472828,
"learning_rate": 2.8254119688914017e-06,
"loss": 0.0727,
"step": 503
},
{
"epoch": 1.289002557544757,
"grad_norm": 1.7803194664288695,
"learning_rate": 2.8073419012323154e-06,
"loss": 0.0982,
"step": 504
},
{
"epoch": 1.2915601023017902,
"grad_norm": 1.4987869744478313,
"learning_rate": 2.789307221618369e-06,
"loss": 0.0679,
"step": 505
},
{
"epoch": 1.2941176470588236,
"grad_norm": 2.020271271839902,
"learning_rate": 2.771308221117309e-06,
"loss": 0.0969,
"step": 506
},
{
"epoch": 1.2966751918158568,
"grad_norm": 1.2722299672920545,
"learning_rate": 2.7533451902210512e-06,
"loss": 0.0503,
"step": 507
},
{
"epoch": 1.29923273657289,
"grad_norm": 1.7822467291822328,
"learning_rate": 2.7354184188409773e-06,
"loss": 0.0852,
"step": 508
},
{
"epoch": 1.3017902813299234,
"grad_norm": 1.6127587678726962,
"learning_rate": 2.71752819630327e-06,
"loss": 0.0659,
"step": 509
},
{
"epoch": 1.3043478260869565,
"grad_norm": 1.9796442834814172,
"learning_rate": 2.6996748113442397e-06,
"loss": 0.0651,
"step": 510
},
{
"epoch": 1.3069053708439897,
"grad_norm": 1.3367286391456314,
"learning_rate": 2.6818585521056573e-06,
"loss": 0.057,
"step": 511
},
{
"epoch": 1.309462915601023,
"grad_norm": 1.6215431890082224,
"learning_rate": 2.66407970613011e-06,
"loss": 0.0644,
"step": 512
},
{
"epoch": 1.3120204603580563,
"grad_norm": 1.5423150036952755,
"learning_rate": 2.646338560356363e-06,
"loss": 0.0487,
"step": 513
},
{
"epoch": 1.3145780051150895,
"grad_norm": 1.4322758913833975,
"learning_rate": 2.6286354011147252e-06,
"loss": 0.054,
"step": 514
},
{
"epoch": 1.317135549872123,
"grad_norm": 2.0705557289206933,
"learning_rate": 2.6109705141224255e-06,
"loss": 0.0906,
"step": 515
},
{
"epoch": 1.319693094629156,
"grad_norm": 1.6956814937463585,
"learning_rate": 2.593344184479003e-06,
"loss": 0.0741,
"step": 516
},
{
"epoch": 1.3222506393861893,
"grad_norm": 1.6447905074286118,
"learning_rate": 2.575756696661713e-06,
"loss": 0.0613,
"step": 517
},
{
"epoch": 1.3248081841432224,
"grad_norm": 1.2857330835107534,
"learning_rate": 2.5582083345209217e-06,
"loss": 0.0573,
"step": 518
},
{
"epoch": 1.3273657289002558,
"grad_norm": 1.6996787609477517,
"learning_rate": 2.540699381275539e-06,
"loss": 0.0721,
"step": 519
},
{
"epoch": 1.329923273657289,
"grad_norm": 1.406431219311043,
"learning_rate": 2.5232301195084395e-06,
"loss": 0.0538,
"step": 520
},
{
"epoch": 1.3324808184143222,
"grad_norm": 2.021505231989548,
"learning_rate": 2.5058008311619035e-06,
"loss": 0.0804,
"step": 521
},
{
"epoch": 1.3350383631713556,
"grad_norm": 1.5049328024585746,
"learning_rate": 2.488411797533064e-06,
"loss": 0.0452,
"step": 522
},
{
"epoch": 1.3375959079283888,
"grad_norm": 1.9992178757785442,
"learning_rate": 2.4710632992693737e-06,
"loss": 0.0722,
"step": 523
},
{
"epoch": 1.340153452685422,
"grad_norm": 1.3839067422641855,
"learning_rate": 2.4537556163640726e-06,
"loss": 0.0578,
"step": 524
},
{
"epoch": 1.3427109974424551,
"grad_norm": 1.8578230421356159,
"learning_rate": 2.4364890281516633e-06,
"loss": 0.0769,
"step": 525
},
{
"epoch": 1.3452685421994885,
"grad_norm": 1.2638070516424367,
"learning_rate": 2.4192638133034074e-06,
"loss": 0.0433,
"step": 526
},
{
"epoch": 1.3478260869565217,
"grad_norm": 1.5514020868657306,
"learning_rate": 2.4020802498228333e-06,
"loss": 0.0638,
"step": 527
},
{
"epoch": 1.350383631713555,
"grad_norm": 1.1234091073689203,
"learning_rate": 2.384938615041238e-06,
"loss": 0.0478,
"step": 528
},
{
"epoch": 1.3529411764705883,
"grad_norm": 1.394137463574114,
"learning_rate": 2.3678391856132203e-06,
"loss": 0.0532,
"step": 529
},
{
"epoch": 1.3554987212276215,
"grad_norm": 1.1526941031608549,
"learning_rate": 2.350782237512215e-06,
"loss": 0.0439,
"step": 530
},
{
"epoch": 1.3580562659846547,
"grad_norm": 1.8181630565833733,
"learning_rate": 2.3337680460260314e-06,
"loss": 0.0672,
"step": 531
},
{
"epoch": 1.3606138107416879,
"grad_norm": 3.123874639693688,
"learning_rate": 2.316796885752415e-06,
"loss": 0.1681,
"step": 532
},
{
"epoch": 1.3631713554987213,
"grad_norm": 2.1436757050712214,
"learning_rate": 2.299869030594622e-06,
"loss": 0.0904,
"step": 533
},
{
"epoch": 1.3657289002557544,
"grad_norm": 1.6862552799832844,
"learning_rate": 2.2829847537569904e-06,
"loss": 0.0381,
"step": 534
},
{
"epoch": 1.3682864450127878,
"grad_norm": 2.105194788108369,
"learning_rate": 2.266144327740531e-06,
"loss": 0.0863,
"step": 535
},
{
"epoch": 1.370843989769821,
"grad_norm": 1.752969337535574,
"learning_rate": 2.2493480243385298e-06,
"loss": 0.0702,
"step": 536
},
{
"epoch": 1.3734015345268542,
"grad_norm": 1.854309444483952,
"learning_rate": 2.2325961146321683e-06,
"loss": 0.077,
"step": 537
},
{
"epoch": 1.3759590792838874,
"grad_norm": 1.7507327973613351,
"learning_rate": 2.2158888689861434e-06,
"loss": 0.0683,
"step": 538
},
{
"epoch": 1.3785166240409208,
"grad_norm": 1.3352460032358662,
"learning_rate": 2.1992265570442974e-06,
"loss": 0.0586,
"step": 539
},
{
"epoch": 1.381074168797954,
"grad_norm": 1.6988970605905234,
"learning_rate": 2.182609447725279e-06,
"loss": 0.0789,
"step": 540
},
{
"epoch": 1.3836317135549872,
"grad_norm": 1.8680407758940254,
"learning_rate": 2.1660378092181935e-06,
"loss": 0.0791,
"step": 541
},
{
"epoch": 1.3861892583120206,
"grad_norm": 1.8934496530320313,
"learning_rate": 2.149511908978275e-06,
"loss": 0.0726,
"step": 542
},
{
"epoch": 1.3887468030690537,
"grad_norm": 1.7808474646322217,
"learning_rate": 2.1330320137225773e-06,
"loss": 0.0832,
"step": 543
},
{
"epoch": 1.391304347826087,
"grad_norm": 1.6521067201978266,
"learning_rate": 2.1165983894256647e-06,
"loss": 0.0738,
"step": 544
},
{
"epoch": 1.39386189258312,
"grad_norm": 2.0833857927118196,
"learning_rate": 2.100211301315315e-06,
"loss": 0.0729,
"step": 545
},
{
"epoch": 1.3964194373401535,
"grad_norm": 1.7314836100520905,
"learning_rate": 2.0838710138682412e-06,
"loss": 0.0686,
"step": 546
},
{
"epoch": 1.3989769820971867,
"grad_norm": 1.4118884729586654,
"learning_rate": 2.0675777908058307e-06,
"loss": 0.0644,
"step": 547
},
{
"epoch": 1.40153452685422,
"grad_norm": 1.8528901934251134,
"learning_rate": 2.051331895089882e-06,
"loss": 0.0859,
"step": 548
},
{
"epoch": 1.4040920716112533,
"grad_norm": 1.761689983496555,
"learning_rate": 2.035133588918356e-06,
"loss": 0.0496,
"step": 549
},
{
"epoch": 1.4066496163682864,
"grad_norm": 1.0521984322299474,
"learning_rate": 2.0189831337211573e-06,
"loss": 0.0388,
"step": 550
},
{
"epoch": 1.4092071611253196,
"grad_norm": 1.7178382415710014,
"learning_rate": 2.0028807901559027e-06,
"loss": 0.0733,
"step": 551
},
{
"epoch": 1.4117647058823528,
"grad_norm": 1.6002839531485389,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.0803,
"step": 552
},
{
"epoch": 1.4143222506393862,
"grad_norm": 1.692131956326154,
"learning_rate": 1.970821476665051e-06,
"loss": 0.0632,
"step": 553
},
{
"epoch": 1.4168797953964194,
"grad_norm": 1.4170058706311646,
"learning_rate": 1.9548650241554812e-06,
"loss": 0.0543,
"step": 554
},
{
"epoch": 1.4194373401534528,
"grad_norm": 1.58559188713565,
"learning_rate": 1.9389577181015496e-06,
"loss": 0.0492,
"step": 555
},
{
"epoch": 1.421994884910486,
"grad_norm": 1.3463742170851551,
"learning_rate": 1.923099815236608e-06,
"loss": 0.0518,
"step": 556
},
{
"epoch": 1.4245524296675192,
"grad_norm": 1.258601610278906,
"learning_rate": 1.9072915714966761e-06,
"loss": 0.0517,
"step": 557
},
{
"epoch": 1.4271099744245523,
"grad_norm": 1.7090524252971389,
"learning_rate": 1.8915332420163074e-06,
"loss": 0.0528,
"step": 558
},
{
"epoch": 1.4296675191815857,
"grad_norm": 2.002866251375377,
"learning_rate": 1.8758250811244682e-06,
"loss": 0.0528,
"step": 559
},
{
"epoch": 1.432225063938619,
"grad_norm": 1.412125761127525,
"learning_rate": 1.8601673423404449e-06,
"loss": 0.0627,
"step": 560
},
{
"epoch": 1.434782608695652,
"grad_norm": 2.1658626868049535,
"learning_rate": 1.8445602783697375e-06,
"loss": 0.0779,
"step": 561
},
{
"epoch": 1.4373401534526855,
"grad_norm": 1.748655089068542,
"learning_rate": 1.8290041410999893e-06,
"loss": 0.068,
"step": 562
},
{
"epoch": 1.4398976982097187,
"grad_norm": 1.4799736457543835,
"learning_rate": 1.8134991815969238e-06,
"loss": 0.0654,
"step": 563
},
{
"epoch": 1.4424552429667519,
"grad_norm": 1.759726051902621,
"learning_rate": 1.798045650100289e-06,
"loss": 0.0658,
"step": 564
},
{
"epoch": 1.445012787723785,
"grad_norm": 1.937901480013661,
"learning_rate": 1.782643796019814e-06,
"loss": 0.0698,
"step": 565
},
{
"epoch": 1.4475703324808185,
"grad_norm": 1.8837065847513317,
"learning_rate": 1.7672938679311957e-06,
"loss": 0.0873,
"step": 566
},
{
"epoch": 1.4501278772378516,
"grad_norm": 1.9638583603591262,
"learning_rate": 1.7519961135720737e-06,
"loss": 0.0832,
"step": 567
},
{
"epoch": 1.452685421994885,
"grad_norm": 1.7512819835219864,
"learning_rate": 1.736750779838044e-06,
"loss": 0.067,
"step": 568
},
{
"epoch": 1.4552429667519182,
"grad_norm": 1.3664725992956537,
"learning_rate": 1.7215581127786624e-06,
"loss": 0.0529,
"step": 569
},
{
"epoch": 1.4578005115089514,
"grad_norm": 1.4792338584206843,
"learning_rate": 1.7064183575934856e-06,
"loss": 0.0573,
"step": 570
},
{
"epoch": 1.4603580562659846,
"grad_norm": 1.2900470420904124,
"learning_rate": 1.6913317586281048e-06,
"loss": 0.0592,
"step": 571
},
{
"epoch": 1.4629156010230178,
"grad_norm": 1.875553204960176,
"learning_rate": 1.676298559370202e-06,
"loss": 0.082,
"step": 572
},
{
"epoch": 1.4654731457800512,
"grad_norm": 1.7220714288517276,
"learning_rate": 1.6613190024456293e-06,
"loss": 0.0685,
"step": 573
},
{
"epoch": 1.4680306905370843,
"grad_norm": 1.7924822734158794,
"learning_rate": 1.6463933296144863e-06,
"loss": 0.0645,
"step": 574
},
{
"epoch": 1.4705882352941178,
"grad_norm": 1.86003465133243,
"learning_rate": 1.6315217817672142e-06,
"loss": 0.0768,
"step": 575
},
{
"epoch": 1.473145780051151,
"grad_norm": 1.585398763380168,
"learning_rate": 1.6167045989207185e-06,
"loss": 0.0712,
"step": 576
},
{
"epoch": 1.4757033248081841,
"grad_norm": 1.722372187884546,
"learning_rate": 1.6019420202144853e-06,
"loss": 0.0433,
"step": 577
},
{
"epoch": 1.4782608695652173,
"grad_norm": 1.4746207680749763,
"learning_rate": 1.5872342839067305e-06,
"loss": 0.0498,
"step": 578
},
{
"epoch": 1.4808184143222507,
"grad_norm": 1.3587198954026716,
"learning_rate": 1.5725816273705453e-06,
"loss": 0.0568,
"step": 579
},
{
"epoch": 1.4833759590792839,
"grad_norm": 1.1732740250402258,
"learning_rate": 1.5579842870900746e-06,
"loss": 0.0383,
"step": 580
},
{
"epoch": 1.485933503836317,
"grad_norm": 1.711025296622914,
"learning_rate": 1.5434424986566938e-06,
"loss": 0.0921,
"step": 581
},
{
"epoch": 1.4884910485933505,
"grad_norm": 1.5076078335129486,
"learning_rate": 1.5289564967652033e-06,
"loss": 0.0597,
"step": 582
},
{
"epoch": 1.4910485933503836,
"grad_norm": 1.797113511699182,
"learning_rate": 1.5145265152100574e-06,
"loss": 0.0734,
"step": 583
},
{
"epoch": 1.4936061381074168,
"grad_norm": 1.9528985026557675,
"learning_rate": 1.5001527868815702e-06,
"loss": 0.0957,
"step": 584
},
{
"epoch": 1.49616368286445,
"grad_norm": 2.177500928432752,
"learning_rate": 1.4858355437621663e-06,
"loss": 0.0879,
"step": 585
},
{
"epoch": 1.4987212276214834,
"grad_norm": 1.6347388686414874,
"learning_rate": 1.4715750169226417e-06,
"loss": 0.0702,
"step": 586
},
{
"epoch": 1.5012787723785166,
"grad_norm": 1.7485578039541358,
"learning_rate": 1.457371436518424e-06,
"loss": 0.0709,
"step": 587
},
{
"epoch": 1.50383631713555,
"grad_norm": 1.8238722950404516,
"learning_rate": 1.4432250317858675e-06,
"loss": 0.0657,
"step": 588
},
{
"epoch": 1.5063938618925832,
"grad_norm": 3.4081682641005533,
"learning_rate": 1.4291360310385455e-06,
"loss": 0.0785,
"step": 589
},
{
"epoch": 1.5089514066496164,
"grad_norm": 1.813069192998236,
"learning_rate": 1.4151046616635727e-06,
"loss": 0.0663,
"step": 590
},
{
"epoch": 1.5115089514066495,
"grad_norm": 1.8093217289893018,
"learning_rate": 1.4011311501179287e-06,
"loss": 0.0909,
"step": 591
},
{
"epoch": 1.5140664961636827,
"grad_norm": 1.9482704657873833,
"learning_rate": 1.3872157219248045e-06,
"loss": 0.0658,
"step": 592
},
{
"epoch": 1.5166240409207161,
"grad_norm": 1.5589428403729417,
"learning_rate": 1.373358601669973e-06,
"loss": 0.0478,
"step": 593
},
{
"epoch": 1.5191815856777495,
"grad_norm": 1.1245451969788118,
"learning_rate": 1.3595600129981469e-06,
"loss": 0.0415,
"step": 594
},
{
"epoch": 1.5217391304347827,
"grad_norm": 1.0485996987613,
"learning_rate": 1.3458201786093795e-06,
"loss": 0.0416,
"step": 595
},
{
"epoch": 1.5242966751918159,
"grad_norm": 2.425494769380029,
"learning_rate": 1.3321393202554739e-06,
"loss": 0.0928,
"step": 596
},
{
"epoch": 1.526854219948849,
"grad_norm": 1.9568801962892155,
"learning_rate": 1.3185176587363919e-06,
"loss": 0.0838,
"step": 597
},
{
"epoch": 1.5294117647058822,
"grad_norm": 1.7500315202188572,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.0499,
"step": 598
},
{
"epoch": 1.5319693094629157,
"grad_norm": 1.9300395352394735,
"learning_rate": 1.2914528046220332e-06,
"loss": 0.074,
"step": 599
},
{
"epoch": 1.5345268542199488,
"grad_norm": 1.0926807919484591,
"learning_rate": 1.278010048835523e-06,
"loss": 0.0328,
"step": 600
},
{
"epoch": 1.5370843989769822,
"grad_norm": 1.4162894586733985,
"learning_rate": 1.2646273634943195e-06,
"loss": 0.0696,
"step": 601
},
{
"epoch": 1.5396419437340154,
"grad_norm": 1.7261841506252762,
"learning_rate": 1.2513049645860759e-06,
"loss": 0.0594,
"step": 602
},
{
"epoch": 1.5421994884910486,
"grad_norm": 1.6070582499731865,
"learning_rate": 1.2380430671254618e-06,
"loss": 0.055,
"step": 603
},
{
"epoch": 1.5447570332480818,
"grad_norm": 1.556977293191578,
"learning_rate": 1.224841885150691e-06,
"loss": 0.0532,
"step": 604
},
{
"epoch": 1.547314578005115,
"grad_norm": 1.9310463068711319,
"learning_rate": 1.2117016317200702e-06,
"loss": 0.0932,
"step": 605
},
{
"epoch": 1.5498721227621484,
"grad_norm": 1.4350515399793062,
"learning_rate": 1.1986225189085627e-06,
"loss": 0.0577,
"step": 606
},
{
"epoch": 1.5524296675191815,
"grad_norm": 1.9134498925809693,
"learning_rate": 1.185604757804359e-06,
"loss": 0.0675,
"step": 607
},
{
"epoch": 1.554987212276215,
"grad_norm": 1.9607164063866107,
"learning_rate": 1.172648558505477e-06,
"loss": 0.0622,
"step": 608
},
{
"epoch": 1.5575447570332481,
"grad_norm": 1.4616026782036957,
"learning_rate": 1.1597541301163655e-06,
"loss": 0.0553,
"step": 609
},
{
"epoch": 1.5601023017902813,
"grad_norm": 1.5571131885670004,
"learning_rate": 1.1469216807445348e-06,
"loss": 0.0521,
"step": 610
},
{
"epoch": 1.5626598465473145,
"grad_norm": 1.7401879912173752,
"learning_rate": 1.1341514174971907e-06,
"loss": 0.0662,
"step": 611
},
{
"epoch": 1.5652173913043477,
"grad_norm": 1.8564899739837128,
"learning_rate": 1.1214435464779006e-06,
"loss": 0.1065,
"step": 612
},
{
"epoch": 1.567774936061381,
"grad_norm": 1.6935400217458705,
"learning_rate": 1.1087982727832613e-06,
"loss": 0.0899,
"step": 613
},
{
"epoch": 1.5703324808184145,
"grad_norm": 1.8949613358675155,
"learning_rate": 1.0962158004995893e-06,
"loss": 0.0987,
"step": 614
},
{
"epoch": 1.5728900255754477,
"grad_norm": 1.3271033110720252,
"learning_rate": 1.083696332699628e-06,
"loss": 0.0447,
"step": 615
},
{
"epoch": 1.5754475703324808,
"grad_norm": 1.0282720447492992,
"learning_rate": 1.0712400714392723e-06,
"loss": 0.0364,
"step": 616
},
{
"epoch": 1.578005115089514,
"grad_norm": 1.7054143756676137,
"learning_rate": 1.058847217754303e-06,
"loss": 0.0545,
"step": 617
},
{
"epoch": 1.5805626598465472,
"grad_norm": 2.7824759917467836,
"learning_rate": 1.0465179716571467e-06,
"loss": 0.0394,
"step": 618
},
{
"epoch": 1.5831202046035806,
"grad_norm": 1.4507251252027922,
"learning_rate": 1.034252532133646e-06,
"loss": 0.0551,
"step": 619
},
{
"epoch": 1.5856777493606138,
"grad_norm": 1.403863371004816,
"learning_rate": 1.0220510971398473e-06,
"loss": 0.058,
"step": 620
},
{
"epoch": 1.5882352941176472,
"grad_norm": 2.0464027459211693,
"learning_rate": 1.0099138635988026e-06,
"loss": 0.0729,
"step": 621
},
{
"epoch": 1.5907928388746804,
"grad_norm": 1.5187023072209234,
"learning_rate": 9.978410273974015e-07,
"loss": 0.0541,
"step": 622
},
{
"epoch": 1.5933503836317136,
"grad_norm": 1.9867877118323218,
"learning_rate": 9.858327833832004e-07,
"loss": 0.0654,
"step": 623
},
{
"epoch": 1.5959079283887467,
"grad_norm": 1.4942485190173453,
"learning_rate": 9.738893253612808e-07,
"loss": 0.0616,
"step": 624
},
{
"epoch": 1.59846547314578,
"grad_norm": 1.6023660849896257,
"learning_rate": 9.620108460911181e-07,
"loss": 0.0599,
"step": 625
},
{
"epoch": 1.6010230179028133,
"grad_norm": 1.3308092300012233,
"learning_rate": 9.50197537283481e-07,
"loss": 0.0494,
"step": 626
},
{
"epoch": 1.6035805626598465,
"grad_norm": 1.6437726206607308,
"learning_rate": 9.384495895973227e-07,
"loss": 0.0689,
"step": 627
},
{
"epoch": 1.60613810741688,
"grad_norm": 1.8936628687028338,
"learning_rate": 9.267671926367166e-07,
"loss": 0.0705,
"step": 628
},
{
"epoch": 1.608695652173913,
"grad_norm": 1.693700691427945,
"learning_rate": 9.151505349477901e-07,
"loss": 0.0759,
"step": 629
},
{
"epoch": 1.6112531969309463,
"grad_norm": 1.6481559811475006,
"learning_rate": 9.035998040156801e-07,
"loss": 0.0681,
"step": 630
},
{
"epoch": 1.6138107416879794,
"grad_norm": 1.4547018046696156,
"learning_rate": 8.921151862615091e-07,
"loss": 0.0504,
"step": 631
},
{
"epoch": 1.6163682864450126,
"grad_norm": 2.2460334650503064,
"learning_rate": 8.806968670393801e-07,
"loss": 0.0972,
"step": 632
},
{
"epoch": 1.618925831202046,
"grad_norm": 1.3530428469383553,
"learning_rate": 8.693450306333818e-07,
"loss": 0.0589,
"step": 633
},
{
"epoch": 1.6214833759590794,
"grad_norm": 2.09210545875247,
"learning_rate": 8.580598602546109e-07,
"loss": 0.0852,
"step": 634
},
{
"epoch": 1.6240409207161126,
"grad_norm": 1.3820789409264775,
"learning_rate": 8.4684153803822e-07,
"loss": 0.0439,
"step": 635
},
{
"epoch": 1.6265984654731458,
"grad_norm": 1.451999661618841,
"learning_rate": 8.356902450404792e-07,
"loss": 0.0509,
"step": 636
},
{
"epoch": 1.629156010230179,
"grad_norm": 1.3950789498456047,
"learning_rate": 8.246061612358475e-07,
"loss": 0.0573,
"step": 637
},
{
"epoch": 1.6317135549872122,
"grad_norm": 1.5996434417326504,
"learning_rate": 8.135894655140758e-07,
"loss": 0.0626,
"step": 638
},
{
"epoch": 1.6342710997442456,
"grad_norm": 1.8147870347577688,
"learning_rate": 8.026403356773161e-07,
"loss": 0.0779,
"step": 639
},
{
"epoch": 1.6368286445012787,
"grad_norm": 1.9345750885275885,
"learning_rate": 7.91758948437249e-07,
"loss": 0.0654,
"step": 640
},
{
"epoch": 1.6393861892583121,
"grad_norm": 1.4948901027842707,
"learning_rate": 7.809454794122346e-07,
"loss": 0.064,
"step": 641
},
{
"epoch": 1.6419437340153453,
"grad_norm": 1.9598445552473587,
"learning_rate": 7.702001031244816e-07,
"loss": 0.0633,
"step": 642
},
{
"epoch": 1.6445012787723785,
"grad_norm": 1.3819361805500896,
"learning_rate": 7.595229929972253e-07,
"loss": 0.0677,
"step": 643
},
{
"epoch": 1.6470588235294117,
"grad_norm": 1.2518200685718097,
"learning_rate": 7.489143213519301e-07,
"loss": 0.0365,
"step": 644
},
{
"epoch": 1.6496163682864449,
"grad_norm": 1.2701439132468373,
"learning_rate": 7.383742594055077e-07,
"loss": 0.0546,
"step": 645
},
{
"epoch": 1.6521739130434783,
"grad_norm": 1.7913524756425139,
"learning_rate": 7.279029772675572e-07,
"loss": 0.0726,
"step": 646
},
{
"epoch": 1.6547314578005117,
"grad_norm": 1.6279279703395964,
"learning_rate": 7.17500643937617e-07,
"loss": 0.0496,
"step": 647
},
{
"epoch": 1.6572890025575449,
"grad_norm": 1.6036454756063958,
"learning_rate": 7.071674273024353e-07,
"loss": 0.0604,
"step": 648
},
{
"epoch": 1.659846547314578,
"grad_norm": 2.2697032372331827,
"learning_rate": 6.969034941332664e-07,
"loss": 0.1019,
"step": 649
},
{
"epoch": 1.6624040920716112,
"grad_norm": 1.6139143967181218,
"learning_rate": 6.86709010083172e-07,
"loss": 0.0417,
"step": 650
},
{
"epoch": 1.6649616368286444,
"grad_norm": 1.7145079712414455,
"learning_rate": 6.765841396843514e-07,
"loss": 0.0635,
"step": 651
},
{
"epoch": 1.6675191815856778,
"grad_norm": 1.7988272014623832,
"learning_rate": 6.665290463454882e-07,
"loss": 0.0709,
"step": 652
},
{
"epoch": 1.670076726342711,
"grad_norm": 2.2968330630918956,
"learning_rate": 6.565438923491102e-07,
"loss": 0.0872,
"step": 653
},
{
"epoch": 1.6726342710997444,
"grad_norm": 1.6427468344680394,
"learning_rate": 6.466288388489689e-07,
"loss": 0.074,
"step": 654
},
{
"epoch": 1.6751918158567776,
"grad_norm": 1.49253726984557,
"learning_rate": 6.367840458674401e-07,
"loss": 0.0656,
"step": 655
},
{
"epoch": 1.6777493606138107,
"grad_norm": 1.3826162361150278,
"learning_rate": 6.270096722929442e-07,
"loss": 0.0432,
"step": 656
},
{
"epoch": 1.680306905370844,
"grad_norm": 1.6151703090317941,
"learning_rate": 6.173058758773775e-07,
"loss": 0.0671,
"step": 657
},
{
"epoch": 1.682864450127877,
"grad_norm": 1.3594356593102335,
"learning_rate": 6.076728132335669e-07,
"loss": 0.049,
"step": 658
},
{
"epoch": 1.6854219948849105,
"grad_norm": 1.841090424763904,
"learning_rate": 5.981106398327463e-07,
"loss": 0.0746,
"step": 659
},
{
"epoch": 1.6879795396419437,
"grad_norm": 1.8813485121032132,
"learning_rate": 5.886195100020408e-07,
"loss": 0.0877,
"step": 660
},
{
"epoch": 1.690537084398977,
"grad_norm": 1.8959291749076537,
"learning_rate": 5.7919957692198e-07,
"loss": 0.0643,
"step": 661
},
{
"epoch": 1.6930946291560103,
"grad_norm": 1.4397051152119766,
"learning_rate": 5.698509926240275e-07,
"loss": 0.0672,
"step": 662
},
{
"epoch": 1.6956521739130435,
"grad_norm": 1.6352253687560279,
"learning_rate": 5.60573907988124e-07,
"loss": 0.0704,
"step": 663
},
{
"epoch": 1.6982097186700766,
"grad_norm": 1.2272314127587733,
"learning_rate": 5.513684727402529e-07,
"loss": 0.0419,
"step": 664
},
{
"epoch": 1.7007672634271098,
"grad_norm": 1.2859127827275485,
"learning_rate": 5.422348354500217e-07,
"loss": 0.0482,
"step": 665
},
{
"epoch": 1.7033248081841432,
"grad_norm": 2.3723626704904075,
"learning_rate": 5.331731435282705e-07,
"loss": 0.0711,
"step": 666
},
{
"epoch": 1.7058823529411766,
"grad_norm": 1.5034836312998365,
"learning_rate": 5.241835432246888e-07,
"loss": 0.0505,
"step": 667
},
{
"epoch": 1.7084398976982098,
"grad_norm": 1.4998179842301858,
"learning_rate": 5.152661796254505e-07,
"loss": 0.0428,
"step": 668
},
{
"epoch": 1.710997442455243,
"grad_norm": 1.884454942561692,
"learning_rate": 5.064211966508837e-07,
"loss": 0.0575,
"step": 669
},
{
"epoch": 1.7135549872122762,
"grad_norm": 1.8377467328078436,
"learning_rate": 4.976487370531352e-07,
"loss": 0.0699,
"step": 670
},
{
"epoch": 1.7161125319693094,
"grad_norm": 1.7964279986811604,
"learning_rate": 4.88948942413876e-07,
"loss": 0.0804,
"step": 671
},
{
"epoch": 1.7186700767263428,
"grad_norm": 1.7189535141584609,
"learning_rate": 4.803219531420128e-07,
"loss": 0.0564,
"step": 672
},
{
"epoch": 1.721227621483376,
"grad_norm": 1.5505230731351738,
"learning_rate": 4.717679084714222e-07,
"loss": 0.05,
"step": 673
},
{
"epoch": 1.7237851662404093,
"grad_norm": 1.4214080005147414,
"learning_rate": 4.6328694645870254e-07,
"loss": 0.0644,
"step": 674
},
{
"epoch": 1.7263427109974425,
"grad_norm": 1.8241456107542313,
"learning_rate": 4.5487920398094465e-07,
"loss": 0.0679,
"step": 675
},
{
"epoch": 1.7289002557544757,
"grad_norm": 1.3282311854023192,
"learning_rate": 4.46544816733529e-07,
"loss": 0.0503,
"step": 676
},
{
"epoch": 1.7314578005115089,
"grad_norm": 1.4122224176940064,
"learning_rate": 4.382839192279303e-07,
"loss": 0.0627,
"step": 677
},
{
"epoch": 1.734015345268542,
"grad_norm": 1.5345073928364736,
"learning_rate": 4.3009664478954384e-07,
"loss": 0.0468,
"step": 678
},
{
"epoch": 1.7365728900255755,
"grad_norm": 1.9191873098525403,
"learning_rate": 4.219831255555423e-07,
"loss": 0.0679,
"step": 679
},
{
"epoch": 1.7391304347826086,
"grad_norm": 1.4130394259241825,
"learning_rate": 4.139434924727359e-07,
"loss": 0.0372,
"step": 680
},
{
"epoch": 1.741687979539642,
"grad_norm": 1.6440930896633388,
"learning_rate": 4.059778752954607e-07,
"loss": 0.0588,
"step": 681
},
{
"epoch": 1.7442455242966752,
"grad_norm": 1.6483657156936842,
"learning_rate": 3.9808640258348686e-07,
"loss": 0.0554,
"step": 682
},
{
"epoch": 1.7468030690537084,
"grad_norm": 1.600191039272227,
"learning_rate": 3.9026920169994374e-07,
"loss": 0.0651,
"step": 683
},
{
"epoch": 1.7493606138107416,
"grad_norm": 1.6033515242953238,
"learning_rate": 3.825263988092587e-07,
"loss": 0.0631,
"step": 684
},
{
"epoch": 1.7519181585677748,
"grad_norm": 1.412635066629858,
"learning_rate": 3.7485811887512714e-07,
"loss": 0.0572,
"step": 685
},
{
"epoch": 1.7544757033248082,
"grad_norm": 1.7343917829704119,
"learning_rate": 3.672644856584928e-07,
"loss": 0.0653,
"step": 686
},
{
"epoch": 1.7570332480818416,
"grad_norm": 2.2332503744651544,
"learning_rate": 3.597456217155526e-07,
"loss": 0.0525,
"step": 687
},
{
"epoch": 1.7595907928388748,
"grad_norm": 1.559262559831483,
"learning_rate": 3.523016483957742e-07,
"loss": 0.0695,
"step": 688
},
{
"epoch": 1.762148337595908,
"grad_norm": 1.3735832247322428,
"learning_rate": 3.4493268583994434e-07,
"loss": 0.0514,
"step": 689
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1.1390421075670185,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.026,
"step": 690
},
{
"epoch": 1.7672634271099743,
"grad_norm": 1.6138957382346333,
"learning_rate": 3.3042026752822254e-07,
"loss": 0.0727,
"step": 691
},
{
"epoch": 1.7698209718670077,
"grad_norm": 1.637531444373419,
"learning_rate": 3.2327704599312283e-07,
"loss": 0.0615,
"step": 692
},
{
"epoch": 1.772378516624041,
"grad_norm": 1.7910759929217885,
"learning_rate": 3.16209303659773e-07,
"loss": 0.062,
"step": 693
},
{
"epoch": 1.7749360613810743,
"grad_norm": 1.5204492548400286,
"learning_rate": 3.0921715459683753e-07,
"loss": 0.048,
"step": 694
},
{
"epoch": 1.7774936061381075,
"grad_norm": 1.623940846315055,
"learning_rate": 3.0230071165295804e-07,
"loss": 0.0518,
"step": 695
},
{
"epoch": 1.7800511508951407,
"grad_norm": 1.7492311424019007,
"learning_rate": 2.95460086454929e-07,
"loss": 0.0599,
"step": 696
},
{
"epoch": 1.7826086956521738,
"grad_norm": 1.8374659643706557,
"learning_rate": 2.88695389405898e-07,
"loss": 0.0682,
"step": 697
},
{
"epoch": 1.785166240409207,
"grad_norm": 1.291638322992873,
"learning_rate": 2.820067296835799e-07,
"loss": 0.0459,
"step": 698
},
{
"epoch": 1.7877237851662404,
"grad_norm": 1.6932187157901324,
"learning_rate": 2.753942152385014e-07,
"loss": 0.0568,
"step": 699
},
{
"epoch": 1.7902813299232738,
"grad_norm": 1.5799764190543009,
"learning_rate": 2.688579527922514e-07,
"loss": 0.0474,
"step": 700
},
{
"epoch": 1.792838874680307,
"grad_norm": 1.506629182713951,
"learning_rate": 2.6239804783576294e-07,
"loss": 0.0557,
"step": 701
},
{
"epoch": 1.7953964194373402,
"grad_norm": 1.9630080012456346,
"learning_rate": 2.560146046276135e-07,
"loss": 0.0818,
"step": 702
},
{
"epoch": 1.7979539641943734,
"grad_norm": 1.6836710184467403,
"learning_rate": 2.4970772619233475e-07,
"loss": 0.0743,
"step": 703
},
{
"epoch": 1.8005115089514065,
"grad_norm": 2.0916698780327025,
"learning_rate": 2.4347751431875453e-07,
"loss": 0.0797,
"step": 704
},
{
"epoch": 1.80306905370844,
"grad_norm": 1.7885774176909262,
"learning_rate": 2.373240695583534e-07,
"loss": 0.0772,
"step": 705
},
{
"epoch": 1.8056265984654731,
"grad_norm": 1.4871829837803723,
"learning_rate": 2.3124749122364286e-07,
"loss": 0.064,
"step": 706
},
{
"epoch": 1.8081841432225065,
"grad_norm": 1.5766080256272554,
"learning_rate": 2.2524787738656073e-07,
"loss": 0.0584,
"step": 707
},
{
"epoch": 1.8107416879795397,
"grad_norm": 1.6532530455773482,
"learning_rate": 2.1932532487688784e-07,
"loss": 0.0528,
"step": 708
},
{
"epoch": 1.813299232736573,
"grad_norm": 1.4446085749985063,
"learning_rate": 2.1347992928068884e-07,
"loss": 0.0541,
"step": 709
},
{
"epoch": 1.815856777493606,
"grad_norm": 0.975992197968072,
"learning_rate": 2.0771178493876387e-07,
"loss": 0.03,
"step": 710
},
{
"epoch": 1.8184143222506393,
"grad_norm": 1.870335934306222,
"learning_rate": 2.0202098494513157e-07,
"loss": 0.0782,
"step": 711
},
{
"epoch": 1.8209718670076727,
"grad_norm": 1.5422113588288993,
"learning_rate": 1.964076211455246e-07,
"loss": 0.0528,
"step": 712
},
{
"epoch": 1.8235294117647058,
"grad_norm": 1.805338069694364,
"learning_rate": 1.908717841359048e-07,
"loss": 0.0602,
"step": 713
},
{
"epoch": 1.8260869565217392,
"grad_norm": 1.7608744884671308,
"learning_rate": 1.8541356326100436e-07,
"loss": 0.0512,
"step": 714
},
{
"epoch": 1.8286445012787724,
"grad_norm": 1.7866867532191502,
"learning_rate": 1.800330466128808e-07,
"loss": 0.0685,
"step": 715
},
{
"epoch": 1.8312020460358056,
"grad_norm": 1.2787390793668814,
"learning_rate": 1.7473032102949983e-07,
"loss": 0.0427,
"step": 716
},
{
"epoch": 1.8337595907928388,
"grad_norm": 2.347960248720825,
"learning_rate": 1.695054720933309e-07,
"loss": 0.0937,
"step": 717
},
{
"epoch": 1.836317135549872,
"grad_norm": 1.830326258197173,
"learning_rate": 1.6435858412996275e-07,
"loss": 0.0752,
"step": 718
},
{
"epoch": 1.8388746803069054,
"grad_norm": 2.0209211652014583,
"learning_rate": 1.5928974020674947e-07,
"loss": 0.0645,
"step": 719
},
{
"epoch": 1.8414322250639388,
"grad_norm": 1.483717555265772,
"learning_rate": 1.542990221314644e-07,
"loss": 0.0429,
"step": 720
},
{
"epoch": 1.843989769820972,
"grad_norm": 1.6080693955603764,
"learning_rate": 1.4938651045098174e-07,
"loss": 0.0402,
"step": 721
},
{
"epoch": 1.8465473145780051,
"grad_norm": 1.6364487443092068,
"learning_rate": 1.445522844499775e-07,
"loss": 0.0515,
"step": 722
},
{
"epoch": 1.8491048593350383,
"grad_norm": 2.2286488872159835,
"learning_rate": 1.3979642214964728e-07,
"loss": 0.0792,
"step": 723
},
{
"epoch": 1.8516624040920715,
"grad_norm": 1.8358051982313959,
"learning_rate": 1.3511900030644954e-07,
"loss": 0.0718,
"step": 724
},
{
"epoch": 1.854219948849105,
"grad_norm": 3.327820933989785,
"learning_rate": 1.3052009441086533e-07,
"loss": 0.0665,
"step": 725
},
{
"epoch": 1.856777493606138,
"grad_norm": 1.54562580037595,
"learning_rate": 1.2599977868618052e-07,
"loss": 0.0488,
"step": 726
},
{
"epoch": 1.8593350383631715,
"grad_norm": 1.768894170651948,
"learning_rate": 1.215581260872889e-07,
"loss": 0.0727,
"step": 727
},
{
"epoch": 1.8618925831202047,
"grad_norm": 1.1097850346664122,
"learning_rate": 1.1719520829951203e-07,
"loss": 0.0361,
"step": 728
},
{
"epoch": 1.8644501278772379,
"grad_norm": 1.4290151454014999,
"learning_rate": 1.1291109573744574e-07,
"loss": 0.0378,
"step": 729
},
{
"epoch": 1.867007672634271,
"grad_norm": 1.8281387881477926,
"learning_rate": 1.087058575438199e-07,
"loss": 0.0734,
"step": 730
},
{
"epoch": 1.8695652173913042,
"grad_norm": 1.4987669648542046,
"learning_rate": 1.0457956158838545e-07,
"loss": 0.0649,
"step": 731
},
{
"epoch": 1.8721227621483376,
"grad_norm": 1.2852375949245314,
"learning_rate": 1.0053227446681912e-07,
"loss": 0.0502,
"step": 732
},
{
"epoch": 1.8746803069053708,
"grad_norm": 1.819977919497317,
"learning_rate": 9.656406149964548e-08,
"loss": 0.0784,
"step": 733
},
{
"epoch": 1.8772378516624042,
"grad_norm": 1.668856802723362,
"learning_rate": 9.267498673118547e-08,
"loss": 0.0702,
"step": 734
},
{
"epoch": 1.8797953964194374,
"grad_norm": 1.5831651881175812,
"learning_rate": 8.886511292852395e-08,
"loss": 0.0635,
"step": 735
},
{
"epoch": 1.8823529411764706,
"grad_norm": 1.690650435572024,
"learning_rate": 8.513450158049109e-08,
"loss": 0.0664,
"step": 736
},
{
"epoch": 1.8849104859335037,
"grad_norm": 1.5261545346775505,
"learning_rate": 8.148321289667749e-08,
"loss": 0.0562,
"step": 737
},
{
"epoch": 1.887468030690537,
"grad_norm": 1.1138146339084107,
"learning_rate": 7.791130580645623e-08,
"loss": 0.0414,
"step": 738
},
{
"epoch": 1.8900255754475703,
"grad_norm": 1.5520967732615027,
"learning_rate": 7.441883795803462e-08,
"loss": 0.0551,
"step": 739
},
{
"epoch": 1.8925831202046037,
"grad_norm": 1.5613125152011387,
"learning_rate": 7.100586571752444e-08,
"loss": 0.0384,
"step": 740
},
{
"epoch": 1.895140664961637,
"grad_norm": 1.316176028402755,
"learning_rate": 6.767244416802988e-08,
"loss": 0.0455,
"step": 741
},
{
"epoch": 1.89769820971867,
"grad_norm": 1.6995534435523871,
"learning_rate": 6.441862710876102e-08,
"loss": 0.0664,
"step": 742
},
{
"epoch": 1.9002557544757033,
"grad_norm": 1.4435916963264959,
"learning_rate": 6.124446705416343e-08,
"loss": 0.0433,
"step": 743
},
{
"epoch": 1.9028132992327365,
"grad_norm": 1.3955512049918404,
"learning_rate": 5.815001523307162e-08,
"loss": 0.0523,
"step": 744
},
{
"epoch": 1.9053708439897699,
"grad_norm": 1.5549298687122213,
"learning_rate": 5.513532158788193e-08,
"loss": 0.0632,
"step": 745
},
{
"epoch": 1.907928388746803,
"grad_norm": 2.0565454206759366,
"learning_rate": 5.220043477374759e-08,
"loss": 0.0906,
"step": 746
},
{
"epoch": 1.9104859335038364,
"grad_norm": 1.781632120564299,
"learning_rate": 4.934540215779271e-08,
"loss": 0.0575,
"step": 747
},
{
"epoch": 1.9130434782608696,
"grad_norm": 1.7209181847754342,
"learning_rate": 4.657026981834623e-08,
"loss": 0.0758,
"step": 748
},
{
"epoch": 1.9156010230179028,
"grad_norm": 1.5843098732810394,
"learning_rate": 4.3875082544201364e-08,
"loss": 0.0652,
"step": 749
},
{
"epoch": 1.918158567774936,
"grad_norm": 1.694771633678275,
"learning_rate": 4.125988383388957e-08,
"loss": 0.0661,
"step": 750
},
{
"epoch": 1.9207161125319692,
"grad_norm": 1.5171207087239054,
"learning_rate": 3.87247158949805e-08,
"loss": 0.0498,
"step": 751
},
{
"epoch": 1.9232736572890026,
"grad_norm": 1.5041197496139775,
"learning_rate": 3.626961964340203e-08,
"loss": 0.0687,
"step": 752
},
{
"epoch": 1.9258312020460358,
"grad_norm": 1.901653590307097,
"learning_rate": 3.389463470277576e-08,
"loss": 0.0807,
"step": 753
},
{
"epoch": 1.9283887468030692,
"grad_norm": 2.073140635948293,
"learning_rate": 3.159979940378088e-08,
"loss": 0.0718,
"step": 754
},
{
"epoch": 1.9309462915601023,
"grad_norm": 1.92991294935061,
"learning_rate": 2.938515078353521e-08,
"loss": 0.0612,
"step": 755
},
{
"epoch": 1.9335038363171355,
"grad_norm": 1.4209233459544497,
"learning_rate": 2.725072458499567e-08,
"loss": 0.0598,
"step": 756
},
{
"epoch": 1.9360613810741687,
"grad_norm": 2.270158801782062,
"learning_rate": 2.519655525638376e-08,
"loss": 0.1167,
"step": 757
},
{
"epoch": 1.938618925831202,
"grad_norm": 1.7764725333130844,
"learning_rate": 2.3222675950627106e-08,
"loss": 0.0493,
"step": 758
},
{
"epoch": 1.9411764705882353,
"grad_norm": 1.3953632649829264,
"learning_rate": 2.1329118524827662e-08,
"loss": 0.0486,
"step": 759
},
{
"epoch": 1.9437340153452687,
"grad_norm": 1.5719732312743544,
"learning_rate": 1.9515913539743247e-08,
"loss": 0.057,
"step": 760
},
{
"epoch": 1.9462915601023019,
"grad_norm": 1.2771960619657408,
"learning_rate": 1.7783090259297918e-08,
"loss": 0.0521,
"step": 761
},
{
"epoch": 1.948849104859335,
"grad_norm": 1.5534397400552928,
"learning_rate": 1.613067665010959e-08,
"loss": 0.0479,
"step": 762
},
{
"epoch": 1.9514066496163682,
"grad_norm": 1.9209034504241524,
"learning_rate": 1.4558699381034825e-08,
"loss": 0.0744,
"step": 763
},
{
"epoch": 1.9539641943734014,
"grad_norm": 1.6535913379119196,
"learning_rate": 1.3067183822742525e-08,
"loss": 0.0505,
"step": 764
},
{
"epoch": 1.9565217391304348,
"grad_norm": 1.3997237372409204,
"learning_rate": 1.1656154047303691e-08,
"loss": 0.0364,
"step": 765
},
{
"epoch": 1.959079283887468,
"grad_norm": 2.374063223958248,
"learning_rate": 1.0325632827801745e-08,
"loss": 0.068,
"step": 766
},
{
"epoch": 1.9616368286445014,
"grad_norm": 1.8265000509614693,
"learning_rate": 9.075641637964483e-09,
"loss": 0.0549,
"step": 767
},
{
"epoch": 1.9641943734015346,
"grad_norm": 1.6206955235826634,
"learning_rate": 7.906200651819907e-09,
"loss": 0.045,
"step": 768
},
{
"epoch": 1.9667519181585678,
"grad_norm": 1.7353163027983434,
"learning_rate": 6.817328743368712e-09,
"loss": 0.0597,
"step": 769
},
{
"epoch": 1.969309462915601,
"grad_norm": 1.7398008950469799,
"learning_rate": 5.809043486279531e-09,
"loss": 0.0822,
"step": 770
},
{
"epoch": 1.9718670076726341,
"grad_norm": 1.75495701677338,
"learning_rate": 4.881361153606934e-09,
"loss": 0.0518,
"step": 771
},
{
"epoch": 1.9744245524296675,
"grad_norm": 1.4225600685776296,
"learning_rate": 4.034296717527752e-09,
"loss": 0.0598,
"step": 772
},
{
"epoch": 1.976982097186701,
"grad_norm": 1.5449322680878437,
"learning_rate": 3.2678638490996064e-09,
"loss": 0.0419,
"step": 773
},
{
"epoch": 1.979539641943734,
"grad_norm": 1.5752118407330762,
"learning_rate": 2.5820749180388573e-09,
"loss": 0.069,
"step": 774
},
{
"epoch": 1.9820971867007673,
"grad_norm": 1.6779528904252332,
"learning_rate": 1.976940992523546e-09,
"loss": 0.0623,
"step": 775
},
{
"epoch": 1.9846547314578005,
"grad_norm": 2.1188662150836888,
"learning_rate": 1.4524718390140913e-09,
"loss": 0.0973,
"step": 776
},
{
"epoch": 1.9872122762148337,
"grad_norm": 1.336481358589082,
"learning_rate": 1.0086759220934162e-09,
"loss": 0.0532,
"step": 777
},
{
"epoch": 1.989769820971867,
"grad_norm": 1.4512712789745728,
"learning_rate": 6.455604043331676e-10,
"loss": 0.0552,
"step": 778
},
{
"epoch": 1.9923273657289002,
"grad_norm": 1.0684958274484193,
"learning_rate": 3.631311461765874e-10,
"loss": 0.0324,
"step": 779
},
{
"epoch": 1.9948849104859336,
"grad_norm": 2.7803294897735515,
"learning_rate": 1.6139270584358823e-10,
"loss": 0.076,
"step": 780
},
{
"epoch": 1.9974424552429668,
"grad_norm": 1.7051238941938596,
"learning_rate": 4.034833925969928e-11,
"loss": 0.0641,
"step": 781
},
{
"epoch": 2.0,
"grad_norm": 1.9818075119000704,
"learning_rate": 0.0,
"loss": 0.0691,
"step": 782
},
{
"epoch": 2.0,
"step": 782,
"total_flos": 4414597447680.0,
"train_loss": 0.1485485090061908,
"train_runtime": 1713.8206,
"train_samples_per_second": 3.65,
"train_steps_per_second": 0.456
}
],
"logging_steps": 1,
"max_steps": 782,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4414597447680.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}