Figure240 / trainer_state.json
DukeDDrake1999's picture
Upload folder using huggingface_hub
12ab88b verified
Raw
History Blame Contribute Delete
42.1 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.244705882352941,
"eval_steps": 500,
"global_step": 240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009411764705882352,
"grad_norm": 0.8085687756538391,
"learning_rate": 0.0,
"loss": 0.4256,
"step": 1
},
{
"epoch": 0.018823529411764704,
"grad_norm": 1.3872617483139038,
"learning_rate": 4e-05,
"loss": 0.7559,
"step": 2
},
{
"epoch": 0.02823529411764706,
"grad_norm": 1.3308924436569214,
"learning_rate": 8e-05,
"loss": 0.5674,
"step": 3
},
{
"epoch": 0.03764705882352941,
"grad_norm": 0.9933507442474365,
"learning_rate": 0.00012,
"loss": 0.5053,
"step": 4
},
{
"epoch": 0.047058823529411764,
"grad_norm": 0.5066264271736145,
"learning_rate": 0.00016,
"loss": 0.4117,
"step": 5
},
{
"epoch": 0.05647058823529412,
"grad_norm": 0.3941880464553833,
"learning_rate": 0.0002,
"loss": 0.3627,
"step": 6
},
{
"epoch": 0.06588235294117648,
"grad_norm": 0.3703334629535675,
"learning_rate": 0.0001991489361702128,
"loss": 0.2119,
"step": 7
},
{
"epoch": 0.07529411764705882,
"grad_norm": 0.25386443734169006,
"learning_rate": 0.00019829787234042554,
"loss": 0.26,
"step": 8
},
{
"epoch": 0.08470588235294117,
"grad_norm": 0.18589483201503754,
"learning_rate": 0.00019744680851063832,
"loss": 0.2611,
"step": 9
},
{
"epoch": 0.09411764705882353,
"grad_norm": 0.3556481897830963,
"learning_rate": 0.00019659574468085107,
"loss": 0.3863,
"step": 10
},
{
"epoch": 0.10352941176470588,
"grad_norm": 0.18439431488513947,
"learning_rate": 0.00019574468085106384,
"loss": 0.2365,
"step": 11
},
{
"epoch": 0.11294117647058824,
"grad_norm": 0.17448313534259796,
"learning_rate": 0.0001948936170212766,
"loss": 0.2748,
"step": 12
},
{
"epoch": 0.1223529411764706,
"grad_norm": 0.23789159953594208,
"learning_rate": 0.00019404255319148937,
"loss": 0.2422,
"step": 13
},
{
"epoch": 0.13176470588235295,
"grad_norm": 0.19852899014949799,
"learning_rate": 0.00019319148936170212,
"loss": 0.2522,
"step": 14
},
{
"epoch": 0.1411764705882353,
"grad_norm": 0.4207107424736023,
"learning_rate": 0.0001923404255319149,
"loss": 0.2644,
"step": 15
},
{
"epoch": 0.15058823529411763,
"grad_norm": 0.30403465032577515,
"learning_rate": 0.00019148936170212768,
"loss": 0.2582,
"step": 16
},
{
"epoch": 0.16,
"grad_norm": 0.21143439412117004,
"learning_rate": 0.00019063829787234045,
"loss": 0.1965,
"step": 17
},
{
"epoch": 0.16941176470588235,
"grad_norm": 0.3473486304283142,
"learning_rate": 0.0001897872340425532,
"loss": 0.2762,
"step": 18
},
{
"epoch": 0.17882352941176471,
"grad_norm": 0.20471632480621338,
"learning_rate": 0.00018893617021276598,
"loss": 0.2573,
"step": 19
},
{
"epoch": 0.18823529411764706,
"grad_norm": 0.20691461861133575,
"learning_rate": 0.00018808510638297873,
"loss": 0.2012,
"step": 20
},
{
"epoch": 0.1976470588235294,
"grad_norm": 0.2261233776807785,
"learning_rate": 0.0001872340425531915,
"loss": 0.2161,
"step": 21
},
{
"epoch": 0.20705882352941177,
"grad_norm": 0.2860106825828552,
"learning_rate": 0.00018638297872340426,
"loss": 0.2597,
"step": 22
},
{
"epoch": 0.2164705882352941,
"grad_norm": 0.2977927327156067,
"learning_rate": 0.00018553191489361704,
"loss": 0.2822,
"step": 23
},
{
"epoch": 0.22588235294117648,
"grad_norm": 0.323324054479599,
"learning_rate": 0.0001846808510638298,
"loss": 0.1893,
"step": 24
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.28017860651016235,
"learning_rate": 0.00018382978723404257,
"loss": 0.1978,
"step": 25
},
{
"epoch": 0.2447058823529412,
"grad_norm": 0.19440393149852753,
"learning_rate": 0.00018297872340425532,
"loss": 0.1935,
"step": 26
},
{
"epoch": 0.2541176470588235,
"grad_norm": 0.3916899859905243,
"learning_rate": 0.0001821276595744681,
"loss": 0.2655,
"step": 27
},
{
"epoch": 0.2635294117647059,
"grad_norm": 0.27680760622024536,
"learning_rate": 0.00018127659574468084,
"loss": 0.2443,
"step": 28
},
{
"epoch": 0.27294117647058824,
"grad_norm": 0.2137499898672104,
"learning_rate": 0.00018042553191489362,
"loss": 0.2145,
"step": 29
},
{
"epoch": 0.2823529411764706,
"grad_norm": 0.2534061670303345,
"learning_rate": 0.00017957446808510637,
"loss": 0.2295,
"step": 30
},
{
"epoch": 0.2917647058823529,
"grad_norm": 0.22554630041122437,
"learning_rate": 0.00017872340425531915,
"loss": 0.2054,
"step": 31
},
{
"epoch": 0.30117647058823527,
"grad_norm": 0.33873406052589417,
"learning_rate": 0.0001778723404255319,
"loss": 0.2665,
"step": 32
},
{
"epoch": 0.31058823529411766,
"grad_norm": 0.2693963348865509,
"learning_rate": 0.00017702127659574468,
"loss": 0.2128,
"step": 33
},
{
"epoch": 0.32,
"grad_norm": 0.2127694934606552,
"learning_rate": 0.00017617021276595746,
"loss": 0.1437,
"step": 34
},
{
"epoch": 0.32941176470588235,
"grad_norm": 0.3235378861427307,
"learning_rate": 0.00017531914893617023,
"loss": 0.1666,
"step": 35
},
{
"epoch": 0.3388235294117647,
"grad_norm": 0.3119862973690033,
"learning_rate": 0.00017446808510638298,
"loss": 0.2539,
"step": 36
},
{
"epoch": 0.34823529411764703,
"grad_norm": 0.3158186376094818,
"learning_rate": 0.00017361702127659576,
"loss": 0.1993,
"step": 37
},
{
"epoch": 0.35764705882352943,
"grad_norm": 0.23814643919467926,
"learning_rate": 0.0001727659574468085,
"loss": 0.166,
"step": 38
},
{
"epoch": 0.36705882352941177,
"grad_norm": 0.18762339651584625,
"learning_rate": 0.0001719148936170213,
"loss": 0.1638,
"step": 39
},
{
"epoch": 0.3764705882352941,
"grad_norm": 0.21114632487297058,
"learning_rate": 0.00017106382978723404,
"loss": 0.2818,
"step": 40
},
{
"epoch": 0.38588235294117645,
"grad_norm": 0.30701348185539246,
"learning_rate": 0.00017021276595744682,
"loss": 0.1896,
"step": 41
},
{
"epoch": 0.3952941176470588,
"grad_norm": 0.37595638632774353,
"learning_rate": 0.0001693617021276596,
"loss": 0.2778,
"step": 42
},
{
"epoch": 0.4047058823529412,
"grad_norm": 0.19554150104522705,
"learning_rate": 0.00016851063829787235,
"loss": 0.2649,
"step": 43
},
{
"epoch": 0.41411764705882353,
"grad_norm": 0.3296668827533722,
"learning_rate": 0.00016765957446808512,
"loss": 0.2704,
"step": 44
},
{
"epoch": 0.4235294117647059,
"grad_norm": 0.515943169593811,
"learning_rate": 0.00016680851063829787,
"loss": 0.3078,
"step": 45
},
{
"epoch": 0.4329411764705882,
"grad_norm": 0.24951788783073425,
"learning_rate": 0.00016595744680851065,
"loss": 0.2295,
"step": 46
},
{
"epoch": 0.4423529411764706,
"grad_norm": 0.27803778648376465,
"learning_rate": 0.0001651063829787234,
"loss": 0.1961,
"step": 47
},
{
"epoch": 0.45176470588235296,
"grad_norm": 0.1960020810365677,
"learning_rate": 0.00016425531914893618,
"loss": 0.2334,
"step": 48
},
{
"epoch": 0.4611764705882353,
"grad_norm": 0.23366108536720276,
"learning_rate": 0.00016340425531914893,
"loss": 0.157,
"step": 49
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.22643855214118958,
"learning_rate": 0.0001625531914893617,
"loss": 0.1648,
"step": 50
},
{
"epoch": 0.48,
"grad_norm": 0.203178271651268,
"learning_rate": 0.00016170212765957446,
"loss": 0.1274,
"step": 51
},
{
"epoch": 0.4894117647058824,
"grad_norm": 0.33611050248146057,
"learning_rate": 0.00016085106382978726,
"loss": 0.1737,
"step": 52
},
{
"epoch": 0.4988235294117647,
"grad_norm": 0.4413496255874634,
"learning_rate": 0.00016,
"loss": 0.2414,
"step": 53
},
{
"epoch": 0.508235294117647,
"grad_norm": 0.24674558639526367,
"learning_rate": 0.0001591489361702128,
"loss": 0.228,
"step": 54
},
{
"epoch": 0.5176470588235295,
"grad_norm": 0.21306754648685455,
"learning_rate": 0.00015829787234042554,
"loss": 0.1727,
"step": 55
},
{
"epoch": 0.5270588235294118,
"grad_norm": 0.22712849080562592,
"learning_rate": 0.00015744680851063832,
"loss": 0.1612,
"step": 56
},
{
"epoch": 0.5364705882352941,
"grad_norm": 0.23271703720092773,
"learning_rate": 0.00015659574468085107,
"loss": 0.2325,
"step": 57
},
{
"epoch": 0.5458823529411765,
"grad_norm": 0.2999236285686493,
"learning_rate": 0.00015574468085106385,
"loss": 0.1876,
"step": 58
},
{
"epoch": 0.5552941176470588,
"grad_norm": 0.21795004606246948,
"learning_rate": 0.0001548936170212766,
"loss": 0.2159,
"step": 59
},
{
"epoch": 0.5647058823529412,
"grad_norm": 0.3243270516395569,
"learning_rate": 0.00015404255319148937,
"loss": 0.2184,
"step": 60
},
{
"epoch": 0.5741176470588235,
"grad_norm": 0.3677728474140167,
"learning_rate": 0.00015319148936170213,
"loss": 0.202,
"step": 61
},
{
"epoch": 0.5835294117647059,
"grad_norm": 0.3357590138912201,
"learning_rate": 0.0001523404255319149,
"loss": 0.1905,
"step": 62
},
{
"epoch": 0.5929411764705882,
"grad_norm": 0.33323776721954346,
"learning_rate": 0.00015148936170212765,
"loss": 0.2903,
"step": 63
},
{
"epoch": 0.6023529411764705,
"grad_norm": 0.2734071612358093,
"learning_rate": 0.00015063829787234043,
"loss": 0.1627,
"step": 64
},
{
"epoch": 0.611764705882353,
"grad_norm": 0.29738208651542664,
"learning_rate": 0.00014978723404255318,
"loss": 0.1801,
"step": 65
},
{
"epoch": 0.6211764705882353,
"grad_norm": 0.22212739288806915,
"learning_rate": 0.00014893617021276596,
"loss": 0.2177,
"step": 66
},
{
"epoch": 0.6305882352941177,
"grad_norm": 0.18247352540493011,
"learning_rate": 0.0001480851063829787,
"loss": 0.2103,
"step": 67
},
{
"epoch": 0.64,
"grad_norm": 0.26197579503059387,
"learning_rate": 0.0001472340425531915,
"loss": 0.2514,
"step": 68
},
{
"epoch": 0.6494117647058824,
"grad_norm": 0.21243281662464142,
"learning_rate": 0.00014638297872340426,
"loss": 0.1667,
"step": 69
},
{
"epoch": 0.6588235294117647,
"grad_norm": 0.38252201676368713,
"learning_rate": 0.00014553191489361704,
"loss": 0.181,
"step": 70
},
{
"epoch": 0.668235294117647,
"grad_norm": 0.28905001282691956,
"learning_rate": 0.0001446808510638298,
"loss": 0.2095,
"step": 71
},
{
"epoch": 0.6776470588235294,
"grad_norm": 0.20469601452350616,
"learning_rate": 0.00014382978723404257,
"loss": 0.1561,
"step": 72
},
{
"epoch": 0.6870588235294117,
"grad_norm": 0.25762686133384705,
"learning_rate": 0.00014297872340425532,
"loss": 0.1768,
"step": 73
},
{
"epoch": 0.6964705882352941,
"grad_norm": 0.24704404175281525,
"learning_rate": 0.0001421276595744681,
"loss": 0.2484,
"step": 74
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.275851845741272,
"learning_rate": 0.00014127659574468085,
"loss": 0.183,
"step": 75
},
{
"epoch": 0.7152941176470589,
"grad_norm": 0.2425452470779419,
"learning_rate": 0.00014042553191489363,
"loss": 0.2125,
"step": 76
},
{
"epoch": 0.7247058823529412,
"grad_norm": 0.25931957364082336,
"learning_rate": 0.0001395744680851064,
"loss": 0.1208,
"step": 77
},
{
"epoch": 0.7341176470588235,
"grad_norm": 0.2956542372703552,
"learning_rate": 0.00013872340425531915,
"loss": 0.1712,
"step": 78
},
{
"epoch": 0.7435294117647059,
"grad_norm": 0.25559189915657043,
"learning_rate": 0.00013787234042553193,
"loss": 0.1584,
"step": 79
},
{
"epoch": 0.7529411764705882,
"grad_norm": 0.3442539870738983,
"learning_rate": 0.00013702127659574468,
"loss": 0.1303,
"step": 80
},
{
"epoch": 0.7623529411764706,
"grad_norm": 0.23330341279506683,
"learning_rate": 0.00013617021276595746,
"loss": 0.1523,
"step": 81
},
{
"epoch": 0.7717647058823529,
"grad_norm": 0.28044548630714417,
"learning_rate": 0.0001353191489361702,
"loss": 0.1137,
"step": 82
},
{
"epoch": 0.7811764705882352,
"grad_norm": 0.32764652371406555,
"learning_rate": 0.000134468085106383,
"loss": 0.1864,
"step": 83
},
{
"epoch": 0.7905882352941176,
"grad_norm": 0.25059211254119873,
"learning_rate": 0.00013361702127659574,
"loss": 0.1634,
"step": 84
},
{
"epoch": 0.8,
"grad_norm": 0.21415087580680847,
"learning_rate": 0.00013276595744680852,
"loss": 0.1389,
"step": 85
},
{
"epoch": 0.8094117647058824,
"grad_norm": 0.24330438673496246,
"learning_rate": 0.00013191489361702127,
"loss": 0.2107,
"step": 86
},
{
"epoch": 0.8188235294117647,
"grad_norm": 0.3163723647594452,
"learning_rate": 0.00013106382978723404,
"loss": 0.1567,
"step": 87
},
{
"epoch": 0.8282352941176471,
"grad_norm": 0.31534239649772644,
"learning_rate": 0.00013021276595744682,
"loss": 0.2164,
"step": 88
},
{
"epoch": 0.8376470588235294,
"grad_norm": 0.32444867491722107,
"learning_rate": 0.0001293617021276596,
"loss": 0.2682,
"step": 89
},
{
"epoch": 0.8470588235294118,
"grad_norm": 0.21235518157482147,
"learning_rate": 0.00012851063829787235,
"loss": 0.1137,
"step": 90
},
{
"epoch": 0.8564705882352941,
"grad_norm": 0.29725661873817444,
"learning_rate": 0.00012765957446808513,
"loss": 0.2246,
"step": 91
},
{
"epoch": 0.8658823529411764,
"grad_norm": 0.21191883087158203,
"learning_rate": 0.00012680851063829788,
"loss": 0.2591,
"step": 92
},
{
"epoch": 0.8752941176470588,
"grad_norm": 0.25724413990974426,
"learning_rate": 0.00012595744680851065,
"loss": 0.3289,
"step": 93
},
{
"epoch": 0.8847058823529412,
"grad_norm": 0.33288782835006714,
"learning_rate": 0.0001251063829787234,
"loss": 0.1999,
"step": 94
},
{
"epoch": 0.8941176470588236,
"grad_norm": 0.26389646530151367,
"learning_rate": 0.00012425531914893618,
"loss": 0.1281,
"step": 95
},
{
"epoch": 0.9035294117647059,
"grad_norm": 0.21038174629211426,
"learning_rate": 0.00012340425531914893,
"loss": 0.1759,
"step": 96
},
{
"epoch": 0.9129411764705883,
"grad_norm": 0.15450036525726318,
"learning_rate": 0.0001225531914893617,
"loss": 0.1188,
"step": 97
},
{
"epoch": 0.9223529411764706,
"grad_norm": 0.14022140204906464,
"learning_rate": 0.00012170212765957448,
"loss": 0.089,
"step": 98
},
{
"epoch": 0.9317647058823529,
"grad_norm": 0.2061687856912613,
"learning_rate": 0.00012085106382978724,
"loss": 0.1858,
"step": 99
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.2657790184020996,
"learning_rate": 0.00012,
"loss": 0.1508,
"step": 100
},
{
"epoch": 0.9505882352941176,
"grad_norm": 0.23362405598163605,
"learning_rate": 0.00011914893617021277,
"loss": 0.1581,
"step": 101
},
{
"epoch": 0.96,
"grad_norm": 0.40954795479774475,
"learning_rate": 0.00011829787234042553,
"loss": 0.2477,
"step": 102
},
{
"epoch": 0.9694117647058823,
"grad_norm": 0.22806493937969208,
"learning_rate": 0.0001174468085106383,
"loss": 0.1394,
"step": 103
},
{
"epoch": 0.9788235294117648,
"grad_norm": 0.23955920338630676,
"learning_rate": 0.00011659574468085106,
"loss": 0.1882,
"step": 104
},
{
"epoch": 0.9882352941176471,
"grad_norm": 0.24184423685073853,
"learning_rate": 0.00011574468085106382,
"loss": 0.228,
"step": 105
},
{
"epoch": 0.9976470588235294,
"grad_norm": 0.30989891290664673,
"learning_rate": 0.00011489361702127661,
"loss": 0.1631,
"step": 106
},
{
"epoch": 1.0,
"grad_norm": 0.3729061186313629,
"learning_rate": 0.00011404255319148938,
"loss": 0.1422,
"step": 107
},
{
"epoch": 1.0094117647058825,
"grad_norm": 0.1809885948896408,
"learning_rate": 0.00011319148936170214,
"loss": 0.1159,
"step": 108
},
{
"epoch": 1.0188235294117647,
"grad_norm": 0.16268357634544373,
"learning_rate": 0.0001123404255319149,
"loss": 0.1033,
"step": 109
},
{
"epoch": 1.0282352941176471,
"grad_norm": 0.159242644906044,
"learning_rate": 0.00011148936170212767,
"loss": 0.1209,
"step": 110
},
{
"epoch": 1.0376470588235294,
"grad_norm": 0.33948951959609985,
"learning_rate": 0.00011063829787234043,
"loss": 0.1485,
"step": 111
},
{
"epoch": 1.0470588235294118,
"grad_norm": 0.17068567872047424,
"learning_rate": 0.0001097872340425532,
"loss": 0.1092,
"step": 112
},
{
"epoch": 1.056470588235294,
"grad_norm": 0.16619639098644257,
"learning_rate": 0.00010893617021276596,
"loss": 0.1005,
"step": 113
},
{
"epoch": 1.0658823529411765,
"grad_norm": 0.20606489479541779,
"learning_rate": 0.00010808510638297873,
"loss": 0.14,
"step": 114
},
{
"epoch": 1.0752941176470587,
"grad_norm": 0.28147006034851074,
"learning_rate": 0.00010723404255319149,
"loss": 0.1613,
"step": 115
},
{
"epoch": 1.0847058823529412,
"grad_norm": 0.19388940930366516,
"learning_rate": 0.00010638297872340425,
"loss": 0.1308,
"step": 116
},
{
"epoch": 1.0941176470588236,
"grad_norm": 0.21097290515899658,
"learning_rate": 0.00010553191489361702,
"loss": 0.1588,
"step": 117
},
{
"epoch": 1.1035294117647059,
"grad_norm": 0.1876417100429535,
"learning_rate": 0.00010468085106382978,
"loss": 0.1163,
"step": 118
},
{
"epoch": 1.1129411764705883,
"grad_norm": 0.17917850613594055,
"learning_rate": 0.00010382978723404255,
"loss": 0.0855,
"step": 119
},
{
"epoch": 1.1223529411764706,
"grad_norm": 0.20767854154109955,
"learning_rate": 0.00010297872340425532,
"loss": 0.0891,
"step": 120
},
{
"epoch": 1.131764705882353,
"grad_norm": 0.21883057057857513,
"learning_rate": 0.00010212765957446809,
"loss": 0.1052,
"step": 121
},
{
"epoch": 1.1411764705882352,
"grad_norm": 0.32774657011032104,
"learning_rate": 0.00010127659574468085,
"loss": 0.1866,
"step": 122
},
{
"epoch": 1.1505882352941177,
"grad_norm": 0.32191288471221924,
"learning_rate": 0.00010042553191489362,
"loss": 0.1817,
"step": 123
},
{
"epoch": 1.16,
"grad_norm": 0.2663422226905823,
"learning_rate": 9.95744680851064e-05,
"loss": 0.2135,
"step": 124
},
{
"epoch": 1.1694117647058824,
"grad_norm": 0.19601747393608093,
"learning_rate": 9.872340425531916e-05,
"loss": 0.1152,
"step": 125
},
{
"epoch": 1.1788235294117646,
"grad_norm": 0.22476732730865479,
"learning_rate": 9.787234042553192e-05,
"loss": 0.1368,
"step": 126
},
{
"epoch": 1.188235294117647,
"grad_norm": 0.2908172607421875,
"learning_rate": 9.702127659574469e-05,
"loss": 0.1594,
"step": 127
},
{
"epoch": 1.1976470588235295,
"grad_norm": 0.19038249552249908,
"learning_rate": 9.617021276595745e-05,
"loss": 0.12,
"step": 128
},
{
"epoch": 1.2070588235294117,
"grad_norm": 0.239775151014328,
"learning_rate": 9.531914893617023e-05,
"loss": 0.1203,
"step": 129
},
{
"epoch": 1.2164705882352942,
"grad_norm": 0.3028945028781891,
"learning_rate": 9.446808510638299e-05,
"loss": 0.1797,
"step": 130
},
{
"epoch": 1.2258823529411764,
"grad_norm": 0.2879508435726166,
"learning_rate": 9.361702127659576e-05,
"loss": 0.1679,
"step": 131
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.2691670358181,
"learning_rate": 9.276595744680852e-05,
"loss": 0.121,
"step": 132
},
{
"epoch": 1.244705882352941,
"grad_norm": 0.18453848361968994,
"learning_rate": 9.191489361702128e-05,
"loss": 0.147,
"step": 133
},
{
"epoch": 1.2541176470588236,
"grad_norm": 0.21089564263820648,
"learning_rate": 9.106382978723405e-05,
"loss": 0.1073,
"step": 134
},
{
"epoch": 1.263529411764706,
"grad_norm": 0.26895055174827576,
"learning_rate": 9.021276595744681e-05,
"loss": 0.1971,
"step": 135
},
{
"epoch": 1.2729411764705882,
"grad_norm": 0.26298171281814575,
"learning_rate": 8.936170212765958e-05,
"loss": 0.112,
"step": 136
},
{
"epoch": 1.2823529411764705,
"grad_norm": 0.20527559518814087,
"learning_rate": 8.851063829787234e-05,
"loss": 0.1321,
"step": 137
},
{
"epoch": 1.291764705882353,
"grad_norm": 0.2358022779226303,
"learning_rate": 8.765957446808512e-05,
"loss": 0.1196,
"step": 138
},
{
"epoch": 1.3011764705882354,
"grad_norm": 0.19852350652217865,
"learning_rate": 8.680851063829788e-05,
"loss": 0.1367,
"step": 139
},
{
"epoch": 1.3105882352941176,
"grad_norm": 0.19292528927326202,
"learning_rate": 8.595744680851064e-05,
"loss": 0.063,
"step": 140
},
{
"epoch": 1.32,
"grad_norm": 0.227496936917305,
"learning_rate": 8.510638297872341e-05,
"loss": 0.1328,
"step": 141
},
{
"epoch": 1.3294117647058823,
"grad_norm": 0.22281454503536224,
"learning_rate": 8.425531914893617e-05,
"loss": 0.0913,
"step": 142
},
{
"epoch": 1.3388235294117647,
"grad_norm": 0.24629417061805725,
"learning_rate": 8.340425531914894e-05,
"loss": 0.1564,
"step": 143
},
{
"epoch": 1.348235294117647,
"grad_norm": 0.3085138499736786,
"learning_rate": 8.25531914893617e-05,
"loss": 0.1128,
"step": 144
},
{
"epoch": 1.3576470588235294,
"grad_norm": 0.31958431005477905,
"learning_rate": 8.170212765957446e-05,
"loss": 0.1167,
"step": 145
},
{
"epoch": 1.3670588235294119,
"grad_norm": 0.3235880732536316,
"learning_rate": 8.085106382978723e-05,
"loss": 0.1334,
"step": 146
},
{
"epoch": 1.3764705882352941,
"grad_norm": 0.2108842432498932,
"learning_rate": 8e-05,
"loss": 0.1304,
"step": 147
},
{
"epoch": 1.3858823529411763,
"grad_norm": 0.24360014498233795,
"learning_rate": 7.914893617021277e-05,
"loss": 0.0802,
"step": 148
},
{
"epoch": 1.3952941176470588,
"grad_norm": 0.3124058246612549,
"learning_rate": 7.829787234042553e-05,
"loss": 0.177,
"step": 149
},
{
"epoch": 1.4047058823529412,
"grad_norm": 0.1749386191368103,
"learning_rate": 7.74468085106383e-05,
"loss": 0.0688,
"step": 150
},
{
"epoch": 1.4141176470588235,
"grad_norm": 0.20923930406570435,
"learning_rate": 7.659574468085106e-05,
"loss": 0.1211,
"step": 151
},
{
"epoch": 1.423529411764706,
"grad_norm": 0.23498158156871796,
"learning_rate": 7.574468085106383e-05,
"loss": 0.1758,
"step": 152
},
{
"epoch": 1.4329411764705882,
"grad_norm": 0.2076374739408493,
"learning_rate": 7.489361702127659e-05,
"loss": 0.1108,
"step": 153
},
{
"epoch": 1.4423529411764706,
"grad_norm": 0.32030659914016724,
"learning_rate": 7.404255319148935e-05,
"loss": 0.2205,
"step": 154
},
{
"epoch": 1.4517647058823528,
"grad_norm": 0.30138102173805237,
"learning_rate": 7.319148936170213e-05,
"loss": 0.1242,
"step": 155
},
{
"epoch": 1.4611764705882353,
"grad_norm": 0.20646269619464874,
"learning_rate": 7.23404255319149e-05,
"loss": 0.1307,
"step": 156
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.3253270387649536,
"learning_rate": 7.148936170212766e-05,
"loss": 0.1834,
"step": 157
},
{
"epoch": 1.48,
"grad_norm": 0.382090300321579,
"learning_rate": 7.063829787234042e-05,
"loss": 0.1134,
"step": 158
},
{
"epoch": 1.4894117647058824,
"grad_norm": 0.1788143664598465,
"learning_rate": 6.97872340425532e-05,
"loss": 0.1126,
"step": 159
},
{
"epoch": 1.4988235294117647,
"grad_norm": 0.23251184821128845,
"learning_rate": 6.893617021276597e-05,
"loss": 0.0852,
"step": 160
},
{
"epoch": 1.5082352941176471,
"grad_norm": 0.2091732621192932,
"learning_rate": 6.808510638297873e-05,
"loss": 0.1348,
"step": 161
},
{
"epoch": 1.5176470588235293,
"grad_norm": 0.22493578493595123,
"learning_rate": 6.72340425531915e-05,
"loss": 0.0944,
"step": 162
},
{
"epoch": 1.5270588235294118,
"grad_norm": 0.36818596720695496,
"learning_rate": 6.638297872340426e-05,
"loss": 0.1519,
"step": 163
},
{
"epoch": 1.5364705882352943,
"grad_norm": 0.18922051787376404,
"learning_rate": 6.553191489361702e-05,
"loss": 0.0994,
"step": 164
},
{
"epoch": 1.5458823529411765,
"grad_norm": 0.2148643583059311,
"learning_rate": 6.46808510638298e-05,
"loss": 0.0937,
"step": 165
},
{
"epoch": 1.5552941176470587,
"grad_norm": 0.29744479060173035,
"learning_rate": 6.382978723404256e-05,
"loss": 0.176,
"step": 166
},
{
"epoch": 1.5647058823529412,
"grad_norm": 0.24048790335655212,
"learning_rate": 6.297872340425533e-05,
"loss": 0.0922,
"step": 167
},
{
"epoch": 1.5741176470588236,
"grad_norm": 0.2661268711090088,
"learning_rate": 6.212765957446809e-05,
"loss": 0.1847,
"step": 168
},
{
"epoch": 1.5835294117647059,
"grad_norm": 0.2629673182964325,
"learning_rate": 6.127659574468086e-05,
"loss": 0.1319,
"step": 169
},
{
"epoch": 1.592941176470588,
"grad_norm": 0.2114630937576294,
"learning_rate": 6.042553191489362e-05,
"loss": 0.1031,
"step": 170
},
{
"epoch": 1.6023529411764705,
"grad_norm": 0.2681063115596771,
"learning_rate": 5.9574468085106384e-05,
"loss": 0.1409,
"step": 171
},
{
"epoch": 1.611764705882353,
"grad_norm": 0.2776590585708618,
"learning_rate": 5.872340425531915e-05,
"loss": 0.1455,
"step": 172
},
{
"epoch": 1.6211764705882352,
"grad_norm": 0.28415653109550476,
"learning_rate": 5.787234042553191e-05,
"loss": 0.1753,
"step": 173
},
{
"epoch": 1.6305882352941177,
"grad_norm": 0.3313392698764801,
"learning_rate": 5.702127659574469e-05,
"loss": 0.115,
"step": 174
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.202665776014328,
"learning_rate": 5.617021276595745e-05,
"loss": 0.1024,
"step": 175
},
{
"epoch": 1.6494117647058824,
"grad_norm": 0.24025441706180573,
"learning_rate": 5.531914893617022e-05,
"loss": 0.1127,
"step": 176
},
{
"epoch": 1.6588235294117646,
"grad_norm": 0.24172081053256989,
"learning_rate": 5.446808510638298e-05,
"loss": 0.1548,
"step": 177
},
{
"epoch": 1.668235294117647,
"grad_norm": 0.22560444474220276,
"learning_rate": 5.3617021276595745e-05,
"loss": 0.1373,
"step": 178
},
{
"epoch": 1.6776470588235295,
"grad_norm": 0.2639141082763672,
"learning_rate": 5.276595744680851e-05,
"loss": 0.1611,
"step": 179
},
{
"epoch": 1.6870588235294117,
"grad_norm": 0.231312096118927,
"learning_rate": 5.191489361702127e-05,
"loss": 0.1136,
"step": 180
},
{
"epoch": 1.696470588235294,
"grad_norm": 0.2878687381744385,
"learning_rate": 5.1063829787234044e-05,
"loss": 0.0947,
"step": 181
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.20883168280124664,
"learning_rate": 5.021276595744681e-05,
"loss": 0.1508,
"step": 182
},
{
"epoch": 1.7152941176470589,
"grad_norm": 0.1777513027191162,
"learning_rate": 4.936170212765958e-05,
"loss": 0.0995,
"step": 183
},
{
"epoch": 1.724705882352941,
"grad_norm": 0.2738886773586273,
"learning_rate": 4.851063829787234e-05,
"loss": 0.1215,
"step": 184
},
{
"epoch": 1.7341176470588235,
"grad_norm": 0.3017210066318512,
"learning_rate": 4.7659574468085114e-05,
"loss": 0.1051,
"step": 185
},
{
"epoch": 1.743529411764706,
"grad_norm": 0.2284962385892868,
"learning_rate": 4.680851063829788e-05,
"loss": 0.1366,
"step": 186
},
{
"epoch": 1.7529411764705882,
"grad_norm": 0.25317302346229553,
"learning_rate": 4.595744680851064e-05,
"loss": 0.1723,
"step": 187
},
{
"epoch": 1.7623529411764705,
"grad_norm": 0.3226059377193451,
"learning_rate": 4.5106382978723406e-05,
"loss": 0.1851,
"step": 188
},
{
"epoch": 1.771764705882353,
"grad_norm": 0.23642048239707947,
"learning_rate": 4.425531914893617e-05,
"loss": 0.0906,
"step": 189
},
{
"epoch": 1.7811764705882354,
"grad_norm": 0.41941365599632263,
"learning_rate": 4.340425531914894e-05,
"loss": 0.1894,
"step": 190
},
{
"epoch": 1.7905882352941176,
"grad_norm": 0.25548532605171204,
"learning_rate": 4.2553191489361704e-05,
"loss": 0.133,
"step": 191
},
{
"epoch": 1.8,
"grad_norm": 0.41250723600387573,
"learning_rate": 4.170212765957447e-05,
"loss": 0.2158,
"step": 192
},
{
"epoch": 1.8094117647058825,
"grad_norm": 0.2293664664030075,
"learning_rate": 4.085106382978723e-05,
"loss": 0.1193,
"step": 193
},
{
"epoch": 1.8188235294117647,
"grad_norm": 0.2659620940685272,
"learning_rate": 4e-05,
"loss": 0.1104,
"step": 194
},
{
"epoch": 1.828235294117647,
"grad_norm": 0.25475817918777466,
"learning_rate": 3.914893617021277e-05,
"loss": 0.1283,
"step": 195
},
{
"epoch": 1.8376470588235294,
"grad_norm": 0.3606954514980316,
"learning_rate": 3.829787234042553e-05,
"loss": 0.2067,
"step": 196
},
{
"epoch": 1.8470588235294119,
"grad_norm": 0.25670263171195984,
"learning_rate": 3.7446808510638295e-05,
"loss": 0.1435,
"step": 197
},
{
"epoch": 1.856470588235294,
"grad_norm": 0.21306878328323364,
"learning_rate": 3.6595744680851066e-05,
"loss": 0.1203,
"step": 198
},
{
"epoch": 1.8658823529411763,
"grad_norm": 0.2932317554950714,
"learning_rate": 3.574468085106383e-05,
"loss": 0.1449,
"step": 199
},
{
"epoch": 1.8752941176470588,
"grad_norm": 0.27417030930519104,
"learning_rate": 3.48936170212766e-05,
"loss": 0.183,
"step": 200
},
{
"epoch": 1.8847058823529412,
"grad_norm": 0.21384155750274658,
"learning_rate": 3.4042553191489365e-05,
"loss": 0.0958,
"step": 201
},
{
"epoch": 1.8941176470588235,
"grad_norm": 0.4246179163455963,
"learning_rate": 3.319148936170213e-05,
"loss": 0.0902,
"step": 202
},
{
"epoch": 1.903529411764706,
"grad_norm": 0.24406206607818604,
"learning_rate": 3.23404255319149e-05,
"loss": 0.1508,
"step": 203
},
{
"epoch": 1.9129411764705884,
"grad_norm": 0.29728034138679504,
"learning_rate": 3.1489361702127664e-05,
"loss": 0.0963,
"step": 204
},
{
"epoch": 1.9223529411764706,
"grad_norm": 0.2817091643810272,
"learning_rate": 3.063829787234043e-05,
"loss": 0.1004,
"step": 205
},
{
"epoch": 1.9317647058823528,
"grad_norm": 0.24360136687755585,
"learning_rate": 2.9787234042553192e-05,
"loss": 0.116,
"step": 206
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.37449270486831665,
"learning_rate": 2.8936170212765956e-05,
"loss": 0.116,
"step": 207
},
{
"epoch": 1.9505882352941177,
"grad_norm": 0.28712770342826843,
"learning_rate": 2.8085106382978727e-05,
"loss": 0.1471,
"step": 208
},
{
"epoch": 1.96,
"grad_norm": 0.3738057613372803,
"learning_rate": 2.723404255319149e-05,
"loss": 0.1659,
"step": 209
},
{
"epoch": 1.9694117647058822,
"grad_norm": 0.23495592176914215,
"learning_rate": 2.6382978723404255e-05,
"loss": 0.1511,
"step": 210
},
{
"epoch": 1.9788235294117649,
"grad_norm": 0.2240479439496994,
"learning_rate": 2.5531914893617022e-05,
"loss": 0.1029,
"step": 211
},
{
"epoch": 1.988235294117647,
"grad_norm": 0.3040638267993927,
"learning_rate": 2.468085106382979e-05,
"loss": 0.1475,
"step": 212
},
{
"epoch": 1.9976470588235293,
"grad_norm": 0.31952324509620667,
"learning_rate": 2.3829787234042557e-05,
"loss": 0.2251,
"step": 213
},
{
"epoch": 2.0,
"grad_norm": 0.8529971837997437,
"learning_rate": 2.297872340425532e-05,
"loss": 0.1501,
"step": 214
},
{
"epoch": 2.0094117647058822,
"grad_norm": 0.15144610404968262,
"learning_rate": 2.2127659574468085e-05,
"loss": 0.0475,
"step": 215
},
{
"epoch": 2.018823529411765,
"grad_norm": 0.22670157253742218,
"learning_rate": 2.1276595744680852e-05,
"loss": 0.1331,
"step": 216
},
{
"epoch": 2.028235294117647,
"grad_norm": 0.1298380345106125,
"learning_rate": 2.0425531914893616e-05,
"loss": 0.0565,
"step": 217
},
{
"epoch": 2.0376470588235294,
"grad_norm": 0.20339643955230713,
"learning_rate": 1.9574468085106384e-05,
"loss": 0.0792,
"step": 218
},
{
"epoch": 2.0470588235294116,
"grad_norm": 0.1728561669588089,
"learning_rate": 1.8723404255319148e-05,
"loss": 0.073,
"step": 219
},
{
"epoch": 2.0564705882352943,
"grad_norm": 0.18114320933818817,
"learning_rate": 1.7872340425531915e-05,
"loss": 0.0842,
"step": 220
},
{
"epoch": 2.0658823529411765,
"grad_norm": 0.22063469886779785,
"learning_rate": 1.7021276595744682e-05,
"loss": 0.149,
"step": 221
},
{
"epoch": 2.0752941176470587,
"grad_norm": 0.21402296423912048,
"learning_rate": 1.617021276595745e-05,
"loss": 0.0904,
"step": 222
},
{
"epoch": 2.084705882352941,
"grad_norm": 0.1926405131816864,
"learning_rate": 1.5319148936170214e-05,
"loss": 0.1171,
"step": 223
},
{
"epoch": 2.0941176470588236,
"grad_norm": 0.19383728504180908,
"learning_rate": 1.4468085106382978e-05,
"loss": 0.1062,
"step": 224
},
{
"epoch": 2.103529411764706,
"grad_norm": 0.19772332906723022,
"learning_rate": 1.3617021276595745e-05,
"loss": 0.0943,
"step": 225
},
{
"epoch": 2.112941176470588,
"grad_norm": 0.1649761199951172,
"learning_rate": 1.2765957446808511e-05,
"loss": 0.0856,
"step": 226
},
{
"epoch": 2.1223529411764708,
"grad_norm": 0.2451670914888382,
"learning_rate": 1.1914893617021278e-05,
"loss": 0.0941,
"step": 227
},
{
"epoch": 2.131764705882353,
"grad_norm": 0.16791269183158875,
"learning_rate": 1.1063829787234042e-05,
"loss": 0.0835,
"step": 228
},
{
"epoch": 2.1411764705882352,
"grad_norm": 0.23293572664260864,
"learning_rate": 1.0212765957446808e-05,
"loss": 0.1359,
"step": 229
},
{
"epoch": 2.1505882352941175,
"grad_norm": 0.21684999763965607,
"learning_rate": 9.361702127659574e-06,
"loss": 0.0793,
"step": 230
},
{
"epoch": 2.16,
"grad_norm": 0.2551932632923126,
"learning_rate": 8.510638297872341e-06,
"loss": 0.1588,
"step": 231
},
{
"epoch": 2.1694117647058824,
"grad_norm": 0.2618826925754547,
"learning_rate": 7.659574468085107e-06,
"loss": 0.1019,
"step": 232
},
{
"epoch": 2.1788235294117646,
"grad_norm": 0.17673631012439728,
"learning_rate": 6.808510638297873e-06,
"loss": 0.0576,
"step": 233
},
{
"epoch": 2.1882352941176473,
"grad_norm": 0.18946610391139984,
"learning_rate": 5.957446808510639e-06,
"loss": 0.0929,
"step": 234
},
{
"epoch": 2.1976470588235295,
"grad_norm": 0.23730279505252838,
"learning_rate": 5.106382978723404e-06,
"loss": 0.1291,
"step": 235
},
{
"epoch": 2.2070588235294117,
"grad_norm": 0.23422257602214813,
"learning_rate": 4.255319148936171e-06,
"loss": 0.1169,
"step": 236
},
{
"epoch": 2.216470588235294,
"grad_norm": 0.2329464703798294,
"learning_rate": 3.4042553191489363e-06,
"loss": 0.1428,
"step": 237
},
{
"epoch": 2.2258823529411766,
"grad_norm": 0.22677987813949585,
"learning_rate": 2.553191489361702e-06,
"loss": 0.1091,
"step": 238
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.18695490062236786,
"learning_rate": 1.7021276595744682e-06,
"loss": 0.0791,
"step": 239
},
{
"epoch": 2.244705882352941,
"grad_norm": 0.13504858314990997,
"learning_rate": 8.510638297872341e-07,
"loss": 0.0638,
"step": 240
}
],
"logging_steps": 1,
"max_steps": 240,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.82456266624e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}