{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.992771084337349,
"eval_steps": 500,
"global_step": 775,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00642570281124498,
"grad_norm": 0.8317294716835022,
"learning_rate": 8.333333333333334e-06,
"loss": 0.1978,
"step": 1
},
{
"epoch": 0.01285140562248996,
"grad_norm": 0.94012850522995,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.2672,
"step": 2
},
{
"epoch": 0.01927710843373494,
"grad_norm": 1.2400332689285278,
"learning_rate": 2.5e-05,
"loss": 0.2964,
"step": 3
},
{
"epoch": 0.02570281124497992,
"grad_norm": 2.0498180389404297,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.3573,
"step": 4
},
{
"epoch": 0.0321285140562249,
"grad_norm": 1.034347653388977,
"learning_rate": 4.166666666666667e-05,
"loss": 0.2913,
"step": 5
},
{
"epoch": 0.03855421686746988,
"grad_norm": 0.6081845164299011,
"learning_rate": 5e-05,
"loss": 0.2519,
"step": 6
},
{
"epoch": 0.04497991967871486,
"grad_norm": 0.4193064272403717,
"learning_rate": 5.833333333333334e-05,
"loss": 0.2426,
"step": 7
},
{
"epoch": 0.05140562248995984,
"grad_norm": 0.50704026222229,
"learning_rate": 6.666666666666667e-05,
"loss": 0.2293,
"step": 8
},
{
"epoch": 0.05783132530120482,
"grad_norm": 0.3707605302333832,
"learning_rate": 7.500000000000001e-05,
"loss": 0.2189,
"step": 9
},
{
"epoch": 0.0642570281124498,
"grad_norm": 0.34638485312461853,
"learning_rate": 8.333333333333334e-05,
"loss": 0.1798,
"step": 10
},
{
"epoch": 0.07068273092369477,
"grad_norm": 0.4543774425983429,
"learning_rate": 9.166666666666667e-05,
"loss": 0.1842,
"step": 11
},
{
"epoch": 0.07710843373493977,
"grad_norm": 0.3193999230861664,
"learning_rate": 0.0001,
"loss": 0.2133,
"step": 12
},
{
"epoch": 0.08353413654618475,
"grad_norm": 0.3274695575237274,
"learning_rate": 0.00010833333333333333,
"loss": 0.2086,
"step": 13
},
{
"epoch": 0.08995983935742972,
"grad_norm": 0.32100680470466614,
"learning_rate": 0.00011666666666666668,
"loss": 0.1988,
"step": 14
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.3277706205844879,
"learning_rate": 0.000125,
"loss": 0.1865,
"step": 15
},
{
"epoch": 0.10281124497991968,
"grad_norm": 0.2264498621225357,
"learning_rate": 0.00013333333333333334,
"loss": 0.1605,
"step": 16
},
{
"epoch": 0.10923694779116466,
"grad_norm": 0.3071700930595398,
"learning_rate": 0.00014166666666666668,
"loss": 0.1535,
"step": 17
},
{
"epoch": 0.11566265060240964,
"grad_norm": 0.3147311508655548,
"learning_rate": 0.00015000000000000001,
"loss": 0.1637,
"step": 18
},
{
"epoch": 0.12208835341365462,
"grad_norm": 0.32233041524887085,
"learning_rate": 0.00015833333333333332,
"loss": 0.17,
"step": 19
},
{
"epoch": 0.1285140562248996,
"grad_norm": 0.2847141921520233,
"learning_rate": 0.0001666666666666667,
"loss": 0.1432,
"step": 20
},
{
"epoch": 0.13493975903614458,
"grad_norm": 0.45303934812545776,
"learning_rate": 0.000175,
"loss": 0.1595,
"step": 21
},
{
"epoch": 0.14136546184738955,
"grad_norm": 0.32803109288215637,
"learning_rate": 0.00018333333333333334,
"loss": 0.1263,
"step": 22
},
{
"epoch": 0.14779116465863454,
"grad_norm": 0.7632677555084229,
"learning_rate": 0.00019166666666666667,
"loss": 0.162,
"step": 23
},
{
"epoch": 0.15421686746987953,
"grad_norm": 7.622311115264893,
"learning_rate": 0.0002,
"loss": 0.1743,
"step": 24
},
{
"epoch": 0.1606425702811245,
"grad_norm": 0.41840752959251404,
"learning_rate": 0.00019999912503789813,
"loss": 0.1967,
"step": 25
},
{
"epoch": 0.1670682730923695,
"grad_norm": 0.29047325253486633,
"learning_rate": 0.00019999650016690364,
"loss": 0.1196,
"step": 26
},
{
"epoch": 0.17349397590361446,
"grad_norm": 0.2337496131658554,
"learning_rate": 0.0001999921254329498,
"loss": 0.1488,
"step": 27
},
{
"epoch": 0.17991967871485945,
"grad_norm": 0.2089911699295044,
"learning_rate": 0.00019998600091259113,
"loss": 0.138,
"step": 28
},
{
"epoch": 0.18634538152610441,
"grad_norm": 0.268136590719223,
"learning_rate": 0.00019997812671300214,
"loss": 0.1845,
"step": 29
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.2347370982170105,
"learning_rate": 0.0001999685029719753,
"loss": 0.1257,
"step": 30
},
{
"epoch": 0.19919678714859437,
"grad_norm": 0.21996308863162994,
"learning_rate": 0.0001999571298579188,
"loss": 0.171,
"step": 31
},
{
"epoch": 0.20562248995983937,
"grad_norm": 0.1974944919347763,
"learning_rate": 0.0001999440075698535,
"loss": 0.1095,
"step": 32
},
{
"epoch": 0.21204819277108433,
"grad_norm": 0.15095502138137817,
"learning_rate": 0.00019992913633740957,
"loss": 0.1663,
"step": 33
},
{
"epoch": 0.21847389558232932,
"grad_norm": 0.2080863118171692,
"learning_rate": 0.0001999125164208222,
"loss": 0.141,
"step": 34
},
{
"epoch": 0.2248995983935743,
"grad_norm": 0.2128421813249588,
"learning_rate": 0.0001998941481109274,
"loss": 0.2076,
"step": 35
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.2011524736881256,
"learning_rate": 0.00019987403172915666,
"loss": 0.1419,
"step": 36
},
{
"epoch": 0.23775100401606425,
"grad_norm": 0.16084055602550507,
"learning_rate": 0.00019985216762753139,
"loss": 0.1357,
"step": 37
},
{
"epoch": 0.24417670682730924,
"grad_norm": 0.1661912202835083,
"learning_rate": 0.0001998285561886568,
"loss": 0.1471,
"step": 38
},
{
"epoch": 0.25060240963855424,
"grad_norm": 0.2506616413593292,
"learning_rate": 0.00019980319782571523,
"loss": 0.1555,
"step": 39
},
{
"epoch": 0.2570281124497992,
"grad_norm": 0.17315006256103516,
"learning_rate": 0.00019977609298245873,
"loss": 0.1468,
"step": 40
},
{
"epoch": 0.26345381526104417,
"grad_norm": 0.1772138923406601,
"learning_rate": 0.00019974724213320157,
"loss": 0.1447,
"step": 41
},
{
"epoch": 0.26987951807228916,
"grad_norm": 0.2274760603904724,
"learning_rate": 0.00019971664578281173,
"loss": 0.1707,
"step": 42
},
{
"epoch": 0.27630522088353415,
"grad_norm": 0.16768603026866913,
"learning_rate": 0.00019968430446670212,
"loss": 0.147,
"step": 43
},
{
"epoch": 0.2827309236947791,
"grad_norm": 0.1941104382276535,
"learning_rate": 0.0001996502187508213,
"loss": 0.1415,
"step": 44
},
{
"epoch": 0.2891566265060241,
"grad_norm": 0.17718106508255005,
"learning_rate": 0.00019961438923164345,
"loss": 0.1297,
"step": 45
},
{
"epoch": 0.2955823293172691,
"grad_norm": 0.17948807775974274,
"learning_rate": 0.00019957681653615797,
"loss": 0.1349,
"step": 46
},
{
"epoch": 0.30200803212851407,
"grad_norm": 0.19423460960388184,
"learning_rate": 0.0001995375013218586,
"loss": 0.1277,
"step": 47
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.2893676161766052,
"learning_rate": 0.00019949644427673177,
"loss": 0.1485,
"step": 48
},
{
"epoch": 0.314859437751004,
"grad_norm": 0.4430103600025177,
"learning_rate": 0.00019945364611924463,
"loss": 0.1306,
"step": 49
},
{
"epoch": 0.321285140562249,
"grad_norm": 0.28305917978286743,
"learning_rate": 0.0001994091075983325,
"loss": 0.1646,
"step": 50
},
{
"epoch": 0.327710843373494,
"grad_norm": 0.10388786345720291,
"learning_rate": 0.00019936282949338578,
"loss": 0.097,
"step": 51
},
{
"epoch": 0.334136546184739,
"grad_norm": 0.13306961953639984,
"learning_rate": 0.00019931481261423618,
"loss": 0.1222,
"step": 52
},
{
"epoch": 0.3405622489959839,
"grad_norm": 0.1919727623462677,
"learning_rate": 0.00019926505780114276,
"loss": 0.1566,
"step": 53
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.21380549669265747,
"learning_rate": 0.0001992135659247769,
"loss": 0.1404,
"step": 54
},
{
"epoch": 0.3534136546184739,
"grad_norm": 0.1510586440563202,
"learning_rate": 0.00019916033788620755,
"loss": 0.1453,
"step": 55
},
{
"epoch": 0.3598393574297189,
"grad_norm": 0.11387544125318527,
"learning_rate": 0.000199105374616885,
"loss": 0.1261,
"step": 56
},
{
"epoch": 0.36626506024096384,
"grad_norm": 0.15470993518829346,
"learning_rate": 0.00019904867707862476,
"loss": 0.163,
"step": 57
},
{
"epoch": 0.37269076305220883,
"grad_norm": 0.18749335408210754,
"learning_rate": 0.0001989902462635908,
"loss": 0.1452,
"step": 58
},
{
"epoch": 0.3791164658634538,
"grad_norm": 0.13796862959861755,
"learning_rate": 0.00019893008319427812,
"loss": 0.1257,
"step": 59
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.18501056730747223,
"learning_rate": 0.00019886818892349482,
"loss": 0.1143,
"step": 60
},
{
"epoch": 0.39196787148594375,
"grad_norm": 0.18443071842193604,
"learning_rate": 0.00019880456453434369,
"loss": 0.1395,
"step": 61
},
{
"epoch": 0.39839357429718875,
"grad_norm": 0.1594623327255249,
"learning_rate": 0.00019873921114020333,
"loss": 0.1505,
"step": 62
},
{
"epoch": 0.40481927710843374,
"grad_norm": 0.1715545505285263,
"learning_rate": 0.00019867212988470864,
"loss": 0.115,
"step": 63
},
{
"epoch": 0.41124497991967873,
"grad_norm": 0.24589896202087402,
"learning_rate": 0.0001986033219417307,
"loss": 0.1549,
"step": 64
},
{
"epoch": 0.41767068273092367,
"grad_norm": 0.1842864602804184,
"learning_rate": 0.00019853278851535638,
"loss": 0.1511,
"step": 65
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.20570969581604004,
"learning_rate": 0.00019846053083986717,
"loss": 0.168,
"step": 66
},
{
"epoch": 0.43052208835341366,
"grad_norm": 0.1519116312265396,
"learning_rate": 0.00019838655017971767,
"loss": 0.1142,
"step": 67
},
{
"epoch": 0.43694779116465865,
"grad_norm": 0.2530803978443146,
"learning_rate": 0.00019831084782951326,
"loss": 0.1359,
"step": 68
},
{
"epoch": 0.4433734939759036,
"grad_norm": 0.2451787292957306,
"learning_rate": 0.00019823342511398776,
"loss": 0.1257,
"step": 69
},
{
"epoch": 0.4497991967871486,
"grad_norm": 0.43833062052726746,
"learning_rate": 0.00019815428338798002,
"loss": 0.1275,
"step": 70
},
{
"epoch": 0.4562248995983936,
"grad_norm": 0.1903715431690216,
"learning_rate": 0.0001980734240364102,
"loss": 0.1357,
"step": 71
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.28559988737106323,
"learning_rate": 0.00019799084847425572,
"loss": 0.1312,
"step": 72
},
{
"epoch": 0.46907630522088356,
"grad_norm": 0.2496558576822281,
"learning_rate": 0.0001979065581465263,
"loss": 0.1633,
"step": 73
},
{
"epoch": 0.4755020080321285,
"grad_norm": 0.2327835112810135,
"learning_rate": 0.00019782055452823878,
"loss": 0.1442,
"step": 74
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.3205198645591736,
"learning_rate": 0.00019773283912439133,
"loss": 0.1511,
"step": 75
},
{
"epoch": 0.4883534136546185,
"grad_norm": 0.1274174302816391,
"learning_rate": 0.00019764341346993698,
"loss": 0.0996,
"step": 76
},
{
"epoch": 0.4947791164658635,
"grad_norm": 0.2272380292415619,
"learning_rate": 0.00019755227912975697,
"loss": 0.1176,
"step": 77
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.13094107806682587,
"learning_rate": 0.0001974594376986331,
"loss": 0.1184,
"step": 78
},
{
"epoch": 0.5076305220883535,
"grad_norm": 0.13428834080696106,
"learning_rate": 0.00019736489080122006,
"loss": 0.1309,
"step": 79
},
{
"epoch": 0.5140562248995983,
"grad_norm": 0.1373153030872345,
"learning_rate": 0.00019726864009201694,
"loss": 0.1376,
"step": 80
},
{
"epoch": 0.5204819277108433,
"grad_norm": 0.1309944987297058,
"learning_rate": 0.00019717068725533818,
"loss": 0.1403,
"step": 81
},
{
"epoch": 0.5269076305220883,
"grad_norm": 0.14153365790843964,
"learning_rate": 0.00019707103400528415,
"loss": 0.1399,
"step": 82
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.1331794112920761,
"learning_rate": 0.0001969696820857112,
"loss": 0.1455,
"step": 83
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.12772125005722046,
"learning_rate": 0.0001968666332702011,
"loss": 0.1409,
"step": 84
},
{
"epoch": 0.5461847389558233,
"grad_norm": 0.13976705074310303,
"learning_rate": 0.00019676188936203006,
"loss": 0.1144,
"step": 85
},
{
"epoch": 0.5526104417670683,
"grad_norm": 0.1431870311498642,
"learning_rate": 0.00019665545219413701,
"loss": 0.148,
"step": 86
},
{
"epoch": 0.5590361445783133,
"grad_norm": 0.1257038563489914,
"learning_rate": 0.00019654732362909177,
"loss": 0.1197,
"step": 87
},
{
"epoch": 0.5654618473895582,
"grad_norm": 0.16741393506526947,
"learning_rate": 0.00019643750555906224,
"loss": 0.1563,
"step": 88
},
{
"epoch": 0.5718875502008032,
"grad_norm": 0.16155458986759186,
"learning_rate": 0.00019632599990578143,
"loss": 0.1333,
"step": 89
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.17337974905967712,
"learning_rate": 0.00019621280862051373,
"loss": 0.1669,
"step": 90
},
{
"epoch": 0.5847389558232932,
"grad_norm": 0.17315532267093658,
"learning_rate": 0.00019609793368402086,
"loss": 0.1488,
"step": 91
},
{
"epoch": 0.5911646586345382,
"grad_norm": 0.15965646505355835,
"learning_rate": 0.0001959813771065271,
"loss": 0.1207,
"step": 92
},
{
"epoch": 0.5975903614457831,
"grad_norm": 0.15546628832817078,
"learning_rate": 0.00019586314092768424,
"loss": 0.1147,
"step": 93
},
{
"epoch": 0.6040160642570281,
"grad_norm": 0.16691721975803375,
"learning_rate": 0.00019574322721653583,
"loss": 0.1172,
"step": 94
},
{
"epoch": 0.6104417670682731,
"grad_norm": 0.1729833036661148,
"learning_rate": 0.00019562163807148084,
"loss": 0.12,
"step": 95
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.17864178121089935,
"learning_rate": 0.0001954983756202372,
"loss": 0.1266,
"step": 96
},
{
"epoch": 0.623293172690763,
"grad_norm": 0.20421630144119263,
"learning_rate": 0.0001953734420198044,
"loss": 0.1531,
"step": 97
},
{
"epoch": 0.629718875502008,
"grad_norm": 0.19765256345272064,
"learning_rate": 0.0001952468394564257,
"loss": 0.1134,
"step": 98
},
{
"epoch": 0.636144578313253,
"grad_norm": 0.197422593832016,
"learning_rate": 0.00019511857014555,
"loss": 0.1292,
"step": 99
},
{
"epoch": 0.642570281124498,
"grad_norm": 0.2465924769639969,
"learning_rate": 0.00019498863633179308,
"loss": 0.1426,
"step": 100
},
{
"epoch": 0.648995983935743,
"grad_norm": 0.136220321059227,
"learning_rate": 0.00019485704028889813,
"loss": 0.0881,
"step": 101
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.14770826697349548,
"learning_rate": 0.0001947237843196962,
"loss": 0.125,
"step": 102
},
{
"epoch": 0.661847389558233,
"grad_norm": 0.13019175827503204,
"learning_rate": 0.0001945888707560657,
"loss": 0.1271,
"step": 103
},
{
"epoch": 0.668273092369478,
"grad_norm": 0.10536068677902222,
"learning_rate": 0.0001944523019588918,
"loss": 0.107,
"step": 104
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.10180668532848358,
"learning_rate": 0.00019431408031802486,
"loss": 0.1145,
"step": 105
},
{
"epoch": 0.6811244979919678,
"grad_norm": 0.14559617638587952,
"learning_rate": 0.00019417420825223891,
"loss": 0.1395,
"step": 106
},
{
"epoch": 0.6875502008032128,
"grad_norm": 0.13509546220302582,
"learning_rate": 0.000194032688209189,
"loss": 0.1478,
"step": 107
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.13227735459804535,
"learning_rate": 0.00019388952266536868,
"loss": 0.1445,
"step": 108
},
{
"epoch": 0.7004016064257028,
"grad_norm": 0.16162370145320892,
"learning_rate": 0.00019374471412606642,
"loss": 0.1246,
"step": 109
},
{
"epoch": 0.7068273092369478,
"grad_norm": 0.1407587081193924,
"learning_rate": 0.00019359826512532194,
"loss": 0.1421,
"step": 110
},
{
"epoch": 0.7132530120481928,
"grad_norm": 0.15528196096420288,
"learning_rate": 0.00019345017822588168,
"loss": 0.1629,
"step": 111
},
{
"epoch": 0.7196787148594378,
"grad_norm": 0.1608172059059143,
"learning_rate": 0.0001933004560191542,
"loss": 0.1538,
"step": 112
},
{
"epoch": 0.7261044176706827,
"grad_norm": 0.15321175754070282,
"learning_rate": 0.00019314910112516463,
"loss": 0.1251,
"step": 113
},
{
"epoch": 0.7325301204819277,
"grad_norm": 0.17383867502212524,
"learning_rate": 0.00019299611619250881,
"loss": 0.1531,
"step": 114
},
{
"epoch": 0.7389558232931727,
"grad_norm": 0.18434979021549225,
"learning_rate": 0.00019284150389830721,
"loss": 0.1847,
"step": 115
},
{
"epoch": 0.7453815261044177,
"grad_norm": 0.16240356862545013,
"learning_rate": 0.00019268526694815773,
"loss": 0.1712,
"step": 116
},
{
"epoch": 0.7518072289156627,
"grad_norm": 0.17521648108959198,
"learning_rate": 0.0001925274080760886,
"loss": 0.1222,
"step": 117
},
{
"epoch": 0.7582329317269076,
"grad_norm": 0.16100138425827026,
"learning_rate": 0.00019236793004451044,
"loss": 0.1238,
"step": 118
},
{
"epoch": 0.7646586345381526,
"grad_norm": 0.16682398319244385,
"learning_rate": 0.00019220683564416787,
"loss": 0.0914,
"step": 119
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.15211397409439087,
"learning_rate": 0.00019204412769409086,
"loss": 0.1051,
"step": 120
},
{
"epoch": 0.7775100401606426,
"grad_norm": 0.19107018411159515,
"learning_rate": 0.00019187980904154515,
"loss": 0.1532,
"step": 121
},
{
"epoch": 0.7839357429718875,
"grad_norm": 0.18667763471603394,
"learning_rate": 0.00019171388256198268,
"loss": 0.1435,
"step": 122
},
{
"epoch": 0.7903614457831325,
"grad_norm": 0.1942739635705948,
"learning_rate": 0.000191546351158991,
"loss": 0.1137,
"step": 123
},
{
"epoch": 0.7967871485943775,
"grad_norm": 0.23028349876403809,
"learning_rate": 0.00019137721776424274,
"loss": 0.1293,
"step": 124
},
{
"epoch": 0.8032128514056225,
"grad_norm": 0.25495702028274536,
"learning_rate": 0.0001912064853374441,
"loss": 0.1441,
"step": 125
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.10432910919189453,
"learning_rate": 0.0001910341568662831,
"loss": 0.0831,
"step": 126
},
{
"epoch": 0.8160642570281125,
"grad_norm": 0.11154992133378983,
"learning_rate": 0.00019086023536637737,
"loss": 0.1183,
"step": 127
},
{
"epoch": 0.8224899598393575,
"grad_norm": 0.13584573566913605,
"learning_rate": 0.0001906847238812214,
"loss": 0.1441,
"step": 128
},
{
"epoch": 0.8289156626506025,
"grad_norm": 0.35521605610847473,
"learning_rate": 0.0001905076254821331,
"loss": 0.1368,
"step": 129
},
{
"epoch": 0.8353413654618473,
"grad_norm": 0.14569194614887238,
"learning_rate": 0.00019032894326820023,
"loss": 0.1285,
"step": 130
},
{
"epoch": 0.8417670682730923,
"grad_norm": 0.16236892342567444,
"learning_rate": 0.0001901486803662261,
"loss": 0.1578,
"step": 131
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.11677072197198868,
"learning_rate": 0.00018996683993067483,
"loss": 0.1183,
"step": 132
},
{
"epoch": 0.8546184738955823,
"grad_norm": 0.1277882605791092,
"learning_rate": 0.00018978342514361626,
"loss": 0.1196,
"step": 133
},
{
"epoch": 0.8610441767068273,
"grad_norm": 0.12904071807861328,
"learning_rate": 0.00018959843921467014,
"loss": 0.1281,
"step": 134
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.13108272850513458,
"learning_rate": 0.00018941188538094999,
"loss": 0.1187,
"step": 135
},
{
"epoch": 0.8738955823293173,
"grad_norm": 0.15289278328418732,
"learning_rate": 0.0001892237669070065,
"loss": 0.1524,
"step": 136
},
{
"epoch": 0.8803212851405623,
"grad_norm": 0.13197748363018036,
"learning_rate": 0.0001890340870847704,
"loss": 0.1104,
"step": 137
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.15141014754772186,
"learning_rate": 0.00018884284923349477,
"loss": 0.154,
"step": 138
},
{
"epoch": 0.8931726907630522,
"grad_norm": 0.1232500970363617,
"learning_rate": 0.00018865005669969708,
"loss": 0.102,
"step": 139
},
{
"epoch": 0.8995983935742972,
"grad_norm": 0.16395455598831177,
"learning_rate": 0.00018845571285710058,
"loss": 0.145,
"step": 140
},
{
"epoch": 0.9060240963855422,
"grad_norm": 0.15895424783229828,
"learning_rate": 0.00018825982110657515,
"loss": 0.1268,
"step": 141
},
{
"epoch": 0.9124497991967871,
"grad_norm": 0.15639305114746094,
"learning_rate": 0.00018806238487607794,
"loss": 0.126,
"step": 142
},
{
"epoch": 0.9188755020080321,
"grad_norm": 0.14362278580665588,
"learning_rate": 0.0001878634076205934,
"loss": 0.0981,
"step": 143
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.1624501496553421,
"learning_rate": 0.00018766289282207263,
"loss": 0.1208,
"step": 144
},
{
"epoch": 0.9317269076305221,
"grad_norm": 0.1643369346857071,
"learning_rate": 0.00018746084398937266,
"loss": 0.1088,
"step": 145
},
{
"epoch": 0.9381526104417671,
"grad_norm": 0.19890688359737396,
"learning_rate": 0.00018725726465819488,
"loss": 0.1476,
"step": 146
},
{
"epoch": 0.944578313253012,
"grad_norm": 0.16708028316497803,
"learning_rate": 0.00018705215839102328,
"loss": 0.1175,
"step": 147
},
{
"epoch": 0.951004016064257,
"grad_norm": 0.20685526728630066,
"learning_rate": 0.0001868455287770621,
"loss": 0.1573,
"step": 148
},
{
"epoch": 0.957429718875502,
"grad_norm": 0.19720108807086945,
"learning_rate": 0.00018663737943217296,
"loss": 0.137,
"step": 149
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.2381121814250946,
"learning_rate": 0.00018642771399881162,
"loss": 0.156,
"step": 150
},
{
"epoch": 0.970281124497992,
"grad_norm": 0.13865579664707184,
"learning_rate": 0.00018621653614596425,
"loss": 0.1229,
"step": 151
},
{
"epoch": 0.976706827309237,
"grad_norm": 0.10851379483938217,
"learning_rate": 0.00018600384956908323,
"loss": 0.1088,
"step": 152
},
{
"epoch": 0.983132530120482,
"grad_norm": 0.1621655523777008,
"learning_rate": 0.00018578965799002236,
"loss": 0.1479,
"step": 153
},
{
"epoch": 0.989558232931727,
"grad_norm": 0.18607285618782043,
"learning_rate": 0.00018557396515697202,
"loss": 0.1489,
"step": 154
},
{
"epoch": 0.9959839357429718,
"grad_norm": 0.19177676737308502,
"learning_rate": 0.0001853567748443933,
"loss": 0.1163,
"step": 155
},
{
"epoch": 1.0056224899598394,
"grad_norm": 0.6790018677711487,
"learning_rate": 0.000185138090852952,
"loss": 0.2256,
"step": 156
},
{
"epoch": 1.0120481927710843,
"grad_norm": 0.10716850310564041,
"learning_rate": 0.0001849179170094522,
"loss": 0.094,
"step": 157
},
{
"epoch": 1.0184738955823294,
"grad_norm": 0.11798243969678879,
"learning_rate": 0.00018469625716676933,
"loss": 0.1108,
"step": 158
},
{
"epoch": 1.0248995983935743,
"grad_norm": 0.13069161772727966,
"learning_rate": 0.00018447311520378262,
"loss": 0.1041,
"step": 159
},
{
"epoch": 1.0313253012048194,
"grad_norm": 0.19986286759376526,
"learning_rate": 0.0001842484950253073,
"loss": 0.125,
"step": 160
},
{
"epoch": 1.0377510040160642,
"grad_norm": 0.16085122525691986,
"learning_rate": 0.00018402240056202614,
"loss": 0.1025,
"step": 161
},
{
"epoch": 1.0441767068273093,
"grad_norm": 0.20288337767124176,
"learning_rate": 0.00018379483577042103,
"loss": 0.1328,
"step": 162
},
{
"epoch": 1.0506024096385542,
"grad_norm": 0.15977489948272705,
"learning_rate": 0.00018356580463270322,
"loss": 0.0985,
"step": 163
},
{
"epoch": 1.057028112449799,
"grad_norm": 0.2264052927494049,
"learning_rate": 0.00018333531115674408,
"loss": 0.0931,
"step": 164
},
{
"epoch": 1.0634538152610442,
"grad_norm": 0.18668119609355927,
"learning_rate": 0.0001831033593760047,
"loss": 0.0777,
"step": 165
},
{
"epoch": 1.069879518072289,
"grad_norm": 0.2187710851430893,
"learning_rate": 0.00018286995334946545,
"loss": 0.1076,
"step": 166
},
{
"epoch": 1.0763052208835342,
"grad_norm": 0.19872407615184784,
"learning_rate": 0.0001826350971615549,
"loss": 0.1008,
"step": 167
},
{
"epoch": 1.082730923694779,
"grad_norm": 0.23164619505405426,
"learning_rate": 0.00018239879492207831,
"loss": 0.1104,
"step": 168
},
{
"epoch": 1.0891566265060242,
"grad_norm": 0.20669420063495636,
"learning_rate": 0.00018216105076614576,
"loss": 0.1042,
"step": 169
},
{
"epoch": 1.095582329317269,
"grad_norm": 0.23208123445510864,
"learning_rate": 0.00018192186885409973,
"loss": 0.1156,
"step": 170
},
{
"epoch": 1.1020080321285142,
"grad_norm": 0.25448471307754517,
"learning_rate": 0.0001816812533714425,
"loss": 0.1322,
"step": 171
},
{
"epoch": 1.108433734939759,
"grad_norm": 0.18578830361366272,
"learning_rate": 0.00018143920852876257,
"loss": 0.078,
"step": 172
},
{
"epoch": 1.114859437751004,
"grad_norm": 0.21140921115875244,
"learning_rate": 0.0001811957385616612,
"loss": 0.1078,
"step": 173
},
{
"epoch": 1.121285140562249,
"grad_norm": 0.24159879982471466,
"learning_rate": 0.0001809508477306783,
"loss": 0.098,
"step": 174
},
{
"epoch": 1.127710843373494,
"grad_norm": 0.2108435034751892,
"learning_rate": 0.00018070454032121787,
"loss": 0.085,
"step": 175
},
{
"epoch": 1.134136546184739,
"grad_norm": 0.25270572304725647,
"learning_rate": 0.00018045682064347275,
"loss": 0.0984,
"step": 176
},
{
"epoch": 1.140562248995984,
"grad_norm": 0.2605237066745758,
"learning_rate": 0.00018020769303234962,
"loss": 0.1125,
"step": 177
},
{
"epoch": 1.146987951807229,
"grad_norm": 0.2768741846084595,
"learning_rate": 0.00017995716184739284,
"loss": 0.0868,
"step": 178
},
{
"epoch": 1.153413654618474,
"grad_norm": 0.3283689320087433,
"learning_rate": 0.00017970523147270822,
"loss": 0.0932,
"step": 179
},
{
"epoch": 1.159839357429719,
"grad_norm": 0.2760496735572815,
"learning_rate": 0.0001794519063168864,
"loss": 0.0702,
"step": 180
},
{
"epoch": 1.1662650602409639,
"grad_norm": 0.1508658230304718,
"learning_rate": 0.0001791971908129256,
"loss": 0.1086,
"step": 181
},
{
"epoch": 1.1726907630522088,
"grad_norm": 0.1741812825202942,
"learning_rate": 0.000178941089418154,
"loss": 0.1102,
"step": 182
},
{
"epoch": 1.1791164658634539,
"grad_norm": 0.18406537175178528,
"learning_rate": 0.000178683606614152,
"loss": 0.1185,
"step": 183
},
{
"epoch": 1.1855421686746987,
"grad_norm": 0.18714162707328796,
"learning_rate": 0.00017842474690667344,
"loss": 0.1078,
"step": 184
},
{
"epoch": 1.1919678714859439,
"grad_norm": 0.15225981175899506,
"learning_rate": 0.00017816451482556702,
"loss": 0.0808,
"step": 185
},
{
"epoch": 1.1983935742971887,
"grad_norm": 0.1622186154127121,
"learning_rate": 0.0001779029149246969,
"loss": 0.097,
"step": 186
},
{
"epoch": 1.2048192771084336,
"grad_norm": 0.17352862656116486,
"learning_rate": 0.00017763995178186307,
"loss": 0.1094,
"step": 187
},
{
"epoch": 1.2112449799196787,
"grad_norm": 0.14557699859142303,
"learning_rate": 0.00017737562999872118,
"loss": 0.1031,
"step": 188
},
{
"epoch": 1.2176706827309236,
"grad_norm": 0.1729564219713211,
"learning_rate": 0.00017710995420070215,
"loss": 0.1109,
"step": 189
},
{
"epoch": 1.2240963855421687,
"grad_norm": 0.17331069707870483,
"learning_rate": 0.00017684292903693102,
"loss": 0.1163,
"step": 190
},
{
"epoch": 1.2305220883534136,
"grad_norm": 0.1967068910598755,
"learning_rate": 0.0001765745591801458,
"loss": 0.1137,
"step": 191
},
{
"epoch": 1.2369477911646587,
"grad_norm": 0.20813412964344025,
"learning_rate": 0.00017630484932661559,
"loss": 0.0865,
"step": 192
},
{
"epoch": 1.2433734939759036,
"grad_norm": 0.17115503549575806,
"learning_rate": 0.0001760338041960583,
"loss": 0.0954,
"step": 193
},
{
"epoch": 1.2497991967871487,
"grad_norm": 0.2135663777589798,
"learning_rate": 0.00017576142853155838,
"loss": 0.099,
"step": 194
},
{
"epoch": 1.2562248995983936,
"grad_norm": 0.2796885669231415,
"learning_rate": 0.00017548772709948343,
"loss": 0.1166,
"step": 195
},
{
"epoch": 1.2626506024096384,
"grad_norm": 0.2290525585412979,
"learning_rate": 0.0001752127046894011,
"loss": 0.1018,
"step": 196
},
{
"epoch": 1.2690763052208835,
"grad_norm": 0.23698222637176514,
"learning_rate": 0.0001749363661139951,
"loss": 0.0871,
"step": 197
},
{
"epoch": 1.2755020080321284,
"grad_norm": 0.2161116749048233,
"learning_rate": 0.00017465871620898102,
"loss": 0.0819,
"step": 198
},
{
"epoch": 1.2819277108433735,
"grad_norm": 0.2709653377532959,
"learning_rate": 0.00017437975983302178,
"loss": 0.082,
"step": 199
},
{
"epoch": 1.2883534136546184,
"grad_norm": 0.2437043935060501,
"learning_rate": 0.0001740995018676425,
"loss": 0.07,
"step": 200
},
{
"epoch": 1.2947791164658635,
"grad_norm": 0.24623267352581024,
"learning_rate": 0.0001738179472171452,
"loss": 0.0868,
"step": 201
},
{
"epoch": 1.3012048192771084,
"grad_norm": 0.27310508489608765,
"learning_rate": 0.00017353510080852282,
"loss": 0.0857,
"step": 202
},
{
"epoch": 1.3076305220883535,
"grad_norm": 0.2532103657722473,
"learning_rate": 0.0001732509675913731,
"loss": 0.0885,
"step": 203
},
{
"epoch": 1.3140562248995984,
"grad_norm": 0.2618705928325653,
"learning_rate": 0.000172965552537812,
"loss": 0.0903,
"step": 204
},
{
"epoch": 1.3204819277108433,
"grad_norm": 0.3039279282093048,
"learning_rate": 0.00017267886064238662,
"loss": 0.0963,
"step": 205
},
{
"epoch": 1.3269076305220884,
"grad_norm": 0.16821685433387756,
"learning_rate": 0.00017239089692198785,
"loss": 0.0837,
"step": 206
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.275329053401947,
"learning_rate": 0.0001721016664157625,
"loss": 0.1015,
"step": 207
},
{
"epoch": 1.3397590361445784,
"grad_norm": 0.1983174830675125,
"learning_rate": 0.00017181117418502525,
"loss": 0.1156,
"step": 208
},
{
"epoch": 1.3461847389558232,
"grad_norm": 0.19256579875946045,
"learning_rate": 0.00017151942531316988,
"loss": 0.1055,
"step": 209
},
{
"epoch": 1.3526104417670683,
"grad_norm": 0.19830577075481415,
"learning_rate": 0.00017122642490558055,
"loss": 0.1142,
"step": 210
},
{
"epoch": 1.3590361445783132,
"grad_norm": 0.17073017358779907,
"learning_rate": 0.00017093217808954232,
"loss": 0.1305,
"step": 211
},
{
"epoch": 1.3654618473895583,
"grad_norm": 0.18915396928787231,
"learning_rate": 0.00017063669001415145,
"loss": 0.1147,
"step": 212
},
{
"epoch": 1.3718875502008032,
"grad_norm": 0.13788312673568726,
"learning_rate": 0.00017033996585022528,
"loss": 0.1056,
"step": 213
},
{
"epoch": 1.378313253012048,
"grad_norm": 0.203065887093544,
"learning_rate": 0.00017004201079021176,
"loss": 0.1355,
"step": 214
},
{
"epoch": 1.3847389558232932,
"grad_norm": 0.2112981230020523,
"learning_rate": 0.00016974283004809858,
"loss": 0.1215,
"step": 215
},
{
"epoch": 1.391164658634538,
"grad_norm": 0.19515666365623474,
"learning_rate": 0.00016944242885932206,
"loss": 0.135,
"step": 216
},
{
"epoch": 1.3975903614457832,
"grad_norm": 0.19761696457862854,
"learning_rate": 0.0001691408124806752,
"loss": 0.125,
"step": 217
},
{
"epoch": 1.404016064257028,
"grad_norm": 0.18920212984085083,
"learning_rate": 0.00016883798619021608,
"loss": 0.0967,
"step": 218
},
{
"epoch": 1.410441767068273,
"grad_norm": 0.1732681393623352,
"learning_rate": 0.0001685339552871752,
"loss": 0.0984,
"step": 219
},
{
"epoch": 1.416867469879518,
"grad_norm": 0.20118467509746552,
"learning_rate": 0.00016822872509186297,
"loss": 0.0871,
"step": 220
},
{
"epoch": 1.4232931726907632,
"grad_norm": 0.24401867389678955,
"learning_rate": 0.0001679223009455764,
"loss": 0.0971,
"step": 221
},
{
"epoch": 1.429718875502008,
"grad_norm": 0.22608117759227753,
"learning_rate": 0.00016761468821050585,
"loss": 0.0996,
"step": 222
},
{
"epoch": 1.436144578313253,
"grad_norm": 0.19186720252037048,
"learning_rate": 0.00016730589226964098,
"loss": 0.0757,
"step": 223
},
{
"epoch": 1.442570281124498,
"grad_norm": 0.24773664772510529,
"learning_rate": 0.00016699591852667673,
"loss": 0.0819,
"step": 224
},
{
"epoch": 1.448995983935743,
"grad_norm": 0.2296506017446518,
"learning_rate": 0.00016668477240591864,
"loss": 0.0967,
"step": 225
},
{
"epoch": 1.455421686746988,
"grad_norm": 0.23210635781288147,
"learning_rate": 0.00016637245935218799,
"loss": 0.089,
"step": 226
},
{
"epoch": 1.461847389558233,
"grad_norm": 0.22376962006092072,
"learning_rate": 0.00016605898483072648,
"loss": 0.0839,
"step": 227
},
{
"epoch": 1.4682730923694778,
"grad_norm": 0.23127049207687378,
"learning_rate": 0.00016574435432710068,
"loss": 0.0827,
"step": 228
},
{
"epoch": 1.4746987951807229,
"grad_norm": 0.31701013445854187,
"learning_rate": 0.0001654285733471059,
"loss": 0.1,
"step": 229
},
{
"epoch": 1.481124497991968,
"grad_norm": 0.3070242702960968,
"learning_rate": 0.0001651116474166699,
"loss": 0.1036,
"step": 230
},
{
"epoch": 1.4875502008032129,
"grad_norm": 0.18072882294654846,
"learning_rate": 0.00016479358208175627,
"loss": 0.1061,
"step": 231
},
{
"epoch": 1.4939759036144578,
"grad_norm": 0.14309802651405334,
"learning_rate": 0.00016447438290826733,
"loss": 0.092,
"step": 232
},
{
"epoch": 1.5004016064257029,
"grad_norm": 0.18977715075016022,
"learning_rate": 0.00016415405548194663,
"loss": 0.1152,
"step": 233
},
{
"epoch": 1.5068273092369477,
"grad_norm": 0.22865934669971466,
"learning_rate": 0.00016383260540828135,
"loss": 0.116,
"step": 234
},
{
"epoch": 1.5132530120481928,
"grad_norm": 0.2020760327577591,
"learning_rate": 0.00016351003831240415,
"loss": 0.112,
"step": 235
},
{
"epoch": 1.5196787148594377,
"grad_norm": 0.25912925601005554,
"learning_rate": 0.00016318635983899465,
"loss": 0.1282,
"step": 236
},
{
"epoch": 1.5261044176706826,
"grad_norm": 0.1735217273235321,
"learning_rate": 0.0001628615756521809,
"loss": 0.1034,
"step": 237
},
{
"epoch": 1.5325301204819277,
"grad_norm": 0.19721132516860962,
"learning_rate": 0.0001625356914354399,
"loss": 0.1338,
"step": 238
},
{
"epoch": 1.5389558232931728,
"grad_norm": 0.19484449923038483,
"learning_rate": 0.0001622087128914985,
"loss": 0.1214,
"step": 239
},
{
"epoch": 1.5453815261044177,
"grad_norm": 0.16421280801296234,
"learning_rate": 0.00016188064574223335,
"loss": 0.0866,
"step": 240
},
{
"epoch": 1.5518072289156626,
"grad_norm": 0.1922108381986618,
"learning_rate": 0.0001615514957285709,
"loss": 0.1298,
"step": 241
},
{
"epoch": 1.5582329317269075,
"grad_norm": 0.16495804488658905,
"learning_rate": 0.00016122126861038688,
"loss": 0.1056,
"step": 242
},
{
"epoch": 1.5646586345381526,
"grad_norm": 0.2061115801334381,
"learning_rate": 0.00016088997016640562,
"loss": 0.1008,
"step": 243
},
{
"epoch": 1.5710843373493977,
"grad_norm": 0.21605950593948364,
"learning_rate": 0.00016055760619409877,
"loss": 0.099,
"step": 244
},
{
"epoch": 1.5775100401606426,
"grad_norm": 0.21308393776416779,
"learning_rate": 0.00016022418250958385,
"loss": 0.1041,
"step": 245
},
{
"epoch": 1.5839357429718874,
"grad_norm": 0.30087393522262573,
"learning_rate": 0.00015988970494752272,
"loss": 0.1192,
"step": 246
},
{
"epoch": 1.5903614457831325,
"grad_norm": 0.22396108508110046,
"learning_rate": 0.00015955417936101913,
"loss": 0.0985,
"step": 247
},
{
"epoch": 1.5967871485943776,
"grad_norm": 0.27335667610168457,
"learning_rate": 0.00015921761162151653,
"loss": 0.0809,
"step": 248
},
{
"epoch": 1.6032128514056225,
"grad_norm": 0.19635237753391266,
"learning_rate": 0.00015888000761869528,
"loss": 0.074,
"step": 249
},
{
"epoch": 1.6096385542168674,
"grad_norm": 0.24129214882850647,
"learning_rate": 0.0001585413732603695,
"loss": 0.0948,
"step": 250
},
{
"epoch": 1.6160642570281123,
"grad_norm": 0.26676061749458313,
"learning_rate": 0.00015820171447238383,
"loss": 0.1169,
"step": 251
},
{
"epoch": 1.6224899598393574,
"grad_norm": 0.23809854686260223,
"learning_rate": 0.0001578610371985096,
"loss": 0.0916,
"step": 252
},
{
"epoch": 1.6289156626506025,
"grad_norm": 0.21806567907333374,
"learning_rate": 0.00015751934740034092,
"loss": 0.0897,
"step": 253
},
{
"epoch": 1.6353413654618474,
"grad_norm": 0.2490801066160202,
"learning_rate": 0.00015717665105719015,
"loss": 0.1021,
"step": 254
},
{
"epoch": 1.6417670682730923,
"grad_norm": 0.32763025164604187,
"learning_rate": 0.00015683295416598367,
"loss": 0.0981,
"step": 255
},
{
"epoch": 1.6481927710843374,
"grad_norm": 0.15386274456977844,
"learning_rate": 0.00015648826274115653,
"loss": 0.0735,
"step": 256
},
{
"epoch": 1.6546184738955825,
"grad_norm": 0.16144217550754547,
"learning_rate": 0.00015614258281454734,
"loss": 0.1047,
"step": 257
},
{
"epoch": 1.6610441767068274,
"grad_norm": 0.16441850364208221,
"learning_rate": 0.00015579592043529292,
"loss": 0.1014,
"step": 258
},
{
"epoch": 1.6674698795180722,
"grad_norm": 0.2219092845916748,
"learning_rate": 0.00015544828166972203,
"loss": 0.1492,
"step": 259
},
{
"epoch": 1.6738955823293171,
"grad_norm": 0.2408316433429718,
"learning_rate": 0.00015509967260124964,
"loss": 0.1373,
"step": 260
},
{
"epoch": 1.6803212851405622,
"grad_norm": 0.16629627346992493,
"learning_rate": 0.0001547500993302702,
"loss": 0.1024,
"step": 261
},
{
"epoch": 1.6867469879518073,
"grad_norm": 0.17428399622440338,
"learning_rate": 0.000154399567974051,
"loss": 0.1071,
"step": 262
},
{
"epoch": 1.6931726907630522,
"grad_norm": 0.20689523220062256,
"learning_rate": 0.00015404808466662508,
"loss": 0.1164,
"step": 263
},
{
"epoch": 1.699598393574297,
"grad_norm": 0.19431588053703308,
"learning_rate": 0.0001536956555586839,
"loss": 0.1095,
"step": 264
},
{
"epoch": 1.7060240963855422,
"grad_norm": 0.28836753964424133,
"learning_rate": 0.0001533422868174697,
"loss": 0.0958,
"step": 265
},
{
"epoch": 1.7124497991967873,
"grad_norm": 0.16699343919754028,
"learning_rate": 0.00015298798462666765,
"loss": 0.1017,
"step": 266
},
{
"epoch": 1.7188755020080322,
"grad_norm": 0.20274591445922852,
"learning_rate": 0.00015263275518629754,
"loss": 0.1082,
"step": 267
},
{
"epoch": 1.725301204819277,
"grad_norm": 0.20032569766044617,
"learning_rate": 0.00015227660471260528,
"loss": 0.1201,
"step": 268
},
{
"epoch": 1.731726907630522,
"grad_norm": 0.19980573654174805,
"learning_rate": 0.00015191953943795427,
"loss": 0.1072,
"step": 269
},
{
"epoch": 1.738152610441767,
"grad_norm": 0.2036619335412979,
"learning_rate": 0.00015156156561071612,
"loss": 0.1083,
"step": 270
},
{
"epoch": 1.7445783132530122,
"grad_norm": 0.17177674174308777,
"learning_rate": 0.0001512026894951615,
"loss": 0.0981,
"step": 271
},
{
"epoch": 1.751004016064257,
"grad_norm": 0.2405836135149002,
"learning_rate": 0.00015084291737135048,
"loss": 0.1005,
"step": 272
},
{
"epoch": 1.757429718875502,
"grad_norm": 0.20927219092845917,
"learning_rate": 0.00015048225553502244,
"loss": 0.0895,
"step": 273
},
{
"epoch": 1.763855421686747,
"grad_norm": 0.22314170002937317,
"learning_rate": 0.00015012071029748614,
"loss": 0.0874,
"step": 274
},
{
"epoch": 1.7702811244979921,
"grad_norm": 0.21546539664268494,
"learning_rate": 0.00014975828798550933,
"loss": 0.0765,
"step": 275
},
{
"epoch": 1.776706827309237,
"grad_norm": 0.27791547775268555,
"learning_rate": 0.00014939499494120761,
"loss": 0.0851,
"step": 276
},
{
"epoch": 1.783132530120482,
"grad_norm": 0.23379187285900116,
"learning_rate": 0.00014903083752193397,
"loss": 0.1173,
"step": 277
},
{
"epoch": 1.7895582329317268,
"grad_norm": 0.30948150157928467,
"learning_rate": 0.0001486658221001672,
"loss": 0.0994,
"step": 278
},
{
"epoch": 1.7959839357429719,
"grad_norm": 0.31466349959373474,
"learning_rate": 0.0001482999550634006,
"loss": 0.1006,
"step": 279
},
{
"epoch": 1.802409638554217,
"grad_norm": 0.29642632603645325,
"learning_rate": 0.0001479332428140299,
"loss": 0.0992,
"step": 280
},
{
"epoch": 1.8088353413654619,
"grad_norm": 0.155403733253479,
"learning_rate": 0.00014756569176924153,
"loss": 0.081,
"step": 281
},
{
"epoch": 1.8152610441767068,
"grad_norm": 0.15768149495124817,
"learning_rate": 0.0001471973083609002,
"loss": 0.0994,
"step": 282
},
{
"epoch": 1.8216867469879519,
"grad_norm": 0.21333357691764832,
"learning_rate": 0.00014682809903543632,
"loss": 0.0975,
"step": 283
},
{
"epoch": 1.8281124497991967,
"grad_norm": 0.1593853384256363,
"learning_rate": 0.00014645807025373328,
"loss": 0.1053,
"step": 284
},
{
"epoch": 1.8345381526104418,
"grad_norm": 0.2197875678539276,
"learning_rate": 0.0001460872284910143,
"loss": 0.1231,
"step": 285
},
{
"epoch": 1.8409638554216867,
"grad_norm": 0.1849106401205063,
"learning_rate": 0.000145715580236729,
"loss": 0.1369,
"step": 286
},
{
"epoch": 1.8473895582329316,
"grad_norm": 0.19566693902015686,
"learning_rate": 0.00014534313199444031,
"loss": 0.1229,
"step": 287
},
{
"epoch": 1.8538152610441767,
"grad_norm": 0.1796487420797348,
"learning_rate": 0.00014496989028171012,
"loss": 0.1046,
"step": 288
},
{
"epoch": 1.8602409638554218,
"grad_norm": 0.18316736817359924,
"learning_rate": 0.00014459586162998545,
"loss": 0.1128,
"step": 289
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.11896166950464249,
"learning_rate": 0.00014422105258448425,
"loss": 0.0722,
"step": 290
},
{
"epoch": 1.8730923694779116,
"grad_norm": 0.17587630450725555,
"learning_rate": 0.00014384546970408067,
"loss": 0.1201,
"step": 291
},
{
"epoch": 1.8795180722891565,
"grad_norm": 0.22650708258152008,
"learning_rate": 0.0001434691195611905,
"loss": 0.1196,
"step": 292
},
{
"epoch": 1.8859437751004016,
"grad_norm": 0.17509253323078156,
"learning_rate": 0.000143092008741656,
"loss": 0.1129,
"step": 293
},
{
"epoch": 1.8923694779116467,
"grad_norm": 0.29918667674064636,
"learning_rate": 0.00014271414384463063,
"loss": 0.1159,
"step": 294
},
{
"epoch": 1.8987951807228916,
"grad_norm": 0.20377112925052643,
"learning_rate": 0.00014233553148246364,
"loss": 0.1046,
"step": 295
},
{
"epoch": 1.9052208835341364,
"grad_norm": 0.20923396944999695,
"learning_rate": 0.00014195617828058446,
"loss": 0.1094,
"step": 296
},
{
"epoch": 1.9116465863453815,
"grad_norm": 0.20258182287216187,
"learning_rate": 0.00014157609087738656,
"loss": 0.095,
"step": 297
},
{
"epoch": 1.9180722891566266,
"grad_norm": 0.17875652015209198,
"learning_rate": 0.00014119527592411146,
"loss": 0.0937,
"step": 298
},
{
"epoch": 1.9244979919678715,
"grad_norm": 0.21565286815166473,
"learning_rate": 0.00014081374008473213,
"loss": 0.0915,
"step": 299
},
{
"epoch": 1.9309236947791164,
"grad_norm": 0.2412794530391693,
"learning_rate": 0.0001404314900358366,
"loss": 0.087,
"step": 300
},
{
"epoch": 1.9373493975903613,
"grad_norm": 0.20525000989437103,
"learning_rate": 0.00014004853246651092,
"loss": 0.1062,
"step": 301
},
{
"epoch": 1.9437751004016064,
"grad_norm": 0.20360495150089264,
"learning_rate": 0.0001396648740782224,
"loss": 0.0909,
"step": 302
},
{
"epoch": 1.9502008032128515,
"grad_norm": 0.2990632653236389,
"learning_rate": 0.000139280521584702,
"loss": 0.0891,
"step": 303
},
{
"epoch": 1.9566265060240964,
"grad_norm": 0.2693440020084381,
"learning_rate": 0.00013889548171182702,
"loss": 0.0964,
"step": 304
},
{
"epoch": 1.9630522088353413,
"grad_norm": 0.35366424918174744,
"learning_rate": 0.0001385097611975034,
"loss": 0.1023,
"step": 305
},
{
"epoch": 1.9694779116465864,
"grad_norm": 0.189253568649292,
"learning_rate": 0.00013812336679154777,
"loss": 0.0928,
"step": 306
},
{
"epoch": 1.9759036144578315,
"grad_norm": 0.20141035318374634,
"learning_rate": 0.0001377363052555693,
"loss": 0.1159,
"step": 307
},
{
"epoch": 1.9823293172690764,
"grad_norm": 0.29336804151535034,
"learning_rate": 0.00013734858336285162,
"loss": 0.1123,
"step": 308
},
{
"epoch": 1.9887550200803212,
"grad_norm": 0.2803572714328766,
"learning_rate": 0.00013696020789823388,
"loss": 0.1217,
"step": 309
},
{
"epoch": 1.9951807228915661,
"grad_norm": 0.2023596316576004,
"learning_rate": 0.00013657118565799236,
"loss": 0.081,
"step": 310
},
{
"epoch": 2.004819277108434,
"grad_norm": 0.6364520192146301,
"learning_rate": 0.00013618152344972142,
"loss": 0.2296,
"step": 311
},
{
"epoch": 2.0112449799196788,
"grad_norm": 0.15013103187084198,
"learning_rate": 0.00013579122809221432,
"loss": 0.0985,
"step": 312
},
{
"epoch": 2.0176706827309236,
"grad_norm": 0.16968320310115814,
"learning_rate": 0.00013540030641534404,
"loss": 0.1061,
"step": 313
},
{
"epoch": 2.0240963855421685,
"grad_norm": 0.11979459226131439,
"learning_rate": 0.00013500876525994354,
"loss": 0.0778,
"step": 314
},
{
"epoch": 2.030522088353414,
"grad_norm": 0.1250924915075302,
"learning_rate": 0.00013461661147768633,
"loss": 0.076,
"step": 315
},
{
"epoch": 2.0369477911646587,
"grad_norm": 0.14744716882705688,
"learning_rate": 0.00013422385193096636,
"loss": 0.1088,
"step": 316
},
{
"epoch": 2.0433734939759036,
"grad_norm": 0.14878836274147034,
"learning_rate": 0.000133830493492778,
"loss": 0.0886,
"step": 317
},
{
"epoch": 2.0497991967871485,
"grad_norm": 0.14586399495601654,
"learning_rate": 0.00013343654304659574,
"loss": 0.0737,
"step": 318
},
{
"epoch": 2.0562248995983934,
"grad_norm": 0.26601502299308777,
"learning_rate": 0.00013304200748625377,
"loss": 0.1376,
"step": 319
},
{
"epoch": 2.0626506024096387,
"grad_norm": 0.13429085910320282,
"learning_rate": 0.0001326468937158254,
"loss": 0.0653,
"step": 320
},
{
"epoch": 2.0690763052208836,
"grad_norm": 0.15061058104038239,
"learning_rate": 0.00013225120864950217,
"loss": 0.0832,
"step": 321
},
{
"epoch": 2.0755020080321285,
"grad_norm": 0.19543641805648804,
"learning_rate": 0.00013185495921147272,
"loss": 0.0904,
"step": 322
},
{
"epoch": 2.0819277108433734,
"grad_norm": 0.19916664063930511,
"learning_rate": 0.00013145815233580192,
"loss": 0.1002,
"step": 323
},
{
"epoch": 2.0883534136546187,
"grad_norm": 0.18353912234306335,
"learning_rate": 0.00013106079496630937,
"loss": 0.065,
"step": 324
},
{
"epoch": 2.0947791164658636,
"grad_norm": 0.22987468540668488,
"learning_rate": 0.00013066289405644778,
"loss": 0.0889,
"step": 325
},
{
"epoch": 2.1012048192771084,
"grad_norm": 0.18574562668800354,
"learning_rate": 0.00013026445656918155,
"loss": 0.0771,
"step": 326
},
{
"epoch": 2.1076305220883533,
"grad_norm": 0.2045065313577652,
"learning_rate": 0.00012986548947686467,
"loss": 0.0761,
"step": 327
},
{
"epoch": 2.114056224899598,
"grad_norm": 0.20475462079048157,
"learning_rate": 0.00012946599976111883,
"loss": 0.0603,
"step": 328
},
{
"epoch": 2.1204819277108435,
"grad_norm": 0.2551652491092682,
"learning_rate": 0.0001290659944127113,
"loss": 0.0619,
"step": 329
},
{
"epoch": 2.1269076305220884,
"grad_norm": 0.24741911888122559,
"learning_rate": 0.0001286654804314325,
"loss": 0.0685,
"step": 330
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.3083299696445465,
"learning_rate": 0.0001282644648259735,
"loss": 0.073,
"step": 331
},
{
"epoch": 2.139759036144578,
"grad_norm": 0.26823994517326355,
"learning_rate": 0.00012786295461380344,
"loss": 0.0743,
"step": 332
},
{
"epoch": 2.1461847389558235,
"grad_norm": 0.267733633518219,
"learning_rate": 0.00012746095682104669,
"loss": 0.0914,
"step": 333
},
{
"epoch": 2.1526104417670684,
"grad_norm": 0.5392053723335266,
"learning_rate": 0.00012705847848235995,
"loss": 0.0627,
"step": 334
},
{
"epoch": 2.1590361445783133,
"grad_norm": 0.2896229922771454,
"learning_rate": 0.00012665552664080907,
"loss": 0.0777,
"step": 335
},
{
"epoch": 2.165461847389558,
"grad_norm": 0.145288348197937,
"learning_rate": 0.00012625210834774585,
"loss": 0.0673,
"step": 336
},
{
"epoch": 2.171887550200803,
"grad_norm": 0.20815478265285492,
"learning_rate": 0.00012584823066268466,
"loss": 0.0887,
"step": 337
},
{
"epoch": 2.1783132530120484,
"grad_norm": 0.15457971394062042,
"learning_rate": 0.00012544390065317887,
"loss": 0.0806,
"step": 338
},
{
"epoch": 2.1847389558232932,
"grad_norm": 0.1739414930343628,
"learning_rate": 0.00012503912539469714,
"loss": 0.0904,
"step": 339
},
{
"epoch": 2.191164658634538,
"grad_norm": 0.19280481338500977,
"learning_rate": 0.00012463391197049977,
"loss": 0.0936,
"step": 340
},
{
"epoch": 2.197590361445783,
"grad_norm": 0.19487957656383514,
"learning_rate": 0.00012422826747151444,
"loss": 0.0711,
"step": 341
},
{
"epoch": 2.2040160642570283,
"grad_norm": 0.1922217458486557,
"learning_rate": 0.00012382219899621246,
"loss": 0.0937,
"step": 342
},
{
"epoch": 2.2104417670682732,
"grad_norm": 0.19978083670139313,
"learning_rate": 0.00012341571365048442,
"loss": 0.09,
"step": 343
},
{
"epoch": 2.216867469879518,
"grad_norm": 0.17337800562381744,
"learning_rate": 0.00012300881854751568,
"loss": 0.0849,
"step": 344
},
{
"epoch": 2.223293172690763,
"grad_norm": 0.16611528396606445,
"learning_rate": 0.0001226015208076622,
"loss": 0.0695,
"step": 345
},
{
"epoch": 2.229718875502008,
"grad_norm": 0.19705650210380554,
"learning_rate": 0.0001221938275583257,
"loss": 0.0924,
"step": 346
},
{
"epoch": 2.236144578313253,
"grad_norm": 0.2103363275527954,
"learning_rate": 0.00012178574593382899,
"loss": 0.0971,
"step": 347
},
{
"epoch": 2.242570281124498,
"grad_norm": 0.17318083345890045,
"learning_rate": 0.0001213772830752912,
"loss": 0.0764,
"step": 348
},
{
"epoch": 2.248995983935743,
"grad_norm": 0.18497149646282196,
"learning_rate": 0.0001209684461305028,
"loss": 0.0775,
"step": 349
},
{
"epoch": 2.255421686746988,
"grad_norm": 0.16112066805362701,
"learning_rate": 0.00012055924225380038,
"loss": 0.066,
"step": 350
},
{
"epoch": 2.261847389558233,
"grad_norm": 0.14638184010982513,
"learning_rate": 0.00012014967860594164,
"loss": 0.0667,
"step": 351
},
{
"epoch": 2.268273092369478,
"grad_norm": 0.23443520069122314,
"learning_rate": 0.00011973976235398,
"loss": 0.0746,
"step": 352
},
{
"epoch": 2.274698795180723,
"grad_norm": 0.15776869654655457,
"learning_rate": 0.0001193295006711392,
"loss": 0.0576,
"step": 353
},
{
"epoch": 2.281124497991968,
"grad_norm": 0.1940746307373047,
"learning_rate": 0.00011891890073668763,
"loss": 0.0614,
"step": 354
},
{
"epoch": 2.2875502008032127,
"grad_norm": 0.14611606299877167,
"learning_rate": 0.00011850796973581302,
"loss": 0.057,
"step": 355
},
{
"epoch": 2.293975903614458,
"grad_norm": 0.2916417419910431,
"learning_rate": 0.00011809671485949636,
"loss": 0.0677,
"step": 356
},
{
"epoch": 2.300401606425703,
"grad_norm": 0.2484523206949234,
"learning_rate": 0.00011768514330438627,
"loss": 0.0846,
"step": 357
},
{
"epoch": 2.306827309236948,
"grad_norm": 0.18685579299926758,
"learning_rate": 0.00011727326227267308,
"loss": 0.0682,
"step": 358
},
{
"epoch": 2.3132530120481927,
"grad_norm": 0.23231656849384308,
"learning_rate": 0.00011686107897196255,
"loss": 0.0782,
"step": 359
},
{
"epoch": 2.319678714859438,
"grad_norm": 0.2503097653388977,
"learning_rate": 0.00011644860061515008,
"loss": 0.0745,
"step": 360
},
{
"epoch": 2.326104417670683,
"grad_norm": 0.15438750386238098,
"learning_rate": 0.00011603583442029426,
"loss": 0.0599,
"step": 361
},
{
"epoch": 2.3325301204819278,
"grad_norm": 0.11933010071516037,
"learning_rate": 0.00011562278761049066,
"loss": 0.0705,
"step": 362
},
{
"epoch": 2.3389558232931726,
"grad_norm": 0.2344401627779007,
"learning_rate": 0.00011520946741374534,
"loss": 0.1086,
"step": 363
},
{
"epoch": 2.3453815261044175,
"grad_norm": 0.1429268717765808,
"learning_rate": 0.00011479588106284848,
"loss": 0.0793,
"step": 364
},
{
"epoch": 2.3518072289156624,
"grad_norm": 0.170423224568367,
"learning_rate": 0.00011438203579524778,
"loss": 0.0876,
"step": 365
},
{
"epoch": 2.3582329317269077,
"grad_norm": 0.21951356530189514,
"learning_rate": 0.00011396793885292165,
"loss": 0.1001,
"step": 366
},
{
"epoch": 2.3646586345381526,
"grad_norm": 0.18042345345020294,
"learning_rate": 0.00011355359748225279,
"loss": 0.096,
"step": 367
},
{
"epoch": 2.3710843373493975,
"grad_norm": 0.19519764184951782,
"learning_rate": 0.00011313901893390113,
"loss": 0.0842,
"step": 368
},
{
"epoch": 2.3775100401606424,
"grad_norm": 0.1564580202102661,
"learning_rate": 0.00011272421046267696,
"loss": 0.0849,
"step": 369
},
{
"epoch": 2.3839357429718877,
"grad_norm": 0.21376530826091766,
"learning_rate": 0.00011230917932741418,
"loss": 0.0848,
"step": 370
},
{
"epoch": 2.3903614457831326,
"grad_norm": 0.16924604773521423,
"learning_rate": 0.00011189393279084308,
"loss": 0.0986,
"step": 371
},
{
"epoch": 2.3967871485943775,
"grad_norm": 0.142182394862175,
"learning_rate": 0.00011147847811946328,
"loss": 0.0753,
"step": 372
},
{
"epoch": 2.4032128514056224,
"grad_norm": 0.2380589097738266,
"learning_rate": 0.00011106282258341665,
"loss": 0.0873,
"step": 373
},
{
"epoch": 2.4096385542168672,
"grad_norm": 0.17887993156909943,
"learning_rate": 0.00011064697345636002,
"loss": 0.0721,
"step": 374
},
{
"epoch": 2.4160642570281126,
"grad_norm": 0.16593624651432037,
"learning_rate": 0.00011023093801533785,
"loss": 0.0673,
"step": 375
},
{
"epoch": 2.4224899598393574,
"grad_norm": 0.19967305660247803,
"learning_rate": 0.00010981472354065514,
"loss": 0.0839,
"step": 376
},
{
"epoch": 2.4289156626506023,
"grad_norm": 0.1671656221151352,
"learning_rate": 0.00010939833731574967,
"loss": 0.0692,
"step": 377
},
{
"epoch": 2.435341365461847,
"grad_norm": 0.2287617176771164,
"learning_rate": 0.00010898178662706471,
"loss": 0.0841,
"step": 378
},
{
"epoch": 2.4417670682730925,
"grad_norm": 0.1811789572238922,
"learning_rate": 0.00010856507876392166,
"loss": 0.0549,
"step": 379
},
{
"epoch": 2.4481927710843374,
"grad_norm": 0.1921742856502533,
"learning_rate": 0.00010814822101839224,
"loss": 0.0723,
"step": 380
},
{
"epoch": 2.4546184738955823,
"grad_norm": 0.24973797798156738,
"learning_rate": 0.00010773122068517103,
"loss": 0.0746,
"step": 381
},
{
"epoch": 2.461044176706827,
"grad_norm": 0.22716690599918365,
"learning_rate": 0.00010731408506144782,
"loss": 0.0837,
"step": 382
},
{
"epoch": 2.467469879518072,
"grad_norm": 0.19027042388916016,
"learning_rate": 0.00010689682144677983,
"loss": 0.0575,
"step": 383
},
{
"epoch": 2.4738955823293174,
"grad_norm": 0.26334255933761597,
"learning_rate": 0.00010647943714296405,
"loss": 0.0688,
"step": 384
},
{
"epoch": 2.4803212851405623,
"grad_norm": 0.26130348443984985,
"learning_rate": 0.00010606193945390943,
"loss": 0.0704,
"step": 385
},
{
"epoch": 2.486746987951807,
"grad_norm": 0.16175444424152374,
"learning_rate": 0.00010564433568550909,
"loss": 0.0739,
"step": 386
},
{
"epoch": 2.493172690763052,
"grad_norm": 0.34543898701667786,
"learning_rate": 0.00010522663314551247,
"loss": 0.0883,
"step": 387
},
{
"epoch": 2.4995983935742974,
"grad_norm": 0.16623060405254364,
"learning_rate": 0.00010480883914339736,
"loss": 0.0916,
"step": 388
},
{
"epoch": 2.5060240963855422,
"grad_norm": 0.15224424004554749,
"learning_rate": 0.0001043909609902422,
"loss": 0.1017,
"step": 389
},
{
"epoch": 2.512449799196787,
"grad_norm": 0.2146722823381424,
"learning_rate": 0.00010397300599859785,
"loss": 0.0699,
"step": 390
},
{
"epoch": 2.518875502008032,
"grad_norm": 0.18753445148468018,
"learning_rate": 0.00010355498148235996,
"loss": 0.1012,
"step": 391
},
{
"epoch": 2.525301204819277,
"grad_norm": 0.19780860841274261,
"learning_rate": 0.00010313689475664063,
"loss": 0.0876,
"step": 392
},
{
"epoch": 2.531726907630522,
"grad_norm": 0.15880204737186432,
"learning_rate": 0.0001027187531376407,
"loss": 0.0978,
"step": 393
},
{
"epoch": 2.538152610441767,
"grad_norm": 0.20312370359897614,
"learning_rate": 0.00010230056394252161,
"loss": 0.0978,
"step": 394
},
{
"epoch": 2.544578313253012,
"grad_norm": 0.17712520062923431,
"learning_rate": 0.00010188233448927724,
"loss": 0.0632,
"step": 395
},
{
"epoch": 2.551004016064257,
"grad_norm": 0.1579594612121582,
"learning_rate": 0.00010146407209660607,
"loss": 0.0868,
"step": 396
},
{
"epoch": 2.557429718875502,
"grad_norm": 0.18369610607624054,
"learning_rate": 0.00010104578408378289,
"loss": 0.0721,
"step": 397
},
{
"epoch": 2.563855421686747,
"grad_norm": 0.1889200061559677,
"learning_rate": 0.00010062747777053094,
"loss": 0.079,
"step": 398
},
{
"epoch": 2.570281124497992,
"grad_norm": 0.1784103661775589,
"learning_rate": 0.00010020916047689358,
"loss": 0.0703,
"step": 399
},
{
"epoch": 2.576706827309237,
"grad_norm": 0.1942092776298523,
"learning_rate": 9.979083952310643e-05,
"loss": 0.0851,
"step": 400
},
{
"epoch": 2.5831325301204817,
"grad_norm": 0.17258763313293457,
"learning_rate": 9.937252222946908e-05,
"loss": 0.0632,
"step": 401
},
{
"epoch": 2.589558232931727,
"grad_norm": 0.1579107642173767,
"learning_rate": 9.895421591621712e-05,
"loss": 0.0568,
"step": 402
},
{
"epoch": 2.595983935742972,
"grad_norm": 0.16073255240917206,
"learning_rate": 9.853592790339396e-05,
"loss": 0.0571,
"step": 403
},
{
"epoch": 2.602409638554217,
"grad_norm": 0.19859455525875092,
"learning_rate": 9.811766551072278e-05,
"loss": 0.0695,
"step": 404
},
{
"epoch": 2.6088353413654617,
"grad_norm": 0.20779484510421753,
"learning_rate": 9.769943605747844e-05,
"loss": 0.0679,
"step": 405
},
{
"epoch": 2.615261044176707,
"grad_norm": 0.1808435469865799,
"learning_rate": 9.72812468623593e-05,
"loss": 0.0663,
"step": 406
},
{
"epoch": 2.621686746987952,
"grad_norm": 0.18272174894809723,
"learning_rate": 9.686310524335938e-05,
"loss": 0.0617,
"step": 407
},
{
"epoch": 2.628112449799197,
"grad_norm": 0.21186350286006927,
"learning_rate": 9.644501851764007e-05,
"loss": 0.0597,
"step": 408
},
{
"epoch": 2.6345381526104417,
"grad_norm": 0.1978769302368164,
"learning_rate": 9.602699400140218e-05,
"loss": 0.0573,
"step": 409
},
{
"epoch": 2.6409638554216865,
"grad_norm": 0.2839438319206238,
"learning_rate": 9.560903900975785e-05,
"loss": 0.0705,
"step": 410
},
{
"epoch": 2.647389558232932,
"grad_norm": 0.19903969764709473,
"learning_rate": 9.519116085660267e-05,
"loss": 0.0696,
"step": 411
},
{
"epoch": 2.6538152610441768,
"grad_norm": 0.2470594197511673,
"learning_rate": 9.477336685448754e-05,
"loss": 0.0781,
"step": 412
},
{
"epoch": 2.6602409638554216,
"grad_norm": 0.16970917582511902,
"learning_rate": 9.435566431449092e-05,
"loss": 0.0799,
"step": 413
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.20240218937397003,
"learning_rate": 9.39380605460906e-05,
"loss": 0.0751,
"step": 414
},
{
"epoch": 2.673092369477912,
"grad_norm": 0.15831498801708221,
"learning_rate": 9.352056285703599e-05,
"loss": 0.0764,
"step": 415
},
{
"epoch": 2.6795180722891567,
"grad_norm": 0.2208964228630066,
"learning_rate": 9.31031785532202e-05,
"loss": 0.089,
"step": 416
},
{
"epoch": 2.6859437751004016,
"grad_norm": 0.23068203032016754,
"learning_rate": 9.268591493855222e-05,
"loss": 0.0822,
"step": 417
},
{
"epoch": 2.6923694779116465,
"grad_norm": 0.18787510693073273,
"learning_rate": 9.226877931482898e-05,
"loss": 0.0762,
"step": 418
},
{
"epoch": 2.6987951807228914,
"grad_norm": 0.22357405722141266,
"learning_rate": 9.18517789816078e-05,
"loss": 0.0889,
"step": 419
},
{
"epoch": 2.7052208835341367,
"grad_norm": 0.15312552452087402,
"learning_rate": 9.143492123607838e-05,
"loss": 0.0579,
"step": 420
},
{
"epoch": 2.7116465863453816,
"grad_norm": 0.2166433036327362,
"learning_rate": 9.101821337293532e-05,
"loss": 0.1423,
"step": 421
},
{
"epoch": 2.7180722891566265,
"grad_norm": 0.1675548553466797,
"learning_rate": 9.060166268425038e-05,
"loss": 0.09,
"step": 422
},
{
"epoch": 2.7244979919678713,
"grad_norm": 0.16221830248832703,
"learning_rate": 9.018527645934488e-05,
"loss": 0.0606,
"step": 423
},
{
"epoch": 2.7309236947791167,
"grad_norm": 0.20156528055667877,
"learning_rate": 8.976906198466213e-05,
"loss": 0.0855,
"step": 424
},
{
"epoch": 2.7373493975903616,
"grad_norm": 0.2165391594171524,
"learning_rate": 8.935302654364e-05,
"loss": 0.0935,
"step": 425
},
{
"epoch": 2.7437751004016064,
"grad_norm": 0.16395288705825806,
"learning_rate": 8.893717741658336e-05,
"loss": 0.092,
"step": 426
},
{
"epoch": 2.7502008032128513,
"grad_norm": 0.24706722795963287,
"learning_rate": 8.852152188053674e-05,
"loss": 0.0717,
"step": 427
},
{
"epoch": 2.756626506024096,
"grad_norm": 0.19659721851348877,
"learning_rate": 8.810606720915697e-05,
"loss": 0.0703,
"step": 428
},
{
"epoch": 2.7630522088353415,
"grad_norm": 0.2377336174249649,
"learning_rate": 8.769082067258585e-05,
"loss": 0.0711,
"step": 429
},
{
"epoch": 2.7694779116465864,
"grad_norm": 0.1496395319700241,
"learning_rate": 8.727578953732303e-05,
"loss": 0.0565,
"step": 430
},
{
"epoch": 2.7759036144578313,
"grad_norm": 0.230519101023674,
"learning_rate": 8.686098106609889e-05,
"loss": 0.0676,
"step": 431
},
{
"epoch": 2.782329317269076,
"grad_norm": 0.1836637556552887,
"learning_rate": 8.644640251774722e-05,
"loss": 0.0613,
"step": 432
},
{
"epoch": 2.7887550200803215,
"grad_norm": 0.17303813993930817,
"learning_rate": 8.603206114707837e-05,
"loss": 0.066,
"step": 433
},
{
"epoch": 2.7951807228915664,
"grad_norm": 0.18832409381866455,
"learning_rate": 8.561796420475227e-05,
"loss": 0.0539,
"step": 434
},
{
"epoch": 2.8016064257028113,
"grad_norm": 0.23530371487140656,
"learning_rate": 8.52041189371515e-05,
"loss": 0.0669,
"step": 435
},
{
"epoch": 2.808032128514056,
"grad_norm": 0.12783204019069672,
"learning_rate": 8.479053258625467e-05,
"loss": 0.0686,
"step": 436
},
{
"epoch": 2.814457831325301,
"grad_norm": 0.16126012802124023,
"learning_rate": 8.437721238950938e-05,
"loss": 0.0783,
"step": 437
},
{
"epoch": 2.820883534136546,
"grad_norm": 0.16663892567157745,
"learning_rate": 8.396416557970576e-05,
"loss": 0.0899,
"step": 438
},
{
"epoch": 2.8273092369477912,
"grad_norm": 0.12750643491744995,
"learning_rate": 8.355139938484995e-05,
"loss": 0.087,
"step": 439
},
{
"epoch": 2.833734939759036,
"grad_norm": 0.1801062375307083,
"learning_rate": 8.313892102803749e-05,
"loss": 0.079,
"step": 440
},
{
"epoch": 2.840160642570281,
"grad_norm": 0.2397543489933014,
"learning_rate": 8.272673772732695e-05,
"loss": 0.0982,
"step": 441
},
{
"epoch": 2.8465863453815263,
"grad_norm": 0.16935226321220398,
"learning_rate": 8.231485669561371e-05,
"loss": 0.0675,
"step": 442
},
{
"epoch": 2.853012048192771,
"grad_norm": 0.14082035422325134,
"learning_rate": 8.190328514050365e-05,
"loss": 0.0748,
"step": 443
},
{
"epoch": 2.859437751004016,
"grad_norm": 0.1471889168024063,
"learning_rate": 8.1492030264187e-05,
"loss": 0.0622,
"step": 444
},
{
"epoch": 2.865863453815261,
"grad_norm": 0.19526907801628113,
"learning_rate": 8.108109926331238e-05,
"loss": 0.0737,
"step": 445
},
{
"epoch": 2.872289156626506,
"grad_norm": 0.15070655941963196,
"learning_rate": 8.067049932886084e-05,
"loss": 0.0751,
"step": 446
},
{
"epoch": 2.8787148594377507,
"grad_norm": 0.1857602894306183,
"learning_rate": 8.026023764601999e-05,
"loss": 0.0717,
"step": 447
},
{
"epoch": 2.885140562248996,
"grad_norm": 0.20066916942596436,
"learning_rate": 7.985032139405836e-05,
"loss": 0.0792,
"step": 448
},
{
"epoch": 2.891566265060241,
"grad_norm": 0.19384227693080902,
"learning_rate": 7.944075774619963e-05,
"loss": 0.0575,
"step": 449
},
{
"epoch": 2.897991967871486,
"grad_norm": 0.21878117322921753,
"learning_rate": 7.903155386949723e-05,
"loss": 0.0799,
"step": 450
},
{
"epoch": 2.904417670682731,
"grad_norm": 0.2109462022781372,
"learning_rate": 7.862271692470884e-05,
"loss": 0.081,
"step": 451
},
{
"epoch": 2.910843373493976,
"grad_norm": 0.2199956178665161,
"learning_rate": 7.821425406617106e-05,
"loss": 0.0749,
"step": 452
},
{
"epoch": 2.917269076305221,
"grad_norm": 0.2037278711795807,
"learning_rate": 7.780617244167432e-05,
"loss": 0.0529,
"step": 453
},
{
"epoch": 2.923694779116466,
"grad_norm": 0.19817106425762177,
"learning_rate": 7.739847919233781e-05,
"loss": 0.0482,
"step": 454
},
{
"epoch": 2.9301204819277107,
"grad_norm": 0.16905316710472107,
"learning_rate": 7.699118145248434e-05,
"loss": 0.0518,
"step": 455
},
{
"epoch": 2.9365461847389556,
"grad_norm": 0.18011616170406342,
"learning_rate": 7.658428634951562e-05,
"loss": 0.0619,
"step": 456
},
{
"epoch": 2.942971887550201,
"grad_norm": 0.18177881836891174,
"learning_rate": 7.617780100378756e-05,
"loss": 0.057,
"step": 457
},
{
"epoch": 2.9493975903614458,
"grad_norm": 0.1976725161075592,
"learning_rate": 7.57717325284856e-05,
"loss": 0.0512,
"step": 458
},
{
"epoch": 2.9558232931726907,
"grad_norm": 0.16665002703666687,
"learning_rate": 7.536608802950027e-05,
"loss": 0.0603,
"step": 459
},
{
"epoch": 2.962248995983936,
"grad_norm": 0.2631849944591522,
"learning_rate": 7.496087460530285e-05,
"loss": 0.0644,
"step": 460
},
{
"epoch": 2.968674698795181,
"grad_norm": 0.14662465453147888,
"learning_rate": 7.455609934682116e-05,
"loss": 0.1023,
"step": 461
},
{
"epoch": 2.9751004016064257,
"grad_norm": 0.15676066279411316,
"learning_rate": 7.415176933731536e-05,
"loss": 0.078,
"step": 462
},
{
"epoch": 2.9815261044176706,
"grad_norm": 0.15064530074596405,
"learning_rate": 7.374789165225416e-05,
"loss": 0.0697,
"step": 463
},
{
"epoch": 2.9879518072289155,
"grad_norm": 0.21094205975532532,
"learning_rate": 7.334447335919096e-05,
"loss": 0.065,
"step": 464
},
{
"epoch": 2.9943775100401604,
"grad_norm": 0.19423390924930573,
"learning_rate": 7.294152151764006e-05,
"loss": 0.0587,
"step": 465
},
{
"epoch": 3.004016064257028,
"grad_norm": 0.5364864468574524,
"learning_rate": 7.253904317895332e-05,
"loss": 0.0888,
"step": 466
},
{
"epoch": 3.010441767068273,
"grad_norm": 0.10208001732826233,
"learning_rate": 7.21370453861966e-05,
"loss": 0.0712,
"step": 467
},
{
"epoch": 3.016867469879518,
"grad_norm": 0.13294284045696259,
"learning_rate": 7.173553517402652e-05,
"loss": 0.0869,
"step": 468
},
{
"epoch": 3.0232931726907633,
"grad_norm": 0.11979340761899948,
"learning_rate": 7.133451956856751e-05,
"loss": 0.0719,
"step": 469
},
{
"epoch": 3.029718875502008,
"grad_norm": 0.1777201145887375,
"learning_rate": 7.093400558728871e-05,
"loss": 0.068,
"step": 470
},
{
"epoch": 3.036144578313253,
"grad_norm": 0.14670364558696747,
"learning_rate": 7.053400023888115e-05,
"loss": 0.0693,
"step": 471
},
{
"epoch": 3.042570281124498,
"grad_norm": 0.15252293646335602,
"learning_rate": 7.013451052313534e-05,
"loss": 0.0649,
"step": 472
},
{
"epoch": 3.048995983935743,
"grad_norm": 0.14201124012470245,
"learning_rate": 6.973554343081846e-05,
"loss": 0.0515,
"step": 473
},
{
"epoch": 3.055421686746988,
"grad_norm": 0.15977801382541656,
"learning_rate": 6.933710594355225e-05,
"loss": 0.0593,
"step": 474
},
{
"epoch": 3.061847389558233,
"grad_norm": 0.11069828271865845,
"learning_rate": 6.893920503369068e-05,
"loss": 0.0407,
"step": 475
},
{
"epoch": 3.068273092369478,
"grad_norm": 0.1887078583240509,
"learning_rate": 6.854184766419812e-05,
"loss": 0.0619,
"step": 476
},
{
"epoch": 3.0746987951807228,
"grad_norm": 0.17938368022441864,
"learning_rate": 6.814504078852729e-05,
"loss": 0.0634,
"step": 477
},
{
"epoch": 3.081124497991968,
"grad_norm": 0.1669437140226364,
"learning_rate": 6.774879135049787e-05,
"loss": 0.0518,
"step": 478
},
{
"epoch": 3.087550200803213,
"grad_norm": 0.14382565021514893,
"learning_rate": 6.735310628417461e-05,
"loss": 0.0472,
"step": 479
},
{
"epoch": 3.093975903614458,
"grad_norm": 0.176390141248703,
"learning_rate": 6.695799251374625e-05,
"loss": 0.0519,
"step": 480
},
{
"epoch": 3.1004016064257027,
"grad_norm": 0.19478528201580048,
"learning_rate": 6.656345695340431e-05,
"loss": 0.0631,
"step": 481
},
{
"epoch": 3.1068273092369476,
"grad_norm": 0.24837417900562286,
"learning_rate": 6.616950650722205e-05,
"loss": 0.0646,
"step": 482
},
{
"epoch": 3.113253012048193,
"grad_norm": 0.20292866230010986,
"learning_rate": 6.577614806903365e-05,
"loss": 0.048,
"step": 483
},
{
"epoch": 3.119678714859438,
"grad_norm": 0.14314322173595428,
"learning_rate": 6.538338852231367e-05,
"loss": 0.0387,
"step": 484
},
{
"epoch": 3.1261044176706827,
"grad_norm": 0.623166561126709,
"learning_rate": 6.499123474005647e-05,
"loss": 0.056,
"step": 485
},
{
"epoch": 3.1325301204819276,
"grad_norm": 0.18953579664230347,
"learning_rate": 6.4599693584656e-05,
"loss": 0.052,
"step": 486
},
{
"epoch": 3.1389558232931725,
"grad_norm": 0.30603814125061035,
"learning_rate": 6.420877190778569e-05,
"loss": 0.0628,
"step": 487
},
{
"epoch": 3.145381526104418,
"grad_norm": 0.22006677091121674,
"learning_rate": 6.381847655027864e-05,
"loss": 0.0492,
"step": 488
},
{
"epoch": 3.1518072289156627,
"grad_norm": 0.2588195204734802,
"learning_rate": 6.342881434200765e-05,
"loss": 0.0466,
"step": 489
},
{
"epoch": 3.1582329317269076,
"grad_norm": 0.22890865802764893,
"learning_rate": 6.303979210176614e-05,
"loss": 0.0621,
"step": 490
},
{
"epoch": 3.1646586345381524,
"grad_norm": 0.15218952298164368,
"learning_rate": 6.26514166371484e-05,
"loss": 0.0451,
"step": 491
},
{
"epoch": 3.1710843373493978,
"grad_norm": 0.11059743911027908,
"learning_rate": 6.226369474443072e-05,
"loss": 0.0769,
"step": 492
},
{
"epoch": 3.1775100401606426,
"grad_norm": 0.1349543184041977,
"learning_rate": 6.18766332084523e-05,
"loss": 0.0576,
"step": 493
},
{
"epoch": 3.1839357429718875,
"grad_norm": 0.14686280488967896,
"learning_rate": 6.149023880249665e-05,
"loss": 0.0839,
"step": 494
},
{
"epoch": 3.1903614457831324,
"grad_norm": 0.14011719822883606,
"learning_rate": 6.110451828817298e-05,
"loss": 0.0591,
"step": 495
},
{
"epoch": 3.1967871485943773,
"grad_norm": 0.18035003542900085,
"learning_rate": 6.071947841529801e-05,
"loss": 0.069,
"step": 496
},
{
"epoch": 3.2032128514056226,
"grad_norm": 0.1427018791437149,
"learning_rate": 6.03351259217776e-05,
"loss": 0.0647,
"step": 497
},
{
"epoch": 3.2096385542168675,
"grad_norm": 0.21494245529174805,
"learning_rate": 5.995146753348909e-05,
"loss": 0.0764,
"step": 498
},
{
"epoch": 3.2160642570281124,
"grad_norm": 0.18623854219913483,
"learning_rate": 5.9568509964163464e-05,
"loss": 0.0558,
"step": 499
},
{
"epoch": 3.2224899598393573,
"grad_norm": 0.13678805530071259,
"learning_rate": 5.9186259915267916e-05,
"loss": 0.048,
"step": 500
},
{
"epoch": 3.2289156626506026,
"grad_norm": 0.14847120642662048,
"learning_rate": 5.880472407588857e-05,
"loss": 0.0668,
"step": 501
},
{
"epoch": 3.2353413654618475,
"grad_norm": 0.14581717550754547,
"learning_rate": 5.842390912261344e-05,
"loss": 0.0424,
"step": 502
},
{
"epoch": 3.2417670682730924,
"grad_norm": 0.14835858345031738,
"learning_rate": 5.8043821719415534e-05,
"loss": 0.0565,
"step": 503
},
{
"epoch": 3.2481927710843372,
"grad_norm": 0.1661667823791504,
"learning_rate": 5.7664468517536395e-05,
"loss": 0.0432,
"step": 504
},
{
"epoch": 3.254618473895582,
"grad_norm": 0.13604165613651276,
"learning_rate": 5.728585615536946e-05,
"loss": 0.0417,
"step": 505
},
{
"epoch": 3.2610441767068274,
"grad_norm": 0.18449333310127258,
"learning_rate": 5.6907991258344e-05,
"loss": 0.0662,
"step": 506
},
{
"epoch": 3.2674698795180723,
"grad_norm": 0.14538291096687317,
"learning_rate": 5.6530880438809494e-05,
"loss": 0.0378,
"step": 507
},
{
"epoch": 3.273895582329317,
"grad_norm": 0.19400519132614136,
"learning_rate": 5.615453029591935e-05,
"loss": 0.0494,
"step": 508
},
{
"epoch": 3.280321285140562,
"grad_norm": 0.2027265727519989,
"learning_rate": 5.5778947415515784e-05,
"loss": 0.0472,
"step": 509
},
{
"epoch": 3.2867469879518074,
"grad_norm": 0.18922823667526245,
"learning_rate": 5.540413837001459e-05,
"loss": 0.0393,
"step": 510
},
{
"epoch": 3.2931726907630523,
"grad_norm": 0.20464153587818146,
"learning_rate": 5.50301097182899e-05,
"loss": 0.0369,
"step": 511
},
{
"epoch": 3.299598393574297,
"grad_norm": 0.23268243670463562,
"learning_rate": 5.465686800555967e-05,
"loss": 0.0362,
"step": 512
},
{
"epoch": 3.306024096385542,
"grad_norm": 0.16447444260120392,
"learning_rate": 5.4284419763271e-05,
"loss": 0.0305,
"step": 513
},
{
"epoch": 3.312449799196787,
"grad_norm": 0.20870855450630188,
"learning_rate": 5.391277150898575e-05,
"loss": 0.0539,
"step": 514
},
{
"epoch": 3.3188755020080323,
"grad_norm": 0.19453927874565125,
"learning_rate": 5.354192974626674e-05,
"loss": 0.0462,
"step": 515
},
{
"epoch": 3.325301204819277,
"grad_norm": 0.12926504015922546,
"learning_rate": 5.317190096456368e-05,
"loss": 0.047,
"step": 516
},
{
"epoch": 3.331726907630522,
"grad_norm": 0.13683585822582245,
"learning_rate": 5.2802691639099834e-05,
"loss": 0.0655,
"step": 517
},
{
"epoch": 3.338152610441767,
"grad_norm": 0.13933661580085754,
"learning_rate": 5.24343082307585e-05,
"loss": 0.0713,
"step": 518
},
{
"epoch": 3.3445783132530122,
"grad_norm": 0.15325239300727844,
"learning_rate": 5.206675718597012e-05,
"loss": 0.0471,
"step": 519
},
{
"epoch": 3.351004016064257,
"grad_norm": 0.21659892797470093,
"learning_rate": 5.1700044936599434e-05,
"loss": 0.1054,
"step": 520
},
{
"epoch": 3.357429718875502,
"grad_norm": 0.1403968334197998,
"learning_rate": 5.133417789983277e-05,
"loss": 0.0668,
"step": 521
},
{
"epoch": 3.363855421686747,
"grad_norm": 0.16732285916805267,
"learning_rate": 5.0969162478066055e-05,
"loss": 0.0587,
"step": 522
},
{
"epoch": 3.3702811244979918,
"grad_norm": 0.12107214331626892,
"learning_rate": 5.060500505879244e-05,
"loss": 0.0534,
"step": 523
},
{
"epoch": 3.376706827309237,
"grad_norm": 0.1527867466211319,
"learning_rate": 5.0241712014490684e-05,
"loss": 0.0497,
"step": 524
},
{
"epoch": 3.383132530120482,
"grad_norm": 0.15944266319274902,
"learning_rate": 4.9879289702513845e-05,
"loss": 0.0653,
"step": 525
},
{
"epoch": 3.389558232931727,
"grad_norm": 0.15926086902618408,
"learning_rate": 4.95177444649776e-05,
"loss": 0.049,
"step": 526
},
{
"epoch": 3.3959839357429717,
"grad_norm": 0.13858525454998016,
"learning_rate": 4.9157082628649545e-05,
"loss": 0.0544,
"step": 527
},
{
"epoch": 3.402409638554217,
"grad_norm": 0.14041852951049805,
"learning_rate": 4.87973105048385e-05,
"loss": 0.0481,
"step": 528
},
{
"epoch": 3.408835341365462,
"grad_norm": 0.12636280059814453,
"learning_rate": 4.8438434389283895e-05,
"loss": 0.0533,
"step": 529
},
{
"epoch": 3.415261044176707,
"grad_norm": 0.21128305792808533,
"learning_rate": 4.8080460562045736e-05,
"loss": 0.0532,
"step": 530
},
{
"epoch": 3.4216867469879517,
"grad_norm": 0.13390301167964935,
"learning_rate": 4.7723395287394746e-05,
"loss": 0.0334,
"step": 531
},
{
"epoch": 3.4281124497991966,
"grad_norm": 0.17304478585720062,
"learning_rate": 4.736724481370248e-05,
"loss": 0.0537,
"step": 532
},
{
"epoch": 3.434538152610442,
"grad_norm": 0.17624051868915558,
"learning_rate": 4.701201537333237e-05,
"loss": 0.045,
"step": 533
},
{
"epoch": 3.440963855421687,
"grad_norm": 0.18076051771640778,
"learning_rate": 4.6657713182530316e-05,
"loss": 0.0458,
"step": 534
},
{
"epoch": 3.4473895582329317,
"grad_norm": 0.15712404251098633,
"learning_rate": 4.630434444131615e-05,
"loss": 0.0403,
"step": 535
},
{
"epoch": 3.4538152610441766,
"grad_norm": 0.1949484795331955,
"learning_rate": 4.595191533337494e-05,
"loss": 0.0455,
"step": 536
},
{
"epoch": 3.460240963855422,
"grad_norm": 0.18234334886074066,
"learning_rate": 4.560043202594899e-05,
"loss": 0.0347,
"step": 537
},
{
"epoch": 3.466666666666667,
"grad_norm": 0.23936811089515686,
"learning_rate": 4.524990066972982e-05,
"loss": 0.0567,
"step": 538
},
{
"epoch": 3.4730923694779117,
"grad_norm": 0.21601049602031708,
"learning_rate": 4.4900327398750363e-05,
"loss": 0.0487,
"step": 539
},
{
"epoch": 3.4795180722891565,
"grad_norm": 0.20177985727787018,
"learning_rate": 4.4551718330278006e-05,
"loss": 0.0401,
"step": 540
},
{
"epoch": 3.4859437751004014,
"grad_norm": 0.14613081514835358,
"learning_rate": 4.4204079564707144e-05,
"loss": 0.0383,
"step": 541
},
{
"epoch": 3.4923694779116468,
"grad_norm": 0.11608117818832397,
"learning_rate": 4.3857417185452644e-05,
"loss": 0.0638,
"step": 542
},
{
"epoch": 3.4987951807228916,
"grad_norm": 0.13324810564517975,
"learning_rate": 4.351173725884351e-05,
"loss": 0.0705,
"step": 543
},
{
"epoch": 3.5052208835341365,
"grad_norm": 0.16427621245384216,
"learning_rate": 4.3167045834016326e-05,
"loss": 0.0861,
"step": 544
},
{
"epoch": 3.5116465863453814,
"grad_norm": 0.12879817187786102,
"learning_rate": 4.282334894280986e-05,
"loss": 0.0532,
"step": 545
},
{
"epoch": 3.5180722891566267,
"grad_norm": 0.13565368950366974,
"learning_rate": 4.2480652599659154e-05,
"loss": 0.064,
"step": 546
},
{
"epoch": 3.5244979919678716,
"grad_norm": 0.1572374403476715,
"learning_rate": 4.213896280149041e-05,
"loss": 0.0627,
"step": 547
},
{
"epoch": 3.5309236947791165,
"grad_norm": 0.1345859169960022,
"learning_rate": 4.179828552761617e-05,
"loss": 0.0654,
"step": 548
},
{
"epoch": 3.5373493975903614,
"grad_norm": 0.18093015253543854,
"learning_rate": 4.1458626739630526e-05,
"loss": 0.0593,
"step": 549
},
{
"epoch": 3.5437751004016063,
"grad_norm": 0.16655421257019043,
"learning_rate": 4.1119992381304754e-05,
"loss": 0.0524,
"step": 550
},
{
"epoch": 3.550200803212851,
"grad_norm": 0.1454760730266571,
"learning_rate": 4.078238837848352e-05,
"loss": 0.0478,
"step": 551
},
{
"epoch": 3.5566265060240965,
"grad_norm": 0.16041091084480286,
"learning_rate": 4.04458206389809e-05,
"loss": 0.0586,
"step": 552
},
{
"epoch": 3.5630522088353414,
"grad_norm": 0.1313060075044632,
"learning_rate": 4.011029505247732e-05,
"loss": 0.0441,
"step": 553
},
{
"epoch": 3.5694779116465862,
"grad_norm": 0.22944222390651703,
"learning_rate": 3.977581749041616e-05,
"loss": 0.0543,
"step": 554
},
{
"epoch": 3.5759036144578316,
"grad_norm": 0.20055855810642242,
"learning_rate": 3.9442393805901245e-05,
"loss": 0.0542,
"step": 555
},
{
"epoch": 3.5823293172690764,
"grad_norm": 0.20784057676792145,
"learning_rate": 3.91100298335944e-05,
"loss": 0.0474,
"step": 556
},
{
"epoch": 3.5887550200803213,
"grad_norm": 0.1995190680027008,
"learning_rate": 3.877873138961311e-05,
"loss": 0.0494,
"step": 557
},
{
"epoch": 3.595180722891566,
"grad_norm": 0.195552259683609,
"learning_rate": 3.844850427142914e-05,
"loss": 0.0433,
"step": 558
},
{
"epoch": 3.601606425702811,
"grad_norm": 0.18116550147533417,
"learning_rate": 3.811935425776667e-05,
"loss": 0.0454,
"step": 559
},
{
"epoch": 3.608032128514056,
"grad_norm": 0.19290290772914886,
"learning_rate": 3.779128710850151e-05,
"loss": 0.0428,
"step": 560
},
{
"epoch": 3.6144578313253013,
"grad_norm": 0.21416397392749786,
"learning_rate": 3.7464308564560106e-05,
"loss": 0.051,
"step": 561
},
{
"epoch": 3.620883534136546,
"grad_norm": 0.16343973577022552,
"learning_rate": 3.71384243478191e-05,
"loss": 0.0348,
"step": 562
},
{
"epoch": 3.627309236947791,
"grad_norm": 0.23021504282951355,
"learning_rate": 3.681364016100535e-05,
"loss": 0.0395,
"step": 563
},
{
"epoch": 3.6337349397590364,
"grad_norm": 0.17765948176383972,
"learning_rate": 3.64899616875959e-05,
"loss": 0.03,
"step": 564
},
{
"epoch": 3.6401606425702813,
"grad_norm": 0.19902700185775757,
"learning_rate": 3.616739459171866e-05,
"loss": 0.0375,
"step": 565
},
{
"epoch": 3.646586345381526,
"grad_norm": 0.12952236831188202,
"learning_rate": 3.5845944518053376e-05,
"loss": 0.0586,
"step": 566
},
{
"epoch": 3.653012048192771,
"grad_norm": 0.1288733333349228,
"learning_rate": 3.552561709173266e-05,
"loss": 0.0611,
"step": 567
},
{
"epoch": 3.659437751004016,
"grad_norm": 0.13545019924640656,
"learning_rate": 3.520641791824374e-05,
"loss": 0.0742,
"step": 568
},
{
"epoch": 3.665863453815261,
"grad_norm": 0.18385440111160278,
"learning_rate": 3.488835258333014e-05,
"loss": 0.0468,
"step": 569
},
{
"epoch": 3.672289156626506,
"grad_norm": 0.15589255094528198,
"learning_rate": 3.4571426652894144e-05,
"loss": 0.0691,
"step": 570
},
{
"epoch": 3.678714859437751,
"grad_norm": 0.16703784465789795,
"learning_rate": 3.4255645672899325e-05,
"loss": 0.0707,
"step": 571
},
{
"epoch": 3.685140562248996,
"grad_norm": 0.1708272397518158,
"learning_rate": 3.3941015169273524e-05,
"loss": 0.0699,
"step": 572
},
{
"epoch": 3.691566265060241,
"grad_norm": 0.1434539556503296,
"learning_rate": 3.362754064781202e-05,
"loss": 0.058,
"step": 573
},
{
"epoch": 3.697991967871486,
"grad_norm": 0.12717658281326294,
"learning_rate": 3.331522759408138e-05,
"loss": 0.045,
"step": 574
},
{
"epoch": 3.704417670682731,
"grad_norm": 0.1311410516500473,
"learning_rate": 3.300408147332327e-05,
"loss": 0.0369,
"step": 575
},
{
"epoch": 3.710843373493976,
"grad_norm": 0.13452477753162384,
"learning_rate": 3.269410773035903e-05,
"loss": 0.0382,
"step": 576
},
{
"epoch": 3.7172690763052207,
"grad_norm": 1.2888418436050415,
"learning_rate": 3.238531178949417e-05,
"loss": 0.0785,
"step": 577
},
{
"epoch": 3.7236947791164656,
"grad_norm": 0.15146106481552124,
"learning_rate": 3.207769905442359e-05,
"loss": 0.0726,
"step": 578
},
{
"epoch": 3.730120481927711,
"grad_norm": 0.14550191164016724,
"learning_rate": 3.177127490813706e-05,
"loss": 0.0504,
"step": 579
},
{
"epoch": 3.736546184738956,
"grad_norm": 0.1444544643163681,
"learning_rate": 3.1466044712824805e-05,
"loss": 0.0384,
"step": 580
},
{
"epoch": 3.7429718875502007,
"grad_norm": 0.160190150141716,
"learning_rate": 3.1162013809783955e-05,
"loss": 0.0491,
"step": 581
},
{
"epoch": 3.749397590361446,
"grad_norm": 0.1942124217748642,
"learning_rate": 3.0859187519324806e-05,
"loss": 0.0536,
"step": 582
},
{
"epoch": 3.755823293172691,
"grad_norm": 0.1264030784368515,
"learning_rate": 3.055757114067794e-05,
"loss": 0.0319,
"step": 583
},
{
"epoch": 3.762248995983936,
"grad_norm": 0.19579629600048065,
"learning_rate": 3.025716995190141e-05,
"loss": 0.0477,
"step": 584
},
{
"epoch": 3.7686746987951807,
"grad_norm": 0.16901759803295135,
"learning_rate": 2.995798920978825e-05,
"loss": 0.0306,
"step": 585
},
{
"epoch": 3.7751004016064256,
"grad_norm": 0.16561923921108246,
"learning_rate": 2.966003414977475e-05,
"loss": 0.0492,
"step": 586
},
{
"epoch": 3.7815261044176705,
"grad_norm": 0.2909844219684601,
"learning_rate": 2.9363309985848585e-05,
"loss": 0.0502,
"step": 587
},
{
"epoch": 3.787951807228916,
"grad_norm": 0.15100587904453278,
"learning_rate": 2.9067821910457704e-05,
"loss": 0.0343,
"step": 588
},
{
"epoch": 3.7943775100401607,
"grad_norm": 0.26900508999824524,
"learning_rate": 2.877357509441947e-05,
"loss": 0.0544,
"step": 589
},
{
"epoch": 3.8008032128514055,
"grad_norm": 0.23953552544116974,
"learning_rate": 2.8480574686830142e-05,
"loss": 0.0438,
"step": 590
},
{
"epoch": 3.807228915662651,
"grad_norm": 0.14778892695903778,
"learning_rate": 2.8188825814974795e-05,
"loss": 0.0455,
"step": 591
},
{
"epoch": 3.8136546184738958,
"grad_norm": 0.11664500087499619,
"learning_rate": 2.7898333584237534e-05,
"loss": 0.0634,
"step": 592
},
{
"epoch": 3.8200803212851406,
"grad_norm": 0.10635870695114136,
"learning_rate": 2.7609103078012166e-05,
"loss": 0.0603,
"step": 593
},
{
"epoch": 3.8265060240963855,
"grad_norm": 0.11662769317626953,
"learning_rate": 2.7321139357613412e-05,
"loss": 0.0466,
"step": 594
},
{
"epoch": 3.8329317269076304,
"grad_norm": 0.12710097432136536,
"learning_rate": 2.703444746218802e-05,
"loss": 0.063,
"step": 595
},
{
"epoch": 3.8393574297188753,
"grad_norm": 0.16935308277606964,
"learning_rate": 2.6749032408626907e-05,
"loss": 0.0519,
"step": 596
},
{
"epoch": 3.8457831325301206,
"grad_norm": 0.15195302665233612,
"learning_rate": 2.646489919147721e-05,
"loss": 0.076,
"step": 597
},
{
"epoch": 3.8522088353413655,
"grad_norm": 0.21082603931427002,
"learning_rate": 2.6182052782854806e-05,
"loss": 0.0771,
"step": 598
},
{
"epoch": 3.8586345381526104,
"grad_norm": 0.1390170454978943,
"learning_rate": 2.59004981323575e-05,
"loss": 0.0673,
"step": 599
},
{
"epoch": 3.8650602409638557,
"grad_norm": 0.11533928662538528,
"learning_rate": 2.5620240166978226e-05,
"loss": 0.0392,
"step": 600
},
{
"epoch": 3.8714859437751006,
"grad_norm": 0.1865217238664627,
"learning_rate": 2.5341283791018988e-05,
"loss": 0.0505,
"step": 601
},
{
"epoch": 3.8779116465863455,
"grad_norm": 0.15040706098079681,
"learning_rate": 2.5063633886004935e-05,
"loss": 0.0589,
"step": 602
},
{
"epoch": 3.8843373493975903,
"grad_norm": 0.15011747181415558,
"learning_rate": 2.4787295310598913e-05,
"loss": 0.0607,
"step": 603
},
{
"epoch": 3.8907630522088352,
"grad_norm": 0.15416325628757477,
"learning_rate": 2.45122729005166e-05,
"loss": 0.0613,
"step": 604
},
{
"epoch": 3.89718875502008,
"grad_norm": 0.20439183712005615,
"learning_rate": 2.423857146844164e-05,
"loss": 0.0688,
"step": 605
},
{
"epoch": 3.9036144578313254,
"grad_norm": 0.16020047664642334,
"learning_rate": 2.3966195803941715e-05,
"loss": 0.0513,
"step": 606
},
{
"epoch": 3.9100401606425703,
"grad_norm": 0.14457987248897552,
"learning_rate": 2.3695150673384437e-05,
"loss": 0.0374,
"step": 607
},
{
"epoch": 3.916465863453815,
"grad_norm": 0.12410798668861389,
"learning_rate": 2.3425440819854185e-05,
"loss": 0.0335,
"step": 608
},
{
"epoch": 3.92289156626506,
"grad_norm": 0.1418788731098175,
"learning_rate": 2.3157070963068984e-05,
"loss": 0.0286,
"step": 609
},
{
"epoch": 3.9293172690763054,
"grad_norm": 0.1596471220254898,
"learning_rate": 2.2890045799297876e-05,
"loss": 0.0321,
"step": 610
},
{
"epoch": 3.9357429718875503,
"grad_norm": 0.19366958737373352,
"learning_rate": 2.2624370001278838e-05,
"loss": 0.0398,
"step": 611
},
{
"epoch": 3.942168674698795,
"grad_norm": 0.1489342749118805,
"learning_rate": 2.2360048218136985e-05,
"loss": 0.0313,
"step": 612
},
{
"epoch": 3.94859437751004,
"grad_norm": 0.2047041803598404,
"learning_rate": 2.2097085075303138e-05,
"loss": 0.0517,
"step": 613
},
{
"epoch": 3.955020080321285,
"grad_norm": 0.24907703697681427,
"learning_rate": 2.1835485174433002e-05,
"loss": 0.051,
"step": 614
},
{
"epoch": 3.9614457831325303,
"grad_norm": 0.21545164287090302,
"learning_rate": 2.1575253093326586e-05,
"loss": 0.0327,
"step": 615
},
{
"epoch": 3.967871485943775,
"grad_norm": 0.11793698370456696,
"learning_rate": 2.131639338584801e-05,
"loss": 0.0516,
"step": 616
},
{
"epoch": 3.97429718875502,
"grad_norm": 0.14671315252780914,
"learning_rate": 2.1058910581846013e-05,
"loss": 0.0528,
"step": 617
},
{
"epoch": 3.980722891566265,
"grad_norm": 0.12005927413702011,
"learning_rate": 2.0802809187074434e-05,
"loss": 0.0404,
"step": 618
},
{
"epoch": 3.9871485943775102,
"grad_norm": 0.16592156887054443,
"learning_rate": 2.05480936831136e-05,
"loss": 0.0456,
"step": 619
},
{
"epoch": 3.993574297188755,
"grad_norm": 0.14437763392925262,
"learning_rate": 2.0294768527291796e-05,
"loss": 0.0292,
"step": 620
},
{
"epoch": 4.003212851405623,
"grad_norm": 0.4867587685585022,
"learning_rate": 2.004283815260717e-05,
"loss": 0.0602,
"step": 621
},
{
"epoch": 4.009638554216868,
"grad_norm": 0.11754161864519119,
"learning_rate": 1.9792306967650398e-05,
"loss": 0.0577,
"step": 622
},
{
"epoch": 4.016064257028113,
"grad_norm": 0.12216556817293167,
"learning_rate": 1.9543179356527252e-05,
"loss": 0.0782,
"step": 623
},
{
"epoch": 4.0224899598393575,
"grad_norm": 0.10597134381532669,
"learning_rate": 1.9295459678782168e-05,
"loss": 0.0623,
"step": 624
},
{
"epoch": 4.028915662650602,
"grad_norm": 0.13430526852607727,
"learning_rate": 1.904915226932169e-05,
"loss": 0.0573,
"step": 625
},
{
"epoch": 4.035341365461847,
"grad_norm": 0.14790372550487518,
"learning_rate": 1.88042614383388e-05,
"loss": 0.0494,
"step": 626
},
{
"epoch": 4.041767068273092,
"grad_norm": 0.1289214938879013,
"learning_rate": 1.856079147123746e-05,
"loss": 0.0353,
"step": 627
},
{
"epoch": 4.048192771084337,
"grad_norm": 0.12450610101222992,
"learning_rate": 1.8318746628557526e-05,
"loss": 0.0379,
"step": 628
},
{
"epoch": 4.054618473895582,
"grad_norm": 0.17245958745479584,
"learning_rate": 1.8078131145900267e-05,
"loss": 0.0583,
"step": 629
},
{
"epoch": 4.061044176706828,
"grad_norm": 0.14711426198482513,
"learning_rate": 1.7838949233854284e-05,
"loss": 0.0612,
"step": 630
},
{
"epoch": 4.067469879518073,
"grad_norm": 0.16549347341060638,
"learning_rate": 1.760120507792169e-05,
"loss": 0.049,
"step": 631
},
{
"epoch": 4.0738955823293175,
"grad_norm": 0.10457637906074524,
"learning_rate": 1.7364902838445106e-05,
"loss": 0.0337,
"step": 632
},
{
"epoch": 4.080321285140562,
"grad_norm": 0.19436487555503845,
"learning_rate": 1.713004665053457e-05,
"loss": 0.0455,
"step": 633
},
{
"epoch": 4.086746987951807,
"grad_norm": 0.14365814626216888,
"learning_rate": 1.6896640623995318e-05,
"loss": 0.0363,
"step": 634
},
{
"epoch": 4.093172690763052,
"grad_norm": 0.13444365561008453,
"learning_rate": 1.666468884325596e-05,
"loss": 0.0382,
"step": 635
},
{
"epoch": 4.099598393574297,
"grad_norm": 0.11268598586320877,
"learning_rate": 1.6434195367296802e-05,
"loss": 0.0323,
"step": 636
},
{
"epoch": 4.106024096385542,
"grad_norm": 0.12947283685207367,
"learning_rate": 1.6205164229578994e-05,
"loss": 0.0281,
"step": 637
},
{
"epoch": 4.112449799196787,
"grad_norm": 0.12646523118019104,
"learning_rate": 1.5977599437973867e-05,
"loss": 0.0264,
"step": 638
},
{
"epoch": 4.1188755020080325,
"grad_norm": 0.1570567935705185,
"learning_rate": 1.5751504974692733e-05,
"loss": 0.0356,
"step": 639
},
{
"epoch": 4.125301204819277,
"grad_norm": 0.1824863851070404,
"learning_rate": 1.55268847962174e-05,
"loss": 0.0368,
"step": 640
},
{
"epoch": 4.131726907630522,
"grad_norm": 0.17944642901420593,
"learning_rate": 1.5303742833230673e-05,
"loss": 0.0275,
"step": 641
},
{
"epoch": 4.138152610441767,
"grad_norm": 0.16530513763427734,
"learning_rate": 1.5082082990547796e-05,
"loss": 0.0308,
"step": 642
},
{
"epoch": 4.144578313253012,
"grad_norm": 0.17503738403320312,
"learning_rate": 1.4861909147048025e-05,
"loss": 0.0232,
"step": 643
},
{
"epoch": 4.151004016064257,
"grad_norm": 0.18777841329574585,
"learning_rate": 1.464322515560671e-05,
"loss": 0.032,
"step": 644
},
{
"epoch": 4.157429718875502,
"grad_norm": 0.2501942217350006,
"learning_rate": 1.4426034843027969e-05,
"loss": 0.0328,
"step": 645
},
{
"epoch": 4.163855421686747,
"grad_norm": 0.13108326494693756,
"learning_rate": 1.4210342009977628e-05,
"loss": 0.0393,
"step": 646
},
{
"epoch": 4.170281124497992,
"grad_norm": 0.1045239120721817,
"learning_rate": 1.3996150430916799e-05,
"loss": 0.0491,
"step": 647
},
{
"epoch": 4.176706827309237,
"grad_norm": 0.14830029010772705,
"learning_rate": 1.378346385403575e-05,
"loss": 0.0488,
"step": 648
},
{
"epoch": 4.183132530120482,
"grad_norm": 0.12067140638828278,
"learning_rate": 1.357228600118836e-05,
"loss": 0.0431,
"step": 649
},
{
"epoch": 4.189558232931727,
"grad_norm": 0.12366708368062973,
"learning_rate": 1.3362620567827033e-05,
"loss": 0.0488,
"step": 650
},
{
"epoch": 4.195983935742972,
"grad_norm": 0.14555774629116058,
"learning_rate": 1.3154471222937903e-05,
"loss": 0.05,
"step": 651
},
{
"epoch": 4.202409638554217,
"grad_norm": 0.16288283467292786,
"learning_rate": 1.2947841608976718e-05,
"loss": 0.0583,
"step": 652
},
{
"epoch": 4.208835341365462,
"grad_norm": 0.14024418592453003,
"learning_rate": 1.2742735341805145e-05,
"loss": 0.0404,
"step": 653
},
{
"epoch": 4.215261044176707,
"grad_norm": 0.13420836627483368,
"learning_rate": 1.253915601062734e-05,
"loss": 0.0587,
"step": 654
},
{
"epoch": 4.2216867469879515,
"grad_norm": 0.1517859846353531,
"learning_rate": 1.2337107177927365e-05,
"loss": 0.041,
"step": 655
},
{
"epoch": 4.228112449799196,
"grad_norm": 0.14088109135627747,
"learning_rate": 1.213659237940662e-05,
"loss": 0.0419,
"step": 656
},
{
"epoch": 4.234538152610442,
"grad_norm": 0.1399402618408203,
"learning_rate": 1.1937615123922052e-05,
"loss": 0.0443,
"step": 657
},
{
"epoch": 4.240963855421687,
"grad_norm": 0.15205906331539154,
"learning_rate": 1.174017889342489e-05,
"loss": 0.0429,
"step": 658
},
{
"epoch": 4.247389558232932,
"grad_norm": 0.3410640358924866,
"learning_rate": 1.1544287142899446e-05,
"loss": 0.0367,
"step": 659
},
{
"epoch": 4.253815261044177,
"grad_norm": 0.1357734352350235,
"learning_rate": 1.1349943300302913e-05,
"loss": 0.0294,
"step": 660
},
{
"epoch": 4.260240963855422,
"grad_norm": 0.14988286793231964,
"learning_rate": 1.1157150766505253e-05,
"loss": 0.0384,
"step": 661
},
{
"epoch": 4.266666666666667,
"grad_norm": 0.1004982739686966,
"learning_rate": 1.0965912915229625e-05,
"loss": 0.0263,
"step": 662
},
{
"epoch": 4.2730923694779115,
"grad_norm": 0.1396017074584961,
"learning_rate": 1.0776233092993527e-05,
"loss": 0.028,
"step": 663
},
{
"epoch": 4.279518072289156,
"grad_norm": 0.11266002804040909,
"learning_rate": 1.0588114619050028e-05,
"loss": 0.0205,
"step": 664
},
{
"epoch": 4.285943775100401,
"grad_norm": 0.15433090925216675,
"learning_rate": 1.040156078532989e-05,
"loss": 0.0316,
"step": 665
},
{
"epoch": 4.292369477911647,
"grad_norm": 0.15806183218955994,
"learning_rate": 1.0216574856383742e-05,
"loss": 0.0319,
"step": 666
},
{
"epoch": 4.298795180722892,
"grad_norm": 0.184244304895401,
"learning_rate": 1.0033160069325166e-05,
"loss": 0.0203,
"step": 667
},
{
"epoch": 4.305220883534137,
"grad_norm": 0.14676746726036072,
"learning_rate": 9.851319633773926e-06,
"loss": 0.0222,
"step": 668
},
{
"epoch": 4.311646586345382,
"grad_norm": 0.22139282524585724,
"learning_rate": 9.671056731799777e-06,
"loss": 0.0349,
"step": 669
},
{
"epoch": 4.3180722891566266,
"grad_norm": 0.16691678762435913,
"learning_rate": 9.49237451786692e-06,
"loss": 0.0227,
"step": 670
},
{
"epoch": 4.324497991967871,
"grad_norm": 0.16708509624004364,
"learning_rate": 9.315276118778627e-06,
"loss": 0.0374,
"step": 671
},
{
"epoch": 4.330923694779116,
"grad_norm": 0.09550745040178299,
"learning_rate": 9.139764633622617e-06,
"loss": 0.0511,
"step": 672
},
{
"epoch": 4.337349397590361,
"grad_norm": 0.1416180431842804,
"learning_rate": 8.965843133716933e-06,
"loss": 0.0739,
"step": 673
},
{
"epoch": 4.343775100401606,
"grad_norm": 0.15338513255119324,
"learning_rate": 8.793514662555946e-06,
"loss": 0.0452,
"step": 674
},
{
"epoch": 4.350200803212852,
"grad_norm": 0.19460880756378174,
"learning_rate": 8.622782235757276e-06,
"loss": 0.0617,
"step": 675
},
{
"epoch": 4.356626506024097,
"grad_norm": 0.14072959125041962,
"learning_rate": 8.453648841009021e-06,
"loss": 0.0435,
"step": 676
},
{
"epoch": 4.363052208835342,
"grad_norm": 0.25675803422927856,
"learning_rate": 8.286117438017337e-06,
"loss": 0.0503,
"step": 677
},
{
"epoch": 4.3694779116465865,
"grad_norm": 0.13942734897136688,
"learning_rate": 8.120190958454843e-06,
"loss": 0.062,
"step": 678
},
{
"epoch": 4.375903614457831,
"grad_norm": 0.13195668160915375,
"learning_rate": 7.955872305909152e-06,
"loss": 0.0479,
"step": 679
},
{
"epoch": 4.382329317269076,
"grad_norm": 0.13269154727458954,
"learning_rate": 7.793164355832127e-06,
"loss": 0.0414,
"step": 680
},
{
"epoch": 4.388755020080321,
"grad_norm": 0.1273636668920517,
"learning_rate": 7.632069955489585e-06,
"loss": 0.0359,
"step": 681
},
{
"epoch": 4.395180722891566,
"grad_norm": 0.14960840344429016,
"learning_rate": 7.472591923911398e-06,
"loss": 0.0374,
"step": 682
},
{
"epoch": 4.401606425702811,
"grad_norm": 0.14672979712486267,
"learning_rate": 7.314733051842282e-06,
"loss": 0.0323,
"step": 683
},
{
"epoch": 4.408032128514057,
"grad_norm": 0.13686661422252655,
"learning_rate": 7.158496101692802e-06,
"loss": 0.0314,
"step": 684
},
{
"epoch": 4.414457831325302,
"grad_norm": 0.15936917066574097,
"learning_rate": 7.003883807491185e-06,
"loss": 0.0493,
"step": 685
},
{
"epoch": 4.4208835341365464,
"grad_norm": 0.16186754405498505,
"learning_rate": 6.85089887483541e-06,
"loss": 0.0337,
"step": 686
},
{
"epoch": 4.427309236947791,
"grad_norm": 0.1702742725610733,
"learning_rate": 6.699543980845801e-06,
"loss": 0.0306,
"step": 687
},
{
"epoch": 4.433734939759036,
"grad_norm": 0.16083656251430511,
"learning_rate": 6.549821774118325e-06,
"loss": 0.0341,
"step": 688
},
{
"epoch": 4.440160642570281,
"grad_norm": 0.11489646136760712,
"learning_rate": 6.401734874678089e-06,
"loss": 0.0221,
"step": 689
},
{
"epoch": 4.446586345381526,
"grad_norm": 0.1623634397983551,
"learning_rate": 6.255285873933569e-06,
"loss": 0.0234,
"step": 690
},
{
"epoch": 4.453012048192771,
"grad_norm": 0.1647336781024933,
"learning_rate": 6.110477334631326e-06,
"loss": 0.0305,
"step": 691
},
{
"epoch": 4.459437751004016,
"grad_norm": 0.13188236951828003,
"learning_rate": 5.967311790811014e-06,
"loss": 0.0194,
"step": 692
},
{
"epoch": 4.4658634538152615,
"grad_norm": 0.17995339632034302,
"learning_rate": 5.825791747761123e-06,
"loss": 0.0368,
"step": 693
},
{
"epoch": 4.472289156626506,
"grad_norm": 0.18333207070827484,
"learning_rate": 5.685919681975149e-06,
"loss": 0.0303,
"step": 694
},
{
"epoch": 4.478714859437751,
"grad_norm": 0.13925676047801971,
"learning_rate": 5.547698041108229e-06,
"loss": 0.0314,
"step": 695
},
{
"epoch": 4.485140562248996,
"grad_norm": 0.10495959222316742,
"learning_rate": 5.4111292439342986e-06,
"loss": 0.027,
"step": 696
},
{
"epoch": 4.491566265060241,
"grad_norm": 0.1200391873717308,
"learning_rate": 5.276215680303831e-06,
"loss": 0.0475,
"step": 697
},
{
"epoch": 4.497991967871486,
"grad_norm": 0.12512274086475372,
"learning_rate": 5.14295971110188e-06,
"loss": 0.0513,
"step": 698
},
{
"epoch": 4.504417670682731,
"grad_norm": 0.11158251017332077,
"learning_rate": 5.011363668206948e-06,
"loss": 0.049,
"step": 699
},
{
"epoch": 4.510843373493976,
"grad_norm": 0.12815481424331665,
"learning_rate": 4.881429854450004e-06,
"loss": 0.0465,
"step": 700
},
{
"epoch": 4.517269076305221,
"grad_norm": 0.1294533908367157,
"learning_rate": 4.753160543574331e-06,
"loss": 0.0445,
"step": 701
},
{
"epoch": 4.523694779116466,
"grad_norm": 0.1252312958240509,
"learning_rate": 4.626557980195623e-06,
"loss": 0.0383,
"step": 702
},
{
"epoch": 4.530120481927711,
"grad_norm": 0.14612750709056854,
"learning_rate": 4.501624379762803e-06,
"loss": 0.0567,
"step": 703
},
{
"epoch": 4.536546184738956,
"grad_norm": 0.11176297813653946,
"learning_rate": 4.3783619285191705e-06,
"loss": 0.0418,
"step": 704
},
{
"epoch": 4.542971887550201,
"grad_norm": 0.13328702747821808,
"learning_rate": 4.2567727834641915e-06,
"loss": 0.0316,
"step": 705
},
{
"epoch": 4.549397590361446,
"grad_norm": 0.11925917118787766,
"learning_rate": 4.136859072315758e-06,
"loss": 0.0403,
"step": 706
},
{
"epoch": 4.555823293172691,
"grad_norm": 0.14322586357593536,
"learning_rate": 4.018622893472912e-06,
"loss": 0.0486,
"step": 707
},
{
"epoch": 4.562248995983936,
"grad_norm": 0.135334774851799,
"learning_rate": 3.902066315979158e-06,
"loss": 0.0358,
"step": 708
},
{
"epoch": 4.5686746987951805,
"grad_norm": 0.14582079648971558,
"learning_rate": 3.787191379486288e-06,
"loss": 0.0322,
"step": 709
},
{
"epoch": 4.575100401606425,
"grad_norm": 0.16357247531414032,
"learning_rate": 3.674000094218577e-06,
"loss": 0.0331,
"step": 710
},
{
"epoch": 4.581526104417671,
"grad_norm": 0.1673492193222046,
"learning_rate": 3.562494440937769e-06,
"loss": 0.0299,
"step": 711
},
{
"epoch": 4.587951807228916,
"grad_norm": 0.14151403307914734,
"learning_rate": 3.4526763709082476e-06,
"loss": 0.0256,
"step": 712
},
{
"epoch": 4.594377510040161,
"grad_norm": 0.20980997383594513,
"learning_rate": 3.344547805862985e-06,
"loss": 0.0342,
"step": 713
},
{
"epoch": 4.600803212851406,
"grad_norm": 0.12801873683929443,
"learning_rate": 3.2381106379699488e-06,
"loss": 0.022,
"step": 714
},
{
"epoch": 4.607228915662651,
"grad_norm": 0.15073615312576294,
"learning_rate": 3.1333667297989035e-06,
"loss": 0.0179,
"step": 715
},
{
"epoch": 4.613654618473896,
"grad_norm": 0.14964726567268372,
"learning_rate": 3.030317914288816e-06,
"loss": 0.022,
"step": 716
},
{
"epoch": 4.6200803212851405,
"grad_norm": 0.11907092481851578,
"learning_rate": 2.928965994715882e-06,
"loss": 0.0199,
"step": 717
},
{
"epoch": 4.626506024096385,
"grad_norm": 0.1797790676355362,
"learning_rate": 2.8293127446618383e-06,
"loss": 0.0241,
"step": 718
},
{
"epoch": 4.63293172690763,
"grad_norm": 0.2542262673377991,
"learning_rate": 2.7313599079830666e-06,
"loss": 0.0415,
"step": 719
},
{
"epoch": 4.639357429718876,
"grad_norm": 0.16320320963859558,
"learning_rate": 2.63510919877995e-06,
"loss": 0.0283,
"step": 720
},
{
"epoch": 4.64578313253012,
"grad_norm": 0.13019272685050964,
"learning_rate": 2.540562301366922e-06,
"loss": 0.0467,
"step": 721
},
{
"epoch": 4.652208835341366,
"grad_norm": 0.09419631212949753,
"learning_rate": 2.447720870243064e-06,
"loss": 0.0517,
"step": 722
},
{
"epoch": 4.658634538152611,
"grad_norm": 0.2463325709104538,
"learning_rate": 2.3565865300630206e-06,
"loss": 0.0399,
"step": 723
},
{
"epoch": 4.6650602409638555,
"grad_norm": 0.14761923253536224,
"learning_rate": 2.267160875608687e-06,
"loss": 0.0643,
"step": 724
},
{
"epoch": 4.6714859437751,
"grad_norm": 0.15416769683361053,
"learning_rate": 2.179445471761221e-06,
"loss": 0.0506,
"step": 725
},
{
"epoch": 4.677911646586345,
"grad_norm": 0.1494145393371582,
"learning_rate": 2.0934418534737098e-06,
"loss": 0.0553,
"step": 726
},
{
"epoch": 4.68433734939759,
"grad_norm": 0.17155231535434723,
"learning_rate": 2.0091515257442904e-06,
"loss": 0.0434,
"step": 727
},
{
"epoch": 4.690763052208835,
"grad_norm": 0.12443775683641434,
"learning_rate": 1.926575963589805e-06,
"loss": 0.0337,
"step": 728
},
{
"epoch": 4.697188755020081,
"grad_norm": 0.11996244639158249,
"learning_rate": 1.8457166120199987e-06,
"loss": 0.0304,
"step": 729
},
{
"epoch": 4.703614457831325,
"grad_norm": 0.17615464329719543,
"learning_rate": 1.7665748860122512e-06,
"loss": 0.054,
"step": 730
},
{
"epoch": 4.710040160642571,
"grad_norm": 0.16451974213123322,
"learning_rate": 1.689152170486752e-06,
"loss": 0.0435,
"step": 731
},
{
"epoch": 4.7164658634538155,
"grad_norm": 0.223603755235672,
"learning_rate": 1.6134498202823645e-06,
"loss": 0.0439,
"step": 732
},
{
"epoch": 4.72289156626506,
"grad_norm": 0.14352591335773468,
"learning_rate": 1.5394691601328338e-06,
"loss": 0.0347,
"step": 733
},
{
"epoch": 4.729317269076305,
"grad_norm": 0.17710843682289124,
"learning_rate": 1.467211484643627e-06,
"loss": 0.0383,
"step": 734
},
{
"epoch": 4.73574297188755,
"grad_norm": 0.20051458477973938,
"learning_rate": 1.3966780582693185e-06,
"loss": 0.056,
"step": 735
},
{
"epoch": 4.742168674698795,
"grad_norm": 0.1505594104528427,
"learning_rate": 1.3278701152913742e-06,
"loss": 0.036,
"step": 736
},
{
"epoch": 4.74859437751004,
"grad_norm": 0.20499320328235626,
"learning_rate": 1.2607888597966688e-06,
"loss": 0.0438,
"step": 737
},
{
"epoch": 4.755020080321285,
"grad_norm": 0.1615976244211197,
"learning_rate": 1.195435465656325e-06,
"loss": 0.0258,
"step": 738
},
{
"epoch": 4.76144578313253,
"grad_norm": 0.13965967297554016,
"learning_rate": 1.131811076505196e-06,
"loss": 0.0195,
"step": 739
},
{
"epoch": 4.767871485943775,
"grad_norm": 0.18875765800476074,
"learning_rate": 1.0699168057218823e-06,
"loss": 0.0339,
"step": 740
},
{
"epoch": 4.77429718875502,
"grad_norm": 0.12984806299209595,
"learning_rate": 1.0097537364091914e-06,
"loss": 0.0274,
"step": 741
},
{
"epoch": 4.780722891566265,
"grad_norm": 0.11120978742837906,
"learning_rate": 9.513229213752417e-07,
"loss": 0.0233,
"step": 742
},
{
"epoch": 4.78714859437751,
"grad_norm": 0.1367001086473465,
"learning_rate": 8.946253831150109e-07,
"loss": 0.0301,
"step": 743
},
{
"epoch": 4.793574297188755,
"grad_norm": 0.11858794093132019,
"learning_rate": 8.396621137924388e-07,
"loss": 0.0244,
"step": 744
},
{
"epoch": 4.8,
"grad_norm": 0.1457306295633316,
"learning_rate": 7.864340752230859e-07,
"loss": 0.0297,
"step": 745
},
{
"epoch": 4.806425702811245,
"grad_norm": 0.133670836687088,
"learning_rate": 7.349421988572691e-07,
"loss": 0.0458,
"step": 746
},
{
"epoch": 4.81285140562249,
"grad_norm": 0.12656152248382568,
"learning_rate": 6.851873857638192e-07,
"loss": 0.0772,
"step": 747
},
{
"epoch": 4.8192771084337345,
"grad_norm": 0.09214270859956741,
"learning_rate": 6.371705066142264e-07,
"loss": 0.0415,
"step": 748
},
{
"epoch": 4.82570281124498,
"grad_norm": 0.1306275725364685,
"learning_rate": 5.908924016674977e-07,
"loss": 0.0608,
"step": 749
},
{
"epoch": 4.832128514056225,
"grad_norm": 0.1301136314868927,
"learning_rate": 5.463538807553903e-07,
"loss": 0.0429,
"step": 750
},
{
"epoch": 4.83855421686747,
"grad_norm": 0.1389213353395462,
"learning_rate": 5.035557232682564e-07,
"loss": 0.0525,
"step": 751
},
{
"epoch": 4.844979919678715,
"grad_norm": 0.15371650457382202,
"learning_rate": 4.624986781414098e-07,
"loss": 0.0382,
"step": 752
},
{
"epoch": 4.85140562248996,
"grad_norm": 0.13239037990570068,
"learning_rate": 4.231834638420362e-07,
"loss": 0.0378,
"step": 753
},
{
"epoch": 4.857831325301205,
"grad_norm": 0.13462603092193604,
"learning_rate": 3.8561076835657017e-07,
"loss": 0.0574,
"step": 754
},
{
"epoch": 4.8642570281124495,
"grad_norm": 0.13441871106624603,
"learning_rate": 3.4978124917871556e-07,
"loss": 0.0383,
"step": 755
},
{
"epoch": 4.870682730923694,
"grad_norm": 0.12943735718727112,
"learning_rate": 3.1569553329788836e-07,
"loss": 0.0372,
"step": 756
},
{
"epoch": 4.877108433734939,
"grad_norm": 0.13817009329795837,
"learning_rate": 2.8335421718829193e-07,
"loss": 0.037,
"step": 757
},
{
"epoch": 4.883534136546185,
"grad_norm": 0.15584351122379303,
"learning_rate": 2.527578667984365e-07,
"loss": 0.057,
"step": 758
},
{
"epoch": 4.88995983935743,
"grad_norm": 0.13548874855041504,
"learning_rate": 2.239070175412694e-07,
"loss": 0.0312,
"step": 759
},
{
"epoch": 4.896385542168675,
"grad_norm": 0.1511905938386917,
"learning_rate": 1.9680217428479364e-07,
"loss": 0.0422,
"step": 760
},
{
"epoch": 4.90281124497992,
"grad_norm": 0.1757678985595703,
"learning_rate": 1.714438113431971e-07,
"loss": 0.0431,
"step": 761
},
{
"epoch": 4.909236947791165,
"grad_norm": 0.17349182069301605,
"learning_rate": 1.4783237246862592e-07,
"loss": 0.0352,
"step": 762
},
{
"epoch": 4.9156626506024095,
"grad_norm": 0.14009258151054382,
"learning_rate": 1.259682708433574e-07,
"loss": 0.0405,
"step": 763
},
{
"epoch": 4.922088353413654,
"grad_norm": 0.1775335967540741,
"learning_rate": 1.0585188907260569e-07,
"loss": 0.04,
"step": 764
},
{
"epoch": 4.928514056224899,
"grad_norm": 0.15646418929100037,
"learning_rate": 8.748357917780503e-08,
"loss": 0.034,
"step": 765
},
{
"epoch": 4.934939759036144,
"grad_norm": 0.1268489807844162,
"learning_rate": 7.086366259044796e-08,
"loss": 0.0199,
"step": 766
},
{
"epoch": 4.94136546184739,
"grad_norm": 0.17802543938159943,
"learning_rate": 5.59924301464898e-08,
"loss": 0.0365,
"step": 767
},
{
"epoch": 4.947791164658635,
"grad_norm": 0.18624529242515564,
"learning_rate": 4.287014208120832e-08,
"loss": 0.0319,
"step": 768
},
{
"epoch": 4.95421686746988,
"grad_norm": 0.1787281632423401,
"learning_rate": 3.149702802470733e-08,
"loss": 0.0216,
"step": 769
},
{
"epoch": 4.9606425702811245,
"grad_norm": 0.19038884341716766,
"learning_rate": 2.1873286997875498e-08,
"loss": 0.0341,
"step": 770
},
{
"epoch": 4.967068273092369,
"grad_norm": 0.15438446402549744,
"learning_rate": 1.3999087408866906e-08,
"loss": 0.0378,
"step": 771
},
{
"epoch": 4.973493975903614,
"grad_norm": 0.14491575956344604,
"learning_rate": 7.874567050214499e-09,
"loss": 0.0398,
"step": 772
},
{
"epoch": 4.979919678714859,
"grad_norm": 0.18205036222934723,
"learning_rate": 3.4998330963764705e-09,
"loss": 0.0426,
"step": 773
},
{
"epoch": 4.986345381526104,
"grad_norm": 0.11920995265245438,
"learning_rate": 8.749621018822041e-10,
"loss": 0.027,
"step": 774
},
{
"epoch": 4.992771084337349,
"grad_norm": 0.15572018921375275,
"learning_rate": 0.0,
"loss": 0.0339,
"step": 775
}
],
"logging_steps": 1,
"max_steps": 775,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3110452392841708e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}