{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 482,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004149377593360996,
"grad_norm": 1.7676318772055255,
"learning_rate": 9.999893795201304e-06,
"loss": 0.3951,
"step": 1
},
{
"epoch": 0.008298755186721992,
"grad_norm": 2.153938510867046,
"learning_rate": 9.999575185316994e-06,
"loss": 0.3506,
"step": 2
},
{
"epoch": 0.012448132780082987,
"grad_norm": 1.0866083282976962,
"learning_rate": 9.999044183882234e-06,
"loss": 0.3212,
"step": 3
},
{
"epoch": 0.016597510373443983,
"grad_norm": 0.931761317507556,
"learning_rate": 9.998300813454981e-06,
"loss": 0.3017,
"step": 4
},
{
"epoch": 0.02074688796680498,
"grad_norm": 1.1753382039425297,
"learning_rate": 9.997345105615042e-06,
"loss": 0.2737,
"step": 5
},
{
"epoch": 0.024896265560165973,
"grad_norm": 1.0040359387178692,
"learning_rate": 9.996177100962714e-06,
"loss": 0.2452,
"step": 6
},
{
"epoch": 0.029045643153526972,
"grad_norm": 0.8719743071167806,
"learning_rate": 9.994796849117082e-06,
"loss": 0.2552,
"step": 7
},
{
"epoch": 0.03319502074688797,
"grad_norm": 0.8365485517467004,
"learning_rate": 9.99320440871389e-06,
"loss": 0.265,
"step": 8
},
{
"epoch": 0.03734439834024896,
"grad_norm": 0.8217986811130152,
"learning_rate": 9.991399847403066e-06,
"loss": 0.2344,
"step": 9
},
{
"epoch": 0.04149377593360996,
"grad_norm": 0.8569379622633062,
"learning_rate": 9.98938324184584e-06,
"loss": 0.2672,
"step": 10
},
{
"epoch": 0.04564315352697095,
"grad_norm": 0.8207803423650186,
"learning_rate": 9.987154677711482e-06,
"loss": 0.2361,
"step": 11
},
{
"epoch": 0.04979253112033195,
"grad_norm": 0.7412707898464974,
"learning_rate": 9.984714249673676e-06,
"loss": 0.2029,
"step": 12
},
{
"epoch": 0.05394190871369295,
"grad_norm": 0.8150005467454026,
"learning_rate": 9.982062061406489e-06,
"loss": 0.2436,
"step": 13
},
{
"epoch": 0.058091286307053944,
"grad_norm": 0.7451018300789697,
"learning_rate": 9.979198225579968e-06,
"loss": 0.2176,
"step": 14
},
{
"epoch": 0.06224066390041494,
"grad_norm": 0.739345283205148,
"learning_rate": 9.976122863855362e-06,
"loss": 0.1967,
"step": 15
},
{
"epoch": 0.06639004149377593,
"grad_norm": 0.7551340094752071,
"learning_rate": 9.972836106879936e-06,
"loss": 0.2224,
"step": 16
},
{
"epoch": 0.07053941908713693,
"grad_norm": 0.7750691720438508,
"learning_rate": 9.969338094281432e-06,
"loss": 0.2258,
"step": 17
},
{
"epoch": 0.07468879668049792,
"grad_norm": 0.8345827422112104,
"learning_rate": 9.965628974662145e-06,
"loss": 0.2352,
"step": 18
},
{
"epoch": 0.07883817427385892,
"grad_norm": 0.8525577544213736,
"learning_rate": 9.961708905592594e-06,
"loss": 0.2671,
"step": 19
},
{
"epoch": 0.08298755186721991,
"grad_norm": 0.8017727688186173,
"learning_rate": 9.957578053604837e-06,
"loss": 0.2564,
"step": 20
},
{
"epoch": 0.08713692946058091,
"grad_norm": 0.7485772111626616,
"learning_rate": 9.953236594185396e-06,
"loss": 0.2095,
"step": 21
},
{
"epoch": 0.0912863070539419,
"grad_norm": 0.7997431251870294,
"learning_rate": 9.9486847117678e-06,
"loss": 0.2425,
"step": 22
},
{
"epoch": 0.0954356846473029,
"grad_norm": 0.7838444722321428,
"learning_rate": 9.943922599724753e-06,
"loss": 0.2413,
"step": 23
},
{
"epoch": 0.0995850622406639,
"grad_norm": 0.7870701867938219,
"learning_rate": 9.938950460359912e-06,
"loss": 0.2038,
"step": 24
},
{
"epoch": 0.1037344398340249,
"grad_norm": 0.7775437643337811,
"learning_rate": 9.933768504899305e-06,
"loss": 0.1907,
"step": 25
},
{
"epoch": 0.1078838174273859,
"grad_norm": 0.7550159366634545,
"learning_rate": 9.928376953482343e-06,
"loss": 0.2451,
"step": 26
},
{
"epoch": 0.11203319502074689,
"grad_norm": 0.7099704073140934,
"learning_rate": 9.922776035152484e-06,
"loss": 0.2072,
"step": 27
},
{
"epoch": 0.11618257261410789,
"grad_norm": 0.7652497801188408,
"learning_rate": 9.916965987847485e-06,
"loss": 0.1993,
"step": 28
},
{
"epoch": 0.12033195020746888,
"grad_norm": 0.73622015581224,
"learning_rate": 9.910947058389309e-06,
"loss": 0.2322,
"step": 29
},
{
"epoch": 0.12448132780082988,
"grad_norm": 0.7332583019411074,
"learning_rate": 9.904719502473635e-06,
"loss": 0.2009,
"step": 30
},
{
"epoch": 0.12863070539419086,
"grad_norm": 0.7930266814325916,
"learning_rate": 9.898283584658988e-06,
"loss": 0.2256,
"step": 31
},
{
"epoch": 0.13278008298755187,
"grad_norm": 0.808737075078291,
"learning_rate": 9.891639578355511e-06,
"loss": 0.2382,
"step": 32
},
{
"epoch": 0.13692946058091288,
"grad_norm": 0.7295161025634769,
"learning_rate": 9.884787765813348e-06,
"loss": 0.1877,
"step": 33
},
{
"epoch": 0.14107883817427386,
"grad_norm": 0.6960480953194169,
"learning_rate": 9.877728438110645e-06,
"loss": 0.177,
"step": 34
},
{
"epoch": 0.14522821576763487,
"grad_norm": 0.7762118501829516,
"learning_rate": 9.870461895141195e-06,
"loss": 0.2099,
"step": 35
},
{
"epoch": 0.14937759336099585,
"grad_norm": 0.727269753206247,
"learning_rate": 9.86298844560169e-06,
"loss": 0.2127,
"step": 36
},
{
"epoch": 0.15352697095435686,
"grad_norm": 0.7138427929408657,
"learning_rate": 9.85530840697861e-06,
"loss": 0.1848,
"step": 37
},
{
"epoch": 0.15767634854771784,
"grad_norm": 0.7259620276040538,
"learning_rate": 9.847422105534739e-06,
"loss": 0.213,
"step": 38
},
{
"epoch": 0.16182572614107885,
"grad_norm": 0.8324644671562974,
"learning_rate": 9.8393298762953e-06,
"loss": 0.2415,
"step": 39
},
{
"epoch": 0.16597510373443983,
"grad_norm": 0.7064960332949763,
"learning_rate": 9.831032063033726e-06,
"loss": 0.1851,
"step": 40
},
{
"epoch": 0.17012448132780084,
"grad_norm": 0.8110898374330895,
"learning_rate": 9.822529018257049e-06,
"loss": 0.2138,
"step": 41
},
{
"epoch": 0.17427385892116182,
"grad_norm": 0.8714749214007722,
"learning_rate": 9.813821103190932e-06,
"loss": 0.2582,
"step": 42
},
{
"epoch": 0.17842323651452283,
"grad_norm": 0.7394441240060803,
"learning_rate": 9.804908687764326e-06,
"loss": 0.2118,
"step": 43
},
{
"epoch": 0.1825726141078838,
"grad_norm": 0.7761112789120154,
"learning_rate": 9.795792150593739e-06,
"loss": 0.2066,
"step": 44
},
{
"epoch": 0.18672199170124482,
"grad_norm": 0.7971762487982779,
"learning_rate": 9.786471878967174e-06,
"loss": 0.1924,
"step": 45
},
{
"epoch": 0.1908713692946058,
"grad_norm": 0.7962572100593608,
"learning_rate": 9.776948268827658e-06,
"loss": 0.2404,
"step": 46
},
{
"epoch": 0.1950207468879668,
"grad_norm": 0.7355219800994598,
"learning_rate": 9.76722172475643e-06,
"loss": 0.1956,
"step": 47
},
{
"epoch": 0.1991701244813278,
"grad_norm": 0.6799008638651439,
"learning_rate": 9.757292659955755e-06,
"loss": 0.1722,
"step": 48
},
{
"epoch": 0.2033195020746888,
"grad_norm": 0.7571593948984352,
"learning_rate": 9.747161496231359e-06,
"loss": 0.1859,
"step": 49
},
{
"epoch": 0.2074688796680498,
"grad_norm": 0.7486776807975455,
"learning_rate": 9.736828663974527e-06,
"loss": 0.1998,
"step": 50
},
{
"epoch": 0.21161825726141079,
"grad_norm": 0.7496759785941373,
"learning_rate": 9.726294602143807e-06,
"loss": 0.1884,
"step": 51
},
{
"epoch": 0.2157676348547718,
"grad_norm": 0.7016230512589497,
"learning_rate": 9.715559758246363e-06,
"loss": 0.1727,
"step": 52
},
{
"epoch": 0.21991701244813278,
"grad_norm": 0.7940013817707481,
"learning_rate": 9.704624588318972e-06,
"loss": 0.2035,
"step": 53
},
{
"epoch": 0.22406639004149378,
"grad_norm": 0.7477198569806254,
"learning_rate": 9.693489556908641e-06,
"loss": 0.2101,
"step": 54
},
{
"epoch": 0.22821576763485477,
"grad_norm": 0.7770709165987761,
"learning_rate": 9.682155137052879e-06,
"loss": 0.1875,
"step": 55
},
{
"epoch": 0.23236514522821577,
"grad_norm": 0.80078178622984,
"learning_rate": 9.670621810259596e-06,
"loss": 0.1913,
"step": 56
},
{
"epoch": 0.23651452282157676,
"grad_norm": 0.7271062637038916,
"learning_rate": 9.658890066486651e-06,
"loss": 0.1825,
"step": 57
},
{
"epoch": 0.24066390041493776,
"grad_norm": 0.7219480380769352,
"learning_rate": 9.646960404121042e-06,
"loss": 0.1582,
"step": 58
},
{
"epoch": 0.24481327800829875,
"grad_norm": 0.7262914870173502,
"learning_rate": 9.634833329957722e-06,
"loss": 0.1807,
"step": 59
},
{
"epoch": 0.24896265560165975,
"grad_norm": 0.7622053149811464,
"learning_rate": 9.62250935917808e-06,
"loss": 0.2081,
"step": 60
},
{
"epoch": 0.25311203319502074,
"grad_norm": 0.7628500677504657,
"learning_rate": 9.609989015328052e-06,
"loss": 0.2054,
"step": 61
},
{
"epoch": 0.2572614107883817,
"grad_norm": 0.7617360221525938,
"learning_rate": 9.597272830295877e-06,
"loss": 0.2029,
"step": 62
},
{
"epoch": 0.26141078838174275,
"grad_norm": 0.7214154973719157,
"learning_rate": 9.584361344289499e-06,
"loss": 0.1841,
"step": 63
},
{
"epoch": 0.26556016597510373,
"grad_norm": 0.6955736876069925,
"learning_rate": 9.571255105813632e-06,
"loss": 0.1805,
"step": 64
},
{
"epoch": 0.2697095435684647,
"grad_norm": 0.7663861454386831,
"learning_rate": 9.55795467164644e-06,
"loss": 0.2095,
"step": 65
},
{
"epoch": 0.27385892116182575,
"grad_norm": 0.6939193958655497,
"learning_rate": 9.544460606815901e-06,
"loss": 0.1609,
"step": 66
},
{
"epoch": 0.27800829875518673,
"grad_norm": 0.7629821736909411,
"learning_rate": 9.530773484575785e-06,
"loss": 0.1889,
"step": 67
},
{
"epoch": 0.2821576763485477,
"grad_norm": 0.7512315651425395,
"learning_rate": 9.516893886381324e-06,
"loss": 0.2023,
"step": 68
},
{
"epoch": 0.2863070539419087,
"grad_norm": 0.8079916042578716,
"learning_rate": 9.502822401864484e-06,
"loss": 0.2102,
"step": 69
},
{
"epoch": 0.29045643153526973,
"grad_norm": 0.723931061320125,
"learning_rate": 9.488559628808939e-06,
"loss": 0.1768,
"step": 70
},
{
"epoch": 0.2946058091286307,
"grad_norm": 0.7220408990609748,
"learning_rate": 9.474106173124667e-06,
"loss": 0.1765,
"step": 71
},
{
"epoch": 0.2987551867219917,
"grad_norm": 0.7774704324723622,
"learning_rate": 9.459462648822209e-06,
"loss": 0.217,
"step": 72
},
{
"epoch": 0.3029045643153527,
"grad_norm": 0.7443159710540501,
"learning_rate": 9.444629677986583e-06,
"loss": 0.1833,
"step": 73
},
{
"epoch": 0.3070539419087137,
"grad_norm": 0.7696478423628316,
"learning_rate": 9.429607890750863e-06,
"loss": 0.1859,
"step": 74
},
{
"epoch": 0.3112033195020747,
"grad_norm": 0.7735940287455797,
"learning_rate": 9.414397925269402e-06,
"loss": 0.1849,
"step": 75
},
{
"epoch": 0.3153526970954357,
"grad_norm": 0.7205860014852195,
"learning_rate": 9.399000427690736e-06,
"loss": 0.1644,
"step": 76
},
{
"epoch": 0.31950207468879666,
"grad_norm": 0.7433776756498811,
"learning_rate": 9.38341605213011e-06,
"loss": 0.1764,
"step": 77
},
{
"epoch": 0.3236514522821577,
"grad_norm": 0.7653711443173772,
"learning_rate": 9.367645460641716e-06,
"loss": 0.1642,
"step": 78
},
{
"epoch": 0.3278008298755187,
"grad_norm": 0.7603265566217704,
"learning_rate": 9.35168932319055e-06,
"loss": 0.1851,
"step": 79
},
{
"epoch": 0.33195020746887965,
"grad_norm": 0.780586497908624,
"learning_rate": 9.335548317623957e-06,
"loss": 0.1659,
"step": 80
},
{
"epoch": 0.3360995850622407,
"grad_norm": 0.759565238100644,
"learning_rate": 9.31922312964284e-06,
"loss": 0.1829,
"step": 81
},
{
"epoch": 0.34024896265560167,
"grad_norm": 0.8911678916994763,
"learning_rate": 9.302714452772515e-06,
"loss": 0.2233,
"step": 82
},
{
"epoch": 0.34439834024896265,
"grad_norm": 0.7873276205486429,
"learning_rate": 9.286022988333268e-06,
"loss": 0.1826,
"step": 83
},
{
"epoch": 0.34854771784232363,
"grad_norm": 0.7873887244154294,
"learning_rate": 9.269149445410545e-06,
"loss": 0.1912,
"step": 84
},
{
"epoch": 0.35269709543568467,
"grad_norm": 0.7572742935281274,
"learning_rate": 9.252094540824839e-06,
"loss": 0.1884,
"step": 85
},
{
"epoch": 0.35684647302904565,
"grad_norm": 0.7729250135877875,
"learning_rate": 9.234858999101232e-06,
"loss": 0.2045,
"step": 86
},
{
"epoch": 0.36099585062240663,
"grad_norm": 0.7492339910177302,
"learning_rate": 9.21744355243862e-06,
"loss": 0.1713,
"step": 87
},
{
"epoch": 0.3651452282157676,
"grad_norm": 0.7279318208551071,
"learning_rate": 9.199848940678607e-06,
"loss": 0.1526,
"step": 88
},
{
"epoch": 0.36929460580912865,
"grad_norm": 0.7701397202545115,
"learning_rate": 9.18207591127407e-06,
"loss": 0.1839,
"step": 89
},
{
"epoch": 0.37344398340248963,
"grad_norm": 0.7734955369159264,
"learning_rate": 9.164125219257419e-06,
"loss": 0.1936,
"step": 90
},
{
"epoch": 0.3775933609958506,
"grad_norm": 0.6984959494367776,
"learning_rate": 9.1459976272085e-06,
"loss": 0.1535,
"step": 91
},
{
"epoch": 0.3817427385892116,
"grad_norm": 0.7846836912164711,
"learning_rate": 9.127693905222223e-06,
"loss": 0.1869,
"step": 92
},
{
"epoch": 0.38589211618257263,
"grad_norm": 0.760736222391232,
"learning_rate": 9.10921483087583e-06,
"loss": 0.1834,
"step": 93
},
{
"epoch": 0.3900414937759336,
"grad_norm": 0.7700341024971602,
"learning_rate": 9.09056118919587e-06,
"loss": 0.191,
"step": 94
},
{
"epoch": 0.3941908713692946,
"grad_norm": 0.7098525396416538,
"learning_rate": 9.071733772624847e-06,
"loss": 0.1615,
"step": 95
},
{
"epoch": 0.3983402489626556,
"grad_norm": 0.836504808011679,
"learning_rate": 9.052733380987555e-06,
"loss": 0.1996,
"step": 96
},
{
"epoch": 0.4024896265560166,
"grad_norm": 0.7809457430201482,
"learning_rate": 9.033560821457102e-06,
"loss": 0.2037,
"step": 97
},
{
"epoch": 0.4066390041493776,
"grad_norm": 0.7131874504599527,
"learning_rate": 9.014216908520619e-06,
"loss": 0.1567,
"step": 98
},
{
"epoch": 0.4107883817427386,
"grad_norm": 0.7241086621508711,
"learning_rate": 8.994702463944657e-06,
"loss": 0.1762,
"step": 99
},
{
"epoch": 0.4149377593360996,
"grad_norm": 0.7470270273910699,
"learning_rate": 8.975018316740278e-06,
"loss": 0.189,
"step": 100
},
{
"epoch": 0.4190871369294606,
"grad_norm": 0.7956250924167121,
"learning_rate": 8.955165303127841e-06,
"loss": 0.2145,
"step": 101
},
{
"epoch": 0.42323651452282157,
"grad_norm": 0.7610007103879098,
"learning_rate": 8.93514426650147e-06,
"loss": 0.1784,
"step": 102
},
{
"epoch": 0.42738589211618255,
"grad_norm": 0.842316502916225,
"learning_rate": 8.914956057393231e-06,
"loss": 0.2031,
"step": 103
},
{
"epoch": 0.4315352697095436,
"grad_norm": 0.7936898870281893,
"learning_rate": 8.894601533437e-06,
"loss": 0.2046,
"step": 104
},
{
"epoch": 0.43568464730290457,
"grad_norm": 0.7265768890674549,
"learning_rate": 8.87408155933202e-06,
"loss": 0.182,
"step": 105
},
{
"epoch": 0.43983402489626555,
"grad_norm": 0.6526179900015049,
"learning_rate": 8.853397006806183e-06,
"loss": 0.1373,
"step": 106
},
{
"epoch": 0.44398340248962653,
"grad_norm": 0.7081605756028013,
"learning_rate": 8.832548754578981e-06,
"loss": 0.1619,
"step": 107
},
{
"epoch": 0.44813278008298757,
"grad_norm": 0.7795529520759518,
"learning_rate": 8.811537688324187e-06,
"loss": 0.1699,
"step": 108
},
{
"epoch": 0.45228215767634855,
"grad_norm": 0.6863306957159958,
"learning_rate": 8.79036470063223e-06,
"loss": 0.1599,
"step": 109
},
{
"epoch": 0.45643153526970953,
"grad_norm": 0.7392254415434374,
"learning_rate": 8.769030690972262e-06,
"loss": 0.1749,
"step": 110
},
{
"epoch": 0.4605809128630705,
"grad_norm": 0.7406860180908847,
"learning_rate": 8.747536565653966e-06,
"loss": 0.1625,
"step": 111
},
{
"epoch": 0.46473029045643155,
"grad_norm": 0.7256893779396758,
"learning_rate": 8.725883237789046e-06,
"loss": 0.156,
"step": 112
},
{
"epoch": 0.46887966804979253,
"grad_norm": 0.7936161492458803,
"learning_rate": 8.704071627252428e-06,
"loss": 0.1927,
"step": 113
},
{
"epoch": 0.4730290456431535,
"grad_norm": 0.7921344874492351,
"learning_rate": 8.682102660643196e-06,
"loss": 0.1902,
"step": 114
},
{
"epoch": 0.47717842323651455,
"grad_norm": 0.7661741406638606,
"learning_rate": 8.659977271245224e-06,
"loss": 0.2017,
"step": 115
},
{
"epoch": 0.48132780082987553,
"grad_norm": 0.717807546570814,
"learning_rate": 8.637696398987517e-06,
"loss": 0.179,
"step": 116
},
{
"epoch": 0.4854771784232365,
"grad_norm": 0.7450855678577503,
"learning_rate": 8.615260990404301e-06,
"loss": 0.1812,
"step": 117
},
{
"epoch": 0.4896265560165975,
"grad_norm": 0.7841540320532201,
"learning_rate": 8.592671998594794e-06,
"loss": 0.2024,
"step": 118
},
{
"epoch": 0.49377593360995853,
"grad_norm": 0.8347965727378909,
"learning_rate": 8.56993038318273e-06,
"loss": 0.1847,
"step": 119
},
{
"epoch": 0.4979253112033195,
"grad_norm": 0.7582917342684473,
"learning_rate": 8.54703711027558e-06,
"loss": 0.175,
"step": 120
},
{
"epoch": 0.5020746887966805,
"grad_norm": 0.7290057002386009,
"learning_rate": 8.523993152423522e-06,
"loss": 0.1647,
"step": 121
},
{
"epoch": 0.5062240663900415,
"grad_norm": 0.6809327899218293,
"learning_rate": 8.50079948857812e-06,
"loss": 0.1427,
"step": 122
},
{
"epoch": 0.5103734439834025,
"grad_norm": 0.6957572538516275,
"learning_rate": 8.477457104050732e-06,
"loss": 0.1536,
"step": 123
},
{
"epoch": 0.5145228215767634,
"grad_norm": 0.7497563490013779,
"learning_rate": 8.453966990470656e-06,
"loss": 0.1845,
"step": 124
},
{
"epoch": 0.5186721991701245,
"grad_norm": 0.7131343540595416,
"learning_rate": 8.430330145743011e-06,
"loss": 0.1656,
"step": 125
},
{
"epoch": 0.5228215767634855,
"grad_norm": 0.7355273724695276,
"learning_rate": 8.406547574006326e-06,
"loss": 0.1757,
"step": 126
},
{
"epoch": 0.5269709543568465,
"grad_norm": 0.707388890545072,
"learning_rate": 8.3826202855899e-06,
"loss": 0.1645,
"step": 127
},
{
"epoch": 0.5311203319502075,
"grad_norm": 0.7699859449989024,
"learning_rate": 8.358549296970877e-06,
"loss": 0.1908,
"step": 128
},
{
"epoch": 0.5352697095435685,
"grad_norm": 0.7766679325618092,
"learning_rate": 8.334335630731051e-06,
"loss": 0.1697,
"step": 129
},
{
"epoch": 0.5394190871369294,
"grad_norm": 0.8240433568844907,
"learning_rate": 8.309980315513444e-06,
"loss": 0.2141,
"step": 130
},
{
"epoch": 0.5435684647302904,
"grad_norm": 0.7228340520917644,
"learning_rate": 8.285484385978598e-06,
"loss": 0.1642,
"step": 131
},
{
"epoch": 0.5477178423236515,
"grad_norm": 0.7554222129553129,
"learning_rate": 8.260848882760616e-06,
"loss": 0.1785,
"step": 132
},
{
"epoch": 0.5518672199170125,
"grad_norm": 0.7166236882688062,
"learning_rate": 8.236074852422965e-06,
"loss": 0.1687,
"step": 133
},
{
"epoch": 0.5560165975103735,
"grad_norm": 0.7548715208775477,
"learning_rate": 8.211163347414005e-06,
"loss": 0.1818,
"step": 134
},
{
"epoch": 0.5601659751037344,
"grad_norm": 0.7609580580621476,
"learning_rate": 8.186115426022286e-06,
"loss": 0.1726,
"step": 135
},
{
"epoch": 0.5643153526970954,
"grad_norm": 0.7081999400916217,
"learning_rate": 8.160932152331587e-06,
"loss": 0.1569,
"step": 136
},
{
"epoch": 0.5684647302904564,
"grad_norm": 0.7380667317946897,
"learning_rate": 8.135614596175714e-06,
"loss": 0.1512,
"step": 137
},
{
"epoch": 0.5726141078838174,
"grad_norm": 0.7462628971624624,
"learning_rate": 8.11016383309305e-06,
"loss": 0.1408,
"step": 138
},
{
"epoch": 0.5767634854771784,
"grad_norm": 0.7686213864033702,
"learning_rate": 8.084580944280862e-06,
"loss": 0.1816,
"step": 139
},
{
"epoch": 0.5809128630705395,
"grad_norm": 0.7420483653450449,
"learning_rate": 8.058867016549372e-06,
"loss": 0.1791,
"step": 140
},
{
"epoch": 0.5850622406639004,
"grad_norm": 0.71264089436099,
"learning_rate": 8.03302314227559e-06,
"loss": 0.1439,
"step": 141
},
{
"epoch": 0.5892116182572614,
"grad_norm": 0.7586489110810907,
"learning_rate": 8.007050419356898e-06,
"loss": 0.1909,
"step": 142
},
{
"epoch": 0.5933609958506224,
"grad_norm": 0.8567775411352899,
"learning_rate": 7.980949951164422e-06,
"loss": 0.2122,
"step": 143
},
{
"epoch": 0.5975103734439834,
"grad_norm": 0.7346565956077903,
"learning_rate": 7.95472284649615e-06,
"loss": 0.1516,
"step": 144
},
{
"epoch": 0.6016597510373444,
"grad_norm": 0.7432495374421877,
"learning_rate": 7.92837021952983e-06,
"loss": 0.1554,
"step": 145
},
{
"epoch": 0.6058091286307054,
"grad_norm": 0.7799968417933343,
"learning_rate": 7.90189318977564e-06,
"loss": 0.187,
"step": 146
},
{
"epoch": 0.6099585062240664,
"grad_norm": 0.7494236481968739,
"learning_rate": 7.875292882028624e-06,
"loss": 0.1837,
"step": 147
},
{
"epoch": 0.6141078838174274,
"grad_norm": 0.7474431038051979,
"learning_rate": 7.848570426320918e-06,
"loss": 0.1643,
"step": 148
},
{
"epoch": 0.6182572614107884,
"grad_norm": 0.8170730043512447,
"learning_rate": 7.821726957873728e-06,
"loss": 0.1624,
"step": 149
},
{
"epoch": 0.6224066390041494,
"grad_norm": 0.7695549295122858,
"learning_rate": 7.794763617049124e-06,
"loss": 0.1728,
"step": 150
},
{
"epoch": 0.6265560165975104,
"grad_norm": 0.786153361885955,
"learning_rate": 7.767681549301576e-06,
"loss": 0.198,
"step": 151
},
{
"epoch": 0.6307053941908713,
"grad_norm": 0.7293084463135483,
"learning_rate": 7.740481905129307e-06,
"loss": 0.158,
"step": 152
},
{
"epoch": 0.6348547717842323,
"grad_norm": 0.7976980758945141,
"learning_rate": 7.713165840025412e-06,
"loss": 0.1972,
"step": 153
},
{
"epoch": 0.6390041493775933,
"grad_norm": 0.6910969018398649,
"learning_rate": 7.685734514428767e-06,
"loss": 0.1562,
"step": 154
},
{
"epoch": 0.6431535269709544,
"grad_norm": 0.7012920246533267,
"learning_rate": 7.658189093674738e-06,
"loss": 0.1578,
"step": 155
},
{
"epoch": 0.6473029045643154,
"grad_norm": 0.7301136191105242,
"learning_rate": 7.630530747945672e-06,
"loss": 0.1862,
"step": 156
},
{
"epoch": 0.6514522821576764,
"grad_norm": 0.7861556907392464,
"learning_rate": 7.6027606522211835e-06,
"loss": 0.1733,
"step": 157
},
{
"epoch": 0.6556016597510373,
"grad_norm": 0.7157404027922296,
"learning_rate": 7.574879986228245e-06,
"loss": 0.1554,
"step": 158
},
{
"epoch": 0.6597510373443983,
"grad_norm": 0.7821110107372908,
"learning_rate": 7.546889934391065e-06,
"loss": 0.2143,
"step": 159
},
{
"epoch": 0.6639004149377593,
"grad_norm": 0.7268702392383997,
"learning_rate": 7.518791685780769e-06,
"loss": 0.1551,
"step": 160
},
{
"epoch": 0.6680497925311203,
"grad_norm": 0.7543523627446657,
"learning_rate": 7.490586434064893e-06,
"loss": 0.1659,
"step": 161
},
{
"epoch": 0.6721991701244814,
"grad_norm": 0.8320460591998305,
"learning_rate": 7.462275377456671e-06,
"loss": 0.2111,
"step": 162
},
{
"epoch": 0.6763485477178424,
"grad_norm": 0.6854667430244615,
"learning_rate": 7.433859718664127e-06,
"loss": 0.1477,
"step": 163
},
{
"epoch": 0.6804979253112033,
"grad_norm": 0.7395821155536143,
"learning_rate": 7.405340664838994e-06,
"loss": 0.1868,
"step": 164
},
{
"epoch": 0.6846473029045643,
"grad_norm": 0.8119883130258085,
"learning_rate": 7.376719427525415e-06,
"loss": 0.1955,
"step": 165
},
{
"epoch": 0.6887966804979253,
"grad_norm": 0.748708444154493,
"learning_rate": 7.3479972226084925e-06,
"loss": 0.1574,
"step": 166
},
{
"epoch": 0.6929460580912863,
"grad_norm": 0.7541707172275884,
"learning_rate": 7.319175270262624e-06,
"loss": 0.1571,
"step": 167
},
{
"epoch": 0.6970954356846473,
"grad_norm": 0.7901522781346901,
"learning_rate": 7.290254794899665e-06,
"loss": 0.1557,
"step": 168
},
{
"epoch": 0.7012448132780082,
"grad_norm": 0.7618677553737644,
"learning_rate": 7.261237025116923e-06,
"loss": 0.1682,
"step": 169
},
{
"epoch": 0.7053941908713693,
"grad_norm": 0.7642886029206197,
"learning_rate": 7.232123193644957e-06,
"loss": 0.1498,
"step": 170
},
{
"epoch": 0.7095435684647303,
"grad_norm": 0.7791230848104441,
"learning_rate": 7.202914537295211e-06,
"loss": 0.1617,
"step": 171
},
{
"epoch": 0.7136929460580913,
"grad_norm": 0.7393488540214589,
"learning_rate": 7.173612296907473e-06,
"loss": 0.1613,
"step": 172
},
{
"epoch": 0.7178423236514523,
"grad_norm": 0.7038006522246097,
"learning_rate": 7.1442177172971586e-06,
"loss": 0.1401,
"step": 173
},
{
"epoch": 0.7219917012448133,
"grad_norm": 0.7437385440346528,
"learning_rate": 7.114732047202433e-06,
"loss": 0.1822,
"step": 174
},
{
"epoch": 0.7261410788381742,
"grad_norm": 0.7217780558130102,
"learning_rate": 7.085156539231159e-06,
"loss": 0.1639,
"step": 175
},
{
"epoch": 0.7302904564315352,
"grad_norm": 0.7780272872916683,
"learning_rate": 7.055492449807684e-06,
"loss": 0.1909,
"step": 176
},
{
"epoch": 0.7344398340248963,
"grad_norm": 0.7011610009172605,
"learning_rate": 7.025741039119466e-06,
"loss": 0.1504,
"step": 177
},
{
"epoch": 0.7385892116182573,
"grad_norm": 0.6952695444986241,
"learning_rate": 6.995903571063541e-06,
"loss": 0.1367,
"step": 178
},
{
"epoch": 0.7427385892116183,
"grad_norm": 0.7599801857515818,
"learning_rate": 6.96598131319282e-06,
"loss": 0.1677,
"step": 179
},
{
"epoch": 0.7468879668049793,
"grad_norm": 0.7522191275629795,
"learning_rate": 6.935975536662254e-06,
"loss": 0.1851,
"step": 180
},
{
"epoch": 0.7510373443983402,
"grad_norm": 0.7755271260892246,
"learning_rate": 6.905887516174827e-06,
"loss": 0.1673,
"step": 181
},
{
"epoch": 0.7551867219917012,
"grad_norm": 0.7029538367008582,
"learning_rate": 6.875718529927404e-06,
"loss": 0.1573,
"step": 182
},
{
"epoch": 0.7593360995850622,
"grad_norm": 0.7427208404607889,
"learning_rate": 6.845469859556426e-06,
"loss": 0.1629,
"step": 183
},
{
"epoch": 0.7634854771784232,
"grad_norm": 0.7413786757065406,
"learning_rate": 6.815142790083473e-06,
"loss": 0.1608,
"step": 184
},
{
"epoch": 0.7676348547717843,
"grad_norm": 0.7331757595224525,
"learning_rate": 6.784738609860668e-06,
"loss": 0.1611,
"step": 185
},
{
"epoch": 0.7717842323651453,
"grad_norm": 0.7585384853604383,
"learning_rate": 6.754258610515949e-06,
"loss": 0.1908,
"step": 186
},
{
"epoch": 0.7759336099585062,
"grad_norm": 0.7407145482448325,
"learning_rate": 6.723704086898193e-06,
"loss": 0.1836,
"step": 187
},
{
"epoch": 0.7800829875518672,
"grad_norm": 0.7045533572567579,
"learning_rate": 6.6930763370222104e-06,
"loss": 0.1713,
"step": 188
},
{
"epoch": 0.7842323651452282,
"grad_norm": 0.7796381186959462,
"learning_rate": 6.662376662013609e-06,
"loss": 0.1878,
"step": 189
},
{
"epoch": 0.7883817427385892,
"grad_norm": 0.7294399552388171,
"learning_rate": 6.631606366053507e-06,
"loss": 0.1706,
"step": 190
},
{
"epoch": 0.7925311203319502,
"grad_norm": 0.7242566299823047,
"learning_rate": 6.60076675632314e-06,
"loss": 0.1656,
"step": 191
},
{
"epoch": 0.7966804979253111,
"grad_norm": 0.734716432035308,
"learning_rate": 6.5698591429483286e-06,
"loss": 0.1813,
"step": 192
},
{
"epoch": 0.8008298755186722,
"grad_norm": 0.7243228793796995,
"learning_rate": 6.5388848389438095e-06,
"loss": 0.167,
"step": 193
},
{
"epoch": 0.8049792531120332,
"grad_norm": 0.7146769575254076,
"learning_rate": 6.507845160157476e-06,
"loss": 0.1786,
"step": 194
},
{
"epoch": 0.8091286307053942,
"grad_norm": 0.7000888246853346,
"learning_rate": 6.476741425214464e-06,
"loss": 0.1598,
"step": 195
},
{
"epoch": 0.8132780082987552,
"grad_norm": 0.7091228748093505,
"learning_rate": 6.445574955461134e-06,
"loss": 0.1555,
"step": 196
},
{
"epoch": 0.8174273858921162,
"grad_norm": 0.7475590867202404,
"learning_rate": 6.414347074908944e-06,
"loss": 0.1816,
"step": 197
},
{
"epoch": 0.8215767634854771,
"grad_norm": 0.6648744157507708,
"learning_rate": 6.383059110178205e-06,
"loss": 0.125,
"step": 198
},
{
"epoch": 0.8257261410788381,
"grad_norm": 0.7046497121459463,
"learning_rate": 6.35171239044171e-06,
"loss": 0.1671,
"step": 199
},
{
"epoch": 0.8298755186721992,
"grad_norm": 0.7338668870220733,
"learning_rate": 6.320308247368285e-06,
"loss": 0.1792,
"step": 200
},
{
"epoch": 0.8298755186721992,
"eval_loss": 0.18507465720176697,
"eval_runtime": 1.4909,
"eval_samples_per_second": 13.415,
"eval_steps_per_second": 3.354,
"step": 200
},
{
"epoch": 0.8340248962655602,
"grad_norm": 0.8256796055544355,
"learning_rate": 6.288848015066211e-06,
"loss": 0.1791,
"step": 201
},
{
"epoch": 0.8381742738589212,
"grad_norm": 0.7814594735009075,
"learning_rate": 6.2573330300265375e-06,
"loss": 0.2016,
"step": 202
},
{
"epoch": 0.8423236514522822,
"grad_norm": 0.7582397861722057,
"learning_rate": 6.225764631066326e-06,
"loss": 0.1595,
"step": 203
},
{
"epoch": 0.8464730290456431,
"grad_norm": 0.7329889520402166,
"learning_rate": 6.1941441592717564e-06,
"loss": 0.1805,
"step": 204
},
{
"epoch": 0.8506224066390041,
"grad_norm": 0.7159204670536288,
"learning_rate": 6.162472957941167e-06,
"loss": 0.1628,
"step": 205
},
{
"epoch": 0.8547717842323651,
"grad_norm": 0.7450560042081226,
"learning_rate": 6.130752372527981e-06,
"loss": 0.1625,
"step": 206
},
{
"epoch": 0.8589211618257261,
"grad_norm": 0.7276052754485396,
"learning_rate": 6.098983750583556e-06,
"loss": 0.1699,
"step": 207
},
{
"epoch": 0.8630705394190872,
"grad_norm": 0.7080982220713725,
"learning_rate": 6.067168441699927e-06,
"loss": 0.1662,
"step": 208
},
{
"epoch": 0.8672199170124482,
"grad_norm": 0.7202598720219618,
"learning_rate": 6.035307797452489e-06,
"loss": 0.1405,
"step": 209
},
{
"epoch": 0.8713692946058091,
"grad_norm": 0.7491056971873838,
"learning_rate": 6.0034031713425636e-06,
"loss": 0.1754,
"step": 210
},
{
"epoch": 0.8755186721991701,
"grad_norm": 0.7441136532460226,
"learning_rate": 5.9714559187399094e-06,
"loss": 0.162,
"step": 211
},
{
"epoch": 0.8796680497925311,
"grad_norm": 0.8207260119939723,
"learning_rate": 5.939467396825137e-06,
"loss": 0.1698,
"step": 212
},
{
"epoch": 0.8838174273858921,
"grad_norm": 0.6868519061874842,
"learning_rate": 5.907438964532059e-06,
"loss": 0.1379,
"step": 213
},
{
"epoch": 0.8879668049792531,
"grad_norm": 0.7956217383121814,
"learning_rate": 5.875371982489959e-06,
"loss": 0.163,
"step": 214
},
{
"epoch": 0.8921161825726142,
"grad_norm": 0.7394417902907329,
"learning_rate": 5.843267812965783e-06,
"loss": 0.1655,
"step": 215
},
{
"epoch": 0.8962655601659751,
"grad_norm": 0.7319044411180587,
"learning_rate": 5.811127819806277e-06,
"loss": 0.1684,
"step": 216
},
{
"epoch": 0.9004149377593361,
"grad_norm": 0.6996596721883562,
"learning_rate": 5.7789533683800445e-06,
"loss": 0.1467,
"step": 217
},
{
"epoch": 0.9045643153526971,
"grad_norm": 0.7320178544302903,
"learning_rate": 5.746745825519539e-06,
"loss": 0.1552,
"step": 218
},
{
"epoch": 0.9087136929460581,
"grad_norm": 0.724383373673059,
"learning_rate": 5.714506559463001e-06,
"loss": 0.1405,
"step": 219
},
{
"epoch": 0.9128630705394191,
"grad_norm": 0.6677748535395426,
"learning_rate": 5.682236939796337e-06,
"loss": 0.1512,
"step": 220
},
{
"epoch": 0.91701244813278,
"grad_norm": 0.7845934322337835,
"learning_rate": 5.649938337394932e-06,
"loss": 0.1859,
"step": 221
},
{
"epoch": 0.921161825726141,
"grad_norm": 0.7529205324420154,
"learning_rate": 5.617612124365411e-06,
"loss": 0.1686,
"step": 222
},
{
"epoch": 0.9253112033195021,
"grad_norm": 0.7034404844878496,
"learning_rate": 5.585259673987352e-06,
"loss": 0.1473,
"step": 223
},
{
"epoch": 0.9294605809128631,
"grad_norm": 0.716275540783631,
"learning_rate": 5.55288236065495e-06,
"loss": 0.1532,
"step": 224
},
{
"epoch": 0.9336099585062241,
"grad_norm": 0.7270308402949697,
"learning_rate": 5.52048155981862e-06,
"loss": 0.1597,
"step": 225
},
{
"epoch": 0.9377593360995851,
"grad_norm": 0.7001090415882342,
"learning_rate": 5.4880586479265774e-06,
"loss": 0.1754,
"step": 226
},
{
"epoch": 0.941908713692946,
"grad_norm": 0.7711476173055917,
"learning_rate": 5.455615002366351e-06,
"loss": 0.1625,
"step": 227
},
{
"epoch": 0.946058091286307,
"grad_norm": 0.7545759247777888,
"learning_rate": 5.423152001406282e-06,
"loss": 0.1687,
"step": 228
},
{
"epoch": 0.950207468879668,
"grad_norm": 0.7283465494781498,
"learning_rate": 5.390671024136961e-06,
"loss": 0.139,
"step": 229
},
{
"epoch": 0.9543568464730291,
"grad_norm": 0.7108090972367541,
"learning_rate": 5.358173450412649e-06,
"loss": 0.1476,
"step": 230
},
{
"epoch": 0.9585062240663901,
"grad_norm": 0.758784013083764,
"learning_rate": 5.325660660792657e-06,
"loss": 0.1384,
"step": 231
},
{
"epoch": 0.9626556016597511,
"grad_norm": 0.75965590058487,
"learning_rate": 5.293134036482697e-06,
"loss": 0.1705,
"step": 232
},
{
"epoch": 0.966804979253112,
"grad_norm": 0.7382105422676851,
"learning_rate": 5.260594959276203e-06,
"loss": 0.1674,
"step": 233
},
{
"epoch": 0.970954356846473,
"grad_norm": 0.7769590928701867,
"learning_rate": 5.228044811495632e-06,
"loss": 0.1846,
"step": 234
},
{
"epoch": 0.975103734439834,
"grad_norm": 0.6779927231647213,
"learning_rate": 5.195484975933741e-06,
"loss": 0.1286,
"step": 235
},
{
"epoch": 0.979253112033195,
"grad_norm": 0.7308532662700662,
"learning_rate": 5.162916835794843e-06,
"loss": 0.1555,
"step": 236
},
{
"epoch": 0.983402489626556,
"grad_norm": 0.7775184861306178,
"learning_rate": 5.1303417746360455e-06,
"loss": 0.1417,
"step": 237
},
{
"epoch": 0.9875518672199171,
"grad_norm": 0.8164386408608373,
"learning_rate": 5.097761176308471e-06,
"loss": 0.228,
"step": 238
},
{
"epoch": 0.991701244813278,
"grad_norm": 0.6949846700263419,
"learning_rate": 5.0651764248984794e-06,
"loss": 0.1549,
"step": 239
},
{
"epoch": 0.995850622406639,
"grad_norm": 0.7417924630808657,
"learning_rate": 5.032588904668851e-06,
"loss": 0.1693,
"step": 240
},
{
"epoch": 1.0,
"grad_norm": 0.7359282721788815,
"learning_rate": 5e-06,
"loss": 0.1539,
"step": 241
},
{
"epoch": 1.004149377593361,
"grad_norm": 0.7832307294102421,
"learning_rate": 4.967411095331149e-06,
"loss": 0.0961,
"step": 242
},
{
"epoch": 1.008298755186722,
"grad_norm": 0.8163715493753602,
"learning_rate": 4.934823575101523e-06,
"loss": 0.1047,
"step": 243
},
{
"epoch": 1.012448132780083,
"grad_norm": 0.667448725807847,
"learning_rate": 4.9022388236915306e-06,
"loss": 0.0859,
"step": 244
},
{
"epoch": 1.016597510373444,
"grad_norm": 0.6569623280552557,
"learning_rate": 4.869658225363957e-06,
"loss": 0.0848,
"step": 245
},
{
"epoch": 1.020746887966805,
"grad_norm": 0.6630688357796415,
"learning_rate": 4.837083164205159e-06,
"loss": 0.0776,
"step": 246
},
{
"epoch": 1.0248962655601659,
"grad_norm": 0.6783077300452073,
"learning_rate": 4.8045150240662615e-06,
"loss": 0.0884,
"step": 247
},
{
"epoch": 1.0290456431535269,
"grad_norm": 0.7240958065398064,
"learning_rate": 4.771955188504371e-06,
"loss": 0.0998,
"step": 248
},
{
"epoch": 1.033195020746888,
"grad_norm": 0.7601470439290279,
"learning_rate": 4.739405040723798e-06,
"loss": 0.0781,
"step": 249
},
{
"epoch": 1.037344398340249,
"grad_norm": 0.8020958279653632,
"learning_rate": 4.7068659635173034e-06,
"loss": 0.099,
"step": 250
},
{
"epoch": 1.04149377593361,
"grad_norm": 0.8193127008593971,
"learning_rate": 4.6743393392073435e-06,
"loss": 0.0905,
"step": 251
},
{
"epoch": 1.045643153526971,
"grad_norm": 0.8674642457830525,
"learning_rate": 4.641826549587352e-06,
"loss": 0.0847,
"step": 252
},
{
"epoch": 1.049792531120332,
"grad_norm": 0.7063356435232804,
"learning_rate": 4.60932897586304e-06,
"loss": 0.0638,
"step": 253
},
{
"epoch": 1.053941908713693,
"grad_norm": 0.7935013912587565,
"learning_rate": 4.57684799859372e-06,
"loss": 0.0868,
"step": 254
},
{
"epoch": 1.058091286307054,
"grad_norm": 0.8078033123113716,
"learning_rate": 4.54438499763365e-06,
"loss": 0.0849,
"step": 255
},
{
"epoch": 1.062240663900415,
"grad_norm": 0.7491115280286844,
"learning_rate": 4.511941352073424e-06,
"loss": 0.0739,
"step": 256
},
{
"epoch": 1.066390041493776,
"grad_norm": 0.7910641602613229,
"learning_rate": 4.479518440181381e-06,
"loss": 0.1018,
"step": 257
},
{
"epoch": 1.070539419087137,
"grad_norm": 0.7384613928856985,
"learning_rate": 4.447117639345052e-06,
"loss": 0.0814,
"step": 258
},
{
"epoch": 1.0746887966804979,
"grad_norm": 0.7576400301646911,
"learning_rate": 4.414740326012649e-06,
"loss": 0.0767,
"step": 259
},
{
"epoch": 1.0788381742738589,
"grad_norm": 0.7353105802879545,
"learning_rate": 4.382387875634592e-06,
"loss": 0.0824,
"step": 260
},
{
"epoch": 1.0829875518672198,
"grad_norm": 0.7551849354862943,
"learning_rate": 4.3500616626050705e-06,
"loss": 0.1021,
"step": 261
},
{
"epoch": 1.0871369294605808,
"grad_norm": 0.753933529100596,
"learning_rate": 4.317763060203665e-06,
"loss": 0.0906,
"step": 262
},
{
"epoch": 1.0912863070539418,
"grad_norm": 0.6657270829997127,
"learning_rate": 4.285493440537002e-06,
"loss": 0.0699,
"step": 263
},
{
"epoch": 1.095435684647303,
"grad_norm": 0.7356269847744574,
"learning_rate": 4.253254174480462e-06,
"loss": 0.0917,
"step": 264
},
{
"epoch": 1.099585062240664,
"grad_norm": 0.7056456539678235,
"learning_rate": 4.221046631619956e-06,
"loss": 0.0762,
"step": 265
},
{
"epoch": 1.103734439834025,
"grad_norm": 0.7447577063439521,
"learning_rate": 4.188872180193723e-06,
"loss": 0.0773,
"step": 266
},
{
"epoch": 1.107883817427386,
"grad_norm": 0.7285210666049802,
"learning_rate": 4.156732187034219e-06,
"loss": 0.0859,
"step": 267
},
{
"epoch": 1.112033195020747,
"grad_norm": 0.7515360500123252,
"learning_rate": 4.124628017510043e-06,
"loss": 0.081,
"step": 268
},
{
"epoch": 1.116182572614108,
"grad_norm": 0.754107635042173,
"learning_rate": 4.092561035467942e-06,
"loss": 0.0954,
"step": 269
},
{
"epoch": 1.120331950207469,
"grad_norm": 0.7658986514175681,
"learning_rate": 4.060532603174865e-06,
"loss": 0.0803,
"step": 270
},
{
"epoch": 1.1244813278008299,
"grad_norm": 0.7164719929051085,
"learning_rate": 4.028544081260093e-06,
"loss": 0.0875,
"step": 271
},
{
"epoch": 1.1286307053941909,
"grad_norm": 0.7811385800972357,
"learning_rate": 3.996596828657437e-06,
"loss": 0.0957,
"step": 272
},
{
"epoch": 1.1327800829875518,
"grad_norm": 0.8095703190806857,
"learning_rate": 3.9646922025475126e-06,
"loss": 0.0888,
"step": 273
},
{
"epoch": 1.1369294605809128,
"grad_norm": 0.8577161530627458,
"learning_rate": 3.932831558300074e-06,
"loss": 0.1304,
"step": 274
},
{
"epoch": 1.1410788381742738,
"grad_norm": 0.7551634325885881,
"learning_rate": 3.9010162494164475e-06,
"loss": 0.0694,
"step": 275
},
{
"epoch": 1.1452282157676348,
"grad_norm": 0.7259339364603347,
"learning_rate": 3.869247627472021e-06,
"loss": 0.092,
"step": 276
},
{
"epoch": 1.1493775933609958,
"grad_norm": 0.7797326733766821,
"learning_rate": 3.837527042058836e-06,
"loss": 0.0884,
"step": 277
},
{
"epoch": 1.1535269709543567,
"grad_norm": 0.7885805459732599,
"learning_rate": 3.8058558407282465e-06,
"loss": 0.0787,
"step": 278
},
{
"epoch": 1.1576763485477177,
"grad_norm": 0.8196108480937061,
"learning_rate": 3.7742353689336753e-06,
"loss": 0.0809,
"step": 279
},
{
"epoch": 1.161825726141079,
"grad_norm": 0.775286344200391,
"learning_rate": 3.742666969973463e-06,
"loss": 0.0885,
"step": 280
},
{
"epoch": 1.16597510373444,
"grad_norm": 0.7244030638509119,
"learning_rate": 3.7111519849337908e-06,
"loss": 0.08,
"step": 281
},
{
"epoch": 1.170124481327801,
"grad_norm": 0.8052024240159372,
"learning_rate": 3.6796917526317153e-06,
"loss": 0.1052,
"step": 282
},
{
"epoch": 1.1742738589211619,
"grad_norm": 0.7481457604441057,
"learning_rate": 3.648287609558291e-06,
"loss": 0.0828,
"step": 283
},
{
"epoch": 1.1784232365145229,
"grad_norm": 0.8236128177922952,
"learning_rate": 3.6169408898217973e-06,
"loss": 0.126,
"step": 284
},
{
"epoch": 1.1825726141078838,
"grad_norm": 0.8014821382259918,
"learning_rate": 3.5856529250910565e-06,
"loss": 0.1017,
"step": 285
},
{
"epoch": 1.1867219917012448,
"grad_norm": 0.7657225404471665,
"learning_rate": 3.554425044538868e-06,
"loss": 0.0817,
"step": 286
},
{
"epoch": 1.1908713692946058,
"grad_norm": 0.7292541020615424,
"learning_rate": 3.5232585747855376e-06,
"loss": 0.0884,
"step": 287
},
{
"epoch": 1.1950207468879668,
"grad_norm": 0.7104999733621218,
"learning_rate": 3.4921548398425246e-06,
"loss": 0.0705,
"step": 288
},
{
"epoch": 1.1991701244813278,
"grad_norm": 0.7340653325120157,
"learning_rate": 3.461115161056191e-06,
"loss": 0.0876,
"step": 289
},
{
"epoch": 1.2033195020746887,
"grad_norm": 0.8218266725660978,
"learning_rate": 3.430140857051675e-06,
"loss": 0.104,
"step": 290
},
{
"epoch": 1.2074688796680497,
"grad_norm": 0.7539001339920732,
"learning_rate": 3.3992332436768615e-06,
"loss": 0.0663,
"step": 291
},
{
"epoch": 1.2116182572614107,
"grad_norm": 0.8019682759784094,
"learning_rate": 3.3683936339464957e-06,
"loss": 0.0907,
"step": 292
},
{
"epoch": 1.215767634854772,
"grad_norm": 0.7682093069335607,
"learning_rate": 3.3376233379863943e-06,
"loss": 0.0766,
"step": 293
},
{
"epoch": 1.2199170124481329,
"grad_norm": 0.7825481915497294,
"learning_rate": 3.306923662977789e-06,
"loss": 0.1012,
"step": 294
},
{
"epoch": 1.2240663900414939,
"grad_norm": 0.7162145084205342,
"learning_rate": 3.276295913101808e-06,
"loss": 0.0741,
"step": 295
},
{
"epoch": 1.2282157676348548,
"grad_norm": 0.7318286777705275,
"learning_rate": 3.2457413894840516e-06,
"loss": 0.0787,
"step": 296
},
{
"epoch": 1.2323651452282158,
"grad_norm": 0.7641874664846489,
"learning_rate": 3.215261390139332e-06,
"loss": 0.0932,
"step": 297
},
{
"epoch": 1.2365145228215768,
"grad_norm": 0.7445085106059115,
"learning_rate": 3.184857209916528e-06,
"loss": 0.0759,
"step": 298
},
{
"epoch": 1.2406639004149378,
"grad_norm": 0.7754817714088118,
"learning_rate": 3.1545301404435756e-06,
"loss": 0.093,
"step": 299
},
{
"epoch": 1.2448132780082988,
"grad_norm": 0.7577954657403386,
"learning_rate": 3.1242814700725977e-06,
"loss": 0.079,
"step": 300
},
{
"epoch": 1.2489626556016598,
"grad_norm": 0.7421284956952211,
"learning_rate": 3.0941124838251734e-06,
"loss": 0.0816,
"step": 301
},
{
"epoch": 1.2531120331950207,
"grad_norm": 0.8041903952253537,
"learning_rate": 3.064024463337747e-06,
"loss": 0.1059,
"step": 302
},
{
"epoch": 1.2572614107883817,
"grad_norm": 0.8046374578629936,
"learning_rate": 3.034018686807182e-06,
"loss": 0.108,
"step": 303
},
{
"epoch": 1.2614107883817427,
"grad_norm": 0.758163930794237,
"learning_rate": 3.0040964289364618e-06,
"loss": 0.0765,
"step": 304
},
{
"epoch": 1.2655601659751037,
"grad_norm": 0.7344291933199262,
"learning_rate": 2.974258960880535e-06,
"loss": 0.077,
"step": 305
},
{
"epoch": 1.2697095435684647,
"grad_norm": 0.8054215463295404,
"learning_rate": 2.944507550192318e-06,
"loss": 0.0889,
"step": 306
},
{
"epoch": 1.2738589211618256,
"grad_norm": 0.717467590964347,
"learning_rate": 2.9148434607688426e-06,
"loss": 0.0726,
"step": 307
},
{
"epoch": 1.2780082987551866,
"grad_norm": 0.7186905433521467,
"learning_rate": 2.885267952797569e-06,
"loss": 0.0757,
"step": 308
},
{
"epoch": 1.2821576763485476,
"grad_norm": 0.7398347859605869,
"learning_rate": 2.855782282702841e-06,
"loss": 0.0726,
"step": 309
},
{
"epoch": 1.2863070539419086,
"grad_norm": 0.6866464266246485,
"learning_rate": 2.826387703092528e-06,
"loss": 0.0645,
"step": 310
},
{
"epoch": 1.2904564315352698,
"grad_norm": 0.7476154799430902,
"learning_rate": 2.7970854627047893e-06,
"loss": 0.0768,
"step": 311
},
{
"epoch": 1.2946058091286308,
"grad_norm": 0.7783465796646424,
"learning_rate": 2.7678768063550454e-06,
"loss": 0.0952,
"step": 312
},
{
"epoch": 1.2987551867219918,
"grad_norm": 0.7018719402660284,
"learning_rate": 2.738762974883078e-06,
"loss": 0.074,
"step": 313
},
{
"epoch": 1.3029045643153527,
"grad_norm": 0.7006326758865006,
"learning_rate": 2.7097452051003375e-06,
"loss": 0.0752,
"step": 314
},
{
"epoch": 1.3070539419087137,
"grad_norm": 0.7664880666119144,
"learning_rate": 2.680824729737378e-06,
"loss": 0.0823,
"step": 315
},
{
"epoch": 1.3112033195020747,
"grad_norm": 0.7752147371943559,
"learning_rate": 2.6520027773915075e-06,
"loss": 0.0823,
"step": 316
},
{
"epoch": 1.3153526970954357,
"grad_norm": 0.7273629350561008,
"learning_rate": 2.623280572474587e-06,
"loss": 0.0793,
"step": 317
},
{
"epoch": 1.3195020746887967,
"grad_norm": 0.7622833114877324,
"learning_rate": 2.594659335161008e-06,
"loss": 0.0858,
"step": 318
},
{
"epoch": 1.3236514522821576,
"grad_norm": 0.6911144920395204,
"learning_rate": 2.566140281335875e-06,
"loss": 0.0773,
"step": 319
},
{
"epoch": 1.3278008298755186,
"grad_norm": 0.672521374023277,
"learning_rate": 2.5377246225433306e-06,
"loss": 0.0637,
"step": 320
},
{
"epoch": 1.3319502074688796,
"grad_norm": 0.712016056392291,
"learning_rate": 2.509413565935107e-06,
"loss": 0.0672,
"step": 321
},
{
"epoch": 1.3360995850622408,
"grad_norm": 0.7257502701410873,
"learning_rate": 2.481208314219233e-06,
"loss": 0.0729,
"step": 322
},
{
"epoch": 1.3402489626556018,
"grad_norm": 0.757287282113775,
"learning_rate": 2.4531100656089365e-06,
"loss": 0.0748,
"step": 323
},
{
"epoch": 1.3443983402489628,
"grad_norm": 0.7916678661865387,
"learning_rate": 2.4251200137717545e-06,
"loss": 0.0851,
"step": 324
},
{
"epoch": 1.3485477178423237,
"grad_norm": 0.6867276776453386,
"learning_rate": 2.3972393477788157e-06,
"loss": 0.0662,
"step": 325
},
{
"epoch": 1.3526970954356847,
"grad_norm": 0.824772150211262,
"learning_rate": 2.3694692520543293e-06,
"loss": 0.0874,
"step": 326
},
{
"epoch": 1.3568464730290457,
"grad_norm": 0.7732735468909508,
"learning_rate": 2.3418109063252625e-06,
"loss": 0.0823,
"step": 327
},
{
"epoch": 1.3609958506224067,
"grad_norm": 0.8837779304062807,
"learning_rate": 2.3142654855712353e-06,
"loss": 0.0862,
"step": 328
},
{
"epoch": 1.3651452282157677,
"grad_norm": 0.6859223843936688,
"learning_rate": 2.2868341599745895e-06,
"loss": 0.072,
"step": 329
},
{
"epoch": 1.3692946058091287,
"grad_norm": 0.822431545792467,
"learning_rate": 2.259518094870693e-06,
"loss": 0.0943,
"step": 330
},
{
"epoch": 1.3734439834024896,
"grad_norm": 0.873667652862396,
"learning_rate": 2.2323184506984257e-06,
"loss": 0.0836,
"step": 331
},
{
"epoch": 1.3775933609958506,
"grad_norm": 0.7763715356974263,
"learning_rate": 2.2052363829508776e-06,
"loss": 0.0861,
"step": 332
},
{
"epoch": 1.3817427385892116,
"grad_norm": 0.8203628811889314,
"learning_rate": 2.1782730421262738e-06,
"loss": 0.0854,
"step": 333
},
{
"epoch": 1.3858921161825726,
"grad_norm": 0.7403334503658842,
"learning_rate": 2.151429573679084e-06,
"loss": 0.0726,
"step": 334
},
{
"epoch": 1.3900414937759336,
"grad_norm": 0.7925473877054873,
"learning_rate": 2.1247071179713774e-06,
"loss": 0.0848,
"step": 335
},
{
"epoch": 1.3941908713692945,
"grad_norm": 0.7318960603550514,
"learning_rate": 2.098106810224362e-06,
"loss": 0.0717,
"step": 336
},
{
"epoch": 1.3983402489626555,
"grad_norm": 0.7393994006621016,
"learning_rate": 2.071629780470171e-06,
"loss": 0.0711,
"step": 337
},
{
"epoch": 1.4024896265560165,
"grad_norm": 0.7866727572307913,
"learning_rate": 2.0452771535038518e-06,
"loss": 0.0903,
"step": 338
},
{
"epoch": 1.4066390041493775,
"grad_norm": 0.8011405845759542,
"learning_rate": 2.0190500488355776e-06,
"loss": 0.0965,
"step": 339
},
{
"epoch": 1.4107883817427385,
"grad_norm": 0.7024967230772534,
"learning_rate": 1.9929495806431024e-06,
"loss": 0.0713,
"step": 340
},
{
"epoch": 1.4149377593360997,
"grad_norm": 0.7447848584179991,
"learning_rate": 1.9669768577244107e-06,
"loss": 0.0727,
"step": 341
},
{
"epoch": 1.4190871369294606,
"grad_norm": 0.7397444847607574,
"learning_rate": 1.9411329834506286e-06,
"loss": 0.0701,
"step": 342
},
{
"epoch": 1.4232365145228216,
"grad_norm": 0.8596586029285681,
"learning_rate": 1.9154190557191387e-06,
"loss": 0.0935,
"step": 343
},
{
"epoch": 1.4273858921161826,
"grad_norm": 0.7468897071548876,
"learning_rate": 1.8898361669069497e-06,
"loss": 0.0772,
"step": 344
},
{
"epoch": 1.4315352697095436,
"grad_norm": 0.7567860586556581,
"learning_rate": 1.864385403824287e-06,
"loss": 0.0858,
"step": 345
},
{
"epoch": 1.4356846473029046,
"grad_norm": 0.74547343386958,
"learning_rate": 1.8390678476684143e-06,
"loss": 0.0698,
"step": 346
},
{
"epoch": 1.4398340248962656,
"grad_norm": 0.8287041007092206,
"learning_rate": 1.8138845739777167e-06,
"loss": 0.101,
"step": 347
},
{
"epoch": 1.4439834024896265,
"grad_norm": 0.6949015586020539,
"learning_rate": 1.7888366525859968e-06,
"loss": 0.0682,
"step": 348
},
{
"epoch": 1.4481327800829875,
"grad_norm": 0.8019337139478665,
"learning_rate": 1.7639251475770374e-06,
"loss": 0.1039,
"step": 349
},
{
"epoch": 1.4522821576763485,
"grad_norm": 0.7205296459401602,
"learning_rate": 1.7391511172393849e-06,
"loss": 0.0719,
"step": 350
},
{
"epoch": 1.4564315352697095,
"grad_norm": 0.6811493853455287,
"learning_rate": 1.7145156140214032e-06,
"loss": 0.0749,
"step": 351
},
{
"epoch": 1.4605809128630705,
"grad_norm": 0.7172097545397581,
"learning_rate": 1.6900196844865575e-06,
"loss": 0.0738,
"step": 352
},
{
"epoch": 1.4647302904564317,
"grad_norm": 0.7893967258035767,
"learning_rate": 1.6656643692689512e-06,
"loss": 0.0887,
"step": 353
},
{
"epoch": 1.4688796680497926,
"grad_norm": 0.7589832042178465,
"learning_rate": 1.6414507030291249e-06,
"loss": 0.0922,
"step": 354
},
{
"epoch": 1.4730290456431536,
"grad_norm": 0.7925586053286797,
"learning_rate": 1.617379714410099e-06,
"loss": 0.0873,
"step": 355
},
{
"epoch": 1.4771784232365146,
"grad_norm": 0.7675642422062359,
"learning_rate": 1.5934524259936757e-06,
"loss": 0.083,
"step": 356
},
{
"epoch": 1.4813278008298756,
"grad_norm": 0.6337455156122633,
"learning_rate": 1.5696698542569905e-06,
"loss": 0.0673,
"step": 357
},
{
"epoch": 1.4854771784232366,
"grad_norm": 0.7199821450528227,
"learning_rate": 1.5460330095293447e-06,
"loss": 0.0722,
"step": 358
},
{
"epoch": 1.4896265560165975,
"grad_norm": 0.8024411048948309,
"learning_rate": 1.5225428959492695e-06,
"loss": 0.0806,
"step": 359
},
{
"epoch": 1.4937759336099585,
"grad_norm": 0.6978748125879287,
"learning_rate": 1.4992005114218805e-06,
"loss": 0.0705,
"step": 360
},
{
"epoch": 1.4979253112033195,
"grad_norm": 0.7368562452138163,
"learning_rate": 1.4760068475764789e-06,
"loss": 0.0792,
"step": 361
},
{
"epoch": 1.5020746887966805,
"grad_norm": 0.727972065311514,
"learning_rate": 1.4529628897244214e-06,
"loss": 0.0667,
"step": 362
},
{
"epoch": 1.5062240663900415,
"grad_norm": 0.7055803502397686,
"learning_rate": 1.4300696168172735e-06,
"loss": 0.0636,
"step": 363
},
{
"epoch": 1.5103734439834025,
"grad_norm": 0.7192990056922836,
"learning_rate": 1.4073280014052077e-06,
"loss": 0.0812,
"step": 364
},
{
"epoch": 1.5145228215767634,
"grad_norm": 0.7730145132377668,
"learning_rate": 1.3847390095957003e-06,
"loss": 0.079,
"step": 365
},
{
"epoch": 1.5186721991701244,
"grad_norm": 0.6999079487941452,
"learning_rate": 1.3623036010124845e-06,
"loss": 0.0666,
"step": 366
},
{
"epoch": 1.5228215767634854,
"grad_norm": 0.8084988090864743,
"learning_rate": 1.3400227287547785e-06,
"loss": 0.1028,
"step": 367
},
{
"epoch": 1.5269709543568464,
"grad_norm": 0.739877983711314,
"learning_rate": 1.3178973393568055e-06,
"loss": 0.0788,
"step": 368
},
{
"epoch": 1.5311203319502074,
"grad_norm": 0.7277670152792125,
"learning_rate": 1.295928372747574e-06,
"loss": 0.0782,
"step": 369
},
{
"epoch": 1.5352697095435683,
"grad_norm": 0.8270870127969331,
"learning_rate": 1.2741167622109557e-06,
"loss": 0.1103,
"step": 370
},
{
"epoch": 1.5394190871369293,
"grad_norm": 0.7464319854627752,
"learning_rate": 1.2524634343460335e-06,
"loss": 0.0668,
"step": 371
},
{
"epoch": 1.5435684647302903,
"grad_norm": 0.7496260324397296,
"learning_rate": 1.2309693090277392e-06,
"loss": 0.0836,
"step": 372
},
{
"epoch": 1.5477178423236515,
"grad_norm": 0.7514590668417973,
"learning_rate": 1.2096352993677712e-06,
"loss": 0.0722,
"step": 373
},
{
"epoch": 1.5518672199170125,
"grad_norm": 0.7074968757753791,
"learning_rate": 1.1884623116758121e-06,
"loss": 0.0679,
"step": 374
},
{
"epoch": 1.5560165975103735,
"grad_norm": 0.7836392476257869,
"learning_rate": 1.1674512454210202e-06,
"loss": 0.0956,
"step": 375
},
{
"epoch": 1.5601659751037344,
"grad_norm": 0.694186080406532,
"learning_rate": 1.1466029931938182e-06,
"loss": 0.0738,
"step": 376
},
{
"epoch": 1.5643153526970954,
"grad_norm": 0.7353783509200603,
"learning_rate": 1.125918440667982e-06,
"loss": 0.0844,
"step": 377
},
{
"epoch": 1.5684647302904564,
"grad_norm": 0.6929484125543315,
"learning_rate": 1.1053984665630025e-06,
"loss": 0.0645,
"step": 378
},
{
"epoch": 1.5726141078838174,
"grad_norm": 0.7605210746246555,
"learning_rate": 1.0850439426067705e-06,
"loss": 0.0701,
"step": 379
},
{
"epoch": 1.5767634854771784,
"grad_norm": 0.7885536633604376,
"learning_rate": 1.064855733498531e-06,
"loss": 0.1002,
"step": 380
},
{
"epoch": 1.5809128630705396,
"grad_norm": 0.7739423491814291,
"learning_rate": 1.0448346968721596e-06,
"loss": 0.0822,
"step": 381
},
{
"epoch": 1.5850622406639006,
"grad_norm": 0.7000383997686694,
"learning_rate": 1.024981683259723e-06,
"loss": 0.0724,
"step": 382
},
{
"epoch": 1.5892116182572615,
"grad_norm": 0.7612207692729869,
"learning_rate": 1.0052975360553446e-06,
"loss": 0.0789,
"step": 383
},
{
"epoch": 1.5933609958506225,
"grad_norm": 0.7376770921542023,
"learning_rate": 9.857830914793827e-07,
"loss": 0.0787,
"step": 384
},
{
"epoch": 1.5975103734439835,
"grad_norm": 0.7003600585245541,
"learning_rate": 9.664391785428977e-07,
"loss": 0.0669,
"step": 385
},
{
"epoch": 1.6016597510373445,
"grad_norm": 0.7493354808214869,
"learning_rate": 9.472666190124457e-07,
"loss": 0.0939,
"step": 386
},
{
"epoch": 1.6058091286307055,
"grad_norm": 0.6906782939093074,
"learning_rate": 9.282662273751536e-07,
"loss": 0.0654,
"step": 387
},
{
"epoch": 1.6099585062240664,
"grad_norm": 0.7217843451690958,
"learning_rate": 9.094388108041302e-07,
"loss": 0.0728,
"step": 388
},
{
"epoch": 1.6141078838174274,
"grad_norm": 0.7503564894362919,
"learning_rate": 8.907851691241709e-07,
"loss": 0.0786,
"step": 389
},
{
"epoch": 1.6182572614107884,
"grad_norm": 0.6989476339850978,
"learning_rate": 8.723060947777778e-07,
"loss": 0.0693,
"step": 390
},
{
"epoch": 1.6224066390041494,
"grad_norm": 0.7214414839493923,
"learning_rate": 8.540023727915015e-07,
"loss": 0.0693,
"step": 391
},
{
"epoch": 1.6265560165975104,
"grad_norm": 0.7436689425663727,
"learning_rate": 8.358747807425827e-07,
"loss": 0.0817,
"step": 392
},
{
"epoch": 1.6307053941908713,
"grad_norm": 0.6925551456815252,
"learning_rate": 8.179240887259304e-07,
"loss": 0.0722,
"step": 393
},
{
"epoch": 1.6348547717842323,
"grad_norm": 0.7594694713557898,
"learning_rate": 8.001510593213946e-07,
"loss": 0.0961,
"step": 394
},
{
"epoch": 1.6390041493775933,
"grad_norm": 0.7342750817062677,
"learning_rate": 7.825564475613806e-07,
"loss": 0.0768,
"step": 395
},
{
"epoch": 1.6431535269709543,
"grad_norm": 0.688266435838653,
"learning_rate": 7.651410008987698e-07,
"loss": 0.0732,
"step": 396
},
{
"epoch": 1.6473029045643153,
"grad_norm": 0.7322550264869961,
"learning_rate": 7.479054591751623e-07,
"loss": 0.0747,
"step": 397
},
{
"epoch": 1.6514522821576763,
"grad_norm": 0.7206648764802494,
"learning_rate": 7.308505545894567e-07,
"loss": 0.0755,
"step": 398
},
{
"epoch": 1.6556016597510372,
"grad_norm": 0.7661344327610974,
"learning_rate": 7.139770116667333e-07,
"loss": 0.0777,
"step": 399
},
{
"epoch": 1.6597510373443982,
"grad_norm": 0.6668670661993277,
"learning_rate": 6.972855472274853e-07,
"loss": 0.065,
"step": 400
},
{
"epoch": 1.6597510373443982,
"eval_loss": 0.18336524069309235,
"eval_runtime": 1.4913,
"eval_samples_per_second": 13.411,
"eval_steps_per_second": 3.353,
"step": 400
},
{
"epoch": 1.6639004149377592,
"grad_norm": 0.7407493531818214,
"learning_rate": 6.807768703571616e-07,
"loss": 0.0753,
"step": 401
},
{
"epoch": 1.6680497925311202,
"grad_norm": 0.7021176179745514,
"learning_rate": 6.644516823760439e-07,
"loss": 0.0687,
"step": 402
},
{
"epoch": 1.6721991701244814,
"grad_norm": 0.7544780378754766,
"learning_rate": 6.483106768094516e-07,
"loss": 0.0826,
"step": 403
},
{
"epoch": 1.6763485477178424,
"grad_norm": 0.7181204694523459,
"learning_rate": 6.323545393582847e-07,
"loss": 0.0646,
"step": 404
},
{
"epoch": 1.6804979253112033,
"grad_norm": 0.728528171136692,
"learning_rate": 6.165839478698909e-07,
"loss": 0.0735,
"step": 405
},
{
"epoch": 1.6846473029045643,
"grad_norm": 0.7584024865505997,
"learning_rate": 6.009995723092655e-07,
"loss": 0.0701,
"step": 406
},
{
"epoch": 1.6887966804979253,
"grad_norm": 0.7362510646650599,
"learning_rate": 5.85602074730598e-07,
"loss": 0.0702,
"step": 407
},
{
"epoch": 1.6929460580912863,
"grad_norm": 0.8094365167771168,
"learning_rate": 5.703921092491393e-07,
"loss": 0.086,
"step": 408
},
{
"epoch": 1.6970954356846473,
"grad_norm": 0.7692786267568079,
"learning_rate": 5.553703220134188e-07,
"loss": 0.0847,
"step": 409
},
{
"epoch": 1.7012448132780082,
"grad_norm": 0.7898257972112215,
"learning_rate": 5.405373511777939e-07,
"loss": 0.0935,
"step": 410
},
{
"epoch": 1.7053941908713695,
"grad_norm": 0.8534045006213508,
"learning_rate": 5.258938268753344e-07,
"loss": 0.1036,
"step": 411
},
{
"epoch": 1.7095435684647304,
"grad_norm": 0.8122284703848481,
"learning_rate": 5.114403711910631e-07,
"loss": 0.0897,
"step": 412
},
{
"epoch": 1.7136929460580914,
"grad_norm": 0.6752641341390524,
"learning_rate": 4.971775981355181e-07,
"loss": 0.0618,
"step": 413
},
{
"epoch": 1.7178423236514524,
"grad_norm": 0.7247070440800305,
"learning_rate": 4.831061136186787e-07,
"loss": 0.0652,
"step": 414
},
{
"epoch": 1.7219917012448134,
"grad_norm": 0.7935319653399917,
"learning_rate": 4.692265154242137e-07,
"loss": 0.0802,
"step": 415
},
{
"epoch": 1.7261410788381744,
"grad_norm": 0.7337802366702694,
"learning_rate": 4.555393931841001e-07,
"loss": 0.067,
"step": 416
},
{
"epoch": 1.7302904564315353,
"grad_norm": 0.7798660236301157,
"learning_rate": 4.420453283535597e-07,
"loss": 0.0851,
"step": 417
},
{
"epoch": 1.7344398340248963,
"grad_norm": 0.7189681188334966,
"learning_rate": 4.287448941863692e-07,
"loss": 0.0714,
"step": 418
},
{
"epoch": 1.7385892116182573,
"grad_norm": 0.6926391079205091,
"learning_rate": 4.1563865571050243e-07,
"loss": 0.0597,
"step": 419
},
{
"epoch": 1.7427385892116183,
"grad_norm": 0.7673967245329002,
"learning_rate": 4.0272716970412516e-07,
"loss": 0.0846,
"step": 420
},
{
"epoch": 1.7468879668049793,
"grad_norm": 0.7922534453437592,
"learning_rate": 3.9001098467194907e-07,
"loss": 0.0849,
"step": 421
},
{
"epoch": 1.7510373443983402,
"grad_norm": 0.7766499519398827,
"learning_rate": 3.7749064082191976e-07,
"loss": 0.0792,
"step": 422
},
{
"epoch": 1.7551867219917012,
"grad_norm": 0.7624109782606755,
"learning_rate": 3.6516667004227904e-07,
"loss": 0.0748,
"step": 423
},
{
"epoch": 1.7593360995850622,
"grad_norm": 0.8028939023287873,
"learning_rate": 3.53039595878959e-07,
"loss": 0.0858,
"step": 424
},
{
"epoch": 1.7634854771784232,
"grad_norm": 0.7305663120522499,
"learning_rate": 3.4110993351334944e-07,
"loss": 0.0693,
"step": 425
},
{
"epoch": 1.7676348547717842,
"grad_norm": 0.7928883317130192,
"learning_rate": 3.2937818974040637e-07,
"loss": 0.0824,
"step": 426
},
{
"epoch": 1.7717842323651452,
"grad_norm": 0.7302477597530965,
"learning_rate": 3.178448629471226e-07,
"loss": 0.0745,
"step": 427
},
{
"epoch": 1.7759336099585061,
"grad_norm": 0.84724113691283,
"learning_rate": 3.0651044309136016e-07,
"loss": 0.0801,
"step": 428
},
{
"epoch": 1.7800829875518671,
"grad_norm": 0.8237508221533169,
"learning_rate": 2.953754116810287e-07,
"loss": 0.1089,
"step": 429
},
{
"epoch": 1.784232365145228,
"grad_norm": 0.7556669232924229,
"learning_rate": 2.844402417536374e-07,
"loss": 0.0667,
"step": 430
},
{
"epoch": 1.788381742738589,
"grad_norm": 0.772080989183171,
"learning_rate": 2.737053978561943e-07,
"loss": 0.0838,
"step": 431
},
{
"epoch": 1.79253112033195,
"grad_norm": 0.7102656535327171,
"learning_rate": 2.631713360254734e-07,
"loss": 0.0737,
"step": 432
},
{
"epoch": 1.796680497925311,
"grad_norm": 0.788052093799855,
"learning_rate": 2.5283850376864206e-07,
"loss": 0.0838,
"step": 433
},
{
"epoch": 1.8008298755186722,
"grad_norm": 0.7658472127903286,
"learning_rate": 2.4270734004424643e-07,
"loss": 0.0895,
"step": 434
},
{
"epoch": 1.8049792531120332,
"grad_norm": 0.67209631879037,
"learning_rate": 2.3277827524356976e-07,
"loss": 0.0606,
"step": 435
},
{
"epoch": 1.8091286307053942,
"grad_norm": 0.7565388407521139,
"learning_rate": 2.2305173117234236e-07,
"loss": 0.084,
"step": 436
},
{
"epoch": 1.8132780082987552,
"grad_norm": 0.7492152065844847,
"learning_rate": 2.1352812103282715e-07,
"loss": 0.0791,
"step": 437
},
{
"epoch": 1.8174273858921162,
"grad_norm": 0.7805607963350516,
"learning_rate": 2.042078494062616e-07,
"loss": 0.0771,
"step": 438
},
{
"epoch": 1.8215767634854771,
"grad_norm": 0.7172485874763758,
"learning_rate": 1.9509131223567623e-07,
"loss": 0.0719,
"step": 439
},
{
"epoch": 1.8257261410788381,
"grad_norm": 0.8259753492038978,
"learning_rate": 1.861788968090683e-07,
"loss": 0.0855,
"step": 440
},
{
"epoch": 1.8298755186721993,
"grad_norm": 0.731434220061269,
"learning_rate": 1.7747098174295208e-07,
"loss": 0.072,
"step": 441
},
{
"epoch": 1.8340248962655603,
"grad_norm": 0.8070175019781584,
"learning_rate": 1.68967936966275e-07,
"loss": 0.0881,
"step": 442
},
{
"epoch": 1.8381742738589213,
"grad_norm": 0.7425563064212982,
"learning_rate": 1.606701237046998e-07,
"loss": 0.0679,
"step": 443
},
{
"epoch": 1.8423236514522823,
"grad_norm": 0.7479820412933463,
"learning_rate": 1.5257789446526172e-07,
"loss": 0.0764,
"step": 444
},
{
"epoch": 1.8464730290456433,
"grad_norm": 0.736134548408696,
"learning_rate": 1.4469159302139157e-07,
"loss": 0.0735,
"step": 445
},
{
"epoch": 1.8506224066390042,
"grad_norm": 0.7885288652580238,
"learning_rate": 1.3701155439831249e-07,
"loss": 0.0891,
"step": 446
},
{
"epoch": 1.8547717842323652,
"grad_norm": 0.7169793781256871,
"learning_rate": 1.295381048588068e-07,
"loss": 0.0735,
"step": 447
},
{
"epoch": 1.8589211618257262,
"grad_norm": 0.7419465114699466,
"learning_rate": 1.2227156188935552e-07,
"loss": 0.0767,
"step": 448
},
{
"epoch": 1.8630705394190872,
"grad_norm": 0.8529136766099726,
"learning_rate": 1.1521223418665295e-07,
"loss": 0.1095,
"step": 449
},
{
"epoch": 1.8672199170124482,
"grad_norm": 0.6673541325500725,
"learning_rate": 1.0836042164448945e-07,
"loss": 0.0669,
"step": 450
},
{
"epoch": 1.8713692946058091,
"grad_norm": 0.7662725160062882,
"learning_rate": 1.017164153410144e-07,
"loss": 0.1,
"step": 451
},
{
"epoch": 1.8755186721991701,
"grad_norm": 0.7503369715965427,
"learning_rate": 9.528049752636714e-08,
"loss": 0.072,
"step": 452
},
{
"epoch": 1.879668049792531,
"grad_norm": 0.8105258662491354,
"learning_rate": 8.905294161069111e-08,
"loss": 0.1024,
"step": 453
},
{
"epoch": 1.883817427385892,
"grad_norm": 0.7200030691605961,
"learning_rate": 8.303401215251583e-08,
"loss": 0.0709,
"step": 454
},
{
"epoch": 1.887966804979253,
"grad_norm": 0.7279970331056289,
"learning_rate": 7.722396484751705e-08,
"loss": 0.0691,
"step": 455
},
{
"epoch": 1.892116182572614,
"grad_norm": 0.8032992136998793,
"learning_rate": 7.16230465176565e-08,
"loss": 0.0866,
"step": 456
},
{
"epoch": 1.896265560165975,
"grad_norm": 0.7754511619306107,
"learning_rate": 6.623149510069593e-08,
"loss": 0.0774,
"step": 457
},
{
"epoch": 1.900414937759336,
"grad_norm": 0.8395976573342571,
"learning_rate": 6.104953964008897e-08,
"loss": 0.0725,
"step": 458
},
{
"epoch": 1.904564315352697,
"grad_norm": 0.732494745176438,
"learning_rate": 5.6077400275248996e-08,
"loss": 0.0777,
"step": 459
},
{
"epoch": 1.908713692946058,
"grad_norm": 0.7307232915987477,
"learning_rate": 5.1315288232201e-08,
"loss": 0.073,
"step": 460
},
{
"epoch": 1.912863070539419,
"grad_norm": 0.7289477709176224,
"learning_rate": 4.6763405814604926e-08,
"loss": 0.0668,
"step": 461
},
{
"epoch": 1.91701244813278,
"grad_norm": 0.6542938168620738,
"learning_rate": 4.2421946395164174e-08,
"loss": 0.0593,
"step": 462
},
{
"epoch": 1.921161825726141,
"grad_norm": 0.7217186844568827,
"learning_rate": 3.829109440740719e-08,
"loss": 0.0741,
"step": 463
},
{
"epoch": 1.9253112033195021,
"grad_norm": 0.7521600164181688,
"learning_rate": 3.437102533785541e-08,
"loss": 0.0815,
"step": 464
},
{
"epoch": 1.929460580912863,
"grad_norm": 0.7159778531199582,
"learning_rate": 3.066190571856864e-08,
"loss": 0.0721,
"step": 465
},
{
"epoch": 1.933609958506224,
"grad_norm": 0.7659356632299632,
"learning_rate": 2.7163893120066288e-08,
"loss": 0.0923,
"step": 466
},
{
"epoch": 1.937759336099585,
"grad_norm": 0.7560583950564549,
"learning_rate": 2.3877136144638823e-08,
"loss": 0.0981,
"step": 467
},
{
"epoch": 1.941908713692946,
"grad_norm": 0.7329659724004767,
"learning_rate": 2.0801774420031172e-08,
"loss": 0.0821,
"step": 468
},
{
"epoch": 1.946058091286307,
"grad_norm": 0.785189771179115,
"learning_rate": 1.793793859351245e-08,
"loss": 0.0907,
"step": 469
},
{
"epoch": 1.950207468879668,
"grad_norm": 0.7073552301072518,
"learning_rate": 1.5285750326325953e-08,
"loss": 0.0702,
"step": 470
},
{
"epoch": 1.9543568464730292,
"grad_norm": 0.7118012286048271,
"learning_rate": 1.284532228851998e-08,
"loss": 0.0846,
"step": 471
},
{
"epoch": 1.9585062240663902,
"grad_norm": 0.7399820417525123,
"learning_rate": 1.0616758154161633e-08,
"loss": 0.067,
"step": 472
},
{
"epoch": 1.9626556016597512,
"grad_norm": 0.727348534517167,
"learning_rate": 8.600152596933142e-09,
"loss": 0.0691,
"step": 473
},
{
"epoch": 1.9668049792531122,
"grad_norm": 0.7454470988451566,
"learning_rate": 6.7955912861095155e-09,
"loss": 0.0885,
"step": 474
},
{
"epoch": 1.9709543568464731,
"grad_norm": 0.7525549770710052,
"learning_rate": 5.203150882918673e-09,
"loss": 0.0892,
"step": 475
},
{
"epoch": 1.9751037344398341,
"grad_norm": 0.8218857035464914,
"learning_rate": 3.822899037286276e-09,
"loss": 0.0975,
"step": 476
},
{
"epoch": 1.979253112033195,
"grad_norm": 0.7338075673837552,
"learning_rate": 2.654894384959694e-09,
"loss": 0.0709,
"step": 477
},
{
"epoch": 1.983402489626556,
"grad_norm": 0.7806915747584687,
"learning_rate": 1.6991865450188827e-09,
"loss": 0.091,
"step": 478
},
{
"epoch": 1.987551867219917,
"grad_norm": 0.700019407551831,
"learning_rate": 9.558161177669612e-10,
"loss": 0.0738,
"step": 479
},
{
"epoch": 1.991701244813278,
"grad_norm": 0.7540815180352235,
"learning_rate": 4.2481468300603625e-10,
"loss": 0.0811,
"step": 480
},
{
"epoch": 1.995850622406639,
"grad_norm": 0.7372484737179315,
"learning_rate": 1.0620479869771772e-10,
"loss": 0.0744,
"step": 481
},
{
"epoch": 2.0,
"grad_norm": 0.7894507333442309,
"learning_rate": 0.0,
"loss": 0.0925,
"step": 482
},
{
"epoch": 2.0,
"step": 482,
"total_flos": 16593740365824.0,
"train_loss": 0.13330045866459236,
"train_runtime": 872.8294,
"train_samples_per_second": 4.418,
"train_steps_per_second": 0.552
}
],
"logging_steps": 1,
"max_steps": 482,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 16593740365824.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}