{
"best_global_step": 7500,
"best_metric": 0.38118186593055725,
"best_model_checkpoint": "./byt5_leetspeak_v3/checkpoint-7500",
"epoch": 3.0,
"eval_steps": 1500,
"global_step": 8682,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003456022118541559,
"grad_norm": 7.998739719390869,
"learning_rate": 0.0,
"loss": 21.331615447998047,
"step": 1
},
{
"epoch": 0.017280110592707794,
"grad_norm": 4.945672512054443,
"learning_rate": 9.386973180076629e-06,
"loss": 20.682008081552933,
"step": 50
},
{
"epoch": 0.03456022118541559,
"grad_norm": 3.5129992961883545,
"learning_rate": 1.896551724137931e-05,
"loss": 17.8103759765625,
"step": 100
},
{
"epoch": 0.05184033177812338,
"grad_norm": 3.4432644844055176,
"learning_rate": 2.8544061302681996e-05,
"loss": 15.057559814453125,
"step": 150
},
{
"epoch": 0.06912044237083118,
"grad_norm": 3.018535852432251,
"learning_rate": 3.8122605363984674e-05,
"loss": 13.59551513671875,
"step": 200
},
{
"epoch": 0.08640055296353896,
"grad_norm": 2.247919797897339,
"learning_rate": 4.770114942528736e-05,
"loss": 12.362723388671874,
"step": 250
},
{
"epoch": 0.10368066355624676,
"grad_norm": 2.297297239303589,
"learning_rate": 4.9997487868649304e-05,
"loss": 11.349617919921876,
"step": 300
},
{
"epoch": 0.12096077414895455,
"grad_norm": 2.3222248554229736,
"learning_rate": 4.9986528723171024e-05,
"loss": 10.5994140625,
"step": 350
},
{
"epoch": 0.13824088474166235,
"grad_norm": 2.342911958694458,
"learning_rate": 4.996687585561939e-05,
"loss": 10.017122802734375,
"step": 400
},
{
"epoch": 0.15552099533437014,
"grad_norm": 2.3807644844055176,
"learning_rate": 4.993853610394178e-05,
"loss": 9.509613037109375,
"step": 450
},
{
"epoch": 0.17280110592707792,
"grad_norm": 2.3903160095214844,
"learning_rate": 4.9901519328568466e-05,
"loss": 9.067050170898437,
"step": 500
},
{
"epoch": 0.19008121651978574,
"grad_norm": 2.3767359256744385,
"learning_rate": 4.985583840898188e-05,
"loss": 8.662135620117187,
"step": 550
},
{
"epoch": 0.20736132711249353,
"grad_norm": 2.3853607177734375,
"learning_rate": 4.98015092392353e-05,
"loss": 8.290029296875,
"step": 600
},
{
"epoch": 0.2246414377052013,
"grad_norm": 2.372498035430908,
"learning_rate": 4.973855072242276e-05,
"loss": 7.937987060546875,
"step": 650
},
{
"epoch": 0.2419215482979091,
"grad_norm": 2.359043836593628,
"learning_rate": 4.966698476410199e-05,
"loss": 7.60522705078125,
"step": 700
},
{
"epoch": 0.2592016588906169,
"grad_norm": 2.353739023208618,
"learning_rate": 4.9586836264672666e-05,
"loss": 7.296792602539062,
"step": 750
},
{
"epoch": 0.2764817694833247,
"grad_norm": 2.335012435913086,
"learning_rate": 4.9498133110712644e-05,
"loss": 6.98702392578125,
"step": 800
},
{
"epoch": 0.2937618800760325,
"grad_norm": 2.289266586303711,
"learning_rate": 4.940090616527521e-05,
"loss": 6.709508666992187,
"step": 850
},
{
"epoch": 0.3110419906687403,
"grad_norm": 2.252319097518921,
"learning_rate": 4.929518925715071e-05,
"loss": 6.44697021484375,
"step": 900
},
{
"epoch": 0.32832210126144806,
"grad_norm": 2.216578483581543,
"learning_rate": 4.9181019169096285e-05,
"loss": 6.1863818359375,
"step": 950
},
{
"epoch": 0.34560221185415585,
"grad_norm": 2.1667819023132324,
"learning_rate": 4.90584356250378e-05,
"loss": 5.913253173828125,
"step": 1000
},
{
"epoch": 0.36288232244686364,
"grad_norm": 2.1393256187438965,
"learning_rate": 4.892748127624845e-05,
"loss": 5.687144165039062,
"step": 1050
},
{
"epoch": 0.3801624330395715,
"grad_norm": 2.1089439392089844,
"learning_rate": 4.878820168650884e-05,
"loss": 5.421361083984375,
"step": 1100
},
{
"epoch": 0.39744254363227927,
"grad_norm": 2.0389111042022705,
"learning_rate": 4.864064531625366e-05,
"loss": 5.215165405273438,
"step": 1150
},
{
"epoch": 0.41472265422498705,
"grad_norm": 1.9905728101730347,
"learning_rate": 4.8484863505710585e-05,
"loss": 4.993418273925781,
"step": 1200
},
{
"epoch": 0.43200276481769484,
"grad_norm": 1.915732502937317,
"learning_rate": 4.8320910457037105e-05,
"loss": 4.796431884765625,
"step": 1250
},
{
"epoch": 0.4492828754104026,
"grad_norm": 1.876905083656311,
"learning_rate": 4.814884321546163e-05,
"loss": 4.600077819824219,
"step": 1300
},
{
"epoch": 0.4665629860031104,
"grad_norm": 1.8127694129943848,
"learning_rate": 4.796872164943538e-05,
"loss": 4.390880737304688,
"step": 1350
},
{
"epoch": 0.4838430965958182,
"grad_norm": 1.749795913696289,
"learning_rate": 4.778060842980199e-05,
"loss": 4.223977661132812,
"step": 1400
},
{
"epoch": 0.501123207188526,
"grad_norm": 1.7290434837341309,
"learning_rate": 4.758456900799202e-05,
"loss": 4.044484252929688,
"step": 1450
},
{
"epoch": 0.5184033177812338,
"grad_norm": 1.658087968826294,
"learning_rate": 4.738067159325005e-05,
"loss": 3.873255615234375,
"step": 1500
},
{
"epoch": 0.5184033177812338,
"eval_loss": 1.5466166734695435,
"eval_runtime": 95.1917,
"eval_samples_per_second": 377.995,
"eval_steps_per_second": 0.987,
"step": 1500
},
{
"epoch": 0.5356834283739416,
"grad_norm": 1.5666825771331787,
"learning_rate": 4.716898712890218e-05,
"loss": 3.697227783203125,
"step": 1550
},
{
"epoch": 0.5529635389666494,
"grad_norm": 1.5051823854446411,
"learning_rate": 4.6949589267672256e-05,
"loss": 3.5488134765625,
"step": 1600
},
{
"epoch": 0.5702436495593571,
"grad_norm": 1.4208341836929321,
"learning_rate": 4.6722554346055446e-05,
"loss": 3.41210693359375,
"step": 1650
},
{
"epoch": 0.587523760152065,
"grad_norm": 1.4499037265777588,
"learning_rate": 4.648796135775798e-05,
"loss": 3.27009521484375,
"step": 1700
},
{
"epoch": 0.6048038707447728,
"grad_norm": 1.3428349494934082,
"learning_rate": 4.624589192621235e-05,
"loss": 3.14307373046875,
"step": 1750
},
{
"epoch": 0.6220839813374806,
"grad_norm": 1.2956618070602417,
"learning_rate": 4.599643027617758e-05,
"loss": 3.0137338256835937,
"step": 1800
},
{
"epoch": 0.6393640919301884,
"grad_norm": 1.2256124019622803,
"learning_rate": 4.573966320443433e-05,
"loss": 2.8847576904296877,
"step": 1850
},
{
"epoch": 0.6566442025228961,
"grad_norm": 1.1832093000411987,
"learning_rate": 4.547568004958518e-05,
"loss": 2.783875732421875,
"step": 1900
},
{
"epoch": 0.673924313115604,
"grad_norm": 1.1249213218688965,
"learning_rate": 4.520457266097046e-05,
"loss": 2.658479919433594,
"step": 1950
},
{
"epoch": 0.6912044237083117,
"grad_norm": 1.1056411266326904,
"learning_rate": 4.492643536671052e-05,
"loss": 2.5734375,
"step": 2000
},
{
"epoch": 0.7084845343010195,
"grad_norm": 1.0190331935882568,
"learning_rate": 4.4641364940885564e-05,
"loss": 2.468469543457031,
"step": 2050
},
{
"epoch": 0.7257646448937273,
"grad_norm": 1.1260298490524292,
"learning_rate": 4.4349460569864404e-05,
"loss": 2.387096252441406,
"step": 2100
},
{
"epoch": 0.7430447554864351,
"grad_norm": 0.9206457734107971,
"learning_rate": 4.4050823817793944e-05,
"loss": 2.317845458984375,
"step": 2150
},
{
"epoch": 0.760324866079143,
"grad_norm": 0.9262681007385254,
"learning_rate": 4.3745558591261295e-05,
"loss": 2.2218693542480468,
"step": 2200
},
{
"epoch": 0.7776049766718507,
"grad_norm": 0.9071998000144958,
"learning_rate": 4.3433771103140896e-05,
"loss": 2.153177032470703,
"step": 2250
},
{
"epoch": 0.7948850872645585,
"grad_norm": 0.9530045986175537,
"learning_rate": 4.3115569835639215e-05,
"loss": 2.109056243896484,
"step": 2300
},
{
"epoch": 0.8121651978572663,
"grad_norm": 0.8229272961616516,
"learning_rate": 4.279106550254981e-05,
"loss": 2.0358200073242188,
"step": 2350
},
{
"epoch": 0.8294453084499741,
"grad_norm": 0.8083502054214478,
"learning_rate": 4.246037101073202e-05,
"loss": 1.9781124877929688,
"step": 2400
},
{
"epoch": 0.8467254190426818,
"grad_norm": 0.8030848503112793,
"learning_rate": 4.21236014208265e-05,
"loss": 1.9251625061035156,
"step": 2450
},
{
"epoch": 0.8640055296353897,
"grad_norm": 0.7446616291999817,
"learning_rate": 4.178087390722151e-05,
"loss": 1.8921575927734375,
"step": 2500
},
{
"epoch": 0.8812856402280974,
"grad_norm": 0.7391892075538635,
"learning_rate": 4.1432307717283606e-05,
"loss": 1.84561279296875,
"step": 2550
},
{
"epoch": 0.8985657508208053,
"grad_norm": 0.7500560283660889,
"learning_rate": 4.107802412986721e-05,
"loss": 1.80940185546875,
"step": 2600
},
{
"epoch": 0.9158458614135131,
"grad_norm": 0.7202692031860352,
"learning_rate": 4.071814641311728e-05,
"loss": 1.7628054809570313,
"step": 2650
},
{
"epoch": 0.9331259720062208,
"grad_norm": 0.7352117896080017,
"learning_rate": 4.0352799781579786e-05,
"loss": 1.7312879943847657,
"step": 2700
},
{
"epoch": 0.9504060825989287,
"grad_norm": 0.6867756247520447,
"learning_rate": 3.9982111352635064e-05,
"loss": 1.71904052734375,
"step": 2750
},
{
"epoch": 0.9676861931916364,
"grad_norm": 0.6769801378250122,
"learning_rate": 3.960621010226906e-05,
"loss": 1.6703118896484375,
"step": 2800
},
{
"epoch": 0.9849663037843442,
"grad_norm": 0.6680055856704712,
"learning_rate": 3.922522682019785e-05,
"loss": 1.6196646118164062,
"step": 2850
},
{
"epoch": 1.002073613271125,
"grad_norm": 0.659245491027832,
"learning_rate": 3.883929406436118e-05,
"loss": 1.5895655822753907,
"step": 2900
},
{
"epoch": 1.0193537238638328,
"grad_norm": 0.6726027727127075,
"learning_rate": 3.844854611480072e-05,
"loss": 1.5674874877929688,
"step": 2950
},
{
"epoch": 1.0366338344565404,
"grad_norm": 0.6525683999061584,
"learning_rate": 3.805311892693917e-05,
"loss": 1.5413397216796876,
"step": 3000
},
{
"epoch": 1.0366338344565404,
"eval_loss": 0.6183949112892151,
"eval_runtime": 95.1248,
"eval_samples_per_second": 378.261,
"eval_steps_per_second": 0.988,
"step": 3000
},
{
"epoch": 1.0539139450492483,
"grad_norm": 0.6720120310783386,
"learning_rate": 3.765315008427641e-05,
"loss": 1.5179107666015625,
"step": 3050
},
{
"epoch": 1.0711940556419561,
"grad_norm": 0.6000827550888062,
"learning_rate": 3.724877875051918e-05,
"loss": 1.486010284423828,
"step": 3100
},
{
"epoch": 1.088474166234664,
"grad_norm": 0.6552499532699585,
"learning_rate": 3.6840145621161024e-05,
"loss": 1.469657440185547,
"step": 3150
},
{
"epoch": 1.1057542768273716,
"grad_norm": 0.6109654903411865,
"learning_rate": 3.642739287452914e-05,
"loss": 1.4606805419921876,
"step": 3200
},
{
"epoch": 1.1230343874200794,
"grad_norm": 0.6051790118217468,
"learning_rate": 3.601066412231542e-05,
"loss": 1.4501374816894532,
"step": 3250
},
{
"epoch": 1.1403144980127873,
"grad_norm": 0.7028324007987976,
"learning_rate": 3.5590104359608686e-05,
"loss": 1.412510986328125,
"step": 3300
},
{
"epoch": 1.1575946086054951,
"grad_norm": 0.635924220085144,
"learning_rate": 3.516585991444564e-05,
"loss": 1.3964031982421874,
"step": 3350
},
{
"epoch": 1.1748747191982027,
"grad_norm": 0.5996707677841187,
"learning_rate": 3.473807839689803e-05,
"loss": 1.3822164916992188,
"step": 3400
},
{
"epoch": 1.1921548297909106,
"grad_norm": 0.5824238657951355,
"learning_rate": 3.430690864771371e-05,
"loss": 1.3676600646972656,
"step": 3450
},
{
"epoch": 1.2094349403836184,
"grad_norm": 0.5993366241455078,
"learning_rate": 3.387250068652958e-05,
"loss": 1.3601907348632813,
"step": 3500
},
{
"epoch": 1.2267150509763263,
"grad_norm": 0.601268470287323,
"learning_rate": 3.343500565967422e-05,
"loss": 1.332314453125,
"step": 3550
},
{
"epoch": 1.2439951615690341,
"grad_norm": 0.6086856126785278,
"learning_rate": 3.299457578757866e-05,
"loss": 1.3296095275878905,
"step": 3600
},
{
"epoch": 1.261275272161742,
"grad_norm": 0.5556713342666626,
"learning_rate": 3.2551364311813316e-05,
"loss": 1.309993133544922,
"step": 3650
},
{
"epoch": 1.2785553827544496,
"grad_norm": 0.6057120561599731,
"learning_rate": 3.2105525441769676e-05,
"loss": 1.3083804321289063,
"step": 3700
},
{
"epoch": 1.2958354933471574,
"grad_norm": 0.5656481981277466,
"learning_rate": 3.165721430100527e-05,
"loss": 1.2851667785644532,
"step": 3750
},
{
"epoch": 1.3131156039398653,
"grad_norm": 0.5798958539962769,
"learning_rate": 3.120658687327052e-05,
"loss": 1.2679750061035155,
"step": 3800
},
{
"epoch": 1.3303957145325729,
"grad_norm": 0.5177021622657776,
"learning_rate": 3.0753799948236316e-05,
"loss": 1.265101776123047,
"step": 3850
},
{
"epoch": 1.3476758251252807,
"grad_norm": 0.5503849387168884,
"learning_rate": 3.0299011066941203e-05,
"loss": 1.2567321014404298,
"step": 3900
},
{
"epoch": 1.3649559357179886,
"grad_norm": 0.5518880486488342,
"learning_rate": 2.9842378466977128e-05,
"loss": 1.2535091400146485,
"step": 3950
},
{
"epoch": 1.3822360463106964,
"grad_norm": 0.6316933035850525,
"learning_rate": 2.93840610274328e-05,
"loss": 1.2241575622558594,
"step": 4000
},
{
"epoch": 1.3995161569034043,
"grad_norm": 0.5042502284049988,
"learning_rate": 2.8924218213613902e-05,
"loss": 1.2300173950195312,
"step": 4050
},
{
"epoch": 1.416796267496112,
"grad_norm": 0.6486298441886902,
"learning_rate": 2.8463010021559298e-05,
"loss": 1.21891845703125,
"step": 4100
},
{
"epoch": 1.4340763780888197,
"grad_norm": 0.5548715591430664,
"learning_rate": 2.800059692237261e-05,
"loss": 1.204437484741211,
"step": 4150
},
{
"epoch": 1.4513564886815276,
"grad_norm": 0.5416399836540222,
"learning_rate": 2.7537139806388455e-05,
"loss": 1.1822820281982422,
"step": 4200
},
{
"epoch": 1.4686365992742354,
"grad_norm": 0.5168971419334412,
"learning_rate": 2.7072799927192883e-05,
"loss": 1.1682017517089844,
"step": 4250
},
{
"epoch": 1.485916709866943,
"grad_norm": 0.5409218668937683,
"learning_rate": 2.6607738845517348e-05,
"loss": 1.1644657135009766,
"step": 4300
},
{
"epoch": 1.5031968204596509,
"grad_norm": 0.5903815627098083,
"learning_rate": 2.614211837302589e-05,
"loss": 1.165572052001953,
"step": 4350
},
{
"epoch": 1.5204769310523587,
"grad_norm": 0.5404263734817505,
"learning_rate": 2.567610051601497e-05,
"loss": 1.1485606384277345,
"step": 4400
},
{
"epoch": 1.5377570416450665,
"grad_norm": 0.6503647565841675,
"learning_rate": 2.520984741904554e-05,
"loss": 1.136939697265625,
"step": 4450
},
{
"epoch": 1.5550371522377744,
"grad_norm": 0.5339692831039429,
"learning_rate": 2.4743521308527125e-05,
"loss": 1.1341026306152344,
"step": 4500
},
{
"epoch": 1.5550371522377744,
"eval_loss": 0.45711469650268555,
"eval_runtime": 95.1691,
"eval_samples_per_second": 378.085,
"eval_steps_per_second": 0.988,
"step": 4500
},
{
"epoch": 1.5723172628304822,
"grad_norm": 0.5759618282318115,
"learning_rate": 2.4277284436273307e-05,
"loss": 1.126502151489258,
"step": 4550
},
{
"epoch": 1.58959737342319,
"grad_norm": 0.5793848633766174,
"learning_rate": 2.381129902304841e-05,
"loss": 1.1371284484863282,
"step": 4600
},
{
"epoch": 1.6068774840158977,
"grad_norm": 0.5430593490600586,
"learning_rate": 2.3345727202125056e-05,
"loss": 1.1101528930664062,
"step": 4650
},
{
"epoch": 1.6241575946086055,
"grad_norm": 0.5803630948066711,
"learning_rate": 2.2880730962872023e-05,
"loss": 1.1264077758789062,
"step": 4700
},
{
"epoch": 1.6414377052013132,
"grad_norm": 0.6316850781440735,
"learning_rate": 2.2416472094392323e-05,
"loss": 1.1182173919677734,
"step": 4750
},
{
"epoch": 1.658717815794021,
"grad_norm": 0.5736232399940491,
"learning_rate": 2.195311212923085e-05,
"loss": 1.100269775390625,
"step": 4800
},
{
"epoch": 1.6759979263867288,
"grad_norm": 0.5712565183639526,
"learning_rate": 2.149081228717133e-05,
"loss": 1.091978988647461,
"step": 4850
},
{
"epoch": 1.6932780369794367,
"grad_norm": 0.5494542717933655,
"learning_rate": 2.1029733419142128e-05,
"loss": 1.095367431640625,
"step": 4900
},
{
"epoch": 1.7105581475721445,
"grad_norm": 0.4688451290130615,
"learning_rate": 2.0570035951250306e-05,
"loss": 1.0880941009521485,
"step": 4950
},
{
"epoch": 1.7278382581648524,
"grad_norm": 0.5209280848503113,
"learning_rate": 2.0111879828963616e-05,
"loss": 1.0774014282226563,
"step": 5000
},
{
"epoch": 1.7451183687575602,
"grad_norm": 0.530224621295929,
"learning_rate": 1.9655424461459586e-05,
"loss": 1.0644034576416015,
"step": 5050
},
{
"epoch": 1.7623984793502678,
"grad_norm": 0.5463877320289612,
"learning_rate": 1.920082866616132e-05,
"loss": 1.0841154479980468,
"step": 5100
},
{
"epoch": 1.7796785899429757,
"grad_norm": 0.5256918668746948,
"learning_rate": 1.8748250613479124e-05,
"loss": 1.0540755462646485,
"step": 5150
},
{
"epoch": 1.7969587005356833,
"grad_norm": 0.5257004499435425,
"learning_rate": 1.829784777177723e-05,
"loss": 1.0588815307617188,
"step": 5200
},
{
"epoch": 1.8142388111283911,
"grad_norm": 0.5281515121459961,
"learning_rate": 1.784977685258492e-05,
"loss": 1.045956573486328,
"step": 5250
},
{
"epoch": 1.831518921721099,
"grad_norm": 0.4766943156719208,
"learning_rate": 1.7404193756070763e-05,
"loss": 1.0421023559570313,
"step": 5300
},
{
"epoch": 1.8487990323138068,
"grad_norm": 0.501384437084198,
"learning_rate": 1.696125351679938e-05,
"loss": 1.042171630859375,
"step": 5350
},
{
"epoch": 1.8660791429065147,
"grad_norm": 0.5489823222160339,
"learning_rate": 1.6521110249789228e-05,
"loss": 1.0536033630371093,
"step": 5400
},
{
"epoch": 1.8833592534992225,
"grad_norm": 0.4959055185317993,
"learning_rate": 1.6083917096890385e-05,
"loss": 1.0371237945556642,
"step": 5450
},
{
"epoch": 1.9006393640919304,
"grad_norm": 0.524411141872406,
"learning_rate": 1.564982617350096e-05,
"loss": 1.0318540954589843,
"step": 5500
},
{
"epoch": 1.917919474684638,
"grad_norm": 1.0833576917648315,
"learning_rate": 1.5218988515640548e-05,
"loss": 1.0373517608642577,
"step": 5550
},
{
"epoch": 1.9351995852773458,
"grad_norm": 0.565988302230835,
"learning_rate": 1.4791554027399398e-05,
"loss": 1.0258339691162108,
"step": 5600
},
{
"epoch": 1.9524796958700534,
"grad_norm": 0.48943427205085754,
"learning_rate": 1.4367671428781243e-05,
"loss": 1.025996551513672,
"step": 5650
},
{
"epoch": 1.9697598064627613,
"grad_norm": 0.5613059401512146,
"learning_rate": 1.3947488203958265e-05,
"loss": 1.0235104370117187,
"step": 5700
},
{
"epoch": 1.9870399170554691,
"grad_norm": 0.4791814386844635,
"learning_rate": 1.3531150549955943e-05,
"loss": 1.0148072814941407,
"step": 5750
},
{
"epoch": 2.00414722654225,
"grad_norm": 0.5621808767318726,
"learning_rate": 1.31188033257858e-05,
"loss": 1.0037516021728516,
"step": 5800
},
{
"epoch": 2.021427337134958,
"grad_norm": 0.5978628396987915,
"learning_rate": 1.2710590002043729e-05,
"loss": 1.0024045562744142,
"step": 5850
},
{
"epoch": 2.0387074477276657,
"grad_norm": 0.5256090760231018,
"learning_rate": 1.2306652610991288e-05,
"loss": 0.9898030090332032,
"step": 5900
},
{
"epoch": 2.055987558320373,
"grad_norm": 0.5156893134117126,
"learning_rate": 1.1907131697137546e-05,
"loss": 0.9952345275878907,
"step": 5950
},
{
"epoch": 2.073267668913081,
"grad_norm": 0.4754015803337097,
"learning_rate": 1.1512166268338542e-05,
"loss": 0.991960678100586,
"step": 6000
},
{
"epoch": 2.073267668913081,
"eval_loss": 0.3997305929660797,
"eval_runtime": 95.1558,
"eval_samples_per_second": 378.138,
"eval_steps_per_second": 0.988,
"step": 6000
},
{
"epoch": 2.0905477795057887,
"grad_norm": 0.5414108037948608,
"learning_rate": 1.1121893747431378e-05,
"loss": 0.9846409606933594,
"step": 6050
},
{
"epoch": 2.1078278900984966,
"grad_norm": 0.529751181602478,
"learning_rate": 1.0736449924419822e-05,
"loss": 0.9891058349609375,
"step": 6100
},
{
"epoch": 2.1251080006912044,
"grad_norm": 0.4818967878818512,
"learning_rate": 1.0355968909228054e-05,
"loss": 0.9866749572753907,
"step": 6150
},
{
"epoch": 2.1423881112839123,
"grad_norm": 0.5096397399902344,
"learning_rate": 9.980583085038895e-06,
"loss": 0.9821431732177734,
"step": 6200
},
{
"epoch": 2.15966822187662,
"grad_norm": 0.5606327056884766,
"learning_rate": 9.610423062232912e-06,
"loss": 0.9854959869384765,
"step": 6250
},
{
"epoch": 2.176948332469328,
"grad_norm": 0.514130711555481,
"learning_rate": 9.245617632944348e-06,
"loss": 0.9793372344970703,
"step": 6300
},
{
"epoch": 2.194228443062036,
"grad_norm": 0.5572317242622375,
"learning_rate": 8.886293726249562e-06,
"loss": 0.9789413452148438,
"step": 6350
},
{
"epoch": 2.211508553654743,
"grad_norm": 0.5003585815429688,
"learning_rate": 8.532576364003904e-06,
"loss": 0.9631109619140625,
"step": 6400
},
{
"epoch": 2.228788664247451,
"grad_norm": 0.5201903581619263,
"learning_rate": 8.184588617341976e-06,
"loss": 0.9783859252929688,
"step": 6450
},
{
"epoch": 2.246068774840159,
"grad_norm": 0.5099287033081055,
"learning_rate": 7.842451563856742e-06,
"loss": 0.9678781127929688,
"step": 6500
},
{
"epoch": 2.2633488854328667,
"grad_norm": 0.46017828583717346,
"learning_rate": 7.506284245472225e-06,
"loss": 0.9522227478027344,
"step": 6550
},
{
"epoch": 2.2806289960255746,
"grad_norm": 0.45564213395118713,
"learning_rate": 7.176203627024514e-06,
"loss": 0.9609327697753907,
"step": 6600
},
{
"epoch": 2.2979091066182824,
"grad_norm": 0.5416494607925415,
"learning_rate": 6.852324555565404e-06,
"loss": 0.966263427734375,
"step": 6650
},
{
"epoch": 2.3151892172109902,
"grad_norm": 0.5636786818504333,
"learning_rate": 6.53475972040295e-06,
"loss": 0.9645524597167969,
"step": 6700
},
{
"epoch": 2.332469327803698,
"grad_norm": 0.5431479215621948,
"learning_rate": 6.22361961389277e-06,
"loss": 0.9668412780761719,
"step": 6750
},
{
"epoch": 2.3497494383964055,
"grad_norm": 0.4858858585357666,
"learning_rate": 5.919012492993706e-06,
"loss": 0.9710499572753907,
"step": 6800
},
{
"epoch": 2.3670295489891133,
"grad_norm": 0.4986262619495392,
"learning_rate": 5.621044341601342e-06,
"loss": 0.9668563842773438,
"step": 6850
},
{
"epoch": 2.384309659581821,
"grad_norm": 0.5012155771255493,
"learning_rate": 5.329818833672273e-06,
"loss": 0.9694512939453125,
"step": 6900
},
{
"epoch": 2.401589770174529,
"grad_norm": 0.5899285078048706,
"learning_rate": 5.045437297152245e-06,
"loss": 0.9606761169433594,
"step": 6950
},
{
"epoch": 2.418869880767237,
"grad_norm": 0.4638988971710205,
"learning_rate": 4.767998678720448e-06,
"loss": 0.957213134765625,
"step": 7000
},
{
"epoch": 2.4361499913599447,
"grad_norm": 0.5025919079780579,
"learning_rate": 4.4975995093623266e-06,
"loss": 0.9607756042480469,
"step": 7050
},
{
"epoch": 2.4534301019526525,
"grad_norm": 0.5194385051727295,
"learning_rate": 4.234333870783014e-06,
"loss": 0.9540797424316406,
"step": 7100
},
{
"epoch": 2.4707102125453604,
"grad_norm": 0.5033853650093079,
"learning_rate": 3.97829336267283e-06,
"loss": 0.957088851928711,
"step": 7150
},
{
"epoch": 2.4879903231380682,
"grad_norm": 0.4493975341320038,
"learning_rate": 3.729567070836437e-06,
"loss": 0.9415164947509765,
"step": 7200
},
{
"epoch": 2.505270433730776,
"grad_norm": 0.5417333841323853,
"learning_rate": 3.488241536196643e-06,
"loss": 0.9572615051269531,
"step": 7250
},
{
"epoch": 2.522550544323484,
"grad_norm": 0.5607247352600098,
"learning_rate": 3.254400724683673e-06,
"loss": 0.9510250854492187,
"step": 7300
},
{
"epoch": 2.5398306549161913,
"grad_norm": 0.5300142765045166,
"learning_rate": 3.0281259980203757e-06,
"loss": 0.9523114776611328,
"step": 7350
},
{
"epoch": 2.557110765508899,
"grad_norm": 0.5919684171676636,
"learning_rate": 2.809496085413496e-06,
"loss": 0.9338645172119141,
"step": 7400
},
{
"epoch": 2.574390876101607,
"grad_norm": 0.46930092573165894,
"learning_rate": 2.5985870561609448e-06,
"loss": 0.9432379150390625,
"step": 7450
},
{
"epoch": 2.591670986694315,
"grad_norm": 0.5667592883110046,
"learning_rate": 2.3954722931845002e-06,
"loss": 0.9356614685058594,
"step": 7500
},
{
"epoch": 2.591670986694315,
"eval_loss": 0.38118186593055725,
"eval_runtime": 95.0674,
"eval_samples_per_second": 378.489,
"eval_steps_per_second": 0.989,
"step": 7500
},
{
"epoch": 2.6089510972870227,
"grad_norm": 0.48274651169776917,
"learning_rate": 2.2002224674972676e-06,
"loss": 0.9377688598632813,
"step": 7550
},
{
"epoch": 2.6262312078797305,
"grad_norm": 0.4731481969356537,
"learning_rate": 2.012905513614588e-06,
"loss": 0.9340367126464844,
"step": 7600
},
{
"epoch": 2.6435113184724384,
"grad_norm": 0.51893550157547,
"learning_rate": 1.8335866059172258e-06,
"loss": 0.951480712890625,
"step": 7650
},
{
"epoch": 2.6607914290651458,
"grad_norm": 0.5630462765693665,
"learning_rate": 1.6623281359747806e-06,
"loss": 0.9414936828613282,
"step": 7700
},
{
"epoch": 2.6780715396578536,
"grad_norm": 0.49863728880882263,
"learning_rate": 1.499189690837413e-06,
"loss": 0.9399469757080078,
"step": 7750
},
{
"epoch": 2.6953516502505614,
"grad_norm": 0.4599184989929199,
"learning_rate": 1.344228032303349e-06,
"loss": 0.9424849700927734,
"step": 7800
},
{
"epoch": 2.7126317608432693,
"grad_norm": 0.5391411781311035,
"learning_rate": 1.1974970771693543e-06,
"loss": 0.9474674224853515,
"step": 7850
},
{
"epoch": 2.729911871435977,
"grad_norm": 0.5229924917221069,
"learning_rate": 1.0590478784711561e-06,
"loss": 0.9289096069335937,
"step": 7900
},
{
"epoch": 2.747191982028685,
"grad_norm": 0.5078772306442261,
"learning_rate": 9.28928607720217e-07,
"loss": 0.9420416259765625,
"step": 7950
},
{
"epoch": 2.764472092621393,
"grad_norm": 0.5175345540046692,
"learning_rate": 8.071845381431103e-07,
"loss": 0.9313079833984375,
"step": 8000
},
{
"epoch": 2.7817522032141007,
"grad_norm": 0.49628084897994995,
"learning_rate": 6.938580289293339e-07,
"loss": 0.9550038146972656,
"step": 8050
},
{
"epoch": 2.7990323138068085,
"grad_norm": 0.5086449980735779,
"learning_rate": 5.889885104929965e-07,
"loss": 0.9414044952392578,
"step": 8100
},
{
"epoch": 2.8163124243995163,
"grad_norm": 0.4850241243839264,
"learning_rate": 4.926124707535395e-07,
"loss": 0.9482218933105468,
"step": 8150
},
{
"epoch": 2.833592534992224,
"grad_norm": 0.5032879114151001,
"learning_rate": 4.0476344244027023e-07,
"loss": 0.9468795776367187,
"step": 8200
},
{
"epoch": 2.8508726455849316,
"grad_norm": 0.49061650037765503,
"learning_rate": 3.254719914251081e-07,
"loss": 0.9405175018310546,
"step": 8250
},
{
"epoch": 2.8681527561776394,
"grad_norm": 0.5416680574417114,
"learning_rate": 2.547657060875924e-07,
"loss": 0.9449867248535156,
"step": 8300
},
{
"epoch": 2.8854328667703473,
"grad_norm": 0.5759839415550232,
"learning_rate": 1.9266918771590204e-07,
"loss": 0.9381349945068359,
"step": 8350
},
{
"epoch": 2.902712977363055,
"grad_norm": 0.5406273603439331,
"learning_rate": 1.392040419471552e-07,
"loss": 0.9400007629394531,
"step": 8400
},
{
"epoch": 2.919993087955763,
"grad_norm": 0.503950834274292,
"learning_rate": 9.438887125002293e-08,
"loss": 0.9473369598388672,
"step": 8450
},
{
"epoch": 2.937273198548471,
"grad_norm": 0.5274893641471863,
"learning_rate": 5.823926845227312e-08,
"loss": 0.9627262115478515,
"step": 8500
},
{
"epoch": 2.9545533091411786,
"grad_norm": 0.5260080695152283,
"learning_rate": 3.076781131543249e-08,
"loss": 0.9508057403564453,
"step": 8550
},
{
"epoch": 2.971833419733886,
"grad_norm": 0.5042856335639954,
"learning_rate": 1.1984058158542866e-08,
"loss": 0.9278289794921875,
"step": 8600
},
{
"epoch": 2.989113530326594,
"grad_norm": 0.4784366488456726,
"learning_rate": 1.8945445324769494e-09,
"loss": 0.9391728210449218,
"step": 8650
},
{
"epoch": 3.0,
"step": 8682,
"total_flos": 1.83225980060015e+18,
"train_loss": 2.536729412566569,
"train_runtime": 19314.0993,
"train_samples_per_second": 115.053,
"train_steps_per_second": 0.45
}
],
"logging_steps": 50,
"max_steps": 8682,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.83225980060015e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}