{ "best_global_step": 7500, "best_metric": 0.38118186593055725, "best_model_checkpoint": "./byt5_leetspeak_v3/checkpoint-7500", "epoch": 3.0, "eval_steps": 1500, "global_step": 8682, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003456022118541559, "grad_norm": 7.998739719390869, "learning_rate": 0.0, "loss": 21.331615447998047, "step": 1 }, { "epoch": 0.017280110592707794, "grad_norm": 4.945672512054443, "learning_rate": 9.386973180076629e-06, "loss": 20.682008081552933, "step": 50 }, { "epoch": 0.03456022118541559, "grad_norm": 3.5129992961883545, "learning_rate": 1.896551724137931e-05, "loss": 17.8103759765625, "step": 100 }, { "epoch": 0.05184033177812338, "grad_norm": 3.4432644844055176, "learning_rate": 2.8544061302681996e-05, "loss": 15.057559814453125, "step": 150 }, { "epoch": 0.06912044237083118, "grad_norm": 3.018535852432251, "learning_rate": 3.8122605363984674e-05, "loss": 13.59551513671875, "step": 200 }, { "epoch": 0.08640055296353896, "grad_norm": 2.247919797897339, "learning_rate": 4.770114942528736e-05, "loss": 12.362723388671874, "step": 250 }, { "epoch": 0.10368066355624676, "grad_norm": 2.297297239303589, "learning_rate": 4.9997487868649304e-05, "loss": 11.349617919921876, "step": 300 }, { "epoch": 0.12096077414895455, "grad_norm": 2.3222248554229736, "learning_rate": 4.9986528723171024e-05, "loss": 10.5994140625, "step": 350 }, { "epoch": 0.13824088474166235, "grad_norm": 2.342911958694458, "learning_rate": 4.996687585561939e-05, "loss": 10.017122802734375, "step": 400 }, { "epoch": 0.15552099533437014, "grad_norm": 2.3807644844055176, "learning_rate": 4.993853610394178e-05, "loss": 9.509613037109375, "step": 450 }, { "epoch": 0.17280110592707792, "grad_norm": 2.3903160095214844, "learning_rate": 4.9901519328568466e-05, "loss": 9.067050170898437, "step": 500 }, { "epoch": 0.19008121651978574, "grad_norm": 2.3767359256744385, "learning_rate": 4.985583840898188e-05, "loss": 8.662135620117187, "step": 550 }, { "epoch": 0.20736132711249353, "grad_norm": 2.3853607177734375, "learning_rate": 4.98015092392353e-05, "loss": 8.290029296875, "step": 600 }, { "epoch": 0.2246414377052013, "grad_norm": 2.372498035430908, "learning_rate": 4.973855072242276e-05, "loss": 7.937987060546875, "step": 650 }, { "epoch": 0.2419215482979091, "grad_norm": 2.359043836593628, "learning_rate": 4.966698476410199e-05, "loss": 7.60522705078125, "step": 700 }, { "epoch": 0.2592016588906169, "grad_norm": 2.353739023208618, "learning_rate": 4.9586836264672666e-05, "loss": 7.296792602539062, "step": 750 }, { "epoch": 0.2764817694833247, "grad_norm": 2.335012435913086, "learning_rate": 4.9498133110712644e-05, "loss": 6.98702392578125, "step": 800 }, { "epoch": 0.2937618800760325, "grad_norm": 2.289266586303711, "learning_rate": 4.940090616527521e-05, "loss": 6.709508666992187, "step": 850 }, { "epoch": 0.3110419906687403, "grad_norm": 2.252319097518921, "learning_rate": 4.929518925715071e-05, "loss": 6.44697021484375, "step": 900 }, { "epoch": 0.32832210126144806, "grad_norm": 2.216578483581543, "learning_rate": 4.9181019169096285e-05, "loss": 6.1863818359375, "step": 950 }, { "epoch": 0.34560221185415585, "grad_norm": 2.1667819023132324, "learning_rate": 4.90584356250378e-05, "loss": 5.913253173828125, "step": 1000 }, { "epoch": 0.36288232244686364, "grad_norm": 2.1393256187438965, "learning_rate": 4.892748127624845e-05, "loss": 5.687144165039062, "step": 1050 }, { "epoch": 0.3801624330395715, "grad_norm": 2.1089439392089844, "learning_rate": 4.878820168650884e-05, "loss": 5.421361083984375, "step": 1100 }, { "epoch": 0.39744254363227927, "grad_norm": 2.0389111042022705, "learning_rate": 4.864064531625366e-05, "loss": 5.215165405273438, "step": 1150 }, { "epoch": 0.41472265422498705, "grad_norm": 1.9905728101730347, "learning_rate": 4.8484863505710585e-05, "loss": 4.993418273925781, "step": 1200 }, { "epoch": 0.43200276481769484, "grad_norm": 1.915732502937317, "learning_rate": 4.8320910457037105e-05, "loss": 4.796431884765625, "step": 1250 }, { "epoch": 0.4492828754104026, "grad_norm": 1.876905083656311, "learning_rate": 4.814884321546163e-05, "loss": 4.600077819824219, "step": 1300 }, { "epoch": 0.4665629860031104, "grad_norm": 1.8127694129943848, "learning_rate": 4.796872164943538e-05, "loss": 4.390880737304688, "step": 1350 }, { "epoch": 0.4838430965958182, "grad_norm": 1.749795913696289, "learning_rate": 4.778060842980199e-05, "loss": 4.223977661132812, "step": 1400 }, { "epoch": 0.501123207188526, "grad_norm": 1.7290434837341309, "learning_rate": 4.758456900799202e-05, "loss": 4.044484252929688, "step": 1450 }, { "epoch": 0.5184033177812338, "grad_norm": 1.658087968826294, "learning_rate": 4.738067159325005e-05, "loss": 3.873255615234375, "step": 1500 }, { "epoch": 0.5184033177812338, "eval_loss": 1.5466166734695435, "eval_runtime": 95.1917, "eval_samples_per_second": 377.995, "eval_steps_per_second": 0.987, "step": 1500 }, { "epoch": 0.5356834283739416, "grad_norm": 1.5666825771331787, "learning_rate": 4.716898712890218e-05, "loss": 3.697227783203125, "step": 1550 }, { "epoch": 0.5529635389666494, "grad_norm": 1.5051823854446411, "learning_rate": 4.6949589267672256e-05, "loss": 3.5488134765625, "step": 1600 }, { "epoch": 0.5702436495593571, "grad_norm": 1.4208341836929321, "learning_rate": 4.6722554346055446e-05, "loss": 3.41210693359375, "step": 1650 }, { "epoch": 0.587523760152065, "grad_norm": 1.4499037265777588, "learning_rate": 4.648796135775798e-05, "loss": 3.27009521484375, "step": 1700 }, { "epoch": 0.6048038707447728, "grad_norm": 1.3428349494934082, "learning_rate": 4.624589192621235e-05, "loss": 3.14307373046875, "step": 1750 }, { "epoch": 0.6220839813374806, "grad_norm": 1.2956618070602417, "learning_rate": 4.599643027617758e-05, "loss": 3.0137338256835937, "step": 1800 }, { "epoch": 0.6393640919301884, "grad_norm": 1.2256124019622803, "learning_rate": 4.573966320443433e-05, "loss": 2.8847576904296877, "step": 1850 }, { "epoch": 0.6566442025228961, "grad_norm": 1.1832093000411987, "learning_rate": 4.547568004958518e-05, "loss": 2.783875732421875, "step": 1900 }, { "epoch": 0.673924313115604, "grad_norm": 1.1249213218688965, "learning_rate": 4.520457266097046e-05, "loss": 2.658479919433594, "step": 1950 }, { "epoch": 0.6912044237083117, "grad_norm": 1.1056411266326904, "learning_rate": 4.492643536671052e-05, "loss": 2.5734375, "step": 2000 }, { "epoch": 0.7084845343010195, "grad_norm": 1.0190331935882568, "learning_rate": 4.4641364940885564e-05, "loss": 2.468469543457031, "step": 2050 }, { "epoch": 0.7257646448937273, "grad_norm": 1.1260298490524292, "learning_rate": 4.4349460569864404e-05, "loss": 2.387096252441406, "step": 2100 }, { "epoch": 0.7430447554864351, "grad_norm": 0.9206457734107971, "learning_rate": 4.4050823817793944e-05, "loss": 2.317845458984375, "step": 2150 }, { "epoch": 0.760324866079143, "grad_norm": 0.9262681007385254, "learning_rate": 4.3745558591261295e-05, "loss": 2.2218693542480468, "step": 2200 }, { "epoch": 0.7776049766718507, "grad_norm": 0.9071998000144958, "learning_rate": 4.3433771103140896e-05, "loss": 2.153177032470703, "step": 2250 }, { "epoch": 0.7948850872645585, "grad_norm": 0.9530045986175537, "learning_rate": 4.3115569835639215e-05, "loss": 2.109056243896484, "step": 2300 }, { "epoch": 0.8121651978572663, "grad_norm": 0.8229272961616516, "learning_rate": 4.279106550254981e-05, "loss": 2.0358200073242188, "step": 2350 }, { "epoch": 0.8294453084499741, "grad_norm": 0.8083502054214478, "learning_rate": 4.246037101073202e-05, "loss": 1.9781124877929688, "step": 2400 }, { "epoch": 0.8467254190426818, "grad_norm": 0.8030848503112793, "learning_rate": 4.21236014208265e-05, "loss": 1.9251625061035156, "step": 2450 }, { "epoch": 0.8640055296353897, "grad_norm": 0.7446616291999817, "learning_rate": 4.178087390722151e-05, "loss": 1.8921575927734375, "step": 2500 }, { "epoch": 0.8812856402280974, "grad_norm": 0.7391892075538635, "learning_rate": 4.1432307717283606e-05, "loss": 1.84561279296875, "step": 2550 }, { "epoch": 0.8985657508208053, "grad_norm": 0.7500560283660889, "learning_rate": 4.107802412986721e-05, "loss": 1.80940185546875, "step": 2600 }, { "epoch": 0.9158458614135131, "grad_norm": 0.7202692031860352, "learning_rate": 4.071814641311728e-05, "loss": 1.7628054809570313, "step": 2650 }, { "epoch": 0.9331259720062208, "grad_norm": 0.7352117896080017, "learning_rate": 4.0352799781579786e-05, "loss": 1.7312879943847657, "step": 2700 }, { "epoch": 0.9504060825989287, "grad_norm": 0.6867756247520447, "learning_rate": 3.9982111352635064e-05, "loss": 1.71904052734375, "step": 2750 }, { "epoch": 0.9676861931916364, "grad_norm": 0.6769801378250122, "learning_rate": 3.960621010226906e-05, "loss": 1.6703118896484375, "step": 2800 }, { "epoch": 0.9849663037843442, "grad_norm": 0.6680055856704712, "learning_rate": 3.922522682019785e-05, "loss": 1.6196646118164062, "step": 2850 }, { "epoch": 1.002073613271125, "grad_norm": 0.659245491027832, "learning_rate": 3.883929406436118e-05, "loss": 1.5895655822753907, "step": 2900 }, { "epoch": 1.0193537238638328, "grad_norm": 0.6726027727127075, "learning_rate": 3.844854611480072e-05, "loss": 1.5674874877929688, "step": 2950 }, { "epoch": 1.0366338344565404, "grad_norm": 0.6525683999061584, "learning_rate": 3.805311892693917e-05, "loss": 1.5413397216796876, "step": 3000 }, { "epoch": 1.0366338344565404, "eval_loss": 0.6183949112892151, "eval_runtime": 95.1248, "eval_samples_per_second": 378.261, "eval_steps_per_second": 0.988, "step": 3000 }, { "epoch": 1.0539139450492483, "grad_norm": 0.6720120310783386, "learning_rate": 3.765315008427641e-05, "loss": 1.5179107666015625, "step": 3050 }, { "epoch": 1.0711940556419561, "grad_norm": 0.6000827550888062, "learning_rate": 3.724877875051918e-05, "loss": 1.486010284423828, "step": 3100 }, { "epoch": 1.088474166234664, "grad_norm": 0.6552499532699585, "learning_rate": 3.6840145621161024e-05, "loss": 1.469657440185547, "step": 3150 }, { "epoch": 1.1057542768273716, "grad_norm": 0.6109654903411865, "learning_rate": 3.642739287452914e-05, "loss": 1.4606805419921876, "step": 3200 }, { "epoch": 1.1230343874200794, "grad_norm": 0.6051790118217468, "learning_rate": 3.601066412231542e-05, "loss": 1.4501374816894532, "step": 3250 }, { "epoch": 1.1403144980127873, "grad_norm": 0.7028324007987976, "learning_rate": 3.5590104359608686e-05, "loss": 1.412510986328125, "step": 3300 }, { "epoch": 1.1575946086054951, "grad_norm": 0.635924220085144, "learning_rate": 3.516585991444564e-05, "loss": 1.3964031982421874, "step": 3350 }, { "epoch": 1.1748747191982027, "grad_norm": 0.5996707677841187, "learning_rate": 3.473807839689803e-05, "loss": 1.3822164916992188, "step": 3400 }, { "epoch": 1.1921548297909106, "grad_norm": 0.5824238657951355, "learning_rate": 3.430690864771371e-05, "loss": 1.3676600646972656, "step": 3450 }, { "epoch": 1.2094349403836184, "grad_norm": 0.5993366241455078, "learning_rate": 3.387250068652958e-05, "loss": 1.3601907348632813, "step": 3500 }, { "epoch": 1.2267150509763263, "grad_norm": 0.601268470287323, "learning_rate": 3.343500565967422e-05, "loss": 1.332314453125, "step": 3550 }, { "epoch": 1.2439951615690341, "grad_norm": 0.6086856126785278, "learning_rate": 3.299457578757866e-05, "loss": 1.3296095275878905, "step": 3600 }, { "epoch": 1.261275272161742, "grad_norm": 0.5556713342666626, "learning_rate": 3.2551364311813316e-05, "loss": 1.309993133544922, "step": 3650 }, { "epoch": 1.2785553827544496, "grad_norm": 0.6057120561599731, "learning_rate": 3.2105525441769676e-05, "loss": 1.3083804321289063, "step": 3700 }, { "epoch": 1.2958354933471574, "grad_norm": 0.5656481981277466, "learning_rate": 3.165721430100527e-05, "loss": 1.2851667785644532, "step": 3750 }, { "epoch": 1.3131156039398653, "grad_norm": 0.5798958539962769, "learning_rate": 3.120658687327052e-05, "loss": 1.2679750061035155, "step": 3800 }, { "epoch": 1.3303957145325729, "grad_norm": 0.5177021622657776, "learning_rate": 3.0753799948236316e-05, "loss": 1.265101776123047, "step": 3850 }, { "epoch": 1.3476758251252807, "grad_norm": 0.5503849387168884, "learning_rate": 3.0299011066941203e-05, "loss": 1.2567321014404298, "step": 3900 }, { "epoch": 1.3649559357179886, "grad_norm": 0.5518880486488342, "learning_rate": 2.9842378466977128e-05, "loss": 1.2535091400146485, "step": 3950 }, { "epoch": 1.3822360463106964, "grad_norm": 0.6316933035850525, "learning_rate": 2.93840610274328e-05, "loss": 1.2241575622558594, "step": 4000 }, { "epoch": 1.3995161569034043, "grad_norm": 0.5042502284049988, "learning_rate": 2.8924218213613902e-05, "loss": 1.2300173950195312, "step": 4050 }, { "epoch": 1.416796267496112, "grad_norm": 0.6486298441886902, "learning_rate": 2.8463010021559298e-05, "loss": 1.21891845703125, "step": 4100 }, { "epoch": 1.4340763780888197, "grad_norm": 0.5548715591430664, "learning_rate": 2.800059692237261e-05, "loss": 1.204437484741211, "step": 4150 }, { "epoch": 1.4513564886815276, "grad_norm": 0.5416399836540222, "learning_rate": 2.7537139806388455e-05, "loss": 1.1822820281982422, "step": 4200 }, { "epoch": 1.4686365992742354, "grad_norm": 0.5168971419334412, "learning_rate": 2.7072799927192883e-05, "loss": 1.1682017517089844, "step": 4250 }, { "epoch": 1.485916709866943, "grad_norm": 0.5409218668937683, "learning_rate": 2.6607738845517348e-05, "loss": 1.1644657135009766, "step": 4300 }, { "epoch": 1.5031968204596509, "grad_norm": 0.5903815627098083, "learning_rate": 2.614211837302589e-05, "loss": 1.165572052001953, "step": 4350 }, { "epoch": 1.5204769310523587, "grad_norm": 0.5404263734817505, "learning_rate": 2.567610051601497e-05, "loss": 1.1485606384277345, "step": 4400 }, { "epoch": 1.5377570416450665, "grad_norm": 0.6503647565841675, "learning_rate": 2.520984741904554e-05, "loss": 1.136939697265625, "step": 4450 }, { "epoch": 1.5550371522377744, "grad_norm": 0.5339692831039429, "learning_rate": 2.4743521308527125e-05, "loss": 1.1341026306152344, "step": 4500 }, { "epoch": 1.5550371522377744, "eval_loss": 0.45711469650268555, "eval_runtime": 95.1691, "eval_samples_per_second": 378.085, "eval_steps_per_second": 0.988, "step": 4500 }, { "epoch": 1.5723172628304822, "grad_norm": 0.5759618282318115, "learning_rate": 2.4277284436273307e-05, "loss": 1.126502151489258, "step": 4550 }, { "epoch": 1.58959737342319, "grad_norm": 0.5793848633766174, "learning_rate": 2.381129902304841e-05, "loss": 1.1371284484863282, "step": 4600 }, { "epoch": 1.6068774840158977, "grad_norm": 0.5430593490600586, "learning_rate": 2.3345727202125056e-05, "loss": 1.1101528930664062, "step": 4650 }, { "epoch": 1.6241575946086055, "grad_norm": 0.5803630948066711, "learning_rate": 2.2880730962872023e-05, "loss": 1.1264077758789062, "step": 4700 }, { "epoch": 1.6414377052013132, "grad_norm": 0.6316850781440735, "learning_rate": 2.2416472094392323e-05, "loss": 1.1182173919677734, "step": 4750 }, { "epoch": 1.658717815794021, "grad_norm": 0.5736232399940491, "learning_rate": 2.195311212923085e-05, "loss": 1.100269775390625, "step": 4800 }, { "epoch": 1.6759979263867288, "grad_norm": 0.5712565183639526, "learning_rate": 2.149081228717133e-05, "loss": 1.091978988647461, "step": 4850 }, { "epoch": 1.6932780369794367, "grad_norm": 0.5494542717933655, "learning_rate": 2.1029733419142128e-05, "loss": 1.095367431640625, "step": 4900 }, { "epoch": 1.7105581475721445, "grad_norm": 0.4688451290130615, "learning_rate": 2.0570035951250306e-05, "loss": 1.0880941009521485, "step": 4950 }, { "epoch": 1.7278382581648524, "grad_norm": 0.5209280848503113, "learning_rate": 2.0111879828963616e-05, "loss": 1.0774014282226563, "step": 5000 }, { "epoch": 1.7451183687575602, "grad_norm": 0.530224621295929, "learning_rate": 1.9655424461459586e-05, "loss": 1.0644034576416015, "step": 5050 }, { "epoch": 1.7623984793502678, "grad_norm": 0.5463877320289612, "learning_rate": 1.920082866616132e-05, "loss": 1.0841154479980468, "step": 5100 }, { "epoch": 1.7796785899429757, "grad_norm": 0.5256918668746948, "learning_rate": 1.8748250613479124e-05, "loss": 1.0540755462646485, "step": 5150 }, { "epoch": 1.7969587005356833, "grad_norm": 0.5257004499435425, "learning_rate": 1.829784777177723e-05, "loss": 1.0588815307617188, "step": 5200 }, { "epoch": 1.8142388111283911, "grad_norm": 0.5281515121459961, "learning_rate": 1.784977685258492e-05, "loss": 1.045956573486328, "step": 5250 }, { "epoch": 1.831518921721099, "grad_norm": 0.4766943156719208, "learning_rate": 1.7404193756070763e-05, "loss": 1.0421023559570313, "step": 5300 }, { "epoch": 1.8487990323138068, "grad_norm": 0.501384437084198, "learning_rate": 1.696125351679938e-05, "loss": 1.042171630859375, "step": 5350 }, { "epoch": 1.8660791429065147, "grad_norm": 0.5489823222160339, "learning_rate": 1.6521110249789228e-05, "loss": 1.0536033630371093, "step": 5400 }, { "epoch": 1.8833592534992225, "grad_norm": 0.4959055185317993, "learning_rate": 1.6083917096890385e-05, "loss": 1.0371237945556642, "step": 5450 }, { "epoch": 1.9006393640919304, "grad_norm": 0.524411141872406, "learning_rate": 1.564982617350096e-05, "loss": 1.0318540954589843, "step": 5500 }, { "epoch": 1.917919474684638, "grad_norm": 1.0833576917648315, "learning_rate": 1.5218988515640548e-05, "loss": 1.0373517608642577, "step": 5550 }, { "epoch": 1.9351995852773458, "grad_norm": 0.565988302230835, "learning_rate": 1.4791554027399398e-05, "loss": 1.0258339691162108, "step": 5600 }, { "epoch": 1.9524796958700534, "grad_norm": 0.48943427205085754, "learning_rate": 1.4367671428781243e-05, "loss": 1.025996551513672, "step": 5650 }, { "epoch": 1.9697598064627613, "grad_norm": 0.5613059401512146, "learning_rate": 1.3947488203958265e-05, "loss": 1.0235104370117187, "step": 5700 }, { "epoch": 1.9870399170554691, "grad_norm": 0.4791814386844635, "learning_rate": 1.3531150549955943e-05, "loss": 1.0148072814941407, "step": 5750 }, { "epoch": 2.00414722654225, "grad_norm": 0.5621808767318726, "learning_rate": 1.31188033257858e-05, "loss": 1.0037516021728516, "step": 5800 }, { "epoch": 2.021427337134958, "grad_norm": 0.5978628396987915, "learning_rate": 1.2710590002043729e-05, "loss": 1.0024045562744142, "step": 5850 }, { "epoch": 2.0387074477276657, "grad_norm": 0.5256090760231018, "learning_rate": 1.2306652610991288e-05, "loss": 0.9898030090332032, "step": 5900 }, { "epoch": 2.055987558320373, "grad_norm": 0.5156893134117126, "learning_rate": 1.1907131697137546e-05, "loss": 0.9952345275878907, "step": 5950 }, { "epoch": 2.073267668913081, "grad_norm": 0.4754015803337097, "learning_rate": 1.1512166268338542e-05, "loss": 0.991960678100586, "step": 6000 }, { "epoch": 2.073267668913081, "eval_loss": 0.3997305929660797, "eval_runtime": 95.1558, "eval_samples_per_second": 378.138, "eval_steps_per_second": 0.988, "step": 6000 }, { "epoch": 2.0905477795057887, "grad_norm": 0.5414108037948608, "learning_rate": 1.1121893747431378e-05, "loss": 0.9846409606933594, "step": 6050 }, { "epoch": 2.1078278900984966, "grad_norm": 0.529751181602478, "learning_rate": 1.0736449924419822e-05, "loss": 0.9891058349609375, "step": 6100 }, { "epoch": 2.1251080006912044, "grad_norm": 0.4818967878818512, "learning_rate": 1.0355968909228054e-05, "loss": 0.9866749572753907, "step": 6150 }, { "epoch": 2.1423881112839123, "grad_norm": 0.5096397399902344, "learning_rate": 9.980583085038895e-06, "loss": 0.9821431732177734, "step": 6200 }, { "epoch": 2.15966822187662, "grad_norm": 0.5606327056884766, "learning_rate": 9.610423062232912e-06, "loss": 0.9854959869384765, "step": 6250 }, { "epoch": 2.176948332469328, "grad_norm": 0.514130711555481, "learning_rate": 9.245617632944348e-06, "loss": 0.9793372344970703, "step": 6300 }, { "epoch": 2.194228443062036, "grad_norm": 0.5572317242622375, "learning_rate": 8.886293726249562e-06, "loss": 0.9789413452148438, "step": 6350 }, { "epoch": 2.211508553654743, "grad_norm": 0.5003585815429688, "learning_rate": 8.532576364003904e-06, "loss": 0.9631109619140625, "step": 6400 }, { "epoch": 2.228788664247451, "grad_norm": 0.5201903581619263, "learning_rate": 8.184588617341976e-06, "loss": 0.9783859252929688, "step": 6450 }, { "epoch": 2.246068774840159, "grad_norm": 0.5099287033081055, "learning_rate": 7.842451563856742e-06, "loss": 0.9678781127929688, "step": 6500 }, { "epoch": 2.2633488854328667, "grad_norm": 0.46017828583717346, "learning_rate": 7.506284245472225e-06, "loss": 0.9522227478027344, "step": 6550 }, { "epoch": 2.2806289960255746, "grad_norm": 0.45564213395118713, "learning_rate": 7.176203627024514e-06, "loss": 0.9609327697753907, "step": 6600 }, { "epoch": 2.2979091066182824, "grad_norm": 0.5416494607925415, "learning_rate": 6.852324555565404e-06, "loss": 0.966263427734375, "step": 6650 }, { "epoch": 2.3151892172109902, "grad_norm": 0.5636786818504333, "learning_rate": 6.53475972040295e-06, "loss": 0.9645524597167969, "step": 6700 }, { "epoch": 2.332469327803698, "grad_norm": 0.5431479215621948, "learning_rate": 6.22361961389277e-06, "loss": 0.9668412780761719, "step": 6750 }, { "epoch": 2.3497494383964055, "grad_norm": 0.4858858585357666, "learning_rate": 5.919012492993706e-06, "loss": 0.9710499572753907, "step": 6800 }, { "epoch": 2.3670295489891133, "grad_norm": 0.4986262619495392, "learning_rate": 5.621044341601342e-06, "loss": 0.9668563842773438, "step": 6850 }, { "epoch": 2.384309659581821, "grad_norm": 0.5012155771255493, "learning_rate": 5.329818833672273e-06, "loss": 0.9694512939453125, "step": 6900 }, { "epoch": 2.401589770174529, "grad_norm": 0.5899285078048706, "learning_rate": 5.045437297152245e-06, "loss": 0.9606761169433594, "step": 6950 }, { "epoch": 2.418869880767237, "grad_norm": 0.4638988971710205, "learning_rate": 4.767998678720448e-06, "loss": 0.957213134765625, "step": 7000 }, { "epoch": 2.4361499913599447, "grad_norm": 0.5025919079780579, "learning_rate": 4.4975995093623266e-06, "loss": 0.9607756042480469, "step": 7050 }, { "epoch": 2.4534301019526525, "grad_norm": 0.5194385051727295, "learning_rate": 4.234333870783014e-06, "loss": 0.9540797424316406, "step": 7100 }, { "epoch": 2.4707102125453604, "grad_norm": 0.5033853650093079, "learning_rate": 3.97829336267283e-06, "loss": 0.957088851928711, "step": 7150 }, { "epoch": 2.4879903231380682, "grad_norm": 0.4493975341320038, "learning_rate": 3.729567070836437e-06, "loss": 0.9415164947509765, "step": 7200 }, { "epoch": 2.505270433730776, "grad_norm": 0.5417333841323853, "learning_rate": 3.488241536196643e-06, "loss": 0.9572615051269531, "step": 7250 }, { "epoch": 2.522550544323484, "grad_norm": 0.5607247352600098, "learning_rate": 3.254400724683673e-06, "loss": 0.9510250854492187, "step": 7300 }, { "epoch": 2.5398306549161913, "grad_norm": 0.5300142765045166, "learning_rate": 3.0281259980203757e-06, "loss": 0.9523114776611328, "step": 7350 }, { "epoch": 2.557110765508899, "grad_norm": 0.5919684171676636, "learning_rate": 2.809496085413496e-06, "loss": 0.9338645172119141, "step": 7400 }, { "epoch": 2.574390876101607, "grad_norm": 0.46930092573165894, "learning_rate": 2.5985870561609448e-06, "loss": 0.9432379150390625, "step": 7450 }, { "epoch": 2.591670986694315, "grad_norm": 0.5667592883110046, "learning_rate": 2.3954722931845002e-06, "loss": 0.9356614685058594, "step": 7500 }, { "epoch": 2.591670986694315, "eval_loss": 0.38118186593055725, "eval_runtime": 95.0674, "eval_samples_per_second": 378.489, "eval_steps_per_second": 0.989, "step": 7500 }, { "epoch": 2.6089510972870227, "grad_norm": 0.48274651169776917, "learning_rate": 2.2002224674972676e-06, "loss": 0.9377688598632813, "step": 7550 }, { "epoch": 2.6262312078797305, "grad_norm": 0.4731481969356537, "learning_rate": 2.012905513614588e-06, "loss": 0.9340367126464844, "step": 7600 }, { "epoch": 2.6435113184724384, "grad_norm": 0.51893550157547, "learning_rate": 1.8335866059172258e-06, "loss": 0.951480712890625, "step": 7650 }, { "epoch": 2.6607914290651458, "grad_norm": 0.5630462765693665, "learning_rate": 1.6623281359747806e-06, "loss": 0.9414936828613282, "step": 7700 }, { "epoch": 2.6780715396578536, "grad_norm": 0.49863728880882263, "learning_rate": 1.499189690837413e-06, "loss": 0.9399469757080078, "step": 7750 }, { "epoch": 2.6953516502505614, "grad_norm": 0.4599184989929199, "learning_rate": 1.344228032303349e-06, "loss": 0.9424849700927734, "step": 7800 }, { "epoch": 2.7126317608432693, "grad_norm": 0.5391411781311035, "learning_rate": 1.1974970771693543e-06, "loss": 0.9474674224853515, "step": 7850 }, { "epoch": 2.729911871435977, "grad_norm": 0.5229924917221069, "learning_rate": 1.0590478784711561e-06, "loss": 0.9289096069335937, "step": 7900 }, { "epoch": 2.747191982028685, "grad_norm": 0.5078772306442261, "learning_rate": 9.28928607720217e-07, "loss": 0.9420416259765625, "step": 7950 }, { "epoch": 2.764472092621393, "grad_norm": 0.5175345540046692, "learning_rate": 8.071845381431103e-07, "loss": 0.9313079833984375, "step": 8000 }, { "epoch": 2.7817522032141007, "grad_norm": 0.49628084897994995, "learning_rate": 6.938580289293339e-07, "loss": 0.9550038146972656, "step": 8050 }, { "epoch": 2.7990323138068085, "grad_norm": 0.5086449980735779, "learning_rate": 5.889885104929965e-07, "loss": 0.9414044952392578, "step": 8100 }, { "epoch": 2.8163124243995163, "grad_norm": 0.4850241243839264, "learning_rate": 4.926124707535395e-07, "loss": 0.9482218933105468, "step": 8150 }, { "epoch": 2.833592534992224, "grad_norm": 0.5032879114151001, "learning_rate": 4.0476344244027023e-07, "loss": 0.9468795776367187, "step": 8200 }, { "epoch": 2.8508726455849316, "grad_norm": 0.49061650037765503, "learning_rate": 3.254719914251081e-07, "loss": 0.9405175018310546, "step": 8250 }, { "epoch": 2.8681527561776394, "grad_norm": 0.5416680574417114, "learning_rate": 2.547657060875924e-07, "loss": 0.9449867248535156, "step": 8300 }, { "epoch": 2.8854328667703473, "grad_norm": 0.5759839415550232, "learning_rate": 1.9266918771590204e-07, "loss": 0.9381349945068359, "step": 8350 }, { "epoch": 2.902712977363055, "grad_norm": 0.5406273603439331, "learning_rate": 1.392040419471552e-07, "loss": 0.9400007629394531, "step": 8400 }, { "epoch": 2.919993087955763, "grad_norm": 0.503950834274292, "learning_rate": 9.438887125002293e-08, "loss": 0.9473369598388672, "step": 8450 }, { "epoch": 2.937273198548471, "grad_norm": 0.5274893641471863, "learning_rate": 5.823926845227312e-08, "loss": 0.9627262115478515, "step": 8500 }, { "epoch": 2.9545533091411786, "grad_norm": 0.5260080695152283, "learning_rate": 3.076781131543249e-08, "loss": 0.9508057403564453, "step": 8550 }, { "epoch": 2.971833419733886, "grad_norm": 0.5042856335639954, "learning_rate": 1.1984058158542866e-08, "loss": 0.9278289794921875, "step": 8600 }, { "epoch": 2.989113530326594, "grad_norm": 0.4784366488456726, "learning_rate": 1.8945445324769494e-09, "loss": 0.9391728210449218, "step": 8650 }, { "epoch": 3.0, "step": 8682, "total_flos": 1.83225980060015e+18, "train_loss": 2.536729412566569, "train_runtime": 19314.0993, "train_samples_per_second": 115.053, "train_steps_per_second": 0.45 } ], "logging_steps": 50, "max_steps": 8682, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.83225980060015e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }