| { | |
| "best_global_step": 7500, | |
| "best_metric": 0.38118186593055725, | |
| "best_model_checkpoint": "./byt5_leetspeak_v3/checkpoint-7500", | |
| "epoch": 3.0, | |
| "eval_steps": 1500, | |
| "global_step": 8682, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0003456022118541559, | |
| "grad_norm": 7.998739719390869, | |
| "learning_rate": 0.0, | |
| "loss": 21.331615447998047, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.017280110592707794, | |
| "grad_norm": 4.945672512054443, | |
| "learning_rate": 9.386973180076629e-06, | |
| "loss": 20.682008081552933, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03456022118541559, | |
| "grad_norm": 3.5129992961883545, | |
| "learning_rate": 1.896551724137931e-05, | |
| "loss": 17.8103759765625, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05184033177812338, | |
| "grad_norm": 3.4432644844055176, | |
| "learning_rate": 2.8544061302681996e-05, | |
| "loss": 15.057559814453125, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06912044237083118, | |
| "grad_norm": 3.018535852432251, | |
| "learning_rate": 3.8122605363984674e-05, | |
| "loss": 13.59551513671875, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08640055296353896, | |
| "grad_norm": 2.247919797897339, | |
| "learning_rate": 4.770114942528736e-05, | |
| "loss": 12.362723388671874, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.10368066355624676, | |
| "grad_norm": 2.297297239303589, | |
| "learning_rate": 4.9997487868649304e-05, | |
| "loss": 11.349617919921876, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12096077414895455, | |
| "grad_norm": 2.3222248554229736, | |
| "learning_rate": 4.9986528723171024e-05, | |
| "loss": 10.5994140625, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.13824088474166235, | |
| "grad_norm": 2.342911958694458, | |
| "learning_rate": 4.996687585561939e-05, | |
| "loss": 10.017122802734375, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.15552099533437014, | |
| "grad_norm": 2.3807644844055176, | |
| "learning_rate": 4.993853610394178e-05, | |
| "loss": 9.509613037109375, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.17280110592707792, | |
| "grad_norm": 2.3903160095214844, | |
| "learning_rate": 4.9901519328568466e-05, | |
| "loss": 9.067050170898437, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.19008121651978574, | |
| "grad_norm": 2.3767359256744385, | |
| "learning_rate": 4.985583840898188e-05, | |
| "loss": 8.662135620117187, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.20736132711249353, | |
| "grad_norm": 2.3853607177734375, | |
| "learning_rate": 4.98015092392353e-05, | |
| "loss": 8.290029296875, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.2246414377052013, | |
| "grad_norm": 2.372498035430908, | |
| "learning_rate": 4.973855072242276e-05, | |
| "loss": 7.937987060546875, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2419215482979091, | |
| "grad_norm": 2.359043836593628, | |
| "learning_rate": 4.966698476410199e-05, | |
| "loss": 7.60522705078125, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2592016588906169, | |
| "grad_norm": 2.353739023208618, | |
| "learning_rate": 4.9586836264672666e-05, | |
| "loss": 7.296792602539062, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2764817694833247, | |
| "grad_norm": 2.335012435913086, | |
| "learning_rate": 4.9498133110712644e-05, | |
| "loss": 6.98702392578125, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2937618800760325, | |
| "grad_norm": 2.289266586303711, | |
| "learning_rate": 4.940090616527521e-05, | |
| "loss": 6.709508666992187, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.3110419906687403, | |
| "grad_norm": 2.252319097518921, | |
| "learning_rate": 4.929518925715071e-05, | |
| "loss": 6.44697021484375, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.32832210126144806, | |
| "grad_norm": 2.216578483581543, | |
| "learning_rate": 4.9181019169096285e-05, | |
| "loss": 6.1863818359375, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.34560221185415585, | |
| "grad_norm": 2.1667819023132324, | |
| "learning_rate": 4.90584356250378e-05, | |
| "loss": 5.913253173828125, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.36288232244686364, | |
| "grad_norm": 2.1393256187438965, | |
| "learning_rate": 4.892748127624845e-05, | |
| "loss": 5.687144165039062, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3801624330395715, | |
| "grad_norm": 2.1089439392089844, | |
| "learning_rate": 4.878820168650884e-05, | |
| "loss": 5.421361083984375, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.39744254363227927, | |
| "grad_norm": 2.0389111042022705, | |
| "learning_rate": 4.864064531625366e-05, | |
| "loss": 5.215165405273438, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.41472265422498705, | |
| "grad_norm": 1.9905728101730347, | |
| "learning_rate": 4.8484863505710585e-05, | |
| "loss": 4.993418273925781, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.43200276481769484, | |
| "grad_norm": 1.915732502937317, | |
| "learning_rate": 4.8320910457037105e-05, | |
| "loss": 4.796431884765625, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4492828754104026, | |
| "grad_norm": 1.876905083656311, | |
| "learning_rate": 4.814884321546163e-05, | |
| "loss": 4.600077819824219, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4665629860031104, | |
| "grad_norm": 1.8127694129943848, | |
| "learning_rate": 4.796872164943538e-05, | |
| "loss": 4.390880737304688, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4838430965958182, | |
| "grad_norm": 1.749795913696289, | |
| "learning_rate": 4.778060842980199e-05, | |
| "loss": 4.223977661132812, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.501123207188526, | |
| "grad_norm": 1.7290434837341309, | |
| "learning_rate": 4.758456900799202e-05, | |
| "loss": 4.044484252929688, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.5184033177812338, | |
| "grad_norm": 1.658087968826294, | |
| "learning_rate": 4.738067159325005e-05, | |
| "loss": 3.873255615234375, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5184033177812338, | |
| "eval_loss": 1.5466166734695435, | |
| "eval_runtime": 95.1917, | |
| "eval_samples_per_second": 377.995, | |
| "eval_steps_per_second": 0.987, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5356834283739416, | |
| "grad_norm": 1.5666825771331787, | |
| "learning_rate": 4.716898712890218e-05, | |
| "loss": 3.697227783203125, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5529635389666494, | |
| "grad_norm": 1.5051823854446411, | |
| "learning_rate": 4.6949589267672256e-05, | |
| "loss": 3.5488134765625, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5702436495593571, | |
| "grad_norm": 1.4208341836929321, | |
| "learning_rate": 4.6722554346055446e-05, | |
| "loss": 3.41210693359375, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.587523760152065, | |
| "grad_norm": 1.4499037265777588, | |
| "learning_rate": 4.648796135775798e-05, | |
| "loss": 3.27009521484375, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.6048038707447728, | |
| "grad_norm": 1.3428349494934082, | |
| "learning_rate": 4.624589192621235e-05, | |
| "loss": 3.14307373046875, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.6220839813374806, | |
| "grad_norm": 1.2956618070602417, | |
| "learning_rate": 4.599643027617758e-05, | |
| "loss": 3.0137338256835937, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6393640919301884, | |
| "grad_norm": 1.2256124019622803, | |
| "learning_rate": 4.573966320443433e-05, | |
| "loss": 2.8847576904296877, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.6566442025228961, | |
| "grad_norm": 1.1832093000411987, | |
| "learning_rate": 4.547568004958518e-05, | |
| "loss": 2.783875732421875, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.673924313115604, | |
| "grad_norm": 1.1249213218688965, | |
| "learning_rate": 4.520457266097046e-05, | |
| "loss": 2.658479919433594, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6912044237083117, | |
| "grad_norm": 1.1056411266326904, | |
| "learning_rate": 4.492643536671052e-05, | |
| "loss": 2.5734375, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.7084845343010195, | |
| "grad_norm": 1.0190331935882568, | |
| "learning_rate": 4.4641364940885564e-05, | |
| "loss": 2.468469543457031, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.7257646448937273, | |
| "grad_norm": 1.1260298490524292, | |
| "learning_rate": 4.4349460569864404e-05, | |
| "loss": 2.387096252441406, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7430447554864351, | |
| "grad_norm": 0.9206457734107971, | |
| "learning_rate": 4.4050823817793944e-05, | |
| "loss": 2.317845458984375, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.760324866079143, | |
| "grad_norm": 0.9262681007385254, | |
| "learning_rate": 4.3745558591261295e-05, | |
| "loss": 2.2218693542480468, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7776049766718507, | |
| "grad_norm": 0.9071998000144958, | |
| "learning_rate": 4.3433771103140896e-05, | |
| "loss": 2.153177032470703, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7948850872645585, | |
| "grad_norm": 0.9530045986175537, | |
| "learning_rate": 4.3115569835639215e-05, | |
| "loss": 2.109056243896484, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.8121651978572663, | |
| "grad_norm": 0.8229272961616516, | |
| "learning_rate": 4.279106550254981e-05, | |
| "loss": 2.0358200073242188, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.8294453084499741, | |
| "grad_norm": 0.8083502054214478, | |
| "learning_rate": 4.246037101073202e-05, | |
| "loss": 1.9781124877929688, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8467254190426818, | |
| "grad_norm": 0.8030848503112793, | |
| "learning_rate": 4.21236014208265e-05, | |
| "loss": 1.9251625061035156, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.8640055296353897, | |
| "grad_norm": 0.7446616291999817, | |
| "learning_rate": 4.178087390722151e-05, | |
| "loss": 1.8921575927734375, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8812856402280974, | |
| "grad_norm": 0.7391892075538635, | |
| "learning_rate": 4.1432307717283606e-05, | |
| "loss": 1.84561279296875, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8985657508208053, | |
| "grad_norm": 0.7500560283660889, | |
| "learning_rate": 4.107802412986721e-05, | |
| "loss": 1.80940185546875, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.9158458614135131, | |
| "grad_norm": 0.7202692031860352, | |
| "learning_rate": 4.071814641311728e-05, | |
| "loss": 1.7628054809570313, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.9331259720062208, | |
| "grad_norm": 0.7352117896080017, | |
| "learning_rate": 4.0352799781579786e-05, | |
| "loss": 1.7312879943847657, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9504060825989287, | |
| "grad_norm": 0.6867756247520447, | |
| "learning_rate": 3.9982111352635064e-05, | |
| "loss": 1.71904052734375, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.9676861931916364, | |
| "grad_norm": 0.6769801378250122, | |
| "learning_rate": 3.960621010226906e-05, | |
| "loss": 1.6703118896484375, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9849663037843442, | |
| "grad_norm": 0.6680055856704712, | |
| "learning_rate": 3.922522682019785e-05, | |
| "loss": 1.6196646118164062, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.002073613271125, | |
| "grad_norm": 0.659245491027832, | |
| "learning_rate": 3.883929406436118e-05, | |
| "loss": 1.5895655822753907, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.0193537238638328, | |
| "grad_norm": 0.6726027727127075, | |
| "learning_rate": 3.844854611480072e-05, | |
| "loss": 1.5674874877929688, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.0366338344565404, | |
| "grad_norm": 0.6525683999061584, | |
| "learning_rate": 3.805311892693917e-05, | |
| "loss": 1.5413397216796876, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0366338344565404, | |
| "eval_loss": 0.6183949112892151, | |
| "eval_runtime": 95.1248, | |
| "eval_samples_per_second": 378.261, | |
| "eval_steps_per_second": 0.988, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0539139450492483, | |
| "grad_norm": 0.6720120310783386, | |
| "learning_rate": 3.765315008427641e-05, | |
| "loss": 1.5179107666015625, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.0711940556419561, | |
| "grad_norm": 0.6000827550888062, | |
| "learning_rate": 3.724877875051918e-05, | |
| "loss": 1.486010284423828, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.088474166234664, | |
| "grad_norm": 0.6552499532699585, | |
| "learning_rate": 3.6840145621161024e-05, | |
| "loss": 1.469657440185547, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.1057542768273716, | |
| "grad_norm": 0.6109654903411865, | |
| "learning_rate": 3.642739287452914e-05, | |
| "loss": 1.4606805419921876, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.1230343874200794, | |
| "grad_norm": 0.6051790118217468, | |
| "learning_rate": 3.601066412231542e-05, | |
| "loss": 1.4501374816894532, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.1403144980127873, | |
| "grad_norm": 0.7028324007987976, | |
| "learning_rate": 3.5590104359608686e-05, | |
| "loss": 1.412510986328125, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.1575946086054951, | |
| "grad_norm": 0.635924220085144, | |
| "learning_rate": 3.516585991444564e-05, | |
| "loss": 1.3964031982421874, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.1748747191982027, | |
| "grad_norm": 0.5996707677841187, | |
| "learning_rate": 3.473807839689803e-05, | |
| "loss": 1.3822164916992188, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1921548297909106, | |
| "grad_norm": 0.5824238657951355, | |
| "learning_rate": 3.430690864771371e-05, | |
| "loss": 1.3676600646972656, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.2094349403836184, | |
| "grad_norm": 0.5993366241455078, | |
| "learning_rate": 3.387250068652958e-05, | |
| "loss": 1.3601907348632813, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2267150509763263, | |
| "grad_norm": 0.601268470287323, | |
| "learning_rate": 3.343500565967422e-05, | |
| "loss": 1.332314453125, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.2439951615690341, | |
| "grad_norm": 0.6086856126785278, | |
| "learning_rate": 3.299457578757866e-05, | |
| "loss": 1.3296095275878905, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.261275272161742, | |
| "grad_norm": 0.5556713342666626, | |
| "learning_rate": 3.2551364311813316e-05, | |
| "loss": 1.309993133544922, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.2785553827544496, | |
| "grad_norm": 0.6057120561599731, | |
| "learning_rate": 3.2105525441769676e-05, | |
| "loss": 1.3083804321289063, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2958354933471574, | |
| "grad_norm": 0.5656481981277466, | |
| "learning_rate": 3.165721430100527e-05, | |
| "loss": 1.2851667785644532, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.3131156039398653, | |
| "grad_norm": 0.5798958539962769, | |
| "learning_rate": 3.120658687327052e-05, | |
| "loss": 1.2679750061035155, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.3303957145325729, | |
| "grad_norm": 0.5177021622657776, | |
| "learning_rate": 3.0753799948236316e-05, | |
| "loss": 1.265101776123047, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.3476758251252807, | |
| "grad_norm": 0.5503849387168884, | |
| "learning_rate": 3.0299011066941203e-05, | |
| "loss": 1.2567321014404298, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.3649559357179886, | |
| "grad_norm": 0.5518880486488342, | |
| "learning_rate": 2.9842378466977128e-05, | |
| "loss": 1.2535091400146485, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.3822360463106964, | |
| "grad_norm": 0.6316933035850525, | |
| "learning_rate": 2.93840610274328e-05, | |
| "loss": 1.2241575622558594, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3995161569034043, | |
| "grad_norm": 0.5042502284049988, | |
| "learning_rate": 2.8924218213613902e-05, | |
| "loss": 1.2300173950195312, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.416796267496112, | |
| "grad_norm": 0.6486298441886902, | |
| "learning_rate": 2.8463010021559298e-05, | |
| "loss": 1.21891845703125, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.4340763780888197, | |
| "grad_norm": 0.5548715591430664, | |
| "learning_rate": 2.800059692237261e-05, | |
| "loss": 1.204437484741211, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.4513564886815276, | |
| "grad_norm": 0.5416399836540222, | |
| "learning_rate": 2.7537139806388455e-05, | |
| "loss": 1.1822820281982422, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.4686365992742354, | |
| "grad_norm": 0.5168971419334412, | |
| "learning_rate": 2.7072799927192883e-05, | |
| "loss": 1.1682017517089844, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.485916709866943, | |
| "grad_norm": 0.5409218668937683, | |
| "learning_rate": 2.6607738845517348e-05, | |
| "loss": 1.1644657135009766, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.5031968204596509, | |
| "grad_norm": 0.5903815627098083, | |
| "learning_rate": 2.614211837302589e-05, | |
| "loss": 1.165572052001953, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.5204769310523587, | |
| "grad_norm": 0.5404263734817505, | |
| "learning_rate": 2.567610051601497e-05, | |
| "loss": 1.1485606384277345, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.5377570416450665, | |
| "grad_norm": 0.6503647565841675, | |
| "learning_rate": 2.520984741904554e-05, | |
| "loss": 1.136939697265625, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.5550371522377744, | |
| "grad_norm": 0.5339692831039429, | |
| "learning_rate": 2.4743521308527125e-05, | |
| "loss": 1.1341026306152344, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5550371522377744, | |
| "eval_loss": 0.45711469650268555, | |
| "eval_runtime": 95.1691, | |
| "eval_samples_per_second": 378.085, | |
| "eval_steps_per_second": 0.988, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5723172628304822, | |
| "grad_norm": 0.5759618282318115, | |
| "learning_rate": 2.4277284436273307e-05, | |
| "loss": 1.126502151489258, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.58959737342319, | |
| "grad_norm": 0.5793848633766174, | |
| "learning_rate": 2.381129902304841e-05, | |
| "loss": 1.1371284484863282, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.6068774840158977, | |
| "grad_norm": 0.5430593490600586, | |
| "learning_rate": 2.3345727202125056e-05, | |
| "loss": 1.1101528930664062, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.6241575946086055, | |
| "grad_norm": 0.5803630948066711, | |
| "learning_rate": 2.2880730962872023e-05, | |
| "loss": 1.1264077758789062, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.6414377052013132, | |
| "grad_norm": 0.6316850781440735, | |
| "learning_rate": 2.2416472094392323e-05, | |
| "loss": 1.1182173919677734, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.658717815794021, | |
| "grad_norm": 0.5736232399940491, | |
| "learning_rate": 2.195311212923085e-05, | |
| "loss": 1.100269775390625, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6759979263867288, | |
| "grad_norm": 0.5712565183639526, | |
| "learning_rate": 2.149081228717133e-05, | |
| "loss": 1.091978988647461, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.6932780369794367, | |
| "grad_norm": 0.5494542717933655, | |
| "learning_rate": 2.1029733419142128e-05, | |
| "loss": 1.095367431640625, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.7105581475721445, | |
| "grad_norm": 0.4688451290130615, | |
| "learning_rate": 2.0570035951250306e-05, | |
| "loss": 1.0880941009521485, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.7278382581648524, | |
| "grad_norm": 0.5209280848503113, | |
| "learning_rate": 2.0111879828963616e-05, | |
| "loss": 1.0774014282226563, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7451183687575602, | |
| "grad_norm": 0.530224621295929, | |
| "learning_rate": 1.9655424461459586e-05, | |
| "loss": 1.0644034576416015, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.7623984793502678, | |
| "grad_norm": 0.5463877320289612, | |
| "learning_rate": 1.920082866616132e-05, | |
| "loss": 1.0841154479980468, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.7796785899429757, | |
| "grad_norm": 0.5256918668746948, | |
| "learning_rate": 1.8748250613479124e-05, | |
| "loss": 1.0540755462646485, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.7969587005356833, | |
| "grad_norm": 0.5257004499435425, | |
| "learning_rate": 1.829784777177723e-05, | |
| "loss": 1.0588815307617188, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.8142388111283911, | |
| "grad_norm": 0.5281515121459961, | |
| "learning_rate": 1.784977685258492e-05, | |
| "loss": 1.045956573486328, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.831518921721099, | |
| "grad_norm": 0.4766943156719208, | |
| "learning_rate": 1.7404193756070763e-05, | |
| "loss": 1.0421023559570313, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.8487990323138068, | |
| "grad_norm": 0.501384437084198, | |
| "learning_rate": 1.696125351679938e-05, | |
| "loss": 1.042171630859375, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.8660791429065147, | |
| "grad_norm": 0.5489823222160339, | |
| "learning_rate": 1.6521110249789228e-05, | |
| "loss": 1.0536033630371093, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.8833592534992225, | |
| "grad_norm": 0.4959055185317993, | |
| "learning_rate": 1.6083917096890385e-05, | |
| "loss": 1.0371237945556642, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.9006393640919304, | |
| "grad_norm": 0.524411141872406, | |
| "learning_rate": 1.564982617350096e-05, | |
| "loss": 1.0318540954589843, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.917919474684638, | |
| "grad_norm": 1.0833576917648315, | |
| "learning_rate": 1.5218988515640548e-05, | |
| "loss": 1.0373517608642577, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.9351995852773458, | |
| "grad_norm": 0.565988302230835, | |
| "learning_rate": 1.4791554027399398e-05, | |
| "loss": 1.0258339691162108, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.9524796958700534, | |
| "grad_norm": 0.48943427205085754, | |
| "learning_rate": 1.4367671428781243e-05, | |
| "loss": 1.025996551513672, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.9697598064627613, | |
| "grad_norm": 0.5613059401512146, | |
| "learning_rate": 1.3947488203958265e-05, | |
| "loss": 1.0235104370117187, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.9870399170554691, | |
| "grad_norm": 0.4791814386844635, | |
| "learning_rate": 1.3531150549955943e-05, | |
| "loss": 1.0148072814941407, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.00414722654225, | |
| "grad_norm": 0.5621808767318726, | |
| "learning_rate": 1.31188033257858e-05, | |
| "loss": 1.0037516021728516, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.021427337134958, | |
| "grad_norm": 0.5978628396987915, | |
| "learning_rate": 1.2710590002043729e-05, | |
| "loss": 1.0024045562744142, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.0387074477276657, | |
| "grad_norm": 0.5256090760231018, | |
| "learning_rate": 1.2306652610991288e-05, | |
| "loss": 0.9898030090332032, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.055987558320373, | |
| "grad_norm": 0.5156893134117126, | |
| "learning_rate": 1.1907131697137546e-05, | |
| "loss": 0.9952345275878907, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.073267668913081, | |
| "grad_norm": 0.4754015803337097, | |
| "learning_rate": 1.1512166268338542e-05, | |
| "loss": 0.991960678100586, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.073267668913081, | |
| "eval_loss": 0.3997305929660797, | |
| "eval_runtime": 95.1558, | |
| "eval_samples_per_second": 378.138, | |
| "eval_steps_per_second": 0.988, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.0905477795057887, | |
| "grad_norm": 0.5414108037948608, | |
| "learning_rate": 1.1121893747431378e-05, | |
| "loss": 0.9846409606933594, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.1078278900984966, | |
| "grad_norm": 0.529751181602478, | |
| "learning_rate": 1.0736449924419822e-05, | |
| "loss": 0.9891058349609375, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.1251080006912044, | |
| "grad_norm": 0.4818967878818512, | |
| "learning_rate": 1.0355968909228054e-05, | |
| "loss": 0.9866749572753907, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.1423881112839123, | |
| "grad_norm": 0.5096397399902344, | |
| "learning_rate": 9.980583085038895e-06, | |
| "loss": 0.9821431732177734, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.15966822187662, | |
| "grad_norm": 0.5606327056884766, | |
| "learning_rate": 9.610423062232912e-06, | |
| "loss": 0.9854959869384765, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.176948332469328, | |
| "grad_norm": 0.514130711555481, | |
| "learning_rate": 9.245617632944348e-06, | |
| "loss": 0.9793372344970703, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.194228443062036, | |
| "grad_norm": 0.5572317242622375, | |
| "learning_rate": 8.886293726249562e-06, | |
| "loss": 0.9789413452148438, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.211508553654743, | |
| "grad_norm": 0.5003585815429688, | |
| "learning_rate": 8.532576364003904e-06, | |
| "loss": 0.9631109619140625, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.228788664247451, | |
| "grad_norm": 0.5201903581619263, | |
| "learning_rate": 8.184588617341976e-06, | |
| "loss": 0.9783859252929688, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.246068774840159, | |
| "grad_norm": 0.5099287033081055, | |
| "learning_rate": 7.842451563856742e-06, | |
| "loss": 0.9678781127929688, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.2633488854328667, | |
| "grad_norm": 0.46017828583717346, | |
| "learning_rate": 7.506284245472225e-06, | |
| "loss": 0.9522227478027344, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.2806289960255746, | |
| "grad_norm": 0.45564213395118713, | |
| "learning_rate": 7.176203627024514e-06, | |
| "loss": 0.9609327697753907, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.2979091066182824, | |
| "grad_norm": 0.5416494607925415, | |
| "learning_rate": 6.852324555565404e-06, | |
| "loss": 0.966263427734375, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.3151892172109902, | |
| "grad_norm": 0.5636786818504333, | |
| "learning_rate": 6.53475972040295e-06, | |
| "loss": 0.9645524597167969, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.332469327803698, | |
| "grad_norm": 0.5431479215621948, | |
| "learning_rate": 6.22361961389277e-06, | |
| "loss": 0.9668412780761719, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.3497494383964055, | |
| "grad_norm": 0.4858858585357666, | |
| "learning_rate": 5.919012492993706e-06, | |
| "loss": 0.9710499572753907, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.3670295489891133, | |
| "grad_norm": 0.4986262619495392, | |
| "learning_rate": 5.621044341601342e-06, | |
| "loss": 0.9668563842773438, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.384309659581821, | |
| "grad_norm": 0.5012155771255493, | |
| "learning_rate": 5.329818833672273e-06, | |
| "loss": 0.9694512939453125, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.401589770174529, | |
| "grad_norm": 0.5899285078048706, | |
| "learning_rate": 5.045437297152245e-06, | |
| "loss": 0.9606761169433594, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.418869880767237, | |
| "grad_norm": 0.4638988971710205, | |
| "learning_rate": 4.767998678720448e-06, | |
| "loss": 0.957213134765625, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.4361499913599447, | |
| "grad_norm": 0.5025919079780579, | |
| "learning_rate": 4.4975995093623266e-06, | |
| "loss": 0.9607756042480469, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.4534301019526525, | |
| "grad_norm": 0.5194385051727295, | |
| "learning_rate": 4.234333870783014e-06, | |
| "loss": 0.9540797424316406, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.4707102125453604, | |
| "grad_norm": 0.5033853650093079, | |
| "learning_rate": 3.97829336267283e-06, | |
| "loss": 0.957088851928711, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.4879903231380682, | |
| "grad_norm": 0.4493975341320038, | |
| "learning_rate": 3.729567070836437e-06, | |
| "loss": 0.9415164947509765, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.505270433730776, | |
| "grad_norm": 0.5417333841323853, | |
| "learning_rate": 3.488241536196643e-06, | |
| "loss": 0.9572615051269531, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.522550544323484, | |
| "grad_norm": 0.5607247352600098, | |
| "learning_rate": 3.254400724683673e-06, | |
| "loss": 0.9510250854492187, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.5398306549161913, | |
| "grad_norm": 0.5300142765045166, | |
| "learning_rate": 3.0281259980203757e-06, | |
| "loss": 0.9523114776611328, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.557110765508899, | |
| "grad_norm": 0.5919684171676636, | |
| "learning_rate": 2.809496085413496e-06, | |
| "loss": 0.9338645172119141, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.574390876101607, | |
| "grad_norm": 0.46930092573165894, | |
| "learning_rate": 2.5985870561609448e-06, | |
| "loss": 0.9432379150390625, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.591670986694315, | |
| "grad_norm": 0.5667592883110046, | |
| "learning_rate": 2.3954722931845002e-06, | |
| "loss": 0.9356614685058594, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.591670986694315, | |
| "eval_loss": 0.38118186593055725, | |
| "eval_runtime": 95.0674, | |
| "eval_samples_per_second": 378.489, | |
| "eval_steps_per_second": 0.989, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.6089510972870227, | |
| "grad_norm": 0.48274651169776917, | |
| "learning_rate": 2.2002224674972676e-06, | |
| "loss": 0.9377688598632813, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.6262312078797305, | |
| "grad_norm": 0.4731481969356537, | |
| "learning_rate": 2.012905513614588e-06, | |
| "loss": 0.9340367126464844, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.6435113184724384, | |
| "grad_norm": 0.51893550157547, | |
| "learning_rate": 1.8335866059172258e-06, | |
| "loss": 0.951480712890625, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.6607914290651458, | |
| "grad_norm": 0.5630462765693665, | |
| "learning_rate": 1.6623281359747806e-06, | |
| "loss": 0.9414936828613282, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.6780715396578536, | |
| "grad_norm": 0.49863728880882263, | |
| "learning_rate": 1.499189690837413e-06, | |
| "loss": 0.9399469757080078, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.6953516502505614, | |
| "grad_norm": 0.4599184989929199, | |
| "learning_rate": 1.344228032303349e-06, | |
| "loss": 0.9424849700927734, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.7126317608432693, | |
| "grad_norm": 0.5391411781311035, | |
| "learning_rate": 1.1974970771693543e-06, | |
| "loss": 0.9474674224853515, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.729911871435977, | |
| "grad_norm": 0.5229924917221069, | |
| "learning_rate": 1.0590478784711561e-06, | |
| "loss": 0.9289096069335937, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.747191982028685, | |
| "grad_norm": 0.5078772306442261, | |
| "learning_rate": 9.28928607720217e-07, | |
| "loss": 0.9420416259765625, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.764472092621393, | |
| "grad_norm": 0.5175345540046692, | |
| "learning_rate": 8.071845381431103e-07, | |
| "loss": 0.9313079833984375, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.7817522032141007, | |
| "grad_norm": 0.49628084897994995, | |
| "learning_rate": 6.938580289293339e-07, | |
| "loss": 0.9550038146972656, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.7990323138068085, | |
| "grad_norm": 0.5086449980735779, | |
| "learning_rate": 5.889885104929965e-07, | |
| "loss": 0.9414044952392578, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.8163124243995163, | |
| "grad_norm": 0.4850241243839264, | |
| "learning_rate": 4.926124707535395e-07, | |
| "loss": 0.9482218933105468, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.833592534992224, | |
| "grad_norm": 0.5032879114151001, | |
| "learning_rate": 4.0476344244027023e-07, | |
| "loss": 0.9468795776367187, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.8508726455849316, | |
| "grad_norm": 0.49061650037765503, | |
| "learning_rate": 3.254719914251081e-07, | |
| "loss": 0.9405175018310546, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.8681527561776394, | |
| "grad_norm": 0.5416680574417114, | |
| "learning_rate": 2.547657060875924e-07, | |
| "loss": 0.9449867248535156, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.8854328667703473, | |
| "grad_norm": 0.5759839415550232, | |
| "learning_rate": 1.9266918771590204e-07, | |
| "loss": 0.9381349945068359, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.902712977363055, | |
| "grad_norm": 0.5406273603439331, | |
| "learning_rate": 1.392040419471552e-07, | |
| "loss": 0.9400007629394531, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.919993087955763, | |
| "grad_norm": 0.503950834274292, | |
| "learning_rate": 9.438887125002293e-08, | |
| "loss": 0.9473369598388672, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.937273198548471, | |
| "grad_norm": 0.5274893641471863, | |
| "learning_rate": 5.823926845227312e-08, | |
| "loss": 0.9627262115478515, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.9545533091411786, | |
| "grad_norm": 0.5260080695152283, | |
| "learning_rate": 3.076781131543249e-08, | |
| "loss": 0.9508057403564453, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.971833419733886, | |
| "grad_norm": 0.5042856335639954, | |
| "learning_rate": 1.1984058158542866e-08, | |
| "loss": 0.9278289794921875, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.989113530326594, | |
| "grad_norm": 0.4784366488456726, | |
| "learning_rate": 1.8945445324769494e-09, | |
| "loss": 0.9391728210449218, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 8682, | |
| "total_flos": 1.83225980060015e+18, | |
| "train_loss": 2.536729412566569, | |
| "train_runtime": 19314.0993, | |
| "train_samples_per_second": 115.053, | |
| "train_steps_per_second": 0.45 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 8682, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.83225980060015e+18, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |