{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 50.0,
  "eval_steps": 500,
  "global_step": 80650,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.30998140111593303, "grad_norm": 0.3543250262737274, "learning_rate": 6.195786864931847e-05, "loss": 9.0345, "step": 500},
    {"epoch": 0.6199628022318661, "grad_norm": 0.5106557607650757, "learning_rate": 9.97582756158962e-05, "loss": 6.2184, "step": 1000},
    {"epoch": 0.9299442033477991, "grad_norm": 2.6617751121520996, "learning_rate": 9.913204664153402e-05, "loss": 5.4194, "step": 1500},
    {"epoch": 1.2399256044637321, "grad_norm": 1.8096632957458496, "learning_rate": 9.850581766717182e-05, "loss": 3.915, "step": 2000},
    {"epoch": 1.5499070055796653, "grad_norm": 1.2520173788070679, "learning_rate": 9.787958869280964e-05, "loss": 2.7963, "step": 2500},
    {"epoch": 1.8598884066955983, "grad_norm": 0.8099603056907654, "learning_rate": 9.725335971844745e-05, "loss": 2.2568, "step": 3000},
    {"epoch": 2.1698698078115313, "grad_norm": 0.7233591079711914, "learning_rate": 9.662713074408527e-05, "loss": 1.9847, "step": 3500},
    {"epoch": 2.4798512089274642, "grad_norm": 0.6427165865898132, "learning_rate": 9.600090176972308e-05, "loss": 1.8216, "step": 4000},
    {"epoch": 2.7898326100433977, "grad_norm": 0.6729193925857544, "learning_rate": 9.53746727953609e-05, "loss": 1.7067, "step": 4500},
    {"epoch": 3.0998140111593306, "grad_norm": 0.6484789848327637, "learning_rate": 9.47484438209987e-05, "loss": 1.6187, "step": 5000},
    {"epoch": 3.4097954122752636, "grad_norm": 0.5950448513031006, "learning_rate": 9.412221484663653e-05, "loss": 1.5479, "step": 5500},
    {"epoch": 3.7197768133911966, "grad_norm": 0.6102598309516907, "learning_rate": 9.349598587227433e-05, "loss": 1.4879, "step": 6000},
    {"epoch": 4.02975821450713, "grad_norm": 0.6204754710197449, "learning_rate": 9.286975689791215e-05, "loss": 1.4379, "step": 6500},
    {"epoch": 4.3397396156230625, "grad_norm": 0.590217649936676, "learning_rate": 9.224352792354997e-05, "loss": 1.3926, "step": 7000},
    {"epoch": 4.6497210167389955, "grad_norm": 0.6062743663787842, "learning_rate": 9.161729894918779e-05, "loss": 1.3553, "step": 7500},
    {"epoch": 4.9597024178549285, "grad_norm": 0.5663708448410034, "learning_rate": 9.09910699748256e-05, "loss": 1.3201, "step": 8000},
    {"epoch": 5.2696838189708615, "grad_norm": 0.5806947350502014, "learning_rate": 9.036484100046342e-05, "loss": 1.2904, "step": 8500},
    {"epoch": 5.579665220086794, "grad_norm": 0.6131803393363953, "learning_rate": 8.973861202610123e-05, "loss": 1.2623, "step": 9000},
    {"epoch": 5.889646621202727, "grad_norm": 0.5666236281394958, "learning_rate": 8.911238305173905e-05, "loss": 1.2368, "step": 9500},
    {"epoch": 6.199628022318661, "grad_norm": 0.6078547239303589, "learning_rate": 8.848615407737685e-05, "loss": 1.212, "step": 10000},
    {"epoch": 6.509609423434594, "grad_norm": 0.575513482093811, "learning_rate": 8.785992510301467e-05, "loss": 1.1914, "step": 10500},
    {"epoch": 6.819590824550527, "grad_norm": 0.5826976895332336, "learning_rate": 8.723369612865248e-05, "loss": 1.1718, "step": 11000},
    {"epoch": 7.12957222566646, "grad_norm": 0.544598400592804, "learning_rate": 8.66074671542903e-05, "loss": 1.1548, "step": 11500},
    {"epoch": 7.439553626782393, "grad_norm": 0.5824791193008423, "learning_rate": 8.598123817992811e-05, "loss": 1.1363, "step": 12000},
    {"epoch": 7.749535027898326, "grad_norm": 0.5747692584991455, "learning_rate": 8.535500920556593e-05, "loss": 1.1211, "step": 12500},
    {"epoch": 8.05951642901426, "grad_norm": 0.5473280549049377, "learning_rate": 8.472878023120375e-05, "loss": 1.1077, "step": 13000},
    {"epoch": 8.369497830130193, "grad_norm": 0.5574379563331604, "learning_rate": 8.410255125684155e-05, "loss": 1.0908, "step": 13500},
    {"epoch": 8.679479231246125, "grad_norm": 0.5424452424049377, "learning_rate": 8.347632228247937e-05, "loss": 1.0785, "step": 14000},
    {"epoch": 8.989460632362059, "grad_norm": 0.5508283376693726, "learning_rate": 8.285009330811718e-05, "loss": 1.0683, "step": 14500},
    {"epoch": 9.299442033477991, "grad_norm": 0.5519115924835205, "learning_rate": 8.2223864333755e-05, "loss": 1.0537, "step": 15000},
    {"epoch": 9.609423434593925, "grad_norm": 0.5510475039482117, "learning_rate": 8.159763535939281e-05, "loss": 1.0443, "step": 15500},
    {"epoch": 9.919404835709857, "grad_norm": 0.5631123185157776, "learning_rate": 8.097140638503063e-05, "loss": 1.0339, "step": 16000},
    {"epoch": 10.22938623682579, "grad_norm": 0.5705382823944092, "learning_rate": 8.034517741066844e-05, "loss": 1.0217, "step": 16500},
    {"epoch": 10.539367637941723, "grad_norm": 0.5316577553749084, "learning_rate": 7.971894843630626e-05, "loss": 1.0151, "step": 17000},
    {"epoch": 10.849349039057657, "grad_norm": 0.5557442307472229, "learning_rate": 7.909271946194406e-05, "loss": 1.0043, "step": 17500},
    {"epoch": 11.159330440173589, "grad_norm": 0.5498985648155212, "learning_rate": 7.846649048758188e-05, "loss": 0.9951, "step": 18000},
    {"epoch": 11.469311841289523, "grad_norm": 0.552780032157898, "learning_rate": 7.784026151321969e-05, "loss": 0.9855, "step": 18500},
    {"epoch": 11.779293242405455, "grad_norm": 0.5406888127326965, "learning_rate": 7.721403253885752e-05, "loss": 0.9795, "step": 19000},
    {"epoch": 12.089274643521389, "grad_norm": 0.537375271320343, "learning_rate": 7.658780356449533e-05, "loss": 0.971, "step": 19500},
    {"epoch": 12.399256044637323, "grad_norm": 0.5666614174842834, "learning_rate": 7.596157459013315e-05, "loss": 0.9643, "step": 20000},
    {"epoch": 12.709237445753255, "grad_norm": 0.5302731990814209, "learning_rate": 7.533659807371968e-05, "loss": 0.9582, "step": 20500},
    {"epoch": 13.019218846869189, "grad_norm": 0.5608243346214294, "learning_rate": 7.471036909935749e-05, "loss": 0.9512, "step": 21000},
    {"epoch": 13.32920024798512, "grad_norm": 0.5309119820594788, "learning_rate": 7.408414012499531e-05, "loss": 0.9424, "step": 21500},
    {"epoch": 13.639181649101054, "grad_norm": 0.5380939245223999, "learning_rate": 7.345791115063312e-05, "loss": 0.9383, "step": 22000},
    {"epoch": 13.949163050216987, "grad_norm": 0.5440984964370728, "learning_rate": 7.283168217627094e-05, "loss": 0.9298, "step": 22500},
    {"epoch": 14.25914445133292, "grad_norm": 0.5377441048622131, "learning_rate": 7.220545320190874e-05, "loss": 0.9245, "step": 23000},
    {"epoch": 14.569125852448852, "grad_norm": 0.5402495265007019, "learning_rate": 7.157922422754656e-05, "loss": 0.9196, "step": 23500},
    {"epoch": 14.879107253564786, "grad_norm": 0.5610705018043518, "learning_rate": 7.095299525318437e-05, "loss": 0.9146, "step": 24000},
    {"epoch": 15.189088654680718, "grad_norm": 0.5305636525154114, "learning_rate": 7.032676627882219e-05, "loss": 0.9071, "step": 24500},
    {"epoch": 15.499070055796652, "grad_norm": 0.5398979187011719, "learning_rate": 6.970053730446e-05, "loss": 0.9037, "step": 25000},
    {"epoch": 15.809051456912584, "grad_norm": 0.5490283370018005, "learning_rate": 6.907556078804655e-05, "loss": 0.8982, "step": 25500},
    {"epoch": 16.11903285802852, "grad_norm": 0.5505014061927795, "learning_rate": 6.844933181368435e-05, "loss": 0.8933, "step": 26000},
    {"epoch": 16.429014259144452, "grad_norm": 0.5260488390922546, "learning_rate": 6.782310283932217e-05, "loss": 0.8865, "step": 26500},
    {"epoch": 16.738995660260386, "grad_norm": 0.5459970235824585, "learning_rate": 6.719687386495999e-05, "loss": 0.8837, "step": 27000},
    {"epoch": 17.048977061376316, "grad_norm": 0.5260828733444214, "learning_rate": 6.657189734854653e-05, "loss": 0.8812, "step": 27500},
    {"epoch": 17.35895846249225, "grad_norm": 0.531878650188446, "learning_rate": 6.594566837418435e-05, "loss": 0.874, "step": 28000},
    {"epoch": 17.668939863608184, "grad_norm": 0.5373751521110535, "learning_rate": 6.531943939982215e-05, "loss": 0.8703, "step": 28500},
    {"epoch": 17.978921264724118, "grad_norm": 0.5685413479804993, "learning_rate": 6.469321042545997e-05, "loss": 0.8674, "step": 29000},
    {"epoch": 18.288902665840048, "grad_norm": 0.5405117273330688, "learning_rate": 6.406698145109778e-05, "loss": 0.8618, "step": 29500},
    {"epoch": 18.598884066955982, "grad_norm": 0.5303318500518799, "learning_rate": 6.344325739263305e-05, "loss": 0.8572, "step": 30000},
    {"epoch": 18.908865468071916, "grad_norm": 0.5173208117485046, "learning_rate": 6.281702841827086e-05, "loss": 0.8552, "step": 30500},
    {"epoch": 19.21884686918785, "grad_norm": 0.5334449410438538, "learning_rate": 6.219079944390868e-05, "loss": 0.8494, "step": 31000},
    {"epoch": 19.52882827030378, "grad_norm": 0.5522080659866333, "learning_rate": 6.156457046954649e-05, "loss": 0.8464, "step": 31500},
    {"epoch": 19.838809671419714, "grad_norm": 0.5295758247375488, "learning_rate": 6.09383414951843e-05, "loss": 0.845, "step": 32000},
    {"epoch": 20.148791072535648, "grad_norm": 0.5164583325386047, "learning_rate": 6.0312112520822115e-05, "loss": 0.8395, "step": 32500},
    {"epoch": 20.45877247365158, "grad_norm": 0.5620171427726746, "learning_rate": 5.968713600440865e-05, "loss": 0.8354, "step": 33000},
    {"epoch": 20.768753874767516, "grad_norm": 0.5254458785057068, "learning_rate": 5.906090703004646e-05, "loss": 0.8336, "step": 33500},
    {"epoch": 21.078735275883446, "grad_norm": 0.5437597632408142, "learning_rate": 5.8434678055684276e-05, "loss": 0.8304, "step": 34000},
    {"epoch": 21.38871667699938, "grad_norm": 0.5438856482505798, "learning_rate": 5.78084490813221e-05, "loss": 0.8263, "step": 34500},
    {"epoch": 21.698698078115314, "grad_norm": 0.5386750102043152, "learning_rate": 5.7182220106959916e-05, "loss": 0.8248, "step": 35000},
    {"epoch": 22.008679479231247, "grad_norm": 0.5307642817497253, "learning_rate": 5.655724359054645e-05, "loss": 0.8223, "step": 35500},
    {"epoch": 22.318660880347178, "grad_norm": 0.5404214859008789, "learning_rate": 5.5931014616184264e-05, "loss": 0.8176, "step": 36000},
    {"epoch": 22.62864228146311, "grad_norm": 0.555665910243988, "learning_rate": 5.530478564182208e-05, "loss": 0.8164, "step": 36500},
    {"epoch": 22.938623682579045, "grad_norm": 0.5331476330757141, "learning_rate": 5.467855666745989e-05, "loss": 0.8135, "step": 37000},
    {"epoch": 23.24860508369498, "grad_norm": 0.541491687297821, "learning_rate": 5.405358015104644e-05, "loss": 0.8097, "step": 37500},
    {"epoch": 23.55858648481091, "grad_norm": 0.5554507374763489, "learning_rate": 5.342735117668425e-05, "loss": 0.8074, "step": 38000},
    {"epoch": 23.868567885926844, "grad_norm": 0.5485785007476807, "learning_rate": 5.2801122202322065e-05, "loss": 0.8054, "step": 38500},
    {"epoch": 24.178549287042777, "grad_norm": 0.5320767164230347, "learning_rate": 5.217489322795988e-05, "loss": 0.8018, "step": 39000},
    {"epoch": 24.48853068815871, "grad_norm": 0.5248667001724243, "learning_rate": 5.154866425359769e-05, "loss": 0.8008, "step": 39500},
    {"epoch": 24.798512089274645, "grad_norm": 0.5368346571922302, "learning_rate": 5.0922435279235505e-05, "loss": 0.7975, "step": 40000},
    {"epoch": 25.108493490390575, "grad_norm": 0.53144371509552, "learning_rate": 5.029620630487332e-05, "loss": 0.7947, "step": 40500},
    {"epoch": 25.41847489150651, "grad_norm": 0.5482547879219055, "learning_rate": 4.966997733051113e-05, "loss": 0.793, "step": 41000},
    {"epoch": 25.728456292622443, "grad_norm": 0.5446964502334595, "learning_rate": 4.9043748356148946e-05, "loss": 0.7905, "step": 41500},
    {"epoch": 26.038437693738377, "grad_norm": 0.5257270932197571, "learning_rate": 4.841751938178676e-05, "loss": 0.7892, "step": 42000},
    {"epoch": 26.348419094854307, "grad_norm": 0.5478941202163696, "learning_rate": 4.779129040742457e-05, "loss": 0.7856, "step": 42500},
    {"epoch": 26.65840049597024, "grad_norm": 0.5381990671157837, "learning_rate": 4.7165061433062386e-05, "loss": 0.7863, "step": 43000},
    {"epoch": 26.968381897086175, "grad_norm": 0.546461820602417, "learning_rate": 4.65388324587002e-05, "loss": 0.7826, "step": 43500},
    {"epoch": 27.27836329820211, "grad_norm": 0.543404757976532, "learning_rate": 4.591260348433802e-05, "loss": 0.7796, "step": 44000},
    {"epoch": 27.58834469931804, "grad_norm": 0.5448907613754272, "learning_rate": 4.528637450997583e-05, "loss": 0.7796, "step": 44500},
    {"epoch": 27.898326100433973, "grad_norm": 0.5504478216171265, "learning_rate": 4.466014553561365e-05, "loss": 0.7761, "step": 45000},
    {"epoch": 28.208307501549907, "grad_norm": 0.544154703617096, "learning_rate": 4.403391656125146e-05, "loss": 0.7753, "step": 45500},
    {"epoch": 28.51828890266584, "grad_norm": 0.542306125164032, "learning_rate": 4.3407687586889274e-05, "loss": 0.7735, "step": 46000},
    {"epoch": 28.828270303781775, "grad_norm": 0.5549866557121277, "learning_rate": 4.278145861252709e-05, "loss": 0.7707, "step": 46500},
    {"epoch": 29.138251704897705, "grad_norm": 0.538090169429779, "learning_rate": 4.21552296381649e-05, "loss": 0.7697, "step": 47000},
    {"epoch": 29.44823310601364, "grad_norm": 0.5609955191612244, "learning_rate": 4.1529000663802714e-05, "loss": 0.7682, "step": 47500},
    {"epoch": 29.758214507129573, "grad_norm": 0.5595529675483704, "learning_rate": 4.090277168944053e-05, "loss": 0.7659, "step": 48000},
    {"epoch": 30.068195908245507, "grad_norm": 0.5461651086807251, "learning_rate": 4.027654271507834e-05, "loss": 0.7656, "step": 48500},
    {"epoch": 30.378177309361437, "grad_norm": 0.5438820719718933, "learning_rate": 3.9650313740716154e-05, "loss": 0.7625, "step": 49000},
    {"epoch": 30.68815871047737, "grad_norm": 0.5458811521530151, "learning_rate": 3.902408476635397e-05, "loss": 0.762, "step": 49500},
    {"epoch": 30.998140111593305, "grad_norm": 0.535521388053894, "learning_rate": 3.839785579199179e-05, "loss": 0.7589, "step": 50000},
    {"epoch": 31.30812151270924, "grad_norm": 0.5407618284225464, "learning_rate": 3.77716268176296e-05, "loss": 0.7576, "step": 50500},
    {"epoch": 31.61810291382517, "grad_norm": 0.5259741544723511, "learning_rate": 3.7145397843267415e-05, "loss": 0.7571, "step": 51000},
    {"epoch": 31.928084314941103, "grad_norm": 0.5338233709335327, "learning_rate": 3.651916886890523e-05, "loss": 0.7561, "step": 51500},
    {"epoch": 32.23806571605704, "grad_norm": 0.5369750261306763, "learning_rate": 3.589293989454304e-05, "loss": 0.7541, "step": 52000},
    {"epoch": 32.54804711717297, "grad_norm": 0.5418145656585693, "learning_rate": 3.5266710920180856e-05, "loss": 0.7521, "step": 52500},
    {"epoch": 32.858028518288904, "grad_norm": 0.533149242401123, "learning_rate": 3.464048194581867e-05, "loss": 0.7519, "step": 53000},
    {"epoch": 33.16800991940484, "grad_norm": 0.5384135246276855, "learning_rate": 3.401425297145648e-05, "loss": 0.7497, "step": 53500},
    {"epoch": 33.47799132052077, "grad_norm": 0.5323925018310547, "learning_rate": 3.3388023997094296e-05, "loss": 0.7485, "step": 54000},
    {"epoch": 33.7879727216367, "grad_norm": 0.535434901714325, "learning_rate": 3.276179502273211e-05, "loss": 0.7472, "step": 54500},
    {"epoch": 34.09795412275263, "grad_norm": 0.5496259331703186, "learning_rate": 3.213556604836992e-05, "loss": 0.7454, "step": 55000},
    {"epoch": 34.40793552386857, "grad_norm": 0.5429278016090393, "learning_rate": 3.150933707400774e-05, "loss": 0.7447, "step": 55500},
    {"epoch": 34.7179169249845, "grad_norm": 0.5489596724510193, "learning_rate": 3.088310809964556e-05, "loss": 0.7438, "step": 56000},
    {"epoch": 35.027898326100434, "grad_norm": 0.5510178208351135, "learning_rate": 3.025687912528337e-05, "loss": 0.7416, "step": 56500},
    {"epoch": 35.33787972721637, "grad_norm": 0.5540343523025513, "learning_rate": 2.9630650150921187e-05, "loss": 0.7401, "step": 57000},
    {"epoch": 35.6478611283323, "grad_norm": 0.551895260810852, "learning_rate": 2.9004421176559e-05, "loss": 0.7404, "step": 57500},
    {"epoch": 35.957842529448236, "grad_norm": 0.5412101149559021, "learning_rate": 2.8378192202196814e-05, "loss": 0.74, "step": 58000},
    {"epoch": 36.26782393056417, "grad_norm": 0.5450315475463867, "learning_rate": 2.7751963227834627e-05, "loss": 0.7386, "step": 58500},
    {"epoch": 36.577805331680096, "grad_norm": 0.5550098419189453, "learning_rate": 2.712573425347244e-05, "loss": 0.7382, "step": 59000},
    {"epoch": 36.88778673279603, "grad_norm": 0.5502198338508606, "learning_rate": 2.6499505279110254e-05, "loss": 0.7345, "step": 59500},
    {"epoch": 37.197768133911964, "grad_norm": 0.5401105880737305, "learning_rate": 2.587452876269679e-05, "loss": 0.7355, "step": 60000},
    {"epoch": 37.5077495350279, "grad_norm": 0.543369710445404, "learning_rate": 2.5248299788334605e-05, "loss": 0.7338, "step": 60500},
    {"epoch": 37.81773093614383, "grad_norm": 0.5440373420715332, "learning_rate": 2.4622070813972422e-05, "loss": 0.7326, "step": 61000},
    {"epoch": 38.127712337259766, "grad_norm": 0.5450806021690369, "learning_rate": 2.3995841839610235e-05, "loss": 0.7315, "step": 61500},
    {"epoch": 38.4376937383757, "grad_norm": 0.5412734746932983, "learning_rate": 2.336961286524805e-05, "loss": 0.7301, "step": 62000},
    {"epoch": 38.74767513949163, "grad_norm": 0.5553017854690552, "learning_rate": 2.274463634883459e-05, "loss": 0.732, "step": 62500},
    {"epoch": 39.05765654060756, "grad_norm": 0.5467730164527893, "learning_rate": 2.2118407374472403e-05, "loss": 0.7289, "step": 63000},
    {"epoch": 39.367637941723494, "grad_norm": 0.551267683506012, "learning_rate": 2.1492178400110216e-05, "loss": 0.728, "step": 63500},
    {"epoch": 39.67761934283943, "grad_norm": 0.5391538739204407, "learning_rate": 2.0865949425748033e-05, "loss": 0.7276, "step": 64000},
    {"epoch": 39.98760074395536, "grad_norm": 0.5523350238800049, "learning_rate": 2.0239720451385847e-05, "loss": 0.7272, "step": 64500},
    {"epoch": 40.297582145071296, "grad_norm": 0.5367141366004944, "learning_rate": 1.961349147702366e-05, "loss": 0.726, "step": 65000},
    {"epoch": 40.60756354618723, "grad_norm": 0.5538766980171204, "learning_rate": 1.8987262502661473e-05, "loss": 0.7238, "step": 65500},
    {"epoch": 40.91754494730316, "grad_norm": 0.5274632573127747, "learning_rate": 1.8361033528299287e-05, "loss": 0.725, "step": 66000},
    {"epoch": 41.2275263484191, "grad_norm": 0.521597146987915, "learning_rate": 1.7736057011885827e-05, "loss": 0.7233, "step": 66500},
    {"epoch": 41.53750774953503, "grad_norm": 0.5390001535415649, "learning_rate": 1.710982803752364e-05, "loss": 0.7225, "step": 67000},
    {"epoch": 41.84748915065096, "grad_norm": 0.5474331378936768, "learning_rate": 1.6483599063161458e-05, "loss": 0.7218, "step": 67500},
    {"epoch": 42.15747055176689, "grad_norm": 0.5352886915206909, "learning_rate": 1.5858622546747995e-05, "loss": 0.7213, "step": 68000},
    {"epoch": 42.467451952882826, "grad_norm": 0.540053129196167, "learning_rate": 1.5232393572385808e-05, "loss": 0.7204, "step": 68500},
    {"epoch": 42.77743335399876, "grad_norm": 0.5470998883247375, "learning_rate": 1.4606164598023622e-05, "loss": 0.721, "step": 69000},
    {"epoch": 43.08741475511469, "grad_norm": 0.5613588094711304, "learning_rate": 1.3979935623661435e-05, "loss": 0.7194, "step": 69500},
    {"epoch": 43.39739615623063, "grad_norm": 0.5471562743186951, "learning_rate": 1.3354959107247974e-05, "loss": 0.7178, "step": 70000},
    {"epoch": 43.70737755734656, "grad_norm": 0.5386627912521362, "learning_rate": 1.2728730132885787e-05, "loss": 0.7184, "step": 70500},
    {"epoch": 44.017358958462495, "grad_norm": 0.5391978621482849, "learning_rate": 1.2102501158523603e-05, "loss": 0.7186, "step": 71000},
    {"epoch": 44.32734035957843, "grad_norm": 0.5381629467010498, "learning_rate": 1.1476272184161418e-05, "loss": 0.7168, "step": 71500},
    {"epoch": 44.637321760694356, "grad_norm": 0.5467249155044556, "learning_rate": 1.0850043209799233e-05, "loss": 0.7162, "step": 72000},
    {"epoch": 44.94730316181029, "grad_norm": 0.5548228025436401, "learning_rate": 1.0223814235437046e-05, "loss": 0.7146, "step": 72500},
    {"epoch": 45.25728456292622, "grad_norm": 0.5488151907920837, "learning_rate": 9.59758526107486e-06, "loss": 0.7152, "step": 73000},
    {"epoch": 45.56726596404216, "grad_norm": 0.5473387241363525, "learning_rate": 8.971356286712675e-06, "loss": 0.7142, "step": 73500},
    {"epoch": 45.87724736515809, "grad_norm": 0.5331913828849792, "learning_rate": 8.345127312350489e-06, "loss": 0.7155, "step": 74000},
    {"epoch": 46.187228766274025, "grad_norm": 0.5443392395973206, "learning_rate": 7.718898337988302e-06, "loss": 0.7136, "step": 74500},
    {"epoch": 46.49721016738996, "grad_norm": 0.5461409091949463, "learning_rate": 7.092669363626117e-06, "loss": 0.7148, "step": 75000},
    {"epoch": 46.80719156850589, "grad_norm": 0.5504785180091858, "learning_rate": 6.466440389263931e-06, "loss": 0.7133, "step": 75500},
    {"epoch": 47.11717296962182, "grad_norm": 0.5478015542030334, "learning_rate": 5.840211414901745e-06, "loss": 0.7125, "step": 76000},
    {"epoch": 47.42715437073775, "grad_norm": 0.5464319586753845, "learning_rate": 5.2139824405395585e-06, "loss": 0.7125, "step": 76500},
    {"epoch": 47.73713577185369, "grad_norm": 0.5370163321495056, "learning_rate": 4.587753466177374e-06, "loss": 0.7117, "step": 77000},
    {"epoch": 48.04711717296962, "grad_norm": 0.5529221892356873, "learning_rate": 3.961524491815188e-06, "loss": 0.711, "step": 77500},
    {"epoch": 48.357098574085555, "grad_norm": 0.549679160118103, "learning_rate": 3.3352955174530015e-06, "loss": 0.7112, "step": 78000},
    {"epoch": 48.66707997520149, "grad_norm": 0.5416662096977234, "learning_rate": 2.709066543090816e-06, "loss": 0.7112, "step": 78500},
    {"epoch": 48.97706137631742, "grad_norm": 0.5428098440170288, "learning_rate": 2.08283756872863e-06, "loss": 0.7109, "step": 79000},
    {"epoch": 49.287042777433356, "grad_norm": 0.5247154235839844, "learning_rate": 1.4566085943664442e-06, "loss": 0.7106, "step": 79500},
    {"epoch": 49.59702417854929, "grad_norm": 0.5486724376678467, "learning_rate": 8.303796200042584e-07, "loss": 0.7097, "step": 80000},
    {"epoch": 49.90700557966522, "grad_norm": 0.5495786070823669, "learning_rate": 2.0415064564207257e-07, "loss": 0.7106, "step": 80500},
    {"epoch": 50.0, "step": 80650, "total_flos": 2.052104150815488e+18, "train_loss": 0.04098836247254364, "train_runtime": 10357.3823, "train_samples_per_second": 11959.61, "train_steps_per_second": 7.787}
  ],
  "logging_steps": 500,
  "max_steps": 80650,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.052104150815488e+18,
  "train_batch_size": 192,
  "trial_name": null,
  "trial_params": null
}