{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 449,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011135857461024499,
      "grad_norm": 1.3413572311401367,
      "learning_rate": 1.0619469026548673e-06,
      "loss": 1.3355,
      "step": 5
    },
    {
      "epoch": 0.022271714922048998,
      "grad_norm": 1.0000957250595093,
      "learning_rate": 2.389380530973451e-06,
      "loss": 1.4079,
      "step": 10
    },
    {
      "epoch": 0.0334075723830735,
      "grad_norm": 0.7801820039749146,
      "learning_rate": 3.7168141592920353e-06,
      "loss": 1.3474,
      "step": 15
    },
    {
      "epoch": 0.044543429844097995,
      "grad_norm": 0.5836945176124573,
      "learning_rate": 5.04424778761062e-06,
      "loss": 1.3387,
      "step": 20
    },
    {
      "epoch": 0.0556792873051225,
      "grad_norm": 0.6259394288063049,
      "learning_rate": 6.371681415929204e-06,
      "loss": 1.3457,
      "step": 25
    },
    {
      "epoch": 0.066815144766147,
      "grad_norm": 0.6279721856117249,
      "learning_rate": 7.699115044247788e-06,
      "loss": 1.345,
      "step": 30
    },
    {
      "epoch": 0.0779510022271715,
      "grad_norm": 0.5959689021110535,
      "learning_rate": 9.026548672566373e-06,
      "loss": 1.3064,
      "step": 35
    },
    {
      "epoch": 0.08908685968819599,
      "grad_norm": 0.4760344624519348,
      "learning_rate": 1.0353982300884955e-05,
      "loss": 1.1918,
      "step": 40
    },
    {
      "epoch": 0.10022271714922049,
      "grad_norm": 0.5409045219421387,
      "learning_rate": 1.168141592920354e-05,
      "loss": 1.2243,
      "step": 45
    },
    {
      "epoch": 0.111358574610245,
      "grad_norm": 0.6069848537445068,
      "learning_rate": 1.3008849557522125e-05,
      "loss": 1.2276,
      "step": 50
    },
    {
      "epoch": 0.12249443207126949,
      "grad_norm": 0.5506272315979004,
      "learning_rate": 1.433628318584071e-05,
      "loss": 1.2671,
      "step": 55
    },
    {
      "epoch": 0.133630289532294,
      "grad_norm": 0.4604385793209076,
      "learning_rate": 1.5663716814159295e-05,
      "loss": 1.2268,
      "step": 60
    },
    {
      "epoch": 0.1447661469933185,
      "grad_norm": 0.5579900741577148,
      "learning_rate": 1.6991150442477876e-05,
      "loss": 1.2215,
      "step": 65
    },
    {
      "epoch": 0.155902004454343,
      "grad_norm": 0.48294851183891296,
      "learning_rate": 1.8318584070796458e-05,
      "loss": 1.2418,
      "step": 70
    },
    {
      "epoch": 0.16703786191536749,
      "grad_norm": 0.5626055598258972,
      "learning_rate": 1.9646017699115046e-05,
      "loss": 1.185,
      "step": 75
    },
    {
      "epoch": 0.17817371937639198,
      "grad_norm": 0.5725001096725464,
      "learning_rate": 2.0973451327433628e-05,
      "loss": 1.2337,
      "step": 80
    },
    {
      "epoch": 0.18930957683741648,
      "grad_norm": 0.5308484435081482,
      "learning_rate": 2.2300884955752213e-05,
      "loss": 1.164,
      "step": 85
    },
    {
      "epoch": 0.20044543429844097,
      "grad_norm": 0.5047495365142822,
      "learning_rate": 2.3628318584070798e-05,
      "loss": 1.1862,
      "step": 90
    },
    {
      "epoch": 0.21158129175946547,
      "grad_norm": 0.5524989366531372,
      "learning_rate": 2.495575221238938e-05,
      "loss": 1.1503,
      "step": 95
    },
    {
      "epoch": 0.22271714922049,
      "grad_norm": 0.5751224160194397,
      "learning_rate": 2.6283185840707968e-05,
      "loss": 1.1819,
      "step": 100
    },
    {
      "epoch": 0.23385300668151449,
      "grad_norm": 0.590943455696106,
      "learning_rate": 2.761061946902655e-05,
      "loss": 1.1177,
      "step": 105
    },
    {
      "epoch": 0.24498886414253898,
      "grad_norm": 0.624424159526825,
      "learning_rate": 2.893805309734513e-05,
      "loss": 1.1612,
      "step": 110
    },
    {
      "epoch": 0.2561247216035635,
      "grad_norm": 0.7900341153144836,
      "learning_rate": 2.9999983715046248e-05,
      "loss": 1.1556,
      "step": 115
    },
    {
      "epoch": 0.267260579064588,
      "grad_norm": 0.717691957950592,
      "learning_rate": 2.9999413745377787e-05,
      "loss": 1.1074,
      "step": 120
    },
    {
      "epoch": 0.27839643652561247,
      "grad_norm": 0.5590342283248901,
      "learning_rate": 2.9998029563381436e-05,
      "loss": 1.1106,
      "step": 125
    },
    {
      "epoch": 0.289532293986637,
      "grad_norm": 0.5695084929466248,
      "learning_rate": 2.999583124419468e-05,
      "loss": 1.0615,
      "step": 130
    },
    {
      "epoch": 0.30066815144766146,
      "grad_norm": 0.6339223384857178,
      "learning_rate": 2.999281890714874e-05,
      "loss": 1.1041,
      "step": 135
    },
    {
      "epoch": 0.311804008908686,
      "grad_norm": 0.6401590704917908,
      "learning_rate": 2.9988992715762148e-05,
      "loss": 1.0891,
      "step": 140
    },
    {
      "epoch": 0.32293986636971045,
      "grad_norm": 0.6139689087867737,
      "learning_rate": 2.9984352877731836e-05,
      "loss": 1.0935,
      "step": 145
    },
    {
      "epoch": 0.33407572383073497,
      "grad_norm": 0.5202445387840271,
      "learning_rate": 2.9978899644921866e-05,
      "loss": 1.0622,
      "step": 150
    },
    {
      "epoch": 0.34521158129175944,
      "grad_norm": 0.5589560866355896,
      "learning_rate": 2.9972633313349764e-05,
      "loss": 1.0604,
      "step": 155
    },
    {
      "epoch": 0.35634743875278396,
      "grad_norm": 0.6837779879570007,
      "learning_rate": 2.9965554223170455e-05,
      "loss": 1.008,
      "step": 160
    },
    {
      "epoch": 0.3674832962138085,
      "grad_norm": 0.7174135446548462,
      "learning_rate": 2.9957662758657787e-05,
      "loss": 1.0292,
      "step": 165
    },
    {
      "epoch": 0.37861915367483295,
      "grad_norm": 0.6325995922088623,
      "learning_rate": 2.9948959348183686e-05,
      "loss": 1.0019,
      "step": 170
    },
    {
      "epoch": 0.3897550111358575,
      "grad_norm": 0.6495500802993774,
      "learning_rate": 2.993944446419489e-05,
      "loss": 0.9838,
      "step": 175
    },
    {
      "epoch": 0.40089086859688194,
      "grad_norm": 0.641270101070404,
      "learning_rate": 2.9929118623187307e-05,
      "loss": 0.9553,
      "step": 180
    },
    {
      "epoch": 0.41202672605790647,
      "grad_norm": 0.6926126480102539,
      "learning_rate": 2.9917982385677982e-05,
      "loss": 1.0296,
      "step": 185
    },
    {
      "epoch": 0.42316258351893093,
      "grad_norm": 0.7776742577552795,
      "learning_rate": 2.990603635617466e-05,
      "loss": 1.0223,
      "step": 190
    },
    {
      "epoch": 0.43429844097995546,
      "grad_norm": 0.6762447357177734,
      "learning_rate": 2.9893281183142996e-05,
      "loss": 1.053,
      "step": 195
    },
    {
      "epoch": 0.44543429844098,
      "grad_norm": 0.70978844165802,
      "learning_rate": 2.9879717558971317e-05,
      "loss": 0.9561,
      "step": 200
    },
    {
      "epoch": 0.45657015590200445,
      "grad_norm": 0.7382838726043701,
      "learning_rate": 2.986534621993307e-05,
      "loss": 0.9923,
      "step": 205
    },
    {
      "epoch": 0.46770601336302897,
      "grad_norm": 0.7762662768363953,
      "learning_rate": 2.9850167946146838e-05,
      "loss": 1.0125,
      "step": 210
    },
    {
      "epoch": 0.47884187082405344,
      "grad_norm": 0.7446727752685547,
      "learning_rate": 2.9834183561533997e-05,
      "loss": 0.9445,
      "step": 215
    },
    {
      "epoch": 0.48997772828507796,
      "grad_norm": 0.78370201587677,
      "learning_rate": 2.9817393933773994e-05,
      "loss": 0.9717,
      "step": 220
    },
    {
      "epoch": 0.5011135857461024,
      "grad_norm": 0.6878297328948975,
      "learning_rate": 2.9799799974257248e-05,
      "loss": 0.9373,
      "step": 225
    },
    {
      "epoch": 0.512249443207127,
      "grad_norm": 0.690732479095459,
      "learning_rate": 2.9781402638035656e-05,
      "loss": 0.9109,
      "step": 230
    },
    {
      "epoch": 0.5233853006681515,
      "grad_norm": 0.8460608124732971,
      "learning_rate": 2.9762202923770795e-05,
      "loss": 0.9548,
      "step": 235
    },
    {
      "epoch": 0.534521158129176,
      "grad_norm": 0.8983225226402283,
      "learning_rate": 2.9742201873679656e-05,
      "loss": 0.9547,
      "step": 240
    },
    {
      "epoch": 0.5456570155902004,
      "grad_norm": 0.8119720816612244,
      "learning_rate": 2.9721400573478117e-05,
      "loss": 0.9748,
      "step": 245
    },
    {
      "epoch": 0.5567928730512249,
      "grad_norm": 0.8376508951187134,
      "learning_rate": 2.9699800152321975e-05,
      "loss": 0.8978,
      "step": 250
    },
    {
      "epoch": 0.5679287305122495,
      "grad_norm": 0.7981753349304199,
      "learning_rate": 2.967740178274567e-05,
      "loss": 0.8872,
      "step": 255
    },
    {
      "epoch": 0.579064587973274,
      "grad_norm": 0.7605271339416504,
      "learning_rate": 2.9654206680598638e-05,
      "loss": 0.8954,
      "step": 260
    },
    {
      "epoch": 0.5902004454342984,
      "grad_norm": 0.9272874593734741,
      "learning_rate": 2.9630216104979288e-05,
      "loss": 0.8797,
      "step": 265
    },
    {
      "epoch": 0.6013363028953229,
      "grad_norm": 0.8306118249893188,
      "learning_rate": 2.9605431358166687e-05,
      "loss": 0.9045,
      "step": 270
    },
    {
      "epoch": 0.6124721603563474,
      "grad_norm": 0.849515438079834,
      "learning_rate": 2.957985378554984e-05,
      "loss": 0.8929,
      "step": 275
    },
    {
      "epoch": 0.623608017817372,
      "grad_norm": 0.9004138708114624,
      "learning_rate": 2.955348477555467e-05,
      "loss": 0.9355,
      "step": 280
    },
    {
      "epoch": 0.6347438752783965,
      "grad_norm": 0.836115837097168,
      "learning_rate": 2.9526325759568665e-05,
      "loss": 0.8795,
      "step": 285
    },
    {
      "epoch": 0.6458797327394209,
      "grad_norm": 0.8236976861953735,
      "learning_rate": 2.9498378211863145e-05,
      "loss": 0.8241,
      "step": 290
    },
    {
      "epoch": 0.6570155902004454,
      "grad_norm": 0.817470908164978,
      "learning_rate": 2.9469643649513264e-05,
      "loss": 0.8541,
      "step": 295
    },
    {
      "epoch": 0.6681514476614699,
      "grad_norm": 0.9785251021385193,
      "learning_rate": 2.9440123632315647e-05,
      "loss": 0.8715,
      "step": 300
    },
    {
      "epoch": 0.6792873051224945,
      "grad_norm": 0.9281602501869202,
      "learning_rate": 2.9409819762703715e-05,
      "loss": 0.7914,
      "step": 305
    },
    {
      "epoch": 0.6904231625835189,
      "grad_norm": 0.8825387358665466,
      "learning_rate": 2.9378733685660707e-05,
      "loss": 0.8527,
      "step": 310
    },
    {
      "epoch": 0.7015590200445434,
      "grad_norm": 0.9314531683921814,
      "learning_rate": 2.934686708863039e-05,
      "loss": 0.8966,
      "step": 315
    },
    {
      "epoch": 0.7126948775055679,
      "grad_norm": 0.9172025322914124,
      "learning_rate": 2.9314221701425445e-05,
      "loss": 0.8472,
      "step": 320
    },
    {
      "epoch": 0.7238307349665924,
      "grad_norm": 0.9198710918426514,
      "learning_rate": 2.9280799296133577e-05,
      "loss": 0.878,
      "step": 325
    },
    {
      "epoch": 0.734966592427617,
      "grad_norm": 0.8762511610984802,
      "learning_rate": 2.9246601687021324e-05,
      "loss": 0.7845,
      "step": 330
    },
    {
      "epoch": 0.7461024498886414,
      "grad_norm": 0.8782876133918762,
      "learning_rate": 2.9211630730435564e-05,
      "loss": 0.7928,
      "step": 335
    },
    {
      "epoch": 0.7572383073496659,
      "grad_norm": 0.9156160950660706,
      "learning_rate": 2.9175888324702762e-05,
      "loss": 0.8467,
      "step": 340
    },
    {
      "epoch": 0.7683741648106904,
      "grad_norm": 1.0404008626937866,
      "learning_rate": 2.9139376410025897e-05,
      "loss": 0.7927,
      "step": 345
    },
    {
      "epoch": 0.779510022271715,
      "grad_norm": 1.0743643045425415,
      "learning_rate": 2.9102096968379178e-05,
      "loss": 0.8123,
      "step": 350
    },
    {
      "epoch": 0.7906458797327395,
      "grad_norm": 0.9985504150390625,
      "learning_rate": 2.9064052023400417e-05,
      "loss": 0.7735,
      "step": 355
    },
    {
      "epoch": 0.8017817371937639,
      "grad_norm": 0.9021556973457336,
      "learning_rate": 2.9025243640281226e-05,
      "loss": 0.8006,
      "step": 360
    },
    {
      "epoch": 0.8129175946547884,
      "grad_norm": 0.9394658803939819,
      "learning_rate": 2.8985673925654853e-05,
      "loss": 0.7879,
      "step": 365
    },
    {
      "epoch": 0.8240534521158129,
      "grad_norm": 0.88721764087677,
      "learning_rate": 2.8945345027481884e-05,
      "loss": 0.7225,
      "step": 370
    },
    {
      "epoch": 0.8351893095768375,
      "grad_norm": 0.9523298740386963,
      "learning_rate": 2.8904259134933627e-05,
      "loss": 0.8202,
      "step": 375
    },
    {
      "epoch": 0.8463251670378619,
      "grad_norm": 0.9752704501152039,
      "learning_rate": 2.8862418478273246e-05,
      "loss": 0.7871,
      "step": 380
    },
    {
      "epoch": 0.8574610244988864,
      "grad_norm": 0.9238730072975159,
      "learning_rate": 2.881982532873476e-05,
      "loss": 0.7216,
      "step": 385
    },
    {
      "epoch": 0.8685968819599109,
      "grad_norm": 0.8588601350784302,
      "learning_rate": 2.8776481998399676e-05,
      "loss": 0.7146,
      "step": 390
    },
    {
      "epoch": 0.8797327394209354,
      "grad_norm": 0.8526139259338379,
      "learning_rate": 2.873239084007156e-05,
      "loss": 0.8105,
      "step": 395
    },
    {
      "epoch": 0.89086859688196,
      "grad_norm": 1.0353056192398071,
      "learning_rate": 2.868755424714825e-05,
      "loss": 0.789,
      "step": 400
    },
    {
      "epoch": 0.9020044543429844,
      "grad_norm": 0.8390752077102661,
      "learning_rate": 2.8641974653491997e-05,
      "loss": 0.7338,
      "step": 405
    },
    {
      "epoch": 0.9131403118040089,
      "grad_norm": 0.9766253232955933,
      "learning_rate": 2.85956545332973e-05,
      "loss": 0.7782,
      "step": 410
    },
    {
      "epoch": 0.9242761692650334,
      "grad_norm": 0.9131956100463867,
      "learning_rate": 2.8548596400956614e-05,
      "loss": 0.734,
      "step": 415
    },
    {
      "epoch": 0.9354120267260579,
      "grad_norm": 0.979026734828949,
      "learning_rate": 2.850080281092389e-05,
      "loss": 0.7538,
      "step": 420
    },
    {
      "epoch": 0.9465478841870824,
      "grad_norm": 1.0852129459381104,
      "learning_rate": 2.845227635757587e-05,
      "loss": 0.7072,
      "step": 425
    },
    {
      "epoch": 0.9576837416481069,
      "grad_norm": 1.1135140657424927,
      "learning_rate": 2.8403019675071267e-05,
      "loss": 0.7657,
      "step": 430
    },
    {
      "epoch": 0.9688195991091314,
      "grad_norm": 0.8528003096580505,
      "learning_rate": 2.8353035437207796e-05,
      "loss": 0.7036,
      "step": 435
    },
    {
      "epoch": 0.9799554565701559,
      "grad_norm": 0.9384568929672241,
      "learning_rate": 2.8302326357277013e-05,
      "loss": 0.6948,
      "step": 440
    },
    {
      "epoch": 0.9910913140311804,
      "grad_norm": 1.1752519607543945,
      "learning_rate": 2.825089518791704e-05,
      "loss": 0.7052,
      "step": 445
    }
  ],
  "logging_steps": 5,
  "max_steps": 2245,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.211605326316175e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}