[
  {
    "loss": 1.9148719787597657,
    "grad_norm": 0.984495222568512,
    "learning_rate": 2.2641509433962265e-05,
    "entropy": 0.9785909144083659,
    "num_tokens": 909468.0,
    "mean_token_accuracy": 0.6837772730986277,
    "epoch": 0.014091122592766557,
    "step": 25
  },
  {
    "loss": 1.0547267150878907,
    "grad_norm": 0.2687967121601105,
    "learning_rate": 4.6226415094339625e-05,
    "entropy": 1.0105916921297708,
    "num_tokens": 1841827.0,
    "mean_token_accuracy": 0.7993326298395793,
    "epoch": 0.028182245185533115,
    "step": 50
  },
  {
    "loss": 0.909715805053711,
    "grad_norm": 0.30847716331481934,
    "learning_rate": 4.998165452627025e-05,
    "entropy": 0.8024396904309591,
    "num_tokens": 2783417.0,
    "mean_token_accuracy": 0.8195314351717631,
    "epoch": 0.042273367778299674,
    "step": 75
  },
  {
    "loss": 0.8271210479736328,
    "grad_norm": 0.2949310839176178,
    "learning_rate": 4.991201589453377e-05,
    "entropy": 0.7632828823725383,
    "num_tokens": 3703024.0,
    "mean_token_accuracy": 0.8274987975756327,
    "epoch": 0.05636449037106623,
    "step": 100
  },
  {
    "eval_loss": 0.7959097623825073,
    "eval_runtime": 42.9182,
    "eval_samples_per_second": 11.65,
    "eval_steps_per_second": 0.746,
    "eval_entropy": 0.7644990533590317,
    "eval_num_tokens": 3703024.0,
    "eval_mean_token_accuracy": 0.8274666927754879,
    "epoch": 0.05636449037106623,
    "step": 100
  },
  {
    "loss": 0.7740676879882813,
    "grad_norm": 0.29744288325309753,
    "learning_rate": 4.97905632708703e-05,
    "entropy": 0.7656655506292979,
    "num_tokens": 4667355.0,
    "mean_token_accuracy": 0.8285713505744934,
    "epoch": 0.07045561296383279,
    "step": 125
  },
  {
    "loss": 0.7345146942138672,
    "grad_norm": 0.3392024040222168,
    "learning_rate": 4.9617549262105724e-05,
    "entropy": 0.7322683656215667,
    "num_tokens": 5580909.0,
    "mean_token_accuracy": 0.8329473527272543,
    "epoch": 0.08454673555659935,
    "step": 150
  },
  {
    "loss": 0.7085108184814453,
    "grad_norm": 0.337108314037323,
    "learning_rate": 4.939333371653541e-05,
    "entropy": 0.7126858182748159,
    "num_tokens": 6525675.0,
    "mean_token_accuracy": 0.8382143716017405,
    "epoch": 0.0986378581493659,
    "step": 175
  },
  {
    "loss": 0.7087242889404297,
    "grad_norm": 0.39108389616012573,
    "learning_rate": 4.911838297548306e-05,
    "entropy": 0.7103402439753215,
    "num_tokens": 7460420.0,
    "mean_token_accuracy": 0.8385978392759958,
    "epoch": 0.11272898074213246,
    "step": 200
  },
  {
    "eval_loss": 0.691197395324707,
    "eval_runtime": 34.7324,
    "eval_samples_per_second": 14.396,
    "eval_steps_per_second": 0.921,
    "eval_entropy": 0.7021831637248397,
    "eval_num_tokens": 7460420.0,
    "eval_mean_token_accuracy": 0.8397158589214087,
    "epoch": 0.11272898074213246,
    "step": 200
  },
  {
    "loss": 0.6759407043457031,
    "grad_norm": 0.41262030601501465,
    "learning_rate": 4.8793268903366905e-05,
    "entropy": 0.6836405583222707,
    "num_tokens": 8367608.0,
    "mean_token_accuracy": 0.8437444992860158,
    "epoch": 0.12682010333489901,
    "step": 225
  },
  {
    "loss": 0.6790435028076172,
    "grad_norm": 0.4088114798069,
    "learning_rate": 4.8418667698290696e-05,
    "entropy": 0.684131217400233,
    "num_tokens": 9284184.0,
    "mean_token_accuracy": 0.8435306719938914,
    "epoch": 0.14091122592766558,
    "step": 250
  },
  {
    "loss": 0.6590489959716797,
    "grad_norm": 0.4000810384750366,
    "learning_rate": 4.7995358485633035e-05,
    "entropy": 0.6666705779234569,
    "num_tokens": 10228116.0,
    "mean_token_accuracy": 0.8483462047576904,
    "epoch": 0.15500234852043213,
    "step": 275
  },
  {
    "loss": 0.6522020721435546,
    "grad_norm": 0.4356841742992401,
    "learning_rate": 4.752422169756048e-05,
    "entropy": 0.6561659761269887,
    "num_tokens": 11141987.0,
    "mean_token_accuracy": 0.8489574348926544,
    "epoch": 0.1690934711131987,
    "step": 300
  },
  {
    "eval_loss": 0.6456555724143982,
    "eval_runtime": 34.7464,
    "eval_samples_per_second": 14.39,
    "eval_steps_per_second": 0.921,
    "eval_entropy": 0.6593516366556287,
    "eval_num_tokens": 11141987.0,
    "eval_mean_token_accuracy": 0.8488058932125568,
    "epoch": 0.1690934711131987,
    "step": 300
  },
  {
    "loss": 0.6289921569824218,
    "grad_norm": 0.44314464926719666,
    "learning_rate": 4.700623724183468e-05,
    "entropy": 0.6275538243850072,
    "num_tokens": 12066391.0,
    "mean_token_accuracy": 0.8543656957149506,
    "epoch": 0.18318459370596524,
    "step": 325
  },
  {
    "loss": 0.6266510009765625,
    "grad_norm": 0.4457905888557434,
    "learning_rate": 4.644248246372233e-05,
    "entropy": 0.6246062052249909,
    "num_tokens": 13002518.0,
    "mean_token_accuracy": 0.8540003776550293,
    "epoch": 0.1972757162987318,
    "step": 350
  },
  {
    "loss": 0.6241617965698242,
    "grad_norm": 0.45250752568244934,
    "learning_rate": 4.5834129905246725e-05,
    "entropy": 0.6225514455636343,
    "num_tokens": 13915051.0,
    "mean_token_accuracy": 0.8545078063011169,
    "epoch": 0.21136683889149835,
    "step": 375
  },
  {
    "loss": 0.6225375366210938,
    "grad_norm": 0.4502236545085907,
    "learning_rate": 4.5182444866441694e-05,
    "entropy": 0.6170252589384715,
    "num_tokens": 14840689.0,
    "mean_token_accuracy": 0.8547838560740153,
    "epoch": 0.22545796148426492,
    "step": 400
  },
  {
    "eval_loss": 0.624234139919281,
    "eval_runtime": 34.7767,
    "eval_samples_per_second": 14.377,
    "eval_steps_per_second": 0.92,
    "eval_entropy": 0.6146921720355749,
    "eval_num_tokens": 14840689.0,
    "eval_mean_token_accuracy": 0.8521482553333044,
    "epoch": 0.22545796148426492,
    "step": 400
  },
  {
    "loss": 0.6120803451538086,
    "grad_norm": 0.40877044200897217,
    "learning_rate": 4.4488782773679885e-05,
    "entropy": 0.6126995925108591,
    "num_tokens": 15781641.0,
    "mean_token_accuracy": 0.8559599355856577,
    "epoch": 0.23954908407703146,
    "step": 425
  },
  {
    "loss": 0.6238847732543945,
    "grad_norm": 0.43226659297943115,
    "learning_rate": 4.375458636054924e-05,
    "entropy": 0.621622064312299,
    "num_tokens": 16727723.0,
    "mean_token_accuracy": 0.8534450817108155,
    "epoch": 0.25364020666979803,
    "step": 450
  },
  {
    "loss": 0.5980339431762696,
    "grad_norm": 0.4249129295349121,
    "learning_rate": 4.298138266714094e-05,
    "entropy": 0.5939697621266047,
    "num_tokens": 17644465.0,
    "mean_token_accuracy": 0.8589934686819712,
    "epoch": 0.2677313292625646,
    "step": 475
  },
  {
    "loss": 0.6099626541137695,
    "grad_norm": 0.4274967908859253,
    "learning_rate": 4.2170779863989946e-05,
    "entropy": 0.6078906120856603,
    "num_tokens": 18563256.0,
    "mean_token_accuracy": 0.8569075318177541,
    "epoch": 0.28182245185533117,
    "step": 500
  },
  {
    "eval_loss": 0.6081598997116089,
    "eval_runtime": 34.7944,
    "eval_samples_per_second": 14.37,
    "eval_steps_per_second": 0.92,
    "eval_entropy": 0.6181821776553988,
    "eval_num_tokens": 18563256.0,
    "eval_mean_token_accuracy": 0.855755690485239,
    "epoch": 0.28182245185533117,
    "step": 500
  },
  {
    "loss": 0.5912541961669922,
    "grad_norm": 0.4574773907661438,
    "learning_rate": 4.132446390727404e-05,
    "entropy": 0.5872368462880453,
    "num_tokens": 19469800.0,
    "mean_token_accuracy": 0.8607503294944763,
    "epoch": 0.2959135744480977,
    "step": 525
  },
  {
    "loss": 0.5932905197143554,
    "grad_norm": 0.43131619691848755,
    "learning_rate": 4.044419503222808e-05,
    "entropy": 0.592293497522672,
    "num_tokens": 20385241.0,
    "mean_token_accuracy": 0.8604243552684784,
    "epoch": 0.31000469704086425,
    "step": 550
  },
  {
    "loss": 0.5899901580810547,
    "grad_norm": 0.4174489974975586,
    "learning_rate": 3.953180409206677e-05,
    "entropy": 0.5856852753957112,
    "num_tokens": 21331318.0,
    "mean_token_accuracy": 0.8611550823847453,
    "epoch": 0.3240958196336308,
    "step": 575
  },
  {
    "loss": 0.600746955871582,
    "grad_norm": 0.4607154428958893,
    "learning_rate": 3.858918875003053e-05,
    "entropy": 0.5992459511756897,
    "num_tokens": 22288698.0,
    "mean_token_accuracy": 0.8587616598606109,
    "epoch": 0.3381869422263974,
    "step": 600
  },
  {
    "eval_loss": 0.5959565043449402,
    "eval_runtime": 34.7871,
    "eval_samples_per_second": 14.373,
    "eval_steps_per_second": 0.92,
    "eval_entropy": 0.6024799766018987,
    "eval_num_tokens": 22288698.0,
    "eval_mean_token_accuracy": 0.8582040295004845,
    "epoch": 0.3381869422263974,
    "step": 600
  },
  {
    "loss": 0.5945447540283203,
    "grad_norm": 0.4672609269618988,
    "learning_rate": 3.761830953247457e-05,
    "entropy": 0.5911998764673869,
    "num_tokens": 23239625.0,
    "mean_token_accuracy": 0.8611347631613413,
    "epoch": 0.3522780648191639,
    "step": 625
  },
  {
    "loss": 0.5985645294189453,
    "grad_norm": 0.4281597137451172,
    "learning_rate": 3.662118575121024e-05,
    "entropy": 0.5953911445538203,
    "num_tokens": 24156885.0,
    "mean_token_accuracy": 0.859769054253896,
    "epoch": 0.3663691874119305,
    "step": 650
  },
  {
    "loss": 0.5921562957763672,
    "grad_norm": 0.43555110692977905,
    "learning_rate": 3.5599891303579746e-05,
    "entropy": 0.5895072638988494,
    "num_tokens": 25104558.0,
    "mean_token_accuracy": 0.860583526690801,
    "epoch": 0.38046031000469704,
    "step": 675
  },
  {
    "loss": 0.5891357421875,
    "grad_norm": 0.46079888939857483,
    "learning_rate": 3.455655035899951e-05,
    "entropy": 0.5860749536752701,
    "num_tokens": 26027947.0,
    "mean_token_accuracy": 0.8607413911819458,
    "epoch": 0.3945514325974636,
    "step": 700
  },
  {
    "eval_loss": 0.5854880213737488,
    "eval_runtime": 34.8333,
    "eval_samples_per_second": 14.354,
    "eval_steps_per_second": 0.919,
    "eval_entropy": 0.5855442956089973,
    "eval_num_tokens": 26027947.0,
    "eval_mean_token_accuracy": 0.8601280357688665,
    "epoch": 0.3945514325974636,
    "step": 700
  },
  {
    "loss": 0.5821422576904297,
    "grad_norm": 0.42215803265571594,
    "learning_rate": 3.349333294094369e-05,
    "entropy": 0.5828985869884491,
    "num_tokens": 26996995.0,
    "mean_token_accuracy": 0.8623941914240519,
    "epoch": 0.40864255519023013,
    "step": 725
  },
  {
    "loss": 0.5734980392456055,
    "grad_norm": 0.4118139147758484,
    "learning_rate": 3.241245041355675e-05,
    "entropy": 0.5695036280155182,
    "num_tokens": 27948817.0,
    "mean_token_accuracy": 0.8648126033941904,
    "epoch": 0.4227336777829967,
    "step": 750
  },
  {
    "loss": 0.5755558776855468,
    "grad_norm": 0.40968823432922363,
    "learning_rate": 3.131615088228249e-05,
    "entropy": 0.5767549270391464,
    "num_tokens": 28893932.0,
    "mean_token_accuracy": 0.8637475728988647,
    "epoch": 0.43682480037576327,
    "step": 775
  },
  {
    "loss": 0.573729248046875,
    "grad_norm": 0.4324798583984375,
    "learning_rate": 3.0206714518075486e-05,
    "entropy": 0.5696792916456859,
    "num_tokens": 29833216.0,
    "mean_token_accuracy": 0.8643758261203766,
    "epoch": 0.45091592296852984,
    "step": 800
  },
  {
    "eval_loss": 0.5752155780792236,
    "eval_runtime": 34.8957,
    "eval_samples_per_second": 14.328,
    "eval_steps_per_second": 0.917,
    "eval_entropy": 0.5935880783945322,
    "eval_num_tokens": 29833216.0,
    "eval_mean_token_accuracy": 0.8622864987701178,
    "epoch": 0.45091592296852984,
    "step": 800
  },
  {
    "loss": 0.5751077270507813,
    "grad_norm": 0.4815407693386078,
    "learning_rate": 2.9086448814920242e-05,
    "entropy": 0.5717160554726919,
    "num_tokens": 30736838.0,
    "mean_token_accuracy": 0.864310040473938,
    "epoch": 0.4650070455612964,
    "step": 825
  },
  {
    "loss": 0.566772575378418,
    "grad_norm": 0.4774300158023834,
    "learning_rate": 2.7957683790521676e-05,
    "entropy": 0.5650917081038157,
    "num_tokens": 31659300.0,
    "mean_token_accuracy": 0.8658999156951904,
    "epoch": 0.4790981681540629,
    "step": 850
  },
  {
    "loss": 0.5626054382324219,
    "grad_norm": 0.42420145869255066,
    "learning_rate": 2.6822767140148987e-05,
    "entropy": 0.5590727700789769,
    "num_tokens": 32593580.0,
    "mean_token_accuracy": 0.8666303022702535,
    "epoch": 0.4931892907468295,
    "step": 875
  },
  {
    "loss": 0.5539141082763672,
    "grad_norm": 0.47889477014541626,
    "learning_rate": 2.5684059353712307e-05,
    "entropy": 0.5530497090021769,
    "num_tokens": 33494838.0,
    "mean_token_accuracy": 0.8674623111883799,
    "epoch": 0.5072804133395961,
    "step": 900
  },
  {
    "eval_loss": 0.5693426728248596,
    "eval_runtime": 34.8643,
    "eval_samples_per_second": 14.341,
    "eval_steps_per_second": 0.918,
    "eval_entropy": 0.5721144182607532,
    "eval_num_tokens": 33494838.0,
    "eval_mean_token_accuracy": 0.8636170122772455,
    "epoch": 0.5072804133395961,
    "step": 900
  },
  {
    "loss": 0.5634239959716797,
    "grad_norm": 0.47955217957496643,
    "learning_rate": 2.4543928806228074e-05,
    "entropy": 0.562302614847819,
    "num_tokens": 34443337.0,
    "mean_token_accuracy": 0.8664345097541809,
    "epoch": 0.5213715359323626,
    "step": 925
  },
  {
    "loss": 0.5764046096801758,
    "grad_norm": 0.4992325007915497,
    "learning_rate": 2.340474683188429e-05,
    "entropy": 0.570437356432279,
    "num_tokens": 35385705.0,
    "mean_token_accuracy": 0.8647123599052429,
    "epoch": 0.5354626585251292,
    "step": 950
  },
  {
    "loss": 0.5535079956054687,
    "grad_norm": 0.5063010454177856,
    "learning_rate": 2.2268882791951127e-05,
    "entropy": 0.5491390575965246,
    "num_tokens": 36300339.0,
    "mean_token_accuracy": 0.8694652744134267,
    "epoch": 0.5495537811178958,
    "step": 975
  },
  {
    "loss": 0.5499050521850586,
    "grad_norm": 0.45809435844421387,
    "learning_rate": 2.1138699146794867e-05,
    "entropy": 0.5487177085876465,
    "num_tokens": 37231011.0,
    "mean_token_accuracy": 0.8694357828299204,
    "epoch": 0.5636449037106623,
    "step": 1000
  },
  {
    "eval_loss": 0.5629469752311707,
    "eval_runtime": 34.8051,
    "eval_samples_per_second": 14.366,
    "eval_steps_per_second": 0.919,
    "eval_entropy": 0.5682820733636618,
    "eval_num_tokens": 37231011.0,
    "eval_mean_token_accuracy": 0.8649211004376411,
    "epoch": 0.5636449037106623,
    "step": 1000
  },
  {
    "loss": 0.5626242446899414,
    "grad_norm": 0.4490196108818054,
    "learning_rate": 2.001654654224499e-05,
    "entropy": 0.5606711500883103,
    "num_tokens": 38163978.0,
    "mean_token_accuracy": 0.8663252631823222,
    "epoch": 0.5777360263034288,
    "step": 1025
  },
  {
    "loss": 0.5654045867919922,
    "grad_norm": 0.4703851044178009,
    "learning_rate": 1.8904758920533988e-05,
    "entropy": 0.5644157862663269,
    "num_tokens": 39100488.0,
    "mean_token_accuracy": 0.8655120352904002,
    "epoch": 0.5918271488961954,
    "step": 1050
  },
  {
    "loss": 0.5633118057250976,
    "grad_norm": 0.507513165473938,
    "learning_rate": 1.780564866597872e-05,
    "entropy": 0.5595145153999329,
    "num_tokens": 40015513.0,
    "mean_token_accuracy": 0.8673883573214213,
    "epoch": 0.6059182714889619,
    "step": 1075
  },
  {
    "loss": 0.5580905532836914,
    "grad_norm": 0.48125702142715454,
    "learning_rate": 1.67215017954996e-05,
    "entropy": 0.5561687298615774,
    "num_tokens": 40947677.0,
    "mean_token_accuracy": 0.8681903723875681,
    "epoch": 0.6200093940817285,
    "step": 1100
  },
  {
    "eval_loss": 0.5575993657112122,
    "eval_runtime": 34.8521,
    "eval_samples_per_second": 14.346,
    "eval_steps_per_second": 0.918,
    "eval_entropy": 0.5570412985980511,
    "eval_num_tokens": 40947677.0,
    "eval_mean_token_accuracy": 0.8659888282418251,
    "epoch": 0.6200093940817285,
    "step": 1100
  },
  {
    "loss": 0.5523509979248047,
    "grad_norm": 0.4786842167377472,
    "learning_rate": 1.5654573203980784e-05,
    "entropy": 0.5489772335688273,
    "num_tokens": 41870358.0,
    "mean_token_accuracy": 0.8689925694465637,
    "epoch": 0.6341005166744951,
    "step": 1125
  },
  {
    "loss": 0.5597280883789062,
    "grad_norm": 0.4649102985858917,
    "learning_rate": 1.4607081974360465e-05,
    "entropy": 0.5547034672896067,
    "num_tokens": 42797849.0,
    "mean_token_accuracy": 0.8681511521339417,
    "epoch": 0.6481916392672616,
    "step": 1150
  },
  {
    "loss": 0.5571650695800782,
    "grad_norm": 0.5057896971702576,
    "learning_rate": 1.3581206762205706e-05,
    "entropy": 0.5546683881680171,
    "num_tokens": 43733188.0,
    "mean_token_accuracy": 0.8681851788361867,
    "epoch": 0.6622827618600282,
    "step": 1175
  },
  {
    "loss": 0.5405771255493164,
    "grad_norm": 0.45025017857551575,
    "learning_rate": 1.257908126437129e-05,
    "entropy": 0.5370355778932572,
    "num_tokens": 44647535.0,
    "mean_token_accuracy": 0.8716498986879985,
    "epoch": 0.6763738844527948,
    "step": 1200
  },
  {
    "eval_loss": 0.5538516640663147,
    "eval_runtime": 34.8956,
    "eval_samples_per_second": 14.328,
    "eval_steps_per_second": 0.917,
    "eval_entropy": 0.561651473864913,
    "eval_num_tokens": 44647535.0,
    "eval_mean_token_accuracy": 0.8666044622659683,
    "epoch": 0.6763738844527948,
    "step": 1200
  },
  {
    "loss": 0.5540570831298828,
    "grad_norm": 0.5011326670646667,
    "learning_rate": 1.1602789781167347e-05,
    "entropy": 0.5510254645347595,
    "num_tokens": 45550031.0,
    "mean_token_accuracy": 0.8685724465052287,
    "epoch": 0.6904650070455612,
    "step": 1225
  },
  {
    "loss": 0.5520057296752929,
    "grad_norm": 0.4687948226928711,
    "learning_rate": 1.0654362881265754e-05,
    "entropy": 0.549973030090332,
    "num_tokens": 46479776.0,
    "mean_token_accuracy": 0.8689675887425741,
    "epoch": 0.7045561296383278,
    "step": 1250
  },
  {
    "loss": 0.5608898544311524,
    "grad_norm": 0.5059524178504944,
    "learning_rate": 9.735773178361964e-06,
    "entropy": 0.5597832387685776,
    "num_tokens": 47436308.0,
    "mean_token_accuracy": 0.867308827638626,
    "epoch": 0.7186472522310944,
    "step": 1275
  },
  {
    "loss": 0.5428831100463867,
    "grad_norm": 0.5085554718971252,
    "learning_rate": 8.848931228376136e-06,
    "entropy": 0.5423163912693659,
    "num_tokens": 48366020.0,
    "mean_token_accuracy": 0.8694934193293253,
    "epoch": 0.732738374823861,
    "step": 1300
  },
  {
    "eval_loss": 0.5501593947410583,
    "eval_runtime": 34.8825,
    "eval_samples_per_second": 14.334,
    "eval_steps_per_second": 0.917,
    "eval_entropy": 0.5527484444901347,
    "eval_num_tokens": 48366020.0,
    "eval_mean_token_accuracy": 0.8676421549171209,
    "epoch": 0.732738374823861,
    "step": 1300
  },
  {
    "loss": 0.5539481353759765,
    "grad_norm": 0.5734500288963318,
    "learning_rate": 7.99568155572701e-06,
    "entropy": 0.5485140432914098,
    "num_tokens": 49280534.0,
    "mean_token_accuracy": 0.8694952615102132,
    "epoch": 0.7468294974166275,
    "step": 1325
  },
  {
    "loss": 0.5532180404663086,
    "grad_norm": 0.4714227020740509,
    "learning_rate": 7.177798816943287e-06,
    "entropy": 0.5533179378509522,
    "num_tokens": 50216029.0,
    "mean_token_accuracy": 0.8688394419352213,
    "epoch": 0.7609206200093941,
    "step": 1350
  },
  {
    "loss": 0.5513345336914063,
    "grad_norm": 0.553312361240387,
    "learning_rate": 6.3969841095918445e-06,
    "entropy": 0.5489596172173818,
    "num_tokens": 51157602.0,
    "mean_token_accuracy": 0.8692836586634318,
    "epoch": 0.7750117426021607,
    "step": 1375
  },
  {
    "loss": 0.5497291564941407,
    "grad_norm": 0.4988526999950409,
    "learning_rate": 5.654861434199757e-06,
    "entropy": 0.5469332609574,
    "num_tokens": 52117292.0,
    "mean_token_accuracy": 0.8688764305909474,
    "epoch": 0.7891028651949272,
    "step": 1400
  },
  {
    "eval_loss": 0.5481391549110413,
    "eval_runtime": 34.9193,
    "eval_samples_per_second": 14.319,
    "eval_steps_per_second": 0.916,
    "eval_entropy": 0.5496508749201894,
    "eval_num_tokens": 52117292.0,
    "eval_mean_token_accuracy": 0.8679818995296955,
    "epoch": 0.7891028651949272,
    "step": 1400
  },
  {
    "loss": 0.541988639831543,
    "grad_norm": 0.49807438254356384,
    "learning_rate": 4.952974316528833e-06,
    "entropy": 0.5369369254509608,
    "num_tokens": 53073200.0,
    "mean_token_accuracy": 0.8717625530560812,
    "epoch": 0.8031939877876938,
    "step": 1425
  },
  {
    "loss": 0.5437938308715821,
    "grad_norm": 0.5071395635604858,
    "learning_rate": 4.292782597227962e-06,
    "entropy": 0.5442028508583705,
    "num_tokens": 54003611.0,
    "mean_token_accuracy": 0.8705387047926585,
    "epoch": 0.8172851103804603,
    "step": 1450
  },
  {
    "loss": 0.5272453689575195,
    "grad_norm": 0.4754573404788971,
    "learning_rate": 3.67565939554044e-06,
    "entropy": 0.5260281827052434,
    "num_tokens": 54934829.0,
    "mean_token_accuracy": 0.873881352742513,
    "epoch": 0.8313762329732268,
    "step": 1475
  },
  {
    "loss": 0.5533076095581054,
    "grad_norm": 0.4666975140571594,
    "learning_rate": 3.1028882533813643e-06,
    "entropy": 0.5506138996283213,
    "num_tokens": 55878834.0,
    "mean_token_accuracy": 0.8685371776421865,
    "epoch": 0.8454673555659934,
    "step": 1500
  },
  {
    "eval_loss": 0.5465222001075745,
    "eval_runtime": 34.9157,
    "eval_samples_per_second": 14.32,
    "eval_steps_per_second": 0.916,
    "eval_entropy": 0.5545311672613025,
    "eval_num_tokens": 55878834.0,
    "eval_mean_token_accuracy": 0.8680466562509537,
    "epoch": 0.8454673555659934,
    "step": 1500
  },
  {
    "loss": 0.5680919265747071,
    "grad_norm": 0.4815196990966797,
    "learning_rate": 2.57566046572508e-06,
    "entropy": 0.5671820533275604,
    "num_tokens": 56792774.0,
    "mean_token_accuracy": 0.8665999062856038,
    "epoch": 0.85955847815876,
    "step": 1525
  },
  {
    "loss": 0.5526847839355469,
    "grad_norm": 0.5315864086151123,
    "learning_rate": 2.0950726028551306e-06,
    "entropy": 0.5491122953097025,
    "num_tokens": 57715569.0,
    "mean_token_accuracy": 0.8698907673358918,
    "epoch": 0.8736496007515265,
    "step": 1550
  },
  {
    "loss": 0.5509255599975585,
    "grad_norm": 0.4776453375816345,
    "learning_rate": 1.6621242296301964e-06,
    "entropy": 0.5463390636444092,
    "num_tokens": 58638742.0,
    "mean_token_accuracy": 0.8696528116861979,
    "epoch": 0.8877407233442931,
    "step": 1575
  },
  {
    "loss": 0.5376947021484375,
    "grad_norm": 0.5089407563209534,
    "learning_rate": 1.2777158265095901e-06,
    "entropy": 0.5351915061473846,
    "num_tokens": 59557570.0,
    "mean_token_accuracy": 0.8726084315776825,
    "epoch": 0.9018318459370597,
    "step": 1600
  },
  {
    "eval_loss": 0.5455822944641113,
    "eval_runtime": 34.9436,
    "eval_samples_per_second": 14.309,
    "eval_steps_per_second": 0.916,
    "eval_entropy": 0.5497067291289568,
    "eval_num_tokens": 59557570.0,
    "eval_mean_token_accuracy": 0.8682057596743107,
    "epoch": 0.9018318459370597,
    "step": 1600
  },
  {
    "loss": 0.5281303787231445,
    "grad_norm": 0.49538421630859375,
    "learning_rate": 9.426469166623764e-07,
    "entropy": 0.5247322716315588,
    "num_tokens": 60483216.0,
    "mean_token_accuracy": 0.874461769660314,
    "epoch": 0.9159229685298262,
    "step": 1625
  },
  {
    "loss": 0.5454143524169922,
    "grad_norm": 0.49807706475257874,
    "learning_rate": 6.576144030555259e-07,
    "entropy": 0.5443872211376826,
    "num_tokens": 61433543.0,
    "mean_token_accuracy": 0.8711319859822592,
    "epoch": 0.9300140911225928,
    "step": 1650
  },
  {
    "loss": 0.5425717926025391,
    "grad_norm": 0.4815911650657654,
    "learning_rate": 4.2321111897965784e-07,
    "entropy": 0.5405582892894745,
    "num_tokens": 62383514.0,
    "mean_token_accuracy": 0.8713100798924764,
    "epoch": 0.9441052137153593,
    "step": 1675
  },
  {
    "loss": 0.5433485412597656,
    "grad_norm": 0.7678675055503845,
    "learning_rate": 2.399245950272466e-07,
    "entropy": 0.5402486324310303,
    "num_tokens": 63316188.0,
    "mean_token_accuracy": 0.8717403117815653,
    "epoch": 0.9581963363081258,
    "step": 1700
  },
  {
    "eval_loss": 0.5451184511184692,
    "eval_runtime": 34.9697,
    "eval_samples_per_second": 14.298,
    "eval_steps_per_second": 0.915,
    "eval_entropy": 0.5487884283065796,
    "eval_num_tokens": 63316188.0,
    "eval_mean_token_accuracy": 0.8685821667313576,
    "epoch": 0.9581963363081258,
    "step": 1700
  },
  {
    "loss": 0.5411444473266601,
    "grad_norm": 0.6061132550239563,
    "learning_rate": 1.0813604508771169e-07,
    "entropy": 0.5386804081996281,
    "num_tokens": 64243151.0,
    "mean_token_accuracy": 0.8711121753851573,
    "epoch": 0.9722874589008924,
    "step": 1725
  },
  {
    "loss": 0.5380656433105468,
    "grad_norm": 0.5095033645629883,
    "learning_rate": 2.811957346845473e-08,
    "entropy": 0.5319451389710108,
    "num_tokens": 65152283.0,
    "mean_token_accuracy": 0.8721891554196676,
    "epoch": 0.986378581493659,
    "step": 1750
  },
  {
    "loss": 0.5395606231689453,
    "grad_norm": 5.098243713378906,
    "learning_rate": 4.160479090409286e-11,
    "entropy": 0.5309042213291958,
    "num_tokens": 66051107.0,
    "mean_token_accuracy": 0.8716224238790314,
    "epoch": 1.0,
    "step": 1775
  },
  {
    "train_runtime": 40018.5607,
    "train_samples_per_second": 4.256,
    "train_steps_per_second": 0.044,
    "total_flos": 8.291891439265674e+18,
    "train_loss": 0.6173567452229245,
    "epoch": 1.0,
    "step": 1775
  }
]