| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5024967808799975, |
| "eval_steps": 200, |
| "global_step": 8000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0006281209760999969, |
| "grad_norm": 0.6444190740585327, |
| "learning_rate": 0.00019996231392500473, |
| "loss": 2.0338, |
| "mean_token_accuracy": 0.6655033957213163, |
| "num_tokens": 25949.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0012562419521999937, |
| "grad_norm": 0.6188492178916931, |
| "learning_rate": 0.0001999204405083433, |
| "loss": 1.2937, |
| "mean_token_accuracy": 0.7404974050819874, |
| "num_tokens": 52745.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0018843629282999906, |
| "grad_norm": 0.38000112771987915, |
| "learning_rate": 0.00019987856709168184, |
| "loss": 1.1577, |
| "mean_token_accuracy": 0.7421227186918259, |
| "num_tokens": 79263.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0025124839043999874, |
| "grad_norm": 0.2685104310512543, |
| "learning_rate": 0.00019983669367502042, |
| "loss": 1.1229, |
| "mean_token_accuracy": 0.7457756631076335, |
| "num_tokens": 105823.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.003140604880499984, |
| "grad_norm": 0.272366464138031, |
| "learning_rate": 0.00019979482025835898, |
| "loss": 1.131, |
| "mean_token_accuracy": 0.744264229759574, |
| "num_tokens": 132844.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.003768725856599981, |
| "grad_norm": 0.29806405305862427, |
| "learning_rate": 0.00019975294684169756, |
| "loss": 1.1209, |
| "mean_token_accuracy": 0.74511832408607, |
| "num_tokens": 160730.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.004396846832699978, |
| "grad_norm": 0.21983323991298676, |
| "learning_rate": 0.0001997110734250361, |
| "loss": 1.1173, |
| "mean_token_accuracy": 0.7465705782175064, |
| "num_tokens": 186800.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.005024967808799975, |
| "grad_norm": 0.3172134757041931, |
| "learning_rate": 0.0001996692000083747, |
| "loss": 1.065, |
| "mean_token_accuracy": 0.751339340209961, |
| "num_tokens": 214113.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.005653088784899972, |
| "grad_norm": 0.2641281485557556, |
| "learning_rate": 0.00019962732659171327, |
| "loss": 1.0829, |
| "mean_token_accuracy": 0.7596591092646122, |
| "num_tokens": 240924.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.006281209760999968, |
| "grad_norm": 0.2266390323638916, |
| "learning_rate": 0.00019958545317505183, |
| "loss": 1.0631, |
| "mean_token_accuracy": 0.7615599595010281, |
| "num_tokens": 268684.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.006909330737099965, |
| "grad_norm": 0.2795678377151489, |
| "learning_rate": 0.0001995435797583904, |
| "loss": 1.0991, |
| "mean_token_accuracy": 0.7610994651913643, |
| "num_tokens": 294547.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.007537451713199962, |
| "grad_norm": 0.3321146070957184, |
| "learning_rate": 0.00019950170634172896, |
| "loss": 1.0841, |
| "mean_token_accuracy": 0.7635640185326338, |
| "num_tokens": 321449.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.008165572689299959, |
| "grad_norm": 0.27341774106025696, |
| "learning_rate": 0.00019945983292506754, |
| "loss": 1.0804, |
| "mean_token_accuracy": 0.7688221096992492, |
| "num_tokens": 347875.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.008793693665399956, |
| "grad_norm": 0.2848992347717285, |
| "learning_rate": 0.0001994179595084061, |
| "loss": 1.1046, |
| "mean_token_accuracy": 0.7561811745166779, |
| "num_tokens": 376044.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.009421814641499953, |
| "grad_norm": 0.23145896196365356, |
| "learning_rate": 0.00019937608609174468, |
| "loss": 1.0284, |
| "mean_token_accuracy": 0.7750971898436546, |
| "num_tokens": 402655.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.01004993561759995, |
| "grad_norm": 0.2593105137348175, |
| "learning_rate": 0.00019933421267508323, |
| "loss": 1.0749, |
| "mean_token_accuracy": 0.7687978703528643, |
| "num_tokens": 428832.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.010678056593699947, |
| "grad_norm": 0.23464348912239075, |
| "learning_rate": 0.00019929233925842179, |
| "loss": 1.0931, |
| "mean_token_accuracy": 0.7663742013275623, |
| "num_tokens": 454748.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.011306177569799944, |
| "grad_norm": 0.26923108100891113, |
| "learning_rate": 0.00019925046584176037, |
| "loss": 1.0557, |
| "mean_token_accuracy": 0.768491517007351, |
| "num_tokens": 482138.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.011934298545899941, |
| "grad_norm": 0.2493171989917755, |
| "learning_rate": 0.00019920859242509892, |
| "loss": 1.0172, |
| "mean_token_accuracy": 0.7774307206273079, |
| "num_tokens": 508277.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.012562419521999936, |
| "grad_norm": 0.2916482090950012, |
| "learning_rate": 0.0001991667190084375, |
| "loss": 1.0439, |
| "mean_token_accuracy": 0.7681754004210234, |
| "num_tokens": 535580.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.013190540498099933, |
| "grad_norm": 0.3120996654033661, |
| "learning_rate": 0.00019912484559177606, |
| "loss": 1.0623, |
| "mean_token_accuracy": 0.770273020491004, |
| "num_tokens": 562472.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.01381866147419993, |
| "grad_norm": 0.259887158870697, |
| "learning_rate": 0.00019908297217511464, |
| "loss": 1.0382, |
| "mean_token_accuracy": 0.7745798997581005, |
| "num_tokens": 588526.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.014446782450299928, |
| "grad_norm": 0.26071885228157043, |
| "learning_rate": 0.0001990410987584532, |
| "loss": 1.0921, |
| "mean_token_accuracy": 0.7673533238470555, |
| "num_tokens": 615067.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.015074903426399925, |
| "grad_norm": 0.2627691328525543, |
| "learning_rate": 0.00019899922534179177, |
| "loss": 1.0575, |
| "mean_token_accuracy": 0.7697854313999415, |
| "num_tokens": 642418.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.015703024402499922, |
| "grad_norm": 0.25009799003601074, |
| "learning_rate": 0.00019895735192513035, |
| "loss": 1.062, |
| "mean_token_accuracy": 0.7729831222444773, |
| "num_tokens": 669932.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.016331145378599917, |
| "grad_norm": 0.25904473662376404, |
| "learning_rate": 0.0001989154785084689, |
| "loss": 1.0908, |
| "mean_token_accuracy": 0.7664148453623056, |
| "num_tokens": 697838.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.016959266354699916, |
| "grad_norm": 0.2615382671356201, |
| "learning_rate": 0.0001988736050918075, |
| "loss": 1.0743, |
| "mean_token_accuracy": 0.7620799005031585, |
| "num_tokens": 726211.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.01758738733079991, |
| "grad_norm": 0.2364761233329773, |
| "learning_rate": 0.00019883173167514604, |
| "loss": 1.0134, |
| "mean_token_accuracy": 0.7783870816230773, |
| "num_tokens": 753627.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.01821550830689991, |
| "grad_norm": 0.3415350615978241, |
| "learning_rate": 0.00019878985825848462, |
| "loss": 1.037, |
| "mean_token_accuracy": 0.7740533579140901, |
| "num_tokens": 779489.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.018843629282999905, |
| "grad_norm": 0.31381550431251526, |
| "learning_rate": 0.00019874798484182318, |
| "loss": 1.0293, |
| "mean_token_accuracy": 0.7746200568974018, |
| "num_tokens": 807064.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.019471750259099904, |
| "grad_norm": 0.3045092523097992, |
| "learning_rate": 0.00019870611142516176, |
| "loss": 0.9941, |
| "mean_token_accuracy": 0.7810021504759789, |
| "num_tokens": 834484.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.0200998712351999, |
| "grad_norm": 0.29344442486763, |
| "learning_rate": 0.0001986642380085003, |
| "loss": 1.0297, |
| "mean_token_accuracy": 0.7791498117148876, |
| "num_tokens": 860075.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.020727992211299895, |
| "grad_norm": 0.377948522567749, |
| "learning_rate": 0.00019862236459183887, |
| "loss": 1.0225, |
| "mean_token_accuracy": 0.7731472756713629, |
| "num_tokens": 887521.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.021356113187399894, |
| "grad_norm": 0.2640627324581146, |
| "learning_rate": 0.00019858049117517745, |
| "loss": 1.0703, |
| "mean_token_accuracy": 0.7751353096216917, |
| "num_tokens": 913836.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.02198423416349989, |
| "grad_norm": 0.2751738727092743, |
| "learning_rate": 0.000198538617758516, |
| "loss": 1.0321, |
| "mean_token_accuracy": 0.7782084301114083, |
| "num_tokens": 941504.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.022612355139599888, |
| "grad_norm": 0.28567424416542053, |
| "learning_rate": 0.00019849674434185458, |
| "loss": 1.0765, |
| "mean_token_accuracy": 0.7735152095556259, |
| "num_tokens": 966993.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.023240476115699883, |
| "grad_norm": 0.24118006229400635, |
| "learning_rate": 0.00019845487092519314, |
| "loss": 1.0261, |
| "mean_token_accuracy": 0.776251096650958, |
| "num_tokens": 993389.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.023868597091799882, |
| "grad_norm": 0.25114327669143677, |
| "learning_rate": 0.00019841299750853172, |
| "loss": 1.0475, |
| "mean_token_accuracy": 0.7716617304831743, |
| "num_tokens": 1020541.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.024496718067899877, |
| "grad_norm": 0.29382482171058655, |
| "learning_rate": 0.0001983711240918703, |
| "loss": 0.9987, |
| "mean_token_accuracy": 0.7752502433955669, |
| "num_tokens": 1046501.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.025124839043999873, |
| "grad_norm": 0.23243650794029236, |
| "learning_rate": 0.00019832925067520885, |
| "loss": 1.0581, |
| "mean_token_accuracy": 0.7718209594488143, |
| "num_tokens": 1071892.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.02575296002009987, |
| "grad_norm": 0.2621685564517975, |
| "learning_rate": 0.00019828737725854743, |
| "loss": 1.0397, |
| "mean_token_accuracy": 0.7732171807438135, |
| "num_tokens": 1097346.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.026381080996199867, |
| "grad_norm": 0.24452239274978638, |
| "learning_rate": 0.000198245503841886, |
| "loss": 1.0338, |
| "mean_token_accuracy": 0.775847963243723, |
| "num_tokens": 1124990.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.027009201972299866, |
| "grad_norm": 0.2270130068063736, |
| "learning_rate": 0.00019820363042522457, |
| "loss": 1.0344, |
| "mean_token_accuracy": 0.7744502332061529, |
| "num_tokens": 1152942.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.02763732294839986, |
| "grad_norm": 0.22909750044345856, |
| "learning_rate": 0.00019816175700856312, |
| "loss": 1.0799, |
| "mean_token_accuracy": 0.7713461548089982, |
| "num_tokens": 1178395.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.02826544392449986, |
| "grad_norm": 0.25798511505126953, |
| "learning_rate": 0.0001981198835919017, |
| "loss": 1.0087, |
| "mean_token_accuracy": 0.7833342991769314, |
| "num_tokens": 1205452.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.028893564900599855, |
| "grad_norm": 0.2909473478794098, |
| "learning_rate": 0.00019807801017524026, |
| "loss": 1.0093, |
| "mean_token_accuracy": 0.7790829930454493, |
| "num_tokens": 1231125.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.029521685876699854, |
| "grad_norm": 0.2601008117198944, |
| "learning_rate": 0.0001980361367585788, |
| "loss": 1.003, |
| "mean_token_accuracy": 0.7839790925383567, |
| "num_tokens": 1258342.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.03014980685279985, |
| "grad_norm": 0.23418252170085907, |
| "learning_rate": 0.0001979942633419174, |
| "loss": 1.012, |
| "mean_token_accuracy": 0.7800652399659157, |
| "num_tokens": 1285135.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.030777927828899845, |
| "grad_norm": 0.23981328308582306, |
| "learning_rate": 0.00019795238992525595, |
| "loss": 1.0519, |
| "mean_token_accuracy": 0.7733560837805271, |
| "num_tokens": 1310716.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.031406048804999843, |
| "grad_norm": 0.25937291979789734, |
| "learning_rate": 0.00019791051650859453, |
| "loss": 0.9777, |
| "mean_token_accuracy": 0.7844050772488117, |
| "num_tokens": 1338308.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.03203416978109984, |
| "grad_norm": 0.2411338984966278, |
| "learning_rate": 0.00019786864309193308, |
| "loss": 1.0141, |
| "mean_token_accuracy": 0.78155472651124, |
| "num_tokens": 1365889.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.032662290757199834, |
| "grad_norm": 0.24309079349040985, |
| "learning_rate": 0.00019782676967527166, |
| "loss": 1.0542, |
| "mean_token_accuracy": 0.7737418331205845, |
| "num_tokens": 1392368.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.033290411733299836, |
| "grad_norm": 0.26009315252304077, |
| "learning_rate": 0.00019778489625861022, |
| "loss": 1.002, |
| "mean_token_accuracy": 0.7816768281161786, |
| "num_tokens": 1418557.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.03391853270939983, |
| "grad_norm": 0.25517457723617554, |
| "learning_rate": 0.0001977430228419488, |
| "loss": 1.0178, |
| "mean_token_accuracy": 0.7780437018722296, |
| "num_tokens": 1444536.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.03454665368549983, |
| "grad_norm": 0.2931221127510071, |
| "learning_rate": 0.00019770114942528738, |
| "loss": 1.0251, |
| "mean_token_accuracy": 0.7784773204475641, |
| "num_tokens": 1472148.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.03517477466159982, |
| "grad_norm": 0.2510989308357239, |
| "learning_rate": 0.00019765927600862593, |
| "loss": 1.0322, |
| "mean_token_accuracy": 0.7775144059211015, |
| "num_tokens": 1497252.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.03580289563769982, |
| "grad_norm": 0.24499671161174774, |
| "learning_rate": 0.00019761740259196451, |
| "loss": 0.9935, |
| "mean_token_accuracy": 0.7875256646424532, |
| "num_tokens": 1523531.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.03643101661379982, |
| "grad_norm": 0.24137680232524872, |
| "learning_rate": 0.00019757552917530307, |
| "loss": 1.0253, |
| "mean_token_accuracy": 0.7757651243358851, |
| "num_tokens": 1550114.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.037059137589899815, |
| "grad_norm": 0.2509494125843048, |
| "learning_rate": 0.00019753365575864165, |
| "loss": 1.0644, |
| "mean_token_accuracy": 0.7731641355901957, |
| "num_tokens": 1577217.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.03768725856599981, |
| "grad_norm": 0.22997072339057922, |
| "learning_rate": 0.0001974917823419802, |
| "loss": 1.0306, |
| "mean_token_accuracy": 0.7778335962444544, |
| "num_tokens": 1605094.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.038315379542099806, |
| "grad_norm": 0.2381758838891983, |
| "learning_rate": 0.00019744990892531876, |
| "loss": 1.0126, |
| "mean_token_accuracy": 0.7801825743168592, |
| "num_tokens": 1631692.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.03894350051819981, |
| "grad_norm": 0.20709013938903809, |
| "learning_rate": 0.00019740803550865734, |
| "loss": 1.0068, |
| "mean_token_accuracy": 0.7762394435703754, |
| "num_tokens": 1660071.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.039571621494299804, |
| "grad_norm": 0.2484230399131775, |
| "learning_rate": 0.0001973661620919959, |
| "loss": 1.0002, |
| "mean_token_accuracy": 0.7832283467054367, |
| "num_tokens": 1688142.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.0401997424703998, |
| "grad_norm": 0.29590943455696106, |
| "learning_rate": 0.00019732428867533447, |
| "loss": 1.0198, |
| "mean_token_accuracy": 0.780064957216382, |
| "num_tokens": 1714501.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.040827863446499794, |
| "grad_norm": 0.2250148206949234, |
| "learning_rate": 0.00019728241525867303, |
| "loss": 1.0231, |
| "mean_token_accuracy": 0.7774009238928556, |
| "num_tokens": 1740189.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.04145598442259979, |
| "grad_norm": 0.2383430153131485, |
| "learning_rate": 0.0001972405418420116, |
| "loss": 1.0058, |
| "mean_token_accuracy": 0.7813835583627224, |
| "num_tokens": 1767202.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.04208410539869979, |
| "grad_norm": 0.2511632442474365, |
| "learning_rate": 0.00019719866842535016, |
| "loss": 1.0022, |
| "mean_token_accuracy": 0.779718442261219, |
| "num_tokens": 1792213.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.04271222637479979, |
| "grad_norm": 0.24368815124034882, |
| "learning_rate": 0.00019715679500868874, |
| "loss": 1.0201, |
| "mean_token_accuracy": 0.776063310727477, |
| "num_tokens": 1818181.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.04334034735089978, |
| "grad_norm": 0.2473301738500595, |
| "learning_rate": 0.00019711492159202733, |
| "loss": 1.0522, |
| "mean_token_accuracy": 0.7746416825801135, |
| "num_tokens": 1844507.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.04396846832699978, |
| "grad_norm": 0.24195240437984467, |
| "learning_rate": 0.00019707304817536588, |
| "loss": 1.0105, |
| "mean_token_accuracy": 0.7786858607083559, |
| "num_tokens": 1871297.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.04459658930309978, |
| "grad_norm": 0.20953992009162903, |
| "learning_rate": 0.00019703117475870446, |
| "loss": 0.9984, |
| "mean_token_accuracy": 0.7896967530250549, |
| "num_tokens": 1897039.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.045224710279199776, |
| "grad_norm": 0.24665352702140808, |
| "learning_rate": 0.00019698930134204301, |
| "loss": 1.0206, |
| "mean_token_accuracy": 0.7788492277264595, |
| "num_tokens": 1925344.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.04585283125529977, |
| "grad_norm": 0.25865861773490906, |
| "learning_rate": 0.0001969474279253816, |
| "loss": 0.9794, |
| "mean_token_accuracy": 0.7853762939572334, |
| "num_tokens": 1952529.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.046480952231399766, |
| "grad_norm": 0.26470091938972473, |
| "learning_rate": 0.00019690555450872015, |
| "loss": 1.0315, |
| "mean_token_accuracy": 0.776170663908124, |
| "num_tokens": 1979215.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.04710907320749976, |
| "grad_norm": 0.24201270937919617, |
| "learning_rate": 0.0001968636810920587, |
| "loss": 1.0579, |
| "mean_token_accuracy": 0.7728814825415611, |
| "num_tokens": 2005105.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.047737194183599764, |
| "grad_norm": 0.2657768428325653, |
| "learning_rate": 0.00019682180767539728, |
| "loss": 1.0635, |
| "mean_token_accuracy": 0.7685952417552471, |
| "num_tokens": 2031227.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.04836531515969976, |
| "grad_norm": 0.24351350963115692, |
| "learning_rate": 0.00019677993425873584, |
| "loss": 1.0285, |
| "mean_token_accuracy": 0.7760228164494037, |
| "num_tokens": 2058110.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.048993436135799755, |
| "grad_norm": 0.29012376070022583, |
| "learning_rate": 0.00019673806084207442, |
| "loss": 1.0441, |
| "mean_token_accuracy": 0.7735719617456198, |
| "num_tokens": 2085226.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.04962155711189975, |
| "grad_norm": 0.3338630795478821, |
| "learning_rate": 0.00019669618742541297, |
| "loss": 1.0605, |
| "mean_token_accuracy": 0.7712657749652863, |
| "num_tokens": 2113154.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.050249678087999745, |
| "grad_norm": 0.2387358844280243, |
| "learning_rate": 0.00019665431400875155, |
| "loss": 0.9938, |
| "mean_token_accuracy": 0.7843167375773191, |
| "num_tokens": 2139881.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.05087779906409975, |
| "grad_norm": 0.2398860901594162, |
| "learning_rate": 0.0001966124405920901, |
| "loss": 1.0617, |
| "mean_token_accuracy": 0.7687791418284178, |
| "num_tokens": 2167003.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.05150592004019974, |
| "grad_norm": 0.2620822489261627, |
| "learning_rate": 0.0001965705671754287, |
| "loss": 1.0088, |
| "mean_token_accuracy": 0.7815865609794855, |
| "num_tokens": 2193645.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.05213404101629974, |
| "grad_norm": 0.26973757147789, |
| "learning_rate": 0.00019652869375876724, |
| "loss": 0.9954, |
| "mean_token_accuracy": 0.78002959638834, |
| "num_tokens": 2220126.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.052762161992399734, |
| "grad_norm": 0.2633202075958252, |
| "learning_rate": 0.00019648682034210582, |
| "loss": 1.0282, |
| "mean_token_accuracy": 0.7779554452747106, |
| "num_tokens": 2245761.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.053390282968499736, |
| "grad_norm": 0.22578182816505432, |
| "learning_rate": 0.0001964449469254444, |
| "loss": 1.0081, |
| "mean_token_accuracy": 0.7812429942190647, |
| "num_tokens": 2273109.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.05401840394459973, |
| "grad_norm": 0.23409296572208405, |
| "learning_rate": 0.00019640307350878296, |
| "loss": 1.0229, |
| "mean_token_accuracy": 0.7737102590501308, |
| "num_tokens": 2301498.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.05464652492069973, |
| "grad_norm": 0.28782615065574646, |
| "learning_rate": 0.00019636120009212154, |
| "loss": 0.9995, |
| "mean_token_accuracy": 0.7862726211547851, |
| "num_tokens": 2327875.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.05527464589679972, |
| "grad_norm": 0.250499427318573, |
| "learning_rate": 0.0001963193266754601, |
| "loss": 1.0047, |
| "mean_token_accuracy": 0.7766373138874769, |
| "num_tokens": 2354761.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.05590276687289972, |
| "grad_norm": 0.266989141702652, |
| "learning_rate": 0.00019627745325879868, |
| "loss": 1.0043, |
| "mean_token_accuracy": 0.7816190734505654, |
| "num_tokens": 2381600.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.05653088784899972, |
| "grad_norm": 0.23852886259555817, |
| "learning_rate": 0.00019623557984213723, |
| "loss": 1.001, |
| "mean_token_accuracy": 0.7850340217351913, |
| "num_tokens": 2409324.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.057159008825099715, |
| "grad_norm": 0.2646239697933197, |
| "learning_rate": 0.00019619370642547578, |
| "loss": 1.0083, |
| "mean_token_accuracy": 0.7854118514806032, |
| "num_tokens": 2434136.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.05778712980119971, |
| "grad_norm": 0.32965826988220215, |
| "learning_rate": 0.00019615183300881437, |
| "loss": 1.0327, |
| "mean_token_accuracy": 0.778333380818367, |
| "num_tokens": 2460120.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.058415250777299706, |
| "grad_norm": 0.2623177468776703, |
| "learning_rate": 0.00019610995959215292, |
| "loss": 0.9889, |
| "mean_token_accuracy": 0.7845939747989178, |
| "num_tokens": 2487177.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.05904337175339971, |
| "grad_norm": 0.24970988929271698, |
| "learning_rate": 0.0001960680861754915, |
| "loss": 1.0301, |
| "mean_token_accuracy": 0.7751765877008439, |
| "num_tokens": 2513428.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.0596714927294997, |
| "grad_norm": 0.21225541830062866, |
| "learning_rate": 0.00019602621275883005, |
| "loss": 1.0269, |
| "mean_token_accuracy": 0.7762201461941004, |
| "num_tokens": 2539126.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.0602996137055997, |
| "grad_norm": 0.22666792571544647, |
| "learning_rate": 0.00019598433934216864, |
| "loss": 1.0081, |
| "mean_token_accuracy": 0.7794944658875466, |
| "num_tokens": 2565063.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.060927734681699694, |
| "grad_norm": 0.263004332780838, |
| "learning_rate": 0.0001959424659255072, |
| "loss": 1.0402, |
| "mean_token_accuracy": 0.7806816603988409, |
| "num_tokens": 2591221.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.06155585565779969, |
| "grad_norm": 0.2698504626750946, |
| "learning_rate": 0.00019590059250884577, |
| "loss": 1.008, |
| "mean_token_accuracy": 0.7808872204273939, |
| "num_tokens": 2617287.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.06218397663389969, |
| "grad_norm": 0.23650215566158295, |
| "learning_rate": 0.00019585871909218435, |
| "loss": 0.9776, |
| "mean_token_accuracy": 0.7856047466397286, |
| "num_tokens": 2644243.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.06281209760999969, |
| "grad_norm": 0.2340182512998581, |
| "learning_rate": 0.0001958168456755229, |
| "loss": 1.0056, |
| "mean_token_accuracy": 0.7824073404073715, |
| "num_tokens": 2670435.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.06344021858609969, |
| "grad_norm": 0.29255470633506775, |
| "learning_rate": 0.0001957749722588615, |
| "loss": 1.0055, |
| "mean_token_accuracy": 0.7834579069167376, |
| "num_tokens": 2697341.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.06406833956219968, |
| "grad_norm": 0.27291226387023926, |
| "learning_rate": 0.00019573309884220004, |
| "loss": 1.0239, |
| "mean_token_accuracy": 0.7748380672186613, |
| "num_tokens": 2724939.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.06469646053829968, |
| "grad_norm": 0.26702409982681274, |
| "learning_rate": 0.00019569122542553862, |
| "loss": 0.9629, |
| "mean_token_accuracy": 0.7890582896769047, |
| "num_tokens": 2751944.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.06532458151439967, |
| "grad_norm": 0.34549906849861145, |
| "learning_rate": 0.00019564935200887718, |
| "loss": 0.9939, |
| "mean_token_accuracy": 0.7854917496442795, |
| "num_tokens": 2778069.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.06595270249049967, |
| "grad_norm": 0.29321590065956116, |
| "learning_rate": 0.00019560747859221573, |
| "loss": 0.9785, |
| "mean_token_accuracy": 0.781370873004198, |
| "num_tokens": 2805448.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.06658082346659967, |
| "grad_norm": 0.27075111865997314, |
| "learning_rate": 0.0001955656051755543, |
| "loss": 1.0026, |
| "mean_token_accuracy": 0.7849268738180399, |
| "num_tokens": 2832499.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.06720894444269966, |
| "grad_norm": 0.2806377410888672, |
| "learning_rate": 0.00019552373175889286, |
| "loss": 1.0041, |
| "mean_token_accuracy": 0.7824723150581121, |
| "num_tokens": 2859168.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.06783706541879966, |
| "grad_norm": 0.2728383541107178, |
| "learning_rate": 0.00019548185834223145, |
| "loss": 1.0094, |
| "mean_token_accuracy": 0.7800588298588991, |
| "num_tokens": 2884736.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.06846518639489965, |
| "grad_norm": 0.27712950110435486, |
| "learning_rate": 0.00019543998492557, |
| "loss": 0.9936, |
| "mean_token_accuracy": 0.7834884870797396, |
| "num_tokens": 2912858.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.06909330737099965, |
| "grad_norm": 0.228750541806221, |
| "learning_rate": 0.00019539811150890858, |
| "loss": 1.0505, |
| "mean_token_accuracy": 0.7716195099055767, |
| "num_tokens": 2938752.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.06972142834709966, |
| "grad_norm": 0.29332438111305237, |
| "learning_rate": 0.00019535623809224714, |
| "loss": 1.0268, |
| "mean_token_accuracy": 0.7755599562078714, |
| "num_tokens": 2965362.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.07034954932319964, |
| "grad_norm": 0.2691513001918793, |
| "learning_rate": 0.00019531436467558572, |
| "loss": 0.9991, |
| "mean_token_accuracy": 0.7823032017797231, |
| "num_tokens": 2993187.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.07097767029929965, |
| "grad_norm": 0.2795611321926117, |
| "learning_rate": 0.0001952724912589243, |
| "loss": 0.9933, |
| "mean_token_accuracy": 0.7823376722633839, |
| "num_tokens": 3019494.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.07160579127539964, |
| "grad_norm": 0.2678844928741455, |
| "learning_rate": 0.00019523061784226285, |
| "loss": 0.996, |
| "mean_token_accuracy": 0.7836552064865827, |
| "num_tokens": 3047562.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.07223391225149964, |
| "grad_norm": 0.24954286217689514, |
| "learning_rate": 0.00019518874442560143, |
| "loss": 1.0048, |
| "mean_token_accuracy": 0.7796858191490174, |
| "num_tokens": 3074447.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.07286203322759964, |
| "grad_norm": 0.2411104440689087, |
| "learning_rate": 0.00019514687100893999, |
| "loss": 0.9947, |
| "mean_token_accuracy": 0.7852891199290752, |
| "num_tokens": 3101800.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.07349015420369963, |
| "grad_norm": 0.26690879464149475, |
| "learning_rate": 0.00019510499759227857, |
| "loss": 0.9629, |
| "mean_token_accuracy": 0.7877023875713348, |
| "num_tokens": 3129214.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.07411827517979963, |
| "grad_norm": 0.2763614058494568, |
| "learning_rate": 0.00019506312417561712, |
| "loss": 1.0005, |
| "mean_token_accuracy": 0.7793677289038896, |
| "num_tokens": 3156811.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.07474639615589963, |
| "grad_norm": 0.28668391704559326, |
| "learning_rate": 0.00019502125075895568, |
| "loss": 1.0135, |
| "mean_token_accuracy": 0.7761840496212244, |
| "num_tokens": 3183787.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.07537451713199962, |
| "grad_norm": 0.32997289299964905, |
| "learning_rate": 0.00019497937734229426, |
| "loss": 1.0379, |
| "mean_token_accuracy": 0.7786177668720484, |
| "num_tokens": 3209022.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.07600263810809962, |
| "grad_norm": 0.30645307898521423, |
| "learning_rate": 0.0001949375039256328, |
| "loss": 1.0069, |
| "mean_token_accuracy": 0.7809740912169219, |
| "num_tokens": 3237283.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.07663075908419961, |
| "grad_norm": 0.25995928049087524, |
| "learning_rate": 0.0001948956305089714, |
| "loss": 1.0102, |
| "mean_token_accuracy": 0.7820904731750489, |
| "num_tokens": 3262891.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.07725888006029961, |
| "grad_norm": 0.2744593024253845, |
| "learning_rate": 0.00019485375709230995, |
| "loss": 0.9766, |
| "mean_token_accuracy": 0.7883562445640564, |
| "num_tokens": 3290445.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.07788700103639962, |
| "grad_norm": 0.28681257367134094, |
| "learning_rate": 0.00019481188367564853, |
| "loss": 1.0141, |
| "mean_token_accuracy": 0.7812411531805992, |
| "num_tokens": 3316283.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.0785151220124996, |
| "grad_norm": 0.2726808190345764, |
| "learning_rate": 0.00019477001025898708, |
| "loss": 1.0456, |
| "mean_token_accuracy": 0.7727174177765846, |
| "num_tokens": 3344457.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.07914324298859961, |
| "grad_norm": 0.22378629446029663, |
| "learning_rate": 0.00019472813684232563, |
| "loss": 1.057, |
| "mean_token_accuracy": 0.77296442091465, |
| "num_tokens": 3371868.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.0797713639646996, |
| "grad_norm": 0.2829609513282776, |
| "learning_rate": 0.00019468626342566422, |
| "loss": 0.9634, |
| "mean_token_accuracy": 0.7868273138999939, |
| "num_tokens": 3399400.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.0803994849407996, |
| "grad_norm": 0.2706160247325897, |
| "learning_rate": 0.0001946443900090028, |
| "loss": 0.9711, |
| "mean_token_accuracy": 0.7930981118232012, |
| "num_tokens": 3424763.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0810276059168996, |
| "grad_norm": 0.27218466997146606, |
| "learning_rate": 0.00019460251659234138, |
| "loss": 0.9635, |
| "mean_token_accuracy": 0.7916820932179689, |
| "num_tokens": 3451380.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.08165572689299959, |
| "grad_norm": 0.2326808124780655, |
| "learning_rate": 0.00019456064317567993, |
| "loss": 0.9921, |
| "mean_token_accuracy": 0.7916632521897554, |
| "num_tokens": 3477301.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.08228384786909959, |
| "grad_norm": 0.3434126079082489, |
| "learning_rate": 0.0001945187697590185, |
| "loss": 0.9911, |
| "mean_token_accuracy": 0.788581146299839, |
| "num_tokens": 3504062.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.08291196884519958, |
| "grad_norm": 0.24641671776771545, |
| "learning_rate": 0.00019447689634235707, |
| "loss": 1.0139, |
| "mean_token_accuracy": 0.7832327298820019, |
| "num_tokens": 3530414.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.08354008982129958, |
| "grad_norm": 0.28022703528404236, |
| "learning_rate": 0.00019443502292569565, |
| "loss": 1.0172, |
| "mean_token_accuracy": 0.7799417782574892, |
| "num_tokens": 3557839.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.08416821079739958, |
| "grad_norm": 0.2621849477291107, |
| "learning_rate": 0.0001943931495090342, |
| "loss": 0.9954, |
| "mean_token_accuracy": 0.7841709833592176, |
| "num_tokens": 3584652.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.08479633177349957, |
| "grad_norm": 0.2990066111087799, |
| "learning_rate": 0.00019435127609237276, |
| "loss": 0.9938, |
| "mean_token_accuracy": 0.784092029184103, |
| "num_tokens": 3611525.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.08542445274959957, |
| "grad_norm": 0.28870198130607605, |
| "learning_rate": 0.00019430940267571134, |
| "loss": 0.9715, |
| "mean_token_accuracy": 0.788496358320117, |
| "num_tokens": 3638315.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.08605257372569956, |
| "grad_norm": 0.2864430546760559, |
| "learning_rate": 0.0001942675292590499, |
| "loss": 0.9974, |
| "mean_token_accuracy": 0.7812655068933964, |
| "num_tokens": 3665550.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.08668069470179957, |
| "grad_norm": 0.2753150165081024, |
| "learning_rate": 0.00019422565584238847, |
| "loss": 1.0013, |
| "mean_token_accuracy": 0.7857158094644546, |
| "num_tokens": 3692171.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.08730881567789957, |
| "grad_norm": 0.29843252897262573, |
| "learning_rate": 0.00019418378242572703, |
| "loss": 0.9665, |
| "mean_token_accuracy": 0.7897020474076271, |
| "num_tokens": 3720109.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.08793693665399956, |
| "grad_norm": 0.2527588903903961, |
| "learning_rate": 0.0001941419090090656, |
| "loss": 0.9981, |
| "mean_token_accuracy": 0.7841325510293246, |
| "num_tokens": 3746495.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.08856505763009956, |
| "grad_norm": 0.23994463682174683, |
| "learning_rate": 0.00019410003559240416, |
| "loss": 1.0203, |
| "mean_token_accuracy": 0.7768059551715851, |
| "num_tokens": 3773355.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.08919317860619956, |
| "grad_norm": 0.2996773421764374, |
| "learning_rate": 0.00019405816217574274, |
| "loss": 0.9721, |
| "mean_token_accuracy": 0.7872179444879294, |
| "num_tokens": 3800935.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.08982129958229955, |
| "grad_norm": 0.2415090799331665, |
| "learning_rate": 0.00019401628875908132, |
| "loss": 1.0031, |
| "mean_token_accuracy": 0.7829441606998444, |
| "num_tokens": 3828006.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.09044942055839955, |
| "grad_norm": 0.23195892572402954, |
| "learning_rate": 0.00019397441534241988, |
| "loss": 0.9699, |
| "mean_token_accuracy": 0.7887255474925041, |
| "num_tokens": 3855488.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.09107754153449954, |
| "grad_norm": 0.2979389429092407, |
| "learning_rate": 0.00019393254192575846, |
| "loss": 1.0531, |
| "mean_token_accuracy": 0.77476004101336, |
| "num_tokens": 3881914.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.09170566251059954, |
| "grad_norm": 0.23635777831077576, |
| "learning_rate": 0.000193890668509097, |
| "loss": 1.0105, |
| "mean_token_accuracy": 0.7829858396202326, |
| "num_tokens": 3908974.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.09233378348669954, |
| "grad_norm": 0.2289458066225052, |
| "learning_rate": 0.0001938487950924356, |
| "loss": 1.023, |
| "mean_token_accuracy": 0.7806610990315676, |
| "num_tokens": 3936336.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.09296190446279953, |
| "grad_norm": 0.24525542557239532, |
| "learning_rate": 0.00019380692167577415, |
| "loss": 1.0107, |
| "mean_token_accuracy": 0.7814721431583166, |
| "num_tokens": 3962693.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.09359002543889954, |
| "grad_norm": 0.2598733603954315, |
| "learning_rate": 0.0001937650482591127, |
| "loss": 0.9717, |
| "mean_token_accuracy": 0.7862234275788069, |
| "num_tokens": 3988713.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.09421814641499952, |
| "grad_norm": 0.23323708772659302, |
| "learning_rate": 0.00019372317484245128, |
| "loss": 1.0059, |
| "mean_token_accuracy": 0.782355098053813, |
| "num_tokens": 4016895.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.09484626739109953, |
| "grad_norm": 0.24204787611961365, |
| "learning_rate": 0.00019368130142578984, |
| "loss": 0.9812, |
| "mean_token_accuracy": 0.787716443836689, |
| "num_tokens": 4043591.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.09547438836719953, |
| "grad_norm": 0.26067280769348145, |
| "learning_rate": 0.00019363942800912842, |
| "loss": 0.996, |
| "mean_token_accuracy": 0.7856833711266518, |
| "num_tokens": 4071338.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.09610250934329952, |
| "grad_norm": 0.3182675540447235, |
| "learning_rate": 0.00019359755459246697, |
| "loss": 0.9715, |
| "mean_token_accuracy": 0.7866728454828262, |
| "num_tokens": 4098744.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.09673063031939952, |
| "grad_norm": 0.3301153779029846, |
| "learning_rate": 0.00019355568117580555, |
| "loss": 1.0048, |
| "mean_token_accuracy": 0.7794850755482912, |
| "num_tokens": 4127367.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.09735875129549951, |
| "grad_norm": 0.26787373423576355, |
| "learning_rate": 0.0001935138077591441, |
| "loss": 1.0076, |
| "mean_token_accuracy": 0.7816127564758062, |
| "num_tokens": 4154162.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.09798687227159951, |
| "grad_norm": 0.30029037594795227, |
| "learning_rate": 0.0001934719343424827, |
| "loss": 0.9324, |
| "mean_token_accuracy": 0.7917449362576008, |
| "num_tokens": 4182109.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.09861499324769951, |
| "grad_norm": 0.24414442479610443, |
| "learning_rate": 0.00019343006092582124, |
| "loss": 1.0249, |
| "mean_token_accuracy": 0.7818967200815677, |
| "num_tokens": 4209142.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.0992431142237995, |
| "grad_norm": 0.26703324913978577, |
| "learning_rate": 0.00019338818750915982, |
| "loss": 1.0002, |
| "mean_token_accuracy": 0.7795033905655145, |
| "num_tokens": 4235912.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.0998712351998995, |
| "grad_norm": 0.28080078959465027, |
| "learning_rate": 0.0001933463140924984, |
| "loss": 0.9852, |
| "mean_token_accuracy": 0.786427366361022, |
| "num_tokens": 4262060.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.10049935617599949, |
| "grad_norm": 0.26151251792907715, |
| "learning_rate": 0.00019330444067583696, |
| "loss": 0.9857, |
| "mean_token_accuracy": 0.7810143373906613, |
| "num_tokens": 4289261.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.1011274771520995, |
| "grad_norm": 0.2997615933418274, |
| "learning_rate": 0.00019326256725917554, |
| "loss": 0.9968, |
| "mean_token_accuracy": 0.7858185056596995, |
| "num_tokens": 4316403.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.1017555981281995, |
| "grad_norm": 0.2625775635242462, |
| "learning_rate": 0.0001932206938425141, |
| "loss": 0.97, |
| "mean_token_accuracy": 0.7898166347295046, |
| "num_tokens": 4342577.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.10238371910429948, |
| "grad_norm": 0.3279540538787842, |
| "learning_rate": 0.00019317882042585265, |
| "loss": 0.9943, |
| "mean_token_accuracy": 0.7791620220988988, |
| "num_tokens": 4369956.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.10301184008039949, |
| "grad_norm": 0.25715646147727966, |
| "learning_rate": 0.00019313694700919123, |
| "loss": 1.0133, |
| "mean_token_accuracy": 0.7784452803432942, |
| "num_tokens": 4396863.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.10363996105649949, |
| "grad_norm": 0.28638651967048645, |
| "learning_rate": 0.00019309507359252978, |
| "loss": 0.9625, |
| "mean_token_accuracy": 0.7899250488728284, |
| "num_tokens": 4423603.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.10426808203259948, |
| "grad_norm": 0.29546940326690674, |
| "learning_rate": 0.00019305320017586836, |
| "loss": 0.979, |
| "mean_token_accuracy": 0.7873238857835532, |
| "num_tokens": 4450002.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.10489620300869948, |
| "grad_norm": 0.3040964901447296, |
| "learning_rate": 0.00019301132675920692, |
| "loss": 0.9833, |
| "mean_token_accuracy": 0.7876588020473718, |
| "num_tokens": 4477401.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.10552432398479947, |
| "grad_norm": 0.32785487174987793, |
| "learning_rate": 0.0001929694533425455, |
| "loss": 0.996, |
| "mean_token_accuracy": 0.7845009371638298, |
| "num_tokens": 4503456.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.10615244496089947, |
| "grad_norm": 0.28163284063339233, |
| "learning_rate": 0.00019292757992588405, |
| "loss": 1.0135, |
| "mean_token_accuracy": 0.7795811247080564, |
| "num_tokens": 4529752.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.10678056593699947, |
| "grad_norm": 0.28460705280303955, |
| "learning_rate": 0.0001928857065092226, |
| "loss": 0.9923, |
| "mean_token_accuracy": 0.7835657082498073, |
| "num_tokens": 4557856.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.10740868691309946, |
| "grad_norm": 0.2662385106086731, |
| "learning_rate": 0.0001928438330925612, |
| "loss": 0.9934, |
| "mean_token_accuracy": 0.7865429297089577, |
| "num_tokens": 4583859.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.10803680788919946, |
| "grad_norm": 0.2522580325603485, |
| "learning_rate": 0.00019280195967589977, |
| "loss": 1.0404, |
| "mean_token_accuracy": 0.7729208268225193, |
| "num_tokens": 4611276.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.10866492886529945, |
| "grad_norm": 0.2595483958721161, |
| "learning_rate": 0.00019276008625923835, |
| "loss": 1.0, |
| "mean_token_accuracy": 0.7807778958231211, |
| "num_tokens": 4637120.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.10929304984139945, |
| "grad_norm": 0.26809316873550415, |
| "learning_rate": 0.0001927182128425769, |
| "loss": 1.0054, |
| "mean_token_accuracy": 0.785046449303627, |
| "num_tokens": 4663842.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.10992117081749946, |
| "grad_norm": 0.2990230917930603, |
| "learning_rate": 0.00019267633942591548, |
| "loss": 0.9824, |
| "mean_token_accuracy": 0.7885061156004667, |
| "num_tokens": 4689968.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.11054929179359944, |
| "grad_norm": 0.3112734854221344, |
| "learning_rate": 0.00019263446600925404, |
| "loss": 0.9684, |
| "mean_token_accuracy": 0.7874101549386978, |
| "num_tokens": 4716585.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.11117741276969945, |
| "grad_norm": 0.30101364850997925, |
| "learning_rate": 0.0001925925925925926, |
| "loss": 1.0086, |
| "mean_token_accuracy": 0.7845278918743134, |
| "num_tokens": 4742455.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.11180553374579943, |
| "grad_norm": 0.25282517075538635, |
| "learning_rate": 0.00019255071917593117, |
| "loss": 1.0152, |
| "mean_token_accuracy": 0.779124328121543, |
| "num_tokens": 4768991.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.11243365472189944, |
| "grad_norm": 0.29368168115615845, |
| "learning_rate": 0.00019250884575926973, |
| "loss": 1.0208, |
| "mean_token_accuracy": 0.7812142610549927, |
| "num_tokens": 4794582.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.11306177569799944, |
| "grad_norm": 0.278226763010025, |
| "learning_rate": 0.0001924669723426083, |
| "loss": 0.9807, |
| "mean_token_accuracy": 0.7866754315793514, |
| "num_tokens": 4821482.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.11368989667409943, |
| "grad_norm": 0.26571571826934814, |
| "learning_rate": 0.00019242509892594686, |
| "loss": 0.9815, |
| "mean_token_accuracy": 0.7797151349484921, |
| "num_tokens": 4849134.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.11431801765019943, |
| "grad_norm": 0.31266430020332336, |
| "learning_rate": 0.00019238322550928544, |
| "loss": 0.9634, |
| "mean_token_accuracy": 0.7930579505860805, |
| "num_tokens": 4875237.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.11494613862629942, |
| "grad_norm": 0.28882619738578796, |
| "learning_rate": 0.000192341352092624, |
| "loss": 1.0578, |
| "mean_token_accuracy": 0.7782481156289578, |
| "num_tokens": 4901212.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.11557425960239942, |
| "grad_norm": 0.29485785961151123, |
| "learning_rate": 0.00019229947867596255, |
| "loss": 1.014, |
| "mean_token_accuracy": 0.7824111267924309, |
| "num_tokens": 4928104.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.11620238057849942, |
| "grad_norm": 0.2997286021709442, |
| "learning_rate": 0.00019225760525930113, |
| "loss": 0.9971, |
| "mean_token_accuracy": 0.7847440119832754, |
| "num_tokens": 4954454.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.11683050155459941, |
| "grad_norm": 0.3043171167373657, |
| "learning_rate": 0.00019221573184263971, |
| "loss": 1.0072, |
| "mean_token_accuracy": 0.7804120637476444, |
| "num_tokens": 4981254.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.11745862253069941, |
| "grad_norm": 0.29763031005859375, |
| "learning_rate": 0.00019217385842597827, |
| "loss": 0.953, |
| "mean_token_accuracy": 0.7923042386770248, |
| "num_tokens": 5007868.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.11808674350679942, |
| "grad_norm": 0.259555459022522, |
| "learning_rate": 0.00019213198500931685, |
| "loss": 0.9678, |
| "mean_token_accuracy": 0.7879516039043665, |
| "num_tokens": 5034353.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.1187148644828994, |
| "grad_norm": 0.24496302008628845, |
| "learning_rate": 0.00019209011159265543, |
| "loss": 1.0191, |
| "mean_token_accuracy": 0.7830112751573324, |
| "num_tokens": 5060668.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.1193429854589994, |
| "grad_norm": 0.25683844089508057, |
| "learning_rate": 0.00019204823817599398, |
| "loss": 1.0245, |
| "mean_token_accuracy": 0.7791640534996986, |
| "num_tokens": 5087233.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.1199711064350994, |
| "grad_norm": 0.26764971017837524, |
| "learning_rate": 0.00019200636475933257, |
| "loss": 0.9839, |
| "mean_token_accuracy": 0.7851655505597591, |
| "num_tokens": 5112981.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.1205992274111994, |
| "grad_norm": 0.2569602429866791, |
| "learning_rate": 0.00019196449134267112, |
| "loss": 0.982, |
| "mean_token_accuracy": 0.7832812406122684, |
| "num_tokens": 5139753.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.1212273483872994, |
| "grad_norm": 0.3184845745563507, |
| "learning_rate": 0.00019192261792600967, |
| "loss": 0.9749, |
| "mean_token_accuracy": 0.795413101837039, |
| "num_tokens": 5164964.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.12185546936339939, |
| "grad_norm": 0.27534252405166626, |
| "learning_rate": 0.00019188074450934825, |
| "loss": 0.9781, |
| "mean_token_accuracy": 0.7883633185178042, |
| "num_tokens": 5191637.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.12248359033949939, |
| "grad_norm": 0.32787275314331055, |
| "learning_rate": 0.0001918388710926868, |
| "loss": 0.9992, |
| "mean_token_accuracy": 0.782159774377942, |
| "num_tokens": 5218067.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.12311171131559938, |
| "grad_norm": 0.3248906433582306, |
| "learning_rate": 0.0001917969976760254, |
| "loss": 1.0157, |
| "mean_token_accuracy": 0.7802824929356575, |
| "num_tokens": 5243981.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.12373983229169938, |
| "grad_norm": 0.2404022514820099, |
| "learning_rate": 0.00019175512425936394, |
| "loss": 0.9981, |
| "mean_token_accuracy": 0.7823897533118724, |
| "num_tokens": 5271897.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.12436795326779938, |
| "grad_norm": 0.26277250051498413, |
| "learning_rate": 0.00019171325084270252, |
| "loss": 0.9313, |
| "mean_token_accuracy": 0.796189296618104, |
| "num_tokens": 5298642.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.12499607424389937, |
| "grad_norm": 0.24106673896312714, |
| "learning_rate": 0.00019167137742604108, |
| "loss": 1.0172, |
| "mean_token_accuracy": 0.7824347522109747, |
| "num_tokens": 5324815.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.12562419521999937, |
| "grad_norm": 0.25554001331329346, |
| "learning_rate": 0.00019162950400937963, |
| "loss": 0.958, |
| "mean_token_accuracy": 0.7892453044652938, |
| "num_tokens": 5352563.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.12625231619609936, |
| "grad_norm": 0.24342681467533112, |
| "learning_rate": 0.00019158763059271821, |
| "loss": 0.9531, |
| "mean_token_accuracy": 0.7900742635130882, |
| "num_tokens": 5380459.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.12688043717219938, |
| "grad_norm": 0.2770666182041168, |
| "learning_rate": 0.0001915457571760568, |
| "loss": 0.9917, |
| "mean_token_accuracy": 0.7898676563054323, |
| "num_tokens": 5406719.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.12750855814829937, |
| "grad_norm": 0.2893310785293579, |
| "learning_rate": 0.00019150388375939538, |
| "loss": 0.9453, |
| "mean_token_accuracy": 0.795016722008586, |
| "num_tokens": 5433643.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.12813667912439936, |
| "grad_norm": 0.30994004011154175, |
| "learning_rate": 0.00019146201034273393, |
| "loss": 0.9609, |
| "mean_token_accuracy": 0.7939148671925068, |
| "num_tokens": 5460355.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.12876480010049934, |
| "grad_norm": 0.27722305059432983, |
| "learning_rate": 0.0001914201369260725, |
| "loss": 0.9714, |
| "mean_token_accuracy": 0.7912579335272312, |
| "num_tokens": 5487760.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.12939292107659936, |
| "grad_norm": 0.28644341230392456, |
| "learning_rate": 0.00019137826350941107, |
| "loss": 0.9774, |
| "mean_token_accuracy": 0.7874169372022152, |
| "num_tokens": 5514206.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.13002104205269935, |
| "grad_norm": 0.2941623032093048, |
| "learning_rate": 0.00019133639009274962, |
| "loss": 1.0533, |
| "mean_token_accuracy": 0.7767478346824646, |
| "num_tokens": 5540555.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.13064916302879934, |
| "grad_norm": 0.31064921617507935, |
| "learning_rate": 0.0001912945166760882, |
| "loss": 1.0319, |
| "mean_token_accuracy": 0.7783826030790806, |
| "num_tokens": 5566402.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.13127728400489935, |
| "grad_norm": 0.28749072551727295, |
| "learning_rate": 0.00019125264325942675, |
| "loss": 1.0165, |
| "mean_token_accuracy": 0.7805188350379467, |
| "num_tokens": 5593168.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.13190540498099934, |
| "grad_norm": 0.28191903233528137, |
| "learning_rate": 0.00019121076984276534, |
| "loss": 0.9736, |
| "mean_token_accuracy": 0.7900103904306889, |
| "num_tokens": 5619001.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.13253352595709933, |
| "grad_norm": 0.2756195664405823, |
| "learning_rate": 0.0001911688964261039, |
| "loss": 1.0194, |
| "mean_token_accuracy": 0.7771706860512495, |
| "num_tokens": 5645679.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.13316164693319935, |
| "grad_norm": 0.25393643975257874, |
| "learning_rate": 0.00019112702300944247, |
| "loss": 0.9981, |
| "mean_token_accuracy": 0.7815123125910759, |
| "num_tokens": 5672282.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.13378976790929933, |
| "grad_norm": 0.2670615315437317, |
| "learning_rate": 0.00019108514959278102, |
| "loss": 0.993, |
| "mean_token_accuracy": 0.7842022236436605, |
| "num_tokens": 5699662.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.13441788888539932, |
| "grad_norm": 0.2826876640319824, |
| "learning_rate": 0.00019104327617611958, |
| "loss": 0.9995, |
| "mean_token_accuracy": 0.7823238395154476, |
| "num_tokens": 5726105.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.1350460098614993, |
| "grad_norm": 0.2938212454319, |
| "learning_rate": 0.00019100140275945816, |
| "loss": 1.004, |
| "mean_token_accuracy": 0.7877405568957329, |
| "num_tokens": 5752171.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.13567413083759933, |
| "grad_norm": 0.3114703297615051, |
| "learning_rate": 0.00019095952934279674, |
| "loss": 0.9896, |
| "mean_token_accuracy": 0.7846675843000412, |
| "num_tokens": 5779452.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.13630225181369932, |
| "grad_norm": 0.3218187391757965, |
| "learning_rate": 0.00019091765592613532, |
| "loss": 0.9587, |
| "mean_token_accuracy": 0.7931745149195194, |
| "num_tokens": 5805682.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.1369303727897993, |
| "grad_norm": 0.2846287190914154, |
| "learning_rate": 0.00019087578250947388, |
| "loss": 0.9928, |
| "mean_token_accuracy": 0.7823387812823057, |
| "num_tokens": 5832859.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.13755849376589932, |
| "grad_norm": 0.3179105520248413, |
| "learning_rate": 0.00019083390909281246, |
| "loss": 0.9854, |
| "mean_token_accuracy": 0.7867173902690411, |
| "num_tokens": 5859708.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.1381866147419993, |
| "grad_norm": 0.25780409574508667, |
| "learning_rate": 0.000190792035676151, |
| "loss": 0.9903, |
| "mean_token_accuracy": 0.7886831004172563, |
| "num_tokens": 5885166.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.1388147357180993, |
| "grad_norm": 0.27167221903800964, |
| "learning_rate": 0.00019075016225948956, |
| "loss": 0.9992, |
| "mean_token_accuracy": 0.7835381802171468, |
| "num_tokens": 5912676.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.1394428566941993, |
| "grad_norm": 0.28806015849113464, |
| "learning_rate": 0.00019070828884282815, |
| "loss": 0.9932, |
| "mean_token_accuracy": 0.7843023527413606, |
| "num_tokens": 5939655.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.1400709776702993, |
| "grad_norm": 0.26339516043663025, |
| "learning_rate": 0.0001906664154261667, |
| "loss": 0.9745, |
| "mean_token_accuracy": 0.7840303633362055, |
| "num_tokens": 5966542.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.1406990986463993, |
| "grad_norm": 0.289928138256073, |
| "learning_rate": 0.00019062454200950528, |
| "loss": 0.975, |
| "mean_token_accuracy": 0.7867904342710972, |
| "num_tokens": 5992575.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.1413272196224993, |
| "grad_norm": 0.24072448909282684, |
| "learning_rate": 0.00019058266859284383, |
| "loss": 0.9355, |
| "mean_token_accuracy": 0.7977154236286879, |
| "num_tokens": 6018768.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.1419553405985993, |
| "grad_norm": 0.2968997359275818, |
| "learning_rate": 0.00019054079517618242, |
| "loss": 1.011, |
| "mean_token_accuracy": 0.7791931878775358, |
| "num_tokens": 6046493.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.14258346157469928, |
| "grad_norm": 0.307750403881073, |
| "learning_rate": 0.00019049892175952097, |
| "loss": 0.9871, |
| "mean_token_accuracy": 0.7865296632051468, |
| "num_tokens": 6073460.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.14321158255079927, |
| "grad_norm": 0.24764509499073029, |
| "learning_rate": 0.00019045704834285952, |
| "loss": 0.9345, |
| "mean_token_accuracy": 0.7938060730695724, |
| "num_tokens": 6099802.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.1438397035268993, |
| "grad_norm": 0.26876288652420044, |
| "learning_rate": 0.0001904151749261981, |
| "loss": 0.9601, |
| "mean_token_accuracy": 0.7912701655179262, |
| "num_tokens": 6126401.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.14446782450299928, |
| "grad_norm": 0.25304463505744934, |
| "learning_rate": 0.00019037330150953666, |
| "loss": 0.9968, |
| "mean_token_accuracy": 0.7839574735611677, |
| "num_tokens": 6154217.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.14509594547909926, |
| "grad_norm": 0.3394694924354553, |
| "learning_rate": 0.00019033142809287524, |
| "loss": 0.9873, |
| "mean_token_accuracy": 0.7853979174047708, |
| "num_tokens": 6181392.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.14572406645519928, |
| "grad_norm": 0.244957834482193, |
| "learning_rate": 0.00019028955467621382, |
| "loss": 1.0261, |
| "mean_token_accuracy": 0.777070652320981, |
| "num_tokens": 6209710.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.14635218743129927, |
| "grad_norm": 0.2903886139392853, |
| "learning_rate": 0.0001902476812595524, |
| "loss": 0.9856, |
| "mean_token_accuracy": 0.7820085968822241, |
| "num_tokens": 6236756.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.14698030840739926, |
| "grad_norm": 0.2940092384815216, |
| "learning_rate": 0.00019020580784289096, |
| "loss": 1.0119, |
| "mean_token_accuracy": 0.782718800008297, |
| "num_tokens": 6263676.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.14760842938349927, |
| "grad_norm": 0.3023865222930908, |
| "learning_rate": 0.0001901639344262295, |
| "loss": 0.9761, |
| "mean_token_accuracy": 0.7865527033805847, |
| "num_tokens": 6290468.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.14823655035959926, |
| "grad_norm": 0.3264501392841339, |
| "learning_rate": 0.0001901220610095681, |
| "loss": 0.9889, |
| "mean_token_accuracy": 0.7882603086531162, |
| "num_tokens": 6317021.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.14886467133569925, |
| "grad_norm": 0.29618483781814575, |
| "learning_rate": 0.00019008018759290665, |
| "loss": 1.0414, |
| "mean_token_accuracy": 0.7765590559691191, |
| "num_tokens": 6342448.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.14949279231179927, |
| "grad_norm": 0.275785893201828, |
| "learning_rate": 0.00019003831417624523, |
| "loss": 0.9479, |
| "mean_token_accuracy": 0.7965521182864904, |
| "num_tokens": 6367783.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.15012091328789925, |
| "grad_norm": 0.32756808400154114, |
| "learning_rate": 0.00018999644075958378, |
| "loss": 0.9494, |
| "mean_token_accuracy": 0.7935776200145483, |
| "num_tokens": 6395320.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.15074903426399924, |
| "grad_norm": 0.3014850318431854, |
| "learning_rate": 0.00018995456734292236, |
| "loss": 0.961, |
| "mean_token_accuracy": 0.7955108307301998, |
| "num_tokens": 6422482.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.15137715524009923, |
| "grad_norm": 0.31270134449005127, |
| "learning_rate": 0.00018991269392626092, |
| "loss": 0.98, |
| "mean_token_accuracy": 0.7862629968672991, |
| "num_tokens": 6450301.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.15200527621619925, |
| "grad_norm": 0.27296221256256104, |
| "learning_rate": 0.0001898708205095995, |
| "loss": 0.9404, |
| "mean_token_accuracy": 0.7951943475753069, |
| "num_tokens": 6477122.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.15263339719229924, |
| "grad_norm": 0.2963928282260895, |
| "learning_rate": 0.00018982894709293805, |
| "loss": 0.9901, |
| "mean_token_accuracy": 0.7869273141026497, |
| "num_tokens": 6503999.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.15326151816839922, |
| "grad_norm": 0.25688695907592773, |
| "learning_rate": 0.0001897870736762766, |
| "loss": 0.9848, |
| "mean_token_accuracy": 0.7863536704331636, |
| "num_tokens": 6529907.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.15388963914449924, |
| "grad_norm": 0.2709560990333557, |
| "learning_rate": 0.00018974520025961519, |
| "loss": 0.9772, |
| "mean_token_accuracy": 0.7843648813664913, |
| "num_tokens": 6558499.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.15451776012059923, |
| "grad_norm": 0.268532395362854, |
| "learning_rate": 0.00018970332684295377, |
| "loss": 0.9949, |
| "mean_token_accuracy": 0.7847375877201557, |
| "num_tokens": 6585742.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.15514588109669922, |
| "grad_norm": 0.2693954408168793, |
| "learning_rate": 0.00018966145342629235, |
| "loss": 0.9563, |
| "mean_token_accuracy": 0.7913835499435663, |
| "num_tokens": 6612218.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.15577400207279923, |
| "grad_norm": 0.26215437054634094, |
| "learning_rate": 0.0001896195800096309, |
| "loss": 0.9858, |
| "mean_token_accuracy": 0.7850921977311373, |
| "num_tokens": 6639343.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.15640212304889922, |
| "grad_norm": 0.2571866512298584, |
| "learning_rate": 0.00018957770659296948, |
| "loss": 1.0043, |
| "mean_token_accuracy": 0.7850870199501514, |
| "num_tokens": 6667103.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.1570302440249992, |
| "grad_norm": 0.23914095759391785, |
| "learning_rate": 0.00018953583317630804, |
| "loss": 0.9701, |
| "mean_token_accuracy": 0.7945085145533085, |
| "num_tokens": 6694365.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.1576583650010992, |
| "grad_norm": 0.3401123285293579, |
| "learning_rate": 0.0001894939597596466, |
| "loss": 0.9711, |
| "mean_token_accuracy": 0.789525717869401, |
| "num_tokens": 6720672.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.15828648597719921, |
| "grad_norm": 0.33853694796562195, |
| "learning_rate": 0.00018945208634298517, |
| "loss": 0.9956, |
| "mean_token_accuracy": 0.7857484348118305, |
| "num_tokens": 6746709.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.1589146069532992, |
| "grad_norm": 0.26513391733169556, |
| "learning_rate": 0.00018941021292632373, |
| "loss": 0.9682, |
| "mean_token_accuracy": 0.7871077805757523, |
| "num_tokens": 6774512.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.1595427279293992, |
| "grad_norm": 0.29272544384002686, |
| "learning_rate": 0.0001893683395096623, |
| "loss": 0.9868, |
| "mean_token_accuracy": 0.7914240621030331, |
| "num_tokens": 6800463.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.1601708489054992, |
| "grad_norm": 0.2921249270439148, |
| "learning_rate": 0.00018932646609300086, |
| "loss": 0.9646, |
| "mean_token_accuracy": 0.7920668996870518, |
| "num_tokens": 6827179.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.1607989698815992, |
| "grad_norm": 0.33000192046165466, |
| "learning_rate": 0.00018928459267633944, |
| "loss": 1.0272, |
| "mean_token_accuracy": 0.7887022830545902, |
| "num_tokens": 6852306.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.16142709085769918, |
| "grad_norm": 0.29650112986564636, |
| "learning_rate": 0.000189242719259678, |
| "loss": 0.9629, |
| "mean_token_accuracy": 0.7959261048585177, |
| "num_tokens": 6878427.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.1620552118337992, |
| "grad_norm": 0.26963382959365845, |
| "learning_rate": 0.00018920084584301655, |
| "loss": 0.9915, |
| "mean_token_accuracy": 0.7845980357378721, |
| "num_tokens": 6903163.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.1626833328098992, |
| "grad_norm": 0.3108598589897156, |
| "learning_rate": 0.00018915897242635513, |
| "loss": 0.9931, |
| "mean_token_accuracy": 0.7855133522301913, |
| "num_tokens": 6930211.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.16331145378599918, |
| "grad_norm": 0.306082159280777, |
| "learning_rate": 0.0001891170990096937, |
| "loss": 0.9638, |
| "mean_token_accuracy": 0.7936804510653019, |
| "num_tokens": 6956489.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.1639395747620992, |
| "grad_norm": 0.286647230386734, |
| "learning_rate": 0.00018907522559303227, |
| "loss": 0.9918, |
| "mean_token_accuracy": 0.7832652296870947, |
| "num_tokens": 6982824.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.16456769573819918, |
| "grad_norm": 0.30177125334739685, |
| "learning_rate": 0.00018903335217637085, |
| "loss": 0.9848, |
| "mean_token_accuracy": 0.7813052102923393, |
| "num_tokens": 7010550.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.16519581671429917, |
| "grad_norm": 0.30501213669776917, |
| "learning_rate": 0.00018899147875970943, |
| "loss": 0.9836, |
| "mean_token_accuracy": 0.789811997488141, |
| "num_tokens": 7035917.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.16582393769039916, |
| "grad_norm": 0.2610650062561035, |
| "learning_rate": 0.00018894960534304798, |
| "loss": 0.9851, |
| "mean_token_accuracy": 0.7899245552718639, |
| "num_tokens": 7061388.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.16645205866649918, |
| "grad_norm": 0.2944414019584656, |
| "learning_rate": 0.00018890773192638654, |
| "loss": 1.0011, |
| "mean_token_accuracy": 0.7827542286366225, |
| "num_tokens": 7088492.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.16708017964259916, |
| "grad_norm": 0.2874250113964081, |
| "learning_rate": 0.00018886585850972512, |
| "loss": 0.9858, |
| "mean_token_accuracy": 0.7829206600785256, |
| "num_tokens": 7115146.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.16770830061869915, |
| "grad_norm": 0.27393653988838196, |
| "learning_rate": 0.00018882398509306367, |
| "loss": 0.973, |
| "mean_token_accuracy": 0.7905403438955545, |
| "num_tokens": 7140131.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.16833642159479917, |
| "grad_norm": 0.29718559980392456, |
| "learning_rate": 0.00018878211167640225, |
| "loss": 0.9891, |
| "mean_token_accuracy": 0.783644600585103, |
| "num_tokens": 7166821.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.16896454257089916, |
| "grad_norm": 0.27317872643470764, |
| "learning_rate": 0.0001887402382597408, |
| "loss": 1.017, |
| "mean_token_accuracy": 0.784059465304017, |
| "num_tokens": 7193362.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.16959266354699915, |
| "grad_norm": 0.25636228919029236, |
| "learning_rate": 0.0001886983648430794, |
| "loss": 0.9531, |
| "mean_token_accuracy": 0.7920433443039656, |
| "num_tokens": 7220619.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.17022078452309916, |
| "grad_norm": 0.2882969379425049, |
| "learning_rate": 0.00018865649142641794, |
| "loss": 0.934, |
| "mean_token_accuracy": 0.7929070591926575, |
| "num_tokens": 7247688.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.17084890549919915, |
| "grad_norm": 0.3216884434223175, |
| "learning_rate": 0.0001886146180097565, |
| "loss": 0.9496, |
| "mean_token_accuracy": 0.7929627750068903, |
| "num_tokens": 7274712.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.17147702647529914, |
| "grad_norm": 0.27841243147850037, |
| "learning_rate": 0.00018857274459309508, |
| "loss": 0.9323, |
| "mean_token_accuracy": 0.7978887390345335, |
| "num_tokens": 7300487.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.17210514745139913, |
| "grad_norm": 0.2577762007713318, |
| "learning_rate": 0.00018853087117643363, |
| "loss": 1.0107, |
| "mean_token_accuracy": 0.7844824850559234, |
| "num_tokens": 7327049.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.17273326842749914, |
| "grad_norm": 0.29990464448928833, |
| "learning_rate": 0.0001884889977597722, |
| "loss": 0.9467, |
| "mean_token_accuracy": 0.7949109837412834, |
| "num_tokens": 7352797.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.17336138940359913, |
| "grad_norm": 0.24397854506969452, |
| "learning_rate": 0.0001884471243431108, |
| "loss": 0.9869, |
| "mean_token_accuracy": 0.7791798021644354, |
| "num_tokens": 7381508.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.17398951037969912, |
| "grad_norm": 0.27623310685157776, |
| "learning_rate": 0.00018840525092644937, |
| "loss": 0.9483, |
| "mean_token_accuracy": 0.7909042112529278, |
| "num_tokens": 7409592.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.17461763135579914, |
| "grad_norm": 0.30223146080970764, |
| "learning_rate": 0.00018836337750978793, |
| "loss": 0.9961, |
| "mean_token_accuracy": 0.7863899141550064, |
| "num_tokens": 7436032.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.17524575233189912, |
| "grad_norm": 0.2969076633453369, |
| "learning_rate": 0.00018832150409312648, |
| "loss": 0.947, |
| "mean_token_accuracy": 0.7935981251299381, |
| "num_tokens": 7464224.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.1758738733079991, |
| "grad_norm": 0.2720794379711151, |
| "learning_rate": 0.00018827963067646506, |
| "loss": 0.967, |
| "mean_token_accuracy": 0.7897748045623303, |
| "num_tokens": 7491312.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.17650199428409913, |
| "grad_norm": 0.2531968355178833, |
| "learning_rate": 0.00018823775725980362, |
| "loss": 0.9677, |
| "mean_token_accuracy": 0.7889728490263224, |
| "num_tokens": 7520696.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.17713011526019912, |
| "grad_norm": 0.24469265341758728, |
| "learning_rate": 0.0001881958838431422, |
| "loss": 0.9286, |
| "mean_token_accuracy": 0.7995743758976459, |
| "num_tokens": 7546911.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.1777582362362991, |
| "grad_norm": 0.2589986026287079, |
| "learning_rate": 0.00018815401042648075, |
| "loss": 1.0061, |
| "mean_token_accuracy": 0.7818490665405988, |
| "num_tokens": 7573321.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.17838635721239912, |
| "grad_norm": 0.30957522988319397, |
| "learning_rate": 0.00018811213700981933, |
| "loss": 0.9243, |
| "mean_token_accuracy": 0.7941294971853494, |
| "num_tokens": 7600879.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.1790144781884991, |
| "grad_norm": 0.2634665071964264, |
| "learning_rate": 0.0001880702635931579, |
| "loss": 0.9441, |
| "mean_token_accuracy": 0.794154980033636, |
| "num_tokens": 7627977.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.1796425991645991, |
| "grad_norm": 0.2403445690870285, |
| "learning_rate": 0.00018802839017649644, |
| "loss": 0.9614, |
| "mean_token_accuracy": 0.7935540229082108, |
| "num_tokens": 7654559.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.1802707201406991, |
| "grad_norm": 0.2723034918308258, |
| "learning_rate": 0.00018798651675983502, |
| "loss": 0.9359, |
| "mean_token_accuracy": 0.7911634873598814, |
| "num_tokens": 7681465.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.1808988411167991, |
| "grad_norm": 0.24261696636676788, |
| "learning_rate": 0.00018794464334317358, |
| "loss": 0.9781, |
| "mean_token_accuracy": 0.788073031976819, |
| "num_tokens": 7708526.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.1815269620928991, |
| "grad_norm": 0.29172760248184204, |
| "learning_rate": 0.00018790276992651216, |
| "loss": 0.9764, |
| "mean_token_accuracy": 0.7852031115442515, |
| "num_tokens": 7735060.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.18215508306899908, |
| "grad_norm": 0.29553157091140747, |
| "learning_rate": 0.00018786089650985074, |
| "loss": 0.9929, |
| "mean_token_accuracy": 0.789004210010171, |
| "num_tokens": 7761517.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.1827832040450991, |
| "grad_norm": 0.26275527477264404, |
| "learning_rate": 0.0001878190230931893, |
| "loss": 0.98, |
| "mean_token_accuracy": 0.7868116334080696, |
| "num_tokens": 7788884.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.18341132502119908, |
| "grad_norm": 0.26689252257347107, |
| "learning_rate": 0.00018777714967652787, |
| "loss": 0.9934, |
| "mean_token_accuracy": 0.7822608612477779, |
| "num_tokens": 7816982.0, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.18403944599729907, |
| "grad_norm": 0.30625566840171814, |
| "learning_rate": 0.00018773527625986645, |
| "loss": 0.9735, |
| "mean_token_accuracy": 0.7886403530836106, |
| "num_tokens": 7844921.0, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.1846675669733991, |
| "grad_norm": 0.30838075280189514, |
| "learning_rate": 0.000187693402843205, |
| "loss": 0.9722, |
| "mean_token_accuracy": 0.7919574566185474, |
| "num_tokens": 7871646.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.18529568794949908, |
| "grad_norm": 0.286663293838501, |
| "learning_rate": 0.00018765152942654356, |
| "loss": 0.9823, |
| "mean_token_accuracy": 0.7880583092570305, |
| "num_tokens": 7899070.0, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.18592380892559907, |
| "grad_norm": 0.2832724452018738, |
| "learning_rate": 0.00018760965600988214, |
| "loss": 0.963, |
| "mean_token_accuracy": 0.7926479645073414, |
| "num_tokens": 7925945.0, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.18655192990169905, |
| "grad_norm": 0.3050813376903534, |
| "learning_rate": 0.0001875677825932207, |
| "loss": 0.9525, |
| "mean_token_accuracy": 0.7921919580549002, |
| "num_tokens": 7952472.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.18718005087779907, |
| "grad_norm": 0.29279011487960815, |
| "learning_rate": 0.00018752590917655928, |
| "loss": 1.0318, |
| "mean_token_accuracy": 0.7809717856347561, |
| "num_tokens": 7978482.0, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.18780817185389906, |
| "grad_norm": 0.32541248202323914, |
| "learning_rate": 0.00018748403575989783, |
| "loss": 0.972, |
| "mean_token_accuracy": 0.7893663041293622, |
| "num_tokens": 8005205.0, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.18843629282999905, |
| "grad_norm": 0.32494834065437317, |
| "learning_rate": 0.00018744216234323641, |
| "loss": 0.9281, |
| "mean_token_accuracy": 0.7966420441865921, |
| "num_tokens": 8032535.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.18906441380609906, |
| "grad_norm": 0.24331510066986084, |
| "learning_rate": 0.00018740028892657497, |
| "loss": 0.9983, |
| "mean_token_accuracy": 0.7830683149397373, |
| "num_tokens": 8059911.0, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.18969253478219905, |
| "grad_norm": 0.28518521785736084, |
| "learning_rate": 0.00018735841550991352, |
| "loss": 0.9488, |
| "mean_token_accuracy": 0.7934920992702246, |
| "num_tokens": 8086670.0, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.19032065575829904, |
| "grad_norm": 0.31971925497055054, |
| "learning_rate": 0.0001873165420932521, |
| "loss": 0.9806, |
| "mean_token_accuracy": 0.7857418902218342, |
| "num_tokens": 8112851.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.19094877673439906, |
| "grad_norm": 0.3072707951068878, |
| "learning_rate": 0.00018727466867659066, |
| "loss": 0.9864, |
| "mean_token_accuracy": 0.7890610966831446, |
| "num_tokens": 8139623.0, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.19157689771049904, |
| "grad_norm": 0.2833654284477234, |
| "learning_rate": 0.00018723279525992924, |
| "loss": 0.9304, |
| "mean_token_accuracy": 0.7993248742073774, |
| "num_tokens": 8165400.0, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.19220501868659903, |
| "grad_norm": 0.2709767818450928, |
| "learning_rate": 0.00018719092184326782, |
| "loss": 0.9766, |
| "mean_token_accuracy": 0.7900667380541563, |
| "num_tokens": 8192046.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.19283313966269905, |
| "grad_norm": 0.3056877851486206, |
| "learning_rate": 0.0001871490484266064, |
| "loss": 0.9859, |
| "mean_token_accuracy": 0.7832565013319254, |
| "num_tokens": 8218756.0, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.19346126063879904, |
| "grad_norm": 0.25996148586273193, |
| "learning_rate": 0.00018710717500994495, |
| "loss": 0.9694, |
| "mean_token_accuracy": 0.7857969712466002, |
| "num_tokens": 8247498.0, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.19408938161489903, |
| "grad_norm": 0.2731459140777588, |
| "learning_rate": 0.0001870653015932835, |
| "loss": 0.9327, |
| "mean_token_accuracy": 0.7962075632065535, |
| "num_tokens": 8273330.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.19471750259099901, |
| "grad_norm": 0.26292166113853455, |
| "learning_rate": 0.0001870234281766221, |
| "loss": 0.9521, |
| "mean_token_accuracy": 0.7912837877869606, |
| "num_tokens": 8300718.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.19534562356709903, |
| "grad_norm": 0.26134082674980164, |
| "learning_rate": 0.00018698155475996064, |
| "loss": 0.9764, |
| "mean_token_accuracy": 0.7882513340562582, |
| "num_tokens": 8328366.0, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.19597374454319902, |
| "grad_norm": 0.23814305663108826, |
| "learning_rate": 0.00018693968134329922, |
| "loss": 0.9752, |
| "mean_token_accuracy": 0.7884778048843145, |
| "num_tokens": 8355315.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.196601865519299, |
| "grad_norm": 0.3375711143016815, |
| "learning_rate": 0.00018689780792663778, |
| "loss": 0.9621, |
| "mean_token_accuracy": 0.7901697169989348, |
| "num_tokens": 8381642.0, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.19722998649539902, |
| "grad_norm": 0.34628236293792725, |
| "learning_rate": 0.00018685593450997636, |
| "loss": 0.9462, |
| "mean_token_accuracy": 0.795050111413002, |
| "num_tokens": 8407747.0, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.197858107471499, |
| "grad_norm": 0.28467345237731934, |
| "learning_rate": 0.0001868140610933149, |
| "loss": 0.964, |
| "mean_token_accuracy": 0.7922971405088901, |
| "num_tokens": 8433827.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.198486228447599, |
| "grad_norm": 0.28550681471824646, |
| "learning_rate": 0.00018677218767665347, |
| "loss": 0.9383, |
| "mean_token_accuracy": 0.7921677011996507, |
| "num_tokens": 8461185.0, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.19911434942369902, |
| "grad_norm": 0.2956470251083374, |
| "learning_rate": 0.00018673031425999205, |
| "loss": 0.9715, |
| "mean_token_accuracy": 0.7869658004492521, |
| "num_tokens": 8488228.0, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.199742470399799, |
| "grad_norm": 0.29445043206214905, |
| "learning_rate": 0.0001866884408433306, |
| "loss": 0.9375, |
| "mean_token_accuracy": 0.7965064492076636, |
| "num_tokens": 8515664.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.200370591375899, |
| "grad_norm": 0.2424841821193695, |
| "learning_rate": 0.00018664656742666918, |
| "loss": 0.9514, |
| "mean_token_accuracy": 0.7920619916170836, |
| "num_tokens": 8542727.0, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.20099871235199898, |
| "grad_norm": 0.29190316796302795, |
| "learning_rate": 0.00018660469401000776, |
| "loss": 0.9599, |
| "mean_token_accuracy": 0.7881575852632523, |
| "num_tokens": 8570325.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.201626833328099, |
| "grad_norm": 0.25599437952041626, |
| "learning_rate": 0.00018656282059334635, |
| "loss": 0.9754, |
| "mean_token_accuracy": 0.7894639134407043, |
| "num_tokens": 8598238.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.202254954304199, |
| "grad_norm": 0.28486067056655884, |
| "learning_rate": 0.0001865209471766849, |
| "loss": 0.963, |
| "mean_token_accuracy": 0.792768269777298, |
| "num_tokens": 8624523.0, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.20288307528029897, |
| "grad_norm": 0.3167647421360016, |
| "learning_rate": 0.00018647907376002345, |
| "loss": 0.9492, |
| "mean_token_accuracy": 0.791275979205966, |
| "num_tokens": 8651585.0, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.203511196256399, |
| "grad_norm": 0.2570751905441284, |
| "learning_rate": 0.00018643720034336203, |
| "loss": 0.963, |
| "mean_token_accuracy": 0.7880451161414385, |
| "num_tokens": 8678088.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.20413931723249898, |
| "grad_norm": 0.31692641973495483, |
| "learning_rate": 0.0001863953269267006, |
| "loss": 0.9428, |
| "mean_token_accuracy": 0.7953941386193037, |
| "num_tokens": 8706528.0, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.20476743820859897, |
| "grad_norm": 0.30903160572052, |
| "learning_rate": 0.00018635345351003917, |
| "loss": 0.956, |
| "mean_token_accuracy": 0.7911488272249698, |
| "num_tokens": 8734751.0, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.20539555918469898, |
| "grad_norm": 0.2711246907711029, |
| "learning_rate": 0.00018631158009337772, |
| "loss": 1.0267, |
| "mean_token_accuracy": 0.7815113704651594, |
| "num_tokens": 8761596.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.20602368016079897, |
| "grad_norm": 0.3340023458003998, |
| "learning_rate": 0.0001862697066767163, |
| "loss": 0.9381, |
| "mean_token_accuracy": 0.7947205103933811, |
| "num_tokens": 8788662.0, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.20665180113689896, |
| "grad_norm": 0.27200961112976074, |
| "learning_rate": 0.00018622783326005486, |
| "loss": 1.0081, |
| "mean_token_accuracy": 0.7830899234861135, |
| "num_tokens": 8814161.0, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.20727992211299898, |
| "grad_norm": 0.2782123386859894, |
| "learning_rate": 0.0001861859598433934, |
| "loss": 0.9468, |
| "mean_token_accuracy": 0.7955603264272213, |
| "num_tokens": 8839996.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.20790804308909897, |
| "grad_norm": 0.2713397443294525, |
| "learning_rate": 0.000186144086426732, |
| "loss": 0.9512, |
| "mean_token_accuracy": 0.793865691125393, |
| "num_tokens": 8866539.0, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.20853616406519895, |
| "grad_norm": 0.2926190495491028, |
| "learning_rate": 0.00018610221301007055, |
| "loss": 0.9966, |
| "mean_token_accuracy": 0.7881284438073635, |
| "num_tokens": 8892942.0, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.20916428504129894, |
| "grad_norm": 0.2809631824493408, |
| "learning_rate": 0.00018606033959340913, |
| "loss": 0.9387, |
| "mean_token_accuracy": 0.7946780778467655, |
| "num_tokens": 8920034.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.20979240601739896, |
| "grad_norm": 0.2645687758922577, |
| "learning_rate": 0.00018601846617674768, |
| "loss": 0.974, |
| "mean_token_accuracy": 0.7887616034597158, |
| "num_tokens": 8946885.0, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.21042052699349895, |
| "grad_norm": 0.32959234714508057, |
| "learning_rate": 0.00018597659276008626, |
| "loss": 0.97, |
| "mean_token_accuracy": 0.789527265354991, |
| "num_tokens": 8973303.0, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.21104864796959893, |
| "grad_norm": 0.2766159474849701, |
| "learning_rate": 0.00018593471934342485, |
| "loss": 1.0029, |
| "mean_token_accuracy": 0.78571757376194, |
| "num_tokens": 8999782.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.21167676894569895, |
| "grad_norm": 0.3022785484790802, |
| "learning_rate": 0.0001858928459267634, |
| "loss": 0.9799, |
| "mean_token_accuracy": 0.7888091869652272, |
| "num_tokens": 9025718.0, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.21230488992179894, |
| "grad_norm": 0.27634453773498535, |
| "learning_rate": 0.00018585097251010198, |
| "loss": 0.9694, |
| "mean_token_accuracy": 0.791129108890891, |
| "num_tokens": 9052146.0, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.21293301089789893, |
| "grad_norm": 0.29203152656555176, |
| "learning_rate": 0.00018580909909344053, |
| "loss": 0.9716, |
| "mean_token_accuracy": 0.7845335718244314, |
| "num_tokens": 9079458.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.21356113187399894, |
| "grad_norm": 0.260200172662735, |
| "learning_rate": 0.00018576722567677912, |
| "loss": 0.9915, |
| "mean_token_accuracy": 0.7869384720921516, |
| "num_tokens": 9108765.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.21418925285009893, |
| "grad_norm": 0.3571971654891968, |
| "learning_rate": 0.00018572535226011767, |
| "loss": 0.9498, |
| "mean_token_accuracy": 0.793991993367672, |
| "num_tokens": 9136630.0, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.21481737382619892, |
| "grad_norm": 0.33275195956230164, |
| "learning_rate": 0.00018568347884345625, |
| "loss": 0.9556, |
| "mean_token_accuracy": 0.7915182035416365, |
| "num_tokens": 9163266.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.2154454948022989, |
| "grad_norm": 0.2770121097564697, |
| "learning_rate": 0.0001856416054267948, |
| "loss": 0.9607, |
| "mean_token_accuracy": 0.7871123567223549, |
| "num_tokens": 9190732.0, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.21607361577839893, |
| "grad_norm": 0.30268242955207825, |
| "learning_rate": 0.00018559973201013336, |
| "loss": 0.9498, |
| "mean_token_accuracy": 0.7864726580679416, |
| "num_tokens": 9218316.0, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.2167017367544989, |
| "grad_norm": 0.3154946565628052, |
| "learning_rate": 0.00018555785859347194, |
| "loss": 0.9905, |
| "mean_token_accuracy": 0.7879769437015056, |
| "num_tokens": 9244245.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.2173298577305989, |
| "grad_norm": 0.23621073365211487, |
| "learning_rate": 0.0001855159851768105, |
| "loss": 0.9842, |
| "mean_token_accuracy": 0.7871025986969471, |
| "num_tokens": 9271353.0, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.21795797870669892, |
| "grad_norm": 0.2876494228839874, |
| "learning_rate": 0.00018547411176014907, |
| "loss": 0.9341, |
| "mean_token_accuracy": 0.8000877648591995, |
| "num_tokens": 9298879.0, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.2185860996827989, |
| "grad_norm": 0.30209431052207947, |
| "learning_rate": 0.00018543223834348763, |
| "loss": 0.9611, |
| "mean_token_accuracy": 0.790924321860075, |
| "num_tokens": 9326024.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.2192142206588989, |
| "grad_norm": 0.3473189175128937, |
| "learning_rate": 0.0001853903649268262, |
| "loss": 0.995, |
| "mean_token_accuracy": 0.7826048351824284, |
| "num_tokens": 9352117.0, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.2198423416349989, |
| "grad_norm": 0.27148741483688354, |
| "learning_rate": 0.0001853484915101648, |
| "loss": 0.9608, |
| "mean_token_accuracy": 0.7897682044655084, |
| "num_tokens": 9379999.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.2204704626110989, |
| "grad_norm": 0.26357343792915344, |
| "learning_rate": 0.00018530661809350337, |
| "loss": 0.9745, |
| "mean_token_accuracy": 0.792728316038847, |
| "num_tokens": 9406083.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.2210985835871989, |
| "grad_norm": 0.30297690629959106, |
| "learning_rate": 0.00018526474467684193, |
| "loss": 0.9431, |
| "mean_token_accuracy": 0.7893101371824741, |
| "num_tokens": 9434685.0, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.2217267045632989, |
| "grad_norm": 0.3760235905647278, |
| "learning_rate": 0.00018522287126018048, |
| "loss": 0.9832, |
| "mean_token_accuracy": 0.7870549734681844, |
| "num_tokens": 9462037.0, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.2223548255393989, |
| "grad_norm": 0.2574126422405243, |
| "learning_rate": 0.00018518099784351906, |
| "loss": 0.9653, |
| "mean_token_accuracy": 0.7888480603694916, |
| "num_tokens": 9488610.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.22298294651549888, |
| "grad_norm": 0.2631290555000305, |
| "learning_rate": 0.00018513912442685762, |
| "loss": 1.008, |
| "mean_token_accuracy": 0.7876730926334858, |
| "num_tokens": 9513663.0, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.22361106749159887, |
| "grad_norm": 0.27038082480430603, |
| "learning_rate": 0.0001850972510101962, |
| "loss": 0.9805, |
| "mean_token_accuracy": 0.7889320895075798, |
| "num_tokens": 9540071.0, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.22423918846769889, |
| "grad_norm": 0.28113994002342224, |
| "learning_rate": 0.00018505537759353475, |
| "loss": 0.9725, |
| "mean_token_accuracy": 0.7862703930586576, |
| "num_tokens": 9569174.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.22486730944379887, |
| "grad_norm": 0.3517085611820221, |
| "learning_rate": 0.00018501350417687333, |
| "loss": 0.9755, |
| "mean_token_accuracy": 0.78679881952703, |
| "num_tokens": 9595119.0, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.22549543041989886, |
| "grad_norm": 0.29562246799468994, |
| "learning_rate": 0.00018497163076021189, |
| "loss": 1.0037, |
| "mean_token_accuracy": 0.7866592183709145, |
| "num_tokens": 9621399.0, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.22612355139599888, |
| "grad_norm": 0.27493491768836975, |
| "learning_rate": 0.00018492975734355044, |
| "loss": 0.8894, |
| "mean_token_accuracy": 0.8016994591802359, |
| "num_tokens": 9650206.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.22675167237209887, |
| "grad_norm": 0.3233809471130371, |
| "learning_rate": 0.00018488788392688902, |
| "loss": 0.9396, |
| "mean_token_accuracy": 0.7987181950360537, |
| "num_tokens": 9677143.0, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.22737979334819886, |
| "grad_norm": 0.27679574489593506, |
| "learning_rate": 0.00018484601051022757, |
| "loss": 0.9585, |
| "mean_token_accuracy": 0.7922527860850096, |
| "num_tokens": 9702822.0, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.22800791432429887, |
| "grad_norm": 0.25461262464523315, |
| "learning_rate": 0.00018480413709356616, |
| "loss": 0.9332, |
| "mean_token_accuracy": 0.7918692424893379, |
| "num_tokens": 9730239.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.22863603530039886, |
| "grad_norm": 0.29292234778404236, |
| "learning_rate": 0.00018476226367690474, |
| "loss": 0.9731, |
| "mean_token_accuracy": 0.7911592714488507, |
| "num_tokens": 9755244.0, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.22926415627649885, |
| "grad_norm": 0.2941250801086426, |
| "learning_rate": 0.0001847203902602433, |
| "loss": 1.0108, |
| "mean_token_accuracy": 0.7864028055220842, |
| "num_tokens": 9781765.0, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.22989227725259884, |
| "grad_norm": 0.32997268438339233, |
| "learning_rate": 0.00018467851684358187, |
| "loss": 0.9819, |
| "mean_token_accuracy": 0.7870282482355833, |
| "num_tokens": 9808695.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.23052039822869885, |
| "grad_norm": 0.3786728084087372, |
| "learning_rate": 0.00018463664342692043, |
| "loss": 0.9318, |
| "mean_token_accuracy": 0.7967084005475045, |
| "num_tokens": 9835188.0, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.23114851920479884, |
| "grad_norm": 0.2662932574748993, |
| "learning_rate": 0.000184594770010259, |
| "loss": 0.9729, |
| "mean_token_accuracy": 0.787113618478179, |
| "num_tokens": 9861403.0, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.23177664018089883, |
| "grad_norm": 0.3011711537837982, |
| "learning_rate": 0.00018455289659359756, |
| "loss": 0.9363, |
| "mean_token_accuracy": 0.7950150787830352, |
| "num_tokens": 9887631.0, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.23240476115699885, |
| "grad_norm": 0.3154990077018738, |
| "learning_rate": 0.00018451102317693614, |
| "loss": 0.9622, |
| "mean_token_accuracy": 0.7922232504934072, |
| "num_tokens": 9913153.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.23303288213309883, |
| "grad_norm": 0.4112165868282318, |
| "learning_rate": 0.0001844691497602747, |
| "loss": 0.9703, |
| "mean_token_accuracy": 0.7831194877624512, |
| "num_tokens": 9940930.0, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.23366100310919882, |
| "grad_norm": 0.26975691318511963, |
| "learning_rate": 0.00018442727634361328, |
| "loss": 0.9212, |
| "mean_token_accuracy": 0.7968139354139566, |
| "num_tokens": 9968634.0, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.23428912408529884, |
| "grad_norm": 0.29109784960746765, |
| "learning_rate": 0.00018438540292695183, |
| "loss": 0.9648, |
| "mean_token_accuracy": 0.7936428785324097, |
| "num_tokens": 9995621.0, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.23491724506139883, |
| "grad_norm": 0.31260260939598083, |
| "learning_rate": 0.00018434352951029038, |
| "loss": 1.0045, |
| "mean_token_accuracy": 0.7816799312829972, |
| "num_tokens": 10022343.0, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.23554536603749882, |
| "grad_norm": 0.29457929730415344, |
| "learning_rate": 0.00018430165609362897, |
| "loss": 0.9825, |
| "mean_token_accuracy": 0.7891633450984955, |
| "num_tokens": 10049185.0, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.23617348701359883, |
| "grad_norm": 0.2756049335002899, |
| "learning_rate": 0.00018425978267696752, |
| "loss": 1.0145, |
| "mean_token_accuracy": 0.7817019656300545, |
| "num_tokens": 10075580.0, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.23680160798969882, |
| "grad_norm": 0.29868707060813904, |
| "learning_rate": 0.0001842179092603061, |
| "loss": 0.9691, |
| "mean_token_accuracy": 0.7917348992079496, |
| "num_tokens": 10101223.0, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.2374297289657988, |
| "grad_norm": 0.3161119222640991, |
| "learning_rate": 0.00018417603584364466, |
| "loss": 0.95, |
| "mean_token_accuracy": 0.7942048270255327, |
| "num_tokens": 10127396.0, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.2380578499418988, |
| "grad_norm": 0.29149937629699707, |
| "learning_rate": 0.00018413416242698324, |
| "loss": 0.9904, |
| "mean_token_accuracy": 0.7899536907672882, |
| "num_tokens": 10153465.0, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.2386859709179988, |
| "grad_norm": 0.27939069271087646, |
| "learning_rate": 0.00018409228901032182, |
| "loss": 0.9415, |
| "mean_token_accuracy": 0.7966868814080954, |
| "num_tokens": 10179724.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.2393140918940988, |
| "grad_norm": 0.3205685317516327, |
| "learning_rate": 0.00018405041559366037, |
| "loss": 0.9857, |
| "mean_token_accuracy": 0.7864225681871175, |
| "num_tokens": 10208771.0, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.2399422128701988, |
| "grad_norm": 0.26006463170051575, |
| "learning_rate": 0.00018400854217699895, |
| "loss": 0.9316, |
| "mean_token_accuracy": 0.7858447533100843, |
| "num_tokens": 10236536.0, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.2405703338462988, |
| "grad_norm": 0.3319949507713318, |
| "learning_rate": 0.0001839666687603375, |
| "loss": 0.9994, |
| "mean_token_accuracy": 0.7825042635202408, |
| "num_tokens": 10264224.0, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.2411984548223988, |
| "grad_norm": 0.3030165433883667, |
| "learning_rate": 0.0001839247953436761, |
| "loss": 0.8837, |
| "mean_token_accuracy": 0.7969729781150818, |
| "num_tokens": 10292171.0, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.24182657579849878, |
| "grad_norm": 0.29574254155158997, |
| "learning_rate": 0.00018388292192701464, |
| "loss": 0.9227, |
| "mean_token_accuracy": 0.7981263287365437, |
| "num_tokens": 10319685.0, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.2424546967745988, |
| "grad_norm": 0.333996444940567, |
| "learning_rate": 0.00018384104851035322, |
| "loss": 0.9511, |
| "mean_token_accuracy": 0.7944566797465086, |
| "num_tokens": 10346651.0, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.2430828177506988, |
| "grad_norm": 0.39183929562568665, |
| "learning_rate": 0.00018379917509369178, |
| "loss": 0.968, |
| "mean_token_accuracy": 0.793184470012784, |
| "num_tokens": 10371962.0, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.24371093872679878, |
| "grad_norm": 0.3321262300014496, |
| "learning_rate": 0.00018375730167703033, |
| "loss": 0.9696, |
| "mean_token_accuracy": 0.7914064366370439, |
| "num_tokens": 10397904.0, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.2443390597028988, |
| "grad_norm": 0.3826558589935303, |
| "learning_rate": 0.0001837154282603689, |
| "loss": 0.955, |
| "mean_token_accuracy": 0.7891239549964666, |
| "num_tokens": 10425656.0, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.24496718067899878, |
| "grad_norm": 0.35757163166999817, |
| "learning_rate": 0.00018367355484370747, |
| "loss": 0.9515, |
| "mean_token_accuracy": 0.7942003328353167, |
| "num_tokens": 10454621.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.24559530165509877, |
| "grad_norm": 0.28849247097969055, |
| "learning_rate": 0.00018363168142704605, |
| "loss": 0.9445, |
| "mean_token_accuracy": 0.7946181803941726, |
| "num_tokens": 10481140.0, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.24622342263119876, |
| "grad_norm": 0.3173221945762634, |
| "learning_rate": 0.0001835898080103846, |
| "loss": 0.9679, |
| "mean_token_accuracy": 0.789798391610384, |
| "num_tokens": 10507760.0, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.24685154360729877, |
| "grad_norm": 0.31581512093544006, |
| "learning_rate": 0.00018354793459372318, |
| "loss": 0.9838, |
| "mean_token_accuracy": 0.7864162161946296, |
| "num_tokens": 10535757.0, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.24747966458339876, |
| "grad_norm": 0.31570613384246826, |
| "learning_rate": 0.00018350606117706176, |
| "loss": 0.9984, |
| "mean_token_accuracy": 0.789697939157486, |
| "num_tokens": 10561484.0, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.24810778555949875, |
| "grad_norm": 0.3009042739868164, |
| "learning_rate": 0.00018346418776040032, |
| "loss": 0.9298, |
| "mean_token_accuracy": 0.7986709404736757, |
| "num_tokens": 10588115.0, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.24873590653559877, |
| "grad_norm": 0.3040034770965576, |
| "learning_rate": 0.0001834223143437389, |
| "loss": 1.0171, |
| "mean_token_accuracy": 0.7786661650985479, |
| "num_tokens": 10613945.0, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.24936402751169875, |
| "grad_norm": 0.26565641164779663, |
| "learning_rate": 0.00018338044092707745, |
| "loss": 0.9167, |
| "mean_token_accuracy": 0.8001658879220486, |
| "num_tokens": 10640631.0, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.24999214848779874, |
| "grad_norm": 0.32167062163352966, |
| "learning_rate": 0.00018333856751041603, |
| "loss": 0.9863, |
| "mean_token_accuracy": 0.7891666326671839, |
| "num_tokens": 10667054.0, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.25062026946389876, |
| "grad_norm": 0.307171106338501, |
| "learning_rate": 0.0001832966940937546, |
| "loss": 0.9554, |
| "mean_token_accuracy": 0.7922811262309551, |
| "num_tokens": 10694229.0, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.25124839043999875, |
| "grad_norm": 0.26245325803756714, |
| "learning_rate": 0.00018325482067709317, |
| "loss": 0.9843, |
| "mean_token_accuracy": 0.7896617949008942, |
| "num_tokens": 10719537.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.25187651141609874, |
| "grad_norm": 0.3621242046356201, |
| "learning_rate": 0.00018321294726043172, |
| "loss": 0.9465, |
| "mean_token_accuracy": 0.7954347494989633, |
| "num_tokens": 10745811.0, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.2525046323921987, |
| "grad_norm": 0.2879714071750641, |
| "learning_rate": 0.0001831710738437703, |
| "loss": 0.9682, |
| "mean_token_accuracy": 0.7943593975156545, |
| "num_tokens": 10772934.0, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.2531327533682987, |
| "grad_norm": 0.40183258056640625, |
| "learning_rate": 0.00018312920042710886, |
| "loss": 0.9464, |
| "mean_token_accuracy": 0.791998778283596, |
| "num_tokens": 10799932.0, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.25376087434439876, |
| "grad_norm": 0.3670477867126465, |
| "learning_rate": 0.0001830873270104474, |
| "loss": 0.9616, |
| "mean_token_accuracy": 0.7946044556796551, |
| "num_tokens": 10825734.0, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.25438899532049875, |
| "grad_norm": 0.3046157658100128, |
| "learning_rate": 0.000183045453593786, |
| "loss": 0.992, |
| "mean_token_accuracy": 0.7855597577989102, |
| "num_tokens": 10853694.0, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.25501711629659873, |
| "grad_norm": 0.30533158779144287, |
| "learning_rate": 0.00018300358017712455, |
| "loss": 0.9873, |
| "mean_token_accuracy": 0.7868763618171215, |
| "num_tokens": 10881365.0, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.2556452372726987, |
| "grad_norm": 0.33252570033073425, |
| "learning_rate": 0.00018296170676046313, |
| "loss": 0.9603, |
| "mean_token_accuracy": 0.789739453420043, |
| "num_tokens": 10907688.0, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.2562733582487987, |
| "grad_norm": 0.3089440166950226, |
| "learning_rate": 0.00018291983334380168, |
| "loss": 0.9137, |
| "mean_token_accuracy": 0.7987202100455761, |
| "num_tokens": 10935619.0, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.2569014792248987, |
| "grad_norm": 0.2957897186279297, |
| "learning_rate": 0.00018287795992714026, |
| "loss": 0.9725, |
| "mean_token_accuracy": 0.7916297178715468, |
| "num_tokens": 10962307.0, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.2575296002009987, |
| "grad_norm": 0.313556969165802, |
| "learning_rate": 0.00018283608651047884, |
| "loss": 1.0029, |
| "mean_token_accuracy": 0.7881143033504486, |
| "num_tokens": 10988375.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.25815772117709873, |
| "grad_norm": 0.3082195520401001, |
| "learning_rate": 0.0001827942130938174, |
| "loss": 0.9996, |
| "mean_token_accuracy": 0.786595806479454, |
| "num_tokens": 11015381.0, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.2587858421531987, |
| "grad_norm": 0.27436473965644836, |
| "learning_rate": 0.00018275233967715598, |
| "loss": 0.9067, |
| "mean_token_accuracy": 0.7987894963473081, |
| "num_tokens": 11042227.0, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.2594139631292987, |
| "grad_norm": 0.3056981861591339, |
| "learning_rate": 0.00018271046626049453, |
| "loss": 0.9891, |
| "mean_token_accuracy": 0.7872753620147706, |
| "num_tokens": 11068458.0, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.2600420841053987, |
| "grad_norm": 0.3361382782459259, |
| "learning_rate": 0.00018266859284383311, |
| "loss": 0.9871, |
| "mean_token_accuracy": 0.7879853140562773, |
| "num_tokens": 11094173.0, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.2606702050814987, |
| "grad_norm": 0.29034534096717834, |
| "learning_rate": 0.00018262671942717167, |
| "loss": 0.9292, |
| "mean_token_accuracy": 0.7946262218058109, |
| "num_tokens": 11121589.0, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.2612983260575987, |
| "grad_norm": 0.3440930247306824, |
| "learning_rate": 0.00018258484601051025, |
| "loss": 0.9392, |
| "mean_token_accuracy": 0.793309535458684, |
| "num_tokens": 11148217.0, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.2619264470336987, |
| "grad_norm": 0.38578805327415466, |
| "learning_rate": 0.0001825429725938488, |
| "loss": 0.9227, |
| "mean_token_accuracy": 0.7935150127857924, |
| "num_tokens": 11174747.0, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.2625545680097987, |
| "grad_norm": 0.28303763270378113, |
| "learning_rate": 0.00018250109917718736, |
| "loss": 0.9352, |
| "mean_token_accuracy": 0.7939535096287728, |
| "num_tokens": 11202085.0, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.2631826889858987, |
| "grad_norm": 0.275611013174057, |
| "learning_rate": 0.00018245922576052594, |
| "loss": 0.928, |
| "mean_token_accuracy": 0.7997066121548414, |
| "num_tokens": 11229432.0, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.2638108099619987, |
| "grad_norm": 0.34543925523757935, |
| "learning_rate": 0.0001824173523438645, |
| "loss": 0.9865, |
| "mean_token_accuracy": 0.7830164518207312, |
| "num_tokens": 11257570.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.26443893093809867, |
| "grad_norm": 0.2919905483722687, |
| "learning_rate": 0.00018237547892720307, |
| "loss": 0.925, |
| "mean_token_accuracy": 0.7997510485351086, |
| "num_tokens": 11285747.0, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.26506705191419866, |
| "grad_norm": 0.30447816848754883, |
| "learning_rate": 0.00018233360551054163, |
| "loss": 1.0015, |
| "mean_token_accuracy": 0.7857894655317068, |
| "num_tokens": 11312841.0, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.26569517289029865, |
| "grad_norm": 0.3498225212097168, |
| "learning_rate": 0.0001822917320938802, |
| "loss": 0.9477, |
| "mean_token_accuracy": 0.7929210104048252, |
| "num_tokens": 11341177.0, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.2663232938663987, |
| "grad_norm": 0.3432565927505493, |
| "learning_rate": 0.0001822498586772188, |
| "loss": 0.9762, |
| "mean_token_accuracy": 0.7873878616839647, |
| "num_tokens": 11367145.0, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.2669514148424987, |
| "grad_norm": 0.30563971400260925, |
| "learning_rate": 0.00018220798526055734, |
| "loss": 0.975, |
| "mean_token_accuracy": 0.788095697760582, |
| "num_tokens": 11394015.0, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.26757953581859867, |
| "grad_norm": 0.34202903509140015, |
| "learning_rate": 0.00018216611184389592, |
| "loss": 0.942, |
| "mean_token_accuracy": 0.7937498617917299, |
| "num_tokens": 11421244.0, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.26820765679469866, |
| "grad_norm": 0.306781142950058, |
| "learning_rate": 0.00018212423842723448, |
| "loss": 0.9205, |
| "mean_token_accuracy": 0.8030612777918578, |
| "num_tokens": 11448161.0, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.26883577777079865, |
| "grad_norm": 0.3466769754886627, |
| "learning_rate": 0.00018208236501057306, |
| "loss": 0.9918, |
| "mean_token_accuracy": 0.7877435315400362, |
| "num_tokens": 11474678.0, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.26946389874689863, |
| "grad_norm": 0.28220993280410767, |
| "learning_rate": 0.0001820404915939116, |
| "loss": 0.985, |
| "mean_token_accuracy": 0.7845060952007771, |
| "num_tokens": 11501747.0, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.2700920197229986, |
| "grad_norm": 0.2891874313354492, |
| "learning_rate": 0.0001819986181772502, |
| "loss": 0.9827, |
| "mean_token_accuracy": 0.786826417595148, |
| "num_tokens": 11527619.0, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.27072014069909867, |
| "grad_norm": 0.2904052734375, |
| "learning_rate": 0.00018195674476058875, |
| "loss": 0.9177, |
| "mean_token_accuracy": 0.7959654163569212, |
| "num_tokens": 11555059.0, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.27134826167519865, |
| "grad_norm": 0.35182616114616394, |
| "learning_rate": 0.0001819148713439273, |
| "loss": 0.9559, |
| "mean_token_accuracy": 0.7925355311483144, |
| "num_tokens": 11580504.0, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.27197638265129864, |
| "grad_norm": 0.2936030328273773, |
| "learning_rate": 0.00018187299792726588, |
| "loss": 0.9764, |
| "mean_token_accuracy": 0.7912591960281133, |
| "num_tokens": 11606802.0, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.27260450362739863, |
| "grad_norm": 0.3106949031352997, |
| "learning_rate": 0.00018183112451060444, |
| "loss": 0.966, |
| "mean_token_accuracy": 0.7912954032421112, |
| "num_tokens": 11633448.0, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.2732326246034986, |
| "grad_norm": 0.30941087007522583, |
| "learning_rate": 0.00018178925109394302, |
| "loss": 0.9231, |
| "mean_token_accuracy": 0.7992488227784633, |
| "num_tokens": 11660242.0, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.2738607455795986, |
| "grad_norm": 0.3347420394420624, |
| "learning_rate": 0.00018174737767728157, |
| "loss": 0.9753, |
| "mean_token_accuracy": 0.7909771021455526, |
| "num_tokens": 11687032.0, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.27448886655569865, |
| "grad_norm": 0.3798997104167938, |
| "learning_rate": 0.00018170550426062015, |
| "loss": 0.9779, |
| "mean_token_accuracy": 0.7896815791726113, |
| "num_tokens": 11712797.0, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.27511698753179864, |
| "grad_norm": 0.27888602018356323, |
| "learning_rate": 0.0001816636308439587, |
| "loss": 0.9728, |
| "mean_token_accuracy": 0.7927568309009075, |
| "num_tokens": 11738457.0, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.27574510850789863, |
| "grad_norm": 0.3379577398300171, |
| "learning_rate": 0.0001816217574272973, |
| "loss": 0.9381, |
| "mean_token_accuracy": 0.7947981592267752, |
| "num_tokens": 11766434.0, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.2763732294839986, |
| "grad_norm": 0.37430405616760254, |
| "learning_rate": 0.00018157988401063587, |
| "loss": 0.9852, |
| "mean_token_accuracy": 0.7833209618926048, |
| "num_tokens": 11793518.0, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.2770013504600986, |
| "grad_norm": 0.3272213041782379, |
| "learning_rate": 0.00018153801059397442, |
| "loss": 0.9891, |
| "mean_token_accuracy": 0.7863869782537222, |
| "num_tokens": 11819847.0, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.2776294714361986, |
| "grad_norm": 0.3244299292564392, |
| "learning_rate": 0.000181496137177313, |
| "loss": 0.9815, |
| "mean_token_accuracy": 0.7872505661100149, |
| "num_tokens": 11846697.0, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.2782575924122986, |
| "grad_norm": 0.43501153588294983, |
| "learning_rate": 0.00018145426376065156, |
| "loss": 0.9698, |
| "mean_token_accuracy": 0.789780105650425, |
| "num_tokens": 11872705.0, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.2788857133883986, |
| "grad_norm": 0.2940182089805603, |
| "learning_rate": 0.00018141239034399014, |
| "loss": 0.9041, |
| "mean_token_accuracy": 0.7976524058729411, |
| "num_tokens": 11901223.0, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.2795138343644986, |
| "grad_norm": 0.36934155225753784, |
| "learning_rate": 0.0001813705169273287, |
| "loss": 0.993, |
| "mean_token_accuracy": 0.7860413756221533, |
| "num_tokens": 11928250.0, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.2801419553405986, |
| "grad_norm": 0.33647122979164124, |
| "learning_rate": 0.00018132864351066725, |
| "loss": 0.9744, |
| "mean_token_accuracy": 0.7887967016547919, |
| "num_tokens": 11955060.0, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.2807700763166986, |
| "grad_norm": 0.3306765556335449, |
| "learning_rate": 0.00018128677009400583, |
| "loss": 0.9735, |
| "mean_token_accuracy": 0.7871614292263984, |
| "num_tokens": 11982955.0, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.2813981972927986, |
| "grad_norm": 0.2563401162624359, |
| "learning_rate": 0.00018124489667734438, |
| "loss": 0.9764, |
| "mean_token_accuracy": 0.7863705430179835, |
| "num_tokens": 12010704.0, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.28202631826889857, |
| "grad_norm": 0.4547005295753479, |
| "learning_rate": 0.00018120302326068296, |
| "loss": 0.952, |
| "mean_token_accuracy": 0.7937023017555476, |
| "num_tokens": 12037126.0, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.2826544392449986, |
| "grad_norm": 0.26261693239212036, |
| "learning_rate": 0.00018116114984402152, |
| "loss": 0.9602, |
| "mean_token_accuracy": 0.7892621707171201, |
| "num_tokens": 12063860.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.2832825602210986, |
| "grad_norm": 0.25773391127586365, |
| "learning_rate": 0.0001811192764273601, |
| "loss": 0.9724, |
| "mean_token_accuracy": 0.793383052945137, |
| "num_tokens": 12090682.0, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.2839106811971986, |
| "grad_norm": 0.26285916566848755, |
| "learning_rate": 0.00018107740301069865, |
| "loss": 0.9255, |
| "mean_token_accuracy": 0.7996356416493654, |
| "num_tokens": 12115946.0, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.2845388021732986, |
| "grad_norm": 0.32129448652267456, |
| "learning_rate": 0.00018103552959403723, |
| "loss": 1.0348, |
| "mean_token_accuracy": 0.7764022376388311, |
| "num_tokens": 12143516.0, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.28516692314939857, |
| "grad_norm": 0.27924448251724243, |
| "learning_rate": 0.00018099365617737582, |
| "loss": 0.9575, |
| "mean_token_accuracy": 0.7873435180634261, |
| "num_tokens": 12170735.0, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.28579504412549855, |
| "grad_norm": 0.2748197317123413, |
| "learning_rate": 0.00018095178276071437, |
| "loss": 1.0145, |
| "mean_token_accuracy": 0.7861239977180958, |
| "num_tokens": 12197615.0, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.28642316510159854, |
| "grad_norm": 0.38550078868865967, |
| "learning_rate": 0.00018090990934405295, |
| "loss": 0.9512, |
| "mean_token_accuracy": 0.7938284669071436, |
| "num_tokens": 12224675.0, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.2870512860776986, |
| "grad_norm": 0.3162452280521393, |
| "learning_rate": 0.0001808680359273915, |
| "loss": 0.9402, |
| "mean_token_accuracy": 0.7968017168343067, |
| "num_tokens": 12249844.0, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.2876794070537986, |
| "grad_norm": 0.37126636505126953, |
| "learning_rate": 0.00018082616251073009, |
| "loss": 0.9227, |
| "mean_token_accuracy": 0.7992094796150923, |
| "num_tokens": 12276906.0, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.28830752802989856, |
| "grad_norm": 0.32928577065467834, |
| "learning_rate": 0.00018078428909406864, |
| "loss": 0.9641, |
| "mean_token_accuracy": 0.790322245657444, |
| "num_tokens": 12303762.0, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.28893564900599855, |
| "grad_norm": 0.28634729981422424, |
| "learning_rate": 0.00018074241567740722, |
| "loss": 0.9171, |
| "mean_token_accuracy": 0.8008216977119446, |
| "num_tokens": 12331564.0, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.28956376998209854, |
| "grad_norm": 0.3661513924598694, |
| "learning_rate": 0.00018070054226074577, |
| "loss": 0.9081, |
| "mean_token_accuracy": 0.8026146795600653, |
| "num_tokens": 12358416.0, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.29019189095819853, |
| "grad_norm": 0.2952193021774292, |
| "learning_rate": 0.00018065866884408433, |
| "loss": 0.9801, |
| "mean_token_accuracy": 0.7893622420728207, |
| "num_tokens": 12384614.0, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.2908200119342986, |
| "grad_norm": 0.29245612025260925, |
| "learning_rate": 0.0001806167954274229, |
| "loss": 0.9465, |
| "mean_token_accuracy": 0.7965949896723032, |
| "num_tokens": 12411249.0, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.29144813291039856, |
| "grad_norm": 0.3053031265735626, |
| "learning_rate": 0.00018057492201076146, |
| "loss": 0.95, |
| "mean_token_accuracy": 0.79575967900455, |
| "num_tokens": 12436991.0, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.29207625388649855, |
| "grad_norm": 0.37885236740112305, |
| "learning_rate": 0.00018053304859410004, |
| "loss": 0.9614, |
| "mean_token_accuracy": 0.7889950573444366, |
| "num_tokens": 12465386.0, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.29270437486259854, |
| "grad_norm": 0.3537423610687256, |
| "learning_rate": 0.0001804911751774386, |
| "loss": 0.945, |
| "mean_token_accuracy": 0.793824827671051, |
| "num_tokens": 12491227.0, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.2933324958386985, |
| "grad_norm": 0.2990322411060333, |
| "learning_rate": 0.00018044930176077718, |
| "loss": 0.9232, |
| "mean_token_accuracy": 0.796284407377243, |
| "num_tokens": 12519230.0, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.2939606168147985, |
| "grad_norm": 0.32372578978538513, |
| "learning_rate": 0.00018040742834411576, |
| "loss": 0.9503, |
| "mean_token_accuracy": 0.7953814085572958, |
| "num_tokens": 12546047.0, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.2945887377908985, |
| "grad_norm": 0.2930040657520294, |
| "learning_rate": 0.00018036555492745431, |
| "loss": 0.8811, |
| "mean_token_accuracy": 0.8051992613822222, |
| "num_tokens": 12573184.0, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.29521685876699855, |
| "grad_norm": 0.3390980362892151, |
| "learning_rate": 0.0001803236815107929, |
| "loss": 0.9672, |
| "mean_token_accuracy": 0.7889087818562984, |
| "num_tokens": 12599385.0, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.29584497974309854, |
| "grad_norm": 0.28104329109191895, |
| "learning_rate": 0.00018028180809413145, |
| "loss": 0.9879, |
| "mean_token_accuracy": 0.7799696780741214, |
| "num_tokens": 12627295.0, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.2964731007191985, |
| "grad_norm": 0.2756447494029999, |
| "learning_rate": 0.00018023993467747003, |
| "loss": 0.9536, |
| "mean_token_accuracy": 0.7931569367647171, |
| "num_tokens": 12653585.0, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.2971012216952985, |
| "grad_norm": 0.34889882802963257, |
| "learning_rate": 0.00018019806126080859, |
| "loss": 0.9317, |
| "mean_token_accuracy": 0.7953014809638261, |
| "num_tokens": 12680922.0, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.2977293426713985, |
| "grad_norm": 0.3413899838924408, |
| "learning_rate": 0.00018015618784414717, |
| "loss": 0.9961, |
| "mean_token_accuracy": 0.783359244838357, |
| "num_tokens": 12708393.0, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.2983574636474985, |
| "grad_norm": 0.3137054145336151, |
| "learning_rate": 0.00018011431442748572, |
| "loss": 0.9451, |
| "mean_token_accuracy": 0.7911336876451969, |
| "num_tokens": 12735758.0, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.29898558462359853, |
| "grad_norm": 0.29756060242652893, |
| "learning_rate": 0.00018007244101082427, |
| "loss": 0.9465, |
| "mean_token_accuracy": 0.7898903641849756, |
| "num_tokens": 12763112.0, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.2996137055996985, |
| "grad_norm": 0.3134726881980896, |
| "learning_rate": 0.00018003056759416286, |
| "loss": 0.9612, |
| "mean_token_accuracy": 0.7912685304880143, |
| "num_tokens": 12789686.0, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.3002418265757985, |
| "grad_norm": 0.32834240794181824, |
| "learning_rate": 0.0001799886941775014, |
| "loss": 0.9307, |
| "mean_token_accuracy": 0.8001178815960884, |
| "num_tokens": 12813787.0, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.3008699475518985, |
| "grad_norm": 0.30090004205703735, |
| "learning_rate": 0.00017994682076084, |
| "loss": 0.9261, |
| "mean_token_accuracy": 0.7985043011605739, |
| "num_tokens": 12840195.0, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.3014980685279985, |
| "grad_norm": 0.2991957366466522, |
| "learning_rate": 0.00017990494734417854, |
| "loss": 0.9238, |
| "mean_token_accuracy": 0.799000171199441, |
| "num_tokens": 12868504.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.3021261895040985, |
| "grad_norm": 0.37887468934059143, |
| "learning_rate": 0.00017986307392751713, |
| "loss": 0.9457, |
| "mean_token_accuracy": 0.792426348477602, |
| "num_tokens": 12895941.0, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.30275431048019846, |
| "grad_norm": 0.3643001914024353, |
| "learning_rate": 0.00017982120051085568, |
| "loss": 0.9328, |
| "mean_token_accuracy": 0.7979659728705883, |
| "num_tokens": 12922039.0, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.3033824314562985, |
| "grad_norm": 0.32898885011672974, |
| "learning_rate": 0.00017977932709419426, |
| "loss": 0.997, |
| "mean_token_accuracy": 0.7901430610567332, |
| "num_tokens": 12946947.0, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.3040105524323985, |
| "grad_norm": 0.30726224184036255, |
| "learning_rate": 0.00017973745367753284, |
| "loss": 0.9484, |
| "mean_token_accuracy": 0.792856489494443, |
| "num_tokens": 12974095.0, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.3046386734084985, |
| "grad_norm": 0.35284626483917236, |
| "learning_rate": 0.0001796955802608714, |
| "loss": 0.9449, |
| "mean_token_accuracy": 0.7954190034419298, |
| "num_tokens": 13000158.0, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.30526679438459847, |
| "grad_norm": 0.35595861077308655, |
| "learning_rate": 0.00017965370684420998, |
| "loss": 0.9713, |
| "mean_token_accuracy": 0.7911023162305355, |
| "num_tokens": 13027170.0, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.30589491536069846, |
| "grad_norm": 0.34379082918167114, |
| "learning_rate": 0.00017961183342754853, |
| "loss": 0.9699, |
| "mean_token_accuracy": 0.7862888902425766, |
| "num_tokens": 13053584.0, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.30652303633679845, |
| "grad_norm": 0.37722787261009216, |
| "learning_rate": 0.0001795699600108871, |
| "loss": 0.9684, |
| "mean_token_accuracy": 0.7894018895924091, |
| "num_tokens": 13080176.0, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.30715115731289844, |
| "grad_norm": 0.3420683741569519, |
| "learning_rate": 0.00017952808659422567, |
| "loss": 0.9754, |
| "mean_token_accuracy": 0.7926111649721861, |
| "num_tokens": 13106832.0, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.3077792782889985, |
| "grad_norm": 0.30542516708374023, |
| "learning_rate": 0.00017948621317756422, |
| "loss": 0.9606, |
| "mean_token_accuracy": 0.7916467692703009, |
| "num_tokens": 13134773.0, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.30840739926509847, |
| "grad_norm": 0.4134035110473633, |
| "learning_rate": 0.0001794443397609028, |
| "loss": 0.9761, |
| "mean_token_accuracy": 0.7897003520280123, |
| "num_tokens": 13161464.0, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.30903552024119846, |
| "grad_norm": 0.35040563344955444, |
| "learning_rate": 0.00017940246634424135, |
| "loss": 0.9451, |
| "mean_token_accuracy": 0.7977105394005776, |
| "num_tokens": 13186861.0, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.30966364121729845, |
| "grad_norm": 0.2841908931732178, |
| "learning_rate": 0.00017936059292757994, |
| "loss": 0.9407, |
| "mean_token_accuracy": 0.7971719756722451, |
| "num_tokens": 13212420.0, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.31029176219339843, |
| "grad_norm": 0.30812788009643555, |
| "learning_rate": 0.0001793187195109185, |
| "loss": 0.9574, |
| "mean_token_accuracy": 0.7923252787441015, |
| "num_tokens": 13238267.0, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.3109198831694984, |
| "grad_norm": 0.2617832124233246, |
| "learning_rate": 0.00017927684609425707, |
| "loss": 0.9699, |
| "mean_token_accuracy": 0.7890806578099727, |
| "num_tokens": 13265852.0, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.31154800414559847, |
| "grad_norm": 0.32903388142585754, |
| "learning_rate": 0.00017923497267759563, |
| "loss": 0.9375, |
| "mean_token_accuracy": 0.7949575208127498, |
| "num_tokens": 13292850.0, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.31217612512169846, |
| "grad_norm": 0.31031638383865356, |
| "learning_rate": 0.0001791930992609342, |
| "loss": 0.9412, |
| "mean_token_accuracy": 0.7894805524498224, |
| "num_tokens": 13320007.0, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.31280424609779844, |
| "grad_norm": 0.3920259475708008, |
| "learning_rate": 0.0001791512258442728, |
| "loss": 0.9246, |
| "mean_token_accuracy": 0.7959085434675217, |
| "num_tokens": 13346689.0, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.31343236707389843, |
| "grad_norm": 0.30121907591819763, |
| "learning_rate": 0.00017910935242761134, |
| "loss": 0.9555, |
| "mean_token_accuracy": 0.7920045137405396, |
| "num_tokens": 13374367.0, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.3140604880499984, |
| "grad_norm": 0.3035444915294647, |
| "learning_rate": 0.00017906747901094992, |
| "loss": 0.9759, |
| "mean_token_accuracy": 0.7897166911512613, |
| "num_tokens": 13400170.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.3146886090260984, |
| "grad_norm": 0.27995365858078003, |
| "learning_rate": 0.00017902560559428848, |
| "loss": 0.9508, |
| "mean_token_accuracy": 0.796136661618948, |
| "num_tokens": 13428461.0, |
| "step": 5010 |
| }, |
| { |
| "epoch": 0.3153167300021984, |
| "grad_norm": 0.3135192096233368, |
| "learning_rate": 0.00017898373217762706, |
| "loss": 0.8978, |
| "mean_token_accuracy": 0.800453482940793, |
| "num_tokens": 13455697.0, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.31594485097829844, |
| "grad_norm": 0.3059029281139374, |
| "learning_rate": 0.0001789418587609656, |
| "loss": 0.9329, |
| "mean_token_accuracy": 0.793091481178999, |
| "num_tokens": 13483193.0, |
| "step": 5030 |
| }, |
| { |
| "epoch": 0.31657297195439843, |
| "grad_norm": 0.3233183026313782, |
| "learning_rate": 0.00017889998534430417, |
| "loss": 0.9333, |
| "mean_token_accuracy": 0.7970968656241894, |
| "num_tokens": 13508603.0, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.3172010929304984, |
| "grad_norm": 0.3030914068222046, |
| "learning_rate": 0.00017885811192764275, |
| "loss": 0.9514, |
| "mean_token_accuracy": 0.7929343525320292, |
| "num_tokens": 13536186.0, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.3178292139065984, |
| "grad_norm": 0.3712503910064697, |
| "learning_rate": 0.0001788162385109813, |
| "loss": 0.9723, |
| "mean_token_accuracy": 0.7916972611099482, |
| "num_tokens": 13562236.0, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.3184573348826984, |
| "grad_norm": 0.339174747467041, |
| "learning_rate": 0.00017877436509431988, |
| "loss": 0.9515, |
| "mean_token_accuracy": 0.7921494416892528, |
| "num_tokens": 13588482.0, |
| "step": 5070 |
| }, |
| { |
| "epoch": 0.3190854558587984, |
| "grad_norm": 0.3653663694858551, |
| "learning_rate": 0.00017873249167765844, |
| "loss": 0.9686, |
| "mean_token_accuracy": 0.7870390675961971, |
| "num_tokens": 13615395.0, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.3197135768348984, |
| "grad_norm": 0.27835947275161743, |
| "learning_rate": 0.00017869061826099702, |
| "loss": 0.989, |
| "mean_token_accuracy": 0.7866588454693556, |
| "num_tokens": 13643065.0, |
| "step": 5090 |
| }, |
| { |
| "epoch": 0.3203416978109984, |
| "grad_norm": 0.33390867710113525, |
| "learning_rate": 0.00017864874484433557, |
| "loss": 0.9766, |
| "mean_token_accuracy": 0.7885018114000559, |
| "num_tokens": 13670119.0, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.3209698187870984, |
| "grad_norm": 0.2845928966999054, |
| "learning_rate": 0.00017860687142767415, |
| "loss": 0.9291, |
| "mean_token_accuracy": 0.7989787317812442, |
| "num_tokens": 13696601.0, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.3215979397631984, |
| "grad_norm": 0.35259291529655457, |
| "learning_rate": 0.0001785649980110127, |
| "loss": 0.9458, |
| "mean_token_accuracy": 0.7948228023946285, |
| "num_tokens": 13723415.0, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.3222260607392984, |
| "grad_norm": 0.3824492394924164, |
| "learning_rate": 0.0001785231245943513, |
| "loss": 0.9885, |
| "mean_token_accuracy": 0.7887037217617034, |
| "num_tokens": 13748699.0, |
| "step": 5130 |
| }, |
| { |
| "epoch": 0.32285418171539837, |
| "grad_norm": 0.2946266829967499, |
| "learning_rate": 0.00017848125117768987, |
| "loss": 0.9656, |
| "mean_token_accuracy": 0.7888725634664298, |
| "num_tokens": 13775606.0, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.32348230269149836, |
| "grad_norm": 0.2542276382446289, |
| "learning_rate": 0.00017843937776102842, |
| "loss": 0.9737, |
| "mean_token_accuracy": 0.7885618463158608, |
| "num_tokens": 13802423.0, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.3241104236675984, |
| "grad_norm": 0.30069777369499207, |
| "learning_rate": 0.000178397504344367, |
| "loss": 0.9454, |
| "mean_token_accuracy": 0.7927018702030182, |
| "num_tokens": 13828657.0, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.3247385446436984, |
| "grad_norm": 0.3156718611717224, |
| "learning_rate": 0.00017835563092770556, |
| "loss": 0.9359, |
| "mean_token_accuracy": 0.795510170981288, |
| "num_tokens": 13855305.0, |
| "step": 5170 |
| }, |
| { |
| "epoch": 0.3253666656197984, |
| "grad_norm": 0.3123616874217987, |
| "learning_rate": 0.00017831375751104414, |
| "loss": 0.9678, |
| "mean_token_accuracy": 0.7933816347271204, |
| "num_tokens": 13881684.0, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.32599478659589837, |
| "grad_norm": 0.29278364777565, |
| "learning_rate": 0.0001782718840943827, |
| "loss": 0.9691, |
| "mean_token_accuracy": 0.7854080755263567, |
| "num_tokens": 13908078.0, |
| "step": 5190 |
| }, |
| { |
| "epoch": 0.32662290757199836, |
| "grad_norm": 0.3442671000957489, |
| "learning_rate": 0.00017823001067772125, |
| "loss": 0.9622, |
| "mean_token_accuracy": 0.7895106051117182, |
| "num_tokens": 13935018.0, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.32725102854809834, |
| "grad_norm": 0.325995534658432, |
| "learning_rate": 0.00017818813726105983, |
| "loss": 0.9734, |
| "mean_token_accuracy": 0.7888151530176402, |
| "num_tokens": 13960855.0, |
| "step": 5210 |
| }, |
| { |
| "epoch": 0.3278791495241984, |
| "grad_norm": 0.26817047595977783, |
| "learning_rate": 0.00017814626384439838, |
| "loss": 1.0164, |
| "mean_token_accuracy": 0.7806048065423965, |
| "num_tokens": 13987319.0, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.3285072705002984, |
| "grad_norm": 0.3482087552547455, |
| "learning_rate": 0.00017810439042773696, |
| "loss": 0.9569, |
| "mean_token_accuracy": 0.7944883365184069, |
| "num_tokens": 14012706.0, |
| "step": 5230 |
| }, |
| { |
| "epoch": 0.32913539147639836, |
| "grad_norm": 0.32223814725875854, |
| "learning_rate": 0.00017806251701107552, |
| "loss": 0.9498, |
| "mean_token_accuracy": 0.7911127615720034, |
| "num_tokens": 14038936.0, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.32976351245249835, |
| "grad_norm": 0.33856573700904846, |
| "learning_rate": 0.0001780206435944141, |
| "loss": 0.9662, |
| "mean_token_accuracy": 0.7920481752604246, |
| "num_tokens": 14064469.0, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.33039163342859834, |
| "grad_norm": 0.3517283499240875, |
| "learning_rate": 0.00017797877017775265, |
| "loss": 0.9737, |
| "mean_token_accuracy": 0.7935274243354797, |
| "num_tokens": 14091321.0, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.33101975440469833, |
| "grad_norm": 0.3329240083694458, |
| "learning_rate": 0.00017793689676109123, |
| "loss": 0.9772, |
| "mean_token_accuracy": 0.7892213884741068, |
| "num_tokens": 14116727.0, |
| "step": 5270 |
| }, |
| { |
| "epoch": 0.3316478753807983, |
| "grad_norm": 0.3505692780017853, |
| "learning_rate": 0.0001778950233444298, |
| "loss": 0.9517, |
| "mean_token_accuracy": 0.7921177882701158, |
| "num_tokens": 14144556.0, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.33227599635689836, |
| "grad_norm": 0.36645811796188354, |
| "learning_rate": 0.00017785314992776837, |
| "loss": 0.9694, |
| "mean_token_accuracy": 0.7911825001239776, |
| "num_tokens": 14171522.0, |
| "step": 5290 |
| }, |
| { |
| "epoch": 0.33290411733299835, |
| "grad_norm": 0.29255688190460205, |
| "learning_rate": 0.00017781127651110695, |
| "loss": 0.968, |
| "mean_token_accuracy": 0.789820882678032, |
| "num_tokens": 14198901.0, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.33353223830909834, |
| "grad_norm": 0.34405186772346497, |
| "learning_rate": 0.0001777694030944455, |
| "loss": 0.9414, |
| "mean_token_accuracy": 0.797522522136569, |
| "num_tokens": 14224801.0, |
| "step": 5310 |
| }, |
| { |
| "epoch": 0.3341603592851983, |
| "grad_norm": 0.2979678809642792, |
| "learning_rate": 0.00017772752967778408, |
| "loss": 0.9479, |
| "mean_token_accuracy": 0.7887560345232487, |
| "num_tokens": 14252677.0, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.3347884802612983, |
| "grad_norm": 0.2529391646385193, |
| "learning_rate": 0.00017768565626112264, |
| "loss": 0.9535, |
| "mean_token_accuracy": 0.7968549765646458, |
| "num_tokens": 14280416.0, |
| "step": 5330 |
| }, |
| { |
| "epoch": 0.3354166012373983, |
| "grad_norm": 0.31212252378463745, |
| "learning_rate": 0.0001776437828444612, |
| "loss": 0.9753, |
| "mean_token_accuracy": 0.7860465437173844, |
| "num_tokens": 14307299.0, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.3360447222134983, |
| "grad_norm": 0.3829305171966553, |
| "learning_rate": 0.00017760190942779977, |
| "loss": 0.9157, |
| "mean_token_accuracy": 0.7976312339305878, |
| "num_tokens": 14334235.0, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.33667284318959834, |
| "grad_norm": 0.2862161099910736, |
| "learning_rate": 0.00017756003601113833, |
| "loss": 0.8948, |
| "mean_token_accuracy": 0.8058628041297198, |
| "num_tokens": 14360748.0, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.3373009641656983, |
| "grad_norm": 0.3067629039287567, |
| "learning_rate": 0.0001775181625944769, |
| "loss": 0.93, |
| "mean_token_accuracy": 0.8005598716437816, |
| "num_tokens": 14387372.0, |
| "step": 5370 |
| }, |
| { |
| "epoch": 0.3379290851417983, |
| "grad_norm": 0.33197638392448425, |
| "learning_rate": 0.00017747628917781546, |
| "loss": 0.9684, |
| "mean_token_accuracy": 0.7905099768191576, |
| "num_tokens": 14414745.0, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.3385572061178983, |
| "grad_norm": 0.3821481168270111, |
| "learning_rate": 0.00017743441576115404, |
| "loss": 0.933, |
| "mean_token_accuracy": 0.7985155992209911, |
| "num_tokens": 14442045.0, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.3391853270939983, |
| "grad_norm": 0.32107749581336975, |
| "learning_rate": 0.0001773925423444926, |
| "loss": 0.9383, |
| "mean_token_accuracy": 0.7931062672287226, |
| "num_tokens": 14469523.0, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.3398134480700983, |
| "grad_norm": 0.3166064918041229, |
| "learning_rate": 0.00017735066892783118, |
| "loss": 0.9129, |
| "mean_token_accuracy": 0.7990059111267328, |
| "num_tokens": 14496732.0, |
| "step": 5410 |
| }, |
| { |
| "epoch": 0.3404415690461983, |
| "grad_norm": 0.3135284185409546, |
| "learning_rate": 0.00017730879551116973, |
| "loss": 0.932, |
| "mean_token_accuracy": 0.7971233692020178, |
| "num_tokens": 14523238.0, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.3410696900222983, |
| "grad_norm": 0.30107152462005615, |
| "learning_rate": 0.0001772669220945083, |
| "loss": 0.9483, |
| "mean_token_accuracy": 0.7947558045387269, |
| "num_tokens": 14550996.0, |
| "step": 5430 |
| }, |
| { |
| "epoch": 0.3416978109983983, |
| "grad_norm": 0.3249971568584442, |
| "learning_rate": 0.0001772250486778469, |
| "loss": 0.967, |
| "mean_token_accuracy": 0.7944178026169538, |
| "num_tokens": 14577389.0, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.3423259319744983, |
| "grad_norm": 0.29780882596969604, |
| "learning_rate": 0.00017718317526118545, |
| "loss": 0.9498, |
| "mean_token_accuracy": 0.7930607028305531, |
| "num_tokens": 14603545.0, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.3429540529505983, |
| "grad_norm": 0.3544989824295044, |
| "learning_rate": 0.00017714130184452403, |
| "loss": 0.944, |
| "mean_token_accuracy": 0.7989666901528836, |
| "num_tokens": 14631497.0, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.34358217392669826, |
| "grad_norm": 0.3154118061065674, |
| "learning_rate": 0.00017709942842786258, |
| "loss": 0.9414, |
| "mean_token_accuracy": 0.8006433036178351, |
| "num_tokens": 14657118.0, |
| "step": 5470 |
| }, |
| { |
| "epoch": 0.34421029490279825, |
| "grad_norm": 0.36263030767440796, |
| "learning_rate": 0.00017705755501120114, |
| "loss": 0.9528, |
| "mean_token_accuracy": 0.7943309776484966, |
| "num_tokens": 14683766.0, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.3448384158788983, |
| "grad_norm": 0.37745627760887146, |
| "learning_rate": 0.00017701568159453972, |
| "loss": 0.9394, |
| "mean_token_accuracy": 0.7972447019070387, |
| "num_tokens": 14712282.0, |
| "step": 5490 |
| }, |
| { |
| "epoch": 0.3454665368549983, |
| "grad_norm": 0.3214879333972931, |
| "learning_rate": 0.00017697380817787827, |
| "loss": 0.9196, |
| "mean_token_accuracy": 0.8019496221095324, |
| "num_tokens": 14738530.0, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.3460946578310983, |
| "grad_norm": 0.3479548692703247, |
| "learning_rate": 0.00017693193476121685, |
| "loss": 0.9791, |
| "mean_token_accuracy": 0.7863402977585793, |
| "num_tokens": 14765716.0, |
| "step": 5510 |
| }, |
| { |
| "epoch": 0.34672277880719826, |
| "grad_norm": 0.36872756481170654, |
| "learning_rate": 0.0001768900613445554, |
| "loss": 0.923, |
| "mean_token_accuracy": 0.7984681211411953, |
| "num_tokens": 14794865.0, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.34735089978329825, |
| "grad_norm": 0.2868911921977997, |
| "learning_rate": 0.000176848187927894, |
| "loss": 1.004, |
| "mean_token_accuracy": 0.7837614696472883, |
| "num_tokens": 14821928.0, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.34797902075939824, |
| "grad_norm": 0.2693776488304138, |
| "learning_rate": 0.00017680631451123254, |
| "loss": 0.9435, |
| "mean_token_accuracy": 0.7989333860576153, |
| "num_tokens": 14848566.0, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.3486071417354983, |
| "grad_norm": 0.32554733753204346, |
| "learning_rate": 0.0001767644410945711, |
| "loss": 0.9346, |
| "mean_token_accuracy": 0.7996318481862545, |
| "num_tokens": 14875559.0, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.34923526271159827, |
| "grad_norm": 0.3206787705421448, |
| "learning_rate": 0.00017672256767790968, |
| "loss": 0.947, |
| "mean_token_accuracy": 0.7963632360100746, |
| "num_tokens": 14901870.0, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.34986338368769826, |
| "grad_norm": 0.3555513322353363, |
| "learning_rate": 0.00017668069426124826, |
| "loss": 0.9494, |
| "mean_token_accuracy": 0.7952044978737831, |
| "num_tokens": 14928011.0, |
| "step": 5570 |
| }, |
| { |
| "epoch": 0.35049150466379825, |
| "grad_norm": 0.28491222858428955, |
| "learning_rate": 0.00017663882084458684, |
| "loss": 0.9522, |
| "mean_token_accuracy": 0.7924054119735956, |
| "num_tokens": 14956522.0, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.35111962563989824, |
| "grad_norm": 0.3041023910045624, |
| "learning_rate": 0.0001765969474279254, |
| "loss": 0.9318, |
| "mean_token_accuracy": 0.7982840724289417, |
| "num_tokens": 14982785.0, |
| "step": 5590 |
| }, |
| { |
| "epoch": 0.3517477466159982, |
| "grad_norm": 0.35401952266693115, |
| "learning_rate": 0.00017655507401126397, |
| "loss": 0.9956, |
| "mean_token_accuracy": 0.7896864812821149, |
| "num_tokens": 15008716.0, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.3523758675920982, |
| "grad_norm": 0.34883061051368713, |
| "learning_rate": 0.00017651320059460253, |
| "loss": 0.9171, |
| "mean_token_accuracy": 0.797677880898118, |
| "num_tokens": 15034928.0, |
| "step": 5610 |
| }, |
| { |
| "epoch": 0.35300398856819826, |
| "grad_norm": 0.32336753606796265, |
| "learning_rate": 0.00017647132717794108, |
| "loss": 0.9584, |
| "mean_token_accuracy": 0.7909554496407509, |
| "num_tokens": 15062010.0, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.35363210954429825, |
| "grad_norm": 0.3301476836204529, |
| "learning_rate": 0.00017642945376127966, |
| "loss": 0.9494, |
| "mean_token_accuracy": 0.7945166520774365, |
| "num_tokens": 15089039.0, |
| "step": 5630 |
| }, |
| { |
| "epoch": 0.35426023052039823, |
| "grad_norm": 0.37666943669319153, |
| "learning_rate": 0.00017638758034461822, |
| "loss": 0.9253, |
| "mean_token_accuracy": 0.7964357610791921, |
| "num_tokens": 15117169.0, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.3548883514964982, |
| "grad_norm": 0.3678019046783447, |
| "learning_rate": 0.0001763457069279568, |
| "loss": 0.9604, |
| "mean_token_accuracy": 0.7923226218670607, |
| "num_tokens": 15143120.0, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.3555164724725982, |
| "grad_norm": 0.30747926235198975, |
| "learning_rate": 0.00017630383351129535, |
| "loss": 0.9976, |
| "mean_token_accuracy": 0.7888089545071125, |
| "num_tokens": 15169183.0, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.3561445934486982, |
| "grad_norm": 0.28103527426719666, |
| "learning_rate": 0.00017626196009463393, |
| "loss": 0.9283, |
| "mean_token_accuracy": 0.7940921634435654, |
| "num_tokens": 15196750.0, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.35677271442479824, |
| "grad_norm": 0.30865031480789185, |
| "learning_rate": 0.0001762200866779725, |
| "loss": 0.9423, |
| "mean_token_accuracy": 0.7928956486284733, |
| "num_tokens": 15223518.0, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.35740083540089823, |
| "grad_norm": 0.30006086826324463, |
| "learning_rate": 0.00017617821326131107, |
| "loss": 0.9619, |
| "mean_token_accuracy": 0.791780112311244, |
| "num_tokens": 15249472.0, |
| "step": 5690 |
| }, |
| { |
| "epoch": 0.3580289563769982, |
| "grad_norm": 0.3119317293167114, |
| "learning_rate": 0.00017613633984464962, |
| "loss": 0.9378, |
| "mean_token_accuracy": 0.7980754714459181, |
| "num_tokens": 15275693.0, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.3586570773530982, |
| "grad_norm": 0.3226664066314697, |
| "learning_rate": 0.0001760944664279882, |
| "loss": 0.9678, |
| "mean_token_accuracy": 0.7938013020902872, |
| "num_tokens": 15302267.0, |
| "step": 5710 |
| }, |
| { |
| "epoch": 0.3592851983291982, |
| "grad_norm": 0.3189242482185364, |
| "learning_rate": 0.00017605259301132679, |
| "loss": 0.9533, |
| "mean_token_accuracy": 0.7953298572450876, |
| "num_tokens": 15328385.0, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.3599133193052982, |
| "grad_norm": 0.4274740517139435, |
| "learning_rate": 0.00017601071959466534, |
| "loss": 0.9676, |
| "mean_token_accuracy": 0.7888112541288137, |
| "num_tokens": 15356049.0, |
| "step": 5730 |
| }, |
| { |
| "epoch": 0.3605414402813982, |
| "grad_norm": 0.2636527121067047, |
| "learning_rate": 0.00017596884617800392, |
| "loss": 0.95, |
| "mean_token_accuracy": 0.794311236217618, |
| "num_tokens": 15383261.0, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.3611695612574982, |
| "grad_norm": 0.39001110196113586, |
| "learning_rate": 0.00017592697276134247, |
| "loss": 0.907, |
| "mean_token_accuracy": 0.8008437678217888, |
| "num_tokens": 15410782.0, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.3617976822335982, |
| "grad_norm": 0.3736308515071869, |
| "learning_rate": 0.00017588509934468106, |
| "loss": 0.9399, |
| "mean_token_accuracy": 0.7988893665373326, |
| "num_tokens": 15435944.0, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.3624258032096982, |
| "grad_norm": 0.41546639800071716, |
| "learning_rate": 0.0001758432259280196, |
| "loss": 0.9396, |
| "mean_token_accuracy": 0.7928215757012367, |
| "num_tokens": 15463019.0, |
| "step": 5770 |
| }, |
| { |
| "epoch": 0.3630539241857982, |
| "grad_norm": 0.3147844970226288, |
| "learning_rate": 0.00017580135251135816, |
| "loss": 0.9341, |
| "mean_token_accuracy": 0.8001567754894495, |
| "num_tokens": 15490689.0, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.36368204516189817, |
| "grad_norm": 0.3456019461154938, |
| "learning_rate": 0.00017575947909469674, |
| "loss": 0.9551, |
| "mean_token_accuracy": 0.7912769354879856, |
| "num_tokens": 15516630.0, |
| "step": 5790 |
| }, |
| { |
| "epoch": 0.36431016613799816, |
| "grad_norm": 0.3122086822986603, |
| "learning_rate": 0.0001757176056780353, |
| "loss": 0.9609, |
| "mean_token_accuracy": 0.7908839665353298, |
| "num_tokens": 15543530.0, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.36493828711409815, |
| "grad_norm": 0.29509490728378296, |
| "learning_rate": 0.00017567573226137388, |
| "loss": 0.9038, |
| "mean_token_accuracy": 0.8028480164706707, |
| "num_tokens": 15570842.0, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.3655664080901982, |
| "grad_norm": 0.32276204228401184, |
| "learning_rate": 0.00017563385884471243, |
| "loss": 0.9863, |
| "mean_token_accuracy": 0.7871858242899179, |
| "num_tokens": 15597394.0, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.3661945290662982, |
| "grad_norm": 0.29563000798225403, |
| "learning_rate": 0.00017559198542805101, |
| "loss": 0.9456, |
| "mean_token_accuracy": 0.796840837597847, |
| "num_tokens": 15624689.0, |
| "step": 5830 |
| }, |
| { |
| "epoch": 0.36682265004239817, |
| "grad_norm": 0.2788376808166504, |
| "learning_rate": 0.00017555011201138957, |
| "loss": 0.9471, |
| "mean_token_accuracy": 0.7943929139524698, |
| "num_tokens": 15651542.0, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.36745077101849816, |
| "grad_norm": 0.33232542872428894, |
| "learning_rate": 0.00017550823859472812, |
| "loss": 0.9591, |
| "mean_token_accuracy": 0.7914271518588066, |
| "num_tokens": 15677826.0, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.36807889199459815, |
| "grad_norm": 0.26375389099121094, |
| "learning_rate": 0.0001754663651780667, |
| "loss": 0.9518, |
| "mean_token_accuracy": 0.7952167768031358, |
| "num_tokens": 15704198.0, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.36870701297069813, |
| "grad_norm": 0.3428965210914612, |
| "learning_rate": 0.00017542449176140528, |
| "loss": 0.9521, |
| "mean_token_accuracy": 0.7944683827459812, |
| "num_tokens": 15730973.0, |
| "step": 5870 |
| }, |
| { |
| "epoch": 0.3693351339467982, |
| "grad_norm": 0.3386590778827667, |
| "learning_rate": 0.00017538261834474387, |
| "loss": 0.9106, |
| "mean_token_accuracy": 0.8001317955553532, |
| "num_tokens": 15758768.0, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.36996325492289817, |
| "grad_norm": 0.3398821949958801, |
| "learning_rate": 0.00017534074492808242, |
| "loss": 0.9813, |
| "mean_token_accuracy": 0.7854520630091428, |
| "num_tokens": 15786299.0, |
| "step": 5890 |
| }, |
| { |
| "epoch": 0.37059137589899815, |
| "grad_norm": 0.32635533809661865, |
| "learning_rate": 0.000175298871511421, |
| "loss": 0.9326, |
| "mean_token_accuracy": 0.79741803817451, |
| "num_tokens": 15813458.0, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.37121949687509814, |
| "grad_norm": 0.3272740840911865, |
| "learning_rate": 0.00017525699809475956, |
| "loss": 0.9625, |
| "mean_token_accuracy": 0.7884005717933178, |
| "num_tokens": 15840582.0, |
| "step": 5910 |
| }, |
| { |
| "epoch": 0.37184761785119813, |
| "grad_norm": 0.33792024850845337, |
| "learning_rate": 0.0001752151246780981, |
| "loss": 0.9932, |
| "mean_token_accuracy": 0.7824124969542027, |
| "num_tokens": 15867012.0, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.3724757388272981, |
| "grad_norm": 0.2628950774669647, |
| "learning_rate": 0.0001751732512614367, |
| "loss": 0.9155, |
| "mean_token_accuracy": 0.7996829584240913, |
| "num_tokens": 15894376.0, |
| "step": 5930 |
| }, |
| { |
| "epoch": 0.3731038598033981, |
| "grad_norm": 0.4289126694202423, |
| "learning_rate": 0.00017513137784477524, |
| "loss": 0.9667, |
| "mean_token_accuracy": 0.7891981620341539, |
| "num_tokens": 15923048.0, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.37373198077949815, |
| "grad_norm": 0.28455209732055664, |
| "learning_rate": 0.00017508950442811383, |
| "loss": 0.9627, |
| "mean_token_accuracy": 0.7922193612903357, |
| "num_tokens": 15951494.0, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.37436010175559814, |
| "grad_norm": 0.3707982003688812, |
| "learning_rate": 0.00017504763101145238, |
| "loss": 0.9345, |
| "mean_token_accuracy": 0.7945095077157021, |
| "num_tokens": 15978771.0, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.37498822273169813, |
| "grad_norm": 0.3321593105792999, |
| "learning_rate": 0.00017500575759479096, |
| "loss": 0.9217, |
| "mean_token_accuracy": 0.7928752236068248, |
| "num_tokens": 16006003.0, |
| "step": 5970 |
| }, |
| { |
| "epoch": 0.3756163437077981, |
| "grad_norm": 0.320150226354599, |
| "learning_rate": 0.00017496388417812951, |
| "loss": 0.952, |
| "mean_token_accuracy": 0.7876376051455736, |
| "num_tokens": 16033395.0, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.3762444646838981, |
| "grad_norm": 0.3004560172557831, |
| "learning_rate": 0.00017492201076146807, |
| "loss": 1.0095, |
| "mean_token_accuracy": 0.7821074955165386, |
| "num_tokens": 16060855.0, |
| "step": 5990 |
| }, |
| { |
| "epoch": 0.3768725856599981, |
| "grad_norm": 0.29771584272384644, |
| "learning_rate": 0.00017488013734480665, |
| "loss": 0.9307, |
| "mean_token_accuracy": 0.796100390329957, |
| "num_tokens": 16088000.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.37750070663609814, |
| "grad_norm": 0.27265238761901855, |
| "learning_rate": 0.00017483826392814523, |
| "loss": 0.9272, |
| "mean_token_accuracy": 0.797407491132617, |
| "num_tokens": 16115243.0, |
| "step": 6010 |
| }, |
| { |
| "epoch": 0.3781288276121981, |
| "grad_norm": 0.38913822174072266, |
| "learning_rate": 0.0001747963905114838, |
| "loss": 0.91, |
| "mean_token_accuracy": 0.8007095254957676, |
| "num_tokens": 16142434.0, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.3787569485882981, |
| "grad_norm": 0.2852244973182678, |
| "learning_rate": 0.00017475451709482237, |
| "loss": 0.9507, |
| "mean_token_accuracy": 0.7935369953513145, |
| "num_tokens": 16169907.0, |
| "step": 6030 |
| }, |
| { |
| "epoch": 0.3793850695643981, |
| "grad_norm": 0.3308016061782837, |
| "learning_rate": 0.00017471264367816095, |
| "loss": 0.9864, |
| "mean_token_accuracy": 0.7877254385501147, |
| "num_tokens": 16195064.0, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.3800131905404981, |
| "grad_norm": 0.3460164964199066, |
| "learning_rate": 0.0001746707702614995, |
| "loss": 0.9531, |
| "mean_token_accuracy": 0.7923174686729908, |
| "num_tokens": 16221200.0, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.3806413115165981, |
| "grad_norm": 0.30869802832603455, |
| "learning_rate": 0.00017462889684483805, |
| "loss": 0.9619, |
| "mean_token_accuracy": 0.7955431789159775, |
| "num_tokens": 16248508.0, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.38126943249269807, |
| "grad_norm": 0.28829026222229004, |
| "learning_rate": 0.00017458702342817664, |
| "loss": 0.9753, |
| "mean_token_accuracy": 0.7878083620220423, |
| "num_tokens": 16275106.0, |
| "step": 6070 |
| }, |
| { |
| "epoch": 0.3818975534687981, |
| "grad_norm": 0.3400105834007263, |
| "learning_rate": 0.0001745451500115152, |
| "loss": 0.9432, |
| "mean_token_accuracy": 0.7964709993451834, |
| "num_tokens": 16300621.0, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.3825256744448981, |
| "grad_norm": 0.30248478055000305, |
| "learning_rate": 0.00017450327659485377, |
| "loss": 0.9876, |
| "mean_token_accuracy": 0.7857365075498819, |
| "num_tokens": 16327441.0, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.3831537954209981, |
| "grad_norm": 0.3251391053199768, |
| "learning_rate": 0.00017446140317819232, |
| "loss": 0.9896, |
| "mean_token_accuracy": 0.7837534084916115, |
| "num_tokens": 16355426.0, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.3837819163970981, |
| "grad_norm": 0.2840956449508667, |
| "learning_rate": 0.0001744195297615309, |
| "loss": 0.9609, |
| "mean_token_accuracy": 0.795376755297184, |
| "num_tokens": 16381372.0, |
| "step": 6110 |
| }, |
| { |
| "epoch": 0.38441003737319807, |
| "grad_norm": 0.30968624353408813, |
| "learning_rate": 0.00017437765634486946, |
| "loss": 0.9473, |
| "mean_token_accuracy": 0.7971586957573891, |
| "num_tokens": 16407482.0, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.38503815834929805, |
| "grad_norm": 0.29990601539611816, |
| "learning_rate": 0.00017433578292820801, |
| "loss": 0.9728, |
| "mean_token_accuracy": 0.7887383218854666, |
| "num_tokens": 16435286.0, |
| "step": 6130 |
| }, |
| { |
| "epoch": 0.3856662793253981, |
| "grad_norm": 0.3170183300971985, |
| "learning_rate": 0.0001742939095115466, |
| "loss": 0.9532, |
| "mean_token_accuracy": 0.7952543575316667, |
| "num_tokens": 16461263.0, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.3862944003014981, |
| "grad_norm": 0.2933950126171112, |
| "learning_rate": 0.00017425203609488518, |
| "loss": 0.933, |
| "mean_token_accuracy": 0.7964635614305735, |
| "num_tokens": 16487429.0, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.3869225212775981, |
| "grad_norm": 0.386870801448822, |
| "learning_rate": 0.00017421016267822373, |
| "loss": 0.9126, |
| "mean_token_accuracy": 0.8014869604259729, |
| "num_tokens": 16514921.0, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.38755064225369806, |
| "grad_norm": 0.2772822976112366, |
| "learning_rate": 0.0001741682892615623, |
| "loss": 0.9646, |
| "mean_token_accuracy": 0.7937680229544639, |
| "num_tokens": 16541215.0, |
| "step": 6170 |
| }, |
| { |
| "epoch": 0.38817876322979805, |
| "grad_norm": 0.3255600035190582, |
| "learning_rate": 0.0001741264158449009, |
| "loss": 0.9233, |
| "mean_token_accuracy": 0.7992643032222986, |
| "num_tokens": 16568598.0, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.38880688420589804, |
| "grad_norm": 0.30316439270973206, |
| "learning_rate": 0.00017408454242823945, |
| "loss": 0.9584, |
| "mean_token_accuracy": 0.7939885523170233, |
| "num_tokens": 16596087.0, |
| "step": 6190 |
| }, |
| { |
| "epoch": 0.38943500518199803, |
| "grad_norm": 0.339186429977417, |
| "learning_rate": 0.00017404266901157803, |
| "loss": 0.9746, |
| "mean_token_accuracy": 0.7881553754210472, |
| "num_tokens": 16624223.0, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.3900631261580981, |
| "grad_norm": 0.3115599751472473, |
| "learning_rate": 0.00017400079559491658, |
| "loss": 0.9473, |
| "mean_token_accuracy": 0.7972762394696474, |
| "num_tokens": 16651744.0, |
| "step": 6210 |
| }, |
| { |
| "epoch": 0.39069124713419806, |
| "grad_norm": 0.36609265208244324, |
| "learning_rate": 0.00017395892217825514, |
| "loss": 0.9055, |
| "mean_token_accuracy": 0.8033052369952202, |
| "num_tokens": 16678227.0, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.39131936811029805, |
| "grad_norm": 0.38528645038604736, |
| "learning_rate": 0.00017391704876159372, |
| "loss": 0.9419, |
| "mean_token_accuracy": 0.8045696560293436, |
| "num_tokens": 16704268.0, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.39194748908639804, |
| "grad_norm": 0.29467713832855225, |
| "learning_rate": 0.00017387517534493227, |
| "loss": 0.9403, |
| "mean_token_accuracy": 0.7961732547730207, |
| "num_tokens": 16730577.0, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.392575610062498, |
| "grad_norm": 0.3198733329772949, |
| "learning_rate": 0.00017383330192827085, |
| "loss": 0.9486, |
| "mean_token_accuracy": 0.7977675545960665, |
| "num_tokens": 16755742.0, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.393203731038598, |
| "grad_norm": 0.2925213873386383, |
| "learning_rate": 0.0001737914285116094, |
| "loss": 0.949, |
| "mean_token_accuracy": 0.7897519588470459, |
| "num_tokens": 16783547.0, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.39383185201469806, |
| "grad_norm": 0.3132512867450714, |
| "learning_rate": 0.000173749555094948, |
| "loss": 0.9292, |
| "mean_token_accuracy": 0.7962834902107716, |
| "num_tokens": 16810651.0, |
| "step": 6270 |
| }, |
| { |
| "epoch": 0.39445997299079805, |
| "grad_norm": 0.3624895215034485, |
| "learning_rate": 0.00017370768167828654, |
| "loss": 0.9513, |
| "mean_token_accuracy": 0.7966616488993168, |
| "num_tokens": 16837067.0, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.39508809396689804, |
| "grad_norm": 0.389517605304718, |
| "learning_rate": 0.0001736658082616251, |
| "loss": 0.9852, |
| "mean_token_accuracy": 0.7895838055759669, |
| "num_tokens": 16865469.0, |
| "step": 6290 |
| }, |
| { |
| "epoch": 0.395716214942998, |
| "grad_norm": 0.27660834789276123, |
| "learning_rate": 0.00017362393484496368, |
| "loss": 0.9668, |
| "mean_token_accuracy": 0.789311607927084, |
| "num_tokens": 16893654.0, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.396344335919098, |
| "grad_norm": 0.30523520708084106, |
| "learning_rate": 0.00017358206142830226, |
| "loss": 0.9067, |
| "mean_token_accuracy": 0.801710982620716, |
| "num_tokens": 16921161.0, |
| "step": 6310 |
| }, |
| { |
| "epoch": 0.396972456895198, |
| "grad_norm": 0.37683388590812683, |
| "learning_rate": 0.00017354018801164084, |
| "loss": 0.9228, |
| "mean_token_accuracy": 0.7973937816917896, |
| "num_tokens": 16947895.0, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.397600577871298, |
| "grad_norm": 0.31565046310424805, |
| "learning_rate": 0.0001734983145949794, |
| "loss": 0.9568, |
| "mean_token_accuracy": 0.7894732590764761, |
| "num_tokens": 16975421.0, |
| "step": 6330 |
| }, |
| { |
| "epoch": 0.39822869884739803, |
| "grad_norm": 0.29164016246795654, |
| "learning_rate": 0.00017345644117831797, |
| "loss": 0.9618, |
| "mean_token_accuracy": 0.791145333275199, |
| "num_tokens": 17001397.0, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.398856819823498, |
| "grad_norm": 0.27090737223625183, |
| "learning_rate": 0.00017341456776165653, |
| "loss": 0.9091, |
| "mean_token_accuracy": 0.799335828050971, |
| "num_tokens": 17028394.0, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.399484940799598, |
| "grad_norm": 0.32882294058799744, |
| "learning_rate": 0.00017337269434499508, |
| "loss": 0.9809, |
| "mean_token_accuracy": 0.786191276833415, |
| "num_tokens": 17054934.0, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.400113061775698, |
| "grad_norm": 0.2733074128627777, |
| "learning_rate": 0.00017333082092833366, |
| "loss": 0.9421, |
| "mean_token_accuracy": 0.7956891294568778, |
| "num_tokens": 17081735.0, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.400741182751798, |
| "grad_norm": 0.2858097553253174, |
| "learning_rate": 0.00017328894751167222, |
| "loss": 0.922, |
| "mean_token_accuracy": 0.7970656007528305, |
| "num_tokens": 17108919.0, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.401369303727898, |
| "grad_norm": 0.2953729033470154, |
| "learning_rate": 0.0001732470740950108, |
| "loss": 0.9441, |
| "mean_token_accuracy": 0.7959697268903255, |
| "num_tokens": 17135174.0, |
| "step": 6390 |
| }, |
| { |
| "epoch": 0.40199742470399796, |
| "grad_norm": 0.3655385971069336, |
| "learning_rate": 0.00017320520067834935, |
| "loss": 1.0054, |
| "mean_token_accuracy": 0.7828585598617792, |
| "num_tokens": 17161216.0, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.402625545680098, |
| "grad_norm": 0.28704798221588135, |
| "learning_rate": 0.00017316332726168793, |
| "loss": 0.9546, |
| "mean_token_accuracy": 0.795269351825118, |
| "num_tokens": 17186800.0, |
| "step": 6410 |
| }, |
| { |
| "epoch": 0.403253666656198, |
| "grad_norm": 0.36064931750297546, |
| "learning_rate": 0.00017312145384502649, |
| "loss": 0.9412, |
| "mean_token_accuracy": 0.7885518711060285, |
| "num_tokens": 17214001.0, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.403881787632298, |
| "grad_norm": 0.31733861565589905, |
| "learning_rate": 0.00017307958042836504, |
| "loss": 0.9343, |
| "mean_token_accuracy": 0.7966097947210073, |
| "num_tokens": 17240349.0, |
| "step": 6430 |
| }, |
| { |
| "epoch": 0.404509908608398, |
| "grad_norm": 0.2990235388278961, |
| "learning_rate": 0.00017303770701170362, |
| "loss": 0.9641, |
| "mean_token_accuracy": 0.7916103590279817, |
| "num_tokens": 17267668.0, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.40513802958449796, |
| "grad_norm": 0.3221684992313385, |
| "learning_rate": 0.0001729958335950422, |
| "loss": 0.9661, |
| "mean_token_accuracy": 0.7940225251019001, |
| "num_tokens": 17293563.0, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.40576615056059795, |
| "grad_norm": 0.324481338262558, |
| "learning_rate": 0.00017295396017838076, |
| "loss": 0.9481, |
| "mean_token_accuracy": 0.7895933233201504, |
| "num_tokens": 17320263.0, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.406394271536698, |
| "grad_norm": 0.3949001729488373, |
| "learning_rate": 0.00017291208676171934, |
| "loss": 0.8954, |
| "mean_token_accuracy": 0.8022237163037061, |
| "num_tokens": 17347756.0, |
| "step": 6470 |
| }, |
| { |
| "epoch": 0.407022392512798, |
| "grad_norm": 0.3821220397949219, |
| "learning_rate": 0.00017287021334505792, |
| "loss": 0.9416, |
| "mean_token_accuracy": 0.8014149498194456, |
| "num_tokens": 17373740.0, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.40765051348889797, |
| "grad_norm": 0.3420533537864685, |
| "learning_rate": 0.00017282833992839647, |
| "loss": 0.9606, |
| "mean_token_accuracy": 0.7895737990736962, |
| "num_tokens": 17399868.0, |
| "step": 6490 |
| }, |
| { |
| "epoch": 0.40827863446499796, |
| "grad_norm": 0.32357707619667053, |
| "learning_rate": 0.00017278646651173503, |
| "loss": 0.9051, |
| "mean_token_accuracy": 0.8026961565017701, |
| "num_tokens": 17427575.0, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.40890675544109795, |
| "grad_norm": 0.34716370701789856, |
| "learning_rate": 0.0001727445930950736, |
| "loss": 0.94, |
| "mean_token_accuracy": 0.7938724718987942, |
| "num_tokens": 17455384.0, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.40953487641719793, |
| "grad_norm": 0.3508943021297455, |
| "learning_rate": 0.00017270271967841216, |
| "loss": 0.9854, |
| "mean_token_accuracy": 0.7838574007153511, |
| "num_tokens": 17482972.0, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.4101629973932979, |
| "grad_norm": 0.30769476294517517, |
| "learning_rate": 0.00017266084626175074, |
| "loss": 0.9398, |
| "mean_token_accuracy": 0.7959376715123654, |
| "num_tokens": 17510173.0, |
| "step": 6530 |
| }, |
| { |
| "epoch": 0.41079111836939797, |
| "grad_norm": 0.3114470839500427, |
| "learning_rate": 0.0001726189728450893, |
| "loss": 0.9722, |
| "mean_token_accuracy": 0.7882124871015549, |
| "num_tokens": 17537102.0, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.41141923934549796, |
| "grad_norm": 0.4177934527397156, |
| "learning_rate": 0.00017257709942842788, |
| "loss": 0.9756, |
| "mean_token_accuracy": 0.7910198099911213, |
| "num_tokens": 17563061.0, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.41204736032159794, |
| "grad_norm": 0.35182762145996094, |
| "learning_rate": 0.00017253522601176643, |
| "loss": 0.9422, |
| "mean_token_accuracy": 0.7991392657160759, |
| "num_tokens": 17589058.0, |
| "step": 6560 |
| }, |
| { |
| "epoch": 0.41267548129769793, |
| "grad_norm": 0.3126845061779022, |
| "learning_rate": 0.00017249335259510499, |
| "loss": 0.9305, |
| "mean_token_accuracy": 0.7946061249822378, |
| "num_tokens": 17615789.0, |
| "step": 6570 |
| }, |
| { |
| "epoch": 0.4133036022737979, |
| "grad_norm": 0.2823250889778137, |
| "learning_rate": 0.00017245147917844357, |
| "loss": 0.9141, |
| "mean_token_accuracy": 0.7987863086163998, |
| "num_tokens": 17643862.0, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.4139317232498979, |
| "grad_norm": 0.3772083520889282, |
| "learning_rate": 0.00017240960576178212, |
| "loss": 0.914, |
| "mean_token_accuracy": 0.799197156727314, |
| "num_tokens": 17671357.0, |
| "step": 6590 |
| }, |
| { |
| "epoch": 0.41455984422599795, |
| "grad_norm": 0.34354081749916077, |
| "learning_rate": 0.0001723677323451207, |
| "loss": 0.9411, |
| "mean_token_accuracy": 0.798019240796566, |
| "num_tokens": 17699385.0, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.41518796520209794, |
| "grad_norm": 0.39620301127433777, |
| "learning_rate": 0.00017232585892845928, |
| "loss": 0.8923, |
| "mean_token_accuracy": 0.8041703008115292, |
| "num_tokens": 17724468.0, |
| "step": 6610 |
| }, |
| { |
| "epoch": 0.41581608617819793, |
| "grad_norm": 0.33372464776039124, |
| "learning_rate": 0.00017228398551179786, |
| "loss": 0.9286, |
| "mean_token_accuracy": 0.7959783185273409, |
| "num_tokens": 17749763.0, |
| "step": 6620 |
| }, |
| { |
| "epoch": 0.4164442071542979, |
| "grad_norm": 0.3709011971950531, |
| "learning_rate": 0.00017224211209513642, |
| "loss": 0.9554, |
| "mean_token_accuracy": 0.7890860054641962, |
| "num_tokens": 17777337.0, |
| "step": 6630 |
| }, |
| { |
| "epoch": 0.4170723281303979, |
| "grad_norm": 0.32917919754981995, |
| "learning_rate": 0.00017220023867847497, |
| "loss": 0.9017, |
| "mean_token_accuracy": 0.8022000085562467, |
| "num_tokens": 17805938.0, |
| "step": 6640 |
| }, |
| { |
| "epoch": 0.4177004491064979, |
| "grad_norm": 0.29168930649757385, |
| "learning_rate": 0.00017215836526181355, |
| "loss": 0.9337, |
| "mean_token_accuracy": 0.7962145168334246, |
| "num_tokens": 17833769.0, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.4183285700825979, |
| "grad_norm": 0.3907414376735687, |
| "learning_rate": 0.0001721164918451521, |
| "loss": 0.9565, |
| "mean_token_accuracy": 0.7868779297918082, |
| "num_tokens": 17861061.0, |
| "step": 6660 |
| }, |
| { |
| "epoch": 0.4189566910586979, |
| "grad_norm": 0.3282051980495453, |
| "learning_rate": 0.0001720746184284907, |
| "loss": 0.965, |
| "mean_token_accuracy": 0.7910456649959088, |
| "num_tokens": 17887366.0, |
| "step": 6670 |
| }, |
| { |
| "epoch": 0.4195848120347979, |
| "grad_norm": 0.29146093130111694, |
| "learning_rate": 0.00017203274501182924, |
| "loss": 0.8854, |
| "mean_token_accuracy": 0.8069033030420542, |
| "num_tokens": 17913922.0, |
| "step": 6680 |
| }, |
| { |
| "epoch": 0.4202129330108979, |
| "grad_norm": 0.3952869772911072, |
| "learning_rate": 0.00017199087159516782, |
| "loss": 0.9253, |
| "mean_token_accuracy": 0.7987654652446509, |
| "num_tokens": 17939912.0, |
| "step": 6690 |
| }, |
| { |
| "epoch": 0.4208410539869979, |
| "grad_norm": 0.3404087424278259, |
| "learning_rate": 0.00017194899817850638, |
| "loss": 0.9574, |
| "mean_token_accuracy": 0.7907421611249447, |
| "num_tokens": 17967318.0, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.4214691749630979, |
| "grad_norm": 0.3051299750804901, |
| "learning_rate": 0.00017190712476184496, |
| "loss": 0.9063, |
| "mean_token_accuracy": 0.803283429145813, |
| "num_tokens": 17994300.0, |
| "step": 6710 |
| }, |
| { |
| "epoch": 0.42209729593919787, |
| "grad_norm": 0.3612078130245209, |
| "learning_rate": 0.0001718652513451835, |
| "loss": 0.9431, |
| "mean_token_accuracy": 0.7964126400649547, |
| "num_tokens": 18020402.0, |
| "step": 6720 |
| }, |
| { |
| "epoch": 0.4227254169152979, |
| "grad_norm": 0.31546175479888916, |
| "learning_rate": 0.00017182337792852207, |
| "loss": 0.9437, |
| "mean_token_accuracy": 0.7950374394655227, |
| "num_tokens": 18047710.0, |
| "step": 6730 |
| }, |
| { |
| "epoch": 0.4233535378913979, |
| "grad_norm": 0.35042259097099304, |
| "learning_rate": 0.00017178150451186065, |
| "loss": 0.9186, |
| "mean_token_accuracy": 0.8037286669015884, |
| "num_tokens": 18074038.0, |
| "step": 6740 |
| }, |
| { |
| "epoch": 0.4239816588674979, |
| "grad_norm": 0.2913114130496979, |
| "learning_rate": 0.00017173963109519923, |
| "loss": 0.9446, |
| "mean_token_accuracy": 0.7999244224280119, |
| "num_tokens": 18100392.0, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.4246097798435979, |
| "grad_norm": 0.5584277510643005, |
| "learning_rate": 0.0001716977576785378, |
| "loss": 0.9768, |
| "mean_token_accuracy": 0.791194049268961, |
| "num_tokens": 18127168.0, |
| "step": 6760 |
| }, |
| { |
| "epoch": 0.42523790081969787, |
| "grad_norm": 0.31865277886390686, |
| "learning_rate": 0.00017165588426187636, |
| "loss": 0.9285, |
| "mean_token_accuracy": 0.7973918996751308, |
| "num_tokens": 18153582.0, |
| "step": 6770 |
| }, |
| { |
| "epoch": 0.42586602179579786, |
| "grad_norm": 0.3605208992958069, |
| "learning_rate": 0.00017161401084521494, |
| "loss": 0.9184, |
| "mean_token_accuracy": 0.8000706914812327, |
| "num_tokens": 18179784.0, |
| "step": 6780 |
| }, |
| { |
| "epoch": 0.42649414277189784, |
| "grad_norm": 0.3389859199523926, |
| "learning_rate": 0.0001715721374285535, |
| "loss": 0.9829, |
| "mean_token_accuracy": 0.7899733152240515, |
| "num_tokens": 18206151.0, |
| "step": 6790 |
| }, |
| { |
| "epoch": 0.4271222637479979, |
| "grad_norm": 0.32781344652175903, |
| "learning_rate": 0.00017153026401189205, |
| "loss": 0.9596, |
| "mean_token_accuracy": 0.7900414571166039, |
| "num_tokens": 18234354.0, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.4277503847240979, |
| "grad_norm": 0.2912145256996155, |
| "learning_rate": 0.00017148839059523063, |
| "loss": 0.9257, |
| "mean_token_accuracy": 0.8005597397685051, |
| "num_tokens": 18261836.0, |
| "step": 6810 |
| }, |
| { |
| "epoch": 0.42837850570019786, |
| "grad_norm": 0.34917712211608887, |
| "learning_rate": 0.0001714465171785692, |
| "loss": 0.9402, |
| "mean_token_accuracy": 0.7959890987724065, |
| "num_tokens": 18288129.0, |
| "step": 6820 |
| }, |
| { |
| "epoch": 0.42900662667629785, |
| "grad_norm": 0.33733370900154114, |
| "learning_rate": 0.00017140464376190777, |
| "loss": 1.0073, |
| "mean_token_accuracy": 0.787006713822484, |
| "num_tokens": 18315159.0, |
| "step": 6830 |
| }, |
| { |
| "epoch": 0.42963474765239784, |
| "grad_norm": 0.3581954538822174, |
| "learning_rate": 0.00017136277034524632, |
| "loss": 0.9486, |
| "mean_token_accuracy": 0.794409342855215, |
| "num_tokens": 18342955.0, |
| "step": 6840 |
| }, |
| { |
| "epoch": 0.43026286862849783, |
| "grad_norm": 0.3676760792732239, |
| "learning_rate": 0.0001713208969285849, |
| "loss": 0.9483, |
| "mean_token_accuracy": 0.8001930240541697, |
| "num_tokens": 18368669.0, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.4308909896045978, |
| "grad_norm": 0.3197173774242401, |
| "learning_rate": 0.00017127902351192346, |
| "loss": 0.9116, |
| "mean_token_accuracy": 0.7953953389078379, |
| "num_tokens": 18397666.0, |
| "step": 6860 |
| }, |
| { |
| "epoch": 0.43151911058069786, |
| "grad_norm": 0.33263349533081055, |
| "learning_rate": 0.000171237150095262, |
| "loss": 0.9441, |
| "mean_token_accuracy": 0.7979573253542185, |
| "num_tokens": 18421950.0, |
| "step": 6870 |
| }, |
| { |
| "epoch": 0.43214723155679785, |
| "grad_norm": 0.38979992270469666, |
| "learning_rate": 0.0001711952766786006, |
| "loss": 0.9049, |
| "mean_token_accuracy": 0.8038561142981052, |
| "num_tokens": 18448253.0, |
| "step": 6880 |
| }, |
| { |
| "epoch": 0.43277535253289784, |
| "grad_norm": 0.3203209340572357, |
| "learning_rate": 0.00017115340326193915, |
| "loss": 0.9158, |
| "mean_token_accuracy": 0.8013805273920298, |
| "num_tokens": 18475417.0, |
| "step": 6890 |
| }, |
| { |
| "epoch": 0.4334034735089978, |
| "grad_norm": 0.3154747486114502, |
| "learning_rate": 0.00017111152984527773, |
| "loss": 0.9082, |
| "mean_token_accuracy": 0.8050479885190726, |
| "num_tokens": 18501073.0, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.4340315944850978, |
| "grad_norm": 0.30107998847961426, |
| "learning_rate": 0.0001710696564286163, |
| "loss": 0.9676, |
| "mean_token_accuracy": 0.791720773279667, |
| "num_tokens": 18527256.0, |
| "step": 6910 |
| }, |
| { |
| "epoch": 0.4346597154611978, |
| "grad_norm": 0.297690749168396, |
| "learning_rate": 0.0001710277830119549, |
| "loss": 0.9109, |
| "mean_token_accuracy": 0.8020948182791472, |
| "num_tokens": 18554278.0, |
| "step": 6920 |
| }, |
| { |
| "epoch": 0.43528783643729785, |
| "grad_norm": 0.33920198678970337, |
| "learning_rate": 0.00017098590959529344, |
| "loss": 0.9209, |
| "mean_token_accuracy": 0.8000293109565974, |
| "num_tokens": 18581647.0, |
| "step": 6930 |
| }, |
| { |
| "epoch": 0.43591595741339784, |
| "grad_norm": 0.3154403865337372, |
| "learning_rate": 0.000170944036178632, |
| "loss": 0.9408, |
| "mean_token_accuracy": 0.7958235062658787, |
| "num_tokens": 18608804.0, |
| "step": 6940 |
| }, |
| { |
| "epoch": 0.4365440783894978, |
| "grad_norm": 0.3859824538230896, |
| "learning_rate": 0.00017090216276197058, |
| "loss": 0.9133, |
| "mean_token_accuracy": 0.80115054436028, |
| "num_tokens": 18636323.0, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.4371721993655978, |
| "grad_norm": 0.3137420415878296, |
| "learning_rate": 0.00017086028934530913, |
| "loss": 0.8943, |
| "mean_token_accuracy": 0.8037898227572441, |
| "num_tokens": 18662828.0, |
| "step": 6960 |
| }, |
| { |
| "epoch": 0.4378003203416978, |
| "grad_norm": 0.37590697407722473, |
| "learning_rate": 0.00017081841592864771, |
| "loss": 0.9348, |
| "mean_token_accuracy": 0.7956229455769062, |
| "num_tokens": 18688409.0, |
| "step": 6970 |
| }, |
| { |
| "epoch": 0.4384284413177978, |
| "grad_norm": 0.38642698526382446, |
| "learning_rate": 0.00017077654251198627, |
| "loss": 0.9773, |
| "mean_token_accuracy": 0.7884405389428139, |
| "num_tokens": 18715579.0, |
| "step": 6980 |
| }, |
| { |
| "epoch": 0.4390565622938978, |
| "grad_norm": 0.38973766565322876, |
| "learning_rate": 0.00017073466909532485, |
| "loss": 0.9588, |
| "mean_token_accuracy": 0.7889534655958415, |
| "num_tokens": 18743302.0, |
| "step": 6990 |
| }, |
| { |
| "epoch": 0.4396846832699978, |
| "grad_norm": 0.3270561695098877, |
| "learning_rate": 0.0001706927956786634, |
| "loss": 0.9433, |
| "mean_token_accuracy": 0.7975011304020881, |
| "num_tokens": 18770735.0, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.4403128042460978, |
| "grad_norm": 0.332520455121994, |
| "learning_rate": 0.00017065092226200196, |
| "loss": 0.9271, |
| "mean_token_accuracy": 0.8032657954841852, |
| "num_tokens": 18795923.0, |
| "step": 7010 |
| }, |
| { |
| "epoch": 0.4409409252221978, |
| "grad_norm": 0.3008480668067932, |
| "learning_rate": 0.00017060904884534054, |
| "loss": 0.9778, |
| "mean_token_accuracy": 0.789083057269454, |
| "num_tokens": 18823499.0, |
| "step": 7020 |
| }, |
| { |
| "epoch": 0.4415690461982978, |
| "grad_norm": 0.34079182147979736, |
| "learning_rate": 0.0001705671754286791, |
| "loss": 0.9217, |
| "mean_token_accuracy": 0.80135333314538, |
| "num_tokens": 18851042.0, |
| "step": 7030 |
| }, |
| { |
| "epoch": 0.4421971671743978, |
| "grad_norm": 0.375567764043808, |
| "learning_rate": 0.00017052530201201767, |
| "loss": 0.951, |
| "mean_token_accuracy": 0.7928535658866167, |
| "num_tokens": 18877749.0, |
| "step": 7040 |
| }, |
| { |
| "epoch": 0.44282528815049776, |
| "grad_norm": 0.33165213465690613, |
| "learning_rate": 0.00017048342859535625, |
| "loss": 0.9332, |
| "mean_token_accuracy": 0.7962594710290432, |
| "num_tokens": 18903877.0, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.4434534091265978, |
| "grad_norm": 0.3591445982456207, |
| "learning_rate": 0.00017044155517869484, |
| "loss": 0.9415, |
| "mean_token_accuracy": 0.7983079668134451, |
| "num_tokens": 18928905.0, |
| "step": 7060 |
| }, |
| { |
| "epoch": 0.4440815301026978, |
| "grad_norm": 0.2903635501861572, |
| "learning_rate": 0.0001703996817620334, |
| "loss": 0.9026, |
| "mean_token_accuracy": 0.8050003577023744, |
| "num_tokens": 18955696.0, |
| "step": 7070 |
| }, |
| { |
| "epoch": 0.4447096510787978, |
| "grad_norm": 0.3327687382698059, |
| "learning_rate": 0.00017035780834537194, |
| "loss": 0.9184, |
| "mean_token_accuracy": 0.7981751836836338, |
| "num_tokens": 18983224.0, |
| "step": 7080 |
| }, |
| { |
| "epoch": 0.4453377720548978, |
| "grad_norm": 0.32097476720809937, |
| "learning_rate": 0.00017031593492871052, |
| "loss": 0.8946, |
| "mean_token_accuracy": 0.8068317249417305, |
| "num_tokens": 19010405.0, |
| "step": 7090 |
| }, |
| { |
| "epoch": 0.44596589303099776, |
| "grad_norm": 0.3469720780849457, |
| "learning_rate": 0.00017027406151204908, |
| "loss": 0.9037, |
| "mean_token_accuracy": 0.8039239943027496, |
| "num_tokens": 19037152.0, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.44659401400709775, |
| "grad_norm": 0.3317658007144928, |
| "learning_rate": 0.00017023218809538766, |
| "loss": 0.9582, |
| "mean_token_accuracy": 0.7961928177624941, |
| "num_tokens": 19063239.0, |
| "step": 7110 |
| }, |
| { |
| "epoch": 0.44722213498319774, |
| "grad_norm": 0.42860665917396545, |
| "learning_rate": 0.00017019031467872621, |
| "loss": 0.9076, |
| "mean_token_accuracy": 0.8047629177570343, |
| "num_tokens": 19088858.0, |
| "step": 7120 |
| }, |
| { |
| "epoch": 0.4478502559592978, |
| "grad_norm": 0.3589876592159271, |
| "learning_rate": 0.0001701484412620648, |
| "loss": 0.9166, |
| "mean_token_accuracy": 0.79860061109066, |
| "num_tokens": 19116820.0, |
| "step": 7130 |
| }, |
| { |
| "epoch": 0.44847837693539777, |
| "grad_norm": 0.3044925630092621, |
| "learning_rate": 0.00017010656784540335, |
| "loss": 0.9565, |
| "mean_token_accuracy": 0.7914801269769669, |
| "num_tokens": 19143530.0, |
| "step": 7140 |
| }, |
| { |
| "epoch": 0.44910649791149776, |
| "grad_norm": 0.3823811709880829, |
| "learning_rate": 0.0001700646944287419, |
| "loss": 0.966, |
| "mean_token_accuracy": 0.793233947083354, |
| "num_tokens": 19169692.0, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.44973461888759775, |
| "grad_norm": 0.33698752522468567, |
| "learning_rate": 0.00017002282101208048, |
| "loss": 0.9343, |
| "mean_token_accuracy": 0.7968378983438015, |
| "num_tokens": 19195206.0, |
| "step": 7160 |
| }, |
| { |
| "epoch": 0.45036273986369774, |
| "grad_norm": 0.28442952036857605, |
| "learning_rate": 0.00016998094759541904, |
| "loss": 0.9625, |
| "mean_token_accuracy": 0.7924941457808018, |
| "num_tokens": 19221574.0, |
| "step": 7170 |
| }, |
| { |
| "epoch": 0.4509908608397977, |
| "grad_norm": 0.29551246762275696, |
| "learning_rate": 0.00016993907417875762, |
| "loss": 0.9383, |
| "mean_token_accuracy": 0.7974245421588421, |
| "num_tokens": 19249889.0, |
| "step": 7180 |
| }, |
| { |
| "epoch": 0.45161898181589777, |
| "grad_norm": 0.38739240169525146, |
| "learning_rate": 0.0001698972007620962, |
| "loss": 0.8745, |
| "mean_token_accuracy": 0.8079649094492197, |
| "num_tokens": 19277540.0, |
| "step": 7190 |
| }, |
| { |
| "epoch": 0.45224710279199776, |
| "grad_norm": 0.3009992241859436, |
| "learning_rate": 0.00016985532734543475, |
| "loss": 0.9334, |
| "mean_token_accuracy": 0.7996255524456501, |
| "num_tokens": 19304330.0, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.45287522376809775, |
| "grad_norm": 0.3752877116203308, |
| "learning_rate": 0.00016981345392877334, |
| "loss": 0.9596, |
| "mean_token_accuracy": 0.7906508490443229, |
| "num_tokens": 19331369.0, |
| "step": 7210 |
| }, |
| { |
| "epoch": 0.45350334474419773, |
| "grad_norm": 0.3402617275714874, |
| "learning_rate": 0.0001697715805121119, |
| "loss": 0.9406, |
| "mean_token_accuracy": 0.7922864690423012, |
| "num_tokens": 19358691.0, |
| "step": 7220 |
| }, |
| { |
| "epoch": 0.4541314657202977, |
| "grad_norm": 0.3796190023422241, |
| "learning_rate": 0.00016972970709545047, |
| "loss": 0.9729, |
| "mean_token_accuracy": 0.7868019372224808, |
| "num_tokens": 19384598.0, |
| "step": 7230 |
| }, |
| { |
| "epoch": 0.4547595866963977, |
| "grad_norm": 0.2936389148235321, |
| "learning_rate": 0.00016968783367878902, |
| "loss": 0.9299, |
| "mean_token_accuracy": 0.7991322789341211, |
| "num_tokens": 19411913.0, |
| "step": 7240 |
| }, |
| { |
| "epoch": 0.4553877076724977, |
| "grad_norm": 0.3057098984718323, |
| "learning_rate": 0.0001696459602621276, |
| "loss": 0.9309, |
| "mean_token_accuracy": 0.793816527351737, |
| "num_tokens": 19440434.0, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.45601582864859774, |
| "grad_norm": 0.33800917863845825, |
| "learning_rate": 0.00016960408684546616, |
| "loss": 0.9347, |
| "mean_token_accuracy": 0.7935182463377715, |
| "num_tokens": 19468211.0, |
| "step": 7260 |
| }, |
| { |
| "epoch": 0.45664394962469773, |
| "grad_norm": 0.3345368802547455, |
| "learning_rate": 0.00016956221342880474, |
| "loss": 0.9564, |
| "mean_token_accuracy": 0.7953835293650627, |
| "num_tokens": 19494223.0, |
| "step": 7270 |
| }, |
| { |
| "epoch": 0.4572720706007977, |
| "grad_norm": 0.34066635370254517, |
| "learning_rate": 0.0001695203400121433, |
| "loss": 0.9516, |
| "mean_token_accuracy": 0.7983359940350055, |
| "num_tokens": 19520687.0, |
| "step": 7280 |
| }, |
| { |
| "epoch": 0.4579001915768977, |
| "grad_norm": 0.3939811885356903, |
| "learning_rate": 0.00016947846659548188, |
| "loss": 0.9402, |
| "mean_token_accuracy": 0.7942854754626751, |
| "num_tokens": 19547875.0, |
| "step": 7290 |
| }, |
| { |
| "epoch": 0.4585283125529977, |
| "grad_norm": 0.30416762828826904, |
| "learning_rate": 0.00016943659317882043, |
| "loss": 0.9776, |
| "mean_token_accuracy": 0.788750433549285, |
| "num_tokens": 19575021.0, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.4591564335290977, |
| "grad_norm": 0.2712084650993347, |
| "learning_rate": 0.00016939471976215898, |
| "loss": 0.9429, |
| "mean_token_accuracy": 0.7919867537915707, |
| "num_tokens": 19604046.0, |
| "step": 7310 |
| }, |
| { |
| "epoch": 0.4597845545051977, |
| "grad_norm": 0.31856003403663635, |
| "learning_rate": 0.00016935284634549756, |
| "loss": 0.9429, |
| "mean_token_accuracy": 0.7997392650693655, |
| "num_tokens": 19630841.0, |
| "step": 7320 |
| }, |
| { |
| "epoch": 0.4604126754812977, |
| "grad_norm": 0.2791038155555725, |
| "learning_rate": 0.00016931097292883612, |
| "loss": 0.9348, |
| "mean_token_accuracy": 0.7959219090640545, |
| "num_tokens": 19657308.0, |
| "step": 7330 |
| }, |
| { |
| "epoch": 0.4610407964573977, |
| "grad_norm": 0.3258204460144043, |
| "learning_rate": 0.0001692690995121747, |
| "loss": 0.9696, |
| "mean_token_accuracy": 0.7872245352715254, |
| "num_tokens": 19685937.0, |
| "step": 7340 |
| }, |
| { |
| "epoch": 0.4616689174334977, |
| "grad_norm": 0.32432809472084045, |
| "learning_rate": 0.00016922722609551328, |
| "loss": 0.9432, |
| "mean_token_accuracy": 0.7930789031088352, |
| "num_tokens": 19711611.0, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.4622970384095977, |
| "grad_norm": 0.31211572885513306, |
| "learning_rate": 0.00016918535267885186, |
| "loss": 0.9426, |
| "mean_token_accuracy": 0.7946818351745606, |
| "num_tokens": 19738162.0, |
| "step": 7360 |
| }, |
| { |
| "epoch": 0.46292515938569767, |
| "grad_norm": 0.35430920124053955, |
| "learning_rate": 0.00016914347926219042, |
| "loss": 0.9485, |
| "mean_token_accuracy": 0.7976432036608457, |
| "num_tokens": 19763931.0, |
| "step": 7370 |
| }, |
| { |
| "epoch": 0.46355328036179766, |
| "grad_norm": 0.2996009588241577, |
| "learning_rate": 0.00016910160584552897, |
| "loss": 0.9209, |
| "mean_token_accuracy": 0.7984352611005306, |
| "num_tokens": 19790663.0, |
| "step": 7380 |
| }, |
| { |
| "epoch": 0.4641814013378977, |
| "grad_norm": 0.3618237376213074, |
| "learning_rate": 0.00016905973242886755, |
| "loss": 0.9762, |
| "mean_token_accuracy": 0.7867707304656506, |
| "num_tokens": 19818336.0, |
| "step": 7390 |
| }, |
| { |
| "epoch": 0.4648095223139977, |
| "grad_norm": 0.3506768047809601, |
| "learning_rate": 0.0001690178590122061, |
| "loss": 0.9157, |
| "mean_token_accuracy": 0.8001652296632529, |
| "num_tokens": 19844561.0, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.4654376432900977, |
| "grad_norm": 0.31607791781425476, |
| "learning_rate": 0.00016897598559554469, |
| "loss": 0.9795, |
| "mean_token_accuracy": 0.7913108296692372, |
| "num_tokens": 19871050.0, |
| "step": 7410 |
| }, |
| { |
| "epoch": 0.46606576426619767, |
| "grad_norm": 0.36683353781700134, |
| "learning_rate": 0.00016893411217888324, |
| "loss": 0.9393, |
| "mean_token_accuracy": 0.7971708361059427, |
| "num_tokens": 19897758.0, |
| "step": 7420 |
| }, |
| { |
| "epoch": 0.46669388524229766, |
| "grad_norm": 0.3240242898464203, |
| "learning_rate": 0.00016889223876222182, |
| "loss": 0.9444, |
| "mean_token_accuracy": 0.7941307682543993, |
| "num_tokens": 19926668.0, |
| "step": 7430 |
| }, |
| { |
| "epoch": 0.46732200621839765, |
| "grad_norm": 0.2812100648880005, |
| "learning_rate": 0.00016885036534556038, |
| "loss": 1.0075, |
| "mean_token_accuracy": 0.783855975791812, |
| "num_tokens": 19955199.0, |
| "step": 7440 |
| }, |
| { |
| "epoch": 0.46795012719449763, |
| "grad_norm": 0.32872602343559265, |
| "learning_rate": 0.00016880849192889893, |
| "loss": 0.9386, |
| "mean_token_accuracy": 0.794111205264926, |
| "num_tokens": 19984176.0, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.4685782481705977, |
| "grad_norm": 0.35838210582733154, |
| "learning_rate": 0.0001687666185122375, |
| "loss": 0.9102, |
| "mean_token_accuracy": 0.8033748425543308, |
| "num_tokens": 20010815.0, |
| "step": 7460 |
| }, |
| { |
| "epoch": 0.46920636914669767, |
| "grad_norm": 0.30107223987579346, |
| "learning_rate": 0.00016872474509557606, |
| "loss": 0.9703, |
| "mean_token_accuracy": 0.7895241472870111, |
| "num_tokens": 20036032.0, |
| "step": 7470 |
| }, |
| { |
| "epoch": 0.46983449012279765, |
| "grad_norm": 0.3146842420101166, |
| "learning_rate": 0.00016868287167891465, |
| "loss": 0.9406, |
| "mean_token_accuracy": 0.792415551096201, |
| "num_tokens": 20064166.0, |
| "step": 7480 |
| }, |
| { |
| "epoch": 0.47046261109889764, |
| "grad_norm": 0.4442558288574219, |
| "learning_rate": 0.00016864099826225323, |
| "loss": 0.9504, |
| "mean_token_accuracy": 0.7952435094863176, |
| "num_tokens": 20090019.0, |
| "step": 7490 |
| }, |
| { |
| "epoch": 0.47109073207499763, |
| "grad_norm": 0.34173983335494995, |
| "learning_rate": 0.00016859912484559178, |
| "loss": 0.9739, |
| "mean_token_accuracy": 0.7935862522572279, |
| "num_tokens": 20116096.0, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.4717188530510976, |
| "grad_norm": 0.3807821273803711, |
| "learning_rate": 0.00016855725142893036, |
| "loss": 0.9247, |
| "mean_token_accuracy": 0.7981947991997004, |
| "num_tokens": 20141974.0, |
| "step": 7510 |
| }, |
| { |
| "epoch": 0.47234697402719766, |
| "grad_norm": 0.3637494146823883, |
| "learning_rate": 0.00016851537801226892, |
| "loss": 0.9383, |
| "mean_token_accuracy": 0.7955837201327085, |
| "num_tokens": 20168582.0, |
| "step": 7520 |
| }, |
| { |
| "epoch": 0.47297509500329765, |
| "grad_norm": 0.35919925570487976, |
| "learning_rate": 0.0001684735045956075, |
| "loss": 0.946, |
| "mean_token_accuracy": 0.7967754255980253, |
| "num_tokens": 20195729.0, |
| "step": 7530 |
| }, |
| { |
| "epoch": 0.47360321597939764, |
| "grad_norm": 0.4207704961299896, |
| "learning_rate": 0.00016843163117894605, |
| "loss": 0.9348, |
| "mean_token_accuracy": 0.8010726600885392, |
| "num_tokens": 20223094.0, |
| "step": 7540 |
| }, |
| { |
| "epoch": 0.47423133695549763, |
| "grad_norm": 0.3021661937236786, |
| "learning_rate": 0.00016838975776228463, |
| "loss": 0.9393, |
| "mean_token_accuracy": 0.7987467546015978, |
| "num_tokens": 20249412.0, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.4748594579315976, |
| "grad_norm": 0.29512059688568115, |
| "learning_rate": 0.00016834788434562319, |
| "loss": 0.9187, |
| "mean_token_accuracy": 0.8019442018121481, |
| "num_tokens": 20275957.0, |
| "step": 7560 |
| }, |
| { |
| "epoch": 0.4754875789076976, |
| "grad_norm": 0.40679019689559937, |
| "learning_rate": 0.00016830601092896177, |
| "loss": 0.9228, |
| "mean_token_accuracy": 0.7991116803139449, |
| "num_tokens": 20302470.0, |
| "step": 7570 |
| }, |
| { |
| "epoch": 0.4761156998837976, |
| "grad_norm": 0.3159092664718628, |
| "learning_rate": 0.00016826413751230032, |
| "loss": 0.9214, |
| "mean_token_accuracy": 0.7994810316711665, |
| "num_tokens": 20328996.0, |
| "step": 7580 |
| }, |
| { |
| "epoch": 0.47674382085989764, |
| "grad_norm": 0.41657283902168274, |
| "learning_rate": 0.00016822226409563887, |
| "loss": 0.8581, |
| "mean_token_accuracy": 0.8116535749286413, |
| "num_tokens": 20357442.0, |
| "step": 7590 |
| }, |
| { |
| "epoch": 0.4773719418359976, |
| "grad_norm": 0.420977383852005, |
| "learning_rate": 0.00016818039067897746, |
| "loss": 0.9449, |
| "mean_token_accuracy": 0.7980388529598713, |
| "num_tokens": 20384687.0, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.4780000628120976, |
| "grad_norm": 0.3933321237564087, |
| "learning_rate": 0.000168138517262316, |
| "loss": 0.9567, |
| "mean_token_accuracy": 0.7897981021553278, |
| "num_tokens": 20410938.0, |
| "step": 7610 |
| }, |
| { |
| "epoch": 0.4786281837881976, |
| "grad_norm": 0.3149840831756592, |
| "learning_rate": 0.0001680966438456546, |
| "loss": 0.9434, |
| "mean_token_accuracy": 0.7927991054952145, |
| "num_tokens": 20438513.0, |
| "step": 7620 |
| }, |
| { |
| "epoch": 0.4792563047642976, |
| "grad_norm": 0.334378182888031, |
| "learning_rate": 0.00016805477042899315, |
| "loss": 0.8846, |
| "mean_token_accuracy": 0.8075113136321306, |
| "num_tokens": 20464813.0, |
| "step": 7630 |
| }, |
| { |
| "epoch": 0.4798844257403976, |
| "grad_norm": 0.3132246434688568, |
| "learning_rate": 0.00016801289701233173, |
| "loss": 0.9215, |
| "mean_token_accuracy": 0.7987807631492615, |
| "num_tokens": 20490998.0, |
| "step": 7640 |
| }, |
| { |
| "epoch": 0.4805125467164976, |
| "grad_norm": 0.32393956184387207, |
| "learning_rate": 0.0001679710235956703, |
| "loss": 0.9562, |
| "mean_token_accuracy": 0.7920228894799948, |
| "num_tokens": 20518297.0, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.4811406676925976, |
| "grad_norm": 0.3257106840610504, |
| "learning_rate": 0.00016792915017900886, |
| "loss": 0.9873, |
| "mean_token_accuracy": 0.7902991570532322, |
| "num_tokens": 20545200.0, |
| "step": 7660 |
| }, |
| { |
| "epoch": 0.4817687886686976, |
| "grad_norm": 0.3050191402435303, |
| "learning_rate": 0.00016788727676234744, |
| "loss": 0.9315, |
| "mean_token_accuracy": 0.7982578534632921, |
| "num_tokens": 20570896.0, |
| "step": 7670 |
| }, |
| { |
| "epoch": 0.4823969096447976, |
| "grad_norm": 0.3876621723175049, |
| "learning_rate": 0.000167845403345686, |
| "loss": 0.9546, |
| "mean_token_accuracy": 0.8005735255777836, |
| "num_tokens": 20598172.0, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.4830250306208976, |
| "grad_norm": 0.3658469319343567, |
| "learning_rate": 0.00016780352992902458, |
| "loss": 0.9461, |
| "mean_token_accuracy": 0.7934409212321043, |
| "num_tokens": 20625326.0, |
| "step": 7690 |
| }, |
| { |
| "epoch": 0.48365315159699757, |
| "grad_norm": 0.29799938201904297, |
| "learning_rate": 0.00016776165651236313, |
| "loss": 0.915, |
| "mean_token_accuracy": 0.8017003744840622, |
| "num_tokens": 20652620.0, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.48428127257309755, |
| "grad_norm": 0.34464171528816223, |
| "learning_rate": 0.0001677197830957017, |
| "loss": 0.9907, |
| "mean_token_accuracy": 0.7844714995473623, |
| "num_tokens": 20679476.0, |
| "step": 7710 |
| }, |
| { |
| "epoch": 0.4849093935491976, |
| "grad_norm": 0.2975150942802429, |
| "learning_rate": 0.00016767790967904027, |
| "loss": 0.8977, |
| "mean_token_accuracy": 0.799476170167327, |
| "num_tokens": 20707100.0, |
| "step": 7720 |
| }, |
| { |
| "epoch": 0.4855375145252976, |
| "grad_norm": 0.4127698838710785, |
| "learning_rate": 0.00016763603626237882, |
| "loss": 0.8805, |
| "mean_token_accuracy": 0.8022565051913262, |
| "num_tokens": 20734538.0, |
| "step": 7730 |
| }, |
| { |
| "epoch": 0.4861656355013976, |
| "grad_norm": 0.2913120687007904, |
| "learning_rate": 0.0001675941628457174, |
| "loss": 0.9268, |
| "mean_token_accuracy": 0.7975892823189497, |
| "num_tokens": 20760647.0, |
| "step": 7740 |
| }, |
| { |
| "epoch": 0.48679375647749756, |
| "grad_norm": 0.37485092878341675, |
| "learning_rate": 0.00016755228942905596, |
| "loss": 0.914, |
| "mean_token_accuracy": 0.7988491103053093, |
| "num_tokens": 20787101.0, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.48742187745359755, |
| "grad_norm": 0.3177768886089325, |
| "learning_rate": 0.00016751041601239454, |
| "loss": 0.9609, |
| "mean_token_accuracy": 0.7920984081923962, |
| "num_tokens": 20814011.0, |
| "step": 7760 |
| }, |
| { |
| "epoch": 0.48804999842969754, |
| "grad_norm": 0.3514329195022583, |
| "learning_rate": 0.0001674685425957331, |
| "loss": 0.9249, |
| "mean_token_accuracy": 0.7984451025724411, |
| "num_tokens": 20839532.0, |
| "step": 7770 |
| }, |
| { |
| "epoch": 0.4886781194057976, |
| "grad_norm": 0.42157721519470215, |
| "learning_rate": 0.00016742666917907167, |
| "loss": 0.9322, |
| "mean_token_accuracy": 0.7990887399762869, |
| "num_tokens": 20865911.0, |
| "step": 7780 |
| }, |
| { |
| "epoch": 0.4893062403818976, |
| "grad_norm": 0.33001431822776794, |
| "learning_rate": 0.00016738479576241025, |
| "loss": 0.9204, |
| "mean_token_accuracy": 0.7994894739240408, |
| "num_tokens": 20892717.0, |
| "step": 7790 |
| }, |
| { |
| "epoch": 0.48993436135799756, |
| "grad_norm": 0.385657399892807, |
| "learning_rate": 0.00016734292234574883, |
| "loss": 0.9414, |
| "mean_token_accuracy": 0.7973318379372358, |
| "num_tokens": 20919688.0, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.49056248233409755, |
| "grad_norm": 0.3426561653614044, |
| "learning_rate": 0.0001673010489290874, |
| "loss": 0.9601, |
| "mean_token_accuracy": 0.7915532372891902, |
| "num_tokens": 20947274.0, |
| "step": 7810 |
| }, |
| { |
| "epoch": 0.49119060331019754, |
| "grad_norm": 0.34158486127853394, |
| "learning_rate": 0.00016725917551242594, |
| "loss": 0.9393, |
| "mean_token_accuracy": 0.7953637775033713, |
| "num_tokens": 20973896.0, |
| "step": 7820 |
| }, |
| { |
| "epoch": 0.4918187242862975, |
| "grad_norm": 0.36255061626434326, |
| "learning_rate": 0.00016721730209576452, |
| "loss": 0.9446, |
| "mean_token_accuracy": 0.7943896591663361, |
| "num_tokens": 20999609.0, |
| "step": 7830 |
| }, |
| { |
| "epoch": 0.4924468452623975, |
| "grad_norm": 0.38851070404052734, |
| "learning_rate": 0.00016717542867910308, |
| "loss": 0.9107, |
| "mean_token_accuracy": 0.8007842686027289, |
| "num_tokens": 21025356.0, |
| "step": 7840 |
| }, |
| { |
| "epoch": 0.49307496623849756, |
| "grad_norm": 0.35424819588661194, |
| "learning_rate": 0.00016713355526244166, |
| "loss": 0.916, |
| "mean_token_accuracy": 0.7971796747297049, |
| "num_tokens": 21051222.0, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.49370308721459755, |
| "grad_norm": 0.36249005794525146, |
| "learning_rate": 0.0001670916818457802, |
| "loss": 0.9501, |
| "mean_token_accuracy": 0.7942442841827869, |
| "num_tokens": 21077182.0, |
| "step": 7860 |
| }, |
| { |
| "epoch": 0.49433120819069754, |
| "grad_norm": 0.2965359091758728, |
| "learning_rate": 0.0001670498084291188, |
| "loss": 0.9108, |
| "mean_token_accuracy": 0.804225553944707, |
| "num_tokens": 21104365.0, |
| "step": 7870 |
| }, |
| { |
| "epoch": 0.4949593291667975, |
| "grad_norm": 0.379820317029953, |
| "learning_rate": 0.00016700793501245735, |
| "loss": 0.9214, |
| "mean_token_accuracy": 0.8005719102919102, |
| "num_tokens": 21131404.0, |
| "step": 7880 |
| }, |
| { |
| "epoch": 0.4955874501428975, |
| "grad_norm": 0.39546650648117065, |
| "learning_rate": 0.0001669660615957959, |
| "loss": 0.9775, |
| "mean_token_accuracy": 0.7893277246505022, |
| "num_tokens": 21157005.0, |
| "step": 7890 |
| }, |
| { |
| "epoch": 0.4962155711189975, |
| "grad_norm": 0.3420298993587494, |
| "learning_rate": 0.00016692418817913448, |
| "loss": 0.9174, |
| "mean_token_accuracy": 0.7994081798940897, |
| "num_tokens": 21184073.0, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.4968436920950975, |
| "grad_norm": 0.30282649397850037, |
| "learning_rate": 0.00016688231476247304, |
| "loss": 0.9289, |
| "mean_token_accuracy": 0.7969534825533628, |
| "num_tokens": 21210150.0, |
| "step": 7910 |
| }, |
| { |
| "epoch": 0.49747181307119753, |
| "grad_norm": 0.39200103282928467, |
| "learning_rate": 0.00016684044134581162, |
| "loss": 0.9481, |
| "mean_token_accuracy": 0.7922174122184515, |
| "num_tokens": 21237922.0, |
| "step": 7920 |
| }, |
| { |
| "epoch": 0.4980999340472975, |
| "grad_norm": 0.3704611659049988, |
| "learning_rate": 0.00016679856792915017, |
| "loss": 0.9554, |
| "mean_token_accuracy": 0.7943468518555165, |
| "num_tokens": 21265332.0, |
| "step": 7930 |
| }, |
| { |
| "epoch": 0.4987280550233975, |
| "grad_norm": 0.3158395290374756, |
| "learning_rate": 0.00016675669451248875, |
| "loss": 0.9366, |
| "mean_token_accuracy": 0.7983962200582028, |
| "num_tokens": 21292870.0, |
| "step": 7940 |
| }, |
| { |
| "epoch": 0.4993561759994975, |
| "grad_norm": 0.32152894139289856, |
| "learning_rate": 0.00016671482109582733, |
| "loss": 0.9699, |
| "mean_token_accuracy": 0.796767120435834, |
| "num_tokens": 21318347.0, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.4999842969755975, |
| "grad_norm": 0.3472147583961487, |
| "learning_rate": 0.0001666729476791659, |
| "loss": 0.9795, |
| "mean_token_accuracy": 0.7905366614460945, |
| "num_tokens": 21344579.0, |
| "step": 7960 |
| }, |
| { |
| "epoch": 0.5006124179516975, |
| "grad_norm": 0.35350507497787476, |
| "learning_rate": 0.00016663107426250447, |
| "loss": 0.9741, |
| "mean_token_accuracy": 0.7871046803891659, |
| "num_tokens": 21370562.0, |
| "step": 7970 |
| }, |
| { |
| "epoch": 0.5012405389277975, |
| "grad_norm": 0.37780460715293884, |
| "learning_rate": 0.00016658920084584302, |
| "loss": 0.9832, |
| "mean_token_accuracy": 0.7835981391370297, |
| "num_tokens": 21397739.0, |
| "step": 7980 |
| }, |
| { |
| "epoch": 0.5018686599038975, |
| "grad_norm": 0.29296019673347473, |
| "learning_rate": 0.0001665473274291816, |
| "loss": 0.9525, |
| "mean_token_accuracy": 0.7943347483873368, |
| "num_tokens": 21424598.0, |
| "step": 7990 |
| }, |
| { |
| "epoch": 0.5024967808799975, |
| "grad_norm": 0.32463762164115906, |
| "learning_rate": 0.00016650545401252016, |
| "loss": 0.9417, |
| "mean_token_accuracy": 0.7945131246000529, |
| "num_tokens": 21451365.0, |
| "step": 8000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 47763, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.4324562160534979e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|