{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002725306171115161, "grad_norm": 58.5752067565918, "learning_rate": 0.0, "loss": 4.1988, "mean_token_accuracy": 0.27959318878129125, "num_tokens": 180497.0, "step": 1 }, { "epoch": 0.0027253061711151614, "grad_norm": 55.061241149902344, "learning_rate": 2.0361990950226245e-06, "loss": 4.1578, "mean_token_accuracy": 0.2840940863307979, "num_tokens": 1772191.0, "step": 10 }, { "epoch": 0.005450612342230323, "grad_norm": 31.145395278930664, "learning_rate": 4.298642533936651e-06, "loss": 3.805, "mean_token_accuracy": 0.30883588222786784, "num_tokens": 3528128.0, "step": 20 }, { "epoch": 0.008175918513345483, "grad_norm": 12.37236213684082, "learning_rate": 6.5610859728506795e-06, "loss": 3.0369, "mean_token_accuracy": 0.3894659894518554, "num_tokens": 5321820.0, "step": 30 }, { "epoch": 0.010901224684460645, "grad_norm": 4.672379016876221, "learning_rate": 8.823529411764707e-06, "loss": 2.3771, "mean_token_accuracy": 0.48662197683006525, "num_tokens": 7036353.0, "step": 40 }, { "epoch": 0.013626530855575806, "grad_norm": 2.293363094329834, "learning_rate": 1.1085972850678733e-05, "loss": 1.9359, "mean_token_accuracy": 0.5632315624505282, "num_tokens": 8794502.0, "step": 50 }, { "epoch": 0.016351837026690966, "grad_norm": 1.8514742851257324, "learning_rate": 1.3348416289592761e-05, "loss": 1.687, "mean_token_accuracy": 0.6138357298448682, "num_tokens": 10519144.0, "step": 60 }, { "epoch": 0.019077143197806127, "grad_norm": 1.5001717805862427, "learning_rate": 1.5610859728506788e-05, "loss": 1.511, "mean_token_accuracy": 0.6497381268069148, "num_tokens": 12272638.0, "step": 70 }, { "epoch": 0.02180244936892129, "grad_norm": 1.2374789714813232, "learning_rate": 1.7873303167420814e-05, "loss": 1.4223, "mean_token_accuracy": 0.667938649840653, "num_tokens": 14037595.0, "step": 80 }, { "epoch": 0.02452775554003645, "grad_norm": 1.232344388961792, "learning_rate": 2.0135746606334844e-05, "loss": 1.3629, "mean_token_accuracy": 0.6789748859591782, "num_tokens": 15813413.0, "step": 90 }, { "epoch": 0.027253061711151612, "grad_norm": 1.2523256540298462, "learning_rate": 2.239819004524887e-05, "loss": 1.3328, "mean_token_accuracy": 0.6849433833733201, "num_tokens": 17606667.0, "step": 100 }, { "epoch": 0.029978367882266772, "grad_norm": 1.1663717031478882, "learning_rate": 2.4660633484162897e-05, "loss": 1.2645, "mean_token_accuracy": 0.7004281104542315, "num_tokens": 19387650.0, "step": 110 }, { "epoch": 0.03270367405338193, "grad_norm": 1.148568868637085, "learning_rate": 2.6923076923076923e-05, "loss": 1.2482, "mean_token_accuracy": 0.7018253466114401, "num_tokens": 21111629.0, "step": 120 }, { "epoch": 0.03542898022449709, "grad_norm": 1.0586553812026978, "learning_rate": 2.9185520361990953e-05, "loss": 1.2162, "mean_token_accuracy": 0.7089157540351152, "num_tokens": 22855847.0, "step": 130 }, { "epoch": 0.038154286395612254, "grad_norm": 1.1304233074188232, "learning_rate": 3.1447963800904976e-05, "loss": 1.1918, "mean_token_accuracy": 0.7152549306862056, "num_tokens": 24647294.0, "step": 140 }, { "epoch": 0.04087959256672742, "grad_norm": 1.1450897455215454, "learning_rate": 3.371040723981901e-05, "loss": 1.1655, "mean_token_accuracy": 0.7181231830269098, "num_tokens": 26336128.0, "step": 150 }, { "epoch": 0.04360489873784258, "grad_norm": 1.0469363927841187, "learning_rate": 3.5972850678733036e-05, "loss": 1.2058, "mean_token_accuracy": 0.7123475090600551, "num_tokens": 28061528.0, "step": 160 }, { "epoch": 0.04633020490895774, "grad_norm": 1.0489130020141602, "learning_rate": 3.8235294117647055e-05, "loss": 1.178, "mean_token_accuracy": 0.7180261522531509, "num_tokens": 29834651.0, "step": 170 }, { "epoch": 0.0490555110800729, "grad_norm": 1.1838144063949585, "learning_rate": 4.049773755656109e-05, "loss": 1.1964, "mean_token_accuracy": 0.7144321620464325, "num_tokens": 31533806.0, "step": 180 }, { "epoch": 0.05178081725118806, "grad_norm": 1.0851892232894897, "learning_rate": 4.2760180995475115e-05, "loss": 1.1408, "mean_token_accuracy": 0.72315530218184, "num_tokens": 33328644.0, "step": 190 }, { "epoch": 0.054506123422303224, "grad_norm": 1.1279138326644897, "learning_rate": 4.502262443438914e-05, "loss": 1.1406, "mean_token_accuracy": 0.7242222094908357, "num_tokens": 35085202.0, "step": 200 }, { "epoch": 0.057231429593418384, "grad_norm": 1.07979416847229, "learning_rate": 4.728506787330317e-05, "loss": 1.1566, "mean_token_accuracy": 0.7219386071898043, "num_tokens": 36899685.0, "step": 210 }, { "epoch": 0.059956735764533545, "grad_norm": 1.1669384241104126, "learning_rate": 4.95475113122172e-05, "loss": 1.0724, "mean_token_accuracy": 0.7370353294536471, "num_tokens": 38663867.0, "step": 220 }, { "epoch": 0.0626820419356487, "grad_norm": 1.0999536514282227, "learning_rate": 4.994381233319287e-05, "loss": 1.0878, "mean_token_accuracy": 0.733780630491674, "num_tokens": 40368688.0, "step": 230 }, { "epoch": 0.06540734810676387, "grad_norm": 1.029019832611084, "learning_rate": 4.9873577749683945e-05, "loss": 1.0928, "mean_token_accuracy": 0.7319904958829284, "num_tokens": 42091556.0, "step": 240 }, { "epoch": 0.06813265427787903, "grad_norm": 1.0037535429000854, "learning_rate": 4.9803343166175026e-05, "loss": 1.0902, "mean_token_accuracy": 0.7336283173412085, "num_tokens": 43832865.0, "step": 250 }, { "epoch": 0.07085796044899419, "grad_norm": 1.0020638704299927, "learning_rate": 4.9733108582666106e-05, "loss": 1.0952, "mean_token_accuracy": 0.7330724668689073, "num_tokens": 45556421.0, "step": 260 }, { "epoch": 0.07358326662010935, "grad_norm": 1.0238029956817627, "learning_rate": 4.9662873999157186e-05, "loss": 1.1225, "mean_token_accuracy": 0.7294999814592302, "num_tokens": 47284838.0, "step": 270 }, { "epoch": 0.07630857279122451, "grad_norm": 1.0471445322036743, "learning_rate": 4.9592639415648266e-05, "loss": 1.0805, "mean_token_accuracy": 0.734851144067943, "num_tokens": 49025592.0, "step": 280 }, { "epoch": 0.07903387896233968, "grad_norm": 1.1734944581985474, "learning_rate": 4.9522404832139346e-05, "loss": 1.1459, "mean_token_accuracy": 0.7227287207730114, "num_tokens": 50771353.0, "step": 290 }, { "epoch": 0.08175918513345484, "grad_norm": 1.0051774978637695, "learning_rate": 4.945217024863043e-05, "loss": 1.0595, "mean_token_accuracy": 0.7396277584135532, "num_tokens": 52521467.0, "step": 300 }, { "epoch": 0.08448449130457, "grad_norm": 1.1502430438995361, "learning_rate": 4.938193566512151e-05, "loss": 1.0856, "mean_token_accuracy": 0.7348609266802668, "num_tokens": 54338858.0, "step": 310 }, { "epoch": 0.08720979747568516, "grad_norm": 1.079554796218872, "learning_rate": 4.931170108161259e-05, "loss": 1.0876, "mean_token_accuracy": 0.7352117918431759, "num_tokens": 56014081.0, "step": 320 }, { "epoch": 0.08993510364680032, "grad_norm": 1.0328309535980225, "learning_rate": 4.924146649810367e-05, "loss": 1.0622, "mean_token_accuracy": 0.7386171711608768, "num_tokens": 57778337.0, "step": 330 }, { "epoch": 0.09266040981791548, "grad_norm": 0.9490402936935425, "learning_rate": 4.9171231914594754e-05, "loss": 1.0549, "mean_token_accuracy": 0.74089086484164, "num_tokens": 59528977.0, "step": 340 }, { "epoch": 0.09538571598903064, "grad_norm": 0.9298028349876404, "learning_rate": 4.910099733108583e-05, "loss": 1.0704, "mean_token_accuracy": 0.7380094730295241, "num_tokens": 61295985.0, "step": 350 }, { "epoch": 0.0981110221601458, "grad_norm": 1.0823702812194824, "learning_rate": 4.903076274757691e-05, "loss": 1.0535, "mean_token_accuracy": 0.7405891207046806, "num_tokens": 63026897.0, "step": 360 }, { "epoch": 0.10083632833126097, "grad_norm": 0.9523594379425049, "learning_rate": 4.896052816406799e-05, "loss": 1.0444, "mean_token_accuracy": 0.7437008136883378, "num_tokens": 64770711.0, "step": 370 }, { "epoch": 0.10356163450237613, "grad_norm": 1.1027802228927612, "learning_rate": 4.889029358055907e-05, "loss": 1.0446, "mean_token_accuracy": 0.7440237908624112, "num_tokens": 66551008.0, "step": 380 }, { "epoch": 0.10628694067349129, "grad_norm": 0.9772301912307739, "learning_rate": 4.882005899705015e-05, "loss": 1.0598, "mean_token_accuracy": 0.7391570524312556, "num_tokens": 68320690.0, "step": 390 }, { "epoch": 0.10901224684460645, "grad_norm": 0.8989792466163635, "learning_rate": 4.874982441354123e-05, "loss": 1.0231, "mean_token_accuracy": 0.7470484189689159, "num_tokens": 70095284.0, "step": 400 }, { "epoch": 0.11173755301572161, "grad_norm": 0.9571840167045593, "learning_rate": 4.8679589830032316e-05, "loss": 1.0416, "mean_token_accuracy": 0.7444285966455937, "num_tokens": 71803511.0, "step": 410 }, { "epoch": 0.11446285918683677, "grad_norm": 0.9428116679191589, "learning_rate": 4.860935524652339e-05, "loss": 0.9991, "mean_token_accuracy": 0.7514733706600965, "num_tokens": 73528297.0, "step": 420 }, { "epoch": 0.11718816535795193, "grad_norm": 0.9464392066001892, "learning_rate": 4.853912066301447e-05, "loss": 1.0022, "mean_token_accuracy": 0.7517835930921137, "num_tokens": 75274289.0, "step": 430 }, { "epoch": 0.11991347152906709, "grad_norm": 1.0934405326843262, "learning_rate": 4.846888607950555e-05, "loss": 1.0466, "mean_token_accuracy": 0.7423655292019248, "num_tokens": 77077403.0, "step": 440 }, { "epoch": 0.12263877770018225, "grad_norm": 0.9798712134361267, "learning_rate": 4.8398651495996636e-05, "loss": 1.0416, "mean_token_accuracy": 0.7423399617895484, "num_tokens": 78859882.0, "step": 450 }, { "epoch": 0.1253640838712974, "grad_norm": 0.9301265478134155, "learning_rate": 4.832841691248771e-05, "loss": 1.0564, "mean_token_accuracy": 0.7402447827160359, "num_tokens": 80590976.0, "step": 460 }, { "epoch": 0.12808939004241257, "grad_norm": 1.028621792793274, "learning_rate": 4.825818232897879e-05, "loss": 1.0135, "mean_token_accuracy": 0.7484846420586109, "num_tokens": 82275126.0, "step": 470 }, { "epoch": 0.13081469621352773, "grad_norm": 0.9343350529670715, "learning_rate": 4.818794774546987e-05, "loss": 1.0102, "mean_token_accuracy": 0.7489786158315838, "num_tokens": 84096910.0, "step": 480 }, { "epoch": 0.1335400023846429, "grad_norm": 0.9360990524291992, "learning_rate": 4.811771316196095e-05, "loss": 1.0402, "mean_token_accuracy": 0.7446358595974744, "num_tokens": 85876326.0, "step": 490 }, { "epoch": 0.13626530855575805, "grad_norm": 0.9260503053665161, "learning_rate": 4.804747857845203e-05, "loss": 1.0412, "mean_token_accuracy": 0.7441607388667763, "num_tokens": 87607478.0, "step": 500 }, { "epoch": 0.1389906147268732, "grad_norm": 0.9006381034851074, "learning_rate": 4.797724399494311e-05, "loss": 1.0014, "mean_token_accuracy": 0.7509601839818061, "num_tokens": 89350066.0, "step": 510 }, { "epoch": 0.14171592089798837, "grad_norm": 0.9362842440605164, "learning_rate": 4.79070094114342e-05, "loss": 1.0138, "mean_token_accuracy": 0.7497716393321753, "num_tokens": 91067910.0, "step": 520 }, { "epoch": 0.14444122706910353, "grad_norm": 0.9452476501464844, "learning_rate": 4.783677482792527e-05, "loss": 1.0052, "mean_token_accuracy": 0.7499888489954174, "num_tokens": 92797017.0, "step": 530 }, { "epoch": 0.1471665332402187, "grad_norm": 0.9116583466529846, "learning_rate": 4.776654024441635e-05, "loss": 1.0263, "mean_token_accuracy": 0.7451268086209893, "num_tokens": 94606329.0, "step": 540 }, { "epoch": 0.14989183941133385, "grad_norm": 0.9305568933486938, "learning_rate": 4.769630566090743e-05, "loss": 1.0039, "mean_token_accuracy": 0.7499175570905209, "num_tokens": 96331087.0, "step": 550 }, { "epoch": 0.15261714558244902, "grad_norm": 0.9302889704704285, "learning_rate": 4.762607107739852e-05, "loss": 1.0239, "mean_token_accuracy": 0.7466340310871601, "num_tokens": 98138711.0, "step": 560 }, { "epoch": 0.15534245175356418, "grad_norm": 0.9294123649597168, "learning_rate": 4.755583649388959e-05, "loss": 1.0285, "mean_token_accuracy": 0.7461859748698771, "num_tokens": 99885005.0, "step": 570 }, { "epoch": 0.15806775792467936, "grad_norm": 0.8810741901397705, "learning_rate": 4.748560191038067e-05, "loss": 0.9361, "mean_token_accuracy": 0.7639201901853084, "num_tokens": 101636075.0, "step": 580 }, { "epoch": 0.16079306409579452, "grad_norm": 0.987924337387085, "learning_rate": 4.741536732687175e-05, "loss": 0.9949, "mean_token_accuracy": 0.7529382834210991, "num_tokens": 103349118.0, "step": 590 }, { "epoch": 0.16351837026690969, "grad_norm": 0.9978275895118713, "learning_rate": 4.734513274336283e-05, "loss": 1.0056, "mean_token_accuracy": 0.7498491807840765, "num_tokens": 105033010.0, "step": 600 }, { "epoch": 0.16624367643802485, "grad_norm": 0.9176990985870361, "learning_rate": 4.727489815985391e-05, "loss": 0.9731, "mean_token_accuracy": 0.7562306389212609, "num_tokens": 106723283.0, "step": 610 }, { "epoch": 0.16896898260914, "grad_norm": 0.8888489007949829, "learning_rate": 4.720466357634499e-05, "loss": 0.9907, "mean_token_accuracy": 0.7524073977023363, "num_tokens": 108436878.0, "step": 620 }, { "epoch": 0.17169428878025517, "grad_norm": 0.9666376709938049, "learning_rate": 4.713442899283608e-05, "loss": 0.9752, "mean_token_accuracy": 0.7562545623630286, "num_tokens": 110203157.0, "step": 630 }, { "epoch": 0.17441959495137033, "grad_norm": 1.034400463104248, "learning_rate": 4.706419440932715e-05, "loss": 1.0029, "mean_token_accuracy": 0.7515896405093372, "num_tokens": 111949130.0, "step": 640 }, { "epoch": 0.1771449011224855, "grad_norm": 1.044718623161316, "learning_rate": 4.6993959825818233e-05, "loss": 1.0037, "mean_token_accuracy": 0.7487363073974848, "num_tokens": 113652926.0, "step": 650 }, { "epoch": 0.17987020729360065, "grad_norm": 0.906299352645874, "learning_rate": 4.6923725242309314e-05, "loss": 0.9903, "mean_token_accuracy": 0.7514964304864407, "num_tokens": 115334647.0, "step": 660 }, { "epoch": 0.1825955134647158, "grad_norm": 0.8646354675292969, "learning_rate": 4.68534906588004e-05, "loss": 0.9501, "mean_token_accuracy": 0.7617973797954619, "num_tokens": 117032381.0, "step": 670 }, { "epoch": 0.18532081963583097, "grad_norm": 0.8861644864082336, "learning_rate": 4.6783256075291474e-05, "loss": 0.9859, "mean_token_accuracy": 0.7530626423656941, "num_tokens": 118801553.0, "step": 680 }, { "epoch": 0.18804612580694613, "grad_norm": 0.9357079863548279, "learning_rate": 4.6713021491782554e-05, "loss": 0.9989, "mean_token_accuracy": 0.7504322250373662, "num_tokens": 120530271.0, "step": 690 }, { "epoch": 0.1907714319780613, "grad_norm": 0.8540539741516113, "learning_rate": 4.6642786908273634e-05, "loss": 0.9904, "mean_token_accuracy": 0.7538057889789342, "num_tokens": 122314411.0, "step": 700 }, { "epoch": 0.19349673814917645, "grad_norm": 0.8792332410812378, "learning_rate": 4.6572552324764715e-05, "loss": 0.9839, "mean_token_accuracy": 0.7542797921225428, "num_tokens": 124054113.0, "step": 710 }, { "epoch": 0.1962220443202916, "grad_norm": 0.9250332713127136, "learning_rate": 4.6502317741255795e-05, "loss": 0.9988, "mean_token_accuracy": 0.7507336236536503, "num_tokens": 125786705.0, "step": 720 }, { "epoch": 0.19894735049140677, "grad_norm": 0.9268140196800232, "learning_rate": 4.6432083157746875e-05, "loss": 0.9998, "mean_token_accuracy": 0.7502107596024871, "num_tokens": 127510112.0, "step": 730 }, { "epoch": 0.20167265666252193, "grad_norm": 0.9151081442832947, "learning_rate": 4.636184857423796e-05, "loss": 0.9637, "mean_token_accuracy": 0.7585853973403573, "num_tokens": 129321733.0, "step": 740 }, { "epoch": 0.2043979628336371, "grad_norm": 0.8551493287086487, "learning_rate": 4.6291613990729035e-05, "loss": 1.0006, "mean_token_accuracy": 0.7512839620932936, "num_tokens": 131068939.0, "step": 750 }, { "epoch": 0.20712326900475225, "grad_norm": 1.0179383754730225, "learning_rate": 4.6221379407220116e-05, "loss": 1.0366, "mean_token_accuracy": 0.7441357178613544, "num_tokens": 132800192.0, "step": 760 }, { "epoch": 0.2098485751758674, "grad_norm": 0.8894006013870239, "learning_rate": 4.6151144823711196e-05, "loss": 0.9706, "mean_token_accuracy": 0.7571743190288543, "num_tokens": 134501880.0, "step": 770 }, { "epoch": 0.21257388134698257, "grad_norm": 0.9344801306724548, "learning_rate": 4.608091024020228e-05, "loss": 0.9495, "mean_token_accuracy": 0.7609304752200842, "num_tokens": 136227230.0, "step": 780 }, { "epoch": 0.21529918751809773, "grad_norm": 0.9688650965690613, "learning_rate": 4.6010675656693356e-05, "loss": 0.9634, "mean_token_accuracy": 0.7590550154447555, "num_tokens": 137938433.0, "step": 790 }, { "epoch": 0.2180244936892129, "grad_norm": 0.8858566880226135, "learning_rate": 4.5940441073184436e-05, "loss": 0.9736, "mean_token_accuracy": 0.7575385818257928, "num_tokens": 139667163.0, "step": 800 }, { "epoch": 0.22074979986032806, "grad_norm": 0.7869423031806946, "learning_rate": 4.5870206489675517e-05, "loss": 0.9494, "mean_token_accuracy": 0.7619166353717446, "num_tokens": 141481099.0, "step": 810 }, { "epoch": 0.22347510603144322, "grad_norm": 0.889818549156189, "learning_rate": 4.57999719061666e-05, "loss": 1.0118, "mean_token_accuracy": 0.7489442000165581, "num_tokens": 143204243.0, "step": 820 }, { "epoch": 0.22620041220255838, "grad_norm": 0.8601938486099243, "learning_rate": 4.572973732265768e-05, "loss": 0.9731, "mean_token_accuracy": 0.7579358568415046, "num_tokens": 144995581.0, "step": 830 }, { "epoch": 0.22892571837367354, "grad_norm": 0.868138313293457, "learning_rate": 4.565950273914876e-05, "loss": 0.9795, "mean_token_accuracy": 0.7540082458406687, "num_tokens": 146711076.0, "step": 840 }, { "epoch": 0.2316510245447887, "grad_norm": 0.8250023722648621, "learning_rate": 4.5589268155639844e-05, "loss": 0.971, "mean_token_accuracy": 0.7573945553973317, "num_tokens": 148463902.0, "step": 850 }, { "epoch": 0.23437633071590386, "grad_norm": 0.8885928392410278, "learning_rate": 4.551903357213092e-05, "loss": 0.9964, "mean_token_accuracy": 0.7519168628379702, "num_tokens": 150212801.0, "step": 860 }, { "epoch": 0.23710163688701902, "grad_norm": 0.8809767961502075, "learning_rate": 4.5448798988622e-05, "loss": 0.945, "mean_token_accuracy": 0.7604282542131842, "num_tokens": 151950016.0, "step": 870 }, { "epoch": 0.23982694305813418, "grad_norm": 1.013855218887329, "learning_rate": 4.537856440511308e-05, "loss": 0.9567, "mean_token_accuracy": 0.7587636118754745, "num_tokens": 153686341.0, "step": 880 }, { "epoch": 0.24255224922924934, "grad_norm": 0.8054437637329102, "learning_rate": 4.5308329821604165e-05, "loss": 0.9852, "mean_token_accuracy": 0.753642223123461, "num_tokens": 155442220.0, "step": 890 }, { "epoch": 0.2452775554003645, "grad_norm": 0.9395073056221008, "learning_rate": 4.523809523809524e-05, "loss": 0.9552, "mean_token_accuracy": 0.7594983844086528, "num_tokens": 157222278.0, "step": 900 }, { "epoch": 0.24800286157147966, "grad_norm": 0.8815314173698425, "learning_rate": 4.516786065458632e-05, "loss": 0.9484, "mean_token_accuracy": 0.7606731666252017, "num_tokens": 158989873.0, "step": 910 }, { "epoch": 0.2507281677425948, "grad_norm": 0.9090284109115601, "learning_rate": 4.50976260710774e-05, "loss": 0.9742, "mean_token_accuracy": 0.7581370803527534, "num_tokens": 160740690.0, "step": 920 }, { "epoch": 0.25345347391371, "grad_norm": 0.907217800617218, "learning_rate": 4.502739148756848e-05, "loss": 0.9779, "mean_token_accuracy": 0.7548290783539414, "num_tokens": 162505210.0, "step": 930 }, { "epoch": 0.25617878008482514, "grad_norm": 0.9016557335853577, "learning_rate": 4.495715690405956e-05, "loss": 0.9842, "mean_token_accuracy": 0.7541791129857301, "num_tokens": 164184119.0, "step": 940 }, { "epoch": 0.2589040862559403, "grad_norm": 0.8286603093147278, "learning_rate": 4.488692232055064e-05, "loss": 0.9433, "mean_token_accuracy": 0.7619124032557011, "num_tokens": 165971146.0, "step": 950 }, { "epoch": 0.26162939242705546, "grad_norm": 0.9046083092689514, "learning_rate": 4.4816687737041726e-05, "loss": 0.9781, "mean_token_accuracy": 0.7572765513323247, "num_tokens": 167701701.0, "step": 960 }, { "epoch": 0.2643546985981706, "grad_norm": 0.936540961265564, "learning_rate": 4.47464531535328e-05, "loss": 0.969, "mean_token_accuracy": 0.7577900117263198, "num_tokens": 169443326.0, "step": 970 }, { "epoch": 0.2670800047692858, "grad_norm": 0.8902041912078857, "learning_rate": 4.467621857002388e-05, "loss": 0.9706, "mean_token_accuracy": 0.7567241456359625, "num_tokens": 171199385.0, "step": 980 }, { "epoch": 0.26980531094040094, "grad_norm": 0.9205445647239685, "learning_rate": 4.460598398651496e-05, "loss": 0.9383, "mean_token_accuracy": 0.7634873200207949, "num_tokens": 172929493.0, "step": 990 }, { "epoch": 0.2725306171115161, "grad_norm": 0.9309903979301453, "learning_rate": 4.453574940300605e-05, "loss": 0.9442, "mean_token_accuracy": 0.7614085223525763, "num_tokens": 174672601.0, "step": 1000 }, { "epoch": 0.27525592328263127, "grad_norm": 0.8428636193275452, "learning_rate": 4.446551481949712e-05, "loss": 0.9607, "mean_token_accuracy": 0.7597405662760138, "num_tokens": 176441962.0, "step": 1010 }, { "epoch": 0.2779812294537464, "grad_norm": 0.9295870661735535, "learning_rate": 4.43952802359882e-05, "loss": 0.9468, "mean_token_accuracy": 0.7614248601719737, "num_tokens": 178197361.0, "step": 1020 }, { "epoch": 0.2807065356248616, "grad_norm": 0.9550367593765259, "learning_rate": 4.432504565247929e-05, "loss": 0.9766, "mean_token_accuracy": 0.7538504358381033, "num_tokens": 179887580.0, "step": 1030 }, { "epoch": 0.28343184179597675, "grad_norm": 0.9089730978012085, "learning_rate": 4.425481106897036e-05, "loss": 0.9382, "mean_token_accuracy": 0.7631622815504671, "num_tokens": 181655865.0, "step": 1040 }, { "epoch": 0.2861571479670919, "grad_norm": 1.0070922374725342, "learning_rate": 4.418457648546144e-05, "loss": 0.962, "mean_token_accuracy": 0.7598225042223931, "num_tokens": 183445880.0, "step": 1050 }, { "epoch": 0.28888245413820707, "grad_norm": 0.8544902205467224, "learning_rate": 4.411434190195252e-05, "loss": 0.9296, "mean_token_accuracy": 0.7648898587562144, "num_tokens": 185175706.0, "step": 1060 }, { "epoch": 0.29160776030932223, "grad_norm": 0.9063568115234375, "learning_rate": 4.404410731844361e-05, "loss": 0.942, "mean_token_accuracy": 0.7613558363169431, "num_tokens": 186873396.0, "step": 1070 }, { "epoch": 0.2943330664804374, "grad_norm": 0.8668667078018188, "learning_rate": 4.397387273493468e-05, "loss": 0.9765, "mean_token_accuracy": 0.7555979456752538, "num_tokens": 188591288.0, "step": 1080 }, { "epoch": 0.29705837265155255, "grad_norm": 0.9103459119796753, "learning_rate": 4.390363815142576e-05, "loss": 0.9691, "mean_token_accuracy": 0.7566804554313421, "num_tokens": 190375182.0, "step": 1090 }, { "epoch": 0.2997836788226677, "grad_norm": 0.9022314548492432, "learning_rate": 4.383340356791684e-05, "loss": 0.955, "mean_token_accuracy": 0.7589525409042835, "num_tokens": 192104609.0, "step": 1100 }, { "epoch": 0.30250898499378287, "grad_norm": 0.8525939583778381, "learning_rate": 4.376316898440793e-05, "loss": 0.8952, "mean_token_accuracy": 0.7715249681845308, "num_tokens": 193819768.0, "step": 1110 }, { "epoch": 0.30523429116489803, "grad_norm": 0.8655187487602234, "learning_rate": 4.3692934400899e-05, "loss": 0.957, "mean_token_accuracy": 0.7608988376334309, "num_tokens": 195541705.0, "step": 1120 }, { "epoch": 0.3079595973360132, "grad_norm": 0.8646089434623718, "learning_rate": 4.362269981739008e-05, "loss": 0.944, "mean_token_accuracy": 0.7616777900606394, "num_tokens": 197250976.0, "step": 1130 }, { "epoch": 0.31068490350712835, "grad_norm": 0.8167153000831604, "learning_rate": 4.355246523388117e-05, "loss": 0.9149, "mean_token_accuracy": 0.7681540355086327, "num_tokens": 198964087.0, "step": 1140 }, { "epoch": 0.31341020967824357, "grad_norm": 0.9367976188659668, "learning_rate": 4.348223065037224e-05, "loss": 0.9599, "mean_token_accuracy": 0.7586290875449777, "num_tokens": 200654341.0, "step": 1150 }, { "epoch": 0.31613551584935873, "grad_norm": 0.8891621828079224, "learning_rate": 4.3411996066863323e-05, "loss": 0.9865, "mean_token_accuracy": 0.7538609990850091, "num_tokens": 202410389.0, "step": 1160 }, { "epoch": 0.3188608220204739, "grad_norm": 0.8583462834358215, "learning_rate": 4.3341761483354404e-05, "loss": 0.9421, "mean_token_accuracy": 0.7621322822757065, "num_tokens": 204176073.0, "step": 1170 }, { "epoch": 0.32158612819158905, "grad_norm": 0.9226254224777222, "learning_rate": 4.327152689984549e-05, "loss": 0.9369, "mean_token_accuracy": 0.7635338991880417, "num_tokens": 206017214.0, "step": 1180 }, { "epoch": 0.3243114343627042, "grad_norm": 0.958926796913147, "learning_rate": 4.3201292316336564e-05, "loss": 0.948, "mean_token_accuracy": 0.7608529534190893, "num_tokens": 207832984.0, "step": 1190 }, { "epoch": 0.32703674053381937, "grad_norm": 0.9389734268188477, "learning_rate": 4.3131057732827644e-05, "loss": 0.9579, "mean_token_accuracy": 0.7594182942993939, "num_tokens": 209533427.0, "step": 1200 }, { "epoch": 0.32976204670493453, "grad_norm": 0.8880856037139893, "learning_rate": 4.3060823149318724e-05, "loss": 0.9352, "mean_token_accuracy": 0.7635649677366019, "num_tokens": 211344332.0, "step": 1210 }, { "epoch": 0.3324873528760497, "grad_norm": 0.9591731429100037, "learning_rate": 4.299058856580981e-05, "loss": 0.9633, "mean_token_accuracy": 0.7583153389394284, "num_tokens": 213088746.0, "step": 1220 }, { "epoch": 0.33521265904716485, "grad_norm": 0.9024151563644409, "learning_rate": 4.2920353982300885e-05, "loss": 0.9647, "mean_token_accuracy": 0.7571177691221237, "num_tokens": 214889528.0, "step": 1230 }, { "epoch": 0.33793796521828, "grad_norm": 0.9884682893753052, "learning_rate": 4.2850119398791965e-05, "loss": 0.9719, "mean_token_accuracy": 0.7572076172567904, "num_tokens": 216603081.0, "step": 1240 }, { "epoch": 0.3406632713893952, "grad_norm": 0.8637387752532959, "learning_rate": 4.277988481528305e-05, "loss": 0.9211, "mean_token_accuracy": 0.7659906836226582, "num_tokens": 218292845.0, "step": 1250 }, { "epoch": 0.34338857756051033, "grad_norm": 0.9273796081542969, "learning_rate": 4.2709650231774125e-05, "loss": 0.9454, "mean_token_accuracy": 0.7626127757132053, "num_tokens": 220029696.0, "step": 1260 }, { "epoch": 0.3461138837316255, "grad_norm": 0.8521084189414978, "learning_rate": 4.2639415648265206e-05, "loss": 1.0002, "mean_token_accuracy": 0.7520326241850853, "num_tokens": 221810553.0, "step": 1270 }, { "epoch": 0.34883918990274065, "grad_norm": 0.9349818229675293, "learning_rate": 4.2569181064756286e-05, "loss": 0.9412, "mean_token_accuracy": 0.7652541293762625, "num_tokens": 223552801.0, "step": 1280 }, { "epoch": 0.3515644960738558, "grad_norm": 0.9496633410453796, "learning_rate": 4.249894648124737e-05, "loss": 0.9486, "mean_token_accuracy": 0.7618991539813578, "num_tokens": 225254929.0, "step": 1290 }, { "epoch": 0.354289802244971, "grad_norm": 0.8988118767738342, "learning_rate": 4.2428711897738446e-05, "loss": 0.9679, "mean_token_accuracy": 0.7575280690565706, "num_tokens": 227017304.0, "step": 1300 }, { "epoch": 0.35701510841608614, "grad_norm": 0.9880586862564087, "learning_rate": 4.2358477314229526e-05, "loss": 0.9199, "mean_token_accuracy": 0.7675149342045188, "num_tokens": 228785771.0, "step": 1310 }, { "epoch": 0.3597404145872013, "grad_norm": 0.9548985958099365, "learning_rate": 4.2288242730720607e-05, "loss": 0.9276, "mean_token_accuracy": 0.7658209035173058, "num_tokens": 230489667.0, "step": 1320 }, { "epoch": 0.36246572075831646, "grad_norm": 0.8222749829292297, "learning_rate": 4.2218008147211694e-05, "loss": 0.9464, "mean_token_accuracy": 0.7618509028106928, "num_tokens": 232269581.0, "step": 1330 }, { "epoch": 0.3651910269294316, "grad_norm": 0.8697149753570557, "learning_rate": 4.214777356370277e-05, "loss": 0.9507, "mean_token_accuracy": 0.7589831293560565, "num_tokens": 234028126.0, "step": 1340 }, { "epoch": 0.3679163331005468, "grad_norm": 1.0205236673355103, "learning_rate": 4.207753898019385e-05, "loss": 0.9703, "mean_token_accuracy": 0.7572768048383296, "num_tokens": 235817129.0, "step": 1350 }, { "epoch": 0.37064163927166194, "grad_norm": 0.8218963742256165, "learning_rate": 4.2007304396684934e-05, "loss": 0.9508, "mean_token_accuracy": 0.7612483019009233, "num_tokens": 237588984.0, "step": 1360 }, { "epoch": 0.3733669454427771, "grad_norm": 0.9388371109962463, "learning_rate": 4.193706981317601e-05, "loss": 0.9144, "mean_token_accuracy": 0.7678116019815207, "num_tokens": 239305200.0, "step": 1370 }, { "epoch": 0.37609225161389226, "grad_norm": 0.8573911190032959, "learning_rate": 4.186683522966709e-05, "loss": 0.9368, "mean_token_accuracy": 0.7639053143560887, "num_tokens": 241078759.0, "step": 1380 }, { "epoch": 0.3788175577850074, "grad_norm": 0.918056845664978, "learning_rate": 4.179660064615817e-05, "loss": 0.9596, "mean_token_accuracy": 0.7594995486550034, "num_tokens": 242876841.0, "step": 1390 }, { "epoch": 0.3815428639561226, "grad_norm": 0.8583497405052185, "learning_rate": 4.1726366062649255e-05, "loss": 0.9252, "mean_token_accuracy": 0.7642927357926965, "num_tokens": 244578724.0, "step": 1400 }, { "epoch": 0.38426817012723774, "grad_norm": 0.8453887104988098, "learning_rate": 4.165613147914033e-05, "loss": 0.9525, "mean_token_accuracy": 0.7590734004974365, "num_tokens": 246363009.0, "step": 1410 }, { "epoch": 0.3869934762983529, "grad_norm": 0.8691115975379944, "learning_rate": 4.158589689563141e-05, "loss": 0.9764, "mean_token_accuracy": 0.7560055317357183, "num_tokens": 248120613.0, "step": 1420 }, { "epoch": 0.38971878246946806, "grad_norm": 0.8741028904914856, "learning_rate": 4.151566231212249e-05, "loss": 0.9514, "mean_token_accuracy": 0.7612073054537177, "num_tokens": 249941867.0, "step": 1430 }, { "epoch": 0.3924440886405832, "grad_norm": 0.8339214324951172, "learning_rate": 4.1445427728613576e-05, "loss": 0.9189, "mean_token_accuracy": 0.7668958367779851, "num_tokens": 251695734.0, "step": 1440 }, { "epoch": 0.3951693948116984, "grad_norm": 0.8296242356300354, "learning_rate": 4.137519314510465e-05, "loss": 0.9627, "mean_token_accuracy": 0.7588108706288039, "num_tokens": 253503211.0, "step": 1450 }, { "epoch": 0.39789470098281354, "grad_norm": 1.0061241388320923, "learning_rate": 4.130495856159573e-05, "loss": 0.9399, "mean_token_accuracy": 0.7630201263353229, "num_tokens": 255313734.0, "step": 1460 }, { "epoch": 0.4006200071539287, "grad_norm": 0.795073390007019, "learning_rate": 4.1234723978086816e-05, "loss": 0.8906, "mean_token_accuracy": 0.7732970826327801, "num_tokens": 257016513.0, "step": 1470 }, { "epoch": 0.40334531332504386, "grad_norm": 0.9418059587478638, "learning_rate": 4.1164489394577896e-05, "loss": 0.9642, "mean_token_accuracy": 0.757817355170846, "num_tokens": 258760362.0, "step": 1480 }, { "epoch": 0.406070619496159, "grad_norm": 0.8528169989585876, "learning_rate": 4.109425481106897e-05, "loss": 0.924, "mean_token_accuracy": 0.7663666036911309, "num_tokens": 260546971.0, "step": 1490 }, { "epoch": 0.4087959256672742, "grad_norm": 0.8477678298950195, "learning_rate": 4.102402022756005e-05, "loss": 0.9325, "mean_token_accuracy": 0.7635815197601914, "num_tokens": 262307672.0, "step": 1500 }, { "epoch": 0.41152123183838935, "grad_norm": 0.9021455645561218, "learning_rate": 4.095378564405114e-05, "loss": 0.9451, "mean_token_accuracy": 0.7621158785186708, "num_tokens": 264023889.0, "step": 1510 }, { "epoch": 0.4142465380095045, "grad_norm": 0.8655399084091187, "learning_rate": 4.088355106054221e-05, "loss": 0.9435, "mean_token_accuracy": 0.76206961190328, "num_tokens": 265798519.0, "step": 1520 }, { "epoch": 0.41697184418061967, "grad_norm": 0.8364386558532715, "learning_rate": 4.081331647703329e-05, "loss": 0.9599, "mean_token_accuracy": 0.7590883946046233, "num_tokens": 267546100.0, "step": 1530 }, { "epoch": 0.4196971503517348, "grad_norm": 0.8579459190368652, "learning_rate": 4.074308189352437e-05, "loss": 0.9233, "mean_token_accuracy": 0.7652724193409085, "num_tokens": 269297641.0, "step": 1540 }, { "epoch": 0.42242245652285, "grad_norm": 0.7599934339523315, "learning_rate": 4.067284731001546e-05, "loss": 0.9606, "mean_token_accuracy": 0.7579821115359664, "num_tokens": 271053006.0, "step": 1550 }, { "epoch": 0.42514776269396515, "grad_norm": 0.934144139289856, "learning_rate": 4.060261272650653e-05, "loss": 0.9439, "mean_token_accuracy": 0.7628989959135651, "num_tokens": 272829487.0, "step": 1560 }, { "epoch": 0.4278730688650803, "grad_norm": 0.8444865345954895, "learning_rate": 4.053237814299761e-05, "loss": 0.937, "mean_token_accuracy": 0.7626888936385512, "num_tokens": 274621990.0, "step": 1570 }, { "epoch": 0.43059837503619547, "grad_norm": 0.854570209980011, "learning_rate": 4.04621435594887e-05, "loss": 0.9555, "mean_token_accuracy": 0.7589856326580048, "num_tokens": 276398989.0, "step": 1580 }, { "epoch": 0.43332368120731063, "grad_norm": 0.916355550289154, "learning_rate": 4.039190897597978e-05, "loss": 0.9217, "mean_token_accuracy": 0.7648066570982337, "num_tokens": 278192884.0, "step": 1590 }, { "epoch": 0.4360489873784258, "grad_norm": 0.9150972962379456, "learning_rate": 4.032167439247085e-05, "loss": 0.9223, "mean_token_accuracy": 0.7662323093041777, "num_tokens": 279937931.0, "step": 1600 }, { "epoch": 0.43877429354954095, "grad_norm": 0.7945618629455566, "learning_rate": 4.025143980896193e-05, "loss": 0.9226, "mean_token_accuracy": 0.7670198388397693, "num_tokens": 281698143.0, "step": 1610 }, { "epoch": 0.4414995997206561, "grad_norm": 0.826994776725769, "learning_rate": 4.018120522545302e-05, "loss": 0.9348, "mean_token_accuracy": 0.7642012946307659, "num_tokens": 283412565.0, "step": 1620 }, { "epoch": 0.44422490589177127, "grad_norm": 0.9012675881385803, "learning_rate": 4.011097064194409e-05, "loss": 0.9413, "mean_token_accuracy": 0.7626177223399282, "num_tokens": 285184244.0, "step": 1630 }, { "epoch": 0.44695021206288643, "grad_norm": 0.7998985648155212, "learning_rate": 4.004073605843517e-05, "loss": 0.8893, "mean_token_accuracy": 0.7731544019654393, "num_tokens": 286963226.0, "step": 1640 }, { "epoch": 0.4496755182340016, "grad_norm": 0.894074559211731, "learning_rate": 3.997050147492625e-05, "loss": 0.9101, "mean_token_accuracy": 0.7684361206367611, "num_tokens": 288742524.0, "step": 1650 }, { "epoch": 0.45240082440511675, "grad_norm": 0.9063016176223755, "learning_rate": 3.990026689141734e-05, "loss": 0.9408, "mean_token_accuracy": 0.7627444183453918, "num_tokens": 290486439.0, "step": 1660 }, { "epoch": 0.4551261305762319, "grad_norm": 0.9253380298614502, "learning_rate": 3.9830032307908413e-05, "loss": 0.9489, "mean_token_accuracy": 0.761582453828305, "num_tokens": 292244616.0, "step": 1670 }, { "epoch": 0.4578514367473471, "grad_norm": 0.908446192741394, "learning_rate": 3.9759797724399494e-05, "loss": 0.9435, "mean_token_accuracy": 0.7627298279665411, "num_tokens": 294028863.0, "step": 1680 }, { "epoch": 0.46057674291846223, "grad_norm": 0.8264852166175842, "learning_rate": 3.968956314089058e-05, "loss": 0.9158, "mean_token_accuracy": 0.765919309295714, "num_tokens": 295842660.0, "step": 1690 }, { "epoch": 0.4633020490895774, "grad_norm": 0.8423106074333191, "learning_rate": 3.961932855738166e-05, "loss": 0.9214, "mean_token_accuracy": 0.767298517934978, "num_tokens": 297606076.0, "step": 1700 }, { "epoch": 0.46602735526069256, "grad_norm": 0.8728176355361938, "learning_rate": 3.9549093973872734e-05, "loss": 0.9501, "mean_token_accuracy": 0.7602741612121463, "num_tokens": 299367632.0, "step": 1710 }, { "epoch": 0.4687526614318077, "grad_norm": 0.7749947309494019, "learning_rate": 3.9478859390363814e-05, "loss": 0.9076, "mean_token_accuracy": 0.770167650654912, "num_tokens": 301097990.0, "step": 1720 }, { "epoch": 0.4714779676029229, "grad_norm": 0.9254289865493774, "learning_rate": 3.94086248068549e-05, "loss": 0.9142, "mean_token_accuracy": 0.7686513620428741, "num_tokens": 302847308.0, "step": 1730 }, { "epoch": 0.47420327377403804, "grad_norm": 0.870252788066864, "learning_rate": 3.9338390223345975e-05, "loss": 0.9356, "mean_token_accuracy": 0.7628345469944179, "num_tokens": 304582625.0, "step": 1740 }, { "epoch": 0.4769285799451532, "grad_norm": 0.845609188079834, "learning_rate": 3.9268155639837055e-05, "loss": 0.9179, "mean_token_accuracy": 0.7681567031890154, "num_tokens": 306358206.0, "step": 1750 }, { "epoch": 0.47965388611626836, "grad_norm": 0.8974276185035706, "learning_rate": 3.9197921056328135e-05, "loss": 0.9172, "mean_token_accuracy": 0.7672481311485171, "num_tokens": 308026562.0, "step": 1760 }, { "epoch": 0.4823791922873835, "grad_norm": 0.9142021536827087, "learning_rate": 3.912768647281922e-05, "loss": 0.9492, "mean_token_accuracy": 0.7608077014796436, "num_tokens": 309804543.0, "step": 1770 }, { "epoch": 0.4851044984584987, "grad_norm": 0.8676534295082092, "learning_rate": 3.9057451889310296e-05, "loss": 0.9406, "mean_token_accuracy": 0.7620649378746748, "num_tokens": 311549771.0, "step": 1780 }, { "epoch": 0.48782980462961384, "grad_norm": 0.9465579986572266, "learning_rate": 3.8987217305801376e-05, "loss": 0.9559, "mean_token_accuracy": 0.7597391474992037, "num_tokens": 313326889.0, "step": 1790 }, { "epoch": 0.490555110800729, "grad_norm": 1.1392822265625, "learning_rate": 3.891698272229246e-05, "loss": 0.9123, "mean_token_accuracy": 0.7693880930542946, "num_tokens": 315064715.0, "step": 1800 }, { "epoch": 0.49328041697184416, "grad_norm": 0.8468901515007019, "learning_rate": 3.884674813878354e-05, "loss": 0.9293, "mean_token_accuracy": 0.7640517603605985, "num_tokens": 316809509.0, "step": 1810 }, { "epoch": 0.4960057231429593, "grad_norm": 0.831238865852356, "learning_rate": 3.8776513555274616e-05, "loss": 0.9097, "mean_token_accuracy": 0.7691411901265383, "num_tokens": 318597481.0, "step": 1820 }, { "epoch": 0.4987310293140745, "grad_norm": 0.8473038077354431, "learning_rate": 3.8706278971765697e-05, "loss": 0.9099, "mean_token_accuracy": 0.7685648757964373, "num_tokens": 320382863.0, "step": 1830 }, { "epoch": 0.5014563354851896, "grad_norm": 0.8610597848892212, "learning_rate": 3.8636044388256784e-05, "loss": 0.9255, "mean_token_accuracy": 0.7672168118879199, "num_tokens": 322136431.0, "step": 1840 }, { "epoch": 0.5041816416563049, "grad_norm": 0.8556321859359741, "learning_rate": 3.856580980474786e-05, "loss": 0.9108, "mean_token_accuracy": 0.7693611716851592, "num_tokens": 323892089.0, "step": 1850 }, { "epoch": 0.50690694782742, "grad_norm": 0.8000465631484985, "learning_rate": 3.849557522123894e-05, "loss": 0.9198, "mean_token_accuracy": 0.7665763789787888, "num_tokens": 325693457.0, "step": 1860 }, { "epoch": 0.5096322539985352, "grad_norm": 0.9535455107688904, "learning_rate": 3.842534063773002e-05, "loss": 0.8953, "mean_token_accuracy": 0.7704567154869437, "num_tokens": 327397884.0, "step": 1870 }, { "epoch": 0.5123575601696503, "grad_norm": 0.9103294610977173, "learning_rate": 3.8355106054221104e-05, "loss": 0.9415, "mean_token_accuracy": 0.7628723161295057, "num_tokens": 329077502.0, "step": 1880 }, { "epoch": 0.5150828663407655, "grad_norm": 0.8798259496688843, "learning_rate": 3.828487147071218e-05, "loss": 0.9294, "mean_token_accuracy": 0.7647034784778952, "num_tokens": 330847695.0, "step": 1890 }, { "epoch": 0.5178081725118806, "grad_norm": 0.8437985181808472, "learning_rate": 3.821463688720326e-05, "loss": 0.9127, "mean_token_accuracy": 0.7662377031520009, "num_tokens": 332578366.0, "step": 1900 }, { "epoch": 0.5205334786829958, "grad_norm": 0.877572774887085, "learning_rate": 3.8144402303694345e-05, "loss": 0.9395, "mean_token_accuracy": 0.7628075636923313, "num_tokens": 334293589.0, "step": 1910 }, { "epoch": 0.5232587848541109, "grad_norm": 0.7866714596748352, "learning_rate": 3.8074167720185425e-05, "loss": 0.9268, "mean_token_accuracy": 0.7632251624017954, "num_tokens": 336016972.0, "step": 1920 }, { "epoch": 0.5259840910252261, "grad_norm": 0.8031445741653442, "learning_rate": 3.80039331366765e-05, "loss": 0.9032, "mean_token_accuracy": 0.7699568081647158, "num_tokens": 337805977.0, "step": 1930 }, { "epoch": 0.5287093971963412, "grad_norm": 0.7914207577705383, "learning_rate": 3.793369855316758e-05, "loss": 0.8779, "mean_token_accuracy": 0.7737740287557244, "num_tokens": 339548221.0, "step": 1940 }, { "epoch": 0.5314347033674565, "grad_norm": 0.8687313795089722, "learning_rate": 3.7863463969658666e-05, "loss": 0.9549, "mean_token_accuracy": 0.7590207350440323, "num_tokens": 341268344.0, "step": 1950 }, { "epoch": 0.5341600095385716, "grad_norm": 0.9336804747581482, "learning_rate": 3.779322938614974e-05, "loss": 0.9221, "mean_token_accuracy": 0.765609790198505, "num_tokens": 343017522.0, "step": 1960 }, { "epoch": 0.5368853157096868, "grad_norm": 0.9002737998962402, "learning_rate": 3.772299480264082e-05, "loss": 0.9181, "mean_token_accuracy": 0.7670193919911981, "num_tokens": 344699192.0, "step": 1970 }, { "epoch": 0.5396106218808019, "grad_norm": 0.8401851654052734, "learning_rate": 3.7652760219131906e-05, "loss": 0.9043, "mean_token_accuracy": 0.7700289068743587, "num_tokens": 346458527.0, "step": 1980 }, { "epoch": 0.5423359280519171, "grad_norm": 0.8544691801071167, "learning_rate": 3.7582525635622986e-05, "loss": 0.9203, "mean_token_accuracy": 0.7662555683404207, "num_tokens": 348204585.0, "step": 1990 }, { "epoch": 0.5450612342230322, "grad_norm": 0.8688130378723145, "learning_rate": 3.751229105211406e-05, "loss": 0.9467, "mean_token_accuracy": 0.7613058963790535, "num_tokens": 349928259.0, "step": 2000 }, { "epoch": 0.5477865403941474, "grad_norm": 0.856677234172821, "learning_rate": 3.744205646860514e-05, "loss": 0.9124, "mean_token_accuracy": 0.767791903577745, "num_tokens": 351568018.0, "step": 2010 }, { "epoch": 0.5505118465652625, "grad_norm": 0.8962033987045288, "learning_rate": 3.737182188509623e-05, "loss": 0.8993, "mean_token_accuracy": 0.7711729943752289, "num_tokens": 353325716.0, "step": 2020 }, { "epoch": 0.5532371527363777, "grad_norm": 0.8988960385322571, "learning_rate": 3.730158730158731e-05, "loss": 0.9338, "mean_token_accuracy": 0.7638613487593829, "num_tokens": 355099655.0, "step": 2030 }, { "epoch": 0.5559624589074929, "grad_norm": 0.7775430083274841, "learning_rate": 3.723135271807838e-05, "loss": 0.9109, "mean_token_accuracy": 0.7683175005018711, "num_tokens": 356892448.0, "step": 2040 }, { "epoch": 0.5586877650786081, "grad_norm": 0.8810161352157593, "learning_rate": 3.716111813456946e-05, "loss": 0.9376, "mean_token_accuracy": 0.7628290761262179, "num_tokens": 358599855.0, "step": 2050 }, { "epoch": 0.5614130712497232, "grad_norm": 0.8723873496055603, "learning_rate": 3.709088355106055e-05, "loss": 0.9433, "mean_token_accuracy": 0.7625115603208542, "num_tokens": 360366528.0, "step": 2060 }, { "epoch": 0.5641383774208384, "grad_norm": 0.9130221605300903, "learning_rate": 3.702064896755162e-05, "loss": 0.915, "mean_token_accuracy": 0.7676266288384795, "num_tokens": 362123903.0, "step": 2070 }, { "epoch": 0.5668636835919535, "grad_norm": 0.7740192413330078, "learning_rate": 3.69504143840427e-05, "loss": 0.8998, "mean_token_accuracy": 0.7719444134272635, "num_tokens": 363907270.0, "step": 2080 }, { "epoch": 0.5695889897630687, "grad_norm": 0.931222677230835, "learning_rate": 3.688017980053379e-05, "loss": 0.8834, "mean_token_accuracy": 0.772805467247963, "num_tokens": 365602883.0, "step": 2090 }, { "epoch": 0.5723142959341838, "grad_norm": 0.9044320583343506, "learning_rate": 3.680994521702487e-05, "loss": 0.9081, "mean_token_accuracy": 0.7687753444537521, "num_tokens": 367308921.0, "step": 2100 }, { "epoch": 0.575039602105299, "grad_norm": 0.8750004768371582, "learning_rate": 3.673971063351594e-05, "loss": 0.909, "mean_token_accuracy": 0.7677365321666002, "num_tokens": 369068618.0, "step": 2110 }, { "epoch": 0.5777649082764141, "grad_norm": 0.836656391620636, "learning_rate": 3.666947605000702e-05, "loss": 0.9237, "mean_token_accuracy": 0.7663044361397624, "num_tokens": 370792272.0, "step": 2120 }, { "epoch": 0.5804902144475294, "grad_norm": 0.9042032361030579, "learning_rate": 3.659924146649811e-05, "loss": 0.9282, "mean_token_accuracy": 0.7646197730675339, "num_tokens": 372506541.0, "step": 2130 }, { "epoch": 0.5832155206186445, "grad_norm": 0.8443713784217834, "learning_rate": 3.652900688298919e-05, "loss": 0.8791, "mean_token_accuracy": 0.7750100614503026, "num_tokens": 374308542.0, "step": 2140 }, { "epoch": 0.5859408267897597, "grad_norm": 0.840281069278717, "learning_rate": 3.645877229948026e-05, "loss": 0.8876, "mean_token_accuracy": 0.7726960835978389, "num_tokens": 376016266.0, "step": 2150 }, { "epoch": 0.5886661329608748, "grad_norm": 0.80794757604599, "learning_rate": 3.638853771597134e-05, "loss": 0.9308, "mean_token_accuracy": 0.7654405352659523, "num_tokens": 377732713.0, "step": 2160 }, { "epoch": 0.59139143913199, "grad_norm": 0.9291797280311584, "learning_rate": 3.631830313246243e-05, "loss": 0.9172, "mean_token_accuracy": 0.76725367102772, "num_tokens": 379457927.0, "step": 2170 }, { "epoch": 0.5941167453031051, "grad_norm": 0.7284071445465088, "learning_rate": 3.6248068548953503e-05, "loss": 0.9533, "mean_token_accuracy": 0.7596265442669392, "num_tokens": 381311531.0, "step": 2180 }, { "epoch": 0.5968420514742203, "grad_norm": 0.8543251156806946, "learning_rate": 3.6177833965444584e-05, "loss": 0.9449, "mean_token_accuracy": 0.7630881817080081, "num_tokens": 383076036.0, "step": 2190 }, { "epoch": 0.5995673576453354, "grad_norm": 0.8819230198860168, "learning_rate": 3.610759938193567e-05, "loss": 0.9396, "mean_token_accuracy": 0.7628208780661225, "num_tokens": 384869981.0, "step": 2200 }, { "epoch": 0.6022926638164506, "grad_norm": 0.9766581058502197, "learning_rate": 3.603736479842675e-05, "loss": 0.9203, "mean_token_accuracy": 0.7657232660800218, "num_tokens": 386654361.0, "step": 2210 }, { "epoch": 0.6050179699875657, "grad_norm": 0.8702138662338257, "learning_rate": 3.5967130214917824e-05, "loss": 0.8969, "mean_token_accuracy": 0.7711658507585526, "num_tokens": 388405388.0, "step": 2220 }, { "epoch": 0.607743276158681, "grad_norm": 0.8681634068489075, "learning_rate": 3.5896895631408904e-05, "loss": 0.928, "mean_token_accuracy": 0.7651922093704343, "num_tokens": 390160411.0, "step": 2230 }, { "epoch": 0.6104685823297961, "grad_norm": 0.8256725668907166, "learning_rate": 3.582666104789999e-05, "loss": 0.9334, "mean_token_accuracy": 0.7633168527856469, "num_tokens": 391963393.0, "step": 2240 }, { "epoch": 0.6131938885009113, "grad_norm": 0.8524363040924072, "learning_rate": 3.575642646439107e-05, "loss": 0.9321, "mean_token_accuracy": 0.7641467567533254, "num_tokens": 393719494.0, "step": 2250 }, { "epoch": 0.6159191946720264, "grad_norm": 0.8173193335533142, "learning_rate": 3.5686191880882145e-05, "loss": 0.93, "mean_token_accuracy": 0.7653383906930685, "num_tokens": 395419867.0, "step": 2260 }, { "epoch": 0.6186445008431416, "grad_norm": 0.7731173634529114, "learning_rate": 3.5615957297373225e-05, "loss": 0.9061, "mean_token_accuracy": 0.7694511445239186, "num_tokens": 397164636.0, "step": 2270 }, { "epoch": 0.6213698070142567, "grad_norm": 0.9051828980445862, "learning_rate": 3.554572271386431e-05, "loss": 0.9409, "mean_token_accuracy": 0.761752724274993, "num_tokens": 398897880.0, "step": 2280 }, { "epoch": 0.6240951131853719, "grad_norm": 0.9036754965782166, "learning_rate": 3.5475488130355386e-05, "loss": 0.9232, "mean_token_accuracy": 0.7656918304041028, "num_tokens": 400668616.0, "step": 2290 }, { "epoch": 0.6268204193564871, "grad_norm": 0.8142061233520508, "learning_rate": 3.5405253546846466e-05, "loss": 0.915, "mean_token_accuracy": 0.7672401297837496, "num_tokens": 402379710.0, "step": 2300 }, { "epoch": 0.6295457255276022, "grad_norm": 0.8594102263450623, "learning_rate": 3.533501896333755e-05, "loss": 0.9119, "mean_token_accuracy": 0.7676404159516096, "num_tokens": 404084345.0, "step": 2310 }, { "epoch": 0.6322710316987175, "grad_norm": 0.8828473091125488, "learning_rate": 3.526478437982863e-05, "loss": 0.8836, "mean_token_accuracy": 0.7746170441620052, "num_tokens": 405770321.0, "step": 2320 }, { "epoch": 0.6349963378698326, "grad_norm": 0.8342154026031494, "learning_rate": 3.5194549796319706e-05, "loss": 0.9064, "mean_token_accuracy": 0.7688289485871792, "num_tokens": 407492020.0, "step": 2330 }, { "epoch": 0.6377216440409478, "grad_norm": 0.8321424722671509, "learning_rate": 3.5124315212810787e-05, "loss": 0.9023, "mean_token_accuracy": 0.7712408562190831, "num_tokens": 409257115.0, "step": 2340 }, { "epoch": 0.6404469502120629, "grad_norm": 0.8814114332199097, "learning_rate": 3.5054080629301874e-05, "loss": 0.9279, "mean_token_accuracy": 0.7646240370348096, "num_tokens": 410986071.0, "step": 2350 }, { "epoch": 0.6431722563831781, "grad_norm": 0.8669275045394897, "learning_rate": 3.4983846045792954e-05, "loss": 0.9115, "mean_token_accuracy": 0.7685579599812626, "num_tokens": 412763897.0, "step": 2360 }, { "epoch": 0.6458975625542932, "grad_norm": 0.8246127367019653, "learning_rate": 3.491361146228403e-05, "loss": 0.8796, "mean_token_accuracy": 0.7756225952878595, "num_tokens": 414503168.0, "step": 2370 }, { "epoch": 0.6486228687254084, "grad_norm": 0.811538577079773, "learning_rate": 3.484337687877511e-05, "loss": 0.9046, "mean_token_accuracy": 0.7706884844228625, "num_tokens": 416227145.0, "step": 2380 }, { "epoch": 0.6513481748965235, "grad_norm": 0.8389571309089661, "learning_rate": 3.4773142295266194e-05, "loss": 0.9023, "mean_token_accuracy": 0.7694288929924369, "num_tokens": 417973040.0, "step": 2390 }, { "epoch": 0.6540734810676387, "grad_norm": 0.9339456558227539, "learning_rate": 3.470290771175727e-05, "loss": 0.9104, "mean_token_accuracy": 0.7684263320639729, "num_tokens": 419736780.0, "step": 2400 }, { "epoch": 0.6567987872387538, "grad_norm": 0.7914407253265381, "learning_rate": 3.463267312824835e-05, "loss": 0.9007, "mean_token_accuracy": 0.7712036456912756, "num_tokens": 421516023.0, "step": 2410 }, { "epoch": 0.6595240934098691, "grad_norm": 0.8769999146461487, "learning_rate": 3.4562438544739435e-05, "loss": 0.9433, "mean_token_accuracy": 0.7634194398298859, "num_tokens": 423324296.0, "step": 2420 }, { "epoch": 0.6622493995809842, "grad_norm": 0.9201934337615967, "learning_rate": 3.4492203961230515e-05, "loss": 0.9272, "mean_token_accuracy": 0.7628090158104897, "num_tokens": 425114987.0, "step": 2430 }, { "epoch": 0.6649747057520994, "grad_norm": 0.833123505115509, "learning_rate": 3.442196937772159e-05, "loss": 0.8962, "mean_token_accuracy": 0.7704461555927992, "num_tokens": 426875751.0, "step": 2440 }, { "epoch": 0.6677000119232145, "grad_norm": 0.84730464220047, "learning_rate": 3.435173479421267e-05, "loss": 0.9062, "mean_token_accuracy": 0.7695771548897028, "num_tokens": 428715433.0, "step": 2450 }, { "epoch": 0.6704253180943297, "grad_norm": 0.9411203265190125, "learning_rate": 3.4281500210703756e-05, "loss": 0.9435, "mean_token_accuracy": 0.761421543173492, "num_tokens": 430443301.0, "step": 2460 }, { "epoch": 0.6731506242654448, "grad_norm": 0.8246670365333557, "learning_rate": 3.4211265627194836e-05, "loss": 0.8701, "mean_token_accuracy": 0.7766519106924534, "num_tokens": 432149898.0, "step": 2470 }, { "epoch": 0.67587593043656, "grad_norm": 0.8873474597930908, "learning_rate": 3.414103104368591e-05, "loss": 0.8906, "mean_token_accuracy": 0.7729644425213337, "num_tokens": 433904775.0, "step": 2480 }, { "epoch": 0.6786012366076751, "grad_norm": 0.8719823956489563, "learning_rate": 3.407079646017699e-05, "loss": 0.8915, "mean_token_accuracy": 0.7718378273770213, "num_tokens": 435648944.0, "step": 2490 }, { "epoch": 0.6813265427787903, "grad_norm": 0.8626745343208313, "learning_rate": 3.4000561876668076e-05, "loss": 0.9286, "mean_token_accuracy": 0.7654551040381193, "num_tokens": 437390122.0, "step": 2500 }, { "epoch": 0.6840518489499054, "grad_norm": 0.8651098608970642, "learning_rate": 3.393032729315915e-05, "loss": 0.9176, "mean_token_accuracy": 0.767458095960319, "num_tokens": 439126311.0, "step": 2510 }, { "epoch": 0.6867771551210207, "grad_norm": 0.82646644115448, "learning_rate": 3.386009270965023e-05, "loss": 0.9056, "mean_token_accuracy": 0.7701940412633121, "num_tokens": 440890933.0, "step": 2520 }, { "epoch": 0.6895024612921358, "grad_norm": 0.9855936169624329, "learning_rate": 3.378985812614132e-05, "loss": 0.8948, "mean_token_accuracy": 0.7717040576040745, "num_tokens": 442611770.0, "step": 2530 }, { "epoch": 0.692227767463251, "grad_norm": 0.8282904624938965, "learning_rate": 3.37196235426324e-05, "loss": 0.8981, "mean_token_accuracy": 0.7711870735511184, "num_tokens": 444358166.0, "step": 2540 }, { "epoch": 0.6949530736343661, "grad_norm": 0.8497506380081177, "learning_rate": 3.364938895912347e-05, "loss": 0.9359, "mean_token_accuracy": 0.7633687354624271, "num_tokens": 446078790.0, "step": 2550 }, { "epoch": 0.6976783798054813, "grad_norm": 0.9366584420204163, "learning_rate": 3.357915437561455e-05, "loss": 0.9024, "mean_token_accuracy": 0.769352519325912, "num_tokens": 447811186.0, "step": 2560 }, { "epoch": 0.7004036859765964, "grad_norm": 0.8402838110923767, "learning_rate": 3.350891979210564e-05, "loss": 0.9157, "mean_token_accuracy": 0.7674468743614853, "num_tokens": 449537962.0, "step": 2570 }, { "epoch": 0.7031289921477116, "grad_norm": 0.8306329250335693, "learning_rate": 3.343868520859672e-05, "loss": 0.8868, "mean_token_accuracy": 0.7735649706795812, "num_tokens": 451259469.0, "step": 2580 }, { "epoch": 0.7058542983188267, "grad_norm": 0.8826439380645752, "learning_rate": 3.336845062508779e-05, "loss": 0.8963, "mean_token_accuracy": 0.7722868470475077, "num_tokens": 453041146.0, "step": 2590 }, { "epoch": 0.708579604489942, "grad_norm": 0.8931313157081604, "learning_rate": 3.329821604157887e-05, "loss": 0.9309, "mean_token_accuracy": 0.7645103015005589, "num_tokens": 454753651.0, "step": 2600 }, { "epoch": 0.711304910661057, "grad_norm": 0.8628774285316467, "learning_rate": 3.322798145806996e-05, "loss": 0.8983, "mean_token_accuracy": 0.7708307450637222, "num_tokens": 456537057.0, "step": 2610 }, { "epoch": 0.7140302168321723, "grad_norm": 0.8462346196174622, "learning_rate": 3.315774687456103e-05, "loss": 0.9424, "mean_token_accuracy": 0.7633946686983109, "num_tokens": 458271296.0, "step": 2620 }, { "epoch": 0.7167555230032874, "grad_norm": 0.7972195744514465, "learning_rate": 3.308751229105211e-05, "loss": 0.9151, "mean_token_accuracy": 0.7673410098999739, "num_tokens": 460014932.0, "step": 2630 }, { "epoch": 0.7194808291744026, "grad_norm": 0.912936270236969, "learning_rate": 3.30172777075432e-05, "loss": 0.9266, "mean_token_accuracy": 0.7656251262873411, "num_tokens": 461818480.0, "step": 2640 }, { "epoch": 0.7222061353455177, "grad_norm": 0.9048453569412231, "learning_rate": 3.294704312403428e-05, "loss": 0.9464, "mean_token_accuracy": 0.7605526108294726, "num_tokens": 463519993.0, "step": 2650 }, { "epoch": 0.7249314415166329, "grad_norm": 0.947562038898468, "learning_rate": 3.287680854052535e-05, "loss": 0.9135, "mean_token_accuracy": 0.7671187071129679, "num_tokens": 465333101.0, "step": 2660 }, { "epoch": 0.727656747687748, "grad_norm": 0.8569560050964355, "learning_rate": 3.280657395701643e-05, "loss": 0.9084, "mean_token_accuracy": 0.76853600833565, "num_tokens": 467047198.0, "step": 2670 }, { "epoch": 0.7303820538588632, "grad_norm": 0.8648985624313354, "learning_rate": 3.273633937350752e-05, "loss": 0.9213, "mean_token_accuracy": 0.766769905667752, "num_tokens": 468806711.0, "step": 2680 }, { "epoch": 0.7331073600299783, "grad_norm": 0.8183128237724304, "learning_rate": 3.26661047899986e-05, "loss": 0.8789, "mean_token_accuracy": 0.7748562408611178, "num_tokens": 470600782.0, "step": 2690 }, { "epoch": 0.7358326662010936, "grad_norm": 0.8345265984535217, "learning_rate": 3.2595870206489674e-05, "loss": 0.9198, "mean_token_accuracy": 0.767015983723104, "num_tokens": 472349330.0, "step": 2700 }, { "epoch": 0.7385579723722087, "grad_norm": 0.836322009563446, "learning_rate": 3.2525635622980754e-05, "loss": 0.907, "mean_token_accuracy": 0.7705251289531588, "num_tokens": 474106983.0, "step": 2710 }, { "epoch": 0.7412832785433239, "grad_norm": 0.8931109309196472, "learning_rate": 3.245540103947184e-05, "loss": 0.939, "mean_token_accuracy": 0.7626953650265932, "num_tokens": 475902407.0, "step": 2720 }, { "epoch": 0.744008584714439, "grad_norm": 0.8067692518234253, "learning_rate": 3.2385166455962914e-05, "loss": 0.8629, "mean_token_accuracy": 0.7779960019513965, "num_tokens": 477654644.0, "step": 2730 }, { "epoch": 0.7467338908855542, "grad_norm": 0.8653134107589722, "learning_rate": 3.2314931872453994e-05, "loss": 0.9292, "mean_token_accuracy": 0.764538905583322, "num_tokens": 479409078.0, "step": 2740 }, { "epoch": 0.7494591970566693, "grad_norm": 0.9657193422317505, "learning_rate": 3.224469728894508e-05, "loss": 0.9136, "mean_token_accuracy": 0.767500625923276, "num_tokens": 481154972.0, "step": 2750 }, { "epoch": 0.7521845032277845, "grad_norm": 0.9113588929176331, "learning_rate": 3.217446270543616e-05, "loss": 0.9395, "mean_token_accuracy": 0.7628454895690083, "num_tokens": 482957554.0, "step": 2760 }, { "epoch": 0.7549098093988996, "grad_norm": 0.8665691614151001, "learning_rate": 3.2104228121927235e-05, "loss": 0.9139, "mean_token_accuracy": 0.7677292965352536, "num_tokens": 484730289.0, "step": 2770 }, { "epoch": 0.7576351155700148, "grad_norm": 0.9618145227432251, "learning_rate": 3.2033993538418315e-05, "loss": 0.9206, "mean_token_accuracy": 0.7647297469899058, "num_tokens": 486390658.0, "step": 2780 }, { "epoch": 0.7603604217411299, "grad_norm": 0.9165218472480774, "learning_rate": 3.19637589549094e-05, "loss": 0.8822, "mean_token_accuracy": 0.7746361203491687, "num_tokens": 488084357.0, "step": 2790 }, { "epoch": 0.7630857279122452, "grad_norm": 0.9210852980613708, "learning_rate": 3.189352437140048e-05, "loss": 0.9589, "mean_token_accuracy": 0.7591536954045296, "num_tokens": 489867504.0, "step": 2800 }, { "epoch": 0.7658110340833603, "grad_norm": 0.8209601044654846, "learning_rate": 3.1823289787891556e-05, "loss": 0.926, "mean_token_accuracy": 0.7646390557289123, "num_tokens": 491614120.0, "step": 2810 }, { "epoch": 0.7685363402544755, "grad_norm": 0.8096806406974792, "learning_rate": 3.1753055204382636e-05, "loss": 0.9237, "mean_token_accuracy": 0.7666580315679312, "num_tokens": 493360108.0, "step": 2820 }, { "epoch": 0.7712616464255906, "grad_norm": 0.8601098656654358, "learning_rate": 3.168282062087372e-05, "loss": 0.9003, "mean_token_accuracy": 0.7720298429951071, "num_tokens": 495060867.0, "step": 2830 }, { "epoch": 0.7739869525967058, "grad_norm": 0.896539568901062, "learning_rate": 3.16125860373648e-05, "loss": 0.9224, "mean_token_accuracy": 0.7646079597994685, "num_tokens": 496731129.0, "step": 2840 }, { "epoch": 0.7767122587678209, "grad_norm": 0.7847779393196106, "learning_rate": 3.1542351453855877e-05, "loss": 0.8955, "mean_token_accuracy": 0.7723177457228303, "num_tokens": 498416929.0, "step": 2850 }, { "epoch": 0.7794375649389361, "grad_norm": 0.9052691459655762, "learning_rate": 3.1472116870346964e-05, "loss": 0.8798, "mean_token_accuracy": 0.7750480374321341, "num_tokens": 500131030.0, "step": 2860 }, { "epoch": 0.7821628711100512, "grad_norm": 0.7930359244346619, "learning_rate": 3.1401882286838044e-05, "loss": 0.8905, "mean_token_accuracy": 0.7722199564799667, "num_tokens": 501860254.0, "step": 2870 }, { "epoch": 0.7848881772811664, "grad_norm": 0.89003986120224, "learning_rate": 3.133164770332912e-05, "loss": 0.9411, "mean_token_accuracy": 0.7619166776537896, "num_tokens": 503599791.0, "step": 2880 }, { "epoch": 0.7876134834522815, "grad_norm": 0.8397514224052429, "learning_rate": 3.12614131198202e-05, "loss": 0.96, "mean_token_accuracy": 0.7589967941865325, "num_tokens": 505277304.0, "step": 2890 }, { "epoch": 0.7903387896233968, "grad_norm": 0.9073812365531921, "learning_rate": 3.1191178536311284e-05, "loss": 0.9119, "mean_token_accuracy": 0.767801726795733, "num_tokens": 507010506.0, "step": 2900 }, { "epoch": 0.7930640957945119, "grad_norm": 0.8646041750907898, "learning_rate": 3.1120943952802364e-05, "loss": 0.8999, "mean_token_accuracy": 0.7706972565501928, "num_tokens": 508717678.0, "step": 2910 }, { "epoch": 0.7957894019656271, "grad_norm": 0.8266422748565674, "learning_rate": 3.105070936929344e-05, "loss": 0.878, "mean_token_accuracy": 0.7762510352768004, "num_tokens": 510478982.0, "step": 2920 }, { "epoch": 0.7985147081367422, "grad_norm": 0.8507207036018372, "learning_rate": 3.0980474785784525e-05, "loss": 0.8742, "mean_token_accuracy": 0.774853196553886, "num_tokens": 512223492.0, "step": 2930 }, { "epoch": 0.8012400143078574, "grad_norm": 0.8072352409362793, "learning_rate": 3.0910240202275605e-05, "loss": 0.8996, "mean_token_accuracy": 0.770034134015441, "num_tokens": 513950826.0, "step": 2940 }, { "epoch": 0.8039653204789725, "grad_norm": 0.88204026222229, "learning_rate": 3.0840005618766685e-05, "loss": 0.9293, "mean_token_accuracy": 0.7653471390716732, "num_tokens": 515747512.0, "step": 2950 }, { "epoch": 0.8066906266500877, "grad_norm": 0.8134139180183411, "learning_rate": 3.076977103525776e-05, "loss": 0.8995, "mean_token_accuracy": 0.7705760426819325, "num_tokens": 517564408.0, "step": 2960 }, { "epoch": 0.8094159328212028, "grad_norm": 0.810004711151123, "learning_rate": 3.0699536451748846e-05, "loss": 0.8433, "mean_token_accuracy": 0.7816835820674897, "num_tokens": 519335113.0, "step": 2970 }, { "epoch": 0.812141238992318, "grad_norm": 0.8259313106536865, "learning_rate": 3.0629301868239926e-05, "loss": 0.8837, "mean_token_accuracy": 0.7741564935073256, "num_tokens": 521052694.0, "step": 2980 }, { "epoch": 0.8148665451634332, "grad_norm": 0.8008886575698853, "learning_rate": 3.0559067284731e-05, "loss": 0.9214, "mean_token_accuracy": 0.7659341244027018, "num_tokens": 522838886.0, "step": 2990 }, { "epoch": 0.8175918513345484, "grad_norm": 0.7934632897377014, "learning_rate": 3.048883270122208e-05, "loss": 0.9041, "mean_token_accuracy": 0.7696094507351517, "num_tokens": 524659688.0, "step": 3000 }, { "epoch": 0.8203171575056635, "grad_norm": 0.839688241481781, "learning_rate": 3.0418598117713166e-05, "loss": 0.8899, "mean_token_accuracy": 0.7726705798879265, "num_tokens": 526428828.0, "step": 3010 }, { "epoch": 0.8230424636767787, "grad_norm": 0.8166800737380981, "learning_rate": 3.0348363534204243e-05, "loss": 0.9119, "mean_token_accuracy": 0.7672878581099212, "num_tokens": 528178357.0, "step": 3020 }, { "epoch": 0.8257677698478938, "grad_norm": 0.8090632557868958, "learning_rate": 3.0278128950695323e-05, "loss": 0.8927, "mean_token_accuracy": 0.7714125035330653, "num_tokens": 529968661.0, "step": 3030 }, { "epoch": 0.828493076019009, "grad_norm": 0.9141647219657898, "learning_rate": 3.0207894367186407e-05, "loss": 0.9022, "mean_token_accuracy": 0.7716053297743202, "num_tokens": 531735846.0, "step": 3040 }, { "epoch": 0.8312183821901241, "grad_norm": 0.8853212594985962, "learning_rate": 3.0137659783677484e-05, "loss": 0.923, "mean_token_accuracy": 0.7664798295125366, "num_tokens": 533482605.0, "step": 3050 }, { "epoch": 0.8339436883612393, "grad_norm": 0.8621941804885864, "learning_rate": 3.0067425200168564e-05, "loss": 0.9064, "mean_token_accuracy": 0.7699273196980357, "num_tokens": 535172117.0, "step": 3060 }, { "epoch": 0.8366689945323545, "grad_norm": 0.840312123298645, "learning_rate": 2.9997190616659644e-05, "loss": 0.8953, "mean_token_accuracy": 0.7715860839933157, "num_tokens": 536922982.0, "step": 3070 }, { "epoch": 0.8393943007034697, "grad_norm": 0.863222062587738, "learning_rate": 2.9926956033150728e-05, "loss": 0.906, "mean_token_accuracy": 0.7695185262709856, "num_tokens": 538697561.0, "step": 3080 }, { "epoch": 0.8421196068745849, "grad_norm": 0.925609290599823, "learning_rate": 2.9856721449641805e-05, "loss": 0.9123, "mean_token_accuracy": 0.7679164627566933, "num_tokens": 540498809.0, "step": 3090 }, { "epoch": 0.8448449130457, "grad_norm": 0.8675920367240906, "learning_rate": 2.9786486866132885e-05, "loss": 0.882, "mean_token_accuracy": 0.7736795306205749, "num_tokens": 542220573.0, "step": 3100 }, { "epoch": 0.8475702192168152, "grad_norm": 0.8793986439704895, "learning_rate": 2.971625228262396e-05, "loss": 0.9003, "mean_token_accuracy": 0.770465992949903, "num_tokens": 543995979.0, "step": 3110 }, { "epoch": 0.8502955253879303, "grad_norm": 0.8559759259223938, "learning_rate": 2.964601769911505e-05, "loss": 0.9159, "mean_token_accuracy": 0.7657374052330852, "num_tokens": 545762046.0, "step": 3120 }, { "epoch": 0.8530208315590455, "grad_norm": 0.7674428224563599, "learning_rate": 2.9575783115606125e-05, "loss": 0.8943, "mean_token_accuracy": 0.7716390110552311, "num_tokens": 547563120.0, "step": 3130 }, { "epoch": 0.8557461377301606, "grad_norm": 0.8808310627937317, "learning_rate": 2.9505548532097206e-05, "loss": 0.9121, "mean_token_accuracy": 0.7677836719900369, "num_tokens": 549276605.0, "step": 3140 }, { "epoch": 0.8584714439012758, "grad_norm": 0.8529791235923767, "learning_rate": 2.943531394858829e-05, "loss": 0.901, "mean_token_accuracy": 0.7700381241738796, "num_tokens": 550991479.0, "step": 3150 }, { "epoch": 0.8611967500723909, "grad_norm": 0.8383409976959229, "learning_rate": 2.9365079365079366e-05, "loss": 0.8921, "mean_token_accuracy": 0.7733515216037631, "num_tokens": 552777662.0, "step": 3160 }, { "epoch": 0.8639220562435062, "grad_norm": 0.9514780640602112, "learning_rate": 2.9294844781570446e-05, "loss": 0.8801, "mean_token_accuracy": 0.7743920419365168, "num_tokens": 554539356.0, "step": 3170 }, { "epoch": 0.8666473624146213, "grad_norm": 0.8511672019958496, "learning_rate": 2.9224610198061526e-05, "loss": 0.9443, "mean_token_accuracy": 0.7616958959959448, "num_tokens": 556212548.0, "step": 3180 }, { "epoch": 0.8693726685857365, "grad_norm": 0.8855804800987244, "learning_rate": 2.915437561455261e-05, "loss": 0.9129, "mean_token_accuracy": 0.7673460508696734, "num_tokens": 557950461.0, "step": 3190 }, { "epoch": 0.8720979747568516, "grad_norm": 0.8707140684127808, "learning_rate": 2.9084141031043687e-05, "loss": 0.8903, "mean_token_accuracy": 0.771921418607235, "num_tokens": 559715926.0, "step": 3200 }, { "epoch": 0.8748232809279668, "grad_norm": 0.9066351652145386, "learning_rate": 2.9013906447534767e-05, "loss": 0.9122, "mean_token_accuracy": 0.7690841609612107, "num_tokens": 561479394.0, "step": 3210 }, { "epoch": 0.8775485870990819, "grad_norm": 0.8212366104125977, "learning_rate": 2.8943671864025844e-05, "loss": 0.9047, "mean_token_accuracy": 0.7691477432847023, "num_tokens": 563271872.0, "step": 3220 }, { "epoch": 0.8802738932701971, "grad_norm": 0.8219311833381653, "learning_rate": 2.887343728051693e-05, "loss": 0.8994, "mean_token_accuracy": 0.7715168165042996, "num_tokens": 565002210.0, "step": 3230 }, { "epoch": 0.8829991994413122, "grad_norm": 0.898818850517273, "learning_rate": 2.8803202697008008e-05, "loss": 0.8221, "mean_token_accuracy": 0.786996117606759, "num_tokens": 566740287.0, "step": 3240 }, { "epoch": 0.8857245056124274, "grad_norm": 0.7879354953765869, "learning_rate": 2.8732968113499088e-05, "loss": 0.9305, "mean_token_accuracy": 0.7649352782405913, "num_tokens": 568560935.0, "step": 3250 }, { "epoch": 0.8884498117835425, "grad_norm": 0.9506646394729614, "learning_rate": 2.866273352999017e-05, "loss": 0.9164, "mean_token_accuracy": 0.7674483682960271, "num_tokens": 570290159.0, "step": 3260 }, { "epoch": 0.8911751179546578, "grad_norm": 0.8495944142341614, "learning_rate": 2.8592498946481248e-05, "loss": 0.9218, "mean_token_accuracy": 0.76632971893996, "num_tokens": 572076704.0, "step": 3270 }, { "epoch": 0.8939004241257729, "grad_norm": 0.8082846999168396, "learning_rate": 2.852226436297233e-05, "loss": 0.9318, "mean_token_accuracy": 0.76468366086483, "num_tokens": 573841975.0, "step": 3280 }, { "epoch": 0.8966257302968881, "grad_norm": 0.809038519859314, "learning_rate": 2.845202977946341e-05, "loss": 0.9057, "mean_token_accuracy": 0.7704976342618466, "num_tokens": 575587035.0, "step": 3290 }, { "epoch": 0.8993510364680032, "grad_norm": 0.8163307309150696, "learning_rate": 2.8381795195954492e-05, "loss": 0.9047, "mean_token_accuracy": 0.769212381914258, "num_tokens": 577285852.0, "step": 3300 }, { "epoch": 0.9020763426391184, "grad_norm": 0.8320499658584595, "learning_rate": 2.831156061244557e-05, "loss": 0.9208, "mean_token_accuracy": 0.7671002107672393, "num_tokens": 579085053.0, "step": 3310 }, { "epoch": 0.9048016488102335, "grad_norm": 0.7883128523826599, "learning_rate": 2.824132602893665e-05, "loss": 0.8903, "mean_token_accuracy": 0.7726314144209028, "num_tokens": 580874041.0, "step": 3320 }, { "epoch": 0.9075269549813487, "grad_norm": 0.8674151301383972, "learning_rate": 2.8171091445427726e-05, "loss": 0.8888, "mean_token_accuracy": 0.7714107790961862, "num_tokens": 582616757.0, "step": 3330 }, { "epoch": 0.9102522611524638, "grad_norm": 0.8607017397880554, "learning_rate": 2.8100856861918813e-05, "loss": 0.8852, "mean_token_accuracy": 0.7738766206428409, "num_tokens": 584370810.0, "step": 3340 }, { "epoch": 0.912977567323579, "grad_norm": 0.850727915763855, "learning_rate": 2.803062227840989e-05, "loss": 0.9006, "mean_token_accuracy": 0.7707684559747576, "num_tokens": 586143330.0, "step": 3350 }, { "epoch": 0.9157028734946941, "grad_norm": 0.8423539400100708, "learning_rate": 2.796038769490097e-05, "loss": 0.8657, "mean_token_accuracy": 0.777682445757091, "num_tokens": 587851884.0, "step": 3360 }, { "epoch": 0.9184281796658094, "grad_norm": 0.8592097759246826, "learning_rate": 2.7890153111392054e-05, "loss": 0.8937, "mean_token_accuracy": 0.7706205818802119, "num_tokens": 589601267.0, "step": 3370 }, { "epoch": 0.9211534858369245, "grad_norm": 0.943073034286499, "learning_rate": 2.7819918527883134e-05, "loss": 0.9254, "mean_token_accuracy": 0.7644982000812888, "num_tokens": 591321205.0, "step": 3380 }, { "epoch": 0.9238787920080397, "grad_norm": 0.7838475704193115, "learning_rate": 2.774968394437421e-05, "loss": 0.8704, "mean_token_accuracy": 0.7773344025947153, "num_tokens": 593110103.0, "step": 3390 }, { "epoch": 0.9266040981791548, "grad_norm": 0.8899914622306824, "learning_rate": 2.767944936086529e-05, "loss": 0.9234, "mean_token_accuracy": 0.7661155204288661, "num_tokens": 594823039.0, "step": 3400 }, { "epoch": 0.92932940435027, "grad_norm": 0.8291741609573364, "learning_rate": 2.7609214777356374e-05, "loss": 0.9049, "mean_token_accuracy": 0.7698826493695379, "num_tokens": 596625643.0, "step": 3410 }, { "epoch": 0.9320547105213851, "grad_norm": 0.8303592205047607, "learning_rate": 2.753898019384745e-05, "loss": 0.9194, "mean_token_accuracy": 0.7646890787407756, "num_tokens": 598409404.0, "step": 3420 }, { "epoch": 0.9347800166925003, "grad_norm": 1.093499779701233, "learning_rate": 2.746874561033853e-05, "loss": 0.9067, "mean_token_accuracy": 0.7687176914885641, "num_tokens": 600131817.0, "step": 3430 }, { "epoch": 0.9375053228636154, "grad_norm": 0.7767653465270996, "learning_rate": 2.7398511026829608e-05, "loss": 0.9087, "mean_token_accuracy": 0.7694620480760932, "num_tokens": 601867823.0, "step": 3440 }, { "epoch": 0.9402306290347306, "grad_norm": 0.8396127223968506, "learning_rate": 2.7328276443320695e-05, "loss": 0.8845, "mean_token_accuracy": 0.7755044038407505, "num_tokens": 603591263.0, "step": 3450 }, { "epoch": 0.9429559352058458, "grad_norm": 0.8479565382003784, "learning_rate": 2.7258041859811772e-05, "loss": 0.9151, "mean_token_accuracy": 0.7681015016511082, "num_tokens": 605400422.0, "step": 3460 }, { "epoch": 0.945681241376961, "grad_norm": 0.8595352172851562, "learning_rate": 2.7187807276302852e-05, "loss": 0.9071, "mean_token_accuracy": 0.7697312761098146, "num_tokens": 607180330.0, "step": 3470 }, { "epoch": 0.9484065475480761, "grad_norm": 0.7876641154289246, "learning_rate": 2.7117572692793936e-05, "loss": 0.8734, "mean_token_accuracy": 0.7765149438753725, "num_tokens": 608937194.0, "step": 3480 }, { "epoch": 0.9511318537191913, "grad_norm": 0.8947613835334778, "learning_rate": 2.7047338109285016e-05, "loss": 0.9133, "mean_token_accuracy": 0.7668988397344947, "num_tokens": 610676168.0, "step": 3490 }, { "epoch": 0.9538571598903064, "grad_norm": 0.8162944912910461, "learning_rate": 2.6977103525776093e-05, "loss": 0.8681, "mean_token_accuracy": 0.7774568162858486, "num_tokens": 612491555.0, "step": 3500 }, { "epoch": 0.9565824660614216, "grad_norm": 0.900248110294342, "learning_rate": 2.6906868942267173e-05, "loss": 0.9189, "mean_token_accuracy": 0.7660327769815922, "num_tokens": 614310820.0, "step": 3510 }, { "epoch": 0.9593077722325367, "grad_norm": 0.8359858989715576, "learning_rate": 2.6836634358758256e-05, "loss": 0.9224, "mean_token_accuracy": 0.7663468400016427, "num_tokens": 616071087.0, "step": 3520 }, { "epoch": 0.9620330784036519, "grad_norm": 0.872882604598999, "learning_rate": 2.6766399775249333e-05, "loss": 0.8712, "mean_token_accuracy": 0.7756553223356605, "num_tokens": 617769166.0, "step": 3530 }, { "epoch": 0.964758384574767, "grad_norm": 0.8891421556472778, "learning_rate": 2.6696165191740413e-05, "loss": 0.9005, "mean_token_accuracy": 0.7701712142676115, "num_tokens": 619538232.0, "step": 3540 }, { "epoch": 0.9674836907458823, "grad_norm": 0.7362189888954163, "learning_rate": 2.662593060823149e-05, "loss": 0.8938, "mean_token_accuracy": 0.7728794073686004, "num_tokens": 621303314.0, "step": 3550 }, { "epoch": 0.9702089969169974, "grad_norm": 0.7591912150382996, "learning_rate": 2.6555696024722577e-05, "loss": 0.8785, "mean_token_accuracy": 0.7754427226260304, "num_tokens": 623100776.0, "step": 3560 }, { "epoch": 0.9729343030881126, "grad_norm": 0.8404043912887573, "learning_rate": 2.6485461441213654e-05, "loss": 0.9387, "mean_token_accuracy": 0.7616929830983281, "num_tokens": 624821413.0, "step": 3570 }, { "epoch": 0.9756596092592277, "grad_norm": 0.8890523314476013, "learning_rate": 2.6415226857704734e-05, "loss": 0.9095, "mean_token_accuracy": 0.7678571032360196, "num_tokens": 626628545.0, "step": 3580 }, { "epoch": 0.9783849154303429, "grad_norm": 0.7919222712516785, "learning_rate": 2.6344992274195818e-05, "loss": 0.8906, "mean_token_accuracy": 0.7729906640946865, "num_tokens": 628425036.0, "step": 3590 }, { "epoch": 0.981110221601458, "grad_norm": 0.7847779393196106, "learning_rate": 2.6274757690686898e-05, "loss": 0.8636, "mean_token_accuracy": 0.7760783301666379, "num_tokens": 630092739.0, "step": 3600 }, { "epoch": 0.9838355277725732, "grad_norm": 0.964148998260498, "learning_rate": 2.6204523107177975e-05, "loss": 0.915, "mean_token_accuracy": 0.7690061263740062, "num_tokens": 631791605.0, "step": 3610 }, { "epoch": 0.9865608339436883, "grad_norm": 0.9110335111618042, "learning_rate": 2.6134288523669055e-05, "loss": 0.9026, "mean_token_accuracy": 0.7687728941440582, "num_tokens": 633554055.0, "step": 3620 }, { "epoch": 0.9892861401148035, "grad_norm": 0.8210707902908325, "learning_rate": 2.606405394016014e-05, "loss": 0.8729, "mean_token_accuracy": 0.7756300155073405, "num_tokens": 635269789.0, "step": 3630 }, { "epoch": 0.9920114462859186, "grad_norm": 0.8379694819450378, "learning_rate": 2.5993819356651215e-05, "loss": 0.8912, "mean_token_accuracy": 0.7726977992802858, "num_tokens": 637078092.0, "step": 3640 }, { "epoch": 0.9947367524570339, "grad_norm": 0.867736279964447, "learning_rate": 2.5923584773142296e-05, "loss": 0.8709, "mean_token_accuracy": 0.7778057091869414, "num_tokens": 638813927.0, "step": 3650 }, { "epoch": 0.997462058628149, "grad_norm": 0.9498102068901062, "learning_rate": 2.5853350189633372e-05, "loss": 0.8953, "mean_token_accuracy": 0.7722262794151902, "num_tokens": 640594007.0, "step": 3660 }, { "epoch": 1.0, "grad_norm": 0.4446529746055603, "learning_rate": 2.578311560612446e-05, "loss": 0.8011, "mean_token_accuracy": 0.7779095188883327, "num_tokens": 642209396.0, "step": 3670 } ], "logging_steps": 10, "max_steps": 7340, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1108118326981394e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }