diff --git "a/checkpoint-7340/trainer_state.json" "b/checkpoint-7340/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-7340/trainer_state.json" @@ -0,0 +1,6649 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 7340, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002725306171115161, + "grad_norm": 106.0, + "learning_rate": 0.0, + "loss": 11.3973, + "mean_token_accuracy": 0.010489485081052408, + "num_tokens": 180497.0, + "step": 1 + }, + { + "epoch": 0.0027253061711151614, + "grad_norm": 89.0, + "learning_rate": 2.0361990950226245e-06, + "loss": 11.167, + "mean_token_accuracy": 0.012705711428780988, + "num_tokens": 1772191.0, + "step": 10 + }, + { + "epoch": 0.005450612342230323, + "grad_norm": 103.0, + "learning_rate": 4.298642533936651e-06, + "loss": 10.5518, + "mean_token_accuracy": 0.016214983707322973, + "num_tokens": 3528128.0, + "step": 20 + }, + { + "epoch": 0.008175918513345483, + "grad_norm": 44.25, + "learning_rate": 6.5610859728506795e-06, + "loss": 9.4561, + "mean_token_accuracy": 0.025063409566791962, + "num_tokens": 5321820.0, + "step": 30 + }, + { + "epoch": 0.010901224684460645, + "grad_norm": 41.25, + "learning_rate": 8.823529411764707e-06, + "loss": 8.4037, + "mean_token_accuracy": 0.0406443662388483, + "num_tokens": 7036353.0, + "step": 40 + }, + { + "epoch": 0.013626530855575806, + "grad_norm": 41.5, + "learning_rate": 1.1085972850678733e-05, + "loss": 7.4734, + "mean_token_accuracy": 0.05987281463458203, + "num_tokens": 8794502.0, + "step": 50 + }, + { + "epoch": 0.016351837026690966, + "grad_norm": 29.875, + "learning_rate": 1.3348416289592761e-05, + "loss": 6.8227, + "mean_token_accuracy": 0.0807909039023798, + "num_tokens": 10519144.0, + "step": 60 + }, + { + "epoch": 0.019077143197806127, + "grad_norm": 26.125, + "learning_rate": 1.5610859728506788e-05, + "loss": 6.2162, + "mean_token_accuracy": 0.10805099562276155, + "num_tokens": 12272638.0, + "step": 70 + }, + { + "epoch": 0.02180244936892129, + "grad_norm": 23.125, + "learning_rate": 1.7873303167420814e-05, + "loss": 5.6551, + "mean_token_accuracy": 0.13820163225755094, + "num_tokens": 14037595.0, + "step": 80 + }, + { + "epoch": 0.02452775554003645, + "grad_norm": 14.25, + "learning_rate": 2.0135746606334844e-05, + "loss": 5.2483, + "mean_token_accuracy": 0.1612478678114712, + "num_tokens": 15813413.0, + "step": 90 + }, + { + "epoch": 0.027253061711151612, + "grad_norm": 13.4375, + "learning_rate": 2.239819004524887e-05, + "loss": 4.8714, + "mean_token_accuracy": 0.18540706855710595, + "num_tokens": 17606667.0, + "step": 100 + }, + { + "epoch": 0.029978367882266772, + "grad_norm": 13.125, + "learning_rate": 2.4660633484162897e-05, + "loss": 4.5352, + "mean_token_accuracy": 0.21171675145160407, + "num_tokens": 19387650.0, + "step": 110 + }, + { + "epoch": 0.03270367405338193, + "grad_norm": 8.125, + "learning_rate": 2.6923076923076923e-05, + "loss": 4.2605, + "mean_token_accuracy": 0.23295689946971834, + "num_tokens": 21111629.0, + "step": 120 + }, + { + "epoch": 0.03542898022449709, + "grad_norm": 8.75, + "learning_rate": 2.9185520361990953e-05, + "loss": 4.0612, + "mean_token_accuracy": 0.25047846739180385, + "num_tokens": 22855847.0, + "step": 130 + }, + { + "epoch": 0.038154286395612254, + "grad_norm": 8.9375, + "learning_rate": 3.1447963800904976e-05, + "loss": 3.854, + "mean_token_accuracy": 0.2728212605463341, + "num_tokens": 24647294.0, + "step": 140 + }, + { + "epoch": 0.04087959256672742, + "grad_norm": 9.625, + "learning_rate": 3.371040723981901e-05, + "loss": 3.6741, + "mean_token_accuracy": 0.29891129268798977, + "num_tokens": 26336128.0, + "step": 150 + }, + { + "epoch": 0.04360489873784258, + "grad_norm": 7.625, + "learning_rate": 3.5972850678733036e-05, + "loss": 3.5007, + "mean_token_accuracy": 0.32518296535126867, + "num_tokens": 28061528.0, + "step": 160 + }, + { + "epoch": 0.04633020490895774, + "grad_norm": 7.75, + "learning_rate": 3.8235294117647055e-05, + "loss": 3.257, + "mean_token_accuracy": 0.35865339674055574, + "num_tokens": 29834651.0, + "step": 170 + }, + { + "epoch": 0.0490555110800729, + "grad_norm": 5.78125, + "learning_rate": 4.049773755656109e-05, + "loss": 3.1448, + "mean_token_accuracy": 0.3812251358292997, + "num_tokens": 31533806.0, + "step": 180 + }, + { + "epoch": 0.05178081725118806, + "grad_norm": 7.0625, + "learning_rate": 4.2760180995475115e-05, + "loss": 2.918, + "mean_token_accuracy": 0.41093885465525093, + "num_tokens": 33328644.0, + "step": 190 + }, + { + "epoch": 0.054506123422303224, + "grad_norm": 5.84375, + "learning_rate": 4.502262443438914e-05, + "loss": 2.8125, + "mean_token_accuracy": 0.429937514802441, + "num_tokens": 35085202.0, + "step": 200 + }, + { + "epoch": 0.057231429593418384, + "grad_norm": 5.59375, + "learning_rate": 4.728506787330317e-05, + "loss": 2.7217, + "mean_token_accuracy": 0.4442617506254464, + "num_tokens": 36899685.0, + "step": 210 + }, + { + "epoch": 0.059956735764533545, + "grad_norm": 4.6875, + "learning_rate": 4.95475113122172e-05, + "loss": 2.5219, + "mean_token_accuracy": 0.4736042513512075, + "num_tokens": 38663867.0, + "step": 220 + }, + { + "epoch": 0.0626820419356487, + "grad_norm": 6.40625, + "learning_rate": 4.994381233319287e-05, + "loss": 2.4641, + "mean_token_accuracy": 0.4836056975647807, + "num_tokens": 40368688.0, + "step": 230 + }, + { + "epoch": 0.06540734810676387, + "grad_norm": 4.8125, + "learning_rate": 4.9873577749683945e-05, + "loss": 2.3962, + "mean_token_accuracy": 0.4944497250020504, + "num_tokens": 42091556.0, + "step": 240 + }, + { + "epoch": 0.06813265427787903, + "grad_norm": 4.3125, + "learning_rate": 4.9803343166175026e-05, + "loss": 2.3276, + "mean_token_accuracy": 0.5070343468338251, + "num_tokens": 43832865.0, + "step": 250 + }, + { + "epoch": 0.07085796044899419, + "grad_norm": 3.265625, + "learning_rate": 4.9733108582666106e-05, + "loss": 2.275, + "mean_token_accuracy": 0.5160813440568746, + "num_tokens": 45556421.0, + "step": 260 + }, + { + "epoch": 0.07358326662010935, + "grad_norm": 9.75, + "learning_rate": 4.9662873999157186e-05, + "loss": 2.2472, + "mean_token_accuracy": 0.525664893258363, + "num_tokens": 47284838.0, + "step": 270 + }, + { + "epoch": 0.07630857279122451, + "grad_norm": 3.421875, + "learning_rate": 4.9592639415648266e-05, + "loss": 2.163, + "mean_token_accuracy": 0.5365134474821389, + "num_tokens": 49025592.0, + "step": 280 + }, + { + "epoch": 0.07903387896233968, + "grad_norm": 3.53125, + "learning_rate": 4.9522404832139346e-05, + "loss": 2.2114, + "mean_token_accuracy": 0.5303179323673248, + "num_tokens": 50771353.0, + "step": 290 + }, + { + "epoch": 0.08175918513345484, + "grad_norm": 3.265625, + "learning_rate": 4.945217024863043e-05, + "loss": 2.0514, + "mean_token_accuracy": 0.5541504692286253, + "num_tokens": 52521467.0, + "step": 300 + }, + { + "epoch": 0.08448449130457, + "grad_norm": 3.359375, + "learning_rate": 4.938193566512151e-05, + "loss": 2.0561, + "mean_token_accuracy": 0.5545550880022347, + "num_tokens": 54338858.0, + "step": 310 + }, + { + "epoch": 0.08720979747568516, + "grad_norm": 3.25, + "learning_rate": 4.931170108161259e-05, + "loss": 2.0299, + "mean_token_accuracy": 0.5621719690505416, + "num_tokens": 56014081.0, + "step": 320 + }, + { + "epoch": 0.08993510364680032, + "grad_norm": 3.015625, + "learning_rate": 4.924146649810367e-05, + "loss": 1.9459, + "mean_token_accuracy": 0.5734196378849447, + "num_tokens": 57778337.0, + "step": 330 + }, + { + "epoch": 0.09266040981791548, + "grad_norm": 2.515625, + "learning_rate": 4.9171231914594754e-05, + "loss": 1.9223, + "mean_token_accuracy": 0.5794367666356266, + "num_tokens": 59528977.0, + "step": 340 + }, + { + "epoch": 0.09538571598903064, + "grad_norm": 2.5625, + "learning_rate": 4.910099733108583e-05, + "loss": 1.9025, + "mean_token_accuracy": 0.5819809279404581, + "num_tokens": 61295985.0, + "step": 350 + }, + { + "epoch": 0.0981110221601458, + "grad_norm": 3.109375, + "learning_rate": 4.903076274757691e-05, + "loss": 1.8654, + "mean_token_accuracy": 0.5886511621065438, + "num_tokens": 63026897.0, + "step": 360 + }, + { + "epoch": 0.10083632833126097, + "grad_norm": 2.484375, + "learning_rate": 4.896052816406799e-05, + "loss": 1.8004, + "mean_token_accuracy": 0.6001709839329123, + "num_tokens": 64770711.0, + "step": 370 + }, + { + "epoch": 0.10356163450237613, + "grad_norm": 2.609375, + "learning_rate": 4.889029358055907e-05, + "loss": 1.8124, + "mean_token_accuracy": 0.6000730013474822, + "num_tokens": 66551008.0, + "step": 380 + }, + { + "epoch": 0.10628694067349129, + "grad_norm": 2.5, + "learning_rate": 4.882005899705015e-05, + "loss": 1.8145, + "mean_token_accuracy": 0.5984284824691712, + "num_tokens": 68320690.0, + "step": 390 + }, + { + "epoch": 0.10901224684460645, + "grad_norm": 2.59375, + "learning_rate": 4.874982441354123e-05, + "loss": 1.7416, + "mean_token_accuracy": 0.6101331522688269, + "num_tokens": 70095284.0, + "step": 400 + }, + { + "epoch": 0.11173755301572161, + "grad_norm": 2.875, + "learning_rate": 4.8679589830032316e-05, + "loss": 1.7673, + "mean_token_accuracy": 0.6082146287895739, + "num_tokens": 71803511.0, + "step": 410 + }, + { + "epoch": 0.11446285918683677, + "grad_norm": 2.46875, + "learning_rate": 4.860935524652339e-05, + "loss": 1.6723, + "mean_token_accuracy": 0.6224730779416859, + "num_tokens": 73528297.0, + "step": 420 + }, + { + "epoch": 0.11718816535795193, + "grad_norm": 2.34375, + "learning_rate": 4.853912066301447e-05, + "loss": 1.6828, + "mean_token_accuracy": 0.6226447049528361, + "num_tokens": 75274289.0, + "step": 430 + }, + { + "epoch": 0.11991347152906709, + "grad_norm": 2.53125, + "learning_rate": 4.846888607950555e-05, + "loss": 1.7227, + "mean_token_accuracy": 0.6148907302878797, + "num_tokens": 77077403.0, + "step": 440 + }, + { + "epoch": 0.12263877770018225, + "grad_norm": 2.96875, + "learning_rate": 4.8398651495996636e-05, + "loss": 1.7, + "mean_token_accuracy": 0.6187901364639401, + "num_tokens": 78859882.0, + "step": 450 + }, + { + "epoch": 0.1253640838712974, + "grad_norm": 2.515625, + "learning_rate": 4.832841691248771e-05, + "loss": 1.7034, + "mean_token_accuracy": 0.6176527316682041, + "num_tokens": 80590976.0, + "step": 460 + }, + { + "epoch": 0.12808939004241257, + "grad_norm": 2.359375, + "learning_rate": 4.825818232897879e-05, + "loss": 1.6386, + "mean_token_accuracy": 0.6282966487109661, + "num_tokens": 82275126.0, + "step": 470 + }, + { + "epoch": 0.13081469621352773, + "grad_norm": 2.203125, + "learning_rate": 4.818794774546987e-05, + "loss": 1.6267, + "mean_token_accuracy": 0.6301278316415846, + "num_tokens": 84096910.0, + "step": 480 + }, + { + "epoch": 0.1335400023846429, + "grad_norm": 2.40625, + "learning_rate": 4.811771316196095e-05, + "loss": 1.6628, + "mean_token_accuracy": 0.626694044843316, + "num_tokens": 85876326.0, + "step": 490 + }, + { + "epoch": 0.13626530855575805, + "grad_norm": 2.578125, + "learning_rate": 4.804747857845203e-05, + "loss": 1.6392, + "mean_token_accuracy": 0.6308354771696031, + "num_tokens": 87607478.0, + "step": 500 + }, + { + "epoch": 0.1389906147268732, + "grad_norm": 2.078125, + "learning_rate": 4.797724399494311e-05, + "loss": 1.5724, + "mean_token_accuracy": 0.6407017651945353, + "num_tokens": 89350066.0, + "step": 510 + }, + { + "epoch": 0.14171592089798837, + "grad_norm": 2.125, + "learning_rate": 4.79070094114342e-05, + "loss": 1.5855, + "mean_token_accuracy": 0.6393750453367829, + "num_tokens": 91067910.0, + "step": 520 + }, + { + "epoch": 0.14444122706910353, + "grad_norm": 2.109375, + "learning_rate": 4.783677482792527e-05, + "loss": 1.5632, + "mean_token_accuracy": 0.6421649686060846, + "num_tokens": 92797017.0, + "step": 530 + }, + { + "epoch": 0.1471665332402187, + "grad_norm": 2.09375, + "learning_rate": 4.776654024441635e-05, + "loss": 1.6004, + "mean_token_accuracy": 0.6356727724894882, + "num_tokens": 94606329.0, + "step": 540 + }, + { + "epoch": 0.14989183941133385, + "grad_norm": 2.09375, + "learning_rate": 4.769630566090743e-05, + "loss": 1.5549, + "mean_token_accuracy": 0.6431776374578476, + "num_tokens": 96331087.0, + "step": 550 + }, + { + "epoch": 0.15261714558244902, + "grad_norm": 2.390625, + "learning_rate": 4.762607107739852e-05, + "loss": 1.5888, + "mean_token_accuracy": 0.6394226610660553, + "num_tokens": 98138711.0, + "step": 560 + }, + { + "epoch": 0.15534245175356418, + "grad_norm": 2.265625, + "learning_rate": 4.755583649388959e-05, + "loss": 1.5821, + "mean_token_accuracy": 0.6415903450921178, + "num_tokens": 99885005.0, + "step": 570 + }, + { + "epoch": 0.15806775792467936, + "grad_norm": 2.640625, + "learning_rate": 4.748560191038067e-05, + "loss": 1.4567, + "mean_token_accuracy": 0.6615531787276268, + "num_tokens": 101636075.0, + "step": 580 + }, + { + "epoch": 0.16079306409579452, + "grad_norm": 2.203125, + "learning_rate": 4.741536732687175e-05, + "loss": 1.5205, + "mean_token_accuracy": 0.651706058345735, + "num_tokens": 103349118.0, + "step": 590 + }, + { + "epoch": 0.16351837026690969, + "grad_norm": 2.375, + "learning_rate": 4.734513274336283e-05, + "loss": 1.5282, + "mean_token_accuracy": 0.6503567652776837, + "num_tokens": 105033010.0, + "step": 600 + }, + { + "epoch": 0.16624367643802485, + "grad_norm": 2.390625, + "learning_rate": 4.727489815985391e-05, + "loss": 1.4769, + "mean_token_accuracy": 0.6579250860959291, + "num_tokens": 106723283.0, + "step": 610 + }, + { + "epoch": 0.16896898260914, + "grad_norm": 1.7734375, + "learning_rate": 4.720466357634499e-05, + "loss": 1.4878, + "mean_token_accuracy": 0.6554030778817832, + "num_tokens": 108436878.0, + "step": 620 + }, + { + "epoch": 0.17169428878025517, + "grad_norm": 1.984375, + "learning_rate": 4.713442899283608e-05, + "loss": 1.4697, + "mean_token_accuracy": 0.6599764323793351, + "num_tokens": 110203157.0, + "step": 630 + }, + { + "epoch": 0.17441959495137033, + "grad_norm": 2.421875, + "learning_rate": 4.706419440932715e-05, + "loss": 1.5011, + "mean_token_accuracy": 0.6555302709341049, + "num_tokens": 111949130.0, + "step": 640 + }, + { + "epoch": 0.1771449011224855, + "grad_norm": 2.046875, + "learning_rate": 4.6993959825818233e-05, + "loss": 1.5068, + "mean_token_accuracy": 0.6517163597047329, + "num_tokens": 113652926.0, + "step": 650 + }, + { + "epoch": 0.17987020729360065, + "grad_norm": 1.8203125, + "learning_rate": 4.6923725242309314e-05, + "loss": 1.4746, + "mean_token_accuracy": 0.657813799008727, + "num_tokens": 115334647.0, + "step": 660 + }, + { + "epoch": 0.1825955134647158, + "grad_norm": 1.875, + "learning_rate": 4.68534906588004e-05, + "loss": 1.4268, + "mean_token_accuracy": 0.6688082559965551, + "num_tokens": 117032381.0, + "step": 670 + }, + { + "epoch": 0.18532081963583097, + "grad_norm": 1.7421875, + "learning_rate": 4.6783256075291474e-05, + "loss": 1.4713, + "mean_token_accuracy": 0.6591908087022602, + "num_tokens": 118801553.0, + "step": 680 + }, + { + "epoch": 0.18804612580694613, + "grad_norm": 1.90625, + "learning_rate": 4.6713021491782554e-05, + "loss": 1.4687, + "mean_token_accuracy": 0.6593087091110647, + "num_tokens": 120530271.0, + "step": 690 + }, + { + "epoch": 0.1907714319780613, + "grad_norm": 1.8125, + "learning_rate": 4.6642786908273634e-05, + "loss": 1.4613, + "mean_token_accuracy": 0.6628443499095737, + "num_tokens": 122314411.0, + "step": 700 + }, + { + "epoch": 0.19349673814917645, + "grad_norm": 1.703125, + "learning_rate": 4.6572552324764715e-05, + "loss": 1.4554, + "mean_token_accuracy": 0.6636740594170988, + "num_tokens": 124054113.0, + "step": 710 + }, + { + "epoch": 0.1962220443202916, + "grad_norm": 1.9609375, + "learning_rate": 4.6502317741255795e-05, + "loss": 1.4493, + "mean_token_accuracy": 0.6633484376594424, + "num_tokens": 125786705.0, + "step": 720 + }, + { + "epoch": 0.19894735049140677, + "grad_norm": 1.8203125, + "learning_rate": 4.6432083157746875e-05, + "loss": 1.4639, + "mean_token_accuracy": 0.6606446763500571, + "num_tokens": 127510112.0, + "step": 730 + }, + { + "epoch": 0.20167265666252193, + "grad_norm": 1.75, + "learning_rate": 4.636184857423796e-05, + "loss": 1.4136, + "mean_token_accuracy": 0.6700579337775707, + "num_tokens": 129321733.0, + "step": 740 + }, + { + "epoch": 0.2043979628336371, + "grad_norm": 1.71875, + "learning_rate": 4.6291613990729035e-05, + "loss": 1.4551, + "mean_token_accuracy": 0.6638867166824639, + "num_tokens": 131068939.0, + "step": 750 + }, + { + "epoch": 0.20712326900475225, + "grad_norm": 2.03125, + "learning_rate": 4.6221379407220116e-05, + "loss": 1.4996, + "mean_token_accuracy": 0.6567820507101715, + "num_tokens": 132800192.0, + "step": 760 + }, + { + "epoch": 0.2098485751758674, + "grad_norm": 1.734375, + "learning_rate": 4.6151144823711196e-05, + "loss": 1.4082, + "mean_token_accuracy": 0.6725725987926126, + "num_tokens": 134501880.0, + "step": 770 + }, + { + "epoch": 0.21257388134698257, + "grad_norm": 1.9296875, + "learning_rate": 4.608091024020228e-05, + "loss": 1.3787, + "mean_token_accuracy": 0.676600266713649, + "num_tokens": 136227230.0, + "step": 780 + }, + { + "epoch": 0.21529918751809773, + "grad_norm": 1.9921875, + "learning_rate": 4.6010675656693356e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6742036573588848, + "num_tokens": 137938433.0, + "step": 790 + }, + { + "epoch": 0.2180244936892129, + "grad_norm": 1.8984375, + "learning_rate": 4.5940441073184436e-05, + "loss": 1.4021, + "mean_token_accuracy": 0.6748411299660801, + "num_tokens": 139667163.0, + "step": 800 + }, + { + "epoch": 0.22074979986032806, + "grad_norm": 1.8515625, + "learning_rate": 4.5870206489675517e-05, + "loss": 1.3732, + "mean_token_accuracy": 0.6786113461479545, + "num_tokens": 141481099.0, + "step": 810 + }, + { + "epoch": 0.22347510603144322, + "grad_norm": 1.734375, + "learning_rate": 4.57999719061666e-05, + "loss": 1.4499, + "mean_token_accuracy": 0.666275049932301, + "num_tokens": 143204243.0, + "step": 820 + }, + { + "epoch": 0.22620041220255838, + "grad_norm": 1.921875, + "learning_rate": 4.572973732265768e-05, + "loss": 1.3878, + "mean_token_accuracy": 0.67771971905604, + "num_tokens": 144995581.0, + "step": 830 + }, + { + "epoch": 0.22892571837367354, + "grad_norm": 1.71875, + "learning_rate": 4.565950273914876e-05, + "loss": 1.4013, + "mean_token_accuracy": 0.67232207627967, + "num_tokens": 146711076.0, + "step": 840 + }, + { + "epoch": 0.2316510245447887, + "grad_norm": 1.5859375, + "learning_rate": 4.5589268155639844e-05, + "loss": 1.3717, + "mean_token_accuracy": 0.6795911006629467, + "num_tokens": 148463902.0, + "step": 850 + }, + { + "epoch": 0.23437633071590386, + "grad_norm": 1.7890625, + "learning_rate": 4.551903357213092e-05, + "loss": 1.4148, + "mean_token_accuracy": 0.671318475343287, + "num_tokens": 150212801.0, + "step": 860 + }, + { + "epoch": 0.23710163688701902, + "grad_norm": 1.703125, + "learning_rate": 4.5448798988622e-05, + "loss": 1.3469, + "mean_token_accuracy": 0.6808905070647597, + "num_tokens": 151950016.0, + "step": 870 + }, + { + "epoch": 0.23982694305813418, + "grad_norm": 1.65625, + "learning_rate": 4.537856440511308e-05, + "loss": 1.3565, + "mean_token_accuracy": 0.6808952454477548, + "num_tokens": 153686341.0, + "step": 880 + }, + { + "epoch": 0.24255224922924934, + "grad_norm": 1.6875, + "learning_rate": 4.5308329821604165e-05, + "loss": 1.3905, + "mean_token_accuracy": 0.6750849165953696, + "num_tokens": 155442220.0, + "step": 890 + }, + { + "epoch": 0.2452775554003645, + "grad_norm": 1.78125, + "learning_rate": 4.523809523809524e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6828798386268318, + "num_tokens": 157222278.0, + "step": 900 + }, + { + "epoch": 0.24800286157147966, + "grad_norm": 1.59375, + "learning_rate": 4.516786065458632e-05, + "loss": 1.3467, + "mean_token_accuracy": 0.682690916582942, + "num_tokens": 158989873.0, + "step": 910 + }, + { + "epoch": 0.2507281677425948, + "grad_norm": 1.9296875, + "learning_rate": 4.50976260710774e-05, + "loss": 1.3645, + "mean_token_accuracy": 0.6824150150641799, + "num_tokens": 160740690.0, + "step": 920 + }, + { + "epoch": 0.25345347391371, + "grad_norm": 1.8203125, + "learning_rate": 4.502739148756848e-05, + "loss": 1.3808, + "mean_token_accuracy": 0.6758302460424602, + "num_tokens": 162505210.0, + "step": 930 + }, + { + "epoch": 0.25617878008482514, + "grad_norm": 1.7109375, + "learning_rate": 4.495715690405956e-05, + "loss": 1.3703, + "mean_token_accuracy": 0.6793876992538571, + "num_tokens": 164184119.0, + "step": 940 + }, + { + "epoch": 0.2589040862559403, + "grad_norm": 1.7265625, + "learning_rate": 4.488692232055064e-05, + "loss": 1.3309, + "mean_token_accuracy": 0.6854429397732019, + "num_tokens": 165971146.0, + "step": 950 + }, + { + "epoch": 0.26162939242705546, + "grad_norm": 2.28125, + "learning_rate": 4.4816687737041726e-05, + "loss": 1.3738, + "mean_token_accuracy": 0.6808017442002893, + "num_tokens": 167701701.0, + "step": 960 + }, + { + "epoch": 0.2643546985981706, + "grad_norm": 2.21875, + "learning_rate": 4.47464531535328e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6793407511897385, + "num_tokens": 169443326.0, + "step": 970 + }, + { + "epoch": 0.2670800047692858, + "grad_norm": 1.8671875, + "learning_rate": 4.467621857002388e-05, + "loss": 1.363, + "mean_token_accuracy": 0.6808341681025922, + "num_tokens": 171199385.0, + "step": 980 + }, + { + "epoch": 0.26980531094040094, + "grad_norm": 1.8203125, + "learning_rate": 4.460598398651496e-05, + "loss": 1.3164, + "mean_token_accuracy": 0.6899495711550117, + "num_tokens": 172929493.0, + "step": 990 + }, + { + "epoch": 0.2725306171115161, + "grad_norm": 2.28125, + "learning_rate": 4.453574940300605e-05, + "loss": 1.3187, + "mean_token_accuracy": 0.6878650960512459, + "num_tokens": 174672601.0, + "step": 1000 + }, + { + "epoch": 0.27525592328263127, + "grad_norm": 1.59375, + "learning_rate": 4.446551481949712e-05, + "loss": 1.3538, + "mean_token_accuracy": 0.6838336682878434, + "num_tokens": 176441962.0, + "step": 1010 + }, + { + "epoch": 0.2779812294537464, + "grad_norm": 1.75, + "learning_rate": 4.43952802359882e-05, + "loss": 1.3101, + "mean_token_accuracy": 0.6909397638402879, + "num_tokens": 178197361.0, + "step": 1020 + }, + { + "epoch": 0.2807065356248616, + "grad_norm": 1.671875, + "learning_rate": 4.432504565247929e-05, + "loss": 1.3477, + "mean_token_accuracy": 0.6821904895827174, + "num_tokens": 179887580.0, + "step": 1030 + }, + { + "epoch": 0.28343184179597675, + "grad_norm": 1.6796875, + "learning_rate": 4.425481106897036e-05, + "loss": 1.3022, + "mean_token_accuracy": 0.6917614788748324, + "num_tokens": 181655865.0, + "step": 1040 + }, + { + "epoch": 0.2861571479670919, + "grad_norm": 1.7578125, + "learning_rate": 4.418457648546144e-05, + "loss": 1.3359, + "mean_token_accuracy": 0.6866448893211782, + "num_tokens": 183445880.0, + "step": 1050 + }, + { + "epoch": 0.28888245413820707, + "grad_norm": 1.5234375, + "learning_rate": 4.411434190195252e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.69377696281299, + "num_tokens": 185175706.0, + "step": 1060 + }, + { + "epoch": 0.29160776030932223, + "grad_norm": 1.5703125, + "learning_rate": 4.404410731844361e-05, + "loss": 1.3015, + "mean_token_accuracy": 0.6912993769161403, + "num_tokens": 186873396.0, + "step": 1070 + }, + { + "epoch": 0.2943330664804374, + "grad_norm": 1.65625, + "learning_rate": 4.397387273493468e-05, + "loss": 1.3535, + "mean_token_accuracy": 0.6828196115791798, + "num_tokens": 188591288.0, + "step": 1080 + }, + { + "epoch": 0.29705837265155255, + "grad_norm": 1.4921875, + "learning_rate": 4.390363815142576e-05, + "loss": 1.3351, + "mean_token_accuracy": 0.6864844439551234, + "num_tokens": 190375182.0, + "step": 1090 + }, + { + "epoch": 0.2997836788226677, + "grad_norm": 1.625, + "learning_rate": 4.383340356791684e-05, + "loss": 1.3093, + "mean_token_accuracy": 0.6901756428182125, + "num_tokens": 192104609.0, + "step": 1100 + }, + { + "epoch": 0.30250898499378287, + "grad_norm": 1.578125, + "learning_rate": 4.376316898440793e-05, + "loss": 1.2279, + "mean_token_accuracy": 0.7054048574529588, + "num_tokens": 193819768.0, + "step": 1110 + }, + { + "epoch": 0.30523429116489803, + "grad_norm": 1.671875, + "learning_rate": 4.3692934400899e-05, + "loss": 1.3105, + "mean_token_accuracy": 0.6918096936307847, + "num_tokens": 195541705.0, + "step": 1120 + }, + { + "epoch": 0.3079595973360132, + "grad_norm": 1.59375, + "learning_rate": 4.362269981739008e-05, + "loss": 1.2943, + "mean_token_accuracy": 0.692951999604702, + "num_tokens": 197250976.0, + "step": 1130 + }, + { + "epoch": 0.31068490350712835, + "grad_norm": 1.453125, + "learning_rate": 4.355246523388117e-05, + "loss": 1.2566, + "mean_token_accuracy": 0.7000818770378828, + "num_tokens": 198964087.0, + "step": 1140 + }, + { + "epoch": 0.31341020967824357, + "grad_norm": 1.578125, + "learning_rate": 4.348223065037224e-05, + "loss": 1.3103, + "mean_token_accuracy": 0.6903091154061258, + "num_tokens": 200654341.0, + "step": 1150 + }, + { + "epoch": 0.31613551584935873, + "grad_norm": 1.7578125, + "learning_rate": 4.3411996066863323e-05, + "loss": 1.3449, + "mean_token_accuracy": 0.6847572137601674, + "num_tokens": 202410389.0, + "step": 1160 + }, + { + "epoch": 0.3188608220204739, + "grad_norm": 1.6640625, + "learning_rate": 4.3341761483354404e-05, + "loss": 1.2936, + "mean_token_accuracy": 0.6938237980008125, + "num_tokens": 204176073.0, + "step": 1170 + }, + { + "epoch": 0.32158612819158905, + "grad_norm": 1.5703125, + "learning_rate": 4.327152689984549e-05, + "loss": 1.284, + "mean_token_accuracy": 0.6962682608515024, + "num_tokens": 206017214.0, + "step": 1180 + }, + { + "epoch": 0.3243114343627042, + "grad_norm": 2.390625, + "learning_rate": 4.3201292316336564e-05, + "loss": 1.3027, + "mean_token_accuracy": 0.6909335135482252, + "num_tokens": 207832984.0, + "step": 1190 + }, + { + "epoch": 0.32703674053381937, + "grad_norm": 1.515625, + "learning_rate": 4.3131057732827644e-05, + "loss": 1.299, + "mean_token_accuracy": 0.6923303379677236, + "num_tokens": 209533427.0, + "step": 1200 + }, + { + "epoch": 0.32976204670493453, + "grad_norm": 1.5859375, + "learning_rate": 4.3060823149318724e-05, + "loss": 1.2863, + "mean_token_accuracy": 0.6948139815591275, + "num_tokens": 211344332.0, + "step": 1210 + }, + { + "epoch": 0.3324873528760497, + "grad_norm": 1.6875, + "learning_rate": 4.299058856580981e-05, + "loss": 1.3097, + "mean_token_accuracy": 0.6901804354973138, + "num_tokens": 213088746.0, + "step": 1220 + }, + { + "epoch": 0.33521265904716485, + "grad_norm": 1.6796875, + "learning_rate": 4.2920353982300885e-05, + "loss": 1.3145, + "mean_token_accuracy": 0.6885634188540279, + "num_tokens": 214889528.0, + "step": 1230 + }, + { + "epoch": 0.33793796521828, + "grad_norm": 1.8828125, + "learning_rate": 4.2850119398791965e-05, + "loss": 1.3272, + "mean_token_accuracy": 0.6894508360885083, + "num_tokens": 216603081.0, + "step": 1240 + }, + { + "epoch": 0.3406632713893952, + "grad_norm": 1.546875, + "learning_rate": 4.277988481528305e-05, + "loss": 1.2541, + "mean_token_accuracy": 0.6997496448457241, + "num_tokens": 218292845.0, + "step": 1250 + }, + { + "epoch": 0.34338857756051033, + "grad_norm": 1.59375, + "learning_rate": 4.2709650231774125e-05, + "loss": 1.2827, + "mean_token_accuracy": 0.6969962599687278, + "num_tokens": 220029696.0, + "step": 1260 + }, + { + "epoch": 0.3461138837316255, + "grad_norm": 1.6796875, + "learning_rate": 4.2639415648265206e-05, + "loss": 1.3492, + "mean_token_accuracy": 0.685709635540843, + "num_tokens": 221810553.0, + "step": 1270 + }, + { + "epoch": 0.34883918990274065, + "grad_norm": 1.515625, + "learning_rate": 4.2569181064756286e-05, + "loss": 1.2669, + "mean_token_accuracy": 0.7004628435708582, + "num_tokens": 223552801.0, + "step": 1280 + }, + { + "epoch": 0.3515644960738558, + "grad_norm": 1.8515625, + "learning_rate": 4.249894648124737e-05, + "loss": 1.279, + "mean_token_accuracy": 0.6973608860746026, + "num_tokens": 225254929.0, + "step": 1290 + }, + { + "epoch": 0.354289802244971, + "grad_norm": 1.8984375, + "learning_rate": 4.2428711897738446e-05, + "loss": 1.3133, + "mean_token_accuracy": 0.6911240560933948, + "num_tokens": 227017304.0, + "step": 1300 + }, + { + "epoch": 0.35701510841608614, + "grad_norm": 1.5859375, + "learning_rate": 4.2358477314229526e-05, + "loss": 1.2394, + "mean_token_accuracy": 0.7042814038693905, + "num_tokens": 228785771.0, + "step": 1310 + }, + { + "epoch": 0.3597404145872013, + "grad_norm": 1.609375, + "learning_rate": 4.2288242730720607e-05, + "loss": 1.2462, + "mean_token_accuracy": 0.702515134587884, + "num_tokens": 230489667.0, + "step": 1320 + }, + { + "epoch": 0.36246572075831646, + "grad_norm": 1.515625, + "learning_rate": 4.2218008147211694e-05, + "loss": 1.2768, + "mean_token_accuracy": 0.6976381672546268, + "num_tokens": 232269581.0, + "step": 1330 + }, + { + "epoch": 0.3651910269294316, + "grad_norm": 1.546875, + "learning_rate": 4.214777356370277e-05, + "loss": 1.2795, + "mean_token_accuracy": 0.6947634796611964, + "num_tokens": 234028126.0, + "step": 1340 + }, + { + "epoch": 0.3679163331005468, + "grad_norm": 1.859375, + "learning_rate": 4.207753898019385e-05, + "loss": 1.305, + "mean_token_accuracy": 0.6930719532072545, + "num_tokens": 235817129.0, + "step": 1350 + }, + { + "epoch": 0.37064163927166194, + "grad_norm": 1.5390625, + "learning_rate": 4.2007304396684934e-05, + "loss": 1.2765, + "mean_token_accuracy": 0.6983222321607172, + "num_tokens": 237588984.0, + "step": 1360 + }, + { + "epoch": 0.3733669454427771, + "grad_norm": 1.5234375, + "learning_rate": 4.193706981317601e-05, + "loss": 1.2305, + "mean_token_accuracy": 0.7052510293200612, + "num_tokens": 239305200.0, + "step": 1370 + }, + { + "epoch": 0.37609225161389226, + "grad_norm": 1.5234375, + "learning_rate": 4.186683522966709e-05, + "loss": 1.2552, + "mean_token_accuracy": 0.7010797799564898, + "num_tokens": 241078759.0, + "step": 1380 + }, + { + "epoch": 0.3788175577850074, + "grad_norm": 1.640625, + "learning_rate": 4.179660064615817e-05, + "loss": 1.2903, + "mean_token_accuracy": 0.6959062526933849, + "num_tokens": 242876841.0, + "step": 1390 + }, + { + "epoch": 0.3815428639561226, + "grad_norm": 1.5625, + "learning_rate": 4.1726366062649255e-05, + "loss": 1.2394, + "mean_token_accuracy": 0.7018366500735282, + "num_tokens": 244578724.0, + "step": 1400 + }, + { + "epoch": 0.38426817012723774, + "grad_norm": 1.4140625, + "learning_rate": 4.165613147914033e-05, + "loss": 1.2776, + "mean_token_accuracy": 0.6954536657780409, + "num_tokens": 246363009.0, + "step": 1410 + }, + { + "epoch": 0.3869934762983529, + "grad_norm": 1.5, + "learning_rate": 4.158589689563141e-05, + "loss": 1.3017, + "mean_token_accuracy": 0.6934069953858852, + "num_tokens": 248120613.0, + "step": 1420 + }, + { + "epoch": 0.38971878246946806, + "grad_norm": 1.65625, + "learning_rate": 4.151566231212249e-05, + "loss": 1.2857, + "mean_token_accuracy": 0.6958935803733766, + "num_tokens": 249941867.0, + "step": 1430 + }, + { + "epoch": 0.3924440886405832, + "grad_norm": 1.390625, + "learning_rate": 4.1445427728613576e-05, + "loss": 1.2328, + "mean_token_accuracy": 0.7059727218933404, + "num_tokens": 251695734.0, + "step": 1440 + }, + { + "epoch": 0.3951693948116984, + "grad_norm": 1.4296875, + "learning_rate": 4.137519314510465e-05, + "loss": 1.2814, + "mean_token_accuracy": 0.6967121254652738, + "num_tokens": 253503211.0, + "step": 1450 + }, + { + "epoch": 0.39789470098281354, + "grad_norm": 1.6171875, + "learning_rate": 4.130495856159573e-05, + "loss": 1.2694, + "mean_token_accuracy": 0.6995078191161156, + "num_tokens": 255313734.0, + "step": 1460 + }, + { + "epoch": 0.4006200071539287, + "grad_norm": 1.375, + "learning_rate": 4.1234723978086816e-05, + "loss": 1.1998, + "mean_token_accuracy": 0.7114904819987714, + "num_tokens": 257016513.0, + "step": 1470 + }, + { + "epoch": 0.40334531332504386, + "grad_norm": 1.53125, + "learning_rate": 4.1164489394577896e-05, + "loss": 1.2871, + "mean_token_accuracy": 0.6948545157909394, + "num_tokens": 258760362.0, + "step": 1480 + }, + { + "epoch": 0.406070619496159, + "grad_norm": 1.5625, + "learning_rate": 4.109425481106897e-05, + "loss": 1.2301, + "mean_token_accuracy": 0.7058748141862452, + "num_tokens": 260546971.0, + "step": 1490 + }, + { + "epoch": 0.4087959256672742, + "grad_norm": 1.5703125, + "learning_rate": 4.102402022756005e-05, + "loss": 1.2515, + "mean_token_accuracy": 0.7016466647386551, + "num_tokens": 262307672.0, + "step": 1500 + }, + { + "epoch": 0.41152123183838935, + "grad_norm": 1.7265625, + "learning_rate": 4.095378564405114e-05, + "loss": 1.2595, + "mean_token_accuracy": 0.7010509856045246, + "num_tokens": 264023889.0, + "step": 1510 + }, + { + "epoch": 0.4142465380095045, + "grad_norm": 1.4453125, + "learning_rate": 4.088355106054221e-05, + "loss": 1.2509, + "mean_token_accuracy": 0.7020703799091279, + "num_tokens": 265798519.0, + "step": 1520 + }, + { + "epoch": 0.41697184418061967, + "grad_norm": 1.4609375, + "learning_rate": 4.081331647703329e-05, + "loss": 1.2775, + "mean_token_accuracy": 0.6975624321959912, + "num_tokens": 267546100.0, + "step": 1530 + }, + { + "epoch": 0.4196971503517348, + "grad_norm": 1.46875, + "learning_rate": 4.074308189352437e-05, + "loss": 1.2272, + "mean_token_accuracy": 0.7062510661780834, + "num_tokens": 269297641.0, + "step": 1540 + }, + { + "epoch": 0.42242245652285, + "grad_norm": 1.5, + "learning_rate": 4.067284731001546e-05, + "loss": 1.2729, + "mean_token_accuracy": 0.6973782840184868, + "num_tokens": 271053006.0, + "step": 1550 + }, + { + "epoch": 0.42514776269396515, + "grad_norm": 1.859375, + "learning_rate": 4.060261272650653e-05, + "loss": 1.2565, + "mean_token_accuracy": 0.7016788011416792, + "num_tokens": 272829487.0, + "step": 1560 + }, + { + "epoch": 0.4278730688650803, + "grad_norm": 1.4921875, + "learning_rate": 4.053237814299761e-05, + "loss": 1.246, + "mean_token_accuracy": 0.7026969991624356, + "num_tokens": 274621990.0, + "step": 1570 + }, + { + "epoch": 0.43059837503619547, + "grad_norm": 1.625, + "learning_rate": 4.04621435594887e-05, + "loss": 1.2623, + "mean_token_accuracy": 0.6991809997707605, + "num_tokens": 276398989.0, + "step": 1580 + }, + { + "epoch": 0.43332368120731063, + "grad_norm": 1.4921875, + "learning_rate": 4.039190897597978e-05, + "loss": 1.232, + "mean_token_accuracy": 0.7039873175323009, + "num_tokens": 278192884.0, + "step": 1590 + }, + { + "epoch": 0.4360489873784258, + "grad_norm": 1.5625, + "learning_rate": 4.032167439247085e-05, + "loss": 1.2205, + "mean_token_accuracy": 0.7079121223650873, + "num_tokens": 279937931.0, + "step": 1600 + }, + { + "epoch": 0.43877429354954095, + "grad_norm": 1.5234375, + "learning_rate": 4.025143980896193e-05, + "loss": 1.2283, + "mean_token_accuracy": 0.7071776267141103, + "num_tokens": 281698143.0, + "step": 1610 + }, + { + "epoch": 0.4414995997206561, + "grad_norm": 1.5234375, + "learning_rate": 4.018120522545302e-05, + "loss": 1.2414, + "mean_token_accuracy": 0.7045015564188362, + "num_tokens": 283412565.0, + "step": 1620 + }, + { + "epoch": 0.44422490589177127, + "grad_norm": 1.453125, + "learning_rate": 4.011097064194409e-05, + "loss": 1.2495, + "mean_token_accuracy": 0.7027601384557783, + "num_tokens": 285184244.0, + "step": 1630 + }, + { + "epoch": 0.44695021206288643, + "grad_norm": 1.3359375, + "learning_rate": 4.004073605843517e-05, + "loss": 1.1779, + "mean_token_accuracy": 0.7148963597603142, + "num_tokens": 286963226.0, + "step": 1640 + }, + { + "epoch": 0.4496755182340016, + "grad_norm": 1.703125, + "learning_rate": 3.997050147492625e-05, + "loss": 1.21, + "mean_token_accuracy": 0.7099241388961672, + "num_tokens": 288742524.0, + "step": 1650 + }, + { + "epoch": 0.45240082440511675, + "grad_norm": 1.875, + "learning_rate": 3.990026689141734e-05, + "loss": 1.2415, + "mean_token_accuracy": 0.7042178069241345, + "num_tokens": 290486439.0, + "step": 1660 + }, + { + "epoch": 0.4551261305762319, + "grad_norm": 1.4375, + "learning_rate": 3.9830032307908413e-05, + "loss": 1.246, + "mean_token_accuracy": 0.7043319317512214, + "num_tokens": 292244616.0, + "step": 1670 + }, + { + "epoch": 0.4578514367473471, + "grad_norm": 1.484375, + "learning_rate": 3.9759797724399494e-05, + "loss": 1.2455, + "mean_token_accuracy": 0.703259102255106, + "num_tokens": 294028863.0, + "step": 1680 + }, + { + "epoch": 0.46057674291846223, + "grad_norm": 1.34375, + "learning_rate": 3.968956314089058e-05, + "loss": 1.2085, + "mean_token_accuracy": 0.708068885654211, + "num_tokens": 295842660.0, + "step": 1690 + }, + { + "epoch": 0.4633020490895774, + "grad_norm": 1.3125, + "learning_rate": 3.961932855738166e-05, + "loss": 1.2173, + "mean_token_accuracy": 0.7098737230524421, + "num_tokens": 297606076.0, + "step": 1700 + }, + { + "epoch": 0.46602735526069256, + "grad_norm": 1.4375, + "learning_rate": 3.9549093973872734e-05, + "loss": 1.2501, + "mean_token_accuracy": 0.7022371832281351, + "num_tokens": 299367632.0, + "step": 1710 + }, + { + "epoch": 0.4687526614318077, + "grad_norm": 1.4296875, + "learning_rate": 3.9478859390363814e-05, + "loss": 1.1976, + "mean_token_accuracy": 0.7129932347685098, + "num_tokens": 301097990.0, + "step": 1720 + }, + { + "epoch": 0.4714779676029229, + "grad_norm": 1.421875, + "learning_rate": 3.94086248068549e-05, + "loss": 1.2101, + "mean_token_accuracy": 0.7102027184329927, + "num_tokens": 302847308.0, + "step": 1730 + }, + { + "epoch": 0.47420327377403804, + "grad_norm": 1.375, + "learning_rate": 3.9338390223345975e-05, + "loss": 1.2315, + "mean_token_accuracy": 0.704880575183779, + "num_tokens": 304582625.0, + "step": 1740 + }, + { + "epoch": 0.4769285799451532, + "grad_norm": 1.53125, + "learning_rate": 3.9268155639837055e-05, + "loss": 1.2088, + "mean_token_accuracy": 0.7108904106542469, + "num_tokens": 306358206.0, + "step": 1750 + }, + { + "epoch": 0.47965388611626836, + "grad_norm": 1.515625, + "learning_rate": 3.9197921056328135e-05, + "loss": 1.1976, + "mean_token_accuracy": 0.712415215652436, + "num_tokens": 308026562.0, + "step": 1760 + }, + { + "epoch": 0.4823791922873835, + "grad_norm": 1.578125, + "learning_rate": 3.912768647281922e-05, + "loss": 1.2466, + "mean_token_accuracy": 0.7024014497175812, + "num_tokens": 309804543.0, + "step": 1770 + }, + { + "epoch": 0.4851044984584987, + "grad_norm": 1.5625, + "learning_rate": 3.9057451889310296e-05, + "loss": 1.2379, + "mean_token_accuracy": 0.7045082511380315, + "num_tokens": 311549771.0, + "step": 1780 + }, + { + "epoch": 0.48782980462961384, + "grad_norm": 1.53125, + "learning_rate": 3.8987217305801376e-05, + "loss": 1.2623, + "mean_token_accuracy": 0.701041791215539, + "num_tokens": 313326889.0, + "step": 1790 + }, + { + "epoch": 0.490555110800729, + "grad_norm": 1.5703125, + "learning_rate": 3.891698272229246e-05, + "loss": 1.2005, + "mean_token_accuracy": 0.7130162584595382, + "num_tokens": 315064715.0, + "step": 1800 + }, + { + "epoch": 0.49328041697184416, + "grad_norm": 1.359375, + "learning_rate": 3.884674813878354e-05, + "loss": 1.2161, + "mean_token_accuracy": 0.7079024517908692, + "num_tokens": 316809509.0, + "step": 1810 + }, + { + "epoch": 0.4960057231429593, + "grad_norm": 1.359375, + "learning_rate": 3.8776513555274616e-05, + "loss": 1.203, + "mean_token_accuracy": 0.711275870539248, + "num_tokens": 318597481.0, + "step": 1820 + }, + { + "epoch": 0.4987310293140745, + "grad_norm": 1.5625, + "learning_rate": 3.8706278971765697e-05, + "loss": 1.188, + "mean_token_accuracy": 0.7133826318196952, + "num_tokens": 320382863.0, + "step": 1830 + }, + { + "epoch": 0.5014563354851896, + "grad_norm": 1.390625, + "learning_rate": 3.8636044388256784e-05, + "loss": 1.2159, + "mean_token_accuracy": 0.710312622692436, + "num_tokens": 322136431.0, + "step": 1840 + }, + { + "epoch": 0.5041816416563049, + "grad_norm": 1.359375, + "learning_rate": 3.856580980474786e-05, + "loss": 1.1922, + "mean_token_accuracy": 0.7139192272908985, + "num_tokens": 323892089.0, + "step": 1850 + }, + { + "epoch": 0.50690694782742, + "grad_norm": 1.4140625, + "learning_rate": 3.849557522123894e-05, + "loss": 1.2107, + "mean_token_accuracy": 0.7097873773425818, + "num_tokens": 325693457.0, + "step": 1860 + }, + { + "epoch": 0.5096322539985352, + "grad_norm": 1.5234375, + "learning_rate": 3.842534063773002e-05, + "loss": 1.1781, + "mean_token_accuracy": 0.7143914319574833, + "num_tokens": 327397884.0, + "step": 1870 + }, + { + "epoch": 0.5123575601696503, + "grad_norm": 1.53125, + "learning_rate": 3.8355106054221104e-05, + "loss": 1.2316, + "mean_token_accuracy": 0.7061603724025189, + "num_tokens": 329077502.0, + "step": 1880 + }, + { + "epoch": 0.5150828663407655, + "grad_norm": 1.4765625, + "learning_rate": 3.828487147071218e-05, + "loss": 1.2195, + "mean_token_accuracy": 0.708381280489266, + "num_tokens": 330847695.0, + "step": 1890 + }, + { + "epoch": 0.5178081725118806, + "grad_norm": 1.375, + "learning_rate": 3.821463688720326e-05, + "loss": 1.1845, + "mean_token_accuracy": 0.7121982695534825, + "num_tokens": 332578366.0, + "step": 1900 + }, + { + "epoch": 0.5205334786829958, + "grad_norm": 1.4609375, + "learning_rate": 3.8144402303694345e-05, + "loss": 1.2265, + "mean_token_accuracy": 0.7070099180564284, + "num_tokens": 334293589.0, + "step": 1910 + }, + { + "epoch": 0.5232587848541109, + "grad_norm": 1.296875, + "learning_rate": 3.8074167720185425e-05, + "loss": 1.2075, + "mean_token_accuracy": 0.707461370434612, + "num_tokens": 336016972.0, + "step": 1920 + }, + { + "epoch": 0.5259840910252261, + "grad_norm": 1.2890625, + "learning_rate": 3.80039331366765e-05, + "loss": 1.1813, + "mean_token_accuracy": 0.714381551090628, + "num_tokens": 337805977.0, + "step": 1930 + }, + { + "epoch": 0.5287093971963412, + "grad_norm": 1.3203125, + "learning_rate": 3.793369855316758e-05, + "loss": 1.149, + "mean_token_accuracy": 0.7198109852150083, + "num_tokens": 339548221.0, + "step": 1940 + }, + { + "epoch": 0.5314347033674565, + "grad_norm": 1.671875, + "learning_rate": 3.7863463969658666e-05, + "loss": 1.2441, + "mean_token_accuracy": 0.7022737297229469, + "num_tokens": 341268344.0, + "step": 1950 + }, + { + "epoch": 0.5341600095385716, + "grad_norm": 1.46875, + "learning_rate": 3.779322938614974e-05, + "loss": 1.2015, + "mean_token_accuracy": 0.7109639048576355, + "num_tokens": 343017522.0, + "step": 1960 + }, + { + "epoch": 0.5368853157096868, + "grad_norm": 1.5859375, + "learning_rate": 3.772299480264082e-05, + "loss": 1.193, + "mean_token_accuracy": 0.7128876778297126, + "num_tokens": 344699192.0, + "step": 1970 + }, + { + "epoch": 0.5396106218808019, + "grad_norm": 1.375, + "learning_rate": 3.7652760219131906e-05, + "loss": 1.1785, + "mean_token_accuracy": 0.7159237092360854, + "num_tokens": 346458527.0, + "step": 1980 + }, + { + "epoch": 0.5423359280519171, + "grad_norm": 1.4453125, + "learning_rate": 3.7582525635622986e-05, + "loss": 1.1895, + "mean_token_accuracy": 0.7135124854743481, + "num_tokens": 348204585.0, + "step": 1990 + }, + { + "epoch": 0.5450612342230322, + "grad_norm": 1.4609375, + "learning_rate": 3.751229105211406e-05, + "loss": 1.2192, + "mean_token_accuracy": 0.7084178974851966, + "num_tokens": 349928259.0, + "step": 2000 + }, + { + "epoch": 0.5477865403941474, + "grad_norm": 1.3125, + "learning_rate": 3.744205646860514e-05, + "loss": 1.1833, + "mean_token_accuracy": 0.7143334408290685, + "num_tokens": 351568018.0, + "step": 2010 + }, + { + "epoch": 0.5505118465652625, + "grad_norm": 1.3359375, + "learning_rate": 3.737182188509623e-05, + "loss": 1.1763, + "mean_token_accuracy": 0.7172479030676187, + "num_tokens": 353325716.0, + "step": 2020 + }, + { + "epoch": 0.5532371527363777, + "grad_norm": 1.4140625, + "learning_rate": 3.730158730158731e-05, + "loss": 1.2195, + "mean_token_accuracy": 0.707148808799684, + "num_tokens": 355099655.0, + "step": 2030 + }, + { + "epoch": 0.5559624589074929, + "grad_norm": 1.4140625, + "learning_rate": 3.723135271807838e-05, + "loss": 1.1867, + "mean_token_accuracy": 0.7141209022141993, + "num_tokens": 356892448.0, + "step": 2040 + }, + { + "epoch": 0.5586877650786081, + "grad_norm": 1.453125, + "learning_rate": 3.716111813456946e-05, + "loss": 1.2126, + "mean_token_accuracy": 0.709683568123728, + "num_tokens": 358599855.0, + "step": 2050 + }, + { + "epoch": 0.5614130712497232, + "grad_norm": 1.6171875, + "learning_rate": 3.709088355106055e-05, + "loss": 1.2314, + "mean_token_accuracy": 0.705540257319808, + "num_tokens": 360366528.0, + "step": 2060 + }, + { + "epoch": 0.5641383774208384, + "grad_norm": 1.4765625, + "learning_rate": 3.702064896755162e-05, + "loss": 1.1838, + "mean_token_accuracy": 0.7144955797120929, + "num_tokens": 362123903.0, + "step": 2070 + }, + { + "epoch": 0.5668636835919535, + "grad_norm": 1.3671875, + "learning_rate": 3.69504143840427e-05, + "loss": 1.1718, + "mean_token_accuracy": 0.7182300767861307, + "num_tokens": 363907270.0, + "step": 2080 + }, + { + "epoch": 0.5695889897630687, + "grad_norm": 1.4375, + "learning_rate": 3.688017980053379e-05, + "loss": 1.1563, + "mean_token_accuracy": 0.7186566211283207, + "num_tokens": 365602883.0, + "step": 2090 + }, + { + "epoch": 0.5723142959341838, + "grad_norm": 1.5, + "learning_rate": 3.680994521702487e-05, + "loss": 1.1757, + "mean_token_accuracy": 0.7156617695465683, + "num_tokens": 367308921.0, + "step": 2100 + }, + { + "epoch": 0.575039602105299, + "grad_norm": 1.4140625, + "learning_rate": 3.673971063351594e-05, + "loss": 1.1835, + "mean_token_accuracy": 0.7141332181170583, + "num_tokens": 369068618.0, + "step": 2110 + }, + { + "epoch": 0.5777649082764141, + "grad_norm": 1.421875, + "learning_rate": 3.666947605000702e-05, + "loss": 1.1935, + "mean_token_accuracy": 0.713552170060575, + "num_tokens": 370792272.0, + "step": 2120 + }, + { + "epoch": 0.5804902144475294, + "grad_norm": 1.4921875, + "learning_rate": 3.659924146649811e-05, + "loss": 1.199, + "mean_token_accuracy": 0.7122195997275412, + "num_tokens": 372506541.0, + "step": 2130 + }, + { + "epoch": 0.5832155206186445, + "grad_norm": 1.3515625, + "learning_rate": 3.652900688298919e-05, + "loss": 1.1418, + "mean_token_accuracy": 0.722984395455569, + "num_tokens": 374308542.0, + "step": 2140 + }, + { + "epoch": 0.5859408267897597, + "grad_norm": 1.3671875, + "learning_rate": 3.645877229948026e-05, + "loss": 1.1544, + "mean_token_accuracy": 0.7199711099267005, + "num_tokens": 376016266.0, + "step": 2150 + }, + { + "epoch": 0.5886661329608748, + "grad_norm": 1.25, + "learning_rate": 3.638853771597134e-05, + "loss": 1.2019, + "mean_token_accuracy": 0.712724674679339, + "num_tokens": 377732713.0, + "step": 2160 + }, + { + "epoch": 0.59139143913199, + "grad_norm": 2.15625, + "learning_rate": 3.631830313246243e-05, + "loss": 1.1785, + "mean_token_accuracy": 0.7151990966871381, + "num_tokens": 379457927.0, + "step": 2170 + }, + { + "epoch": 0.5941167453031051, + "grad_norm": 1.2421875, + "learning_rate": 3.6248068548953503e-05, + "loss": 1.2353, + "mean_token_accuracy": 0.7053435018286109, + "num_tokens": 381311531.0, + "step": 2180 + }, + { + "epoch": 0.5968420514742203, + "grad_norm": 1.4609375, + "learning_rate": 3.6177833965444584e-05, + "loss": 1.2218, + "mean_token_accuracy": 0.7103118651546538, + "num_tokens": 383076036.0, + "step": 2190 + }, + { + "epoch": 0.5995673576453354, + "grad_norm": 1.46875, + "learning_rate": 3.610759938193567e-05, + "loss": 1.2154, + "mean_token_accuracy": 0.7096952789463102, + "num_tokens": 384869981.0, + "step": 2200 + }, + { + "epoch": 0.6022926638164506, + "grad_norm": 1.46875, + "learning_rate": 3.603736479842675e-05, + "loss": 1.191, + "mean_token_accuracy": 0.7129226897843182, + "num_tokens": 386654361.0, + "step": 2210 + }, + { + "epoch": 0.6050179699875657, + "grad_norm": 1.390625, + "learning_rate": 3.5967130214917824e-05, + "loss": 1.1628, + "mean_token_accuracy": 0.7192864948883653, + "num_tokens": 388405388.0, + "step": 2220 + }, + { + "epoch": 0.607743276158681, + "grad_norm": 1.375, + "learning_rate": 3.5896895631408904e-05, + "loss": 1.1976, + "mean_token_accuracy": 0.7123970666900277, + "num_tokens": 390160411.0, + "step": 2230 + }, + { + "epoch": 0.6104685823297961, + "grad_norm": 1.2578125, + "learning_rate": 3.582666104789999e-05, + "loss": 1.2059, + "mean_token_accuracy": 0.7098761620000005, + "num_tokens": 391963393.0, + "step": 2240 + }, + { + "epoch": 0.6131938885009113, + "grad_norm": 1.515625, + "learning_rate": 3.575642646439107e-05, + "loss": 1.2058, + "mean_token_accuracy": 0.7112857546657324, + "num_tokens": 393719494.0, + "step": 2250 + }, + { + "epoch": 0.6159191946720264, + "grad_norm": 1.2421875, + "learning_rate": 3.5686191880882145e-05, + "loss": 1.2005, + "mean_token_accuracy": 0.7123585233464838, + "num_tokens": 395419867.0, + "step": 2260 + }, + { + "epoch": 0.6186445008431416, + "grad_norm": 1.3203125, + "learning_rate": 3.5615957297373225e-05, + "loss": 1.1723, + "mean_token_accuracy": 0.7168289897032082, + "num_tokens": 397164636.0, + "step": 2270 + }, + { + "epoch": 0.6213698070142567, + "grad_norm": 1.5078125, + "learning_rate": 3.554572271386431e-05, + "loss": 1.2103, + "mean_token_accuracy": 0.7081712177023292, + "num_tokens": 398897880.0, + "step": 2280 + }, + { + "epoch": 0.6240951131853719, + "grad_norm": 1.7265625, + "learning_rate": 3.5475488130355386e-05, + "loss": 1.188, + "mean_token_accuracy": 0.7136732900515199, + "num_tokens": 400668616.0, + "step": 2290 + }, + { + "epoch": 0.6268204193564871, + "grad_norm": 1.375, + "learning_rate": 3.5405253546846466e-05, + "loss": 1.1751, + "mean_token_accuracy": 0.716455262992531, + "num_tokens": 402379710.0, + "step": 2300 + }, + { + "epoch": 0.6295457255276022, + "grad_norm": 1.34375, + "learning_rate": 3.533501896333755e-05, + "loss": 1.178, + "mean_token_accuracy": 0.7147075609304011, + "num_tokens": 404084345.0, + "step": 2310 + }, + { + "epoch": 0.6322710316987175, + "grad_norm": 1.40625, + "learning_rate": 3.526478437982863e-05, + "loss": 1.1428, + "mean_token_accuracy": 0.7234298737719655, + "num_tokens": 405770321.0, + "step": 2320 + }, + { + "epoch": 0.6349963378698326, + "grad_norm": 1.359375, + "learning_rate": 3.5194549796319706e-05, + "loss": 1.1634, + "mean_token_accuracy": 0.7181408229283989, + "num_tokens": 407492020.0, + "step": 2330 + }, + { + "epoch": 0.6377216440409478, + "grad_norm": 1.390625, + "learning_rate": 3.5124315212810787e-05, + "loss": 1.1646, + "mean_token_accuracy": 0.7192243071272969, + "num_tokens": 409257115.0, + "step": 2340 + }, + { + "epoch": 0.6404469502120629, + "grad_norm": 1.421875, + "learning_rate": 3.5054080629301874e-05, + "loss": 1.1939, + "mean_token_accuracy": 0.7125861537642777, + "num_tokens": 410986071.0, + "step": 2350 + }, + { + "epoch": 0.6431722563831781, + "grad_norm": 1.3125, + "learning_rate": 3.4983846045792954e-05, + "loss": 1.1776, + "mean_token_accuracy": 0.7160876172594726, + "num_tokens": 412763897.0, + "step": 2360 + }, + { + "epoch": 0.6458975625542932, + "grad_norm": 1.3515625, + "learning_rate": 3.491361146228403e-05, + "loss": 1.1323, + "mean_token_accuracy": 0.7251203707419336, + "num_tokens": 414503168.0, + "step": 2370 + }, + { + "epoch": 0.6486228687254084, + "grad_norm": 1.3125, + "learning_rate": 3.484337687877511e-05, + "loss": 1.1612, + "mean_token_accuracy": 0.7199062428437173, + "num_tokens": 416227145.0, + "step": 2380 + }, + { + "epoch": 0.6513481748965235, + "grad_norm": 1.3046875, + "learning_rate": 3.4773142295266194e-05, + "loss": 1.1582, + "mean_token_accuracy": 0.7184870925731957, + "num_tokens": 417973040.0, + "step": 2390 + }, + { + "epoch": 0.6540734810676387, + "grad_norm": 1.609375, + "learning_rate": 3.470290771175727e-05, + "loss": 1.1755, + "mean_token_accuracy": 0.7159407096914947, + "num_tokens": 419736780.0, + "step": 2400 + }, + { + "epoch": 0.6567987872387538, + "grad_norm": 1.3203125, + "learning_rate": 3.463267312824835e-05, + "loss": 1.1679, + "mean_token_accuracy": 0.718410755135119, + "num_tokens": 421516023.0, + "step": 2410 + }, + { + "epoch": 0.6595240934098691, + "grad_norm": 1.359375, + "learning_rate": 3.4562438544739435e-05, + "loss": 1.2164, + "mean_token_accuracy": 0.710557876341045, + "num_tokens": 423324296.0, + "step": 2420 + }, + { + "epoch": 0.6622493995809842, + "grad_norm": 1.4765625, + "learning_rate": 3.4492203961230515e-05, + "loss": 1.1903, + "mean_token_accuracy": 0.7110839125700295, + "num_tokens": 425114987.0, + "step": 2430 + }, + { + "epoch": 0.6649747057520994, + "grad_norm": 1.3359375, + "learning_rate": 3.442196937772159e-05, + "loss": 1.155, + "mean_token_accuracy": 0.7194037739187479, + "num_tokens": 426875751.0, + "step": 2440 + }, + { + "epoch": 0.6677000119232145, + "grad_norm": 1.296875, + "learning_rate": 3.435173479421267e-05, + "loss": 1.1687, + "mean_token_accuracy": 0.7176058162003756, + "num_tokens": 428715433.0, + "step": 2450 + }, + { + "epoch": 0.6704253180943297, + "grad_norm": 1.453125, + "learning_rate": 3.4281500210703756e-05, + "loss": 1.2073, + "mean_token_accuracy": 0.7103234235197305, + "num_tokens": 430443301.0, + "step": 2460 + }, + { + "epoch": 0.6731506242654448, + "grad_norm": 1.3515625, + "learning_rate": 3.4211265627194836e-05, + "loss": 1.1158, + "mean_token_accuracy": 0.7272647397592664, + "num_tokens": 432149898.0, + "step": 2470 + }, + { + "epoch": 0.67587593043656, + "grad_norm": 1.375, + "learning_rate": 3.414103104368591e-05, + "loss": 1.1433, + "mean_token_accuracy": 0.723227025847882, + "num_tokens": 433904775.0, + "step": 2480 + }, + { + "epoch": 0.6786012366076751, + "grad_norm": 1.3515625, + "learning_rate": 3.407079646017699e-05, + "loss": 1.1448, + "mean_token_accuracy": 0.7215846830978989, + "num_tokens": 435648944.0, + "step": 2490 + }, + { + "epoch": 0.6813265427787903, + "grad_norm": 1.5625, + "learning_rate": 3.4000561876668076e-05, + "loss": 1.1869, + "mean_token_accuracy": 0.7150498968549073, + "num_tokens": 437390122.0, + "step": 2500 + }, + { + "epoch": 0.6840518489499054, + "grad_norm": 1.34375, + "learning_rate": 3.393032729315915e-05, + "loss": 1.1801, + "mean_token_accuracy": 0.7161353545263409, + "num_tokens": 439126311.0, + "step": 2510 + }, + { + "epoch": 0.6867771551210207, + "grad_norm": 1.2109375, + "learning_rate": 3.386009270965023e-05, + "loss": 1.1568, + "mean_token_accuracy": 0.7200920292176306, + "num_tokens": 440890933.0, + "step": 2520 + }, + { + "epoch": 0.6895024612921358, + "grad_norm": 1.53125, + "learning_rate": 3.378985812614132e-05, + "loss": 1.1541, + "mean_token_accuracy": 0.7206738693639636, + "num_tokens": 442611770.0, + "step": 2530 + }, + { + "epoch": 0.692227767463251, + "grad_norm": 1.4140625, + "learning_rate": 3.37196235426324e-05, + "loss": 1.1591, + "mean_token_accuracy": 0.7203968748450279, + "num_tokens": 444358166.0, + "step": 2540 + }, + { + "epoch": 0.6949530736343661, + "grad_norm": 1.2890625, + "learning_rate": 3.364938895912347e-05, + "loss": 1.1954, + "mean_token_accuracy": 0.7128487601876259, + "num_tokens": 446078790.0, + "step": 2550 + }, + { + "epoch": 0.6976783798054813, + "grad_norm": 1.421875, + "learning_rate": 3.357915437561455e-05, + "loss": 1.1582, + "mean_token_accuracy": 0.7187117761000991, + "num_tokens": 447811186.0, + "step": 2560 + }, + { + "epoch": 0.7004036859765964, + "grad_norm": 1.390625, + "learning_rate": 3.350891979210564e-05, + "loss": 1.1672, + "mean_token_accuracy": 0.7181063129566609, + "num_tokens": 449537962.0, + "step": 2570 + }, + { + "epoch": 0.7031289921477116, + "grad_norm": 1.296875, + "learning_rate": 3.343868520859672e-05, + "loss": 1.1331, + "mean_token_accuracy": 0.7246433124877513, + "num_tokens": 451259469.0, + "step": 2580 + }, + { + "epoch": 0.7058542983188267, + "grad_norm": 1.421875, + "learning_rate": 3.336845062508779e-05, + "loss": 1.1554, + "mean_token_accuracy": 0.7212818250060081, + "num_tokens": 453041146.0, + "step": 2590 + }, + { + "epoch": 0.708579604489942, + "grad_norm": 1.359375, + "learning_rate": 3.329821604157887e-05, + "loss": 1.1884, + "mean_token_accuracy": 0.7135334552265704, + "num_tokens": 454753651.0, + "step": 2600 + }, + { + "epoch": 0.711304910661057, + "grad_norm": 1.3515625, + "learning_rate": 3.322798145806996e-05, + "loss": 1.1484, + "mean_token_accuracy": 0.7207954367622733, + "num_tokens": 456537057.0, + "step": 2610 + }, + { + "epoch": 0.7140302168321723, + "grad_norm": 1.3359375, + "learning_rate": 3.315774687456103e-05, + "loss": 1.1981, + "mean_token_accuracy": 0.7136128084734082, + "num_tokens": 458271296.0, + "step": 2620 + }, + { + "epoch": 0.7167555230032874, + "grad_norm": 1.28125, + "learning_rate": 3.308751229105211e-05, + "loss": 1.1635, + "mean_token_accuracy": 0.7181222994811833, + "num_tokens": 460014932.0, + "step": 2630 + }, + { + "epoch": 0.7194808291744026, + "grad_norm": 1.421875, + "learning_rate": 3.30172777075432e-05, + "loss": 1.1776, + "mean_token_accuracy": 0.7165118259377777, + "num_tokens": 461818480.0, + "step": 2640 + }, + { + "epoch": 0.7222061353455177, + "grad_norm": 1.3984375, + "learning_rate": 3.294704312403428e-05, + "loss": 1.2023, + "mean_token_accuracy": 0.7107026267796754, + "num_tokens": 463519993.0, + "step": 2650 + }, + { + "epoch": 0.7249314415166329, + "grad_norm": 1.328125, + "learning_rate": 3.287680854052535e-05, + "loss": 1.1638, + "mean_token_accuracy": 0.717846125178039, + "num_tokens": 465333101.0, + "step": 2660 + }, + { + "epoch": 0.727656747687748, + "grad_norm": 1.3515625, + "learning_rate": 3.280657395701643e-05, + "loss": 1.1591, + "mean_token_accuracy": 0.718819803185761, + "num_tokens": 467047198.0, + "step": 2670 + }, + { + "epoch": 0.7303820538588632, + "grad_norm": 1.3671875, + "learning_rate": 3.273633937350752e-05, + "loss": 1.1734, + "mean_token_accuracy": 0.7186005939729512, + "num_tokens": 468806711.0, + "step": 2680 + }, + { + "epoch": 0.7331073600299783, + "grad_norm": 1.2890625, + "learning_rate": 3.26661047899986e-05, + "loss": 1.1231, + "mean_token_accuracy": 0.7262043844908476, + "num_tokens": 470600782.0, + "step": 2690 + }, + { + "epoch": 0.7358326662010936, + "grad_norm": 1.5234375, + "learning_rate": 3.2595870206489674e-05, + "loss": 1.1755, + "mean_token_accuracy": 0.7167527761310339, + "num_tokens": 472349330.0, + "step": 2700 + }, + { + "epoch": 0.7385579723722087, + "grad_norm": 1.2734375, + "learning_rate": 3.2525635622980754e-05, + "loss": 1.1535, + "mean_token_accuracy": 0.7221431139856577, + "num_tokens": 474106983.0, + "step": 2710 + }, + { + "epoch": 0.7412832785433239, + "grad_norm": 1.4140625, + "learning_rate": 3.245540103947184e-05, + "loss": 1.1995, + "mean_token_accuracy": 0.7117431500926614, + "num_tokens": 475902407.0, + "step": 2720 + }, + { + "epoch": 0.744008584714439, + "grad_norm": 1.2890625, + "learning_rate": 3.2385166455962914e-05, + "loss": 1.1039, + "mean_token_accuracy": 0.729970954824239, + "num_tokens": 477654644.0, + "step": 2730 + }, + { + "epoch": 0.7467338908855542, + "grad_norm": 1.3828125, + "learning_rate": 3.2314931872453994e-05, + "loss": 1.1845, + "mean_token_accuracy": 0.7140484706498682, + "num_tokens": 479409078.0, + "step": 2740 + }, + { + "epoch": 0.7494591970566693, + "grad_norm": 1.53125, + "learning_rate": 3.224469728894508e-05, + "loss": 1.1676, + "mean_token_accuracy": 0.7176982633769512, + "num_tokens": 481154972.0, + "step": 2750 + }, + { + "epoch": 0.7521845032277845, + "grad_norm": 1.546875, + "learning_rate": 3.217446270543616e-05, + "loss": 1.1947, + "mean_token_accuracy": 0.7132840578444302, + "num_tokens": 482957554.0, + "step": 2760 + }, + { + "epoch": 0.7549098093988996, + "grad_norm": 1.4921875, + "learning_rate": 3.2104228121927235e-05, + "loss": 1.163, + "mean_token_accuracy": 0.7188111429102719, + "num_tokens": 484730289.0, + "step": 2770 + }, + { + "epoch": 0.7576351155700148, + "grad_norm": 1.40625, + "learning_rate": 3.2033993538418315e-05, + "loss": 1.1678, + "mean_token_accuracy": 0.7163368194364012, + "num_tokens": 486390658.0, + "step": 2780 + }, + { + "epoch": 0.7603604217411299, + "grad_norm": 1.3515625, + "learning_rate": 3.19637589549094e-05, + "loss": 1.1218, + "mean_token_accuracy": 0.727223726734519, + "num_tokens": 488084357.0, + "step": 2790 + }, + { + "epoch": 0.7630857279122452, + "grad_norm": 1.265625, + "learning_rate": 3.189352437140048e-05, + "loss": 1.2129, + "mean_token_accuracy": 0.7102430663071573, + "num_tokens": 489867504.0, + "step": 2800 + }, + { + "epoch": 0.7658110340833603, + "grad_norm": 1.3125, + "learning_rate": 3.1823289787891556e-05, + "loss": 1.1838, + "mean_token_accuracy": 0.7139560039155185, + "num_tokens": 491614120.0, + "step": 2810 + }, + { + "epoch": 0.7685363402544755, + "grad_norm": 1.5, + "learning_rate": 3.1753055204382636e-05, + "loss": 1.1806, + "mean_token_accuracy": 0.7168294186703861, + "num_tokens": 493360108.0, + "step": 2820 + }, + { + "epoch": 0.7712616464255906, + "grad_norm": 1.4140625, + "learning_rate": 3.168282062087372e-05, + "loss": 1.1415, + "mean_token_accuracy": 0.7245752868242562, + "num_tokens": 495060867.0, + "step": 2830 + }, + { + "epoch": 0.7739869525967058, + "grad_norm": 1.359375, + "learning_rate": 3.16125860373648e-05, + "loss": 1.1694, + "mean_token_accuracy": 0.715926815662533, + "num_tokens": 496731129.0, + "step": 2840 + }, + { + "epoch": 0.7767122587678209, + "grad_norm": 1.265625, + "learning_rate": 3.1542351453855877e-05, + "loss": 1.1351, + "mean_token_accuracy": 0.7253266898915172, + "num_tokens": 498416929.0, + "step": 2850 + }, + { + "epoch": 0.7794375649389361, + "grad_norm": 1.4921875, + "learning_rate": 3.1472116870346964e-05, + "loss": 1.1225, + "mean_token_accuracy": 0.7269441090524197, + "num_tokens": 500131030.0, + "step": 2860 + }, + { + "epoch": 0.7821628711100512, + "grad_norm": 1.203125, + "learning_rate": 3.1401882286838044e-05, + "loss": 1.1295, + "mean_token_accuracy": 0.7248596154153347, + "num_tokens": 501860254.0, + "step": 2870 + }, + { + "epoch": 0.7848881772811664, + "grad_norm": 1.3828125, + "learning_rate": 3.133164770332912e-05, + "loss": 1.1921, + "mean_token_accuracy": 0.7137732055038214, + "num_tokens": 503599791.0, + "step": 2880 + }, + { + "epoch": 0.7876134834522815, + "grad_norm": 1.390625, + "learning_rate": 3.12614131198202e-05, + "loss": 1.2135, + "mean_token_accuracy": 0.7103905778378248, + "num_tokens": 505277304.0, + "step": 2890 + }, + { + "epoch": 0.7903387896233968, + "grad_norm": 1.3046875, + "learning_rate": 3.1191178536311284e-05, + "loss": 1.1608, + "mean_token_accuracy": 0.718811112549156, + "num_tokens": 507010506.0, + "step": 2900 + }, + { + "epoch": 0.7930640957945119, + "grad_norm": 1.2734375, + "learning_rate": 3.1120943952802364e-05, + "loss": 1.1394, + "mean_token_accuracy": 0.7234757107682526, + "num_tokens": 508717678.0, + "step": 2910 + }, + { + "epoch": 0.7957894019656271, + "grad_norm": 1.234375, + "learning_rate": 3.105070936929344e-05, + "loss": 1.116, + "mean_token_accuracy": 0.7291767308488488, + "num_tokens": 510478982.0, + "step": 2920 + }, + { + "epoch": 0.7985147081367422, + "grad_norm": 1.375, + "learning_rate": 3.0980474785784525e-05, + "loss": 1.1253, + "mean_token_accuracy": 0.7246225934475661, + "num_tokens": 512223492.0, + "step": 2930 + }, + { + "epoch": 0.8012400143078574, + "grad_norm": 1.28125, + "learning_rate": 3.0910240202275605e-05, + "loss": 1.1514, + "mean_token_accuracy": 0.7207284711301327, + "num_tokens": 513950826.0, + "step": 2940 + }, + { + "epoch": 0.8039653204789725, + "grad_norm": 1.3671875, + "learning_rate": 3.0840005618766685e-05, + "loss": 1.1871, + "mean_token_accuracy": 0.7151925875805318, + "num_tokens": 515747512.0, + "step": 2950 + }, + { + "epoch": 0.8066906266500877, + "grad_norm": 1.28125, + "learning_rate": 3.076977103525776e-05, + "loss": 1.1468, + "mean_token_accuracy": 0.7219850319437683, + "num_tokens": 517564408.0, + "step": 2960 + }, + { + "epoch": 0.8094159328212028, + "grad_norm": 1.2578125, + "learning_rate": 3.0699536451748846e-05, + "loss": 1.0791, + "mean_token_accuracy": 0.7344457570463419, + "num_tokens": 519335113.0, + "step": 2970 + }, + { + "epoch": 0.812141238992318, + "grad_norm": 1.2734375, + "learning_rate": 3.0629301868239926e-05, + "loss": 1.1204, + "mean_token_accuracy": 0.7272397927008569, + "num_tokens": 521052694.0, + "step": 2980 + }, + { + "epoch": 0.8148665451634332, + "grad_norm": 1.296875, + "learning_rate": 3.0559067284731e-05, + "loss": 1.1698, + "mean_token_accuracy": 0.7177247768267989, + "num_tokens": 522838886.0, + "step": 2990 + }, + { + "epoch": 0.8175918513345484, + "grad_norm": 1.2421875, + "learning_rate": 3.048883270122208e-05, + "loss": 1.1519, + "mean_token_accuracy": 0.720951104350388, + "num_tokens": 524659688.0, + "step": 3000 + }, + { + "epoch": 0.8203171575056635, + "grad_norm": 1.328125, + "learning_rate": 3.0418598117713166e-05, + "loss": 1.1328, + "mean_token_accuracy": 0.7240156752988696, + "num_tokens": 526428828.0, + "step": 3010 + }, + { + "epoch": 0.8230424636767787, + "grad_norm": 1.3359375, + "learning_rate": 3.0348363534204243e-05, + "loss": 1.1616, + "mean_token_accuracy": 0.7177340661175549, + "num_tokens": 528178357.0, + "step": 3020 + }, + { + "epoch": 0.8257677698478938, + "grad_norm": 1.484375, + "learning_rate": 3.0278128950695323e-05, + "loss": 1.1425, + "mean_token_accuracy": 0.7223709647543728, + "num_tokens": 529968661.0, + "step": 3030 + }, + { + "epoch": 0.828493076019009, + "grad_norm": 1.4375, + "learning_rate": 3.0207894367186407e-05, + "loss": 1.1466, + "mean_token_accuracy": 0.7239165339618922, + "num_tokens": 531735846.0, + "step": 3040 + }, + { + "epoch": 0.8312183821901241, + "grad_norm": 1.3046875, + "learning_rate": 3.0137659783677484e-05, + "loss": 1.1655, + "mean_token_accuracy": 0.7192427675239742, + "num_tokens": 533482605.0, + "step": 3050 + }, + { + "epoch": 0.8339436883612393, + "grad_norm": 1.34375, + "learning_rate": 3.0067425200168564e-05, + "loss": 1.1472, + "mean_token_accuracy": 0.7224777213297784, + "num_tokens": 535172117.0, + "step": 3060 + }, + { + "epoch": 0.8366689945323545, + "grad_norm": 1.375, + "learning_rate": 2.9997190616659644e-05, + "loss": 1.1414, + "mean_token_accuracy": 0.7228651619516313, + "num_tokens": 536922982.0, + "step": 3070 + }, + { + "epoch": 0.8393943007034697, + "grad_norm": 1.3828125, + "learning_rate": 2.9926956033150728e-05, + "loss": 1.1552, + "mean_token_accuracy": 0.7204573257826269, + "num_tokens": 538697561.0, + "step": 3080 + }, + { + "epoch": 0.8421196068745849, + "grad_norm": 1.359375, + "learning_rate": 2.9856721449641805e-05, + "loss": 1.1616, + "mean_token_accuracy": 0.7190472551621496, + "num_tokens": 540498809.0, + "step": 3090 + }, + { + "epoch": 0.8448449130457, + "grad_norm": 1.328125, + "learning_rate": 2.9786486866132885e-05, + "loss": 1.1207, + "mean_token_accuracy": 0.7262116202153266, + "num_tokens": 542220573.0, + "step": 3100 + }, + { + "epoch": 0.8475702192168152, + "grad_norm": 1.3984375, + "learning_rate": 2.971625228262396e-05, + "loss": 1.1447, + "mean_token_accuracy": 0.7223312626592815, + "num_tokens": 543995979.0, + "step": 3110 + }, + { + "epoch": 0.8502955253879303, + "grad_norm": 1.2890625, + "learning_rate": 2.964601769911505e-05, + "loss": 1.1643, + "mean_token_accuracy": 0.7170741960406304, + "num_tokens": 545762046.0, + "step": 3120 + }, + { + "epoch": 0.8530208315590455, + "grad_norm": 1.2109375, + "learning_rate": 2.9575783115606125e-05, + "loss": 1.1455, + "mean_token_accuracy": 0.7230727946385741, + "num_tokens": 547563120.0, + "step": 3130 + }, + { + "epoch": 0.8557461377301606, + "grad_norm": 1.3046875, + "learning_rate": 2.9505548532097206e-05, + "loss": 1.158, + "mean_token_accuracy": 0.7192152096889913, + "num_tokens": 549276605.0, + "step": 3140 + }, + { + "epoch": 0.8584714439012758, + "grad_norm": 1.3046875, + "learning_rate": 2.943531394858829e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.7245622499845922, + "num_tokens": 550991479.0, + "step": 3150 + }, + { + "epoch": 0.8611967500723909, + "grad_norm": 1.3203125, + "learning_rate": 2.9365079365079366e-05, + "loss": 1.1351, + "mean_token_accuracy": 0.7258178818039596, + "num_tokens": 552777662.0, + "step": 3160 + }, + { + "epoch": 0.8639220562435062, + "grad_norm": 1.3828125, + "learning_rate": 2.9294844781570446e-05, + "loss": 1.1166, + "mean_token_accuracy": 0.7274093635380268, + "num_tokens": 554539356.0, + "step": 3170 + }, + { + "epoch": 0.8666473624146213, + "grad_norm": 1.265625, + "learning_rate": 2.9224610198061526e-05, + "loss": 1.1907, + "mean_token_accuracy": 0.7132768167182804, + "num_tokens": 556212548.0, + "step": 3180 + }, + { + "epoch": 0.8693726685857365, + "grad_norm": 1.328125, + "learning_rate": 2.915437561455261e-05, + "loss": 1.1617, + "mean_token_accuracy": 0.7187118930742145, + "num_tokens": 557950461.0, + "step": 3190 + }, + { + "epoch": 0.8720979747568516, + "grad_norm": 1.296875, + "learning_rate": 2.9084141031043687e-05, + "loss": 1.1303, + "mean_token_accuracy": 0.72461500428617, + "num_tokens": 559715926.0, + "step": 3200 + }, + { + "epoch": 0.8748232809279668, + "grad_norm": 1.3125, + "learning_rate": 2.9013906447534767e-05, + "loss": 1.1556, + "mean_token_accuracy": 0.7212729568593204, + "num_tokens": 561479394.0, + "step": 3210 + }, + { + "epoch": 0.8775485870990819, + "grad_norm": 1.28125, + "learning_rate": 2.8943671864025844e-05, + "loss": 1.1465, + "mean_token_accuracy": 0.7218864490278065, + "num_tokens": 563271872.0, + "step": 3220 + }, + { + "epoch": 0.8802738932701971, + "grad_norm": 1.28125, + "learning_rate": 2.887343728051693e-05, + "loss": 1.1411, + "mean_token_accuracy": 0.723791983537376, + "num_tokens": 565002210.0, + "step": 3230 + }, + { + "epoch": 0.8829991994413122, + "grad_norm": 1.3359375, + "learning_rate": 2.8803202697008008e-05, + "loss": 1.0435, + "mean_token_accuracy": 0.7417100036516786, + "num_tokens": 566740287.0, + "step": 3240 + }, + { + "epoch": 0.8857245056124274, + "grad_norm": 1.2578125, + "learning_rate": 2.8732968113499088e-05, + "loss": 1.1746, + "mean_token_accuracy": 0.7177447673864663, + "num_tokens": 568560935.0, + "step": 3250 + }, + { + "epoch": 0.8884498117835425, + "grad_norm": 1.4296875, + "learning_rate": 2.866273352999017e-05, + "loss": 1.1609, + "mean_token_accuracy": 0.7196293252520263, + "num_tokens": 570290159.0, + "step": 3260 + }, + { + "epoch": 0.8911751179546578, + "grad_norm": 1.2890625, + "learning_rate": 2.8592498946481248e-05, + "loss": 1.1613, + "mean_token_accuracy": 0.719492181763053, + "num_tokens": 572076704.0, + "step": 3270 + }, + { + "epoch": 0.8939004241257729, + "grad_norm": 1.1953125, + "learning_rate": 2.852226436297233e-05, + "loss": 1.1808, + "mean_token_accuracy": 0.716338072437793, + "num_tokens": 573841975.0, + "step": 3280 + }, + { + "epoch": 0.8966257302968881, + "grad_norm": 1.3046875, + "learning_rate": 2.845202977946341e-05, + "loss": 1.145, + "mean_token_accuracy": 0.7234473955817521, + "num_tokens": 575587035.0, + "step": 3290 + }, + { + "epoch": 0.8993510364680032, + "grad_norm": 1.1953125, + "learning_rate": 2.8381795195954492e-05, + "loss": 1.1381, + "mean_token_accuracy": 0.7228204027749598, + "num_tokens": 577285852.0, + "step": 3300 + }, + { + "epoch": 0.9020763426391184, + "grad_norm": 1.3359375, + "learning_rate": 2.831156061244557e-05, + "loss": 1.1672, + "mean_token_accuracy": 0.7184297275729478, + "num_tokens": 579085053.0, + "step": 3310 + }, + { + "epoch": 0.9048016488102335, + "grad_norm": 1.3125, + "learning_rate": 2.824132602893665e-05, + "loss": 1.1226, + "mean_token_accuracy": 0.7262911381199956, + "num_tokens": 580874041.0, + "step": 3320 + }, + { + "epoch": 0.9075269549813487, + "grad_norm": 1.3359375, + "learning_rate": 2.8171091445427726e-05, + "loss": 1.123, + "mean_token_accuracy": 0.724399715103209, + "num_tokens": 582616757.0, + "step": 3330 + }, + { + "epoch": 0.9102522611524638, + "grad_norm": 1.3828125, + "learning_rate": 2.8100856861918813e-05, + "loss": 1.1253, + "mean_token_accuracy": 0.726380693167448, + "num_tokens": 584370810.0, + "step": 3340 + }, + { + "epoch": 0.912977567323579, + "grad_norm": 1.28125, + "learning_rate": 2.803062227840989e-05, + "loss": 1.1396, + "mean_token_accuracy": 0.7236145354807377, + "num_tokens": 586143330.0, + "step": 3350 + }, + { + "epoch": 0.9157028734946941, + "grad_norm": 1.2734375, + "learning_rate": 2.796038769490097e-05, + "loss": 1.0929, + "mean_token_accuracy": 0.7320468625053763, + "num_tokens": 587851884.0, + "step": 3360 + }, + { + "epoch": 0.9184281796658094, + "grad_norm": 1.40625, + "learning_rate": 2.7890153111392054e-05, + "loss": 1.1286, + "mean_token_accuracy": 0.7241552670486271, + "num_tokens": 589601267.0, + "step": 3370 + }, + { + "epoch": 0.9211534858369245, + "grad_norm": 1.375, + "learning_rate": 2.7819918527883134e-05, + "loss": 1.1709, + "mean_token_accuracy": 0.7167520637623965, + "num_tokens": 591321205.0, + "step": 3380 + }, + { + "epoch": 0.9238787920080397, + "grad_norm": 1.1875, + "learning_rate": 2.774968394437421e-05, + "loss": 1.0953, + "mean_token_accuracy": 0.7323742469772696, + "num_tokens": 593110103.0, + "step": 3390 + }, + { + "epoch": 0.9266040981791548, + "grad_norm": 1.3984375, + "learning_rate": 2.767944936086529e-05, + "loss": 1.1603, + "mean_token_accuracy": 0.7199522324837744, + "num_tokens": 594823039.0, + "step": 3400 + }, + { + "epoch": 0.92932940435027, + "grad_norm": 1.265625, + "learning_rate": 2.7609214777356374e-05, + "loss": 1.1437, + "mean_token_accuracy": 0.7233537461608648, + "num_tokens": 596625643.0, + "step": 3410 + }, + { + "epoch": 0.9320547105213851, + "grad_norm": 1.296875, + "learning_rate": 2.753898019384745e-05, + "loss": 1.164, + "mean_token_accuracy": 0.716280288156122, + "num_tokens": 598409404.0, + "step": 3420 + }, + { + "epoch": 0.9347800166925003, + "grad_norm": 1.5625, + "learning_rate": 2.746874561033853e-05, + "loss": 1.1393, + "mean_token_accuracy": 0.7229320453479886, + "num_tokens": 600131817.0, + "step": 3430 + }, + { + "epoch": 0.9375053228636154, + "grad_norm": 1.1640625, + "learning_rate": 2.7398511026829608e-05, + "loss": 1.1419, + "mean_token_accuracy": 0.7230301261879504, + "num_tokens": 601867823.0, + "step": 3440 + }, + { + "epoch": 0.9402306290347306, + "grad_norm": 1.3046875, + "learning_rate": 2.7328276443320695e-05, + "loss": 1.1177, + "mean_token_accuracy": 0.729860719665885, + "num_tokens": 603591263.0, + "step": 3450 + }, + { + "epoch": 0.9429559352058458, + "grad_norm": 1.5, + "learning_rate": 2.7258041859811772e-05, + "loss": 1.1591, + "mean_token_accuracy": 0.7213227171450853, + "num_tokens": 605400422.0, + "step": 3460 + }, + { + "epoch": 0.945681241376961, + "grad_norm": 1.3515625, + "learning_rate": 2.7187807276302852e-05, + "loss": 1.149, + "mean_token_accuracy": 0.722071444336325, + "num_tokens": 607180330.0, + "step": 3470 + }, + { + "epoch": 0.9484065475480761, + "grad_norm": 1.2421875, + "learning_rate": 2.7117572692793936e-05, + "loss": 1.1029, + "mean_token_accuracy": 0.730613834504038, + "num_tokens": 608937194.0, + "step": 3480 + }, + { + "epoch": 0.9511318537191913, + "grad_norm": 1.390625, + "learning_rate": 2.7047338109285016e-05, + "loss": 1.1518, + "mean_token_accuracy": 0.7203960535116494, + "num_tokens": 610676168.0, + "step": 3490 + }, + { + "epoch": 0.9538571598903064, + "grad_norm": 1.25, + "learning_rate": 2.6977103525776093e-05, + "loss": 1.097, + "mean_token_accuracy": 0.7318043757230044, + "num_tokens": 612491555.0, + "step": 3500 + }, + { + "epoch": 0.9565824660614216, + "grad_norm": 1.5703125, + "learning_rate": 2.6906868942267173e-05, + "loss": 1.158, + "mean_token_accuracy": 0.7188845065422356, + "num_tokens": 614310820.0, + "step": 3510 + }, + { + "epoch": 0.9593077722325367, + "grad_norm": 1.2734375, + "learning_rate": 2.6836634358758256e-05, + "loss": 1.1655, + "mean_token_accuracy": 0.7194117167033255, + "num_tokens": 616071087.0, + "step": 3520 + }, + { + "epoch": 0.9620330784036519, + "grad_norm": 1.3515625, + "learning_rate": 2.6766399775249333e-05, + "loss": 1.099, + "mean_token_accuracy": 0.7303661842830479, + "num_tokens": 617769166.0, + "step": 3530 + }, + { + "epoch": 0.964758384574767, + "grad_norm": 1.34375, + "learning_rate": 2.6696165191740413e-05, + "loss": 1.1328, + "mean_token_accuracy": 0.7244265062734485, + "num_tokens": 619538232.0, + "step": 3540 + }, + { + "epoch": 0.9674836907458823, + "grad_norm": 1.1328125, + "learning_rate": 2.662593060823149e-05, + "loss": 1.1299, + "mean_token_accuracy": 0.7269687331281602, + "num_tokens": 621303314.0, + "step": 3550 + }, + { + "epoch": 0.9702089969169974, + "grad_norm": 1.2109375, + "learning_rate": 2.6555696024722577e-05, + "loss": 1.1124, + "mean_token_accuracy": 0.7290478084236384, + "num_tokens": 623100776.0, + "step": 3560 + }, + { + "epoch": 0.9729343030881126, + "grad_norm": 1.234375, + "learning_rate": 2.6485461441213654e-05, + "loss": 1.1804, + "mean_token_accuracy": 0.7147768701426684, + "num_tokens": 624821413.0, + "step": 3570 + }, + { + "epoch": 0.9756596092592277, + "grad_norm": 1.3203125, + "learning_rate": 2.6415226857704734e-05, + "loss": 1.1439, + "mean_token_accuracy": 0.7215228925459087, + "num_tokens": 626628545.0, + "step": 3580 + }, + { + "epoch": 0.9783849154303429, + "grad_norm": 1.2578125, + "learning_rate": 2.6344992274195818e-05, + "loss": 1.1218, + "mean_token_accuracy": 0.7278467868454754, + "num_tokens": 628425036.0, + "step": 3590 + }, + { + "epoch": 0.981110221601458, + "grad_norm": 1.2265625, + "learning_rate": 2.6274757690686898e-05, + "loss": 1.0925, + "mean_token_accuracy": 0.7307519348338246, + "num_tokens": 630092739.0, + "step": 3600 + }, + { + "epoch": 0.9838355277725732, + "grad_norm": 1.4609375, + "learning_rate": 2.6204523107177975e-05, + "loss": 1.1461, + "mean_token_accuracy": 0.7233453346416354, + "num_tokens": 631791605.0, + "step": 3610 + }, + { + "epoch": 0.9865608339436883, + "grad_norm": 1.5625, + "learning_rate": 2.6134288523669055e-05, + "loss": 1.1312, + "mean_token_accuracy": 0.7232753919437528, + "num_tokens": 633554055.0, + "step": 3620 + }, + { + "epoch": 0.9892861401148035, + "grad_norm": 1.25, + "learning_rate": 2.606405394016014e-05, + "loss": 1.1021, + "mean_token_accuracy": 0.7298557332716882, + "num_tokens": 635269789.0, + "step": 3630 + }, + { + "epoch": 0.9920114462859186, + "grad_norm": 1.296875, + "learning_rate": 2.5993819356651215e-05, + "loss": 1.129, + "mean_token_accuracy": 0.7261236603371799, + "num_tokens": 637078092.0, + "step": 3640 + }, + { + "epoch": 0.9947367524570339, + "grad_norm": 1.3359375, + "learning_rate": 2.5923584773142296e-05, + "loss": 1.1019, + "mean_token_accuracy": 0.7314373353496194, + "num_tokens": 638813927.0, + "step": 3650 + }, + { + "epoch": 0.997462058628149, + "grad_norm": 1.3828125, + "learning_rate": 2.5853350189633372e-05, + "loss": 1.1339, + "mean_token_accuracy": 0.7248508833348751, + "num_tokens": 640594007.0, + "step": 3660 + }, + { + "epoch": 1.0, + "grad_norm": 0.65625, + "learning_rate": 2.578311560612446e-05, + "loss": 1.0198, + "mean_token_accuracy": 0.731230377150862, + "num_tokens": 642209396.0, + "step": 3670 + }, + { + "epoch": 1.0027253061711152, + "grad_norm": 1.3046875, + "learning_rate": 2.5712881022615536e-05, + "loss": 1.0856, + "mean_token_accuracy": 0.7334377626888454, + "num_tokens": 643925394.0, + "step": 3680 + }, + { + "epoch": 1.0054506123422304, + "grad_norm": 1.2890625, + "learning_rate": 2.5642646439106616e-05, + "loss": 1.1029, + "mean_token_accuracy": 0.728530184365809, + "num_tokens": 645722314.0, + "step": 3690 + }, + { + "epoch": 1.0081759185133454, + "grad_norm": 1.203125, + "learning_rate": 2.55724118555977e-05, + "loss": 1.0496, + "mean_token_accuracy": 0.7422402281314134, + "num_tokens": 647498896.0, + "step": 3700 + }, + { + "epoch": 1.0109012246844606, + "grad_norm": 1.15625, + "learning_rate": 2.550217727208878e-05, + "loss": 1.0701, + "mean_token_accuracy": 0.7356513783335685, + "num_tokens": 649263905.0, + "step": 3710 + }, + { + "epoch": 1.0136265308555759, + "grad_norm": 1.2265625, + "learning_rate": 2.5431942688579857e-05, + "loss": 1.0947, + "mean_token_accuracy": 0.7309626482427121, + "num_tokens": 651037567.0, + "step": 3720 + }, + { + "epoch": 1.016351837026691, + "grad_norm": 1.3125, + "learning_rate": 2.5361708105070937e-05, + "loss": 1.079, + "mean_token_accuracy": 0.7346455365419388, + "num_tokens": 652736955.0, + "step": 3730 + }, + { + "epoch": 1.019077143197806, + "grad_norm": 1.4296875, + "learning_rate": 2.529147352156202e-05, + "loss": 1.0438, + "mean_token_accuracy": 0.7416777711361646, + "num_tokens": 654479273.0, + "step": 3740 + }, + { + "epoch": 1.0218024493689213, + "grad_norm": 1.328125, + "learning_rate": 2.5221238938053098e-05, + "loss": 1.0706, + "mean_token_accuracy": 0.7361437531188131, + "num_tokens": 656259441.0, + "step": 3750 + }, + { + "epoch": 1.0245277555400365, + "grad_norm": 1.234375, + "learning_rate": 2.5151004354544178e-05, + "loss": 1.0571, + "mean_token_accuracy": 0.7389569613151252, + "num_tokens": 658034787.0, + "step": 3760 + }, + { + "epoch": 1.0272530617111517, + "grad_norm": 1.15625, + "learning_rate": 2.5080769771035258e-05, + "loss": 1.1168, + "mean_token_accuracy": 0.727650732640177, + "num_tokens": 659760393.0, + "step": 3770 + }, + { + "epoch": 1.0299783678822667, + "grad_norm": 1.2890625, + "learning_rate": 2.501053518752634e-05, + "loss": 1.0428, + "mean_token_accuracy": 0.7429992444813251, + "num_tokens": 661469264.0, + "step": 3780 + }, + { + "epoch": 1.032703674053382, + "grad_norm": 1.28125, + "learning_rate": 2.494030060401742e-05, + "loss": 1.0676, + "mean_token_accuracy": 0.7347254963591695, + "num_tokens": 663177784.0, + "step": 3790 + }, + { + "epoch": 1.0354289802244971, + "grad_norm": 1.15625, + "learning_rate": 2.4870066020508502e-05, + "loss": 1.0892, + "mean_token_accuracy": 0.7316924562677741, + "num_tokens": 664940637.0, + "step": 3800 + }, + { + "epoch": 1.0381542863956124, + "grad_norm": 1.2578125, + "learning_rate": 2.479983143699958e-05, + "loss": 1.0756, + "mean_token_accuracy": 0.7350615672767162, + "num_tokens": 666668007.0, + "step": 3810 + }, + { + "epoch": 1.0408795925667274, + "grad_norm": 1.171875, + "learning_rate": 2.4729596853490662e-05, + "loss": 1.1148, + "mean_token_accuracy": 0.7279157171025872, + "num_tokens": 668437185.0, + "step": 3820 + }, + { + "epoch": 1.0436048987378426, + "grad_norm": 1.2890625, + "learning_rate": 2.465936226998174e-05, + "loss": 1.1052, + "mean_token_accuracy": 0.7291578905656934, + "num_tokens": 670213712.0, + "step": 3830 + }, + { + "epoch": 1.0463302049089578, + "grad_norm": 1.1640625, + "learning_rate": 2.458912768647282e-05, + "loss": 1.0709, + "mean_token_accuracy": 0.7375737321563065, + "num_tokens": 672017818.0, + "step": 3840 + }, + { + "epoch": 1.049055511080073, + "grad_norm": 1.265625, + "learning_rate": 2.45188931029639e-05, + "loss": 1.0849, + "mean_token_accuracy": 0.7331707324832678, + "num_tokens": 673831446.0, + "step": 3850 + }, + { + "epoch": 1.051780817251188, + "grad_norm": 1.328125, + "learning_rate": 2.444865851945498e-05, + "loss": 1.0958, + "mean_token_accuracy": 0.7301594057120383, + "num_tokens": 675554873.0, + "step": 3860 + }, + { + "epoch": 1.0545061234223032, + "grad_norm": 1.2109375, + "learning_rate": 2.437842393594606e-05, + "loss": 1.0991, + "mean_token_accuracy": 0.7300675465725363, + "num_tokens": 677257984.0, + "step": 3870 + }, + { + "epoch": 1.0572314295934184, + "grad_norm": 1.25, + "learning_rate": 2.430818935243714e-05, + "loss": 1.0591, + "mean_token_accuracy": 0.7378762131556869, + "num_tokens": 678942202.0, + "step": 3880 + }, + { + "epoch": 1.0599567357645336, + "grad_norm": 1.328125, + "learning_rate": 2.4237954768928224e-05, + "loss": 1.1099, + "mean_token_accuracy": 0.7285673432983458, + "num_tokens": 680673377.0, + "step": 3890 + }, + { + "epoch": 1.0626820419356486, + "grad_norm": 1.3515625, + "learning_rate": 2.41677201854193e-05, + "loss": 1.0811, + "mean_token_accuracy": 0.7347205553203822, + "num_tokens": 682485772.0, + "step": 3900 + }, + { + "epoch": 1.0654073481067639, + "grad_norm": 1.28125, + "learning_rate": 2.4097485601910384e-05, + "loss": 1.057, + "mean_token_accuracy": 0.7373597668483853, + "num_tokens": 684233327.0, + "step": 3910 + }, + { + "epoch": 1.068132654277879, + "grad_norm": 1.140625, + "learning_rate": 2.402725101840146e-05, + "loss": 1.1118, + "mean_token_accuracy": 0.7285363599658012, + "num_tokens": 686004478.0, + "step": 3920 + }, + { + "epoch": 1.0708579604489943, + "grad_norm": 1.3359375, + "learning_rate": 2.3957016434892544e-05, + "loss": 1.1181, + "mean_token_accuracy": 0.7268542011268437, + "num_tokens": 687776176.0, + "step": 3930 + }, + { + "epoch": 1.0735832666201093, + "grad_norm": 1.265625, + "learning_rate": 2.388678185138362e-05, + "loss": 1.0644, + "mean_token_accuracy": 0.7378135416656733, + "num_tokens": 689522355.0, + "step": 3940 + }, + { + "epoch": 1.0763085727912245, + "grad_norm": 1.1796875, + "learning_rate": 2.38165472678747e-05, + "loss": 1.0758, + "mean_token_accuracy": 0.7359712744131685, + "num_tokens": 691309245.0, + "step": 3950 + }, + { + "epoch": 1.0790338789623397, + "grad_norm": 1.21875, + "learning_rate": 2.374631268436578e-05, + "loss": 1.0995, + "mean_token_accuracy": 0.729337589815259, + "num_tokens": 693050122.0, + "step": 3960 + }, + { + "epoch": 1.081759185133455, + "grad_norm": 1.265625, + "learning_rate": 2.3676078100856862e-05, + "loss": 1.0808, + "mean_token_accuracy": 0.7320249448530376, + "num_tokens": 694762288.0, + "step": 3970 + }, + { + "epoch": 1.08448449130457, + "grad_norm": 1.203125, + "learning_rate": 2.3605843517347942e-05, + "loss": 1.0738, + "mean_token_accuracy": 0.7363296593539417, + "num_tokens": 696531417.0, + "step": 3980 + }, + { + "epoch": 1.0872097974756851, + "grad_norm": 1.125, + "learning_rate": 2.3535608933839022e-05, + "loss": 1.0191, + "mean_token_accuracy": 0.745452574454248, + "num_tokens": 698257075.0, + "step": 3990 + }, + { + "epoch": 1.0899351036468004, + "grad_norm": 1.1796875, + "learning_rate": 2.3465374350330106e-05, + "loss": 1.0179, + "mean_token_accuracy": 0.7459572400897742, + "num_tokens": 699971132.0, + "step": 4000 + }, + { + "epoch": 1.0926604098179156, + "grad_norm": 1.1875, + "learning_rate": 2.3395139766821183e-05, + "loss": 1.1285, + "mean_token_accuracy": 0.7256388378329575, + "num_tokens": 701752105.0, + "step": 4010 + }, + { + "epoch": 1.0953857159890306, + "grad_norm": 1.2734375, + "learning_rate": 2.3324905183312266e-05, + "loss": 1.1014, + "mean_token_accuracy": 0.7292766220867634, + "num_tokens": 703473481.0, + "step": 4020 + }, + { + "epoch": 1.0981110221601458, + "grad_norm": 1.25, + "learning_rate": 2.3254670599803343e-05, + "loss": 1.0862, + "mean_token_accuracy": 0.7327531099319458, + "num_tokens": 705167075.0, + "step": 4030 + }, + { + "epoch": 1.100836328331261, + "grad_norm": 1.4609375, + "learning_rate": 2.3184436016294427e-05, + "loss": 1.0694, + "mean_token_accuracy": 0.7348756663501262, + "num_tokens": 706874911.0, + "step": 4040 + }, + { + "epoch": 1.1035616345023762, + "grad_norm": 1.25, + "learning_rate": 2.3114201432785503e-05, + "loss": 1.0529, + "mean_token_accuracy": 0.740631565824151, + "num_tokens": 708619583.0, + "step": 4050 + }, + { + "epoch": 1.1062869406734912, + "grad_norm": 1.3359375, + "learning_rate": 2.3043966849276587e-05, + "loss": 1.0997, + "mean_token_accuracy": 0.7288135625422001, + "num_tokens": 710446727.0, + "step": 4060 + }, + { + "epoch": 1.1090122468446064, + "grad_norm": 1.3515625, + "learning_rate": 2.2973732265767664e-05, + "loss": 1.0971, + "mean_token_accuracy": 0.7318846006877721, + "num_tokens": 712149650.0, + "step": 4070 + }, + { + "epoch": 1.1117375530157216, + "grad_norm": 1.1640625, + "learning_rate": 2.2903497682258744e-05, + "loss": 1.092, + "mean_token_accuracy": 0.7316927151754499, + "num_tokens": 713914558.0, + "step": 4080 + }, + { + "epoch": 1.1144628591868369, + "grad_norm": 1.21875, + "learning_rate": 2.2833263098749824e-05, + "loss": 1.0631, + "mean_token_accuracy": 0.7381871834397316, + "num_tokens": 715646472.0, + "step": 4090 + }, + { + "epoch": 1.1171881653579518, + "grad_norm": 1.2421875, + "learning_rate": 2.2763028515240904e-05, + "loss": 1.1446, + "mean_token_accuracy": 0.7214690452441573, + "num_tokens": 717350655.0, + "step": 4100 + }, + { + "epoch": 1.119913471529067, + "grad_norm": 1.2109375, + "learning_rate": 2.2692793931731988e-05, + "loss": 1.079, + "mean_token_accuracy": 0.7343419956974685, + "num_tokens": 719082198.0, + "step": 4110 + }, + { + "epoch": 1.1226387777001823, + "grad_norm": 1.2890625, + "learning_rate": 2.2622559348223065e-05, + "loss": 1.0727, + "mean_token_accuracy": 0.736398237477988, + "num_tokens": 720809425.0, + "step": 4120 + }, + { + "epoch": 1.1253640838712975, + "grad_norm": 1.140625, + "learning_rate": 2.255232476471415e-05, + "loss": 1.0772, + "mean_token_accuracy": 0.7357128752395511, + "num_tokens": 722599262.0, + "step": 4130 + }, + { + "epoch": 1.1280893900424125, + "grad_norm": 1.3125, + "learning_rate": 2.2482090181205225e-05, + "loss": 1.1143, + "mean_token_accuracy": 0.7266730224713683, + "num_tokens": 724347920.0, + "step": 4140 + }, + { + "epoch": 1.1308146962135277, + "grad_norm": 1.375, + "learning_rate": 2.241185559769631e-05, + "loss": 1.0939, + "mean_token_accuracy": 0.7320883464999497, + "num_tokens": 726148714.0, + "step": 4150 + }, + { + "epoch": 1.133540002384643, + "grad_norm": 1.2265625, + "learning_rate": 2.2341621014187386e-05, + "loss": 1.0558, + "mean_token_accuracy": 0.7377167707309127, + "num_tokens": 727926585.0, + "step": 4160 + }, + { + "epoch": 1.1362653085557581, + "grad_norm": 1.171875, + "learning_rate": 2.227138643067847e-05, + "loss": 1.0457, + "mean_token_accuracy": 0.740927542001009, + "num_tokens": 729670197.0, + "step": 4170 + }, + { + "epoch": 1.1389906147268731, + "grad_norm": 1.328125, + "learning_rate": 2.2201151847169546e-05, + "loss": 1.0763, + "mean_token_accuracy": 0.7344906724989414, + "num_tokens": 731485403.0, + "step": 4180 + }, + { + "epoch": 1.1417159208979883, + "grad_norm": 1.2421875, + "learning_rate": 2.2130917263660626e-05, + "loss": 1.1401, + "mean_token_accuracy": 0.7240152478218078, + "num_tokens": 733341300.0, + "step": 4190 + }, + { + "epoch": 1.1444412270691036, + "grad_norm": 1.234375, + "learning_rate": 2.206068268015171e-05, + "loss": 1.084, + "mean_token_accuracy": 0.731456073652953, + "num_tokens": 735058358.0, + "step": 4200 + }, + { + "epoch": 1.1471665332402188, + "grad_norm": 1.1328125, + "learning_rate": 2.1990448096642787e-05, + "loss": 1.0391, + "mean_token_accuracy": 0.741604351811111, + "num_tokens": 736766234.0, + "step": 4210 + }, + { + "epoch": 1.1498918394113338, + "grad_norm": 1.203125, + "learning_rate": 2.192021351313387e-05, + "loss": 1.0695, + "mean_token_accuracy": 0.7358593477867543, + "num_tokens": 738512811.0, + "step": 4220 + }, + { + "epoch": 1.152617145582449, + "grad_norm": 1.2578125, + "learning_rate": 2.1849978929624947e-05, + "loss": 1.0493, + "mean_token_accuracy": 0.7410455554723739, + "num_tokens": 740229199.0, + "step": 4230 + }, + { + "epoch": 1.1553424517535642, + "grad_norm": 1.2265625, + "learning_rate": 2.177974434611603e-05, + "loss": 1.1012, + "mean_token_accuracy": 0.7302769407629967, + "num_tokens": 741944899.0, + "step": 4240 + }, + { + "epoch": 1.1580677579246794, + "grad_norm": 1.1484375, + "learning_rate": 2.1709509762607107e-05, + "loss": 1.0743, + "mean_token_accuracy": 0.7353692702949047, + "num_tokens": 743693703.0, + "step": 4250 + }, + { + "epoch": 1.1607930640957944, + "grad_norm": 1.2734375, + "learning_rate": 2.163927517909819e-05, + "loss": 1.0655, + "mean_token_accuracy": 0.7366907062008977, + "num_tokens": 745478644.0, + "step": 4260 + }, + { + "epoch": 1.1635183702669096, + "grad_norm": 1.296875, + "learning_rate": 2.1569040595589268e-05, + "loss": 1.0476, + "mean_token_accuracy": 0.7388629827648401, + "num_tokens": 747177577.0, + "step": 4270 + }, + { + "epoch": 1.1662436764380248, + "grad_norm": 1.15625, + "learning_rate": 2.149880601208035e-05, + "loss": 1.0586, + "mean_token_accuracy": 0.7396366925910115, + "num_tokens": 748939185.0, + "step": 4280 + }, + { + "epoch": 1.16896898260914, + "grad_norm": 2.015625, + "learning_rate": 2.1428571428571428e-05, + "loss": 1.067, + "mean_token_accuracy": 0.7374932891689241, + "num_tokens": 750713451.0, + "step": 4290 + }, + { + "epoch": 1.1716942887802553, + "grad_norm": 1.3046875, + "learning_rate": 2.135833684506251e-05, + "loss": 1.0906, + "mean_token_accuracy": 0.7312643750570714, + "num_tokens": 752413608.0, + "step": 4300 + }, + { + "epoch": 1.1744195949513703, + "grad_norm": 1.2265625, + "learning_rate": 2.1288102261553592e-05, + "loss": 1.0904, + "mean_token_accuracy": 0.7340483049862087, + "num_tokens": 754171875.0, + "step": 4310 + }, + { + "epoch": 1.1771449011224855, + "grad_norm": 1.171875, + "learning_rate": 2.121786767804467e-05, + "loss": 1.0408, + "mean_token_accuracy": 0.7425878893584013, + "num_tokens": 755959283.0, + "step": 4320 + }, + { + "epoch": 1.1798702072936007, + "grad_norm": 1.1875, + "learning_rate": 2.1147633094535752e-05, + "loss": 1.0851, + "mean_token_accuracy": 0.7337506388314068, + "num_tokens": 757784699.0, + "step": 4330 + }, + { + "epoch": 1.1825955134647157, + "grad_norm": 1.2109375, + "learning_rate": 2.107739851102683e-05, + "loss": 1.0489, + "mean_token_accuracy": 0.7420765064656735, + "num_tokens": 759536372.0, + "step": 4340 + }, + { + "epoch": 1.185320819635831, + "grad_norm": 1.21875, + "learning_rate": 2.1007163927517913e-05, + "loss": 1.0519, + "mean_token_accuracy": 0.7388960063457489, + "num_tokens": 761294878.0, + "step": 4350 + }, + { + "epoch": 1.1880461258069461, + "grad_norm": 1.21875, + "learning_rate": 2.093692934400899e-05, + "loss": 1.0837, + "mean_token_accuracy": 0.7335819829255342, + "num_tokens": 763041449.0, + "step": 4360 + }, + { + "epoch": 1.1907714319780613, + "grad_norm": 1.3515625, + "learning_rate": 2.0866694760500073e-05, + "loss": 1.05, + "mean_token_accuracy": 0.7396579059772194, + "num_tokens": 764760036.0, + "step": 4370 + }, + { + "epoch": 1.1934967381491766, + "grad_norm": 1.2890625, + "learning_rate": 2.079646017699115e-05, + "loss": 1.0973, + "mean_token_accuracy": 0.7316338730975985, + "num_tokens": 766503555.0, + "step": 4380 + }, + { + "epoch": 1.1962220443202916, + "grad_norm": 1.421875, + "learning_rate": 2.0726225593482233e-05, + "loss": 1.0779, + "mean_token_accuracy": 0.734508345182985, + "num_tokens": 768216315.0, + "step": 4390 + }, + { + "epoch": 1.1989473504914068, + "grad_norm": 1.296875, + "learning_rate": 2.065599100997331e-05, + "loss": 1.0198, + "mean_token_accuracy": 0.7466691703535616, + "num_tokens": 769929999.0, + "step": 4400 + }, + { + "epoch": 1.201672656662522, + "grad_norm": 1.265625, + "learning_rate": 2.058575642646439e-05, + "loss": 1.1561, + "mean_token_accuracy": 0.719224436301738, + "num_tokens": 771653789.0, + "step": 4410 + }, + { + "epoch": 1.204397962833637, + "grad_norm": 1.15625, + "learning_rate": 2.0515521842955474e-05, + "loss": 1.0996, + "mean_token_accuracy": 0.7297371865250171, + "num_tokens": 773424211.0, + "step": 4420 + }, + { + "epoch": 1.2071232690047522, + "grad_norm": 1.3125, + "learning_rate": 2.044528725944655e-05, + "loss": 1.0974, + "mean_token_accuracy": 0.7300912352278829, + "num_tokens": 775177147.0, + "step": 4430 + }, + { + "epoch": 1.2098485751758674, + "grad_norm": 1.2265625, + "learning_rate": 2.0375052675937634e-05, + "loss": 1.1137, + "mean_token_accuracy": 0.7295114539563656, + "num_tokens": 776924632.0, + "step": 4440 + }, + { + "epoch": 1.2125738813469826, + "grad_norm": 1.203125, + "learning_rate": 2.030481809242871e-05, + "loss": 1.0753, + "mean_token_accuracy": 0.7359099732711911, + "num_tokens": 778669132.0, + "step": 4450 + }, + { + "epoch": 1.2152991875180978, + "grad_norm": 1.234375, + "learning_rate": 2.0234583508919795e-05, + "loss": 1.0586, + "mean_token_accuracy": 0.7383248614147305, + "num_tokens": 780470551.0, + "step": 4460 + }, + { + "epoch": 1.2180244936892128, + "grad_norm": 1.15625, + "learning_rate": 2.016434892541087e-05, + "loss": 1.1032, + "mean_token_accuracy": 0.7300608051009476, + "num_tokens": 782177060.0, + "step": 4470 + }, + { + "epoch": 1.220749799860328, + "grad_norm": 1.28125, + "learning_rate": 2.0094114341901955e-05, + "loss": 1.0876, + "mean_token_accuracy": 0.7326032761484385, + "num_tokens": 783933412.0, + "step": 4480 + }, + { + "epoch": 1.2234751060314433, + "grad_norm": 1.25, + "learning_rate": 2.0023879758393032e-05, + "loss": 1.0592, + "mean_token_accuracy": 0.7380810688249767, + "num_tokens": 785660774.0, + "step": 4490 + }, + { + "epoch": 1.2262004122025583, + "grad_norm": 1.2890625, + "learning_rate": 1.9953645174884116e-05, + "loss": 1.0827, + "mean_token_accuracy": 0.7343300872482359, + "num_tokens": 787392675.0, + "step": 4500 + }, + { + "epoch": 1.2289257183736735, + "grad_norm": 1.1796875, + "learning_rate": 1.9883410591375192e-05, + "loss": 1.0919, + "mean_token_accuracy": 0.7318628188222647, + "num_tokens": 789226322.0, + "step": 4510 + }, + { + "epoch": 1.2316510245447887, + "grad_norm": 2.171875, + "learning_rate": 1.9813176007866273e-05, + "loss": 1.09, + "mean_token_accuracy": 0.7309126630425453, + "num_tokens": 790926661.0, + "step": 4520 + }, + { + "epoch": 1.234376330715904, + "grad_norm": 1.2421875, + "learning_rate": 1.9742941424357356e-05, + "loss": 1.11, + "mean_token_accuracy": 0.729132838267833, + "num_tokens": 792705504.0, + "step": 4530 + }, + { + "epoch": 1.2371016368870191, + "grad_norm": 1.1796875, + "learning_rate": 1.9672706840848433e-05, + "loss": 1.0787, + "mean_token_accuracy": 0.7340411549434066, + "num_tokens": 794463296.0, + "step": 4540 + }, + { + "epoch": 1.2398269430581341, + "grad_norm": 1.28125, + "learning_rate": 1.9602472257339517e-05, + "loss": 1.0449, + "mean_token_accuracy": 0.73929683547467, + "num_tokens": 796199027.0, + "step": 4550 + }, + { + "epoch": 1.2425522492292493, + "grad_norm": 1.1484375, + "learning_rate": 1.9532237673830593e-05, + "loss": 1.0779, + "mean_token_accuracy": 0.7336781222373248, + "num_tokens": 797958936.0, + "step": 4560 + }, + { + "epoch": 1.2452775554003646, + "grad_norm": 1.2265625, + "learning_rate": 1.9462003090321677e-05, + "loss": 1.0881, + "mean_token_accuracy": 0.7317794054746628, + "num_tokens": 799700358.0, + "step": 4570 + }, + { + "epoch": 1.2480028615714795, + "grad_norm": 1.2265625, + "learning_rate": 1.9391768506812754e-05, + "loss": 1.0845, + "mean_token_accuracy": 0.733612988423556, + "num_tokens": 801489379.0, + "step": 4580 + }, + { + "epoch": 1.2507281677425948, + "grad_norm": 1.21875, + "learning_rate": 1.9321533923303837e-05, + "loss": 1.0885, + "mean_token_accuracy": 0.7329282435588539, + "num_tokens": 803248320.0, + "step": 4590 + }, + { + "epoch": 1.25345347391371, + "grad_norm": 1.2265625, + "learning_rate": 1.9251299339794914e-05, + "loss": 1.0994, + "mean_token_accuracy": 0.73003611844033, + "num_tokens": 804952704.0, + "step": 4600 + }, + { + "epoch": 1.2561787800848252, + "grad_norm": 1.25, + "learning_rate": 1.9181064756285998e-05, + "loss": 1.0691, + "mean_token_accuracy": 0.736358674056828, + "num_tokens": 806596269.0, + "step": 4610 + }, + { + "epoch": 1.2589040862559404, + "grad_norm": 1.1875, + "learning_rate": 1.9110830172777075e-05, + "loss": 1.0962, + "mean_token_accuracy": 0.7311920085921884, + "num_tokens": 808364023.0, + "step": 4620 + }, + { + "epoch": 1.2616293924270554, + "grad_norm": 1.4453125, + "learning_rate": 1.9040595589268155e-05, + "loss": 1.0895, + "mean_token_accuracy": 0.7322369864210486, + "num_tokens": 810145656.0, + "step": 4630 + }, + { + "epoch": 1.2643546985981706, + "grad_norm": 1.25, + "learning_rate": 1.897036100575924e-05, + "loss": 1.0942, + "mean_token_accuracy": 0.7318614183925092, + "num_tokens": 811942362.0, + "step": 4640 + }, + { + "epoch": 1.2670800047692858, + "grad_norm": 1.234375, + "learning_rate": 1.8900126422250315e-05, + "loss": 1.0979, + "mean_token_accuracy": 0.7316563298925758, + "num_tokens": 813665193.0, + "step": 4650 + }, + { + "epoch": 1.2698053109404008, + "grad_norm": 1.2265625, + "learning_rate": 1.88298918387414e-05, + "loss": 1.1026, + "mean_token_accuracy": 0.7298440636135638, + "num_tokens": 815388624.0, + "step": 4660 + }, + { + "epoch": 1.272530617111516, + "grad_norm": 1.3125, + "learning_rate": 1.8759657255232476e-05, + "loss": 1.0693, + "mean_token_accuracy": 0.7348106381483376, + "num_tokens": 817088463.0, + "step": 4670 + }, + { + "epoch": 1.2752559232826313, + "grad_norm": 1.28125, + "learning_rate": 1.868942267172356e-05, + "loss": 1.0429, + "mean_token_accuracy": 0.741504667326808, + "num_tokens": 818867377.0, + "step": 4680 + }, + { + "epoch": 1.2779812294537465, + "grad_norm": 1.078125, + "learning_rate": 1.8619188088214636e-05, + "loss": 1.0893, + "mean_token_accuracy": 0.7326997134834528, + "num_tokens": 820718067.0, + "step": 4690 + }, + { + "epoch": 1.2807065356248617, + "grad_norm": 1.1953125, + "learning_rate": 1.854895350470572e-05, + "loss": 1.0617, + "mean_token_accuracy": 0.7387546991929412, + "num_tokens": 822427113.0, + "step": 4700 + }, + { + "epoch": 1.2834318417959767, + "grad_norm": 1.1875, + "learning_rate": 1.8478718921196796e-05, + "loss": 1.0449, + "mean_token_accuracy": 0.7418490494601429, + "num_tokens": 824202933.0, + "step": 4710 + }, + { + "epoch": 1.286157147967092, + "grad_norm": 1.1640625, + "learning_rate": 1.840848433768788e-05, + "loss": 1.038, + "mean_token_accuracy": 0.7441758699715137, + "num_tokens": 825952635.0, + "step": 4720 + }, + { + "epoch": 1.2888824541382071, + "grad_norm": 1.171875, + "learning_rate": 1.833824975417896e-05, + "loss": 1.0655, + "mean_token_accuracy": 0.7380225669592619, + "num_tokens": 827717059.0, + "step": 4730 + }, + { + "epoch": 1.2916077603093221, + "grad_norm": 1.2109375, + "learning_rate": 1.8268015170670037e-05, + "loss": 1.096, + "mean_token_accuracy": 0.7332455677911639, + "num_tokens": 829416511.0, + "step": 4740 + }, + { + "epoch": 1.2943330664804373, + "grad_norm": 1.1796875, + "learning_rate": 1.819778058716112e-05, + "loss": 1.0873, + "mean_token_accuracy": 0.7343688279390335, + "num_tokens": 831144366.0, + "step": 4750 + }, + { + "epoch": 1.2970583726515525, + "grad_norm": 1.2265625, + "learning_rate": 1.8127546003652197e-05, + "loss": 1.0737, + "mean_token_accuracy": 0.7364111577160657, + "num_tokens": 832951363.0, + "step": 4760 + }, + { + "epoch": 1.2997836788226678, + "grad_norm": 1.203125, + "learning_rate": 1.805731142014328e-05, + "loss": 1.0169, + "mean_token_accuracy": 0.7455294409766793, + "num_tokens": 834718890.0, + "step": 4770 + }, + { + "epoch": 1.302508984993783, + "grad_norm": 1.2578125, + "learning_rate": 1.7987076836634358e-05, + "loss": 1.0805, + "mean_token_accuracy": 0.7351347375661135, + "num_tokens": 836486880.0, + "step": 4780 + }, + { + "epoch": 1.305234291164898, + "grad_norm": 1.3046875, + "learning_rate": 1.791684225312544e-05, + "loss": 1.0653, + "mean_token_accuracy": 0.7369420503266155, + "num_tokens": 838202213.0, + "step": 4790 + }, + { + "epoch": 1.3079595973360132, + "grad_norm": 1.203125, + "learning_rate": 1.7846607669616518e-05, + "loss": 1.0477, + "mean_token_accuracy": 0.740178300999105, + "num_tokens": 839960268.0, + "step": 4800 + }, + { + "epoch": 1.3106849035071284, + "grad_norm": 1.078125, + "learning_rate": 1.7776373086107602e-05, + "loss": 1.0438, + "mean_token_accuracy": 0.7405985148623586, + "num_tokens": 841674015.0, + "step": 4810 + }, + { + "epoch": 1.3134102096782436, + "grad_norm": 1.25, + "learning_rate": 1.770613850259868e-05, + "loss": 1.0645, + "mean_token_accuracy": 0.7378088518977165, + "num_tokens": 843396816.0, + "step": 4820 + }, + { + "epoch": 1.3161355158493588, + "grad_norm": 1.1875, + "learning_rate": 1.7635903919089762e-05, + "loss": 1.0473, + "mean_token_accuracy": 0.739738754928112, + "num_tokens": 845043717.0, + "step": 4830 + }, + { + "epoch": 1.3188608220204738, + "grad_norm": 1.25, + "learning_rate": 1.7565669335580842e-05, + "loss": 1.0559, + "mean_token_accuracy": 0.7385650574229657, + "num_tokens": 846826112.0, + "step": 4840 + }, + { + "epoch": 1.321586128191589, + "grad_norm": 1.2578125, + "learning_rate": 1.7495434752071923e-05, + "loss": 1.0778, + "mean_token_accuracy": 0.7339285519905389, + "num_tokens": 848585637.0, + "step": 4850 + }, + { + "epoch": 1.3243114343627043, + "grad_norm": 1.28125, + "learning_rate": 1.7425200168563003e-05, + "loss": 1.0543, + "mean_token_accuracy": 0.7388118118979037, + "num_tokens": 850326238.0, + "step": 4860 + }, + { + "epoch": 1.3270367405338193, + "grad_norm": 1.21875, + "learning_rate": 1.735496558505408e-05, + "loss": 1.0513, + "mean_token_accuracy": 0.740250199381262, + "num_tokens": 852155502.0, + "step": 4870 + }, + { + "epoch": 1.3297620467049345, + "grad_norm": 1.234375, + "learning_rate": 1.7284731001545163e-05, + "loss": 1.0967, + "mean_token_accuracy": 0.7321735432371497, + "num_tokens": 853949772.0, + "step": 4880 + }, + { + "epoch": 1.3324873528760497, + "grad_norm": 1.25, + "learning_rate": 1.721449641803624e-05, + "loss": 1.0778, + "mean_token_accuracy": 0.7355785867199301, + "num_tokens": 855766767.0, + "step": 4890 + }, + { + "epoch": 1.335212659047165, + "grad_norm": 1.2421875, + "learning_rate": 1.7144261834527323e-05, + "loss": 1.1086, + "mean_token_accuracy": 0.7283882515504956, + "num_tokens": 857505341.0, + "step": 4900 + }, + { + "epoch": 1.3379379652182801, + "grad_norm": 1.234375, + "learning_rate": 1.70740272510184e-05, + "loss": 1.0629, + "mean_token_accuracy": 0.7374999332241714, + "num_tokens": 859262891.0, + "step": 4910 + }, + { + "epoch": 1.3406632713893951, + "grad_norm": 1.1953125, + "learning_rate": 1.7003792667509484e-05, + "loss": 1.0928, + "mean_token_accuracy": 0.7319379338994623, + "num_tokens": 861020281.0, + "step": 4920 + }, + { + "epoch": 1.3433885775605103, + "grad_norm": 1.2578125, + "learning_rate": 1.693355808400056e-05, + "loss": 1.0864, + "mean_token_accuracy": 0.732258545793593, + "num_tokens": 862708279.0, + "step": 4930 + }, + { + "epoch": 1.3461138837316255, + "grad_norm": 1.2421875, + "learning_rate": 1.6863323500491644e-05, + "loss": 1.0786, + "mean_token_accuracy": 0.7352613844908774, + "num_tokens": 864441387.0, + "step": 4940 + }, + { + "epoch": 1.3488391899027405, + "grad_norm": 1.21875, + "learning_rate": 1.6793088916982724e-05, + "loss": 1.0321, + "mean_token_accuracy": 0.7430288815870881, + "num_tokens": 866182511.0, + "step": 4950 + }, + { + "epoch": 1.3515644960738558, + "grad_norm": 1.2890625, + "learning_rate": 1.6722854333473805e-05, + "loss": 1.0547, + "mean_token_accuracy": 0.7388356210663914, + "num_tokens": 867919731.0, + "step": 4960 + }, + { + "epoch": 1.354289802244971, + "grad_norm": 1.1796875, + "learning_rate": 1.6652619749964885e-05, + "loss": 1.1159, + "mean_token_accuracy": 0.7274195526726543, + "num_tokens": 869673959.0, + "step": 4970 + }, + { + "epoch": 1.3570151084160862, + "grad_norm": 1.109375, + "learning_rate": 1.658238516645596e-05, + "loss": 1.0679, + "mean_token_accuracy": 0.7365593810565769, + "num_tokens": 871480789.0, + "step": 4980 + }, + { + "epoch": 1.3597404145872014, + "grad_norm": 1.296875, + "learning_rate": 1.6512150582947045e-05, + "loss": 1.0614, + "mean_token_accuracy": 0.7389228995889425, + "num_tokens": 873221278.0, + "step": 4990 + }, + { + "epoch": 1.3624657207583164, + "grad_norm": 1.265625, + "learning_rate": 1.6441915999438122e-05, + "loss": 1.0835, + "mean_token_accuracy": 0.7324470828287304, + "num_tokens": 874968880.0, + "step": 5000 + }, + { + "epoch": 1.3651910269294316, + "grad_norm": 1.1953125, + "learning_rate": 1.6371681415929206e-05, + "loss": 1.1162, + "mean_token_accuracy": 0.7266049144789577, + "num_tokens": 876752750.0, + "step": 5010 + }, + { + "epoch": 1.3679163331005468, + "grad_norm": 1.1640625, + "learning_rate": 1.6301446832420282e-05, + "loss": 1.0537, + "mean_token_accuracy": 0.7392234316095709, + "num_tokens": 878505084.0, + "step": 5020 + }, + { + "epoch": 1.3706416392716618, + "grad_norm": 1.15625, + "learning_rate": 1.6231212248911366e-05, + "loss": 1.1168, + "mean_token_accuracy": 0.7284122928977013, + "num_tokens": 880302245.0, + "step": 5030 + }, + { + "epoch": 1.373366945442777, + "grad_norm": 1.21875, + "learning_rate": 1.6160977665402443e-05, + "loss": 1.0849, + "mean_token_accuracy": 0.7335296958684921, + "num_tokens": 882111291.0, + "step": 5040 + }, + { + "epoch": 1.3760922516138923, + "grad_norm": 1.1875, + "learning_rate": 1.6090743081893526e-05, + "loss": 1.036, + "mean_token_accuracy": 0.7431163294240832, + "num_tokens": 883876821.0, + "step": 5050 + }, + { + "epoch": 1.3788175577850075, + "grad_norm": 1.3203125, + "learning_rate": 1.6020508498384607e-05, + "loss": 1.0986, + "mean_token_accuracy": 0.7319642775692046, + "num_tokens": 885619837.0, + "step": 5060 + }, + { + "epoch": 1.3815428639561227, + "grad_norm": 1.2578125, + "learning_rate": 1.5950273914875687e-05, + "loss": 1.0956, + "mean_token_accuracy": 0.7325851460918784, + "num_tokens": 887378117.0, + "step": 5070 + }, + { + "epoch": 1.3842681701272377, + "grad_norm": 1.203125, + "learning_rate": 1.5880039331366767e-05, + "loss": 1.0692, + "mean_token_accuracy": 0.7359711118973792, + "num_tokens": 889137008.0, + "step": 5080 + }, + { + "epoch": 1.386993476298353, + "grad_norm": 1.1875, + "learning_rate": 1.5809804747857844e-05, + "loss": 1.0943, + "mean_token_accuracy": 0.733759019849822, + "num_tokens": 890871255.0, + "step": 5090 + }, + { + "epoch": 1.3897187824694681, + "grad_norm": 1.2265625, + "learning_rate": 1.5739570164348927e-05, + "loss": 1.0367, + "mean_token_accuracy": 0.7433673735707998, + "num_tokens": 892626679.0, + "step": 5100 + }, + { + "epoch": 1.392444088640583, + "grad_norm": 1.328125, + "learning_rate": 1.5669335580840004e-05, + "loss": 1.0858, + "mean_token_accuracy": 0.7325516594573855, + "num_tokens": 894371346.0, + "step": 5110 + }, + { + "epoch": 1.3951693948116983, + "grad_norm": 1.2109375, + "learning_rate": 1.5599100997331088e-05, + "loss": 1.0597, + "mean_token_accuracy": 0.7366720103658736, + "num_tokens": 896171792.0, + "step": 5120 + }, + { + "epoch": 1.3978947009828135, + "grad_norm": 1.25, + "learning_rate": 1.5528866413822165e-05, + "loss": 1.0912, + "mean_token_accuracy": 0.7331433389335871, + "num_tokens": 897925465.0, + "step": 5130 + }, + { + "epoch": 1.4006200071539288, + "grad_norm": 1.1953125, + "learning_rate": 1.5458631830313248e-05, + "loss": 1.0963, + "mean_token_accuracy": 0.7325556992553175, + "num_tokens": 899668162.0, + "step": 5140 + }, + { + "epoch": 1.403345313325044, + "grad_norm": 1.3046875, + "learning_rate": 1.5388397246804325e-05, + "loss": 1.0856, + "mean_token_accuracy": 0.7329106355085969, + "num_tokens": 901385964.0, + "step": 5150 + }, + { + "epoch": 1.406070619496159, + "grad_norm": 1.34375, + "learning_rate": 1.531816266329541e-05, + "loss": 1.1263, + "mean_token_accuracy": 0.7270981592126191, + "num_tokens": 903167942.0, + "step": 5160 + }, + { + "epoch": 1.4087959256672742, + "grad_norm": 1.1953125, + "learning_rate": 1.5247928079786489e-05, + "loss": 1.0742, + "mean_token_accuracy": 0.7350091654807329, + "num_tokens": 904903509.0, + "step": 5170 + }, + { + "epoch": 1.4115212318383894, + "grad_norm": 1.21875, + "learning_rate": 1.5177693496277567e-05, + "loss": 1.0731, + "mean_token_accuracy": 0.7360517351888121, + "num_tokens": 906716098.0, + "step": 5180 + }, + { + "epoch": 1.4142465380095044, + "grad_norm": 1.3515625, + "learning_rate": 1.510745891276865e-05, + "loss": 1.1295, + "mean_token_accuracy": 0.7244786282069982, + "num_tokens": 908350817.0, + "step": 5190 + }, + { + "epoch": 1.4169718441806196, + "grad_norm": 1.2421875, + "learning_rate": 1.5037224329259728e-05, + "loss": 1.1112, + "mean_token_accuracy": 0.7273098705336452, + "num_tokens": 910114038.0, + "step": 5200 + }, + { + "epoch": 1.4196971503517348, + "grad_norm": 1.296875, + "learning_rate": 1.496698974575081e-05, + "loss": 1.0635, + "mean_token_accuracy": 0.7377636540681124, + "num_tokens": 911849057.0, + "step": 5210 + }, + { + "epoch": 1.42242245652285, + "grad_norm": 1.40625, + "learning_rate": 1.4896755162241888e-05, + "loss": 1.1131, + "mean_token_accuracy": 0.7273770812898874, + "num_tokens": 913608571.0, + "step": 5220 + }, + { + "epoch": 1.4251477626939653, + "grad_norm": 1.1875, + "learning_rate": 1.482652057873297e-05, + "loss": 1.0667, + "mean_token_accuracy": 0.7367195668630302, + "num_tokens": 915305445.0, + "step": 5230 + }, + { + "epoch": 1.4278730688650803, + "grad_norm": 1.25, + "learning_rate": 1.4756285995224048e-05, + "loss": 1.0784, + "mean_token_accuracy": 0.7353367758914828, + "num_tokens": 917060810.0, + "step": 5240 + }, + { + "epoch": 1.4305983750361955, + "grad_norm": 1.3203125, + "learning_rate": 1.4686051411715129e-05, + "loss": 1.107, + "mean_token_accuracy": 0.7298235032707453, + "num_tokens": 918774532.0, + "step": 5250 + }, + { + "epoch": 1.4333236812073107, + "grad_norm": 1.1953125, + "learning_rate": 1.461581682820621e-05, + "loss": 1.0779, + "mean_token_accuracy": 0.7341393328271806, + "num_tokens": 920510936.0, + "step": 5260 + }, + { + "epoch": 1.4360489873784257, + "grad_norm": 1.140625, + "learning_rate": 1.4545582244697289e-05, + "loss": 1.0601, + "mean_token_accuracy": 0.7375747452490031, + "num_tokens": 922287282.0, + "step": 5270 + }, + { + "epoch": 1.438774293549541, + "grad_norm": 1.1484375, + "learning_rate": 1.4475347661188371e-05, + "loss": 1.0683, + "mean_token_accuracy": 0.7358329862356185, + "num_tokens": 924045825.0, + "step": 5280 + }, + { + "epoch": 1.441499599720656, + "grad_norm": 1.234375, + "learning_rate": 1.440511307767945e-05, + "loss": 1.0431, + "mean_token_accuracy": 0.7421346224844456, + "num_tokens": 925730016.0, + "step": 5290 + }, + { + "epoch": 1.4442249058917713, + "grad_norm": 1.1171875, + "learning_rate": 1.4334878494170531e-05, + "loss": 1.1144, + "mean_token_accuracy": 0.7277820689603687, + "num_tokens": 927502244.0, + "step": 5300 + }, + { + "epoch": 1.4469502120628865, + "grad_norm": 1.1953125, + "learning_rate": 1.426464391066161e-05, + "loss": 1.1026, + "mean_token_accuracy": 0.7290660193189978, + "num_tokens": 929331138.0, + "step": 5310 + }, + { + "epoch": 1.4496755182340015, + "grad_norm": 1.453125, + "learning_rate": 1.4194409327152692e-05, + "loss": 1.0766, + "mean_token_accuracy": 0.7356103613972664, + "num_tokens": 931093746.0, + "step": 5320 + }, + { + "epoch": 1.4524008244051168, + "grad_norm": 1.234375, + "learning_rate": 1.412417474364377e-05, + "loss": 1.0698, + "mean_token_accuracy": 0.7374511297792197, + "num_tokens": 932856400.0, + "step": 5330 + }, + { + "epoch": 1.455126130576232, + "grad_norm": 1.21875, + "learning_rate": 1.4053940160134852e-05, + "loss": 1.0448, + "mean_token_accuracy": 0.7427729295566678, + "num_tokens": 934561196.0, + "step": 5340 + }, + { + "epoch": 1.457851436747347, + "grad_norm": 1.28125, + "learning_rate": 1.398370557662593e-05, + "loss": 1.0801, + "mean_token_accuracy": 0.7346046957187354, + "num_tokens": 936320114.0, + "step": 5350 + }, + { + "epoch": 1.4605767429184622, + "grad_norm": 1.234375, + "learning_rate": 1.391347099311701e-05, + "loss": 1.0425, + "mean_token_accuracy": 0.7405025975778698, + "num_tokens": 938025766.0, + "step": 5360 + }, + { + "epoch": 1.4633020490895774, + "grad_norm": 1.1796875, + "learning_rate": 1.3843236409608093e-05, + "loss": 1.0843, + "mean_token_accuracy": 0.7339933177456259, + "num_tokens": 939734341.0, + "step": 5370 + }, + { + "epoch": 1.4660273552606926, + "grad_norm": 1.3359375, + "learning_rate": 1.3773001826099171e-05, + "loss": 1.0961, + "mean_token_accuracy": 0.7325618612580002, + "num_tokens": 941470370.0, + "step": 5380 + }, + { + "epoch": 1.4687526614318078, + "grad_norm": 1.1640625, + "learning_rate": 1.3702767242590253e-05, + "loss": 1.074, + "mean_token_accuracy": 0.7348563734441995, + "num_tokens": 943242271.0, + "step": 5390 + }, + { + "epoch": 1.4714779676029228, + "grad_norm": 1.234375, + "learning_rate": 1.3632532659081332e-05, + "loss": 1.0487, + "mean_token_accuracy": 0.7408922280184924, + "num_tokens": 945033055.0, + "step": 5400 + }, + { + "epoch": 1.474203273774038, + "grad_norm": 1.2578125, + "learning_rate": 1.3562298075572413e-05, + "loss": 1.056, + "mean_token_accuracy": 0.7387796074151993, + "num_tokens": 946767511.0, + "step": 5410 + }, + { + "epoch": 1.4769285799451533, + "grad_norm": 1.2421875, + "learning_rate": 1.3492063492063492e-05, + "loss": 1.092, + "mean_token_accuracy": 0.731173980422318, + "num_tokens": 948483568.0, + "step": 5420 + }, + { + "epoch": 1.4796538861162682, + "grad_norm": 1.2734375, + "learning_rate": 1.3421828908554574e-05, + "loss": 1.074, + "mean_token_accuracy": 0.735079528670758, + "num_tokens": 950245709.0, + "step": 5430 + }, + { + "epoch": 1.4823791922873835, + "grad_norm": 1.234375, + "learning_rate": 1.3351594325045652e-05, + "loss": 1.079, + "mean_token_accuracy": 0.7340235409326852, + "num_tokens": 951927075.0, + "step": 5440 + }, + { + "epoch": 1.4851044984584987, + "grad_norm": 1.234375, + "learning_rate": 1.3281359741536734e-05, + "loss": 1.104, + "mean_token_accuracy": 0.7302969950251281, + "num_tokens": 953663268.0, + "step": 5450 + }, + { + "epoch": 1.487829804629614, + "grad_norm": 1.109375, + "learning_rate": 1.3211125158027813e-05, + "loss": 1.0602, + "mean_token_accuracy": 0.7398056207224727, + "num_tokens": 955406203.0, + "step": 5460 + }, + { + "epoch": 1.490555110800729, + "grad_norm": 1.3203125, + "learning_rate": 1.3140890574518893e-05, + "loss": 1.1467, + "mean_token_accuracy": 0.7222641437314451, + "num_tokens": 957099890.0, + "step": 5470 + }, + { + "epoch": 1.493280416971844, + "grad_norm": 1.265625, + "learning_rate": 1.3070655991009975e-05, + "loss": 1.059, + "mean_token_accuracy": 0.7376022847369313, + "num_tokens": 958864424.0, + "step": 5480 + }, + { + "epoch": 1.4960057231429593, + "grad_norm": 1.1953125, + "learning_rate": 1.3000421407501053e-05, + "loss": 1.0418, + "mean_token_accuracy": 0.7422663200646639, + "num_tokens": 960615561.0, + "step": 5490 + }, + { + "epoch": 1.4987310293140745, + "grad_norm": 1.453125, + "learning_rate": 1.2930186823992135e-05, + "loss": 1.0315, + "mean_token_accuracy": 0.7441624398343265, + "num_tokens": 962350259.0, + "step": 5500 + }, + { + "epoch": 1.5014563354851895, + "grad_norm": 1.2109375, + "learning_rate": 1.2859952240483214e-05, + "loss": 1.0333, + "mean_token_accuracy": 0.7446512423455716, + "num_tokens": 964103329.0, + "step": 5510 + }, + { + "epoch": 1.504181641656305, + "grad_norm": 1.1171875, + "learning_rate": 1.2789717656974296e-05, + "loss": 1.0874, + "mean_token_accuracy": 0.7329986747354269, + "num_tokens": 965827916.0, + "step": 5520 + }, + { + "epoch": 1.50690694782742, + "grad_norm": 1.1796875, + "learning_rate": 1.2719483073465374e-05, + "loss": 1.0292, + "mean_token_accuracy": 0.7453711663372815, + "num_tokens": 967545740.0, + "step": 5530 + }, + { + "epoch": 1.5096322539985352, + "grad_norm": 1.125, + "learning_rate": 1.2649248489956456e-05, + "loss": 1.0518, + "mean_token_accuracy": 0.7393054939806462, + "num_tokens": 969400946.0, + "step": 5540 + }, + { + "epoch": 1.5123575601696504, + "grad_norm": 1.265625, + "learning_rate": 1.2579013906447535e-05, + "loss": 1.098, + "mean_token_accuracy": 0.7297218488529325, + "num_tokens": 971161426.0, + "step": 5550 + }, + { + "epoch": 1.5150828663407654, + "grad_norm": 1.1875, + "learning_rate": 1.2508779322938616e-05, + "loss": 1.0495, + "mean_token_accuracy": 0.7386811840347945, + "num_tokens": 972880541.0, + "step": 5560 + }, + { + "epoch": 1.5178081725118806, + "grad_norm": 1.171875, + "learning_rate": 1.2438544739429697e-05, + "loss": 1.0386, + "mean_token_accuracy": 0.7431322140619159, + "num_tokens": 974614176.0, + "step": 5570 + }, + { + "epoch": 1.5205334786829958, + "grad_norm": 1.2265625, + "learning_rate": 1.2368310155920775e-05, + "loss": 1.0588, + "mean_token_accuracy": 0.7383014528080821, + "num_tokens": 976368282.0, + "step": 5580 + }, + { + "epoch": 1.5232587848541108, + "grad_norm": 1.2109375, + "learning_rate": 1.2298075572411855e-05, + "loss": 1.0507, + "mean_token_accuracy": 0.7403865492902696, + "num_tokens": 978153249.0, + "step": 5590 + }, + { + "epoch": 1.5259840910252263, + "grad_norm": 1.1875, + "learning_rate": 1.2227840988902936e-05, + "loss": 1.1276, + "mean_token_accuracy": 0.7253991289064288, + "num_tokens": 979934251.0, + "step": 5600 + }, + { + "epoch": 1.5287093971963412, + "grad_norm": 1.1953125, + "learning_rate": 1.2157606405394016e-05, + "loss": 1.0665, + "mean_token_accuracy": 0.7370174353942275, + "num_tokens": 981631256.0, + "step": 5610 + }, + { + "epoch": 1.5314347033674565, + "grad_norm": 1.28125, + "learning_rate": 1.2087371821885096e-05, + "loss": 1.0801, + "mean_token_accuracy": 0.7330641859211028, + "num_tokens": 983465168.0, + "step": 5620 + }, + { + "epoch": 1.5341600095385717, + "grad_norm": 1.1796875, + "learning_rate": 1.2017137238376178e-05, + "loss": 1.098, + "mean_token_accuracy": 0.7290369461290538, + "num_tokens": 985204925.0, + "step": 5630 + }, + { + "epoch": 1.5368853157096867, + "grad_norm": 1.2578125, + "learning_rate": 1.1946902654867258e-05, + "loss": 1.0831, + "mean_token_accuracy": 0.7333806352689862, + "num_tokens": 986932491.0, + "step": 5640 + }, + { + "epoch": 1.5396106218808019, + "grad_norm": 1.140625, + "learning_rate": 1.1876668071358338e-05, + "loss": 1.055, + "mean_token_accuracy": 0.7386937864124775, + "num_tokens": 988679321.0, + "step": 5650 + }, + { + "epoch": 1.542335928051917, + "grad_norm": 1.2578125, + "learning_rate": 1.1806433487849418e-05, + "loss": 1.0538, + "mean_token_accuracy": 0.7390916770324111, + "num_tokens": 990434427.0, + "step": 5660 + }, + { + "epoch": 1.545061234223032, + "grad_norm": 1.3203125, + "learning_rate": 1.1736198904340499e-05, + "loss": 1.0676, + "mean_token_accuracy": 0.7376645108684897, + "num_tokens": 992247068.0, + "step": 5670 + }, + { + "epoch": 1.5477865403941475, + "grad_norm": 1.296875, + "learning_rate": 1.1665964320831579e-05, + "loss": 1.0522, + "mean_token_accuracy": 0.7399232917465269, + "num_tokens": 993933963.0, + "step": 5680 + }, + { + "epoch": 1.5505118465652625, + "grad_norm": 1.1640625, + "learning_rate": 1.1595729737322657e-05, + "loss": 1.0817, + "mean_token_accuracy": 0.7358969361521304, + "num_tokens": 995750639.0, + "step": 5690 + }, + { + "epoch": 1.5532371527363777, + "grad_norm": 1.140625, + "learning_rate": 1.1525495153813737e-05, + "loss": 1.0824, + "mean_token_accuracy": 0.7343440987169743, + "num_tokens": 997519520.0, + "step": 5700 + }, + { + "epoch": 1.555962458907493, + "grad_norm": 1.203125, + "learning_rate": 1.1455260570304818e-05, + "loss": 1.0741, + "mean_token_accuracy": 0.7360345222055912, + "num_tokens": 999260798.0, + "step": 5710 + }, + { + "epoch": 1.558687765078608, + "grad_norm": 1.2421875, + "learning_rate": 1.1385025986795898e-05, + "loss": 1.0694, + "mean_token_accuracy": 0.73609570087865, + "num_tokens": 1000992985.0, + "step": 5720 + }, + { + "epoch": 1.5614130712497232, + "grad_norm": 1.1640625, + "learning_rate": 1.1314791403286978e-05, + "loss": 1.0651, + "mean_token_accuracy": 0.7382791753858328, + "num_tokens": 1002785337.0, + "step": 5730 + }, + { + "epoch": 1.5641383774208384, + "grad_norm": 1.109375, + "learning_rate": 1.124455681977806e-05, + "loss": 1.0588, + "mean_token_accuracy": 0.7395141620188952, + "num_tokens": 1004575506.0, + "step": 5740 + }, + { + "epoch": 1.5668636835919534, + "grad_norm": 1.015625, + "learning_rate": 1.117432223626914e-05, + "loss": 1.044, + "mean_token_accuracy": 0.7408615527674556, + "num_tokens": 1006331741.0, + "step": 5750 + }, + { + "epoch": 1.5695889897630688, + "grad_norm": 1.125, + "learning_rate": 1.110408765276022e-05, + "loss": 1.0484, + "mean_token_accuracy": 0.740070460177958, + "num_tokens": 1008074608.0, + "step": 5760 + }, + { + "epoch": 1.5723142959341838, + "grad_norm": 1.1875, + "learning_rate": 1.10338530692513e-05, + "loss": 1.0634, + "mean_token_accuracy": 0.735400352999568, + "num_tokens": 1009814759.0, + "step": 5770 + }, + { + "epoch": 1.575039602105299, + "grad_norm": 1.1171875, + "learning_rate": 1.096361848574238e-05, + "loss": 1.0497, + "mean_token_accuracy": 0.7401463497430086, + "num_tokens": 1011600470.0, + "step": 5780 + }, + { + "epoch": 1.5777649082764142, + "grad_norm": 1.1328125, + "learning_rate": 1.0893383902233461e-05, + "loss": 1.0633, + "mean_token_accuracy": 0.7358233134262264, + "num_tokens": 1013332790.0, + "step": 5790 + }, + { + "epoch": 1.5804902144475292, + "grad_norm": 1.1328125, + "learning_rate": 1.0823149318724541e-05, + "loss": 1.0517, + "mean_token_accuracy": 0.7393049919977784, + "num_tokens": 1015056384.0, + "step": 5800 + }, + { + "epoch": 1.5832155206186445, + "grad_norm": 1.15625, + "learning_rate": 1.075291473521562e-05, + "loss": 1.1169, + "mean_token_accuracy": 0.7274207789450884, + "num_tokens": 1016805023.0, + "step": 5810 + }, + { + "epoch": 1.5859408267897597, + "grad_norm": 1.21875, + "learning_rate": 1.06826801517067e-05, + "loss": 1.0668, + "mean_token_accuracy": 0.7372116056270898, + "num_tokens": 1018560575.0, + "step": 5820 + }, + { + "epoch": 1.5886661329608747, + "grad_norm": 1.65625, + "learning_rate": 1.061244556819778e-05, + "loss": 1.0273, + "mean_token_accuracy": 0.7447264090180397, + "num_tokens": 1020332708.0, + "step": 5830 + }, + { + "epoch": 1.59139143913199, + "grad_norm": 1.46875, + "learning_rate": 1.0542210984688862e-05, + "loss": 1.0687, + "mean_token_accuracy": 0.7353351947851479, + "num_tokens": 1022116018.0, + "step": 5840 + }, + { + "epoch": 1.594116745303105, + "grad_norm": 1.1796875, + "learning_rate": 1.0471976401179942e-05, + "loss": 1.0871, + "mean_token_accuracy": 0.7342725731432438, + "num_tokens": 1023879971.0, + "step": 5850 + }, + { + "epoch": 1.5968420514742203, + "grad_norm": 1.25, + "learning_rate": 1.0401741817671022e-05, + "loss": 1.0736, + "mean_token_accuracy": 0.7353384065441787, + "num_tokens": 1025589404.0, + "step": 5860 + }, + { + "epoch": 1.5995673576453355, + "grad_norm": 1.1875, + "learning_rate": 1.0331507234162102e-05, + "loss": 1.0689, + "mean_token_accuracy": 0.7359914354048669, + "num_tokens": 1027272595.0, + "step": 5870 + }, + { + "epoch": 1.6022926638164505, + "grad_norm": 1.3671875, + "learning_rate": 1.0261272650653183e-05, + "loss": 1.1171, + "mean_token_accuracy": 0.7269021419808268, + "num_tokens": 1029018598.0, + "step": 5880 + }, + { + "epoch": 1.6050179699875657, + "grad_norm": 1.2265625, + "learning_rate": 1.0191038067144263e-05, + "loss": 1.0743, + "mean_token_accuracy": 0.7348931013606489, + "num_tokens": 1030768423.0, + "step": 5890 + }, + { + "epoch": 1.607743276158681, + "grad_norm": 1.2890625, + "learning_rate": 1.0120803483635343e-05, + "loss": 1.069, + "mean_token_accuracy": 0.7367931939661503, + "num_tokens": 1032505045.0, + "step": 5900 + }, + { + "epoch": 1.610468582329796, + "grad_norm": 1.1953125, + "learning_rate": 1.0050568900126423e-05, + "loss": 1.0729, + "mean_token_accuracy": 0.7349227281287313, + "num_tokens": 1034278282.0, + "step": 5910 + }, + { + "epoch": 1.6131938885009114, + "grad_norm": 1.4375, + "learning_rate": 9.980334316617502e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.7386474424041808, + "num_tokens": 1036001773.0, + "step": 5920 + }, + { + "epoch": 1.6159191946720264, + "grad_norm": 1.2421875, + "learning_rate": 9.910099733108582e-06, + "loss": 1.0944, + "mean_token_accuracy": 0.7326858415268361, + "num_tokens": 1037697305.0, + "step": 5930 + }, + { + "epoch": 1.6186445008431416, + "grad_norm": 1.28125, + "learning_rate": 9.839865149599662e-06, + "loss": 1.058, + "mean_token_accuracy": 0.7376206200569868, + "num_tokens": 1039459781.0, + "step": 5940 + }, + { + "epoch": 1.6213698070142568, + "grad_norm": 1.1484375, + "learning_rate": 9.769630566090744e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.7320966799743474, + "num_tokens": 1041165017.0, + "step": 5950 + }, + { + "epoch": 1.6240951131853718, + "grad_norm": 1.2734375, + "learning_rate": 9.699395982581824e-06, + "loss": 1.051, + "mean_token_accuracy": 0.7402716959826648, + "num_tokens": 1042944243.0, + "step": 5960 + }, + { + "epoch": 1.6268204193564872, + "grad_norm": 1.1484375, + "learning_rate": 9.629161399072904e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.7404622891917825, + "num_tokens": 1044677934.0, + "step": 5970 + }, + { + "epoch": 1.6295457255276022, + "grad_norm": 1.1640625, + "learning_rate": 9.558926815563985e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.7370510194450617, + "num_tokens": 1046454776.0, + "step": 5980 + }, + { + "epoch": 1.6322710316987175, + "grad_norm": 1.1953125, + "learning_rate": 9.488692232055065e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.7358627901412547, + "num_tokens": 1048212574.0, + "step": 5990 + }, + { + "epoch": 1.6349963378698327, + "grad_norm": 1.171875, + "learning_rate": 9.418457648546145e-06, + "loss": 1.081, + "mean_token_accuracy": 0.7341303235851229, + "num_tokens": 1049922044.0, + "step": 6000 + }, + { + "epoch": 1.6377216440409477, + "grad_norm": 1.265625, + "learning_rate": 9.348223065037225e-06, + "loss": 1.0685, + "mean_token_accuracy": 0.7357797903008759, + "num_tokens": 1051673079.0, + "step": 6010 + }, + { + "epoch": 1.6404469502120629, + "grad_norm": 1.3515625, + "learning_rate": 9.277988481528305e-06, + "loss": 1.056, + "mean_token_accuracy": 0.7372635472565889, + "num_tokens": 1053438839.0, + "step": 6020 + }, + { + "epoch": 1.643172256383178, + "grad_norm": 1.3515625, + "learning_rate": 9.207753898019384e-06, + "loss": 1.064, + "mean_token_accuracy": 0.7376120395027101, + "num_tokens": 1055224077.0, + "step": 6030 + }, + { + "epoch": 1.645897562554293, + "grad_norm": 1.3203125, + "learning_rate": 9.137519314510464e-06, + "loss": 1.0812, + "mean_token_accuracy": 0.7339616576209664, + "num_tokens": 1056980085.0, + "step": 6040 + }, + { + "epoch": 1.6486228687254085, + "grad_norm": 1.1875, + "learning_rate": 9.067284731001544e-06, + "loss": 1.1042, + "mean_token_accuracy": 0.7294964977540076, + "num_tokens": 1058709410.0, + "step": 6050 + }, + { + "epoch": 1.6513481748965235, + "grad_norm": 1.2578125, + "learning_rate": 8.997050147492626e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.7401429942809046, + "num_tokens": 1060458057.0, + "step": 6060 + }, + { + "epoch": 1.6540734810676387, + "grad_norm": 1.1953125, + "learning_rate": 8.926815563983706e-06, + "loss": 1.1053, + "mean_token_accuracy": 0.729435405600816, + "num_tokens": 1062204529.0, + "step": 6070 + }, + { + "epoch": 1.656798787238754, + "grad_norm": 1.1796875, + "learning_rate": 8.856580980474787e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.735557492543012, + "num_tokens": 1063946466.0, + "step": 6080 + }, + { + "epoch": 1.659524093409869, + "grad_norm": 1.1640625, + "learning_rate": 8.786346396965867e-06, + "loss": 1.1115, + "mean_token_accuracy": 0.726740291249007, + "num_tokens": 1065727075.0, + "step": 6090 + }, + { + "epoch": 1.6622493995809842, + "grad_norm": 1.3515625, + "learning_rate": 8.716111813456947e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.7305957246571779, + "num_tokens": 1067502000.0, + "step": 6100 + }, + { + "epoch": 1.6649747057520994, + "grad_norm": 1.0625, + "learning_rate": 8.645877229948027e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7509189244359732, + "num_tokens": 1069280628.0, + "step": 6110 + }, + { + "epoch": 1.6677000119232144, + "grad_norm": 1.234375, + "learning_rate": 8.575642646439107e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.736516029201448, + "num_tokens": 1071040088.0, + "step": 6120 + }, + { + "epoch": 1.6704253180943298, + "grad_norm": 1.1796875, + "learning_rate": 8.505408062930188e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.7389789640903472, + "num_tokens": 1072762221.0, + "step": 6130 + }, + { + "epoch": 1.6731506242654448, + "grad_norm": 1.3046875, + "learning_rate": 8.435173479421268e-06, + "loss": 1.0875, + "mean_token_accuracy": 0.7349373430013657, + "num_tokens": 1074556423.0, + "step": 6140 + }, + { + "epoch": 1.67587593043656, + "grad_norm": 1.1796875, + "learning_rate": 8.364938895912346e-06, + "loss": 1.065, + "mean_token_accuracy": 0.7372798976488412, + "num_tokens": 1076338917.0, + "step": 6150 + }, + { + "epoch": 1.6786012366076752, + "grad_norm": 1.140625, + "learning_rate": 8.294704312403428e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.7328151876106859, + "num_tokens": 1077977290.0, + "step": 6160 + }, + { + "epoch": 1.6813265427787902, + "grad_norm": 1.234375, + "learning_rate": 8.224469728894508e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.7358840562403202, + "num_tokens": 1079751546.0, + "step": 6170 + }, + { + "epoch": 1.6840518489499054, + "grad_norm": 1.2734375, + "learning_rate": 8.154235145385589e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.7286890825256706, + "num_tokens": 1081482161.0, + "step": 6180 + }, + { + "epoch": 1.6867771551210207, + "grad_norm": 1.15625, + "learning_rate": 8.084000561876669e-06, + "loss": 1.0996, + "mean_token_accuracy": 0.7321878804825246, + "num_tokens": 1083308646.0, + "step": 6190 + }, + { + "epoch": 1.6895024612921357, + "grad_norm": 1.203125, + "learning_rate": 8.013765978367749e-06, + "loss": 1.0919, + "mean_token_accuracy": 0.7313131378032267, + "num_tokens": 1085049528.0, + "step": 6200 + }, + { + "epoch": 1.692227767463251, + "grad_norm": 1.21875, + "learning_rate": 7.94353139485883e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.7383657024241984, + "num_tokens": 1086786072.0, + "step": 6210 + }, + { + "epoch": 1.694953073634366, + "grad_norm": 1.421875, + "learning_rate": 7.87329681134991e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.74335707006976, + "num_tokens": 1088520760.0, + "step": 6220 + }, + { + "epoch": 1.6976783798054813, + "grad_norm": 1.296875, + "learning_rate": 7.80306222784099e-06, + "loss": 1.0945, + "mean_token_accuracy": 0.7332091301679611, + "num_tokens": 1090282461.0, + "step": 6230 + }, + { + "epoch": 1.7004036859765965, + "grad_norm": 1.1953125, + "learning_rate": 7.73282764433207e-06, + "loss": 1.1228, + "mean_token_accuracy": 0.7257892864756286, + "num_tokens": 1092006571.0, + "step": 6240 + }, + { + "epoch": 1.7031289921477115, + "grad_norm": 1.1640625, + "learning_rate": 7.66259306082315e-06, + "loss": 1.1315, + "mean_token_accuracy": 0.7253849579952657, + "num_tokens": 1093737561.0, + "step": 6250 + }, + { + "epoch": 1.7058542983188267, + "grad_norm": 1.0546875, + "learning_rate": 7.592358477314229e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.7328445117920637, + "num_tokens": 1095514719.0, + "step": 6260 + }, + { + "epoch": 1.708579604489942, + "grad_norm": 1.109375, + "learning_rate": 7.52212389380531e-06, + "loss": 1.1213, + "mean_token_accuracy": 0.7268084879033267, + "num_tokens": 1097274026.0, + "step": 6270 + }, + { + "epoch": 1.711304910661057, + "grad_norm": 1.4296875, + "learning_rate": 7.4518893102963905e-06, + "loss": 1.1158, + "mean_token_accuracy": 0.7276978973299265, + "num_tokens": 1099059973.0, + "step": 6280 + }, + { + "epoch": 1.7140302168321724, + "grad_norm": 1.171875, + "learning_rate": 7.381654726787471e-06, + "loss": 1.0677, + "mean_token_accuracy": 0.7368841106072068, + "num_tokens": 1100840633.0, + "step": 6290 + }, + { + "epoch": 1.7167555230032874, + "grad_norm": 1.2109375, + "learning_rate": 7.311420143278551e-06, + "loss": 1.112, + "mean_token_accuracy": 0.7283360333181917, + "num_tokens": 1102617553.0, + "step": 6300 + }, + { + "epoch": 1.7194808291744026, + "grad_norm": 1.3828125, + "learning_rate": 7.241185559769631e-06, + "loss": 1.0913, + "mean_token_accuracy": 0.7312847984954715, + "num_tokens": 1104327046.0, + "step": 6310 + }, + { + "epoch": 1.7222061353455178, + "grad_norm": 1.296875, + "learning_rate": 7.170950976260711e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.7350753247737885, + "num_tokens": 1106098274.0, + "step": 6320 + }, + { + "epoch": 1.7249314415166328, + "grad_norm": 1.2421875, + "learning_rate": 7.1007163927517915e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.740331229288131, + "num_tokens": 1107850821.0, + "step": 6330 + }, + { + "epoch": 1.727656747687748, + "grad_norm": 1.15625, + "learning_rate": 7.030481809242871e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.741414366196841, + "num_tokens": 1109562198.0, + "step": 6340 + }, + { + "epoch": 1.7303820538588632, + "grad_norm": 1.1484375, + "learning_rate": 6.960247225733951e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.7345545357093215, + "num_tokens": 1111305835.0, + "step": 6350 + }, + { + "epoch": 1.7331073600299782, + "grad_norm": 1.1328125, + "learning_rate": 6.890012642225031e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.7364791300147772, + "num_tokens": 1113027716.0, + "step": 6360 + }, + { + "epoch": 1.7358326662010937, + "grad_norm": 1.2265625, + "learning_rate": 6.819778058716113e-06, + "loss": 1.0791, + "mean_token_accuracy": 0.7377235498279333, + "num_tokens": 1114828620.0, + "step": 6370 + }, + { + "epoch": 1.7385579723722087, + "grad_norm": 1.171875, + "learning_rate": 6.7495434752071925e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.7378025005571545, + "num_tokens": 1116603274.0, + "step": 6380 + }, + { + "epoch": 1.7412832785433239, + "grad_norm": 1.1953125, + "learning_rate": 6.679308891698273e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.7348160912282765, + "num_tokens": 1118297922.0, + "step": 6390 + }, + { + "epoch": 1.744008584714439, + "grad_norm": 1.15625, + "learning_rate": 6.609074308189353e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.7366988636553288, + "num_tokens": 1120135157.0, + "step": 6400 + }, + { + "epoch": 1.746733890885554, + "grad_norm": 1.2890625, + "learning_rate": 6.538839724680433e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.741857647895813, + "num_tokens": 1121889958.0, + "step": 6410 + }, + { + "epoch": 1.7494591970566693, + "grad_norm": 1.1796875, + "learning_rate": 6.468605141171513e-06, + "loss": 1.0843, + "mean_token_accuracy": 0.7341483205556869, + "num_tokens": 1123632536.0, + "step": 6420 + }, + { + "epoch": 1.7521845032277845, + "grad_norm": 1.1953125, + "learning_rate": 6.3983705576625935e-06, + "loss": 1.0892, + "mean_token_accuracy": 0.7336477694101632, + "num_tokens": 1125380817.0, + "step": 6430 + }, + { + "epoch": 1.7549098093988995, + "grad_norm": 1.3046875, + "learning_rate": 6.328135974153674e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.7332813500426709, + "num_tokens": 1127183000.0, + "step": 6440 + }, + { + "epoch": 1.757635115570015, + "grad_norm": 1.1484375, + "learning_rate": 6.257901390644753e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.7328511819243431, + "num_tokens": 1128907056.0, + "step": 6450 + }, + { + "epoch": 1.76036042174113, + "grad_norm": 1.21875, + "learning_rate": 6.187666807135834e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.7417821794748306, + "num_tokens": 1130640406.0, + "step": 6460 + }, + { + "epoch": 1.7630857279122452, + "grad_norm": 1.21875, + "learning_rate": 6.117432223626914e-06, + "loss": 1.1198, + "mean_token_accuracy": 0.7258993354626, + "num_tokens": 1132411436.0, + "step": 6470 + }, + { + "epoch": 1.7658110340833604, + "grad_norm": 1.140625, + "learning_rate": 6.0471976401179945e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.7375982970930636, + "num_tokens": 1134162570.0, + "step": 6480 + }, + { + "epoch": 1.7685363402544754, + "grad_norm": 1.15625, + "learning_rate": 5.976963056609075e-06, + "loss": 1.1008, + "mean_token_accuracy": 0.7292270836420357, + "num_tokens": 1135933024.0, + "step": 6490 + }, + { + "epoch": 1.7712616464255906, + "grad_norm": 1.203125, + "learning_rate": 5.906728473100155e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.7357593169435859, + "num_tokens": 1137647725.0, + "step": 6500 + }, + { + "epoch": 1.7739869525967058, + "grad_norm": 1.2578125, + "learning_rate": 5.836493889591235e-06, + "loss": 1.0787, + "mean_token_accuracy": 0.7348907117731869, + "num_tokens": 1139436589.0, + "step": 6510 + }, + { + "epoch": 1.7767122587678208, + "grad_norm": 1.328125, + "learning_rate": 5.766259306082315e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.7361710716038943, + "num_tokens": 1141121186.0, + "step": 6520 + }, + { + "epoch": 1.7794375649389362, + "grad_norm": 1.140625, + "learning_rate": 5.6960247225733954e-06, + "loss": 1.1214, + "mean_token_accuracy": 0.7259381771087646, + "num_tokens": 1142885840.0, + "step": 6530 + }, + { + "epoch": 1.7821628711100512, + "grad_norm": 1.1328125, + "learning_rate": 5.625790139064476e-06, + "loss": 1.1045, + "mean_token_accuracy": 0.7307565311901272, + "num_tokens": 1144664551.0, + "step": 6540 + }, + { + "epoch": 1.7848881772811664, + "grad_norm": 1.28125, + "learning_rate": 5.555555555555556e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.7319103201851249, + "num_tokens": 1146392624.0, + "step": 6550 + }, + { + "epoch": 1.7876134834522817, + "grad_norm": 1.3203125, + "learning_rate": 5.485320972046636e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.7373215350322425, + "num_tokens": 1148186115.0, + "step": 6560 + }, + { + "epoch": 1.7903387896233967, + "grad_norm": 1.171875, + "learning_rate": 5.415086388537716e-06, + "loss": 1.119, + "mean_token_accuracy": 0.7275857485830783, + "num_tokens": 1149895675.0, + "step": 6570 + }, + { + "epoch": 1.7930640957945119, + "grad_norm": 1.265625, + "learning_rate": 5.344851805028796e-06, + "loss": 1.0892, + "mean_token_accuracy": 0.7332587640732526, + "num_tokens": 1151592671.0, + "step": 6580 + }, + { + "epoch": 1.795789401965627, + "grad_norm": 1.1484375, + "learning_rate": 5.274617221519877e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.7410731894895435, + "num_tokens": 1153340225.0, + "step": 6590 + }, + { + "epoch": 1.798514708136742, + "grad_norm": 1.1015625, + "learning_rate": 5.204382638010957e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.7373812107369304, + "num_tokens": 1155119025.0, + "step": 6600 + }, + { + "epoch": 1.8012400143078575, + "grad_norm": 1.203125, + "learning_rate": 5.134148054502037e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.7350325770676136, + "num_tokens": 1156885610.0, + "step": 6610 + }, + { + "epoch": 1.8039653204789725, + "grad_norm": 1.3125, + "learning_rate": 5.063913470993117e-06, + "loss": 1.1136, + "mean_token_accuracy": 0.7288194528780878, + "num_tokens": 1158614375.0, + "step": 6620 + }, + { + "epoch": 1.8066906266500877, + "grad_norm": 1.09375, + "learning_rate": 4.993678887484197e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.7429588054306805, + "num_tokens": 1160332773.0, + "step": 6630 + }, + { + "epoch": 1.809415932821203, + "grad_norm": 1.1328125, + "learning_rate": 4.923444303975278e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.7377984975464642, + "num_tokens": 1162043122.0, + "step": 6640 + }, + { + "epoch": 1.812141238992318, + "grad_norm": 1.15625, + "learning_rate": 4.853209720466358e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.7405181519687176, + "num_tokens": 1163783401.0, + "step": 6650 + }, + { + "epoch": 1.8148665451634332, + "grad_norm": 1.1953125, + "learning_rate": 4.782975136957439e-06, + "loss": 1.0968, + "mean_token_accuracy": 0.7296288413926959, + "num_tokens": 1165546738.0, + "step": 6660 + }, + { + "epoch": 1.8175918513345484, + "grad_norm": 1.0859375, + "learning_rate": 4.712740553448518e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.7455301341600716, + "num_tokens": 1167290698.0, + "step": 6670 + }, + { + "epoch": 1.8203171575056634, + "grad_norm": 1.1640625, + "learning_rate": 4.642505969939598e-06, + "loss": 1.0804, + "mean_token_accuracy": 0.7327212145552039, + "num_tokens": 1169002358.0, + "step": 6680 + }, + { + "epoch": 1.8230424636767788, + "grad_norm": 1.109375, + "learning_rate": 4.5722713864306786e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.7437866570428013, + "num_tokens": 1170783980.0, + "step": 6690 + }, + { + "epoch": 1.8257677698478938, + "grad_norm": 1.25, + "learning_rate": 4.502036802921759e-06, + "loss": 1.0973, + "mean_token_accuracy": 0.7299256231635809, + "num_tokens": 1172475127.0, + "step": 6700 + }, + { + "epoch": 1.828493076019009, + "grad_norm": 1.2734375, + "learning_rate": 4.431802219412839e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.7324966519139707, + "num_tokens": 1174214378.0, + "step": 6710 + }, + { + "epoch": 1.8312183821901242, + "grad_norm": 1.1484375, + "learning_rate": 4.361567635903919e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.7350320257246494, + "num_tokens": 1175912162.0, + "step": 6720 + }, + { + "epoch": 1.8339436883612392, + "grad_norm": 1.0859375, + "learning_rate": 4.291333052394999e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.7399618875235319, + "num_tokens": 1177698845.0, + "step": 6730 + }, + { + "epoch": 1.8366689945323547, + "grad_norm": 1.140625, + "learning_rate": 4.2210984688860795e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.7414560637436807, + "num_tokens": 1179524910.0, + "step": 6740 + }, + { + "epoch": 1.8393943007034697, + "grad_norm": 1.2890625, + "learning_rate": 4.15086388537716e-06, + "loss": 1.1032, + "mean_token_accuracy": 0.7305867711082101, + "num_tokens": 1181252944.0, + "step": 6750 + }, + { + "epoch": 1.8421196068745849, + "grad_norm": 1.1953125, + "learning_rate": 4.08062930186824e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.7364834869280458, + "num_tokens": 1182978187.0, + "step": 6760 + }, + { + "epoch": 1.8448449130457, + "grad_norm": 1.2109375, + "learning_rate": 4.010394718359321e-06, + "loss": 1.0918, + "mean_token_accuracy": 0.7304992197081447, + "num_tokens": 1184646601.0, + "step": 6770 + }, + { + "epoch": 1.847570219216815, + "grad_norm": 1.25, + "learning_rate": 3.9401601348504e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.7391768081113697, + "num_tokens": 1186432758.0, + "step": 6780 + }, + { + "epoch": 1.8502955253879303, + "grad_norm": 1.15625, + "learning_rate": 3.8699255513414805e-06, + "loss": 1.1037, + "mean_token_accuracy": 0.7292625552043319, + "num_tokens": 1188172544.0, + "step": 6790 + }, + { + "epoch": 1.8530208315590455, + "grad_norm": 1.3359375, + "learning_rate": 3.7996909678325607e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.7358251197263599, + "num_tokens": 1189891287.0, + "step": 6800 + }, + { + "epoch": 1.8557461377301605, + "grad_norm": 1.1875, + "learning_rate": 3.729456384323641e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.7414630267769098, + "num_tokens": 1191622273.0, + "step": 6810 + }, + { + "epoch": 1.858471443901276, + "grad_norm": 1.1875, + "learning_rate": 3.6592218008147215e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.7319276875816285, + "num_tokens": 1193460373.0, + "step": 6820 + }, + { + "epoch": 1.861196750072391, + "grad_norm": 1.25, + "learning_rate": 3.5889872173058017e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.7333787251263857, + "num_tokens": 1195156956.0, + "step": 6830 + }, + { + "epoch": 1.8639220562435062, + "grad_norm": 1.1875, + "learning_rate": 3.518752633796882e-06, + "loss": 1.0804, + "mean_token_accuracy": 0.7354118786752224, + "num_tokens": 1196889114.0, + "step": 6840 + }, + { + "epoch": 1.8666473624146214, + "grad_norm": 1.140625, + "learning_rate": 3.4485180502879617e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.7374997787177563, + "num_tokens": 1198676992.0, + "step": 6850 + }, + { + "epoch": 1.8693726685857364, + "grad_norm": 1.0546875, + "learning_rate": 3.378283466779042e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.7359341490082443, + "num_tokens": 1200444870.0, + "step": 6860 + }, + { + "epoch": 1.8720979747568516, + "grad_norm": 1.25, + "learning_rate": 3.308048883270122e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.736452922783792, + "num_tokens": 1202174597.0, + "step": 6870 + }, + { + "epoch": 1.8748232809279668, + "grad_norm": 1.2109375, + "learning_rate": 3.2378142997612027e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.7363683463074266, + "num_tokens": 1203947194.0, + "step": 6880 + }, + { + "epoch": 1.8775485870990818, + "grad_norm": 1.203125, + "learning_rate": 3.167579716252283e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.7356330995447934, + "num_tokens": 1205696008.0, + "step": 6890 + }, + { + "epoch": 1.8802738932701972, + "grad_norm": 1.09375, + "learning_rate": 3.097345132743363e-06, + "loss": 1.034, + "mean_token_accuracy": 0.7419920676387847, + "num_tokens": 1207477394.0, + "step": 6900 + }, + { + "epoch": 1.8829991994413122, + "grad_norm": 1.2421875, + "learning_rate": 3.0271105492344433e-06, + "loss": 1.1091, + "mean_token_accuracy": 0.7290258550085127, + "num_tokens": 1209268585.0, + "step": 6910 + }, + { + "epoch": 1.8857245056124274, + "grad_norm": 1.25, + "learning_rate": 2.9568759657255235e-06, + "loss": 1.0977, + "mean_token_accuracy": 0.7322898281738162, + "num_tokens": 1211062266.0, + "step": 6920 + }, + { + "epoch": 1.8884498117835427, + "grad_norm": 1.1875, + "learning_rate": 2.8866413822166037e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.7385586686432362, + "num_tokens": 1212830493.0, + "step": 6930 + }, + { + "epoch": 1.8911751179546576, + "grad_norm": 1.2265625, + "learning_rate": 2.816406798707684e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.7373645476065576, + "num_tokens": 1214570541.0, + "step": 6940 + }, + { + "epoch": 1.8939004241257729, + "grad_norm": 1.1484375, + "learning_rate": 2.746172215198764e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.7381766311824322, + "num_tokens": 1216358869.0, + "step": 6950 + }, + { + "epoch": 1.896625730296888, + "grad_norm": 1.28125, + "learning_rate": 2.6759376316898443e-06, + "loss": 1.0972, + "mean_token_accuracy": 0.7319785353727639, + "num_tokens": 1218169173.0, + "step": 6960 + }, + { + "epoch": 1.899351036468003, + "grad_norm": 1.3125, + "learning_rate": 2.6057030481809245e-06, + "loss": 1.0912, + "mean_token_accuracy": 0.7316390814259648, + "num_tokens": 1219932812.0, + "step": 6970 + }, + { + "epoch": 1.9020763426391185, + "grad_norm": 1.265625, + "learning_rate": 2.5354684646720047e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.7394123332574963, + "num_tokens": 1221698971.0, + "step": 6980 + }, + { + "epoch": 1.9048016488102335, + "grad_norm": 1.1640625, + "learning_rate": 2.465233881163085e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.7378796759061516, + "num_tokens": 1223474752.0, + "step": 6990 + }, + { + "epoch": 1.9075269549813487, + "grad_norm": 1.0546875, + "learning_rate": 2.394999297654165e-06, + "loss": 1.0747, + "mean_token_accuracy": 0.7349859910085798, + "num_tokens": 1225205018.0, + "step": 7000 + }, + { + "epoch": 1.910252261152464, + "grad_norm": 1.1796875, + "learning_rate": 2.3247647141452453e-06, + "loss": 1.0707, + "mean_token_accuracy": 0.7361236764118075, + "num_tokens": 1226976554.0, + "step": 7010 + }, + { + "epoch": 1.912977567323579, + "grad_norm": 1.28125, + "learning_rate": 2.2545301306363255e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.7431093015708029, + "num_tokens": 1228681473.0, + "step": 7020 + }, + { + "epoch": 1.9157028734946941, + "grad_norm": 1.15625, + "learning_rate": 2.1842955471274057e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.743872269615531, + "num_tokens": 1230386311.0, + "step": 7030 + }, + { + "epoch": 1.9184281796658094, + "grad_norm": 1.25, + "learning_rate": 2.114060963618486e-06, + "loss": 1.0805, + "mean_token_accuracy": 0.7352949684485793, + "num_tokens": 1232181485.0, + "step": 7040 + }, + { + "epoch": 1.9211534858369244, + "grad_norm": 1.640625, + "learning_rate": 2.043826380109566e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.737717292457819, + "num_tokens": 1233900223.0, + "step": 7050 + }, + { + "epoch": 1.9238787920080398, + "grad_norm": 1.3125, + "learning_rate": 1.9735917966006462e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.726865790784359, + "num_tokens": 1235661434.0, + "step": 7060 + }, + { + "epoch": 1.9266040981791548, + "grad_norm": 1.1796875, + "learning_rate": 1.9033572130917266e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.7419621678069234, + "num_tokens": 1237360867.0, + "step": 7070 + }, + { + "epoch": 1.92932940435027, + "grad_norm": 1.1875, + "learning_rate": 1.8331226295828066e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.7329772229306399, + "num_tokens": 1239085380.0, + "step": 7080 + }, + { + "epoch": 1.9320547105213852, + "grad_norm": 1.1875, + "learning_rate": 1.7628880460738868e-06, + "loss": 1.0918, + "mean_token_accuracy": 0.7324993853457272, + "num_tokens": 1240747584.0, + "step": 7090 + }, + { + "epoch": 1.9347800166925002, + "grad_norm": 1.078125, + "learning_rate": 1.6926534625649672e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.7338228969834745, + "num_tokens": 1242486706.0, + "step": 7100 + }, + { + "epoch": 1.9375053228636154, + "grad_norm": 1.2421875, + "learning_rate": 1.6224188790560472e-06, + "loss": 1.071, + "mean_token_accuracy": 0.735992513410747, + "num_tokens": 1244248596.0, + "step": 7110 + }, + { + "epoch": 1.9402306290347306, + "grad_norm": 1.2890625, + "learning_rate": 1.5521842955471274e-06, + "loss": 1.0894, + "mean_token_accuracy": 0.7322908505797386, + "num_tokens": 1246027714.0, + "step": 7120 + }, + { + "epoch": 1.9429559352058456, + "grad_norm": 1.2578125, + "learning_rate": 1.4819497120382078e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.7299456540495157, + "num_tokens": 1247810720.0, + "step": 7130 + }, + { + "epoch": 1.945681241376961, + "grad_norm": 1.1484375, + "learning_rate": 1.4117151285292878e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.7424954613670707, + "num_tokens": 1249602431.0, + "step": 7140 + }, + { + "epoch": 1.948406547548076, + "grad_norm": 1.0703125, + "learning_rate": 1.341480545020368e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.7422119576483965, + "num_tokens": 1251347854.0, + "step": 7150 + }, + { + "epoch": 1.9511318537191913, + "grad_norm": 1.1796875, + "learning_rate": 1.2712459615114484e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.7364619710482657, + "num_tokens": 1253090972.0, + "step": 7160 + }, + { + "epoch": 1.9538571598903065, + "grad_norm": 1.109375, + "learning_rate": 1.2010113780025286e-06, + "loss": 1.08, + "mean_token_accuracy": 0.7367773661389947, + "num_tokens": 1254882362.0, + "step": 7170 + }, + { + "epoch": 1.9565824660614215, + "grad_norm": 1.1171875, + "learning_rate": 1.1307767944936086e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.7402027255855501, + "num_tokens": 1256581748.0, + "step": 7180 + }, + { + "epoch": 1.9593077722325367, + "grad_norm": 1.09375, + "learning_rate": 1.0605422109846888e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.730874165892601, + "num_tokens": 1258375314.0, + "step": 7190 + }, + { + "epoch": 1.962033078403652, + "grad_norm": 1.2421875, + "learning_rate": 9.903076274757692e-07, + "loss": 1.0619, + "mean_token_accuracy": 0.737995185982436, + "num_tokens": 1260134496.0, + "step": 7200 + }, + { + "epoch": 1.964758384574767, + "grad_norm": 1.1484375, + "learning_rate": 9.200730439668494e-07, + "loss": 1.0665, + "mean_token_accuracy": 0.736280819401145, + "num_tokens": 1261889077.0, + "step": 7210 + }, + { + "epoch": 1.9674836907458824, + "grad_norm": 1.1015625, + "learning_rate": 8.498384604579295e-07, + "loss": 1.0562, + "mean_token_accuracy": 0.7382521599531173, + "num_tokens": 1263571459.0, + "step": 7220 + }, + { + "epoch": 1.9702089969169974, + "grad_norm": 1.3125, + "learning_rate": 7.796038769490097e-07, + "loss": 1.0592, + "mean_token_accuracy": 0.7387294569984079, + "num_tokens": 1265309020.0, + "step": 7230 + }, + { + "epoch": 1.9729343030881126, + "grad_norm": 1.1640625, + "learning_rate": 7.0936929344009e-07, + "loss": 1.0526, + "mean_token_accuracy": 0.7399847122840584, + "num_tokens": 1267094519.0, + "step": 7240 + }, + { + "epoch": 1.9756596092592278, + "grad_norm": 1.2734375, + "learning_rate": 6.391347099311701e-07, + "loss": 1.0842, + "mean_token_accuracy": 0.7341610704548657, + "num_tokens": 1268787322.0, + "step": 7250 + }, + { + "epoch": 1.9783849154303428, + "grad_norm": 1.078125, + "learning_rate": 5.689001264222504e-07, + "loss": 1.0336, + "mean_token_accuracy": 0.7430361213162542, + "num_tokens": 1270503633.0, + "step": 7260 + }, + { + "epoch": 1.981110221601458, + "grad_norm": 1.171875, + "learning_rate": 4.986655429133305e-07, + "loss": 1.0434, + "mean_token_accuracy": 0.741452188976109, + "num_tokens": 1272220571.0, + "step": 7270 + }, + { + "epoch": 1.9838355277725732, + "grad_norm": 1.3125, + "learning_rate": 4.2843095940441076e-07, + "loss": 1.1243, + "mean_token_accuracy": 0.7264798033982516, + "num_tokens": 1274007267.0, + "step": 7280 + }, + { + "epoch": 1.9865608339436882, + "grad_norm": 1.2421875, + "learning_rate": 3.5819637589549096e-07, + "loss": 1.0704, + "mean_token_accuracy": 0.735901418980211, + "num_tokens": 1275737524.0, + "step": 7290 + }, + { + "epoch": 1.9892861401148036, + "grad_norm": 1.15625, + "learning_rate": 2.8796179238657115e-07, + "loss": 1.0758, + "mean_token_accuracy": 0.7356848865747452, + "num_tokens": 1277504467.0, + "step": 7300 + }, + { + "epoch": 1.9920114462859186, + "grad_norm": 1.1953125, + "learning_rate": 2.1772720887765138e-07, + "loss": 1.035, + "mean_token_accuracy": 0.744142868835479, + "num_tokens": 1279280881.0, + "step": 7310 + }, + { + "epoch": 1.9947367524570339, + "grad_norm": 1.1171875, + "learning_rate": 1.4749262536873157e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.7363971907645463, + "num_tokens": 1280999476.0, + "step": 7320 + }, + { + "epoch": 1.997462058628149, + "grad_norm": 1.2734375, + "learning_rate": 7.725804185981177e-08, + "loss": 1.0492, + "mean_token_accuracy": 0.7415896429680288, + "num_tokens": 1282784410.0, + "step": 7330 + }, + { + "epoch": 2.0, + "grad_norm": 0.64453125, + "learning_rate": 7.023458350891979e-09, + "loss": 1.0128, + "mean_token_accuracy": 0.7320375988547434, + "num_tokens": 1284418792.0, + "step": 7340 + } + ], + "logging_steps": 10, + "max_steps": 7340, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.136186251323913e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}