| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 3670, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0002725306171115161, |
| "grad_norm": 106.0, |
| "learning_rate": 0.0, |
| "loss": 11.3973, |
| "mean_token_accuracy": 0.010489485081052408, |
| "num_tokens": 180497.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0027253061711151614, |
| "grad_norm": 89.0, |
| "learning_rate": 2.0361990950226245e-06, |
| "loss": 11.167, |
| "mean_token_accuracy": 0.012705711428780988, |
| "num_tokens": 1772191.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.005450612342230323, |
| "grad_norm": 103.0, |
| "learning_rate": 4.298642533936651e-06, |
| "loss": 10.5518, |
| "mean_token_accuracy": 0.016214983707322973, |
| "num_tokens": 3528128.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.008175918513345483, |
| "grad_norm": 44.25, |
| "learning_rate": 6.5610859728506795e-06, |
| "loss": 9.4561, |
| "mean_token_accuracy": 0.025063409566791962, |
| "num_tokens": 5321820.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.010901224684460645, |
| "grad_norm": 41.25, |
| "learning_rate": 8.823529411764707e-06, |
| "loss": 8.4037, |
| "mean_token_accuracy": 0.0406443662388483, |
| "num_tokens": 7036353.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.013626530855575806, |
| "grad_norm": 41.5, |
| "learning_rate": 1.1085972850678733e-05, |
| "loss": 7.4734, |
| "mean_token_accuracy": 0.05987281463458203, |
| "num_tokens": 8794502.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.016351837026690966, |
| "grad_norm": 29.875, |
| "learning_rate": 1.3348416289592761e-05, |
| "loss": 6.8227, |
| "mean_token_accuracy": 0.0807909039023798, |
| "num_tokens": 10519144.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.019077143197806127, |
| "grad_norm": 26.125, |
| "learning_rate": 1.5610859728506788e-05, |
| "loss": 6.2162, |
| "mean_token_accuracy": 0.10805099562276155, |
| "num_tokens": 12272638.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.02180244936892129, |
| "grad_norm": 23.125, |
| "learning_rate": 1.7873303167420814e-05, |
| "loss": 5.6551, |
| "mean_token_accuracy": 0.13820163225755094, |
| "num_tokens": 14037595.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.02452775554003645, |
| "grad_norm": 14.25, |
| "learning_rate": 2.0135746606334844e-05, |
| "loss": 5.2483, |
| "mean_token_accuracy": 0.1612478678114712, |
| "num_tokens": 15813413.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.027253061711151612, |
| "grad_norm": 13.4375, |
| "learning_rate": 2.239819004524887e-05, |
| "loss": 4.8714, |
| "mean_token_accuracy": 0.18540706855710595, |
| "num_tokens": 17606667.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.029978367882266772, |
| "grad_norm": 13.125, |
| "learning_rate": 2.4660633484162897e-05, |
| "loss": 4.5352, |
| "mean_token_accuracy": 0.21171675145160407, |
| "num_tokens": 19387650.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.03270367405338193, |
| "grad_norm": 8.125, |
| "learning_rate": 2.6923076923076923e-05, |
| "loss": 4.2605, |
| "mean_token_accuracy": 0.23295689946971834, |
| "num_tokens": 21111629.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03542898022449709, |
| "grad_norm": 8.75, |
| "learning_rate": 2.9185520361990953e-05, |
| "loss": 4.0612, |
| "mean_token_accuracy": 0.25047846739180385, |
| "num_tokens": 22855847.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.038154286395612254, |
| "grad_norm": 8.9375, |
| "learning_rate": 3.1447963800904976e-05, |
| "loss": 3.854, |
| "mean_token_accuracy": 0.2728212605463341, |
| "num_tokens": 24647294.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.04087959256672742, |
| "grad_norm": 9.625, |
| "learning_rate": 3.371040723981901e-05, |
| "loss": 3.6741, |
| "mean_token_accuracy": 0.29891129268798977, |
| "num_tokens": 26336128.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.04360489873784258, |
| "grad_norm": 7.625, |
| "learning_rate": 3.5972850678733036e-05, |
| "loss": 3.5007, |
| "mean_token_accuracy": 0.32518296535126867, |
| "num_tokens": 28061528.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.04633020490895774, |
| "grad_norm": 7.75, |
| "learning_rate": 3.8235294117647055e-05, |
| "loss": 3.257, |
| "mean_token_accuracy": 0.35865339674055574, |
| "num_tokens": 29834651.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0490555110800729, |
| "grad_norm": 5.78125, |
| "learning_rate": 4.049773755656109e-05, |
| "loss": 3.1448, |
| "mean_token_accuracy": 0.3812251358292997, |
| "num_tokens": 31533806.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.05178081725118806, |
| "grad_norm": 7.0625, |
| "learning_rate": 4.2760180995475115e-05, |
| "loss": 2.918, |
| "mean_token_accuracy": 0.41093885465525093, |
| "num_tokens": 33328644.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.054506123422303224, |
| "grad_norm": 5.84375, |
| "learning_rate": 4.502262443438914e-05, |
| "loss": 2.8125, |
| "mean_token_accuracy": 0.429937514802441, |
| "num_tokens": 35085202.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.057231429593418384, |
| "grad_norm": 5.59375, |
| "learning_rate": 4.728506787330317e-05, |
| "loss": 2.7217, |
| "mean_token_accuracy": 0.4442617506254464, |
| "num_tokens": 36899685.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.059956735764533545, |
| "grad_norm": 4.6875, |
| "learning_rate": 4.95475113122172e-05, |
| "loss": 2.5219, |
| "mean_token_accuracy": 0.4736042513512075, |
| "num_tokens": 38663867.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0626820419356487, |
| "grad_norm": 6.40625, |
| "learning_rate": 4.994381233319287e-05, |
| "loss": 2.4641, |
| "mean_token_accuracy": 0.4836056975647807, |
| "num_tokens": 40368688.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.06540734810676387, |
| "grad_norm": 4.8125, |
| "learning_rate": 4.9873577749683945e-05, |
| "loss": 2.3962, |
| "mean_token_accuracy": 0.4944497250020504, |
| "num_tokens": 42091556.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.06813265427787903, |
| "grad_norm": 4.3125, |
| "learning_rate": 4.9803343166175026e-05, |
| "loss": 2.3276, |
| "mean_token_accuracy": 0.5070343468338251, |
| "num_tokens": 43832865.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.07085796044899419, |
| "grad_norm": 3.265625, |
| "learning_rate": 4.9733108582666106e-05, |
| "loss": 2.275, |
| "mean_token_accuracy": 0.5160813440568746, |
| "num_tokens": 45556421.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.07358326662010935, |
| "grad_norm": 9.75, |
| "learning_rate": 4.9662873999157186e-05, |
| "loss": 2.2472, |
| "mean_token_accuracy": 0.525664893258363, |
| "num_tokens": 47284838.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.07630857279122451, |
| "grad_norm": 3.421875, |
| "learning_rate": 4.9592639415648266e-05, |
| "loss": 2.163, |
| "mean_token_accuracy": 0.5365134474821389, |
| "num_tokens": 49025592.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.07903387896233968, |
| "grad_norm": 3.53125, |
| "learning_rate": 4.9522404832139346e-05, |
| "loss": 2.2114, |
| "mean_token_accuracy": 0.5303179323673248, |
| "num_tokens": 50771353.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.08175918513345484, |
| "grad_norm": 3.265625, |
| "learning_rate": 4.945217024863043e-05, |
| "loss": 2.0514, |
| "mean_token_accuracy": 0.5541504692286253, |
| "num_tokens": 52521467.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.08448449130457, |
| "grad_norm": 3.359375, |
| "learning_rate": 4.938193566512151e-05, |
| "loss": 2.0561, |
| "mean_token_accuracy": 0.5545550880022347, |
| "num_tokens": 54338858.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.08720979747568516, |
| "grad_norm": 3.25, |
| "learning_rate": 4.931170108161259e-05, |
| "loss": 2.0299, |
| "mean_token_accuracy": 0.5621719690505416, |
| "num_tokens": 56014081.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.08993510364680032, |
| "grad_norm": 3.015625, |
| "learning_rate": 4.924146649810367e-05, |
| "loss": 1.9459, |
| "mean_token_accuracy": 0.5734196378849447, |
| "num_tokens": 57778337.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.09266040981791548, |
| "grad_norm": 2.515625, |
| "learning_rate": 4.9171231914594754e-05, |
| "loss": 1.9223, |
| "mean_token_accuracy": 0.5794367666356266, |
| "num_tokens": 59528977.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.09538571598903064, |
| "grad_norm": 2.5625, |
| "learning_rate": 4.910099733108583e-05, |
| "loss": 1.9025, |
| "mean_token_accuracy": 0.5819809279404581, |
| "num_tokens": 61295985.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0981110221601458, |
| "grad_norm": 3.109375, |
| "learning_rate": 4.903076274757691e-05, |
| "loss": 1.8654, |
| "mean_token_accuracy": 0.5886511621065438, |
| "num_tokens": 63026897.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.10083632833126097, |
| "grad_norm": 2.484375, |
| "learning_rate": 4.896052816406799e-05, |
| "loss": 1.8004, |
| "mean_token_accuracy": 0.6001709839329123, |
| "num_tokens": 64770711.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.10356163450237613, |
| "grad_norm": 2.609375, |
| "learning_rate": 4.889029358055907e-05, |
| "loss": 1.8124, |
| "mean_token_accuracy": 0.6000730013474822, |
| "num_tokens": 66551008.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.10628694067349129, |
| "grad_norm": 2.5, |
| "learning_rate": 4.882005899705015e-05, |
| "loss": 1.8145, |
| "mean_token_accuracy": 0.5984284824691712, |
| "num_tokens": 68320690.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.10901224684460645, |
| "grad_norm": 2.59375, |
| "learning_rate": 4.874982441354123e-05, |
| "loss": 1.7416, |
| "mean_token_accuracy": 0.6101331522688269, |
| "num_tokens": 70095284.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.11173755301572161, |
| "grad_norm": 2.875, |
| "learning_rate": 4.8679589830032316e-05, |
| "loss": 1.7673, |
| "mean_token_accuracy": 0.6082146287895739, |
| "num_tokens": 71803511.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.11446285918683677, |
| "grad_norm": 2.46875, |
| "learning_rate": 4.860935524652339e-05, |
| "loss": 1.6723, |
| "mean_token_accuracy": 0.6224730779416859, |
| "num_tokens": 73528297.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.11718816535795193, |
| "grad_norm": 2.34375, |
| "learning_rate": 4.853912066301447e-05, |
| "loss": 1.6828, |
| "mean_token_accuracy": 0.6226447049528361, |
| "num_tokens": 75274289.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.11991347152906709, |
| "grad_norm": 2.53125, |
| "learning_rate": 4.846888607950555e-05, |
| "loss": 1.7227, |
| "mean_token_accuracy": 0.6148907302878797, |
| "num_tokens": 77077403.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.12263877770018225, |
| "grad_norm": 2.96875, |
| "learning_rate": 4.8398651495996636e-05, |
| "loss": 1.7, |
| "mean_token_accuracy": 0.6187901364639401, |
| "num_tokens": 78859882.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1253640838712974, |
| "grad_norm": 2.515625, |
| "learning_rate": 4.832841691248771e-05, |
| "loss": 1.7034, |
| "mean_token_accuracy": 0.6176527316682041, |
| "num_tokens": 80590976.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.12808939004241257, |
| "grad_norm": 2.359375, |
| "learning_rate": 4.825818232897879e-05, |
| "loss": 1.6386, |
| "mean_token_accuracy": 0.6282966487109661, |
| "num_tokens": 82275126.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.13081469621352773, |
| "grad_norm": 2.203125, |
| "learning_rate": 4.818794774546987e-05, |
| "loss": 1.6267, |
| "mean_token_accuracy": 0.6301278316415846, |
| "num_tokens": 84096910.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.1335400023846429, |
| "grad_norm": 2.40625, |
| "learning_rate": 4.811771316196095e-05, |
| "loss": 1.6628, |
| "mean_token_accuracy": 0.626694044843316, |
| "num_tokens": 85876326.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.13626530855575805, |
| "grad_norm": 2.578125, |
| "learning_rate": 4.804747857845203e-05, |
| "loss": 1.6392, |
| "mean_token_accuracy": 0.6308354771696031, |
| "num_tokens": 87607478.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1389906147268732, |
| "grad_norm": 2.078125, |
| "learning_rate": 4.797724399494311e-05, |
| "loss": 1.5724, |
| "mean_token_accuracy": 0.6407017651945353, |
| "num_tokens": 89350066.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.14171592089798837, |
| "grad_norm": 2.125, |
| "learning_rate": 4.79070094114342e-05, |
| "loss": 1.5855, |
| "mean_token_accuracy": 0.6393750453367829, |
| "num_tokens": 91067910.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.14444122706910353, |
| "grad_norm": 2.109375, |
| "learning_rate": 4.783677482792527e-05, |
| "loss": 1.5632, |
| "mean_token_accuracy": 0.6421649686060846, |
| "num_tokens": 92797017.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.1471665332402187, |
| "grad_norm": 2.09375, |
| "learning_rate": 4.776654024441635e-05, |
| "loss": 1.6004, |
| "mean_token_accuracy": 0.6356727724894882, |
| "num_tokens": 94606329.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.14989183941133385, |
| "grad_norm": 2.09375, |
| "learning_rate": 4.769630566090743e-05, |
| "loss": 1.5549, |
| "mean_token_accuracy": 0.6431776374578476, |
| "num_tokens": 96331087.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.15261714558244902, |
| "grad_norm": 2.390625, |
| "learning_rate": 4.762607107739852e-05, |
| "loss": 1.5888, |
| "mean_token_accuracy": 0.6394226610660553, |
| "num_tokens": 98138711.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.15534245175356418, |
| "grad_norm": 2.265625, |
| "learning_rate": 4.755583649388959e-05, |
| "loss": 1.5821, |
| "mean_token_accuracy": 0.6415903450921178, |
| "num_tokens": 99885005.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.15806775792467936, |
| "grad_norm": 2.640625, |
| "learning_rate": 4.748560191038067e-05, |
| "loss": 1.4567, |
| "mean_token_accuracy": 0.6615531787276268, |
| "num_tokens": 101636075.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.16079306409579452, |
| "grad_norm": 2.203125, |
| "learning_rate": 4.741536732687175e-05, |
| "loss": 1.5205, |
| "mean_token_accuracy": 0.651706058345735, |
| "num_tokens": 103349118.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.16351837026690969, |
| "grad_norm": 2.375, |
| "learning_rate": 4.734513274336283e-05, |
| "loss": 1.5282, |
| "mean_token_accuracy": 0.6503567652776837, |
| "num_tokens": 105033010.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.16624367643802485, |
| "grad_norm": 2.390625, |
| "learning_rate": 4.727489815985391e-05, |
| "loss": 1.4769, |
| "mean_token_accuracy": 0.6579250860959291, |
| "num_tokens": 106723283.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.16896898260914, |
| "grad_norm": 1.7734375, |
| "learning_rate": 4.720466357634499e-05, |
| "loss": 1.4878, |
| "mean_token_accuracy": 0.6554030778817832, |
| "num_tokens": 108436878.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.17169428878025517, |
| "grad_norm": 1.984375, |
| "learning_rate": 4.713442899283608e-05, |
| "loss": 1.4697, |
| "mean_token_accuracy": 0.6599764323793351, |
| "num_tokens": 110203157.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.17441959495137033, |
| "grad_norm": 2.421875, |
| "learning_rate": 4.706419440932715e-05, |
| "loss": 1.5011, |
| "mean_token_accuracy": 0.6555302709341049, |
| "num_tokens": 111949130.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.1771449011224855, |
| "grad_norm": 2.046875, |
| "learning_rate": 4.6993959825818233e-05, |
| "loss": 1.5068, |
| "mean_token_accuracy": 0.6517163597047329, |
| "num_tokens": 113652926.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.17987020729360065, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.6923725242309314e-05, |
| "loss": 1.4746, |
| "mean_token_accuracy": 0.657813799008727, |
| "num_tokens": 115334647.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.1825955134647158, |
| "grad_norm": 1.875, |
| "learning_rate": 4.68534906588004e-05, |
| "loss": 1.4268, |
| "mean_token_accuracy": 0.6688082559965551, |
| "num_tokens": 117032381.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.18532081963583097, |
| "grad_norm": 1.7421875, |
| "learning_rate": 4.6783256075291474e-05, |
| "loss": 1.4713, |
| "mean_token_accuracy": 0.6591908087022602, |
| "num_tokens": 118801553.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.18804612580694613, |
| "grad_norm": 1.90625, |
| "learning_rate": 4.6713021491782554e-05, |
| "loss": 1.4687, |
| "mean_token_accuracy": 0.6593087091110647, |
| "num_tokens": 120530271.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.1907714319780613, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.6642786908273634e-05, |
| "loss": 1.4613, |
| "mean_token_accuracy": 0.6628443499095737, |
| "num_tokens": 122314411.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.19349673814917645, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.6572552324764715e-05, |
| "loss": 1.4554, |
| "mean_token_accuracy": 0.6636740594170988, |
| "num_tokens": 124054113.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.1962220443202916, |
| "grad_norm": 1.9609375, |
| "learning_rate": 4.6502317741255795e-05, |
| "loss": 1.4493, |
| "mean_token_accuracy": 0.6633484376594424, |
| "num_tokens": 125786705.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.19894735049140677, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.6432083157746875e-05, |
| "loss": 1.4639, |
| "mean_token_accuracy": 0.6606446763500571, |
| "num_tokens": 127510112.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.20167265666252193, |
| "grad_norm": 1.75, |
| "learning_rate": 4.636184857423796e-05, |
| "loss": 1.4136, |
| "mean_token_accuracy": 0.6700579337775707, |
| "num_tokens": 129321733.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.2043979628336371, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.6291613990729035e-05, |
| "loss": 1.4551, |
| "mean_token_accuracy": 0.6638867166824639, |
| "num_tokens": 131068939.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.20712326900475225, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.6221379407220116e-05, |
| "loss": 1.4996, |
| "mean_token_accuracy": 0.6567820507101715, |
| "num_tokens": 132800192.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.2098485751758674, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.6151144823711196e-05, |
| "loss": 1.4082, |
| "mean_token_accuracy": 0.6725725987926126, |
| "num_tokens": 134501880.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.21257388134698257, |
| "grad_norm": 1.9296875, |
| "learning_rate": 4.608091024020228e-05, |
| "loss": 1.3787, |
| "mean_token_accuracy": 0.676600266713649, |
| "num_tokens": 136227230.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.21529918751809773, |
| "grad_norm": 1.9921875, |
| "learning_rate": 4.6010675656693356e-05, |
| "loss": 1.3956, |
| "mean_token_accuracy": 0.6742036573588848, |
| "num_tokens": 137938433.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.2180244936892129, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.5940441073184436e-05, |
| "loss": 1.4021, |
| "mean_token_accuracy": 0.6748411299660801, |
| "num_tokens": 139667163.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.22074979986032806, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.5870206489675517e-05, |
| "loss": 1.3732, |
| "mean_token_accuracy": 0.6786113461479545, |
| "num_tokens": 141481099.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.22347510603144322, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.57999719061666e-05, |
| "loss": 1.4499, |
| "mean_token_accuracy": 0.666275049932301, |
| "num_tokens": 143204243.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.22620041220255838, |
| "grad_norm": 1.921875, |
| "learning_rate": 4.572973732265768e-05, |
| "loss": 1.3878, |
| "mean_token_accuracy": 0.67771971905604, |
| "num_tokens": 144995581.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.22892571837367354, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.565950273914876e-05, |
| "loss": 1.4013, |
| "mean_token_accuracy": 0.67232207627967, |
| "num_tokens": 146711076.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.2316510245447887, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.5589268155639844e-05, |
| "loss": 1.3717, |
| "mean_token_accuracy": 0.6795911006629467, |
| "num_tokens": 148463902.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.23437633071590386, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.551903357213092e-05, |
| "loss": 1.4148, |
| "mean_token_accuracy": 0.671318475343287, |
| "num_tokens": 150212801.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.23710163688701902, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.5448798988622e-05, |
| "loss": 1.3469, |
| "mean_token_accuracy": 0.6808905070647597, |
| "num_tokens": 151950016.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.23982694305813418, |
| "grad_norm": 1.65625, |
| "learning_rate": 4.537856440511308e-05, |
| "loss": 1.3565, |
| "mean_token_accuracy": 0.6808952454477548, |
| "num_tokens": 153686341.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.24255224922924934, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.5308329821604165e-05, |
| "loss": 1.3905, |
| "mean_token_accuracy": 0.6750849165953696, |
| "num_tokens": 155442220.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.2452775554003645, |
| "grad_norm": 1.78125, |
| "learning_rate": 4.523809523809524e-05, |
| "loss": 1.3475, |
| "mean_token_accuracy": 0.6828798386268318, |
| "num_tokens": 157222278.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.24800286157147966, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.516786065458632e-05, |
| "loss": 1.3467, |
| "mean_token_accuracy": 0.682690916582942, |
| "num_tokens": 158989873.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.2507281677425948, |
| "grad_norm": 1.9296875, |
| "learning_rate": 4.50976260710774e-05, |
| "loss": 1.3645, |
| "mean_token_accuracy": 0.6824150150641799, |
| "num_tokens": 160740690.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.25345347391371, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.502739148756848e-05, |
| "loss": 1.3808, |
| "mean_token_accuracy": 0.6758302460424602, |
| "num_tokens": 162505210.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.25617878008482514, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.495715690405956e-05, |
| "loss": 1.3703, |
| "mean_token_accuracy": 0.6793876992538571, |
| "num_tokens": 164184119.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.2589040862559403, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.488692232055064e-05, |
| "loss": 1.3309, |
| "mean_token_accuracy": 0.6854429397732019, |
| "num_tokens": 165971146.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.26162939242705546, |
| "grad_norm": 2.28125, |
| "learning_rate": 4.4816687737041726e-05, |
| "loss": 1.3738, |
| "mean_token_accuracy": 0.6808017442002893, |
| "num_tokens": 167701701.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.2643546985981706, |
| "grad_norm": 2.21875, |
| "learning_rate": 4.47464531535328e-05, |
| "loss": 1.379, |
| "mean_token_accuracy": 0.6793407511897385, |
| "num_tokens": 169443326.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.2670800047692858, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.467621857002388e-05, |
| "loss": 1.363, |
| "mean_token_accuracy": 0.6808341681025922, |
| "num_tokens": 171199385.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.26980531094040094, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.460598398651496e-05, |
| "loss": 1.3164, |
| "mean_token_accuracy": 0.6899495711550117, |
| "num_tokens": 172929493.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.2725306171115161, |
| "grad_norm": 2.28125, |
| "learning_rate": 4.453574940300605e-05, |
| "loss": 1.3187, |
| "mean_token_accuracy": 0.6878650960512459, |
| "num_tokens": 174672601.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.27525592328263127, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.446551481949712e-05, |
| "loss": 1.3538, |
| "mean_token_accuracy": 0.6838336682878434, |
| "num_tokens": 176441962.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.2779812294537464, |
| "grad_norm": 1.75, |
| "learning_rate": 4.43952802359882e-05, |
| "loss": 1.3101, |
| "mean_token_accuracy": 0.6909397638402879, |
| "num_tokens": 178197361.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.2807065356248616, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.432504565247929e-05, |
| "loss": 1.3477, |
| "mean_token_accuracy": 0.6821904895827174, |
| "num_tokens": 179887580.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.28343184179597675, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.425481106897036e-05, |
| "loss": 1.3022, |
| "mean_token_accuracy": 0.6917614788748324, |
| "num_tokens": 181655865.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.2861571479670919, |
| "grad_norm": 1.7578125, |
| "learning_rate": 4.418457648546144e-05, |
| "loss": 1.3359, |
| "mean_token_accuracy": 0.6866448893211782, |
| "num_tokens": 183445880.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.28888245413820707, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.411434190195252e-05, |
| "loss": 1.2911, |
| "mean_token_accuracy": 0.69377696281299, |
| "num_tokens": 185175706.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.29160776030932223, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.404410731844361e-05, |
| "loss": 1.3015, |
| "mean_token_accuracy": 0.6912993769161403, |
| "num_tokens": 186873396.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.2943330664804374, |
| "grad_norm": 1.65625, |
| "learning_rate": 4.397387273493468e-05, |
| "loss": 1.3535, |
| "mean_token_accuracy": 0.6828196115791798, |
| "num_tokens": 188591288.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.29705837265155255, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.390363815142576e-05, |
| "loss": 1.3351, |
| "mean_token_accuracy": 0.6864844439551234, |
| "num_tokens": 190375182.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.2997836788226677, |
| "grad_norm": 1.625, |
| "learning_rate": 4.383340356791684e-05, |
| "loss": 1.3093, |
| "mean_token_accuracy": 0.6901756428182125, |
| "num_tokens": 192104609.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.30250898499378287, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.376316898440793e-05, |
| "loss": 1.2279, |
| "mean_token_accuracy": 0.7054048574529588, |
| "num_tokens": 193819768.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.30523429116489803, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.3692934400899e-05, |
| "loss": 1.3105, |
| "mean_token_accuracy": 0.6918096936307847, |
| "num_tokens": 195541705.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.3079595973360132, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.362269981739008e-05, |
| "loss": 1.2943, |
| "mean_token_accuracy": 0.692951999604702, |
| "num_tokens": 197250976.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.31068490350712835, |
| "grad_norm": 1.453125, |
| "learning_rate": 4.355246523388117e-05, |
| "loss": 1.2566, |
| "mean_token_accuracy": 0.7000818770378828, |
| "num_tokens": 198964087.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.31341020967824357, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.348223065037224e-05, |
| "loss": 1.3103, |
| "mean_token_accuracy": 0.6903091154061258, |
| "num_tokens": 200654341.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.31613551584935873, |
| "grad_norm": 1.7578125, |
| "learning_rate": 4.3411996066863323e-05, |
| "loss": 1.3449, |
| "mean_token_accuracy": 0.6847572137601674, |
| "num_tokens": 202410389.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.3188608220204739, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.3341761483354404e-05, |
| "loss": 1.2936, |
| "mean_token_accuracy": 0.6938237980008125, |
| "num_tokens": 204176073.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.32158612819158905, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.327152689984549e-05, |
| "loss": 1.284, |
| "mean_token_accuracy": 0.6962682608515024, |
| "num_tokens": 206017214.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.3243114343627042, |
| "grad_norm": 2.390625, |
| "learning_rate": 4.3201292316336564e-05, |
| "loss": 1.3027, |
| "mean_token_accuracy": 0.6909335135482252, |
| "num_tokens": 207832984.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.32703674053381937, |
| "grad_norm": 1.515625, |
| "learning_rate": 4.3131057732827644e-05, |
| "loss": 1.299, |
| "mean_token_accuracy": 0.6923303379677236, |
| "num_tokens": 209533427.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.32976204670493453, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.3060823149318724e-05, |
| "loss": 1.2863, |
| "mean_token_accuracy": 0.6948139815591275, |
| "num_tokens": 211344332.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.3324873528760497, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.299058856580981e-05, |
| "loss": 1.3097, |
| "mean_token_accuracy": 0.6901804354973138, |
| "num_tokens": 213088746.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.33521265904716485, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.2920353982300885e-05, |
| "loss": 1.3145, |
| "mean_token_accuracy": 0.6885634188540279, |
| "num_tokens": 214889528.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.33793796521828, |
| "grad_norm": 1.8828125, |
| "learning_rate": 4.2850119398791965e-05, |
| "loss": 1.3272, |
| "mean_token_accuracy": 0.6894508360885083, |
| "num_tokens": 216603081.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.3406632713893952, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.277988481528305e-05, |
| "loss": 1.2541, |
| "mean_token_accuracy": 0.6997496448457241, |
| "num_tokens": 218292845.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.34338857756051033, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.2709650231774125e-05, |
| "loss": 1.2827, |
| "mean_token_accuracy": 0.6969962599687278, |
| "num_tokens": 220029696.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.3461138837316255, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.2639415648265206e-05, |
| "loss": 1.3492, |
| "mean_token_accuracy": 0.685709635540843, |
| "num_tokens": 221810553.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.34883918990274065, |
| "grad_norm": 1.515625, |
| "learning_rate": 4.2569181064756286e-05, |
| "loss": 1.2669, |
| "mean_token_accuracy": 0.7004628435708582, |
| "num_tokens": 223552801.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.3515644960738558, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.249894648124737e-05, |
| "loss": 1.279, |
| "mean_token_accuracy": 0.6973608860746026, |
| "num_tokens": 225254929.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.354289802244971, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.2428711897738446e-05, |
| "loss": 1.3133, |
| "mean_token_accuracy": 0.6911240560933948, |
| "num_tokens": 227017304.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.35701510841608614, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.2358477314229526e-05, |
| "loss": 1.2394, |
| "mean_token_accuracy": 0.7042814038693905, |
| "num_tokens": 228785771.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.3597404145872013, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.2288242730720607e-05, |
| "loss": 1.2462, |
| "mean_token_accuracy": 0.702515134587884, |
| "num_tokens": 230489667.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.36246572075831646, |
| "grad_norm": 1.515625, |
| "learning_rate": 4.2218008147211694e-05, |
| "loss": 1.2768, |
| "mean_token_accuracy": 0.6976381672546268, |
| "num_tokens": 232269581.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.3651910269294316, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.214777356370277e-05, |
| "loss": 1.2795, |
| "mean_token_accuracy": 0.6947634796611964, |
| "num_tokens": 234028126.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.3679163331005468, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.207753898019385e-05, |
| "loss": 1.305, |
| "mean_token_accuracy": 0.6930719532072545, |
| "num_tokens": 235817129.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.37064163927166194, |
| "grad_norm": 1.5390625, |
| "learning_rate": 4.2007304396684934e-05, |
| "loss": 1.2765, |
| "mean_token_accuracy": 0.6983222321607172, |
| "num_tokens": 237588984.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.3733669454427771, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.193706981317601e-05, |
| "loss": 1.2305, |
| "mean_token_accuracy": 0.7052510293200612, |
| "num_tokens": 239305200.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.37609225161389226, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.186683522966709e-05, |
| "loss": 1.2552, |
| "mean_token_accuracy": 0.7010797799564898, |
| "num_tokens": 241078759.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.3788175577850074, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.179660064615817e-05, |
| "loss": 1.2903, |
| "mean_token_accuracy": 0.6959062526933849, |
| "num_tokens": 242876841.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.3815428639561226, |
| "grad_norm": 1.5625, |
| "learning_rate": 4.1726366062649255e-05, |
| "loss": 1.2394, |
| "mean_token_accuracy": 0.7018366500735282, |
| "num_tokens": 244578724.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.38426817012723774, |
| "grad_norm": 1.4140625, |
| "learning_rate": 4.165613147914033e-05, |
| "loss": 1.2776, |
| "mean_token_accuracy": 0.6954536657780409, |
| "num_tokens": 246363009.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.3869934762983529, |
| "grad_norm": 1.5, |
| "learning_rate": 4.158589689563141e-05, |
| "loss": 1.3017, |
| "mean_token_accuracy": 0.6934069953858852, |
| "num_tokens": 248120613.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.38971878246946806, |
| "grad_norm": 1.65625, |
| "learning_rate": 4.151566231212249e-05, |
| "loss": 1.2857, |
| "mean_token_accuracy": 0.6958935803733766, |
| "num_tokens": 249941867.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.3924440886405832, |
| "grad_norm": 1.390625, |
| "learning_rate": 4.1445427728613576e-05, |
| "loss": 1.2328, |
| "mean_token_accuracy": 0.7059727218933404, |
| "num_tokens": 251695734.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.3951693948116984, |
| "grad_norm": 1.4296875, |
| "learning_rate": 4.137519314510465e-05, |
| "loss": 1.2814, |
| "mean_token_accuracy": 0.6967121254652738, |
| "num_tokens": 253503211.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.39789470098281354, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.130495856159573e-05, |
| "loss": 1.2694, |
| "mean_token_accuracy": 0.6995078191161156, |
| "num_tokens": 255313734.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.4006200071539287, |
| "grad_norm": 1.375, |
| "learning_rate": 4.1234723978086816e-05, |
| "loss": 1.1998, |
| "mean_token_accuracy": 0.7114904819987714, |
| "num_tokens": 257016513.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.40334531332504386, |
| "grad_norm": 1.53125, |
| "learning_rate": 4.1164489394577896e-05, |
| "loss": 1.2871, |
| "mean_token_accuracy": 0.6948545157909394, |
| "num_tokens": 258760362.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.406070619496159, |
| "grad_norm": 1.5625, |
| "learning_rate": 4.109425481106897e-05, |
| "loss": 1.2301, |
| "mean_token_accuracy": 0.7058748141862452, |
| "num_tokens": 260546971.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.4087959256672742, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.102402022756005e-05, |
| "loss": 1.2515, |
| "mean_token_accuracy": 0.7016466647386551, |
| "num_tokens": 262307672.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.41152123183838935, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.095378564405114e-05, |
| "loss": 1.2595, |
| "mean_token_accuracy": 0.7010509856045246, |
| "num_tokens": 264023889.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.4142465380095045, |
| "grad_norm": 1.4453125, |
| "learning_rate": 4.088355106054221e-05, |
| "loss": 1.2509, |
| "mean_token_accuracy": 0.7020703799091279, |
| "num_tokens": 265798519.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.41697184418061967, |
| "grad_norm": 1.4609375, |
| "learning_rate": 4.081331647703329e-05, |
| "loss": 1.2775, |
| "mean_token_accuracy": 0.6975624321959912, |
| "num_tokens": 267546100.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.4196971503517348, |
| "grad_norm": 1.46875, |
| "learning_rate": 4.074308189352437e-05, |
| "loss": 1.2272, |
| "mean_token_accuracy": 0.7062510661780834, |
| "num_tokens": 269297641.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.42242245652285, |
| "grad_norm": 1.5, |
| "learning_rate": 4.067284731001546e-05, |
| "loss": 1.2729, |
| "mean_token_accuracy": 0.6973782840184868, |
| "num_tokens": 271053006.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.42514776269396515, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.060261272650653e-05, |
| "loss": 1.2565, |
| "mean_token_accuracy": 0.7016788011416792, |
| "num_tokens": 272829487.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.4278730688650803, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.053237814299761e-05, |
| "loss": 1.246, |
| "mean_token_accuracy": 0.7026969991624356, |
| "num_tokens": 274621990.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.43059837503619547, |
| "grad_norm": 1.625, |
| "learning_rate": 4.04621435594887e-05, |
| "loss": 1.2623, |
| "mean_token_accuracy": 0.6991809997707605, |
| "num_tokens": 276398989.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.43332368120731063, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.039190897597978e-05, |
| "loss": 1.232, |
| "mean_token_accuracy": 0.7039873175323009, |
| "num_tokens": 278192884.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.4360489873784258, |
| "grad_norm": 1.5625, |
| "learning_rate": 4.032167439247085e-05, |
| "loss": 1.2205, |
| "mean_token_accuracy": 0.7079121223650873, |
| "num_tokens": 279937931.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.43877429354954095, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.025143980896193e-05, |
| "loss": 1.2283, |
| "mean_token_accuracy": 0.7071776267141103, |
| "num_tokens": 281698143.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.4414995997206561, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.018120522545302e-05, |
| "loss": 1.2414, |
| "mean_token_accuracy": 0.7045015564188362, |
| "num_tokens": 283412565.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.44422490589177127, |
| "grad_norm": 1.453125, |
| "learning_rate": 4.011097064194409e-05, |
| "loss": 1.2495, |
| "mean_token_accuracy": 0.7027601384557783, |
| "num_tokens": 285184244.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.44695021206288643, |
| "grad_norm": 1.3359375, |
| "learning_rate": 4.004073605843517e-05, |
| "loss": 1.1779, |
| "mean_token_accuracy": 0.7148963597603142, |
| "num_tokens": 286963226.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.4496755182340016, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.997050147492625e-05, |
| "loss": 1.21, |
| "mean_token_accuracy": 0.7099241388961672, |
| "num_tokens": 288742524.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.45240082440511675, |
| "grad_norm": 1.875, |
| "learning_rate": 3.990026689141734e-05, |
| "loss": 1.2415, |
| "mean_token_accuracy": 0.7042178069241345, |
| "num_tokens": 290486439.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.4551261305762319, |
| "grad_norm": 1.4375, |
| "learning_rate": 3.9830032307908413e-05, |
| "loss": 1.246, |
| "mean_token_accuracy": 0.7043319317512214, |
| "num_tokens": 292244616.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.4578514367473471, |
| "grad_norm": 1.484375, |
| "learning_rate": 3.9759797724399494e-05, |
| "loss": 1.2455, |
| "mean_token_accuracy": 0.703259102255106, |
| "num_tokens": 294028863.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.46057674291846223, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.968956314089058e-05, |
| "loss": 1.2085, |
| "mean_token_accuracy": 0.708068885654211, |
| "num_tokens": 295842660.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.4633020490895774, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.961932855738166e-05, |
| "loss": 1.2173, |
| "mean_token_accuracy": 0.7098737230524421, |
| "num_tokens": 297606076.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.46602735526069256, |
| "grad_norm": 1.4375, |
| "learning_rate": 3.9549093973872734e-05, |
| "loss": 1.2501, |
| "mean_token_accuracy": 0.7022371832281351, |
| "num_tokens": 299367632.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.4687526614318077, |
| "grad_norm": 1.4296875, |
| "learning_rate": 3.9478859390363814e-05, |
| "loss": 1.1976, |
| "mean_token_accuracy": 0.7129932347685098, |
| "num_tokens": 301097990.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.4714779676029229, |
| "grad_norm": 1.421875, |
| "learning_rate": 3.94086248068549e-05, |
| "loss": 1.2101, |
| "mean_token_accuracy": 0.7102027184329927, |
| "num_tokens": 302847308.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.47420327377403804, |
| "grad_norm": 1.375, |
| "learning_rate": 3.9338390223345975e-05, |
| "loss": 1.2315, |
| "mean_token_accuracy": 0.704880575183779, |
| "num_tokens": 304582625.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.4769285799451532, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.9268155639837055e-05, |
| "loss": 1.2088, |
| "mean_token_accuracy": 0.7108904106542469, |
| "num_tokens": 306358206.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.47965388611626836, |
| "grad_norm": 1.515625, |
| "learning_rate": 3.9197921056328135e-05, |
| "loss": 1.1976, |
| "mean_token_accuracy": 0.712415215652436, |
| "num_tokens": 308026562.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.4823791922873835, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.912768647281922e-05, |
| "loss": 1.2466, |
| "mean_token_accuracy": 0.7024014497175812, |
| "num_tokens": 309804543.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.4851044984584987, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.9057451889310296e-05, |
| "loss": 1.2379, |
| "mean_token_accuracy": 0.7045082511380315, |
| "num_tokens": 311549771.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.48782980462961384, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.8987217305801376e-05, |
| "loss": 1.2623, |
| "mean_token_accuracy": 0.701041791215539, |
| "num_tokens": 313326889.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.490555110800729, |
| "grad_norm": 1.5703125, |
| "learning_rate": 3.891698272229246e-05, |
| "loss": 1.2005, |
| "mean_token_accuracy": 0.7130162584595382, |
| "num_tokens": 315064715.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.49328041697184416, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.884674813878354e-05, |
| "loss": 1.2161, |
| "mean_token_accuracy": 0.7079024517908692, |
| "num_tokens": 316809509.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.4960057231429593, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.8776513555274616e-05, |
| "loss": 1.203, |
| "mean_token_accuracy": 0.711275870539248, |
| "num_tokens": 318597481.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.4987310293140745, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.8706278971765697e-05, |
| "loss": 1.188, |
| "mean_token_accuracy": 0.7133826318196952, |
| "num_tokens": 320382863.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.5014563354851896, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.8636044388256784e-05, |
| "loss": 1.2159, |
| "mean_token_accuracy": 0.710312622692436, |
| "num_tokens": 322136431.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.5041816416563049, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.856580980474786e-05, |
| "loss": 1.1922, |
| "mean_token_accuracy": 0.7139192272908985, |
| "num_tokens": 323892089.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.50690694782742, |
| "grad_norm": 1.4140625, |
| "learning_rate": 3.849557522123894e-05, |
| "loss": 1.2107, |
| "mean_token_accuracy": 0.7097873773425818, |
| "num_tokens": 325693457.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.5096322539985352, |
| "grad_norm": 1.5234375, |
| "learning_rate": 3.842534063773002e-05, |
| "loss": 1.1781, |
| "mean_token_accuracy": 0.7143914319574833, |
| "num_tokens": 327397884.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.5123575601696503, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.8355106054221104e-05, |
| "loss": 1.2316, |
| "mean_token_accuracy": 0.7061603724025189, |
| "num_tokens": 329077502.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.5150828663407655, |
| "grad_norm": 1.4765625, |
| "learning_rate": 3.828487147071218e-05, |
| "loss": 1.2195, |
| "mean_token_accuracy": 0.708381280489266, |
| "num_tokens": 330847695.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.5178081725118806, |
| "grad_norm": 1.375, |
| "learning_rate": 3.821463688720326e-05, |
| "loss": 1.1845, |
| "mean_token_accuracy": 0.7121982695534825, |
| "num_tokens": 332578366.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5205334786829958, |
| "grad_norm": 1.4609375, |
| "learning_rate": 3.8144402303694345e-05, |
| "loss": 1.2265, |
| "mean_token_accuracy": 0.7070099180564284, |
| "num_tokens": 334293589.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.5232587848541109, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.8074167720185425e-05, |
| "loss": 1.2075, |
| "mean_token_accuracy": 0.707461370434612, |
| "num_tokens": 336016972.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.5259840910252261, |
| "grad_norm": 1.2890625, |
| "learning_rate": 3.80039331366765e-05, |
| "loss": 1.1813, |
| "mean_token_accuracy": 0.714381551090628, |
| "num_tokens": 337805977.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.5287093971963412, |
| "grad_norm": 1.3203125, |
| "learning_rate": 3.793369855316758e-05, |
| "loss": 1.149, |
| "mean_token_accuracy": 0.7198109852150083, |
| "num_tokens": 339548221.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.5314347033674565, |
| "grad_norm": 1.671875, |
| "learning_rate": 3.7863463969658666e-05, |
| "loss": 1.2441, |
| "mean_token_accuracy": 0.7022737297229469, |
| "num_tokens": 341268344.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5341600095385716, |
| "grad_norm": 1.46875, |
| "learning_rate": 3.779322938614974e-05, |
| "loss": 1.2015, |
| "mean_token_accuracy": 0.7109639048576355, |
| "num_tokens": 343017522.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.5368853157096868, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.772299480264082e-05, |
| "loss": 1.193, |
| "mean_token_accuracy": 0.7128876778297126, |
| "num_tokens": 344699192.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.5396106218808019, |
| "grad_norm": 1.375, |
| "learning_rate": 3.7652760219131906e-05, |
| "loss": 1.1785, |
| "mean_token_accuracy": 0.7159237092360854, |
| "num_tokens": 346458527.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.5423359280519171, |
| "grad_norm": 1.4453125, |
| "learning_rate": 3.7582525635622986e-05, |
| "loss": 1.1895, |
| "mean_token_accuracy": 0.7135124854743481, |
| "num_tokens": 348204585.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.5450612342230322, |
| "grad_norm": 1.4609375, |
| "learning_rate": 3.751229105211406e-05, |
| "loss": 1.2192, |
| "mean_token_accuracy": 0.7084178974851966, |
| "num_tokens": 349928259.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5477865403941474, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.744205646860514e-05, |
| "loss": 1.1833, |
| "mean_token_accuracy": 0.7143334408290685, |
| "num_tokens": 351568018.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.5505118465652625, |
| "grad_norm": 1.3359375, |
| "learning_rate": 3.737182188509623e-05, |
| "loss": 1.1763, |
| "mean_token_accuracy": 0.7172479030676187, |
| "num_tokens": 353325716.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.5532371527363777, |
| "grad_norm": 1.4140625, |
| "learning_rate": 3.730158730158731e-05, |
| "loss": 1.2195, |
| "mean_token_accuracy": 0.707148808799684, |
| "num_tokens": 355099655.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.5559624589074929, |
| "grad_norm": 1.4140625, |
| "learning_rate": 3.723135271807838e-05, |
| "loss": 1.1867, |
| "mean_token_accuracy": 0.7141209022141993, |
| "num_tokens": 356892448.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.5586877650786081, |
| "grad_norm": 1.453125, |
| "learning_rate": 3.716111813456946e-05, |
| "loss": 1.2126, |
| "mean_token_accuracy": 0.709683568123728, |
| "num_tokens": 358599855.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.5614130712497232, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.709088355106055e-05, |
| "loss": 1.2314, |
| "mean_token_accuracy": 0.705540257319808, |
| "num_tokens": 360366528.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.5641383774208384, |
| "grad_norm": 1.4765625, |
| "learning_rate": 3.702064896755162e-05, |
| "loss": 1.1838, |
| "mean_token_accuracy": 0.7144955797120929, |
| "num_tokens": 362123903.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.5668636835919535, |
| "grad_norm": 1.3671875, |
| "learning_rate": 3.69504143840427e-05, |
| "loss": 1.1718, |
| "mean_token_accuracy": 0.7182300767861307, |
| "num_tokens": 363907270.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.5695889897630687, |
| "grad_norm": 1.4375, |
| "learning_rate": 3.688017980053379e-05, |
| "loss": 1.1563, |
| "mean_token_accuracy": 0.7186566211283207, |
| "num_tokens": 365602883.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.5723142959341838, |
| "grad_norm": 1.5, |
| "learning_rate": 3.680994521702487e-05, |
| "loss": 1.1757, |
| "mean_token_accuracy": 0.7156617695465683, |
| "num_tokens": 367308921.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.575039602105299, |
| "grad_norm": 1.4140625, |
| "learning_rate": 3.673971063351594e-05, |
| "loss": 1.1835, |
| "mean_token_accuracy": 0.7141332181170583, |
| "num_tokens": 369068618.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.5777649082764141, |
| "grad_norm": 1.421875, |
| "learning_rate": 3.666947605000702e-05, |
| "loss": 1.1935, |
| "mean_token_accuracy": 0.713552170060575, |
| "num_tokens": 370792272.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.5804902144475294, |
| "grad_norm": 1.4921875, |
| "learning_rate": 3.659924146649811e-05, |
| "loss": 1.199, |
| "mean_token_accuracy": 0.7122195997275412, |
| "num_tokens": 372506541.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.5832155206186445, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.652900688298919e-05, |
| "loss": 1.1418, |
| "mean_token_accuracy": 0.722984395455569, |
| "num_tokens": 374308542.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.5859408267897597, |
| "grad_norm": 1.3671875, |
| "learning_rate": 3.645877229948026e-05, |
| "loss": 1.1544, |
| "mean_token_accuracy": 0.7199711099267005, |
| "num_tokens": 376016266.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5886661329608748, |
| "grad_norm": 1.25, |
| "learning_rate": 3.638853771597134e-05, |
| "loss": 1.2019, |
| "mean_token_accuracy": 0.712724674679339, |
| "num_tokens": 377732713.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.59139143913199, |
| "grad_norm": 2.15625, |
| "learning_rate": 3.631830313246243e-05, |
| "loss": 1.1785, |
| "mean_token_accuracy": 0.7151990966871381, |
| "num_tokens": 379457927.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.5941167453031051, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.6248068548953503e-05, |
| "loss": 1.2353, |
| "mean_token_accuracy": 0.7053435018286109, |
| "num_tokens": 381311531.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.5968420514742203, |
| "grad_norm": 1.4609375, |
| "learning_rate": 3.6177833965444584e-05, |
| "loss": 1.2218, |
| "mean_token_accuracy": 0.7103118651546538, |
| "num_tokens": 383076036.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.5995673576453354, |
| "grad_norm": 1.46875, |
| "learning_rate": 3.610759938193567e-05, |
| "loss": 1.2154, |
| "mean_token_accuracy": 0.7096952789463102, |
| "num_tokens": 384869981.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6022926638164506, |
| "grad_norm": 1.46875, |
| "learning_rate": 3.603736479842675e-05, |
| "loss": 1.191, |
| "mean_token_accuracy": 0.7129226897843182, |
| "num_tokens": 386654361.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.6050179699875657, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.5967130214917824e-05, |
| "loss": 1.1628, |
| "mean_token_accuracy": 0.7192864948883653, |
| "num_tokens": 388405388.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.607743276158681, |
| "grad_norm": 1.375, |
| "learning_rate": 3.5896895631408904e-05, |
| "loss": 1.1976, |
| "mean_token_accuracy": 0.7123970666900277, |
| "num_tokens": 390160411.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.6104685823297961, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.582666104789999e-05, |
| "loss": 1.2059, |
| "mean_token_accuracy": 0.7098761620000005, |
| "num_tokens": 391963393.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.6131938885009113, |
| "grad_norm": 1.515625, |
| "learning_rate": 3.575642646439107e-05, |
| "loss": 1.2058, |
| "mean_token_accuracy": 0.7112857546657324, |
| "num_tokens": 393719494.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6159191946720264, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.5686191880882145e-05, |
| "loss": 1.2005, |
| "mean_token_accuracy": 0.7123585233464838, |
| "num_tokens": 395419867.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.6186445008431416, |
| "grad_norm": 1.3203125, |
| "learning_rate": 3.5615957297373225e-05, |
| "loss": 1.1723, |
| "mean_token_accuracy": 0.7168289897032082, |
| "num_tokens": 397164636.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.6213698070142567, |
| "grad_norm": 1.5078125, |
| "learning_rate": 3.554572271386431e-05, |
| "loss": 1.2103, |
| "mean_token_accuracy": 0.7081712177023292, |
| "num_tokens": 398897880.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.6240951131853719, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.5475488130355386e-05, |
| "loss": 1.188, |
| "mean_token_accuracy": 0.7136732900515199, |
| "num_tokens": 400668616.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.6268204193564871, |
| "grad_norm": 1.375, |
| "learning_rate": 3.5405253546846466e-05, |
| "loss": 1.1751, |
| "mean_token_accuracy": 0.716455262992531, |
| "num_tokens": 402379710.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6295457255276022, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.533501896333755e-05, |
| "loss": 1.178, |
| "mean_token_accuracy": 0.7147075609304011, |
| "num_tokens": 404084345.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.6322710316987175, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.526478437982863e-05, |
| "loss": 1.1428, |
| "mean_token_accuracy": 0.7234298737719655, |
| "num_tokens": 405770321.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.6349963378698326, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.5194549796319706e-05, |
| "loss": 1.1634, |
| "mean_token_accuracy": 0.7181408229283989, |
| "num_tokens": 407492020.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.6377216440409478, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.5124315212810787e-05, |
| "loss": 1.1646, |
| "mean_token_accuracy": 0.7192243071272969, |
| "num_tokens": 409257115.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.6404469502120629, |
| "grad_norm": 1.421875, |
| "learning_rate": 3.5054080629301874e-05, |
| "loss": 1.1939, |
| "mean_token_accuracy": 0.7125861537642777, |
| "num_tokens": 410986071.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6431722563831781, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.4983846045792954e-05, |
| "loss": 1.1776, |
| "mean_token_accuracy": 0.7160876172594726, |
| "num_tokens": 412763897.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.6458975625542932, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.491361146228403e-05, |
| "loss": 1.1323, |
| "mean_token_accuracy": 0.7251203707419336, |
| "num_tokens": 414503168.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.6486228687254084, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.484337687877511e-05, |
| "loss": 1.1612, |
| "mean_token_accuracy": 0.7199062428437173, |
| "num_tokens": 416227145.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.6513481748965235, |
| "grad_norm": 1.3046875, |
| "learning_rate": 3.4773142295266194e-05, |
| "loss": 1.1582, |
| "mean_token_accuracy": 0.7184870925731957, |
| "num_tokens": 417973040.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.6540734810676387, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.470290771175727e-05, |
| "loss": 1.1755, |
| "mean_token_accuracy": 0.7159407096914947, |
| "num_tokens": 419736780.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.6567987872387538, |
| "grad_norm": 1.3203125, |
| "learning_rate": 3.463267312824835e-05, |
| "loss": 1.1679, |
| "mean_token_accuracy": 0.718410755135119, |
| "num_tokens": 421516023.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.6595240934098691, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.4562438544739435e-05, |
| "loss": 1.2164, |
| "mean_token_accuracy": 0.710557876341045, |
| "num_tokens": 423324296.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.6622493995809842, |
| "grad_norm": 1.4765625, |
| "learning_rate": 3.4492203961230515e-05, |
| "loss": 1.1903, |
| "mean_token_accuracy": 0.7110839125700295, |
| "num_tokens": 425114987.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.6649747057520994, |
| "grad_norm": 1.3359375, |
| "learning_rate": 3.442196937772159e-05, |
| "loss": 1.155, |
| "mean_token_accuracy": 0.7194037739187479, |
| "num_tokens": 426875751.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.6677000119232145, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.435173479421267e-05, |
| "loss": 1.1687, |
| "mean_token_accuracy": 0.7176058162003756, |
| "num_tokens": 428715433.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.6704253180943297, |
| "grad_norm": 1.453125, |
| "learning_rate": 3.4281500210703756e-05, |
| "loss": 1.2073, |
| "mean_token_accuracy": 0.7103234235197305, |
| "num_tokens": 430443301.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.6731506242654448, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.4211265627194836e-05, |
| "loss": 1.1158, |
| "mean_token_accuracy": 0.7272647397592664, |
| "num_tokens": 432149898.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.67587593043656, |
| "grad_norm": 1.375, |
| "learning_rate": 3.414103104368591e-05, |
| "loss": 1.1433, |
| "mean_token_accuracy": 0.723227025847882, |
| "num_tokens": 433904775.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.6786012366076751, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.407079646017699e-05, |
| "loss": 1.1448, |
| "mean_token_accuracy": 0.7215846830978989, |
| "num_tokens": 435648944.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.6813265427787903, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.4000561876668076e-05, |
| "loss": 1.1869, |
| "mean_token_accuracy": 0.7150498968549073, |
| "num_tokens": 437390122.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.6840518489499054, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.393032729315915e-05, |
| "loss": 1.1801, |
| "mean_token_accuracy": 0.7161353545263409, |
| "num_tokens": 439126311.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.6867771551210207, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.386009270965023e-05, |
| "loss": 1.1568, |
| "mean_token_accuracy": 0.7200920292176306, |
| "num_tokens": 440890933.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.6895024612921358, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.378985812614132e-05, |
| "loss": 1.1541, |
| "mean_token_accuracy": 0.7206738693639636, |
| "num_tokens": 442611770.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.692227767463251, |
| "grad_norm": 1.4140625, |
| "learning_rate": 3.37196235426324e-05, |
| "loss": 1.1591, |
| "mean_token_accuracy": 0.7203968748450279, |
| "num_tokens": 444358166.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.6949530736343661, |
| "grad_norm": 1.2890625, |
| "learning_rate": 3.364938895912347e-05, |
| "loss": 1.1954, |
| "mean_token_accuracy": 0.7128487601876259, |
| "num_tokens": 446078790.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.6976783798054813, |
| "grad_norm": 1.421875, |
| "learning_rate": 3.357915437561455e-05, |
| "loss": 1.1582, |
| "mean_token_accuracy": 0.7187117761000991, |
| "num_tokens": 447811186.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.7004036859765964, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.350891979210564e-05, |
| "loss": 1.1672, |
| "mean_token_accuracy": 0.7181063129566609, |
| "num_tokens": 449537962.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.7031289921477116, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.343868520859672e-05, |
| "loss": 1.1331, |
| "mean_token_accuracy": 0.7246433124877513, |
| "num_tokens": 451259469.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.7058542983188267, |
| "grad_norm": 1.421875, |
| "learning_rate": 3.336845062508779e-05, |
| "loss": 1.1554, |
| "mean_token_accuracy": 0.7212818250060081, |
| "num_tokens": 453041146.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.708579604489942, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.329821604157887e-05, |
| "loss": 1.1884, |
| "mean_token_accuracy": 0.7135334552265704, |
| "num_tokens": 454753651.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.711304910661057, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.322798145806996e-05, |
| "loss": 1.1484, |
| "mean_token_accuracy": 0.7207954367622733, |
| "num_tokens": 456537057.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.7140302168321723, |
| "grad_norm": 1.3359375, |
| "learning_rate": 3.315774687456103e-05, |
| "loss": 1.1981, |
| "mean_token_accuracy": 0.7136128084734082, |
| "num_tokens": 458271296.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.7167555230032874, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.308751229105211e-05, |
| "loss": 1.1635, |
| "mean_token_accuracy": 0.7181222994811833, |
| "num_tokens": 460014932.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.7194808291744026, |
| "grad_norm": 1.421875, |
| "learning_rate": 3.30172777075432e-05, |
| "loss": 1.1776, |
| "mean_token_accuracy": 0.7165118259377777, |
| "num_tokens": 461818480.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.7222061353455177, |
| "grad_norm": 1.3984375, |
| "learning_rate": 3.294704312403428e-05, |
| "loss": 1.2023, |
| "mean_token_accuracy": 0.7107026267796754, |
| "num_tokens": 463519993.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7249314415166329, |
| "grad_norm": 1.328125, |
| "learning_rate": 3.287680854052535e-05, |
| "loss": 1.1638, |
| "mean_token_accuracy": 0.717846125178039, |
| "num_tokens": 465333101.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.727656747687748, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.280657395701643e-05, |
| "loss": 1.1591, |
| "mean_token_accuracy": 0.718819803185761, |
| "num_tokens": 467047198.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.7303820538588632, |
| "grad_norm": 1.3671875, |
| "learning_rate": 3.273633937350752e-05, |
| "loss": 1.1734, |
| "mean_token_accuracy": 0.7186005939729512, |
| "num_tokens": 468806711.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.7331073600299783, |
| "grad_norm": 1.2890625, |
| "learning_rate": 3.26661047899986e-05, |
| "loss": 1.1231, |
| "mean_token_accuracy": 0.7262043844908476, |
| "num_tokens": 470600782.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.7358326662010936, |
| "grad_norm": 1.5234375, |
| "learning_rate": 3.2595870206489674e-05, |
| "loss": 1.1755, |
| "mean_token_accuracy": 0.7167527761310339, |
| "num_tokens": 472349330.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.7385579723722087, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.2525635622980754e-05, |
| "loss": 1.1535, |
| "mean_token_accuracy": 0.7221431139856577, |
| "num_tokens": 474106983.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.7412832785433239, |
| "grad_norm": 1.4140625, |
| "learning_rate": 3.245540103947184e-05, |
| "loss": 1.1995, |
| "mean_token_accuracy": 0.7117431500926614, |
| "num_tokens": 475902407.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.744008584714439, |
| "grad_norm": 1.2890625, |
| "learning_rate": 3.2385166455962914e-05, |
| "loss": 1.1039, |
| "mean_token_accuracy": 0.729970954824239, |
| "num_tokens": 477654644.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.7467338908855542, |
| "grad_norm": 1.3828125, |
| "learning_rate": 3.2314931872453994e-05, |
| "loss": 1.1845, |
| "mean_token_accuracy": 0.7140484706498682, |
| "num_tokens": 479409078.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.7494591970566693, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.224469728894508e-05, |
| "loss": 1.1676, |
| "mean_token_accuracy": 0.7176982633769512, |
| "num_tokens": 481154972.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.7521845032277845, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.217446270543616e-05, |
| "loss": 1.1947, |
| "mean_token_accuracy": 0.7132840578444302, |
| "num_tokens": 482957554.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.7549098093988996, |
| "grad_norm": 1.4921875, |
| "learning_rate": 3.2104228121927235e-05, |
| "loss": 1.163, |
| "mean_token_accuracy": 0.7188111429102719, |
| "num_tokens": 484730289.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.7576351155700148, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.2033993538418315e-05, |
| "loss": 1.1678, |
| "mean_token_accuracy": 0.7163368194364012, |
| "num_tokens": 486390658.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.7603604217411299, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.19637589549094e-05, |
| "loss": 1.1218, |
| "mean_token_accuracy": 0.727223726734519, |
| "num_tokens": 488084357.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.7630857279122452, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.189352437140048e-05, |
| "loss": 1.2129, |
| "mean_token_accuracy": 0.7102430663071573, |
| "num_tokens": 489867504.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.7658110340833603, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.1823289787891556e-05, |
| "loss": 1.1838, |
| "mean_token_accuracy": 0.7139560039155185, |
| "num_tokens": 491614120.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.7685363402544755, |
| "grad_norm": 1.5, |
| "learning_rate": 3.1753055204382636e-05, |
| "loss": 1.1806, |
| "mean_token_accuracy": 0.7168294186703861, |
| "num_tokens": 493360108.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.7712616464255906, |
| "grad_norm": 1.4140625, |
| "learning_rate": 3.168282062087372e-05, |
| "loss": 1.1415, |
| "mean_token_accuracy": 0.7245752868242562, |
| "num_tokens": 495060867.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.7739869525967058, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.16125860373648e-05, |
| "loss": 1.1694, |
| "mean_token_accuracy": 0.715926815662533, |
| "num_tokens": 496731129.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.7767122587678209, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.1542351453855877e-05, |
| "loss": 1.1351, |
| "mean_token_accuracy": 0.7253266898915172, |
| "num_tokens": 498416929.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.7794375649389361, |
| "grad_norm": 1.4921875, |
| "learning_rate": 3.1472116870346964e-05, |
| "loss": 1.1225, |
| "mean_token_accuracy": 0.7269441090524197, |
| "num_tokens": 500131030.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.7821628711100512, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.1401882286838044e-05, |
| "loss": 1.1295, |
| "mean_token_accuracy": 0.7248596154153347, |
| "num_tokens": 501860254.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.7848881772811664, |
| "grad_norm": 1.3828125, |
| "learning_rate": 3.133164770332912e-05, |
| "loss": 1.1921, |
| "mean_token_accuracy": 0.7137732055038214, |
| "num_tokens": 503599791.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.7876134834522815, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.12614131198202e-05, |
| "loss": 1.2135, |
| "mean_token_accuracy": 0.7103905778378248, |
| "num_tokens": 505277304.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.7903387896233968, |
| "grad_norm": 1.3046875, |
| "learning_rate": 3.1191178536311284e-05, |
| "loss": 1.1608, |
| "mean_token_accuracy": 0.718811112549156, |
| "num_tokens": 507010506.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.7930640957945119, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.1120943952802364e-05, |
| "loss": 1.1394, |
| "mean_token_accuracy": 0.7234757107682526, |
| "num_tokens": 508717678.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.7957894019656271, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.105070936929344e-05, |
| "loss": 1.116, |
| "mean_token_accuracy": 0.7291767308488488, |
| "num_tokens": 510478982.0, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.7985147081367422, |
| "grad_norm": 1.375, |
| "learning_rate": 3.0980474785784525e-05, |
| "loss": 1.1253, |
| "mean_token_accuracy": 0.7246225934475661, |
| "num_tokens": 512223492.0, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.8012400143078574, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.0910240202275605e-05, |
| "loss": 1.1514, |
| "mean_token_accuracy": 0.7207284711301327, |
| "num_tokens": 513950826.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.8039653204789725, |
| "grad_norm": 1.3671875, |
| "learning_rate": 3.0840005618766685e-05, |
| "loss": 1.1871, |
| "mean_token_accuracy": 0.7151925875805318, |
| "num_tokens": 515747512.0, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8066906266500877, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.076977103525776e-05, |
| "loss": 1.1468, |
| "mean_token_accuracy": 0.7219850319437683, |
| "num_tokens": 517564408.0, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.8094159328212028, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.0699536451748846e-05, |
| "loss": 1.0791, |
| "mean_token_accuracy": 0.7344457570463419, |
| "num_tokens": 519335113.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.812141238992318, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.0629301868239926e-05, |
| "loss": 1.1204, |
| "mean_token_accuracy": 0.7272397927008569, |
| "num_tokens": 521052694.0, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.8148665451634332, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.0559067284731e-05, |
| "loss": 1.1698, |
| "mean_token_accuracy": 0.7177247768267989, |
| "num_tokens": 522838886.0, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.8175918513345484, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.048883270122208e-05, |
| "loss": 1.1519, |
| "mean_token_accuracy": 0.720951104350388, |
| "num_tokens": 524659688.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8203171575056635, |
| "grad_norm": 1.328125, |
| "learning_rate": 3.0418598117713166e-05, |
| "loss": 1.1328, |
| "mean_token_accuracy": 0.7240156752988696, |
| "num_tokens": 526428828.0, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.8230424636767787, |
| "grad_norm": 1.3359375, |
| "learning_rate": 3.0348363534204243e-05, |
| "loss": 1.1616, |
| "mean_token_accuracy": 0.7177340661175549, |
| "num_tokens": 528178357.0, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.8257677698478938, |
| "grad_norm": 1.484375, |
| "learning_rate": 3.0278128950695323e-05, |
| "loss": 1.1425, |
| "mean_token_accuracy": 0.7223709647543728, |
| "num_tokens": 529968661.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.828493076019009, |
| "grad_norm": 1.4375, |
| "learning_rate": 3.0207894367186407e-05, |
| "loss": 1.1466, |
| "mean_token_accuracy": 0.7239165339618922, |
| "num_tokens": 531735846.0, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.8312183821901241, |
| "grad_norm": 1.3046875, |
| "learning_rate": 3.0137659783677484e-05, |
| "loss": 1.1655, |
| "mean_token_accuracy": 0.7192427675239742, |
| "num_tokens": 533482605.0, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.8339436883612393, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.0067425200168564e-05, |
| "loss": 1.1472, |
| "mean_token_accuracy": 0.7224777213297784, |
| "num_tokens": 535172117.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.8366689945323545, |
| "grad_norm": 1.375, |
| "learning_rate": 2.9997190616659644e-05, |
| "loss": 1.1414, |
| "mean_token_accuracy": 0.7228651619516313, |
| "num_tokens": 536922982.0, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.8393943007034697, |
| "grad_norm": 1.3828125, |
| "learning_rate": 2.9926956033150728e-05, |
| "loss": 1.1552, |
| "mean_token_accuracy": 0.7204573257826269, |
| "num_tokens": 538697561.0, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.8421196068745849, |
| "grad_norm": 1.359375, |
| "learning_rate": 2.9856721449641805e-05, |
| "loss": 1.1616, |
| "mean_token_accuracy": 0.7190472551621496, |
| "num_tokens": 540498809.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.8448449130457, |
| "grad_norm": 1.328125, |
| "learning_rate": 2.9786486866132885e-05, |
| "loss": 1.1207, |
| "mean_token_accuracy": 0.7262116202153266, |
| "num_tokens": 542220573.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.8475702192168152, |
| "grad_norm": 1.3984375, |
| "learning_rate": 2.971625228262396e-05, |
| "loss": 1.1447, |
| "mean_token_accuracy": 0.7223312626592815, |
| "num_tokens": 543995979.0, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.8502955253879303, |
| "grad_norm": 1.2890625, |
| "learning_rate": 2.964601769911505e-05, |
| "loss": 1.1643, |
| "mean_token_accuracy": 0.7170741960406304, |
| "num_tokens": 545762046.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.8530208315590455, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.9575783115606125e-05, |
| "loss": 1.1455, |
| "mean_token_accuracy": 0.7230727946385741, |
| "num_tokens": 547563120.0, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.8557461377301606, |
| "grad_norm": 1.3046875, |
| "learning_rate": 2.9505548532097206e-05, |
| "loss": 1.158, |
| "mean_token_accuracy": 0.7192152096889913, |
| "num_tokens": 549276605.0, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.8584714439012758, |
| "grad_norm": 1.3046875, |
| "learning_rate": 2.943531394858829e-05, |
| "loss": 1.1336, |
| "mean_token_accuracy": 0.7245622499845922, |
| "num_tokens": 550991479.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.8611967500723909, |
| "grad_norm": 1.3203125, |
| "learning_rate": 2.9365079365079366e-05, |
| "loss": 1.1351, |
| "mean_token_accuracy": 0.7258178818039596, |
| "num_tokens": 552777662.0, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.8639220562435062, |
| "grad_norm": 1.3828125, |
| "learning_rate": 2.9294844781570446e-05, |
| "loss": 1.1166, |
| "mean_token_accuracy": 0.7274093635380268, |
| "num_tokens": 554539356.0, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.8666473624146213, |
| "grad_norm": 1.265625, |
| "learning_rate": 2.9224610198061526e-05, |
| "loss": 1.1907, |
| "mean_token_accuracy": 0.7132768167182804, |
| "num_tokens": 556212548.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.8693726685857365, |
| "grad_norm": 1.328125, |
| "learning_rate": 2.915437561455261e-05, |
| "loss": 1.1617, |
| "mean_token_accuracy": 0.7187118930742145, |
| "num_tokens": 557950461.0, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.8720979747568516, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.9084141031043687e-05, |
| "loss": 1.1303, |
| "mean_token_accuracy": 0.72461500428617, |
| "num_tokens": 559715926.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.8748232809279668, |
| "grad_norm": 1.3125, |
| "learning_rate": 2.9013906447534767e-05, |
| "loss": 1.1556, |
| "mean_token_accuracy": 0.7212729568593204, |
| "num_tokens": 561479394.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.8775485870990819, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.8943671864025844e-05, |
| "loss": 1.1465, |
| "mean_token_accuracy": 0.7218864490278065, |
| "num_tokens": 563271872.0, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.8802738932701971, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.887343728051693e-05, |
| "loss": 1.1411, |
| "mean_token_accuracy": 0.723791983537376, |
| "num_tokens": 565002210.0, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.8829991994413122, |
| "grad_norm": 1.3359375, |
| "learning_rate": 2.8803202697008008e-05, |
| "loss": 1.0435, |
| "mean_token_accuracy": 0.7417100036516786, |
| "num_tokens": 566740287.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.8857245056124274, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.8732968113499088e-05, |
| "loss": 1.1746, |
| "mean_token_accuracy": 0.7177447673864663, |
| "num_tokens": 568560935.0, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.8884498117835425, |
| "grad_norm": 1.4296875, |
| "learning_rate": 2.866273352999017e-05, |
| "loss": 1.1609, |
| "mean_token_accuracy": 0.7196293252520263, |
| "num_tokens": 570290159.0, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.8911751179546578, |
| "grad_norm": 1.2890625, |
| "learning_rate": 2.8592498946481248e-05, |
| "loss": 1.1613, |
| "mean_token_accuracy": 0.719492181763053, |
| "num_tokens": 572076704.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.8939004241257729, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.852226436297233e-05, |
| "loss": 1.1808, |
| "mean_token_accuracy": 0.716338072437793, |
| "num_tokens": 573841975.0, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.8966257302968881, |
| "grad_norm": 1.3046875, |
| "learning_rate": 2.845202977946341e-05, |
| "loss": 1.145, |
| "mean_token_accuracy": 0.7234473955817521, |
| "num_tokens": 575587035.0, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.8993510364680032, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.8381795195954492e-05, |
| "loss": 1.1381, |
| "mean_token_accuracy": 0.7228204027749598, |
| "num_tokens": 577285852.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9020763426391184, |
| "grad_norm": 1.3359375, |
| "learning_rate": 2.831156061244557e-05, |
| "loss": 1.1672, |
| "mean_token_accuracy": 0.7184297275729478, |
| "num_tokens": 579085053.0, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.9048016488102335, |
| "grad_norm": 1.3125, |
| "learning_rate": 2.824132602893665e-05, |
| "loss": 1.1226, |
| "mean_token_accuracy": 0.7262911381199956, |
| "num_tokens": 580874041.0, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.9075269549813487, |
| "grad_norm": 1.3359375, |
| "learning_rate": 2.8171091445427726e-05, |
| "loss": 1.123, |
| "mean_token_accuracy": 0.724399715103209, |
| "num_tokens": 582616757.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.9102522611524638, |
| "grad_norm": 1.3828125, |
| "learning_rate": 2.8100856861918813e-05, |
| "loss": 1.1253, |
| "mean_token_accuracy": 0.726380693167448, |
| "num_tokens": 584370810.0, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.912977567323579, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.803062227840989e-05, |
| "loss": 1.1396, |
| "mean_token_accuracy": 0.7236145354807377, |
| "num_tokens": 586143330.0, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9157028734946941, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.796038769490097e-05, |
| "loss": 1.0929, |
| "mean_token_accuracy": 0.7320468625053763, |
| "num_tokens": 587851884.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.9184281796658094, |
| "grad_norm": 1.40625, |
| "learning_rate": 2.7890153111392054e-05, |
| "loss": 1.1286, |
| "mean_token_accuracy": 0.7241552670486271, |
| "num_tokens": 589601267.0, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.9211534858369245, |
| "grad_norm": 1.375, |
| "learning_rate": 2.7819918527883134e-05, |
| "loss": 1.1709, |
| "mean_token_accuracy": 0.7167520637623965, |
| "num_tokens": 591321205.0, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.9238787920080397, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.774968394437421e-05, |
| "loss": 1.0953, |
| "mean_token_accuracy": 0.7323742469772696, |
| "num_tokens": 593110103.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.9266040981791548, |
| "grad_norm": 1.3984375, |
| "learning_rate": 2.767944936086529e-05, |
| "loss": 1.1603, |
| "mean_token_accuracy": 0.7199522324837744, |
| "num_tokens": 594823039.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.92932940435027, |
| "grad_norm": 1.265625, |
| "learning_rate": 2.7609214777356374e-05, |
| "loss": 1.1437, |
| "mean_token_accuracy": 0.7233537461608648, |
| "num_tokens": 596625643.0, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.9320547105213851, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.753898019384745e-05, |
| "loss": 1.164, |
| "mean_token_accuracy": 0.716280288156122, |
| "num_tokens": 598409404.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.9347800166925003, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.746874561033853e-05, |
| "loss": 1.1393, |
| "mean_token_accuracy": 0.7229320453479886, |
| "num_tokens": 600131817.0, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.9375053228636154, |
| "grad_norm": 1.1640625, |
| "learning_rate": 2.7398511026829608e-05, |
| "loss": 1.1419, |
| "mean_token_accuracy": 0.7230301261879504, |
| "num_tokens": 601867823.0, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.9402306290347306, |
| "grad_norm": 1.3046875, |
| "learning_rate": 2.7328276443320695e-05, |
| "loss": 1.1177, |
| "mean_token_accuracy": 0.729860719665885, |
| "num_tokens": 603591263.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.9429559352058458, |
| "grad_norm": 1.5, |
| "learning_rate": 2.7258041859811772e-05, |
| "loss": 1.1591, |
| "mean_token_accuracy": 0.7213227171450853, |
| "num_tokens": 605400422.0, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.945681241376961, |
| "grad_norm": 1.3515625, |
| "learning_rate": 2.7187807276302852e-05, |
| "loss": 1.149, |
| "mean_token_accuracy": 0.722071444336325, |
| "num_tokens": 607180330.0, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.9484065475480761, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.7117572692793936e-05, |
| "loss": 1.1029, |
| "mean_token_accuracy": 0.730613834504038, |
| "num_tokens": 608937194.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.9511318537191913, |
| "grad_norm": 1.390625, |
| "learning_rate": 2.7047338109285016e-05, |
| "loss": 1.1518, |
| "mean_token_accuracy": 0.7203960535116494, |
| "num_tokens": 610676168.0, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.9538571598903064, |
| "grad_norm": 1.25, |
| "learning_rate": 2.6977103525776093e-05, |
| "loss": 1.097, |
| "mean_token_accuracy": 0.7318043757230044, |
| "num_tokens": 612491555.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.9565824660614216, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.6906868942267173e-05, |
| "loss": 1.158, |
| "mean_token_accuracy": 0.7188845065422356, |
| "num_tokens": 614310820.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.9593077722325367, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.6836634358758256e-05, |
| "loss": 1.1655, |
| "mean_token_accuracy": 0.7194117167033255, |
| "num_tokens": 616071087.0, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.9620330784036519, |
| "grad_norm": 1.3515625, |
| "learning_rate": 2.6766399775249333e-05, |
| "loss": 1.099, |
| "mean_token_accuracy": 0.7303661842830479, |
| "num_tokens": 617769166.0, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.964758384574767, |
| "grad_norm": 1.34375, |
| "learning_rate": 2.6696165191740413e-05, |
| "loss": 1.1328, |
| "mean_token_accuracy": 0.7244265062734485, |
| "num_tokens": 619538232.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.9674836907458823, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.662593060823149e-05, |
| "loss": 1.1299, |
| "mean_token_accuracy": 0.7269687331281602, |
| "num_tokens": 621303314.0, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.9702089969169974, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.6555696024722577e-05, |
| "loss": 1.1124, |
| "mean_token_accuracy": 0.7290478084236384, |
| "num_tokens": 623100776.0, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.9729343030881126, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.6485461441213654e-05, |
| "loss": 1.1804, |
| "mean_token_accuracy": 0.7147768701426684, |
| "num_tokens": 624821413.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.9756596092592277, |
| "grad_norm": 1.3203125, |
| "learning_rate": 2.6415226857704734e-05, |
| "loss": 1.1439, |
| "mean_token_accuracy": 0.7215228925459087, |
| "num_tokens": 626628545.0, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.9783849154303429, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.6344992274195818e-05, |
| "loss": 1.1218, |
| "mean_token_accuracy": 0.7278467868454754, |
| "num_tokens": 628425036.0, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.981110221601458, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.6274757690686898e-05, |
| "loss": 1.0925, |
| "mean_token_accuracy": 0.7307519348338246, |
| "num_tokens": 630092739.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.9838355277725732, |
| "grad_norm": 1.4609375, |
| "learning_rate": 2.6204523107177975e-05, |
| "loss": 1.1461, |
| "mean_token_accuracy": 0.7233453346416354, |
| "num_tokens": 631791605.0, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.9865608339436883, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.6134288523669055e-05, |
| "loss": 1.1312, |
| "mean_token_accuracy": 0.7232753919437528, |
| "num_tokens": 633554055.0, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.9892861401148035, |
| "grad_norm": 1.25, |
| "learning_rate": 2.606405394016014e-05, |
| "loss": 1.1021, |
| "mean_token_accuracy": 0.7298557332716882, |
| "num_tokens": 635269789.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.9920114462859186, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.5993819356651215e-05, |
| "loss": 1.129, |
| "mean_token_accuracy": 0.7261236603371799, |
| "num_tokens": 637078092.0, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.9947367524570339, |
| "grad_norm": 1.3359375, |
| "learning_rate": 2.5923584773142296e-05, |
| "loss": 1.1019, |
| "mean_token_accuracy": 0.7314373353496194, |
| "num_tokens": 638813927.0, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.997462058628149, |
| "grad_norm": 1.3828125, |
| "learning_rate": 2.5853350189633372e-05, |
| "loss": 1.1339, |
| "mean_token_accuracy": 0.7248508833348751, |
| "num_tokens": 640594007.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.65625, |
| "learning_rate": 2.578311560612446e-05, |
| "loss": 1.0198, |
| "mean_token_accuracy": 0.731230377150862, |
| "num_tokens": 642209396.0, |
| "step": 3670 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 7340, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500.0, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.068172081287866e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|