| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 7340, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0002725306171115161, |
| "grad_norm": 49.46063232421875, |
| "learning_rate": 0.0, |
| "loss": 4.2357, |
| "mean_token_accuracy": 0.2580853081308305, |
| "num_tokens": 180497.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0027253061711151614, |
| "grad_norm": 46.16090774536133, |
| "learning_rate": 2.0361990950226245e-06, |
| "loss": 4.1759, |
| "mean_token_accuracy": 0.2622440684483283, |
| "num_tokens": 1772191.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.005450612342230323, |
| "grad_norm": 27.987140655517578, |
| "learning_rate": 4.298642533936651e-06, |
| "loss": 3.8947, |
| "mean_token_accuracy": 0.29399702919181436, |
| "num_tokens": 3528128.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.008175918513345483, |
| "grad_norm": 12.687642097473145, |
| "learning_rate": 6.5610859728506795e-06, |
| "loss": 3.1855, |
| "mean_token_accuracy": 0.37025331929326055, |
| "num_tokens": 5321820.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.010901224684460645, |
| "grad_norm": 4.484994411468506, |
| "learning_rate": 8.823529411764707e-06, |
| "loss": 2.5, |
| "mean_token_accuracy": 0.46682517854496836, |
| "num_tokens": 7036353.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.013626530855575806, |
| "grad_norm": 2.473914384841919, |
| "learning_rate": 1.1085972850678733e-05, |
| "loss": 2.0545, |
| "mean_token_accuracy": 0.5429269138723611, |
| "num_tokens": 8794502.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.016351837026690966, |
| "grad_norm": 1.9377564191818237, |
| "learning_rate": 1.3348416289592761e-05, |
| "loss": 1.7825, |
| "mean_token_accuracy": 0.594660968054086, |
| "num_tokens": 10519144.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.019077143197806127, |
| "grad_norm": 1.8231310844421387, |
| "learning_rate": 1.5610859728506788e-05, |
| "loss": 1.6083, |
| "mean_token_accuracy": 0.6288187805563211, |
| "num_tokens": 12272638.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.02180244936892129, |
| "grad_norm": 1.4802289009094238, |
| "learning_rate": 1.7873303167420814e-05, |
| "loss": 1.5204, |
| "mean_token_accuracy": 0.6462318933568895, |
| "num_tokens": 14037595.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.02452775554003645, |
| "grad_norm": 1.4305676221847534, |
| "learning_rate": 2.0135746606334844e-05, |
| "loss": 1.4496, |
| "mean_token_accuracy": 0.6578288937918841, |
| "num_tokens": 15813413.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.027253061711151612, |
| "grad_norm": 1.4573628902435303, |
| "learning_rate": 2.239819004524887e-05, |
| "loss": 1.4074, |
| "mean_token_accuracy": 0.6659743607975542, |
| "num_tokens": 17606667.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.029978367882266772, |
| "grad_norm": 1.3907411098480225, |
| "learning_rate": 2.4660633484162897e-05, |
| "loss": 1.3354, |
| "mean_token_accuracy": 0.6814010716974735, |
| "num_tokens": 19387650.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.03270367405338193, |
| "grad_norm": 1.2991719245910645, |
| "learning_rate": 2.6923076923076923e-05, |
| "loss": 1.315, |
| "mean_token_accuracy": 0.6835047041997313, |
| "num_tokens": 21111629.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03542898022449709, |
| "grad_norm": 1.2487128973007202, |
| "learning_rate": 2.9185520361990953e-05, |
| "loss": 1.2761, |
| "mean_token_accuracy": 0.6918578458949923, |
| "num_tokens": 22855847.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.038154286395612254, |
| "grad_norm": 1.3445643186569214, |
| "learning_rate": 3.1447963800904976e-05, |
| "loss": 1.2535, |
| "mean_token_accuracy": 0.6977143987081945, |
| "num_tokens": 24647294.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.04087959256672742, |
| "grad_norm": 1.273087978363037, |
| "learning_rate": 3.371040723981901e-05, |
| "loss": 1.2287, |
| "mean_token_accuracy": 0.7020150443539024, |
| "num_tokens": 26336128.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.04360489873784258, |
| "grad_norm": 1.3392159938812256, |
| "learning_rate": 3.5972850678733036e-05, |
| "loss": 1.254, |
| "mean_token_accuracy": 0.6971366205252707, |
| "num_tokens": 28061528.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.04633020490895774, |
| "grad_norm": 1.2237101793289185, |
| "learning_rate": 3.8235294117647055e-05, |
| "loss": 1.2335, |
| "mean_token_accuracy": 0.7016927156597376, |
| "num_tokens": 29834651.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0490555110800729, |
| "grad_norm": 1.1867234706878662, |
| "learning_rate": 4.049773755656109e-05, |
| "loss": 1.2441, |
| "mean_token_accuracy": 0.6992780463770032, |
| "num_tokens": 31533806.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.05178081725118806, |
| "grad_norm": 1.2524540424346924, |
| "learning_rate": 4.2760180995475115e-05, |
| "loss": 1.1962, |
| "mean_token_accuracy": 0.7076324006542564, |
| "num_tokens": 33328644.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.054506123422303224, |
| "grad_norm": 1.2938035726547241, |
| "learning_rate": 4.502262443438914e-05, |
| "loss": 1.1923, |
| "mean_token_accuracy": 0.7092770641669631, |
| "num_tokens": 35085202.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.057231429593418384, |
| "grad_norm": 1.2193968296051025, |
| "learning_rate": 4.728506787330317e-05, |
| "loss": 1.2002, |
| "mean_token_accuracy": 0.7085541909560561, |
| "num_tokens": 36899685.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.059956735764533545, |
| "grad_norm": 1.1932718753814697, |
| "learning_rate": 4.95475113122172e-05, |
| "loss": 1.1231, |
| "mean_token_accuracy": 0.7225744256749749, |
| "num_tokens": 38663867.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0626820419356487, |
| "grad_norm": 1.2447905540466309, |
| "learning_rate": 4.994381233319287e-05, |
| "loss": 1.1412, |
| "mean_token_accuracy": 0.7189388344995677, |
| "num_tokens": 40368688.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.06540734810676387, |
| "grad_norm": 1.2377219200134277, |
| "learning_rate": 4.9873577749683945e-05, |
| "loss": 1.1419, |
| "mean_token_accuracy": 0.7174957160837948, |
| "num_tokens": 42091556.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.06813265427787903, |
| "grad_norm": 1.103188395500183, |
| "learning_rate": 4.9803343166175026e-05, |
| "loss": 1.1354, |
| "mean_token_accuracy": 0.71980509320274, |
| "num_tokens": 43832865.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.07085796044899419, |
| "grad_norm": 1.1445391178131104, |
| "learning_rate": 4.9733108582666106e-05, |
| "loss": 1.1425, |
| "mean_token_accuracy": 0.7196099638007581, |
| "num_tokens": 45556421.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.07358326662010935, |
| "grad_norm": 1.1179693937301636, |
| "learning_rate": 4.9662873999157186e-05, |
| "loss": 1.1598, |
| "mean_token_accuracy": 0.7173428479582071, |
| "num_tokens": 47284838.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.07630857279122451, |
| "grad_norm": 1.149162769317627, |
| "learning_rate": 4.9592639415648266e-05, |
| "loss": 1.1281, |
| "mean_token_accuracy": 0.7214061733335256, |
| "num_tokens": 49025592.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.07903387896233968, |
| "grad_norm": 1.2658289670944214, |
| "learning_rate": 4.9522404832139346e-05, |
| "loss": 1.1848, |
| "mean_token_accuracy": 0.7107397212646902, |
| "num_tokens": 50771353.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.08175918513345484, |
| "grad_norm": 1.1745802164077759, |
| "learning_rate": 4.945217024863043e-05, |
| "loss": 1.1083, |
| "mean_token_accuracy": 0.7261597216129303, |
| "num_tokens": 52521467.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.08448449130457, |
| "grad_norm": 1.2588995695114136, |
| "learning_rate": 4.938193566512151e-05, |
| "loss": 1.1305, |
| "mean_token_accuracy": 0.7218387089669704, |
| "num_tokens": 54338858.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.08720979747568516, |
| "grad_norm": 1.2129034996032715, |
| "learning_rate": 4.931170108161259e-05, |
| "loss": 1.129, |
| "mean_token_accuracy": 0.7227001185528934, |
| "num_tokens": 56014081.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.08993510364680032, |
| "grad_norm": 1.1418203115463257, |
| "learning_rate": 4.924146649810367e-05, |
| "loss": 1.1089, |
| "mean_token_accuracy": 0.7253761961124837, |
| "num_tokens": 57778337.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.09266040981791548, |
| "grad_norm": 1.0563759803771973, |
| "learning_rate": 4.9171231914594754e-05, |
| "loss": 1.0984, |
| "mean_token_accuracy": 0.7282240198925137, |
| "num_tokens": 59528977.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.09538571598903064, |
| "grad_norm": 1.0599093437194824, |
| "learning_rate": 4.910099733108583e-05, |
| "loss": 1.1142, |
| "mean_token_accuracy": 0.7256616481579841, |
| "num_tokens": 61295985.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0981110221601458, |
| "grad_norm": 1.2205257415771484, |
| "learning_rate": 4.903076274757691e-05, |
| "loss": 1.099, |
| "mean_token_accuracy": 0.7278626722283661, |
| "num_tokens": 63026897.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.10083632833126097, |
| "grad_norm": 1.069904088973999, |
| "learning_rate": 4.896052816406799e-05, |
| "loss": 1.0891, |
| "mean_token_accuracy": 0.7307559937238693, |
| "num_tokens": 64770711.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.10356163450237613, |
| "grad_norm": 1.1615813970565796, |
| "learning_rate": 4.889029358055907e-05, |
| "loss": 1.0833, |
| "mean_token_accuracy": 0.7321938696317375, |
| "num_tokens": 66551008.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.10628694067349129, |
| "grad_norm": 1.0941394567489624, |
| "learning_rate": 4.882005899705015e-05, |
| "loss": 1.1027, |
| "mean_token_accuracy": 0.727668415941298, |
| "num_tokens": 68320690.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.10901224684460645, |
| "grad_norm": 1.0933481454849243, |
| "learning_rate": 4.874982441354123e-05, |
| "loss": 1.0649, |
| "mean_token_accuracy": 0.7347688566893339, |
| "num_tokens": 70095284.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.11173755301572161, |
| "grad_norm": 1.0768437385559082, |
| "learning_rate": 4.8679589830032316e-05, |
| "loss": 1.0794, |
| "mean_token_accuracy": 0.7327698688954115, |
| "num_tokens": 71803511.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.11446285918683677, |
| "grad_norm": 1.091327428817749, |
| "learning_rate": 4.860935524652339e-05, |
| "loss": 1.0419, |
| "mean_token_accuracy": 0.7397373986430467, |
| "num_tokens": 73528297.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.11718816535795193, |
| "grad_norm": 1.06846284866333, |
| "learning_rate": 4.853912066301447e-05, |
| "loss": 1.0433, |
| "mean_token_accuracy": 0.7402802865020931, |
| "num_tokens": 75274289.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.11991347152906709, |
| "grad_norm": 1.123704195022583, |
| "learning_rate": 4.846888607950555e-05, |
| "loss": 1.0839, |
| "mean_token_accuracy": 0.7311916822567582, |
| "num_tokens": 77077403.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.12263877770018225, |
| "grad_norm": 1.0876643657684326, |
| "learning_rate": 4.8398651495996636e-05, |
| "loss": 1.0821, |
| "mean_token_accuracy": 0.7309617185033858, |
| "num_tokens": 78859882.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1253640838712974, |
| "grad_norm": 1.0852820873260498, |
| "learning_rate": 4.832841691248771e-05, |
| "loss": 1.0927, |
| "mean_token_accuracy": 0.7292127916589379, |
| "num_tokens": 80590976.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.12808939004241257, |
| "grad_norm": 1.0788687467575073, |
| "learning_rate": 4.825818232897879e-05, |
| "loss": 1.0541, |
| "mean_token_accuracy": 0.7368990315124393, |
| "num_tokens": 82275126.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.13081469621352773, |
| "grad_norm": 1.0248864889144897, |
| "learning_rate": 4.818794774546987e-05, |
| "loss": 1.0448, |
| "mean_token_accuracy": 0.7376345920376479, |
| "num_tokens": 84096910.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.1335400023846429, |
| "grad_norm": 1.060374140739441, |
| "learning_rate": 4.811771316196095e-05, |
| "loss": 1.0771, |
| "mean_token_accuracy": 0.7329897940158844, |
| "num_tokens": 85876326.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.13626530855575805, |
| "grad_norm": 1.0276813507080078, |
| "learning_rate": 4.804747857845203e-05, |
| "loss": 1.0774, |
| "mean_token_accuracy": 0.733756088744849, |
| "num_tokens": 87607478.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1389906147268732, |
| "grad_norm": 0.996231734752655, |
| "learning_rate": 4.797724399494311e-05, |
| "loss": 1.0459, |
| "mean_token_accuracy": 0.7390362743288279, |
| "num_tokens": 89350066.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.14171592089798837, |
| "grad_norm": 1.084644079208374, |
| "learning_rate": 4.79070094114342e-05, |
| "loss": 1.0518, |
| "mean_token_accuracy": 0.7383495075628161, |
| "num_tokens": 91067910.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.14444122706910353, |
| "grad_norm": 1.0383051633834839, |
| "learning_rate": 4.783677482792527e-05, |
| "loss": 1.0475, |
| "mean_token_accuracy": 0.7384993623942137, |
| "num_tokens": 92797017.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.1471665332402187, |
| "grad_norm": 1.021580457687378, |
| "learning_rate": 4.776654024441635e-05, |
| "loss": 1.0633, |
| "mean_token_accuracy": 0.7345522255636752, |
| "num_tokens": 94606329.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.14989183941133385, |
| "grad_norm": 1.0029984712600708, |
| "learning_rate": 4.769630566090743e-05, |
| "loss": 1.0425, |
| "mean_token_accuracy": 0.73898695576936, |
| "num_tokens": 96331087.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.15261714558244902, |
| "grad_norm": 0.9963593482971191, |
| "learning_rate": 4.762607107739852e-05, |
| "loss": 1.064, |
| "mean_token_accuracy": 0.7353490410372615, |
| "num_tokens": 98138711.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.15534245175356418, |
| "grad_norm": 1.0283918380737305, |
| "learning_rate": 4.755583649388959e-05, |
| "loss": 1.0666, |
| "mean_token_accuracy": 0.7352430403232575, |
| "num_tokens": 99885005.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.15806775792467936, |
| "grad_norm": 1.0260673761367798, |
| "learning_rate": 4.748560191038067e-05, |
| "loss": 0.9751, |
| "mean_token_accuracy": 0.7531527349725365, |
| "num_tokens": 101636075.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.16079306409579452, |
| "grad_norm": 1.0847331285476685, |
| "learning_rate": 4.741536732687175e-05, |
| "loss": 1.0334, |
| "mean_token_accuracy": 0.7419406285509467, |
| "num_tokens": 103349118.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.16351837026690969, |
| "grad_norm": 1.1022111177444458, |
| "learning_rate": 4.734513274336283e-05, |
| "loss": 1.0475, |
| "mean_token_accuracy": 0.7382194061763585, |
| "num_tokens": 105033010.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.16624367643802485, |
| "grad_norm": 1.0745152235031128, |
| "learning_rate": 4.727489815985391e-05, |
| "loss": 1.0114, |
| "mean_token_accuracy": 0.7453443594276905, |
| "num_tokens": 106723283.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.16896898260914, |
| "grad_norm": 0.9670913815498352, |
| "learning_rate": 4.720466357634499e-05, |
| "loss": 1.0299, |
| "mean_token_accuracy": 0.7417304971255362, |
| "num_tokens": 108436878.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.17169428878025517, |
| "grad_norm": 1.0606757402420044, |
| "learning_rate": 4.713442899283608e-05, |
| "loss": 1.0134, |
| "mean_token_accuracy": 0.7449460197240114, |
| "num_tokens": 110203157.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.17441959495137033, |
| "grad_norm": 1.1226489543914795, |
| "learning_rate": 4.706419440932715e-05, |
| "loss": 1.0392, |
| "mean_token_accuracy": 0.7409128420054912, |
| "num_tokens": 111949130.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.1771449011224855, |
| "grad_norm": 1.0842260122299194, |
| "learning_rate": 4.6993959825818233e-05, |
| "loss": 1.0447, |
| "mean_token_accuracy": 0.7380765706300736, |
| "num_tokens": 113652926.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.17987020729360065, |
| "grad_norm": 1.010992169380188, |
| "learning_rate": 4.6923725242309314e-05, |
| "loss": 1.0289, |
| "mean_token_accuracy": 0.7402720710262656, |
| "num_tokens": 115334647.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.1825955134647158, |
| "grad_norm": 0.9638611674308777, |
| "learning_rate": 4.68534906588004e-05, |
| "loss": 0.9863, |
| "mean_token_accuracy": 0.7511739198118448, |
| "num_tokens": 117032381.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.18532081963583097, |
| "grad_norm": 0.9948492050170898, |
| "learning_rate": 4.6783256075291474e-05, |
| "loss": 1.0236, |
| "mean_token_accuracy": 0.7424138585105539, |
| "num_tokens": 118801553.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.18804612580694613, |
| "grad_norm": 1.0282810926437378, |
| "learning_rate": 4.6713021491782554e-05, |
| "loss": 1.0355, |
| "mean_token_accuracy": 0.7400126025080681, |
| "num_tokens": 120530271.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.1907714319780613, |
| "grad_norm": 0.9453698992729187, |
| "learning_rate": 4.6642786908273634e-05, |
| "loss": 1.0278, |
| "mean_token_accuracy": 0.7430232111364603, |
| "num_tokens": 122314411.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.19349673814917645, |
| "grad_norm": 0.9775828123092651, |
| "learning_rate": 4.6572552324764715e-05, |
| "loss": 1.0197, |
| "mean_token_accuracy": 0.743807871080935, |
| "num_tokens": 124054113.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.1962220443202916, |
| "grad_norm": 1.0277308225631714, |
| "learning_rate": 4.6502317741255795e-05, |
| "loss": 1.0356, |
| "mean_token_accuracy": 0.740591025352478, |
| "num_tokens": 125786705.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.19894735049140677, |
| "grad_norm": 1.0205105543136597, |
| "learning_rate": 4.6432083157746875e-05, |
| "loss": 1.0347, |
| "mean_token_accuracy": 0.7396136365830899, |
| "num_tokens": 127510112.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.20167265666252193, |
| "grad_norm": 1.0093523263931274, |
| "learning_rate": 4.636184857423796e-05, |
| "loss": 1.0006, |
| "mean_token_accuracy": 0.7481041301041842, |
| "num_tokens": 129321733.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.2043979628336371, |
| "grad_norm": 0.9474175572395325, |
| "learning_rate": 4.6291613990729035e-05, |
| "loss": 1.035, |
| "mean_token_accuracy": 0.7410462098196149, |
| "num_tokens": 131068939.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.20712326900475225, |
| "grad_norm": 1.1145373582839966, |
| "learning_rate": 4.6221379407220116e-05, |
| "loss": 1.0676, |
| "mean_token_accuracy": 0.7343734119087457, |
| "num_tokens": 132800192.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.2098485751758674, |
| "grad_norm": 0.999275803565979, |
| "learning_rate": 4.6151144823711196e-05, |
| "loss": 1.005, |
| "mean_token_accuracy": 0.7478898199275136, |
| "num_tokens": 134501880.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.21257388134698257, |
| "grad_norm": 1.0402276515960693, |
| "learning_rate": 4.608091024020228e-05, |
| "loss": 0.9878, |
| "mean_token_accuracy": 0.7501668559387327, |
| "num_tokens": 136227230.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.21529918751809773, |
| "grad_norm": 1.0302717685699463, |
| "learning_rate": 4.6010675656693356e-05, |
| "loss": 1.0, |
| "mean_token_accuracy": 0.7479498274624348, |
| "num_tokens": 137938433.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.2180244936892129, |
| "grad_norm": 0.9707064032554626, |
| "learning_rate": 4.5940441073184436e-05, |
| "loss": 1.0062, |
| "mean_token_accuracy": 0.7476970013231039, |
| "num_tokens": 139667163.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.22074979986032806, |
| "grad_norm": 0.9473689794540405, |
| "learning_rate": 4.5870206489675517e-05, |
| "loss": 0.9879, |
| "mean_token_accuracy": 0.7510631861165166, |
| "num_tokens": 141481099.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.22347510603144322, |
| "grad_norm": 0.9907692670822144, |
| "learning_rate": 4.57999719061666e-05, |
| "loss": 1.0453, |
| "mean_token_accuracy": 0.7398228641599417, |
| "num_tokens": 143204243.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.22620041220255838, |
| "grad_norm": 0.9675036668777466, |
| "learning_rate": 4.572973732265768e-05, |
| "loss": 1.0049, |
| "mean_token_accuracy": 0.748301652725786, |
| "num_tokens": 144995581.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.22892571837367354, |
| "grad_norm": 0.9796574115753174, |
| "learning_rate": 4.565950273914876e-05, |
| "loss": 1.0159, |
| "mean_token_accuracy": 0.7437716860324144, |
| "num_tokens": 146711076.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.2316510245447887, |
| "grad_norm": 0.9572359919548035, |
| "learning_rate": 4.5589268155639844e-05, |
| "loss": 1.0059, |
| "mean_token_accuracy": 0.7478106670081616, |
| "num_tokens": 148463902.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.23437633071590386, |
| "grad_norm": 1.010580062866211, |
| "learning_rate": 4.551903357213092e-05, |
| "loss": 1.0323, |
| "mean_token_accuracy": 0.7418697223067283, |
| "num_tokens": 150212801.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.23710163688701902, |
| "grad_norm": 0.9667695164680481, |
| "learning_rate": 4.5448798988622e-05, |
| "loss": 0.9818, |
| "mean_token_accuracy": 0.7502022869884968, |
| "num_tokens": 151950016.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.23982694305813418, |
| "grad_norm": 1.0137828588485718, |
| "learning_rate": 4.537856440511308e-05, |
| "loss": 0.9907, |
| "mean_token_accuracy": 0.7495149873197079, |
| "num_tokens": 153686341.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.24255224922924934, |
| "grad_norm": 0.9276532530784607, |
| "learning_rate": 4.5308329821604165e-05, |
| "loss": 1.0163, |
| "mean_token_accuracy": 0.7445492129772902, |
| "num_tokens": 155442220.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.2452775554003645, |
| "grad_norm": 1.0047796964645386, |
| "learning_rate": 4.523809523809524e-05, |
| "loss": 0.9883, |
| "mean_token_accuracy": 0.7502540521323681, |
| "num_tokens": 157222278.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.24800286157147966, |
| "grad_norm": 0.9885547757148743, |
| "learning_rate": 4.516786065458632e-05, |
| "loss": 0.9833, |
| "mean_token_accuracy": 0.7508710121735931, |
| "num_tokens": 158989873.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.2507281677425948, |
| "grad_norm": 1.0419394969940186, |
| "learning_rate": 4.50976260710774e-05, |
| "loss": 1.0086, |
| "mean_token_accuracy": 0.7485707288607955, |
| "num_tokens": 160740690.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.25345347391371, |
| "grad_norm": 0.9929084777832031, |
| "learning_rate": 4.502739148756848e-05, |
| "loss": 1.0127, |
| "mean_token_accuracy": 0.7446823202073574, |
| "num_tokens": 162505210.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.25617878008482514, |
| "grad_norm": 1.0232727527618408, |
| "learning_rate": 4.495715690405956e-05, |
| "loss": 1.0191, |
| "mean_token_accuracy": 0.744407182559371, |
| "num_tokens": 164184119.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.2589040862559403, |
| "grad_norm": 0.9282605648040771, |
| "learning_rate": 4.488692232055064e-05, |
| "loss": 0.9778, |
| "mean_token_accuracy": 0.7518815349787473, |
| "num_tokens": 165971146.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.26162939242705546, |
| "grad_norm": 1.0070769786834717, |
| "learning_rate": 4.4816687737041726e-05, |
| "loss": 1.0109, |
| "mean_token_accuracy": 0.7478598964400589, |
| "num_tokens": 167701701.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.2643546985981706, |
| "grad_norm": 1.0107619762420654, |
| "learning_rate": 4.47464531535328e-05, |
| "loss": 1.0031, |
| "mean_token_accuracy": 0.7483802428469062, |
| "num_tokens": 169443326.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.2670800047692858, |
| "grad_norm": 0.9970401525497437, |
| "learning_rate": 4.467621857002388e-05, |
| "loss": 1.0022, |
| "mean_token_accuracy": 0.7473449762910604, |
| "num_tokens": 171199385.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.26980531094040094, |
| "grad_norm": 1.0069605112075806, |
| "learning_rate": 4.460598398651496e-05, |
| "loss": 0.9716, |
| "mean_token_accuracy": 0.7539871089160443, |
| "num_tokens": 172929493.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.2725306171115161, |
| "grad_norm": 1.0105503797531128, |
| "learning_rate": 4.453574940300605e-05, |
| "loss": 0.9771, |
| "mean_token_accuracy": 0.7521027243696153, |
| "num_tokens": 174672601.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.27525592328263127, |
| "grad_norm": 0.9245153069496155, |
| "learning_rate": 4.446551481949712e-05, |
| "loss": 0.9871, |
| "mean_token_accuracy": 0.7512713268399238, |
| "num_tokens": 176441962.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.2779812294537464, |
| "grad_norm": 1.0189464092254639, |
| "learning_rate": 4.43952802359882e-05, |
| "loss": 0.9791, |
| "mean_token_accuracy": 0.7525585936382413, |
| "num_tokens": 178197361.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.2807065356248616, |
| "grad_norm": 1.071568250656128, |
| "learning_rate": 4.432504565247929e-05, |
| "loss": 1.009, |
| "mean_token_accuracy": 0.744661932811141, |
| "num_tokens": 179887580.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.28343184179597675, |
| "grad_norm": 0.9799075126647949, |
| "learning_rate": 4.425481106897036e-05, |
| "loss": 0.9712, |
| "mean_token_accuracy": 0.7534190637990832, |
| "num_tokens": 181655865.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.2861571479670919, |
| "grad_norm": 1.054100513458252, |
| "learning_rate": 4.418457648546144e-05, |
| "loss": 0.9937, |
| "mean_token_accuracy": 0.7505493542179466, |
| "num_tokens": 183445880.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.28888245413820707, |
| "grad_norm": 0.9577687978744507, |
| "learning_rate": 4.411434190195252e-05, |
| "loss": 0.9624, |
| "mean_token_accuracy": 0.7555918388999998, |
| "num_tokens": 185175706.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.29160776030932223, |
| "grad_norm": 1.0260660648345947, |
| "learning_rate": 4.404410731844361e-05, |
| "loss": 0.9755, |
| "mean_token_accuracy": 0.7519860923290252, |
| "num_tokens": 186873396.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.2943330664804374, |
| "grad_norm": 0.9616529941558838, |
| "learning_rate": 4.397387273493468e-05, |
| "loss": 1.0078, |
| "mean_token_accuracy": 0.7465968690812588, |
| "num_tokens": 188591288.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.29705837265155255, |
| "grad_norm": 0.9976760149002075, |
| "learning_rate": 4.390363815142576e-05, |
| "loss": 1.0004, |
| "mean_token_accuracy": 0.7482189310714602, |
| "num_tokens": 190375182.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.2997836788226677, |
| "grad_norm": 1.0091749429702759, |
| "learning_rate": 4.383340356791684e-05, |
| "loss": 0.9886, |
| "mean_token_accuracy": 0.7496372631751, |
| "num_tokens": 192104609.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.30250898499378287, |
| "grad_norm": 0.9447279572486877, |
| "learning_rate": 4.376316898440793e-05, |
| "loss": 0.9275, |
| "mean_token_accuracy": 0.7625567795708775, |
| "num_tokens": 193819768.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.30523429116489803, |
| "grad_norm": 0.944039523601532, |
| "learning_rate": 4.3692934400899e-05, |
| "loss": 0.9894, |
| "mean_token_accuracy": 0.7514046527445316, |
| "num_tokens": 195541705.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.3079595973360132, |
| "grad_norm": 0.9289810657501221, |
| "learning_rate": 4.362269981739008e-05, |
| "loss": 0.9757, |
| "mean_token_accuracy": 0.7527969362214207, |
| "num_tokens": 197250976.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.31068490350712835, |
| "grad_norm": 0.9183096885681152, |
| "learning_rate": 4.355246523388117e-05, |
| "loss": 0.9469, |
| "mean_token_accuracy": 0.7588935429230332, |
| "num_tokens": 198964087.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.31341020967824357, |
| "grad_norm": 1.0104970932006836, |
| "learning_rate": 4.348223065037224e-05, |
| "loss": 0.9919, |
| "mean_token_accuracy": 0.7494912428781391, |
| "num_tokens": 200654341.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.31613551584935873, |
| "grad_norm": 0.9720707535743713, |
| "learning_rate": 4.3411996066863323e-05, |
| "loss": 1.0166, |
| "mean_token_accuracy": 0.7451702112331986, |
| "num_tokens": 202410389.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.3188608220204739, |
| "grad_norm": 0.9572804570198059, |
| "learning_rate": 4.3341761483354404e-05, |
| "loss": 0.9747, |
| "mean_token_accuracy": 0.753504987526685, |
| "num_tokens": 204176073.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.32158612819158905, |
| "grad_norm": 0.984469473361969, |
| "learning_rate": 4.327152689984549e-05, |
| "loss": 0.9684, |
| "mean_token_accuracy": 0.7544271955266595, |
| "num_tokens": 206017214.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.3243114343627042, |
| "grad_norm": 1.0402361154556274, |
| "learning_rate": 4.3201292316336564e-05, |
| "loss": 0.9792, |
| "mean_token_accuracy": 0.7519845139235258, |
| "num_tokens": 207832984.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.32703674053381937, |
| "grad_norm": 1.0185561180114746, |
| "learning_rate": 4.3131057732827644e-05, |
| "loss": 0.986, |
| "mean_token_accuracy": 0.7506332467310131, |
| "num_tokens": 209533427.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.32976204670493453, |
| "grad_norm": 0.9821958541870117, |
| "learning_rate": 4.3060823149318724e-05, |
| "loss": 0.9666, |
| "mean_token_accuracy": 0.7546056086197496, |
| "num_tokens": 211344332.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.3324873528760497, |
| "grad_norm": 1.068172574043274, |
| "learning_rate": 4.299058856580981e-05, |
| "loss": 0.9949, |
| "mean_token_accuracy": 0.7498155074194074, |
| "num_tokens": 213088746.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.33521265904716485, |
| "grad_norm": 0.9833975434303284, |
| "learning_rate": 4.2920353982300885e-05, |
| "loss": 0.9931, |
| "mean_token_accuracy": 0.7484281599521637, |
| "num_tokens": 214889528.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.33793796521828, |
| "grad_norm": 1.090116262435913, |
| "learning_rate": 4.2850119398791965e-05, |
| "loss": 1.0017, |
| "mean_token_accuracy": 0.7485339365899563, |
| "num_tokens": 216603081.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.3406632713893952, |
| "grad_norm": 0.9591506719589233, |
| "learning_rate": 4.277988481528305e-05, |
| "loss": 0.9518, |
| "mean_token_accuracy": 0.7570753434672952, |
| "num_tokens": 218292845.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.34338857756051033, |
| "grad_norm": 1.013917326927185, |
| "learning_rate": 4.2709650231774125e-05, |
| "loss": 0.9728, |
| "mean_token_accuracy": 0.7542673271149397, |
| "num_tokens": 220029696.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.3461138837316255, |
| "grad_norm": 0.9289477467536926, |
| "learning_rate": 4.2639415648265206e-05, |
| "loss": 1.0286, |
| "mean_token_accuracy": 0.7439531436190009, |
| "num_tokens": 221810553.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.34883918990274065, |
| "grad_norm": 0.977281391620636, |
| "learning_rate": 4.2569181064756286e-05, |
| "loss": 0.9739, |
| "mean_token_accuracy": 0.7557552525773644, |
| "num_tokens": 223552801.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.3515644960738558, |
| "grad_norm": 1.0503572225570679, |
| "learning_rate": 4.249894648124737e-05, |
| "loss": 0.9786, |
| "mean_token_accuracy": 0.7533216239884496, |
| "num_tokens": 225254929.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.354289802244971, |
| "grad_norm": 0.9800918102264404, |
| "learning_rate": 4.2428711897738446e-05, |
| "loss": 0.9967, |
| "mean_token_accuracy": 0.7491257831454277, |
| "num_tokens": 227017304.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.35701510841608614, |
| "grad_norm": 1.0620349645614624, |
| "learning_rate": 4.2358477314229526e-05, |
| "loss": 0.9496, |
| "mean_token_accuracy": 0.7591037628240883, |
| "num_tokens": 228785771.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.3597404145872013, |
| "grad_norm": 0.986772894859314, |
| "learning_rate": 4.2288242730720607e-05, |
| "loss": 0.9581, |
| "mean_token_accuracy": 0.756846007797867, |
| "num_tokens": 230489667.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.36246572075831646, |
| "grad_norm": 0.8904594779014587, |
| "learning_rate": 4.2218008147211694e-05, |
| "loss": 0.9781, |
| "mean_token_accuracy": 0.7530512401834131, |
| "num_tokens": 232269581.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.3651910269294316, |
| "grad_norm": 0.9492087364196777, |
| "learning_rate": 4.214777356370277e-05, |
| "loss": 0.9822, |
| "mean_token_accuracy": 0.7503454959951341, |
| "num_tokens": 234028126.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.3679163331005468, |
| "grad_norm": 1.1163588762283325, |
| "learning_rate": 4.207753898019385e-05, |
| "loss": 0.9944, |
| "mean_token_accuracy": 0.749394488800317, |
| "num_tokens": 235817129.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.37064163927166194, |
| "grad_norm": 0.9092262983322144, |
| "learning_rate": 4.2007304396684934e-05, |
| "loss": 0.9822, |
| "mean_token_accuracy": 0.7527043742127717, |
| "num_tokens": 237588984.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.3733669454427771, |
| "grad_norm": 1.0118396282196045, |
| "learning_rate": 4.193706981317601e-05, |
| "loss": 0.9445, |
| "mean_token_accuracy": 0.7591466994024814, |
| "num_tokens": 239305200.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.37609225161389226, |
| "grad_norm": 0.9351493716239929, |
| "learning_rate": 4.186683522966709e-05, |
| "loss": 0.9667, |
| "mean_token_accuracy": 0.7552483780309558, |
| "num_tokens": 241078759.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.3788175577850074, |
| "grad_norm": 0.9622650742530823, |
| "learning_rate": 4.179660064615817e-05, |
| "loss": 0.9837, |
| "mean_token_accuracy": 0.7518350075930357, |
| "num_tokens": 242876841.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.3815428639561226, |
| "grad_norm": 0.9375427961349487, |
| "learning_rate": 4.1726366062649255e-05, |
| "loss": 0.958, |
| "mean_token_accuracy": 0.7548901244997979, |
| "num_tokens": 244578724.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.38426817012723774, |
| "grad_norm": 0.9655621647834778, |
| "learning_rate": 4.165613147914033e-05, |
| "loss": 0.9814, |
| "mean_token_accuracy": 0.7507870549336075, |
| "num_tokens": 246363009.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.3869934762983529, |
| "grad_norm": 0.9648198485374451, |
| "learning_rate": 4.158589689563141e-05, |
| "loss": 1.0049, |
| "mean_token_accuracy": 0.7473927522078156, |
| "num_tokens": 248120613.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.38971878246946806, |
| "grad_norm": 0.9323533773422241, |
| "learning_rate": 4.151566231212249e-05, |
| "loss": 0.9795, |
| "mean_token_accuracy": 0.7529980653896928, |
| "num_tokens": 249941867.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.3924440886405832, |
| "grad_norm": 0.9345725178718567, |
| "learning_rate": 4.1445427728613576e-05, |
| "loss": 0.9479, |
| "mean_token_accuracy": 0.7590054305270314, |
| "num_tokens": 251695734.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.3951693948116984, |
| "grad_norm": 0.9144307374954224, |
| "learning_rate": 4.137519314510465e-05, |
| "loss": 0.9915, |
| "mean_token_accuracy": 0.75021045608446, |
| "num_tokens": 253503211.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.39789470098281354, |
| "grad_norm": 1.0824007987976074, |
| "learning_rate": 4.130495856159573e-05, |
| "loss": 0.9676, |
| "mean_token_accuracy": 0.7551144331693649, |
| "num_tokens": 255313734.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.4006200071539287, |
| "grad_norm": 0.8734183311462402, |
| "learning_rate": 4.1234723978086816e-05, |
| "loss": 0.9193, |
| "mean_token_accuracy": 0.7647721905261278, |
| "num_tokens": 257016513.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.40334531332504386, |
| "grad_norm": 1.014374017715454, |
| "learning_rate": 4.1164489394577896e-05, |
| "loss": 0.9919, |
| "mean_token_accuracy": 0.7497567610815168, |
| "num_tokens": 258760362.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.406070619496159, |
| "grad_norm": 0.911683976650238, |
| "learning_rate": 4.109425481106897e-05, |
| "loss": 0.9483, |
| "mean_token_accuracy": 0.7586142903193831, |
| "num_tokens": 260546971.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.4087959256672742, |
| "grad_norm": 0.9247537851333618, |
| "learning_rate": 4.102402022756005e-05, |
| "loss": 0.9632, |
| "mean_token_accuracy": 0.7552125737071037, |
| "num_tokens": 262307672.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.41152123183838935, |
| "grad_norm": 1.0184024572372437, |
| "learning_rate": 4.095378564405114e-05, |
| "loss": 0.9731, |
| "mean_token_accuracy": 0.7537676138803363, |
| "num_tokens": 264023889.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.4142465380095045, |
| "grad_norm": 0.9960761666297913, |
| "learning_rate": 4.088355106054221e-05, |
| "loss": 0.9698, |
| "mean_token_accuracy": 0.75361382458359, |
| "num_tokens": 265798519.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.41697184418061967, |
| "grad_norm": 0.9082701802253723, |
| "learning_rate": 4.081331647703329e-05, |
| "loss": 0.9867, |
| "mean_token_accuracy": 0.751113293133676, |
| "num_tokens": 267546100.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.4196971503517348, |
| "grad_norm": 0.8918993473052979, |
| "learning_rate": 4.074308189352437e-05, |
| "loss": 0.9536, |
| "mean_token_accuracy": 0.7568896351382136, |
| "num_tokens": 269297641.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.42242245652285, |
| "grad_norm": 0.8429189324378967, |
| "learning_rate": 4.067284731001546e-05, |
| "loss": 0.9896, |
| "mean_token_accuracy": 0.7498064401559532, |
| "num_tokens": 271053006.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.42514776269396515, |
| "grad_norm": 1.0133056640625, |
| "learning_rate": 4.060261272650653e-05, |
| "loss": 0.971, |
| "mean_token_accuracy": 0.7551591267809272, |
| "num_tokens": 272829487.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.4278730688650803, |
| "grad_norm": 0.9307904839515686, |
| "learning_rate": 4.053237814299761e-05, |
| "loss": 0.966, |
| "mean_token_accuracy": 0.7548410438001156, |
| "num_tokens": 274621990.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.43059837503619547, |
| "grad_norm": 0.9339297413825989, |
| "learning_rate": 4.04621435594887e-05, |
| "loss": 0.9842, |
| "mean_token_accuracy": 0.7506074154749512, |
| "num_tokens": 276398989.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.43332368120731063, |
| "grad_norm": 0.9794987440109253, |
| "learning_rate": 4.039190897597978e-05, |
| "loss": 0.9494, |
| "mean_token_accuracy": 0.7569991254247725, |
| "num_tokens": 278192884.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.4360489873784258, |
| "grad_norm": 0.9957991242408752, |
| "learning_rate": 4.032167439247085e-05, |
| "loss": 0.9528, |
| "mean_token_accuracy": 0.7580153970047832, |
| "num_tokens": 279937931.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.43877429354954095, |
| "grad_norm": 0.8942903280258179, |
| "learning_rate": 4.025143980896193e-05, |
| "loss": 0.9528, |
| "mean_token_accuracy": 0.7584472270682454, |
| "num_tokens": 281698143.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.4414995997206561, |
| "grad_norm": 0.9091892242431641, |
| "learning_rate": 4.018120522545302e-05, |
| "loss": 0.9642, |
| "mean_token_accuracy": 0.7558311942964793, |
| "num_tokens": 283412565.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.44422490589177127, |
| "grad_norm": 0.9882811307907104, |
| "learning_rate": 4.011097064194409e-05, |
| "loss": 0.9676, |
| "mean_token_accuracy": 0.754830582626164, |
| "num_tokens": 285184244.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.44695021206288643, |
| "grad_norm": 0.8823055624961853, |
| "learning_rate": 4.004073605843517e-05, |
| "loss": 0.9176, |
| "mean_token_accuracy": 0.7649934707209468, |
| "num_tokens": 286963226.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.4496755182340016, |
| "grad_norm": 0.9638675451278687, |
| "learning_rate": 3.997050147492625e-05, |
| "loss": 0.9374, |
| "mean_token_accuracy": 0.7607969364151359, |
| "num_tokens": 288742524.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.45240082440511675, |
| "grad_norm": 0.9809541702270508, |
| "learning_rate": 3.990026689141734e-05, |
| "loss": 0.9674, |
| "mean_token_accuracy": 0.7548656595870853, |
| "num_tokens": 290486439.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.4551261305762319, |
| "grad_norm": 0.9652701616287231, |
| "learning_rate": 3.9830032307908413e-05, |
| "loss": 0.9778, |
| "mean_token_accuracy": 0.7537269618362188, |
| "num_tokens": 292244616.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.4578514367473471, |
| "grad_norm": 0.9816784858703613, |
| "learning_rate": 3.9759797724399494e-05, |
| "loss": 0.9732, |
| "mean_token_accuracy": 0.7538707010447979, |
| "num_tokens": 294028863.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.46057674291846223, |
| "grad_norm": 0.9191619157791138, |
| "learning_rate": 3.968956314089058e-05, |
| "loss": 0.9457, |
| "mean_token_accuracy": 0.7579093240201473, |
| "num_tokens": 295842660.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.4633020490895774, |
| "grad_norm": 0.9497706890106201, |
| "learning_rate": 3.961932855738166e-05, |
| "loss": 0.9482, |
| "mean_token_accuracy": 0.7591779384762048, |
| "num_tokens": 297606076.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.46602735526069256, |
| "grad_norm": 0.952664315700531, |
| "learning_rate": 3.9549093973872734e-05, |
| "loss": 0.9782, |
| "mean_token_accuracy": 0.7524136954918503, |
| "num_tokens": 299367632.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.4687526614318077, |
| "grad_norm": 0.8512211441993713, |
| "learning_rate": 3.9478859390363814e-05, |
| "loss": 0.9332, |
| "mean_token_accuracy": 0.7630375389009714, |
| "num_tokens": 301097990.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.4714779676029229, |
| "grad_norm": 1.0166341066360474, |
| "learning_rate": 3.94086248068549e-05, |
| "loss": 0.9418, |
| "mean_token_accuracy": 0.7609906679950654, |
| "num_tokens": 302847308.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.47420327377403804, |
| "grad_norm": 0.961094081401825, |
| "learning_rate": 3.9338390223345975e-05, |
| "loss": 0.9633, |
| "mean_token_accuracy": 0.7552885929122567, |
| "num_tokens": 304582625.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.4769285799451532, |
| "grad_norm": 0.9322654604911804, |
| "learning_rate": 3.9268155639837055e-05, |
| "loss": 0.9452, |
| "mean_token_accuracy": 0.7605904465541243, |
| "num_tokens": 306358206.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.47965388611626836, |
| "grad_norm": 0.964504063129425, |
| "learning_rate": 3.9197921056328135e-05, |
| "loss": 0.9435, |
| "mean_token_accuracy": 0.76006522141397, |
| "num_tokens": 308026562.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.4823791922873835, |
| "grad_norm": 0.9687669277191162, |
| "learning_rate": 3.912768647281922e-05, |
| "loss": 0.9745, |
| "mean_token_accuracy": 0.7534444922581315, |
| "num_tokens": 309804543.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.4851044984584987, |
| "grad_norm": 0.9425972700119019, |
| "learning_rate": 3.9057451889310296e-05, |
| "loss": 0.9656, |
| "mean_token_accuracy": 0.7545514106750488, |
| "num_tokens": 311549771.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.48782980462961384, |
| "grad_norm": 1.042758822441101, |
| "learning_rate": 3.8987217305801376e-05, |
| "loss": 0.9812, |
| "mean_token_accuracy": 0.7521832747384906, |
| "num_tokens": 313326889.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.490555110800729, |
| "grad_norm": 1.0554726123809814, |
| "learning_rate": 3.891698272229246e-05, |
| "loss": 0.9397, |
| "mean_token_accuracy": 0.7617883637547493, |
| "num_tokens": 315064715.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.49328041697184416, |
| "grad_norm": 0.9078469276428223, |
| "learning_rate": 3.884674813878354e-05, |
| "loss": 0.9575, |
| "mean_token_accuracy": 0.7562927783466875, |
| "num_tokens": 316809509.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.4960057231429593, |
| "grad_norm": 0.9499340057373047, |
| "learning_rate": 3.8776513555274616e-05, |
| "loss": 0.9375, |
| "mean_token_accuracy": 0.7615752270445227, |
| "num_tokens": 318597481.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.4987310293140745, |
| "grad_norm": 0.9514725804328918, |
| "learning_rate": 3.8706278971765697e-05, |
| "loss": 0.9378, |
| "mean_token_accuracy": 0.7608531050384044, |
| "num_tokens": 320382863.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.5014563354851896, |
| "grad_norm": 0.9571301341056824, |
| "learning_rate": 3.8636044388256784e-05, |
| "loss": 0.9493, |
| "mean_token_accuracy": 0.7597140209749341, |
| "num_tokens": 322136431.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.5041816416563049, |
| "grad_norm": 0.9752191305160522, |
| "learning_rate": 3.856580980474786e-05, |
| "loss": 0.9365, |
| "mean_token_accuracy": 0.7619069669395685, |
| "num_tokens": 323892089.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.50690694782742, |
| "grad_norm": 0.859382152557373, |
| "learning_rate": 3.849557522123894e-05, |
| "loss": 0.947, |
| "mean_token_accuracy": 0.7584124825894832, |
| "num_tokens": 325693457.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.5096322539985352, |
| "grad_norm": 1.022796630859375, |
| "learning_rate": 3.842534063773002e-05, |
| "loss": 0.925, |
| "mean_token_accuracy": 0.7624462634325028, |
| "num_tokens": 327397884.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.5123575601696503, |
| "grad_norm": 1.0252681970596313, |
| "learning_rate": 3.8355106054221104e-05, |
| "loss": 0.9675, |
| "mean_token_accuracy": 0.7556776776909828, |
| "num_tokens": 329077502.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.5150828663407655, |
| "grad_norm": 0.9402310252189636, |
| "learning_rate": 3.828487147071218e-05, |
| "loss": 0.956, |
| "mean_token_accuracy": 0.7570736223831773, |
| "num_tokens": 330847695.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.5178081725118806, |
| "grad_norm": 0.9374983310699463, |
| "learning_rate": 3.821463688720326e-05, |
| "loss": 0.943, |
| "mean_token_accuracy": 0.7579239157959818, |
| "num_tokens": 332578366.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5205334786829958, |
| "grad_norm": 0.9612072110176086, |
| "learning_rate": 3.8144402303694345e-05, |
| "loss": 0.9656, |
| "mean_token_accuracy": 0.7552163794636726, |
| "num_tokens": 334293589.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.5232587848541109, |
| "grad_norm": 0.8690987229347229, |
| "learning_rate": 3.8074167720185425e-05, |
| "loss": 0.9551, |
| "mean_token_accuracy": 0.755389365926385, |
| "num_tokens": 336016972.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.5259840910252261, |
| "grad_norm": 0.8889273405075073, |
| "learning_rate": 3.80039331366765e-05, |
| "loss": 0.9312, |
| "mean_token_accuracy": 0.7616344084963202, |
| "num_tokens": 337805977.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.5287093971963412, |
| "grad_norm": 0.888575553894043, |
| "learning_rate": 3.793369855316758e-05, |
| "loss": 0.9064, |
| "mean_token_accuracy": 0.7666762206703425, |
| "num_tokens": 339548221.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.5314347033674565, |
| "grad_norm": 0.9260870814323425, |
| "learning_rate": 3.7863463969658666e-05, |
| "loss": 0.9829, |
| "mean_token_accuracy": 0.750894641969353, |
| "num_tokens": 341268344.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5341600095385716, |
| "grad_norm": 1.0482826232910156, |
| "learning_rate": 3.779322938614974e-05, |
| "loss": 0.9496, |
| "mean_token_accuracy": 0.7581010499969125, |
| "num_tokens": 343017522.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.5368853157096868, |
| "grad_norm": 0.9690923690795898, |
| "learning_rate": 3.772299480264082e-05, |
| "loss": 0.9461, |
| "mean_token_accuracy": 0.7593427566811443, |
| "num_tokens": 344699192.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.5396106218808019, |
| "grad_norm": 0.9007167220115662, |
| "learning_rate": 3.7652760219131906e-05, |
| "loss": 0.9322, |
| "mean_token_accuracy": 0.7622509736567735, |
| "num_tokens": 346458527.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.5423359280519171, |
| "grad_norm": 0.9483133554458618, |
| "learning_rate": 3.7582525635622986e-05, |
| "loss": 0.9492, |
| "mean_token_accuracy": 0.7583487136289477, |
| "num_tokens": 348204585.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.5450612342230322, |
| "grad_norm": 0.9575846195220947, |
| "learning_rate": 3.751229105211406e-05, |
| "loss": 0.9753, |
| "mean_token_accuracy": 0.7533793544396759, |
| "num_tokens": 349928259.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5477865403941474, |
| "grad_norm": 0.9323520660400391, |
| "learning_rate": 3.744205646860514e-05, |
| "loss": 0.9396, |
| "mean_token_accuracy": 0.7598869156092405, |
| "num_tokens": 351568018.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.5505118465652625, |
| "grad_norm": 0.9729679822921753, |
| "learning_rate": 3.737182188509623e-05, |
| "loss": 0.9254, |
| "mean_token_accuracy": 0.7636484606191516, |
| "num_tokens": 353325716.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.5532371527363777, |
| "grad_norm": 1.0164568424224854, |
| "learning_rate": 3.730158730158731e-05, |
| "loss": 0.9575, |
| "mean_token_accuracy": 0.7566652336157859, |
| "num_tokens": 355099655.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.5559624589074929, |
| "grad_norm": 0.8739563822746277, |
| "learning_rate": 3.723135271807838e-05, |
| "loss": 0.9358, |
| "mean_token_accuracy": 0.7612218523398042, |
| "num_tokens": 356892448.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.5586877650786081, |
| "grad_norm": 0.9450079798698425, |
| "learning_rate": 3.716111813456946e-05, |
| "loss": 0.9634, |
| "mean_token_accuracy": 0.7554592994041741, |
| "num_tokens": 358599855.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.5614130712497232, |
| "grad_norm": 0.9322670698165894, |
| "learning_rate": 3.709088355106055e-05, |
| "loss": 0.9664, |
| "mean_token_accuracy": 0.7549236617982388, |
| "num_tokens": 360366528.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.5641383774208384, |
| "grad_norm": 0.9792261123657227, |
| "learning_rate": 3.702064896755162e-05, |
| "loss": 0.9418, |
| "mean_token_accuracy": 0.7601142754778266, |
| "num_tokens": 362123903.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.5668636835919535, |
| "grad_norm": 0.883922815322876, |
| "learning_rate": 3.69504143840427e-05, |
| "loss": 0.9247, |
| "mean_token_accuracy": 0.7645677644759417, |
| "num_tokens": 363907270.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.5695889897630687, |
| "grad_norm": 1.026827335357666, |
| "learning_rate": 3.688017980053379e-05, |
| "loss": 0.9139, |
| "mean_token_accuracy": 0.7644737392663956, |
| "num_tokens": 365602883.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.5723142959341838, |
| "grad_norm": 1.0081133842468262, |
| "learning_rate": 3.680994521702487e-05, |
| "loss": 0.9346, |
| "mean_token_accuracy": 0.7607566144317388, |
| "num_tokens": 367308921.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.575039602105299, |
| "grad_norm": 0.9480522871017456, |
| "learning_rate": 3.673971063351594e-05, |
| "loss": 0.9352, |
| "mean_token_accuracy": 0.7604883845895529, |
| "num_tokens": 369068618.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.5777649082764141, |
| "grad_norm": 0.929105818271637, |
| "learning_rate": 3.666947605000702e-05, |
| "loss": 0.9531, |
| "mean_token_accuracy": 0.7584813937544823, |
| "num_tokens": 370792272.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.5804902144475294, |
| "grad_norm": 0.9752338528633118, |
| "learning_rate": 3.659924146649811e-05, |
| "loss": 0.9549, |
| "mean_token_accuracy": 0.7570922682061791, |
| "num_tokens": 372506541.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.5832155206186445, |
| "grad_norm": 0.9325385093688965, |
| "learning_rate": 3.652900688298919e-05, |
| "loss": 0.9075, |
| "mean_token_accuracy": 0.7672267651185394, |
| "num_tokens": 374308542.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.5859408267897597, |
| "grad_norm": 0.9299280643463135, |
| "learning_rate": 3.645877229948026e-05, |
| "loss": 0.9157, |
| "mean_token_accuracy": 0.7651827426627278, |
| "num_tokens": 376016266.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5886661329608748, |
| "grad_norm": 0.8966337442398071, |
| "learning_rate": 3.638853771597134e-05, |
| "loss": 0.9567, |
| "mean_token_accuracy": 0.7577152790501713, |
| "num_tokens": 377732713.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.59139143913199, |
| "grad_norm": 1.0337902307510376, |
| "learning_rate": 3.631830313246243e-05, |
| "loss": 0.9447, |
| "mean_token_accuracy": 0.7596987700089812, |
| "num_tokens": 379457927.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.5941167453031051, |
| "grad_norm": 0.8205364346504211, |
| "learning_rate": 3.6248068548953503e-05, |
| "loss": 0.9786, |
| "mean_token_accuracy": 0.7524961337447167, |
| "num_tokens": 381311531.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.5968420514742203, |
| "grad_norm": 0.9586334228515625, |
| "learning_rate": 3.6177833965444584e-05, |
| "loss": 0.9685, |
| "mean_token_accuracy": 0.7561034603975714, |
| "num_tokens": 383076036.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.5995673576453354, |
| "grad_norm": 0.9479374885559082, |
| "learning_rate": 3.610759938193567e-05, |
| "loss": 0.9666, |
| "mean_token_accuracy": 0.7556248934939503, |
| "num_tokens": 384869981.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6022926638164506, |
| "grad_norm": 1.0398281812667847, |
| "learning_rate": 3.603736479842675e-05, |
| "loss": 0.9466, |
| "mean_token_accuracy": 0.7579332664608955, |
| "num_tokens": 386654361.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.6050179699875657, |
| "grad_norm": 0.9523603916168213, |
| "learning_rate": 3.5967130214917824e-05, |
| "loss": 0.9224, |
| "mean_token_accuracy": 0.7639172183349728, |
| "num_tokens": 388405388.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.607743276158681, |
| "grad_norm": 0.9337072372436523, |
| "learning_rate": 3.5896895631408904e-05, |
| "loss": 0.9531, |
| "mean_token_accuracy": 0.7578973986208439, |
| "num_tokens": 390160411.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.6104685823297961, |
| "grad_norm": 0.9452222585678101, |
| "learning_rate": 3.582666104789999e-05, |
| "loss": 0.9597, |
| "mean_token_accuracy": 0.75540736541152, |
| "num_tokens": 391963393.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.6131938885009113, |
| "grad_norm": 0.9237678050994873, |
| "learning_rate": 3.575642646439107e-05, |
| "loss": 0.9592, |
| "mean_token_accuracy": 0.7570389699190855, |
| "num_tokens": 393719494.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6159191946720264, |
| "grad_norm": 0.8745359182357788, |
| "learning_rate": 3.5686191880882145e-05, |
| "loss": 0.9544, |
| "mean_token_accuracy": 0.7581097180023789, |
| "num_tokens": 395419867.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.6186445008431416, |
| "grad_norm": 0.872172474861145, |
| "learning_rate": 3.5615957297373225e-05, |
| "loss": 0.9348, |
| "mean_token_accuracy": 0.7611732495948672, |
| "num_tokens": 397164636.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.6213698070142567, |
| "grad_norm": 0.9751588106155396, |
| "learning_rate": 3.554572271386431e-05, |
| "loss": 0.9649, |
| "mean_token_accuracy": 0.7546903455629945, |
| "num_tokens": 398897880.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.6240951131853719, |
| "grad_norm": 1.0017331838607788, |
| "learning_rate": 3.5475488130355386e-05, |
| "loss": 0.9488, |
| "mean_token_accuracy": 0.7583517892286181, |
| "num_tokens": 400668616.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.6268204193564871, |
| "grad_norm": 0.8993579745292664, |
| "learning_rate": 3.5405253546846466e-05, |
| "loss": 0.942, |
| "mean_token_accuracy": 0.759977068938315, |
| "num_tokens": 402379710.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6295457255276022, |
| "grad_norm": 0.9273610711097717, |
| "learning_rate": 3.533501896333755e-05, |
| "loss": 0.9388, |
| "mean_token_accuracy": 0.759927311167121, |
| "num_tokens": 404084345.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.6322710316987175, |
| "grad_norm": 0.9769418835639954, |
| "learning_rate": 3.526478437982863e-05, |
| "loss": 0.9102, |
| "mean_token_accuracy": 0.7673765732906759, |
| "num_tokens": 405770321.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.6349963378698326, |
| "grad_norm": 0.9049491882324219, |
| "learning_rate": 3.5194549796319706e-05, |
| "loss": 0.9333, |
| "mean_token_accuracy": 0.7614351283758879, |
| "num_tokens": 407492020.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.6377216440409478, |
| "grad_norm": 0.9118947386741638, |
| "learning_rate": 3.5124315212810787e-05, |
| "loss": 0.9294, |
| "mean_token_accuracy": 0.7637290453538299, |
| "num_tokens": 409257115.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.6404469502120629, |
| "grad_norm": 0.9527559280395508, |
| "learning_rate": 3.5054080629301874e-05, |
| "loss": 0.9522, |
| "mean_token_accuracy": 0.7575726680457592, |
| "num_tokens": 410986071.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6431722563831781, |
| "grad_norm": 0.9394711852073669, |
| "learning_rate": 3.4983846045792954e-05, |
| "loss": 0.9367, |
| "mean_token_accuracy": 0.7613094063475728, |
| "num_tokens": 412763897.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.6458975625542932, |
| "grad_norm": 0.9034632444381714, |
| "learning_rate": 3.491361146228403e-05, |
| "loss": 0.9093, |
| "mean_token_accuracy": 0.7680137138813734, |
| "num_tokens": 414503168.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.6486228687254084, |
| "grad_norm": 0.9068887829780579, |
| "learning_rate": 3.484337687877511e-05, |
| "loss": 0.9316, |
| "mean_token_accuracy": 0.7629754545167089, |
| "num_tokens": 416227145.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.6513481748965235, |
| "grad_norm": 0.9191030859947205, |
| "learning_rate": 3.4773142295266194e-05, |
| "loss": 0.928, |
| "mean_token_accuracy": 0.7619910618290305, |
| "num_tokens": 417973040.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.6540734810676387, |
| "grad_norm": 0.9926860928535461, |
| "learning_rate": 3.470290771175727e-05, |
| "loss": 0.9338, |
| "mean_token_accuracy": 0.7614536901935935, |
| "num_tokens": 419736780.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.6567987872387538, |
| "grad_norm": 0.8897690176963806, |
| "learning_rate": 3.463267312824835e-05, |
| "loss": 0.9242, |
| "mean_token_accuracy": 0.7639796357601881, |
| "num_tokens": 421516023.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.6595240934098691, |
| "grad_norm": 0.9441693425178528, |
| "learning_rate": 3.4562438544739435e-05, |
| "loss": 0.9664, |
| "mean_token_accuracy": 0.7563766550272704, |
| "num_tokens": 423324296.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.6622493995809842, |
| "grad_norm": 0.9950588941574097, |
| "learning_rate": 3.4492203961230515e-05, |
| "loss": 0.9542, |
| "mean_token_accuracy": 0.7551617925986648, |
| "num_tokens": 425114987.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.6649747057520994, |
| "grad_norm": 0.9122210741043091, |
| "learning_rate": 3.442196937772159e-05, |
| "loss": 0.9204, |
| "mean_token_accuracy": 0.7638139262795448, |
| "num_tokens": 426875751.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.6677000119232145, |
| "grad_norm": 0.9361687302589417, |
| "learning_rate": 3.435173479421267e-05, |
| "loss": 0.9309, |
| "mean_token_accuracy": 0.7621823664754629, |
| "num_tokens": 428715433.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.6704253180943297, |
| "grad_norm": 1.0153151750564575, |
| "learning_rate": 3.4281500210703756e-05, |
| "loss": 0.9677, |
| "mean_token_accuracy": 0.754531520511955, |
| "num_tokens": 430443301.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.6731506242654448, |
| "grad_norm": 0.9298137426376343, |
| "learning_rate": 3.4211265627194836e-05, |
| "loss": 0.8981, |
| "mean_token_accuracy": 0.7690068047493697, |
| "num_tokens": 432149898.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.67587593043656, |
| "grad_norm": 0.979828953742981, |
| "learning_rate": 3.414103104368591e-05, |
| "loss": 0.9162, |
| "mean_token_accuracy": 0.7655546896159648, |
| "num_tokens": 433904775.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.6786012366076751, |
| "grad_norm": 0.9502202868461609, |
| "learning_rate": 3.407079646017699e-05, |
| "loss": 0.9179, |
| "mean_token_accuracy": 0.7641258521005512, |
| "num_tokens": 435648944.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.6813265427787903, |
| "grad_norm": 0.9376423954963684, |
| "learning_rate": 3.4000561876668076e-05, |
| "loss": 0.9536, |
| "mean_token_accuracy": 0.7582374062389136, |
| "num_tokens": 437390122.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.6840518489499054, |
| "grad_norm": 0.9456937313079834, |
| "learning_rate": 3.393032729315915e-05, |
| "loss": 0.9418, |
| "mean_token_accuracy": 0.7602952811866999, |
| "num_tokens": 439126311.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.6867771551210207, |
| "grad_norm": 0.9349498748779297, |
| "learning_rate": 3.386009270965023e-05, |
| "loss": 0.9311, |
| "mean_token_accuracy": 0.762849635258317, |
| "num_tokens": 440890933.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.6895024612921358, |
| "grad_norm": 1.079544186592102, |
| "learning_rate": 3.378985812614132e-05, |
| "loss": 0.9195, |
| "mean_token_accuracy": 0.7645381474867463, |
| "num_tokens": 442611770.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.692227767463251, |
| "grad_norm": 0.9199313521385193, |
| "learning_rate": 3.37196235426324e-05, |
| "loss": 0.9249, |
| "mean_token_accuracy": 0.7639479031786323, |
| "num_tokens": 444358166.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.6949530736343661, |
| "grad_norm": 0.9256271719932556, |
| "learning_rate": 3.364938895912347e-05, |
| "loss": 0.9609, |
| "mean_token_accuracy": 0.756293723359704, |
| "num_tokens": 446078790.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.6976783798054813, |
| "grad_norm": 0.9971742630004883, |
| "learning_rate": 3.357915437561455e-05, |
| "loss": 0.9293, |
| "mean_token_accuracy": 0.7617772882804275, |
| "num_tokens": 447811186.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.7004036859765964, |
| "grad_norm": 0.9106160998344421, |
| "learning_rate": 3.350891979210564e-05, |
| "loss": 0.94, |
| "mean_token_accuracy": 0.7603334264829755, |
| "num_tokens": 449537962.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.7031289921477116, |
| "grad_norm": 0.9458938837051392, |
| "learning_rate": 3.343868520859672e-05, |
| "loss": 0.9134, |
| "mean_token_accuracy": 0.7661536164581776, |
| "num_tokens": 451259469.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.7058542983188267, |
| "grad_norm": 0.9445596933364868, |
| "learning_rate": 3.336845062508779e-05, |
| "loss": 0.9209, |
| "mean_token_accuracy": 0.7649673901498317, |
| "num_tokens": 453041146.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.708579604489942, |
| "grad_norm": 0.9680286049842834, |
| "learning_rate": 3.329821604157887e-05, |
| "loss": 0.9553, |
| "mean_token_accuracy": 0.7577722139656544, |
| "num_tokens": 454753651.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.711304910661057, |
| "grad_norm": 0.9290615320205688, |
| "learning_rate": 3.322798145806996e-05, |
| "loss": 0.9247, |
| "mean_token_accuracy": 0.7633149197325111, |
| "num_tokens": 456537057.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.7140302168321723, |
| "grad_norm": 0.9331244230270386, |
| "learning_rate": 3.315774687456103e-05, |
| "loss": 0.9675, |
| "mean_token_accuracy": 0.7562482981011271, |
| "num_tokens": 458271296.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.7167555230032874, |
| "grad_norm": 0.885105550289154, |
| "learning_rate": 3.308751229105211e-05, |
| "loss": 0.9403, |
| "mean_token_accuracy": 0.7598960697650909, |
| "num_tokens": 460014932.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.7194808291744026, |
| "grad_norm": 1.002734899520874, |
| "learning_rate": 3.30172777075432e-05, |
| "loss": 0.9506, |
| "mean_token_accuracy": 0.7589968075975776, |
| "num_tokens": 461818480.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.7222061353455177, |
| "grad_norm": 0.9839244484901428, |
| "learning_rate": 3.294704312403428e-05, |
| "loss": 0.9718, |
| "mean_token_accuracy": 0.753429920040071, |
| "num_tokens": 463519993.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7249314415166329, |
| "grad_norm": 1.0279382467269897, |
| "learning_rate": 3.287680854052535e-05, |
| "loss": 0.9391, |
| "mean_token_accuracy": 0.7601042149588466, |
| "num_tokens": 465333101.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.727656747687748, |
| "grad_norm": 0.9534622430801392, |
| "learning_rate": 3.280657395701643e-05, |
| "loss": 0.9328, |
| "mean_token_accuracy": 0.7614490607753396, |
| "num_tokens": 467047198.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.7303820538588632, |
| "grad_norm": 0.9373722672462463, |
| "learning_rate": 3.273633937350752e-05, |
| "loss": 0.9465, |
| "mean_token_accuracy": 0.7602417379617691, |
| "num_tokens": 468806711.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.7331073600299783, |
| "grad_norm": 0.8950750827789307, |
| "learning_rate": 3.26661047899986e-05, |
| "loss": 0.9042, |
| "mean_token_accuracy": 0.7677352372556925, |
| "num_tokens": 470600782.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.7358326662010936, |
| "grad_norm": 0.9187238812446594, |
| "learning_rate": 3.2595870206489674e-05, |
| "loss": 0.9443, |
| "mean_token_accuracy": 0.7598759381100535, |
| "num_tokens": 472349330.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.7385579723722087, |
| "grad_norm": 0.9190787076950073, |
| "learning_rate": 3.2525635622980754e-05, |
| "loss": 0.9309, |
| "mean_token_accuracy": 0.7636961450800299, |
| "num_tokens": 474106983.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.7412832785433239, |
| "grad_norm": 0.9724632501602173, |
| "learning_rate": 3.245540103947184e-05, |
| "loss": 0.9642, |
| "mean_token_accuracy": 0.7555989472195506, |
| "num_tokens": 475902407.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.744008584714439, |
| "grad_norm": 0.8691114187240601, |
| "learning_rate": 3.2385166455962914e-05, |
| "loss": 0.8866, |
| "mean_token_accuracy": 0.7711342711001634, |
| "num_tokens": 477654644.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.7467338908855542, |
| "grad_norm": 0.9264854192733765, |
| "learning_rate": 3.2314931872453994e-05, |
| "loss": 0.9532, |
| "mean_token_accuracy": 0.7577282522805036, |
| "num_tokens": 479409078.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.7494591970566693, |
| "grad_norm": 1.0483834743499756, |
| "learning_rate": 3.224469728894508e-05, |
| "loss": 0.9366, |
| "mean_token_accuracy": 0.7610113574191928, |
| "num_tokens": 481154972.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.7521845032277845, |
| "grad_norm": 0.9823554158210754, |
| "learning_rate": 3.217446270543616e-05, |
| "loss": 0.9622, |
| "mean_token_accuracy": 0.7564713628962636, |
| "num_tokens": 482957554.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.7549098093988996, |
| "grad_norm": 0.931236743927002, |
| "learning_rate": 3.2104228121927235e-05, |
| "loss": 0.9375, |
| "mean_token_accuracy": 0.7607782265171409, |
| "num_tokens": 484730289.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.7576351155700148, |
| "grad_norm": 1.044001817703247, |
| "learning_rate": 3.2033993538418315e-05, |
| "loss": 0.9437, |
| "mean_token_accuracy": 0.7582019144669175, |
| "num_tokens": 486390658.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.7603604217411299, |
| "grad_norm": 1.0096402168273926, |
| "learning_rate": 3.19637589549094e-05, |
| "loss": 0.9055, |
| "mean_token_accuracy": 0.7680035123601556, |
| "num_tokens": 488084357.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.7630857279122452, |
| "grad_norm": 1.0040942430496216, |
| "learning_rate": 3.189352437140048e-05, |
| "loss": 0.9812, |
| "mean_token_accuracy": 0.7524819139391183, |
| "num_tokens": 489867504.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.7658110340833603, |
| "grad_norm": 0.9055443406105042, |
| "learning_rate": 3.1823289787891556e-05, |
| "loss": 0.9519, |
| "mean_token_accuracy": 0.7572958417236805, |
| "num_tokens": 491614120.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.7685363402544755, |
| "grad_norm": 0.8963329195976257, |
| "learning_rate": 3.1753055204382636e-05, |
| "loss": 0.9504, |
| "mean_token_accuracy": 0.75969771258533, |
| "num_tokens": 493360108.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.7712616464255906, |
| "grad_norm": 0.9222882986068726, |
| "learning_rate": 3.168282062087372e-05, |
| "loss": 0.9273, |
| "mean_token_accuracy": 0.764968883432448, |
| "num_tokens": 495060867.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.7739869525967058, |
| "grad_norm": 0.9971688985824585, |
| "learning_rate": 3.16125860373648e-05, |
| "loss": 0.9498, |
| "mean_token_accuracy": 0.7571736957877875, |
| "num_tokens": 496731129.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.7767122587678209, |
| "grad_norm": 0.8665016293525696, |
| "learning_rate": 3.1542351453855877e-05, |
| "loss": 0.9189, |
| "mean_token_accuracy": 0.7652535479515791, |
| "num_tokens": 498416929.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.7794375649389361, |
| "grad_norm": 0.9980199933052063, |
| "learning_rate": 3.1472116870346964e-05, |
| "loss": 0.9046, |
| "mean_token_accuracy": 0.7680843161419034, |
| "num_tokens": 500131030.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.7821628711100512, |
| "grad_norm": 0.8767507076263428, |
| "learning_rate": 3.1401882286838044e-05, |
| "loss": 0.9157, |
| "mean_token_accuracy": 0.7653609652072191, |
| "num_tokens": 501860254.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.7848881772811664, |
| "grad_norm": 0.9489269852638245, |
| "learning_rate": 3.133164770332912e-05, |
| "loss": 0.9644, |
| "mean_token_accuracy": 0.7552736889570951, |
| "num_tokens": 503599791.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.7876134834522815, |
| "grad_norm": 0.9339836835861206, |
| "learning_rate": 3.12614131198202e-05, |
| "loss": 0.9844, |
| "mean_token_accuracy": 0.752055324614048, |
| "num_tokens": 505277304.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.7903387896233968, |
| "grad_norm": 1.0076805353164673, |
| "learning_rate": 3.1191178536311284e-05, |
| "loss": 0.9374, |
| "mean_token_accuracy": 0.7605025995522737, |
| "num_tokens": 507010506.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.7930640957945119, |
| "grad_norm": 0.9303880929946899, |
| "learning_rate": 3.1120943952802364e-05, |
| "loss": 0.9236, |
| "mean_token_accuracy": 0.763912508264184, |
| "num_tokens": 508717678.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.7957894019656271, |
| "grad_norm": 0.9282418489456177, |
| "learning_rate": 3.105070936929344e-05, |
| "loss": 0.9036, |
| "mean_token_accuracy": 0.7691353503614664, |
| "num_tokens": 510478982.0, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.7985147081367422, |
| "grad_norm": 0.9317098259925842, |
| "learning_rate": 3.0980474785784525e-05, |
| "loss": 0.9013, |
| "mean_token_accuracy": 0.7674754545092582, |
| "num_tokens": 512223492.0, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.8012400143078574, |
| "grad_norm": 0.8785368800163269, |
| "learning_rate": 3.0910240202275605e-05, |
| "loss": 0.9245, |
| "mean_token_accuracy": 0.7631963776424527, |
| "num_tokens": 513950826.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.8039653204789725, |
| "grad_norm": 0.9449997544288635, |
| "learning_rate": 3.0840005618766685e-05, |
| "loss": 0.9511, |
| "mean_token_accuracy": 0.7589579506777226, |
| "num_tokens": 515747512.0, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8066906266500877, |
| "grad_norm": 0.9044075012207031, |
| "learning_rate": 3.076977103525776e-05, |
| "loss": 0.9251, |
| "mean_token_accuracy": 0.7633269606158137, |
| "num_tokens": 517564408.0, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.8094159328212028, |
| "grad_norm": 0.8995125889778137, |
| "learning_rate": 3.0699536451748846e-05, |
| "loss": 0.8688, |
| "mean_token_accuracy": 0.7750391457229853, |
| "num_tokens": 519335113.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.812141238992318, |
| "grad_norm": 0.8972451090812683, |
| "learning_rate": 3.0629301868239926e-05, |
| "loss": 0.9102, |
| "mean_token_accuracy": 0.7668128840625286, |
| "num_tokens": 521052694.0, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.8148665451634332, |
| "grad_norm": 0.8758600354194641, |
| "learning_rate": 3.0559067284731e-05, |
| "loss": 0.9454, |
| "mean_token_accuracy": 0.7591741172596812, |
| "num_tokens": 522838886.0, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.8175918513345484, |
| "grad_norm": 0.871060848236084, |
| "learning_rate": 3.048883270122208e-05, |
| "loss": 0.9286, |
| "mean_token_accuracy": 0.762829508818686, |
| "num_tokens": 524659688.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8203171575056635, |
| "grad_norm": 0.9444119334220886, |
| "learning_rate": 3.0418598117713166e-05, |
| "loss": 0.9142, |
| "mean_token_accuracy": 0.7660003494471311, |
| "num_tokens": 526428828.0, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.8230424636767787, |
| "grad_norm": 0.9209753274917603, |
| "learning_rate": 3.0348363534204243e-05, |
| "loss": 0.9376, |
| "mean_token_accuracy": 0.7604016859084368, |
| "num_tokens": 528178357.0, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.8257677698478938, |
| "grad_norm": 0.898345947265625, |
| "learning_rate": 3.0278128950695323e-05, |
| "loss": 0.9201, |
| "mean_token_accuracy": 0.7637051574885845, |
| "num_tokens": 529968661.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.828493076019009, |
| "grad_norm": 0.9637786746025085, |
| "learning_rate": 3.0207894367186407e-05, |
| "loss": 0.925, |
| "mean_token_accuracy": 0.7651786257512867, |
| "num_tokens": 531735846.0, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.8312183821901241, |
| "grad_norm": 0.9480170011520386, |
| "learning_rate": 3.0137659783677484e-05, |
| "loss": 0.9447, |
| "mean_token_accuracy": 0.7601438457146287, |
| "num_tokens": 533482605.0, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.8339436883612393, |
| "grad_norm": 0.9402963519096375, |
| "learning_rate": 3.0067425200168564e-05, |
| "loss": 0.9299, |
| "mean_token_accuracy": 0.7631770128384232, |
| "num_tokens": 535172117.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.8366689945323545, |
| "grad_norm": 0.9365580677986145, |
| "learning_rate": 2.9997190616659644e-05, |
| "loss": 0.9213, |
| "mean_token_accuracy": 0.7642906453460455, |
| "num_tokens": 536922982.0, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.8393943007034697, |
| "grad_norm": 0.9361058473587036, |
| "learning_rate": 2.9926956033150728e-05, |
| "loss": 0.9286, |
| "mean_token_accuracy": 0.7624955836683511, |
| "num_tokens": 538697561.0, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.8421196068745849, |
| "grad_norm": 0.9975050091743469, |
| "learning_rate": 2.9856721449641805e-05, |
| "loss": 0.9319, |
| "mean_token_accuracy": 0.7618069407530129, |
| "num_tokens": 540498809.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.8448449130457, |
| "grad_norm": 0.9614945650100708, |
| "learning_rate": 2.9786486866132885e-05, |
| "loss": 0.9078, |
| "mean_token_accuracy": 0.7663526112213731, |
| "num_tokens": 542220573.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.8475702192168152, |
| "grad_norm": 0.9452424049377441, |
| "learning_rate": 2.971625228262396e-05, |
| "loss": 0.9218, |
| "mean_token_accuracy": 0.7639887780882418, |
| "num_tokens": 543995979.0, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.8502955253879303, |
| "grad_norm": 0.9348092079162598, |
| "learning_rate": 2.964601769911505e-05, |
| "loss": 0.9413, |
| "mean_token_accuracy": 0.7585929285734891, |
| "num_tokens": 545762046.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.8530208315590455, |
| "grad_norm": 0.857044517993927, |
| "learning_rate": 2.9575783115606125e-05, |
| "loss": 0.9189, |
| "mean_token_accuracy": 0.7647921906784176, |
| "num_tokens": 547563120.0, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.8557461377301606, |
| "grad_norm": 0.9445975422859192, |
| "learning_rate": 2.9505548532097206e-05, |
| "loss": 0.9354, |
| "mean_token_accuracy": 0.7610112639144063, |
| "num_tokens": 549276605.0, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.8584714439012758, |
| "grad_norm": 0.9536585211753845, |
| "learning_rate": 2.943531394858829e-05, |
| "loss": 0.9257, |
| "mean_token_accuracy": 0.7634840881451964, |
| "num_tokens": 550991479.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.8611967500723909, |
| "grad_norm": 0.9297323822975159, |
| "learning_rate": 2.9365079365079366e-05, |
| "loss": 0.9143, |
| "mean_token_accuracy": 0.766822918318212, |
| "num_tokens": 552777662.0, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.8639220562435062, |
| "grad_norm": 1.0276936292648315, |
| "learning_rate": 2.9294844781570446e-05, |
| "loss": 0.9056, |
| "mean_token_accuracy": 0.7674277478829026, |
| "num_tokens": 554539356.0, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.8666473624146213, |
| "grad_norm": 0.9300222992897034, |
| "learning_rate": 2.9224610198061526e-05, |
| "loss": 0.9676, |
| "mean_token_accuracy": 0.7549269145354629, |
| "num_tokens": 556212548.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.8693726685857365, |
| "grad_norm": 0.9600440859794617, |
| "learning_rate": 2.915437561455261e-05, |
| "loss": 0.9341, |
| "mean_token_accuracy": 0.7607308451086283, |
| "num_tokens": 557950461.0, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.8720979747568516, |
| "grad_norm": 0.9339573383331299, |
| "learning_rate": 2.9084141031043687e-05, |
| "loss": 0.9145, |
| "mean_token_accuracy": 0.7650301210582257, |
| "num_tokens": 559715926.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.8748232809279668, |
| "grad_norm": 0.9827753305435181, |
| "learning_rate": 2.9013906447534767e-05, |
| "loss": 0.9332, |
| "mean_token_accuracy": 0.7629943957552314, |
| "num_tokens": 561479394.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.8775485870990819, |
| "grad_norm": 0.9245224595069885, |
| "learning_rate": 2.8943671864025844e-05, |
| "loss": 0.9288, |
| "mean_token_accuracy": 0.7627130763605237, |
| "num_tokens": 563271872.0, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.8802738932701971, |
| "grad_norm": 0.887617826461792, |
| "learning_rate": 2.887343728051693e-05, |
| "loss": 0.9231, |
| "mean_token_accuracy": 0.7647191828116775, |
| "num_tokens": 565002210.0, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.8829991994413122, |
| "grad_norm": 0.9698552489280701, |
| "learning_rate": 2.8803202697008008e-05, |
| "loss": 0.8498, |
| "mean_token_accuracy": 0.7792680401355028, |
| "num_tokens": 566740287.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.8857245056124274, |
| "grad_norm": 0.8935603499412537, |
| "learning_rate": 2.8732968113499088e-05, |
| "loss": 0.9517, |
| "mean_token_accuracy": 0.7585014823824168, |
| "num_tokens": 568560935.0, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.8884498117835425, |
| "grad_norm": 1.0148096084594727, |
| "learning_rate": 2.866273352999017e-05, |
| "loss": 0.94, |
| "mean_token_accuracy": 0.7607061000540852, |
| "num_tokens": 570290159.0, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.8911751179546578, |
| "grad_norm": 0.9350414872169495, |
| "learning_rate": 2.8592498946481248e-05, |
| "loss": 0.9458, |
| "mean_token_accuracy": 0.7595778482034803, |
| "num_tokens": 572076704.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.8939004241257729, |
| "grad_norm": 0.8865498304367065, |
| "learning_rate": 2.852226436297233e-05, |
| "loss": 0.9528, |
| "mean_token_accuracy": 0.7583590077236295, |
| "num_tokens": 573841975.0, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.8966257302968881, |
| "grad_norm": 0.9123432040214539, |
| "learning_rate": 2.845202977946341e-05, |
| "loss": 0.9276, |
| "mean_token_accuracy": 0.7639707328751684, |
| "num_tokens": 575587035.0, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.8993510364680032, |
| "grad_norm": 0.8873813152313232, |
| "learning_rate": 2.8381795195954492e-05, |
| "loss": 0.9297, |
| "mean_token_accuracy": 0.76226064004004, |
| "num_tokens": 577285852.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9020763426391184, |
| "grad_norm": 0.9241533875465393, |
| "learning_rate": 2.831156061244557e-05, |
| "loss": 0.9416, |
| "mean_token_accuracy": 0.7608016451820732, |
| "num_tokens": 579085053.0, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.9048016488102335, |
| "grad_norm": 0.8881911635398865, |
| "learning_rate": 2.824132602893665e-05, |
| "loss": 0.9152, |
| "mean_token_accuracy": 0.7653098279610276, |
| "num_tokens": 580874041.0, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.9075269549813487, |
| "grad_norm": 0.9386590123176575, |
| "learning_rate": 2.8171091445427726e-05, |
| "loss": 0.9154, |
| "mean_token_accuracy": 0.7642969543114304, |
| "num_tokens": 582616757.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.9102522611524638, |
| "grad_norm": 0.9226840138435364, |
| "learning_rate": 2.8100856861918813e-05, |
| "loss": 0.9081, |
| "mean_token_accuracy": 0.7670234115794301, |
| "num_tokens": 584370810.0, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.912977567323579, |
| "grad_norm": 0.9441806077957153, |
| "learning_rate": 2.803062227840989e-05, |
| "loss": 0.9236, |
| "mean_token_accuracy": 0.7639048263430596, |
| "num_tokens": 586143330.0, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9157028734946941, |
| "grad_norm": 0.9037384986877441, |
| "learning_rate": 2.796038769490097e-05, |
| "loss": 0.8882, |
| "mean_token_accuracy": 0.7715507194399833, |
| "num_tokens": 587851884.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.9184281796658094, |
| "grad_norm": 0.9478141665458679, |
| "learning_rate": 2.7890153111392054e-05, |
| "loss": 0.9175, |
| "mean_token_accuracy": 0.7643080299720169, |
| "num_tokens": 589601267.0, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.9211534858369245, |
| "grad_norm": 1.0288023948669434, |
| "learning_rate": 2.7819918527883134e-05, |
| "loss": 0.9489, |
| "mean_token_accuracy": 0.7579874385148286, |
| "num_tokens": 591321205.0, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.9238787920080397, |
| "grad_norm": 0.8367487192153931, |
| "learning_rate": 2.774968394437421e-05, |
| "loss": 0.8963, |
| "mean_token_accuracy": 0.7705693520605564, |
| "num_tokens": 593110103.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.9266040981791548, |
| "grad_norm": 1.0031945705413818, |
| "learning_rate": 2.767944936086529e-05, |
| "loss": 0.9476, |
| "mean_token_accuracy": 0.7592636797577142, |
| "num_tokens": 594823039.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.92932940435027, |
| "grad_norm": 0.901996374130249, |
| "learning_rate": 2.7609214777356374e-05, |
| "loss": 0.927, |
| "mean_token_accuracy": 0.7638763342052698, |
| "num_tokens": 596625643.0, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.9320547105213851, |
| "grad_norm": 0.9304973483085632, |
| "learning_rate": 2.753898019384745e-05, |
| "loss": 0.947, |
| "mean_token_accuracy": 0.7571751626208425, |
| "num_tokens": 598409404.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.9347800166925003, |
| "grad_norm": 1.1966147422790527, |
| "learning_rate": 2.746874561033853e-05, |
| "loss": 0.9304, |
| "mean_token_accuracy": 0.7618606876581907, |
| "num_tokens": 600131817.0, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.9375053228636154, |
| "grad_norm": 0.8487194776535034, |
| "learning_rate": 2.7398511026829608e-05, |
| "loss": 0.9325, |
| "mean_token_accuracy": 0.7624602910131216, |
| "num_tokens": 601867823.0, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.9402306290347306, |
| "grad_norm": 0.9328591227531433, |
| "learning_rate": 2.7328276443320695e-05, |
| "loss": 0.9096, |
| "mean_token_accuracy": 0.7685537921264768, |
| "num_tokens": 603591263.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.9429559352058458, |
| "grad_norm": 0.9486989378929138, |
| "learning_rate": 2.7258041859811772e-05, |
| "loss": 0.9362, |
| "mean_token_accuracy": 0.7623121970333159, |
| "num_tokens": 605400422.0, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.945681241376961, |
| "grad_norm": 0.9277918934822083, |
| "learning_rate": 2.7187807276302852e-05, |
| "loss": 0.9289, |
| "mean_token_accuracy": 0.7633737292140722, |
| "num_tokens": 607180330.0, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.9484065475480761, |
| "grad_norm": 0.8774585127830505, |
| "learning_rate": 2.7117572692793936e-05, |
| "loss": 0.8978, |
| "mean_token_accuracy": 0.7696298151277006, |
| "num_tokens": 608937194.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.9511318537191913, |
| "grad_norm": 0.955440104007721, |
| "learning_rate": 2.7047338109285016e-05, |
| "loss": 0.9373, |
| "mean_token_accuracy": 0.7603609010577201, |
| "num_tokens": 610676168.0, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.9538571598903064, |
| "grad_norm": 0.8963478207588196, |
| "learning_rate": 2.6977103525776093e-05, |
| "loss": 0.8916, |
| "mean_token_accuracy": 0.7708989802747965, |
| "num_tokens": 612491555.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.9565824660614216, |
| "grad_norm": 0.9914854764938354, |
| "learning_rate": 2.6906868942267173e-05, |
| "loss": 0.9422, |
| "mean_token_accuracy": 0.7594701206311584, |
| "num_tokens": 614310820.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.9593077722325367, |
| "grad_norm": 0.9009816646575928, |
| "learning_rate": 2.6836634358758256e-05, |
| "loss": 0.9448, |
| "mean_token_accuracy": 0.7597188876941801, |
| "num_tokens": 616071087.0, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.9620330784036519, |
| "grad_norm": 0.9539654850959778, |
| "learning_rate": 2.6766399775249333e-05, |
| "loss": 0.896, |
| "mean_token_accuracy": 0.7685511685907841, |
| "num_tokens": 617769166.0, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.964758384574767, |
| "grad_norm": 0.9612520337104797, |
| "learning_rate": 2.6696165191740413e-05, |
| "loss": 0.9235, |
| "mean_token_accuracy": 0.7635913614183665, |
| "num_tokens": 619538232.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.9674836907458823, |
| "grad_norm": 0.8072157502174377, |
| "learning_rate": 2.662593060823149e-05, |
| "loss": 0.9138, |
| "mean_token_accuracy": 0.7675083599984646, |
| "num_tokens": 621303314.0, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.9702089969169974, |
| "grad_norm": 0.8455495238304138, |
| "learning_rate": 2.6555696024722577e-05, |
| "loss": 0.9015, |
| "mean_token_accuracy": 0.7690754882991314, |
| "num_tokens": 623100776.0, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.9729343030881126, |
| "grad_norm": 0.9044631123542786, |
| "learning_rate": 2.6485461441213654e-05, |
| "loss": 0.961, |
| "mean_token_accuracy": 0.7549854224547744, |
| "num_tokens": 624821413.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.9756596092592277, |
| "grad_norm": 0.9768007397651672, |
| "learning_rate": 2.6415226857704734e-05, |
| "loss": 0.9319, |
| "mean_token_accuracy": 0.7615104261785746, |
| "num_tokens": 626628545.0, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.9783849154303429, |
| "grad_norm": 0.8489816188812256, |
| "learning_rate": 2.6344992274195818e-05, |
| "loss": 0.9142, |
| "mean_token_accuracy": 0.7664257822558284, |
| "num_tokens": 628425036.0, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.981110221601458, |
| "grad_norm": 0.8612228631973267, |
| "learning_rate": 2.6274757690686898e-05, |
| "loss": 0.8872, |
| "mean_token_accuracy": 0.7699680911377073, |
| "num_tokens": 630092739.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.9838355277725732, |
| "grad_norm": 1.0233787298202515, |
| "learning_rate": 2.6204523107177975e-05, |
| "loss": 0.9362, |
| "mean_token_accuracy": 0.7623144701123238, |
| "num_tokens": 631791605.0, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.9865608339436883, |
| "grad_norm": 2.674288034439087, |
| "learning_rate": 2.6134288523669055e-05, |
| "loss": 0.928, |
| "mean_token_accuracy": 0.7619699375703931, |
| "num_tokens": 633554055.0, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.9892861401148035, |
| "grad_norm": 0.9271607398986816, |
| "learning_rate": 2.606405394016014e-05, |
| "loss": 0.8983, |
| "mean_token_accuracy": 0.7686384240165353, |
| "num_tokens": 635269789.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.9920114462859186, |
| "grad_norm": 0.9083386063575745, |
| "learning_rate": 2.5993819356651215e-05, |
| "loss": 0.9123, |
| "mean_token_accuracy": 0.766354302316904, |
| "num_tokens": 637078092.0, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.9947367524570339, |
| "grad_norm": 0.9537090063095093, |
| "learning_rate": 2.5923584773142296e-05, |
| "loss": 0.8932, |
| "mean_token_accuracy": 0.7706195389851928, |
| "num_tokens": 638813927.0, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.997462058628149, |
| "grad_norm": 1.0616086721420288, |
| "learning_rate": 2.5853350189633372e-05, |
| "loss": 0.9159, |
| "mean_token_accuracy": 0.7659485065378249, |
| "num_tokens": 640594007.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.5164661407470703, |
| "learning_rate": 2.578311560612446e-05, |
| "loss": 0.8237, |
| "mean_token_accuracy": 0.7708987323629776, |
| "num_tokens": 642209396.0, |
| "step": 3670 |
| }, |
| { |
| "epoch": 1.0027253061711152, |
| "grad_norm": 0.9240249395370483, |
| "learning_rate": 2.5712881022615536e-05, |
| "loss": 0.8722, |
| "mean_token_accuracy": 0.7743298249319196, |
| "num_tokens": 643925394.0, |
| "step": 3680 |
| }, |
| { |
| "epoch": 1.0054506123422304, |
| "grad_norm": 0.9232088923454285, |
| "learning_rate": 2.5642646439106616e-05, |
| "loss": 0.8831, |
| "mean_token_accuracy": 0.7708641194738448, |
| "num_tokens": 645722314.0, |
| "step": 3690 |
| }, |
| { |
| "epoch": 1.0081759185133454, |
| "grad_norm": 0.8724981546401978, |
| "learning_rate": 2.55724118555977e-05, |
| "loss": 0.8436, |
| "mean_token_accuracy": 0.7817438881844282, |
| "num_tokens": 647498896.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0109012246844606, |
| "grad_norm": 0.8756746649742126, |
| "learning_rate": 2.550217727208878e-05, |
| "loss": 0.8572, |
| "mean_token_accuracy": 0.776770019158721, |
| "num_tokens": 649263905.0, |
| "step": 3710 |
| }, |
| { |
| "epoch": 1.0136265308555759, |
| "grad_norm": 0.8888759016990662, |
| "learning_rate": 2.5431942688579857e-05, |
| "loss": 0.87, |
| "mean_token_accuracy": 0.773919434286654, |
| "num_tokens": 651037567.0, |
| "step": 3720 |
| }, |
| { |
| "epoch": 1.016351837026691, |
| "grad_norm": 0.9350723028182983, |
| "learning_rate": 2.5361708105070937e-05, |
| "loss": 0.8653, |
| "mean_token_accuracy": 0.7757575090974569, |
| "num_tokens": 652736955.0, |
| "step": 3730 |
| }, |
| { |
| "epoch": 1.019077143197806, |
| "grad_norm": 0.997680127620697, |
| "learning_rate": 2.529147352156202e-05, |
| "loss": 0.837, |
| "mean_token_accuracy": 0.7817440122365952, |
| "num_tokens": 654479273.0, |
| "step": 3740 |
| }, |
| { |
| "epoch": 1.0218024493689213, |
| "grad_norm": 0.9358142614364624, |
| "learning_rate": 2.5221238938053098e-05, |
| "loss": 0.8512, |
| "mean_token_accuracy": 0.7783015862107276, |
| "num_tokens": 656259441.0, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.0245277555400365, |
| "grad_norm": 0.8940883874893188, |
| "learning_rate": 2.5151004354544178e-05, |
| "loss": 0.8465, |
| "mean_token_accuracy": 0.7796386586502194, |
| "num_tokens": 658034787.0, |
| "step": 3760 |
| }, |
| { |
| "epoch": 1.0272530617111517, |
| "grad_norm": 0.846143901348114, |
| "learning_rate": 2.5080769771035258e-05, |
| "loss": 0.8944, |
| "mean_token_accuracy": 0.7696146221831441, |
| "num_tokens": 659760393.0, |
| "step": 3770 |
| }, |
| { |
| "epoch": 1.0299783678822667, |
| "grad_norm": 0.9335619807243347, |
| "learning_rate": 2.501053518752634e-05, |
| "loss": 0.8311, |
| "mean_token_accuracy": 0.7837788056582212, |
| "num_tokens": 661469264.0, |
| "step": 3780 |
| }, |
| { |
| "epoch": 1.032703674053382, |
| "grad_norm": 0.9666333198547363, |
| "learning_rate": 2.494030060401742e-05, |
| "loss": 0.857, |
| "mean_token_accuracy": 0.7761554718017578, |
| "num_tokens": 663177784.0, |
| "step": 3790 |
| }, |
| { |
| "epoch": 1.0354289802244971, |
| "grad_norm": 0.8531840443611145, |
| "learning_rate": 2.4870066020508502e-05, |
| "loss": 0.8788, |
| "mean_token_accuracy": 0.7721146021038294, |
| "num_tokens": 664940637.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.0381542863956124, |
| "grad_norm": 0.9225367903709412, |
| "learning_rate": 2.479983143699958e-05, |
| "loss": 0.8631, |
| "mean_token_accuracy": 0.7762221993878484, |
| "num_tokens": 666668007.0, |
| "step": 3810 |
| }, |
| { |
| "epoch": 1.0408795925667274, |
| "grad_norm": 0.839841365814209, |
| "learning_rate": 2.4729596853490662e-05, |
| "loss": 0.898, |
| "mean_token_accuracy": 0.7690054396167397, |
| "num_tokens": 668437185.0, |
| "step": 3820 |
| }, |
| { |
| "epoch": 1.0436048987378426, |
| "grad_norm": 0.9301173686981201, |
| "learning_rate": 2.465936226998174e-05, |
| "loss": 0.8848, |
| "mean_token_accuracy": 0.7709696920588612, |
| "num_tokens": 670213712.0, |
| "step": 3830 |
| }, |
| { |
| "epoch": 1.0463302049089578, |
| "grad_norm": 0.8832354545593262, |
| "learning_rate": 2.458912768647282e-05, |
| "loss": 0.8534, |
| "mean_token_accuracy": 0.7794495921581983, |
| "num_tokens": 672017818.0, |
| "step": 3840 |
| }, |
| { |
| "epoch": 1.049055511080073, |
| "grad_norm": 0.9354749917984009, |
| "learning_rate": 2.45188931029639e-05, |
| "loss": 0.8642, |
| "mean_token_accuracy": 0.7752095900475979, |
| "num_tokens": 673831446.0, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.051780817251188, |
| "grad_norm": 0.9421780109405518, |
| "learning_rate": 2.444865851945498e-05, |
| "loss": 0.881, |
| "mean_token_accuracy": 0.7715348338708281, |
| "num_tokens": 675554873.0, |
| "step": 3860 |
| }, |
| { |
| "epoch": 1.0545061234223032, |
| "grad_norm": 0.9302453994750977, |
| "learning_rate": 2.437842393594606e-05, |
| "loss": 0.8838, |
| "mean_token_accuracy": 0.771690078265965, |
| "num_tokens": 677257984.0, |
| "step": 3870 |
| }, |
| { |
| "epoch": 1.0572314295934184, |
| "grad_norm": 0.9617043137550354, |
| "learning_rate": 2.430818935243714e-05, |
| "loss": 0.8517, |
| "mean_token_accuracy": 0.778262036293745, |
| "num_tokens": 678942202.0, |
| "step": 3880 |
| }, |
| { |
| "epoch": 1.0599567357645336, |
| "grad_norm": 0.9521083831787109, |
| "learning_rate": 2.4237954768928224e-05, |
| "loss": 0.882, |
| "mean_token_accuracy": 0.7722775416448713, |
| "num_tokens": 680673377.0, |
| "step": 3890 |
| }, |
| { |
| "epoch": 1.0626820419356486, |
| "grad_norm": 0.9471562504768372, |
| "learning_rate": 2.41677201854193e-05, |
| "loss": 0.867, |
| "mean_token_accuracy": 0.7758576031774282, |
| "num_tokens": 682485772.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.0654073481067639, |
| "grad_norm": 0.9179297089576721, |
| "learning_rate": 2.4097485601910384e-05, |
| "loss": 0.8505, |
| "mean_token_accuracy": 0.7773557582870125, |
| "num_tokens": 684233327.0, |
| "step": 3910 |
| }, |
| { |
| "epoch": 1.068132654277879, |
| "grad_norm": 0.8177363872528076, |
| "learning_rate": 2.402725101840146e-05, |
| "loss": 0.8871, |
| "mean_token_accuracy": 0.7715361347422004, |
| "num_tokens": 686004478.0, |
| "step": 3920 |
| }, |
| { |
| "epoch": 1.0708579604489943, |
| "grad_norm": 0.966841459274292, |
| "learning_rate": 2.3957016434892544e-05, |
| "loss": 0.896, |
| "mean_token_accuracy": 0.7691411130130291, |
| "num_tokens": 687776176.0, |
| "step": 3930 |
| }, |
| { |
| "epoch": 1.0735832666201093, |
| "grad_norm": 0.9280087351799011, |
| "learning_rate": 2.388678185138362e-05, |
| "loss": 0.8472, |
| "mean_token_accuracy": 0.7793797582387925, |
| "num_tokens": 689522355.0, |
| "step": 3940 |
| }, |
| { |
| "epoch": 1.0763085727912245, |
| "grad_norm": 0.8689372539520264, |
| "learning_rate": 2.38165472678747e-05, |
| "loss": 0.8586, |
| "mean_token_accuracy": 0.7773848133161664, |
| "num_tokens": 691309245.0, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.0790338789623397, |
| "grad_norm": 0.9233465790748596, |
| "learning_rate": 2.374631268436578e-05, |
| "loss": 0.8838, |
| "mean_token_accuracy": 0.7714135514572262, |
| "num_tokens": 693050122.0, |
| "step": 3960 |
| }, |
| { |
| "epoch": 1.081759185133455, |
| "grad_norm": 0.9454469084739685, |
| "learning_rate": 2.3676078100856862e-05, |
| "loss": 0.8672, |
| "mean_token_accuracy": 0.7735035054385662, |
| "num_tokens": 694762288.0, |
| "step": 3970 |
| }, |
| { |
| "epoch": 1.08448449130457, |
| "grad_norm": 0.8887579441070557, |
| "learning_rate": 2.3605843517347942e-05, |
| "loss": 0.8562, |
| "mean_token_accuracy": 0.7782448509708046, |
| "num_tokens": 696531417.0, |
| "step": 3980 |
| }, |
| { |
| "epoch": 1.0872097974756851, |
| "grad_norm": 0.8284960389137268, |
| "learning_rate": 2.3535608933839022e-05, |
| "loss": 0.8201, |
| "mean_token_accuracy": 0.7846797706559301, |
| "num_tokens": 698257075.0, |
| "step": 3990 |
| }, |
| { |
| "epoch": 1.0899351036468004, |
| "grad_norm": 0.8965194821357727, |
| "learning_rate": 2.3465374350330106e-05, |
| "loss": 0.819, |
| "mean_token_accuracy": 0.7854189421981573, |
| "num_tokens": 699971132.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.0926604098179156, |
| "grad_norm": 0.9058973789215088, |
| "learning_rate": 2.3395139766821183e-05, |
| "loss": 0.9108, |
| "mean_token_accuracy": 0.766863869689405, |
| "num_tokens": 701752105.0, |
| "step": 4010 |
| }, |
| { |
| "epoch": 1.0953857159890306, |
| "grad_norm": 0.871168315410614, |
| "learning_rate": 2.3324905183312266e-05, |
| "loss": 0.8832, |
| "mean_token_accuracy": 0.7707405330613255, |
| "num_tokens": 703473481.0, |
| "step": 4020 |
| }, |
| { |
| "epoch": 1.0981110221601458, |
| "grad_norm": 0.9408893585205078, |
| "learning_rate": 2.3254670599803343e-05, |
| "loss": 0.878, |
| "mean_token_accuracy": 0.7724182082340121, |
| "num_tokens": 705167075.0, |
| "step": 4030 |
| }, |
| { |
| "epoch": 1.100836328331261, |
| "grad_norm": 1.0479724407196045, |
| "learning_rate": 2.3184436016294427e-05, |
| "loss": 0.8648, |
| "mean_token_accuracy": 0.7744791394099593, |
| "num_tokens": 706874911.0, |
| "step": 4040 |
| }, |
| { |
| "epoch": 1.1035616345023762, |
| "grad_norm": 0.9027190804481506, |
| "learning_rate": 2.3114201432785503e-05, |
| "loss": 0.8485, |
| "mean_token_accuracy": 0.7799729842692613, |
| "num_tokens": 708619583.0, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.1062869406734912, |
| "grad_norm": 0.9587522149085999, |
| "learning_rate": 2.3043966849276587e-05, |
| "loss": 0.8768, |
| "mean_token_accuracy": 0.7716966487467289, |
| "num_tokens": 710446727.0, |
| "step": 4060 |
| }, |
| { |
| "epoch": 1.1090122468446064, |
| "grad_norm": 1.009513020515442, |
| "learning_rate": 2.2973732265767664e-05, |
| "loss": 0.8811, |
| "mean_token_accuracy": 0.7735799238085747, |
| "num_tokens": 712149650.0, |
| "step": 4070 |
| }, |
| { |
| "epoch": 1.1117375530157216, |
| "grad_norm": 0.8594178557395935, |
| "learning_rate": 2.2903497682258744e-05, |
| "loss": 0.8803, |
| "mean_token_accuracy": 0.7721791565418243, |
| "num_tokens": 713914558.0, |
| "step": 4080 |
| }, |
| { |
| "epoch": 1.1144628591868369, |
| "grad_norm": 0.905235230922699, |
| "learning_rate": 2.2833263098749824e-05, |
| "loss": 0.8545, |
| "mean_token_accuracy": 0.7781607685610652, |
| "num_tokens": 715646472.0, |
| "step": 4090 |
| }, |
| { |
| "epoch": 1.1171881653579518, |
| "grad_norm": 0.9111061096191406, |
| "learning_rate": 2.2763028515240904e-05, |
| "loss": 0.9154, |
| "mean_token_accuracy": 0.7648548442870379, |
| "num_tokens": 717350655.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.119913471529067, |
| "grad_norm": 0.8776023387908936, |
| "learning_rate": 2.2692793931731988e-05, |
| "loss": 0.8664, |
| "mean_token_accuracy": 0.7753776194527745, |
| "num_tokens": 719082198.0, |
| "step": 4110 |
| }, |
| { |
| "epoch": 1.1226387777001823, |
| "grad_norm": 0.9634692668914795, |
| "learning_rate": 2.2622559348223065e-05, |
| "loss": 0.8593, |
| "mean_token_accuracy": 0.7774181716144085, |
| "num_tokens": 720809425.0, |
| "step": 4120 |
| }, |
| { |
| "epoch": 1.1253640838712975, |
| "grad_norm": 0.8464395999908447, |
| "learning_rate": 2.255232476471415e-05, |
| "loss": 0.8628, |
| "mean_token_accuracy": 0.7767069381661713, |
| "num_tokens": 722599262.0, |
| "step": 4130 |
| }, |
| { |
| "epoch": 1.1280893900424125, |
| "grad_norm": 0.9914641380310059, |
| "learning_rate": 2.2482090181205225e-05, |
| "loss": 0.891, |
| "mean_token_accuracy": 0.7694517195224762, |
| "num_tokens": 724347920.0, |
| "step": 4140 |
| }, |
| { |
| "epoch": 1.1308146962135277, |
| "grad_norm": 1.015712022781372, |
| "learning_rate": 2.241185559769631e-05, |
| "loss": 0.8757, |
| "mean_token_accuracy": 0.7740827260538936, |
| "num_tokens": 726148714.0, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.133540002384643, |
| "grad_norm": 0.9247304201126099, |
| "learning_rate": 2.2341621014187386e-05, |
| "loss": 0.8515, |
| "mean_token_accuracy": 0.778469798900187, |
| "num_tokens": 727926585.0, |
| "step": 4160 |
| }, |
| { |
| "epoch": 1.1362653085557581, |
| "grad_norm": 0.8736830353736877, |
| "learning_rate": 2.227138643067847e-05, |
| "loss": 0.8428, |
| "mean_token_accuracy": 0.7804164415225386, |
| "num_tokens": 729670197.0, |
| "step": 4170 |
| }, |
| { |
| "epoch": 1.1389906147268731, |
| "grad_norm": 1.021480679512024, |
| "learning_rate": 2.2201151847169546e-05, |
| "loss": 0.8686, |
| "mean_token_accuracy": 0.7744184449315071, |
| "num_tokens": 731485403.0, |
| "step": 4180 |
| }, |
| { |
| "epoch": 1.1417159208979883, |
| "grad_norm": 0.9501472115516663, |
| "learning_rate": 2.2130917263660626e-05, |
| "loss": 0.9155, |
| "mean_token_accuracy": 0.7662674132734537, |
| "num_tokens": 733341300.0, |
| "step": 4190 |
| }, |
| { |
| "epoch": 1.1444412270691036, |
| "grad_norm": 0.9563923478126526, |
| "learning_rate": 2.206068268015171e-05, |
| "loss": 0.8724, |
| "mean_token_accuracy": 0.7726614790037274, |
| "num_tokens": 735058358.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.1471665332402188, |
| "grad_norm": 0.8116152882575989, |
| "learning_rate": 2.1990448096642787e-05, |
| "loss": 0.8319, |
| "mean_token_accuracy": 0.782016440294683, |
| "num_tokens": 736766234.0, |
| "step": 4210 |
| }, |
| { |
| "epoch": 1.1498918394113338, |
| "grad_norm": 0.9251613020896912, |
| "learning_rate": 2.192021351313387e-05, |
| "loss": 0.8564, |
| "mean_token_accuracy": 0.7771940764039755, |
| "num_tokens": 738512811.0, |
| "step": 4220 |
| }, |
| { |
| "epoch": 1.152617145582449, |
| "grad_norm": 0.9852247834205627, |
| "learning_rate": 2.1849978929624947e-05, |
| "loss": 0.8433, |
| "mean_token_accuracy": 0.7803450087085366, |
| "num_tokens": 740229199.0, |
| "step": 4230 |
| }, |
| { |
| "epoch": 1.1553424517535642, |
| "grad_norm": 0.8679438233375549, |
| "learning_rate": 2.177974434611603e-05, |
| "loss": 0.8805, |
| "mean_token_accuracy": 0.7727601310238242, |
| "num_tokens": 741944899.0, |
| "step": 4240 |
| }, |
| { |
| "epoch": 1.1580677579246794, |
| "grad_norm": 0.8484223484992981, |
| "learning_rate": 2.1709509762607107e-05, |
| "loss": 0.863, |
| "mean_token_accuracy": 0.7756739309057593, |
| "num_tokens": 743693703.0, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.1607930640957944, |
| "grad_norm": 0.9152857661247253, |
| "learning_rate": 2.163927517909819e-05, |
| "loss": 0.8558, |
| "mean_token_accuracy": 0.7769493071362377, |
| "num_tokens": 745478644.0, |
| "step": 4260 |
| }, |
| { |
| "epoch": 1.1635183702669096, |
| "grad_norm": 0.9829593300819397, |
| "learning_rate": 2.1569040595589268e-05, |
| "loss": 0.8446, |
| "mean_token_accuracy": 0.7785475486889482, |
| "num_tokens": 747177577.0, |
| "step": 4270 |
| }, |
| { |
| "epoch": 1.1662436764380248, |
| "grad_norm": 0.8685614466667175, |
| "learning_rate": 2.149880601208035e-05, |
| "loss": 0.8495, |
| "mean_token_accuracy": 0.7795551843941212, |
| "num_tokens": 748939185.0, |
| "step": 4280 |
| }, |
| { |
| "epoch": 1.16896898260914, |
| "grad_norm": 1.758520245552063, |
| "learning_rate": 2.1428571428571428e-05, |
| "loss": 0.8643, |
| "mean_token_accuracy": 0.7761479124426842, |
| "num_tokens": 750713451.0, |
| "step": 4290 |
| }, |
| { |
| "epoch": 1.1716942887802553, |
| "grad_norm": 0.9914926886558533, |
| "learning_rate": 2.135833684506251e-05, |
| "loss": 0.8764, |
| "mean_token_accuracy": 0.7727332988753914, |
| "num_tokens": 752413608.0, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.1744195949513703, |
| "grad_norm": 0.873138427734375, |
| "learning_rate": 2.1288102261553592e-05, |
| "loss": 0.8749, |
| "mean_token_accuracy": 0.7754033163189888, |
| "num_tokens": 754171875.0, |
| "step": 4310 |
| }, |
| { |
| "epoch": 1.1771449011224855, |
| "grad_norm": 0.8656719326972961, |
| "learning_rate": 2.121786767804467e-05, |
| "loss": 0.8317, |
| "mean_token_accuracy": 0.7830374624580145, |
| "num_tokens": 755959283.0, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.1798702072936007, |
| "grad_norm": 0.8523385524749756, |
| "learning_rate": 2.1147633094535752e-05, |
| "loss": 0.8723, |
| "mean_token_accuracy": 0.7747141301631928, |
| "num_tokens": 757784699.0, |
| "step": 4330 |
| }, |
| { |
| "epoch": 1.1825955134647157, |
| "grad_norm": 0.8827849626541138, |
| "learning_rate": 2.107739851102683e-05, |
| "loss": 0.8394, |
| "mean_token_accuracy": 0.7828326500952244, |
| "num_tokens": 759536372.0, |
| "step": 4340 |
| }, |
| { |
| "epoch": 1.185320819635831, |
| "grad_norm": 0.9193124771118164, |
| "learning_rate": 2.1007163927517913e-05, |
| "loss": 0.847, |
| "mean_token_accuracy": 0.7787586458027362, |
| "num_tokens": 761294878.0, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.1880461258069461, |
| "grad_norm": 0.8960845470428467, |
| "learning_rate": 2.093692934400899e-05, |
| "loss": 0.8687, |
| "mean_token_accuracy": 0.775050479453057, |
| "num_tokens": 763041449.0, |
| "step": 4360 |
| }, |
| { |
| "epoch": 1.1907714319780613, |
| "grad_norm": 0.9564264416694641, |
| "learning_rate": 2.0866694760500073e-05, |
| "loss": 0.8442, |
| "mean_token_accuracy": 0.780244923941791, |
| "num_tokens": 764760036.0, |
| "step": 4370 |
| }, |
| { |
| "epoch": 1.1934967381491766, |
| "grad_norm": 1.0011682510375977, |
| "learning_rate": 2.079646017699115e-05, |
| "loss": 0.89, |
| "mean_token_accuracy": 0.7707465596497058, |
| "num_tokens": 766503555.0, |
| "step": 4380 |
| }, |
| { |
| "epoch": 1.1962220443202916, |
| "grad_norm": 1.0252357721328735, |
| "learning_rate": 2.0726225593482233e-05, |
| "loss": 0.8655, |
| "mean_token_accuracy": 0.7751994812861085, |
| "num_tokens": 768216315.0, |
| "step": 4390 |
| }, |
| { |
| "epoch": 1.1989473504914068, |
| "grad_norm": 0.9509198069572449, |
| "learning_rate": 2.065599100997331e-05, |
| "loss": 0.8247, |
| "mean_token_accuracy": 0.784438369423151, |
| "num_tokens": 769929999.0, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.201672656662522, |
| "grad_norm": 0.9141966104507446, |
| "learning_rate": 2.058575642646439e-05, |
| "loss": 0.9297, |
| "mean_token_accuracy": 0.7612682949751616, |
| "num_tokens": 771653789.0, |
| "step": 4410 |
| }, |
| { |
| "epoch": 1.204397962833637, |
| "grad_norm": 0.8379613161087036, |
| "learning_rate": 2.0515521842955474e-05, |
| "loss": 0.8875, |
| "mean_token_accuracy": 0.770428568776697, |
| "num_tokens": 773424211.0, |
| "step": 4420 |
| }, |
| { |
| "epoch": 1.2071232690047522, |
| "grad_norm": 0.93412184715271, |
| "learning_rate": 2.044528725944655e-05, |
| "loss": 0.8803, |
| "mean_token_accuracy": 0.7713617945089937, |
| "num_tokens": 775177147.0, |
| "step": 4430 |
| }, |
| { |
| "epoch": 1.2098485751758674, |
| "grad_norm": 0.8681289553642273, |
| "learning_rate": 2.0375052675937634e-05, |
| "loss": 0.8952, |
| "mean_token_accuracy": 0.7711352832615376, |
| "num_tokens": 776924632.0, |
| "step": 4440 |
| }, |
| { |
| "epoch": 1.2125738813469826, |
| "grad_norm": 0.844864547252655, |
| "learning_rate": 2.030481809242871e-05, |
| "loss": 0.8667, |
| "mean_token_accuracy": 0.7762801831588149, |
| "num_tokens": 778669132.0, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.2152991875180978, |
| "grad_norm": 0.888852059841156, |
| "learning_rate": 2.0234583508919795e-05, |
| "loss": 0.8519, |
| "mean_token_accuracy": 0.7777089156210423, |
| "num_tokens": 780470551.0, |
| "step": 4460 |
| }, |
| { |
| "epoch": 1.2180244936892128, |
| "grad_norm": 0.8752065300941467, |
| "learning_rate": 2.016434892541087e-05, |
| "loss": 0.889, |
| "mean_token_accuracy": 0.7703436804935336, |
| "num_tokens": 782177060.0, |
| "step": 4470 |
| }, |
| { |
| "epoch": 1.220749799860328, |
| "grad_norm": 0.9024233222007751, |
| "learning_rate": 2.0094114341901955e-05, |
| "loss": 0.8667, |
| "mean_token_accuracy": 0.7754130480811, |
| "num_tokens": 783933412.0, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.2234751060314433, |
| "grad_norm": 0.9540109038352966, |
| "learning_rate": 2.0023879758393032e-05, |
| "loss": 0.8491, |
| "mean_token_accuracy": 0.7784526620060206, |
| "num_tokens": 785660774.0, |
| "step": 4490 |
| }, |
| { |
| "epoch": 1.2262004122025583, |
| "grad_norm": 1.044049620628357, |
| "learning_rate": 1.9953645174884116e-05, |
| "loss": 0.8701, |
| "mean_token_accuracy": 0.7744961548596621, |
| "num_tokens": 787392675.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.2289257183736735, |
| "grad_norm": 0.8813518285751343, |
| "learning_rate": 1.9883410591375192e-05, |
| "loss": 0.8819, |
| "mean_token_accuracy": 0.7719037376344204, |
| "num_tokens": 789226322.0, |
| "step": 4510 |
| }, |
| { |
| "epoch": 1.2316510245447887, |
| "grad_norm": 1.100071907043457, |
| "learning_rate": 1.9813176007866273e-05, |
| "loss": 0.8791, |
| "mean_token_accuracy": 0.7712693855166435, |
| "num_tokens": 790926661.0, |
| "step": 4520 |
| }, |
| { |
| "epoch": 1.234376330715904, |
| "grad_norm": 0.9320583939552307, |
| "learning_rate": 1.9742941424357356e-05, |
| "loss": 0.8898, |
| "mean_token_accuracy": 0.770352647267282, |
| "num_tokens": 792705504.0, |
| "step": 4530 |
| }, |
| { |
| "epoch": 1.2371016368870191, |
| "grad_norm": 0.8929402828216553, |
| "learning_rate": 1.9672706840848433e-05, |
| "loss": 0.8646, |
| "mean_token_accuracy": 0.7746491042897106, |
| "num_tokens": 794463296.0, |
| "step": 4540 |
| }, |
| { |
| "epoch": 1.2398269430581341, |
| "grad_norm": 0.9568018317222595, |
| "learning_rate": 1.9602472257339517e-05, |
| "loss": 0.84, |
| "mean_token_accuracy": 0.7792695637792348, |
| "num_tokens": 796199027.0, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.2425522492292493, |
| "grad_norm": 0.8973085880279541, |
| "learning_rate": 1.9532237673830593e-05, |
| "loss": 0.86, |
| "mean_token_accuracy": 0.7752432754263282, |
| "num_tokens": 797958936.0, |
| "step": 4560 |
| }, |
| { |
| "epoch": 1.2452775554003646, |
| "grad_norm": 0.9531684517860413, |
| "learning_rate": 1.9462003090321677e-05, |
| "loss": 0.8774, |
| "mean_token_accuracy": 0.7717797016724944, |
| "num_tokens": 799700358.0, |
| "step": 4570 |
| }, |
| { |
| "epoch": 1.2480028615714795, |
| "grad_norm": 0.8983331322669983, |
| "learning_rate": 1.9391768506812754e-05, |
| "loss": 0.8749, |
| "mean_token_accuracy": 0.7740316811949015, |
| "num_tokens": 801489379.0, |
| "step": 4580 |
| }, |
| { |
| "epoch": 1.2507281677425948, |
| "grad_norm": 0.8893999457359314, |
| "learning_rate": 1.9321533923303837e-05, |
| "loss": 0.8731, |
| "mean_token_accuracy": 0.773632800579071, |
| "num_tokens": 803248320.0, |
| "step": 4590 |
| }, |
| { |
| "epoch": 1.25345347391371, |
| "grad_norm": 0.936711847782135, |
| "learning_rate": 1.9251299339794914e-05, |
| "loss": 0.8897, |
| "mean_token_accuracy": 0.7704314980655909, |
| "num_tokens": 804952704.0, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.2561787800848252, |
| "grad_norm": 0.9723296761512756, |
| "learning_rate": 1.9181064756285998e-05, |
| "loss": 0.8649, |
| "mean_token_accuracy": 0.7750582758337259, |
| "num_tokens": 806596269.0, |
| "step": 4610 |
| }, |
| { |
| "epoch": 1.2589040862559404, |
| "grad_norm": 0.8637282252311707, |
| "learning_rate": 1.9110830172777075e-05, |
| "loss": 0.8841, |
| "mean_token_accuracy": 0.7715824166312814, |
| "num_tokens": 808364023.0, |
| "step": 4620 |
| }, |
| { |
| "epoch": 1.2616293924270554, |
| "grad_norm": 0.917448103427887, |
| "learning_rate": 1.9040595589268155e-05, |
| "loss": 0.8781, |
| "mean_token_accuracy": 0.772852830402553, |
| "num_tokens": 810145656.0, |
| "step": 4630 |
| }, |
| { |
| "epoch": 1.2643546985981706, |
| "grad_norm": 0.9181602597236633, |
| "learning_rate": 1.897036100575924e-05, |
| "loss": 0.8793, |
| "mean_token_accuracy": 0.7728120513260365, |
| "num_tokens": 811942362.0, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.2670800047692858, |
| "grad_norm": 0.9608568549156189, |
| "learning_rate": 1.8900126422250315e-05, |
| "loss": 0.8845, |
| "mean_token_accuracy": 0.7719099586829543, |
| "num_tokens": 813665193.0, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.2698053109404008, |
| "grad_norm": 0.9495261907577515, |
| "learning_rate": 1.88298918387414e-05, |
| "loss": 0.8894, |
| "mean_token_accuracy": 0.7707803629338741, |
| "num_tokens": 815388624.0, |
| "step": 4660 |
| }, |
| { |
| "epoch": 1.272530617111516, |
| "grad_norm": 0.9851597547531128, |
| "learning_rate": 1.8759657255232476e-05, |
| "loss": 0.8622, |
| "mean_token_accuracy": 0.7751537755131721, |
| "num_tokens": 817088463.0, |
| "step": 4670 |
| }, |
| { |
| "epoch": 1.2752559232826313, |
| "grad_norm": 0.9643869400024414, |
| "learning_rate": 1.868942267172356e-05, |
| "loss": 0.8389, |
| "mean_token_accuracy": 0.7803213361650705, |
| "num_tokens": 818867377.0, |
| "step": 4680 |
| }, |
| { |
| "epoch": 1.2779812294537465, |
| "grad_norm": 0.8128288984298706, |
| "learning_rate": 1.8619188088214636e-05, |
| "loss": 0.8718, |
| "mean_token_accuracy": 0.7747130613774061, |
| "num_tokens": 820718067.0, |
| "step": 4690 |
| }, |
| { |
| "epoch": 1.2807065356248617, |
| "grad_norm": 0.8736932873725891, |
| "learning_rate": 1.854895350470572e-05, |
| "loss": 0.8565, |
| "mean_token_accuracy": 0.7788665095344186, |
| "num_tokens": 822427113.0, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.2834318417959767, |
| "grad_norm": 0.9515299797058105, |
| "learning_rate": 1.8478718921196796e-05, |
| "loss": 0.8371, |
| "mean_token_accuracy": 0.7818497186526656, |
| "num_tokens": 824202933.0, |
| "step": 4710 |
| }, |
| { |
| "epoch": 1.286157147967092, |
| "grad_norm": 0.8756122589111328, |
| "learning_rate": 1.840848433768788e-05, |
| "loss": 0.834, |
| "mean_token_accuracy": 0.7834942568093538, |
| "num_tokens": 825952635.0, |
| "step": 4720 |
| }, |
| { |
| "epoch": 1.2888824541382071, |
| "grad_norm": 0.8920331597328186, |
| "learning_rate": 1.833824975417896e-05, |
| "loss": 0.8561, |
| "mean_token_accuracy": 0.7782587666064501, |
| "num_tokens": 827717059.0, |
| "step": 4730 |
| }, |
| { |
| "epoch": 1.2916077603093221, |
| "grad_norm": 0.9464604258537292, |
| "learning_rate": 1.8268015170670037e-05, |
| "loss": 0.8842, |
| "mean_token_accuracy": 0.7727599732577801, |
| "num_tokens": 829416511.0, |
| "step": 4740 |
| }, |
| { |
| "epoch": 1.2943330664804373, |
| "grad_norm": 0.9033367037773132, |
| "learning_rate": 1.819778058716112e-05, |
| "loss": 0.8724, |
| "mean_token_accuracy": 0.7747443996369838, |
| "num_tokens": 831144366.0, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.2970583726515525, |
| "grad_norm": 0.8975238800048828, |
| "learning_rate": 1.8127546003652197e-05, |
| "loss": 0.8629, |
| "mean_token_accuracy": 0.7767355851829052, |
| "num_tokens": 832951363.0, |
| "step": 4760 |
| }, |
| { |
| "epoch": 1.2997836788226678, |
| "grad_norm": 0.9090251326560974, |
| "learning_rate": 1.805731142014328e-05, |
| "loss": 0.8168, |
| "mean_token_accuracy": 0.7849198190495372, |
| "num_tokens": 834718890.0, |
| "step": 4770 |
| }, |
| { |
| "epoch": 1.302508984993783, |
| "grad_norm": 0.9395559430122375, |
| "learning_rate": 1.7987076836634358e-05, |
| "loss": 0.8704, |
| "mean_token_accuracy": 0.775361999310553, |
| "num_tokens": 836486880.0, |
| "step": 4780 |
| }, |
| { |
| "epoch": 1.305234291164898, |
| "grad_norm": 0.970077633857727, |
| "learning_rate": 1.791684225312544e-05, |
| "loss": 0.8534, |
| "mean_token_accuracy": 0.777713512070477, |
| "num_tokens": 838202213.0, |
| "step": 4790 |
| }, |
| { |
| "epoch": 1.3079595973360132, |
| "grad_norm": 0.9468600749969482, |
| "learning_rate": 1.7846607669616518e-05, |
| "loss": 0.8477, |
| "mean_token_accuracy": 0.7789401352405548, |
| "num_tokens": 839960268.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.3106849035071284, |
| "grad_norm": 0.8117781281471252, |
| "learning_rate": 1.7776373086107602e-05, |
| "loss": 0.8423, |
| "mean_token_accuracy": 0.780457166954875, |
| "num_tokens": 841674015.0, |
| "step": 4810 |
| }, |
| { |
| "epoch": 1.3134102096782436, |
| "grad_norm": 0.9844197034835815, |
| "learning_rate": 1.770613850259868e-05, |
| "loss": 0.8601, |
| "mean_token_accuracy": 0.777037788927555, |
| "num_tokens": 843396816.0, |
| "step": 4820 |
| }, |
| { |
| "epoch": 1.3161355158493588, |
| "grad_norm": 0.9023681879043579, |
| "learning_rate": 1.7635903919089762e-05, |
| "loss": 0.8492, |
| "mean_token_accuracy": 0.778867963515222, |
| "num_tokens": 845043717.0, |
| "step": 4830 |
| }, |
| { |
| "epoch": 1.3188608220204738, |
| "grad_norm": 0.9703834056854248, |
| "learning_rate": 1.7565669335580842e-05, |
| "loss": 0.8455, |
| "mean_token_accuracy": 0.7796616595238447, |
| "num_tokens": 846826112.0, |
| "step": 4840 |
| }, |
| { |
| "epoch": 1.321586128191589, |
| "grad_norm": 0.9444880485534668, |
| "learning_rate": 1.7495434752071923e-05, |
| "loss": 0.8751, |
| "mean_token_accuracy": 0.773424380645156, |
| "num_tokens": 848585637.0, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.3243114343627043, |
| "grad_norm": 1.05745530128479, |
| "learning_rate": 1.7425200168563003e-05, |
| "loss": 0.8534, |
| "mean_token_accuracy": 0.7779850512742996, |
| "num_tokens": 850326238.0, |
| "step": 4860 |
| }, |
| { |
| "epoch": 1.3270367405338193, |
| "grad_norm": 0.8836587071418762, |
| "learning_rate": 1.735496558505408e-05, |
| "loss": 0.8454, |
| "mean_token_accuracy": 0.7796294245868921, |
| "num_tokens": 852155502.0, |
| "step": 4870 |
| }, |
| { |
| "epoch": 1.3297620467049345, |
| "grad_norm": 0.9077982902526855, |
| "learning_rate": 1.7284731001545163e-05, |
| "loss": 0.8762, |
| "mean_token_accuracy": 0.7735532848164439, |
| "num_tokens": 853949772.0, |
| "step": 4880 |
| }, |
| { |
| "epoch": 1.3324873528760497, |
| "grad_norm": 0.965934693813324, |
| "learning_rate": 1.721449641803624e-05, |
| "loss": 0.8726, |
| "mean_token_accuracy": 0.7748719964176416, |
| "num_tokens": 855766767.0, |
| "step": 4890 |
| }, |
| { |
| "epoch": 1.335212659047165, |
| "grad_norm": 0.9339163303375244, |
| "learning_rate": 1.7144261834527323e-05, |
| "loss": 0.8927, |
| "mean_token_accuracy": 0.769934331253171, |
| "num_tokens": 857505341.0, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.3379379652182801, |
| "grad_norm": 0.9092389941215515, |
| "learning_rate": 1.70740272510184e-05, |
| "loss": 0.8491, |
| "mean_token_accuracy": 0.7787934005260467, |
| "num_tokens": 859262891.0, |
| "step": 4910 |
| }, |
| { |
| "epoch": 1.3406632713893951, |
| "grad_norm": 0.9047221541404724, |
| "learning_rate": 1.7003792667509484e-05, |
| "loss": 0.8732, |
| "mean_token_accuracy": 0.7735786143690347, |
| "num_tokens": 861020281.0, |
| "step": 4920 |
| }, |
| { |
| "epoch": 1.3433885775605103, |
| "grad_norm": 0.9687885046005249, |
| "learning_rate": 1.693355808400056e-05, |
| "loss": 0.8712, |
| "mean_token_accuracy": 0.7727358728647232, |
| "num_tokens": 862708279.0, |
| "step": 4930 |
| }, |
| { |
| "epoch": 1.3461138837316255, |
| "grad_norm": 0.9339674711227417, |
| "learning_rate": 1.6863323500491644e-05, |
| "loss": 0.871, |
| "mean_token_accuracy": 0.7751242617145181, |
| "num_tokens": 864441387.0, |
| "step": 4940 |
| }, |
| { |
| "epoch": 1.3488391899027405, |
| "grad_norm": 0.9413111805915833, |
| "learning_rate": 1.6793088916982724e-05, |
| "loss": 0.8329, |
| "mean_token_accuracy": 0.781625160202384, |
| "num_tokens": 866182511.0, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.3515644960738558, |
| "grad_norm": 0.9483174681663513, |
| "learning_rate": 1.6722854333473805e-05, |
| "loss": 0.8499, |
| "mean_token_accuracy": 0.7779764724895358, |
| "num_tokens": 867919731.0, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.354289802244971, |
| "grad_norm": 0.868556022644043, |
| "learning_rate": 1.6652619749964885e-05, |
| "loss": 0.8997, |
| "mean_token_accuracy": 0.7686142545193434, |
| "num_tokens": 869673959.0, |
| "step": 4970 |
| }, |
| { |
| "epoch": 1.3570151084160862, |
| "grad_norm": 0.8492459654808044, |
| "learning_rate": 1.658238516645596e-05, |
| "loss": 0.86, |
| "mean_token_accuracy": 0.7771158112213016, |
| "num_tokens": 871480789.0, |
| "step": 4980 |
| }, |
| { |
| "epoch": 1.3597404145872014, |
| "grad_norm": 1.0295300483703613, |
| "learning_rate": 1.6512150582947045e-05, |
| "loss": 0.8584, |
| "mean_token_accuracy": 0.7776686141267419, |
| "num_tokens": 873221278.0, |
| "step": 4990 |
| }, |
| { |
| "epoch": 1.3624657207583164, |
| "grad_norm": 0.9210092425346375, |
| "learning_rate": 1.6441915999438122e-05, |
| "loss": 0.8712, |
| "mean_token_accuracy": 0.7737454175949097, |
| "num_tokens": 874968880.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.3651910269294316, |
| "grad_norm": 0.9282307624816895, |
| "learning_rate": 1.6371681415929206e-05, |
| "loss": 0.9054, |
| "mean_token_accuracy": 0.7669104115106166, |
| "num_tokens": 876752750.0, |
| "step": 5010 |
| }, |
| { |
| "epoch": 1.3679163331005468, |
| "grad_norm": 0.8968009352684021, |
| "learning_rate": 1.6301446832420282e-05, |
| "loss": 0.8461, |
| "mean_token_accuracy": 0.7793454423546791, |
| "num_tokens": 878505084.0, |
| "step": 5020 |
| }, |
| { |
| "epoch": 1.3706416392716618, |
| "grad_norm": 0.8846127390861511, |
| "learning_rate": 1.6231212248911366e-05, |
| "loss": 0.898, |
| "mean_token_accuracy": 0.7695556240156293, |
| "num_tokens": 880302245.0, |
| "step": 5030 |
| }, |
| { |
| "epoch": 1.373366945442777, |
| "grad_norm": 0.9551399946212769, |
| "learning_rate": 1.6160977665402443e-05, |
| "loss": 0.8757, |
| "mean_token_accuracy": 0.7735548134893179, |
| "num_tokens": 882111291.0, |
| "step": 5040 |
| }, |
| { |
| "epoch": 1.3760922516138923, |
| "grad_norm": 0.931864857673645, |
| "learning_rate": 1.6090743081893526e-05, |
| "loss": 0.8341, |
| "mean_token_accuracy": 0.7826241103932261, |
| "num_tokens": 883876821.0, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.3788175577850075, |
| "grad_norm": 0.972055971622467, |
| "learning_rate": 1.6020508498384607e-05, |
| "loss": 0.8819, |
| "mean_token_accuracy": 0.7730827201157808, |
| "num_tokens": 885619837.0, |
| "step": 5060 |
| }, |
| { |
| "epoch": 1.3815428639561227, |
| "grad_norm": 0.946313202381134, |
| "learning_rate": 1.5950273914875687e-05, |
| "loss": 0.8818, |
| "mean_token_accuracy": 0.7729535020887852, |
| "num_tokens": 887378117.0, |
| "step": 5070 |
| }, |
| { |
| "epoch": 1.3842681701272377, |
| "grad_norm": 0.8595699667930603, |
| "learning_rate": 1.5880039331366767e-05, |
| "loss": 0.8569, |
| "mean_token_accuracy": 0.7767195735126734, |
| "num_tokens": 889137008.0, |
| "step": 5080 |
| }, |
| { |
| "epoch": 1.386993476298353, |
| "grad_norm": 0.8799399733543396, |
| "learning_rate": 1.5809804747857844e-05, |
| "loss": 0.8909, |
| "mean_token_accuracy": 0.7725385947618634, |
| "num_tokens": 890871255.0, |
| "step": 5090 |
| }, |
| { |
| "epoch": 1.3897187824694681, |
| "grad_norm": 0.9310979843139648, |
| "learning_rate": 1.5739570164348927e-05, |
| "loss": 0.8375, |
| "mean_token_accuracy": 0.7820886155590415, |
| "num_tokens": 892626679.0, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.392444088640583, |
| "grad_norm": 0.9329671263694763, |
| "learning_rate": 1.5669335580840004e-05, |
| "loss": 0.8764, |
| "mean_token_accuracy": 0.7733617445454002, |
| "num_tokens": 894371346.0, |
| "step": 5110 |
| }, |
| { |
| "epoch": 1.3951693948116983, |
| "grad_norm": 0.903026282787323, |
| "learning_rate": 1.5599100997331088e-05, |
| "loss": 0.8537, |
| "mean_token_accuracy": 0.7767492802813649, |
| "num_tokens": 896171792.0, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.3978947009828135, |
| "grad_norm": 0.9470346570014954, |
| "learning_rate": 1.5528866413822165e-05, |
| "loss": 0.8801, |
| "mean_token_accuracy": 0.7735201021656394, |
| "num_tokens": 897925465.0, |
| "step": 5130 |
| }, |
| { |
| "epoch": 1.4006200071539288, |
| "grad_norm": 0.938048779964447, |
| "learning_rate": 1.5458631830313248e-05, |
| "loss": 0.8854, |
| "mean_token_accuracy": 0.772962811216712, |
| "num_tokens": 899668162.0, |
| "step": 5140 |
| }, |
| { |
| "epoch": 1.403345313325044, |
| "grad_norm": 0.9794390201568604, |
| "learning_rate": 1.5388397246804325e-05, |
| "loss": 0.8823, |
| "mean_token_accuracy": 0.7721097562462091, |
| "num_tokens": 901385964.0, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.406070619496159, |
| "grad_norm": 1.048996925354004, |
| "learning_rate": 1.531816266329541e-05, |
| "loss": 0.9095, |
| "mean_token_accuracy": 0.7677682116627693, |
| "num_tokens": 903167942.0, |
| "step": 5160 |
| }, |
| { |
| "epoch": 1.4087959256672742, |
| "grad_norm": 0.9089860916137695, |
| "learning_rate": 1.5247928079786489e-05, |
| "loss": 0.86, |
| "mean_token_accuracy": 0.7760318687185646, |
| "num_tokens": 904903509.0, |
| "step": 5170 |
| }, |
| { |
| "epoch": 1.4115212318383894, |
| "grad_norm": 0.9405279755592346, |
| "learning_rate": 1.5177693496277567e-05, |
| "loss": 0.8659, |
| "mean_token_accuracy": 0.7763911234214902, |
| "num_tokens": 906716098.0, |
| "step": 5180 |
| }, |
| { |
| "epoch": 1.4142465380095044, |
| "grad_norm": 1.051804542541504, |
| "learning_rate": 1.510745891276865e-05, |
| "loss": 0.9069, |
| "mean_token_accuracy": 0.7659270778298378, |
| "num_tokens": 908350817.0, |
| "step": 5190 |
| }, |
| { |
| "epoch": 1.4169718441806196, |
| "grad_norm": 0.9250074028968811, |
| "learning_rate": 1.5037224329259728e-05, |
| "loss": 0.8967, |
| "mean_token_accuracy": 0.7680221803486347, |
| "num_tokens": 910114038.0, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.4196971503517348, |
| "grad_norm": 0.8928842544555664, |
| "learning_rate": 1.496698974575081e-05, |
| "loss": 0.851, |
| "mean_token_accuracy": 0.7791072161868214, |
| "num_tokens": 911849057.0, |
| "step": 5210 |
| }, |
| { |
| "epoch": 1.42242245652285, |
| "grad_norm": 0.9152616858482361, |
| "learning_rate": 1.4896755162241888e-05, |
| "loss": 0.9021, |
| "mean_token_accuracy": 0.7676613539457321, |
| "num_tokens": 913608571.0, |
| "step": 5220 |
| }, |
| { |
| "epoch": 1.4251477626939653, |
| "grad_norm": 0.9313606023788452, |
| "learning_rate": 1.482652057873297e-05, |
| "loss": 0.863, |
| "mean_token_accuracy": 0.7756113139912486, |
| "num_tokens": 915305445.0, |
| "step": 5230 |
| }, |
| { |
| "epoch": 1.4278730688650803, |
| "grad_norm": 0.894331157207489, |
| "learning_rate": 1.4756285995224048e-05, |
| "loss": 0.8679, |
| "mean_token_accuracy": 0.7756274785846472, |
| "num_tokens": 917060810.0, |
| "step": 5240 |
| }, |
| { |
| "epoch": 1.4305983750361955, |
| "grad_norm": 1.0039823055267334, |
| "learning_rate": 1.4686051411715129e-05, |
| "loss": 0.8893, |
| "mean_token_accuracy": 0.771225837804377, |
| "num_tokens": 918774532.0, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.4333236812073107, |
| "grad_norm": 0.902562141418457, |
| "learning_rate": 1.461581682820621e-05, |
| "loss": 0.8686, |
| "mean_token_accuracy": 0.7744617283344268, |
| "num_tokens": 920510936.0, |
| "step": 5260 |
| }, |
| { |
| "epoch": 1.4360489873784257, |
| "grad_norm": 0.8837618827819824, |
| "learning_rate": 1.4545582244697289e-05, |
| "loss": 0.8599, |
| "mean_token_accuracy": 0.7760492115281522, |
| "num_tokens": 922287282.0, |
| "step": 5270 |
| }, |
| { |
| "epoch": 1.438774293549541, |
| "grad_norm": 0.8948635458946228, |
| "learning_rate": 1.4475347661188371e-05, |
| "loss": 0.8603, |
| "mean_token_accuracy": 0.7758477000519634, |
| "num_tokens": 924045825.0, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.441499599720656, |
| "grad_norm": 0.9150235652923584, |
| "learning_rate": 1.440511307767945e-05, |
| "loss": 0.835, |
| "mean_token_accuracy": 0.7826417770236731, |
| "num_tokens": 925730016.0, |
| "step": 5290 |
| }, |
| { |
| "epoch": 1.4442249058917713, |
| "grad_norm": 0.8513997197151184, |
| "learning_rate": 1.4334878494170531e-05, |
| "loss": 0.8959, |
| "mean_token_accuracy": 0.7688881358131766, |
| "num_tokens": 927502244.0, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.4469502120628865, |
| "grad_norm": 0.9665471315383911, |
| "learning_rate": 1.426464391066161e-05, |
| "loss": 0.8888, |
| "mean_token_accuracy": 0.7705901915207505, |
| "num_tokens": 929331138.0, |
| "step": 5310 |
| }, |
| { |
| "epoch": 1.4496755182340015, |
| "grad_norm": 0.9610171318054199, |
| "learning_rate": 1.4194409327152692e-05, |
| "loss": 0.8641, |
| "mean_token_accuracy": 0.7762337351217866, |
| "num_tokens": 931093746.0, |
| "step": 5320 |
| }, |
| { |
| "epoch": 1.4524008244051168, |
| "grad_norm": 0.9864968061447144, |
| "learning_rate": 1.412417474364377e-05, |
| "loss": 0.8619, |
| "mean_token_accuracy": 0.7765546761453151, |
| "num_tokens": 932856400.0, |
| "step": 5330 |
| }, |
| { |
| "epoch": 1.455126130576232, |
| "grad_norm": 0.8662729859352112, |
| "learning_rate": 1.4053940160134852e-05, |
| "loss": 0.8407, |
| "mean_token_accuracy": 0.7820514727383852, |
| "num_tokens": 934561196.0, |
| "step": 5340 |
| }, |
| { |
| "epoch": 1.457851436747347, |
| "grad_norm": 0.9846469759941101, |
| "learning_rate": 1.398370557662593e-05, |
| "loss": 0.87, |
| "mean_token_accuracy": 0.7749409038573504, |
| "num_tokens": 936320114.0, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.4605767429184622, |
| "grad_norm": 0.9434241652488708, |
| "learning_rate": 1.391347099311701e-05, |
| "loss": 0.8455, |
| "mean_token_accuracy": 0.7791736220940948, |
| "num_tokens": 938025766.0, |
| "step": 5360 |
| }, |
| { |
| "epoch": 1.4633020490895774, |
| "grad_norm": 0.8870792984962463, |
| "learning_rate": 1.3843236409608093e-05, |
| "loss": 0.8753, |
| "mean_token_accuracy": 0.7732191385701299, |
| "num_tokens": 939734341.0, |
| "step": 5370 |
| }, |
| { |
| "epoch": 1.4660273552606926, |
| "grad_norm": 0.9394125938415527, |
| "learning_rate": 1.3773001826099171e-05, |
| "loss": 0.8781, |
| "mean_token_accuracy": 0.7740361276082695, |
| "num_tokens": 941470370.0, |
| "step": 5380 |
| }, |
| { |
| "epoch": 1.4687526614318078, |
| "grad_norm": 0.8531573414802551, |
| "learning_rate": 1.3702767242590253e-05, |
| "loss": 0.8679, |
| "mean_token_accuracy": 0.7743699472397566, |
| "num_tokens": 943242271.0, |
| "step": 5390 |
| }, |
| { |
| "epoch": 1.4714779676029228, |
| "grad_norm": 0.9945854544639587, |
| "learning_rate": 1.3632532659081332e-05, |
| "loss": 0.8375, |
| "mean_token_accuracy": 0.7817069901153445, |
| "num_tokens": 945033055.0, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.474203273774038, |
| "grad_norm": 0.9767341017723083, |
| "learning_rate": 1.3562298075572413e-05, |
| "loss": 0.8446, |
| "mean_token_accuracy": 0.7792644891887903, |
| "num_tokens": 946767511.0, |
| "step": 5410 |
| }, |
| { |
| "epoch": 1.4769285799451533, |
| "grad_norm": 0.9598928093910217, |
| "learning_rate": 1.3492063492063492e-05, |
| "loss": 0.8778, |
| "mean_token_accuracy": 0.772851456515491, |
| "num_tokens": 948483568.0, |
| "step": 5420 |
| }, |
| { |
| "epoch": 1.4796538861162682, |
| "grad_norm": 0.9809437990188599, |
| "learning_rate": 1.3421828908554574e-05, |
| "loss": 0.866, |
| "mean_token_accuracy": 0.7753817655146122, |
| "num_tokens": 950245709.0, |
| "step": 5430 |
| }, |
| { |
| "epoch": 1.4823791922873835, |
| "grad_norm": 0.8955346941947937, |
| "learning_rate": 1.3351594325045652e-05, |
| "loss": 0.8702, |
| "mean_token_accuracy": 0.7736866667866706, |
| "num_tokens": 951927075.0, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.4851044984584987, |
| "grad_norm": 0.8952342867851257, |
| "learning_rate": 1.3281359741536734e-05, |
| "loss": 0.8876, |
| "mean_token_accuracy": 0.771092209033668, |
| "num_tokens": 953663268.0, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.487829804629614, |
| "grad_norm": 0.9086266756057739, |
| "learning_rate": 1.3211125158027813e-05, |
| "loss": 0.8563, |
| "mean_token_accuracy": 0.7787042908370495, |
| "num_tokens": 955406203.0, |
| "step": 5460 |
| }, |
| { |
| "epoch": 1.490555110800729, |
| "grad_norm": 0.9907563924789429, |
| "learning_rate": 1.3140890574518893e-05, |
| "loss": 0.9284, |
| "mean_token_accuracy": 0.7634458415210247, |
| "num_tokens": 957099890.0, |
| "step": 5470 |
| }, |
| { |
| "epoch": 1.493280416971844, |
| "grad_norm": 0.9550221562385559, |
| "learning_rate": 1.3070655991009975e-05, |
| "loss": 0.854, |
| "mean_token_accuracy": 0.7769535241648555, |
| "num_tokens": 958864424.0, |
| "step": 5480 |
| }, |
| { |
| "epoch": 1.4960057231429593, |
| "grad_norm": 0.9292310476303101, |
| "learning_rate": 1.3000421407501053e-05, |
| "loss": 0.8391, |
| "mean_token_accuracy": 0.7812065415084362, |
| "num_tokens": 960615561.0, |
| "step": 5490 |
| }, |
| { |
| "epoch": 1.4987310293140745, |
| "grad_norm": 1.0998324155807495, |
| "learning_rate": 1.2930186823992135e-05, |
| "loss": 0.8341, |
| "mean_token_accuracy": 0.7822911713272334, |
| "num_tokens": 962350259.0, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.5014563354851895, |
| "grad_norm": 0.9170006513595581, |
| "learning_rate": 1.2859952240483214e-05, |
| "loss": 0.8356, |
| "mean_token_accuracy": 0.7831004908308387, |
| "num_tokens": 964103329.0, |
| "step": 5510 |
| }, |
| { |
| "epoch": 1.504181641656305, |
| "grad_norm": 0.8882780075073242, |
| "learning_rate": 1.2789717656974296e-05, |
| "loss": 0.884, |
| "mean_token_accuracy": 0.7715097725391388, |
| "num_tokens": 965827916.0, |
| "step": 5520 |
| }, |
| { |
| "epoch": 1.50690694782742, |
| "grad_norm": 0.9092488288879395, |
| "learning_rate": 1.2719483073465374e-05, |
| "loss": 0.8312, |
| "mean_token_accuracy": 0.7838441342115402, |
| "num_tokens": 967545740.0, |
| "step": 5530 |
| }, |
| { |
| "epoch": 1.5096322539985352, |
| "grad_norm": 0.8435326218605042, |
| "learning_rate": 1.2649248489956456e-05, |
| "loss": 0.8475, |
| "mean_token_accuracy": 0.7786700014024973, |
| "num_tokens": 969400946.0, |
| "step": 5540 |
| }, |
| { |
| "epoch": 1.5123575601696504, |
| "grad_norm": 0.9444941878318787, |
| "learning_rate": 1.2579013906447535e-05, |
| "loss": 0.8827, |
| "mean_token_accuracy": 0.7715349035337568, |
| "num_tokens": 971161426.0, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.5150828663407654, |
| "grad_norm": 0.8825281262397766, |
| "learning_rate": 1.2508779322938616e-05, |
| "loss": 0.8511, |
| "mean_token_accuracy": 0.7775511790066958, |
| "num_tokens": 972880541.0, |
| "step": 5560 |
| }, |
| { |
| "epoch": 1.5178081725118806, |
| "grad_norm": 0.8819468021392822, |
| "learning_rate": 1.2438544739429697e-05, |
| "loss": 0.8374, |
| "mean_token_accuracy": 0.7825239049270749, |
| "num_tokens": 974614176.0, |
| "step": 5570 |
| }, |
| { |
| "epoch": 1.5205334786829958, |
| "grad_norm": 0.956183671951294, |
| "learning_rate": 1.2368310155920775e-05, |
| "loss": 0.8585, |
| "mean_token_accuracy": 0.7770784150809049, |
| "num_tokens": 976368282.0, |
| "step": 5580 |
| }, |
| { |
| "epoch": 1.5232587848541108, |
| "grad_norm": 0.9200752377510071, |
| "learning_rate": 1.2298075572411855e-05, |
| "loss": 0.8466, |
| "mean_token_accuracy": 0.779388022981584, |
| "num_tokens": 978153249.0, |
| "step": 5590 |
| }, |
| { |
| "epoch": 1.5259840910252263, |
| "grad_norm": 0.8917084336280823, |
| "learning_rate": 1.2227840988902936e-05, |
| "loss": 0.9045, |
| "mean_token_accuracy": 0.7673480456694961, |
| "num_tokens": 979934251.0, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.5287093971963412, |
| "grad_norm": 0.9365094900131226, |
| "learning_rate": 1.2157606405394016e-05, |
| "loss": 0.8643, |
| "mean_token_accuracy": 0.7757984729483723, |
| "num_tokens": 981631256.0, |
| "step": 5610 |
| }, |
| { |
| "epoch": 1.5314347033674565, |
| "grad_norm": 0.9112612009048462, |
| "learning_rate": 1.2087371821885096e-05, |
| "loss": 0.8711, |
| "mean_token_accuracy": 0.7740321941673756, |
| "num_tokens": 983465168.0, |
| "step": 5620 |
| }, |
| { |
| "epoch": 1.5341600095385717, |
| "grad_norm": 0.9324068427085876, |
| "learning_rate": 1.2017137238376178e-05, |
| "loss": 0.8887, |
| "mean_token_accuracy": 0.7695561707019806, |
| "num_tokens": 985204925.0, |
| "step": 5630 |
| }, |
| { |
| "epoch": 1.5368853157096867, |
| "grad_norm": 0.9383065104484558, |
| "learning_rate": 1.1946902654867258e-05, |
| "loss": 0.8726, |
| "mean_token_accuracy": 0.7738953325897455, |
| "num_tokens": 986932491.0, |
| "step": 5640 |
| }, |
| { |
| "epoch": 1.5396106218808019, |
| "grad_norm": 0.9061076641082764, |
| "learning_rate": 1.1876668071358338e-05, |
| "loss": 0.8542, |
| "mean_token_accuracy": 0.7773743925616146, |
| "num_tokens": 988679321.0, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.542335928051917, |
| "grad_norm": 0.9603699445724487, |
| "learning_rate": 1.1806433487849418e-05, |
| "loss": 0.8454, |
| "mean_token_accuracy": 0.7790809465572238, |
| "num_tokens": 990434427.0, |
| "step": 5660 |
| }, |
| { |
| "epoch": 1.545061234223032, |
| "grad_norm": 0.999284565448761, |
| "learning_rate": 1.1736198904340499e-05, |
| "loss": 0.8582, |
| "mean_token_accuracy": 0.7775534087792039, |
| "num_tokens": 992247068.0, |
| "step": 5670 |
| }, |
| { |
| "epoch": 1.5477865403941475, |
| "grad_norm": 1.0335644483566284, |
| "learning_rate": 1.1665964320831579e-05, |
| "loss": 0.8453, |
| "mean_token_accuracy": 0.7802035246044398, |
| "num_tokens": 993933963.0, |
| "step": 5680 |
| }, |
| { |
| "epoch": 1.5505118465652625, |
| "grad_norm": 0.9061406254768372, |
| "learning_rate": 1.1595729737322657e-05, |
| "loss": 0.8702, |
| "mean_token_accuracy": 0.776280096359551, |
| "num_tokens": 995750639.0, |
| "step": 5690 |
| }, |
| { |
| "epoch": 1.5532371527363777, |
| "grad_norm": 0.8573745489120483, |
| "learning_rate": 1.1525495153813737e-05, |
| "loss": 0.872, |
| "mean_token_accuracy": 0.7745745237916708, |
| "num_tokens": 997519520.0, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.555962458907493, |
| "grad_norm": 0.9098120927810669, |
| "learning_rate": 1.1455260570304818e-05, |
| "loss": 0.8682, |
| "mean_token_accuracy": 0.7748268697410822, |
| "num_tokens": 999260798.0, |
| "step": 5710 |
| }, |
| { |
| "epoch": 1.558687765078608, |
| "grad_norm": 0.885702908039093, |
| "learning_rate": 1.1385025986795898e-05, |
| "loss": 0.8536, |
| "mean_token_accuracy": 0.7769698390737176, |
| "num_tokens": 1000992985.0, |
| "step": 5720 |
| }, |
| { |
| "epoch": 1.5614130712497232, |
| "grad_norm": 0.8968507051467896, |
| "learning_rate": 1.1314791403286978e-05, |
| "loss": 0.8635, |
| "mean_token_accuracy": 0.7772723453119397, |
| "num_tokens": 1002785337.0, |
| "step": 5730 |
| }, |
| { |
| "epoch": 1.5641383774208384, |
| "grad_norm": 0.8682663440704346, |
| "learning_rate": 1.124455681977806e-05, |
| "loss": 0.8564, |
| "mean_token_accuracy": 0.7786873020231724, |
| "num_tokens": 1004575506.0, |
| "step": 5740 |
| }, |
| { |
| "epoch": 1.5668636835919534, |
| "grad_norm": 0.807613730430603, |
| "learning_rate": 1.117432223626914e-05, |
| "loss": 0.8473, |
| "mean_token_accuracy": 0.779249276779592, |
| "num_tokens": 1006331741.0, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.5695889897630688, |
| "grad_norm": 0.8837918043136597, |
| "learning_rate": 1.110408765276022e-05, |
| "loss": 0.8441, |
| "mean_token_accuracy": 0.7799524381756783, |
| "num_tokens": 1008074608.0, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.5723142959341838, |
| "grad_norm": 0.8683249354362488, |
| "learning_rate": 1.10338530692513e-05, |
| "loss": 0.8568, |
| "mean_token_accuracy": 0.7762459529563784, |
| "num_tokens": 1009814759.0, |
| "step": 5770 |
| }, |
| { |
| "epoch": 1.575039602105299, |
| "grad_norm": 0.8559950590133667, |
| "learning_rate": 1.096361848574238e-05, |
| "loss": 0.8545, |
| "mean_token_accuracy": 0.7779862441122531, |
| "num_tokens": 1011600470.0, |
| "step": 5780 |
| }, |
| { |
| "epoch": 1.5777649082764142, |
| "grad_norm": 0.8832516074180603, |
| "learning_rate": 1.0893383902233461e-05, |
| "loss": 0.8569, |
| "mean_token_accuracy": 0.7762075675651431, |
| "num_tokens": 1013332790.0, |
| "step": 5790 |
| }, |
| { |
| "epoch": 1.5804902144475292, |
| "grad_norm": 0.8497148752212524, |
| "learning_rate": 1.0823149318724541e-05, |
| "loss": 0.8532, |
| "mean_token_accuracy": 0.7777036292478442, |
| "num_tokens": 1015056384.0, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.5832155206186445, |
| "grad_norm": 0.8826137185096741, |
| "learning_rate": 1.075291473521562e-05, |
| "loss": 0.9066, |
| "mean_token_accuracy": 0.7674546526744962, |
| "num_tokens": 1016805023.0, |
| "step": 5810 |
| }, |
| { |
| "epoch": 1.5859408267897597, |
| "grad_norm": 0.8723825812339783, |
| "learning_rate": 1.06826801517067e-05, |
| "loss": 0.8647, |
| "mean_token_accuracy": 0.7763038536533713, |
| "num_tokens": 1018560575.0, |
| "step": 5820 |
| }, |
| { |
| "epoch": 1.5886661329608747, |
| "grad_norm": 1.0496035814285278, |
| "learning_rate": 1.061244556819778e-05, |
| "loss": 0.8239, |
| "mean_token_accuracy": 0.7842363622039557, |
| "num_tokens": 1020332708.0, |
| "step": 5830 |
| }, |
| { |
| "epoch": 1.59139143913199, |
| "grad_norm": 1.0802680253982544, |
| "learning_rate": 1.0542210984688862e-05, |
| "loss": 0.8607, |
| "mean_token_accuracy": 0.7756680343300104, |
| "num_tokens": 1022116018.0, |
| "step": 5840 |
| }, |
| { |
| "epoch": 1.594116745303105, |
| "grad_norm": 0.8981698751449585, |
| "learning_rate": 1.0471976401179942e-05, |
| "loss": 0.8701, |
| "mean_token_accuracy": 0.7751441445201636, |
| "num_tokens": 1023879971.0, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.5968420514742203, |
| "grad_norm": 0.9268780946731567, |
| "learning_rate": 1.0401741817671022e-05, |
| "loss": 0.8668, |
| "mean_token_accuracy": 0.7751290230080485, |
| "num_tokens": 1025589404.0, |
| "step": 5860 |
| }, |
| { |
| "epoch": 1.5995673576453355, |
| "grad_norm": 0.9141002297401428, |
| "learning_rate": 1.0331507234162102e-05, |
| "loss": 0.8612, |
| "mean_token_accuracy": 0.7754600135609507, |
| "num_tokens": 1027272595.0, |
| "step": 5870 |
| }, |
| { |
| "epoch": 1.6022926638164505, |
| "grad_norm": 1.055737018585205, |
| "learning_rate": 1.0261272650653183e-05, |
| "loss": 0.9044, |
| "mean_token_accuracy": 0.7665908185765147, |
| "num_tokens": 1029018598.0, |
| "step": 5880 |
| }, |
| { |
| "epoch": 1.6050179699875657, |
| "grad_norm": 0.9040539264678955, |
| "learning_rate": 1.0191038067144263e-05, |
| "loss": 0.8717, |
| "mean_token_accuracy": 0.7734175708144904, |
| "num_tokens": 1030768423.0, |
| "step": 5890 |
| }, |
| { |
| "epoch": 1.607743276158681, |
| "grad_norm": 0.9513643980026245, |
| "learning_rate": 1.0120803483635343e-05, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.7750948579981923, |
| "num_tokens": 1032505045.0, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.610468582329796, |
| "grad_norm": 0.8642208576202393, |
| "learning_rate": 1.0050568900126423e-05, |
| "loss": 0.8665, |
| "mean_token_accuracy": 0.7742982119321823, |
| "num_tokens": 1034278282.0, |
| "step": 5910 |
| }, |
| { |
| "epoch": 1.6131938885009114, |
| "grad_norm": 1.039380669593811, |
| "learning_rate": 9.980334316617502e-06, |
| "loss": 0.8562, |
| "mean_token_accuracy": 0.7768575362861156, |
| "num_tokens": 1036001773.0, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.6159191946720264, |
| "grad_norm": 0.9230625629425049, |
| "learning_rate": 9.910099733108582e-06, |
| "loss": 0.8845, |
| "mean_token_accuracy": 0.7721848776564002, |
| "num_tokens": 1037697305.0, |
| "step": 5930 |
| }, |
| { |
| "epoch": 1.6186445008431416, |
| "grad_norm": 0.9427615404129028, |
| "learning_rate": 9.839865149599662e-06, |
| "loss": 0.8544, |
| "mean_token_accuracy": 0.7773636501282454, |
| "num_tokens": 1039459781.0, |
| "step": 5940 |
| }, |
| { |
| "epoch": 1.6213698070142568, |
| "grad_norm": 0.877819836139679, |
| "learning_rate": 9.769630566090744e-06, |
| "loss": 0.8794, |
| "mean_token_accuracy": 0.7725212635472417, |
| "num_tokens": 1041165017.0, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.6240951131853718, |
| "grad_norm": 0.9745622873306274, |
| "learning_rate": 9.699395982581824e-06, |
| "loss": 0.8472, |
| "mean_token_accuracy": 0.7793188545852899, |
| "num_tokens": 1042944243.0, |
| "step": 5960 |
| }, |
| { |
| "epoch": 1.6268204193564872, |
| "grad_norm": 0.877177894115448, |
| "learning_rate": 9.629161399072904e-06, |
| "loss": 0.8441, |
| "mean_token_accuracy": 0.7796382060274482, |
| "num_tokens": 1044677934.0, |
| "step": 5970 |
| }, |
| { |
| "epoch": 1.6295457255276022, |
| "grad_norm": 0.8958884477615356, |
| "learning_rate": 9.558926815563985e-06, |
| "loss": 0.8573, |
| "mean_token_accuracy": 0.7775826850906015, |
| "num_tokens": 1046454776.0, |
| "step": 5980 |
| }, |
| { |
| "epoch": 1.6322710316987175, |
| "grad_norm": 0.9246017336845398, |
| "learning_rate": 9.488692232055065e-06, |
| "loss": 0.8652, |
| "mean_token_accuracy": 0.7753489876165987, |
| "num_tokens": 1048212574.0, |
| "step": 5990 |
| }, |
| { |
| "epoch": 1.6349963378698327, |
| "grad_norm": 0.9042930006980896, |
| "learning_rate": 9.418457648546145e-06, |
| "loss": 0.8731, |
| "mean_token_accuracy": 0.7735361002385617, |
| "num_tokens": 1049922044.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.6377216440409477, |
| "grad_norm": 0.9657729864120483, |
| "learning_rate": 9.348223065037225e-06, |
| "loss": 0.8625, |
| "mean_token_accuracy": 0.775331400334835, |
| "num_tokens": 1051673079.0, |
| "step": 6010 |
| }, |
| { |
| "epoch": 1.6404469502120629, |
| "grad_norm": 0.9372441172599792, |
| "learning_rate": 9.277988481528305e-06, |
| "loss": 0.8553, |
| "mean_token_accuracy": 0.7770338580012321, |
| "num_tokens": 1053438839.0, |
| "step": 6020 |
| }, |
| { |
| "epoch": 1.643172256383178, |
| "grad_norm": 0.942888617515564, |
| "learning_rate": 9.207753898019384e-06, |
| "loss": 0.8574, |
| "mean_token_accuracy": 0.7775866745039821, |
| "num_tokens": 1055224077.0, |
| "step": 6030 |
| }, |
| { |
| "epoch": 1.645897562554293, |
| "grad_norm": 1.0009193420410156, |
| "learning_rate": 9.137519314510464e-06, |
| "loss": 0.8788, |
| "mean_token_accuracy": 0.7727638788521289, |
| "num_tokens": 1056980085.0, |
| "step": 6040 |
| }, |
| { |
| "epoch": 1.6486228687254085, |
| "grad_norm": 0.9073724746704102, |
| "learning_rate": 9.067284731001544e-06, |
| "loss": 0.8839, |
| "mean_token_accuracy": 0.7714164761826396, |
| "num_tokens": 1058709410.0, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.6513481748965235, |
| "grad_norm": 0.9918152093887329, |
| "learning_rate": 8.997050147492626e-06, |
| "loss": 0.8476, |
| "mean_token_accuracy": 0.7794443732127547, |
| "num_tokens": 1060458057.0, |
| "step": 6060 |
| }, |
| { |
| "epoch": 1.6540734810676387, |
| "grad_norm": 0.940454363822937, |
| "learning_rate": 8.926815563983706e-06, |
| "loss": 0.889, |
| "mean_token_accuracy": 0.7704324419610202, |
| "num_tokens": 1062204529.0, |
| "step": 6070 |
| }, |
| { |
| "epoch": 1.656798787238754, |
| "grad_norm": 0.8835647702217102, |
| "learning_rate": 8.856580980474787e-06, |
| "loss": 0.8678, |
| "mean_token_accuracy": 0.7757786093279719, |
| "num_tokens": 1063946466.0, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.659524093409869, |
| "grad_norm": 0.8910925388336182, |
| "learning_rate": 8.786346396965867e-06, |
| "loss": 0.8915, |
| "mean_token_accuracy": 0.768585343286395, |
| "num_tokens": 1065727075.0, |
| "step": 6090 |
| }, |
| { |
| "epoch": 1.6622493995809842, |
| "grad_norm": 0.9608586430549622, |
| "learning_rate": 8.716111813456947e-06, |
| "loss": 0.8838, |
| "mean_token_accuracy": 0.7712658075615764, |
| "num_tokens": 1067502000.0, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.6649747057520994, |
| "grad_norm": 0.8455779552459717, |
| "learning_rate": 8.645877229948027e-06, |
| "loss": 0.8038, |
| "mean_token_accuracy": 0.7884979326277971, |
| "num_tokens": 1069280628.0, |
| "step": 6110 |
| }, |
| { |
| "epoch": 1.6677000119232144, |
| "grad_norm": 0.9338540434837341, |
| "learning_rate": 8.575642646439107e-06, |
| "loss": 0.8633, |
| "mean_token_accuracy": 0.7757805565372109, |
| "num_tokens": 1071040088.0, |
| "step": 6120 |
| }, |
| { |
| "epoch": 1.6704253180943298, |
| "grad_norm": 0.8902707099914551, |
| "learning_rate": 8.505408062930188e-06, |
| "loss": 0.8512, |
| "mean_token_accuracy": 0.7785200117155909, |
| "num_tokens": 1072762221.0, |
| "step": 6130 |
| }, |
| { |
| "epoch": 1.6731506242654448, |
| "grad_norm": 0.9913966655731201, |
| "learning_rate": 8.435173479421268e-06, |
| "loss": 0.8752, |
| "mean_token_accuracy": 0.7754895342513919, |
| "num_tokens": 1074556423.0, |
| "step": 6140 |
| }, |
| { |
| "epoch": 1.67587593043656, |
| "grad_norm": 0.9583058953285217, |
| "learning_rate": 8.364938895912346e-06, |
| "loss": 0.8589, |
| "mean_token_accuracy": 0.7770748371258378, |
| "num_tokens": 1076338917.0, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.6786012366076752, |
| "grad_norm": 0.9399473667144775, |
| "learning_rate": 8.294704312403428e-06, |
| "loss": 0.8825, |
| "mean_token_accuracy": 0.771752736531198, |
| "num_tokens": 1077977290.0, |
| "step": 6160 |
| }, |
| { |
| "epoch": 1.6813265427787902, |
| "grad_norm": 0.9150588512420654, |
| "learning_rate": 8.224469728894508e-06, |
| "loss": 0.8601, |
| "mean_token_accuracy": 0.7763149598613381, |
| "num_tokens": 1079751546.0, |
| "step": 6170 |
| }, |
| { |
| "epoch": 1.6840518489499054, |
| "grad_norm": 0.9247446656227112, |
| "learning_rate": 8.154235145385589e-06, |
| "loss": 0.8925, |
| "mean_token_accuracy": 0.7686618639156222, |
| "num_tokens": 1081482161.0, |
| "step": 6180 |
| }, |
| { |
| "epoch": 1.6867771551210207, |
| "grad_norm": 0.9206864833831787, |
| "learning_rate": 8.084000561876669e-06, |
| "loss": 0.8884, |
| "mean_token_accuracy": 0.7724121443927288, |
| "num_tokens": 1083308646.0, |
| "step": 6190 |
| }, |
| { |
| "epoch": 1.6895024612921357, |
| "grad_norm": 0.9157947897911072, |
| "learning_rate": 8.013765978367749e-06, |
| "loss": 0.883, |
| "mean_token_accuracy": 0.7716514483094216, |
| "num_tokens": 1085049528.0, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.692227767463251, |
| "grad_norm": 0.9352182745933533, |
| "learning_rate": 7.94353139485883e-06, |
| "loss": 0.8566, |
| "mean_token_accuracy": 0.7775174669921399, |
| "num_tokens": 1086786072.0, |
| "step": 6210 |
| }, |
| { |
| "epoch": 1.694953073634366, |
| "grad_norm": 1.0252374410629272, |
| "learning_rate": 7.87329681134991e-06, |
| "loss": 0.8319, |
| "mean_token_accuracy": 0.7830239053815603, |
| "num_tokens": 1088520760.0, |
| "step": 6220 |
| }, |
| { |
| "epoch": 1.6976783798054813, |
| "grad_norm": 0.8791347742080688, |
| "learning_rate": 7.80306222784099e-06, |
| "loss": 0.8801, |
| "mean_token_accuracy": 0.7735577998682857, |
| "num_tokens": 1090282461.0, |
| "step": 6230 |
| }, |
| { |
| "epoch": 1.7004036859765965, |
| "grad_norm": 0.9212800860404968, |
| "learning_rate": 7.73282764433207e-06, |
| "loss": 0.9056, |
| "mean_token_accuracy": 0.7669639178551734, |
| "num_tokens": 1092006571.0, |
| "step": 6240 |
| }, |
| { |
| "epoch": 1.7031289921477115, |
| "grad_norm": 0.8965592980384827, |
| "learning_rate": 7.66259306082315e-06, |
| "loss": 0.917, |
| "mean_token_accuracy": 0.7657249065116047, |
| "num_tokens": 1093737561.0, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.7058542983188267, |
| "grad_norm": 0.8186634182929993, |
| "learning_rate": 7.592358477314229e-06, |
| "loss": 0.8797, |
| "mean_token_accuracy": 0.7729217361658811, |
| "num_tokens": 1095514719.0, |
| "step": 6260 |
| }, |
| { |
| "epoch": 1.708579604489942, |
| "grad_norm": 0.8526227474212646, |
| "learning_rate": 7.52212389380531e-06, |
| "loss": 0.9098, |
| "mean_token_accuracy": 0.766972879320383, |
| "num_tokens": 1097274026.0, |
| "step": 6270 |
| }, |
| { |
| "epoch": 1.711304910661057, |
| "grad_norm": 1.037373423576355, |
| "learning_rate": 7.4518893102963905e-06, |
| "loss": 0.897, |
| "mean_token_accuracy": 0.7689317859709263, |
| "num_tokens": 1099059973.0, |
| "step": 6280 |
| }, |
| { |
| "epoch": 1.7140302168321724, |
| "grad_norm": 0.9006183743476868, |
| "learning_rate": 7.381654726787471e-06, |
| "loss": 0.8578, |
| "mean_token_accuracy": 0.7776897365227342, |
| "num_tokens": 1100840633.0, |
| "step": 6290 |
| }, |
| { |
| "epoch": 1.7167555230032874, |
| "grad_norm": 0.9405822157859802, |
| "learning_rate": 7.311420143278551e-06, |
| "loss": 0.8964, |
| "mean_token_accuracy": 0.7690107244998217, |
| "num_tokens": 1102617553.0, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.7194808291744026, |
| "grad_norm": 1.0378305912017822, |
| "learning_rate": 7.241185559769631e-06, |
| "loss": 0.881, |
| "mean_token_accuracy": 0.7721499267965555, |
| "num_tokens": 1104327046.0, |
| "step": 6310 |
| }, |
| { |
| "epoch": 1.7222061353455178, |
| "grad_norm": 1.0442075729370117, |
| "learning_rate": 7.170950976260711e-06, |
| "loss": 0.8691, |
| "mean_token_accuracy": 0.7740305051207542, |
| "num_tokens": 1106098274.0, |
| "step": 6320 |
| }, |
| { |
| "epoch": 1.7249314415166328, |
| "grad_norm": 0.9894940853118896, |
| "learning_rate": 7.1007163927517915e-06, |
| "loss": 0.8469, |
| "mean_token_accuracy": 0.7795114817097784, |
| "num_tokens": 1107850821.0, |
| "step": 6330 |
| }, |
| { |
| "epoch": 1.727656747687748, |
| "grad_norm": 0.9134430289268494, |
| "learning_rate": 7.030481809242871e-06, |
| "loss": 0.8418, |
| "mean_token_accuracy": 0.7804009489715099, |
| "num_tokens": 1109562198.0, |
| "step": 6340 |
| }, |
| { |
| "epoch": 1.7303820538588632, |
| "grad_norm": 0.9074909687042236, |
| "learning_rate": 6.960247225733951e-06, |
| "loss": 0.876, |
| "mean_token_accuracy": 0.7732149386778474, |
| "num_tokens": 1111305835.0, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.7331073600299782, |
| "grad_norm": 0.9466888904571533, |
| "learning_rate": 6.890012642225031e-06, |
| "loss": 0.8678, |
| "mean_token_accuracy": 0.7753581205382943, |
| "num_tokens": 1113027716.0, |
| "step": 6360 |
| }, |
| { |
| "epoch": 1.7358326662010937, |
| "grad_norm": 0.9214980006217957, |
| "learning_rate": 6.819778058716113e-06, |
| "loss": 0.874, |
| "mean_token_accuracy": 0.7769591625779867, |
| "num_tokens": 1114828620.0, |
| "step": 6370 |
| }, |
| { |
| "epoch": 1.7385579723722087, |
| "grad_norm": 0.9098883867263794, |
| "learning_rate": 6.7495434752071925e-06, |
| "loss": 0.8554, |
| "mean_token_accuracy": 0.7776141613721848, |
| "num_tokens": 1116603274.0, |
| "step": 6380 |
| }, |
| { |
| "epoch": 1.7412832785433239, |
| "grad_norm": 0.9312921762466431, |
| "learning_rate": 6.679308891698273e-06, |
| "loss": 0.8694, |
| "mean_token_accuracy": 0.7738700607791543, |
| "num_tokens": 1118297922.0, |
| "step": 6390 |
| }, |
| { |
| "epoch": 1.744008584714439, |
| "grad_norm": 0.8718701004981995, |
| "learning_rate": 6.609074308189353e-06, |
| "loss": 0.8637, |
| "mean_token_accuracy": 0.77604665402323, |
| "num_tokens": 1120135157.0, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.746733890885554, |
| "grad_norm": 0.9903004765510559, |
| "learning_rate": 6.538839724680433e-06, |
| "loss": 0.8439, |
| "mean_token_accuracy": 0.780138636380434, |
| "num_tokens": 1121889958.0, |
| "step": 6410 |
| }, |
| { |
| "epoch": 1.7494591970566693, |
| "grad_norm": 0.9147149920463562, |
| "learning_rate": 6.468605141171513e-06, |
| "loss": 0.8757, |
| "mean_token_accuracy": 0.7742672517895699, |
| "num_tokens": 1123632536.0, |
| "step": 6420 |
| }, |
| { |
| "epoch": 1.7521845032277845, |
| "grad_norm": 0.9170229434967041, |
| "learning_rate": 6.3983705576625935e-06, |
| "loss": 0.8824, |
| "mean_token_accuracy": 0.7736239684745669, |
| "num_tokens": 1125380817.0, |
| "step": 6430 |
| }, |
| { |
| "epoch": 1.7549098093988995, |
| "grad_norm": 1.016421914100647, |
| "learning_rate": 6.328135974153674e-06, |
| "loss": 0.877, |
| "mean_token_accuracy": 0.7730204204097391, |
| "num_tokens": 1127183000.0, |
| "step": 6440 |
| }, |
| { |
| "epoch": 1.757635115570015, |
| "grad_norm": 0.8675819635391235, |
| "learning_rate": 6.257901390644753e-06, |
| "loss": 0.8755, |
| "mean_token_accuracy": 0.7724485624581575, |
| "num_tokens": 1128907056.0, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.76036042174113, |
| "grad_norm": 0.971829354763031, |
| "learning_rate": 6.187666807135834e-06, |
| "loss": 0.8484, |
| "mean_token_accuracy": 0.7802778273820877, |
| "num_tokens": 1130640406.0, |
| "step": 6460 |
| }, |
| { |
| "epoch": 1.7630857279122452, |
| "grad_norm": 0.9251874089241028, |
| "learning_rate": 6.117432223626914e-06, |
| "loss": 0.9032, |
| "mean_token_accuracy": 0.7674354026094079, |
| "num_tokens": 1132411436.0, |
| "step": 6470 |
| }, |
| { |
| "epoch": 1.7658110340833604, |
| "grad_norm": 0.8669341206550598, |
| "learning_rate": 6.0471976401179945e-06, |
| "loss": 0.8521, |
| "mean_token_accuracy": 0.7778038660064339, |
| "num_tokens": 1134162570.0, |
| "step": 6480 |
| }, |
| { |
| "epoch": 1.7685363402544754, |
| "grad_norm": 0.8731783032417297, |
| "learning_rate": 5.976963056609075e-06, |
| "loss": 0.8854, |
| "mean_token_accuracy": 0.7706002993509173, |
| "num_tokens": 1135933024.0, |
| "step": 6490 |
| }, |
| { |
| "epoch": 1.7712616464255906, |
| "grad_norm": 0.9514695405960083, |
| "learning_rate": 5.906728473100155e-06, |
| "loss": 0.8607, |
| "mean_token_accuracy": 0.7754776192829013, |
| "num_tokens": 1137647725.0, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.7739869525967058, |
| "grad_norm": 0.9821004867553711, |
| "learning_rate": 5.836493889591235e-06, |
| "loss": 0.8702, |
| "mean_token_accuracy": 0.7744058445096016, |
| "num_tokens": 1139436589.0, |
| "step": 6510 |
| }, |
| { |
| "epoch": 1.7767122587678208, |
| "grad_norm": 0.9453908801078796, |
| "learning_rate": 5.766259306082315e-06, |
| "loss": 0.8669, |
| "mean_token_accuracy": 0.7755882617086172, |
| "num_tokens": 1141121186.0, |
| "step": 6520 |
| }, |
| { |
| "epoch": 1.7794375649389362, |
| "grad_norm": 0.8392819166183472, |
| "learning_rate": 5.6960247225733954e-06, |
| "loss": 0.9023, |
| "mean_token_accuracy": 0.7675127379596234, |
| "num_tokens": 1142885840.0, |
| "step": 6530 |
| }, |
| { |
| "epoch": 1.7821628711100512, |
| "grad_norm": 0.8822066783905029, |
| "learning_rate": 5.625790139064476e-06, |
| "loss": 0.8896, |
| "mean_token_accuracy": 0.7709513738751411, |
| "num_tokens": 1144664551.0, |
| "step": 6540 |
| }, |
| { |
| "epoch": 1.7848881772811664, |
| "grad_norm": 0.9575950503349304, |
| "learning_rate": 5.555555555555556e-06, |
| "loss": 0.8875, |
| "mean_token_accuracy": 0.7716351070441305, |
| "num_tokens": 1146392624.0, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.7876134834522817, |
| "grad_norm": 0.967231035232544, |
| "learning_rate": 5.485320972046636e-06, |
| "loss": 0.8557, |
| "mean_token_accuracy": 0.7777371628209948, |
| "num_tokens": 1148186115.0, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.7903387896233967, |
| "grad_norm": 0.92610102891922, |
| "learning_rate": 5.415086388537716e-06, |
| "loss": 0.9029, |
| "mean_token_accuracy": 0.7684741642326116, |
| "num_tokens": 1149895675.0, |
| "step": 6570 |
| }, |
| { |
| "epoch": 1.7930640957945119, |
| "grad_norm": 1.0212879180908203, |
| "learning_rate": 5.344851805028796e-06, |
| "loss": 0.8798, |
| "mean_token_accuracy": 0.7727629562839866, |
| "num_tokens": 1151592671.0, |
| "step": 6580 |
| }, |
| { |
| "epoch": 1.795789401965627, |
| "grad_norm": 0.911811113357544, |
| "learning_rate": 5.274617221519877e-06, |
| "loss": 0.8485, |
| "mean_token_accuracy": 0.7795911753550172, |
| "num_tokens": 1153340225.0, |
| "step": 6590 |
| }, |
| { |
| "epoch": 1.798514708136742, |
| "grad_norm": 0.8629640936851501, |
| "learning_rate": 5.204382638010957e-06, |
| "loss": 0.8634, |
| "mean_token_accuracy": 0.7759083677083254, |
| "num_tokens": 1155119025.0, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.8012400143078575, |
| "grad_norm": 0.9127422571182251, |
| "learning_rate": 5.134148054502037e-06, |
| "loss": 0.8712, |
| "mean_token_accuracy": 0.7745040116831661, |
| "num_tokens": 1156885610.0, |
| "step": 6610 |
| }, |
| { |
| "epoch": 1.8039653204789725, |
| "grad_norm": 1.0696830749511719, |
| "learning_rate": 5.063913470993117e-06, |
| "loss": 0.8996, |
| "mean_token_accuracy": 0.7695049934089184, |
| "num_tokens": 1158614375.0, |
| "step": 6620 |
| }, |
| { |
| "epoch": 1.8066906266500877, |
| "grad_norm": 0.8334784507751465, |
| "learning_rate": 4.993678887484197e-06, |
| "loss": 0.838, |
| "mean_token_accuracy": 0.7819551320746541, |
| "num_tokens": 1160332773.0, |
| "step": 6630 |
| }, |
| { |
| "epoch": 1.809415932821203, |
| "grad_norm": 0.877504289150238, |
| "learning_rate": 4.923444303975278e-06, |
| "loss": 0.857, |
| "mean_token_accuracy": 0.7772863812744617, |
| "num_tokens": 1162043122.0, |
| "step": 6640 |
| }, |
| { |
| "epoch": 1.812141238992318, |
| "grad_norm": 0.8956847190856934, |
| "learning_rate": 4.853209720466358e-06, |
| "loss": 0.8509, |
| "mean_token_accuracy": 0.779183566942811, |
| "num_tokens": 1163783401.0, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.8148665451634332, |
| "grad_norm": 0.9233216643333435, |
| "learning_rate": 4.782975136957439e-06, |
| "loss": 0.8838, |
| "mean_token_accuracy": 0.7710191672667861, |
| "num_tokens": 1165546738.0, |
| "step": 6660 |
| }, |
| { |
| "epoch": 1.8175918513345484, |
| "grad_norm": 0.8691230416297913, |
| "learning_rate": 4.712740553448518e-06, |
| "loss": 0.831, |
| "mean_token_accuracy": 0.7832737671211362, |
| "num_tokens": 1167290698.0, |
| "step": 6670 |
| }, |
| { |
| "epoch": 1.8203171575056634, |
| "grad_norm": 0.8880638480186462, |
| "learning_rate": 4.642505969939598e-06, |
| "loss": 0.8744, |
| "mean_token_accuracy": 0.7720922984182834, |
| "num_tokens": 1169002358.0, |
| "step": 6680 |
| }, |
| { |
| "epoch": 1.8230424636767788, |
| "grad_norm": 0.8491354584693909, |
| "learning_rate": 4.5722713864306786e-06, |
| "loss": 0.8377, |
| "mean_token_accuracy": 0.7810861410573124, |
| "num_tokens": 1170783980.0, |
| "step": 6690 |
| }, |
| { |
| "epoch": 1.8257677698478938, |
| "grad_norm": 0.94153892993927, |
| "learning_rate": 4.502036802921759e-06, |
| "loss": 0.8868, |
| "mean_token_accuracy": 0.7701141970232129, |
| "num_tokens": 1172475127.0, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.828493076019009, |
| "grad_norm": 0.9762136340141296, |
| "learning_rate": 4.431802219412839e-06, |
| "loss": 0.8747, |
| "mean_token_accuracy": 0.7722874572500587, |
| "num_tokens": 1174214378.0, |
| "step": 6710 |
| }, |
| { |
| "epoch": 1.8312183821901242, |
| "grad_norm": 0.8815026879310608, |
| "learning_rate": 4.361567635903919e-06, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.7752756904810667, |
| "num_tokens": 1175912162.0, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.8339436883612392, |
| "grad_norm": 0.8567794561386108, |
| "learning_rate": 4.291333052394999e-06, |
| "loss": 0.8447, |
| "mean_token_accuracy": 0.780145507119596, |
| "num_tokens": 1177698845.0, |
| "step": 6730 |
| }, |
| { |
| "epoch": 1.8366689945323547, |
| "grad_norm": 0.8893793225288391, |
| "learning_rate": 4.2210984688860795e-06, |
| "loss": 0.8484, |
| "mean_token_accuracy": 0.7800832569599152, |
| "num_tokens": 1179524910.0, |
| "step": 6740 |
| }, |
| { |
| "epoch": 1.8393943007034697, |
| "grad_norm": 0.9771442413330078, |
| "learning_rate": 4.15086388537716e-06, |
| "loss": 0.8868, |
| "mean_token_accuracy": 0.7715147029608488, |
| "num_tokens": 1181252944.0, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.8421196068745849, |
| "grad_norm": 0.9260916113853455, |
| "learning_rate": 4.08062930186824e-06, |
| "loss": 0.862, |
| "mean_token_accuracy": 0.7749996662139893, |
| "num_tokens": 1182978187.0, |
| "step": 6760 |
| }, |
| { |
| "epoch": 1.8448449130457, |
| "grad_norm": 0.9010224938392639, |
| "learning_rate": 4.010394718359321e-06, |
| "loss": 0.8842, |
| "mean_token_accuracy": 0.770787863433361, |
| "num_tokens": 1184646601.0, |
| "step": 6770 |
| }, |
| { |
| "epoch": 1.847570219216815, |
| "grad_norm": 0.8967046737670898, |
| "learning_rate": 3.9401601348504e-06, |
| "loss": 0.851, |
| "mean_token_accuracy": 0.7784305892884731, |
| "num_tokens": 1186432758.0, |
| "step": 6780 |
| }, |
| { |
| "epoch": 1.8502955253879303, |
| "grad_norm": 0.8852325677871704, |
| "learning_rate": 3.8699255513414805e-06, |
| "loss": 0.8906, |
| "mean_token_accuracy": 0.7694688705727458, |
| "num_tokens": 1188172544.0, |
| "step": 6790 |
| }, |
| { |
| "epoch": 1.8530208315590455, |
| "grad_norm": 0.952947199344635, |
| "learning_rate": 3.7996909678325607e-06, |
| "loss": 0.8702, |
| "mean_token_accuracy": 0.7760049263946712, |
| "num_tokens": 1189891287.0, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.8557461377301605, |
| "grad_norm": 0.9413578510284424, |
| "learning_rate": 3.729456384323641e-06, |
| "loss": 0.8334, |
| "mean_token_accuracy": 0.7797158444300294, |
| "num_tokens": 1191622273.0, |
| "step": 6810 |
| }, |
| { |
| "epoch": 1.858471443901276, |
| "grad_norm": 0.9108753800392151, |
| "learning_rate": 3.6592218008147215e-06, |
| "loss": 0.8845, |
| "mean_token_accuracy": 0.7724293851293623, |
| "num_tokens": 1193460373.0, |
| "step": 6820 |
| }, |
| { |
| "epoch": 1.861196750072391, |
| "grad_norm": 0.9215091466903687, |
| "learning_rate": 3.5889872173058017e-06, |
| "loss": 0.8769, |
| "mean_token_accuracy": 0.7731356274336576, |
| "num_tokens": 1195156956.0, |
| "step": 6830 |
| }, |
| { |
| "epoch": 1.8639220562435062, |
| "grad_norm": 0.9173537492752075, |
| "learning_rate": 3.518752633796882e-06, |
| "loss": 0.873, |
| "mean_token_accuracy": 0.7745907133445143, |
| "num_tokens": 1196889114.0, |
| "step": 6840 |
| }, |
| { |
| "epoch": 1.8666473624146214, |
| "grad_norm": 0.8616206049919128, |
| "learning_rate": 3.4485180502879617e-06, |
| "loss": 0.8661, |
| "mean_token_accuracy": 0.7771028708666563, |
| "num_tokens": 1198676992.0, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.8693726685857364, |
| "grad_norm": 0.8219782114028931, |
| "learning_rate": 3.378283466779042e-06, |
| "loss": 0.8645, |
| "mean_token_accuracy": 0.7753412149846554, |
| "num_tokens": 1200444870.0, |
| "step": 6860 |
| }, |
| { |
| "epoch": 1.8720979747568516, |
| "grad_norm": 0.9641156196594238, |
| "learning_rate": 3.308048883270122e-06, |
| "loss": 0.8613, |
| "mean_token_accuracy": 0.77578400131315, |
| "num_tokens": 1202174597.0, |
| "step": 6870 |
| }, |
| { |
| "epoch": 1.8748232809279668, |
| "grad_norm": 0.9042788743972778, |
| "learning_rate": 3.2378142997612027e-06, |
| "loss": 0.863, |
| "mean_token_accuracy": 0.7763865817338228, |
| "num_tokens": 1203947194.0, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.8775485870990818, |
| "grad_norm": 0.870700478553772, |
| "learning_rate": 3.167579716252283e-06, |
| "loss": 0.866, |
| "mean_token_accuracy": 0.77534787543118, |
| "num_tokens": 1205696008.0, |
| "step": 6890 |
| }, |
| { |
| "epoch": 1.8802738932701972, |
| "grad_norm": 0.8512738347053528, |
| "learning_rate": 3.097345132743363e-06, |
| "loss": 0.8365, |
| "mean_token_accuracy": 0.7811076069250703, |
| "num_tokens": 1207477394.0, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.8829991994413122, |
| "grad_norm": 0.8965963125228882, |
| "learning_rate": 3.0271105492344433e-06, |
| "loss": 0.8903, |
| "mean_token_accuracy": 0.7708822388201952, |
| "num_tokens": 1209268585.0, |
| "step": 6910 |
| }, |
| { |
| "epoch": 1.8857245056124274, |
| "grad_norm": 0.9546549916267395, |
| "learning_rate": 2.9568759657255235e-06, |
| "loss": 0.8893, |
| "mean_token_accuracy": 0.772000799421221, |
| "num_tokens": 1211062266.0, |
| "step": 6920 |
| }, |
| { |
| "epoch": 1.8884498117835427, |
| "grad_norm": 0.9379769563674927, |
| "learning_rate": 2.8866413822166037e-06, |
| "loss": 0.8594, |
| "mean_token_accuracy": 0.7770233813673257, |
| "num_tokens": 1212830493.0, |
| "step": 6930 |
| }, |
| { |
| "epoch": 1.8911751179546576, |
| "grad_norm": 0.9216699004173279, |
| "learning_rate": 2.816406798707684e-06, |
| "loss": 0.8543, |
| "mean_token_accuracy": 0.7784260775893926, |
| "num_tokens": 1214570541.0, |
| "step": 6940 |
| }, |
| { |
| "epoch": 1.8939004241257729, |
| "grad_norm": 0.9036583304405212, |
| "learning_rate": 2.746172215198764e-06, |
| "loss": 0.8621, |
| "mean_token_accuracy": 0.7763115083798766, |
| "num_tokens": 1216358869.0, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.896625730296888, |
| "grad_norm": 0.9491548538208008, |
| "learning_rate": 2.6759376316898443e-06, |
| "loss": 0.8912, |
| "mean_token_accuracy": 0.7708828750997782, |
| "num_tokens": 1218169173.0, |
| "step": 6960 |
| }, |
| { |
| "epoch": 1.899351036468003, |
| "grad_norm": 1.0159947872161865, |
| "learning_rate": 2.6057030481809245e-06, |
| "loss": 0.8817, |
| "mean_token_accuracy": 0.7712484735995531, |
| "num_tokens": 1219932812.0, |
| "step": 6970 |
| }, |
| { |
| "epoch": 1.9020763426391185, |
| "grad_norm": 0.9909380078315735, |
| "learning_rate": 2.5354684646720047e-06, |
| "loss": 0.85, |
| "mean_token_accuracy": 0.7789081564173103, |
| "num_tokens": 1221698971.0, |
| "step": 6980 |
| }, |
| { |
| "epoch": 1.9048016488102335, |
| "grad_norm": 0.9468783736228943, |
| "learning_rate": 2.465233881163085e-06, |
| "loss": 0.8608, |
| "mean_token_accuracy": 0.7773719995282591, |
| "num_tokens": 1223474752.0, |
| "step": 6990 |
| }, |
| { |
| "epoch": 1.9075269549813487, |
| "grad_norm": 0.8256745934486389, |
| "learning_rate": 2.394999297654165e-06, |
| "loss": 0.8693, |
| "mean_token_accuracy": 0.774550568126142, |
| "num_tokens": 1225205018.0, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.910252261152464, |
| "grad_norm": 0.9096000790596008, |
| "learning_rate": 2.3247647141452453e-06, |
| "loss": 0.8615, |
| "mean_token_accuracy": 0.7756875747814774, |
| "num_tokens": 1226976554.0, |
| "step": 7010 |
| }, |
| { |
| "epoch": 1.912977567323579, |
| "grad_norm": 1.029605507850647, |
| "learning_rate": 2.2545301306363255e-06, |
| "loss": 0.8364, |
| "mean_token_accuracy": 0.781515441648662, |
| "num_tokens": 1228681473.0, |
| "step": 7020 |
| }, |
| { |
| "epoch": 1.9157028734946941, |
| "grad_norm": 0.8915464878082275, |
| "learning_rate": 2.1842955471274057e-06, |
| "loss": 0.8406, |
| "mean_token_accuracy": 0.7821012154221535, |
| "num_tokens": 1230386311.0, |
| "step": 7030 |
| }, |
| { |
| "epoch": 1.9184281796658094, |
| "grad_norm": 0.9511204957962036, |
| "learning_rate": 2.114060963618486e-06, |
| "loss": 0.8683, |
| "mean_token_accuracy": 0.7758362222462892, |
| "num_tokens": 1232181485.0, |
| "step": 7040 |
| }, |
| { |
| "epoch": 1.9211534858369244, |
| "grad_norm": 1.0507603883743286, |
| "learning_rate": 2.043826380109566e-06, |
| "loss": 0.8534, |
| "mean_token_accuracy": 0.7784864647313953, |
| "num_tokens": 1233900223.0, |
| "step": 7050 |
| }, |
| { |
| "epoch": 1.9238787920080398, |
| "grad_norm": 0.9987245202064514, |
| "learning_rate": 1.9735917966006462e-06, |
| "loss": 0.8993, |
| "mean_token_accuracy": 0.7673255430534482, |
| "num_tokens": 1235661434.0, |
| "step": 7060 |
| }, |
| { |
| "epoch": 1.9266040981791548, |
| "grad_norm": 0.9190322756767273, |
| "learning_rate": 1.9033572130917266e-06, |
| "loss": 0.8374, |
| "mean_token_accuracy": 0.7809090284630656, |
| "num_tokens": 1237360867.0, |
| "step": 7070 |
| }, |
| { |
| "epoch": 1.92932940435027, |
| "grad_norm": 0.9413881897926331, |
| "learning_rate": 1.8331226295828066e-06, |
| "loss": 0.8812, |
| "mean_token_accuracy": 0.771918954141438, |
| "num_tokens": 1239085380.0, |
| "step": 7080 |
| }, |
| { |
| "epoch": 1.9320547105213852, |
| "grad_norm": 0.8746035695075989, |
| "learning_rate": 1.7628880460738868e-06, |
| "loss": 0.88, |
| "mean_token_accuracy": 0.772631255723536, |
| "num_tokens": 1240747584.0, |
| "step": 7090 |
| }, |
| { |
| "epoch": 1.9347800166925002, |
| "grad_norm": 0.8726754188537598, |
| "learning_rate": 1.6926534625649672e-06, |
| "loss": 0.8665, |
| "mean_token_accuracy": 0.7746797952800989, |
| "num_tokens": 1242486706.0, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.9375053228636154, |
| "grad_norm": 0.9137101173400879, |
| "learning_rate": 1.6224188790560472e-06, |
| "loss": 0.8681, |
| "mean_token_accuracy": 0.7750147501006722, |
| "num_tokens": 1244248596.0, |
| "step": 7110 |
| }, |
| { |
| "epoch": 1.9402306290347306, |
| "grad_norm": 1.004635214805603, |
| "learning_rate": 1.5521842955471274e-06, |
| "loss": 0.8766, |
| "mean_token_accuracy": 0.773098181374371, |
| "num_tokens": 1246027714.0, |
| "step": 7120 |
| }, |
| { |
| "epoch": 1.9429559352058456, |
| "grad_norm": 0.9150519967079163, |
| "learning_rate": 1.4819497120382078e-06, |
| "loss": 0.8846, |
| "mean_token_accuracy": 0.7700993655249476, |
| "num_tokens": 1247810720.0, |
| "step": 7130 |
| }, |
| { |
| "epoch": 1.945681241376961, |
| "grad_norm": 0.9656493067741394, |
| "learning_rate": 1.4117151285292878e-06, |
| "loss": 0.8351, |
| "mean_token_accuracy": 0.7827043356373906, |
| "num_tokens": 1249602431.0, |
| "step": 7140 |
| }, |
| { |
| "epoch": 1.948406547548076, |
| "grad_norm": 0.8341893553733826, |
| "learning_rate": 1.341480545020368e-06, |
| "loss": 0.8406, |
| "mean_token_accuracy": 0.7799601985141635, |
| "num_tokens": 1251347854.0, |
| "step": 7150 |
| }, |
| { |
| "epoch": 1.9511318537191913, |
| "grad_norm": 0.9483367800712585, |
| "learning_rate": 1.2712459615114484e-06, |
| "loss": 0.865, |
| "mean_token_accuracy": 0.7757392497733235, |
| "num_tokens": 1253090972.0, |
| "step": 7160 |
| }, |
| { |
| "epoch": 1.9538571598903065, |
| "grad_norm": 0.8367253541946411, |
| "learning_rate": 1.2010113780025286e-06, |
| "loss": 0.8769, |
| "mean_token_accuracy": 0.7753980663605035, |
| "num_tokens": 1254882362.0, |
| "step": 7170 |
| }, |
| { |
| "epoch": 1.9565824660614215, |
| "grad_norm": 0.9163729548454285, |
| "learning_rate": 1.1307767944936086e-06, |
| "loss": 0.8497, |
| "mean_token_accuracy": 0.7781329112127423, |
| "num_tokens": 1256581748.0, |
| "step": 7180 |
| }, |
| { |
| "epoch": 1.9593077722325367, |
| "grad_norm": 0.8588080406188965, |
| "learning_rate": 1.0605422109846888e-06, |
| "loss": 0.8867, |
| "mean_token_accuracy": 0.7706562045961618, |
| "num_tokens": 1258375314.0, |
| "step": 7190 |
| }, |
| { |
| "epoch": 1.962033078403652, |
| "grad_norm": 1.0159885883331299, |
| "learning_rate": 9.903076274757692e-07, |
| "loss": 0.8579, |
| "mean_token_accuracy": 0.7774625174701214, |
| "num_tokens": 1260134496.0, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.964758384574767, |
| "grad_norm": 0.9101456999778748, |
| "learning_rate": 9.200730439668494e-07, |
| "loss": 0.8651, |
| "mean_token_accuracy": 0.7751043004915118, |
| "num_tokens": 1261889077.0, |
| "step": 7210 |
| }, |
| { |
| "epoch": 1.9674836907458824, |
| "grad_norm": 0.8841215372085571, |
| "learning_rate": 8.498384604579295e-07, |
| "loss": 0.8584, |
| "mean_token_accuracy": 0.7764962373301387, |
| "num_tokens": 1263571459.0, |
| "step": 7220 |
| }, |
| { |
| "epoch": 1.9702089969169974, |
| "grad_norm": 0.9193898439407349, |
| "learning_rate": 7.796038769490097e-07, |
| "loss": 0.8539, |
| "mean_token_accuracy": 0.7779923478141427, |
| "num_tokens": 1265309020.0, |
| "step": 7230 |
| }, |
| { |
| "epoch": 1.9729343030881126, |
| "grad_norm": 0.918035089969635, |
| "learning_rate": 7.0936929344009e-07, |
| "loss": 0.8548, |
| "mean_token_accuracy": 0.7777844806201756, |
| "num_tokens": 1267094519.0, |
| "step": 7240 |
| }, |
| { |
| "epoch": 1.9756596092592278, |
| "grad_norm": 0.925105094909668, |
| "learning_rate": 6.391347099311701e-07, |
| "loss": 0.875, |
| "mean_token_accuracy": 0.7743610493838787, |
| "num_tokens": 1268787322.0, |
| "step": 7250 |
| }, |
| { |
| "epoch": 1.9783849154303428, |
| "grad_norm": 0.8597863912582397, |
| "learning_rate": 5.689001264222504e-07, |
| "loss": 0.8356, |
| "mean_token_accuracy": 0.7817079512402415, |
| "num_tokens": 1270503633.0, |
| "step": 7260 |
| }, |
| { |
| "epoch": 1.981110221601458, |
| "grad_norm": 0.9246379137039185, |
| "learning_rate": 4.986655429133305e-07, |
| "loss": 0.8402, |
| "mean_token_accuracy": 0.7805968597531319, |
| "num_tokens": 1272220571.0, |
| "step": 7270 |
| }, |
| { |
| "epoch": 1.9838355277725732, |
| "grad_norm": 0.9555491805076599, |
| "learning_rate": 4.2843095940441076e-07, |
| "loss": 0.9061, |
| "mean_token_accuracy": 0.7672311440110207, |
| "num_tokens": 1274007267.0, |
| "step": 7280 |
| }, |
| { |
| "epoch": 1.9865608339436882, |
| "grad_norm": 0.9872004389762878, |
| "learning_rate": 3.5819637589549096e-07, |
| "loss": 0.869, |
| "mean_token_accuracy": 0.7746486615389585, |
| "num_tokens": 1275737524.0, |
| "step": 7290 |
| }, |
| { |
| "epoch": 1.9892861401148036, |
| "grad_norm": 0.8863142728805542, |
| "learning_rate": 2.8796179238657115e-07, |
| "loss": 0.8716, |
| "mean_token_accuracy": 0.7743620758876204, |
| "num_tokens": 1277504467.0, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.9920114462859186, |
| "grad_norm": 0.8783524632453918, |
| "learning_rate": 2.1772720887765138e-07, |
| "loss": 0.836, |
| "mean_token_accuracy": 0.7821097403764725, |
| "num_tokens": 1279280881.0, |
| "step": 7310 |
| }, |
| { |
| "epoch": 1.9947367524570339, |
| "grad_norm": 0.9120770692825317, |
| "learning_rate": 1.4749262536873157e-07, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.774507796112448, |
| "num_tokens": 1280999476.0, |
| "step": 7320 |
| }, |
| { |
| "epoch": 1.997462058628149, |
| "grad_norm": 0.9199837446212769, |
| "learning_rate": 7.725804185981177e-08, |
| "loss": 0.8455, |
| "mean_token_accuracy": 0.7805630661547184, |
| "num_tokens": 1282784410.0, |
| "step": 7330 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.5059590339660645, |
| "learning_rate": 7.023458350891979e-09, |
| "loss": 0.8179, |
| "mean_token_accuracy": 0.772105351190439, |
| "num_tokens": 1284418792.0, |
| "step": 7340 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 7340, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500.0, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.2214594505753625e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|