| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5571030640668524, |
| "eval_steps": 5000, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0011142061281337048, |
| "grad_norm": 526387360.0, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 8.3558, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0022284122562674096, |
| "grad_norm": 10082223104.0, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 9.7414, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.003342618384401114, |
| "grad_norm": 394851360.0, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 8.3657, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.004456824512534819, |
| "grad_norm": 550555200.0, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 7.7018, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005571030640668524, |
| "grad_norm": 6702706176.0, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 7.8417, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.006685236768802228, |
| "grad_norm": 984629184.0, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 7.2844, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007799442896935933, |
| "grad_norm": 136705808.0, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 6.072, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008913649025069638, |
| "grad_norm": 2020311040.0, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 6.2002, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.010027855153203343, |
| "grad_norm": 284398432.0, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 5.9848, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.011142061281337047, |
| "grad_norm": 101863704.0, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 5.6746, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012256267409470752, |
| "grad_norm": 289118624.0, |
| "learning_rate": 2.2e-06, |
| "loss": 4.8833, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.013370473537604457, |
| "grad_norm": 1703932416.0, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 5.7605, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.014484679665738161, |
| "grad_norm": 2717028864.0, |
| "learning_rate": 2.6e-06, |
| "loss": 5.2174, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.015598885793871866, |
| "grad_norm": 73946976.0, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 4.5718, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.016713091922005572, |
| "grad_norm": 19827058.0, |
| "learning_rate": 3e-06, |
| "loss": 3.9569, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.017827298050139277, |
| "grad_norm": 41642140.0, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 3.0831, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.01894150417827298, |
| "grad_norm": 29684964.0, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 3.0789, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.020055710306406686, |
| "grad_norm": 23762328.0, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 2.6555, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.02116991643454039, |
| "grad_norm": 9484590.0, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 2.3382, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.022284122562674095, |
| "grad_norm": 69793928.0, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 2.2238, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0233983286908078, |
| "grad_norm": 117296832.0, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 2.1228, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.024512534818941504, |
| "grad_norm": 12182392.0, |
| "learning_rate": 4.4e-06, |
| "loss": 1.9407, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.02562674094707521, |
| "grad_norm": 16288649.0, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 1.837, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.026740947075208913, |
| "grad_norm": 3740553216.0, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 1.7806, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.027855153203342618, |
| "grad_norm": 16179237.0, |
| "learning_rate": 5e-06, |
| "loss": 2.3418, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.028969359331476322, |
| "grad_norm": 3983458.25, |
| "learning_rate": 5.2e-06, |
| "loss": 1.4532, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.030083565459610027, |
| "grad_norm": 41428304.0, |
| "learning_rate": 5.400000000000001e-06, |
| "loss": 1.3844, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.03119777158774373, |
| "grad_norm": 11067015.0, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 1.218, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.03231197771587744, |
| "grad_norm": 3763499.0, |
| "learning_rate": 5.8e-06, |
| "loss": 1.0569, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.033426183844011144, |
| "grad_norm": 3374811.25, |
| "learning_rate": 6e-06, |
| "loss": 0.9546, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03454038997214485, |
| "grad_norm": 956919.375, |
| "learning_rate": 6.200000000000001e-06, |
| "loss": 0.7927, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.03565459610027855, |
| "grad_norm": 3694079.75, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 0.8144, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.03676880222841226, |
| "grad_norm": 11441138.0, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 0.7171, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.03788300835654596, |
| "grad_norm": 3600018.75, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 0.6414, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.03899721448467967, |
| "grad_norm": 2904519.75, |
| "learning_rate": 7e-06, |
| "loss": 0.6656, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.04011142061281337, |
| "grad_norm": 508876.46875, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 0.7313, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.041225626740947076, |
| "grad_norm": 1115024.25, |
| "learning_rate": 7.4e-06, |
| "loss": 0.6118, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.04233983286908078, |
| "grad_norm": 705881.875, |
| "learning_rate": 7.600000000000001e-06, |
| "loss": 0.576, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.043454038997214485, |
| "grad_norm": 1243545.875, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 0.6819, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.04456824512534819, |
| "grad_norm": 403263.125, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.6145, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.045682451253481894, |
| "grad_norm": 604421.6875, |
| "learning_rate": 8.2e-06, |
| "loss": 0.5955, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0467966573816156, |
| "grad_norm": 659902.5, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 0.5932, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.0479108635097493, |
| "grad_norm": 636760.1875, |
| "learning_rate": 8.6e-06, |
| "loss": 0.5849, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.04902506963788301, |
| "grad_norm": 657968.4375, |
| "learning_rate": 8.8e-06, |
| "loss": 0.5557, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.05013927576601671, |
| "grad_norm": 345233.0, |
| "learning_rate": 9e-06, |
| "loss": 0.5214, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05125348189415042, |
| "grad_norm": 6643789.5, |
| "learning_rate": 9.200000000000002e-06, |
| "loss": 0.5415, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.05236768802228412, |
| "grad_norm": 260660.28125, |
| "learning_rate": 9.4e-06, |
| "loss": 0.598, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.053481894150417826, |
| "grad_norm": 337898.15625, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 0.5329, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.05459610027855153, |
| "grad_norm": 262814.03125, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 0.5255, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.055710306406685235, |
| "grad_norm": 378706.5625, |
| "learning_rate": 1e-05, |
| "loss": 0.5261, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05682451253481894, |
| "grad_norm": 405478.5, |
| "learning_rate": 9.988200589970503e-06, |
| "loss": 0.5366, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.057938718662952644, |
| "grad_norm": 299159.59375, |
| "learning_rate": 9.976401179941004e-06, |
| "loss": 0.5714, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.05905292479108635, |
| "grad_norm": 353445.59375, |
| "learning_rate": 9.964601769911504e-06, |
| "loss": 0.4717, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.06016713091922005, |
| "grad_norm": 320303.59375, |
| "learning_rate": 9.952802359882007e-06, |
| "loss": 0.5686, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.06128133704735376, |
| "grad_norm": 233915.875, |
| "learning_rate": 9.941002949852509e-06, |
| "loss": 0.6282, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.06239554317548746, |
| "grad_norm": 300448.0625, |
| "learning_rate": 9.92920353982301e-06, |
| "loss": 0.4815, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.06350974930362117, |
| "grad_norm": 283011.96875, |
| "learning_rate": 9.917404129793512e-06, |
| "loss": 0.5632, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.06462395543175488, |
| "grad_norm": 695608.625, |
| "learning_rate": 9.905604719764012e-06, |
| "loss": 0.4856, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.06573816155988858, |
| "grad_norm": 216388.484375, |
| "learning_rate": 9.893805309734514e-06, |
| "loss": 0.5146, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.06685236768802229, |
| "grad_norm": 235648.84375, |
| "learning_rate": 9.882005899705015e-06, |
| "loss": 0.508, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.06796657381615599, |
| "grad_norm": 271049.1875, |
| "learning_rate": 9.870206489675517e-06, |
| "loss": 0.5011, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.0690807799442897, |
| "grad_norm": 228041.625, |
| "learning_rate": 9.858407079646018e-06, |
| "loss": 0.486, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.0701949860724234, |
| "grad_norm": 217752.53125, |
| "learning_rate": 9.84660766961652e-06, |
| "loss": 0.5339, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.0713091922005571, |
| "grad_norm": 44443688.0, |
| "learning_rate": 9.83480825958702e-06, |
| "loss": 0.4714, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.07242339832869081, |
| "grad_norm": 222843.84375, |
| "learning_rate": 9.823008849557523e-06, |
| "loss": 0.4784, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.07353760445682452, |
| "grad_norm": 214836.625, |
| "learning_rate": 9.811209439528024e-06, |
| "loss": 0.5541, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.07465181058495822, |
| "grad_norm": 358109.9375, |
| "learning_rate": 9.799410029498526e-06, |
| "loss": 0.5086, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.07576601671309192, |
| "grad_norm": 218794.015625, |
| "learning_rate": 9.787610619469026e-06, |
| "loss": 0.5288, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.07688022284122563, |
| "grad_norm": 233655.0, |
| "learning_rate": 9.775811209439529e-06, |
| "loss": 0.4452, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.07799442896935933, |
| "grad_norm": 314280.78125, |
| "learning_rate": 9.764011799410031e-06, |
| "loss": 0.5537, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.07910863509749304, |
| "grad_norm": 256511.78125, |
| "learning_rate": 9.752212389380532e-06, |
| "loss": 0.4991, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.08022284122562674, |
| "grad_norm": 227226.875, |
| "learning_rate": 9.740412979351032e-06, |
| "loss": 0.5703, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.08133704735376045, |
| "grad_norm": 9358421.0, |
| "learning_rate": 9.728613569321534e-06, |
| "loss": 0.5663, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.08245125348189415, |
| "grad_norm": 179491.125, |
| "learning_rate": 9.716814159292037e-06, |
| "loss": 0.4795, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.08356545961002786, |
| "grad_norm": 247549.875, |
| "learning_rate": 9.705014749262537e-06, |
| "loss": 0.4769, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.08467966573816156, |
| "grad_norm": 237436.28125, |
| "learning_rate": 9.693215339233038e-06, |
| "loss": 0.5177, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.08579387186629527, |
| "grad_norm": 201981.609375, |
| "learning_rate": 9.68141592920354e-06, |
| "loss": 0.4623, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.08690807799442897, |
| "grad_norm": 260105.90625, |
| "learning_rate": 9.669616519174042e-06, |
| "loss": 0.4948, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.08802228412256267, |
| "grad_norm": 205071.875, |
| "learning_rate": 9.657817109144543e-06, |
| "loss": 0.4359, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.08913649025069638, |
| "grad_norm": 223004.84375, |
| "learning_rate": 9.646017699115045e-06, |
| "loss": 0.5101, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.09025069637883008, |
| "grad_norm": 258591.375, |
| "learning_rate": 9.634218289085546e-06, |
| "loss": 0.4504, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.09136490250696379, |
| "grad_norm": 1028943.1875, |
| "learning_rate": 9.622418879056048e-06, |
| "loss": 0.4848, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.09247910863509749, |
| "grad_norm": 173033.28125, |
| "learning_rate": 9.61061946902655e-06, |
| "loss": 0.467, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.0935933147632312, |
| "grad_norm": 404133.6875, |
| "learning_rate": 9.598820058997051e-06, |
| "loss": 0.4951, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.0947075208913649, |
| "grad_norm": 189107.265625, |
| "learning_rate": 9.587020648967552e-06, |
| "loss": 0.4616, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.0958217270194986, |
| "grad_norm": 239286.09375, |
| "learning_rate": 9.575221238938054e-06, |
| "loss": 0.4294, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.09693593314763231, |
| "grad_norm": 240974.71875, |
| "learning_rate": 9.563421828908556e-06, |
| "loss": 0.5314, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.09805013927576602, |
| "grad_norm": 197173.859375, |
| "learning_rate": 9.551622418879057e-06, |
| "loss": 0.4884, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.09916434540389972, |
| "grad_norm": 202271.03125, |
| "learning_rate": 9.539823008849557e-06, |
| "loss": 0.509, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.10027855153203342, |
| "grad_norm": 16256707.0, |
| "learning_rate": 9.52802359882006e-06, |
| "loss": 0.5174, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.10139275766016713, |
| "grad_norm": 243519.046875, |
| "learning_rate": 9.516224188790562e-06, |
| "loss": 0.4546, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.10250696378830083, |
| "grad_norm": 250017.1875, |
| "learning_rate": 9.504424778761062e-06, |
| "loss": 0.4463, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.10362116991643454, |
| "grad_norm": 213143.515625, |
| "learning_rate": 9.492625368731565e-06, |
| "loss": 0.4715, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.10473537604456824, |
| "grad_norm": 195005.078125, |
| "learning_rate": 9.480825958702065e-06, |
| "loss": 0.4921, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.10584958217270195, |
| "grad_norm": 256061.859375, |
| "learning_rate": 9.469026548672568e-06, |
| "loss": 0.5129, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.10696378830083565, |
| "grad_norm": 223323.75, |
| "learning_rate": 9.457227138643068e-06, |
| "loss": 0.4535, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.10807799442896936, |
| "grad_norm": 167112.203125, |
| "learning_rate": 9.44542772861357e-06, |
| "loss": 0.4698, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.10919220055710306, |
| "grad_norm": 223777.640625, |
| "learning_rate": 9.433628318584071e-06, |
| "loss": 0.4681, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.11030640668523677, |
| "grad_norm": 606988.4375, |
| "learning_rate": 9.421828908554573e-06, |
| "loss": 0.5525, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.11142061281337047, |
| "grad_norm": 3532775.25, |
| "learning_rate": 9.410029498525074e-06, |
| "loss": 0.5028, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11253481894150417, |
| "grad_norm": 234763.8125, |
| "learning_rate": 9.398230088495576e-06, |
| "loss": 0.5049, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.11364902506963788, |
| "grad_norm": 187823.65625, |
| "learning_rate": 9.386430678466077e-06, |
| "loss": 0.4735, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.11476323119777158, |
| "grad_norm": 177220.46875, |
| "learning_rate": 9.374631268436579e-06, |
| "loss": 0.453, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.11587743732590529, |
| "grad_norm": 222257.65625, |
| "learning_rate": 9.36283185840708e-06, |
| "loss": 0.5109, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.116991643454039, |
| "grad_norm": 197302.859375, |
| "learning_rate": 9.351032448377582e-06, |
| "loss": 0.521, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.1181058495821727, |
| "grad_norm": 206071.125, |
| "learning_rate": 9.339233038348084e-06, |
| "loss": 0.4685, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.1192200557103064, |
| "grad_norm": 217538.734375, |
| "learning_rate": 9.327433628318585e-06, |
| "loss": 0.4788, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.1203342618384401, |
| "grad_norm": 175732.96875, |
| "learning_rate": 9.315634218289085e-06, |
| "loss": 0.4497, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.12144846796657381, |
| "grad_norm": 215663.546875, |
| "learning_rate": 9.303834808259587e-06, |
| "loss": 0.4988, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.12256267409470752, |
| "grad_norm": 184019.109375, |
| "learning_rate": 9.29203539823009e-06, |
| "loss": 0.4578, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.12367688022284122, |
| "grad_norm": 215794.1875, |
| "learning_rate": 9.28023598820059e-06, |
| "loss": 0.4774, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.12479108635097493, |
| "grad_norm": 220917.953125, |
| "learning_rate": 9.268436578171091e-06, |
| "loss": 0.4589, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.12590529247910864, |
| "grad_norm": 247261.0625, |
| "learning_rate": 9.256637168141593e-06, |
| "loss": 0.4599, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.12701949860724235, |
| "grad_norm": 210848.296875, |
| "learning_rate": 9.244837758112095e-06, |
| "loss": 0.5151, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.12813370473537605, |
| "grad_norm": 202988.421875, |
| "learning_rate": 9.233038348082598e-06, |
| "loss": 0.4908, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.12924791086350976, |
| "grad_norm": 205763.890625, |
| "learning_rate": 9.221238938053098e-06, |
| "loss": 0.4931, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.13036211699164346, |
| "grad_norm": 228740.640625, |
| "learning_rate": 9.209439528023599e-06, |
| "loss": 0.4155, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.13147632311977717, |
| "grad_norm": 247188.296875, |
| "learning_rate": 9.197640117994101e-06, |
| "loss": 0.4654, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.13259052924791087, |
| "grad_norm": 192504.578125, |
| "learning_rate": 9.185840707964603e-06, |
| "loss": 0.4207, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.13370473537604458, |
| "grad_norm": 163095.4375, |
| "learning_rate": 9.174041297935104e-06, |
| "loss": 0.476, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.13481894150417828, |
| "grad_norm": 206575.328125, |
| "learning_rate": 9.162241887905605e-06, |
| "loss": 0.4762, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.13593314763231198, |
| "grad_norm": 220094.328125, |
| "learning_rate": 9.150442477876107e-06, |
| "loss": 0.5028, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.1370473537604457, |
| "grad_norm": 179223.671875, |
| "learning_rate": 9.138643067846609e-06, |
| "loss": 0.4679, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.1381615598885794, |
| "grad_norm": 159205.375, |
| "learning_rate": 9.12684365781711e-06, |
| "loss": 0.4489, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.1392757660167131, |
| "grad_norm": 239964.90625, |
| "learning_rate": 9.11504424778761e-06, |
| "loss": 0.4944, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1403899721448468, |
| "grad_norm": 1860553.375, |
| "learning_rate": 9.103244837758113e-06, |
| "loss": 0.4799, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.1415041782729805, |
| "grad_norm": 215564.03125, |
| "learning_rate": 9.091445427728615e-06, |
| "loss": 0.4839, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.1426183844011142, |
| "grad_norm": 165737.609375, |
| "learning_rate": 9.079646017699115e-06, |
| "loss": 0.4782, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.14373259052924792, |
| "grad_norm": 227705.28125, |
| "learning_rate": 9.067846607669618e-06, |
| "loss": 0.4569, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.14484679665738162, |
| "grad_norm": 239591.640625, |
| "learning_rate": 9.056047197640118e-06, |
| "loss": 0.4656, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.14596100278551533, |
| "grad_norm": 228280.484375, |
| "learning_rate": 9.04424778761062e-06, |
| "loss": 0.4438, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.14707520891364903, |
| "grad_norm": 211993.90625, |
| "learning_rate": 9.032448377581121e-06, |
| "loss": 0.4636, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.14818941504178273, |
| "grad_norm": 225698.015625, |
| "learning_rate": 9.020648967551623e-06, |
| "loss": 0.445, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.14930362116991644, |
| "grad_norm": 201884.421875, |
| "learning_rate": 9.008849557522124e-06, |
| "loss": 0.4308, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.15041782729805014, |
| "grad_norm": 177903.1875, |
| "learning_rate": 8.997050147492626e-06, |
| "loss": 0.4389, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.15153203342618385, |
| "grad_norm": 160066.296875, |
| "learning_rate": 8.985250737463127e-06, |
| "loss": 0.4577, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.15264623955431755, |
| "grad_norm": 209065.21875, |
| "learning_rate": 8.973451327433629e-06, |
| "loss": 0.4774, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.15376044568245126, |
| "grad_norm": 174174.75, |
| "learning_rate": 8.961651917404131e-06, |
| "loss": 0.4316, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.15487465181058496, |
| "grad_norm": 207085.890625, |
| "learning_rate": 8.949852507374632e-06, |
| "loss": 0.4979, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.15598885793871867, |
| "grad_norm": 165266.171875, |
| "learning_rate": 8.938053097345133e-06, |
| "loss": 0.4555, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.15710306406685237, |
| "grad_norm": 187202.796875, |
| "learning_rate": 8.926253687315635e-06, |
| "loss": 0.4285, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.15821727019498608, |
| "grad_norm": 241471.515625, |
| "learning_rate": 8.914454277286137e-06, |
| "loss": 0.4079, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.15933147632311978, |
| "grad_norm": 186964.390625, |
| "learning_rate": 8.902654867256638e-06, |
| "loss": 0.4669, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.16044568245125349, |
| "grad_norm": 187827.359375, |
| "learning_rate": 8.890855457227138e-06, |
| "loss": 0.469, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.1615598885793872, |
| "grad_norm": 185109.921875, |
| "learning_rate": 8.87905604719764e-06, |
| "loss": 0.4418, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.1626740947075209, |
| "grad_norm": 163606.359375, |
| "learning_rate": 8.867256637168143e-06, |
| "loss": 0.4303, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.1637883008356546, |
| "grad_norm": 301747.125, |
| "learning_rate": 8.855457227138643e-06, |
| "loss": 0.5794, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.1649025069637883, |
| "grad_norm": 197552.6875, |
| "learning_rate": 8.843657817109144e-06, |
| "loss": 0.4615, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.166016713091922, |
| "grad_norm": 173249.578125, |
| "learning_rate": 8.831858407079646e-06, |
| "loss": 0.4011, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.1671309192200557, |
| "grad_norm": 182496.59375, |
| "learning_rate": 8.820058997050148e-06, |
| "loss": 0.4486, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.16824512534818942, |
| "grad_norm": 210268.046875, |
| "learning_rate": 8.80825958702065e-06, |
| "loss": 0.5297, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.16935933147632312, |
| "grad_norm": 187245.1875, |
| "learning_rate": 8.796460176991151e-06, |
| "loss": 0.5058, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.17047353760445683, |
| "grad_norm": 151204.625, |
| "learning_rate": 8.784660766961652e-06, |
| "loss": 0.4313, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.17158774373259053, |
| "grad_norm": 199678.75, |
| "learning_rate": 8.772861356932154e-06, |
| "loss": 0.4923, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.17270194986072424, |
| "grad_norm": 216835.015625, |
| "learning_rate": 8.761061946902656e-06, |
| "loss": 0.4496, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.17381615598885794, |
| "grad_norm": 220194.328125, |
| "learning_rate": 8.749262536873157e-06, |
| "loss": 0.451, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.17493036211699164, |
| "grad_norm": 257876.03125, |
| "learning_rate": 8.737463126843658e-06, |
| "loss": 0.4967, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.17604456824512535, |
| "grad_norm": 247228.328125, |
| "learning_rate": 8.72566371681416e-06, |
| "loss": 0.4585, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.17715877437325905, |
| "grad_norm": 199109.828125, |
| "learning_rate": 8.713864306784662e-06, |
| "loss": 0.4796, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.17827298050139276, |
| "grad_norm": 168323.46875, |
| "learning_rate": 8.702064896755163e-06, |
| "loss": 0.4713, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.17938718662952646, |
| "grad_norm": 217545.203125, |
| "learning_rate": 8.690265486725665e-06, |
| "loss": 0.5065, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.18050139275766017, |
| "grad_norm": 170031.875, |
| "learning_rate": 8.678466076696166e-06, |
| "loss": 0.4523, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.18161559888579387, |
| "grad_norm": 173914.984375, |
| "learning_rate": 8.666666666666668e-06, |
| "loss": 0.4428, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.18272980501392758, |
| "grad_norm": 328774.90625, |
| "learning_rate": 8.654867256637168e-06, |
| "loss": 0.4636, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.18384401114206128, |
| "grad_norm": 160804.953125, |
| "learning_rate": 8.64306784660767e-06, |
| "loss": 0.4819, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.18495821727019499, |
| "grad_norm": 178090.625, |
| "learning_rate": 8.631268436578171e-06, |
| "loss": 0.3609, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.1860724233983287, |
| "grad_norm": 229922.359375, |
| "learning_rate": 8.619469026548674e-06, |
| "loss": 0.544, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.1871866295264624, |
| "grad_norm": 220525.125, |
| "learning_rate": 8.607669616519174e-06, |
| "loss": 0.4629, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.1883008356545961, |
| "grad_norm": 200167.140625, |
| "learning_rate": 8.595870206489676e-06, |
| "loss": 0.4388, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.1894150417827298, |
| "grad_norm": 162366.34375, |
| "learning_rate": 8.584070796460177e-06, |
| "loss": 0.4031, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.1905292479108635, |
| "grad_norm": 168463.046875, |
| "learning_rate": 8.57227138643068e-06, |
| "loss": 0.4573, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.1916434540389972, |
| "grad_norm": 186504.0625, |
| "learning_rate": 8.56047197640118e-06, |
| "loss": 0.4124, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.19275766016713092, |
| "grad_norm": 172114.3125, |
| "learning_rate": 8.548672566371682e-06, |
| "loss": 0.4513, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.19387186629526462, |
| "grad_norm": 180931.875, |
| "learning_rate": 8.536873156342184e-06, |
| "loss": 0.4853, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.19498607242339833, |
| "grad_norm": 161106.390625, |
| "learning_rate": 8.525073746312685e-06, |
| "loss": 0.4688, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.19610027855153203, |
| "grad_norm": 309931.21875, |
| "learning_rate": 8.513274336283186e-06, |
| "loss": 0.4299, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.19721448467966574, |
| "grad_norm": 187258.265625, |
| "learning_rate": 8.501474926253688e-06, |
| "loss": 0.3971, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.19832869080779944, |
| "grad_norm": 189410.328125, |
| "learning_rate": 8.48967551622419e-06, |
| "loss": 0.4437, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.19944289693593314, |
| "grad_norm": 227058.671875, |
| "learning_rate": 8.47787610619469e-06, |
| "loss": 0.4647, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.20055710306406685, |
| "grad_norm": 212547.15625, |
| "learning_rate": 8.466076696165191e-06, |
| "loss": 0.3928, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.20167130919220055, |
| "grad_norm": 243853.921875, |
| "learning_rate": 8.454277286135693e-06, |
| "loss": 0.4864, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.20278551532033426, |
| "grad_norm": 155955.109375, |
| "learning_rate": 8.442477876106196e-06, |
| "loss": 0.402, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.20389972144846796, |
| "grad_norm": 202079.875, |
| "learning_rate": 8.430678466076696e-06, |
| "loss": 0.4701, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.20501392757660167, |
| "grad_norm": 714132.75, |
| "learning_rate": 8.418879056047199e-06, |
| "loss": 0.4677, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.20612813370473537, |
| "grad_norm": 193706.546875, |
| "learning_rate": 8.4070796460177e-06, |
| "loss": 0.4514, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.20724233983286908, |
| "grad_norm": 185646.90625, |
| "learning_rate": 8.395280235988201e-06, |
| "loss": 0.5203, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.20835654596100278, |
| "grad_norm": 200083.578125, |
| "learning_rate": 8.383480825958704e-06, |
| "loss": 0.4839, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.20947075208913649, |
| "grad_norm": 188911.671875, |
| "learning_rate": 8.371681415929204e-06, |
| "loss": 0.4435, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.2105849582172702, |
| "grad_norm": 181933.046875, |
| "learning_rate": 8.359882005899705e-06, |
| "loss": 0.4596, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.2116991643454039, |
| "grad_norm": 220758.59375, |
| "learning_rate": 8.348082595870207e-06, |
| "loss": 0.4758, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.2128133704735376, |
| "grad_norm": 220606.0, |
| "learning_rate": 8.33628318584071e-06, |
| "loss": 0.4422, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.2139275766016713, |
| "grad_norm": 206306.265625, |
| "learning_rate": 8.32448377581121e-06, |
| "loss": 0.379, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.215041782729805, |
| "grad_norm": 181457.03125, |
| "learning_rate": 8.31268436578171e-06, |
| "loss": 0.5551, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.2161559888579387, |
| "grad_norm": 228577.40625, |
| "learning_rate": 8.300884955752213e-06, |
| "loss": 0.4185, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.21727019498607242, |
| "grad_norm": 213855.328125, |
| "learning_rate": 8.289085545722715e-06, |
| "loss": 0.4823, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.21838440111420612, |
| "grad_norm": 230408.421875, |
| "learning_rate": 8.277286135693216e-06, |
| "loss": 0.4441, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.21949860724233983, |
| "grad_norm": 2382021.0, |
| "learning_rate": 8.265486725663718e-06, |
| "loss": 0.4433, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.22061281337047353, |
| "grad_norm": 205788.359375, |
| "learning_rate": 8.253687315634219e-06, |
| "loss": 0.4529, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.22172701949860724, |
| "grad_norm": 188394.21875, |
| "learning_rate": 8.24188790560472e-06, |
| "loss": 0.4691, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.22284122562674094, |
| "grad_norm": 160699.125, |
| "learning_rate": 8.230088495575221e-06, |
| "loss": 0.3891, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.22395543175487465, |
| "grad_norm": 177135.328125, |
| "learning_rate": 8.218289085545724e-06, |
| "loss": 0.4791, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.22506963788300835, |
| "grad_norm": 176526.390625, |
| "learning_rate": 8.206489675516224e-06, |
| "loss": 0.48, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.22618384401114205, |
| "grad_norm": 176218.734375, |
| "learning_rate": 8.194690265486727e-06, |
| "loss": 0.4568, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.22729805013927576, |
| "grad_norm": 212857.703125, |
| "learning_rate": 8.182890855457227e-06, |
| "loss": 0.5493, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.22841225626740946, |
| "grad_norm": 169878.546875, |
| "learning_rate": 8.17109144542773e-06, |
| "loss": 0.5106, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.22952646239554317, |
| "grad_norm": 191395.078125, |
| "learning_rate": 8.15929203539823e-06, |
| "loss": 0.4612, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.23064066852367687, |
| "grad_norm": 202636.328125, |
| "learning_rate": 8.147492625368732e-06, |
| "loss": 0.4216, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.23175487465181058, |
| "grad_norm": 190389.3125, |
| "learning_rate": 8.135693215339233e-06, |
| "loss": 0.47, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.23286908077994428, |
| "grad_norm": 169839.28125, |
| "learning_rate": 8.123893805309735e-06, |
| "loss": 0.3739, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.233983286908078, |
| "grad_norm": 193950.34375, |
| "learning_rate": 8.112094395280237e-06, |
| "loss": 0.4212, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.2350974930362117, |
| "grad_norm": 244072.90625, |
| "learning_rate": 8.100294985250738e-06, |
| "loss": 0.4404, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.2362116991643454, |
| "grad_norm": 187252.015625, |
| "learning_rate": 8.088495575221239e-06, |
| "loss": 0.4064, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.2373259052924791, |
| "grad_norm": 198252.640625, |
| "learning_rate": 8.07669616519174e-06, |
| "loss": 0.4886, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.2384401114206128, |
| "grad_norm": 166305.921875, |
| "learning_rate": 8.064896755162243e-06, |
| "loss": 0.4645, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.2395543175487465, |
| "grad_norm": 170926.28125, |
| "learning_rate": 8.053097345132744e-06, |
| "loss": 0.4812, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.2406685236768802, |
| "grad_norm": 186730.828125, |
| "learning_rate": 8.041297935103244e-06, |
| "loss": 0.4462, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.24178272980501392, |
| "grad_norm": 166617.265625, |
| "learning_rate": 8.029498525073746e-06, |
| "loss": 0.4239, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.24289693593314762, |
| "grad_norm": 231541.78125, |
| "learning_rate": 8.017699115044249e-06, |
| "loss": 0.494, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.24401114206128133, |
| "grad_norm": 244759.0625, |
| "learning_rate": 8.005899705014751e-06, |
| "loss": 0.4842, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.24512534818941503, |
| "grad_norm": 213151.8125, |
| "learning_rate": 7.994100294985252e-06, |
| "loss": 0.4053, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.24623955431754874, |
| "grad_norm": 177919.25, |
| "learning_rate": 7.982300884955752e-06, |
| "loss": 0.4367, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.24735376044568244, |
| "grad_norm": 189381.546875, |
| "learning_rate": 7.970501474926254e-06, |
| "loss": 0.4456, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.24846796657381615, |
| "grad_norm": 182038.265625, |
| "learning_rate": 7.958702064896757e-06, |
| "loss": 0.4758, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.24958217270194985, |
| "grad_norm": 194923.015625, |
| "learning_rate": 7.946902654867257e-06, |
| "loss": 0.4544, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.25069637883008355, |
| "grad_norm": 213895.234375, |
| "learning_rate": 7.935103244837758e-06, |
| "loss": 0.4855, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.2518105849582173, |
| "grad_norm": 176202.59375, |
| "learning_rate": 7.92330383480826e-06, |
| "loss": 0.4411, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.25292479108635096, |
| "grad_norm": 173177.484375, |
| "learning_rate": 7.911504424778762e-06, |
| "loss": 0.408, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.2540389972144847, |
| "grad_norm": 165414.84375, |
| "learning_rate": 7.899705014749263e-06, |
| "loss": 0.4193, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.2551532033426184, |
| "grad_norm": 187442.96875, |
| "learning_rate": 7.887905604719764e-06, |
| "loss": 0.4563, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.2562674094707521, |
| "grad_norm": 174169.46875, |
| "learning_rate": 7.876106194690266e-06, |
| "loss": 0.469, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2573816155988858, |
| "grad_norm": 178938.265625, |
| "learning_rate": 7.864306784660768e-06, |
| "loss": 0.4368, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.2584958217270195, |
| "grad_norm": 180309.984375, |
| "learning_rate": 7.852507374631269e-06, |
| "loss": 0.3941, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.2596100278551532, |
| "grad_norm": 163068.09375, |
| "learning_rate": 7.840707964601771e-06, |
| "loss": 0.4593, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.2607242339832869, |
| "grad_norm": 438010.625, |
| "learning_rate": 7.828908554572272e-06, |
| "loss": 0.3922, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.2618384401114206, |
| "grad_norm": 160514.765625, |
| "learning_rate": 7.817109144542774e-06, |
| "loss": 0.414, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.26295264623955433, |
| "grad_norm": 183856.734375, |
| "learning_rate": 7.805309734513274e-06, |
| "loss": 0.4401, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.264066852367688, |
| "grad_norm": 148409.546875, |
| "learning_rate": 7.793510324483777e-06, |
| "loss": 0.4098, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.26518105849582174, |
| "grad_norm": 141153.078125, |
| "learning_rate": 7.781710914454277e-06, |
| "loss": 0.4056, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.2662952646239554, |
| "grad_norm": 180253.265625, |
| "learning_rate": 7.76991150442478e-06, |
| "loss": 0.4528, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.26740947075208915, |
| "grad_norm": 197980.125, |
| "learning_rate": 7.75811209439528e-06, |
| "loss": 0.5156, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.26852367688022283, |
| "grad_norm": 240508.0625, |
| "learning_rate": 7.746312684365782e-06, |
| "loss": 0.5328, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.26963788300835656, |
| "grad_norm": 185720.15625, |
| "learning_rate": 7.734513274336285e-06, |
| "loss": 0.4215, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.27075208913649024, |
| "grad_norm": 202803.9375, |
| "learning_rate": 7.722713864306785e-06, |
| "loss": 0.4465, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.27186629526462397, |
| "grad_norm": 224614.953125, |
| "learning_rate": 7.710914454277286e-06, |
| "loss": 0.4462, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.27298050139275765, |
| "grad_norm": 180010.578125, |
| "learning_rate": 7.699115044247788e-06, |
| "loss": 0.4352, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.2740947075208914, |
| "grad_norm": 163524.25, |
| "learning_rate": 7.68731563421829e-06, |
| "loss": 0.4927, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.27520891364902506, |
| "grad_norm": 207432.9375, |
| "learning_rate": 7.675516224188791e-06, |
| "loss": 0.4633, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.2763231197771588, |
| "grad_norm": 204331.390625, |
| "learning_rate": 7.663716814159292e-06, |
| "loss": 0.3958, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.27743732590529246, |
| "grad_norm": 190513.875, |
| "learning_rate": 7.651917404129794e-06, |
| "loss": 0.4297, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.2785515320334262, |
| "grad_norm": 217766.4375, |
| "learning_rate": 7.640117994100296e-06, |
| "loss": 0.5156, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2796657381615599, |
| "grad_norm": 154360.34375, |
| "learning_rate": 7.6283185840707975e-06, |
| "loss": 0.4699, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.2807799442896936, |
| "grad_norm": 191062.34375, |
| "learning_rate": 7.616519174041298e-06, |
| "loss": 0.4874, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.2818941504178273, |
| "grad_norm": 172361.28125, |
| "learning_rate": 7.6047197640117995e-06, |
| "loss": 0.4375, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.283008356545961, |
| "grad_norm": 184886.234375, |
| "learning_rate": 7.592920353982302e-06, |
| "loss": 0.4439, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.2841225626740947, |
| "grad_norm": 219691.9375, |
| "learning_rate": 7.581120943952803e-06, |
| "loss": 0.4365, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.2852367688022284, |
| "grad_norm": 196143.03125, |
| "learning_rate": 7.569321533923304e-06, |
| "loss": 0.429, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.2863509749303621, |
| "grad_norm": 208180.90625, |
| "learning_rate": 7.557522123893806e-06, |
| "loss": 0.4469, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.28746518105849583, |
| "grad_norm": 171706.4375, |
| "learning_rate": 7.5457227138643075e-06, |
| "loss": 0.3754, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.2885793871866295, |
| "grad_norm": 191040.453125, |
| "learning_rate": 7.533923303834809e-06, |
| "loss": 0.3943, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.28969359331476324, |
| "grad_norm": 242674.359375, |
| "learning_rate": 7.5221238938053095e-06, |
| "loss": 0.4126, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.2908077994428969, |
| "grad_norm": 176212.265625, |
| "learning_rate": 7.510324483775812e-06, |
| "loss": 0.5054, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.29192200557103065, |
| "grad_norm": 160570.40625, |
| "learning_rate": 7.498525073746313e-06, |
| "loss": 0.4517, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.29303621169916433, |
| "grad_norm": 225097.109375, |
| "learning_rate": 7.4867256637168155e-06, |
| "loss": 0.4204, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.29415041782729806, |
| "grad_norm": 212929.71875, |
| "learning_rate": 7.474926253687316e-06, |
| "loss": 0.3645, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.29526462395543174, |
| "grad_norm": 302795.75, |
| "learning_rate": 7.4631268436578175e-06, |
| "loss": 0.4555, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.29637883008356547, |
| "grad_norm": 274847.28125, |
| "learning_rate": 7.451327433628319e-06, |
| "loss": 0.4636, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.29749303621169915, |
| "grad_norm": 247978.859375, |
| "learning_rate": 7.439528023598821e-06, |
| "loss": 0.497, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.2986072423398329, |
| "grad_norm": 235642.4375, |
| "learning_rate": 7.427728613569322e-06, |
| "loss": 0.4798, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.29972144846796656, |
| "grad_norm": 175856.296875, |
| "learning_rate": 7.415929203539823e-06, |
| "loss": 0.3984, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.3008356545961003, |
| "grad_norm": 164871.171875, |
| "learning_rate": 7.4041297935103254e-06, |
| "loss": 0.4505, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.30194986072423396, |
| "grad_norm": 235865.71875, |
| "learning_rate": 7.392330383480827e-06, |
| "loss": 0.4366, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.3030640668523677, |
| "grad_norm": 210909.359375, |
| "learning_rate": 7.3805309734513274e-06, |
| "loss": 0.4519, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.3041782729805014, |
| "grad_norm": 233273.6875, |
| "learning_rate": 7.368731563421829e-06, |
| "loss": 0.5225, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.3052924791086351, |
| "grad_norm": 182172.390625, |
| "learning_rate": 7.356932153392331e-06, |
| "loss": 0.4502, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.3064066852367688, |
| "grad_norm": 148367.265625, |
| "learning_rate": 7.3451327433628326e-06, |
| "loss": 0.4645, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.3075208913649025, |
| "grad_norm": 153879.859375, |
| "learning_rate": 7.333333333333333e-06, |
| "loss": 0.4434, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.3086350974930362, |
| "grad_norm": 166923.828125, |
| "learning_rate": 7.321533923303835e-06, |
| "loss": 0.4275, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.3097493036211699, |
| "grad_norm": 181019.078125, |
| "learning_rate": 7.309734513274337e-06, |
| "loss": 0.4433, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.3108635097493036, |
| "grad_norm": 174066.703125, |
| "learning_rate": 7.297935103244838e-06, |
| "loss": 0.4151, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.31197771587743733, |
| "grad_norm": 182453.328125, |
| "learning_rate": 7.28613569321534e-06, |
| "loss": 0.4393, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.313091922005571, |
| "grad_norm": 221355.9375, |
| "learning_rate": 7.274336283185841e-06, |
| "loss": 0.4509, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.31420612813370474, |
| "grad_norm": 161181.375, |
| "learning_rate": 7.2625368731563425e-06, |
| "loss": 0.4331, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.3153203342618384, |
| "grad_norm": 180448.265625, |
| "learning_rate": 7.250737463126845e-06, |
| "loss": 0.3847, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.31643454038997215, |
| "grad_norm": 195976.234375, |
| "learning_rate": 7.238938053097345e-06, |
| "loss": 0.4582, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.31754874651810583, |
| "grad_norm": 225505.5, |
| "learning_rate": 7.227138643067847e-06, |
| "loss": 0.4278, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.31866295264623956, |
| "grad_norm": 197629.5, |
| "learning_rate": 7.215339233038349e-06, |
| "loss": 0.427, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.31977715877437324, |
| "grad_norm": 186139.46875, |
| "learning_rate": 7.2035398230088505e-06, |
| "loss": 0.4538, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.32089136490250697, |
| "grad_norm": 242211.59375, |
| "learning_rate": 7.191740412979351e-06, |
| "loss": 0.4626, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.32200557103064065, |
| "grad_norm": 219988.5625, |
| "learning_rate": 7.1799410029498525e-06, |
| "loss": 0.4787, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.3231197771587744, |
| "grad_norm": 248857.53125, |
| "learning_rate": 7.168141592920355e-06, |
| "loss": 0.4291, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.32423398328690806, |
| "grad_norm": 189503.140625, |
| "learning_rate": 7.156342182890856e-06, |
| "loss": 0.4115, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.3253481894150418, |
| "grad_norm": 187285.109375, |
| "learning_rate": 7.144542772861357e-06, |
| "loss": 0.409, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.32646239554317547, |
| "grad_norm": 165844.203125, |
| "learning_rate": 7.132743362831859e-06, |
| "loss": 0.4235, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.3275766016713092, |
| "grad_norm": 143210.59375, |
| "learning_rate": 7.1209439528023605e-06, |
| "loss": 0.4236, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.3286908077994429, |
| "grad_norm": 189349.671875, |
| "learning_rate": 7.109144542772862e-06, |
| "loss": 0.4673, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.3298050139275766, |
| "grad_norm": 200229.21875, |
| "learning_rate": 7.0973451327433625e-06, |
| "loss": 0.4638, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.3309192200557103, |
| "grad_norm": 211443.546875, |
| "learning_rate": 7.085545722713865e-06, |
| "loss": 0.4817, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.332033426183844, |
| "grad_norm": 163127.796875, |
| "learning_rate": 7.073746312684366e-06, |
| "loss": 0.4673, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.3331476323119777, |
| "grad_norm": 204851.25, |
| "learning_rate": 7.0619469026548685e-06, |
| "loss": 0.3908, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.3342618384401114, |
| "grad_norm": 197754.03125, |
| "learning_rate": 7.050147492625369e-06, |
| "loss": 0.4242, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3353760445682451, |
| "grad_norm": 169182.234375, |
| "learning_rate": 7.0383480825958705e-06, |
| "loss": 0.3986, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.33649025069637883, |
| "grad_norm": 203432.265625, |
| "learning_rate": 7.026548672566372e-06, |
| "loss": 0.4509, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.3376044568245125, |
| "grad_norm": 151958.265625, |
| "learning_rate": 7.014749262536874e-06, |
| "loss": 0.4076, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.33871866295264624, |
| "grad_norm": 175936.0, |
| "learning_rate": 7.002949852507375e-06, |
| "loss": 0.4503, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.3398328690807799, |
| "grad_norm": 235924.75, |
| "learning_rate": 6.991150442477876e-06, |
| "loss": 0.5033, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.34094707520891365, |
| "grad_norm": 233674.546875, |
| "learning_rate": 6.9793510324483784e-06, |
| "loss": 0.4037, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.34206128133704733, |
| "grad_norm": 219473.421875, |
| "learning_rate": 6.96755162241888e-06, |
| "loss": 0.5048, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.34317548746518106, |
| "grad_norm": 174127.296875, |
| "learning_rate": 6.9557522123893805e-06, |
| "loss": 0.4419, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.34428969359331474, |
| "grad_norm": 192749.65625, |
| "learning_rate": 6.943952802359883e-06, |
| "loss": 0.4122, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.34540389972144847, |
| "grad_norm": 162299.4375, |
| "learning_rate": 6.932153392330384e-06, |
| "loss": 0.3922, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.34651810584958215, |
| "grad_norm": 164610.609375, |
| "learning_rate": 6.9203539823008856e-06, |
| "loss": 0.4145, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.3476323119777159, |
| "grad_norm": 216083.84375, |
| "learning_rate": 6.908554572271386e-06, |
| "loss": 0.4466, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.34874651810584956, |
| "grad_norm": 164565.40625, |
| "learning_rate": 6.8967551622418884e-06, |
| "loss": 0.4169, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.3498607242339833, |
| "grad_norm": 186178.03125, |
| "learning_rate": 6.88495575221239e-06, |
| "loss": 0.4237, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.35097493036211697, |
| "grad_norm": 214979.140625, |
| "learning_rate": 6.873156342182892e-06, |
| "loss": 0.4219, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.3520891364902507, |
| "grad_norm": 189355.046875, |
| "learning_rate": 6.861356932153393e-06, |
| "loss": 0.4277, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.3532033426183844, |
| "grad_norm": 146298.015625, |
| "learning_rate": 6.849557522123894e-06, |
| "loss": 0.3793, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.3543175487465181, |
| "grad_norm": 161813.9375, |
| "learning_rate": 6.8377581120943956e-06, |
| "loss": 0.4261, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.3554317548746518, |
| "grad_norm": 190914.546875, |
| "learning_rate": 6.825958702064898e-06, |
| "loss": 0.4386, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.3565459610027855, |
| "grad_norm": 192714.109375, |
| "learning_rate": 6.814159292035398e-06, |
| "loss": 0.3358, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3576601671309192, |
| "grad_norm": 230763.09375, |
| "learning_rate": 6.8023598820059e-06, |
| "loss": 0.395, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.3587743732590529, |
| "grad_norm": 205893.734375, |
| "learning_rate": 6.790560471976402e-06, |
| "loss": 0.3984, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.3598885793871866, |
| "grad_norm": 197603.359375, |
| "learning_rate": 6.7787610619469035e-06, |
| "loss": 0.4227, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.36100278551532033, |
| "grad_norm": 201566.0625, |
| "learning_rate": 6.766961651917404e-06, |
| "loss": 0.4909, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.362116991643454, |
| "grad_norm": 198001.234375, |
| "learning_rate": 6.7551622418879055e-06, |
| "loss": 0.4225, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.36323119777158774, |
| "grad_norm": 202295.75, |
| "learning_rate": 6.743362831858408e-06, |
| "loss": 0.4214, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.3643454038997215, |
| "grad_norm": 232146.796875, |
| "learning_rate": 6.731563421828909e-06, |
| "loss": 0.4536, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.36545961002785515, |
| "grad_norm": 176061.234375, |
| "learning_rate": 6.71976401179941e-06, |
| "loss": 0.565, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.3665738161559889, |
| "grad_norm": 186836.6875, |
| "learning_rate": 6.707964601769912e-06, |
| "loss": 0.4747, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.36768802228412256, |
| "grad_norm": 209567.4375, |
| "learning_rate": 6.6961651917404135e-06, |
| "loss": 0.5102, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.3688022284122563, |
| "grad_norm": 168502.078125, |
| "learning_rate": 6.684365781710915e-06, |
| "loss": 0.4422, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.36991643454038997, |
| "grad_norm": 140973.546875, |
| "learning_rate": 6.672566371681416e-06, |
| "loss": 0.4106, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.3710306406685237, |
| "grad_norm": 198291.03125, |
| "learning_rate": 6.660766961651918e-06, |
| "loss": 0.4064, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.3721448467966574, |
| "grad_norm": 223377.78125, |
| "learning_rate": 6.648967551622419e-06, |
| "loss": 0.4166, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.3732590529247911, |
| "grad_norm": 153720.453125, |
| "learning_rate": 6.6371681415929215e-06, |
| "loss": 0.4166, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.3743732590529248, |
| "grad_norm": 201311.625, |
| "learning_rate": 6.625368731563422e-06, |
| "loss": 0.4366, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.3754874651810585, |
| "grad_norm": 231880.140625, |
| "learning_rate": 6.6135693215339235e-06, |
| "loss": 0.4424, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.3766016713091922, |
| "grad_norm": 215704.15625, |
| "learning_rate": 6.601769911504426e-06, |
| "loss": 0.4231, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.37771587743732593, |
| "grad_norm": 218725.96875, |
| "learning_rate": 6.589970501474927e-06, |
| "loss": 0.4443, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.3788300835654596, |
| "grad_norm": 172654.328125, |
| "learning_rate": 6.578171091445428e-06, |
| "loss": 0.4628, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.37994428969359334, |
| "grad_norm": 255787.09375, |
| "learning_rate": 6.566371681415929e-06, |
| "loss": 0.3853, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.381058495821727, |
| "grad_norm": 214523.421875, |
| "learning_rate": 6.5545722713864315e-06, |
| "loss": 0.3911, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.38217270194986075, |
| "grad_norm": 171143.546875, |
| "learning_rate": 6.542772861356933e-06, |
| "loss": 0.3911, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.3832869080779944, |
| "grad_norm": 208521.5625, |
| "learning_rate": 6.5309734513274335e-06, |
| "loss": 0.4602, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.38440111420612816, |
| "grad_norm": 164048.6875, |
| "learning_rate": 6.519174041297936e-06, |
| "loss": 0.3663, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.38551532033426184, |
| "grad_norm": 175852.65625, |
| "learning_rate": 6.507374631268437e-06, |
| "loss": 0.4219, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.38662952646239557, |
| "grad_norm": 155706.46875, |
| "learning_rate": 6.495575221238939e-06, |
| "loss": 0.4241, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.38774373259052924, |
| "grad_norm": 208711.96875, |
| "learning_rate": 6.483775811209439e-06, |
| "loss": 0.4556, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.388857938718663, |
| "grad_norm": 190895.765625, |
| "learning_rate": 6.4719764011799414e-06, |
| "loss": 0.4824, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.38997214484679665, |
| "grad_norm": 190407.703125, |
| "learning_rate": 6.460176991150443e-06, |
| "loss": 0.3921, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.3910863509749304, |
| "grad_norm": 189109.84375, |
| "learning_rate": 6.448377581120945e-06, |
| "loss": 0.4369, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.39220055710306406, |
| "grad_norm": 164608.40625, |
| "learning_rate": 6.436578171091446e-06, |
| "loss": 0.4079, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.3933147632311978, |
| "grad_norm": 195097.40625, |
| "learning_rate": 6.424778761061947e-06, |
| "loss": 0.439, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.39442896935933147, |
| "grad_norm": 161624.625, |
| "learning_rate": 6.4129793510324486e-06, |
| "loss": 0.4484, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.3955431754874652, |
| "grad_norm": 183812.625, |
| "learning_rate": 6.401179941002951e-06, |
| "loss": 0.4078, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.3966573816155989, |
| "grad_norm": 160959.0625, |
| "learning_rate": 6.389380530973451e-06, |
| "loss": 0.4448, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.3977715877437326, |
| "grad_norm": 172894.578125, |
| "learning_rate": 6.377581120943953e-06, |
| "loss": 0.4315, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.3988857938718663, |
| "grad_norm": 167681.3125, |
| "learning_rate": 6.365781710914455e-06, |
| "loss": 0.5089, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 210563.953125, |
| "learning_rate": 6.3539823008849565e-06, |
| "loss": 0.4593, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.4011142061281337, |
| "grad_norm": 190382.875, |
| "learning_rate": 6.342182890855457e-06, |
| "loss": 0.4257, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.40222841225626743, |
| "grad_norm": 146628.546875, |
| "learning_rate": 6.330383480825959e-06, |
| "loss": 0.4464, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.4033426183844011, |
| "grad_norm": 185890.65625, |
| "learning_rate": 6.318584070796461e-06, |
| "loss": 0.4525, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.40445682451253484, |
| "grad_norm": 168210.1875, |
| "learning_rate": 6.306784660766962e-06, |
| "loss": 0.4163, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.4055710306406685, |
| "grad_norm": 186928.890625, |
| "learning_rate": 6.294985250737463e-06, |
| "loss": 0.4371, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.40668523676880225, |
| "grad_norm": 156051.6875, |
| "learning_rate": 6.283185840707965e-06, |
| "loss": 0.3911, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.4077994428969359, |
| "grad_norm": 199442.53125, |
| "learning_rate": 6.2713864306784665e-06, |
| "loss": 0.459, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.40891364902506966, |
| "grad_norm": 545817.75, |
| "learning_rate": 6.259587020648969e-06, |
| "loss": 0.4288, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.41002785515320334, |
| "grad_norm": 167921.96875, |
| "learning_rate": 6.247787610619469e-06, |
| "loss": 0.4002, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.41114206128133707, |
| "grad_norm": 150013.328125, |
| "learning_rate": 6.235988200589971e-06, |
| "loss": 0.3893, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.41225626740947074, |
| "grad_norm": 197072.15625, |
| "learning_rate": 6.224188790560472e-06, |
| "loss": 0.4764, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.4133704735376045, |
| "grad_norm": 173856.421875, |
| "learning_rate": 6.2123893805309745e-06, |
| "loss": 0.4133, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.41448467966573815, |
| "grad_norm": 189156.15625, |
| "learning_rate": 6.200589970501475e-06, |
| "loss": 0.4095, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.4155988857938719, |
| "grad_norm": 154648.484375, |
| "learning_rate": 6.1887905604719765e-06, |
| "loss": 0.4359, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.41671309192200556, |
| "grad_norm": 204105.28125, |
| "learning_rate": 6.176991150442479e-06, |
| "loss": 0.4027, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.4178272980501393, |
| "grad_norm": 143494.203125, |
| "learning_rate": 6.16519174041298e-06, |
| "loss": 0.4136, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.41894150417827297, |
| "grad_norm": 207321.671875, |
| "learning_rate": 6.153392330383481e-06, |
| "loss": 0.4809, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.4200557103064067, |
| "grad_norm": 188413.15625, |
| "learning_rate": 6.141592920353982e-06, |
| "loss": 0.4515, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.4211699164345404, |
| "grad_norm": 154952.078125, |
| "learning_rate": 6.1297935103244845e-06, |
| "loss": 0.3791, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.4222841225626741, |
| "grad_norm": 166050.15625, |
| "learning_rate": 6.117994100294986e-06, |
| "loss": 0.4097, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.4233983286908078, |
| "grad_norm": 182377.765625, |
| "learning_rate": 6.1061946902654865e-06, |
| "loss": 0.3749, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.4245125348189415, |
| "grad_norm": 208522.78125, |
| "learning_rate": 6.094395280235989e-06, |
| "loss": 0.3636, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.4256267409470752, |
| "grad_norm": 182704.5, |
| "learning_rate": 6.08259587020649e-06, |
| "loss": 0.4477, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.42674094707520893, |
| "grad_norm": 163660.265625, |
| "learning_rate": 6.070796460176992e-06, |
| "loss": 0.4134, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.4278551532033426, |
| "grad_norm": 191465.125, |
| "learning_rate": 6.058997050147493e-06, |
| "loss": 0.4783, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.42896935933147634, |
| "grad_norm": 185075.90625, |
| "learning_rate": 6.0471976401179945e-06, |
| "loss": 0.4293, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.43008356545961, |
| "grad_norm": 221350.125, |
| "learning_rate": 6.035398230088496e-06, |
| "loss": 0.3962, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.43119777158774375, |
| "grad_norm": 175274.96875, |
| "learning_rate": 6.023598820058998e-06, |
| "loss": 0.4292, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.4323119777158774, |
| "grad_norm": 206309.9375, |
| "learning_rate": 6.011799410029499e-06, |
| "loss": 0.4278, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.43342618384401116, |
| "grad_norm": 184827.65625, |
| "learning_rate": 6e-06, |
| "loss": 0.4033, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.43454038997214484, |
| "grad_norm": 192188.78125, |
| "learning_rate": 5.9882005899705024e-06, |
| "loss": 0.4204, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.43565459610027857, |
| "grad_norm": 171022.75, |
| "learning_rate": 5.976401179941004e-06, |
| "loss": 0.443, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.43676880222841225, |
| "grad_norm": 174545.4375, |
| "learning_rate": 5.9646017699115044e-06, |
| "loss": 0.3965, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.437883008356546, |
| "grad_norm": 196021.375, |
| "learning_rate": 5.952802359882006e-06, |
| "loss": 0.4329, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.43899721448467965, |
| "grad_norm": 201056.484375, |
| "learning_rate": 5.941002949852508e-06, |
| "loss": 0.4765, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.4401114206128134, |
| "grad_norm": 169847.234375, |
| "learning_rate": 5.9292035398230096e-06, |
| "loss": 0.3634, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.44122562674094706, |
| "grad_norm": 164063.359375, |
| "learning_rate": 5.91740412979351e-06, |
| "loss": 0.4535, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.4423398328690808, |
| "grad_norm": 165311.09375, |
| "learning_rate": 5.905604719764012e-06, |
| "loss": 0.3972, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.4434540389972145, |
| "grad_norm": 154916.328125, |
| "learning_rate": 5.893805309734514e-06, |
| "loss": 0.4292, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.4445682451253482, |
| "grad_norm": 242260.9375, |
| "learning_rate": 5.882005899705015e-06, |
| "loss": 0.4778, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.4456824512534819, |
| "grad_norm": 182793.953125, |
| "learning_rate": 5.870206489675516e-06, |
| "loss": 0.411, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4467966573816156, |
| "grad_norm": 162919.3125, |
| "learning_rate": 5.858407079646018e-06, |
| "loss": 0.471, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.4479108635097493, |
| "grad_norm": 195205.6875, |
| "learning_rate": 5.8466076696165195e-06, |
| "loss": 0.4646, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.449025069637883, |
| "grad_norm": 184059.46875, |
| "learning_rate": 5.834808259587022e-06, |
| "loss": 0.4894, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.4501392757660167, |
| "grad_norm": 160941.640625, |
| "learning_rate": 5.823008849557522e-06, |
| "loss": 0.3985, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.45125348189415043, |
| "grad_norm": 173049.71875, |
| "learning_rate": 5.811209439528024e-06, |
| "loss": 0.3436, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.4523676880222841, |
| "grad_norm": 174724.65625, |
| "learning_rate": 5.799410029498525e-06, |
| "loss": 0.3968, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.45348189415041784, |
| "grad_norm": 225425.4375, |
| "learning_rate": 5.7876106194690275e-06, |
| "loss": 0.4066, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.4545961002785515, |
| "grad_norm": 165040.125, |
| "learning_rate": 5.775811209439528e-06, |
| "loss": 0.4007, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.45571030640668525, |
| "grad_norm": 163311.609375, |
| "learning_rate": 5.7640117994100295e-06, |
| "loss": 0.3548, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.4568245125348189, |
| "grad_norm": 165943.046875, |
| "learning_rate": 5.752212389380532e-06, |
| "loss": 0.4311, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.45793871866295266, |
| "grad_norm": 150324.8125, |
| "learning_rate": 5.740412979351033e-06, |
| "loss": 0.4099, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.45905292479108634, |
| "grad_norm": 172383.046875, |
| "learning_rate": 5.728613569321534e-06, |
| "loss": 0.4104, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.46016713091922007, |
| "grad_norm": 183318.265625, |
| "learning_rate": 5.716814159292036e-06, |
| "loss": 0.4281, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.46128133704735375, |
| "grad_norm": 152074.421875, |
| "learning_rate": 5.7050147492625375e-06, |
| "loss": 0.4459, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.4623955431754875, |
| "grad_norm": 191926.703125, |
| "learning_rate": 5.693215339233039e-06, |
| "loss": 0.3893, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.46350974930362115, |
| "grad_norm": 163396.796875, |
| "learning_rate": 5.6814159292035395e-06, |
| "loss": 0.4141, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.4646239554317549, |
| "grad_norm": 192465.65625, |
| "learning_rate": 5.669616519174042e-06, |
| "loss": 0.4562, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.46573816155988856, |
| "grad_norm": 218218.90625, |
| "learning_rate": 5.657817109144543e-06, |
| "loss": 0.4242, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.4668523676880223, |
| "grad_norm": 202565.15625, |
| "learning_rate": 5.6460176991150455e-06, |
| "loss": 0.4162, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.467966573816156, |
| "grad_norm": 173776.546875, |
| "learning_rate": 5.634218289085546e-06, |
| "loss": 0.3934, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.4690807799442897, |
| "grad_norm": 176373.203125, |
| "learning_rate": 5.6224188790560475e-06, |
| "loss": 0.4091, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.4701949860724234, |
| "grad_norm": 212039.5625, |
| "learning_rate": 5.610619469026549e-06, |
| "loss": 0.4188, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.4713091922005571, |
| "grad_norm": 197447.328125, |
| "learning_rate": 5.598820058997051e-06, |
| "loss": 0.3962, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.4724233983286908, |
| "grad_norm": 173795.234375, |
| "learning_rate": 5.587020648967552e-06, |
| "loss": 0.3441, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.4735376044568245, |
| "grad_norm": 210427.46875, |
| "learning_rate": 5.575221238938053e-06, |
| "loss": 0.409, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.4746518105849582, |
| "grad_norm": 185594.359375, |
| "learning_rate": 5.5634218289085554e-06, |
| "loss": 0.4243, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.47576601671309193, |
| "grad_norm": 160850.046875, |
| "learning_rate": 5.551622418879057e-06, |
| "loss": 0.4024, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.4768802228412256, |
| "grad_norm": 165540.953125, |
| "learning_rate": 5.5398230088495574e-06, |
| "loss": 0.4727, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.47799442896935934, |
| "grad_norm": 207796.59375, |
| "learning_rate": 5.528023598820059e-06, |
| "loss": 0.4451, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.479108635097493, |
| "grad_norm": 200276.796875, |
| "learning_rate": 5.516224188790561e-06, |
| "loss": 0.5015, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.48022284122562675, |
| "grad_norm": 177270.5625, |
| "learning_rate": 5.5044247787610626e-06, |
| "loss": 0.4163, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.4813370473537604, |
| "grad_norm": 169286.03125, |
| "learning_rate": 5.492625368731563e-06, |
| "loss": 0.4025, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.48245125348189416, |
| "grad_norm": 159905.65625, |
| "learning_rate": 5.480825958702065e-06, |
| "loss": 0.4096, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.48356545961002784, |
| "grad_norm": 168440.375, |
| "learning_rate": 5.469026548672567e-06, |
| "loss": 0.4202, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.48467966573816157, |
| "grad_norm": 168913.28125, |
| "learning_rate": 5.457227138643068e-06, |
| "loss": 0.3979, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.48579387186629525, |
| "grad_norm": 165448.90625, |
| "learning_rate": 5.44542772861357e-06, |
| "loss": 0.4168, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.486908077994429, |
| "grad_norm": 183158.03125, |
| "learning_rate": 5.433628318584071e-06, |
| "loss": 0.3508, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.48802228412256266, |
| "grad_norm": 194381.09375, |
| "learning_rate": 5.4218289085545725e-06, |
| "loss": 0.4704, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.4891364902506964, |
| "grad_norm": 179988.703125, |
| "learning_rate": 5.410029498525075e-06, |
| "loss": 0.3706, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.49025069637883006, |
| "grad_norm": 150438.375, |
| "learning_rate": 5.398230088495575e-06, |
| "loss": 0.4081, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.4913649025069638, |
| "grad_norm": 151454.5625, |
| "learning_rate": 5.386430678466077e-06, |
| "loss": 0.409, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.4924791086350975, |
| "grad_norm": 183632.453125, |
| "learning_rate": 5.374631268436579e-06, |
| "loss": 0.454, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.4935933147632312, |
| "grad_norm": 159092.578125, |
| "learning_rate": 5.3628318584070805e-06, |
| "loss": 0.4194, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.4947075208913649, |
| "grad_norm": 165830.4375, |
| "learning_rate": 5.351032448377581e-06, |
| "loss": 0.3769, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.4958217270194986, |
| "grad_norm": 155446.09375, |
| "learning_rate": 5.3392330383480825e-06, |
| "loss": 0.3807, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.4969359331476323, |
| "grad_norm": 154877.59375, |
| "learning_rate": 5.327433628318585e-06, |
| "loss": 0.4705, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.498050139275766, |
| "grad_norm": 248433.34375, |
| "learning_rate": 5.315634218289086e-06, |
| "loss": 0.4605, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.4991643454038997, |
| "grad_norm": 173858.453125, |
| "learning_rate": 5.303834808259587e-06, |
| "loss": 0.4343, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.5002785515320334, |
| "grad_norm": 173157.5625, |
| "learning_rate": 5.292035398230089e-06, |
| "loss": 0.4337, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.5013927576601671, |
| "grad_norm": 173380.3125, |
| "learning_rate": 5.2802359882005905e-06, |
| "loss": 0.3652, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5025069637883008, |
| "grad_norm": 173306.734375, |
| "learning_rate": 5.268436578171092e-06, |
| "loss": 0.4024, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.5036211699164346, |
| "grad_norm": 162839.046875, |
| "learning_rate": 5.2566371681415925e-06, |
| "loss": 0.424, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.5047353760445682, |
| "grad_norm": 188280.625, |
| "learning_rate": 5.244837758112095e-06, |
| "loss": 0.4393, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.5058495821727019, |
| "grad_norm": 151708.1875, |
| "learning_rate": 5.233038348082596e-06, |
| "loss": 0.3599, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.5069637883008357, |
| "grad_norm": 212805.03125, |
| "learning_rate": 5.2212389380530985e-06, |
| "loss": 0.4578, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.5080779944289694, |
| "grad_norm": 157613.765625, |
| "learning_rate": 5.209439528023599e-06, |
| "loss": 0.4536, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.509192200557103, |
| "grad_norm": 183449.875, |
| "learning_rate": 5.1976401179941005e-06, |
| "loss": 0.4293, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.5103064066852367, |
| "grad_norm": 195998.046875, |
| "learning_rate": 5.185840707964602e-06, |
| "loss": 0.4089, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.5114206128133705, |
| "grad_norm": 168456.9375, |
| "learning_rate": 5.174041297935104e-06, |
| "loss": 0.4112, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.5125348189415042, |
| "grad_norm": 151747.875, |
| "learning_rate": 5.162241887905605e-06, |
| "loss": 0.4146, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5136490250696378, |
| "grad_norm": 181618.5, |
| "learning_rate": 5.150442477876106e-06, |
| "loss": 0.4189, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.5147632311977716, |
| "grad_norm": 193827.984375, |
| "learning_rate": 5.1386430678466084e-06, |
| "loss": 0.4167, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.5158774373259053, |
| "grad_norm": 171558.84375, |
| "learning_rate": 5.12684365781711e-06, |
| "loss": 0.4338, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.516991643454039, |
| "grad_norm": 175443.65625, |
| "learning_rate": 5.1150442477876105e-06, |
| "loss": 0.3747, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.5181058495821727, |
| "grad_norm": 171425.59375, |
| "learning_rate": 5.103244837758113e-06, |
| "loss": 0.4327, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.5192200557103064, |
| "grad_norm": 179066.078125, |
| "learning_rate": 5.091445427728614e-06, |
| "loss": 0.4302, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.5203342618384401, |
| "grad_norm": 174975.703125, |
| "learning_rate": 5.0796460176991156e-06, |
| "loss": 0.4329, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.5214484679665738, |
| "grad_norm": 195037.359375, |
| "learning_rate": 5.067846607669616e-06, |
| "loss": 0.4322, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.5225626740947075, |
| "grad_norm": 175203.125, |
| "learning_rate": 5.0560471976401184e-06, |
| "loss": 0.4506, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.5236768802228412, |
| "grad_norm": 172270.53125, |
| "learning_rate": 5.04424778761062e-06, |
| "loss": 0.3653, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5247910863509749, |
| "grad_norm": 203312.03125, |
| "learning_rate": 5.032448377581122e-06, |
| "loss": 0.4235, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.5259052924791087, |
| "grad_norm": 193338.390625, |
| "learning_rate": 5.020648967551623e-06, |
| "loss": 0.497, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.5270194986072423, |
| "grad_norm": 181869.234375, |
| "learning_rate": 5.008849557522124e-06, |
| "loss": 0.4106, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.528133704735376, |
| "grad_norm": 185743.0, |
| "learning_rate": 4.9970501474926256e-06, |
| "loss": 0.4143, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.5292479108635098, |
| "grad_norm": 187086.46875, |
| "learning_rate": 4.985250737463127e-06, |
| "loss": 0.4683, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.5303621169916435, |
| "grad_norm": 203731.9375, |
| "learning_rate": 4.973451327433628e-06, |
| "loss": 0.4548, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.5314763231197771, |
| "grad_norm": 175473.453125, |
| "learning_rate": 4.96165191740413e-06, |
| "loss": 0.4167, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.5325905292479108, |
| "grad_norm": 181863.0625, |
| "learning_rate": 4.949852507374632e-06, |
| "loss": 0.4219, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.5337047353760446, |
| "grad_norm": 170986.390625, |
| "learning_rate": 4.938053097345133e-06, |
| "loss": 0.4189, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.5348189415041783, |
| "grad_norm": 172215.15625, |
| "learning_rate": 4.926253687315635e-06, |
| "loss": 0.4129, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5359331476323119, |
| "grad_norm": 180747.453125, |
| "learning_rate": 4.9144542772861355e-06, |
| "loss": 0.4347, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.5370473537604457, |
| "grad_norm": 176216.703125, |
| "learning_rate": 4.902654867256638e-06, |
| "loss": 0.4296, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.5381615598885794, |
| "grad_norm": 206918.265625, |
| "learning_rate": 4.890855457227139e-06, |
| "loss": 0.4342, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.5392757660167131, |
| "grad_norm": 184958.484375, |
| "learning_rate": 4.879056047197641e-06, |
| "loss": 0.4113, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.5403899721448467, |
| "grad_norm": 140585.65625, |
| "learning_rate": 4.867256637168142e-06, |
| "loss": 0.449, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.5415041782729805, |
| "grad_norm": 192482.1875, |
| "learning_rate": 4.8554572271386435e-06, |
| "loss": 0.3664, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.5426183844011142, |
| "grad_norm": 191947.328125, |
| "learning_rate": 4.843657817109145e-06, |
| "loss": 0.4395, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.5437325905292479, |
| "grad_norm": 202158.28125, |
| "learning_rate": 4.831858407079646e-06, |
| "loss": 0.4609, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.5448467966573816, |
| "grad_norm": 147651.859375, |
| "learning_rate": 4.820058997050148e-06, |
| "loss": 0.4156, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.5459610027855153, |
| "grad_norm": 176702.09375, |
| "learning_rate": 4.808259587020649e-06, |
| "loss": 0.3789, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.547075208913649, |
| "grad_norm": 196184.625, |
| "learning_rate": 4.796460176991151e-06, |
| "loss": 0.4384, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.5481894150417828, |
| "grad_norm": 188953.125, |
| "learning_rate": 4.784660766961652e-06, |
| "loss": 0.4238, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.5493036211699165, |
| "grad_norm": 202452.03125, |
| "learning_rate": 4.7728613569321535e-06, |
| "loss": 0.4208, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.5504178272980501, |
| "grad_norm": 196150.65625, |
| "learning_rate": 4.761061946902655e-06, |
| "loss": 0.4011, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.5515320334261838, |
| "grad_norm": 156776.75, |
| "learning_rate": 4.749262536873156e-06, |
| "loss": 0.3853, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.5526462395543176, |
| "grad_norm": 178294.34375, |
| "learning_rate": 4.737463126843659e-06, |
| "loss": 0.4161, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.5537604456824513, |
| "grad_norm": 191155.40625, |
| "learning_rate": 4.725663716814159e-06, |
| "loss": 0.3637, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.5548746518105849, |
| "grad_norm": 168218.421875, |
| "learning_rate": 4.7138643067846615e-06, |
| "loss": 0.4271, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.5559888579387187, |
| "grad_norm": 157993.203125, |
| "learning_rate": 4.702064896755162e-06, |
| "loss": 0.4609, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.5571030640668524, |
| "grad_norm": 177861.5625, |
| "learning_rate": 4.690265486725664e-06, |
| "loss": 0.3956, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 8975, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.707023777792e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|