| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.99927797833935, |
| "eval_steps": 500, |
| "global_step": 12462, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0012033694344163659, |
| "grad_norm": 5.875, |
| "learning_rate": 1.1482631771428573e-05, |
| "loss": 1.9783, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0024067388688327317, |
| "grad_norm": 3.296875, |
| "learning_rate": 2.5835921485714284e-05, |
| "loss": 1.9013, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0036101083032490976, |
| "grad_norm": 3.03125, |
| "learning_rate": 4.0189211200000005e-05, |
| "loss": 1.8287, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0048134777376654635, |
| "grad_norm": 2.75, |
| "learning_rate": 5.454250091428571e-05, |
| "loss": 1.7177, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.006016847172081829, |
| "grad_norm": 2.734375, |
| "learning_rate": 6.889579062857144e-05, |
| "loss": 1.5836, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.007220216606498195, |
| "grad_norm": 3.0, |
| "learning_rate": 8.324908034285715e-05, |
| "loss": 1.5078, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.00842358604091456, |
| "grad_norm": 3.125, |
| "learning_rate": 9.760237005714287e-05, |
| "loss": 1.4619, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.009626955475330927, |
| "grad_norm": 3.15625, |
| "learning_rate": 0.0001004730087456795, |
| "loss": 1.447, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.010830324909747292, |
| "grad_norm": 2.9375, |
| "learning_rate": 0.00010047293052503616, |
| "loss": 1.3868, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.012033694344163659, |
| "grad_norm": 2.671875, |
| "learning_rate": 0.0001004727921347998, |
| "loss": 1.3767, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.013237063778580024, |
| "grad_norm": 2.578125, |
| "learning_rate": 0.00010047259357519138, |
| "loss": 1.3253, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.01444043321299639, |
| "grad_norm": 2.734375, |
| "learning_rate": 0.00010047233484652802, |
| "loss": 1.3186, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.015643802647412757, |
| "grad_norm": 2.75, |
| "learning_rate": 0.00010047201594922292, |
| "loss": 1.2852, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.01684717208182912, |
| "grad_norm": 2.734375, |
| "learning_rate": 0.00010047163688378533, |
| "loss": 1.2451, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.018050541516245487, |
| "grad_norm": 2.328125, |
| "learning_rate": 0.0001004711976508206, |
| "loss": 1.2399, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.019253910950661854, |
| "grad_norm": 2.390625, |
| "learning_rate": 0.00010047069825103019, |
| "loss": 1.2288, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.02045728038507822, |
| "grad_norm": 2.53125, |
| "learning_rate": 0.00010047013868521161, |
| "loss": 1.2198, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.021660649819494584, |
| "grad_norm": 2.5625, |
| "learning_rate": 0.00010046951895425849, |
| "loss": 1.2158, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.02286401925391095, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.00010046883905916052, |
| "loss": 1.2246, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.024067388688327317, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.00010046809900100347, |
| "loss": 1.1625, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.02527075812274368, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.00010046729878096918, |
| "loss": 1.1858, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.026474127557160047, |
| "grad_norm": 2.625, |
| "learning_rate": 0.0001004664384003356, |
| "loss": 1.1756, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.027677496991576414, |
| "grad_norm": 2.5, |
| "learning_rate": 0.00010046551786047672, |
| "loss": 1.1696, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.02888086642599278, |
| "grad_norm": 2.390625, |
| "learning_rate": 0.00010046453716286263, |
| "loss": 1.1177, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.030084235860409144, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00010046349630905946, |
| "loss": 1.1426, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.031287605294825514, |
| "grad_norm": 2.609375, |
| "learning_rate": 0.00010046239530072942, |
| "loss": 1.1572, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.032490974729241874, |
| "grad_norm": 2.546875, |
| "learning_rate": 0.00010046123413963083, |
| "loss": 1.1154, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.03369434416365824, |
| "grad_norm": 2.46875, |
| "learning_rate": 0.00010046001282761799, |
| "loss": 1.1424, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.03489771359807461, |
| "grad_norm": 2.921875, |
| "learning_rate": 0.00010045873136664134, |
| "loss": 1.1045, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.036101083032490974, |
| "grad_norm": 2.59375, |
| "learning_rate": 0.00010045738975874728, |
| "loss": 1.1311, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.03730445246690734, |
| "grad_norm": 2.546875, |
| "learning_rate": 0.00010045598800607839, |
| "loss": 1.1298, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.03850782190132371, |
| "grad_norm": 2.53125, |
| "learning_rate": 0.00010045452611087318, |
| "loss": 1.0965, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.039711191335740074, |
| "grad_norm": 2.75, |
| "learning_rate": 0.00010045300407546628, |
| "loss": 1.0964, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.04091456077015644, |
| "grad_norm": 2.984375, |
| "learning_rate": 0.00010045142190228834, |
| "loss": 1.0847, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0421179302045728, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.00010044977959386601, |
| "loss": 1.073, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.04332129963898917, |
| "grad_norm": 2.671875, |
| "learning_rate": 0.00010044807715282207, |
| "loss": 1.1091, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.044524669073405534, |
| "grad_norm": 2.296875, |
| "learning_rate": 0.00010044631458187523, |
| "loss": 1.0794, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.0457280385078219, |
| "grad_norm": 2.453125, |
| "learning_rate": 0.00010044449188384028, |
| "loss": 1.1212, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.04693140794223827, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.00010044260906162802, |
| "loss": 1.0835, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.048134777376654635, |
| "grad_norm": 2.5, |
| "learning_rate": 0.00010044066611824526, |
| "loss": 1.0714, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.049338146811071, |
| "grad_norm": 2.328125, |
| "learning_rate": 0.00010043866305679484, |
| "loss": 1.0716, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.05054151624548736, |
| "grad_norm": 2.46875, |
| "learning_rate": 0.00010043659988047559, |
| "loss": 1.0586, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.05174488567990373, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.00010043447659258233, |
| "loss": 1.0528, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.052948255114320095, |
| "grad_norm": 3.03125, |
| "learning_rate": 0.0001004322931965059, |
| "loss": 1.0601, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.05415162454873646, |
| "grad_norm": 2.59375, |
| "learning_rate": 0.00010043004969573312, |
| "loss": 1.0495, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.05535499398315283, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.00010042774609384681, |
| "loss": 1.058, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.056558363417569195, |
| "grad_norm": 2.578125, |
| "learning_rate": 0.00010042538239452578, |
| "loss": 1.0285, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.05776173285198556, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.00010042295860154474, |
| "loss": 1.0332, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.05896510228640193, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.00010042047471877447, |
| "loss": 1.0177, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.06016847172081829, |
| "grad_norm": 2.796875, |
| "learning_rate": 0.00010041793075018161, |
| "loss": 1.0364, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.061371841155234655, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.00010041532669982886, |
| "loss": 1.0268, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.06257521058965103, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.00010041266257187478, |
| "loss": 0.9869, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.06377858002406739, |
| "grad_norm": 2.703125, |
| "learning_rate": 0.00010040993837057395, |
| "loss": 1.0372, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.06498194945848375, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.00010040715410027679, |
| "loss": 0.987, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.06618531889290012, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.00010040430976542976, |
| "loss": 1.0099, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.06738868832731648, |
| "grad_norm": 2.734375, |
| "learning_rate": 0.00010040140537057514, |
| "loss": 1.0188, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.06859205776173286, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.00010039844092035118, |
| "loss": 1.0453, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.06979542719614922, |
| "grad_norm": 2.296875, |
| "learning_rate": 0.00010039541641949205, |
| "loss": 1.0081, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.07099879663056559, |
| "grad_norm": 2.46875, |
| "learning_rate": 0.00010039233187282776, |
| "loss": 0.994, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.07220216606498195, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00010038918728528426, |
| "loss": 1.0029, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.07340553549939831, |
| "grad_norm": 2.453125, |
| "learning_rate": 0.00010038598266188339, |
| "loss": 1.0095, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.07460890493381468, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00010038271800774282, |
| "loss": 0.9699, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.07581227436823104, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.0001003793933280761, |
| "loss": 0.9616, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.07701564380264742, |
| "grad_norm": 2.34375, |
| "learning_rate": 0.0001003760086281927, |
| "loss": 1.0072, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.07821901323706378, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00010037256391349785, |
| "loss": 0.9806, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.07942238267148015, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.00010036905918949269, |
| "loss": 0.9474, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.08062575210589651, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.00010036549446177414, |
| "loss": 1.0093, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.08182912154031288, |
| "grad_norm": 2.5625, |
| "learning_rate": 0.000100361869736035, |
| "loss": 0.952, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.08303249097472924, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00010035818501806385, |
| "loss": 0.9729, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.0842358604091456, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00010035444031374507, |
| "loss": 0.947, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.08543922984356198, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00010035063562905885, |
| "loss": 0.9918, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.08664259927797834, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00010034677097008121, |
| "loss": 0.9803, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.08784596871239471, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00010034284634298385, |
| "loss": 0.9303, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.08904933814681107, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.0001003388617540343, |
| "loss": 0.9267, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.09025270758122744, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.00010033481720959588, |
| "loss": 0.9313, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.0914560770156438, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.00010033071271612757, |
| "loss": 0.9154, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.09265944645006016, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.00010032654828018417, |
| "loss": 0.9725, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.09386281588447654, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.00010032232390841613, |
| "loss": 0.9499, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.0950661853188929, |
| "grad_norm": 2.359375, |
| "learning_rate": 0.00010031803960756968, |
| "loss": 0.926, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.09626955475330927, |
| "grad_norm": 2.359375, |
| "learning_rate": 0.00010031369538448673, |
| "loss": 0.9864, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.09747292418772563, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.00010030929124610487, |
| "loss": 0.9206, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.098676293622142, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.0001003048271994574, |
| "loss": 0.9297, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.09987966305655836, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.0001003003032516733, |
| "loss": 0.942, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.10108303249097472, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.00010029571940997716, |
| "loss": 0.937, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.1022864019253911, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00010029107568168925, |
| "loss": 0.9598, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.10348977135980746, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.0001002863720742255, |
| "loss": 0.9444, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.10469314079422383, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.00010028160859509742, |
| "loss": 0.9544, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.10589651022864019, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.00010027678525191216, |
| "loss": 0.9032, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.10709987966305656, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00010027190205237246, |
| "loss": 0.9468, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.10830324909747292, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00010026695900427668, |
| "loss": 0.9244, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1095066185318893, |
| "grad_norm": 2.390625, |
| "learning_rate": 0.00010026195611551872, |
| "loss": 0.9286, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.11070998796630566, |
| "grad_norm": 2.390625, |
| "learning_rate": 0.00010025689339408803, |
| "loss": 0.9255, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.11191335740072202, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.00010025177084806967, |
| "loss": 0.9652, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.11311672683513839, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00010024658848564418, |
| "loss": 0.921, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.11432009626955475, |
| "grad_norm": 2.546875, |
| "learning_rate": 0.00010024134631508768, |
| "loss": 0.9406, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.11552346570397112, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00010023604434477176, |
| "loss": 0.9309, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.11672683513838748, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.00010023068258316352, |
| "loss": 0.925, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.11793020457280386, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00010022526103882553, |
| "loss": 0.8907, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.11913357400722022, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00010021977972041586, |
| "loss": 0.9072, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.12033694344163658, |
| "grad_norm": 2.359375, |
| "learning_rate": 0.00010021423863668802, |
| "loss": 0.8906, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.12033694344163658, |
| "eval_loss": 0.8048076033592224, |
| "eval_runtime": 2.7309, |
| "eval_samples_per_second": 73.235, |
| "eval_steps_per_second": 73.235, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.12154031287605295, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.000100208637796491, |
| "loss": 0.8702, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.12274368231046931, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.00010020297720876913, |
| "loss": 0.9234, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.12394705174488568, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00010019725688256226, |
| "loss": 0.9286, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.12515042117930206, |
| "grad_norm": 2.578125, |
| "learning_rate": 0.00010019147682700556, |
| "loss": 0.9069, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.1263537906137184, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00010018563705132966, |
| "loss": 0.8735, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.12755716004813478, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00010017973756486048, |
| "loss": 0.9177, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.12876052948255115, |
| "grad_norm": 2.34375, |
| "learning_rate": 0.00010017377837701939, |
| "loss": 0.8844, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.1299638989169675, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00010016775949732298, |
| "loss": 0.9013, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.13116726835138387, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.0001001616809353833, |
| "loss": 0.8743, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.13237063778580024, |
| "grad_norm": 2.546875, |
| "learning_rate": 0.00010015554270090759, |
| "loss": 0.9059, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.13357400722021662, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.00010014934480369846, |
| "loss": 0.8756, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.13477737665463296, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00010014308725365383, |
| "loss": 0.8941, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.13598074608904934, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00010013677006076677, |
| "loss": 0.8459, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.1371841155234657, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.00010013039323512569, |
| "loss": 0.8917, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.13838748495788206, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.00010012395678691421, |
| "loss": 0.8788, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.13959085439229843, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.00010011746072641115, |
| "loss": 0.878, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.1407942238267148, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.00010011090506399054, |
| "loss": 0.8567, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.14199759326113118, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.00010010428981012161, |
| "loss": 0.9212, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.14320096269554752, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00010009761497536873, |
| "loss": 0.8739, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.1444043321299639, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00010009088057039142, |
| "loss": 0.8645, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.14560770156438027, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.00010008408660594433, |
| "loss": 0.8443, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.14681107099879662, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.00010007723309287726, |
| "loss": 0.8587, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.148014440433213, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.00010007032004213507, |
| "loss": 0.9038, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.14921780986762936, |
| "grad_norm": 2.90625, |
| "learning_rate": 0.0001000633474647577, |
| "loss": 0.8359, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.15042117930204574, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.0001000563153718802, |
| "loss": 0.877, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.15162454873646208, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.00010004922377473258, |
| "loss": 0.8872, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.15282791817087846, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.00010004207268463997, |
| "loss": 0.85, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.15403128760529483, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00010003486211302243, |
| "loss": 0.8759, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.1552346570397112, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00010002759207139508, |
| "loss": 0.8689, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.15643802647412755, |
| "grad_norm": 2.359375, |
| "learning_rate": 0.00010002026257136792, |
| "loss": 0.9227, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.15764139590854392, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.00010001287362464602, |
| "loss": 0.8438, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.1588447653429603, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.0001000054252430293, |
| "loss": 0.8453, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.16004813477737664, |
| "grad_norm": 2.453125, |
| "learning_rate": 9.999791743841263e-05, |
| "loss": 0.8422, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.16125150421179302, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.999035022278575e-05, |
| "loss": 0.866, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.1624548736462094, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.99827236082333e-05, |
| "loss": 0.8817, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.16365824308062576, |
| "grad_norm": 2.234375, |
| "learning_rate": 9.997503760693478e-05, |
| "loss": 0.9046, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.1648616125150421, |
| "grad_norm": 2.40625, |
| "learning_rate": 9.996729223116452e-05, |
| "loss": 0.8713, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.16606498194945848, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.995948749329168e-05, |
| "loss": 0.8363, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.16726835138387486, |
| "grad_norm": 2.3125, |
| "learning_rate": 9.99516234057802e-05, |
| "loss": 0.8764, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.1684717208182912, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.994369998118882e-05, |
| "loss": 0.8316, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.16967509025270758, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.993571723217103e-05, |
| "loss": 0.8533, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.17087845968712395, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.992767517147505e-05, |
| "loss": 0.8919, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.17208182912154033, |
| "grad_norm": 2.390625, |
| "learning_rate": 9.991957381194385e-05, |
| "loss": 0.8414, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.17328519855595667, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.991141316651505e-05, |
| "loss": 0.8667, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.17448856799037304, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.990319324822101e-05, |
| "loss": 0.8545, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.17569193742478942, |
| "grad_norm": 2.4375, |
| "learning_rate": 9.989491407018868e-05, |
| "loss": 0.8622, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.17689530685920576, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.98865756456397e-05, |
| "loss": 0.8522, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.17809867629362214, |
| "grad_norm": 2.453125, |
| "learning_rate": 9.98781779878903e-05, |
| "loss": 0.8441, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.1793020457280385, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.986972111035132e-05, |
| "loss": 0.8572, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.18050541516245489, |
| "grad_norm": 2.296875, |
| "learning_rate": 9.986120502652816e-05, |
| "loss": 0.8486, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.18170878459687123, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.985262975002073e-05, |
| "loss": 0.8385, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.1829121540312876, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.984399529452357e-05, |
| "loss": 0.8344, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.18411552346570398, |
| "grad_norm": 2.125, |
| "learning_rate": 9.983530167382562e-05, |
| "loss": 0.8603, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.18531889290012032, |
| "grad_norm": 2.421875, |
| "learning_rate": 9.98265489018104e-05, |
| "loss": 0.8683, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.1865222623345367, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.981773699245579e-05, |
| "loss": 0.8596, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.18772563176895307, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.980886595983423e-05, |
| "loss": 0.8547, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.18892900120336945, |
| "grad_norm": 2.0, |
| "learning_rate": 9.979993581811245e-05, |
| "loss": 0.8407, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.1901323706377858, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.979094658155169e-05, |
| "loss": 0.8383, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.19133574007220217, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.978189826450748e-05, |
| "loss": 0.8117, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.19253910950661854, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.977279088142978e-05, |
| "loss": 0.8503, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.19374247894103488, |
| "grad_norm": 2.28125, |
| "learning_rate": 9.976362444686279e-05, |
| "loss": 0.8837, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.19494584837545126, |
| "grad_norm": 2.125, |
| "learning_rate": 9.975439897544506e-05, |
| "loss": 0.886, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.19614921780986763, |
| "grad_norm": 2.125, |
| "learning_rate": 9.974511448190943e-05, |
| "loss": 0.8254, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.197352587244284, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.973577098108294e-05, |
| "loss": 0.8184, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.19855595667870035, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.972636848788696e-05, |
| "loss": 0.8493, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.19975932611311673, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.971690701733699e-05, |
| "loss": 0.8167, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.2009626955475331, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.970738658454271e-05, |
| "loss": 0.8, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.20216606498194944, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.969780720470804e-05, |
| "loss": 0.8075, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.20336943441636582, |
| "grad_norm": 2.3125, |
| "learning_rate": 9.968816889313095e-05, |
| "loss": 0.8419, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.2045728038507822, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.967847166520357e-05, |
| "loss": 0.8243, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.20577617328519857, |
| "grad_norm": 2.453125, |
| "learning_rate": 9.966871553641211e-05, |
| "loss": 0.8059, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.2069795427196149, |
| "grad_norm": 2.0, |
| "learning_rate": 9.965890052233684e-05, |
| "loss": 0.8207, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.20818291215403129, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.964902663865205e-05, |
| "loss": 0.8258, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.20938628158844766, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.963909390112608e-05, |
| "loss": 0.8312, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.21058965102286403, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.962910232562123e-05, |
| "loss": 0.8332, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.21179302045728038, |
| "grad_norm": 2.25, |
| "learning_rate": 9.961905192809377e-05, |
| "loss": 0.8314, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.21299638989169675, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.960894272459392e-05, |
| "loss": 0.8296, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.21419975932611313, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.959877473126578e-05, |
| "loss": 0.8368, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.21540312876052947, |
| "grad_norm": 2.0, |
| "learning_rate": 9.958854796434738e-05, |
| "loss": 0.8194, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.21660649819494585, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.957826244017058e-05, |
| "loss": 0.8188, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.21780986762936222, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.956791817516107e-05, |
| "loss": 0.7867, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.2190132370637786, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.955751518583835e-05, |
| "loss": 0.7898, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.22021660649819494, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.954705348881573e-05, |
| "loss": 0.8214, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.2214199759326113, |
| "grad_norm": 2.46875, |
| "learning_rate": 9.953653310080022e-05, |
| "loss": 0.8218, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.2226233453670277, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.952595403859263e-05, |
| "loss": 0.7663, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.22382671480144403, |
| "grad_norm": 2.28125, |
| "learning_rate": 9.951531631908739e-05, |
| "loss": 0.7601, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.2250300842358604, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.95046199592727e-05, |
| "loss": 0.7777, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.22623345367027678, |
| "grad_norm": 2.265625, |
| "learning_rate": 9.94938649762303e-05, |
| "loss": 0.8109, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.22743682310469315, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.948305138713562e-05, |
| "loss": 0.8333, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.2286401925391095, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.947217920925766e-05, |
| "loss": 0.8006, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.22984356197352587, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.946124845995902e-05, |
| "loss": 0.8408, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.23104693140794225, |
| "grad_norm": 2.484375, |
| "learning_rate": 9.945025915669577e-05, |
| "loss": 0.798, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.2322503008423586, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.943921131701754e-05, |
| "loss": 0.7899, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.23345367027677497, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.942810495856742e-05, |
| "loss": 0.8247, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.23465703971119134, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.941694009908194e-05, |
| "loss": 0.7924, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.2358604091456077, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.940571675639114e-05, |
| "loss": 0.7573, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.23706377858002406, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.939443494841831e-05, |
| "loss": 0.825, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.23826714801444043, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.938309469318024e-05, |
| "loss": 0.8057, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.2394705174488568, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.937169600878699e-05, |
| "loss": 0.7873, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.24067388688327315, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.936023891344194e-05, |
| "loss": 0.8192, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.24067388688327315, |
| "eval_loss": 0.7019978165626526, |
| "eval_runtime": 2.6886, |
| "eval_samples_per_second": 74.389, |
| "eval_steps_per_second": 74.389, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.24187725631768953, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.934872342544176e-05, |
| "loss": 0.7758, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.2430806257521059, |
| "grad_norm": 2.34375, |
| "learning_rate": 9.933714956317638e-05, |
| "loss": 0.8306, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.24428399518652227, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.932551734512891e-05, |
| "loss": 0.7868, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.24548736462093862, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.931382678987572e-05, |
| "loss": 0.7612, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.246690734055355, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.930207791608625e-05, |
| "loss": 0.7586, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.24789410348977137, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.92902707425232e-05, |
| "loss": 0.8066, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.2490974729241877, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.927840528804226e-05, |
| "loss": 0.8022, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.2503008423586041, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.926648157159222e-05, |
| "loss": 0.7819, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.25150421179302046, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.925449961221496e-05, |
| "loss": 0.804, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.2527075812274368, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.924245942904534e-05, |
| "loss": 0.7823, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.2539109506618532, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.923036104131117e-05, |
| "loss": 0.7887, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.25511432009626955, |
| "grad_norm": 2.171875, |
| "learning_rate": 9.921820446833328e-05, |
| "loss": 0.8386, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.2563176895306859, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.920598972952534e-05, |
| "loss": 0.7996, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.2575210589651023, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.919371684439401e-05, |
| "loss": 0.7612, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.25872442839951865, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.918138583253869e-05, |
| "loss": 0.8034, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.259927797833935, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.91689967136517e-05, |
| "loss": 0.7952, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.2611311672683514, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.915654950751811e-05, |
| "loss": 0.8253, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.26233453670276774, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.914404423401577e-05, |
| "loss": 0.7845, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.26353790613718414, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.913148091311526e-05, |
| "loss": 0.7825, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.2647412755716005, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.911885956487988e-05, |
| "loss": 0.767, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.26594464500601683, |
| "grad_norm": 2.125, |
| "learning_rate": 9.910618020946552e-05, |
| "loss": 0.8171, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.26714801444043323, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.909344286712083e-05, |
| "loss": 0.7493, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.2683513838748496, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.908064755818693e-05, |
| "loss": 0.7957, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.2695547533092659, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.906779430309763e-05, |
| "loss": 0.7947, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.27075812274368233, |
| "grad_norm": 2.125, |
| "learning_rate": 9.90548831223792e-05, |
| "loss": 0.7759, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.2719614921780987, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.904191403665043e-05, |
| "loss": 0.8045, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.273164861612515, |
| "grad_norm": 2.296875, |
| "learning_rate": 9.902888706662262e-05, |
| "loss": 0.784, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.2743682310469314, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.901580223309946e-05, |
| "loss": 0.8168, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.27557160048134777, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.900265955697708e-05, |
| "loss": 0.7761, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.2767749699157641, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.898945905924397e-05, |
| "loss": 0.7857, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.2779783393501805, |
| "grad_norm": 2.359375, |
| "learning_rate": 9.897620076098097e-05, |
| "loss": 0.7853, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.27918170878459686, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.89628846833612e-05, |
| "loss": 0.7904, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.28038507821901326, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.894951084765009e-05, |
| "loss": 0.7402, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.2815884476534296, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.893607927520527e-05, |
| "loss": 0.7577, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.28279181708784595, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.892258998747662e-05, |
| "loss": 0.8035, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.28399518652226236, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.890904300600613e-05, |
| "loss": 0.7901, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.2851985559566787, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.889543835242796e-05, |
| "loss": 0.7655, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.28640192539109505, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.888177604846838e-05, |
| "loss": 0.7969, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.28760529482551145, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.886805611594573e-05, |
| "loss": 0.7737, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.2888086642599278, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.885427857677032e-05, |
| "loss": 0.8114, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.29001203369434414, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.884044345294454e-05, |
| "loss": 0.7722, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.29121540312876054, |
| "grad_norm": 1.875, |
| "learning_rate": 9.882655076656269e-05, |
| "loss": 0.7765, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.2924187725631769, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.8812600539811e-05, |
| "loss": 0.7874, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.29362214199759323, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.879859279496761e-05, |
| "loss": 0.7992, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.29482551143200963, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.87845275544025e-05, |
| "loss": 0.7625, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.296028880866426, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.877040484057748e-05, |
| "loss": 0.8007, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.2972322503008424, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.875622467604612e-05, |
| "loss": 0.7735, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.29843561973525873, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.874198708345375e-05, |
| "loss": 0.7712, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.2996389891696751, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.872769208553744e-05, |
| "loss": 0.7869, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.3008423586040915, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.871333970512589e-05, |
| "loss": 0.7851, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3020457280385078, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.869892996513945e-05, |
| "loss": 0.7396, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.30324909747292417, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.86844628885901e-05, |
| "loss": 0.8024, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.30445246690734057, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.866993849858135e-05, |
| "loss": 0.764, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.3056558363417569, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.865535681830825e-05, |
| "loss": 0.7776, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.30685920577617326, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.864071787105735e-05, |
| "loss": 0.7484, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.30806257521058966, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.862602168020664e-05, |
| "loss": 0.7562, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.309265944645006, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.861126826922553e-05, |
| "loss": 0.7467, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.3104693140794224, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.859645766167482e-05, |
| "loss": 0.7581, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.31167268351383876, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.858158988120664e-05, |
| "loss": 0.7602, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.3128760529482551, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.856666495156442e-05, |
| "loss": 0.7553, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3140794223826715, |
| "grad_norm": 2.125, |
| "learning_rate": 9.855168289658285e-05, |
| "loss": 0.7648, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.31528279181708785, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.853664374018793e-05, |
| "loss": 0.7684, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.3164861612515042, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.852154750639669e-05, |
| "loss": 0.7478, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.3176895306859206, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.850639421931745e-05, |
| "loss": 0.7559, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.31889290012033694, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.849118390314957e-05, |
| "loss": 0.7993, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.3200962695547533, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.847591658218353e-05, |
| "loss": 0.7609, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.3212996389891697, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.846059228080081e-05, |
| "loss": 0.7558, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.32250300842358604, |
| "grad_norm": 2.0, |
| "learning_rate": 9.844521102347389e-05, |
| "loss": 0.7756, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.3237063778580024, |
| "grad_norm": 2.125, |
| "learning_rate": 9.842977283476621e-05, |
| "loss": 0.778, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.3249097472924188, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.841427773933217e-05, |
| "loss": 0.7515, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.32611311672683513, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.839872576191697e-05, |
| "loss": 0.7545, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.32731648616125153, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.838311692735671e-05, |
| "loss": 0.7668, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.3285198555956679, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.836745126057828e-05, |
| "loss": 0.7554, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.3297232250300842, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.83517287865993e-05, |
| "loss": 0.7413, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.3309265944645006, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.833594953052811e-05, |
| "loss": 0.7243, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.33212996389891697, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.83201135175638e-05, |
| "loss": 0.7439, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.830422077299601e-05, |
| "loss": 0.7836, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.3345367027677497, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.828827132220504e-05, |
| "loss": 0.7121, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.33574007220216606, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.827226519066169e-05, |
| "loss": 0.726, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.3369434416365824, |
| "grad_norm": 1.875, |
| "learning_rate": 9.825620240392733e-05, |
| "loss": 0.7737, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.3381468110709988, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.82400829876538e-05, |
| "loss": 0.7407, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.33935018050541516, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.822390696758336e-05, |
| "loss": 0.765, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.3405535499398315, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.820767436954865e-05, |
| "loss": 0.7207, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.3417569193742479, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.81913852194727e-05, |
| "loss": 0.742, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.34296028880866425, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.817503954336885e-05, |
| "loss": 0.7437, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.34416365824308065, |
| "grad_norm": 2.296875, |
| "learning_rate": 9.815863736734066e-05, |
| "loss": 0.7577, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.345367027677497, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.814217871758198e-05, |
| "loss": 0.7372, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.34657039711191334, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.812566362037682e-05, |
| "loss": 0.7302, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.34777376654632974, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.810909210209932e-05, |
| "loss": 0.7469, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.3489771359807461, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.809246418921374e-05, |
| "loss": 0.7544, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.35018050541516244, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.807577990827442e-05, |
| "loss": 0.7288, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.35138387484957884, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.805903928592567e-05, |
| "loss": 0.7725, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.3525872442839952, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.804224234890185e-05, |
| "loss": 0.7269, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.35379061371841153, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.802538912402715e-05, |
| "loss": 0.7577, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.35499398315282793, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.800847963821573e-05, |
| "loss": 0.7393, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.3561973525872443, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.799151391847158e-05, |
| "loss": 0.7159, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.3574007220216607, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.79744919918885e-05, |
| "loss": 0.7584, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.358604091456077, |
| "grad_norm": 1.875, |
| "learning_rate": 9.795741388565e-05, |
| "loss": 0.7267, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.35980746089049337, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.794027962702939e-05, |
| "loss": 0.7622, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.36101083032490977, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.792308924338958e-05, |
| "loss": 0.7363, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.36101083032490977, |
| "eval_loss": 0.644199550151825, |
| "eval_runtime": 2.6553, |
| "eval_samples_per_second": 75.32, |
| "eval_steps_per_second": 75.32, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3622141997593261, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.790584276218317e-05, |
| "loss": 0.7645, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.36341756919374246, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.788854021095226e-05, |
| "loss": 0.7213, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.36462093862815886, |
| "grad_norm": 1.875, |
| "learning_rate": 9.78711816173286e-05, |
| "loss": 0.7573, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.3658243080625752, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.785376700903338e-05, |
| "loss": 0.7674, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.36702767749699156, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.783629641387724e-05, |
| "loss": 0.7761, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.36823104693140796, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.781876985976023e-05, |
| "loss": 0.7686, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.3694344163658243, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.780118737467178e-05, |
| "loss": 0.7692, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.37063778580024065, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.778354898669067e-05, |
| "loss": 0.7359, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.37184115523465705, |
| "grad_norm": 2.0, |
| "learning_rate": 9.776585472398488e-05, |
| "loss": 0.7126, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.3730445246690734, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.774810461481165e-05, |
| "loss": 0.7319, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.3742478941034898, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.773029868751746e-05, |
| "loss": 0.7645, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.37545126353790614, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.771243697053787e-05, |
| "loss": 0.735, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.3766546329723225, |
| "grad_norm": 2.234375, |
| "learning_rate": 9.769451949239755e-05, |
| "loss": 0.7707, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.3778580024067389, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.767654628171021e-05, |
| "loss": 0.7207, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.37906137184115524, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.765851736717858e-05, |
| "loss": 0.7396, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.3802647412755716, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.764043277759434e-05, |
| "loss": 0.7359, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.381468110709988, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.762229254183808e-05, |
| "loss": 0.7163, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.38267148014440433, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.760409668887927e-05, |
| "loss": 0.7657, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.3838748495788207, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.758584524777617e-05, |
| "loss": 0.7268, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.3850782190132371, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.756753824767585e-05, |
| "loss": 0.6808, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.3862815884476534, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.754917571781406e-05, |
| "loss": 0.7574, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.38748495788206977, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.753075768751528e-05, |
| "loss": 0.7114, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.38868832731648617, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.751228418619257e-05, |
| "loss": 0.7273, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.3898916967509025, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.749375524334765e-05, |
| "loss": 0.7251, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.3910950661853189, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.747517088857068e-05, |
| "loss": 0.7132, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.39229843561973526, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.745653115154038e-05, |
| "loss": 0.7307, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.3935018050541516, |
| "grad_norm": 2.25, |
| "learning_rate": 9.743783606202393e-05, |
| "loss": 0.7415, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.394705174488568, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.741908564987682e-05, |
| "loss": 0.7341, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.39590854392298436, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.740027994504299e-05, |
| "loss": 0.7851, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.3971119133574007, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.73814189775546e-05, |
| "loss": 0.7394, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.3983152827918171, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.736250277753213e-05, |
| "loss": 0.7116, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.39951865222623345, |
| "grad_norm": 1.875, |
| "learning_rate": 9.734353137518419e-05, |
| "loss": 0.7178, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.4007220216606498, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.732450480080762e-05, |
| "loss": 0.7381, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.4019253910950662, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.730542308478733e-05, |
| "loss": 0.7467, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.40312876052948254, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.728628625759628e-05, |
| "loss": 0.7412, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.4043321299638989, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.726709434979548e-05, |
| "loss": 0.7245, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.4055354993983153, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.724784739203386e-05, |
| "loss": 0.7203, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.40673886883273164, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.72285454150483e-05, |
| "loss": 0.735, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.40794223826714804, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.720918844966352e-05, |
| "loss": 0.7185, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.4091456077015644, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.718977652679205e-05, |
| "loss": 0.6864, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.41034897713598073, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.71703096774342e-05, |
| "loss": 0.7182, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.41155234657039713, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.7150787932678e-05, |
| "loss": 0.7583, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.4127557160048135, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.713121132369914e-05, |
| "loss": 0.7215, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.4139590854392298, |
| "grad_norm": 2.125, |
| "learning_rate": 9.711157988176094e-05, |
| "loss": 0.7288, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.4151624548736462, |
| "grad_norm": 2.0, |
| "learning_rate": 9.709189363821422e-05, |
| "loss": 0.7336, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.41636582430806257, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.707215262449744e-05, |
| "loss": 0.7454, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.4175691937424789, |
| "grad_norm": 2.0, |
| "learning_rate": 9.705235687213642e-05, |
| "loss": 0.7579, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.4187725631768953, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.703250641274442e-05, |
| "loss": 0.7328, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.41997593261131166, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.701260127802211e-05, |
| "loss": 0.6927, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.42117930204572807, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.69926414997574e-05, |
| "loss": 0.6976, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.4223826714801444, |
| "grad_norm": 2.0, |
| "learning_rate": 9.697262710982557e-05, |
| "loss": 0.6918, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.42358604091456076, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.695255814018899e-05, |
| "loss": 0.7205, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.42478941034897716, |
| "grad_norm": 1.875, |
| "learning_rate": 9.693243462289728e-05, |
| "loss": 0.6963, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.4259927797833935, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.691225659008713e-05, |
| "loss": 0.7363, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.42719614921780985, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.689202407398234e-05, |
| "loss": 0.715, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.42839951865222625, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.687173710689363e-05, |
| "loss": 0.7121, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.4296028880866426, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.685139572121876e-05, |
| "loss": 0.7253, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.43080625752105894, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.683099994944237e-05, |
| "loss": 0.7242, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.43200962695547535, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.681054982413593e-05, |
| "loss": 0.7028, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.4332129963898917, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.679004537795773e-05, |
| "loss": 0.7058, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.43441636582430804, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.67694866436528e-05, |
| "loss": 0.7455, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.43561973525872444, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.67488736540529e-05, |
| "loss": 0.7209, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.4368231046931408, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.672820644207639e-05, |
| "loss": 0.7455, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.4380264741275572, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.670748504072822e-05, |
| "loss": 0.7298, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.43922984356197353, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.668670948309992e-05, |
| "loss": 0.7449, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.4404332129963899, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.666587980236952e-05, |
| "loss": 0.7101, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.4416365824308063, |
| "grad_norm": 2.171875, |
| "learning_rate": 9.664499603180135e-05, |
| "loss": 0.7173, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.4428399518652226, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.662405820474634e-05, |
| "loss": 0.705, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.44404332129963897, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.660306635464152e-05, |
| "loss": 0.7186, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.4452466907340554, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.658202051501034e-05, |
| "loss": 0.7093, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.4464500601684717, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.656092071946243e-05, |
| "loss": 0.6771, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.44765342960288806, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.65397670016936e-05, |
| "loss": 0.7028, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.44885679903730447, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.65185593954857e-05, |
| "loss": 0.7239, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.4500601684717208, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.649729793470675e-05, |
| "loss": 0.718, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.45126353790613716, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.64759826533107e-05, |
| "loss": 0.6947, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.45246690734055356, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.645461358533747e-05, |
| "loss": 0.708, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.4536702767749699, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.643319076491285e-05, |
| "loss": 0.7103, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.4548736462093863, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.641171422624853e-05, |
| "loss": 0.7052, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.45607701564380265, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.639018400364193e-05, |
| "loss": 0.7162, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.457280385078219, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.636860013147622e-05, |
| "loss": 0.7069, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.4584837545126354, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.63469626442203e-05, |
| "loss": 0.7118, |
| "step": 1905 |
| }, |
| { |
| "epoch": 0.45968712394705175, |
| "grad_norm": 2.0, |
| "learning_rate": 9.632527157642855e-05, |
| "loss": 0.7374, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.4608904933814681, |
| "grad_norm": 1.875, |
| "learning_rate": 9.630352696274109e-05, |
| "loss": 0.7097, |
| "step": 1915 |
| }, |
| { |
| "epoch": 0.4620938628158845, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.628172883788343e-05, |
| "loss": 0.7122, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.46329723225030084, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.625987723666661e-05, |
| "loss": 0.7131, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.4645006016847172, |
| "grad_norm": 2.0, |
| "learning_rate": 9.623797219398698e-05, |
| "loss": 0.7118, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.4657039711191336, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.621601374482635e-05, |
| "loss": 0.7052, |
| "step": 1935 |
| }, |
| { |
| "epoch": 0.46690734055354993, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.619400192425173e-05, |
| "loss": 0.6992, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.4681107099879663, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.61719367674154e-05, |
| "loss": 0.7074, |
| "step": 1945 |
| }, |
| { |
| "epoch": 0.4693140794223827, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.614981830955479e-05, |
| "loss": 0.6958, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.470517448856799, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.61276465859925e-05, |
| "loss": 0.7268, |
| "step": 1955 |
| }, |
| { |
| "epoch": 0.4717208182912154, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.610542163213614e-05, |
| "loss": 0.6819, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.4729241877256318, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.608314348347836e-05, |
| "loss": 0.7258, |
| "step": 1965 |
| }, |
| { |
| "epoch": 0.4741275571600481, |
| "grad_norm": 1.6171875, |
| "learning_rate": 9.606081217559674e-05, |
| "loss": 0.7005, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.4753309265944645, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.60384277441538e-05, |
| "loss": 0.6824, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.47653429602888087, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.60159902248968e-05, |
| "loss": 0.6861, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.4777376654632972, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.59934996536579e-05, |
| "loss": 0.7052, |
| "step": 1985 |
| }, |
| { |
| "epoch": 0.4789410348977136, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.59709560663539e-05, |
| "loss": 0.6741, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.48014440433212996, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.594835949898627e-05, |
| "loss": 0.6708, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.4813477737665463, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.592570998764115e-05, |
| "loss": 0.715, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.4813477737665463, |
| "eval_loss": 0.6060333847999573, |
| "eval_runtime": 2.6578, |
| "eval_samples_per_second": 75.249, |
| "eval_steps_per_second": 75.249, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.4825511432009627, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.590300756848915e-05, |
| "loss": 0.6681, |
| "step": 2005 |
| }, |
| { |
| "epoch": 0.48375451263537905, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.588025227778541e-05, |
| "loss": 0.6976, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.48495788206979545, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.585744415186954e-05, |
| "loss": 0.7225, |
| "step": 2015 |
| }, |
| { |
| "epoch": 0.4861612515042118, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.583458322716543e-05, |
| "loss": 0.6724, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.48736462093862815, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.581166954018142e-05, |
| "loss": 0.6961, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.48856799037304455, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.578870312750999e-05, |
| "loss": 0.7048, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.4897713598074609, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.576568402582787e-05, |
| "loss": 0.6904, |
| "step": 2035 |
| }, |
| { |
| "epoch": 0.49097472924187724, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.574261227189596e-05, |
| "loss": 0.6904, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.49217809867629364, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.57194879025592e-05, |
| "loss": 0.714, |
| "step": 2045 |
| }, |
| { |
| "epoch": 0.49338146811071, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.569631095474656e-05, |
| "loss": 0.7156, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.49458483754512633, |
| "grad_norm": 2.0, |
| "learning_rate": 9.567308146547101e-05, |
| "loss": 0.73, |
| "step": 2055 |
| }, |
| { |
| "epoch": 0.49578820697954273, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.56497994718294e-05, |
| "loss": 0.7007, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.4969915764139591, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.562646501100241e-05, |
| "loss": 0.7158, |
| "step": 2065 |
| }, |
| { |
| "epoch": 0.4981949458483754, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.560307812025458e-05, |
| "loss": 0.7404, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.4993983152827918, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.55796388369341e-05, |
| "loss": 0.6807, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.5006016847172082, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.555614719847286e-05, |
| "loss": 0.7248, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.5018050541516246, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.553260324238636e-05, |
| "loss": 0.708, |
| "step": 2085 |
| }, |
| { |
| "epoch": 0.5030084235860409, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.550900700627365e-05, |
| "loss": 0.7364, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.5042117930204573, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.54853585278173e-05, |
| "loss": 0.6955, |
| "step": 2095 |
| }, |
| { |
| "epoch": 0.5054151624548736, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.546165784478324e-05, |
| "loss": 0.6905, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.50661853188929, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.543790499502083e-05, |
| "loss": 0.7117, |
| "step": 2105 |
| }, |
| { |
| "epoch": 0.5078219013237064, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.541410001646273e-05, |
| "loss": 0.6493, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.5090252707581228, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.539024294712479e-05, |
| "loss": 0.6618, |
| "step": 2115 |
| }, |
| { |
| "epoch": 0.5102286401925391, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.536633382510613e-05, |
| "loss": 0.6866, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.5114320096269555, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.534237268858897e-05, |
| "loss": 0.699, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.5126353790613718, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.531835957583856e-05, |
| "loss": 0.6887, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.5138387484957883, |
| "grad_norm": 1.7578125, |
| "learning_rate": 9.529429452520319e-05, |
| "loss": 0.7027, |
| "step": 2135 |
| }, |
| { |
| "epoch": 0.5150421179302046, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.527017757511408e-05, |
| "loss": 0.7184, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.516245487364621, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.524600876408537e-05, |
| "loss": 0.6526, |
| "step": 2145 |
| }, |
| { |
| "epoch": 0.5174488567990373, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.522178813071393e-05, |
| "loss": 0.7014, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5186522262334536, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.519751571367947e-05, |
| "loss": 0.6784, |
| "step": 2155 |
| }, |
| { |
| "epoch": 0.51985559566787, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.51731915517444e-05, |
| "loss": 0.6842, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.5210589651022864, |
| "grad_norm": 2.140625, |
| "learning_rate": 9.51488156837537e-05, |
| "loss": 0.6869, |
| "step": 2165 |
| }, |
| { |
| "epoch": 0.5222623345367028, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.512438814863498e-05, |
| "loss": 0.7064, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.5234657039711191, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.509990898539835e-05, |
| "loss": 0.689, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.5246690734055355, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.507537823313632e-05, |
| "loss": 0.6771, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.5258724428399518, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.505079593102387e-05, |
| "loss": 0.7189, |
| "step": 2185 |
| }, |
| { |
| "epoch": 0.5270758122743683, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.502616211831825e-05, |
| "loss": 0.6938, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.5282791817087846, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.500147683435897e-05, |
| "loss": 0.698, |
| "step": 2195 |
| }, |
| { |
| "epoch": 0.529482551143201, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.497674011856776e-05, |
| "loss": 0.7275, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.5306859205776173, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.495195201044847e-05, |
| "loss": 0.6597, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.5318892900120337, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.492711254958704e-05, |
| "loss": 0.6686, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.53309265944645, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.490222177565137e-05, |
| "loss": 0.6785, |
| "step": 2215 |
| }, |
| { |
| "epoch": 0.5342960288808665, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.487727972839139e-05, |
| "loss": 0.6563, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.5354993983152828, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.485228644763884e-05, |
| "loss": 0.6653, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.5367027677496992, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.482724197330728e-05, |
| "loss": 0.6741, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.5379061371841155, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.480214634539208e-05, |
| "loss": 0.6841, |
| "step": 2235 |
| }, |
| { |
| "epoch": 0.5391095066185319, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.477699960397028e-05, |
| "loss": 0.6954, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.5403128760529483, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.475180178920049e-05, |
| "loss": 0.6455, |
| "step": 2245 |
| }, |
| { |
| "epoch": 0.5415162454873647, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.472655294132295e-05, |
| "loss": 0.6932, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.542719614921781, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.470125310065938e-05, |
| "loss": 0.6943, |
| "step": 2255 |
| }, |
| { |
| "epoch": 0.5439229843561973, |
| "grad_norm": 1.875, |
| "learning_rate": 9.467590230761291e-05, |
| "loss": 0.6848, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.5451263537906137, |
| "grad_norm": 1.875, |
| "learning_rate": 9.46505006026681e-05, |
| "loss": 0.6785, |
| "step": 2265 |
| }, |
| { |
| "epoch": 0.54632972322503, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.462504802639071e-05, |
| "loss": 0.6803, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.5475330926594465, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.459954461942788e-05, |
| "loss": 0.7146, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.5487364620938628, |
| "grad_norm": 2.0, |
| "learning_rate": 9.45739904225078e-05, |
| "loss": 0.6779, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.5499398315282792, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.454838547643983e-05, |
| "loss": 0.6444, |
| "step": 2285 |
| }, |
| { |
| "epoch": 0.5511432009626955, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.452272982211438e-05, |
| "loss": 0.7118, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.5523465703971119, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.44970235005028e-05, |
| "loss": 0.6885, |
| "step": 2295 |
| }, |
| { |
| "epoch": 0.5535499398315282, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.447126655265739e-05, |
| "loss": 0.6662, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.5547533092659447, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.44454590197113e-05, |
| "loss": 0.6727, |
| "step": 2305 |
| }, |
| { |
| "epoch": 0.555956678700361, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.441960094287843e-05, |
| "loss": 0.6777, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.5571600481347774, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.439369236345344e-05, |
| "loss": 0.6696, |
| "step": 2315 |
| }, |
| { |
| "epoch": 0.5583634175691937, |
| "grad_norm": 1.875, |
| "learning_rate": 9.436773332281159e-05, |
| "loss": 0.6575, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.5595667870036101, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.434172386240877e-05, |
| "loss": 0.6939, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.5607701564380265, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.431566402378138e-05, |
| "loss": 0.713, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.5619735258724429, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.428955384854625e-05, |
| "loss": 0.6941, |
| "step": 2335 |
| }, |
| { |
| "epoch": 0.5631768953068592, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.426339337840063e-05, |
| "loss": 0.6716, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.5643802647412756, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.423718265512205e-05, |
| "loss": 0.7128, |
| "step": 2345 |
| }, |
| { |
| "epoch": 0.5655836341756919, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.421092172056834e-05, |
| "loss": 0.7135, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.5667870036101083, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.418461061667748e-05, |
| "loss": 0.6849, |
| "step": 2355 |
| }, |
| { |
| "epoch": 0.5679903730445247, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.415824938546756e-05, |
| "loss": 0.6792, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.5691937424789411, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.413183806903677e-05, |
| "loss": 0.6967, |
| "step": 2365 |
| }, |
| { |
| "epoch": 0.5703971119133574, |
| "grad_norm": 1.75, |
| "learning_rate": 9.410537670956326e-05, |
| "loss": 0.681, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.5716004813477737, |
| "grad_norm": 1.71875, |
| "learning_rate": 9.407886534930506e-05, |
| "loss": 0.6516, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.5728038507821901, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.405230403060008e-05, |
| "loss": 0.7073, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.5740072202166066, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.402569279586608e-05, |
| "loss": 0.6888, |
| "step": 2385 |
| }, |
| { |
| "epoch": 0.5752105896510229, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.39990316876004e-05, |
| "loss": 0.7084, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.5764139590854392, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.397232074838013e-05, |
| "loss": 0.6788, |
| "step": 2395 |
| }, |
| { |
| "epoch": 0.5776173285198556, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.39455600208619e-05, |
| "loss": 0.6872, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.5788206979542719, |
| "grad_norm": 2.0, |
| "learning_rate": 9.391874954778185e-05, |
| "loss": 0.7138, |
| "step": 2405 |
| }, |
| { |
| "epoch": 0.5800240673886883, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.389188937195558e-05, |
| "loss": 0.6856, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.5812274368231047, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.386497953627805e-05, |
| "loss": 0.6685, |
| "step": 2415 |
| }, |
| { |
| "epoch": 0.5824308062575211, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.383802008372352e-05, |
| "loss": 0.674, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.5836341756919374, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.38110110573455e-05, |
| "loss": 0.6623, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.5848375451263538, |
| "grad_norm": 1.75, |
| "learning_rate": 9.378395250027666e-05, |
| "loss": 0.7277, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.5860409145607701, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.375684445572877e-05, |
| "loss": 0.6575, |
| "step": 2435 |
| }, |
| { |
| "epoch": 0.5872442839951865, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.372968696699263e-05, |
| "loss": 0.6956, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.5884476534296029, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.370248007743801e-05, |
| "loss": 0.6764, |
| "step": 2445 |
| }, |
| { |
| "epoch": 0.5896510228640193, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.367522383051357e-05, |
| "loss": 0.6936, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.5908543922984356, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.364791826974676e-05, |
| "loss": 0.6705, |
| "step": 2455 |
| }, |
| { |
| "epoch": 0.592057761732852, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.362056343874385e-05, |
| "loss": 0.6488, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.5932611311672683, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.359315938118973e-05, |
| "loss": 0.6975, |
| "step": 2465 |
| }, |
| { |
| "epoch": 0.5944645006016848, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.356570614084792e-05, |
| "loss": 0.6532, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.5956678700361011, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.35382037615605e-05, |
| "loss": 0.6788, |
| "step": 2475 |
| }, |
| { |
| "epoch": 0.5968712394705175, |
| "grad_norm": 2.0, |
| "learning_rate": 9.351065228724801e-05, |
| "loss": 0.6791, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.5980746089049338, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.348305176190939e-05, |
| "loss": 0.6726, |
| "step": 2485 |
| }, |
| { |
| "epoch": 0.5992779783393501, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.345540222962194e-05, |
| "loss": 0.6567, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.6004813477737665, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.342770373454118e-05, |
| "loss": 0.693, |
| "step": 2495 |
| }, |
| { |
| "epoch": 0.601684717208183, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.339995632090086e-05, |
| "loss": 0.6774, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.601684717208183, |
| "eval_loss": 0.5781615972518921, |
| "eval_runtime": 2.7886, |
| "eval_samples_per_second": 71.721, |
| "eval_steps_per_second": 71.721, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.6028880866425993, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.337216003301284e-05, |
| "loss": 0.697, |
| "step": 2505 |
| }, |
| { |
| "epoch": 0.6040914560770156, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.334431491526702e-05, |
| "loss": 0.6809, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.605294825511432, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.33164210121313e-05, |
| "loss": 0.6707, |
| "step": 2515 |
| }, |
| { |
| "epoch": 0.6064981949458483, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.328847836815145e-05, |
| "loss": 0.6736, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.6077015643802648, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.326048702795112e-05, |
| "loss": 0.6887, |
| "step": 2525 |
| }, |
| { |
| "epoch": 0.6089049338146811, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.32324470362317e-05, |
| "loss": 0.6671, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.6101083032490975, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.320435843777231e-05, |
| "loss": 0.6665, |
| "step": 2535 |
| }, |
| { |
| "epoch": 0.6113116726835138, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.317622127742965e-05, |
| "loss": 0.6633, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.6125150421179302, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.3148035600138e-05, |
| "loss": 0.6628, |
| "step": 2545 |
| }, |
| { |
| "epoch": 0.6137184115523465, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.311980145090908e-05, |
| "loss": 0.6491, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.614921780986763, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.30915188748321e-05, |
| "loss": 0.6935, |
| "step": 2555 |
| }, |
| { |
| "epoch": 0.6161251504211793, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.306318791707354e-05, |
| "loss": 0.683, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.6173285198555957, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.303480862287714e-05, |
| "loss": 0.6731, |
| "step": 2565 |
| }, |
| { |
| "epoch": 0.618531889290012, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.300638103756387e-05, |
| "loss": 0.6911, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.6197352587244284, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.29779052065318e-05, |
| "loss": 0.6096, |
| "step": 2575 |
| }, |
| { |
| "epoch": 0.6209386281588448, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.294938117525605e-05, |
| "loss": 0.6799, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.6221419975932612, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.292080898928872e-05, |
| "loss": 0.6589, |
| "step": 2585 |
| }, |
| { |
| "epoch": 0.6233453670276775, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.289218869425879e-05, |
| "loss": 0.6221, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.6245487364620939, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.28635203358721e-05, |
| "loss": 0.6513, |
| "step": 2595 |
| }, |
| { |
| "epoch": 0.6257521058965102, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.283480395991122e-05, |
| "loss": 0.6326, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.6269554753309265, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.28060396122354e-05, |
| "loss": 0.6824, |
| "step": 2605 |
| }, |
| { |
| "epoch": 0.628158844765343, |
| "grad_norm": 1.875, |
| "learning_rate": 9.277722733878054e-05, |
| "loss": 0.6777, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.6293622141997594, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.274836718555903e-05, |
| "loss": 0.652, |
| "step": 2615 |
| }, |
| { |
| "epoch": 0.6305655836341757, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.271945919865976e-05, |
| "loss": 0.6514, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.631768953068592, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.269050342424796e-05, |
| "loss": 0.6998, |
| "step": 2625 |
| }, |
| { |
| "epoch": 0.6329723225030084, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.266149990856521e-05, |
| "loss": 0.6756, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.6341756919374247, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.263244869792937e-05, |
| "loss": 0.6714, |
| "step": 2635 |
| }, |
| { |
| "epoch": 0.6353790613718412, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.260334983873439e-05, |
| "loss": 0.6422, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.6365824308062575, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.257420337745034e-05, |
| "loss": 0.6618, |
| "step": 2645 |
| }, |
| { |
| "epoch": 0.6377858002406739, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.254500936062334e-05, |
| "loss": 0.652, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.6389891696750902, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.251576783487541e-05, |
| "loss": 0.6936, |
| "step": 2655 |
| }, |
| { |
| "epoch": 0.6401925391095066, |
| "grad_norm": 1.7578125, |
| "learning_rate": 9.248647884690448e-05, |
| "loss": 0.6398, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.641395908543923, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.245714244348425e-05, |
| "loss": 0.6414, |
| "step": 2665 |
| }, |
| { |
| "epoch": 0.6425992779783394, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.242775867146415e-05, |
| "loss": 0.6777, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.6438026474127557, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.239832757776926e-05, |
| "loss": 0.6736, |
| "step": 2675 |
| }, |
| { |
| "epoch": 0.6450060168471721, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.236884920940024e-05, |
| "loss": 0.662, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.6462093862815884, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.233932361343321e-05, |
| "loss": 0.6441, |
| "step": 2685 |
| }, |
| { |
| "epoch": 0.6474127557160048, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.230975083701976e-05, |
| "loss": 0.6484, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.6486161251504212, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.228013092738675e-05, |
| "loss": 0.6511, |
| "step": 2695 |
| }, |
| { |
| "epoch": 0.6498194945848376, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.225046393183642e-05, |
| "loss": 0.6311, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.6510228640192539, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.222074989774612e-05, |
| "loss": 0.657, |
| "step": 2705 |
| }, |
| { |
| "epoch": 0.6522262334536703, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.219098887256835e-05, |
| "loss": 0.6849, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.6534296028880866, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.216118090383067e-05, |
| "loss": 0.6848, |
| "step": 2715 |
| }, |
| { |
| "epoch": 0.6546329723225031, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.213132603913553e-05, |
| "loss": 0.6083, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.6558363417569194, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.210142432616035e-05, |
| "loss": 0.6691, |
| "step": 2725 |
| }, |
| { |
| "epoch": 0.6570397111913358, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.207147581265739e-05, |
| "loss": 0.6658, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.6582430806257521, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.204148054645356e-05, |
| "loss": 0.6723, |
| "step": 2735 |
| }, |
| { |
| "epoch": 0.6594464500601684, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.201143857545048e-05, |
| "loss": 0.6736, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.6606498194945848, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.198134994762436e-05, |
| "loss": 0.6836, |
| "step": 2745 |
| }, |
| { |
| "epoch": 0.6618531889290012, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.19512147110259e-05, |
| "loss": 0.6888, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.6630565583634176, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.192103291378025e-05, |
| "loss": 0.6839, |
| "step": 2755 |
| }, |
| { |
| "epoch": 0.6642599277978339, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.189080460408692e-05, |
| "loss": 0.6831, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.6654632972322503, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.186052983021966e-05, |
| "loss": 0.6923, |
| "step": 2765 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.18302086405265e-05, |
| "loss": 0.6671, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.6678700361010831, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.17998410834295e-05, |
| "loss": 0.6979, |
| "step": 2775 |
| }, |
| { |
| "epoch": 0.6690734055354994, |
| "grad_norm": 2.359375, |
| "learning_rate": 9.176942720742483e-05, |
| "loss": 0.6569, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.6702767749699158, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.173896706108264e-05, |
| "loss": 0.6729, |
| "step": 2785 |
| }, |
| { |
| "epoch": 0.6714801444043321, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.17084606930469e-05, |
| "loss": 0.6415, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.6726835138387485, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.167790815203546e-05, |
| "loss": 0.6599, |
| "step": 2795 |
| }, |
| { |
| "epoch": 0.6738868832731648, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.164730948683991e-05, |
| "loss": 0.6626, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.6750902527075813, |
| "grad_norm": 1.75, |
| "learning_rate": 9.161666474632543e-05, |
| "loss": 0.6445, |
| "step": 2805 |
| }, |
| { |
| "epoch": 0.6762936221419976, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.158597397943088e-05, |
| "loss": 0.6633, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.677496991576414, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.155523723516855e-05, |
| "loss": 0.6755, |
| "step": 2815 |
| }, |
| { |
| "epoch": 0.6787003610108303, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.152445456262417e-05, |
| "loss": 0.6859, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.6799037304452467, |
| "grad_norm": 2.0, |
| "learning_rate": 9.14936260109568e-05, |
| "loss": 0.6821, |
| "step": 2825 |
| }, |
| { |
| "epoch": 0.681107099879663, |
| "grad_norm": 1.875, |
| "learning_rate": 9.146275162939884e-05, |
| "loss": 0.6637, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.6823104693140795, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.143183146725579e-05, |
| "loss": 0.6739, |
| "step": 2835 |
| }, |
| { |
| "epoch": 0.6835138387484958, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.140086557390632e-05, |
| "loss": 0.6819, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.6847172081829122, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.13698539988021e-05, |
| "loss": 0.6595, |
| "step": 2845 |
| }, |
| { |
| "epoch": 0.6859205776173285, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.133879679146775e-05, |
| "loss": 0.6268, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.6871239470517448, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.13076940015008e-05, |
| "loss": 0.6373, |
| "step": 2855 |
| }, |
| { |
| "epoch": 0.6883273164861613, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.127654567857154e-05, |
| "loss": 0.6741, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.6895306859205776, |
| "grad_norm": 1.6484375, |
| "learning_rate": 9.124535187242297e-05, |
| "loss": 0.6556, |
| "step": 2865 |
| }, |
| { |
| "epoch": 0.690734055354994, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.121411263287077e-05, |
| "loss": 0.6569, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.6919374247894103, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.118282800980315e-05, |
| "loss": 0.6308, |
| "step": 2875 |
| }, |
| { |
| "epoch": 0.6931407942238267, |
| "grad_norm": 1.6640625, |
| "learning_rate": 9.115149805318075e-05, |
| "loss": 0.6613, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.694344163658243, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.11201228130367e-05, |
| "loss": 0.6141, |
| "step": 2885 |
| }, |
| { |
| "epoch": 0.6955475330926595, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.108870233947634e-05, |
| "loss": 0.6304, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.6967509025270758, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.105723668267736e-05, |
| "loss": 0.6662, |
| "step": 2895 |
| }, |
| { |
| "epoch": 0.6979542719614922, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.10257258928895e-05, |
| "loss": 0.6322, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.6991576413959085, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.099417002043466e-05, |
| "loss": 0.6472, |
| "step": 2905 |
| }, |
| { |
| "epoch": 0.7003610108303249, |
| "grad_norm": 1.7578125, |
| "learning_rate": 9.09625691157067e-05, |
| "loss": 0.6505, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.7015643802647413, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.093092322917136e-05, |
| "loss": 0.6569, |
| "step": 2915 |
| }, |
| { |
| "epoch": 0.7027677496991577, |
| "grad_norm": 1.875, |
| "learning_rate": 9.089923241136629e-05, |
| "loss": 0.6569, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.703971119133574, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.086749671290083e-05, |
| "loss": 0.6523, |
| "step": 2925 |
| }, |
| { |
| "epoch": 0.7051744885679904, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.083571618445603e-05, |
| "loss": 0.6619, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.7063778580024067, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.080389087678451e-05, |
| "loss": 0.65, |
| "step": 2935 |
| }, |
| { |
| "epoch": 0.7075812274368231, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.077202084071043e-05, |
| "loss": 0.6475, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.7087845968712395, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.074010612712936e-05, |
| "loss": 0.6487, |
| "step": 2945 |
| }, |
| { |
| "epoch": 0.7099879663056559, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.070814678700821e-05, |
| "loss": 0.6291, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.7111913357400722, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.067614287138514e-05, |
| "loss": 0.6742, |
| "step": 2955 |
| }, |
| { |
| "epoch": 0.7123947051744886, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.064409443136955e-05, |
| "loss": 0.6316, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.7135980746089049, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.061200151814195e-05, |
| "loss": 0.6678, |
| "step": 2965 |
| }, |
| { |
| "epoch": 0.7148014440433214, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.057986418295378e-05, |
| "loss": 0.6628, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.7160048134777377, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.054768247712753e-05, |
| "loss": 0.6297, |
| "step": 2975 |
| }, |
| { |
| "epoch": 0.717208182912154, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.051545645205646e-05, |
| "loss": 0.6502, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.7184115523465704, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.048318615920468e-05, |
| "loss": 0.6704, |
| "step": 2985 |
| }, |
| { |
| "epoch": 0.7196149217809867, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.045087165010694e-05, |
| "loss": 0.6473, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.7208182912154031, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.041851297636862e-05, |
| "loss": 0.6508, |
| "step": 2995 |
| }, |
| { |
| "epoch": 0.7220216606498195, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.038611018966564e-05, |
| "loss": 0.6492, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7220216606498195, |
| "eval_loss": 0.5566111207008362, |
| "eval_runtime": 2.6561, |
| "eval_samples_per_second": 75.299, |
| "eval_steps_per_second": 75.299, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7232250300842359, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.035366334174436e-05, |
| "loss": 0.6277, |
| "step": 3005 |
| }, |
| { |
| "epoch": 0.7244283995186522, |
| "grad_norm": 1.71875, |
| "learning_rate": 9.032117248442153e-05, |
| "loss": 0.649, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.7256317689530686, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.02886376695841e-05, |
| "loss": 0.6538, |
| "step": 3015 |
| }, |
| { |
| "epoch": 0.7268351383874849, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.02560589491893e-05, |
| "loss": 0.635, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.7280385078219013, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.022343637526446e-05, |
| "loss": 0.6399, |
| "step": 3025 |
| }, |
| { |
| "epoch": 0.7292418772563177, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.019076999990694e-05, |
| "loss": 0.6522, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.7304452466907341, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.015805987528402e-05, |
| "loss": 0.6566, |
| "step": 3035 |
| }, |
| { |
| "epoch": 0.7316486161251504, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.012530605363289e-05, |
| "loss": 0.6781, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.7328519855595668, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.009250858726047e-05, |
| "loss": 0.6405, |
| "step": 3045 |
| }, |
| { |
| "epoch": 0.7340553549939831, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.005966752854345e-05, |
| "loss": 0.6467, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.7352587244283996, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.002678292992809e-05, |
| "loss": 0.6345, |
| "step": 3055 |
| }, |
| { |
| "epoch": 0.7364620938628159, |
| "grad_norm": 1.75, |
| "learning_rate": 8.999385484393018e-05, |
| "loss": 0.6224, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.7376654632972323, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.996088332313497e-05, |
| "loss": 0.6236, |
| "step": 3065 |
| }, |
| { |
| "epoch": 0.7388688327316486, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.992786842019708e-05, |
| "loss": 0.6464, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.740072202166065, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.989481018784036e-05, |
| "loss": 0.6277, |
| "step": 3075 |
| }, |
| { |
| "epoch": 0.7412755716004813, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.986170867885797e-05, |
| "loss": 0.6739, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.7424789410348978, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.982856394611208e-05, |
| "loss": 0.6344, |
| "step": 3085 |
| }, |
| { |
| "epoch": 0.7436823104693141, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.97953760425339e-05, |
| "loss": 0.6574, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.7448856799037304, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.976214502112361e-05, |
| "loss": 0.6485, |
| "step": 3095 |
| }, |
| { |
| "epoch": 0.7460890493381468, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.972887093495022e-05, |
| "loss": 0.6452, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.7472924187725631, |
| "grad_norm": 1.984375, |
| "learning_rate": 8.969555383715156e-05, |
| "loss": 0.6481, |
| "step": 3105 |
| }, |
| { |
| "epoch": 0.7484957882069796, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.966219378093411e-05, |
| "loss": 0.6169, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.7496991576413959, |
| "grad_norm": 2.09375, |
| "learning_rate": 8.962879081957296e-05, |
| "loss": 0.6774, |
| "step": 3115 |
| }, |
| { |
| "epoch": 0.7509025270758123, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.95953450064117e-05, |
| "loss": 0.6313, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.7521058965102286, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.956185639486242e-05, |
| "loss": 0.6508, |
| "step": 3125 |
| }, |
| { |
| "epoch": 0.753309265944645, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.952832503840548e-05, |
| "loss": 0.6782, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.7545126353790613, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.949475099058955e-05, |
| "loss": 0.681, |
| "step": 3135 |
| }, |
| { |
| "epoch": 0.7557160048134778, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.946113430503143e-05, |
| "loss": 0.6357, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.7569193742478941, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.942747503541607e-05, |
| "loss": 0.67, |
| "step": 3145 |
| }, |
| { |
| "epoch": 0.7581227436823105, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.93937732354964e-05, |
| "loss": 0.6323, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.7593261131167268, |
| "grad_norm": 1.75, |
| "learning_rate": 8.936002895909326e-05, |
| "loss": 0.6367, |
| "step": 3155 |
| }, |
| { |
| "epoch": 0.7605294825511432, |
| "grad_norm": 1.875, |
| "learning_rate": 8.932624226009533e-05, |
| "loss": 0.6521, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.7617328519855595, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.929241319245903e-05, |
| "loss": 0.6794, |
| "step": 3165 |
| }, |
| { |
| "epoch": 0.762936221419976, |
| "grad_norm": 1.75, |
| "learning_rate": 8.925854181020849e-05, |
| "loss": 0.6673, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.7641395908543923, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.922462816743533e-05, |
| "loss": 0.6203, |
| "step": 3175 |
| }, |
| { |
| "epoch": 0.7653429602888087, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.919067231829874e-05, |
| "loss": 0.6765, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.766546329723225, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.915667431702524e-05, |
| "loss": 0.6379, |
| "step": 3185 |
| }, |
| { |
| "epoch": 0.7677496991576414, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.912263421790873e-05, |
| "loss": 0.6319, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.7689530685920578, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.908855207531031e-05, |
| "loss": 0.6547, |
| "step": 3195 |
| }, |
| { |
| "epoch": 0.7701564380264742, |
| "grad_norm": 1.8984375, |
| "learning_rate": 8.905442794365822e-05, |
| "loss": 0.6284, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7713598074608905, |
| "grad_norm": 1.75, |
| "learning_rate": 8.902026187744776e-05, |
| "loss": 0.6478, |
| "step": 3205 |
| }, |
| { |
| "epoch": 0.7725631768953068, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.89860539312412e-05, |
| "loss": 0.6863, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.7737665463297232, |
| "grad_norm": 1.75, |
| "learning_rate": 8.89518041596677e-05, |
| "loss": 0.6777, |
| "step": 3215 |
| }, |
| { |
| "epoch": 0.7749699157641395, |
| "grad_norm": 1.703125, |
| "learning_rate": 8.891751261742318e-05, |
| "loss": 0.6713, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.776173285198556, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.88831793592703e-05, |
| "loss": 0.6486, |
| "step": 3225 |
| }, |
| { |
| "epoch": 0.7773766546329723, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.884880444003835e-05, |
| "loss": 0.6512, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.7785800240673887, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.881438791462308e-05, |
| "loss": 0.6519, |
| "step": 3235 |
| }, |
| { |
| "epoch": 0.779783393501805, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.877992983798678e-05, |
| "loss": 0.6361, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.7809867629362214, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.874543026515806e-05, |
| "loss": 0.6329, |
| "step": 3245 |
| }, |
| { |
| "epoch": 0.7821901323706378, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.871088925123174e-05, |
| "loss": 0.6481, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.7833935018050542, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.867630685136893e-05, |
| "loss": 0.6521, |
| "step": 3255 |
| }, |
| { |
| "epoch": 0.7845968712394705, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.864168312079671e-05, |
| "loss": 0.6221, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.7858002406738869, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.860701811480828e-05, |
| "loss": 0.6471, |
| "step": 3265 |
| }, |
| { |
| "epoch": 0.7870036101083032, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.857231188876265e-05, |
| "loss": 0.6588, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.7882069795427196, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.853756449808476e-05, |
| "loss": 0.6249, |
| "step": 3275 |
| }, |
| { |
| "epoch": 0.789410348977136, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.85027759982652e-05, |
| "loss": 0.6372, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.7906137184115524, |
| "grad_norm": 1.9921875, |
| "learning_rate": 8.846794644486026e-05, |
| "loss": 0.6523, |
| "step": 3285 |
| }, |
| { |
| "epoch": 0.7918170878459687, |
| "grad_norm": 1.9375, |
| "learning_rate": 8.843307589349178e-05, |
| "loss": 0.6329, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.7930204572803851, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.839816439984706e-05, |
| "loss": 0.6525, |
| "step": 3295 |
| }, |
| { |
| "epoch": 0.7942238267148014, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.836321201967884e-05, |
| "loss": 0.6191, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.7954271961492179, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.832821880880504e-05, |
| "loss": 0.5898, |
| "step": 3305 |
| }, |
| { |
| "epoch": 0.7966305655836342, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.829318482310891e-05, |
| "loss": 0.6426, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.7978339350180506, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.825811011853874e-05, |
| "loss": 0.6301, |
| "step": 3315 |
| }, |
| { |
| "epoch": 0.7990373044524669, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.822299475110787e-05, |
| "loss": 0.6321, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.8002406738868832, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.81878387768946e-05, |
| "loss": 0.6336, |
| "step": 3325 |
| }, |
| { |
| "epoch": 0.8014440433212996, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.815264225204201e-05, |
| "loss": 0.6566, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.802647412755716, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.811740523275802e-05, |
| "loss": 0.6386, |
| "step": 3335 |
| }, |
| { |
| "epoch": 0.8038507821901324, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.808212777531515e-05, |
| "loss": 0.6428, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.8050541516245487, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.804680993605056e-05, |
| "loss": 0.648, |
| "step": 3345 |
| }, |
| { |
| "epoch": 0.8062575210589651, |
| "grad_norm": 1.9921875, |
| "learning_rate": 8.801145177136586e-05, |
| "loss": 0.6724, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.8074608904933814, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.797605333772706e-05, |
| "loss": 0.6142, |
| "step": 3355 |
| }, |
| { |
| "epoch": 0.8086642599277978, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.794061469166449e-05, |
| "loss": 0.6161, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.8098676293622142, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.790513588977268e-05, |
| "loss": 0.6351, |
| "step": 3365 |
| }, |
| { |
| "epoch": 0.8110709987966306, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.786961698871031e-05, |
| "loss": 0.6748, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.8122743682310469, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.78340580452001e-05, |
| "loss": 0.6414, |
| "step": 3375 |
| }, |
| { |
| "epoch": 0.8134777376654633, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.779845911602868e-05, |
| "loss": 0.6533, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.8146811070998796, |
| "grad_norm": 1.6171875, |
| "learning_rate": 8.776282025804659e-05, |
| "loss": 0.6725, |
| "step": 3385 |
| }, |
| { |
| "epoch": 0.8158844765342961, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.77271415281681e-05, |
| "loss": 0.6398, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.8170878459687124, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.769142298337113e-05, |
| "loss": 0.6317, |
| "step": 3395 |
| }, |
| { |
| "epoch": 0.8182912154031288, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.765566468069726e-05, |
| "loss": 0.6587, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.8194945848375451, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.761986667725147e-05, |
| "loss": 0.6201, |
| "step": 3405 |
| }, |
| { |
| "epoch": 0.8206979542719615, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.758402903020221e-05, |
| "loss": 0.5976, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.8219013237063778, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.754815179678124e-05, |
| "loss": 0.6232, |
| "step": 3415 |
| }, |
| { |
| "epoch": 0.8231046931407943, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.751223503428348e-05, |
| "loss": 0.6512, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.8243080625752106, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.747627880006703e-05, |
| "loss": 0.6592, |
| "step": 3425 |
| }, |
| { |
| "epoch": 0.825511432009627, |
| "grad_norm": 1.875, |
| "learning_rate": 8.7440283151553e-05, |
| "loss": 0.6428, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.8267148014440433, |
| "grad_norm": 1.6640625, |
| "learning_rate": 8.740424814622546e-05, |
| "loss": 0.574, |
| "step": 3435 |
| }, |
| { |
| "epoch": 0.8279181708784596, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.736817384163131e-05, |
| "loss": 0.614, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.8291215403128761, |
| "grad_norm": 1.75, |
| "learning_rate": 8.733206029538023e-05, |
| "loss": 0.6405, |
| "step": 3445 |
| }, |
| { |
| "epoch": 0.8303249097472925, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.729590756514455e-05, |
| "loss": 0.6592, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.8315282791817088, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.725971570865921e-05, |
| "loss": 0.6584, |
| "step": 3455 |
| }, |
| { |
| "epoch": 0.8327316486161251, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.722348478372159e-05, |
| "loss": 0.6305, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.8339350180505415, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.718721484819147e-05, |
| "loss": 0.6425, |
| "step": 3465 |
| }, |
| { |
| "epoch": 0.8351383874849578, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.715090595999096e-05, |
| "loss": 0.6302, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.8363417569193743, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.711455817710437e-05, |
| "loss": 0.6188, |
| "step": 3475 |
| }, |
| { |
| "epoch": 0.8375451263537906, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.70781715575781e-05, |
| "loss": 0.6559, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.838748495788207, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.704174615952055e-05, |
| "loss": 0.6032, |
| "step": 3485 |
| }, |
| { |
| "epoch": 0.8399518652226233, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.70052820411021e-05, |
| "loss": 0.6567, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.8411552346570397, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.696877926055497e-05, |
| "loss": 0.5912, |
| "step": 3495 |
| }, |
| { |
| "epoch": 0.8423586040914561, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.693223787617304e-05, |
| "loss": 0.6628, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.8423586040914561, |
| "eval_loss": 0.5347813367843628, |
| "eval_runtime": 2.6685, |
| "eval_samples_per_second": 74.949, |
| "eval_steps_per_second": 74.949, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.8435619735258725, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.689565794631195e-05, |
| "loss": 0.6222, |
| "step": 3505 |
| }, |
| { |
| "epoch": 0.8447653429602888, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.685903952938881e-05, |
| "loss": 0.618, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.8459687123947052, |
| "grad_norm": 2.5625, |
| "learning_rate": 8.682238268388225e-05, |
| "loss": 0.6677, |
| "step": 3515 |
| }, |
| { |
| "epoch": 0.8471720818291215, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.678568746833222e-05, |
| "loss": 0.6229, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.8483754512635379, |
| "grad_norm": 1.625, |
| "learning_rate": 8.674895394134e-05, |
| "loss": 0.5955, |
| "step": 3525 |
| }, |
| { |
| "epoch": 0.8495788206979543, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.6712182161568e-05, |
| "loss": 0.6069, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.8507821901323707, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.667537218773973e-05, |
| "loss": 0.6031, |
| "step": 3535 |
| }, |
| { |
| "epoch": 0.851985559566787, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.663852407863973e-05, |
| "loss": 0.6408, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.8531889290012034, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.66016378931134e-05, |
| "loss": 0.625, |
| "step": 3545 |
| }, |
| { |
| "epoch": 0.8543922984356197, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.656471369006697e-05, |
| "loss": 0.5847, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.855595667870036, |
| "grad_norm": 1.9921875, |
| "learning_rate": 8.65277515284674e-05, |
| "loss": 0.6638, |
| "step": 3555 |
| }, |
| { |
| "epoch": 0.8567990373044525, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.649075146734222e-05, |
| "loss": 0.6436, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.8580024067388689, |
| "grad_norm": 1.9765625, |
| "learning_rate": 8.64537135657795e-05, |
| "loss": 0.6382, |
| "step": 3565 |
| }, |
| { |
| "epoch": 0.8592057761732852, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.641663788292779e-05, |
| "loss": 0.6395, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.8604091456077015, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.637952447799589e-05, |
| "loss": 0.6353, |
| "step": 3575 |
| }, |
| { |
| "epoch": 0.8616125150421179, |
| "grad_norm": 1.75, |
| "learning_rate": 8.634237341025292e-05, |
| "loss": 0.6124, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.8628158844765343, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.63051847390281e-05, |
| "loss": 0.6215, |
| "step": 3585 |
| }, |
| { |
| "epoch": 0.8640192539109507, |
| "grad_norm": 1.9375, |
| "learning_rate": 8.626795852371071e-05, |
| "loss": 0.6341, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.865222623345367, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.623069482375002e-05, |
| "loss": 0.6319, |
| "step": 3595 |
| }, |
| { |
| "epoch": 0.8664259927797834, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.61933936986551e-05, |
| "loss": 0.6488, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.8676293622141997, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.615605520799485e-05, |
| "loss": 0.6515, |
| "step": 3605 |
| }, |
| { |
| "epoch": 0.8688327316486161, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.611867941139779e-05, |
| "loss": 0.629, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.8700361010830325, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.608126636855206e-05, |
| "loss": 0.6395, |
| "step": 3615 |
| }, |
| { |
| "epoch": 0.8712394705174489, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.604381613920524e-05, |
| "loss": 0.6356, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.8724428399518652, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.600632878316433e-05, |
| "loss": 0.6264, |
| "step": 3625 |
| }, |
| { |
| "epoch": 0.8736462093862816, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.596880436029559e-05, |
| "loss": 0.5993, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.8748495788206979, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.593124293052452e-05, |
| "loss": 0.6338, |
| "step": 3635 |
| }, |
| { |
| "epoch": 0.8760529482551144, |
| "grad_norm": 1.9375, |
| "learning_rate": 8.589364455383568e-05, |
| "loss": 0.6467, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.8772563176895307, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.585600929027262e-05, |
| "loss": 0.625, |
| "step": 3645 |
| }, |
| { |
| "epoch": 0.8784596871239471, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.581833719993785e-05, |
| "loss": 0.6371, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.8796630565583634, |
| "grad_norm": 1.6640625, |
| "learning_rate": 8.578062834299264e-05, |
| "loss": 0.6239, |
| "step": 3655 |
| }, |
| { |
| "epoch": 0.8808664259927798, |
| "grad_norm": 1.9375, |
| "learning_rate": 8.574288277965703e-05, |
| "loss": 0.6019, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.8820697954271961, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.570510057020962e-05, |
| "loss": 0.6434, |
| "step": 3665 |
| }, |
| { |
| "epoch": 0.8832731648616126, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.566728177498757e-05, |
| "loss": 0.623, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.8844765342960289, |
| "grad_norm": 1.96875, |
| "learning_rate": 8.562942645438646e-05, |
| "loss": 0.6125, |
| "step": 3675 |
| }, |
| { |
| "epoch": 0.8856799037304453, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.559153466886017e-05, |
| "loss": 0.5848, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.8868832731648616, |
| "grad_norm": 1.703125, |
| "learning_rate": 8.555360647892087e-05, |
| "loss": 0.6241, |
| "step": 3685 |
| }, |
| { |
| "epoch": 0.8880866425992779, |
| "grad_norm": 1.5703125, |
| "learning_rate": 8.551564194513882e-05, |
| "loss": 0.6044, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.8892900120336944, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.547764112814234e-05, |
| "loss": 0.6086, |
| "step": 3695 |
| }, |
| { |
| "epoch": 0.8904933814681107, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.54396040886177e-05, |
| "loss": 0.5887, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.8916967509025271, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.5401530887309e-05, |
| "loss": 0.6021, |
| "step": 3705 |
| }, |
| { |
| "epoch": 0.8929001203369434, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.536342158501808e-05, |
| "loss": 0.6041, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.8941034897713598, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.532527624260448e-05, |
| "loss": 0.6172, |
| "step": 3715 |
| }, |
| { |
| "epoch": 0.8953068592057761, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.528709492098524e-05, |
| "loss": 0.6071, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.8965102286401926, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.524887768113489e-05, |
| "loss": 0.6526, |
| "step": 3725 |
| }, |
| { |
| "epoch": 0.8977135980746089, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.521062458408529e-05, |
| "loss": 0.6289, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.8989169675090253, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.517233569092561e-05, |
| "loss": 0.6516, |
| "step": 3735 |
| }, |
| { |
| "epoch": 0.9001203369434416, |
| "grad_norm": 1.96875, |
| "learning_rate": 8.513401106280215e-05, |
| "loss": 0.6479, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.901323706377858, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.50956507609183e-05, |
| "loss": 0.6149, |
| "step": 3745 |
| }, |
| { |
| "epoch": 0.9025270758122743, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.505725484653437e-05, |
| "loss": 0.6184, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.9037304452466908, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.501882338096763e-05, |
| "loss": 0.627, |
| "step": 3755 |
| }, |
| { |
| "epoch": 0.9049338146811071, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.498035642559203e-05, |
| "loss": 0.6689, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.9061371841155235, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.494185404183828e-05, |
| "loss": 0.633, |
| "step": 3765 |
| }, |
| { |
| "epoch": 0.9073405535499398, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.49033162911936e-05, |
| "loss": 0.5911, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.9085439229843562, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.486474323520172e-05, |
| "loss": 0.6439, |
| "step": 3775 |
| }, |
| { |
| "epoch": 0.9097472924187726, |
| "grad_norm": 1.78125, |
| "learning_rate": 8.482613493546275e-05, |
| "loss": 0.6373, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.910950661853189, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.47874914536331e-05, |
| "loss": 0.6325, |
| "step": 3785 |
| }, |
| { |
| "epoch": 0.9121540312876053, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.474881285142532e-05, |
| "loss": 0.6449, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.9133574007220217, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.471009919060811e-05, |
| "loss": 0.6369, |
| "step": 3795 |
| }, |
| { |
| "epoch": 0.914560770156438, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.46713505330061e-05, |
| "loss": 0.633, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.9157641395908543, |
| "grad_norm": 1.78125, |
| "learning_rate": 8.463256694049982e-05, |
| "loss": 0.6176, |
| "step": 3805 |
| }, |
| { |
| "epoch": 0.9169675090252708, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.459374847502562e-05, |
| "loss": 0.6279, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.9181708784596871, |
| "grad_norm": 2.140625, |
| "learning_rate": 8.455489519857552e-05, |
| "loss": 0.613, |
| "step": 3815 |
| }, |
| { |
| "epoch": 0.9193742478941035, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.451600717319714e-05, |
| "loss": 0.6297, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.9205776173285198, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.447708446099358e-05, |
| "loss": 0.6043, |
| "step": 3825 |
| }, |
| { |
| "epoch": 0.9217809867629362, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.443812712412331e-05, |
| "loss": 0.6347, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.9229843561973526, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.439913522480016e-05, |
| "loss": 0.6038, |
| "step": 3835 |
| }, |
| { |
| "epoch": 0.924187725631769, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.436010882529314e-05, |
| "loss": 0.6332, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.9253910950661853, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.432104798792632e-05, |
| "loss": 0.617, |
| "step": 3845 |
| }, |
| { |
| "epoch": 0.9265944645006017, |
| "grad_norm": 1.875, |
| "learning_rate": 8.428195277507874e-05, |
| "loss": 0.6533, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.927797833935018, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.424282324918444e-05, |
| "loss": 0.6323, |
| "step": 3855 |
| }, |
| { |
| "epoch": 0.9290012033694344, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.420365947273217e-05, |
| "loss": 0.6411, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.9302045728038508, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.41644615082654e-05, |
| "loss": 0.6448, |
| "step": 3865 |
| }, |
| { |
| "epoch": 0.9314079422382672, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.412522941838221e-05, |
| "loss": 0.6081, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.9326113116726835, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.408596326573518e-05, |
| "loss": 0.5949, |
| "step": 3875 |
| }, |
| { |
| "epoch": 0.9338146811070999, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.404666311303126e-05, |
| "loss": 0.6219, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.9350180505415162, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.400732902303172e-05, |
| "loss": 0.6101, |
| "step": 3885 |
| }, |
| { |
| "epoch": 0.9362214199759326, |
| "grad_norm": 1.546875, |
| "learning_rate": 8.3967961058552e-05, |
| "loss": 0.6045, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.937424789410349, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.392855928246169e-05, |
| "loss": 0.6527, |
| "step": 3895 |
| }, |
| { |
| "epoch": 0.9386281588447654, |
| "grad_norm": 1.625, |
| "learning_rate": 8.388912375768433e-05, |
| "loss": 0.5743, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.9398315282791817, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.384965454719734e-05, |
| "loss": 0.6105, |
| "step": 3905 |
| }, |
| { |
| "epoch": 0.941034897713598, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.381015171403202e-05, |
| "loss": 0.6586, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.9422382671480144, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.377061532127327e-05, |
| "loss": 0.608, |
| "step": 3915 |
| }, |
| { |
| "epoch": 0.9434416365824309, |
| "grad_norm": 1.5390625, |
| "learning_rate": 8.373104543205963e-05, |
| "loss": 0.5946, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.9446450060168472, |
| "grad_norm": 1.609375, |
| "learning_rate": 8.369144210958312e-05, |
| "loss": 0.5954, |
| "step": 3925 |
| }, |
| { |
| "epoch": 0.9458483754512635, |
| "grad_norm": 1.875, |
| "learning_rate": 8.365180541708917e-05, |
| "loss": 0.631, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.9470517448856799, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.361213541787648e-05, |
| "loss": 0.596, |
| "step": 3935 |
| }, |
| { |
| "epoch": 0.9482551143200962, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.357243217529694e-05, |
| "loss": 0.6246, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.9494584837545126, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.353269575275555e-05, |
| "loss": 0.6135, |
| "step": 3945 |
| }, |
| { |
| "epoch": 0.950661853188929, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.34929262137103e-05, |
| "loss": 0.6014, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.9518652226233454, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.3453123621672e-05, |
| "loss": 0.6129, |
| "step": 3955 |
| }, |
| { |
| "epoch": 0.9530685920577617, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.341328804020435e-05, |
| "loss": 0.6022, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.9542719614921781, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.337341953292365e-05, |
| "loss": 0.5934, |
| "step": 3965 |
| }, |
| { |
| "epoch": 0.9554753309265944, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.33335181634988e-05, |
| "loss": 0.6331, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.9566787003610109, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.329358399565123e-05, |
| "loss": 0.6013, |
| "step": 3975 |
| }, |
| { |
| "epoch": 0.9578820697954272, |
| "grad_norm": 1.625, |
| "learning_rate": 8.325361709315466e-05, |
| "loss": 0.5836, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.9590854392298436, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.321361751983516e-05, |
| "loss": 0.6138, |
| "step": 3985 |
| }, |
| { |
| "epoch": 0.9602888086642599, |
| "grad_norm": 1.875, |
| "learning_rate": 8.317358533957095e-05, |
| "loss": 0.6239, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.9614921780986763, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.313352061629231e-05, |
| "loss": 0.6073, |
| "step": 3995 |
| }, |
| { |
| "epoch": 0.9626955475330926, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.309342341398151e-05, |
| "loss": 0.6274, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9626955475330926, |
| "eval_loss": 0.5121676325798035, |
| "eval_runtime": 2.6684, |
| "eval_samples_per_second": 74.95, |
| "eval_steps_per_second": 74.95, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9638989169675091, |
| "grad_norm": 1.6640625, |
| "learning_rate": 8.305329379667268e-05, |
| "loss": 0.6214, |
| "step": 4005 |
| }, |
| { |
| "epoch": 0.9651022864019254, |
| "grad_norm": 1.75, |
| "learning_rate": 8.30131318284517e-05, |
| "loss": 0.6042, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.9663056558363418, |
| "grad_norm": 1.6015625, |
| "learning_rate": 8.297293757345617e-05, |
| "loss": 0.5803, |
| "step": 4015 |
| }, |
| { |
| "epoch": 0.9675090252707581, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.293271109587516e-05, |
| "loss": 0.6131, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.9687123947051745, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.289245245994927e-05, |
| "loss": 0.6206, |
| "step": 4025 |
| }, |
| { |
| "epoch": 0.9699157641395909, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.285216172997044e-05, |
| "loss": 0.6494, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.9711191335740073, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.281183897028185e-05, |
| "loss": 0.6191, |
| "step": 4035 |
| }, |
| { |
| "epoch": 0.9723225030084236, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.277148424527785e-05, |
| "loss": 0.6054, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.97352587244284, |
| "grad_norm": 1.6640625, |
| "learning_rate": 8.273109761940382e-05, |
| "loss": 0.5866, |
| "step": 4045 |
| }, |
| { |
| "epoch": 0.9747292418772563, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.269067915715609e-05, |
| "loss": 0.6342, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.9759326113116726, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.265022892308182e-05, |
| "loss": 0.6076, |
| "step": 4055 |
| }, |
| { |
| "epoch": 0.9771359807460891, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.260974698177894e-05, |
| "loss": 0.6061, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.9783393501805054, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.2569233397896e-05, |
| "loss": 0.6105, |
| "step": 4065 |
| }, |
| { |
| "epoch": 0.9795427196149218, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.252868823613206e-05, |
| "loss": 0.6044, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.9807460890493381, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.248811156123662e-05, |
| "loss": 0.6238, |
| "step": 4075 |
| }, |
| { |
| "epoch": 0.9819494584837545, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.244750343800957e-05, |
| "loss": 0.5803, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.9831528279181708, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.240686393130091e-05, |
| "loss": 0.6073, |
| "step": 4085 |
| }, |
| { |
| "epoch": 0.9843561973525873, |
| "grad_norm": 1.75, |
| "learning_rate": 8.236619310601085e-05, |
| "loss": 0.6112, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.9855595667870036, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.232549102708955e-05, |
| "loss": 0.6062, |
| "step": 4095 |
| }, |
| { |
| "epoch": 0.98676293622142, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.228475775953712e-05, |
| "loss": 0.6052, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.9879663056558363, |
| "grad_norm": 1.625, |
| "learning_rate": 8.224399336840348e-05, |
| "loss": 0.6065, |
| "step": 4105 |
| }, |
| { |
| "epoch": 0.9891696750902527, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.220319791878824e-05, |
| "loss": 0.6078, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.9903730445246691, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.21623714758406e-05, |
| "loss": 0.6207, |
| "step": 4115 |
| }, |
| { |
| "epoch": 0.9915764139590855, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.212151410475929e-05, |
| "loss": 0.6269, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.9927797833935018, |
| "grad_norm": 1.6640625, |
| "learning_rate": 8.208062587079237e-05, |
| "loss": 0.5992, |
| "step": 4125 |
| }, |
| { |
| "epoch": 0.9939831528279182, |
| "grad_norm": 1.7109375, |
| "learning_rate": 8.203970683923724e-05, |
| "loss": 0.6045, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.9951865222623345, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.199875707544049e-05, |
| "loss": 0.6244, |
| "step": 4135 |
| }, |
| { |
| "epoch": 0.9963898916967509, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.195777664479775e-05, |
| "loss": 0.6142, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.9975932611311673, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.191676561275365e-05, |
| "loss": 0.6293, |
| "step": 4145 |
| }, |
| { |
| "epoch": 0.9987966305655837, |
| "grad_norm": 1.7109375, |
| "learning_rate": 8.187572404480168e-05, |
| "loss": 0.6087, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.9997593261131167, |
| "eval_loss": 0.5122986435890198, |
| "eval_runtime": 2.6521, |
| "eval_samples_per_second": 75.413, |
| "eval_steps_per_second": 75.413, |
| "step": 4154 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.183465200648407e-05, |
| "loss": 0.5928, |
| "step": 4155 |
| }, |
| { |
| "epoch": 1.0012033694344165, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.179354956339176e-05, |
| "loss": 0.5627, |
| "step": 4160 |
| }, |
| { |
| "epoch": 1.0024067388688327, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.175241678116423e-05, |
| "loss": 0.5571, |
| "step": 4165 |
| }, |
| { |
| "epoch": 1.0036101083032491, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.171125372548937e-05, |
| "loss": 0.6396, |
| "step": 4170 |
| }, |
| { |
| "epoch": 1.0048134777376654, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.167006046210347e-05, |
| "loss": 0.6025, |
| "step": 4175 |
| }, |
| { |
| "epoch": 1.0060168471720818, |
| "grad_norm": 1.6640625, |
| "learning_rate": 8.162883705679107e-05, |
| "loss": 0.6016, |
| "step": 4180 |
| }, |
| { |
| "epoch": 1.0072202166064983, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.158758357538474e-05, |
| "loss": 0.5501, |
| "step": 4185 |
| }, |
| { |
| "epoch": 1.0084235860409145, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.154630008376522e-05, |
| "loss": 0.548, |
| "step": 4190 |
| }, |
| { |
| "epoch": 1.009626955475331, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.150498664786107e-05, |
| "loss": 0.5533, |
| "step": 4195 |
| }, |
| { |
| "epoch": 1.0108303249097472, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.146364333364874e-05, |
| "loss": 0.5879, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.0120336943441637, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.142227020715236e-05, |
| "loss": 0.5898, |
| "step": 4205 |
| }, |
| { |
| "epoch": 1.01323706377858, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.138086733444366e-05, |
| "loss": 0.5648, |
| "step": 4210 |
| }, |
| { |
| "epoch": 1.0144404332129964, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.13394347816419e-05, |
| "loss": 0.5852, |
| "step": 4215 |
| }, |
| { |
| "epoch": 1.0156438026474128, |
| "grad_norm": 1.75, |
| "learning_rate": 8.129797261491373e-05, |
| "loss": 0.5723, |
| "step": 4220 |
| }, |
| { |
| "epoch": 1.016847172081829, |
| "grad_norm": 1.921875, |
| "learning_rate": 8.125648090047308e-05, |
| "loss": 0.5741, |
| "step": 4225 |
| }, |
| { |
| "epoch": 1.0180505415162455, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.12149597045811e-05, |
| "loss": 0.578, |
| "step": 4230 |
| }, |
| { |
| "epoch": 1.0192539109506618, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.117340909354598e-05, |
| "loss": 0.5999, |
| "step": 4235 |
| }, |
| { |
| "epoch": 1.0204572803850782, |
| "grad_norm": 1.8984375, |
| "learning_rate": 8.11318291337229e-05, |
| "loss": 0.6071, |
| "step": 4240 |
| }, |
| { |
| "epoch": 1.0216606498194947, |
| "grad_norm": 1.78125, |
| "learning_rate": 8.10902198915139e-05, |
| "loss": 0.6093, |
| "step": 4245 |
| }, |
| { |
| "epoch": 1.022864019253911, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.104858143336784e-05, |
| "loss": 0.5742, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.0240673886883274, |
| "grad_norm": 1.75, |
| "learning_rate": 8.100691382578017e-05, |
| "loss": 0.5686, |
| "step": 4255 |
| }, |
| { |
| "epoch": 1.0252707581227436, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.096521713529291e-05, |
| "loss": 0.5793, |
| "step": 4260 |
| }, |
| { |
| "epoch": 1.02647412755716, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.092349142849451e-05, |
| "loss": 0.5686, |
| "step": 4265 |
| }, |
| { |
| "epoch": 1.0276774969915765, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.088173677201983e-05, |
| "loss": 0.5715, |
| "step": 4270 |
| }, |
| { |
| "epoch": 1.0288808664259927, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.083995323254985e-05, |
| "loss": 0.5892, |
| "step": 4275 |
| }, |
| { |
| "epoch": 1.0300842358604092, |
| "grad_norm": 1.6171875, |
| "learning_rate": 8.079814087681177e-05, |
| "loss": 0.5761, |
| "step": 4280 |
| }, |
| { |
| "epoch": 1.0312876052948254, |
| "grad_norm": 1.59375, |
| "learning_rate": 8.075629977157875e-05, |
| "loss": 0.5844, |
| "step": 4285 |
| }, |
| { |
| "epoch": 1.032490974729242, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.071442998366989e-05, |
| "loss": 0.5738, |
| "step": 4290 |
| }, |
| { |
| "epoch": 1.0336943441636581, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.06725315799501e-05, |
| "loss": 0.5608, |
| "step": 4295 |
| }, |
| { |
| "epoch": 1.0348977135980746, |
| "grad_norm": 1.5625, |
| "learning_rate": 8.063060462732998e-05, |
| "loss": 0.5486, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.036101083032491, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.058864919276573e-05, |
| "loss": 0.5528, |
| "step": 4305 |
| }, |
| { |
| "epoch": 1.0373044524669073, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.054666534325897e-05, |
| "loss": 0.5759, |
| "step": 4310 |
| }, |
| { |
| "epoch": 1.0385078219013237, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.050465314585683e-05, |
| "loss": 0.5518, |
| "step": 4315 |
| }, |
| { |
| "epoch": 1.03971119133574, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.046261266765159e-05, |
| "loss": 0.5691, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.0409145607701564, |
| "grad_norm": 1.703125, |
| "learning_rate": 8.042054397578074e-05, |
| "loss": 0.5697, |
| "step": 4325 |
| }, |
| { |
| "epoch": 1.0421179302045729, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.037844713742681e-05, |
| "loss": 0.5935, |
| "step": 4330 |
| }, |
| { |
| "epoch": 1.0433212996389891, |
| "grad_norm": 1.6171875, |
| "learning_rate": 8.033632221981734e-05, |
| "loss": 0.55, |
| "step": 4335 |
| }, |
| { |
| "epoch": 1.0445246690734056, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.029416929022463e-05, |
| "loss": 0.5731, |
| "step": 4340 |
| }, |
| { |
| "epoch": 1.0457280385078218, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.025198841596576e-05, |
| "loss": 0.5923, |
| "step": 4345 |
| }, |
| { |
| "epoch": 1.0469314079422383, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.020977966440242e-05, |
| "loss": 0.5673, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.0481347773766547, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.016754310294083e-05, |
| "loss": 0.564, |
| "step": 4355 |
| }, |
| { |
| "epoch": 1.049338146811071, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.012527879903161e-05, |
| "loss": 0.5489, |
| "step": 4360 |
| }, |
| { |
| "epoch": 1.0505415162454874, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.008298682016972e-05, |
| "loss": 0.5378, |
| "step": 4365 |
| }, |
| { |
| "epoch": 1.0517448856799037, |
| "grad_norm": 1.7109375, |
| "learning_rate": 8.004066723389425e-05, |
| "loss": 0.5723, |
| "step": 4370 |
| }, |
| { |
| "epoch": 1.05294825511432, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.999832010778844e-05, |
| "loss": 0.5554, |
| "step": 4375 |
| }, |
| { |
| "epoch": 1.0541516245487366, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.995594550947946e-05, |
| "loss": 0.5525, |
| "step": 4380 |
| }, |
| { |
| "epoch": 1.0553549939831528, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.991354350663841e-05, |
| "loss": 0.5611, |
| "step": 4385 |
| }, |
| { |
| "epoch": 1.0565583634175693, |
| "grad_norm": 1.859375, |
| "learning_rate": 7.98711141669801e-05, |
| "loss": 0.5853, |
| "step": 4390 |
| }, |
| { |
| "epoch": 1.0577617328519855, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.982865755826304e-05, |
| "loss": 0.5756, |
| "step": 4395 |
| }, |
| { |
| "epoch": 1.058965102286402, |
| "grad_norm": 1.75, |
| "learning_rate": 7.978617374828925e-05, |
| "loss": 0.6022, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.0601684717208182, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.974366280490421e-05, |
| "loss": 0.5918, |
| "step": 4405 |
| }, |
| { |
| "epoch": 1.0613718411552346, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.970112479599673e-05, |
| "loss": 0.5539, |
| "step": 4410 |
| }, |
| { |
| "epoch": 1.062575210589651, |
| "grad_norm": 1.5859375, |
| "learning_rate": 7.965855978949883e-05, |
| "loss": 0.5474, |
| "step": 4415 |
| }, |
| { |
| "epoch": 1.0637785800240673, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.961596785338567e-05, |
| "loss": 0.5954, |
| "step": 4420 |
| }, |
| { |
| "epoch": 1.0649819494584838, |
| "grad_norm": 1.7265625, |
| "learning_rate": 7.95733490556754e-05, |
| "loss": 0.5742, |
| "step": 4425 |
| }, |
| { |
| "epoch": 1.0661853188929, |
| "grad_norm": 1.6953125, |
| "learning_rate": 7.953070346442906e-05, |
| "loss": 0.5809, |
| "step": 4430 |
| }, |
| { |
| "epoch": 1.0673886883273165, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.948803114775048e-05, |
| "loss": 0.5991, |
| "step": 4435 |
| }, |
| { |
| "epoch": 1.068592057761733, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.944533217378621e-05, |
| "loss": 0.5433, |
| "step": 4440 |
| }, |
| { |
| "epoch": 1.0697954271961492, |
| "grad_norm": 1.765625, |
| "learning_rate": 7.940260661072528e-05, |
| "loss": 0.5351, |
| "step": 4445 |
| }, |
| { |
| "epoch": 1.0709987966305656, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.935985452679931e-05, |
| "loss": 0.5339, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.0722021660649819, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.931707599028214e-05, |
| "loss": 0.5568, |
| "step": 4455 |
| }, |
| { |
| "epoch": 1.0734055354993983, |
| "grad_norm": 1.5625, |
| "learning_rate": 7.927427106948996e-05, |
| "loss": 0.5831, |
| "step": 4460 |
| }, |
| { |
| "epoch": 1.0746089049338148, |
| "grad_norm": 1.7578125, |
| "learning_rate": 7.923143983278104e-05, |
| "loss": 0.5555, |
| "step": 4465 |
| }, |
| { |
| "epoch": 1.075812274368231, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.918858234855566e-05, |
| "loss": 0.5828, |
| "step": 4470 |
| }, |
| { |
| "epoch": 1.0770156438026475, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.914569868525606e-05, |
| "loss": 0.5506, |
| "step": 4475 |
| }, |
| { |
| "epoch": 1.0782190132370637, |
| "grad_norm": 1.828125, |
| "learning_rate": 7.910278891136629e-05, |
| "loss": 0.5747, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.0794223826714802, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.905985309541206e-05, |
| "loss": 0.5624, |
| "step": 4485 |
| }, |
| { |
| "epoch": 1.0806257521058966, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.90168913059607e-05, |
| "loss": 0.5785, |
| "step": 4490 |
| }, |
| { |
| "epoch": 1.0818291215403129, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.897390361162101e-05, |
| "loss": 0.5106, |
| "step": 4495 |
| }, |
| { |
| "epoch": 1.0830324909747293, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.893089008104314e-05, |
| "loss": 0.5917, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.0830324909747293, |
| "eval_loss": 0.5059241652488708, |
| "eval_runtime": 2.6442, |
| "eval_samples_per_second": 75.636, |
| "eval_steps_per_second": 75.636, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.0842358604091455, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.88878507829185e-05, |
| "loss": 0.5805, |
| "step": 4505 |
| }, |
| { |
| "epoch": 1.085439229843562, |
| "grad_norm": 1.6640625, |
| "learning_rate": 7.884478578597971e-05, |
| "loss": 0.567, |
| "step": 4510 |
| }, |
| { |
| "epoch": 1.0866425992779782, |
| "grad_norm": 1.828125, |
| "learning_rate": 7.880169515900036e-05, |
| "loss": 0.5463, |
| "step": 4515 |
| }, |
| { |
| "epoch": 1.0878459687123947, |
| "grad_norm": 1.671875, |
| "learning_rate": 7.875857897079499e-05, |
| "loss": 0.5636, |
| "step": 4520 |
| }, |
| { |
| "epoch": 1.0890493381468112, |
| "grad_norm": 1.75, |
| "learning_rate": 7.871543729021899e-05, |
| "loss": 0.5626, |
| "step": 4525 |
| }, |
| { |
| "epoch": 1.0902527075812274, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.867227018616841e-05, |
| "loss": 0.5819, |
| "step": 4530 |
| }, |
| { |
| "epoch": 1.0914560770156438, |
| "grad_norm": 1.6640625, |
| "learning_rate": 7.862907772757996e-05, |
| "loss": 0.5692, |
| "step": 4535 |
| }, |
| { |
| "epoch": 1.09265944645006, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.858585998343077e-05, |
| "loss": 0.5725, |
| "step": 4540 |
| }, |
| { |
| "epoch": 1.0938628158844765, |
| "grad_norm": 1.640625, |
| "learning_rate": 7.854261702273843e-05, |
| "loss": 0.5476, |
| "step": 4545 |
| }, |
| { |
| "epoch": 1.095066185318893, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.849934891456073e-05, |
| "loss": 0.554, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.0962695547533092, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.845605572799565e-05, |
| "loss": 0.5864, |
| "step": 4555 |
| }, |
| { |
| "epoch": 1.0974729241877257, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.841273753218124e-05, |
| "loss": 0.5642, |
| "step": 4560 |
| }, |
| { |
| "epoch": 1.098676293622142, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.836939439629546e-05, |
| "loss": 0.5659, |
| "step": 4565 |
| }, |
| { |
| "epoch": 1.0998796630565584, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.832602638955609e-05, |
| "loss": 0.5449, |
| "step": 4570 |
| }, |
| { |
| "epoch": 1.1010830324909748, |
| "grad_norm": 1.59375, |
| "learning_rate": 7.828263358122067e-05, |
| "loss": 0.571, |
| "step": 4575 |
| }, |
| { |
| "epoch": 1.102286401925391, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.82392160405863e-05, |
| "loss": 0.5436, |
| "step": 4580 |
| }, |
| { |
| "epoch": 1.1034897713598075, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.81957738369896e-05, |
| "loss": 0.5785, |
| "step": 4585 |
| }, |
| { |
| "epoch": 1.1046931407942238, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.815230703980658e-05, |
| "loss": 0.5879, |
| "step": 4590 |
| }, |
| { |
| "epoch": 1.1058965102286402, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.810881571845253e-05, |
| "loss": 0.5882, |
| "step": 4595 |
| }, |
| { |
| "epoch": 1.1070998796630565, |
| "grad_norm": 1.71875, |
| "learning_rate": 7.806529994238188e-05, |
| "loss": 0.5838, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.108303249097473, |
| "grad_norm": 1.640625, |
| "learning_rate": 7.802175978108813e-05, |
| "loss": 0.5813, |
| "step": 4605 |
| }, |
| { |
| "epoch": 1.1095066185318894, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.797819530410373e-05, |
| "loss": 0.5836, |
| "step": 4610 |
| }, |
| { |
| "epoch": 1.1107099879663056, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.793460658099995e-05, |
| "loss": 0.6057, |
| "step": 4615 |
| }, |
| { |
| "epoch": 1.111913357400722, |
| "grad_norm": 1.7265625, |
| "learning_rate": 7.789099368138678e-05, |
| "loss": 0.5564, |
| "step": 4620 |
| }, |
| { |
| "epoch": 1.1131167268351383, |
| "grad_norm": 1.5546875, |
| "learning_rate": 7.78473566749128e-05, |
| "loss": 0.5527, |
| "step": 4625 |
| }, |
| { |
| "epoch": 1.1143200962695547, |
| "grad_norm": 1.875, |
| "learning_rate": 7.780369563126516e-05, |
| "loss": 0.5814, |
| "step": 4630 |
| }, |
| { |
| "epoch": 1.1155234657039712, |
| "grad_norm": 1.625, |
| "learning_rate": 7.77600106201693e-05, |
| "loss": 0.5483, |
| "step": 4635 |
| }, |
| { |
| "epoch": 1.1167268351383874, |
| "grad_norm": 1.6640625, |
| "learning_rate": 7.7716301711389e-05, |
| "loss": 0.5661, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.117930204572804, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.767256897472616e-05, |
| "loss": 0.5879, |
| "step": 4645 |
| }, |
| { |
| "epoch": 1.1191335740072201, |
| "grad_norm": 1.9140625, |
| "learning_rate": 7.76288124800208e-05, |
| "loss": 0.5954, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.1203369434416366, |
| "grad_norm": 1.828125, |
| "learning_rate": 7.758503229715083e-05, |
| "loss": 0.5783, |
| "step": 4655 |
| }, |
| { |
| "epoch": 1.121540312876053, |
| "grad_norm": 1.765625, |
| "learning_rate": 7.754122849603197e-05, |
| "loss": 0.5783, |
| "step": 4660 |
| }, |
| { |
| "epoch": 1.1227436823104693, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.749740114661771e-05, |
| "loss": 0.5272, |
| "step": 4665 |
| }, |
| { |
| "epoch": 1.1239470517448857, |
| "grad_norm": 1.578125, |
| "learning_rate": 7.745355031889913e-05, |
| "loss": 0.5758, |
| "step": 4670 |
| }, |
| { |
| "epoch": 1.125150421179302, |
| "grad_norm": 1.6015625, |
| "learning_rate": 7.740967608290477e-05, |
| "loss": 0.534, |
| "step": 4675 |
| }, |
| { |
| "epoch": 1.1263537906137184, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.736577850870058e-05, |
| "loss": 0.5413, |
| "step": 4680 |
| }, |
| { |
| "epoch": 1.1275571600481347, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.73218576663898e-05, |
| "loss": 0.5879, |
| "step": 4685 |
| }, |
| { |
| "epoch": 1.1287605294825511, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.727791362611279e-05, |
| "loss": 0.5373, |
| "step": 4690 |
| }, |
| { |
| "epoch": 1.1299638989169676, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.7233946458047e-05, |
| "loss": 0.5613, |
| "step": 4695 |
| }, |
| { |
| "epoch": 1.1311672683513838, |
| "grad_norm": 1.8984375, |
| "learning_rate": 7.718995623240674e-05, |
| "loss": 0.592, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.1323706377858003, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.714594301944324e-05, |
| "loss": 0.5852, |
| "step": 4705 |
| }, |
| { |
| "epoch": 1.1335740072202167, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.710190688944438e-05, |
| "loss": 0.5835, |
| "step": 4710 |
| }, |
| { |
| "epoch": 1.134777376654633, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.705784791273463e-05, |
| "loss": 0.5649, |
| "step": 4715 |
| }, |
| { |
| "epoch": 1.1359807460890494, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.701376615967497e-05, |
| "loss": 0.5628, |
| "step": 4720 |
| }, |
| { |
| "epoch": 1.1371841155234657, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.696966170066276e-05, |
| "loss": 0.5844, |
| "step": 4725 |
| }, |
| { |
| "epoch": 1.1383874849578821, |
| "grad_norm": 1.671875, |
| "learning_rate": 7.69255346061316e-05, |
| "loss": 0.5349, |
| "step": 4730 |
| }, |
| { |
| "epoch": 1.1395908543922983, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.688138494655125e-05, |
| "loss": 0.5429, |
| "step": 4735 |
| }, |
| { |
| "epoch": 1.1407942238267148, |
| "grad_norm": 1.7265625, |
| "learning_rate": 7.683721279242749e-05, |
| "loss": 0.5762, |
| "step": 4740 |
| }, |
| { |
| "epoch": 1.1419975932611313, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.679301821430205e-05, |
| "loss": 0.5778, |
| "step": 4745 |
| }, |
| { |
| "epoch": 1.1432009626955475, |
| "grad_norm": 1.671875, |
| "learning_rate": 7.674880128275246e-05, |
| "loss": 0.5823, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.144404332129964, |
| "grad_norm": 1.5703125, |
| "learning_rate": 7.670456206839193e-05, |
| "loss": 0.5535, |
| "step": 4755 |
| }, |
| { |
| "epoch": 1.1456077015643802, |
| "grad_norm": 1.609375, |
| "learning_rate": 7.666030064186928e-05, |
| "loss": 0.5651, |
| "step": 4760 |
| }, |
| { |
| "epoch": 1.1468110709987966, |
| "grad_norm": 1.6015625, |
| "learning_rate": 7.661601707386877e-05, |
| "loss": 0.5744, |
| "step": 4765 |
| }, |
| { |
| "epoch": 1.1480144404332129, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.657171143511008e-05, |
| "loss": 0.5506, |
| "step": 4770 |
| }, |
| { |
| "epoch": 1.1492178098676293, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.652738379634806e-05, |
| "loss": 0.5681, |
| "step": 4775 |
| }, |
| { |
| "epoch": 1.1504211793020458, |
| "grad_norm": 1.5703125, |
| "learning_rate": 7.648303422837275e-05, |
| "loss": 0.5271, |
| "step": 4780 |
| }, |
| { |
| "epoch": 1.151624548736462, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.643866280200916e-05, |
| "loss": 0.5712, |
| "step": 4785 |
| }, |
| { |
| "epoch": 1.1528279181708785, |
| "grad_norm": 1.5390625, |
| "learning_rate": 7.639426958811728e-05, |
| "loss": 0.5676, |
| "step": 4790 |
| }, |
| { |
| "epoch": 1.154031287605295, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.634985465759185e-05, |
| "loss": 0.5726, |
| "step": 4795 |
| }, |
| { |
| "epoch": 1.1552346570397112, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.630541808136224e-05, |
| "loss": 0.5879, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.1564380264741276, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.626095993039252e-05, |
| "loss": 0.5786, |
| "step": 4805 |
| }, |
| { |
| "epoch": 1.1576413959085439, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.621648027568106e-05, |
| "loss": 0.5461, |
| "step": 4810 |
| }, |
| { |
| "epoch": 1.1588447653429603, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.61719791882607e-05, |
| "loss": 0.5605, |
| "step": 4815 |
| }, |
| { |
| "epoch": 1.1600481347773766, |
| "grad_norm": 2.0, |
| "learning_rate": 7.612745673919841e-05, |
| "loss": 0.5982, |
| "step": 4820 |
| }, |
| { |
| "epoch": 1.161251504211793, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.608291299959535e-05, |
| "loss": 0.5808, |
| "step": 4825 |
| }, |
| { |
| "epoch": 1.1624548736462095, |
| "grad_norm": 1.7578125, |
| "learning_rate": 7.603834804058663e-05, |
| "loss": 0.5782, |
| "step": 4830 |
| }, |
| { |
| "epoch": 1.1636582430806257, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.599376193334128e-05, |
| "loss": 0.5563, |
| "step": 4835 |
| }, |
| { |
| "epoch": 1.1648616125150422, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.594915474906208e-05, |
| "loss": 0.5864, |
| "step": 4840 |
| }, |
| { |
| "epoch": 1.1660649819494584, |
| "grad_norm": 1.71875, |
| "learning_rate": 7.590452655898546e-05, |
| "loss": 0.5598, |
| "step": 4845 |
| }, |
| { |
| "epoch": 1.1672683513838749, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.585987743438143e-05, |
| "loss": 0.5418, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.168471720818291, |
| "grad_norm": 1.75, |
| "learning_rate": 7.58152074465534e-05, |
| "loss": 0.5851, |
| "step": 4855 |
| }, |
| { |
| "epoch": 1.1696750902527075, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.577051666683814e-05, |
| "loss": 0.5671, |
| "step": 4860 |
| }, |
| { |
| "epoch": 1.170878459687124, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.572580516660559e-05, |
| "loss": 0.5606, |
| "step": 4865 |
| }, |
| { |
| "epoch": 1.1720818291215402, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.568107301725875e-05, |
| "loss": 0.5843, |
| "step": 4870 |
| }, |
| { |
| "epoch": 1.1732851985559567, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.563632029023366e-05, |
| "loss": 0.5452, |
| "step": 4875 |
| }, |
| { |
| "epoch": 1.1744885679903732, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.55915470569992e-05, |
| "loss": 0.5551, |
| "step": 4880 |
| }, |
| { |
| "epoch": 1.1756919374247894, |
| "grad_norm": 1.765625, |
| "learning_rate": 7.554675338905699e-05, |
| "loss": 0.585, |
| "step": 4885 |
| }, |
| { |
| "epoch": 1.1768953068592058, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.550193935794128e-05, |
| "loss": 0.5754, |
| "step": 4890 |
| }, |
| { |
| "epoch": 1.178098676293622, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.545710503521885e-05, |
| "loss": 0.5483, |
| "step": 4895 |
| }, |
| { |
| "epoch": 1.1793020457280385, |
| "grad_norm": 1.7265625, |
| "learning_rate": 7.541225049248888e-05, |
| "loss": 0.5752, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.1805054151624548, |
| "grad_norm": 1.671875, |
| "learning_rate": 7.536737580138282e-05, |
| "loss": 0.5567, |
| "step": 4905 |
| }, |
| { |
| "epoch": 1.1817087845968712, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.532248103356436e-05, |
| "loss": 0.5805, |
| "step": 4910 |
| }, |
| { |
| "epoch": 1.1829121540312877, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.527756626072919e-05, |
| "loss": 0.5726, |
| "step": 4915 |
| }, |
| { |
| "epoch": 1.184115523465704, |
| "grad_norm": 1.859375, |
| "learning_rate": 7.523263155460495e-05, |
| "loss": 0.5953, |
| "step": 4920 |
| }, |
| { |
| "epoch": 1.1853188929001204, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.518767698695114e-05, |
| "loss": 0.5617, |
| "step": 4925 |
| }, |
| { |
| "epoch": 1.1865222623345366, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.514270262955898e-05, |
| "loss": 0.5497, |
| "step": 4930 |
| }, |
| { |
| "epoch": 1.187725631768953, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.509770855425127e-05, |
| "loss": 0.5711, |
| "step": 4935 |
| }, |
| { |
| "epoch": 1.1889290012033695, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.505269483288231e-05, |
| "loss": 0.5499, |
| "step": 4940 |
| }, |
| { |
| "epoch": 1.1901323706377858, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.50076615373378e-05, |
| "loss": 0.592, |
| "step": 4945 |
| }, |
| { |
| "epoch": 1.1913357400722022, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.496260873953466e-05, |
| "loss": 0.5446, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.1925391095066185, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.491753651142096e-05, |
| "loss": 0.5606, |
| "step": 4955 |
| }, |
| { |
| "epoch": 1.193742478941035, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.48724449249758e-05, |
| "loss": 0.5566, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.1949458483754514, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.482733405220924e-05, |
| "loss": 0.544, |
| "step": 4965 |
| }, |
| { |
| "epoch": 1.1961492178098676, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.478220396516206e-05, |
| "loss": 0.5476, |
| "step": 4970 |
| }, |
| { |
| "epoch": 1.197352587244284, |
| "grad_norm": 1.578125, |
| "learning_rate": 7.473705473590582e-05, |
| "loss": 0.556, |
| "step": 4975 |
| }, |
| { |
| "epoch": 1.1985559566787003, |
| "grad_norm": 1.6015625, |
| "learning_rate": 7.469188643654254e-05, |
| "loss": 0.5527, |
| "step": 4980 |
| }, |
| { |
| "epoch": 1.1997593261131168, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.46466991392048e-05, |
| "loss": 0.5428, |
| "step": 4985 |
| }, |
| { |
| "epoch": 1.200962695547533, |
| "grad_norm": 1.71875, |
| "learning_rate": 7.460149291605544e-05, |
| "loss": 0.5524, |
| "step": 4990 |
| }, |
| { |
| "epoch": 1.2021660649819494, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.455626783928757e-05, |
| "loss": 0.597, |
| "step": 4995 |
| }, |
| { |
| "epoch": 1.203369434416366, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.45110239811244e-05, |
| "loss": 0.5684, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.203369434416366, |
| "eval_loss": 0.49116280674934387, |
| "eval_runtime": 2.6476, |
| "eval_samples_per_second": 75.54, |
| "eval_steps_per_second": 75.54, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.2045728038507821, |
| "grad_norm": 1.6015625, |
| "learning_rate": 7.446576141381909e-05, |
| "loss": 0.5554, |
| "step": 5005 |
| }, |
| { |
| "epoch": 1.2057761732851986, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.442048020965478e-05, |
| "loss": 0.5728, |
| "step": 5010 |
| }, |
| { |
| "epoch": 1.2069795427196148, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.437518044094426e-05, |
| "loss": 0.5353, |
| "step": 5015 |
| }, |
| { |
| "epoch": 1.2081829121540313, |
| "grad_norm": 1.6171875, |
| "learning_rate": 7.432986218003002e-05, |
| "loss": 0.5448, |
| "step": 5020 |
| }, |
| { |
| "epoch": 1.2093862815884477, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.428452549928408e-05, |
| "loss": 0.5824, |
| "step": 5025 |
| }, |
| { |
| "epoch": 1.210589651022864, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.423917047110788e-05, |
| "loss": 0.5641, |
| "step": 5030 |
| }, |
| { |
| "epoch": 1.2117930204572804, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.419379716793217e-05, |
| "loss": 0.5362, |
| "step": 5035 |
| }, |
| { |
| "epoch": 1.2129963898916967, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.414840566221683e-05, |
| "loss": 0.5717, |
| "step": 5040 |
| }, |
| { |
| "epoch": 1.2141997593261131, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.410299602645087e-05, |
| "loss": 0.59, |
| "step": 5045 |
| }, |
| { |
| "epoch": 1.2154031287605296, |
| "grad_norm": 1.7578125, |
| "learning_rate": 7.405756833315221e-05, |
| "loss": 0.5442, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.2166064981949458, |
| "grad_norm": 1.8984375, |
| "learning_rate": 7.401212265486765e-05, |
| "loss": 0.5929, |
| "step": 5055 |
| }, |
| { |
| "epoch": 1.2178098676293623, |
| "grad_norm": 1.6640625, |
| "learning_rate": 7.396665906417269e-05, |
| "loss": 0.5278, |
| "step": 5060 |
| }, |
| { |
| "epoch": 1.2190132370637785, |
| "grad_norm": 1.609375, |
| "learning_rate": 7.392117763367142e-05, |
| "loss": 0.5568, |
| "step": 5065 |
| }, |
| { |
| "epoch": 1.220216606498195, |
| "grad_norm": 1.7578125, |
| "learning_rate": 7.387567843599643e-05, |
| "loss": 0.5513, |
| "step": 5070 |
| }, |
| { |
| "epoch": 1.2214199759326112, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.383016154380869e-05, |
| "loss": 0.5313, |
| "step": 5075 |
| }, |
| { |
| "epoch": 1.2226233453670277, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.378462702979744e-05, |
| "loss": 0.5842, |
| "step": 5080 |
| }, |
| { |
| "epoch": 1.2238267148014441, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.373907496668005e-05, |
| "loss": 0.561, |
| "step": 5085 |
| }, |
| { |
| "epoch": 1.2250300842358604, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.369350542720187e-05, |
| "loss": 0.5642, |
| "step": 5090 |
| }, |
| { |
| "epoch": 1.2262334536702768, |
| "grad_norm": 1.59375, |
| "learning_rate": 7.364791848413625e-05, |
| "loss": 0.5696, |
| "step": 5095 |
| }, |
| { |
| "epoch": 1.2274368231046933, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.360231421028426e-05, |
| "loss": 0.5507, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.2286401925391095, |
| "grad_norm": 1.5625, |
| "learning_rate": 7.355669267847469e-05, |
| "loss": 0.5596, |
| "step": 5105 |
| }, |
| { |
| "epoch": 1.229843561973526, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.351105396156385e-05, |
| "loss": 0.5707, |
| "step": 5110 |
| }, |
| { |
| "epoch": 1.2310469314079422, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.346539813243554e-05, |
| "loss": 0.5737, |
| "step": 5115 |
| }, |
| { |
| "epoch": 1.2322503008423586, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.341972526400083e-05, |
| "loss": 0.5665, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.2334536702767749, |
| "grad_norm": 1.6015625, |
| "learning_rate": 7.337403542919807e-05, |
| "loss": 0.5499, |
| "step": 5125 |
| }, |
| { |
| "epoch": 1.2346570397111913, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.332832870099264e-05, |
| "loss": 0.5377, |
| "step": 5130 |
| }, |
| { |
| "epoch": 1.2358604091456078, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.328260515237696e-05, |
| "loss": 0.5678, |
| "step": 5135 |
| }, |
| { |
| "epoch": 1.237063778580024, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.323686485637024e-05, |
| "loss": 0.5411, |
| "step": 5140 |
| }, |
| { |
| "epoch": 1.2382671480144405, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.319110788601853e-05, |
| "loss": 0.5587, |
| "step": 5145 |
| }, |
| { |
| "epoch": 1.2394705174488567, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.31453343143944e-05, |
| "loss": 0.5671, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.2406738868832732, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.3099544214597e-05, |
| "loss": 0.5675, |
| "step": 5155 |
| }, |
| { |
| "epoch": 1.2418772563176894, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.305373765975188e-05, |
| "loss": 0.552, |
| "step": 5160 |
| }, |
| { |
| "epoch": 1.2430806257521059, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.300791472301084e-05, |
| "loss": 0.5643, |
| "step": 5165 |
| }, |
| { |
| "epoch": 1.2442839951865223, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.296207547755183e-05, |
| "loss": 0.5496, |
| "step": 5170 |
| }, |
| { |
| "epoch": 1.2454873646209386, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.29162199965789e-05, |
| "loss": 0.5535, |
| "step": 5175 |
| }, |
| { |
| "epoch": 1.246690734055355, |
| "grad_norm": 1.765625, |
| "learning_rate": 7.287034835332196e-05, |
| "loss": 0.5823, |
| "step": 5180 |
| }, |
| { |
| "epoch": 1.2478941034897715, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.282446062103678e-05, |
| "loss": 0.5633, |
| "step": 5185 |
| }, |
| { |
| "epoch": 1.2490974729241877, |
| "grad_norm": 1.640625, |
| "learning_rate": 7.277855687300481e-05, |
| "loss": 0.5696, |
| "step": 5190 |
| }, |
| { |
| "epoch": 1.2503008423586042, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.273263718253305e-05, |
| "loss": 0.5769, |
| "step": 5195 |
| }, |
| { |
| "epoch": 1.2515042117930204, |
| "grad_norm": 1.7265625, |
| "learning_rate": 7.268670162295402e-05, |
| "loss": 0.5457, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.2527075812274369, |
| "grad_norm": 1.71875, |
| "learning_rate": 7.26407502676255e-05, |
| "loss": 0.5766, |
| "step": 5205 |
| }, |
| { |
| "epoch": 1.253910950661853, |
| "grad_norm": 1.578125, |
| "learning_rate": 7.259478318993058e-05, |
| "loss": 0.5354, |
| "step": 5210 |
| }, |
| { |
| "epoch": 1.2551143200962696, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.25488004632774e-05, |
| "loss": 0.5561, |
| "step": 5215 |
| }, |
| { |
| "epoch": 1.256317689530686, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.250280216109913e-05, |
| "loss": 0.5684, |
| "step": 5220 |
| }, |
| { |
| "epoch": 1.2575210589651022, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.245678835685378e-05, |
| "loss": 0.5453, |
| "step": 5225 |
| }, |
| { |
| "epoch": 1.2587244283995187, |
| "grad_norm": 1.6171875, |
| "learning_rate": 7.24107591240241e-05, |
| "loss": 0.5364, |
| "step": 5230 |
| }, |
| { |
| "epoch": 1.259927797833935, |
| "grad_norm": 1.609375, |
| "learning_rate": 7.236471453611757e-05, |
| "loss": 0.5611, |
| "step": 5235 |
| }, |
| { |
| "epoch": 1.2611311672683514, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.231865466666609e-05, |
| "loss": 0.5558, |
| "step": 5240 |
| }, |
| { |
| "epoch": 1.2623345367027676, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.227257958922601e-05, |
| "loss": 0.6038, |
| "step": 5245 |
| }, |
| { |
| "epoch": 1.263537906137184, |
| "grad_norm": 1.546875, |
| "learning_rate": 7.222648937737797e-05, |
| "loss": 0.5456, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.2647412755716005, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.218038410472676e-05, |
| "loss": 0.553, |
| "step": 5255 |
| }, |
| { |
| "epoch": 1.2659446450060168, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.213426384490126e-05, |
| "loss": 0.556, |
| "step": 5260 |
| }, |
| { |
| "epoch": 1.2671480144404332, |
| "grad_norm": 1.5859375, |
| "learning_rate": 7.208812867155422e-05, |
| "loss": 0.5676, |
| "step": 5265 |
| }, |
| { |
| "epoch": 1.2683513838748497, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.204197865836226e-05, |
| "loss": 0.5484, |
| "step": 5270 |
| }, |
| { |
| "epoch": 1.269554753309266, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.199581387902569e-05, |
| "loss": 0.5779, |
| "step": 5275 |
| }, |
| { |
| "epoch": 1.2707581227436824, |
| "grad_norm": 1.6953125, |
| "learning_rate": 7.194963440726839e-05, |
| "loss": 0.5296, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.2719614921780986, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.19034403168377e-05, |
| "loss": 0.5792, |
| "step": 5285 |
| }, |
| { |
| "epoch": 1.273164861612515, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.185723168150432e-05, |
| "loss": 0.5887, |
| "step": 5290 |
| }, |
| { |
| "epoch": 1.2743682310469313, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.181100857506215e-05, |
| "loss": 0.556, |
| "step": 5295 |
| }, |
| { |
| "epoch": 1.2755716004813478, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.176477107132825e-05, |
| "loss": 0.5639, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.2767749699157642, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.171851924414262e-05, |
| "loss": 0.551, |
| "step": 5305 |
| }, |
| { |
| "epoch": 1.2779783393501805, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.167225316736816e-05, |
| "loss": 0.5926, |
| "step": 5310 |
| }, |
| { |
| "epoch": 1.279181708784597, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.162597291489054e-05, |
| "loss": 0.5579, |
| "step": 5315 |
| }, |
| { |
| "epoch": 1.2803850782190134, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.157967856061803e-05, |
| "loss": 0.5578, |
| "step": 5320 |
| }, |
| { |
| "epoch": 1.2815884476534296, |
| "grad_norm": 1.578125, |
| "learning_rate": 7.153337017848145e-05, |
| "loss": 0.5501, |
| "step": 5325 |
| }, |
| { |
| "epoch": 1.2827918170878458, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.148704784243401e-05, |
| "loss": 0.5606, |
| "step": 5330 |
| }, |
| { |
| "epoch": 1.2839951865222623, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.14407116264512e-05, |
| "loss": 0.5626, |
| "step": 5335 |
| }, |
| { |
| "epoch": 1.2851985559566788, |
| "grad_norm": 1.6015625, |
| "learning_rate": 7.13943616045307e-05, |
| "loss": 0.5929, |
| "step": 5340 |
| }, |
| { |
| "epoch": 1.286401925391095, |
| "grad_norm": 1.953125, |
| "learning_rate": 7.134799785069222e-05, |
| "loss": 0.5574, |
| "step": 5345 |
| }, |
| { |
| "epoch": 1.2876052948255114, |
| "grad_norm": 1.9609375, |
| "learning_rate": 7.130162043897738e-05, |
| "loss": 0.5547, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.288808664259928, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.125522944344964e-05, |
| "loss": 0.5545, |
| "step": 5355 |
| }, |
| { |
| "epoch": 1.2900120336943441, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.120882493819416e-05, |
| "loss": 0.5794, |
| "step": 5360 |
| }, |
| { |
| "epoch": 1.2912154031287606, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.116240699731764e-05, |
| "loss": 0.5531, |
| "step": 5365 |
| }, |
| { |
| "epoch": 1.2924187725631768, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.111597569494826e-05, |
| "loss": 0.5526, |
| "step": 5370 |
| }, |
| { |
| "epoch": 1.2936221419975933, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.106953110523552e-05, |
| "loss": 0.5128, |
| "step": 5375 |
| }, |
| { |
| "epoch": 1.2948255114320095, |
| "grad_norm": 1.6171875, |
| "learning_rate": 7.102307330235018e-05, |
| "loss": 0.5917, |
| "step": 5380 |
| }, |
| { |
| "epoch": 1.296028880866426, |
| "grad_norm": 1.703125, |
| "learning_rate": 7.097660236048408e-05, |
| "loss": 0.5499, |
| "step": 5385 |
| }, |
| { |
| "epoch": 1.2972322503008424, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.093011835385e-05, |
| "loss": 0.5861, |
| "step": 5390 |
| }, |
| { |
| "epoch": 1.2984356197352587, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.088362135668165e-05, |
| "loss": 0.5668, |
| "step": 5395 |
| }, |
| { |
| "epoch": 1.2996389891696751, |
| "grad_norm": 1.7578125, |
| "learning_rate": 7.083711144323343e-05, |
| "loss": 0.5407, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.3008423586040916, |
| "grad_norm": 1.5703125, |
| "learning_rate": 7.079058868778041e-05, |
| "loss": 0.5213, |
| "step": 5405 |
| }, |
| { |
| "epoch": 1.3020457280385078, |
| "grad_norm": 2.0, |
| "learning_rate": 7.074405316461816e-05, |
| "loss": 0.5457, |
| "step": 5410 |
| }, |
| { |
| "epoch": 1.303249097472924, |
| "grad_norm": 1.75, |
| "learning_rate": 7.06975049480626e-05, |
| "loss": 0.544, |
| "step": 5415 |
| }, |
| { |
| "epoch": 1.3044524669073405, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.065094411245e-05, |
| "loss": 0.581, |
| "step": 5420 |
| }, |
| { |
| "epoch": 1.305655836341757, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.060437073213668e-05, |
| "loss": 0.5678, |
| "step": 5425 |
| }, |
| { |
| "epoch": 1.3068592057761732, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.05577848814991e-05, |
| "loss": 0.5917, |
| "step": 5430 |
| }, |
| { |
| "epoch": 1.3080625752105897, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.051118663493353e-05, |
| "loss": 0.5271, |
| "step": 5435 |
| }, |
| { |
| "epoch": 1.3092659446450061, |
| "grad_norm": 1.71875, |
| "learning_rate": 7.046457606685615e-05, |
| "loss": 0.5609, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.3104693140794224, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.041795325170268e-05, |
| "loss": 0.5716, |
| "step": 5445 |
| }, |
| { |
| "epoch": 1.3116726835138388, |
| "grad_norm": 1.671875, |
| "learning_rate": 7.037131826392854e-05, |
| "loss": 0.5357, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.312876052948255, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.032467117800849e-05, |
| "loss": 0.5427, |
| "step": 5455 |
| }, |
| { |
| "epoch": 1.3140794223826715, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.027801206843664e-05, |
| "loss": 0.5346, |
| "step": 5460 |
| }, |
| { |
| "epoch": 1.3152827918170877, |
| "grad_norm": 1.578125, |
| "learning_rate": 7.023134100972636e-05, |
| "loss": 0.5622, |
| "step": 5465 |
| }, |
| { |
| "epoch": 1.3164861612515042, |
| "grad_norm": 1.6171875, |
| "learning_rate": 7.018465807640995e-05, |
| "loss": 0.5627, |
| "step": 5470 |
| }, |
| { |
| "epoch": 1.3176895306859207, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.013796334303887e-05, |
| "loss": 0.5483, |
| "step": 5475 |
| }, |
| { |
| "epoch": 1.3188929001203369, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.009125688418325e-05, |
| "loss": 0.5764, |
| "step": 5480 |
| }, |
| { |
| "epoch": 1.3200962695547533, |
| "grad_norm": 1.671875, |
| "learning_rate": 7.004453877443206e-05, |
| "loss": 0.5626, |
| "step": 5485 |
| }, |
| { |
| "epoch": 1.3212996389891698, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.999780908839284e-05, |
| "loss": 0.5367, |
| "step": 5490 |
| }, |
| { |
| "epoch": 1.322503008423586, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.995106790069157e-05, |
| "loss": 0.5612, |
| "step": 5495 |
| }, |
| { |
| "epoch": 1.3237063778580023, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.99043152859727e-05, |
| "loss": 0.5511, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.3237063778580023, |
| "eval_loss": 0.48222118616104126, |
| "eval_runtime": 2.6547, |
| "eval_samples_per_second": 75.338, |
| "eval_steps_per_second": 75.338, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.3249097472924187, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.985755131889885e-05, |
| "loss": 0.56, |
| "step": 5505 |
| }, |
| { |
| "epoch": 1.3261131167268352, |
| "grad_norm": 1.5703125, |
| "learning_rate": 6.981077607415076e-05, |
| "loss": 0.5547, |
| "step": 5510 |
| }, |
| { |
| "epoch": 1.3273164861612514, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.976398962642725e-05, |
| "loss": 0.5537, |
| "step": 5515 |
| }, |
| { |
| "epoch": 1.3285198555956679, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.971719205044497e-05, |
| "loss": 0.5344, |
| "step": 5520 |
| }, |
| { |
| "epoch": 1.3297232250300843, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.967038342093837e-05, |
| "loss": 0.5497, |
| "step": 5525 |
| }, |
| { |
| "epoch": 1.3309265944645006, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.962356381265953e-05, |
| "loss": 0.5408, |
| "step": 5530 |
| }, |
| { |
| "epoch": 1.332129963898917, |
| "grad_norm": 1.7421875, |
| "learning_rate": 6.957673330037809e-05, |
| "loss": 0.5532, |
| "step": 5535 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 1.859375, |
| "learning_rate": 6.952989195888108e-05, |
| "loss": 0.5074, |
| "step": 5540 |
| }, |
| { |
| "epoch": 1.3345367027677497, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.948303986297285e-05, |
| "loss": 0.5585, |
| "step": 5545 |
| }, |
| { |
| "epoch": 1.335740072202166, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.943617708747491e-05, |
| "loss": 0.5719, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.3369434416365824, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.938930370722581e-05, |
| "loss": 0.5581, |
| "step": 5555 |
| }, |
| { |
| "epoch": 1.3381468110709989, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.934241979708105e-05, |
| "loss": 0.5387, |
| "step": 5560 |
| }, |
| { |
| "epoch": 1.339350180505415, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.929552543191295e-05, |
| "loss": 0.5287, |
| "step": 5565 |
| }, |
| { |
| "epoch": 1.3405535499398316, |
| "grad_norm": 1.6171875, |
| "learning_rate": 6.924862068661052e-05, |
| "loss": 0.5346, |
| "step": 5570 |
| }, |
| { |
| "epoch": 1.341756919374248, |
| "grad_norm": 1.578125, |
| "learning_rate": 6.920170563607937e-05, |
| "loss": 0.5506, |
| "step": 5575 |
| }, |
| { |
| "epoch": 1.3429602888086642, |
| "grad_norm": 1.625, |
| "learning_rate": 6.915478035524151e-05, |
| "loss": 0.5401, |
| "step": 5580 |
| }, |
| { |
| "epoch": 1.3441636582430807, |
| "grad_norm": 1.765625, |
| "learning_rate": 6.910784491903533e-05, |
| "loss": 0.5626, |
| "step": 5585 |
| }, |
| { |
| "epoch": 1.345367027677497, |
| "grad_norm": 1.75, |
| "learning_rate": 6.906089940241545e-05, |
| "loss": 0.5523, |
| "step": 5590 |
| }, |
| { |
| "epoch": 1.3465703971119134, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.901394388035256e-05, |
| "loss": 0.5049, |
| "step": 5595 |
| }, |
| { |
| "epoch": 1.3477737665463296, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.896697842783335e-05, |
| "loss": 0.5747, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.348977135980746, |
| "grad_norm": 1.8515625, |
| "learning_rate": 6.892000311986032e-05, |
| "loss": 0.5706, |
| "step": 5605 |
| }, |
| { |
| "epoch": 1.3501805054151625, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.88730180314518e-05, |
| "loss": 0.5127, |
| "step": 5610 |
| }, |
| { |
| "epoch": 1.3513838748495788, |
| "grad_norm": 1.7421875, |
| "learning_rate": 6.882602323764164e-05, |
| "loss": 0.5712, |
| "step": 5615 |
| }, |
| { |
| "epoch": 1.3525872442839952, |
| "grad_norm": 1.8359375, |
| "learning_rate": 6.877901881347923e-05, |
| "loss": 0.5524, |
| "step": 5620 |
| }, |
| { |
| "epoch": 1.3537906137184115, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.873200483402938e-05, |
| "loss": 0.5465, |
| "step": 5625 |
| }, |
| { |
| "epoch": 1.354993983152828, |
| "grad_norm": 1.7578125, |
| "learning_rate": 6.868498137437209e-05, |
| "loss": 0.5542, |
| "step": 5630 |
| }, |
| { |
| "epoch": 1.3561973525872442, |
| "grad_norm": 1.9140625, |
| "learning_rate": 6.863794850960255e-05, |
| "loss": 0.5428, |
| "step": 5635 |
| }, |
| { |
| "epoch": 1.3574007220216606, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.859090631483094e-05, |
| "loss": 0.5673, |
| "step": 5640 |
| }, |
| { |
| "epoch": 1.358604091456077, |
| "grad_norm": 1.5859375, |
| "learning_rate": 6.854385486518237e-05, |
| "loss": 0.5339, |
| "step": 5645 |
| }, |
| { |
| "epoch": 1.3598074608904933, |
| "grad_norm": 1.6171875, |
| "learning_rate": 6.849679423579671e-05, |
| "loss": 0.5295, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.3610108303249098, |
| "grad_norm": 1.6953125, |
| "learning_rate": 6.844972450182847e-05, |
| "loss": 0.5426, |
| "step": 5655 |
| }, |
| { |
| "epoch": 1.3622141997593262, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.840264573844672e-05, |
| "loss": 0.5092, |
| "step": 5660 |
| }, |
| { |
| "epoch": 1.3634175691937425, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.835555802083496e-05, |
| "loss": 0.5254, |
| "step": 5665 |
| }, |
| { |
| "epoch": 1.364620938628159, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.830846142419101e-05, |
| "loss": 0.5454, |
| "step": 5670 |
| }, |
| { |
| "epoch": 1.3658243080625752, |
| "grad_norm": 1.5859375, |
| "learning_rate": 6.82613560237268e-05, |
| "loss": 0.5416, |
| "step": 5675 |
| }, |
| { |
| "epoch": 1.3670276774969916, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.821424189466835e-05, |
| "loss": 0.5677, |
| "step": 5680 |
| }, |
| { |
| "epoch": 1.3682310469314078, |
| "grad_norm": 1.8828125, |
| "learning_rate": 6.816711911225568e-05, |
| "loss": 0.5598, |
| "step": 5685 |
| }, |
| { |
| "epoch": 1.3694344163658243, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.811998775174254e-05, |
| "loss": 0.5408, |
| "step": 5690 |
| }, |
| { |
| "epoch": 1.3706377858002408, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.807284788839642e-05, |
| "loss": 0.5276, |
| "step": 5695 |
| }, |
| { |
| "epoch": 1.371841155234657, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.80256995974984e-05, |
| "loss": 0.5043, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.3730445246690735, |
| "grad_norm": 1.90625, |
| "learning_rate": 6.797854295434299e-05, |
| "loss": 0.5536, |
| "step": 5705 |
| }, |
| { |
| "epoch": 1.37424789410349, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.793137803423807e-05, |
| "loss": 0.5534, |
| "step": 5710 |
| }, |
| { |
| "epoch": 1.3754512635379061, |
| "grad_norm": 1.625, |
| "learning_rate": 6.788420491250469e-05, |
| "loss": 0.5142, |
| "step": 5715 |
| }, |
| { |
| "epoch": 1.3766546329723224, |
| "grad_norm": 1.6875, |
| "learning_rate": 6.783702366447707e-05, |
| "loss": 0.5742, |
| "step": 5720 |
| }, |
| { |
| "epoch": 1.3778580024067388, |
| "grad_norm": 1.5703125, |
| "learning_rate": 6.77898343655023e-05, |
| "loss": 0.5231, |
| "step": 5725 |
| }, |
| { |
| "epoch": 1.3790613718411553, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.774263709094048e-05, |
| "loss": 0.5276, |
| "step": 5730 |
| }, |
| { |
| "epoch": 1.3802647412755715, |
| "grad_norm": 1.8359375, |
| "learning_rate": 6.76954319161643e-05, |
| "loss": 0.5268, |
| "step": 5735 |
| }, |
| { |
| "epoch": 1.381468110709988, |
| "grad_norm": 1.5546875, |
| "learning_rate": 6.764821891655914e-05, |
| "loss": 0.5439, |
| "step": 5740 |
| }, |
| { |
| "epoch": 1.3826714801444044, |
| "grad_norm": 1.8125, |
| "learning_rate": 6.760099816752288e-05, |
| "loss": 0.5383, |
| "step": 5745 |
| }, |
| { |
| "epoch": 1.3838748495788207, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.755376974446573e-05, |
| "loss": 0.5373, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.3850782190132371, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.750653372281018e-05, |
| "loss": 0.5448, |
| "step": 5755 |
| }, |
| { |
| "epoch": 1.3862815884476534, |
| "grad_norm": 1.6640625, |
| "learning_rate": 6.745929017799091e-05, |
| "loss": 0.5449, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.3874849578820698, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.74120391854545e-05, |
| "loss": 0.5297, |
| "step": 5765 |
| }, |
| { |
| "epoch": 1.388688327316486, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.73647808206595e-05, |
| "loss": 0.5672, |
| "step": 5770 |
| }, |
| { |
| "epoch": 1.3898916967509025, |
| "grad_norm": 1.6875, |
| "learning_rate": 6.731751515907624e-05, |
| "loss": 0.5419, |
| "step": 5775 |
| }, |
| { |
| "epoch": 1.391095066185319, |
| "grad_norm": 1.6875, |
| "learning_rate": 6.727024227618667e-05, |
| "loss": 0.5312, |
| "step": 5780 |
| }, |
| { |
| "epoch": 1.3922984356197352, |
| "grad_norm": 1.8125, |
| "learning_rate": 6.72229622474843e-05, |
| "loss": 0.5624, |
| "step": 5785 |
| }, |
| { |
| "epoch": 1.3935018050541517, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.717567514847401e-05, |
| "loss": 0.5465, |
| "step": 5790 |
| }, |
| { |
| "epoch": 1.3947051744885681, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.712838105467204e-05, |
| "loss": 0.5549, |
| "step": 5795 |
| }, |
| { |
| "epoch": 1.3959085439229844, |
| "grad_norm": 1.5859375, |
| "learning_rate": 6.70810800416057e-05, |
| "loss": 0.537, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.3971119133574006, |
| "grad_norm": 1.6953125, |
| "learning_rate": 6.703377218481343e-05, |
| "loss": 0.529, |
| "step": 5805 |
| }, |
| { |
| "epoch": 1.398315282791817, |
| "grad_norm": 1.59375, |
| "learning_rate": 6.698645755984457e-05, |
| "loss": 0.5774, |
| "step": 5810 |
| }, |
| { |
| "epoch": 1.3995186522262335, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.693913624225931e-05, |
| "loss": 0.5552, |
| "step": 5815 |
| }, |
| { |
| "epoch": 1.4007220216606497, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.689180830762847e-05, |
| "loss": 0.5401, |
| "step": 5820 |
| }, |
| { |
| "epoch": 1.4019253910950662, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.684447383153344e-05, |
| "loss": 0.5649, |
| "step": 5825 |
| }, |
| { |
| "epoch": 1.4031287605294827, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.679713288956611e-05, |
| "loss": 0.536, |
| "step": 5830 |
| }, |
| { |
| "epoch": 1.404332129963899, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.674978555732864e-05, |
| "loss": 0.5612, |
| "step": 5835 |
| }, |
| { |
| "epoch": 1.4055354993983153, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.670243191043344e-05, |
| "loss": 0.5487, |
| "step": 5840 |
| }, |
| { |
| "epoch": 1.4067388688327316, |
| "grad_norm": 1.5703125, |
| "learning_rate": 6.665507202450296e-05, |
| "loss": 0.5507, |
| "step": 5845 |
| }, |
| { |
| "epoch": 1.407942238267148, |
| "grad_norm": 1.53125, |
| "learning_rate": 6.660770597516963e-05, |
| "loss": 0.547, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.4091456077015643, |
| "grad_norm": 1.8046875, |
| "learning_rate": 6.656033383807577e-05, |
| "loss": 0.557, |
| "step": 5855 |
| }, |
| { |
| "epoch": 1.4103489771359807, |
| "grad_norm": 1.5625, |
| "learning_rate": 6.651295568887334e-05, |
| "loss": 0.5393, |
| "step": 5860 |
| }, |
| { |
| "epoch": 1.4115523465703972, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.646557160322395e-05, |
| "loss": 0.5572, |
| "step": 5865 |
| }, |
| { |
| "epoch": 1.4127557160048134, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.641818165679871e-05, |
| "loss": 0.5423, |
| "step": 5870 |
| }, |
| { |
| "epoch": 1.4139590854392299, |
| "grad_norm": 1.8671875, |
| "learning_rate": 6.637078592527801e-05, |
| "loss": 0.561, |
| "step": 5875 |
| }, |
| { |
| "epoch": 1.4151624548736463, |
| "grad_norm": 1.8515625, |
| "learning_rate": 6.632338448435156e-05, |
| "loss": 0.5859, |
| "step": 5880 |
| }, |
| { |
| "epoch": 1.4163658243080626, |
| "grad_norm": 1.7578125, |
| "learning_rate": 6.627597740971817e-05, |
| "loss": 0.5415, |
| "step": 5885 |
| }, |
| { |
| "epoch": 1.4175691937424788, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.622856477708562e-05, |
| "loss": 0.5513, |
| "step": 5890 |
| }, |
| { |
| "epoch": 1.4187725631768953, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.618114666217058e-05, |
| "loss": 0.5319, |
| "step": 5895 |
| }, |
| { |
| "epoch": 1.4199759326113117, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.613372314069847e-05, |
| "loss": 0.5853, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.421179302045728, |
| "grad_norm": 1.5078125, |
| "learning_rate": 6.608629428840336e-05, |
| "loss": 0.5123, |
| "step": 5905 |
| }, |
| { |
| "epoch": 1.4223826714801444, |
| "grad_norm": 1.7890625, |
| "learning_rate": 6.603886018102781e-05, |
| "loss": 0.5672, |
| "step": 5910 |
| }, |
| { |
| "epoch": 1.4235860409145609, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.599142089432282e-05, |
| "loss": 0.5437, |
| "step": 5915 |
| }, |
| { |
| "epoch": 1.424789410348977, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.594397650404759e-05, |
| "loss": 0.5595, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.4259927797833936, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.589652708596952e-05, |
| "loss": 0.5239, |
| "step": 5925 |
| }, |
| { |
| "epoch": 1.4271961492178098, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.584907271586402e-05, |
| "loss": 0.5637, |
| "step": 5930 |
| }, |
| { |
| "epoch": 1.4283995186522263, |
| "grad_norm": 1.8828125, |
| "learning_rate": 6.580161346951442e-05, |
| "loss": 0.5361, |
| "step": 5935 |
| }, |
| { |
| "epoch": 1.4296028880866425, |
| "grad_norm": 1.7890625, |
| "learning_rate": 6.575414942271184e-05, |
| "loss": 0.5286, |
| "step": 5940 |
| }, |
| { |
| "epoch": 1.430806257521059, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.570668065125504e-05, |
| "loss": 0.5967, |
| "step": 5945 |
| }, |
| { |
| "epoch": 1.4320096269554754, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.565920723095035e-05, |
| "loss": 0.536, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.4332129963898916, |
| "grad_norm": 1.796875, |
| "learning_rate": 6.561172923761153e-05, |
| "loss": 0.5393, |
| "step": 5955 |
| }, |
| { |
| "epoch": 1.434416365824308, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.556424674705962e-05, |
| "loss": 0.5651, |
| "step": 5960 |
| }, |
| { |
| "epoch": 1.4356197352587245, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.551675983512283e-05, |
| "loss": 0.5558, |
| "step": 5965 |
| }, |
| { |
| "epoch": 1.4368231046931408, |
| "grad_norm": 1.7890625, |
| "learning_rate": 6.546926857763648e-05, |
| "loss": 0.5897, |
| "step": 5970 |
| }, |
| { |
| "epoch": 1.4380264741275572, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.54217730504428e-05, |
| "loss": 0.5656, |
| "step": 5975 |
| }, |
| { |
| "epoch": 1.4392298435619735, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.537427332939083e-05, |
| "loss": 0.515, |
| "step": 5980 |
| }, |
| { |
| "epoch": 1.44043321299639, |
| "grad_norm": 1.6875, |
| "learning_rate": 6.532676949033634e-05, |
| "loss": 0.5269, |
| "step": 5985 |
| }, |
| { |
| "epoch": 1.4416365824308062, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.527926160914163e-05, |
| "loss": 0.5277, |
| "step": 5990 |
| }, |
| { |
| "epoch": 1.4428399518652226, |
| "grad_norm": 1.5859375, |
| "learning_rate": 6.523174976167547e-05, |
| "loss": 0.5545, |
| "step": 5995 |
| }, |
| { |
| "epoch": 1.444043321299639, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.518423402381303e-05, |
| "loss": 0.5319, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.444043321299639, |
| "eval_loss": 0.46789681911468506, |
| "eval_runtime": 2.6551, |
| "eval_samples_per_second": 75.327, |
| "eval_steps_per_second": 75.327, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.4452466907340553, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.513671447143558e-05, |
| "loss": 0.5483, |
| "step": 6005 |
| }, |
| { |
| "epoch": 1.4464500601684718, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.508919118043056e-05, |
| "loss": 0.5712, |
| "step": 6010 |
| }, |
| { |
| "epoch": 1.447653429602888, |
| "grad_norm": 1.7890625, |
| "learning_rate": 6.504166422669137e-05, |
| "loss": 0.5172, |
| "step": 6015 |
| }, |
| { |
| "epoch": 1.4488567990373045, |
| "grad_norm": 1.625, |
| "learning_rate": 6.499413368611722e-05, |
| "loss": 0.5448, |
| "step": 6020 |
| }, |
| { |
| "epoch": 1.4500601684717207, |
| "grad_norm": 1.765625, |
| "learning_rate": 6.49465996346131e-05, |
| "loss": 0.5728, |
| "step": 6025 |
| }, |
| { |
| "epoch": 1.4512635379061372, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.489906214808959e-05, |
| "loss": 0.5545, |
| "step": 6030 |
| }, |
| { |
| "epoch": 1.4524669073405536, |
| "grad_norm": 1.6015625, |
| "learning_rate": 6.485152130246271e-05, |
| "loss": 0.5488, |
| "step": 6035 |
| }, |
| { |
| "epoch": 1.4536702767749698, |
| "grad_norm": 1.6015625, |
| "learning_rate": 6.480397717365393e-05, |
| "loss": 0.5566, |
| "step": 6040 |
| }, |
| { |
| "epoch": 1.4548736462093863, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.47564298375899e-05, |
| "loss": 0.563, |
| "step": 6045 |
| }, |
| { |
| "epoch": 1.4560770156438028, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.470887937020238e-05, |
| "loss": 0.5657, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.457280385078219, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.466132584742819e-05, |
| "loss": 0.5372, |
| "step": 6055 |
| }, |
| { |
| "epoch": 1.4584837545126355, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.4613769345209e-05, |
| "loss": 0.5713, |
| "step": 6060 |
| }, |
| { |
| "epoch": 1.4596871239470517, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.456620993949122e-05, |
| "loss": 0.5335, |
| "step": 6065 |
| }, |
| { |
| "epoch": 1.4608904933814681, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.451864770622593e-05, |
| "loss": 0.5202, |
| "step": 6070 |
| }, |
| { |
| "epoch": 1.4620938628158844, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.44710827213687e-05, |
| "loss": 0.5149, |
| "step": 6075 |
| }, |
| { |
| "epoch": 1.4632972322503008, |
| "grad_norm": 1.5859375, |
| "learning_rate": 6.442351506087949e-05, |
| "loss": 0.5188, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.4645006016847173, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.437594480072255e-05, |
| "loss": 0.5472, |
| "step": 6085 |
| }, |
| { |
| "epoch": 1.4657039711191335, |
| "grad_norm": 1.6875, |
| "learning_rate": 6.432837201686631e-05, |
| "loss": 0.5382, |
| "step": 6090 |
| }, |
| { |
| "epoch": 1.46690734055355, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.428079678528317e-05, |
| "loss": 0.5424, |
| "step": 6095 |
| }, |
| { |
| "epoch": 1.4681107099879662, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.423321918194947e-05, |
| "loss": 0.5614, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.4693140794223827, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.418563928284533e-05, |
| "loss": 0.5205, |
| "step": 6105 |
| }, |
| { |
| "epoch": 1.470517448856799, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.413805716395453e-05, |
| "loss": 0.5253, |
| "step": 6110 |
| }, |
| { |
| "epoch": 1.4717208182912154, |
| "grad_norm": 1.5546875, |
| "learning_rate": 6.409047290126444e-05, |
| "loss": 0.5575, |
| "step": 6115 |
| }, |
| { |
| "epoch": 1.4729241877256318, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.404288657076581e-05, |
| "loss": 0.5642, |
| "step": 6120 |
| }, |
| { |
| "epoch": 1.474127557160048, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.399529824845267e-05, |
| "loss": 0.5647, |
| "step": 6125 |
| }, |
| { |
| "epoch": 1.4753309265944645, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.39477080103223e-05, |
| "loss": 0.5488, |
| "step": 6130 |
| }, |
| { |
| "epoch": 1.476534296028881, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.390011593237498e-05, |
| "loss": 0.5288, |
| "step": 6135 |
| }, |
| { |
| "epoch": 1.4777376654632972, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.385252209061396e-05, |
| "loss": 0.5336, |
| "step": 6140 |
| }, |
| { |
| "epoch": 1.4789410348977137, |
| "grad_norm": 1.578125, |
| "learning_rate": 6.380492656104528e-05, |
| "loss": 0.5341, |
| "step": 6145 |
| }, |
| { |
| "epoch": 1.48014440433213, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.375732941967771e-05, |
| "loss": 0.5528, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.4813477737665464, |
| "grad_norm": 1.8125, |
| "learning_rate": 6.370973074252259e-05, |
| "loss": 0.5265, |
| "step": 6155 |
| }, |
| { |
| "epoch": 1.4825511432009626, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.366213060559366e-05, |
| "loss": 0.5381, |
| "step": 6160 |
| }, |
| { |
| "epoch": 1.483754512635379, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.361452908490706e-05, |
| "loss": 0.5573, |
| "step": 6165 |
| }, |
| { |
| "epoch": 1.4849578820697955, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.356692625648108e-05, |
| "loss": 0.5663, |
| "step": 6170 |
| }, |
| { |
| "epoch": 1.4861612515042117, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.351932219633617e-05, |
| "loss": 0.5554, |
| "step": 6175 |
| }, |
| { |
| "epoch": 1.4873646209386282, |
| "grad_norm": 1.5546875, |
| "learning_rate": 6.347171698049466e-05, |
| "loss": 0.54, |
| "step": 6180 |
| }, |
| { |
| "epoch": 1.4885679903730447, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.34241106849808e-05, |
| "loss": 0.5344, |
| "step": 6185 |
| }, |
| { |
| "epoch": 1.489771359807461, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.337650338582055e-05, |
| "loss": 0.552, |
| "step": 6190 |
| }, |
| { |
| "epoch": 1.4909747292418771, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.33288951590414e-05, |
| "loss": 0.5318, |
| "step": 6195 |
| }, |
| { |
| "epoch": 1.4921780986762936, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.328128608067245e-05, |
| "loss": 0.5585, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.49338146811071, |
| "grad_norm": 1.8125, |
| "learning_rate": 6.323367622674404e-05, |
| "loss": 0.5456, |
| "step": 6205 |
| }, |
| { |
| "epoch": 1.4945848375451263, |
| "grad_norm": 1.546875, |
| "learning_rate": 6.318606567328783e-05, |
| "loss": 0.5315, |
| "step": 6210 |
| }, |
| { |
| "epoch": 1.4957882069795427, |
| "grad_norm": 2.28125, |
| "learning_rate": 6.313845449633651e-05, |
| "loss": 0.5548, |
| "step": 6215 |
| }, |
| { |
| "epoch": 1.4969915764139592, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.309084277192387e-05, |
| "loss": 0.5095, |
| "step": 6220 |
| }, |
| { |
| "epoch": 1.4981949458483754, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.304323057608452e-05, |
| "loss": 0.5339, |
| "step": 6225 |
| }, |
| { |
| "epoch": 1.4993983152827919, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.299561798485382e-05, |
| "loss": 0.5374, |
| "step": 6230 |
| }, |
| { |
| "epoch": 1.5006016847172083, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.294800507426777e-05, |
| "loss": 0.5234, |
| "step": 6235 |
| }, |
| { |
| "epoch": 1.5018050541516246, |
| "grad_norm": 1.75, |
| "learning_rate": 6.290039192036287e-05, |
| "loss": 0.5835, |
| "step": 6240 |
| }, |
| { |
| "epoch": 1.5030084235860408, |
| "grad_norm": 1.6640625, |
| "learning_rate": 6.285277859917603e-05, |
| "loss": 0.5291, |
| "step": 6245 |
| }, |
| { |
| "epoch": 1.5042117930204573, |
| "grad_norm": 1.59375, |
| "learning_rate": 6.280516518674442e-05, |
| "loss": 0.5387, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.5054151624548737, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.275755175910535e-05, |
| "loss": 0.5424, |
| "step": 6255 |
| }, |
| { |
| "epoch": 1.50661853188929, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.27099383922961e-05, |
| "loss": 0.5523, |
| "step": 6260 |
| }, |
| { |
| "epoch": 1.5078219013237064, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.266232516235398e-05, |
| "loss": 0.5244, |
| "step": 6265 |
| }, |
| { |
| "epoch": 1.5090252707581229, |
| "grad_norm": 1.6015625, |
| "learning_rate": 6.2614712145316e-05, |
| "loss": 0.5452, |
| "step": 6270 |
| }, |
| { |
| "epoch": 1.510228640192539, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.25670994172188e-05, |
| "loss": 0.5606, |
| "step": 6275 |
| }, |
| { |
| "epoch": 1.5114320096269553, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.251948705409863e-05, |
| "loss": 0.542, |
| "step": 6280 |
| }, |
| { |
| "epoch": 1.5126353790613718, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.24718751319911e-05, |
| "loss": 0.5373, |
| "step": 6285 |
| }, |
| { |
| "epoch": 1.5138387484957883, |
| "grad_norm": 1.75, |
| "learning_rate": 6.242426372693119e-05, |
| "loss": 0.5523, |
| "step": 6290 |
| }, |
| { |
| "epoch": 1.5150421179302045, |
| "grad_norm": 1.90625, |
| "learning_rate": 6.237665291495294e-05, |
| "loss": 0.5456, |
| "step": 6295 |
| }, |
| { |
| "epoch": 1.516245487364621, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.232904277208953e-05, |
| "loss": 0.5438, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.5174488567990374, |
| "grad_norm": 1.7578125, |
| "learning_rate": 6.228143337437309e-05, |
| "loss": 0.573, |
| "step": 6305 |
| }, |
| { |
| "epoch": 1.5186522262334536, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.223382479783445e-05, |
| "loss": 0.5395, |
| "step": 6310 |
| }, |
| { |
| "epoch": 1.5198555956678699, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.218621711850323e-05, |
| "loss": 0.551, |
| "step": 6315 |
| }, |
| { |
| "epoch": 1.5210589651022866, |
| "grad_norm": 1.796875, |
| "learning_rate": 6.213861041240763e-05, |
| "loss": 0.5512, |
| "step": 6320 |
| }, |
| { |
| "epoch": 1.5222623345367028, |
| "grad_norm": 1.8359375, |
| "learning_rate": 6.209100475557418e-05, |
| "loss": 0.5605, |
| "step": 6325 |
| }, |
| { |
| "epoch": 1.523465703971119, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.204340022402783e-05, |
| "loss": 0.5225, |
| "step": 6330 |
| }, |
| { |
| "epoch": 1.5246690734055355, |
| "grad_norm": 1.6640625, |
| "learning_rate": 6.199579689379172e-05, |
| "loss": 0.5727, |
| "step": 6335 |
| }, |
| { |
| "epoch": 1.525872442839952, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.194819484088706e-05, |
| "loss": 0.5537, |
| "step": 6340 |
| }, |
| { |
| "epoch": 1.5270758122743682, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.1900594141333e-05, |
| "loss": 0.5679, |
| "step": 6345 |
| }, |
| { |
| "epoch": 1.5282791817087846, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.185299487114657e-05, |
| "loss": 0.5396, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.529482551143201, |
| "grad_norm": 1.8671875, |
| "learning_rate": 6.180539710634246e-05, |
| "loss": 0.5241, |
| "step": 6355 |
| }, |
| { |
| "epoch": 1.5306859205776173, |
| "grad_norm": 1.6171875, |
| "learning_rate": 6.175780092293305e-05, |
| "loss": 0.5411, |
| "step": 6360 |
| }, |
| { |
| "epoch": 1.5318892900120336, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.171020639692808e-05, |
| "loss": 0.5657, |
| "step": 6365 |
| }, |
| { |
| "epoch": 1.53309265944645, |
| "grad_norm": 1.625, |
| "learning_rate": 6.166261360433472e-05, |
| "loss": 0.519, |
| "step": 6370 |
| }, |
| { |
| "epoch": 1.5342960288808665, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.161502262115736e-05, |
| "loss": 0.5226, |
| "step": 6375 |
| }, |
| { |
| "epoch": 1.5354993983152827, |
| "grad_norm": 1.625, |
| "learning_rate": 6.15674335233975e-05, |
| "loss": 0.5691, |
| "step": 6380 |
| }, |
| { |
| "epoch": 1.5367027677496992, |
| "grad_norm": 1.6640625, |
| "learning_rate": 6.151984638705359e-05, |
| "loss": 0.5337, |
| "step": 6385 |
| }, |
| { |
| "epoch": 1.5379061371841156, |
| "grad_norm": 1.625, |
| "learning_rate": 6.147226128812099e-05, |
| "loss": 0.522, |
| "step": 6390 |
| }, |
| { |
| "epoch": 1.5391095066185319, |
| "grad_norm": 1.6015625, |
| "learning_rate": 6.142467830259183e-05, |
| "loss": 0.549, |
| "step": 6395 |
| }, |
| { |
| "epoch": 1.5403128760529483, |
| "grad_norm": 1.6875, |
| "learning_rate": 6.137709750645475e-05, |
| "loss": 0.5509, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.5415162454873648, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.132951897569506e-05, |
| "loss": 0.524, |
| "step": 6405 |
| }, |
| { |
| "epoch": 1.542719614921781, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.128194278629432e-05, |
| "loss": 0.5388, |
| "step": 6410 |
| }, |
| { |
| "epoch": 1.5439229843561972, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.12343690142304e-05, |
| "loss": 0.531, |
| "step": 6415 |
| }, |
| { |
| "epoch": 1.5451263537906137, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.11867977354773e-05, |
| "loss": 0.5653, |
| "step": 6420 |
| }, |
| { |
| "epoch": 1.5463297232250302, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.113922902600507e-05, |
| "loss": 0.57, |
| "step": 6425 |
| }, |
| { |
| "epoch": 1.5475330926594464, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.10916629617796e-05, |
| "loss": 0.5529, |
| "step": 6430 |
| }, |
| { |
| "epoch": 1.5487364620938628, |
| "grad_norm": 1.546875, |
| "learning_rate": 6.104409961876263e-05, |
| "loss": 0.5196, |
| "step": 6435 |
| }, |
| { |
| "epoch": 1.5499398315282793, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.0996539072911456e-05, |
| "loss": 0.555, |
| "step": 6440 |
| }, |
| { |
| "epoch": 1.5511432009626955, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.0948981400178985e-05, |
| "loss": 0.511, |
| "step": 6445 |
| }, |
| { |
| "epoch": 1.5523465703971118, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.090142667651353e-05, |
| "loss": 0.5294, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.5535499398315282, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.085387497785866e-05, |
| "loss": 0.5526, |
| "step": 6455 |
| }, |
| { |
| "epoch": 1.5547533092659447, |
| "grad_norm": 1.78125, |
| "learning_rate": 6.080632638015313e-05, |
| "loss": 0.5515, |
| "step": 6460 |
| }, |
| { |
| "epoch": 1.555956678700361, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.0758780959330756e-05, |
| "loss": 0.5441, |
| "step": 6465 |
| }, |
| { |
| "epoch": 1.5571600481347774, |
| "grad_norm": 1.7578125, |
| "learning_rate": 6.0711238791320256e-05, |
| "loss": 0.5897, |
| "step": 6470 |
| }, |
| { |
| "epoch": 1.5583634175691938, |
| "grad_norm": 1.75, |
| "learning_rate": 6.066369995204517e-05, |
| "loss": 0.5655, |
| "step": 6475 |
| }, |
| { |
| "epoch": 1.55956678700361, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.061616451742371e-05, |
| "loss": 0.5299, |
| "step": 6480 |
| }, |
| { |
| "epoch": 1.5607701564380265, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.056863256336866e-05, |
| "loss": 0.5418, |
| "step": 6485 |
| }, |
| { |
| "epoch": 1.561973525872443, |
| "grad_norm": 1.75, |
| "learning_rate": 6.0521104165787255e-05, |
| "loss": 0.5359, |
| "step": 6490 |
| }, |
| { |
| "epoch": 1.5631768953068592, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.047357940058102e-05, |
| "loss": 0.5489, |
| "step": 6495 |
| }, |
| { |
| "epoch": 1.5643802647412755, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.042605834364572e-05, |
| "loss": 0.5236, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.5643802647412755, |
| "eval_loss": 0.46134811639785767, |
| "eval_runtime": 2.6539, |
| "eval_samples_per_second": 75.362, |
| "eval_steps_per_second": 75.362, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.565583634175692, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.037854107087116e-05, |
| "loss": 0.5509, |
| "step": 6505 |
| }, |
| { |
| "epoch": 1.5667870036101084, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.033102765814113e-05, |
| "loss": 0.5156, |
| "step": 6510 |
| }, |
| { |
| "epoch": 1.5679903730445246, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.028351818133324e-05, |
| "loss": 0.5783, |
| "step": 6515 |
| }, |
| { |
| "epoch": 1.569193742478941, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.023601271631882e-05, |
| "loss": 0.5482, |
| "step": 6520 |
| }, |
| { |
| "epoch": 1.5703971119133575, |
| "grad_norm": 1.6171875, |
| "learning_rate": 6.018851133896279e-05, |
| "loss": 0.5538, |
| "step": 6525 |
| }, |
| { |
| "epoch": 1.5716004813477737, |
| "grad_norm": 1.75, |
| "learning_rate": 6.014101412512352e-05, |
| "loss": 0.5254, |
| "step": 6530 |
| }, |
| { |
| "epoch": 1.57280385078219, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.009352115065279e-05, |
| "loss": 0.5402, |
| "step": 6535 |
| }, |
| { |
| "epoch": 1.5740072202166067, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.004603249139556e-05, |
| "loss": 0.518, |
| "step": 6540 |
| }, |
| { |
| "epoch": 1.575210589651023, |
| "grad_norm": 1.671875, |
| "learning_rate": 5.999854822318991e-05, |
| "loss": 0.5331, |
| "step": 6545 |
| }, |
| { |
| "epoch": 1.5764139590854391, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.995106842186692e-05, |
| "loss": 0.5737, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.5776173285198556, |
| "grad_norm": 1.515625, |
| "learning_rate": 5.9903593163250524e-05, |
| "loss": 0.5486, |
| "step": 6555 |
| }, |
| { |
| "epoch": 1.578820697954272, |
| "grad_norm": 1.5859375, |
| "learning_rate": 5.985612252315738e-05, |
| "loss": 0.5431, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.5800240673886883, |
| "grad_norm": 1.8125, |
| "learning_rate": 5.980865657739683e-05, |
| "loss": 0.5495, |
| "step": 6565 |
| }, |
| { |
| "epoch": 1.5812274368231047, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.976119540177067e-05, |
| "loss": 0.5227, |
| "step": 6570 |
| }, |
| { |
| "epoch": 1.5824308062575212, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.9713739072073096e-05, |
| "loss": 0.5472, |
| "step": 6575 |
| }, |
| { |
| "epoch": 1.5836341756919374, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.966628766409056e-05, |
| "loss": 0.5873, |
| "step": 6580 |
| }, |
| { |
| "epoch": 1.5848375451263537, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.961884125360166e-05, |
| "loss": 0.5349, |
| "step": 6585 |
| }, |
| { |
| "epoch": 1.5860409145607701, |
| "grad_norm": 1.46875, |
| "learning_rate": 5.957139991637701e-05, |
| "loss": 0.5293, |
| "step": 6590 |
| }, |
| { |
| "epoch": 1.5872442839951866, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.9523963728179093e-05, |
| "loss": 0.5599, |
| "step": 6595 |
| }, |
| { |
| "epoch": 1.5884476534296028, |
| "grad_norm": 1.71875, |
| "learning_rate": 5.947653276476223e-05, |
| "loss": 0.5388, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.5896510228640193, |
| "grad_norm": 1.8203125, |
| "learning_rate": 5.942910710187236e-05, |
| "loss": 0.5362, |
| "step": 6605 |
| }, |
| { |
| "epoch": 1.5908543922984357, |
| "grad_norm": 1.671875, |
| "learning_rate": 5.9381686815246954e-05, |
| "loss": 0.496, |
| "step": 6610 |
| }, |
| { |
| "epoch": 1.592057761732852, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.933427198061491e-05, |
| "loss": 0.5428, |
| "step": 6615 |
| }, |
| { |
| "epoch": 1.5932611311672682, |
| "grad_norm": 1.5390625, |
| "learning_rate": 5.9286862673696416e-05, |
| "loss": 0.5177, |
| "step": 6620 |
| }, |
| { |
| "epoch": 1.5944645006016849, |
| "grad_norm": 1.8359375, |
| "learning_rate": 5.9239458970202826e-05, |
| "loss": 0.5348, |
| "step": 6625 |
| }, |
| { |
| "epoch": 1.595667870036101, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.9192060945836565e-05, |
| "loss": 0.4923, |
| "step": 6630 |
| }, |
| { |
| "epoch": 1.5968712394705173, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.9144668676290955e-05, |
| "loss": 0.5447, |
| "step": 6635 |
| }, |
| { |
| "epoch": 1.5980746089049338, |
| "grad_norm": 1.625, |
| "learning_rate": 5.909728223725017e-05, |
| "loss": 0.5258, |
| "step": 6640 |
| }, |
| { |
| "epoch": 1.5992779783393503, |
| "grad_norm": 1.6015625, |
| "learning_rate": 5.9049901704389036e-05, |
| "loss": 0.5539, |
| "step": 6645 |
| }, |
| { |
| "epoch": 1.6004813477737665, |
| "grad_norm": 1.8359375, |
| "learning_rate": 5.9002527153372964e-05, |
| "loss": 0.5423, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.601684717208183, |
| "grad_norm": 1.71875, |
| "learning_rate": 5.8955158659857795e-05, |
| "loss": 0.5508, |
| "step": 6655 |
| }, |
| { |
| "epoch": 1.6028880866425994, |
| "grad_norm": 1.734375, |
| "learning_rate": 5.8907796299489736e-05, |
| "loss": 0.5579, |
| "step": 6660 |
| }, |
| { |
| "epoch": 1.6040914560770156, |
| "grad_norm": 1.5625, |
| "learning_rate": 5.886044014790516e-05, |
| "loss": 0.5439, |
| "step": 6665 |
| }, |
| { |
| "epoch": 1.6052948255114319, |
| "grad_norm": 1.7734375, |
| "learning_rate": 5.8813090280730544e-05, |
| "loss": 0.5171, |
| "step": 6670 |
| }, |
| { |
| "epoch": 1.6064981949458483, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.876574677358232e-05, |
| "loss": 0.5517, |
| "step": 6675 |
| }, |
| { |
| "epoch": 1.6077015643802648, |
| "grad_norm": 1.8046875, |
| "learning_rate": 5.871840970206677e-05, |
| "loss": 0.5603, |
| "step": 6680 |
| }, |
| { |
| "epoch": 1.608904933814681, |
| "grad_norm": 1.71875, |
| "learning_rate": 5.8671079141779896e-05, |
| "loss": 0.5159, |
| "step": 6685 |
| }, |
| { |
| "epoch": 1.6101083032490975, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.8623755168307306e-05, |
| "loss": 0.5102, |
| "step": 6690 |
| }, |
| { |
| "epoch": 1.611311672683514, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.857643785722407e-05, |
| "loss": 0.5499, |
| "step": 6695 |
| }, |
| { |
| "epoch": 1.6125150421179302, |
| "grad_norm": 1.8125, |
| "learning_rate": 5.8529127284094636e-05, |
| "loss": 0.5489, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.6137184115523464, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.848182352447271e-05, |
| "loss": 0.5285, |
| "step": 6705 |
| }, |
| { |
| "epoch": 1.614921780986763, |
| "grad_norm": 1.7109375, |
| "learning_rate": 5.843452665390107e-05, |
| "loss": 0.5249, |
| "step": 6710 |
| }, |
| { |
| "epoch": 1.6161251504211793, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.838723674791153e-05, |
| "loss": 0.53, |
| "step": 6715 |
| }, |
| { |
| "epoch": 1.6173285198555956, |
| "grad_norm": 1.625, |
| "learning_rate": 5.833995388202477e-05, |
| "loss": 0.5461, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.618531889290012, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.829267813175021e-05, |
| "loss": 0.5346, |
| "step": 6725 |
| }, |
| { |
| "epoch": 1.6197352587244285, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.8245409572585945e-05, |
| "loss": 0.5415, |
| "step": 6730 |
| }, |
| { |
| "epoch": 1.6209386281588447, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.819814828001857e-05, |
| "loss": 0.5267, |
| "step": 6735 |
| }, |
| { |
| "epoch": 1.6221419975932612, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.815089432952304e-05, |
| "loss": 0.5565, |
| "step": 6740 |
| }, |
| { |
| "epoch": 1.6233453670276776, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.810364779656263e-05, |
| "loss": 0.5342, |
| "step": 6745 |
| }, |
| { |
| "epoch": 1.6245487364620939, |
| "grad_norm": 1.7578125, |
| "learning_rate": 5.8056408756588765e-05, |
| "loss": 0.5535, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.62575210589651, |
| "grad_norm": 1.78125, |
| "learning_rate": 5.800917728504089e-05, |
| "loss": 0.5675, |
| "step": 6755 |
| }, |
| { |
| "epoch": 1.6269554753309265, |
| "grad_norm": 1.828125, |
| "learning_rate": 5.796195345734634e-05, |
| "loss": 0.5713, |
| "step": 6760 |
| }, |
| { |
| "epoch": 1.628158844765343, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.7914737348920306e-05, |
| "loss": 0.5212, |
| "step": 6765 |
| }, |
| { |
| "epoch": 1.6293622141997592, |
| "grad_norm": 1.6640625, |
| "learning_rate": 5.786752903516559e-05, |
| "loss": 0.5071, |
| "step": 6770 |
| }, |
| { |
| "epoch": 1.6305655836341757, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.782032859147257e-05, |
| "loss": 0.5464, |
| "step": 6775 |
| }, |
| { |
| "epoch": 1.6317689530685922, |
| "grad_norm": 1.8828125, |
| "learning_rate": 5.777313609321908e-05, |
| "loss": 0.5345, |
| "step": 6780 |
| }, |
| { |
| "epoch": 1.6329723225030084, |
| "grad_norm": 1.734375, |
| "learning_rate": 5.772595161577023e-05, |
| "loss": 0.5479, |
| "step": 6785 |
| }, |
| { |
| "epoch": 1.6341756919374246, |
| "grad_norm": 1.7109375, |
| "learning_rate": 5.767877523447831e-05, |
| "loss": 0.5244, |
| "step": 6790 |
| }, |
| { |
| "epoch": 1.6353790613718413, |
| "grad_norm": 1.71875, |
| "learning_rate": 5.7631607024682735e-05, |
| "loss": 0.5496, |
| "step": 6795 |
| }, |
| { |
| "epoch": 1.6365824308062575, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.758444706170984e-05, |
| "loss": 0.5478, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.6377858002406738, |
| "grad_norm": 1.59375, |
| "learning_rate": 5.7537295420872764e-05, |
| "loss": 0.5553, |
| "step": 6805 |
| }, |
| { |
| "epoch": 1.6389891696750902, |
| "grad_norm": 1.515625, |
| "learning_rate": 5.7490152177471396e-05, |
| "loss": 0.506, |
| "step": 6810 |
| }, |
| { |
| "epoch": 1.6401925391095067, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.744301740679222e-05, |
| "loss": 0.5151, |
| "step": 6815 |
| }, |
| { |
| "epoch": 1.641395908543923, |
| "grad_norm": 1.5, |
| "learning_rate": 5.739589118410812e-05, |
| "loss": 0.5418, |
| "step": 6820 |
| }, |
| { |
| "epoch": 1.6425992779783394, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.734877358467841e-05, |
| "loss": 0.5289, |
| "step": 6825 |
| }, |
| { |
| "epoch": 1.6438026474127558, |
| "grad_norm": 1.7734375, |
| "learning_rate": 5.730166468374861e-05, |
| "loss": 0.5453, |
| "step": 6830 |
| }, |
| { |
| "epoch": 1.645006016847172, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.725456455655032e-05, |
| "loss": 0.5122, |
| "step": 6835 |
| }, |
| { |
| "epoch": 1.6462093862815883, |
| "grad_norm": 1.5859375, |
| "learning_rate": 5.7207473278301156e-05, |
| "loss": 0.5027, |
| "step": 6840 |
| }, |
| { |
| "epoch": 1.6474127557160048, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.716039092420459e-05, |
| "loss": 0.5548, |
| "step": 6845 |
| }, |
| { |
| "epoch": 1.6486161251504212, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.711331756944984e-05, |
| "loss": 0.5339, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.6498194945848375, |
| "grad_norm": 1.8203125, |
| "learning_rate": 5.706625328921179e-05, |
| "loss": 0.5223, |
| "step": 6855 |
| }, |
| { |
| "epoch": 1.651022864019254, |
| "grad_norm": 1.625, |
| "learning_rate": 5.701919815865077e-05, |
| "loss": 0.5125, |
| "step": 6860 |
| }, |
| { |
| "epoch": 1.6522262334536704, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.697215225291253e-05, |
| "loss": 0.5413, |
| "step": 6865 |
| }, |
| { |
| "epoch": 1.6534296028880866, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.692511564712811e-05, |
| "loss": 0.5466, |
| "step": 6870 |
| }, |
| { |
| "epoch": 1.654632972322503, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.687808841641366e-05, |
| "loss": 0.5303, |
| "step": 6875 |
| }, |
| { |
| "epoch": 1.6558363417569195, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.683107063587036e-05, |
| "loss": 0.5052, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.6570397111913358, |
| "grad_norm": 1.625, |
| "learning_rate": 5.678406238058436e-05, |
| "loss": 0.5444, |
| "step": 6885 |
| }, |
| { |
| "epoch": 1.658243080625752, |
| "grad_norm": 1.7890625, |
| "learning_rate": 5.673706372562651e-05, |
| "loss": 0.5216, |
| "step": 6890 |
| }, |
| { |
| "epoch": 1.6594464500601684, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.6690074746052387e-05, |
| "loss": 0.4973, |
| "step": 6895 |
| }, |
| { |
| "epoch": 1.660649819494585, |
| "grad_norm": 1.5546875, |
| "learning_rate": 5.6643095516902086e-05, |
| "loss": 0.5067, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.6618531889290011, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.659612611320016e-05, |
| "loss": 0.513, |
| "step": 6905 |
| }, |
| { |
| "epoch": 1.6630565583634176, |
| "grad_norm": 1.609375, |
| "learning_rate": 5.654916660995545e-05, |
| "loss": 0.5462, |
| "step": 6910 |
| }, |
| { |
| "epoch": 1.664259927797834, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.6502217082160985e-05, |
| "loss": 0.5546, |
| "step": 6915 |
| }, |
| { |
| "epoch": 1.6654632972322503, |
| "grad_norm": 1.765625, |
| "learning_rate": 5.645527760479389e-05, |
| "loss": 0.5366, |
| "step": 6920 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.6408348252815175e-05, |
| "loss": 0.5309, |
| "step": 6925 |
| }, |
| { |
| "epoch": 1.6678700361010832, |
| "grad_norm": 1.671875, |
| "learning_rate": 5.636142910116977e-05, |
| "loss": 0.5298, |
| "step": 6930 |
| }, |
| { |
| "epoch": 1.6690734055354994, |
| "grad_norm": 1.7734375, |
| "learning_rate": 5.6314520224786246e-05, |
| "loss": 0.5296, |
| "step": 6935 |
| }, |
| { |
| "epoch": 1.6702767749699157, |
| "grad_norm": 1.5703125, |
| "learning_rate": 5.626762169857681e-05, |
| "loss": 0.5437, |
| "step": 6940 |
| }, |
| { |
| "epoch": 1.6714801444043321, |
| "grad_norm": 1.671875, |
| "learning_rate": 5.6220733597437104e-05, |
| "loss": 0.5303, |
| "step": 6945 |
| }, |
| { |
| "epoch": 1.6726835138387486, |
| "grad_norm": 1.5859375, |
| "learning_rate": 5.6173855996246134e-05, |
| "loss": 0.5361, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.6738868832731648, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.6126988969866155e-05, |
| "loss": 0.5326, |
| "step": 6955 |
| }, |
| { |
| "epoch": 1.6750902527075813, |
| "grad_norm": 1.5234375, |
| "learning_rate": 5.6080132593142505e-05, |
| "loss": 0.5333, |
| "step": 6960 |
| }, |
| { |
| "epoch": 1.6762936221419977, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.6033286940903516e-05, |
| "loss": 0.5072, |
| "step": 6965 |
| }, |
| { |
| "epoch": 1.677496991576414, |
| "grad_norm": 1.8671875, |
| "learning_rate": 5.598645208796045e-05, |
| "loss": 0.5403, |
| "step": 6970 |
| }, |
| { |
| "epoch": 1.6787003610108302, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.593962810910722e-05, |
| "loss": 0.5101, |
| "step": 6975 |
| }, |
| { |
| "epoch": 1.6799037304452467, |
| "grad_norm": 1.734375, |
| "learning_rate": 5.5892815079120496e-05, |
| "loss": 0.5334, |
| "step": 6980 |
| }, |
| { |
| "epoch": 1.6811070998796631, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.5846013072759353e-05, |
| "loss": 0.5519, |
| "step": 6985 |
| }, |
| { |
| "epoch": 1.6823104693140793, |
| "grad_norm": 1.609375, |
| "learning_rate": 5.5799222164765336e-05, |
| "loss": 0.5024, |
| "step": 6990 |
| }, |
| { |
| "epoch": 1.6835138387484958, |
| "grad_norm": 1.6640625, |
| "learning_rate": 5.575244242986223e-05, |
| "loss": 0.5289, |
| "step": 6995 |
| }, |
| { |
| "epoch": 1.6847172081829123, |
| "grad_norm": 1.671875, |
| "learning_rate": 5.570567394275598e-05, |
| "loss": 0.505, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.6847172081829123, |
| "eval_loss": 0.4532296061515808, |
| "eval_runtime": 2.6618, |
| "eval_samples_per_second": 75.138, |
| "eval_steps_per_second": 75.138, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.6859205776173285, |
| "grad_norm": 1.75, |
| "learning_rate": 5.565891677813456e-05, |
| "loss": 0.5362, |
| "step": 7005 |
| }, |
| { |
| "epoch": 1.6871239470517447, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.561217101066792e-05, |
| "loss": 0.5461, |
| "step": 7010 |
| }, |
| { |
| "epoch": 1.6883273164861614, |
| "grad_norm": 1.7578125, |
| "learning_rate": 5.5565436715007724e-05, |
| "loss": 0.5384, |
| "step": 7015 |
| }, |
| { |
| "epoch": 1.6895306859205776, |
| "grad_norm": 1.859375, |
| "learning_rate": 5.5518713965787376e-05, |
| "loss": 0.5458, |
| "step": 7020 |
| }, |
| { |
| "epoch": 1.6907340553549939, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.547200283762182e-05, |
| "loss": 0.5111, |
| "step": 7025 |
| }, |
| { |
| "epoch": 1.6919374247894103, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.542530340510744e-05, |
| "loss": 0.509, |
| "step": 7030 |
| }, |
| { |
| "epoch": 1.6931407942238268, |
| "grad_norm": 1.6015625, |
| "learning_rate": 5.537861574282195e-05, |
| "loss": 0.5409, |
| "step": 7035 |
| }, |
| { |
| "epoch": 1.694344163658243, |
| "grad_norm": 1.578125, |
| "learning_rate": 5.533193992532426e-05, |
| "loss": 0.5218, |
| "step": 7040 |
| }, |
| { |
| "epoch": 1.6955475330926595, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.5285276027154364e-05, |
| "loss": 0.5498, |
| "step": 7045 |
| }, |
| { |
| "epoch": 1.696750902527076, |
| "grad_norm": 1.546875, |
| "learning_rate": 5.523862412283323e-05, |
| "loss": 0.5424, |
| "step": 7050 |
| }, |
| { |
| "epoch": 1.6979542719614922, |
| "grad_norm": 1.9140625, |
| "learning_rate": 5.519198428686266e-05, |
| "loss": 0.5261, |
| "step": 7055 |
| }, |
| { |
| "epoch": 1.6991576413959084, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.5145356593725205e-05, |
| "loss": 0.5015, |
| "step": 7060 |
| }, |
| { |
| "epoch": 1.7003610108303249, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.5098741117883974e-05, |
| "loss": 0.5267, |
| "step": 7065 |
| }, |
| { |
| "epoch": 1.7015643802647413, |
| "grad_norm": 1.7109375, |
| "learning_rate": 5.505213793378264e-05, |
| "loss": 0.5124, |
| "step": 7070 |
| }, |
| { |
| "epoch": 1.7027677496991576, |
| "grad_norm": 1.890625, |
| "learning_rate": 5.5005547115845195e-05, |
| "loss": 0.5388, |
| "step": 7075 |
| }, |
| { |
| "epoch": 1.703971119133574, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.495896873847589e-05, |
| "loss": 0.5185, |
| "step": 7080 |
| }, |
| { |
| "epoch": 1.7051744885679905, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.491240287605911e-05, |
| "loss": 0.5264, |
| "step": 7085 |
| }, |
| { |
| "epoch": 1.7063778580024067, |
| "grad_norm": 1.6328125, |
| "learning_rate": 5.486584960295926e-05, |
| "loss": 0.4885, |
| "step": 7090 |
| }, |
| { |
| "epoch": 1.707581227436823, |
| "grad_norm": 1.96875, |
| "learning_rate": 5.4819308993520635e-05, |
| "loss": 0.5509, |
| "step": 7095 |
| }, |
| { |
| "epoch": 1.7087845968712396, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.477278112206731e-05, |
| "loss": 0.5354, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.7099879663056559, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.4726266062903016e-05, |
| "loss": 0.526, |
| "step": 7105 |
| }, |
| { |
| "epoch": 1.711191335740072, |
| "grad_norm": 1.734375, |
| "learning_rate": 5.467976389031103e-05, |
| "loss": 0.5211, |
| "step": 7110 |
| }, |
| { |
| "epoch": 1.7123947051744886, |
| "grad_norm": 1.78125, |
| "learning_rate": 5.463327467855402e-05, |
| "loss": 0.4991, |
| "step": 7115 |
| }, |
| { |
| "epoch": 1.713598074608905, |
| "grad_norm": 1.7421875, |
| "learning_rate": 5.458679850187402e-05, |
| "loss": 0.5524, |
| "step": 7120 |
| }, |
| { |
| "epoch": 1.7148014440433212, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.454033543449218e-05, |
| "loss": 0.5212, |
| "step": 7125 |
| }, |
| { |
| "epoch": 1.7160048134777377, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.449388555060873e-05, |
| "loss": 0.5653, |
| "step": 7130 |
| }, |
| { |
| "epoch": 1.7172081829121542, |
| "grad_norm": 1.5625, |
| "learning_rate": 5.444744892440289e-05, |
| "loss": 0.5258, |
| "step": 7135 |
| }, |
| { |
| "epoch": 1.7184115523465704, |
| "grad_norm": 1.5390625, |
| "learning_rate": 5.4401025630032645e-05, |
| "loss": 0.5132, |
| "step": 7140 |
| }, |
| { |
| "epoch": 1.7196149217809866, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.435461574163473e-05, |
| "loss": 0.5255, |
| "step": 7145 |
| }, |
| { |
| "epoch": 1.720818291215403, |
| "grad_norm": 1.6328125, |
| "learning_rate": 5.4308219333324475e-05, |
| "loss": 0.5403, |
| "step": 7150 |
| }, |
| { |
| "epoch": 1.7220216606498195, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.426183647919564e-05, |
| "loss": 0.569, |
| "step": 7155 |
| }, |
| { |
| "epoch": 1.7232250300842358, |
| "grad_norm": 1.5859375, |
| "learning_rate": 5.421546725332038e-05, |
| "loss": 0.5447, |
| "step": 7160 |
| }, |
| { |
| "epoch": 1.7244283995186522, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.416911172974909e-05, |
| "loss": 0.5287, |
| "step": 7165 |
| }, |
| { |
| "epoch": 1.7256317689530687, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.412276998251026e-05, |
| "loss": 0.5299, |
| "step": 7170 |
| }, |
| { |
| "epoch": 1.726835138387485, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.407644208561036e-05, |
| "loss": 0.5364, |
| "step": 7175 |
| }, |
| { |
| "epoch": 1.7280385078219012, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.40301281130338e-05, |
| "loss": 0.5338, |
| "step": 7180 |
| }, |
| { |
| "epoch": 1.7292418772563178, |
| "grad_norm": 1.6328125, |
| "learning_rate": 5.398382813874271e-05, |
| "loss": 0.5242, |
| "step": 7185 |
| }, |
| { |
| "epoch": 1.730445246690734, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.393754223667687e-05, |
| "loss": 0.5531, |
| "step": 7190 |
| }, |
| { |
| "epoch": 1.7316486161251503, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.3891270480753595e-05, |
| "loss": 0.4976, |
| "step": 7195 |
| }, |
| { |
| "epoch": 1.7328519855595668, |
| "grad_norm": 1.609375, |
| "learning_rate": 5.3845012944867614e-05, |
| "loss": 0.5231, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.7340553549939832, |
| "grad_norm": 1.5390625, |
| "learning_rate": 5.379876970289094e-05, |
| "loss": 0.502, |
| "step": 7205 |
| }, |
| { |
| "epoch": 1.7352587244283995, |
| "grad_norm": 1.8046875, |
| "learning_rate": 5.3752540828672736e-05, |
| "loss": 0.5223, |
| "step": 7210 |
| }, |
| { |
| "epoch": 1.736462093862816, |
| "grad_norm": 1.5546875, |
| "learning_rate": 5.3706326396039276e-05, |
| "loss": 0.5151, |
| "step": 7215 |
| }, |
| { |
| "epoch": 1.7376654632972324, |
| "grad_norm": 1.8984375, |
| "learning_rate": 5.3660126478793724e-05, |
| "loss": 0.5491, |
| "step": 7220 |
| }, |
| { |
| "epoch": 1.7388688327316486, |
| "grad_norm": 1.625, |
| "learning_rate": 5.361394115071608e-05, |
| "loss": 0.5299, |
| "step": 7225 |
| }, |
| { |
| "epoch": 1.7400722021660648, |
| "grad_norm": 1.96875, |
| "learning_rate": 5.356777048556303e-05, |
| "loss": 0.5394, |
| "step": 7230 |
| }, |
| { |
| "epoch": 1.7412755716004813, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.352161455706787e-05, |
| "loss": 0.5529, |
| "step": 7235 |
| }, |
| { |
| "epoch": 1.7424789410348978, |
| "grad_norm": 1.7578125, |
| "learning_rate": 5.347547343894035e-05, |
| "loss": 0.5515, |
| "step": 7240 |
| }, |
| { |
| "epoch": 1.743682310469314, |
| "grad_norm": 1.609375, |
| "learning_rate": 5.3429347204866555e-05, |
| "loss": 0.5264, |
| "step": 7245 |
| }, |
| { |
| "epoch": 1.7448856799037304, |
| "grad_norm": 1.8359375, |
| "learning_rate": 5.338323592850881e-05, |
| "loss": 0.5158, |
| "step": 7250 |
| }, |
| { |
| "epoch": 1.746089049338147, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.333713968350558e-05, |
| "loss": 0.5633, |
| "step": 7255 |
| }, |
| { |
| "epoch": 1.7472924187725631, |
| "grad_norm": 1.8125, |
| "learning_rate": 5.329105854347129e-05, |
| "loss": 0.5356, |
| "step": 7260 |
| }, |
| { |
| "epoch": 1.7484957882069796, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.324499258199625e-05, |
| "loss": 0.5117, |
| "step": 7265 |
| }, |
| { |
| "epoch": 1.749699157641396, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.319894187264653e-05, |
| "loss": 0.5246, |
| "step": 7270 |
| }, |
| { |
| "epoch": 1.7509025270758123, |
| "grad_norm": 1.671875, |
| "learning_rate": 5.315290648896386e-05, |
| "loss": 0.5215, |
| "step": 7275 |
| }, |
| { |
| "epoch": 1.7521058965102285, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.310688650446546e-05, |
| "loss": 0.5411, |
| "step": 7280 |
| }, |
| { |
| "epoch": 1.753309265944645, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.3060881992644e-05, |
| "loss": 0.4988, |
| "step": 7285 |
| }, |
| { |
| "epoch": 1.7545126353790614, |
| "grad_norm": 1.625, |
| "learning_rate": 5.301489302696741e-05, |
| "loss": 0.5333, |
| "step": 7290 |
| }, |
| { |
| "epoch": 1.7557160048134777, |
| "grad_norm": 1.5703125, |
| "learning_rate": 5.29689196808788e-05, |
| "loss": 0.5175, |
| "step": 7295 |
| }, |
| { |
| "epoch": 1.7569193742478941, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.292296202779636e-05, |
| "loss": 0.5333, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.7581227436823106, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.287702014111317e-05, |
| "loss": 0.563, |
| "step": 7305 |
| }, |
| { |
| "epoch": 1.7593261131167268, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.2831094094197186e-05, |
| "loss": 0.5025, |
| "step": 7310 |
| }, |
| { |
| "epoch": 1.760529482551143, |
| "grad_norm": 1.453125, |
| "learning_rate": 5.2785183960391025e-05, |
| "loss": 0.519, |
| "step": 7315 |
| }, |
| { |
| "epoch": 1.7617328519855595, |
| "grad_norm": 1.5625, |
| "learning_rate": 5.2739289813011925e-05, |
| "loss": 0.4948, |
| "step": 7320 |
| }, |
| { |
| "epoch": 1.762936221419976, |
| "grad_norm": 1.734375, |
| "learning_rate": 5.269341172535156e-05, |
| "loss": 0.5202, |
| "step": 7325 |
| }, |
| { |
| "epoch": 1.7641395908543922, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.2647549770675984e-05, |
| "loss": 0.4981, |
| "step": 7330 |
| }, |
| { |
| "epoch": 1.7653429602888087, |
| "grad_norm": 1.5390625, |
| "learning_rate": 5.2601704022225466e-05, |
| "loss": 0.5138, |
| "step": 7335 |
| }, |
| { |
| "epoch": 1.7665463297232251, |
| "grad_norm": 1.7421875, |
| "learning_rate": 5.2555874553214414e-05, |
| "loss": 0.5303, |
| "step": 7340 |
| }, |
| { |
| "epoch": 1.7677496991576414, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.251006143683123e-05, |
| "loss": 0.5314, |
| "step": 7345 |
| }, |
| { |
| "epoch": 1.7689530685920578, |
| "grad_norm": 1.671875, |
| "learning_rate": 5.2464264746238215e-05, |
| "loss": 0.5064, |
| "step": 7350 |
| }, |
| { |
| "epoch": 1.7701564380264743, |
| "grad_norm": 1.4921875, |
| "learning_rate": 5.241848455457141e-05, |
| "loss": 0.5114, |
| "step": 7355 |
| }, |
| { |
| "epoch": 1.7713598074608905, |
| "grad_norm": 1.5546875, |
| "learning_rate": 5.237272093494052e-05, |
| "loss": 0.5072, |
| "step": 7360 |
| }, |
| { |
| "epoch": 1.7725631768953067, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.2326973960428794e-05, |
| "loss": 0.534, |
| "step": 7365 |
| }, |
| { |
| "epoch": 1.7737665463297232, |
| "grad_norm": 1.5234375, |
| "learning_rate": 5.2281243704092886e-05, |
| "loss": 0.5141, |
| "step": 7370 |
| }, |
| { |
| "epoch": 1.7749699157641396, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.2235530238962774e-05, |
| "loss": 0.5247, |
| "step": 7375 |
| }, |
| { |
| "epoch": 1.7761732851985559, |
| "grad_norm": 1.71875, |
| "learning_rate": 5.218983363804158e-05, |
| "loss": 0.5001, |
| "step": 7380 |
| }, |
| { |
| "epoch": 1.7773766546329723, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.214415397430552e-05, |
| "loss": 0.5389, |
| "step": 7385 |
| }, |
| { |
| "epoch": 1.7785800240673888, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.2098491320703795e-05, |
| "loss": 0.5073, |
| "step": 7390 |
| }, |
| { |
| "epoch": 1.779783393501805, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.205284575015837e-05, |
| "loss": 0.5679, |
| "step": 7395 |
| }, |
| { |
| "epoch": 1.7809867629362213, |
| "grad_norm": 1.8359375, |
| "learning_rate": 5.200721733556399e-05, |
| "loss": 0.5513, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.782190132370638, |
| "grad_norm": 1.59375, |
| "learning_rate": 5.196160614978798e-05, |
| "loss": 0.5526, |
| "step": 7405 |
| }, |
| { |
| "epoch": 1.7833935018050542, |
| "grad_norm": 1.7421875, |
| "learning_rate": 5.191601226567014e-05, |
| "loss": 0.5022, |
| "step": 7410 |
| }, |
| { |
| "epoch": 1.7845968712394704, |
| "grad_norm": 1.8125, |
| "learning_rate": 5.187043575602264e-05, |
| "loss": 0.5309, |
| "step": 7415 |
| }, |
| { |
| "epoch": 1.7858002406738869, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.182487669362992e-05, |
| "loss": 0.5247, |
| "step": 7420 |
| }, |
| { |
| "epoch": 1.7870036101083033, |
| "grad_norm": 1.71875, |
| "learning_rate": 5.177933515124856e-05, |
| "loss": 0.5395, |
| "step": 7425 |
| }, |
| { |
| "epoch": 1.7882069795427196, |
| "grad_norm": 1.59375, |
| "learning_rate": 5.173381120160712e-05, |
| "loss": 0.5064, |
| "step": 7430 |
| }, |
| { |
| "epoch": 1.789410348977136, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.1688304917406134e-05, |
| "loss": 0.5329, |
| "step": 7435 |
| }, |
| { |
| "epoch": 1.7906137184115525, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.1642816371317875e-05, |
| "loss": 0.5505, |
| "step": 7440 |
| }, |
| { |
| "epoch": 1.7918170878459687, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.159734563598629e-05, |
| "loss": 0.5252, |
| "step": 7445 |
| }, |
| { |
| "epoch": 1.793020457280385, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.1551892784026905e-05, |
| "loss": 0.5287, |
| "step": 7450 |
| }, |
| { |
| "epoch": 1.7942238267148014, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.150645788802666e-05, |
| "loss": 0.5489, |
| "step": 7455 |
| }, |
| { |
| "epoch": 1.7954271961492179, |
| "grad_norm": 1.6640625, |
| "learning_rate": 5.1461041020543855e-05, |
| "loss": 0.529, |
| "step": 7460 |
| }, |
| { |
| "epoch": 1.796630565583634, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.141564225410797e-05, |
| "loss": 0.5412, |
| "step": 7465 |
| }, |
| { |
| "epoch": 1.7978339350180506, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.137026166121958e-05, |
| "loss": 0.5298, |
| "step": 7470 |
| }, |
| { |
| "epoch": 1.799037304452467, |
| "grad_norm": 1.609375, |
| "learning_rate": 5.1324899314350255e-05, |
| "loss": 0.5254, |
| "step": 7475 |
| }, |
| { |
| "epoch": 1.8002406738868832, |
| "grad_norm": 1.5859375, |
| "learning_rate": 5.12795552859424e-05, |
| "loss": 0.4987, |
| "step": 7480 |
| }, |
| { |
| "epoch": 1.8014440433212995, |
| "grad_norm": 1.71875, |
| "learning_rate": 5.1234229648409194e-05, |
| "loss": 0.5129, |
| "step": 7485 |
| }, |
| { |
| "epoch": 1.8026474127557162, |
| "grad_norm": 1.5703125, |
| "learning_rate": 5.118892247413443e-05, |
| "loss": 0.5365, |
| "step": 7490 |
| }, |
| { |
| "epoch": 1.8038507821901324, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.1143633835472424e-05, |
| "loss": 0.5648, |
| "step": 7495 |
| }, |
| { |
| "epoch": 1.8050541516245486, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.1098363804747874e-05, |
| "loss": 0.5456, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.8050541516245486, |
| "eval_loss": 0.44866788387298584, |
| "eval_runtime": 2.6548, |
| "eval_samples_per_second": 75.335, |
| "eval_steps_per_second": 75.335, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.806257521058965, |
| "grad_norm": 1.6015625, |
| "learning_rate": 5.105311245425579e-05, |
| "loss": 0.5104, |
| "step": 7505 |
| }, |
| { |
| "epoch": 1.8074608904933815, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.100787985626133e-05, |
| "loss": 0.4927, |
| "step": 7510 |
| }, |
| { |
| "epoch": 1.8086642599277978, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.0962666082999714e-05, |
| "loss": 0.5157, |
| "step": 7515 |
| }, |
| { |
| "epoch": 1.8098676293622142, |
| "grad_norm": 1.65625, |
| "learning_rate": 5.0917471206676075e-05, |
| "loss": 0.5282, |
| "step": 7520 |
| }, |
| { |
| "epoch": 1.8110709987966307, |
| "grad_norm": 1.5859375, |
| "learning_rate": 5.087229529946542e-05, |
| "loss": 0.5211, |
| "step": 7525 |
| }, |
| { |
| "epoch": 1.812274368231047, |
| "grad_norm": 1.765625, |
| "learning_rate": 5.08271384335124e-05, |
| "loss": 0.5446, |
| "step": 7530 |
| }, |
| { |
| "epoch": 1.8134777376654632, |
| "grad_norm": 1.78125, |
| "learning_rate": 5.07820006809313e-05, |
| "loss": 0.5155, |
| "step": 7535 |
| }, |
| { |
| "epoch": 1.8146811070998796, |
| "grad_norm": 1.5625, |
| "learning_rate": 5.073688211380587e-05, |
| "loss": 0.5065, |
| "step": 7540 |
| }, |
| { |
| "epoch": 1.815884476534296, |
| "grad_norm": 1.7109375, |
| "learning_rate": 5.069178280418923e-05, |
| "loss": 0.508, |
| "step": 7545 |
| }, |
| { |
| "epoch": 1.8170878459687123, |
| "grad_norm": 1.8046875, |
| "learning_rate": 5.064670282410371e-05, |
| "loss": 0.5329, |
| "step": 7550 |
| }, |
| { |
| "epoch": 1.8182912154031288, |
| "grad_norm": 1.46875, |
| "learning_rate": 5.0601642245540826e-05, |
| "loss": 0.5221, |
| "step": 7555 |
| }, |
| { |
| "epoch": 1.8194945848375452, |
| "grad_norm": 1.7109375, |
| "learning_rate": 5.055660114046108e-05, |
| "loss": 0.5261, |
| "step": 7560 |
| }, |
| { |
| "epoch": 1.8206979542719615, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.0511579580793865e-05, |
| "loss": 0.5041, |
| "step": 7565 |
| }, |
| { |
| "epoch": 1.8219013237063777, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.046657763843739e-05, |
| "loss": 0.4916, |
| "step": 7570 |
| }, |
| { |
| "epoch": 1.8231046931407944, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.0421595385258514e-05, |
| "loss": 0.5279, |
| "step": 7575 |
| }, |
| { |
| "epoch": 1.8243080625752106, |
| "grad_norm": 1.578125, |
| "learning_rate": 5.037663289309264e-05, |
| "loss": 0.5219, |
| "step": 7580 |
| }, |
| { |
| "epoch": 1.8255114320096268, |
| "grad_norm": 1.7421875, |
| "learning_rate": 5.0331690233743665e-05, |
| "loss": 0.5261, |
| "step": 7585 |
| }, |
| { |
| "epoch": 1.8267148014440433, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.028676747898376e-05, |
| "loss": 0.5183, |
| "step": 7590 |
| }, |
| { |
| "epoch": 1.8279181708784598, |
| "grad_norm": 1.5546875, |
| "learning_rate": 5.024186470055332e-05, |
| "loss": 0.4969, |
| "step": 7595 |
| }, |
| { |
| "epoch": 1.829121540312876, |
| "grad_norm": 1.90625, |
| "learning_rate": 5.019698197016088e-05, |
| "loss": 0.5332, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.8303249097472925, |
| "grad_norm": 1.7421875, |
| "learning_rate": 5.0152119359482886e-05, |
| "loss": 0.5203, |
| "step": 7605 |
| }, |
| { |
| "epoch": 1.831528279181709, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.010727694016373e-05, |
| "loss": 0.5195, |
| "step": 7610 |
| }, |
| { |
| "epoch": 1.8327316486161251, |
| "grad_norm": 1.5234375, |
| "learning_rate": 5.00624547838155e-05, |
| "loss": 0.5075, |
| "step": 7615 |
| }, |
| { |
| "epoch": 1.8339350180505414, |
| "grad_norm": 1.6640625, |
| "learning_rate": 5.001765296201796e-05, |
| "loss": 0.5253, |
| "step": 7620 |
| }, |
| { |
| "epoch": 1.8351383874849578, |
| "grad_norm": 1.5625, |
| "learning_rate": 4.997287154631837e-05, |
| "loss": 0.5408, |
| "step": 7625 |
| }, |
| { |
| "epoch": 1.8363417569193743, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.992811060823143e-05, |
| "loss": 0.5472, |
| "step": 7630 |
| }, |
| { |
| "epoch": 1.8375451263537905, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.9883370219239146e-05, |
| "loss": 0.5116, |
| "step": 7635 |
| }, |
| { |
| "epoch": 1.838748495788207, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.983865045079065e-05, |
| "loss": 0.5468, |
| "step": 7640 |
| }, |
| { |
| "epoch": 1.8399518652226234, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.979395137430222e-05, |
| "loss": 0.527, |
| "step": 7645 |
| }, |
| { |
| "epoch": 1.8411552346570397, |
| "grad_norm": 1.6953125, |
| "learning_rate": 4.974927306115703e-05, |
| "loss": 0.5331, |
| "step": 7650 |
| }, |
| { |
| "epoch": 1.8423586040914561, |
| "grad_norm": 1.75, |
| "learning_rate": 4.970461558270513e-05, |
| "loss": 0.5134, |
| "step": 7655 |
| }, |
| { |
| "epoch": 1.8435619735258726, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.965997901026327e-05, |
| "loss": 0.5049, |
| "step": 7660 |
| }, |
| { |
| "epoch": 1.8447653429602888, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.961536341511485e-05, |
| "loss": 0.546, |
| "step": 7665 |
| }, |
| { |
| "epoch": 1.845968712394705, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.957076886850973e-05, |
| "loss": 0.523, |
| "step": 7670 |
| }, |
| { |
| "epoch": 1.8471720818291215, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.952619544166418e-05, |
| "loss": 0.5506, |
| "step": 7675 |
| }, |
| { |
| "epoch": 1.848375451263538, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.9481643205760744e-05, |
| "loss": 0.5393, |
| "step": 7680 |
| }, |
| { |
| "epoch": 1.8495788206979542, |
| "grad_norm": 1.75, |
| "learning_rate": 4.94371122319481e-05, |
| "loss": 0.521, |
| "step": 7685 |
| }, |
| { |
| "epoch": 1.8507821901323707, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.939260259134102e-05, |
| "loss": 0.5014, |
| "step": 7690 |
| }, |
| { |
| "epoch": 1.8519855595667871, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.9348114355020146e-05, |
| "loss": 0.5539, |
| "step": 7695 |
| }, |
| { |
| "epoch": 1.8531889290012034, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.930364759403198e-05, |
| "loss": 0.5332, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.8543922984356196, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.925920237938872e-05, |
| "loss": 0.527, |
| "step": 7705 |
| }, |
| { |
| "epoch": 1.855595667870036, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.921477878206815e-05, |
| "loss": 0.5315, |
| "step": 7710 |
| }, |
| { |
| "epoch": 1.8567990373044525, |
| "grad_norm": 1.78125, |
| "learning_rate": 4.9170376873013535e-05, |
| "loss": 0.5518, |
| "step": 7715 |
| }, |
| { |
| "epoch": 1.8580024067388687, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.91259967231335e-05, |
| "loss": 0.4976, |
| "step": 7720 |
| }, |
| { |
| "epoch": 1.8592057761732852, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.908163840330192e-05, |
| "loss": 0.5039, |
| "step": 7725 |
| }, |
| { |
| "epoch": 1.8604091456077017, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.9037301984357806e-05, |
| "loss": 0.5119, |
| "step": 7730 |
| }, |
| { |
| "epoch": 1.8616125150421179, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.899298753710522e-05, |
| "loss": 0.476, |
| "step": 7735 |
| }, |
| { |
| "epoch": 1.8628158844765343, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.894869513231311e-05, |
| "loss": 0.5074, |
| "step": 7740 |
| }, |
| { |
| "epoch": 1.8640192539109508, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.890442484071522e-05, |
| "loss": 0.5264, |
| "step": 7745 |
| }, |
| { |
| "epoch": 1.865222623345367, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.886017673301e-05, |
| "loss": 0.5583, |
| "step": 7750 |
| }, |
| { |
| "epoch": 1.8664259927797833, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.8815950879860446e-05, |
| "loss": 0.5001, |
| "step": 7755 |
| }, |
| { |
| "epoch": 1.8676293622141997, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.877174735189404e-05, |
| "loss": 0.5219, |
| "step": 7760 |
| }, |
| { |
| "epoch": 1.8688327316486162, |
| "grad_norm": 1.65625, |
| "learning_rate": 4.87275662197026e-05, |
| "loss": 0.5202, |
| "step": 7765 |
| }, |
| { |
| "epoch": 1.8700361010830324, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.868340755384218e-05, |
| "loss": 0.5163, |
| "step": 7770 |
| }, |
| { |
| "epoch": 1.8712394705174489, |
| "grad_norm": 1.625, |
| "learning_rate": 4.863927142483294e-05, |
| "loss": 0.5207, |
| "step": 7775 |
| }, |
| { |
| "epoch": 1.8724428399518653, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.859515790315909e-05, |
| "loss": 0.5159, |
| "step": 7780 |
| }, |
| { |
| "epoch": 1.8736462093862816, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.8551067059268704e-05, |
| "loss": 0.5157, |
| "step": 7785 |
| }, |
| { |
| "epoch": 1.8748495788206978, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.850699896357363e-05, |
| "loss": 0.5055, |
| "step": 7790 |
| }, |
| { |
| "epoch": 1.8760529482551145, |
| "grad_norm": 1.4765625, |
| "learning_rate": 4.8462953686449424e-05, |
| "loss": 0.5044, |
| "step": 7795 |
| }, |
| { |
| "epoch": 1.8772563176895307, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.8418931298235174e-05, |
| "loss": 0.5022, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.878459687123947, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.8374931869233425e-05, |
| "loss": 0.5135, |
| "step": 7805 |
| }, |
| { |
| "epoch": 1.8796630565583634, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.8330955469710065e-05, |
| "loss": 0.5238, |
| "step": 7810 |
| }, |
| { |
| "epoch": 1.8808664259927799, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.828700216989418e-05, |
| "loss": 0.5563, |
| "step": 7815 |
| }, |
| { |
| "epoch": 1.882069795427196, |
| "grad_norm": 1.796875, |
| "learning_rate": 4.8243072039977986e-05, |
| "loss": 0.5009, |
| "step": 7820 |
| }, |
| { |
| "epoch": 1.8832731648616126, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.819916515011669e-05, |
| "loss": 0.4995, |
| "step": 7825 |
| }, |
| { |
| "epoch": 1.884476534296029, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.815528157042841e-05, |
| "loss": 0.5511, |
| "step": 7830 |
| }, |
| { |
| "epoch": 1.8856799037304453, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.811142137099398e-05, |
| "loss": 0.5293, |
| "step": 7835 |
| }, |
| { |
| "epoch": 1.8868832731648615, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.806758462185695e-05, |
| "loss": 0.5794, |
| "step": 7840 |
| }, |
| { |
| "epoch": 1.888086642599278, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.802377139302342e-05, |
| "loss": 0.5428, |
| "step": 7845 |
| }, |
| { |
| "epoch": 1.8892900120336944, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.797998175446188e-05, |
| "loss": 0.4973, |
| "step": 7850 |
| }, |
| { |
| "epoch": 1.8904933814681106, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.79362157761032e-05, |
| "loss": 0.5191, |
| "step": 7855 |
| }, |
| { |
| "epoch": 1.891696750902527, |
| "grad_norm": 1.65625, |
| "learning_rate": 4.789247352784044e-05, |
| "loss": 0.511, |
| "step": 7860 |
| }, |
| { |
| "epoch": 1.8929001203369435, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.784875507952876e-05, |
| "loss": 0.5398, |
| "step": 7865 |
| }, |
| { |
| "epoch": 1.8941034897713598, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.780506050098533e-05, |
| "loss": 0.5216, |
| "step": 7870 |
| }, |
| { |
| "epoch": 1.895306859205776, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.7761389861989194e-05, |
| "loss": 0.5211, |
| "step": 7875 |
| }, |
| { |
| "epoch": 1.8965102286401927, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.7717743232281144e-05, |
| "loss": 0.5146, |
| "step": 7880 |
| }, |
| { |
| "epoch": 1.897713598074609, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.7674120681563665e-05, |
| "loss": 0.5227, |
| "step": 7885 |
| }, |
| { |
| "epoch": 1.8989169675090252, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.763052227950076e-05, |
| "loss": 0.4982, |
| "step": 7890 |
| }, |
| { |
| "epoch": 1.9001203369434416, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.758694809571788e-05, |
| "loss": 0.5442, |
| "step": 7895 |
| }, |
| { |
| "epoch": 1.901323706377858, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.754339819980181e-05, |
| "loss": 0.5375, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.9025270758122743, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.749987266130052e-05, |
| "loss": 0.543, |
| "step": 7905 |
| }, |
| { |
| "epoch": 1.9037304452466908, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.7456371549723105e-05, |
| "loss": 0.5202, |
| "step": 7910 |
| }, |
| { |
| "epoch": 1.9049338146811072, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.741289493453965e-05, |
| "loss": 0.5017, |
| "step": 7915 |
| }, |
| { |
| "epoch": 1.9061371841155235, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.73694428851811e-05, |
| "loss": 0.5265, |
| "step": 7920 |
| }, |
| { |
| "epoch": 1.9073405535499397, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.73260154710392e-05, |
| "loss": 0.5084, |
| "step": 7925 |
| }, |
| { |
| "epoch": 1.9085439229843562, |
| "grad_norm": 1.7734375, |
| "learning_rate": 4.728261276146632e-05, |
| "loss": 0.5465, |
| "step": 7930 |
| }, |
| { |
| "epoch": 1.9097472924187726, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.723923482577541e-05, |
| "loss": 0.5287, |
| "step": 7935 |
| }, |
| { |
| "epoch": 1.9109506618531888, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.719588173323983e-05, |
| "loss": 0.4913, |
| "step": 7940 |
| }, |
| { |
| "epoch": 1.9121540312876053, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.715255355309328e-05, |
| "loss": 0.5275, |
| "step": 7945 |
| }, |
| { |
| "epoch": 1.9133574007220218, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.710925035452967e-05, |
| "loss": 0.4948, |
| "step": 7950 |
| }, |
| { |
| "epoch": 1.914560770156438, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.706597220670303e-05, |
| "loss": 0.5135, |
| "step": 7955 |
| }, |
| { |
| "epoch": 1.9157641395908542, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.702271917872734e-05, |
| "loss": 0.5358, |
| "step": 7960 |
| }, |
| { |
| "epoch": 1.916967509025271, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.697949133967653e-05, |
| "loss": 0.5001, |
| "step": 7965 |
| }, |
| { |
| "epoch": 1.9181708784596871, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.6936288758584256e-05, |
| "loss": 0.5063, |
| "step": 7970 |
| }, |
| { |
| "epoch": 1.9193742478941034, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.689311150444387e-05, |
| "loss": 0.5036, |
| "step": 7975 |
| }, |
| { |
| "epoch": 1.9205776173285198, |
| "grad_norm": 1.5625, |
| "learning_rate": 4.684995964620821e-05, |
| "loss": 0.5218, |
| "step": 7980 |
| }, |
| { |
| "epoch": 1.9217809867629363, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.680683325278967e-05, |
| "loss": 0.5123, |
| "step": 7985 |
| }, |
| { |
| "epoch": 1.9229843561973525, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.6763732393059874e-05, |
| "loss": 0.5022, |
| "step": 7990 |
| }, |
| { |
| "epoch": 1.924187725631769, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.6720657135849716e-05, |
| "loss": 0.4931, |
| "step": 7995 |
| }, |
| { |
| "epoch": 1.9253910950661854, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.667760754994923e-05, |
| "loss": 0.5287, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.9253910950661854, |
| "eval_loss": 0.44210097193717957, |
| "eval_runtime": 2.6601, |
| "eval_samples_per_second": 75.184, |
| "eval_steps_per_second": 75.184, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.9265944645006017, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.663458370410737e-05, |
| "loss": 0.5263, |
| "step": 8005 |
| }, |
| { |
| "epoch": 1.927797833935018, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.6591585667032065e-05, |
| "loss": 0.5124, |
| "step": 8010 |
| }, |
| { |
| "epoch": 1.9290012033694344, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.6548613507389994e-05, |
| "loss": 0.519, |
| "step": 8015 |
| }, |
| { |
| "epoch": 1.9302045728038508, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.650566729380652e-05, |
| "loss": 0.5118, |
| "step": 8020 |
| }, |
| { |
| "epoch": 1.931407942238267, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.646274709486554e-05, |
| "loss": 0.5414, |
| "step": 8025 |
| }, |
| { |
| "epoch": 1.9326113116726835, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.641985297910945e-05, |
| "loss": 0.5142, |
| "step": 8030 |
| }, |
| { |
| "epoch": 1.9338146811071, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.637698501503899e-05, |
| "loss": 0.5272, |
| "step": 8035 |
| }, |
| { |
| "epoch": 1.9350180505415162, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.6334143271113086e-05, |
| "loss": 0.4837, |
| "step": 8040 |
| }, |
| { |
| "epoch": 1.9362214199759324, |
| "grad_norm": 1.7734375, |
| "learning_rate": 4.629132781574886e-05, |
| "loss": 0.5211, |
| "step": 8045 |
| }, |
| { |
| "epoch": 1.9374247894103491, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.6248538717321385e-05, |
| "loss": 0.5349, |
| "step": 8050 |
| }, |
| { |
| "epoch": 1.9386281588447654, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.620577604416369e-05, |
| "loss": 0.5315, |
| "step": 8055 |
| }, |
| { |
| "epoch": 1.9398315282791816, |
| "grad_norm": 1.5625, |
| "learning_rate": 4.616303986456659e-05, |
| "loss": 0.5028, |
| "step": 8060 |
| }, |
| { |
| "epoch": 1.941034897713598, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.612033024677859e-05, |
| "loss": 0.5402, |
| "step": 8065 |
| }, |
| { |
| "epoch": 1.9422382671480145, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.607764725900576e-05, |
| "loss": 0.5104, |
| "step": 8070 |
| }, |
| { |
| "epoch": 1.9434416365824307, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.603499096941167e-05, |
| "loss": 0.5094, |
| "step": 8075 |
| }, |
| { |
| "epoch": 1.9446450060168472, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.5992361446117254e-05, |
| "loss": 0.5097, |
| "step": 8080 |
| }, |
| { |
| "epoch": 1.9458483754512637, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.5949758757200655e-05, |
| "loss": 0.485, |
| "step": 8085 |
| }, |
| { |
| "epoch": 1.94705174488568, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.590718297069724e-05, |
| "loss": 0.5589, |
| "step": 8090 |
| }, |
| { |
| "epoch": 1.9482551143200961, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.586463415459933e-05, |
| "loss": 0.5094, |
| "step": 8095 |
| }, |
| { |
| "epoch": 1.9494584837545126, |
| "grad_norm": 1.625, |
| "learning_rate": 4.5822112376856236e-05, |
| "loss": 0.5018, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.950661853188929, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.577961770537406e-05, |
| "loss": 0.5215, |
| "step": 8105 |
| }, |
| { |
| "epoch": 1.9518652226233453, |
| "grad_norm": 2.78125, |
| "learning_rate": 4.573715020801563e-05, |
| "loss": 0.5018, |
| "step": 8110 |
| }, |
| { |
| "epoch": 1.9530685920577617, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.569470995260035e-05, |
| "loss": 0.5245, |
| "step": 8115 |
| }, |
| { |
| "epoch": 1.9542719614921782, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.565229700690417e-05, |
| "loss": 0.5267, |
| "step": 8120 |
| }, |
| { |
| "epoch": 1.9554753309265944, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.560991143865939e-05, |
| "loss": 0.5192, |
| "step": 8125 |
| }, |
| { |
| "epoch": 1.9566787003610109, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.556755331555459e-05, |
| "loss": 0.5226, |
| "step": 8130 |
| }, |
| { |
| "epoch": 1.9578820697954273, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.552522270523454e-05, |
| "loss": 0.5142, |
| "step": 8135 |
| }, |
| { |
| "epoch": 1.9590854392298436, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.5482919675300036e-05, |
| "loss": 0.5439, |
| "step": 8140 |
| }, |
| { |
| "epoch": 1.9602888086642598, |
| "grad_norm": 1.7421875, |
| "learning_rate": 4.544064429330788e-05, |
| "loss": 0.5377, |
| "step": 8145 |
| }, |
| { |
| "epoch": 1.9614921780986763, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.5398396626770674e-05, |
| "loss": 0.4933, |
| "step": 8150 |
| }, |
| { |
| "epoch": 1.9626955475330927, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.535617674315682e-05, |
| "loss": 0.5361, |
| "step": 8155 |
| }, |
| { |
| "epoch": 1.963898916967509, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.5313984709890246e-05, |
| "loss": 0.4983, |
| "step": 8160 |
| }, |
| { |
| "epoch": 1.9651022864019254, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.5271820594350514e-05, |
| "loss": 0.4995, |
| "step": 8165 |
| }, |
| { |
| "epoch": 1.9663056558363419, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.522968446387257e-05, |
| "loss": 0.5564, |
| "step": 8170 |
| }, |
| { |
| "epoch": 1.967509025270758, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.5187576385746606e-05, |
| "loss": 0.5333, |
| "step": 8175 |
| }, |
| { |
| "epoch": 1.9687123947051743, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.5145496427218115e-05, |
| "loss": 0.5434, |
| "step": 8180 |
| }, |
| { |
| "epoch": 1.969915764139591, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.51034446554876e-05, |
| "loss": 0.5388, |
| "step": 8185 |
| }, |
| { |
| "epoch": 1.9711191335740073, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.50614211377106e-05, |
| "loss": 0.5048, |
| "step": 8190 |
| }, |
| { |
| "epoch": 1.9723225030084235, |
| "grad_norm": 1.7421875, |
| "learning_rate": 4.501942594099751e-05, |
| "loss": 0.5497, |
| "step": 8195 |
| }, |
| { |
| "epoch": 1.97352587244284, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.497745913241351e-05, |
| "loss": 0.5155, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.9747292418772564, |
| "grad_norm": 1.53125, |
| "learning_rate": 4.4935520778978436e-05, |
| "loss": 0.5105, |
| "step": 8205 |
| }, |
| { |
| "epoch": 1.9759326113116726, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.489361094766668e-05, |
| "loss": 0.5334, |
| "step": 8210 |
| }, |
| { |
| "epoch": 1.977135980746089, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.4851729705407116e-05, |
| "loss": 0.5078, |
| "step": 8215 |
| }, |
| { |
| "epoch": 1.9783393501805056, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.4809877119082895e-05, |
| "loss": 0.5246, |
| "step": 8220 |
| }, |
| { |
| "epoch": 1.9795427196149218, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.476805325553149e-05, |
| "loss": 0.5111, |
| "step": 8225 |
| }, |
| { |
| "epoch": 1.980746089049338, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.472625818154443e-05, |
| "loss": 0.5518, |
| "step": 8230 |
| }, |
| { |
| "epoch": 1.9819494584837545, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.4684491963867316e-05, |
| "loss": 0.4914, |
| "step": 8235 |
| }, |
| { |
| "epoch": 1.983152827918171, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.464275466919963e-05, |
| "loss": 0.4908, |
| "step": 8240 |
| }, |
| { |
| "epoch": 1.9843561973525872, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.460104636419471e-05, |
| "loss": 0.5182, |
| "step": 8245 |
| }, |
| { |
| "epoch": 1.9855595667870036, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.455936711545955e-05, |
| "loss": 0.5369, |
| "step": 8250 |
| }, |
| { |
| "epoch": 1.98676293622142, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.4517716989554765e-05, |
| "loss": 0.5488, |
| "step": 8255 |
| }, |
| { |
| "epoch": 1.9879663056558363, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.447609605299448e-05, |
| "loss": 0.5152, |
| "step": 8260 |
| }, |
| { |
| "epoch": 1.9891696750902526, |
| "grad_norm": 1.796875, |
| "learning_rate": 4.443450437224617e-05, |
| "loss": 0.5413, |
| "step": 8265 |
| }, |
| { |
| "epoch": 1.9903730445246692, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.4392942013730636e-05, |
| "loss": 0.5233, |
| "step": 8270 |
| }, |
| { |
| "epoch": 1.9915764139590855, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.4351409043821766e-05, |
| "loss": 0.5414, |
| "step": 8275 |
| }, |
| { |
| "epoch": 1.9927797833935017, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.4309905528846614e-05, |
| "loss": 0.5151, |
| "step": 8280 |
| }, |
| { |
| "epoch": 1.9939831528279182, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.4268431535085135e-05, |
| "loss": 0.5228, |
| "step": 8285 |
| }, |
| { |
| "epoch": 1.9951865222623346, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.422698712877017e-05, |
| "loss": 0.5155, |
| "step": 8290 |
| }, |
| { |
| "epoch": 1.9963898916967509, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.418557237608729e-05, |
| "loss": 0.5104, |
| "step": 8295 |
| }, |
| { |
| "epoch": 1.9975932611311673, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.41441873431747e-05, |
| "loss": 0.5153, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.9987966305655838, |
| "grad_norm": 1.6953125, |
| "learning_rate": 4.410283209612319e-05, |
| "loss": 0.5152, |
| "step": 8305 |
| }, |
| { |
| "epoch": 1.9995186522262336, |
| "eval_loss": 0.4374042749404907, |
| "eval_runtime": 2.6528, |
| "eval_samples_per_second": 75.393, |
| "eval_steps_per_second": 75.393, |
| "step": 8308 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.406150670097592e-05, |
| "loss": 0.5064, |
| "step": 8310 |
| }, |
| { |
| "epoch": 2.0012033694344162, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.4020211223728445e-05, |
| "loss": 0.4974, |
| "step": 8315 |
| }, |
| { |
| "epoch": 2.002406738868833, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.3978945730328466e-05, |
| "loss": 0.4635, |
| "step": 8320 |
| }, |
| { |
| "epoch": 2.003610108303249, |
| "grad_norm": 1.796875, |
| "learning_rate": 4.393771028667588e-05, |
| "loss": 0.487, |
| "step": 8325 |
| }, |
| { |
| "epoch": 2.0048134777376654, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.389650495862253e-05, |
| "loss": 0.496, |
| "step": 8330 |
| }, |
| { |
| "epoch": 2.0060168471720816, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.38553298119722e-05, |
| "loss": 0.493, |
| "step": 8335 |
| }, |
| { |
| "epoch": 2.0072202166064983, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.381418491248049e-05, |
| "loss": 0.4847, |
| "step": 8340 |
| }, |
| { |
| "epoch": 2.0084235860409145, |
| "grad_norm": 1.515625, |
| "learning_rate": 4.377307032585463e-05, |
| "loss": 0.489, |
| "step": 8345 |
| }, |
| { |
| "epoch": 2.0096269554753308, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.373198611775352e-05, |
| "loss": 0.4824, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.0108303249097474, |
| "grad_norm": 1.65625, |
| "learning_rate": 4.3690932353787495e-05, |
| "loss": 0.5218, |
| "step": 8355 |
| }, |
| { |
| "epoch": 2.0120336943441637, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.364990909951829e-05, |
| "loss": 0.4906, |
| "step": 8360 |
| }, |
| { |
| "epoch": 2.01323706377858, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.36089164204589e-05, |
| "loss": 0.5099, |
| "step": 8365 |
| }, |
| { |
| "epoch": 2.0144404332129966, |
| "grad_norm": 1.5546875, |
| "learning_rate": 4.3567954382073545e-05, |
| "loss": 0.4783, |
| "step": 8370 |
| }, |
| { |
| "epoch": 2.015643802647413, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.352702304977743e-05, |
| "loss": 0.4765, |
| "step": 8375 |
| }, |
| { |
| "epoch": 2.016847172081829, |
| "grad_norm": 1.7421875, |
| "learning_rate": 4.348612248893679e-05, |
| "loss": 0.503, |
| "step": 8380 |
| }, |
| { |
| "epoch": 2.0180505415162453, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.3445252764868696e-05, |
| "loss": 0.4434, |
| "step": 8385 |
| }, |
| { |
| "epoch": 2.019253910950662, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.340441394284097e-05, |
| "loss": 0.4639, |
| "step": 8390 |
| }, |
| { |
| "epoch": 2.020457280385078, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.3363606088072096e-05, |
| "loss": 0.4673, |
| "step": 8395 |
| }, |
| { |
| "epoch": 2.0216606498194944, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.332282926573107e-05, |
| "loss": 0.4869, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.022864019253911, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.3282083540937395e-05, |
| "loss": 0.4653, |
| "step": 8405 |
| }, |
| { |
| "epoch": 2.0240673886883274, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.324136897876084e-05, |
| "loss": 0.4799, |
| "step": 8410 |
| }, |
| { |
| "epoch": 2.0252707581227436, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.320068564422146e-05, |
| "loss": 0.5021, |
| "step": 8415 |
| }, |
| { |
| "epoch": 2.02647412755716, |
| "grad_norm": 1.625, |
| "learning_rate": 4.316003360228941e-05, |
| "loss": 0.4804, |
| "step": 8420 |
| }, |
| { |
| "epoch": 2.0276774969915765, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.311941291788489e-05, |
| "loss": 0.5143, |
| "step": 8425 |
| }, |
| { |
| "epoch": 2.0288808664259927, |
| "grad_norm": 1.5078125, |
| "learning_rate": 4.307882365587802e-05, |
| "loss": 0.501, |
| "step": 8430 |
| }, |
| { |
| "epoch": 2.030084235860409, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.303826588108873e-05, |
| "loss": 0.5066, |
| "step": 8435 |
| }, |
| { |
| "epoch": 2.0312876052948257, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.299773965828667e-05, |
| "loss": 0.497, |
| "step": 8440 |
| }, |
| { |
| "epoch": 2.032490974729242, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.295724505219109e-05, |
| "loss": 0.4801, |
| "step": 8445 |
| }, |
| { |
| "epoch": 2.033694344163658, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.291678212747081e-05, |
| "loss": 0.4918, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.034897713598075, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.287635094874395e-05, |
| "loss": 0.4965, |
| "step": 8455 |
| }, |
| { |
| "epoch": 2.036101083032491, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.283595158057803e-05, |
| "loss": 0.507, |
| "step": 8460 |
| }, |
| { |
| "epoch": 2.0373044524669073, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.2795584087489705e-05, |
| "loss": 0.5007, |
| "step": 8465 |
| }, |
| { |
| "epoch": 2.0385078219013235, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.275524853394477e-05, |
| "loss": 0.498, |
| "step": 8470 |
| }, |
| { |
| "epoch": 2.03971119133574, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.2714944984357995e-05, |
| "loss": 0.4973, |
| "step": 8475 |
| }, |
| { |
| "epoch": 2.0409145607701564, |
| "grad_norm": 1.625, |
| "learning_rate": 4.267467350309302e-05, |
| "loss": 0.4778, |
| "step": 8480 |
| }, |
| { |
| "epoch": 2.0421179302045727, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.2634434154462325e-05, |
| "loss": 0.4889, |
| "step": 8485 |
| }, |
| { |
| "epoch": 2.0433212996389893, |
| "grad_norm": 1.4765625, |
| "learning_rate": 4.259422700272701e-05, |
| "loss": 0.4903, |
| "step": 8490 |
| }, |
| { |
| "epoch": 2.0445246690734056, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.255405211209682e-05, |
| "loss": 0.5066, |
| "step": 8495 |
| }, |
| { |
| "epoch": 2.045728038507822, |
| "grad_norm": 1.75, |
| "learning_rate": 4.251390954672993e-05, |
| "loss": 0.4997, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.045728038507822, |
| "eval_loss": 0.4380384087562561, |
| "eval_runtime": 2.6444, |
| "eval_samples_per_second": 75.632, |
| "eval_steps_per_second": 75.632, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.046931407942238, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.247379937073293e-05, |
| "loss": 0.5082, |
| "step": 8505 |
| }, |
| { |
| "epoch": 2.0481347773766547, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.243372164816064e-05, |
| "loss": 0.4997, |
| "step": 8510 |
| }, |
| { |
| "epoch": 2.049338146811071, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.23936764430161e-05, |
| "loss": 0.5169, |
| "step": 8515 |
| }, |
| { |
| "epoch": 2.050541516245487, |
| "grad_norm": 1.75, |
| "learning_rate": 4.235366381925042e-05, |
| "loss": 0.4879, |
| "step": 8520 |
| }, |
| { |
| "epoch": 2.051744885679904, |
| "grad_norm": 1.796875, |
| "learning_rate": 4.231368384076263e-05, |
| "loss": 0.4754, |
| "step": 8525 |
| }, |
| { |
| "epoch": 2.05294825511432, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.227373657139968e-05, |
| "loss": 0.5083, |
| "step": 8530 |
| }, |
| { |
| "epoch": 2.0541516245487363, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.223382207495623e-05, |
| "loss": 0.4871, |
| "step": 8535 |
| }, |
| { |
| "epoch": 2.055354993983153, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.219394041517467e-05, |
| "loss": 0.4533, |
| "step": 8540 |
| }, |
| { |
| "epoch": 2.0565583634175693, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.215409165574487e-05, |
| "loss": 0.4723, |
| "step": 8545 |
| }, |
| { |
| "epoch": 2.0577617328519855, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.211427586030426e-05, |
| "loss": 0.4635, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.0589651022864017, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.207449309243752e-05, |
| "loss": 0.4732, |
| "step": 8555 |
| }, |
| { |
| "epoch": 2.0601684717208184, |
| "grad_norm": 1.7734375, |
| "learning_rate": 4.203474341567665e-05, |
| "loss": 0.5234, |
| "step": 8560 |
| }, |
| { |
| "epoch": 2.0613718411552346, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.199502689350083e-05, |
| "loss": 0.487, |
| "step": 8565 |
| }, |
| { |
| "epoch": 2.062575210589651, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.195534358933619e-05, |
| "loss": 0.4711, |
| "step": 8570 |
| }, |
| { |
| "epoch": 2.0637785800240676, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.191569356655592e-05, |
| "loss": 0.4706, |
| "step": 8575 |
| }, |
| { |
| "epoch": 2.064981949458484, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.1876076888479995e-05, |
| "loss": 0.5129, |
| "step": 8580 |
| }, |
| { |
| "epoch": 2.0661853188929, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.183649361837518e-05, |
| "loss": 0.4672, |
| "step": 8585 |
| }, |
| { |
| "epoch": 2.0673886883273163, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.179694381945485e-05, |
| "loss": 0.4796, |
| "step": 8590 |
| }, |
| { |
| "epoch": 2.068592057761733, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.175742755487896e-05, |
| "loss": 0.484, |
| "step": 8595 |
| }, |
| { |
| "epoch": 2.069795427196149, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.171794488775388e-05, |
| "loss": 0.5078, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.0709987966305654, |
| "grad_norm": 1.6953125, |
| "learning_rate": 4.167849588113236e-05, |
| "loss": 0.4734, |
| "step": 8605 |
| }, |
| { |
| "epoch": 2.072202166064982, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.1639080598013376e-05, |
| "loss": 0.4798, |
| "step": 8610 |
| }, |
| { |
| "epoch": 2.0734055354993983, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.1599699101342056e-05, |
| "loss": 0.4717, |
| "step": 8615 |
| }, |
| { |
| "epoch": 2.0746089049338146, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.156035145400956e-05, |
| "loss": 0.5044, |
| "step": 8620 |
| }, |
| { |
| "epoch": 2.0758122743682312, |
| "grad_norm": 1.7578125, |
| "learning_rate": 4.1521037718852995e-05, |
| "loss": 0.5029, |
| "step": 8625 |
| }, |
| { |
| "epoch": 2.0770156438026475, |
| "grad_norm": 2.1875, |
| "learning_rate": 4.148175795865533e-05, |
| "loss": 0.4681, |
| "step": 8630 |
| }, |
| { |
| "epoch": 2.0782190132370637, |
| "grad_norm": 1.578125, |
| "learning_rate": 4.1442512236145245e-05, |
| "loss": 0.4573, |
| "step": 8635 |
| }, |
| { |
| "epoch": 2.07942238267148, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.140330061399709e-05, |
| "loss": 0.4982, |
| "step": 8640 |
| }, |
| { |
| "epoch": 2.0806257521058966, |
| "grad_norm": 1.625, |
| "learning_rate": 4.136412315483074e-05, |
| "loss": 0.5154, |
| "step": 8645 |
| }, |
| { |
| "epoch": 2.081829121540313, |
| "grad_norm": 1.5234375, |
| "learning_rate": 4.132497992121152e-05, |
| "loss": 0.4675, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.083032490974729, |
| "grad_norm": 1.75, |
| "learning_rate": 4.1285870975650125e-05, |
| "loss": 0.5419, |
| "step": 8655 |
| }, |
| { |
| "epoch": 2.0842358604091458, |
| "grad_norm": 1.65625, |
| "learning_rate": 4.1246796380602425e-05, |
| "loss": 0.4852, |
| "step": 8660 |
| }, |
| { |
| "epoch": 2.085439229843562, |
| "grad_norm": 1.59375, |
| "learning_rate": 4.12077561984695e-05, |
| "loss": 0.4931, |
| "step": 8665 |
| }, |
| { |
| "epoch": 2.0866425992779782, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.116875049159743e-05, |
| "loss": 0.4868, |
| "step": 8670 |
| }, |
| { |
| "epoch": 2.087845968712395, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.112977932227729e-05, |
| "loss": 0.4852, |
| "step": 8675 |
| }, |
| { |
| "epoch": 2.089049338146811, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.109084275274492e-05, |
| "loss": 0.4568, |
| "step": 8680 |
| }, |
| { |
| "epoch": 2.0902527075812274, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.105194084518098e-05, |
| "loss": 0.507, |
| "step": 8685 |
| }, |
| { |
| "epoch": 2.0914560770156436, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.101307366171073e-05, |
| "loss": 0.4767, |
| "step": 8690 |
| }, |
| { |
| "epoch": 2.0926594464500603, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.0974241264404015e-05, |
| "loss": 0.4906, |
| "step": 8695 |
| }, |
| { |
| "epoch": 2.0938628158844765, |
| "grad_norm": 1.75, |
| "learning_rate": 4.09354437152751e-05, |
| "loss": 0.4946, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.0950661853188928, |
| "grad_norm": 1.6875, |
| "learning_rate": 4.089668107628259e-05, |
| "loss": 0.523, |
| "step": 8705 |
| }, |
| { |
| "epoch": 2.0962695547533094, |
| "grad_norm": 1.7734375, |
| "learning_rate": 4.085795340932937e-05, |
| "loss": 0.4806, |
| "step": 8710 |
| }, |
| { |
| "epoch": 2.0974729241877257, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.081926077626244e-05, |
| "loss": 0.4834, |
| "step": 8715 |
| }, |
| { |
| "epoch": 2.098676293622142, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.0780603238872895e-05, |
| "loss": 0.488, |
| "step": 8720 |
| }, |
| { |
| "epoch": 2.099879663056558, |
| "grad_norm": 1.6171875, |
| "learning_rate": 4.0741980858895734e-05, |
| "loss": 0.4758, |
| "step": 8725 |
| }, |
| { |
| "epoch": 2.101083032490975, |
| "grad_norm": 1.6484375, |
| "learning_rate": 4.0703393698009855e-05, |
| "loss": 0.4907, |
| "step": 8730 |
| }, |
| { |
| "epoch": 2.102286401925391, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.066484181783788e-05, |
| "loss": 0.4809, |
| "step": 8735 |
| }, |
| { |
| "epoch": 2.1034897713598073, |
| "grad_norm": 1.7578125, |
| "learning_rate": 4.062632527994611e-05, |
| "loss": 0.5182, |
| "step": 8740 |
| }, |
| { |
| "epoch": 2.104693140794224, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.058784414584441e-05, |
| "loss": 0.4739, |
| "step": 8745 |
| }, |
| { |
| "epoch": 2.10589651022864, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.054939847698606e-05, |
| "loss": 0.5135, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.1070998796630565, |
| "grad_norm": 1.6015625, |
| "learning_rate": 4.0510988334767776e-05, |
| "loss": 0.514, |
| "step": 8755 |
| }, |
| { |
| "epoch": 2.108303249097473, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.047261378052946e-05, |
| "loss": 0.4837, |
| "step": 8760 |
| }, |
| { |
| "epoch": 2.1095066185318894, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.043427487555425e-05, |
| "loss": 0.5302, |
| "step": 8765 |
| }, |
| { |
| "epoch": 2.1107099879663056, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.03959716810683e-05, |
| "loss": 0.5048, |
| "step": 8770 |
| }, |
| { |
| "epoch": 2.111913357400722, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.0357704258240775e-05, |
| "loss": 0.4944, |
| "step": 8775 |
| }, |
| { |
| "epoch": 2.1131167268351385, |
| "grad_norm": 1.546875, |
| "learning_rate": 4.0319472668183685e-05, |
| "loss": 0.4866, |
| "step": 8780 |
| }, |
| { |
| "epoch": 2.1143200962695547, |
| "grad_norm": 1.5546875, |
| "learning_rate": 4.028127697195183e-05, |
| "loss": 0.4931, |
| "step": 8785 |
| }, |
| { |
| "epoch": 2.115523465703971, |
| "grad_norm": 1.5859375, |
| "learning_rate": 4.024311723054271e-05, |
| "loss": 0.4933, |
| "step": 8790 |
| }, |
| { |
| "epoch": 2.1167268351383877, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.020499350489635e-05, |
| "loss": 0.4959, |
| "step": 8795 |
| }, |
| { |
| "epoch": 2.117930204572804, |
| "grad_norm": 1.5703125, |
| "learning_rate": 4.016690585589532e-05, |
| "loss": 0.4659, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.11913357400722, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.012885434436451e-05, |
| "loss": 0.4954, |
| "step": 8805 |
| }, |
| { |
| "epoch": 2.1203369434416364, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.009083903107119e-05, |
| "loss": 0.4986, |
| "step": 8810 |
| }, |
| { |
| "epoch": 2.121540312876053, |
| "grad_norm": 1.78125, |
| "learning_rate": 4.005285997672472e-05, |
| "loss": 0.4771, |
| "step": 8815 |
| }, |
| { |
| "epoch": 2.1227436823104693, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.001491724197664e-05, |
| "loss": 0.4917, |
| "step": 8820 |
| }, |
| { |
| "epoch": 2.1239470517448855, |
| "grad_norm": 1.765625, |
| "learning_rate": 3.9977010887420435e-05, |
| "loss": 0.4628, |
| "step": 8825 |
| }, |
| { |
| "epoch": 2.125150421179302, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.993914097359151e-05, |
| "loss": 0.4825, |
| "step": 8830 |
| }, |
| { |
| "epoch": 2.1263537906137184, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.99013075609671e-05, |
| "loss": 0.488, |
| "step": 8835 |
| }, |
| { |
| "epoch": 2.1275571600481347, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.98635107099661e-05, |
| "loss": 0.4911, |
| "step": 8840 |
| }, |
| { |
| "epoch": 2.1287605294825513, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.982575048094906e-05, |
| "loss": 0.5006, |
| "step": 8845 |
| }, |
| { |
| "epoch": 2.1299638989169676, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.9788026934218e-05, |
| "loss": 0.4646, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.131167268351384, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.975034013001642e-05, |
| "loss": 0.4789, |
| "step": 8855 |
| }, |
| { |
| "epoch": 2.1323706377858, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.971269012852908e-05, |
| "loss": 0.5018, |
| "step": 8860 |
| }, |
| { |
| "epoch": 2.1335740072202167, |
| "grad_norm": 1.75, |
| "learning_rate": 3.967507698988203e-05, |
| "loss": 0.4741, |
| "step": 8865 |
| }, |
| { |
| "epoch": 2.134777376654633, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.963750077414238e-05, |
| "loss": 0.4832, |
| "step": 8870 |
| }, |
| { |
| "epoch": 2.135980746089049, |
| "grad_norm": 1.625, |
| "learning_rate": 3.959996154131834e-05, |
| "loss": 0.4968, |
| "step": 8875 |
| }, |
| { |
| "epoch": 2.137184115523466, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.9562459351359045e-05, |
| "loss": 0.4943, |
| "step": 8880 |
| }, |
| { |
| "epoch": 2.138387484957882, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.9524994264154416e-05, |
| "loss": 0.4864, |
| "step": 8885 |
| }, |
| { |
| "epoch": 2.1395908543922983, |
| "grad_norm": 1.625, |
| "learning_rate": 3.948756633953521e-05, |
| "loss": 0.4893, |
| "step": 8890 |
| }, |
| { |
| "epoch": 2.140794223826715, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.945017563727277e-05, |
| "loss": 0.4954, |
| "step": 8895 |
| }, |
| { |
| "epoch": 2.1419975932611313, |
| "grad_norm": 1.5546875, |
| "learning_rate": 3.941282221707903e-05, |
| "loss": 0.482, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.1432009626955475, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.937550613860637e-05, |
| "loss": 0.4891, |
| "step": 8905 |
| }, |
| { |
| "epoch": 2.1444043321299637, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.933822746144755e-05, |
| "loss": 0.4836, |
| "step": 8910 |
| }, |
| { |
| "epoch": 2.1456077015643804, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.9300986245135574e-05, |
| "loss": 0.4893, |
| "step": 8915 |
| }, |
| { |
| "epoch": 2.1468110709987966, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.926378254914365e-05, |
| "loss": 0.5184, |
| "step": 8920 |
| }, |
| { |
| "epoch": 2.148014440433213, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.922661643288507e-05, |
| "loss": 0.4951, |
| "step": 8925 |
| }, |
| { |
| "epoch": 2.1492178098676296, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.918948795571309e-05, |
| "loss": 0.5026, |
| "step": 8930 |
| }, |
| { |
| "epoch": 2.150421179302046, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.9152397176920856e-05, |
| "loss": 0.4973, |
| "step": 8935 |
| }, |
| { |
| "epoch": 2.151624548736462, |
| "grad_norm": 1.921875, |
| "learning_rate": 3.911534415574134e-05, |
| "loss": 0.4689, |
| "step": 8940 |
| }, |
| { |
| "epoch": 2.1528279181708783, |
| "grad_norm": 1.75, |
| "learning_rate": 3.9078328951347194e-05, |
| "loss": 0.4896, |
| "step": 8945 |
| }, |
| { |
| "epoch": 2.154031287605295, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.904135162285066e-05, |
| "loss": 0.5008, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.155234657039711, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.9004412229303564e-05, |
| "loss": 0.4997, |
| "step": 8955 |
| }, |
| { |
| "epoch": 2.1564380264741274, |
| "grad_norm": 1.8046875, |
| "learning_rate": 3.8967510829697046e-05, |
| "loss": 0.5027, |
| "step": 8960 |
| }, |
| { |
| "epoch": 2.157641395908544, |
| "grad_norm": 1.8125, |
| "learning_rate": 3.8930647482961664e-05, |
| "loss": 0.4757, |
| "step": 8965 |
| }, |
| { |
| "epoch": 2.1588447653429603, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.8893822247967144e-05, |
| "loss": 0.4581, |
| "step": 8970 |
| }, |
| { |
| "epoch": 2.1600481347773766, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.885703518352239e-05, |
| "loss": 0.4969, |
| "step": 8975 |
| }, |
| { |
| "epoch": 2.1612515042117932, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.8820286348375334e-05, |
| "loss": 0.5136, |
| "step": 8980 |
| }, |
| { |
| "epoch": 2.1624548736462095, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.8783575801212846e-05, |
| "loss": 0.471, |
| "step": 8985 |
| }, |
| { |
| "epoch": 2.1636582430806257, |
| "grad_norm": 1.625, |
| "learning_rate": 3.874690360066068e-05, |
| "loss": 0.4926, |
| "step": 8990 |
| }, |
| { |
| "epoch": 2.164861612515042, |
| "grad_norm": 1.5390625, |
| "learning_rate": 3.871026980528329e-05, |
| "loss": 0.476, |
| "step": 8995 |
| }, |
| { |
| "epoch": 2.1660649819494586, |
| "grad_norm": 1.5390625, |
| "learning_rate": 3.867367447358389e-05, |
| "loss": 0.4717, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.1660649819494586, |
| "eval_loss": 0.4340422749519348, |
| "eval_runtime": 2.659, |
| "eval_samples_per_second": 75.217, |
| "eval_steps_per_second": 75.217, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.167268351383875, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.863711766400418e-05, |
| "loss": 0.4822, |
| "step": 9005 |
| }, |
| { |
| "epoch": 2.168471720818291, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.860059943492441e-05, |
| "loss": 0.4784, |
| "step": 9010 |
| }, |
| { |
| "epoch": 2.1696750902527078, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.856411984466316e-05, |
| "loss": 0.4664, |
| "step": 9015 |
| }, |
| { |
| "epoch": 2.170878459687124, |
| "grad_norm": 1.765625, |
| "learning_rate": 3.8527678951477334e-05, |
| "loss": 0.4541, |
| "step": 9020 |
| }, |
| { |
| "epoch": 2.1720818291215402, |
| "grad_norm": 1.5703125, |
| "learning_rate": 3.849127681356208e-05, |
| "loss": 0.4829, |
| "step": 9025 |
| }, |
| { |
| "epoch": 2.1732851985559565, |
| "grad_norm": 1.625, |
| "learning_rate": 3.8454913489050544e-05, |
| "loss": 0.4587, |
| "step": 9030 |
| }, |
| { |
| "epoch": 2.174488567990373, |
| "grad_norm": 1.5546875, |
| "learning_rate": 3.841858903601402e-05, |
| "loss": 0.4861, |
| "step": 9035 |
| }, |
| { |
| "epoch": 2.1756919374247894, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.8382303512461616e-05, |
| "loss": 0.4912, |
| "step": 9040 |
| }, |
| { |
| "epoch": 2.1768953068592056, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.8346056976340335e-05, |
| "loss": 0.4779, |
| "step": 9045 |
| }, |
| { |
| "epoch": 2.1780986762936223, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.830984948553489e-05, |
| "loss": 0.4761, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.1793020457280385, |
| "grad_norm": 1.890625, |
| "learning_rate": 3.827368109786767e-05, |
| "loss": 0.4887, |
| "step": 9055 |
| }, |
| { |
| "epoch": 2.1805054151624548, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.8237551871098557e-05, |
| "loss": 0.4917, |
| "step": 9060 |
| }, |
| { |
| "epoch": 2.1817087845968715, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.8201461862924956e-05, |
| "loss": 0.4808, |
| "step": 9065 |
| }, |
| { |
| "epoch": 2.1829121540312877, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.8165411130981627e-05, |
| "loss": 0.4828, |
| "step": 9070 |
| }, |
| { |
| "epoch": 2.184115523465704, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.812939973284058e-05, |
| "loss": 0.4741, |
| "step": 9075 |
| }, |
| { |
| "epoch": 2.18531889290012, |
| "grad_norm": 1.6484375, |
| "learning_rate": 3.809342772601104e-05, |
| "loss": 0.4735, |
| "step": 9080 |
| }, |
| { |
| "epoch": 2.186522262334537, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.80574951679393e-05, |
| "loss": 0.4672, |
| "step": 9085 |
| }, |
| { |
| "epoch": 2.187725631768953, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.802160211600868e-05, |
| "loss": 0.4813, |
| "step": 9090 |
| }, |
| { |
| "epoch": 2.1889290012033693, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.798574862753938e-05, |
| "loss": 0.5285, |
| "step": 9095 |
| }, |
| { |
| "epoch": 2.190132370637786, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.7949934759788466e-05, |
| "loss": 0.4922, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.191335740072202, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.791416056994966e-05, |
| "loss": 0.4752, |
| "step": 9105 |
| }, |
| { |
| "epoch": 2.1925391095066185, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.787842611515339e-05, |
| "loss": 0.4723, |
| "step": 9110 |
| }, |
| { |
| "epoch": 2.1937424789410347, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.784273145246659e-05, |
| "loss": 0.5092, |
| "step": 9115 |
| }, |
| { |
| "epoch": 2.1949458483754514, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.7807076638892646e-05, |
| "loss": 0.4837, |
| "step": 9120 |
| }, |
| { |
| "epoch": 2.1961492178098676, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.777146173137134e-05, |
| "loss": 0.4488, |
| "step": 9125 |
| }, |
| { |
| "epoch": 2.197352587244284, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.773588678677867e-05, |
| "loss": 0.4671, |
| "step": 9130 |
| }, |
| { |
| "epoch": 2.1985559566787005, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.770035186192685e-05, |
| "loss": 0.4949, |
| "step": 9135 |
| }, |
| { |
| "epoch": 2.1997593261131168, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.766485701356419e-05, |
| "loss": 0.4647, |
| "step": 9140 |
| }, |
| { |
| "epoch": 2.200962695547533, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.7629402298374994e-05, |
| "loss": 0.4678, |
| "step": 9145 |
| }, |
| { |
| "epoch": 2.2021660649819497, |
| "grad_norm": 1.75, |
| "learning_rate": 3.7593987772979436e-05, |
| "loss": 0.4933, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.203369434416366, |
| "grad_norm": 1.625, |
| "learning_rate": 3.755861349393356e-05, |
| "loss": 0.4879, |
| "step": 9155 |
| }, |
| { |
| "epoch": 2.204572803850782, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.752327951772913e-05, |
| "loss": 0.4716, |
| "step": 9160 |
| }, |
| { |
| "epoch": 2.2057761732851984, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.74879859007935e-05, |
| "loss": 0.5021, |
| "step": 9165 |
| }, |
| { |
| "epoch": 2.206979542719615, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.745273269948964e-05, |
| "loss": 0.4915, |
| "step": 9170 |
| }, |
| { |
| "epoch": 2.2081829121540313, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.7417519970115926e-05, |
| "loss": 0.4831, |
| "step": 9175 |
| }, |
| { |
| "epoch": 2.2093862815884475, |
| "grad_norm": 1.5390625, |
| "learning_rate": 3.738234776890612e-05, |
| "loss": 0.4543, |
| "step": 9180 |
| }, |
| { |
| "epoch": 2.210589651022864, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.734721615202927e-05, |
| "loss": 0.4941, |
| "step": 9185 |
| }, |
| { |
| "epoch": 2.2117930204572804, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.731212517558958e-05, |
| "loss": 0.485, |
| "step": 9190 |
| }, |
| { |
| "epoch": 2.2129963898916967, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.72770748956264e-05, |
| "loss": 0.4662, |
| "step": 9195 |
| }, |
| { |
| "epoch": 2.214199759326113, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.724206536811404e-05, |
| "loss": 0.4902, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.2154031287605296, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.7207096648961774e-05, |
| "loss": 0.4659, |
| "step": 9205 |
| }, |
| { |
| "epoch": 2.216606498194946, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.717216879401367e-05, |
| "loss": 0.4956, |
| "step": 9210 |
| }, |
| { |
| "epoch": 2.217809867629362, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.713728185904856e-05, |
| "loss": 0.4943, |
| "step": 9215 |
| }, |
| { |
| "epoch": 2.2190132370637787, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.710243589977991e-05, |
| "loss": 0.4873, |
| "step": 9220 |
| }, |
| { |
| "epoch": 2.220216606498195, |
| "grad_norm": 1.5703125, |
| "learning_rate": 3.706763097185577e-05, |
| "loss": 0.4603, |
| "step": 9225 |
| }, |
| { |
| "epoch": 2.221419975932611, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.7032867130858654e-05, |
| "loss": 0.4683, |
| "step": 9230 |
| }, |
| { |
| "epoch": 2.222623345367028, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.6998144432305444e-05, |
| "loss": 0.4932, |
| "step": 9235 |
| }, |
| { |
| "epoch": 2.223826714801444, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.696346293164737e-05, |
| "loss": 0.4947, |
| "step": 9240 |
| }, |
| { |
| "epoch": 2.2250300842358604, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.692882268426979e-05, |
| "loss": 0.4836, |
| "step": 9245 |
| }, |
| { |
| "epoch": 2.2262334536702766, |
| "grad_norm": 1.5390625, |
| "learning_rate": 3.6894223745492257e-05, |
| "loss": 0.4734, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.2274368231046933, |
| "grad_norm": 1.8046875, |
| "learning_rate": 3.685966617056831e-05, |
| "loss": 0.4968, |
| "step": 9255 |
| }, |
| { |
| "epoch": 2.2286401925391095, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.682515001468546e-05, |
| "loss": 0.5096, |
| "step": 9260 |
| }, |
| { |
| "epoch": 2.2298435619735257, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.6790675332965045e-05, |
| "loss": 0.4882, |
| "step": 9265 |
| }, |
| { |
| "epoch": 2.2310469314079424, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.675624218046218e-05, |
| "loss": 0.4786, |
| "step": 9270 |
| }, |
| { |
| "epoch": 2.2322503008423586, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.6721850612165695e-05, |
| "loss": 0.4842, |
| "step": 9275 |
| }, |
| { |
| "epoch": 2.233453670276775, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.668750068299794e-05, |
| "loss": 0.5154, |
| "step": 9280 |
| }, |
| { |
| "epoch": 2.234657039711191, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.6653192447814835e-05, |
| "loss": 0.4666, |
| "step": 9285 |
| }, |
| { |
| "epoch": 2.235860409145608, |
| "grad_norm": 1.5390625, |
| "learning_rate": 3.661892596140566e-05, |
| "loss": 0.5, |
| "step": 9290 |
| }, |
| { |
| "epoch": 2.237063778580024, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.6584701278493095e-05, |
| "loss": 0.4798, |
| "step": 9295 |
| }, |
| { |
| "epoch": 2.2382671480144403, |
| "grad_norm": 1.5078125, |
| "learning_rate": 3.655051845373297e-05, |
| "loss": 0.449, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.239470517448857, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.651637754171435e-05, |
| "loss": 0.4917, |
| "step": 9305 |
| }, |
| { |
| "epoch": 2.240673886883273, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.648227859695931e-05, |
| "loss": 0.4574, |
| "step": 9310 |
| }, |
| { |
| "epoch": 2.2418772563176894, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.644822167392293e-05, |
| "loss": 0.5086, |
| "step": 9315 |
| }, |
| { |
| "epoch": 2.243080625752106, |
| "grad_norm": 1.765625, |
| "learning_rate": 3.6414206826993216e-05, |
| "loss": 0.4691, |
| "step": 9320 |
| }, |
| { |
| "epoch": 2.2442839951865223, |
| "grad_norm": 1.5, |
| "learning_rate": 3.638023411049089e-05, |
| "loss": 0.462, |
| "step": 9325 |
| }, |
| { |
| "epoch": 2.2454873646209386, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.634630357866948e-05, |
| "loss": 0.4666, |
| "step": 9330 |
| }, |
| { |
| "epoch": 2.246690734055355, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.6312415285715105e-05, |
| "loss": 0.511, |
| "step": 9335 |
| }, |
| { |
| "epoch": 2.2478941034897715, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.627856928574643e-05, |
| "loss": 0.4759, |
| "step": 9340 |
| }, |
| { |
| "epoch": 2.2490974729241877, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.6244765632814595e-05, |
| "loss": 0.4943, |
| "step": 9345 |
| }, |
| { |
| "epoch": 2.250300842358604, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.621100438090311e-05, |
| "loss": 0.451, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.2515042117930206, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.6177285583927745e-05, |
| "loss": 0.4786, |
| "step": 9355 |
| }, |
| { |
| "epoch": 2.252707581227437, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.6143609295736494e-05, |
| "loss": 0.4712, |
| "step": 9360 |
| }, |
| { |
| "epoch": 2.253910950661853, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.6109975570109494e-05, |
| "loss": 0.4952, |
| "step": 9365 |
| }, |
| { |
| "epoch": 2.2551143200962693, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.607638446075884e-05, |
| "loss": 0.4684, |
| "step": 9370 |
| }, |
| { |
| "epoch": 2.256317689530686, |
| "grad_norm": 1.625, |
| "learning_rate": 3.604283602132865e-05, |
| "loss": 0.5315, |
| "step": 9375 |
| }, |
| { |
| "epoch": 2.2575210589651022, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.6009330305394825e-05, |
| "loss": 0.457, |
| "step": 9380 |
| }, |
| { |
| "epoch": 2.2587244283995185, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.59758673664651e-05, |
| "loss": 0.466, |
| "step": 9385 |
| }, |
| { |
| "epoch": 2.259927797833935, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.594244725797883e-05, |
| "loss": 0.4609, |
| "step": 9390 |
| }, |
| { |
| "epoch": 2.2611311672683514, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.590907003330705e-05, |
| "loss": 0.4598, |
| "step": 9395 |
| }, |
| { |
| "epoch": 2.2623345367027676, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.587573574575226e-05, |
| "loss": 0.481, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.2635379061371843, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.5842444448548365e-05, |
| "loss": 0.4712, |
| "step": 9405 |
| }, |
| { |
| "epoch": 2.2647412755716005, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.5809196194860714e-05, |
| "loss": 0.4952, |
| "step": 9410 |
| }, |
| { |
| "epoch": 2.2659446450060168, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.577599103778581e-05, |
| "loss": 0.5135, |
| "step": 9415 |
| }, |
| { |
| "epoch": 2.2671480144404335, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.574282903035138e-05, |
| "loss": 0.4821, |
| "step": 9420 |
| }, |
| { |
| "epoch": 2.2683513838748497, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.570971022551624e-05, |
| "loss": 0.4877, |
| "step": 9425 |
| }, |
| { |
| "epoch": 2.269554753309266, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.567663467617023e-05, |
| "loss": 0.4727, |
| "step": 9430 |
| }, |
| { |
| "epoch": 2.270758122743682, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.564360243513407e-05, |
| "loss": 0.4715, |
| "step": 9435 |
| }, |
| { |
| "epoch": 2.271961492178099, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.561061355515935e-05, |
| "loss": 0.4631, |
| "step": 9440 |
| }, |
| { |
| "epoch": 2.273164861612515, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.55776680889284e-05, |
| "loss": 0.4907, |
| "step": 9445 |
| }, |
| { |
| "epoch": 2.2743682310469313, |
| "grad_norm": 1.75, |
| "learning_rate": 3.5544766089054224e-05, |
| "loss": 0.5336, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.2755716004813475, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.551190760808043e-05, |
| "loss": 0.4999, |
| "step": 9455 |
| }, |
| { |
| "epoch": 2.2767749699157642, |
| "grad_norm": 1.9453125, |
| "learning_rate": 3.547909269848108e-05, |
| "loss": 0.51, |
| "step": 9460 |
| }, |
| { |
| "epoch": 2.2779783393501805, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.54463214126607e-05, |
| "loss": 0.4699, |
| "step": 9465 |
| }, |
| { |
| "epoch": 2.2791817087845967, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.5413593802954115e-05, |
| "loss": 0.4887, |
| "step": 9470 |
| }, |
| { |
| "epoch": 2.2803850782190134, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.5380909921626446e-05, |
| "loss": 0.4792, |
| "step": 9475 |
| }, |
| { |
| "epoch": 2.2815884476534296, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.534826982087293e-05, |
| "loss": 0.482, |
| "step": 9480 |
| }, |
| { |
| "epoch": 2.282791817087846, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.531567355281892e-05, |
| "loss": 0.4671, |
| "step": 9485 |
| }, |
| { |
| "epoch": 2.2839951865222625, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.528312116951975e-05, |
| "loss": 0.4872, |
| "step": 9490 |
| }, |
| { |
| "epoch": 2.2851985559566788, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.525061272296069e-05, |
| "loss": 0.514, |
| "step": 9495 |
| }, |
| { |
| "epoch": 2.286401925391095, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.521814826505684e-05, |
| "loss": 0.4797, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.286401925391095, |
| "eval_loss": 0.4312799870967865, |
| "eval_runtime": 2.6559, |
| "eval_samples_per_second": 75.304, |
| "eval_steps_per_second": 75.304, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.2876052948255117, |
| "grad_norm": 1.671875, |
| "learning_rate": 3.518572784765304e-05, |
| "loss": 0.4765, |
| "step": 9505 |
| }, |
| { |
| "epoch": 2.288808664259928, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.515335152252381e-05, |
| "loss": 0.4641, |
| "step": 9510 |
| }, |
| { |
| "epoch": 2.290012033694344, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.5121019341373234e-05, |
| "loss": 0.5206, |
| "step": 9515 |
| }, |
| { |
| "epoch": 2.2912154031287604, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.508873135583495e-05, |
| "loss": 0.4597, |
| "step": 9520 |
| }, |
| { |
| "epoch": 2.292418772563177, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.505648761747194e-05, |
| "loss": 0.4892, |
| "step": 9525 |
| }, |
| { |
| "epoch": 2.2936221419975933, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.5024288177776606e-05, |
| "loss": 0.5179, |
| "step": 9530 |
| }, |
| { |
| "epoch": 2.2948255114320095, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.4992133088170536e-05, |
| "loss": 0.4698, |
| "step": 9535 |
| }, |
| { |
| "epoch": 2.2960288808664258, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.496002240000455e-05, |
| "loss": 0.5085, |
| "step": 9540 |
| }, |
| { |
| "epoch": 2.2972322503008424, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.492795616455852e-05, |
| "loss": 0.4844, |
| "step": 9545 |
| }, |
| { |
| "epoch": 2.2984356197352587, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.4895934433041344e-05, |
| "loss": 0.4827, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.299638989169675, |
| "grad_norm": 1.671875, |
| "learning_rate": 3.4863957256590857e-05, |
| "loss": 0.4983, |
| "step": 9555 |
| }, |
| { |
| "epoch": 2.3008423586040916, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.4832024686273724e-05, |
| "loss": 0.4688, |
| "step": 9560 |
| }, |
| { |
| "epoch": 2.302045728038508, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.4800136773085376e-05, |
| "loss": 0.4444, |
| "step": 9565 |
| }, |
| { |
| "epoch": 2.303249097472924, |
| "grad_norm": 1.765625, |
| "learning_rate": 3.476829356794993e-05, |
| "loss": 0.4913, |
| "step": 9570 |
| }, |
| { |
| "epoch": 2.3044524669073407, |
| "grad_norm": 1.515625, |
| "learning_rate": 3.473649512172013e-05, |
| "loss": 0.5131, |
| "step": 9575 |
| }, |
| { |
| "epoch": 2.305655836341757, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.4704741485177205e-05, |
| "loss": 0.455, |
| "step": 9580 |
| }, |
| { |
| "epoch": 2.306859205776173, |
| "grad_norm": 2.484375, |
| "learning_rate": 3.4673032709030835e-05, |
| "loss": 0.4867, |
| "step": 9585 |
| }, |
| { |
| "epoch": 2.30806257521059, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.4641368843919074e-05, |
| "loss": 0.4971, |
| "step": 9590 |
| }, |
| { |
| "epoch": 2.309265944645006, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.4609749940408235e-05, |
| "loss": 0.4625, |
| "step": 9595 |
| }, |
| { |
| "epoch": 2.3104693140794224, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.4578176048992855e-05, |
| "loss": 0.5039, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.3116726835138386, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.454664722009554e-05, |
| "loss": 0.4597, |
| "step": 9605 |
| }, |
| { |
| "epoch": 2.3128760529482553, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.4515163504067006e-05, |
| "loss": 0.4524, |
| "step": 9610 |
| }, |
| { |
| "epoch": 2.3140794223826715, |
| "grad_norm": 1.7734375, |
| "learning_rate": 3.4483724951185835e-05, |
| "loss": 0.4841, |
| "step": 9615 |
| }, |
| { |
| "epoch": 2.3152827918170877, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.445233161165858e-05, |
| "loss": 0.4709, |
| "step": 9620 |
| }, |
| { |
| "epoch": 2.316486161251504, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.442098353561948e-05, |
| "loss": 0.4543, |
| "step": 9625 |
| }, |
| { |
| "epoch": 2.3176895306859207, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.4389680773130616e-05, |
| "loss": 0.4861, |
| "step": 9630 |
| }, |
| { |
| "epoch": 2.318892900120337, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.435842337418159e-05, |
| "loss": 0.4898, |
| "step": 9635 |
| }, |
| { |
| "epoch": 2.320096269554753, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.432721138868963e-05, |
| "loss": 0.4986, |
| "step": 9640 |
| }, |
| { |
| "epoch": 2.32129963898917, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.429604486649943e-05, |
| "loss": 0.482, |
| "step": 9645 |
| }, |
| { |
| "epoch": 2.322503008423586, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.426492385738303e-05, |
| "loss": 0.4934, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.3237063778580023, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.423384841103988e-05, |
| "loss": 0.4967, |
| "step": 9655 |
| }, |
| { |
| "epoch": 2.324909747292419, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.4202818577096564e-05, |
| "loss": 0.4717, |
| "step": 9660 |
| }, |
| { |
| "epoch": 2.326113116726835, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.41718344051069e-05, |
| "loss": 0.4812, |
| "step": 9665 |
| }, |
| { |
| "epoch": 2.3273164861612514, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.414089594455175e-05, |
| "loss": 0.4733, |
| "step": 9670 |
| }, |
| { |
| "epoch": 2.328519855595668, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.411000324483899e-05, |
| "loss": 0.4884, |
| "step": 9675 |
| }, |
| { |
| "epoch": 2.3297232250300843, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.407915635530341e-05, |
| "loss": 0.4813, |
| "step": 9680 |
| }, |
| { |
| "epoch": 2.3309265944645006, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.404835532520663e-05, |
| "loss": 0.4684, |
| "step": 9685 |
| }, |
| { |
| "epoch": 2.332129963898917, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.401760020373708e-05, |
| "loss": 0.4846, |
| "step": 9690 |
| }, |
| { |
| "epoch": 2.3333333333333335, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.3986891040009825e-05, |
| "loss": 0.4763, |
| "step": 9695 |
| }, |
| { |
| "epoch": 2.3345367027677497, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.3956227883066555e-05, |
| "loss": 0.4765, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.335740072202166, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.392561078187548e-05, |
| "loss": 0.4773, |
| "step": 9705 |
| }, |
| { |
| "epoch": 2.336943441636582, |
| "grad_norm": 1.6484375, |
| "learning_rate": 3.389503978533128e-05, |
| "loss": 0.4516, |
| "step": 9710 |
| }, |
| { |
| "epoch": 2.338146811070999, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.3864514942254994e-05, |
| "loss": 0.4536, |
| "step": 9715 |
| }, |
| { |
| "epoch": 2.339350180505415, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.383403630139397e-05, |
| "loss": 0.4585, |
| "step": 9720 |
| }, |
| { |
| "epoch": 2.3405535499398313, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.3803603911421726e-05, |
| "loss": 0.4775, |
| "step": 9725 |
| }, |
| { |
| "epoch": 2.341756919374248, |
| "grad_norm": 1.5546875, |
| "learning_rate": 3.377321782093798e-05, |
| "loss": 0.4656, |
| "step": 9730 |
| }, |
| { |
| "epoch": 2.3429602888086642, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.374287807846848e-05, |
| "loss": 0.4747, |
| "step": 9735 |
| }, |
| { |
| "epoch": 2.3441636582430805, |
| "grad_norm": 1.875, |
| "learning_rate": 3.371258473246495e-05, |
| "loss": 0.4847, |
| "step": 9740 |
| }, |
| { |
| "epoch": 2.345367027677497, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.368233783130504e-05, |
| "loss": 0.4841, |
| "step": 9745 |
| }, |
| { |
| "epoch": 2.3465703971119134, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.365213742329222e-05, |
| "loss": 0.5139, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.3477737665463296, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.3621983556655705e-05, |
| "loss": 0.4983, |
| "step": 9755 |
| }, |
| { |
| "epoch": 2.3489771359807463, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.3591876279550384e-05, |
| "loss": 0.5193, |
| "step": 9760 |
| }, |
| { |
| "epoch": 2.3501805054151625, |
| "grad_norm": 1.8125, |
| "learning_rate": 3.3561815640056776e-05, |
| "loss": 0.4765, |
| "step": 9765 |
| }, |
| { |
| "epoch": 2.351383874849579, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.353180168618087e-05, |
| "loss": 0.5217, |
| "step": 9770 |
| }, |
| { |
| "epoch": 2.352587244283995, |
| "grad_norm": 1.515625, |
| "learning_rate": 3.3501834465854126e-05, |
| "loss": 0.4547, |
| "step": 9775 |
| }, |
| { |
| "epoch": 2.3537906137184117, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.34719140269334e-05, |
| "loss": 0.458, |
| "step": 9780 |
| }, |
| { |
| "epoch": 2.354993983152828, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.344204041720077e-05, |
| "loss": 0.4832, |
| "step": 9785 |
| }, |
| { |
| "epoch": 2.356197352587244, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.341221368436361e-05, |
| "loss": 0.4849, |
| "step": 9790 |
| }, |
| { |
| "epoch": 2.357400722021661, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.3382433876054354e-05, |
| "loss": 0.4725, |
| "step": 9795 |
| }, |
| { |
| "epoch": 2.358604091456077, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.335270103983057e-05, |
| "loss": 0.456, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.3598074608904933, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.332301522317476e-05, |
| "loss": 0.499, |
| "step": 9805 |
| }, |
| { |
| "epoch": 2.3610108303249095, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.3293376473494355e-05, |
| "loss": 0.4877, |
| "step": 9810 |
| }, |
| { |
| "epoch": 2.3622141997593262, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.326378483812163e-05, |
| "loss": 0.4875, |
| "step": 9815 |
| }, |
| { |
| "epoch": 2.3634175691937425, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.323424036431362e-05, |
| "loss": 0.4932, |
| "step": 9820 |
| }, |
| { |
| "epoch": 2.3646209386281587, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.3204743099252036e-05, |
| "loss": 0.5022, |
| "step": 9825 |
| }, |
| { |
| "epoch": 2.3658243080625754, |
| "grad_norm": 1.5234375, |
| "learning_rate": 3.3175293090043175e-05, |
| "loss": 0.5026, |
| "step": 9830 |
| }, |
| { |
| "epoch": 2.3670276774969916, |
| "grad_norm": 1.5703125, |
| "learning_rate": 3.314589038371793e-05, |
| "loss": 0.5034, |
| "step": 9835 |
| }, |
| { |
| "epoch": 2.368231046931408, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.3116535027231584e-05, |
| "loss": 0.4961, |
| "step": 9840 |
| }, |
| { |
| "epoch": 2.3694344163658245, |
| "grad_norm": 1.625, |
| "learning_rate": 3.3087227067463855e-05, |
| "loss": 0.4831, |
| "step": 9845 |
| }, |
| { |
| "epoch": 2.3706377858002408, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.305796655121872e-05, |
| "loss": 0.5074, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.371841155234657, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.302875352522446e-05, |
| "loss": 0.5183, |
| "step": 9855 |
| }, |
| { |
| "epoch": 2.3730445246690732, |
| "grad_norm": 1.5546875, |
| "learning_rate": 3.299958803613344e-05, |
| "loss": 0.4634, |
| "step": 9860 |
| }, |
| { |
| "epoch": 2.37424789410349, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.2970470130522156e-05, |
| "loss": 0.4769, |
| "step": 9865 |
| }, |
| { |
| "epoch": 2.375451263537906, |
| "grad_norm": 1.84375, |
| "learning_rate": 3.2941399854891116e-05, |
| "loss": 0.4786, |
| "step": 9870 |
| }, |
| { |
| "epoch": 2.3766546329723224, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.291237725566472e-05, |
| "loss": 0.5156, |
| "step": 9875 |
| }, |
| { |
| "epoch": 2.377858002406739, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.2883402379191304e-05, |
| "loss": 0.5056, |
| "step": 9880 |
| }, |
| { |
| "epoch": 2.3790613718411553, |
| "grad_norm": 1.671875, |
| "learning_rate": 3.28544752717429e-05, |
| "loss": 0.4833, |
| "step": 9885 |
| }, |
| { |
| "epoch": 2.3802647412755715, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.2825595979515344e-05, |
| "loss": 0.4869, |
| "step": 9890 |
| }, |
| { |
| "epoch": 2.3814681107099878, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.279676454862805e-05, |
| "loss": 0.4726, |
| "step": 9895 |
| }, |
| { |
| "epoch": 2.3826714801444044, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.2767981025124024e-05, |
| "loss": 0.4983, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.3838748495788207, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.2739245454969764e-05, |
| "loss": 0.4839, |
| "step": 9905 |
| }, |
| { |
| "epoch": 2.385078219013237, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.271055788405518e-05, |
| "loss": 0.5325, |
| "step": 9910 |
| }, |
| { |
| "epoch": 2.3862815884476536, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.2681918358193556e-05, |
| "loss": 0.4825, |
| "step": 9915 |
| }, |
| { |
| "epoch": 2.38748495788207, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.265332692312139e-05, |
| "loss": 0.4817, |
| "step": 9920 |
| }, |
| { |
| "epoch": 2.388688327316486, |
| "grad_norm": 1.5546875, |
| "learning_rate": 3.262478362449845e-05, |
| "loss": 0.4546, |
| "step": 9925 |
| }, |
| { |
| "epoch": 2.3898916967509027, |
| "grad_norm": 1.7734375, |
| "learning_rate": 3.2596288507907585e-05, |
| "loss": 0.5006, |
| "step": 9930 |
| }, |
| { |
| "epoch": 2.391095066185319, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.2567841618854715e-05, |
| "loss": 0.4897, |
| "step": 9935 |
| }, |
| { |
| "epoch": 2.392298435619735, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.2539443002768734e-05, |
| "loss": 0.4854, |
| "step": 9940 |
| }, |
| { |
| "epoch": 2.3935018050541514, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.2511092705001466e-05, |
| "loss": 0.4524, |
| "step": 9945 |
| }, |
| { |
| "epoch": 2.394705174488568, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.248279077082754e-05, |
| "loss": 0.4999, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.3959085439229844, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.245453724544437e-05, |
| "loss": 0.5016, |
| "step": 9955 |
| }, |
| { |
| "epoch": 2.3971119133574006, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.2426332173972076e-05, |
| "loss": 0.4637, |
| "step": 9960 |
| }, |
| { |
| "epoch": 2.3983152827918173, |
| "grad_norm": 1.5078125, |
| "learning_rate": 3.239817560145335e-05, |
| "loss": 0.4696, |
| "step": 9965 |
| }, |
| { |
| "epoch": 2.3995186522262335, |
| "grad_norm": 1.5390625, |
| "learning_rate": 3.2370067572853505e-05, |
| "loss": 0.4652, |
| "step": 9970 |
| }, |
| { |
| "epoch": 2.4007220216606497, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.234200813306026e-05, |
| "loss": 0.5054, |
| "step": 9975 |
| }, |
| { |
| "epoch": 2.401925391095066, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.2313997326883785e-05, |
| "loss": 0.4773, |
| "step": 9980 |
| }, |
| { |
| "epoch": 2.4031287605294827, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.228603519905654e-05, |
| "loss": 0.5104, |
| "step": 9985 |
| }, |
| { |
| "epoch": 2.404332129963899, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.225812179423331e-05, |
| "loss": 0.4805, |
| "step": 9990 |
| }, |
| { |
| "epoch": 2.405535499398315, |
| "grad_norm": 1.7734375, |
| "learning_rate": 3.2230257156991e-05, |
| "loss": 0.468, |
| "step": 9995 |
| }, |
| { |
| "epoch": 2.406738868832732, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.22024413318287e-05, |
| "loss": 0.4634, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.406738868832732, |
| "eval_loss": 0.42871612310409546, |
| "eval_runtime": 2.6519, |
| "eval_samples_per_second": 75.418, |
| "eval_steps_per_second": 75.418, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.407942238267148, |
| "grad_norm": 1.5546875, |
| "learning_rate": 3.2174674363167515e-05, |
| "loss": 0.4666, |
| "step": 10005 |
| }, |
| { |
| "epoch": 2.4091456077015643, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.2146956295350504e-05, |
| "loss": 0.5036, |
| "step": 10010 |
| }, |
| { |
| "epoch": 2.410348977135981, |
| "grad_norm": 1.6484375, |
| "learning_rate": 3.2119287172642696e-05, |
| "loss": 0.4966, |
| "step": 10015 |
| }, |
| { |
| "epoch": 2.411552346570397, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.2091667039230886e-05, |
| "loss": 0.4953, |
| "step": 10020 |
| }, |
| { |
| "epoch": 2.4127557160048134, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.206409593922369e-05, |
| "loss": 0.48, |
| "step": 10025 |
| }, |
| { |
| "epoch": 2.4139590854392297, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.203657391665138e-05, |
| "loss": 0.4826, |
| "step": 10030 |
| }, |
| { |
| "epoch": 2.4151624548736463, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.200910101546588e-05, |
| "loss": 0.4572, |
| "step": 10035 |
| }, |
| { |
| "epoch": 2.4163658243080626, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.1981677279540655e-05, |
| "loss": 0.48, |
| "step": 10040 |
| }, |
| { |
| "epoch": 2.417569193742479, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.195430275267066e-05, |
| "loss": 0.4708, |
| "step": 10045 |
| }, |
| { |
| "epoch": 2.4187725631768955, |
| "grad_norm": 1.6484375, |
| "learning_rate": 3.1926977478572265e-05, |
| "loss": 0.4647, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.4199759326113117, |
| "grad_norm": 1.7734375, |
| "learning_rate": 3.189970150088317e-05, |
| "loss": 0.4696, |
| "step": 10055 |
| }, |
| { |
| "epoch": 2.421179302045728, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.1872474863162365e-05, |
| "loss": 0.4835, |
| "step": 10060 |
| }, |
| { |
| "epoch": 2.422382671480144, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.1845297608890044e-05, |
| "loss": 0.4729, |
| "step": 10065 |
| }, |
| { |
| "epoch": 2.423586040914561, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.1818169781467544e-05, |
| "loss": 0.474, |
| "step": 10070 |
| }, |
| { |
| "epoch": 2.424789410348977, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.1791091424217244e-05, |
| "loss": 0.4806, |
| "step": 10075 |
| }, |
| { |
| "epoch": 2.4259927797833933, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.176406258038254e-05, |
| "loss": 0.4609, |
| "step": 10080 |
| }, |
| { |
| "epoch": 2.42719614921781, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.1737083293127764e-05, |
| "loss": 0.482, |
| "step": 10085 |
| }, |
| { |
| "epoch": 2.4283995186522263, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.171015360553807e-05, |
| "loss": 0.4992, |
| "step": 10090 |
| }, |
| { |
| "epoch": 2.4296028880866425, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.1683273560619464e-05, |
| "loss": 0.4687, |
| "step": 10095 |
| }, |
| { |
| "epoch": 2.430806257521059, |
| "grad_norm": 1.859375, |
| "learning_rate": 3.165644320129861e-05, |
| "loss": 0.4969, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.4320096269554754, |
| "grad_norm": 1.6328125, |
| "learning_rate": 3.162966257042287e-05, |
| "loss": 0.4818, |
| "step": 10105 |
| }, |
| { |
| "epoch": 2.4332129963898916, |
| "grad_norm": 1.5546875, |
| "learning_rate": 3.160293171076016e-05, |
| "loss": 0.4966, |
| "step": 10110 |
| }, |
| { |
| "epoch": 2.434416365824308, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.157625066499894e-05, |
| "loss": 0.4543, |
| "step": 10115 |
| }, |
| { |
| "epoch": 2.4356197352587245, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.1549619475748115e-05, |
| "loss": 0.4877, |
| "step": 10120 |
| }, |
| { |
| "epoch": 2.436823104693141, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.1523038185536936e-05, |
| "loss": 0.4581, |
| "step": 10125 |
| }, |
| { |
| "epoch": 2.438026474127557, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.1496506836815015e-05, |
| "loss": 0.4877, |
| "step": 10130 |
| }, |
| { |
| "epoch": 2.4392298435619737, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.147002547195218e-05, |
| "loss": 0.5074, |
| "step": 10135 |
| }, |
| { |
| "epoch": 2.44043321299639, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.144359413323845e-05, |
| "loss": 0.5168, |
| "step": 10140 |
| }, |
| { |
| "epoch": 2.441636582430806, |
| "grad_norm": 1.625, |
| "learning_rate": 3.1417212862883916e-05, |
| "loss": 0.4843, |
| "step": 10145 |
| }, |
| { |
| "epoch": 2.4428399518652224, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.139088170301878e-05, |
| "loss": 0.4809, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.444043321299639, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.136460069569316e-05, |
| "loss": 0.4872, |
| "step": 10155 |
| }, |
| { |
| "epoch": 2.4452466907340553, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.133836988287711e-05, |
| "loss": 0.4707, |
| "step": 10160 |
| }, |
| { |
| "epoch": 2.4464500601684716, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.1312189306460496e-05, |
| "loss": 0.4945, |
| "step": 10165 |
| }, |
| { |
| "epoch": 2.4476534296028882, |
| "grad_norm": 1.515625, |
| "learning_rate": 3.128605900825301e-05, |
| "loss": 0.4746, |
| "step": 10170 |
| }, |
| { |
| "epoch": 2.4488567990373045, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.1259979029984e-05, |
| "loss": 0.4841, |
| "step": 10175 |
| }, |
| { |
| "epoch": 2.4500601684717207, |
| "grad_norm": 1.75, |
| "learning_rate": 3.1233949413302476e-05, |
| "loss": 0.4884, |
| "step": 10180 |
| }, |
| { |
| "epoch": 2.4512635379061374, |
| "grad_norm": 1.7734375, |
| "learning_rate": 3.120797019977703e-05, |
| "loss": 0.5056, |
| "step": 10185 |
| }, |
| { |
| "epoch": 2.4524669073405536, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.118204143089574e-05, |
| "loss": 0.484, |
| "step": 10190 |
| }, |
| { |
| "epoch": 2.45367027677497, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.1156163148066154e-05, |
| "loss": 0.4626, |
| "step": 10195 |
| }, |
| { |
| "epoch": 2.4548736462093865, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.1130335392615145e-05, |
| "loss": 0.5227, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.4560770156438028, |
| "grad_norm": 1.671875, |
| "learning_rate": 3.110455820578897e-05, |
| "loss": 0.529, |
| "step": 10205 |
| }, |
| { |
| "epoch": 2.457280385078219, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.107883162875305e-05, |
| "loss": 0.486, |
| "step": 10210 |
| }, |
| { |
| "epoch": 2.4584837545126352, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.1053155702592044e-05, |
| "loss": 0.4785, |
| "step": 10215 |
| }, |
| { |
| "epoch": 2.459687123947052, |
| "grad_norm": 1.765625, |
| "learning_rate": 3.1027530468309706e-05, |
| "loss": 0.4844, |
| "step": 10220 |
| }, |
| { |
| "epoch": 2.460890493381468, |
| "grad_norm": 1.625, |
| "learning_rate": 3.1001955966828806e-05, |
| "loss": 0.4732, |
| "step": 10225 |
| }, |
| { |
| "epoch": 2.4620938628158844, |
| "grad_norm": 1.515625, |
| "learning_rate": 3.097643223899114e-05, |
| "loss": 0.474, |
| "step": 10230 |
| }, |
| { |
| "epoch": 2.4632972322503006, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.095095932555738e-05, |
| "loss": 0.4966, |
| "step": 10235 |
| }, |
| { |
| "epoch": 2.4645006016847173, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.0925537267207096e-05, |
| "loss": 0.4779, |
| "step": 10240 |
| }, |
| { |
| "epoch": 2.4657039711191335, |
| "grad_norm": 1.6484375, |
| "learning_rate": 3.090016610453859e-05, |
| "loss": 0.4876, |
| "step": 10245 |
| }, |
| { |
| "epoch": 2.4669073405535498, |
| "grad_norm": 1.75, |
| "learning_rate": 3.0874845878068933e-05, |
| "loss": 0.4808, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.4681107099879664, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.084957662823381e-05, |
| "loss": 0.4721, |
| "step": 10255 |
| }, |
| { |
| "epoch": 2.4693140794223827, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.082435839538753e-05, |
| "loss": 0.5265, |
| "step": 10260 |
| }, |
| { |
| "epoch": 2.470517448856799, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.079919121980291e-05, |
| "loss": 0.46, |
| "step": 10265 |
| }, |
| { |
| "epoch": 2.4717208182912156, |
| "grad_norm": 1.671875, |
| "learning_rate": 3.0774075141671253e-05, |
| "loss": 0.4614, |
| "step": 10270 |
| }, |
| { |
| "epoch": 2.472924187725632, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.074901020110223e-05, |
| "loss": 0.4906, |
| "step": 10275 |
| }, |
| { |
| "epoch": 2.474127557160048, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.0723996438123863e-05, |
| "loss": 0.4863, |
| "step": 10280 |
| }, |
| { |
| "epoch": 2.4753309265944647, |
| "grad_norm": 1.5703125, |
| "learning_rate": 3.0699033892682455e-05, |
| "loss": 0.4468, |
| "step": 10285 |
| }, |
| { |
| "epoch": 2.476534296028881, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.0674122604642494e-05, |
| "loss": 0.4698, |
| "step": 10290 |
| }, |
| { |
| "epoch": 2.477737665463297, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.064926261378663e-05, |
| "loss": 0.4576, |
| "step": 10295 |
| }, |
| { |
| "epoch": 2.4789410348977134, |
| "grad_norm": 1.8984375, |
| "learning_rate": 3.062445395981557e-05, |
| "loss": 0.472, |
| "step": 10300 |
| }, |
| { |
| "epoch": 2.48014440433213, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.0599696682348065e-05, |
| "loss": 0.4512, |
| "step": 10305 |
| }, |
| { |
| "epoch": 2.4813477737665464, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.057499082092081e-05, |
| "loss": 0.4875, |
| "step": 10310 |
| }, |
| { |
| "epoch": 2.4825511432009626, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.055033641498837e-05, |
| "loss": 0.4799, |
| "step": 10315 |
| }, |
| { |
| "epoch": 2.483754512635379, |
| "grad_norm": 1.75, |
| "learning_rate": 3.052573350392318e-05, |
| "loss": 0.5623, |
| "step": 10320 |
| }, |
| { |
| "epoch": 2.4849578820697955, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.050118212701538e-05, |
| "loss": 0.5132, |
| "step": 10325 |
| }, |
| { |
| "epoch": 2.4861612515042117, |
| "grad_norm": 1.546875, |
| "learning_rate": 3.0476682323472873e-05, |
| "loss": 0.4654, |
| "step": 10330 |
| }, |
| { |
| "epoch": 2.487364620938628, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.0452234132421153e-05, |
| "loss": 0.4787, |
| "step": 10335 |
| }, |
| { |
| "epoch": 2.4885679903730447, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.0427837592903322e-05, |
| "loss": 0.4643, |
| "step": 10340 |
| }, |
| { |
| "epoch": 2.489771359807461, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.0403492743879963e-05, |
| "loss": 0.4728, |
| "step": 10345 |
| }, |
| { |
| "epoch": 2.490974729241877, |
| "grad_norm": 1.65625, |
| "learning_rate": 3.037919962422914e-05, |
| "loss": 0.4616, |
| "step": 10350 |
| }, |
| { |
| "epoch": 2.492178098676294, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.0354958272746315e-05, |
| "loss": 0.4936, |
| "step": 10355 |
| }, |
| { |
| "epoch": 2.49338146811071, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.0330768728144233e-05, |
| "loss": 0.4876, |
| "step": 10360 |
| }, |
| { |
| "epoch": 2.4945848375451263, |
| "grad_norm": 1.5703125, |
| "learning_rate": 3.0306631029052953e-05, |
| "loss": 0.4492, |
| "step": 10365 |
| }, |
| { |
| "epoch": 2.495788206979543, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.0282545214019703e-05, |
| "loss": 0.4701, |
| "step": 10370 |
| }, |
| { |
| "epoch": 2.496991576413959, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.0258511321508874e-05, |
| "loss": 0.4946, |
| "step": 10375 |
| }, |
| { |
| "epoch": 2.4981949458483754, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.0234529389901923e-05, |
| "loss": 0.4841, |
| "step": 10380 |
| }, |
| { |
| "epoch": 2.4993983152827917, |
| "grad_norm": 1.75, |
| "learning_rate": 3.0210599457497347e-05, |
| "loss": 0.4744, |
| "step": 10385 |
| }, |
| { |
| "epoch": 2.5006016847172083, |
| "grad_norm": 1.625, |
| "learning_rate": 3.0186721562510583e-05, |
| "loss": 0.4639, |
| "step": 10390 |
| }, |
| { |
| "epoch": 2.5018050541516246, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.016289574307397e-05, |
| "loss": 0.482, |
| "step": 10395 |
| }, |
| { |
| "epoch": 2.503008423586041, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.01391220372367e-05, |
| "loss": 0.4678, |
| "step": 10400 |
| }, |
| { |
| "epoch": 2.504211793020457, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.011540048296471e-05, |
| "loss": 0.4825, |
| "step": 10405 |
| }, |
| { |
| "epoch": 2.5054151624548737, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.009173111814069e-05, |
| "loss": 0.4693, |
| "step": 10410 |
| }, |
| { |
| "epoch": 2.50661853188929, |
| "grad_norm": 1.6015625, |
| "learning_rate": 3.006811398056394e-05, |
| "loss": 0.4785, |
| "step": 10415 |
| }, |
| { |
| "epoch": 2.507821901323706, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.0044549107950403e-05, |
| "loss": 0.4637, |
| "step": 10420 |
| }, |
| { |
| "epoch": 2.509025270758123, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.0021036537932523e-05, |
| "loss": 0.5127, |
| "step": 10425 |
| }, |
| { |
| "epoch": 2.510228640192539, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.9997576308059222e-05, |
| "loss": 0.4805, |
| "step": 10430 |
| }, |
| { |
| "epoch": 2.5114320096269553, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.997416845579585e-05, |
| "loss": 0.4855, |
| "step": 10435 |
| }, |
| { |
| "epoch": 2.512635379061372, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.9950813018524096e-05, |
| "loss": 0.4735, |
| "step": 10440 |
| }, |
| { |
| "epoch": 2.5138387484957883, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.992751003354196e-05, |
| "loss": 0.4839, |
| "step": 10445 |
| }, |
| { |
| "epoch": 2.5150421179302045, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.9904259538063662e-05, |
| "loss": 0.4965, |
| "step": 10450 |
| }, |
| { |
| "epoch": 2.516245487364621, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.9881061569219613e-05, |
| "loss": 0.499, |
| "step": 10455 |
| }, |
| { |
| "epoch": 2.5174488567990374, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.9857916164056317e-05, |
| "loss": 0.4927, |
| "step": 10460 |
| }, |
| { |
| "epoch": 2.5186522262334536, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.9834823359536362e-05, |
| "loss": 0.4586, |
| "step": 10465 |
| }, |
| { |
| "epoch": 2.51985559566787, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.981178319253831e-05, |
| "loss": 0.4548, |
| "step": 10470 |
| }, |
| { |
| "epoch": 2.5210589651022866, |
| "grad_norm": 1.8671875, |
| "learning_rate": 2.9788795699856688e-05, |
| "loss": 0.4943, |
| "step": 10475 |
| }, |
| { |
| "epoch": 2.522262334536703, |
| "grad_norm": 1.625, |
| "learning_rate": 2.976586091820188e-05, |
| "loss": 0.4588, |
| "step": 10480 |
| }, |
| { |
| "epoch": 2.523465703971119, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.9742978884200102e-05, |
| "loss": 0.5151, |
| "step": 10485 |
| }, |
| { |
| "epoch": 2.5246690734055353, |
| "grad_norm": 1.75, |
| "learning_rate": 2.9720149634393347e-05, |
| "loss": 0.5076, |
| "step": 10490 |
| }, |
| { |
| "epoch": 2.525872442839952, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.969737320523928e-05, |
| "loss": 0.4618, |
| "step": 10495 |
| }, |
| { |
| "epoch": 2.527075812274368, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.967464963311125e-05, |
| "loss": 0.466, |
| "step": 10500 |
| }, |
| { |
| "epoch": 2.527075812274368, |
| "eval_loss": 0.42753535509109497, |
| "eval_runtime": 2.6456, |
| "eval_samples_per_second": 75.598, |
| "eval_steps_per_second": 75.598, |
| "step": 10500 |
| }, |
| { |
| "epoch": 2.5282791817087844, |
| "grad_norm": 1.625, |
| "learning_rate": 2.9651978954298154e-05, |
| "loss": 0.4825, |
| "step": 10505 |
| }, |
| { |
| "epoch": 2.529482551143201, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.9629361205004465e-05, |
| "loss": 0.4836, |
| "step": 10510 |
| }, |
| { |
| "epoch": 2.5306859205776173, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.960679642135009e-05, |
| "loss": 0.5091, |
| "step": 10515 |
| }, |
| { |
| "epoch": 2.5318892900120336, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.9584284639370386e-05, |
| "loss": 0.4936, |
| "step": 10520 |
| }, |
| { |
| "epoch": 2.5330926594464502, |
| "grad_norm": 1.546875, |
| "learning_rate": 2.956182589501603e-05, |
| "loss": 0.471, |
| "step": 10525 |
| }, |
| { |
| "epoch": 2.5342960288808665, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.953942022415303e-05, |
| "loss": 0.4968, |
| "step": 10530 |
| }, |
| { |
| "epoch": 2.5354993983152827, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.9517067662562634e-05, |
| "loss": 0.4462, |
| "step": 10535 |
| }, |
| { |
| "epoch": 2.5367027677496994, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.9494768245941254e-05, |
| "loss": 0.4922, |
| "step": 10540 |
| }, |
| { |
| "epoch": 2.5379061371841156, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.9472522009900453e-05, |
| "loss": 0.5037, |
| "step": 10545 |
| }, |
| { |
| "epoch": 2.539109506618532, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.945032898996685e-05, |
| "loss": 0.4743, |
| "step": 10550 |
| }, |
| { |
| "epoch": 2.5403128760529485, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.942818922158211e-05, |
| "loss": 0.4727, |
| "step": 10555 |
| }, |
| { |
| "epoch": 2.5415162454873648, |
| "grad_norm": 1.625, |
| "learning_rate": 2.9406102740102805e-05, |
| "loss": 0.489, |
| "step": 10560 |
| }, |
| { |
| "epoch": 2.542719614921781, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.938406958080045e-05, |
| "loss": 0.4562, |
| "step": 10565 |
| }, |
| { |
| "epoch": 2.5439229843561972, |
| "grad_norm": 1.75, |
| "learning_rate": 2.93620897788614e-05, |
| "loss": 0.4893, |
| "step": 10570 |
| }, |
| { |
| "epoch": 2.5451263537906135, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.934016336938679e-05, |
| "loss": 0.4842, |
| "step": 10575 |
| }, |
| { |
| "epoch": 2.54632972322503, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.9318290387392497e-05, |
| "loss": 0.4988, |
| "step": 10580 |
| }, |
| { |
| "epoch": 2.5475330926594464, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.929647086780906e-05, |
| "loss": 0.4788, |
| "step": 10585 |
| }, |
| { |
| "epoch": 2.5487364620938626, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.9274704845481668e-05, |
| "loss": 0.5066, |
| "step": 10590 |
| }, |
| { |
| "epoch": 2.5499398315282793, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.925299235517005e-05, |
| "loss": 0.478, |
| "step": 10595 |
| }, |
| { |
| "epoch": 2.5511432009626955, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.923133343154847e-05, |
| "loss": 0.4756, |
| "step": 10600 |
| }, |
| { |
| "epoch": 2.5523465703971118, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.9209728109205602e-05, |
| "loss": 0.4817, |
| "step": 10605 |
| }, |
| { |
| "epoch": 2.5535499398315284, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.9188176422644586e-05, |
| "loss": 0.4795, |
| "step": 10610 |
| }, |
| { |
| "epoch": 2.5547533092659447, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.916667840628286e-05, |
| "loss": 0.4618, |
| "step": 10615 |
| }, |
| { |
| "epoch": 2.555956678700361, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.9145234094452152e-05, |
| "loss": 0.4792, |
| "step": 10620 |
| }, |
| { |
| "epoch": 2.5571600481347776, |
| "grad_norm": 1.8359375, |
| "learning_rate": 2.912384352139846e-05, |
| "loss": 0.5195, |
| "step": 10625 |
| }, |
| { |
| "epoch": 2.558363417569194, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.9102506721281932e-05, |
| "loss": 0.4711, |
| "step": 10630 |
| }, |
| { |
| "epoch": 2.55956678700361, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.908122372817685e-05, |
| "loss": 0.5014, |
| "step": 10635 |
| }, |
| { |
| "epoch": 2.5607701564380267, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.9059994576071566e-05, |
| "loss": 0.4875, |
| "step": 10640 |
| }, |
| { |
| "epoch": 2.561973525872443, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.9038819298868456e-05, |
| "loss": 0.4798, |
| "step": 10645 |
| }, |
| { |
| "epoch": 2.563176895306859, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.9017697930383852e-05, |
| "loss": 0.5266, |
| "step": 10650 |
| }, |
| { |
| "epoch": 2.5643802647412755, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.8996630504348013e-05, |
| "loss": 0.4718, |
| "step": 10655 |
| }, |
| { |
| "epoch": 2.5655836341756917, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.897561705440501e-05, |
| "loss": 0.47, |
| "step": 10660 |
| }, |
| { |
| "epoch": 2.5667870036101084, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.8954657614112776e-05, |
| "loss": 0.4771, |
| "step": 10665 |
| }, |
| { |
| "epoch": 2.5679903730445246, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.893375221694295e-05, |
| "loss": 0.4754, |
| "step": 10670 |
| }, |
| { |
| "epoch": 2.569193742478941, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.891290089628086e-05, |
| "loss": 0.4988, |
| "step": 10675 |
| }, |
| { |
| "epoch": 2.5703971119133575, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.889210368542552e-05, |
| "loss": 0.4776, |
| "step": 10680 |
| }, |
| { |
| "epoch": 2.5716004813477737, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.8871360617589482e-05, |
| "loss": 0.4859, |
| "step": 10685 |
| }, |
| { |
| "epoch": 2.57280385078219, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.8850671725898866e-05, |
| "loss": 0.4934, |
| "step": 10690 |
| }, |
| { |
| "epoch": 2.5740072202166067, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.883003704339326e-05, |
| "loss": 0.5091, |
| "step": 10695 |
| }, |
| { |
| "epoch": 2.575210589651023, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.880945660302568e-05, |
| "loss": 0.5044, |
| "step": 10700 |
| }, |
| { |
| "epoch": 2.576413959085439, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.8788930437662534e-05, |
| "loss": 0.4895, |
| "step": 10705 |
| }, |
| { |
| "epoch": 2.577617328519856, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.8768458580083534e-05, |
| "loss": 0.4935, |
| "step": 10710 |
| }, |
| { |
| "epoch": 2.578820697954272, |
| "grad_norm": 1.8125, |
| "learning_rate": 2.874804106298168e-05, |
| "loss": 0.4641, |
| "step": 10715 |
| }, |
| { |
| "epoch": 2.5800240673886883, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.8727677918963175e-05, |
| "loss": 0.493, |
| "step": 10720 |
| }, |
| { |
| "epoch": 2.581227436823105, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.8707369180547415e-05, |
| "loss": 0.4825, |
| "step": 10725 |
| }, |
| { |
| "epoch": 2.582430806257521, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.868711488016689e-05, |
| "loss": 0.4867, |
| "step": 10730 |
| }, |
| { |
| "epoch": 2.5836341756919374, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.8666915050167162e-05, |
| "loss": 0.4928, |
| "step": 10735 |
| }, |
| { |
| "epoch": 2.5848375451263537, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.8646769722806797e-05, |
| "loss": 0.4806, |
| "step": 10740 |
| }, |
| { |
| "epoch": 2.58604091456077, |
| "grad_norm": 1.890625, |
| "learning_rate": 2.8626678930257337e-05, |
| "loss": 0.5266, |
| "step": 10745 |
| }, |
| { |
| "epoch": 2.5872442839951866, |
| "grad_norm": 1.484375, |
| "learning_rate": 2.8606642704603222e-05, |
| "loss": 0.4748, |
| "step": 10750 |
| }, |
| { |
| "epoch": 2.588447653429603, |
| "grad_norm": 1.625, |
| "learning_rate": 2.8586661077841757e-05, |
| "loss": 0.4731, |
| "step": 10755 |
| }, |
| { |
| "epoch": 2.589651022864019, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.8566734081883053e-05, |
| "loss": 0.4626, |
| "step": 10760 |
| }, |
| { |
| "epoch": 2.5908543922984357, |
| "grad_norm": 1.484375, |
| "learning_rate": 2.8546861748549954e-05, |
| "loss": 0.4794, |
| "step": 10765 |
| }, |
| { |
| "epoch": 2.592057761732852, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.852704410957806e-05, |
| "loss": 0.5056, |
| "step": 10770 |
| }, |
| { |
| "epoch": 2.593261131167268, |
| "grad_norm": 1.9375, |
| "learning_rate": 2.8507281196615577e-05, |
| "loss": 0.481, |
| "step": 10775 |
| }, |
| { |
| "epoch": 2.594464500601685, |
| "grad_norm": 1.625, |
| "learning_rate": 2.8487573041223333e-05, |
| "loss": 0.4982, |
| "step": 10780 |
| }, |
| { |
| "epoch": 2.595667870036101, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.846791967487472e-05, |
| "loss": 0.4746, |
| "step": 10785 |
| }, |
| { |
| "epoch": 2.5968712394705173, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.8448321128955624e-05, |
| "loss": 0.4864, |
| "step": 10790 |
| }, |
| { |
| "epoch": 2.598074608904934, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.842877743476436e-05, |
| "loss": 0.4606, |
| "step": 10795 |
| }, |
| { |
| "epoch": 2.5992779783393503, |
| "grad_norm": 1.5078125, |
| "learning_rate": 2.84092886235117e-05, |
| "loss": 0.4661, |
| "step": 10800 |
| }, |
| { |
| "epoch": 2.6004813477737665, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.8389854726320735e-05, |
| "loss": 0.4523, |
| "step": 10805 |
| }, |
| { |
| "epoch": 2.601684717208183, |
| "grad_norm": 1.625, |
| "learning_rate": 2.8370475774226854e-05, |
| "loss": 0.4783, |
| "step": 10810 |
| }, |
| { |
| "epoch": 2.6028880866425994, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.8351151798177713e-05, |
| "loss": 0.4668, |
| "step": 10815 |
| }, |
| { |
| "epoch": 2.6040914560770156, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.8331882829033165e-05, |
| "loss": 0.4396, |
| "step": 10820 |
| }, |
| { |
| "epoch": 2.605294825511432, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.8312668897565252e-05, |
| "loss": 0.4702, |
| "step": 10825 |
| }, |
| { |
| "epoch": 2.606498194945848, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.8293510034458053e-05, |
| "loss": 0.4539, |
| "step": 10830 |
| }, |
| { |
| "epoch": 2.607701564380265, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.8274406270307774e-05, |
| "loss": 0.4816, |
| "step": 10835 |
| }, |
| { |
| "epoch": 2.608904933814681, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.825535763562259e-05, |
| "loss": 0.5128, |
| "step": 10840 |
| }, |
| { |
| "epoch": 2.6101083032490973, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.823636416082266e-05, |
| "loss": 0.4873, |
| "step": 10845 |
| }, |
| { |
| "epoch": 2.611311672683514, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.8217425876240016e-05, |
| "loss": 0.4858, |
| "step": 10850 |
| }, |
| { |
| "epoch": 2.61251504211793, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.81985428121186e-05, |
| "loss": 0.5068, |
| "step": 10855 |
| }, |
| { |
| "epoch": 2.6137184115523464, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.8179714998614137e-05, |
| "loss": 0.4981, |
| "step": 10860 |
| }, |
| { |
| "epoch": 2.614921780986763, |
| "grad_norm": 1.625, |
| "learning_rate": 2.816094246579413e-05, |
| "loss": 0.4599, |
| "step": 10865 |
| }, |
| { |
| "epoch": 2.6161251504211793, |
| "grad_norm": 1.75, |
| "learning_rate": 2.81422252436378e-05, |
| "loss": 0.5052, |
| "step": 10870 |
| }, |
| { |
| "epoch": 2.6173285198555956, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.8123563362036032e-05, |
| "loss": 0.4965, |
| "step": 10875 |
| }, |
| { |
| "epoch": 2.6185318892900122, |
| "grad_norm": 1.625, |
| "learning_rate": 2.8104956850791343e-05, |
| "loss": 0.4629, |
| "step": 10880 |
| }, |
| { |
| "epoch": 2.6197352587244285, |
| "grad_norm": 1.625, |
| "learning_rate": 2.808640573961781e-05, |
| "loss": 0.4899, |
| "step": 10885 |
| }, |
| { |
| "epoch": 2.6209386281588447, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.8067910058141053e-05, |
| "loss": 0.4821, |
| "step": 10890 |
| }, |
| { |
| "epoch": 2.6221419975932614, |
| "grad_norm": 1.5390625, |
| "learning_rate": 2.804946983589817e-05, |
| "loss": 0.4627, |
| "step": 10895 |
| }, |
| { |
| "epoch": 2.6233453670276776, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.8031085102337683e-05, |
| "loss": 0.4992, |
| "step": 10900 |
| }, |
| { |
| "epoch": 2.624548736462094, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.8012755886819522e-05, |
| "loss": 0.4943, |
| "step": 10905 |
| }, |
| { |
| "epoch": 2.62575210589651, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.799448221861491e-05, |
| "loss": 0.4827, |
| "step": 10910 |
| }, |
| { |
| "epoch": 2.6269554753309263, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.797626412690642e-05, |
| "loss": 0.5237, |
| "step": 10915 |
| }, |
| { |
| "epoch": 2.628158844765343, |
| "grad_norm": 1.8359375, |
| "learning_rate": 2.795810164078783e-05, |
| "loss": 0.4704, |
| "step": 10920 |
| }, |
| { |
| "epoch": 2.6293622141997592, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.793999478926414e-05, |
| "loss": 0.4923, |
| "step": 10925 |
| }, |
| { |
| "epoch": 2.6305655836341755, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.7921943601251483e-05, |
| "loss": 0.4792, |
| "step": 10930 |
| }, |
| { |
| "epoch": 2.631768953068592, |
| "grad_norm": 1.75, |
| "learning_rate": 2.790394810557712e-05, |
| "loss": 0.4865, |
| "step": 10935 |
| }, |
| { |
| "epoch": 2.6329723225030084, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.7886008330979353e-05, |
| "loss": 0.4697, |
| "step": 10940 |
| }, |
| { |
| "epoch": 2.6341756919374246, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.7868124306107516e-05, |
| "loss": 0.4771, |
| "step": 10945 |
| }, |
| { |
| "epoch": 2.6353790613718413, |
| "grad_norm": 1.5078125, |
| "learning_rate": 2.785029605952191e-05, |
| "loss": 0.4955, |
| "step": 10950 |
| }, |
| { |
| "epoch": 2.6365824308062575, |
| "grad_norm": 1.8125, |
| "learning_rate": 2.783252361969374e-05, |
| "loss": 0.4593, |
| "step": 10955 |
| }, |
| { |
| "epoch": 2.6377858002406738, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.781480701500512e-05, |
| "loss": 0.5014, |
| "step": 10960 |
| }, |
| { |
| "epoch": 2.6389891696750905, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.7797146273748968e-05, |
| "loss": 0.4649, |
| "step": 10965 |
| }, |
| { |
| "epoch": 2.6401925391095067, |
| "grad_norm": 1.546875, |
| "learning_rate": 2.7779541424128996e-05, |
| "loss": 0.4714, |
| "step": 10970 |
| }, |
| { |
| "epoch": 2.641395908543923, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.7761992494259687e-05, |
| "loss": 0.507, |
| "step": 10975 |
| }, |
| { |
| "epoch": 2.6425992779783396, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.7744499512166185e-05, |
| "loss": 0.4885, |
| "step": 10980 |
| }, |
| { |
| "epoch": 2.643802647412756, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.77270625057843e-05, |
| "loss": 0.4656, |
| "step": 10985 |
| }, |
| { |
| "epoch": 2.645006016847172, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.7709681502960464e-05, |
| "loss": 0.4905, |
| "step": 10990 |
| }, |
| { |
| "epoch": 2.6462093862815883, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.769235653145165e-05, |
| "loss": 0.4678, |
| "step": 10995 |
| }, |
| { |
| "epoch": 2.6474127557160045, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.7675087618925355e-05, |
| "loss": 0.4929, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.6474127557160045, |
| "eval_loss": 0.42500078678131104, |
| "eval_runtime": 2.6494, |
| "eval_samples_per_second": 75.49, |
| "eval_steps_per_second": 75.49, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.648616125150421, |
| "grad_norm": 1.5390625, |
| "learning_rate": 2.765787479295958e-05, |
| "loss": 0.4801, |
| "step": 11005 |
| }, |
| { |
| "epoch": 2.6498194945848375, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.7640718081042713e-05, |
| "loss": 0.4399, |
| "step": 11010 |
| }, |
| { |
| "epoch": 2.6510228640192537, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.7623617510573552e-05, |
| "loss": 0.4657, |
| "step": 11015 |
| }, |
| { |
| "epoch": 2.6522262334536704, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.7606573108861258e-05, |
| "loss": 0.4789, |
| "step": 11020 |
| }, |
| { |
| "epoch": 2.6534296028880866, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.7589584903125248e-05, |
| "loss": 0.4456, |
| "step": 11025 |
| }, |
| { |
| "epoch": 2.654632972322503, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.757265292049524e-05, |
| "loss": 0.4729, |
| "step": 11030 |
| }, |
| { |
| "epoch": 2.6558363417569195, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.755577718801113e-05, |
| "loss": 0.4684, |
| "step": 11035 |
| }, |
| { |
| "epoch": 2.6570397111913358, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.7538957732623012e-05, |
| "loss": 0.4481, |
| "step": 11040 |
| }, |
| { |
| "epoch": 2.658243080625752, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.7522194581191066e-05, |
| "loss": 0.4906, |
| "step": 11045 |
| }, |
| { |
| "epoch": 2.6594464500601687, |
| "grad_norm": 1.875, |
| "learning_rate": 2.7505487760485622e-05, |
| "loss": 0.513, |
| "step": 11050 |
| }, |
| { |
| "epoch": 2.660649819494585, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.7488837297186986e-05, |
| "loss": 0.4924, |
| "step": 11055 |
| }, |
| { |
| "epoch": 2.661853188929001, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.7472243217885502e-05, |
| "loss": 0.4782, |
| "step": 11060 |
| }, |
| { |
| "epoch": 2.663056558363418, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.745570554908146e-05, |
| "loss": 0.5242, |
| "step": 11065 |
| }, |
| { |
| "epoch": 2.664259927797834, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.7439224317185042e-05, |
| "loss": 0.4725, |
| "step": 11070 |
| }, |
| { |
| "epoch": 2.6654632972322503, |
| "grad_norm": 1.625, |
| "learning_rate": 2.7422799548516356e-05, |
| "loss": 0.5028, |
| "step": 11075 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 1.5234375, |
| "learning_rate": 2.7406431269305273e-05, |
| "loss": 0.4748, |
| "step": 11080 |
| }, |
| { |
| "epoch": 2.667870036101083, |
| "grad_norm": 1.5234375, |
| "learning_rate": 2.7390119505691513e-05, |
| "loss": 0.464, |
| "step": 11085 |
| }, |
| { |
| "epoch": 2.6690734055354994, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.7373864283724494e-05, |
| "loss": 0.4731, |
| "step": 11090 |
| }, |
| { |
| "epoch": 2.6702767749699157, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.735766562936337e-05, |
| "loss": 0.4776, |
| "step": 11095 |
| }, |
| { |
| "epoch": 2.671480144404332, |
| "grad_norm": 1.5234375, |
| "learning_rate": 2.7341523568476932e-05, |
| "loss": 0.4903, |
| "step": 11100 |
| }, |
| { |
| "epoch": 2.6726835138387486, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.7325438126843623e-05, |
| "loss": 0.4393, |
| "step": 11105 |
| }, |
| { |
| "epoch": 2.673886883273165, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.7309409330151448e-05, |
| "loss": 0.4801, |
| "step": 11110 |
| }, |
| { |
| "epoch": 2.675090252707581, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.7293437203997948e-05, |
| "loss": 0.477, |
| "step": 11115 |
| }, |
| { |
| "epoch": 2.6762936221419977, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.7277521773890184e-05, |
| "loss": 0.4975, |
| "step": 11120 |
| }, |
| { |
| "epoch": 2.677496991576414, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.7261663065244633e-05, |
| "loss": 0.4837, |
| "step": 11125 |
| }, |
| { |
| "epoch": 2.67870036101083, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.7245861103387245e-05, |
| "loss": 0.4679, |
| "step": 11130 |
| }, |
| { |
| "epoch": 2.679903730445247, |
| "grad_norm": 1.8828125, |
| "learning_rate": 2.72301159135533e-05, |
| "loss": 0.5291, |
| "step": 11135 |
| }, |
| { |
| "epoch": 2.681107099879663, |
| "grad_norm": 1.546875, |
| "learning_rate": 2.7214427520887443e-05, |
| "loss": 0.4773, |
| "step": 11140 |
| }, |
| { |
| "epoch": 2.6823104693140793, |
| "grad_norm": 1.625, |
| "learning_rate": 2.7198795950443602e-05, |
| "loss": 0.4797, |
| "step": 11145 |
| }, |
| { |
| "epoch": 2.683513838748496, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.7183221227184965e-05, |
| "loss": 0.4914, |
| "step": 11150 |
| }, |
| { |
| "epoch": 2.6847172081829123, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.716770337598394e-05, |
| "loss": 0.4819, |
| "step": 11155 |
| }, |
| { |
| "epoch": 2.6859205776173285, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.715224242162211e-05, |
| "loss": 0.4945, |
| "step": 11160 |
| }, |
| { |
| "epoch": 2.6871239470517447, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.713683838879019e-05, |
| "loss": 0.5188, |
| "step": 11165 |
| }, |
| { |
| "epoch": 2.6883273164861614, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.712149130208799e-05, |
| "loss": 0.4976, |
| "step": 11170 |
| }, |
| { |
| "epoch": 2.6895306859205776, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.7106201186024398e-05, |
| "loss": 0.4624, |
| "step": 11175 |
| }, |
| { |
| "epoch": 2.690734055354994, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.7090968065017305e-05, |
| "loss": 0.515, |
| "step": 11180 |
| }, |
| { |
| "epoch": 2.69193742478941, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.7075791963393585e-05, |
| "loss": 0.4496, |
| "step": 11185 |
| }, |
| { |
| "epoch": 2.693140794223827, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.7060672905389052e-05, |
| "loss": 0.4344, |
| "step": 11190 |
| }, |
| { |
| "epoch": 2.694344163658243, |
| "grad_norm": 1.546875, |
| "learning_rate": 2.704561091514843e-05, |
| "loss": 0.4846, |
| "step": 11195 |
| }, |
| { |
| "epoch": 2.6955475330926593, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.7030606016725304e-05, |
| "loss": 0.4739, |
| "step": 11200 |
| }, |
| { |
| "epoch": 2.696750902527076, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.7015658234082083e-05, |
| "loss": 0.455, |
| "step": 11205 |
| }, |
| { |
| "epoch": 2.697954271961492, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.7000767591089962e-05, |
| "loss": 0.5107, |
| "step": 11210 |
| }, |
| { |
| "epoch": 2.6991576413959084, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.6985934111528894e-05, |
| "loss": 0.4824, |
| "step": 11215 |
| }, |
| { |
| "epoch": 2.700361010830325, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.6971157819087537e-05, |
| "loss": 0.4571, |
| "step": 11220 |
| }, |
| { |
| "epoch": 2.7015643802647413, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.6956438737363216e-05, |
| "loss": 0.4845, |
| "step": 11225 |
| }, |
| { |
| "epoch": 2.7027677496991576, |
| "grad_norm": 1.828125, |
| "learning_rate": 2.6941776889861912e-05, |
| "loss": 0.4821, |
| "step": 11230 |
| }, |
| { |
| "epoch": 2.7039711191335742, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.6927172299998184e-05, |
| "loss": 0.4681, |
| "step": 11235 |
| }, |
| { |
| "epoch": 2.7051744885679905, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.6912624991095162e-05, |
| "loss": 0.4716, |
| "step": 11240 |
| }, |
| { |
| "epoch": 2.7063778580024067, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.6898134986384503e-05, |
| "loss": 0.4784, |
| "step": 11245 |
| }, |
| { |
| "epoch": 2.707581227436823, |
| "grad_norm": 1.53125, |
| "learning_rate": 2.688370230900633e-05, |
| "loss": 0.4818, |
| "step": 11250 |
| }, |
| { |
| "epoch": 2.7087845968712396, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.6869326982009258e-05, |
| "loss": 0.454, |
| "step": 11255 |
| }, |
| { |
| "epoch": 2.709987966305656, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.6855009028350265e-05, |
| "loss": 0.4757, |
| "step": 11260 |
| }, |
| { |
| "epoch": 2.711191335740072, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.6840748470894747e-05, |
| "loss": 0.4624, |
| "step": 11265 |
| }, |
| { |
| "epoch": 2.7123947051744883, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.6826545332416404e-05, |
| "loss": 0.476, |
| "step": 11270 |
| }, |
| { |
| "epoch": 2.713598074608905, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.681239963559728e-05, |
| "loss": 0.4831, |
| "step": 11275 |
| }, |
| { |
| "epoch": 2.7148014440433212, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.679831140302764e-05, |
| "loss": 0.4741, |
| "step": 11280 |
| }, |
| { |
| "epoch": 2.7160048134777375, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.678428065720602e-05, |
| "loss": 0.4468, |
| "step": 11285 |
| }, |
| { |
| "epoch": 2.717208182912154, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.6770307420539135e-05, |
| "loss": 0.4755, |
| "step": 11290 |
| }, |
| { |
| "epoch": 2.7184115523465704, |
| "grad_norm": 1.4765625, |
| "learning_rate": 2.675639171534185e-05, |
| "loss": 0.4792, |
| "step": 11295 |
| }, |
| { |
| "epoch": 2.7196149217809866, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.6742533563837175e-05, |
| "loss": 0.4592, |
| "step": 11300 |
| }, |
| { |
| "epoch": 2.7208182912154033, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.6728732988156193e-05, |
| "loss": 0.4599, |
| "step": 11305 |
| }, |
| { |
| "epoch": 2.7220216606498195, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.6714990010338047e-05, |
| "loss": 0.4659, |
| "step": 11310 |
| }, |
| { |
| "epoch": 2.7232250300842358, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.6701304652329877e-05, |
| "loss": 0.498, |
| "step": 11315 |
| }, |
| { |
| "epoch": 2.7244283995186525, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.6687676935986846e-05, |
| "loss": 0.4826, |
| "step": 11320 |
| }, |
| { |
| "epoch": 2.7256317689530687, |
| "grad_norm": 1.765625, |
| "learning_rate": 2.6674106883072023e-05, |
| "loss": 0.4886, |
| "step": 11325 |
| }, |
| { |
| "epoch": 2.726835138387485, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.666059451525643e-05, |
| "loss": 0.4858, |
| "step": 11330 |
| }, |
| { |
| "epoch": 2.728038507821901, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.664713985411893e-05, |
| "loss": 0.4971, |
| "step": 11335 |
| }, |
| { |
| "epoch": 2.729241877256318, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.6633742921146254e-05, |
| "loss": 0.4797, |
| "step": 11340 |
| }, |
| { |
| "epoch": 2.730445246690734, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.6620403737732942e-05, |
| "loss": 0.4846, |
| "step": 11345 |
| }, |
| { |
| "epoch": 2.7316486161251503, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.660712232518129e-05, |
| "loss": 0.4783, |
| "step": 11350 |
| }, |
| { |
| "epoch": 2.7328519855595665, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.659389870470135e-05, |
| "loss": 0.466, |
| "step": 11355 |
| }, |
| { |
| "epoch": 2.7340553549939832, |
| "grad_norm": 1.8515625, |
| "learning_rate": 2.6580732897410888e-05, |
| "loss": 0.4862, |
| "step": 11360 |
| }, |
| { |
| "epoch": 2.7352587244283995, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.6567624924335334e-05, |
| "loss": 0.4569, |
| "step": 11365 |
| }, |
| { |
| "epoch": 2.7364620938628157, |
| "grad_norm": 1.625, |
| "learning_rate": 2.655457480640776e-05, |
| "loss": 0.4681, |
| "step": 11370 |
| }, |
| { |
| "epoch": 2.7376654632972324, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.654158256446884e-05, |
| "loss": 0.4885, |
| "step": 11375 |
| }, |
| { |
| "epoch": 2.7388688327316486, |
| "grad_norm": 1.59375, |
| "learning_rate": 2.652864821926684e-05, |
| "loss": 0.4908, |
| "step": 11380 |
| }, |
| { |
| "epoch": 2.740072202166065, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.651577179145755e-05, |
| "loss": 0.445, |
| "step": 11385 |
| }, |
| { |
| "epoch": 2.7412755716004815, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.6502953301604253e-05, |
| "loss": 0.4859, |
| "step": 11390 |
| }, |
| { |
| "epoch": 2.7424789410348978, |
| "grad_norm": 1.515625, |
| "learning_rate": 2.6490192770177752e-05, |
| "loss": 0.4743, |
| "step": 11395 |
| }, |
| { |
| "epoch": 2.743682310469314, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.6477490217556253e-05, |
| "loss": 0.4595, |
| "step": 11400 |
| }, |
| { |
| "epoch": 2.7448856799037307, |
| "grad_norm": 1.75, |
| "learning_rate": 2.646484566402537e-05, |
| "loss": 0.4935, |
| "step": 11405 |
| }, |
| { |
| "epoch": 2.746089049338147, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.6452259129778125e-05, |
| "loss": 0.4911, |
| "step": 11410 |
| }, |
| { |
| "epoch": 2.747292418772563, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.643973063491486e-05, |
| "loss": 0.4661, |
| "step": 11415 |
| }, |
| { |
| "epoch": 2.74849578820698, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.6427260199443243e-05, |
| "loss": 0.4701, |
| "step": 11420 |
| }, |
| { |
| "epoch": 2.749699157641396, |
| "grad_norm": 1.75, |
| "learning_rate": 2.6414847843278208e-05, |
| "loss": 0.463, |
| "step": 11425 |
| }, |
| { |
| "epoch": 2.7509025270758123, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.640249358624195e-05, |
| "loss": 0.4793, |
| "step": 11430 |
| }, |
| { |
| "epoch": 2.7521058965102285, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.6390197448063887e-05, |
| "loss": 0.4497, |
| "step": 11435 |
| }, |
| { |
| "epoch": 2.7533092659446448, |
| "grad_norm": 1.765625, |
| "learning_rate": 2.6377959448380596e-05, |
| "loss": 0.4766, |
| "step": 11440 |
| }, |
| { |
| "epoch": 2.7545126353790614, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.636577960673584e-05, |
| "loss": 0.474, |
| "step": 11445 |
| }, |
| { |
| "epoch": 2.7557160048134777, |
| "grad_norm": 1.625, |
| "learning_rate": 2.6353657942580473e-05, |
| "loss": 0.4788, |
| "step": 11450 |
| }, |
| { |
| "epoch": 2.756919374247894, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.634159447527249e-05, |
| "loss": 0.4455, |
| "step": 11455 |
| }, |
| { |
| "epoch": 2.7581227436823106, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.6329589224076887e-05, |
| "loss": 0.4846, |
| "step": 11460 |
| }, |
| { |
| "epoch": 2.759326113116727, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.6317642208165745e-05, |
| "loss": 0.4729, |
| "step": 11465 |
| }, |
| { |
| "epoch": 2.760529482551143, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.6305753446618094e-05, |
| "loss": 0.4455, |
| "step": 11470 |
| }, |
| { |
| "epoch": 2.7617328519855597, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.6293922958419978e-05, |
| "loss": 0.4674, |
| "step": 11475 |
| }, |
| { |
| "epoch": 2.762936221419976, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.628215076246436e-05, |
| "loss": 0.4855, |
| "step": 11480 |
| }, |
| { |
| "epoch": 2.764139590854392, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.62704368775511e-05, |
| "loss": 0.4713, |
| "step": 11485 |
| }, |
| { |
| "epoch": 2.765342960288809, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.6258781322386965e-05, |
| "loss": 0.4653, |
| "step": 11490 |
| }, |
| { |
| "epoch": 2.766546329723225, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.6247184115585548e-05, |
| "loss": 0.4888, |
| "step": 11495 |
| }, |
| { |
| "epoch": 2.7677496991576414, |
| "grad_norm": 1.828125, |
| "learning_rate": 2.6235645275667272e-05, |
| "loss": 0.5032, |
| "step": 11500 |
| }, |
| { |
| "epoch": 2.7677496991576414, |
| "eval_loss": 0.4223162531852722, |
| "eval_runtime": 2.7761, |
| "eval_samples_per_second": 72.042, |
| "eval_steps_per_second": 72.042, |
| "step": 11500 |
| }, |
| { |
| "epoch": 2.768953068592058, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.6224164821059353e-05, |
| "loss": 0.5094, |
| "step": 11505 |
| }, |
| { |
| "epoch": 2.7701564380264743, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.621274277009575e-05, |
| "loss": 0.4785, |
| "step": 11510 |
| }, |
| { |
| "epoch": 2.7713598074608905, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.6201379141017168e-05, |
| "loss": 0.4683, |
| "step": 11515 |
| }, |
| { |
| "epoch": 2.7725631768953067, |
| "grad_norm": 1.625, |
| "learning_rate": 2.6190073951971017e-05, |
| "loss": 0.4826, |
| "step": 11520 |
| }, |
| { |
| "epoch": 2.773766546329723, |
| "grad_norm": 1.75, |
| "learning_rate": 2.6178827221011365e-05, |
| "loss": 0.4548, |
| "step": 11525 |
| }, |
| { |
| "epoch": 2.7749699157641396, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.6167638966098933e-05, |
| "loss": 0.4904, |
| "step": 11530 |
| }, |
| { |
| "epoch": 2.776173285198556, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.6156509205101063e-05, |
| "loss": 0.5256, |
| "step": 11535 |
| }, |
| { |
| "epoch": 2.777376654632972, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.6145437955791663e-05, |
| "loss": 0.5097, |
| "step": 11540 |
| }, |
| { |
| "epoch": 2.778580024067389, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.6134425235851232e-05, |
| "loss": 0.4746, |
| "step": 11545 |
| }, |
| { |
| "epoch": 2.779783393501805, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.612347106286677e-05, |
| "loss": 0.4973, |
| "step": 11550 |
| }, |
| { |
| "epoch": 2.7809867629362213, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.6112575454331793e-05, |
| "loss": 0.4872, |
| "step": 11555 |
| }, |
| { |
| "epoch": 2.782190132370638, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.6101738427646286e-05, |
| "loss": 0.451, |
| "step": 11560 |
| }, |
| { |
| "epoch": 2.783393501805054, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.6090960000116686e-05, |
| "loss": 0.5001, |
| "step": 11565 |
| }, |
| { |
| "epoch": 2.7845968712394704, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.6080240188955846e-05, |
| "loss": 0.4847, |
| "step": 11570 |
| }, |
| { |
| "epoch": 2.785800240673887, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.6069579011283002e-05, |
| "loss": 0.4543, |
| "step": 11575 |
| }, |
| { |
| "epoch": 2.7870036101083033, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.605897648412377e-05, |
| "loss": 0.4936, |
| "step": 11580 |
| }, |
| { |
| "epoch": 2.7882069795427196, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.604843262441009e-05, |
| "loss": 0.4913, |
| "step": 11585 |
| }, |
| { |
| "epoch": 2.7894103489771362, |
| "grad_norm": 1.5546875, |
| "learning_rate": 2.603794744898022e-05, |
| "loss": 0.501, |
| "step": 11590 |
| }, |
| { |
| "epoch": 2.7906137184115525, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.6027520974578697e-05, |
| "loss": 0.5053, |
| "step": 11595 |
| }, |
| { |
| "epoch": 2.7918170878459687, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.601715321785631e-05, |
| "loss": 0.47, |
| "step": 11600 |
| }, |
| { |
| "epoch": 2.793020457280385, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.6006844195370077e-05, |
| "loss": 0.4937, |
| "step": 11605 |
| }, |
| { |
| "epoch": 2.794223826714801, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.599659392358323e-05, |
| "loss": 0.4883, |
| "step": 11610 |
| }, |
| { |
| "epoch": 2.795427196149218, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.5986402418865176e-05, |
| "loss": 0.4875, |
| "step": 11615 |
| }, |
| { |
| "epoch": 2.796630565583634, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.597626969749146e-05, |
| "loss": 0.4724, |
| "step": 11620 |
| }, |
| { |
| "epoch": 2.7978339350180503, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.5966195775643762e-05, |
| "loss": 0.4903, |
| "step": 11625 |
| }, |
| { |
| "epoch": 2.799037304452467, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.5956180669409856e-05, |
| "loss": 0.4942, |
| "step": 11630 |
| }, |
| { |
| "epoch": 2.8002406738868832, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.594622439478359e-05, |
| "loss": 0.4572, |
| "step": 11635 |
| }, |
| { |
| "epoch": 2.8014440433212995, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.5936326967664868e-05, |
| "loss": 0.4629, |
| "step": 11640 |
| }, |
| { |
| "epoch": 2.802647412755716, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.592648840385961e-05, |
| "loss": 0.4768, |
| "step": 11645 |
| }, |
| { |
| "epoch": 2.8038507821901324, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.591670871907972e-05, |
| "loss": 0.491, |
| "step": 11650 |
| }, |
| { |
| "epoch": 2.8050541516245486, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.59069879289431e-05, |
| "loss": 0.4437, |
| "step": 11655 |
| }, |
| { |
| "epoch": 2.8062575210589653, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.5897326048973566e-05, |
| "loss": 0.4843, |
| "step": 11660 |
| }, |
| { |
| "epoch": 2.8074608904933815, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.588772309460089e-05, |
| "loss": 0.4542, |
| "step": 11665 |
| }, |
| { |
| "epoch": 2.808664259927798, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.587817908116073e-05, |
| "loss": 0.4739, |
| "step": 11670 |
| }, |
| { |
| "epoch": 2.8098676293622145, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.586869402389459e-05, |
| "loss": 0.4724, |
| "step": 11675 |
| }, |
| { |
| "epoch": 2.8110709987966307, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.5859267937949874e-05, |
| "loss": 0.4864, |
| "step": 11680 |
| }, |
| { |
| "epoch": 2.812274368231047, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.584990083837976e-05, |
| "loss": 0.4378, |
| "step": 11685 |
| }, |
| { |
| "epoch": 2.813477737665463, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.5840592740143267e-05, |
| "loss": 0.4654, |
| "step": 11690 |
| }, |
| { |
| "epoch": 2.8146811070998794, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.5831343658105154e-05, |
| "loss": 0.4958, |
| "step": 11695 |
| }, |
| { |
| "epoch": 2.815884476534296, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.5822153607035953e-05, |
| "loss": 0.4828, |
| "step": 11700 |
| }, |
| { |
| "epoch": 2.8170878459687123, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.5813022601611946e-05, |
| "loss": 0.4764, |
| "step": 11705 |
| }, |
| { |
| "epoch": 2.8182912154031285, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.5803950656415066e-05, |
| "loss": 0.4836, |
| "step": 11710 |
| }, |
| { |
| "epoch": 2.8194945848375452, |
| "grad_norm": 1.5234375, |
| "learning_rate": 2.5794937785932978e-05, |
| "loss": 0.4733, |
| "step": 11715 |
| }, |
| { |
| "epoch": 2.8206979542719615, |
| "grad_norm": 1.875, |
| "learning_rate": 2.5785984004558992e-05, |
| "loss": 0.4774, |
| "step": 11720 |
| }, |
| { |
| "epoch": 2.8219013237063777, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.5777089326592036e-05, |
| "loss": 0.4972, |
| "step": 11725 |
| }, |
| { |
| "epoch": 2.8231046931407944, |
| "grad_norm": 1.859375, |
| "learning_rate": 2.5768253766236677e-05, |
| "loss": 0.4992, |
| "step": 11730 |
| }, |
| { |
| "epoch": 2.8243080625752106, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.5759477337603057e-05, |
| "loss": 0.4865, |
| "step": 11735 |
| }, |
| { |
| "epoch": 2.825511432009627, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.5750760054706888e-05, |
| "loss": 0.4935, |
| "step": 11740 |
| }, |
| { |
| "epoch": 2.8267148014440435, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.5742101931469435e-05, |
| "loss": 0.488, |
| "step": 11745 |
| }, |
| { |
| "epoch": 2.8279181708784598, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.5733502981717494e-05, |
| "loss": 0.4821, |
| "step": 11750 |
| }, |
| { |
| "epoch": 2.829121540312876, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.572496321918333e-05, |
| "loss": 0.4761, |
| "step": 11755 |
| }, |
| { |
| "epoch": 2.8303249097472927, |
| "grad_norm": 1.625, |
| "learning_rate": 2.571648265750473e-05, |
| "loss": 0.4941, |
| "step": 11760 |
| }, |
| { |
| "epoch": 2.831528279181709, |
| "grad_norm": 1.546875, |
| "learning_rate": 2.5708061310224904e-05, |
| "loss": 0.4951, |
| "step": 11765 |
| }, |
| { |
| "epoch": 2.832731648616125, |
| "grad_norm": 1.5390625, |
| "learning_rate": 2.5699699190792516e-05, |
| "loss": 0.4537, |
| "step": 11770 |
| }, |
| { |
| "epoch": 2.8339350180505414, |
| "grad_norm": 1.625, |
| "learning_rate": 2.5691396312561637e-05, |
| "loss": 0.4732, |
| "step": 11775 |
| }, |
| { |
| "epoch": 2.8351383874849576, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.5683152688791736e-05, |
| "loss": 0.4944, |
| "step": 11780 |
| }, |
| { |
| "epoch": 2.8363417569193743, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.5674968332647663e-05, |
| "loss": 0.467, |
| "step": 11785 |
| }, |
| { |
| "epoch": 2.8375451263537905, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.56668432571996e-05, |
| "loss": 0.4785, |
| "step": 11790 |
| }, |
| { |
| "epoch": 2.8387484957882068, |
| "grad_norm": 1.75, |
| "learning_rate": 2.5658777475423076e-05, |
| "loss": 0.4583, |
| "step": 11795 |
| }, |
| { |
| "epoch": 2.8399518652226234, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.565077100019892e-05, |
| "loss": 0.488, |
| "step": 11800 |
| }, |
| { |
| "epoch": 2.8411552346570397, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.5642823844313247e-05, |
| "loss": 0.5022, |
| "step": 11805 |
| }, |
| { |
| "epoch": 2.842358604091456, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.563493602045745e-05, |
| "loss": 0.4925, |
| "step": 11810 |
| }, |
| { |
| "epoch": 2.8435619735258726, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.5627107541228168e-05, |
| "loss": 0.4912, |
| "step": 11815 |
| }, |
| { |
| "epoch": 2.844765342960289, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.5619338419127275e-05, |
| "loss": 0.4603, |
| "step": 11820 |
| }, |
| { |
| "epoch": 2.845968712394705, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.561162866656183e-05, |
| "loss": 0.4702, |
| "step": 11825 |
| }, |
| { |
| "epoch": 2.8471720818291217, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.5603978295844106e-05, |
| "loss": 0.4548, |
| "step": 11830 |
| }, |
| { |
| "epoch": 2.848375451263538, |
| "grad_norm": 1.78125, |
| "learning_rate": 2.5596387319191524e-05, |
| "loss": 0.5103, |
| "step": 11835 |
| }, |
| { |
| "epoch": 2.849578820697954, |
| "grad_norm": 1.46875, |
| "learning_rate": 2.5588855748726685e-05, |
| "loss": 0.4699, |
| "step": 11840 |
| }, |
| { |
| "epoch": 2.850782190132371, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.5581383596477283e-05, |
| "loss": 0.4721, |
| "step": 11845 |
| }, |
| { |
| "epoch": 2.851985559566787, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.5573970874376144e-05, |
| "loss": 0.4692, |
| "step": 11850 |
| }, |
| { |
| "epoch": 2.8531889290012034, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.5566617594261185e-05, |
| "loss": 0.5246, |
| "step": 11855 |
| }, |
| { |
| "epoch": 2.8543922984356196, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.5559323767875385e-05, |
| "loss": 0.4567, |
| "step": 11860 |
| }, |
| { |
| "epoch": 2.855595667870036, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.555208940686679e-05, |
| "loss": 0.4832, |
| "step": 11865 |
| }, |
| { |
| "epoch": 2.8567990373044525, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.5544914522788472e-05, |
| "loss": 0.4807, |
| "step": 11870 |
| }, |
| { |
| "epoch": 2.8580024067388687, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.553779912709853e-05, |
| "loss": 0.4649, |
| "step": 11875 |
| }, |
| { |
| "epoch": 2.859205776173285, |
| "grad_norm": 1.4375, |
| "learning_rate": 2.5530743231160042e-05, |
| "loss": 0.4521, |
| "step": 11880 |
| }, |
| { |
| "epoch": 2.8604091456077017, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.55237468462411e-05, |
| "loss": 0.5021, |
| "step": 11885 |
| }, |
| { |
| "epoch": 2.861612515042118, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.551680998351472e-05, |
| "loss": 0.4874, |
| "step": 11890 |
| }, |
| { |
| "epoch": 2.862815884476534, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.550993265405889e-05, |
| "loss": 0.47, |
| "step": 11895 |
| }, |
| { |
| "epoch": 2.864019253910951, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.5503114868856515e-05, |
| "loss": 0.506, |
| "step": 11900 |
| }, |
| { |
| "epoch": 2.865222623345367, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.5496356638795408e-05, |
| "loss": 0.4731, |
| "step": 11905 |
| }, |
| { |
| "epoch": 2.8664259927797833, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.5489657974668276e-05, |
| "loss": 0.4912, |
| "step": 11910 |
| }, |
| { |
| "epoch": 2.8676293622142, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.5483018887172713e-05, |
| "loss": 0.4489, |
| "step": 11915 |
| }, |
| { |
| "epoch": 2.868832731648616, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.5476439386911137e-05, |
| "loss": 0.4685, |
| "step": 11920 |
| }, |
| { |
| "epoch": 2.8700361010830324, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.546991948439085e-05, |
| "loss": 0.4934, |
| "step": 11925 |
| }, |
| { |
| "epoch": 2.871239470517449, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.546345919002395e-05, |
| "loss": 0.482, |
| "step": 11930 |
| }, |
| { |
| "epoch": 2.8724428399518653, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.545705851412734e-05, |
| "loss": 0.4905, |
| "step": 11935 |
| }, |
| { |
| "epoch": 2.8736462093862816, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.545071746692274e-05, |
| "loss": 0.496, |
| "step": 11940 |
| }, |
| { |
| "epoch": 2.874849578820698, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.544443605853662e-05, |
| "loss": 0.4808, |
| "step": 11945 |
| }, |
| { |
| "epoch": 2.8760529482551145, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.5438214299000206e-05, |
| "loss": 0.5077, |
| "step": 11950 |
| }, |
| { |
| "epoch": 2.8772563176895307, |
| "grad_norm": 1.75, |
| "learning_rate": 2.54320521982495e-05, |
| "loss": 0.4662, |
| "step": 11955 |
| }, |
| { |
| "epoch": 2.878459687123947, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.542594976612519e-05, |
| "loss": 0.4772, |
| "step": 11960 |
| }, |
| { |
| "epoch": 2.879663056558363, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.5419907012372698e-05, |
| "loss": 0.4876, |
| "step": 11965 |
| }, |
| { |
| "epoch": 2.88086642599278, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.5413923946642128e-05, |
| "loss": 0.477, |
| "step": 11970 |
| }, |
| { |
| "epoch": 2.882069795427196, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.5408000578488285e-05, |
| "loss": 0.4921, |
| "step": 11975 |
| }, |
| { |
| "epoch": 2.8832731648616123, |
| "grad_norm": 1.765625, |
| "learning_rate": 2.5402136917370614e-05, |
| "loss": 0.4814, |
| "step": 11980 |
| }, |
| { |
| "epoch": 2.884476534296029, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.539633297265323e-05, |
| "loss": 0.4581, |
| "step": 11985 |
| }, |
| { |
| "epoch": 2.8856799037304453, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.5390588753604865e-05, |
| "loss": 0.4747, |
| "step": 11990 |
| }, |
| { |
| "epoch": 2.8868832731648615, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.538490426939888e-05, |
| "loss": 0.4919, |
| "step": 11995 |
| }, |
| { |
| "epoch": 2.888086642599278, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.537927952911324e-05, |
| "loss": 0.5019, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.888086642599278, |
| "eval_loss": 0.42064329981803894, |
| "eval_runtime": 2.6982, |
| "eval_samples_per_second": 74.123, |
| "eval_steps_per_second": 74.123, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.8892900120336944, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.537371454173051e-05, |
| "loss": 0.4696, |
| "step": 12005 |
| }, |
| { |
| "epoch": 2.8904933814681106, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.536820931613781e-05, |
| "loss": 0.4803, |
| "step": 12010 |
| }, |
| { |
| "epoch": 2.8916967509025273, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.5362763861126836e-05, |
| "loss": 0.4403, |
| "step": 12015 |
| }, |
| { |
| "epoch": 2.8929001203369435, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.535737818539384e-05, |
| "loss": 0.5083, |
| "step": 12020 |
| }, |
| { |
| "epoch": 2.89410348977136, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.5352052297539577e-05, |
| "loss": 0.4773, |
| "step": 12025 |
| }, |
| { |
| "epoch": 2.895306859205776, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.5346786206069368e-05, |
| "loss": 0.4629, |
| "step": 12030 |
| }, |
| { |
| "epoch": 2.8965102286401927, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.5341579919392997e-05, |
| "loss": 0.4756, |
| "step": 12035 |
| }, |
| { |
| "epoch": 2.897713598074609, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.5336433445824774e-05, |
| "loss": 0.467, |
| "step": 12040 |
| }, |
| { |
| "epoch": 2.898916967509025, |
| "grad_norm": 1.625, |
| "learning_rate": 2.533134679358347e-05, |
| "loss": 0.4646, |
| "step": 12045 |
| }, |
| { |
| "epoch": 2.9001203369434414, |
| "grad_norm": 1.5078125, |
| "learning_rate": 2.5326319970792325e-05, |
| "loss": 0.4964, |
| "step": 12050 |
| }, |
| { |
| "epoch": 2.901323706377858, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.5321352985479046e-05, |
| "loss": 0.4581, |
| "step": 12055 |
| }, |
| { |
| "epoch": 2.9025270758122743, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.531644584557576e-05, |
| "loss": 0.4934, |
| "step": 12060 |
| }, |
| { |
| "epoch": 2.9037304452466906, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.5311598558919045e-05, |
| "loss": 0.4836, |
| "step": 12065 |
| }, |
| { |
| "epoch": 2.9049338146811072, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.5306811133249887e-05, |
| "loss": 0.4881, |
| "step": 12070 |
| }, |
| { |
| "epoch": 2.9061371841155235, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.5302083576213657e-05, |
| "loss": 0.4912, |
| "step": 12075 |
| }, |
| { |
| "epoch": 2.9073405535499397, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.5297415895360148e-05, |
| "loss": 0.4502, |
| "step": 12080 |
| }, |
| { |
| "epoch": 2.9085439229843564, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.5292808098143517e-05, |
| "loss": 0.4689, |
| "step": 12085 |
| }, |
| { |
| "epoch": 2.9097472924187726, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.5288260191922284e-05, |
| "loss": 0.4673, |
| "step": 12090 |
| }, |
| { |
| "epoch": 2.910950661853189, |
| "grad_norm": 1.765625, |
| "learning_rate": 2.5283772183959324e-05, |
| "loss": 0.4968, |
| "step": 12095 |
| }, |
| { |
| "epoch": 2.9121540312876055, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.5279344081421883e-05, |
| "loss": 0.4661, |
| "step": 12100 |
| }, |
| { |
| "epoch": 2.9133574007220218, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.5274975891381498e-05, |
| "loss": 0.4769, |
| "step": 12105 |
| }, |
| { |
| "epoch": 2.914560770156438, |
| "grad_norm": 1.625, |
| "learning_rate": 2.5270667620814064e-05, |
| "loss": 0.4779, |
| "step": 12110 |
| }, |
| { |
| "epoch": 2.9157641395908542, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.5266419276599755e-05, |
| "loss": 0.4539, |
| "step": 12115 |
| }, |
| { |
| "epoch": 2.916967509025271, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.5262230865523076e-05, |
| "loss": 0.5058, |
| "step": 12120 |
| }, |
| { |
| "epoch": 2.918170878459687, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.5258102394272793e-05, |
| "loss": 0.491, |
| "step": 12125 |
| }, |
| { |
| "epoch": 2.9193742478941034, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.5254033869441966e-05, |
| "loss": 0.4759, |
| "step": 12130 |
| }, |
| { |
| "epoch": 2.9205776173285196, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.5250025297527914e-05, |
| "loss": 0.4659, |
| "step": 12135 |
| }, |
| { |
| "epoch": 2.9217809867629363, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.5246076684932223e-05, |
| "loss": 0.4559, |
| "step": 12140 |
| }, |
| { |
| "epoch": 2.9229843561973525, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.5242188037960707e-05, |
| "loss": 0.4981, |
| "step": 12145 |
| }, |
| { |
| "epoch": 2.9241877256317688, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.5238359362823442e-05, |
| "loss": 0.4798, |
| "step": 12150 |
| }, |
| { |
| "epoch": 2.9253910950661854, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.5234590665634706e-05, |
| "loss": 0.475, |
| "step": 12155 |
| }, |
| { |
| "epoch": 2.9265944645006017, |
| "grad_norm": 1.765625, |
| "learning_rate": 2.523088195241301e-05, |
| "loss": 0.505, |
| "step": 12160 |
| }, |
| { |
| "epoch": 2.927797833935018, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.5227233229081072e-05, |
| "loss": 0.5083, |
| "step": 12165 |
| }, |
| { |
| "epoch": 2.9290012033694346, |
| "grad_norm": 1.5390625, |
| "learning_rate": 2.5223644501465792e-05, |
| "loss": 0.4681, |
| "step": 12170 |
| }, |
| { |
| "epoch": 2.930204572803851, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.5220115775298276e-05, |
| "loss": 0.4649, |
| "step": 12175 |
| }, |
| { |
| "epoch": 2.931407942238267, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.52166470562138e-05, |
| "loss": 0.4824, |
| "step": 12180 |
| }, |
| { |
| "epoch": 2.9326113116726837, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.5213238349751808e-05, |
| "loss": 0.4894, |
| "step": 12185 |
| }, |
| { |
| "epoch": 2.9338146811071, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.5209889661355926e-05, |
| "loss": 0.4793, |
| "step": 12190 |
| }, |
| { |
| "epoch": 2.935018050541516, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.52066009963739e-05, |
| "loss": 0.4906, |
| "step": 12195 |
| }, |
| { |
| "epoch": 2.9362214199759324, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.5203372360057653e-05, |
| "loss": 0.4909, |
| "step": 12200 |
| }, |
| { |
| "epoch": 2.937424789410349, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.5200203757563216e-05, |
| "loss": 0.479, |
| "step": 12205 |
| }, |
| { |
| "epoch": 2.9386281588447654, |
| "grad_norm": 1.6796875, |
| "learning_rate": 2.5197095193950768e-05, |
| "loss": 0.4705, |
| "step": 12210 |
| }, |
| { |
| "epoch": 2.9398315282791816, |
| "grad_norm": 1.625, |
| "learning_rate": 2.5194046674184594e-05, |
| "loss": 0.4807, |
| "step": 12215 |
| }, |
| { |
| "epoch": 2.941034897713598, |
| "grad_norm": 1.625, |
| "learning_rate": 2.51910582031331e-05, |
| "loss": 0.4625, |
| "step": 12220 |
| }, |
| { |
| "epoch": 2.9422382671480145, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.5188129785568787e-05, |
| "loss": 0.4968, |
| "step": 12225 |
| }, |
| { |
| "epoch": 2.9434416365824307, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.518526142616826e-05, |
| "loss": 0.4911, |
| "step": 12230 |
| }, |
| { |
| "epoch": 2.944645006016847, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.5182453129512218e-05, |
| "loss": 0.5157, |
| "step": 12235 |
| }, |
| { |
| "epoch": 2.9458483754512637, |
| "grad_norm": 1.546875, |
| "learning_rate": 2.5179704900085426e-05, |
| "loss": 0.5027, |
| "step": 12240 |
| }, |
| { |
| "epoch": 2.94705174488568, |
| "grad_norm": 1.59375, |
| "learning_rate": 2.5177016742276727e-05, |
| "loss": 0.4583, |
| "step": 12245 |
| }, |
| { |
| "epoch": 2.948255114320096, |
| "grad_norm": 1.59375, |
| "learning_rate": 2.5174388660379044e-05, |
| "loss": 0.4926, |
| "step": 12250 |
| }, |
| { |
| "epoch": 2.949458483754513, |
| "grad_norm": 1.6953125, |
| "learning_rate": 2.517182065858935e-05, |
| "loss": 0.4835, |
| "step": 12255 |
| }, |
| { |
| "epoch": 2.950661853188929, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.516931274100866e-05, |
| "loss": 0.4722, |
| "step": 12260 |
| }, |
| { |
| "epoch": 2.9518652226233453, |
| "grad_norm": 2.234375, |
| "learning_rate": 2.516686491164207e-05, |
| "loss": 0.4709, |
| "step": 12265 |
| }, |
| { |
| "epoch": 2.953068592057762, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.516447717439868e-05, |
| "loss": 0.4793, |
| "step": 12270 |
| }, |
| { |
| "epoch": 2.954271961492178, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.5162149533091646e-05, |
| "loss": 0.4718, |
| "step": 12275 |
| }, |
| { |
| "epoch": 2.9554753309265944, |
| "grad_norm": 1.5546875, |
| "learning_rate": 2.5159881991438154e-05, |
| "loss": 0.4704, |
| "step": 12280 |
| }, |
| { |
| "epoch": 2.956678700361011, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.5157674553059392e-05, |
| "loss": 0.466, |
| "step": 12285 |
| }, |
| { |
| "epoch": 2.9578820697954273, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.515552722148059e-05, |
| "loss": 0.4939, |
| "step": 12290 |
| }, |
| { |
| "epoch": 2.9590854392298436, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.515344000013098e-05, |
| "loss": 0.4697, |
| "step": 12295 |
| }, |
| { |
| "epoch": 2.96028880866426, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.515141289234378e-05, |
| "loss": 0.4388, |
| "step": 12300 |
| }, |
| { |
| "epoch": 2.961492178098676, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.5149445901356243e-05, |
| "loss": 0.4616, |
| "step": 12305 |
| }, |
| { |
| "epoch": 2.9626955475330927, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.514753903030959e-05, |
| "loss": 0.469, |
| "step": 12310 |
| }, |
| { |
| "epoch": 2.963898916967509, |
| "grad_norm": 1.453125, |
| "learning_rate": 2.5145692282249044e-05, |
| "loss": 0.4812, |
| "step": 12315 |
| }, |
| { |
| "epoch": 2.965102286401925, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.514390566012381e-05, |
| "loss": 0.4706, |
| "step": 12320 |
| }, |
| { |
| "epoch": 2.966305655836342, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.514217916678708e-05, |
| "loss": 0.477, |
| "step": 12325 |
| }, |
| { |
| "epoch": 2.967509025270758, |
| "grad_norm": 1.75, |
| "learning_rate": 2.514051280499601e-05, |
| "loss": 0.4592, |
| "step": 12330 |
| }, |
| { |
| "epoch": 2.9687123947051743, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.5138906577411732e-05, |
| "loss": 0.4594, |
| "step": 12335 |
| }, |
| { |
| "epoch": 2.969915764139591, |
| "grad_norm": 1.5625, |
| "learning_rate": 2.5137360486599356e-05, |
| "loss": 0.4699, |
| "step": 12340 |
| }, |
| { |
| "epoch": 2.9711191335740073, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.513587453502794e-05, |
| "loss": 0.477, |
| "step": 12345 |
| }, |
| { |
| "epoch": 2.9723225030084235, |
| "grad_norm": 1.625, |
| "learning_rate": 2.513444872507051e-05, |
| "loss": 0.4775, |
| "step": 12350 |
| }, |
| { |
| "epoch": 2.97352587244284, |
| "grad_norm": 1.9140625, |
| "learning_rate": 2.513308305900404e-05, |
| "loss": 0.4711, |
| "step": 12355 |
| }, |
| { |
| "epoch": 2.9747292418772564, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.5131777539009473e-05, |
| "loss": 0.4933, |
| "step": 12360 |
| }, |
| { |
| "epoch": 2.9759326113116726, |
| "grad_norm": 1.703125, |
| "learning_rate": 2.5130532167171676e-05, |
| "loss": 0.4859, |
| "step": 12365 |
| }, |
| { |
| "epoch": 2.9771359807460893, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.5129346945479483e-05, |
| "loss": 0.48, |
| "step": 12370 |
| }, |
| { |
| "epoch": 2.9783393501805056, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.5128221875825652e-05, |
| "loss": 0.4538, |
| "step": 12375 |
| }, |
| { |
| "epoch": 2.979542719614922, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.5127156960006893e-05, |
| "loss": 0.4736, |
| "step": 12380 |
| }, |
| { |
| "epoch": 2.980746089049338, |
| "grad_norm": 1.625, |
| "learning_rate": 2.5126152199723852e-05, |
| "loss": 0.4635, |
| "step": 12385 |
| }, |
| { |
| "epoch": 2.9819494584837543, |
| "grad_norm": 1.6640625, |
| "learning_rate": 2.51252075965811e-05, |
| "loss": 0.5018, |
| "step": 12390 |
| }, |
| { |
| "epoch": 2.983152827918171, |
| "grad_norm": 1.609375, |
| "learning_rate": 2.512432315208714e-05, |
| "loss": 0.4592, |
| "step": 12395 |
| }, |
| { |
| "epoch": 2.984356197352587, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.5123498867654397e-05, |
| "loss": 0.4874, |
| "step": 12400 |
| }, |
| { |
| "epoch": 2.9855595667870034, |
| "grad_norm": 1.53125, |
| "learning_rate": 2.5122734744599256e-05, |
| "loss": 0.4315, |
| "step": 12405 |
| }, |
| { |
| "epoch": 2.98676293622142, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.5122030784141974e-05, |
| "loss": 0.4757, |
| "step": 12410 |
| }, |
| { |
| "epoch": 2.9879663056558363, |
| "grad_norm": 1.75, |
| "learning_rate": 2.5121386987406766e-05, |
| "loss": 0.4613, |
| "step": 12415 |
| }, |
| { |
| "epoch": 2.9891696750902526, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.5120803355421764e-05, |
| "loss": 0.4887, |
| "step": 12420 |
| }, |
| { |
| "epoch": 2.9903730445246692, |
| "grad_norm": 1.59375, |
| "learning_rate": 2.5120279889119002e-05, |
| "loss": 0.4892, |
| "step": 12425 |
| }, |
| { |
| "epoch": 2.9915764139590855, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.5119816589334445e-05, |
| "loss": 0.4503, |
| "step": 12430 |
| }, |
| { |
| "epoch": 2.9927797833935017, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.511941345680798e-05, |
| "loss": 0.4954, |
| "step": 12435 |
| }, |
| { |
| "epoch": 2.9939831528279184, |
| "grad_norm": 1.625, |
| "learning_rate": 2.5119070492183377e-05, |
| "loss": 0.4456, |
| "step": 12440 |
| }, |
| { |
| "epoch": 2.9951865222623346, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.5118787696008367e-05, |
| "loss": 0.4645, |
| "step": 12445 |
| }, |
| { |
| "epoch": 2.996389891696751, |
| "grad_norm": 1.578125, |
| "learning_rate": 2.5118565068734546e-05, |
| "loss": 0.4891, |
| "step": 12450 |
| }, |
| { |
| "epoch": 2.9975932611311675, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.5118402610717452e-05, |
| "loss": 0.4804, |
| "step": 12455 |
| }, |
| { |
| "epoch": 2.9987966305655838, |
| "grad_norm": 1.890625, |
| "learning_rate": 2.5118300322216534e-05, |
| "loss": 0.4957, |
| "step": 12460 |
| }, |
| { |
| "epoch": 2.99927797833935, |
| "eval_loss": 0.42031019926071167, |
| "eval_runtime": 2.6521, |
| "eval_samples_per_second": 75.411, |
| "eval_steps_per_second": 75.411, |
| "step": 12462 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 12465, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.00021780783616e+17, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |