| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9999055980364392, |
| "eval_steps": 500, |
| "global_step": 4965, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0010069542779823153, |
| "grad_norm": 147.125, |
| "learning_rate": 1.0060362173038228e-08, |
| "loss": 154.8444, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0020139085559646307, |
| "grad_norm": 128.5, |
| "learning_rate": 2.0120724346076457e-08, |
| "loss": 154.8017, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0030208628339469462, |
| "grad_norm": 120.1875, |
| "learning_rate": 3.018108651911469e-08, |
| "loss": 151.1319, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.004027817111929261, |
| "grad_norm": 121.125, |
| "learning_rate": 4.0241448692152913e-08, |
| "loss": 152.4657, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0050347713899115765, |
| "grad_norm": 121.5, |
| "learning_rate": 5.0301810865191145e-08, |
| "loss": 150.7515, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0060417256678938925, |
| "grad_norm": 120.6875, |
| "learning_rate": 6.036217303822938e-08, |
| "loss": 151.6619, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.007048679945876208, |
| "grad_norm": 124.3125, |
| "learning_rate": 7.042253521126761e-08, |
| "loss": 152.2705, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.008055634223858523, |
| "grad_norm": 124.625, |
| "learning_rate": 8.048289738430583e-08, |
| "loss": 149.9593, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.009062588501840838, |
| "grad_norm": 125.375, |
| "learning_rate": 9.054325955734406e-08, |
| "loss": 151.1752, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.010069542779823153, |
| "grad_norm": 123.1875, |
| "learning_rate": 1.0060362173038229e-07, |
| "loss": 150.065, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.01107649705780547, |
| "grad_norm": 129.0, |
| "learning_rate": 1.1066398390342052e-07, |
| "loss": 149.9216, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.012083451335787785, |
| "grad_norm": 120.0625, |
| "learning_rate": 1.2072434607645875e-07, |
| "loss": 151.1044, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.0130904056137701, |
| "grad_norm": 122.0, |
| "learning_rate": 1.3078470824949698e-07, |
| "loss": 151.5673, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.014097359891752415, |
| "grad_norm": 124.25, |
| "learning_rate": 1.4084507042253522e-07, |
| "loss": 150.4482, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.01510431416973473, |
| "grad_norm": 128.75, |
| "learning_rate": 1.5090543259557342e-07, |
| "loss": 150.4678, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.016111268447717045, |
| "grad_norm": 127.9375, |
| "learning_rate": 1.6096579476861165e-07, |
| "loss": 149.0443, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.01711822272569936, |
| "grad_norm": 126.25, |
| "learning_rate": 1.710261569416499e-07, |
| "loss": 151.2827, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.018125177003681676, |
| "grad_norm": 125.0, |
| "learning_rate": 1.8108651911468812e-07, |
| "loss": 149.4031, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.01913213128166399, |
| "grad_norm": 120.125, |
| "learning_rate": 1.9114688128772635e-07, |
| "loss": 149.7004, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.020139085559646306, |
| "grad_norm": 121.75, |
| "learning_rate": 2.0120724346076458e-07, |
| "loss": 150.3224, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.02114603983762862, |
| "grad_norm": 127.375, |
| "learning_rate": 2.112676056338028e-07, |
| "loss": 149.7613, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.02215299411561094, |
| "grad_norm": 119.0, |
| "learning_rate": 2.2132796780684104e-07, |
| "loss": 148.3228, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.023159948393593255, |
| "grad_norm": 119.3125, |
| "learning_rate": 2.3138832997987925e-07, |
| "loss": 150.6336, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.02416690267157557, |
| "grad_norm": 119.8125, |
| "learning_rate": 2.414486921529175e-07, |
| "loss": 151.2738, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.025173856949557885, |
| "grad_norm": 118.5625, |
| "learning_rate": 2.515090543259557e-07, |
| "loss": 149.1195, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.0261808112275402, |
| "grad_norm": 120.125, |
| "learning_rate": 2.6156941649899397e-07, |
| "loss": 150.1052, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.027187765505522515, |
| "grad_norm": 126.8125, |
| "learning_rate": 2.716297786720322e-07, |
| "loss": 150.0314, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.02819471978350483, |
| "grad_norm": 123.1875, |
| "learning_rate": 2.8169014084507043e-07, |
| "loss": 148.3245, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.029201674061487146, |
| "grad_norm": 120.5, |
| "learning_rate": 2.9175050301810864e-07, |
| "loss": 148.8877, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.03020862833946946, |
| "grad_norm": 121.4375, |
| "learning_rate": 3.0181086519114684e-07, |
| "loss": 148.9477, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.031215582617451776, |
| "grad_norm": 124.5625, |
| "learning_rate": 3.118712273641851e-07, |
| "loss": 149.5775, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.03222253689543409, |
| "grad_norm": 114.625, |
| "learning_rate": 3.219315895372233e-07, |
| "loss": 149.9319, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.033229491173416406, |
| "grad_norm": 113.8125, |
| "learning_rate": 3.3199195171026156e-07, |
| "loss": 149.8396, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.03423644545139872, |
| "grad_norm": 119.5625, |
| "learning_rate": 3.420523138832998e-07, |
| "loss": 149.8816, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.035243399729381036, |
| "grad_norm": 117.0625, |
| "learning_rate": 3.52112676056338e-07, |
| "loss": 148.9375, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.03625035400736335, |
| "grad_norm": 126.125, |
| "learning_rate": 3.6217303822937623e-07, |
| "loss": 148.8681, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.03725730828534567, |
| "grad_norm": 122.25, |
| "learning_rate": 3.722334004024145e-07, |
| "loss": 148.1881, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.03826426256332798, |
| "grad_norm": 116.5625, |
| "learning_rate": 3.822937625754527e-07, |
| "loss": 151.0358, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.0392712168413103, |
| "grad_norm": 112.8125, |
| "learning_rate": 3.9235412474849095e-07, |
| "loss": 149.5753, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.04027817111929261, |
| "grad_norm": 114.75, |
| "learning_rate": 4.0241448692152916e-07, |
| "loss": 148.3218, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.04128512539727493, |
| "grad_norm": 117.25, |
| "learning_rate": 4.1247484909456736e-07, |
| "loss": 148.1832, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.04229207967525724, |
| "grad_norm": 111.1875, |
| "learning_rate": 4.225352112676056e-07, |
| "loss": 148.289, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.04329903395323956, |
| "grad_norm": 111.75, |
| "learning_rate": 4.3259557344064383e-07, |
| "loss": 148.9568, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.04430598823122188, |
| "grad_norm": 107.375, |
| "learning_rate": 4.426559356136821e-07, |
| "loss": 147.067, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.045312942509204195, |
| "grad_norm": 114.125, |
| "learning_rate": 4.5271629778672034e-07, |
| "loss": 149.1166, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.04631989678718651, |
| "grad_norm": 114.5625, |
| "learning_rate": 4.627766599597585e-07, |
| "loss": 149.51, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.047326851065168825, |
| "grad_norm": 114.6875, |
| "learning_rate": 4.7283702213279675e-07, |
| "loss": 149.2539, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.04833380534315114, |
| "grad_norm": 114.3125, |
| "learning_rate": 4.82897384305835e-07, |
| "loss": 148.6982, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.049340759621133455, |
| "grad_norm": 114.8125, |
| "learning_rate": 4.929577464788733e-07, |
| "loss": 148.5024, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.05034771389911577, |
| "grad_norm": 112.125, |
| "learning_rate": 5.030181086519114e-07, |
| "loss": 148.5248, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.051354668177098085, |
| "grad_norm": 109.3125, |
| "learning_rate": 5.130784708249497e-07, |
| "loss": 148.3245, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.0523616224550804, |
| "grad_norm": 113.4375, |
| "learning_rate": 5.231388329979879e-07, |
| "loss": 148.0799, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.053368576733062716, |
| "grad_norm": 119.5, |
| "learning_rate": 5.331991951710262e-07, |
| "loss": 149.3715, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.05437553101104503, |
| "grad_norm": 116.4375, |
| "learning_rate": 5.432595573440643e-07, |
| "loss": 148.9591, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.055382485289027346, |
| "grad_norm": 116.9375, |
| "learning_rate": 5.533199195171025e-07, |
| "loss": 148.6525, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.05638943956700966, |
| "grad_norm": 112.0, |
| "learning_rate": 5.633802816901409e-07, |
| "loss": 148.389, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.057396393844991976, |
| "grad_norm": 110.5, |
| "learning_rate": 5.73440643863179e-07, |
| "loss": 149.1673, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.05840334812297429, |
| "grad_norm": 105.375, |
| "learning_rate": 5.835010060362173e-07, |
| "loss": 147.8345, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.059410302400956606, |
| "grad_norm": 108.875, |
| "learning_rate": 5.935613682092555e-07, |
| "loss": 148.039, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.06041725667893892, |
| "grad_norm": 109.75, |
| "learning_rate": 6.036217303822937e-07, |
| "loss": 148.6279, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.06142421095692124, |
| "grad_norm": 105.875, |
| "learning_rate": 6.136820925553319e-07, |
| "loss": 147.5383, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.06243116523490355, |
| "grad_norm": 105.8125, |
| "learning_rate": 6.237424547283702e-07, |
| "loss": 148.9635, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.06343811951288587, |
| "grad_norm": 107.5625, |
| "learning_rate": 6.338028169014085e-07, |
| "loss": 146.221, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.06444507379086818, |
| "grad_norm": 107.75, |
| "learning_rate": 6.438631790744466e-07, |
| "loss": 146.1134, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.0654520280688505, |
| "grad_norm": 109.75, |
| "learning_rate": 6.539235412474849e-07, |
| "loss": 148.4854, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.06645898234683281, |
| "grad_norm": 112.4375, |
| "learning_rate": 6.639839034205231e-07, |
| "loss": 146.1542, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.06746593662481513, |
| "grad_norm": 105.9375, |
| "learning_rate": 6.740442655935613e-07, |
| "loss": 147.7582, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.06847289090279744, |
| "grad_norm": 105.3125, |
| "learning_rate": 6.841046277665996e-07, |
| "loss": 146.3988, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.06947984518077976, |
| "grad_norm": 106.0, |
| "learning_rate": 6.941649899396378e-07, |
| "loss": 146.1215, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.07048679945876207, |
| "grad_norm": 109.375, |
| "learning_rate": 7.04225352112676e-07, |
| "loss": 146.9638, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0714937537367444, |
| "grad_norm": 107.375, |
| "learning_rate": 7.142857142857143e-07, |
| "loss": 146.0221, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0725007080147267, |
| "grad_norm": 105.0625, |
| "learning_rate": 7.243460764587525e-07, |
| "loss": 146.9222, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.07350766229270903, |
| "grad_norm": 108.125, |
| "learning_rate": 7.344064386317907e-07, |
| "loss": 145.8448, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.07451461657069133, |
| "grad_norm": 109.0, |
| "learning_rate": 7.44466800804829e-07, |
| "loss": 147.5462, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.07552157084867366, |
| "grad_norm": 108.0, |
| "learning_rate": 7.545271629778671e-07, |
| "loss": 147.6979, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.07652852512665596, |
| "grad_norm": 107.0, |
| "learning_rate": 7.645875251509054e-07, |
| "loss": 146.4761, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.07753547940463829, |
| "grad_norm": 103.4375, |
| "learning_rate": 7.746478873239435e-07, |
| "loss": 144.9393, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.0785424336826206, |
| "grad_norm": 106.6875, |
| "learning_rate": 7.847082494969819e-07, |
| "loss": 145.0384, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.07954938796060292, |
| "grad_norm": 107.3125, |
| "learning_rate": 7.947686116700201e-07, |
| "loss": 146.4607, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.08055634223858522, |
| "grad_norm": 106.9375, |
| "learning_rate": 8.048289738430583e-07, |
| "loss": 144.5673, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.08156329651656755, |
| "grad_norm": 103.125, |
| "learning_rate": 8.148893360160966e-07, |
| "loss": 145.9938, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.08257025079454985, |
| "grad_norm": 105.3125, |
| "learning_rate": 8.249496981891347e-07, |
| "loss": 145.0131, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.08357720507253218, |
| "grad_norm": 101.0625, |
| "learning_rate": 8.35010060362173e-07, |
| "loss": 143.9037, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.08458415935051448, |
| "grad_norm": 106.375, |
| "learning_rate": 8.450704225352112e-07, |
| "loss": 146.0414, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.0855911136284968, |
| "grad_norm": 106.375, |
| "learning_rate": 8.551307847082495e-07, |
| "loss": 145.0052, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.08659806790647911, |
| "grad_norm": 102.375, |
| "learning_rate": 8.651911468812877e-07, |
| "loss": 144.8919, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.08760502218446144, |
| "grad_norm": 105.0625, |
| "learning_rate": 8.752515090543259e-07, |
| "loss": 144.2957, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.08861197646244376, |
| "grad_norm": 106.6875, |
| "learning_rate": 8.853118712273642e-07, |
| "loss": 144.0421, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.08961893074042607, |
| "grad_norm": 100.5, |
| "learning_rate": 8.953722334004023e-07, |
| "loss": 144.8475, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.09062588501840839, |
| "grad_norm": 102.4375, |
| "learning_rate": 9.054325955734407e-07, |
| "loss": 145.6034, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.0916328392963907, |
| "grad_norm": 101.75, |
| "learning_rate": 9.154929577464788e-07, |
| "loss": 145.2008, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.09263979357437302, |
| "grad_norm": 105.75, |
| "learning_rate": 9.25553319919517e-07, |
| "loss": 144.6417, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.09364674785235533, |
| "grad_norm": 104.9375, |
| "learning_rate": 9.356136820925554e-07, |
| "loss": 144.1944, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.09465370213033765, |
| "grad_norm": 104.0, |
| "learning_rate": 9.456740442655935e-07, |
| "loss": 144.2291, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.09566065640831996, |
| "grad_norm": 101.4375, |
| "learning_rate": 9.557344064386319e-07, |
| "loss": 142.8143, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.09666761068630228, |
| "grad_norm": 103.0, |
| "learning_rate": 9.6579476861167e-07, |
| "loss": 143.5473, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.09767456496428459, |
| "grad_norm": 104.375, |
| "learning_rate": 9.758551307847082e-07, |
| "loss": 144.2911, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.09868151924226691, |
| "grad_norm": 104.125, |
| "learning_rate": 9.859154929577465e-07, |
| "loss": 144.1867, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.09968847352024922, |
| "grad_norm": 104.5625, |
| "learning_rate": 9.959758551307847e-07, |
| "loss": 143.2442, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.10069542779823154, |
| "grad_norm": 99.875, |
| "learning_rate": 9.99328558639212e-07, |
| "loss": 142.1995, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.10069542779823154, |
| "eval_loss": 4.476833343505859, |
| "eval_runtime": 239.4636, |
| "eval_samples_per_second": 1117.552, |
| "eval_steps_per_second": 34.924, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.10170238207621385, |
| "grad_norm": 101.4375, |
| "learning_rate": 9.982094897045659e-07, |
| "loss": 143.2257, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.10270933635419617, |
| "grad_norm": 102.1875, |
| "learning_rate": 9.970904207699194e-07, |
| "loss": 142.6529, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.10371629063217848, |
| "grad_norm": 99.9375, |
| "learning_rate": 9.95971351835273e-07, |
| "loss": 142.5815, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.1047232449101608, |
| "grad_norm": 99.125, |
| "learning_rate": 9.948522829006265e-07, |
| "loss": 142.8741, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.10573019918814311, |
| "grad_norm": 107.125, |
| "learning_rate": 9.937332139659803e-07, |
| "loss": 144.5413, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.10673715346612543, |
| "grad_norm": 100.1875, |
| "learning_rate": 9.926141450313339e-07, |
| "loss": 141.7836, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.10774410774410774, |
| "grad_norm": 105.5, |
| "learning_rate": 9.914950760966874e-07, |
| "loss": 142.5062, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.10875106202209006, |
| "grad_norm": 102.375, |
| "learning_rate": 9.903760071620412e-07, |
| "loss": 140.559, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.10975801630007237, |
| "grad_norm": 98.5625, |
| "learning_rate": 9.892569382273947e-07, |
| "loss": 141.6972, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.11076497057805469, |
| "grad_norm": 99.75, |
| "learning_rate": 9.881378692927483e-07, |
| "loss": 141.2675, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.111771924856037, |
| "grad_norm": 100.6875, |
| "learning_rate": 9.87018800358102e-07, |
| "loss": 141.5898, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.11277887913401932, |
| "grad_norm": 104.6875, |
| "learning_rate": 9.858997314234556e-07, |
| "loss": 140.7125, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.11378583341200163, |
| "grad_norm": 99.5, |
| "learning_rate": 9.847806624888094e-07, |
| "loss": 141.6575, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.11479278768998395, |
| "grad_norm": 102.125, |
| "learning_rate": 9.83661593554163e-07, |
| "loss": 141.1669, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.11579974196796626, |
| "grad_norm": 101.0625, |
| "learning_rate": 9.825425246195165e-07, |
| "loss": 141.7356, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.11680669624594858, |
| "grad_norm": 100.25, |
| "learning_rate": 9.8142345568487e-07, |
| "loss": 140.7489, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.1178136505239309, |
| "grad_norm": 99.4375, |
| "learning_rate": 9.803043867502238e-07, |
| "loss": 139.6734, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.11882060480191321, |
| "grad_norm": 98.6875, |
| "learning_rate": 9.791853178155774e-07, |
| "loss": 138.0151, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.11982755907989553, |
| "grad_norm": 100.6875, |
| "learning_rate": 9.78066248880931e-07, |
| "loss": 138.5095, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.12083451335787784, |
| "grad_norm": 98.0, |
| "learning_rate": 9.769471799462845e-07, |
| "loss": 139.3675, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.12184146763586017, |
| "grad_norm": 107.5, |
| "learning_rate": 9.758281110116383e-07, |
| "loss": 140.4003, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.12284842191384247, |
| "grad_norm": 97.375, |
| "learning_rate": 9.747090420769918e-07, |
| "loss": 138.405, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.1238553761918248, |
| "grad_norm": 98.6875, |
| "learning_rate": 9.735899731423454e-07, |
| "loss": 138.4601, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.1248623304698071, |
| "grad_norm": 97.75, |
| "learning_rate": 9.724709042076992e-07, |
| "loss": 138.2733, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.1258692847477894, |
| "grad_norm": 97.125, |
| "learning_rate": 9.713518352730527e-07, |
| "loss": 138.8488, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.12687623902577175, |
| "grad_norm": 98.0, |
| "learning_rate": 9.702327663384065e-07, |
| "loss": 139.726, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.12788319330375406, |
| "grad_norm": 100.875, |
| "learning_rate": 9.6911369740376e-07, |
| "loss": 139.2006, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.12889014758173636, |
| "grad_norm": 97.625, |
| "learning_rate": 9.679946284691136e-07, |
| "loss": 136.4355, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.12989710185971867, |
| "grad_norm": 97.5625, |
| "learning_rate": 9.668755595344674e-07, |
| "loss": 138.7321, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.130904056137701, |
| "grad_norm": 99.5, |
| "learning_rate": 9.65756490599821e-07, |
| "loss": 137.7812, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.13191101041568332, |
| "grad_norm": 95.6875, |
| "learning_rate": 9.646374216651745e-07, |
| "loss": 138.1662, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.13291796469366562, |
| "grad_norm": 100.5625, |
| "learning_rate": 9.635183527305283e-07, |
| "loss": 137.3001, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.13392491897164793, |
| "grad_norm": 101.4375, |
| "learning_rate": 9.623992837958818e-07, |
| "loss": 135.9984, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.13493187324963027, |
| "grad_norm": 99.5, |
| "learning_rate": 9.612802148612354e-07, |
| "loss": 138.5501, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.13593882752761258, |
| "grad_norm": 98.375, |
| "learning_rate": 9.60161145926589e-07, |
| "loss": 137.8872, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.13694578180559489, |
| "grad_norm": 102.6875, |
| "learning_rate": 9.590420769919427e-07, |
| "loss": 137.8639, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.1379527360835772, |
| "grad_norm": 98.1875, |
| "learning_rate": 9.579230080572963e-07, |
| "loss": 137.8511, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.13895969036155953, |
| "grad_norm": 98.4375, |
| "learning_rate": 9.568039391226498e-07, |
| "loss": 135.7094, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.13996664463954184, |
| "grad_norm": 96.4375, |
| "learning_rate": 9.556848701880036e-07, |
| "loss": 135.8998, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.14097359891752415, |
| "grad_norm": 96.9375, |
| "learning_rate": 9.545658012533572e-07, |
| "loss": 137.496, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.14198055319550645, |
| "grad_norm": 98.1875, |
| "learning_rate": 9.534467323187107e-07, |
| "loss": 134.7175, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.1429875074734888, |
| "grad_norm": 99.0, |
| "learning_rate": 9.523276633840645e-07, |
| "loss": 134.1437, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.1439944617514711, |
| "grad_norm": 102.125, |
| "learning_rate": 9.51208594449418e-07, |
| "loss": 135.4387, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.1450014160294534, |
| "grad_norm": 98.6875, |
| "learning_rate": 9.500895255147716e-07, |
| "loss": 134.4652, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.14600837030743571, |
| "grad_norm": 98.375, |
| "learning_rate": 9.489704565801253e-07, |
| "loss": 133.8391, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.14701532458541805, |
| "grad_norm": 99.0625, |
| "learning_rate": 9.478513876454789e-07, |
| "loss": 134.2267, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.14802227886340036, |
| "grad_norm": 98.625, |
| "learning_rate": 9.467323187108326e-07, |
| "loss": 134.7458, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.14902923314138267, |
| "grad_norm": 100.4375, |
| "learning_rate": 9.456132497761861e-07, |
| "loss": 134.7538, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.150036187419365, |
| "grad_norm": 99.375, |
| "learning_rate": 9.444941808415397e-07, |
| "loss": 135.1127, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.1510431416973473, |
| "grad_norm": 98.5, |
| "learning_rate": 9.433751119068935e-07, |
| "loss": 132.5026, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.15205009597532962, |
| "grad_norm": 100.0625, |
| "learning_rate": 9.42256042972247e-07, |
| "loss": 134.5469, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.15305705025331193, |
| "grad_norm": 97.875, |
| "learning_rate": 9.411369740376007e-07, |
| "loss": 133.0167, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.15406400453129426, |
| "grad_norm": 101.9375, |
| "learning_rate": 9.400179051029544e-07, |
| "loss": 135.0586, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.15507095880927657, |
| "grad_norm": 101.8125, |
| "learning_rate": 9.388988361683079e-07, |
| "loss": 133.1228, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.15607791308725888, |
| "grad_norm": 97.875, |
| "learning_rate": 9.377797672336616e-07, |
| "loss": 134.4232, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.1570848673652412, |
| "grad_norm": 101.4375, |
| "learning_rate": 9.366606982990151e-07, |
| "loss": 133.6743, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.15809182164322352, |
| "grad_norm": 96.0625, |
| "learning_rate": 9.355416293643688e-07, |
| "loss": 132.8021, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.15909877592120583, |
| "grad_norm": 98.0, |
| "learning_rate": 9.344225604297225e-07, |
| "loss": 131.8535, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.16010573019918814, |
| "grad_norm": 96.6875, |
| "learning_rate": 9.33303491495076e-07, |
| "loss": 132.8115, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.16111268447717045, |
| "grad_norm": 98.5, |
| "learning_rate": 9.321844225604297e-07, |
| "loss": 132.4902, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.16211963875515278, |
| "grad_norm": 96.625, |
| "learning_rate": 9.310653536257834e-07, |
| "loss": 132.3855, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.1631265930331351, |
| "grad_norm": 99.5, |
| "learning_rate": 9.299462846911369e-07, |
| "loss": 132.9767, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.1641335473111174, |
| "grad_norm": 96.8125, |
| "learning_rate": 9.288272157564906e-07, |
| "loss": 130.722, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.1651405015890997, |
| "grad_norm": 97.1875, |
| "learning_rate": 9.277081468218441e-07, |
| "loss": 132.2955, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.16614745586708204, |
| "grad_norm": 100.4375, |
| "learning_rate": 9.265890778871978e-07, |
| "loss": 130.5667, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.16715441014506435, |
| "grad_norm": 92.75, |
| "learning_rate": 9.254700089525515e-07, |
| "loss": 130.172, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.16816136442304666, |
| "grad_norm": 99.0, |
| "learning_rate": 9.24350940017905e-07, |
| "loss": 132.0079, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.16916831870102897, |
| "grad_norm": 99.1875, |
| "learning_rate": 9.232318710832587e-07, |
| "loss": 133.0523, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.1701752729790113, |
| "grad_norm": 98.0, |
| "learning_rate": 9.221128021486123e-07, |
| "loss": 132.6847, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.1711822272569936, |
| "grad_norm": 101.5625, |
| "learning_rate": 9.209937332139659e-07, |
| "loss": 129.9938, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.17218918153497592, |
| "grad_norm": 100.75, |
| "learning_rate": 9.198746642793196e-07, |
| "loss": 130.9808, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.17319613581295823, |
| "grad_norm": 99.375, |
| "learning_rate": 9.187555953446731e-07, |
| "loss": 131.95, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.17420309009094057, |
| "grad_norm": 99.875, |
| "learning_rate": 9.176365264100269e-07, |
| "loss": 132.5511, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.17521004436892287, |
| "grad_norm": 96.6875, |
| "learning_rate": 9.165174574753805e-07, |
| "loss": 129.9553, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.17621699864690518, |
| "grad_norm": 96.3125, |
| "learning_rate": 9.15398388540734e-07, |
| "loss": 131.2535, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.17722395292488752, |
| "grad_norm": 97.5625, |
| "learning_rate": 9.142793196060878e-07, |
| "loss": 129.6035, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.17823090720286983, |
| "grad_norm": 96.3125, |
| "learning_rate": 9.131602506714413e-07, |
| "loss": 130.5564, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.17923786148085213, |
| "grad_norm": 95.375, |
| "learning_rate": 9.120411817367949e-07, |
| "loss": 131.6668, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.18024481575883444, |
| "grad_norm": 106.3125, |
| "learning_rate": 9.109221128021486e-07, |
| "loss": 130.4126, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.18125177003681678, |
| "grad_norm": 95.0, |
| "learning_rate": 9.098030438675022e-07, |
| "loss": 129.2205, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.1822587243147991, |
| "grad_norm": 100.0, |
| "learning_rate": 9.086839749328559e-07, |
| "loss": 129.7743, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.1832656785927814, |
| "grad_norm": 99.625, |
| "learning_rate": 9.075649059982094e-07, |
| "loss": 129.5778, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.1842726328707637, |
| "grad_norm": 99.3125, |
| "learning_rate": 9.06445837063563e-07, |
| "loss": 127.7286, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.18527958714874604, |
| "grad_norm": 97.1875, |
| "learning_rate": 9.053267681289168e-07, |
| "loss": 128.4797, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.18628654142672835, |
| "grad_norm": 97.0, |
| "learning_rate": 9.042076991942703e-07, |
| "loss": 129.3254, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.18729349570471066, |
| "grad_norm": 95.875, |
| "learning_rate": 9.03088630259624e-07, |
| "loss": 129.162, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.18830044998269296, |
| "grad_norm": 99.4375, |
| "learning_rate": 9.019695613249775e-07, |
| "loss": 128.9798, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.1893074042606753, |
| "grad_norm": 99.8125, |
| "learning_rate": 9.008504923903312e-07, |
| "loss": 128.7978, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.1903143585386576, |
| "grad_norm": 98.5, |
| "learning_rate": 8.997314234556849e-07, |
| "loss": 129.5615, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.19132131281663992, |
| "grad_norm": 95.3125, |
| "learning_rate": 8.986123545210384e-07, |
| "loss": 128.219, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.19232826709462222, |
| "grad_norm": 95.5625, |
| "learning_rate": 8.97493285586392e-07, |
| "loss": 128.3994, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.19333522137260456, |
| "grad_norm": 98.875, |
| "learning_rate": 8.963742166517458e-07, |
| "loss": 128.6512, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.19434217565058687, |
| "grad_norm": 96.0, |
| "learning_rate": 8.952551477170993e-07, |
| "loss": 127.9091, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.19534912992856918, |
| "grad_norm": 98.5625, |
| "learning_rate": 8.94136078782453e-07, |
| "loss": 127.4443, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.19635608420655148, |
| "grad_norm": 101.75, |
| "learning_rate": 8.930170098478065e-07, |
| "loss": 126.552, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.19736303848453382, |
| "grad_norm": 102.4375, |
| "learning_rate": 8.918979409131602e-07, |
| "loss": 128.1213, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.19836999276251613, |
| "grad_norm": 98.375, |
| "learning_rate": 8.907788719785139e-07, |
| "loss": 126.8494, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.19937694704049844, |
| "grad_norm": 98.5625, |
| "learning_rate": 8.896598030438674e-07, |
| "loss": 128.1531, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.20038390131848074, |
| "grad_norm": 97.25, |
| "learning_rate": 8.88540734109221e-07, |
| "loss": 126.7792, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.20139085559646308, |
| "grad_norm": 99.125, |
| "learning_rate": 8.874216651745748e-07, |
| "loss": 125.983, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.20139085559646308, |
| "eval_loss": 3.978339433670044, |
| "eval_runtime": 239.476, |
| "eval_samples_per_second": 1117.494, |
| "eval_steps_per_second": 34.922, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2023978098744454, |
| "grad_norm": 100.4375, |
| "learning_rate": 8.863025962399283e-07, |
| "loss": 127.6748, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.2034047641524277, |
| "grad_norm": 98.375, |
| "learning_rate": 8.85183527305282e-07, |
| "loss": 128.4397, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.20441171843041, |
| "grad_norm": 98.4375, |
| "learning_rate": 8.840644583706356e-07, |
| "loss": 128.5681, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.20541867270839234, |
| "grad_norm": 101.5625, |
| "learning_rate": 8.829453894359892e-07, |
| "loss": 125.6765, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.20642562698637465, |
| "grad_norm": 99.0, |
| "learning_rate": 8.818263205013429e-07, |
| "loss": 127.8648, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.20743258126435696, |
| "grad_norm": 100.4375, |
| "learning_rate": 8.807072515666964e-07, |
| "loss": 127.2665, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.2084395355423393, |
| "grad_norm": 97.0625, |
| "learning_rate": 8.795881826320502e-07, |
| "loss": 125.6571, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.2094464898203216, |
| "grad_norm": 99.5625, |
| "learning_rate": 8.784691136974037e-07, |
| "loss": 127.1407, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.2104534440983039, |
| "grad_norm": 97.625, |
| "learning_rate": 8.773500447627573e-07, |
| "loss": 125.1989, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.21146039837628622, |
| "grad_norm": 97.5, |
| "learning_rate": 8.76230975828111e-07, |
| "loss": 126.1473, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.21246735265426855, |
| "grad_norm": 98.375, |
| "learning_rate": 8.751119068934646e-07, |
| "loss": 125.087, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.21347430693225086, |
| "grad_norm": 96.125, |
| "learning_rate": 8.739928379588182e-07, |
| "loss": 125.8863, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.21448126121023317, |
| "grad_norm": 98.5625, |
| "learning_rate": 8.728737690241719e-07, |
| "loss": 124.4642, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.21548821548821548, |
| "grad_norm": 98.5625, |
| "learning_rate": 8.717547000895254e-07, |
| "loss": 128.1609, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.21649516976619781, |
| "grad_norm": 97.3125, |
| "learning_rate": 8.706356311548792e-07, |
| "loss": 124.3779, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.21750212404418012, |
| "grad_norm": 100.375, |
| "learning_rate": 8.695165622202327e-07, |
| "loss": 125.5783, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.21850907832216243, |
| "grad_norm": 99.625, |
| "learning_rate": 8.683974932855863e-07, |
| "loss": 124.0356, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.21951603260014474, |
| "grad_norm": 98.5, |
| "learning_rate": 8.6727842435094e-07, |
| "loss": 125.126, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.22052298687812708, |
| "grad_norm": 100.625, |
| "learning_rate": 8.661593554162936e-07, |
| "loss": 126.576, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.22152994115610938, |
| "grad_norm": 98.75, |
| "learning_rate": 8.650402864816473e-07, |
| "loss": 125.1371, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.2225368954340917, |
| "grad_norm": 98.75, |
| "learning_rate": 8.639212175470008e-07, |
| "loss": 125.9719, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.223543849712074, |
| "grad_norm": 100.5, |
| "learning_rate": 8.628021486123544e-07, |
| "loss": 125.1309, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.22455080399005634, |
| "grad_norm": 96.6875, |
| "learning_rate": 8.616830796777082e-07, |
| "loss": 123.8529, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.22555775826803864, |
| "grad_norm": 97.4375, |
| "learning_rate": 8.605640107430617e-07, |
| "loss": 123.9571, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.22656471254602095, |
| "grad_norm": 100.6875, |
| "learning_rate": 8.594449418084153e-07, |
| "loss": 124.4524, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.22757166682400326, |
| "grad_norm": 98.75, |
| "learning_rate": 8.583258728737691e-07, |
| "loss": 123.4843, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.2285786211019856, |
| "grad_norm": 95.0625, |
| "learning_rate": 8.572068039391226e-07, |
| "loss": 123.4948, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.2295855753799679, |
| "grad_norm": 100.125, |
| "learning_rate": 8.560877350044763e-07, |
| "loss": 123.4176, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.2305925296579502, |
| "grad_norm": 101.0, |
| "learning_rate": 8.549686660698298e-07, |
| "loss": 123.8447, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.23159948393593252, |
| "grad_norm": 95.625, |
| "learning_rate": 8.538495971351835e-07, |
| "loss": 124.9102, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.23260643821391486, |
| "grad_norm": 99.375, |
| "learning_rate": 8.527305282005372e-07, |
| "loss": 123.519, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.23361339249189716, |
| "grad_norm": 97.25, |
| "learning_rate": 8.516114592658907e-07, |
| "loss": 123.2095, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.23462034676987947, |
| "grad_norm": 100.0, |
| "learning_rate": 8.504923903312443e-07, |
| "loss": 122.7344, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.2356273010478618, |
| "grad_norm": 96.5, |
| "learning_rate": 8.49373321396598e-07, |
| "loss": 122.7375, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.23663425532584412, |
| "grad_norm": 98.0625, |
| "learning_rate": 8.482542524619516e-07, |
| "loss": 124.5513, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.23764120960382643, |
| "grad_norm": 101.0625, |
| "learning_rate": 8.471351835273053e-07, |
| "loss": 123.7978, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.23864816388180873, |
| "grad_norm": 98.6875, |
| "learning_rate": 8.460161145926588e-07, |
| "loss": 122.5464, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.23965511815979107, |
| "grad_norm": 100.8125, |
| "learning_rate": 8.448970456580125e-07, |
| "loss": 123.1573, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.24066207243777338, |
| "grad_norm": 100.25, |
| "learning_rate": 8.437779767233662e-07, |
| "loss": 123.0318, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.24166902671575569, |
| "grad_norm": 95.8125, |
| "learning_rate": 8.426589077887197e-07, |
| "loss": 124.5656, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.242675980993738, |
| "grad_norm": 93.5, |
| "learning_rate": 8.415398388540734e-07, |
| "loss": 123.566, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.24368293527172033, |
| "grad_norm": 97.0, |
| "learning_rate": 8.40420769919427e-07, |
| "loss": 121.6469, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.24468988954970264, |
| "grad_norm": 99.125, |
| "learning_rate": 8.393017009847806e-07, |
| "loss": 123.3631, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.24569684382768495, |
| "grad_norm": 98.25, |
| "learning_rate": 8.381826320501343e-07, |
| "loss": 122.9663, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.24670379810566725, |
| "grad_norm": 96.8125, |
| "learning_rate": 8.370635631154878e-07, |
| "loss": 123.2738, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.2477107523836496, |
| "grad_norm": 96.0625, |
| "learning_rate": 8.359444941808415e-07, |
| "loss": 122.7263, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.2487177066616319, |
| "grad_norm": 101.1875, |
| "learning_rate": 8.348254252461951e-07, |
| "loss": 123.9187, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.2497246609396142, |
| "grad_norm": 97.625, |
| "learning_rate": 8.337063563115487e-07, |
| "loss": 121.4109, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.2507316152175965, |
| "grad_norm": 97.125, |
| "learning_rate": 8.325872873769025e-07, |
| "loss": 120.955, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.2517385694955788, |
| "grad_norm": 100.5625, |
| "learning_rate": 8.31468218442256e-07, |
| "loss": 120.8311, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.25274552377356113, |
| "grad_norm": 97.5, |
| "learning_rate": 8.303491495076096e-07, |
| "loss": 121.3494, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.2537524780515435, |
| "grad_norm": 96.75, |
| "learning_rate": 8.292300805729633e-07, |
| "loss": 122.0731, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.2547594323295258, |
| "grad_norm": 99.1875, |
| "learning_rate": 8.281110116383169e-07, |
| "loss": 123.7734, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.2557663866075081, |
| "grad_norm": 98.75, |
| "learning_rate": 8.269919427036705e-07, |
| "loss": 121.2907, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.2567733408854904, |
| "grad_norm": 94.1875, |
| "learning_rate": 8.258728737690241e-07, |
| "loss": 121.2714, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.25778029516347273, |
| "grad_norm": 97.4375, |
| "learning_rate": 8.247538048343777e-07, |
| "loss": 121.5029, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.25878724944145504, |
| "grad_norm": 99.125, |
| "learning_rate": 8.236347358997315e-07, |
| "loss": 119.4266, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.25979420371943734, |
| "grad_norm": 98.6875, |
| "learning_rate": 8.22515666965085e-07, |
| "loss": 121.0074, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.26080115799741965, |
| "grad_norm": 98.375, |
| "learning_rate": 8.213965980304386e-07, |
| "loss": 120.9616, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.261808112275402, |
| "grad_norm": 98.875, |
| "learning_rate": 8.202775290957922e-07, |
| "loss": 120.9374, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.2628150665533843, |
| "grad_norm": 98.375, |
| "learning_rate": 8.191584601611459e-07, |
| "loss": 120.8794, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.26382202083136663, |
| "grad_norm": 96.75, |
| "learning_rate": 8.180393912264996e-07, |
| "loss": 120.3623, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.26482897510934894, |
| "grad_norm": 99.8125, |
| "learning_rate": 8.169203222918531e-07, |
| "loss": 121.5525, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.26583592938733125, |
| "grad_norm": 95.8125, |
| "learning_rate": 8.158012533572067e-07, |
| "loss": 122.4818, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.26684288366531356, |
| "grad_norm": 99.3125, |
| "learning_rate": 8.146821844225605e-07, |
| "loss": 121.3006, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.26784983794329587, |
| "grad_norm": 101.125, |
| "learning_rate": 8.13563115487914e-07, |
| "loss": 119.9612, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.26885679222127823, |
| "grad_norm": 96.8125, |
| "learning_rate": 8.124440465532676e-07, |
| "loss": 121.866, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.26986374649926054, |
| "grad_norm": 95.875, |
| "learning_rate": 8.113249776186212e-07, |
| "loss": 119.8637, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.27087070077724285, |
| "grad_norm": 97.6875, |
| "learning_rate": 8.102059086839749e-07, |
| "loss": 120.1557, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.27187765505522515, |
| "grad_norm": 102.9375, |
| "learning_rate": 8.090868397493286e-07, |
| "loss": 120.6683, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.27288460933320746, |
| "grad_norm": 98.9375, |
| "learning_rate": 8.079677708146821e-07, |
| "loss": 119.1554, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.27389156361118977, |
| "grad_norm": 102.25, |
| "learning_rate": 8.068487018800357e-07, |
| "loss": 119.5964, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.2748985178891721, |
| "grad_norm": 100.5625, |
| "learning_rate": 8.057296329453895e-07, |
| "loss": 118.3494, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.2759054721671544, |
| "grad_norm": 98.375, |
| "learning_rate": 8.04610564010743e-07, |
| "loss": 120.6742, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.27691242644513675, |
| "grad_norm": 99.125, |
| "learning_rate": 8.034914950760967e-07, |
| "loss": 120.3351, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.27791938072311906, |
| "grad_norm": 97.0625, |
| "learning_rate": 8.023724261414503e-07, |
| "loss": 120.4014, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.27892633500110137, |
| "grad_norm": 97.8125, |
| "learning_rate": 8.012533572068039e-07, |
| "loss": 119.3332, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.2799332892790837, |
| "grad_norm": 98.125, |
| "learning_rate": 8.001342882721576e-07, |
| "loss": 118.5001, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.280940243557066, |
| "grad_norm": 98.625, |
| "learning_rate": 7.990152193375111e-07, |
| "loss": 119.7872, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.2819471978350483, |
| "grad_norm": 99.1875, |
| "learning_rate": 7.978961504028648e-07, |
| "loss": 120.3533, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.2829541521130306, |
| "grad_norm": 97.5625, |
| "learning_rate": 7.967770814682184e-07, |
| "loss": 119.9716, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.2839611063910129, |
| "grad_norm": 100.125, |
| "learning_rate": 7.95658012533572e-07, |
| "loss": 118.2968, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.28496806066899527, |
| "grad_norm": 98.875, |
| "learning_rate": 7.945389435989257e-07, |
| "loss": 119.1026, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.2859750149469776, |
| "grad_norm": 95.75, |
| "learning_rate": 7.934198746642793e-07, |
| "loss": 118.9463, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.2869819692249599, |
| "grad_norm": 97.5, |
| "learning_rate": 7.923008057296329e-07, |
| "loss": 118.8782, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.2879889235029422, |
| "grad_norm": 99.5625, |
| "learning_rate": 7.911817367949866e-07, |
| "loss": 119.4829, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.2889958777809245, |
| "grad_norm": 96.75, |
| "learning_rate": 7.900626678603401e-07, |
| "loss": 117.9972, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.2900028320589068, |
| "grad_norm": 101.75, |
| "learning_rate": 7.889435989256938e-07, |
| "loss": 118.6872, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.2910097863368891, |
| "grad_norm": 100.0, |
| "learning_rate": 7.878245299910474e-07, |
| "loss": 119.475, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.29201674061487143, |
| "grad_norm": 102.9375, |
| "learning_rate": 7.86705461056401e-07, |
| "loss": 118.7264, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.2930236948928538, |
| "grad_norm": 100.0, |
| "learning_rate": 7.855863921217547e-07, |
| "loss": 118.6106, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.2940306491708361, |
| "grad_norm": 98.125, |
| "learning_rate": 7.844673231871083e-07, |
| "loss": 119.1069, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.2950376034488184, |
| "grad_norm": 102.9375, |
| "learning_rate": 7.833482542524619e-07, |
| "loss": 119.3006, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.2960445577268007, |
| "grad_norm": 102.9375, |
| "learning_rate": 7.822291853178155e-07, |
| "loss": 117.8286, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.297051512004783, |
| "grad_norm": 103.6875, |
| "learning_rate": 7.811101163831691e-07, |
| "loss": 119.018, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.29805846628276533, |
| "grad_norm": 101.375, |
| "learning_rate": 7.799910474485229e-07, |
| "loss": 119.4814, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.29906542056074764, |
| "grad_norm": 99.375, |
| "learning_rate": 7.788719785138764e-07, |
| "loss": 118.5113, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.30007237483873, |
| "grad_norm": 100.875, |
| "learning_rate": 7.7775290957923e-07, |
| "loss": 120.1722, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.3010793291167123, |
| "grad_norm": 99.8125, |
| "learning_rate": 7.766338406445838e-07, |
| "loss": 117.5911, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.3020862833946946, |
| "grad_norm": 97.75, |
| "learning_rate": 7.755147717099373e-07, |
| "loss": 116.4555, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3020862833946946, |
| "eval_loss": 3.6974689960479736, |
| "eval_runtime": 239.1833, |
| "eval_samples_per_second": 1118.861, |
| "eval_steps_per_second": 34.965, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.30309323767267693, |
| "grad_norm": 99.75, |
| "learning_rate": 7.743957027752909e-07, |
| "loss": 117.8555, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.30410019195065924, |
| "grad_norm": 102.9375, |
| "learning_rate": 7.732766338406445e-07, |
| "loss": 118.8926, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.30510714622864155, |
| "grad_norm": 98.3125, |
| "learning_rate": 7.721575649059982e-07, |
| "loss": 118.4608, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.30611410050662385, |
| "grad_norm": 101.0625, |
| "learning_rate": 7.710384959713519e-07, |
| "loss": 117.194, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.30712105478460616, |
| "grad_norm": 100.0625, |
| "learning_rate": 7.699194270367054e-07, |
| "loss": 116.1222, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.3081280090625885, |
| "grad_norm": 102.625, |
| "learning_rate": 7.68800358102059e-07, |
| "loss": 116.386, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.30913496334057083, |
| "grad_norm": 99.1875, |
| "learning_rate": 7.676812891674127e-07, |
| "loss": 118.1568, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.31014191761855314, |
| "grad_norm": 102.125, |
| "learning_rate": 7.665622202327663e-07, |
| "loss": 117.6367, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.31114887189653545, |
| "grad_norm": 99.0625, |
| "learning_rate": 7.6544315129812e-07, |
| "loss": 117.9046, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.31215582617451776, |
| "grad_norm": 100.875, |
| "learning_rate": 7.643240823634735e-07, |
| "loss": 117.9566, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.31316278045250007, |
| "grad_norm": 99.5, |
| "learning_rate": 7.632050134288272e-07, |
| "loss": 115.877, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.3141697347304824, |
| "grad_norm": 102.875, |
| "learning_rate": 7.620859444941809e-07, |
| "loss": 118.0572, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.3151766890084647, |
| "grad_norm": 99.0, |
| "learning_rate": 7.609668755595344e-07, |
| "loss": 117.6729, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.31618364328644705, |
| "grad_norm": 100.0, |
| "learning_rate": 7.59847806624888e-07, |
| "loss": 116.7598, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.31719059756442936, |
| "grad_norm": 101.6875, |
| "learning_rate": 7.587287376902417e-07, |
| "loss": 116.1664, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.31819755184241166, |
| "grad_norm": 100.75, |
| "learning_rate": 7.576096687555953e-07, |
| "loss": 116.2372, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.31920450612039397, |
| "grad_norm": 101.8125, |
| "learning_rate": 7.56490599820949e-07, |
| "loss": 116.9723, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.3202114603983763, |
| "grad_norm": 101.25, |
| "learning_rate": 7.553715308863025e-07, |
| "loss": 118.3011, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.3212184146763586, |
| "grad_norm": 100.75, |
| "learning_rate": 7.542524619516562e-07, |
| "loss": 116.3056, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.3222253689543409, |
| "grad_norm": 101.625, |
| "learning_rate": 7.531333930170098e-07, |
| "loss": 116.9207, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.32323232323232326, |
| "grad_norm": 97.0625, |
| "learning_rate": 7.520143240823634e-07, |
| "loss": 116.7428, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.32423927751030557, |
| "grad_norm": 99.5625, |
| "learning_rate": 7.50895255147717e-07, |
| "loss": 115.7755, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.3252462317882879, |
| "grad_norm": 100.4375, |
| "learning_rate": 7.497761862130707e-07, |
| "loss": 117.4555, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.3262531860662702, |
| "grad_norm": 98.9375, |
| "learning_rate": 7.486571172784243e-07, |
| "loss": 114.5551, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.3272601403442525, |
| "grad_norm": 101.4375, |
| "learning_rate": 7.47538048343778e-07, |
| "loss": 115.1901, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.3282670946222348, |
| "grad_norm": 99.0, |
| "learning_rate": 7.464189794091316e-07, |
| "loss": 115.1624, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.3292740489002171, |
| "grad_norm": 97.25, |
| "learning_rate": 7.452999104744852e-07, |
| "loss": 116.2408, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.3302810031781994, |
| "grad_norm": 97.1875, |
| "learning_rate": 7.441808415398388e-07, |
| "loss": 115.4953, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.3312879574561818, |
| "grad_norm": 100.125, |
| "learning_rate": 7.430617726051924e-07, |
| "loss": 115.8885, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.3322949117341641, |
| "grad_norm": 100.0, |
| "learning_rate": 7.419427036705462e-07, |
| "loss": 114.8555, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.3333018660121464, |
| "grad_norm": 99.9375, |
| "learning_rate": 7.408236347358997e-07, |
| "loss": 117.4458, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.3343088202901287, |
| "grad_norm": 99.4375, |
| "learning_rate": 7.397045658012533e-07, |
| "loss": 115.667, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.335315774568111, |
| "grad_norm": 99.9375, |
| "learning_rate": 7.385854968666069e-07, |
| "loss": 115.4635, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.3363227288460933, |
| "grad_norm": 99.0625, |
| "learning_rate": 7.374664279319606e-07, |
| "loss": 116.1706, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.33732968312407563, |
| "grad_norm": 102.625, |
| "learning_rate": 7.363473589973142e-07, |
| "loss": 115.5255, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.33833663740205794, |
| "grad_norm": 101.25, |
| "learning_rate": 7.352282900626678e-07, |
| "loss": 116.4496, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.3393435916800403, |
| "grad_norm": 100.25, |
| "learning_rate": 7.341092211280214e-07, |
| "loss": 115.7757, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.3403505459580226, |
| "grad_norm": 98.125, |
| "learning_rate": 7.329901521933752e-07, |
| "loss": 114.8857, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.3413575002360049, |
| "grad_norm": 100.5625, |
| "learning_rate": 7.318710832587287e-07, |
| "loss": 117.0493, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.3423644545139872, |
| "grad_norm": 100.5, |
| "learning_rate": 7.307520143240823e-07, |
| "loss": 116.3239, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.34337140879196953, |
| "grad_norm": 99.9375, |
| "learning_rate": 7.296329453894359e-07, |
| "loss": 113.2169, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.34437836306995184, |
| "grad_norm": 98.6875, |
| "learning_rate": 7.285138764547896e-07, |
| "loss": 114.3095, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.34538531734793415, |
| "grad_norm": 101.0625, |
| "learning_rate": 7.273948075201433e-07, |
| "loss": 116.0238, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.34639227162591646, |
| "grad_norm": 99.375, |
| "learning_rate": 7.262757385854968e-07, |
| "loss": 115.388, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.3473992259038988, |
| "grad_norm": 103.3125, |
| "learning_rate": 7.251566696508504e-07, |
| "loss": 115.1991, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.34840618018188113, |
| "grad_norm": 102.0625, |
| "learning_rate": 7.240376007162041e-07, |
| "loss": 113.6285, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.34941313445986344, |
| "grad_norm": 102.25, |
| "learning_rate": 7.229185317815577e-07, |
| "loss": 113.7743, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.35042008873784575, |
| "grad_norm": 100.0, |
| "learning_rate": 7.217994628469113e-07, |
| "loss": 113.9617, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.35142704301582806, |
| "grad_norm": 104.25, |
| "learning_rate": 7.20680393912265e-07, |
| "loss": 115.0941, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.35243399729381036, |
| "grad_norm": 99.5, |
| "learning_rate": 7.195613249776186e-07, |
| "loss": 113.6155, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.35344095157179267, |
| "grad_norm": 101.375, |
| "learning_rate": 7.184422560429723e-07, |
| "loss": 115.6367, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.35444790584977504, |
| "grad_norm": 96.0625, |
| "learning_rate": 7.173231871083258e-07, |
| "loss": 115.1059, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.35545486012775734, |
| "grad_norm": 98.875, |
| "learning_rate": 7.162041181736795e-07, |
| "loss": 114.0925, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.35646181440573965, |
| "grad_norm": 103.625, |
| "learning_rate": 7.150850492390331e-07, |
| "loss": 114.7297, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.35746876868372196, |
| "grad_norm": 96.8125, |
| "learning_rate": 7.139659803043867e-07, |
| "loss": 114.0845, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.35847572296170427, |
| "grad_norm": 104.25, |
| "learning_rate": 7.128469113697403e-07, |
| "loss": 114.4882, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.3594826772396866, |
| "grad_norm": 101.625, |
| "learning_rate": 7.11727842435094e-07, |
| "loss": 114.6614, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.3604896315176689, |
| "grad_norm": 100.125, |
| "learning_rate": 7.106087735004476e-07, |
| "loss": 115.3073, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.3614965857956512, |
| "grad_norm": 100.3125, |
| "learning_rate": 7.094897045658012e-07, |
| "loss": 116.3388, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.36250354007363356, |
| "grad_norm": 102.6875, |
| "learning_rate": 7.083706356311548e-07, |
| "loss": 114.3609, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.36351049435161586, |
| "grad_norm": 99.0, |
| "learning_rate": 7.072515666965085e-07, |
| "loss": 113.4134, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.3645174486295982, |
| "grad_norm": 102.8125, |
| "learning_rate": 7.061324977618621e-07, |
| "loss": 113.573, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.3655244029075805, |
| "grad_norm": 102.4375, |
| "learning_rate": 7.050134288272157e-07, |
| "loss": 114.1263, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.3665313571855628, |
| "grad_norm": 104.0, |
| "learning_rate": 7.038943598925694e-07, |
| "loss": 114.0493, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.3675383114635451, |
| "grad_norm": 101.75, |
| "learning_rate": 7.02775290957923e-07, |
| "loss": 113.4941, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.3685452657415274, |
| "grad_norm": 100.5625, |
| "learning_rate": 7.016562220232766e-07, |
| "loss": 114.3765, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.3695522200195097, |
| "grad_norm": 100.3125, |
| "learning_rate": 7.005371530886302e-07, |
| "loss": 113.0758, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.3705591742974921, |
| "grad_norm": 100.875, |
| "learning_rate": 6.994180841539838e-07, |
| "loss": 112.9355, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.3715661285754744, |
| "grad_norm": 99.8125, |
| "learning_rate": 6.982990152193375e-07, |
| "loss": 113.0069, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.3725730828534567, |
| "grad_norm": 104.9375, |
| "learning_rate": 6.971799462846911e-07, |
| "loss": 114.7156, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.373580037131439, |
| "grad_norm": 103.0, |
| "learning_rate": 6.960608773500447e-07, |
| "loss": 112.6265, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.3745869914094213, |
| "grad_norm": 101.25, |
| "learning_rate": 6.949418084153985e-07, |
| "loss": 112.8266, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.3755939456874036, |
| "grad_norm": 101.3125, |
| "learning_rate": 6.93822739480752e-07, |
| "loss": 112.4899, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.3766008999653859, |
| "grad_norm": 99.75, |
| "learning_rate": 6.927036705461056e-07, |
| "loss": 113.6427, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.37760785424336824, |
| "grad_norm": 99.5, |
| "learning_rate": 6.915846016114592e-07, |
| "loss": 113.0808, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.3786148085213506, |
| "grad_norm": 101.375, |
| "learning_rate": 6.904655326768129e-07, |
| "loss": 113.0404, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.3796217627993329, |
| "grad_norm": 103.125, |
| "learning_rate": 6.893464637421666e-07, |
| "loss": 110.9456, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.3806287170773152, |
| "grad_norm": 100.6875, |
| "learning_rate": 6.882273948075201e-07, |
| "loss": 112.3675, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.3816356713552975, |
| "grad_norm": 102.3125, |
| "learning_rate": 6.871083258728737e-07, |
| "loss": 112.1686, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.38264262563327983, |
| "grad_norm": 98.3125, |
| "learning_rate": 6.859892569382274e-07, |
| "loss": 114.4239, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.38364957991126214, |
| "grad_norm": 101.375, |
| "learning_rate": 6.84870188003581e-07, |
| "loss": 111.5497, |
| "step": 1905 |
| }, |
| { |
| "epoch": 0.38465653418924445, |
| "grad_norm": 102.1875, |
| "learning_rate": 6.837511190689346e-07, |
| "loss": 111.1532, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.3856634884672268, |
| "grad_norm": 100.875, |
| "learning_rate": 6.826320501342882e-07, |
| "loss": 112.9669, |
| "step": 1915 |
| }, |
| { |
| "epoch": 0.3866704427452091, |
| "grad_norm": 101.625, |
| "learning_rate": 6.815129811996419e-07, |
| "loss": 113.497, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.38767739702319143, |
| "grad_norm": 99.9375, |
| "learning_rate": 6.803939122649956e-07, |
| "loss": 113.1546, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.38868435130117374, |
| "grad_norm": 103.5625, |
| "learning_rate": 6.792748433303491e-07, |
| "loss": 111.5201, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.38969130557915604, |
| "grad_norm": 101.3125, |
| "learning_rate": 6.781557743957027e-07, |
| "loss": 112.034, |
| "step": 1935 |
| }, |
| { |
| "epoch": 0.39069825985713835, |
| "grad_norm": 101.875, |
| "learning_rate": 6.770367054610564e-07, |
| "loss": 109.9169, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.39170521413512066, |
| "grad_norm": 97.9375, |
| "learning_rate": 6.7591763652641e-07, |
| "loss": 113.2691, |
| "step": 1945 |
| }, |
| { |
| "epoch": 0.39271216841310297, |
| "grad_norm": 103.375, |
| "learning_rate": 6.747985675917636e-07, |
| "loss": 112.5936, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.39371912269108533, |
| "grad_norm": 97.1875, |
| "learning_rate": 6.736794986571172e-07, |
| "loss": 112.2075, |
| "step": 1955 |
| }, |
| { |
| "epoch": 0.39472607696906764, |
| "grad_norm": 103.25, |
| "learning_rate": 6.725604297224709e-07, |
| "loss": 114.289, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.39573303124704995, |
| "grad_norm": 100.3125, |
| "learning_rate": 6.714413607878245e-07, |
| "loss": 111.8496, |
| "step": 1965 |
| }, |
| { |
| "epoch": 0.39673998552503226, |
| "grad_norm": 100.9375, |
| "learning_rate": 6.703222918531781e-07, |
| "loss": 112.454, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.39774693980301457, |
| "grad_norm": 103.375, |
| "learning_rate": 6.692032229185317e-07, |
| "loss": 112.7298, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.3987538940809969, |
| "grad_norm": 100.0, |
| "learning_rate": 6.680841539838854e-07, |
| "loss": 112.2657, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.3997608483589792, |
| "grad_norm": 102.875, |
| "learning_rate": 6.66965085049239e-07, |
| "loss": 110.3278, |
| "step": 1985 |
| }, |
| { |
| "epoch": 0.4007678026369615, |
| "grad_norm": 102.1875, |
| "learning_rate": 6.658460161145926e-07, |
| "loss": 112.1858, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.40177475691494385, |
| "grad_norm": 100.0, |
| "learning_rate": 6.647269471799463e-07, |
| "loss": 112.2546, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.40278171119292616, |
| "grad_norm": 99.25, |
| "learning_rate": 6.636078782452999e-07, |
| "loss": 112.8675, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.40278171119292616, |
| "eval_loss": 3.4998972415924072, |
| "eval_runtime": 241.0777, |
| "eval_samples_per_second": 1110.069, |
| "eval_steps_per_second": 34.69, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.40378866547090847, |
| "grad_norm": 104.3125, |
| "learning_rate": 6.624888093106535e-07, |
| "loss": 112.4003, |
| "step": 2005 |
| }, |
| { |
| "epoch": 0.4047956197488908, |
| "grad_norm": 103.5, |
| "learning_rate": 6.613697403760071e-07, |
| "loss": 112.6595, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.4058025740268731, |
| "grad_norm": 103.75, |
| "learning_rate": 6.602506714413608e-07, |
| "loss": 112.9923, |
| "step": 2015 |
| }, |
| { |
| "epoch": 0.4068095283048554, |
| "grad_norm": 103.4375, |
| "learning_rate": 6.591316025067144e-07, |
| "loss": 112.2758, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.4078164825828377, |
| "grad_norm": 103.5625, |
| "learning_rate": 6.58012533572068e-07, |
| "loss": 111.0822, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.40882343686082, |
| "grad_norm": 103.6875, |
| "learning_rate": 6.568934646374216e-07, |
| "loss": 113.0516, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.4098303911388024, |
| "grad_norm": 103.8125, |
| "learning_rate": 6.557743957027753e-07, |
| "loss": 113.0746, |
| "step": 2035 |
| }, |
| { |
| "epoch": 0.4108373454167847, |
| "grad_norm": 103.25, |
| "learning_rate": 6.546553267681289e-07, |
| "loss": 110.6382, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.411844299694767, |
| "grad_norm": 101.625, |
| "learning_rate": 6.535362578334825e-07, |
| "loss": 113.3481, |
| "step": 2045 |
| }, |
| { |
| "epoch": 0.4128512539727493, |
| "grad_norm": 100.3125, |
| "learning_rate": 6.524171888988361e-07, |
| "loss": 111.0591, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.4138582082507316, |
| "grad_norm": 104.5, |
| "learning_rate": 6.512981199641899e-07, |
| "loss": 110.473, |
| "step": 2055 |
| }, |
| { |
| "epoch": 0.4148651625287139, |
| "grad_norm": 99.25, |
| "learning_rate": 6.501790510295434e-07, |
| "loss": 110.724, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.4158721168066962, |
| "grad_norm": 103.375, |
| "learning_rate": 6.49059982094897e-07, |
| "loss": 113.1481, |
| "step": 2065 |
| }, |
| { |
| "epoch": 0.4168790710846786, |
| "grad_norm": 101.3125, |
| "learning_rate": 6.479409131602506e-07, |
| "loss": 111.1849, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.4178860253626609, |
| "grad_norm": 101.8125, |
| "learning_rate": 6.468218442256043e-07, |
| "loss": 111.4414, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.4188929796406432, |
| "grad_norm": 104.3125, |
| "learning_rate": 6.457027752909579e-07, |
| "loss": 110.1911, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.4198999339186255, |
| "grad_norm": 99.6875, |
| "learning_rate": 6.445837063563115e-07, |
| "loss": 112.5238, |
| "step": 2085 |
| }, |
| { |
| "epoch": 0.4209068881966078, |
| "grad_norm": 104.1875, |
| "learning_rate": 6.434646374216651e-07, |
| "loss": 111.1536, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.42191384247459013, |
| "grad_norm": 101.8125, |
| "learning_rate": 6.423455684870188e-07, |
| "loss": 110.8878, |
| "step": 2095 |
| }, |
| { |
| "epoch": 0.42292079675257244, |
| "grad_norm": 102.875, |
| "learning_rate": 6.412264995523724e-07, |
| "loss": 111.2218, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.42392775103055474, |
| "grad_norm": 102.25, |
| "learning_rate": 6.40107430617726e-07, |
| "loss": 110.9818, |
| "step": 2105 |
| }, |
| { |
| "epoch": 0.4249347053085371, |
| "grad_norm": 100.5625, |
| "learning_rate": 6.389883616830797e-07, |
| "loss": 111.3958, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.4259416595865194, |
| "grad_norm": 103.3125, |
| "learning_rate": 6.378692927484333e-07, |
| "loss": 112.1829, |
| "step": 2115 |
| }, |
| { |
| "epoch": 0.4269486138645017, |
| "grad_norm": 103.0625, |
| "learning_rate": 6.367502238137868e-07, |
| "loss": 111.266, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.42795556814248403, |
| "grad_norm": 103.375, |
| "learning_rate": 6.356311548791405e-07, |
| "loss": 110.5654, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.42896252242046634, |
| "grad_norm": 101.4375, |
| "learning_rate": 6.345120859444942e-07, |
| "loss": 110.188, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.42996947669844865, |
| "grad_norm": 101.25, |
| "learning_rate": 6.333930170098478e-07, |
| "loss": 110.1502, |
| "step": 2135 |
| }, |
| { |
| "epoch": 0.43097643097643096, |
| "grad_norm": 103.125, |
| "learning_rate": 6.322739480752014e-07, |
| "loss": 110.9768, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.43198338525441327, |
| "grad_norm": 101.1875, |
| "learning_rate": 6.31154879140555e-07, |
| "loss": 109.8121, |
| "step": 2145 |
| }, |
| { |
| "epoch": 0.43299033953239563, |
| "grad_norm": 101.4375, |
| "learning_rate": 6.300358102059087e-07, |
| "loss": 109.5695, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.43399729381037794, |
| "grad_norm": 100.625, |
| "learning_rate": 6.289167412712623e-07, |
| "loss": 110.7248, |
| "step": 2155 |
| }, |
| { |
| "epoch": 0.43500424808836025, |
| "grad_norm": 103.375, |
| "learning_rate": 6.277976723366159e-07, |
| "loss": 110.0195, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.43601120236634255, |
| "grad_norm": 97.375, |
| "learning_rate": 6.266786034019695e-07, |
| "loss": 110.2418, |
| "step": 2165 |
| }, |
| { |
| "epoch": 0.43701815664432486, |
| "grad_norm": 102.4375, |
| "learning_rate": 6.255595344673232e-07, |
| "loss": 109.4576, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.43802511092230717, |
| "grad_norm": 104.25, |
| "learning_rate": 6.244404655326768e-07, |
| "loss": 109.8756, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.4390320652002895, |
| "grad_norm": 103.375, |
| "learning_rate": 6.233213965980304e-07, |
| "loss": 111.7631, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.44003901947827184, |
| "grad_norm": 105.125, |
| "learning_rate": 6.222023276633839e-07, |
| "loss": 110.7248, |
| "step": 2185 |
| }, |
| { |
| "epoch": 0.44104597375625415, |
| "grad_norm": 98.0625, |
| "learning_rate": 6.210832587287377e-07, |
| "loss": 109.8976, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.44205292803423646, |
| "grad_norm": 101.0, |
| "learning_rate": 6.199641897940913e-07, |
| "loss": 111.2177, |
| "step": 2195 |
| }, |
| { |
| "epoch": 0.44305988231221877, |
| "grad_norm": 99.5, |
| "learning_rate": 6.188451208594449e-07, |
| "loss": 110.6901, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.4440668365902011, |
| "grad_norm": 102.5, |
| "learning_rate": 6.177260519247985e-07, |
| "loss": 110.3281, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.4450737908681834, |
| "grad_norm": 103.375, |
| "learning_rate": 6.166069829901522e-07, |
| "loss": 110.1482, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.4460807451461657, |
| "grad_norm": 101.125, |
| "learning_rate": 6.154879140555058e-07, |
| "loss": 110.9915, |
| "step": 2215 |
| }, |
| { |
| "epoch": 0.447087699424148, |
| "grad_norm": 101.5625, |
| "learning_rate": 6.143688451208594e-07, |
| "loss": 109.2988, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.44809465370213036, |
| "grad_norm": 101.5625, |
| "learning_rate": 6.132497761862131e-07, |
| "loss": 110.0758, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.44910160798011267, |
| "grad_norm": 99.9375, |
| "learning_rate": 6.121307072515667e-07, |
| "loss": 108.8493, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.450108562258095, |
| "grad_norm": 100.4375, |
| "learning_rate": 6.110116383169203e-07, |
| "loss": 108.6601, |
| "step": 2235 |
| }, |
| { |
| "epoch": 0.4511155165360773, |
| "grad_norm": 102.9375, |
| "learning_rate": 6.098925693822739e-07, |
| "loss": 109.1784, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.4521224708140596, |
| "grad_norm": 101.0625, |
| "learning_rate": 6.087735004476276e-07, |
| "loss": 109.5287, |
| "step": 2245 |
| }, |
| { |
| "epoch": 0.4531294250920419, |
| "grad_norm": 101.0, |
| "learning_rate": 6.076544315129812e-07, |
| "loss": 110.2235, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.4541363793700242, |
| "grad_norm": 106.1875, |
| "learning_rate": 6.065353625783348e-07, |
| "loss": 109.2282, |
| "step": 2255 |
| }, |
| { |
| "epoch": 0.4551433336480065, |
| "grad_norm": 103.75, |
| "learning_rate": 6.054162936436884e-07, |
| "loss": 108.3096, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.4561502879259889, |
| "grad_norm": 96.9375, |
| "learning_rate": 6.042972247090421e-07, |
| "loss": 109.567, |
| "step": 2265 |
| }, |
| { |
| "epoch": 0.4571572422039712, |
| "grad_norm": 103.875, |
| "learning_rate": 6.031781557743957e-07, |
| "loss": 108.5729, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.4581641964819535, |
| "grad_norm": 101.4375, |
| "learning_rate": 6.020590868397493e-07, |
| "loss": 108.8917, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.4591711507599358, |
| "grad_norm": 104.4375, |
| "learning_rate": 6.009400179051029e-07, |
| "loss": 109.6874, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.4601781050379181, |
| "grad_norm": 101.0625, |
| "learning_rate": 5.998209489704566e-07, |
| "loss": 109.5204, |
| "step": 2285 |
| }, |
| { |
| "epoch": 0.4611850593159004, |
| "grad_norm": 100.5625, |
| "learning_rate": 5.987018800358101e-07, |
| "loss": 108.9875, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.46219201359388273, |
| "grad_norm": 103.25, |
| "learning_rate": 5.975828111011638e-07, |
| "loss": 108.4358, |
| "step": 2295 |
| }, |
| { |
| "epoch": 0.46319896787186504, |
| "grad_norm": 102.4375, |
| "learning_rate": 5.964637421665174e-07, |
| "loss": 109.1291, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.4642059221498474, |
| "grad_norm": 101.0, |
| "learning_rate": 5.953446732318711e-07, |
| "loss": 108.4389, |
| "step": 2305 |
| }, |
| { |
| "epoch": 0.4652128764278297, |
| "grad_norm": 101.875, |
| "learning_rate": 5.942256042972247e-07, |
| "loss": 109.845, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.466219830705812, |
| "grad_norm": 102.25, |
| "learning_rate": 5.931065353625782e-07, |
| "loss": 108.6821, |
| "step": 2315 |
| }, |
| { |
| "epoch": 0.46722678498379433, |
| "grad_norm": 102.625, |
| "learning_rate": 5.919874664279319e-07, |
| "loss": 109.3875, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.46823373926177664, |
| "grad_norm": 101.5625, |
| "learning_rate": 5.908683974932856e-07, |
| "loss": 108.3035, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.46924069353975895, |
| "grad_norm": 100.375, |
| "learning_rate": 5.897493285586392e-07, |
| "loss": 107.0217, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.47024764781774125, |
| "grad_norm": 105.0, |
| "learning_rate": 5.886302596239928e-07, |
| "loss": 110.8466, |
| "step": 2335 |
| }, |
| { |
| "epoch": 0.4712546020957236, |
| "grad_norm": 103.5, |
| "learning_rate": 5.875111906893464e-07, |
| "loss": 109.4547, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.4722615563737059, |
| "grad_norm": 100.9375, |
| "learning_rate": 5.863921217547001e-07, |
| "loss": 107.5557, |
| "step": 2345 |
| }, |
| { |
| "epoch": 0.47326851065168823, |
| "grad_norm": 102.1875, |
| "learning_rate": 5.852730528200537e-07, |
| "loss": 109.2353, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.47427546492967054, |
| "grad_norm": 101.5, |
| "learning_rate": 5.841539838854072e-07, |
| "loss": 108.1182, |
| "step": 2355 |
| }, |
| { |
| "epoch": 0.47528241920765285, |
| "grad_norm": 102.6875, |
| "learning_rate": 5.83034914950761e-07, |
| "loss": 107.2573, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.47628937348563516, |
| "grad_norm": 102.1875, |
| "learning_rate": 5.819158460161146e-07, |
| "loss": 108.402, |
| "step": 2365 |
| }, |
| { |
| "epoch": 0.47729632776361747, |
| "grad_norm": 101.5, |
| "learning_rate": 5.807967770814682e-07, |
| "loss": 108.3766, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.4783032820415998, |
| "grad_norm": 104.0, |
| "learning_rate": 5.796777081468218e-07, |
| "loss": 108.5534, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.47931023631958214, |
| "grad_norm": 100.3125, |
| "learning_rate": 5.785586392121755e-07, |
| "loss": 109.6284, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.48031719059756445, |
| "grad_norm": 105.5, |
| "learning_rate": 5.774395702775291e-07, |
| "loss": 108.7794, |
| "step": 2385 |
| }, |
| { |
| "epoch": 0.48132414487554676, |
| "grad_norm": 104.375, |
| "learning_rate": 5.763205013428827e-07, |
| "loss": 107.7422, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.48233109915352906, |
| "grad_norm": 102.0625, |
| "learning_rate": 5.752014324082363e-07, |
| "loss": 108.9189, |
| "step": 2395 |
| }, |
| { |
| "epoch": 0.48333805343151137, |
| "grad_norm": 100.1875, |
| "learning_rate": 5.7408236347359e-07, |
| "loss": 107.5696, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.4843450077094937, |
| "grad_norm": 102.6875, |
| "learning_rate": 5.729632945389436e-07, |
| "loss": 108.2401, |
| "step": 2405 |
| }, |
| { |
| "epoch": 0.485351961987476, |
| "grad_norm": 101.625, |
| "learning_rate": 5.718442256042972e-07, |
| "loss": 107.8118, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.4863589162654583, |
| "grad_norm": 101.75, |
| "learning_rate": 5.707251566696508e-07, |
| "loss": 108.2832, |
| "step": 2415 |
| }, |
| { |
| "epoch": 0.48736587054344066, |
| "grad_norm": 102.375, |
| "learning_rate": 5.696060877350044e-07, |
| "loss": 109.5226, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.48837282482142297, |
| "grad_norm": 107.625, |
| "learning_rate": 5.684870188003581e-07, |
| "loss": 109.3006, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.4893797790994053, |
| "grad_norm": 102.3125, |
| "learning_rate": 5.673679498657117e-07, |
| "loss": 108.7192, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.4903867333773876, |
| "grad_norm": 102.6875, |
| "learning_rate": 5.662488809310653e-07, |
| "loss": 106.2096, |
| "step": 2435 |
| }, |
| { |
| "epoch": 0.4913936876553699, |
| "grad_norm": 101.3125, |
| "learning_rate": 5.65129811996419e-07, |
| "loss": 106.725, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.4924006419333522, |
| "grad_norm": 100.1875, |
| "learning_rate": 5.640107430617726e-07, |
| "loss": 107.4696, |
| "step": 2445 |
| }, |
| { |
| "epoch": 0.4934075962113345, |
| "grad_norm": 102.8125, |
| "learning_rate": 5.628916741271262e-07, |
| "loss": 108.2278, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.4944145504893168, |
| "grad_norm": 100.0625, |
| "learning_rate": 5.617726051924798e-07, |
| "loss": 108.2526, |
| "step": 2455 |
| }, |
| { |
| "epoch": 0.4954215047672992, |
| "grad_norm": 100.875, |
| "learning_rate": 5.606535362578334e-07, |
| "loss": 108.8548, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.4964284590452815, |
| "grad_norm": 103.4375, |
| "learning_rate": 5.595344673231871e-07, |
| "loss": 107.2892, |
| "step": 2465 |
| }, |
| { |
| "epoch": 0.4974354133232638, |
| "grad_norm": 101.6875, |
| "learning_rate": 5.584153983885407e-07, |
| "loss": 106.3791, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.4984423676012461, |
| "grad_norm": 99.0625, |
| "learning_rate": 5.572963294538944e-07, |
| "loss": 108.912, |
| "step": 2475 |
| }, |
| { |
| "epoch": 0.4994493218792284, |
| "grad_norm": 103.5625, |
| "learning_rate": 5.56177260519248e-07, |
| "loss": 108.9683, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.5004562761572108, |
| "grad_norm": 101.6875, |
| "learning_rate": 5.550581915846015e-07, |
| "loss": 106.8892, |
| "step": 2485 |
| }, |
| { |
| "epoch": 0.501463230435193, |
| "grad_norm": 106.3125, |
| "learning_rate": 5.539391226499552e-07, |
| "loss": 107.4377, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.5024701847131754, |
| "grad_norm": 101.625, |
| "learning_rate": 5.528200537153089e-07, |
| "loss": 107.6573, |
| "step": 2495 |
| }, |
| { |
| "epoch": 0.5034771389911576, |
| "grad_norm": 99.375, |
| "learning_rate": 5.517009847806625e-07, |
| "loss": 108.9813, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.5034771389911576, |
| "eval_loss": 3.3530521392822266, |
| "eval_runtime": 240.5322, |
| "eval_samples_per_second": 1112.587, |
| "eval_steps_per_second": 34.769, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.50448409326914, |
| "grad_norm": 102.0, |
| "learning_rate": 5.505819158460161e-07, |
| "loss": 107.6658, |
| "step": 2505 |
| }, |
| { |
| "epoch": 0.5054910475471223, |
| "grad_norm": 102.1875, |
| "learning_rate": 5.494628469113697e-07, |
| "loss": 107.6049, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.5064980018251046, |
| "grad_norm": 99.25, |
| "learning_rate": 5.483437779767234e-07, |
| "loss": 106.7267, |
| "step": 2515 |
| }, |
| { |
| "epoch": 0.507504956103087, |
| "grad_norm": 102.125, |
| "learning_rate": 5.47224709042077e-07, |
| "loss": 106.255, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.5085119103810692, |
| "grad_norm": 100.875, |
| "learning_rate": 5.461056401074305e-07, |
| "loss": 107.4151, |
| "step": 2525 |
| }, |
| { |
| "epoch": 0.5095188646590516, |
| "grad_norm": 103.375, |
| "learning_rate": 5.449865711727842e-07, |
| "loss": 107.2014, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.5105258189370339, |
| "grad_norm": 101.75, |
| "learning_rate": 5.438675022381379e-07, |
| "loss": 106.2904, |
| "step": 2535 |
| }, |
| { |
| "epoch": 0.5115327732150162, |
| "grad_norm": 101.6875, |
| "learning_rate": 5.427484333034915e-07, |
| "loss": 105.7796, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.5125397274929985, |
| "grad_norm": 100.5, |
| "learning_rate": 5.416293643688451e-07, |
| "loss": 107.3293, |
| "step": 2545 |
| }, |
| { |
| "epoch": 0.5135466817709808, |
| "grad_norm": 102.9375, |
| "learning_rate": 5.405102954341986e-07, |
| "loss": 105.9381, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.5145536360489632, |
| "grad_norm": 103.8125, |
| "learning_rate": 5.393912264995524e-07, |
| "loss": 106.3883, |
| "step": 2555 |
| }, |
| { |
| "epoch": 0.5155605903269455, |
| "grad_norm": 104.625, |
| "learning_rate": 5.38272157564906e-07, |
| "loss": 106.8838, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.5165675446049278, |
| "grad_norm": 100.375, |
| "learning_rate": 5.371530886302596e-07, |
| "loss": 104.9295, |
| "step": 2565 |
| }, |
| { |
| "epoch": 0.5175744988829101, |
| "grad_norm": 104.3125, |
| "learning_rate": 5.360340196956132e-07, |
| "loss": 105.8216, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.5185814531608924, |
| "grad_norm": 103.8125, |
| "learning_rate": 5.349149507609669e-07, |
| "loss": 107.064, |
| "step": 2575 |
| }, |
| { |
| "epoch": 0.5195884074388747, |
| "grad_norm": 102.25, |
| "learning_rate": 5.337958818263205e-07, |
| "loss": 107.4001, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.520595361716857, |
| "grad_norm": 104.75, |
| "learning_rate": 5.326768128916741e-07, |
| "loss": 105.6752, |
| "step": 2585 |
| }, |
| { |
| "epoch": 0.5216023159948393, |
| "grad_norm": 105.1875, |
| "learning_rate": 5.315577439570276e-07, |
| "loss": 106.9133, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.5226092702728217, |
| "grad_norm": 101.875, |
| "learning_rate": 5.304386750223814e-07, |
| "loss": 106.7293, |
| "step": 2595 |
| }, |
| { |
| "epoch": 0.523616224550804, |
| "grad_norm": 100.625, |
| "learning_rate": 5.29319606087735e-07, |
| "loss": 105.289, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.5246231788287863, |
| "grad_norm": 102.625, |
| "learning_rate": 5.282005371530886e-07, |
| "loss": 107.3164, |
| "step": 2605 |
| }, |
| { |
| "epoch": 0.5256301331067686, |
| "grad_norm": 104.75, |
| "learning_rate": 5.270814682184423e-07, |
| "loss": 107.9815, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.5266370873847509, |
| "grad_norm": 102.625, |
| "learning_rate": 5.259623992837958e-07, |
| "loss": 106.1467, |
| "step": 2615 |
| }, |
| { |
| "epoch": 0.5276440416627333, |
| "grad_norm": 106.6875, |
| "learning_rate": 5.248433303491495e-07, |
| "loss": 106.5354, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.5286509959407155, |
| "grad_norm": 104.1875, |
| "learning_rate": 5.237242614145031e-07, |
| "loss": 106.1924, |
| "step": 2625 |
| }, |
| { |
| "epoch": 0.5296579502186979, |
| "grad_norm": 102.625, |
| "learning_rate": 5.226051924798567e-07, |
| "loss": 107.1888, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.5306649044966802, |
| "grad_norm": 99.5625, |
| "learning_rate": 5.214861235452104e-07, |
| "loss": 105.9879, |
| "step": 2635 |
| }, |
| { |
| "epoch": 0.5316718587746625, |
| "grad_norm": 100.625, |
| "learning_rate": 5.20367054610564e-07, |
| "loss": 105.3448, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.5326788130526449, |
| "grad_norm": 103.875, |
| "learning_rate": 5.192479856759176e-07, |
| "loss": 107.7056, |
| "step": 2645 |
| }, |
| { |
| "epoch": 0.5336857673306271, |
| "grad_norm": 102.875, |
| "learning_rate": 5.181289167412713e-07, |
| "loss": 105.9089, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.5346927216086095, |
| "grad_norm": 103.6875, |
| "learning_rate": 5.170098478066248e-07, |
| "loss": 106.6165, |
| "step": 2655 |
| }, |
| { |
| "epoch": 0.5356996758865917, |
| "grad_norm": 100.5625, |
| "learning_rate": 5.158907788719785e-07, |
| "loss": 106.0283, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.5367066301645741, |
| "grad_norm": 105.0, |
| "learning_rate": 5.147717099373321e-07, |
| "loss": 106.3744, |
| "step": 2665 |
| }, |
| { |
| "epoch": 0.5377135844425565, |
| "grad_norm": 107.625, |
| "learning_rate": 5.136526410026858e-07, |
| "loss": 105.7139, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.5387205387205387, |
| "grad_norm": 103.125, |
| "learning_rate": 5.125335720680394e-07, |
| "loss": 106.4616, |
| "step": 2675 |
| }, |
| { |
| "epoch": 0.5397274929985211, |
| "grad_norm": 99.125, |
| "learning_rate": 5.114145031333929e-07, |
| "loss": 103.4881, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.5407344472765033, |
| "grad_norm": 101.1875, |
| "learning_rate": 5.102954341987466e-07, |
| "loss": 105.4083, |
| "step": 2685 |
| }, |
| { |
| "epoch": 0.5417414015544857, |
| "grad_norm": 103.5, |
| "learning_rate": 5.091763652641003e-07, |
| "loss": 105.16, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.5427483558324679, |
| "grad_norm": 104.3125, |
| "learning_rate": 5.080572963294538e-07, |
| "loss": 105.9961, |
| "step": 2695 |
| }, |
| { |
| "epoch": 0.5437553101104503, |
| "grad_norm": 101.5625, |
| "learning_rate": 5.069382273948075e-07, |
| "loss": 103.5584, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.5447622643884326, |
| "grad_norm": 103.0, |
| "learning_rate": 5.058191584601611e-07, |
| "loss": 105.5014, |
| "step": 2705 |
| }, |
| { |
| "epoch": 0.5457692186664149, |
| "grad_norm": 103.75, |
| "learning_rate": 5.047000895255148e-07, |
| "loss": 106.6602, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.5467761729443973, |
| "grad_norm": 105.125, |
| "learning_rate": 5.035810205908684e-07, |
| "loss": 107.0629, |
| "step": 2715 |
| }, |
| { |
| "epoch": 0.5477831272223795, |
| "grad_norm": 103.75, |
| "learning_rate": 5.024619516562219e-07, |
| "loss": 103.3805, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.5487900815003619, |
| "grad_norm": 102.5, |
| "learning_rate": 5.013428827215757e-07, |
| "loss": 105.2457, |
| "step": 2725 |
| }, |
| { |
| "epoch": 0.5497970357783442, |
| "grad_norm": 100.8125, |
| "learning_rate": 5.002238137869293e-07, |
| "loss": 104.6135, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.5508039900563265, |
| "grad_norm": 103.0625, |
| "learning_rate": 4.991047448522829e-07, |
| "loss": 104.9963, |
| "step": 2735 |
| }, |
| { |
| "epoch": 0.5518109443343088, |
| "grad_norm": 100.4375, |
| "learning_rate": 4.979856759176365e-07, |
| "loss": 105.1985, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.5528178986122911, |
| "grad_norm": 101.5, |
| "learning_rate": 4.968666069829902e-07, |
| "loss": 106.8182, |
| "step": 2745 |
| }, |
| { |
| "epoch": 0.5538248528902735, |
| "grad_norm": 102.0, |
| "learning_rate": 4.957475380483437e-07, |
| "loss": 105.6893, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.5548318071682558, |
| "grad_norm": 101.625, |
| "learning_rate": 4.946284691136974e-07, |
| "loss": 104.0999, |
| "step": 2755 |
| }, |
| { |
| "epoch": 0.5558387614462381, |
| "grad_norm": 102.6875, |
| "learning_rate": 4.93509400179051e-07, |
| "loss": 106.6457, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.5568457157242204, |
| "grad_norm": 102.5625, |
| "learning_rate": 4.923903312444047e-07, |
| "loss": 105.8887, |
| "step": 2765 |
| }, |
| { |
| "epoch": 0.5578526700022027, |
| "grad_norm": 101.375, |
| "learning_rate": 4.912712623097583e-07, |
| "loss": 106.124, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.558859624280185, |
| "grad_norm": 104.125, |
| "learning_rate": 4.901521933751119e-07, |
| "loss": 105.8778, |
| "step": 2775 |
| }, |
| { |
| "epoch": 0.5598665785581673, |
| "grad_norm": 101.5625, |
| "learning_rate": 4.890331244404655e-07, |
| "loss": 104.9353, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.5608735328361497, |
| "grad_norm": 103.625, |
| "learning_rate": 4.879140555058191e-07, |
| "loss": 105.6354, |
| "step": 2785 |
| }, |
| { |
| "epoch": 0.561880487114132, |
| "grad_norm": 103.875, |
| "learning_rate": 4.867949865711727e-07, |
| "loss": 104.5165, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.5628874413921143, |
| "grad_norm": 106.4375, |
| "learning_rate": 4.856759176365264e-07, |
| "loss": 106.2399, |
| "step": 2795 |
| }, |
| { |
| "epoch": 0.5638943956700966, |
| "grad_norm": 104.25, |
| "learning_rate": 4.8455684870188e-07, |
| "loss": 105.9224, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.564901349948079, |
| "grad_norm": 102.75, |
| "learning_rate": 4.834377797672337e-07, |
| "loss": 105.3704, |
| "step": 2805 |
| }, |
| { |
| "epoch": 0.5659083042260612, |
| "grad_norm": 100.1875, |
| "learning_rate": 4.823187108325872e-07, |
| "loss": 104.1296, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.5669152585040436, |
| "grad_norm": 103.9375, |
| "learning_rate": 4.811996418979409e-07, |
| "loss": 104.9443, |
| "step": 2815 |
| }, |
| { |
| "epoch": 0.5679222127820258, |
| "grad_norm": 103.8125, |
| "learning_rate": 4.800805729632945e-07, |
| "loss": 105.7805, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.5689291670600082, |
| "grad_norm": 101.0625, |
| "learning_rate": 4.789615040286481e-07, |
| "loss": 104.9703, |
| "step": 2825 |
| }, |
| { |
| "epoch": 0.5699361213379905, |
| "grad_norm": 102.25, |
| "learning_rate": 4.778424350940018e-07, |
| "loss": 105.2172, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.5709430756159728, |
| "grad_norm": 102.875, |
| "learning_rate": 4.7672336615935536e-07, |
| "loss": 105.6472, |
| "step": 2835 |
| }, |
| { |
| "epoch": 0.5719500298939552, |
| "grad_norm": 100.25, |
| "learning_rate": 4.75604297224709e-07, |
| "loss": 104.7867, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.5729569841719374, |
| "grad_norm": 107.125, |
| "learning_rate": 4.7448522829006263e-07, |
| "loss": 105.9132, |
| "step": 2845 |
| }, |
| { |
| "epoch": 0.5739639384499198, |
| "grad_norm": 102.375, |
| "learning_rate": 4.733661593554163e-07, |
| "loss": 104.1496, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.574970892727902, |
| "grad_norm": 101.6875, |
| "learning_rate": 4.7224709042076985e-07, |
| "loss": 103.5616, |
| "step": 2855 |
| }, |
| { |
| "epoch": 0.5759778470058844, |
| "grad_norm": 103.6875, |
| "learning_rate": 4.711280214861235e-07, |
| "loss": 104.5214, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.5769848012838668, |
| "grad_norm": 101.6875, |
| "learning_rate": 4.700089525514772e-07, |
| "loss": 106.2387, |
| "step": 2865 |
| }, |
| { |
| "epoch": 0.577991755561849, |
| "grad_norm": 100.75, |
| "learning_rate": 4.688898836168308e-07, |
| "loss": 105.0707, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.5789987098398314, |
| "grad_norm": 105.5, |
| "learning_rate": 4.677708146821844e-07, |
| "loss": 105.1878, |
| "step": 2875 |
| }, |
| { |
| "epoch": 0.5800056641178136, |
| "grad_norm": 103.375, |
| "learning_rate": 4.66651745747538e-07, |
| "loss": 104.6173, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.581012618395796, |
| "grad_norm": 103.75, |
| "learning_rate": 4.655326768128917e-07, |
| "loss": 103.618, |
| "step": 2885 |
| }, |
| { |
| "epoch": 0.5820195726737782, |
| "grad_norm": 104.3125, |
| "learning_rate": 4.644136078782453e-07, |
| "loss": 103.7888, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.5830265269517606, |
| "grad_norm": 102.0, |
| "learning_rate": 4.632945389435989e-07, |
| "loss": 103.5098, |
| "step": 2895 |
| }, |
| { |
| "epoch": 0.5840334812297429, |
| "grad_norm": 103.75, |
| "learning_rate": 4.621754700089525e-07, |
| "loss": 104.3576, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.5850404355077252, |
| "grad_norm": 101.0, |
| "learning_rate": 4.6105640107430617e-07, |
| "loss": 103.4598, |
| "step": 2905 |
| }, |
| { |
| "epoch": 0.5860473897857076, |
| "grad_norm": 104.9375, |
| "learning_rate": 4.599373321396598e-07, |
| "loss": 102.7616, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.5870543440636898, |
| "grad_norm": 104.9375, |
| "learning_rate": 4.5881826320501345e-07, |
| "loss": 104.4935, |
| "step": 2915 |
| }, |
| { |
| "epoch": 0.5880612983416722, |
| "grad_norm": 102.5625, |
| "learning_rate": 4.57699194270367e-07, |
| "loss": 104.2107, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.5890682526196545, |
| "grad_norm": 104.25, |
| "learning_rate": 4.5658012533572067e-07, |
| "loss": 103.9403, |
| "step": 2925 |
| }, |
| { |
| "epoch": 0.5900752068976368, |
| "grad_norm": 100.4375, |
| "learning_rate": 4.554610564010743e-07, |
| "loss": 104.4585, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.5910821611756191, |
| "grad_norm": 102.8125, |
| "learning_rate": 4.5434198746642794e-07, |
| "loss": 103.71, |
| "step": 2935 |
| }, |
| { |
| "epoch": 0.5920891154536014, |
| "grad_norm": 104.125, |
| "learning_rate": 4.532229185317815e-07, |
| "loss": 102.3344, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.5930960697315838, |
| "grad_norm": 104.0625, |
| "learning_rate": 4.5210384959713516e-07, |
| "loss": 102.8993, |
| "step": 2945 |
| }, |
| { |
| "epoch": 0.594103024009566, |
| "grad_norm": 104.5, |
| "learning_rate": 4.509847806624888e-07, |
| "loss": 103.8401, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.5951099782875484, |
| "grad_norm": 101.875, |
| "learning_rate": 4.4986571172784244e-07, |
| "loss": 103.9012, |
| "step": 2955 |
| }, |
| { |
| "epoch": 0.5961169325655307, |
| "grad_norm": 100.75, |
| "learning_rate": 4.48746642793196e-07, |
| "loss": 102.7268, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.597123886843513, |
| "grad_norm": 105.9375, |
| "learning_rate": 4.4762757385854966e-07, |
| "loss": 104.2173, |
| "step": 2965 |
| }, |
| { |
| "epoch": 0.5981308411214953, |
| "grad_norm": 102.375, |
| "learning_rate": 4.4650850492390327e-07, |
| "loss": 105.3016, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.5991377953994776, |
| "grad_norm": 100.1875, |
| "learning_rate": 4.4538943598925693e-07, |
| "loss": 103.7173, |
| "step": 2975 |
| }, |
| { |
| "epoch": 0.60014474967746, |
| "grad_norm": 101.75, |
| "learning_rate": 4.442703670546105e-07, |
| "loss": 105.0442, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.6011517039554423, |
| "grad_norm": 101.875, |
| "learning_rate": 4.4315129811996416e-07, |
| "loss": 106.4008, |
| "step": 2985 |
| }, |
| { |
| "epoch": 0.6021586582334246, |
| "grad_norm": 102.625, |
| "learning_rate": 4.420322291853178e-07, |
| "loss": 102.636, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.6031656125114069, |
| "grad_norm": 103.3125, |
| "learning_rate": 4.4091316025067143e-07, |
| "loss": 103.5773, |
| "step": 2995 |
| }, |
| { |
| "epoch": 0.6041725667893892, |
| "grad_norm": 106.375, |
| "learning_rate": 4.397940913160251e-07, |
| "loss": 103.8032, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.6041725667893892, |
| "eval_loss": 3.2479496002197266, |
| "eval_runtime": 241.8144, |
| "eval_samples_per_second": 1106.688, |
| "eval_steps_per_second": 34.584, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.6051795210673715, |
| "grad_norm": 103.9375, |
| "learning_rate": 4.3867502238137865e-07, |
| "loss": 104.9436, |
| "step": 3005 |
| }, |
| { |
| "epoch": 0.6061864753453539, |
| "grad_norm": 101.75, |
| "learning_rate": 4.375559534467323e-07, |
| "loss": 103.6502, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.6071934296233361, |
| "grad_norm": 102.5, |
| "learning_rate": 4.364368845120859e-07, |
| "loss": 103.4684, |
| "step": 3015 |
| }, |
| { |
| "epoch": 0.6082003839013185, |
| "grad_norm": 104.6875, |
| "learning_rate": 4.353178155774396e-07, |
| "loss": 103.867, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.6092073381793008, |
| "grad_norm": 103.875, |
| "learning_rate": 4.3419874664279315e-07, |
| "loss": 103.4931, |
| "step": 3025 |
| }, |
| { |
| "epoch": 0.6102142924572831, |
| "grad_norm": 104.5625, |
| "learning_rate": 4.330796777081468e-07, |
| "loss": 103.9812, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.6112212467352655, |
| "grad_norm": 102.25, |
| "learning_rate": 4.319606087735004e-07, |
| "loss": 104.1066, |
| "step": 3035 |
| }, |
| { |
| "epoch": 0.6122282010132477, |
| "grad_norm": 102.375, |
| "learning_rate": 4.308415398388541e-07, |
| "loss": 103.3831, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.6132351552912301, |
| "grad_norm": 103.1875, |
| "learning_rate": 4.2972247090420764e-07, |
| "loss": 103.5499, |
| "step": 3045 |
| }, |
| { |
| "epoch": 0.6142421095692123, |
| "grad_norm": 106.4375, |
| "learning_rate": 4.286034019695613e-07, |
| "loss": 101.8933, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.6152490638471947, |
| "grad_norm": 103.0, |
| "learning_rate": 4.274843330349149e-07, |
| "loss": 102.0398, |
| "step": 3055 |
| }, |
| { |
| "epoch": 0.616256018125177, |
| "grad_norm": 104.3125, |
| "learning_rate": 4.263652641002686e-07, |
| "loss": 102.9206, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.6172629724031593, |
| "grad_norm": 101.5625, |
| "learning_rate": 4.2524619516562214e-07, |
| "loss": 102.4662, |
| "step": 3065 |
| }, |
| { |
| "epoch": 0.6182699266811417, |
| "grad_norm": 101.75, |
| "learning_rate": 4.241271262309758e-07, |
| "loss": 103.3072, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.6192768809591239, |
| "grad_norm": 104.375, |
| "learning_rate": 4.230080572963294e-07, |
| "loss": 102.1457, |
| "step": 3075 |
| }, |
| { |
| "epoch": 0.6202838352371063, |
| "grad_norm": 104.1875, |
| "learning_rate": 4.218889883616831e-07, |
| "loss": 103.5531, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.6212907895150885, |
| "grad_norm": 104.125, |
| "learning_rate": 4.207699194270367e-07, |
| "loss": 104.2774, |
| "step": 3085 |
| }, |
| { |
| "epoch": 0.6222977437930709, |
| "grad_norm": 105.5, |
| "learning_rate": 4.196508504923903e-07, |
| "loss": 102.702, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.6233046980710533, |
| "grad_norm": 103.25, |
| "learning_rate": 4.185317815577439e-07, |
| "loss": 103.7312, |
| "step": 3095 |
| }, |
| { |
| "epoch": 0.6243116523490355, |
| "grad_norm": 104.6875, |
| "learning_rate": 4.174127126230976e-07, |
| "loss": 102.1448, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.6253186066270179, |
| "grad_norm": 104.0625, |
| "learning_rate": 4.1629364368845124e-07, |
| "loss": 103.0693, |
| "step": 3105 |
| }, |
| { |
| "epoch": 0.6263255609050001, |
| "grad_norm": 99.25, |
| "learning_rate": 4.151745747538048e-07, |
| "loss": 103.7934, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.6273325151829825, |
| "grad_norm": 104.75, |
| "learning_rate": 4.1405550581915846e-07, |
| "loss": 102.7301, |
| "step": 3115 |
| }, |
| { |
| "epoch": 0.6283394694609648, |
| "grad_norm": 105.125, |
| "learning_rate": 4.1293643688451207e-07, |
| "loss": 102.1029, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.6293464237389471, |
| "grad_norm": 103.375, |
| "learning_rate": 4.1181736794986573e-07, |
| "loss": 101.7747, |
| "step": 3125 |
| }, |
| { |
| "epoch": 0.6303533780169294, |
| "grad_norm": 107.75, |
| "learning_rate": 4.106982990152193e-07, |
| "loss": 101.7472, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.6313603322949117, |
| "grad_norm": 102.75, |
| "learning_rate": 4.0957923008057296e-07, |
| "loss": 101.3941, |
| "step": 3135 |
| }, |
| { |
| "epoch": 0.6323672865728941, |
| "grad_norm": 107.9375, |
| "learning_rate": 4.0846016114592657e-07, |
| "loss": 102.634, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.6333742408508763, |
| "grad_norm": 106.5625, |
| "learning_rate": 4.0734109221128023e-07, |
| "loss": 103.399, |
| "step": 3145 |
| }, |
| { |
| "epoch": 0.6343811951288587, |
| "grad_norm": 105.5, |
| "learning_rate": 4.062220232766338e-07, |
| "loss": 101.831, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.635388149406841, |
| "grad_norm": 104.75, |
| "learning_rate": 4.0510295434198745e-07, |
| "loss": 101.9133, |
| "step": 3155 |
| }, |
| { |
| "epoch": 0.6363951036848233, |
| "grad_norm": 106.5, |
| "learning_rate": 4.0398388540734106e-07, |
| "loss": 103.1237, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.6374020579628056, |
| "grad_norm": 106.5, |
| "learning_rate": 4.028648164726947e-07, |
| "loss": 103.293, |
| "step": 3165 |
| }, |
| { |
| "epoch": 0.6384090122407879, |
| "grad_norm": 100.125, |
| "learning_rate": 4.0174574753804834e-07, |
| "loss": 102.386, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.6394159665187703, |
| "grad_norm": 105.5, |
| "learning_rate": 4.0062667860340195e-07, |
| "loss": 101.1026, |
| "step": 3175 |
| }, |
| { |
| "epoch": 0.6404229207967526, |
| "grad_norm": 105.0625, |
| "learning_rate": 3.9950760966875556e-07, |
| "loss": 103.0412, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.6414298750747349, |
| "grad_norm": 102.0625, |
| "learning_rate": 3.983885407341092e-07, |
| "loss": 102.8331, |
| "step": 3185 |
| }, |
| { |
| "epoch": 0.6424368293527172, |
| "grad_norm": 102.5, |
| "learning_rate": 3.9726947179946283e-07, |
| "loss": 102.7278, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.6434437836306995, |
| "grad_norm": 101.375, |
| "learning_rate": 3.9615040286481644e-07, |
| "loss": 100.927, |
| "step": 3195 |
| }, |
| { |
| "epoch": 0.6444507379086818, |
| "grad_norm": 102.625, |
| "learning_rate": 3.9503133393017005e-07, |
| "loss": 101.5179, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.6454576921866642, |
| "grad_norm": 103.875, |
| "learning_rate": 3.939122649955237e-07, |
| "loss": 101.7856, |
| "step": 3205 |
| }, |
| { |
| "epoch": 0.6464646464646465, |
| "grad_norm": 103.8125, |
| "learning_rate": 3.9279319606087733e-07, |
| "loss": 101.8363, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.6474716007426288, |
| "grad_norm": 105.25, |
| "learning_rate": 3.9167412712623094e-07, |
| "loss": 103.548, |
| "step": 3215 |
| }, |
| { |
| "epoch": 0.6484785550206111, |
| "grad_norm": 104.6875, |
| "learning_rate": 3.9055505819158455e-07, |
| "loss": 103.2727, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.6494855092985934, |
| "grad_norm": 103.25, |
| "learning_rate": 3.894359892569382e-07, |
| "loss": 102.914, |
| "step": 3225 |
| }, |
| { |
| "epoch": 0.6504924635765758, |
| "grad_norm": 105.0, |
| "learning_rate": 3.883169203222919e-07, |
| "loss": 101.5854, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.651499417854558, |
| "grad_norm": 105.125, |
| "learning_rate": 3.8719785138764544e-07, |
| "loss": 101.9887, |
| "step": 3235 |
| }, |
| { |
| "epoch": 0.6525063721325404, |
| "grad_norm": 103.4375, |
| "learning_rate": 3.860787824529991e-07, |
| "loss": 101.3439, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.6535133264105226, |
| "grad_norm": 103.0625, |
| "learning_rate": 3.849597135183527e-07, |
| "loss": 103.7408, |
| "step": 3245 |
| }, |
| { |
| "epoch": 0.654520280688505, |
| "grad_norm": 102.5, |
| "learning_rate": 3.8384064458370637e-07, |
| "loss": 101.1225, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.6555272349664873, |
| "grad_norm": 108.6875, |
| "learning_rate": 3.8272157564906e-07, |
| "loss": 101.483, |
| "step": 3255 |
| }, |
| { |
| "epoch": 0.6565341892444696, |
| "grad_norm": 104.875, |
| "learning_rate": 3.816025067144136e-07, |
| "loss": 101.052, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.657541143522452, |
| "grad_norm": 101.5625, |
| "learning_rate": 3.804834377797672e-07, |
| "loss": 101.2586, |
| "step": 3265 |
| }, |
| { |
| "epoch": 0.6585480978004342, |
| "grad_norm": 106.25, |
| "learning_rate": 3.7936436884512087e-07, |
| "loss": 102.4246, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.6595550520784166, |
| "grad_norm": 104.75, |
| "learning_rate": 3.782452999104745e-07, |
| "loss": 102.6763, |
| "step": 3275 |
| }, |
| { |
| "epoch": 0.6605620063563988, |
| "grad_norm": 104.3125, |
| "learning_rate": 3.771262309758281e-07, |
| "loss": 102.0128, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.6615689606343812, |
| "grad_norm": 103.1875, |
| "learning_rate": 3.760071620411817e-07, |
| "loss": 102.5605, |
| "step": 3285 |
| }, |
| { |
| "epoch": 0.6625759149123636, |
| "grad_norm": 103.375, |
| "learning_rate": 3.7488809310653537e-07, |
| "loss": 102.7485, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.6635828691903458, |
| "grad_norm": 105.5625, |
| "learning_rate": 3.73769024171889e-07, |
| "loss": 101.5186, |
| "step": 3295 |
| }, |
| { |
| "epoch": 0.6645898234683282, |
| "grad_norm": 102.125, |
| "learning_rate": 3.726499552372426e-07, |
| "loss": 101.6677, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.6655967777463104, |
| "grad_norm": 101.4375, |
| "learning_rate": 3.715308863025962e-07, |
| "loss": 101.6941, |
| "step": 3305 |
| }, |
| { |
| "epoch": 0.6666037320242928, |
| "grad_norm": 100.25, |
| "learning_rate": 3.7041181736794986e-07, |
| "loss": 100.565, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.667610686302275, |
| "grad_norm": 103.1875, |
| "learning_rate": 3.6929274843330347e-07, |
| "loss": 102.0876, |
| "step": 3315 |
| }, |
| { |
| "epoch": 0.6686176405802574, |
| "grad_norm": 102.0, |
| "learning_rate": 3.681736794986571e-07, |
| "loss": 101.5779, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.6696245948582397, |
| "grad_norm": 103.6875, |
| "learning_rate": 3.670546105640107e-07, |
| "loss": 100.2148, |
| "step": 3325 |
| }, |
| { |
| "epoch": 0.670631549136222, |
| "grad_norm": 104.125, |
| "learning_rate": 3.6593554162936436e-07, |
| "loss": 103.0346, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.6716385034142044, |
| "grad_norm": 104.1875, |
| "learning_rate": 3.6481647269471797e-07, |
| "loss": 100.2821, |
| "step": 3335 |
| }, |
| { |
| "epoch": 0.6726454576921866, |
| "grad_norm": 104.0, |
| "learning_rate": 3.6369740376007163e-07, |
| "loss": 102.4122, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.673652411970169, |
| "grad_norm": 104.5, |
| "learning_rate": 3.625783348254252e-07, |
| "loss": 101.9806, |
| "step": 3345 |
| }, |
| { |
| "epoch": 0.6746593662481513, |
| "grad_norm": 106.5, |
| "learning_rate": 3.6145926589077885e-07, |
| "loss": 100.5335, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.6756663205261336, |
| "grad_norm": 106.0, |
| "learning_rate": 3.603401969561325e-07, |
| "loss": 101.7242, |
| "step": 3355 |
| }, |
| { |
| "epoch": 0.6766732748041159, |
| "grad_norm": 102.6875, |
| "learning_rate": 3.5922112802148613e-07, |
| "loss": 101.2396, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.6776802290820982, |
| "grad_norm": 108.5625, |
| "learning_rate": 3.5810205908683974e-07, |
| "loss": 100.8422, |
| "step": 3365 |
| }, |
| { |
| "epoch": 0.6786871833600806, |
| "grad_norm": 103.25, |
| "learning_rate": 3.5698299015219335e-07, |
| "loss": 101.5823, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.6796941376380629, |
| "grad_norm": 102.875, |
| "learning_rate": 3.55863921217547e-07, |
| "loss": 101.6988, |
| "step": 3375 |
| }, |
| { |
| "epoch": 0.6807010919160452, |
| "grad_norm": 102.375, |
| "learning_rate": 3.547448522829006e-07, |
| "loss": 100.5021, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.6817080461940275, |
| "grad_norm": 106.4375, |
| "learning_rate": 3.5362578334825423e-07, |
| "loss": 100.3675, |
| "step": 3385 |
| }, |
| { |
| "epoch": 0.6827150004720098, |
| "grad_norm": 101.875, |
| "learning_rate": 3.5250671441360785e-07, |
| "loss": 101.8122, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.6837219547499921, |
| "grad_norm": 103.375, |
| "learning_rate": 3.513876454789615e-07, |
| "loss": 100.3997, |
| "step": 3395 |
| }, |
| { |
| "epoch": 0.6847289090279745, |
| "grad_norm": 104.6875, |
| "learning_rate": 3.502685765443151e-07, |
| "loss": 100.5548, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.6857358633059568, |
| "grad_norm": 106.0625, |
| "learning_rate": 3.4914950760966873e-07, |
| "loss": 102.6518, |
| "step": 3405 |
| }, |
| { |
| "epoch": 0.6867428175839391, |
| "grad_norm": 107.375, |
| "learning_rate": 3.4803043867502234e-07, |
| "loss": 101.1174, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.6877497718619214, |
| "grad_norm": 103.25, |
| "learning_rate": 3.46911369740376e-07, |
| "loss": 101.1875, |
| "step": 3415 |
| }, |
| { |
| "epoch": 0.6887567261399037, |
| "grad_norm": 105.9375, |
| "learning_rate": 3.457923008057296e-07, |
| "loss": 101.0168, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.689763680417886, |
| "grad_norm": 104.25, |
| "learning_rate": 3.446732318710833e-07, |
| "loss": 100.9963, |
| "step": 3425 |
| }, |
| { |
| "epoch": 0.6907706346958683, |
| "grad_norm": 100.1875, |
| "learning_rate": 3.4355416293643684e-07, |
| "loss": 99.1402, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.6917775889738507, |
| "grad_norm": 104.625, |
| "learning_rate": 3.424350940017905e-07, |
| "loss": 100.4257, |
| "step": 3435 |
| }, |
| { |
| "epoch": 0.6927845432518329, |
| "grad_norm": 102.0, |
| "learning_rate": 3.413160250671441e-07, |
| "loss": 99.5312, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.6937914975298153, |
| "grad_norm": 105.8125, |
| "learning_rate": 3.401969561324978e-07, |
| "loss": 101.3088, |
| "step": 3445 |
| }, |
| { |
| "epoch": 0.6947984518077976, |
| "grad_norm": 106.875, |
| "learning_rate": 3.3907788719785133e-07, |
| "loss": 101.2592, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.6958054060857799, |
| "grad_norm": 106.4375, |
| "learning_rate": 3.37958818263205e-07, |
| "loss": 99.822, |
| "step": 3455 |
| }, |
| { |
| "epoch": 0.6968123603637623, |
| "grad_norm": 104.3125, |
| "learning_rate": 3.368397493285586e-07, |
| "loss": 102.2747, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.6978193146417445, |
| "grad_norm": 104.5625, |
| "learning_rate": 3.3572068039391227e-07, |
| "loss": 100.0264, |
| "step": 3465 |
| }, |
| { |
| "epoch": 0.6988262689197269, |
| "grad_norm": 102.0625, |
| "learning_rate": 3.3460161145926583e-07, |
| "loss": 99.6872, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.6998332231977091, |
| "grad_norm": 102.8125, |
| "learning_rate": 3.334825425246195e-07, |
| "loss": 99.866, |
| "step": 3475 |
| }, |
| { |
| "epoch": 0.7008401774756915, |
| "grad_norm": 100.875, |
| "learning_rate": 3.3236347358997316e-07, |
| "loss": 99.9147, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.7018471317536739, |
| "grad_norm": 102.5, |
| "learning_rate": 3.3124440465532677e-07, |
| "loss": 101.803, |
| "step": 3485 |
| }, |
| { |
| "epoch": 0.7028540860316561, |
| "grad_norm": 107.0625, |
| "learning_rate": 3.301253357206804e-07, |
| "loss": 101.0587, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.7038610403096385, |
| "grad_norm": 107.0625, |
| "learning_rate": 3.29006266786034e-07, |
| "loss": 102.4405, |
| "step": 3495 |
| }, |
| { |
| "epoch": 0.7048679945876207, |
| "grad_norm": 101.125, |
| "learning_rate": 3.2788719785138765e-07, |
| "loss": 101.6015, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.7048679945876207, |
| "eval_loss": 3.153071880340576, |
| "eval_runtime": 239.9872, |
| "eval_samples_per_second": 1115.114, |
| "eval_steps_per_second": 34.848, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.7058749488656031, |
| "grad_norm": 101.375, |
| "learning_rate": 3.2676812891674126e-07, |
| "loss": 100.2117, |
| "step": 3505 |
| }, |
| { |
| "epoch": 0.7068819031435853, |
| "grad_norm": 101.5625, |
| "learning_rate": 3.2564905998209493e-07, |
| "loss": 100.942, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.7078888574215677, |
| "grad_norm": 105.0, |
| "learning_rate": 3.245299910474485e-07, |
| "loss": 101.196, |
| "step": 3515 |
| }, |
| { |
| "epoch": 0.7088958116995501, |
| "grad_norm": 106.5, |
| "learning_rate": 3.2341092211280215e-07, |
| "loss": 100.3638, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.7099027659775323, |
| "grad_norm": 102.0625, |
| "learning_rate": 3.2229185317815576e-07, |
| "loss": 101.518, |
| "step": 3525 |
| }, |
| { |
| "epoch": 0.7109097202555147, |
| "grad_norm": 107.875, |
| "learning_rate": 3.211727842435094e-07, |
| "loss": 99.2336, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.7119166745334969, |
| "grad_norm": 102.625, |
| "learning_rate": 3.20053715308863e-07, |
| "loss": 101.3888, |
| "step": 3535 |
| }, |
| { |
| "epoch": 0.7129236288114793, |
| "grad_norm": 106.9375, |
| "learning_rate": 3.1893464637421664e-07, |
| "loss": 98.6846, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.7139305830894616, |
| "grad_norm": 104.6875, |
| "learning_rate": 3.1781557743957026e-07, |
| "loss": 101.1002, |
| "step": 3545 |
| }, |
| { |
| "epoch": 0.7149375373674439, |
| "grad_norm": 104.9375, |
| "learning_rate": 3.166965085049239e-07, |
| "loss": 100.9625, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.7159444916454262, |
| "grad_norm": 104.8125, |
| "learning_rate": 3.155774395702775e-07, |
| "loss": 101.6369, |
| "step": 3555 |
| }, |
| { |
| "epoch": 0.7169514459234085, |
| "grad_norm": 101.0, |
| "learning_rate": 3.1445837063563114e-07, |
| "loss": 101.1003, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.7179584002013909, |
| "grad_norm": 106.5, |
| "learning_rate": 3.1333930170098475e-07, |
| "loss": 98.8858, |
| "step": 3565 |
| }, |
| { |
| "epoch": 0.7189653544793732, |
| "grad_norm": 106.1875, |
| "learning_rate": 3.122202327663384e-07, |
| "loss": 99.8127, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.7199723087573555, |
| "grad_norm": 106.0625, |
| "learning_rate": 3.1110116383169197e-07, |
| "loss": 99.7345, |
| "step": 3575 |
| }, |
| { |
| "epoch": 0.7209792630353378, |
| "grad_norm": 107.875, |
| "learning_rate": 3.0998209489704564e-07, |
| "loss": 98.8684, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.7219862173133201, |
| "grad_norm": 102.3125, |
| "learning_rate": 3.0886302596239925e-07, |
| "loss": 100.5252, |
| "step": 3585 |
| }, |
| { |
| "epoch": 0.7229931715913024, |
| "grad_norm": 102.25, |
| "learning_rate": 3.077439570277529e-07, |
| "loss": 99.1437, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.7240001258692847, |
| "grad_norm": 105.0625, |
| "learning_rate": 3.066248880931066e-07, |
| "loss": 99.6645, |
| "step": 3595 |
| }, |
| { |
| "epoch": 0.7250070801472671, |
| "grad_norm": 108.4375, |
| "learning_rate": 3.0550581915846013e-07, |
| "loss": 101.7029, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.7260140344252494, |
| "grad_norm": 102.3125, |
| "learning_rate": 3.043867502238138e-07, |
| "loss": 101.3023, |
| "step": 3605 |
| }, |
| { |
| "epoch": 0.7270209887032317, |
| "grad_norm": 103.0, |
| "learning_rate": 3.032676812891674e-07, |
| "loss": 100.7414, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.728027942981214, |
| "grad_norm": 106.125, |
| "learning_rate": 3.0214861235452107e-07, |
| "loss": 100.1441, |
| "step": 3615 |
| }, |
| { |
| "epoch": 0.7290348972591963, |
| "grad_norm": 106.5625, |
| "learning_rate": 3.0102954341987463e-07, |
| "loss": 100.4396, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.7300418515371786, |
| "grad_norm": 103.625, |
| "learning_rate": 2.999104744852283e-07, |
| "loss": 100.7662, |
| "step": 3625 |
| }, |
| { |
| "epoch": 0.731048805815161, |
| "grad_norm": 102.8125, |
| "learning_rate": 2.987914055505819e-07, |
| "loss": 99.562, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.7320557600931433, |
| "grad_norm": 100.0, |
| "learning_rate": 2.9767233661593557e-07, |
| "loss": 101.1267, |
| "step": 3635 |
| }, |
| { |
| "epoch": 0.7330627143711256, |
| "grad_norm": 105.75, |
| "learning_rate": 2.965532676812891e-07, |
| "loss": 99.4763, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.7340696686491079, |
| "grad_norm": 107.5625, |
| "learning_rate": 2.954341987466428e-07, |
| "loss": 102.3207, |
| "step": 3645 |
| }, |
| { |
| "epoch": 0.7350766229270902, |
| "grad_norm": 106.3125, |
| "learning_rate": 2.943151298119964e-07, |
| "loss": 99.7648, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.7360835772050726, |
| "grad_norm": 105.375, |
| "learning_rate": 2.9319606087735006e-07, |
| "loss": 99.4035, |
| "step": 3655 |
| }, |
| { |
| "epoch": 0.7370905314830548, |
| "grad_norm": 103.1875, |
| "learning_rate": 2.920769919427036e-07, |
| "loss": 100.6968, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.7380974857610372, |
| "grad_norm": 108.0625, |
| "learning_rate": 2.909579230080573e-07, |
| "loss": 100.2772, |
| "step": 3665 |
| }, |
| { |
| "epoch": 0.7391044400390194, |
| "grad_norm": 110.0, |
| "learning_rate": 2.898388540734109e-07, |
| "loss": 99.7772, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.7401113943170018, |
| "grad_norm": 104.0625, |
| "learning_rate": 2.8871978513876456e-07, |
| "loss": 99.2794, |
| "step": 3675 |
| }, |
| { |
| "epoch": 0.7411183485949842, |
| "grad_norm": 106.125, |
| "learning_rate": 2.8760071620411817e-07, |
| "loss": 100.0197, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.7421253028729664, |
| "grad_norm": 104.6875, |
| "learning_rate": 2.864816472694718e-07, |
| "loss": 99.3644, |
| "step": 3685 |
| }, |
| { |
| "epoch": 0.7431322571509488, |
| "grad_norm": 105.625, |
| "learning_rate": 2.853625783348254e-07, |
| "loss": 101.4238, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.744139211428931, |
| "grad_norm": 105.9375, |
| "learning_rate": 2.8424350940017905e-07, |
| "loss": 98.9052, |
| "step": 3695 |
| }, |
| { |
| "epoch": 0.7451461657069134, |
| "grad_norm": 110.625, |
| "learning_rate": 2.8312444046553267e-07, |
| "loss": 99.8988, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.7461531199848956, |
| "grad_norm": 107.25, |
| "learning_rate": 2.820053715308863e-07, |
| "loss": 98.9521, |
| "step": 3705 |
| }, |
| { |
| "epoch": 0.747160074262878, |
| "grad_norm": 107.25, |
| "learning_rate": 2.808863025962399e-07, |
| "loss": 99.7661, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.7481670285408604, |
| "grad_norm": 107.9375, |
| "learning_rate": 2.7976723366159355e-07, |
| "loss": 98.4117, |
| "step": 3715 |
| }, |
| { |
| "epoch": 0.7491739828188426, |
| "grad_norm": 106.0625, |
| "learning_rate": 2.786481647269472e-07, |
| "loss": 100.2589, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.750180937096825, |
| "grad_norm": 102.5625, |
| "learning_rate": 2.7752909579230077e-07, |
| "loss": 98.1626, |
| "step": 3725 |
| }, |
| { |
| "epoch": 0.7511878913748072, |
| "grad_norm": 107.5, |
| "learning_rate": 2.7641002685765444e-07, |
| "loss": 99.1188, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.7521948456527896, |
| "grad_norm": 106.375, |
| "learning_rate": 2.7529095792300805e-07, |
| "loss": 98.337, |
| "step": 3735 |
| }, |
| { |
| "epoch": 0.7532017999307719, |
| "grad_norm": 106.8125, |
| "learning_rate": 2.741718889883617e-07, |
| "loss": 98.323, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.7542087542087542, |
| "grad_norm": 106.0625, |
| "learning_rate": 2.7305282005371527e-07, |
| "loss": 99.4523, |
| "step": 3745 |
| }, |
| { |
| "epoch": 0.7552157084867365, |
| "grad_norm": 100.0, |
| "learning_rate": 2.7193375111906893e-07, |
| "loss": 100.4829, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.7562226627647188, |
| "grad_norm": 104.75, |
| "learning_rate": 2.7081468218442254e-07, |
| "loss": 100.2488, |
| "step": 3755 |
| }, |
| { |
| "epoch": 0.7572296170427012, |
| "grad_norm": 103.875, |
| "learning_rate": 2.696956132497762e-07, |
| "loss": 100.597, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.7582365713206835, |
| "grad_norm": 102.6875, |
| "learning_rate": 2.685765443151298e-07, |
| "loss": 100.5632, |
| "step": 3765 |
| }, |
| { |
| "epoch": 0.7592435255986658, |
| "grad_norm": 106.5625, |
| "learning_rate": 2.6745747538048343e-07, |
| "loss": 99.1815, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.7602504798766481, |
| "grad_norm": 108.875, |
| "learning_rate": 2.6633840644583704e-07, |
| "loss": 99.6381, |
| "step": 3775 |
| }, |
| { |
| "epoch": 0.7612574341546304, |
| "grad_norm": 104.125, |
| "learning_rate": 2.652193375111907e-07, |
| "loss": 101.5287, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.7622643884326127, |
| "grad_norm": 106.4375, |
| "learning_rate": 2.641002685765443e-07, |
| "loss": 99.129, |
| "step": 3785 |
| }, |
| { |
| "epoch": 0.763271342710595, |
| "grad_norm": 105.4375, |
| "learning_rate": 2.629811996418979e-07, |
| "loss": 98.0637, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.7642782969885774, |
| "grad_norm": 105.0625, |
| "learning_rate": 2.6186213070725153e-07, |
| "loss": 99.2074, |
| "step": 3795 |
| }, |
| { |
| "epoch": 0.7652852512665597, |
| "grad_norm": 108.6875, |
| "learning_rate": 2.607430617726052e-07, |
| "loss": 100.7883, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.766292205544542, |
| "grad_norm": 103.5625, |
| "learning_rate": 2.596239928379588e-07, |
| "loss": 100.375, |
| "step": 3805 |
| }, |
| { |
| "epoch": 0.7672991598225243, |
| "grad_norm": 102.875, |
| "learning_rate": 2.585049239033124e-07, |
| "loss": 98.6215, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.7683061141005066, |
| "grad_norm": 104.125, |
| "learning_rate": 2.5738585496866603e-07, |
| "loss": 98.8108, |
| "step": 3815 |
| }, |
| { |
| "epoch": 0.7693130683784889, |
| "grad_norm": 103.9375, |
| "learning_rate": 2.562667860340197e-07, |
| "loss": 99.4923, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.7703200226564713, |
| "grad_norm": 104.8125, |
| "learning_rate": 2.551477170993733e-07, |
| "loss": 98.6085, |
| "step": 3825 |
| }, |
| { |
| "epoch": 0.7713269769344536, |
| "grad_norm": 102.6875, |
| "learning_rate": 2.540286481647269e-07, |
| "loss": 98.7937, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.7723339312124359, |
| "grad_norm": 105.3125, |
| "learning_rate": 2.5290957923008053e-07, |
| "loss": 99.0454, |
| "step": 3835 |
| }, |
| { |
| "epoch": 0.7733408854904182, |
| "grad_norm": 107.6875, |
| "learning_rate": 2.517905102954342e-07, |
| "loss": 101.8479, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.7743478397684005, |
| "grad_norm": 103.6875, |
| "learning_rate": 2.5067144136078785e-07, |
| "loss": 99.2844, |
| "step": 3845 |
| }, |
| { |
| "epoch": 0.7753547940463829, |
| "grad_norm": 104.6875, |
| "learning_rate": 2.4955237242614146e-07, |
| "loss": 98.6868, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.7763617483243651, |
| "grad_norm": 106.25, |
| "learning_rate": 2.484333034914951e-07, |
| "loss": 98.5338, |
| "step": 3855 |
| }, |
| { |
| "epoch": 0.7773687026023475, |
| "grad_norm": 107.625, |
| "learning_rate": 2.473142345568487e-07, |
| "loss": 101.3545, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.7783756568803297, |
| "grad_norm": 105.8125, |
| "learning_rate": 2.4619516562220235e-07, |
| "loss": 98.9249, |
| "step": 3865 |
| }, |
| { |
| "epoch": 0.7793826111583121, |
| "grad_norm": 106.1875, |
| "learning_rate": 2.4507609668755596e-07, |
| "loss": 97.8303, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.7803895654362945, |
| "grad_norm": 105.0, |
| "learning_rate": 2.4395702775290957e-07, |
| "loss": 99.5434, |
| "step": 3875 |
| }, |
| { |
| "epoch": 0.7813965197142767, |
| "grad_norm": 105.5625, |
| "learning_rate": 2.428379588182632e-07, |
| "loss": 98.4932, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.7824034739922591, |
| "grad_norm": 106.1875, |
| "learning_rate": 2.4171888988361685e-07, |
| "loss": 97.8763, |
| "step": 3885 |
| }, |
| { |
| "epoch": 0.7834104282702413, |
| "grad_norm": 105.5625, |
| "learning_rate": 2.4059982094897046e-07, |
| "loss": 98.7099, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.7844173825482237, |
| "grad_norm": 104.6875, |
| "learning_rate": 2.3948075201432407e-07, |
| "loss": 97.9921, |
| "step": 3895 |
| }, |
| { |
| "epoch": 0.7854243368262059, |
| "grad_norm": 102.1875, |
| "learning_rate": 2.3836168307967768e-07, |
| "loss": 97.4684, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.7864312911041883, |
| "grad_norm": 108.125, |
| "learning_rate": 2.3724261414503132e-07, |
| "loss": 98.4995, |
| "step": 3905 |
| }, |
| { |
| "epoch": 0.7874382453821707, |
| "grad_norm": 105.4375, |
| "learning_rate": 2.3612354521038493e-07, |
| "loss": 98.6665, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.7884451996601529, |
| "grad_norm": 106.0625, |
| "learning_rate": 2.350044762757386e-07, |
| "loss": 100.4663, |
| "step": 3915 |
| }, |
| { |
| "epoch": 0.7894521539381353, |
| "grad_norm": 105.1875, |
| "learning_rate": 2.338854073410922e-07, |
| "loss": 98.1109, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.7904591082161175, |
| "grad_norm": 106.1875, |
| "learning_rate": 2.3276633840644584e-07, |
| "loss": 98.0027, |
| "step": 3925 |
| }, |
| { |
| "epoch": 0.7914660624940999, |
| "grad_norm": 106.375, |
| "learning_rate": 2.3164726947179945e-07, |
| "loss": 97.9504, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.7924730167720822, |
| "grad_norm": 108.25, |
| "learning_rate": 2.3052820053715309e-07, |
| "loss": 98.3706, |
| "step": 3935 |
| }, |
| { |
| "epoch": 0.7934799710500645, |
| "grad_norm": 106.1875, |
| "learning_rate": 2.2940913160250672e-07, |
| "loss": 99.981, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.7944869253280469, |
| "grad_norm": 103.25, |
| "learning_rate": 2.2829006266786033e-07, |
| "loss": 99.1271, |
| "step": 3945 |
| }, |
| { |
| "epoch": 0.7954938796060291, |
| "grad_norm": 106.6875, |
| "learning_rate": 2.2717099373321397e-07, |
| "loss": 97.7095, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.7965008338840115, |
| "grad_norm": 102.4375, |
| "learning_rate": 2.2605192479856758e-07, |
| "loss": 97.0702, |
| "step": 3955 |
| }, |
| { |
| "epoch": 0.7975077881619937, |
| "grad_norm": 106.0625, |
| "learning_rate": 2.2493285586392122e-07, |
| "loss": 98.8119, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.7985147424399761, |
| "grad_norm": 105.3125, |
| "learning_rate": 2.2381378692927483e-07, |
| "loss": 99.1582, |
| "step": 3965 |
| }, |
| { |
| "epoch": 0.7995216967179584, |
| "grad_norm": 105.625, |
| "learning_rate": 2.2269471799462847e-07, |
| "loss": 97.7976, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.8005286509959407, |
| "grad_norm": 106.125, |
| "learning_rate": 2.2157564905998208e-07, |
| "loss": 98.5606, |
| "step": 3975 |
| }, |
| { |
| "epoch": 0.801535605273923, |
| "grad_norm": 101.375, |
| "learning_rate": 2.2045658012533572e-07, |
| "loss": 97.7195, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.8025425595519053, |
| "grad_norm": 106.875, |
| "learning_rate": 2.1933751119068933e-07, |
| "loss": 98.1909, |
| "step": 3985 |
| }, |
| { |
| "epoch": 0.8035495138298877, |
| "grad_norm": 103.5625, |
| "learning_rate": 2.1821844225604296e-07, |
| "loss": 97.943, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.80455646810787, |
| "grad_norm": 103.625, |
| "learning_rate": 2.1709937332139657e-07, |
| "loss": 97.9477, |
| "step": 3995 |
| }, |
| { |
| "epoch": 0.8055634223858523, |
| "grad_norm": 107.1875, |
| "learning_rate": 2.159803043867502e-07, |
| "loss": 98.1402, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.8055634223858523, |
| "eval_loss": 3.0711116790771484, |
| "eval_runtime": 241.4463, |
| "eval_samples_per_second": 1108.375, |
| "eval_steps_per_second": 34.637, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.8065703766638346, |
| "grad_norm": 103.75, |
| "learning_rate": 2.1486123545210382e-07, |
| "loss": 98.4772, |
| "step": 4005 |
| }, |
| { |
| "epoch": 0.8075773309418169, |
| "grad_norm": 103.8125, |
| "learning_rate": 2.1374216651745746e-07, |
| "loss": 98.346, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.8085842852197992, |
| "grad_norm": 110.0, |
| "learning_rate": 2.1262309758281107e-07, |
| "loss": 96.7867, |
| "step": 4015 |
| }, |
| { |
| "epoch": 0.8095912394977816, |
| "grad_norm": 106.5625, |
| "learning_rate": 2.115040286481647e-07, |
| "loss": 98.4165, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.8105981937757639, |
| "grad_norm": 104.375, |
| "learning_rate": 2.1038495971351834e-07, |
| "loss": 96.7289, |
| "step": 4025 |
| }, |
| { |
| "epoch": 0.8116051480537462, |
| "grad_norm": 105.625, |
| "learning_rate": 2.0926589077887196e-07, |
| "loss": 98.2023, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.8126121023317285, |
| "grad_norm": 104.75, |
| "learning_rate": 2.0814682184422562e-07, |
| "loss": 97.7403, |
| "step": 4035 |
| }, |
| { |
| "epoch": 0.8136190566097108, |
| "grad_norm": 100.375, |
| "learning_rate": 2.0702775290957923e-07, |
| "loss": 98.5274, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.8146260108876932, |
| "grad_norm": 103.875, |
| "learning_rate": 2.0590868397493287e-07, |
| "loss": 96.4971, |
| "step": 4045 |
| }, |
| { |
| "epoch": 0.8156329651656754, |
| "grad_norm": 106.125, |
| "learning_rate": 2.0478961504028648e-07, |
| "loss": 96.3516, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.8166399194436578, |
| "grad_norm": 102.125, |
| "learning_rate": 2.0367054610564011e-07, |
| "loss": 97.1612, |
| "step": 4055 |
| }, |
| { |
| "epoch": 0.81764687372164, |
| "grad_norm": 105.3125, |
| "learning_rate": 2.0255147717099373e-07, |
| "loss": 98.3791, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.8186538279996224, |
| "grad_norm": 107.75, |
| "learning_rate": 2.0143240823634736e-07, |
| "loss": 99.03, |
| "step": 4065 |
| }, |
| { |
| "epoch": 0.8196607822776047, |
| "grad_norm": 106.875, |
| "learning_rate": 2.0031333930170097e-07, |
| "loss": 97.8408, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.820667736555587, |
| "grad_norm": 104.5625, |
| "learning_rate": 1.991942703670546e-07, |
| "loss": 97.6941, |
| "step": 4075 |
| }, |
| { |
| "epoch": 0.8216746908335694, |
| "grad_norm": 106.1875, |
| "learning_rate": 1.9807520143240822e-07, |
| "loss": 97.8008, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.8226816451115516, |
| "grad_norm": 103.25, |
| "learning_rate": 1.9695613249776186e-07, |
| "loss": 96.2979, |
| "step": 4085 |
| }, |
| { |
| "epoch": 0.823688599389534, |
| "grad_norm": 104.6875, |
| "learning_rate": 1.9583706356311547e-07, |
| "loss": 99.1271, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.8246955536675162, |
| "grad_norm": 104.1875, |
| "learning_rate": 1.947179946284691e-07, |
| "loss": 97.5088, |
| "step": 4095 |
| }, |
| { |
| "epoch": 0.8257025079454986, |
| "grad_norm": 105.125, |
| "learning_rate": 1.9359892569382272e-07, |
| "loss": 98.5705, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.826709462223481, |
| "grad_norm": 107.125, |
| "learning_rate": 1.9247985675917635e-07, |
| "loss": 97.7035, |
| "step": 4105 |
| }, |
| { |
| "epoch": 0.8277164165014632, |
| "grad_norm": 104.0, |
| "learning_rate": 1.9136078782453e-07, |
| "loss": 97.2328, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.8287233707794456, |
| "grad_norm": 108.4375, |
| "learning_rate": 1.902417188898836e-07, |
| "loss": 99.1522, |
| "step": 4115 |
| }, |
| { |
| "epoch": 0.8297303250574278, |
| "grad_norm": 102.5, |
| "learning_rate": 1.8912264995523724e-07, |
| "loss": 100.5609, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.8307372793354102, |
| "grad_norm": 110.375, |
| "learning_rate": 1.8800358102059085e-07, |
| "loss": 96.9263, |
| "step": 4125 |
| }, |
| { |
| "epoch": 0.8317442336133924, |
| "grad_norm": 103.5, |
| "learning_rate": 1.868845120859445e-07, |
| "loss": 97.9833, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.8327511878913748, |
| "grad_norm": 104.5625, |
| "learning_rate": 1.857654431512981e-07, |
| "loss": 97.5248, |
| "step": 4135 |
| }, |
| { |
| "epoch": 0.8337581421693572, |
| "grad_norm": 102.3125, |
| "learning_rate": 1.8464637421665174e-07, |
| "loss": 96.6869, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.8347650964473394, |
| "grad_norm": 105.5625, |
| "learning_rate": 1.8352730528200535e-07, |
| "loss": 96.7125, |
| "step": 4145 |
| }, |
| { |
| "epoch": 0.8357720507253218, |
| "grad_norm": 106.9375, |
| "learning_rate": 1.8240823634735898e-07, |
| "loss": 98.5368, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.836779005003304, |
| "grad_norm": 107.375, |
| "learning_rate": 1.812891674127126e-07, |
| "loss": 97.8067, |
| "step": 4155 |
| }, |
| { |
| "epoch": 0.8377859592812864, |
| "grad_norm": 103.0625, |
| "learning_rate": 1.8017009847806626e-07, |
| "loss": 96.4311, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.8387929135592687, |
| "grad_norm": 105.375, |
| "learning_rate": 1.7905102954341987e-07, |
| "loss": 97.63, |
| "step": 4165 |
| }, |
| { |
| "epoch": 0.839799867837251, |
| "grad_norm": 104.3125, |
| "learning_rate": 1.779319606087735e-07, |
| "loss": 98.0877, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.8408068221152333, |
| "grad_norm": 104.8125, |
| "learning_rate": 1.7681289167412712e-07, |
| "loss": 96.8849, |
| "step": 4175 |
| }, |
| { |
| "epoch": 0.8418137763932156, |
| "grad_norm": 104.1875, |
| "learning_rate": 1.7569382273948075e-07, |
| "loss": 96.4972, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.842820730671198, |
| "grad_norm": 103.8125, |
| "learning_rate": 1.7457475380483437e-07, |
| "loss": 96.8067, |
| "step": 4185 |
| }, |
| { |
| "epoch": 0.8438276849491803, |
| "grad_norm": 104.875, |
| "learning_rate": 1.73455684870188e-07, |
| "loss": 97.2139, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.8448346392271626, |
| "grad_norm": 106.8125, |
| "learning_rate": 1.7233661593554164e-07, |
| "loss": 96.7182, |
| "step": 4195 |
| }, |
| { |
| "epoch": 0.8458415935051449, |
| "grad_norm": 108.6875, |
| "learning_rate": 1.7121754700089525e-07, |
| "loss": 97.3783, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.8468485477831272, |
| "grad_norm": 105.75, |
| "learning_rate": 1.700984780662489e-07, |
| "loss": 97.2013, |
| "step": 4205 |
| }, |
| { |
| "epoch": 0.8478555020611095, |
| "grad_norm": 106.875, |
| "learning_rate": 1.689794091316025e-07, |
| "loss": 97.2643, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.8488624563390919, |
| "grad_norm": 105.625, |
| "learning_rate": 1.6786034019695614e-07, |
| "loss": 97.3306, |
| "step": 4215 |
| }, |
| { |
| "epoch": 0.8498694106170742, |
| "grad_norm": 103.8125, |
| "learning_rate": 1.6674127126230975e-07, |
| "loss": 97.9119, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.8508763648950565, |
| "grad_norm": 102.625, |
| "learning_rate": 1.6562220232766338e-07, |
| "loss": 97.3807, |
| "step": 4225 |
| }, |
| { |
| "epoch": 0.8518833191730388, |
| "grad_norm": 107.1875, |
| "learning_rate": 1.64503133393017e-07, |
| "loss": 97.2101, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.8528902734510211, |
| "grad_norm": 104.875, |
| "learning_rate": 1.6338406445837063e-07, |
| "loss": 97.9154, |
| "step": 4235 |
| }, |
| { |
| "epoch": 0.8538972277290034, |
| "grad_norm": 105.1875, |
| "learning_rate": 1.6226499552372424e-07, |
| "loss": 97.5589, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.8549041820069857, |
| "grad_norm": 108.125, |
| "learning_rate": 1.6114592658907788e-07, |
| "loss": 97.9489, |
| "step": 4245 |
| }, |
| { |
| "epoch": 0.8559111362849681, |
| "grad_norm": 108.625, |
| "learning_rate": 1.600268576544315e-07, |
| "loss": 97.1754, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.8569180905629504, |
| "grad_norm": 107.8125, |
| "learning_rate": 1.5890778871978513e-07, |
| "loss": 97.4207, |
| "step": 4255 |
| }, |
| { |
| "epoch": 0.8579250448409327, |
| "grad_norm": 108.0625, |
| "learning_rate": 1.5778871978513874e-07, |
| "loss": 97.7349, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.858931999118915, |
| "grad_norm": 106.9375, |
| "learning_rate": 1.5666965085049238e-07, |
| "loss": 96.8319, |
| "step": 4265 |
| }, |
| { |
| "epoch": 0.8599389533968973, |
| "grad_norm": 105.125, |
| "learning_rate": 1.5555058191584599e-07, |
| "loss": 97.4651, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.8609459076748797, |
| "grad_norm": 104.1875, |
| "learning_rate": 1.5443151298119962e-07, |
| "loss": 97.7243, |
| "step": 4275 |
| }, |
| { |
| "epoch": 0.8619528619528619, |
| "grad_norm": 107.1875, |
| "learning_rate": 1.533124440465533e-07, |
| "loss": 96.1737, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.8629598162308443, |
| "grad_norm": 103.4375, |
| "learning_rate": 1.521933751119069e-07, |
| "loss": 97.9076, |
| "step": 4285 |
| }, |
| { |
| "epoch": 0.8639667705088265, |
| "grad_norm": 103.9375, |
| "learning_rate": 1.5107430617726054e-07, |
| "loss": 96.7344, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.8649737247868089, |
| "grad_norm": 105.125, |
| "learning_rate": 1.4995523724261415e-07, |
| "loss": 96.4767, |
| "step": 4295 |
| }, |
| { |
| "epoch": 0.8659806790647913, |
| "grad_norm": 106.0, |
| "learning_rate": 1.4883616830796778e-07, |
| "loss": 96.0296, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.8669876333427735, |
| "grad_norm": 103.4375, |
| "learning_rate": 1.477170993733214e-07, |
| "loss": 96.7257, |
| "step": 4305 |
| }, |
| { |
| "epoch": 0.8679945876207559, |
| "grad_norm": 107.625, |
| "learning_rate": 1.4659803043867503e-07, |
| "loss": 96.7568, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.8690015418987381, |
| "grad_norm": 107.1875, |
| "learning_rate": 1.4547896150402864e-07, |
| "loss": 97.2062, |
| "step": 4315 |
| }, |
| { |
| "epoch": 0.8700084961767205, |
| "grad_norm": 103.0, |
| "learning_rate": 1.4435989256938228e-07, |
| "loss": 96.2074, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.8710154504547027, |
| "grad_norm": 103.875, |
| "learning_rate": 1.432408236347359e-07, |
| "loss": 96.5843, |
| "step": 4325 |
| }, |
| { |
| "epoch": 0.8720224047326851, |
| "grad_norm": 105.5625, |
| "learning_rate": 1.4212175470008953e-07, |
| "loss": 97.9795, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.8730293590106675, |
| "grad_norm": 107.6875, |
| "learning_rate": 1.4100268576544314e-07, |
| "loss": 97.3432, |
| "step": 4335 |
| }, |
| { |
| "epoch": 0.8740363132886497, |
| "grad_norm": 103.3125, |
| "learning_rate": 1.3988361683079678e-07, |
| "loss": 95.1063, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.8750432675666321, |
| "grad_norm": 104.3125, |
| "learning_rate": 1.3876454789615039e-07, |
| "loss": 95.7163, |
| "step": 4345 |
| }, |
| { |
| "epoch": 0.8760502218446143, |
| "grad_norm": 104.9375, |
| "learning_rate": 1.3764547896150402e-07, |
| "loss": 96.0049, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.8770571761225967, |
| "grad_norm": 104.75, |
| "learning_rate": 1.3652641002685763e-07, |
| "loss": 96.9776, |
| "step": 4355 |
| }, |
| { |
| "epoch": 0.878064130400579, |
| "grad_norm": 105.6875, |
| "learning_rate": 1.3540734109221127e-07, |
| "loss": 94.5039, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.8790710846785613, |
| "grad_norm": 106.1875, |
| "learning_rate": 1.342882721575649e-07, |
| "loss": 96.5091, |
| "step": 4365 |
| }, |
| { |
| "epoch": 0.8800780389565437, |
| "grad_norm": 107.5625, |
| "learning_rate": 1.3316920322291852e-07, |
| "loss": 95.8942, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.8810849932345259, |
| "grad_norm": 109.0, |
| "learning_rate": 1.3205013428827216e-07, |
| "loss": 96.0599, |
| "step": 4375 |
| }, |
| { |
| "epoch": 0.8820919475125083, |
| "grad_norm": 106.875, |
| "learning_rate": 1.3093106535362577e-07, |
| "loss": 97.5782, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.8830989017904906, |
| "grad_norm": 105.5625, |
| "learning_rate": 1.298119964189794e-07, |
| "loss": 96.5007, |
| "step": 4385 |
| }, |
| { |
| "epoch": 0.8841058560684729, |
| "grad_norm": 104.625, |
| "learning_rate": 1.2869292748433302e-07, |
| "loss": 95.4609, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.8851128103464552, |
| "grad_norm": 108.4375, |
| "learning_rate": 1.2757385854968665e-07, |
| "loss": 97.2176, |
| "step": 4395 |
| }, |
| { |
| "epoch": 0.8861197646244375, |
| "grad_norm": 104.8125, |
| "learning_rate": 1.2645478961504026e-07, |
| "loss": 96.037, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.8871267189024198, |
| "grad_norm": 105.3125, |
| "learning_rate": 1.2533572068039393e-07, |
| "loss": 95.1831, |
| "step": 4405 |
| }, |
| { |
| "epoch": 0.8881336731804022, |
| "grad_norm": 102.5, |
| "learning_rate": 1.2421665174574754e-07, |
| "loss": 94.7369, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.8891406274583845, |
| "grad_norm": 105.3125, |
| "learning_rate": 1.2309758281110117e-07, |
| "loss": 95.9481, |
| "step": 4415 |
| }, |
| { |
| "epoch": 0.8901475817363668, |
| "grad_norm": 104.4375, |
| "learning_rate": 1.2197851387645479e-07, |
| "loss": 96.1412, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.8911545360143491, |
| "grad_norm": 102.75, |
| "learning_rate": 1.2085944494180842e-07, |
| "loss": 96.705, |
| "step": 4425 |
| }, |
| { |
| "epoch": 0.8921614902923314, |
| "grad_norm": 107.8125, |
| "learning_rate": 1.1974037600716203e-07, |
| "loss": 96.1038, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.8931684445703137, |
| "grad_norm": 109.875, |
| "learning_rate": 1.1862130707251566e-07, |
| "loss": 98.2343, |
| "step": 4435 |
| }, |
| { |
| "epoch": 0.894175398848296, |
| "grad_norm": 104.8125, |
| "learning_rate": 1.175022381378693e-07, |
| "loss": 98.8879, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.8951823531262784, |
| "grad_norm": 104.5625, |
| "learning_rate": 1.1638316920322292e-07, |
| "loss": 95.8946, |
| "step": 4445 |
| }, |
| { |
| "epoch": 0.8961893074042607, |
| "grad_norm": 107.9375, |
| "learning_rate": 1.1526410026857654e-07, |
| "loss": 96.5908, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.897196261682243, |
| "grad_norm": 100.6875, |
| "learning_rate": 1.1414503133393017e-07, |
| "loss": 97.5184, |
| "step": 4455 |
| }, |
| { |
| "epoch": 0.8982032159602253, |
| "grad_norm": 105.8125, |
| "learning_rate": 1.1302596239928379e-07, |
| "loss": 97.1954, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.8992101702382076, |
| "grad_norm": 101.1875, |
| "learning_rate": 1.1190689346463741e-07, |
| "loss": 95.303, |
| "step": 4465 |
| }, |
| { |
| "epoch": 0.90021712451619, |
| "grad_norm": 106.9375, |
| "learning_rate": 1.1078782452999104e-07, |
| "loss": 95.9828, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.9012240787941722, |
| "grad_norm": 109.875, |
| "learning_rate": 1.0966875559534466e-07, |
| "loss": 96.7188, |
| "step": 4475 |
| }, |
| { |
| "epoch": 0.9022310330721546, |
| "grad_norm": 103.5, |
| "learning_rate": 1.0854968666069829e-07, |
| "loss": 96.2868, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.9032379873501368, |
| "grad_norm": 108.25, |
| "learning_rate": 1.0743061772605191e-07, |
| "loss": 97.6396, |
| "step": 4485 |
| }, |
| { |
| "epoch": 0.9042449416281192, |
| "grad_norm": 105.625, |
| "learning_rate": 1.0631154879140553e-07, |
| "loss": 95.9678, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.9052518959061016, |
| "grad_norm": 105.1875, |
| "learning_rate": 1.0519247985675917e-07, |
| "loss": 96.746, |
| "step": 4495 |
| }, |
| { |
| "epoch": 0.9062588501840838, |
| "grad_norm": 107.25, |
| "learning_rate": 1.0407341092211281e-07, |
| "loss": 95.7666, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.9062588501840838, |
| "eval_loss": 3.013758897781372, |
| "eval_runtime": 241.0945, |
| "eval_samples_per_second": 1109.992, |
| "eval_steps_per_second": 34.688, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.9072658044620662, |
| "grad_norm": 104.1875, |
| "learning_rate": 1.0295434198746643e-07, |
| "loss": 96.1257, |
| "step": 4505 |
| }, |
| { |
| "epoch": 0.9082727587400484, |
| "grad_norm": 105.625, |
| "learning_rate": 1.0183527305282006e-07, |
| "loss": 96.9505, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.9092797130180308, |
| "grad_norm": 108.375, |
| "learning_rate": 1.0071620411817368e-07, |
| "loss": 96.6111, |
| "step": 4515 |
| }, |
| { |
| "epoch": 0.910286667296013, |
| "grad_norm": 106.4375, |
| "learning_rate": 9.95971351835273e-08, |
| "loss": 97.3165, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.9112936215739954, |
| "grad_norm": 105.375, |
| "learning_rate": 9.847806624888093e-08, |
| "loss": 97.2006, |
| "step": 4525 |
| }, |
| { |
| "epoch": 0.9123005758519778, |
| "grad_norm": 109.0, |
| "learning_rate": 9.735899731423455e-08, |
| "loss": 96.5357, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.91330753012996, |
| "grad_norm": 104.4375, |
| "learning_rate": 9.623992837958818e-08, |
| "loss": 96.6608, |
| "step": 4535 |
| }, |
| { |
| "epoch": 0.9143144844079424, |
| "grad_norm": 103.4375, |
| "learning_rate": 9.51208594449418e-08, |
| "loss": 96.2924, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.9153214386859246, |
| "grad_norm": 101.5, |
| "learning_rate": 9.400179051029543e-08, |
| "loss": 94.9458, |
| "step": 4545 |
| }, |
| { |
| "epoch": 0.916328392963907, |
| "grad_norm": 108.875, |
| "learning_rate": 9.288272157564905e-08, |
| "loss": 95.3736, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.9173353472418893, |
| "grad_norm": 106.0, |
| "learning_rate": 9.176365264100267e-08, |
| "loss": 94.3943, |
| "step": 4555 |
| }, |
| { |
| "epoch": 0.9183423015198716, |
| "grad_norm": 105.4375, |
| "learning_rate": 9.06445837063563e-08, |
| "loss": 97.27, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.919349255797854, |
| "grad_norm": 106.3125, |
| "learning_rate": 8.952551477170993e-08, |
| "loss": 95.4415, |
| "step": 4565 |
| }, |
| { |
| "epoch": 0.9203562100758362, |
| "grad_norm": 107.1875, |
| "learning_rate": 8.840644583706356e-08, |
| "loss": 96.8434, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.9213631643538186, |
| "grad_norm": 105.1875, |
| "learning_rate": 8.728737690241718e-08, |
| "loss": 96.2896, |
| "step": 4575 |
| }, |
| { |
| "epoch": 0.9223701186318009, |
| "grad_norm": 104.875, |
| "learning_rate": 8.616830796777082e-08, |
| "loss": 97.0949, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.9233770729097832, |
| "grad_norm": 107.375, |
| "learning_rate": 8.504923903312444e-08, |
| "loss": 96.0602, |
| "step": 4585 |
| }, |
| { |
| "epoch": 0.9243840271877655, |
| "grad_norm": 105.0, |
| "learning_rate": 8.393017009847807e-08, |
| "loss": 96.6697, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.9253909814657478, |
| "grad_norm": 103.5, |
| "learning_rate": 8.281110116383169e-08, |
| "loss": 95.5824, |
| "step": 4595 |
| }, |
| { |
| "epoch": 0.9263979357437301, |
| "grad_norm": 107.5625, |
| "learning_rate": 8.169203222918532e-08, |
| "loss": 96.6081, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.9274048900217124, |
| "grad_norm": 108.4375, |
| "learning_rate": 8.057296329453894e-08, |
| "loss": 96.3714, |
| "step": 4605 |
| }, |
| { |
| "epoch": 0.9284118442996948, |
| "grad_norm": 105.3125, |
| "learning_rate": 7.945389435989256e-08, |
| "loss": 95.8521, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.9294187985776771, |
| "grad_norm": 108.125, |
| "learning_rate": 7.833482542524619e-08, |
| "loss": 96.356, |
| "step": 4615 |
| }, |
| { |
| "epoch": 0.9304257528556594, |
| "grad_norm": 105.9375, |
| "learning_rate": 7.721575649059981e-08, |
| "loss": 96.4865, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.9314327071336417, |
| "grad_norm": 105.6875, |
| "learning_rate": 7.609668755595345e-08, |
| "loss": 95.1476, |
| "step": 4625 |
| }, |
| { |
| "epoch": 0.932439661411624, |
| "grad_norm": 106.9375, |
| "learning_rate": 7.497761862130707e-08, |
| "loss": 95.1061, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.9334466156896063, |
| "grad_norm": 105.5625, |
| "learning_rate": 7.38585496866607e-08, |
| "loss": 95.2852, |
| "step": 4635 |
| }, |
| { |
| "epoch": 0.9344535699675887, |
| "grad_norm": 107.5625, |
| "learning_rate": 7.273948075201432e-08, |
| "loss": 95.0002, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.935460524245571, |
| "grad_norm": 106.5625, |
| "learning_rate": 7.162041181736795e-08, |
| "loss": 97.3515, |
| "step": 4645 |
| }, |
| { |
| "epoch": 0.9364674785235533, |
| "grad_norm": 106.875, |
| "learning_rate": 7.050134288272157e-08, |
| "loss": 96.8893, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.9374744328015356, |
| "grad_norm": 106.125, |
| "learning_rate": 6.938227394807519e-08, |
| "loss": 96.1281, |
| "step": 4655 |
| }, |
| { |
| "epoch": 0.9384813870795179, |
| "grad_norm": 105.4375, |
| "learning_rate": 6.826320501342882e-08, |
| "loss": 95.932, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.9394883413575003, |
| "grad_norm": 106.0625, |
| "learning_rate": 6.714413607878245e-08, |
| "loss": 96.195, |
| "step": 4665 |
| }, |
| { |
| "epoch": 0.9404952956354825, |
| "grad_norm": 106.1875, |
| "learning_rate": 6.602506714413608e-08, |
| "loss": 94.7684, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.9415022499134649, |
| "grad_norm": 109.0, |
| "learning_rate": 6.49059982094897e-08, |
| "loss": 96.4495, |
| "step": 4675 |
| }, |
| { |
| "epoch": 0.9425092041914472, |
| "grad_norm": 109.0, |
| "learning_rate": 6.378692927484333e-08, |
| "loss": 96.9962, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.9435161584694295, |
| "grad_norm": 104.3125, |
| "learning_rate": 6.266786034019696e-08, |
| "loss": 94.3069, |
| "step": 4685 |
| }, |
| { |
| "epoch": 0.9445231127474119, |
| "grad_norm": 107.625, |
| "learning_rate": 6.154879140555059e-08, |
| "loss": 96.7521, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.9455300670253941, |
| "grad_norm": 104.3125, |
| "learning_rate": 6.042972247090421e-08, |
| "loss": 96.0066, |
| "step": 4695 |
| }, |
| { |
| "epoch": 0.9465370213033765, |
| "grad_norm": 104.875, |
| "learning_rate": 5.931065353625783e-08, |
| "loss": 94.7801, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.9475439755813587, |
| "grad_norm": 106.375, |
| "learning_rate": 5.819158460161146e-08, |
| "loss": 95.1509, |
| "step": 4705 |
| }, |
| { |
| "epoch": 0.9485509298593411, |
| "grad_norm": 104.9375, |
| "learning_rate": 5.7072515666965083e-08, |
| "loss": 95.5377, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.9495578841373233, |
| "grad_norm": 105.0, |
| "learning_rate": 5.595344673231871e-08, |
| "loss": 96.0342, |
| "step": 4715 |
| }, |
| { |
| "epoch": 0.9505648384153057, |
| "grad_norm": 106.8125, |
| "learning_rate": 5.483437779767233e-08, |
| "loss": 95.7919, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.9515717926932881, |
| "grad_norm": 104.0625, |
| "learning_rate": 5.3715308863025955e-08, |
| "loss": 95.4794, |
| "step": 4725 |
| }, |
| { |
| "epoch": 0.9525787469712703, |
| "grad_norm": 105.1875, |
| "learning_rate": 5.2596239928379586e-08, |
| "loss": 96.2796, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.9535857012492527, |
| "grad_norm": 107.4375, |
| "learning_rate": 5.147717099373322e-08, |
| "loss": 96.9097, |
| "step": 4735 |
| }, |
| { |
| "epoch": 0.9545926555272349, |
| "grad_norm": 104.0, |
| "learning_rate": 5.035810205908684e-08, |
| "loss": 95.2215, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.9555996098052173, |
| "grad_norm": 103.875, |
| "learning_rate": 4.9239033124440465e-08, |
| "loss": 95.531, |
| "step": 4745 |
| }, |
| { |
| "epoch": 0.9566065640831996, |
| "grad_norm": 104.5625, |
| "learning_rate": 4.811996418979409e-08, |
| "loss": 94.8213, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.9576135183611819, |
| "grad_norm": 103.125, |
| "learning_rate": 4.700089525514771e-08, |
| "loss": 94.9124, |
| "step": 4755 |
| }, |
| { |
| "epoch": 0.9586204726391643, |
| "grad_norm": 108.75, |
| "learning_rate": 4.588182632050134e-08, |
| "loss": 94.8872, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.9596274269171465, |
| "grad_norm": 106.0625, |
| "learning_rate": 4.476275738585497e-08, |
| "loss": 94.9003, |
| "step": 4765 |
| }, |
| { |
| "epoch": 0.9606343811951289, |
| "grad_norm": 109.0, |
| "learning_rate": 4.364368845120859e-08, |
| "loss": 97.4909, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.9616413354731111, |
| "grad_norm": 105.1875, |
| "learning_rate": 4.252461951656222e-08, |
| "loss": 95.4977, |
| "step": 4775 |
| }, |
| { |
| "epoch": 0.9626482897510935, |
| "grad_norm": 103.4375, |
| "learning_rate": 4.1405550581915846e-08, |
| "loss": 95.1702, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.9636552440290758, |
| "grad_norm": 105.4375, |
| "learning_rate": 4.028648164726947e-08, |
| "loss": 95.1124, |
| "step": 4785 |
| }, |
| { |
| "epoch": 0.9646621983070581, |
| "grad_norm": 107.125, |
| "learning_rate": 3.9167412712623094e-08, |
| "loss": 95.5008, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.9656691525850404, |
| "grad_norm": 103.8125, |
| "learning_rate": 3.8048343777976725e-08, |
| "loss": 96.745, |
| "step": 4795 |
| }, |
| { |
| "epoch": 0.9666761068630227, |
| "grad_norm": 103.875, |
| "learning_rate": 3.692927484333035e-08, |
| "loss": 96.3884, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.9676830611410051, |
| "grad_norm": 103.375, |
| "learning_rate": 3.581020590868397e-08, |
| "loss": 94.6912, |
| "step": 4805 |
| }, |
| { |
| "epoch": 0.9686900154189874, |
| "grad_norm": 106.125, |
| "learning_rate": 3.4691136974037597e-08, |
| "loss": 95.0865, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.9696969696969697, |
| "grad_norm": 103.625, |
| "learning_rate": 3.357206803939123e-08, |
| "loss": 93.7961, |
| "step": 4815 |
| }, |
| { |
| "epoch": 0.970703923974952, |
| "grad_norm": 104.3125, |
| "learning_rate": 3.245299910474485e-08, |
| "loss": 95.4935, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.9717108782529343, |
| "grad_norm": 102.0625, |
| "learning_rate": 3.133393017009848e-08, |
| "loss": 95.072, |
| "step": 4825 |
| }, |
| { |
| "epoch": 0.9727178325309166, |
| "grad_norm": 107.1875, |
| "learning_rate": 3.0214861235452106e-08, |
| "loss": 95.2949, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.973724786808899, |
| "grad_norm": 105.375, |
| "learning_rate": 2.909579230080573e-08, |
| "loss": 94.6876, |
| "step": 4835 |
| }, |
| { |
| "epoch": 0.9747317410868813, |
| "grad_norm": 104.5625, |
| "learning_rate": 2.7976723366159354e-08, |
| "loss": 96.9518, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.9757386953648636, |
| "grad_norm": 106.1875, |
| "learning_rate": 2.6857654431512978e-08, |
| "loss": 95.4756, |
| "step": 4845 |
| }, |
| { |
| "epoch": 0.9767456496428459, |
| "grad_norm": 107.5625, |
| "learning_rate": 2.573858549686661e-08, |
| "loss": 94.8716, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.9777526039208282, |
| "grad_norm": 103.4375, |
| "learning_rate": 2.4619516562220232e-08, |
| "loss": 95.0801, |
| "step": 4855 |
| }, |
| { |
| "epoch": 0.9787595581988106, |
| "grad_norm": 102.75, |
| "learning_rate": 2.3500447627573856e-08, |
| "loss": 95.3318, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.9797665124767928, |
| "grad_norm": 107.75, |
| "learning_rate": 2.2381378692927484e-08, |
| "loss": 95.9054, |
| "step": 4865 |
| }, |
| { |
| "epoch": 0.9807734667547752, |
| "grad_norm": 103.1875, |
| "learning_rate": 2.126230975828111e-08, |
| "loss": 95.6948, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.9817804210327575, |
| "grad_norm": 107.5625, |
| "learning_rate": 2.0143240823634735e-08, |
| "loss": 95.1651, |
| "step": 4875 |
| }, |
| { |
| "epoch": 0.9827873753107398, |
| "grad_norm": 102.3125, |
| "learning_rate": 1.9024171888988362e-08, |
| "loss": 95.8977, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.9837943295887221, |
| "grad_norm": 107.3125, |
| "learning_rate": 1.7905102954341986e-08, |
| "loss": 94.0943, |
| "step": 4885 |
| }, |
| { |
| "epoch": 0.9848012838667044, |
| "grad_norm": 105.8125, |
| "learning_rate": 1.6786034019695614e-08, |
| "loss": 96.5686, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.9858082381446868, |
| "grad_norm": 104.75, |
| "learning_rate": 1.566696508504924e-08, |
| "loss": 96.2139, |
| "step": 4895 |
| }, |
| { |
| "epoch": 0.986815192422669, |
| "grad_norm": 106.5625, |
| "learning_rate": 1.4547896150402865e-08, |
| "loss": 96.4123, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.9878221467006514, |
| "grad_norm": 107.5, |
| "learning_rate": 1.3428827215756489e-08, |
| "loss": 95.4067, |
| "step": 4905 |
| }, |
| { |
| "epoch": 0.9888291009786336, |
| "grad_norm": 106.125, |
| "learning_rate": 1.2309758281110116e-08, |
| "loss": 96.4161, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.989836055256616, |
| "grad_norm": 104.6875, |
| "learning_rate": 1.1190689346463742e-08, |
| "loss": 94.9028, |
| "step": 4915 |
| }, |
| { |
| "epoch": 0.9908430095345984, |
| "grad_norm": 106.0625, |
| "learning_rate": 1.0071620411817367e-08, |
| "loss": 96.9095, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.9918499638125806, |
| "grad_norm": 106.8125, |
| "learning_rate": 8.952551477170993e-09, |
| "loss": 94.9621, |
| "step": 4925 |
| }, |
| { |
| "epoch": 0.992856918090563, |
| "grad_norm": 105.75, |
| "learning_rate": 7.83348254252462e-09, |
| "loss": 95.0764, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.9938638723685452, |
| "grad_norm": 107.5, |
| "learning_rate": 6.7144136078782444e-09, |
| "loss": 96.6513, |
| "step": 4935 |
| }, |
| { |
| "epoch": 0.9948708266465276, |
| "grad_norm": 104.1875, |
| "learning_rate": 5.595344673231871e-09, |
| "loss": 94.489, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.9958777809245098, |
| "grad_norm": 105.75, |
| "learning_rate": 4.4762757385854966e-09, |
| "loss": 95.3881, |
| "step": 4945 |
| }, |
| { |
| "epoch": 0.9968847352024922, |
| "grad_norm": 108.0625, |
| "learning_rate": 3.3572068039391222e-09, |
| "loss": 95.4261, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.9978916894804746, |
| "grad_norm": 105.9375, |
| "learning_rate": 2.2381378692927483e-09, |
| "loss": 95.8491, |
| "step": 4955 |
| }, |
| { |
| "epoch": 0.9988986437584568, |
| "grad_norm": 104.6875, |
| "learning_rate": 1.1190689346463741e-09, |
| "loss": 94.8424, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.9999055980364392, |
| "grad_norm": 101.75, |
| "learning_rate": 0.0, |
| "loss": 94.7523, |
| "step": 4965 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 4965, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.151015743419633e+19, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|