| { | |
| "best_metric": 0.35205078125, | |
| "best_model_checkpoint": "./results/checkpoint-7094", | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 10641, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 2.7654, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 2.862, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 146.36835642526796, | |
| "learning_rate": 4.8e-06, | |
| "loss": 2.4598, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 254.1399327894623, | |
| "learning_rate": 1.02e-05, | |
| "loss": 3.048, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 55.794104847848445, | |
| "learning_rate": 1.6199999999999997e-05, | |
| "loss": 2.4278, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 29.62062990171776, | |
| "learning_rate": 2.2199999999999998e-05, | |
| "loss": 0.9157, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 36.90337952409684, | |
| "learning_rate": 2.8199999999999998e-05, | |
| "loss": 1.0977, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 70.28513966911416, | |
| "learning_rate": 3.42e-05, | |
| "loss": 1.4851, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 76.38058442522734, | |
| "learning_rate": 4.02e-05, | |
| "loss": 0.9004, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 215.04691118737435, | |
| "learning_rate": 4.62e-05, | |
| "loss": 1.3784, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 156.18636353053705, | |
| "learning_rate": 5.2199999999999995e-05, | |
| "loss": 1.9046, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 83.58499534326243, | |
| "learning_rate": 5.82e-05, | |
| "loss": 1.8243, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 58.89968090715743, | |
| "learning_rate": 6.419999999999999e-05, | |
| "loss": 3.5648, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 279.1237096487612, | |
| "learning_rate": 7.02e-05, | |
| "loss": 1.3321, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 14.282470404736001, | |
| "learning_rate": 7.62e-05, | |
| "loss": 0.7693, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 21.416320260069092, | |
| "learning_rate": 8.22e-05, | |
| "loss": 0.7282, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 7.027757159631835, | |
| "learning_rate": 8.819999999999999e-05, | |
| "loss": 0.8376, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 28.808627534490917, | |
| "learning_rate": 9.419999999999999e-05, | |
| "loss": 0.8771, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 55.14542562374617, | |
| "learning_rate": 0.0001002, | |
| "loss": 1.2092, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 12.485175678803063, | |
| "learning_rate": 0.00010619999999999998, | |
| "loss": 0.7898, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 30.98880210639734, | |
| "learning_rate": 0.00011219999999999999, | |
| "loss": 1.1421, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 34.4471000818379, | |
| "learning_rate": 0.0001182, | |
| "loss": 0.755, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 25.20646584085208, | |
| "learning_rate": 0.00012419999999999998, | |
| "loss": 0.8171, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 57.11988442886105, | |
| "learning_rate": 0.0001302, | |
| "loss": 0.9672, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 31.266794385874547, | |
| "learning_rate": 0.0001362, | |
| "loss": 0.863, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 28.801620787803333, | |
| "learning_rate": 0.0001422, | |
| "loss": 0.9292, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 18.379715667965055, | |
| "learning_rate": 0.0001482, | |
| "loss": 0.6885, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 26.615184398415803, | |
| "learning_rate": 0.00015419999999999998, | |
| "loss": 0.8698, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 27.84018584708001, | |
| "learning_rate": 0.0001602, | |
| "loss": 0.7403, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 83.6950577392233, | |
| "learning_rate": 0.0001662, | |
| "loss": 1.7649, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 62.62507175586115, | |
| "learning_rate": 0.00017219999999999998, | |
| "loss": 1.5992, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 34.83362360351182, | |
| "learning_rate": 0.00017819999999999997, | |
| "loss": 3.7618, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.1566836386456626, | |
| "learning_rate": 0.00018419999999999998, | |
| "loss": 1.1883, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 44.53868085198857, | |
| "learning_rate": 0.0001902, | |
| "loss": 1.5033, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 16.47571390018737, | |
| "learning_rate": 0.0001962, | |
| "loss": 0.7708, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 90.50256233776733, | |
| "learning_rate": 0.0002022, | |
| "loss": 1.0146, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 16.535117663656802, | |
| "learning_rate": 0.00020819999999999996, | |
| "loss": 0.673, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 41.013763949361135, | |
| "learning_rate": 0.00021419999999999998, | |
| "loss": 0.7536, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 50.83635956076198, | |
| "learning_rate": 0.00022019999999999999, | |
| "loss": 0.8832, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 51.192653868723845, | |
| "learning_rate": 0.00022619999999999997, | |
| "loss": 0.756, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 95.02104619699388, | |
| "learning_rate": 0.00023219999999999998, | |
| "loss": 1.0475, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 14.755367161274899, | |
| "learning_rate": 0.0002382, | |
| "loss": 0.8114, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 26.824869427969787, | |
| "learning_rate": 0.00024419999999999997, | |
| "loss": 0.7827, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 10.199008591764807, | |
| "learning_rate": 0.00025019999999999996, | |
| "loss": 0.7201, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 16.099046337033606, | |
| "learning_rate": 0.0002562, | |
| "loss": 0.7852, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 72.84156691472333, | |
| "learning_rate": 0.0002622, | |
| "loss": 0.7819, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 23.060057287801556, | |
| "learning_rate": 0.00026819999999999996, | |
| "loss": 1.1294, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 18.372892721573056, | |
| "learning_rate": 0.0002742, | |
| "loss": 0.8509, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 44.80535522734965, | |
| "learning_rate": 0.0002802, | |
| "loss": 1.3075, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 53.039346895060866, | |
| "learning_rate": 0.00028619999999999996, | |
| "loss": 0.8981, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 8.933950393723551, | |
| "learning_rate": 0.00029219999999999995, | |
| "loss": 0.8596, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 54.86618147368649, | |
| "learning_rate": 0.0002982, | |
| "loss": 0.8419, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 52.0156886597986, | |
| "learning_rate": 0.00029984658094681473, | |
| "loss": 1.0992, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 48.61185102339785, | |
| "learning_rate": 0.00029962741087083574, | |
| "loss": 0.8373, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 85.88381074144633, | |
| "learning_rate": 0.0002994082407948568, | |
| "loss": 0.9952, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 46.828573117866156, | |
| "learning_rate": 0.0002991890707188778, | |
| "loss": 1.2285, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.319911680589826, | |
| "learning_rate": 0.00029896990064289886, | |
| "loss": 1.0784, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 49.18948488974743, | |
| "learning_rate": 0.0002987507305669199, | |
| "loss": 0.6879, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 14.790075292508273, | |
| "learning_rate": 0.0002985315604909409, | |
| "loss": 0.7182, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 81.77111956641443, | |
| "learning_rate": 0.000298312390414962, | |
| "loss": 0.6341, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.015074610499661, | |
| "learning_rate": 0.00029809322033898304, | |
| "loss": 0.8418, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 43.326414318341016, | |
| "learning_rate": 0.00029787405026300405, | |
| "loss": 0.7158, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 21.749351652802584, | |
| "learning_rate": 0.0002976548801870251, | |
| "loss": 0.5541, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 104.8554633037212, | |
| "learning_rate": 0.00029743571011104616, | |
| "loss": 0.7715, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 53.28037677060509, | |
| "learning_rate": 0.00029721654003506717, | |
| "loss": 0.7089, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 60.68620976609669, | |
| "learning_rate": 0.00029699736995908823, | |
| "loss": 0.7251, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 83.67598838205309, | |
| "learning_rate": 0.0002967781998831093, | |
| "loss": 0.9626, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 22.217129800838155, | |
| "learning_rate": 0.0002965590298071303, | |
| "loss": 0.7762, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 86.4481164773122, | |
| "learning_rate": 0.00029633985973115135, | |
| "loss": 0.6535, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 40.75531136163561, | |
| "learning_rate": 0.00029612068965517236, | |
| "loss": 0.6025, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 56.971296704966576, | |
| "learning_rate": 0.0002959015195791934, | |
| "loss": 0.7087, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 30.278786860468646, | |
| "learning_rate": 0.0002956823495032145, | |
| "loss": 0.8943, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 14.890198931647719, | |
| "learning_rate": 0.0002954631794272355, | |
| "loss": 0.8818, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 39.88836482589719, | |
| "learning_rate": 0.00029524400935125654, | |
| "loss": 0.9544, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 102.4851007431489, | |
| "learning_rate": 0.0002950248392752776, | |
| "loss": 0.7461, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 43.156700980283695, | |
| "learning_rate": 0.0002948056691992986, | |
| "loss": 0.7074, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 49.376028998432666, | |
| "learning_rate": 0.00029458649912331966, | |
| "loss": 0.6382, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 31.433406542237964, | |
| "learning_rate": 0.0002943673290473407, | |
| "loss": 0.732, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 44.3715979494319, | |
| "learning_rate": 0.0002941481589713617, | |
| "loss": 0.6883, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 17.06187425481664, | |
| "learning_rate": 0.0002939289888953828, | |
| "loss": 0.648, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 8.777853690776, | |
| "learning_rate": 0.00029370981881940384, | |
| "loss": 0.4979, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.880875214974237, | |
| "learning_rate": 0.00029349064874342485, | |
| "loss": 0.743, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 9.720423607604369, | |
| "learning_rate": 0.0002932714786674459, | |
| "loss": 0.645, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 9.066124973757804, | |
| "learning_rate": 0.00029305230859146697, | |
| "loss": 0.4977, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 23.89066736642937, | |
| "learning_rate": 0.00029283313851548797, | |
| "loss": 0.4926, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 41.695686510198016, | |
| "learning_rate": 0.00029261396843950903, | |
| "loss": 0.7886, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 66.53797988620009, | |
| "learning_rate": 0.00029239479836353004, | |
| "loss": 0.5658, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 55.97718756593597, | |
| "learning_rate": 0.0002921756282875511, | |
| "loss": 0.5484, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 108.03497298548182, | |
| "learning_rate": 0.00029195645821157215, | |
| "loss": 0.7978, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 40.14088131595207, | |
| "learning_rate": 0.00029173728813559316, | |
| "loss": 0.6178, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 48.46822333526267, | |
| "learning_rate": 0.00029151811805961427, | |
| "loss": 0.565, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 16.17488665111733, | |
| "learning_rate": 0.0002912989479836353, | |
| "loss": 0.6886, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 26.592963237868908, | |
| "learning_rate": 0.0002910797779076563, | |
| "loss": 0.6966, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 27.00009471161443, | |
| "learning_rate": 0.0002908606078316774, | |
| "loss": 0.6178, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 28.195576793914913, | |
| "learning_rate": 0.0002906414377556984, | |
| "loss": 0.4663, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 23.538388615789113, | |
| "learning_rate": 0.00029042226767971946, | |
| "loss": 0.7135, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 20.353666443170358, | |
| "learning_rate": 0.00029020309760374046, | |
| "loss": 0.708, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 30.98820805802448, | |
| "learning_rate": 0.0002899839275277615, | |
| "loss": 0.5138, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 49.96332795879376, | |
| "learning_rate": 0.0002897647574517826, | |
| "loss": 0.5238, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 36.472514659373466, | |
| "learning_rate": 0.0002895455873758036, | |
| "loss": 0.7198, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.727047796229911, | |
| "learning_rate": 0.00028932641729982465, | |
| "loss": 0.5427, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.39978136661481, | |
| "learning_rate": 0.0002891072472238457, | |
| "loss": 0.6386, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 26.496446288276136, | |
| "learning_rate": 0.0002888880771478667, | |
| "loss": 0.4019, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 96.50801416396253, | |
| "learning_rate": 0.00028866890707188777, | |
| "loss": 0.8864, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 26.933228918624025, | |
| "learning_rate": 0.00028844973699590883, | |
| "loss": 0.8963, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 20.70682008967005, | |
| "learning_rate": 0.00028825248392752773, | |
| "loss": 0.6956, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 46.724804444462556, | |
| "learning_rate": 0.00028803331385154874, | |
| "loss": 0.8669, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.5954267342032225, | |
| "learning_rate": 0.00028781414377556985, | |
| "loss": 0.9219, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 60.59949911543188, | |
| "learning_rate": 0.00028759497369959086, | |
| "loss": 0.6557, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 37.05416363332341, | |
| "learning_rate": 0.00028737580362361186, | |
| "loss": 0.7692, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 14.82417785628839, | |
| "learning_rate": 0.000287156633547633, | |
| "loss": 0.5669, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 34.71923857782051, | |
| "learning_rate": 0.000286937463471654, | |
| "loss": 0.7427, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.620541208209977, | |
| "learning_rate": 0.00028671829339567504, | |
| "loss": 0.4811, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 65.72414363498706, | |
| "learning_rate": 0.00028649912331969604, | |
| "loss": 0.6696, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 31.255536486132993, | |
| "learning_rate": 0.0002862799532437171, | |
| "loss": 0.6253, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 14.845236948350237, | |
| "learning_rate": 0.00028606078316773816, | |
| "loss": 0.5473, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 62.44169640686947, | |
| "learning_rate": 0.00028584161309175917, | |
| "loss": 0.6686, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 105.6997308934719, | |
| "learning_rate": 0.0002856224430157802, | |
| "loss": 0.7738, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 75.19754745372326, | |
| "learning_rate": 0.0002854032729398013, | |
| "loss": 0.5995, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 34.71806881877583, | |
| "learning_rate": 0.0002851841028638223, | |
| "loss": 0.5701, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 48.159897749652075, | |
| "learning_rate": 0.00028496493278784335, | |
| "loss": 0.4724, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.544316071386722, | |
| "learning_rate": 0.0002847457627118644, | |
| "loss": 0.6537, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 21.365400503740027, | |
| "learning_rate": 0.0002845265926358854, | |
| "loss": 0.4979, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.248896963680607, | |
| "learning_rate": 0.00028430742255990647, | |
| "loss": 0.4174, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 7.535922720940304, | |
| "learning_rate": 0.00028408825248392753, | |
| "loss": 0.7422, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 22.985477551377006, | |
| "learning_rate": 0.00028386908240794854, | |
| "loss": 0.6509, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 44.34169162733889, | |
| "learning_rate": 0.0002836499123319696, | |
| "loss": 0.6908, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 41.93808698385512, | |
| "learning_rate": 0.00028343074225599065, | |
| "loss": 0.5134, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 9.954650885829432, | |
| "learning_rate": 0.00028321157218001166, | |
| "loss": 0.4772, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 14.453953812006848, | |
| "learning_rate": 0.0002829924021040327, | |
| "loss": 0.6207, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.273785434123745, | |
| "learning_rate": 0.0002827732320280537, | |
| "loss": 0.5753, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.858445515524344, | |
| "learning_rate": 0.0002825540619520748, | |
| "loss": 0.5052, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 9.635141224231973, | |
| "learning_rate": 0.00028233489187609584, | |
| "loss": 0.4951, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 43.41263842722645, | |
| "learning_rate": 0.00028211572180011685, | |
| "loss": 0.5636, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 15.495027516009957, | |
| "learning_rate": 0.0002818965517241379, | |
| "loss": 0.466, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 64.28182805477857, | |
| "learning_rate": 0.00028167738164815896, | |
| "loss": 0.5672, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 64.9516330457715, | |
| "learning_rate": 0.00028145821157217997, | |
| "loss": 0.569, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 18.376532518493583, | |
| "learning_rate": 0.00028123904149620103, | |
| "loss": 0.4619, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 8.562914609442593, | |
| "learning_rate": 0.0002810198714202221, | |
| "loss": 0.545, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 60.21363972002377, | |
| "learning_rate": 0.0002808007013442431, | |
| "loss": 0.3975, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 28.047906754092985, | |
| "learning_rate": 0.00028058153126826415, | |
| "loss": 0.6251, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 35.10586789783855, | |
| "learning_rate": 0.0002803623611922852, | |
| "loss": 0.6277, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 12.797561679192818, | |
| "learning_rate": 0.0002801431911163062, | |
| "loss": 0.5604, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 45.77020206433535, | |
| "learning_rate": 0.0002799240210403273, | |
| "loss": 0.5095, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 8.950325235661051, | |
| "learning_rate": 0.00027970485096434833, | |
| "loss": 0.6742, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 89.39440158187, | |
| "learning_rate": 0.00027948568088836934, | |
| "loss": 0.6291, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 42.25863783297151, | |
| "learning_rate": 0.0002792665108123904, | |
| "loss": 0.5981, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 70.1528669289255, | |
| "learning_rate": 0.0002790473407364114, | |
| "loss": 0.6403, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 46.97367429458831, | |
| "learning_rate": 0.00027882817066043246, | |
| "loss": 0.4118, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 40.541175129277065, | |
| "learning_rate": 0.0002786090005844535, | |
| "loss": 0.5049, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 15.523999358550466, | |
| "learning_rate": 0.0002783898305084745, | |
| "loss": 0.4035, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 10.185134626898725, | |
| "learning_rate": 0.0002781706604324956, | |
| "loss": 0.3807, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 41.67374680359159, | |
| "learning_rate": 0.00027795149035651664, | |
| "loss": 0.6452, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 17.98431769773285, | |
| "learning_rate": 0.00027773232028053765, | |
| "loss": 0.4049, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 28.42010603664099, | |
| "learning_rate": 0.0002775131502045587, | |
| "loss": 0.5067, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 8.903302713389115, | |
| "learning_rate": 0.00027729398012857977, | |
| "loss": 0.6382, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 65.95394817146764, | |
| "learning_rate": 0.00027707481005260077, | |
| "loss": 0.651, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 13.423645923612371, | |
| "learning_rate": 0.00027685563997662183, | |
| "loss": 0.4343, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 19.870165040548233, | |
| "learning_rate": 0.0002766364699006429, | |
| "loss": 0.5016, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 7.295472062733364, | |
| "learning_rate": 0.0002764172998246639, | |
| "loss": 0.4203, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 13.426653279288725, | |
| "learning_rate": 0.00027619812974868495, | |
| "loss": 0.5413, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 35.81973630385564, | |
| "learning_rate": 0.00027597895967270596, | |
| "loss": 0.536, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 15.789511808342628, | |
| "learning_rate": 0.000275759789596727, | |
| "loss": 0.3641, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 3.438056573729943, | |
| "learning_rate": 0.0002755406195207481, | |
| "loss": 0.3846, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 43.693835213902354, | |
| "learning_rate": 0.0002753214494447691, | |
| "loss": 0.6304, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 41.69843964932105, | |
| "learning_rate": 0.00027510227936879014, | |
| "loss": 0.5184, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 26.698062076363627, | |
| "learning_rate": 0.0002748831092928112, | |
| "loss": 0.5844, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.7012189494264895, | |
| "learning_rate": 0.0002746639392168322, | |
| "loss": 0.4043, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 52.6718491008472, | |
| "learning_rate": 0.0002744447691408533, | |
| "loss": 0.5674, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 14.061719898202849, | |
| "learning_rate": 0.0002742255990648743, | |
| "loss": 0.4724, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 3.847972088161487, | |
| "learning_rate": 0.00027400642898889533, | |
| "loss": 0.3894, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 10.733410570701619, | |
| "learning_rate": 0.0002737872589129164, | |
| "loss": 0.3706, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.582634401736958, | |
| "learning_rate": 0.00027356808883693745, | |
| "loss": 0.6391, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 26.185314688453758, | |
| "learning_rate": 0.0002733489187609585, | |
| "loss": 0.6093, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.754244803162683, | |
| "learning_rate": 0.0002731297486849795, | |
| "loss": 0.4868, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 39.5290991827534, | |
| "learning_rate": 0.00027291057860900057, | |
| "loss": 0.3798, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 50.24696278971589, | |
| "learning_rate": 0.00027269140853302163, | |
| "loss": 0.5672, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 40.15272638877307, | |
| "learning_rate": 0.00027247223845704263, | |
| "loss": 0.4471, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 28.70418219314436, | |
| "learning_rate": 0.00027225306838106364, | |
| "loss": 0.6382, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 28.561604385347348, | |
| "learning_rate": 0.00027203389830508475, | |
| "loss": 0.4176, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 17.815402549033355, | |
| "learning_rate": 0.00027181472822910576, | |
| "loss": 0.5882, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 15.342408938615899, | |
| "learning_rate": 0.0002715955581531268, | |
| "loss": 0.5019, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 23.598442882592035, | |
| "learning_rate": 0.0002713763880771479, | |
| "loss": 0.556, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 67.23786205262353, | |
| "learning_rate": 0.0002711572180011689, | |
| "loss": 0.5685, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 4.243022538273344, | |
| "learning_rate": 0.00027093804792518994, | |
| "loss": 0.4733, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 13.180044758827014, | |
| "learning_rate": 0.000270718877849211, | |
| "loss": 0.4605, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 22.165941849190077, | |
| "learning_rate": 0.000270499707773232, | |
| "loss": 0.5884, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 63.37369359897044, | |
| "learning_rate": 0.00027028053769725306, | |
| "loss": 0.5035, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 7.126542990903908, | |
| "learning_rate": 0.00027006136762127407, | |
| "loss": 0.5265, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 18.999488189785442, | |
| "learning_rate": 0.0002698421975452951, | |
| "loss": 0.4029, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 74.7735357892713, | |
| "learning_rate": 0.0002696230274693162, | |
| "loss": 0.5864, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 4.537363990276932, | |
| "learning_rate": 0.0002694038573933372, | |
| "loss": 0.4943, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 5.4900230565856125, | |
| "learning_rate": 0.00026918468731735825, | |
| "loss": 0.4046, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 13.0055044025215, | |
| "learning_rate": 0.0002689655172413793, | |
| "loss": 0.4548, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 38.12513704333924, | |
| "learning_rate": 0.0002687463471654003, | |
| "loss": 0.4879, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 39.26658731655891, | |
| "learning_rate": 0.0002685271770894214, | |
| "loss": 0.4619, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 25.60070282398106, | |
| "learning_rate": 0.00026830800701344243, | |
| "loss": 0.3977, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 26.2056587948436, | |
| "learning_rate": 0.00026808883693746344, | |
| "loss": 0.6761, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 6.11452999302761, | |
| "learning_rate": 0.0002678696668614845, | |
| "loss": 0.4193, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 33.901640420477705, | |
| "learning_rate": 0.00026765049678550556, | |
| "loss": 0.3582, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 39.05264118867418, | |
| "learning_rate": 0.00026743132670952656, | |
| "loss": 0.6291, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 37.194416649503715, | |
| "learning_rate": 0.0002672121566335476, | |
| "loss": 0.5405, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 10.433072414980515, | |
| "learning_rate": 0.0002669929865575686, | |
| "loss": 0.3794, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 13.092304284069662, | |
| "learning_rate": 0.0002667738164815897, | |
| "loss": 0.5334, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 63.86550661120767, | |
| "learning_rate": 0.00026655464640561074, | |
| "loss": 0.5122, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 2.517042940759129, | |
| "learning_rate": 0.00026633547632963175, | |
| "loss": 0.5369, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 5.619466465382078, | |
| "learning_rate": 0.0002661163062536528, | |
| "loss": 0.5914, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 8.934697851639797, | |
| "learning_rate": 0.00026589713617767387, | |
| "loss": 0.4139, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 6.230035836028844, | |
| "learning_rate": 0.00026567796610169487, | |
| "loss": 0.5029, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 15.356060765823438, | |
| "learning_rate": 0.00026545879602571593, | |
| "loss": 0.3824, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 7.576984306147784, | |
| "learning_rate": 0.000265239625949737, | |
| "loss": 0.6175, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 28.748362595639772, | |
| "learning_rate": 0.000265020455873758, | |
| "loss": 0.5614, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 16.32965992610039, | |
| "learning_rate": 0.00026480128579777905, | |
| "loss": 0.5116, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 3.2753516984217614, | |
| "learning_rate": 0.0002645821157218001, | |
| "loss": 0.4541, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 42.92699392822083, | |
| "learning_rate": 0.0002643629456458211, | |
| "loss": 0.4284, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 17.612783953748007, | |
| "learning_rate": 0.0002641437755698422, | |
| "loss": 0.465, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 8.549512475388743, | |
| "learning_rate": 0.00026392460549386324, | |
| "loss": 0.409, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 12.657379379480137, | |
| "learning_rate": 0.00026370543541788424, | |
| "loss": 0.5745, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 19.533607385657547, | |
| "learning_rate": 0.0002634862653419053, | |
| "loss": 0.493, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 30.019312017902323, | |
| "learning_rate": 0.0002632670952659263, | |
| "loss": 0.486, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 15.192669371979543, | |
| "learning_rate": 0.00026304792518994736, | |
| "loss": 0.4214, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 32.465439529928304, | |
| "learning_rate": 0.0002628287551139684, | |
| "loss": 0.5712, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 36.91886247399315, | |
| "learning_rate": 0.00026260958503798943, | |
| "loss": 0.4334, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 23.910769075350032, | |
| "learning_rate": 0.0002623904149620105, | |
| "loss": 0.5462, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 67.93569743336582, | |
| "learning_rate": 0.00026217124488603155, | |
| "loss": 0.4493, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 2.532118414700022, | |
| "learning_rate": 0.00026195207481005255, | |
| "loss": 0.4534, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 29.266963293461014, | |
| "learning_rate": 0.0002617329047340736, | |
| "loss": 0.4529, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 48.97288077592708, | |
| "learning_rate": 0.00026153565166569257, | |
| "loss": 0.5841, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 20.00706276686128, | |
| "learning_rate": 0.00026131648158971357, | |
| "loss": 0.5449, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 52.88683636279855, | |
| "learning_rate": 0.00026109731151373463, | |
| "loss": 0.5451, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 19.058421611399435, | |
| "learning_rate": 0.0002608781414377557, | |
| "loss": 0.4933, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 12.662298018994834, | |
| "learning_rate": 0.0002606589713617767, | |
| "loss": 0.4933, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 45.53858660601269, | |
| "learning_rate": 0.00026043980128579776, | |
| "loss": 0.53, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 26.19222918219647, | |
| "learning_rate": 0.0002602206312098188, | |
| "loss": 0.4293, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 12.902630462226897, | |
| "learning_rate": 0.0002600014611338398, | |
| "loss": 0.4571, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 30.841311400161104, | |
| "learning_rate": 0.0002597822910578609, | |
| "loss": 0.3776, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 2.354115675571604, | |
| "learning_rate": 0.00025956312098188194, | |
| "loss": 0.3751, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 11.162047701554282, | |
| "learning_rate": 0.00025934395090590294, | |
| "loss": 0.2605, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 42.422050961854865, | |
| "learning_rate": 0.000259124780829924, | |
| "loss": 0.5152, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 38.9786626728249, | |
| "learning_rate": 0.000258905610753945, | |
| "loss": 0.3935, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 38.1773503077784, | |
| "learning_rate": 0.00025868644067796607, | |
| "loss": 0.5644, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 14.79430105184665, | |
| "learning_rate": 0.0002584672706019871, | |
| "loss": 0.5293, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 34.794933030659074, | |
| "learning_rate": 0.00025824810052600813, | |
| "loss": 0.5646, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 11.775866235902662, | |
| "learning_rate": 0.0002580289304500292, | |
| "loss": 0.4189, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 74.35669253529362, | |
| "learning_rate": 0.00025780976037405025, | |
| "loss": 0.6112, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 38.88263346213307, | |
| "learning_rate": 0.00025759059029807125, | |
| "loss": 0.4503, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 37.361077582565066, | |
| "learning_rate": 0.0002573714202220923, | |
| "loss": 0.4393, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.7092125809415342, | |
| "learning_rate": 0.00025715225014611337, | |
| "loss": 0.4243, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 57.33578620565298, | |
| "learning_rate": 0.0002569330800701344, | |
| "loss": 0.4961, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 24.953469848220585, | |
| "learning_rate": 0.00025671390999415543, | |
| "loss": 0.5143, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 69.87186379679335, | |
| "learning_rate": 0.0002564947399181765, | |
| "loss": 0.5994, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 9.423503958754821, | |
| "learning_rate": 0.0002562755698421975, | |
| "loss": 0.3865, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 35.63761411276129, | |
| "learning_rate": 0.00025605639976621856, | |
| "loss": 0.4244, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 7.718201160525164, | |
| "learning_rate": 0.0002558372296902396, | |
| "loss": 0.303, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 4.591243708143018, | |
| "learning_rate": 0.0002556180596142607, | |
| "loss": 0.5215, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 22.307015902915715, | |
| "learning_rate": 0.0002553988895382817, | |
| "loss": 0.4571, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 22.779301668637764, | |
| "learning_rate": 0.0002551797194623027, | |
| "loss": 0.4807, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 12.190202028042945, | |
| "learning_rate": 0.0002549605493863238, | |
| "loss": 0.3605, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 7.251641577848608, | |
| "learning_rate": 0.0002547413793103448, | |
| "loss": 0.6469, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.5118716411197952, | |
| "learning_rate": 0.00025452220923436586, | |
| "loss": 0.3641, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 3.4089258042614086, | |
| "learning_rate": 0.0002543030391583869, | |
| "loss": 0.6651, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 2.9557173441623856, | |
| "learning_rate": 0.00025408386908240793, | |
| "loss": 0.6309, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 29.838754199710966, | |
| "learning_rate": 0.000253864699006429, | |
| "loss": 0.4806, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 27.703367909949506, | |
| "learning_rate": 0.00025364552893045, | |
| "loss": 0.5419, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 30.019133853453674, | |
| "learning_rate": 0.00025342635885447105, | |
| "loss": 0.5585, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 34.77564734000214, | |
| "learning_rate": 0.0002532071887784921, | |
| "loss": 0.4483, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 47.783274628573835, | |
| "learning_rate": 0.0002529880187025131, | |
| "loss": 0.5153, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 26.23337473603945, | |
| "learning_rate": 0.0002527688486265342, | |
| "loss": 0.4699, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 29.216617715519472, | |
| "learning_rate": 0.00025254967855055523, | |
| "loss": 0.5015, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 62.76087495760497, | |
| "learning_rate": 0.00025233050847457624, | |
| "loss": 0.3711, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 45.88533063155937, | |
| "learning_rate": 0.0002521113383985973, | |
| "loss": 0.6509, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 20.44584239605378, | |
| "learning_rate": 0.00025189216832261836, | |
| "loss": 0.3635, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 16.861314315606865, | |
| "learning_rate": 0.00025167299824663936, | |
| "loss": 0.4719, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 28.52875293425469, | |
| "learning_rate": 0.0002514538281706604, | |
| "loss": 0.689, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 35.22222000713686, | |
| "learning_rate": 0.0002512346580946815, | |
| "loss": 0.6445, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 35.45687930499127, | |
| "learning_rate": 0.0002510154880187025, | |
| "loss": 0.4704, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 8.810315004780433, | |
| "learning_rate": 0.00025079631794272354, | |
| "loss": 0.5576, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 35.77216936747473, | |
| "learning_rate": 0.0002505771478667446, | |
| "loss": 0.3533, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 6.045353414942304, | |
| "learning_rate": 0.0002503579777907656, | |
| "loss": 0.4113, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 45.51398057288223, | |
| "learning_rate": 0.00025013880771478667, | |
| "loss": 0.4071, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 36.84264931235842, | |
| "learning_rate": 0.00024991963763880767, | |
| "loss": 0.6049, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 7.376752961081483, | |
| "learning_rate": 0.00024970046756282873, | |
| "loss": 0.4454, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 45.296630887337315, | |
| "learning_rate": 0.0002494812974868498, | |
| "loss": 0.4267, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 43.62273117063915, | |
| "learning_rate": 0.0002492621274108708, | |
| "loss": 0.4359, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 51.720566303652305, | |
| "learning_rate": 0.00024904295733489185, | |
| "loss": 0.4941, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 24.249504968694367, | |
| "learning_rate": 0.0002488237872589129, | |
| "loss": 0.345, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 14.634006430151054, | |
| "learning_rate": 0.0002486046171829339, | |
| "loss": 0.4586, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 11.217467398348905, | |
| "learning_rate": 0.000248385447106955, | |
| "loss": 0.4376, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 45.30561601980238, | |
| "learning_rate": 0.00024816627703097604, | |
| "loss": 0.3944, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 15.499098833410399, | |
| "learning_rate": 0.00024794710695499704, | |
| "loss": 0.4469, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 15.594634721523647, | |
| "learning_rate": 0.0002477279368790181, | |
| "loss": 0.4524, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 4.415769490237178, | |
| "learning_rate": 0.00024750876680303916, | |
| "loss": 0.5186, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 50.952551749899285, | |
| "learning_rate": 0.00024728959672706016, | |
| "loss": 0.3835, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 44.37013904045829, | |
| "learning_rate": 0.0002470704266510812, | |
| "loss": 0.3692, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 5.538761067578604, | |
| "learning_rate": 0.0002468512565751023, | |
| "loss": 0.4466, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 36.623608048249956, | |
| "learning_rate": 0.0002466320864991233, | |
| "loss": 0.4119, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 14.12931554239451, | |
| "learning_rate": 0.00024641291642314435, | |
| "loss": 0.4094, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 5.17846897516002, | |
| "learning_rate": 0.00024619374634716535, | |
| "loss": 0.6419, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 17.27825883448974, | |
| "learning_rate": 0.0002459745762711864, | |
| "loss": 0.4454, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 39.317445442045795, | |
| "learning_rate": 0.00024575540619520747, | |
| "loss": 0.4505, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 69.64730678545516, | |
| "learning_rate": 0.0002455362361192285, | |
| "loss": 0.4583, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 28.844952718130543, | |
| "learning_rate": 0.00024531706604324953, | |
| "loss": 0.6161, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 19.0210560197049, | |
| "learning_rate": 0.0002450978959672706, | |
| "loss": 0.4669, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 6.173405033604054, | |
| "learning_rate": 0.0002448787258912916, | |
| "loss": 0.4409, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 52.77800432968733, | |
| "learning_rate": 0.00024465955581531266, | |
| "loss": 0.4491, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 44.46853107524498, | |
| "learning_rate": 0.0002444403857393337, | |
| "loss": 0.3192, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 5.0160255873775546, | |
| "learning_rate": 0.0002442212156633547, | |
| "loss": 0.4516, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 23.9601395551555, | |
| "learning_rate": 0.00024400204558737578, | |
| "loss": 0.5556, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 29.267790594083895, | |
| "learning_rate": 0.00024378287551139684, | |
| "loss": 0.592, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 23.012129032542134, | |
| "learning_rate": 0.00024356370543541787, | |
| "loss": 0.3747, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 14.454095097216845, | |
| "learning_rate": 0.0002433445353594389, | |
| "loss": 0.384, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 26.97816432687431, | |
| "learning_rate": 0.00024312536528345993, | |
| "loss": 0.3297, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 9.801537735404679, | |
| "learning_rate": 0.000242906195207481, | |
| "loss": 0.5374, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 6.8951700119996, | |
| "learning_rate": 0.00024268702513150203, | |
| "loss": 0.5083, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 14.817012261931632, | |
| "learning_rate": 0.00024246785505552306, | |
| "loss": 0.5883, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 41.27878717469242, | |
| "learning_rate": 0.00024224868497954412, | |
| "loss": 0.3978, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 37.80902917455334, | |
| "learning_rate": 0.00024202951490356515, | |
| "loss": 0.5849, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 18.49777920142327, | |
| "learning_rate": 0.00024181034482758618, | |
| "loss": 0.4706, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 19.061143834111856, | |
| "learning_rate": 0.00024159117475160724, | |
| "loss": 0.4619, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 22.2453112157315, | |
| "learning_rate": 0.00024137200467562827, | |
| "loss": 0.3842, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 50.79421799621206, | |
| "learning_rate": 0.0002411528345996493, | |
| "loss": 0.3655, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 32.50988571681092, | |
| "learning_rate": 0.00024093366452367034, | |
| "loss": 0.5652, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 3.5054157276555955, | |
| "learning_rate": 0.0002407144944476914, | |
| "loss": 0.6236, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 34.425678272389796, | |
| "learning_rate": 0.00024049532437171243, | |
| "loss": 0.4448, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 27.486059433114335, | |
| "learning_rate": 0.00024027615429573346, | |
| "loss": 0.4669, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 19.22828927795651, | |
| "learning_rate": 0.00024005698421975452, | |
| "loss": 0.4644, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 21.0363603349132, | |
| "learning_rate": 0.00023983781414377555, | |
| "loss": 0.431, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 3.5723838037716984, | |
| "learning_rate": 0.00023961864406779658, | |
| "loss": 0.5383, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 13.546990268361519, | |
| "learning_rate": 0.00023939947399181761, | |
| "loss": 0.3201, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 24.38084566112433, | |
| "learning_rate": 0.00023918030391583867, | |
| "loss": 0.3129, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 25.645694818995487, | |
| "learning_rate": 0.0002389611338398597, | |
| "loss": 0.5026, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 66.5222383607227, | |
| "learning_rate": 0.00023874196376388074, | |
| "loss": 0.5204, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 24.167963861089163, | |
| "learning_rate": 0.0002385227936879018, | |
| "loss": 0.2994, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 2.9923273025019665, | |
| "learning_rate": 0.00023830362361192283, | |
| "loss": 0.3327, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 7.036402375923249, | |
| "learning_rate": 0.00023808445353594386, | |
| "loss": 0.4824, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 4.932257241329014, | |
| "learning_rate": 0.00023786528345996492, | |
| "loss": 0.3872, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 8.597901439972688, | |
| "learning_rate": 0.00023764611338398595, | |
| "loss": 0.3997, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 11.315382409369429, | |
| "learning_rate": 0.00023742694330800698, | |
| "loss": 0.5147, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 14.52575221014949, | |
| "learning_rate": 0.00023720777323202802, | |
| "loss": 0.3314, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 43.682402501837984, | |
| "learning_rate": 0.00023698860315604907, | |
| "loss": 0.4339, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 53.44210886125679, | |
| "learning_rate": 0.0002367694330800701, | |
| "loss": 0.4009, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 13.63857046515992, | |
| "learning_rate": 0.00023655026300409114, | |
| "loss": 0.4517, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 18.188881596196584, | |
| "learning_rate": 0.0002363310929281122, | |
| "loss": 0.7472, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 15.622454098714014, | |
| "learning_rate": 0.00023611192285213323, | |
| "loss": 0.4332, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 23.030560101084287, | |
| "learning_rate": 0.00023589275277615426, | |
| "loss": 0.4659, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 11.217028241840945, | |
| "learning_rate": 0.0002356735827001753, | |
| "loss": 0.4288, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 20.08432954445042, | |
| "learning_rate": 0.00023545441262419635, | |
| "loss": 0.277, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 74.65434882424645, | |
| "learning_rate": 0.00023523524254821738, | |
| "loss": 0.635, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 40.495503288549955, | |
| "learning_rate": 0.00023501607247223842, | |
| "loss": 0.388, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 12.752161600605927, | |
| "learning_rate": 0.0002347969023962595, | |
| "loss": 0.5092, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 14.53893507733024, | |
| "learning_rate": 0.0002345777323202805, | |
| "loss": 0.4765, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 4.316017393789724, | |
| "learning_rate": 0.00023435856224430154, | |
| "loss": 0.3659, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 30.899825163472197, | |
| "learning_rate": 0.00023413939216832257, | |
| "loss": 0.4516, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 39.99478400763079, | |
| "learning_rate": 0.00023392022209234366, | |
| "loss": 0.381, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_0_f1": 0.6956680014561338, | |
| "eval_0_precision": 0.607631160572337, | |
| "eval_0_recall": 0.8135376756066411, | |
| "eval_1_f1": 0.8688421713209915, | |
| "eval_1_precision": 0.9267068273092369, | |
| "eval_1_recall": 0.8177790903721205, | |
| "eval_accuracy": 0.8166867668018858, | |
| "eval_loss": 0.3857421875, | |
| "eval_runtime": 546.7196, | |
| "eval_samples_per_second": 16.683, | |
| "eval_steps_per_second": 2.782, | |
| "step": 3547 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 5.0389158685922375, | |
| "learning_rate": 0.00023370105201636466, | |
| "loss": 0.4146, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 11.9478624203234, | |
| "learning_rate": 0.0002334818819403857, | |
| "loss": 0.219, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 2.574561125344869, | |
| "learning_rate": 0.00023326271186440678, | |
| "loss": 0.1326, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 6.645296190271201, | |
| "learning_rate": 0.0002330435417884278, | |
| "loss": 0.3714, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 21.165312017263854, | |
| "learning_rate": 0.00023282437171244884, | |
| "loss": 0.186, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 33.09937520843868, | |
| "learning_rate": 0.0002326052016364699, | |
| "loss": 0.189, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 28.917213358449875, | |
| "learning_rate": 0.00023238603156049094, | |
| "loss": 0.2064, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 6.362982741180963, | |
| "learning_rate": 0.00023216686148451197, | |
| "loss": 0.3743, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 15.278355603224561, | |
| "learning_rate": 0.000231947691408533, | |
| "loss": 0.339, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 19.75892836736991, | |
| "learning_rate": 0.00023172852133255406, | |
| "loss": 0.3234, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 27.481948501720865, | |
| "learning_rate": 0.0002315093512565751, | |
| "loss": 0.3744, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 4.553602693616155, | |
| "learning_rate": 0.00023129018118059612, | |
| "loss": 0.292, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 5.428732611717056, | |
| "learning_rate": 0.00023107101110461718, | |
| "loss": 0.2631, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 5.050323966343426, | |
| "learning_rate": 0.00023085184102863821, | |
| "loss": 0.4004, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 24.80793600752628, | |
| "learning_rate": 0.00023063267095265925, | |
| "loss": 0.5381, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 18.258408123657016, | |
| "learning_rate": 0.00023041350087668028, | |
| "loss": 0.3396, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 29.325589345887487, | |
| "learning_rate": 0.00023019433080070134, | |
| "loss": 0.3759, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 5.83589085044179, | |
| "learning_rate": 0.00022997516072472237, | |
| "loss": 0.3515, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 31.709214076980373, | |
| "learning_rate": 0.0002297559906487434, | |
| "loss": 0.3038, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 5.331144297018398, | |
| "learning_rate": 0.00022953682057276446, | |
| "loss": 0.2608, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 10.008440999378909, | |
| "learning_rate": 0.0002293176504967855, | |
| "loss": 0.392, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 24.06127490696333, | |
| "learning_rate": 0.00022909848042080652, | |
| "loss": 0.464, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 36.47161723251647, | |
| "learning_rate": 0.00022887931034482758, | |
| "loss": 0.3144, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 44.318502303445214, | |
| "learning_rate": 0.00022866014026884862, | |
| "loss": 0.2178, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 21.011798626587794, | |
| "learning_rate": 0.00022844097019286965, | |
| "loss": 0.2655, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 7.015038983544465, | |
| "learning_rate": 0.00022822180011689068, | |
| "loss": 0.33, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 9.837625310344656, | |
| "learning_rate": 0.00022800263004091174, | |
| "loss": 0.3804, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 3.5246300872909493, | |
| "learning_rate": 0.00022778345996493277, | |
| "loss": 0.3144, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 7.333333429866329, | |
| "learning_rate": 0.0002275642898889538, | |
| "loss": 0.208, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 2.9276220226615655, | |
| "learning_rate": 0.00022734511981297486, | |
| "loss": 0.1965, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 3.296027114122367, | |
| "learning_rate": 0.0002271259497369959, | |
| "loss": 0.1641, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 63.98473163919884, | |
| "learning_rate": 0.00022690677966101693, | |
| "loss": 0.2204, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 25.79236796313587, | |
| "learning_rate": 0.00022668760958503796, | |
| "loss": 0.3985, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 47.19895401753889, | |
| "learning_rate": 0.00022646843950905902, | |
| "loss": 0.3724, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 11.740165468615665, | |
| "learning_rate": 0.00022624926943308005, | |
| "loss": 0.3228, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.7482155895929372, | |
| "learning_rate": 0.00022603009935710108, | |
| "loss": 0.3069, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 31.032450056833344, | |
| "learning_rate": 0.00022581092928112214, | |
| "loss": 0.2573, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 11.15417567821661, | |
| "learning_rate": 0.00022559175920514317, | |
| "loss": 0.2432, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 9.77526286134104, | |
| "learning_rate": 0.0002253725891291642, | |
| "loss": 0.202, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 43.631843372398045, | |
| "learning_rate": 0.00022515341905318524, | |
| "loss": 0.3633, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 24.908344949793975, | |
| "learning_rate": 0.0002249342489772063, | |
| "loss": 0.2629, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.9235099739993984, | |
| "learning_rate": 0.00022471507890122733, | |
| "loss": 0.312, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 8.901642976423531, | |
| "learning_rate": 0.00022449590882524836, | |
| "loss": 0.2134, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 35.49143975289104, | |
| "learning_rate": 0.00022427673874926942, | |
| "loss": 0.3581, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 9.880151735005258, | |
| "learning_rate": 0.00022405756867329045, | |
| "loss": 0.2105, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 11.472473033640894, | |
| "learning_rate": 0.00022383839859731148, | |
| "loss": 0.2771, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 7.564996853716761, | |
| "learning_rate": 0.00022361922852133254, | |
| "loss": 0.1729, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.6868590263390821, | |
| "learning_rate": 0.00022340005844535357, | |
| "loss": 0.4633, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 28.229059784881606, | |
| "learning_rate": 0.0002231808883693746, | |
| "loss": 0.5021, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 17.572412568669705, | |
| "learning_rate": 0.00022296171829339564, | |
| "loss": 0.2666, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 11.720230148855956, | |
| "learning_rate": 0.0002227425482174167, | |
| "loss": 0.3706, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 57.76450102261794, | |
| "learning_rate": 0.00022252337814143773, | |
| "loss": 0.3514, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 8.937408336756231, | |
| "learning_rate": 0.00022230420806545876, | |
| "loss": 0.3542, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 40.358482259032456, | |
| "learning_rate": 0.00022208503798947982, | |
| "loss": 0.3049, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 22.567330463151773, | |
| "learning_rate": 0.00022186586791350085, | |
| "loss": 0.2778, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 27.666117112861954, | |
| "learning_rate": 0.00022164669783752188, | |
| "loss": 0.2204, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 34.568905094183656, | |
| "learning_rate": 0.00022142752776154292, | |
| "loss": 0.4829, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 37.20888182696178, | |
| "learning_rate": 0.00022120835768556397, | |
| "loss": 0.3751, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 2.628119016749957, | |
| "learning_rate": 0.000220989187609585, | |
| "loss": 0.2531, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 12.787898954063717, | |
| "learning_rate": 0.00022077001753360604, | |
| "loss": 0.2568, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 13.740422198890055, | |
| "learning_rate": 0.0002205508474576271, | |
| "loss": 0.1521, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 2.754528026569796, | |
| "learning_rate": 0.00022033167738164813, | |
| "loss": 0.3939, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 17.13803914567701, | |
| "learning_rate": 0.00022011250730566916, | |
| "loss": 0.2548, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 8.755380723031136, | |
| "learning_rate": 0.00021989333722969025, | |
| "loss": 0.22, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 11.327279230065335, | |
| "learning_rate": 0.00021967416715371128, | |
| "loss": 0.1881, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 11.76478464724261, | |
| "learning_rate": 0.00021945499707773229, | |
| "loss": 0.4543, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 21.089139286448926, | |
| "learning_rate": 0.00021923582700175332, | |
| "loss": 0.412, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 4.35685071056218, | |
| "learning_rate": 0.0002190166569257744, | |
| "loss": 0.2452, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 16.567825119225937, | |
| "learning_rate": 0.00021879748684979544, | |
| "loss": 0.2714, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 51.4802446925637, | |
| "learning_rate": 0.00021857831677381644, | |
| "loss": 0.3585, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 50.02592512359097, | |
| "learning_rate": 0.00021835914669783753, | |
| "loss": 0.4534, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 6.384047108249605, | |
| "learning_rate": 0.00021813997662185856, | |
| "loss": 0.3206, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 9.283525914668507, | |
| "learning_rate": 0.0002179208065458796, | |
| "loss": 0.3393, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 45.654861594884274, | |
| "learning_rate": 0.0002177016364699006, | |
| "loss": 0.335, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 3.5003233393976263, | |
| "learning_rate": 0.00021748246639392168, | |
| "loss": 0.3477, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 14.979317988693346, | |
| "learning_rate": 0.00021726329631794271, | |
| "loss": 0.2205, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 9.173367064164506, | |
| "learning_rate": 0.00021704412624196375, | |
| "loss": 0.4878, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 13.812041376988086, | |
| "learning_rate": 0.0002168249561659848, | |
| "loss": 0.3115, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 5.145857101646077, | |
| "learning_rate": 0.0002166277030976037, | |
| "loss": 0.1912, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.5222263707561177, | |
| "learning_rate": 0.00021640853302162474, | |
| "loss": 0.2155, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 20.94214337532918, | |
| "learning_rate": 0.00021618936294564583, | |
| "loss": 0.1663, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 46.6321807054184, | |
| "learning_rate": 0.00021597019286966686, | |
| "loss": 0.2966, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 4.374585543579176, | |
| "learning_rate": 0.00021575102279368786, | |
| "loss": 0.3326, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 5.550072623324231, | |
| "learning_rate": 0.0002155318527177089, | |
| "loss": 0.1478, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.6613601079645897, | |
| "learning_rate": 0.00021531268264172998, | |
| "loss": 0.2295, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 3.000033161148002, | |
| "learning_rate": 0.00021509351256575101, | |
| "loss": 0.5028, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 38.67630998438632, | |
| "learning_rate": 0.00021487434248977205, | |
| "loss": 0.304, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 31.0474775238587, | |
| "learning_rate": 0.0002146551724137931, | |
| "loss": 0.3397, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 21.038657972290533, | |
| "learning_rate": 0.00021443600233781414, | |
| "loss": 0.2639, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 29.03727828295053, | |
| "learning_rate": 0.00021421683226183517, | |
| "loss": 0.269, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 4.822263791040293, | |
| "learning_rate": 0.0002139976621858562, | |
| "loss": 0.2482, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 8.31426730608056, | |
| "learning_rate": 0.00021377849210987726, | |
| "loss": 0.2249, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 43.20888606905504, | |
| "learning_rate": 0.0002135593220338983, | |
| "loss": 0.3654, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 2.446800168689743, | |
| "learning_rate": 0.00021334015195791932, | |
| "loss": 0.3182, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 7.6988347063951466, | |
| "learning_rate": 0.00021312098188194038, | |
| "loss": 0.3373, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 31.935307455130687, | |
| "learning_rate": 0.00021290181180596142, | |
| "loss": 0.3036, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 20.726406179500756, | |
| "learning_rate": 0.00021268264172998245, | |
| "loss": 0.3627, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 7.097138033352009, | |
| "learning_rate": 0.0002124634716540035, | |
| "loss": 0.2623, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 3.8873072142785183, | |
| "learning_rate": 0.00021224430157802454, | |
| "loss": 0.2236, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 14.527770370800262, | |
| "learning_rate": 0.00021202513150204557, | |
| "loss": 0.137, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 7.558415335390357, | |
| "learning_rate": 0.0002118059614260666, | |
| "loss": 0.217, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 29.89529000773257, | |
| "learning_rate": 0.00021158679135008766, | |
| "loss": 0.251, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 52.64728318565253, | |
| "learning_rate": 0.0002113676212741087, | |
| "loss": 0.3333, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 30.152792328177426, | |
| "learning_rate": 0.00021114845119812973, | |
| "loss": 0.2724, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 2.5341334779748967, | |
| "learning_rate": 0.00021092928112215079, | |
| "loss": 0.269, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 22.55021824043886, | |
| "learning_rate": 0.00021071011104617182, | |
| "loss": 0.0765, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 20.277146842395847, | |
| "learning_rate": 0.00021049094097019285, | |
| "loss": 0.2189, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 31.332786897175072, | |
| "learning_rate": 0.00021027177089421388, | |
| "loss": 0.4537, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 4.0878545777289395, | |
| "learning_rate": 0.00021005260081823494, | |
| "loss": 0.2703, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 26.911824724655006, | |
| "learning_rate": 0.00020983343074225597, | |
| "loss": 0.2727, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 2.5043488479928016, | |
| "learning_rate": 0.000209614260666277, | |
| "loss": 0.1888, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 38.26894575743672, | |
| "learning_rate": 0.00020939509059029806, | |
| "loss": 0.334, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 3.073803318378424, | |
| "learning_rate": 0.0002091759205143191, | |
| "loss": 0.3018, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 50.99829894564531, | |
| "learning_rate": 0.00020895675043834013, | |
| "loss": 0.309, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 33.03113078011947, | |
| "learning_rate": 0.0002087375803623612, | |
| "loss": 0.3267, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 49.72639701138489, | |
| "learning_rate": 0.00020851841028638222, | |
| "loss": 0.3428, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 15.250514847558827, | |
| "learning_rate": 0.00020829924021040325, | |
| "loss": 0.2442, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 29.630372731951933, | |
| "learning_rate": 0.00020808007013442428, | |
| "loss": 0.4006, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 2.6025635165269834, | |
| "learning_rate": 0.00020786090005844534, | |
| "loss": 0.1417, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 46.58385970647689, | |
| "learning_rate": 0.00020764172998246637, | |
| "loss": 0.3178, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 33.748707418723626, | |
| "learning_rate": 0.0002074225599064874, | |
| "loss": 0.4147, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 8.516868922828099, | |
| "learning_rate": 0.00020720338983050846, | |
| "loss": 0.2285, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 56.676130895938115, | |
| "learning_rate": 0.0002069842197545295, | |
| "loss": 0.3276, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 57.88752849086389, | |
| "learning_rate": 0.00020676504967855053, | |
| "loss": 0.4268, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.2518432859009083, | |
| "learning_rate": 0.00020654587960257156, | |
| "loss": 0.2271, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 8.476321740016418, | |
| "learning_rate": 0.00020632670952659262, | |
| "loss": 0.2991, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 5.939165676183437, | |
| "learning_rate": 0.00020610753945061365, | |
| "loss": 0.2207, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 6.721301020536598, | |
| "learning_rate": 0.00020588836937463468, | |
| "loss": 0.1735, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 3.2082370683760044, | |
| "learning_rate": 0.00020566919929865574, | |
| "loss": 0.4545, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 29.422853677429167, | |
| "learning_rate": 0.00020545002922267678, | |
| "loss": 0.4142, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 5.299481055699104, | |
| "learning_rate": 0.0002052308591466978, | |
| "loss": 0.3986, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 22.75931248540711, | |
| "learning_rate": 0.00020501168907071887, | |
| "loss": 0.4092, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 6.466191538331885, | |
| "learning_rate": 0.0002047925189947399, | |
| "loss": 0.1388, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 8.95128170273167, | |
| "learning_rate": 0.00020457334891876093, | |
| "loss": 0.4025, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 30.917184250084812, | |
| "learning_rate": 0.00020435417884278196, | |
| "loss": 0.3814, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 20.91623684082827, | |
| "learning_rate": 0.00020413500876680302, | |
| "loss": 0.3143, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 28.994187069448067, | |
| "learning_rate": 0.00020391583869082405, | |
| "loss": 0.2026, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 42.11122790207425, | |
| "learning_rate": 0.00020369666861484509, | |
| "loss": 0.3098, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 36.159828304877706, | |
| "learning_rate": 0.00020347749853886614, | |
| "loss": 0.2509, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 20.23189475386178, | |
| "learning_rate": 0.00020325832846288718, | |
| "loss": 0.3435, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 9.446387490878042, | |
| "learning_rate": 0.0002030391583869082, | |
| "loss": 0.3598, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 31.356335012679175, | |
| "learning_rate": 0.00020281998831092924, | |
| "loss": 0.3427, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 25.111026271060013, | |
| "learning_rate": 0.0002026008182349503, | |
| "loss": 0.2731, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 10.288548511014797, | |
| "learning_rate": 0.00020238164815897133, | |
| "loss": 0.2741, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 18.078048168470804, | |
| "learning_rate": 0.00020216247808299236, | |
| "loss": 0.1965, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 2.5818740774616953, | |
| "learning_rate": 0.00020194330800701345, | |
| "loss": 0.423, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 5.029346814046525, | |
| "learning_rate": 0.00020172413793103448, | |
| "loss": 0.1981, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 3.69515764136778, | |
| "learning_rate": 0.0002015049678550555, | |
| "loss": 0.3407, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.1185073469260922, | |
| "learning_rate": 0.00020128579777907652, | |
| "loss": 0.3759, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 25.82929883275312, | |
| "learning_rate": 0.0002010666277030976, | |
| "loss": 0.3386, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 5.658058990862141, | |
| "learning_rate": 0.00020084745762711864, | |
| "loss": 0.3067, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 33.789807219043574, | |
| "learning_rate": 0.00020062828755113964, | |
| "loss": 0.3116, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 29.665346856502566, | |
| "learning_rate": 0.00020040911747516073, | |
| "loss": 0.4307, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 2.953173260130455, | |
| "learning_rate": 0.00020018994739918176, | |
| "loss": 0.2401, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 59.25975112660892, | |
| "learning_rate": 0.0001999707773232028, | |
| "loss": 0.3712, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 16.61695534436786, | |
| "learning_rate": 0.00019975160724722385, | |
| "loss": 0.2978, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 27.59304302639443, | |
| "learning_rate": 0.00019953243717124488, | |
| "loss": 0.2207, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 33.3996635406056, | |
| "learning_rate": 0.00019931326709526592, | |
| "loss": 0.4006, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 7.909724279076434, | |
| "learning_rate": 0.00019909409701928695, | |
| "loss": 0.3282, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 10.96614010093427, | |
| "learning_rate": 0.000198874926943308, | |
| "loss": 0.3641, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 4.747888769171669, | |
| "learning_rate": 0.00019865575686732904, | |
| "loss": 0.2596, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 17.19969500744997, | |
| "learning_rate": 0.00019843658679135007, | |
| "loss": 0.2359, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 12.445792367631444, | |
| "learning_rate": 0.00019821741671537113, | |
| "loss": 0.291, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 16.38158373051574, | |
| "learning_rate": 0.00019799824663939216, | |
| "loss": 0.2152, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 19.55308310137066, | |
| "learning_rate": 0.0001977790765634132, | |
| "loss": 0.245, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 4.151920692809044, | |
| "learning_rate": 0.00019755990648743423, | |
| "loss": 0.263, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 23.195604876425826, | |
| "learning_rate": 0.00019734073641145528, | |
| "loss": 0.3534, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 21.589521451918323, | |
| "learning_rate": 0.00019712156633547632, | |
| "loss": 0.1662, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 11.887752916702121, | |
| "learning_rate": 0.00019690239625949735, | |
| "loss": 0.2105, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 5.176318946475123, | |
| "learning_rate": 0.0001966832261835184, | |
| "loss": 0.2695, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 11.690634456574996, | |
| "learning_rate": 0.00019646405610753944, | |
| "loss": 0.3302, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 62.85698360348048, | |
| "learning_rate": 0.00019624488603156047, | |
| "loss": 0.5215, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 13.466880633693304, | |
| "learning_rate": 0.00019602571595558153, | |
| "loss": 0.2985, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.965805579763693, | |
| "learning_rate": 0.00019580654587960256, | |
| "loss": 0.2372, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 12.499426672870545, | |
| "learning_rate": 0.0001955873758036236, | |
| "loss": 0.1587, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 15.307208098887806, | |
| "learning_rate": 0.00019536820572764463, | |
| "loss": 0.207, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 14.010835079736815, | |
| "learning_rate": 0.00019514903565166569, | |
| "loss": 0.1688, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 7.104347674810669, | |
| "learning_rate": 0.00019492986557568672, | |
| "loss": 0.1888, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.5642209100774738, | |
| "learning_rate": 0.00019473261250730565, | |
| "loss": 0.2465, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 32.89294792453513, | |
| "learning_rate": 0.0001945134424313267, | |
| "loss": 0.4685, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 16.400791636569856, | |
| "learning_rate": 0.00019429427235534774, | |
| "loss": 0.2564, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 6.986562239990336, | |
| "learning_rate": 0.00019407510227936877, | |
| "loss": 0.2917, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 11.557550672896545, | |
| "learning_rate": 0.00019385593220338983, | |
| "loss": 0.3427, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 2.1712768411403927, | |
| "learning_rate": 0.00019363676212741086, | |
| "loss": 0.2312, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 3.4188406927450874, | |
| "learning_rate": 0.0001934175920514319, | |
| "loss": 0.2822, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 18.41553181032857, | |
| "learning_rate": 0.00019319842197545293, | |
| "loss": 0.3085, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 8.925374304814394, | |
| "learning_rate": 0.000192979251899474, | |
| "loss": 0.3014, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 2.21057714781379, | |
| "learning_rate": 0.00019276008182349502, | |
| "loss": 0.2019, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 7.995029854183915, | |
| "learning_rate": 0.00019254091174751605, | |
| "loss": 0.3143, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 24.818437530389712, | |
| "learning_rate": 0.0001923217416715371, | |
| "loss": 0.2312, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 4.630710193156958, | |
| "learning_rate": 0.00019210257159555814, | |
| "loss": 0.2946, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 18.89029942903239, | |
| "learning_rate": 0.00019188340151957917, | |
| "loss": 0.2129, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 21.18937099773853, | |
| "learning_rate": 0.0001916642314436002, | |
| "loss": 0.3114, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 44.57514569240804, | |
| "learning_rate": 0.00019144506136762127, | |
| "loss": 0.4142, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 10.696173605726317, | |
| "learning_rate": 0.0001912258912916423, | |
| "loss": 0.1754, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 40.73458136589298, | |
| "learning_rate": 0.00019100672121566333, | |
| "loss": 0.2926, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 39.31886057121872, | |
| "learning_rate": 0.0001907875511396844, | |
| "loss": 0.266, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.9842826591961664, | |
| "learning_rate": 0.00019056838106370542, | |
| "loss": 0.2224, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 9.331926869782274, | |
| "learning_rate": 0.00019034921098772645, | |
| "loss": 0.1006, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.42527725447521897, | |
| "learning_rate": 0.00019013004091174748, | |
| "loss": 0.2903, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.2444076494386254, | |
| "learning_rate": 0.00018991087083576854, | |
| "loss": 0.1155, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 37.94628287997447, | |
| "learning_rate": 0.00018969170075978958, | |
| "loss": 0.4778, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 42.20359422033777, | |
| "learning_rate": 0.0001894725306838106, | |
| "loss": 0.268, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 5.954129998100243, | |
| "learning_rate": 0.00018925336060783167, | |
| "loss": 0.2858, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 12.545115176449832, | |
| "learning_rate": 0.0001890341905318527, | |
| "loss": 0.4231, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 4.6634905746414965, | |
| "learning_rate": 0.00018881502045587373, | |
| "loss": 0.2802, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 25.89963576909228, | |
| "learning_rate": 0.0001885958503798948, | |
| "loss": 0.3474, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 8.132022049717532, | |
| "learning_rate": 0.00018837668030391582, | |
| "loss": 0.2969, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 9.15480977443254, | |
| "learning_rate": 0.00018815751022793685, | |
| "loss": 0.2564, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 8.525165992903926, | |
| "learning_rate": 0.00018793834015195789, | |
| "loss": 0.3067, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 32.8997919695663, | |
| "learning_rate": 0.00018771917007597894, | |
| "loss": 0.3231, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 13.880130230974345, | |
| "learning_rate": 0.00018749999999999998, | |
| "loss": 0.233, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 5.059925820076422, | |
| "learning_rate": 0.000187280829924021, | |
| "loss": 0.274, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 33.92209798070997, | |
| "learning_rate": 0.00018706165984804207, | |
| "loss": 0.1971, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 11.486919782540255, | |
| "learning_rate": 0.0001868424897720631, | |
| "loss": 0.5099, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.118181876078142, | |
| "learning_rate": 0.00018662331969608413, | |
| "loss": 0.1531, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 18.07168190463933, | |
| "learning_rate": 0.00018640414962010516, | |
| "loss": 0.4196, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 8.865176806627508, | |
| "learning_rate": 0.00018618497954412622, | |
| "loss": 0.2771, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 35.74999402744737, | |
| "learning_rate": 0.00018596580946814726, | |
| "loss": 0.3592, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 14.709743095476123, | |
| "learning_rate": 0.0001857466393921683, | |
| "loss": 0.2372, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 8.547571860162597, | |
| "learning_rate": 0.00018552746931618935, | |
| "loss": 0.1769, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 29.188932438810735, | |
| "learning_rate": 0.00018530829924021038, | |
| "loss": 0.1595, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 9.72627004103299, | |
| "learning_rate": 0.0001850891291642314, | |
| "loss": 0.267, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 17.585505956653314, | |
| "learning_rate": 0.0001848699590882525, | |
| "loss": 0.2313, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 11.872045841554243, | |
| "learning_rate": 0.0001846507890122735, | |
| "loss": 0.2371, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 17.194223427069954, | |
| "learning_rate": 0.00018443161893629453, | |
| "loss": 0.3299, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 11.83536538733706, | |
| "learning_rate": 0.00018421244886031557, | |
| "loss": 0.1985, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 3.4341999458410153, | |
| "learning_rate": 0.00018399327878433665, | |
| "loss": 0.2886, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 4.533321919018761, | |
| "learning_rate": 0.00018377410870835768, | |
| "loss": 0.2209, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 15.786307762774168, | |
| "learning_rate": 0.0001835549386323787, | |
| "loss": 0.1963, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 4.999680638451794, | |
| "learning_rate": 0.00018333576855639977, | |
| "loss": 0.3231, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 7.345856055836037, | |
| "learning_rate": 0.0001831165984804208, | |
| "loss": 0.3497, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 24.05112698349709, | |
| "learning_rate": 0.00018289742840444184, | |
| "loss": 0.1703, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 5.14777448497475, | |
| "learning_rate": 0.00018267825832846284, | |
| "loss": 0.2877, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 22.321301335050478, | |
| "learning_rate": 0.00018245908825248393, | |
| "loss": 0.3838, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 13.553653717813507, | |
| "learning_rate": 0.00018223991817650496, | |
| "loss": 0.2386, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 8.36423228043391, | |
| "learning_rate": 0.000182020748100526, | |
| "loss": 0.3274, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 32.214527042072234, | |
| "learning_rate": 0.00018180157802454705, | |
| "loss": 0.2649, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 7.310510867463555, | |
| "learning_rate": 0.00018158240794856808, | |
| "loss": 0.3419, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 26.843414736420463, | |
| "learning_rate": 0.00018136323787258912, | |
| "loss": 0.3201, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 17.750295021021458, | |
| "learning_rate": 0.00018114406779661015, | |
| "loss": 0.2248, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.8761970753193081, | |
| "learning_rate": 0.0001809248977206312, | |
| "loss": 0.1681, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 8.023645949418535, | |
| "learning_rate": 0.00018070572764465224, | |
| "loss": 0.3362, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 21.33676597686644, | |
| "learning_rate": 0.00018048655756867327, | |
| "loss": 0.1318, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 17.955586403704974, | |
| "learning_rate": 0.00018026738749269433, | |
| "loss": 0.3109, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 37.44497964171056, | |
| "learning_rate": 0.00018004821741671536, | |
| "loss": 0.224, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 21.77593890358771, | |
| "learning_rate": 0.0001798290473407364, | |
| "loss": 0.3034, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.7352368145267243, | |
| "learning_rate": 0.00017960987726475745, | |
| "loss": 0.1274, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 21.416372952522156, | |
| "learning_rate": 0.00017939070718877849, | |
| "loss": 0.2799, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 6.43140669106781, | |
| "learning_rate": 0.00017917153711279952, | |
| "loss": 0.3944, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 21.589922507704756, | |
| "learning_rate": 0.00017895236703682055, | |
| "loss": 0.4062, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 11.40138578000891, | |
| "learning_rate": 0.0001787331969608416, | |
| "loss": 0.244, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 17.091859469856562, | |
| "learning_rate": 0.00017851402688486264, | |
| "loss": 0.2566, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 11.99466531174607, | |
| "learning_rate": 0.00017829485680888367, | |
| "loss": 0.2817, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 20.51599983452521, | |
| "learning_rate": 0.00017807568673290473, | |
| "loss": 0.2728, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 43.54037932343941, | |
| "learning_rate": 0.00017785651665692576, | |
| "loss": 0.3911, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 14.595613560758464, | |
| "learning_rate": 0.0001776373465809468, | |
| "loss": 0.2645, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 24.91815308399374, | |
| "learning_rate": 0.00017741817650496783, | |
| "loss": 0.2268, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 9.720913254909993, | |
| "learning_rate": 0.0001771990064289889, | |
| "loss": 0.3635, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 29.77336977828609, | |
| "learning_rate": 0.00017697983635300992, | |
| "loss": 0.2624, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 35.22870133705211, | |
| "learning_rate": 0.00017676066627703095, | |
| "loss": 0.3233, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.3492806118043075, | |
| "learning_rate": 0.000176541496201052, | |
| "loss": 0.2593, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 12.854084741350912, | |
| "learning_rate": 0.00017632232612507304, | |
| "loss": 0.288, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 26.51581773086576, | |
| "learning_rate": 0.00017610315604909407, | |
| "loss": 0.4244, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 8.677592676470999, | |
| "learning_rate": 0.00017588398597311513, | |
| "loss": 0.2529, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 49.69265782318678, | |
| "learning_rate": 0.00017566481589713617, | |
| "loss": 0.2555, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 9.137310587778737, | |
| "learning_rate": 0.0001754456458211572, | |
| "loss": 0.2237, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.5627703512715154, | |
| "learning_rate": 0.00017522647574517823, | |
| "loss": 0.2253, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.3427155017491879, | |
| "learning_rate": 0.0001750073056691993, | |
| "loss": 0.4267, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 32.026763021399056, | |
| "learning_rate": 0.00017478813559322032, | |
| "loss": 0.2694, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.8952720705907915, | |
| "learning_rate": 0.00017456896551724135, | |
| "loss": 0.2647, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 8.293829114055725, | |
| "learning_rate": 0.0001743497954412624, | |
| "loss": 0.3184, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 21.630558018188975, | |
| "learning_rate": 0.00017413062536528344, | |
| "loss": 0.2528, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 11.263065393245803, | |
| "learning_rate": 0.00017391145528930448, | |
| "loss": 0.3655, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 0.5383693893588692, | |
| "learning_rate": 0.0001736922852133255, | |
| "loss": 0.2871, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 8.222238464796305, | |
| "learning_rate": 0.00017347311513734657, | |
| "loss": 0.2595, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 17.415154838487343, | |
| "learning_rate": 0.0001732539450613676, | |
| "loss": 0.3208, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 2.384813430472559, | |
| "learning_rate": 0.00017303477498538863, | |
| "loss": 0.1697, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 3.430324248430251, | |
| "learning_rate": 0.0001728156049094097, | |
| "loss": 0.3111, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 29.695298798800554, | |
| "learning_rate": 0.00017259643483343072, | |
| "loss": 0.2624, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 3.4530119939476016, | |
| "learning_rate": 0.00017239918176504965, | |
| "loss": 0.5039, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 14.522683359889644, | |
| "learning_rate": 0.0001721800116890707, | |
| "loss": 0.2896, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 11.419748641445885, | |
| "learning_rate": 0.00017196084161309175, | |
| "loss": 0.2413, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 12.845162868376475, | |
| "learning_rate": 0.00017174167153711278, | |
| "loss": 0.2085, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 50.97954814308535, | |
| "learning_rate": 0.0001715225014611338, | |
| "loss": 0.4096, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 3.776354286239877, | |
| "learning_rate": 0.00017130333138515487, | |
| "loss": 0.2438, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 3.71168428040738, | |
| "learning_rate": 0.0001710841613091759, | |
| "loss": 0.2612, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 32.131330999922675, | |
| "learning_rate": 0.00017086499123319693, | |
| "loss": 0.4542, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 6.736921728341746, | |
| "learning_rate": 0.000170645821157218, | |
| "loss": 0.2303, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 15.468427433397974, | |
| "learning_rate": 0.00017042665108123902, | |
| "loss": 0.3181, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 64.72524937993848, | |
| "learning_rate": 0.00017020748100526006, | |
| "loss": 0.2804, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 5.219058854813203, | |
| "learning_rate": 0.0001699883109292811, | |
| "loss": 0.2497, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.6037966913707118, | |
| "learning_rate": 0.00016976914085330215, | |
| "loss": 0.2066, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 6.240768583919815, | |
| "learning_rate": 0.00016954997077732318, | |
| "loss": 0.2428, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 23.240452777334195, | |
| "learning_rate": 0.0001693308007013442, | |
| "loss": 0.2075, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 39.5379292284798, | |
| "learning_rate": 0.00016911163062536527, | |
| "loss": 0.2936, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 32.798181084355704, | |
| "learning_rate": 0.0001688924605493863, | |
| "loss": 0.3773, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 17.376010201609784, | |
| "learning_rate": 0.00016867329047340733, | |
| "loss": 0.3442, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 12.08974541668813, | |
| "learning_rate": 0.0001684541203974284, | |
| "loss": 0.3321, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 34.210468800599315, | |
| "learning_rate": 0.00016823495032144942, | |
| "loss": 0.2846, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 13.86396524279559, | |
| "learning_rate": 0.00016801578024547046, | |
| "loss": 0.2522, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 2.9623178774098693, | |
| "learning_rate": 0.0001677966101694915, | |
| "loss": 0.3122, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 16.202793893907323, | |
| "learning_rate": 0.00016757744009351255, | |
| "loss": 0.2785, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 8.818952376048744, | |
| "learning_rate": 0.00016735827001753358, | |
| "loss": 0.2893, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 9.840941433124744, | |
| "learning_rate": 0.0001671390999415546, | |
| "loss": 0.4644, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 22.487991211369078, | |
| "learning_rate": 0.0001669199298655757, | |
| "loss": 0.3367, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 29.408750577985327, | |
| "learning_rate": 0.0001667007597895967, | |
| "loss": 0.433, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 44.84702783843238, | |
| "learning_rate": 0.00016648158971361773, | |
| "loss": 0.288, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 35.39092595728219, | |
| "learning_rate": 0.00016626241963763877, | |
| "loss": 0.401, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 9.939332121236001, | |
| "learning_rate": 0.00016604324956165985, | |
| "loss": 0.1682, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 44.968039933166224, | |
| "learning_rate": 0.00016582407948568089, | |
| "loss": 0.2435, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 8.897239396543608, | |
| "learning_rate": 0.0001656049094097019, | |
| "loss": 0.2616, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 24.54192700316105, | |
| "learning_rate": 0.00016538573933372298, | |
| "loss": 0.2454, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.9484839859694942, | |
| "learning_rate": 0.000165166569257744, | |
| "loss": 0.2232, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 15.628379568346645, | |
| "learning_rate": 0.00016494739918176504, | |
| "loss": 0.2314, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 23.499563414114768, | |
| "learning_rate": 0.0001647282291057861, | |
| "loss": 0.1903, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 19.39538243318877, | |
| "learning_rate": 0.00016450905902980713, | |
| "loss": 0.2385, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 5.238154008583709, | |
| "learning_rate": 0.00016428988895382816, | |
| "loss": 0.3917, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 24.81933612902287, | |
| "learning_rate": 0.0001640707188778492, | |
| "loss": 0.2389, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 4.608132814218328, | |
| "learning_rate": 0.00016385154880187025, | |
| "loss": 0.1413, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 11.881773803892107, | |
| "learning_rate": 0.0001636323787258913, | |
| "loss": 0.1785, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 14.177125312181635, | |
| "learning_rate": 0.00016341320864991232, | |
| "loss": 0.2461, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 33.173507263725085, | |
| "learning_rate": 0.00016319403857393338, | |
| "loss": 0.5047, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 37.82372264857794, | |
| "learning_rate": 0.0001629748684979544, | |
| "loss": 0.3656, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 23.51699250829612, | |
| "learning_rate": 0.00016275569842197544, | |
| "loss": 0.3609, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 26.427233006930997, | |
| "learning_rate": 0.00016253652834599647, | |
| "loss": 0.2522, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.8480665720492925, | |
| "learning_rate": 0.00016231735827001753, | |
| "loss": 0.1934, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.0073865565621205, | |
| "learning_rate": 0.00016209818819403856, | |
| "loss": 0.2325, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 7.079199003953245, | |
| "learning_rate": 0.0001618790181180596, | |
| "loss": 0.2733, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 4.227514966678838, | |
| "learning_rate": 0.00016165984804208066, | |
| "loss": 0.5935, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.3825703295584146, | |
| "learning_rate": 0.0001614406779661017, | |
| "loss": 0.2733, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.8576971315426782, | |
| "learning_rate": 0.00016122150789012272, | |
| "loss": 0.303, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 33.6413124083341, | |
| "learning_rate": 0.00016100233781414378, | |
| "loss": 0.2274, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.9887468524380643, | |
| "learning_rate": 0.0001607831677381648, | |
| "loss": 0.2919, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 4.991113672687678, | |
| "learning_rate": 0.00016056399766218584, | |
| "loss": 0.2303, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 56.489557072844796, | |
| "learning_rate": 0.00016034482758620688, | |
| "loss": 0.3899, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.8149639916154947, | |
| "learning_rate": 0.00016012565751022793, | |
| "loss": 0.2179, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 9.821507341872895, | |
| "learning_rate": 0.00015990648743424897, | |
| "loss": 0.3172, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.8569862825069285, | |
| "learning_rate": 0.00015968731735827, | |
| "loss": 0.256, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.1928517803266643, | |
| "learning_rate": 0.00015946814728229106, | |
| "loss": 0.1468, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 8.161581000646946, | |
| "learning_rate": 0.0001592489772063121, | |
| "loss": 0.3608, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.8284875998450847, | |
| "learning_rate": 0.00015902980713033312, | |
| "loss": 0.2207, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 3.6951898202003726, | |
| "learning_rate": 0.00015881063705435415, | |
| "loss": 0.2749, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 17.687512857327995, | |
| "learning_rate": 0.0001585914669783752, | |
| "loss": 0.2825, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 4.61555546951409, | |
| "learning_rate": 0.00015837229690239624, | |
| "loss": 0.3753, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 28.47716869865466, | |
| "learning_rate": 0.00015815312682641728, | |
| "loss": 0.3437, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 9.853541461506175, | |
| "learning_rate": 0.00015793395675043834, | |
| "loss": 0.2261, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 3.150395806350278, | |
| "learning_rate": 0.00015771478667445937, | |
| "loss": 0.3094, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 8.382086348656976, | |
| "learning_rate": 0.0001574956165984804, | |
| "loss": 0.3093, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 2.722468973867923, | |
| "learning_rate": 0.00015727644652250143, | |
| "loss": 0.1783, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 7.546076976068019, | |
| "learning_rate": 0.0001570572764465225, | |
| "loss": 0.1107, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 20.5642182254047, | |
| "learning_rate": 0.00015683810637054352, | |
| "loss": 0.4277, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 23.175588346263925, | |
| "learning_rate": 0.00015661893629456455, | |
| "loss": 0.4047, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 16.76827959394083, | |
| "learning_rate": 0.00015639976621858561, | |
| "loss": 0.2191, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 39.47975455838656, | |
| "learning_rate": 0.00015618059614260665, | |
| "loss": 0.3542, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_0_f1": 0.7521064301552106, | |
| "eval_0_precision": 0.7848218417399352, | |
| "eval_0_recall": 0.7220093656875266, | |
| "eval_1_f1": 0.9185843285755897, | |
| "eval_1_precision": 0.9061781609195402, | |
| "eval_1_recall": 0.9313349084465445, | |
| "eval_accuracy": 0.8774257208639403, | |
| "eval_loss": 0.35205078125, | |
| "eval_runtime": 546.1666, | |
| "eval_samples_per_second": 16.7, | |
| "eval_steps_per_second": 2.785, | |
| "step": 7094 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 3.9467741467334223, | |
| "learning_rate": 0.00015596142606662768, | |
| "loss": 0.185, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.796842409513772, | |
| "learning_rate": 0.00015574225599064874, | |
| "loss": 0.1507, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 1.4260048578862903, | |
| "learning_rate": 0.00015552308591466977, | |
| "loss": 0.1149, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 1.5807242351519994, | |
| "learning_rate": 0.0001553039158386908, | |
| "loss": 0.0837, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 0.04513928456377307, | |
| "learning_rate": 0.00015508474576271183, | |
| "loss": 0.1494, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 40.003470604804754, | |
| "learning_rate": 0.0001548655756867329, | |
| "loss": 0.1058, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 5.841175038437886, | |
| "learning_rate": 0.00015464640561075392, | |
| "loss": 0.1506, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 7.276698167932587, | |
| "learning_rate": 0.00015442723553477496, | |
| "loss": 0.0631, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 1.3902255476639265, | |
| "learning_rate": 0.00015420806545879602, | |
| "loss": 0.0569, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 8.321999591495654, | |
| "learning_rate": 0.00015398889538281705, | |
| "loss": 0.2596, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 4.274705509444957, | |
| "learning_rate": 0.00015376972530683808, | |
| "loss": 0.0755, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 14.284605361939498, | |
| "learning_rate": 0.0001535505552308591, | |
| "loss": 0.0506, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 1.2721793288961767, | |
| "learning_rate": 0.00015333138515488017, | |
| "loss": 0.1444, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 10.887784732379894, | |
| "learning_rate": 0.0001531122150789012, | |
| "loss": 0.0952, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 0.33776382671575805, | |
| "learning_rate": 0.00015289304500292223, | |
| "loss": 0.1503, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 0.7362979177108379, | |
| "learning_rate": 0.00015267387492694332, | |
| "loss": 0.0826, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 12.73307715279125, | |
| "learning_rate": 0.00015245470485096433, | |
| "loss": 0.2583, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 38.30889802059039, | |
| "learning_rate": 0.00015223553477498536, | |
| "loss": 0.2281, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 21.730855964037335, | |
| "learning_rate": 0.00015201636469900644, | |
| "loss": 0.3252, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 19.535045567591606, | |
| "learning_rate": 0.00015179719462302748, | |
| "loss": 0.1412, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 6.082430108448023, | |
| "learning_rate": 0.00015157802454704848, | |
| "loss": 0.1301, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 3.5121677910383875, | |
| "learning_rate": 0.0001513588544710695, | |
| "loss": 0.2337, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 18.706966962801445, | |
| "learning_rate": 0.0001511396843950906, | |
| "loss": 0.0847, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 1.4783230754439916, | |
| "learning_rate": 0.00015092051431911163, | |
| "loss": 0.1419, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 2.9639753705286136, | |
| "learning_rate": 0.00015070134424313264, | |
| "loss": 0.0583, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 23.160696704392283, | |
| "learning_rate": 0.00015048217416715372, | |
| "loss": 0.2117, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 12.771447911890823, | |
| "learning_rate": 0.00015026300409117475, | |
| "loss": 0.0548, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 11.531079583730829, | |
| "learning_rate": 0.00015004383401519579, | |
| "loss": 0.1, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.5725191403592071, | |
| "learning_rate": 0.00014982466393921682, | |
| "loss": 0.0763, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 19.62591146141424, | |
| "learning_rate": 0.00014960549386323785, | |
| "loss": 0.2172, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 4.328008880202292, | |
| "learning_rate": 0.0001493863237872589, | |
| "loss": 0.1791, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 2.542706232558499, | |
| "learning_rate": 0.00014916715371127994, | |
| "loss": 0.1069, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 5.01099611371998, | |
| "learning_rate": 0.00014894798363530097, | |
| "loss": 0.0867, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 1.4225594957009309, | |
| "learning_rate": 0.00014872881355932203, | |
| "loss": 0.1617, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.4366748859319889, | |
| "learning_rate": 0.00014850964348334306, | |
| "loss": 0.1247, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 0.4798934066028662, | |
| "learning_rate": 0.0001482904734073641, | |
| "loss": 0.1457, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 0.8614048372726179, | |
| "learning_rate": 0.00014807130333138516, | |
| "loss": 0.1488, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.5048632731892742, | |
| "learning_rate": 0.0001478521332554062, | |
| "loss": 0.0832, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 2.6843269650855808, | |
| "learning_rate": 0.00014763296317942722, | |
| "loss": 0.0961, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 16.8741628191936, | |
| "learning_rate": 0.00014741379310344825, | |
| "loss": 0.2136, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 0.807139340475033, | |
| "learning_rate": 0.0001471946230274693, | |
| "loss": 0.1095, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 2.2184905450148986, | |
| "learning_rate": 0.00014697545295149034, | |
| "loss": 0.1902, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5.771979829819086, | |
| "learning_rate": 0.00014675628287551137, | |
| "loss": 0.138, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5.97570946689331, | |
| "learning_rate": 0.00014653711279953243, | |
| "loss": 0.21, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 5.577046113368879, | |
| "learning_rate": 0.00014631794272355347, | |
| "loss": 0.2296, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 0.16147618396834, | |
| "learning_rate": 0.0001460987726475745, | |
| "loss": 0.1041, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 1.3482389383340438, | |
| "learning_rate": 0.00014587960257159553, | |
| "loss": 0.0832, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 5.133780628362138, | |
| "learning_rate": 0.0001456604324956166, | |
| "loss": 0.0916, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 2.729454474197146, | |
| "learning_rate": 0.00014544126241963762, | |
| "loss": 0.1806, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 2.72773715656119, | |
| "learning_rate": 0.00014522209234365865, | |
| "loss": 0.0647, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 3.507884957259747, | |
| "learning_rate": 0.0001450248392752776, | |
| "loss": 0.2701, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 19.081923498359, | |
| "learning_rate": 0.00014480566919929864, | |
| "loss": 0.0962, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 45.75244080209308, | |
| "learning_rate": 0.00014458649912331968, | |
| "loss": 0.2713, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 6.360305569438668, | |
| "learning_rate": 0.00014436732904734073, | |
| "loss": 0.0377, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 11.812868187605755, | |
| "learning_rate": 0.00014414815897136177, | |
| "loss": 0.0538, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 11.581826556875212, | |
| "learning_rate": 0.0001439289888953828, | |
| "loss": 0.1113, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 3.0769487150537067, | |
| "learning_rate": 0.00014370981881940383, | |
| "loss": 0.1045, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 19.551871190286114, | |
| "learning_rate": 0.0001434906487434249, | |
| "loss": 0.1375, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 4.0482427664543925, | |
| "learning_rate": 0.00014327147866744592, | |
| "loss": 0.1863, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 3.1977613453815654, | |
| "learning_rate": 0.00014305230859146695, | |
| "loss": 0.0756, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 0.9737985079221272, | |
| "learning_rate": 0.000142833138515488, | |
| "loss": 0.0916, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.984146795188883, | |
| "learning_rate": 0.00014261396843950904, | |
| "loss": 0.2625, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.72026076920096, | |
| "learning_rate": 0.00014239479836353008, | |
| "loss": 0.1073, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 0.555301240735939, | |
| "learning_rate": 0.00014217562828755114, | |
| "loss": 0.0962, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 12.306241433553296, | |
| "learning_rate": 0.00014195645821157217, | |
| "loss": 0.312, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 39.31295240290497, | |
| "learning_rate": 0.0001417372881355932, | |
| "loss": 0.1382, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 18.239083580266996, | |
| "learning_rate": 0.00014151811805961423, | |
| "loss": 0.1959, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 3.6063732707629277, | |
| "learning_rate": 0.0001412989479836353, | |
| "loss": 0.1882, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 22.59443633796688, | |
| "learning_rate": 0.00014107977790765632, | |
| "loss": 0.1411, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 9.943061970841525, | |
| "learning_rate": 0.00014086060783167738, | |
| "loss": 0.2073, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 13.334986026618791, | |
| "learning_rate": 0.00014064143775569841, | |
| "loss": 0.1452, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 18.672479936744026, | |
| "learning_rate": 0.00014042226767971945, | |
| "loss": 0.1604, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 8.209768975033892, | |
| "learning_rate": 0.0001402030976037405, | |
| "loss": 0.0633, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 7.929920880167193, | |
| "learning_rate": 0.00013998392752776154, | |
| "loss": 0.0759, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 5.155314191153851, | |
| "learning_rate": 0.00013976475745178257, | |
| "loss": 0.1334, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 12.293469154578297, | |
| "learning_rate": 0.00013954558737580363, | |
| "loss": 0.1731, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 27.793799163143525, | |
| "learning_rate": 0.00013932641729982466, | |
| "loss": 0.1718, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 18.160732235536603, | |
| "learning_rate": 0.0001391072472238457, | |
| "loss": 0.3889, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 6.719945966655997, | |
| "learning_rate": 0.00013888807714786672, | |
| "loss": 0.0903, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 4.702233374552434, | |
| "learning_rate": 0.00013866890707188778, | |
| "loss": 0.1029, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.6814706649468594, | |
| "learning_rate": 0.00013844973699590882, | |
| "loss": 0.1609, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 35.183209348221745, | |
| "learning_rate": 0.00013823056691992985, | |
| "loss": 0.0987, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 0.3853650828148091, | |
| "learning_rate": 0.0001380113968439509, | |
| "loss": 0.092, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.1900815019481827, | |
| "learning_rate": 0.00013779222676797194, | |
| "loss": 0.0721, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.719013304384676, | |
| "learning_rate": 0.00013757305669199297, | |
| "loss": 0.2757, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.9065891840102538, | |
| "learning_rate": 0.000137353886616014, | |
| "loss": 0.0894, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.7041589905973044, | |
| "learning_rate": 0.00013713471654003506, | |
| "loss": 0.1567, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 13.09370538833684, | |
| "learning_rate": 0.0001369155464640561, | |
| "loss": 0.1153, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.10005792474473749, | |
| "learning_rate": 0.00013669637638807713, | |
| "loss": 0.0889, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 8.650143720385165, | |
| "learning_rate": 0.00013647720631209818, | |
| "loss": 0.102, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 13.834552459976706, | |
| "learning_rate": 0.00013625803623611922, | |
| "loss": 0.1991, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 7.1136160697882636, | |
| "learning_rate": 0.00013603886616014025, | |
| "loss": 0.1605, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 3.409491115278956, | |
| "learning_rate": 0.00013581969608416128, | |
| "loss": 0.182, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 11.603050853164602, | |
| "learning_rate": 0.00013560052600818234, | |
| "loss": 0.1393, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 91.11285901526837, | |
| "learning_rate": 0.00013538135593220337, | |
| "loss": 0.5593, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 9.002577439722662, | |
| "learning_rate": 0.0001351621858562244, | |
| "loss": 0.2636, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 2.8423642348776945, | |
| "learning_rate": 0.00013494301578024546, | |
| "loss": 0.2049, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 9.017487980947873, | |
| "learning_rate": 0.0001347238457042665, | |
| "loss": 0.2001, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 5.463991248637368, | |
| "learning_rate": 0.00013450467562828753, | |
| "loss": 0.1163, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 35.9490561961924, | |
| "learning_rate": 0.00013428550555230859, | |
| "loss": 0.1705, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.5214346506002336, | |
| "learning_rate": 0.00013406633547632962, | |
| "loss": 0.1706, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 6.553206900744027, | |
| "learning_rate": 0.00013384716540035068, | |
| "loss": 0.1406, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 10.219338883484347, | |
| "learning_rate": 0.00013362799532437168, | |
| "loss": 0.2204, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 18.748997475525492, | |
| "learning_rate": 0.00013340882524839274, | |
| "loss": 0.2109, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 1.3727170176969377, | |
| "learning_rate": 0.0001331896551724138, | |
| "loss": 0.1122, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 10.948203912180993, | |
| "learning_rate": 0.00013297048509643483, | |
| "loss": 0.3022, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 2.056636685995259, | |
| "learning_rate": 0.00013275131502045586, | |
| "loss": 0.239, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 6.518651435362685, | |
| "learning_rate": 0.0001325321449444769, | |
| "loss": 0.3234, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 5.475635275763899, | |
| "learning_rate": 0.00013231297486849796, | |
| "loss": 0.2198, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 3.671883643297412, | |
| "learning_rate": 0.000132093804792519, | |
| "loss": 0.1707, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 0.4579375750027517, | |
| "learning_rate": 0.00013187463471654002, | |
| "loss": 0.2133, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 5.194066447141225, | |
| "learning_rate": 0.00013165546464056108, | |
| "loss": 0.2439, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 10.494958205761325, | |
| "learning_rate": 0.0001314362945645821, | |
| "loss": 0.244, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 3.2710044167508534, | |
| "learning_rate": 0.00013121712448860314, | |
| "loss": 0.1088, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 16.056832749725743, | |
| "learning_rate": 0.00013099795441262417, | |
| "loss": 0.2221, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 0.7672515772397378, | |
| "learning_rate": 0.00013077878433664523, | |
| "loss": 0.0554, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 21.726819743293646, | |
| "learning_rate": 0.00013055961426066627, | |
| "loss": 0.1701, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 11.287063948392506, | |
| "learning_rate": 0.0001303404441846873, | |
| "loss": 0.2342, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 22.058917311910815, | |
| "learning_rate": 0.00013012127410870836, | |
| "loss": 0.163, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 3.2943302484351142, | |
| "learning_rate": 0.0001299021040327294, | |
| "loss": 0.1142, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 41.01468692106424, | |
| "learning_rate": 0.00012968293395675042, | |
| "loss": 0.1652, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 15.110863439212581, | |
| "learning_rate": 0.00012946376388077145, | |
| "loss": 0.2891, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 4.692836606354725, | |
| "learning_rate": 0.0001292445938047925, | |
| "loss": 0.0777, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 42.35071301518718, | |
| "learning_rate": 0.00012902542372881354, | |
| "loss": 0.2289, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 3.3851205501937085, | |
| "learning_rate": 0.00012880625365283458, | |
| "loss": 0.0574, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 32.382836448835434, | |
| "learning_rate": 0.00012858708357685564, | |
| "loss": 0.2681, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 11.990285837961236, | |
| "learning_rate": 0.00012836791350087667, | |
| "loss": 0.1145, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 6.231424331853902, | |
| "learning_rate": 0.0001281487434248977, | |
| "loss": 0.1645, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 8.823978019308194, | |
| "learning_rate": 0.00012792957334891876, | |
| "loss": 0.0895, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 3.5791569065379147, | |
| "learning_rate": 0.0001277104032729398, | |
| "loss": 0.2226, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 4.904760306159147, | |
| "learning_rate": 0.00012749123319696082, | |
| "loss": 0.1586, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 35.790471287396194, | |
| "learning_rate": 0.00012727206312098185, | |
| "loss": 0.1451, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 19.49158058941717, | |
| "learning_rate": 0.0001270528930450029, | |
| "loss": 0.1129, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 1.2301380360175656, | |
| "learning_rate": 0.00012683372296902397, | |
| "loss": 0.2055, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 6.081015675249448, | |
| "learning_rate": 0.00012661455289304498, | |
| "loss": 0.1039, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 8.51374019556884, | |
| "learning_rate": 0.00012639538281706604, | |
| "loss": 0.0764, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 22.61234712969463, | |
| "learning_rate": 0.00012617621274108707, | |
| "loss": 0.1144, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 16.325546182379608, | |
| "learning_rate": 0.00012595704266510813, | |
| "loss": 0.2252, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 10.684407579259915, | |
| "learning_rate": 0.00012573787258912916, | |
| "loss": 0.1617, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 18.8401359355114, | |
| "learning_rate": 0.0001255187025131502, | |
| "loss": 0.1327, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 4.428339354625936, | |
| "learning_rate": 0.00012529953243717125, | |
| "loss": 0.2793, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 34.97712900138805, | |
| "learning_rate": 0.00012508036236119228, | |
| "loss": 0.1734, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 10.842732736668664, | |
| "learning_rate": 0.00012486119228521331, | |
| "loss": 0.172, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 3.8204570700978753, | |
| "learning_rate": 0.00012464202220923435, | |
| "loss": 0.1893, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 1.7847088171149714, | |
| "learning_rate": 0.0001244228521332554, | |
| "loss": 0.1119, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 10.013026009815832, | |
| "learning_rate": 0.00012420368205727644, | |
| "loss": 0.1488, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 0.9956055302547419, | |
| "learning_rate": 0.00012398451198129747, | |
| "loss": 0.167, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 15.708190043930621, | |
| "learning_rate": 0.00012376534190531853, | |
| "loss": 0.103, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 9.516127340363248, | |
| "learning_rate": 0.00012354617182933956, | |
| "loss": 0.1761, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 11.289621429730468, | |
| "learning_rate": 0.0001233270017533606, | |
| "loss": 0.21, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 6.438699785103895, | |
| "learning_rate": 0.00012310783167738162, | |
| "loss": 0.1212, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 0.4291084022368479, | |
| "learning_rate": 0.00012288866160140268, | |
| "loss": 0.1315, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 3.090543654415638, | |
| "learning_rate": 0.00012266949152542372, | |
| "loss": 0.0689, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 0.47917246377381595, | |
| "learning_rate": 0.00012245032144944475, | |
| "loss": 0.1119, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.7069329797066186, | |
| "learning_rate": 0.0001222311513734658, | |
| "loss": 0.0562, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 18.96685701324762, | |
| "learning_rate": 0.00012201198129748684, | |
| "loss": 0.1177, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.00279294620786177, | |
| "learning_rate": 0.00012179281122150788, | |
| "loss": 0.0998, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 6.199666064354547, | |
| "learning_rate": 0.00012157364114552893, | |
| "loss": 0.083, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.639057585376392, | |
| "learning_rate": 0.00012135447106954996, | |
| "loss": 0.0941, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.15447864197247607, | |
| "learning_rate": 0.0001211572180011689, | |
| "loss": 0.1944, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.8173758905341566, | |
| "learning_rate": 0.00012093804792518993, | |
| "loss": 0.1019, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 3.243981396595975, | |
| "learning_rate": 0.00012071887784921097, | |
| "loss": 0.3167, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 3.4464993538031634, | |
| "learning_rate": 0.00012049970777323202, | |
| "loss": 0.1482, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 23.425856884504597, | |
| "learning_rate": 0.00012028053769725305, | |
| "loss": 0.2159, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 18.894344998479365, | |
| "learning_rate": 0.00012006136762127411, | |
| "loss": 0.3282, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 4.015044349522744, | |
| "learning_rate": 0.00011984219754529513, | |
| "loss": 0.1495, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 0.456213488330113, | |
| "learning_rate": 0.00011962302746931619, | |
| "loss": 0.1598, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 13.704116606800925, | |
| "learning_rate": 0.00011940385739333723, | |
| "loss": 0.1294, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 7.3368094990394175, | |
| "learning_rate": 0.00011918468731735826, | |
| "loss": 0.1551, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.0015347738020366, | |
| "learning_rate": 0.00011896551724137931, | |
| "loss": 0.1037, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 4.798234813041826, | |
| "learning_rate": 0.00011874634716540034, | |
| "loss": 0.0762, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.46732247838464797, | |
| "learning_rate": 0.00011852717708942139, | |
| "loss": 0.2361, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.78510526101886, | |
| "learning_rate": 0.00011830800701344242, | |
| "loss": 0.1172, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 14.754170828495456, | |
| "learning_rate": 0.00011808883693746346, | |
| "loss": 0.1337, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 5.457626503330071, | |
| "learning_rate": 0.00011786966686148451, | |
| "loss": 0.1238, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 24.359633554477504, | |
| "learning_rate": 0.00011765049678550554, | |
| "loss": 0.159, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 4.052116414721034, | |
| "learning_rate": 0.00011743132670952659, | |
| "loss": 0.1946, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 10.286136518184552, | |
| "learning_rate": 0.00011721215663354762, | |
| "loss": 0.0675, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.7798842992321797, | |
| "learning_rate": 0.00011699298655756866, | |
| "loss": 0.0518, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 3.1590348601862037, | |
| "learning_rate": 0.00011677381648158971, | |
| "loss": 0.237, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 9.309421761709203, | |
| "learning_rate": 0.00011655464640561074, | |
| "loss": 0.1237, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 7.223449459613724, | |
| "learning_rate": 0.00011633547632963179, | |
| "loss": 0.1144, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 2.293633045983554, | |
| "learning_rate": 0.00011611630625365282, | |
| "loss": 0.1469, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 11.619319474508913, | |
| "learning_rate": 0.00011589713617767387, | |
| "loss": 0.1324, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 8.237900621376555, | |
| "learning_rate": 0.00011567796610169491, | |
| "loss": 0.1019, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 3.4844703517603746, | |
| "learning_rate": 0.00011545879602571594, | |
| "loss": 0.1582, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 48.74139317560625, | |
| "learning_rate": 0.00011523962594973699, | |
| "loss": 0.243, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 32.91098913412278, | |
| "learning_rate": 0.00011502045587375802, | |
| "loss": 0.153, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 5.659700047857308, | |
| "learning_rate": 0.00011480128579777907, | |
| "loss": 0.0843, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 22.35388198625644, | |
| "learning_rate": 0.0001145821157218001, | |
| "loss": 0.1841, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 5.24175893236962, | |
| "learning_rate": 0.00011436294564582114, | |
| "loss": 0.1452, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 5.865583240157655, | |
| "learning_rate": 0.00011414377556984219, | |
| "loss": 0.1757, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 16.96991984978489, | |
| "learning_rate": 0.00011392460549386322, | |
| "loss": 0.2905, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 1.4459460915714275, | |
| "learning_rate": 0.00011370543541788427, | |
| "loss": 0.0953, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 0.27775375444037353, | |
| "learning_rate": 0.0001134862653419053, | |
| "loss": 0.0792, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 1.1397011386751719, | |
| "learning_rate": 0.00011326709526592634, | |
| "loss": 0.0971, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 32.740624235968234, | |
| "learning_rate": 0.0001130479251899474, | |
| "loss": 0.152, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 15.666578971132482, | |
| "learning_rate": 0.00011282875511396842, | |
| "loss": 0.2745, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.6312408815420002, | |
| "learning_rate": 0.00011260958503798948, | |
| "loss": 0.0736, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.9882525535102352, | |
| "learning_rate": 0.0001123904149620105, | |
| "loss": 0.1197, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 22.45196464336915, | |
| "learning_rate": 0.00011217124488603156, | |
| "loss": 0.0746, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 0.4466669958671361, | |
| "learning_rate": 0.00011195207481005258, | |
| "loss": 0.026, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 18.22674384805627, | |
| "learning_rate": 0.00011173290473407364, | |
| "loss": 0.1488, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 3.2075642222454324, | |
| "learning_rate": 0.00011151373465809468, | |
| "loss": 0.271, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 8.45175655576022, | |
| "learning_rate": 0.00011129456458211571, | |
| "loss": 0.0749, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 13.874834426706034, | |
| "learning_rate": 0.00011107539450613676, | |
| "loss": 0.0782, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 0.9676566458873671, | |
| "learning_rate": 0.00011085622443015779, | |
| "loss": 0.0905, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 5.621065616371578, | |
| "learning_rate": 0.00011063705435417884, | |
| "loss": 0.0798, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 7.042618733836522, | |
| "learning_rate": 0.00011041788427819988, | |
| "loss": 0.2499, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.4221807383973646, | |
| "learning_rate": 0.00011019871420222091, | |
| "loss": 0.0507, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 24.467155852219083, | |
| "learning_rate": 0.00010997954412624196, | |
| "loss": 0.1741, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 4.428974769529165, | |
| "learning_rate": 0.00010976037405026299, | |
| "loss": 0.1104, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 20.92148868212674, | |
| "learning_rate": 0.00010954120397428404, | |
| "loss": 0.0914, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.30038070281462703, | |
| "learning_rate": 0.00010932203389830507, | |
| "loss": 0.1936, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 48.36362550140161, | |
| "learning_rate": 0.00010910286382232611, | |
| "loss": 0.3639, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 8.15759688958997, | |
| "learning_rate": 0.00010888369374634716, | |
| "loss": 0.1991, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 12.841408835810743, | |
| "learning_rate": 0.00010866452367036819, | |
| "loss": 0.1441, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 13.483453911295381, | |
| "learning_rate": 0.00010844535359438924, | |
| "loss": 0.0981, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.5843792074816087, | |
| "learning_rate": 0.00010822618351841027, | |
| "loss": 0.2757, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 7.822943624957112, | |
| "learning_rate": 0.00010800701344243132, | |
| "loss": 0.1102, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 35.655682175617585, | |
| "learning_rate": 0.00010778784336645236, | |
| "loss": 0.2879, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.429017224224025, | |
| "learning_rate": 0.0001075686732904734, | |
| "loss": 0.0986, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 8.077860057159654, | |
| "learning_rate": 0.00010734950321449444, | |
| "loss": 0.2654, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 1.2534241595837954, | |
| "learning_rate": 0.00010713033313851547, | |
| "loss": 0.0941, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 0.9501360823975038, | |
| "learning_rate": 0.00010691116306253652, | |
| "loss": 0.1358, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.3120476171581812, | |
| "learning_rate": 0.00010669199298655756, | |
| "loss": 0.1927, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 10.128511370932692, | |
| "learning_rate": 0.0001064728229105786, | |
| "loss": 0.2176, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 3.376491562592107, | |
| "learning_rate": 0.00010625365283459964, | |
| "loss": 0.0673, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.011877575586390247, | |
| "learning_rate": 0.00010603448275862067, | |
| "loss": 0.1272, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 15.244920750217991, | |
| "learning_rate": 0.00010581531268264172, | |
| "loss": 0.1012, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.2705443274155431, | |
| "learning_rate": 0.00010559614260666275, | |
| "loss": 0.1024, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 12.05359833618471, | |
| "learning_rate": 0.0001053769725306838, | |
| "loss": 0.1826, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 9.360989137584955, | |
| "learning_rate": 0.00010515780245470485, | |
| "loss": 0.205, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 6.831707184981156, | |
| "learning_rate": 0.00010493863237872587, | |
| "loss": 0.2364, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 12.68075831146527, | |
| "learning_rate": 0.00010471946230274693, | |
| "loss": 0.1878, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 9.461914667245052, | |
| "learning_rate": 0.00010450029222676796, | |
| "loss": 0.1061, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 33.7830548827646, | |
| "learning_rate": 0.00010428112215078901, | |
| "loss": 0.0955, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 3.41262405773915, | |
| "learning_rate": 0.00010406195207481005, | |
| "loss": 0.0893, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 16.661587161769187, | |
| "learning_rate": 0.00010384278199883109, | |
| "loss": 0.1281, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 19.501609655955452, | |
| "learning_rate": 0.00010362361192285213, | |
| "loss": 0.3944, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 4.601907463118784, | |
| "learning_rate": 0.00010340444184687316, | |
| "loss": 0.1158, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 24.10143095564842, | |
| "learning_rate": 0.00010318527177089421, | |
| "loss": 0.2357, | |
| "step": 9510 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 4.970899462803766, | |
| "learning_rate": 0.00010296610169491524, | |
| "loss": 0.1134, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.04492151846400819, | |
| "learning_rate": 0.00010274693161893629, | |
| "loss": 0.1146, | |
| "step": 9530 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 7.2274322872988055, | |
| "learning_rate": 0.00010252776154295733, | |
| "loss": 0.1354, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 6.048047676599459, | |
| "learning_rate": 0.00010230859146697836, | |
| "loss": 0.2284, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.9638985947560608, | |
| "learning_rate": 0.00010208942139099941, | |
| "loss": 0.0955, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 6.19171074222296, | |
| "learning_rate": 0.00010187025131502044, | |
| "loss": 0.0909, | |
| "step": 9570 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 22.167114710888278, | |
| "learning_rate": 0.00010165108123904149, | |
| "loss": 0.1367, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.3557238646240087, | |
| "learning_rate": 0.00010143191116306253, | |
| "loss": 0.0825, | |
| "step": 9590 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 23.067425155746133, | |
| "learning_rate": 0.00010121274108708357, | |
| "loss": 0.0815, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.3212362814667589, | |
| "learning_rate": 0.00010099357101110461, | |
| "loss": 0.1454, | |
| "step": 9610 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 9.03334133310524, | |
| "learning_rate": 0.00010077440093512564, | |
| "loss": 0.1943, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 2.1696899004758556, | |
| "learning_rate": 0.00010055523085914669, | |
| "loss": 0.1105, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 3.8419745918801067, | |
| "learning_rate": 0.00010033606078316773, | |
| "loss": 0.3075, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.9683867410845369, | |
| "learning_rate": 0.00010011689070718877, | |
| "loss": 0.1233, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 13.53465449736677, | |
| "learning_rate": 9.989772063120981e-05, | |
| "loss": 0.2396, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 1.3390366123084314, | |
| "learning_rate": 9.967855055523084e-05, | |
| "loss": 0.117, | |
| "step": 9670 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 0.7238054927151057, | |
| "learning_rate": 9.945938047925189e-05, | |
| "loss": 0.0825, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 1.343299376332715, | |
| "learning_rate": 9.924021040327292e-05, | |
| "loss": 0.1852, | |
| "step": 9690 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 2.7174665471907296, | |
| "learning_rate": 9.902104032729397e-05, | |
| "loss": 0.185, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 19.67625891707943, | |
| "learning_rate": 9.880187025131501e-05, | |
| "loss": 0.1037, | |
| "step": 9710 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.7362888479778698, | |
| "learning_rate": 9.858270017533604e-05, | |
| "loss": 0.0993, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 24.97466231442174, | |
| "learning_rate": 9.836353009935709e-05, | |
| "loss": 0.3, | |
| "step": 9730 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.19223391446010424, | |
| "learning_rate": 9.814436002337812e-05, | |
| "loss": 0.139, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.0232214471272263, | |
| "learning_rate": 9.792518994739917e-05, | |
| "loss": 0.0779, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 2.4406875106467685, | |
| "learning_rate": 9.770601987142023e-05, | |
| "loss": 0.2194, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 29.07247504638446, | |
| "learning_rate": 9.748684979544126e-05, | |
| "loss": 0.0975, | |
| "step": 9770 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.7785847782304731, | |
| "learning_rate": 9.72676797194623e-05, | |
| "loss": 0.1824, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 15.90756717391926, | |
| "learning_rate": 9.704850964348334e-05, | |
| "loss": 0.0997, | |
| "step": 9790 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.5434568527038021, | |
| "learning_rate": 9.682933956750438e-05, | |
| "loss": 0.0639, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.8181189757985562, | |
| "learning_rate": 9.661016949152541e-05, | |
| "loss": 0.1401, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.7002007823686216, | |
| "learning_rate": 9.639099941554646e-05, | |
| "loss": 0.1317, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 10.646847543416506, | |
| "learning_rate": 9.61718293395675e-05, | |
| "loss": 0.0834, | |
| "step": 9830 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 2.854877487641081, | |
| "learning_rate": 9.595265926358854e-05, | |
| "loss": 0.094, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 21.086749067232077, | |
| "learning_rate": 9.573348918760958e-05, | |
| "loss": 0.1658, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 6.599220856291059, | |
| "learning_rate": 9.551431911163061e-05, | |
| "loss": 0.1013, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 10.557135159098205, | |
| "learning_rate": 9.529514903565166e-05, | |
| "loss": 0.1954, | |
| "step": 9870 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 9.42143848698557, | |
| "learning_rate": 9.50759789596727e-05, | |
| "loss": 0.2229, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 36.05803797726123, | |
| "learning_rate": 9.485680888369374e-05, | |
| "loss": 0.1102, | |
| "step": 9890 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 19.750284280798315, | |
| "learning_rate": 9.463763880771478e-05, | |
| "loss": 0.1737, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 20.670824131237584, | |
| "learning_rate": 9.441846873173582e-05, | |
| "loss": 0.1087, | |
| "step": 9910 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 3.3595758310158126, | |
| "learning_rate": 9.419929865575686e-05, | |
| "loss": 0.0856, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.6471151094333392, | |
| "learning_rate": 9.398012857977789e-05, | |
| "loss": 0.1957, | |
| "step": 9930 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 26.791209319259156, | |
| "learning_rate": 9.376095850379894e-05, | |
| "loss": 0.1552, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 2.0994878103860124, | |
| "learning_rate": 9.354178842781998e-05, | |
| "loss": 0.0714, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 3.939232559831004, | |
| "learning_rate": 9.332261835184102e-05, | |
| "loss": 0.1463, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.93177447901502, | |
| "learning_rate": 9.310344827586206e-05, | |
| "loss": 0.3218, | |
| "step": 9970 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 11.674268724271638, | |
| "learning_rate": 9.28842781998831e-05, | |
| "loss": 0.121, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 29.77892155165882, | |
| "learning_rate": 9.266510812390414e-05, | |
| "loss": 0.2259, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 3.1793474536477495, | |
| "learning_rate": 9.244593804792518e-05, | |
| "loss": 0.1147, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 29.467125692383135, | |
| "learning_rate": 9.222676797194622e-05, | |
| "loss": 0.1296, | |
| "step": 10010 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 12.829429531674693, | |
| "learning_rate": 9.202951490356516e-05, | |
| "loss": 0.343, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 4.842170823899554, | |
| "learning_rate": 9.18103448275862e-05, | |
| "loss": 0.1342, | |
| "step": 10030 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 2.8036087435052117, | |
| "learning_rate": 9.159117475160724e-05, | |
| "loss": 0.1981, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.2567121612024053, | |
| "learning_rate": 9.137200467562828e-05, | |
| "loss": 0.1156, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 39.13892743280264, | |
| "learning_rate": 9.115283459964932e-05, | |
| "loss": 0.217, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 4.018926730147514, | |
| "learning_rate": 9.093366452367036e-05, | |
| "loss": 0.1098, | |
| "step": 10070 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 20.370189526716274, | |
| "learning_rate": 9.07144944476914e-05, | |
| "loss": 0.1131, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 12.513349385435367, | |
| "learning_rate": 9.049532437171244e-05, | |
| "loss": 0.0835, | |
| "step": 10090 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 3.0078713954695693, | |
| "learning_rate": 9.027615429573349e-05, | |
| "loss": 0.0935, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.35428391785347213, | |
| "learning_rate": 9.005698421975452e-05, | |
| "loss": 0.1368, | |
| "step": 10110 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 2.8765836604387487, | |
| "learning_rate": 8.983781414377556e-05, | |
| "loss": 0.0582, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.27300803686564074, | |
| "learning_rate": 8.96186440677966e-05, | |
| "loss": 0.1443, | |
| "step": 10130 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.0632557332902792, | |
| "learning_rate": 8.939947399181764e-05, | |
| "loss": 0.2713, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.3919283720717666, | |
| "learning_rate": 8.918030391583869e-05, | |
| "loss": 0.1426, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.2975889790690355, | |
| "learning_rate": 8.896113383985972e-05, | |
| "loss": 0.1255, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 0.6331908778097588, | |
| "learning_rate": 8.874196376388076e-05, | |
| "loss": 0.1088, | |
| "step": 10170 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 0.49968429973464584, | |
| "learning_rate": 8.85227936879018e-05, | |
| "loss": 0.0462, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 0.3998283883981307, | |
| "learning_rate": 8.830362361192284e-05, | |
| "loss": 0.1456, | |
| "step": 10190 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.6762860335305919, | |
| "learning_rate": 8.808445353594387e-05, | |
| "loss": 0.1, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.5918195856520565, | |
| "learning_rate": 8.786528345996492e-05, | |
| "loss": 0.0775, | |
| "step": 10210 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.47908599501430993, | |
| "learning_rate": 8.764611338398598e-05, | |
| "loss": 0.1299, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.02533220752832844, | |
| "learning_rate": 8.7426943308007e-05, | |
| "loss": 0.1705, | |
| "step": 10230 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 1.6916417760501654, | |
| "learning_rate": 8.720777323202806e-05, | |
| "loss": 0.3096, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 5.6132755915793116, | |
| "learning_rate": 8.698860315604907e-05, | |
| "loss": 0.14, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 3.6946000211637866, | |
| "learning_rate": 8.676943308007013e-05, | |
| "loss": 0.1283, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.5612252460560485, | |
| "learning_rate": 8.655026300409118e-05, | |
| "loss": 0.2084, | |
| "step": 10270 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 17.442735530139714, | |
| "learning_rate": 8.633109292811221e-05, | |
| "loss": 0.0978, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 0.7188172450254493, | |
| "learning_rate": 8.611192285213326e-05, | |
| "loss": 0.1475, | |
| "step": 10290 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 0.23472035328250088, | |
| "learning_rate": 8.589275277615429e-05, | |
| "loss": 0.084, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 3.847787592250939, | |
| "learning_rate": 8.567358270017533e-05, | |
| "loss": 0.0532, | |
| "step": 10310 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 8.67255157030904, | |
| "learning_rate": 8.545441262419637e-05, | |
| "loss": 0.2527, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.6329813929319419, | |
| "learning_rate": 8.523524254821741e-05, | |
| "loss": 0.1688, | |
| "step": 10330 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 38.31309176901214, | |
| "learning_rate": 8.501607247223846e-05, | |
| "loss": 0.1513, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 0.9873950760760376, | |
| "learning_rate": 8.479690239625949e-05, | |
| "loss": 0.1007, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 12.444359816230664, | |
| "learning_rate": 8.457773232028053e-05, | |
| "loss": 0.0936, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 2.1983295007186743, | |
| "learning_rate": 8.435856224430157e-05, | |
| "loss": 0.1726, | |
| "step": 10370 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 25.402786909188887, | |
| "learning_rate": 8.413939216832261e-05, | |
| "loss": 0.2034, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 10.271503407485543, | |
| "learning_rate": 8.392022209234366e-05, | |
| "loss": 0.0983, | |
| "step": 10390 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 11.41372086097229, | |
| "learning_rate": 8.370105201636469e-05, | |
| "loss": 0.1763, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.0021380382604943, | |
| "learning_rate": 8.348188194038573e-05, | |
| "loss": 0.1691, | |
| "step": 10410 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.4011630497758993, | |
| "learning_rate": 8.326271186440677e-05, | |
| "loss": 0.0795, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.5995590743117784, | |
| "learning_rate": 8.304354178842781e-05, | |
| "loss": 0.155, | |
| "step": 10430 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 13.891285620986729, | |
| "learning_rate": 8.282437171244884e-05, | |
| "loss": 0.152, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 4.322816995983127, | |
| "learning_rate": 8.260520163646989e-05, | |
| "loss": 0.1174, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 4.199701886882987, | |
| "learning_rate": 8.238603156049094e-05, | |
| "loss": 0.1112, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.0003512662431095, | |
| "learning_rate": 8.216686148451197e-05, | |
| "loss": 0.1023, | |
| "step": 10470 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 2.7506464116256915, | |
| "learning_rate": 8.194769140853301e-05, | |
| "loss": 0.125, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.267916746880681, | |
| "learning_rate": 8.172852133255405e-05, | |
| "loss": 0.1657, | |
| "step": 10490 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.1331681687447057, | |
| "learning_rate": 8.150935125657509e-05, | |
| "loss": 0.1416, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.8446952462251813, | |
| "learning_rate": 8.129018118059614e-05, | |
| "loss": 0.2599, | |
| "step": 10510 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 0.51816310859789, | |
| "learning_rate": 8.107101110461717e-05, | |
| "loss": 0.0686, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 1.2233390386002774, | |
| "learning_rate": 8.085184102863821e-05, | |
| "loss": 0.1502, | |
| "step": 10530 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 83.49175206154096, | |
| "learning_rate": 8.063267095265925e-05, | |
| "loss": 0.1806, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 4.967238991092641, | |
| "learning_rate": 8.041350087668029e-05, | |
| "loss": 0.1847, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 15.102812775710126, | |
| "learning_rate": 8.019433080070135e-05, | |
| "loss": 0.1602, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 26.52721557816532, | |
| "learning_rate": 7.997516072472237e-05, | |
| "loss": 0.1863, | |
| "step": 10570 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.232502179558844, | |
| "learning_rate": 7.975599064874343e-05, | |
| "loss": 0.1091, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 0.802916184416696, | |
| "learning_rate": 7.953682057276446e-05, | |
| "loss": 0.1202, | |
| "step": 10590 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 18.591440679290102, | |
| "learning_rate": 7.93176504967855e-05, | |
| "loss": 0.1693, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 1.0156253505497566, | |
| "learning_rate": 7.909848042080654e-05, | |
| "loss": 0.1564, | |
| "step": 10610 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 10.068403109613666, | |
| "learning_rate": 7.887931034482758e-05, | |
| "loss": 0.2361, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.6740912563898531, | |
| "learning_rate": 7.866014026884863e-05, | |
| "loss": 0.0799, | |
| "step": 10630 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 2.461614313039469, | |
| "learning_rate": 7.844097019286966e-05, | |
| "loss": 0.0537, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_0_f1": 0.7738325801592424, | |
| "eval_0_precision": 0.7824194952132288, | |
| "eval_0_recall": 0.7654320987654321, | |
| "eval_1_f1": 0.9226921662375874, | |
| "eval_1_precision": 0.9192437344276712, | |
| "eval_1_recall": 0.926166568222091, | |
| "eval_accuracy": 0.8847714066440083, | |
| "eval_loss": 0.39013671875, | |
| "eval_runtime": 544.8422, | |
| "eval_samples_per_second": 16.741, | |
| "eval_steps_per_second": 2.792, | |
| "step": 10641 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 14188, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "total_flos": 2.0001268071346995e+17, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |