{ "best_metric": 0.35205078125, "best_model_checkpoint": "./results/checkpoint-7094", "epoch": 3.0, "eval_steps": 500, "global_step": 10641, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.7654, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.862, "step": 20 }, { "epoch": 0.01, "grad_norm": 146.36835642526796, "learning_rate": 4.8e-06, "loss": 2.4598, "step": 30 }, { "epoch": 0.01, "grad_norm": 254.1399327894623, "learning_rate": 1.02e-05, "loss": 3.048, "step": 40 }, { "epoch": 0.01, "grad_norm": 55.794104847848445, "learning_rate": 1.6199999999999997e-05, "loss": 2.4278, "step": 50 }, { "epoch": 0.02, "grad_norm": 29.62062990171776, "learning_rate": 2.2199999999999998e-05, "loss": 0.9157, "step": 60 }, { "epoch": 0.02, "grad_norm": 36.90337952409684, "learning_rate": 2.8199999999999998e-05, "loss": 1.0977, "step": 70 }, { "epoch": 0.02, "grad_norm": 70.28513966911416, "learning_rate": 3.42e-05, "loss": 1.4851, "step": 80 }, { "epoch": 0.03, "grad_norm": 76.38058442522734, "learning_rate": 4.02e-05, "loss": 0.9004, "step": 90 }, { "epoch": 0.03, "grad_norm": 215.04691118737435, "learning_rate": 4.62e-05, "loss": 1.3784, "step": 100 }, { "epoch": 0.03, "grad_norm": 156.18636353053705, "learning_rate": 5.2199999999999995e-05, "loss": 1.9046, "step": 110 }, { "epoch": 0.03, "grad_norm": 83.58499534326243, "learning_rate": 5.82e-05, "loss": 1.8243, "step": 120 }, { "epoch": 0.04, "grad_norm": 58.89968090715743, "learning_rate": 6.419999999999999e-05, "loss": 3.5648, "step": 130 }, { "epoch": 0.04, "grad_norm": 279.1237096487612, "learning_rate": 7.02e-05, "loss": 1.3321, "step": 140 }, { "epoch": 0.04, "grad_norm": 14.282470404736001, "learning_rate": 7.62e-05, "loss": 0.7693, "step": 150 }, { "epoch": 0.05, "grad_norm": 21.416320260069092, "learning_rate": 8.22e-05, "loss": 0.7282, "step": 160 }, { "epoch": 0.05, "grad_norm": 7.027757159631835, "learning_rate": 8.819999999999999e-05, "loss": 0.8376, "step": 170 }, { "epoch": 0.05, "grad_norm": 28.808627534490917, "learning_rate": 9.419999999999999e-05, "loss": 0.8771, "step": 180 }, { "epoch": 0.05, "grad_norm": 55.14542562374617, "learning_rate": 0.0001002, "loss": 1.2092, "step": 190 }, { "epoch": 0.06, "grad_norm": 12.485175678803063, "learning_rate": 0.00010619999999999998, "loss": 0.7898, "step": 200 }, { "epoch": 0.06, "grad_norm": 30.98880210639734, "learning_rate": 0.00011219999999999999, "loss": 1.1421, "step": 210 }, { "epoch": 0.06, "grad_norm": 34.4471000818379, "learning_rate": 0.0001182, "loss": 0.755, "step": 220 }, { "epoch": 0.06, "grad_norm": 25.20646584085208, "learning_rate": 0.00012419999999999998, "loss": 0.8171, "step": 230 }, { "epoch": 0.07, "grad_norm": 57.11988442886105, "learning_rate": 0.0001302, "loss": 0.9672, "step": 240 }, { "epoch": 0.07, "grad_norm": 31.266794385874547, "learning_rate": 0.0001362, "loss": 0.863, "step": 250 }, { "epoch": 0.07, "grad_norm": 28.801620787803333, "learning_rate": 0.0001422, "loss": 0.9292, "step": 260 }, { "epoch": 0.08, "grad_norm": 18.379715667965055, "learning_rate": 0.0001482, "loss": 0.6885, "step": 270 }, { "epoch": 0.08, "grad_norm": 26.615184398415803, "learning_rate": 0.00015419999999999998, "loss": 0.8698, "step": 280 }, { "epoch": 0.08, "grad_norm": 27.84018584708001, "learning_rate": 0.0001602, "loss": 0.7403, "step": 290 }, { "epoch": 0.08, "grad_norm": 83.6950577392233, "learning_rate": 0.0001662, "loss": 1.7649, "step": 300 }, { "epoch": 0.09, "grad_norm": 62.62507175586115, "learning_rate": 0.00017219999999999998, "loss": 1.5992, "step": 310 }, { "epoch": 0.09, "grad_norm": 34.83362360351182, "learning_rate": 0.00017819999999999997, "loss": 3.7618, "step": 320 }, { "epoch": 0.09, "grad_norm": 0.1566836386456626, "learning_rate": 0.00018419999999999998, "loss": 1.1883, "step": 330 }, { "epoch": 0.1, "grad_norm": 44.53868085198857, "learning_rate": 0.0001902, "loss": 1.5033, "step": 340 }, { "epoch": 0.1, "grad_norm": 16.47571390018737, "learning_rate": 0.0001962, "loss": 0.7708, "step": 350 }, { "epoch": 0.1, "grad_norm": 90.50256233776733, "learning_rate": 0.0002022, "loss": 1.0146, "step": 360 }, { "epoch": 0.1, "grad_norm": 16.535117663656802, "learning_rate": 0.00020819999999999996, "loss": 0.673, "step": 370 }, { "epoch": 0.11, "grad_norm": 41.013763949361135, "learning_rate": 0.00021419999999999998, "loss": 0.7536, "step": 380 }, { "epoch": 0.11, "grad_norm": 50.83635956076198, "learning_rate": 0.00022019999999999999, "loss": 0.8832, "step": 390 }, { "epoch": 0.11, "grad_norm": 51.192653868723845, "learning_rate": 0.00022619999999999997, "loss": 0.756, "step": 400 }, { "epoch": 0.12, "grad_norm": 95.02104619699388, "learning_rate": 0.00023219999999999998, "loss": 1.0475, "step": 410 }, { "epoch": 0.12, "grad_norm": 14.755367161274899, "learning_rate": 0.0002382, "loss": 0.8114, "step": 420 }, { "epoch": 0.12, "grad_norm": 26.824869427969787, "learning_rate": 0.00024419999999999997, "loss": 0.7827, "step": 430 }, { "epoch": 0.12, "grad_norm": 10.199008591764807, "learning_rate": 0.00025019999999999996, "loss": 0.7201, "step": 440 }, { "epoch": 0.13, "grad_norm": 16.099046337033606, "learning_rate": 0.0002562, "loss": 0.7852, "step": 450 }, { "epoch": 0.13, "grad_norm": 72.84156691472333, "learning_rate": 0.0002622, "loss": 0.7819, "step": 460 }, { "epoch": 0.13, "grad_norm": 23.060057287801556, "learning_rate": 0.00026819999999999996, "loss": 1.1294, "step": 470 }, { "epoch": 0.14, "grad_norm": 18.372892721573056, "learning_rate": 0.0002742, "loss": 0.8509, "step": 480 }, { "epoch": 0.14, "grad_norm": 44.80535522734965, "learning_rate": 0.0002802, "loss": 1.3075, "step": 490 }, { "epoch": 0.14, "grad_norm": 53.039346895060866, "learning_rate": 0.00028619999999999996, "loss": 0.8981, "step": 500 }, { "epoch": 0.14, "grad_norm": 8.933950393723551, "learning_rate": 0.00029219999999999995, "loss": 0.8596, "step": 510 }, { "epoch": 0.15, "grad_norm": 54.86618147368649, "learning_rate": 0.0002982, "loss": 0.8419, "step": 520 }, { "epoch": 0.15, "grad_norm": 52.0156886597986, "learning_rate": 0.00029984658094681473, "loss": 1.0992, "step": 530 }, { "epoch": 0.15, "grad_norm": 48.61185102339785, "learning_rate": 0.00029962741087083574, "loss": 0.8373, "step": 540 }, { "epoch": 0.16, "grad_norm": 85.88381074144633, "learning_rate": 0.0002994082407948568, "loss": 0.9952, "step": 550 }, { "epoch": 0.16, "grad_norm": 46.828573117866156, "learning_rate": 0.0002991890707188778, "loss": 1.2285, "step": 560 }, { "epoch": 0.16, "grad_norm": 4.319911680589826, "learning_rate": 0.00029896990064289886, "loss": 1.0784, "step": 570 }, { "epoch": 0.16, "grad_norm": 49.18948488974743, "learning_rate": 0.0002987507305669199, "loss": 0.6879, "step": 580 }, { "epoch": 0.17, "grad_norm": 14.790075292508273, "learning_rate": 0.0002985315604909409, "loss": 0.7182, "step": 590 }, { "epoch": 0.17, "grad_norm": 81.77111956641443, "learning_rate": 0.000298312390414962, "loss": 0.6341, "step": 600 }, { "epoch": 0.17, "grad_norm": 5.015074610499661, "learning_rate": 0.00029809322033898304, "loss": 0.8418, "step": 610 }, { "epoch": 0.17, "grad_norm": 43.326414318341016, "learning_rate": 0.00029787405026300405, "loss": 0.7158, "step": 620 }, { "epoch": 0.18, "grad_norm": 21.749351652802584, "learning_rate": 0.0002976548801870251, "loss": 0.5541, "step": 630 }, { "epoch": 0.18, "grad_norm": 104.8554633037212, "learning_rate": 0.00029743571011104616, "loss": 0.7715, "step": 640 }, { "epoch": 0.18, "grad_norm": 53.28037677060509, "learning_rate": 0.00029721654003506717, "loss": 0.7089, "step": 650 }, { "epoch": 0.19, "grad_norm": 60.68620976609669, "learning_rate": 0.00029699736995908823, "loss": 0.7251, "step": 660 }, { "epoch": 0.19, "grad_norm": 83.67598838205309, "learning_rate": 0.0002967781998831093, "loss": 0.9626, "step": 670 }, { "epoch": 0.19, "grad_norm": 22.217129800838155, "learning_rate": 0.0002965590298071303, "loss": 0.7762, "step": 680 }, { "epoch": 0.19, "grad_norm": 86.4481164773122, "learning_rate": 0.00029633985973115135, "loss": 0.6535, "step": 690 }, { "epoch": 0.2, "grad_norm": 40.75531136163561, "learning_rate": 0.00029612068965517236, "loss": 0.6025, "step": 700 }, { "epoch": 0.2, "grad_norm": 56.971296704966576, "learning_rate": 0.0002959015195791934, "loss": 0.7087, "step": 710 }, { "epoch": 0.2, "grad_norm": 30.278786860468646, "learning_rate": 0.0002956823495032145, "loss": 0.8943, "step": 720 }, { "epoch": 0.21, "grad_norm": 14.890198931647719, "learning_rate": 0.0002954631794272355, "loss": 0.8818, "step": 730 }, { "epoch": 0.21, "grad_norm": 39.88836482589719, "learning_rate": 0.00029524400935125654, "loss": 0.9544, "step": 740 }, { "epoch": 0.21, "grad_norm": 102.4851007431489, "learning_rate": 0.0002950248392752776, "loss": 0.7461, "step": 750 }, { "epoch": 0.21, "grad_norm": 43.156700980283695, "learning_rate": 0.0002948056691992986, "loss": 0.7074, "step": 760 }, { "epoch": 0.22, "grad_norm": 49.376028998432666, "learning_rate": 0.00029458649912331966, "loss": 0.6382, "step": 770 }, { "epoch": 0.22, "grad_norm": 31.433406542237964, "learning_rate": 0.0002943673290473407, "loss": 0.732, "step": 780 }, { "epoch": 0.22, "grad_norm": 44.3715979494319, "learning_rate": 0.0002941481589713617, "loss": 0.6883, "step": 790 }, { "epoch": 0.23, "grad_norm": 17.06187425481664, "learning_rate": 0.0002939289888953828, "loss": 0.648, "step": 800 }, { "epoch": 0.23, "grad_norm": 8.777853690776, "learning_rate": 0.00029370981881940384, "loss": 0.4979, "step": 810 }, { "epoch": 0.23, "grad_norm": 4.880875214974237, "learning_rate": 0.00029349064874342485, "loss": 0.743, "step": 820 }, { "epoch": 0.23, "grad_norm": 9.720423607604369, "learning_rate": 0.0002932714786674459, "loss": 0.645, "step": 830 }, { "epoch": 0.24, "grad_norm": 9.066124973757804, "learning_rate": 0.00029305230859146697, "loss": 0.4977, "step": 840 }, { "epoch": 0.24, "grad_norm": 23.89066736642937, "learning_rate": 0.00029283313851548797, "loss": 0.4926, "step": 850 }, { "epoch": 0.24, "grad_norm": 41.695686510198016, "learning_rate": 0.00029261396843950903, "loss": 0.7886, "step": 860 }, { "epoch": 0.25, "grad_norm": 66.53797988620009, "learning_rate": 0.00029239479836353004, "loss": 0.5658, "step": 870 }, { "epoch": 0.25, "grad_norm": 55.97718756593597, "learning_rate": 0.0002921756282875511, "loss": 0.5484, "step": 880 }, { "epoch": 0.25, "grad_norm": 108.03497298548182, "learning_rate": 0.00029195645821157215, "loss": 0.7978, "step": 890 }, { "epoch": 0.25, "grad_norm": 40.14088131595207, "learning_rate": 0.00029173728813559316, "loss": 0.6178, "step": 900 }, { "epoch": 0.26, "grad_norm": 48.46822333526267, "learning_rate": 0.00029151811805961427, "loss": 0.565, "step": 910 }, { "epoch": 0.26, "grad_norm": 16.17488665111733, "learning_rate": 0.0002912989479836353, "loss": 0.6886, "step": 920 }, { "epoch": 0.26, "grad_norm": 26.592963237868908, "learning_rate": 0.0002910797779076563, "loss": 0.6966, "step": 930 }, { "epoch": 0.27, "grad_norm": 27.00009471161443, "learning_rate": 0.0002908606078316774, "loss": 0.6178, "step": 940 }, { "epoch": 0.27, "grad_norm": 28.195576793914913, "learning_rate": 0.0002906414377556984, "loss": 0.4663, "step": 950 }, { "epoch": 0.27, "grad_norm": 23.538388615789113, "learning_rate": 0.00029042226767971946, "loss": 0.7135, "step": 960 }, { "epoch": 0.27, "grad_norm": 20.353666443170358, "learning_rate": 0.00029020309760374046, "loss": 0.708, "step": 970 }, { "epoch": 0.28, "grad_norm": 30.98820805802448, "learning_rate": 0.0002899839275277615, "loss": 0.5138, "step": 980 }, { "epoch": 0.28, "grad_norm": 49.96332795879376, "learning_rate": 0.0002897647574517826, "loss": 0.5238, "step": 990 }, { "epoch": 0.28, "grad_norm": 36.472514659373466, "learning_rate": 0.0002895455873758036, "loss": 0.7198, "step": 1000 }, { "epoch": 0.28, "grad_norm": 5.727047796229911, "learning_rate": 0.00028932641729982465, "loss": 0.5427, "step": 1010 }, { "epoch": 0.29, "grad_norm": 6.39978136661481, "learning_rate": 0.0002891072472238457, "loss": 0.6386, "step": 1020 }, { "epoch": 0.29, "grad_norm": 26.496446288276136, "learning_rate": 0.0002888880771478667, "loss": 0.4019, "step": 1030 }, { "epoch": 0.29, "grad_norm": 96.50801416396253, "learning_rate": 0.00028866890707188777, "loss": 0.8864, "step": 1040 }, { "epoch": 0.3, "grad_norm": 26.933228918624025, "learning_rate": 0.00028844973699590883, "loss": 0.8963, "step": 1050 }, { "epoch": 0.3, "grad_norm": 20.70682008967005, "learning_rate": 0.00028825248392752773, "loss": 0.6956, "step": 1060 }, { "epoch": 0.3, "grad_norm": 46.724804444462556, "learning_rate": 0.00028803331385154874, "loss": 0.8669, "step": 1070 }, { "epoch": 0.3, "grad_norm": 5.5954267342032225, "learning_rate": 0.00028781414377556985, "loss": 0.9219, "step": 1080 }, { "epoch": 0.31, "grad_norm": 60.59949911543188, "learning_rate": 0.00028759497369959086, "loss": 0.6557, "step": 1090 }, { "epoch": 0.31, "grad_norm": 37.05416363332341, "learning_rate": 0.00028737580362361186, "loss": 0.7692, "step": 1100 }, { "epoch": 0.31, "grad_norm": 14.82417785628839, "learning_rate": 0.000287156633547633, "loss": 0.5669, "step": 1110 }, { "epoch": 0.32, "grad_norm": 34.71923857782051, "learning_rate": 0.000286937463471654, "loss": 0.7427, "step": 1120 }, { "epoch": 0.32, "grad_norm": 4.620541208209977, "learning_rate": 0.00028671829339567504, "loss": 0.4811, "step": 1130 }, { "epoch": 0.32, "grad_norm": 65.72414363498706, "learning_rate": 0.00028649912331969604, "loss": 0.6696, "step": 1140 }, { "epoch": 0.32, "grad_norm": 31.255536486132993, "learning_rate": 0.0002862799532437171, "loss": 0.6253, "step": 1150 }, { "epoch": 0.33, "grad_norm": 14.845236948350237, "learning_rate": 0.00028606078316773816, "loss": 0.5473, "step": 1160 }, { "epoch": 0.33, "grad_norm": 62.44169640686947, "learning_rate": 0.00028584161309175917, "loss": 0.6686, "step": 1170 }, { "epoch": 0.33, "grad_norm": 105.6997308934719, "learning_rate": 0.0002856224430157802, "loss": 0.7738, "step": 1180 }, { "epoch": 0.34, "grad_norm": 75.19754745372326, "learning_rate": 0.0002854032729398013, "loss": 0.5995, "step": 1190 }, { "epoch": 0.34, "grad_norm": 34.71806881877583, "learning_rate": 0.0002851841028638223, "loss": 0.5701, "step": 1200 }, { "epoch": 0.34, "grad_norm": 48.159897749652075, "learning_rate": 0.00028496493278784335, "loss": 0.4724, "step": 1210 }, { "epoch": 0.34, "grad_norm": 6.544316071386722, "learning_rate": 0.0002847457627118644, "loss": 0.6537, "step": 1220 }, { "epoch": 0.35, "grad_norm": 21.365400503740027, "learning_rate": 0.0002845265926358854, "loss": 0.4979, "step": 1230 }, { "epoch": 0.35, "grad_norm": 5.248896963680607, "learning_rate": 0.00028430742255990647, "loss": 0.4174, "step": 1240 }, { "epoch": 0.35, "grad_norm": 7.535922720940304, "learning_rate": 0.00028408825248392753, "loss": 0.7422, "step": 1250 }, { "epoch": 0.36, "grad_norm": 22.985477551377006, "learning_rate": 0.00028386908240794854, "loss": 0.6509, "step": 1260 }, { "epoch": 0.36, "grad_norm": 44.34169162733889, "learning_rate": 0.0002836499123319696, "loss": 0.6908, "step": 1270 }, { "epoch": 0.36, "grad_norm": 41.93808698385512, "learning_rate": 0.00028343074225599065, "loss": 0.5134, "step": 1280 }, { "epoch": 0.36, "grad_norm": 9.954650885829432, "learning_rate": 0.00028321157218001166, "loss": 0.4772, "step": 1290 }, { "epoch": 0.37, "grad_norm": 14.453953812006848, "learning_rate": 0.0002829924021040327, "loss": 0.6207, "step": 1300 }, { "epoch": 0.37, "grad_norm": 5.273785434123745, "learning_rate": 0.0002827732320280537, "loss": 0.5753, "step": 1310 }, { "epoch": 0.37, "grad_norm": 4.858445515524344, "learning_rate": 0.0002825540619520748, "loss": 0.5052, "step": 1320 }, { "epoch": 0.37, "grad_norm": 9.635141224231973, "learning_rate": 0.00028233489187609584, "loss": 0.4951, "step": 1330 }, { "epoch": 0.38, "grad_norm": 43.41263842722645, "learning_rate": 0.00028211572180011685, "loss": 0.5636, "step": 1340 }, { "epoch": 0.38, "grad_norm": 15.495027516009957, "learning_rate": 0.0002818965517241379, "loss": 0.466, "step": 1350 }, { "epoch": 0.38, "grad_norm": 64.28182805477857, "learning_rate": 0.00028167738164815896, "loss": 0.5672, "step": 1360 }, { "epoch": 0.39, "grad_norm": 64.9516330457715, "learning_rate": 0.00028145821157217997, "loss": 0.569, "step": 1370 }, { "epoch": 0.39, "grad_norm": 18.376532518493583, "learning_rate": 0.00028123904149620103, "loss": 0.4619, "step": 1380 }, { "epoch": 0.39, "grad_norm": 8.562914609442593, "learning_rate": 0.0002810198714202221, "loss": 0.545, "step": 1390 }, { "epoch": 0.39, "grad_norm": 60.21363972002377, "learning_rate": 0.0002808007013442431, "loss": 0.3975, "step": 1400 }, { "epoch": 0.4, "grad_norm": 28.047906754092985, "learning_rate": 0.00028058153126826415, "loss": 0.6251, "step": 1410 }, { "epoch": 0.4, "grad_norm": 35.10586789783855, "learning_rate": 0.0002803623611922852, "loss": 0.6277, "step": 1420 }, { "epoch": 0.4, "grad_norm": 12.797561679192818, "learning_rate": 0.0002801431911163062, "loss": 0.5604, "step": 1430 }, { "epoch": 0.41, "grad_norm": 45.77020206433535, "learning_rate": 0.0002799240210403273, "loss": 0.5095, "step": 1440 }, { "epoch": 0.41, "grad_norm": 8.950325235661051, "learning_rate": 0.00027970485096434833, "loss": 0.6742, "step": 1450 }, { "epoch": 0.41, "grad_norm": 89.39440158187, "learning_rate": 0.00027948568088836934, "loss": 0.6291, "step": 1460 }, { "epoch": 0.41, "grad_norm": 42.25863783297151, "learning_rate": 0.0002792665108123904, "loss": 0.5981, "step": 1470 }, { "epoch": 0.42, "grad_norm": 70.1528669289255, "learning_rate": 0.0002790473407364114, "loss": 0.6403, "step": 1480 }, { "epoch": 0.42, "grad_norm": 46.97367429458831, "learning_rate": 0.00027882817066043246, "loss": 0.4118, "step": 1490 }, { "epoch": 0.42, "grad_norm": 40.541175129277065, "learning_rate": 0.0002786090005844535, "loss": 0.5049, "step": 1500 }, { "epoch": 0.43, "grad_norm": 15.523999358550466, "learning_rate": 0.0002783898305084745, "loss": 0.4035, "step": 1510 }, { "epoch": 0.43, "grad_norm": 10.185134626898725, "learning_rate": 0.0002781706604324956, "loss": 0.3807, "step": 1520 }, { "epoch": 0.43, "grad_norm": 41.67374680359159, "learning_rate": 0.00027795149035651664, "loss": 0.6452, "step": 1530 }, { "epoch": 0.43, "grad_norm": 17.98431769773285, "learning_rate": 0.00027773232028053765, "loss": 0.4049, "step": 1540 }, { "epoch": 0.44, "grad_norm": 28.42010603664099, "learning_rate": 0.0002775131502045587, "loss": 0.5067, "step": 1550 }, { "epoch": 0.44, "grad_norm": 8.903302713389115, "learning_rate": 0.00027729398012857977, "loss": 0.6382, "step": 1560 }, { "epoch": 0.44, "grad_norm": 65.95394817146764, "learning_rate": 0.00027707481005260077, "loss": 0.651, "step": 1570 }, { "epoch": 0.45, "grad_norm": 13.423645923612371, "learning_rate": 0.00027685563997662183, "loss": 0.4343, "step": 1580 }, { "epoch": 0.45, "grad_norm": 19.870165040548233, "learning_rate": 0.0002766364699006429, "loss": 0.5016, "step": 1590 }, { "epoch": 0.45, "grad_norm": 7.295472062733364, "learning_rate": 0.0002764172998246639, "loss": 0.4203, "step": 1600 }, { "epoch": 0.45, "grad_norm": 13.426653279288725, "learning_rate": 0.00027619812974868495, "loss": 0.5413, "step": 1610 }, { "epoch": 0.46, "grad_norm": 35.81973630385564, "learning_rate": 0.00027597895967270596, "loss": 0.536, "step": 1620 }, { "epoch": 0.46, "grad_norm": 15.789511808342628, "learning_rate": 0.000275759789596727, "loss": 0.3641, "step": 1630 }, { "epoch": 0.46, "grad_norm": 3.438056573729943, "learning_rate": 0.0002755406195207481, "loss": 0.3846, "step": 1640 }, { "epoch": 0.47, "grad_norm": 43.693835213902354, "learning_rate": 0.0002753214494447691, "loss": 0.6304, "step": 1650 }, { "epoch": 0.47, "grad_norm": 41.69843964932105, "learning_rate": 0.00027510227936879014, "loss": 0.5184, "step": 1660 }, { "epoch": 0.47, "grad_norm": 26.698062076363627, "learning_rate": 0.0002748831092928112, "loss": 0.5844, "step": 1670 }, { "epoch": 0.47, "grad_norm": 1.7012189494264895, "learning_rate": 0.0002746639392168322, "loss": 0.4043, "step": 1680 }, { "epoch": 0.48, "grad_norm": 52.6718491008472, "learning_rate": 0.0002744447691408533, "loss": 0.5674, "step": 1690 }, { "epoch": 0.48, "grad_norm": 14.061719898202849, "learning_rate": 0.0002742255990648743, "loss": 0.4724, "step": 1700 }, { "epoch": 0.48, "grad_norm": 3.847972088161487, "learning_rate": 0.00027400642898889533, "loss": 0.3894, "step": 1710 }, { "epoch": 0.48, "grad_norm": 10.733410570701619, "learning_rate": 0.0002737872589129164, "loss": 0.3706, "step": 1720 }, { "epoch": 0.49, "grad_norm": 5.582634401736958, "learning_rate": 0.00027356808883693745, "loss": 0.6391, "step": 1730 }, { "epoch": 0.49, "grad_norm": 26.185314688453758, "learning_rate": 0.0002733489187609585, "loss": 0.6093, "step": 1740 }, { "epoch": 0.49, "grad_norm": 5.754244803162683, "learning_rate": 0.0002731297486849795, "loss": 0.4868, "step": 1750 }, { "epoch": 0.5, "grad_norm": 39.5290991827534, "learning_rate": 0.00027291057860900057, "loss": 0.3798, "step": 1760 }, { "epoch": 0.5, "grad_norm": 50.24696278971589, "learning_rate": 0.00027269140853302163, "loss": 0.5672, "step": 1770 }, { "epoch": 0.5, "grad_norm": 40.15272638877307, "learning_rate": 0.00027247223845704263, "loss": 0.4471, "step": 1780 }, { "epoch": 0.5, "grad_norm": 28.70418219314436, "learning_rate": 0.00027225306838106364, "loss": 0.6382, "step": 1790 }, { "epoch": 0.51, "grad_norm": 28.561604385347348, "learning_rate": 0.00027203389830508475, "loss": 0.4176, "step": 1800 }, { "epoch": 0.51, "grad_norm": 17.815402549033355, "learning_rate": 0.00027181472822910576, "loss": 0.5882, "step": 1810 }, { "epoch": 0.51, "grad_norm": 15.342408938615899, "learning_rate": 0.0002715955581531268, "loss": 0.5019, "step": 1820 }, { "epoch": 0.52, "grad_norm": 23.598442882592035, "learning_rate": 0.0002713763880771479, "loss": 0.556, "step": 1830 }, { "epoch": 0.52, "grad_norm": 67.23786205262353, "learning_rate": 0.0002711572180011689, "loss": 0.5685, "step": 1840 }, { "epoch": 0.52, "grad_norm": 4.243022538273344, "learning_rate": 0.00027093804792518994, "loss": 0.4733, "step": 1850 }, { "epoch": 0.52, "grad_norm": 13.180044758827014, "learning_rate": 0.000270718877849211, "loss": 0.4605, "step": 1860 }, { "epoch": 0.53, "grad_norm": 22.165941849190077, "learning_rate": 0.000270499707773232, "loss": 0.5884, "step": 1870 }, { "epoch": 0.53, "grad_norm": 63.37369359897044, "learning_rate": 0.00027028053769725306, "loss": 0.5035, "step": 1880 }, { "epoch": 0.53, "grad_norm": 7.126542990903908, "learning_rate": 0.00027006136762127407, "loss": 0.5265, "step": 1890 }, { "epoch": 0.54, "grad_norm": 18.999488189785442, "learning_rate": 0.0002698421975452951, "loss": 0.4029, "step": 1900 }, { "epoch": 0.54, "grad_norm": 74.7735357892713, "learning_rate": 0.0002696230274693162, "loss": 0.5864, "step": 1910 }, { "epoch": 0.54, "grad_norm": 4.537363990276932, "learning_rate": 0.0002694038573933372, "loss": 0.4943, "step": 1920 }, { "epoch": 0.54, "grad_norm": 5.4900230565856125, "learning_rate": 0.00026918468731735825, "loss": 0.4046, "step": 1930 }, { "epoch": 0.55, "grad_norm": 13.0055044025215, "learning_rate": 0.0002689655172413793, "loss": 0.4548, "step": 1940 }, { "epoch": 0.55, "grad_norm": 38.12513704333924, "learning_rate": 0.0002687463471654003, "loss": 0.4879, "step": 1950 }, { "epoch": 0.55, "grad_norm": 39.26658731655891, "learning_rate": 0.0002685271770894214, "loss": 0.4619, "step": 1960 }, { "epoch": 0.56, "grad_norm": 25.60070282398106, "learning_rate": 0.00026830800701344243, "loss": 0.3977, "step": 1970 }, { "epoch": 0.56, "grad_norm": 26.2056587948436, "learning_rate": 0.00026808883693746344, "loss": 0.6761, "step": 1980 }, { "epoch": 0.56, "grad_norm": 6.11452999302761, "learning_rate": 0.0002678696668614845, "loss": 0.4193, "step": 1990 }, { "epoch": 0.56, "grad_norm": 33.901640420477705, "learning_rate": 0.00026765049678550556, "loss": 0.3582, "step": 2000 }, { "epoch": 0.57, "grad_norm": 39.05264118867418, "learning_rate": 0.00026743132670952656, "loss": 0.6291, "step": 2010 }, { "epoch": 0.57, "grad_norm": 37.194416649503715, "learning_rate": 0.0002672121566335476, "loss": 0.5405, "step": 2020 }, { "epoch": 0.57, "grad_norm": 10.433072414980515, "learning_rate": 0.0002669929865575686, "loss": 0.3794, "step": 2030 }, { "epoch": 0.58, "grad_norm": 13.092304284069662, "learning_rate": 0.0002667738164815897, "loss": 0.5334, "step": 2040 }, { "epoch": 0.58, "grad_norm": 63.86550661120767, "learning_rate": 0.00026655464640561074, "loss": 0.5122, "step": 2050 }, { "epoch": 0.58, "grad_norm": 2.517042940759129, "learning_rate": 0.00026633547632963175, "loss": 0.5369, "step": 2060 }, { "epoch": 0.58, "grad_norm": 5.619466465382078, "learning_rate": 0.0002661163062536528, "loss": 0.5914, "step": 2070 }, { "epoch": 0.59, "grad_norm": 8.934697851639797, "learning_rate": 0.00026589713617767387, "loss": 0.4139, "step": 2080 }, { "epoch": 0.59, "grad_norm": 6.230035836028844, "learning_rate": 0.00026567796610169487, "loss": 0.5029, "step": 2090 }, { "epoch": 0.59, "grad_norm": 15.356060765823438, "learning_rate": 0.00026545879602571593, "loss": 0.3824, "step": 2100 }, { "epoch": 0.59, "grad_norm": 7.576984306147784, "learning_rate": 0.000265239625949737, "loss": 0.6175, "step": 2110 }, { "epoch": 0.6, "grad_norm": 28.748362595639772, "learning_rate": 0.000265020455873758, "loss": 0.5614, "step": 2120 }, { "epoch": 0.6, "grad_norm": 16.32965992610039, "learning_rate": 0.00026480128579777905, "loss": 0.5116, "step": 2130 }, { "epoch": 0.6, "grad_norm": 3.2753516984217614, "learning_rate": 0.0002645821157218001, "loss": 0.4541, "step": 2140 }, { "epoch": 0.61, "grad_norm": 42.92699392822083, "learning_rate": 0.0002643629456458211, "loss": 0.4284, "step": 2150 }, { "epoch": 0.61, "grad_norm": 17.612783953748007, "learning_rate": 0.0002641437755698422, "loss": 0.465, "step": 2160 }, { "epoch": 0.61, "grad_norm": 8.549512475388743, "learning_rate": 0.00026392460549386324, "loss": 0.409, "step": 2170 }, { "epoch": 0.61, "grad_norm": 12.657379379480137, "learning_rate": 0.00026370543541788424, "loss": 0.5745, "step": 2180 }, { "epoch": 0.62, "grad_norm": 19.533607385657547, "learning_rate": 0.0002634862653419053, "loss": 0.493, "step": 2190 }, { "epoch": 0.62, "grad_norm": 30.019312017902323, "learning_rate": 0.0002632670952659263, "loss": 0.486, "step": 2200 }, { "epoch": 0.62, "grad_norm": 15.192669371979543, "learning_rate": 0.00026304792518994736, "loss": 0.4214, "step": 2210 }, { "epoch": 0.63, "grad_norm": 32.465439529928304, "learning_rate": 0.0002628287551139684, "loss": 0.5712, "step": 2220 }, { "epoch": 0.63, "grad_norm": 36.91886247399315, "learning_rate": 0.00026260958503798943, "loss": 0.4334, "step": 2230 }, { "epoch": 0.63, "grad_norm": 23.910769075350032, "learning_rate": 0.0002623904149620105, "loss": 0.5462, "step": 2240 }, { "epoch": 0.63, "grad_norm": 67.93569743336582, "learning_rate": 0.00026217124488603155, "loss": 0.4493, "step": 2250 }, { "epoch": 0.64, "grad_norm": 2.532118414700022, "learning_rate": 0.00026195207481005255, "loss": 0.4534, "step": 2260 }, { "epoch": 0.64, "grad_norm": 29.266963293461014, "learning_rate": 0.0002617329047340736, "loss": 0.4529, "step": 2270 }, { "epoch": 0.64, "grad_norm": 48.97288077592708, "learning_rate": 0.00026153565166569257, "loss": 0.5841, "step": 2280 }, { "epoch": 0.65, "grad_norm": 20.00706276686128, "learning_rate": 0.00026131648158971357, "loss": 0.5449, "step": 2290 }, { "epoch": 0.65, "grad_norm": 52.88683636279855, "learning_rate": 0.00026109731151373463, "loss": 0.5451, "step": 2300 }, { "epoch": 0.65, "grad_norm": 19.058421611399435, "learning_rate": 0.0002608781414377557, "loss": 0.4933, "step": 2310 }, { "epoch": 0.65, "grad_norm": 12.662298018994834, "learning_rate": 0.0002606589713617767, "loss": 0.4933, "step": 2320 }, { "epoch": 0.66, "grad_norm": 45.53858660601269, "learning_rate": 0.00026043980128579776, "loss": 0.53, "step": 2330 }, { "epoch": 0.66, "grad_norm": 26.19222918219647, "learning_rate": 0.0002602206312098188, "loss": 0.4293, "step": 2340 }, { "epoch": 0.66, "grad_norm": 12.902630462226897, "learning_rate": 0.0002600014611338398, "loss": 0.4571, "step": 2350 }, { "epoch": 0.67, "grad_norm": 30.841311400161104, "learning_rate": 0.0002597822910578609, "loss": 0.3776, "step": 2360 }, { "epoch": 0.67, "grad_norm": 2.354115675571604, "learning_rate": 0.00025956312098188194, "loss": 0.3751, "step": 2370 }, { "epoch": 0.67, "grad_norm": 11.162047701554282, "learning_rate": 0.00025934395090590294, "loss": 0.2605, "step": 2380 }, { "epoch": 0.67, "grad_norm": 42.422050961854865, "learning_rate": 0.000259124780829924, "loss": 0.5152, "step": 2390 }, { "epoch": 0.68, "grad_norm": 38.9786626728249, "learning_rate": 0.000258905610753945, "loss": 0.3935, "step": 2400 }, { "epoch": 0.68, "grad_norm": 38.1773503077784, "learning_rate": 0.00025868644067796607, "loss": 0.5644, "step": 2410 }, { "epoch": 0.68, "grad_norm": 14.79430105184665, "learning_rate": 0.0002584672706019871, "loss": 0.5293, "step": 2420 }, { "epoch": 0.69, "grad_norm": 34.794933030659074, "learning_rate": 0.00025824810052600813, "loss": 0.5646, "step": 2430 }, { "epoch": 0.69, "grad_norm": 11.775866235902662, "learning_rate": 0.0002580289304500292, "loss": 0.4189, "step": 2440 }, { "epoch": 0.69, "grad_norm": 74.35669253529362, "learning_rate": 0.00025780976037405025, "loss": 0.6112, "step": 2450 }, { "epoch": 0.69, "grad_norm": 38.88263346213307, "learning_rate": 0.00025759059029807125, "loss": 0.4503, "step": 2460 }, { "epoch": 0.7, "grad_norm": 37.361077582565066, "learning_rate": 0.0002573714202220923, "loss": 0.4393, "step": 2470 }, { "epoch": 0.7, "grad_norm": 1.7092125809415342, "learning_rate": 0.00025715225014611337, "loss": 0.4243, "step": 2480 }, { "epoch": 0.7, "grad_norm": 57.33578620565298, "learning_rate": 0.0002569330800701344, "loss": 0.4961, "step": 2490 }, { "epoch": 0.7, "grad_norm": 24.953469848220585, "learning_rate": 0.00025671390999415543, "loss": 0.5143, "step": 2500 }, { "epoch": 0.71, "grad_norm": 69.87186379679335, "learning_rate": 0.0002564947399181765, "loss": 0.5994, "step": 2510 }, { "epoch": 0.71, "grad_norm": 9.423503958754821, "learning_rate": 0.0002562755698421975, "loss": 0.3865, "step": 2520 }, { "epoch": 0.71, "grad_norm": 35.63761411276129, "learning_rate": 0.00025605639976621856, "loss": 0.4244, "step": 2530 }, { "epoch": 0.72, "grad_norm": 7.718201160525164, "learning_rate": 0.0002558372296902396, "loss": 0.303, "step": 2540 }, { "epoch": 0.72, "grad_norm": 4.591243708143018, "learning_rate": 0.0002556180596142607, "loss": 0.5215, "step": 2550 }, { "epoch": 0.72, "grad_norm": 22.307015902915715, "learning_rate": 0.0002553988895382817, "loss": 0.4571, "step": 2560 }, { "epoch": 0.72, "grad_norm": 22.779301668637764, "learning_rate": 0.0002551797194623027, "loss": 0.4807, "step": 2570 }, { "epoch": 0.73, "grad_norm": 12.190202028042945, "learning_rate": 0.0002549605493863238, "loss": 0.3605, "step": 2580 }, { "epoch": 0.73, "grad_norm": 7.251641577848608, "learning_rate": 0.0002547413793103448, "loss": 0.6469, "step": 2590 }, { "epoch": 0.73, "grad_norm": 1.5118716411197952, "learning_rate": 0.00025452220923436586, "loss": 0.3641, "step": 2600 }, { "epoch": 0.74, "grad_norm": 3.4089258042614086, "learning_rate": 0.0002543030391583869, "loss": 0.6651, "step": 2610 }, { "epoch": 0.74, "grad_norm": 2.9557173441623856, "learning_rate": 0.00025408386908240793, "loss": 0.6309, "step": 2620 }, { "epoch": 0.74, "grad_norm": 29.838754199710966, "learning_rate": 0.000253864699006429, "loss": 0.4806, "step": 2630 }, { "epoch": 0.74, "grad_norm": 27.703367909949506, "learning_rate": 0.00025364552893045, "loss": 0.5419, "step": 2640 }, { "epoch": 0.75, "grad_norm": 30.019133853453674, "learning_rate": 0.00025342635885447105, "loss": 0.5585, "step": 2650 }, { "epoch": 0.75, "grad_norm": 34.77564734000214, "learning_rate": 0.0002532071887784921, "loss": 0.4483, "step": 2660 }, { "epoch": 0.75, "grad_norm": 47.783274628573835, "learning_rate": 0.0002529880187025131, "loss": 0.5153, "step": 2670 }, { "epoch": 0.76, "grad_norm": 26.23337473603945, "learning_rate": 0.0002527688486265342, "loss": 0.4699, "step": 2680 }, { "epoch": 0.76, "grad_norm": 29.216617715519472, "learning_rate": 0.00025254967855055523, "loss": 0.5015, "step": 2690 }, { "epoch": 0.76, "grad_norm": 62.76087495760497, "learning_rate": 0.00025233050847457624, "loss": 0.3711, "step": 2700 }, { "epoch": 0.76, "grad_norm": 45.88533063155937, "learning_rate": 0.0002521113383985973, "loss": 0.6509, "step": 2710 }, { "epoch": 0.77, "grad_norm": 20.44584239605378, "learning_rate": 0.00025189216832261836, "loss": 0.3635, "step": 2720 }, { "epoch": 0.77, "grad_norm": 16.861314315606865, "learning_rate": 0.00025167299824663936, "loss": 0.4719, "step": 2730 }, { "epoch": 0.77, "grad_norm": 28.52875293425469, "learning_rate": 0.0002514538281706604, "loss": 0.689, "step": 2740 }, { "epoch": 0.78, "grad_norm": 35.22222000713686, "learning_rate": 0.0002512346580946815, "loss": 0.6445, "step": 2750 }, { "epoch": 0.78, "grad_norm": 35.45687930499127, "learning_rate": 0.0002510154880187025, "loss": 0.4704, "step": 2760 }, { "epoch": 0.78, "grad_norm": 8.810315004780433, "learning_rate": 0.00025079631794272354, "loss": 0.5576, "step": 2770 }, { "epoch": 0.78, "grad_norm": 35.77216936747473, "learning_rate": 0.0002505771478667446, "loss": 0.3533, "step": 2780 }, { "epoch": 0.79, "grad_norm": 6.045353414942304, "learning_rate": 0.0002503579777907656, "loss": 0.4113, "step": 2790 }, { "epoch": 0.79, "grad_norm": 45.51398057288223, "learning_rate": 0.00025013880771478667, "loss": 0.4071, "step": 2800 }, { "epoch": 0.79, "grad_norm": 36.84264931235842, "learning_rate": 0.00024991963763880767, "loss": 0.6049, "step": 2810 }, { "epoch": 0.8, "grad_norm": 7.376752961081483, "learning_rate": 0.00024970046756282873, "loss": 0.4454, "step": 2820 }, { "epoch": 0.8, "grad_norm": 45.296630887337315, "learning_rate": 0.0002494812974868498, "loss": 0.4267, "step": 2830 }, { "epoch": 0.8, "grad_norm": 43.62273117063915, "learning_rate": 0.0002492621274108708, "loss": 0.4359, "step": 2840 }, { "epoch": 0.8, "grad_norm": 51.720566303652305, "learning_rate": 0.00024904295733489185, "loss": 0.4941, "step": 2850 }, { "epoch": 0.81, "grad_norm": 24.249504968694367, "learning_rate": 0.0002488237872589129, "loss": 0.345, "step": 2860 }, { "epoch": 0.81, "grad_norm": 14.634006430151054, "learning_rate": 0.0002486046171829339, "loss": 0.4586, "step": 2870 }, { "epoch": 0.81, "grad_norm": 11.217467398348905, "learning_rate": 0.000248385447106955, "loss": 0.4376, "step": 2880 }, { "epoch": 0.81, "grad_norm": 45.30561601980238, "learning_rate": 0.00024816627703097604, "loss": 0.3944, "step": 2890 }, { "epoch": 0.82, "grad_norm": 15.499098833410399, "learning_rate": 0.00024794710695499704, "loss": 0.4469, "step": 2900 }, { "epoch": 0.82, "grad_norm": 15.594634721523647, "learning_rate": 0.0002477279368790181, "loss": 0.4524, "step": 2910 }, { "epoch": 0.82, "grad_norm": 4.415769490237178, "learning_rate": 0.00024750876680303916, "loss": 0.5186, "step": 2920 }, { "epoch": 0.83, "grad_norm": 50.952551749899285, "learning_rate": 0.00024728959672706016, "loss": 0.3835, "step": 2930 }, { "epoch": 0.83, "grad_norm": 44.37013904045829, "learning_rate": 0.0002470704266510812, "loss": 0.3692, "step": 2940 }, { "epoch": 0.83, "grad_norm": 5.538761067578604, "learning_rate": 0.0002468512565751023, "loss": 0.4466, "step": 2950 }, { "epoch": 0.83, "grad_norm": 36.623608048249956, "learning_rate": 0.0002466320864991233, "loss": 0.4119, "step": 2960 }, { "epoch": 0.84, "grad_norm": 14.12931554239451, "learning_rate": 0.00024641291642314435, "loss": 0.4094, "step": 2970 }, { "epoch": 0.84, "grad_norm": 5.17846897516002, "learning_rate": 0.00024619374634716535, "loss": 0.6419, "step": 2980 }, { "epoch": 0.84, "grad_norm": 17.27825883448974, "learning_rate": 0.0002459745762711864, "loss": 0.4454, "step": 2990 }, { "epoch": 0.85, "grad_norm": 39.317445442045795, "learning_rate": 0.00024575540619520747, "loss": 0.4505, "step": 3000 }, { "epoch": 0.85, "grad_norm": 69.64730678545516, "learning_rate": 0.0002455362361192285, "loss": 0.4583, "step": 3010 }, { "epoch": 0.85, "grad_norm": 28.844952718130543, "learning_rate": 0.00024531706604324953, "loss": 0.6161, "step": 3020 }, { "epoch": 0.85, "grad_norm": 19.0210560197049, "learning_rate": 0.0002450978959672706, "loss": 0.4669, "step": 3030 }, { "epoch": 0.86, "grad_norm": 6.173405033604054, "learning_rate": 0.0002448787258912916, "loss": 0.4409, "step": 3040 }, { "epoch": 0.86, "grad_norm": 52.77800432968733, "learning_rate": 0.00024465955581531266, "loss": 0.4491, "step": 3050 }, { "epoch": 0.86, "grad_norm": 44.46853107524498, "learning_rate": 0.0002444403857393337, "loss": 0.3192, "step": 3060 }, { "epoch": 0.87, "grad_norm": 5.0160255873775546, "learning_rate": 0.0002442212156633547, "loss": 0.4516, "step": 3070 }, { "epoch": 0.87, "grad_norm": 23.9601395551555, "learning_rate": 0.00024400204558737578, "loss": 0.5556, "step": 3080 }, { "epoch": 0.87, "grad_norm": 29.267790594083895, "learning_rate": 0.00024378287551139684, "loss": 0.592, "step": 3090 }, { "epoch": 0.87, "grad_norm": 23.012129032542134, "learning_rate": 0.00024356370543541787, "loss": 0.3747, "step": 3100 }, { "epoch": 0.88, "grad_norm": 14.454095097216845, "learning_rate": 0.0002433445353594389, "loss": 0.384, "step": 3110 }, { "epoch": 0.88, "grad_norm": 26.97816432687431, "learning_rate": 0.00024312536528345993, "loss": 0.3297, "step": 3120 }, { "epoch": 0.88, "grad_norm": 9.801537735404679, "learning_rate": 0.000242906195207481, "loss": 0.5374, "step": 3130 }, { "epoch": 0.89, "grad_norm": 6.8951700119996, "learning_rate": 0.00024268702513150203, "loss": 0.5083, "step": 3140 }, { "epoch": 0.89, "grad_norm": 14.817012261931632, "learning_rate": 0.00024246785505552306, "loss": 0.5883, "step": 3150 }, { "epoch": 0.89, "grad_norm": 41.27878717469242, "learning_rate": 0.00024224868497954412, "loss": 0.3978, "step": 3160 }, { "epoch": 0.89, "grad_norm": 37.80902917455334, "learning_rate": 0.00024202951490356515, "loss": 0.5849, "step": 3170 }, { "epoch": 0.9, "grad_norm": 18.49777920142327, "learning_rate": 0.00024181034482758618, "loss": 0.4706, "step": 3180 }, { "epoch": 0.9, "grad_norm": 19.061143834111856, "learning_rate": 0.00024159117475160724, "loss": 0.4619, "step": 3190 }, { "epoch": 0.9, "grad_norm": 22.2453112157315, "learning_rate": 0.00024137200467562827, "loss": 0.3842, "step": 3200 }, { "epoch": 0.9, "grad_norm": 50.79421799621206, "learning_rate": 0.0002411528345996493, "loss": 0.3655, "step": 3210 }, { "epoch": 0.91, "grad_norm": 32.50988571681092, "learning_rate": 0.00024093366452367034, "loss": 0.5652, "step": 3220 }, { "epoch": 0.91, "grad_norm": 3.5054157276555955, "learning_rate": 0.0002407144944476914, "loss": 0.6236, "step": 3230 }, { "epoch": 0.91, "grad_norm": 34.425678272389796, "learning_rate": 0.00024049532437171243, "loss": 0.4448, "step": 3240 }, { "epoch": 0.92, "grad_norm": 27.486059433114335, "learning_rate": 0.00024027615429573346, "loss": 0.4669, "step": 3250 }, { "epoch": 0.92, "grad_norm": 19.22828927795651, "learning_rate": 0.00024005698421975452, "loss": 0.4644, "step": 3260 }, { "epoch": 0.92, "grad_norm": 21.0363603349132, "learning_rate": 0.00023983781414377555, "loss": 0.431, "step": 3270 }, { "epoch": 0.92, "grad_norm": 3.5723838037716984, "learning_rate": 0.00023961864406779658, "loss": 0.5383, "step": 3280 }, { "epoch": 0.93, "grad_norm": 13.546990268361519, "learning_rate": 0.00023939947399181761, "loss": 0.3201, "step": 3290 }, { "epoch": 0.93, "grad_norm": 24.38084566112433, "learning_rate": 0.00023918030391583867, "loss": 0.3129, "step": 3300 }, { "epoch": 0.93, "grad_norm": 25.645694818995487, "learning_rate": 0.0002389611338398597, "loss": 0.5026, "step": 3310 }, { "epoch": 0.94, "grad_norm": 66.5222383607227, "learning_rate": 0.00023874196376388074, "loss": 0.5204, "step": 3320 }, { "epoch": 0.94, "grad_norm": 24.167963861089163, "learning_rate": 0.0002385227936879018, "loss": 0.2994, "step": 3330 }, { "epoch": 0.94, "grad_norm": 2.9923273025019665, "learning_rate": 0.00023830362361192283, "loss": 0.3327, "step": 3340 }, { "epoch": 0.94, "grad_norm": 7.036402375923249, "learning_rate": 0.00023808445353594386, "loss": 0.4824, "step": 3350 }, { "epoch": 0.95, "grad_norm": 4.932257241329014, "learning_rate": 0.00023786528345996492, "loss": 0.3872, "step": 3360 }, { "epoch": 0.95, "grad_norm": 8.597901439972688, "learning_rate": 0.00023764611338398595, "loss": 0.3997, "step": 3370 }, { "epoch": 0.95, "grad_norm": 11.315382409369429, "learning_rate": 0.00023742694330800698, "loss": 0.5147, "step": 3380 }, { "epoch": 0.96, "grad_norm": 14.52575221014949, "learning_rate": 0.00023720777323202802, "loss": 0.3314, "step": 3390 }, { "epoch": 0.96, "grad_norm": 43.682402501837984, "learning_rate": 0.00023698860315604907, "loss": 0.4339, "step": 3400 }, { "epoch": 0.96, "grad_norm": 53.44210886125679, "learning_rate": 0.0002367694330800701, "loss": 0.4009, "step": 3410 }, { "epoch": 0.96, "grad_norm": 13.63857046515992, "learning_rate": 0.00023655026300409114, "loss": 0.4517, "step": 3420 }, { "epoch": 0.97, "grad_norm": 18.188881596196584, "learning_rate": 0.0002363310929281122, "loss": 0.7472, "step": 3430 }, { "epoch": 0.97, "grad_norm": 15.622454098714014, "learning_rate": 0.00023611192285213323, "loss": 0.4332, "step": 3440 }, { "epoch": 0.97, "grad_norm": 23.030560101084287, "learning_rate": 0.00023589275277615426, "loss": 0.4659, "step": 3450 }, { "epoch": 0.98, "grad_norm": 11.217028241840945, "learning_rate": 0.0002356735827001753, "loss": 0.4288, "step": 3460 }, { "epoch": 0.98, "grad_norm": 20.08432954445042, "learning_rate": 0.00023545441262419635, "loss": 0.277, "step": 3470 }, { "epoch": 0.98, "grad_norm": 74.65434882424645, "learning_rate": 0.00023523524254821738, "loss": 0.635, "step": 3480 }, { "epoch": 0.98, "grad_norm": 40.495503288549955, "learning_rate": 0.00023501607247223842, "loss": 0.388, "step": 3490 }, { "epoch": 0.99, "grad_norm": 12.752161600605927, "learning_rate": 0.0002347969023962595, "loss": 0.5092, "step": 3500 }, { "epoch": 0.99, "grad_norm": 14.53893507733024, "learning_rate": 0.0002345777323202805, "loss": 0.4765, "step": 3510 }, { "epoch": 0.99, "grad_norm": 4.316017393789724, "learning_rate": 0.00023435856224430154, "loss": 0.3659, "step": 3520 }, { "epoch": 1.0, "grad_norm": 30.899825163472197, "learning_rate": 0.00023413939216832257, "loss": 0.4516, "step": 3530 }, { "epoch": 1.0, "grad_norm": 39.99478400763079, "learning_rate": 0.00023392022209234366, "loss": 0.381, "step": 3540 }, { "epoch": 1.0, "eval_0_f1": 0.6956680014561338, "eval_0_precision": 0.607631160572337, "eval_0_recall": 0.8135376756066411, "eval_1_f1": 0.8688421713209915, "eval_1_precision": 0.9267068273092369, "eval_1_recall": 0.8177790903721205, "eval_accuracy": 0.8166867668018858, "eval_loss": 0.3857421875, "eval_runtime": 546.7196, "eval_samples_per_second": 16.683, "eval_steps_per_second": 2.782, "step": 3547 }, { "epoch": 1.0, "grad_norm": 5.0389158685922375, "learning_rate": 0.00023370105201636466, "loss": 0.4146, "step": 3550 }, { "epoch": 1.0, "grad_norm": 11.9478624203234, "learning_rate": 0.0002334818819403857, "loss": 0.219, "step": 3560 }, { "epoch": 1.01, "grad_norm": 2.574561125344869, "learning_rate": 0.00023326271186440678, "loss": 0.1326, "step": 3570 }, { "epoch": 1.01, "grad_norm": 6.645296190271201, "learning_rate": 0.0002330435417884278, "loss": 0.3714, "step": 3580 }, { "epoch": 1.01, "grad_norm": 21.165312017263854, "learning_rate": 0.00023282437171244884, "loss": 0.186, "step": 3590 }, { "epoch": 1.01, "grad_norm": 33.09937520843868, "learning_rate": 0.0002326052016364699, "loss": 0.189, "step": 3600 }, { "epoch": 1.02, "grad_norm": 28.917213358449875, "learning_rate": 0.00023238603156049094, "loss": 0.2064, "step": 3610 }, { "epoch": 1.02, "grad_norm": 6.362982741180963, "learning_rate": 0.00023216686148451197, "loss": 0.3743, "step": 3620 }, { "epoch": 1.02, "grad_norm": 15.278355603224561, "learning_rate": 0.000231947691408533, "loss": 0.339, "step": 3630 }, { "epoch": 1.03, "grad_norm": 19.75892836736991, "learning_rate": 0.00023172852133255406, "loss": 0.3234, "step": 3640 }, { "epoch": 1.03, "grad_norm": 27.481948501720865, "learning_rate": 0.0002315093512565751, "loss": 0.3744, "step": 3650 }, { "epoch": 1.03, "grad_norm": 4.553602693616155, "learning_rate": 0.00023129018118059612, "loss": 0.292, "step": 3660 }, { "epoch": 1.03, "grad_norm": 5.428732611717056, "learning_rate": 0.00023107101110461718, "loss": 0.2631, "step": 3670 }, { "epoch": 1.04, "grad_norm": 5.050323966343426, "learning_rate": 0.00023085184102863821, "loss": 0.4004, "step": 3680 }, { "epoch": 1.04, "grad_norm": 24.80793600752628, "learning_rate": 0.00023063267095265925, "loss": 0.5381, "step": 3690 }, { "epoch": 1.04, "grad_norm": 18.258408123657016, "learning_rate": 0.00023041350087668028, "loss": 0.3396, "step": 3700 }, { "epoch": 1.05, "grad_norm": 29.325589345887487, "learning_rate": 0.00023019433080070134, "loss": 0.3759, "step": 3710 }, { "epoch": 1.05, "grad_norm": 5.83589085044179, "learning_rate": 0.00022997516072472237, "loss": 0.3515, "step": 3720 }, { "epoch": 1.05, "grad_norm": 31.709214076980373, "learning_rate": 0.0002297559906487434, "loss": 0.3038, "step": 3730 }, { "epoch": 1.05, "grad_norm": 5.331144297018398, "learning_rate": 0.00022953682057276446, "loss": 0.2608, "step": 3740 }, { "epoch": 1.06, "grad_norm": 10.008440999378909, "learning_rate": 0.0002293176504967855, "loss": 0.392, "step": 3750 }, { "epoch": 1.06, "grad_norm": 24.06127490696333, "learning_rate": 0.00022909848042080652, "loss": 0.464, "step": 3760 }, { "epoch": 1.06, "grad_norm": 36.47161723251647, "learning_rate": 0.00022887931034482758, "loss": 0.3144, "step": 3770 }, { "epoch": 1.07, "grad_norm": 44.318502303445214, "learning_rate": 0.00022866014026884862, "loss": 0.2178, "step": 3780 }, { "epoch": 1.07, "grad_norm": 21.011798626587794, "learning_rate": 0.00022844097019286965, "loss": 0.2655, "step": 3790 }, { "epoch": 1.07, "grad_norm": 7.015038983544465, "learning_rate": 0.00022822180011689068, "loss": 0.33, "step": 3800 }, { "epoch": 1.07, "grad_norm": 9.837625310344656, "learning_rate": 0.00022800263004091174, "loss": 0.3804, "step": 3810 }, { "epoch": 1.08, "grad_norm": 3.5246300872909493, "learning_rate": 0.00022778345996493277, "loss": 0.3144, "step": 3820 }, { "epoch": 1.08, "grad_norm": 7.333333429866329, "learning_rate": 0.0002275642898889538, "loss": 0.208, "step": 3830 }, { "epoch": 1.08, "grad_norm": 2.9276220226615655, "learning_rate": 0.00022734511981297486, "loss": 0.1965, "step": 3840 }, { "epoch": 1.09, "grad_norm": 3.296027114122367, "learning_rate": 0.0002271259497369959, "loss": 0.1641, "step": 3850 }, { "epoch": 1.09, "grad_norm": 63.98473163919884, "learning_rate": 0.00022690677966101693, "loss": 0.2204, "step": 3860 }, { "epoch": 1.09, "grad_norm": 25.79236796313587, "learning_rate": 0.00022668760958503796, "loss": 0.3985, "step": 3870 }, { "epoch": 1.09, "grad_norm": 47.19895401753889, "learning_rate": 0.00022646843950905902, "loss": 0.3724, "step": 3880 }, { "epoch": 1.1, "grad_norm": 11.740165468615665, "learning_rate": 0.00022624926943308005, "loss": 0.3228, "step": 3890 }, { "epoch": 1.1, "grad_norm": 1.7482155895929372, "learning_rate": 0.00022603009935710108, "loss": 0.3069, "step": 3900 }, { "epoch": 1.1, "grad_norm": 31.032450056833344, "learning_rate": 0.00022581092928112214, "loss": 0.2573, "step": 3910 }, { "epoch": 1.11, "grad_norm": 11.15417567821661, "learning_rate": 0.00022559175920514317, "loss": 0.2432, "step": 3920 }, { "epoch": 1.11, "grad_norm": 9.77526286134104, "learning_rate": 0.0002253725891291642, "loss": 0.202, "step": 3930 }, { "epoch": 1.11, "grad_norm": 43.631843372398045, "learning_rate": 0.00022515341905318524, "loss": 0.3633, "step": 3940 }, { "epoch": 1.11, "grad_norm": 24.908344949793975, "learning_rate": 0.0002249342489772063, "loss": 0.2629, "step": 3950 }, { "epoch": 1.12, "grad_norm": 1.9235099739993984, "learning_rate": 0.00022471507890122733, "loss": 0.312, "step": 3960 }, { "epoch": 1.12, "grad_norm": 8.901642976423531, "learning_rate": 0.00022449590882524836, "loss": 0.2134, "step": 3970 }, { "epoch": 1.12, "grad_norm": 35.49143975289104, "learning_rate": 0.00022427673874926942, "loss": 0.3581, "step": 3980 }, { "epoch": 1.12, "grad_norm": 9.880151735005258, "learning_rate": 0.00022405756867329045, "loss": 0.2105, "step": 3990 }, { "epoch": 1.13, "grad_norm": 11.472473033640894, "learning_rate": 0.00022383839859731148, "loss": 0.2771, "step": 4000 }, { "epoch": 1.13, "grad_norm": 7.564996853716761, "learning_rate": 0.00022361922852133254, "loss": 0.1729, "step": 4010 }, { "epoch": 1.13, "grad_norm": 1.6868590263390821, "learning_rate": 0.00022340005844535357, "loss": 0.4633, "step": 4020 }, { "epoch": 1.14, "grad_norm": 28.229059784881606, "learning_rate": 0.0002231808883693746, "loss": 0.5021, "step": 4030 }, { "epoch": 1.14, "grad_norm": 17.572412568669705, "learning_rate": 0.00022296171829339564, "loss": 0.2666, "step": 4040 }, { "epoch": 1.14, "grad_norm": 11.720230148855956, "learning_rate": 0.0002227425482174167, "loss": 0.3706, "step": 4050 }, { "epoch": 1.14, "grad_norm": 57.76450102261794, "learning_rate": 0.00022252337814143773, "loss": 0.3514, "step": 4060 }, { "epoch": 1.15, "grad_norm": 8.937408336756231, "learning_rate": 0.00022230420806545876, "loss": 0.3542, "step": 4070 }, { "epoch": 1.15, "grad_norm": 40.358482259032456, "learning_rate": 0.00022208503798947982, "loss": 0.3049, "step": 4080 }, { "epoch": 1.15, "grad_norm": 22.567330463151773, "learning_rate": 0.00022186586791350085, "loss": 0.2778, "step": 4090 }, { "epoch": 1.16, "grad_norm": 27.666117112861954, "learning_rate": 0.00022164669783752188, "loss": 0.2204, "step": 4100 }, { "epoch": 1.16, "grad_norm": 34.568905094183656, "learning_rate": 0.00022142752776154292, "loss": 0.4829, "step": 4110 }, { "epoch": 1.16, "grad_norm": 37.20888182696178, "learning_rate": 0.00022120835768556397, "loss": 0.3751, "step": 4120 }, { "epoch": 1.16, "grad_norm": 2.628119016749957, "learning_rate": 0.000220989187609585, "loss": 0.2531, "step": 4130 }, { "epoch": 1.17, "grad_norm": 12.787898954063717, "learning_rate": 0.00022077001753360604, "loss": 0.2568, "step": 4140 }, { "epoch": 1.17, "grad_norm": 13.740422198890055, "learning_rate": 0.0002205508474576271, "loss": 0.1521, "step": 4150 }, { "epoch": 1.17, "grad_norm": 2.754528026569796, "learning_rate": 0.00022033167738164813, "loss": 0.3939, "step": 4160 }, { "epoch": 1.18, "grad_norm": 17.13803914567701, "learning_rate": 0.00022011250730566916, "loss": 0.2548, "step": 4170 }, { "epoch": 1.18, "grad_norm": 8.755380723031136, "learning_rate": 0.00021989333722969025, "loss": 0.22, "step": 4180 }, { "epoch": 1.18, "grad_norm": 11.327279230065335, "learning_rate": 0.00021967416715371128, "loss": 0.1881, "step": 4190 }, { "epoch": 1.18, "grad_norm": 11.76478464724261, "learning_rate": 0.00021945499707773229, "loss": 0.4543, "step": 4200 }, { "epoch": 1.19, "grad_norm": 21.089139286448926, "learning_rate": 0.00021923582700175332, "loss": 0.412, "step": 4210 }, { "epoch": 1.19, "grad_norm": 4.35685071056218, "learning_rate": 0.0002190166569257744, "loss": 0.2452, "step": 4220 }, { "epoch": 1.19, "grad_norm": 16.567825119225937, "learning_rate": 0.00021879748684979544, "loss": 0.2714, "step": 4230 }, { "epoch": 1.2, "grad_norm": 51.4802446925637, "learning_rate": 0.00021857831677381644, "loss": 0.3585, "step": 4240 }, { "epoch": 1.2, "grad_norm": 50.02592512359097, "learning_rate": 0.00021835914669783753, "loss": 0.4534, "step": 4250 }, { "epoch": 1.2, "grad_norm": 6.384047108249605, "learning_rate": 0.00021813997662185856, "loss": 0.3206, "step": 4260 }, { "epoch": 1.2, "grad_norm": 9.283525914668507, "learning_rate": 0.0002179208065458796, "loss": 0.3393, "step": 4270 }, { "epoch": 1.21, "grad_norm": 45.654861594884274, "learning_rate": 0.0002177016364699006, "loss": 0.335, "step": 4280 }, { "epoch": 1.21, "grad_norm": 3.5003233393976263, "learning_rate": 0.00021748246639392168, "loss": 0.3477, "step": 4290 }, { "epoch": 1.21, "grad_norm": 14.979317988693346, "learning_rate": 0.00021726329631794271, "loss": 0.2205, "step": 4300 }, { "epoch": 1.22, "grad_norm": 9.173367064164506, "learning_rate": 0.00021704412624196375, "loss": 0.4878, "step": 4310 }, { "epoch": 1.22, "grad_norm": 13.812041376988086, "learning_rate": 0.0002168249561659848, "loss": 0.3115, "step": 4320 }, { "epoch": 1.22, "grad_norm": 5.145857101646077, "learning_rate": 0.0002166277030976037, "loss": 0.1912, "step": 4330 }, { "epoch": 1.22, "grad_norm": 1.5222263707561177, "learning_rate": 0.00021640853302162474, "loss": 0.2155, "step": 4340 }, { "epoch": 1.23, "grad_norm": 20.94214337532918, "learning_rate": 0.00021618936294564583, "loss": 0.1663, "step": 4350 }, { "epoch": 1.23, "grad_norm": 46.6321807054184, "learning_rate": 0.00021597019286966686, "loss": 0.2966, "step": 4360 }, { "epoch": 1.23, "grad_norm": 4.374585543579176, "learning_rate": 0.00021575102279368786, "loss": 0.3326, "step": 4370 }, { "epoch": 1.23, "grad_norm": 5.550072623324231, "learning_rate": 0.0002155318527177089, "loss": 0.1478, "step": 4380 }, { "epoch": 1.24, "grad_norm": 0.6613601079645897, "learning_rate": 0.00021531268264172998, "loss": 0.2295, "step": 4390 }, { "epoch": 1.24, "grad_norm": 3.000033161148002, "learning_rate": 0.00021509351256575101, "loss": 0.5028, "step": 4400 }, { "epoch": 1.24, "grad_norm": 38.67630998438632, "learning_rate": 0.00021487434248977205, "loss": 0.304, "step": 4410 }, { "epoch": 1.25, "grad_norm": 31.0474775238587, "learning_rate": 0.0002146551724137931, "loss": 0.3397, "step": 4420 }, { "epoch": 1.25, "grad_norm": 21.038657972290533, "learning_rate": 0.00021443600233781414, "loss": 0.2639, "step": 4430 }, { "epoch": 1.25, "grad_norm": 29.03727828295053, "learning_rate": 0.00021421683226183517, "loss": 0.269, "step": 4440 }, { "epoch": 1.25, "grad_norm": 4.822263791040293, "learning_rate": 0.0002139976621858562, "loss": 0.2482, "step": 4450 }, { "epoch": 1.26, "grad_norm": 8.31426730608056, "learning_rate": 0.00021377849210987726, "loss": 0.2249, "step": 4460 }, { "epoch": 1.26, "grad_norm": 43.20888606905504, "learning_rate": 0.0002135593220338983, "loss": 0.3654, "step": 4470 }, { "epoch": 1.26, "grad_norm": 2.446800168689743, "learning_rate": 0.00021334015195791932, "loss": 0.3182, "step": 4480 }, { "epoch": 1.27, "grad_norm": 7.6988347063951466, "learning_rate": 0.00021312098188194038, "loss": 0.3373, "step": 4490 }, { "epoch": 1.27, "grad_norm": 31.935307455130687, "learning_rate": 0.00021290181180596142, "loss": 0.3036, "step": 4500 }, { "epoch": 1.27, "grad_norm": 20.726406179500756, "learning_rate": 0.00021268264172998245, "loss": 0.3627, "step": 4510 }, { "epoch": 1.27, "grad_norm": 7.097138033352009, "learning_rate": 0.0002124634716540035, "loss": 0.2623, "step": 4520 }, { "epoch": 1.28, "grad_norm": 3.8873072142785183, "learning_rate": 0.00021224430157802454, "loss": 0.2236, "step": 4530 }, { "epoch": 1.28, "grad_norm": 14.527770370800262, "learning_rate": 0.00021202513150204557, "loss": 0.137, "step": 4540 }, { "epoch": 1.28, "grad_norm": 7.558415335390357, "learning_rate": 0.0002118059614260666, "loss": 0.217, "step": 4550 }, { "epoch": 1.29, "grad_norm": 29.89529000773257, "learning_rate": 0.00021158679135008766, "loss": 0.251, "step": 4560 }, { "epoch": 1.29, "grad_norm": 52.64728318565253, "learning_rate": 0.0002113676212741087, "loss": 0.3333, "step": 4570 }, { "epoch": 1.29, "grad_norm": 30.152792328177426, "learning_rate": 0.00021114845119812973, "loss": 0.2724, "step": 4580 }, { "epoch": 1.29, "grad_norm": 2.5341334779748967, "learning_rate": 0.00021092928112215079, "loss": 0.269, "step": 4590 }, { "epoch": 1.3, "grad_norm": 22.55021824043886, "learning_rate": 0.00021071011104617182, "loss": 0.0765, "step": 4600 }, { "epoch": 1.3, "grad_norm": 20.277146842395847, "learning_rate": 0.00021049094097019285, "loss": 0.2189, "step": 4610 }, { "epoch": 1.3, "grad_norm": 31.332786897175072, "learning_rate": 0.00021027177089421388, "loss": 0.4537, "step": 4620 }, { "epoch": 1.31, "grad_norm": 4.0878545777289395, "learning_rate": 0.00021005260081823494, "loss": 0.2703, "step": 4630 }, { "epoch": 1.31, "grad_norm": 26.911824724655006, "learning_rate": 0.00020983343074225597, "loss": 0.2727, "step": 4640 }, { "epoch": 1.31, "grad_norm": 2.5043488479928016, "learning_rate": 0.000209614260666277, "loss": 0.1888, "step": 4650 }, { "epoch": 1.31, "grad_norm": 38.26894575743672, "learning_rate": 0.00020939509059029806, "loss": 0.334, "step": 4660 }, { "epoch": 1.32, "grad_norm": 3.073803318378424, "learning_rate": 0.0002091759205143191, "loss": 0.3018, "step": 4670 }, { "epoch": 1.32, "grad_norm": 50.99829894564531, "learning_rate": 0.00020895675043834013, "loss": 0.309, "step": 4680 }, { "epoch": 1.32, "grad_norm": 33.03113078011947, "learning_rate": 0.0002087375803623612, "loss": 0.3267, "step": 4690 }, { "epoch": 1.33, "grad_norm": 49.72639701138489, "learning_rate": 0.00020851841028638222, "loss": 0.3428, "step": 4700 }, { "epoch": 1.33, "grad_norm": 15.250514847558827, "learning_rate": 0.00020829924021040325, "loss": 0.2442, "step": 4710 }, { "epoch": 1.33, "grad_norm": 29.630372731951933, "learning_rate": 0.00020808007013442428, "loss": 0.4006, "step": 4720 }, { "epoch": 1.33, "grad_norm": 2.6025635165269834, "learning_rate": 0.00020786090005844534, "loss": 0.1417, "step": 4730 }, { "epoch": 1.34, "grad_norm": 46.58385970647689, "learning_rate": 0.00020764172998246637, "loss": 0.3178, "step": 4740 }, { "epoch": 1.34, "grad_norm": 33.748707418723626, "learning_rate": 0.0002074225599064874, "loss": 0.4147, "step": 4750 }, { "epoch": 1.34, "grad_norm": 8.516868922828099, "learning_rate": 0.00020720338983050846, "loss": 0.2285, "step": 4760 }, { "epoch": 1.34, "grad_norm": 56.676130895938115, "learning_rate": 0.0002069842197545295, "loss": 0.3276, "step": 4770 }, { "epoch": 1.35, "grad_norm": 57.88752849086389, "learning_rate": 0.00020676504967855053, "loss": 0.4268, "step": 4780 }, { "epoch": 1.35, "grad_norm": 2.2518432859009083, "learning_rate": 0.00020654587960257156, "loss": 0.2271, "step": 4790 }, { "epoch": 1.35, "grad_norm": 8.476321740016418, "learning_rate": 0.00020632670952659262, "loss": 0.2991, "step": 4800 }, { "epoch": 1.36, "grad_norm": 5.939165676183437, "learning_rate": 0.00020610753945061365, "loss": 0.2207, "step": 4810 }, { "epoch": 1.36, "grad_norm": 6.721301020536598, "learning_rate": 0.00020588836937463468, "loss": 0.1735, "step": 4820 }, { "epoch": 1.36, "grad_norm": 3.2082370683760044, "learning_rate": 0.00020566919929865574, "loss": 0.4545, "step": 4830 }, { "epoch": 1.36, "grad_norm": 29.422853677429167, "learning_rate": 0.00020545002922267678, "loss": 0.4142, "step": 4840 }, { "epoch": 1.37, "grad_norm": 5.299481055699104, "learning_rate": 0.0002052308591466978, "loss": 0.3986, "step": 4850 }, { "epoch": 1.37, "grad_norm": 22.75931248540711, "learning_rate": 0.00020501168907071887, "loss": 0.4092, "step": 4860 }, { "epoch": 1.37, "grad_norm": 6.466191538331885, "learning_rate": 0.0002047925189947399, "loss": 0.1388, "step": 4870 }, { "epoch": 1.38, "grad_norm": 8.95128170273167, "learning_rate": 0.00020457334891876093, "loss": 0.4025, "step": 4880 }, { "epoch": 1.38, "grad_norm": 30.917184250084812, "learning_rate": 0.00020435417884278196, "loss": 0.3814, "step": 4890 }, { "epoch": 1.38, "grad_norm": 20.91623684082827, "learning_rate": 0.00020413500876680302, "loss": 0.3143, "step": 4900 }, { "epoch": 1.38, "grad_norm": 28.994187069448067, "learning_rate": 0.00020391583869082405, "loss": 0.2026, "step": 4910 }, { "epoch": 1.39, "grad_norm": 42.11122790207425, "learning_rate": 0.00020369666861484509, "loss": 0.3098, "step": 4920 }, { "epoch": 1.39, "grad_norm": 36.159828304877706, "learning_rate": 0.00020347749853886614, "loss": 0.2509, "step": 4930 }, { "epoch": 1.39, "grad_norm": 20.23189475386178, "learning_rate": 0.00020325832846288718, "loss": 0.3435, "step": 4940 }, { "epoch": 1.4, "grad_norm": 9.446387490878042, "learning_rate": 0.0002030391583869082, "loss": 0.3598, "step": 4950 }, { "epoch": 1.4, "grad_norm": 31.356335012679175, "learning_rate": 0.00020281998831092924, "loss": 0.3427, "step": 4960 }, { "epoch": 1.4, "grad_norm": 25.111026271060013, "learning_rate": 0.0002026008182349503, "loss": 0.2731, "step": 4970 }, { "epoch": 1.4, "grad_norm": 10.288548511014797, "learning_rate": 0.00020238164815897133, "loss": 0.2741, "step": 4980 }, { "epoch": 1.41, "grad_norm": 18.078048168470804, "learning_rate": 0.00020216247808299236, "loss": 0.1965, "step": 4990 }, { "epoch": 1.41, "grad_norm": 2.5818740774616953, "learning_rate": 0.00020194330800701345, "loss": 0.423, "step": 5000 }, { "epoch": 1.41, "grad_norm": 5.029346814046525, "learning_rate": 0.00020172413793103448, "loss": 0.1981, "step": 5010 }, { "epoch": 1.42, "grad_norm": 3.69515764136778, "learning_rate": 0.0002015049678550555, "loss": 0.3407, "step": 5020 }, { "epoch": 1.42, "grad_norm": 1.1185073469260922, "learning_rate": 0.00020128579777907652, "loss": 0.3759, "step": 5030 }, { "epoch": 1.42, "grad_norm": 25.82929883275312, "learning_rate": 0.0002010666277030976, "loss": 0.3386, "step": 5040 }, { "epoch": 1.42, "grad_norm": 5.658058990862141, "learning_rate": 0.00020084745762711864, "loss": 0.3067, "step": 5050 }, { "epoch": 1.43, "grad_norm": 33.789807219043574, "learning_rate": 0.00020062828755113964, "loss": 0.3116, "step": 5060 }, { "epoch": 1.43, "grad_norm": 29.665346856502566, "learning_rate": 0.00020040911747516073, "loss": 0.4307, "step": 5070 }, { "epoch": 1.43, "grad_norm": 2.953173260130455, "learning_rate": 0.00020018994739918176, "loss": 0.2401, "step": 5080 }, { "epoch": 1.44, "grad_norm": 59.25975112660892, "learning_rate": 0.0001999707773232028, "loss": 0.3712, "step": 5090 }, { "epoch": 1.44, "grad_norm": 16.61695534436786, "learning_rate": 0.00019975160724722385, "loss": 0.2978, "step": 5100 }, { "epoch": 1.44, "grad_norm": 27.59304302639443, "learning_rate": 0.00019953243717124488, "loss": 0.2207, "step": 5110 }, { "epoch": 1.44, "grad_norm": 33.3996635406056, "learning_rate": 0.00019931326709526592, "loss": 0.4006, "step": 5120 }, { "epoch": 1.45, "grad_norm": 7.909724279076434, "learning_rate": 0.00019909409701928695, "loss": 0.3282, "step": 5130 }, { "epoch": 1.45, "grad_norm": 10.96614010093427, "learning_rate": 0.000198874926943308, "loss": 0.3641, "step": 5140 }, { "epoch": 1.45, "grad_norm": 4.747888769171669, "learning_rate": 0.00019865575686732904, "loss": 0.2596, "step": 5150 }, { "epoch": 1.45, "grad_norm": 17.19969500744997, "learning_rate": 0.00019843658679135007, "loss": 0.2359, "step": 5160 }, { "epoch": 1.46, "grad_norm": 12.445792367631444, "learning_rate": 0.00019821741671537113, "loss": 0.291, "step": 5170 }, { "epoch": 1.46, "grad_norm": 16.38158373051574, "learning_rate": 0.00019799824663939216, "loss": 0.2152, "step": 5180 }, { "epoch": 1.46, "grad_norm": 19.55308310137066, "learning_rate": 0.0001977790765634132, "loss": 0.245, "step": 5190 }, { "epoch": 1.47, "grad_norm": 4.151920692809044, "learning_rate": 0.00019755990648743423, "loss": 0.263, "step": 5200 }, { "epoch": 1.47, "grad_norm": 23.195604876425826, "learning_rate": 0.00019734073641145528, "loss": 0.3534, "step": 5210 }, { "epoch": 1.47, "grad_norm": 21.589521451918323, "learning_rate": 0.00019712156633547632, "loss": 0.1662, "step": 5220 }, { "epoch": 1.47, "grad_norm": 11.887752916702121, "learning_rate": 0.00019690239625949735, "loss": 0.2105, "step": 5230 }, { "epoch": 1.48, "grad_norm": 5.176318946475123, "learning_rate": 0.0001966832261835184, "loss": 0.2695, "step": 5240 }, { "epoch": 1.48, "grad_norm": 11.690634456574996, "learning_rate": 0.00019646405610753944, "loss": 0.3302, "step": 5250 }, { "epoch": 1.48, "grad_norm": 62.85698360348048, "learning_rate": 0.00019624488603156047, "loss": 0.5215, "step": 5260 }, { "epoch": 1.49, "grad_norm": 13.466880633693304, "learning_rate": 0.00019602571595558153, "loss": 0.2985, "step": 5270 }, { "epoch": 1.49, "grad_norm": 1.965805579763693, "learning_rate": 0.00019580654587960256, "loss": 0.2372, "step": 5280 }, { "epoch": 1.49, "grad_norm": 12.499426672870545, "learning_rate": 0.0001955873758036236, "loss": 0.1587, "step": 5290 }, { "epoch": 1.49, "grad_norm": 15.307208098887806, "learning_rate": 0.00019536820572764463, "loss": 0.207, "step": 5300 }, { "epoch": 1.5, "grad_norm": 14.010835079736815, "learning_rate": 0.00019514903565166569, "loss": 0.1688, "step": 5310 }, { "epoch": 1.5, "grad_norm": 7.104347674810669, "learning_rate": 0.00019492986557568672, "loss": 0.1888, "step": 5320 }, { "epoch": 1.5, "grad_norm": 1.5642209100774738, "learning_rate": 0.00019473261250730565, "loss": 0.2465, "step": 5330 }, { "epoch": 1.51, "grad_norm": 32.89294792453513, "learning_rate": 0.0001945134424313267, "loss": 0.4685, "step": 5340 }, { "epoch": 1.51, "grad_norm": 16.400791636569856, "learning_rate": 0.00019429427235534774, "loss": 0.2564, "step": 5350 }, { "epoch": 1.51, "grad_norm": 6.986562239990336, "learning_rate": 0.00019407510227936877, "loss": 0.2917, "step": 5360 }, { "epoch": 1.51, "grad_norm": 11.557550672896545, "learning_rate": 0.00019385593220338983, "loss": 0.3427, "step": 5370 }, { "epoch": 1.52, "grad_norm": 2.1712768411403927, "learning_rate": 0.00019363676212741086, "loss": 0.2312, "step": 5380 }, { "epoch": 1.52, "grad_norm": 3.4188406927450874, "learning_rate": 0.0001934175920514319, "loss": 0.2822, "step": 5390 }, { "epoch": 1.52, "grad_norm": 18.41553181032857, "learning_rate": 0.00019319842197545293, "loss": 0.3085, "step": 5400 }, { "epoch": 1.53, "grad_norm": 8.925374304814394, "learning_rate": 0.000192979251899474, "loss": 0.3014, "step": 5410 }, { "epoch": 1.53, "grad_norm": 2.21057714781379, "learning_rate": 0.00019276008182349502, "loss": 0.2019, "step": 5420 }, { "epoch": 1.53, "grad_norm": 7.995029854183915, "learning_rate": 0.00019254091174751605, "loss": 0.3143, "step": 5430 }, { "epoch": 1.53, "grad_norm": 24.818437530389712, "learning_rate": 0.0001923217416715371, "loss": 0.2312, "step": 5440 }, { "epoch": 1.54, "grad_norm": 4.630710193156958, "learning_rate": 0.00019210257159555814, "loss": 0.2946, "step": 5450 }, { "epoch": 1.54, "grad_norm": 18.89029942903239, "learning_rate": 0.00019188340151957917, "loss": 0.2129, "step": 5460 }, { "epoch": 1.54, "grad_norm": 21.18937099773853, "learning_rate": 0.0001916642314436002, "loss": 0.3114, "step": 5470 }, { "epoch": 1.54, "grad_norm": 44.57514569240804, "learning_rate": 0.00019144506136762127, "loss": 0.4142, "step": 5480 }, { "epoch": 1.55, "grad_norm": 10.696173605726317, "learning_rate": 0.0001912258912916423, "loss": 0.1754, "step": 5490 }, { "epoch": 1.55, "grad_norm": 40.73458136589298, "learning_rate": 0.00019100672121566333, "loss": 0.2926, "step": 5500 }, { "epoch": 1.55, "grad_norm": 39.31886057121872, "learning_rate": 0.0001907875511396844, "loss": 0.266, "step": 5510 }, { "epoch": 1.56, "grad_norm": 2.9842826591961664, "learning_rate": 0.00019056838106370542, "loss": 0.2224, "step": 5520 }, { "epoch": 1.56, "grad_norm": 9.331926869782274, "learning_rate": 0.00019034921098772645, "loss": 0.1006, "step": 5530 }, { "epoch": 1.56, "grad_norm": 0.42527725447521897, "learning_rate": 0.00019013004091174748, "loss": 0.2903, "step": 5540 }, { "epoch": 1.56, "grad_norm": 2.2444076494386254, "learning_rate": 0.00018991087083576854, "loss": 0.1155, "step": 5550 }, { "epoch": 1.57, "grad_norm": 37.94628287997447, "learning_rate": 0.00018969170075978958, "loss": 0.4778, "step": 5560 }, { "epoch": 1.57, "grad_norm": 42.20359422033777, "learning_rate": 0.0001894725306838106, "loss": 0.268, "step": 5570 }, { "epoch": 1.57, "grad_norm": 5.954129998100243, "learning_rate": 0.00018925336060783167, "loss": 0.2858, "step": 5580 }, { "epoch": 1.58, "grad_norm": 12.545115176449832, "learning_rate": 0.0001890341905318527, "loss": 0.4231, "step": 5590 }, { "epoch": 1.58, "grad_norm": 4.6634905746414965, "learning_rate": 0.00018881502045587373, "loss": 0.2802, "step": 5600 }, { "epoch": 1.58, "grad_norm": 25.89963576909228, "learning_rate": 0.0001885958503798948, "loss": 0.3474, "step": 5610 }, { "epoch": 1.58, "grad_norm": 8.132022049717532, "learning_rate": 0.00018837668030391582, "loss": 0.2969, "step": 5620 }, { "epoch": 1.59, "grad_norm": 9.15480977443254, "learning_rate": 0.00018815751022793685, "loss": 0.2564, "step": 5630 }, { "epoch": 1.59, "grad_norm": 8.525165992903926, "learning_rate": 0.00018793834015195789, "loss": 0.3067, "step": 5640 }, { "epoch": 1.59, "grad_norm": 32.8997919695663, "learning_rate": 0.00018771917007597894, "loss": 0.3231, "step": 5650 }, { "epoch": 1.6, "grad_norm": 13.880130230974345, "learning_rate": 0.00018749999999999998, "loss": 0.233, "step": 5660 }, { "epoch": 1.6, "grad_norm": 5.059925820076422, "learning_rate": 0.000187280829924021, "loss": 0.274, "step": 5670 }, { "epoch": 1.6, "grad_norm": 33.92209798070997, "learning_rate": 0.00018706165984804207, "loss": 0.1971, "step": 5680 }, { "epoch": 1.6, "grad_norm": 11.486919782540255, "learning_rate": 0.0001868424897720631, "loss": 0.5099, "step": 5690 }, { "epoch": 1.61, "grad_norm": 1.118181876078142, "learning_rate": 0.00018662331969608413, "loss": 0.1531, "step": 5700 }, { "epoch": 1.61, "grad_norm": 18.07168190463933, "learning_rate": 0.00018640414962010516, "loss": 0.4196, "step": 5710 }, { "epoch": 1.61, "grad_norm": 8.865176806627508, "learning_rate": 0.00018618497954412622, "loss": 0.2771, "step": 5720 }, { "epoch": 1.62, "grad_norm": 35.74999402744737, "learning_rate": 0.00018596580946814726, "loss": 0.3592, "step": 5730 }, { "epoch": 1.62, "grad_norm": 14.709743095476123, "learning_rate": 0.0001857466393921683, "loss": 0.2372, "step": 5740 }, { "epoch": 1.62, "grad_norm": 8.547571860162597, "learning_rate": 0.00018552746931618935, "loss": 0.1769, "step": 5750 }, { "epoch": 1.62, "grad_norm": 29.188932438810735, "learning_rate": 0.00018530829924021038, "loss": 0.1595, "step": 5760 }, { "epoch": 1.63, "grad_norm": 9.72627004103299, "learning_rate": 0.0001850891291642314, "loss": 0.267, "step": 5770 }, { "epoch": 1.63, "grad_norm": 17.585505956653314, "learning_rate": 0.0001848699590882525, "loss": 0.2313, "step": 5780 }, { "epoch": 1.63, "grad_norm": 11.872045841554243, "learning_rate": 0.0001846507890122735, "loss": 0.2371, "step": 5790 }, { "epoch": 1.64, "grad_norm": 17.194223427069954, "learning_rate": 0.00018443161893629453, "loss": 0.3299, "step": 5800 }, { "epoch": 1.64, "grad_norm": 11.83536538733706, "learning_rate": 0.00018421244886031557, "loss": 0.1985, "step": 5810 }, { "epoch": 1.64, "grad_norm": 3.4341999458410153, "learning_rate": 0.00018399327878433665, "loss": 0.2886, "step": 5820 }, { "epoch": 1.64, "grad_norm": 4.533321919018761, "learning_rate": 0.00018377410870835768, "loss": 0.2209, "step": 5830 }, { "epoch": 1.65, "grad_norm": 15.786307762774168, "learning_rate": 0.0001835549386323787, "loss": 0.1963, "step": 5840 }, { "epoch": 1.65, "grad_norm": 4.999680638451794, "learning_rate": 0.00018333576855639977, "loss": 0.3231, "step": 5850 }, { "epoch": 1.65, "grad_norm": 7.345856055836037, "learning_rate": 0.0001831165984804208, "loss": 0.3497, "step": 5860 }, { "epoch": 1.65, "grad_norm": 24.05112698349709, "learning_rate": 0.00018289742840444184, "loss": 0.1703, "step": 5870 }, { "epoch": 1.66, "grad_norm": 5.14777448497475, "learning_rate": 0.00018267825832846284, "loss": 0.2877, "step": 5880 }, { "epoch": 1.66, "grad_norm": 22.321301335050478, "learning_rate": 0.00018245908825248393, "loss": 0.3838, "step": 5890 }, { "epoch": 1.66, "grad_norm": 13.553653717813507, "learning_rate": 0.00018223991817650496, "loss": 0.2386, "step": 5900 }, { "epoch": 1.67, "grad_norm": 8.36423228043391, "learning_rate": 0.000182020748100526, "loss": 0.3274, "step": 5910 }, { "epoch": 1.67, "grad_norm": 32.214527042072234, "learning_rate": 0.00018180157802454705, "loss": 0.2649, "step": 5920 }, { "epoch": 1.67, "grad_norm": 7.310510867463555, "learning_rate": 0.00018158240794856808, "loss": 0.3419, "step": 5930 }, { "epoch": 1.67, "grad_norm": 26.843414736420463, "learning_rate": 0.00018136323787258912, "loss": 0.3201, "step": 5940 }, { "epoch": 1.68, "grad_norm": 17.750295021021458, "learning_rate": 0.00018114406779661015, "loss": 0.2248, "step": 5950 }, { "epoch": 1.68, "grad_norm": 0.8761970753193081, "learning_rate": 0.0001809248977206312, "loss": 0.1681, "step": 5960 }, { "epoch": 1.68, "grad_norm": 8.023645949418535, "learning_rate": 0.00018070572764465224, "loss": 0.3362, "step": 5970 }, { "epoch": 1.69, "grad_norm": 21.33676597686644, "learning_rate": 0.00018048655756867327, "loss": 0.1318, "step": 5980 }, { "epoch": 1.69, "grad_norm": 17.955586403704974, "learning_rate": 0.00018026738749269433, "loss": 0.3109, "step": 5990 }, { "epoch": 1.69, "grad_norm": 37.44497964171056, "learning_rate": 0.00018004821741671536, "loss": 0.224, "step": 6000 }, { "epoch": 1.69, "grad_norm": 21.77593890358771, "learning_rate": 0.0001798290473407364, "loss": 0.3034, "step": 6010 }, { "epoch": 1.7, "grad_norm": 0.7352368145267243, "learning_rate": 0.00017960987726475745, "loss": 0.1274, "step": 6020 }, { "epoch": 1.7, "grad_norm": 21.416372952522156, "learning_rate": 0.00017939070718877849, "loss": 0.2799, "step": 6030 }, { "epoch": 1.7, "grad_norm": 6.43140669106781, "learning_rate": 0.00017917153711279952, "loss": 0.3944, "step": 6040 }, { "epoch": 1.71, "grad_norm": 21.589922507704756, "learning_rate": 0.00017895236703682055, "loss": 0.4062, "step": 6050 }, { "epoch": 1.71, "grad_norm": 11.40138578000891, "learning_rate": 0.0001787331969608416, "loss": 0.244, "step": 6060 }, { "epoch": 1.71, "grad_norm": 17.091859469856562, "learning_rate": 0.00017851402688486264, "loss": 0.2566, "step": 6070 }, { "epoch": 1.71, "grad_norm": 11.99466531174607, "learning_rate": 0.00017829485680888367, "loss": 0.2817, "step": 6080 }, { "epoch": 1.72, "grad_norm": 20.51599983452521, "learning_rate": 0.00017807568673290473, "loss": 0.2728, "step": 6090 }, { "epoch": 1.72, "grad_norm": 43.54037932343941, "learning_rate": 0.00017785651665692576, "loss": 0.3911, "step": 6100 }, { "epoch": 1.72, "grad_norm": 14.595613560758464, "learning_rate": 0.0001776373465809468, "loss": 0.2645, "step": 6110 }, { "epoch": 1.73, "grad_norm": 24.91815308399374, "learning_rate": 0.00017741817650496783, "loss": 0.2268, "step": 6120 }, { "epoch": 1.73, "grad_norm": 9.720913254909993, "learning_rate": 0.0001771990064289889, "loss": 0.3635, "step": 6130 }, { "epoch": 1.73, "grad_norm": 29.77336977828609, "learning_rate": 0.00017697983635300992, "loss": 0.2624, "step": 6140 }, { "epoch": 1.73, "grad_norm": 35.22870133705211, "learning_rate": 0.00017676066627703095, "loss": 0.3233, "step": 6150 }, { "epoch": 1.74, "grad_norm": 1.3492806118043075, "learning_rate": 0.000176541496201052, "loss": 0.2593, "step": 6160 }, { "epoch": 1.74, "grad_norm": 12.854084741350912, "learning_rate": 0.00017632232612507304, "loss": 0.288, "step": 6170 }, { "epoch": 1.74, "grad_norm": 26.51581773086576, "learning_rate": 0.00017610315604909407, "loss": 0.4244, "step": 6180 }, { "epoch": 1.75, "grad_norm": 8.677592676470999, "learning_rate": 0.00017588398597311513, "loss": 0.2529, "step": 6190 }, { "epoch": 1.75, "grad_norm": 49.69265782318678, "learning_rate": 0.00017566481589713617, "loss": 0.2555, "step": 6200 }, { "epoch": 1.75, "grad_norm": 9.137310587778737, "learning_rate": 0.0001754456458211572, "loss": 0.2237, "step": 6210 }, { "epoch": 1.75, "grad_norm": 2.5627703512715154, "learning_rate": 0.00017522647574517823, "loss": 0.2253, "step": 6220 }, { "epoch": 1.76, "grad_norm": 1.3427155017491879, "learning_rate": 0.0001750073056691993, "loss": 0.4267, "step": 6230 }, { "epoch": 1.76, "grad_norm": 32.026763021399056, "learning_rate": 0.00017478813559322032, "loss": 0.2694, "step": 6240 }, { "epoch": 1.76, "grad_norm": 1.8952720705907915, "learning_rate": 0.00017456896551724135, "loss": 0.2647, "step": 6250 }, { "epoch": 1.76, "grad_norm": 8.293829114055725, "learning_rate": 0.0001743497954412624, "loss": 0.3184, "step": 6260 }, { "epoch": 1.77, "grad_norm": 21.630558018188975, "learning_rate": 0.00017413062536528344, "loss": 0.2528, "step": 6270 }, { "epoch": 1.77, "grad_norm": 11.263065393245803, "learning_rate": 0.00017391145528930448, "loss": 0.3655, "step": 6280 }, { "epoch": 1.77, "grad_norm": 0.5383693893588692, "learning_rate": 0.0001736922852133255, "loss": 0.2871, "step": 6290 }, { "epoch": 1.78, "grad_norm": 8.222238464796305, "learning_rate": 0.00017347311513734657, "loss": 0.2595, "step": 6300 }, { "epoch": 1.78, "grad_norm": 17.415154838487343, "learning_rate": 0.0001732539450613676, "loss": 0.3208, "step": 6310 }, { "epoch": 1.78, "grad_norm": 2.384813430472559, "learning_rate": 0.00017303477498538863, "loss": 0.1697, "step": 6320 }, { "epoch": 1.78, "grad_norm": 3.430324248430251, "learning_rate": 0.0001728156049094097, "loss": 0.3111, "step": 6330 }, { "epoch": 1.79, "grad_norm": 29.695298798800554, "learning_rate": 0.00017259643483343072, "loss": 0.2624, "step": 6340 }, { "epoch": 1.79, "grad_norm": 3.4530119939476016, "learning_rate": 0.00017239918176504965, "loss": 0.5039, "step": 6350 }, { "epoch": 1.79, "grad_norm": 14.522683359889644, "learning_rate": 0.0001721800116890707, "loss": 0.2896, "step": 6360 }, { "epoch": 1.8, "grad_norm": 11.419748641445885, "learning_rate": 0.00017196084161309175, "loss": 0.2413, "step": 6370 }, { "epoch": 1.8, "grad_norm": 12.845162868376475, "learning_rate": 0.00017174167153711278, "loss": 0.2085, "step": 6380 }, { "epoch": 1.8, "grad_norm": 50.97954814308535, "learning_rate": 0.0001715225014611338, "loss": 0.4096, "step": 6390 }, { "epoch": 1.8, "grad_norm": 3.776354286239877, "learning_rate": 0.00017130333138515487, "loss": 0.2438, "step": 6400 }, { "epoch": 1.81, "grad_norm": 3.71168428040738, "learning_rate": 0.0001710841613091759, "loss": 0.2612, "step": 6410 }, { "epoch": 1.81, "grad_norm": 32.131330999922675, "learning_rate": 0.00017086499123319693, "loss": 0.4542, "step": 6420 }, { "epoch": 1.81, "grad_norm": 6.736921728341746, "learning_rate": 0.000170645821157218, "loss": 0.2303, "step": 6430 }, { "epoch": 1.82, "grad_norm": 15.468427433397974, "learning_rate": 0.00017042665108123902, "loss": 0.3181, "step": 6440 }, { "epoch": 1.82, "grad_norm": 64.72524937993848, "learning_rate": 0.00017020748100526006, "loss": 0.2804, "step": 6450 }, { "epoch": 1.82, "grad_norm": 5.219058854813203, "learning_rate": 0.0001699883109292811, "loss": 0.2497, "step": 6460 }, { "epoch": 1.82, "grad_norm": 1.6037966913707118, "learning_rate": 0.00016976914085330215, "loss": 0.2066, "step": 6470 }, { "epoch": 1.83, "grad_norm": 6.240768583919815, "learning_rate": 0.00016954997077732318, "loss": 0.2428, "step": 6480 }, { "epoch": 1.83, "grad_norm": 23.240452777334195, "learning_rate": 0.0001693308007013442, "loss": 0.2075, "step": 6490 }, { "epoch": 1.83, "grad_norm": 39.5379292284798, "learning_rate": 0.00016911163062536527, "loss": 0.2936, "step": 6500 }, { "epoch": 1.84, "grad_norm": 32.798181084355704, "learning_rate": 0.0001688924605493863, "loss": 0.3773, "step": 6510 }, { "epoch": 1.84, "grad_norm": 17.376010201609784, "learning_rate": 0.00016867329047340733, "loss": 0.3442, "step": 6520 }, { "epoch": 1.84, "grad_norm": 12.08974541668813, "learning_rate": 0.0001684541203974284, "loss": 0.3321, "step": 6530 }, { "epoch": 1.84, "grad_norm": 34.210468800599315, "learning_rate": 0.00016823495032144942, "loss": 0.2846, "step": 6540 }, { "epoch": 1.85, "grad_norm": 13.86396524279559, "learning_rate": 0.00016801578024547046, "loss": 0.2522, "step": 6550 }, { "epoch": 1.85, "grad_norm": 2.9623178774098693, "learning_rate": 0.0001677966101694915, "loss": 0.3122, "step": 6560 }, { "epoch": 1.85, "grad_norm": 16.202793893907323, "learning_rate": 0.00016757744009351255, "loss": 0.2785, "step": 6570 }, { "epoch": 1.86, "grad_norm": 8.818952376048744, "learning_rate": 0.00016735827001753358, "loss": 0.2893, "step": 6580 }, { "epoch": 1.86, "grad_norm": 9.840941433124744, "learning_rate": 0.0001671390999415546, "loss": 0.4644, "step": 6590 }, { "epoch": 1.86, "grad_norm": 22.487991211369078, "learning_rate": 0.0001669199298655757, "loss": 0.3367, "step": 6600 }, { "epoch": 1.86, "grad_norm": 29.408750577985327, "learning_rate": 0.0001667007597895967, "loss": 0.433, "step": 6610 }, { "epoch": 1.87, "grad_norm": 44.84702783843238, "learning_rate": 0.00016648158971361773, "loss": 0.288, "step": 6620 }, { "epoch": 1.87, "grad_norm": 35.39092595728219, "learning_rate": 0.00016626241963763877, "loss": 0.401, "step": 6630 }, { "epoch": 1.87, "grad_norm": 9.939332121236001, "learning_rate": 0.00016604324956165985, "loss": 0.1682, "step": 6640 }, { "epoch": 1.87, "grad_norm": 44.968039933166224, "learning_rate": 0.00016582407948568089, "loss": 0.2435, "step": 6650 }, { "epoch": 1.88, "grad_norm": 8.897239396543608, "learning_rate": 0.0001656049094097019, "loss": 0.2616, "step": 6660 }, { "epoch": 1.88, "grad_norm": 24.54192700316105, "learning_rate": 0.00016538573933372298, "loss": 0.2454, "step": 6670 }, { "epoch": 1.88, "grad_norm": 1.9484839859694942, "learning_rate": 0.000165166569257744, "loss": 0.2232, "step": 6680 }, { "epoch": 1.89, "grad_norm": 15.628379568346645, "learning_rate": 0.00016494739918176504, "loss": 0.2314, "step": 6690 }, { "epoch": 1.89, "grad_norm": 23.499563414114768, "learning_rate": 0.0001647282291057861, "loss": 0.1903, "step": 6700 }, { "epoch": 1.89, "grad_norm": 19.39538243318877, "learning_rate": 0.00016450905902980713, "loss": 0.2385, "step": 6710 }, { "epoch": 1.89, "grad_norm": 5.238154008583709, "learning_rate": 0.00016428988895382816, "loss": 0.3917, "step": 6720 }, { "epoch": 1.9, "grad_norm": 24.81933612902287, "learning_rate": 0.0001640707188778492, "loss": 0.2389, "step": 6730 }, { "epoch": 1.9, "grad_norm": 4.608132814218328, "learning_rate": 0.00016385154880187025, "loss": 0.1413, "step": 6740 }, { "epoch": 1.9, "grad_norm": 11.881773803892107, "learning_rate": 0.0001636323787258913, "loss": 0.1785, "step": 6750 }, { "epoch": 1.91, "grad_norm": 14.177125312181635, "learning_rate": 0.00016341320864991232, "loss": 0.2461, "step": 6760 }, { "epoch": 1.91, "grad_norm": 33.173507263725085, "learning_rate": 0.00016319403857393338, "loss": 0.5047, "step": 6770 }, { "epoch": 1.91, "grad_norm": 37.82372264857794, "learning_rate": 0.0001629748684979544, "loss": 0.3656, "step": 6780 }, { "epoch": 1.91, "grad_norm": 23.51699250829612, "learning_rate": 0.00016275569842197544, "loss": 0.3609, "step": 6790 }, { "epoch": 1.92, "grad_norm": 26.427233006930997, "learning_rate": 0.00016253652834599647, "loss": 0.2522, "step": 6800 }, { "epoch": 1.92, "grad_norm": 0.8480665720492925, "learning_rate": 0.00016231735827001753, "loss": 0.1934, "step": 6810 }, { "epoch": 1.92, "grad_norm": 1.0073865565621205, "learning_rate": 0.00016209818819403856, "loss": 0.2325, "step": 6820 }, { "epoch": 1.93, "grad_norm": 7.079199003953245, "learning_rate": 0.0001618790181180596, "loss": 0.2733, "step": 6830 }, { "epoch": 1.93, "grad_norm": 4.227514966678838, "learning_rate": 0.00016165984804208066, "loss": 0.5935, "step": 6840 }, { "epoch": 1.93, "grad_norm": 2.3825703295584146, "learning_rate": 0.0001614406779661017, "loss": 0.2733, "step": 6850 }, { "epoch": 1.93, "grad_norm": 1.8576971315426782, "learning_rate": 0.00016122150789012272, "loss": 0.303, "step": 6860 }, { "epoch": 1.94, "grad_norm": 33.6413124083341, "learning_rate": 0.00016100233781414378, "loss": 0.2274, "step": 6870 }, { "epoch": 1.94, "grad_norm": 0.9887468524380643, "learning_rate": 0.0001607831677381648, "loss": 0.2919, "step": 6880 }, { "epoch": 1.94, "grad_norm": 4.991113672687678, "learning_rate": 0.00016056399766218584, "loss": 0.2303, "step": 6890 }, { "epoch": 1.95, "grad_norm": 56.489557072844796, "learning_rate": 0.00016034482758620688, "loss": 0.3899, "step": 6900 }, { "epoch": 1.95, "grad_norm": 2.8149639916154947, "learning_rate": 0.00016012565751022793, "loss": 0.2179, "step": 6910 }, { "epoch": 1.95, "grad_norm": 9.821507341872895, "learning_rate": 0.00015990648743424897, "loss": 0.3172, "step": 6920 }, { "epoch": 1.95, "grad_norm": 2.8569862825069285, "learning_rate": 0.00015968731735827, "loss": 0.256, "step": 6930 }, { "epoch": 1.96, "grad_norm": 2.1928517803266643, "learning_rate": 0.00015946814728229106, "loss": 0.1468, "step": 6940 }, { "epoch": 1.96, "grad_norm": 8.161581000646946, "learning_rate": 0.0001592489772063121, "loss": 0.3608, "step": 6950 }, { "epoch": 1.96, "grad_norm": 1.8284875998450847, "learning_rate": 0.00015902980713033312, "loss": 0.2207, "step": 6960 }, { "epoch": 1.97, "grad_norm": 3.6951898202003726, "learning_rate": 0.00015881063705435415, "loss": 0.2749, "step": 6970 }, { "epoch": 1.97, "grad_norm": 17.687512857327995, "learning_rate": 0.0001585914669783752, "loss": 0.2825, "step": 6980 }, { "epoch": 1.97, "grad_norm": 4.61555546951409, "learning_rate": 0.00015837229690239624, "loss": 0.3753, "step": 6990 }, { "epoch": 1.97, "grad_norm": 28.47716869865466, "learning_rate": 0.00015815312682641728, "loss": 0.3437, "step": 7000 }, { "epoch": 1.98, "grad_norm": 9.853541461506175, "learning_rate": 0.00015793395675043834, "loss": 0.2261, "step": 7010 }, { "epoch": 1.98, "grad_norm": 3.150395806350278, "learning_rate": 0.00015771478667445937, "loss": 0.3094, "step": 7020 }, { "epoch": 1.98, "grad_norm": 8.382086348656976, "learning_rate": 0.0001574956165984804, "loss": 0.3093, "step": 7030 }, { "epoch": 1.98, "grad_norm": 2.722468973867923, "learning_rate": 0.00015727644652250143, "loss": 0.1783, "step": 7040 }, { "epoch": 1.99, "grad_norm": 7.546076976068019, "learning_rate": 0.0001570572764465225, "loss": 0.1107, "step": 7050 }, { "epoch": 1.99, "grad_norm": 20.5642182254047, "learning_rate": 0.00015683810637054352, "loss": 0.4277, "step": 7060 }, { "epoch": 1.99, "grad_norm": 23.175588346263925, "learning_rate": 0.00015661893629456455, "loss": 0.4047, "step": 7070 }, { "epoch": 2.0, "grad_norm": 16.76827959394083, "learning_rate": 0.00015639976621858561, "loss": 0.2191, "step": 7080 }, { "epoch": 2.0, "grad_norm": 39.47975455838656, "learning_rate": 0.00015618059614260665, "loss": 0.3542, "step": 7090 }, { "epoch": 2.0, "eval_0_f1": 0.7521064301552106, "eval_0_precision": 0.7848218417399352, "eval_0_recall": 0.7220093656875266, "eval_1_f1": 0.9185843285755897, "eval_1_precision": 0.9061781609195402, "eval_1_recall": 0.9313349084465445, "eval_accuracy": 0.8774257208639403, "eval_loss": 0.35205078125, "eval_runtime": 546.1666, "eval_samples_per_second": 16.7, "eval_steps_per_second": 2.785, "step": 7094 }, { "epoch": 2.0, "grad_norm": 3.9467741467334223, "learning_rate": 0.00015596142606662768, "loss": 0.185, "step": 7100 }, { "epoch": 2.0, "grad_norm": 2.796842409513772, "learning_rate": 0.00015574225599064874, "loss": 0.1507, "step": 7110 }, { "epoch": 2.01, "grad_norm": 1.4260048578862903, "learning_rate": 0.00015552308591466977, "loss": 0.1149, "step": 7120 }, { "epoch": 2.01, "grad_norm": 1.5807242351519994, "learning_rate": 0.0001553039158386908, "loss": 0.0837, "step": 7130 }, { "epoch": 2.01, "grad_norm": 0.04513928456377307, "learning_rate": 0.00015508474576271183, "loss": 0.1494, "step": 7140 }, { "epoch": 2.02, "grad_norm": 40.003470604804754, "learning_rate": 0.0001548655756867329, "loss": 0.1058, "step": 7150 }, { "epoch": 2.02, "grad_norm": 5.841175038437886, "learning_rate": 0.00015464640561075392, "loss": 0.1506, "step": 7160 }, { "epoch": 2.02, "grad_norm": 7.276698167932587, "learning_rate": 0.00015442723553477496, "loss": 0.0631, "step": 7170 }, { "epoch": 2.02, "grad_norm": 1.3902255476639265, "learning_rate": 0.00015420806545879602, "loss": 0.0569, "step": 7180 }, { "epoch": 2.03, "grad_norm": 8.321999591495654, "learning_rate": 0.00015398889538281705, "loss": 0.2596, "step": 7190 }, { "epoch": 2.03, "grad_norm": 4.274705509444957, "learning_rate": 0.00015376972530683808, "loss": 0.0755, "step": 7200 }, { "epoch": 2.03, "grad_norm": 14.284605361939498, "learning_rate": 0.0001535505552308591, "loss": 0.0506, "step": 7210 }, { "epoch": 2.04, "grad_norm": 1.2721793288961767, "learning_rate": 0.00015333138515488017, "loss": 0.1444, "step": 7220 }, { "epoch": 2.04, "grad_norm": 10.887784732379894, "learning_rate": 0.0001531122150789012, "loss": 0.0952, "step": 7230 }, { "epoch": 2.04, "grad_norm": 0.33776382671575805, "learning_rate": 0.00015289304500292223, "loss": 0.1503, "step": 7240 }, { "epoch": 2.04, "grad_norm": 0.7362979177108379, "learning_rate": 0.00015267387492694332, "loss": 0.0826, "step": 7250 }, { "epoch": 2.05, "grad_norm": 12.73307715279125, "learning_rate": 0.00015245470485096433, "loss": 0.2583, "step": 7260 }, { "epoch": 2.05, "grad_norm": 38.30889802059039, "learning_rate": 0.00015223553477498536, "loss": 0.2281, "step": 7270 }, { "epoch": 2.05, "grad_norm": 21.730855964037335, "learning_rate": 0.00015201636469900644, "loss": 0.3252, "step": 7280 }, { "epoch": 2.06, "grad_norm": 19.535045567591606, "learning_rate": 0.00015179719462302748, "loss": 0.1412, "step": 7290 }, { "epoch": 2.06, "grad_norm": 6.082430108448023, "learning_rate": 0.00015157802454704848, "loss": 0.1301, "step": 7300 }, { "epoch": 2.06, "grad_norm": 3.5121677910383875, "learning_rate": 0.0001513588544710695, "loss": 0.2337, "step": 7310 }, { "epoch": 2.06, "grad_norm": 18.706966962801445, "learning_rate": 0.0001511396843950906, "loss": 0.0847, "step": 7320 }, { "epoch": 2.07, "grad_norm": 1.4783230754439916, "learning_rate": 0.00015092051431911163, "loss": 0.1419, "step": 7330 }, { "epoch": 2.07, "grad_norm": 2.9639753705286136, "learning_rate": 0.00015070134424313264, "loss": 0.0583, "step": 7340 }, { "epoch": 2.07, "grad_norm": 23.160696704392283, "learning_rate": 0.00015048217416715372, "loss": 0.2117, "step": 7350 }, { "epoch": 2.07, "grad_norm": 12.771447911890823, "learning_rate": 0.00015026300409117475, "loss": 0.0548, "step": 7360 }, { "epoch": 2.08, "grad_norm": 11.531079583730829, "learning_rate": 0.00015004383401519579, "loss": 0.1, "step": 7370 }, { "epoch": 2.08, "grad_norm": 1.5725191403592071, "learning_rate": 0.00014982466393921682, "loss": 0.0763, "step": 7380 }, { "epoch": 2.08, "grad_norm": 19.62591146141424, "learning_rate": 0.00014960549386323785, "loss": 0.2172, "step": 7390 }, { "epoch": 2.09, "grad_norm": 4.328008880202292, "learning_rate": 0.0001493863237872589, "loss": 0.1791, "step": 7400 }, { "epoch": 2.09, "grad_norm": 2.542706232558499, "learning_rate": 0.00014916715371127994, "loss": 0.1069, "step": 7410 }, { "epoch": 2.09, "grad_norm": 5.01099611371998, "learning_rate": 0.00014894798363530097, "loss": 0.0867, "step": 7420 }, { "epoch": 2.09, "grad_norm": 1.4225594957009309, "learning_rate": 0.00014872881355932203, "loss": 0.1617, "step": 7430 }, { "epoch": 2.1, "grad_norm": 1.4366748859319889, "learning_rate": 0.00014850964348334306, "loss": 0.1247, "step": 7440 }, { "epoch": 2.1, "grad_norm": 0.4798934066028662, "learning_rate": 0.0001482904734073641, "loss": 0.1457, "step": 7450 }, { "epoch": 2.1, "grad_norm": 0.8614048372726179, "learning_rate": 0.00014807130333138516, "loss": 0.1488, "step": 7460 }, { "epoch": 2.11, "grad_norm": 1.5048632731892742, "learning_rate": 0.0001478521332554062, "loss": 0.0832, "step": 7470 }, { "epoch": 2.11, "grad_norm": 2.6843269650855808, "learning_rate": 0.00014763296317942722, "loss": 0.0961, "step": 7480 }, { "epoch": 2.11, "grad_norm": 16.8741628191936, "learning_rate": 0.00014741379310344825, "loss": 0.2136, "step": 7490 }, { "epoch": 2.11, "grad_norm": 0.807139340475033, "learning_rate": 0.0001471946230274693, "loss": 0.1095, "step": 7500 }, { "epoch": 2.12, "grad_norm": 2.2184905450148986, "learning_rate": 0.00014697545295149034, "loss": 0.1902, "step": 7510 }, { "epoch": 2.12, "grad_norm": 5.771979829819086, "learning_rate": 0.00014675628287551137, "loss": 0.138, "step": 7520 }, { "epoch": 2.12, "grad_norm": 5.97570946689331, "learning_rate": 0.00014653711279953243, "loss": 0.21, "step": 7530 }, { "epoch": 2.13, "grad_norm": 5.577046113368879, "learning_rate": 0.00014631794272355347, "loss": 0.2296, "step": 7540 }, { "epoch": 2.13, "grad_norm": 0.16147618396834, "learning_rate": 0.0001460987726475745, "loss": 0.1041, "step": 7550 }, { "epoch": 2.13, "grad_norm": 1.3482389383340438, "learning_rate": 0.00014587960257159553, "loss": 0.0832, "step": 7560 }, { "epoch": 2.13, "grad_norm": 5.133780628362138, "learning_rate": 0.0001456604324956166, "loss": 0.0916, "step": 7570 }, { "epoch": 2.14, "grad_norm": 2.729454474197146, "learning_rate": 0.00014544126241963762, "loss": 0.1806, "step": 7580 }, { "epoch": 2.14, "grad_norm": 2.72773715656119, "learning_rate": 0.00014522209234365865, "loss": 0.0647, "step": 7590 }, { "epoch": 2.14, "grad_norm": 3.507884957259747, "learning_rate": 0.0001450248392752776, "loss": 0.2701, "step": 7600 }, { "epoch": 2.15, "grad_norm": 19.081923498359, "learning_rate": 0.00014480566919929864, "loss": 0.0962, "step": 7610 }, { "epoch": 2.15, "grad_norm": 45.75244080209308, "learning_rate": 0.00014458649912331968, "loss": 0.2713, "step": 7620 }, { "epoch": 2.15, "grad_norm": 6.360305569438668, "learning_rate": 0.00014436732904734073, "loss": 0.0377, "step": 7630 }, { "epoch": 2.15, "grad_norm": 11.812868187605755, "learning_rate": 0.00014414815897136177, "loss": 0.0538, "step": 7640 }, { "epoch": 2.16, "grad_norm": 11.581826556875212, "learning_rate": 0.0001439289888953828, "loss": 0.1113, "step": 7650 }, { "epoch": 2.16, "grad_norm": 3.0769487150537067, "learning_rate": 0.00014370981881940383, "loss": 0.1045, "step": 7660 }, { "epoch": 2.16, "grad_norm": 19.551871190286114, "learning_rate": 0.0001434906487434249, "loss": 0.1375, "step": 7670 }, { "epoch": 2.17, "grad_norm": 4.0482427664543925, "learning_rate": 0.00014327147866744592, "loss": 0.1863, "step": 7680 }, { "epoch": 2.17, "grad_norm": 3.1977613453815654, "learning_rate": 0.00014305230859146695, "loss": 0.0756, "step": 7690 }, { "epoch": 2.17, "grad_norm": 0.9737985079221272, "learning_rate": 0.000142833138515488, "loss": 0.0916, "step": 7700 }, { "epoch": 2.17, "grad_norm": 1.984146795188883, "learning_rate": 0.00014261396843950904, "loss": 0.2625, "step": 7710 }, { "epoch": 2.18, "grad_norm": 1.72026076920096, "learning_rate": 0.00014239479836353008, "loss": 0.1073, "step": 7720 }, { "epoch": 2.18, "grad_norm": 0.555301240735939, "learning_rate": 0.00014217562828755114, "loss": 0.0962, "step": 7730 }, { "epoch": 2.18, "grad_norm": 12.306241433553296, "learning_rate": 0.00014195645821157217, "loss": 0.312, "step": 7740 }, { "epoch": 2.18, "grad_norm": 39.31295240290497, "learning_rate": 0.0001417372881355932, "loss": 0.1382, "step": 7750 }, { "epoch": 2.19, "grad_norm": 18.239083580266996, "learning_rate": 0.00014151811805961423, "loss": 0.1959, "step": 7760 }, { "epoch": 2.19, "grad_norm": 3.6063732707629277, "learning_rate": 0.0001412989479836353, "loss": 0.1882, "step": 7770 }, { "epoch": 2.19, "grad_norm": 22.59443633796688, "learning_rate": 0.00014107977790765632, "loss": 0.1411, "step": 7780 }, { "epoch": 2.2, "grad_norm": 9.943061970841525, "learning_rate": 0.00014086060783167738, "loss": 0.2073, "step": 7790 }, { "epoch": 2.2, "grad_norm": 13.334986026618791, "learning_rate": 0.00014064143775569841, "loss": 0.1452, "step": 7800 }, { "epoch": 2.2, "grad_norm": 18.672479936744026, "learning_rate": 0.00014042226767971945, "loss": 0.1604, "step": 7810 }, { "epoch": 2.2, "grad_norm": 8.209768975033892, "learning_rate": 0.0001402030976037405, "loss": 0.0633, "step": 7820 }, { "epoch": 2.21, "grad_norm": 7.929920880167193, "learning_rate": 0.00013998392752776154, "loss": 0.0759, "step": 7830 }, { "epoch": 2.21, "grad_norm": 5.155314191153851, "learning_rate": 0.00013976475745178257, "loss": 0.1334, "step": 7840 }, { "epoch": 2.21, "grad_norm": 12.293469154578297, "learning_rate": 0.00013954558737580363, "loss": 0.1731, "step": 7850 }, { "epoch": 2.22, "grad_norm": 27.793799163143525, "learning_rate": 0.00013932641729982466, "loss": 0.1718, "step": 7860 }, { "epoch": 2.22, "grad_norm": 18.160732235536603, "learning_rate": 0.0001391072472238457, "loss": 0.3889, "step": 7870 }, { "epoch": 2.22, "grad_norm": 6.719945966655997, "learning_rate": 0.00013888807714786672, "loss": 0.0903, "step": 7880 }, { "epoch": 2.22, "grad_norm": 4.702233374552434, "learning_rate": 0.00013866890707188778, "loss": 0.1029, "step": 7890 }, { "epoch": 2.23, "grad_norm": 1.6814706649468594, "learning_rate": 0.00013844973699590882, "loss": 0.1609, "step": 7900 }, { "epoch": 2.23, "grad_norm": 35.183209348221745, "learning_rate": 0.00013823056691992985, "loss": 0.0987, "step": 7910 }, { "epoch": 2.23, "grad_norm": 0.3853650828148091, "learning_rate": 0.0001380113968439509, "loss": 0.092, "step": 7920 }, { "epoch": 2.24, "grad_norm": 2.1900815019481827, "learning_rate": 0.00013779222676797194, "loss": 0.0721, "step": 7930 }, { "epoch": 2.24, "grad_norm": 2.719013304384676, "learning_rate": 0.00013757305669199297, "loss": 0.2757, "step": 7940 }, { "epoch": 2.24, "grad_norm": 1.9065891840102538, "learning_rate": 0.000137353886616014, "loss": 0.0894, "step": 7950 }, { "epoch": 2.24, "grad_norm": 0.7041589905973044, "learning_rate": 0.00013713471654003506, "loss": 0.1567, "step": 7960 }, { "epoch": 2.25, "grad_norm": 13.09370538833684, "learning_rate": 0.0001369155464640561, "loss": 0.1153, "step": 7970 }, { "epoch": 2.25, "grad_norm": 0.10005792474473749, "learning_rate": 0.00013669637638807713, "loss": 0.0889, "step": 7980 }, { "epoch": 2.25, "grad_norm": 8.650143720385165, "learning_rate": 0.00013647720631209818, "loss": 0.102, "step": 7990 }, { "epoch": 2.26, "grad_norm": 13.834552459976706, "learning_rate": 0.00013625803623611922, "loss": 0.1991, "step": 8000 }, { "epoch": 2.26, "grad_norm": 7.1136160697882636, "learning_rate": 0.00013603886616014025, "loss": 0.1605, "step": 8010 }, { "epoch": 2.26, "grad_norm": 3.409491115278956, "learning_rate": 0.00013581969608416128, "loss": 0.182, "step": 8020 }, { "epoch": 2.26, "grad_norm": 11.603050853164602, "learning_rate": 0.00013560052600818234, "loss": 0.1393, "step": 8030 }, { "epoch": 2.27, "grad_norm": 91.11285901526837, "learning_rate": 0.00013538135593220337, "loss": 0.5593, "step": 8040 }, { "epoch": 2.27, "grad_norm": 9.002577439722662, "learning_rate": 0.0001351621858562244, "loss": 0.2636, "step": 8050 }, { "epoch": 2.27, "grad_norm": 2.8423642348776945, "learning_rate": 0.00013494301578024546, "loss": 0.2049, "step": 8060 }, { "epoch": 2.28, "grad_norm": 9.017487980947873, "learning_rate": 0.0001347238457042665, "loss": 0.2001, "step": 8070 }, { "epoch": 2.28, "grad_norm": 5.463991248637368, "learning_rate": 0.00013450467562828753, "loss": 0.1163, "step": 8080 }, { "epoch": 2.28, "grad_norm": 35.9490561961924, "learning_rate": 0.00013428550555230859, "loss": 0.1705, "step": 8090 }, { "epoch": 2.28, "grad_norm": 1.5214346506002336, "learning_rate": 0.00013406633547632962, "loss": 0.1706, "step": 8100 }, { "epoch": 2.29, "grad_norm": 6.553206900744027, "learning_rate": 0.00013384716540035068, "loss": 0.1406, "step": 8110 }, { "epoch": 2.29, "grad_norm": 10.219338883484347, "learning_rate": 0.00013362799532437168, "loss": 0.2204, "step": 8120 }, { "epoch": 2.29, "grad_norm": 18.748997475525492, "learning_rate": 0.00013340882524839274, "loss": 0.2109, "step": 8130 }, { "epoch": 2.29, "grad_norm": 1.3727170176969377, "learning_rate": 0.0001331896551724138, "loss": 0.1122, "step": 8140 }, { "epoch": 2.3, "grad_norm": 10.948203912180993, "learning_rate": 0.00013297048509643483, "loss": 0.3022, "step": 8150 }, { "epoch": 2.3, "grad_norm": 2.056636685995259, "learning_rate": 0.00013275131502045586, "loss": 0.239, "step": 8160 }, { "epoch": 2.3, "grad_norm": 6.518651435362685, "learning_rate": 0.0001325321449444769, "loss": 0.3234, "step": 8170 }, { "epoch": 2.31, "grad_norm": 5.475635275763899, "learning_rate": 0.00013231297486849796, "loss": 0.2198, "step": 8180 }, { "epoch": 2.31, "grad_norm": 3.671883643297412, "learning_rate": 0.000132093804792519, "loss": 0.1707, "step": 8190 }, { "epoch": 2.31, "grad_norm": 0.4579375750027517, "learning_rate": 0.00013187463471654002, "loss": 0.2133, "step": 8200 }, { "epoch": 2.31, "grad_norm": 5.194066447141225, "learning_rate": 0.00013165546464056108, "loss": 0.2439, "step": 8210 }, { "epoch": 2.32, "grad_norm": 10.494958205761325, "learning_rate": 0.0001314362945645821, "loss": 0.244, "step": 8220 }, { "epoch": 2.32, "grad_norm": 3.2710044167508534, "learning_rate": 0.00013121712448860314, "loss": 0.1088, "step": 8230 }, { "epoch": 2.32, "grad_norm": 16.056832749725743, "learning_rate": 0.00013099795441262417, "loss": 0.2221, "step": 8240 }, { "epoch": 2.33, "grad_norm": 0.7672515772397378, "learning_rate": 0.00013077878433664523, "loss": 0.0554, "step": 8250 }, { "epoch": 2.33, "grad_norm": 21.726819743293646, "learning_rate": 0.00013055961426066627, "loss": 0.1701, "step": 8260 }, { "epoch": 2.33, "grad_norm": 11.287063948392506, "learning_rate": 0.0001303404441846873, "loss": 0.2342, "step": 8270 }, { "epoch": 2.33, "grad_norm": 22.058917311910815, "learning_rate": 0.00013012127410870836, "loss": 0.163, "step": 8280 }, { "epoch": 2.34, "grad_norm": 3.2943302484351142, "learning_rate": 0.0001299021040327294, "loss": 0.1142, "step": 8290 }, { "epoch": 2.34, "grad_norm": 41.01468692106424, "learning_rate": 0.00012968293395675042, "loss": 0.1652, "step": 8300 }, { "epoch": 2.34, "grad_norm": 15.110863439212581, "learning_rate": 0.00012946376388077145, "loss": 0.2891, "step": 8310 }, { "epoch": 2.35, "grad_norm": 4.692836606354725, "learning_rate": 0.0001292445938047925, "loss": 0.0777, "step": 8320 }, { "epoch": 2.35, "grad_norm": 42.35071301518718, "learning_rate": 0.00012902542372881354, "loss": 0.2289, "step": 8330 }, { "epoch": 2.35, "grad_norm": 3.3851205501937085, "learning_rate": 0.00012880625365283458, "loss": 0.0574, "step": 8340 }, { "epoch": 2.35, "grad_norm": 32.382836448835434, "learning_rate": 0.00012858708357685564, "loss": 0.2681, "step": 8350 }, { "epoch": 2.36, "grad_norm": 11.990285837961236, "learning_rate": 0.00012836791350087667, "loss": 0.1145, "step": 8360 }, { "epoch": 2.36, "grad_norm": 6.231424331853902, "learning_rate": 0.0001281487434248977, "loss": 0.1645, "step": 8370 }, { "epoch": 2.36, "grad_norm": 8.823978019308194, "learning_rate": 0.00012792957334891876, "loss": 0.0895, "step": 8380 }, { "epoch": 2.37, "grad_norm": 3.5791569065379147, "learning_rate": 0.0001277104032729398, "loss": 0.2226, "step": 8390 }, { "epoch": 2.37, "grad_norm": 4.904760306159147, "learning_rate": 0.00012749123319696082, "loss": 0.1586, "step": 8400 }, { "epoch": 2.37, "grad_norm": 35.790471287396194, "learning_rate": 0.00012727206312098185, "loss": 0.1451, "step": 8410 }, { "epoch": 2.37, "grad_norm": 19.49158058941717, "learning_rate": 0.0001270528930450029, "loss": 0.1129, "step": 8420 }, { "epoch": 2.38, "grad_norm": 1.2301380360175656, "learning_rate": 0.00012683372296902397, "loss": 0.2055, "step": 8430 }, { "epoch": 2.38, "grad_norm": 6.081015675249448, "learning_rate": 0.00012661455289304498, "loss": 0.1039, "step": 8440 }, { "epoch": 2.38, "grad_norm": 8.51374019556884, "learning_rate": 0.00012639538281706604, "loss": 0.0764, "step": 8450 }, { "epoch": 2.39, "grad_norm": 22.61234712969463, "learning_rate": 0.00012617621274108707, "loss": 0.1144, "step": 8460 }, { "epoch": 2.39, "grad_norm": 16.325546182379608, "learning_rate": 0.00012595704266510813, "loss": 0.2252, "step": 8470 }, { "epoch": 2.39, "grad_norm": 10.684407579259915, "learning_rate": 0.00012573787258912916, "loss": 0.1617, "step": 8480 }, { "epoch": 2.39, "grad_norm": 18.8401359355114, "learning_rate": 0.0001255187025131502, "loss": 0.1327, "step": 8490 }, { "epoch": 2.4, "grad_norm": 4.428339354625936, "learning_rate": 0.00012529953243717125, "loss": 0.2793, "step": 8500 }, { "epoch": 2.4, "grad_norm": 34.97712900138805, "learning_rate": 0.00012508036236119228, "loss": 0.1734, "step": 8510 }, { "epoch": 2.4, "grad_norm": 10.842732736668664, "learning_rate": 0.00012486119228521331, "loss": 0.172, "step": 8520 }, { "epoch": 2.4, "grad_norm": 3.8204570700978753, "learning_rate": 0.00012464202220923435, "loss": 0.1893, "step": 8530 }, { "epoch": 2.41, "grad_norm": 1.7847088171149714, "learning_rate": 0.0001244228521332554, "loss": 0.1119, "step": 8540 }, { "epoch": 2.41, "grad_norm": 10.013026009815832, "learning_rate": 0.00012420368205727644, "loss": 0.1488, "step": 8550 }, { "epoch": 2.41, "grad_norm": 0.9956055302547419, "learning_rate": 0.00012398451198129747, "loss": 0.167, "step": 8560 }, { "epoch": 2.42, "grad_norm": 15.708190043930621, "learning_rate": 0.00012376534190531853, "loss": 0.103, "step": 8570 }, { "epoch": 2.42, "grad_norm": 9.516127340363248, "learning_rate": 0.00012354617182933956, "loss": 0.1761, "step": 8580 }, { "epoch": 2.42, "grad_norm": 11.289621429730468, "learning_rate": 0.0001233270017533606, "loss": 0.21, "step": 8590 }, { "epoch": 2.42, "grad_norm": 6.438699785103895, "learning_rate": 0.00012310783167738162, "loss": 0.1212, "step": 8600 }, { "epoch": 2.43, "grad_norm": 0.4291084022368479, "learning_rate": 0.00012288866160140268, "loss": 0.1315, "step": 8610 }, { "epoch": 2.43, "grad_norm": 3.090543654415638, "learning_rate": 0.00012266949152542372, "loss": 0.0689, "step": 8620 }, { "epoch": 2.43, "grad_norm": 0.47917246377381595, "learning_rate": 0.00012245032144944475, "loss": 0.1119, "step": 8630 }, { "epoch": 2.44, "grad_norm": 0.7069329797066186, "learning_rate": 0.0001222311513734658, "loss": 0.0562, "step": 8640 }, { "epoch": 2.44, "grad_norm": 18.96685701324762, "learning_rate": 0.00012201198129748684, "loss": 0.1177, "step": 8650 }, { "epoch": 2.44, "grad_norm": 0.00279294620786177, "learning_rate": 0.00012179281122150788, "loss": 0.0998, "step": 8660 }, { "epoch": 2.44, "grad_norm": 6.199666064354547, "learning_rate": 0.00012157364114552893, "loss": 0.083, "step": 8670 }, { "epoch": 2.45, "grad_norm": 0.639057585376392, "learning_rate": 0.00012135447106954996, "loss": 0.0941, "step": 8680 }, { "epoch": 2.45, "grad_norm": 0.15447864197247607, "learning_rate": 0.0001211572180011689, "loss": 0.1944, "step": 8690 }, { "epoch": 2.45, "grad_norm": 0.8173758905341566, "learning_rate": 0.00012093804792518993, "loss": 0.1019, "step": 8700 }, { "epoch": 2.46, "grad_norm": 3.243981396595975, "learning_rate": 0.00012071887784921097, "loss": 0.3167, "step": 8710 }, { "epoch": 2.46, "grad_norm": 3.4464993538031634, "learning_rate": 0.00012049970777323202, "loss": 0.1482, "step": 8720 }, { "epoch": 2.46, "grad_norm": 23.425856884504597, "learning_rate": 0.00012028053769725305, "loss": 0.2159, "step": 8730 }, { "epoch": 2.46, "grad_norm": 18.894344998479365, "learning_rate": 0.00012006136762127411, "loss": 0.3282, "step": 8740 }, { "epoch": 2.47, "grad_norm": 4.015044349522744, "learning_rate": 0.00011984219754529513, "loss": 0.1495, "step": 8750 }, { "epoch": 2.47, "grad_norm": 0.456213488330113, "learning_rate": 0.00011962302746931619, "loss": 0.1598, "step": 8760 }, { "epoch": 2.47, "grad_norm": 13.704116606800925, "learning_rate": 0.00011940385739333723, "loss": 0.1294, "step": 8770 }, { "epoch": 2.48, "grad_norm": 7.3368094990394175, "learning_rate": 0.00011918468731735826, "loss": 0.1551, "step": 8780 }, { "epoch": 2.48, "grad_norm": 1.0015347738020366, "learning_rate": 0.00011896551724137931, "loss": 0.1037, "step": 8790 }, { "epoch": 2.48, "grad_norm": 4.798234813041826, "learning_rate": 0.00011874634716540034, "loss": 0.0762, "step": 8800 }, { "epoch": 2.48, "grad_norm": 0.46732247838464797, "learning_rate": 0.00011852717708942139, "loss": 0.2361, "step": 8810 }, { "epoch": 2.49, "grad_norm": 0.78510526101886, "learning_rate": 0.00011830800701344242, "loss": 0.1172, "step": 8820 }, { "epoch": 2.49, "grad_norm": 14.754170828495456, "learning_rate": 0.00011808883693746346, "loss": 0.1337, "step": 8830 }, { "epoch": 2.49, "grad_norm": 5.457626503330071, "learning_rate": 0.00011786966686148451, "loss": 0.1238, "step": 8840 }, { "epoch": 2.5, "grad_norm": 24.359633554477504, "learning_rate": 0.00011765049678550554, "loss": 0.159, "step": 8850 }, { "epoch": 2.5, "grad_norm": 4.052116414721034, "learning_rate": 0.00011743132670952659, "loss": 0.1946, "step": 8860 }, { "epoch": 2.5, "grad_norm": 10.286136518184552, "learning_rate": 0.00011721215663354762, "loss": 0.0675, "step": 8870 }, { "epoch": 2.5, "grad_norm": 0.7798842992321797, "learning_rate": 0.00011699298655756866, "loss": 0.0518, "step": 8880 }, { "epoch": 2.51, "grad_norm": 3.1590348601862037, "learning_rate": 0.00011677381648158971, "loss": 0.237, "step": 8890 }, { "epoch": 2.51, "grad_norm": 9.309421761709203, "learning_rate": 0.00011655464640561074, "loss": 0.1237, "step": 8900 }, { "epoch": 2.51, "grad_norm": 7.223449459613724, "learning_rate": 0.00011633547632963179, "loss": 0.1144, "step": 8910 }, { "epoch": 2.51, "grad_norm": 2.293633045983554, "learning_rate": 0.00011611630625365282, "loss": 0.1469, "step": 8920 }, { "epoch": 2.52, "grad_norm": 11.619319474508913, "learning_rate": 0.00011589713617767387, "loss": 0.1324, "step": 8930 }, { "epoch": 2.52, "grad_norm": 8.237900621376555, "learning_rate": 0.00011567796610169491, "loss": 0.1019, "step": 8940 }, { "epoch": 2.52, "grad_norm": 3.4844703517603746, "learning_rate": 0.00011545879602571594, "loss": 0.1582, "step": 8950 }, { "epoch": 2.53, "grad_norm": 48.74139317560625, "learning_rate": 0.00011523962594973699, "loss": 0.243, "step": 8960 }, { "epoch": 2.53, "grad_norm": 32.91098913412278, "learning_rate": 0.00011502045587375802, "loss": 0.153, "step": 8970 }, { "epoch": 2.53, "grad_norm": 5.659700047857308, "learning_rate": 0.00011480128579777907, "loss": 0.0843, "step": 8980 }, { "epoch": 2.53, "grad_norm": 22.35388198625644, "learning_rate": 0.0001145821157218001, "loss": 0.1841, "step": 8990 }, { "epoch": 2.54, "grad_norm": 5.24175893236962, "learning_rate": 0.00011436294564582114, "loss": 0.1452, "step": 9000 }, { "epoch": 2.54, "grad_norm": 5.865583240157655, "learning_rate": 0.00011414377556984219, "loss": 0.1757, "step": 9010 }, { "epoch": 2.54, "grad_norm": 16.96991984978489, "learning_rate": 0.00011392460549386322, "loss": 0.2905, "step": 9020 }, { "epoch": 2.55, "grad_norm": 1.4459460915714275, "learning_rate": 0.00011370543541788427, "loss": 0.0953, "step": 9030 }, { "epoch": 2.55, "grad_norm": 0.27775375444037353, "learning_rate": 0.0001134862653419053, "loss": 0.0792, "step": 9040 }, { "epoch": 2.55, "grad_norm": 1.1397011386751719, "learning_rate": 0.00011326709526592634, "loss": 0.0971, "step": 9050 }, { "epoch": 2.55, "grad_norm": 32.740624235968234, "learning_rate": 0.0001130479251899474, "loss": 0.152, "step": 9060 }, { "epoch": 2.56, "grad_norm": 15.666578971132482, "learning_rate": 0.00011282875511396842, "loss": 0.2745, "step": 9070 }, { "epoch": 2.56, "grad_norm": 0.6312408815420002, "learning_rate": 0.00011260958503798948, "loss": 0.0736, "step": 9080 }, { "epoch": 2.56, "grad_norm": 0.9882525535102352, "learning_rate": 0.0001123904149620105, "loss": 0.1197, "step": 9090 }, { "epoch": 2.57, "grad_norm": 22.45196464336915, "learning_rate": 0.00011217124488603156, "loss": 0.0746, "step": 9100 }, { "epoch": 2.57, "grad_norm": 0.4466669958671361, "learning_rate": 0.00011195207481005258, "loss": 0.026, "step": 9110 }, { "epoch": 2.57, "grad_norm": 18.22674384805627, "learning_rate": 0.00011173290473407364, "loss": 0.1488, "step": 9120 }, { "epoch": 2.57, "grad_norm": 3.2075642222454324, "learning_rate": 0.00011151373465809468, "loss": 0.271, "step": 9130 }, { "epoch": 2.58, "grad_norm": 8.45175655576022, "learning_rate": 0.00011129456458211571, "loss": 0.0749, "step": 9140 }, { "epoch": 2.58, "grad_norm": 13.874834426706034, "learning_rate": 0.00011107539450613676, "loss": 0.0782, "step": 9150 }, { "epoch": 2.58, "grad_norm": 0.9676566458873671, "learning_rate": 0.00011085622443015779, "loss": 0.0905, "step": 9160 }, { "epoch": 2.59, "grad_norm": 5.621065616371578, "learning_rate": 0.00011063705435417884, "loss": 0.0798, "step": 9170 }, { "epoch": 2.59, "grad_norm": 7.042618733836522, "learning_rate": 0.00011041788427819988, "loss": 0.2499, "step": 9180 }, { "epoch": 2.59, "grad_norm": 2.4221807383973646, "learning_rate": 0.00011019871420222091, "loss": 0.0507, "step": 9190 }, { "epoch": 2.59, "grad_norm": 24.467155852219083, "learning_rate": 0.00010997954412624196, "loss": 0.1741, "step": 9200 }, { "epoch": 2.6, "grad_norm": 4.428974769529165, "learning_rate": 0.00010976037405026299, "loss": 0.1104, "step": 9210 }, { "epoch": 2.6, "grad_norm": 20.92148868212674, "learning_rate": 0.00010954120397428404, "loss": 0.0914, "step": 9220 }, { "epoch": 2.6, "grad_norm": 0.30038070281462703, "learning_rate": 0.00010932203389830507, "loss": 0.1936, "step": 9230 }, { "epoch": 2.61, "grad_norm": 48.36362550140161, "learning_rate": 0.00010910286382232611, "loss": 0.3639, "step": 9240 }, { "epoch": 2.61, "grad_norm": 8.15759688958997, "learning_rate": 0.00010888369374634716, "loss": 0.1991, "step": 9250 }, { "epoch": 2.61, "grad_norm": 12.841408835810743, "learning_rate": 0.00010866452367036819, "loss": 0.1441, "step": 9260 }, { "epoch": 2.61, "grad_norm": 13.483453911295381, "learning_rate": 0.00010844535359438924, "loss": 0.0981, "step": 9270 }, { "epoch": 2.62, "grad_norm": 0.5843792074816087, "learning_rate": 0.00010822618351841027, "loss": 0.2757, "step": 9280 }, { "epoch": 2.62, "grad_norm": 7.822943624957112, "learning_rate": 0.00010800701344243132, "loss": 0.1102, "step": 9290 }, { "epoch": 2.62, "grad_norm": 35.655682175617585, "learning_rate": 0.00010778784336645236, "loss": 0.2879, "step": 9300 }, { "epoch": 2.62, "grad_norm": 1.429017224224025, "learning_rate": 0.0001075686732904734, "loss": 0.0986, "step": 9310 }, { "epoch": 2.63, "grad_norm": 8.077860057159654, "learning_rate": 0.00010734950321449444, "loss": 0.2654, "step": 9320 }, { "epoch": 2.63, "grad_norm": 1.2534241595837954, "learning_rate": 0.00010713033313851547, "loss": 0.0941, "step": 9330 }, { "epoch": 2.63, "grad_norm": 0.9501360823975038, "learning_rate": 0.00010691116306253652, "loss": 0.1358, "step": 9340 }, { "epoch": 2.64, "grad_norm": 1.3120476171581812, "learning_rate": 0.00010669199298655756, "loss": 0.1927, "step": 9350 }, { "epoch": 2.64, "grad_norm": 10.128511370932692, "learning_rate": 0.0001064728229105786, "loss": 0.2176, "step": 9360 }, { "epoch": 2.64, "grad_norm": 3.376491562592107, "learning_rate": 0.00010625365283459964, "loss": 0.0673, "step": 9370 }, { "epoch": 2.64, "grad_norm": 0.011877575586390247, "learning_rate": 0.00010603448275862067, "loss": 0.1272, "step": 9380 }, { "epoch": 2.65, "grad_norm": 15.244920750217991, "learning_rate": 0.00010581531268264172, "loss": 0.1012, "step": 9390 }, { "epoch": 2.65, "grad_norm": 0.2705443274155431, "learning_rate": 0.00010559614260666275, "loss": 0.1024, "step": 9400 }, { "epoch": 2.65, "grad_norm": 12.05359833618471, "learning_rate": 0.0001053769725306838, "loss": 0.1826, "step": 9410 }, { "epoch": 2.66, "grad_norm": 9.360989137584955, "learning_rate": 0.00010515780245470485, "loss": 0.205, "step": 9420 }, { "epoch": 2.66, "grad_norm": 6.831707184981156, "learning_rate": 0.00010493863237872587, "loss": 0.2364, "step": 9430 }, { "epoch": 2.66, "grad_norm": 12.68075831146527, "learning_rate": 0.00010471946230274693, "loss": 0.1878, "step": 9440 }, { "epoch": 2.66, "grad_norm": 9.461914667245052, "learning_rate": 0.00010450029222676796, "loss": 0.1061, "step": 9450 }, { "epoch": 2.67, "grad_norm": 33.7830548827646, "learning_rate": 0.00010428112215078901, "loss": 0.0955, "step": 9460 }, { "epoch": 2.67, "grad_norm": 3.41262405773915, "learning_rate": 0.00010406195207481005, "loss": 0.0893, "step": 9470 }, { "epoch": 2.67, "grad_norm": 16.661587161769187, "learning_rate": 0.00010384278199883109, "loss": 0.1281, "step": 9480 }, { "epoch": 2.68, "grad_norm": 19.501609655955452, "learning_rate": 0.00010362361192285213, "loss": 0.3944, "step": 9490 }, { "epoch": 2.68, "grad_norm": 4.601907463118784, "learning_rate": 0.00010340444184687316, "loss": 0.1158, "step": 9500 }, { "epoch": 2.68, "grad_norm": 24.10143095564842, "learning_rate": 0.00010318527177089421, "loss": 0.2357, "step": 9510 }, { "epoch": 2.68, "grad_norm": 4.970899462803766, "learning_rate": 0.00010296610169491524, "loss": 0.1134, "step": 9520 }, { "epoch": 2.69, "grad_norm": 0.04492151846400819, "learning_rate": 0.00010274693161893629, "loss": 0.1146, "step": 9530 }, { "epoch": 2.69, "grad_norm": 7.2274322872988055, "learning_rate": 0.00010252776154295733, "loss": 0.1354, "step": 9540 }, { "epoch": 2.69, "grad_norm": 6.048047676599459, "learning_rate": 0.00010230859146697836, "loss": 0.2284, "step": 9550 }, { "epoch": 2.7, "grad_norm": 0.9638985947560608, "learning_rate": 0.00010208942139099941, "loss": 0.0955, "step": 9560 }, { "epoch": 2.7, "grad_norm": 6.19171074222296, "learning_rate": 0.00010187025131502044, "loss": 0.0909, "step": 9570 }, { "epoch": 2.7, "grad_norm": 22.167114710888278, "learning_rate": 0.00010165108123904149, "loss": 0.1367, "step": 9580 }, { "epoch": 2.7, "grad_norm": 0.3557238646240087, "learning_rate": 0.00010143191116306253, "loss": 0.0825, "step": 9590 }, { "epoch": 2.71, "grad_norm": 23.067425155746133, "learning_rate": 0.00010121274108708357, "loss": 0.0815, "step": 9600 }, { "epoch": 2.71, "grad_norm": 1.3212362814667589, "learning_rate": 0.00010099357101110461, "loss": 0.1454, "step": 9610 }, { "epoch": 2.71, "grad_norm": 9.03334133310524, "learning_rate": 0.00010077440093512564, "loss": 0.1943, "step": 9620 }, { "epoch": 2.71, "grad_norm": 2.1696899004758556, "learning_rate": 0.00010055523085914669, "loss": 0.1105, "step": 9630 }, { "epoch": 2.72, "grad_norm": 3.8419745918801067, "learning_rate": 0.00010033606078316773, "loss": 0.3075, "step": 9640 }, { "epoch": 2.72, "grad_norm": 0.9683867410845369, "learning_rate": 0.00010011689070718877, "loss": 0.1233, "step": 9650 }, { "epoch": 2.72, "grad_norm": 13.53465449736677, "learning_rate": 9.989772063120981e-05, "loss": 0.2396, "step": 9660 }, { "epoch": 2.73, "grad_norm": 1.3390366123084314, "learning_rate": 9.967855055523084e-05, "loss": 0.117, "step": 9670 }, { "epoch": 2.73, "grad_norm": 0.7238054927151057, "learning_rate": 9.945938047925189e-05, "loss": 0.0825, "step": 9680 }, { "epoch": 2.73, "grad_norm": 1.343299376332715, "learning_rate": 9.924021040327292e-05, "loss": 0.1852, "step": 9690 }, { "epoch": 2.73, "grad_norm": 2.7174665471907296, "learning_rate": 9.902104032729397e-05, "loss": 0.185, "step": 9700 }, { "epoch": 2.74, "grad_norm": 19.67625891707943, "learning_rate": 9.880187025131501e-05, "loss": 0.1037, "step": 9710 }, { "epoch": 2.74, "grad_norm": 1.7362888479778698, "learning_rate": 9.858270017533604e-05, "loss": 0.0993, "step": 9720 }, { "epoch": 2.74, "grad_norm": 24.97466231442174, "learning_rate": 9.836353009935709e-05, "loss": 0.3, "step": 9730 }, { "epoch": 2.75, "grad_norm": 0.19223391446010424, "learning_rate": 9.814436002337812e-05, "loss": 0.139, "step": 9740 }, { "epoch": 2.75, "grad_norm": 1.0232214471272263, "learning_rate": 9.792518994739917e-05, "loss": 0.0779, "step": 9750 }, { "epoch": 2.75, "grad_norm": 2.4406875106467685, "learning_rate": 9.770601987142023e-05, "loss": 0.2194, "step": 9760 }, { "epoch": 2.75, "grad_norm": 29.07247504638446, "learning_rate": 9.748684979544126e-05, "loss": 0.0975, "step": 9770 }, { "epoch": 2.76, "grad_norm": 0.7785847782304731, "learning_rate": 9.72676797194623e-05, "loss": 0.1824, "step": 9780 }, { "epoch": 2.76, "grad_norm": 15.90756717391926, "learning_rate": 9.704850964348334e-05, "loss": 0.0997, "step": 9790 }, { "epoch": 2.76, "grad_norm": 0.5434568527038021, "learning_rate": 9.682933956750438e-05, "loss": 0.0639, "step": 9800 }, { "epoch": 2.77, "grad_norm": 0.8181189757985562, "learning_rate": 9.661016949152541e-05, "loss": 0.1401, "step": 9810 }, { "epoch": 2.77, "grad_norm": 0.7002007823686216, "learning_rate": 9.639099941554646e-05, "loss": 0.1317, "step": 9820 }, { "epoch": 2.77, "grad_norm": 10.646847543416506, "learning_rate": 9.61718293395675e-05, "loss": 0.0834, "step": 9830 }, { "epoch": 2.77, "grad_norm": 2.854877487641081, "learning_rate": 9.595265926358854e-05, "loss": 0.094, "step": 9840 }, { "epoch": 2.78, "grad_norm": 21.086749067232077, "learning_rate": 9.573348918760958e-05, "loss": 0.1658, "step": 9850 }, { "epoch": 2.78, "grad_norm": 6.599220856291059, "learning_rate": 9.551431911163061e-05, "loss": 0.1013, "step": 9860 }, { "epoch": 2.78, "grad_norm": 10.557135159098205, "learning_rate": 9.529514903565166e-05, "loss": 0.1954, "step": 9870 }, { "epoch": 2.79, "grad_norm": 9.42143848698557, "learning_rate": 9.50759789596727e-05, "loss": 0.2229, "step": 9880 }, { "epoch": 2.79, "grad_norm": 36.05803797726123, "learning_rate": 9.485680888369374e-05, "loss": 0.1102, "step": 9890 }, { "epoch": 2.79, "grad_norm": 19.750284280798315, "learning_rate": 9.463763880771478e-05, "loss": 0.1737, "step": 9900 }, { "epoch": 2.79, "grad_norm": 20.670824131237584, "learning_rate": 9.441846873173582e-05, "loss": 0.1087, "step": 9910 }, { "epoch": 2.8, "grad_norm": 3.3595758310158126, "learning_rate": 9.419929865575686e-05, "loss": 0.0856, "step": 9920 }, { "epoch": 2.8, "grad_norm": 0.6471151094333392, "learning_rate": 9.398012857977789e-05, "loss": 0.1957, "step": 9930 }, { "epoch": 2.8, "grad_norm": 26.791209319259156, "learning_rate": 9.376095850379894e-05, "loss": 0.1552, "step": 9940 }, { "epoch": 2.81, "grad_norm": 2.0994878103860124, "learning_rate": 9.354178842781998e-05, "loss": 0.0714, "step": 9950 }, { "epoch": 2.81, "grad_norm": 3.939232559831004, "learning_rate": 9.332261835184102e-05, "loss": 0.1463, "step": 9960 }, { "epoch": 2.81, "grad_norm": 1.93177447901502, "learning_rate": 9.310344827586206e-05, "loss": 0.3218, "step": 9970 }, { "epoch": 2.81, "grad_norm": 11.674268724271638, "learning_rate": 9.28842781998831e-05, "loss": 0.121, "step": 9980 }, { "epoch": 2.82, "grad_norm": 29.77892155165882, "learning_rate": 9.266510812390414e-05, "loss": 0.2259, "step": 9990 }, { "epoch": 2.82, "grad_norm": 3.1793474536477495, "learning_rate": 9.244593804792518e-05, "loss": 0.1147, "step": 10000 }, { "epoch": 2.82, "grad_norm": 29.467125692383135, "learning_rate": 9.222676797194622e-05, "loss": 0.1296, "step": 10010 }, { "epoch": 2.82, "grad_norm": 12.829429531674693, "learning_rate": 9.202951490356516e-05, "loss": 0.343, "step": 10020 }, { "epoch": 2.83, "grad_norm": 4.842170823899554, "learning_rate": 9.18103448275862e-05, "loss": 0.1342, "step": 10030 }, { "epoch": 2.83, "grad_norm": 2.8036087435052117, "learning_rate": 9.159117475160724e-05, "loss": 0.1981, "step": 10040 }, { "epoch": 2.83, "grad_norm": 1.2567121612024053, "learning_rate": 9.137200467562828e-05, "loss": 0.1156, "step": 10050 }, { "epoch": 2.84, "grad_norm": 39.13892743280264, "learning_rate": 9.115283459964932e-05, "loss": 0.217, "step": 10060 }, { "epoch": 2.84, "grad_norm": 4.018926730147514, "learning_rate": 9.093366452367036e-05, "loss": 0.1098, "step": 10070 }, { "epoch": 2.84, "grad_norm": 20.370189526716274, "learning_rate": 9.07144944476914e-05, "loss": 0.1131, "step": 10080 }, { "epoch": 2.84, "grad_norm": 12.513349385435367, "learning_rate": 9.049532437171244e-05, "loss": 0.0835, "step": 10090 }, { "epoch": 2.85, "grad_norm": 3.0078713954695693, "learning_rate": 9.027615429573349e-05, "loss": 0.0935, "step": 10100 }, { "epoch": 2.85, "grad_norm": 0.35428391785347213, "learning_rate": 9.005698421975452e-05, "loss": 0.1368, "step": 10110 }, { "epoch": 2.85, "grad_norm": 2.8765836604387487, "learning_rate": 8.983781414377556e-05, "loss": 0.0582, "step": 10120 }, { "epoch": 2.86, "grad_norm": 0.27300803686564074, "learning_rate": 8.96186440677966e-05, "loss": 0.1443, "step": 10130 }, { "epoch": 2.86, "grad_norm": 1.0632557332902792, "learning_rate": 8.939947399181764e-05, "loss": 0.2713, "step": 10140 }, { "epoch": 2.86, "grad_norm": 1.3919283720717666, "learning_rate": 8.918030391583869e-05, "loss": 0.1426, "step": 10150 }, { "epoch": 2.86, "grad_norm": 0.2975889790690355, "learning_rate": 8.896113383985972e-05, "loss": 0.1255, "step": 10160 }, { "epoch": 2.87, "grad_norm": 0.6331908778097588, "learning_rate": 8.874196376388076e-05, "loss": 0.1088, "step": 10170 }, { "epoch": 2.87, "grad_norm": 0.49968429973464584, "learning_rate": 8.85227936879018e-05, "loss": 0.0462, "step": 10180 }, { "epoch": 2.87, "grad_norm": 0.3998283883981307, "learning_rate": 8.830362361192284e-05, "loss": 0.1456, "step": 10190 }, { "epoch": 2.88, "grad_norm": 0.6762860335305919, "learning_rate": 8.808445353594387e-05, "loss": 0.1, "step": 10200 }, { "epoch": 2.88, "grad_norm": 1.5918195856520565, "learning_rate": 8.786528345996492e-05, "loss": 0.0775, "step": 10210 }, { "epoch": 2.88, "grad_norm": 0.47908599501430993, "learning_rate": 8.764611338398598e-05, "loss": 0.1299, "step": 10220 }, { "epoch": 2.88, "grad_norm": 0.02533220752832844, "learning_rate": 8.7426943308007e-05, "loss": 0.1705, "step": 10230 }, { "epoch": 2.89, "grad_norm": 1.6916417760501654, "learning_rate": 8.720777323202806e-05, "loss": 0.3096, "step": 10240 }, { "epoch": 2.89, "grad_norm": 5.6132755915793116, "learning_rate": 8.698860315604907e-05, "loss": 0.14, "step": 10250 }, { "epoch": 2.89, "grad_norm": 3.6946000211637866, "learning_rate": 8.676943308007013e-05, "loss": 0.1283, "step": 10260 }, { "epoch": 2.9, "grad_norm": 1.5612252460560485, "learning_rate": 8.655026300409118e-05, "loss": 0.2084, "step": 10270 }, { "epoch": 2.9, "grad_norm": 17.442735530139714, "learning_rate": 8.633109292811221e-05, "loss": 0.0978, "step": 10280 }, { "epoch": 2.9, "grad_norm": 0.7188172450254493, "learning_rate": 8.611192285213326e-05, "loss": 0.1475, "step": 10290 }, { "epoch": 2.9, "grad_norm": 0.23472035328250088, "learning_rate": 8.589275277615429e-05, "loss": 0.084, "step": 10300 }, { "epoch": 2.91, "grad_norm": 3.847787592250939, "learning_rate": 8.567358270017533e-05, "loss": 0.0532, "step": 10310 }, { "epoch": 2.91, "grad_norm": 8.67255157030904, "learning_rate": 8.545441262419637e-05, "loss": 0.2527, "step": 10320 }, { "epoch": 2.91, "grad_norm": 0.6329813929319419, "learning_rate": 8.523524254821741e-05, "loss": 0.1688, "step": 10330 }, { "epoch": 2.92, "grad_norm": 38.31309176901214, "learning_rate": 8.501607247223846e-05, "loss": 0.1513, "step": 10340 }, { "epoch": 2.92, "grad_norm": 0.9873950760760376, "learning_rate": 8.479690239625949e-05, "loss": 0.1007, "step": 10350 }, { "epoch": 2.92, "grad_norm": 12.444359816230664, "learning_rate": 8.457773232028053e-05, "loss": 0.0936, "step": 10360 }, { "epoch": 2.92, "grad_norm": 2.1983295007186743, "learning_rate": 8.435856224430157e-05, "loss": 0.1726, "step": 10370 }, { "epoch": 2.93, "grad_norm": 25.402786909188887, "learning_rate": 8.413939216832261e-05, "loss": 0.2034, "step": 10380 }, { "epoch": 2.93, "grad_norm": 10.271503407485543, "learning_rate": 8.392022209234366e-05, "loss": 0.0983, "step": 10390 }, { "epoch": 2.93, "grad_norm": 11.41372086097229, "learning_rate": 8.370105201636469e-05, "loss": 0.1763, "step": 10400 }, { "epoch": 2.93, "grad_norm": 1.0021380382604943, "learning_rate": 8.348188194038573e-05, "loss": 0.1691, "step": 10410 }, { "epoch": 2.94, "grad_norm": 1.4011630497758993, "learning_rate": 8.326271186440677e-05, "loss": 0.0795, "step": 10420 }, { "epoch": 2.94, "grad_norm": 0.5995590743117784, "learning_rate": 8.304354178842781e-05, "loss": 0.155, "step": 10430 }, { "epoch": 2.94, "grad_norm": 13.891285620986729, "learning_rate": 8.282437171244884e-05, "loss": 0.152, "step": 10440 }, { "epoch": 2.95, "grad_norm": 4.322816995983127, "learning_rate": 8.260520163646989e-05, "loss": 0.1174, "step": 10450 }, { "epoch": 2.95, "grad_norm": 4.199701886882987, "learning_rate": 8.238603156049094e-05, "loss": 0.1112, "step": 10460 }, { "epoch": 2.95, "grad_norm": 1.0003512662431095, "learning_rate": 8.216686148451197e-05, "loss": 0.1023, "step": 10470 }, { "epoch": 2.95, "grad_norm": 2.7506464116256915, "learning_rate": 8.194769140853301e-05, "loss": 0.125, "step": 10480 }, { "epoch": 2.96, "grad_norm": 0.267916746880681, "learning_rate": 8.172852133255405e-05, "loss": 0.1657, "step": 10490 }, { "epoch": 2.96, "grad_norm": 1.1331681687447057, "learning_rate": 8.150935125657509e-05, "loss": 0.1416, "step": 10500 }, { "epoch": 2.96, "grad_norm": 1.8446952462251813, "learning_rate": 8.129018118059614e-05, "loss": 0.2599, "step": 10510 }, { "epoch": 2.97, "grad_norm": 0.51816310859789, "learning_rate": 8.107101110461717e-05, "loss": 0.0686, "step": 10520 }, { "epoch": 2.97, "grad_norm": 1.2233390386002774, "learning_rate": 8.085184102863821e-05, "loss": 0.1502, "step": 10530 }, { "epoch": 2.97, "grad_norm": 83.49175206154096, "learning_rate": 8.063267095265925e-05, "loss": 0.1806, "step": 10540 }, { "epoch": 2.97, "grad_norm": 4.967238991092641, "learning_rate": 8.041350087668029e-05, "loss": 0.1847, "step": 10550 }, { "epoch": 2.98, "grad_norm": 15.102812775710126, "learning_rate": 8.019433080070135e-05, "loss": 0.1602, "step": 10560 }, { "epoch": 2.98, "grad_norm": 26.52721557816532, "learning_rate": 7.997516072472237e-05, "loss": 0.1863, "step": 10570 }, { "epoch": 2.98, "grad_norm": 1.232502179558844, "learning_rate": 7.975599064874343e-05, "loss": 0.1091, "step": 10580 }, { "epoch": 2.99, "grad_norm": 0.802916184416696, "learning_rate": 7.953682057276446e-05, "loss": 0.1202, "step": 10590 }, { "epoch": 2.99, "grad_norm": 18.591440679290102, "learning_rate": 7.93176504967855e-05, "loss": 0.1693, "step": 10600 }, { "epoch": 2.99, "grad_norm": 1.0156253505497566, "learning_rate": 7.909848042080654e-05, "loss": 0.1564, "step": 10610 }, { "epoch": 2.99, "grad_norm": 10.068403109613666, "learning_rate": 7.887931034482758e-05, "loss": 0.2361, "step": 10620 }, { "epoch": 3.0, "grad_norm": 1.6740912563898531, "learning_rate": 7.866014026884863e-05, "loss": 0.0799, "step": 10630 }, { "epoch": 3.0, "grad_norm": 2.461614313039469, "learning_rate": 7.844097019286966e-05, "loss": 0.0537, "step": 10640 }, { "epoch": 3.0, "eval_0_f1": 0.7738325801592424, "eval_0_precision": 0.7824194952132288, "eval_0_recall": 0.7654320987654321, "eval_1_f1": 0.9226921662375874, "eval_1_precision": 0.9192437344276712, "eval_1_recall": 0.926166568222091, "eval_accuracy": 0.8847714066440083, "eval_loss": 0.39013671875, "eval_runtime": 544.8422, "eval_samples_per_second": 16.741, "eval_steps_per_second": 2.792, "step": 10641 } ], "logging_steps": 10, "max_steps": 14188, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 2.0001268071346995e+17, "train_batch_size": 6, "trial_name": null, "trial_params": null }