{ "best_metric": 0.48, "best_model_checkpoint": "5c_4/checkpoint-9360", "epoch": 99.01, "eval_steps": 500, "global_step": 23400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00042735042735042735, "grad_norm": 6.5477800369262695, "learning_rate": 4.273504273504274e-08, "loss": 1.341, "step": 10 }, { "epoch": 0.0008547008547008547, "grad_norm": 7.9439568519592285, "learning_rate": 8.547008547008549e-08, "loss": 1.3047, "step": 20 }, { "epoch": 0.001282051282051282, "grad_norm": 6.425937175750732, "learning_rate": 1.282051282051282e-07, "loss": 1.3641, "step": 30 }, { "epoch": 0.0017094017094017094, "grad_norm": 6.959607124328613, "learning_rate": 1.7094017094017097e-07, "loss": 1.3257, "step": 40 }, { "epoch": 0.002136752136752137, "grad_norm": 6.3154730796813965, "learning_rate": 2.136752136752137e-07, "loss": 1.3008, "step": 50 }, { "epoch": 0.002564102564102564, "grad_norm": 6.919759750366211, "learning_rate": 2.564102564102564e-07, "loss": 1.3322, "step": 60 }, { "epoch": 0.0029914529914529917, "grad_norm": 6.870538711547852, "learning_rate": 2.991452991452992e-07, "loss": 1.313, "step": 70 }, { "epoch": 0.003418803418803419, "grad_norm": 6.823713302612305, "learning_rate": 3.4188034188034194e-07, "loss": 1.3179, "step": 80 }, { "epoch": 0.0038461538461538464, "grad_norm": 7.863243579864502, "learning_rate": 3.846153846153847e-07, "loss": 1.2466, "step": 90 }, { "epoch": 0.004273504273504274, "grad_norm": 7.5396881103515625, "learning_rate": 4.273504273504274e-07, "loss": 1.2098, "step": 100 }, { "epoch": 0.004700854700854701, "grad_norm": 8.087361335754395, "learning_rate": 4.700854700854701e-07, "loss": 1.2309, "step": 110 }, { "epoch": 0.005128205128205128, "grad_norm": 8.429758071899414, "learning_rate": 5.128205128205128e-07, "loss": 1.1752, "step": 120 }, { "epoch": 0.005555555555555556, "grad_norm": 9.436071395874023, "learning_rate": 5.555555555555555e-07, "loss": 1.1474, "step": 130 }, { "epoch": 0.005982905982905983, "grad_norm": 80.47299194335938, "learning_rate": 5.982905982905984e-07, "loss": 1.1764, "step": 140 }, { "epoch": 0.00641025641025641, "grad_norm": 16.705190658569336, "learning_rate": 6.41025641025641e-07, "loss": 1.1038, "step": 150 }, { "epoch": 0.006837606837606838, "grad_norm": 22.808284759521484, "learning_rate": 6.837606837606839e-07, "loss": 1.1758, "step": 160 }, { "epoch": 0.007264957264957265, "grad_norm": 23.328384399414062, "learning_rate": 7.264957264957266e-07, "loss": 1.2182, "step": 170 }, { "epoch": 0.007692307692307693, "grad_norm": 16.98978614807129, "learning_rate": 7.692307692307694e-07, "loss": 0.8752, "step": 180 }, { "epoch": 0.00811965811965812, "grad_norm": 197.35919189453125, "learning_rate": 8.11965811965812e-07, "loss": 1.122, "step": 190 }, { "epoch": 0.008547008547008548, "grad_norm": 32.2603759765625, "learning_rate": 8.547008547008548e-07, "loss": 1.1263, "step": 200 }, { "epoch": 0.008974358974358974, "grad_norm": 15.653111457824707, "learning_rate": 8.974358974358975e-07, "loss": 1.3598, "step": 210 }, { "epoch": 0.009401709401709401, "grad_norm": 13.729981422424316, "learning_rate": 9.401709401709402e-07, "loss": 0.6189, "step": 220 }, { "epoch": 0.009829059829059829, "grad_norm": 30.82917594909668, "learning_rate": 9.829059829059829e-07, "loss": 0.9641, "step": 230 }, { "epoch": 0.01, "eval_accuracy": 0.4, "eval_loss": 1.4837762117385864, "eval_runtime": 34.3992, "eval_samples_per_second": 0.727, "eval_steps_per_second": 0.727, "step": 234 }, { "epoch": 1.0002564102564102, "grad_norm": 13.610635757446289, "learning_rate": 1.0256410256410257e-06, "loss": 0.9614, "step": 240 }, { "epoch": 1.0006837606837606, "grad_norm": 51.50576400756836, "learning_rate": 1.0683760683760685e-06, "loss": 0.9305, "step": 250 }, { "epoch": 1.001111111111111, "grad_norm": 101.85962677001953, "learning_rate": 1.111111111111111e-06, "loss": 0.923, "step": 260 }, { "epoch": 1.0015384615384615, "grad_norm": 15.406578063964844, "learning_rate": 1.153846153846154e-06, "loss": 1.558, "step": 270 }, { "epoch": 1.001965811965812, "grad_norm": 49.64127731323242, "learning_rate": 1.1965811965811968e-06, "loss": 0.749, "step": 280 }, { "epoch": 1.0023931623931623, "grad_norm": 23.893503189086914, "learning_rate": 1.2393162393162394e-06, "loss": 0.7217, "step": 290 }, { "epoch": 1.0028205128205128, "grad_norm": 12.812580108642578, "learning_rate": 1.282051282051282e-06, "loss": 1.3277, "step": 300 }, { "epoch": 1.0032478632478632, "grad_norm": 8.36343002319336, "learning_rate": 1.324786324786325e-06, "loss": 0.4727, "step": 310 }, { "epoch": 1.0036752136752136, "grad_norm": 3.717958450317383, "learning_rate": 1.3675213675213678e-06, "loss": 1.0901, "step": 320 }, { "epoch": 1.004102564102564, "grad_norm": 120.84101867675781, "learning_rate": 1.4102564102564104e-06, "loss": 1.2407, "step": 330 }, { "epoch": 1.0045299145299145, "grad_norm": 76.76756286621094, "learning_rate": 1.4529914529914531e-06, "loss": 1.2918, "step": 340 }, { "epoch": 1.004957264957265, "grad_norm": 66.22538757324219, "learning_rate": 1.4957264957264957e-06, "loss": 2.1196, "step": 350 }, { "epoch": 1.0053846153846153, "grad_norm": 49.87553024291992, "learning_rate": 1.5384615384615387e-06, "loss": 0.9441, "step": 360 }, { "epoch": 1.0058119658119657, "grad_norm": 64.26871490478516, "learning_rate": 1.5811965811965813e-06, "loss": 1.3502, "step": 370 }, { "epoch": 1.0062393162393162, "grad_norm": 1.687718391418457, "learning_rate": 1.623931623931624e-06, "loss": 2.3065, "step": 380 }, { "epoch": 1.0066666666666666, "grad_norm": 1.3850126266479492, "learning_rate": 1.6666666666666667e-06, "loss": 0.9136, "step": 390 }, { "epoch": 1.007094017094017, "grad_norm": 1.278273344039917, "learning_rate": 1.7094017094017097e-06, "loss": 1.5894, "step": 400 }, { "epoch": 1.0075213675213675, "grad_norm": 63.78499221801758, "learning_rate": 1.7521367521367522e-06, "loss": 1.1296, "step": 410 }, { "epoch": 1.0079487179487179, "grad_norm": 0.9789712429046631, "learning_rate": 1.794871794871795e-06, "loss": 1.6205, "step": 420 }, { "epoch": 1.0083760683760683, "grad_norm": 0.5999449491500854, "learning_rate": 1.8376068376068378e-06, "loss": 1.7319, "step": 430 }, { "epoch": 1.0088034188034187, "grad_norm": 0.31746140122413635, "learning_rate": 1.8803418803418804e-06, "loss": 0.643, "step": 440 }, { "epoch": 1.0092307692307692, "grad_norm": 76.34538269042969, "learning_rate": 1.9230769230769234e-06, "loss": 2.4496, "step": 450 }, { "epoch": 1.0096581196581196, "grad_norm": 5.805349349975586, "learning_rate": 1.9658119658119658e-06, "loss": 1.5439, "step": 460 }, { "epoch": 1.01, "eval_accuracy": 0.4, "eval_loss": 3.7125139236450195, "eval_runtime": 33.4952, "eval_samples_per_second": 0.746, "eval_steps_per_second": 0.746, "step": 468 }, { "epoch": 2.00008547008547, "grad_norm": 72.71012878417969, "learning_rate": 2.008547008547009e-06, "loss": 1.7175, "step": 470 }, { "epoch": 2.0005128205128204, "grad_norm": 0.6215125918388367, "learning_rate": 2.0512820512820513e-06, "loss": 2.393, "step": 480 }, { "epoch": 2.000940170940171, "grad_norm": 1.5661790370941162, "learning_rate": 2.094017094017094e-06, "loss": 2.3866, "step": 490 }, { "epoch": 2.0013675213675213, "grad_norm": 0.6268726587295532, "learning_rate": 2.136752136752137e-06, "loss": 2.4783, "step": 500 }, { "epoch": 2.0017948717948717, "grad_norm": 95.59490203857422, "learning_rate": 2.1794871794871797e-06, "loss": 2.3849, "step": 510 }, { "epoch": 2.002222222222222, "grad_norm": 0.4951792061328888, "learning_rate": 2.222222222222222e-06, "loss": 1.2576, "step": 520 }, { "epoch": 2.0026495726495726, "grad_norm": 0.2503603398799896, "learning_rate": 2.2649572649572653e-06, "loss": 1.2306, "step": 530 }, { "epoch": 2.003076923076923, "grad_norm": 52.469234466552734, "learning_rate": 2.307692307692308e-06, "loss": 1.8782, "step": 540 }, { "epoch": 2.0035042735042734, "grad_norm": 0.20398114621639252, "learning_rate": 2.3504273504273504e-06, "loss": 0.5924, "step": 550 }, { "epoch": 2.003931623931624, "grad_norm": 71.42668914794922, "learning_rate": 2.3931623931623937e-06, "loss": 3.2851, "step": 560 }, { "epoch": 2.0043589743589743, "grad_norm": 56.71471405029297, "learning_rate": 2.435897435897436e-06, "loss": 1.2107, "step": 570 }, { "epoch": 2.0047863247863247, "grad_norm": 0.25754937529563904, "learning_rate": 2.478632478632479e-06, "loss": 1.874, "step": 580 }, { "epoch": 2.005213675213675, "grad_norm": 1.0091850757598877, "learning_rate": 2.5213675213675216e-06, "loss": 2.3834, "step": 590 }, { "epoch": 2.0056410256410255, "grad_norm": 0.3749796450138092, "learning_rate": 2.564102564102564e-06, "loss": 1.1427, "step": 600 }, { "epoch": 2.006068376068376, "grad_norm": 0.9544910788536072, "learning_rate": 2.606837606837607e-06, "loss": 2.4185, "step": 610 }, { "epoch": 2.0064957264957264, "grad_norm": 0.21911416947841644, "learning_rate": 2.64957264957265e-06, "loss": 1.2006, "step": 620 }, { "epoch": 2.006923076923077, "grad_norm": 0.1923692524433136, "learning_rate": 2.6923076923076923e-06, "loss": 1.2047, "step": 630 }, { "epoch": 2.0073504273504272, "grad_norm": 0.2782430052757263, "learning_rate": 2.7350427350427355e-06, "loss": 1.7899, "step": 640 }, { "epoch": 2.0077777777777777, "grad_norm": 0.17398680746555328, "learning_rate": 2.7777777777777783e-06, "loss": 1.0863, "step": 650 }, { "epoch": 2.008205128205128, "grad_norm": 121.83155059814453, "learning_rate": 2.8205128205128207e-06, "loss": 1.805, "step": 660 }, { "epoch": 2.0086324786324785, "grad_norm": 6.328676700592041, "learning_rate": 2.8632478632478635e-06, "loss": 1.6228, "step": 670 }, { "epoch": 2.009059829059829, "grad_norm": 74.6235580444336, "learning_rate": 2.9059829059829063e-06, "loss": 3.3584, "step": 680 }, { "epoch": 2.0094871794871794, "grad_norm": 0.6486341953277588, "learning_rate": 2.948717948717949e-06, "loss": 0.8365, "step": 690 }, { "epoch": 2.00991452991453, "grad_norm": 0.20082318782806396, "learning_rate": 2.9914529914529914e-06, "loss": 1.2944, "step": 700 }, { "epoch": 2.01, "eval_accuracy": 0.4, "eval_loss": 3.6749236583709717, "eval_runtime": 33.4285, "eval_samples_per_second": 0.748, "eval_steps_per_second": 0.748, "step": 702 }, { "epoch": 3.0003418803418804, "grad_norm": 49.130332946777344, "learning_rate": 3.0341880341880342e-06, "loss": 3.0762, "step": 710 }, { "epoch": 3.000769230769231, "grad_norm": 10.840944290161133, "learning_rate": 3.0769230769230774e-06, "loss": 3.0125, "step": 720 }, { "epoch": 3.0011965811965813, "grad_norm": 0.9621607661247253, "learning_rate": 3.11965811965812e-06, "loss": 1.0516, "step": 730 }, { "epoch": 3.0016239316239317, "grad_norm": 0.4117166996002197, "learning_rate": 3.1623931623931626e-06, "loss": 0.5509, "step": 740 }, { "epoch": 3.002051282051282, "grad_norm": 0.34464403986930847, "learning_rate": 3.205128205128206e-06, "loss": 0.5911, "step": 750 }, { "epoch": 3.0024786324786326, "grad_norm": 0.21714632213115692, "learning_rate": 3.247863247863248e-06, "loss": 1.2975, "step": 760 }, { "epoch": 3.002905982905983, "grad_norm": 0.6396281719207764, "learning_rate": 3.290598290598291e-06, "loss": 3.169, "step": 770 }, { "epoch": 3.0033333333333334, "grad_norm": 0.28683122992515564, "learning_rate": 3.3333333333333333e-06, "loss": 1.8898, "step": 780 }, { "epoch": 3.003760683760684, "grad_norm": 0.158400759100914, "learning_rate": 3.3760683760683765e-06, "loss": 0.6267, "step": 790 }, { "epoch": 3.0041880341880343, "grad_norm": 49.82303237915039, "learning_rate": 3.4188034188034193e-06, "loss": 0.6848, "step": 800 }, { "epoch": 3.0046153846153847, "grad_norm": 0.5617250800132751, "learning_rate": 3.4615384615384617e-06, "loss": 3.2734, "step": 810 }, { "epoch": 3.005042735042735, "grad_norm": 104.6950912475586, "learning_rate": 3.5042735042735045e-06, "loss": 2.4173, "step": 820 }, { "epoch": 3.0054700854700855, "grad_norm": 3.587768077850342, "learning_rate": 3.5470085470085473e-06, "loss": 0.7486, "step": 830 }, { "epoch": 3.005897435897436, "grad_norm": 1.223972201347351, "learning_rate": 3.58974358974359e-06, "loss": 2.2701, "step": 840 }, { "epoch": 3.0063247863247864, "grad_norm": 84.18639373779297, "learning_rate": 3.632478632478633e-06, "loss": 2.5556, "step": 850 }, { "epoch": 3.006752136752137, "grad_norm": 0.33438971638679504, "learning_rate": 3.6752136752136756e-06, "loss": 1.1486, "step": 860 }, { "epoch": 3.0071794871794872, "grad_norm": 0.4742439091205597, "learning_rate": 3.7179487179487184e-06, "loss": 1.8177, "step": 870 }, { "epoch": 3.0076068376068377, "grad_norm": 0.20394758880138397, "learning_rate": 3.760683760683761e-06, "loss": 1.2203, "step": 880 }, { "epoch": 3.008034188034188, "grad_norm": 51.59945297241211, "learning_rate": 3.8034188034188036e-06, "loss": 1.09, "step": 890 }, { "epoch": 3.0084615384615385, "grad_norm": 0.2848564386367798, "learning_rate": 3.846153846153847e-06, "loss": 1.3275, "step": 900 }, { "epoch": 3.008888888888889, "grad_norm": 0.22904685139656067, "learning_rate": 3.88888888888889e-06, "loss": 3.0584, "step": 910 }, { "epoch": 3.0093162393162394, "grad_norm": 33.74935531616211, "learning_rate": 3.9316239316239315e-06, "loss": 1.7752, "step": 920 }, { "epoch": 3.00974358974359, "grad_norm": 0.3561188280582428, "learning_rate": 3.974358974358974e-06, "loss": 0.9419, "step": 930 }, { "epoch": 3.01, "eval_accuracy": 0.4, "eval_loss": 3.042177200317383, "eval_runtime": 33.5077, "eval_samples_per_second": 0.746, "eval_steps_per_second": 0.746, "step": 936 }, { "epoch": 4.00017094017094, "grad_norm": 61.73008728027344, "learning_rate": 4.017094017094018e-06, "loss": 3.1491, "step": 940 }, { "epoch": 4.00059829059829, "grad_norm": 32.0042839050293, "learning_rate": 4.05982905982906e-06, "loss": 1.1443, "step": 950 }, { "epoch": 4.001025641025641, "grad_norm": 0.49480342864990234, "learning_rate": 4.102564102564103e-06, "loss": 1.1544, "step": 960 }, { "epoch": 4.001452991452991, "grad_norm": 0.1361558586359024, "learning_rate": 4.145299145299146e-06, "loss": 0.574, "step": 970 }, { "epoch": 4.001880341880342, "grad_norm": 0.10107045620679855, "learning_rate": 4.188034188034188e-06, "loss": 2.0072, "step": 980 }, { "epoch": 4.002307692307692, "grad_norm": 36.34455871582031, "learning_rate": 4.230769230769231e-06, "loss": 1.849, "step": 990 }, { "epoch": 4.0027350427350425, "grad_norm": 179.90792846679688, "learning_rate": 4.273504273504274e-06, "loss": 1.7365, "step": 1000 }, { "epoch": 4.003162393162393, "grad_norm": 38.759735107421875, "learning_rate": 4.316239316239317e-06, "loss": 2.1145, "step": 1010 }, { "epoch": 4.003589743589743, "grad_norm": 0.1710689514875412, "learning_rate": 4.358974358974359e-06, "loss": 0.0095, "step": 1020 }, { "epoch": 4.004017094017094, "grad_norm": 0.4290190041065216, "learning_rate": 4.401709401709402e-06, "loss": 1.7793, "step": 1030 }, { "epoch": 4.004444444444444, "grad_norm": 32.45518493652344, "learning_rate": 4.444444444444444e-06, "loss": 1.7186, "step": 1040 }, { "epoch": 4.004871794871795, "grad_norm": 36.26081466674805, "learning_rate": 4.487179487179488e-06, "loss": 1.7024, "step": 1050 }, { "epoch": 4.005299145299145, "grad_norm": 0.3600796163082123, "learning_rate": 4.5299145299145306e-06, "loss": 0.5479, "step": 1060 }, { "epoch": 4.0057264957264955, "grad_norm": 0.39198485016822815, "learning_rate": 4.5726495726495725e-06, "loss": 1.6426, "step": 1070 }, { "epoch": 4.006153846153846, "grad_norm": 0.25071772933006287, "learning_rate": 4.615384615384616e-06, "loss": 1.0627, "step": 1080 }, { "epoch": 4.006581196581196, "grad_norm": 35.45331573486328, "learning_rate": 4.658119658119659e-06, "loss": 2.8291, "step": 1090 }, { "epoch": 4.007008547008547, "grad_norm": 0.4302135407924652, "learning_rate": 4.700854700854701e-06, "loss": 1.1957, "step": 1100 }, { "epoch": 4.007435897435897, "grad_norm": 35.813323974609375, "learning_rate": 4.743589743589744e-06, "loss": 2.2429, "step": 1110 }, { "epoch": 4.007863247863248, "grad_norm": 35.34008026123047, "learning_rate": 4.786324786324787e-06, "loss": 1.077, "step": 1120 }, { "epoch": 4.008290598290598, "grad_norm": 1.468093991279602, "learning_rate": 4.829059829059829e-06, "loss": 2.7016, "step": 1130 }, { "epoch": 4.0087179487179485, "grad_norm": 29.234296798706055, "learning_rate": 4.871794871794872e-06, "loss": 2.1301, "step": 1140 }, { "epoch": 4.009145299145299, "grad_norm": 31.05660629272461, "learning_rate": 4.914529914529915e-06, "loss": 2.0786, "step": 1150 }, { "epoch": 4.009572649572649, "grad_norm": 0.4601377546787262, "learning_rate": 4.957264957264958e-06, "loss": 2.0616, "step": 1160 }, { "epoch": 4.01, "grad_norm": 1.4168052673339844, "learning_rate": 5e-06, "loss": 2.4333, "step": 1170 }, { "epoch": 4.01, "eval_accuracy": 0.4, "eval_loss": 2.6802823543548584, "eval_runtime": 33.4138, "eval_samples_per_second": 0.748, "eval_steps_per_second": 0.748, "step": 1170 }, { "epoch": 5.00042735042735, "grad_norm": 0.4595077633857727, "learning_rate": 5.042735042735043e-06, "loss": 0.8973, "step": 1180 }, { "epoch": 5.000854700854701, "grad_norm": 32.307037353515625, "learning_rate": 5.085470085470086e-06, "loss": 1.9978, "step": 1190 }, { "epoch": 5.001282051282051, "grad_norm": 33.618709564208984, "learning_rate": 5.128205128205128e-06, "loss": 3.0544, "step": 1200 }, { "epoch": 5.001709401709402, "grad_norm": 31.92491340637207, "learning_rate": 5.1709401709401716e-06, "loss": 1.8459, "step": 1210 }, { "epoch": 5.002136752136752, "grad_norm": 0.769111156463623, "learning_rate": 5.213675213675214e-06, "loss": 1.928, "step": 1220 }, { "epoch": 5.0025641025641026, "grad_norm": 30.650548934936523, "learning_rate": 5.256410256410257e-06, "loss": 1.1278, "step": 1230 }, { "epoch": 5.002991452991453, "grad_norm": 31.011295318603516, "learning_rate": 5.2991452991453e-06, "loss": 1.8213, "step": 1240 }, { "epoch": 5.003418803418803, "grad_norm": 0.40149080753326416, "learning_rate": 5.341880341880342e-06, "loss": 2.31, "step": 1250 }, { "epoch": 5.003846153846154, "grad_norm": 0.4121752977371216, "learning_rate": 5.384615384615385e-06, "loss": 1.051, "step": 1260 }, { "epoch": 5.004273504273504, "grad_norm": 0.29824018478393555, "learning_rate": 5.4273504273504275e-06, "loss": 2.0995, "step": 1270 }, { "epoch": 5.004700854700855, "grad_norm": 0.4708477258682251, "learning_rate": 5.470085470085471e-06, "loss": 0.4382, "step": 1280 }, { "epoch": 5.005128205128205, "grad_norm": 0.25618794560432434, "learning_rate": 5.512820512820514e-06, "loss": 2.2577, "step": 1290 }, { "epoch": 5.0055555555555555, "grad_norm": 0.17655441164970398, "learning_rate": 5.555555555555557e-06, "loss": 0.6334, "step": 1300 }, { "epoch": 5.005982905982906, "grad_norm": 29.26540184020996, "learning_rate": 5.598290598290599e-06, "loss": 1.6359, "step": 1310 }, { "epoch": 5.006410256410256, "grad_norm": 0.33771806955337524, "learning_rate": 5.641025641025641e-06, "loss": 2.2658, "step": 1320 }, { "epoch": 5.006837606837607, "grad_norm": 0.19950927793979645, "learning_rate": 5.683760683760684e-06, "loss": 0.6106, "step": 1330 }, { "epoch": 5.007264957264957, "grad_norm": 32.63151931762695, "learning_rate": 5.726495726495727e-06, "loss": 1.9519, "step": 1340 }, { "epoch": 5.007692307692308, "grad_norm": 0.3686060309410095, "learning_rate": 5.769230769230769e-06, "loss": 1.2481, "step": 1350 }, { "epoch": 5.008119658119658, "grad_norm": 0.1446453183889389, "learning_rate": 5.8119658119658126e-06, "loss": 0.5926, "step": 1360 }, { "epoch": 5.0085470085470085, "grad_norm": 32.86191177368164, "learning_rate": 5.854700854700855e-06, "loss": 2.247, "step": 1370 }, { "epoch": 5.008974358974359, "grad_norm": 37.676414489746094, "learning_rate": 5.897435897435898e-06, "loss": 1.9409, "step": 1380 }, { "epoch": 5.009401709401709, "grad_norm": 0.4700198471546173, "learning_rate": 5.940170940170941e-06, "loss": 1.4757, "step": 1390 }, { "epoch": 5.00982905982906, "grad_norm": 0.603115975856781, "learning_rate": 5.982905982905983e-06, "loss": 1.4646, "step": 1400 }, { "epoch": 5.01, "eval_accuracy": 0.4, "eval_loss": 3.5354690551757812, "eval_runtime": 33.4888, "eval_samples_per_second": 0.747, "eval_steps_per_second": 0.747, "step": 1404 }, { "epoch": 6.00025641025641, "grad_norm": 0.20566977560520172, "learning_rate": 6.025641025641026e-06, "loss": 2.5308, "step": 1410 }, { "epoch": 6.000683760683761, "grad_norm": 1.3430235385894775, "learning_rate": 6.0683760683760684e-06, "loss": 1.7984, "step": 1420 }, { "epoch": 6.001111111111111, "grad_norm": 34.01142120361328, "learning_rate": 6.111111111111112e-06, "loss": 0.5602, "step": 1430 }, { "epoch": 6.001538461538462, "grad_norm": 0.08811581879854202, "learning_rate": 6.153846153846155e-06, "loss": 0.5698, "step": 1440 }, { "epoch": 6.001965811965812, "grad_norm": 31.072364807128906, "learning_rate": 6.196581196581198e-06, "loss": 1.9642, "step": 1450 }, { "epoch": 6.002393162393163, "grad_norm": 0.3049279749393463, "learning_rate": 6.23931623931624e-06, "loss": 1.0951, "step": 1460 }, { "epoch": 6.002820512820513, "grad_norm": 1.261183500289917, "learning_rate": 6.282051282051282e-06, "loss": 1.5718, "step": 1470 }, { "epoch": 6.003247863247863, "grad_norm": 29.76000213623047, "learning_rate": 6.324786324786325e-06, "loss": 2.3047, "step": 1480 }, { "epoch": 6.003675213675214, "grad_norm": 0.3626205325126648, "learning_rate": 6.367521367521368e-06, "loss": 1.7411, "step": 1490 }, { "epoch": 6.004102564102564, "grad_norm": 27.959877014160156, "learning_rate": 6.410256410256412e-06, "loss": 2.4126, "step": 1500 }, { "epoch": 6.004529914529915, "grad_norm": 0.5188838243484497, "learning_rate": 6.4529914529914535e-06, "loss": 1.3862, "step": 1510 }, { "epoch": 6.004957264957265, "grad_norm": 0.29587042331695557, "learning_rate": 6.495726495726496e-06, "loss": 1.414, "step": 1520 }, { "epoch": 6.0053846153846155, "grad_norm": 0.25534921884536743, "learning_rate": 6.538461538461539e-06, "loss": 1.7212, "step": 1530 }, { "epoch": 6.005811965811966, "grad_norm": 0.2307765781879425, "learning_rate": 6.581196581196582e-06, "loss": 2.294, "step": 1540 }, { "epoch": 6.006239316239316, "grad_norm": 28.337125778198242, "learning_rate": 6.623931623931624e-06, "loss": 0.946, "step": 1550 }, { "epoch": 6.006666666666667, "grad_norm": 0.16784150898456573, "learning_rate": 6.666666666666667e-06, "loss": 1.1255, "step": 1560 }, { "epoch": 6.007094017094017, "grad_norm": 27.95771598815918, "learning_rate": 6.7094017094017094e-06, "loss": 1.6833, "step": 1570 }, { "epoch": 6.007521367521368, "grad_norm": 0.4080459773540497, "learning_rate": 6.752136752136753e-06, "loss": 1.3406, "step": 1580 }, { "epoch": 6.007948717948718, "grad_norm": 0.8113716244697571, "learning_rate": 6.794871794871796e-06, "loss": 2.6112, "step": 1590 }, { "epoch": 6.0083760683760685, "grad_norm": 30.46502685546875, "learning_rate": 6.837606837606839e-06, "loss": 1.4458, "step": 1600 }, { "epoch": 6.008803418803419, "grad_norm": 0.22283227741718292, "learning_rate": 6.880341880341881e-06, "loss": 0.5283, "step": 1610 }, { "epoch": 6.009230769230769, "grad_norm": 28.292848587036133, "learning_rate": 6.923076923076923e-06, "loss": 1.5511, "step": 1620 }, { "epoch": 6.00965811965812, "grad_norm": 28.0618839263916, "learning_rate": 6.965811965811966e-06, "loss": 2.1201, "step": 1630 }, { "epoch": 6.01, "eval_accuracy": 0.4, "eval_loss": 3.0478899478912354, "eval_runtime": 33.4068, "eval_samples_per_second": 0.748, "eval_steps_per_second": 0.748, "step": 1638 }, { "epoch": 7.00008547008547, "grad_norm": 33.30900573730469, "learning_rate": 7.008547008547009e-06, "loss": 1.429, "step": 1640 }, { "epoch": 7.000512820512821, "grad_norm": 29.006980895996094, "learning_rate": 7.051282051282053e-06, "loss": 1.6062, "step": 1650 }, { "epoch": 7.000940170940171, "grad_norm": 28.740236282348633, "learning_rate": 7.0940170940170945e-06, "loss": 2.1064, "step": 1660 }, { "epoch": 7.001367521367522, "grad_norm": 0.5548082590103149, "learning_rate": 7.136752136752137e-06, "loss": 1.4227, "step": 1670 }, { "epoch": 7.001794871794872, "grad_norm": 0.2548494040966034, "learning_rate": 7.17948717948718e-06, "loss": 0.4677, "step": 1680 }, { "epoch": 7.002222222222223, "grad_norm": 0.08594454079866409, "learning_rate": 7.222222222222223e-06, "loss": 0.6487, "step": 1690 }, { "epoch": 7.002649572649573, "grad_norm": 0.11181553453207016, "learning_rate": 7.264957264957266e-06, "loss": 0.6856, "step": 1700 }, { "epoch": 7.003076923076923, "grad_norm": 0.09680427610874176, "learning_rate": 7.307692307692308e-06, "loss": 1.2589, "step": 1710 }, { "epoch": 7.003504273504274, "grad_norm": 0.18477901816368103, "learning_rate": 7.350427350427351e-06, "loss": 2.0357, "step": 1720 }, { "epoch": 7.003931623931624, "grad_norm": 28.30834197998047, "learning_rate": 7.393162393162394e-06, "loss": 1.7978, "step": 1730 }, { "epoch": 7.004358974358975, "grad_norm": 0.4138849675655365, "learning_rate": 7.435897435897437e-06, "loss": 1.4938, "step": 1740 }, { "epoch": 7.004786324786325, "grad_norm": 46.25049591064453, "learning_rate": 7.47863247863248e-06, "loss": 1.2354, "step": 1750 }, { "epoch": 7.0052136752136756, "grad_norm": 0.2702569365501404, "learning_rate": 7.521367521367522e-06, "loss": 1.8753, "step": 1760 }, { "epoch": 7.005641025641026, "grad_norm": 0.2681766450405121, "learning_rate": 7.564102564102564e-06, "loss": 1.2446, "step": 1770 }, { "epoch": 7.006068376068376, "grad_norm": 47.34070587158203, "learning_rate": 7.606837606837607e-06, "loss": 2.2493, "step": 1780 }, { "epoch": 7.006495726495727, "grad_norm": 31.515134811401367, "learning_rate": 7.649572649572649e-06, "loss": 2.1708, "step": 1790 }, { "epoch": 7.006923076923077, "grad_norm": 0.44051307439804077, "learning_rate": 7.692307692307694e-06, "loss": 0.9425, "step": 1800 }, { "epoch": 7.007350427350428, "grad_norm": 0.16471922397613525, "learning_rate": 7.735042735042736e-06, "loss": 1.6227, "step": 1810 }, { "epoch": 7.007777777777778, "grad_norm": 36.21978759765625, "learning_rate": 7.77777777777778e-06, "loss": 3.136, "step": 1820 }, { "epoch": 7.0082051282051285, "grad_norm": 0.6311964988708496, "learning_rate": 7.820512820512822e-06, "loss": 1.6591, "step": 1830 }, { "epoch": 7.008632478632479, "grad_norm": 0.15079300105571747, "learning_rate": 7.863247863247863e-06, "loss": 0.4923, "step": 1840 }, { "epoch": 7.009059829059829, "grad_norm": 0.0793166533112526, "learning_rate": 7.905982905982906e-06, "loss": 0.6274, "step": 1850 }, { "epoch": 7.00948717948718, "grad_norm": 39.972232818603516, "learning_rate": 7.948717948717949e-06, "loss": 1.902, "step": 1860 }, { "epoch": 7.00991452991453, "grad_norm": 2.84613299369812, "learning_rate": 7.991452991452993e-06, "loss": 2.9021, "step": 1870 }, { "epoch": 7.01, "eval_accuracy": 0.4, "eval_loss": 2.8181352615356445, "eval_runtime": 33.4387, "eval_samples_per_second": 0.748, "eval_steps_per_second": 0.748, "step": 1872 }, { "epoch": 8.00034188034188, "grad_norm": 32.629676818847656, "learning_rate": 8.034188034188036e-06, "loss": 0.6487, "step": 1880 }, { "epoch": 8.000769230769231, "grad_norm": 0.23920656740665436, "learning_rate": 8.076923076923077e-06, "loss": 1.2006, "step": 1890 }, { "epoch": 8.00119658119658, "grad_norm": 0.14234939217567444, "learning_rate": 8.11965811965812e-06, "loss": 1.2065, "step": 1900 }, { "epoch": 8.001623931623932, "grad_norm": 0.12460368871688843, "learning_rate": 8.162393162393163e-06, "loss": 0.5436, "step": 1910 }, { "epoch": 8.002051282051282, "grad_norm": 0.43331626057624817, "learning_rate": 8.205128205128205e-06, "loss": 1.5192, "step": 1920 }, { "epoch": 8.002478632478633, "grad_norm": 0.12986142933368683, "learning_rate": 8.247863247863248e-06, "loss": 0.6905, "step": 1930 }, { "epoch": 8.002905982905983, "grad_norm": 0.5832127928733826, "learning_rate": 8.290598290598293e-06, "loss": 3.0564, "step": 1940 }, { "epoch": 8.003333333333334, "grad_norm": 28.78748321533203, "learning_rate": 8.333333333333334e-06, "loss": 2.3034, "step": 1950 }, { "epoch": 8.003760683760683, "grad_norm": 31.847166061401367, "learning_rate": 8.376068376068377e-06, "loss": 1.9432, "step": 1960 }, { "epoch": 8.004188034188035, "grad_norm": 0.23129618167877197, "learning_rate": 8.41880341880342e-06, "loss": 0.8151, "step": 1970 }, { "epoch": 8.004615384615384, "grad_norm": 0.08719867467880249, "learning_rate": 8.461538461538462e-06, "loss": 0.4525, "step": 1980 }, { "epoch": 8.005042735042736, "grad_norm": 0.17510618269443512, "learning_rate": 8.504273504273505e-06, "loss": 2.3329, "step": 1990 }, { "epoch": 8.005470085470085, "grad_norm": 0.23968863487243652, "learning_rate": 8.547008547008548e-06, "loss": 1.4327, "step": 2000 }, { "epoch": 8.005897435897436, "grad_norm": 0.1258096992969513, "learning_rate": 8.58974358974359e-06, "loss": 1.5496, "step": 2010 }, { "epoch": 8.006324786324786, "grad_norm": 33.13344192504883, "learning_rate": 8.632478632478633e-06, "loss": 1.8633, "step": 2020 }, { "epoch": 8.006752136752137, "grad_norm": 34.57917022705078, "learning_rate": 8.675213675213676e-06, "loss": 2.1156, "step": 2030 }, { "epoch": 8.007179487179487, "grad_norm": 30.499267578125, "learning_rate": 8.717948717948719e-06, "loss": 1.4366, "step": 2040 }, { "epoch": 8.007606837606838, "grad_norm": 0.2296626716852188, "learning_rate": 8.760683760683762e-06, "loss": 0.5388, "step": 2050 }, { "epoch": 8.008034188034188, "grad_norm": 0.6007127165794373, "learning_rate": 8.803418803418804e-06, "loss": 2.0019, "step": 2060 }, { "epoch": 8.008461538461539, "grad_norm": 29.225831985473633, "learning_rate": 8.846153846153847e-06, "loss": 1.6262, "step": 2070 }, { "epoch": 8.008888888888889, "grad_norm": 0.9478847980499268, "learning_rate": 8.888888888888888e-06, "loss": 2.0313, "step": 2080 }, { "epoch": 8.00931623931624, "grad_norm": 0.12494786083698273, "learning_rate": 8.931623931623933e-06, "loss": 0.9103, "step": 2090 }, { "epoch": 8.00974358974359, "grad_norm": 23.248950958251953, "learning_rate": 8.974358974358976e-06, "loss": 2.1527, "step": 2100 }, { "epoch": 8.01, "eval_accuracy": 0.4, "eval_loss": 2.7605271339416504, "eval_runtime": 32.6542, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.766, "step": 2106 }, { "epoch": 9.00017094017094, "grad_norm": 25.770069122314453, "learning_rate": 9.017094017094018e-06, "loss": 1.5706, "step": 2110 }, { "epoch": 9.00059829059829, "grad_norm": 0.7022117376327515, "learning_rate": 9.059829059829061e-06, "loss": 0.4205, "step": 2120 }, { "epoch": 9.001025641025642, "grad_norm": 0.08867479860782623, "learning_rate": 9.102564102564104e-06, "loss": 1.2611, "step": 2130 }, { "epoch": 9.001452991452991, "grad_norm": 0.09678523242473602, "learning_rate": 9.145299145299145e-06, "loss": 1.1773, "step": 2140 }, { "epoch": 9.001880341880343, "grad_norm": 0.15343564748764038, "learning_rate": 9.188034188034188e-06, "loss": 1.2586, "step": 2150 }, { "epoch": 9.002307692307692, "grad_norm": 0.35261672735214233, "learning_rate": 9.230769230769232e-06, "loss": 0.8426, "step": 2160 }, { "epoch": 9.002735042735043, "grad_norm": 0.2280280739068985, "learning_rate": 9.273504273504275e-06, "loss": 1.1276, "step": 2170 }, { "epoch": 9.003162393162393, "grad_norm": 0.11199548840522766, "learning_rate": 9.316239316239318e-06, "loss": 2.7832, "step": 2180 }, { "epoch": 9.003589743589744, "grad_norm": 0.2621850073337555, "learning_rate": 9.358974358974359e-06, "loss": 0.9458, "step": 2190 }, { "epoch": 9.004017094017094, "grad_norm": 0.36012938618659973, "learning_rate": 9.401709401709402e-06, "loss": 1.1242, "step": 2200 }, { "epoch": 9.004444444444445, "grad_norm": 48.78242492675781, "learning_rate": 9.444444444444445e-06, "loss": 3.3251, "step": 2210 }, { "epoch": 9.004871794871795, "grad_norm": 0.15066808462142944, "learning_rate": 9.487179487179487e-06, "loss": 1.708, "step": 2220 }, { "epoch": 9.005299145299146, "grad_norm": 0.4691851735115051, "learning_rate": 9.52991452991453e-06, "loss": 1.553, "step": 2230 }, { "epoch": 9.005726495726496, "grad_norm": 28.600677490234375, "learning_rate": 9.572649572649575e-06, "loss": 1.2926, "step": 2240 }, { "epoch": 9.006153846153847, "grad_norm": 32.24127960205078, "learning_rate": 9.615384615384616e-06, "loss": 2.4733, "step": 2250 }, { "epoch": 9.006581196581196, "grad_norm": 0.7631069421768188, "learning_rate": 9.658119658119659e-06, "loss": 1.7091, "step": 2260 }, { "epoch": 9.007008547008548, "grad_norm": 0.2995574176311493, "learning_rate": 9.700854700854701e-06, "loss": 2.0053, "step": 2270 }, { "epoch": 9.007435897435897, "grad_norm": 3.1298446655273438, "learning_rate": 9.743589743589744e-06, "loss": 1.1043, "step": 2280 }, { "epoch": 9.007863247863249, "grad_norm": 0.1203409880399704, "learning_rate": 9.786324786324787e-06, "loss": 0.6468, "step": 2290 }, { "epoch": 9.008290598290598, "grad_norm": 0.3753611445426941, "learning_rate": 9.82905982905983e-06, "loss": 1.8003, "step": 2300 }, { "epoch": 9.00871794871795, "grad_norm": 0.24749663472175598, "learning_rate": 9.871794871794872e-06, "loss": 0.6302, "step": 2310 }, { "epoch": 9.009145299145299, "grad_norm": 0.15980985760688782, "learning_rate": 9.914529914529915e-06, "loss": 1.7506, "step": 2320 }, { "epoch": 9.00957264957265, "grad_norm": 30.294530868530273, "learning_rate": 9.957264957264958e-06, "loss": 2.5406, "step": 2330 }, { "epoch": 9.01, "grad_norm": 1.435558557510376, "learning_rate": 1e-05, "loss": 1.9428, "step": 2340 }, { "epoch": 9.01, "eval_accuracy": 0.4, "eval_loss": 2.4512882232666016, "eval_runtime": 32.6431, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.766, "step": 2340 }, { "epoch": 10.000427350427351, "grad_norm": 26.23455810546875, "learning_rate": 9.99525166191833e-06, "loss": 0.9502, "step": 2350 }, { "epoch": 10.0008547008547, "grad_norm": 28.52158546447754, "learning_rate": 9.990503323836657e-06, "loss": 1.7874, "step": 2360 }, { "epoch": 10.001282051282052, "grad_norm": 0.10940665751695633, "learning_rate": 9.985754985754987e-06, "loss": 1.1145, "step": 2370 }, { "epoch": 10.001709401709402, "grad_norm": 0.49054867029190063, "learning_rate": 9.981006647673314e-06, "loss": 2.1067, "step": 2380 }, { "epoch": 10.002136752136753, "grad_norm": 0.4312061071395874, "learning_rate": 9.976258309591643e-06, "loss": 1.4459, "step": 2390 }, { "epoch": 10.002564102564103, "grad_norm": 0.9604039788246155, "learning_rate": 9.971509971509972e-06, "loss": 1.5741, "step": 2400 }, { "epoch": 10.002991452991454, "grad_norm": 28.289756774902344, "learning_rate": 9.966761633428301e-06, "loss": 3.0157, "step": 2410 }, { "epoch": 10.003418803418803, "grad_norm": 1.3765814304351807, "learning_rate": 9.96201329534663e-06, "loss": 1.7118, "step": 2420 }, { "epoch": 10.003846153846155, "grad_norm": 0.4429996609687805, "learning_rate": 9.957264957264958e-06, "loss": 1.8346, "step": 2430 }, { "epoch": 10.004273504273504, "grad_norm": 0.1344616711139679, "learning_rate": 9.952516619183287e-06, "loss": 1.1072, "step": 2440 }, { "epoch": 10.004700854700856, "grad_norm": 0.2778800129890442, "learning_rate": 9.947768281101615e-06, "loss": 1.0562, "step": 2450 }, { "epoch": 10.005128205128205, "grad_norm": 41.122074127197266, "learning_rate": 9.943019943019944e-06, "loss": 2.096, "step": 2460 }, { "epoch": 10.005555555555556, "grad_norm": 0.6196286082267761, "learning_rate": 9.938271604938273e-06, "loss": 1.9945, "step": 2470 }, { "epoch": 10.005982905982906, "grad_norm": 38.4301872253418, "learning_rate": 9.9335232668566e-06, "loss": 1.5745, "step": 2480 }, { "epoch": 10.006410256410257, "grad_norm": 0.5608100295066833, "learning_rate": 9.92877492877493e-06, "loss": 1.7862, "step": 2490 }, { "epoch": 10.006837606837607, "grad_norm": 0.08879340440034866, "learning_rate": 9.924026590693259e-06, "loss": 1.2266, "step": 2500 }, { "epoch": 10.007264957264958, "grad_norm": 0.14399921894073486, "learning_rate": 9.919278252611588e-06, "loss": 1.5128, "step": 2510 }, { "epoch": 10.007692307692308, "grad_norm": 0.567003071308136, "learning_rate": 9.914529914529915e-06, "loss": 0.5539, "step": 2520 }, { "epoch": 10.008119658119659, "grad_norm": 0.1469346433877945, "learning_rate": 9.909781576448244e-06, "loss": 1.3608, "step": 2530 }, { "epoch": 10.008547008547009, "grad_norm": 0.6318036913871765, "learning_rate": 9.905033238366572e-06, "loss": 0.6639, "step": 2540 }, { "epoch": 10.00897435897436, "grad_norm": 0.13676932454109192, "learning_rate": 9.900284900284901e-06, "loss": 2.7342, "step": 2550 }, { "epoch": 10.00940170940171, "grad_norm": 28.927698135375977, "learning_rate": 9.89553656220323e-06, "loss": 2.5179, "step": 2560 }, { "epoch": 10.00982905982906, "grad_norm": 26.96585464477539, "learning_rate": 9.890788224121558e-06, "loss": 1.6949, "step": 2570 }, { "epoch": 10.01, "eval_accuracy": 0.4, "eval_loss": 3.2310287952423096, "eval_runtime": 32.7403, "eval_samples_per_second": 0.764, "eval_steps_per_second": 0.764, "step": 2574 }, { "epoch": 11.00025641025641, "grad_norm": 29.56972312927246, "learning_rate": 9.886039886039887e-06, "loss": 2.0711, "step": 2580 }, { "epoch": 11.00068376068376, "grad_norm": 1.1823328733444214, "learning_rate": 9.881291547958214e-06, "loss": 1.3947, "step": 2590 }, { "epoch": 11.001111111111111, "grad_norm": 34.95638656616211, "learning_rate": 9.876543209876543e-06, "loss": 1.3976, "step": 2600 }, { "epoch": 11.00153846153846, "grad_norm": 0.5684574842453003, "learning_rate": 9.871794871794872e-06, "loss": 0.5828, "step": 2610 }, { "epoch": 11.001965811965812, "grad_norm": 0.13000904023647308, "learning_rate": 9.867046533713202e-06, "loss": 0.547, "step": 2620 }, { "epoch": 11.002393162393162, "grad_norm": 0.08917149901390076, "learning_rate": 9.86229819563153e-06, "loss": 2.5733, "step": 2630 }, { "epoch": 11.002820512820513, "grad_norm": 27.73358154296875, "learning_rate": 9.857549857549858e-06, "loss": 3.1395, "step": 2640 }, { "epoch": 11.003247863247863, "grad_norm": 24.41839027404785, "learning_rate": 9.852801519468187e-06, "loss": 1.2438, "step": 2650 }, { "epoch": 11.003675213675214, "grad_norm": 3.812605142593384, "learning_rate": 9.848053181386515e-06, "loss": 1.9698, "step": 2660 }, { "epoch": 11.004102564102563, "grad_norm": 1.3905110359191895, "learning_rate": 9.843304843304844e-06, "loss": 1.0108, "step": 2670 }, { "epoch": 11.004529914529915, "grad_norm": 0.7647677063941956, "learning_rate": 9.838556505223173e-06, "loss": 2.3101, "step": 2680 }, { "epoch": 11.004957264957264, "grad_norm": 31.24732780456543, "learning_rate": 9.8338081671415e-06, "loss": 2.3872, "step": 2690 }, { "epoch": 11.005384615384616, "grad_norm": 29.57086944580078, "learning_rate": 9.82905982905983e-06, "loss": 1.7839, "step": 2700 }, { "epoch": 11.005811965811965, "grad_norm": 0.18307341635227203, "learning_rate": 9.824311490978159e-06, "loss": 1.4724, "step": 2710 }, { "epoch": 11.006239316239316, "grad_norm": 0.0853017047047615, "learning_rate": 9.819563152896488e-06, "loss": 0.2625, "step": 2720 }, { "epoch": 11.006666666666666, "grad_norm": 0.10152238607406616, "learning_rate": 9.814814814814815e-06, "loss": 1.0233, "step": 2730 }, { "epoch": 11.007094017094017, "grad_norm": 4.792705535888672, "learning_rate": 9.810066476733145e-06, "loss": 2.28, "step": 2740 }, { "epoch": 11.007521367521367, "grad_norm": 0.5689181685447693, "learning_rate": 9.805318138651474e-06, "loss": 1.3607, "step": 2750 }, { "epoch": 11.007948717948718, "grad_norm": 0.07235058397054672, "learning_rate": 9.800569800569801e-06, "loss": 0.5546, "step": 2760 }, { "epoch": 11.008376068376068, "grad_norm": 0.18848180770874023, "learning_rate": 9.79582146248813e-06, "loss": 0.0045, "step": 2770 }, { "epoch": 11.008803418803419, "grad_norm": 0.14690843224525452, "learning_rate": 9.791073124406458e-06, "loss": 0.6331, "step": 2780 }, { "epoch": 11.009230769230768, "grad_norm": 35.11496353149414, "learning_rate": 9.786324786324787e-06, "loss": 1.3858, "step": 2790 }, { "epoch": 11.00965811965812, "grad_norm": 0.35278671979904175, "learning_rate": 9.781576448243116e-06, "loss": 0.7839, "step": 2800 }, { "epoch": 11.01, "eval_accuracy": 0.4, "eval_loss": 3.237220525741577, "eval_runtime": 31.3652, "eval_samples_per_second": 0.797, "eval_steps_per_second": 0.797, "step": 2808 }, { "epoch": 12.00008547008547, "grad_norm": 0.31852149963378906, "learning_rate": 9.776828110161445e-06, "loss": 1.8794, "step": 2810 }, { "epoch": 12.00051282051282, "grad_norm": 1.0761709213256836, "learning_rate": 9.772079772079773e-06, "loss": 1.9101, "step": 2820 }, { "epoch": 12.00094017094017, "grad_norm": 0.06357964873313904, "learning_rate": 9.767331433998102e-06, "loss": 1.1383, "step": 2830 }, { "epoch": 12.001367521367522, "grad_norm": 54.093082427978516, "learning_rate": 9.762583095916431e-06, "loss": 2.112, "step": 2840 }, { "epoch": 12.001794871794871, "grad_norm": 0.2823314070701599, "learning_rate": 9.757834757834758e-06, "loss": 0.5963, "step": 2850 }, { "epoch": 12.002222222222223, "grad_norm": 0.3854888677597046, "learning_rate": 9.753086419753087e-06, "loss": 0.5196, "step": 2860 }, { "epoch": 12.002649572649572, "grad_norm": 32.18788146972656, "learning_rate": 9.748338081671415e-06, "loss": 3.0031, "step": 2870 }, { "epoch": 12.003076923076923, "grad_norm": 1.2660186290740967, "learning_rate": 9.743589743589744e-06, "loss": 1.9453, "step": 2880 }, { "epoch": 12.003504273504273, "grad_norm": 49.34856033325195, "learning_rate": 9.738841405508073e-06, "loss": 2.1836, "step": 2890 }, { "epoch": 12.003931623931624, "grad_norm": 3.923487663269043, "learning_rate": 9.7340930674264e-06, "loss": 0.6615, "step": 2900 }, { "epoch": 12.004358974358974, "grad_norm": 0.09915890544652939, "learning_rate": 9.72934472934473e-06, "loss": 1.8817, "step": 2910 }, { "epoch": 12.004786324786325, "grad_norm": 0.21953581273555756, "learning_rate": 9.724596391263059e-06, "loss": 2.0783, "step": 2920 }, { "epoch": 12.005213675213675, "grad_norm": 0.0524006113409996, "learning_rate": 9.719848053181388e-06, "loss": 1.1279, "step": 2930 }, { "epoch": 12.005641025641026, "grad_norm": 56.099334716796875, "learning_rate": 9.715099715099716e-06, "loss": 1.4809, "step": 2940 }, { "epoch": 12.006068376068376, "grad_norm": 0.5895985960960388, "learning_rate": 9.710351377018045e-06, "loss": 0.5329, "step": 2950 }, { "epoch": 12.006495726495727, "grad_norm": 42.8546257019043, "learning_rate": 9.705603038936374e-06, "loss": 2.1514, "step": 2960 }, { "epoch": 12.006923076923076, "grad_norm": 0.03076835907995701, "learning_rate": 9.700854700854701e-06, "loss": 0.4221, "step": 2970 }, { "epoch": 12.007350427350428, "grad_norm": 0.02532465010881424, "learning_rate": 9.69610636277303e-06, "loss": 2.3099, "step": 2980 }, { "epoch": 12.007777777777777, "grad_norm": 62.39934158325195, "learning_rate": 9.691358024691358e-06, "loss": 0.6646, "step": 2990 }, { "epoch": 12.008205128205129, "grad_norm": 54.54960632324219, "learning_rate": 9.686609686609687e-06, "loss": 0.4507, "step": 3000 }, { "epoch": 12.008632478632478, "grad_norm": 1.0829317569732666, "learning_rate": 9.681861348528016e-06, "loss": 1.9857, "step": 3010 }, { "epoch": 12.00905982905983, "grad_norm": 11.404279708862305, "learning_rate": 9.677113010446345e-06, "loss": 1.1354, "step": 3020 }, { "epoch": 12.009487179487179, "grad_norm": 0.041183874011039734, "learning_rate": 9.672364672364673e-06, "loss": 0.8741, "step": 3030 }, { "epoch": 12.00991452991453, "grad_norm": 19.812788009643555, "learning_rate": 9.667616334283002e-06, "loss": 0.3228, "step": 3040 }, { "epoch": 12.01, "eval_accuracy": 0.4, "eval_loss": 4.458773136138916, "eval_runtime": 31.367, "eval_samples_per_second": 0.797, "eval_steps_per_second": 0.797, "step": 3042 }, { "epoch": 13.00034188034188, "grad_norm": 49.4742317199707, "learning_rate": 9.662867996201331e-06, "loss": 2.7291, "step": 3050 }, { "epoch": 13.000769230769231, "grad_norm": 33.5203857421875, "learning_rate": 9.658119658119659e-06, "loss": 1.3205, "step": 3060 }, { "epoch": 13.00119658119658, "grad_norm": 32.33980941772461, "learning_rate": 9.653371320037988e-06, "loss": 1.9122, "step": 3070 }, { "epoch": 13.001623931623932, "grad_norm": 0.2694286108016968, "learning_rate": 9.648622981956317e-06, "loss": 1.0847, "step": 3080 }, { "epoch": 13.002051282051282, "grad_norm": 0.17534177005290985, "learning_rate": 9.643874643874644e-06, "loss": 1.2066, "step": 3090 }, { "epoch": 13.002478632478633, "grad_norm": 0.06266115605831146, "learning_rate": 9.639126305792973e-06, "loss": 1.0707, "step": 3100 }, { "epoch": 13.002905982905983, "grad_norm": 28.773160934448242, "learning_rate": 9.6343779677113e-06, "loss": 3.3066, "step": 3110 }, { "epoch": 13.003333333333334, "grad_norm": 36.008384704589844, "learning_rate": 9.62962962962963e-06, "loss": 1.0375, "step": 3120 }, { "epoch": 13.003760683760683, "grad_norm": 0.03540613874793053, "learning_rate": 9.624881291547959e-06, "loss": 0.5259, "step": 3130 }, { "epoch": 13.004188034188035, "grad_norm": 0.03709403797984123, "learning_rate": 9.620132953466288e-06, "loss": 0.0029, "step": 3140 }, { "epoch": 13.004615384615384, "grad_norm": 0.5573925375938416, "learning_rate": 9.615384615384616e-06, "loss": 1.7656, "step": 3150 }, { "epoch": 13.005042735042736, "grad_norm": 0.03247790411114693, "learning_rate": 9.610636277302945e-06, "loss": 1.6948, "step": 3160 }, { "epoch": 13.005470085470085, "grad_norm": 32.801788330078125, "learning_rate": 9.605887939221274e-06, "loss": 2.2156, "step": 3170 }, { "epoch": 13.005897435897436, "grad_norm": 3.8869693279266357, "learning_rate": 9.601139601139601e-06, "loss": 1.0666, "step": 3180 }, { "epoch": 13.006324786324786, "grad_norm": 0.8103746175765991, "learning_rate": 9.59639126305793e-06, "loss": 1.5307, "step": 3190 }, { "epoch": 13.006752136752137, "grad_norm": 10.717965126037598, "learning_rate": 9.591642924976258e-06, "loss": 1.6417, "step": 3200 }, { "epoch": 13.007179487179487, "grad_norm": 41.972896575927734, "learning_rate": 9.586894586894587e-06, "loss": 0.5114, "step": 3210 }, { "epoch": 13.007606837606838, "grad_norm": 79.40288543701172, "learning_rate": 9.582146248812916e-06, "loss": 1.7693, "step": 3220 }, { "epoch": 13.008034188034188, "grad_norm": 0.05152087286114693, "learning_rate": 9.577397910731245e-06, "loss": 1.1784, "step": 3230 }, { "epoch": 13.008461538461539, "grad_norm": 0.10308440029621124, "learning_rate": 9.572649572649575e-06, "loss": 1.9728, "step": 3240 }, { "epoch": 13.008888888888889, "grad_norm": 0.10380467772483826, "learning_rate": 9.567901234567902e-06, "loss": 1.1541, "step": 3250 }, { "epoch": 13.00931623931624, "grad_norm": 0.8398118019104004, "learning_rate": 9.563152896486231e-06, "loss": 0.5554, "step": 3260 }, { "epoch": 13.00974358974359, "grad_norm": 41.57830810546875, "learning_rate": 9.558404558404559e-06, "loss": 3.5377, "step": 3270 }, { "epoch": 13.01, "eval_accuracy": 0.4, "eval_loss": 2.862149238586426, "eval_runtime": 31.9079, "eval_samples_per_second": 0.784, "eval_steps_per_second": 0.784, "step": 3276 }, { "epoch": 14.00017094017094, "grad_norm": 37.766448974609375, "learning_rate": 9.553656220322888e-06, "loss": 1.0101, "step": 3280 }, { "epoch": 14.00059829059829, "grad_norm": 0.0569000206887722, "learning_rate": 9.548907882241217e-06, "loss": 0.5743, "step": 3290 }, { "epoch": 14.001025641025642, "grad_norm": 0.07739593833684921, "learning_rate": 9.544159544159544e-06, "loss": 0.6754, "step": 3300 }, { "epoch": 14.001452991452991, "grad_norm": 0.08230680227279663, "learning_rate": 9.539411206077874e-06, "loss": 2.4253, "step": 3310 }, { "epoch": 14.001880341880343, "grad_norm": 64.62518310546875, "learning_rate": 9.534662867996201e-06, "loss": 2.6924, "step": 3320 }, { "epoch": 14.002307692307692, "grad_norm": 6.906940460205078, "learning_rate": 9.52991452991453e-06, "loss": 1.5079, "step": 3330 }, { "epoch": 14.002735042735043, "grad_norm": 42.747108459472656, "learning_rate": 9.52516619183286e-06, "loss": 0.4776, "step": 3340 }, { "epoch": 14.003162393162393, "grad_norm": 0.0985599160194397, "learning_rate": 9.520417853751188e-06, "loss": 1.2448, "step": 3350 }, { "epoch": 14.003589743589744, "grad_norm": 43.92565155029297, "learning_rate": 9.515669515669516e-06, "loss": 1.1799, "step": 3360 }, { "epoch": 14.004017094017094, "grad_norm": 0.21025124192237854, "learning_rate": 9.510921177587845e-06, "loss": 1.5539, "step": 3370 }, { "epoch": 14.004444444444445, "grad_norm": 0.03208040073513985, "learning_rate": 9.506172839506174e-06, "loss": 0.0046, "step": 3380 }, { "epoch": 14.004871794871795, "grad_norm": 0.054694049060344696, "learning_rate": 9.501424501424502e-06, "loss": 1.6939, "step": 3390 }, { "epoch": 14.005299145299146, "grad_norm": 70.33654022216797, "learning_rate": 9.49667616334283e-06, "loss": 1.1934, "step": 3400 }, { "epoch": 14.005726495726496, "grad_norm": 22.107633590698242, "learning_rate": 9.49192782526116e-06, "loss": 1.3934, "step": 3410 }, { "epoch": 14.006153846153847, "grad_norm": 24.67310905456543, "learning_rate": 9.487179487179487e-06, "loss": 1.545, "step": 3420 }, { "epoch": 14.006581196581196, "grad_norm": 15.487519264221191, "learning_rate": 9.482431149097816e-06, "loss": 1.0724, "step": 3430 }, { "epoch": 14.007008547008548, "grad_norm": 8.748615264892578, "learning_rate": 9.477682811016146e-06, "loss": 1.1913, "step": 3440 }, { "epoch": 14.007435897435897, "grad_norm": 0.04509786143898964, "learning_rate": 9.472934472934475e-06, "loss": 0.8465, "step": 3450 }, { "epoch": 14.007863247863249, "grad_norm": 0.052408602088689804, "learning_rate": 9.468186134852802e-06, "loss": 0.9335, "step": 3460 }, { "epoch": 14.008290598290598, "grad_norm": 4.355685234069824, "learning_rate": 9.463437796771131e-06, "loss": 1.4667, "step": 3470 }, { "epoch": 14.00871794871795, "grad_norm": 31.14105796813965, "learning_rate": 9.458689458689459e-06, "loss": 1.9456, "step": 3480 }, { "epoch": 14.009145299145299, "grad_norm": 5.5146284103393555, "learning_rate": 9.453941120607788e-06, "loss": 2.4975, "step": 3490 }, { "epoch": 14.00957264957265, "grad_norm": 1.885633945465088, "learning_rate": 9.449192782526117e-06, "loss": 1.6884, "step": 3500 }, { "epoch": 14.01, "grad_norm": 0.19068482518196106, "learning_rate": 9.444444444444445e-06, "loss": 0.509, "step": 3510 }, { "epoch": 14.01, "eval_accuracy": 0.4, "eval_loss": 2.746034860610962, "eval_runtime": 31.4372, "eval_samples_per_second": 0.795, "eval_steps_per_second": 0.795, "step": 3510 }, { "epoch": 15.000427350427351, "grad_norm": 0.03531758114695549, "learning_rate": 9.439696106362774e-06, "loss": 1.7623, "step": 3520 }, { "epoch": 15.0008547008547, "grad_norm": 0.2500402331352234, "learning_rate": 9.434947768281101e-06, "loss": 0.9918, "step": 3530 }, { "epoch": 15.001282051282052, "grad_norm": 6.20103120803833, "learning_rate": 9.430199430199432e-06, "loss": 1.3489, "step": 3540 }, { "epoch": 15.001709401709402, "grad_norm": 0.275524377822876, "learning_rate": 9.42545109211776e-06, "loss": 0.6611, "step": 3550 }, { "epoch": 15.002136752136753, "grad_norm": 0.08374079316854477, "learning_rate": 9.420702754036089e-06, "loss": 1.2316, "step": 3560 }, { "epoch": 15.002564102564103, "grad_norm": 0.5503836274147034, "learning_rate": 9.415954415954418e-06, "loss": 0.8269, "step": 3570 }, { "epoch": 15.002991452991454, "grad_norm": 46.557186126708984, "learning_rate": 9.411206077872745e-06, "loss": 1.2204, "step": 3580 }, { "epoch": 15.003418803418803, "grad_norm": 63.70065689086914, "learning_rate": 9.406457739791074e-06, "loss": 1.834, "step": 3590 }, { "epoch": 15.003846153846155, "grad_norm": 35.3852424621582, "learning_rate": 9.401709401709402e-06, "loss": 0.6711, "step": 3600 }, { "epoch": 15.004273504273504, "grad_norm": 16.211149215698242, "learning_rate": 9.396961063627731e-06, "loss": 0.6182, "step": 3610 }, { "epoch": 15.004700854700856, "grad_norm": 76.63227081298828, "learning_rate": 9.39221272554606e-06, "loss": 0.831, "step": 3620 }, { "epoch": 15.005128205128205, "grad_norm": 0.5662261843681335, "learning_rate": 9.387464387464388e-06, "loss": 0.6825, "step": 3630 }, { "epoch": 15.005555555555556, "grad_norm": 0.5306532382965088, "learning_rate": 9.382716049382717e-06, "loss": 1.213, "step": 3640 }, { "epoch": 15.005982905982906, "grad_norm": 74.40409088134766, "learning_rate": 9.377967711301046e-06, "loss": 0.5874, "step": 3650 }, { "epoch": 15.006410256410257, "grad_norm": 0.02636171504855156, "learning_rate": 9.373219373219375e-06, "loss": 0.6467, "step": 3660 }, { "epoch": 15.006837606837607, "grad_norm": 0.043768566101789474, "learning_rate": 9.368471035137702e-06, "loss": 1.444, "step": 3670 }, { "epoch": 15.007264957264958, "grad_norm": 1.168853998184204, "learning_rate": 9.363722697056032e-06, "loss": 2.6958, "step": 3680 }, { "epoch": 15.007692307692308, "grad_norm": 41.314117431640625, "learning_rate": 9.358974358974359e-06, "loss": 1.917, "step": 3690 }, { "epoch": 15.008119658119659, "grad_norm": 28.175498962402344, "learning_rate": 9.354226020892688e-06, "loss": 1.4498, "step": 3700 }, { "epoch": 15.008547008547009, "grad_norm": 0.07334843277931213, "learning_rate": 9.349477682811017e-06, "loss": 0.3526, "step": 3710 }, { "epoch": 15.00897435897436, "grad_norm": 0.06012887507677078, "learning_rate": 9.344729344729345e-06, "loss": 1.0099, "step": 3720 }, { "epoch": 15.00940170940171, "grad_norm": 0.030554110184311867, "learning_rate": 9.339981006647674e-06, "loss": 0.5927, "step": 3730 }, { "epoch": 15.00982905982906, "grad_norm": 0.12558279931545258, "learning_rate": 9.335232668566003e-06, "loss": 0.1437, "step": 3740 }, { "epoch": 15.01, "eval_accuracy": 0.4, "eval_loss": 2.969764471054077, "eval_runtime": 31.3741, "eval_samples_per_second": 0.797, "eval_steps_per_second": 0.797, "step": 3744 }, { "epoch": 16.00025641025641, "grad_norm": 0.020290929824113846, "learning_rate": 9.330484330484332e-06, "loss": 1.4002, "step": 3750 }, { "epoch": 16.00068376068376, "grad_norm": 0.02340000309050083, "learning_rate": 9.32573599240266e-06, "loss": 1.3384, "step": 3760 }, { "epoch": 16.00111111111111, "grad_norm": 0.027016054838895798, "learning_rate": 9.320987654320989e-06, "loss": 2.2665, "step": 3770 }, { "epoch": 16.001538461538463, "grad_norm": 0.11080523580312729, "learning_rate": 9.316239316239318e-06, "loss": 0.7879, "step": 3780 }, { "epoch": 16.00196581196581, "grad_norm": 123.3178939819336, "learning_rate": 9.311490978157645e-06, "loss": 1.1529, "step": 3790 }, { "epoch": 16.00239316239316, "grad_norm": 0.09909241646528244, "learning_rate": 9.306742640075974e-06, "loss": 2.4706, "step": 3800 }, { "epoch": 16.002820512820513, "grad_norm": 0.025457249954342842, "learning_rate": 9.301994301994302e-06, "loss": 0.1416, "step": 3810 }, { "epoch": 16.003247863247864, "grad_norm": 0.054812464863061905, "learning_rate": 9.297245963912631e-06, "loss": 0.1454, "step": 3820 }, { "epoch": 16.003675213675212, "grad_norm": 42.573997497558594, "learning_rate": 9.29249762583096e-06, "loss": 2.6915, "step": 3830 }, { "epoch": 16.004102564102563, "grad_norm": 0.10259363800287247, "learning_rate": 9.287749287749288e-06, "loss": 1.2003, "step": 3840 }, { "epoch": 16.004529914529915, "grad_norm": 0.0471661239862442, "learning_rate": 9.283000949667617e-06, "loss": 1.2392, "step": 3850 }, { "epoch": 16.004957264957266, "grad_norm": 0.0715036392211914, "learning_rate": 9.278252611585946e-06, "loss": 0.4382, "step": 3860 }, { "epoch": 16.005384615384614, "grad_norm": 0.045701488852500916, "learning_rate": 9.273504273504275e-06, "loss": 0.3563, "step": 3870 }, { "epoch": 16.005811965811965, "grad_norm": 91.8859634399414, "learning_rate": 9.268755935422603e-06, "loss": 1.0391, "step": 3880 }, { "epoch": 16.006239316239316, "grad_norm": 2.5818240642547607, "learning_rate": 9.264007597340932e-06, "loss": 1.1685, "step": 3890 }, { "epoch": 16.006666666666668, "grad_norm": 0.031820375472307205, "learning_rate": 9.25925925925926e-06, "loss": 1.1833, "step": 3900 }, { "epoch": 16.007094017094015, "grad_norm": 0.07577929645776749, "learning_rate": 9.254510921177588e-06, "loss": 2.5019, "step": 3910 }, { "epoch": 16.007521367521367, "grad_norm": 0.03320784494280815, "learning_rate": 9.249762583095917e-06, "loss": 0.9758, "step": 3920 }, { "epoch": 16.007948717948718, "grad_norm": 117.20964813232422, "learning_rate": 9.245014245014245e-06, "loss": 0.7272, "step": 3930 }, { "epoch": 16.00837606837607, "grad_norm": 12.37725830078125, "learning_rate": 9.240265906932574e-06, "loss": 0.4982, "step": 3940 }, { "epoch": 16.008803418803417, "grad_norm": 0.03360183909535408, "learning_rate": 9.235517568850903e-06, "loss": 1.0855, "step": 3950 }, { "epoch": 16.00923076923077, "grad_norm": 209.72621154785156, "learning_rate": 9.230769230769232e-06, "loss": 1.1257, "step": 3960 }, { "epoch": 16.00965811965812, "grad_norm": 45.13322830200195, "learning_rate": 9.22602089268756e-06, "loss": 1.0039, "step": 3970 }, { "epoch": 16.01, "eval_accuracy": 0.44, "eval_loss": 1.9414650201797485, "eval_runtime": 31.717, "eval_samples_per_second": 0.788, "eval_steps_per_second": 0.788, "step": 3978 }, { "epoch": 17.00008547008547, "grad_norm": 0.20372648537158966, "learning_rate": 9.221272554605889e-06, "loss": 0.989, "step": 3980 }, { "epoch": 17.00051282051282, "grad_norm": 143.5639190673828, "learning_rate": 9.216524216524218e-06, "loss": 0.5736, "step": 3990 }, { "epoch": 17.000940170940172, "grad_norm": 0.0925712063908577, "learning_rate": 9.211775878442545e-06, "loss": 0.7445, "step": 4000 }, { "epoch": 17.00136752136752, "grad_norm": 0.03829040750861168, "learning_rate": 9.207027540360875e-06, "loss": 0.5319, "step": 4010 }, { "epoch": 17.00179487179487, "grad_norm": 66.21209716796875, "learning_rate": 9.202279202279202e-06, "loss": 1.2316, "step": 4020 }, { "epoch": 17.002222222222223, "grad_norm": 0.07042970508337021, "learning_rate": 9.197530864197531e-06, "loss": 1.7251, "step": 4030 }, { "epoch": 17.002649572649574, "grad_norm": 1.4991275072097778, "learning_rate": 9.19278252611586e-06, "loss": 0.9513, "step": 4040 }, { "epoch": 17.00307692307692, "grad_norm": 0.5810855627059937, "learning_rate": 9.188034188034188e-06, "loss": 2.5822, "step": 4050 }, { "epoch": 17.003504273504273, "grad_norm": 0.08620700240135193, "learning_rate": 9.183285849952517e-06, "loss": 1.3987, "step": 4060 }, { "epoch": 17.003931623931624, "grad_norm": 35.556392669677734, "learning_rate": 9.178537511870846e-06, "loss": 1.2302, "step": 4070 }, { "epoch": 17.004358974358976, "grad_norm": 33.437049865722656, "learning_rate": 9.173789173789175e-06, "loss": 1.1155, "step": 4080 }, { "epoch": 17.004786324786323, "grad_norm": 0.6656836867332458, "learning_rate": 9.169040835707503e-06, "loss": 1.1672, "step": 4090 }, { "epoch": 17.005213675213675, "grad_norm": 0.0775466188788414, "learning_rate": 9.164292497625832e-06, "loss": 0.9781, "step": 4100 }, { "epoch": 17.005641025641026, "grad_norm": 177.47276306152344, "learning_rate": 9.159544159544161e-06, "loss": 1.6938, "step": 4110 }, { "epoch": 17.006068376068377, "grad_norm": 0.02730586938560009, "learning_rate": 9.154795821462488e-06, "loss": 1.2777, "step": 4120 }, { "epoch": 17.006495726495725, "grad_norm": 76.6297607421875, "learning_rate": 9.150047483380818e-06, "loss": 0.6365, "step": 4130 }, { "epoch": 17.006923076923076, "grad_norm": 1.5181001424789429, "learning_rate": 9.145299145299145e-06, "loss": 0.5115, "step": 4140 }, { "epoch": 17.007350427350428, "grad_norm": 7.319108486175537, "learning_rate": 9.140550807217474e-06, "loss": 1.4807, "step": 4150 }, { "epoch": 17.00777777777778, "grad_norm": 139.0987548828125, "learning_rate": 9.135802469135803e-06, "loss": 1.5217, "step": 4160 }, { "epoch": 17.008205128205127, "grad_norm": 16.966917037963867, "learning_rate": 9.131054131054132e-06, "loss": 2.0712, "step": 4170 }, { "epoch": 17.008632478632478, "grad_norm": 73.7410659790039, "learning_rate": 9.126305792972462e-06, "loss": 1.0309, "step": 4180 }, { "epoch": 17.00905982905983, "grad_norm": 0.03598838672041893, "learning_rate": 9.121557454890789e-06, "loss": 0.3113, "step": 4190 }, { "epoch": 17.00948717948718, "grad_norm": 0.01311536505818367, "learning_rate": 9.116809116809118e-06, "loss": 1.8983, "step": 4200 }, { "epoch": 17.00991452991453, "grad_norm": 0.07452392578125, "learning_rate": 9.112060778727446e-06, "loss": 0.0062, "step": 4210 }, { "epoch": 17.01, "eval_accuracy": 0.4, "eval_loss": 3.7040703296661377, "eval_runtime": 31.9027, "eval_samples_per_second": 0.784, "eval_steps_per_second": 0.784, "step": 4212 }, { "epoch": 18.00034188034188, "grad_norm": 107.44599151611328, "learning_rate": 9.107312440645775e-06, "loss": 1.8731, "step": 4220 }, { "epoch": 18.00076923076923, "grad_norm": 0.23524224758148193, "learning_rate": 9.102564102564104e-06, "loss": 1.0308, "step": 4230 }, { "epoch": 18.00119658119658, "grad_norm": 0.07270976901054382, "learning_rate": 9.097815764482431e-06, "loss": 1.2283, "step": 4240 }, { "epoch": 18.001623931623932, "grad_norm": 6.627790451049805, "learning_rate": 9.09306742640076e-06, "loss": 1.0094, "step": 4250 }, { "epoch": 18.002051282051283, "grad_norm": 0.02103256992995739, "learning_rate": 9.088319088319088e-06, "loss": 0.5875, "step": 4260 }, { "epoch": 18.00247863247863, "grad_norm": 0.6851010322570801, "learning_rate": 9.083570750237419e-06, "loss": 1.2334, "step": 4270 }, { "epoch": 18.002905982905983, "grad_norm": 82.27792358398438, "learning_rate": 9.078822412155746e-06, "loss": 1.9545, "step": 4280 }, { "epoch": 18.003333333333334, "grad_norm": 139.23216247558594, "learning_rate": 9.074074074074075e-06, "loss": 0.8405, "step": 4290 }, { "epoch": 18.003760683760685, "grad_norm": 0.8755741715431213, "learning_rate": 9.069325735992403e-06, "loss": 1.1443, "step": 4300 }, { "epoch": 18.004188034188033, "grad_norm": 105.50877380371094, "learning_rate": 9.064577397910732e-06, "loss": 0.821, "step": 4310 }, { "epoch": 18.004615384615384, "grad_norm": 114.59051513671875, "learning_rate": 9.059829059829061e-06, "loss": 1.6665, "step": 4320 }, { "epoch": 18.005042735042736, "grad_norm": 0.04322560876607895, "learning_rate": 9.055080721747389e-06, "loss": 0.9944, "step": 4330 }, { "epoch": 18.005470085470087, "grad_norm": 16.47235870361328, "learning_rate": 9.050332383665718e-06, "loss": 0.6319, "step": 4340 }, { "epoch": 18.005897435897435, "grad_norm": 5.790500640869141, "learning_rate": 9.045584045584045e-06, "loss": 1.4753, "step": 4350 }, { "epoch": 18.006324786324786, "grad_norm": 0.045558616518974304, "learning_rate": 9.040835707502374e-06, "loss": 1.4482, "step": 4360 }, { "epoch": 18.006752136752137, "grad_norm": 0.014822369441390038, "learning_rate": 9.036087369420703e-06, "loss": 1.1214, "step": 4370 }, { "epoch": 18.00717948717949, "grad_norm": 90.13900756835938, "learning_rate": 9.031339031339033e-06, "loss": 2.0725, "step": 4380 }, { "epoch": 18.007606837606836, "grad_norm": 80.13975524902344, "learning_rate": 9.026590693257362e-06, "loss": 1.1801, "step": 4390 }, { "epoch": 18.008034188034188, "grad_norm": 0.009048054926097393, "learning_rate": 9.02184235517569e-06, "loss": 0.6618, "step": 4400 }, { "epoch": 18.00846153846154, "grad_norm": 59.612308502197266, "learning_rate": 9.017094017094018e-06, "loss": 1.7319, "step": 4410 }, { "epoch": 18.00888888888889, "grad_norm": 0.1411258578300476, "learning_rate": 9.012345679012346e-06, "loss": 0.9388, "step": 4420 }, { "epoch": 18.009316239316238, "grad_norm": 12.921591758728027, "learning_rate": 9.007597340930675e-06, "loss": 0.6901, "step": 4430 }, { "epoch": 18.00974358974359, "grad_norm": 25.335466384887695, "learning_rate": 9.002849002849004e-06, "loss": 0.6038, "step": 4440 }, { "epoch": 18.01, "eval_accuracy": 0.4, "eval_loss": 3.2141363620758057, "eval_runtime": 31.3241, "eval_samples_per_second": 0.798, "eval_steps_per_second": 0.798, "step": 4446 }, { "epoch": 19.00017094017094, "grad_norm": 119.39568328857422, "learning_rate": 8.998100664767332e-06, "loss": 1.4807, "step": 4450 }, { "epoch": 19.00059829059829, "grad_norm": 86.97232055664062, "learning_rate": 8.99335232668566e-06, "loss": 1.6098, "step": 4460 }, { "epoch": 19.00102564102564, "grad_norm": 69.82275390625, "learning_rate": 8.988603988603988e-06, "loss": 0.7945, "step": 4470 }, { "epoch": 19.001452991452993, "grad_norm": 0.024586718529462814, "learning_rate": 8.983855650522319e-06, "loss": 0.289, "step": 4480 }, { "epoch": 19.00188034188034, "grad_norm": 0.08284483104944229, "learning_rate": 8.979107312440646e-06, "loss": 0.7986, "step": 4490 }, { "epoch": 19.002307692307692, "grad_norm": 31.52510643005371, "learning_rate": 8.974358974358976e-06, "loss": 0.337, "step": 4500 }, { "epoch": 19.002735042735043, "grad_norm": 0.04926073178648949, "learning_rate": 8.969610636277305e-06, "loss": 1.7346, "step": 4510 }, { "epoch": 19.003162393162395, "grad_norm": 0.06321626901626587, "learning_rate": 8.964862298195632e-06, "loss": 0.9723, "step": 4520 }, { "epoch": 19.003589743589743, "grad_norm": 0.09472198784351349, "learning_rate": 8.960113960113961e-06, "loss": 0.0057, "step": 4530 }, { "epoch": 19.004017094017094, "grad_norm": 145.72950744628906, "learning_rate": 8.955365622032289e-06, "loss": 0.7365, "step": 4540 }, { "epoch": 19.004444444444445, "grad_norm": 0.012398917227983475, "learning_rate": 8.950617283950618e-06, "loss": 2.1059, "step": 4550 }, { "epoch": 19.004871794871796, "grad_norm": 73.18333435058594, "learning_rate": 8.945868945868947e-06, "loss": 1.8259, "step": 4560 }, { "epoch": 19.005299145299144, "grad_norm": 84.90556335449219, "learning_rate": 8.941120607787274e-06, "loss": 0.696, "step": 4570 }, { "epoch": 19.005726495726496, "grad_norm": 0.024428799748420715, "learning_rate": 8.936372269705604e-06, "loss": 1.0575, "step": 4580 }, { "epoch": 19.006153846153847, "grad_norm": 44.86989974975586, "learning_rate": 8.931623931623933e-06, "loss": 1.0172, "step": 4590 }, { "epoch": 19.006581196581198, "grad_norm": 0.16147296130657196, "learning_rate": 8.926875593542262e-06, "loss": 0.4637, "step": 4600 }, { "epoch": 19.007008547008546, "grad_norm": 0.016476508229970932, "learning_rate": 8.92212725546059e-06, "loss": 0.0585, "step": 4610 }, { "epoch": 19.007435897435897, "grad_norm": 69.78681945800781, "learning_rate": 8.917378917378919e-06, "loss": 2.8439, "step": 4620 }, { "epoch": 19.00786324786325, "grad_norm": 143.7225799560547, "learning_rate": 8.912630579297246e-06, "loss": 0.3701, "step": 4630 }, { "epoch": 19.0082905982906, "grad_norm": 3.918398141860962, "learning_rate": 8.907882241215575e-06, "loss": 0.6645, "step": 4640 }, { "epoch": 19.008717948717948, "grad_norm": 0.4194408357143402, "learning_rate": 8.903133903133904e-06, "loss": 1.5778, "step": 4650 }, { "epoch": 19.0091452991453, "grad_norm": 0.7064778804779053, "learning_rate": 8.898385565052232e-06, "loss": 0.9134, "step": 4660 }, { "epoch": 19.00957264957265, "grad_norm": 97.42158508300781, "learning_rate": 8.893637226970561e-06, "loss": 0.6072, "step": 4670 }, { "epoch": 19.01, "grad_norm": 0.028133299201726913, "learning_rate": 8.888888888888888e-06, "loss": 1.1687, "step": 4680 }, { "epoch": 19.01, "eval_accuracy": 0.44, "eval_loss": 2.407238721847534, "eval_runtime": 31.5073, "eval_samples_per_second": 0.793, "eval_steps_per_second": 0.793, "step": 4680 }, { "epoch": 20.00042735042735, "grad_norm": 0.0467204824090004, "learning_rate": 8.884140550807219e-06, "loss": 1.0038, "step": 4690 }, { "epoch": 20.000854700854703, "grad_norm": 0.044175442308187485, "learning_rate": 8.879392212725547e-06, "loss": 0.2621, "step": 4700 }, { "epoch": 20.00128205128205, "grad_norm": 0.12440741062164307, "learning_rate": 8.874643874643876e-06, "loss": 1.2026, "step": 4710 }, { "epoch": 20.0017094017094, "grad_norm": 0.011806718073785305, "learning_rate": 8.869895536562205e-06, "loss": 0.7563, "step": 4720 }, { "epoch": 20.002136752136753, "grad_norm": 1.0385252237319946, "learning_rate": 8.865147198480532e-06, "loss": 0.5996, "step": 4730 }, { "epoch": 20.002564102564104, "grad_norm": 175.9308319091797, "learning_rate": 8.860398860398861e-06, "loss": 0.8629, "step": 4740 }, { "epoch": 20.002991452991452, "grad_norm": 0.028463028371334076, "learning_rate": 8.855650522317189e-06, "loss": 2.8217, "step": 4750 }, { "epoch": 20.003418803418803, "grad_norm": 0.030408434569835663, "learning_rate": 8.850902184235518e-06, "loss": 0.6615, "step": 4760 }, { "epoch": 20.003846153846155, "grad_norm": 0.057413261383771896, "learning_rate": 8.846153846153847e-06, "loss": 0.5805, "step": 4770 }, { "epoch": 20.004273504273506, "grad_norm": 73.27574920654297, "learning_rate": 8.841405508072175e-06, "loss": 1.1753, "step": 4780 }, { "epoch": 20.004700854700854, "grad_norm": 0.06163284182548523, "learning_rate": 8.836657169990504e-06, "loss": 0.4274, "step": 4790 }, { "epoch": 20.005128205128205, "grad_norm": 23.880149841308594, "learning_rate": 8.831908831908833e-06, "loss": 0.6001, "step": 4800 }, { "epoch": 20.005555555555556, "grad_norm": 81.80984497070312, "learning_rate": 8.827160493827162e-06, "loss": 0.7434, "step": 4810 }, { "epoch": 20.005982905982908, "grad_norm": 0.018306629732251167, "learning_rate": 8.82241215574549e-06, "loss": 0.7265, "step": 4820 }, { "epoch": 20.006410256410255, "grad_norm": 2.1246113777160645, "learning_rate": 8.817663817663819e-06, "loss": 1.1732, "step": 4830 }, { "epoch": 20.006837606837607, "grad_norm": 0.1039893627166748, "learning_rate": 8.812915479582148e-06, "loss": 1.2742, "step": 4840 }, { "epoch": 20.007264957264958, "grad_norm": 0.3535962402820587, "learning_rate": 8.808167141500475e-06, "loss": 0.4528, "step": 4850 }, { "epoch": 20.00769230769231, "grad_norm": 177.94131469726562, "learning_rate": 8.803418803418804e-06, "loss": 0.4491, "step": 4860 }, { "epoch": 20.008119658119657, "grad_norm": 0.01753688044846058, "learning_rate": 8.798670465337132e-06, "loss": 1.9356, "step": 4870 }, { "epoch": 20.00854700854701, "grad_norm": 5.777415752410889, "learning_rate": 8.793922127255461e-06, "loss": 0.7365, "step": 4880 }, { "epoch": 20.00897435897436, "grad_norm": 0.021493086591362953, "learning_rate": 8.78917378917379e-06, "loss": 1.5335, "step": 4890 }, { "epoch": 20.00940170940171, "grad_norm": 56.53275680541992, "learning_rate": 8.78442545109212e-06, "loss": 2.5917, "step": 4900 }, { "epoch": 20.00982905982906, "grad_norm": 0.8556663393974304, "learning_rate": 8.779677113010447e-06, "loss": 0.8397, "step": 4910 }, { "epoch": 20.01, "eval_accuracy": 0.4, "eval_loss": 3.42124080657959, "eval_runtime": 32.6346, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.766, "step": 4914 }, { "epoch": 21.00025641025641, "grad_norm": 0.02766931988298893, "learning_rate": 8.774928774928776e-06, "loss": 1.0177, "step": 4920 }, { "epoch": 21.00068376068376, "grad_norm": 1.8544671535491943, "learning_rate": 8.770180436847105e-06, "loss": 0.6464, "step": 4930 }, { "epoch": 21.00111111111111, "grad_norm": 64.42668914794922, "learning_rate": 8.765432098765432e-06, "loss": 2.2753, "step": 4940 }, { "epoch": 21.001538461538463, "grad_norm": 5.441823959350586, "learning_rate": 8.760683760683762e-06, "loss": 0.3229, "step": 4950 }, { "epoch": 21.00196581196581, "grad_norm": 0.053660500794649124, "learning_rate": 8.755935422602089e-06, "loss": 0.4219, "step": 4960 }, { "epoch": 21.00239316239316, "grad_norm": 0.015398544259369373, "learning_rate": 8.751187084520418e-06, "loss": 0.0334, "step": 4970 }, { "epoch": 21.002820512820513, "grad_norm": 27.52494239807129, "learning_rate": 8.746438746438747e-06, "loss": 1.3975, "step": 4980 }, { "epoch": 21.003247863247864, "grad_norm": 0.019361304119229317, "learning_rate": 8.741690408357075e-06, "loss": 1.1835, "step": 4990 }, { "epoch": 21.003675213675212, "grad_norm": 0.15508921444416046, "learning_rate": 8.736942070275406e-06, "loss": 0.9132, "step": 5000 }, { "epoch": 21.004102564102563, "grad_norm": 394.57867431640625, "learning_rate": 8.732193732193733e-06, "loss": 0.8003, "step": 5010 }, { "epoch": 21.004529914529915, "grad_norm": 0.3072645962238312, "learning_rate": 8.727445394112062e-06, "loss": 1.085, "step": 5020 }, { "epoch": 21.004957264957266, "grad_norm": 128.3138427734375, "learning_rate": 8.72269705603039e-06, "loss": 1.6623, "step": 5030 }, { "epoch": 21.005384615384614, "grad_norm": 0.40651294589042664, "learning_rate": 8.717948717948719e-06, "loss": 1.3648, "step": 5040 }, { "epoch": 21.005811965811965, "grad_norm": 51.56714630126953, "learning_rate": 8.713200379867048e-06, "loss": 1.1869, "step": 5050 }, { "epoch": 21.006239316239316, "grad_norm": 167.30929565429688, "learning_rate": 8.708452041785375e-06, "loss": 1.2651, "step": 5060 }, { "epoch": 21.006666666666668, "grad_norm": 26.427907943725586, "learning_rate": 8.703703703703705e-06, "loss": 1.136, "step": 5070 }, { "epoch": 21.007094017094015, "grad_norm": 0.03196699172258377, "learning_rate": 8.698955365622032e-06, "loss": 0.5952, "step": 5080 }, { "epoch": 21.007521367521367, "grad_norm": 131.19776916503906, "learning_rate": 8.694207027540361e-06, "loss": 1.0937, "step": 5090 }, { "epoch": 21.007948717948718, "grad_norm": 0.009517078287899494, "learning_rate": 8.68945868945869e-06, "loss": 0.6601, "step": 5100 }, { "epoch": 21.00837606837607, "grad_norm": 106.69688415527344, "learning_rate": 8.68471035137702e-06, "loss": 1.0542, "step": 5110 }, { "epoch": 21.008803418803417, "grad_norm": 0.006986229680478573, "learning_rate": 8.679962013295347e-06, "loss": 1.1394, "step": 5120 }, { "epoch": 21.00923076923077, "grad_norm": 127.98587799072266, "learning_rate": 8.675213675213676e-06, "loss": 2.0223, "step": 5130 }, { "epoch": 21.00965811965812, "grad_norm": 113.3585205078125, "learning_rate": 8.670465337132005e-06, "loss": 1.1147, "step": 5140 }, { "epoch": 21.01, "eval_accuracy": 0.44, "eval_loss": 2.511547565460205, "eval_runtime": 33.474, "eval_samples_per_second": 0.747, "eval_steps_per_second": 0.747, "step": 5148 }, { "epoch": 22.00008547008547, "grad_norm": 0.017451312392950058, "learning_rate": 8.665716999050333e-06, "loss": 0.4376, "step": 5150 }, { "epoch": 22.00051282051282, "grad_norm": 0.21703580021858215, "learning_rate": 8.660968660968662e-06, "loss": 0.8807, "step": 5160 }, { "epoch": 22.000940170940172, "grad_norm": 0.01205768994987011, "learning_rate": 8.656220322886991e-06, "loss": 0.4462, "step": 5170 }, { "epoch": 22.00136752136752, "grad_norm": 62.46580123901367, "learning_rate": 8.651471984805318e-06, "loss": 0.6433, "step": 5180 }, { "epoch": 22.00179487179487, "grad_norm": 195.9849853515625, "learning_rate": 8.646723646723648e-06, "loss": 0.6842, "step": 5190 }, { "epoch": 22.002222222222223, "grad_norm": 23.76949691772461, "learning_rate": 8.641975308641975e-06, "loss": 1.8101, "step": 5200 }, { "epoch": 22.002649572649574, "grad_norm": 1.1113344430923462, "learning_rate": 8.637226970560306e-06, "loss": 1.3208, "step": 5210 }, { "epoch": 22.00307692307692, "grad_norm": 50.05499267578125, "learning_rate": 8.632478632478633e-06, "loss": 1.7615, "step": 5220 }, { "epoch": 22.003504273504273, "grad_norm": 148.08843994140625, "learning_rate": 8.627730294396962e-06, "loss": 0.5733, "step": 5230 }, { "epoch": 22.003931623931624, "grad_norm": 22.12079429626465, "learning_rate": 8.62298195631529e-06, "loss": 0.399, "step": 5240 }, { "epoch": 22.004358974358976, "grad_norm": 61.95510482788086, "learning_rate": 8.618233618233619e-06, "loss": 0.8791, "step": 5250 }, { "epoch": 22.004786324786323, "grad_norm": 0.0063286456279456615, "learning_rate": 8.613485280151948e-06, "loss": 0.9967, "step": 5260 }, { "epoch": 22.005213675213675, "grad_norm": 0.005704652518033981, "learning_rate": 8.608736942070276e-06, "loss": 0.0331, "step": 5270 }, { "epoch": 22.005641025641026, "grad_norm": 0.019357306882739067, "learning_rate": 8.603988603988605e-06, "loss": 1.0326, "step": 5280 }, { "epoch": 22.006068376068377, "grad_norm": 38.796791076660156, "learning_rate": 8.599240265906932e-06, "loss": 0.3044, "step": 5290 }, { "epoch": 22.006495726495725, "grad_norm": 0.023437276482582092, "learning_rate": 8.594491927825261e-06, "loss": 1.0089, "step": 5300 }, { "epoch": 22.006923076923076, "grad_norm": 91.38776397705078, "learning_rate": 8.58974358974359e-06, "loss": 1.7264, "step": 5310 }, { "epoch": 22.007350427350428, "grad_norm": 0.006801048293709755, "learning_rate": 8.58499525166192e-06, "loss": 0.2886, "step": 5320 }, { "epoch": 22.00777777777778, "grad_norm": 0.005525406915694475, "learning_rate": 8.580246913580249e-06, "loss": 0.6458, "step": 5330 }, { "epoch": 22.008205128205127, "grad_norm": 396.8755187988281, "learning_rate": 8.575498575498576e-06, "loss": 0.6596, "step": 5340 }, { "epoch": 22.008632478632478, "grad_norm": 0.45548421144485474, "learning_rate": 8.570750237416905e-06, "loss": 0.8506, "step": 5350 }, { "epoch": 22.00905982905983, "grad_norm": 0.06096582114696503, "learning_rate": 8.566001899335233e-06, "loss": 0.9077, "step": 5360 }, { "epoch": 22.00948717948718, "grad_norm": 52.80357360839844, "learning_rate": 8.561253561253562e-06, "loss": 1.4106, "step": 5370 }, { "epoch": 22.00991452991453, "grad_norm": 0.40176790952682495, "learning_rate": 8.556505223171891e-06, "loss": 0.2286, "step": 5380 }, { "epoch": 22.01, "eval_accuracy": 0.44, "eval_loss": 2.434278964996338, "eval_runtime": 33.4186, "eval_samples_per_second": 0.748, "eval_steps_per_second": 0.748, "step": 5382 }, { "epoch": 23.00034188034188, "grad_norm": 0.008083014748990536, "learning_rate": 8.551756885090219e-06, "loss": 0.2302, "step": 5390 }, { "epoch": 23.00076923076923, "grad_norm": 81.26300048828125, "learning_rate": 8.547008547008548e-06, "loss": 0.586, "step": 5400 }, { "epoch": 23.00119658119658, "grad_norm": 0.00498996814712882, "learning_rate": 8.542260208926875e-06, "loss": 0.3741, "step": 5410 }, { "epoch": 23.001623931623932, "grad_norm": 0.028612615540623665, "learning_rate": 8.537511870845206e-06, "loss": 0.4084, "step": 5420 }, { "epoch": 23.002051282051283, "grad_norm": 0.005741656292229891, "learning_rate": 8.532763532763533e-06, "loss": 1.1549, "step": 5430 }, { "epoch": 23.00247863247863, "grad_norm": 0.0038078909274190664, "learning_rate": 8.528015194681863e-06, "loss": 0.5803, "step": 5440 }, { "epoch": 23.002905982905983, "grad_norm": 0.07777408510446548, "learning_rate": 8.52326685660019e-06, "loss": 1.8444, "step": 5450 }, { "epoch": 23.003333333333334, "grad_norm": 0.23302805423736572, "learning_rate": 8.518518518518519e-06, "loss": 0.3736, "step": 5460 }, { "epoch": 23.003760683760685, "grad_norm": 0.03206072375178337, "learning_rate": 8.513770180436848e-06, "loss": 0.6338, "step": 5470 }, { "epoch": 23.004188034188033, "grad_norm": 55.12649154663086, "learning_rate": 8.509021842355176e-06, "loss": 0.9267, "step": 5480 }, { "epoch": 23.004615384615384, "grad_norm": 68.18521881103516, "learning_rate": 8.504273504273505e-06, "loss": 0.8771, "step": 5490 }, { "epoch": 23.005042735042736, "grad_norm": 28.95956039428711, "learning_rate": 8.499525166191834e-06, "loss": 0.8377, "step": 5500 }, { "epoch": 23.005470085470087, "grad_norm": 0.004457338713109493, "learning_rate": 8.494776828110161e-06, "loss": 0.0553, "step": 5510 }, { "epoch": 23.005897435897435, "grad_norm": 0.024025579914450645, "learning_rate": 8.49002849002849e-06, "loss": 0.8331, "step": 5520 }, { "epoch": 23.006324786324786, "grad_norm": 0.019380344077944756, "learning_rate": 8.48528015194682e-06, "loss": 1.7575, "step": 5530 }, { "epoch": 23.006752136752137, "grad_norm": 0.015158622525632381, "learning_rate": 8.480531813865149e-06, "loss": 1.1634, "step": 5540 }, { "epoch": 23.00717948717949, "grad_norm": 27.653972625732422, "learning_rate": 8.475783475783476e-06, "loss": 1.3936, "step": 5550 }, { "epoch": 23.007606837606836, "grad_norm": 197.41244506835938, "learning_rate": 8.471035137701806e-06, "loss": 0.9821, "step": 5560 }, { "epoch": 23.008034188034188, "grad_norm": 0.009685487486422062, "learning_rate": 8.466286799620133e-06, "loss": 0.048, "step": 5570 }, { "epoch": 23.00846153846154, "grad_norm": 0.04525621980428696, "learning_rate": 8.461538461538462e-06, "loss": 1.1907, "step": 5580 }, { "epoch": 23.00888888888889, "grad_norm": 0.29436931014060974, "learning_rate": 8.456790123456791e-06, "loss": 2.1125, "step": 5590 }, { "epoch": 23.009316239316238, "grad_norm": 0.028307698667049408, "learning_rate": 8.452041785375119e-06, "loss": 0.6447, "step": 5600 }, { "epoch": 23.00974358974359, "grad_norm": 171.65646362304688, "learning_rate": 8.447293447293448e-06, "loss": 0.8939, "step": 5610 }, { "epoch": 23.01, "eval_accuracy": 0.4, "eval_loss": 3.0712430477142334, "eval_runtime": 33.4249, "eval_samples_per_second": 0.748, "eval_steps_per_second": 0.748, "step": 5616 }, { "epoch": 24.00017094017094, "grad_norm": 0.5712462067604065, "learning_rate": 8.442545109211775e-06, "loss": 0.2082, "step": 5620 }, { "epoch": 24.00059829059829, "grad_norm": 0.008292453363537788, "learning_rate": 8.437796771130106e-06, "loss": 0.6958, "step": 5630 }, { "epoch": 24.00102564102564, "grad_norm": 118.00381469726562, "learning_rate": 8.433048433048434e-06, "loss": 0.9527, "step": 5640 }, { "epoch": 24.001452991452993, "grad_norm": 1.618455171585083, "learning_rate": 8.428300094966763e-06, "loss": 0.3995, "step": 5650 }, { "epoch": 24.00188034188034, "grad_norm": 131.17752075195312, "learning_rate": 8.423551756885092e-06, "loss": 0.3038, "step": 5660 }, { "epoch": 24.002307692307692, "grad_norm": 2.7916219234466553, "learning_rate": 8.41880341880342e-06, "loss": 0.7865, "step": 5670 }, { "epoch": 24.002735042735043, "grad_norm": 37.944374084472656, "learning_rate": 8.414055080721748e-06, "loss": 0.1766, "step": 5680 }, { "epoch": 24.003162393162395, "grad_norm": 0.038069456815719604, "learning_rate": 8.409306742640076e-06, "loss": 0.3233, "step": 5690 }, { "epoch": 24.003589743589743, "grad_norm": 0.01322519313544035, "learning_rate": 8.404558404558405e-06, "loss": 0.2382, "step": 5700 }, { "epoch": 24.004017094017094, "grad_norm": 181.4189910888672, "learning_rate": 8.399810066476734e-06, "loss": 0.9649, "step": 5710 }, { "epoch": 24.004444444444445, "grad_norm": 37.26016616821289, "learning_rate": 8.395061728395062e-06, "loss": 1.6616, "step": 5720 }, { "epoch": 24.004871794871796, "grad_norm": 0.007100946735590696, "learning_rate": 8.39031339031339e-06, "loss": 1.193, "step": 5730 }, { "epoch": 24.005299145299144, "grad_norm": 2.50199031829834, "learning_rate": 8.38556505223172e-06, "loss": 1.475, "step": 5740 }, { "epoch": 24.005726495726496, "grad_norm": 0.008331292308866978, "learning_rate": 8.380816714150049e-06, "loss": 0.2329, "step": 5750 }, { "epoch": 24.006153846153847, "grad_norm": 0.04707374423742294, "learning_rate": 8.376068376068377e-06, "loss": 0.363, "step": 5760 }, { "epoch": 24.006581196581198, "grad_norm": 0.08683554083108902, "learning_rate": 8.371320037986706e-06, "loss": 1.9591, "step": 5770 }, { "epoch": 24.007008547008546, "grad_norm": 0.020473746582865715, "learning_rate": 8.366571699905033e-06, "loss": 1.562, "step": 5780 }, { "epoch": 24.007435897435897, "grad_norm": 48.53575134277344, "learning_rate": 8.361823361823362e-06, "loss": 0.5132, "step": 5790 }, { "epoch": 24.00786324786325, "grad_norm": 0.007784120738506317, "learning_rate": 8.357075023741691e-06, "loss": 1.1691, "step": 5800 }, { "epoch": 24.0082905982906, "grad_norm": 0.08522366732358932, "learning_rate": 8.352326685660019e-06, "loss": 0.0145, "step": 5810 }, { "epoch": 24.008717948717948, "grad_norm": 3.588449001312256, "learning_rate": 8.347578347578348e-06, "loss": 0.2557, "step": 5820 }, { "epoch": 24.0091452991453, "grad_norm": 6.431326389312744, "learning_rate": 8.342830009496677e-06, "loss": 0.8734, "step": 5830 }, { "epoch": 24.00957264957265, "grad_norm": 10.404790878295898, "learning_rate": 8.338081671415006e-06, "loss": 1.3251, "step": 5840 }, { "epoch": 24.01, "grad_norm": 0.02091125212609768, "learning_rate": 8.333333333333334e-06, "loss": 0.3871, "step": 5850 }, { "epoch": 24.01, "eval_accuracy": 0.4, "eval_loss": 3.2393715381622314, "eval_runtime": 33.9657, "eval_samples_per_second": 0.736, "eval_steps_per_second": 0.736, "step": 5850 }, { "epoch": 25.00042735042735, "grad_norm": 134.0677947998047, "learning_rate": 8.328584995251663e-06, "loss": 0.6986, "step": 5860 }, { "epoch": 25.000854700854703, "grad_norm": 0.03229491412639618, "learning_rate": 8.323836657169992e-06, "loss": 0.768, "step": 5870 }, { "epoch": 25.00128205128205, "grad_norm": 156.05152893066406, "learning_rate": 8.31908831908832e-06, "loss": 1.0463, "step": 5880 }, { "epoch": 25.0017094017094, "grad_norm": 0.014203476719558239, "learning_rate": 8.314339981006649e-06, "loss": 0.488, "step": 5890 }, { "epoch": 25.002136752136753, "grad_norm": 285.82000732421875, "learning_rate": 8.309591642924976e-06, "loss": 0.8084, "step": 5900 }, { "epoch": 25.002564102564104, "grad_norm": 122.90914154052734, "learning_rate": 8.304843304843305e-06, "loss": 0.8786, "step": 5910 }, { "epoch": 25.002991452991452, "grad_norm": 8.224214553833008, "learning_rate": 8.300094966761634e-06, "loss": 0.3977, "step": 5920 }, { "epoch": 25.003418803418803, "grad_norm": 230.34385681152344, "learning_rate": 8.295346628679962e-06, "loss": 0.5285, "step": 5930 }, { "epoch": 25.003846153846155, "grad_norm": 1.262283205986023, "learning_rate": 8.290598290598293e-06, "loss": 0.8955, "step": 5940 }, { "epoch": 25.004273504273506, "grad_norm": 71.28157043457031, "learning_rate": 8.28584995251662e-06, "loss": 0.53, "step": 5950 }, { "epoch": 25.004700854700854, "grad_norm": 0.19136950373649597, "learning_rate": 8.28110161443495e-06, "loss": 0.8647, "step": 5960 }, { "epoch": 25.005128205128205, "grad_norm": 0.2358364760875702, "learning_rate": 8.276353276353277e-06, "loss": 0.0021, "step": 5970 }, { "epoch": 25.005555555555556, "grad_norm": 0.007490760181099176, "learning_rate": 8.271604938271606e-06, "loss": 0.6602, "step": 5980 }, { "epoch": 25.005982905982908, "grad_norm": 0.006508139427751303, "learning_rate": 8.266856600189935e-06, "loss": 0.7927, "step": 5990 }, { "epoch": 25.006410256410255, "grad_norm": 0.024243731051683426, "learning_rate": 8.262108262108262e-06, "loss": 1.5309, "step": 6000 }, { "epoch": 25.006837606837607, "grad_norm": 11.39758586883545, "learning_rate": 8.257359924026592e-06, "loss": 0.311, "step": 6010 }, { "epoch": 25.007264957264958, "grad_norm": 0.23802872002124786, "learning_rate": 8.252611585944919e-06, "loss": 0.0891, "step": 6020 }, { "epoch": 25.00769230769231, "grad_norm": 14.19528865814209, "learning_rate": 8.247863247863248e-06, "loss": 1.7728, "step": 6030 }, { "epoch": 25.008119658119657, "grad_norm": 0.0041798874735832214, "learning_rate": 8.243114909781577e-06, "loss": 0.0076, "step": 6040 }, { "epoch": 25.00854700854701, "grad_norm": 83.91490936279297, "learning_rate": 8.238366571699906e-06, "loss": 1.0939, "step": 6050 }, { "epoch": 25.00897435897436, "grad_norm": 109.98046875, "learning_rate": 8.233618233618234e-06, "loss": 0.7906, "step": 6060 }, { "epoch": 25.00940170940171, "grad_norm": 0.0034200188238173723, "learning_rate": 8.228869895536563e-06, "loss": 0.3631, "step": 6070 }, { "epoch": 25.00982905982906, "grad_norm": 0.009681742638349533, "learning_rate": 8.224121557454892e-06, "loss": 0.3649, "step": 6080 }, { "epoch": 25.01, "eval_accuracy": 0.44, "eval_loss": 3.9465994834899902, "eval_runtime": 33.084, "eval_samples_per_second": 0.756, "eval_steps_per_second": 0.756, "step": 6084 }, { "epoch": 26.00025641025641, "grad_norm": 0.19076739251613617, "learning_rate": 8.21937321937322e-06, "loss": 0.9254, "step": 6090 }, { "epoch": 26.00068376068376, "grad_norm": 0.0032322730403393507, "learning_rate": 8.214624881291549e-06, "loss": 0.7628, "step": 6100 }, { "epoch": 26.00111111111111, "grad_norm": 78.26629638671875, "learning_rate": 8.209876543209876e-06, "loss": 1.0775, "step": 6110 }, { "epoch": 26.001538461538463, "grad_norm": 41.407955169677734, "learning_rate": 8.205128205128205e-06, "loss": 0.6687, "step": 6120 }, { "epoch": 26.00196581196581, "grad_norm": 0.018887581303715706, "learning_rate": 8.200379867046535e-06, "loss": 0.6125, "step": 6130 }, { "epoch": 26.00239316239316, "grad_norm": 0.004367130342870951, "learning_rate": 8.195631528964862e-06, "loss": 0.4048, "step": 6140 }, { "epoch": 26.002820512820513, "grad_norm": 0.005417748354375362, "learning_rate": 8.190883190883193e-06, "loss": 0.1982, "step": 6150 }, { "epoch": 26.003247863247864, "grad_norm": 0.028607884421944618, "learning_rate": 8.18613485280152e-06, "loss": 1.1038, "step": 6160 }, { "epoch": 26.003675213675212, "grad_norm": 167.5290069580078, "learning_rate": 8.18138651471985e-06, "loss": 0.2927, "step": 6170 }, { "epoch": 26.004102564102563, "grad_norm": 0.5390312671661377, "learning_rate": 8.176638176638177e-06, "loss": 1.0214, "step": 6180 }, { "epoch": 26.004529914529915, "grad_norm": 0.007749592885375023, "learning_rate": 8.171889838556506e-06, "loss": 0.2385, "step": 6190 }, { "epoch": 26.004957264957266, "grad_norm": 0.005856471136212349, "learning_rate": 8.167141500474835e-06, "loss": 0.2423, "step": 6200 }, { "epoch": 26.005384615384614, "grad_norm": 136.56642150878906, "learning_rate": 8.162393162393163e-06, "loss": 0.7251, "step": 6210 }, { "epoch": 26.005811965811965, "grad_norm": 482.7048034667969, "learning_rate": 8.157644824311492e-06, "loss": 0.7191, "step": 6220 }, { "epoch": 26.006239316239316, "grad_norm": 0.00340673909522593, "learning_rate": 8.15289648622982e-06, "loss": 0.0008, "step": 6230 }, { "epoch": 26.006666666666668, "grad_norm": 0.0029049227014184, "learning_rate": 8.148148148148148e-06, "loss": 0.7355, "step": 6240 }, { "epoch": 26.007094017094015, "grad_norm": 0.15764747560024261, "learning_rate": 8.143399810066477e-06, "loss": 0.8012, "step": 6250 }, { "epoch": 26.007521367521367, "grad_norm": 0.00497576454654336, "learning_rate": 8.138651471984807e-06, "loss": 0.7658, "step": 6260 }, { "epoch": 26.007948717948718, "grad_norm": 0.009462869726121426, "learning_rate": 8.133903133903136e-06, "loss": 0.001, "step": 6270 }, { "epoch": 26.00837606837607, "grad_norm": 0.006881650071591139, "learning_rate": 8.129154795821463e-06, "loss": 0.0038, "step": 6280 }, { "epoch": 26.008803418803417, "grad_norm": 0.036327850073575974, "learning_rate": 8.124406457739792e-06, "loss": 3.0791, "step": 6290 }, { "epoch": 26.00923076923077, "grad_norm": 78.0274658203125, "learning_rate": 8.11965811965812e-06, "loss": 0.3814, "step": 6300 }, { "epoch": 26.00965811965812, "grad_norm": 0.21120551228523254, "learning_rate": 8.114909781576449e-06, "loss": 1.2601, "step": 6310 }, { "epoch": 26.01, "eval_accuracy": 0.44, "eval_loss": 2.9586069583892822, "eval_runtime": 33.5582, "eval_samples_per_second": 0.745, "eval_steps_per_second": 0.745, "step": 6318 }, { "epoch": 27.00008547008547, "grad_norm": 67.51969146728516, "learning_rate": 8.110161443494778e-06, "loss": 0.9303, "step": 6320 }, { "epoch": 27.00051282051282, "grad_norm": 104.40100860595703, "learning_rate": 8.105413105413106e-06, "loss": 0.4351, "step": 6330 }, { "epoch": 27.000940170940172, "grad_norm": 0.016888683661818504, "learning_rate": 8.100664767331435e-06, "loss": 0.3423, "step": 6340 }, { "epoch": 27.00136752136752, "grad_norm": 0.011165400967001915, "learning_rate": 8.095916429249762e-06, "loss": 1.4128, "step": 6350 }, { "epoch": 27.00179487179487, "grad_norm": 0.027417538687586784, "learning_rate": 8.091168091168093e-06, "loss": 0.002, "step": 6360 }, { "epoch": 27.002222222222223, "grad_norm": 0.09121936559677124, "learning_rate": 8.08641975308642e-06, "loss": 2.5068, "step": 6370 }, { "epoch": 27.002649572649574, "grad_norm": 0.007114642299711704, "learning_rate": 8.08167141500475e-06, "loss": 0.1004, "step": 6380 }, { "epoch": 27.00307692307692, "grad_norm": 0.03234013542532921, "learning_rate": 8.076923076923077e-06, "loss": 0.2587, "step": 6390 }, { "epoch": 27.003504273504273, "grad_norm": 0.06352563947439194, "learning_rate": 8.072174738841406e-06, "loss": 0.2638, "step": 6400 }, { "epoch": 27.003931623931624, "grad_norm": 0.03514587879180908, "learning_rate": 8.067426400759735e-06, "loss": 0.0102, "step": 6410 }, { "epoch": 27.004358974358976, "grad_norm": 0.08466164022684097, "learning_rate": 8.062678062678063e-06, "loss": 0.3654, "step": 6420 }, { "epoch": 27.004786324786323, "grad_norm": 0.01347273774445057, "learning_rate": 8.057929724596392e-06, "loss": 0.0482, "step": 6430 }, { "epoch": 27.005213675213675, "grad_norm": 25.30817985534668, "learning_rate": 8.05318138651472e-06, "loss": 0.0527, "step": 6440 }, { "epoch": 27.005641025641026, "grad_norm": 0.003483937354758382, "learning_rate": 8.048433048433048e-06, "loss": 0.5832, "step": 6450 }, { "epoch": 27.006068376068377, "grad_norm": 0.07294996827840805, "learning_rate": 8.043684710351378e-06, "loss": 0.5054, "step": 6460 }, { "epoch": 27.006495726495725, "grad_norm": 0.02032245323061943, "learning_rate": 8.038936372269707e-06, "loss": 0.2399, "step": 6470 }, { "epoch": 27.006923076923076, "grad_norm": 5.702065944671631, "learning_rate": 8.034188034188036e-06, "loss": 1.0618, "step": 6480 }, { "epoch": 27.007350427350428, "grad_norm": 0.2200225442647934, "learning_rate": 8.029439696106363e-06, "loss": 0.4855, "step": 6490 }, { "epoch": 27.00777777777778, "grad_norm": 0.02679477632045746, "learning_rate": 8.024691358024692e-06, "loss": 0.9622, "step": 6500 }, { "epoch": 27.008205128205127, "grad_norm": 0.00978939514607191, "learning_rate": 8.01994301994302e-06, "loss": 0.173, "step": 6510 }, { "epoch": 27.008632478632478, "grad_norm": 0.009447705931961536, "learning_rate": 8.015194681861349e-06, "loss": 0.0833, "step": 6520 }, { "epoch": 27.00905982905983, "grad_norm": 6.86177921295166, "learning_rate": 8.010446343779678e-06, "loss": 0.3262, "step": 6530 }, { "epoch": 27.00948717948718, "grad_norm": 0.2438230961561203, "learning_rate": 8.005698005698006e-06, "loss": 0.1945, "step": 6540 }, { "epoch": 27.00991452991453, "grad_norm": 0.0071107167750597, "learning_rate": 8.000949667616335e-06, "loss": 0.852, "step": 6550 }, { "epoch": 27.01, "eval_accuracy": 0.4, "eval_loss": 4.646361827850342, "eval_runtime": 33.5078, "eval_samples_per_second": 0.746, "eval_steps_per_second": 0.746, "step": 6552 }, { "epoch": 28.00034188034188, "grad_norm": 0.007593700196594, "learning_rate": 7.996201329534662e-06, "loss": 0.9613, "step": 6560 }, { "epoch": 28.00076923076923, "grad_norm": 198.82052612304688, "learning_rate": 7.991452991452993e-06, "loss": 0.6948, "step": 6570 }, { "epoch": 28.00119658119658, "grad_norm": 0.006188979372382164, "learning_rate": 7.98670465337132e-06, "loss": 1.0506, "step": 6580 }, { "epoch": 28.001623931623932, "grad_norm": 0.0029619692359119654, "learning_rate": 7.98195631528965e-06, "loss": 0.5341, "step": 6590 }, { "epoch": 28.002051282051283, "grad_norm": 0.017630403861403465, "learning_rate": 7.977207977207979e-06, "loss": 0.775, "step": 6600 }, { "epoch": 28.00247863247863, "grad_norm": 0.15129226446151733, "learning_rate": 7.972459639126306e-06, "loss": 0.3747, "step": 6610 }, { "epoch": 28.002905982905983, "grad_norm": 0.11231208592653275, "learning_rate": 7.967711301044635e-06, "loss": 0.0478, "step": 6620 }, { "epoch": 28.003333333333334, "grad_norm": 0.01004121731966734, "learning_rate": 7.962962962962963e-06, "loss": 0.5073, "step": 6630 }, { "epoch": 28.003760683760685, "grad_norm": 0.0024136013817042112, "learning_rate": 7.958214624881292e-06, "loss": 1.5819, "step": 6640 }, { "epoch": 28.004188034188033, "grad_norm": 329.79754638671875, "learning_rate": 7.953466286799621e-06, "loss": 0.5354, "step": 6650 }, { "epoch": 28.004615384615384, "grad_norm": 9.606019973754883, "learning_rate": 7.948717948717949e-06, "loss": 0.0239, "step": 6660 }, { "epoch": 28.005042735042736, "grad_norm": 161.5113067626953, "learning_rate": 7.943969610636278e-06, "loss": 1.0728, "step": 6670 }, { "epoch": 28.005470085470087, "grad_norm": 0.014358972199261189, "learning_rate": 7.939221272554607e-06, "loss": 0.0171, "step": 6680 }, { "epoch": 28.005897435897435, "grad_norm": 96.80726623535156, "learning_rate": 7.934472934472936e-06, "loss": 0.5362, "step": 6690 }, { "epoch": 28.006324786324786, "grad_norm": 0.0033912707585841417, "learning_rate": 7.929724596391264e-06, "loss": 0.6892, "step": 6700 }, { "epoch": 28.006752136752137, "grad_norm": 11.932380676269531, "learning_rate": 7.924976258309593e-06, "loss": 0.4339, "step": 6710 }, { "epoch": 28.00717948717949, "grad_norm": 0.026500051841139793, "learning_rate": 7.92022792022792e-06, "loss": 1.4133, "step": 6720 }, { "epoch": 28.007606837606836, "grad_norm": 0.003824119921773672, "learning_rate": 7.91547958214625e-06, "loss": 0.8, "step": 6730 }, { "epoch": 28.008034188034188, "grad_norm": 0.02020156756043434, "learning_rate": 7.910731244064578e-06, "loss": 0.9052, "step": 6740 }, { "epoch": 28.00846153846154, "grad_norm": 80.97819519042969, "learning_rate": 7.905982905982906e-06, "loss": 0.2557, "step": 6750 }, { "epoch": 28.00888888888889, "grad_norm": 79.75435638427734, "learning_rate": 7.901234567901235e-06, "loss": 0.7378, "step": 6760 }, { "epoch": 28.009316239316238, "grad_norm": 0.6139788031578064, "learning_rate": 7.896486229819562e-06, "loss": 0.595, "step": 6770 }, { "epoch": 28.00974358974359, "grad_norm": 181.7918243408203, "learning_rate": 7.891737891737893e-06, "loss": 0.6269, "step": 6780 }, { "epoch": 28.01, "eval_accuracy": 0.44, "eval_loss": 3.1291773319244385, "eval_runtime": 33.0523, "eval_samples_per_second": 0.756, "eval_steps_per_second": 0.756, "step": 6786 }, { "epoch": 29.00017094017094, "grad_norm": 0.010640930384397507, "learning_rate": 7.88698955365622e-06, "loss": 0.8873, "step": 6790 }, { "epoch": 29.00059829059829, "grad_norm": 0.007870799861848354, "learning_rate": 7.88224121557455e-06, "loss": 0.013, "step": 6800 }, { "epoch": 29.00102564102564, "grad_norm": 428.0439453125, "learning_rate": 7.877492877492879e-06, "loss": 0.3937, "step": 6810 }, { "epoch": 29.001452991452993, "grad_norm": 0.003777164965867996, "learning_rate": 7.872744539411206e-06, "loss": 0.2548, "step": 6820 }, { "epoch": 29.00188034188034, "grad_norm": 754.8897094726562, "learning_rate": 7.867996201329536e-06, "loss": 0.7434, "step": 6830 }, { "epoch": 29.002307692307692, "grad_norm": 0.002904111985117197, "learning_rate": 7.863247863247863e-06, "loss": 0.6923, "step": 6840 }, { "epoch": 29.002735042735043, "grad_norm": 405.4997253417969, "learning_rate": 7.858499525166192e-06, "loss": 1.4677, "step": 6850 }, { "epoch": 29.003162393162395, "grad_norm": 0.03453909978270531, "learning_rate": 7.853751187084521e-06, "loss": 0.2134, "step": 6860 }, { "epoch": 29.003589743589743, "grad_norm": 0.002487962134182453, "learning_rate": 7.849002849002849e-06, "loss": 0.0651, "step": 6870 }, { "epoch": 29.004017094017094, "grad_norm": 0.0023798488546162844, "learning_rate": 7.844254510921178e-06, "loss": 0.8198, "step": 6880 }, { "epoch": 29.004444444444445, "grad_norm": 0.0023906242568045855, "learning_rate": 7.839506172839507e-06, "loss": 1.3586, "step": 6890 }, { "epoch": 29.004871794871796, "grad_norm": 0.0033716100733727217, "learning_rate": 7.834757834757836e-06, "loss": 0.0181, "step": 6900 }, { "epoch": 29.005299145299144, "grad_norm": 260.02825927734375, "learning_rate": 7.830009496676164e-06, "loss": 0.287, "step": 6910 }, { "epoch": 29.005726495726496, "grad_norm": 0.00520634651184082, "learning_rate": 7.825261158594493e-06, "loss": 0.0378, "step": 6920 }, { "epoch": 29.006153846153847, "grad_norm": 0.005019763018935919, "learning_rate": 7.820512820512822e-06, "loss": 1.2482, "step": 6930 }, { "epoch": 29.006581196581198, "grad_norm": 136.67764282226562, "learning_rate": 7.81576448243115e-06, "loss": 1.2099, "step": 6940 }, { "epoch": 29.007008547008546, "grad_norm": 3.6764883995056152, "learning_rate": 7.811016144349479e-06, "loss": 1.5087, "step": 6950 }, { "epoch": 29.007435897435897, "grad_norm": 0.004404416773468256, "learning_rate": 7.806267806267806e-06, "loss": 0.5894, "step": 6960 }, { "epoch": 29.00786324786325, "grad_norm": 0.008280979469418526, "learning_rate": 7.801519468186135e-06, "loss": 0.4367, "step": 6970 }, { "epoch": 29.0082905982906, "grad_norm": 0.006669474299997091, "learning_rate": 7.796771130104464e-06, "loss": 0.5476, "step": 6980 }, { "epoch": 29.008717948717948, "grad_norm": 0.012090092524886131, "learning_rate": 7.792022792022793e-06, "loss": 1.0435, "step": 6990 }, { "epoch": 29.0091452991453, "grad_norm": 0.0026431807782500982, "learning_rate": 7.787274453941121e-06, "loss": 0.0031, "step": 7000 }, { "epoch": 29.00957264957265, "grad_norm": 0.012273382395505905, "learning_rate": 7.78252611585945e-06, "loss": 0.7421, "step": 7010 }, { "epoch": 29.01, "grad_norm": 0.3034244179725647, "learning_rate": 7.77777777777778e-06, "loss": 1.0013, "step": 7020 }, { "epoch": 29.01, "eval_accuracy": 0.4, "eval_loss": 4.631889820098877, "eval_runtime": 33.5346, "eval_samples_per_second": 0.745, "eval_steps_per_second": 0.745, "step": 7020 }, { "epoch": 30.00042735042735, "grad_norm": 1.9434560537338257, "learning_rate": 7.773029439696107e-06, "loss": 0.8706, "step": 7030 }, { "epoch": 30.000854700854703, "grad_norm": 0.003200115170329809, "learning_rate": 7.768281101614436e-06, "loss": 0.5544, "step": 7040 }, { "epoch": 30.00128205128205, "grad_norm": 0.005437285173684359, "learning_rate": 7.763532763532763e-06, "loss": 1.3336, "step": 7050 }, { "epoch": 30.0017094017094, "grad_norm": 0.4712231457233429, "learning_rate": 7.758784425451092e-06, "loss": 1.0848, "step": 7060 }, { "epoch": 30.002136752136753, "grad_norm": 0.0037900730967521667, "learning_rate": 7.754036087369421e-06, "loss": 0.3756, "step": 7070 }, { "epoch": 30.002564102564104, "grad_norm": 114.87593841552734, "learning_rate": 7.749287749287749e-06, "loss": 0.6611, "step": 7080 }, { "epoch": 30.002991452991452, "grad_norm": 0.08566898107528687, "learning_rate": 7.74453941120608e-06, "loss": 0.2429, "step": 7090 }, { "epoch": 30.003418803418803, "grad_norm": 0.0038280077278614044, "learning_rate": 7.739791073124407e-06, "loss": 0.001, "step": 7100 }, { "epoch": 30.003846153846155, "grad_norm": 0.008761651813983917, "learning_rate": 7.735042735042736e-06, "loss": 0.0407, "step": 7110 }, { "epoch": 30.004273504273506, "grad_norm": 0.0022454620338976383, "learning_rate": 7.730294396961064e-06, "loss": 0.8858, "step": 7120 }, { "epoch": 30.004700854700854, "grad_norm": 0.007320001721382141, "learning_rate": 7.725546058879393e-06, "loss": 0.5604, "step": 7130 }, { "epoch": 30.005128205128205, "grad_norm": 99.08616638183594, "learning_rate": 7.720797720797722e-06, "loss": 0.727, "step": 7140 }, { "epoch": 30.005555555555556, "grad_norm": 0.013165280222892761, "learning_rate": 7.71604938271605e-06, "loss": 0.0583, "step": 7150 }, { "epoch": 30.005982905982908, "grad_norm": 0.0017615576507523656, "learning_rate": 7.711301044634379e-06, "loss": 0.435, "step": 7160 }, { "epoch": 30.006410256410255, "grad_norm": 0.008409475907683372, "learning_rate": 7.706552706552706e-06, "loss": 0.7466, "step": 7170 }, { "epoch": 30.006837606837607, "grad_norm": 0.002216171706095338, "learning_rate": 7.701804368471035e-06, "loss": 0.6197, "step": 7180 }, { "epoch": 30.007264957264958, "grad_norm": 0.015576567500829697, "learning_rate": 7.697056030389364e-06, "loss": 0.0003, "step": 7190 }, { "epoch": 30.00769230769231, "grad_norm": 0.002197784371674061, "learning_rate": 7.692307692307694e-06, "loss": 0.0428, "step": 7200 }, { "epoch": 30.008119658119657, "grad_norm": 0.019170358777046204, "learning_rate": 7.687559354226021e-06, "loss": 2.7654, "step": 7210 }, { "epoch": 30.00854700854701, "grad_norm": 0.0038994126953184605, "learning_rate": 7.68281101614435e-06, "loss": 1.1708, "step": 7220 }, { "epoch": 30.00897435897436, "grad_norm": 0.17869791388511658, "learning_rate": 7.67806267806268e-06, "loss": 0.0286, "step": 7230 }, { "epoch": 30.00940170940171, "grad_norm": 0.006502739619463682, "learning_rate": 7.673314339981007e-06, "loss": 0.4125, "step": 7240 }, { "epoch": 30.00982905982906, "grad_norm": 0.01781412959098816, "learning_rate": 7.668566001899336e-06, "loss": 0.02, "step": 7250 }, { "epoch": 30.01, "eval_accuracy": 0.4, "eval_loss": 4.251375198364258, "eval_runtime": 33.6135, "eval_samples_per_second": 0.744, "eval_steps_per_second": 0.744, "step": 7254 }, { "epoch": 31.00025641025641, "grad_norm": 0.3418414294719696, "learning_rate": 7.663817663817665e-06, "loss": 0.4944, "step": 7260 }, { "epoch": 31.00068376068376, "grad_norm": 0.08820147812366486, "learning_rate": 7.659069325735993e-06, "loss": 0.6409, "step": 7270 }, { "epoch": 31.00111111111111, "grad_norm": 0.004805733449757099, "learning_rate": 7.654320987654322e-06, "loss": 0.703, "step": 7280 }, { "epoch": 31.001538461538463, "grad_norm": 171.22964477539062, "learning_rate": 7.649572649572649e-06, "loss": 0.3897, "step": 7290 }, { "epoch": 31.00196581196581, "grad_norm": 0.0735219419002533, "learning_rate": 7.64482431149098e-06, "loss": 1.2327, "step": 7300 }, { "epoch": 31.00239316239316, "grad_norm": 0.19371773302555084, "learning_rate": 7.640075973409307e-06, "loss": 0.0612, "step": 7310 }, { "epoch": 31.002820512820513, "grad_norm": 0.011904860846698284, "learning_rate": 7.635327635327637e-06, "loss": 0.0011, "step": 7320 }, { "epoch": 31.003247863247864, "grad_norm": 53.154327392578125, "learning_rate": 7.630579297245964e-06, "loss": 0.032, "step": 7330 }, { "epoch": 31.003675213675212, "grad_norm": 0.030124612152576447, "learning_rate": 7.625830959164293e-06, "loss": 0.171, "step": 7340 }, { "epoch": 31.004102564102563, "grad_norm": 0.005205671768635511, "learning_rate": 7.6210826210826214e-06, "loss": 0.5295, "step": 7350 }, { "epoch": 31.004529914529915, "grad_norm": 3.4219090938568115, "learning_rate": 7.61633428300095e-06, "loss": 0.5748, "step": 7360 }, { "epoch": 31.004957264957266, "grad_norm": 0.027771934866905212, "learning_rate": 7.611585944919279e-06, "loss": 0.0483, "step": 7370 }, { "epoch": 31.005384615384614, "grad_norm": 0.0065915524028241634, "learning_rate": 7.606837606837607e-06, "loss": 0.8829, "step": 7380 }, { "epoch": 31.005811965811965, "grad_norm": 0.01646728627383709, "learning_rate": 7.6020892687559355e-06, "loss": 0.0029, "step": 7390 }, { "epoch": 31.006239316239316, "grad_norm": 0.007259257137775421, "learning_rate": 7.5973409306742655e-06, "loss": 0.8513, "step": 7400 }, { "epoch": 31.006666666666668, "grad_norm": 0.004463976714760065, "learning_rate": 7.592592592592594e-06, "loss": 0.4636, "step": 7410 }, { "epoch": 31.007094017094015, "grad_norm": 0.0043134731240570545, "learning_rate": 7.587844254510922e-06, "loss": 1.0482, "step": 7420 }, { "epoch": 31.007521367521367, "grad_norm": 0.006201723124831915, "learning_rate": 7.58309591642925e-06, "loss": 0.2689, "step": 7430 }, { "epoch": 31.007948717948718, "grad_norm": 17.315887451171875, "learning_rate": 7.578347578347579e-06, "loss": 1.319, "step": 7440 }, { "epoch": 31.00837606837607, "grad_norm": 0.005171514581888914, "learning_rate": 7.573599240265908e-06, "loss": 0.002, "step": 7450 }, { "epoch": 31.008803418803417, "grad_norm": 108.37242889404297, "learning_rate": 7.568850902184236e-06, "loss": 0.9821, "step": 7460 }, { "epoch": 31.00923076923077, "grad_norm": 3.896742820739746, "learning_rate": 7.564102564102564e-06, "loss": 1.0268, "step": 7470 }, { "epoch": 31.00965811965812, "grad_norm": 0.0028510969132184982, "learning_rate": 7.559354226020893e-06, "loss": 0.1333, "step": 7480 }, { "epoch": 31.01, "eval_accuracy": 0.4, "eval_loss": 4.330972671508789, "eval_runtime": 33.6382, "eval_samples_per_second": 0.743, "eval_steps_per_second": 0.743, "step": 7488 }, { "epoch": 32.00008547008547, "grad_norm": 0.0023284293711185455, "learning_rate": 7.554605887939222e-06, "loss": 0.2603, "step": 7490 }, { "epoch": 32.00051282051282, "grad_norm": 0.01587550714612007, "learning_rate": 7.54985754985755e-06, "loss": 0.1526, "step": 7500 }, { "epoch": 32.00094017094017, "grad_norm": 345.5118408203125, "learning_rate": 7.545109211775879e-06, "loss": 0.1876, "step": 7510 }, { "epoch": 32.00136752136752, "grad_norm": 0.13611197471618652, "learning_rate": 7.540360873694208e-06, "loss": 0.0039, "step": 7520 }, { "epoch": 32.00179487179487, "grad_norm": 378.3825988769531, "learning_rate": 7.535612535612537e-06, "loss": 0.5279, "step": 7530 }, { "epoch": 32.00222222222222, "grad_norm": 0.00310819735750556, "learning_rate": 7.530864197530865e-06, "loss": 0.0143, "step": 7540 }, { "epoch": 32.002649572649574, "grad_norm": 0.009138006716966629, "learning_rate": 7.526115859449193e-06, "loss": 0.0012, "step": 7550 }, { "epoch": 32.003076923076925, "grad_norm": 96.70939636230469, "learning_rate": 7.521367521367522e-06, "loss": 0.8369, "step": 7560 }, { "epoch": 32.00350427350428, "grad_norm": 136.33778381347656, "learning_rate": 7.516619183285851e-06, "loss": 0.5351, "step": 7570 }, { "epoch": 32.00393162393162, "grad_norm": 0.0020322606433182955, "learning_rate": 7.511870845204179e-06, "loss": 1.2159, "step": 7580 }, { "epoch": 32.00435897435897, "grad_norm": 0.03725990280508995, "learning_rate": 7.507122507122507e-06, "loss": 0.6448, "step": 7590 }, { "epoch": 32.00478632478632, "grad_norm": 0.0038324042689055204, "learning_rate": 7.502374169040836e-06, "loss": 0.5215, "step": 7600 }, { "epoch": 32.005213675213675, "grad_norm": 0.009491757489740849, "learning_rate": 7.497625830959166e-06, "loss": 0.8682, "step": 7610 }, { "epoch": 32.005641025641026, "grad_norm": 0.005307774059474468, "learning_rate": 7.492877492877494e-06, "loss": 0.0158, "step": 7620 }, { "epoch": 32.00606837606838, "grad_norm": 0.004275487270206213, "learning_rate": 7.488129154795822e-06, "loss": 0.5831, "step": 7630 }, { "epoch": 32.00649572649573, "grad_norm": 0.011664590798318386, "learning_rate": 7.4833808167141505e-06, "loss": 0.0162, "step": 7640 }, { "epoch": 32.00692307692308, "grad_norm": 554.8225708007812, "learning_rate": 7.47863247863248e-06, "loss": 0.2277, "step": 7650 }, { "epoch": 32.007350427350424, "grad_norm": 0.0021805327851325274, "learning_rate": 7.473884140550808e-06, "loss": 2.079, "step": 7660 }, { "epoch": 32.007777777777775, "grad_norm": 2.559633255004883, "learning_rate": 7.469135802469136e-06, "loss": 0.0108, "step": 7670 }, { "epoch": 32.00820512820513, "grad_norm": 15.645346641540527, "learning_rate": 7.4643874643874645e-06, "loss": 0.9423, "step": 7680 }, { "epoch": 32.00863247863248, "grad_norm": 0.0020319002214819193, "learning_rate": 7.459639126305793e-06, "loss": 0.7076, "step": 7690 }, { "epoch": 32.00905982905983, "grad_norm": 0.6667153239250183, "learning_rate": 7.454890788224122e-06, "loss": 0.5031, "step": 7700 }, { "epoch": 32.00948717948718, "grad_norm": 0.0021193181164562702, "learning_rate": 7.450142450142451e-06, "loss": 0.3348, "step": 7710 }, { "epoch": 32.00991452991453, "grad_norm": 0.005187495611608028, "learning_rate": 7.445394112060779e-06, "loss": 0.0005, "step": 7720 }, { "epoch": 32.01, "eval_accuracy": 0.4, "eval_loss": 4.535378932952881, "eval_runtime": 34.0349, "eval_samples_per_second": 0.735, "eval_steps_per_second": 0.735, "step": 7722 }, { "epoch": 33.00034188034188, "grad_norm": 0.0027542465832084417, "learning_rate": 7.4406457739791086e-06, "loss": 0.267, "step": 7730 }, { "epoch": 33.00076923076923, "grad_norm": 0.00943476427346468, "learning_rate": 7.435897435897437e-06, "loss": 0.4691, "step": 7740 }, { "epoch": 33.00119658119658, "grad_norm": 96.7542724609375, "learning_rate": 7.431149097815765e-06, "loss": 0.1307, "step": 7750 }, { "epoch": 33.00162393162393, "grad_norm": 0.005492000840604305, "learning_rate": 7.4264007597340934e-06, "loss": 0.0305, "step": 7760 }, { "epoch": 33.00205128205128, "grad_norm": 12.753386497497559, "learning_rate": 7.421652421652423e-06, "loss": 0.5948, "step": 7770 }, { "epoch": 33.002478632478635, "grad_norm": 0.0043577225878834724, "learning_rate": 7.416904083570751e-06, "loss": 0.6122, "step": 7780 }, { "epoch": 33.002905982905986, "grad_norm": 0.00950498878955841, "learning_rate": 7.412155745489079e-06, "loss": 0.1552, "step": 7790 }, { "epoch": 33.00333333333333, "grad_norm": 146.59999084472656, "learning_rate": 7.4074074074074075e-06, "loss": 0.7224, "step": 7800 }, { "epoch": 33.00376068376068, "grad_norm": 0.0066252099350094795, "learning_rate": 7.402659069325736e-06, "loss": 0.0714, "step": 7810 }, { "epoch": 33.00418803418803, "grad_norm": 0.003962312359362841, "learning_rate": 7.397910731244066e-06, "loss": 0.1235, "step": 7820 }, { "epoch": 33.004615384615384, "grad_norm": 0.004090788774192333, "learning_rate": 7.393162393162394e-06, "loss": 0.8119, "step": 7830 }, { "epoch": 33.005042735042736, "grad_norm": 0.005719439126551151, "learning_rate": 7.388414055080722e-06, "loss": 1.4667, "step": 7840 }, { "epoch": 33.00547008547009, "grad_norm": 0.004016694147139788, "learning_rate": 7.3836657169990515e-06, "loss": 1.7774, "step": 7850 }, { "epoch": 33.00589743589744, "grad_norm": 0.02771218866109848, "learning_rate": 7.37891737891738e-06, "loss": 0.1237, "step": 7860 }, { "epoch": 33.00632478632479, "grad_norm": 10.215202331542969, "learning_rate": 7.374169040835708e-06, "loss": 0.2107, "step": 7870 }, { "epoch": 33.006752136752134, "grad_norm": 0.036797523498535156, "learning_rate": 7.369420702754036e-06, "loss": 0.3696, "step": 7880 }, { "epoch": 33.007179487179485, "grad_norm": 0.0023336284793913364, "learning_rate": 7.364672364672365e-06, "loss": 0.0944, "step": 7890 }, { "epoch": 33.007606837606836, "grad_norm": 0.002150049665942788, "learning_rate": 7.359924026590694e-06, "loss": 0.1512, "step": 7900 }, { "epoch": 33.00803418803419, "grad_norm": 0.027237646281719208, "learning_rate": 7.355175688509022e-06, "loss": 0.1805, "step": 7910 }, { "epoch": 33.00846153846154, "grad_norm": 0.004415407776832581, "learning_rate": 7.350427350427351e-06, "loss": 0.6579, "step": 7920 }, { "epoch": 33.00888888888889, "grad_norm": 0.009564803913235664, "learning_rate": 7.34567901234568e-06, "loss": 0.6692, "step": 7930 }, { "epoch": 33.00931623931624, "grad_norm": 0.012014131061732769, "learning_rate": 7.340930674264009e-06, "loss": 0.0433, "step": 7940 }, { "epoch": 33.00974358974359, "grad_norm": 1.3320039510726929, "learning_rate": 7.336182336182337e-06, "loss": 0.004, "step": 7950 }, { "epoch": 33.01, "eval_accuracy": 0.4, "eval_loss": 4.597040176391602, "eval_runtime": 33.0791, "eval_samples_per_second": 0.756, "eval_steps_per_second": 0.756, "step": 7956 }, { "epoch": 34.00017094017094, "grad_norm": 0.011348673142492771, "learning_rate": 7.331433998100665e-06, "loss": 0.0008, "step": 7960 }, { "epoch": 34.00059829059829, "grad_norm": 0.008984715677797794, "learning_rate": 7.326685660018994e-06, "loss": 0.026, "step": 7970 }, { "epoch": 34.00102564102564, "grad_norm": 0.002917045494541526, "learning_rate": 7.321937321937323e-06, "loss": 1.1443, "step": 7980 }, { "epoch": 34.00145299145299, "grad_norm": 215.349365234375, "learning_rate": 7.317188983855651e-06, "loss": 0.4194, "step": 7990 }, { "epoch": 34.001880341880344, "grad_norm": 0.02057277224957943, "learning_rate": 7.312440645773979e-06, "loss": 0.3795, "step": 8000 }, { "epoch": 34.002307692307696, "grad_norm": 0.009319877251982689, "learning_rate": 7.307692307692308e-06, "loss": 1.1649, "step": 8010 }, { "epoch": 34.00273504273504, "grad_norm": 0.003309460124000907, "learning_rate": 7.302943969610636e-06, "loss": 0.665, "step": 8020 }, { "epoch": 34.00316239316239, "grad_norm": 0.0023234267719089985, "learning_rate": 7.298195631528966e-06, "loss": 0.1577, "step": 8030 }, { "epoch": 34.00358974358974, "grad_norm": 19.653993606567383, "learning_rate": 7.293447293447294e-06, "loss": 1.0529, "step": 8040 }, { "epoch": 34.004017094017094, "grad_norm": 0.0021447527687996626, "learning_rate": 7.2886989553656225e-06, "loss": 0.6675, "step": 8050 }, { "epoch": 34.004444444444445, "grad_norm": 0.011748022399842739, "learning_rate": 7.283950617283952e-06, "loss": 0.0002, "step": 8060 }, { "epoch": 34.0048717948718, "grad_norm": 0.0019833322148770094, "learning_rate": 7.27920227920228e-06, "loss": 0.5008, "step": 8070 }, { "epoch": 34.00529914529915, "grad_norm": 0.016223294660449028, "learning_rate": 7.274453941120608e-06, "loss": 0.0013, "step": 8080 }, { "epoch": 34.0057264957265, "grad_norm": 0.001770445262081921, "learning_rate": 7.2697056030389366e-06, "loss": 0.761, "step": 8090 }, { "epoch": 34.00615384615384, "grad_norm": 0.005002091638743877, "learning_rate": 7.264957264957266e-06, "loss": 0.0004, "step": 8100 }, { "epoch": 34.006581196581195, "grad_norm": 0.0015637052711099386, "learning_rate": 7.260208926875594e-06, "loss": 1.2961, "step": 8110 }, { "epoch": 34.007008547008546, "grad_norm": 0.002154473215341568, "learning_rate": 7.255460588793922e-06, "loss": 0.0005, "step": 8120 }, { "epoch": 34.0074358974359, "grad_norm": 0.004174029920250177, "learning_rate": 7.2507122507122514e-06, "loss": 0.2425, "step": 8130 }, { "epoch": 34.00786324786325, "grad_norm": 0.0016593949403613806, "learning_rate": 7.245963912630581e-06, "loss": 0.7072, "step": 8140 }, { "epoch": 34.0082905982906, "grad_norm": 0.003939433954656124, "learning_rate": 7.241215574548909e-06, "loss": 0.2379, "step": 8150 }, { "epoch": 34.00871794871795, "grad_norm": 0.0038190174382179976, "learning_rate": 7.236467236467237e-06, "loss": 0.0005, "step": 8160 }, { "epoch": 34.0091452991453, "grad_norm": 0.009816322475671768, "learning_rate": 7.2317188983855655e-06, "loss": 0.0006, "step": 8170 }, { "epoch": 34.00957264957265, "grad_norm": 0.0018094313563778996, "learning_rate": 7.226970560303895e-06, "loss": 0.0108, "step": 8180 }, { "epoch": 34.01, "grad_norm": 0.0064643691293895245, "learning_rate": 7.222222222222223e-06, "loss": 0.3017, "step": 8190 }, { "epoch": 34.01, "eval_accuracy": 0.44, "eval_loss": 4.587911605834961, "eval_runtime": 33.8673, "eval_samples_per_second": 0.738, "eval_steps_per_second": 0.738, "step": 8190 }, { "epoch": 35.00042735042735, "grad_norm": 0.00291146500967443, "learning_rate": 7.217473884140551e-06, "loss": 0.002, "step": 8200 }, { "epoch": 35.0008547008547, "grad_norm": 0.009560974314808846, "learning_rate": 7.2127255460588795e-06, "loss": 0.0003, "step": 8210 }, { "epoch": 35.001282051282054, "grad_norm": 66.06437683105469, "learning_rate": 7.207977207977208e-06, "loss": 1.0965, "step": 8220 }, { "epoch": 35.001709401709405, "grad_norm": 0.0027835587970912457, "learning_rate": 7.203228869895537e-06, "loss": 0.0116, "step": 8230 }, { "epoch": 35.00213675213675, "grad_norm": 0.022427700459957123, "learning_rate": 7.198480531813866e-06, "loss": 0.4445, "step": 8240 }, { "epoch": 35.0025641025641, "grad_norm": 6.788797378540039, "learning_rate": 7.193732193732194e-06, "loss": 0.7864, "step": 8250 }, { "epoch": 35.00299145299145, "grad_norm": 0.002020070794969797, "learning_rate": 7.1889838556505235e-06, "loss": 0.7858, "step": 8260 }, { "epoch": 35.0034188034188, "grad_norm": 0.08197605609893799, "learning_rate": 7.184235517568852e-06, "loss": 0.6455, "step": 8270 }, { "epoch": 35.003846153846155, "grad_norm": 0.001466662622988224, "learning_rate": 7.17948717948718e-06, "loss": 0.057, "step": 8280 }, { "epoch": 35.004273504273506, "grad_norm": 0.014914972707629204, "learning_rate": 7.174738841405508e-06, "loss": 0.065, "step": 8290 }, { "epoch": 35.00470085470086, "grad_norm": 0.04162408038973808, "learning_rate": 7.169990503323837e-06, "loss": 0.0734, "step": 8300 }, { "epoch": 35.00512820512821, "grad_norm": 0.33848291635513306, "learning_rate": 7.165242165242166e-06, "loss": 0.7488, "step": 8310 }, { "epoch": 35.00555555555555, "grad_norm": 0.00879302341490984, "learning_rate": 7.160493827160494e-06, "loss": 0.0036, "step": 8320 }, { "epoch": 35.005982905982904, "grad_norm": 0.0014278884045779705, "learning_rate": 7.1557454890788224e-06, "loss": 0.8021, "step": 8330 }, { "epoch": 35.006410256410255, "grad_norm": 0.03050161898136139, "learning_rate": 7.1509971509971524e-06, "loss": 0.0002, "step": 8340 }, { "epoch": 35.00683760683761, "grad_norm": 0.0032293125987052917, "learning_rate": 7.146248812915481e-06, "loss": 0.4796, "step": 8350 }, { "epoch": 35.00726495726496, "grad_norm": 0.0025629340671002865, "learning_rate": 7.141500474833809e-06, "loss": 0.7034, "step": 8360 }, { "epoch": 35.00769230769231, "grad_norm": 0.004474421963095665, "learning_rate": 7.136752136752137e-06, "loss": 0.0047, "step": 8370 }, { "epoch": 35.00811965811966, "grad_norm": 0.04403112828731537, "learning_rate": 7.132003798670466e-06, "loss": 0.9985, "step": 8380 }, { "epoch": 35.00854700854701, "grad_norm": 0.002210602629929781, "learning_rate": 7.127255460588795e-06, "loss": 0.0011, "step": 8390 }, { "epoch": 35.008974358974356, "grad_norm": 413.6798400878906, "learning_rate": 7.122507122507123e-06, "loss": 0.378, "step": 8400 }, { "epoch": 35.00940170940171, "grad_norm": 0.003414222039282322, "learning_rate": 7.117758784425451e-06, "loss": 0.722, "step": 8410 }, { "epoch": 35.00982905982906, "grad_norm": 0.0017525185830891132, "learning_rate": 7.11301044634378e-06, "loss": 0.2014, "step": 8420 }, { "epoch": 35.01, "eval_accuracy": 0.4, "eval_loss": 4.280939102172852, "eval_runtime": 33.6967, "eval_samples_per_second": 0.742, "eval_steps_per_second": 0.742, "step": 8424 }, { "epoch": 36.00025641025641, "grad_norm": 323.9665222167969, "learning_rate": 7.108262108262109e-06, "loss": 0.7504, "step": 8430 }, { "epoch": 36.00068376068376, "grad_norm": 0.0009274838957935572, "learning_rate": 7.103513770180438e-06, "loss": 0.0036, "step": 8440 }, { "epoch": 36.00111111111111, "grad_norm": 0.004535932093858719, "learning_rate": 7.098765432098766e-06, "loss": 0.0003, "step": 8450 }, { "epoch": 36.00153846153846, "grad_norm": 0.007801331579685211, "learning_rate": 7.0940170940170945e-06, "loss": 0.0066, "step": 8460 }, { "epoch": 36.00196581196581, "grad_norm": 1.9428983926773071, "learning_rate": 7.089268755935424e-06, "loss": 0.5418, "step": 8470 }, { "epoch": 36.00239316239316, "grad_norm": 0.09623009711503983, "learning_rate": 7.084520417853752e-06, "loss": 0.2488, "step": 8480 }, { "epoch": 36.00282051282051, "grad_norm": 0.020373547449707985, "learning_rate": 7.07977207977208e-06, "loss": 0.0012, "step": 8490 }, { "epoch": 36.003247863247864, "grad_norm": 0.002565371571108699, "learning_rate": 7.0750237416904086e-06, "loss": 0.0004, "step": 8500 }, { "epoch": 36.003675213675216, "grad_norm": 243.3353271484375, "learning_rate": 7.070275403608738e-06, "loss": 0.1137, "step": 8510 }, { "epoch": 36.00410256410257, "grad_norm": 0.0050058732740581036, "learning_rate": 7.065527065527066e-06, "loss": 0.7545, "step": 8520 }, { "epoch": 36.00452991452991, "grad_norm": 0.0049826339818537235, "learning_rate": 7.060778727445394e-06, "loss": 0.0002, "step": 8530 }, { "epoch": 36.00495726495726, "grad_norm": 0.0015328944427892566, "learning_rate": 7.056030389363723e-06, "loss": 1.5905, "step": 8540 }, { "epoch": 36.005384615384614, "grad_norm": 0.12297794222831726, "learning_rate": 7.051282051282053e-06, "loss": 0.0142, "step": 8550 }, { "epoch": 36.005811965811965, "grad_norm": 363.6435546875, "learning_rate": 7.046533713200381e-06, "loss": 0.4281, "step": 8560 }, { "epoch": 36.006239316239316, "grad_norm": 10.307737350463867, "learning_rate": 7.041785375118709e-06, "loss": 0.0332, "step": 8570 }, { "epoch": 36.00666666666667, "grad_norm": 0.001833245623856783, "learning_rate": 7.0370370370370375e-06, "loss": 0.1285, "step": 8580 }, { "epoch": 36.00709401709402, "grad_norm": 75.9461441040039, "learning_rate": 7.032288698955367e-06, "loss": 0.0096, "step": 8590 }, { "epoch": 36.00752136752137, "grad_norm": 0.007748633157461882, "learning_rate": 7.027540360873695e-06, "loss": 0.0074, "step": 8600 }, { "epoch": 36.007948717948715, "grad_norm": 0.0246294979006052, "learning_rate": 7.022792022792023e-06, "loss": 0.0058, "step": 8610 }, { "epoch": 36.008376068376066, "grad_norm": 0.07270914316177368, "learning_rate": 7.0180436847103515e-06, "loss": 0.0007, "step": 8620 }, { "epoch": 36.00880341880342, "grad_norm": 0.0704275518655777, "learning_rate": 7.01329534662868e-06, "loss": 0.0986, "step": 8630 }, { "epoch": 36.00923076923077, "grad_norm": 0.03107783943414688, "learning_rate": 7.008547008547009e-06, "loss": 0.0033, "step": 8640 }, { "epoch": 36.00965811965812, "grad_norm": 0.21083621680736542, "learning_rate": 7.003798670465338e-06, "loss": 0.1573, "step": 8650 }, { "epoch": 36.01, "eval_accuracy": 0.44, "eval_loss": 4.6822123527526855, "eval_runtime": 33.5951, "eval_samples_per_second": 0.744, "eval_steps_per_second": 0.744, "step": 8658 }, { "epoch": 37.00008547008547, "grad_norm": 318.6376953125, "learning_rate": 6.999050332383666e-06, "loss": 0.5961, "step": 8660 }, { "epoch": 37.00051282051282, "grad_norm": 1.5630220174789429, "learning_rate": 6.9943019943019955e-06, "loss": 0.005, "step": 8670 }, { "epoch": 37.00094017094017, "grad_norm": 0.00120359449647367, "learning_rate": 6.989553656220324e-06, "loss": 0.0008, "step": 8680 }, { "epoch": 37.00136752136752, "grad_norm": 0.0014155134558677673, "learning_rate": 6.984805318138652e-06, "loss": 0.4491, "step": 8690 }, { "epoch": 37.00179487179487, "grad_norm": 0.0017230873927474022, "learning_rate": 6.9800569800569804e-06, "loss": 0.0004, "step": 8700 }, { "epoch": 37.00222222222222, "grad_norm": 0.07810331135988235, "learning_rate": 6.975308641975309e-06, "loss": 0.0007, "step": 8710 }, { "epoch": 37.002649572649574, "grad_norm": 0.0017077282536774874, "learning_rate": 6.970560303893638e-06, "loss": 0.0988, "step": 8720 }, { "epoch": 37.003076923076925, "grad_norm": 0.004637227393686771, "learning_rate": 6.965811965811966e-06, "loss": 0.1457, "step": 8730 }, { "epoch": 37.00350427350428, "grad_norm": 0.017330659553408623, "learning_rate": 6.9610636277302945e-06, "loss": 0.0003, "step": 8740 }, { "epoch": 37.00393162393162, "grad_norm": 0.0017195155378431082, "learning_rate": 6.956315289648623e-06, "loss": 1.9324, "step": 8750 }, { "epoch": 37.00435897435897, "grad_norm": 0.0010282599832862616, "learning_rate": 6.951566951566953e-06, "loss": 0.6652, "step": 8760 }, { "epoch": 37.00478632478632, "grad_norm": 28.446561813354492, "learning_rate": 6.946818613485281e-06, "loss": 0.9279, "step": 8770 }, { "epoch": 37.005213675213675, "grad_norm": 0.022886212915182114, "learning_rate": 6.942070275403609e-06, "loss": 0.9287, "step": 8780 }, { "epoch": 37.005641025641026, "grad_norm": 0.006739874370396137, "learning_rate": 6.937321937321938e-06, "loss": 0.0193, "step": 8790 }, { "epoch": 37.00606837606838, "grad_norm": 0.0022217826917767525, "learning_rate": 6.932573599240267e-06, "loss": 0.3743, "step": 8800 }, { "epoch": 37.00649572649573, "grad_norm": 353.1299133300781, "learning_rate": 6.927825261158595e-06, "loss": 0.7619, "step": 8810 }, { "epoch": 37.00692307692308, "grad_norm": 0.010308923199772835, "learning_rate": 6.923076923076923e-06, "loss": 0.5226, "step": 8820 }, { "epoch": 37.007350427350424, "grad_norm": 0.386707603931427, "learning_rate": 6.918328584995252e-06, "loss": 0.6146, "step": 8830 }, { "epoch": 37.007777777777775, "grad_norm": 0.01583043485879898, "learning_rate": 6.913580246913581e-06, "loss": 1.2693, "step": 8840 }, { "epoch": 37.00820512820513, "grad_norm": 0.41986873745918274, "learning_rate": 6.908831908831909e-06, "loss": 0.4624, "step": 8850 }, { "epoch": 37.00863247863248, "grad_norm": 0.05952145531773567, "learning_rate": 6.904083570750238e-06, "loss": 0.1098, "step": 8860 }, { "epoch": 37.00905982905983, "grad_norm": 0.5341818332672119, "learning_rate": 6.8993352326685666e-06, "loss": 0.0011, "step": 8870 }, { "epoch": 37.00948717948718, "grad_norm": 0.7262808680534363, "learning_rate": 6.894586894586896e-06, "loss": 0.1184, "step": 8880 }, { "epoch": 37.00991452991453, "grad_norm": 0.006815705914050341, "learning_rate": 6.889838556505224e-06, "loss": 0.0041, "step": 8890 }, { "epoch": 37.01, "eval_accuracy": 0.4, "eval_loss": 5.167320728302002, "eval_runtime": 33.711, "eval_samples_per_second": 0.742, "eval_steps_per_second": 0.742, "step": 8892 }, { "epoch": 38.00034188034188, "grad_norm": 0.0014624128816649318, "learning_rate": 6.885090218423552e-06, "loss": 0.0327, "step": 8900 }, { "epoch": 38.00076923076923, "grad_norm": 0.4525696933269501, "learning_rate": 6.880341880341881e-06, "loss": 0.0079, "step": 8910 }, { "epoch": 38.00119658119658, "grad_norm": 0.0014630717923864722, "learning_rate": 6.87559354226021e-06, "loss": 0.0003, "step": 8920 }, { "epoch": 38.00162393162393, "grad_norm": 0.009966408833861351, "learning_rate": 6.870845204178538e-06, "loss": 0.0023, "step": 8930 }, { "epoch": 38.00205128205128, "grad_norm": 0.02271059900522232, "learning_rate": 6.866096866096866e-06, "loss": 0.0006, "step": 8940 }, { "epoch": 38.002478632478635, "grad_norm": 0.15712451934814453, "learning_rate": 6.861348528015195e-06, "loss": 0.0004, "step": 8950 }, { "epoch": 38.002905982905986, "grad_norm": 0.011041790246963501, "learning_rate": 6.856600189933523e-06, "loss": 1.3034, "step": 8960 }, { "epoch": 38.00333333333333, "grad_norm": 0.0015798502136021852, "learning_rate": 6.851851851851853e-06, "loss": 0.0009, "step": 8970 }, { "epoch": 38.00376068376068, "grad_norm": 0.008027788251638412, "learning_rate": 6.847103513770181e-06, "loss": 0.4744, "step": 8980 }, { "epoch": 38.00418803418803, "grad_norm": 0.045825887471437454, "learning_rate": 6.8423551756885095e-06, "loss": 0.003, "step": 8990 }, { "epoch": 38.004615384615384, "grad_norm": 0.0023124797735363245, "learning_rate": 6.837606837606839e-06, "loss": 0.0006, "step": 9000 }, { "epoch": 38.005042735042736, "grad_norm": 4.002900123596191, "learning_rate": 6.832858499525167e-06, "loss": 0.5629, "step": 9010 }, { "epoch": 38.00547008547009, "grad_norm": 0.1215251013636589, "learning_rate": 6.828110161443495e-06, "loss": 0.0034, "step": 9020 }, { "epoch": 38.00589743589744, "grad_norm": 0.007816506549715996, "learning_rate": 6.8233618233618235e-06, "loss": 0.3739, "step": 9030 }, { "epoch": 38.00632478632479, "grad_norm": 80.12126922607422, "learning_rate": 6.818613485280152e-06, "loss": 0.258, "step": 9040 }, { "epoch": 38.006752136752134, "grad_norm": 0.0014129136689007282, "learning_rate": 6.813865147198481e-06, "loss": 0.0002, "step": 9050 }, { "epoch": 38.007179487179485, "grad_norm": 507.7630310058594, "learning_rate": 6.809116809116809e-06, "loss": 0.2478, "step": 9060 }, { "epoch": 38.007606837606836, "grad_norm": 0.0028367233462631702, "learning_rate": 6.804368471035138e-06, "loss": 0.5826, "step": 9070 }, { "epoch": 38.00803418803419, "grad_norm": 0.018132785335183144, "learning_rate": 6.7996201329534676e-06, "loss": 0.3636, "step": 9080 }, { "epoch": 38.00846153846154, "grad_norm": 1.1534035205841064, "learning_rate": 6.794871794871796e-06, "loss": 0.454, "step": 9090 }, { "epoch": 38.00888888888889, "grad_norm": 0.0010121689410880208, "learning_rate": 6.790123456790124e-06, "loss": 0.8389, "step": 9100 }, { "epoch": 38.00931623931624, "grad_norm": 0.0008693256531842053, "learning_rate": 6.7853751187084525e-06, "loss": 0.0002, "step": 9110 }, { "epoch": 38.00974358974359, "grad_norm": 0.001992259407415986, "learning_rate": 6.780626780626781e-06, "loss": 0.0001, "step": 9120 }, { "epoch": 38.01, "eval_accuracy": 0.4, "eval_loss": 5.400512218475342, "eval_runtime": 33.5959, "eval_samples_per_second": 0.744, "eval_steps_per_second": 0.744, "step": 9126 }, { "epoch": 39.00017094017094, "grad_norm": 0.010901215486228466, "learning_rate": 6.77587844254511e-06, "loss": 0.0009, "step": 9130 }, { "epoch": 39.00059829059829, "grad_norm": 0.016072046011686325, "learning_rate": 6.771130104463438e-06, "loss": 0.1631, "step": 9140 }, { "epoch": 39.00102564102564, "grad_norm": 0.017606910318136215, "learning_rate": 6.7663817663817665e-06, "loss": 0.3611, "step": 9150 }, { "epoch": 39.00145299145299, "grad_norm": 0.07369554787874222, "learning_rate": 6.761633428300095e-06, "loss": 0.0008, "step": 9160 }, { "epoch": 39.001880341880344, "grad_norm": 0.023543603718280792, "learning_rate": 6.756885090218425e-06, "loss": 0.121, "step": 9170 }, { "epoch": 39.002307692307696, "grad_norm": 37.938270568847656, "learning_rate": 6.752136752136753e-06, "loss": 1.2801, "step": 9180 }, { "epoch": 39.00273504273504, "grad_norm": 0.0017556172097101808, "learning_rate": 6.747388414055081e-06, "loss": 0.0766, "step": 9190 }, { "epoch": 39.00316239316239, "grad_norm": 0.017493102699518204, "learning_rate": 6.74264007597341e-06, "loss": 0.0422, "step": 9200 }, { "epoch": 39.00358974358974, "grad_norm": 0.0010195322101935744, "learning_rate": 6.737891737891739e-06, "loss": 0.0576, "step": 9210 }, { "epoch": 39.004017094017094, "grad_norm": 0.009548988193273544, "learning_rate": 6.733143399810067e-06, "loss": 0.0003, "step": 9220 }, { "epoch": 39.004444444444445, "grad_norm": 0.00347512518055737, "learning_rate": 6.728395061728395e-06, "loss": 0.0004, "step": 9230 }, { "epoch": 39.0048717948718, "grad_norm": 0.027547018602490425, "learning_rate": 6.723646723646724e-06, "loss": 0.0002, "step": 9240 }, { "epoch": 39.00529914529915, "grad_norm": 0.005075276829302311, "learning_rate": 6.718898385565053e-06, "loss": 0.2796, "step": 9250 }, { "epoch": 39.0057264957265, "grad_norm": 0.03868807852268219, "learning_rate": 6.714150047483381e-06, "loss": 0.005, "step": 9260 }, { "epoch": 39.00615384615384, "grad_norm": 324.1128234863281, "learning_rate": 6.7094017094017094e-06, "loss": 0.82, "step": 9270 }, { "epoch": 39.006581196581195, "grad_norm": 0.10437753051519394, "learning_rate": 6.7046533713200394e-06, "loss": 0.9481, "step": 9280 }, { "epoch": 39.007008547008546, "grad_norm": 0.12134591490030289, "learning_rate": 6.699905033238368e-06, "loss": 0.0065, "step": 9290 }, { "epoch": 39.0074358974359, "grad_norm": 0.029539357870817184, "learning_rate": 6.695156695156696e-06, "loss": 0.0004, "step": 9300 }, { "epoch": 39.00786324786325, "grad_norm": 205.1163787841797, "learning_rate": 6.690408357075024e-06, "loss": 0.4614, "step": 9310 }, { "epoch": 39.0082905982906, "grad_norm": 0.008227716200053692, "learning_rate": 6.685660018993353e-06, "loss": 0.0715, "step": 9320 }, { "epoch": 39.00871794871795, "grad_norm": 292.14813232421875, "learning_rate": 6.680911680911682e-06, "loss": 0.2026, "step": 9330 }, { "epoch": 39.0091452991453, "grad_norm": 0.0007633490022271872, "learning_rate": 6.67616334283001e-06, "loss": 0.1259, "step": 9340 }, { "epoch": 39.00957264957265, "grad_norm": 0.0007212384953163564, "learning_rate": 6.671415004748338e-06, "loss": 0.05, "step": 9350 }, { "epoch": 39.01, "grad_norm": 0.0017658963333815336, "learning_rate": 6.666666666666667e-06, "loss": 0.1066, "step": 9360 }, { "epoch": 39.01, "eval_accuracy": 0.48, "eval_loss": 4.4508891105651855, "eval_runtime": 33.5596, "eval_samples_per_second": 0.745, "eval_steps_per_second": 0.745, "step": 9360 }, { "epoch": 40.00042735042735, "grad_norm": 0.24033397436141968, "learning_rate": 6.661918328584995e-06, "loss": 0.3139, "step": 9370 }, { "epoch": 40.0008547008547, "grad_norm": 0.11893019825220108, "learning_rate": 6.657169990503325e-06, "loss": 0.07, "step": 9380 }, { "epoch": 40.001282051282054, "grad_norm": 262.5067443847656, "learning_rate": 6.652421652421653e-06, "loss": 0.4037, "step": 9390 }, { "epoch": 40.001709401709405, "grad_norm": 0.05252963304519653, "learning_rate": 6.6476733143399815e-06, "loss": 0.3586, "step": 9400 }, { "epoch": 40.00213675213675, "grad_norm": 255.7079620361328, "learning_rate": 6.642924976258311e-06, "loss": 0.0414, "step": 9410 }, { "epoch": 40.0025641025641, "grad_norm": 0.07694090157747269, "learning_rate": 6.638176638176639e-06, "loss": 0.3065, "step": 9420 }, { "epoch": 40.00299145299145, "grad_norm": 0.0017998847179114819, "learning_rate": 6.633428300094967e-06, "loss": 0.0004, "step": 9430 }, { "epoch": 40.0034188034188, "grad_norm": 208.48390197753906, "learning_rate": 6.6286799620132956e-06, "loss": 0.775, "step": 9440 }, { "epoch": 40.003846153846155, "grad_norm": 0.004139886237680912, "learning_rate": 6.623931623931624e-06, "loss": 0.0009, "step": 9450 }, { "epoch": 40.004273504273506, "grad_norm": 0.0015383479185402393, "learning_rate": 6.619183285849953e-06, "loss": 0.0003, "step": 9460 }, { "epoch": 40.00470085470086, "grad_norm": 79.24043273925781, "learning_rate": 6.614434947768281e-06, "loss": 0.0126, "step": 9470 }, { "epoch": 40.00512820512821, "grad_norm": 0.10130419582128525, "learning_rate": 6.60968660968661e-06, "loss": 0.0092, "step": 9480 }, { "epoch": 40.00555555555555, "grad_norm": 0.0018481501610949636, "learning_rate": 6.60493827160494e-06, "loss": 0.0001, "step": 9490 }, { "epoch": 40.005982905982904, "grad_norm": 0.00359438662417233, "learning_rate": 6.600189933523268e-06, "loss": 0.0015, "step": 9500 }, { "epoch": 40.006410256410255, "grad_norm": 0.0007249795598909259, "learning_rate": 6.595441595441596e-06, "loss": 0.0001, "step": 9510 }, { "epoch": 40.00683760683761, "grad_norm": 0.0008260689792223275, "learning_rate": 6.5906932573599245e-06, "loss": 0.5924, "step": 9520 }, { "epoch": 40.00726495726496, "grad_norm": 0.0014530919725075364, "learning_rate": 6.585944919278253e-06, "loss": 0.7378, "step": 9530 }, { "epoch": 40.00769230769231, "grad_norm": 14.047000885009766, "learning_rate": 6.581196581196582e-06, "loss": 0.617, "step": 9540 }, { "epoch": 40.00811965811966, "grad_norm": 0.00833844393491745, "learning_rate": 6.57644824311491e-06, "loss": 0.0337, "step": 9550 }, { "epoch": 40.00854700854701, "grad_norm": 0.001020289957523346, "learning_rate": 6.5716999050332385e-06, "loss": 0.5866, "step": 9560 }, { "epoch": 40.008974358974356, "grad_norm": 0.0010084334062412381, "learning_rate": 6.566951566951567e-06, "loss": 0.765, "step": 9570 }, { "epoch": 40.00940170940171, "grad_norm": 0.0008349449490197003, "learning_rate": 6.562203228869896e-06, "loss": 0.0001, "step": 9580 }, { "epoch": 40.00982905982906, "grad_norm": 0.003181006060913205, "learning_rate": 6.557454890788225e-06, "loss": 0.0001, "step": 9590 }, { "epoch": 40.01, "eval_accuracy": 0.44, "eval_loss": 5.09063196182251, "eval_runtime": 33.6154, "eval_samples_per_second": 0.744, "eval_steps_per_second": 0.744, "step": 9594 }, { "epoch": 41.00025641025641, "grad_norm": 0.027484165504574776, "learning_rate": 6.552706552706553e-06, "loss": 0.0001, "step": 9600 }, { "epoch": 41.00068376068376, "grad_norm": 0.002781185321509838, "learning_rate": 6.5479582146248825e-06, "loss": 0.9085, "step": 9610 }, { "epoch": 41.00111111111111, "grad_norm": 0.002277638763189316, "learning_rate": 6.543209876543211e-06, "loss": 0.0046, "step": 9620 }, { "epoch": 41.00153846153846, "grad_norm": 0.001006715465337038, "learning_rate": 6.538461538461539e-06, "loss": 0.2547, "step": 9630 }, { "epoch": 41.00196581196581, "grad_norm": 0.017884666100144386, "learning_rate": 6.533713200379867e-06, "loss": 0.0003, "step": 9640 }, { "epoch": 41.00239316239316, "grad_norm": 0.0043096174485981464, "learning_rate": 6.528964862298196e-06, "loss": 0.3413, "step": 9650 }, { "epoch": 41.00282051282051, "grad_norm": 0.008827520534396172, "learning_rate": 6.524216524216525e-06, "loss": 0.5963, "step": 9660 }, { "epoch": 41.003247863247864, "grad_norm": 380.2333679199219, "learning_rate": 6.519468186134853e-06, "loss": 0.1933, "step": 9670 }, { "epoch": 41.003675213675216, "grad_norm": 53.198787689208984, "learning_rate": 6.5147198480531815e-06, "loss": 0.3678, "step": 9680 }, { "epoch": 41.00410256410257, "grad_norm": 0.0006724594277329743, "learning_rate": 6.50997150997151e-06, "loss": 0.0017, "step": 9690 }, { "epoch": 41.00452991452991, "grad_norm": 0.0016128338174894452, "learning_rate": 6.50522317188984e-06, "loss": 0.7708, "step": 9700 }, { "epoch": 41.00495726495726, "grad_norm": 0.0053380681201815605, "learning_rate": 6.500474833808168e-06, "loss": 0.0048, "step": 9710 }, { "epoch": 41.005384615384614, "grad_norm": 11.402435302734375, "learning_rate": 6.495726495726496e-06, "loss": 0.1091, "step": 9720 }, { "epoch": 41.005811965811965, "grad_norm": 0.0011073511559516191, "learning_rate": 6.490978157644825e-06, "loss": 0.0594, "step": 9730 }, { "epoch": 41.006239316239316, "grad_norm": 0.0016850410029292107, "learning_rate": 6.486229819563154e-06, "loss": 0.5871, "step": 9740 }, { "epoch": 41.00666666666667, "grad_norm": 0.011738145723938942, "learning_rate": 6.481481481481482e-06, "loss": 0.0002, "step": 9750 }, { "epoch": 41.00709401709402, "grad_norm": 0.02577214501798153, "learning_rate": 6.47673314339981e-06, "loss": 0.7295, "step": 9760 }, { "epoch": 41.00752136752137, "grad_norm": 0.0008236413705162704, "learning_rate": 6.471984805318139e-06, "loss": 0.0001, "step": 9770 }, { "epoch": 41.007948717948715, "grad_norm": 0.0007155478815548122, "learning_rate": 6.467236467236467e-06, "loss": 0.2632, "step": 9780 }, { "epoch": 41.008376068376066, "grad_norm": 0.0013219125103205442, "learning_rate": 6.462488129154796e-06, "loss": 0.0001, "step": 9790 }, { "epoch": 41.00880341880342, "grad_norm": 203.72512817382812, "learning_rate": 6.457739791073125e-06, "loss": 0.8773, "step": 9800 }, { "epoch": 41.00923076923077, "grad_norm": 0.0009469674550928175, "learning_rate": 6.4529914529914535e-06, "loss": 0.0001, "step": 9810 }, { "epoch": 41.00965811965812, "grad_norm": 0.013066309504210949, "learning_rate": 6.448243114909783e-06, "loss": 1.3235, "step": 9820 }, { "epoch": 41.01, "eval_accuracy": 0.48, "eval_loss": 4.409305572509766, "eval_runtime": 33.7085, "eval_samples_per_second": 0.742, "eval_steps_per_second": 0.742, "step": 9828 }, { "epoch": 42.00008547008547, "grad_norm": 0.0041074506007134914, "learning_rate": 6.443494776828111e-06, "loss": 0.0003, "step": 9830 }, { "epoch": 42.00051282051282, "grad_norm": 0.0007542824023403227, "learning_rate": 6.438746438746439e-06, "loss": 0.1122, "step": 9840 }, { "epoch": 42.00094017094017, "grad_norm": 0.0015057259006425738, "learning_rate": 6.433998100664768e-06, "loss": 0.0002, "step": 9850 }, { "epoch": 42.00136752136752, "grad_norm": 0.003451643278822303, "learning_rate": 6.429249762583096e-06, "loss": 0.0002, "step": 9860 }, { "epoch": 42.00179487179487, "grad_norm": 0.0011182624148204923, "learning_rate": 6.424501424501425e-06, "loss": 0.0045, "step": 9870 }, { "epoch": 42.00222222222222, "grad_norm": 0.0010131365852430463, "learning_rate": 6.419753086419753e-06, "loss": 0.0001, "step": 9880 }, { "epoch": 42.002649572649574, "grad_norm": 0.0007555413176305592, "learning_rate": 6.415004748338082e-06, "loss": 0.0006, "step": 9890 }, { "epoch": 42.003076923076925, "grad_norm": 0.0009690074366517365, "learning_rate": 6.410256410256412e-06, "loss": 0.0002, "step": 9900 }, { "epoch": 42.00350427350428, "grad_norm": 0.0007182178669609129, "learning_rate": 6.40550807217474e-06, "loss": 0.1885, "step": 9910 }, { "epoch": 42.00393162393162, "grad_norm": 0.0008008884033188224, "learning_rate": 6.400759734093068e-06, "loss": 0.0, "step": 9920 }, { "epoch": 42.00435897435897, "grad_norm": 0.00729968398809433, "learning_rate": 6.3960113960113965e-06, "loss": 0.9073, "step": 9930 }, { "epoch": 42.00478632478632, "grad_norm": 0.0006498902221210301, "learning_rate": 6.391263057929726e-06, "loss": 0.0001, "step": 9940 }, { "epoch": 42.005213675213675, "grad_norm": 0.00066241534659639, "learning_rate": 6.386514719848054e-06, "loss": 1.0737, "step": 9950 }, { "epoch": 42.005641025641026, "grad_norm": 330.1017150878906, "learning_rate": 6.381766381766382e-06, "loss": 1.9267, "step": 9960 }, { "epoch": 42.00606837606838, "grad_norm": 0.01651351898908615, "learning_rate": 6.3770180436847105e-06, "loss": 0.0003, "step": 9970 }, { "epoch": 42.00649572649573, "grad_norm": 0.005164479836821556, "learning_rate": 6.372269705603039e-06, "loss": 0.0008, "step": 9980 }, { "epoch": 42.00692307692308, "grad_norm": 425.1309814453125, "learning_rate": 6.367521367521368e-06, "loss": 0.3781, "step": 9990 }, { "epoch": 42.007350427350424, "grad_norm": 0.014773045666515827, "learning_rate": 6.362773029439696e-06, "loss": 0.0002, "step": 10000 }, { "epoch": 42.007777777777775, "grad_norm": 0.008133858442306519, "learning_rate": 6.358024691358025e-06, "loss": 0.0166, "step": 10010 }, { "epoch": 42.00820512820513, "grad_norm": 0.4493364691734314, "learning_rate": 6.3532763532763546e-06, "loss": 1.622, "step": 10020 }, { "epoch": 42.00863247863248, "grad_norm": 0.007051916792988777, "learning_rate": 6.348528015194683e-06, "loss": 0.0094, "step": 10030 }, { "epoch": 42.00905982905983, "grad_norm": 0.003875425783917308, "learning_rate": 6.343779677113011e-06, "loss": 0.0892, "step": 10040 }, { "epoch": 42.00948717948718, "grad_norm": 0.0009031430818140507, "learning_rate": 6.3390313390313394e-06, "loss": 0.0001, "step": 10050 }, { "epoch": 42.00991452991453, "grad_norm": 392.3472595214844, "learning_rate": 6.334283000949668e-06, "loss": 0.4313, "step": 10060 }, { "epoch": 42.01, "eval_accuracy": 0.48, "eval_loss": 4.089754581451416, "eval_runtime": 33.5482, "eval_samples_per_second": 0.745, "eval_steps_per_second": 0.745, "step": 10062 }, { "epoch": 43.00034188034188, "grad_norm": 0.004300788510590792, "learning_rate": 6.329534662867997e-06, "loss": 0.0001, "step": 10070 }, { "epoch": 43.00076923076923, "grad_norm": 0.0026491975877434015, "learning_rate": 6.324786324786325e-06, "loss": 0.004, "step": 10080 }, { "epoch": 43.00119658119658, "grad_norm": 0.002639173995703459, "learning_rate": 6.3200379867046535e-06, "loss": 0.0006, "step": 10090 }, { "epoch": 43.00162393162393, "grad_norm": 0.0024889677297323942, "learning_rate": 6.315289648622982e-06, "loss": 0.0006, "step": 10100 }, { "epoch": 43.00205128205128, "grad_norm": 0.0487215556204319, "learning_rate": 6.310541310541312e-06, "loss": 0.0001, "step": 10110 }, { "epoch": 43.002478632478635, "grad_norm": 0.0023086927831172943, "learning_rate": 6.30579297245964e-06, "loss": 0.307, "step": 10120 }, { "epoch": 43.002905982905986, "grad_norm": 0.007922650314867496, "learning_rate": 6.301044634377968e-06, "loss": 0.3353, "step": 10130 }, { "epoch": 43.00333333333333, "grad_norm": 0.0018696904880926013, "learning_rate": 6.296296296296297e-06, "loss": 0.0131, "step": 10140 }, { "epoch": 43.00376068376068, "grad_norm": 0.027236852794885635, "learning_rate": 6.291547958214626e-06, "loss": 0.017, "step": 10150 }, { "epoch": 43.00418803418803, "grad_norm": 0.0007559856749139726, "learning_rate": 6.286799620132954e-06, "loss": 0.0005, "step": 10160 }, { "epoch": 43.004615384615384, "grad_norm": 11.704588890075684, "learning_rate": 6.282051282051282e-06, "loss": 0.0014, "step": 10170 }, { "epoch": 43.005042735042736, "grad_norm": 0.06166832521557808, "learning_rate": 6.277302943969611e-06, "loss": 0.9794, "step": 10180 }, { "epoch": 43.00547008547009, "grad_norm": 0.0030286931432783604, "learning_rate": 6.272554605887939e-06, "loss": 0.2002, "step": 10190 }, { "epoch": 43.00589743589744, "grad_norm": 0.006914252880960703, "learning_rate": 6.267806267806268e-06, "loss": 0.0001, "step": 10200 }, { "epoch": 43.00632478632479, "grad_norm": 149.87620544433594, "learning_rate": 6.2630579297245964e-06, "loss": 1.023, "step": 10210 }, { "epoch": 43.006752136752134, "grad_norm": 0.0074666948057711124, "learning_rate": 6.2583095916429256e-06, "loss": 0.2019, "step": 10220 }, { "epoch": 43.007179487179485, "grad_norm": 0.0010838122107088566, "learning_rate": 6.253561253561255e-06, "loss": 0.0011, "step": 10230 }, { "epoch": 43.007606837606836, "grad_norm": 0.10124364495277405, "learning_rate": 6.248812915479583e-06, "loss": 0.3781, "step": 10240 }, { "epoch": 43.00803418803419, "grad_norm": 0.0029677078127861023, "learning_rate": 6.244064577397911e-06, "loss": 0.1722, "step": 10250 }, { "epoch": 43.00846153846154, "grad_norm": 0.012870069593191147, "learning_rate": 6.23931623931624e-06, "loss": 0.0001, "step": 10260 }, { "epoch": 43.00888888888889, "grad_norm": 0.0010046313982456923, "learning_rate": 6.234567901234569e-06, "loss": 0.0003, "step": 10270 }, { "epoch": 43.00931623931624, "grad_norm": 0.0031989836134016514, "learning_rate": 6.229819563152897e-06, "loss": 0.0002, "step": 10280 }, { "epoch": 43.00974358974359, "grad_norm": 0.0007750399527139962, "learning_rate": 6.225071225071225e-06, "loss": 0.0002, "step": 10290 }, { "epoch": 43.01, "eval_accuracy": 0.44, "eval_loss": 4.78167200088501, "eval_runtime": 33.6921, "eval_samples_per_second": 0.742, "eval_steps_per_second": 0.742, "step": 10296 }, { "epoch": 44.00017094017094, "grad_norm": 0.0008096142555586994, "learning_rate": 6.220322886989554e-06, "loss": 0.7658, "step": 10300 }, { "epoch": 44.00059829059829, "grad_norm": 0.001107421237975359, "learning_rate": 6.215574548907882e-06, "loss": 0.0011, "step": 10310 }, { "epoch": 44.00102564102564, "grad_norm": 147.27357482910156, "learning_rate": 6.210826210826212e-06, "loss": 0.8891, "step": 10320 }, { "epoch": 44.00145299145299, "grad_norm": 0.21983663737773895, "learning_rate": 6.20607787274454e-06, "loss": 0.0002, "step": 10330 }, { "epoch": 44.001880341880344, "grad_norm": 0.0018023523734882474, "learning_rate": 6.2013295346628685e-06, "loss": 0.0806, "step": 10340 }, { "epoch": 44.002307692307696, "grad_norm": 0.0013478569453582168, "learning_rate": 6.196581196581198e-06, "loss": 0.442, "step": 10350 }, { "epoch": 44.00273504273504, "grad_norm": 0.0480712465941906, "learning_rate": 6.191832858499526e-06, "loss": 1.1859, "step": 10360 }, { "epoch": 44.00316239316239, "grad_norm": 0.0018559067975729704, "learning_rate": 6.187084520417854e-06, "loss": 0.8634, "step": 10370 }, { "epoch": 44.00358974358974, "grad_norm": 0.0008481431868858635, "learning_rate": 6.1823361823361825e-06, "loss": 0.8489, "step": 10380 }, { "epoch": 44.004017094017094, "grad_norm": 0.003446524264290929, "learning_rate": 6.177587844254511e-06, "loss": 0.5956, "step": 10390 }, { "epoch": 44.004444444444445, "grad_norm": 0.0011318651959300041, "learning_rate": 6.17283950617284e-06, "loss": 0.0008, "step": 10400 }, { "epoch": 44.0048717948718, "grad_norm": 0.002299872925505042, "learning_rate": 6.168091168091168e-06, "loss": 0.0322, "step": 10410 }, { "epoch": 44.00529914529915, "grad_norm": 0.002232542959973216, "learning_rate": 6.1633428300094974e-06, "loss": 1.0857, "step": 10420 }, { "epoch": 44.0057264957265, "grad_norm": 0.1128619983792305, "learning_rate": 6.1585944919278266e-06, "loss": 0.0205, "step": 10430 }, { "epoch": 44.00615384615384, "grad_norm": 0.12662559747695923, "learning_rate": 6.153846153846155e-06, "loss": 0.876, "step": 10440 }, { "epoch": 44.006581196581195, "grad_norm": 0.009139850735664368, "learning_rate": 6.149097815764483e-06, "loss": 0.0007, "step": 10450 }, { "epoch": 44.007008547008546, "grad_norm": 0.0021971724927425385, "learning_rate": 6.1443494776828115e-06, "loss": 0.0002, "step": 10460 }, { "epoch": 44.0074358974359, "grad_norm": 0.00538423378020525, "learning_rate": 6.13960113960114e-06, "loss": 0.0001, "step": 10470 }, { "epoch": 44.00786324786325, "grad_norm": 0.026287034153938293, "learning_rate": 6.134852801519469e-06, "loss": 0.0432, "step": 10480 }, { "epoch": 44.0082905982906, "grad_norm": 0.0007875883602537215, "learning_rate": 6.130104463437797e-06, "loss": 0.266, "step": 10490 }, { "epoch": 44.00871794871795, "grad_norm": 0.01668292097747326, "learning_rate": 6.1253561253561255e-06, "loss": 0.0248, "step": 10500 }, { "epoch": 44.0091452991453, "grad_norm": 0.025921987369656563, "learning_rate": 6.120607787274454e-06, "loss": 0.0002, "step": 10510 }, { "epoch": 44.00957264957265, "grad_norm": 0.0005760848871432245, "learning_rate": 6.115859449192783e-06, "loss": 0.0901, "step": 10520 }, { "epoch": 44.01, "grad_norm": 0.0007085147080942988, "learning_rate": 6.111111111111112e-06, "loss": 0.0001, "step": 10530 }, { "epoch": 44.01, "eval_accuracy": 0.48, "eval_loss": 4.866738319396973, "eval_runtime": 33.6315, "eval_samples_per_second": 0.743, "eval_steps_per_second": 0.743, "step": 10530 }, { "epoch": 45.00042735042735, "grad_norm": 0.013209497556090355, "learning_rate": 6.10636277302944e-06, "loss": 0.2852, "step": 10540 }, { "epoch": 45.0008547008547, "grad_norm": 0.0015938766300678253, "learning_rate": 6.101614434947769e-06, "loss": 0.0001, "step": 10550 }, { "epoch": 45.001282051282054, "grad_norm": 0.05549630522727966, "learning_rate": 6.096866096866098e-06, "loss": 0.9446, "step": 10560 }, { "epoch": 45.001709401709405, "grad_norm": 0.0019539748318493366, "learning_rate": 6.092117758784426e-06, "loss": 0.0001, "step": 10570 }, { "epoch": 45.00213675213675, "grad_norm": 0.0008734731236472726, "learning_rate": 6.087369420702754e-06, "loss": 0.0078, "step": 10580 }, { "epoch": 45.0025641025641, "grad_norm": 0.0014504172140732408, "learning_rate": 6.082621082621083e-06, "loss": 0.0003, "step": 10590 }, { "epoch": 45.00299145299145, "grad_norm": 0.0006624649395234883, "learning_rate": 6.077872744539412e-06, "loss": 0.0005, "step": 10600 }, { "epoch": 45.0034188034188, "grad_norm": 0.0008454503258690238, "learning_rate": 6.07312440645774e-06, "loss": 0.7002, "step": 10610 }, { "epoch": 45.003846153846155, "grad_norm": 0.0008091210620477796, "learning_rate": 6.0683760683760684e-06, "loss": 0.0006, "step": 10620 }, { "epoch": 45.004273504273506, "grad_norm": 0.0016585165867581964, "learning_rate": 6.063627730294398e-06, "loss": 0.0001, "step": 10630 }, { "epoch": 45.00470085470086, "grad_norm": 0.0022859878372401, "learning_rate": 6.058879392212727e-06, "loss": 0.1929, "step": 10640 }, { "epoch": 45.00512820512821, "grad_norm": 604.373291015625, "learning_rate": 6.054131054131055e-06, "loss": 0.6576, "step": 10650 }, { "epoch": 45.00555555555555, "grad_norm": 0.015343650244176388, "learning_rate": 6.049382716049383e-06, "loss": 0.555, "step": 10660 }, { "epoch": 45.005982905982904, "grad_norm": 0.0035480374936014414, "learning_rate": 6.044634377967712e-06, "loss": 0.0039, "step": 10670 }, { "epoch": 45.006410256410255, "grad_norm": 0.0072784130461514, "learning_rate": 6.039886039886041e-06, "loss": 0.0001, "step": 10680 }, { "epoch": 45.00683760683761, "grad_norm": 0.0026009362190961838, "learning_rate": 6.035137701804369e-06, "loss": 0.0002, "step": 10690 }, { "epoch": 45.00726495726496, "grad_norm": 0.000990421511232853, "learning_rate": 6.030389363722697e-06, "loss": 0.0012, "step": 10700 }, { "epoch": 45.00769230769231, "grad_norm": 0.0006937840371392667, "learning_rate": 6.025641025641026e-06, "loss": 0.0, "step": 10710 }, { "epoch": 45.00811965811966, "grad_norm": 0.001313463319092989, "learning_rate": 6.020892687559354e-06, "loss": 0.0073, "step": 10720 }, { "epoch": 45.00854700854701, "grad_norm": 0.02181072160601616, "learning_rate": 6.016144349477683e-06, "loss": 0.0001, "step": 10730 }, { "epoch": 45.008974358974356, "grad_norm": 0.2195536494255066, "learning_rate": 6.011396011396012e-06, "loss": 0.0003, "step": 10740 }, { "epoch": 45.00940170940171, "grad_norm": 0.006073196418583393, "learning_rate": 6.0066476733143405e-06, "loss": 0.0004, "step": 10750 }, { "epoch": 45.00982905982906, "grad_norm": 0.0016091590514406562, "learning_rate": 6.00189933523267e-06, "loss": 0.0007, "step": 10760 }, { "epoch": 45.01, "eval_accuracy": 0.48, "eval_loss": 4.561890125274658, "eval_runtime": 34.2721, "eval_samples_per_second": 0.729, "eval_steps_per_second": 0.729, "step": 10764 }, { "epoch": 46.00025641025641, "grad_norm": 0.0011347552062943578, "learning_rate": 5.997150997150998e-06, "loss": 0.7172, "step": 10770 }, { "epoch": 46.00068376068376, "grad_norm": 375.4585266113281, "learning_rate": 5.992402659069326e-06, "loss": 0.821, "step": 10780 }, { "epoch": 46.00111111111111, "grad_norm": 0.0010995555203408003, "learning_rate": 5.9876543209876546e-06, "loss": 0.1693, "step": 10790 }, { "epoch": 46.00153846153846, "grad_norm": 0.11372661590576172, "learning_rate": 5.982905982905983e-06, "loss": 0.0002, "step": 10800 }, { "epoch": 46.00196581196581, "grad_norm": 0.0009383049909956753, "learning_rate": 5.978157644824312e-06, "loss": 0.0001, "step": 10810 }, { "epoch": 46.00239316239316, "grad_norm": 0.004879310727119446, "learning_rate": 5.97340930674264e-06, "loss": 0.0005, "step": 10820 }, { "epoch": 46.00282051282051, "grad_norm": 0.00243366789072752, "learning_rate": 5.968660968660969e-06, "loss": 0.1072, "step": 10830 }, { "epoch": 46.003247863247864, "grad_norm": 0.0007890159031376243, "learning_rate": 5.963912630579299e-06, "loss": 0.0003, "step": 10840 }, { "epoch": 46.003675213675216, "grad_norm": 0.009691433981060982, "learning_rate": 5.959164292497627e-06, "loss": 0.4108, "step": 10850 }, { "epoch": 46.00410256410257, "grad_norm": 0.003631254890933633, "learning_rate": 5.954415954415955e-06, "loss": 0.0001, "step": 10860 }, { "epoch": 46.00452991452991, "grad_norm": 0.0005667143850587308, "learning_rate": 5.9496676163342835e-06, "loss": 1.0405, "step": 10870 }, { "epoch": 46.00495726495726, "grad_norm": 0.0033552220556885004, "learning_rate": 5.944919278252612e-06, "loss": 0.0001, "step": 10880 }, { "epoch": 46.005384615384614, "grad_norm": 0.07237747311592102, "learning_rate": 5.940170940170941e-06, "loss": 0.6576, "step": 10890 }, { "epoch": 46.005811965811965, "grad_norm": 0.0010696067474782467, "learning_rate": 5.935422602089269e-06, "loss": 0.0001, "step": 10900 }, { "epoch": 46.006239316239316, "grad_norm": 0.0015365901635959744, "learning_rate": 5.9306742640075975e-06, "loss": 0.7127, "step": 10910 }, { "epoch": 46.00666666666667, "grad_norm": 0.002339027589187026, "learning_rate": 5.925925925925926e-06, "loss": 0.0023, "step": 10920 }, { "epoch": 46.00709401709402, "grad_norm": 0.01849093660712242, "learning_rate": 5.921177587844255e-06, "loss": 0.4024, "step": 10930 }, { "epoch": 46.00752136752137, "grad_norm": 0.009332343935966492, "learning_rate": 5.916429249762583e-06, "loss": 0.0002, "step": 10940 }, { "epoch": 46.007948717948715, "grad_norm": 0.009404930286109447, "learning_rate": 5.911680911680912e-06, "loss": 0.001, "step": 10950 }, { "epoch": 46.008376068376066, "grad_norm": 0.0012134173884987831, "learning_rate": 5.906932573599241e-06, "loss": 0.5511, "step": 10960 }, { "epoch": 46.00880341880342, "grad_norm": 0.01179675292223692, "learning_rate": 5.90218423551757e-06, "loss": 0.0018, "step": 10970 }, { "epoch": 46.00923076923077, "grad_norm": 0.0012383083812892437, "learning_rate": 5.897435897435898e-06, "loss": 0.7245, "step": 10980 }, { "epoch": 46.00965811965812, "grad_norm": 0.001664446317590773, "learning_rate": 5.8926875593542264e-06, "loss": 0.0009, "step": 10990 }, { "epoch": 46.01, "eval_accuracy": 0.44, "eval_loss": 5.025023937225342, "eval_runtime": 33.6102, "eval_samples_per_second": 0.744, "eval_steps_per_second": 0.744, "step": 10998 }, { "epoch": 47.00008547008547, "grad_norm": 0.002813153900206089, "learning_rate": 5.887939221272555e-06, "loss": 0.6121, "step": 11000 }, { "epoch": 47.00051282051282, "grad_norm": 0.002290428848937154, "learning_rate": 5.883190883190884e-06, "loss": 0.0002, "step": 11010 }, { "epoch": 47.00094017094017, "grad_norm": 0.013784612528979778, "learning_rate": 5.878442545109212e-06, "loss": 0.0041, "step": 11020 }, { "epoch": 47.00136752136752, "grad_norm": 0.008057618513703346, "learning_rate": 5.8736942070275405e-06, "loss": 0.0002, "step": 11030 }, { "epoch": 47.00179487179487, "grad_norm": 0.0026002710219472647, "learning_rate": 5.868945868945869e-06, "loss": 0.9305, "step": 11040 }, { "epoch": 47.00222222222222, "grad_norm": 375.0780029296875, "learning_rate": 5.864197530864199e-06, "loss": 0.9073, "step": 11050 }, { "epoch": 47.002649572649574, "grad_norm": 0.002601037034764886, "learning_rate": 5.859449192782527e-06, "loss": 0.1539, "step": 11060 }, { "epoch": 47.003076923076925, "grad_norm": 0.013979580253362656, "learning_rate": 5.854700854700855e-06, "loss": 0.0031, "step": 11070 }, { "epoch": 47.00350427350428, "grad_norm": 0.003610535990446806, "learning_rate": 5.849952516619184e-06, "loss": 0.0126, "step": 11080 }, { "epoch": 47.00393162393162, "grad_norm": 0.0018818234093487263, "learning_rate": 5.845204178537513e-06, "loss": 0.0001, "step": 11090 }, { "epoch": 47.00435897435897, "grad_norm": 209.32333374023438, "learning_rate": 5.840455840455841e-06, "loss": 1.5139, "step": 11100 }, { "epoch": 47.00478632478632, "grad_norm": 0.002293006982654333, "learning_rate": 5.835707502374169e-06, "loss": 0.0011, "step": 11110 }, { "epoch": 47.005213675213675, "grad_norm": 0.0014583432348445058, "learning_rate": 5.830959164292498e-06, "loss": 0.0002, "step": 11120 }, { "epoch": 47.005641025641026, "grad_norm": 0.000818538770545274, "learning_rate": 5.826210826210826e-06, "loss": 0.0164, "step": 11130 }, { "epoch": 47.00606837606838, "grad_norm": 0.025977041572332382, "learning_rate": 5.821462488129155e-06, "loss": 0.0002, "step": 11140 }, { "epoch": 47.00649572649573, "grad_norm": 0.0008319366024807096, "learning_rate": 5.816714150047484e-06, "loss": 0.0003, "step": 11150 }, { "epoch": 47.00692307692308, "grad_norm": 0.007519662380218506, "learning_rate": 5.8119658119658126e-06, "loss": 0.0643, "step": 11160 }, { "epoch": 47.007350427350424, "grad_norm": 0.14387542009353638, "learning_rate": 5.807217473884142e-06, "loss": 0.0002, "step": 11170 }, { "epoch": 47.007777777777775, "grad_norm": 0.0030812029726803303, "learning_rate": 5.80246913580247e-06, "loss": 0.0001, "step": 11180 }, { "epoch": 47.00820512820513, "grad_norm": 0.0011086307931691408, "learning_rate": 5.797720797720798e-06, "loss": 0.3969, "step": 11190 }, { "epoch": 47.00863247863248, "grad_norm": 0.0029646113980561495, "learning_rate": 5.792972459639127e-06, "loss": 0.0001, "step": 11200 }, { "epoch": 47.00905982905983, "grad_norm": 0.003931617829948664, "learning_rate": 5.788224121557455e-06, "loss": 0.7591, "step": 11210 }, { "epoch": 47.00948717948718, "grad_norm": 0.006124768406152725, "learning_rate": 5.783475783475784e-06, "loss": 0.2924, "step": 11220 }, { "epoch": 47.00991452991453, "grad_norm": 0.0010141300735995173, "learning_rate": 5.778727445394112e-06, "loss": 0.0001, "step": 11230 }, { "epoch": 47.01, "eval_accuracy": 0.48, "eval_loss": 4.412927627563477, "eval_runtime": 33.7429, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.741, "step": 11232 }, { "epoch": 48.00034188034188, "grad_norm": 0.007142414338886738, "learning_rate": 5.773979107312441e-06, "loss": 0.8418, "step": 11240 }, { "epoch": 48.00076923076923, "grad_norm": 1.3151068687438965, "learning_rate": 5.769230769230769e-06, "loss": 1.6759, "step": 11250 }, { "epoch": 48.00119658119658, "grad_norm": 0.0036189991515129805, "learning_rate": 5.764482431149099e-06, "loss": 0.0004, "step": 11260 }, { "epoch": 48.00162393162393, "grad_norm": 0.000717452319804579, "learning_rate": 5.759734093067427e-06, "loss": 0.0001, "step": 11270 }, { "epoch": 48.00205128205128, "grad_norm": 0.07057972997426987, "learning_rate": 5.7549857549857555e-06, "loss": 0.0422, "step": 11280 }, { "epoch": 48.002478632478635, "grad_norm": 589.4522094726562, "learning_rate": 5.750237416904084e-06, "loss": 0.0541, "step": 11290 }, { "epoch": 48.002905982905986, "grad_norm": 0.01362372562289238, "learning_rate": 5.745489078822413e-06, "loss": 0.0055, "step": 11300 }, { "epoch": 48.00333333333333, "grad_norm": 0.01434069313108921, "learning_rate": 5.740740740740741e-06, "loss": 0.7408, "step": 11310 }, { "epoch": 48.00376068376068, "grad_norm": 0.0020310671534389257, "learning_rate": 5.7359924026590695e-06, "loss": 0.0002, "step": 11320 }, { "epoch": 48.00418803418803, "grad_norm": 0.005610823631286621, "learning_rate": 5.731244064577398e-06, "loss": 0.0002, "step": 11330 }, { "epoch": 48.004615384615384, "grad_norm": 0.0008401147206313908, "learning_rate": 5.726495726495727e-06, "loss": 0.0001, "step": 11340 }, { "epoch": 48.005042735042736, "grad_norm": 0.03286376968026161, "learning_rate": 5.721747388414055e-06, "loss": 0.0007, "step": 11350 }, { "epoch": 48.00547008547009, "grad_norm": 0.002525913529098034, "learning_rate": 5.716999050332384e-06, "loss": 0.0003, "step": 11360 }, { "epoch": 48.00589743589744, "grad_norm": 0.0011132374638691545, "learning_rate": 5.7122507122507136e-06, "loss": 0.0, "step": 11370 }, { "epoch": 48.00632478632479, "grad_norm": 0.0005615410045720637, "learning_rate": 5.707502374169042e-06, "loss": 0.0002, "step": 11380 }, { "epoch": 48.006752136752134, "grad_norm": 0.006012198980897665, "learning_rate": 5.70275403608737e-06, "loss": 0.0007, "step": 11390 }, { "epoch": 48.007179487179485, "grad_norm": 0.014253268949687481, "learning_rate": 5.6980056980056985e-06, "loss": 0.0001, "step": 11400 }, { "epoch": 48.007606837606836, "grad_norm": 0.001018693670630455, "learning_rate": 5.693257359924027e-06, "loss": 0.0002, "step": 11410 }, { "epoch": 48.00803418803419, "grad_norm": 0.030744800344109535, "learning_rate": 5.688509021842356e-06, "loss": 0.6403, "step": 11420 }, { "epoch": 48.00846153846154, "grad_norm": 0.013697023503482342, "learning_rate": 5.683760683760684e-06, "loss": 0.0001, "step": 11430 }, { "epoch": 48.00888888888889, "grad_norm": 0.01648533344268799, "learning_rate": 5.6790123456790125e-06, "loss": 0.0002, "step": 11440 }, { "epoch": 48.00931623931624, "grad_norm": 0.0017602647421881557, "learning_rate": 5.674264007597341e-06, "loss": 0.0014, "step": 11450 }, { "epoch": 48.00974358974359, "grad_norm": 0.0007377556175924838, "learning_rate": 5.669515669515669e-06, "loss": 0.0001, "step": 11460 }, { "epoch": 48.01, "eval_accuracy": 0.44, "eval_loss": 5.5987443923950195, "eval_runtime": 33.7645, "eval_samples_per_second": 0.74, "eval_steps_per_second": 0.74, "step": 11466 }, { "epoch": 49.00017094017094, "grad_norm": 0.0007492690929211676, "learning_rate": 5.664767331433999e-06, "loss": 0.6818, "step": 11470 }, { "epoch": 49.00059829059829, "grad_norm": 0.03045983612537384, "learning_rate": 5.660018993352327e-06, "loss": 0.5287, "step": 11480 }, { "epoch": 49.00102564102564, "grad_norm": 0.0005993829108774662, "learning_rate": 5.655270655270656e-06, "loss": 0.0425, "step": 11490 }, { "epoch": 49.00145299145299, "grad_norm": 0.0005655125132761896, "learning_rate": 5.650522317188985e-06, "loss": 0.0001, "step": 11500 }, { "epoch": 49.001880341880344, "grad_norm": 0.000584998691920191, "learning_rate": 5.645773979107313e-06, "loss": 0.0001, "step": 11510 }, { "epoch": 49.002307692307696, "grad_norm": 0.0036167309153825045, "learning_rate": 5.641025641025641e-06, "loss": 0.0019, "step": 11520 }, { "epoch": 49.00273504273504, "grad_norm": 0.0014566316967830062, "learning_rate": 5.63627730294397e-06, "loss": 0.0002, "step": 11530 }, { "epoch": 49.00316239316239, "grad_norm": 194.14332580566406, "learning_rate": 5.631528964862298e-06, "loss": 0.8389, "step": 11540 }, { "epoch": 49.00358974358974, "grad_norm": 0.03927035257220268, "learning_rate": 5.626780626780627e-06, "loss": 0.0001, "step": 11550 }, { "epoch": 49.004017094017094, "grad_norm": 0.01512466836720705, "learning_rate": 5.6220322886989554e-06, "loss": 0.0011, "step": 11560 }, { "epoch": 49.004444444444445, "grad_norm": 0.0020757820457220078, "learning_rate": 5.617283950617285e-06, "loss": 0.3843, "step": 11570 }, { "epoch": 49.0048717948718, "grad_norm": 0.0007370910025201738, "learning_rate": 5.612535612535614e-06, "loss": 0.0001, "step": 11580 }, { "epoch": 49.00529914529915, "grad_norm": 0.0047119674272835255, "learning_rate": 5.607787274453942e-06, "loss": 0.4626, "step": 11590 }, { "epoch": 49.0057264957265, "grad_norm": 0.0026911916211247444, "learning_rate": 5.60303893637227e-06, "loss": 0.0649, "step": 11600 }, { "epoch": 49.00615384615384, "grad_norm": 0.02297067642211914, "learning_rate": 5.598290598290599e-06, "loss": 0.0036, "step": 11610 }, { "epoch": 49.006581196581195, "grad_norm": 0.0033419374376535416, "learning_rate": 5.593542260208927e-06, "loss": 0.0001, "step": 11620 }, { "epoch": 49.007008547008546, "grad_norm": 0.08777555078268051, "learning_rate": 5.588793922127256e-06, "loss": 0.95, "step": 11630 }, { "epoch": 49.0074358974359, "grad_norm": 0.0006313354824669659, "learning_rate": 5.584045584045584e-06, "loss": 0.6174, "step": 11640 }, { "epoch": 49.00786324786325, "grad_norm": 0.0009615861345082521, "learning_rate": 5.579297245963913e-06, "loss": 0.0004, "step": 11650 }, { "epoch": 49.0082905982906, "grad_norm": 0.04878242313861847, "learning_rate": 5.574548907882241e-06, "loss": 0.0001, "step": 11660 }, { "epoch": 49.00871794871795, "grad_norm": 0.0008145536412484944, "learning_rate": 5.56980056980057e-06, "loss": 0.0002, "step": 11670 }, { "epoch": 49.0091452991453, "grad_norm": 0.0008402611128985882, "learning_rate": 5.565052231718899e-06, "loss": 0.0, "step": 11680 }, { "epoch": 49.00957264957265, "grad_norm": 0.17107157409191132, "learning_rate": 5.5603038936372275e-06, "loss": 0.044, "step": 11690 }, { "epoch": 49.01, "grad_norm": 0.007154208142310381, "learning_rate": 5.555555555555557e-06, "loss": 0.0003, "step": 11700 }, { "epoch": 49.01, "eval_accuracy": 0.44, "eval_loss": 5.4567461013793945, "eval_runtime": 33.7217, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.741, "step": 11700 }, { "epoch": 50.00042735042735, "grad_norm": 0.0006979930330999196, "learning_rate": 5.550807217473885e-06, "loss": 0.0, "step": 11710 }, { "epoch": 50.0008547008547, "grad_norm": 0.005530244205147028, "learning_rate": 5.546058879392213e-06, "loss": 0.0002, "step": 11720 }, { "epoch": 50.001282051282054, "grad_norm": 0.007461980450898409, "learning_rate": 5.5413105413105416e-06, "loss": 0.0723, "step": 11730 }, { "epoch": 50.001709401709405, "grad_norm": 0.000708905456122011, "learning_rate": 5.53656220322887e-06, "loss": 0.0008, "step": 11740 }, { "epoch": 50.00213675213675, "grad_norm": 0.0005582983139902353, "learning_rate": 5.531813865147199e-06, "loss": 0.2155, "step": 11750 }, { "epoch": 50.0025641025641, "grad_norm": 0.001401066081598401, "learning_rate": 5.527065527065527e-06, "loss": 0.0042, "step": 11760 }, { "epoch": 50.00299145299145, "grad_norm": 0.0036240883637219667, "learning_rate": 5.522317188983856e-06, "loss": 0.5656, "step": 11770 }, { "epoch": 50.0034188034188, "grad_norm": 0.0006863917224109173, "learning_rate": 5.517568850902186e-06, "loss": 0.0002, "step": 11780 }, { "epoch": 50.003846153846155, "grad_norm": 0.0006204345263540745, "learning_rate": 5.512820512820514e-06, "loss": 0.0001, "step": 11790 }, { "epoch": 50.004273504273506, "grad_norm": 0.0025562657974660397, "learning_rate": 5.508072174738842e-06, "loss": 0.0004, "step": 11800 }, { "epoch": 50.00470085470086, "grad_norm": 0.0017411591252312064, "learning_rate": 5.5033238366571705e-06, "loss": 0.0002, "step": 11810 }, { "epoch": 50.00512820512821, "grad_norm": 23.52789306640625, "learning_rate": 5.498575498575499e-06, "loss": 0.0036, "step": 11820 }, { "epoch": 50.00555555555555, "grad_norm": 0.0005795454489998519, "learning_rate": 5.493827160493828e-06, "loss": 0.7016, "step": 11830 }, { "epoch": 50.005982905982904, "grad_norm": 0.0019489077385514975, "learning_rate": 5.489078822412156e-06, "loss": 0.0001, "step": 11840 }, { "epoch": 50.006410256410255, "grad_norm": 0.0617222860455513, "learning_rate": 5.4843304843304845e-06, "loss": 0.0001, "step": 11850 }, { "epoch": 50.00683760683761, "grad_norm": 403.005859375, "learning_rate": 5.479582146248813e-06, "loss": 0.1644, "step": 11860 }, { "epoch": 50.00726495726496, "grad_norm": 0.14567570388317108, "learning_rate": 5.474833808167141e-06, "loss": 0.0004, "step": 11870 }, { "epoch": 50.00769230769231, "grad_norm": 0.0005991229554638267, "learning_rate": 5.470085470085471e-06, "loss": 0.0898, "step": 11880 }, { "epoch": 50.00811965811966, "grad_norm": 0.14940251410007477, "learning_rate": 5.465337132003799e-06, "loss": 0.0001, "step": 11890 }, { "epoch": 50.00854700854701, "grad_norm": 0.028690291568636894, "learning_rate": 5.460588793922128e-06, "loss": 0.0023, "step": 11900 }, { "epoch": 50.008974358974356, "grad_norm": 0.21514463424682617, "learning_rate": 5.455840455840457e-06, "loss": 0.6174, "step": 11910 }, { "epoch": 50.00940170940171, "grad_norm": 0.0008885301067493856, "learning_rate": 5.451092117758785e-06, "loss": 0.0001, "step": 11920 }, { "epoch": 50.00982905982906, "grad_norm": 0.0035741357132792473, "learning_rate": 5.446343779677113e-06, "loss": 0.0468, "step": 11930 }, { "epoch": 50.01, "eval_accuracy": 0.48, "eval_loss": 5.021820545196533, "eval_runtime": 33.527, "eval_samples_per_second": 0.746, "eval_steps_per_second": 0.746, "step": 11934 }, { "epoch": 51.00025641025641, "grad_norm": 0.11837821453809738, "learning_rate": 5.441595441595442e-06, "loss": 0.0001, "step": 11940 }, { "epoch": 51.00068376068376, "grad_norm": 32.09723663330078, "learning_rate": 5.43684710351377e-06, "loss": 0.0062, "step": 11950 }, { "epoch": 51.00111111111111, "grad_norm": 0.0008311708807013929, "learning_rate": 5.432098765432099e-06, "loss": 0.4382, "step": 11960 }, { "epoch": 51.00153846153846, "grad_norm": 0.00047426484525203705, "learning_rate": 5.4273504273504275e-06, "loss": 0.0019, "step": 11970 }, { "epoch": 51.00196581196581, "grad_norm": 0.0012167677050456405, "learning_rate": 5.422602089268756e-06, "loss": 0.0002, "step": 11980 }, { "epoch": 51.00239316239316, "grad_norm": 0.0028268792666494846, "learning_rate": 5.417853751187086e-06, "loss": 0.0001, "step": 11990 }, { "epoch": 51.00282051282051, "grad_norm": 0.0009294459596276283, "learning_rate": 5.413105413105414e-06, "loss": 0.0001, "step": 12000 }, { "epoch": 51.003247863247864, "grad_norm": 0.014754371717572212, "learning_rate": 5.408357075023742e-06, "loss": 0.0001, "step": 12010 }, { "epoch": 51.003675213675216, "grad_norm": 0.027568388730287552, "learning_rate": 5.403608736942071e-06, "loss": 0.0002, "step": 12020 }, { "epoch": 51.00410256410257, "grad_norm": 0.07785722613334656, "learning_rate": 5.3988603988604e-06, "loss": 0.0001, "step": 12030 }, { "epoch": 51.00452991452991, "grad_norm": 0.0007142764516174793, "learning_rate": 5.394112060778728e-06, "loss": 0.0003, "step": 12040 }, { "epoch": 51.00495726495726, "grad_norm": 0.002452197717502713, "learning_rate": 5.389363722697056e-06, "loss": 0.0003, "step": 12050 }, { "epoch": 51.005384615384614, "grad_norm": 0.0013394391862675548, "learning_rate": 5.384615384615385e-06, "loss": 0.0001, "step": 12060 }, { "epoch": 51.005811965811965, "grad_norm": 0.0005540793645195663, "learning_rate": 5.379867046533713e-06, "loss": 0.0, "step": 12070 }, { "epoch": 51.006239316239316, "grad_norm": 0.004768889397382736, "learning_rate": 5.375118708452042e-06, "loss": 0.0002, "step": 12080 }, { "epoch": 51.00666666666667, "grad_norm": 0.0035481529776006937, "learning_rate": 5.370370370370371e-06, "loss": 0.004, "step": 12090 }, { "epoch": 51.00709401709402, "grad_norm": 0.0004956016782671213, "learning_rate": 5.3656220322886995e-06, "loss": 0.0001, "step": 12100 }, { "epoch": 51.00752136752137, "grad_norm": 0.0010804428020492196, "learning_rate": 5.360873694207029e-06, "loss": 0.0001, "step": 12110 }, { "epoch": 51.007948717948715, "grad_norm": 0.004500087816268206, "learning_rate": 5.356125356125357e-06, "loss": 0.031, "step": 12120 }, { "epoch": 51.008376068376066, "grad_norm": 0.0010846515651792288, "learning_rate": 5.351377018043685e-06, "loss": 1.3764, "step": 12130 }, { "epoch": 51.00880341880342, "grad_norm": 0.0024583388585597277, "learning_rate": 5.346628679962014e-06, "loss": 0.0, "step": 12140 }, { "epoch": 51.00923076923077, "grad_norm": 0.000765733711887151, "learning_rate": 5.341880341880342e-06, "loss": 0.0134, "step": 12150 }, { "epoch": 51.00965811965812, "grad_norm": 0.0004111463204026222, "learning_rate": 5.337132003798671e-06, "loss": 0.187, "step": 12160 }, { "epoch": 51.01, "eval_accuracy": 0.4, "eval_loss": 5.326884746551514, "eval_runtime": 33.7105, "eval_samples_per_second": 0.742, "eval_steps_per_second": 0.742, "step": 12168 }, { "epoch": 52.00008547008547, "grad_norm": 0.0006366129964590073, "learning_rate": 5.332383665716999e-06, "loss": 0.2729, "step": 12170 }, { "epoch": 52.00051282051282, "grad_norm": 0.00045478667016141117, "learning_rate": 5.327635327635328e-06, "loss": 0.0002, "step": 12180 }, { "epoch": 52.00094017094017, "grad_norm": 0.0012099344749003649, "learning_rate": 5.322886989553656e-06, "loss": 0.0017, "step": 12190 }, { "epoch": 52.00136752136752, "grad_norm": 0.010483755730092525, "learning_rate": 5.318138651471986e-06, "loss": 0.0001, "step": 12200 }, { "epoch": 52.00179487179487, "grad_norm": 0.001030059065669775, "learning_rate": 5.313390313390314e-06, "loss": 0.0001, "step": 12210 }, { "epoch": 52.00222222222222, "grad_norm": 0.005780795589089394, "learning_rate": 5.3086419753086425e-06, "loss": 0.0001, "step": 12220 }, { "epoch": 52.002649572649574, "grad_norm": 0.001182667212560773, "learning_rate": 5.303893637226971e-06, "loss": 0.7791, "step": 12230 }, { "epoch": 52.003076923076925, "grad_norm": 0.10405033081769943, "learning_rate": 5.2991452991453e-06, "loss": 0.172, "step": 12240 }, { "epoch": 52.00350427350428, "grad_norm": 0.0005188258364796638, "learning_rate": 5.294396961063628e-06, "loss": 0.0001, "step": 12250 }, { "epoch": 52.00393162393162, "grad_norm": 0.0017536180093884468, "learning_rate": 5.2896486229819565e-06, "loss": 0.0001, "step": 12260 }, { "epoch": 52.00435897435897, "grad_norm": 0.0009690941660664976, "learning_rate": 5.284900284900285e-06, "loss": 0.0001, "step": 12270 }, { "epoch": 52.00478632478632, "grad_norm": 0.0005580552387982607, "learning_rate": 5.280151946818613e-06, "loss": 0.0003, "step": 12280 }, { "epoch": 52.005213675213675, "grad_norm": 0.1803247481584549, "learning_rate": 5.275403608736942e-06, "loss": 0.0001, "step": 12290 }, { "epoch": 52.005641025641026, "grad_norm": 0.005001471843570471, "learning_rate": 5.270655270655271e-06, "loss": 0.0002, "step": 12300 }, { "epoch": 52.00606837606838, "grad_norm": 0.004487255588173866, "learning_rate": 5.2659069325736e-06, "loss": 0.5693, "step": 12310 }, { "epoch": 52.00649572649573, "grad_norm": 612.653564453125, "learning_rate": 5.261158594491929e-06, "loss": 0.7203, "step": 12320 }, { "epoch": 52.00692307692308, "grad_norm": 0.0006710452144034207, "learning_rate": 5.256410256410257e-06, "loss": 0.0004, "step": 12330 }, { "epoch": 52.007350427350424, "grad_norm": 0.0005012887413613498, "learning_rate": 5.2516619183285854e-06, "loss": 0.5105, "step": 12340 }, { "epoch": 52.007777777777775, "grad_norm": 0.002993144793435931, "learning_rate": 5.246913580246914e-06, "loss": 0.8932, "step": 12350 }, { "epoch": 52.00820512820513, "grad_norm": 0.2561093270778656, "learning_rate": 5.242165242165243e-06, "loss": 0.0001, "step": 12360 }, { "epoch": 52.00863247863248, "grad_norm": 0.003413307713344693, "learning_rate": 5.237416904083571e-06, "loss": 0.771, "step": 12370 }, { "epoch": 52.00905982905983, "grad_norm": 0.000699891708791256, "learning_rate": 5.2326685660018995e-06, "loss": 0.8725, "step": 12380 }, { "epoch": 52.00948717948718, "grad_norm": 0.0005549703491851687, "learning_rate": 5.227920227920228e-06, "loss": 0.0001, "step": 12390 }, { "epoch": 52.00991452991453, "grad_norm": 0.02157350443303585, "learning_rate": 5.223171889838556e-06, "loss": 0.0002, "step": 12400 }, { "epoch": 52.01, "eval_accuracy": 0.44, "eval_loss": 5.436356067657471, "eval_runtime": 33.5444, "eval_samples_per_second": 0.745, "eval_steps_per_second": 0.745, "step": 12402 }, { "epoch": 53.00034188034188, "grad_norm": 0.0005383410025388002, "learning_rate": 5.218423551756886e-06, "loss": 0.0001, "step": 12410 }, { "epoch": 53.00076923076923, "grad_norm": 0.0005392262828536332, "learning_rate": 5.213675213675214e-06, "loss": 0.0001, "step": 12420 }, { "epoch": 53.00119658119658, "grad_norm": 3.4017317295074463, "learning_rate": 5.208926875593543e-06, "loss": 0.8486, "step": 12430 }, { "epoch": 53.00162393162393, "grad_norm": 0.0005204902845434844, "learning_rate": 5.204178537511872e-06, "loss": 0.7513, "step": 12440 }, { "epoch": 53.00205128205128, "grad_norm": 0.05535678565502167, "learning_rate": 5.1994301994302e-06, "loss": 0.0001, "step": 12450 }, { "epoch": 53.002478632478635, "grad_norm": 0.002707291394472122, "learning_rate": 5.194681861348528e-06, "loss": 0.0001, "step": 12460 }, { "epoch": 53.002905982905986, "grad_norm": 0.000867334078066051, "learning_rate": 5.189933523266857e-06, "loss": 0.0008, "step": 12470 }, { "epoch": 53.00333333333333, "grad_norm": 0.0019842975307255983, "learning_rate": 5.185185185185185e-06, "loss": 0.0007, "step": 12480 }, { "epoch": 53.00376068376068, "grad_norm": 0.002031872980296612, "learning_rate": 5.180436847103514e-06, "loss": 0.0013, "step": 12490 }, { "epoch": 53.00418803418803, "grad_norm": 0.8920242786407471, "learning_rate": 5.175688509021842e-06, "loss": 0.0004, "step": 12500 }, { "epoch": 53.004615384615384, "grad_norm": 0.005432880017906427, "learning_rate": 5.1709401709401716e-06, "loss": 0.0011, "step": 12510 }, { "epoch": 53.005042735042736, "grad_norm": 0.0005356788169592619, "learning_rate": 5.166191832858501e-06, "loss": 0.1904, "step": 12520 }, { "epoch": 53.00547008547009, "grad_norm": 0.00788265373557806, "learning_rate": 5.161443494776829e-06, "loss": 0.0093, "step": 12530 }, { "epoch": 53.00589743589744, "grad_norm": 0.0006382990977726877, "learning_rate": 5.156695156695157e-06, "loss": 0.0003, "step": 12540 }, { "epoch": 53.00632478632479, "grad_norm": 0.0006224510143510997, "learning_rate": 5.151946818613486e-06, "loss": 0.012, "step": 12550 }, { "epoch": 53.006752136752134, "grad_norm": 0.00042305790702812374, "learning_rate": 5.147198480531814e-06, "loss": 0.0001, "step": 12560 }, { "epoch": 53.007179487179485, "grad_norm": 0.00042074365774169564, "learning_rate": 5.142450142450143e-06, "loss": 0.0009, "step": 12570 }, { "epoch": 53.007606837606836, "grad_norm": 0.00039612961700186133, "learning_rate": 5.137701804368471e-06, "loss": 0.0001, "step": 12580 }, { "epoch": 53.00803418803419, "grad_norm": 0.029377451166510582, "learning_rate": 5.1329534662868e-06, "loss": 0.0, "step": 12590 }, { "epoch": 53.00846153846154, "grad_norm": 0.0005169266369193792, "learning_rate": 5.128205128205128e-06, "loss": 0.0001, "step": 12600 }, { "epoch": 53.00888888888889, "grad_norm": 0.007242798339575529, "learning_rate": 5.123456790123458e-06, "loss": 0.0, "step": 12610 }, { "epoch": 53.00931623931624, "grad_norm": 0.01810172200202942, "learning_rate": 5.118708452041786e-06, "loss": 0.0055, "step": 12620 }, { "epoch": 53.00974358974359, "grad_norm": 0.00594985531643033, "learning_rate": 5.1139601139601145e-06, "loss": 0.0001, "step": 12630 }, { "epoch": 53.01, "eval_accuracy": 0.44, "eval_loss": 5.730712890625, "eval_runtime": 33.9799, "eval_samples_per_second": 0.736, "eval_steps_per_second": 0.736, "step": 12636 }, { "epoch": 54.00017094017094, "grad_norm": 0.0023435179609805346, "learning_rate": 5.109211775878443e-06, "loss": 0.0001, "step": 12640 }, { "epoch": 54.00059829059829, "grad_norm": 8.973128318786621, "learning_rate": 5.104463437796772e-06, "loss": 0.0012, "step": 12650 }, { "epoch": 54.00102564102564, "grad_norm": 0.008565773256123066, "learning_rate": 5.0997150997151e-06, "loss": 0.0003, "step": 12660 }, { "epoch": 54.00145299145299, "grad_norm": 0.0006859298446215689, "learning_rate": 5.0949667616334285e-06, "loss": 0.0, "step": 12670 }, { "epoch": 54.001880341880344, "grad_norm": 0.0004111983289476484, "learning_rate": 5.090218423551757e-06, "loss": 0.0001, "step": 12680 }, { "epoch": 54.002307692307696, "grad_norm": 0.0004719541466329247, "learning_rate": 5.085470085470086e-06, "loss": 0.5531, "step": 12690 }, { "epoch": 54.00273504273504, "grad_norm": 0.004437173251062632, "learning_rate": 5.080721747388414e-06, "loss": 0.006, "step": 12700 }, { "epoch": 54.00316239316239, "grad_norm": 0.0022185053676366806, "learning_rate": 5.075973409306743e-06, "loss": 0.0003, "step": 12710 }, { "epoch": 54.00358974358974, "grad_norm": 0.0506693460047245, "learning_rate": 5.071225071225072e-06, "loss": 0.0001, "step": 12720 }, { "epoch": 54.004017094017094, "grad_norm": 0.0014226617058739066, "learning_rate": 5.066476733143401e-06, "loss": 0.5765, "step": 12730 }, { "epoch": 54.004444444444445, "grad_norm": 0.0005160128348506987, "learning_rate": 5.061728395061729e-06, "loss": 0.0, "step": 12740 }, { "epoch": 54.0048717948718, "grad_norm": 0.0004296216939110309, "learning_rate": 5.0569800569800575e-06, "loss": 0.3456, "step": 12750 }, { "epoch": 54.00529914529915, "grad_norm": 0.0012724578846246004, "learning_rate": 5.052231718898386e-06, "loss": 0.072, "step": 12760 }, { "epoch": 54.0057264957265, "grad_norm": 0.0030480227433145046, "learning_rate": 5.047483380816715e-06, "loss": 0.0001, "step": 12770 }, { "epoch": 54.00615384615384, "grad_norm": 0.0005519132246263325, "learning_rate": 5.042735042735043e-06, "loss": 0.0, "step": 12780 }, { "epoch": 54.006581196581195, "grad_norm": 0.0007735707913525403, "learning_rate": 5.0379867046533715e-06, "loss": 0.4349, "step": 12790 }, { "epoch": 54.007008547008546, "grad_norm": 0.0024715519975870848, "learning_rate": 5.0332383665717e-06, "loss": 0.9521, "step": 12800 }, { "epoch": 54.0074358974359, "grad_norm": 0.005817775148898363, "learning_rate": 5.028490028490028e-06, "loss": 0.0001, "step": 12810 }, { "epoch": 54.00786324786325, "grad_norm": 0.0017352089053019881, "learning_rate": 5.023741690408358e-06, "loss": 0.0004, "step": 12820 }, { "epoch": 54.0082905982906, "grad_norm": 0.0010756135452538729, "learning_rate": 5.018993352326686e-06, "loss": 0.0001, "step": 12830 }, { "epoch": 54.00871794871795, "grad_norm": 0.003498775651678443, "learning_rate": 5.014245014245015e-06, "loss": 0.0003, "step": 12840 }, { "epoch": 54.0091452991453, "grad_norm": 0.010095912963151932, "learning_rate": 5.009496676163344e-06, "loss": 0.017, "step": 12850 }, { "epoch": 54.00957264957265, "grad_norm": 0.00034551063436083496, "learning_rate": 5.004748338081672e-06, "loss": 0.0026, "step": 12860 }, { "epoch": 54.01, "grad_norm": 0.0008016827632673085, "learning_rate": 5e-06, "loss": 0.0, "step": 12870 }, { "epoch": 54.01, "eval_accuracy": 0.44, "eval_loss": 5.978097915649414, "eval_runtime": 33.5999, "eval_samples_per_second": 0.744, "eval_steps_per_second": 0.744, "step": 12870 }, { "epoch": 55.00042735042735, "grad_norm": 0.0008040638058446348, "learning_rate": 4.995251661918329e-06, "loss": 0.0, "step": 12880 }, { "epoch": 55.0008547008547, "grad_norm": 0.0010009427787736058, "learning_rate": 4.990503323836657e-06, "loss": 0.0243, "step": 12890 }, { "epoch": 55.001282051282054, "grad_norm": 0.6322616934776306, "learning_rate": 4.985754985754986e-06, "loss": 0.0002, "step": 12900 }, { "epoch": 55.001709401709405, "grad_norm": 0.00035148794995620847, "learning_rate": 4.981006647673315e-06, "loss": 0.0001, "step": 12910 }, { "epoch": 55.00213675213675, "grad_norm": 0.0009499528096057475, "learning_rate": 4.976258309591644e-06, "loss": 0.158, "step": 12920 }, { "epoch": 55.0025641025641, "grad_norm": 1.7123631238937378, "learning_rate": 4.971509971509972e-06, "loss": 0.0004, "step": 12930 }, { "epoch": 55.00299145299145, "grad_norm": 0.00044161363621242344, "learning_rate": 4.9667616334283e-06, "loss": 0.0001, "step": 12940 }, { "epoch": 55.0034188034188, "grad_norm": 0.0013492783764377236, "learning_rate": 4.962013295346629e-06, "loss": 0.4795, "step": 12950 }, { "epoch": 55.003846153846155, "grad_norm": 0.0004561662208288908, "learning_rate": 4.957264957264958e-06, "loss": 0.0001, "step": 12960 }, { "epoch": 55.004273504273506, "grad_norm": 0.00044428068213164806, "learning_rate": 4.952516619183286e-06, "loss": 0.001, "step": 12970 }, { "epoch": 55.00470085470086, "grad_norm": 0.0006210142164491117, "learning_rate": 4.947768281101615e-06, "loss": 0.0001, "step": 12980 }, { "epoch": 55.00512820512821, "grad_norm": 0.0037781489081680775, "learning_rate": 4.943019943019943e-06, "loss": 0.0, "step": 12990 }, { "epoch": 55.00555555555555, "grad_norm": 0.0003753666242118925, "learning_rate": 4.938271604938272e-06, "loss": 0.0001, "step": 13000 }, { "epoch": 55.005982905982904, "grad_norm": 0.009278559125959873, "learning_rate": 4.933523266856601e-06, "loss": 0.0001, "step": 13010 }, { "epoch": 55.006410256410255, "grad_norm": 1.3523920774459839, "learning_rate": 4.928774928774929e-06, "loss": 0.0003, "step": 13020 }, { "epoch": 55.00683760683761, "grad_norm": 0.00043029204243794084, "learning_rate": 4.924026590693257e-06, "loss": 0.0001, "step": 13030 }, { "epoch": 55.00726495726496, "grad_norm": 0.00037467462243512273, "learning_rate": 4.9192782526115865e-06, "loss": 0.0001, "step": 13040 }, { "epoch": 55.00769230769231, "grad_norm": 0.006767070386558771, "learning_rate": 4.914529914529915e-06, "loss": 0.0001, "step": 13050 }, { "epoch": 55.00811965811966, "grad_norm": 0.00043661269592121243, "learning_rate": 4.909781576448244e-06, "loss": 0.0002, "step": 13060 }, { "epoch": 55.00854700854701, "grad_norm": 0.0015735122142359614, "learning_rate": 4.905033238366572e-06, "loss": 0.0002, "step": 13070 }, { "epoch": 55.008974358974356, "grad_norm": 0.009783231653273106, "learning_rate": 4.9002849002849006e-06, "loss": 0.0001, "step": 13080 }, { "epoch": 55.00940170940171, "grad_norm": 0.000962352380156517, "learning_rate": 4.895536562203229e-06, "loss": 0.9947, "step": 13090 }, { "epoch": 55.00982905982906, "grad_norm": 0.0005439156084321439, "learning_rate": 4.890788224121558e-06, "loss": 0.0001, "step": 13100 }, { "epoch": 55.01, "eval_accuracy": 0.44, "eval_loss": 4.822087287902832, "eval_runtime": 33.7892, "eval_samples_per_second": 0.74, "eval_steps_per_second": 0.74, "step": 13104 }, { "epoch": 56.00025641025641, "grad_norm": 0.010900448076426983, "learning_rate": 4.886039886039886e-06, "loss": 0.0001, "step": 13110 }, { "epoch": 56.00068376068376, "grad_norm": 0.0009302808903157711, "learning_rate": 4.8812915479582154e-06, "loss": 0.0001, "step": 13120 }, { "epoch": 56.00111111111111, "grad_norm": 0.0025970793794840574, "learning_rate": 4.876543209876544e-06, "loss": 0.582, "step": 13130 }, { "epoch": 56.00153846153846, "grad_norm": 0.003855284536257386, "learning_rate": 4.871794871794872e-06, "loss": 0.0001, "step": 13140 }, { "epoch": 56.00196581196581, "grad_norm": 0.16761374473571777, "learning_rate": 4.8670465337132e-06, "loss": 0.7305, "step": 13150 }, { "epoch": 56.00239316239316, "grad_norm": 0.01091548427939415, "learning_rate": 4.8622981956315295e-06, "loss": 0.0001, "step": 13160 }, { "epoch": 56.00282051282051, "grad_norm": 0.0012768160086125135, "learning_rate": 4.857549857549858e-06, "loss": 0.0184, "step": 13170 }, { "epoch": 56.003247863247864, "grad_norm": 0.0003495585115160793, "learning_rate": 4.852801519468187e-06, "loss": 0.0, "step": 13180 }, { "epoch": 56.003675213675216, "grad_norm": 0.00034958813921548426, "learning_rate": 4.848053181386515e-06, "loss": 0.19, "step": 13190 }, { "epoch": 56.00410256410257, "grad_norm": 0.00039932169602252543, "learning_rate": 4.8433048433048435e-06, "loss": 0.0, "step": 13200 }, { "epoch": 56.00452991452991, "grad_norm": 0.00140125781763345, "learning_rate": 4.838556505223173e-06, "loss": 0.4053, "step": 13210 }, { "epoch": 56.00495726495726, "grad_norm": 0.00046302462578751147, "learning_rate": 4.833808167141501e-06, "loss": 0.1168, "step": 13220 }, { "epoch": 56.005384615384614, "grad_norm": 0.07333427667617798, "learning_rate": 4.829059829059829e-06, "loss": 0.0, "step": 13230 }, { "epoch": 56.005811965811965, "grad_norm": 0.5369700789451599, "learning_rate": 4.824311490978158e-06, "loss": 0.0002, "step": 13240 }, { "epoch": 56.006239316239316, "grad_norm": 0.0011444406118243933, "learning_rate": 4.819563152896487e-06, "loss": 0.0001, "step": 13250 }, { "epoch": 56.00666666666667, "grad_norm": 0.005999968387186527, "learning_rate": 4.814814814814815e-06, "loss": 0.0003, "step": 13260 }, { "epoch": 56.00709401709402, "grad_norm": 0.006480031181126833, "learning_rate": 4.810066476733144e-06, "loss": 0.1278, "step": 13270 }, { "epoch": 56.00752136752137, "grad_norm": 0.001407441683113575, "learning_rate": 4.8053181386514724e-06, "loss": 0.0011, "step": 13280 }, { "epoch": 56.007948717948715, "grad_norm": 0.008889264427125454, "learning_rate": 4.800569800569801e-06, "loss": 0.0001, "step": 13290 }, { "epoch": 56.008376068376066, "grad_norm": 0.0005853257607668638, "learning_rate": 4.795821462488129e-06, "loss": 0.0242, "step": 13300 }, { "epoch": 56.00880341880342, "grad_norm": 0.0004666523600462824, "learning_rate": 4.791073124406458e-06, "loss": 0.0001, "step": 13310 }, { "epoch": 56.00923076923077, "grad_norm": 404.7363586425781, "learning_rate": 4.786324786324787e-06, "loss": 0.7525, "step": 13320 }, { "epoch": 56.00965811965812, "grad_norm": 0.0004823113849852234, "learning_rate": 4.781576448243116e-06, "loss": 0.0001, "step": 13330 }, { "epoch": 56.01, "eval_accuracy": 0.4, "eval_loss": 5.58083438873291, "eval_runtime": 33.7533, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.741, "step": 13338 }, { "epoch": 57.00008547008547, "grad_norm": 0.001120713190175593, "learning_rate": 4.776828110161444e-06, "loss": 0.0001, "step": 13340 }, { "epoch": 57.00051282051282, "grad_norm": 0.0011573611991479993, "learning_rate": 4.772079772079772e-06, "loss": 0.0, "step": 13350 }, { "epoch": 57.00094017094017, "grad_norm": 0.0003190473944414407, "learning_rate": 4.7673314339981005e-06, "loss": 0.0, "step": 13360 }, { "epoch": 57.00136752136752, "grad_norm": 0.29808613657951355, "learning_rate": 4.76258309591643e-06, "loss": 0.0003, "step": 13370 }, { "epoch": 57.00179487179487, "grad_norm": 0.0006945320055820048, "learning_rate": 4.757834757834758e-06, "loss": 0.0001, "step": 13380 }, { "epoch": 57.00222222222222, "grad_norm": 0.0003567099920473993, "learning_rate": 4.753086419753087e-06, "loss": 0.0001, "step": 13390 }, { "epoch": 57.002649572649574, "grad_norm": 0.012886996380984783, "learning_rate": 4.748338081671415e-06, "loss": 0.0, "step": 13400 }, { "epoch": 57.003076923076925, "grad_norm": 0.0008997747790999711, "learning_rate": 4.743589743589744e-06, "loss": 0.0001, "step": 13410 }, { "epoch": 57.00350427350428, "grad_norm": 0.008139757439494133, "learning_rate": 4.738841405508073e-06, "loss": 0.0, "step": 13420 }, { "epoch": 57.00393162393162, "grad_norm": 0.008020474575459957, "learning_rate": 4.734093067426401e-06, "loss": 0.0001, "step": 13430 }, { "epoch": 57.00435897435897, "grad_norm": 0.0004538062203209847, "learning_rate": 4.729344729344729e-06, "loss": 0.0001, "step": 13440 }, { "epoch": 57.00478632478632, "grad_norm": 0.0063064745627343655, "learning_rate": 4.7245963912630586e-06, "loss": 0.0004, "step": 13450 }, { "epoch": 57.005213675213675, "grad_norm": 0.001134122721850872, "learning_rate": 4.719848053181387e-06, "loss": 0.0, "step": 13460 }, { "epoch": 57.005641025641026, "grad_norm": 0.01796836219727993, "learning_rate": 4.715099715099716e-06, "loss": 0.0001, "step": 13470 }, { "epoch": 57.00606837606838, "grad_norm": 0.00409694854170084, "learning_rate": 4.710351377018044e-06, "loss": 0.0002, "step": 13480 }, { "epoch": 57.00649572649573, "grad_norm": 0.00029308826196938753, "learning_rate": 4.705603038936373e-06, "loss": 0.0033, "step": 13490 }, { "epoch": 57.00692307692308, "grad_norm": 0.0007244080770760775, "learning_rate": 4.700854700854701e-06, "loss": 0.0, "step": 13500 }, { "epoch": 57.007350427350424, "grad_norm": 0.012751772068440914, "learning_rate": 4.69610636277303e-06, "loss": 0.0001, "step": 13510 }, { "epoch": 57.007777777777775, "grad_norm": 0.0007130270823836327, "learning_rate": 4.691358024691358e-06, "loss": 0.0, "step": 13520 }, { "epoch": 57.00820512820513, "grad_norm": 0.0003145153750665486, "learning_rate": 4.6866096866096875e-06, "loss": 0.1948, "step": 13530 }, { "epoch": 57.00863247863248, "grad_norm": 0.00031738655525259674, "learning_rate": 4.681861348528016e-06, "loss": 0.0, "step": 13540 }, { "epoch": 57.00905982905983, "grad_norm": 0.0002995093527715653, "learning_rate": 4.677113010446344e-06, "loss": 0.0001, "step": 13550 }, { "epoch": 57.00948717948718, "grad_norm": 0.00032525527058169246, "learning_rate": 4.672364672364672e-06, "loss": 0.0, "step": 13560 }, { "epoch": 57.00991452991453, "grad_norm": 0.00266217440366745, "learning_rate": 4.6676163342830015e-06, "loss": 0.0, "step": 13570 }, { "epoch": 57.01, "eval_accuracy": 0.44, "eval_loss": 5.7661638259887695, "eval_runtime": 33.6667, "eval_samples_per_second": 0.743, "eval_steps_per_second": 0.743, "step": 13572 }, { "epoch": 58.00034188034188, "grad_norm": 956.22998046875, "learning_rate": 4.66286799620133e-06, "loss": 0.9196, "step": 13580 }, { "epoch": 58.00076923076923, "grad_norm": 0.0004830555117223412, "learning_rate": 4.658119658119659e-06, "loss": 0.0, "step": 13590 }, { "epoch": 58.00119658119658, "grad_norm": 0.00030270041315816343, "learning_rate": 4.653371320037987e-06, "loss": 0.0, "step": 13600 }, { "epoch": 58.00162393162393, "grad_norm": 0.0006561644258908927, "learning_rate": 4.6486229819563155e-06, "loss": 0.0, "step": 13610 }, { "epoch": 58.00205128205128, "grad_norm": 0.0006711311871185899, "learning_rate": 4.643874643874644e-06, "loss": 0.0, "step": 13620 }, { "epoch": 58.002478632478635, "grad_norm": 0.0025408940855413675, "learning_rate": 4.639126305792973e-06, "loss": 0.0, "step": 13630 }, { "epoch": 58.002905982905986, "grad_norm": 0.0009754471248015761, "learning_rate": 4.634377967711301e-06, "loss": 0.0, "step": 13640 }, { "epoch": 58.00333333333333, "grad_norm": 0.0024775350466370583, "learning_rate": 4.62962962962963e-06, "loss": 0.0, "step": 13650 }, { "epoch": 58.00376068376068, "grad_norm": 0.007733345031738281, "learning_rate": 4.624881291547959e-06, "loss": 0.0001, "step": 13660 }, { "epoch": 58.00418803418803, "grad_norm": 0.00038200884591788054, "learning_rate": 4.620132953466287e-06, "loss": 0.0, "step": 13670 }, { "epoch": 58.004615384615384, "grad_norm": 0.00032710927189327776, "learning_rate": 4.615384615384616e-06, "loss": 0.0, "step": 13680 }, { "epoch": 58.005042735042736, "grad_norm": 0.00046126110828481615, "learning_rate": 4.6106362773029444e-06, "loss": 0.0001, "step": 13690 }, { "epoch": 58.00547008547009, "grad_norm": 0.0009468531352467835, "learning_rate": 4.605887939221273e-06, "loss": 0.0001, "step": 13700 }, { "epoch": 58.00589743589744, "grad_norm": 0.0003751327458303422, "learning_rate": 4.601139601139601e-06, "loss": 0.0001, "step": 13710 }, { "epoch": 58.00632478632479, "grad_norm": 0.0024412404745817184, "learning_rate": 4.59639126305793e-06, "loss": 0.0, "step": 13720 }, { "epoch": 58.006752136752134, "grad_norm": 0.0011317358585074544, "learning_rate": 4.5916429249762585e-06, "loss": 0.0, "step": 13730 }, { "epoch": 58.007179487179485, "grad_norm": 0.00213524978607893, "learning_rate": 4.586894586894588e-06, "loss": 0.0001, "step": 13740 }, { "epoch": 58.007606837606836, "grad_norm": 0.00029954416095279157, "learning_rate": 4.582146248812916e-06, "loss": 0.0001, "step": 13750 }, { "epoch": 58.00803418803419, "grad_norm": 0.004025555215775967, "learning_rate": 4.577397910731244e-06, "loss": 0.0, "step": 13760 }, { "epoch": 58.00846153846154, "grad_norm": 0.0070886253379285336, "learning_rate": 4.5726495726495725e-06, "loss": 0.5328, "step": 13770 }, { "epoch": 58.00888888888889, "grad_norm": 0.0006350260809995234, "learning_rate": 4.567901234567902e-06, "loss": 0.0, "step": 13780 }, { "epoch": 58.00931623931624, "grad_norm": 89.9181900024414, "learning_rate": 4.563152896486231e-06, "loss": 0.0124, "step": 13790 }, { "epoch": 58.00974358974359, "grad_norm": 0.000428661733167246, "learning_rate": 4.558404558404559e-06, "loss": 0.0001, "step": 13800 }, { "epoch": 58.01, "eval_accuracy": 0.44, "eval_loss": 5.446341037750244, "eval_runtime": 33.6227, "eval_samples_per_second": 0.744, "eval_steps_per_second": 0.744, "step": 13806 }, { "epoch": 59.00017094017094, "grad_norm": 0.0007079663337208331, "learning_rate": 4.553656220322887e-06, "loss": 0.0001, "step": 13810 }, { "epoch": 59.00059829059829, "grad_norm": 0.0012396123493090272, "learning_rate": 4.548907882241216e-06, "loss": 1.3237, "step": 13820 }, { "epoch": 59.00102564102564, "grad_norm": 0.005205500405281782, "learning_rate": 4.544159544159544e-06, "loss": 0.4377, "step": 13830 }, { "epoch": 59.00145299145299, "grad_norm": 0.00035839254269376397, "learning_rate": 4.539411206077873e-06, "loss": 0.0083, "step": 13840 }, { "epoch": 59.001880341880344, "grad_norm": 0.0005130122299306095, "learning_rate": 4.5346628679962014e-06, "loss": 0.0, "step": 13850 }, { "epoch": 59.002307692307696, "grad_norm": 0.0014134275261312723, "learning_rate": 4.5299145299145306e-06, "loss": 0.0, "step": 13860 }, { "epoch": 59.00273504273504, "grad_norm": 0.0625193640589714, "learning_rate": 4.525166191832859e-06, "loss": 0.0001, "step": 13870 }, { "epoch": 59.00316239316239, "grad_norm": 0.0004801045579370111, "learning_rate": 4.520417853751187e-06, "loss": 0.0001, "step": 13880 }, { "epoch": 59.00358974358974, "grad_norm": 0.0007496966863982379, "learning_rate": 4.515669515669516e-06, "loss": 0.1716, "step": 13890 }, { "epoch": 59.004017094017094, "grad_norm": 0.0012893045786768198, "learning_rate": 4.510921177587845e-06, "loss": 0.0, "step": 13900 }, { "epoch": 59.004444444444445, "grad_norm": 0.0003310160245746374, "learning_rate": 4.506172839506173e-06, "loss": 0.0001, "step": 13910 }, { "epoch": 59.0048717948718, "grad_norm": 44.41610336303711, "learning_rate": 4.501424501424502e-06, "loss": 0.0076, "step": 13920 }, { "epoch": 59.00529914529915, "grad_norm": 0.0027762632817029953, "learning_rate": 4.49667616334283e-06, "loss": 0.0, "step": 13930 }, { "epoch": 59.0057264957265, "grad_norm": 0.0003408943011891097, "learning_rate": 4.4919278252611595e-06, "loss": 0.0, "step": 13940 }, { "epoch": 59.00615384615384, "grad_norm": 0.0014558567199856043, "learning_rate": 4.487179487179488e-06, "loss": 0.0, "step": 13950 }, { "epoch": 59.006581196581195, "grad_norm": 0.002356948796659708, "learning_rate": 4.482431149097816e-06, "loss": 0.0108, "step": 13960 }, { "epoch": 59.007008547008546, "grad_norm": 0.0007559444638900459, "learning_rate": 4.477682811016144e-06, "loss": 0.6313, "step": 13970 }, { "epoch": 59.0074358974359, "grad_norm": 0.004984420258551836, "learning_rate": 4.4729344729344735e-06, "loss": 0.1682, "step": 13980 }, { "epoch": 59.00786324786325, "grad_norm": 0.0007952677551656961, "learning_rate": 4.468186134852802e-06, "loss": 0.9454, "step": 13990 }, { "epoch": 59.0082905982906, "grad_norm": 0.02415173314511776, "learning_rate": 4.463437796771131e-06, "loss": 0.0001, "step": 14000 }, { "epoch": 59.00871794871795, "grad_norm": 0.0003063694457523525, "learning_rate": 4.458689458689459e-06, "loss": 0.0001, "step": 14010 }, { "epoch": 59.0091452991453, "grad_norm": 0.00032008966081775725, "learning_rate": 4.4539411206077876e-06, "loss": 0.4779, "step": 14020 }, { "epoch": 59.00957264957265, "grad_norm": 0.00046775556984357536, "learning_rate": 4.449192782526116e-06, "loss": 0.0001, "step": 14030 }, { "epoch": 59.01, "grad_norm": 0.06117213889956474, "learning_rate": 4.444444444444444e-06, "loss": 0.0021, "step": 14040 }, { "epoch": 59.01, "eval_accuracy": 0.44, "eval_loss": 5.957569599151611, "eval_runtime": 36.0184, "eval_samples_per_second": 0.694, "eval_steps_per_second": 0.694, "step": 14040 }, { "epoch": 60.00042735042735, "grad_norm": 0.0008756799506954849, "learning_rate": 4.439696106362773e-06, "loss": 0.6096, "step": 14050 }, { "epoch": 60.0008547008547, "grad_norm": 945.39501953125, "learning_rate": 4.4349477682811024e-06, "loss": 0.5127, "step": 14060 }, { "epoch": 60.001282051282054, "grad_norm": 0.0003207973495591432, "learning_rate": 4.430199430199431e-06, "loss": 0.0, "step": 14070 }, { "epoch": 60.001709401709405, "grad_norm": 0.005959488917142153, "learning_rate": 4.425451092117759e-06, "loss": 0.0001, "step": 14080 }, { "epoch": 60.00213675213675, "grad_norm": 0.0004705670871771872, "learning_rate": 4.420702754036087e-06, "loss": 0.0, "step": 14090 }, { "epoch": 60.0025641025641, "grad_norm": 0.0004403094353619963, "learning_rate": 4.4159544159544165e-06, "loss": 0.0, "step": 14100 }, { "epoch": 60.00299145299145, "grad_norm": 707.6570434570312, "learning_rate": 4.411206077872745e-06, "loss": 0.7011, "step": 14110 }, { "epoch": 60.0034188034188, "grad_norm": 0.00034931182744912803, "learning_rate": 4.406457739791074e-06, "loss": 0.0011, "step": 14120 }, { "epoch": 60.003846153846155, "grad_norm": 0.00047842093044891953, "learning_rate": 4.401709401709402e-06, "loss": 0.7669, "step": 14130 }, { "epoch": 60.004273504273506, "grad_norm": 0.0005029200110584497, "learning_rate": 4.3969610636277305e-06, "loss": 0.6763, "step": 14140 }, { "epoch": 60.00470085470086, "grad_norm": 0.0007298401324078441, "learning_rate": 4.39221272554606e-06, "loss": 0.0001, "step": 14150 }, { "epoch": 60.00512820512821, "grad_norm": 0.08432787656784058, "learning_rate": 4.387464387464388e-06, "loss": 1.019, "step": 14160 }, { "epoch": 60.00555555555555, "grad_norm": 0.011608411557972431, "learning_rate": 4.382716049382716e-06, "loss": 0.0, "step": 14170 }, { "epoch": 60.005982905982904, "grad_norm": 0.0004716445109806955, "learning_rate": 4.3779677113010445e-06, "loss": 1.0345, "step": 14180 }, { "epoch": 60.006410256410255, "grad_norm": 0.0010066544637084007, "learning_rate": 4.373219373219374e-06, "loss": 0.8799, "step": 14190 }, { "epoch": 60.00683760683761, "grad_norm": 0.01125133316963911, "learning_rate": 4.368471035137703e-06, "loss": 0.0012, "step": 14200 }, { "epoch": 60.00726495726496, "grad_norm": 0.0006305111455731094, "learning_rate": 4.363722697056031e-06, "loss": 0.7549, "step": 14210 }, { "epoch": 60.00769230769231, "grad_norm": 0.003195669502019882, "learning_rate": 4.358974358974359e-06, "loss": 0.0, "step": 14220 }, { "epoch": 60.00811965811966, "grad_norm": 0.20871169865131378, "learning_rate": 4.354226020892688e-06, "loss": 0.0753, "step": 14230 }, { "epoch": 60.00854700854701, "grad_norm": 0.0661257803440094, "learning_rate": 4.349477682811016e-06, "loss": 0.0001, "step": 14240 }, { "epoch": 60.008974358974356, "grad_norm": 0.0009889448992908, "learning_rate": 4.344729344729345e-06, "loss": 0.0002, "step": 14250 }, { "epoch": 60.00940170940171, "grad_norm": 0.009586402215063572, "learning_rate": 4.3399810066476735e-06, "loss": 0.0004, "step": 14260 }, { "epoch": 60.00982905982906, "grad_norm": 558.0621337890625, "learning_rate": 4.335232668566003e-06, "loss": 0.5042, "step": 14270 }, { "epoch": 60.01, "eval_accuracy": 0.4, "eval_loss": 5.9418511390686035, "eval_runtime": 37.744, "eval_samples_per_second": 0.662, "eval_steps_per_second": 0.662, "step": 14274 }, { "epoch": 61.00025641025641, "grad_norm": 0.00028759066481143236, "learning_rate": 4.330484330484331e-06, "loss": 0.9175, "step": 14280 }, { "epoch": 61.00068376068376, "grad_norm": 0.007706713397055864, "learning_rate": 4.325735992402659e-06, "loss": 0.0001, "step": 14290 }, { "epoch": 61.00111111111111, "grad_norm": 0.0005172354285605252, "learning_rate": 4.3209876543209875e-06, "loss": 0.0, "step": 14300 }, { "epoch": 61.00153846153846, "grad_norm": 0.0005246769869700074, "learning_rate": 4.316239316239317e-06, "loss": 0.6762, "step": 14310 }, { "epoch": 61.00196581196581, "grad_norm": 0.0011663725599646568, "learning_rate": 4.311490978157645e-06, "loss": 0.0, "step": 14320 }, { "epoch": 61.00239316239316, "grad_norm": 0.005837870761752129, "learning_rate": 4.306742640075974e-06, "loss": 0.0907, "step": 14330 }, { "epoch": 61.00282051282051, "grad_norm": 0.00042240836773999035, "learning_rate": 4.301994301994302e-06, "loss": 0.0, "step": 14340 }, { "epoch": 61.003247863247864, "grad_norm": 0.0007428540848195553, "learning_rate": 4.297245963912631e-06, "loss": 0.0001, "step": 14350 }, { "epoch": 61.003675213675216, "grad_norm": 0.0005527780158445239, "learning_rate": 4.29249762583096e-06, "loss": 0.0, "step": 14360 }, { "epoch": 61.00410256410257, "grad_norm": 0.001011466607451439, "learning_rate": 4.287749287749288e-06, "loss": 0.0001, "step": 14370 }, { "epoch": 61.00452991452991, "grad_norm": 17.799911499023438, "learning_rate": 4.283000949667616e-06, "loss": 0.0566, "step": 14380 }, { "epoch": 61.00495726495726, "grad_norm": 20.882823944091797, "learning_rate": 4.2782526115859455e-06, "loss": 0.0043, "step": 14390 }, { "epoch": 61.005384615384614, "grad_norm": 0.00030632095877081156, "learning_rate": 4.273504273504274e-06, "loss": 0.0004, "step": 14400 }, { "epoch": 61.005811965811965, "grad_norm": 0.000464937009382993, "learning_rate": 4.268755935422603e-06, "loss": 0.0001, "step": 14410 }, { "epoch": 61.006239316239316, "grad_norm": 0.0014673583209514618, "learning_rate": 4.264007597340931e-06, "loss": 0.0001, "step": 14420 }, { "epoch": 61.00666666666667, "grad_norm": 0.0011050072498619556, "learning_rate": 4.2592592592592596e-06, "loss": 0.0, "step": 14430 }, { "epoch": 61.00709401709402, "grad_norm": 408.4623107910156, "learning_rate": 4.254510921177588e-06, "loss": 0.5193, "step": 14440 }, { "epoch": 61.00752136752137, "grad_norm": 0.0010529108112677932, "learning_rate": 4.249762583095917e-06, "loss": 0.0011, "step": 14450 }, { "epoch": 61.007948717948715, "grad_norm": 0.0004740229342132807, "learning_rate": 4.245014245014245e-06, "loss": 0.0001, "step": 14460 }, { "epoch": 61.008376068376066, "grad_norm": 0.0004031808057334274, "learning_rate": 4.2402659069325745e-06, "loss": 0.0, "step": 14470 }, { "epoch": 61.00880341880342, "grad_norm": 0.00028951760032214224, "learning_rate": 4.235517568850903e-06, "loss": 0.0, "step": 14480 }, { "epoch": 61.00923076923077, "grad_norm": 0.00033271079882979393, "learning_rate": 4.230769230769231e-06, "loss": 0.0001, "step": 14490 }, { "epoch": 61.00965811965812, "grad_norm": 41.438385009765625, "learning_rate": 4.226020892687559e-06, "loss": 0.0053, "step": 14500 }, { "epoch": 61.01, "eval_accuracy": 0.48, "eval_loss": 5.297748565673828, "eval_runtime": 37.9181, "eval_samples_per_second": 0.659, "eval_steps_per_second": 0.659, "step": 14508 }, { "epoch": 62.00008547008547, "grad_norm": 0.0023157300893217325, "learning_rate": 4.221272554605888e-06, "loss": 0.0, "step": 14510 }, { "epoch": 62.00051282051282, "grad_norm": 0.000285146088572219, "learning_rate": 4.216524216524217e-06, "loss": 0.0, "step": 14520 }, { "epoch": 62.00094017094017, "grad_norm": 0.00043018904398195446, "learning_rate": 4.211775878442546e-06, "loss": 0.0, "step": 14530 }, { "epoch": 62.00136752136752, "grad_norm": 0.0006764789577573538, "learning_rate": 4.207027540360874e-06, "loss": 0.2522, "step": 14540 }, { "epoch": 62.00179487179487, "grad_norm": 0.0003662327362690121, "learning_rate": 4.2022792022792025e-06, "loss": 0.0, "step": 14550 }, { "epoch": 62.00222222222222, "grad_norm": 0.0002613349643070251, "learning_rate": 4.197530864197531e-06, "loss": 0.0001, "step": 14560 }, { "epoch": 62.002649572649574, "grad_norm": 0.0002579360152594745, "learning_rate": 4.19278252611586e-06, "loss": 0.0001, "step": 14570 }, { "epoch": 62.003076923076925, "grad_norm": 0.002828446449711919, "learning_rate": 4.188034188034188e-06, "loss": 0.0001, "step": 14580 }, { "epoch": 62.00350427350428, "grad_norm": 0.02188822254538536, "learning_rate": 4.1832858499525166e-06, "loss": 0.0, "step": 14590 }, { "epoch": 62.00393162393162, "grad_norm": 0.0006723613478243351, "learning_rate": 4.178537511870846e-06, "loss": 0.0, "step": 14600 }, { "epoch": 62.00435897435897, "grad_norm": 0.0004710882785730064, "learning_rate": 4.173789173789174e-06, "loss": 0.0, "step": 14610 }, { "epoch": 62.00478632478632, "grad_norm": 0.0002582537126727402, "learning_rate": 4.169040835707503e-06, "loss": 0.0, "step": 14620 }, { "epoch": 62.005213675213675, "grad_norm": 0.0019376453710719943, "learning_rate": 4.1642924976258314e-06, "loss": 0.0001, "step": 14630 }, { "epoch": 62.005641025641026, "grad_norm": 0.00032024920801632106, "learning_rate": 4.15954415954416e-06, "loss": 0.0, "step": 14640 }, { "epoch": 62.00606837606838, "grad_norm": 0.00024245944223366678, "learning_rate": 4.154795821462488e-06, "loss": 0.0, "step": 14650 }, { "epoch": 62.00649572649573, "grad_norm": 0.0006904223700985312, "learning_rate": 4.150047483380817e-06, "loss": 0.7856, "step": 14660 }, { "epoch": 62.00692307692308, "grad_norm": 0.00029405683744698763, "learning_rate": 4.145299145299146e-06, "loss": 0.0632, "step": 14670 }, { "epoch": 62.007350427350424, "grad_norm": 0.0007452707504853606, "learning_rate": 4.140550807217475e-06, "loss": 0.0, "step": 14680 }, { "epoch": 62.007777777777775, "grad_norm": 559.78662109375, "learning_rate": 4.135802469135803e-06, "loss": 0.3101, "step": 14690 }, { "epoch": 62.00820512820513, "grad_norm": 0.02688130922615528, "learning_rate": 4.131054131054131e-06, "loss": 0.0001, "step": 14700 }, { "epoch": 62.00863247863248, "grad_norm": 2.4815587997436523, "learning_rate": 4.1263057929724595e-06, "loss": 0.0005, "step": 14710 }, { "epoch": 62.00905982905983, "grad_norm": 0.004581361077725887, "learning_rate": 4.121557454890789e-06, "loss": 0.0031, "step": 14720 }, { "epoch": 62.00948717948718, "grad_norm": 0.017405280843377113, "learning_rate": 4.116809116809117e-06, "loss": 0.308, "step": 14730 }, { "epoch": 62.00991452991453, "grad_norm": 0.001918576774187386, "learning_rate": 4.112060778727446e-06, "loss": 0.0, "step": 14740 }, { "epoch": 62.01, "eval_accuracy": 0.4, "eval_loss": 5.854069232940674, "eval_runtime": 32.1126, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 14742 }, { "epoch": 63.00034188034188, "grad_norm": 0.004541910719126463, "learning_rate": 4.107312440645774e-06, "loss": 0.0, "step": 14750 }, { "epoch": 63.00076923076923, "grad_norm": 0.0010953666642308235, "learning_rate": 4.102564102564103e-06, "loss": 0.0001, "step": 14760 }, { "epoch": 63.00119658119658, "grad_norm": 0.001070062629878521, "learning_rate": 4.097815764482431e-06, "loss": 0.0001, "step": 14770 }, { "epoch": 63.00162393162393, "grad_norm": 0.001508062588982284, "learning_rate": 4.09306742640076e-06, "loss": 0.0001, "step": 14780 }, { "epoch": 63.00205128205128, "grad_norm": 0.0003479434235487133, "learning_rate": 4.088319088319088e-06, "loss": 0.0, "step": 14790 }, { "epoch": 63.002478632478635, "grad_norm": 0.0033716338220983744, "learning_rate": 4.0835707502374176e-06, "loss": 0.0001, "step": 14800 }, { "epoch": 63.002905982905986, "grad_norm": 0.0008653479162603617, "learning_rate": 4.078822412155746e-06, "loss": 0.0173, "step": 14810 }, { "epoch": 63.00333333333333, "grad_norm": 0.00030156198772601783, "learning_rate": 4.074074074074074e-06, "loss": 0.0, "step": 14820 }, { "epoch": 63.00376068376068, "grad_norm": 0.0004595111822709441, "learning_rate": 4.069325735992403e-06, "loss": 0.0, "step": 14830 }, { "epoch": 63.00418803418803, "grad_norm": 0.0010495609603822231, "learning_rate": 4.064577397910732e-06, "loss": 0.0, "step": 14840 }, { "epoch": 63.004615384615384, "grad_norm": 0.004493600223213434, "learning_rate": 4.05982905982906e-06, "loss": 0.6305, "step": 14850 }, { "epoch": 63.005042735042736, "grad_norm": 0.0505562499165535, "learning_rate": 4.055080721747389e-06, "loss": 0.0016, "step": 14860 }, { "epoch": 63.00547008547009, "grad_norm": 0.002653477480635047, "learning_rate": 4.050332383665717e-06, "loss": 0.0001, "step": 14870 }, { "epoch": 63.00589743589744, "grad_norm": 0.0020723838824778795, "learning_rate": 4.0455840455840465e-06, "loss": 0.0001, "step": 14880 }, { "epoch": 63.00632478632479, "grad_norm": 0.0002628070651553571, "learning_rate": 4.040835707502375e-06, "loss": 0.0004, "step": 14890 }, { "epoch": 63.006752136752134, "grad_norm": 0.0003271848545409739, "learning_rate": 4.036087369420703e-06, "loss": 0.0001, "step": 14900 }, { "epoch": 63.007179487179485, "grad_norm": 0.0002233712439192459, "learning_rate": 4.031339031339031e-06, "loss": 0.0013, "step": 14910 }, { "epoch": 63.007606837606836, "grad_norm": 0.000736577610950917, "learning_rate": 4.02659069325736e-06, "loss": 0.0, "step": 14920 }, { "epoch": 63.00803418803419, "grad_norm": 0.0002595110272523016, "learning_rate": 4.021842355175689e-06, "loss": 0.0, "step": 14930 }, { "epoch": 63.00846153846154, "grad_norm": 0.00025847507640719414, "learning_rate": 4.017094017094018e-06, "loss": 0.0, "step": 14940 }, { "epoch": 63.00888888888889, "grad_norm": 0.0003213146119378507, "learning_rate": 4.012345679012346e-06, "loss": 0.0, "step": 14950 }, { "epoch": 63.00931623931624, "grad_norm": 111.42301177978516, "learning_rate": 4.0075973409306745e-06, "loss": 0.0102, "step": 14960 }, { "epoch": 63.00974358974359, "grad_norm": 705.5123291015625, "learning_rate": 4.002849002849003e-06, "loss": 0.1555, "step": 14970 }, { "epoch": 63.01, "eval_accuracy": 0.4, "eval_loss": 6.5367207527160645, "eval_runtime": 31.9232, "eval_samples_per_second": 0.783, "eval_steps_per_second": 0.783, "step": 14976 }, { "epoch": 64.00017094017095, "grad_norm": 0.42229506373405457, "learning_rate": 3.998100664767331e-06, "loss": 0.1863, "step": 14980 }, { "epoch": 64.00059829059829, "grad_norm": 0.00032686052145436406, "learning_rate": 3.99335232668566e-06, "loss": 0.0, "step": 14990 }, { "epoch": 64.00102564102563, "grad_norm": 0.0032908704597502947, "learning_rate": 3.9886039886039894e-06, "loss": 0.0002, "step": 15000 }, { "epoch": 64.001452991453, "grad_norm": 3.955153703689575, "learning_rate": 3.983855650522318e-06, "loss": 0.0007, "step": 15010 }, { "epoch": 64.00188034188034, "grad_norm": 0.0003446421178523451, "learning_rate": 3.979107312440646e-06, "loss": 1.0107, "step": 15020 }, { "epoch": 64.0023076923077, "grad_norm": 0.002301581669598818, "learning_rate": 3.974358974358974e-06, "loss": 0.0, "step": 15030 }, { "epoch": 64.00273504273504, "grad_norm": 0.0007779121515341103, "learning_rate": 3.9696106362773035e-06, "loss": 0.7845, "step": 15040 }, { "epoch": 64.0031623931624, "grad_norm": 0.11843976378440857, "learning_rate": 3.964862298195632e-06, "loss": 0.1222, "step": 15050 }, { "epoch": 64.00358974358974, "grad_norm": 0.020959218963980675, "learning_rate": 3.96011396011396e-06, "loss": 0.0, "step": 15060 }, { "epoch": 64.0040170940171, "grad_norm": 506.3919982910156, "learning_rate": 3.955365622032289e-06, "loss": 0.0546, "step": 15070 }, { "epoch": 64.00444444444445, "grad_norm": 0.0002198971196776256, "learning_rate": 3.9506172839506175e-06, "loss": 0.0, "step": 15080 }, { "epoch": 64.00487179487179, "grad_norm": 0.008080464787781239, "learning_rate": 3.945868945868947e-06, "loss": 0.0001, "step": 15090 }, { "epoch": 64.00529914529915, "grad_norm": 0.002219392219558358, "learning_rate": 3.941120607787275e-06, "loss": 0.0007, "step": 15100 }, { "epoch": 64.00572649572649, "grad_norm": 0.0003887212951667607, "learning_rate": 3.936372269705603e-06, "loss": 0.0016, "step": 15110 }, { "epoch": 64.00615384615385, "grad_norm": 0.001071627251803875, "learning_rate": 3.9316239316239315e-06, "loss": 0.0, "step": 15120 }, { "epoch": 64.0065811965812, "grad_norm": 0.00028002672479487956, "learning_rate": 3.926875593542261e-06, "loss": 0.0, "step": 15130 }, { "epoch": 64.00700854700855, "grad_norm": 0.00031417288118973374, "learning_rate": 3.922127255460589e-06, "loss": 0.0, "step": 15140 }, { "epoch": 64.0074358974359, "grad_norm": 0.0010667052119970322, "learning_rate": 3.917378917378918e-06, "loss": 0.0273, "step": 15150 }, { "epoch": 64.00786324786324, "grad_norm": 0.0006762260454706848, "learning_rate": 3.912630579297246e-06, "loss": 0.051, "step": 15160 }, { "epoch": 64.0082905982906, "grad_norm": 0.006420073565095663, "learning_rate": 3.907882241215575e-06, "loss": 0.0, "step": 15170 }, { "epoch": 64.00871794871794, "grad_norm": 0.0005878534284420311, "learning_rate": 3.903133903133903e-06, "loss": 0.0, "step": 15180 }, { "epoch": 64.0091452991453, "grad_norm": 0.0019342320738360286, "learning_rate": 3.898385565052232e-06, "loss": 0.0, "step": 15190 }, { "epoch": 64.00957264957265, "grad_norm": 0.00036040143459104, "learning_rate": 3.8936372269705604e-06, "loss": 0.0002, "step": 15200 }, { "epoch": 64.01, "grad_norm": 0.0003253432805649936, "learning_rate": 3.88888888888889e-06, "loss": 0.0081, "step": 15210 }, { "epoch": 64.01, "eval_accuracy": 0.4, "eval_loss": 5.480795383453369, "eval_runtime": 31.9103, "eval_samples_per_second": 0.783, "eval_steps_per_second": 0.783, "step": 15210 }, { "epoch": 65.00042735042734, "grad_norm": 0.0006786137819290161, "learning_rate": 3.884140550807218e-06, "loss": 0.0001, "step": 15220 }, { "epoch": 65.0008547008547, "grad_norm": 0.00022978027118369937, "learning_rate": 3.879392212725546e-06, "loss": 0.0, "step": 15230 }, { "epoch": 65.00128205128205, "grad_norm": 0.0007715200772508979, "learning_rate": 3.8746438746438745e-06, "loss": 0.003, "step": 15240 }, { "epoch": 65.0017094017094, "grad_norm": 0.02402709610760212, "learning_rate": 3.869895536562204e-06, "loss": 0.0, "step": 15250 }, { "epoch": 65.00213675213675, "grad_norm": 0.0015532065881416202, "learning_rate": 3.865147198480532e-06, "loss": 0.0197, "step": 15260 }, { "epoch": 65.00256410256411, "grad_norm": 85.6712417602539, "learning_rate": 3.860398860398861e-06, "loss": 1.1, "step": 15270 }, { "epoch": 65.00299145299145, "grad_norm": 0.0003184400557074696, "learning_rate": 3.855650522317189e-06, "loss": 0.0012, "step": 15280 }, { "epoch": 65.00341880341881, "grad_norm": 0.0002915811201091856, "learning_rate": 3.850902184235518e-06, "loss": 0.0002, "step": 15290 }, { "epoch": 65.00384615384615, "grad_norm": 0.001770331640727818, "learning_rate": 3.846153846153847e-06, "loss": 0.0023, "step": 15300 }, { "epoch": 65.0042735042735, "grad_norm": 0.0037825191393494606, "learning_rate": 3.841405508072175e-06, "loss": 0.0, "step": 15310 }, { "epoch": 65.00470085470086, "grad_norm": 0.00039001365075819194, "learning_rate": 3.836657169990503e-06, "loss": 0.0, "step": 15320 }, { "epoch": 65.0051282051282, "grad_norm": 0.029118506237864494, "learning_rate": 3.8319088319088325e-06, "loss": 0.0001, "step": 15330 }, { "epoch": 65.00555555555556, "grad_norm": 0.004345818888396025, "learning_rate": 3.827160493827161e-06, "loss": 0.0001, "step": 15340 }, { "epoch": 65.0059829059829, "grad_norm": 0.0013317528646439314, "learning_rate": 3.82241215574549e-06, "loss": 0.0, "step": 15350 }, { "epoch": 65.00641025641026, "grad_norm": 0.00033316414919681847, "learning_rate": 3.817663817663818e-06, "loss": 0.0, "step": 15360 }, { "epoch": 65.0068376068376, "grad_norm": 0.006611653603613377, "learning_rate": 3.8129154795821466e-06, "loss": 0.0001, "step": 15370 }, { "epoch": 65.00726495726495, "grad_norm": 0.0014399340143427253, "learning_rate": 3.808167141500475e-06, "loss": 0.0024, "step": 15380 }, { "epoch": 65.00769230769231, "grad_norm": 0.000310930801788345, "learning_rate": 3.8034188034188036e-06, "loss": 0.0001, "step": 15390 }, { "epoch": 65.00811965811965, "grad_norm": 0.0017023945692926645, "learning_rate": 3.7986704653371327e-06, "loss": 0.0, "step": 15400 }, { "epoch": 65.00854700854701, "grad_norm": 0.01913696527481079, "learning_rate": 3.793922127255461e-06, "loss": 0.0, "step": 15410 }, { "epoch": 65.00897435897436, "grad_norm": 0.00039296565228141844, "learning_rate": 3.7891737891737893e-06, "loss": 0.0, "step": 15420 }, { "epoch": 65.00940170940171, "grad_norm": 0.00039896511589176953, "learning_rate": 3.784425451092118e-06, "loss": 1.0074, "step": 15430 }, { "epoch": 65.00982905982906, "grad_norm": 0.0005804836982861161, "learning_rate": 3.7796771130104463e-06, "loss": 0.0008, "step": 15440 }, { "epoch": 65.01, "eval_accuracy": 0.4, "eval_loss": 5.881788730621338, "eval_runtime": 35.8586, "eval_samples_per_second": 0.697, "eval_steps_per_second": 0.697, "step": 15444 }, { "epoch": 66.00025641025641, "grad_norm": 0.0012229053536430001, "learning_rate": 3.774928774928775e-06, "loss": 0.0001, "step": 15450 }, { "epoch": 66.00068376068376, "grad_norm": 0.0003784662112593651, "learning_rate": 3.770180436847104e-06, "loss": 0.0001, "step": 15460 }, { "epoch": 66.00111111111111, "grad_norm": 0.0005255657597444952, "learning_rate": 3.7654320987654325e-06, "loss": 0.0001, "step": 15470 }, { "epoch": 66.00153846153846, "grad_norm": 0.00034700758988037705, "learning_rate": 3.760683760683761e-06, "loss": 0.0005, "step": 15480 }, { "epoch": 66.00196581196582, "grad_norm": 0.00021708589338231832, "learning_rate": 3.7559354226020895e-06, "loss": 0.0, "step": 15490 }, { "epoch": 66.00239316239316, "grad_norm": 0.0038388799875974655, "learning_rate": 3.751187084520418e-06, "loss": 0.0, "step": 15500 }, { "epoch": 66.0028205128205, "grad_norm": 0.0007136081694625318, "learning_rate": 3.746438746438747e-06, "loss": 0.0001, "step": 15510 }, { "epoch": 66.00324786324786, "grad_norm": 0.0002888034505303949, "learning_rate": 3.7416904083570752e-06, "loss": 0.0, "step": 15520 }, { "epoch": 66.00367521367521, "grad_norm": 720.2564697265625, "learning_rate": 3.736942070275404e-06, "loss": 0.2916, "step": 15530 }, { "epoch": 66.00410256410257, "grad_norm": 0.00027234895969741046, "learning_rate": 3.7321937321937323e-06, "loss": 0.0003, "step": 15540 }, { "epoch": 66.00452991452991, "grad_norm": 0.0012739448575302958, "learning_rate": 3.727445394112061e-06, "loss": 0.0, "step": 15550 }, { "epoch": 66.00495726495727, "grad_norm": 0.0005203865002840757, "learning_rate": 3.7226970560303897e-06, "loss": 0.0, "step": 15560 }, { "epoch": 66.00538461538461, "grad_norm": 0.0003277095383964479, "learning_rate": 3.7179487179487184e-06, "loss": 0.0, "step": 15570 }, { "epoch": 66.00581196581197, "grad_norm": 0.00044921765220351517, "learning_rate": 3.7132003798670467e-06, "loss": 0.0, "step": 15580 }, { "epoch": 66.00623931623932, "grad_norm": 0.0003073754196520895, "learning_rate": 3.7084520417853754e-06, "loss": 0.0, "step": 15590 }, { "epoch": 66.00666666666666, "grad_norm": 0.00030189528479240835, "learning_rate": 3.7037037037037037e-06, "loss": 0.0001, "step": 15600 }, { "epoch": 66.00709401709402, "grad_norm": 0.0005707856616936624, "learning_rate": 3.698955365622033e-06, "loss": 0.0001, "step": 15610 }, { "epoch": 66.00752136752136, "grad_norm": 0.00040740257827565074, "learning_rate": 3.694207027540361e-06, "loss": 0.0, "step": 15620 }, { "epoch": 66.00794871794872, "grad_norm": 0.00039323364035226405, "learning_rate": 3.68945868945869e-06, "loss": 0.0, "step": 15630 }, { "epoch": 66.00837606837607, "grad_norm": 0.0014208744978532195, "learning_rate": 3.684710351377018e-06, "loss": 0.0, "step": 15640 }, { "epoch": 66.00880341880342, "grad_norm": 0.0007801980245858431, "learning_rate": 3.679962013295347e-06, "loss": 0.0001, "step": 15650 }, { "epoch": 66.00923076923077, "grad_norm": 0.0017435887129977345, "learning_rate": 3.6752136752136756e-06, "loss": 0.0001, "step": 15660 }, { "epoch": 66.00965811965811, "grad_norm": 0.03651350364089012, "learning_rate": 3.6704653371320044e-06, "loss": 0.0, "step": 15670 }, { "epoch": 66.01, "eval_accuracy": 0.4, "eval_loss": 6.4378485679626465, "eval_runtime": 33.2393, "eval_samples_per_second": 0.752, "eval_steps_per_second": 0.752, "step": 15678 }, { "epoch": 67.00008547008547, "grad_norm": 0.00035544627462513745, "learning_rate": 3.6657169990503327e-06, "loss": 0.0001, "step": 15680 }, { "epoch": 67.00051282051282, "grad_norm": 0.00033005906152538955, "learning_rate": 3.6609686609686614e-06, "loss": 0.0, "step": 15690 }, { "epoch": 67.00094017094017, "grad_norm": 0.0003103645867668092, "learning_rate": 3.6562203228869897e-06, "loss": 0.0, "step": 15700 }, { "epoch": 67.00136752136753, "grad_norm": 0.0008121723076328635, "learning_rate": 3.651471984805318e-06, "loss": 0.0, "step": 15710 }, { "epoch": 67.00179487179487, "grad_norm": 0.0004981961101293564, "learning_rate": 3.646723646723647e-06, "loss": 0.0, "step": 15720 }, { "epoch": 67.00222222222222, "grad_norm": 0.0003290316089987755, "learning_rate": 3.641975308641976e-06, "loss": 0.0, "step": 15730 }, { "epoch": 67.00264957264957, "grad_norm": 0.0010973671451210976, "learning_rate": 3.637226970560304e-06, "loss": 0.0, "step": 15740 }, { "epoch": 67.00307692307692, "grad_norm": 0.0006377913523465395, "learning_rate": 3.632478632478633e-06, "loss": 0.0, "step": 15750 }, { "epoch": 67.00350427350428, "grad_norm": 0.0005305053782649338, "learning_rate": 3.627730294396961e-06, "loss": 0.0, "step": 15760 }, { "epoch": 67.00393162393162, "grad_norm": 0.0019237673841416836, "learning_rate": 3.6229819563152903e-06, "loss": 0.0006, "step": 15770 }, { "epoch": 67.00435897435898, "grad_norm": 0.0016178624937310815, "learning_rate": 3.6182336182336186e-06, "loss": 0.0, "step": 15780 }, { "epoch": 67.00478632478632, "grad_norm": 0.00046172275324352086, "learning_rate": 3.6134852801519473e-06, "loss": 0.0, "step": 15790 }, { "epoch": 67.00521367521368, "grad_norm": 0.0002967897162307054, "learning_rate": 3.6087369420702756e-06, "loss": 0.0001, "step": 15800 }, { "epoch": 67.00564102564103, "grad_norm": 0.0002238056476926431, "learning_rate": 3.603988603988604e-06, "loss": 0.0941, "step": 15810 }, { "epoch": 67.00606837606837, "grad_norm": 0.00390687957406044, "learning_rate": 3.599240265906933e-06, "loss": 0.001, "step": 15820 }, { "epoch": 67.00649572649573, "grad_norm": 0.0019423745106905699, "learning_rate": 3.5944919278252618e-06, "loss": 0.0001, "step": 15830 }, { "epoch": 67.00692307692307, "grad_norm": 0.00027149979723617435, "learning_rate": 3.58974358974359e-06, "loss": 0.0, "step": 15840 }, { "epoch": 67.00735042735043, "grad_norm": 0.0007080771028995514, "learning_rate": 3.5849952516619184e-06, "loss": 0.0, "step": 15850 }, { "epoch": 67.00777777777778, "grad_norm": 0.0003103041963186115, "learning_rate": 3.580246913580247e-06, "loss": 0.0135, "step": 15860 }, { "epoch": 67.00820512820513, "grad_norm": 0.001910716644488275, "learning_rate": 3.5754985754985762e-06, "loss": 0.0, "step": 15870 }, { "epoch": 67.00863247863248, "grad_norm": 904.8418579101562, "learning_rate": 3.5707502374169045e-06, "loss": 0.3092, "step": 15880 }, { "epoch": 67.00905982905982, "grad_norm": 0.0006746945437043905, "learning_rate": 3.566001899335233e-06, "loss": 0.0, "step": 15890 }, { "epoch": 67.00948717948718, "grad_norm": 0.0018293018219992518, "learning_rate": 3.5612535612535615e-06, "loss": 0.0001, "step": 15900 }, { "epoch": 67.00991452991452, "grad_norm": 0.007653082720935345, "learning_rate": 3.55650522317189e-06, "loss": 0.0, "step": 15910 }, { "epoch": 67.01, "eval_accuracy": 0.4, "eval_loss": 5.659733772277832, "eval_runtime": 35.0596, "eval_samples_per_second": 0.713, "eval_steps_per_second": 0.713, "step": 15912 }, { "epoch": 68.00034188034188, "grad_norm": 0.0002990284119732678, "learning_rate": 3.551756885090219e-06, "loss": 0.0272, "step": 15920 }, { "epoch": 68.00076923076924, "grad_norm": 0.0034541944041848183, "learning_rate": 3.5470085470085473e-06, "loss": 0.0001, "step": 15930 }, { "epoch": 68.00119658119658, "grad_norm": 0.00031465632491745055, "learning_rate": 3.542260208926876e-06, "loss": 0.0, "step": 15940 }, { "epoch": 68.00162393162393, "grad_norm": 0.0003612114815041423, "learning_rate": 3.5375118708452043e-06, "loss": 0.0002, "step": 15950 }, { "epoch": 68.00205128205128, "grad_norm": 0.0013253976358100772, "learning_rate": 3.532763532763533e-06, "loss": 0.0, "step": 15960 }, { "epoch": 68.00247863247863, "grad_norm": 0.0014574190136045218, "learning_rate": 3.5280151946818613e-06, "loss": 0.0, "step": 15970 }, { "epoch": 68.00290598290599, "grad_norm": 0.0002389967121416703, "learning_rate": 3.5232668566001904e-06, "loss": 0.0003, "step": 15980 }, { "epoch": 68.00333333333333, "grad_norm": 0.0006442684680223465, "learning_rate": 3.5185185185185187e-06, "loss": 0.0, "step": 15990 }, { "epoch": 68.00376068376069, "grad_norm": 0.0004441600467544049, "learning_rate": 3.5137701804368475e-06, "loss": 0.0, "step": 16000 }, { "epoch": 68.00418803418803, "grad_norm": 0.0006568527896888554, "learning_rate": 3.5090218423551758e-06, "loss": 0.0, "step": 16010 }, { "epoch": 68.00461538461539, "grad_norm": 0.0003113803395535797, "learning_rate": 3.5042735042735045e-06, "loss": 0.0, "step": 16020 }, { "epoch": 68.00504273504274, "grad_norm": 0.0012534753186628222, "learning_rate": 3.499525166191833e-06, "loss": 0.0, "step": 16030 }, { "epoch": 68.00547008547008, "grad_norm": 0.0026429889257997274, "learning_rate": 3.494776828110162e-06, "loss": 0.2954, "step": 16040 }, { "epoch": 68.00589743589744, "grad_norm": 0.0012878195848315954, "learning_rate": 3.4900284900284902e-06, "loss": 0.0, "step": 16050 }, { "epoch": 68.00632478632478, "grad_norm": 0.0004001953057013452, "learning_rate": 3.485280151946819e-06, "loss": 0.4867, "step": 16060 }, { "epoch": 68.00675213675214, "grad_norm": 0.004468402359634638, "learning_rate": 3.4805318138651472e-06, "loss": 0.0006, "step": 16070 }, { "epoch": 68.00717948717949, "grad_norm": 0.002943785861134529, "learning_rate": 3.4757834757834764e-06, "loss": 0.0002, "step": 16080 }, { "epoch": 68.00760683760684, "grad_norm": 0.002336192177608609, "learning_rate": 3.4710351377018047e-06, "loss": 0.8271, "step": 16090 }, { "epoch": 68.00803418803419, "grad_norm": 0.0003368109464645386, "learning_rate": 3.4662867996201334e-06, "loss": 0.3306, "step": 16100 }, { "epoch": 68.00846153846153, "grad_norm": 0.005345764569938183, "learning_rate": 3.4615384615384617e-06, "loss": 0.498, "step": 16110 }, { "epoch": 68.00888888888889, "grad_norm": 0.005093185696750879, "learning_rate": 3.4567901234567904e-06, "loss": 0.0001, "step": 16120 }, { "epoch": 68.00931623931623, "grad_norm": 0.00032298368751071393, "learning_rate": 3.452041785375119e-06, "loss": 0.0, "step": 16130 }, { "epoch": 68.0097435897436, "grad_norm": 0.0008845495176501572, "learning_rate": 3.447293447293448e-06, "loss": 0.0, "step": 16140 }, { "epoch": 68.01, "eval_accuracy": 0.44, "eval_loss": 5.819715976715088, "eval_runtime": 32.8708, "eval_samples_per_second": 0.761, "eval_steps_per_second": 0.761, "step": 16146 }, { "epoch": 69.00017094017095, "grad_norm": 0.003377682762220502, "learning_rate": 3.442545109211776e-06, "loss": 0.0002, "step": 16150 }, { "epoch": 69.00059829059829, "grad_norm": 0.0002674728457350284, "learning_rate": 3.437796771130105e-06, "loss": 0.0, "step": 16160 }, { "epoch": 69.00102564102563, "grad_norm": 0.0016994690522551537, "learning_rate": 3.433048433048433e-06, "loss": 0.0, "step": 16170 }, { "epoch": 69.001452991453, "grad_norm": 0.00035284223849885166, "learning_rate": 3.4283000949667615e-06, "loss": 0.0, "step": 16180 }, { "epoch": 69.00188034188034, "grad_norm": 0.00023986499581951648, "learning_rate": 3.4235517568850906e-06, "loss": 0.0, "step": 16190 }, { "epoch": 69.0023076923077, "grad_norm": 0.0023895238991826773, "learning_rate": 3.4188034188034193e-06, "loss": 0.0, "step": 16200 }, { "epoch": 69.00273504273504, "grad_norm": 0.0018031138461083174, "learning_rate": 3.4140550807217476e-06, "loss": 0.0, "step": 16210 }, { "epoch": 69.0031623931624, "grad_norm": 0.003573950147256255, "learning_rate": 3.409306742640076e-06, "loss": 0.0, "step": 16220 }, { "epoch": 69.00358974358974, "grad_norm": 0.0015754502965137362, "learning_rate": 3.4045584045584046e-06, "loss": 0.0, "step": 16230 }, { "epoch": 69.0040170940171, "grad_norm": 0.0012102372711524367, "learning_rate": 3.3998100664767338e-06, "loss": 0.0, "step": 16240 }, { "epoch": 69.00444444444445, "grad_norm": 0.0002841241657733917, "learning_rate": 3.395061728395062e-06, "loss": 0.0, "step": 16250 }, { "epoch": 69.00487179487179, "grad_norm": 0.01132782083004713, "learning_rate": 3.3903133903133904e-06, "loss": 0.0001, "step": 16260 }, { "epoch": 69.00529914529915, "grad_norm": 0.00031476642470806837, "learning_rate": 3.385565052231719e-06, "loss": 0.0001, "step": 16270 }, { "epoch": 69.00572649572649, "grad_norm": 0.000270717719104141, "learning_rate": 3.3808167141500474e-06, "loss": 0.0, "step": 16280 }, { "epoch": 69.00615384615385, "grad_norm": 0.00036470277700573206, "learning_rate": 3.3760683760683765e-06, "loss": 0.0, "step": 16290 }, { "epoch": 69.0065811965812, "grad_norm": 0.001150319236330688, "learning_rate": 3.371320037986705e-06, "loss": 0.0, "step": 16300 }, { "epoch": 69.00700854700855, "grad_norm": 0.0003526509099174291, "learning_rate": 3.3665716999050336e-06, "loss": 0.0, "step": 16310 }, { "epoch": 69.0074358974359, "grad_norm": 0.0005036306101828814, "learning_rate": 3.361823361823362e-06, "loss": 0.0, "step": 16320 }, { "epoch": 69.00786324786324, "grad_norm": 0.0007515393663197756, "learning_rate": 3.3570750237416906e-06, "loss": 0.0, "step": 16330 }, { "epoch": 69.0082905982906, "grad_norm": 0.0001931802835315466, "learning_rate": 3.3523266856600197e-06, "loss": 0.0, "step": 16340 }, { "epoch": 69.00871794871794, "grad_norm": 0.00042826347635127604, "learning_rate": 3.347578347578348e-06, "loss": 0.0001, "step": 16350 }, { "epoch": 69.0091452991453, "grad_norm": 0.00022397881548386067, "learning_rate": 3.3428300094966763e-06, "loss": 0.0, "step": 16360 }, { "epoch": 69.00957264957265, "grad_norm": 0.00021716070477850735, "learning_rate": 3.338081671415005e-06, "loss": 0.0, "step": 16370 }, { "epoch": 69.01, "grad_norm": 0.00088693160796538, "learning_rate": 3.3333333333333333e-06, "loss": 0.0061, "step": 16380 }, { "epoch": 69.01, "eval_accuracy": 0.4, "eval_loss": 6.014081001281738, "eval_runtime": 33.8632, "eval_samples_per_second": 0.738, "eval_steps_per_second": 0.738, "step": 16380 }, { "epoch": 70.00042735042734, "grad_norm": 0.0004485013196244836, "learning_rate": 3.3285849952516625e-06, "loss": 0.0, "step": 16390 }, { "epoch": 70.0008547008547, "grad_norm": 0.0004966802662238479, "learning_rate": 3.3238366571699908e-06, "loss": 0.0, "step": 16400 }, { "epoch": 70.00128205128205, "grad_norm": 0.0006574277649633586, "learning_rate": 3.3190883190883195e-06, "loss": 0.001, "step": 16410 }, { "epoch": 70.0017094017094, "grad_norm": 0.21187789738178253, "learning_rate": 3.3143399810066478e-06, "loss": 0.0001, "step": 16420 }, { "epoch": 70.00213675213675, "grad_norm": 0.00020025823323521763, "learning_rate": 3.3095916429249765e-06, "loss": 0.0, "step": 16430 }, { "epoch": 70.00256410256411, "grad_norm": 0.0003043335455004126, "learning_rate": 3.304843304843305e-06, "loss": 0.0, "step": 16440 }, { "epoch": 70.00299145299145, "grad_norm": 0.0003146652306895703, "learning_rate": 3.300094966761634e-06, "loss": 0.0, "step": 16450 }, { "epoch": 70.00341880341881, "grad_norm": 0.00840036105364561, "learning_rate": 3.2953466286799622e-06, "loss": 1.0593, "step": 16460 }, { "epoch": 70.00384615384615, "grad_norm": 0.0003192836884409189, "learning_rate": 3.290598290598291e-06, "loss": 0.0, "step": 16470 }, { "epoch": 70.0042735042735, "grad_norm": 0.0012308260193094611, "learning_rate": 3.2858499525166193e-06, "loss": 0.0, "step": 16480 }, { "epoch": 70.00470085470086, "grad_norm": 0.0008993004448711872, "learning_rate": 3.281101614434948e-06, "loss": 0.0, "step": 16490 }, { "epoch": 70.0051282051282, "grad_norm": 0.0010926368413493037, "learning_rate": 3.2763532763532767e-06, "loss": 0.0, "step": 16500 }, { "epoch": 70.00555555555556, "grad_norm": 0.0005557582480832934, "learning_rate": 3.2716049382716054e-06, "loss": 0.0001, "step": 16510 }, { "epoch": 70.0059829059829, "grad_norm": 0.0008928438182920218, "learning_rate": 3.2668566001899337e-06, "loss": 0.0, "step": 16520 }, { "epoch": 70.00641025641026, "grad_norm": 0.0003105592622887343, "learning_rate": 3.2621082621082624e-06, "loss": 0.0001, "step": 16530 }, { "epoch": 70.0068376068376, "grad_norm": 0.0003956545260734856, "learning_rate": 3.2573599240265907e-06, "loss": 0.0, "step": 16540 }, { "epoch": 70.00726495726495, "grad_norm": 0.0002564324822742492, "learning_rate": 3.25261158594492e-06, "loss": 0.0066, "step": 16550 }, { "epoch": 70.00769230769231, "grad_norm": 0.00025351883959956467, "learning_rate": 3.247863247863248e-06, "loss": 0.0, "step": 16560 }, { "epoch": 70.00811965811965, "grad_norm": 0.00046608541742898524, "learning_rate": 3.243114909781577e-06, "loss": 0.0023, "step": 16570 }, { "epoch": 70.00854700854701, "grad_norm": 0.002306720009073615, "learning_rate": 3.238366571699905e-06, "loss": 0.0, "step": 16580 }, { "epoch": 70.00897435897436, "grad_norm": 0.0005604177713394165, "learning_rate": 3.2336182336182335e-06, "loss": 0.0, "step": 16590 }, { "epoch": 70.00940170940171, "grad_norm": 0.0008777379989624023, "learning_rate": 3.2288698955365626e-06, "loss": 0.0, "step": 16600 }, { "epoch": 70.00982905982906, "grad_norm": 0.0015941828023642302, "learning_rate": 3.2241215574548913e-06, "loss": 0.0001, "step": 16610 }, { "epoch": 70.01, "eval_accuracy": 0.4, "eval_loss": 6.244938373565674, "eval_runtime": 34.6306, "eval_samples_per_second": 0.722, "eval_steps_per_second": 0.722, "step": 16614 }, { "epoch": 71.00025641025641, "grad_norm": 0.0021248271223157644, "learning_rate": 3.2193732193732196e-06, "loss": 0.7973, "step": 16620 }, { "epoch": 71.00068376068376, "grad_norm": 0.00018868227198254317, "learning_rate": 3.214624881291548e-06, "loss": 0.0, "step": 16630 }, { "epoch": 71.00111111111111, "grad_norm": 921.1409301757812, "learning_rate": 3.2098765432098767e-06, "loss": 0.3031, "step": 16640 }, { "epoch": 71.00153846153846, "grad_norm": 0.0012714448384940624, "learning_rate": 3.205128205128206e-06, "loss": 0.0, "step": 16650 }, { "epoch": 71.00196581196582, "grad_norm": 0.0014924101997166872, "learning_rate": 3.200379867046534e-06, "loss": 0.0079, "step": 16660 }, { "epoch": 71.00239316239316, "grad_norm": 0.00040770156192593277, "learning_rate": 3.195631528964863e-06, "loss": 0.0018, "step": 16670 }, { "epoch": 71.0028205128205, "grad_norm": 0.00022908284154254943, "learning_rate": 3.190883190883191e-06, "loss": 0.002, "step": 16680 }, { "epoch": 71.00324786324786, "grad_norm": 0.0016254654619842768, "learning_rate": 3.1861348528015194e-06, "loss": 0.7552, "step": 16690 }, { "epoch": 71.00367521367521, "grad_norm": 0.0006611282587982714, "learning_rate": 3.181386514719848e-06, "loss": 0.0972, "step": 16700 }, { "epoch": 71.00410256410257, "grad_norm": 2.668363332748413, "learning_rate": 3.1766381766381773e-06, "loss": 0.0006, "step": 16710 }, { "epoch": 71.00452991452991, "grad_norm": 0.0007531665614806116, "learning_rate": 3.1718898385565056e-06, "loss": 0.0, "step": 16720 }, { "epoch": 71.00495726495727, "grad_norm": 0.0007412447594106197, "learning_rate": 3.167141500474834e-06, "loss": 0.0, "step": 16730 }, { "epoch": 71.00538461538461, "grad_norm": 0.0008477046503685415, "learning_rate": 3.1623931623931626e-06, "loss": 0.9046, "step": 16740 }, { "epoch": 71.00581196581197, "grad_norm": 146.11962890625, "learning_rate": 3.157644824311491e-06, "loss": 0.3816, "step": 16750 }, { "epoch": 71.00623931623932, "grad_norm": 0.0002443613775540143, "learning_rate": 3.15289648622982e-06, "loss": 0.2428, "step": 16760 }, { "epoch": 71.00666666666666, "grad_norm": 0.00033410079777240753, "learning_rate": 3.1481481481481483e-06, "loss": 0.0, "step": 16770 }, { "epoch": 71.00709401709402, "grad_norm": 0.002586389658972621, "learning_rate": 3.143399810066477e-06, "loss": 0.8823, "step": 16780 }, { "epoch": 71.00752136752136, "grad_norm": 0.001075676642358303, "learning_rate": 3.1386514719848053e-06, "loss": 0.0, "step": 16790 }, { "epoch": 71.00794871794872, "grad_norm": 0.002573596313595772, "learning_rate": 3.133903133903134e-06, "loss": 0.0, "step": 16800 }, { "epoch": 71.00837606837607, "grad_norm": 0.00022963988885749131, "learning_rate": 3.1291547958214628e-06, "loss": 0.0, "step": 16810 }, { "epoch": 71.00880341880342, "grad_norm": 0.0005840660887770355, "learning_rate": 3.1244064577397915e-06, "loss": 0.0, "step": 16820 }, { "epoch": 71.00923076923077, "grad_norm": 0.00029322231421247125, "learning_rate": 3.11965811965812e-06, "loss": 0.0, "step": 16830 }, { "epoch": 71.00965811965811, "grad_norm": 0.00025132184964604676, "learning_rate": 3.1149097815764485e-06, "loss": 0.0001, "step": 16840 }, { "epoch": 71.01, "eval_accuracy": 0.4, "eval_loss": 6.252962589263916, "eval_runtime": 33.4983, "eval_samples_per_second": 0.746, "eval_steps_per_second": 0.746, "step": 16848 }, { "epoch": 72.00008547008547, "grad_norm": 0.0019571471493691206, "learning_rate": 3.110161443494777e-06, "loss": 0.0, "step": 16850 }, { "epoch": 72.00051282051282, "grad_norm": 0.0005339854396879673, "learning_rate": 3.105413105413106e-06, "loss": 0.0001, "step": 16860 }, { "epoch": 72.00094017094017, "grad_norm": 0.00019883255299646407, "learning_rate": 3.1006647673314343e-06, "loss": 0.0, "step": 16870 }, { "epoch": 72.00136752136753, "grad_norm": 0.00024648249382153153, "learning_rate": 3.095916429249763e-06, "loss": 0.0003, "step": 16880 }, { "epoch": 72.00179487179487, "grad_norm": 0.0015446147881448269, "learning_rate": 3.0911680911680913e-06, "loss": 0.0006, "step": 16890 }, { "epoch": 72.00222222222222, "grad_norm": 0.0021758642978966236, "learning_rate": 3.08641975308642e-06, "loss": 0.0001, "step": 16900 }, { "epoch": 72.00264957264957, "grad_norm": 0.0003914159897249192, "learning_rate": 3.0816714150047487e-06, "loss": 0.0, "step": 16910 }, { "epoch": 72.00307692307692, "grad_norm": 0.0009236117475666106, "learning_rate": 3.0769230769230774e-06, "loss": 0.0, "step": 16920 }, { "epoch": 72.00350427350428, "grad_norm": 0.0002460191317368299, "learning_rate": 3.0721747388414057e-06, "loss": 0.0, "step": 16930 }, { "epoch": 72.00393162393162, "grad_norm": 0.00033975299447774887, "learning_rate": 3.0674264007597345e-06, "loss": 0.0, "step": 16940 }, { "epoch": 72.00435897435898, "grad_norm": 0.001101949717849493, "learning_rate": 3.0626780626780627e-06, "loss": 0.0, "step": 16950 }, { "epoch": 72.00478632478632, "grad_norm": 0.0031472702976316214, "learning_rate": 3.0579297245963915e-06, "loss": 0.5291, "step": 16960 }, { "epoch": 72.00521367521368, "grad_norm": 0.00030517615959979594, "learning_rate": 3.05318138651472e-06, "loss": 0.0, "step": 16970 }, { "epoch": 72.00564102564103, "grad_norm": 0.0015221843495965004, "learning_rate": 3.048433048433049e-06, "loss": 0.0004, "step": 16980 }, { "epoch": 72.00606837606837, "grad_norm": 0.0034548670519143343, "learning_rate": 3.043684710351377e-06, "loss": 0.0, "step": 16990 }, { "epoch": 72.00649572649573, "grad_norm": 0.0006523687625303864, "learning_rate": 3.038936372269706e-06, "loss": 0.9208, "step": 17000 }, { "epoch": 72.00692307692307, "grad_norm": 0.0015028053894639015, "learning_rate": 3.0341880341880342e-06, "loss": 0.0, "step": 17010 }, { "epoch": 72.00735042735043, "grad_norm": 0.00025994013412855566, "learning_rate": 3.0294396961063634e-06, "loss": 0.0, "step": 17020 }, { "epoch": 72.00777777777778, "grad_norm": 0.000636533775832504, "learning_rate": 3.0246913580246917e-06, "loss": 0.0303, "step": 17030 }, { "epoch": 72.00820512820513, "grad_norm": 0.003830546513199806, "learning_rate": 3.0199430199430204e-06, "loss": 0.0001, "step": 17040 }, { "epoch": 72.00863247863248, "grad_norm": 0.00017824105452746153, "learning_rate": 3.0151946818613487e-06, "loss": 0.0, "step": 17050 }, { "epoch": 72.00905982905982, "grad_norm": 0.00022492320567835122, "learning_rate": 3.010446343779677e-06, "loss": 0.0, "step": 17060 }, { "epoch": 72.00948717948718, "grad_norm": 0.0020875674672424793, "learning_rate": 3.005698005698006e-06, "loss": 0.0, "step": 17070 }, { "epoch": 72.00991452991452, "grad_norm": 0.0006881671724840999, "learning_rate": 3.000949667616335e-06, "loss": 0.0, "step": 17080 }, { "epoch": 72.01, "eval_accuracy": 0.4, "eval_loss": 5.765503406524658, "eval_runtime": 33.3353, "eval_samples_per_second": 0.75, "eval_steps_per_second": 0.75, "step": 17082 }, { "epoch": 73.00034188034188, "grad_norm": 0.0012791932094842196, "learning_rate": 2.996201329534663e-06, "loss": 0.0, "step": 17090 }, { "epoch": 73.00076923076924, "grad_norm": 0.00039805949199944735, "learning_rate": 2.9914529914529914e-06, "loss": 0.0, "step": 17100 }, { "epoch": 73.00119658119658, "grad_norm": 0.0010755633702501655, "learning_rate": 2.98670465337132e-06, "loss": 0.0001, "step": 17110 }, { "epoch": 73.00162393162393, "grad_norm": 0.07058065384626389, "learning_rate": 2.9819563152896493e-06, "loss": 0.0, "step": 17120 }, { "epoch": 73.00205128205128, "grad_norm": 0.00041162193519994617, "learning_rate": 2.9772079772079776e-06, "loss": 0.0, "step": 17130 }, { "epoch": 73.00247863247863, "grad_norm": 0.0027573970146477222, "learning_rate": 2.972459639126306e-06, "loss": 0.0, "step": 17140 }, { "epoch": 73.00290598290599, "grad_norm": 0.0003702895774040371, "learning_rate": 2.9677113010446346e-06, "loss": 0.0, "step": 17150 }, { "epoch": 73.00333333333333, "grad_norm": 0.0007125766715034842, "learning_rate": 2.962962962962963e-06, "loss": 0.0, "step": 17160 }, { "epoch": 73.00376068376069, "grad_norm": 0.00024414356448687613, "learning_rate": 2.9582146248812916e-06, "loss": 0.0, "step": 17170 }, { "epoch": 73.00418803418803, "grad_norm": 0.0009582286584191024, "learning_rate": 2.9534662867996203e-06, "loss": 0.0001, "step": 17180 }, { "epoch": 73.00461538461539, "grad_norm": 0.00019269179028924555, "learning_rate": 2.948717948717949e-06, "loss": 0.0, "step": 17190 }, { "epoch": 73.00504273504274, "grad_norm": 0.0013847124064341187, "learning_rate": 2.9439696106362774e-06, "loss": 0.0, "step": 17200 }, { "epoch": 73.00547008547008, "grad_norm": 0.8285155892372131, "learning_rate": 2.939221272554606e-06, "loss": 0.0002, "step": 17210 }, { "epoch": 73.00589743589744, "grad_norm": 0.0001845570222940296, "learning_rate": 2.9344729344729344e-06, "loss": 0.0, "step": 17220 }, { "epoch": 73.00632478632478, "grad_norm": 561.99072265625, "learning_rate": 2.9297245963912635e-06, "loss": 0.6593, "step": 17230 }, { "epoch": 73.00675213675214, "grad_norm": 0.17753881216049194, "learning_rate": 2.924976258309592e-06, "loss": 0.0001, "step": 17240 }, { "epoch": 73.00717948717949, "grad_norm": 0.00024875238887034357, "learning_rate": 2.9202279202279205e-06, "loss": 0.0, "step": 17250 }, { "epoch": 73.00760683760684, "grad_norm": 0.016402428969740868, "learning_rate": 2.915479582146249e-06, "loss": 0.0039, "step": 17260 }, { "epoch": 73.00803418803419, "grad_norm": 0.001006082515232265, "learning_rate": 2.9107312440645776e-06, "loss": 0.0038, "step": 17270 }, { "epoch": 73.00846153846153, "grad_norm": 0.0013266198802739382, "learning_rate": 2.9059829059829063e-06, "loss": 0.0113, "step": 17280 }, { "epoch": 73.00888888888889, "grad_norm": 0.00038516995846293867, "learning_rate": 2.901234567901235e-06, "loss": 0.0, "step": 17290 }, { "epoch": 73.00931623931623, "grad_norm": 0.0010296168038621545, "learning_rate": 2.8964862298195633e-06, "loss": 0.0, "step": 17300 }, { "epoch": 73.0097435897436, "grad_norm": 0.0006145525840111077, "learning_rate": 2.891737891737892e-06, "loss": 0.0, "step": 17310 }, { "epoch": 73.01, "eval_accuracy": 0.4, "eval_loss": 6.152116775512695, "eval_runtime": 33.4645, "eval_samples_per_second": 0.747, "eval_steps_per_second": 0.747, "step": 17316 }, { "epoch": 74.00017094017095, "grad_norm": 0.00017263834888581187, "learning_rate": 2.8869895536562203e-06, "loss": 0.0, "step": 17320 }, { "epoch": 74.00059829059829, "grad_norm": 0.00031588933779858053, "learning_rate": 2.8822412155745495e-06, "loss": 0.0034, "step": 17330 }, { "epoch": 74.00102564102563, "grad_norm": 0.0006647444679401815, "learning_rate": 2.8774928774928778e-06, "loss": 0.0, "step": 17340 }, { "epoch": 74.001452991453, "grad_norm": 0.0013857238227501512, "learning_rate": 2.8727445394112065e-06, "loss": 0.0004, "step": 17350 }, { "epoch": 74.00188034188034, "grad_norm": 0.003892462467774749, "learning_rate": 2.8679962013295348e-06, "loss": 0.0002, "step": 17360 }, { "epoch": 74.0023076923077, "grad_norm": 0.0004585685092024505, "learning_rate": 2.8632478632478635e-06, "loss": 0.0, "step": 17370 }, { "epoch": 74.00273504273504, "grad_norm": 0.00025991059374064207, "learning_rate": 2.858499525166192e-06, "loss": 0.0008, "step": 17380 }, { "epoch": 74.0031623931624, "grad_norm": 0.0004791081009898335, "learning_rate": 2.853751187084521e-06, "loss": 0.0354, "step": 17390 }, { "epoch": 74.00358974358974, "grad_norm": 0.0005468902527354658, "learning_rate": 2.8490028490028492e-06, "loss": 0.0001, "step": 17400 }, { "epoch": 74.0040170940171, "grad_norm": 0.01083172857761383, "learning_rate": 2.844254510921178e-06, "loss": 0.0, "step": 17410 }, { "epoch": 74.00444444444445, "grad_norm": 0.0011809701099991798, "learning_rate": 2.8395061728395062e-06, "loss": 0.0, "step": 17420 }, { "epoch": 74.00487179487179, "grad_norm": 0.0013473471626639366, "learning_rate": 2.8347578347578345e-06, "loss": 0.0, "step": 17430 }, { "epoch": 74.00529914529915, "grad_norm": 0.00039711128920316696, "learning_rate": 2.8300094966761637e-06, "loss": 0.0, "step": 17440 }, { "epoch": 74.00572649572649, "grad_norm": 0.00017134596419055015, "learning_rate": 2.8252611585944924e-06, "loss": 0.0, "step": 17450 }, { "epoch": 74.00615384615385, "grad_norm": 0.00042395462514832616, "learning_rate": 2.8205128205128207e-06, "loss": 0.0, "step": 17460 }, { "epoch": 74.0065811965812, "grad_norm": 0.00016071122081484646, "learning_rate": 2.815764482431149e-06, "loss": 0.0, "step": 17470 }, { "epoch": 74.00700854700855, "grad_norm": 0.20301441848278046, "learning_rate": 2.8110161443494777e-06, "loss": 0.0001, "step": 17480 }, { "epoch": 74.0074358974359, "grad_norm": 0.0002695897710509598, "learning_rate": 2.806267806267807e-06, "loss": 0.0, "step": 17490 }, { "epoch": 74.00786324786324, "grad_norm": 0.005475103389471769, "learning_rate": 2.801519468186135e-06, "loss": 0.0, "step": 17500 }, { "epoch": 74.0082905982906, "grad_norm": 0.01020097080618143, "learning_rate": 2.7967711301044635e-06, "loss": 0.0, "step": 17510 }, { "epoch": 74.00871794871794, "grad_norm": 0.00026164480368606746, "learning_rate": 2.792022792022792e-06, "loss": 0.0, "step": 17520 }, { "epoch": 74.0091452991453, "grad_norm": 0.0005212483229115605, "learning_rate": 2.7872744539411205e-06, "loss": 0.0001, "step": 17530 }, { "epoch": 74.00957264957265, "grad_norm": 0.00018099919543601573, "learning_rate": 2.7825261158594496e-06, "loss": 0.0, "step": 17540 }, { "epoch": 74.01, "grad_norm": 0.00015604341751895845, "learning_rate": 2.7777777777777783e-06, "loss": 0.0, "step": 17550 }, { "epoch": 74.01, "eval_accuracy": 0.44, "eval_loss": 6.159724235534668, "eval_runtime": 33.1573, "eval_samples_per_second": 0.754, "eval_steps_per_second": 0.754, "step": 17550 }, { "epoch": 75.00042735042734, "grad_norm": 0.00023313055862672627, "learning_rate": 2.7730294396961066e-06, "loss": 0.0, "step": 17560 }, { "epoch": 75.0008547008547, "grad_norm": 0.002859926549717784, "learning_rate": 2.768281101614435e-06, "loss": 0.0, "step": 17570 }, { "epoch": 75.00128205128205, "grad_norm": 0.002907276153564453, "learning_rate": 2.7635327635327636e-06, "loss": 0.0001, "step": 17580 }, { "epoch": 75.0017094017094, "grad_norm": 0.0013060672208666801, "learning_rate": 2.758784425451093e-06, "loss": 0.0, "step": 17590 }, { "epoch": 75.00213675213675, "grad_norm": 0.0003227469860576093, "learning_rate": 2.754036087369421e-06, "loss": 0.0, "step": 17600 }, { "epoch": 75.00256410256411, "grad_norm": 0.0003190785355400294, "learning_rate": 2.7492877492877494e-06, "loss": 0.0, "step": 17610 }, { "epoch": 75.00299145299145, "grad_norm": 0.00020427798153832555, "learning_rate": 2.744539411206078e-06, "loss": 0.0, "step": 17620 }, { "epoch": 75.00341880341881, "grad_norm": 0.053999051451683044, "learning_rate": 2.7397910731244064e-06, "loss": 0.0001, "step": 17630 }, { "epoch": 75.00384615384615, "grad_norm": 0.0002565534086897969, "learning_rate": 2.7350427350427355e-06, "loss": 0.0, "step": 17640 }, { "epoch": 75.0042735042735, "grad_norm": 0.00017350871348753572, "learning_rate": 2.730294396961064e-06, "loss": 0.0, "step": 17650 }, { "epoch": 75.00470085470086, "grad_norm": 0.00023242604220286012, "learning_rate": 2.7255460588793926e-06, "loss": 0.0, "step": 17660 }, { "epoch": 75.0051282051282, "grad_norm": 0.00020732257689815015, "learning_rate": 2.720797720797721e-06, "loss": 0.0, "step": 17670 }, { "epoch": 75.00555555555556, "grad_norm": 0.00024015163944568485, "learning_rate": 2.7160493827160496e-06, "loss": 0.0, "step": 17680 }, { "epoch": 75.0059829059829, "grad_norm": 0.00017238871078006923, "learning_rate": 2.711301044634378e-06, "loss": 0.0, "step": 17690 }, { "epoch": 75.00641025641026, "grad_norm": 0.0001814868883229792, "learning_rate": 2.706552706552707e-06, "loss": 0.0002, "step": 17700 }, { "epoch": 75.0068376068376, "grad_norm": 0.00020034310000482947, "learning_rate": 2.7018043684710353e-06, "loss": 0.0, "step": 17710 }, { "epoch": 75.00726495726495, "grad_norm": 0.004398710560053587, "learning_rate": 2.697056030389364e-06, "loss": 0.0, "step": 17720 }, { "epoch": 75.00769230769231, "grad_norm": 0.00018758252554107457, "learning_rate": 2.6923076923076923e-06, "loss": 0.0, "step": 17730 }, { "epoch": 75.00811965811965, "grad_norm": 0.0014624909963458776, "learning_rate": 2.687559354226021e-06, "loss": 0.0, "step": 17740 }, { "epoch": 75.00854700854701, "grad_norm": 0.0002650118840392679, "learning_rate": 2.6828110161443498e-06, "loss": 0.0, "step": 17750 }, { "epoch": 75.00897435897436, "grad_norm": 0.004331326577812433, "learning_rate": 2.6780626780626785e-06, "loss": 0.0, "step": 17760 }, { "epoch": 75.00940170940171, "grad_norm": 0.0013256591046229005, "learning_rate": 2.673314339981007e-06, "loss": 0.0, "step": 17770 }, { "epoch": 75.00982905982906, "grad_norm": 0.0001800288155209273, "learning_rate": 2.6685660018993355e-06, "loss": 0.6123, "step": 17780 }, { "epoch": 75.01, "eval_accuracy": 0.4, "eval_loss": 6.478565216064453, "eval_runtime": 33.5224, "eval_samples_per_second": 0.746, "eval_steps_per_second": 0.746, "step": 17784 }, { "epoch": 76.00025641025641, "grad_norm": 0.00023036974016577005, "learning_rate": 2.663817663817664e-06, "loss": 0.0, "step": 17790 }, { "epoch": 76.00068376068376, "grad_norm": 0.0002549351193010807, "learning_rate": 2.659069325735993e-06, "loss": 0.0, "step": 17800 }, { "epoch": 76.00111111111111, "grad_norm": 0.00015945105405990034, "learning_rate": 2.6543209876543212e-06, "loss": 0.0768, "step": 17810 }, { "epoch": 76.00153846153846, "grad_norm": 0.00026457657804712653, "learning_rate": 2.64957264957265e-06, "loss": 0.0007, "step": 17820 }, { "epoch": 76.00196581196582, "grad_norm": 0.0002729761472437531, "learning_rate": 2.6448243114909783e-06, "loss": 0.0, "step": 17830 }, { "epoch": 76.00239316239316, "grad_norm": 0.00020036596106365323, "learning_rate": 2.6400759734093066e-06, "loss": 0.0, "step": 17840 }, { "epoch": 76.0028205128205, "grad_norm": 0.00016762949235271662, "learning_rate": 2.6353276353276357e-06, "loss": 0.0, "step": 17850 }, { "epoch": 76.00324786324786, "grad_norm": 0.0004586647264659405, "learning_rate": 2.6305792972459644e-06, "loss": 0.0, "step": 17860 }, { "epoch": 76.00367521367521, "grad_norm": 0.006451745051890612, "learning_rate": 2.6258309591642927e-06, "loss": 0.0, "step": 17870 }, { "epoch": 76.00410256410257, "grad_norm": 0.0031748872715979815, "learning_rate": 2.6210826210826214e-06, "loss": 0.0, "step": 17880 }, { "epoch": 76.00452991452991, "grad_norm": 0.0012073371326550841, "learning_rate": 2.6163342830009497e-06, "loss": 0.0, "step": 17890 }, { "epoch": 76.00495726495727, "grad_norm": 0.0001654605002840981, "learning_rate": 2.611585944919278e-06, "loss": 0.0, "step": 17900 }, { "epoch": 76.00538461538461, "grad_norm": 0.0012388996547088027, "learning_rate": 2.606837606837607e-06, "loss": 0.0, "step": 17910 }, { "epoch": 76.00581196581197, "grad_norm": 0.00017125860904343426, "learning_rate": 2.602089268755936e-06, "loss": 0.0, "step": 17920 }, { "epoch": 76.00623931623932, "grad_norm": 0.004054773598909378, "learning_rate": 2.597340930674264e-06, "loss": 0.0001, "step": 17930 }, { "epoch": 76.00666666666666, "grad_norm": 0.0019136080518364906, "learning_rate": 2.5925925925925925e-06, "loss": 0.0, "step": 17940 }, { "epoch": 76.00709401709402, "grad_norm": 0.0007586510619148612, "learning_rate": 2.587844254510921e-06, "loss": 0.0, "step": 17950 }, { "epoch": 76.00752136752136, "grad_norm": 0.0010209938045591116, "learning_rate": 2.5830959164292504e-06, "loss": 0.0004, "step": 17960 }, { "epoch": 76.00794871794872, "grad_norm": 0.00018688829732127488, "learning_rate": 2.5783475783475787e-06, "loss": 0.0, "step": 17970 }, { "epoch": 76.00837606837607, "grad_norm": 0.00018369669851381332, "learning_rate": 2.573599240265907e-06, "loss": 0.0, "step": 17980 }, { "epoch": 76.00880341880342, "grad_norm": 0.0010952269658446312, "learning_rate": 2.5688509021842357e-06, "loss": 0.0, "step": 17990 }, { "epoch": 76.00923076923077, "grad_norm": 0.00021538576402235776, "learning_rate": 2.564102564102564e-06, "loss": 0.0003, "step": 18000 }, { "epoch": 76.00965811965811, "grad_norm": 0.0005051797488704324, "learning_rate": 2.559354226020893e-06, "loss": 0.0, "step": 18010 }, { "epoch": 76.01, "eval_accuracy": 0.4, "eval_loss": 6.552752494812012, "eval_runtime": 33.6617, "eval_samples_per_second": 0.743, "eval_steps_per_second": 0.743, "step": 18018 }, { "epoch": 77.00008547008547, "grad_norm": 0.000823773501906544, "learning_rate": 2.5546058879392214e-06, "loss": 0.0004, "step": 18020 }, { "epoch": 77.00051282051282, "grad_norm": 0.001240988029167056, "learning_rate": 2.54985754985755e-06, "loss": 0.0, "step": 18030 }, { "epoch": 77.00094017094017, "grad_norm": 0.0006408431218005717, "learning_rate": 2.5451092117758784e-06, "loss": 0.0001, "step": 18040 }, { "epoch": 77.00136752136753, "grad_norm": 0.00021658137848135084, "learning_rate": 2.540360873694207e-06, "loss": 0.0, "step": 18050 }, { "epoch": 77.00179487179487, "grad_norm": 0.00019429672101978213, "learning_rate": 2.535612535612536e-06, "loss": 0.0, "step": 18060 }, { "epoch": 77.00222222222222, "grad_norm": 0.00022740900749340653, "learning_rate": 2.5308641975308646e-06, "loss": 0.0, "step": 18070 }, { "epoch": 77.00264957264957, "grad_norm": 0.00015916908159852028, "learning_rate": 2.526115859449193e-06, "loss": 0.7221, "step": 18080 }, { "epoch": 77.00307692307692, "grad_norm": 0.00014186595217324793, "learning_rate": 2.5213675213675216e-06, "loss": 0.0, "step": 18090 }, { "epoch": 77.00350427350428, "grad_norm": 0.00016282236902043223, "learning_rate": 2.51661918328585e-06, "loss": 0.0, "step": 18100 }, { "epoch": 77.00393162393162, "grad_norm": 0.0003063238982576877, "learning_rate": 2.511870845204179e-06, "loss": 0.0, "step": 18110 }, { "epoch": 77.00435897435898, "grad_norm": 0.00018556939903646708, "learning_rate": 2.5071225071225073e-06, "loss": 0.0, "step": 18120 }, { "epoch": 77.00478632478632, "grad_norm": 0.00014702802582178265, "learning_rate": 2.502374169040836e-06, "loss": 0.0, "step": 18130 }, { "epoch": 77.00521367521368, "grad_norm": 0.004362730775028467, "learning_rate": 2.4976258309591644e-06, "loss": 0.0, "step": 18140 }, { "epoch": 77.00564102564103, "grad_norm": 0.00015520227316301316, "learning_rate": 2.492877492877493e-06, "loss": 0.0013, "step": 18150 }, { "epoch": 77.00606837606837, "grad_norm": 528.2244262695312, "learning_rate": 2.488129154795822e-06, "loss": 0.274, "step": 18160 }, { "epoch": 77.00649572649573, "grad_norm": 0.00016038109606597573, "learning_rate": 2.48338081671415e-06, "loss": 0.0, "step": 18170 }, { "epoch": 77.00692307692307, "grad_norm": 0.0002260198671137914, "learning_rate": 2.478632478632479e-06, "loss": 0.0022, "step": 18180 }, { "epoch": 77.00735042735043, "grad_norm": 0.008054505102336407, "learning_rate": 2.4738841405508075e-06, "loss": 0.0718, "step": 18190 }, { "epoch": 77.00777777777778, "grad_norm": 0.0019935595337301493, "learning_rate": 2.469135802469136e-06, "loss": 0.0, "step": 18200 }, { "epoch": 77.00820512820513, "grad_norm": 0.0007075904286466539, "learning_rate": 2.4643874643874645e-06, "loss": 0.0, "step": 18210 }, { "epoch": 77.00863247863248, "grad_norm": 0.0013958788476884365, "learning_rate": 2.4596391263057933e-06, "loss": 0.0, "step": 18220 }, { "epoch": 77.00905982905982, "grad_norm": 0.0016203945269808173, "learning_rate": 2.454890788224122e-06, "loss": 0.0, "step": 18230 }, { "epoch": 77.00948717948718, "grad_norm": 0.00014267765800468624, "learning_rate": 2.4501424501424503e-06, "loss": 0.0, "step": 18240 }, { "epoch": 77.00991452991452, "grad_norm": 0.001626075361855328, "learning_rate": 2.445394112060779e-06, "loss": 0.0, "step": 18250 }, { "epoch": 77.01, "eval_accuracy": 0.44, "eval_loss": 5.542598724365234, "eval_runtime": 33.0633, "eval_samples_per_second": 0.756, "eval_steps_per_second": 0.756, "step": 18252 }, { "epoch": 78.00034188034188, "grad_norm": 0.0003114393330179155, "learning_rate": 2.4406457739791077e-06, "loss": 0.0, "step": 18260 }, { "epoch": 78.00076923076924, "grad_norm": 0.00026152783539146185, "learning_rate": 2.435897435897436e-06, "loss": 0.0, "step": 18270 }, { "epoch": 78.00119658119658, "grad_norm": 0.0003940732858609408, "learning_rate": 2.4311490978157647e-06, "loss": 0.0, "step": 18280 }, { "epoch": 78.00162393162393, "grad_norm": 0.000839279149658978, "learning_rate": 2.4264007597340935e-06, "loss": 0.0059, "step": 18290 }, { "epoch": 78.00205128205128, "grad_norm": 0.00048554647946730256, "learning_rate": 2.4216524216524218e-06, "loss": 0.0, "step": 18300 }, { "epoch": 78.00247863247863, "grad_norm": 0.00023615985992364585, "learning_rate": 2.4169040835707505e-06, "loss": 0.0, "step": 18310 }, { "epoch": 78.00290598290599, "grad_norm": 0.0001385568466503173, "learning_rate": 2.412155745489079e-06, "loss": 0.0, "step": 18320 }, { "epoch": 78.00333333333333, "grad_norm": 0.0001412118726875633, "learning_rate": 2.4074074074074075e-06, "loss": 0.0, "step": 18330 }, { "epoch": 78.00376068376069, "grad_norm": 0.0002054434735327959, "learning_rate": 2.4026590693257362e-06, "loss": 0.0, "step": 18340 }, { "epoch": 78.00418803418803, "grad_norm": 0.0019602985121309757, "learning_rate": 2.3979107312440645e-06, "loss": 0.0, "step": 18350 }, { "epoch": 78.00461538461539, "grad_norm": 0.00021056836703792214, "learning_rate": 2.3931623931623937e-06, "loss": 0.0, "step": 18360 }, { "epoch": 78.00504273504274, "grad_norm": 40.98805618286133, "learning_rate": 2.388414055080722e-06, "loss": 0.0028, "step": 18370 }, { "epoch": 78.00547008547008, "grad_norm": 0.00020283031335566193, "learning_rate": 2.3836657169990502e-06, "loss": 0.0, "step": 18380 }, { "epoch": 78.00589743589744, "grad_norm": 0.00025577127235010266, "learning_rate": 2.378917378917379e-06, "loss": 0.0001, "step": 18390 }, { "epoch": 78.00632478632478, "grad_norm": 0.00014881067909300327, "learning_rate": 2.3741690408357077e-06, "loss": 0.0002, "step": 18400 }, { "epoch": 78.00675213675214, "grad_norm": 0.00021390440815594047, "learning_rate": 2.3694207027540364e-06, "loss": 0.0, "step": 18410 }, { "epoch": 78.00717948717949, "grad_norm": 0.00022567427367903292, "learning_rate": 2.3646723646723647e-06, "loss": 0.0, "step": 18420 }, { "epoch": 78.00760683760684, "grad_norm": 0.00015315061318688095, "learning_rate": 2.3599240265906934e-06, "loss": 0.0, "step": 18430 }, { "epoch": 78.00803418803419, "grad_norm": 0.00031218226649798453, "learning_rate": 2.355175688509022e-06, "loss": 0.0009, "step": 18440 }, { "epoch": 78.00846153846153, "grad_norm": 0.6312761902809143, "learning_rate": 2.3504273504273504e-06, "loss": 0.0002, "step": 18450 }, { "epoch": 78.00888888888889, "grad_norm": 0.13864165544509888, "learning_rate": 2.345679012345679e-06, "loss": 0.0003, "step": 18460 }, { "epoch": 78.00931623931623, "grad_norm": 0.0004563250986393541, "learning_rate": 2.340930674264008e-06, "loss": 0.0006, "step": 18470 }, { "epoch": 78.0097435897436, "grad_norm": 0.0005144558963365853, "learning_rate": 2.336182336182336e-06, "loss": 0.0, "step": 18480 }, { "epoch": 78.01, "eval_accuracy": 0.4, "eval_loss": 6.4275970458984375, "eval_runtime": 33.5576, "eval_samples_per_second": 0.745, "eval_steps_per_second": 0.745, "step": 18486 }, { "epoch": 79.00017094017095, "grad_norm": 0.00022311207430902869, "learning_rate": 2.331433998100665e-06, "loss": 0.0038, "step": 18490 }, { "epoch": 79.00059829059829, "grad_norm": 0.0002333286392968148, "learning_rate": 2.3266856600189936e-06, "loss": 0.0, "step": 18500 }, { "epoch": 79.00102564102563, "grad_norm": 0.00024575545103289187, "learning_rate": 2.321937321937322e-06, "loss": 0.0001, "step": 18510 }, { "epoch": 79.001452991453, "grad_norm": 0.0004118859360460192, "learning_rate": 2.3171889838556506e-06, "loss": 0.0, "step": 18520 }, { "epoch": 79.00188034188034, "grad_norm": 0.0018699835054576397, "learning_rate": 2.3124406457739794e-06, "loss": 0.0, "step": 18530 }, { "epoch": 79.0023076923077, "grad_norm": 0.0005015349015593529, "learning_rate": 2.307692307692308e-06, "loss": 0.0, "step": 18540 }, { "epoch": 79.00273504273504, "grad_norm": 0.00027656828751787543, "learning_rate": 2.3029439696106364e-06, "loss": 0.0, "step": 18550 }, { "epoch": 79.0031623931624, "grad_norm": 0.0010210457257926464, "learning_rate": 2.298195631528965e-06, "loss": 0.0, "step": 18560 }, { "epoch": 79.00358974358974, "grad_norm": 0.0015956490533426404, "learning_rate": 2.293447293447294e-06, "loss": 0.0, "step": 18570 }, { "epoch": 79.0040170940171, "grad_norm": 0.001694871811196208, "learning_rate": 2.288698955365622e-06, "loss": 0.0, "step": 18580 }, { "epoch": 79.00444444444445, "grad_norm": 0.0004087260167580098, "learning_rate": 2.283950617283951e-06, "loss": 0.0, "step": 18590 }, { "epoch": 79.00487179487179, "grad_norm": 0.00020924824639223516, "learning_rate": 2.2792022792022796e-06, "loss": 0.0, "step": 18600 }, { "epoch": 79.00529914529915, "grad_norm": 0.0006689398433081806, "learning_rate": 2.274453941120608e-06, "loss": 0.0, "step": 18610 }, { "epoch": 79.00572649572649, "grad_norm": 0.0007431868580169976, "learning_rate": 2.2697056030389366e-06, "loss": 0.5593, "step": 18620 }, { "epoch": 79.00615384615385, "grad_norm": 0.00014568243932444602, "learning_rate": 2.2649572649572653e-06, "loss": 0.0001, "step": 18630 }, { "epoch": 79.0065811965812, "grad_norm": 0.0003919856681022793, "learning_rate": 2.2602089268755936e-06, "loss": 0.0, "step": 18640 }, { "epoch": 79.00700854700855, "grad_norm": 0.00014353814185597003, "learning_rate": 2.2554605887939223e-06, "loss": 0.0, "step": 18650 }, { "epoch": 79.0074358974359, "grad_norm": 0.00022350714425556362, "learning_rate": 2.250712250712251e-06, "loss": 0.0, "step": 18660 }, { "epoch": 79.00786324786324, "grad_norm": 0.0012620558263733983, "learning_rate": 2.2459639126305797e-06, "loss": 0.0012, "step": 18670 }, { "epoch": 79.0082905982906, "grad_norm": 0.0014929536264389753, "learning_rate": 2.241215574548908e-06, "loss": 0.0, "step": 18680 }, { "epoch": 79.00871794871794, "grad_norm": 0.0001310633379034698, "learning_rate": 2.2364672364672368e-06, "loss": 0.0, "step": 18690 }, { "epoch": 79.0091452991453, "grad_norm": 2.5448873043060303, "learning_rate": 2.2317188983855655e-06, "loss": 0.0006, "step": 18700 }, { "epoch": 79.00957264957265, "grad_norm": 0.0006994788418523967, "learning_rate": 2.2269705603038938e-06, "loss": 0.0, "step": 18710 }, { "epoch": 79.01, "grad_norm": 0.00040815817192196846, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "step": 18720 }, { "epoch": 79.01, "eval_accuracy": 0.4, "eval_loss": 6.867609024047852, "eval_runtime": 33.9385, "eval_samples_per_second": 0.737, "eval_steps_per_second": 0.737, "step": 18720 }, { "epoch": 80.00042735042734, "grad_norm": 0.0001611685729585588, "learning_rate": 2.2174738841405512e-06, "loss": 0.0, "step": 18730 }, { "epoch": 80.0008547008547, "grad_norm": 0.00016659338143654168, "learning_rate": 2.2127255460588795e-06, "loss": 0.0, "step": 18740 }, { "epoch": 80.00128205128205, "grad_norm": 0.0005343634402379394, "learning_rate": 2.2079772079772082e-06, "loss": 0.0, "step": 18750 }, { "epoch": 80.0017094017094, "grad_norm": 0.00014189437206368893, "learning_rate": 2.203228869895537e-06, "loss": 0.0, "step": 18760 }, { "epoch": 80.00213675213675, "grad_norm": 0.00016400543972849846, "learning_rate": 2.1984805318138653e-06, "loss": 0.0, "step": 18770 }, { "epoch": 80.00256410256411, "grad_norm": 0.00013671614578925073, "learning_rate": 2.193732193732194e-06, "loss": 0.0, "step": 18780 }, { "epoch": 80.00299145299145, "grad_norm": 0.0015447050100192428, "learning_rate": 2.1889838556505223e-06, "loss": 0.0, "step": 18790 }, { "epoch": 80.00341880341881, "grad_norm": 0.00014961874694563448, "learning_rate": 2.1842355175688514e-06, "loss": 0.0, "step": 18800 }, { "epoch": 80.00384615384615, "grad_norm": 0.000166281548445113, "learning_rate": 2.1794871794871797e-06, "loss": 0.2076, "step": 18810 }, { "epoch": 80.0042735042735, "grad_norm": 0.0008461292018182576, "learning_rate": 2.174738841405508e-06, "loss": 0.0, "step": 18820 }, { "epoch": 80.00470085470086, "grad_norm": 0.00018119644664693624, "learning_rate": 2.1699905033238367e-06, "loss": 1.0097, "step": 18830 }, { "epoch": 80.0051282051282, "grad_norm": 0.0001796925498638302, "learning_rate": 2.1652421652421654e-06, "loss": 0.0, "step": 18840 }, { "epoch": 80.00555555555556, "grad_norm": 0.0001590051979292184, "learning_rate": 2.1604938271604937e-06, "loss": 0.0917, "step": 18850 }, { "epoch": 80.0059829059829, "grad_norm": 0.00016695998783688992, "learning_rate": 2.1557454890788225e-06, "loss": 0.0, "step": 18860 }, { "epoch": 80.00641025641026, "grad_norm": 0.0002668647503014654, "learning_rate": 2.150997150997151e-06, "loss": 0.0, "step": 18870 }, { "epoch": 80.0068376068376, "grad_norm": 0.0016600067028775811, "learning_rate": 2.14624881291548e-06, "loss": 0.0, "step": 18880 }, { "epoch": 80.00726495726495, "grad_norm": 0.0004276032268535346, "learning_rate": 2.141500474833808e-06, "loss": 0.0, "step": 18890 }, { "epoch": 80.00769230769231, "grad_norm": 0.5101644992828369, "learning_rate": 2.136752136752137e-06, "loss": 0.0001, "step": 18900 }, { "epoch": 80.00811965811965, "grad_norm": 0.00014222673780750483, "learning_rate": 2.1320037986704656e-06, "loss": 0.0, "step": 18910 }, { "epoch": 80.00854700854701, "grad_norm": 0.0010868561221286654, "learning_rate": 2.127255460588794e-06, "loss": 0.0, "step": 18920 }, { "epoch": 80.00897435897436, "grad_norm": 0.0014827632112428546, "learning_rate": 2.1225071225071227e-06, "loss": 0.0, "step": 18930 }, { "epoch": 80.00940170940171, "grad_norm": 0.00015382345009129494, "learning_rate": 2.1177587844254514e-06, "loss": 0.0, "step": 18940 }, { "epoch": 80.00982905982906, "grad_norm": 0.00015482779417652637, "learning_rate": 2.1130104463437797e-06, "loss": 0.0, "step": 18950 }, { "epoch": 80.01, "eval_accuracy": 0.4, "eval_loss": 6.669287204742432, "eval_runtime": 33.799, "eval_samples_per_second": 0.74, "eval_steps_per_second": 0.74, "step": 18954 }, { "epoch": 81.00025641025641, "grad_norm": 0.00021314578771125525, "learning_rate": 2.1082621082621084e-06, "loss": 0.0002, "step": 18960 }, { "epoch": 81.00068376068376, "grad_norm": 0.00015724689001217484, "learning_rate": 2.103513770180437e-06, "loss": 0.0, "step": 18970 }, { "epoch": 81.00111111111111, "grad_norm": 0.00015798299864400178, "learning_rate": 2.0987654320987654e-06, "loss": 0.6005, "step": 18980 }, { "epoch": 81.00153846153846, "grad_norm": 0.00015623167564626783, "learning_rate": 2.094017094017094e-06, "loss": 0.0, "step": 18990 }, { "epoch": 81.00196581196582, "grad_norm": 0.001082591712474823, "learning_rate": 2.089268755935423e-06, "loss": 0.0, "step": 19000 }, { "epoch": 81.00239316239316, "grad_norm": 0.00016879322356544435, "learning_rate": 2.0845204178537516e-06, "loss": 0.0001, "step": 19010 }, { "epoch": 81.0028205128205, "grad_norm": 0.0006452444358728826, "learning_rate": 2.07977207977208e-06, "loss": 0.0, "step": 19020 }, { "epoch": 81.00324786324786, "grad_norm": 0.00027353313635103405, "learning_rate": 2.0750237416904086e-06, "loss": 0.0, "step": 19030 }, { "epoch": 81.00367521367521, "grad_norm": 0.0008052527555264533, "learning_rate": 2.0702754036087373e-06, "loss": 0.0, "step": 19040 }, { "epoch": 81.00410256410257, "grad_norm": 0.00014285088400356472, "learning_rate": 2.0655270655270656e-06, "loss": 0.0, "step": 19050 }, { "epoch": 81.00452991452991, "grad_norm": 0.00035150200710631907, "learning_rate": 2.0607787274453943e-06, "loss": 0.0013, "step": 19060 }, { "epoch": 81.00495726495727, "grad_norm": 0.0010856334120035172, "learning_rate": 2.056030389363723e-06, "loss": 0.0, "step": 19070 }, { "epoch": 81.00538461538461, "grad_norm": 0.0009435404208488762, "learning_rate": 2.0512820512820513e-06, "loss": 0.0, "step": 19080 }, { "epoch": 81.00581196581197, "grad_norm": 0.001047900877892971, "learning_rate": 2.04653371320038e-06, "loss": 0.0, "step": 19090 }, { "epoch": 81.00623931623932, "grad_norm": 0.00044585313298739493, "learning_rate": 2.0417853751187088e-06, "loss": 0.0, "step": 19100 }, { "epoch": 81.00666666666666, "grad_norm": 0.00014062630361877382, "learning_rate": 2.037037037037037e-06, "loss": 0.0, "step": 19110 }, { "epoch": 81.00709401709402, "grad_norm": 0.00024204060900956392, "learning_rate": 2.032288698955366e-06, "loss": 0.0, "step": 19120 }, { "epoch": 81.00752136752136, "grad_norm": 0.000514078012201935, "learning_rate": 2.0275403608736945e-06, "loss": 0.0128, "step": 19130 }, { "epoch": 81.00794871794872, "grad_norm": 0.0003828479675576091, "learning_rate": 2.0227920227920232e-06, "loss": 0.0, "step": 19140 }, { "epoch": 81.00837606837607, "grad_norm": 0.000210257523576729, "learning_rate": 2.0180436847103515e-06, "loss": 0.0, "step": 19150 }, { "epoch": 81.00880341880342, "grad_norm": 0.00039357185596600175, "learning_rate": 2.01329534662868e-06, "loss": 0.0, "step": 19160 }, { "epoch": 81.00923076923077, "grad_norm": 0.00014624725736211985, "learning_rate": 2.008547008547009e-06, "loss": 0.0, "step": 19170 }, { "epoch": 81.00965811965811, "grad_norm": 0.0019884561188519, "learning_rate": 2.0037986704653373e-06, "loss": 0.0, "step": 19180 }, { "epoch": 81.01, "eval_accuracy": 0.4, "eval_loss": 6.791877269744873, "eval_runtime": 33.6676, "eval_samples_per_second": 0.743, "eval_steps_per_second": 0.743, "step": 19188 }, { "epoch": 82.00008547008547, "grad_norm": 0.00014335830928757787, "learning_rate": 1.9990503323836656e-06, "loss": 0.0, "step": 19190 }, { "epoch": 82.00051282051282, "grad_norm": 0.00016715576930437237, "learning_rate": 1.9943019943019947e-06, "loss": 0.0, "step": 19200 }, { "epoch": 82.00094017094017, "grad_norm": 0.00015401339624077082, "learning_rate": 1.989553656220323e-06, "loss": 0.0, "step": 19210 }, { "epoch": 82.00136752136753, "grad_norm": 0.0009122826741077006, "learning_rate": 1.9848053181386517e-06, "loss": 0.0, "step": 19220 }, { "epoch": 82.00179487179487, "grad_norm": 0.0039600650779902935, "learning_rate": 1.98005698005698e-06, "loss": 0.0, "step": 19230 }, { "epoch": 82.00222222222222, "grad_norm": 0.00023766346566844732, "learning_rate": 1.9753086419753087e-06, "loss": 0.0, "step": 19240 }, { "epoch": 82.00264957264957, "grad_norm": 0.0002676662988960743, "learning_rate": 1.9705603038936375e-06, "loss": 0.0, "step": 19250 }, { "epoch": 82.00307692307692, "grad_norm": 0.00012970353418495506, "learning_rate": 1.9658119658119658e-06, "loss": 0.9915, "step": 19260 }, { "epoch": 82.00350427350428, "grad_norm": 0.001931848586536944, "learning_rate": 1.9610636277302945e-06, "loss": 0.0, "step": 19270 }, { "epoch": 82.00393162393162, "grad_norm": 0.00016302384028676897, "learning_rate": 1.956315289648623e-06, "loss": 0.0, "step": 19280 }, { "epoch": 82.00435897435898, "grad_norm": 0.0006244336836971343, "learning_rate": 1.9515669515669515e-06, "loss": 0.0, "step": 19290 }, { "epoch": 82.00478632478632, "grad_norm": 0.0013233753852546215, "learning_rate": 1.9468186134852802e-06, "loss": 0.0, "step": 19300 }, { "epoch": 82.00521367521368, "grad_norm": 0.00016536538896616548, "learning_rate": 1.942070275403609e-06, "loss": 0.0, "step": 19310 }, { "epoch": 82.00564102564103, "grad_norm": 0.002657419303432107, "learning_rate": 1.9373219373219372e-06, "loss": 0.0276, "step": 19320 }, { "epoch": 82.00606837606837, "grad_norm": 0.000514014158397913, "learning_rate": 1.932573599240266e-06, "loss": 0.0, "step": 19330 }, { "epoch": 82.00649572649573, "grad_norm": 0.000511766062118113, "learning_rate": 1.9278252611585947e-06, "loss": 0.0, "step": 19340 }, { "epoch": 82.00692307692307, "grad_norm": 0.0024127494543790817, "learning_rate": 1.9230769230769234e-06, "loss": 0.0, "step": 19350 }, { "epoch": 82.00735042735043, "grad_norm": 0.0032117695081979036, "learning_rate": 1.9183285849952517e-06, "loss": 0.0, "step": 19360 }, { "epoch": 82.00777777777778, "grad_norm": 0.0002099874400300905, "learning_rate": 1.9135802469135804e-06, "loss": 0.0, "step": 19370 }, { "epoch": 82.00820512820513, "grad_norm": 0.0007865933002904058, "learning_rate": 1.908831908831909e-06, "loss": 0.0, "step": 19380 }, { "epoch": 82.00863247863248, "grad_norm": 0.000656930438708514, "learning_rate": 1.9040835707502374e-06, "loss": 0.0, "step": 19390 }, { "epoch": 82.00905982905982, "grad_norm": 0.16030727326869965, "learning_rate": 1.8993352326685664e-06, "loss": 0.0001, "step": 19400 }, { "epoch": 82.00948717948718, "grad_norm": 0.0001436812017345801, "learning_rate": 1.8945868945868947e-06, "loss": 0.0, "step": 19410 }, { "epoch": 82.00991452991452, "grad_norm": 0.000723684614058584, "learning_rate": 1.8898385565052232e-06, "loss": 0.0, "step": 19420 }, { "epoch": 82.01, "eval_accuracy": 0.4, "eval_loss": 6.751951694488525, "eval_runtime": 36.0289, "eval_samples_per_second": 0.694, "eval_steps_per_second": 0.694, "step": 19422 }, { "epoch": 83.00034188034188, "grad_norm": 0.000560764514375478, "learning_rate": 1.885090218423552e-06, "loss": 0.0, "step": 19430 }, { "epoch": 83.00076923076924, "grad_norm": 0.00022796269331593066, "learning_rate": 1.8803418803418804e-06, "loss": 0.0, "step": 19440 }, { "epoch": 83.00119658119658, "grad_norm": 0.00013079910422675312, "learning_rate": 1.875593542260209e-06, "loss": 0.0, "step": 19450 }, { "epoch": 83.00162393162393, "grad_norm": 0.0005907687591388822, "learning_rate": 1.8708452041785376e-06, "loss": 0.3814, "step": 19460 }, { "epoch": 83.00205128205128, "grad_norm": 0.00020753988064825535, "learning_rate": 1.8660968660968661e-06, "loss": 0.0, "step": 19470 }, { "epoch": 83.00247863247863, "grad_norm": 0.00022883056954015046, "learning_rate": 1.8613485280151949e-06, "loss": 0.0001, "step": 19480 }, { "epoch": 83.00290598290599, "grad_norm": 0.0005515534430742264, "learning_rate": 1.8566001899335234e-06, "loss": 0.0003, "step": 19490 }, { "epoch": 83.00333333333333, "grad_norm": 0.0006961479666642845, "learning_rate": 1.8518518518518519e-06, "loss": 0.0, "step": 19500 }, { "epoch": 83.00376068376069, "grad_norm": 0.0002969226916320622, "learning_rate": 1.8471035137701806e-06, "loss": 0.0, "step": 19510 }, { "epoch": 83.00418803418803, "grad_norm": 0.0003522407787386328, "learning_rate": 1.842355175688509e-06, "loss": 0.0, "step": 19520 }, { "epoch": 83.00461538461539, "grad_norm": 0.0003604775993153453, "learning_rate": 1.8376068376068378e-06, "loss": 0.0, "step": 19530 }, { "epoch": 83.00504273504274, "grad_norm": 0.00028395134722813964, "learning_rate": 1.8328584995251663e-06, "loss": 0.0022, "step": 19540 }, { "epoch": 83.00547008547008, "grad_norm": 0.0031416804995387793, "learning_rate": 1.8281101614434948e-06, "loss": 0.0, "step": 19550 }, { "epoch": 83.00589743589744, "grad_norm": 0.0014400221407413483, "learning_rate": 1.8233618233618236e-06, "loss": 0.0, "step": 19560 }, { "epoch": 83.00632478632478, "grad_norm": 0.0030254279263317585, "learning_rate": 1.818613485280152e-06, "loss": 0.0, "step": 19570 }, { "epoch": 83.00675213675214, "grad_norm": 0.00016171699098777026, "learning_rate": 1.8138651471984806e-06, "loss": 0.0, "step": 19580 }, { "epoch": 83.00717948717949, "grad_norm": 0.3200160264968872, "learning_rate": 1.8091168091168093e-06, "loss": 0.0001, "step": 19590 }, { "epoch": 83.00760683760684, "grad_norm": 0.00022861655452288687, "learning_rate": 1.8043684710351378e-06, "loss": 0.0006, "step": 19600 }, { "epoch": 83.00803418803419, "grad_norm": 0.00018940556037705392, "learning_rate": 1.7996201329534665e-06, "loss": 0.0, "step": 19610 }, { "epoch": 83.00846153846153, "grad_norm": 0.0001795957941794768, "learning_rate": 1.794871794871795e-06, "loss": 0.0, "step": 19620 }, { "epoch": 83.00888888888889, "grad_norm": 0.00021350740280468017, "learning_rate": 1.7901234567901235e-06, "loss": 0.0, "step": 19630 }, { "epoch": 83.00931623931623, "grad_norm": 0.0026247838977724314, "learning_rate": 1.7853751187084523e-06, "loss": 0.0, "step": 19640 }, { "epoch": 83.0097435897436, "grad_norm": 0.0006859219283796847, "learning_rate": 1.7806267806267808e-06, "loss": 0.0, "step": 19650 }, { "epoch": 83.01, "eval_accuracy": 0.4, "eval_loss": 6.7565107345581055, "eval_runtime": 33.4174, "eval_samples_per_second": 0.748, "eval_steps_per_second": 0.748, "step": 19656 }, { "epoch": 84.00017094017095, "grad_norm": 0.00018314612680114806, "learning_rate": 1.7758784425451095e-06, "loss": 0.0, "step": 19660 }, { "epoch": 84.00059829059829, "grad_norm": 0.0016701172571629286, "learning_rate": 1.771130104463438e-06, "loss": 0.0001, "step": 19670 }, { "epoch": 84.00102564102563, "grad_norm": 0.1983662247657776, "learning_rate": 1.7663817663817665e-06, "loss": 0.0, "step": 19680 }, { "epoch": 84.001452991453, "grad_norm": 0.00020793025032617152, "learning_rate": 1.7616334283000952e-06, "loss": 0.0, "step": 19690 }, { "epoch": 84.00188034188034, "grad_norm": 0.00018910918151959777, "learning_rate": 1.7568850902184237e-06, "loss": 0.0, "step": 19700 }, { "epoch": 84.0023076923077, "grad_norm": 0.0018449919298291206, "learning_rate": 1.7521367521367522e-06, "loss": 0.0, "step": 19710 }, { "epoch": 84.00273504273504, "grad_norm": 0.0001609017635928467, "learning_rate": 1.747388414055081e-06, "loss": 0.0, "step": 19720 }, { "epoch": 84.0031623931624, "grad_norm": 0.00035862025106325746, "learning_rate": 1.7426400759734095e-06, "loss": 0.0, "step": 19730 }, { "epoch": 84.00358974358974, "grad_norm": 0.0008477799710817635, "learning_rate": 1.7378917378917382e-06, "loss": 0.0, "step": 19740 }, { "epoch": 84.0040170940171, "grad_norm": 0.0038533161859959364, "learning_rate": 1.7331433998100667e-06, "loss": 0.0, "step": 19750 }, { "epoch": 84.00444444444445, "grad_norm": 0.0002816705673467368, "learning_rate": 1.7283950617283952e-06, "loss": 0.0, "step": 19760 }, { "epoch": 84.00487179487179, "grad_norm": 0.000693616340868175, "learning_rate": 1.723646723646724e-06, "loss": 0.0, "step": 19770 }, { "epoch": 84.00529914529915, "grad_norm": 0.00019760453142225742, "learning_rate": 1.7188983855650524e-06, "loss": 0.0001, "step": 19780 }, { "epoch": 84.00572649572649, "grad_norm": 0.0001434326113667339, "learning_rate": 1.7141500474833807e-06, "loss": 0.0, "step": 19790 }, { "epoch": 84.00615384615385, "grad_norm": 0.000697402167133987, "learning_rate": 1.7094017094017097e-06, "loss": 0.0, "step": 19800 }, { "epoch": 84.0065811965812, "grad_norm": 0.0014031692408025265, "learning_rate": 1.704653371320038e-06, "loss": 0.0, "step": 19810 }, { "epoch": 84.00700854700855, "grad_norm": 0.00014115864178165793, "learning_rate": 1.6999050332383669e-06, "loss": 0.0, "step": 19820 }, { "epoch": 84.0074358974359, "grad_norm": 0.0013099861098453403, "learning_rate": 1.6951566951566952e-06, "loss": 0.0, "step": 19830 }, { "epoch": 84.00786324786324, "grad_norm": 0.00024018825206439942, "learning_rate": 1.6904083570750237e-06, "loss": 0.0, "step": 19840 }, { "epoch": 84.0082905982906, "grad_norm": 0.0007821761537343264, "learning_rate": 1.6856600189933524e-06, "loss": 0.0, "step": 19850 }, { "epoch": 84.00871794871794, "grad_norm": 0.00013779080472886562, "learning_rate": 1.680911680911681e-06, "loss": 0.0, "step": 19860 }, { "epoch": 84.0091452991453, "grad_norm": 0.0010995293268933892, "learning_rate": 1.6761633428300099e-06, "loss": 0.0, "step": 19870 }, { "epoch": 84.00957264957265, "grad_norm": 0.0002249151439173147, "learning_rate": 1.6714150047483382e-06, "loss": 0.0, "step": 19880 }, { "epoch": 84.01, "grad_norm": 0.00015639988123439252, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "step": 19890 }, { "epoch": 84.01, "eval_accuracy": 0.4, "eval_loss": 6.818636417388916, "eval_runtime": 33.5438, "eval_samples_per_second": 0.745, "eval_steps_per_second": 0.745, "step": 19890 }, { "epoch": 85.00042735042734, "grad_norm": 0.0001349115773336962, "learning_rate": 1.6619183285849954e-06, "loss": 0.0, "step": 19900 }, { "epoch": 85.0008547008547, "grad_norm": 0.00012621526548173279, "learning_rate": 1.6571699905033239e-06, "loss": 0.0, "step": 19910 }, { "epoch": 85.00128205128205, "grad_norm": 0.00013298312842380255, "learning_rate": 1.6524216524216524e-06, "loss": 0.0, "step": 19920 }, { "epoch": 85.0017094017094, "grad_norm": 0.005015491507947445, "learning_rate": 1.6476733143399811e-06, "loss": 0.0, "step": 19930 }, { "epoch": 85.00213675213675, "grad_norm": 0.00015077627904247493, "learning_rate": 1.6429249762583096e-06, "loss": 0.0, "step": 19940 }, { "epoch": 85.00256410256411, "grad_norm": 0.000555673090275377, "learning_rate": 1.6381766381766383e-06, "loss": 0.0, "step": 19950 }, { "epoch": 85.00299145299145, "grad_norm": 0.0001231930946232751, "learning_rate": 1.6334283000949669e-06, "loss": 0.0, "step": 19960 }, { "epoch": 85.00341880341881, "grad_norm": 0.0003615697205532342, "learning_rate": 1.6286799620132954e-06, "loss": 0.0, "step": 19970 }, { "epoch": 85.00384615384615, "grad_norm": 0.00024398553068749607, "learning_rate": 1.623931623931624e-06, "loss": 0.0, "step": 19980 }, { "epoch": 85.0042735042735, "grad_norm": 0.0009278811048716307, "learning_rate": 1.6191832858499526e-06, "loss": 0.0, "step": 19990 }, { "epoch": 85.00470085470086, "grad_norm": 0.0006217070040293038, "learning_rate": 1.6144349477682813e-06, "loss": 0.0, "step": 20000 }, { "epoch": 85.0051282051282, "grad_norm": 0.000226405740249902, "learning_rate": 1.6096866096866098e-06, "loss": 0.0, "step": 20010 }, { "epoch": 85.00555555555556, "grad_norm": 0.00026999341207556427, "learning_rate": 1.6049382716049383e-06, "loss": 0.0021, "step": 20020 }, { "epoch": 85.0059829059829, "grad_norm": 0.00012980998144485056, "learning_rate": 1.600189933523267e-06, "loss": 0.0, "step": 20030 }, { "epoch": 85.00641025641026, "grad_norm": 0.0006714918417856097, "learning_rate": 1.5954415954415956e-06, "loss": 0.0, "step": 20040 }, { "epoch": 85.0068376068376, "grad_norm": 0.0013149407459422946, "learning_rate": 1.590693257359924e-06, "loss": 0.0, "step": 20050 }, { "epoch": 85.00726495726495, "grad_norm": 0.00016666753799654543, "learning_rate": 1.5859449192782528e-06, "loss": 0.0, "step": 20060 }, { "epoch": 85.00769230769231, "grad_norm": 0.0008010066230781376, "learning_rate": 1.5811965811965813e-06, "loss": 0.0557, "step": 20070 }, { "epoch": 85.00811965811965, "grad_norm": 0.0002666155050974339, "learning_rate": 1.57644824311491e-06, "loss": 0.0, "step": 20080 }, { "epoch": 85.00854700854701, "grad_norm": 0.00023051415337249637, "learning_rate": 1.5716999050332385e-06, "loss": 0.0, "step": 20090 }, { "epoch": 85.00897435897436, "grad_norm": 0.0011181911686435342, "learning_rate": 1.566951566951567e-06, "loss": 0.0, "step": 20100 }, { "epoch": 85.00940170940171, "grad_norm": 0.0008536745444871485, "learning_rate": 1.5622032288698958e-06, "loss": 0.1919, "step": 20110 }, { "epoch": 85.00982905982906, "grad_norm": 0.0005208790535107255, "learning_rate": 1.5574548907882243e-06, "loss": 0.0, "step": 20120 }, { "epoch": 85.01, "eval_accuracy": 0.4, "eval_loss": 6.55494499206543, "eval_runtime": 33.6803, "eval_samples_per_second": 0.742, "eval_steps_per_second": 0.742, "step": 20124 }, { "epoch": 86.00025641025641, "grad_norm": 0.00014588376507163048, "learning_rate": 1.552706552706553e-06, "loss": 0.0, "step": 20130 }, { "epoch": 86.00068376068376, "grad_norm": 0.00038126404979266226, "learning_rate": 1.5479582146248815e-06, "loss": 0.0, "step": 20140 }, { "epoch": 86.00111111111111, "grad_norm": 0.0008430579327978194, "learning_rate": 1.54320987654321e-06, "loss": 0.0, "step": 20150 }, { "epoch": 86.00153846153846, "grad_norm": 0.0002688311506062746, "learning_rate": 1.5384615384615387e-06, "loss": 0.0, "step": 20160 }, { "epoch": 86.00196581196582, "grad_norm": 0.00030692145810462534, "learning_rate": 1.5337132003798672e-06, "loss": 0.0, "step": 20170 }, { "epoch": 86.00239316239316, "grad_norm": 0.00014298380119726062, "learning_rate": 1.5289648622981957e-06, "loss": 0.0, "step": 20180 }, { "epoch": 86.0028205128205, "grad_norm": 0.0001439500047126785, "learning_rate": 1.5242165242165245e-06, "loss": 0.0, "step": 20190 }, { "epoch": 86.00324786324786, "grad_norm": 0.0001333106920355931, "learning_rate": 1.519468186134853e-06, "loss": 0.0, "step": 20200 }, { "epoch": 86.00367521367521, "grad_norm": 0.0004077716148458421, "learning_rate": 1.5147198480531817e-06, "loss": 0.0, "step": 20210 }, { "epoch": 86.00410256410257, "grad_norm": 0.0005425411509349942, "learning_rate": 1.5099715099715102e-06, "loss": 0.0081, "step": 20220 }, { "epoch": 86.00452991452991, "grad_norm": 0.0001421132474206388, "learning_rate": 1.5052231718898385e-06, "loss": 0.0, "step": 20230 }, { "epoch": 86.00495726495727, "grad_norm": 0.0005388124845921993, "learning_rate": 1.5004748338081674e-06, "loss": 0.0002, "step": 20240 }, { "epoch": 86.00538461538461, "grad_norm": 0.0001884761149995029, "learning_rate": 1.4957264957264957e-06, "loss": 0.0, "step": 20250 }, { "epoch": 86.00581196581197, "grad_norm": 0.0015297207282856107, "learning_rate": 1.4909781576448246e-06, "loss": 0.0, "step": 20260 }, { "epoch": 86.00623931623932, "grad_norm": 0.0007038248004391789, "learning_rate": 1.486229819563153e-06, "loss": 0.0, "step": 20270 }, { "epoch": 86.00666666666666, "grad_norm": 0.00011914034985238686, "learning_rate": 1.4814814814814815e-06, "loss": 0.0, "step": 20280 }, { "epoch": 86.00709401709402, "grad_norm": 0.0005224226042628288, "learning_rate": 1.4767331433998102e-06, "loss": 0.0, "step": 20290 }, { "epoch": 86.00752136752136, "grad_norm": 0.00024032553483266383, "learning_rate": 1.4719848053181387e-06, "loss": 0.0, "step": 20300 }, { "epoch": 86.00794871794872, "grad_norm": 0.0015607475070282817, "learning_rate": 1.4672364672364672e-06, "loss": 0.0, "step": 20310 }, { "epoch": 86.00837606837607, "grad_norm": 0.000935015850700438, "learning_rate": 1.462488129154796e-06, "loss": 0.0, "step": 20320 }, { "epoch": 86.00880341880342, "grad_norm": 0.0009676801273599267, "learning_rate": 1.4577397910731244e-06, "loss": 0.0, "step": 20330 }, { "epoch": 86.00923076923077, "grad_norm": 0.0018314124317839742, "learning_rate": 1.4529914529914531e-06, "loss": 0.0, "step": 20340 }, { "epoch": 86.00965811965811, "grad_norm": 0.00039655648288317025, "learning_rate": 1.4482431149097816e-06, "loss": 0.0, "step": 20350 }, { "epoch": 86.01, "eval_accuracy": 0.4, "eval_loss": 6.722276210784912, "eval_runtime": 34.4477, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.726, "step": 20358 }, { "epoch": 87.00008547008547, "grad_norm": 0.00019830497330985963, "learning_rate": 1.4434947768281102e-06, "loss": 0.0, "step": 20360 }, { "epoch": 87.00051282051282, "grad_norm": 0.00021265774557832628, "learning_rate": 1.4387464387464389e-06, "loss": 0.0, "step": 20370 }, { "epoch": 87.00094017094017, "grad_norm": 0.00013740244321525097, "learning_rate": 1.4339981006647674e-06, "loss": 0.0, "step": 20380 }, { "epoch": 87.00136752136753, "grad_norm": 0.0008310623816214502, "learning_rate": 1.429249762583096e-06, "loss": 0.0, "step": 20390 }, { "epoch": 87.00179487179487, "grad_norm": 0.00020110943296458572, "learning_rate": 1.4245014245014246e-06, "loss": 0.0, "step": 20400 }, { "epoch": 87.00222222222222, "grad_norm": 0.0004781411844305694, "learning_rate": 1.4197530864197531e-06, "loss": 0.0, "step": 20410 }, { "epoch": 87.00264957264957, "grad_norm": 0.0001626553712412715, "learning_rate": 1.4150047483380818e-06, "loss": 0.0, "step": 20420 }, { "epoch": 87.00307692307692, "grad_norm": 0.00045715321903117, "learning_rate": 1.4102564102564104e-06, "loss": 0.0, "step": 20430 }, { "epoch": 87.00350427350428, "grad_norm": 0.00021040246065240353, "learning_rate": 1.4055080721747389e-06, "loss": 0.0, "step": 20440 }, { "epoch": 87.00393162393162, "grad_norm": 0.00011547945905476809, "learning_rate": 1.4007597340930676e-06, "loss": 0.0, "step": 20450 }, { "epoch": 87.00435897435898, "grad_norm": 0.00020624134049285203, "learning_rate": 1.396011396011396e-06, "loss": 0.0, "step": 20460 }, { "epoch": 87.00478632478632, "grad_norm": 0.0008065582369454205, "learning_rate": 1.3912630579297248e-06, "loss": 0.0, "step": 20470 }, { "epoch": 87.00521367521368, "grad_norm": 0.00020026916172355413, "learning_rate": 1.3865147198480533e-06, "loss": 0.0, "step": 20480 }, { "epoch": 87.00564102564103, "grad_norm": 0.00014089647447690368, "learning_rate": 1.3817663817663818e-06, "loss": 0.0, "step": 20490 }, { "epoch": 87.00606837606837, "grad_norm": 0.0007202147389762104, "learning_rate": 1.3770180436847105e-06, "loss": 0.0, "step": 20500 }, { "epoch": 87.00649572649573, "grad_norm": 0.0001863235083874315, "learning_rate": 1.372269705603039e-06, "loss": 0.0, "step": 20510 }, { "epoch": 87.00692307692307, "grad_norm": 0.00013089961430523545, "learning_rate": 1.3675213675213678e-06, "loss": 0.0, "step": 20520 }, { "epoch": 87.00735042735043, "grad_norm": 0.000519382010679692, "learning_rate": 1.3627730294396963e-06, "loss": 0.0, "step": 20530 }, { "epoch": 87.00777777777778, "grad_norm": 0.00013078245683573186, "learning_rate": 1.3580246913580248e-06, "loss": 0.0, "step": 20540 }, { "epoch": 87.00820512820513, "grad_norm": 0.0006038915598765016, "learning_rate": 1.3532763532763535e-06, "loss": 0.0, "step": 20550 }, { "epoch": 87.00863247863248, "grad_norm": 0.0002612897951621562, "learning_rate": 1.348528015194682e-06, "loss": 0.0, "step": 20560 }, { "epoch": 87.00905982905982, "grad_norm": 0.00011701687617460266, "learning_rate": 1.3437796771130105e-06, "loss": 0.3334, "step": 20570 }, { "epoch": 87.00948717948718, "grad_norm": 0.0023586824536323547, "learning_rate": 1.3390313390313392e-06, "loss": 0.0, "step": 20580 }, { "epoch": 87.00991452991452, "grad_norm": 0.00032859586644917727, "learning_rate": 1.3342830009496678e-06, "loss": 0.0, "step": 20590 }, { "epoch": 87.01, "eval_accuracy": 0.4, "eval_loss": 6.909550189971924, "eval_runtime": 34.96, "eval_samples_per_second": 0.715, "eval_steps_per_second": 0.715, "step": 20592 }, { "epoch": 88.00034188034188, "grad_norm": 0.00037892567343078554, "learning_rate": 1.3295346628679965e-06, "loss": 0.0, "step": 20600 }, { "epoch": 88.00076923076924, "grad_norm": 0.00016175136261153966, "learning_rate": 1.324786324786325e-06, "loss": 0.0, "step": 20610 }, { "epoch": 88.00119658119658, "grad_norm": 0.00011514942161738873, "learning_rate": 1.3200379867046533e-06, "loss": 0.0, "step": 20620 }, { "epoch": 88.00162393162393, "grad_norm": 0.000466162251541391, "learning_rate": 1.3152896486229822e-06, "loss": 0.0, "step": 20630 }, { "epoch": 88.00205128205128, "grad_norm": 0.0005339948693290353, "learning_rate": 1.3105413105413107e-06, "loss": 0.0, "step": 20640 }, { "epoch": 88.00247863247863, "grad_norm": 0.00011431486200308427, "learning_rate": 1.305792972459639e-06, "loss": 0.0, "step": 20650 }, { "epoch": 88.00290598290599, "grad_norm": 0.00010979742364725098, "learning_rate": 1.301044634377968e-06, "loss": 0.0, "step": 20660 }, { "epoch": 88.00333333333333, "grad_norm": 0.001283150864765048, "learning_rate": 1.2962962962962962e-06, "loss": 0.0, "step": 20670 }, { "epoch": 88.00376068376069, "grad_norm": 0.0002451012551318854, "learning_rate": 1.2915479582146252e-06, "loss": 0.0, "step": 20680 }, { "epoch": 88.00418803418803, "grad_norm": 0.00014794590242672712, "learning_rate": 1.2867996201329535e-06, "loss": 0.0, "step": 20690 }, { "epoch": 88.00461538461539, "grad_norm": 0.0005672202096320689, "learning_rate": 1.282051282051282e-06, "loss": 0.0, "step": 20700 }, { "epoch": 88.00504273504274, "grad_norm": 0.00013605415006168187, "learning_rate": 1.2773029439696107e-06, "loss": 0.0, "step": 20710 }, { "epoch": 88.00547008547008, "grad_norm": 0.00011451664613559842, "learning_rate": 1.2725546058879392e-06, "loss": 0.0, "step": 20720 }, { "epoch": 88.00589743589744, "grad_norm": 0.00019140506628900766, "learning_rate": 1.267806267806268e-06, "loss": 0.0, "step": 20730 }, { "epoch": 88.00632478632478, "grad_norm": 0.00013691678759641945, "learning_rate": 1.2630579297245964e-06, "loss": 0.0, "step": 20740 }, { "epoch": 88.00675213675214, "grad_norm": 0.00016725384921301156, "learning_rate": 1.258309591642925e-06, "loss": 0.0, "step": 20750 }, { "epoch": 88.00717948717949, "grad_norm": 0.00016475172014907002, "learning_rate": 1.2535612535612537e-06, "loss": 0.0, "step": 20760 }, { "epoch": 88.00760683760684, "grad_norm": 0.00015170658298302442, "learning_rate": 1.2488129154795822e-06, "loss": 0.0, "step": 20770 }, { "epoch": 88.00803418803419, "grad_norm": 0.012962790206074715, "learning_rate": 1.244064577397911e-06, "loss": 0.0, "step": 20780 }, { "epoch": 88.00846153846153, "grad_norm": 0.00019268876349087805, "learning_rate": 1.2393162393162394e-06, "loss": 0.0, "step": 20790 }, { "epoch": 88.00888888888889, "grad_norm": 0.00012487791536841542, "learning_rate": 1.234567901234568e-06, "loss": 0.0, "step": 20800 }, { "epoch": 88.00931623931623, "grad_norm": 0.009372721426188946, "learning_rate": 1.2298195631528966e-06, "loss": 0.0, "step": 20810 }, { "epoch": 88.0097435897436, "grad_norm": 0.0001522890670457855, "learning_rate": 1.2250712250712251e-06, "loss": 0.0, "step": 20820 }, { "epoch": 88.01, "eval_accuracy": 0.4, "eval_loss": 6.991762638092041, "eval_runtime": 34.8658, "eval_samples_per_second": 0.717, "eval_steps_per_second": 0.717, "step": 20826 }, { "epoch": 89.00017094017095, "grad_norm": 1.7047139406204224, "learning_rate": 1.2203228869895539e-06, "loss": 0.0003, "step": 20830 }, { "epoch": 89.00059829059829, "grad_norm": 0.004476090893149376, "learning_rate": 1.2155745489078824e-06, "loss": 0.0, "step": 20840 }, { "epoch": 89.00102564102563, "grad_norm": 0.0009234125609509647, "learning_rate": 1.2108262108262109e-06, "loss": 0.0, "step": 20850 }, { "epoch": 89.001452991453, "grad_norm": 0.0005792377050966024, "learning_rate": 1.2060778727445396e-06, "loss": 0.0, "step": 20860 }, { "epoch": 89.00188034188034, "grad_norm": 0.0001534123730380088, "learning_rate": 1.2013295346628681e-06, "loss": 0.0, "step": 20870 }, { "epoch": 89.0023076923077, "grad_norm": 0.0013493632432073355, "learning_rate": 1.1965811965811968e-06, "loss": 0.0, "step": 20880 }, { "epoch": 89.00273504273504, "grad_norm": 0.00017192041559610516, "learning_rate": 1.1918328584995251e-06, "loss": 0.0, "step": 20890 }, { "epoch": 89.0031623931624, "grad_norm": 0.00012882999726571143, "learning_rate": 1.1870845204178538e-06, "loss": 0.0, "step": 20900 }, { "epoch": 89.00358974358974, "grad_norm": 0.0013170551974326372, "learning_rate": 1.1823361823361824e-06, "loss": 0.0, "step": 20910 }, { "epoch": 89.0040170940171, "grad_norm": 0.0004404323117341846, "learning_rate": 1.177587844254511e-06, "loss": 0.0, "step": 20920 }, { "epoch": 89.00444444444445, "grad_norm": 0.00011723486386472359, "learning_rate": 1.1728395061728396e-06, "loss": 0.0015, "step": 20930 }, { "epoch": 89.00487179487179, "grad_norm": 0.00012327823787927628, "learning_rate": 1.168091168091168e-06, "loss": 0.0, "step": 20940 }, { "epoch": 89.00529914529915, "grad_norm": 0.00013142797979526222, "learning_rate": 1.1633428300094968e-06, "loss": 0.0, "step": 20950 }, { "epoch": 89.00572649572649, "grad_norm": 0.0001497309422120452, "learning_rate": 1.1585944919278253e-06, "loss": 0.0, "step": 20960 }, { "epoch": 89.00615384615385, "grad_norm": 0.0004792478575836867, "learning_rate": 1.153846153846154e-06, "loss": 0.0, "step": 20970 }, { "epoch": 89.0065811965812, "grad_norm": 0.0001388894597766921, "learning_rate": 1.1490978157644825e-06, "loss": 0.0, "step": 20980 }, { "epoch": 89.00700854700855, "grad_norm": 0.00023399751808028668, "learning_rate": 1.144349477682811e-06, "loss": 0.0, "step": 20990 }, { "epoch": 89.0074358974359, "grad_norm": 0.00014475570060312748, "learning_rate": 1.1396011396011398e-06, "loss": 0.0, "step": 21000 }, { "epoch": 89.00786324786324, "grad_norm": 0.00012018286361126229, "learning_rate": 1.1348528015194683e-06, "loss": 0.0, "step": 21010 }, { "epoch": 89.0082905982906, "grad_norm": 0.00017406410188414156, "learning_rate": 1.1301044634377968e-06, "loss": 0.0002, "step": 21020 }, { "epoch": 89.00871794871794, "grad_norm": 0.0001829592656577006, "learning_rate": 1.1253561253561255e-06, "loss": 0.0, "step": 21030 }, { "epoch": 89.0091452991453, "grad_norm": 0.0005391041049733758, "learning_rate": 1.120607787274454e-06, "loss": 0.0, "step": 21040 }, { "epoch": 89.00957264957265, "grad_norm": 0.00044315739069133997, "learning_rate": 1.1158594491927827e-06, "loss": 0.0, "step": 21050 }, { "epoch": 89.01, "grad_norm": 0.00018860511772800237, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "step": 21060 }, { "epoch": 89.01, "eval_accuracy": 0.4, "eval_loss": 7.22465705871582, "eval_runtime": 35.0687, "eval_samples_per_second": 0.713, "eval_steps_per_second": 0.713, "step": 21060 }, { "epoch": 90.00042735042734, "grad_norm": 0.00021203322103247046, "learning_rate": 1.1063627730294398e-06, "loss": 0.0, "step": 21070 }, { "epoch": 90.0008547008547, "grad_norm": 0.00013690422929357737, "learning_rate": 1.1016144349477685e-06, "loss": 0.0, "step": 21080 }, { "epoch": 90.00128205128205, "grad_norm": 0.0015827484894543886, "learning_rate": 1.096866096866097e-06, "loss": 0.6422, "step": 21090 }, { "epoch": 90.0017094017094, "grad_norm": 0.00015395709488075227, "learning_rate": 1.0921177587844257e-06, "loss": 0.0, "step": 21100 }, { "epoch": 90.00213675213675, "grad_norm": 0.0011581846047192812, "learning_rate": 1.087369420702754e-06, "loss": 0.0, "step": 21110 }, { "epoch": 90.00256410256411, "grad_norm": 0.00010031823330791667, "learning_rate": 1.0826210826210827e-06, "loss": 0.0, "step": 21120 }, { "epoch": 90.00299145299145, "grad_norm": 0.00012422981671988964, "learning_rate": 1.0778727445394112e-06, "loss": 0.0, "step": 21130 }, { "epoch": 90.00341880341881, "grad_norm": 0.009594716131687164, "learning_rate": 1.07312440645774e-06, "loss": 0.0, "step": 21140 }, { "epoch": 90.00384615384615, "grad_norm": 0.010623271577060223, "learning_rate": 1.0683760683760685e-06, "loss": 0.0, "step": 21150 }, { "epoch": 90.0042735042735, "grad_norm": 0.0004154906782787293, "learning_rate": 1.063627730294397e-06, "loss": 0.0, "step": 21160 }, { "epoch": 90.00470085470086, "grad_norm": 0.0005235851858742535, "learning_rate": 1.0588793922127257e-06, "loss": 0.0, "step": 21170 }, { "epoch": 90.0051282051282, "grad_norm": 0.0006127028027549386, "learning_rate": 1.0541310541310542e-06, "loss": 0.0, "step": 21180 }, { "epoch": 90.00555555555556, "grad_norm": 0.00012018286361126229, "learning_rate": 1.0493827160493827e-06, "loss": 0.0, "step": 21190 }, { "epoch": 90.0059829059829, "grad_norm": 0.00017059137462638319, "learning_rate": 1.0446343779677114e-06, "loss": 0.0, "step": 21200 }, { "epoch": 90.00641025641026, "grad_norm": 0.00011609737703111023, "learning_rate": 1.03988603988604e-06, "loss": 0.0012, "step": 21210 }, { "epoch": 90.0068376068376, "grad_norm": 0.00016542985395062715, "learning_rate": 1.0351377018043687e-06, "loss": 0.0001, "step": 21220 }, { "epoch": 90.00726495726495, "grad_norm": 0.22980256378650665, "learning_rate": 1.0303893637226972e-06, "loss": 0.0001, "step": 21230 }, { "epoch": 90.00769230769231, "grad_norm": 0.0003457261191215366, "learning_rate": 1.0256410256410257e-06, "loss": 0.0, "step": 21240 }, { "epoch": 90.00811965811965, "grad_norm": 0.0006093020201660693, "learning_rate": 1.0208926875593544e-06, "loss": 0.0, "step": 21250 }, { "epoch": 90.00854700854701, "grad_norm": 0.00011088740575360134, "learning_rate": 1.016144349477683e-06, "loss": 0.0, "step": 21260 }, { "epoch": 90.00897435897436, "grad_norm": 0.0004857526218984276, "learning_rate": 1.0113960113960116e-06, "loss": 0.0, "step": 21270 }, { "epoch": 90.00940170940171, "grad_norm": 0.00013570766896009445, "learning_rate": 1.00664767331434e-06, "loss": 0.0, "step": 21280 }, { "epoch": 90.00982905982906, "grad_norm": 0.0014373520389199257, "learning_rate": 1.0018993352326686e-06, "loss": 0.0001, "step": 21290 }, { "epoch": 90.01, "eval_accuracy": 0.4, "eval_loss": 7.226686954498291, "eval_runtime": 37.4838, "eval_samples_per_second": 0.667, "eval_steps_per_second": 0.667, "step": 21294 }, { "epoch": 91.00025641025641, "grad_norm": 0.00011334038572385907, "learning_rate": 9.971509971509974e-07, "loss": 0.0, "step": 21300 }, { "epoch": 91.00068376068376, "grad_norm": 0.00015362763951998204, "learning_rate": 9.924026590693259e-07, "loss": 0.0, "step": 21310 }, { "epoch": 91.00111111111111, "grad_norm": 0.0003192702424712479, "learning_rate": 9.876543209876544e-07, "loss": 0.0, "step": 21320 }, { "epoch": 91.00153846153846, "grad_norm": 0.0005757631151936948, "learning_rate": 9.829059829059829e-07, "loss": 0.0, "step": 21330 }, { "epoch": 91.00196581196582, "grad_norm": 0.0004981070524081588, "learning_rate": 9.781576448243116e-07, "loss": 0.0, "step": 21340 }, { "epoch": 91.00239316239316, "grad_norm": 0.000131376669742167, "learning_rate": 9.734093067426401e-07, "loss": 0.0, "step": 21350 }, { "epoch": 91.0028205128205, "grad_norm": 1261.06396484375, "learning_rate": 9.686609686609686e-07, "loss": 0.3664, "step": 21360 }, { "epoch": 91.00324786324786, "grad_norm": 0.00013327557826414704, "learning_rate": 9.639126305792973e-07, "loss": 0.0, "step": 21370 }, { "epoch": 91.00367521367521, "grad_norm": 0.0001816957665141672, "learning_rate": 9.591642924976258e-07, "loss": 0.0, "step": 21380 }, { "epoch": 91.00410256410257, "grad_norm": 0.004342993255704641, "learning_rate": 9.544159544159546e-07, "loss": 0.0, "step": 21390 }, { "epoch": 91.00452991452991, "grad_norm": 0.0002144295140169561, "learning_rate": 9.496676163342832e-07, "loss": 0.0, "step": 21400 }, { "epoch": 91.00495726495727, "grad_norm": 0.0005280747427605093, "learning_rate": 9.449192782526116e-07, "loss": 0.0001, "step": 21410 }, { "epoch": 91.00538461538461, "grad_norm": 0.0002650852547958493, "learning_rate": 9.401709401709402e-07, "loss": 0.0, "step": 21420 }, { "epoch": 91.00581196581197, "grad_norm": 0.0003367229946888983, "learning_rate": 9.354226020892688e-07, "loss": 0.0, "step": 21430 }, { "epoch": 91.00623931623932, "grad_norm": 0.00010731956717791036, "learning_rate": 9.306742640075974e-07, "loss": 0.0, "step": 21440 }, { "epoch": 91.00666666666666, "grad_norm": 0.00010221748379990458, "learning_rate": 9.259259259259259e-07, "loss": 0.0, "step": 21450 }, { "epoch": 91.00709401709402, "grad_norm": 0.00013446349475998431, "learning_rate": 9.211775878442545e-07, "loss": 0.0, "step": 21460 }, { "epoch": 91.00752136752136, "grad_norm": 0.00018187399837188423, "learning_rate": 9.164292497625832e-07, "loss": 0.0, "step": 21470 }, { "epoch": 91.00794871794872, "grad_norm": 0.00015760202950332314, "learning_rate": 9.116809116809118e-07, "loss": 0.0, "step": 21480 }, { "epoch": 91.00837606837607, "grad_norm": 0.00026204713503830135, "learning_rate": 9.069325735992403e-07, "loss": 0.0, "step": 21490 }, { "epoch": 91.00880341880342, "grad_norm": 0.00038499056245200336, "learning_rate": 9.021842355175689e-07, "loss": 0.0, "step": 21500 }, { "epoch": 91.00923076923077, "grad_norm": 0.000741164549253881, "learning_rate": 8.974358974358975e-07, "loss": 0.0, "step": 21510 }, { "epoch": 91.00965811965811, "grad_norm": 0.00012043473543599248, "learning_rate": 8.926875593542261e-07, "loss": 0.0, "step": 21520 }, { "epoch": 91.01, "eval_accuracy": 0.4, "eval_loss": 6.982587814331055, "eval_runtime": 37.2182, "eval_samples_per_second": 0.672, "eval_steps_per_second": 0.672, "step": 21528 }, { "epoch": 92.00008547008547, "grad_norm": 0.0001411245611961931, "learning_rate": 8.879392212725547e-07, "loss": 0.0, "step": 21530 }, { "epoch": 92.00051282051282, "grad_norm": 0.0017277189763262868, "learning_rate": 8.831908831908833e-07, "loss": 0.0, "step": 21540 }, { "epoch": 92.00094017094017, "grad_norm": 0.00011866500426549464, "learning_rate": 8.784425451092119e-07, "loss": 0.0, "step": 21550 }, { "epoch": 92.00136752136753, "grad_norm": 0.00031862963805906475, "learning_rate": 8.736942070275405e-07, "loss": 0.0, "step": 21560 }, { "epoch": 92.00179487179487, "grad_norm": 0.00011709554382832721, "learning_rate": 8.689458689458691e-07, "loss": 0.0, "step": 21570 }, { "epoch": 92.00222222222222, "grad_norm": 0.0005032451008446515, "learning_rate": 8.641975308641976e-07, "loss": 0.0, "step": 21580 }, { "epoch": 92.00264957264957, "grad_norm": 0.0010287113254889846, "learning_rate": 8.594491927825262e-07, "loss": 0.0, "step": 21590 }, { "epoch": 92.00307692307692, "grad_norm": 0.0004769675724674016, "learning_rate": 8.547008547008548e-07, "loss": 0.0, "step": 21600 }, { "epoch": 92.00350427350428, "grad_norm": 0.00032892791205085814, "learning_rate": 8.499525166191834e-07, "loss": 0.0, "step": 21610 }, { "epoch": 92.00393162393162, "grad_norm": 0.0001310220395680517, "learning_rate": 8.452041785375118e-07, "loss": 0.0, "step": 21620 }, { "epoch": 92.00435897435898, "grad_norm": 0.0010653804056346416, "learning_rate": 8.404558404558405e-07, "loss": 0.0, "step": 21630 }, { "epoch": 92.00478632478632, "grad_norm": 0.00020127870084252208, "learning_rate": 8.357075023741691e-07, "loss": 0.0, "step": 21640 }, { "epoch": 92.00521367521368, "grad_norm": 0.0025258706882596016, "learning_rate": 8.309591642924977e-07, "loss": 0.0, "step": 21650 }, { "epoch": 92.00564102564103, "grad_norm": 0.00015467533376067877, "learning_rate": 8.262108262108262e-07, "loss": 0.9533, "step": 21660 }, { "epoch": 92.00606837606837, "grad_norm": 0.00014860654482617974, "learning_rate": 8.214624881291548e-07, "loss": 0.0, "step": 21670 }, { "epoch": 92.00649572649573, "grad_norm": 0.0005292973946779966, "learning_rate": 8.167141500474834e-07, "loss": 0.0, "step": 21680 }, { "epoch": 92.00692307692307, "grad_norm": 0.00015186695964075625, "learning_rate": 8.11965811965812e-07, "loss": 0.0, "step": 21690 }, { "epoch": 92.00735042735043, "grad_norm": 0.00021409436885733157, "learning_rate": 8.072174738841407e-07, "loss": 0.0, "step": 21700 }, { "epoch": 92.00777777777778, "grad_norm": 0.0007851360714994371, "learning_rate": 8.024691358024692e-07, "loss": 0.0, "step": 21710 }, { "epoch": 92.00820512820513, "grad_norm": 0.00014987800386734307, "learning_rate": 7.977207977207978e-07, "loss": 0.0, "step": 21720 }, { "epoch": 92.00863247863248, "grad_norm": 0.00021006350289098918, "learning_rate": 7.929724596391264e-07, "loss": 0.0001, "step": 21730 }, { "epoch": 92.00905982905982, "grad_norm": 0.00011848520807689056, "learning_rate": 7.88224121557455e-07, "loss": 0.0, "step": 21740 }, { "epoch": 92.00948717948718, "grad_norm": 0.0001388939272146672, "learning_rate": 7.834757834757835e-07, "loss": 0.0, "step": 21750 }, { "epoch": 92.00991452991452, "grad_norm": 0.00011886980064446107, "learning_rate": 7.787274453941121e-07, "loss": 0.0, "step": 21760 }, { "epoch": 92.01, "eval_accuracy": 0.4, "eval_loss": 6.638452529907227, "eval_runtime": 36.9317, "eval_samples_per_second": 0.677, "eval_steps_per_second": 0.677, "step": 21762 }, { "epoch": 93.00034188034188, "grad_norm": 0.00011863345571327955, "learning_rate": 7.739791073124407e-07, "loss": 0.0, "step": 21770 }, { "epoch": 93.00076923076924, "grad_norm": 0.000148111954331398, "learning_rate": 7.692307692307694e-07, "loss": 0.0, "step": 21780 }, { "epoch": 93.00119658119658, "grad_norm": 0.0006991415284574032, "learning_rate": 7.644824311490979e-07, "loss": 0.0, "step": 21790 }, { "epoch": 93.00162393162393, "grad_norm": 0.006693857256323099, "learning_rate": 7.597340930674265e-07, "loss": 0.0, "step": 21800 }, { "epoch": 93.00205128205128, "grad_norm": 0.0005863769329153001, "learning_rate": 7.549857549857551e-07, "loss": 0.0, "step": 21810 }, { "epoch": 93.00247863247863, "grad_norm": 0.00011994949454674497, "learning_rate": 7.502374169040837e-07, "loss": 0.0, "step": 21820 }, { "epoch": 93.00290598290599, "grad_norm": 0.00011668866500258446, "learning_rate": 7.454890788224123e-07, "loss": 0.0001, "step": 21830 }, { "epoch": 93.00333333333333, "grad_norm": 0.00011160215217387304, "learning_rate": 7.407407407407407e-07, "loss": 0.0, "step": 21840 }, { "epoch": 93.00376068376069, "grad_norm": 0.0001440770720364526, "learning_rate": 7.359924026590693e-07, "loss": 0.0, "step": 21850 }, { "epoch": 93.00418803418803, "grad_norm": 0.00012972467811778188, "learning_rate": 7.31244064577398e-07, "loss": 0.0, "step": 21860 }, { "epoch": 93.00461538461539, "grad_norm": 0.0002495836524758488, "learning_rate": 7.264957264957266e-07, "loss": 0.0, "step": 21870 }, { "epoch": 93.00504273504274, "grad_norm": 0.00011603141319938004, "learning_rate": 7.217473884140551e-07, "loss": 0.0, "step": 21880 }, { "epoch": 93.00547008547008, "grad_norm": 0.0005587959312833846, "learning_rate": 7.169990503323837e-07, "loss": 0.0, "step": 21890 }, { "epoch": 93.00589743589744, "grad_norm": 0.00015568920935038477, "learning_rate": 7.122507122507123e-07, "loss": 0.0, "step": 21900 }, { "epoch": 93.00632478632478, "grad_norm": 0.00015569105744361877, "learning_rate": 7.075023741690409e-07, "loss": 0.0, "step": 21910 }, { "epoch": 93.00675213675214, "grad_norm": 0.00011201628512935713, "learning_rate": 7.027540360873694e-07, "loss": 0.0, "step": 21920 }, { "epoch": 93.00717948717949, "grad_norm": 0.0012152871349826455, "learning_rate": 6.98005698005698e-07, "loss": 0.0, "step": 21930 }, { "epoch": 93.00760683760684, "grad_norm": 0.00011229109804844484, "learning_rate": 6.932573599240267e-07, "loss": 0.0, "step": 21940 }, { "epoch": 93.00803418803419, "grad_norm": 0.0010557627538219094, "learning_rate": 6.885090218423553e-07, "loss": 0.0, "step": 21950 }, { "epoch": 93.00846153846153, "grad_norm": 0.0005080616101622581, "learning_rate": 6.837606837606839e-07, "loss": 0.0, "step": 21960 }, { "epoch": 93.00888888888889, "grad_norm": 0.00011757034371839836, "learning_rate": 6.790123456790124e-07, "loss": 0.0001, "step": 21970 }, { "epoch": 93.00931623931623, "grad_norm": 0.00048101996071636677, "learning_rate": 6.74264007597341e-07, "loss": 0.0, "step": 21980 }, { "epoch": 93.0097435897436, "grad_norm": 0.00011343754886183888, "learning_rate": 6.695156695156696e-07, "loss": 0.792, "step": 21990 }, { "epoch": 93.01, "eval_accuracy": 0.4, "eval_loss": 6.402031898498535, "eval_runtime": 34.6678, "eval_samples_per_second": 0.721, "eval_steps_per_second": 0.721, "step": 21996 }, { "epoch": 94.00017094017095, "grad_norm": 0.00010894170554820448, "learning_rate": 6.647673314339982e-07, "loss": 0.0, "step": 22000 }, { "epoch": 94.00059829059829, "grad_norm": 0.00013790665252599865, "learning_rate": 6.600189933523266e-07, "loss": 0.0, "step": 22010 }, { "epoch": 94.00102564102563, "grad_norm": 0.00014016970817465335, "learning_rate": 6.552706552706554e-07, "loss": 0.0, "step": 22020 }, { "epoch": 94.001452991453, "grad_norm": 0.002228393219411373, "learning_rate": 6.50522317188984e-07, "loss": 0.0, "step": 22030 }, { "epoch": 94.00188034188034, "grad_norm": 0.00015126651851460338, "learning_rate": 6.457739791073126e-07, "loss": 0.0, "step": 22040 }, { "epoch": 94.0023076923077, "grad_norm": 0.008348544128239155, "learning_rate": 6.41025641025641e-07, "loss": 0.0, "step": 22050 }, { "epoch": 94.00273504273504, "grad_norm": 0.00011175816325703636, "learning_rate": 6.362773029439696e-07, "loss": 0.0001, "step": 22060 }, { "epoch": 94.0031623931624, "grad_norm": 0.00012174518633401021, "learning_rate": 6.315289648622982e-07, "loss": 0.0, "step": 22070 }, { "epoch": 94.00358974358974, "grad_norm": 0.00026770823751576245, "learning_rate": 6.267806267806268e-07, "loss": 0.0, "step": 22080 }, { "epoch": 94.0040170940171, "grad_norm": 0.00019762790179811418, "learning_rate": 6.220322886989554e-07, "loss": 0.0158, "step": 22090 }, { "epoch": 94.00444444444445, "grad_norm": 0.00017857948841992766, "learning_rate": 6.17283950617284e-07, "loss": 0.0001, "step": 22100 }, { "epoch": 94.00487179487179, "grad_norm": 0.00011318692850181833, "learning_rate": 6.125356125356126e-07, "loss": 0.0, "step": 22110 }, { "epoch": 94.00529914529915, "grad_norm": 1054.87451171875, "learning_rate": 6.077872744539412e-07, "loss": 0.1906, "step": 22120 }, { "epoch": 94.00572649572649, "grad_norm": 0.0002740290074143559, "learning_rate": 6.030389363722698e-07, "loss": 0.0, "step": 22130 }, { "epoch": 94.00615384615385, "grad_norm": 0.00017096682859119028, "learning_rate": 5.982905982905984e-07, "loss": 0.0, "step": 22140 }, { "epoch": 94.0065811965812, "grad_norm": 0.00021798326633870602, "learning_rate": 5.935422602089269e-07, "loss": 0.0, "step": 22150 }, { "epoch": 94.00700854700855, "grad_norm": 0.00105681037530303, "learning_rate": 5.887939221272555e-07, "loss": 0.0, "step": 22160 }, { "epoch": 94.0074358974359, "grad_norm": 0.0005219983286224306, "learning_rate": 5.84045584045584e-07, "loss": 0.0, "step": 22170 }, { "epoch": 94.00786324786324, "grad_norm": 0.00013317124103195965, "learning_rate": 5.792972459639127e-07, "loss": 0.0, "step": 22180 }, { "epoch": 94.0082905982906, "grad_norm": 0.00028317165561020374, "learning_rate": 5.745489078822413e-07, "loss": 0.0, "step": 22190 }, { "epoch": 94.00871794871794, "grad_norm": 0.00024831280461512506, "learning_rate": 5.698005698005699e-07, "loss": 0.0, "step": 22200 }, { "epoch": 94.0091452991453, "grad_norm": 24.371335983276367, "learning_rate": 5.650522317188984e-07, "loss": 0.0029, "step": 22210 }, { "epoch": 94.00957264957265, "grad_norm": 0.0005085449665784836, "learning_rate": 5.60303893637227e-07, "loss": 0.0, "step": 22220 }, { "epoch": 94.01, "grad_norm": 0.00015020875434856862, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "step": 22230 }, { "epoch": 94.01, "eval_accuracy": 0.4, "eval_loss": 6.445286273956299, "eval_runtime": 36.5582, "eval_samples_per_second": 0.684, "eval_steps_per_second": 0.684, "step": 22230 }, { "epoch": 95.00042735042734, "grad_norm": 0.00011259275925112888, "learning_rate": 5.508072174738842e-07, "loss": 0.0, "step": 22240 }, { "epoch": 95.0008547008547, "grad_norm": 0.00021879606356378645, "learning_rate": 5.460588793922129e-07, "loss": 0.0, "step": 22250 }, { "epoch": 95.00128205128205, "grad_norm": 0.00042616037535481155, "learning_rate": 5.413105413105414e-07, "loss": 0.0, "step": 22260 }, { "epoch": 95.0017094017094, "grad_norm": 0.00019269147014711052, "learning_rate": 5.3656220322887e-07, "loss": 0.0, "step": 22270 }, { "epoch": 95.00213675213675, "grad_norm": 0.00035289855441078544, "learning_rate": 5.318138651471985e-07, "loss": 0.0, "step": 22280 }, { "epoch": 95.00256410256411, "grad_norm": 0.00012624288501683623, "learning_rate": 5.270655270655271e-07, "loss": 0.0, "step": 22290 }, { "epoch": 95.00299145299145, "grad_norm": 0.00018576315778773278, "learning_rate": 5.223171889838557e-07, "loss": 0.0, "step": 22300 }, { "epoch": 95.00341880341881, "grad_norm": 0.000108227992313914, "learning_rate": 5.175688509021843e-07, "loss": 0.0002, "step": 22310 }, { "epoch": 95.00384615384615, "grad_norm": 0.0001419824839103967, "learning_rate": 5.128205128205128e-07, "loss": 0.0, "step": 22320 }, { "epoch": 95.0042735042735, "grad_norm": 0.0004882781649939716, "learning_rate": 5.080721747388414e-07, "loss": 0.0, "step": 22330 }, { "epoch": 95.00470085470086, "grad_norm": 0.0001368028315482661, "learning_rate": 5.0332383665717e-07, "loss": 0.0, "step": 22340 }, { "epoch": 95.0051282051282, "grad_norm": 0.0009662008378654718, "learning_rate": 4.985754985754987e-07, "loss": 0.0, "step": 22350 }, { "epoch": 95.00555555555556, "grad_norm": 0.0002917400561273098, "learning_rate": 4.938271604938272e-07, "loss": 0.0, "step": 22360 }, { "epoch": 95.0059829059829, "grad_norm": 0.00049038470024243, "learning_rate": 4.890788224121558e-07, "loss": 0.0, "step": 22370 }, { "epoch": 95.00641025641026, "grad_norm": 0.00018329080194234848, "learning_rate": 4.843304843304843e-07, "loss": 0.0, "step": 22380 }, { "epoch": 95.0068376068376, "grad_norm": 0.00012125585635658354, "learning_rate": 4.795821462488129e-07, "loss": 0.0, "step": 22390 }, { "epoch": 95.00726495726495, "grad_norm": 0.00016512990987394005, "learning_rate": 4.748338081671416e-07, "loss": 0.0, "step": 22400 }, { "epoch": 95.00769230769231, "grad_norm": 0.0014239164302125573, "learning_rate": 4.700854700854701e-07, "loss": 0.0, "step": 22410 }, { "epoch": 95.00811965811965, "grad_norm": 0.011379273608326912, "learning_rate": 4.653371320037987e-07, "loss": 0.0, "step": 22420 }, { "epoch": 95.00854700854701, "grad_norm": 0.0001255527458852157, "learning_rate": 4.605887939221273e-07, "loss": 0.0, "step": 22430 }, { "epoch": 95.00897435897436, "grad_norm": 0.00010715150710893795, "learning_rate": 4.558404558404559e-07, "loss": 0.0, "step": 22440 }, { "epoch": 95.00940170940171, "grad_norm": 0.0006061216117814183, "learning_rate": 4.5109211775878445e-07, "loss": 0.0009, "step": 22450 }, { "epoch": 95.00982905982906, "grad_norm": 0.0009303994593210518, "learning_rate": 4.4634377967711306e-07, "loss": 0.0, "step": 22460 }, { "epoch": 95.01, "eval_accuracy": 0.4, "eval_loss": 6.910160064697266, "eval_runtime": 36.7216, "eval_samples_per_second": 0.681, "eval_steps_per_second": 0.681, "step": 22464 }, { "epoch": 96.00025641025641, "grad_norm": 0.00010953476885333657, "learning_rate": 4.415954415954416e-07, "loss": 0.0, "step": 22470 }, { "epoch": 96.00068376068376, "grad_norm": 0.0005640748422592878, "learning_rate": 4.3684710351377024e-07, "loss": 0.0, "step": 22480 }, { "epoch": 96.00111111111111, "grad_norm": 0.00019859550229739398, "learning_rate": 4.320987654320988e-07, "loss": 0.0, "step": 22490 }, { "epoch": 96.00153846153846, "grad_norm": 0.00011656359856715426, "learning_rate": 4.273504273504274e-07, "loss": 0.0, "step": 22500 }, { "epoch": 96.00196581196582, "grad_norm": 0.010922577232122421, "learning_rate": 4.226020892687559e-07, "loss": 0.0, "step": 22510 }, { "epoch": 96.00239316239316, "grad_norm": 0.0017669210210442543, "learning_rate": 4.1785375118708454e-07, "loss": 0.0, "step": 22520 }, { "epoch": 96.0028205128205, "grad_norm": 9.618465264793485e-05, "learning_rate": 4.131054131054131e-07, "loss": 0.0, "step": 22530 }, { "epoch": 96.00324786324786, "grad_norm": 0.00018471429939381778, "learning_rate": 4.083570750237417e-07, "loss": 0.0, "step": 22540 }, { "epoch": 96.00367521367521, "grad_norm": 0.00048411861644126475, "learning_rate": 4.0360873694207033e-07, "loss": 0.0, "step": 22550 }, { "epoch": 96.00410256410257, "grad_norm": 0.00014513205678667873, "learning_rate": 3.988603988603989e-07, "loss": 0.0, "step": 22560 }, { "epoch": 96.00452991452991, "grad_norm": 0.00017483255942352116, "learning_rate": 3.941120607787275e-07, "loss": 0.0, "step": 22570 }, { "epoch": 96.00495726495727, "grad_norm": 0.0007405714131891727, "learning_rate": 3.8936372269705607e-07, "loss": 0.0, "step": 22580 }, { "epoch": 96.00538461538461, "grad_norm": 0.00010351594391977414, "learning_rate": 3.846153846153847e-07, "loss": 0.0, "step": 22590 }, { "epoch": 96.00581196581197, "grad_norm": 0.00017762374773155898, "learning_rate": 3.7986704653371324e-07, "loss": 0.0, "step": 22600 }, { "epoch": 96.00623931623932, "grad_norm": 0.0008599003194831312, "learning_rate": 3.7511870845204186e-07, "loss": 0.0, "step": 22610 }, { "epoch": 96.00666666666666, "grad_norm": 9.716143540572375e-05, "learning_rate": 3.7037037037037036e-07, "loss": 0.0, "step": 22620 }, { "epoch": 96.00709401709402, "grad_norm": 0.0006491367239505053, "learning_rate": 3.65622032288699e-07, "loss": 0.0001, "step": 22630 }, { "epoch": 96.00752136752136, "grad_norm": 0.0002162454038625583, "learning_rate": 3.6087369420702754e-07, "loss": 0.0, "step": 22640 }, { "epoch": 96.00794871794872, "grad_norm": 0.0001346368808299303, "learning_rate": 3.5612535612535615e-07, "loss": 0.0, "step": 22650 }, { "epoch": 96.00837606837607, "grad_norm": 0.000659442157484591, "learning_rate": 3.513770180436847e-07, "loss": 0.0, "step": 22660 }, { "epoch": 96.00880341880342, "grad_norm": 0.00012783669808413833, "learning_rate": 3.4662867996201333e-07, "loss": 0.0, "step": 22670 }, { "epoch": 96.00923076923077, "grad_norm": 0.00012355089711491019, "learning_rate": 3.4188034188034194e-07, "loss": 0.0, "step": 22680 }, { "epoch": 96.00965811965811, "grad_norm": 0.00010718983685364947, "learning_rate": 3.371320037986705e-07, "loss": 0.0, "step": 22690 }, { "epoch": 96.01, "eval_accuracy": 0.4, "eval_loss": 6.9261651039123535, "eval_runtime": 37.5649, "eval_samples_per_second": 0.666, "eval_steps_per_second": 0.666, "step": 22698 }, { "epoch": 97.00008547008547, "grad_norm": 0.00010189504973823205, "learning_rate": 3.323836657169991e-07, "loss": 0.0, "step": 22700 }, { "epoch": 97.00051282051282, "grad_norm": 0.01328178122639656, "learning_rate": 3.276353276353277e-07, "loss": 0.4979, "step": 22710 }, { "epoch": 97.00094017094017, "grad_norm": 0.00010843879863386974, "learning_rate": 3.228869895536563e-07, "loss": 0.0, "step": 22720 }, { "epoch": 97.00136752136753, "grad_norm": 0.0005735277663916349, "learning_rate": 3.181386514719848e-07, "loss": 0.0, "step": 22730 }, { "epoch": 97.00179487179487, "grad_norm": 0.0010003969073295593, "learning_rate": 3.133903133903134e-07, "loss": 0.0, "step": 22740 }, { "epoch": 97.00222222222222, "grad_norm": 0.0009289424051530659, "learning_rate": 3.08641975308642e-07, "loss": 0.0, "step": 22750 }, { "epoch": 97.00264957264957, "grad_norm": 0.0013247294118627906, "learning_rate": 3.038936372269706e-07, "loss": 0.0, "step": 22760 }, { "epoch": 97.00307692307692, "grad_norm": 0.00010199108510278165, "learning_rate": 2.991452991452992e-07, "loss": 0.0, "step": 22770 }, { "epoch": 97.00350427350428, "grad_norm": 0.0014735024888068438, "learning_rate": 2.9439696106362777e-07, "loss": 0.0, "step": 22780 }, { "epoch": 97.00393162393162, "grad_norm": 0.00014969809853937477, "learning_rate": 2.8964862298195633e-07, "loss": 0.0, "step": 22790 }, { "epoch": 97.00435897435898, "grad_norm": 0.0001152329205069691, "learning_rate": 2.8490028490028494e-07, "loss": 0.0, "step": 22800 }, { "epoch": 97.00478632478632, "grad_norm": 0.0011336614843457937, "learning_rate": 2.801519468186135e-07, "loss": 0.0, "step": 22810 }, { "epoch": 97.00521367521368, "grad_norm": 0.0002444600104354322, "learning_rate": 2.754036087369421e-07, "loss": 0.0, "step": 22820 }, { "epoch": 97.00564102564103, "grad_norm": 0.0011971363564953208, "learning_rate": 2.706552706552707e-07, "loss": 0.0, "step": 22830 }, { "epoch": 97.00606837606837, "grad_norm": 0.000934019626583904, "learning_rate": 2.6590693257359924e-07, "loss": 0.0, "step": 22840 }, { "epoch": 97.00649572649573, "grad_norm": 0.00019381985475774854, "learning_rate": 2.6115859449192786e-07, "loss": 0.0, "step": 22850 }, { "epoch": 97.00692307692307, "grad_norm": 0.00011170632933499292, "learning_rate": 2.564102564102564e-07, "loss": 0.0, "step": 22860 }, { "epoch": 97.00735042735043, "grad_norm": 0.00010790720989461988, "learning_rate": 2.51661918328585e-07, "loss": 0.0, "step": 22870 }, { "epoch": 97.00777777777778, "grad_norm": 0.0001576395152369514, "learning_rate": 2.469135802469136e-07, "loss": 0.0, "step": 22880 }, { "epoch": 97.00820512820513, "grad_norm": 0.00010023313370766118, "learning_rate": 2.4216524216524215e-07, "loss": 0.0, "step": 22890 }, { "epoch": 97.00863247863248, "grad_norm": 0.0016645913710817695, "learning_rate": 2.374169040835708e-07, "loss": 0.0, "step": 22900 }, { "epoch": 97.00905982905982, "grad_norm": 0.0006122203776612878, "learning_rate": 2.3266856600189936e-07, "loss": 0.0, "step": 22910 }, { "epoch": 97.00948717948718, "grad_norm": 0.0002639706071931869, "learning_rate": 2.2792022792022794e-07, "loss": 0.0, "step": 22920 }, { "epoch": 97.00991452991452, "grad_norm": 0.00010460082557983696, "learning_rate": 2.2317188983855653e-07, "loss": 0.0, "step": 22930 }, { "epoch": 97.01, "eval_accuracy": 0.4, "eval_loss": 6.775699615478516, "eval_runtime": 36.4004, "eval_samples_per_second": 0.687, "eval_steps_per_second": 0.687, "step": 22932 }, { "epoch": 98.00034188034188, "grad_norm": 0.0007676715613342822, "learning_rate": 2.1842355175688512e-07, "loss": 0.0, "step": 22940 }, { "epoch": 98.00076923076924, "grad_norm": 0.00042605900671333075, "learning_rate": 2.136752136752137e-07, "loss": 0.0, "step": 22950 }, { "epoch": 98.00119658119658, "grad_norm": 0.0001429203402949497, "learning_rate": 2.0892687559354227e-07, "loss": 0.0, "step": 22960 }, { "epoch": 98.00162393162393, "grad_norm": 0.0004025712551083416, "learning_rate": 2.0417853751187086e-07, "loss": 0.0, "step": 22970 }, { "epoch": 98.00205128205128, "grad_norm": 0.001206466811709106, "learning_rate": 1.9943019943019944e-07, "loss": 0.0, "step": 22980 }, { "epoch": 98.00247863247863, "grad_norm": 0.00012541662727016956, "learning_rate": 1.9468186134852803e-07, "loss": 0.3638, "step": 22990 }, { "epoch": 98.00290598290599, "grad_norm": 0.0002750447019934654, "learning_rate": 1.8993352326685662e-07, "loss": 0.0, "step": 23000 }, { "epoch": 98.00333333333333, "grad_norm": 0.00010290797217749059, "learning_rate": 1.8518518518518518e-07, "loss": 0.0, "step": 23010 }, { "epoch": 98.00376068376069, "grad_norm": 0.0001165928115369752, "learning_rate": 1.8043684710351377e-07, "loss": 0.0, "step": 23020 }, { "epoch": 98.00418803418803, "grad_norm": 0.0001025166129693389, "learning_rate": 1.7568850902184236e-07, "loss": 0.0, "step": 23030 }, { "epoch": 98.00461538461539, "grad_norm": 9.691870218375698e-05, "learning_rate": 1.7094017094017097e-07, "loss": 0.0, "step": 23040 }, { "epoch": 98.00504273504274, "grad_norm": 0.00016511735157109797, "learning_rate": 1.6619183285849956e-07, "loss": 0.0, "step": 23050 }, { "epoch": 98.00547008547008, "grad_norm": 0.00010559390648268163, "learning_rate": 1.6144349477682815e-07, "loss": 0.0003, "step": 23060 }, { "epoch": 98.00589743589744, "grad_norm": 0.0008746449020691216, "learning_rate": 1.566951566951567e-07, "loss": 0.0, "step": 23070 }, { "epoch": 98.00632478632478, "grad_norm": 0.0002250542602268979, "learning_rate": 1.519468186134853e-07, "loss": 0.0, "step": 23080 }, { "epoch": 98.00675213675214, "grad_norm": 0.00017459446098655462, "learning_rate": 1.4719848053181388e-07, "loss": 0.0, "step": 23090 }, { "epoch": 98.00717948717949, "grad_norm": 0.00018693608581088483, "learning_rate": 1.4245014245014247e-07, "loss": 0.0, "step": 23100 }, { "epoch": 98.00760683760684, "grad_norm": 0.0016856353031471372, "learning_rate": 1.3770180436847106e-07, "loss": 0.0, "step": 23110 }, { "epoch": 98.00803418803419, "grad_norm": 0.00015905691543594003, "learning_rate": 1.3295346628679962e-07, "loss": 0.0, "step": 23120 }, { "epoch": 98.00846153846153, "grad_norm": 0.00016817098367027938, "learning_rate": 1.282051282051282e-07, "loss": 0.0, "step": 23130 }, { "epoch": 98.00888888888889, "grad_norm": 0.00019543507369235158, "learning_rate": 1.234567901234568e-07, "loss": 0.0, "step": 23140 }, { "epoch": 98.00931623931623, "grad_norm": 0.0006627991679124534, "learning_rate": 1.187084520417854e-07, "loss": 0.0, "step": 23150 }, { "epoch": 98.0097435897436, "grad_norm": 0.00031626957934349775, "learning_rate": 1.1396011396011397e-07, "loss": 0.0, "step": 23160 }, { "epoch": 98.01, "eval_accuracy": 0.4, "eval_loss": 6.829818248748779, "eval_runtime": 35.756, "eval_samples_per_second": 0.699, "eval_steps_per_second": 0.699, "step": 23166 }, { "epoch": 99.00017094017095, "grad_norm": 0.00048812164459377527, "learning_rate": 1.0921177587844256e-07, "loss": 0.0, "step": 23170 }, { "epoch": 99.00059829059829, "grad_norm": 0.0003830118221230805, "learning_rate": 1.0446343779677113e-07, "loss": 0.0, "step": 23180 }, { "epoch": 99.00102564102563, "grad_norm": 0.00040612902375869453, "learning_rate": 9.971509971509972e-08, "loss": 0.0, "step": 23190 }, { "epoch": 99.001452991453, "grad_norm": 0.00012840931594837457, "learning_rate": 9.496676163342831e-08, "loss": 0.0, "step": 23200 }, { "epoch": 99.00188034188034, "grad_norm": 0.0002664999628905207, "learning_rate": 9.021842355175688e-08, "loss": 0.0, "step": 23210 }, { "epoch": 99.0023076923077, "grad_norm": 0.00010337825369788334, "learning_rate": 8.547008547008549e-08, "loss": 0.0, "step": 23220 }, { "epoch": 99.00273504273504, "grad_norm": 0.0008002725662663579, "learning_rate": 8.072174738841407e-08, "loss": 0.0, "step": 23230 }, { "epoch": 99.0031623931624, "grad_norm": 0.00012994400458410382, "learning_rate": 7.597340930674265e-08, "loss": 0.0, "step": 23240 }, { "epoch": 99.00358974358974, "grad_norm": 0.0011528537143021822, "learning_rate": 7.122507122507124e-08, "loss": 0.0, "step": 23250 }, { "epoch": 99.0040170940171, "grad_norm": 0.00013322003360372037, "learning_rate": 6.647673314339981e-08, "loss": 0.0, "step": 23260 }, { "epoch": 99.00444444444445, "grad_norm": 0.001754231285303831, "learning_rate": 6.17283950617284e-08, "loss": 0.0, "step": 23270 }, { "epoch": 99.00487179487179, "grad_norm": 0.0002544873859733343, "learning_rate": 5.6980056980056986e-08, "loss": 0.0, "step": 23280 }, { "epoch": 99.00529914529915, "grad_norm": 0.0005114562809467316, "learning_rate": 5.223171889838557e-08, "loss": 0.0, "step": 23290 }, { "epoch": 99.00572649572649, "grad_norm": 0.00014945660950616002, "learning_rate": 4.7483380816714155e-08, "loss": 0.0, "step": 23300 }, { "epoch": 99.00615384615385, "grad_norm": 0.0005400913069024682, "learning_rate": 4.273504273504274e-08, "loss": 0.0, "step": 23310 }, { "epoch": 99.0065811965812, "grad_norm": 0.0001350509119220078, "learning_rate": 3.7986704653371324e-08, "loss": 0.0, "step": 23320 }, { "epoch": 99.00700854700855, "grad_norm": 0.00013444873911794275, "learning_rate": 3.3238366571699905e-08, "loss": 0.0, "step": 23330 }, { "epoch": 99.0074358974359, "grad_norm": 0.0003640491340775043, "learning_rate": 2.8490028490028493e-08, "loss": 0.0, "step": 23340 }, { "epoch": 99.00786324786324, "grad_norm": 0.0004577758372761309, "learning_rate": 2.3741690408357078e-08, "loss": 0.0, "step": 23350 }, { "epoch": 99.0082905982906, "grad_norm": 0.0005503903958015144, "learning_rate": 1.8993352326685662e-08, "loss": 0.0, "step": 23360 }, { "epoch": 99.00871794871794, "grad_norm": 0.0008972916402854025, "learning_rate": 1.4245014245014247e-08, "loss": 0.0, "step": 23370 }, { "epoch": 99.0091452991453, "grad_norm": 0.00013151808525435627, "learning_rate": 9.496676163342831e-09, "loss": 0.0, "step": 23380 }, { "epoch": 99.00957264957265, "grad_norm": 0.00012301822425797582, "learning_rate": 4.7483380816714155e-09, "loss": 0.0, "step": 23390 }, { "epoch": 99.01, "grad_norm": 0.002225738950073719, "learning_rate": 0.0, "loss": 0.0, "step": 23400 }, { "epoch": 99.01, "eval_accuracy": 0.4, "eval_loss": 6.831691741943359, "eval_runtime": 36.0994, "eval_samples_per_second": 0.693, "eval_steps_per_second": 0.693, "step": 23400 }, { "epoch": 99.01, "step": 23400, "total_flos": 1.0275070897744773e+20, "train_loss": 0.44932733817713766, "train_runtime": 74953.482, "train_samples_per_second": 0.312, "train_steps_per_second": 0.312 }, { "epoch": 99.01, "eval_accuracy": 0.48, "eval_loss": 4.4508891105651855, "eval_runtime": 32.6922, "eval_samples_per_second": 0.765, "eval_steps_per_second": 0.765, "step": 23400 }, { "epoch": 99.01, "eval_accuracy": 0.48, "eval_loss": 4.450888633728027, "eval_runtime": 32.6242, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.766, "step": 23400 } ], "logging_steps": 10, "max_steps": 23400, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0275070897744773e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }